Subset data based on string values - r

I would like to subset my data frame based on the index column; I would like to keep those cases whose index is saved in myvar (eg. 110, 111). I don't understand why I receive 0 observations when running this code:
# Asker's attempt: this compares df$index to the literal string "myvars",
# not to the contents of the variable `myvars` -- so no row ever matches.
newdata <- df[ which(df$index=="myvars"), ]
Sample data:
# Reproducible sample data (4 rows; only the `index` column matters for the
# subsetting question -- note index is numeric, not character).
df<-structure(list(index = c(111, 110, 101, 111), et = c(1, 1, 1,
1), d1_t2 = c(0, 1, 1, 1), d1_t3 = c(0, 0, 1, 1), d1_t4 = c(0,
1, 0, 1), d2_t1 = c(0, 0, 1, 1), d2_t2 = c(0, 1, 1, 1), d2_t3 = c(0,
0, 0, 1), d2_t4 = c(1, 0, 1, 1), d3_t1 = c(1, 0, 1, 1), d3_t2 = c(1,
1, 0, 1), d3_t3 = c(1, 0, 1, 1), d3_t4 = c(1, 1, 0, 1), d4_t1 = c(0,
0, 1, 1), d4_t2 = c(1, 1, 0, 1), d4_t3 = c(0, 0, 1, 1), d4_t4 = c(1,
0, 1, 1), d5_t1 = c(1, 0, 0, 1), d5_t2 = c(0, 1, 1, 1), d5_t3 = c(1,
0, 1, 1), d5_t4 = c(0, 0, 1, 1), d6_t1 = c(1, 0, 0, 1), d6_t2 = c(0,
0, 1, 1), d6_t3 = c(1, 0, 1, 1), d6_t4 = c(1, 0, 1, 1), d7_t1 = c(1,
1, 1, 1), d7_t2 = c(1, 1, 1, 1), d7_t3 = c(1, 0, 1, 1), d7_t4 = c(1,
0, 1, 1)), row.names = c(NA, 4L), class = "data.frame")
Code:
# Asker's definition: character values even though df$index is numeric.
# (Comparison would still match via R's implicit coercion, but mixing types
# is fragile -- see the answer's point about string-vs-integer indices.)
myvars<-c("110", "111")

try
# Use a numeric vector, matching the numeric type of df$index.
myvars <- c(110, 111) # <-- !! no quotes !!
# %in% handles a set of values (== is for a single value), and it never
# returns NA, so the which() wrapper around it is unnecessary.
df[df$index %in% myvars, ] # also, no quotes around myvars

There are several basic problems with what you are trying to do.
You are not using the variable 'myvars' -- you are using a string with the value "myvars". None of your rows has the index "myvars".
You are using == which is good for one value (e.g. values==4), but myvars has multiple values in it. Instead, you could use df$index %in% myvars
This does work, but you have integer indices, and are accessing them with strings. This is unnecessary, and could lead to problems in other places.
You may be confused because of your very large and complex example data. You only need one column to test -- not twenty.

Related

Sorting barplot based on multi-categories in r

I am trying to get a bar plot for sentiment scores corrected as per the following order and put into two separate colors:
(NEGATIVE) anger, disgust, fear, sadness, negative --- (POSITIVE) anticipation, joy, surprise, trust, positive.
Below is the code which only gives a decreasing plot.
# Asker's code: sort() orders bars purely by total, so the requested
# NEGATIVE/POSITIVE grouping is lost, and `col = rainbow(2)` is simply
# recycled across the bars rather than tracking each bar's sentiment group.
barplot(sort(colSums(s), decreasing = TRUE),
las = 2,
col = rainbow(2),
ylab = 'Count',
main = 'User Synergies')
> dput(head(s))
structure(list(anger = c(1, 0, 0, 0, 0, 0), anticipation = c(0,
0, 5, 0, 0, 0), disgust = c(0, 0, 0, 0, 0, 0), fear = c(1, 0,
2, 1, 0, 0), joy = c(1, 0, 1, 0, 0, 0), sadness = c(1, 0, 2,
1, 0, 0), surprise = c(0, 0, 2, 1, 0, 0), trust = c(4, 2, 3,
1, 0, 1), negative = c(2, 0, 3, 2, 1, 1), positive = c(4, 4,
7, 1, 0, 2)), row.names = c(NA, 6L), class = "data.frame")
Another way:
# Put the negative group first, then the positive group, and colour each
# block of bars accordingly (red = negative, cyan = positive).
positive <- c("anticipation", "joy", "surprise", "trust", "positive")
negative <- c("anger", "disgust", "fear", "sadness", "negative")
ordered_totals <- colSums(s[, c(negative, positive)])
bar_cols <- c(rep("red", length(negative)), rep("cyan", length(positive)))
barplot(ordered_totals,
        las = 2,
        col = bar_cols,
        ylab = 'Count', ylim = c(0, 20),
        main = 'User Synergies')
The result:
Try this:
# Example data from the question (6 documents x 10 sentiment columns).
df <- structure(list(anger = c(1, 0, 0, 0, 0, 0),
                     anticipation = c(0, 0, 5, 0, 0, 0),
                     disgust = c(0, 0, 0, 0, 0, 0),
                     fear = c(1, 0, 2, 1, 0, 0),
                     joy = c(1, 0, 1, 0, 0, 0),
                     sadness = c(1, 0, 2, 1, 0, 0),
                     surprise = c(0, 0, 2, 1, 0, 0),
                     trust = c(4, 2, 3, 1, 0, 1),
                     negative = c(2, 0, 3, 2, 1, 1),
                     positive = c(4, 4, 7, 1, 0, 2)),
                row.names = c(NA, 6L), class = "data.frame")
pn <- rainbow(2) # "#FF0000" "#00FFFF" one for positive and the other for negative
s <- sort(colSums(df) , decreasing = TRUE)
# names(s) here:
#> [1] "positive" "trust" "negative" "anticipation" "fear"
#> [6] "sadness" "surprise" "joy" "anger" "disgust"
# Derive each bar's colour from its NAME rather than hard-coding one colour
# per position: the original `col <- c(pn[1], pn[1], pn[2], ...)` silently
# breaks as soon as the sort order changes (e.g. with different data).
positive <- c("positive", "trust", "anticipation", "surprise", "joy")
col <- ifelse(names(s) %in% positive, pn[1], pn[2])
barplot(s,
        las = 2,
        col = col,
        ylab = 'Count',
        main = 'User Synergies')
Created on 2022-05-31 by the reprex package (v2.0.1)
You may try
library(dplyr)
library(reshape2)
# Toy totals: one value per sentiment (no per-document rows needed here).
df <- data.frame(
anger = 200,
disgust = 100,
fear = 900,
sadness = 400,
negative = 1500,
anticipation = 2000,
joy = 1200,
surprise = 300,
trust = 2500,
positive = 5000
)
pall <- c("red", "blue")
# Pipeline: totals -> long format -> fixed factor order -> colour flag -> barplot.
colSums(df) %>%
melt %>%
# melt() keeps the sentiment names as rownames; promote them to a column.
tibble::rownames_to_column(., "sentiments") %>%
# Fixing the factor levels forces the NEGATIVE-then-POSITIVE bar order.
mutate(sentiments = factor(sentiments, levels = c("anger", "disgust", "fear", "sadness", "negative", "anticipation", "joy", "surprise", "trust", "positive"))) %>%
# colo: 0 = negative group, 1 = positive group (used to pick from `pall`;
# note subsetting with a factor uses its underlying codes 1 and 2).
mutate(colo = ifelse(sentiments %in% c("anger", "disgust", "fear", "sadness", "negative"), 0, 1) %>% as.factor) %>%
# Formula interface of barplot(): bar height ~ category.
barplot(data = ., value ~ sentiments, col = pall[.$colo], las = 2, xlab = "")
Another approach :
# Same example data as in the question, re-entered for a self-contained run.
df <- structure(list(anger = c(1, 0, 0, 0, 0, 0),
                     anticipation = c(0, 0, 5, 0, 0, 0),
                     disgust = c(0, 0, 0, 0, 0, 0),
                     fear = c(1, 0, 2, 1, 0, 0),
                     joy = c(1, 0, 1, 0, 0, 0),
                     sadness = c(1, 0, 2, 1, 0, 0),
                     surprise = c(0, 0, 2, 1, 0, 0),
                     trust = c(4, 2, 3, 1, 0, 1),
                     negative = c(2, 0, 3, 2, 1, 1),
                     positive = c(4, 4, 7, 1, 0, 2)),
                row.names = c(NA, 6L), class = "data.frame")
# Totals per sentiment, largest first.
s <- sort(colSums(df), decreasing = TRUE)
# Names belonging to the positive group; everything else counts as negative.
pos <- c("positive", "trust", "anticipation", "surprise", "joy")
# Pick each bar's colour by group membership of the bar's name.
col <- ifelse(names(s) %in% pos, "cyan", "red")
barplot(s, las = 2, col = col, ylab = 'Count', main = 'User Synergies')
Created on 2022-05-31 by the reprex package (v2.0.1)

Iterate through elements in a list with shared element names in R

I have a list that looks something like this (a much-reduced version of a list with 301 sub-elements):
# Toy list: one sub-list per speaker; every speaker carries an "ID" vector
# plus several feature vectors whose names share the "S1C1." prefix.
# NOTE(review): "ID" has 5 elements but the S1C1.* vectors have 6 -- lists
# permit ragged lengths, but confirm this is intentional before treating the
# vectors as aligned rows.
myList <- list()
myList$Speaker1 <- list("ID" = c(1, 2, 3, 4, 5),
"S1C1.Sonorant" = c(0, 0, 0, 0.5, 0, -1),
"S1C1.Consonantal" = c(0, 0, 0, 0, 0, 1),
"S1C1.Voice" = c(0, 0, 1, 1, 1, -1),
"S1C1.Nasal" = c(0, 0, 1, 0, 1, -1))
myList$Speaker2 <- list("ID" = c(1, 2, 3, 4, 5),
"S1C1.Sonorant" = c(0, 0, 0, 0.5, 0, -1),
"S1C1.Consonantal" = c(0, 0, 0, 0, 0, 1),
"S1C1.Voice" = c(0, 0, 1, 1, 1, -1),
"S1C1.Nasal" = c(0, 0, 1, 0, 1, -1))
myList$Speaker3 <- list("ID" = c(1, 2, 3, 4, 5),
"S1C1.Sonorant" = c(0, 0, 0, 0.5, 0, -1),
"S1C1.Consonantal" = c(0, 0, 0, 0, 0, 1),
"S1C1.Voice" = c(0, 0, 1, 1, 1, -1),
"S1C1.Nasal" = c(0, 0, 1, 0, 1, -1))
For each speaker, I want to run some functions through all the sub-elements that include the string S1C1.. So far, I have the following, which calls each column containing S1C1 individually:
# Generalised version: instead of naming every "S1C1.*" column by hand, find
# all sub-elements whose name starts with "S1C1." and fold their squared
# pairwise-difference matrices together.
my_matrix <- lapply(myList, FUN = function(element) {
  feats <- grep("^S1C1\\.", names(element), value = TRUE)
  # BUG FIX: the original used nrow(element), but `element` is a list, so
  # nrow() returns NULL; size the matrix from the first feature vector.
  n <- length(element[[feats[1]]])
  ones <- rep(1, n)
  # Sum over features of the (x_i - x_j)^2 matrix for that feature.
  mat.S1C1 <- Reduce(`+`, lapply(feats, function(f) {
    v <- element[[f]]
    (v %*% t(ones) - ones %*% t(v))^2
  }))
  # (The original assigned dimnames from element$S1C1.S1C1, which does not
  # exist -- a silent no-op -- so that step is dropped here.)
  sqrt(mat.S1C1)
})
Is there a way I can iterate through all the sub-elements that start with the string S1C1.? The current code works but is very long!

Optimum algorithm to check various combinations of items when number of items is too large

I have a data frame which has 20 columns/items in it, and 593 rows (number of rows doesn't matter though) as shown below:
Using this, the reliability of the test is obtained as 0.94, with the help of alpha from the psych package (psych::alpha). The output also gives me the new value of Cronbach's alpha if I drop one of the items. However, I want to know how many items I can drop and still retain an alpha of at least 0.8. I used a brute-force approach for this purpose, where I create every combination of the items that exist in my data frame and check whether its alpha is in the range (0.7, 0.9). Is there a better way of doing this? It is taking forever to run, because the number of items is too large to check every combination. Below is my current piece of code:
numberOfItems <- 20
# Enumerate item subsets via the bits of i.
# BUG FIX: the original header was `for(i in 2:(2^numberOfItems)-1)`, which
# parses as (2:(2^numberOfItems)) - 1 and therefore STILL starts at i = 1,
# despite the comment; parenthesising the subtraction really skips it.
for (i in 2:(2^numberOfItems - 1)) {
  # i's binary expansion selects which columns to use, e.g. i=5 gives
  # combination = 0 0 ... 1 0 1 (binaryLogic package).
  combination <- as.binary(i, n = numberOfItems)
  # Positions of the 1-bits, without growing `model` element by element.
  model <- which(as.logical(combination))
  # Only score subsets with more than 13 items; also skip the data-frame
  # subsetting work for the (many) smaller subsets.
  if (length(model) > 13) {
    itemsToUse <- itemResponses[, model]
    alphaVal <- psych::alpha(itemsToUse)$total$raw_alpha
    if (alphaVal > 0.7 && alphaVal < 0.9) {
      cat(alphaVal)
      print(model)
    }
  }
}
A sample output from this code is as follows:
0.8989831 1 4 5 7 8 9 10 11 13 14 15 16 17 19 20
0.899768 1 4 5 7 8 9 10 11 12 13 15 17 18 19 20
0.899937 1 4 5 7 8 9 10 11 12 13 15 16 17 19 20
0.8980605 1 4 5 7 8 9 10 11 12 13 14 15 17 19 20
Here are the first 10 rows of data:
dput(itemResponses)
structure(list(CESD1 = c(1, 2, 2, 0, 1, 0, 0, 0, 0, 1), CESD2 = c(2,
3, 1, 0, 0, 1, 1, 1, 0, 1), CESD3 = c(0, 3, 0, 1, 1, 0, 0, 0,
0, 0), CESD4 = c(1, 2, 0, 1, 0, 1, 1, 1, 0, 0), CESD5 = c(0,
1, 0, 2, 1, 2, 2, 0, 0, 0), CESD6 = c(0, 3, 0, 1, 0, 0, 2, 0,
0, 0), CESD7 = c(1, 2, 1, 1, 2, 0, 1, 0, 1, 0), CESD8 = c(1,
3, 1, 1, 0, 1, 0, 0, 1, 0), CESD9 = c(0, 1, 0, 2, 0, 0, 1, 1,
0, 1), CESD10 = c(0, 1, 0, 2, 0, 0, 1, 1, 0, 1), CESD11 = c(0,
2, 1, 1, 1, 1, 2, 3, 0, 0), CESD12 = c(0, 3, 1, 1, 1, 0, 2, 0,
0, 0), CESD13 = c(0, 3, 0, 2, 1, 2, 1, 0, 1, 0), CESD14 = c(0,
3, 1, 2, 1, 1, 1, 0, 1, 1), CESD15 = c(0, 2, 0, 1, 0, 1, 0, 1,
1, 0), CESD16 = c(0, 2, 2, 0, 0, 1, 1, 0, 0, 0), CESD17 = c(0,
0, 0, 0, 0, 1, 1, 0, 0, 0), CESD18 = c(0, 2, 0, 0, 0, 0, 0, 0,
0, 1), CESD19 = c(0, 3, 0, 0, 0, 0, 0, 1, 1, 0), CESD20 = c(0,
3, 0, 1, 0, 0, 0, 0, 0, 0)), .Names = c("CESD1", "CESD2", "CESD3",
"CESD4", "CESD5", "CESD6", "CESD7", "CESD8", "CESD9", "CESD10",
"CESD11", "CESD12", "CESD13", "CESD14", "CESD15", "CESD16", "CESD17",
"CESD18", "CESD19", "CESD20"), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
The idea is to replace the computation of alpha with the so-called discrimination for each item from classical test theory (CTT). The discrimination is the correlation of the item score with a "true score" (which we would assume to be the row sum).
Let the data be
# The CES-D item responses from the question (10 respondents x 20 items),
# re-entered as `dat` for the answer below.
dat <- structure(list(CESD1 = c(1, 2, 2, 0, 1, 0, 0, 0, 0, 1), CESD2 = c(2, 3, 1, 0, 0, 1, 1, 1, 0, 1),
CESD3 = c(0, 3, 0, 1, 1, 0, 0, 0, 0, 0), CESD4 = c(1, 2, 0, 1, 0, 1, 1, 1, 0, 0),
CESD5 = c(0, 1, 0, 2, 1, 2, 2, 0, 0, 0), CESD6 = c(0, 3, 0, 1, 0, 0, 2, 0, 0, 0),
CESD7 = c(1, 2, 1, 1, 2, 0, 1, 0, 1, 0), CESD8 = c(1, 3, 1, 1, 0, 1, 0, 0, 1, 0),
CESD9 = c(0, 1, 0, 2, 0, 0, 1, 1, 0, 1), CESD10 = c(0, 1, 0, 2, 0, 0, 1, 1, 0, 1),
CESD11 = c(0, 2, 1, 1, 1, 1, 2, 3, 0, 0), CESD12 = c(0, 3, 1, 1, 1, 0, 2, 0, 0, 0),
CESD13 = c(0, 3, 0, 2, 1, 2, 1, 0, 1, 0), CESD14 = c(0, 3, 1, 2, 1, 1, 1, 0, 1, 1),
CESD15 = c(0, 2, 0, 1, 0, 1, 0, 1, 1, 0), CESD16 = c(0, 2, 2, 0, 0, 1, 1, 0, 0, 0),
CESD17 = c(0, 0, 0, 0, 0, 1, 1, 0, 0, 0), CESD18 = c(0, 2, 0, 0, 0, 0, 0, 0, 0, 1),
CESD19 = c(0, 3, 0, 0, 0, 0, 0, 1, 1, 0), CESD20 = c(0, 3, 0, 1, 0, 0, 0, 0, 0, 0)),
.Names = c("CESD1", "CESD2", "CESD3", "CESD4", "CESD5", "CESD6", "CESD7", "CESD8", "CESD9",
"CESD10", "CESD11", "CESD12", "CESD13", "CESD14", "CESD15", "CESD16", "CESD17",
"CESD18", "CESD19", "CESD20"), row.names = c(NA, -10L),
class = c("tbl_df", "tbl", "data.frame"))
We compute (1) the discrimination and (2) the alpha coefficient.
# For each item: (1) its discrimination -- the correlation of the item with
# the sum of the remaining items -- and (2) the alpha obtained if the item
# were dropped.
stat <- t(vapply(seq_len(ncol(dat)), function(ii) {
  dd <- dat[[ii]]
  # Discrimination is undefined for a zero-variance item; the original's
  # `if` without `else` silently returned NULL there, which would corrupt
  # the sapply result. Return NA_real_ instead.
  disc <- if (var(dd, na.rm = TRUE) > 0) {
    cor(dd, rowSums(dat[, -ii]), use = "pairwise")
  } else {
    NA_real_
  }
  # Alpha that would be obtained when we skip this item.
  alpha <- psych::alpha(dat[, -ii])$total$raw_alpha
  c(disc, alpha)
}, numeric(2)))
dimnames(stat) <- list(colnames(dat), c("disc", "alpha^I"))
stat <- data.frame(stat)
Observe that the discrimination (which is more efficient to compute) is inversely proportional to alpha that is obtained when deleting this item. In other words, alpha is highest when there are many high "discriminating" items (that correlate with each other).
# Scatterplot of discrimination vs. leave-one-out alpha (the text argues
# these should be inversely related).
plot(stat, pch = 19)
Use this information to select the sequence with which the items should be deleted to fall below a benchmark (say .9, since the toy data doesn't allow for a lower mark):
1) delete as many items as possible to stay above the benchmark; that is, start with the least discriminating items.
# Strategy 1: delete as MANY items as possible while staying above the
# benchmark -- process items from least to most discriminating.
stat <- stat[order(stat$disc), ]
this <- sapply(1:(nrow(stat)-2), function(ii){
# Drop the ii least-discriminating items and recompute alpha on the rest.
ind <- match(rownames(stat)[1:ii], colnames(dat))
alpha <- psych::alpha(dat[, -ind, drop = FALSE])$total$raw_alpha
})
# NOTE(review): which(this > .9) yields a contiguous prefix only if alpha
# declines monotonically as items are removed -- verify on real data.
delete_these <- rownames(stat)[which(this > .9)]
psych::alpha(dat[, -match(delete_these, colnames(dat)), drop = FALSE])$total$raw_alpha
length(delete_these)
2) delete as few items as possible to stay above the benchmark; that is, start with the highest discriminating items.
# Strategy 2: delete as FEW items as possible while staying above the
# benchmark -- process items from most to least discriminating.
stat <- stat[order(stat$disc, decreasing = TRUE), ]
this <- sapply(1:(nrow(stat)-2), function(ii){
# Drop the ii most-discriminating items and recompute alpha on the rest.
ind <- match(rownames(stat)[1:ii], colnames(dat))
alpha <- psych::alpha(dat[, -ind, drop = FALSE])$total$raw_alpha
})
# NOTE(review): as above, this assumes the alpha sequence crosses the .9
# benchmark only once -- verify on real data.
delete_these <- rownames(stat)[which(this > .9)]
psych::alpha(dat[, -match(delete_these, colnames(dat)), drop = FALSE])$total$raw_alpha
length(delete_these)
Note, that 1) is coherent with classical item selection procedures in (psychological/educational) diagnostic/assessments: remove items from the assessment, that fall below a benchmark in terms of discriminatory power.
I changed the code as follows; now I am dropping a fixed number of items and changing the value of numberOfItemsToDrop from 1 to 20 manually. Although it is a little better, it is still taking too long to run :(
I hope there is some better way of doing this.
numberOfItemsToDrop <- 13
# combn() returns one combination per COLUMN, so iterate over columns
# directly; the original recomputed the count as
# length(combinations)/numberOfItemsToDrop, which is just ncol(combinations).
combinations <- combinat::combn(20, numberOfItemsToDrop)
for (i in seq_len(ncol(combinations))) {
  # Items to drop for this candidate subset.
  model <- combinations[, i]
  itemsToUse <- itemResponses[, -model]
  alphaVal <- psych::alpha(itemsToUse)$total$raw_alpha
  if (alphaVal < 0.82) {
    cat("Cronbach's alpha =", alphaVal, ", number of items dropped = ", length(model), " :: ")
    print(model)
  }
}

R - Predict(), renaming columns, and " had 10 rows but variables found have 20 rows "

From other threads I've seen people provide solutions that are specific to exact problems, but I don't understand the underlying reason of what's going wrong.
I do...
# Asker's code: A..I are free-standing vectors (length 20), not columns of a
# data frame passed via `data=`. As the answer below explains, predict() then
# ignores `newdata`, because the formula's variables are resolved from the
# calling environment -- hence the "10 rows but ... 20 rows" warning.
modTest = glm( trainLabels[,1] ~ A + B + C +
D + E + F + G +
H + I, family=binomial(link='logit') )
The above is 20 labels, and 9 vectors each with 20 values.
I then try to predict on 10 unseen examples. This is 10 rows, 9 features, same order.
preds = predict( modTest, testFeatures )
I get the error...
Warning message:
'newdata' had 10 rows but variables found have 20 rows
Edit : Simplified, removed long feature names, etc.
> names(trainFeatures)
[1] "Neg" "Pos" "Num" "UN" "UNA" "UNUA" "UP" "UPA" "UPUA"
names(testFeatures)
[1] "Neg" "Pos" "Num" "UN" "UNA" "UNUA" "UP" "UPA" "UPUA"
Edit: Dputs...
To use the dputs, what I did was...
modTest = glm( trainLabels[,1] ~ as.matrix(trainFeatures) )
preds = predict( modTest, testFeatures )
Warning message:
'newdata' had 10 rows but variables found have 20 rows
Not sure why I'm getting that warning still.
dput(trainLabels)
structure(list(Neg = c(1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
0, 0, 0, 1, 1, 1, 0), Pos = c(1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
1, 1, 0, 0, 0, 1, 1, 1, 0), Num = c(1, 1, 0, 0, 0, 0, 1, 0, 0,
0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0), UN = c(1, 1, 0, 0, 0, 0, 1,
0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0), UNA = c(1, 1, 0, 0, 0,
0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0), UNUA = c(1, 1,
0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0), UP = c(1,
1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0), UPA = c(1,
1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0), UPUA = c(1,
1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0)), .Names = c("Neg",
"Pos", "Num", "UN", "UNA", "UNUA", "UP", "UPA", "UPUA"), row.names = c(NA,
-20L), class = "data.frame")
dput(trainFeatures)
structure(list(Neg = c(39106, 44664, 114130, 26526, 22122, 19175,
29438, 17741, 17589, 20666, 66024, 168336, 86283, 74826, 88998,
75756, 16041, 17087, 15235, 16659), Pos = c(16129, 21064, 57730,
10314, 18105, 16837, 19300, 16873, 13681, 18414, 27148, 120497,
60031, 49016, 59250, 36264, 15786, 16315, 14556, 16057), Num = c(82994,
121367, 306842, 55458, 69148, 63167, 85891, 58674, 55874, 67505,
152475, 427106, 221043, 190043, 223744, 177388, 51657, 54883,
48378, 54115), UN = c(32343, 35433, 74835, 22271, 17686, 15498,
22416, 14238, 14078, 16800, 54636, 121211, 68079, 59913, 70884,
61408, 13221, 14114, 12647, 13487), UNA = c(95.1499874, 95.0987263,
95.3942596, 95.5444865, 113.1263844, 112.3827424, 111.2684513,
113.2184128, 112.4336258, 114.1739588, 113.5086472, 111.6715378,
112.2842917, 111.9490612, 113.6465561, 111.5254103, 112.2179148,
111.2933853, 112.9056117, 113.1511475), UNUA = c(-94.4280737,
-94.5019854, -94.9246672, -95.0379578, -113.2247115, -112.3497485,
-111.1631387, -113.2051289, -112.1822898, -114.0431466, -113.7435412,
-111.6226818, -112.4077795, -111.9886653, -113.8072166, -111.6138577,
-113.0855995, -112.3075275, -114.2628431, -114.1088453), UP = c(10384,
13015, 24470, 6891, 13445, 12852, 13008, 13093, 9878, 14272,
14938, 77058, 40595, 32518, 39889, 21424, 8322, 8451, 7440, 8071
), UPA = c(58.6289931, 57.73430079, 61.3480343, 57.8297594, 62.1749994,
65.1140073, 62.619361, 63.6791219, 63.412582, 65.1856906, 45.18365794,
71.32918265, 56.04488913, 58.13008276, 53.16603128, 50.36242011,
64.6742956, 64.0982314, 63.4422878, 64.24099034), UPUA = c(88.9216885,
88.3012858, 88.1996008, 88.9910129, 91.0232669, 89.4524702, 91.9122816,
89.8549338, 90.6487273, 88.2063941, 99.9573821, 109.9128868,
103.7989926, 104.0274764, 103.4209936, 101.5065677, 85.8110039,
87.0786241, 86.1020646, 86.8835026)), .Names = c("Neg", "Pos",
"Num", "UN", "UNA", "UNUA", "UP", "UPA", "UPUA"), row.names = c(NA,
-20L), class = "data.frame")
dput(testLabels)
structure(list(Neg = c(0, 1, 1, 1, 0, 1, 1, 1, 1, 1), Pos = c(0,
1, 1, 1, 0, 1, 1, 1, 1, 1), Num = c(0, 1, 1, 1, 0, 1, 1, 1, 1,
1), UN = c(0, 1, 1, 1, 0, 1, 1, 1, 1, 1), UNA = c(0, 1, 1, 1,
0, 1, 1, 1, 1, 1), UNUA = c(0, 1, 1, 1, 0, 1, 1, 1, 1, 1), UP = c(0,
1, 1, 1, 0, 1, 1, 1, 1, 1), UPA = c(0, 1, 1, 1, 0, 1, 1, 1, 1,
1), UPUA = c(0, 1, 1, 1, 0, 1, 1, 1, 1, 1)), .Names = c("Neg",
"Pos", "Num", "UN", "UNA", "UNUA", "UP", "UPA", "UPUA"), row.names = c(NA,
-10L), class = "data.frame")
> dput(testFeatures)
structure(list(Neg = c(51404, 32447, 24642, 95979, 15743, 90843,
13813, 11496, 12871, 13546), Pos = c(23350, 13525, 19941, 49984,
10867, 64404, 13324, 11302, 12918, 13118), Num = c(121342, 68160,
77219, 248890, 49259, 232645, 43707, 35674, 40734, 42979), UN = c(40766,
27363, 19590, 71772, 12615, 71496, 11529, 9739, 10810, 11346),
UNA = c(95.2486872, 93.4642772, 111.3853297, 112.6770471,
110.0845355, 113.6696598, 111.8409793, 116.0476022, 120.3481302,
111.9496978), UNUA = c(-94.6150698, -92.5605373, -111.1994432,
-112.4947319, -109.7130777, -113.8083912, -112.5678322, -116.5407619,
-121.4756386, -113.4991191), UP = c(14285, 9043, 14862, 31626,
7491, 43903, 7021, 5559, 6149, 6789), UPA = c(61.25585053,
62.6231081, 64.191128, 64.6397131, 63.4911744, 58.4792454,
63.5063289, 60.5667637, 60.3857056, 64.1569975), UPUA = c(88.4605419,
88.2790682, 90.0217465, 88.8441004, 91.0222662, 105.0494229,
85.8914139, 86.7685668, 84.8304901, 86.9786109)), .Names = c("Neg",
"Pos", "Num", "UN", "UNA", "UNUA", "UP", "UPA", "UPUA"), row.names = c(NA,
-10L), class = "data.frame")
So, I ran the code with all the data you provided and get the results just fine. Here is the model fit:
# Fit with the predictors resolved through `data=` so that predict() can find
# the same column names in `newdata` (testFeatures has identical names).
# BUG FIX: the data argument was misspelled `trainFeatues` in the original;
# the data frame dput above is named `trainFeatures`.
modTest <- glm(trainLabels[, 1] ~ Neg + Pos + Num +
                 UN + UNA + UNUA + UP +
                 UPA + UPUA,
               family = binomial(link = 'logit'),
               data = trainFeatures)
Here are the predicted values on test data:
predict( modTest, testFeatures)
1 2 3 4 5 6 7 8
4.6711576 -1.3572345 -2.0639104 18.7625539 -7.6961149 0.4317324 -0.8983256 -8.2052158
9 10
-1.5968013 10.8357174
NOTE: an alternative specification can be like this:
# Alternative (problematic) specification shown for contrast: the training
# vectors are baked into the formula via `trainFeatues$...`, so predict()
# cannot substitute newdata -- this is what yields 20 fitted values and the
# "10 rows but ... 20 rows" warning demonstrated below.
# NOTE(review): "trainFeatues" (sic) is spelled this way throughout this
# answer; it must match whatever name the data frame has in the session.
modTest = glm(trainLabels[,1] ~ trainFeatues$Neg + trainFeatues$Pos +
trainFeatues$Num + trainFeatues$UN + trainFeatues$UNA +
trainFeatues$UNUA + trainFeatues$UP + trainFeatues$UPA +
trainFeatues$UPUA, family=binomial(link='logit'))
However, the fit model is as follows:
modTest$coefficients
(Intercept) trainFeatues$Neg trainFeatues$Pos trainFeatues$Num trainFeatues$UN
4.027803e+01 8.874801e-04 -3.000123e-03 1.277138e-04 -4.521793e-04
trainFeatues$UNA trainFeatues$UNUA trainFeatues$UP trainFeatues$UPA trainFeatues$UPUA
-1.519463e+01 -1.480503e+01 2.930261e-03 4.741432e-01 -3.690940e-01
When you feed the train data to predict, this is causing problems since the features fit above are not matching the new data being fed to predict. Leading to:
predict( modTest, testFeatures)
1 2 3 4 5 6 7
0.21651890 3.23450117 -2.16298672 -0.06949967 -0.91026504 -0.91484739 -1.69209826
8 9 10 11 12 13 14
-2.45603982 -6.35855600 -1.84871546 -0.25027815 2.72625440 -0.50422297 -1.76701963
15 16 17 18 19 20
0.05033351 0.65101666 0.27680835 1.79176029 6.79618311 -0.16186455
Warning message:
'newdata' had 10 rows but variables found have 20 rows

R error when trying to cluster data using pam (package cluster)

I am trying to run k-means clustering on a data set which was preprocessed (categorical to dummy, na cleaning etc.).
here is an extract (head) of the data:
dput(head(clustering.set.in))
structure(list(activity_type = c(1, 1, 1, 1, 1, 1), app_id.PXkw7OJ1se = c(0,
1, 1, 1, 1, 0), app_id.PXszbKVa5M = c(0, 0, 0, 0, 0, 0), app_id.PXw3GFQKBm = c(1,
0, 0, 0, 0, 0), browser_version = c(48, 42, 9, 9, 48, 44), continent.AS = c(0,
1, 1, 0, 0, 0), continent.EU = c(0, 0, 0, 0, 1, 0), continent.SA = c(0,
0, 0, 0, 0, 0), f_activex = c(1, 1, 1, 1, 1, 1), f_atob = c(2,
2, 2, 2, 2, 2), f_audio = c(2, 2, 2, 2, 2, 2), f_battery = c(2,
2, 1, 1, 2, 2), f_bind = c(2, 2, 2, 2, 2, 2), f_flash = c(1,
2, 2, 2, 2, 2), f_getComputedStyle = c(2, 2, 2, 2, 2, 2), f_matchSelector = c(2,
2, 2, 2, 2, 2), f_mimeTypes = c(2, 2, 2, 2, 2, 2), f_mimeTypesLength = c(0,
8, 11, 55, 7, 8), f_navigationTiming = c(2, 2, 1, 2, 2, 2), f_orientationEvents = c(2,
1, 1, 1, 1, 1), f_plugins = c(2, 2, 2, 2, 2, 2), f_pluginsLength = c(0,
6, 6, 15, 5, 6), f_raf = c(2, 2, 2, 2, 2, 2), f_resourceTiming = c(2,
2, 1, 1, 2, 2), f_sse = c(2, 2, 2, 2, 2, 2), f_webgl = c(1, 2,
2, 2, 2, 1), f_websql = c(1, 2, 2, 2, 2, 2), f_xdr = c(1, 1,
1, 1, 1, 1), n_appCodeName = c(2, 2, 2, 2, 2, 2), n_doNotTrack = c(2,
2, 1, 2, 2, 2), n_geolocation = c(2, 2, 2, 2, 2, 2), n_mimeTypes = c(2,
2, 2, 2, 2, 2), n_platform.iPhone = c(0, 0, 0, 0, 0, 0), n_platform.Linux.armv7l = c(1,
0, 0, 0, 0, 0), n_platform.MacIntel = c(0, 0, 1, 1, 0, 0), n_platform.Win32 = c(0,
1, 0, 0, 1, 0), n_plugins = c(2, 2, 2, 2, 2, 2), n_product.Sub20030107 = c(1,
1, 1, 1, 1, 1), n_product.Sub20100101 = c(0, 0, 0, 0, 0, 0),
n_product.Submissing = c(0, 0, 0, 0, 0, 0), os_family.Android = c(1,
0, 0, 0, 0, 0), os_family.iOS = c(0, 0, 0, 0, 0, 0), os_family.Mac.OS.X = c(0,
0, 1, 1, 0, 0), os_family.Windows = c(0, 1, 0, 0, 1, 0),
os_version = c(6, 8.1, 10, 10, 7, 0), site_history_length = c(31,
1, 1, 1, 1, 1), w_chrome...loadTimes....csi....app....webstore....runtime.. = c(0,
1, 0, 0, 1, 0), w_chrome...loadTimes....csi.. = c(0, 0, 0,
0, 0, 0), w_chrome... = c(1, 0, 1, 1, 0, 0), window_dimensions = c(2,
1, 2, 2, 2, 2), window_history = c(50, 1, 1, 1, 1, 3)), .Names = c("activity_type",
"app_id.PXkw7OJ1se", "app_id.PXszbKVa5M", "app_id.PXw3GFQKBm",
"browser_version", "continent.AS", "continent.EU", "continent.SA",
"f_activex", "f_atob", "f_audio", "f_battery", "f_bind", "f_flash",
"f_getComputedStyle", "f_matchSelector", "f_mimeTypes", "f_mimeTypesLength",
"f_navigationTiming", "f_orientationEvents", "f_plugins", "f_pluginsLength",
"f_raf", "f_resourceTiming", "f_sse", "f_webgl", "f_websql",
"f_xdr", "n_appCodeName", "n_doNotTrack", "n_geolocation", "n_mimeTypes",
"n_platform.iPhone", "n_platform.Linux.armv7l", "n_platform.MacIntel",
"n_platform.Win32", "n_plugins", "n_product.Sub20030107", "n_product.Sub20100101",
"n_product.Submissing", "os_family.Android", "os_family.iOS",
"os_family.Mac.OS.X", "os_family.Windows", "os_version", "site_history_length",
"w_chrome...loadTimes....csi....app....webstore....runtime..",
"w_chrome...loadTimes....csi..", "w_chrome...", "window_dimensions",
"window_history"), row.names = c(NA, 6L), class = "data.frame")
I am trying to cluster kmeans this data sets (k=2)
and getting error message:
Error in pam(clustering.set.in, k) :
negative length vectors are not allowed
my line of code:
pam(clustering.set.in, 2)
Any suggestions ?
It turns out that one column had NA values in it.
I replaced the NAs with
# Replace every NA cell with 1 (imputation). Note this does NOT remove the
# offending column -- it overwrites the missing values so pam() can run.
new.data[is.na(new.data)] <- 1
and it seems to work fine now

Resources