R Summarize Collapsed Data.Table

I have data such as this:
data <- data.table(
"School" = c(1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1,
1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0),
"Grade" = c(0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1,
0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0),
"CAT" = c(1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0,
0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1),
"FOX" = c(1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1,
1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0),
"DOG" = c(0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0,
0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1)
)
and wish to achieve a new data table such as this:
dataWANT <- data.frame(
"VARIABLE" = c('CAT', 'CAT', 'CAT', 'FOX', 'FOX', 'FOX', 'DOG', 'DOG', 'DOG'),
"SCHOOL" = c(1, 1, 0, 1, 1, 0, 1, 1, 0),
"GRADE" = c(0, 1, 1, 0, 1, 1, 0, 1, 1),
"MEAN" = c(NA)
)
dataWANT takes the mean of CAT, FOX, and DOG by SCHOOL, by GRADE, and by SCHOOL x GRADE, keeping only the groups where those variables equal 1.
I know how to do this one variable at a time, but that does not scale to a big dataset.
data[, CAT1 := mean(CAT), by = list(School)]
data[, FOX1 := mean(FOX), by = list(Grade)]
data[, DOG1 := mean(DOG), by = list(School, Grade)]
data$CAT2 = unique(data[School == 1, CAT1])
data$FOX2 = unique(data[Grade == 1, FOX1])
data$DOG2 = unique(data[School == 1 & Grade == 1, DOG1])
Please use only this data:
data <- data.table(
"SCHOOL" = c(1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1,
1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0),
"GRADE" = c(0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1,
0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0),
"CAT" = c(1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0,
0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1),
"FOX" = c(1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1,
1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0),
"DOG" = c(0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0,
0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1)
)
data[, CAT1 := mean(CAT), by = list(SCHOOL)]
data[, CAT2 := mean(CAT), by = list(GRADE)]
data[, CAT3 := mean(CAT), by = list(SCHOOL, GRADE)]
data[, FOX1 := mean(FOX), by = list(SCHOOL)]
data[, FOX2 := mean(FOX), by = list(GRADE)]
data[, FOX3 := mean(FOX), by = list(SCHOOL, GRADE)]
data[, DOG1 := mean(DOG), by = list(SCHOOL)]
data[, DOG2 := mean(DOG), by = list(GRADE)]
data[, DOG3 := mean(DOG), by = list(SCHOOL, GRADE)]
dataWANT <- data.frame(
"VARIABLE" = c('CAT', 'CAT', 'CAT', 'FOX', 'FOX', 'FOX', 'DOG', 'DOG', 'DOG'),
"TYPE" = c(1, 2, 3, 1, 2, 3, 1, 2, 3),
"MEAN" = c(0.48, 0.44, 0.428, 0.6, 0.611, 0.6428, 0.52, 0.61, 0.6428)
)
where:
TYPE equals 1 when MEAN is estimated by SCHOOL,
TYPE equals 2 when MEAN is estimated by GRADE,
TYPE equals 3 when MEAN is estimated by SCHOOL and GRADE.

We could melt the dataset, take the MEAN for each grouping set in a list, and then combine the results with rbindlist (as in the other post):
library(data.table)
cols <- c('CAT', 'FOX', 'DOG')
data1 <- melt(data, measure.vars = cols)
list_cols <- list('SCHOOL', 'GRADE', c('SCHOOL', 'GRADE'))
lst1 <- lapply(list_cols, function(x)
  data1[, .(MEAN = mean(value, na.rm = TRUE)), c(x, 'variable')])
rbindlist(lapply(lst1, function(x) {
  nm1 <- setdiff(names(x), c('variable', 'MEAN'))
  x[Reduce(`&`, lapply(mget(nm1), as.logical)),
    .(VARIABLE = variable, MEAN)]
}), idcol = 'TYPE')[order(VARIABLE)]
# TYPE VARIABLE MEAN
#1: 1 CAT 0.4800000
#2: 2 CAT 0.4444444
#3: 3 CAT 0.4285714
#4: 1 FOX 0.6000000
#5: 2 FOX 0.5555556
#6: 3 FOX 0.6428571
#7: 1 DOG 0.5200000
#8: 2 DOG 0.6111111
#9: 3 DOG 0.6428571
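For completeness, data.table also ships groupingsets(), which evaluates a j expression once per grouping set in a single call, so the three groupings don't need a manual lapply. A minimal sketch, assuming the SCHOOL/GRADE version of the data from the question; the final filter keeps the rows where every active grouping variable equals 1, which should reproduce the nine rows above:
library(data.table)
cols <- c('CAT', 'FOX', 'DOG')
res <- groupingsets(data, j = lapply(.SD, mean), by = c('SCHOOL', 'GRADE'),
                    sets = list('SCHOOL', 'GRADE', c('SCHOOL', 'GRADE')),
                    .SDcols = cols)
# grouping columns not used by a set come back as NA, so treat NA as "not in this set"
long <- melt(res, id.vars = c('SCHOOL', 'GRADE'),
             variable.name = 'VARIABLE', value.name = 'MEAN')
long[(is.na(SCHOOL) | SCHOOL == 1) & (is.na(GRADE) | GRADE == 1)]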

Do you mean to get something like this?
library(data.table)
melt(data, measure.vars = c('CAT', 'FOX', 'DOG'))[,
  .(MEAN = mean(value, na.rm = TRUE)), .(School, Grade, variable)]
To group by different sets of columns, we can do:
cols <- c('CAT', 'FOX', 'DOG')
data1 <- melt(data, measure.vars = cols)
list_cols <- list('School', 'Grade', c('School', 'Grade'))
lapply(list_cols, function(x)
  data1[, .(MEAN = mean(value, na.rm = TRUE)), c(x, 'variable')])

You could subset and calculate your means first using lapply(.SD, ...), then melt that into your output:
melt(data[School != 0 | Grade != 0, lapply(.SD, mean), by = .(School, Grade)],
     id.vars = c("School", "Grade"))
Adding this afterwards also adds the TYPE variable:
...][, TYPE := School + (2 * Grade)]
Putting it all together and tidying it up, it matches your desired output:
dataWANT <- melt(data[School != 0 | Grade != 0, lapply(.SD, mean), by = .(School, Grade)],
                 id.vars = c("School", "Grade"))[
  , TYPE := School + (2 * Grade)][
    order(variable, TYPE), .("VARIABLE" = variable, TYPE, "MEAN" = value)]


R function to change value after a condition has been fulfilled

Participants in an experiment took a test with a rule that says "once a participant has gotten 6 items wrong in a window of 8 items, you stop running the test". However, some experimenters kept testing past this point. I now need a way to automatically find where the test should have been stopped and change all values after that point to 0 (= item wrong). I am not even sure this is something that can be done in R.
To be clear, I would like to go row by row (rows are participants): once there are six 0s within a window of 8 columns (items), all values after the sixth 0 should become 0 too.
While the reproducible data is below, here is a visualization of what I need, where the blue cells are the ones that should change to 0:
[Screenshots omitted: "Pre-changes" and "Post-changes" views of the data.]
Reproducible data:
structure(list(Participant_ID = c("E01P01", "E01P02", "E01P03",
"E01P04", "E01P05", "E01P06", "E01P07", "E01P08", "E02P01", "E02P02"
), A2 = c(1, 1, 1, 0, 0, 1, 1, 1, 1, 1), A3 = c(1, 1, 0, 0, 0,
1, 0, 0, 0, 0), B1 = c(1, 1, 1, 0, 0, 1, 0, 0, 1, 1), B2 = c(1,
1, 1, 1, 1, 1, 0, 0, 0, 1), C3 = c(1, 0, 0, 1, 0, 1, 0, 0, 0,
1), C4 = c(1, 0, 0, 0, 0, 1, 0, 0, 1, 1), D1 = c(1, 0, 0, 0,
0, 1, 0, 0, 0, 0), D3 = c(1, 1, 1, 1, 0, 0, 1, 0, 0, 1), E1 = c(1,
0, 0, 0, 0, 1, 0, 0, 0, 1), E3 = c(1, 1, 0, 1, 0, 1, 0, 0, 0,
0), F1 = c(1, 0, 0, 0, 1, 0, 0, 1, 0, 0), F4 = c(1, 1, 1, 1,
0, 1, 0, 1, 1, 0), G1 = c(1, 0, 0, 0, 0, 1, 0, 0, 0, 1), G2 = c(0,
0, 0, 0, 1, 1, 1, 0, 1, 1)), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
Any help is highly appreciated!
Here is a solution that involves some pivoting, rollsum, cumsum, and if_else logic, then pivoting back. Let me know if it works.
library(tidyverse)
library(zoo)
structure(list(Participant_ID = c("E01P01", "E01P02", "E01P03",
"E01P04", "E01P05", "E01P06", "E01P07", "E01P08", "E02P01", "E02P02"
), A2 = c(1, 1, 1, 0, 0, 1, 1, 1, 1, 1), A3 = c(1, 1, 0, 0, 0,
1, 0, 0, 0, 0), B1 = c(1, 1, 1, 0, 0, 1, 0, 0, 1, 1), B2 = c(1,
1, 1, 1, 1, 1, 0, 0, 0, 1), C3 = c(1, 0, 0, 1, 0, 1, 0, 0, 0,
1), C4 = c(1, 0, 0, 0, 0, 1, 0, 0, 1, 1), D1 = c(1, 0, 0, 0,
0, 1, 0, 0, 0, 0), D3 = c(1, 1, 1, 1, 0, 0, 1, 0, 0, 1), E1 = c(1,
0, 0, 0, 0, 1, 0, 0, 0, 1), E3 = c(1, 1, 0, 1, 0, 1, 0, 0, 0,
0), F1 = c(1, 0, 0, 0, 1, 0, 0, 1, 0, 0), F4 = c(1, 1, 1, 1,
0, 1, 0, 1, 1, 0), G1 = c(1, 0, 0, 0, 0, 1, 0, 0, 0, 1), G2 = c(0,
0, 0, 0, 1, 1, 1, 0, 1, 1)), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame")) %>%
  as_tibble() %>%
  pivot_longer(-1) %>%
  group_by(Participant_ID) %>%
  mutate(running_total = zoo::rollsumr(value == 0, k = 8, fill = 0),
         should_terminate = cumsum(running_total >= 6),
         value = if_else(should_terminate > 0, 0, value)) %>%
  ungroup() %>%
  select(Participant_ID, name, value) %>%
  pivot_wider(names_from = name, values_from = value)
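To see the windowed-stop rule in isolation, here is a minimal sketch on a single made-up response vector (the values are hypothetical, not from the question's data):
library(zoo)
x <- c(1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1)
wrong_in_window <- rollsumr(x == 0, k = 8, fill = 0) # zeros among the last 8 items
x[cumsum(wrong_in_window >= 6) > 0] <- 0 # zero out everything from the first trigger on
x
# [1] 1 1 0 0 1 0 0 0 0 0 0 0
The sixth wrong answer within a window of 8 lands at position 9, so positions 9 onward (including the two trailing 1s) are forced to 0, which is exactly what the mutate() above does per participant.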

Chi Square Test of Independence of Whole Dataset

I have a 3185x90 dataset of binary values and want to do a chi-squared test of independence, comparing all column variables against each other.
I've tried using different variations of code from Google searches with chisq.test() and some for loops, but none of them have worked so far.
How do I do this?
This is the frame I've tinkered with. My dataset is oak.
chi_trial <- data.frame(a = c(0, 1), b = c(0, 1))
for (row in 1:nrow(oak)) {
  print(row)
  print(chisq.test(c(oak[row, 1], d[row, 2])))
}
I also tried this:
apply(d, 1, chisq.test)
which gives me the error: Error in FUN(newX[, i], ...) : all entries of 'x' must be nonnegative and finite
dput(oak[1:2])
structure(list(post_flu = structure(c(1, 1, 1, 1, 1, 0, 0, 0,
0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1,
1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
label = "Receipt of Flu Vaccine - Encounter Survey", format.stata = "%10.0g")), row.names = c(NA,
-3185L), class = c("tbl_df", "tbl", "data.frame"), label = "Main Oakland Clinic Analysis Dataset")
I added a sample of my data with the final lines of the output. The portion of the dataset is small, but it all looks like this.
You could use something like the code below, which is similar to R's cor function. I don't have your data, so I'm simulating some. Note that I get one significant p-value, using the traditional cut-off of 0.05.
set.seed(3)
nr=3185; nc=3
oak <- as.data.frame(matrix(sample(0:1, size=nr*nc, replace=TRUE), ncol=nc))
oak
mult.chi <- function(data) {
  nc <- ncol(data)
  res <- matrix(0, nrow = nc, ncol = nc) # or NA
  for (i in 1:(nc - 1))
    for (j in (i + 1):nc)
      res[i, j] <- suppressWarnings(chisq.test(data[, i], data[, j])$p.value)
  rownames(res) <- colnames(data)
  colnames(res) <- colnames(data)
  res
}
mult.chi(oak)
# V1 V2 V3
# V1 0 0.7847063 0.32012466
# V2 0 0.0000000 0.01410326
# V3 0 0.0000000 0.00000000
So consider applying a multiple testing adjustment as mentioned in the comments.
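For instance, the upper-triangle p-values can be corrected with p.adjust; a minimal sketch reusing mult.chi() from above (Holm's method shown, but any method that p.adjust supports would do):
pvals <- mult.chi(oak)
pvals[upper.tri(pvals)] <- p.adjust(pvals[upper.tri(pvals)], method = "holm")
pvals # adjusted p-values in the upper triangle, zeros elsewhere as before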
Here is a solution with combn to get all combinations of the column numbers taken two at a time. Tested with the data in @Edward's answer.
chisq2cols <- function(X) {
  y <- matrix(0, ncol(X), ncol(X))
  cmb <- combn(ncol(X), 2)
  y[upper.tri(y)] <- apply(cmb, 2, function(k) {
    tbl <- table(X[k])
    chisq.test(tbl)$p.value
  })
  y
}
chisq2cols(oak)
chisq2cols(oak)
# [,1] [,2] [,3]
#[1,] 0 0.7847063 0.32012466
#[2,] 0 0.0000000 0.01410326
#[3,] 0 0.0000000 0.00000000

Optimum algorithm to check various combinations of items when number of items is too large

I have a data frame which has 20 columns/items in it, and 593 rows (number of rows doesn't matter though) as shown below:
Using this, the reliability of the test comes out as 0.94, computed with alpha from the psych package (psych::alpha). The output also gives the new value of Cronbach's alpha if I drop one of the items. However, I want to know how many items I can drop while retaining an alpha of at least 0.8. I used a brute-force approach: I generate every combination of the items in my data frame and check whether its alpha lies in the range (0.7, 0.9). Is there a better way of doing this? It is taking forever to run because the number of items is too large to check every combination. Below is my current piece of code:
numberOfItems <- 20
for (i in 2:(2^numberOfItems - 1)) {
  # ignoring the first case, i.e. i = 1, as it doesn't represent any model
  # convert the value of i to binary, e.g. i = 5 gives
  # combination = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1
  # (using the binaryLogic package)
  combination <- as.binary(i, n = numberOfItems)
  model <- c()
  for (j in 1:length(combination)) {
    # choose which columns to consider depending on the combination
    if (combination[j])
      model <- c(model, j)
  }
  itemsToUse <- itemResponses[, c(model)]
  # cat(model)
  if (length(model) > 13) {
    alphaVal <- psych::alpha(itemsToUse)$total$raw_alpha
    if (alphaVal > 0.7 && alphaVal < 0.9) {
      cat(alphaVal)
      print(model)
    }
  }
}
A sample output from this code is as follows:
0.8989831 1 4 5 7 8 9 10 11 13 14 15 16 17 19 20
0.899768 1 4 5 7 8 9 10 11 12 13 15 17 18 19 20
0.899937 1 4 5 7 8 9 10 11 12 13 15 16 17 19 20
0.8980605 1 4 5 7 8 9 10 11 12 13 14 15 17 19 20
Here are the first 10 rows of data:
dput(itemResponses)
structure(list(CESD1 = c(1, 2, 2, 0, 1, 0, 0, 0, 0, 1), CESD2 = c(2,
3, 1, 0, 0, 1, 1, 1, 0, 1), CESD3 = c(0, 3, 0, 1, 1, 0, 0, 0,
0, 0), CESD4 = c(1, 2, 0, 1, 0, 1, 1, 1, 0, 0), CESD5 = c(0,
1, 0, 2, 1, 2, 2, 0, 0, 0), CESD6 = c(0, 3, 0, 1, 0, 0, 2, 0,
0, 0), CESD7 = c(1, 2, 1, 1, 2, 0, 1, 0, 1, 0), CESD8 = c(1,
3, 1, 1, 0, 1, 0, 0, 1, 0), CESD9 = c(0, 1, 0, 2, 0, 0, 1, 1,
0, 1), CESD10 = c(0, 1, 0, 2, 0, 0, 1, 1, 0, 1), CESD11 = c(0,
2, 1, 1, 1, 1, 2, 3, 0, 0), CESD12 = c(0, 3, 1, 1, 1, 0, 2, 0,
0, 0), CESD13 = c(0, 3, 0, 2, 1, 2, 1, 0, 1, 0), CESD14 = c(0,
3, 1, 2, 1, 1, 1, 0, 1, 1), CESD15 = c(0, 2, 0, 1, 0, 1, 0, 1,
1, 0), CESD16 = c(0, 2, 2, 0, 0, 1, 1, 0, 0, 0), CESD17 = c(0,
0, 0, 0, 0, 1, 1, 0, 0, 0), CESD18 = c(0, 2, 0, 0, 0, 0, 0, 0,
0, 1), CESD19 = c(0, 3, 0, 0, 0, 0, 0, 1, 1, 0), CESD20 = c(0,
3, 0, 1, 0, 0, 0, 0, 0, 0)), .Names = c("CESD1", "CESD2", "CESD3",
"CESD4", "CESD5", "CESD6", "CESD7", "CESD8", "CESD9", "CESD10",
"CESD11", "CESD12", "CESD13", "CESD14", "CESD15", "CESD16", "CESD17",
"CESD18", "CESD19", "CESD20"), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
The idea is to replace the computation of alpha with the so-called discrimination for each item from classical test theory (CTT). The discrimination is the correlation of the item score with a "true score" (which we would assume to be the row sum).
Let the data be
dat <- structure(list(CESD1 = c(1, 2, 2, 0, 1, 0, 0, 0, 0, 1), CESD2 = c(2, 3, 1, 0, 0, 1, 1, 1, 0, 1),
CESD3 = c(0, 3, 0, 1, 1, 0, 0, 0, 0, 0), CESD4 = c(1, 2, 0, 1, 0, 1, 1, 1, 0, 0),
CESD5 = c(0, 1, 0, 2, 1, 2, 2, 0, 0, 0), CESD6 = c(0, 3, 0, 1, 0, 0, 2, 0, 0, 0),
CESD7 = c(1, 2, 1, 1, 2, 0, 1, 0, 1, 0), CESD8 = c(1, 3, 1, 1, 0, 1, 0, 0, 1, 0),
CESD9 = c(0, 1, 0, 2, 0, 0, 1, 1, 0, 1), CESD10 = c(0, 1, 0, 2, 0, 0, 1, 1, 0, 1),
CESD11 = c(0, 2, 1, 1, 1, 1, 2, 3, 0, 0), CESD12 = c(0, 3, 1, 1, 1, 0, 2, 0, 0, 0),
CESD13 = c(0, 3, 0, 2, 1, 2, 1, 0, 1, 0), CESD14 = c(0, 3, 1, 2, 1, 1, 1, 0, 1, 1),
CESD15 = c(0, 2, 0, 1, 0, 1, 0, 1, 1, 0), CESD16 = c(0, 2, 2, 0, 0, 1, 1, 0, 0, 0),
CESD17 = c(0, 0, 0, 0, 0, 1, 1, 0, 0, 0), CESD18 = c(0, 2, 0, 0, 0, 0, 0, 0, 0, 1),
CESD19 = c(0, 3, 0, 0, 0, 0, 0, 1, 1, 0), CESD20 = c(0, 3, 0, 1, 0, 0, 0, 0, 0, 0)),
.Names = c("CESD1", "CESD2", "CESD3", "CESD4", "CESD5", "CESD6", "CESD7", "CESD8", "CESD9",
"CESD10", "CESD11", "CESD12", "CESD13", "CESD14", "CESD15", "CESD16", "CESD17",
"CESD18", "CESD19", "CESD20"), row.names = c(NA, -10L),
class = c("tbl_df", "tbl", "data.frame"))
We compute (1) the discrimination and (2) the alpha coefficient.
stat <- t(sapply(1:ncol(dat), function(ii) {
  dd <- dat[, ii]
  # discrimination is the correlation of the item with the row sum of the other items
  disc <- if (var(dd, na.rm = TRUE) > 0) cor(dd, rowSums(dat[, -ii]), use = "pairwise")
  # alpha that would be obtained when we skip this item
  alpha <- psych::alpha(dat[, -ii])$total$raw_alpha
  c(disc, alpha)
}))
dimnames(stat) <- list(colnames(dat), c("disc", "alpha^I"))
stat <- data.frame(stat)
Observe that the discrimination (which is much cheaper to compute) is inversely related to the alpha obtained when deleting that item. In other words, alpha is highest when there are many highly "discriminating" items (that correlate with each other).
plot(stat, pch = 19)
Use this information to select the sequence with which the items should be deleted to fall below a benchmark (say .9, since the toy data doesn't allow for a lower mark):
1) delete as many items as possible to stay above the benchmark; that is, start with the least discriminating items.
stat <- stat[order(stat$disc), ]
this <- sapply(1:(nrow(stat) - 2), function(ii) {
  ind <- match(rownames(stat)[1:ii], colnames(dat))
  psych::alpha(dat[, -ind, drop = FALSE])$total$raw_alpha
})
delete_these <- rownames(stat)[which(this > .9)]
psych::alpha(dat[, -match(delete_these, colnames(dat)), drop = FALSE])$total$raw_alpha
length(delete_these)
2) delete as few items as possible to stay above the benchmark; that is, start with the highest discriminating items.
stat <- stat[order(stat$disc, decreasing = TRUE), ]
this <- sapply(1:(nrow(stat) - 2), function(ii) {
  ind <- match(rownames(stat)[1:ii], colnames(dat))
  psych::alpha(dat[, -ind, drop = FALSE])$total$raw_alpha
})
delete_these <- rownames(stat)[which(this > .9)]
psych::alpha(dat[, -match(delete_these, colnames(dat)), drop = FALSE])$total$raw_alpha
length(delete_these)
Note that 1) is consistent with classical item-selection procedures in (psychological/educational) diagnostics and assessment: remove the items from the assessment that fall below a benchmark in terms of discriminatory power.
I changed the code as follows: now I drop a fixed number of items, changing the value of numberOfItemsToDrop from 1 to 20 manually. Although it is a little better, it still takes too long to run :(
I hope there is a better way of doing this.
numberOfItemsToDrop <- 13
combinations <- combinat::combn(20, numberOfItemsToDrop)
timesToIterate <- length(combinations) / numberOfItemsToDrop # = ncol(combinations)
for (i in 1:timesToIterate) {
  model <- combinations[, i]
  itemsToUse <- itemResponses[, -c(model)]
  alphaVal <- psych::alpha(itemsToUse)$total$raw_alpha
  if (alphaVal < 0.82) {
    cat("Cronbach's alpha =", alphaVal, ", number of items dropped = ", length(model), " :: ")
    print(model)
  }
}
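One cheap speedup, independent of the search order: raw Cronbach's alpha can be computed directly from the item variances and the variance of the total score, avoiding the overhead of psych::alpha() inside the loop. A minimal sketch of the standard formula (fast_alpha is just an illustrative name; it should agree with psych::alpha(...)$total$raw_alpha up to floating point):
fast_alpha <- function(X) {
  k <- ncol(X)
  # alpha = k/(k-1) * (1 - sum of item variances / variance of the total score)
  (k / (k - 1)) * (1 - sum(apply(X, 2, var)) / var(rowSums(X)))
}
fast_alpha(itemResponses)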

R - predict(), renaming columns, and "'newdata' had 10 rows but variables found have 20 rows"

From other threads I've seen people provide solutions that are specific to their exact problems, but I don't understand the underlying reason for what's going wrong.
I do...
modTest = glm(trainLabels[,1] ~ A + B + C + D + E + F + G + H + I,
              family = binomial(link = 'logit'))
The above is 20 labels, and 9 vectors each with 20 values.
I then try to predict on 10 unseen examples. This is 10 rows, 9 features, same order.
preds = predict( modTest, testFeatures )
I get the error...
Warning message:
'newdata' had 10 rows but variables found have 20 rows
Edit: simplified, removed long feature names, etc.
> names(trainFeatures)
[1] "Neg" "Pos" "Num" "UN" "UNA" "UNUA" "UP" "UPA" "UPUA"
> names(testFeatures)
[1] "Neg" "Pos" "Num" "UN" "UNA" "UNUA" "UP" "UPA" "UPUA"
Edit: Dputs...
To use the dputs, what I did was...
modTest = glm( trainLabels[,1] ~ as.matrix(trainFeatures) )
preds = predict( modTest, testFeatures )
Warning message:
'newdata' had 10 rows but variables found have 20 rows
Not sure why I'm getting that warning still.
dput(trainLabels)
structure(list(Neg = c(1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
0, 0, 0, 1, 1, 1, 0), Pos = c(1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
1, 1, 0, 0, 0, 1, 1, 1, 0), Num = c(1, 1, 0, 0, 0, 0, 1, 0, 0,
0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0), UN = c(1, 1, 0, 0, 0, 0, 1,
0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0), UNA = c(1, 1, 0, 0, 0,
0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0), UNUA = c(1, 1,
0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0), UP = c(1,
1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0), UPA = c(1,
1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0), UPUA = c(1,
1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0)), .Names = c("Neg",
"Pos", "Num", "UN", "UNA", "UNUA", "UP", "UPA", "UPUA"), row.names = c(NA,
-20L), class = "data.frame")
dput(trainFeatures)
structure(list(Neg = c(39106, 44664, 114130, 26526, 22122, 19175,
29438, 17741, 17589, 20666, 66024, 168336, 86283, 74826, 88998,
75756, 16041, 17087, 15235, 16659), Pos = c(16129, 21064, 57730,
10314, 18105, 16837, 19300, 16873, 13681, 18414, 27148, 120497,
60031, 49016, 59250, 36264, 15786, 16315, 14556, 16057), Num = c(82994,
121367, 306842, 55458, 69148, 63167, 85891, 58674, 55874, 67505,
152475, 427106, 221043, 190043, 223744, 177388, 51657, 54883,
48378, 54115), UN = c(32343, 35433, 74835, 22271, 17686, 15498,
22416, 14238, 14078, 16800, 54636, 121211, 68079, 59913, 70884,
61408, 13221, 14114, 12647, 13487), UNA = c(95.1499874, 95.0987263,
95.3942596, 95.5444865, 113.1263844, 112.3827424, 111.2684513,
113.2184128, 112.4336258, 114.1739588, 113.5086472, 111.6715378,
112.2842917, 111.9490612, 113.6465561, 111.5254103, 112.2179148,
111.2933853, 112.9056117, 113.1511475), UNUA = c(-94.4280737,
-94.5019854, -94.9246672, -95.0379578, -113.2247115, -112.3497485,
-111.1631387, -113.2051289, -112.1822898, -114.0431466, -113.7435412,
-111.6226818, -112.4077795, -111.9886653, -113.8072166, -111.6138577,
-113.0855995, -112.3075275, -114.2628431, -114.1088453), UP = c(10384,
13015, 24470, 6891, 13445, 12852, 13008, 13093, 9878, 14272,
14938, 77058, 40595, 32518, 39889, 21424, 8322, 8451, 7440, 8071
), UPA = c(58.6289931, 57.73430079, 61.3480343, 57.8297594, 62.1749994,
65.1140073, 62.619361, 63.6791219, 63.412582, 65.1856906, 45.18365794,
71.32918265, 56.04488913, 58.13008276, 53.16603128, 50.36242011,
64.6742956, 64.0982314, 63.4422878, 64.24099034), UPUA = c(88.9216885,
88.3012858, 88.1996008, 88.9910129, 91.0232669, 89.4524702, 91.9122816,
89.8549338, 90.6487273, 88.2063941, 99.9573821, 109.9128868,
103.7989926, 104.0274764, 103.4209936, 101.5065677, 85.8110039,
87.0786241, 86.1020646, 86.8835026)), .Names = c("Neg", "Pos",
"Num", "UN", "UNA", "UNUA", "UP", "UPA", "UPUA"), row.names = c(NA,
-20L), class = "data.frame")
dput(testLabels)
structure(list(Neg = c(0, 1, 1, 1, 0, 1, 1, 1, 1, 1), Pos = c(0,
1, 1, 1, 0, 1, 1, 1, 1, 1), Num = c(0, 1, 1, 1, 0, 1, 1, 1, 1,
1), UN = c(0, 1, 1, 1, 0, 1, 1, 1, 1, 1), UNA = c(0, 1, 1, 1,
0, 1, 1, 1, 1, 1), UNUA = c(0, 1, 1, 1, 0, 1, 1, 1, 1, 1), UP = c(0,
1, 1, 1, 0, 1, 1, 1, 1, 1), UPA = c(0, 1, 1, 1, 0, 1, 1, 1, 1,
1), UPUA = c(0, 1, 1, 1, 0, 1, 1, 1, 1, 1)), .Names = c("Neg",
"Pos", "Num", "UN", "UNA", "UNUA", "UP", "UPA", "UPUA"), row.names = c(NA,
-10L), class = "data.frame")
> dput(testFeatures)
structure(list(Neg = c(51404, 32447, 24642, 95979, 15743, 90843,
13813, 11496, 12871, 13546), Pos = c(23350, 13525, 19941, 49984,
10867, 64404, 13324, 11302, 12918, 13118), Num = c(121342, 68160,
77219, 248890, 49259, 232645, 43707, 35674, 40734, 42979), UN = c(40766,
27363, 19590, 71772, 12615, 71496, 11529, 9739, 10810, 11346),
UNA = c(95.2486872, 93.4642772, 111.3853297, 112.6770471,
110.0845355, 113.6696598, 111.8409793, 116.0476022, 120.3481302,
111.9496978), UNUA = c(-94.6150698, -92.5605373, -111.1994432,
-112.4947319, -109.7130777, -113.8083912, -112.5678322, -116.5407619,
-121.4756386, -113.4991191), UP = c(14285, 9043, 14862, 31626,
7491, 43903, 7021, 5559, 6149, 6789), UPA = c(61.25585053,
62.6231081, 64.191128, 64.6397131, 63.4911744, 58.4792454,
63.5063289, 60.5667637, 60.3857056, 64.1569975), UPUA = c(88.4605419,
88.2790682, 90.0217465, 88.8441004, 91.0222662, 105.0494229,
85.8914139, 86.7685668, 84.8304901, 86.9786109)), .Names = c("Neg",
"Pos", "Num", "UN", "UNA", "UNUA", "UP", "UPA", "UPUA"), row.names = c(NA,
-10L), class = "data.frame")
So, I ran the code with all the data you provided and got the results just fine. Here is the model fit:
modTest = glm(trainLabels[,1] ~ Neg + Pos + Num + UN + UNA + UNUA + UP + UPA + UPUA,
              family = binomial(link = 'logit'),
              data = trainFeatures)
Here are the predicted values on test data:
predict( modTest, testFeatures)
1 2 3 4 5 6 7 8
4.6711576 -1.3572345 -2.0639104 18.7625539 -7.6961149 0.4317324 -0.8983256 -8.2052158
9 10
-1.5968013 10.8357174
NOTE: an alternative specification can be like this:
modTest = glm(trainLabels[,1] ~ trainFeatures$Neg + trainFeatures$Pos +
              trainFeatures$Num + trainFeatures$UN + trainFeatures$UNA +
              trainFeatures$UNUA + trainFeatures$UP + trainFeatures$UPA +
              trainFeatures$UPUA, family = binomial(link = 'logit'))
However, the fit model is as follows:
modTest$coefficients
(Intercept) trainFeatures$Neg trainFeatures$Pos trainFeatures$Num trainFeatures$UN
4.027803e+01 8.874801e-04 -3.000123e-03 1.277138e-04 -4.521793e-04
trainFeatures$UNA trainFeatures$UNUA trainFeatures$UP trainFeatures$UPA trainFeatures$UPUA
-1.519463e+01 -1.480503e+01 2.930261e-03 4.741432e-01 -3.690940e-01
With this specification, when you feed the test data to predict, the terms in the formula still point at the 20-row training vectors, so they do not match the new data being fed to predict. This leads to:
predict( modTest, testFeatures)
1 2 3 4 5 6 7
0.21651890 3.23450117 -2.16298672 -0.06949967 -0.91026504 -0.91484739 -1.69209826
8 9 10 11 12 13 14
-2.45603982 -6.35855600 -1.84871546 -0.25027815 2.72625440 -0.50422297 -1.76701963
15 16 17 18 19 20
0.05033351 0.65101666 0.27680835 1.79176029 6.79618311 -0.16186455
Warning message:
'newdata' had 10 rows but variables found have 20 rows
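The underlying rule: predict() re-evaluates the terms of the model formula in newdata. Terms like trainFeatures$Neg (or an as.matrix(...) call) evaluate to the original 20-row training vectors no matter what newdata contains, which is exactly what the warning reports. A minimal sketch of the pattern that avoids this, same as the first fit above (type = "response" is an addition here, returning probabilities instead of log-odds):
modTest <- glm(trainLabels[, 1] ~ Neg + Pos + Num + UN + UNA + UNUA + UP + UPA + UPUA,
               family = binomial(link = 'logit'), data = trainFeatures)
# the formula now names bare columns, so predict() looks them up in testFeatures:
preds <- predict(modTest, newdata = testFeatures, type = "response")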

R - Check different matrices with a possible lag

This issue is quite tricky to explain, but I am sure some of you have already faced it.
So I have two matrices, Matrix 1 (mat1) and Matrix 2 (mat2).
What I want to do is to record in a third matrix (mat3) the values of mat2, after checking against matrix 1, but with a LAG. Let me explain.
After a value of 1 in matrix 1, I want to check whether matrix 2 has a 1 too, but within the range of a certain LAG, for example 1 or 2 episodes (columns) after.
For example, row number 4 has a 1 in matrix 1 at the 6th column, so I want to check whether matrix 2 has a 1 in row number 4 directly after, or 2 or 3 columns later.
Do you understand the idea?
mat1 = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0,
1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1,
0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0,
0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1), .Dim = c(10L, 21L), .Dimnames = list(NULL, c("wit5.020",
"wit5.021", "wit5.022", "wit5.023", "wit5.024", "wit5.025", "wit5.026",
"wit5.027", "wit5.028", "wit5.029", "wit5.030", "wit5.031", "wit5.032",
"wit5.033", "wit5.034", "wit5.035", "wit5.036", "wit5.037", "wit5.038",
"wit5.039", "wit5.040")))
mat2 = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0,
0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0,
1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1,
0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
0, 1, 0, 1), .Dim = c(10L, 21L))
So mat3 - where I want to store the result of the check
mat3 = matrix(0, nrow = nrow(mat1), ncol = ncol(mat1))
So here is an example of a possible loop to check the LAG. This loop doesn't work, but it may give you an idea of the solution.
I am not sure where to introduce the lag. I thought maybe in the i index, but I am not sure.
for (j in 1:ncol(mat1)) {
  for (i in 1:nrow(mat1)) {
    if (mat1[i,j] == 1 & mat2[i,j] == 1 | mat2[i+1,j] == 1 | mat2[i+2,j] == 1) { # lag here
      mat[i,j] <- 1
    } else {
      mat[i,j] <- 0
    }
  }
}
Any ideas are very welcome.
Here's a simple way to do it:
lag <- 3 # or whatever lag you want
nr <- nrow(mat1)
nc <- ncol(mat1)
mat3 <- matrix(0, ncol = nc, nrow = nr)
for (r in 1:nr) {
  for (c in 1:nc) {
    if (mat1[r, c] == 1 && any(mat2[r, c:min(c + lag, nc)] == 1))
      mat3[r, c] <- 1
  }
}
Note the use of mat2[r, c:min(c + lag, nc)]. This selects all elements from the current column c up through column c + lag, but makes sure not to go past nc (the total number of columns), avoiding an out-of-bounds error.
There's probably a faster, more vectorized way of doing this, but the above code should work.
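For what it's worth, one vectorized sketch: shift mat2 left by 0..lag columns (padding with zeros on the right), OR the shifted copies together, and intersect with mat1. shift_left is a hypothetical helper written here, not from any package:
lag <- 3
shift_left <- function(m, k) { # drop the first k columns, pad with zeros on the right
  if (k == 0) return(m)
  cbind(m[, -(1:k), drop = FALSE], matrix(0, nrow(m), k))
}
# TRUE wherever mat2 has a 1 somewhere in columns c..c+lag of the same row
any_hit <- Reduce(`|`, lapply(0:lag, function(k) shift_left(mat2, k) == 1))
mat3 <- (mat1 == 1 & any_hit) + 0 # "+ 0" turns the logical matrix back into 0/1
The zero padding means columns past nc simply never match, which plays the same role as min(c + lag, nc) in the loop.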
