Getting sums by row in data.table - r

I am trying to implement the rowSums solution proposed here: Getting rowSums in a data table in R. Basically, I want a variable with the sum of top15, top16 and top17 for each row. The code below produces an answer, but it's clearly not right, and I am not sure I understand what is happening.
I am looking for a data.table solution, as I am running this on millions of cases.
library(data.table)
d <- structure(list(top15 = c(1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1), top16 = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0), top17 = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0)), class = c("data.table",
"data.frame"), row.names = c(NA, -20L))
d[, tops := lapply(.SD, sum), .SDcols = paste0("top", 15:17)]

We can use rowSums on the Subset of data.table (.SD), which can also take care of NA elements with na.rm = TRUE:
nm1 <- paste0("top", 15:17)
d[, tops := rowSums(.SD, na.rm = TRUE), .SDcols = nm1]
Or, if there are no NA elements, add the columns with + via Reduce:
d[, tops := Reduce(`+`, .SD), .SDcols = nm1]
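Both forms give row-wise sums. (The original attempt fails because lapply(.SD, sum) computes one grand total per column, not a per-row sum.) Since the question mentions millions of cases, here is a rough way to compare the two approaches on simulated data of that scale; this is only a sketch with made-up column values, and timings will vary by machine:
library(data.table)
set.seed(1)
big <- data.table(top15 = rbinom(1e6, 1, 0.5),
                  top16 = rbinom(1e6, 1, 0.5),
                  top17 = rbinom(1e6, 1, 0.5))
nm1 <- paste0("top", 15:17)
system.time(big[, tops := rowSums(.SD, na.rm = TRUE), .SDcols = nm1])
system.time(big[, tops2 := Reduce(`+`, .SD), .SDcols = nm1])
identical(big$tops, as.numeric(big$tops2)) # TRUE: both give the same row sums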

Related

R function to change value after a condition has been fulfilled

Participants in an experiment took a test that has a rule that says "once a participant has gotten 6 items wrong in a window of 8 items, you stop running the test". However, some experimenters kept testing past this point. I now need to find a way in which I can automatically see where the test should have been stopped, and change all values following the end to 0 (= item wrong). I am not even sure if this is something that can be done in R.
To be clear, I would like to go row by row (which are the participants) and once there are six 0s in a given window of 8 columns (items), I would need all values after the sixth 0 to be 0 too.
While the reproducible data is below, here is a visualization of what I would need, where the blue cells are the ones that should change to 0:
[image: Pre-changes]
[image: Post-changes]
Reproducible data:
structure(list(Participant_ID = c("E01P01", "E01P02", "E01P03",
"E01P04", "E01P05", "E01P06", "E01P07", "E01P08", "E02P01", "E02P02"
), A2 = c(1, 1, 1, 0, 0, 1, 1, 1, 1, 1), A3 = c(1, 1, 0, 0, 0,
1, 0, 0, 0, 0), B1 = c(1, 1, 1, 0, 0, 1, 0, 0, 1, 1), B2 = c(1,
1, 1, 1, 1, 1, 0, 0, 0, 1), C3 = c(1, 0, 0, 1, 0, 1, 0, 0, 0,
1), C4 = c(1, 0, 0, 0, 0, 1, 0, 0, 1, 1), D1 = c(1, 0, 0, 0,
0, 1, 0, 0, 0, 0), D3 = c(1, 1, 1, 1, 0, 0, 1, 0, 0, 1), E1 = c(1,
0, 0, 0, 0, 1, 0, 0, 0, 1), E3 = c(1, 1, 0, 1, 0, 1, 0, 0, 0,
0), F1 = c(1, 0, 0, 0, 1, 0, 0, 1, 0, 0), F4 = c(1, 1, 1, 1,
0, 1, 0, 1, 1, 0), G1 = c(1, 0, 0, 0, 0, 1, 0, 0, 0, 1), G2 = c(0,
0, 0, 0, 1, 1, 1, 0, 1, 1)), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
Any help is highly appreciated!
Here is a solution that involves some pivoting, a rolling sum, a cumulative sum, and if_else logic, then pivoting back. Let me know if it works.
library(tidyverse)
library(zoo)
structure(list(Participant_ID = c("E01P01", "E01P02", "E01P03",
"E01P04", "E01P05", "E01P06", "E01P07", "E01P08", "E02P01", "E02P02"
), A2 = c(1, 1, 1, 0, 0, 1, 1, 1, 1, 1), A3 = c(1, 1, 0, 0, 0,
1, 0, 0, 0, 0), B1 = c(1, 1, 1, 0, 0, 1, 0, 0, 1, 1), B2 = c(1,
1, 1, 1, 1, 1, 0, 0, 0, 1), C3 = c(1, 0, 0, 1, 0, 1, 0, 0, 0,
1), C4 = c(1, 0, 0, 0, 0, 1, 0, 0, 1, 1), D1 = c(1, 0, 0, 0,
0, 1, 0, 0, 0, 0), D3 = c(1, 1, 1, 1, 0, 0, 1, 0, 0, 1), E1 = c(1,
0, 0, 0, 0, 1, 0, 0, 0, 1), E3 = c(1, 1, 0, 1, 0, 1, 0, 0, 0,
0), F1 = c(1, 0, 0, 0, 1, 0, 0, 1, 0, 0), F4 = c(1, 1, 1, 1,
0, 1, 0, 1, 1, 0), G1 = c(1, 0, 0, 0, 0, 1, 0, 0, 0, 1), G2 = c(0,
0, 0, 0, 1, 1, 1, 0, 1, 1)), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame")) %>%
as_tibble() %>%
  pivot_longer(-1) %>%
  group_by(Participant_ID) %>%
  mutate(running_total = zoo::rollsumr(value == 0, k = 8, fill = 0),
         should_terminate = cumsum(running_total >= 6),
         value = if_else(should_terminate > 0, 0, value)) %>%
  ungroup() %>%
  select(Participant_ID, name, value) %>%
  pivot_wider(names_from = name, values_from = value)
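To see what the rolling window contributes, here is rollsumr on a short hypothetical vector (a minimal sketch; 0 = item wrong, as in the data):
library(zoo)
v <- c(1, 0, 0, 1, 0, 0, 0, 0, 1) # hypothetical responses, 0 = wrong
rollsumr(v == 0, k = 8, fill = 0)
# [1] 0 0 0 0 0 0 0 6 6
# positions 1-7 are filled with 0 (window not yet complete); from position 8
# onward each entry counts the wrong answers in the trailing window of 8 items,
# so cumsum(running_total >= 6) first turns positive at position 8 here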

Apply a set of functions to multiple data frames, considering a specific sequence

Consider this data:
df <- structure(list(V1 = c(1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0), V2 = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0), V3 = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1), V4 = c(1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 1, 1, 0, 0, 0), V5 = c(1, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1), V6 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1, 0, 0), V7 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
1, 1, 1, 1, 0, 0, 0, 0, 0, 0), V8 = c(1, 1, 1, 0, 0, 0, 0, 1,
1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0), V9 = c(1,
1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1, 1, 1)), class = "data.frame", row.names = c(NA, -46L))
I would like to apply the following functions for all possible pairs in df.
Applying to the pair V1 and V2:
df$V1V2 <- (df$V1 * df$V2) + (1 - df$V1) * (1 - df$V2)
sum <- (sum(df$V1V2)/46)
df$VD <- (df$V1/sd(df$V1))
df$VI <- (df$V2/sd(df$V2))
est <- lm(df$VD ~ df$VI)
summary(est)
ndf <- data.frame(NA)
ndf$V1V2 <- summary(est)[["coefficients"]][, "Pr(>|t|)"][2]
Applying to the pair V1 and V3:
df$V1V3 <- (df$V1 * df$V3) + (1 - df$V1) * (1 - df$V3)
sum <- (sum(df$V1V3)/46)
df$VD <- (df$V1/sd(df$V1))
df$VI <- (df$V3/sd(df$V3))
est <- lm(df$VD ~ df$VI)
summary(est)
ndf$V1V3 <- summary(est)[["coefficients"]][, "Pr(>|t|)"][2]
I could apply this to all other pairs of V1 (i.e. V3, V4, V5, V6, V7, V8, and V9). Nonetheless, I am sure this is not the best approach.
UPDATE
This is how far I got:
dfV1 <- df[, c("V1", "V2", "V1", "V3", "V1", "V4", "V1", "V5", "V1", "V6", "V1", "V7", "V1", "V8", "V1", "V9")]
colnames(dfV1) <- c("V1", "V2", "V1", "V3", "V1", "V4", "V1", "V5", "V1", "V6", "V1", "V7", "V1", "V8", "V1", "V9")
sep <- lapply(seq(1, ncol(dfV1), by = 2), function(i)
  dfV1[i:pmin((i + 1), ncol(dfV1))])
V1V2 <- sep[[1]]
V1V3 <- sep[[2]]
V1V4 <- sep[[3]]
V1V5 <- sep[[4]]
V1V6 <- sep[[5]]
V1V7 <- sep[[6]]
V1V8 <- sep[[7]]
V1V9 <- sep[[8]]
list_V1 <- tibble::lst(V1V2, V1V3, V1V4, V1V5, V1V6, V1V7, V1V8, V1V9)
library(dplyr)
my_func <- function(x) {
  x <- x %>%
    mutate(First = (x[,1] * x[,2]) + (1 - x[,1]) * (1 - x[,2]),
           VD = x[,1] / sd(x[,1]),
           VI = x[,2] / sd(x[,2]))
}
res <- lapply(list_V1, my_func)
list2env(res, .GlobalEnv)
df.IC.V1 <- cbind.data.frame(V1V2$First, V1V3$First, V1V4$First, V1V5$First, V1V6$First, V1V7$First, V1V8$First, V1V9$First)
IC.all.V1 <- data.frame(colSums(df.IC.V1)/46)
I do not know how to apply this part to the list dfV1:
est <- lm(df$VD ~ df$VI)
summary(est)
ndf$V1V3 <- summary(est)[["coefficients"]][, "Pr(>|t|)"][2]
Avoid wide data and keep data long or tidy, which helps in virtually every aspect of data science: aggregation, modeling, and plotting. Therefore, consider reshape (or the tidy semantics of pivot_longer) to reformat the data to long and, as a generalized method, call by to run your model for each of the 8 other variables.
Finally, you may want to keep the results of this process in a separate data frame, to avoid repeating values in the original due to the different lengths, especially since each model outputs two rows (intercept and variable) with multiple columns.
long_df <- reshape(df, idvar = "V1", varying = names(df)[-1],
                   times = names(df)[-1],
                   v.names = "value", timevar = "variable",
                   new.row.names = 1:1E4, direction = "long")
pairwise_model <- function(sub) {
  sub$vpair <- (sub$V1 * sub$value) + (1 - sub$V1) * (1 - sub$value)
  sum_v <- (sum(sub$vpair)/46)
  sub$VD <- (sub$V1/sd(sub$V1))
  sub$VI <- (sub$value/sd(sub$value))
  est <- lm(VD ~ VI, data = sub)
  print(summary(est))
  ndf <- data.frame(variable = sub$variable[[1]],
                    summary(est)[["coefficients"]])
  return(ndf)
}
df_list <- by(long_df, long_df$variable, pairwise_model)
results_df <- do.call(rbind, df_list)
results_df
# variable Estimate Std..Error t.value Pr...t..
# V2.(Intercept) V2 0.70897157 0.1697361 4.1769050 1.376794e-04
# V2.VI V2 0.20546559 0.1475392 1.3926169 1.707334e-01
# V3.(Intercept) V3 0.00000000 0.4324256 0.0000000 1.000000e+00
# V3.VI V3 0.29294628 0.1441419 2.0323471 4.818178e-02
# V4.(Intercept) V4 0.59719677 0.1461565 4.0860091 1.829775e-04
# V4.VI V4 0.47663445 0.1325296 3.5964390 8.118808e-04
# V5.(Intercept) V5 0.77259722 0.1604268 4.8158874 1.766754e-05
# V5.VI V5 0.13627939 0.1493492 0.9124883 3.664845e-01
# V6.(Intercept) V6 0.68720490 0.1557572 4.4120255 6.533736e-05
# V6.VI V6 0.31399751 0.1431310 2.1937768 3.357833e-02
# V7.(Intercept) V7 0.57392936 0.1516681 3.7841150 4.627413e-04
# V7.VI V7 0.46128020 0.1337587 3.4486009 1.253140e-03
# V8.(Intercept) V8 0.90047538 0.1869717 4.8161045 1.765498e-05
# V8.VI V8 -0.09345783 0.1500959 -0.6226544 5.367256e-01
# V9.(Intercept) V9 0.60871296 0.1645124 3.7001029 5.960399e-04
# V9.VI V9 0.35598290 0.1408800 2.5268512 1.517916e-02
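If, as in the original loop, only the slope p-values are wanted, the intercept rows can be dropped afterwards. A sketch over the results_df built above (the mangled column name Pr...t.. comes from data.frame's conversion of Pr(>|t|)):
slope_p <- results_df[grepl("\\.VI$", rownames(results_df)),
                      c("variable", "Pr...t..")]
slope_p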

Chi Square Test of Independence of Whole Dataset

I have a 3185x90 dataset of binary values and want to do a chi-squared test of independence, comparing all column variables against each other.
I've tried different variations of code from Google searches with chisq.test() and some for loops, but none of them have worked so far.
How do I do this?
This is the frame I've tinkered with. My dataset is oak.
chi_trial <- data.frame(a = c(0,1), b = c(0,1))
for(row in 1:nrow(oak)){
  print(row)
  print(chisq.test(c(oak[row,1], d[row,2])))
}
I also tried this:
apply(d, 1, chisq.test)
which gives me the error: Error in FUN(newX[, i], ...) :
all entries of 'x' must be nonnegative and finite
dput(oak[1:2])
structure(list(post_flu = structure(c(1, 1, 1, 1, 1, 0, 0, 0,
0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1,
1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
label = "Receipt of Flu Vaccine - Encounter Survey", format.stata = "%10.0g")), row.names = c(NA,
-3185L), class = c("tbl_df", "tbl", "data.frame"), label = "Main Oakland Clinic Analysis Dataset")
I added a sample of my data with the final lines of the output. The portion of the dataset is small, but it all looks like this.
You could use something like the code below, which is similar to R's cor function. I don't have your data, so I'm simulating some. Note that I get one significant p-value, using the traditional cut-off of 0.05.
set.seed(3)
nr=3185; nc=3
oak <- as.data.frame(matrix(sample(0:1, size=nr*nc, replace=TRUE), ncol=nc))
oak
mult.chi <- function(data){
  nc <- ncol(data)
  res <- matrix(0, nrow = nc, ncol = nc) # or NA
  for(i in 1:(nc-1))
    for(j in (i+1):nc)
      res[i,j] <- suppressWarnings(chisq.test(data[,i], data[,j])$p.value)
  rownames(res) <- colnames(data)
  colnames(res) <- colnames(data)
  res
}
mult.chi(oak)
# V1 V2 V3
# V1 0 0.7847063 0.32012466
# V2 0 0.0000000 0.01410326
# V3 0 0.0000000 0.00000000
So consider applying a multiple testing adjustment as mentioned in the comments.
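For that adjustment, stats::p.adjust can be applied to the upper-triangle p-values; a sketch reusing the mult.chi output above:
pmat <- mult.chi(oak)
praw <- pmat[upper.tri(pmat)] # the pairwise p-values
p.adjust(praw, method = "holm") # or method = "BH" for FDR control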
Here is a solution with combn to get all pairwise combinations of column numbers. Tested with the data in @Edward's answer.
chisq2cols <- function(X){
  y <- matrix(0, ncol(X), ncol(X))
  cmb <- combn(ncol(X), 2)
  y[upper.tri(y)] <- apply(cmb, 2, function(k){
    tbl <- table(X[k])
    chisq.test(tbl)$p.value
  })
  y
}
chisq2cols(oak)
# [,1] [,2] [,3]
#[1,] 0 0.7847063 0.32012466
#[2,] 0 0.0000000 0.01410326
#[3,] 0 0.0000000 0.00000000

R Summarize Collapsed Data.Table

I have data such as this
data <- data.table(
"School" = c(1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1,
1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0),
"Grade" = c(0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1,
0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0),
"CAT" = c(1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0,
0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1),
"FOX" = c(1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1,
1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0),
"DOG" = c(0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0,
0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1)
)
and wish to achieve a new data table such as this:
dataWANT <- data.frame(
"VARIABLE" = c('CAT', 'CAT', 'CAT', 'FOX', 'FOX', 'FOX', 'DOG', 'DOG', 'DOG'),
"SCHOOL" = c(1, 1, 0, 1, 1, 0, 1, 1, 0),
"GRADE" = c(0, 1, 1, 0, 1, 1, 0, 1, 1),
"MEAN" = c(NA)
)
dataWANT takes the mean of CAT, FOX, and DOG by SCHOOL, by GRADE, and by SCHOOL x GRADE, for the groups where those variables equal 1.
I know how to do this one variable at a time, but that approach does not scale to big data.
data[, CAT1 := mean(CAT), by = list(SCHOOL)]
data[, FOX1 := mean(FOX), by = list(GRADE)]
data[, DOG1 := mean(DOG), by = list(SCHOOL, GRADE)]
data$CAT2 = unique(data[SCHOOL == 1, CAT1])
data$FOX2 = unique(data[GRADE == 1, FOX1])
data$DOG2 = unique(data[SCHOOL == 1 & GRADE == 1, DOG1])
Please only use this:
data <- data.table(
"SCHOOL" = c(1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1,
1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0),
"GRADE" = c(0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1,
0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0),
"CAT" = c(1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0,
0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1),
"FOX" = c(1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1,
1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0),
"DOG" = c(0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0,
0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1)
)
data[, CAT1 := mean(CAT), by = list(SCHOOL)]
data[, CAT2 := mean(CAT), by = list(GRADE)]
data[, CAT3 := mean(CAT), by = list(SCHOOL, GRADE)]
data[, FOX1 := mean(FOX), by = list(SCHOOL)]
data[, FOX2 := mean(FOX), by = list(GRADE)]
data[, FOX3 := mean(FOX), by = list(SCHOOL, GRADE)]
data[, DOG1 := mean(DOG), by = list(SCHOOL)]
data[, DOG2 := mean(DOG), by = list(GRADE)]
data[, DOG3 := mean(DOG), by = list(SCHOOL, GRADE)]
dataWANT <- data.frame(
"VARIABLE" = c('CAT', 'CAT', 'CAT', 'FOX', 'FOX', 'FOX', 'DOG', 'DOG', 'DOG'),
"TYPE" = c(1, 2, 3, 1, 2, 3, 1, 2, 3),
"MEAN" = c(0.48, 0.44, 0.428, 0.6, 0.611, 0.6428, 0.52, 0.61, 0.6428)
)
where:
TYPE equals 1 when MEAN is estimated by SCHOOL,
TYPE equals 2 when MEAN is estimated by GRADE,
TYPE equals 3 when MEAN is estimated by SCHOOL and GRADE
We could use rbindlist after creating a list of MEAN values from the melted dataset (as in the other post):
library(data.table)
cols <- c('CAT', 'FOX', 'DOG')
data1 <- melt(data, measure.vars = cols)
list_cols <- list('SCHOOL', 'GRADE', c('SCHOOL', 'GRADE'))
lst1 <- lapply(list_cols, function(x)
  data1[, .(MEAN = mean(value, na.rm = TRUE)), c(x, 'variable')])
rbindlist(lapply(lst1, function(x) {
  nm1 <- setdiff(names(x), c('variable', 'MEAN'))
  x[Reduce(`&`, lapply(mget(nm1), as.logical)),
    .(VARIABLE = variable, MEAN)]
}), idcol = 'TYPE')[order(VARIABLE)]
# TYPE VARIABLE MEAN
#1: 1 CAT 0.4800000
#2: 2 CAT 0.4444444
#3: 3 CAT 0.4285714
#4: 1 FOX 0.6000000
#5: 2 FOX 0.5555556
#6: 3 FOX 0.6428571
#7: 1 DOG 0.5200000
#8: 2 DOG 0.6111111
#9: 3 DOG 0.6428571
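The i expression in the rbindlist step above is dense: Reduce(`&`, lapply(mget(nm1), as.logical)) keeps only rows where every grouping column equals 1. The same idea on a toy table (a sketch with made-up values):
library(data.table)
dt <- data.table(SCHOOL = c(1, 0, 1), GRADE = c(1, 1, 0), MEAN = c(0.1, 0.2, 0.3))
keep <- Reduce(`&`, lapply(dt[, .(SCHOOL, GRADE)], as.logical))
dt[keep]
#    SCHOOL GRADE MEAN
# 1:      1     1  0.1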
Do you mean to get something like this?
library(data.table)
melt(data, measure.vars = c('CAT', 'FOX', 'DOG'))[,
.(MEAN = mean(value, na.rm = TRUE)), .(School, Grade, variable)]
To group it by different columns, we can do :
cols <- c('CAT', 'FOX', 'DOG')
data1 <- melt(data, measure.vars = cols)
list_cols <- list('School', 'Grade', c('School', 'Grade'))
lapply(list_cols, function(x)
  data1[, .(MEAN = mean(value, na.rm = TRUE)), c(x, 'variable')])
You could subset and calculate your means first using lapply(.SD,...) then melt that into your output:
melt(data[School != 0 | Grade != 0, lapply(.SD, mean), by = .(School, Grade)], id.vars = c("School", "Grade"))
Chaining this on afterwards also adds the TYPE variable:
...][, TYPE := School + (2*Grade)]
Putting it all together and tidying it up, it matches your desired output:
dataWANT <- melt(data[School != 0 | Grade != 0, lapply(.SD, mean),
                      by = .(School, Grade)],
                 id.vars = c("School", "Grade"))[
  , TYPE := School + (2*Grade)][
  order(variable, TYPE), .("VARIABLE" = variable, TYPE, "MEAN" = value)]

Intersecting ranges of consecutive values in logical vectors in R

I have two logical vectors which look like this:
x = c(0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0)
y = c(0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0)
I would like to count the intersections between ranges of consecutive values. Meaning that consecutive values (of 1s) are handled as one range. So in the above example, each vector contains one range of 1s and these ranges intersect only once.
Is there any R package for range intersections which could help here?
I think this should work (calling your logical vectors x and y):
sum(rle(x & y)$values)
A few examples:
x = c(0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0)
y = c(0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0)
sum(rle(x & y)$values)
# [1] 1
x = c(1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0)
y = c(0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0)
sum(rle(x & y)$values)
# [1] 2
x = c(1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0)
y = c(0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0)
sum(rle(x & y)$values)
# [1] 3
By way of explanation: x & y gives the intersections at the per-element level, rle collapses runs of adjacent intersections, and sum counts the runs of TRUE values.
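Breaking the third example down step by step:
x <- c(1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0)
y <- c(0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0)
x & y
# FALSE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE TRUE FALSE FALSE
rle(x & y)$values # one entry per run: FALSE TRUE FALSE TRUE FALSE TRUE FALSE
sum(rle(x & y)$values) # each TRUE run counts as 1, so the result is 3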
