How can I automate t-tests for nested variables in R?

I would like to automate the collection of summary statistics that arise from t-tests. In the example below I have the nested variables Age, Location, and Treatment. For each Age and Location I would like to run a t-test based on Treatment, which has the two levels Control and Treatment. Put another way, I would like to know about the difference between the Control and Treatment means at each Location for each Age.
I would like to run the t-tests using the col_t_welch function in matrixTests because the output already has several of the summary statistics I'm looking for (i.e., mean.diff, stderr, and pvalue). How could I set up my dataframe (df1) to be able to run a for-loop for a nested t-test?
Reproducible Example:
library(matrixTests)
library(ggplot2)
set.seed(123)
df1 <- data.frame(matrix(ncol = 4, nrow = 36))
x <- c("Age","Location","Treatment","Value")
colnames(df1) <- x
df1$Age <- as.factor(rep(c(1,2,3), each = 12))
df1$Location <- as.factor(rep(c("Central","North"), each = 6))
df1$Treatment <- as.factor(rep(c("Control","Treatment"), each = 3))
df1$Value <- round(rnorm(36,200,25),0)
# I can't get the for-loop below to work because I'm not sure how to set up the data frame, but I was thinking something along these lines.
i <- 1
p <- numeric(length = 3*2)
mean_diff <- numeric(length = 3*2)
SE_diff <- numeric(length = 3*2)
for(j in c("1", "2", "3")){
  for(k in c("Central", "North")){
    ttest <- col_t_welch(Value, data = df1, subset = Age == j & Location == k)
    p[i] <- ttest$pvalue
    mean_diff[i] <- ttest$mean.diff
    SE_diff[i] <- ttest$stderr
    i <- i + 1
  }
}
The ideal final data frame would look like d2 below.
d2 <- expand.grid(Age = rep(c(1,2,3), 1),
                  Location = rep(c("Central","North"), 1),
                  mean_diff = NA,
                  SE_diff = NA,
                  pvalue = NA)
C1 <- df1[c(1:6),3:4]
N1 <- df1[c(7:12),3:4]
C2 <- df1[c(13:18),3:4]
N2 <- df1[c(19:24),3:4]
C3 <- df1[c(25:30),3:4]
N3 <- df1[c(31:36),3:4]
c1_mod <- col_t_welch(x=C1[1:3,2], y=C1[4:6,2])
n1_mod <- col_t_welch(x=N1[1:3,2], y=N1[4:6,2])
c2_mod <- col_t_welch(x=C2[1:3,2], y=C2[4:6,2])
n2_mod <- col_t_welch(x=N2[1:3,2], y=N2[4:6,2])
c3_mod <- col_t_welch(x=C3[1:3,2], y=C3[4:6,2])
n3_mod <- col_t_welch(x=N3[1:3,2], y=N3[4:6,2])
d2[1,3] <- c1_mod$mean.diff
d2[1,4] <- c1_mod$stderr
d2[1,5] <- c1_mod$pvalue
d2[2,3] <- c2_mod$mean.diff
d2[2,4] <- c2_mod$stderr
d2[2,5] <- c2_mod$pvalue
d2[3,3] <- c3_mod$mean.diff
d2[3,4] <- c3_mod$stderr
d2[3,5] <- c3_mod$pvalue
d2[4,3] <- n1_mod$mean.diff
d2[4,4] <- n1_mod$stderr
d2[4,5] <- n1_mod$pvalue
d2[5,3] <- n2_mod$mean.diff
d2[5,4] <- n2_mod$stderr
d2[5,5] <- n2_mod$pvalue
d2[6,3] <- n3_mod$mean.diff
d2[6,4] <- n3_mod$stderr
d2[6,5] <- n3_mod$pvalue
d2

I think this might help you
Libraries
library(matrixTests)
library(tidyverse)
Data
set.seed(123)
df1 <- data.frame(matrix(ncol = 4, nrow = 36))
x <- c("Age","Location","Treatment","Value")
colnames(df1) <- x
df1$Age <- as.factor(rep(c(1,2,3), each = 12))
df1$Location <- as.factor(rep(c("Central","North"), each = 6))
df1$Treatment <- as.factor(rep(c("Control","Treatment"), each = 3))
df1$Value <- round(rnorm(36,200,25),0)
How to
df1 %>%
  group_nest(Age, Location, Treatment) %>%
  pivot_wider(names_from = Treatment, values_from = data) %>%
  mutate(
    test = map2(
      .x = Control,
      .y = Treatment,
      .f = ~col_t_welch(.x, .y)
    )
  ) %>%
  unnest(test) %>%
  select(Age, Location, pvalue, mean.diff, stderr)
Result
# A tibble: 6 x 5
Age Location pvalue mean.diff stderr
<fct> <fct> <dbl> <dbl> <dbl>
1 1 Central 0.675 -9.67 21.3
2 1 North 0.282 -22 17.7
3 2 Central 0.925 -3 28.4
4 2 North 0.570 9.33 14.6
5 3 Central 0.589 -14.7 25.0
6 3 North 0.311 -11.3 8.59
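For comparison, the same summaries can also be collected with a plain base R for-loop over the Age x Location cells using t.test(); a minimal sketch (res is just an illustrative name, the stderr component of the t.test() result assumes R >= 3.6.0, and the rows come out in a different order than the tibble above):
# loop over every Age x Location cell and store the Welch t-test summaries
res <- expand.grid(Age = levels(df1$Age),
                   Location = levels(df1$Location),
                   mean_diff = NA, SE_diff = NA, pvalue = NA)
for (r in seq_len(nrow(res))) {
  cell <- df1[df1$Age == res$Age[r] & df1$Location == res$Location[r], ]
  tt <- t.test(Value ~ Treatment, data = cell)             # Welch test by default
  res$mean_diff[r] <- tt$estimate[[1]] - tt$estimate[[2]]  # Control minus Treatment, as in col_t_welch(x = Control, y = Treatment)
  res$SE_diff[r] <- tt$stderr                              # available in R >= 3.6.0
  res$pvalue[r] <- tt$p.value
}
res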

Related

R: return dataframe from function with for loop

I am looping over the dataframe data:
data <- data.frame(q1 = c("2017-10-11", NA, NA),
                   q2 = c(NA, "2014-11-11", "2017-10-13"))
I want to create a new dataframe that counts the rows not containing NAs in each column.
This way it worked:
df <- data %>%
mutate(c = nrow(data) - sapply(data, function(i) sum(is.na(i))))
But I want to write function in order to make it more reproducible:
func <- function(data){
  y <- data_old$v1
  x <- vector()
  nr <- nrow(data)
  for(i in colnames(data)){
    x[i] <- nr - sum(is.na(data$i))
  }
  df <- data.frame(y, x = (x))
  return(df)
}
But I am just getting a vector of NAs. What am I doing wrong?
The output is supposed to look like this:
df <- data.frame(y = c("q1", "q2"), x = c(1, 2))
data <- data.frame(q1 = c("2017-10-11", NA, NA),
                   q2 = c(NA, "2014-11-11", "2017-10-13"))
func <- function(data) {
  data.frame(y = colnames(data),
             x = apply(data, 2, function(x) sum(!is.na(x))))
}
func(data) |> print()
    y x
q1 q1 1
q2 q2 2
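For what it's worth, the same non-NA counts can be obtained without a loop or apply(); a minimal sketch using colSums():
# non-NA counts per column, collected into a data.frame
data.frame(y = names(data), x = colSums(!is.na(data)), row.names = NULL)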

filter values from a coefficient matrix in R

I'm new in R.
I have a bit of a problem picking data from a matrix.
I generated a coefficient matrix as below:
cor_mat <- cor(xxx, method = "spearman")
And I would like to take the values where the Spearman correlation coefficient is greater than 0.39 or less than -0.39.
My code is below:
x1 <- c()
x2 <- c()
score <- c()
for (i in 1:length(rownames(cor_mat))) {
  for (j in 1:length(colnames(cor_mat))) {
    if(abs(cor_mat[i,j]) > 0.39 && rownames(cor_mat)[i] != colnames(cor_mat)[j]){
      x1 <- c(x1, paste0(rownames(cor_mat)[i]))
      x2 <- c(x2, colnames(cor_mat)[j])
      score <- c(score, paste0(cor_mat[i,j]))
    }
  }
}
high_cor_df <- data.frame(x1, x2, as.numeric(score))
attach(high_cor_df)
It works, but it is too slow.
Could anyone help me to solve it?
Thank you very much in advance!
How about this approach using the tidyverse packages?
library(tidyverse)
set.seed(123)
xx <- data.frame(x = rnorm(100),
                 y = runif(100),
                 z = seq(100),
                 w = seq(100)*runif(100),
                 p = rev(seq(100))*runif(100))
cor_mat <- cor(xx, method = "spearman")
cor_mat %>%
  as.data.frame() %>%
  rownames_to_column(var = "var1") %>%
  pivot_longer(cols = -var1, names_to = "var2", values_to = "cor_coeff") %>%
  filter(var1 != var2 & abs(cor_coeff) >= 0.39)
# A tibble: 4 x 3
var1 var2 cor_coeff
<chr> <chr> <dbl>
1 z w 0.641
2 z p -0.599
3 w z 0.641
4 p z -0.599
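A base R alternative, sketched here for comparison, avoids the nested loop entirely by indexing the matrix (idx is just an illustrative name):
# pick out the off-diagonal entries with |correlation| >= 0.39 in one shot
idx <- which(abs(cor_mat) >= 0.39 & row(cor_mat) != col(cor_mat), arr.ind = TRUE)
data.frame(x1 = rownames(cor_mat)[idx[, "row"]],
           x2 = colnames(cor_mat)[idx[, "col"]],
           score = cor_mat[idx])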

Access all the columns with a particular name in nested lists in R

I wonder how I can access all the columns with a particular name in nested lists. Below there is a reproducible example. How can I call all the "mean" columns and collate them into a single data.frame, where the data.frame has two other columns that specify the associated class and Output1/Output2 (Example 1)? Example 2 is a little more complicated, where the nested "mean" element is a data.frame. I need to access both the "ts" and "value" columns. In other words, I need to know the ts corresponding to each value (in addition to classes and Output1/Output2).
Example 1
classes <- c("F", "G", "M", "O")
classes <- structure(unique(classes), names = unique(classes))
S1 = data.frame(X1 = rnorm(100), X2 = rnorm(100), X3 = rnorm(100), X4 = rep(classes, 25))
S2 = data.frame(X1 = rnorm(100), X2 = rnorm(100), X3 = rnorm(100), X4 = rep(classes, 25))
P <- lapply(classes, function(c){
  Output1 <- list("model" = lm(X3 ~ X1 + X2, data = S1),
                  "mean" = apply(S1[S1$X4 == c, 1:3], 2, mean),
                  "sum" = apply(S1[S1$X4 == c, 1:3], 2, sum))
  Output2 <- list("model" = lm(X3 ~ X1 + X2, data = S2),
                  "mean" = apply(S2[S2$X4 == c, 1:3], 2, mean),
                  "sum" = apply(S2[S2$X4 == c, 1:3], 2, sum))
  output <- list("Output1" = Output1, "Output2" = Output2)
  return(output)
})
Example 2
classes <- c("F", "G", "M", "O")
classes <- structure(unique(classes), names = unique(classes))
S1 = data.frame(X1 = rnorm(100), X2 = rnorm(100), X3 = rnorm(100), X4 = rep(classes, 25),
                ts = seq(from = ISOdate(1910,1,1), by = "30 min", length.out = 100))
S2 = data.frame(X1 = rnorm(100), X2 = rnorm(100), X3 = rnorm(100), X4 = rep(classes, 25),
                ts = seq(from = ISOdate(1910,1,1), by = "30 min", length.out = 100))
P <- lapply(classes, function(c){
  Output1 <- list("model" = lm(X3 ~ X1 + X2, data = S1),
                  "mean" = data.frame(ts = S1[S1$X4 == c, "ts"],
                                      value = S1[S1$X4 == c, "X1"]),
                  "sum" = apply(S1[S1$X4 == c, 1:3], 2, sum))
  Output2 <- list("model" = lm(X3 ~ X1 + X2, data = S2),
                  "mean" = data.frame(ts = S2[S2$X4 == c, "ts"],
                                      value = S2[S2$X4 == c, "X1"]),
                  "sum" = apply(S2[S2$X4 == c, 1:3], 2, sum))
  output <- list("Output1" = Output1, "Output2" = Output2)
  return(output)
})
We can get "mean" columns from P using rvest's pluck and bind them together with map_df.
purrr::map_df(P, ~rvest::pluck(.x, "mean"), .id = "Class")
# A tibble: 12 x 3
# Class Output1 Output2
# <chr> <dbl> <dbl>
# 1 F 0.0315 -0.0946
# 2 F 0.0935 0.219
# 3 F 0.155 0.172
# 4 G 0.123 0.182
# 5 G -0.114 -0.128
# 6 G -0.0654 -0.0990
# 7 M 0.111 0.0794
# 8 M -0.176 0.405
# 9 M 0.265 -0.0747
#10 O 0.0207 -0.250
#11 O -0.0407 0.0117
#12 O -0.162 -0.195
In base R, you can do :
temp <- lapply(P, function(x) sapply(x, `[[`, "mean"))
do.call(rbind, Map(cbind.data.frame, temp, Class = names(temp)))
EDIT
For the dataframe example, we can use bind_rows after pluck.
map_df(P, ~rvest::pluck(.x, "mean") %>% bind_rows(.id= "output"), .id = "Class")
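If you prefer to stay in base R for Example 2 as well, a sketch along the same lines (out is just an illustrative name):
# stack each nested "mean" data.frame, keeping the class and Output1/Output2 labels
out <- do.call(rbind, lapply(names(P), function(cls) {
  do.call(rbind, lapply(names(P[[cls]]), function(o) {
    cbind(Class = cls, output = o, P[[cls]][[o]][["mean"]])
  }))
}))
head(out)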

Recap statistic after lapply

I have a dataframe with multiple columns and two different groups - see below.
set.seed(123)
d <- data.frame(
  q1 = rnorm(20),
  q2 = rnorm(20),
  q3 = rnorm(20),
  group = sample(c("A", "B"), size = 20, replace = TRUE))
I use lapply to calculate the t-test for each column between the two groups, as reported below:
lapply(d[,-4], function(i) t.test(i ~ d$group))
lapply returns, for each column, results listing several statistics (I just reported column q1):
$q1
Welch Two Sample t-test
data: i by d$group
t = -0.76262, df = 17.323, p-value = 0.4559
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-1.2294678 0.5759458
sample estimates:
mean in group A mean in group B
-0.05443279 0.27232820
I want to recap the main statistics (t, df, pvalue) in a single table with one row for each column (q1, q2, q3, ...).
You can use lapply() again to extract each component and combine them with bind_rows():
library(dplyr)
lapply(l, function(x) {
  data.frame(t = x$statistic,
             df = x$parameter,
             pv = x$p.value) # returns a dataframe for each element in l
}) %>% bind_rows()
# t df pv
# 1 -1.031983 13.533116 0.32017136
# 2 -2.458574 9.771018 0.03427922
# 3 1.421821 11.416813 0.18181697
You can do this in one shot:
lapply(d[,-4], function(i) {
  res <- t.test(i ~ d$group)
  data.frame(t = res$statistic,
             df = res$parameter,
             pv = res$p.value)
}) %>% bind_rows()
If you want to keep reference to the column names pass .id to bind_rows():
lapply(d[,-4], function(i) {
  res <- t.test(i ~ d$group)
  data.frame(t = res$statistic,
             df = res$parameter,
             pv = res$p.value)
}) %>% bind_rows(.id = 'id')
# id t df pv
# 1 q1 -0.7626249 17.32329 0.4559469
# 2 q2 -1.6467070 17.73117 0.1172263
# 3 q3 0.5288851 13.01589 0.6057874
Example:
set.seed(123)
d <- data.frame(
  q1 = rnorm(20),
  q2 = rnorm(20),
  q3 = rnorm(20),
  group = sample(c("A", "B"), size = 20, replace = TRUE))
l <- lapply(d[,-4], function(i) {
  t.test(i ~ d$group)
})
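A base R version of the same recap, as a sketch, if you would rather avoid dplyr:
# one row of t, df and p-value per column, without bind_rows()
t(sapply(d[, -4], function(i) {
  res <- t.test(i ~ d$group)
  c(t = unname(res$statistic), df = unname(res$parameter), pv = res$p.value)
}))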

Create t.test table with dplyr?

Suppose I have data that looks like this:
set.seed(031915)
myDF <- data.frame(
  Name = rep(c("A", "B"), times = c(10,10)),
  Group = rep(c("treatment", "control", "treatment", "control"), times = c(5,5,5,5)),
  X = c(rnorm(n=5, mean = .05, sd = .001), rnorm(n=5, mean = .02, sd = .001),
        rnorm(n=5, mean = .08, sd = .02), rnorm(n=5, mean = .03, sd = .02))
)
I want to create a t.test table with a row for "A" and one for "B"
I can write my own function that does that:
ttestbyName <- function(Name) {
  b <- t.test(myDF$X[myDF$Group == "treatment" & myDF$Name == Name],
              myDF$X[myDF$Group == "control" & myDF$Name == Name],
              conf.level = 0.90)
  dataNameX <- data.frame(Name = Name,
                          treatment = round(b$estimate[[1]], digits = 4),
                          control = round(b$estimate[[2]], digits = 4),
                          CI = paste('(', round(b$conf.int[[1]], digits = 4), ', ',
                                     round(b$conf.int[[2]], digits = 4), ')',
                                     sep = ""),
                          pvalue = round(b$p.value, digits = 4),
                          ntreatment = nrow(myDF[myDF$Group == "treatment" & myDF$Name == Name,]),
                          ncontrol = nrow(myDF[myDF$Group == "control" & myDF$Name == Name,]))
}
library(parallel)
Test_by_Name <- mclapply(unique(myDF$Name), ttestbyName)
Test_by_Name <- do.call("rbind", Test_by_Name)
and the output looks like this:
Name treatment control CI pvalue ntreatment ncontrol
1 A 0.0500 0.0195 (0.0296, 0.0314) 0.0000 5 5
2 B 0.0654 0.0212 (0.0174, 0.071) 0.0161 5 5
I'm wondering if there is a cleaner way of doing this with dplyr. I thought about using group_by, but I'm a little lost.
Thanks!
Not much cleaner, but here's an improvement:
library(dplyr)
ttestbyName <- function(myName) {
  bt <- filter(myDF, Group == "treatment", Name == myName)
  bc <- filter(myDF, Group == "control", Name == myName)
  b <- t.test(bt$X, bc$X, conf.level = 0.90)
  dataNameX <- data.frame(Name = myName,
                          treatment = round(b$estimate[[1]], digits = 4),
                          control = round(b$estimate[[2]], digits = 4),
                          CI = paste('(', round(b$conf.int[[1]], digits = 4), ', ',
                                     round(b$conf.int[[2]], digits = 4), ')',
                                     sep = ""),
                          pvalue = round(b$p.value, digits = 4),
                          ntreatment = nrow(bt), # changes only in
                          ncontrol = nrow(bc))   # these 2 nrow() args
}
You should really replace the do.call function with rbindlist from data.table:
library(data.table)
Test_by_Name <- lapply(unique(myDF$Name), ttestbyName)
Test_by_Name <- rbindlist(Test_by_Name)
or, even better, use the %>% pipes:
Test_by_Name <- myDF$Name %>%
  unique %>%
  lapply(., ttestbyName) %>%
  rbindlist
> Test_by_Name
Name treatment control CI pvalue ntreatment ncontrol
1: A 0.0500 0.0195 (0.0296, 0.0314) 0.0000 5 5
2: B 0.0654 0.0212 (0.0174, 0.071) 0.0161 5 5
An old question, but the broom package has since been made available for this exact purpose (as well as other statistical tests):
library(broom)
library(dplyr)
myDF %>% group_by(Name) %>%
do(tidy(t.test(X~Group, data = .)))
Source: local data frame [2 x 9]
Groups: Name [2]
Name estimate estimate1 estimate2 statistic p.value
(fctr) (dbl) (dbl) (dbl) (dbl) (dbl)
1 A -0.03050475 0.01950384 0.05000860 -63.838440 1.195226e-09
2 B -0.04423181 0.02117864 0.06541046 -3.104927 1.613625e-02
Variables not shown: parameter (dbl), conf.low (dbl), conf.high (dbl)
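Since do() has since been superseded in dplyr, here is a sketch of the same broom idea with group_modify() (assuming a recent dplyr and broom loaded):
# per-Name tidy t-test results without do()
myDF %>%
  group_by(Name) %>%
  group_modify(~ tidy(t.test(X ~ Group, data = .x))) %>%
  ungroup()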
library(tidyr)
library(dplyr)
myDF %>% group_by(Group) %>% mutate(rowname = 1:n()) %>%
  spread(Group, X) %>%
  group_by(Name) %>%
  do(b = t.test(.$control, .$treatment)) %>%
  mutate(
    treatment = round(b[['estimate']][[2]], digits = 4),
    control = round(b[['estimate']][[1]], digits = 4),
    CI = paste0("(", paste(b[['conf.int']], collapse = ", "), ")"),
    pvalue = b[['p.value']]
  )
# Name treatment control CI pvalue
#1 A 0.0500 0.0195 (-0.031677109707283, -0.0293323994902097) 1.195226e-09
#2 B 0.0654 0.0212 (-0.0775829100729602, -0.010880719830447) 1.613625e-02
You can add ncontrol, ntreatment manually.
You can do it with a custom t.test function and do:
my.t.test <- function(data, formula, ...)
{
  tt <- t.test(formula = formula, data = data, ...)
  ests <- tt$estimate
  names(ests) <- sub("mean in group ()", "\\1", names(ests))
  counts <- xtabs(formula[c(1,3)], data)
  names(counts) <- paste0("n", names(counts))
  cbind(
    as.list(ests),
    data.frame(
      CI = paste0("(", paste(tt$conf.int, collapse = ", "), ")"),
      pvalue = tt$p.value,
      stringsAsFactors = FALSE
    ),
    as.list(counts)
  )
}
myDF %>% group_by(Name) %>% do(my.t.test(.,X~Group))
Source: local data frame [2 x 7]
Groups: Name
Name control treatment CI pvalue ncontrol ntreatment
1 A 0.01950384 0.05000860 (-0.031677109707283, -0.0293323994902097) 1.195226e-09 5 5
2 B 0.02117864 0.06541046 (-0.0775829100729602, -0.010880719830447) 1.613625e-02 5 5
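As a quick sanity check, the helper can also be called on a single Name outside of do(); a minimal usage sketch:
# run the helper on Name "A" alone; the row should match the table above
my.t.test(subset(myDF, Name == "A"), X ~ Group)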
