I have a table with 40+ columns and 200.000+ rows.
Something like this:
ID GROUP-A1 GROUP-A2 GROUP A3...A20 GROUP-B1 GROUP-B2 GROUP-B3...B20
1 5 6 3 5....3 10 21 9 15
2 3 4 6 2....13 23 42 34 23
3 5 3 1 0....12 10 12 43 15
4 0 0 2 5....3 10 21 23 15
I would like to run a t-test for the two groups A (1..20) and B (1..20) for every measurement I have (each row), which are independent. And possibly, have the resulting stats in the table next to each row or in a separate table, so I can easily select the significant ones.
I looked at few R packages but they mostly would require reformatting the table I have, to put measurements and groups in columns, and I would need 200.000+ separate tables in that case.
Any idea?
Something like this?
apply(df,1,function(x){t.test(x[2:21],x[22:41])})
To save the test statistic or p-value in a new column you could do
df$st=apply(df,1,function(x){t.test(x[2:21],x[22:41])$stat})
or $p.value
You can run all tests with the following code.
i_group_a <- grep("GROUP.A", names(df1), ignore.case = TRUE)
i_group_b <- grep("GROUP.B", names(df1), ignore.case = TRUE)
ttest_list <- lapply(seq_along(i_group_a), function(k){
i <- i_group_a[k]
j <- i_group_b[k]
t.test(df1[[i]], df1[[j]])
})
ttest_list[[1]]
#
# Welch Two Sample t-test
#
#data: df1[[i]] and df1[[j]]
#t = -2.8918, df = 3.7793, p-value = 0.04763
#alternative hypothesis: true difference in means is not equal to 0
#95 percent confidence interval:
# -19.826402 -0.173598
#sample estimates:
#mean of x mean of y
# 3.25 13.25
To extract, for instance, the p-values:
pval <- sapply(ttest_list, `[[`, 'p.value')
pval
#[1] 0.04762593 0.04449075 0.04390115 0.00192454
Data.
df1 <- read.table(text = "
ID GROUP-A1 GROUP-A2 GROUP-A3 GROUP-A20 GROUP-B1 GROUP-B2 GROUP-B3 GROUP-B20
1 5 6 3 5 10 21 9 15
2 3 4 6 2 23 42 34 23
3 5 3 1 0 10 12 43 15
4 0 0 2 5 10 21 23 15
", header = TRUE)
You can do this with tidyverse using purrr. It does however require to format your data differently. Here is an example:
require(tidyverse)
set.seed(314)
simulate your data
df <- data.frame(ID = rep(1:5,each = 20),
participant = rep(rep(1:10,2),5),
group = rep(rep(c('A','B'),each = 10),5),
answer = sample(1:10,100, replace = T))
dfflat <- df %>%
unite(column, group,participant) %>%
spread(column,answer)
dfflat:
ID A_1 A_10 A_2 A_3 A_4 A_5 A_6 A_7 A_8 A_9 B_1 B_10 B_2 B_3 B_4 B_5 B_6 B_7 B_8 B_9
1 1 1 8 3 8 3 3 4 3 4 6 4 4 2 3 3 6 4 8 6 1
2 2 7 6 5 6 3 1 6 4 1 3 3 6 7 1 5 5 2 10 10 6
3 3 4 3 8 5 9 7 9 7 3 1 8 2 7 6 8 3 5 6 9 4
4 4 5 4 8 2 4 1 4 6 2 2 1 1 7 10 6 9 7 7 10 1
5 5 4 1 5 10 3 5 3 10 8 3 7 3 4 6 6 9 10 7 4 5
the equivalent in long format:
dfflat %>%
gather(participant,answer,-ID) %>%
separate(participant,c('group','number'))
ID group number answer
1 1 A 1 1
2 2 A 1 7
3 3 A 1 4
4 4 A 1 5
5 5 A 1 4
6 1 A 10 8
7 2 A 10 6
8 3 A 10 3
9 4 A 10 4
10 5 A 10 1
11 1 A 2 3
12 2 A 2 5
13 3 A 2 8
14 4 A 2 8
15 5 A 2 5
16 1 A 3 8
17 2 A 3 6
18 3 A 3 5
19 4 A 3 2
20 5 A 3 10
...
Test the hypothesis with t.test per ID and extract the p.value
dfflat %>%
gather(participant,answer,-ID) %>%
separate(participant,c('group','number')) %>%
group_by(ID) %>%
nest() %>%
mutate(test = map(data, ~ with(.x, t.test(answer[group == 'A'],answer[group == 'B']))),
p.value = map_dbl(test,pluck,'p.value'))
results in:
# A tibble: 5 x 4
ID data test p.value
<int> <list> <list> <dbl>
1 1 <tibble [20 x 3]> <S3: htest> 0.841
2 2 <tibble [20 x 3]> <S3: htest> 0.284
3 3 <tibble [20 x 3]> <S3: htest> 0.863
4 4 <tibble [20 x 3]> <S3: htest> 0.137
5 5 <tibble [20 x 3]> <S3: htest> 0.469
Related
As the title, How do I get the second/third largest/smallest value across rows by dplyr? Is there an elegant way to achieve it?
a <- data.frame(gp1=c(3:11), gp2=c(1:9), gp3=c(8,8,2,6,6,6,12,12,6))
## the max/min value is very simple
a %>%
rowwise() %>%
mutate(max1=max(gp1, gp2, gp3))
#
# # A tibble: 9 × 4
# # Rowwise:
# gp1 gp2 gp3 max1
# <int> <int> <dbl> <dbl>
# 1 3 1 8 8
# 2 4 2 8 8
# 3 5 3 2 5
# 4 6 4 6 6
# 5 7 5 6 7
# 6 8 6 6 8
# 7 9 7 12 12
# 8 10 8 12 12
# 9 11 9 6 11
The result should be similar to this:
#
# # A tibble: 9 × 4
# # Rowwise:
# gp1 gp2 gp3 max1 max2
# <int> <int> <dbl> <dbl> <dbl>
# 1 3 1 8 8 3
# 2 4 2 8 8 4
# 3 5 3 2 5 3
# 4 6 4 6 6 6
# 5 7 5 6 7 6
# 6 8 6 6 8 6
# 7 9 7 12 12 9
# 8 10 8 12 12 12
# 9 11 9 6 11 9
You can use c_across along with sort. The use of rev here reverses the sorted data, making it easy to select the largest value with index 1, the second-largest with index 2, etc.
Note that column "max2" in your example output makes errors in certain rows (I think you may have been including the "max1" column in some cases).
a %>%
rowwise() %>%
mutate(
max1 = max(gp1, gp2, gp3),
max2 = rev(sort(c_across(c(gp1, gp2, gp3))))[2]
)
gp1 gp2 gp3 max1 max2
<int> <int> <dbl> <dbl> <dbl>
1 3 1 8 8 3
2 4 2 8 8 4
3 5 3 2 5 3
4 6 4 6 6 6
5 7 5 6 7 6
6 8 6 6 8 6
7 9 7 12 12 9
8 10 8 12 12 10
9 11 9 6 11 9
A solution with pmap which does not involve rowwise:
library(purrr)
a %>%
mutate(max1 = pmax(gp1, gp2, gp3),
max2 = pmap(., ~ rev(sort(c(..1, ..2, ..3)))[2]))
gp1 gp2 gp3 max1 max2
1 3 1 8 8 3
2 4 2 8 8 4
3 5 3 2 5 3
4 6 4 6 6 6
5 7 5 6 7 6
6 8 6 6 8 6
7 9 7 12 12 9
8 10 8 12 12 10
9 11 9 6 11 9
I am sure there is a shorter way to automate it, but here is a quick solution for now:
library(dplyr)
library(slider)
a %>%
rowwise() %>%
mutate(output = list(slide_dfc(sort(c_across(everything()), decreasing = TRUE), max, .before = 1, .complete = TRUE))) %>%
unnest_wider(output) %>%
rename_with(~ sub('\\.+(\\d)', 'Max_\\1', .), contains('.')) %>%
suppressMessages()
# A tibble: 9 × 5
gp1 gp2 gp3 Max_1 Max_2
<int> <int> <dbl> <dbl> <dbl>
1 3 1 8 8 3
2 4 2 8 8 4
3 5 3 2 5 3
4 6 4 6 6 6
5 7 5 6 7 6
6 8 6 6 8 6
7 9 7 12 12 9
8 10 8 12 12 10
9 11 9 6 11 9
An option with pmax
library(dplyr)
a %>%
mutate(max1 = do.call(pmax, across(everything())),
across(starts_with('gp'), ~ replace(.x, .x == max1, NA))) %>%
transmute(max2 = do.call(pmax, c(across(starts_with('gp')), na.rm = TRUE))) %>%
bind_cols(a, .)
-output
gp1 gp2 gp3 max2
1 3 1 8 3
2 4 2 8 4
3 5 3 2 3
4 6 4 6 4
5 7 5 6 6
6 8 6 6 6
7 9 7 12 9
8 10 8 12 10
9 11 9 6 9
Or in base R
a$max2 <- do.call(pmax, c(replace(a, cbind(seq_len(nrow(a)),
max.col(a, 'first')), NA), na.rm = TRUE))
a$max2
[1] 3 4 3 6 6 6 9 10 9
I am having some trouble getting mutate, across, and case_when to function properly, I've recreated a simple version of my problem here:
a <- c(1:10)
b <- c(2:11)
c <- c(3:12)
test <- tibble(a, b, c)
# A tibble: 10 x 3
a b c
<int> <int> <int>
1 1 2 3
2 2 3 4
3 3 4 5
4 4 5 6
5 5 6 7
6 6 7 8
7 7 8 9
8 8 9 10
9 9 10 11
10 10 11 12
My goal is to replace all of the 3's with 4's, and keep everything else the same. I have the following code:
test_1 <-
test %>%
mutate(across(a:c, ~ case_when(. == 3 ~ 4)))
# A tibble: 10 x 3
a b c
<dbl> <dbl> <dbl>
1 NA NA 4
2 NA 4 NA
3 4 NA NA
4 NA NA NA
5 NA NA NA
6 NA NA NA
7 NA NA NA
8 NA NA NA
9 NA NA NA
10 NA NA NA
It's close but I get NA values where I want to maintain the value in the original tibble. How do I maintain the original values using the mutate across structure?
Thank you in advance!
What about this?
> test %>%
+ mutate(across(a:c, ~ case_when(. == 3 ~ 4, TRUE ~ 1 * (.))))
# A tibble: 10 x 3
a b c
<dbl> <dbl> <dbl>
1 1 2 4
2 2 4 4
3 4 4 5
4 4 5 6
5 5 6 7
6 6 7 8
7 7 8 9
8 8 9 10
9 9 10 11
10 10 11 12
or
> test %>%
+ replace(. == 3, 4)
# A tibble: 10 x 3
a b c
<int> <int> <int>
1 1 2 4
2 2 4 4
3 4 4 5
4 4 5 6
5 5 6 7
6 6 7 8
7 7 8 9
8 8 9 10
9 9 10 11
10 10 11 12
In base R, we can do
test[test ==3] <- 4
This also works:
a <- c(1:10)
b <- c(2:11)
c <- c(3:12)
tibble(a, b, c) %>%
modify(~ ifelse(. == 3, 4, .))
# A tibble: 10 x 3
a b c
<dbl> <dbl> <dbl>
1 1 2 4
2 2 4 4
3 4 4 5
4 4 5 6
5 5 6 7
6 6 7 8
7 7 8 9
8 8 9 10
9 9 10 11
10 10 11 12
I have the dataframe
df <- data.frame(e_1=c(1,2,3,4,5), e_2=c(1,3,5,7,9), e_3=c(2,4,6,8,1),
e_4=c(1,2,4,5,7), e_5=c(1,8,9,6,4), Lanes=c(3,4,3,5,4))
I try to use:
max(combn(df[,(1:5)],df$Lanes,FUN = function(i) rowSums(df[,(1:5)][i])))
I get the error
Error in combn(df[, (1:5)], df$Lanes, FUN = function(i) rowSums(df[, (1:5)][i])) : length(m) == 1L is not TRUE
I guess you can try using combn row-wise, e.g.,
df$comb <- apply(df,1,function(v) max(combn(v[1:5],v["Lanes"],sum)))
such that
> df
e_1 e_2 e_3 e_4 e_5 Lanes comb
1 1 1 2 1 1 3 4
2 2 3 4 2 8 4 17
3 3 5 6 4 9 3 20
4 4 7 8 5 6 5 30
5 5 9 1 7 4 4 25
Using dplyr and purrr for this one could look as follows.
library(dplyr)
library(purrr)
library(rlang)
df <- data.frame(e_1=c(1,2,3,4,5), e_2=c(1,3,5,7,9), e_3=c(2,4,6,8,1),
e_4=c(1,2,4,5,7), e_5=c(1,8,9,6,4), Lanes=c(3,4,3,5,4))
df %>%
mutate(eSum = pmap(list(!!!parse_exprs(colnames(.))),
~ max(colSums(combn(c(..1, ..2, ..3, ..4, ..5), ..6)))))
# e_1 e_2 e_3 e_4 e_5 Lanes eSum
# 1 1 1 2 1 1 3 4
# 2 2 3 4 2 8 4 17
# 3 3 5 6 4 9 3 20
# 4 4 7 8 5 6 5 30
# 5 5 9 1 7 4 4 25
An option with c_across from dplyr
library(dplyr)
df %>%
rowwise %>%
mutate(Comb = max(combn(c_across(starts_with('e')), Lanes, FUN = sum)))
# A tibble: 5 x 7
# Rowwise:
# e_1 e_2 e_3 e_4 e_5 Lanes Comb
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1 1 2 1 1 3 4
#2 2 3 4 2 8 4 17
#3 3 5 6 4 9 3 20
#4 4 7 8 5 6 5 30
#5 5 9 1 7 4 4 25
I have a data.frame which looks like the following:
id <- c("a","a","a","a","b","b","b","b")
age_from <- c(0,2,3,7,0,1,2,6)
age_to <- c(2,3,7,10,1,2,6,10)
y <- c(100,150,100,250,300,200,100,150)
df <- data.frame(id,age_from,age_to,y)
df$years <- df$age_to - df$age_from
Which gives a df that looks like:
id age_from age_to y years
1 a 0 2 100 2
2 a 2 3 150 1
3 a 3 7 100 4
4 a 7 10 250 3
5 b 0 1 300 1
6 b 1 2 200 1
7 b 2 6 100 4
8 b 6 10 150 4
Instead of having an unequal number of years per row, I would like to have 20 rows, 10 for each id, with each row accounting for one year. This would also involve averaging the y column across the number of years listed in the years column.
I believe this may have to be done using a loop 1:n with the n equaling a value in the years column. Although I am not sure how to start with this.
You can use rep to repeat the rows by the number of given years.
x <- df[rep(seq_len(nrow(df)), df$years),]
x
# id age_from age_to y years
#1 a 0 2 50.00000 2
#1.1 a 0 2 50.00000 2
#2 a 2 3 150.00000 1
#3 a 3 7 25.00000 4
#3.1 a 3 7 25.00000 4
#3.2 a 3 7 25.00000 4
#3.3 a 3 7 25.00000 4
#4 a 7 10 83.33333 3
#4.1 a 7 10 83.33333 3
#4.2 a 7 10 83.33333 3
#5 b 0 1 300.00000 1
#6 b 1 2 200.00000 1
#7 b 2 6 25.00000 4
#7.1 b 2 6 25.00000 4
#7.2 b 2 6 25.00000 4
#7.3 b 2 6 25.00000 4
#8 b 6 10 37.50000 4
#8.1 b 6 10 37.50000 4
#8.2 b 6 10 37.50000 4
#8.3 b 6 10 37.50000 4
When you mean with averaging the y column across the number of years to divide by the number of years:
x$y <- x$y / x$years
In case age_from should go from 0 to 9 and age_to from 1 to 10 for each id:
x$age_from <- x$age_from + ave(x$age_from, x$id, x$age_from, FUN=seq_along) - 1
#x$age_from <- ave(x$age_from, x$id, FUN=seq_along) - 1 #Alternative
x$age_to <- x$age_from + 1
Here is a solution with tidyr and dplyr.
First of all we complete age_from from 0 to 9 as you wanted, by keeping only the existing ids.
You will have several NAs on age_to, y and years. So, we fill them by dragging down each value in order to complete the immediately following values that are NA.
Now you can divide y by years (I assumed you meant this by setting the average value so to leave the sum consistent).
At that point, you only need to recalculate age_to accordingly.
Remember to ungroup at the end!
library(tidyr)
library(dplyr)
df %>%
complete(id, age_from = 0:9) %>%
group_by(id) %>%
fill(y, years, age_to) %>%
mutate(y = y/years) %>%
mutate(age_to = age_from + 1) %>%
ungroup()
# A tibble: 20 x 5
id age_from age_to y years
<chr> <dbl> <dbl> <dbl> <dbl>
1 a 0 1 50 2
2 a 1 2 50 2
3 a 2 3 150 1
4 a 3 4 25 4
5 a 4 5 25 4
6 a 5 6 25 4
7 a 6 7 25 4
8 a 7 8 83.3 3
9 a 8 9 83.3 3
10 a 9 10 83.3 3
11 b 0 1 300 1
12 b 1 2 200 1
13 b 2 3 25 4
14 b 3 4 25 4
15 b 4 5 25 4
16 b 5 6 25 4
17 b 6 7 37.5 4
18 b 7 8 37.5 4
19 b 8 9 37.5 4
20 b 9 10 37.5 4
A tidyverse solution.
library(tidyverse)
df %>%
mutate(age_to = age_from + 1) %>%
group_by(id) %>%
complete(nesting(age_from = 0:9, age_to = 1:10)) %>%
fill(y, years) %>%
mutate(y = y / years)
# A tibble: 20 x 5
# Groups: id [2]
id age_from age_to y years
<chr> <dbl> <dbl> <dbl> <dbl>
1 a 0 1 50 2
2 a 1 2 50 2
3 a 2 3 150 1
4 a 3 4 25 4
5 a 4 5 25 4
6 a 5 6 25 4
7 a 6 7 25 4
8 a 7 8 83.3 3
9 a 8 9 83.3 3
10 a 9 10 83.3 3
11 b 0 1 300 1
12 b 1 2 200 1
13 b 2 3 25 4
14 b 3 4 25 4
15 b 4 5 25 4
16 b 5 6 25 4
17 b 6 7 37.5 4
18 b 7 8 37.5 4
19 b 8 9 37.5 4
20 b 9 10 37.5 4
data=data.frame(person=c(1,1,1,2,2,2,2,3,3,3,3),
t=c(3,NA,9,4,7,NA,13,3,NA,NA,12),
WANT=c(3,6,9,4,7,10,13,3,6,9,12))
So basically I am wanting to create a new variable 'WANT' which takes the PREVIOUS value in t and ADDS 3 to it, and if there are many NA in a row then it keeps doing this. My attempt is:
library(dplyr)
data %>%
group_by(person) %>%
mutate(WANT_TRY = fill(t) + 3)
Here's one way -
data %>%
group_by(person) %>%
mutate(
# cs = cumsum(!is.na(t)), # creates index for reference value; uncomment if interested
w = case_when(
# rle() gives the running length of NA
is.na(t) ~ t[cumsum(!is.na(t))] + 3*sequence(rle(is.na(t))$lengths),
TRUE ~ t
)
) %>%
ungroup()
# A tibble: 11 x 4
person t WANT w
<dbl> <dbl> <dbl> <dbl>
1 1 3 3 3
2 1 NA 6 6
3 1 9 9 9
4 2 4 4 4
5 2 7 7 7
6 2 NA 10 10
7 2 13 13 13
8 3 3 3 3
9 3 NA 6 6
10 3 NA 9 9
11 3 12 12 12
Here is another way. We can do linear interpolation with the imputeTS package.
library(dplyr)
library(imputeTS)
data2 <- data %>%
group_by(person) %>%
mutate(WANT2 = na.interpolation(WANT)) %>%
ungroup()
data2
# # A tibble: 11 x 4
# person t WANT WANT2
# <dbl> <dbl> <dbl> <dbl>
# 1 1 3 3 3
# 2 1 NA 6 6
# 3 1 9 9 9
# 4 2 4 4 4
# 5 2 7 7 7
# 6 2 NA 10 10
# 7 2 13 13 13
# 8 3 3 3 3
# 9 3 NA 6 6
# 10 3 NA 9 9
# 11 3 12 12 12
This is harder than it seems because of the double NA at the end. If it weren't for that, then the following:
ifelse(is.na(data$t), c(0, data$t[-nrow(data)])+3, data$t)
...would give you want you want. The simplest way, that uses the same logic but doesn't look very clever (sorry!) would be:
.impute <- function(x) ifelse(is.na(x), c(0, x[-length(x)])+3, x)
.impute(.impute(data$t))
...which just cheats by doing it twice. Does that help?
You can use functional programming from purrr and "NA-safe" addition from hablar:
library(hablar)
library(dplyr)
library(purrr)
data %>%
group_by(person) %>%
mutate(WANT2 = accumulate(t, ~.x %plus_% 3))
Result
# A tibble: 11 x 4
# Groups: person [3]
person t WANT WANT2
<dbl> <dbl> <dbl> <dbl>
1 1 3 3 3
2 1 NA 6 6
3 1 9 9 9
4 2 4 4 4
5 2 7 7 7
6 2 NA 10 10
7 2 13 13 13
8 3 3 3 3
9 3 NA 6 6
10 3 NA 9 9
11 3 12 12 12