Resampling from a large data.frame in R

I have a large data.frame of this structure:
min.reps <- 1
max.reps <- 3
set.seed(1)
df <- do.call(rbind, lapply(1:100, function(i) {
  reps <- seq(1, as.integer(runif(1, min.reps, max.reps)), 1)
  vals <- runif(length(reps), 0, 100)
  data.frame(id = rep(i, length(reps)), rep = reps, val = vals, stringsAsFactors = F)
}))
head(df)
id rep val
1 1 1 37.212390
2 2 1 90.820779
3 2 2 20.168193
4 3 1 94.467527
5 3 2 66.079779
6 4 1 6.178627
Each df$id has between min.reps and max.reps observations of df$val. In reality, instead of 100 ids I have ~5,000,000.
For each df$id I'd like to add one more value, sampled from a normal distribution whose mean and sd are the median and MAD of that id's existing values, respectively.
This is trivial to do this way:
add.reps <- 1
all.ids <- unique(df$id)
require(dplyr)
new.df <- do.call(rbind, lapply(all.ids, function(i) {
  id.df <- dplyr::filter(df, id == i)
  rbind(id.df,
        data.frame(id = rep(i, add.reps),
                   rep = max(id.df$rep) + add.reps,
                   val = rnorm(add.reps, median(id.df$val), mad(id.df$val)),
                   stringsAsFactors = F))
}))
But I'm wondering if there's a much faster way to achieve this given the dimensions of my real data.frame.

This should be much faster:
add.reps <- 1
do.call(rbind, lapply(split(df, df$id), function(x)
  rbind(x,
        data.frame(id = rep(unique(x$id), add.reps),
                   rep = max(x$rep) + add.reps,
                   val = rnorm(add.reps, median(x$val), mad(x$val)),
                   stringsAsFactors = F))))

OK, so far the benchmarks look like this:
require(microbenchmark)
microbenchmark(
  new.df <- do.call(rbind, lapply(all.ids, function(i) {
    id.df <- dplyr::filter(df, id == i)
    rbind(id.df,
          data.frame(id = rep(i, add.reps),
                     rep = max(id.df$rep) + add.reps,
                     val = rnorm(add.reps, median(id.df$val), mad(id.df$val)),
                     stringsAsFactors = F))
  }))
)
expr: the dplyr::filter approach above
     min       lq     mean   median       uq      max neval
212.9906 225.1345 371.9314 260.9686 332.5619 1621.586   100
vs.
microbenchmark(
  new.df <- do.call(rbind, lapply(split(df, df$id), function(x)
    rbind(x,
          data.frame(id = rep(unique(x$id), add.reps),
                     rep = max(x$rep) + add.reps,
                     val = rnorm(add.reps, median(x$val), mad(x$val)),
                     stringsAsFactors = F))))
)
expr: the split approach above
     min       lq     mean   median       uq     max neval
133.8357 135.1846 202.9654 137.2722 160.5121 1401.03   100
I wonder if this can be improved further.
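One more avenue worth trying is data.table, which computes the grouped statistics in a single pass instead of materialising millions of sub-data.frames via split(). A minimal sketch, assuming the data.table package is available (whether it actually wins at ~5,000,000 ids is worth benchmarking rather than assuming):
library(data.table)
dt <- as.data.table(df)
# one grouped pass: compute the new rep/val for every id at once
new.rows <- dt[, .(rep = max(rep) + add.reps,
                   val = rnorm(add.reps, median(val), mad(val))),
               by = id]
new.dt <- rbind(dt, new.rows)[order(id, rep)]
With add.reps = 1 this produces the same shape of result as the split() version above.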

Related

How to append/combine more than 2 lists of data frames into one list efficiently?

I have lists of data frames that I want to combine into one. For instance:
ls1 <- list(data.frame(A = rnorm(3), B = rnorm(3)), data.frame(A = rnorm(3), B = rnorm(3)))
ls2 <- list(data.frame(A = rnorm(3), B = rnorm(3)), data.frame(A = rnorm(3), B = rnorm(3)))
ls3 <- list(data.frame(A = rnorm(3), B = rnorm(3)), data.frame(A = rnorm(3), B = rnorm(3)))
I found an answer on Stack Overflow that uses mapply, but I don't think it works for my case:
res1 <- mapply(c, ls1, ls2, ls3, SIMPLIFY = F)
res1 |> View()
It creates a list of length 2 with sublists of length 6. My desired output is this:
res2 <- append(ls1, ls2)
res2 <- append(res2, ls3)
res2 |> View()
I have more than 10 lists of data frames that I want to combine into one. Instead of using append, is there a more efficient way to combine them all in one list?
One option could be:
Reduce(c, mget(ls(pattern = "ls")))
Compared with the original results:
setequal(Reduce(c, mget(ls(pattern = "ls"))), res2)
[1] TRUE
unlist will be a little faster than Reduce.
n <- 3L
for (i in 1:1e3) assign(paste0("ls", i), list(data.frame(A = rnorm(n), B = rnorm(n)), data.frame(A = rnorm(n), B = rnorm(n))))
f1 <- function(envir = .GlobalEnv) unlist(mget(ls(pattern = "ls", envir = envir), envir = envir), recursive = FALSE, use.names = FALSE)
f2 <- function(envir = .GlobalEnv) Reduce(c, mget(ls(pattern = "ls", envir = envir), envir = envir))
microbenchmark::microbenchmark(f1 = f1(),
                               f2 = f2(),
                               check = "identical")
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> f1 1.8792 1.91630 1.971708 1.93410 1.98085 3.7408 100
#> f2 6.3691 6.47915 7.100086 6.59355 6.96825 15.7032 100
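Reduce(c, ...) concatenates pairwise, copying the accumulated result at every step, which is why it scales poorly. Handing all the lists to c() in a single call via do.call() avoids that; a small sketch using the same setup (where it lands relative to the two timings above is an assumption to verify):
f3 <- function(envir = .GlobalEnv) do.call(c, mget(ls(pattern = "ls", envir = envir), envir = envir))
# same elements as f2(); the names differ slightly, so compare contents
setequal(f3(), f2())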

Apply statistical test to many variables: improve speed

I have a dataframe with 40 rows and ~40000 columns. The 40 rows are split into group "A" and group "B" (20 each). For each column, I would like to apply a statistical test (wilcox.test()) comparing the two groups. I started using a for loop to run through the 40000 columns but it was very slow.
Minimal Reproducible Example (MRE):
library(tidyverse)
set.seed(123)
metrics <- paste("metric_", 1:40000, sep = "")
patient_IDs <- paste("patientID_", 1:40, sep = "")
m <- matrix(sample(1:20, 1600000, replace = TRUE), ncol = 40000, nrow = 40,
            dimnames = list(patient_IDs, metrics))
test_data <- as.data.frame(m)
test_data$group <- c(rep("A", 20), rep("B", 20))
# Collate list of metrics to analyse ("check") for significance
list_to_check <- colnames(test_data)[1:40000]
Original 'loop' method (this is what I want to vectorise):
# Create a variable to store the results
results_A_vs_B <- c()
# Loop through the "list_to_check" and,
# for each 'value', compare group A with group B
# and load the results into the "results_A_vs_B" variable
for (i in list_to_check) {
  outcome <- wilcox.test(test_data[test_data$group == "A", ][[i]],
                         test_data[test_data$group == "B", ][[i]],
                         exact = FALSE)
  if (!is.nan(outcome$p.value) && outcome$p.value <= 0.05) {
    results_A_vs_B[i] <- paste(outcome$p.value, sep = "\t")
  }
}
# Format the results into a dataframe
summarised_results_A_vs_B <- as.data.frame(results_A_vs_B) %>%
  rownames_to_column(var = "A vs B") %>%
  rename("Wilcox Test P-value" = "results_A_vs_B")
Benchmarking the answers so far:
# Ronak Shah's "Map" approach
Map_func <- function(dataset, list_to_check) {
  tmp <- split(dataset[list_to_check], dataset$group)
  stack(Map(function(x, y) wilcox.test(x, y, exact = FALSE)$p.value, tmp[[1]], tmp[[2]]))
}
# @Onyambu's data.table method
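# (note: setDT() below converts the input to a data.table by reference, so it alters the caller's data.frame)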
dt_func <- function(dataset, list_to_check) {
  melt(setDT(dataset), measure.vars = list_to_check)[
    , dcast(.SD, rowid(group) + variable ~ group)][
    , wilcox.test(A, B, exact = FALSE)$p.value, variable]
}
# @Park's dplyr method (with some minor tweaks)
dplyr_func <- function(dataset, list_to_check){
dataset %>%
summarise(across(all_of(list_to_check),
~ wilcox.test(.x ~ group, exact = FALSE)$p.value)) %>%
pivot_longer(cols = everything(),
names_to = "Metrics",
values_to = "Wilcox Test P-value")
}
library(microbenchmark)
res_map <- microbenchmark(Map_func(test_data, list_to_check), times = 10)
res_dplyr <- microbenchmark(dplyr_func(test_data, list_to_check), times = 2)
library(data.table)
res_dt <- microbenchmark(dt_func(test_data, list_to_check), times = 10)
autoplot(rbind(res_map, res_dt, res_dplyr))
# Excluding dplyr
autoplot(rbind(res_map, res_dt))
Running the code on a server took a couple of seconds longer but the difference between Map and data.table was more pronounced (laptop = 4 cores, server = 8 cores):
autoplot(rbind(res_map, res_dt))
Here is another option:
Map_approach <- function(dataset, list_to_check) {
  tmp <- split(dataset[list_to_check], dataset$group)
  stack(Map(function(x, y) wilcox.test(x, y)$p.value, tmp[[1]], tmp[[2]]))
}
Map_approach(data_subset, list_to_check)
# values ind
#1 5.359791e-05 value_1
#2 5.499685e-08 value_2
#3 1.503951e-06 value_3
#4 6.179352e-08 value_4
#5 5.885650e-08 value_5
Testing on a larger sample, Map is slightly faster than the for loop.
n <- 1e6
data_subset <- data.frame(patient_ID = 1:n,
                          group = c(rep("A", n/2), rep("B", n/2)),
                          value_1 = c(sample(1:10, n/2, replace = TRUE),
                                      sample(5:15, n/2, replace = TRUE)),
                          # parentheses needed below: ':' binds tighter than '/'
                          value_2 = c(sample(1:5, n/2, replace = TRUE),
                                      sample(15:(n/2), n/2, replace = TRUE)),
                          value_3 = c(sample(1:12, n/2, replace = TRUE),
                                      sample(8:17, n/2, replace = TRUE)),
                          value_4 = c(sample(5:10, n/2, replace = TRUE),
                                      sample(15:25, n/2, replace = TRUE)),
                          value_5 = c(sample(20:40, n/2, replace = TRUE),
                                      sample(10:15, n/2, replace = TRUE)))
microbenchmark::microbenchmark(loop = wilcox_loop(data_subset, list_to_check),
                               Map = Map_approach(data_subset, list_to_check))
#Unit: seconds
# expr min lq mean median uq max neval cld
# loop 5.254573 5.631162 5.788624 5.734480 5.920424 6.756319 100 b
# Map 4.710790 5.084783 5.201711 5.160722 5.309048 5.721540 100 a
Could you try this code? It's slightly faster on my computer.
wilcox_loop2 <- function(data_subset, list_to_check){
  A <- data_subset[data_subset$group == "A", ]
  B <- data_subset[data_subset$group == "B", ]
  outcome <- sapply(list_to_check, function(x) wilcox.test(A[[x]], B[[x]], exact = FALSE)$p.value)
  as.data.frame(outcome) %>%
    rownames_to_column(var = "A vs B") %>%
    rename("Wilcox Test P-value" = "outcome")
}
I'm not sure it's OK to split the data into A and B, though...
My timings look like this:
microbenchmark::microbenchmark(origin = wilcox_loop(data_subset, list_to_check),
                               test = wilcox_loop2(data_subset, list_to_check))
Unit: milliseconds
expr min lq mean median uq max neval cld
origin 4.815601 5.006951 6.490757 5.385502 6.790752 21.5876 100 b
test 3.817801 4.116151 5.146963 4.330500 4.870651 15.8271 100 a
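Splitting into A and B up front is fine here: wilcox.test(A[[x]], B[[x]]) is the two-sample form of the same test the formula interface runs. Since all 40,000 metrics are numeric, a further option is to skip data.frame subsetting entirely and loop over the matrix columns (packages such as matrixTests also offer vectorized row-/column-wise Wilcoxon tests). A hedged sketch against the MRE above, with any speed gain left to benchmarking:
# work on the numeric matrix m from the MRE directly; split the rows once
mA <- m[test_data$group == "A", ]
mB <- m[test_data$group == "B", ]
p_values <- vapply(colnames(m),
                   function(j) wilcox.test(mA[, j], mB[, j], exact = FALSE)$p.value,
                   numeric(1))
head(sort(p_values))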

Find variables that occur only in ONE row in R

Using BASE R, I wonder how to answer the following question:
Are there any values of X or Y that occur in only one row and no others? If yes, produce my desired output below.
f <- data.frame(id = c(rep("AA",4), rep("BB",2), rep("CC",2)), X = c(1,2,2,3,1,4,3,3),
Y = c(99,7,8,7,6,7,7,7))
Desired output:
list(BB = c(X = 4, Y = 6), AA = c(Y = c(99, 8)))
# $BB
# X Y
# 4 6
# $AA
# Y1 Y2 # Would be a plus if shows `Y Y` instead of `Y1 Y2`
# 99 8
There are two big ideas with this base approach:
Since we need to compare all the values, we should just recombine everything into one data.frame.
Making the unsplit data.frame long will save us some extra steps.
#https://stackoverflow.com/questions/58786052/find-variables-that-occur-only-once-across-a-split-data-frame-in-r/58788854#58788854
f <- data.frame(id = c(rep("AA",4), rep("BB",2), rep("CC",2)), X = c(1,2,2,3,1,4,3,3),
Y = c(99,7,8,7,6,7,7,7))
m <- split(f, f$id) # Here is `m`
unsplit <- do.call(rbind, c(m, make.row.names = F))
molten <- data.frame(unsplit[, 1, drop = F], stack(unsplit[, -1]))
# res <- subset(molten, !duplicated(values) & !duplicated(values, fromLast = T))
res <- molten[as.logical(ave(molten[['values']], molten[['ind']],
                             FUN = function(x) !duplicated(x) & !duplicated(x, fromLast = T))), ]
#I would stop here
res
#> id values ind
#> 6 BB 4 X
#> 9 AA 99 Y
#> 11 AA 8 Y
#> 13 BB 6 Y
#to get exact output
res_vector <- res$values
names(res_vector) <- res$ind
split(res_vector, as.character(res$id))
#> $AA
#> Y Y
#> 99 8
#>
#> $BB
#> X Y
#> 4 6
Created on 2019-11-10 by the reprex package (v0.3.0)
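The core trick above is the double-duplicated() filter: a value occurring exactly once is a duplicate neither scanning forward nor scanning backward. A two-line illustration:
x <- c(1, 2, 2, 3)
!duplicated(x) & !duplicated(x, fromLast = TRUE)
#> [1]  TRUE FALSE FALSE  TRUE   # only 1 and 3 occur exactly once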
Here's another base approach that may be less complicated:
#### Way 1 with rapply
mods <- c("X", "Y")  # the value columns (not defined in this excerpt; inferred from the data)
vec <- rapply(lapply(m, '[', mods), I)
unique_vec <- vec[!duplicated(vec) & !duplicated(vec, fromLast = T)]
vec_names <- do.call(rbind, strsplit(names(unique_vec), '.', fixed = T))
names(unique_vec) <- substr(vec_names[, 2], 1, 1) #turns Y1 into Y
split(unique_vec, vec_names[, 1])
#### Way 2 with the data.frame f, which is already do.call(rbind, m)
vec <- unlist(
lapply(f[, -1],
function(x){
ind <- !duplicated(x) & !duplicated(x, fromLast = T)
ret <- x[ind]
names(ret) <- f[ind, 1]
ret
}
)
)
#this is likely overly simplified:
split(vec, sub('.*\\.', '', names(vec)))
#this leads to exact result
vec_names <- do.call(rbind, strsplit(names(vec), '.', fixed = T))
names(vec) <- vec_names[, 1]
split(vec, vec_names[, 2])
$AA
Y Y
99 8
$BB
X Y
4 6
The OP's hint brings up table(), but duplicated() is notably more performant:
unlist(lapply(f[mods], function(y) names(which(table(y) == 1))))
# X Y1 Y2 Y3
# "4" "6" "8" "99"
vec
#X.BB Y.AA Y.AA Y.BB
# 4 99 8 6
bench::mark(
  table_meth = {unlist(lapply(f[mods], function(y) names(which(table(y) == 1))))},
  dup_meth = {
    # could get a slight performance boost with
    # f_id <- f[['id']]
    unlist(
      lapply(f[, -1],
             function(x){
               ind <- !duplicated(x) & !duplicated(x, fromLast = T)
               ret <- x[ind]
               names(ret) <- f[ind, 1]
               # names(ret) <- f_id[ind]
               ret
             })
    )},
  check = F
)
# A tibble: 2 x 13
  expression   min median `itr/sec` mem_alloc
  <bch:expr> <bch> <bch:>     <dbl> <bch:byt>
1 table_meth 321us  336us     2794.    10.3KB
2 dup_meth   132us  136us     7105.    31.7KB
And a similar idea in data.table, grouping by variable so values are deduplicated within each column:
library(data.table)
molten_dt <- melt(rbindlist(m), id.vars = 'id')
molten_dt[molten_dt[, .I[!duplicated(value) & !duplicated(value, fromLast = TRUE)],
                    by = variable]$V1]
And a similar idea in dplyr:
library(dplyr)
library(tidyr)
m%>%
bind_rows()%>%
pivot_longer(cols = -id)%>%
group_by(name)%>%
filter(!duplicated(value) & !duplicated(value, fromLast = T))%>%
group_by(id)%>%
group_split()
It's not pure functional programming but it is base R:
lapply(split(df, df$id), function(z){
X <- z$X[which(!(z$X %in% df$X[duplicated(df$X)]))]
Y <- z$Y[which(!(z$Y %in% df$Y[duplicated(df$Y)]))]
cbind(X, Y)
}
)
Data:
df <- structure(list(
  id = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L),
                 .Label = c("AA", "BB", "CC"),
                 class = "factor"),
  X = c(1, 2, 2, 3, 1, 4, 3, 3),
  Y = c(99, 7, 8, 7, 6, 7, 7, 7)
), class = "data.frame", row.names = c(NA, -8L))

Efficient way to paste multiple column pairs in R data.table

I'm looking for an efficient way to paste/combine multiple pairs of adjacent columns at once using data.table. My feeble attempt is slow and not so elegant:
library(data.table)
dt <- data.table(ids = 1:3,
                 x1 = c("A", "B", "C"),
                 x2 = 1:3,
                 y1 = c("D", "E", "F"),
                 y2 = 4:6,
                 z1 = c("G", "H", "I"),
                 z3 = 7:9)
paste.pairs <- function(x, sep = "-"){
  xx <- unlist(x)
  x.len <- length(xx)
  r <- rep(NA, x.len/2)
  s <- seq(1, x.len, by = 2)  # index of the first column in each pair
  for(i in 1:(x.len/2)) {
    r[i] <- paste(xx[s[i]], xx[s[i] + 1], sep = sep)
  }
  return(as.list(r))
}
dt[, paste.pairs(.SD), by = "ids"]
Is there a better way?
An option with Map, creating the column indices with seq:
i1 <- seq(1, length(dt)-1, 2)
i2 <- seq(2, length(dt)-1, 2)
dt[, Map(paste,
         .SD[, i1, with = FALSE], .SD[, i2, with = FALSE],
         MoreArgs = list(sep = "-")),
   by = "ids"]
Another option would be to split by the names of the dataset and then paste:
data.frame(lapply(split.default(dt[, -1, with = FALSE],
                                sub("\\d+$", "", names(dt)[-1])),
                  function(x) do.call(paste, c(x, sep = "-"))))
# x y z
#1 A-1 D-4 G-7
#2 B-2 E-5 H-8
#3 C-3 F-6 I-9
Or another option is with melt/dcast:
dcast(melt(dt, id.var = 'ids')[, paste(value, collapse = "-"),
                               .(grp = sub("\\d+", "", variable), ids)],
      ids ~ grp, value.var = 'V1')
A solution using matrices:
#create matrices
#use the columns you want to paste together...
m1 <- as.matrix( dt[,c(2,4,6)] )
m2 <- as.matrix( dt[, c(3,5,7)] )
#paste the matrices element-by-element, and convert result back to data.table
as.data.table( matrix( paste( m1, m2, sep="-"), nrow=nrow(m1), dimnames=dimnames(m1) ) )
Should run pretty fast, and is very readable and easy to adapt.
output
# x1 y1 z1
# 1: A-1 D-4 G-7
# 2: B-2 E-5 H-8
# 3: C-3 F-6 I-9
benchmarks
microbenchmark::microbenchmark(
wimpel = {
#create matrices
m1 <- as.matrix( dt[,c(2,4,6)] )
m2 <- as.matrix( dt[, c(3,5,7)] )
#paste the matrices element-by-element, and convert to data.table
as.data.table( matrix( paste( m1, m2, sep="-"), nrow=nrow(m1), dimnames=dimnames(m1) ) )
},
akrun_df = {
data.frame(lapply(split.default(dt[, -1, with = FALSE],
sub("\\d+$", "", names(dt)[-1])), function(x) do.call(paste, c(x, sep="-"))))
},
akrun_map = {
i1 <- seq(2, length(dt), 2)
i2 <- seq(3, length(dt), 2)
dt[, Map(paste, .SD[, i1, with = FALSE], .SD[, i2, with = FALSE], MoreArgs = list(sep="-"))]
},
akrun_dcast = {
dcast(melt(dt, id.var = 'ids')[, paste(value, collapse = "-"),.(grp = sub("\\d+", "", variable), ids)], ids ~ grp, value.var = 'V1')
},
times = 10 )
# Unit: microseconds
# expr min lq mean median uq max neval
# wimpel 303.072 315.122 341.2417 319.1895 327.775 531.429 10
# akrun_df 1022.790 1028.515 1251.7812 1069.1850 1172.519 2779.460 10
# akrun_map 742.013 751.051 785.6059 778.1650 799.855 884.812 10
# akrun_dcast 4104.719 4175.215 4414.6596 4348.7430 4650.911 4939.221 10
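If the number of column pairs varies, the matrix approach generalizes by deriving the paired indices from position instead of hard-coding them; a sketch assuming the pairs always sit in adjacent columns after ids:
i1 <- seq(2, ncol(dt), by = 2)   # first member of each pair
i2 <- seq(3, ncol(dt), by = 2)   # second member of each pair
m1 <- as.matrix(dt[, i1, with = FALSE])
m2 <- as.matrix(dt[, i2, with = FALSE])
as.data.table(matrix(paste(m1, m2, sep = "-"), nrow = nrow(m1), dimnames = dimnames(m1)))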

Using tidyverse to randomly get means from conditions

I have a long-format data frame with columns ID, Condition, and Y, one row per trial (the table is shown as an image in the original post).
What I'd like to do is (1) randomly split the trials from Condition 1 into 4 groups and then calculate the mean of Y for each ID, and (2) carry out the same procedure for the trials in Condition 2, yielding one column per condition/group combination (the desired output is also shown as an image in the original post).
Is there a concise way to do this using the tidyverse? I'm just getting started and am having a hard time with this!
Assuming that your data frame is not that small, this should do exactly what you need.
library(dplyr)
library(tidyr)
set.seed(2)
df <- tibble(ID = c(rep(1, 50), rep(2, 50)),
             Condition = rep(c(1,1,2,2,2,1,1,2,2,2,1,1,1,2,2,1,1,1,2,2), 5),
             Y = rnorm(100, 100, 20))  # creates some fake data
result_df <- df %>%
  mutate(randomizer = sample(1:4, n(), replace = TRUE)) %>%
  group_by(ID, Condition, randomizer) %>%
  summarize(mean_y = mean(Y, na.rm = TRUE)) %>%
  mutate(condition_status = paste0("Condition", Condition, "M", randomizer)) %>%
  ungroup() %>%
  dplyr::select(-Condition, -randomizer) %>%
  spread(condition_status, mean_y)
result_df
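For a balanced split (the same number of trials in each of the four groups, rather than an independent draw per row), a variant along these lines may be closer to the goal; a sketch assuming dplyr and tidyr, with pivot_wider standing in for spread():
df %>%
  group_by(ID, Condition) %>%
  mutate(grp = sample(rep(1:4, length.out = n()))) %>%  # balanced random assignment
  group_by(ID, Condition, grp) %>%
  summarize(mean_y = mean(Y), .groups = "drop") %>%
  pivot_wider(names_from = c(Condition, grp), values_from = mean_y,
              names_glue = "Condition{Condition}M{grp}")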
Not the best way, but it does work:
x <- data.frame(ID = c(rep(1, 10), rep(2, 10)),
                Condition = c(1,1,2,2,2,1,1,2,2,2,1,1,1,2,2,1,1,1,2,2),
                Y = c(100,200,80,58,89,100,200,80,58,89,95,72,99,120,130,95,72,99,120,130))
id <- unique(x[, 1])
con <- unique(x[, 2])
# one column per condition x group combination, plus the ID column
y <- data.frame(matrix(, ncol = (length(con) * 4) + 1, nrow = length(id)))
y[, 1] <- id
colnames(y) <- c("ID", "Condition1M1", "Condition1M2", "Condition1M3", "Condition1M4",
                 "Condition2M1", "Condition2M2", "Condition2M3", "Condition2M4")
x1 <- x[which(x[,2] == con[1]),]
x2 <- x[which(x[,2] == con[2]),]
#specify sample size
n <- 5
x11 <- x1[sample(nrow(x1), n, replace = FALSE),]
x12 <- x1[sample(nrow(x1), n, replace = FALSE),]
x13 <- x1[sample(nrow(x1), n, replace = FALSE),]
x14 <- x1[sample(nrow(x1), n, replace = FALSE),]
x21 <- x2[sample(nrow(x2), n, replace = FALSE),]
x22 <- x2[sample(nrow(x2), n, replace = FALSE),]
x23 <- x2[sample(nrow(x2), n, replace = FALSE),]
x24 <- x2[sample(nrow(x2), n, replace = FALSE),]
# fill in the means: columns 2-5 hold the Condition-1 subsamples,
# columns 6-9 the Condition-2 subsamples, one row per ID
samples <- list(x11, x12, x13, x14, x21, x22, x23, x24)
for (j in seq_along(samples)) {
  s <- samples[[j]]
  for (k in seq_along(id)) {
    y[k, j + 1] <- mean(s[s[, 1] == id[k], 3])
  }
}
Results (values vary between runs since no seed is set):
ID Condition1M1 Condition1M2 Condition1M3 Condition1M4 Condition2M1 Condition2M2 Condition2M3 Condition2M4
