R - split large dataframe into list in parallel

I have a large transaction dataset (around 5 million rows), and I need to split all transactions by ID (around 1 million unique IDs). The expected result is a list keyed by unique ID, with each element holding that ID's items.
I did try the most simple and direct way to split the transaction dataset (referring to Why is split inefficient on large data frames with many groups?), and I know that converting the data frame into a data.table might be more efficient.
Sample source df
set.seed(123)
n = 500000 #number of sample data (500k as trial)
x <- data.frame(ID = paste(LETTERS[1:8],sample(1:round(n/3), n, replace = TRUE),sep = ""),
Item= sample(c('apple','orange','lemon','tea','rice'), n, replace=TRUE)
)
Convert ID to character and Item to factor
x$ID <- as.character(x$ID)
x$Item <- as.factor(x$Item)
Convert df into dt, then split dt into lists
library(data.table)
x <- as.data.table(x)
system.time(
xx <- split(x$Item, x$ID)
)
Expected results in lists
head(xx, 2)
#$A100
#[1] tea orange
#Levels: apple lemon orange rice tea
#$A101
#[1] rice
#Levels: apple lemon orange rice tea
Problem: After running for 2 hours on my machine (4 cores, 16 GB RAM, Win10, R 3.4.3), it was still running and never completed. I checked my CPU usage while it was running; it only consumed 35-40% of the CPU.
My idea:
I'm wondering whether there is any way to fully utilize the computational power of my machine (run the "split" in parallel), using only detectCores() - 1 = 3 cores.
1st: Split the large transaction dataset by IDs into 3 smaller partitions (smaller datasets).
2nd: Use a foreach loop to split the 3 partitions (smaller datasets) into lists in parallel, then append (row-bind) each list at every iteration until the end. A rough sketch of this idea is shown below.
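A minimal sketch of what I have in mind, assuming the foreach and doParallel packages (the 3-way partitioning below is only illustrative):
library(foreach)
library(doParallel)
cl <- makeCluster(3)                                          # detectCores() - 1
registerDoParallel(cl)
ids <- unique(x$ID)
parts <- split(ids, cut(seq_along(ids), 3, labels = FALSE))   # 3 partitions of IDs
xx <- foreach(p = parts, .combine = c, .packages = "data.table") %dopar% {
    sub <- x[ID %in% p]                                       # smaller partition
    split(sub$Item, sub$ID)                                   # split within the partition
}
stopCluster(cl)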
Question: Is my idea practical? I did read about mclapply and its mc.cores argument, but it seems mc.cores = 1 is the only option on Windows, so it won't help in my case. Is there any better and more efficient way to do the split for a large dataset? Any comment is welcome, thanks!

Surprisingly and interestingly, consider by (the object-oriented wrapper to tapply), which operates similarly to split on data frames, with the added feature of running the splits through a function call. The equivalent to split would be to return the argument itself, i.e. to call identity.
by(x$Item, x$ID, function(x) x)
by(x$Item, x$ID, identity)
Do note, the return of by is a by class object which essentially is a list with additional attributes.
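For instance (an illustrative check of my own, not part of the original post):
res <- by(x$Item, x$ID, identity)
class(res)               # "by"
names(attributes(res))   # "dim", "dimnames", "call", "class" on top of the underlying list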
Using your random data frame example, base::split did not finish after 1 hour, but base::by finished well below 5 minutes on my machine with 64 GB of RAM! Usually I assumed by would have more overhead, being a sibling of the apply family, but my opinion may soon change.
50K ROW EXAMPLE
set.seed(123)
n = 50000 #number of sample data (50k as trial)
x <- data.frame(ID = paste(LETTERS[1:8],sample(1:round(n/3), n, replace = TRUE),sep = ""),
Item= sample(c('apple','orange','lemon','tea','rice'), n, replace=TRUE)
)
system.time( xx <- split(x$Item, x$ID) )
# user system elapsed
# 20.09 0.00 20.09
system.time( xx2 <- by(x$Item, x$ID, identity) )
# user system elapsed
# 1.55 0.00 1.55
all.equal(unlist(xx), unlist(xx2))
# [1] TRUE
identical(unlist(xx), unlist(xx2))
# [1] TRUE
500K ROW EXAMPLE
set.seed(123)
n = 500000 #number of sample data (500k as trial)
x <- data.frame(ID = paste(LETTERS[1:8],sample(1:round(n/3), n, replace = TRUE),sep = ""),
Item= sample(c('apple','orange','lemon','tea','rice'), n, replace=TRUE)
)
system.time( xx <- split(x$Item, x$ID) )
# DID NOT FINISH AFTER 1 HOUR
system.time( xx2 <- by(x$Item, x$ID, identity) )
# user system elapsed
# 23.00 0.06 23.09
The source code reveals that split.default (the method dispatched here, since x$Item is a factor rather than a data frame) may do more work at the R level (unlike C or Fortran), with a for loop across the factor levels:
getAnywhere(split.default)
function (x, f, drop = FALSE, sep = ".", lex.order = FALSE, ...)
{
    if (!missing(...))
        .NotYetUsed(deparse(...), error = FALSE)
    if (is.list(f))
        f <- interaction(f, drop = drop, sep = sep, lex.order = lex.order)
    else if (!is.factor(f))
        f <- as.factor(f)
    else if (drop)
        f <- factor(f)
    storage.mode(f) <- "integer"
    if (is.null(attr(x, "class")))
        return(.Internal(split(x, f)))
    lf <- levels(f)
    y <- vector("list", length(lf))
    names(y) <- lf
    ind <- .Internal(split(seq_along(x), f))
    for (k in lf) y[[k]] <- x[ind[[k]]]
    y
}
Conversely, source code for by.data.frame reveals a call to tapply which itself is a wrapper to lapply:
getAnywhere(by.data.frame)
function (data, INDICES, FUN, ..., simplify = TRUE)
{
    if (!is.list(INDICES)) {
        IND <- vector("list", 1L)
        IND[[1L]] <- INDICES
        names(IND) <- deparse(substitute(INDICES))[1L]
    }
    else IND <- INDICES
    FUNx <- function(x) FUN(data[x, , drop = FALSE], ...)
    nd <- nrow(data)
    structure(eval(substitute(tapply(seq_len(nd), IND, FUNx,
        simplify = simplify)), data), call = match.call(), class = "by")
}

The factors seem to be the key here. I don't have 64 GB RAM 😆, but maybe you can try again with stringsAsFactors = F. My results for a smaller test are below, and it seems split is quite a bit faster when not using factors.
n <- 50000
x <- data.frame(ID = paste(LETTERS[1:8],sample(1:round(n/3), n, replace = TRUE),sep = ""),
Item= sample(c('apple','orange','lemon','tea','rice'), n, replace=TRUE),
stringsAsFactors = T
)
x2 <- data.frame(ID = paste(LETTERS[1:8],sample(1:round(n/3), n, replace = TRUE),sep = ""),
Item= sample(c('apple','orange','lemon','tea','rice'), n, replace=TRUE),
stringsAsFactors = F)
splitFactor <- function() split(x$Item, x$ID)
byFactor <- function() by(x$Item, x$ID, identity)
splitNotFactor <- function() split(x2$Item, x2$ID)
byNotFactor <- function() by(x2$Item, x2$ID, identity)
a <- microbenchmark::microbenchmark(splitFactor(),
byFactor(),
splitNotFactor(),
byNotFactor(),
times = 3
)
Unit: milliseconds
expr min lq mean median uq max neval cld
splitFactor() 51743.1633 51936.7261 52025.1205 52130.2889 52166.0990 52201.9091 3 d
byFactor() 1963.0673 1987.7360 2030.5779 2012.4048 2064.3332 2116.2616 3 b
splitNotFactor() 399.7618 401.6796 412.4632 403.5973 418.8139 434.0306 3 a
byNotFactor() 2410.3804 2518.3651 2578.3501 2626.3499 2662.3349 2698.3199 3 c
splitNotFactor() should also result in an object with much smaller memory footprint than the other functions.
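For example (an illustrative check, not part of the benchmark above), the footprints can be compared directly:
format(object.size(splitNotFactor()), units = "MB")
format(object.size(byNotFactor()), units = "MB")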

Related

Quickly apply & operation to pairs of columns in R

Let’s say I have two large data.tables and need to combine their columns pairwise using the & operation. The combinations are dictated by grid (combine dt1 column1 with dt2 column2, etc.)
Right now I'm using a mclapply loop and the script takes hours when I run the full dataset. I tried converting the data to a matrix and using a vectorized approach but that took even longer. Is there a faster and/or more elegant way to do this?
mx1 <- replicate(10, sample(c(T,F), size = 1e6, replace = T)) # 1e6 rows x 10 columns
mx1 <- as.data.table(mx1)
colnames(mx1) <- LETTERS[1:10]
mx2 <- replicate(10, sample(c(T,F), size = 1e6, replace = T)) # 1e6 rows x 10 columns
mx2 <- as.data.table(mx2)
colnames(mx2) <- letters[1:10]
grid <- expand.grid(col1 = colnames(mx1), col2 = colnames(mx2)) # the combinations I want to evaluate
out <- new_layer <- mapply(grid$col1, grid$col2, FUN = function(col1, col2) { # <--- mclapply loop
mx1[[col1]] & mx2[[col2]]
}, SIMPLIFY = F)
setDT(out) # convert output into data table
colnames(out) <- paste(grid$col1, grid$col2, sep = "_")
For context, this data is from a gene expression matrix where 1 row = 1 cell
This can be done directly with no mapply: just ensure that the with argument is FALSE, i.e.:
mx1[, grid$col1, with = FALSE] & mx2[, grid$col2, with=FALSE]
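If you also want the combined column names from the question, one follow-up (my addition, not part of the original answer) would be:
out <- mx1[, grid$col1, with = FALSE] & mx2[, grid$col2, with = FALSE]
colnames(out) <- paste(grid$col1, grid$col2, sep = "_")   # same naming scheme as in the question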
After some digging around I found a package called bit that is specifically designed for fast boolean operations. Converting each column of my data.table from logical to bit gave me a 100-fold increase in compute speed.
# Load libraries.
library(data.table)
library(bit)
# Create data set.
mx1 <- replicate(10, sample(c(T,F), size = 5e6, replace = T)) # 5e6 rows x 10 columns
colnames(mx1) <- LETTERS[1:10]
mx2 <- replicate(10, sample(c(T,F), size = 5e6, replace = T)) # 5e6 rows x 10 columns
colnames(mx2) <- letters[1:10]
grid <- expand.grid(col1 = colnames(mx1), col2 = colnames(mx2)) # combinations I want to evaluate
# Single operation with logical matrix.
system.time({
out <- mx1[, grid$col1] & mx2[, grid$col2]
}) # 26.014s
# Loop with logical matrix.
system.time({
out <- mapply(grid$col1, grid$col2, FUN = function(col1, col2) {
mx1[, col1] & mx2[, col2]
})
}) # 31.914s
# Single operation with logical data.table.
mx1.dt <- as.data.table(mx1)
mx2.dt <- as.data.table(mx2)
system.time({
out <- mx1.dt[, grid$col1, with = F] & mx2.dt[, grid$col2, with = F]
}) # 32.349s
# Loop with logical data.table.
system.time({
out <- mapply(grid$col1, grid$col2, FUN = function(col1, col2) {
mx1.dt[[col1]] & mx2.dt[[col2]]
})
}) # 15.031s <---- SECOND FASTEST TIME, ~2X IMPROVEMENT
# Loop with bit data.table.
mx1.bit <- mx1.dt[, lapply(.SD, as.bit)]
mx2.bit <- mx2.dt[, lapply(.SD, as.bit)]
system.time({
out <- mapply(grid$col1, grid$col2, FUN = function(col1, col2) {
mx1.bit[[col1]] & mx2.bit[[col2]]
})
}) # 0.383s <---- FASTEST TIME, ~100X IMPROVEMENT
# Convert back to logical table.
out <- setDT(out)
colnames(out) <- paste(grid$col1, grid$col2, sep = "_")
out <- out[, lapply(.SD, as.logical)]
There are also special functions like sum.bit and ri that you can use to aggregate data without converting it back to logical.
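For instance (a small illustrative check of my own; see the bit documentation for the full set of methods), counting TRUE values without converting back to logical:
b <- mx1.bit[["A"]] & mx2.bit[["a"]]   # still a bit vector
sum(b)                                 # counts the TRUE entries directly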

How to parallelise code to sort and sum a list in R?

I have some code:
test<-therapyDF %>% group_by(therapyDF$prodcode) %>% summarize(count=n_distinct(therapyDF$patid))
that is designed to group all prodcode entries (drug) and then count how many patients (patid) have an instance of each drug. For example, the raw data is held in a dataframe similar to:
patid prodcode
1 A
1 B
2 C
3 A
3 A
3 B
Thus, output will be:
A 2
B 2
C 1
Is there any way of parallelising this code? The real data is in excess of 100 million records and it's been more than 8 hours of running in serial.
I'm struggling to adopt the *apply methodologies of R and the numerous R parallel packages. Splitting the original data frame would be a bit tricky, as the data is not very well organised (just how I got it) and it would require grouping and extracting by the $patid. I am running this on an 8-core Intel Linux box.
Thanks
It is possible to parallelize a group_by-summarize with
summarize_par <- function(grouped_df, ...) {
    sizes <- attr(grouped_df, "group_sizes")
    ord <- order(sizes, decreasing = TRUE)
    one_group <- function(gdf, i, size) {
        size_i <- sizes[i]
        structure(
            gdf[attr(gdf, "indices")[[i]] + 1, ],
            indices = list(0:(size_i - 1)),
            group_sizes = size_i,
            biggest_group_sizes = size_i,
            labels = attr(gdf, "labels")[i, , drop = FALSE]
        )
    }
    dots <- dplyr:::named_quos(...)
    res <- foreach(ic = ord) %dopar% {
        dplyr::summarise(one_group(grouped_df, ic), !!!dots)
    }
    do.call(rbind, res[match(seq_along(ord), ord)])
}
Test:
N <- 2e7
therapyDF <- data.frame(patid = sample.int(N/2, size = N, replace = TRUE),
prodcode = sample(LETTERS, size = N, replace = TRUE))
library(dplyr)
system.time(true <- therapyDF %>%
group_by(prodcode) %>%
summarize(count=n_distinct(patid)))
library(foreach)
library(doParallel)
registerDoParallel(cl <- makeForkCluster(detectCores() / 2))
system.time(test <- therapyDF %>%
group_by(prodcode) %>%
summarize_par(count=n_distinct(patid)))
all.equal(true$count, test$count)
stopCluster(cl)
I get:
sequential: 8.228
parallel with 2 cores: 6.525
Here, my computer has neither a lot of cores nor a lot of memory.
You can expect better results with a better computer.
There is a performance issue with n_distinct at the moment (see https://github.com/tidyverse/dplyr/issues/977). You should use length(unique(patid)) instead of n_distinct(patid) to speed things up.
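For example, the drop-in change in the pipeline above would be:
test <- therapyDF %>%
    group_by(prodcode) %>%
    summarize(count = length(unique(patid)))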
You should give data.table a try:
N <- 1e8
therapyDF <- data.frame(patid = sample.int(N/2, size = N, replace = TRUE),
prodcode = sample(LETTERS, size = N, replace = TRUE))
library(dplyr)
system.time(therapyDF %>%
group_by(prodcode) %>%
summarize(count=n_distinct(patid)))
#> user system elapsed
#> 36.939 1.196 38.136
library(data.table)
setDT(therapyDF)
system.time(therapyDF[, .(count = uniqueN(patid)), by = prodcode])
#> user system elapsed
#> 5.727 0.116 5.843
It uses OpenMP for parallel processing.
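If you want to check or cap the number of OpenMP threads data.table uses (an optional tweak, not part of the original answer):
library(data.table)
getDTthreads()    # threads currently available to data.table
setDTthreads(4)   # e.g. cap it at 4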

Speed up for loop in R, calculating pairwise dissimilarities

I'm trying to compute all the pairwise dissimilarities between observations in a data set consisting of only nominal variables using some self-defined dissimilarity metric.
Data looks like
set.seed(3424)
(mydata <- data.table(paste(sample(letters[1:5], 5, replace=T),
sample(LETTERS[1:5], 5, replace=T),
sep = ","),
paste(sample(LETTERS[1:5], 5, replace=T),
sample(LETTERS[1:5], 5, replace=T),
sep = ","),
paste(sample(letters[1:5], 5, replace=T),
sample(letters[1:5], 5, replace=T),
sep = ",")))
V1 V2 V3
1: a,A E,E b,b
2: e,D C,A d,d
3: d,B B,C d,d
4: c,B A,E b,d
5: a,B C,D d,a
library(data.table)
library(dplyr)
library(stringr)
metric <- function(pair){
    intersection <- 0
    union <- 0
    for(i in 1:ncol(mydata)){
        A <- pair[[1]][[i]]
        B <- pair[[2]][[i]]
        if(sum(is.na(A), is.na(B)) == 1)
            union = union + 1
        if(sum(is.na(A), is.na(B)) == 0){
            intersection <- intersection + length(intersect(A,B))/length(union(A,B))
            union = union + 1
        }
    }
    1 - intersection/union
}
diss <- matrix(nrow = nrow(mydata), ncol = nrow(mydata))
for(i in 1:(nrow(mydata)-1)){
    print(i) ## to check progress ##
    for(j in (i+1):nrow(mydata)){
        pair <- rbind(mydata[i], mydata[j])
        diss[j, i] <- apply(pair, 1, function(x) strsplit(x, split=",")) %>% metric()
    }
}
These loops work, but they are really slow when mydata has 1000+ rows and 100+ columns.
The metric I used here is the Jaccard index, but a nested version, since each element in the data is not a single value. So instead of treating each pair of levels as either a match (0) or different (1), I use Jaccard when comparing the levels as well.
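For example, working through the metric by hand for rows 1 and 4 of the toy data above:
# V1: {a,A} vs {c,B} -> |intersect| = 0, |union| = 4 -> 0
# V2: {E}   vs {A,E} -> |intersect| = 1, |union| = 2 -> 0.5
# V3: {b}   vs {b,d} -> |intersect| = 1, |union| = 2 -> 0.5
# dissimilarity = 1 - (0 + 0.5 + 0.5)/3 = 2/3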
Update:
Some context about my data, not the toy data I made up.
Each row represents a query, i.e. "SELECT ... FROM ... WHERE ...
...".
Each column contains part of the information in the query, i.e. 1st column contains everything between "SELECT" and "FROM", 2nd column contains what's between "FROM" and "WHERE", etc.
There are 100 columns and 400 rows; I don't know why there are so many columns though.
Number of elements in one cell could be really arbitrary, some cells contain very long lists of values, while many are actually NAs. E.g.
SELECT
1: NA
2:p1.PLAYERID,f1.PLAYERNAME,p2.PLAYERID,f2.PLAYERNAME
3: PLAYER f1,PLAYER f2,PLAYS p1
4: NA
5: NA
6: c1.table_name t1,c2.table_name t2
7: NA
...
400: asd,vrht,yuetr,wxeq,yiknuy,sce,ercher
You can gain some speed pretty easily by doing less work. If you are only interested in pairwise comparisons, you only need to do N choose 2 comparisons instead of N^2. You can implement that with f2() below.
set.seed(3424)
(mydata <- data.table(sample(letters[1:5], 50, replace = T),
sample(LETTERS[1:5], 50, replace = T),
sample(1:3, 50, replace = T)))
mydf<-data.frame(mydata)
f1 <- function(){
    diss <- matrix(nrow = nrow(mydata), ncol = nrow(mydata))
    for(i in 1:(nrow(mydata)-1)){
        print(i) ## to check progress ##
        for(j in (i+1):nrow(mydata)){
            pair <- rbind(mydata[i], mydata[j])
            diss[j, i] <- apply(pair, 1, function(x) strsplit(x, split=",")) %>% metric()
        }
    }
    return(diss)
}
f2 <- function(){
    met <- NULL
    A <- NULL
    B <- NULL
    choices <- choose(nrow(mydf), 2)
    combs <- combn(nrow(mydf), 2)
    for(i in 1:choices) {
        print(i)
        pair <- rbind(mydf[combs[1,i],], mydf[combs[2,i],])
        met[i] <- apply(pair, 1, function(x) strsplit(x, split=",")) %>% metric()
        A[i] <- mydf[combs[1,i],1]
        B[i] <- mydf[combs[2,i],2]
    }
    results <- data.frame(A, B, met)
    return(results)
}
library(microbenchmark)
microbenchmark(f1(), f2(), times = 10)
Unit: milliseconds
expr min lq mean median uq max neval
f1() 1381 1391.2 1416.8 1417.6 1434.9 1456 10
f2() 907 923.6 942.3 946.9 948.9 1008 10
It is a little faster, but not mind-blowingly so. My guess is that some more work needs to be done on the metric function you define. I tried to look at it and determine a way to vectorize it, but I could not find a way. If that can be done this problem would be trivial. For example, I have a similar program that measures pairwise cosine similarity between ~400 vectors of length ~5000. It has to make 400 choose 2 = 79800 comparisons and the entire program takes about 6 seconds to run.
It's similar to the original, but I made a few changes. It runs more quickly, though I didn't time it rigorously; 1000 rows with this code seems to take about as long as 100 rows with the original.
The main changes:
- remove rbind by passing the variables into mapply
- calculate the union denominator once instead of incrementing it every time (union <- union + 1)
- split the strings all at once outside of the loops
- check the length of the intersection before calculating the union and adding to the intersection (lenint > 0)
Hopefully something here helps your case.
rownum <- 1000
(mydata <- data.table(paste(sample(letters[1:5], rownum, replace=T),
sample(LETTERS[1:5], rownum, replace=T),
sep = ","),
paste(sample(LETTERS[1:5], rownum, replace=T),
sample(LETTERS[1:5], rownum, replace=T),
sep = ","),
paste(sample(letters[1:5], rownum, replace=T),
sample(letters[1:5], rownum, replace=T),
sep = ",")))
allsplit <- lapply(mydata,strsplit,split = ',')
allsplitdf <- cbind(allsplit[['V1']],allsplit[['V2']],allsplit[['V3']])
allsplitlist <- split(allsplitdf,1:nrow(allsplitdf))
metric2 <- function(p1, p2){
    intersection <- 0
    for(i in seq_along(p1)){
        A <- p1[[i]]
        B <- p2[[i]]
        if(!any(is.na(A), is.na(B))){
            lenint <- length(intersect(A, B))
            if(lenint > 0){
                intersection <- intersection + lenint/length(union(A, B))
            }
        }
    }
    1 - intersection/length(p1)
}
diss <- matrix(nrow = nrow(mydata), ncol = nrow(mydata))
for(i in 1:(nrow(mydata)-1)){
    print(i) ## to check progress ##
    for(j in (i+1):nrow(mydata)){
        diss[j, i] <- mapply(metric2, p1 = allsplitlist[i], p2 = allsplitlist[j])
    }
}
When constructing an algorithm it is important to keep in mind the speed/space trade-off. What I mean by the speed/space trade-off is that by storing your data within a different schema you can usually eliminate for loops. However, data stored within this new schema will generally occupy more space.
The reason your example is slow is that, among other things, you are looping over all the rows and all the columns of your data. With a 1000x100 data.frame that is 1e5 computations. One way to eliminate the loop over your rows is to store your data a bit differently. For example, I use the expand.grid command to combine all pairwise comparisons within the same data.frame, dTMP. I then strip the comma and let each member of the pair occupy its own column (i.e. "a,A", which originally sits in one variable, becomes "a" and "A" and represents entries in two separate variables). In general, reshaping data into different formats is quick, or at least quicker than looping over each row. This reshaping, however, clearly generates a data set which takes up more RAM. In your case the data.frame will be 1e6x4, which is large, but not so large as to clog up all your RAM.
The reward for doing all that hard work is that it is now trivial and extremely fast to obtain the intersect and union variables. You will of course still need to loop over each column; however, we've eliminated one loop simply by rearranging your data. It would be possible to remove the loop over the columns as well by utilizing 3D arrays, but such an array would not fit into memory.
f3 <- function(){
    intersection <- 0
    for(v in names(mydata)){
        dTMP <- expand.grid(mydata[[v]], mydata[[v]], stringsAsFactors = FALSE)[, c(2,1)]
        #There is likely a more elegant way to do this.
        dTMP <-
            dTMP$Var2 %>%
            str_split(., ",") %>%
            unlist(.) %>%
            matrix(., ncol = 2, nrow = nrow(dTMP), byrow = TRUE) %>%
            cbind(., dTMP$Var1 %>%
                str_split(., ",") %>%
                unlist(.) %>%
                matrix(., ncol = 2, nrow = nrow(dTMP), byrow = TRUE)) %>%
            as.data.frame(., stringsAsFactors = FALSE)
        names(dTMP) <- c("v1", "v2", "v3", "v4")
        intersect <- rowSums(dTMP[, c("v1", "v2")] == dTMP[, c("v3", "v4")])
        intersect <- ifelse(rowSums(dTMP[, c("v1", "v2")] == dTMP[, c("v4", "v3")]) != 0, rowSums(dTMP[, c("v1", "v2")] == dTMP[, c("v4", "v3")]), intersect)
        intersect <- ifelse(dTMP[, "v1"] == dTMP[, "v2"], 1, intersect)
        MYunion <- sapply(as.data.frame(t(dTMP)), function(x) n_distinct(x))
        intersection <- intersection + intersect/MYunion
    }
    union <- ncol(mydata)
    return(matrix(1 - intersection/union, nrow = nrow(mydata), ncol = nrow(mydata), byrow = TRUE)) #This is the diss matrix, I think. Double check that I got the rows and columns correct
}
Update
I'm still having trouble replicating your results; however, I believe the newly updated code is very close. There is only one cell, (2,1), of the dissimilarity matrix in which our results differ when set.seed(3424). The problem with the current iteration, however, is that I need to use sapply to obtain MYunion. If you can think of a faster way to do this, you'll get big speed gains. Read this SO post for suggestions: Efficient Means of Identifying Number of Distinct Elements in a Row
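One hedged possibility (my own sketch, assuming the four columns v1..v4 contain no NAs) is to count, per row, how many values duplicate an earlier column; that replaces the row-wise sapply with a few vectorized comparisons:
dup2 <- dTMP$v2 == dTMP$v1
dup3 <- dTMP$v3 == dTMP$v1 | dTMP$v3 == dTMP$v2
dup4 <- dTMP$v4 == dTMP$v1 | dTMP$v4 == dTMP$v2 | dTMP$v4 == dTMP$v3
MYunion <- 4L - (dup2 + dup3 + dup4)   # distinct values per row = 4 minus duplicates of earlier columns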

R: Row resampling loop speed improvement

I'm subsampling rows from a dataframe with c("x","y","density") columns at a variety of c("s_size","reps"): reps = number of replicates, s_size = number of rows subsampled from the whole dataframe.
> head(data_xyz)
x y density
1 6 1 0
2 7 1 17600
3 8 1 11200
4 12 1 14400
5 13 1 0
6 14 1 8000
#Subsampling###################
subsample_loop <- function(s_size, reps, int) {
    tm1 <- system.time( #start timer
    {
        subsample_bound = data.frame()
        #Perform Subsampling of the general
        for (s_size in seq(1, s_size, int)){
            for (reps in 1:reps) {
                subsample <- sample.df.rows(s_size, data_xyz)
                assign(paste("sample", "_", "n", s_size, "_", "r", reps, sep=""), subsample)
                subsample_replicate <- subsample[,] #temporary variable
                subsample_replicate <- cbind(subsample, rep(s_size, (length(subsample_replicate[,1]))),
                    rep(reps, (length(subsample_replicate[,1]))))
                subsample_bound <- rbind(subsample_bound, subsample_replicate)
            }
        }
    }) #end timer
    colnames(subsample_bound) <- c("x","y","density","s_size","reps")
    subsample_bound
} #end function
Here's the function call:
source("R/functions.R")
subsample_data <- subsample_loop(s_size=206, reps=5, int=10)
Here's the row subsample function:
# Samples a number of rows in a dataframe, outputs a dataframe of the same # of columns
# df Data Frame
# N number of samples to be taken
sample.df.rows <- function (N, df, ...)
{
df[sample(nrow(df), N, replace=FALSE,...), ]
}
It's way too slow; I've tried a few times with apply functions and had no luck. I'll be doing somewhere around 1,000-10,000 replicates for each s_size from 1:250.
Let me know what you think! Thanks in advance.
=========================================================================
UPDATE EDIT: Sample data from which to sample:
https://www.dropbox.com/s/47mpo36xh7lck0t/density.csv
Joran's code in a function (in a sourced function.R file):
foo <- function(i, j, data){
    res <- data[sample(nrow(data), i, replace = FALSE),]
    res$s_size <- i
    res$reps <- rep(j, i)
    res
}
resampling_custom <- function(dat, s_size, int, reps) {
    ss <- rep(seq(1, s_size, by = int), each = reps)
    id <- rep(seq_len(reps), times = s_size/int)
    out <- do.call(rbind, mapply(foo, i = ss, j = id, MoreArgs = list(data = dat), SIMPLIFY = FALSE))
}
Calling the function
set.seed(2)
out <- resampling_custom(dat=retinal_xyz, s_size=206, int=5, reps=10)
outputs data, unfortunately with this warning message:
Warning message:
In mapply(foo, i = ss, j = id, MoreArgs = list(data = dat), SIMPLIFY = FALSE) :
longer argument not a multiple of length of shorter
I put very little thought into actually optimizing this, I was just concentrating on doing something that's at least reasonable while matching your procedure.
Your big problem is that you are growing objects via rbind and cbind. Basically, anytime you see someone write data.frame() or c() and then expand that object using rbind, cbind or c, you can be very sure that the resulting code will be essentially the slowest possible way of doing whatever task is being attempted.
This version is around 12-13 times faster, and I'm sure you could squeeze some more out of this if you put some real thought into it:
s_size <- 200
int <- 10
reps <- 30
ss <- rep(seq(1,s_size,by = int),each = reps)
id <- rep(seq_len(reps),times = s_size/int)
foo <- function(i,j,data){
res <- data[sample(nrow(data),i,replace = FALSE),]
res$s_size <- i
res$reps <- rep(j,i)
res
}
out <- do.call(rbind,mapply(foo,i = ss,j = id,MoreArgs = list(data = dat),SIMPLIFY = FALSE))
The best part about R is that not only is this way, way faster, it's also way less code.
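As a side note on the warning reported in the update above (my own diagnosis, not from the original answer): with s_size = 206 and int = 5, s_size/int is not an integer, so id ends up shorter than ss and mapply recycles it unevenly. Keeping the two vectors the same length inside resampling_custom avoids the warning:
sizes <- seq(1, s_size, by = int)
ss <- rep(sizes, each = reps)
id <- rep(seq_len(reps), times = length(sizes))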

How do I sub sample data by group efficiently?

I have a problem similar to the one explained in this question. Like that question, I have a data frame with 3 columns (id, group, value). I want to take n samples with replacement from each group and produce a smaller data frame with n samples from each group.
However, I am doing hundreds of subsamples in a simulation code, and the solution based on ddply is too slow to be used in my code. I tried to rewrite a simpler version to see if I could get better performance, but it is still slow (no better than the ddply solution, if not worse). Below is my code. I am wondering if it can be improved for performance.
#Producing example DataFrame
dfsize <- 10
groupsize <- 7
test.frame.1 <- data.frame(id = 1:dfsize, group = rep(1:groupsize,each = ceiling(dfsize/groupsize))[1:dfsize], junkdata = sample(1:10000, size =dfsize))
#Main function for subsampling
sample.from.group <- function(df, dfgroup, size, replace){
    outputsize <- 1
    newdf <- df # assuming a sample cannot be larger than the original
    uniquegroups <- unique(dfgroup)
    for (uniquegroup in uniquegroups){
        dataforgroup <- which(dfgroup == uniquegroup)
        mysubsample <- df[sample(dataforgroup, size, replace),]
        sizeofsample <- nrow(mysubsample)
        newdf[outputsize:(outputsize + sizeofsample - 1), ] <- mysubsample
        outputsize <- outputsize + sizeofsample
    }
    return(newdf[1:(outputsize-1),])
}
#Using the function
sample.from.group(test.frame.1, test.frame.1$group, 100, replace = TRUE)
Here are two plyr-based solutions:
library(plyr)
dfsize <- 1e4
groupsize <- 7
testdf <- data.frame(
id = seq_len(dfsize),
group = rep(1:groupsize, length = dfsize),
junkdata = sample(1:10000, size = dfsize))
sample_by_group_1 <- function(df, dfgroup, size, replace) {
    ddply(df, dfgroup, function(x) {
        x[sample(nrow(df), size = size, replace = replace), , drop = FALSE]
    })
}
sample_by_group_2 <- function(df, dfgroup, size, replace) {
    idx <- split_indices(df[[dfgroup]])
    subs <- lapply(idx, sample, size = size, replace = replace)
    df[unlist(subs, use.names = FALSE), , drop = FALSE]
}
library(microbenchmark)
microbenchmark(
ddply = sample_by_group_1(testdf, "group", 100, replace = TRUE),
plyr = sample_by_group_2(testdf, "group", 100, replace = TRUE)
)
# Unit: microseconds
# expr min lq median uq max neval
# ddply 4488 4723 5059 5360 36606 100
# plyr 443 487 507 536 31343 100
The second approach is much faster because it does the subsetting in a single step - if you can figure out how to do it in one step, it's usually an easy way to get better performance.
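A base-R sketch of the same one-step idea (my own paraphrase of sample_by_group_2, without plyr):
idx <- unlist(lapply(split(seq_len(nrow(testdf)), testdf$group),
    sample, size = 100, replace = TRUE), use.names = FALSE)
testdf[idx, , drop = FALSE]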
I think this is cleaner and possibly faster:
z <- sapply(unique(test.frame.1$group), FUN= function(x){
sample(which(test.frame.1$group==x), 100, TRUE)
})
out <- test.frame.1[z,]
out
