Compare two ffdf objects in R

I have two very large data sets (50M rows, 130 columns) that I can't compare with base R tools, so I have to use an ffdf. It's the first time I am working with the ff package. I am trying to compare two ffdf objects and then write the differences to two output files ("in_file1_not_in_file2", "in_file2_not_in_file1"). Here is an example:
# For easy reproduction; normally a CSV file
set.seed(1234)
data1 <- data.frame(row.names = 1:10,
                    var1 = sample(c(TRUE, FALSE), 10, replace = TRUE),
                    var2 = sample(1:8, 10, replace = TRUE),
                    var3 = as.factor(sample(c('AAA', 'BBB', 'CCC'), 10, replace = TRUE)))
data2 <- data.frame(row.names = 1:10,
                    var1 = sample(c(TRUE, FALSE), 10, replace = TRUE),
                    var2 = sample(1:10, 10, replace = TRUE),
                    var3 = as.factor(sample(c('AAA', 'BBB', 'CCC'), 10, replace = TRUE)))
# Convert to an ffdf
ffdata1 <- as.ffdf(data1)
ffdata2 <- as.ffdf(data2)
So now I am stuck. Normally I would combine all columns of each row into one key column and compare those keys with each other. Something like this:
# Normally - Combined columns
data1$CCID <- apply(data1, 1, paste, collapse='.')
data2$CCID <- apply(data2, 1, paste, collapse='.')
# Combine columns of ffdf?
ffdata1$CCID <- ??
ffdata2$CCID <- ??
# Normally - Comparison
cdata3 <- sapply(data2$CCID, FUN = function(x) { x == data1$CCID })
output1 <- data2[colSums(cdata3) == 0, ]  # in file2, not in file1
output2 <- data1[rowSums(cdata3) == 0, ]  # in file1, not in file2
# Comparison of ffdf?
ffcdata3 <- ??
ffoutput1 <- ??
ffoutput2 <- ??
I hope this is understandable; sorry, I just have no idea how to work with ffdf objects.
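One possible direction (not from the original thread) is a sketch built on the ffbase package, which extends ff with ffdfmatch(), ffwhich() and write.csv.ffdf(); ffdfmatch() matches whole rows, so no combined CCID column is needed. This is untested at 50M-row scale:
library(ff)
library(ffbase)
# Position of each row of one ffdf in the other (NA when the row is absent)
m1 <- ffdfmatch(ffdata1, ffdata2)
m2 <- ffdfmatch(ffdata2, ffdata1)
# Rows of one ffdf that do not appear in the other
ffoutput1 <- ffdata2[ffwhich(m2, is.na(m2)), ]  # in file2, not in file1
ffoutput2 <- ffdata1[ffwhich(m1, is.na(m1)), ]  # in file1, not in file2
# Write the differences chunk-wise, without loading everything into RAM
write.csv.ffdf(ffoutput2, file = "in_file1_not_in_file2.csv")
write.csv.ffdf(ffoutput1, file = "in_file2_not_in_file1.csv")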

How to efficiently split each row into test and train subsets using R?

I have a data.table that provides the length and composition of given vectors. For example:
library(data.table)
set.seed(1)
dt = data.table(length = c(100, 150),
                n_A = c(30, 30),
                n_B = c(20, 100),
                n_C = c(50, 20))
I need to randomly split each vector into two subsets with 80% and 20% of observations respectively. I can currently do this using a for loop. For example:
dt_80_list <- list() # create output lists
dt_20_list <- list()
for (i in 1:nrow(dt)){ # for each row in the data.table
  # create a randomised vector with the given number of each component
  sample_vec <- sample(c(rep("A", dt$n_A[i]),
                         rep("B", dt$n_B[i]),
                         rep("C", dt$n_C[i])))
  # subset 80% of the vector
  sample_vec_80 <- sample_vec[1:floor(length(sample_vec) * 0.8)]
  # count the number of each component in the subset and output to list
  dt_80_list[[i]] <- data.table(length = length(sample_vec_80),
                                n_A = sum(sample_vec_80 == "A"),
                                n_B = sum(sample_vec_80 == "B"),
                                n_C = sum(sample_vec_80 == "C"))
  # subtract the 80% counts from the totals to identify the counts in the 20%
  dt_20_list[[i]] <- data.table(length = dt$length[i] - dt_80_list[[i]]$length,
                                n_A = dt$n_A[i] - dt_80_list[[i]]$n_A,
                                n_B = dt$n_B[i] - dt_80_list[[i]]$n_B,
                                n_C = dt$n_C[i] - dt_80_list[[i]]$n_C)
}
dt_80 <- do.call("rbind", dt_80_list) # collapse lists to output data.tables
dt_20 <- do.call("rbind", dt_20_list)
However, the dataset I need to apply this to is very large, and this is too slow. Does anyone have any suggestions for how I could improve performance?
Thanks.
(I assumed your dataset consists of many more rows, but only a few columns.)
Here's a version I came up with, with three main changes:
- use .N and by= to count the number of "A", "B", "C" drawn in each row
- use the size argument of sample
- join the original dt and dt_80 to calculate dt_20 without a for-loop
## draw training data
dt_80 <- dcast(
  dt[, row := 1:nrow(dt)
     ][, .(draw = sample(c(rep("A80", n_A),
                           rep("B80", n_B),
                           rep("C80", n_C)),
                         size = .8 * length)),
       by = row
     ][, .N, by = .(row, draw)],
  row ~ draw, value.var = "N")[, length80 := A80 + B80 + C80]

## draw test data
dt_20 <- dt[dt_80,
            .(A20 = n_A - A80,
              B20 = n_B - B80,
              C20 = n_C - C80),
            on = "row"][, length20 := A20 + B20 + C20]
There is probably still room for optimization, but I hope it already helps :)
EDIT
Here I add my initial idea. I did not post it at first because the code above is much faster, but this one might be more memory-efficient, which seems crucial in your case. So even if you already have a working solution, it might be of interest...
library(data.table)
library(Rfast)
## add row numbers
dt[, row := 1:nrow(dt)]
## sampling function
sampfunc <- function(n_A, n_B, n_C){
  draw <- sample(c(rep("A80", n_A),
                   rep("B80", n_B),
                   rep("C80", n_C)),
                 size = .8 * (n_A + n_B + n_C))
  out <- Rfast::Table(draw)
  return(as.list(out))
}
## draw training data
dt_80 <- dt[, sampfunc(n_A, n_B, n_C), by = row]
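For completeness, a sketch of deriving the 20% test counts from this dt_80, mirroring the join used in the faster version above (this assumes each of A, B, C is drawn at least once per row, so that all three columns exist in dt_80):
## derive the test counts by the same join as before
dt_20 <- dt[dt_80,
            .(A20 = n_A - A80,
              B20 = n_B - B80,
              C20 = n_C - C80),
            on = "row"][, length20 := A20 + B20 + C20]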

R loop for filtering through each column

I have a gene expression data frame, with columns as different samples and rows as different genes.
Now I want to know the number of genes left after I filter each column with a cutoff.
For example,
sample1_more_than_5 <- df[(df[,1]>5),]
sample1_more_than_10 <- df[(df[,1]>10),]
sample1_more_than_20 <- df[(df[,1]>20),]
sample1_more_than_30 <- df[(df[,1]>30),]
Then,
sample2_more_than_5 <- df[(df[,2]>5),]
sample2_more_than_10 <- df[(df[,2]>10),]
sample2_more_than_20 <- df[(df[,2]>20),]
sample2_more_than_30 <- df[(df[,2]>30),]
But I don't want to repeat this 100 times, as I have 100 samples.
Can anyone write a loop for this situation? Thank you.
Here is a solution using two loops that calculates, for each sample (column), the number of genes (rows) with a value greater than each entry of the nums vector.
# Create the vector with the numbers used to filter each column
nums <- c(5, 10, 20, 30)
# Loop over each column
resul <- apply(df, 2, function(x){
  # Get the number of rows with a value greater than each nums entry
  sapply(nums, function(y){
    length(x[x > y])
  })
})
# Convert to a data.frame and add the nums vector as the first column
resul <- data.frame(greaterthan = nums,
                    as.data.frame(resul))
We can loop over the columns and create the grouping with cut:
lst1 <- lapply(df, function(x) split(x, cut(x, breaks = c(5, 10, 20, 30))))
or with findInterval and then split:
lst1 <- lapply(df, function(x) split(x, findInterval(x, c(5, 10, 20, 30))))
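If only the counts per group are needed, a possible follow-up is to tabulate the split lengths (note these are disjoint intervals, not the cumulative "more than" counts of the OP's objects):
## number of genes falling in each interval, per sample
sapply(lst1, lengths)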
If we go by the way the objects are created in the OP's post, there would be 100 * 4 = 400 objects (100 columns, 4 cutoffs) in the global environment. Instead, it can be a single list object.
The objects can still be created with assign, but it is not recommended:
v1 <- c(5, 10, 20, 30)
v2 <- seq_along(df)
for(i in v2) {
  for(j in v1) {
    assign(sprintf('sample%d_more_than_%d', i, j),
           value = df[df[, i] > j, , drop = FALSE])
  }
}
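A sketch of the recommended single-list alternative instead (object and element names here are chosen for illustration):
lst2 <- lapply(v2, function(i) {
  # one named sub-list of filtered data frames per sample
  setNames(lapply(v1, function(j) df[df[, i] > j, , drop = FALSE]),
           sprintf("more_than_%d", v1))
})
names(lst2) <- sprintf("sample%d", v2)
## e.g. lst2$sample1$more_than_5 replaces the sample1_more_than_5 object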

Generate m matrices of different lengths using the foreach command

Take this simple worked example with dummy data:
ab <- c(1:500)
cd <- sample(1:100, 500, replace = T)
ef <- sample(1:10, 500, replace = T)
df1 <- data.frame(ab, cd, ef)
m <- 4
Now I want to use the foreach command to generate m matrices. Each matrix will vary in length, using:
#size1 <- sample(50:60, 1)
#indices <- sample(1:500, size1)
#df2 <- df1[indices,]
I am not sure how to generate the different matrices with the foreach command:
Result = foreach(i = 1:m, .combine = matrix(df2)) %do% {
  size1 <- sample(50:60, 1)
  indices <- sample(1:500, size1)
  df2 <- df1[indices, ]
}
The default of foreach is to return a list. The following code saves a list of matrices of different dimensions.
library(foreach)
Result <- foreach(i = 1:5) %do% {
  # randomly select the number of rows and columns
  random.rows <- sample(1:5, 1)
  random.columns <- sample(1:5, 1)
  # generate a matrix with these dimensions
  matrix(sample(1:100, random.rows * random.columns), random.rows)
}
The Result object is a list of length 5 containing matrices whose sizes vary between 1x1 and 5x5.
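Applied to the question's df1, a minimal sketch might look like this (converting each sampled data frame to a matrix, since foreach simply collects the value of the last expression in each iteration):
library(foreach)
Result <- foreach(i = 1:m) %do% {
  size1 <- sample(50:60, 1)
  indices <- sample(1:500, size1)
  as.matrix(df1[indices, ])  # becomes list element i
}
# Result is a list of m matrices with between 50 and 60 rows each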

R: Sample a vector with replacement multiple times

I'd like to sample a vector x of length 7 with replacement, and do that sampling 10 separate times. I've tried something like the following, but can't get the 7x10 output I'm looking for. This produces a 1x7 vector, but I can't figure out how to get the other 9 vectors:
x <- runif(7, 0, 1)
for(i in 1:10){
  samp <- sample(x, size = length(x), replace = T)
}
This is a very convenient way to do this:
replicate(10,sample(x,length(x),replace = TRUE))
Since you seem to want to sample with replacement, you can just get the 7*10 samples at once (which is more efficient for large sizes):
x <- runif(7)
n <- 10
xn <- length(x)
matrix(x[sample.int(xn, xn*n, replace=TRUE)], nrow=xn)
# Or slightly shorter:
matrix(sample(x, length(x)*n, replace=TRUE), ncol=n)
The second version uses sample directly, but there are some issues with that: if x is a numeric of length 1, bad things happen. sample.int is safer.
x <- c(pi, -pi)
sample(x, 5, replace=T) # OK
x <- pi
sample(x, 5, replace=T) # OOPS, interpreted as 1:3 instead of pi...
Looks like you got a suitable answer, but here's an approach that's similar to your first attempt. The difference is that we define samp with the appropriate dimensions, and then iteratively index into that object and fill it one row at a time:
samp <- matrix(NA, ncol = 7, nrow = 10)
for(i in 1:10){
  samp[i, ] <- sample(x, size = length(x), replace = T)
}

Using coefficient of variation in aggregate

I have a data frame with 50000 rows and 200 columns. There are duplicate rows in the data, and I want to aggregate the data by choosing the row with the maximum coefficient of variation among the duplicates, using the aggregate function in R. With aggregate I can use "mean" or "sum" by default, but not the coefficient of variation.
For example
aggregate(data, as.columnname, FUN=mean)
Works fine.
I have a custom function for calculating the coefficient of variation, but I am not sure how to use it with aggregate.
co.var <- function(x)
(
100*sd(x)/mean(x)
)
I have tried
aggregate(data, as.columnname, function (x) max (co.var (x, data[index (x),])
but it is giving an error as object x is not found.
Assuming that I understand your problem, I would suggest using tapply() instead of aggregate() (see ?tapply for more info). However, a minimal working example would be very helpful.
co.var <- function(x) ( 100*sd(x)/mean(x) )
## Data with multiple repeated measurements:
## three things (ID 1, 2, 3) are each measured twice,
## with two values per measurement (val1 and val2)
myDF <- data.frame(ID = c(1, 2, 3, 1, 2, 3),
                   val1 = c(20, 10, 5, 25, 7, 2),
                   val2 = c(19, 9, 4, 24, 4, 1))
## Calculate the coefficient of variation for each measurement set
myDF$coVar <- apply(myDF[, c("val1", "val2")], 1, co.var)
## Use tapply() instead of aggregate
mySel <- tapply(seq_len(nrow(myDF)), myDF$ID, function(x){
  curSub <- myDF[x, ]
  return(x[which(curSub$coVar == max(curSub$coVar))])
})
## mySel is then the vector of rows that correspond to the
## maximum coefficient of variation for each ID
myDF[mySel, ]
EDIT:
There are faster ways, one of which is below. However, with a 40000 by 100 dataset, the above code only took between 16 and 20 seconds on my machine.
# Create a big dataset
myDF <- data.frame(val1 = c(20, 10, 5, 25, 7, 2),
                   val2 = c(19, 9, 4, 24, 4, 1))
myDF <- myDF[sample(seq_len(nrow(myDF)), 40000, replace = TRUE), ]
myDF <- cbind(myDF, rep(myDF, 49))
myDF$ID <- sample.int(nrow(myDF)/5, nrow(myDF), replace = TRUE)
# Define a new function to work (slightly) better with large datasets
co.var.df <- function(x) ( 100*apply(x,1,sd)/rowMeans(x) )
# Create two datasets to benchmark the two methods
# (A second method proved slower than the third, hence the naming)
myDF.firstMethod <- myDF
myDF.thirdMethod <- myDF
Time the original method
startTime <- Sys.time()
myDF.firstMethod$coVar <- apply(myDF.firstMethod[,
  grep("val", names(myDF.firstMethod))], 1, co.var)
mySel <- tapply(seq_len(nrow(myDF.firstMethod)),
                myDF.firstMethod$ID, function(x) {
                  curSub <- myDF.firstMethod[x, ]
                  return(x[which(curSub$coVar == max(curSub$coVar))])
                }, simplify = FALSE)
endTime <- Sys.time()
endTime <- Sys.time()
R> endTime-startTime
Time difference of 17.87806 secs
Time the third method
startTime3 <- Sys.time()
coVar3 <- co.var.df(myDF.thirdMethod[,
  grep("val", names(myDF.thirdMethod))])
mySel3 <- tapply(seq_along(coVar3),
                 myDF.thirdMethod[, "ID"], function(x) {
                   return(x[which(coVar3[x] == max(coVar3[x]))])
                 }, simplify = FALSE)
endTime3 <- Sys.time()
endTime3 <- Sys.time()
R> endTime3-startTime3
Time difference of 2.024207 secs
And check to see that we get the same results:
R> all.equal(mySel,mySel3)
[1] TRUE
There is an additional change from the original post, in that the edited code considers that there may be more than one row with the highest CV for a given ID. Therefore, to get the results from the edited code, you must unlist the mySel or mySel3 objects:
myDF.firstMethod[unlist(mySel),]
myDF.thirdMethod[unlist(mySel3),]
