I am looking for a speed improvement for a function that imports several ".txt" files into one data frame (adding the file name as a column). There are more than 10,000 ".txt" files, all with the same structure, located in one directory with several subdirectories. Together the 10,000 files are around 800 MB. It currently takes a couple of hours to load them all into a data frame.
My PC: Toshiba P50t with 8GB RAM and 1TB HDD
Please see the code I am using.
I am happy to hear suggestions on how to improve loading speed (I would prefer not to use an intermediary tool, such as loading the data into MS SQL and then importing it into R). I have tried fread instead of read_csv without a significant speed difference.
files_to_df_v01 <- function(directory, Output_file_name, What_stocks) {
  library(dplyr)   # arrange(), filter(), select(), %>%
  library(stringr) # str_count(), str_locate_all()
  library(readr)   # read_csv()
  List <- data.frame(dir(directory, pattern = "*.txt", recursive = T))
  names(List)[1] <- "Path_file"
  List <- arrange(List, List$Path_file)
  List_wse_stocks <- filter(List, str_count(List$Path_file, pattern = What_stocks) > 0)
  rownumber = 1
  setwd(directory)
  ############## LOOP ################
  for (i in List_wse_stocks$Path_file) {
    if (file.info(i)$size != 0) {
      dat <- read_csv(i, col_types = cols(Ticker = col_character(),
                                          Date = col_date(format = "%Y-%m-%d"),
                                          Open = col_double(), High = col_double(),
                                          Low = col_double(), Close = col_double(),
                                          Volume = col_integer(), OpenInt = col_integer()))
      L_ <- str_locate_all(i, "/")
      File_name <- substr(i, sapply(L_, max) + 1, nchar(i))
      dat$Ticker <- substr(File_name, 1, nchar(File_name) - 4)
      datt <- dat %>% select(Ticker, Date, Open, High, Low, Close, Volume, OpenInt)
      if (rownumber == 1) {
        rownumber <- rownumber + 1
        GPW_wse_stocks <- datt
      } else {
        GPW_wse_stocks <- rbind(GPW_wse_stocks, datt)
      }
    }
  }
  ############## END of LOOP ################
  save(GPW_wse_stocks, file = Output_file_name)
  return(data.frame(GPW_wse_stocks))
}
Using data.table, I managed to get a solution around 4 times faster:
# Creating test data :
dir.create("Test")
dd <- "Test/csvReadingTest2"
dir.create(dd)
dir.create(file.path(dd, "v1"))
dir.create(file.path(dd, "v2"))
n <- 3000
f <- function(x) sample(x, n, replace = T)
require(data.table)
set.seed(123)
d1 <- data.table(Ticker = f(LETTERS),
Date = f(seq.Date(as.Date("2016-01-01"), by = "month",
length.out = n/100)),
Open = f(c(1.2, 1.3)), High = f(c(1.2, 1.3)),
Low = f(c(1.2, 1.3)), Close = f(c(1.2, 1.3)),
Volume = f(1:10), OpenInt = f(1:10))
d1
# Ticker Date Open High Low Close Volume OpenInt
# 1: H 2203-04-01 1.2 1.3 1.2 1.2 6 4
# 2: N 2121-05-01 1.2 1.3 1.2 1.2 9 6
# 3: E 2060-04-01 1.3 1.2 1.2 1.3 1 3
# 4: V 2132-04-01 1.3 1.3 1.3 1.2 7 8
# 5: F 2253-04-01 1.2 1.3 1.3 1.2 3 10
# ---
# 2996: J 2027-05-01 1.3 1.3 1.2 1.2 7 6
# 2997: K 2177-05-01 1.2 1.3 1.2 1.2 5 4
# 2998: S 2200-03-01 1.2 1.2 1.2 1.2 6 2
# 2999: V 2110-05-01 1.3 1.3 1.3 1.2 4 3
# 3000: Q 2043-05-01 1.2 1.3 1.2 1.2 3 5
invisible(lapply(1:100, function(x) fwrite(d1, paste0(dd, "/v1/d", x, ".txt"))))
invisible(lapply(1:100, function(x) fwrite(d1, paste0(dd, "/v2/d", x, ".txt"))))
A slightly modified version of your function:
################################################################################
yourFunction_modified <- function(directory, Output_file_name, What_stocks) {
  # require(plyr)
  require(dplyr)
  require(stringr)
  library(readr)
  # List <- data.frame(dir(directory, pattern = "*.txt", recursive = T))
  # names(List)[1] <- "Path_file"
  # List <- arrange(List, List$Path_file)
  # List_wse_stocks <- (filter(List , str_count(List$Path_file ,
  #                            pattern = What_stocks ) > 0 ))
  l <- list.files(directory, recursive = T, full.names = T, pattern = "*.txt")
  l <- l[grepl(What_stocks, l)]
  rownumber = 1
  for (i in l) {
    if (file.info(i)$size != 0) {
      dat <- read_csv(i,
                      col_types = cols(Ticker = col_character(),
                                       Date = col_date(format = "%Y-%m-%d"),
                                       Open = col_double(), High = col_double(),
                                       Low = col_double(), Close = col_double(),
                                       Volume = col_integer(),
                                       OpenInt = col_integer()))
      L_ = (str_locate_all(i,"/"))
      File_name <- substr(i, sapply(L_, max) + 1, nchar(i))
      dat$Ticker <- substr(File_name, 1, nchar(File_name) - 4)
      datt = dat %>% select(Ticker, Date, Open, High, Low, Close,
                            Volume, OpenInt)
      if (rownumber == 1) {
        rownumber = rownumber + 1
        GPW_wse_stocks <- datt
      } else {
        GPW_wse_stocks <- rbind(GPW_wse_stocks, datt)
      }
    }
  }
  save(GPW_wse_stocks, file = Output_file_name)
  return(data.frame(GPW_wse_stocks))
}
system.time(
x <- yourFunction_modified(dd, file.path(dirname(dd), "csvReadingTest2.Rdat"),
"/d[0-9]")
)
# 18 - 25 sec
My function:
myFun <- function(directory, Output_file_name, What_stocks) {
  require(data.table)
  require(Hmisc)
  l <- list.files(directory, recursive = T, full.names = T, pattern = "*.txt")
  l <- l[grepl(What_stocks, l)]
  l <- l[file.info(l)$size != 0]
  dtList <- lapply(l, function(i) {
    dat <- fread(i)
    File_name <- basename(i)
    dat$Ticker <- substr(File_name, 1, nchar(File_name) - 4)
    necessary <- Cs(Ticker, Date, Open, High, Low, Close, Volume, OpenInt)
    # Delete unnecessary columns:
    for (ii in setdiff(colnames(dat), necessary)) {
      set(dat, j = ii, value = NULL)
    }
    dat
  })
  dtList[1:2]
  dt <- rbindlist(dtList, use.names = T, fill = T, idcol = F)
  require(fasttime)
  dt[, Date := as.Date(fastPOSIXct(Date))]
  save(dt, file = Output_file_name)
  return(dt[])
}
system.time(
x2 <- myFun(dd, file.path(dirname(dd), "csvReadingTest2v2.Rdat"),
"/d[0-9]")
)
# 4 - 6 sec
all.equal(as.data.table(x), x2)
# [1] TRUE
rbindlist(lapply(files, fread)) is pretty quick, though if you have a high number of small files and you don't care about preserving the filename, you may be best using the operating system directly.
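If you do want to keep the file name, a named list plus rbindlist()'s idcol argument preserves it at little extra cost; a minimal sketch (the files and res_named names are illustrative):
library(data.table)
files <- dir(pattern = "\\.csv", full.names = TRUE)
# idcol = "file" takes the list names, i.e. the file each block of rows came from
res_named <- rbindlist(setNames(lapply(files, fread), basename(files)), idcol = "file")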
Set up data (because the OP didn't): 10,000 files of 100 rows each.
setwd(tempdir())
dir.create("48492154")
setwd("48492154")
dates <- as.character(seq.Date(as.Date("2012-01-01"),
as.Date(Sys.Date()),
length.out = 500))
library(data.table)
for (i in 1:1e4) {
  DT <- data.table(Ticker = 1:100,
                   Date = sample(dates, size = 100),
                   Open = round(runif(100) + 100, 1),
                   Close = round(runif(100) + 100, 1),
                   Volume = sample(1:100),
                   OpenInt = 1:100)
  cat(i, "of 10,000\r")
  flush.console()
  fwrite(DT, paste0(i, ".csv"), showProgress = FALSE)
}
Simple method (this also handles the repeated headers and gets the colClasses nearer to the truth):
system.time({
res <- rbindlist(lapply(dir(pattern = "\\.csv"), fread))
})
#> user system elapsed
#> 5.46 3.17 8.62
Using Windows's system copy:
system.time({
# Windows only
shell("copy /b *.csv out.txt > dump.log")
new_res <- fread("out.txt")
# Delete the headers mixed in (whereas rbindlist() above
# handles this automatically -- and better)
for (j in names(new_res)) {
new_res <- new_res[.subset2(new_res, j) != j]
}
})
#> user system elapsed
#> 0.76 0.13 3.31
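A similar trick should work on Unix-like systems with cat in place of copy (a sketch, assuming a shell with cat and data.table's cmd= interface, which also appears in a benchmark further below):
system.time({
  # Unix-like systems only: concatenate every file and parse the stream once
  new_res <- fread(cmd = "cat *.csv")
  # Delete the headers mixed in, as in the Windows version above
  for (j in names(new_res)) {
    new_res <- new_res[.subset2(new_res, j) != j]
  }
})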
Rscript test.R ../Data/bam/a.bam:0 ../Data/bam/b.bam:0.1 ../Data/bam/c.bam:0.5 ../Data/bam/d.bam:1
I want to make a list of keys and values from the command-line arguments. I have used the following code.
#test.R
args <- commandArgs(trailingOnly = TRUE)
key_value_pairs <- strsplit(args, " ")
key_value_pairs <- lapply(key_value_pairs, function(x) strsplit(basename(x), ":")[[1]])
key_value_pairs <- as.data.frame(key_value_pairs, stringsAsFactors = FALSE)
colnames(key_value_pairs) <- c("key", "value")
key_value_pairs$value <- as.numeric(key_value_pairs$value)
print(key_value_pairs)
I got the following output:
key value
1 a.bam NA
2 0
NA
1 b.bam
2 0.1
NA
1 c.bam
2 0.5
NA
1 d.bam
2 1
NA
but I want output like:
key value
a.bam 0.0
b.bam 0.1
c.bam 0.5
d.bam 1
Can someone help me find the issue and how to solve it? Thanks.
Note that the command-line args are already separated by spaces, so you don't need to do that yourself. And you need a different strategy for creating your data.frame. This should work:
args <- commandArgs(trailingOnly = TRUE)
key_value_pairs <- lapply(args, function(x) strsplit(basename(x), ":")[[1]])
key_value_pairs <- as.data.frame(do.call("rbind", key_value_pairs), stringsAsFactors = FALSE)
colnames(key_value_pairs) <- c("key", "value")
key_value_pairs$value <- as.numeric(key_value_pairs$value)
key_value_pairs
# key value
# 1 a.bam 0.0
# 2 b.bam 0.1
# 3 c.bam 0.5
# 4 d.bam 1.0
An alternate way to do this would be
args <- commandArgs(trailingOnly = TRUE)
key_value_pairs <- read.table(text=basename(args), sep=":",
col.names = c("key", "value"))
That will pretty much do everything in one go.
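For a quick sanity check without invoking Rscript, the same one-liner can be run on a hard-coded vector mirroring the command line above (the vector is just an illustrative stand-in for commandArgs(trailingOnly = TRUE)):
# Simulated command-line arguments (illustrative only)
args <- c("../Data/bam/a.bam:0", "../Data/bam/b.bam:0.1",
          "../Data/bam/c.bam:0.5", "../Data/bam/d.bam:1")
read.table(text = basename(args), sep = ":", col.names = c("key", "value"))
#     key value
# 1 a.bam   0.0
# 2 b.bam   0.1
# 3 c.bam   0.5
# 4 d.bam   1.0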
Summary of real-world problem
Essentially this is a scenario evaluation, of a linear system of equations.
I have two data tables.
s_dt contains the scenarios, drivers (d) and values (v) for each observed scenario (o).
c_dt contains a series of terms (n) for a number of fitted model bases (b).
The individual powers of the drivers and the associated coefficients are coded into d and t as name-value pairs.
Each basis (b) is essentially a polynomial with n terms.
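In other words (my reading of the setup, matching the fcase() logic in fun below): for each scenario o and basis b,
r(o, b) = sum over terms n of [ product over the rows of term n of x ],  where x = t if d == "c", x = 0 if v is missing, and x = v^t otherwise.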
The issue
The repro case below gives the desired output format, but it is far too slow for the required use case, even on a cut-down problem.
The numbers are junk, as I can't share the actual data; running on real-world data gives similar timings.
It takes circa 3 sec for the "lil" problem on my system (12 threads).
But the "big" problem is 4000 times larger, so expect circa 3 hours. Ouch!
The aim is to have the "big" problem run in under 5 min (or ideally much faster!).
So, awesome clever people, how can this be made a lot faster?
(And what is the root cause of the slowdown?)
I'll happily accept base/tidyverse solutions too, if they meet the performance needs. I just assumed data.table was the best way to go for the size of the problem.
Current solution
Run fun on s_dt, grouping by o.
fun: joins c_dt with each group's data to populate v, thus enabling calculation of r (the result of evaluating each of the polynomial equations).
In data.table parlance:
s_dt[, fun(.SD), keyby = .(o)]
Repro case
Creates two data.tables with the combinations and field types matching the real-world problem, but at a cut-down size for illustrative purposes.
Defines fun, then runs to populate r for all scenarios.
library(data.table)
# problem sizing ----
dims <- list(o = 50000, d = 50, b = 250, n = 200) # "big" problem - real-life size
dims <- list(o = 100, d = 50, b = 25, n = 200) # "lil" problem (make runtime shorter as example)
# build some test data tables ----
build_s <- function() {
  o <- seq_len(dims$o)
  d <- paste0("d", seq_len(dims$d))
  v <- as.double(seq_len(dims$o * dims$d))/10000
  CJ(o, d)[, `:=`(v = v)]
}
s_dt <- build_s()
build_c <- function() {
  b <- paste0("c", seq_len(dims$b))
  n <- seq_len(dims$n)
  d <- c("c", paste0("d", seq_len(dims$d)))
  t <- as.double(rep_len(0:6, dims$b * dims$n * (dims$d+1)))
  dt <- CJ(d, b, n)[, `:=`(t = t)]
  dt <- dt[t != 0]
}
c_dt <- build_c()
# define fun and evaluate ----
# (this is what needs optimising)
profvis::profvis({
  fun <- function(dt) {
    # don't use chaining here, for more useful profvis output
    dt <- dt[c_dt, on = .(d)]
    dt <- dt[, r := fcase(d == "c", t,
                          is.na(v), 0,
                          rep(TRUE, .N), v^t)]
    dt <- dt[, .(r = prod(r)), keyby = .(b, n)]
    dt <- dt[, .(r = sum(r)), keyby = .(b)]
  }
  res <- s_dt[, fun(.SD), keyby = .(o)]
})
Example inputs and outputs
> res
o b r
1: 1 c1 0.000000e+00
2: 1 c10 0.000000e+00
3: 1 c11 0.000000e+00
4: 1 c12 0.000000e+00
5: 1 c13 0.000000e+00
---
2496: 100 c5 6.836792e-43
2497: 100 c6 6.629646e-43
2498: 100 c7 6.840915e-43
2499: 100 c8 6.624668e-43
2500: 100 c9 6.842608e-43
> s_dt
o d v
1: 1 d1 0.0001
2: 1 d10 0.0002
3: 1 d11 0.0003
4: 1 d12 0.0004
5: 1 d13 0.0005
---
4996: 100 d50 0.4996
4997: 100 d6 0.4997
4998: 100 d7 0.4998
4999: 100 d8 0.4999
5000: 100 d9 0.5000
> c_dt
d b n t
1: c c1 2 1
2: c c1 3 2
3: c c1 4 3
4: c c1 5 4
5: c c1 6 5
---
218567: d9 c9 195 5
218568: d9 c9 196 6
218569: d9 c9 198 1
218570: d9 c9 199 2
218571: d9 c9 200 3
This would be difficult to fully vectorize. The "big" problem requires so many operations that going parallel is probably the most straightforward way to get to ~5 minutes.
But first, we can get a ~3x speed boost by using RcppArmadillo for the product and sum calculations instead of data.table's grouping operations.
library(data.table)
library(parallel)
Rcpp::cppFunction(
"std::vector<double> sumprod(arma::cube& a) {
for(unsigned int i = 1; i < a.n_slices; i++) a.slice(0) %= a.slice(i);
return(as<std::vector<double>>(wrap(sum(a.slice(0), 0))));
}",
depends = "RcppArmadillo",
plugins = "cpp11"
)
cl <- makeForkCluster(detectCores() - 1L)
The following approach requires extensive preprocessing. The upshot is that it makes it trivial to parallelize. However, it will work only if the values of s_dt$d are the same for each o as in the MRE:
identical(s_dt$d, rep(s_dt[o == 1]$d, length.out = nrow(s_dt)))
#> [1] TRUE
Now let's build the functions to accept s_dt and c_dt:
# slightly modified original function for comparison
fun1 <- function(dt, c_dt) {
  # don't use chaining here, for more useful profvis output
  dt <- dt[c_dt, on = .(d)]
  dt <- dt[, r := fcase(d == "c", t,
                        is.na(v), 0,
                        rep(TRUE, .N), v^t)]
  dt <- dt[, .(r = prod(r)), keyby = .(b, n)]
  dt <- dt[, .(r = sum(r)), keyby = .(b)]
}
fun2 <- function(s_dt, c_dt, cl = NULL) {
  s_dt <- copy(s_dt)
  c_dt <- copy(c_dt)
  # preprocess to get "a", "tt", "i", and "idxs"
  i_dt <- s_dt[o == 1][, idxs := .I][c_dt, on = .(d)][, ic := .I][!is.na(v)]
  ub <- unique(c_dt$b)
  un <- unique(c_dt$n)
  nb <- length(ub)
  nn <- length(un)
  c_dt[, `:=`(i = match(n, un) + nn*(match(b, ub) - 1L), r = 0)]
  c_dt[, `:=`(i = i + (0:(.N - 1L))*nn*nb, ni = .N), i]
  c_dt[d == "c", r := t]
  a <- array(1, c(nn, nb, max(c_dt$ni)))
  a[c_dt$i] <- c_dt$r # 3-d array to store v^t (updated for each unique "o")
  i <- c_dt$i[i_dt$ic] # the indices of "a" to update (same for each unique "o")
  tt <- c_dt$t[i_dt$ic] # c_dt$t ordered for "a" (same for each unique "o")
  idxs <- i_dt$idxs # the indices to order s_dt$v (same for each unique "o")
  uo <- unique(s_dt$o)
  v <- collapse::gsplit(s_dt$v, s_dt$o)
  if (is.null(cl)) {
    # non-parallel solution
    data.table(
      o = rep(uo, each = length(ub)),
      b = rep(ub, length(v)),
      r = unlist(
        lapply(
          v,
          function(x) {
            a[i] <- x[idxs]^tt
            sumprod(a)
          }
        )
      ),
      key = "o"
    )
  } else {
    # parallel solution
    clusterExport(cl, c("a", "tt", "i", "idxs"), environment())
    data.table(
      o = rep(uo, each = length(ub)),
      b = rep(ub, length(v)),
      r = unlist(
        parLapply(
          cl,
          v,
          function(x) {
            a[i] <- x[idxs]^tt
            sumprod(a)
          }
        )
      ),
      key = "o"
    )
  }
}
Now the data:
# problem sizing ----
bigdims <- list(o = 50000, d = 50, b = 250, n = 200) # "big" problem - real-life size
lildims <- list(o = 100, d = 50, b = 25, n = 200) # "lil" problem (make runtime shorter as example)
# build some test data tables ----
build_s <- function(dims) {
  o <- seq_len(dims$o)
  d <- paste0("d", seq_len(dims$d))
  v <- as.double(seq_len(dims$o * dims$d))/10000
  CJ(o, d)[, `:=`(v = v)]
}
build_c <- function(dims) {
  b <- paste0("c", seq_len(dims$b))
  n <- seq_len(dims$n)
  d <- c("c", paste0("d", seq_len(dims$d)))
  t <- as.double(rep_len(0:6, dims$b * dims$n * (dims$d+1)))
  dt <- CJ(d, b, n)[, `:=`(t = t)]
  dt <- dt[t != 0]
}
Timing the lil problem, which is so small that parallelization doesn't help:
s_dt <- build_s(lildims)
c_dt <- build_c(lildims)
microbenchmark::microbenchmark(fun1 = s_dt[, fun1(.SD, c_dt), o],
fun2 = fun2(s_dt, c_dt),
times = 10,
check = "equal")
#> Unit: seconds
#> expr min lq mean median uq max neval
#> fun1 3.204402 3.237741 3.383257 3.315450 3.404692 3.888289 10
#> fun2 1.134680 1.138761 1.179907 1.179872 1.210293 1.259249 10
Now the big problem:
s_dt <- build_s(bigdims)
c_dt <- build_c(bigdims)
system.time(dt2p <- fun2(s_dt, c_dt, cl))
#> user system elapsed
#> 24.937 9.386 330.600
stopCluster(cl)
A bit longer than 5 minutes with 31 cores.
To do cross-validation for CCLE (Cancer Cell Line Encyclopedia) drug data, I tried to convert the following code from Matlab to R, but I was unsuccessful. The Matlab code works fine and creates both a *cross.mat (a group of 10-fold CV data for each data set) and a *data.mat (the grouped data from 10 repetitions of CV for each data set).
I would appreciate it if you could help me find my mistake.
#This function is about 10-fold cross-validation data grouping
getcrossMatrixs <- function(MM){
  library(pracma)
  N <- nnz(MM)
  zeroM <- matrix(0L, nrow = dim(MM)[1], ncol = dim(MM)[2])
  D <- randperm(N)
  first <- floor(N/10)
  w <- which(MM != 0, arr.ind = TRUE)
  nrows <- w[,1]; ncols <- w[,2]
  crossdata <- list()
  for (i in 1:10) {
    crossdata[[i]] <- zeroM
  }
  for (i in 1:10){
    for (j in (1+(i-1)*first):(i*first)){
      crossdata[[i]][c(nrows[D[j]]), c(ncols[D[j]])] <- MM[c(nrows[D[j]]), c(ncols[D[j]])]
    }
  }
  k <- (N-(10*first))
  i <- 10*first+1
  for (j in 1:k){
    crossdata[[j]][c(nrows[D[i]]), c(ncols[D[i]])] <- MM[c(nrows[D[i]]), c(ncols[D[i]])]
    i <- i+1
  }
}
# The following lines are the main script for calling the above function.
library(foreach)
n.cores <- parallel::detectCores()
my.cluster <- parallel::makeCluster(
n.cores,
type = "PSOCK"
)
print(my.cluster)
#> socket cluster with 16 nodes on host 'localhost'
doParallel::registerDoParallel(cl = my.cluster)
foreach::getDoParRegistered()
#> [1] TRUE
CCLEdata <- list()
#MM<-matrix(read_csv("MM.csv", col_names = FALSE, show_col_types = FALSE), rownames.force = NA)
MM <- matrix(seq(0, 4.5, length.out = 11784), nrow = 491) # data matrix like the CCLE drug activity-area sensitivity matrix (491 x 24)
foreach(i = 1:10) %dopar% {
  CCLEcross <- getcrossMatrixs(MM)
  CCLEdata[[i]] <- CCLEcross
}
#> [[1]]
#> NULL
#>
#> [[2]]
#> NULL
#>
#> [[3]]
#> NULL
#>
#> [[4]]
#> NULL
#>
#> [[5]]
#> NULL
#>
#> [[6]]
#> NULL
#>
#> [[7]]
#> NULL
#>
#> [[8]]
#> NULL
#>
#> [[9]]
#> NULL
#>
#> [[10]]
#> NULL
Created on 2022-08-29 with reprex v2.0.2
Actually, when I use the original CCLE dataset, the error changes in main.R:
Error in { : task 1 failed - "is.numeric(x) || is.complex(x) is not TRUE"
or
Error in { :
task 1 failed - "attempt to select less than one element in integerOneIndex"
%These are from Matlab
function [crossdata] = getcrossMatrixs(MM)
N = nnz(MM(:));
zeroM = zeros(size(MM));
D = randperm(N);
first = floor(N/10);
[nrows,ncols] = find(MM);
crossdata = {};
for i = 1:10
crossdata{i} = zeroM;
end
for i = 1:10
for j = 1+(i-1)*first:i*first
crossdata{i}(nrows(D(j)),ncols(D(j))) = MM(nrows(D(j)),ncols(D(j)));
end
end
k=N -10*first ;
i=10*first+1;
for j=1:k
crossdata{j}(nrows(D(i)),ncols(D(i))) = MM(nrows(D(i)),ncols(D(i)));
i=i+1;
end
end
load('MM.mat')
parfor i=1:10
[CCLEcross] = getcrossMatrixs(MM);
CCLEdata{i}=CCLEcross;
end
I didn't look too closely to figure out what was wrong. I based this function on the Matlab function supplied. Note that for this particular example, going parallel is more expensive due to overhead; parallel will provide a performance benefit with large enough matrices and/or more samples.
library(parallel)
MM <- matrix(seq(0, 4.5, length.out = 11784), nrow = 491)
getcrossMatrixs <- function(MM, parts = 10L) {
  D <- sample(which(MM != 0))
  first <- length(D) %/% parts
  last <- length(D) %% parts
  idx <- c(0L, cumsum(c(rep(first + 1L, last), rep(first, parts - last))))
  mZero <- matrix(0, nrow(MM), ncol(MM))
  lapply(1:parts, function(i, m) {m[D[(idx[i] + 1L):idx[i + 1L]]] <- MM[D[(idx[i] + 1L):idx[i + 1L]]]; m}, mZero)
}
reps <- 10L
clust <- makeCluster(min(detectCores() - 1L, reps))
clusterExport(clust, c("getcrossMatrixs", "MM"))
CCLEdata <- parLapply(clust, 1:reps, function(x) getcrossMatrixs(MM))
stopCluster(clust)
# check that each set of matrices returned has all elements of MM
identical(rep(list(MM), reps), lapply(1:reps, function(i) Reduce("+", CCLEdata[[i]], matrix(0, nrow(MM), ncol(MM)))))
#> [1] TRUE
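For comparison, the serial (non-parallel) equivalent is a plain lapply, which will likely be faster at this example's size given the overhead mentioned above (a sketch):
# Serial version: no cluster setup or export overhead
CCLEdata_serial <- lapply(seq_len(reps), function(x) getcrossMatrixs(MM))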
And here's a cleaned-up version of the Matlab function:
function [crossdata] = getcrossMatrixs(MM)
idx = find(MM);
N = length(idx);
zeroM = zeros(size(MM));
idx = idx(randperm(N));
first = floor(N/10);
crossdata = cell(10, 1);
for i = 1:10
crossdata{i} = zeroM;
end
for i = 1:10
j = 1 + (i - 1)*first:i*first;
crossdata{i}(idx(j)) = MM(idx(j));
end
k = N - 10*first;
j = 10*first + 1;
for i = 1:k
crossdata{i}(idx(j)) = MM(idx(j));
j = j + 1;
end
end
I am trying to read a large csv file into R. I only want to read and work with some of the rows that fulfil a particular condition (e.g. Variable2 >= 3). This is a much smaller dataset.
I want to read these lines directly into a dataframe, rather than load the whole dataset into a dataframe and then select according to the condition, since the whole dataset does not easily fit into memory.
You could use the read.csv.sql function in the sqldf package and filter using SQL select. From the help page of read.csv.sql:
library(sqldf)
write.csv(iris, "iris.csv", quote = FALSE, row.names = FALSE)
iris2 <- read.csv.sql("iris.csv",
sql = "select * from file where `Sepal.Length` > 5", eol = "\n")
By far the easiest (in my book) is to use pre-processing.
R> DF <- data.frame(n=1:26, l=LETTERS)
R> write.csv(DF, file="/tmp/data.csv", row.names=FALSE)
R> read.csv(pipe("awk 'BEGIN {FS=\",\"} {if ($1 > 20) print $0}' /tmp/data.csv"),
+ header=FALSE)
V1 V2
1 21 U
2 22 V
3 23 W
4 24 X
5 25 Y
6 26 Z
R>
Here we use awk. We tell awk to use a comma as the field separator, and then use the condition 'if the first field is greater than 20' to decide whether to print the whole line (via $0).
The output from that command can be read by R via pipe().
This is going to be faster and more memory-efficient than reading everything into R.
I was looking into readr::read_csv_chunked when I saw this question and thought I would do some benchmarking. For this example, read_csv_chunked does well and increasing the chunk size was beneficial. sqldf was only marginally faster than awk.
library(tidyverse)
library(sqldf)
library(data.table)
library(microbenchmark)
# Generate an example dataset with two numeric columns and 5 million rows
tibble(
norm = rnorm(5e6, mean = 5000, sd = 1000),
unif = runif(5e6, min = 0, max = 10000)
) %>%
write_csv('medium.csv')
microbenchmark(
readr = read_csv_chunked('medium.csv', callback = DataFrameCallback$new(function(x, pos) subset(x, unif > 9000)), col_types = 'dd', progress = F),
readr2 = read_csv_chunked('medium.csv', callback = DataFrameCallback$new(function(x, pos) subset(x, unif > 9000)), col_types = 'dd', progress = F, chunk_size = 1000000),
sqldf = read.csv.sql('medium.csv', sql = 'select * from file where unif > 9000', eol = '\n'),
awk = read.csv(pipe("awk 'BEGIN {FS=\",\"} {if ($2 > 9000) print $0}' medium.csv")),
awk2 = read_csv(pipe("awk 'BEGIN {FS=\",\"} {if ($2 > 9000) print $0}' medium.csv"), col_types = 'dd', progress = F),
fread = fread(cmd = "awk 'BEGIN {FS=\",\"} {if ($2 > 9000) print $0}' medium.csv"),
check = function(values) all(sapply(values[-1], function(x) all.equal(values[[1]], x))),
times = 10L
)
# Updated 2020-05-29
# Unit: seconds
# expr min lq mean median uq max neval
# readr 2.6 2.7 3.1 3.1 3.5 4.0 10
# readr2 2.3 2.3 2.4 2.4 2.6 2.7 10
# sqldf 14.1 14.1 14.7 14.3 15.2 16.0 10
# awk 18.2 18.3 18.7 18.5 19.3 19.6 10
# awk2 18.1 18.2 18.6 18.4 19.1 19.4 10
# fread 17.9 18.0 18.2 18.1 18.2 18.8 10
# R version 3.6.2 (2019-12-12)
# macOS Mojave 10.14.6
# data.table 1.12.8
# readr 1.3.1
# sqldf 0.4-11
You can read the file in chunks, process each chunk, and then stitch only the subsets together.
Here is a minimal example assuming the file has 1001 lines (incl. the header) and only 100 will fit into memory at a time. The data has 3 columns, and we expect at most 150 rows to meet the condition (this is needed to pre-allocate the space for the final data):
# initialize empty data.frame (150 x 3)
max.rows <- 150
final.df <- data.frame(Variable1 = rep(NA, max.rows),
                       Variable2 = NA,
                       Variable3 = NA)
# read the first chunk outside the loop
temp <- read.csv('big_file.csv', nrows = 100, stringsAsFactors = FALSE)
temp <- temp[temp$Variable2 >= 3, ] ## keep only the rows meeting the condition
final.df[1:nrow(temp), ] <- temp ## add to the data
last.row <- nrow(temp) ## keep track of the number of filled rows
for (i in 1:9){ ## nine chunks remaining to be read
  temp <- read.csv('big_file.csv', skip = i*100 + 1, nrow = 100, header = FALSE,
                   stringsAsFactors = FALSE)
  names(temp) <- names(final.df) ## header=FALSE, so restore the column names
  temp <- temp[temp$Variable2 >= 3, ]
  if (nrow(temp) > 0) { ## guard against chunks with no matching rows
    final.df[(last.row + 1):(last.row + nrow(temp)), ] <- temp
    last.row <- last.row + nrow(temp) ## increment the current count
  }
}
final.df <- final.df[1:last.row, ] ## only keep filled rows
rm(temp) ## remove last chunk to free memory
Edit: added the stringsAsFactors=FALSE option on @lucacerone's suggestion in the comments.
You can open the file in read mode using the function file (e.g. fc <- file("mydata.csv", open = "r")).
You can read the file one line at a time using the function readLines with the option n = 1: l <- readLines(fc, n = 1).
Then you have to parse the string using functions such as strsplit or regular expressions, or you can try the package stringr (available from CRAN).
If the line meets the condition to import the data, you import it.
To summarize, I would do something like this:
df = data.frame(var1 = character(), var2 = integer(), stringsAsFactors = FALSE)
fc = file("myfile.csv", open = "r")
i = 0
while (length(l <- readLines(fc, n = 1)) > 0) { # note: l is assigned inside the loop condition
  ## parse l here and check whether you need to import the data
  if (need_to_add_data) {
    i = i + 1
    df[i,] = # list of data to import
  }
}
close(fc)
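To make that concrete, here is a filled-in sketch for the question's Variable2 >= 3 filter; the file name, the three numeric columns, and the comma separator are assumptions for illustration:
# Hypothetical concrete version of the skeleton above
df <- data.frame(Variable1 = numeric(), Variable2 = numeric(), Variable3 = numeric())
fc <- file("big_file.csv", open = "r")
invisible(readLines(fc, n = 1))               # skip the header line
i <- 0
while (length(l <- readLines(fc, n = 1)) > 0) {
  fields <- as.numeric(strsplit(l, ",")[[1]]) # parse one line into numbers
  if (fields[2] >= 3) {                       # the condition from the question
    i <- i + 1
    df[i, ] <- fields
  }
}
close(fc)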
I have a large table consisting of several genes (newID) with associated values. Some genes (newID) are unique; some have several instances (appear in multiple rows). How can I exclude from the table those with only one occurrence (row)? In the example below, only the last row would be removed, as it is unique.
head(exons.s, 10)
Row.names exonID pvalue log2fold.5_t.GFP_t. newID
1 ENSMUSG00000000001_Gnai3:E001 E001 0.3597070 0.029731989 ENSMUSG00000000001
2 ENSMUSG00000000001_Gnai3:E002 E002 0.6515167 0.028984837 ENSMUSG00000000001
3 ENSMUSG00000000001_Gnai3:E003 E003 0.8957798 0.009665072 ENSMUSG00000000001
4 ENSMUSG00000000001_Gnai3:E004 E004 0.5308266 -0.059273822 ENSMUSG00000000001
5 ENSMUSG00000000001_Gnai3:E005 E005 0.4507640 -0.061276835 ENSMUSG00000000001
6 ENSMUSG00000000001_Gnai3:E006 E006 0.5147357 -0.068357886 ENSMUSG00000000001
7 ENSMUSG00000000001_Gnai3:E007 E007 0.5190718 -0.063959853 ENSMUSG00000000001
8 ENSMUSG00000000001_Gnai3:E008 E008 0.8999434 0.032186993 ENSMUSG00000000001
9 ENSMUSG00000000001_Gnai3:E009 E009 0.5039369 0.133313175 ENSMUSG00000000001
10 ENSMUSG00000000003_Pbsn:E001 E001 NA NA ENSMUSG00000000003
> dim(exons.s)
[1] 234385 5
With plyr I would go about it like this:
## remove single exon genes:
multEx <- function(df){
  if (nrow(df) > 1){return(df)}
}
genes.mult.ex <- ddply(exons.s , .(newID), multEx, .parallel=TRUE)
But this is very slow. I thought this would be easy with data.table but I can't figure it out:
exons.s <- data.table(exons.s, key="newID")
x.dt.out <- exons.s[, lapply(.SD, multEx), by=newID]
I am new to data.table so any pointers in the right direction would be welcome.
Create a column giving the number of rows in each group, then subset:
exons.s[,n:=.N,by=newID]
exons.s[n>1]
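If you would rather not keep the helper column n, the same filter can be written in one step with standard data.table grouping (a sketch; newID comes out as the first column):
exons.s[, if (.N > 1) .SD, by = newID]  # keep only groups with more than one row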
There is a simpler and more efficient way of doing this using the duplicated() function instead of counting the group sizes.
First we need to generate a test dataset:
# Generate test datasets
smallNumberSampled <- 1e3
largeNumberSampled <- 1e6
smallDataset <- data.table(id=paste('id', 1:smallNumberSampled, sep='_'), value1=sample(x = 1:26, size = smallNumberSampled, replace = T), value2=letters[sample(x = 1:26, size = smallNumberSampled, replace = T)])
largeDataset <- data.table(id=paste('id', 1:largeNumberSampled, sep='_'), value1=sample(x = 1:26, size = largeNumberSampled, replace = T), value2=letters[sample(x = 1:26, size = largeNumberSampled, replace = T)])
# add 2 % duplicated rows:
smallDataset <- rbind(smallDataset, smallDataset[sample(x = 1:nrow(smallDataset), size = nrow(smallDataset)* 0.02)])
largeDataset <- rbind(largeDataset, largeDataset[sample(x = 1:nrow(largeDataset), size = nrow(largeDataset)* 0.02)])
Then we implement the three solutions as functions:
# Original suggestion
getDuplicatedRows_Count <- function(dt, columnName) {
  dt[, n := .N, by = columnName]
  return( dt[n > 1] )
}
# Duplicated using subsetting
getDuplicatedRows_duplicated_subset <- function(dt, columnName) {
  # .. means "look up one level"
  return( dt[which( duplicated(dt[, ..columnName]) | duplicated(dt[, ..columnName], fromLast = T) ),] )
}
# Duplicated using the "by" argument to avoid copying
getDuplicatedRows_duplicated_by <- function(dt, columnName) {
  return( dt[which( duplicated(dt, by = columnName) | duplicated(dt, by = columnName, fromLast = T) ),] )
}
Then we test that they give the same results
results1 <- getDuplicatedRows_Count (smallDataset, 'id')
results2 <- getDuplicatedRows_duplicated_subset(smallDataset, 'id')
results3 <- getDuplicatedRows_duplicated_by(smallDataset, 'id')
> identical(results1, results2)
[1] TRUE
> identical(results2, results3)
[1] TRUE
And then we time the average performance of the three solutions:
# Small dataset
> system.time( temp <- replicate(n = 100, expr = getDuplicatedRows_Count (smallDataset, 'id')) ) / 100
user system elapsed
0.00176 0.00007 0.00186
> system.time( temp <- replicate(n = 100, expr = getDuplicatedRows_duplicated_subset(smallDataset, 'id')) ) / 100
user system elapsed
0.00206 0.00005 0.00221
> system.time( temp <- replicate(n = 100, expr = getDuplicatedRows_duplicated_by (smallDataset, 'id')) ) / 100
user system elapsed
0.00141 0.00003 0.00147
#Large dataset
> system.time( temp <- replicate(n = 100, expr = getDuplicatedRows_Count (largeDataset, 'id')) ) / 100
user system elapsed
0.28571 0.01980 0.31022
> system.time( temp <- replicate(n = 100, expr = getDuplicatedRows_duplicated_subset(largeDataset, 'id')) ) / 100
user system elapsed
0.24386 0.03596 0.28243
> system.time( temp <- replicate(n = 100, expr = getDuplicatedRows_duplicated_by (largeDataset, 'id')) ) / 100
user system elapsed
0.22080 0.03918 0.26203
Which shows that the duplicated() approach scales better, especially if the "by=" option is used.
UPDATE: 21 Nov 2014. The test of identical output (as suggested by Arun - thanks) identified a problem with my use of data.table v1.9.2, where duplicated's fromLast does not work. I updated to v1.9.4, redid the analysis, and now the differences are much smaller.
UPDATE: 26 Nov 2014. Included and tested the "by=" approach to extract the column from the data.table (as suggested by Arun, so credit goes there). Furthermore, the runtime test was averaged over 100 runs to ensure a reliable result.