Stream processing large csv file in R - r

I need to make a couple of relatively simple changes to a very large csv file (c.8.5GB). I tried initially using various reader functions: read.csv, readr::read.csv, data.table::fread. However: they all run out of memory.
I'm thinking I need to use a stream processing approach instead; read a chunk, update it, write it, repeat. I found this answer which is on the right lines; however I don't how to terminate the loop (I'm relatively new to R).
So I have 2 questions:
What's the right way to make the while loop work?
Is there a better way (for some definition of 'better')? e.g. is there some way to do this using dplyr & pipes?
Current code as follows:
src_fname <- "testdata/model_input.csv"
tgt_fname <- "testdata/model_output.csv"
#Changes needed in file: rebase identifiers, set another col to constant value
rebase_data <- function(data, offset) {
data$'Unique Member ID' <- data$'Unique Member ID' - offset
data$'Client Name' <- "TestClient2"
return(data)
}
CHUNK_SIZE <- 1000
src_conn = file(src_fname, "r")
data <- read.csv(src_conn, nrows = CHUNK_SIZE, check.names=FALSE)
cols <- colnames(data)
offset <- data$'Unique Member ID'[1] - 1
data <- rebase_data(data, offset)
#1st time through, write the headers
tgt_conn = file(tgt_fname, "w")
write.csv(data,tgt_conn, row.names=FALSE)
#loop over remaining data
end = FALSE
while(end == FALSE) {
data <- read.csv(src_conn, nrows = CHUNK_SIZE, check.names=FALSE, col.names = cols)
data <- rebase_data(data, offset)
#write.csv doesn't support col.names=FALSE; so use write.table which does
write.table(data, tgt_conn, row.names=FALSE, col.names=FALSE, sep=",")
# ??? How to test for EOF and set end = TRUE if so ???
# This doesn't work, presumably because nrow() != CHUNK_SIZE on final loop?
if (nrow(data) < CHUNK_SIZE) {
end <- TRUE
}
}
close(src_conn)
close(tgt_conn)
Thanks for any pointers.

Sorry to poke a 2-year-old thread, but now with readr::read_csv_chunked (auto-loaded along with dplyr when loading tidyverse), we could also do like:
require(tidyverse)
## For non-exploratory code, as #antoine-sac suggested, use:
# require(readr) # for function `read_csv_chunked` and `read_csv`
# require(dplyr) # for the pipe `%>%` thus less parentheses
src_fname = "testdata/model_input.csv"
tgt_fname = "testdata/model_output.csv"
CHUNK_SIZE = 1000
offset = read_csv(src_fname, n_max=1)$comm_code %>% as.numeric() - 1
rebase.chunk = function(df, pos) {
df$comm_code = df$comm_code %>% as.numeric() - offset
df$'Client Name' = "TestClient2"
is.append = ifelse(pos > 1, T, F)
df %>% write_csv(
tgt_fname,
append=is.append
)
}
read_csv_chunked(
src_fname,
callback=SideEffectChunkCallback$new(rebase.chunk),
chunk_size = chunck.size,
progress = T # optional, show progress bar
)
Here the tricky part is to set is.append based on parameter pos, which indicates the start row number of the data frame df within original file. Within readr::write_csv, when append=F the header (columns name) will be written to file, otherwise not.

Try this out:
library("chunked")
read_chunkwise(src_fname, chunk_size=CHUNK_SIZE) %>%
rebase_data(offset) %>%
write_chunkwise(tgt_fname)
You may need to fiddle a bit with the colnames to get exactly what you want.
(Disclaimer: haven't tried the code)
Note that there is no vignette with the package but the standard usage is described on github: https://github.com/edwindj/chunked/

OK I found a solution, as follows:
# src_fname <- "testdata/model_input.csv"
# tgt_fname <- "testdata/model_output.csv"
CHUNK_SIZE <- 20000
#Changes needed in file: rebase identifiers, set another col to constant value
rebase_data <- function(data, offset) {
data$'Unique Member ID' <- data$'Unique Member ID' - offset
data$'Client Name' <- "TestClient2"
return(data)
}
#--------------------------------------------------------
# Get the structure first to speed things up
#--------------------------------------------------------
structure <- read.csv(src_fname, nrows = 2, check.names = FALSE)
cols <- colnames(structure)
offset <- structure$'Unique Member ID'[1] - 1
#Open the input & output files for reading & writing
src_conn = file(src_fname, "r")
tgt_conn = file(tgt_fname, "w")
lines_read <- 0
end <- FALSE
read_header <- TRUE
write_header <- TRUE
while(end == FALSE) {
data <- read.csv(src_conn, nrows = CHUNK_SIZE, check.names=FALSE, col.names = cols, header = read_header)
if (nrow(data) > 0) {
lines_read <- lines_read + nrow(data)
print(paste0("lines read this chunk: ", nrow(data), ", lines read so far: ", lines_read))
data <- rebase_data(data, offset)
#write.csv doesn't support col.names=FALSE; so use write.table which does
write.table(data, tgt_conn, row.names=FALSE, col.names=write_header, sep = ",")
}
if (nrow(data) < CHUNK_SIZE) {
end <- TRUE
}
read_header <- FALSE
write_header <- FALSE
}
close(src_conn)
close(tgt_conn)

Related

R: Split and write very large data frame into slices

I have a large dataframe my_df in R containing 1983000 records. The following lines of sample code take the chunk of 1000 rows starting from 25001, do some processing, and write the processed data into a file to the local disk.
my_df1 <- my_df[25001:26000,]
my_df1$end <- as.POSIXct(paste(my_df1$end,"23:59",sep = ""))
my_df1$year <- lubridate::year(my_df1$start)
str_data <- my_df1
setwd("path_to_local_dir/data25001_26000")
write.table(str_data, file = "data25001-26000.csv",row.names = F,col.names = F,quote = F)
and so on like this:
my_df2 <- my_df[26001:27000,]
...
I would like automate this task such that the chunks of 1000 records are processed and written to a new directory. Any advise on how this could be done?
Consider generalizing your process in a function, data_to_disk, and call function with an iterator method like lapply passing a sequence of integers with seq() for each subsequent thousand. Also, incorporate a dynamic directory creation (but maybe dump all 1,000+ files in one directory instead of 1,000+ dirs?).
data_to_disk <- function(num) {
str_data <- within(my_df[num:(num + 999)], {
end <- as.POSIXct(paste0(end, "23:59"))
year <- lubridate::year($start)
})
my_dir <- paste0("path_to_local_dir/data", num, "_", num + 999)
if(!dir.exists(my_dir)) dir.create(my_dir)
write.table(str_data, file = paste0(my_dir, "/", "data", num, "-", num + 999, ".csv"),
row.names = FALSE, col.names = FALSE, quote = FALSE)
return(my_df)
}
seqs <- seq(25001, nrow(my_df), by=1000)
head(seqs)
# [1] 25001 26001 27001 28001 29001 30001
tail(seqs)
# [1] 1977001 1978001 1979001 1980001 1981001 1982001
# LIST OF 1,958 DATA FRAMES
df_list <- lapply(seqs, data_to_disk)
Here is my code doing the sliced loop:
step1 = 1000
runto = nrow(my_df)
nsteps = ceiling(runto/step1)
for( part in seq_len(nsteps) ) { # part = 1
cat( part, 'of', nsteps, '\n')
fr = (part-1)*step1 + 1
to = min(part*step1, runto)
my_df1 = my_df[fr:to,]
# ...
write.table(str_data, file = paste0("data",fr,"-",to,".csv"))
}
rm(part, step1, runto, nsteps, fr, to)
You can add a grouping variable to your data first (e.g., to identify every 1000 rows), then use d_ply() to split the data and write to file.
df <- data.frame(var=runif(1000000))
df$fold <- cut(seq(1,nrow(df)),breaks=100,labels=FALSE)
df %>% filter(fold<=2) %>% # only writes first two files
d_ply(.,.(fold), function(i){
# make filenames 'data1.csv', 'data2.csv'
write_csv(i,paste0('data',distinct(i,fold),'.csv'))
})
This is similar to #Parfait but takes a lot of stuff out of the function. Specifically, it creates a copy of the entire dataset and then performs the time manipulation functions.
my_df1 <- my_df
my_df1$end <- as.POSIXct(paste(my_df1$end,"23:59",sep = ""))
my_df1$year <- lubridate::year(my_df1$start)
lapply(seq(25001, nrow(my_df1), by = 1000),
function(i) write.table(my_df1[i:i+1000-1,]
, file = paste0('path_to_logal_dir/data'
, i, '-', i+1000-1, '.csv')
,row.names = F,col.names = F,quote = F)
)
For me, I'd probably just do:
write.table(my_df1, file = ...)
and be done with it. I don't see the advantages of splitting it up - 1 million rows really isn't that many.

Breaking the golden rules on FOR loops with R

Firstly, apologies as this may seem a bit long winded, but I hope to give as much information on this problem as I can...
I have written a script that loops through a set of files defined in a csv file. Each file within this csv listing is an XML file, each one is for a particular event in an application, and all files within this list are of the same event type. However, each file can contain different data. For instance, one could hold an attribute with no child nodes beneath, while others contain nodes.
My script works perfectly fine, but when it gets to about XML file 5000, it has slowed down considerably.
Problem is that my code creates a blank dataframe initially, and then grows is at new columns are detected.
I understand that this is a big NO NO when it comes to writing R FOR loops, but am unsure how to get around this problem, give my smallest file listing is 69000, which makes going through each one in turn and counting the nodes a task in itself.
Are there any ideas on how to get around this?
pseudo code or actual R code to do this would be great. So would ideas/opinions, as I am unsure on the best approach to this task.
Here is my current code.
library(XML)
library(xml2)
library(plyr)
library(tidyverse)
library(reshape2)
library(foreign)
library(rio)
# Get file data to be used
#
setwd('c:/temp/xml')
headerNames <- c('GUID','EventId','AppId','RequestFile', 'AE_Type', 'AE_Drive')
GetNames <- rowid_to_column(read.csv(file= 'c:/temp/xml/R_EventIdA.csv', fileEncoding="UTF-8-BOM", header = FALSE, col.names = headerNames),'ID')
inputfiles <- as.character(GetNames[,5]) # Gets list of files
# Create empty dataframes
#
df <- data.frame()
transposed.df1 <- data.frame()
allxmldata <- data.frame()
findchildren<-function(nodes, df) {
numchild <- sapply(nodes, function(x){length(xml_children(x))})
xml.value <- xml_text(nodes[numchild==0])
xml.name <- xml_name(nodes[numchild==0])
xml.path <- sapply(nodes[numchild==0], function(x) {gsub(', ','_', toString(rev(xml_name(xml_parents(x)))))})
fieldname <- paste(xml.path,xml.name,sep = '_')
contents <- sapply(xml.value, function(f){is.na(f)<-which(f == '');f})
if (length(fieldname) > 0) {
fieldname <- paste(fieldname,xml.value, sep = '_')
dftemp <- data.frame(fieldname, contents)
df <- rbind(df, dftemp)
print(dim(df))
}
if (sum(numchild)>0){
findchildren(xml_children(nodes[numchild>0]), df) }
else{ return(df)
}
}
findchildren2<-function(nodes, df){
numchild<-sapply(nodes, function(embeddedinputfile){length(xml_children(embeddedinputfile))})
xmlvalue<-xml_text(nodes[numchild==0])
xmlname<-xml_name(nodes[numchild==0])
xmlpath<-sapply(nodes[numchild==0], function(embeddedinputfile) {gsub(', ','_', toString(rev(xml_name(xml_parents(embeddedinputfile)))))})
fieldname<-paste(xmlpath,xmlname,sep = '_')
contents<-sapply(xmlvalue, function(f){is.na(f)<-which(f == '');f})
if (length(fieldname) > 0) {
dftemp<-data.frame(fieldname, contents)
df<-rbind(df, dftemp)
print(dim(df))
}
if (sum(numchild)>0){
findchildren2(xml_children(nodes[numchild>0]), df) }
else{ return(df)
}
}
# Loop all files
#
for (x in inputfiles) {
df1 <- findchildren(xml_children(read_xml(x)),df)
## original xml dataframe
if (length(df1) > 0) {
xml.df1 <- data.frame(spread(df1, key = fieldname, value = contents), fix.empty.names = TRUE)
}
##
xml.df1 %>%
pluck('Response_RawData') -> rawxml
if (length(rawxml)>0) {
df.rawxml <- data.frame(rawxml)
export(df.rawxml,'embedded.xml')
embeddedinputfile <-as.character('embedded.xml')
rm(df1)
df1 <- findchildren2(xml_children(read_xml(embeddedinputfile)),df)
if (length(df1) > 0) {
xml.df2 <- spread(df1, key = fieldname, value = contents)
}
allxmldata <- rbind.fill(allxmldata,cbind(xml.df1,xml.df2))
} else {
allxmldata <- rbind.fill(allxmldata,cbind(xml.df1))
}
}
if(nrow(allxmldata)==nrow(GetNames)) {
alleventdata<-cbind(GetNames,allxmldata)
}
dbConn2 <- odbcDriverConnect('driver={SQL Server};server=PC-XYZ;database=Events;trusted_connection=true')
sqlSave(dbConn2, alleventdata, tablename = 'AE_EventA', append = TRUE )

For-loop in R to create a new file (but gives incorrect/unexpected output)

I'm currently busy with some data and I need to check their validity.
Therefore, I would like to use a for-loop to go through all my data files.
In this for-loop, I would like to calculate some things (like mean, min,max...).
My code below works but produced an incorrectly written csv file. The problem occurs after the calculations (and their values) are done during csv file creation. CSV:
"c.1..1..1004.89081855716..630.174466667434..461.738905906677.." "c.1..1..950.990843858612..479.98560814955..517.955102920532.."
1 1
1 1
1004.89081855716 950.990843858612
630.174466667434 479.98560814955
461.738905906677 517.955102920532
1535.86795806885 1452.30199813843
-13.3948961645365 3.72026950120926
1259.26423788071 1159.17089223862
Approach/What I'm expecting:
So I start from some data files with eye tracking data in it.
As you can see at the beginning of the code, I try to get some values out of this eye tracking data (validity, new file with only validity == 1 data...). Once I created the filtered_data dataframe, I want to calculate some extra values out of it (mean, sd, min/max).
My plan is to create a new csv file (validity_loop.csv) in which I can find all my calculations (validity_left, validity_right,mean_eye_x, mean_eye_y, min_eye_x,max_eye_x,min_eye_y,max_eye_y). All in a row. One row for each data set (file_list[i]).
Can someone help me in how to tackle and solve this issue?
Here is my code:
set <- setwd("/Users/Sarah/Documents")
file_list <- list.files(set, pattern = ".csv", all.files = TRUE)
validity_list <- data_list <- vector("list", "length" = length(file_list))
for(i in seq_along(file_list)){
filename = file_list[i]
#read files
data_frame = read.csv(filename, sep = ",", dec = ".",
header = TRUE,
stringsAsFactors = FALSE)
#what has to be done
#validity
validity_left <- mean(is.numeric(data_frame$left_gaze_point_validity))
validity_right <-mean(is.numeric(data_frame$right_gaze_point_validity))
#Zuiver dataframe (validity ==1)
to_keep = which(data_frame$left_gaze_point_validity == 1 &
data_frame$right_gaze_point_validity==1)
filtered_data = data_frame[to_keep,]
filtered_data$left_eye_x = as.numeric(filtered_data$left_eye_x)
filtered_data$left_eye_y = as.numeric(filtered_data$left_eye_y)
filtered_data$right_eye_x = as.numeric(filtered_data$right_eye_x)
filtered_data$right_eye_y = as.numeric(filtered_data$right_eye_y)
#1 eye-data
filtered_data$eye_x <- (filtered_data$left_eye_x+filtered_data$right_eye_x)/2
filtered_data$eye_y <- (filtered_data$left_eye_y+filtered_data$right_eye_y)/2
#Pixels
filtered_data$eye_x <- (filtered_data$eye_x)*1920
filtered_data$eye_y <- (filtered_data$eye_y)*1080
#SD and Mean + min-max
mean_eye_x<- mean(filtered_data$eye_x)
mean_eye_y <- mean(filtered_data$eye_y)
sd_eye_x <- sd(filtered_data$eye_x)
sd_eye_y <- sd(filtered_data$eye_y)
min_eye_x <- min(filtered_data$eye_x)
min_eye_y <- min(filtered_data$eye_y)
max_eye_x <- max(filtered_data$eye_x)
max_eye_y <- max(filtered_data$eye_y)
#add everything to new file
validity_list[[i]] <- c(validity_left, validity_right,
mean_eye_x, mean_eye_y,
min_eye_x, min_eye_y,
max_eye_x, max_eye_y)
}
#new document
write.table(validity_list,
file = "Master T&O/Thesis /Loop/Validity/validity_loop.csv",
col.names = TRUE, row.names = FALSE)
I managed to get a new data frame in R, which contains the value of my validity_list as a matrix form.
#FOR LOOP poging 2
set <- setwd("/Users/Sarah/Documents/Master T&O/Thesis /Loop")
file_list <- list.files(set, pattern = ".csv", all.files = TRUE)
validity_list <- vector("list", "length" = length(file_list))
for(i in seq_along(file_list)){
filename = file_list[i]
#read files
data_frame = read.csv(filename, sep = ",", dec = ".", header = TRUE, stringsAsFactors = FALSE)
#what has to be done
#validity
validity_left <- mean(is.numeric(data_frame$left_gaze_point_validity))
validity_right <-mean(is.numeric(data_frame$right_gaze_point_validity))
#Zuiver dataframe (validity ==1)
to_keep = which(data_frame$left_gaze_point_validity == 1 & data_frame$right_gaze_point_validity==1)
filtered_data = data_frame[to_keep,]
filtered_data$left_eye_x = as.numeric(filtered_data$left_eye_x)
filtered_data$left_eye_y = as.numeric(filtered_data$left_eye_y)
filtered_data$right_eye_x = as.numeric(filtered_data$right_eye_x)
filtered_data$right_eye_y = as.numeric(filtered_data$right_eye_y)
#1 eye-data
filtered_data$eye_x <- (filtered_data$left_eye_x+filtered_data$right_eye_x)/2
filtered_data$eye_y <- (filtered_data$left_eye_y+filtered_data$right_eye_y)/2
#Pixels
filtered_data$eye_x <- (filtered_data$eye_x)*1920
filtered_data$eye_y <- (filtered_data$eye_y)*1080
#SD and Mean + min-max
mean_eye_x<- mean(filtered_data$eye_x)
mean_eye_y <- mean(filtered_data$eye_y)
sd_eye_x <- sd(filtered_data$eye_x)
sd_eye_y <- sd(filtered_data$eye_y)
min_eye_x <- min(filtered_data$eye_x)
min_eye_y <- min(filtered_data$eye_y)
max_eye_x <- max(filtered_data$eye_x)
max_eye_y <- max(filtered_data$eye_y)
#add everything to new file
validity_list[[i]] <- c(validity_left, validity_right,mean_eye_x, mean_eye_y, min_eye_x,max_eye_x,min_eye_y,max_eye_y)
validity_matrix <- matrix(unlist(validity_list), ncol = 8, byrow = TRUE)
}
#new document
write.table(validity_matrix, file = "/Users/Sarah/Documents/Master T&O/Thesis /Loop/Validity/validity_loop.csv", dec = ".")
The only problem I have now, is the fact that my values for the validity_list items are wrong, but that's another problem and I'm trying to fix it!
If I get it then the following line grabs all your data together:
validity_list[[i]] <- c (validity_left, validity_right,mean_eye_x,
mean_eye_y, min_eye_x,max_eye_x,min_eye_y,max_eye_y).
if it's like in python then I would have:
validity_list = (validity_left, validity_right,mean_eye_x,
mean_eye_y, min_eye_x,max_eye_x,min_eye_y,max_eye_y)
... whereas the '=' tell the interpreter that everything behind it is a tuple '(', data, ')' ...which makes it one single dataset and if I then write it... it would be end up in one column. If you do a pick using a for-loop I would get "validity_left" writing in a separate column. In your case adding this to your below code an option?
for item in validity_list:
function to process item..etc.

write results sequentially in a loop in r

I have a bunt of single files which need to apply a test. I need to find the way to write automatically results of each file into a file. Here is what I do:
library(ape)
stud_files <- list.files("path/dir/data",full.names = T)
for (f in stud_files) {
df <- read.table(f, header=TRUE, sep=";")
df_xts <- as.xts(df$cola, order.by = as.Date(df$colb,"%m/%d/%Y"))
pet <- testa(df_xts)
res <- data.frame(estimate = pet$estimate,
p.value=pet$p.value,
logi = pet$alternative)
write.dna(res,file = "res_testa.xls",format = "sequential")
}
This loop works well, except the last command which aim to write the results of each file consecutively, it saved only the last performance. And the results save as string, not a table as I define above (data.frame). Any idea in this case? Thanks in advance
Check help(write.dna).
write.dna(x, file, format = "interleaved", append = FALSE,
nbcol = 6, colsep = " ", colw = 10, indent = NULL,
blocksep = 1)
append a logical, if TRUE the data are appended to the file without
erasing the data possibly existing in the file, otherwise the file (if
it exists) is overwritten (FALSE the default).
Set append = TRUE and you should be all set.
As some of the comments point out, however, you are probably better off generating your table, and then writing it all at once to a file. Unless you have billions of files, you likely won't run out of memory.
Here is how I would approach this.
library(ape)
library(data.table)
stud_files <- list.files("path/dir/data",full.names = T)
sumfunc <- function(f) {
df <- read.table(f, header=TRUE, sep=";")
df_xts <- as.xts(df$cola, order.by = as.Date(df$colb,"%m/%d/%Y"))
pet <- testa(df_xts)
res <- data.table(estimate = pet$estimate,
p.value=pet$p.value,
logi = pet$alternative)
return(res)
}
lres <- lapply(stud_files, sumfunc)
dat <- rbindlist(lres)
write.table(dat,
file = "res_testa.csv",
sep = ",",
quote = FALSE,
row.names = FALSE)

Read numeric input as string R

So, i have this input csv of the form,
id,No.,V,S,D
1,0100000109,623,233,331
2,0200000109,515,413,314
3,0600000109,611,266,662
I need to read the No. Column as it is(i.e., as a character). I know i can use something like this for that:
data <- read.csv("input.csv", colClasses = c("MSISDN" = "character"))
I have a code that i'm using to read the csv file in chunks:
chunk_size <- 2
con <- file("input.csv", open = "r")
data_frame <- read.csv(con,nrows = chunk_size,colClasses = c("MSISDN" = "character"),quote="",header = TRUE,)
header <- names(data_frame)
print(header)
print(data_frame)
if(nrow(data_frame) == chunk_size) {
repeat {
data_frame <- read.csv(con,nrows = chunk_size, header = FALSE, quote="")
names(data_frame)<-c(header)
print(header)
print(data_frame)
if(nrow(data_frame) < chunk_size) {
break
}
}
}
close(con)
But, here what the issue i'm facing is that, the first chunk will only read the No. Column as a character, the rest of the chunks will not.
How can i resolve this?
PS: the original input file has about 150+ columns and about 20 Million rows.
You can read the data as string with readLines and split it:
fileName <- "input.csv"
df <- do.call(rbind.data.frame, strsplit(readLines(fileName), ",")[-1]) # skipping headlines
colnames(df) <- c("id","No.","V","S","D") #adding headlines
or the direct approach with read.csv:
fileName <- "input.csv"
col <- c("integer","character","integer","integer","integer")
df <- read.csv(file = fileName,
sep = ",",
colClasses=col,
header = TRUE,
stringsAsFactors = FALSE)
You need to give the column type colClasses in the read.csv() inside the repeat procedure.
You no longer have the header so you need to define an unnamed vector to specify the colClasses.
Let's say the size of colClasses is 150.
myColClasses=rep("numeric",150)
myColClasses[2] <- "character"
repeat {
data_frame <- read.csv(con,nrows = chunk_size, colClasses=myColClasses, header = FALSE, quote="")
...

Resources