I have 500 tar.xz files containing 2000 csv files. I need to untar them a few tar files at a time (because of disk space), process them into a data.table, delete the csv files from disk, and then save the result as RDS before moving on to the next few tar files.
My function works fine in serial but in parallel it gets the files mixed up between cores. Why is this?
Some sample data:
# df is assumed to be an existing data.frame to sample from (not shown in the question)
for(j in 1:5){
  for(i in 1:5){
    a <- df[sample(x = 1:nrow(df), size = 50, replace = TRUE), ]
    write.csv(a, paste0("seed_", i, ".csv"))
  }
  lf <- list.files(pattern = "\\.csv$")
  tar(tarfile = paste0("seed_", j, ".tar"), files = lf, compression = "xz", tar = "tar")
}
Example code with foreach
require(dplyr)
require(tidyr)
require(foreach)
require(doParallel)
require(magrittr)
#List all tar files in directory
list_of_files<-list.files(pattern = ".tar")
packsINeed<-c("vroom","magrittr","dplyr","tidyr","doParallel")
#Start for loop
myCluster<-makeCluster(6,type="PSOCK")
registerDoParallel(myCluster)
foreach(i= 1:NROW(list_of_files),.packages = packsINeed)%dopar%{
print(paste(list_of_files[i], "which is", i, "of", NROW(list_of_files) ))
print("2. Untar .csv files inside")
untar(tarfile = list_of_files[i], exdir = "tempOutputFiles")
print("#3. Read in files and add up two columns")
df <- vroom::vroom(list.files("tempOutputFiles", pattern = "\\.csv$", full.names = TRUE), id = "path")
df$A<-df$B+df$C
print("#4. save RDS")
saveRDS(object = df, file = paste0(tools::file_path_sans_ext(list_of_files[i], compression = TRUE),".rds"))
print("#5. Clean up files")
.files<-list.files("tempOutputFiles",pattern=".csv")
file.remove(basename(.files))
}
Using mclapply - behaves the same
require(dplyr)
require(tidyr)
require(foreach)
require(doParallel)
require(magrittr)
require(parallel) # mclapply lives in the parallel package
#List all tar files in directory
list_of_files<-list.files(pattern = ".tar")
myParFun <- function(filename){
print(paste(filename))
print("2. Untar all .csv files inside")
untar(tarfile = filename, exdir = "tempOutputFiles")
print("#3. Read in files and add up two columns")
df <- vroom::vroom(list.files("tempOutputFiles", pattern = "\\.csv$", full.names = TRUE), id = "path")
df$A<-df$B+df$C
print("#4. save RDS")
saveRDS(object = df, file = paste0(tools::file_path_sans_ext(filename, compression = TRUE),".rds"))
print("#5. Clean up files")
.files<-list.files("tempOutputFiles",pattern=".csv")
file.remove(.files)
}
mclapply(FUN=myParFun, list_of_files, mc.cores=4)
Based on Waldi's comment I've created a directory for each file in list_of_files and it now works fine. But is there a cleaner approach? Using tempdir(), for example?
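For example, something along these lines is what I have in mind (untested sketch, reusing list_of_files, packsINeed and the cluster from above), where each iteration gets its own folder from tempfile() so workers never share an extraction directory:
foreach(i = seq_along(list_of_files), .packages = packsINeed) %dopar% {
  exdir <- tempfile("untar_")   # unique folder per iteration / worker
  dir.create(exdir)
  untar(tarfile = list_of_files[i], exdir = exdir)
  csvs <- list.files(exdir, pattern = "\\.csv$", full.names = TRUE)
  df <- vroom::vroom(csvs, id = "path", altrep = FALSE)
  df$A <- df$B + df$C
  saveRDS(df, paste0(tools::file_path_sans_ext(list_of_files[i], compression = TRUE), ".rds"))
  unlink(exdir, recursive = TRUE)   # drop the whole per-iteration folder
}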
As suggested in the comments, the code below creates one directory per process / tar file, untars it, merges the CSVs into a .rds file and deletes them.
Note that it seems that vroom needs the altrep = FALSE argument to avoid a permission denied error at deletion.
# Generate sample tars for test
write.csv(mtcars,'file1.csv')
write.csv(mtcars,'file2.csv')
write.csv(iris,'file3.csv')
write.csv(iris,'file4.csv')
tar('tar1.tar',files=c('file1.csv','file2.csv'),tar="tar")
tar('tar2.tar',files=c('file3.csv','file4.csv'),tar="tar")
require(dplyr)
require(tidyr)
require(foreach)
require(doParallel)
require(magrittr)
#List all tar files in directory
list_of_files<-list.files(pattern = "\\.tar")
packsINeed<-c("vroom","magrittr","dplyr","tidyr","doParallel")
#Start for loop
myCluster<-makeCluster(2,type="PSOCK")
registerDoParallel(myCluster)
foreach(i= 1:NROW(list_of_files),.packages = packsINeed)%dopar%{
print(paste(list_of_files[i], "which is", i, "of", NROW(list_of_files) ))
print("2. Untar .csv files inside")
fileout <- tools::file_path_sans_ext(list_of_files[i], compression = TRUE)
exdir <- paste0("temp",fileout)
untar(tarfile = list_of_files[i], exdir = exdir)
print("#3. Read in files and add up two columns")
df <- vroom::vroom(file.path(exdir, dir(exdir, pattern = "\\.csv$")), altrep = FALSE)
# df$A<-df$B+df$C # These columns don't exist in mtcars used as example
print("#4. save RDS")
saveRDS(object = df, file = file.path(exdir,paste0(fileout,".rds")))
print("#5. Clean up files")
.files<-list.files(exdir,pattern="\\.csv")
file.remove(file.path(exdir,.files))
}
Not sure where the .rds files should go, so for the time being they are left in the temporary folder.
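If the .rds files should instead sit next to the tar files, one small change (untested sketch, same variables as above) is to save them in the working directory and then drop the whole extraction folder in one go:
saveRDS(object = df, file = paste0(fileout, ".rds"))   # lands next to the .tar files
unlink(exdir, recursive = TRUE)                        # removes the temp folder and its csv files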
Related
I am looping fread over many .csv files in the working directory, but when I write each one out as a .parquet it overwrites the same file every time.
I have tried several approaches; basically I am trying to use the file name i to keep the .csv file name, as shown below, without overwriting it.
rm(list = ls())
gc()
# Set up environment #
require("data.table")
require("arrow")
# Set directory to data, define files #
setwd("E:/TransferComplete/07/")
files <- list.files(pattern = "csv")
for (i in files){
setwd("E:/TransferComplete/07/")
loopStart <- Sys.time()
bb <- fread(i,header = TRUE,sep = ",", data.table = FALSE, stringsAsFactors = FALSE,
select = c("x","y","z"))
gc()
write_parquet(bb,
'E:/P/i.parquet')
loopEnd <- Sys.time()
loopTime <- round(as.numeric(loopEnd) - as.numeric(loopStart), 0)
}
Replace this
write_parquet(bb,
'E:/P/i.parquet')
with this
write_parquet(bb,paste0('E:/P/',i,'.parquet'))
You were very close in your question. When writing the .parquet, you need to paste the value of i into the file name; otherwise the loop keeps overwriting a single file called i.parquet.
write_parquet(bb,paste0('E:/P/',i,'.parquet'))
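For completeness, a sketch of the corrected loop (same paths and columns as in the question; tools::file_path_sans_ext is optional and just avoids output names like file.csv.parquet):
library(data.table)
library(arrow)
setwd("E:/TransferComplete/07/")
files <- list.files(pattern = "\\.csv$")
for (i in files){
  bb <- fread(i, header = TRUE, sep = ",", data.table = FALSE, stringsAsFactors = FALSE,
              select = c("x","y","z"))
  write_parquet(bb, paste0('E:/P/', tools::file_path_sans_ext(i), '.parquet'))   # one parquet per csv
}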
I am trying to loop the
ncin_old<-nc_open("filename", write=TRUE, readunlim=TRUE, verbose=FALSE,
auto_GMT=TRUE, suppress_dimvals=FALSE )
function like this
library(ncdf.tools)
library(ncdf4)
library(ncdf4.helpers)
library(RNetCDF)
library(abind)
setwd("D:/Rwork/Project") # set working folder
# This is the directory where the file for analysing are
dir("D:/Rwork/Project/MASTER_FILES")-> xlab
filelist <- paste("MASTER_FILES/", dir("MASTER_FILES"), sep="")
N <- length(filelist) # Loop over the individual files
for(j in 1:N) {
ncin_old <- nc_open("filelist[j]", write=TRUE, readunlim=TRUE, verbose=FALSE,
auto_GMT=TRUE, suppress_dimvals=FALSE )
}
But I get this error
Error in nc_open("filelist[j]", write = TRUE, readunlim = TRUE,
verbose = FALSE, : Error in nc_open trying to open file
filelist[j]
If I drop everything after filelist[j], the last file in the loop opens, but nc_open(x, write = TRUE) does not seem to like being looped.
I have fixed some issues in your code, as shown below. I think it is correct now.
library(ncdf4)
# set the folder with the files
setwd("D:/Rwork/Project/MASTER_FILES")
# you need the files path, not the directory path
# list only the files with the .nc extension
filelist <- list.files(pattern = "\\.nc$")
# Loop over the individual files
# The filelist cannot be between quotation marks as in your code
N <- length(filelist)
for(j in 1:N) {
ncin_old <- nc_open(filelist[j], write=TRUE, readunlim=TRUE, verbose=FALSE,
auto_GMT=TRUE, suppress_dimvals=FALSE)
}
I've used lapply:
library(ncdf4)
# set the folder that contains all the files
setwd("C:/...")
# create a list with the files with the .nc extension
filelist <- list.files(pattern = "\\.nc$")
filelist # it contains all the .nc files
# Open all the files at once with lapply (no explicit for loop needed)
all_nc_files <- lapply(filelist, nc_open)
Running that, I obtain "all_nc_files", a list containing every opened .nc file, and now I can work with them.
Hope it works!
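For example, once the list exists you can inspect a single file and close every handle when you are done (sketch):
ncin <- all_nc_files[[1]]                    # first opened file
names(ncin$var)                              # variables it contains
invisible(lapply(all_nc_files, nc_close))    # close all handles when finished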
I'm trying to read a list of files and append them into a new file with all the records. I do not intend to change anything in the original files. I've tried couple of methods.
Method 1: This method creates a new file, but at each iteration the previous file gets added again because I'm binding the data frame recursively.
files <- list.files(pattern = "\\.csv$")
#temparary data frame to load the contents on the current file
temp_df <- data.frame(ModelName = character(), Object = character(),stringsAsFactors = F)
#reading each file within the range and append them to create one file
for (i in 1:length(files)){
#read the file
currentFile = read.csv(files[i])
#Append the current file
temp_df = rbind(temp_df, currentFile)
}
#writing the appended file
write.csv(temp_df,"Models_appended.csv",row.names = F,quote = F)
Method 2: I got this method from R-bloggers. This method won't write to a new file but keeps modifying the original file.
multmerge = function(){
filenames= list.files(pattern = "\\.csv$")
datalist = lapply(filenames, function(x){read.csv(file=x,header=T)})
Reduce(function(x,y) {merge(x,y)}, datalist)
}
Can someone advise me on how to achieve my goal?
It could look like this:
files <- list.files(pattern = "\\.csv$")
DF <- read.csv(files[1])
#reading each file within the range and append them to create one file
for (f in files[-1]){
df <- read.csv(f) # read the file
DF <- rbind(DF, df) # append the current file
}
#writing the appended file
write.csv(DF, "Models_appended.csv", row.names=FALSE, quote=FALSE)
or short:
files <- list.files(pattern = "\\.csv$")
DF <- read.csv(files[1])
for (f in files[-1]) DF <- rbind(DF, read.csv(f))
write.csv(DF, "Models_appended.csv", row.names=FALSE, quote=FALSE)
You can use this to load everything into one data set.
dataset <- do.call("rbind", lapply(file.list, FUN = function(file) {
read.table(file, header=TRUE, sep="\t")
}))
And then just save with write.csv.
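For example, writing the combined result out afterwards (same file name as in the question):
write.csv(dataset, "Models_appended.csv", row.names = FALSE, quote = FALSE)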
Or you could go ahead and use shell commands in R:
system2("cat", args = "*.csv", stdout = "appendedfiles.csv")
This would be for Unix-based systems; I'm not sure what you would do for Windows.
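One option on Windows (untested sketch) is to hand the concatenation to cmd.exe through shell(), which understands built-in commands such as copy:
shell('copy /b *.csv appendedfiles.csv')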
Try this for your ListOfFileNames:
ListOfFileNames<-list.files(pattern="\\.txt$")
outFile <- file("all.txt", "w")
for (i in ListOfFileNames){
x <- readLines(i)
writeLines(x, outFile) # in the link the 1st and last line are skipped
}
close(outFile)
Source: https://r.789695.n4.nabble.com/R-Read-multiple-text-files-and-combine-into-single-file-td817344.html
There's an easy answer with rbind_list() now!
For instance:
library(purrr)   # provides map()
library(dplyr)   # rbind_list() came from dplyr (see note below)
dataFiles = map(Sys.glob("*.csv"), read.csv)
or however you're reading the files into a list
dat = rbind_list(dataFiles)
and dat will be what you're looking for!
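Note that rbind_list() comes from older versions of dplyr; in current releases the equivalent is bind_rows(), which drops in on the same list (sketch, reusing dataFiles from above):
dat = bind_rows(dataFiles)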
If using Windows, it's easy to do this using the command prompt.
Start -> Run -> type "cmd" and hit return key
cd <path to folder>
copy /b *.txt <outputname>.txt
Concrete example:
cd C:\User\danny\docs\folder_with_txt files
copy /b *.txt concatenated.txt
Note that if you're changing drive letters, you need to do this before cd:
D:\> c:
C:\> cd C:\User\danny\docs\folder_with_txt files
copy /b *.txt concatenated.txt
I'm sure this must have been answered somewhere, so if you have a pointer to an answer that helps, please let me know... ;o)
I have a number of fairly sizeable processing tasks (mainly multi-label text classifiers) which read in large volumes of files, do stuff with that, output a result then move on to the next.
I have this working neatly sequentially but wanted to parallelise things.
By way of a really basic example...
require(plyr)
fileDir <- "/Users/barneyc/sourceFiles"
outputDir <- "/Users/barneyc/outputFiles"
files <- as.list(list.files(full.names=TRUE,recursive=FALSE,pattern=".csv"))
l_ply(files, function(x){
print(x)
#change to dir containing source files
setwd(fileDir)
# read file
content <- read.csv(file=x,header=TRUE)
# change directory to output
setwd(outputDir)
# append the itemID from CSV file to
write.table(content$itemID,file="ids.csv", append = TRUE, sep=",", row.names=FALSE,col.names=TRUE)
}, .parallel=FALSE )
Will iterate through all the files in directory fileDir, opening each CSV, extracting a value from the file and appending this to an output CSV held in the directory outputDir. A basic example but it runs just fine to illustrate the problem.
Running this in parallel creates a problem insofar as the directory variables (fileDir & outputDir) are essentially unknown to the anonymous function (x), like so...
require(plyr)
require(doParallel)
fileDir <- "/Users/barneyc/sourceFiles"
outputDir <- "/Users/barneyc/outputFiles"
files <- as.list(list.files(full.names=TRUE,recursive=FALSE,pattern=".csv"))
cl<-makeCluster(4) # make a cluster of available cores
registerDoParallel(cl) # raise cluster
l_ply(files, function(x){
print(x)
#change to dir containing source files
#setwd(fileDir)
# read file
content <- read.csv(file=x,header=TRUE)
# change directory to output
setwd(y)
# append the itemID from CSV file to
write.table(content$itemID,file="ids.csv", append = TRUE, sep=",", row.names=FALSE,col.names=TRUE)
}, .parallel=TRUE )
stopCluster(cl) # kill the cluster
Can anyone shed light on how I pass those two directory variables through to the function here?
So thanks to @Roland my parallel function would now be...
require(plyr)
require(doParallel)
fileDir <- "/Users/barneyc/sourceFiles"
outputDir <- "/Users/barneyc/outputFiles"
files <- as.list(list.files(full.names=TRUE,recursive=FALSE,pattern=".csv"))
cl<-makeCluster(4) # make a cluster of available cores
registerDoParallel(cl) # raise cluster
l_ply(files, function(x,y,z){
filename <- x
fileDir <- y
outputDir <- z
#change to dir containing source files
setwd(fileDir)
# read file
content <- read.csv(file=filename,header=TRUE)
# change directory to output
setwd(outputDir)
# append the itemID from CSV file to
write.table(content$itemID,file="ids.csv", append = TRUE, sep=",", row.names=FALSE,col.names=TRUE)
}, y=fileDir, z=outputDir, .parallel=TRUE )
stopCluster(cl) # kill the cluster
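An alternative worth noting (untested sketch, same variables as above) is to avoid setwd() altogether and work with full paths, which is possible here because files was built with full.names = TRUE:
l_ply(files, function(x, outputDir){
  content <- read.csv(file = x, header = TRUE)          # x is already a full path
  write.table(content$itemID,
              file = file.path(outputDir, "ids.csv"),   # full output path, no setwd() needed
              append = TRUE, sep = ",", row.names = FALSE,
              col.names = FALSE)                        # FALSE so the header is not repeated on every append
}, outputDir = outputDir, .parallel = TRUE)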
I need to automate R to read a csv datafile that's inside a zip file.
For example, I would type:
read.zip(file = "myfile.zip")
And internally, what would be done is:
Unzip myfile.zip to a temporary folder
Read the only file contained in it using read.csv
If there is more than one file in the zip file, an error is thrown.
My problem is to get the name of the file contained in the zip file, in order to provide it to the read.csv command. Does anyone know how to do it?
UPDATE
Here's the function I wrote based on @Paul's answer:
read.zip <- function(zipfile, row.names=NULL, dec=".") {
# Create a name for the dir where we'll unzip
zipdir <- tempfile()
# Create the dir using that name
dir.create(zipdir)
# Unzip the file into the dir
unzip(zipfile, exdir=zipdir)
# Get the files into the dir
files <- list.files(zipdir)
# Throw an error if there's more than one
if(length(files)>1) stop("More than one data file inside zip")
# Get the full name of the file
file <- paste(zipdir, files[1], sep="/")
# Read the file
read.csv(file, row.names = row.names, dec = dec)
}
Since I'll be working with more files inside the tempdir(), I created a new dir inside it, so I don't get confused with the files. I hope it may be useful!
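Usage is then simply (assuming myfile.zip contains a single csv, as in the question):
df <- read.zip("myfile.zip")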
Another solution using unz:
read.zip <- function(file, ...) {
zipFileInfo <- unzip(file, list=TRUE)
if(nrow(zipFileInfo) > 1)
stop("More than one data file inside zip")
else
read.csv(unz(file, as.character(zipFileInfo$Name)), ...)
}
You can use unzip to unzip the file. I just mention this as it is not clear from your question whether you knew that. In regard to reading the file: once you've extracted the file to a temporary dir (?tempdir), just use list.files to find the files that were dumped into the temporary directory. In your case this is just one file, the file you need. Reading it using read.csv is then quite straightforward:
l = list.files(temp_path, full.names = TRUE)
read.csv(l[1])
assuming your tempdir location is stored in temp_path.
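Putting those pieces together, a minimal sketch (the zip name is just an example; temp_path is created fresh with tempfile()):
temp_path <- tempfile()            # name for the extraction directory
dir.create(temp_path)
unzip("myfile.zip", exdir = temp_path)
l <- list.files(temp_path, full.names = TRUE)
if (length(l) > 1) stop("More than one data file inside zip")
df <- read.csv(l[1])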
I found this thread as I was trying to automate reading multiple csv files from a zip. I adapted the solution to the broader case. I haven't tested it for weird filenames or the like, but this is what worked for me so I thought I'd share:
read.csv.zip <- function(zipfile, ...) {
# Create a name for the dir where we'll unzip
zipdir <- tempfile()
# Create the dir using that name
dir.create(zipdir)
# Unzip the file into the dir
unzip(zipfile, exdir=zipdir)
# Get a list of csv files in the dir
files <- list.files(zipdir)
files <- files[grep("\\.csv$", files)]
# Create a list of the imported csv files
csv.data <- sapply(files, function(f) {
fp <- file.path(zipdir, f)
return(read.csv(fp, ...))
})
return(csv.data)
}
If you have zcat installed on your system (which is the case for Linux, macOS, and Cygwin) you could also use:
zipfile<-"test.zip"
myData <- read.delim(pipe(paste("zcat", zipfile)))
This solution also has the advantage that no temporary files are created.
Here is an approach I am using that is based heavily on @Corned Beef Hash Map's answer. Here are some of the changes I made:
My approach makes use of the data.table package's fread(), which can be fast (generally, if it's zipped, sizes might be large, so you stand to gain a lot of speed here!).
I also adjusted the output format so that it is a named list, where each element of the list is named after the file. For me, this was a very useful addition.
Instead of using regular expressions to sift through the files grabbed by list.files, I make use of list.files()'s pattern argument.
Finally, by relying on fread() and by making pattern an argument to which you could supply something like "" or NULL or ".", you can use this to read in many types of data files; in fact, you can read in multiple types at once (if your .zip contains .csv and .txt files and you want both, e.g.). If there are only some types of files you want, you can specify the pattern to use only those, too.
Here is the actual function:
library(data.table) # for fread()
read.csv.zip <- function(zipfile, pattern="\\.csv$", ...){
# Create a name for the dir where we'll unzip
zipdir <- tempfile()
# Create the dir using that name
dir.create(zipdir)
# Unzip the file into the dir
unzip(zipfile, exdir=zipdir)
# Get a list of csv files in the dir
files <- list.files(zipdir, rec=TRUE, pattern=pattern)
# Create a list of the imported csv files
csv.data <- sapply(files,
function(f){
fp <- file.path(zipdir, f)
dat <- fread(fp, ...)
return(dat)
}
)
# Use csv names to name list elements
names(csv.data) <- basename(files)
# Return data
return(csv.data)
}
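A hypothetical call (the file name is just an example; extra arguments are passed on to fread()):
csv.data <- read.csv.zip("myfile.zip")
names(csv.data)   # one element per csv inside the zip, named after the file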
The following refines the above answers. FUN could be read.csv, cat, or anything you like, provided its first argument will accept a file path. E.g.
head(read.zip.url("http://www.cms.gov/Medicare/Coding/ICD9ProviderDiagnosticCodes/Downloads/ICD-9-CM-v32-master-descriptions.zip", filename = "CMS32_DESC_LONG_DX.txt"))
read.zip.url <- function(url, filename = NULL, FUN = readLines, ...) {
zipfile <- tempfile()
download.file(url = url, destfile = zipfile, quiet = TRUE)
zipdir <- tempfile()
dir.create(zipdir)
unzip(zipfile, exdir = zipdir) # no files argument given, so everything is extracted
files <- list.files(zipdir)
if (is.null(filename)) {
if (length(files) == 1) {
filename <- files
} else {
stop("multiple files in zip, but no filename specified: ", paste(files, collapse = ", "))
}
} else { # filename specified
stopifnot(length(filename) ==1)
stopifnot(filename %in% files)
}
do.call(FUN, args = c(list(file.path(zipdir, filename)), list(...)))
}
Another approach that uses fread from the data.table package:
library(data.table)
fread.zip <- function(zipfile, ...) {
# Function reads data from a zipped csv file
# Uses fread from the data.table package
## Create the temporary directory or flush CSVs if it exists already
if (!file.exists(tempdir())) {dir.create(tempdir())
} else {file.remove(list.files(tempdir(), full = TRUE, pattern = "\\.csv$"))
}
## Unzip the file into the dir
unzip(zipfile, exdir=tempdir())
## Get path to file
file <- list.files(tempdir(), pattern = "\\.csv$", full.names = TRUE)
## Throw an error if there's more than one
if(length(file)>1) stop("More than one data file inside zip")
## Read the file
fread(file,
na.strings = c(""), # read empty strings as NA
...
)
}
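For example (hypothetical file and column names; any extra arguments go straight to fread):
dt <- fread.zip("myfile.zip", select = c("x", "y", "z"))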
Based on the answer/update by @joão-daniel:
# unzipped file location
outDir <- "~/Documents/unzipFolder"
# get all the zip files
zipF <- list.files(path = "~/Documents/", pattern = "\\.zip$", full.names = TRUE)
# unzip all your files
purrr::map(.x = zipF, .f = unzip, exdir = outDir)
I just wrote a function based on the top read.zip answer that may help...
read.zip <- function(zipfile, internalfile=NA, read.function=read.delim, verbose=TRUE, ...) {
# function based on http://stackoverflow.com/questions/8986818/automate-zip-file-reading-in-r
# check the files within zip
unzfiles <- unzip(zipfile, list=TRUE)
if (is.na(internalfile) || is.numeric(internalfile)) {
internalfile <- unzfiles$Name[ifelse(is.na(internalfile),1,internalfile[1])]
}
# Create a name for the dir where we'll unzip
zipdir <- tempfile()
# Create the dir using that name
if (verbose) catf("Directory created:",zipdir,"\n")
dir.create(zipdir)
# Unzip the file into the dir
if (verbose) catf("Unzipping file:",internalfile,"...")
unzip(zipfile, file=internalfile, exdir=zipdir)
if (verbose) catf("Done!\n")
# Get the full name of the file
file <- paste(zipdir, internalfile, sep="/")
if (verbose)
on.exit({
catf("Done!\nRemoving temporal files:",file,".\n")
file.remove(file)
file.remove(zipdir)
})
else
on.exit({file.remove(file); file.remove(zipdir);})
# Read the file
if (verbose) catf("Reading File...")
read.function(file, ...)
}
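A couple of hypothetical calls (file and member names are examples only):
read.zip("myfile.zip")                                                       # first file in the zip, read with read.delim
read.zip("myfile.zip", internalfile = "data.csv", read.function = read.csv) # a named csv inside the zip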