R: efficiently bind_rows over many dataframes stored on hard drive

I have roughly 50000 .rda files. Each contains a dataframe named results with exactly one row. I would like to append them all into one dataframe.
I tried the following, which works, but is slow:
root_dir <- paste(path, "models/", sep = "")
files <- paste(root_dir, list.files(root_dir), sep = "")
load(files[1])
results_table <- results
rm(results)
for (i in 2:length(files)) {
  print(paste("We are at step ", i, sep = ""))
  load(files[i])
  results_table <- bind_rows(list(results_table, results))
  rm(results)
}
Is there a more efficient way to do this?

Using .rds is a little easier (a sketch of that route follows the code below). But if we are limited to .rda, the following might be useful. I'm not certain whether it is faster than what you have done:
library(purrr)
library(dplyr)
library(tidyr)
## make and write some sample data to .rda
x <- 1:10
fake_files <- function(x) {
  df <- tibble(x = x)
  save(df, file = here::here(paste0(as.character(x), ".rda")))
  return(NULL)
}
purrr::map(x, ~ fake_files(x = .x))
## map and load the .rda files into a single tibble
load_rda <- function(file) {
  obj_name <- load(file = file) # load() returns the name(s) of the objects it restored
  return(get(obj_name))         # fetch the object by that name, whatever it was called in the .rda
}
rda_files <- tibble(files = list.files(path = here::here(""),
                                       pattern = "\\.rda$",
                                       full.names = TRUE)) %>%
  mutate(data = map(files, ~ load_rda(file = .x))) %>%
  unnest(data)
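For comparison, here is a minimal sketch of the .rds route mentioned above, assuming each file had been written with saveRDS(results, file) rather than save(). readRDS() returns the object directly, so no name juggling is needed:
library(purrr)
rds_files <- list.files(here::here(""), pattern = "\\.rds$", full.names = TRUE)
results_table <- map_dfr(rds_files, readRDS) # read each file and row-bind in one pass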

This is untested code but should be pretty efficient: collecting everything into a list first and binding once avoids copying the growing table on every iteration.
root_dir <- paste(path, "models/", sep = "")
files <- paste(root_dir, list.files(root_dir), sep = "")
data_list <- lapply(files, function(f) {
  message("loading file: ", f)
  name <- load(f)   # load() returns the name of the loaded object
  return(get(name)) # fetch the object by the name stored in `name`
})
results_table <- data.table::rbindlist(data_list)
data.table::rbindlist is very similar to dplyr::bind_rows but a little faster.
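If you want to verify that claim on your own data rather than take it on faith, a quick timing sketch (the bench package is an assumption here; microbenchmark or system.time would do just as well):
library(bench)
bench::mark(
  dplyr = dplyr::bind_rows(data_list),
  dt    = as.data.frame(data.table::rbindlist(data_list)),
  check = FALSE # the two results differ in class, so skip equality checking
)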

Related

R plot for multiple files in folder

Using R, I want to read each file in a folder, perform some operations on it, plot, and save the result as csv1. Then read the next file, perform the same operations, plot, and add the new modified dataframe to csv1 with rbind. Note that I want one plot from all the files read in the for loop, and I want to save that plot as a PDF.
Right now I am using the following code, but my system crashes due to a shortage of RAM:
library(dplyr)
library(data.table)
library(stringr)
all_paths <- list.files(path = "/work/newplots",
                        pattern = "*.*",
                        full.names = TRUE)
all_filenames <- all_paths %>%
  basename() %>%
  as.list()
all_content <- all_paths %>%
  lapply(read.table,
         header = TRUE,
         skip = 60,
         sep = ',',
         encoding = "UTF-8")
file <- data.frame()
for (i in 1:length(all_filenames)) {
  all_lists <- mapply(c, all_content, i, SIMPLIFY = FALSE)
  data <- rbindlist(all_lists, fill = TRUE)
  names(data)[1] <- "File.Path"
  x1 <- data %>% select(V1) %>% unique()
  data <- data %>%
    data.frame(str_split_fixed(data$File.Path, " ", 23)) %>%
    select(-c(File.Path)) %>%
    filter(X1 == 'Interactions')
  data <- cbind(x1, data)
  data <- data %>% select(-c(2)) %>% select(V1, X2)
  data$X2 <- as.numeric(data$X2)
  file <- write.table(data, "/work/con1_10.csv", row.names = FALSE)
  file <- append(file, data)
  p <- plot(data$X2, xlab = "Cycle number", ylab = "Interactions", type = "p")
  print(p)
  Z <- (2 * data$X2) / 20006
  px <- plot(Z, xlab = "Cycle number", ylab = "Z")
  print(px)
}
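For what it's worth, re-binding all_content on every pass through the loop is a likely culprit for the RAM blow-up. A rough sketch of a leaner structure (untested; the per-file column handling is guessed from the code above and will need adjusting): process each file once, keep only the small result, and bind at the end.
library(data.table)
per_file <- lapply(all_paths, function(p) {
  dt <- fread(p, header = TRUE, skip = 60) # read one file
  # ... per-file filtering and column selection would go here ...
  dt
})
data <- rbindlist(per_file, fill = TRUE)   # bind the small pieces once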

Use map() function to read and merge multiple file lists

I have three file lists:
filesl <- list.files(pattern = 'RP-L-.*\\.xls', full.names = TRUE)
filesr <- list.files(pattern = 'RP-R-.*\\.xls', full.names = TRUE)
filesv <- list.files(pattern = 'RP-V-.*\\.xls', full.names = TRUE)
And I've attempted to read and merge two of them as follows:
data <- map2_dfr(filesl, filesr, ~ {
  ldat <- readxl::read_excel(.x)
  rdat <- readxl::read_excel(.y)
  df <- rbind.data.frame(ldat, rdat, by = c('ID', 'GR', 'SES', 'COND'))
})
If I would like to add the third list filesv, what am I supposed to set as the function? I guess I should use one of the purrr functions that iterate over multiple inputs (such as pmap, imap, and so on), but I have not been able to adapt the commands properly.
Any suggestions?
Thanks in advance
data <- pmap_dfr(list(filesl, filesr, filesv), ~ {
  ldat <- readxl::read_excel(..1)
  rdat <- readxl::read_excel(..2)
  vdat <- readxl::read_excel(..3)
  rbind.data.frame(ldat, rdat, vdat) # rbind stacks rows; it takes no `by` argument
})
pmap iterates over a list of lists (or vectors) in parallel; the elements can then be referred to as ..1, ..2, etc. in the anonymous function. Note the by argument has been dropped: rbind simply stacks rows, so if you actually need to merge on 'ID', 'GR', 'SES' and 'COND', you want a join instead.
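If the three sources do not actually need to be read in lockstep, a simpler variant (a sketch, not part of the original answer) is to concatenate the three file vectors and let map_dfr make a single pass:
library(purrr)
data <- map_dfr(c(filesl, filesr, filesv), readxl::read_excel)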

Import multiple CSV files with Softball statistics and plot the progress [duplicate]

I have written the following function to combine 300 .csv files. My directory name is "specdata". These are the steps I ran:
# Step 1: define the function
x <- function(directory) {
  dir <- directory
  data_dir <- paste(getwd(), dir, sep = "/")
  files <- list.files(data_dir, pattern = '\\.csv')
  tables <- lapply(paste(data_dir, files, sep = "/"), read.csv, header = TRUE)
  pollutantmean <- do.call(rbind, tables)
}
# Step 2: call the function
x("specdata")
# Step 3: inspect results
head(pollutantmean)
Error in head(pollutantmean) : object 'pollutantmean' not found
What is my mistake? Can anyone please explain?
There's a lot of unnecessary code in your function. You can simplify it to:
load_data <- function(path) {
  files <- dir(path, pattern = '\\.csv', full.names = TRUE)
  tables <- lapply(files, read.csv)
  do.call(rbind, tables)
}
pollutantmean <- load_data("specdata")
Be aware that do.call + rbind is relatively slow. You might find dplyr::bind_rows or data.table::rbindlist to be substantially faster.
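For instance, the same loader translated to data.table (a sketch; load_data_dt is a hypothetical name, not from the answer):
load_data_dt <- function(path) {
  files <- dir(path, pattern = '\\.csv$', full.names = TRUE)
  data.table::rbindlist(lapply(files, data.table::fread)) # read each file, bind once
}
pollutantmean <- load_data_dt("specdata")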
To update Prof. Wickham's answer above with code from the more recent purrr library, which he coauthored with Lionel Henry:
library(readr)
library(purrr)
Tbl <-
  list.files(pattern = "\\.csv$") %>%
  map_df(~ read_csv(.))
If the column type guessing is being cheeky, you can force all the columns to character with this:
Tbl <-
  list.files(pattern = "\\.csv$") %>%
  map_df(~ read_csv(., col_types = cols(.default = "c")))
If you want to dip into subdirectories to build the list of files to bind, include the path argument and register the files with their full names in your list. That lets the binding work go on outside the current directory. (Think of the full pathnames as passports permitting movement back across directory 'borders'.)
Tbl <-
  list.files(path = "./subdirectory/",
             pattern = "\\.csv$",
             full.names = TRUE) %>%
  map_df(~ read_csv(., col_types = cols(.default = "c")))
As Prof. Wickham describes here (about halfway down):
map_df(x, f) is effectively the same as do.call("rbind", lapply(x, f)) but under the hood is much more efficient.
and a thank you to Jake Kaupp for introducing me to map_df() here.
This can be done very succinctly with dplyr and purrr from the tidyverse. Where x is a list of the names of your csv files, you can simply use:
bind_rows(map(x, read.csv))
Mapping read.csv over x produces a list of data frames that bind_rows then neatly combines!
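Spelled out with the setup included (a sketch; the pattern and working directory are assumptions):
library(dplyr)
library(purrr)
x <- list.files(pattern = "\\.csv$", full.names = TRUE) # the csv files to combine
combined <- bind_rows(map(x, read.csv))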
```{r echo = FALSE, warning = FALSE, message = FALSE}
setwd("~/Data/R/BacklogReporting/data/PastDue/global/") ## where the files are located
path <- "~/Data/R/BacklogReporting/data/PastDue/global/"
out.file <- NULL # rbind(NULL, df) returns df, so this avoids a junk first row
file.names <- dir(path, pattern = "\\.csv$")
for (i in 1:length(file.names)) {
  file <- read.csv(file.names[i], header = TRUE, stringsAsFactors = FALSE)
  out.file <- rbind(out.file, file)
}
write.csv(out.file, file = "~/Data/R/BacklogReporting/data/PastDue/global/global_stacked/past_due_global_stacked.csv", row.names = FALSE) ## directory to write stacked file to
past_due_global_stacked <- read.csv("C:/Users/E550143/Documents/Data/R/BacklogReporting/data/PastDue/global/global_stacked/past_due_global_stacked.csv", stringsAsFactors = FALSE)
files <- list.files(pattern = "\\.csv$") %>% t() %>% paste(collapse = ", ")
```
If your csv files are in another directory, you could use something like this:
readFilesInDirectory <- function(directory, pattern) {
  files <- list.files(path = directory, pattern = pattern, full.names = TRUE)
  # (the original for loop re-read every file on each pass; a single lapply
  # over all the files gives the same result)
  temp <- lapply(files, data.table::fread, sep = ",")
  data <- data.table::rbindlist(temp)
  return(data)
}
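Usage would look something like this (the directory and pattern here are placeholders):
combined <- readFilesInDirectory("./data/", "\\.csv$")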
In your current function, pollutantmean is available only in the scope of the function x. Modify your function to this:
x <- function(directory) {
  dir <- directory
  data_dir <- paste(getwd(), dir, sep = "/")
  files <- list.files(data_dir, pattern = '\\.csv')
  tables <- lapply(paste(data_dir, files, sep = "/"), read.csv, header = TRUE)
  assign('pollutantmean', do.call(rbind, tables), envir = .GlobalEnv)
}
assign with envir = .GlobalEnv puts the result of do.call(rbind, tables) into a variable called pollutantmean in the global environment. (Without that argument, assign writes into the function's own environment, and the object disappears when the function returns.)

Multiple bind_rows with R Dplyr

I need to bind_rows 27 excel files. Although I can do it manually, I want to do it with a loop. The problem with my loop is that each iteration binds the current file to the still-empty df and overwrites the previous result, so every file before the last is lost. How can I fix this?
nm <- list.files(path = "sample/sample/sample/")
df <- data.frame()
for (i in 1:27) {
  my_data <- bind_rows(df, read_excel(path = nm[i]))
}
We could loop over the files with map, read the data with read_excel, and row-bind with the _dfr variant:
library(purrr)
my_data <- map_dfr(nm, read_excel)
In the OP's code, the issue is that each iteration creates a temporary dataset my_data instead of binding the result back onto the accumulating df:
for (i in 1:27) {
  df <- rbind(df, read_excel(path = nm[i]))
}
We could use sapply with simplify = FALSE, which keeps the file names as list names that .id then turns into a column:
result <- sapply(nm, read_excel, simplify = FALSE) %>%
  bind_rows(.id = "id")
You can still use your for loop:
my_data <- vector('list', 27)
for (i in 1:27) {
  my_data[[i]] <- read_excel(path = nm[i]) # note [[i]]: single brackets would mangle the data frame
}
do.call(rbind, my_data)

How to loop through a folder of CSV files in R

I have a folder containing a bunch of CSV files that are titled "yob1980", "yob1981", "yob1982" etc.
I have to use a for loop to go through each file and put its contents into a data frame - the columns in the data frame should be "1980", "1981", "1982" etc
Here is what I have:
file_list <- list.files()
temp = list.files(pattern="*.txt")
babynames <- do.call(rbind,lapply(temp,read.csv, FALSE))
names(babynames) <- c("Name", "Gender", "Count")
I feel like I need a for loop, but I'm not sure how to loop through the files. Anyone point me in the right direction?
My favourite way to do this is using ldply from the plyr package. It has the advantage of returning a dataframe, so you don't need to do the rbind step afterwards:
library( plyr )
babynames <- ldply( .data = list.files(pattern="*.txt"),
.fun = read.csv,
header = FALSE,
col.names=c("Name", "Gender", "Count") )
As an added benefit, you can parallelise the import very easily, making the import of large multi-file datasets quite a bit faster:
library( plyr )
library( doMC )
registerDoMC( cores = 4 )
babynames <- ldply( .data = list.files(pattern="*.txt"),
.fun = read.csv,
header = FALSE,
col.names=c("Name", "Gender", "Count"),
.parallel = TRUE )
Changing the above slightly to include a Year column in the resulting data frame, you can create a function first, then execute that function within ldply in the same way you would execute read.csv:
readFun <- function( filename ) {
  # read in the data
  data <- read.csv( filename,
                    header = FALSE,
                    col.names = c( "Name", "Gender", "Count" ) )
  # add a "Year" column by removing both "yob" and ".txt" from the file name
  # (the dot is escaped so the pattern matches only the literal extension)
  data$Year <- gsub( "yob|\\.txt", "", filename )
  return( data )
}
# execute that function across all files, outputting a data frame
doMC::registerDoMC( cores = 4 )
babynames <- plyr::ldply( .data = list.files(pattern="*.txt"),
.fun = readFun,
.parallel = TRUE )
This will give you your data in a concise and tidy way, which is how I'd recommend moving forward from here. While it is possible to then separate each year's data into its own column, it's likely not the best way to go.
Note: depending on your preference, it may be a good idea to convert the Year column to say, integer class. But that's up to you.
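A more current parallel take on the same idea (a sketch; furrr is my substitution here, it was not in the original answer):
library(furrr) # furrr Depends on future, which supplies plan()
plan(multisession, workers = 4)
babynames <- future_map_dfr(list.files(pattern = "\\.txt$"),
                            read.csv,
                            header = FALSE,
                            col.names = c("Name", "Gender", "Count"))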
Using purrr
library(tidyverse)
files <- list.files(path = "./data/", pattern = "\\.csv$")
df <- files %>%
  map(function(x) {
    read.csv(paste0("./data/", x))
  }) %>%
  reduce(rbind)
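One design caveat: reduce(rbind) copies the accumulated frame at every step, so over many files it is quicker to collect the list and bind once at the end, e.g. (sketch, same files as above):
df <- files %>%
  map(~ read.csv(paste0("./data/", .x))) %>%
  bind_rows()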
A for loop might be more appropriate than lapply in this case.
file_list <- list.files(pattern = "\\.txt$")
data_list <- vector("list", length = length(file_list))
for (i in seq_along(file_list)) {
  filename <- file_list[[i]]
  # Read data in
  df <- read.csv(filename, header = FALSE, col.names = c("Name", "Gender", "Count"))
  # Extract year from filename (strip "yob" and the ".txt" extension)
  year <- gsub("yob|\\.txt$", "", filename)
  df[["Year"]] <- year
  # Add to data_list
  data_list[[i]] <- df
}
babynames <- do.call(rbind, data_list)
Consider an anonymous function within an lapply():
files <- list.files(pattern = "\\.txt$")
dfList <- lapply(files, function(i) {
  df <- read.csv(i, header = FALSE, col.names = c("Name", "Gender", "Count"))
  df$Year <- gsub("yob|\\.txt$", "", i) # strip prefix and extension
  return(df)
})
finaldf <- do.call(rbind, dfList)
