read tables and assign to a string in a loop in R

I'm sure there is a trivial answer to this, but I can't seem to find the right code. I have a list of files and a list of strings to which I would like to assign the contents of those files as dataframes. Then I would like to perform other operations on the dataframes within the same loop. I also need to keep each dataframe for downstream work. Here is my code:
samples <- c('fc14','g14','fc18','g18','fc21','g21')
fc_samples <- grep("fc", samples, value=TRUE)
fc_files <- c('fc14_g14_full_annot_uniq.txt','fc18_g18_full_annot_uniq.txt','fc21_g21_full_annot_uniq.txt')
# make dataframes
for (file in fc_files)
{ fc_n <- 1
g_n <- 1
print(file);
# THE BIT THAT DOESN'T WORK
assign(paste("data", fc_samples[fc_n], sep='_'), read.table(file,sep = "\t", header=T));
# HERE I EXPECT THE TOP OF MY DF TO BE PRINTED BUT IT ISN'T
head(data_fc14);
# I TRY THIS INSTEAD
do.call("<-",list(paste("data", fc_samples[fc_n], sep='_'), read.table(file,sep = "\t", header=T)))
# I TRY TO PRINT THE DF AGAIN BUT STILL NO LUCK
head(paste("data", fc_samples[fc_n], sep='_'))
# FIRST DOWNSTREAM THING I WOULD LIKE TO DO,
# WON'T WORK UNTIL I SOLVE THE DF ASSIGNMENT ISSUE
names(paste("data", fc_samples[fc_n], sep='_'))[names(paste("data", fc_samples[fc_n], sep='_'))==c('SAMPLE_fc','CHROM_fc','START_fc','REF_fc','ALT_fc','REGION_fc','DP_fc','FREQ_fc','GENE_fc','AFFECTS_fc','dbSNP_fc',
'NOVEL_fc')] <- c('SAMPLE','CHROM','START','REF','ALT','REGION','DP','FREQ','GENE','AFFECTS','dbSNP','NOVEL')
# ITERATE TO THE NEXT FILE
fc_n <- fc_n+1
}
I tried solutions from here and here, but they didn't help.
If anyone has an elegant solution to this then that would be great! Thanks in advance!

Fixing your code:
samples <- c('fc14','g14','fc18','g18','fc21','g21')
fc_samples <- grep("fc", samples, value=TRUE)
# Make dummy example files
fc_files <- file.path("example-data", c(
'fc14_g14_full_annot_uniq.txt','fc18_g18_full_annot_uniq.txt',
'fc21_g21_full_annot_uniq.txt'))
set.seed(123)
dummy_df <- setNames(
as.data.frame(replicate(12, rnorm(7))),
c('SAMPLE_fc','CHROM_fc','START_fc','REF_fc','ALT_fc','REGION_fc',
'DP_fc','FREQ_fc','GENE_fc','AFFECTS_fc','dbSNP_fc','NOVEL_fc')
)
if (!dir.exists("./example-data")) dir.create("example-data")
invisible({
lapply(fc_files, write.table, x = dummy_df, sep = "\t")
})
# "fc_n <- 1" should be outside the loop:
fc_n <- 1
for (file in fc_files) {
g_n <- 1
assign(paste("data", fc_samples[fc_n], sep='_'),
read.table(file,sep = "\t", header=T))
# Copy data to be able to change its names
f <- get(paste("data", fc_samples[fc_n], sep='_'))
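# Note: the == comparison below relies on the names appearing in exactly
# this order (elementwise comparison); %in% would be more robust if the
# column order can vary.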
names(f)[names(f) == c('SAMPLE_fc','CHROM_fc','START_fc',
'REF_fc','ALT_fc','REGION_fc',
'DP_fc','FREQ_fc','GENE_fc','AFFECTS_fc',
'dbSNP_fc','NOVEL_fc')] <-
c('SAMPLE','CHROM','START','REF','ALT','REGION','DP','FREQ',
'GENE','AFFECTS','dbSNP','NOVEL')
# Assign it back, now that names have been changed
assign(paste("data", fc_samples[fc_n], sep='_'), f)
fc_n <- fc_n+1
}
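A quick sanity check after the loop (the assigned objects now exist in the global environment, so this prints the top of the data frame as originally expected):
head(data_fc14)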
A "more elegant" way:
Using assign() is not considered best practice; rather, work with lists. (Though I occasionally use it myself; there are sometimes good reasons to.)
# For the '%>%' pipe
library(magrittr)
data <-
samples %>%
grep(pattern = "fc", value = TRUE) %>%
setNames(nm = .) %>%
lapply(grep, x = fc_files, value = TRUE) %>%
lapply(read.table, sep = "\t", header = TRUE) %>%
lapply(function(f) setNames(f, sub("_fc", "", names(f))))
identical(data_fc14, data$fc14)
# [1] TRUE
identical(data_fc18, data$fc18)
# [1] TRUE
identical(data_fc21, data$fc21)
# [1] TRUE
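If you still prefer standalone data_* objects over a list, list2env() can create them; a minimal sketch, assuming the data list built above:
# Sketch: copies the list elements into the global environment
# as data_fc14, data_fc18, data_fc21
list2env(setNames(data, paste0("data_", names(data))), envir = globalenv())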
# Clean up
print(unlink("example-data", recursive = TRUE))

samples <- c('fc14','g14','fc18','g18','fc21','g21')
fc_samples <- grep("fc", samples, value=TRUE)
fc_files <- c('fc14_g14_full_annot_uniq.txt','fc18_g18_full_annot_uniq.txt','fc21_g21_full_annot_uniq.txt')
g_files <- c('g14_full_annot_uniq.txt','g18_full_annot_uniq.txt','g21_full_annot_uniq.txt')
# make dataframes
df_names <- c("data_fc14","data_fc18","data_fc21")
fc_n <- 1
for (file in fc_files)
{
assign(df_names[fc_n], read.table(file,sep = "\t", header=T)); #WORKS
#do.call("<-",list(paste("data", fc_samples[fc_n], sep='_'), read.table(file,sep = "\t", header=T))); #ALSO WORKS
print(head(df_names[fc_n])) # note: this prints the name string, not the data
print(head(eval(as.symbol(df_names[fc_n]))))
df <- eval(as.symbol(df_names[fc_n]))
names(df)[names(df) == c('SAMPLE_fc','CHROM_fc','START_fc','REF_fc','ALT_fc','REGION_fc','DP_fc','FREQ_fc','GENE_fc','AFFECTS_fc','dbSNP_fc',
'NOVEL_fc')] <- c('SAMPLE','CHROM','START','REF','ALT','REGION','DP','FREQ','GENE','AFFECTS','dbSNP','NOVEL')
assign(df_names[fc_n], df)
print(head(eval(as.symbol(df_names[fc_n]))))
print(file);
fc_n <- fc_n+1
}
Thanks to all who helped. I solved it using the advice from "apom" in the end, as it is the most intuitive approach for more novice R users.

Related

generate variable names in for loop

Hope you don't mind if this is too easy for you.
In R, I am using fromJSON() to read from 3 URLs (tier-1 URLs). The JSON contains a "link" field which gives me another URL (tier-2 URL), which I then pass to read.table() to get my final data. My code now is like this:
# note, this code does not run
urlJohn <- www.foo1.com
urlJane <- www.foo2.com
urlJoe <- www.foo3.com
tempJohn <- fromJSON(urlJohn)
tempJohn[["data"]][["rows"]]$link %<>%
{clean up this data}
dataJohn <- read.table(tempJohn[["data"]][["rows"]]$link,
header = TRUE,
sep = ",")
tempJane <- fromJSON(urlJane)
tempJane[["data"]][["rows"]]$link %<>%
{clean up this data}
dataJane <- read.table(tempJane[["data"]][["rows"]]$link,
header = TRUE,
sep = ",")
tempJoe <- fromJSON(urlJoe)
tempJoe[["data"]][["rows"]]$link %<>%
{clean up this data}
dataJoe <- read.table(tempJoe[["data"]][["rows"]]$link,
header = TRUE,
sep = ",")
As you can see, I am just copying-n-pasting code blocks. What I wish is this:
# note, this code also does not run
urlJohn <- www.foo1.com
urlJane <- www.foo2.com
urlJoe <- www.foo3.com
source <- c("John", "Jane", "joe")
for (i in source){
temp <- paste(temp, i, sep = "")
url <- paste(url, i, sep = "")
data <- paste(data, i, sep = "")
temp <- fromJSON(url)
temp[["data"]][["rows"]]$link %<>%
{clean up this data}
data <- read.table(temp[["data"]][["rows"]]$link,
header = TRUE,
sep = ",")
}
What do I need to do to make the for loop work? If my question is not clear, please ask me to clarify it.
I usually find lapply more convenient than a for loop, although you can easily convert this to a for loop if needed.
URLs <- c('www.foo1.com', 'www.foo2.com', 'www.foo3.com')
lapply(URLs, function(x) {
temp <- jsonlite::fromJSON(x)
temp[["data"]][["rows"]]$link %<>% {clean up this data}
read.table(temp[["data"]][["rows"]]$link,header = TRUE,sep = ",")
}) -> list_data
list_data
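To keep track of which element came from which URL, you can also name the list; the names here are illustrative:
names(list_data) <- c("John", "Jane", "Joe")
list_data$John # the data read from www.foo1.com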
Thanks to @Ronak Shah. The R community strongly favors non-for-loop solutions; the way to get my desired result is lapply.
Below is non-running code in mnemonics:
URLs <- c('www.foo1.com', 'www.foo2.com', 'www.foo3.com')
lapply(URLs, function(x) {
temp <- jsonlite::fromJSON(x)
x <- temp[["data"]][["rows"]]$link %<>% {clean up this data}
y <- read.table(temp[["data"]][["rows"]]$link,header = TRUE,sep = ",")
return(list(x, y))
})
And this is a running example.
x <- list(alpha = 1:10,
beta = exp(-3:3),
logic = c(TRUE,FALSE,FALSE,TRUE))
lapply(x, function(x){
temp <- sum(x) / 2
temp2 <- list(x,
temp)
return(temp2)
}
)

R efficiently bind_rows over many dataframes stored on hard drive

I have roughly 50000 .rda files. Each contains a dataframe named results with exactly one row. I would like to append them all into one dataframe.
I tried the following, which works, but is slow:
root_dir <- paste(path, "models/", sep="")
files <- paste(root_dir, list.files(root_dir), sep="")
load(files[1])
results_table = results
rm(results)
for (i in 2:length(files)) {
print(paste("We are at step ", i, sep=""))
load(files[i])
results_table <- bind_rows(list(results_table, results))
rm(results)
}
Is there a more efficient way to do this?
Using .rds is a little bit easier. But if we are limited to .rda the following might be useful. I'm not certain if this is faster than what you have done:
library(purrr)
library(dplyr)
library(tidyr)
## make and write some sample data to .rda
x <- 1:10
fake_files <- function(x){
df <- tibble(x = x)
save(df, file = here::here(paste0(as.character(x),
".rda")))
return(NULL)
}
purrr::map(x,
~fake_files(x = .x))
## map and load the .rda files into a single tibble
load_rda <- function(file) {
foo <- load(file = file) # foo just provides the name of the objects loaded
return(df) # note df is the name of the rda returned object
}
rda_files <- tibble(files = list.files(path = here::here(""),
pattern = "*.rda",
full.names = TRUE)) %>%
mutate(data = pmap(., ~load_rda(file = .x))) %>%
unnest(data)
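For comparison, a minimal sketch of the .rds route mentioned above, assuming the same data frames had been written with saveRDS() instead of save():
## .rds files deserialize straight back to the object, so no name bookkeeping
rds_files <- list.files(here::here(""), pattern = "\\.rds$", full.names = TRUE)
results_table <- dplyr::bind_rows(lapply(rds_files, readRDS))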
This is untested code but should be pretty efficient:
root_dir <- paste(path, "models/", sep="")
files <- paste(root_dir, list.files(root_dir), sep="")
data_list <- lapply("mydata.rda", function(f) {
message("loading file: ", f)
name <- load(f) # this should capture the name of the loaded object
return(eval(parse(text = name))) # returns the object with the name saved in `name`
})
results_table <- data.table::rbindlist(data_list)
data.table::rbindlist is very similar to dplyr::bind_rows but a little faster.
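If the one-row data frames don't all share identical columns, rbindlist() can also pad the gaps; a small sketch:
# use.names matches columns by name; fill = TRUE fills missing columns with NA
results_table <- data.table::rbindlist(data_list, use.names = TRUE, fill = TRUE)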

R: Split and write very large data frame into slices

I have a large dataframe my_df in R containing 1983000 records. The following lines of sample code take the chunk of 1000 rows starting from 25001, do some processing, and write the processed data into a file to the local disk.
my_df1 <- my_df[25001:26000,]
my_df1$end <- as.POSIXct(paste(my_df1$end,"23:59",sep = ""))
my_df1$year <- lubridate::year(my_df1$start)
str_data <- my_df1
setwd("path_to_local_dir/data25001_26000")
write.table(str_data, file = "data25001-26000.csv",row.names = F,col.names = F,quote = F)
and so on like this:
my_df2 <- my_df[26001:27000,]
...
I would like to automate this task so that each chunk of 1000 records is processed and written to a new directory. Any advice on how this could be done?
Consider generalizing your process in a function, data_to_disk, and call the function with an iterator method like lapply, passing a sequence of integers built with seq() for each subsequent thousand. Also, incorporate dynamic directory creation (but maybe dump all 1,000+ files in one directory instead of 1,000+ dirs?).
data_to_disk <- function(num) {
str_data <- within(my_df[num:(num + 999), ], {
end <- as.POSIXct(paste0(end, "23:59"))
year <- lubridate::year(start)
})
my_dir <- paste0("path_to_local_dir/data", num, "_", num + 999)
if(!dir.exists(my_dir)) dir.create(my_dir)
write.table(str_data, file = paste0(my_dir, "/", "data", num, "-", num + 999, ".csv"),
row.names = FALSE, col.names = FALSE, quote = FALSE)
return(str_data) # return the processed slice, collected in df_list below
}
seqs <- seq(25001, nrow(my_df), by=1000)
head(seqs)
# [1] 25001 26001 27001 28001 29001 30001
tail(seqs)
# [1] 1977001 1978001 1979001 1980001 1981001 1982001
# LIST OF 1,958 DATA FRAMES
df_list <- lapply(seqs, data_to_disk)
Here is my code doing the sliced loop:
step1 = 1000
runto = nrow(my_df)
nsteps = ceiling(runto/step1)
for( part in seq_len(nsteps) ) { # part = 1
cat( part, 'of', nsteps, '\n')
fr = (part-1)*step1 + 1
to = min(part*step1, runto)
my_df1 = my_df[fr:to,]
# ... process my_df1 into str_data here ...
write.table(str_data, file = paste0("data",fr,"-",to,".csv"))
}
rm(part, step1, runto, nsteps, fr, to)
You can add a grouping variable to your data first (e.g., to identify every 1000 rows), then use d_ply() from plyr to split the data and write each group to file:
library(plyr)
library(dplyr)
library(readr)
df <- data.frame(var=runif(1000000))
df$fold <- cut(seq(1,nrow(df)),breaks=100,labels=FALSE)
df %>% filter(fold<=2) %>% # only writes first two files
d_ply(.,.(fold), function(i){
# make filenames 'data1.csv', 'data2.csv'
write_csv(i,paste0('data',distinct(i,fold),'.csv'))
})
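The same idea works without plyr in current dplyr/purrr; a sketch under the same fold setup (group_split() splits, walk() writes):
library(purrr)
df %>%
filter(fold <= 2) %>% # again, only writes the first two files
group_split(fold) %>%
walk(~ write_csv(.x, paste0("data", .x$fold[1], ".csv")))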
This is similar to @Parfait's answer but takes a lot of the work out of the function. Specifically, it creates a copy of the entire dataset first and then performs the time manipulations on that copy.
my_df1 <- my_df
my_df1$end <- as.POSIXct(paste(my_df1$end,"23:59",sep = ""))
my_df1$year <- lubridate::year(my_df1$start)
lapply(seq(25001, nrow(my_df1), by = 1000),
function(i) write.table(my_df1[i:(i + 999), ]
, file = paste0('path_to_local_dir/data'
, i, '-', i + 999, '.csv')
, row.names = F, col.names = F, quote = F)
)
For me, I'd probably just do:
write.table(my_df1, file = ...)
and be done with it. I don't see the advantages of splitting it up - 1 million rows really isn't that many.

How to loop through a folder of CSV files in R

I have a folder containing a bunch of CSV files that are titled "yob1980", "yob1981", "yob1982" etc.
I have to use a for loop to go through each file and put its contents into a data frame - the columns in the data frame should be "1980", "1981", "1982" etc
Here is what I have:
file_list <- list.files()
temp = list.files(pattern="*.txt")
babynames <- do.call(rbind,lapply(temp,read.csv, FALSE))
names(babynames) <- c("Name", "Gender", "Count")
I feel like I need a for loop, but I'm not sure how to loop through the files. Anyone point me in the right direction?
My favourite way to do this is using ldply from the plyr package. It has the advantage of returning a dataframe, so you don't need to do the rbind step afterwards:
library( plyr )
babynames <- ldply( .data = list.files(pattern="*.txt"),
.fun = read.csv,
header = FALSE,
col.names=c("Name", "Gender", "Count") )
As an added benefit, you can multi-thread the import very easily, making importing large multi-file datasets quite a bit faster:
library( plyr )
library( doMC )
registerDoMC( cores = 4 )
babynames <- ldply( .data = list.files(pattern="*.txt"),
.fun = read.csv,
header = FALSE,
col.names=c("Name", "Gender", "Count"),
.parallel = TRUE )
Changing the above slightly to include a Year column in the resulting data frame: create a function first, then execute that function within ldply in the same way you would execute read.csv.
readFun <- function( filename ) {
# read in the data
data <- read.csv( filename,
header = FALSE,
col.names = c( "Name", "Gender", "Count" ) )
# add a "Year" column by removing both "yob" and ".txt" from file name
data$Year <- gsub( "yob|.txt", "", filename )
return( data )
}
# execute that function across all files, outputting a data frame
doMC::registerDoMC( cores = 4 )
babynames <- plyr::ldply( .data = list.files(pattern="*.txt"),
.fun = readFun,
.parallel = TRUE )
This will give you your data in a concise and tidy way, which is how I'd recommend moving forward from here. While it is possible to then separate each year's data into its own column, it's likely not the best way to go.
Note: depending on your preference, it may be a good idea to convert the Year column to say, integer class. But that's up to you.
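For example:
babynames$Year <- as.integer(babynames$Year)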
Using purrr
library(tidyverse)
files <- list.files(path = "./data/", pattern = "*.csv")
df <- files %>%
map(function(x) {
read.csv(paste0("./data/", x))
}) %>%
reduce(rbind)
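A slightly tighter variant of the same idea, as a sketch: map_dfr() row-binds as it reads, and full.names = TRUE avoids the paste0().
df <- list.files(path = "./data/", pattern = "\\.csv$", full.names = TRUE) %>%
map_dfr(read.csv)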
A for loop might be more appropriate than lapply in this case.
file_list = list.files(pattern="*.txt")
data_list <- vector("list", "length" = length(file.list))
for (i in seq_along(file_list)) {
filename = file_list[[i]]
# Read data in
df <- read.csv(filename, header = FALSE, col.names = c("Name", "Gender", "Count"))
# Extract year from filename
year = gsub("yob|\\.txt", "", filename)
df[["Year"]] = year
# Add year to data_list
data_list[[i]] <- df
}
babynames <- do.call(rbind, data_list)
Consider an anonymous function within an lapply():
files = list.files(pattern="*.txt")
dfList <- lapply(files, function(i) {
df <- read.csv(i, header=FALSE, col.names=c("Name", "Gender", "Count"))
df$Year <- gsub("yob", "", i)
return(df)
})
finaldf <- do.call(rbind, dfList)

looping a function on a df list and saving results in R

I have different dataframes and what I want to do is:
apply a function repeatedly to each dataframe
save the results of each repetition to a new dataframe, keeping the name of the original dataframe and adding something else to differentiate it
Here is what I have tried until now:
# read all files to list
dataframes <- dir( pattern = ".txt")
list_dataframes <- llply(dataframes, read.csv, header = T, sep =" ", dec=".", na.string = "nd")
n <- length(dataframes)
# apply myfunction 10 times
for (j in 1:10){
modified_list <- llply(list_dataframes, myfunction)
}
if (j <10){
num.char <- paste("n0", j, sep="")
} else num.char <- paste("n", j, sep="")
# save back data frames
for (i in 1:n)
write.table(file = paste( "newfile/_modified",num.char, ".csv", sep = ""),
modified_list[i], row.names = F)
What I want as a result is the modified dataframes (in this case the 10 repetitions for each df of the list) that will have:
the name of the original df
the new name
and the number of iteration
Something like originaldfname_newname_n0.
I can't find where I'm messing up. Any help will be deeply appreciated!
Two major issues, I think:
the } (line 9 above) should be after your second for loop;
your last line should probably reference modified_list[[i]] instead of using the single-[ notation.
So your code should work (untested, slightly modified for style) as:
library(plyr)
# read all files to list
dataframes <- dir(pattern = ".txt")
list_dataframes <- llply(dataframes, read.csv,
header = TRUE, sep = " ", dec = ".", na.strings = "nd")
n <- length(dataframes)
# apply myfunction 10 times
for (j in 1:10) {
modified_list <- llply(list_dataframes, myfunction)
# save back data frames
for (i in 1:n)
write.table(file = sprintf("newfile/%s_newname_%02d.csv", dataframes[i], j),
modified_list[[i]], row.names = FALSE)
}
If this were code golf, the last portion could be reduced a little with:
for (j in 1:10) {
mapply(function(df, nm) write.csv(file = sprintf('newfile/%s_newname_%02d.csv', nm, j),
df, row.names = FALSE),
llply(list_dataframes, myfunction), dataframes)
}
(This doesn't necessarily make it clearer, but it does reduce things a bit. Use it if at some point you prefer not to use for loops, though the performance in this case will be almost identical.)
Note:
Please include required libraries, e.g., library(plyr).
Though lapply would have worked just fine, I kept the use of llply to match your example.
