using rbind problem inside nested for loop - r

I am trying to open and read multiple NetCDF files and I want to save the result to one list of many data frames.
in my working directory, I have the folder "main_folder" contains five folders (x1,x2,x3,x4, and x5) each folder of these five contains a different number of subfolders, let me say for example the folder x1 contains subfolders from folder "y1", to "y20". the folder y1 contains n1 number of NetCDF files. the folder y2 contains n2 number of NetCDF files and so on. similarly for the other folders x2, x3,x4,x5.
From the folder x1, I want to open, read and get the variables from all NetCDF files and make them as one data frame df1.
and from the folder x2, I want to make the second data frame df2 and so on.
at the End I will be having five data frames corresponding to each folder content. and then I want to make a list of these five data frames.
I wrote one code, it works except one problem which is the second data frame in the list contain the data of df1 appended to it the data of the second file df2. and df5 contains the data of df1+df2+df3+df4+df5.
How can I solve this problem.
here is my code
setwd("E:/main_folder")
#1# list all files in the main_folder
folders<- as.list(list.files("E:/main_folder"))
#2# make list of subfiles
subfiles<- lapply(folders, function(x) as.list(list.files(paste("E:/main_folder",x, sep="/"))))
#3# list the netcdf files from each subfiles
files1<- lapply(subfiles[[1]], function(x) list.files(paste( folders[1],x, sep = "/"),pattern='*.nc',full.names=TRUE))
files2<- lapply(subfiles[[2]], function(x) list.files(paste( folders[2],x, sep = "/"),pattern='*.nc',full.names=TRUE))
files3<- lapply(subfiles[[3]], function(x) list.files(paste( folders[3],x, sep = "/"),pattern='*.nc',full.names=TRUE))
files4<- lapply(subfiles[[4]], function(x) list.files(paste( folders[4],x, sep = "/"),pattern='*.nc',full.names=TRUE))
files5<- lapply(subfiles[[5]], function(x) list.files(paste( folders[5],x, sep = "/"),pattern='*.nc',full.names=TRUE))
#4# join all files in one list
filelist<- list(files1,files2,files3,files4,files5)
#5# Read the NetCDF and get the desired variables
df<- data.frame()
MissionsData<- list()
for (i in seq_along(filelist)){
n<- length(filelist[[i]])
for (j in 1:n){
for( m in 1:length( filelist[[i]][[j]])){
nc<- nc_open(filelist[[i]][[j]][[m]])
lat<- ncvar_get(nc, "glat.00")
lon<- ncvar_get(nc, "glon.00")
ssh<- ncvar_get(nc, "ssh.53")
jdn<- ncvar_get(nc, "jday.00")
df<- rbind(df,data.frame(lat,lon,ssh,jdn))
nc_close(nc)
}
}
MissionsData[[i]]<- df
}
In addition, Can I make step #3# in one go instead of typing them manually?

#3 Nesting the code inside another `lapply` should do the job:
filelist = lapply(subfiles, function(subfile){
lapply(subfile, function(x) list.files(paste(folders[1],x, sep = "/"),
pattern='*.nc', full.names=TRUE))
})
#This might work as #5.
#It was written without reproducible code so I didn't test it
MissionsData = lapply(filelist, function(x){
# I don't see the j and m indexes used for any other purpose than looping
# so I just unlist these files into a vector
files_i = unlist(x, recursive = TRUE)
df_list = lapply(files_i, function(file_i){
nc = nc_open(file_i)
lat = ncvar_get(nc, "glat.00")
lon = ncvar_get(nc, "glon.00")
ssh = ncvar_get(nc, "ssh.53")
jdn = ncvar_get(nc, "jday.00")
nc_close(nc)
return(data.frame(lat,lon,ssh,jdn))
})
df = do.call(rbind, df_list)
})

Related

Loop function for reading csv files and store them in a list

I have a folder in which are stored approximately 10 subfolders, containing 4 .csv files each! Each subfolder corresponds to a weather station, and each file in each subfolder contain temperature data for different period
(e.g. station134_2000_2005.csv,station134_2006_2011.csv,station134_2012_2018.csv etc.) .
I wrote a loop for opening each folder, and rbind all data in one data frame but it is not very handy to do my work.
I need to create a loop so that those 4 files from each subfolder, rbined together to a dataframe, and then stored in a different "slot" in a list, or if it's easier,each station rbined csv data (namely each subfolder) to be exported from the loop as dataframe.
The code I wrote, which opens all files in all folders and create a big (rbined) data frame is:
directory <- list.files() # to have the names of each subfolder
stations <- data.frame() # to store all the rbined csv files
library(plyr)
for(i in directory){
periexomena <- list.files(i,full.names = T, pattern = "\\.csv$")
for(f in periexomena){
data_files <- read.csv(f, stringsAsFactors = F, sep = ";", dec = ",")
stations <- rbind.fill(data_files,stations)
}
Does anyone knows how can I have a list with each subfolder's rbined 4 csv files data in different slot, or how can I modify the abovementioned code in order to export in different data frame, the data from each subfolder?
Try:
slotted <- lapply(setNames(nm = directory), function(D) {
alldat <- lapply(list.files(D, pattern="\\.csv$", full.names=TRUE),
function(fn) {
message(fn)
read.csv2(fn, stringsAsFactors=FALSE)
})
# stringsAsFactors=F should be the default as of R-3.6, I believe
do.call(rbind.fill, alldat)
})

Save multiple files with multiple names with R

I have three files (T1.dnd, T2.dnd, T3.dnd) in which in all of them i have to substitude a specific row with another one with mgsub function. After doing that i have to save these files separately in a new folder (Output) with their respective names (T1.dnd, T2.dnd, T3.dnd) with a for loop. However the loop save only the last file (T3.dnd) and not all three (so T1.dnd and T2.dnd are missing).
How can i save all the three files with their names?
Any help will be greatly appreciated.
library(mgsub)
setwd("C:/Users/feder/Project/test")
treat <- as.list(c("C:/Users/feder/Project/test/T1.dnd",
"C:/Users/feder/Project/test/T2.dnd",
"C:/Users/feder/Project/test/T3.dnd"))
step <- list()
step2 <- list()
step <- lapply(treat, readLines)
step2 <- lapply(step, mgsub, c("______Leaf_fraction__________0.3100"),
c("______Leaf_fraction__________0.1500"))
files <- list.files("C:/Users/feder/Project/test")
names_files <- as.list(files)
savewd <- c("C:/Users/feder/Project/Output")
for (i in length(step2)){
writeLines(step2[[i]], paste(savewd, names_files[[i]], sep = "/"))
}

Select CSV files and read in pairs

I am comparing two pairs of csv files each at a time. The files I have each end with a number like cars_file2.csv, Lorries_file3.csv, computers_file4.csv, phones_file5.csv. I have like 70 files per folder and the way I am comparing is, I compare cars_file2.csv and Lorries_file3.csv then Lorries_file3.csv and
computers_file4.csv, and the pattern is 2,3,3,4,4,5 like that. Is there a smart way I can handle this instead of manually coming back and change file like the way I am reading here or I can use the last number on each csv to read them smartly. NOTE the files have same suffixes _file:
library(daff)
setwd("path")
# Load csvs to compare into data frames
x_original <- read.csv("cars_file2.csv", strip.white=TRUE, stringsAsFactors = FALSE)
x_changed <- read.csv("Lorries_file3.csv", strip.white=TRUE, stringsAsFactors = FALSE)
render(diff_data(x_original,x_changed ,ignore_whitespace=TRUE,count_like_a_spreadsheet = FALSE))
My intention is to compare each two pairs of csv and recorded, Field additions, deletions and modified
You may want to load all files at once and do your comparison with a full list of files.
This may help:
# your path
path <- "insert your path"
# get folders in this path
dir_data <- as.list(list.dirs(path))
# get all filenames
dir_data <- lapply(dir_data,function(x){
# list of folders
files <- list.files(x)
files <- paste(x,files,sep="/")
# only .csv files
files <- files[substring(files,nchar(files)-3,nchar(files)) %in% ".csv"]
# remove possible errors
files <- files[!is.na(files)]
# save if there are files
if(length(files) >= 1){
return(files)
}
})
# delete NULL-values
dir_data <- compact(dir_data)
# make it a named vector
dir_data <- unique(unlist(dir_data))
names(dir_data) <- sub(pattern = "(.*)\\..*$", replacement = "\\1", basename(dir_data))
names(dir_data) <- as.numeric(substring(names(dir_data),nchar(names(dir_data)),nchar(names(dir_data))))
# remove possible NULL-values
dir_data <- dir_data[!is.na(names(dir_data))]
# make it a list again
dir_data <- as.list(dir_data)
# load data
data_upload <- lapply(dir_data,function(x){
if(file.exists(x)){
data <- read.csv(x,header=T,sep=";")
}else{
data <- "file not found"
}
return(data)
})
# setup for comparison
diffs <- lapply(as.character(sort(as.numeric(names(data_upload)))),function(x){
# check if the second dataset exists
if(as.character(as.numeric(x)+1) %in% names(data_upload)){
# first dataset
print(data_upload[[x]])
# second dataset
print(data_upload[[as.character(as.numeric(x)+1)]])
# do your operations here
comparison <- render(diff_data(data_upload[[x]],
data_upload[[as.character(as.numeric(x)+1)]],
ignore_whitespace=T,count_like_a_spreadsheet = F))
numbers <- c(x, as.numeric(x)+1)
# save both the comparison data and the numbers of the datasets
return(list(comparison,numbers))
}
})
# you can find the differences here
diffs
This script loads all csv-files in a folder and its sub-folders and puts them into a list by their numbers. In case there are no doubles, this will work. If you have doubles, you will have to adjust the part where the vector is named so that you can index the full names of the files afterwards.
A simple for- loop using paste will read-in the pairs:
for (i in 1:70) { # assuming the last pair is cars_file70.csv and Lorries_file71.csv
x_original <- read.csv(paste0("cars_file",i,".csv"), strip.white=TRUE, stringsAsFactors = FALSE)
x_changed <- read.csv(paste0("Lorries_file3",i+1,".csv"), strip.white=TRUE, stringsAsFactors = FALSE)
render(diff_data(x_original,x_changed ,ignore_whitespace=TRUE,count_like_a_spreadsheet = FALSE))
}
For simplicity I used 2 .csv files.
csv_1
1,2,4
csv_2
1,8,10
Load all the .csv files from folder,
files <- dir("Your folder path", pattern = '\\.csv', full.names = TRUE)
tables <- lapply(files, read.csv)
#create empty list to store comparison output
diff <- c()
Loop through all loaded files and compare,
for (pos in 1:length(csv)) {
if (pos != length(csv)) { #ignore last one
#save comparison output
diff[[pos]] <- diff_data(as.data.frame(csv[pos]), as.data.frame(csv[pos + 1]), ignore_whitespace=TRUE,count_like_a_spreadsheet = FALSE)
}
}
Compared output by diff
[[1]]
Daff Comparison: ‘as.data.frame(tables[pos])’ vs. ‘as.data.frame(tables[pos + 1])’
+++ +++ --- ---
## X1 X8 X10 X2 X4

applying same function on multiple files in R

I am new to R program and currently working on a set of financial data. Now I got around 10 csv files under my working directory and I want to analyze one of them and apply the same command to the rest of csv files.
Here are all the names of these files: ("US%10y.csv", "UK%10y.csv", "GER%10y.csv","JAP%10y.csv", "CHI%10y.csv", "SWI%10y.csv","SOA%10y.csv", "BRA%10y.csv", "CAN%10y.csv", "AUS%10y.csv")
For example, because the Date column in CSV files are Factor so I need to change them to Date format:
CAN <- read.csv("CAN%10y.csv", header = T, sep = ",")
CAN$Date <- as.character(CAN$Date)
CAN$Date <- as.Date(CAN$Date, format ="%m/%d/%y")
CAN_merge <- merge(all.dates.frame, CAN, all = T)
CAN_merge$Bid.Yield.To.Maturity <- NULL
all.dates.frame is a data frame of 731 consecutive days. I want to merge them so that each file will have the same number of rows which later enables me to combine 10 files together to get a 731 X 11 master data frame.
Surely I can copy and paste this code and change the file name, but is there any simple approach to use apply or for loop to do that ???
Thank you very much for your help.
This should do the trick. Leave a comment if a certain part doesn't work. Wrote this blind without testing.
Get a list of files in your current directory ending in name .csv
L = list.files(".", ".csv")
Loop through each of the name and reads in each file, perform the actions you want to perform, return the data.frame DF_Merge and store them in a list.
O = lapply(L, function(x) {
DF <- read.csv(x, header = T, sep = ",")
DF$Date <- as.character(CAN$Date)
DF$Date <- as.Date(CAN$Date, format ="%m/%d/%y")
DF_Merge <- merge(all.dates.frame, CAN, all = T)
DF_Merge$Bid.Yield.To.Maturity <- NULL
return(DF_Merge)})
Bind all the DF_Merge data.frames into one big data.frame
do.call(rbind, O)
I'm guessing you need some kind of indicator, so this may be useful. Create a indicator column based on the first 3 characters of your file name rep(substring(L, 1, 3), each = 731)
A dplyr solution (though untested since no reproducible example given):
library(dplyr)
file_list <- c("US%10y.csv", "UK%10y.csv", "GER%10y.csv","JAP%10y.csv", "CHI%10y.csv", "SWI%10y.csv","SOA%10y.csv", "BRA%10y.csv", "CAN%10y.csv", "AUS%10y.csv")
can_l <- lapply(
file_list
, read.csv
)
can_l <- lapply(
can_l
, function(df) {
df %>% mutate(Date = as.Date(as.character(Date), format ="%m/%d/%y"))
}
)
# Rows do need to match when column-binding
can_merge <- left_join(
all.dates.frame
, bind_cols(can_l)
)
can_merge <- can_merge %>%
select(-Bid.Yield.To.Maturity)
One possible solution would be to read all the files into R in the form of a list, and then use lapply to to apply a function to all data files. For example:
# Create vector of file names in working direcotry
files <- list.files()
files <- files[grep("csv", files)]
#create empty list
lst <- vector("list", length(files))
#Read files in to list
for(i in 1:length(files)) {
lst[[i]] <- read.csv(files[i])
}
#Apply a function to the list
l <- lapply(lst, function(x) {
x$Date <- as.Date(as.character(x$Date), format = "%m/%d/%y")
return(x)
})
Hope it's helpful.

How to apply a function to every possible pairwise combination of files stored in a common directory

I have a directory containing a large number of csv files. I would like to load the data into R and apply a function to every possible pair combination of csv files in the directory, then write the output to file.
The function that I would like to apply is matchpt() from the biobase library which compares locations between two data frames.
Here is an example of what I would like to do (although I have many more files than this):
Three files in directory: A, B and C
Perform matchpt on each pairwise combination:
nn1 = matchpt(A,B)
nn2 = matchpt(A,C)
nn3 = matchpt(B,C)
Write nn1, nn2 and nn3 to csv file.
I have not been able to find any solutions for this yet and would appreciate any suggestions. I am really not sure where to go from here but I am assuming that some sort of nested for loop is required to somehow cycle sequentially through all pairwise combinations of files. Below is a beginning at something but this only compares the first file with all the others in the directory so does not work!
library("Biobase")
# create two lists of identical filenames stored in the directory:
filenames1 = list.files(path=dir, pattern="csv$", full.names=FALSE, recursive=FALSE)
filenames2 = list.files(path=dir, pattern="csv$", full.names=FALSE, recursive=FALSE)
for(i in 1:length(filenames2)){
# load the first data frame in list 1
df1 <- lapply(filenames1[1], read.csv, header=TRUE, stringsAsFactors=FALSE)
df1 <- data.frame(df1)
# load a second data frame from list 2
df2 <- lapply(filenames2[i], read.csv, header=TRUE, stringsAsFactors=FALSE)
df2 <- data.frame(df2)
# isolate the relevant columns from within the two data frames
dat1 <- as.matrix(df1[, c("lat", "long")])
dat2 <- as.matrix(df2[, c("lat", "long")])
# run the matchpt function on the two data frames
nn <- matchpt(dat1, dat2)
#Extract the unique id code in the two filenames (for naming the output file)
file1 = filenames1[1]
code1 = strsplit(file1,"_")[[1]][1]
file2 = filenames2[i]
code2 = strsplit(file2,"_")[[1]][1]
outname = paste(code1, code2, sep=”_”)
outfile = paste(code, "_nn.csv", sep="")
write.csv(nn, file=outname, row.names=FALSE)
}
Any suggestions on how to solve this problem would be greatly appreciated. Many thanks!
You could do something like:
out <- combn( list.files(), 2, FUN=matchpt )
write.table( do.call( rbind, out ), file='output.csv', sep=',' )
This assumes that matchpt is expecting 2 strings with the names of the files and that the result is the same structure each time so that the rbinding makes sense.
You could also write your own function to pass to combn that takes the 2 file names, runs matchpt and then appends the results to the csv file. Remember that if you pass an open filehandle to write.table then it will append to the file instead of overwriting what is there.
Try this example:
#dummy filenames
filenames <- paste0("file_",1:5,".txt")
#loop through unique combination
for(i in 1:(length(filenames)-1))
for(j in (i+1):length(filenames))
{
flush.console()
print(paste("i=",i,"j=",j,"|","file1=",filenames[i],"file2=",filenames[j]))
}
In response to my question I seem to have found a solution. The below uses a for loop to perform every pairwise combination of files in a common directory (this seems to work and gives EVERY combination of files i.e. A & B and B & A):
# create a list of filenames
filenames = list.files(path=dir, pattern="csv$", full.names=FALSE, recursive=FALSE)
# For loop to compare the files
for(i in 1:length(filenames)){
# load the first data frame in the list
df1 = lapply(filenames[i], read.csv, header=TRUE, stringsAsFactors=FALSE)
df1 = data.frame(df1)
file1 = filenames[i]
code1 = strsplit(file1,"_")[[1]][1] # extract unique id code of file (in case where the id comes before an underscore)
# isolate the columns of interest within the first data frame
d1 <- as.matrix(df1[, c("lat_UTM", "long_UTM")])
# load the comparison file
for (j in 1:length(filenames)){
# load the second data frame in the list
df2 = lapply(filenames[j], read.csv, header=TRUE, stringsAsFactors=FALSE)
df2 = data.frame(df2)
file2 = filenames[j]
code2 = strsplit(file2,"_")[[1]][1] # extract uniqe id code of file 2
# isolate the columns of interest within the second data frame
d2 <- as.matrix(df2[, c("lat_UTM", "long_UTM")])
# run the comparison function on the two data frames (in this case matchpt)
out <- matchpt(d1, d2)
# Merge the unique id code in the two filenames (for naming the output file)
outname = paste(code1, code2, sep="_")
outfile = paste(outname, "_out.csv", sep="")
# write the result to file
write.csv(out, file=outfile, row.names=FALSE)
}
}

Resources