loop over variable that is regular expression in r - r

I am searching for files in a folder by using a regular expression, such as:
# Loop over regular-expression patterns; "\\d" matches a single digit.
for (i in c('exp\\d_baseline', 'exp\\d_treatment',
            'control\\d_baseline', 'control\\d_treatment')) {
  # All files in file_path whose names match the current pattern.
  file.list <- dir(file_path, pattern = i, full.names = TRUE)
  # ...irrelevant manipulation on these files here (concatenation)
  # `i` still holds the regex text, so the literal "\d" ends up in the name.
  output_name <- paste0(i, '_concat.csv')
}
This gives me output filenames that still contain the \d from the regular expression. How can I remove the backslash sequence when building my output filenames?

One option is to call sub() with fixed = TRUE before paste0(), so that "\\d" is matched as the literal text \d rather than as a regular expression:
# With fixed = TRUE the pattern "\\d" is matched as literal text (a backslash
# followed by "d"), not as a regular expression; sub() removes its first match.
output_name <- paste0(sub("\\d", "", i, fixed = TRUE), "_concat.csv")

Related

Error when looping through files and outputting separate files in R?

I have a list of 50 text files all beginning with NEW.
I want to loop through each textfile/dataframe and run some function and then output the results via the write.table function. Therefore for each file, a function is applied and then an output should be created containing the original name with output at the end.
Here is my code.
fileNames <- Sys.glob("*NEW.*")
for (fileName in fileNames) {
df <- read.table(fileName, header = TRUE)
FUNCTION (not shown as this works)
...
result <-print(chr1$results) #for each file a result would be printed.
write.table(result, file = paste0(fileName,"_output.txt"), quote = F, sep = "\t", row.names = F, col.names = T)
#for each file a new separate file is created with the original output name retained.
}
However, I only get one output file rather than 50. It seems like it's only looping through one file. What am I doing wrong?
#' Read every .txt file under a folder into a data frame
#'
#' @param folder_name Directory that is searched recursively for files whose
#'   names end in ".txt".
#' @return A data frame with one row per file: `doc_id` is the file path as
#'   returned by `list.files(full.names = TRUE)` and `doc_text` is the file's
#'   lines joined with single spaces. Zero rows when no file matches.
readme <- function(folder_name = "my_texts") {
  # `pattern` is a regular expression, not a glob: anchor on a literal ".txt"
  # at the end of the name so that e.g. "ctxt.csv" is not picked up.
  file_list <- list.files(
    path = folder_name,
    pattern = "\\.txt$",
    recursive = TRUE,
    full.names = TRUE
  )

  # Collapse each file into a single string. vapply() guarantees a character
  # vector, including character(0) when file_list is empty.
  doc_text <- vapply(
    file_list,
    function(x) paste(readLines(x), collapse = " "),
    character(1),
    USE.NAMES = FALSE
  )

  data.frame(doc_id = file_list, doc_text = doc_text, row.names = NULL)
}

R - load multiple csv files and drop .csv from name

I have some files in a base directory that I use to house all my .csv files
base_dir <- file.path(path)
file_list <- list.files(path = base_dir, pattern = "*.csv")
I would like to load all of them at once:
for (i in 1:length(file_list)){
assign(file_list[i],
read.csv(paste(base_dir, file_list[i], sep = ""))
)}
However, this produces files with ".csv" in the names in R.
What I would like to do is load all the files but drop the ".csv" from the name once they are loaded.
I have tried the following:
for (i in 1:length(file_list)){ assign(file_list[i],
read.csv(substr(paste(base_dir, file_list[i], sep = ""), 1,
nchar(file_list[i]) -4))
)}
But I received an error: No such file or directory
Is there a way to do this somewhat efficiently?
Normally one reads them into a list rather than having them as free objects floating around in the workspace. Use dir or Sys.glob to generate the full path names and then use read.csv to read each one in. L's names will be pathnames so reduce them to the basename and remove .csv .
# Alternative: paths <- dir(path = path, pattern = "\\.csv$", full.names = TRUE)
paths <- Sys.glob(sprintf("%s/*.csv", path))
# Read every file, then label each element with its base name minus ".csv".
L <- setNames(
  lapply(paths, read.csv),
  sub("\\.csv$", "", basename(paths))
)
If you really want them as free floating objects anyways then add this:
list2env(L, .GlobalEnv)
We can use sub to remove the .csv at the end
# Load each CSV into a variable named after the file, without ".csv".
# seq_along() runs zero iterations when file_list is empty; 1:length() would
# iterate over c(1, 0) instead.
for (i in seq_along(file_list)) {
  assign(
    sub("\\.csv$", "", basename(file_list[i])),
    # file.path() supplies the "/" between directory and file name.
    read.csv(file.path(base_dir, file_list[i]))
  )
}
Or another option is file_path_sans_ext
# Same loop, using tools::file_path_sans_ext() to drop the extension.
# seq_along() runs zero iterations when file_list is empty.
for (i in seq_along(file_list)) {
  assign(
    tools::file_path_sans_ext(basename(file_list[i])),
    # file.path() supplies the "/" between directory and file name.
    read.csv(file.path(base_dir, file_list[i]))
  )
}
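The error in the OP's code arises because substr() is applied to the 'value' argument of assign(), i.e. the path that is passed to read.csv(), so R looks for a file under a truncated path. It should be applied to the 'x' argument (the object name) instead, i.e. the corrected code would be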
# The object name (first argument of assign()) is the file name with its last
# four characters, ".csv", removed; the file itself is read from its full path.
for (i in seq_along(file_list)) {
  assign(
    substr(file_list[i], 1, nchar(file_list[i]) - 4),
    read.csv(file.path(base_dir, file_list[i]))
  )
}
Also, if the working directory is different, it may be better to specify full.names = TRUE
# `pattern` is a regular expression: "\\.csv$" means a literal ".csv" at the
# end of the name. A leading "*" is glob syntax and has no place in a regex.
file_list <- list.files(
  path = base_dir,
  pattern = "\\.csv$",
  full.names = TRUE
)

How to assign new data to the variable within variable in R

I have my file names as all.files in working directory. I want to read these files in loop and assign the names as gsub(".csv","", all.files) for each file.
all.files <- c("harvestA.csv", "harvestB.csv", "harvestC.csv", "harvestD.csv",
"seedA.csv", "seedB.csv", "seedC.csv", "seedD.csv")
I tried something like this below but it won't work. What do I need here?
for(i in 1:length(all.files)){
assign(gsub(".csv","", all.files)[i]) <- read.table(
all.files[i],
header = TRUE,
sep = ","
)
}
You could keep them in a named list as it is not a good practice to clutter the environment with lot of global variables
# Read every file into one list and name each element after its file.
list_df <- lapply(all.files, read.csv)
# Anchor with "$" so only a trailing ".csv" is stripped from each name.
names(list_df) <- sub("\\.csv$", "", all.files)
You can always extract individual dataframes as list_df[["harvestA"]], list_df[["harvestB"]] etc.
If you still need them as separate dataframes
list2env(list_df, .GlobalEnv)
The . is a metacharacter in regex matching any character. So, we can use fixed = TRUE to match the literal dot. Also, in the OP's code, with the assign, there is no need for another assignment operator (<-), the second argument in assign is value and here it is the dataset read with the read.table
# Create one variable per file, named after the file without ".csv".
# seq_along() runs zero iterations when all.files is empty.
for (i in seq_along(all.files)) {
  assign(
    # fixed = TRUE matches ".csv" literally; only the current name is processed.
    sub(".csv", "", all.files[i], fixed = TRUE),
    read.table(all.files[i], header = TRUE, sep = ",")
  )
}
An option using strsplit
# Name each data frame after the part of the file name before the first dot.
for (i in seq_along(all.files)) {
  assign(
    x = strsplit(all.files[i], "\\.")[[1]][1],
    value = read.csv(all.files[i]),
    pos = .GlobalEnv
  )
}

Passing a list of one function to another, across different directories

I am trying to assign random numbers to a copy of a file in various directories (tricky to replicate). The directory structure is as follows:
1100
1100/Images
I first create the new directories and copy the images across. For this I have the following working
#' Copy a directory's images into a sibling folder and list the copies
#'
#' Creates `<singledir>/RandomNumAsignment` if it does not exist, copies every
#' entry found directly inside `<singledir>/Images` into it without
#' overwriting existing files, and returns the TIFF files now in that folder.
#'
#' @param singledir Path of a directory that contains an `Images` subfolder.
#' @return Character vector of full paths to the files in
#'   `<singledir>/RandomNumAsignment` whose names end in ".tif" or ".tiff".
NewImageGen <- function(singledir) {
  Directoryforrandoms <- file.path(singledir, "RandomNumAsignment")
  dir.create(Directoryforrandoms, showWarnings = FALSE)

  Imagedir <- file.path(singledir, "Images")
  filestocopy <- list.files(Imagedir, recursive = FALSE, full.names = TRUE)
  file.copy(from = filestocopy, to = Directoryforrandoms, overwrite = FALSE)

  # `pattern` is a regular expression: an unanchored ".tif" would also match
  # names such as "motif_notes.txt", so require a literal extension at the end.
  list.files(
    path = Directoryforrandoms,
    pattern = "\\.tiff?$",
    full.names = TRUE
  )
}
NewImages <- pblapply(alldirs, FUN = NewImageGen)
This gives me a large list, which is divided into four (due to there being four directories in this instance). I want to pass the newfiles to another function which generates a random number prefix and sticks it on the front. I can do this on a regular list of files using:
# Rename one file by putting a random six-digit number in front of `singleimg`.
# The function's value is whatever file.rename() returns.
RandomNumGen <- function(singleimg){
# Draw one integer from 100000..999999 and convert it to text.
randomnumber <- as.character(sample(100000:999999, 1, replace=F))
# sub('^', ...) inserts the number at the very start of the string, so when
# `singleimg` is a full path the number lands before the first path component.
singlerename <- sub('^', randomnumber, singleimg)
file.rename(from = singleimg,
to = singlerename)
}
It runs through all elements of the list but returns a frustrating false.
Any help would be top notch!
pblapply() returns a list while list.files() returns a character vector.
So if your function works with the return value of list.files() try changing your call from pblapply() to pbsapply().
Edit:
newfiles, i.e. the return value of the NewImageGen() function, contains full paths (as you set full.names = T in the last list.files() call).
sub('^', randomnumber, singleimg) will add the random number before the first part of the path like 677570/home/Jim/1100/Images/RandomNumAsignment/original_image_name.tif.
Instead you want to do
# Keep the directory untouched and prefix only the file name with the number.
file.path(dirname(singleimg), paste0(randomnumber, basename(singleimg)))

Combine csv files with common file identifier

I have a list of approximately 500 csv files each with a filename that consists of a six-digit number followed by a year (ex. 123456_2015.csv). I would like to append all files together that have the same six-digit number. I tried to implement the code suggested in this question:
Import and rbind multiple csv files with common name in R but I want the appended data to be saved as new csv files in the same directory as the original files are currently saved. I have also tried to implement the below code however the csv files produced from this contain no data.
rm(list=ls())
filenames <- list.files(path = "C:/Users/smithma/Desktop/PM25_test")
NAPS_ID <- gsub('.+?\\([0-9]{5,6}?)\\_.+?$', '\\1', filenames)
Unique_NAPS_ID <- unique(NAPS_ID)
n <- length(Unique_NAPS_ID)
for(j in 1:n){
curr_NAPS_ID <- as.character(Unique_NAPS_ID[j])
NAPS_ID_pattern <- paste(".+?\\_(", curr_NAPS_ID,"+?)\\_.+?$", sep = "" )
NAPS_filenames <- list.files(path = "C:/Users/smithma/Desktop/PM25_test", pattern = NAPS_ID_pattern)
write.csv(do.call("rbind", lapply(NAPS_filenames, read.csv, header = TRUE)),file = paste("C:/Users/smithma/Desktop/PM25_test/MERGED", "MERGED_", Unique_NAPS_ID[j], ".csv", sep = ""), row.names=FALSE)
}
Any help would be greatly appreciated.
Because you're not doing any data manipulation, you don't need to treat the files like tabular data. You only need to copy the file contents.
# Directory that holds the yearly files and also receives the merged output.
data_dir <- "C:/Users/smithma/Desktop/PM25_test"

# Only take inputs named <six digits>_<anything>.csv, so MERGED_*.csv files
# left by an earlier run are not treated as inputs.
filenames <- list.files(
  data_dir,
  pattern = "^[0-9]{6}_.*\\.csv$",
  full.names = TRUE
)
NAPS_ID <- substr(basename(filenames), 1, 6)
Unique_NAPS_ID <- unique(NAPS_ID)
for (curr_NAPS_ID in Unique_NAPS_ID) {
  NAPS_filenames <- filenames[startsWith(basename(filenames), curr_NAPS_ID)]
  output_file <- file.path(data_dir, paste0("MERGED_", curr_NAPS_ID, ".csv"))
  for (fname in NAPS_filenames) {
    line_text <- readLines(fname)
    # A completely empty file has neither header nor data to contribute.
    if (length(line_text) == 0) {
      next
    }
    # Write the header from the first file (this also truncates old output)
    if (fname == NAPS_filenames[1]) {
      cat(line_text[1], "\n", sep = "", file = output_file)
    }
    # Append every line in the file except the header
    line_text <- line_text[-1]
    if (length(line_text) > 0) {
      # cat(sep = "\n") puts no newline after the last element, which would
      # glue the next file's first row onto this file's last row; terminate
      # every line explicitly instead.
      cat(paste0(line_text, "\n"), file = output_file, sep = "", append = TRUE)
    }
  }
}
My changes:
list.files(..., full.names = TRUE) is usually the best way to go.
Because the digits appear at the start of the filenames, I suggest substr. It's easier to get an idea of what's going on when skimming the code.
Instead of looping over the indices of a vector, loop over the values. It's more succinct and less likely to cause problems if the vector's empty.
startsWith and endsWith are relatively new functions, and they're great.
You only care about copying lines, so just use readLines to get them in and cat to get them out.
You might consider something like this:
## Group the files by the first 6 characters of each file name.
## NOTE(review): assumes `filenames` holds bare file names (no directory part),
## since `path` is pasted in front of them below -- confirm against the caller.
six.digit.filenames <- substr(filenames, 1, 6)
path <- "C:/Users/smithma/Desktop/PM25_test/"
unique.numbers <- unique(six.digit.filenames)
for (j in unique.numbers) {
  ## Avoid naming variables `sub`, `file` or `data`: those mask base functions.
  files.for.id <- filenames[six.digit.filenames == j]
  ## now do your stuff with these files including read them in;
  ## bind once at the end instead of growing a data frame inside a loop
  tables <- lapply(paste0(path, files.for.id), read.csv)
  data.for.output <- do.call(rbind, tables)
  write.csv(data.for.output, paste0(path, j, ".csv"), row.names = FALSE)
}

Categories

Resources