Related
I have radiotelemetry data that is downloaded as a series of text files. I was provided with code in 2018 that looped through all the text files and converted them into CSV files. Up until 2021 this code worked. However, now the below code (specifically the lapply loop), returns the following error:
"Error in setnames(x, value) :
Can't assign 1 names to a 4 column data.table"
# set the working directory to the folder that contain this script, must run in RStudio
setwd(dirname(rstudioapi::callFun("getActiveDocumentContext")$path))
# get the path to the master data folder
path_to_data <- paste(getwd(), "data", sep = "/", collapse = NULL)
# extract .TXT file
files <- list.files(path=path_to_data, pattern="*.TXT", full.names=TRUE, recursive=TRUE)
# regular expression of the record we want
regex <- "^\\d*\\/\\d*\\/\\d*\\s*\\d*:\\d*:\\d*\\s*\\d*\\s*\\d*\\s*\\d*\\s*\\d*"
# vector of column names, no whitespace
columns <- c("Date", "Time", "Channel", "TagID", "Antenna", "Power")
# loop through all .TXT files, extract valid records and save to .csv files
lapply(files, function(x){
df <- read_table(file) # read the .TXT file to a DataFrame
dt <- data.table(df) # convert the dataframe to a more efficient data structure
colnames(dt) <- c("columns") # modify the column name
valid <- dt %>% filter(str_detect(col, regex)) # filter based on regular expression
valid <- separate(valid, col, into = columns, sep = "\\s+") # split into columns
towner_name <- str_sub(basename(file), start = 1 , end = 2) # extract tower name
valid$Tower <- rep(towner_name, nrow(valid)) # add Tower column
file_path <- file.path(dirname(file), paste(str_sub(basename(file), end = -5), ".csv", sep=""))
write.csv(valid, file = file_path, row.names = FALSE, quote = FALSE) # save to .csv
})
I looked up possible fixes for this and found using "setnames(skip_absent=TRUE)" in the loop resolved the setnames error but instead gave the error "Error in is.data.frame(x) : argument "x" is missing, with no default"
lapply(files, function(file){
df <- read_table(file) # read the .TXT file to a DataFrame
dt <- data.table(df) # convert the dataframe to a more efficient data structure
setnames(skip_absent = TRUE)
colnames(dt) <- c("col") # modify the column name
valid <- dt %>% filter(str_detect(col, regex)) # filter based on regular expression
valid <- separate(valid, col, into = columns, sep = "\\s+") # split into columns
towner_name <- str_sub(basename(file), start = 1 , end = 2) # extract tower name
valid$Tower <- rep(towner_name, nrow(valid)) # add Tower column
file_path <- file.path(dirname(file), paste(str_sub(basename(file), end = -5), ".csv", sep=""))
write.csv(valid, file = file_path, row.names = FALSE, quote = FALSE) # save to .csv
})
I'm confused as to why this code is no longer working despite working fine last year. Any help would be greatly appreciated!
The error occurred at this line, colnames(dt) <- c("columns"), where you provided only one value to rename the (supposedly) 4-column data frame. If you meant to rename one particular column, you can use
colnames(dt)[i] <- c("columns")
where i is the index of the column you are renaming. Alternatively, provide a vector with 4 new names.
I have a large number of csv files in a directory that I need to rename based off of corresponding cols in another index/reference data frame. Here is a three element sample of what I'm dealing with:
dir.create("dir1")
write.csv(mtcars[1:2,], "dir1/20821659.csv", row.names=FALSE)
write.csv(mtcars[3:4,], "dir1/20821654.csv", row.names=FALSE)
write.csv(mtcars[5:6,], "dir1/20821657.csv", row.names=FALSE)
Now I have another data frame with the original names of these files in one column, and another column that I would like to use to rename them:
location <- c("SFM01_2", "SFM05_2", "02M08_2")
sn <- c("20821659", "20821654", "20821657")
df<- data.frame(location, sn)
For example, the location name that corresponds to the first file name (20821659) is SFM01_2, and I would like to change that file name to SFM01_2 and so on for all the many files in this folder.
You could loop over the rows, each time using paste0() to create a mv command, which is then provided to system()
# Rename every file listed in `df` by issuing one shell `mv` command per row.
# NOTE: this relies on a `mv` executable being available to system().
purrr::walk(seq_len(nrow(df)), function(i) {
  # shQuote() protects names that contain spaces or shell metacharacters
  old_path <- shQuote(file.path("dir1", paste0(df[["sn"]][i], ".csv")))
  new_path <- shQuote(file.path("dir1", paste0(df[["location"]][i], ".csv")))
  system(command = paste("mv", old_path, new_path))
})
Tested. file.rename returns TRUE on success.
dir1 <- "dir1"
apply(df, 1, \(x) {
new <- paste0(x[1], ".csv")
new <- file.path(dir1, new)
old <- paste0(x[2], ".csv")
old <- file.path(dir1, old)
if(file.exists(old)) file.rename(old, new)
})
#[1] TRUE TRUE TRUE
Here is a solution using mapply. You can create a new dataframe with the full paths of the files. Then, rename the file using the specification of the 2 columns row by row .
dir.create("dir1")
write.csv(mtcars[1:2,], "dir1/20821659.csv", row.names=FALSE)
write.csv(mtcars[3:4,], "dir1/20821654.csv", row.names=FALSE)
write.csv(mtcars[5:6,], "dir1/20821657.csv", row.names=FALSE)
list.files('dir1') # "20821654.csv" "20821657.csv" "20821659.csv"
location <- c("SFM01_2", "SFM05_2", "02M08_2")
sn <- c("20821659", "20821654", "20821657")
df<- data.frame(location, sn)
# Create a new dataframe with the full paths of the files
df2 <- sapply(df, function(i){
paste0('dir1/', i, '.csv')
})
# rename the file using the specification of the 2 columns row by row
mapply(FUN = file.rename, from = df2[, 2], to = df2[, 1],
MoreArgs = NULL, SIMPLIFY = TRUE, USE.NAMES = TRUE)
list.files('dir1') # "02M08_2.csv" "SFM01_2.csv" "SFM05_2.csv"
Before I dive into the question, here is a similar problem asked but there is not yet a solution.
So, I am working in R, and there is a folder in my working directory called columns that contains 198 similar .csv files with the name format of a 6-digit integer (e.g. 100000) that increases inconsistently (since the name of those files are actually names for each variable).
Now, I would like to full join them, but first I have to import all of those files into R and then join them. Naturally, I thought about using a list to contain those files and then using a loop to join them. This is the code I tried to use:
#These are the first 3 columns containing identifiers
matrix_starter <- read_csv("files/matrix_starter.csv")
## import_multiple_csv_files_to_R
# Purpose: Import multiple csv files to the Global Environment in R
# set working directory
setwd("columns")
# list all csv files from the current directory
list.files(pattern=".csv$") # use the pattern argument to define a common pattern for import files with regex. Here: .csv
# create a list from these files
list.filenames <- list.files(pattern=".csv$")
#list.filenames
# create an empty list that will serve as a container to receive the incoming files
list.data <- list()
# create a loop to read in your data
for (i in 1:length(list.filenames))
{
list.data[[i]] <- read.csv(list.filenames[i])
list.data[[i]] <- list.data[[i]] %>%
select(`Occupation.Title`,`X2018.Employment`) %>%
rename(`Occupation title` = `Occupation.Title`) #%>%
#rename(list.filenames[i] = `X2018.Employment`)
}
# add the names of your data to the list
names(list.data) <- list.filenames
# now you can index one of your tables like this
list.data$`113300.csv`
# or this
list.data[1]
# source: https://www.edureka.co/community/1902/how-can-i-import-multiple-csv-files-into-r
The chunk above solves the importing part. Now I have a list of data frames, one per .csv file. Next, I would like to join them:
# Full-join every imported table onto the identifier columns, matching rows on
# the shared "Occupation title" column. `by` must be the column name as a
# character string; backticks would look up an object of that name instead.
for (i in seq_along(list.filenames)) {
  matrix_starter <- matrix_starter %>%
    full_join(list.data[[i]], by = "Occupation title")
}
However, this does not work nicely. I end up with somewhere around 47,000 rows, to which I only expect around 1700 rows. Please let me know your opinion.
Reading the files into R as a list and including the file name as a column can be done like this:
files <- list.files(path = path,
full.names = TRUE,
all.files = FALSE)
files <- files[!file.info(files)$isdir]
data <- lapply(files,
function(x) {
data <- read_xls(
x,
sheet = 1
)
data$File_name <- x
data
})
I am assuming now that all your excel files have the same structure: the same columns and column types.
If that is the case you can use dplyr::bind_rows to create one combined data frame.
You could of course loop through the list and left_join the list elements, e.g. by using Reduce and merge.
Update based on mihndang's comment. Is this what you are after when you say: Is there a way to use the file name to name the column and also not include the columns of file names?
library(dplyr)
library(stringr)
path <- "./files"
files <- list.files(path = path,
full.names = TRUE,
all.files = FALSE)
files <- files[!file.info(files)$isdir]
data <- lapply(files,
function(x) {
read.csv(x, stringsAsFactors = FALSE)
})
col1 <- paste0(str_sub(basename(files[1]), start = 1, end = -5), ": Values")
col2 <- paste0(str_sub(basename(files[1]), start = 1, end = -5), ": Character")
df1 <- data[[1]] %>%
rename(!!col1 := Value,
!!col2 := Character)
I created two simple .csv files in ./files: file1.csv and file2.csv. I read them into a list. I extract the first list element (the DF) and work out column names in a variable. I then rename the columns in the DF by passing the two variables to them. The column name includes the file name.
Result:
> View(df1)
> df1
file1: Values file1: Character
1 1 a
2 2 b
3 3 c
4 4 d
5 5 e
6 6 f
7 7 g
8 8 h
9 9 i
10 10 j
I guess you are looking for :
# Full outer merge of every table in list.data on the shared key column.
# `by` takes the column name as a character string, not a backtick-quoted symbol.
result <- Reduce(
  function(x, y) merge(x, y, by = "Occupation title", all = TRUE),
  list.data
)
which can be done using purrr's reduce as well:
# Same full join expressed with purrr::reduce(); the join key is passed as a
# character string and forwarded to dplyr::full_join() for every pair of tables.
result <- purrr::reduce(list.data, dplyr::full_join, by = "Occupation title")
A full join keeps every row from both tables, and when a key value is duplicated it returns every matching combination, which is why the row count can grow. If you are looking for unique records, you might want to use a left join instead: keep the data frame whose rows you want to use as the reference on the left, and put the table you want to join onto it on the right.
Hope this helps.
I am comparing two pairs of csv files each at a time. The files I have each end with a number like cars_file2.csv, Lorries_file3.csv, computers_file4.csv, phones_file5.csv. I have like 70 files per folder and the way I am comparing is, I compare cars_file2.csv and Lorries_file3.csv then Lorries_file3.csv and
computers_file4.csv, and the pattern is 2,3,3,4,4,5 like that. Is there a smart way I can handle this instead of manually coming back and change file like the way I am reading here or I can use the last number on each csv to read them smartly. NOTE the files have same suffixes _file:
library(daff)
setwd("path")
# Load csvs to compare into data frames
x_original <- read.csv("cars_file2.csv", strip.white=TRUE, stringsAsFactors = FALSE)
x_changed <- read.csv("Lorries_file3.csv", strip.white=TRUE, stringsAsFactors = FALSE)
render(diff_data(x_original,x_changed ,ignore_whitespace=TRUE,count_like_a_spreadsheet = FALSE))
My intention is to compare each consecutive pair of CSV files and record the field additions, deletions, and modifications.
You may want to load all files at once and do your comparison with a full list of files.
This may help:
# your path
path <- "insert your path"
# get folders in this path
dir_data <- as.list(list.dirs(path))
# get all filenames
dir_data <- lapply(dir_data,function(x){
# list of folders
files <- list.files(x)
files <- paste(x,files,sep="/")
# only .csv files
files <- files[substring(files,nchar(files)-3,nchar(files)) %in% ".csv"]
# remove possible errors
files <- files[!is.na(files)]
# save if there are files
if(length(files) >= 1){
return(files)
}
})
# delete NULL-values
dir_data <- compact(dir_data)
# make it a named vector
dir_data <- unique(unlist(dir_data))
# Name each path by the number at the end of its file name (extension removed).
# The whole trailing run of digits is used, so "cars_file12" is named 12 and
# does not collide with "cars_file2". Stems without a trailing number become NA
# (with a coercion warning).
file_stems <- sub(pattern = "(.*)\\..*$", replacement = "\\1", basename(dir_data))
names(dir_data) <- as.numeric(sub("^.*?([0-9]+)$", "\\1", file_stems, perl = TRUE))
# remove possible NULL-values
dir_data <- dir_data[!is.na(names(dir_data))]
# make it a list again
dir_data <- as.list(dir_data)
# load data
data_upload <- lapply(dir_data,function(x){
if(file.exists(x)){
data <- read.csv(x,header=T,sep=";")
}else{
data <- "file not found"
}
return(data)
})
# setup for comparison
diffs <- lapply(as.character(sort(as.numeric(names(data_upload)))),function(x){
# check if the second dataset exists
if(as.character(as.numeric(x)+1) %in% names(data_upload)){
# first dataset
print(data_upload[[x]])
# second dataset
print(data_upload[[as.character(as.numeric(x)+1)]])
# do your operations here
comparison <- render(diff_data(data_upload[[x]],
data_upload[[as.character(as.numeric(x)+1)]],
ignore_whitespace=T,count_like_a_spreadsheet = F))
numbers <- c(x, as.numeric(x)+1)
# save both the comparison data and the numbers of the datasets
return(list(comparison,numbers))
}
})
# you can find the differences here
diffs
This script loads all csv-files in a folder and its sub-folders and puts them into a list by their numbers. In case there are no doubles, this will work. If you have doubles, you will have to adjust the part where the vector is named so that you can index the full names of the files afterwards.
A simple for- loop using paste will read-in the pairs:
# Compare each numbered file with the next-numbered one and render the diff.
for (i in 1:70) { # assuming the last pair is cars_file70.csv and Lorries_file71.csv
  x_original <- read.csv(paste0("cars_file", i, ".csv"),
                         strip.white = TRUE, stringsAsFactors = FALSE)
  # the second file of the pair carries the next number (i + 1) and nothing else
  x_changed <- read.csv(paste0("Lorries_file", i + 1, ".csv"),
                        strip.white = TRUE, stringsAsFactors = FALSE)
  render(diff_data(x_original, x_changed,
                   ignore_whitespace = TRUE, count_like_a_spreadsheet = FALSE))
}
For simplicity I used 2 .csv files.
csv_1
1,2,4
csv_2
1,8,10
Load all the .csv files from folder,
files <- dir("Your folder path", pattern = '\\.csv', full.names = TRUE)
tables <- lapply(files, read.csv)
#create empty list to store comparison output
diff <- c()
Loop through all loaded files and compare,
# Compare each loaded table with the one that follows it. The last table has no
# successor, so the loop stops one short of the end (and does nothing when
# fewer than two tables were loaded).
for (pos in seq_len(max(length(tables) - 1L, 0L))) {
  # save comparison output
  diff[[pos]] <- diff_data(as.data.frame(tables[pos]), as.data.frame(tables[pos + 1]),
                           ignore_whitespace = TRUE, count_like_a_spreadsheet = FALSE)
}
Compared output by diff
[[1]]
Daff Comparison: ‘as.data.frame(tables[pos])’ vs. ‘as.data.frame(tables[pos + 1])’
+++ +++ --- ---
## X1 X8 X10 X2 X4
I have a directory containing a large number of csv files. I would like to load the data into R and apply a function to every possible pair combination of csv files in the directory, then write the output to file.
The function that I would like to apply is matchpt() from the biobase library which compares locations between two data frames.
Here is an example of what I would like to do (although I have many more files than this):
Three files in directory: A, B and C
Perform matchpt on each pairwise combination:
nn1 = matchpt(A,B)
nn2 = matchpt(A,C)
nn3 = matchpt(B,C)
Write nn1, nn2 and nn3 to csv file.
I have not been able to find any solutions for this yet and would appreciate any suggestions. I am really not sure where to go from here but I am assuming that some sort of nested for loop is required to somehow cycle sequentially through all pairwise combinations of files. Below is a beginning at something but this only compares the first file with all the others in the directory so does not work!
library("Biobase")
# create two lists of identical filenames stored in the directory:
filenames1 = list.files(path=dir, pattern="csv$", full.names=FALSE, recursive=FALSE)
filenames2 = list.files(path=dir, pattern="csv$", full.names=FALSE, recursive=FALSE)
for(i in 1:length(filenames2)){
# load the first data frame in list 1
df1 <- lapply(filenames1[1], read.csv, header=TRUE, stringsAsFactors=FALSE)
df1 <- data.frame(df1)
# load a second data frame from list 2
df2 <- lapply(filenames2[i], read.csv, header=TRUE, stringsAsFactors=FALSE)
df2 <- data.frame(df2)
# isolate the relevant columns from within the two data frames
dat1 <- as.matrix(df1[, c("lat", "long")])
dat2 <- as.matrix(df2[, c("lat", "long")])
# run the matchpt function on the two data frames
nn <- matchpt(dat1, dat2)
#Extract the unique id code in the two filenames (for naming the output file)
file1 = filenames1[1]
code1 = strsplit(file1,"_")[[1]][1]
file2 = filenames2[i]
code2 = strsplit(file2,"_")[[1]][1]
outname = paste(code1, code2, sep=”_”)
outfile = paste(code, "_nn.csv", sep="")
write.csv(nn, file=outname, row.names=FALSE)
}
Any suggestions on how to solve this problem would be greatly appreciated. Many thanks!
You could do something like:
# Apply the function to every unordered pair of files. combn() hands each pair
# to FUN as a single length-2 vector. simplify = FALSE keeps one result per
# pair in a list, which is what do.call(rbind, ...) needs.
out <- combn(list.files(), 2, FUN = matchpt, simplify = FALSE)
write.table(do.call(rbind, out), file = "output.csv", sep = ",")
This assumes that matchpt is expecting 2 strings with the names of the files and that the result is the same structure each time so that the rbinding makes sense.
You could also write your own function to pass to combn that takes the 2 file names, runs matchpt and then appends the results to the csv file. Remember that if you pass an open filehandle to write.table then it will append to the file instead of overwriting what is there.
Try this example:
#dummy filenames
filenames <- paste0("file_",1:5,".txt")
#loop through unique combination
for(i in 1:(length(filenames)-1))
for(j in (i+1):length(filenames))
{
flush.console()
print(paste("i=",i,"j=",j,"|","file1=",filenames[i],"file2=",filenames[j]))
}
In response to my question I seem to have found a solution. The below uses a for loop to perform every pairwise combination of files in a common directory (this seems to work and gives EVERY combination of files i.e. A & B and B & A):
# create a list of filenames; full paths are used so the files can be read
# regardless of the current working directory
filenames <- list.files(path = dir, pattern = "csv$", full.names = TRUE, recursive = FALSE)

# unique id code of each file (the part of the file name before the first underscore)
codes <- vapply(
  strsplit(basename(filenames), "_"),
  function(parts) parts[1],
  character(1)
)

# read every file once and keep only the columns of interest as a matrix
coords <- lapply(filenames, function(path) {
  df <- read.csv(path, header = TRUE, stringsAsFactors = FALSE)
  as.matrix(df[, c("lat_UTM", "long_UTM")])
})

# compare every ordered pair of files (A vs B and B vs A, including a file
# with itself)
for (i in seq_along(filenames)) {
  for (j in seq_along(filenames)) {
    # run the comparison function on the two coordinate matrices (in this case matchpt)
    out <- matchpt(coords[[i]], coords[[j]])
    # merge the unique id codes of the two files to name the output file
    outfile <- paste0(codes[i], "_", codes[j], "_out.csv")
    # write the result to file (in the current working directory)
    write.csv(out, file = outfile, row.names = FALSE)
  }
}