Merge specific columns from csv files and use the filenames as headers - r

I would like to merge specific columns from two csv files and use the filename as the column header. In this example, I want to merge the third column from each file into a single data frame. The csv files have the same number of rows and columns.
Sample data sets:
File1.csv
V1,V2,V3,V4
1,1986,64,61
File2.csv
V1,V2,V3,V4
1,1990,100,61
Expected Result:
"File1","File2"
64,100
Here's my script:
# Names of the CSV files found in the working directory.
my.file.list <- list.files(pattern = "*.csv")
# For each file, keep only the third column: a "NULL" column class makes
# read.csv skip that column, so the single remaining column is at index 1.
my.list <- lapply(
  my.file.list,
  function(csv_file) {
    third_only <- read.csv(
      csv_file,
      header = TRUE,
      sep = ",",
      colClasses = c("NULL", "NULL", "numeric", "NULL")
    )
    third_only[, 1]
  }
)
# Bind the per-file vectors side by side; the result has no column names yet.
my.df <- do.call(cbind, my.list)
How do I add the column headers based on the file names?
I tried this:
# Drop the literal text ".csv" (fixed = TRUE disables regex matching) from the
# base name of every listed file; the result is returned, not stored.
sub('.csv','',basename(my.file.list),fixed=TRUE)
but I don't know how to add this as headers.
I'll appreciate any help.

# Build a data frame holding column V3 from every CSV file in the working
# directory, one column per file, labelled FILE1, FILE2, ... in listing order.
# `pattern` is a regular expression, not a glob: "\\.csv$" matches only names
# that end in ".csv".
my.file.list <- list.files(pattern = "\\.csv$")
# Preallocate the result list instead of growing it inside the loop.
my.list <- vector("list", length(my.file.list))
# seq_along() gives an empty sequence when no files are found, whereas
# 1:length(x) would give c(1, 0) and fail on the first read.
for (i in seq_along(my.file.list)) {
  # Single-bracket indexing by name keeps a one-column data frame.
  df <- read.csv(my.file.list[[i]], header = TRUE, sep = ",")["V3"]
  names(df) <- paste0("FILE", i)
  my.list[[i]] <- df
}
my.df <- do.call("cbind", my.list)

@Tim Biegeleisen Many thanks for the help. I got the idea now. Here's the improved version of your answer, which I can use for files with different filenames.
# Build a data frame holding column V3 from every CSV file in the working
# directory, one column per file, each column named after its source file
# (File1.csv -> "File1").
# `pattern` is a regular expression, not a glob: "\\.csv$" matches only names
# that end in ".csv".
my.file.list <- list.files(pattern = "\\.csv$")
# Preallocate the result list instead of growing it inside the loop.
my.list <- vector("list", length(my.file.list))
# seq_along() gives an empty sequence when no files are found.
for (i in seq_along(my.file.list)) {
  # Single-bracket indexing by name keeps a one-column data frame.
  df <- read.csv(my.file.list[[i]], header = TRUE, sep = ",")["V3"]
  # Use the bare file name as the header. Appending the loop index here would
  # turn "File1" into "File11".
  names(df) <- tools::file_path_sans_ext(basename(my.file.list[i]))
  my.list[[i]] <- df
}
my.df <- do.call("cbind", my.list)

Related

Read Multiple txt files in an order and combine them into one dataframe but label the origin of each row in the new generated dataframe in r

I have 6 txt files and I want to combine them into 1 dataframe. I know how to read them simultaneously and combine them in default way.
I learned to do this in this website:
# Names of the .txt files found under `mypath`.
txt_files_ls <- list.files(path = mypath, pattern = "*.txt")
# Parse each tab-separated file (first line is the header) into a data frame.
txt_files_df <- lapply(
  txt_files_ls,
  function(txt_file) read.table(file = txt_file, header = TRUE, sep = "\t")
)
# Stack all the tables row-wise into a single data frame.
combined_df <- do.call(rbind, lapply(txt_files_df, as.data.frame))
Now what I want to do is make read.table read the txt files in a sequential order that I define, so that after combining them I will be able to label the rows with the name of their original txt file. Thank you.
You can try this:
# `full.names = TRUE` returns paths that include `mypath`, so the files can be
# opened even when `mypath` is not the working directory. `pattern` is a
# regular expression: "\\.txt$" matches names that end in ".txt".
txt_files_ls <- list.files(
  path = mypath,
  pattern = "\\.txt$",
  full.names = TRUE
)

# The function for reading.
# Reads one tab-separated file and tags every row with its source file.
#   x: path to a tab-separated text file whose first line is the header.
# Returns the file's contents as a data frame with an extra column `var`
# holding the file name (without its directory).
read.data <- function(x) {
  y <- read.table(file = x, header = TRUE, sep = "\t")
  # rep() yields a zero-length column for a header-only file; assigning a
  # single string to a zero-row data frame would raise an error instead.
  y$var <- rep(basename(x), nrow(y))
  y
}

# Read data
txt_files_df <- lapply(txt_files_ls, read.data)
# Combine them (read.data already returns data frames, so no conversion is
# needed before stacking)
combined_df <- do.call("rbind", txt_files_df)
Where var contains the name of each file.

R: Merging data frames based on file name

My files (example as I have hundreds of these files):
France.csv
France_variables.csv
Germany.csv
Germany_variables.csv
Spain.csv
Spain_variables.csv
Portugal.csv
Portugal_variables.csv
I want to merge France with France_variables, Germany with Germany_variables etc. I know I can use rbind with the two files but I want to do this as a loop because I have lots of these to merge. I'm not sure how to do a string search and then rbind in a loop or if there is a better way of doing this.
I am new to R so any help would be much appreciated.
You can use something like this:
library(tidyverse)
# All CSV files in the working directory. "\\.csv$" anchors the extension, so
# files whose names merely contain "csv" are ignored.
csv_files <- list.files(path = ".", pattern = "\\.csv$")
# Country prefix of each file: everything before the first "." or "_".
file_country <- gsub('\\..*$|_.*', '', csv_files)
# Get unique countries
country <- unique(file_country)
# Loop
for (i in country) {
  # Pick this country's files by exact prefix equality. Using the country name
  # as a list.files() pattern would also match names that only contain it
  # (e.g. "NewFrance.csv") and files that are not CSVs.
  dat <- csv_files[file_country == i] %>%
    map(read_csv) %>%
    reduce(rbind)
  # Store the merged table under a per-country name such as df_France.
  assign(paste("df", i, sep = "_"), dat)
  rm(dat)
}
This will create dataframes like df_France, df_Germany , etc.
Play with the 'grepl', and see if you can get this to work......
# Folder where the files are saved. Listing with full paths lets the files be
# read without changing the session's working directory via setwd().
data_dir <- "C:/your_path_here"
# Keep only names ending in ".TXT". `pattern` is a regular expression, so the
# dot is escaped and the match is anchored to the end of the name.
file_names <- list.files(data_dir, pattern = "\\.TXT$", full.names = TRUE)
# print file_names vector
file_names
# see the data structure of the file list (`file` on its own is a base R
# function, not this vector)
str(file_names)
# run read.csv on all values of file_names
files <- lapply(file_names, read.csv, header = FALSE, stringsAsFactors = FALSE)
# stack the per-file tables into one data frame
files <- do.call(rbind, files)
str(files)
# set column names (five names are assigned, so five columns are expected)
names(files) <- c("col1", "col2", "col3", "col4", "col5")
str(files)
# finally...
write.table(files, "C:/your_path/mydata.txt", sep = "\t")
write.csv(files, "C:/your_path/mydata.csv")
http://www.rforexcelusers.com/combine-delimited-files-r/

Error when during merging excel files in r with blank sheet

I'm using the following code to merge several excel files with multiple sheets. I get an error when it runs across a sheet that has the same header as the other files but is not populated with data. This is the error:
Error in data.frame(sub.id, condition, s.frame, ss) :
arguments imply differing number of rows: 0, 2
How can I avoid the error? Here is the code I am using below.
# Merge every listed sheet of every workbook into one long data frame (data.1),
# tagging each row with its source file and sheet.
# NOTE(review): loadWorkbook/getSheets/readWorksheet look like the XLConnect
# API - confirm which package is attached.
# Workbooks to process; `pattern` is interpreted as a regular expression.
file.names <- list.files(pattern='*.xls')
# Sheet names are taken from one reference workbook and only the first 12 are
# used; presumably every file has the same sheets - TODO confirm.
sheet.names <- getSheets(loadWorkbook('File.xls'))
sheet.names <-sheet.names[1:12]
# Output column names v1 ... v16.
e.names <- paste0(rep('v', 16), c(1:16))
# Seed the result with a single all-NA row that has the 16 output columns.
data.1 <- data.frame(matrix(rep(NA,length(e.names)),
ncol = length(e.names)))
names(data.1) <- e.names
for (i in 1:length(file.names)) {
wb <- loadWorkbook(file.names[i])
for (j in 1:length(sheet.names)) {
# Read the sheet with startCol = 2 and header = TRUE.
ss <- readWorksheet(wb, sheet.names[j], startCol = 2, header = TRUE)
# One sheet-name label and one file-name label per data row.
condition <- rep(sheet.names[j], nrow(ss))
sub.id <- rep(file.names[i], nrow(ss))
# Row counter within the sheet. When ss has zero rows, 1:nrow(ss) is c(1, 0),
# so seq() returns two values while the other columns have length 0; this is
# where the "differing number of rows: 0, 2" error in data.frame() comes from.
s.frame <- seq(1:nrow(ss))
df.1 <- data.frame(sub.id, condition, s.frame, ss)
# Rename to v1 ... v16; presumably ss supplies the remaining 13 columns.
names(df.1) <- e.names
# Append this sheet's rows to the running result.
data.1 <- rbind(data.1, df.1)
rm(ss, condition, s.frame, sub.id, df.1)
}
rm(wb)
}
I suppose this solution will work for you. It loads all .xlsx files in a specified folder into a list of lists. Sheet-names and -headers shouldn't be an issue.
library(openxlsx)
# Define folder where your files are
path_folder <- "C:/path_to_files/"
# load file names; `pattern` is a regular expression, so "\\.xlsx$" keeps only
# names that end in ".xlsx"
f <- list.files(path_folder, pattern = "\\.xlsx$")
# load data into a list of lists: one element per file, each holding one data
# frame per sheet, named after the sheet
data_list <- lapply(f, function(x) {
  workbook_path <- paste0(path_folder, x)
  # get sheet-names
  sheets <- getSheetNames(workbook_path)
  # lapply always returns a list. sapply would try to simplify the result and
  # can collapse same-shaped sheets into a matrix instead of a list.
  sheet_data <- lapply(sheets, function(y) {
    read.xlsx(workbook_path, sheet = y)
  })
  setNames(sheet_data, sheets)
})
# name the list elements after the files, dropping only the trailing extension
names(data_list) <- sub("\\.xlsx$", "", f)
You end up with a list (containing each file) of lists (containing the sheets of each file).
From here you can remove empty sheets, merge and edit them as you like.
Added an if-statement to check if there was more than one row if not skip reading in and it resolved the error.
# Append the rows of every populated sheet of every workbook to data.1.
# Relies on file.names, sheet.names, e.names and data.1 being defined already.
# Per-sheet frames are collected in a list and bound once at the end, which
# avoids re-copying data.1 on every iteration.
sheet.frames <- list()
# seq_along() gives an empty sequence for an empty vector, unlike 1:length(x).
for (i in seq_along(file.names)) {
  wb <- loadWorkbook(file.names[i])
  for (j in seq_along(sheet.names)) {
    ss <- readWorksheet(wb, sheet.names[j], startCol = 2, header = TRUE)
    # Skip only sheets with no data rows. A sheet with a single data row is
    # valid and is kept.
    if (nrow(ss) > 0) {
      condition <- rep(sheet.names[j], nrow(ss))
      sub.id <- rep(file.names[i], nrow(ss))
      # Row counter 1..nrow(ss) within the sheet.
      s.frame <- seq_len(nrow(ss))
      df.1 <- data.frame(sub.id, condition, s.frame, ss)
      names(df.1) <- e.names
      sheet.frames[[length(sheet.frames) + 1L]] <- df.1
      rm(condition, s.frame, sub.id, df.1)
    }
    rm(ss)
  }
  rm(wb)
}
# do.call() returns NULL when nothing was collected; rbind() then leaves
# data.1 unchanged.
data.1 <- rbind(data.1, do.call(rbind, sheet.frames))
rm(sheet.frames)

Merge multiple excel files into R taking only 2nd sheet, retaining file name as 'data source'

I'm trying to merge multiple excel files into a single data.frame in R - all files are pulled from a common folder, pulling only the 2nd sheet, which will always have a specific name ('Value Assessment').
In addition, I want to retain each file name in a column, so that the source of the merged data is maintained.
I've been able to load the files and merge into one data.frame, but can't figure out how to retain file name as 'source name'.
# NOTE(review): `/.` is not a valid R expression; it looks like a placeholder
# for the folder path, which would have to be passed as a quoted string.
setwd(/.)
# Workbook names in the working directory (`pattern` is a regular expression).
file.list <- list.files(pattern='*.xlsx')
# Read each workbook; no `sheet` argument is passed, so read_excel uses its
# default sheet selection.
df.list <- lapply(file.list,read_excel)
# Stack the tables into one; idcol = "id" asks rbindlist for an "id" column
# identifying which list element each row came from.
df <- rbindlist(df.list, idcol = "id")
Using setNames():
# Workbooks in the working directory. `pattern` is a regular expression, not a
# glob: "\\.xlsx$" matches only names that end in ".xlsx".
file.list <- list.files(pattern = "\\.xlsx$")
# Name each path after itself so the names travel with the list elements.
file.list <- setNames(file.list, file.list)
# Read the second sheet of every workbook.
df.list <- lapply(file.list, read_excel, sheet = 2)
# Add the originating file name to every table as `source_name`.
df.list <- Map(
  function(df, name) {
    df$source_name <- name
    df
  },
  df.list,
  names(df.list)
)
# Stack all tables; idcol = "id" adds an "id" column identifying the list
# element each row came from.
df <- rbindlist(df.list, idcol = "id")
(Note: probably a typo, you were missing sheet = 2).
Try this: Merge All Data from All Excel Files:
library(xlsx)
# Folder that holds the workbooks. Listing with full paths lets the files be
# read without changing the session's working directory via setwd().
data_dir <- "C:/Users/your_path_here/excel_files"
# `pattern` is a regular expression: "\\.xlsx$" matches names ending in ".xlsx".
data.files <- list.files(data_dir, pattern = "\\.xlsx$", full.names = TRUE)
# Read the second sheet of each workbook exactly once and record which file the
# rows came from.
sheets <- lapply(data.files, function(x) {
  sheet <- read.xlsx(x, sheetIndex = 2)
  sheet$source_name <- rep(basename(x), nrow(sheet))
  sheet
})
# Bind the per-file data frames together in a single call. Calling rbind() on
# the list itself plus a data frame would not produce a data frame.
data <- do.call(rbind, sheets)

How can I turn the filename into a variable when reading multiple csvs into R

I have a bunch of csv files that follow the naming scheme: est2009US.csv.
I am reading them into R as follows:
# File names under ~/Downloads/gtrends/ that start with "est", followed by four
# digits, a "U", zero or more "S" characters, and end in ".csv".
myFiles <- list.files(
  path = "~/Downloads/gtrends/",
  pattern = "^est[[:digit:]][[:digit:]][[:digit:]][[:digit:]]US*\\.csv$"
)
# Read every file (first line is the header) and stack the rows into one table.
myDB <- do.call(
  rbind,
  lapply(myFiles, function(csv_name) read.csv(csv_name, header = TRUE))
)
I would like to find a way to create a new variable that, for each record, is populated with the name of the file the record came from.
You can avoid looping twice by using an anonymous function that assigns the file name as a column to each data.frame in the same lapply that you use to read the csvs.
# Read every file in myFiles, tag each row with the file it came from, and
# stack the results into one data frame.
myDB <- do.call("rbind", lapply(myFiles, function(x) {
  dat <- read.csv(x, header = TRUE)
  # File name without directory or extension. rep() yields a zero-length column
  # for a header-only file; assigning a single string to a zero-row data frame
  # would raise an error instead.
  dat$fileName <- rep(tools::file_path_sans_ext(basename(x)), nrow(dat))
  dat
}))
I stripped out the directory and file extension. basename() returns the file name, not including the directory, and tools::file_path_sans_ext() removes the file extension.
plyr makes this very easy:
library(plyr)
# CSV files in the working directory, each element named after its file name.
paths <- dir(pattern = "\\.csv$")
paths <- setNames(paths, basename(paths))
# Read every file and combine the results into one data frame.
all <- ldply(.data = paths, .fun = read.csv)
Because paths is named, all will automatically get a column containing those names.
# Number of rows contributed by each file, as a plain integer vector so it can
# be passed directly as rep()'s `times`.
Nrows <- vapply(lapply(myFiles, read.csv, header = TRUE), NROW, integer(1))
# might have been easier to store: lapply(myFiles, read.csv, header=TRUE)
# Repeat each file name once per row it contributed to myDB (myDB is assumed to
# be the row-bound result of the same files, in the same order).
myDB$grp <- rep(myFiles, times = Nrows)
You can create the object from lapply first.
# Read every file into a list, one data frame per file.
Lapply <- lapply(myFiles, read.csv, header = TRUE)
# Name the elements after their files so they can be indexed by file name.
names(Lapply) <- myFiles
for (i in myFiles) {
  # rep() yields a zero-length column for a header-only file; assigning a
  # single string to a zero-row data frame would raise an error instead.
  Lapply[[i]]$Source <- rep(i, nrow(Lapply[[i]]))
}
# Stack all tables into one data frame.
do.call(rbind, Lapply)

Resources