Merging multiple text files into one using R - r

I am trying to merge multiple text files into a single CSV and have done so successfully using the following code. I have one additional requirement: I need to add the name of the file in a separate column, indicating where each row of data came from. Please suggest how to do this.
# Start from a clean workspace. The call has to stay on one line: a bare `rm`
# followed by `(list=ls())` on the next line only prints the function and then
# assigns a variable named `list`, leaving the workspace untouched.
rm(list = ls())
setwd("D:/Cersai Rejection Reasons/IT_Oct18-Jun19")
file_list <- list.files()
df <- data.frame(file_list)

library(plyr)
library(dplyr)

# Full paths of every file in the input folder.
files <- dir("D:/Cersai Rejection Reasons/IT_Oct18-Jun19",
             full.names = TRUE)

# Read each tab-separated, header-less file and stack the results row-wise.
df <- lapply(files, function(x) {
  read.table(x, sep = '\t', header = FALSE)
}) %>%
  plyr::ldply()

# Write the combined table as a single csv.
write.csv(df, file = "D:/consolidatetext.csv")

library(dplyr)

# Read every file listed in `files`, record the path each row came from in a
# `source` column, stack the pieces and write them out as one csv.
df <- bind_rows(lapply(files, function(path) {
  contents <- read.table(path, sep = '\t', header = FALSE, stringsAsFactors = FALSE)
  contents$source <- path
  contents
}))
write.csv(df, file="D:/consolidatetext.csv")

Related

Write a function to manipulate and then write a dataframe

I would like to read in multiple .csv files (dataframes) from a folder and apply a function that I create to all the files. And finally this function will write the new .csv files.
I want the function to do the following 3 things
# 1. Replace every occurrence of "null" in Class with "OTHER".
df$Class <- gsub("null", "OTHER", df$Class)
# 2. Replace every ": " in Class with a comma.
df$Class <- gsub(': ', ',', df$Class)
# 3. Put Image first, keep the remaining columns, and drop Name.
df <- df %>% select(c(Image, everything(.), -Name))
I don't really know how to put these things into a function, but I've tried
`
# Every csv file in the working directory.
file_names <- list.files(pattern = "\\.csv$")
# Attempted clean-up function: reads all files, tidies Class, reorders columns.
tidy_up_fxn <- function(file_names) {
# Read each file with fread and stack them into one table.
df <- do.call(bind_rows,lapply(file_names,data.table::fread))
df$Class <- gsub("null", "OTHER", df$Class)
df$Class <- gsub(': ', ',', df$Class)
df <- df %>% select(c(Image, everything(.), -Name))
# NOTE(review): the next two lines only DEFINE a function named `out` whose
# body is the fwrite() call; `out` is never called, so nothing is written.
# The fwrite() call also passes `out` (the function itself) as the data and
# the whole `file_names` vector as the target file.
out <- function(df)
fwrite(out, file = file_names, sep = ",")
}
tidy_up_fxn(file_names)
`
When I run it, R gets busy for a few seconds and then nothing happens. Please, help correct my function!
The following works the way I intended:
file_names <- list.files(pattern = "\\.csv$")

# Combine several csv files into one cleaned-up table and save it as "new.csv".
#
# file_names: character vector of csv paths to read.
# Side effect: writes "new.csv" (comma-separated) into the working directory.
tidy_up_fxn <- function(file_names) {
  df <- bind_rows(lapply(file_names, data.table::fread))
  # Both replacements are literal text, so match them as-is, not as regexes.
  df$Class <- gsub("null", "OTHER", df$Class, fixed = TRUE)
  df$Class <- gsub(": ", ",", df$Class, fixed = TRUE)
  # Image first, then every other column, without Name. everything() is a
  # selection helper and does not need the data passed to it.
  df <- df %>% select(Image, everything(), -Name)
  # Qualified like fread above so the call does not depend on data.table
  # being attached.
  data.table::fwrite(df, file = "new.csv", sep = ",")
}
tidy_up_fxn(file_names)
Thank you all!!

Apply function to files from different folders (R)

I discovered R a couple of years ago, and it has been very handy for cleaning up data frames, preparing data, and handling other basic tasks.
Now I would like to try using R to apply basic treatments but on many different files stored in different folders at once.
Here is the script I would like to improve into one function that would loop through my folder "dataset_2006" and "dataset_2007" to do all the work.
library(dplyr)
library(readr)
library(sf)
library(purrr)
# ---- 2006 ----
# All relative paths below resolve inside the 2006 dataset folder.
setwd("C:/Users/Downloads/global_data/dataset_2006")
# Shapefiles whose names match data_2006*.shp.
shp2006 <- list.files(pattern = 'data_2006.*\\.shp$', full.names = TRUE)
# Read each shapefile, then row-bind them with sf's rbind method.
listOfShp <- lapply(shp2006, st_read)
combinedShp <- do.call(what = sf:::rbind.sf, args=listOfShp)
#import and merge CSV files into one data frame
folderfiles <- list.files(pattern = 'csv_2006_.*\\.csv$', full.names = TRUE)
# set_names() names each path after itself so that .id = "file_name" records
# the originating file for every row of the combined table.
csv_data <- folderfiles %>%
set_names() %>%
map_dfr(.f = read_delim,
delim = ";",
.id = "file_name")
# Join shapes and csv rows on ID, keeping only rows labelled AR45T or GK879.
new_shp_2006 <- merge(combinedShp, csv_data , by = "ID") %>% filter(label %in% c("AR45T", "GK879"))
st_write(new_shp_2006 , "new_shp_2006.shp", overwrite = TRUE)
# ---- 2007 ----
# Same steps as above, with 2007 in the folder name, file patterns and output.
setwd("C:/Users/Downloads/global_data/dataset_2007")
shp2007 <- list.files(pattern = 'data_2007.*\\.shp$', full.names = TRUE)
listOfShp <- lapply(shp2007, st_read)
combinedShp <- do.call(what = sf:::rbind.sf, args=listOfShp)
#import and merge CSV files into one data frame
folderfiles <- list.files(pattern = 'csv_2007_.*\\.csv$', full.names = TRUE)
csv_data <- folderfiles %>%
set_names() %>%
map_dfr(.f = read_delim,
delim = ";",
.id = "file_name")
new_shp_2007 <- merge(combinedShp, csv_data , by = "ID") %>% filter(label %in% c("AR45T", "GK879"))
st_write(new_shp_2007 , "new_shp_2007.shp", overwrite = TRUE)
This is easy to achieve with a for-loop to loop over multiple items. To allow us to use wildcards, we can also add the function Sys.glob():
# Visit every directory matched by `directories`.
#
# directories: character vector of paths; wildcards are expanded by Sys.glob().
# Each matched path is printed; replace the print() with the real per-folder work.
myfunction <- function(directories) {
  matched_dirs <- Sys.glob(directories)
  for (current_dir in matched_dirs) {
    # do something with a single directory
    print(current_dir)
  }
}

# Directories can be listed explicitly:
myfunction(c('C:/Users/Downloads/global_data/dataset_2006',
             'C:/Users/Downloads/global_data/dataset_2007'))

# ...or picked up with a wildcard that matches every file/directory of that shape:
myfunction('C:/Users/Downloads/global_data/dataset_200*')

R plot for multiple files in folder

Using R, I want to read the files in a folder, perform some operations on each one, plot the result, and save it to a single CSV file (csv1).
For each subsequent file I want to perform the same operations and append the modified data frame to csv1 with rbind. Note that I want a single plot covering all the files read in the for loop, saved as a PDF.
Right now I am using the following code, but my system crashes due to a shortage of RAM.
# Full paths of the entries under /work/newplots.
all_paths <-
list.files(path = "/work/newplots",
pattern = "*.*",
full.names = TRUE)
# Bare file names, as a list.
all_filenames <- all_paths %>%
basename() %>%
as.list()
# NOTE(review): every file is read into memory here, before the loop starts
# (comma-separated, first 60 lines skipped).
all_content <-
all_paths %>%
lapply(read.table,
header = TRUE,
skip=60,
sep=',',
encoding = "UTF-8")
file <- data.frame()
for (i in 1:length(all_filenames)) {
# NOTE(review): this uses the whole of all_content on every iteration (with the
# loop index i appended to each element), not just the i-th file.
all_lists <- mapply(c, all_content, i, SIMPLIFY = FALSE)
data <- rbindlist(all_lists, fill = T)
names(data)[1] <- "File.Path"
x1 <- data %>% select(V1) %>% unique()
# Split File.Path on spaces into 23 columns and keep the rows whose first
# piece is "Interactions".
data <- data %>% data.frame(str_split_fixed(data$File.Path, " ", 23))%>% select(-c(File.Path))%>% filter(X1=='Interactions')
data<- cbind(x1,data)
data <- data %>% select(-c(2)) %>%select(V1,X2)
data$X2 <-as.numeric(data$X2)
# NOTE(review): the same path is written on every iteration without append, so
# each pass replaces the previous contents of the file.
file <- write.table(data,"/work/con1_10.csv",row.names = FALSE)
file <- append(file,data)
# Plot X2, then the derived Z = 2 * X2 / 20006.
p<-plot(data$X2, xlab="Cycle number",ylab="Interactions",type = "p")
print(p)
Z<- (2*data$X2)/20006
px<-plot(Z, xlab="Cycle number", ylab="Z")
print(px)
}

Import multiple CSV files with Softball statistics and plot the progress [duplicate]

I have written the following function to combine 300 .csv files. My directory name is "specdata". I have done the following steps for execution,
# Reads every csv in <working directory>/<directory> and row-binds them.
x <- function(directory) {
dir <- directory
data_dir <- paste(getwd(),dir,sep = "/")
files <- list.files(data_dir,pattern = '\\.csv')
tables <- lapply(paste(data_dir,files,sep = "/"), read.csv, header = TRUE)
# NOTE(review): pollutantmean is a local variable of x(); the combined table is
# only the (invisible) value of this assignment and is not stored outside the
# function.
pollutantmean <- do.call(rbind , tables)
}
# Step 2: call the function
x("specdata")
# Step 3: inspect results
head(pollutantmean)
Error in head(pollutantmean) : object 'pollutantmean' not found
What is my mistake? Can anyone please explain?
There's a lot of unnecessary code in your function. You can simplify it to:
# Read all csv files found in `path` and stack them into one data frame.
#
# path: directory to search.
# Returns the row-bound data frame (NULL when no file matches).
load_data <- function(path) {
  csv_paths <- list.files(path, pattern = '\\.csv', full.names = TRUE)
  frames <- vector("list", length(csv_paths))
  for (i in seq_along(csv_paths)) {
    frames[[i]] <- read.csv(csv_paths[[i]])
  }
  do.call(rbind, frames)
}

pollutantmean <- load_data("specdata")
Be aware that do.call + rbind is relatively slow. You might find dplyr::bind_rows or data.table::rbindlist to be substantially faster.
To update Prof. Wickham's answer above with code from the more recent purrr library which he coauthored with Lionel Henry:
# Read every csv file in the working directory and row-bind the results.
# list.files() expects a regular expression rather than a shell glob, so the
# extension is matched with "\\.csv$".
Tbl <-
  list.files(pattern = "\\.csv$") %>%
  map_df(~read_csv(.))
If automatic type guessing gives inconsistent column types across files, you can force every column to be read as character with this:
# Same as reading and binding every csv in the working directory, but with
# every column read as character so differing guessed types cannot clash.
# list.files() expects a regular expression rather than a shell glob, so the
# extension is matched with "\\.csv$".
Tbl <-
  list.files(pattern = "\\.csv$") %>%
  map_df(~read_csv(., col_types = cols(.default = "c")))
If you are wanting to dip into subdirectories to construct your list of files to eventually bind, then be sure to include the path name, as well as register the files with their full names in your list. This will allow the binding work to go on outside of the current directory. (Thinking of the full pathnames as operating like passports to allow movement back across directory 'borders'.)
# Read and bind every csv file found in ./subdirectory/, all columns as
# character. full.names = TRUE keeps the directory in each path so the files
# can be opened from the current working directory. list.files() expects a
# regular expression rather than a shell glob, hence "\\.csv$".
Tbl <-
  list.files(path = "./subdirectory/",
             pattern = "\\.csv$",
             full.names = TRUE) %>%
  map_df(~read_csv(., col_types = cols(.default = "c")))
As Prof. Wickham describes here (about halfway down):
map_df(x, f) is effectively the same as do.call("rbind", lapply(x, f)) but under the hood is much more efficient.
and a thank you to Jake Kaupp for introducing me to map_df() here.
This can be done very succinctly with dplyr and purrr from the tidyverse. Where x is a list of the names of your csv files you can simply use:
# Read each csv named in x, then stack the resulting data frames row-wise.
x %>%
  map(read.csv) %>%
  bind_rows()
Mapping read.csv to x produces a list of dfs that bind_rows then neatly combines!
```{r echo = FALSE, warning = FALSE, message = FALSE}
setwd("~/Data/R/BacklogReporting/data/PastDue/global/") ## where file are located
path <- "~/Data/R/BacklogReporting/data/PastDue/global/"

# Names of the csv files to stack; the dot is escaped and the pattern anchored
# so that only names ending in ".csv" match.
file.names <- dir(path, pattern = "\\.csv$")

# Read every file first and bind once. Seeding the result with "" and calling
# rbind() inside a loop would add a blank first row and copy the table on
# every iteration.
tables <- lapply(file.names, read.csv, header = TRUE, stringsAsFactors = FALSE)
out.file <- do.call(rbind, tables)

write.csv(out.file, file = "~/Data/R/BacklogReporting/data/PastDue/global/global_stacked/past_due_global_stacked.csv", row.names = FALSE) ## directory to write stacked file to
past_due_global_stacked <- read.csv("C:/Users/E550143/Documents/Data/R/BacklogReporting/data/PastDue/global/global_stacked/past_due_global_stacked.csv", stringsAsFactors = FALSE)

# Comma-separated list of the csv files in the working directory.
files <- list.files(pattern = "\\.csv$") %>% t() %>% paste(collapse = ", ")
```
If your csv files are in another directory, you could use something like this:
# Read every file in `directory` whose name matches `pattern` and stack them.
#
# directory: folder to search.
# pattern:   regular expression handed to list.files().
# Returns one data.table holding the rows of all matching files (an empty
# data.table when nothing matches).
readFilesInDirectory <- function(directory, pattern){
  # full.names = TRUE yields usable paths whether or not `directory` ends in "/".
  files <- list.files(path = directory, pattern = pattern, full.names = TRUE)
  # A single pass is enough: each file is read once and the list is bound once.
  tables <- lapply(files, fread, sep = ",")
  rbindlist(tables)
}
In your current function pollutantmean is available only in the scope of the function x. Modify your function to this
# Read every csv in <working directory>/<directory>, row-bind them, and store
# the result in a global variable named `pollutantmean`.
#
# directory: folder name, relative to the current working directory.
# Returns the combined data frame invisibly (the value of assign()).
# Side effect: creates or overwrites `pollutantmean` in the global environment.
x <- function(directory) {
  data_dir <- file.path(getwd(), directory)
  files <- list.files(data_dir, pattern = '\\.csv', full.names = TRUE)
  tables <- lapply(files, read.csv, header = TRUE)
  # assign() targets the calling function's own environment by default, so the
  # global environment has to be named explicitly for the variable to outlive
  # this call.
  assign('pollutantmean', do.call(rbind, tables), envir = .GlobalEnv)
}
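assign() stores the result of do.call(rbind, tables) in a variable called pollutantmean. Note that assign() writes to the environment it is called from by default, so pass envir = .GlobalEnv to make the variable appear in the global environment.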

file.info returning NA values in r

I am trying to get the size and other details of the files in my directory. However, file.info returns NA values for some files while returning details for others. Below is the code I used. Are there any administrator settings that need to be changed for these details to be fetched?
library(tidyr)
library(dplyr)
wd <- "F:\\working\\others"
setwd(wd)
#get file list - your list of files would be different
# NOTE(review): indexing with [1:240] yields NA entries when the folder holds
# fewer than 240 files; presumably those NA names are what file.info() reports
# as NA -- confirm against the actual file count.
fileList <- list.files()[1:240]
class(fileList)
#result
# Combine each file's size with its modification time, split the timestamp
# into date and time columns, and expose the row names as a FileName column.
cbind(
file.info(fileList)[,c("size"), drop=FALSE],
x = as.character(file.mtime(fileList))) %>%
separate(x,
into = c("DateModified","TimeModified"),
sep=" ") %>%
add_rownames %>%
select(DateModified,
TimeModified,
Size=size,
FileName=rowname)
Try this.
The trick is the full.names = TRUE argument:
# Resolve the directory to an absolute path, list its entries with their full
# paths, and collect the basic file metadata for each one.
ldir <- normalizePath("<type here the path of directory>")
finf <- file.info(
  list.files(path = ldir, full.names = TRUE),
  extra_cols = FALSE
)
# Open the result in the data viewer.
View(finf)

Resources