How to save multiple files in loop with original filenames - r

I'm trying to import several SAS datafiles from a folder and then save them back into the folder as R dataframes with the same original SAS dataset name. Everything works except I can't figure out how to save the file with the original file name (i.e., I can't figure out the x in > save(xxx, file = ...).
The code I've tried is as follows:
path <- "path to folder with sas files"
list.files(pattern=".sas7bdat$")
list.filenames<-list.files(pattern=".sas7bdat$")
for (i in 1:length(list.filenames)){
assign(list.filenames[i], read_sas(list.filenames[i]))
filename <- paste(list.filenames[i])
save(list.filenames[i],file = paste0(path, paste(list.filenames[i], "Rdat", sep = ".")))
}
doesn't work...
for (i in 1:length(list.filenames)){
assign(list.filenames[i], read_sas(list.filenames[i]))
filename <- paste(list.filenames[i])
save(list.filenames[[i]],file = paste0(path, paste(list.filenames[i], "Rdat", sep = ".")))
}
doesn't work
for (i in 1:length(list.filenames)){
assign(list.filenames[i], read_sas(list.filenames[i]))
filename <- paste(list.filenames[i])
save(filename,file = paste0(path, paste(list.filenames[i], "Rdat", sep = ".")))
}
Any help on figuring out how to save the files with the original names from list.filenames[i]?

Use the "list" argument of save. Something like
path <- "path to folder with sas files"
list.filenames <- list.files(path, pattern="\\.sas7bdat$")
for (i in list.filenames) {
datName <- tools::file_path_sans_ext(i)
assign(datName, read_sas(i))
save(list=datName, file = paste0(path, paste(datName, "Rdat", sep = ".")))
}
would work. Also, I imagine you want pattern=".sas7bdat$" as pattern="\\.sas7bdat$, since "." is a wildcard in regex.

Related

Apply a script by looping through multiple files in a directory and copy the changes back to every corresponding file

I want to apply the below script to every file in the Weather directory and copy the changes back to the same csv file (Bladen.csv in this case).
Bladen <- read.csv("C:/Users//Desktop/Weather/Bladen.csv",header=T, na.strings=c("","NA"))
Bladen <- Bladen[,c(1,6,11,17,18,19)]
I would try something like this:
setwd('/adress/to/the/path')
files <- dir()
for(i in files){
Bladen <- read.csv(i, header=T, na.strings=c("","NA"))
Bladen <- Bladen[,c(1,6,11,17,18,19)]
write.csv(Bladen, i)
}
Please tell me if it works for you.
If you are looking to update each file in your directory by adding the same column to each file and writing the file back to the same directory.
setwd(set_your_path)
filenames <- list.files()
lapply(filenames, function(i){
Bladen = read.csv(i, sep = ",", header = TRUE, na.strings = c("NA","N/A","null",""," "))
Bladen<- Bladen[, c(1,6,11,17,18,19)]
write.csv(Bladen, i, sep = ",")
})

Combine csv files with common file identifier

I have a list of approximately 500 csv files each with a filename that consists of a six-digit number followed by a year (ex. 123456_2015.csv). I would like to append all files together that have the same six-digit number. I tried to implement the code suggested in this question:
Import and rbind multiple csv files with common name in R but I want the appended data to be saved as new csv files in the same directory as the original files are currently saved. I have also tried to implement the below code however the csv files produced from this contain no data.
rm(list=ls())
filenames <- list.files(path = "C:/Users/smithma/Desktop/PM25_test")
NAPS_ID <- gsub('.+?\\([0-9]{5,6}?)\\_.+?$', '\\1', filenames)
Unique_NAPS_ID <- unique(NAPS_ID)
n <- length(Unique_NAPS_ID)
for(j in 1:n){
curr_NAPS_ID <- as.character(Unique_NAPS_ID[j])
NAPS_ID_pattern <- paste(".+?\\_(", curr_NAPS_ID,"+?)\\_.+?$", sep = "" )
NAPS_filenames <- list.files(path = "C:/Users/smithma/Desktop/PM25_test", pattern = NAPS_ID_pattern)
write.csv(do.call("rbind", lapply(NAPS_filenames, read.csv, header = TRUE)),file = paste("C:/Users/smithma/Desktop/PM25_test/MERGED", "MERGED_", Unique_NAPS_ID[j], ".csv", sep = ""), row.names=FALSE)
}
Any help would be greatly appreciated.
Because you're not doing any data manipulation, you don't need to treat the files like tabular data. You only need to copy the file contents.
filenames <- list.files("C:/Users/smithma/Desktop/PM25_test", full.names = TRUE)
NAPS_ID <- substr(basename(filenames), 1, 6)
Unique_NAPS_ID <- unique(NAPS_ID)
for(curr_NAPS_ID in Unique_NAPS_ID){
NAPS_filenames <- filenames[startsWith(basename(filenames), curr_NAPS_ID)]
output_file <- paste0(
"C:/Users/nwerth/Desktop/PM25_test/MERGED_", curr_NAPS_ID, ".csv"
)
for (fname in NAPS_filenames) {
line_text <- readLines(fname)
# Write the header from the first file
if (fname == NAPS_filenames[1]) {
cat(line_text[1], '\n', sep = '', file = output_file)
}
# Append every line in the file except the header
line_text <- line_text[-1]
cat(line_text, file = output_file, sep = '\n', append = TRUE)
}
}
My changes:
list.files(..., full.names = TRUE) is usually the best way to go.
Because the digits appear at the start of the filenames, I suggest substr. It's easier to get an idea of what's going on when skimming the code.
Instead of looping over the indices of a vector, loop over the values. It's more succinct and less likely to cause problems if the vector's empty.
startsWith and endsWith are relatively new functions, and they're great.
You only care about copying lines, so just use readLines to get them in and cat to get them out.
You might consider something like this:
##will take the first 6 characters of each file name
six.digit.filenames <- substr(filenames, 1,6)
path <- "C:/Users/smithma/Desktop/PM25_test/"
unique.numbers <- unique(six.digit.filenames)
for(j in unique.numbers){
sub <- filenames[which(substr(filenames,1,6) == j)]
data.for.output <- c()
for(file in sub){
##now do your stuff with these files including read them in
data <- read.csv(paste0(path,file))
data.for.output <- rbind(data.for.output,data)
}
write.csv(data.for.output,paste0(path,j, '.csv'), row.names = F)
}

Looping through files using dynamic name variable in R

I have a large number of files to import which are all saved as zip files.
From reading other posts it seems I need to pass the zip file name and then the name of the file I want to open. Since I have a lot of them I thought I could loop through all the files and import them one by one.
Is there a way to pass the name dynamically or is there an easier way to do this?
Here is what I have so far:
Temp_Data <- NULL
Master_Data <- NULL
file.names <- c("f1.zip", "f2.zip", "f3.zip", "f4.zip", "f5.zip")
for (i in 1:length(file.names)) {
zipFile <- file.names[i]
dataFile <- sub(".zip", ".csv", zipFile)
Temp_Data <- read.table(unz(zipFile,
dataFile), sep = ",")
Master_Data <- rbind(Master_Data, Temp_Data)
}
I get the following error:
In open.connection(file, "rt") :
I can import them manually using:
dt <- read.table(unz("D:/f1.zip", "f1.csv"), sep = ",")
I can create the sting dynamically but it feels long winded - and doesn't work when I wrap it with read.table(unz(...)). It seems it can't find the file name and so throws an error
cat(paste(toString(shQuote(paste("D:/",zipFile, sep = ""))),",",
toString(shQuote(dataFile)), sep = ""), "\n")
But if I then print this to the console I get:
"D:/f1.zip","f1.csv"
I can then paste this into `read.table(unz(....)) and it works so I feel like I am close
I've tagged in data.table since this is what I almost always use so if it can be done with 'fread' that would be great.
Any help is appreciated
you can use the list.files command here:
first set your working directory, where all your files are stored there:
setwd("C:/Users/...")
then
file.names = list.files(pattern = "*.zip", recursive = F)
then your for loop will be:
for (i in 1:length(file.names)) {
#open the files
zipFile <- file.names[i]
dataFile <- sub(".zip", ".csv", zipFile)
Temp_Data <- read.table(unz(zipFile,
dataFile), sep = ",")
# your function for the opened file
Master_Data <- rbind(Master_Data, Temp_Data)
#write the file finaly
write_delim(x = Master_Data, path = paste(file.names[[i]]), delim = "\t",
col_names = T )}

How to append multiple files in R

I'm trying to read a list of files and append them into a new file with all the records. I do not intend to change anything in the original files. I've tried couple of methods.
Method 1: This methods creates a new file but at each iteration the previous file gets added again. Because I'm binding the data frame recursively.
files <- list.files(pattern = "\\.csv$")
#temparary data frame to load the contents on the current file
temp_df <- data.frame(ModelName = character(), Object = character(),stringsAsFactors = F)
#reading each file within the range and append them to create one file
for (i in 1:length(files)){
#read the file
currentFile = read.csv(files[i])
#Append the current file
temp_df = rbind(temp_df, currentFile)
}
#writing the appended file
write.csv(temp_df,"Models_appended.csv",row.names = F,quote = F)
Method 2: I got this method from Rbloggers . This methods won't write to a new file but keeps on modifying the original file.
multmerge = function(){
filenames= list.files(pattern = "\\.csv$")
datalist = lapply(filenames, function(x){read.csv(file=x,header=T)})
Reduce(function(x,y) {merge(x,y)}, temp_df)
}
Can someone advice me on how to achieve my goal?
it could look like this:
files <- list.files(pattern = "\\.csv$")
DF <- read.csv(files[1])
#reading each file within the range and append them to create one file
for (f in files[-1]){
df <- read.csv(f) # read the file
DF <- rbind(DF, df) # append the current file
}
#writing the appended file
write.csv(DF, "Models_appended.csv", row.names=FALSE, quote=FALSE)
or short:
files <- list.files(pattern = "\\.csv$")
DF <- read.csv(files[1])
for (f in files[-1]) DF <- rbind(DF, read.csv(f))
write.csv(DF, "Models_appended.csv", row.names=FALSE, quote=FALSE)
You can use this to load everything into one data set.
dataset <- do.call("rbind", lapply(file.list, FUN = function(file) {
read.table(file, header=TRUE, sep="\t")
}))
And then just save with write.csv.
Or you could go ahead and use shell commands in R:
system2("cat", args = "*.csv", stdout = "appendedfiles.csv")
This would be for unix based systems; I'm not sure what you would do for windows.
Try this for your ListOfFileNames:
ListOfFileNames<-list.files(pattern=".txt")
outFile <- file("all.txt", "w")
for (i in ListOfFileNames){
x <- readLines(i)
writeLines(x, outFile) # in the link the 1st and last line are skipped
}
close(outFile)
Source: https://r.789695.n4.nabble.com/R-Read-multiple-text-files-and-combine-into-single-file-td817344.html
There's an easy answer with rbind_list() now!
For instance:
dataFiles = map(Sys.glob("*.csv"), read.csv)
or however you're reading the files into a list
dat = rbind_list(dataFiles)
and dat will be what you're looking for!
If using Windows, it's easy to do this using the command prompt.
Start -> Run -> type "cmd" and hit return key
cd <path to folder>
copy /b *.txt <outputname>.txt
Concrete example:
cd C:\User\danny\docs\folder_with_txt files
copy /b *.txt concatenated.txt
Note that if you're changing drive letters to do this before cd
D:\> c:
C:\> cd C:\User\danny\docs\folder_with_txt files
copy /b *.txt concatenated.txt

Automate zip file reading in R

I need to automate R to read a csv datafile that's into a zip file.
For example, I would type:
read.zip(file = "myfile.zip")
And internally, what would be done is:
Unzip myfile.zip to a temporary folder
Read the only file contained on it using read.csv
If there is more than one file into the zip file, an error is thrown.
My problem is to get the name of the file contained into the zip file, in orded to provide it do the read.csv command. Does anyone know how to do it?
UPDATE
Here's the function I wrote based on #Paul answer:
read.zip <- function(zipfile, row.names=NULL, dec=".") {
# Create a name for the dir where we'll unzip
zipdir <- tempfile()
# Create the dir using that name
dir.create(zipdir)
# Unzip the file into the dir
unzip(zipfile, exdir=zipdir)
# Get the files into the dir
files <- list.files(zipdir)
# Throw an error if there's more than one
if(length(files)>1) stop("More than one data file inside zip")
# Get the full name of the file
file <- paste(zipdir, files[1], sep="/")
# Read the file
read.csv(file, row.names, dec)
}
Since I'll be working with more files inside the tempdir(), I created a new dir inside it, so I don't get confused with the files. I hope it may be useful!
Another solution using unz:
read.zip <- function(file, ...) {
zipFileInfo <- unzip(file, list=TRUE)
if(nrow(zipFileInfo) > 1)
stop("More than one data file inside zip")
else
read.csv(unz(file, as.character(zipFileInfo$Name)), ...)
}
You can use unzip to unzip the file. I just mention this as it is not clear from your question whether you knew that. In regard to reading the file. Once your extracted the file to a temporary dir (?tempdir), just use list.files to find the files that where dumped into the temporary directory. In your case this is just one file, the file you need. Reading it using read.csv is then quite straightforward:
l = list.files(temp_path)
read.csv(l[1])
assuming your tempdir location is stored in temp_path.
I found this thread as I was trying to automate reading multiple csv files from a zip. I adapted the solution to the broader case. I haven't tested it for weird filenames or the like, but this is what worked for me so I thought I'd share:
read.csv.zip <- function(zipfile, ...) {
# Create a name for the dir where we'll unzip
zipdir <- tempfile()
# Create the dir using that name
dir.create(zipdir)
# Unzip the file into the dir
unzip(zipfile, exdir=zipdir)
# Get a list of csv files in the dir
files <- list.files(zipdir)
files <- files[grep("\\.csv$", files)]
# Create a list of the imported csv files
csv.data <- sapply(files, function(f) {
fp <- file.path(zipdir, f)
return(read.csv(fp, ...))
})
return(csv.data)}
If you have zcat installed on your system (which is the case for linux, macos, and cygwin) you could also use:
zipfile<-"test.zip"
myData <- read.delim(pipe(paste("zcat", zipfile)))
This solution also has the advantage that no temporary files are created.
Here is an approach I am using that is based heavily on #Corned Beef Hash Map 's answer. Here are some of the changes I made:
My approach makes use of the data.table package's fread(), which
can be fast (generally, if it's zipped, sizes might be large, so you
stand to gain a lot of speed here!).
I also adjusted the output format so that it is a named list, where
each element of the list is named after the file. For me, this was a
very useful addition.
Instead of using regular expressions to sift through the files
grabbed by list.files, I make use of list.file()'s pattern
argument.
Finally, I by relying on fread() and by making pattern an
argument to which you could supply something like "" or NULL or
".", you can use this to read in many types of data files; in fact,
you can read in multiple types of at once (if your .zip contains
.csv, .txt in you want both, e.g.). If there are only some types of
files you want, you can specify the pattern to only use those, too.
Here is the actual function:
read.csv.zip <- function(zipfile, pattern="\\.csv$", ...){
# Create a name for the dir where we'll unzip
zipdir <- tempfile()
# Create the dir using that name
dir.create(zipdir)
# Unzip the file into the dir
unzip(zipfile, exdir=zipdir)
# Get a list of csv files in the dir
files <- list.files(zipdir, rec=TRUE, pattern=pattern)
# Create a list of the imported csv files
csv.data <- sapply(files,
function(f){
fp <- file.path(zipdir, f)
dat <- fread(fp, ...)
return(dat)
}
)
# Use csv names to name list elements
names(csv.data) <- basename(files)
# Return data
return(csv.data)
}
The following refines the above answers. FUN could be read.csv, cat, or anything you like, providing the first argument will accept a file path. E.g.
head(read.zip.url("http://www.cms.gov/Medicare/Coding/ICD9ProviderDiagnosticCodes/Downloads/ICD-9-CM-v32-master-descriptions.zip", filename = "CMS32_DESC_LONG_DX.txt"))
read.zip.url <- function(url, filename = NULL, FUN = readLines, ...) {
zipfile <- tempfile()
download.file(url = url, destfile = zipfile, quiet = TRUE)
zipdir <- tempfile()
dir.create(zipdir)
unzip(zipfile, exdir = zipdir) # files="" so extract all
files <- list.files(zipdir)
if (is.null(filename)) {
if (length(files) == 1) {
filename <- files
} else {
stop("multiple files in zip, but no filename specified: ", paste(files, collapse = ", "))
}
} else { # filename specified
stopifnot(length(filename) ==1)
stopifnot(filename %in% files)
}
file <- paste(zipdir, files[1], sep="/")
do.call(FUN, args = c(list(file.path(zipdir, filename)), list(...)))
}
Another approach that uses fread from the data.table package
fread.zip <- function(zipfile, ...) {
# Function reads data from a zipped csv file
# Uses fread from the data.table package
## Create the temporary directory or flush CSVs if it exists already
if (!file.exists(tempdir())) {dir.create(tempdir())
} else {file.remove(list.files(tempdir(), full = T, pattern = "*.csv"))
}
## Unzip the file into the dir
unzip(zipfile, exdir=tempdir())
## Get path to file
file <- list.files(tempdir(), pattern = "*.csv", full.names = T)
## Throw an error if there's more than one
if(length(file)>1) stop("More than one data file inside zip")
## Read the file
fread(file,
na.strings = c(""), # read empty strings as NA
...
)
}
Based on the answer/update by #joão-daniel
unzipped file location
outDir<-"~/Documents/unzipFolder"
get all the zip files
zipF <- list.files(path = "~/Documents/", pattern = "*.zip", full.names = TRUE)
unzip all your files
purrr::map(.x = zipF, .f = unzip, exdir = outDir)
I just wrote a function based on top read.zip that may help...
read.zip <- function(zipfile, internalfile=NA, read.function=read.delim, verbose=TRUE, ...) {
# function based on http://stackoverflow.com/questions/8986818/automate-zip-file-reading-in-r
# check the files within zip
unzfiles <- unzip(zipfile, list=TRUE)
if (is.na(internalfile) || is.numeric(internalfile)) {
internalfile <- unzfiles$Name[ifelse(is.na(internalfile),1,internalfile[1])]
}
# Create a name for the dir where we'll unzip
zipdir <- tempfile()
# Create the dir using that name
if (verbose) catf("Directory created:",zipdir,"\n")
dir.create(zipdir)
# Unzip the file into the dir
if (verbose) catf("Unzipping file:",internalfile,"...")
unzip(zipfile, file=internalfile, exdir=zipdir)
if (verbose) catf("Done!\n")
# Get the full name of the file
file <- paste(zipdir, internalfile, sep="/")
if (verbose)
on.exit({
catf("Done!\nRemoving temporal files:",file,".\n")
file.remove(file)
file.remove(zipdir)
})
else
on.exit({file.remove(file); file.remove(zipdir);})
# Read the file
if (verbose) catf("Reading File...")
read.function(file, ...)
}

Resources