Extract folder names inside of *.rar and *.zip files - r

I have a folder with multiple *.rar and *.zip files.
Each *.rar and *.zip file contains one folder, and inside that folder there are multiple folders.
I would like to generate a dataset with the names of these multiple folders.
How can I do this using R?
I am trying:
# Find every .zip archive in the working directory, then list (without
# extracting) the entries stored in each one.
temp <- list.files(pattern = "\\.zip$")
lapply(temp, unzip, list = TRUE)
But it returns:
I would like to get just the names: "Nova pasta1" and "Nova pasta2"
Thanks

Let's create a simple set of directories/files that are representative of your own. You described having a single .zip file that contains multiple zipped directories, which may contain unzipped files and/or sub-directories.
# Build a small directory tree that mimics the layout from the question:
# main_dir/ holds two example directories, and the first of them has a
# nested sub-directory of its own. Parents are listed before children so
# each dir.create() call finds its parent already in place.
example_dirs <- c(
  "main_dir",
  "main_dir/example_dir1",
  "main_dir/example_dir1/example_subdir",
  "main_dir/example_dir2"
)
for (example_dir in example_dirs) {
  dir.create(example_dir)
}
# Put one small CSV file into each directory below main_dir.
example_files <- list(
  "main_dir/example_dir1/example_file.csv" = data.frame(x = 5),
  "main_dir/example_dir1/example_subdir/example_subdirfile.csv" = data.frame(x = 5),
  "main_dir/example_dir2/example_file2.csv" = data.frame(x = "foo")
)
for (example_file in names(example_files)) {
  write.csv(example_files[[example_file]], file = example_file)
}
# NOTE: I was having issues with using `zip()` to zip each directory
# then the main (top) directory, so I manually zipped them below.
# Manually zip example_dir1 and example_dir2, then zip main_dir at this point.
Given this structure, we can get the paths to all of the directories within the highest level directory (main_dir) using unzip(list = TRUE) since we know the name of the single zipped directory containing all of these additional zipped sub-directories.
# List the contents of the highest level archive (nothing is extracted) to
# get all of the .zip dirs within. The code below relies on the `Name`
# column of the data frame that unzip(list = TRUE) returns.
ex_path <- "main_dir"
all_zips <- unzip(zipfile = paste0(ex_path, ".zip"), list = TRUE)
all_zips
# We can remove the main_path string if we want, so that we only see
# the zip files within our main directory instead of the full path.
library(dplyr)
main_prefix <- paste0(ex_path, "/")
all_zips %>%
  # Drop the entry for the top-level directory itself
  filter(Name != main_prefix) %>%
  # fixed = TRUE: the directory name is matched literally rather than being
  # interpreted as a regular expression (e.g. a "." in the name)
  mutate(Name = sub(main_prefix, "", Name, fixed = TRUE))
If you had multiple zipped directories with nested directories similar to main_dir, you could just put their paths in a list and apply the function to each element of the list. Below I reproduce this.
# Example of multiple zip directory paths in a list
ziplist <- list(ex_path, ex_path, ex_path)
# For each archive, list its entries, record which archive they came from,
# drop the top-level directory entry, and strip that directory's prefix.
lapply(ziplist, function(x) {
  # Build the prefix from the element being processed (`x`), not from the
  # global `ex_path`, so that lists of different archives work correctly.
  prefix <- paste0(x, "/")
  unzip(zipfile = paste0(x, ".zip"), list = TRUE) %>%
    mutate(main_path = x) %>%
    filter(Name != prefix) %>%
    # fixed = TRUE: match the directory name literally, not as a regex
    mutate(Name = sub(prefix, "", Name, fixed = TRUE))
})
If all of the .zip files in the current working directory are files you want to do this for, you can get ziplist above via:
# `pattern` is a regular expression: escape the dot and anchor the extension
# so that only names ending in ".zip" are returned.
list.files(pattern = "\\.zip$") %>% as.list()

I appreciate all help, but I think that I found a short way to solve my question.
# Archives in the working directory. `pattern` is a regular expression, so
# the dot is escaped and the extension is anchored to the end of the name.
temp.zip <- list.files(pattern = "\\.zip$")
temp.rar <- list.files(pattern = "\\.rar$")
# For every archive, list its entries and keep the text between the first
# and the last "/" of each entry, i.e. the folder(s) below the top-level one.
# NOTE(review): untar() reading .zip/.rar presumably depends on the tar
# implementation available on the system - confirm on the target machine.
mydata <- lapply(c(temp.rar, temp.zip), function(x) {
  entries <- unlist(untar(tarfile = x, list = TRUE))
  inner_dirs <- stringr::str_extract(entries, "(?<=/).*(?=/)")
  # Entries without two slashes give NA; drop them and de-duplicate
  unique(c(na.omit(inner_dirs)))
})
unlist(mydata)
Thanks all

Related

How to match file names in directory on R with names in CSV column

I am trying to write an r script that will match the file name inside a directory and compare it to a file name located in a csv file. This is so I can tell what files have already been downloaded and what data I need to download. I have written code that will read the files from the directory and list them as a df as well as reading in the csv file. However I am having trouble changing the file name to pull out the string I want as well as matching the file name with the name column in the csv file. I also would want to ideally create a new spread sheet that can tell me what files match so I know what has been downloaded. This is what I have so far.
# read files from directory and list as df
file_names <- list.files(
  path = "peaches/",
  pattern = "jpg",
  all.files = TRUE,
  full.names = TRUE,
  recursive = TRUE
) %>%
  # turn into df: the pipe already supplies the vector as the first argument,
  # so it must not be passed again as `x = file_names` (that name does not
  # exist yet while the right-hand side is being evaluated)
  as.data.frame()
# read in xl file
name_data <- read_excel("peaches/all_data.xlsx")
# change the file_name from the string peaches//fruit/1234/12pink.jpg.txt to -> 12pink
# match the file name with the name column in name_data
# create a new spread sheet that pulls the id and row if it has been downloaded [enter image description here][1]
Example files/directory
Let's create an example directory with some example files. This will let us prove that the solution works and is key to a reproducible solution.
library(dplyr)
library(writexl)
library(readxl)
# Example directory with example files. The names mirror the question's
# "<name>.jpg.<ext>" layout so that the ".jpg" pattern used in the solution
# matches exactly two of them (foo and foo2); the others must be ignored.
dir.create(path = "peaches")
write.csv(data.frame(x = 5), file = "peaches/foo.csv")
write.csv(data.frame(x = 20), file = "peaches/foo.jpg.csv")
write.csv(data.frame(x = 1), file = "peaches/foo2.jpg.csv")
write.csv(data.frame(z = 2), file = "peaches/bar.csv")
write.csv(data.frame(z = 5), file = "peaches/bar.gpj.csv")
# Example Excel file: one name that has a matching file, one that has not
write_xlsx(
  data.frame(name = c("foo", "hotdog")),
  path = "peaches/all_data.xlsx"
)
Solution
We can now use our example files and directory to show a solution to the problem.
# Collect, in a data frame, the paths of all files below peaches/ whose name
# contains ".jpg". data.frame() is used (not as.data.frame()) so the column
# gets an explicit name, and the period is escaped with \\ because `pattern`
# is a regular expression.
file_names <- data.frame(
  paths = list.files(
    path = "peaches/",
    pattern = "\\.jpg",
    all.files = TRUE,
    full.names = TRUE,
    recursive = TRUE
  )
)
# Add a column holding the part of each file name that comes before ".jpg":
# basename() removes the directory part, and the regular expression removes
# ".jpg" together with everything after it.
file_names$match_string <- gsub(
  pattern = "\\.jpg.*",
  replacement = "",
  basename(pull(file_names, paths))
)
file_names$match_string
#> [1] "foo" "foo2"
# Read in the Excel file listing the names to look for
name_data <- read_excel("peaches/all_data.xlsx")
name_data$name
#> [1] "foo" "hotdog"
# Flag each listed name as matched (1) or not (0) and number the rows
name_data <- mutate(
  name_data,
  matched = if_else(name %in% file_names$match_string, 1, 0),
  rowID = row_number()
)
# Write the rows that were matched (already downloaded) to a new spreadsheet
already_downloaded <- filter(name_data, matched == 1)
write_xlsx(already_downloaded, path = "peaches/already_downloaded.xlsx")

Move csv files to specific named folder in archive

I have 16 folders, each named after a specific person, in the Risk/Archive/ folder, and I want to copy my Excel files (whose names also contain the person's name) from the Risk/ folder into the matching folder under Risk/Archive/.
I'm using below code but it's not what i want to accomplish.
# Everything in Risk/ and every directory below Risk/Archive
f <- list.files("Risk/")
d <- list.dirs("Risk/Archive")
if (length(f) > 0) {
  # Read each workbook (first sheet, header on row 2, first 73 columns),
  # copy the workbook into the archive root, and keep the data that was read.
  File <- lapply(paste0("Risk/", f), function(workbook) {
    sheet_data <- read.xlsx(
      workbook,
      sheet = 1,
      startRow = 2,
      colNames = TRUE,
      check.names = FALSE,
      cols = c(1:73)
    )
    file.copy(
      from = workbook,
      to = "Risk/Archive/",
      overwrite = TRUE,
      recursive = FALSE,
      copy.mode = TRUE
    )
    sheet_data
  })
  # Stack the per-workbook data frames into one
  File <- do.call("rbind.data.frame", File)
}
There might be a better way to do this, but if I understand correctly, I think this should do the trick:
# Get list of names of people: one sub-directory of the archive per person
names <- list.dirs(
  path = "./Risk/Archive",
  full.names = FALSE,
  recursive = FALSE
)
# Get list of files to copy. `pattern` is a regular expression, so the dot
# is escaped and the extension anchored to the end of the file name.
files <- list.files(
  path = "./Risk",
  pattern = "\\.xlsx$",
  full.names = TRUE
)
# Loop through each name and move the file(s) for that person.
# Iterating over the vector itself is safe when there are no folders at all
# (1:length(names) would run with indices 1 and 0 in that case).
for (cname in names) {
  # Files whose *name* contains the current person's name; fixed = TRUE so
  # the name is matched literally rather than as a regular expression
  matched <- files[grepl(cname, basename(files), fixed = TRUE)]
  if (length(matched) == 0) {
    cat(paste0("No file found for: ", cname, "\n"))
    next
  }
  # Archive folder for the current name
  name.path <- paste0("./Risk/Archive/", cname)
  # Copy from "Risk" to the person's "Archive" folder
  copied <- file.copy(from = matched, to = name.path, overwrite = TRUE)
  # Remove only the originals that were actually copied, so a failed copy
  # never results in a lost file
  file.remove(matched[copied])
  if (!all(copied)) {
    warning(
      "Could not archive: ", paste(matched[!copied], collapse = ", "),
      call. = FALSE
    )
    next
  }
  # Output message
  cat(paste0("Moved file for: ", cname, "\n"))
}

Add a suffix to filenames based on subfolder names within a directory in R

I have a number of (sub)folders stored within a directory folder. Each subfolder contains 5-35 .jpg aerial photograph files that are named by flightline name and number (ie: bej-3-83). I would like to add a suffix to each of these files based upon the subfolder they are stored upon. For example if 'bej-3-83' is stored within 'T13N_10W' subfolder I would like my R script to rename 'bej-3-83' as 'bej-3-83-T13N_10W' and so forth for each file stored within each subfolder.
I can partially accomplish this process albeit still with more manual input than I'd like using this script:
# One sub-folder, hard-coded; its .jpg files are listed with full paths
folder <- "C:\\...\\T23N_R14W"
files <- list.files(folder, pattern = "\\.jpg$", full.names = TRUE)
files
# Rename each file, replacing "_clip" in its path with the (manually typed)
# folder name. The result is a logical per file, named by the original path.
rename_with_folder <- function(eachPath) {
  renamed <- sub(pattern = "_clip", "_T23N_R14W", eachPath)
  file.rename(from = eachPath, to = renamed)
}
sapply(files, FUN = rename_with_folder)
But as you can see this script uses a manual paste input of the subfolder name which isn't useful when you're trying to create a script that does what I need in one fell swoop.
I'm seeing similar questions and answers which utilize 'pushd' and 'popd', and I've attached two of those threads below as links. I'm trying to read as much as I can about these commands, but so far I'm stuck on how to make the process work.
How to rename files in folders to foldername using batch file
Rename Files Based On Folder Name
Sincerely,
Henry
You might have to change the dir_separator to \ on windows:
# Build the new path for a .jpg file by prefixing the file name with the
# name of the directory that contains it, e.g.
#   "./data/T13N_10W/bej-3-83.jpg" -> "./data/T13N_10W/T13N_10W_bej-3-83.jpg"
#
# file_path: path to a .jpg file whose components are separated by `sep`.
# sep:       path separator; defaults to the script-level `dir_separator`.
# Returns the new path as a single string.
make_filename <- function(file_path, sep = dir_separator) {
  # fixed = TRUE: the separator is matched literally, so a backslash
  # separator is not parsed as an (invalid) regular expression
  s <- unlist(strsplit(file_path, sep, fixed = TRUE))
  fname <- gsub("\\.jpg$", "", s[length(s)])
  parent_dir <- s[(length(s) - 1)]
  new_fname <- paste0(parent_dir, "_", fname, ".jpg")
  path <- paste(s[-length(s)], collapse = sep)
  paste(path, new_fname, sep = sep)
}
folder <- "./data"
dir_separator <- "/"
# Let list.files() build the full paths and restrict it to .jpg files:
# - an empty folder then yields character(0) instead of the string "./data/"
#   (which would have caused the folder itself to be renamed), and
# - files of other types no longer get ".jpg" appended to their name.
# NOTE(review): the returned paths appear to be "/"-separated on every
# platform, so `dir_separator` should presumably stay "/" - confirm on Windows.
files <- list.files(
  folder,
  pattern = "\\.jpg$",
  recursive = TRUE,
  full.names = TRUE
)
sapply(files, function(x) file.rename(from = x, to = make_filename(x)))
A recursive approach.
Pass the path to the root folder containing your files and the extension of the files you want to rename, to rename_batch.
Defaults are working directory and jpeg.
library(stringr)
# An auxiliary function: insert "-<extra>" between a file's name and its
# extension, e.g. rename_file("a/b.jpg", "X") gives "a/b-X.jpg".
#
# str:   file path(s).
# extra: text to append to the name (here: the containing folder's name).
rename_file <- function(str, extra) {
  file_name <- tools::file_path_sans_ext(str)
  file_ext <- tools::file_ext(str)
  paste0(file_name, "-", extra, ".", file_ext)
}

# Recursively append the name of the containing folder to every file with
# the given extension found in `path` and in all of its sub-folders.
#
# path:      root folder to start from (a trailing "/" is allowed).
# extension: file extension to rename, without the leading dot.
# Called for its side effect (renaming files); returns NULL invisibly.
rename_batch <- function(path = "./", extension = "jpeg") {
  # Remove trailing slash(es) first, for consistency; everything below,
  # including the folder-name lookup, works on the cleaned path.
  path <- sub("/+$", "", path)
  if (!nzchar(path)) {
    path <- "/"
  }
  # Name of the folder being processed. normalizePath() resolves relative
  # paths such as ".", for which no folder name could be extracted before
  # (the files were then renamed with an "NA" suffix).
  present_folder <- basename(normalizePath(path, mustWork = FALSE))

  # Separate files from folders; the extension must end the file name
  entries <- list.files(path)
  files <- entries[grepl(paste0("\\.", extension, "$"), entries)]
  folders <- list.dirs(path, full.names = FALSE, recursive = FALSE)

  if (length(files) > 0) {
    file_update <- paste0(path, "/", files)
    file.rename(file_update, rename_file(file_update, present_folder))
  }
  for (folder in folders) {
    i <- paste0(path, "/", folder)
    cat("Renaming in:", i, "\n")
    # Pass the extension down; otherwise sub-folders silently fall back to
    # the default "jpeg"
    rename_batch(i, extension = extension)
  }
  invisible(NULL)
}

Read the file created/modified last in different directories in R

I want to read the most recently modified (or created) CSV file in each of several different directories and then put them into a pre-existing single data frame (df_total).
I have two kinds of directories to read:
A:/LogIIS/FOLDER01/"files.csv"
In others there are sub-folders, each containing several files.csv, as in the example below:
"A:/LogIIS/FOLDER02/FOLDER_A/"files.csv"
"A:/LogIIS/FOLDER02/FOLDER_B/"files.csv"
"A:/LogIIS/FOLDER02/FOLDER_C/"files.csv"
"A:/LogIIS/FOLDER03/FOLDER_A/"files.csv"
"A:/LogIIS/FOLDER03/FOLDER_B/"files.csv"
"A:/LogIIS/FOLDER03/FOLDER_C/"files.csv"
"A:/LogIIS/FOLDER03/FOLDER_D/"files.csv"
Something like this...
# Full paths of all matching files anywhere below the log root
files <- list.files(
  path = "A:/LogIIS",
  pattern = "files.csv",
  full.names = TRUE,
  recursive = TRUE
)
# The directory containing each file is the grouping key
dirs <- dirname(files)
# Within each directory, keep the file with the latest modification time
pick_latest <- function(group) {
  newest <- which.max(file.mtime(group))
  group[newest]
}
lastfiles <- tapply(X = files, INDEX = dirs, FUN = pick_latest)
You can then loop through these and read them in.
If you just want the latest file overall, this will be files[which.max(file.mtime(files))].
Here a tidyverse-friendly solution
# Return the path(s) of the most recently modified file in data/.
# List the files with their full paths ...
list.files("data/",full.names = T) %>%
# ... put them in a tibble (the last step pulls its `value` column) ...
enframe(name = NULL) %>%
# ... call file.info() once per row and bind the resulting metadata columns
# next to the paths (`.` is the tibble coming in from the previous step) ...
bind_cols(pmap_df(., file.info)) %>%
# ... keep the row(s) whose modification time is the maximum ...
filter(mtime==max(mtime)) %>%
# ... and return the file path(s) as a plain vector.
pull(value)
Consider creating a data frame of files as file.info maintains OS file system metadata per path such as created time:
# Read the latest file of every directory below A:/LogIIS into one data frame.
# NOTE(review): setwd() changes the working directory for the rest of the
# session; it is not restored afterwards.
setwd("A:/LogIIS")
files <- list.files(getwd(), full.names = TRUE, recursive = TRUE)
# DATA FRAME OF FILE, DIR, AND METADATA (one row per file; the file.info()
# row names are dropped so the path only lives in the `file` column)
filesdf <- cbind(file=files,
dir=dirname(files),
data.frame(file.info(files), row.names =NULL),
stringsAsFactors=FALSE)
# SORT BY DIR AND CTIME (DESC), so that the first row of each dir is its
# latest file; xtfrm() turns the timestamps into numbers that can be negated.
# NOTE(review): what `ctime` means is platform dependent (see ?file.info) -
# confirm it is the timestamp you want, or use `mtime`.
filesdf <- with(filesdf, filesdf[order(dir, -xtfrm(ctime)),])
# AGGREGATE LATEST FILE PER DIR: for every column, keep the first value
# within each dir; this relies on the sort order established above
latestfiles <- aggregate(.~dir, filesdf, FUN=function(i) head(i)[[1]])
# LOOP THROUGH LATEST FILE VECTOR FOR IMPORT and stack the results row-wise
df_total <- do.call(rbind, lapply(latestfiles$file, read.csv))
Here is a pipe-friendly way to get the most recent file in a folder. It uses an anonymous function which in my view is slightly more readable than a one-liner. file.mtime is faster than file.info(fpath)$ctime.
# List the folder with full paths (on W, use pattern="^your_pattern") and
# hand the paths to an anonymous function that picks the newest one.
dir(path = "your_path_goes_here", full.names = TRUE) %>%
  (function(candidates) {
    # Modification times; use file.info(candidates)$ctime for file CREATED time
    modified <- file.mtime(candidates)
    # The most recent file path
    candidates[which.max(modified)]
  })

Automate zip file reading in R

I need to automate R to read a CSV data file that is inside a zip file.
For example, I would type:
read.zip(file = "myfile.zip")
And internally, what would be done is:
Unzip myfile.zip to a temporary folder
Read the only file contained on it using read.csv
If there is more than one file into the zip file, an error is thrown.
My problem is getting the name of the file contained in the zip file, in order to pass it to the read.csv command. Does anyone know how to do it?
UPDATE
Here's the function I wrote based on #Paul answer:
# Read the single CSV file contained in a zip archive.
#
# zipfile:   path to the .zip file.
# row.names: passed to read.csv() as `row.names`.
# dec:       passed to read.csv() as `dec` (decimal separator).
# Returns the data frame produced by read.csv(). Stops with an error if the
# archive does not contain exactly one file at its top level.
read.zip <- function(zipfile, row.names = NULL, dec = ".") {
  # Create a name for the dir where we'll unzip, then the dir itself
  zipdir <- tempfile()
  dir.create(zipdir)
  # Remove the extracted copy again once the data has been read
  on.exit(unlink(zipdir, recursive = TRUE), add = TRUE)
  # Unzip the file into the dir
  unzip(zipfile, exdir = zipdir)
  # Get the files in the dir
  files <- list.files(zipdir)
  # Throw an error unless there is exactly one
  if (length(files) > 1) stop("More than one data file inside zip")
  if (length(files) == 0) stop("No data file inside zip")
  # Get the full name of the file
  file <- paste(zipdir, files[1], sep = "/")
  # Read the file. The arguments must be named: positionally they would land
  # in read.csv()'s `header` and `sep` parameters instead.
  read.csv(file, row.names = row.names, dec = dec)
}
Since I'll be working with more files inside the tempdir(), I created a new dir inside it, so I don't get confused with the files. I hope it may be useful!
Another solution using unz:
# Read the only file of a zip archive straight from the archive through a
# unz() connection, i.e. without extracting it to disk first.
#
# file: path to the .zip file.
# ...:  further arguments passed on to read.csv().
# Stops with an error if the archive lists more than one entry.
read.zip <- function(file, ...) {
  entries <- unzip(file, list = TRUE)
  if (nrow(entries) > 1) {
    stop("More than one data file inside zip")
  }
  entry_name <- as.character(entries$Name)
  read.csv(unz(file, entry_name), ...)
}
You can use unzip to unzip the file. I just mention this as it is not clear from your question whether you knew that. As for reading the file: once you have extracted it to a temporary directory (see ?tempdir), just use list.files to find the files that were dumped into that directory. In your case this is just one file, the file you need. Reading it using read.csv is then quite straightforward:
# full.names = TRUE makes list.files() return paths that include temp_path;
# bare file names would be looked up in the working directory by read.csv().
l <- list.files(temp_path, full.names = TRUE)
read.csv(l[1])
assuming your tempdir location is stored in temp_path.
I found this thread as I was trying to automate reading multiple csv files from a zip. I adapted the solution to the broader case. I haven't tested it for weird filenames or the like, but this is what worked for me so I thought I'd share:
# Read every CSV file found at the top level of a zip archive.
#
# zipfile: path to the .zip file.
# ...:     further arguments passed on to read.csv() for every file.
# Returns a list of data frames, one per CSV file, named after the files.
read.csv.zip <- function(zipfile, ...) {
  # Create a name for the dir where we'll unzip, then the dir itself
  zipdir <- tempfile()
  dir.create(zipdir)
  # Remove the extracted files again once they have been read
  on.exit(unlink(zipdir, recursive = TRUE), add = TRUE)
  # Unzip the file into the dir
  unzip(zipfile, exdir = zipdir)
  # Get the csv files in the dir
  files <- list.files(zipdir, pattern = "\\.csv$")
  # Import each csv file. lapply() always returns a plain list; sapply()
  # would collapse data frames of equal width into a list-matrix.
  csv.data <- lapply(files, function(f) {
    read.csv(file.path(zipdir, f), ...)
  })
  names(csv.data) <- files
  csv.data
}
If you have zcat installed on your system (which is the case for linux, macos, and cygwin) you could also use:
zipfile <- "test.zip"
# Stream the decompressed content through a pipe instead of extracting to
# disk. shQuote() keeps file names that contain spaces or shell
# metacharacters intact when the command line is built.
myData <- read.delim(pipe(paste("zcat", shQuote(zipfile))))
This solution also has the advantage that no temporary files are created.
Here is an approach I am using that is based heavily on #Corned Beef Hash Map 's answer. Here are some of the changes I made:
My approach makes use of the data.table package's fread(), which
can be fast (generally, if it's zipped, sizes might be large, so you
stand to gain a lot of speed here!).
I also adjusted the output format so that it is a named list, where
each element of the list is named after the file. For me, this was a
very useful addition.
Instead of using regular expressions to sift through the files
grabbed by list.files, I make use of list.file()'s pattern
argument.
Finally, by relying on fread() and by making pattern an
argument to which you could supply something like "" or NULL or
".", you can use this to read in many types of data files; in fact,
you can read in multiple types at once (e.g. if your .zip contains
.csv and .txt files and you want both). If there are only some types of
files you want, you can specify the pattern to only use those, too.
Here is the actual function:
# Read every file of a zip archive whose name matches `pattern`, using
# fread() (from the data.table package, which must be attached).
#
# zipfile: path to the .zip file.
# pattern: regular expression selecting the files to read (default: CSVs).
# ...:     further arguments passed on to fread() for every file.
# Returns a list with one element per file read, named after the files'
# base names.
read.csv.zip <- function(zipfile, pattern = "\\.csv$", ...) {
  # Create a name for the dir where we'll unzip, then the dir itself
  zipdir <- tempfile()
  dir.create(zipdir)
  # Remove the extracted files again once they have been read
  on.exit(unlink(zipdir, recursive = TRUE), add = TRUE)
  # Unzip the file into the dir
  unzip(zipfile, exdir = zipdir)
  # Get the matching files in the dir, including those in sub-directories
  files <- list.files(zipdir, recursive = TRUE, pattern = pattern)
  # Import each file. lapply() always returns a plain list; sapply() would
  # collapse tables of equal width into a list-matrix.
  csv.data <- lapply(files, function(f) {
    fread(file.path(zipdir, f), ...)
  })
  # Use file names (without their directory part) to name list elements
  names(csv.data) <- basename(files)
  csv.data
}
The following refines the above answers. FUN could be read.csv, cat, or anything you like, providing the first argument will accept a file path. E.g.
# Example call: download the archive at the given URL and show the first
# lines of the named file inside it. Run this only after read.zip.url
# (defined below) exists.
head(read.zip.url("http://www.cms.gov/Medicare/Coding/ICD9ProviderDiagnosticCodes/Downloads/ICD-9-CM-v32-master-descriptions.zip", filename = "CMS32_DESC_LONG_DX.txt"))
# Download a zip archive and apply a function to one of the files in it.
#
# url:      address of the .zip file.
# filename: name of the file inside the archive to use. May be NULL when
#           the archive contains exactly one top-level file.
# FUN:      function whose first argument accepts a file path
#           (readLines, read.csv, ...).
# ...:      further arguments passed on to FUN.
# Returns whatever FUN returns. Stops with an error if `filename` is
# missing and cannot be inferred, or is not present in the archive.
read.zip.url <- function(url, filename = NULL, FUN = readLines, ...) {
  zipfile <- tempfile()
  # The downloaded archive is only needed until it has been extracted.
  # The extracted files are kept, because FUN may return something that
  # still refers to them.
  on.exit(unlink(zipfile), add = TRUE)
  # mode = "wb": a zip file is binary and must not be transferred as text
  download.file(url = url, destfile = zipfile, mode = "wb", quiet = TRUE)
  zipdir <- tempfile()
  dir.create(zipdir)
  unzip(zipfile, exdir = zipdir) # no `files` given, so extract all
  files <- list.files(zipdir)
  if (is.null(filename)) {
    if (length(files) == 1) {
      filename <- files
    } else if (length(files) == 0) {
      stop("no files found in zip")
    } else {
      stop(
        "multiple files in zip, but no filename specified: ",
        paste(files, collapse = ", ")
      )
    }
  } else { # filename specified
    stopifnot(length(filename) == 1)
    stopifnot(filename %in% files)
  }
  do.call(FUN, args = c(list(file.path(zipdir, filename)), list(...)))
}
Another approach that uses fread from the data.table package
# Read the single CSV file contained in a zip archive with fread() (from
# the data.table package, which must be attached).
#
# zipfile: path to the .zip file.
# ...:     further arguments passed on to fread().
# Returns the result of fread(). Stops with an error unless exactly one
# CSV file is found at the top level of the archive.
fread.zip <- function(zipfile, ...) {
  ## Extract into a private directory. The session-wide tempdir() is shared
  ## with all other code, so CSV files in it must be neither deleted nor
  ## mistaken for the archive's content.
  zipdir <- tempfile()
  dir.create(zipdir)
  on.exit(unlink(zipdir, recursive = TRUE), add = TRUE)
  ## Unzip the file into the dir
  unzip(zipfile, exdir = zipdir)
  ## Get path to file (`pattern` is a regular expression, not a glob)
  file <- list.files(zipdir, pattern = "\\.csv$", full.names = TRUE)
  ## Throw an error unless there is exactly one
  if (length(file) > 1) stop("More than one data file inside zip")
  if (length(file) == 0) stop("No CSV file inside zip")
  ## Read the file
  fread(
    file,
    na.strings = c(""), # read empty strings as NA
    ...
  )
}
Based on the answer/update by #joão-daniel
# unzipped file location
outDir <- "~/Documents/unzipFolder"
# get all the zip files (`pattern` is a regular expression, not a glob, so
# the dot is escaped and the extension anchored to the end of the name)
zipF <- list.files(path = "~/Documents/", pattern = "\\.zip$", full.names = TRUE)
# unzip all your files into outDir
purrr::map(.x = zipF, .f = unzip, exdir = outDir)
I just wrote a function based on top read.zip that may help...
# Read one file out of a zip archive.
# function based on http://stackoverflow.com/questions/8986818/automate-zip-file-reading-in-r
#
# zipfile:       path to the .zip file.
# internalfile:  the file inside the archive to read: its name, its
#                position in the archive listing, or NA for the first entry.
# read.function: function used to read the extracted file; its first
#                argument must accept a file path.
# verbose:       if TRUE, progress messages are printed.
# ...:           further arguments passed on to read.function.
# Returns whatever read.function returns.
read.zip <- function(zipfile, internalfile = NA, read.function = read.delim,
                     verbose = TRUE, ...) {
  # check the files within zip
  unzfiles <- unzip(zipfile, list = TRUE)
  if (is.na(internalfile[1]) || is.numeric(internalfile)) {
    position <- if (is.na(internalfile[1])) 1 else internalfile[1]
    internalfile <- unzfiles$Name[position]
  }
  # Create a name for the dir where we'll unzip, then the dir itself
  zipdir <- tempfile()
  dir.create(zipdir)
  # NOTE(review): the messages were printed with `catf`, which is not
  # defined in this code and is not part of base R; cat() is used on the
  # assumption that plain printing was intended.
  if (verbose) cat("Directory created:", zipdir, "\n")
  # Get the full name of the file
  file <- paste(zipdir, internalfile, sep = "/")
  # Register the clean-up before extracting, so the temporary directory is
  # also removed when unzipping or reading fails. unlink(recursive = TRUE)
  # removes the directory together with whatever was extracted into it.
  on.exit({
    if (verbose) cat("Done!\nRemoving temporal files:", file, ".\n")
    unlink(zipdir, recursive = TRUE)
  }, add = TRUE)
  # Unzip the file into the dir
  if (verbose) cat("Unzipping file:", internalfile, "...")
  unzip(zipfile, files = internalfile, exdir = zipdir)
  if (verbose) cat("Done!\n")
  # Read the file
  if (verbose) cat("Reading File...")
  read.function(file, ...)
}

Resources