Move csv files to specific named folder in archive - r

I have 16 folders in the Risk/Archive/ folder, each named after a specific person, and I want to copy my Excel files (whose names also contain the person's name) from the Risk/ folder into the matching folder in Risk/Archive/.
I'm using the code below, but it doesn't accomplish what I want.
library(openxlsx)
f <- list.files('Risk/')
d <- list.dirs('Risk/Archive')
if (length(f) > 0) {
  File <- lapply(paste0('Risk/', f), function(i) {
    x <- read.xlsx(i, sheet = 1, startRow = 2, colNames = TRUE,
                   check.names = FALSE, cols = c(1:73))
    file.copy(from = i, to = 'Risk/Archive/',
              overwrite = TRUE, recursive = FALSE, copy.mode = TRUE)
    x
  })
  File <- do.call("rbind.data.frame", File)
}

There might be a better way to do this, but if I understand correctly, I think this should do the trick:
# Get list of names of people
names <- list.dirs(path = "./Risk/Archive",
                   full.names = FALSE,
                   recursive = FALSE)
# Get list of files to copy
files <- list.files(path = "./Risk",
                    pattern = "\\.xlsx$",
                    full.names = TRUE)
# Loop through each name and move the file for that person
for (i in seq_along(names)) {
  # Current name in loop
  cname <- names[i]
  # Get index of file(s) containing the current name
  name.idx <- grep(files, pattern = cname)
  # Get file path for file that matches current name
  src.path <- files[name.idx]
  # Make file path for archive folder for current name
  name.path <- paste0("./Risk/Archive/", cname)
  # Copy file from "Risk" folder to "Archive" folder for current name
  file.copy(from = src.path,
            to = name.path,
            overwrite = TRUE)
  # Remove original file after archiving
  file.remove(src.path)
  # Output message
  cat(paste0("Moved file for: ", cname, "\n"))
}
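As a side note, when the source and destination are on the same filesystem, the copy-then-remove pair can be replaced by a single file.rename() call; a minimal sketch, assuming the same names and files vectors as above:
# Sketch: move (rename) instead of copy-then-delete; assumes source and
# destination live on the same filesystem
for (cname in names) {
  src <- files[grep(files, pattern = cname)]
  if (length(src) > 0) {
    file.rename(from = src,
                to = file.path("./Risk/Archive", cname, basename(src)))
  }
}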

Extract folder names inside of *.rar and *.zip files

I have a folder with multiple *.rar and *.zip files.
Each *.rar and *.zip file contains one folder, and inside that folder there are multiple folders.
I would like to generate a dataset with the names of those inner folders.
How can I do this using R?
I tried:
temp <- list.files(pattern = "\\.zip$")
lapply(temp, function(x) unzip(x, list = T))
But that returns the full listing of every path inside each archive.
I would like to get just the names: "Nova pasta1" and "Nova pasta2".
Thanks
Let's create a simple set of directories/files that is representative of your own. You described having a single .zip file that contains multiple zipped directories, which may contain unzipped files and/or sub-directories.
# Example main directory
dir.create("main_dir")
# Example directory with 1 file and a subdirectory with 1 file
dir.create("main_dir/example_dir1")
write.csv(data.frame(x = 5), file = "main_dir/example_dir1/example_file.csv")
dir.create("main_dir/example_dir1/example_subdir")
write.csv(data.frame(x = 5), file = "main_dir/example_dir1/example_subdir/example_subdirfile.csv")
# Example directory with 1 file
dir.create("main_dir/example_dir2")
write.csv(data.frame(x = "foo"), file = "main_dir/example_dir2/example_file2.csv")
# NOTE: I was having issues with using `zip()` to zip each directory
# then the main (top) directory, so I manually zipped them below.
# Manually zip example_dir1 and example_dir2, then zip main_dir at this point.
Given this structure, we can get the paths to all of the directories within the highest level directory (main_dir) using unzip(list = TRUE) since we know the name of the single zipped directory containing all of these additional zipped sub-directories.
# Unzip the highest level directory available, get all of the .zip dirs within
ex_path <- "main_dir"
all_zips <- unzip(zipfile = paste0(ex_path, ".zip"), list = TRUE)
all_zips
# We can remove the ex_path prefix if we want, so that we only see
# the entries within our main directory instead of the full path.
library(dplyr)
all_zips %>%
  filter(Name != paste0(ex_path, "/")) %>%
  mutate(Name = sub(paste0(ex_path, "/"), "", Name))
If you had multiple zipped directories with nested directories similar to main_dir, you could just put their paths in a list and apply the function to each element of the list. Below I reproduce this.
# Example of multiple zip directory paths in a list
ziplist <- list(ex_path, ex_path, ex_path)
lapply(ziplist, function(x) {
  temp <- unzip(zipfile = paste0(x, ".zip"), list = TRUE)
  temp <- temp %>% mutate(main_path = x)
  temp <- temp %>%
    filter(Name != paste0(x, "/")) %>%
    mutate(Name = sub(paste0(x, "/"), "", Name))
  temp
})
If all of the .zip files in the current working directory are files you want to do this for, you can get ziplist above via:
list.files(pattern = ".zip") %>% as.list()
I appreciate all the help, but I think I found a shorter way to solve my question.
library(stringr)
temp.zip <- list.files(pattern = "\\.zip$")
temp.rar <- list.files(pattern = "\\.rar$")
mydata <- lapply(c(temp.rar, temp.zip),
                 function(x) unique(na.omit(str_extract(
                   unlist(untar(tarfile = x, list = TRUE)),
                   '(?<=/).*(?=/)'))))
unlist(mydata)
Thanks all
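For the .zip case specifically, the same names can be pulled out without the regex by keeping only the directory entries that unzip(list = TRUE) reports and taking the second path component; a minimal sketch, assuming each archive has a single top-level folder like the main_dir example above:
# Sketch: second-level folder names from a zip listing (no regex);
# assumes one top-level folder per archive, as in main_dir.zip
zips <- list.files(pattern = "\\.zip$")
lapply(zips, function(z) {
  listing <- unzip(zipfile = z, list = TRUE)$Name
  dirs <- listing[endsWith(listing, "/")]  # directory entries end in "/"
  unique(na.omit(sapply(strsplit(dirs, "/"), function(p) p[2])))
})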

Find second-from-last modified file in folder rather than final modified file in folder

The code below finds the most recently modified file within a folder. Can anyone advise how this can be amended to instead find the penultimate/second-to-last file that was modified? Can the which.max be amended to search for a -1, or is alternative code required?
#Open latest file
#get a vector of all filenames
files <- list.files(path="MYFILEPATH",pattern="files.xls",full.names = TRUE,recursive = TRUE)
#get the directory names of these (for grouping)
dirs <- dirname(files)
#find the last file in each directory (i.e. latest modified time)
lastfiles <- tapply(files,dirs,function(v) v[which.max(file.mtime(v))])
File_List <- file.info(list.files("//MYFILEPATH", full.names = T))
Path_Of_Interest <- rownames(File_List)[which.max(File_List$mtime)]
Split the files by directory, then loop, order and get the 2nd from the last:
lapply(split(files, dirname(files)), function(i) {
  i[order(file.mtime(i))][length(i) - 1]
})
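One caveat, not part of the answer above: for a directory that holds a single file, length(i) - 1 is 0 and i[0] is an empty vector rather than that file. A guarded variant, as a sketch:
# Sketch: same idea, but return NA for directories with fewer than two files
lapply(split(files, dirname(files)), function(i) {
  if (length(i) < 2) return(NA_character_)
  i[order(file.mtime(i))][length(i) - 1]
})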
Something like this?
Instead of getting the entire file.info, the code below calls file.mtime.
Tested with files in a sub-directory of my home dir.
# my tmp dir
MYFILEPATH <- "~/tmp"
# get a vector of all filenames
files <- list.files(
  path = MYFILEPATH,
  pattern = "\\.txt$",
  full.names = TRUE,
  recursive = TRUE
)
File_mtime <- file.mtime(files)
i <- order(File_mtime, decreasing = TRUE)
Second_Path <- if(length(i) > 1L) files[ i[2L] ] else files
Second_Path
The code above as a function. By default (n = 1) it returns the last file changed; set n = 2 to get the second-to-last file changed.
nth_from_last_changed <- function(path = ".", pattern = NULL, n = 1L,
                                  full.names = TRUE, recursive = TRUE, ...) {
  # get a vector of all filenames
  files <- list.files(
    path = path,
    pattern = pattern,
    full.names = full.names,
    recursive = recursive,
    ...
  )
  # get their last change time and order decreasing
  i <- order(file.mtime(files), decreasing = TRUE)
  if (length(i) > n - 1L)
    files[i[n]]
  else
    files[length(i)]
}
nth_from_last_changed("~/tmp", "\\.txt", n = 2)

Read second sheet of xlsx file from various subdirectories of a main directory R

I want to read the sheet that contains the word "All" or "all" from an Excel workbook in every subdirectory matching a specific pattern.
I have tried list.files(), but it does not work properly.
files_to_read = list.files(
path = common_path, # directory to search within
pattern = "X - GEN", # regex pattern, some explanation below
recursive = TRUE, # search subdirectories
full.names = TRUE # return the full path
)
data_lst = lapply(files_to_read, read.xlsx)
I am assuming your sub-directories have a similar, identifiable name?
Assumptions, let's say:
your sub-directories start with 'this',
the files saved in each sub-directory start with the file name 'my_file', and
the tab that you are trying to read in contains the word 'all'.
If the tab you are reading in is always in the same position (e.g. the 2nd tab of every file), it is easier, since you can specify the sheet within read.xlsx as sheet = 2 (see the sketch just below). If that is not the case, one way to handle it is to create your own function.
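A minimal sketch of that fixed-position case, assuming openxlsx and that the sheet of interest really is the second tab of every workbook (the 'this'/'my_file' names are the illustrative ones from the assumptions above):
# Sketch: the relevant sheet is assumed to always be the 2nd tab
library(openxlsx)
my_dir   <- list.files(pattern = "^this", full.names = TRUE)
my_files <- list.files(my_dir, pattern = "^my_file", full.names = TRUE)
my_data  <- do.call("rbind", lapply(my_files, read.xlsx, sheet = 2))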
For the general case, where the sheet position varies:
library(openxlsx)
# getting the names of the sub-directories starting with the word 'this'
my_dir <- list.files(pattern = "^this", full.names = TRUE)
# getting the names of the files starting with 'my_file', e.g. my_file.xlsx, my_file2.xlsx
my_files <- list.files(my_dir, pattern = "^my_file", full.names = TRUE)
my_read_xlsx <- function(files_to_read, sheets_to_read) {
  # file to import
  wb <- loadWorkbook(files_to_read)
  # getting the sheet name that contains 'all' (or any other string you specify);
  # ignore.case = TRUE so that matching is not case-sensitive
  ws <- names(wb)[grepl(sheets_to_read, names(wb), ignore.case = TRUE)]
  # reading in the Excel tab identified above
  xl_data <- read.xlsx(wb, ws)
  return(xl_data)
}
# Using the function created above to import the tabs containing 'all'
my_list <- lapply(my_files, FUN = function(x) my_read_xlsx(x, sheets_to_read = "all"))
# Combining the list into a data.frame
my_data <- do.call("rbind", my_list)
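Note that do.call("rbind", my_list) requires every sheet to have exactly the same columns; if the column sets can differ between files, dplyr::bind_rows() is more forgiving and fills missing columns with NA:
# More tolerant row-binding when column sets differ between sheets
my_data <- dplyr::bind_rows(my_list)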

Specifying pathname in map_dfr

The structure of my directory is as follows:
Extant_Data/Data/Raw/course_enrollment
Extant_Data/Data/Raw/frpm
I have a couple of functions to read in some text files and Excel files respectively.
library(readxl)
read_fun = function(path){
  test = read.delim(path, sep = "\t", header = TRUE, fill = TRUE,
                    colClasses = c(rep("character", 23)))
  test
}
read_fun_frpm = function(path){
  test = read_excel(path, sheet = 2, col_names = frpm_names)
}
I feed this into map_dfr so that the function reads in each of the files and rowbinds them.
library(purrr)
allfiles = list.files(path = "Extant_Data/Data/Raw/course_enrollment",
                      pattern = "CourseEnrollment.txt",
                      full.names = FALSE,
                      recursive = TRUE)
# Rowbind all the course enrollment data
# !!! BUT I HAVE to set the working directory to a subdirectory so that it finds those files
setwd("Extant_Data/Data/Raw/course_enrollment")
course_combined <- map_dfr(allfiles, read_fun)
allfiles = list.files(path = "Extant_Data/Data/Raw/frpm/post12",
                      pattern = "frpm*",
                      full.names = FALSE,
                      recursive = TRUE)
# Rowbind all the frpm data
# !!! I have to change the directory AGAIN
setwd("Extant_Data/Data/Raw/frpm/post12")
frpm_combined <- map_dfr(allfiles, read_fun_frpm)
As mentioned in the comments, I have to keep changing the working directory so that map_dfr can locate the files. I don't think this is best practice; how might I work around it so that I don't have to keep changing the directory? Any suggestions appreciated. Sorry, it's hard to provide a reproducible example.
Note: This throws an error.
frpm_combined <- map_dfr(allfiles,read_fun_frpm('Extant_Data/Data/Raw/frpm/post12'))
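One way to avoid the setwd() calls, as a sketch rather than a tested answer: ask list.files() for full paths with full.names = TRUE, so every element already contains its directory, and pass the function itself (read_fun_frpm) to map_dfr rather than a call to it; the error above comes from evaluating read_fun_frpm(...) on the directory path and handing map_dfr its result instead of a function.
library(purrr)
# Sketch: full.names = TRUE makes each element a complete path, so no setwd() is needed
enrollment_files <- list.files(path = "Extant_Data/Data/Raw/course_enrollment",
                               pattern = "CourseEnrollment.txt",
                               full.names = TRUE,
                               recursive = TRUE)
course_combined <- map_dfr(enrollment_files, read_fun)
frpm_files <- list.files(path = "Extant_Data/Data/Raw/frpm/post12",
                         pattern = "^frpm",  # written as a regex
                         full.names = TRUE,
                         recursive = TRUE)
frpm_combined <- map_dfr(frpm_files, read_fun_frpm)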

R - Add Column w/ File Name on For Loop

I use assign in a for loop to batch read in all the .csv files in a working directory. I then use substr to clean the names of the files. I would like to add a column to each of the files with the file name, for better analysis later in the code. However, I am having trouble referencing the file in the for loop after the file names have been cleaned, in order to add the column.
#read in all files in folder
files <- list.files(pattern = "*.csv")
for (i in 1:length(files)){
  assign(substr(files[i], start = 11, stop = nchar(files[i]) - 4), #clean file names
         read.csv(files[i], stringsAsFactors = FALSE))
  substr(files[i], start = 11, stop = nchar(files[i]) - 4)['FileFrom'] <- files[i]
}
assign does not seem to be the right function here; I think you need to use eval(parse()) on a string command that you set up. The inline notes explain more:
# read in all files in folder
files <- list.files(pattern = "*.csv")
# loop through the files
for (i in 1:length(files)){
  # save the clean filename as a char var because it will be called again
  fnClean = substr(files[i], start = 1, stop = nchar(files[i]) - 4)
  # create a cmd as a string to be parsed and evaluated on-the-fly
  # the point here is that you can use the 'fnClean' var in the string
  # without knowing what it is - assign is expecting an inline string
  # ...not a string saved as a var, so it can't be used in this case
  loadFileCMD = paste0(fnClean, ' = read.csv(files[i], stringsAsFactors = FALSE)')
  print(loadFileCMD) # check the cmd
  eval(parse(text = loadFileCMD))
  # create another string command to be evaluated to insert the file name
  # into the 'FileFrom' field
  addFnCMD = paste0(fnClean, '$FileFrom = files[i]')
  print(addFnCMD) # check the cmd
  eval(parse(text = addFnCMD))
}
Would this work?
#read in all files in folder
files <- list.files(pattern = "*.csv")
filesCopy <- files
for (i in 1:length(files)){
  # clean file name that the data frame gets assigned to
  objName <- substr(files[i], start = 11, stop = nchar(files[i]) - 4)
  assign(objName, read.csv(files[i], stringsAsFactors = FALSE))
  # fetch the data frame back, add the source file name, and re-assign it
  tmp <- get(objName)
  tmp$FileFrom <- filesCopy[i]
  assign(objName, tmp)
}
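A further note, not from either answer above: the usual idiom is to skip assign() altogether and keep the data frames in a named list, adding the source file as a column while reading; a minimal sketch:
# Sketch: named list of data frames, each carrying its source file name
files <- list.files(pattern = "\\.csv$")
dfs <- lapply(files, function(f) {
  df <- read.csv(f, stringsAsFactors = FALSE)
  df$FileFrom <- f
  df
})
names(dfs) <- substr(files, start = 11, stop = nchar(files) - 4)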
