Select a file from the list of files in a folder - R

I have multiple folders with files named numerically (e.g. 12345.in). I am trying to write a function that will list the nearest file if the requested file is not in the folder:
soili <- 371039 # this is the file name
Getmapunit <- function(soili){
  soilfile <- list.files(pattern = paste0(soili, ".in"), recursive = TRUE)
  if (length(soilfile) == 0){
    soilfile <- list.files(pattern = paste0(soili + 1, ".in"), recursive = TRUE)
  }
  soilfile
}
soilfile=Getmapunit(soili)
# I want to extract the file name closest to 371039; I was able to write a function that gets the file with the next number

I would try to extract the number of each file and check for the nearest value:
library(magrittr)
library(stringr)
soili <- 371039
# get all files in the specific folder
files <- list.files(path = "file folder", full.names = FALSE)
# extract the number of each file and turn it into an integer
numbers <- str_extract(files, ".*(?=\\.in)") %>% as.integer()
# get the number of the nearest file
nearest_file <- numbers[which.min(abs(soili - numbers))]
# turn it into a filename
paste0(as.character(nearest_file), ".in")
You can also put everything into one pipe:
soili <- 371039
nearest_file <- list.files(path = "file folder", full.names = FALSE) %>%
  str_extract(".*(?=\\.in)") %>%
  as.integer() %>%
  .[which.min(abs(soili - .))] %>%
  paste0(".in")
Of course, you can also translate this approach into a function, for example:
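A minimal sketch of such a function, assuming the same "file folder" path and ".in" extension as above (both are placeholders to adjust; the function name is hypothetical):
# Sketch: wrap the nearest-file lookup; path and extension are assumptions
get_nearest_file <- function(soili, path = "file folder") {
  files <- list.files(path = path, pattern = "\\.in$")
  numbers <- as.integer(str_extract(files, ".*(?=\\.in)"))
  nearest <- numbers[which.min(abs(soili - numbers))]
  paste0(nearest, ".in")
}
get_nearest_file(371039)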
Edit:
If you have all the files in different folders, you can use this approach:
soili <- 371039
files <- list.files(path = "highest_file_folder", full.names = T)
nearest_file <- files %>%
str_extract(., "[^/]*$") %>%
str_extract(".*(?=.in)") %>%
as.integer() %>%
.[which.min(abs(soili - .)] %>%
paste0(as.character(nearest_file), ".in")
# getting filepath with nearest_file out of the files vector
files[str_detect(files, nearest_file)]
# little example
files <- c("./folder1/12345.in", "./folder2/56789.in") %>%
str_extract(., "[^/]*$") %>%
str_extract(.,".*(?=.in)")


Import multiple CSV files and use file name as column

I have numerous csv files in multiple directories that I want to read into an R tibble or data.table. I use list.files() with the recursive argument set to TRUE to create a list of file names and paths, then use lapply() to read in the csv files, and then bind_rows() to stick them all together:
filenames <- list.files(path, full.names = TRUE, pattern = fileptrn, recursive = TRUE)
tbl <- lapply(filenames, read_csv) %>%
  bind_rows()
This approach works fine. However, I need to extract a substring from each file name and add it as a column to the final table. I can get the substring I need with str_extract() like this:
sites <- str_extract(filenames, "[A-Z]{2}-[A-Za-z0-9]{3}")
I am stuck, however, on how to add the extracted substring as a column while lapply() runs through read_csv() for each file.
I generally use the following approach, based on dplyr/tidyr:
library(tidyverse) # tibble, tidyr::extract, dplyr, readr
data <- tibble(File = filenames) %>%
  extract(File, "Site", "([A-Z]{2}-[A-Za-z0-9]{3})", remove = FALSE) %>%
  mutate(Data = lapply(File, read_csv)) %>%
  unnest(Data) %>%
  select(-File)
tidyverse approach:
Update:
readr 2.0 (and beyond) now has built-in support for reading a list of files with the same columns into one output table in a single command. Just pass the filenames to be read in the same vector to the reading function. For example, reading in csv files:
(files <- fs::dir_ls("D:/data", glob="*.csv"))
dat <- read_csv(files, id="path")
Alternatively using map_dfr with purrr:
Add the filename using the .id = "source" argument in purrr::map_dfr()
An example loading .csv files:
library(here)
# specify the directory, then read a list of files
data_dir <- here("file/path")
data_list <- fs::dir_ls(data_dir, regexp = ".csv$")
# return a single data frame with purrr::map_dfr
my_data <- data_list %>%
  purrr::map_dfr(read_csv, .id = "source")
# Alternatively, rename source from the file path to the file name
my_data <- data_list %>%
  purrr::map_dfr(read_csv, .id = "source") %>%
  dplyr::mutate(source = stringr::str_replace(source, "file/path", ""))
You could use purrr::map2 here, which works similarly to mapply:
filenames <- list.files(path, full.names = TRUE, pattern = fileptrn, recursive = TRUE)
sites <- str_extract(filenames, "[A-Z]{2}-[A-Za-z0-9]{3}") # same length as filenames
library(purrr)
library(dplyr)
library(readr)
stopifnot(length(filenames)==length(sites)) # returns error if not the same length
ans <- map2(filenames, sites, ~read_csv(.x) %>% mutate(id = .y)) # .x is element in filenames, and .y is element in sites
The output of map2() is a list, similar to lapply(), so you can combine it into one table with bind_rows().
If you have a recent version of purrr, you can use imap(), which is a wrapper around map2() with an index.
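A minimal sketch of the imap() variant, assuming the same filenames and sites vectors as above: name the vector first, and imap() passes each name as the second argument.
# Sketch: name the file vector with the sites, so imap() passes each
# site as .y -- equivalent to the map2() call above
ans <- filenames %>%
  purrr::set_names(sites) %>%
  purrr::imap(~ read_csv(.x) %>% mutate(id = .y))
tbl <- bind_rows(ans) # combine the list into one table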
data.table approach:
If you name the list, then you can use this name to add to the data.table when binding the list together.
Workflow:
library(data.table)
files <- list.files( whatever... )
# read the files from the list
l <- lapply(files, fread)
# name the list using the basename of `files`
# (this is also the step to manipulate the file names however you like)
names(l) <- basename(files)
# bind the rows from the list together, putting the file names into the column "id"
dt <- rbindlist(l, idcol = "id")
You just need to write your own function that reads the csv and adds the column you want, before combining them.
my_read_csv <- function(x) {
  out <- read_csv(x)
  site <- str_extract(x, "[A-Z]{2}-[A-Za-z0-9]{3}")
  cbind(Site = site, out)
}
filenames <- list.files(path, full.names = TRUE, pattern = fileptrn, recursive = TRUE)
tbl <- lapply(filenames, my_read_csv) %>% bind_rows()
You can build a file_names vector based on sites with the exact same length as tbl, and then combine the two using cbind:
### Get file names
filenames <- list.files(path, full.names = TRUE, pattern = fileptrn, recursive = TRUE)
sites <- str_extract(filenames, "[A-Z]{2}-[A-Za-z0-9]{3}")
### Get the length of each csv
file_lengths <- unlist(lapply(lapply(filenames, read_csv), nrow))
### Repeat sites using the lengths
file_names <- rep(sites, file_lengths)
### Create table
tbl <- lapply(filenames, read_csv) %>%
  bind_rows()
### Combine file_names and tbl
tbl <- cbind(tbl, filename = file_names)

How can I read multiple csvs and retain the number in the file name for each?

I have multiple csv files in a folder, none of which have a header. I want to preserve the order set out by the number at the end of the file name. The file names are "output-1.csv", "output-2.csv", and so on. Is there a way to include the file name of each csv so I know which data corresponds to which file? The answer [here][1] gets close to what I want.
library(tidyverse)
#' Load the data ----
mydata <-
  list.files(path = "C:\\Users\\Documents\\Manuscripts\\experiment1\\output",
             pattern = "*.csv") %>%
  map_df(~ read_csv(., col_names = FALSE))
mydata
You can use:
library(tidyverse)
mydata <- list.files("C:\\Users\\Documents\\Manuscripts\\experiment1\\output",
pattern = ".csv$", full.names = T) %>%
set_names(str_sub(basename(.), 1, -5)) %>%
map_dfr(read_csv, .id = "file")
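If the files should also come out in numeric rather than alphabetical order (output-2 before output-10), one option is to sort the paths by the trailing number first. A sketch, assuming the "output-N.csv" naming from the question:
# Sketch: sort the file paths by the trailing number so output-2.csv
# comes before output-10.csv, then read as above
paths <- list.files("C:\\Users\\Documents\\Manuscripts\\experiment1\\output",
                    pattern = "\\.csv$", full.names = TRUE)
num <- as.integer(str_extract(basename(paths), "\\d+(?=\\.csv$)"))
mydata <- paths[order(num)] %>%
  set_names(str_sub(basename(.), 1, -5)) %>%
  map_dfr(read_csv, col_names = FALSE, .id = "file")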

In R I wish to find the latest xlsx file in a folder and then import the data from that file

In R I wish to find the latest xlsx file in a folder and then import the data from that file. All files have the same format. I just keep getting a blank result. Please advise on the correct code.
CompanyFileNames <- file.info(list.files(path = "Y:/...Data",
                                         pattern = "*port.xlsx",
                                         full.names = TRUE))
CompanyFilelatest <- subset(CompanyFileNames, mtime == max(mtime))
CompanyFilelatest <- CompanyFilelatest[0]
Companymonthly <- sapply(CompanyFilelatest,
                         read_excel, simplify = FALSE) %>%
  bind_rows(.id = "id")
write.csv(Companymonthly, "Companymonthly.csv")
What you need is the filepath of the latest file, which is stored as the row name of CompanyFilelatest. Extract the file path with rownames() and then this should work:
library(readxl) # read_excel
library(dplyr)  # bind_rows, %>%
CompanyFileNames <- file.info(list.files(path = getwd(),
                                         pattern = "*.xlsx",
                                         full.names = TRUE))
CompanyFilelatest <- subset(CompanyFileNames, mtime == max(mtime))
CompanyFilelatest <- rownames(CompanyFilelatest) # use rownames, not subsetting with 0
Companymonthly <- sapply(CompanyFilelatest,
                         read_excel, simplify = FALSE) %>%
  bind_rows(.id = "id")
write.csv(Companymonthly, "Companymonthly.csv")
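A more compact alternative, not from the original answer but a sketch of the same idea: index the newest file by its modification time directly.
# Sketch: pick the newest .xlsx directly via its modification time
library(readxl)
paths <- list.files(path = getwd(), pattern = "\\.xlsx$", full.names = TRUE)
latest <- paths[which.max(file.mtime(paths))]
Companymonthly <- read_excel(latest)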


file.info returning NA values in R

I am trying to get the size of files and other details from my directory; however, it returns NA values for some files while returning details for others. Below is the code I used. Could there be administrator settings on the files that prevent these details from being fetched?
library(tidyr)
library(dplyr)
wd <- "F:\\working\\others"
setwd(wd)
# get file list - your list of files would be different
fileList <- list.files()[1:240]
class(fileList)
# result
cbind(
  file.info(fileList)[, c("size"), drop = FALSE],
  x = as.character(file.mtime(fileList))) %>%
  separate(x,
           into = c("DateModified", "TimeModified"),
           sep = " ") %>%
  add_rownames %>%
  select(DateModified,
         TimeModified,
         Size = size,
         FileName = rowname)
Try this; the trick is in the full.names = TRUE:
ldir <- normalizePath("<type here the path of directory>")
finf <- file.info(dir(path = ldir, full.names = TRUE), extra_cols = FALSE)
View(finf)
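From there, the asker's original summary table can be rebuilt on the full paths; a sketch assuming the finf from above (tibble, tidyr, and dplyr are used namespaced):
# Sketch: rebuild the asker's summary on full paths, where
# file.info() can resolve every file
finf %>%
  tibble::rownames_to_column("FileName") %>%
  dplyr::mutate(mtime = as.character(mtime)) %>%
  tidyr::separate(mtime, into = c("DateModified", "TimeModified"), sep = " ") %>%
  dplyr::select(DateModified, TimeModified, Size = size, FileName)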
