Combine a bunch of data files in R

Sample data:
fileABC = data.frame("id" = c(1:10),
                     "var1" = c("5*",8,1,4,"3*",5,6,7,"7*",1),
                     "var2" = c(7,4,6,"1*","8*",2,0,7,"0*",1))
fileQWE = data.frame("id" = c(1:10),
                     "var1" = c(7,"5*",8,1,4,"3*",5,"7*",6,2),
                     "var2" = c("8*",2,0,7,7,4,6,"1*","0*",1))
var1 = c(fileABC$var1, fileQWE$var1)
var2 = c(fileABC$var2, fileQWE$var2)
WANT = data.frame("id" = c(1:10, 1:10),
                  "var1" = var1,
                  "var2" = var2,
                  "source" = c(rep("fileABC",10), rep("fileQWE",10)))
I have fileABC and fileQWE. First of all, these files contain random * characters that I want to eliminate. Secondly, I want to read both files into R and save them as fileABC and fileQWE. Then I want to create a data frame WANT that combines var1 and var2 from both files and adds a new variable "source" equal to "fileABC" or "fileQWE", depending on where each row comes from. I included my attempt, but as you can see when you run it, it does not give the desired result.
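For reference, a minimal repair of that attempt (a sketch, assuming R >= 4.0 so the mixed columns come through as character rather than factors): strip the asterisks with sub() and convert to numeric before building WANT.
var1 <- c(fileABC$var1, fileQWE$var1)
var2 <- c(fileABC$var2, fileQWE$var2)
WANT <- data.frame(id = c(1:10, 1:10),
                   var1 = as.numeric(sub("*", "", var1, fixed = TRUE)),
                   var2 = as.numeric(sub("*", "", var2, fixed = TRUE)),
                   source = c(rep("fileABC", 10), rep("fileQWE", 10)))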

We can use bind_rows and extract the numeric part of each element with parse_number:
library(tidyverse)
bind_rows(lst(fileABC, fileQWE), .id = 'source') %>%
  mutate_at(vars(starts_with("var")),
            list(~ readr::parse_number(as.character(.))))
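mutate_at() is superseded in current dplyr; a sketch of the same step with across() (same data as above):
bind_rows(lst(fileABC, fileQWE), .id = 'source') %>%
  mutate(across(starts_with("var"), ~ readr::parse_number(as.character(.))))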
Or using base R
lst1 <- mget(ls(pattern = "^file"))
out <- do.call(rbind, Map(cbind, lst1, source = names(lst1)))
row.names(out) <- NULL
out[2:3] <- lapply(out[2:3], function(x) as.numeric(sub("*", "", x, fixed = TRUE)))
If we want to read directly from the .csv files (assuming the files are in the working directory):
files <- list.files(pattern = "\\.csv$")
names(files) <- str_remove(files, "\\.csv$")
library(readr)
imap_dfr(files, ~ read.csv(.x) %>%
           mutate_at(vars(starts_with("var")),
                     list(~ readr::parse_number(.))) %>%
           mutate(source = .y))

Related

Write_csv for a list of csv files maintaining original file names

I have a list of data frames (df_list) and I want a tidyverse approach to write all csv files from my list while maintaining their original file names.
So far I did:
df = dir(pattern = "\\.csv$", full.names = TRUE)
df_list = vector("list", length(df))
for (i in seq_along(df)) {
  df_list[[i]] = read.csv(df[[i]], sep = ";")
}
imap(df_list, ~ write_csv(.x, paste0(.y, ".csv")))
My current output is:
1.csv; 2.csv; 3.csv ...
The below will read in a set of files from an example directory, apply a function to those files, then save the files with the exact same names.
library(purrr)
library(dplyr)
# Create example directory with example .csv files
dir.create(path = "example")
data.frame(x1 = letters) %>% write.csv(., file = "example/example1.csv")
data.frame(x2 = 1:20) %>% write.csv(., file = "example/example2.csv")
# Get relative paths of all .csv files in the example subdirectory
path_list <- list.files(pattern = "example.*csv", recursive = TRUE) %>%
  as.list()
# Read every file into a list
file_list <- path_list %>%
  map(~ read.csv(.x, sep = ","))
# Do something to the data
file_list_updated <- file_list %>%
  map(~ .x %>% mutate(foo = 5))
# Write the updated files to the old file names
map2(.x = file_list_updated,
     .y = path_list,
     ~ write.csv(x = .x, file = .y))
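An alternative sketch for the original question: name the list by its file paths up front, so iwalk() can write each element back to its original name (assuming the same example directory as above).
library(purrr)
library(readr)
# name each element by its own path, then read each file
paths <- list.files(path = "example", pattern = "\\.csv$", full.names = TRUE)
df_list <- map(set_names(paths), read.csv)
# .y is the element's name (the original path), so the file names survive
iwalk(df_list, ~ write_csv(.x, .y))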

Apply function to files from different folders (R)

I discovered R a couple of years ago, and it has been very handy for cleaning up data frames, preparing data, and handling other basic tasks.
Now I would like to try using R to apply basic treatments to many different files stored in different folders at once.
Here is the script I would like to turn into one function that loops through my folders "dataset_2006" and "dataset_2007" to do all the work.
library(dplyr)
library(readr)
library(sf)
library(purrr)
setwd("C:/Users/Downloads/global_data/dataset_2006")
shp2006 <- list.files(pattern = 'data_2006.*\\.shp$', full.names = TRUE)
listOfShp <- lapply(shp2006, st_read)
combinedShp <- do.call(what = sf:::rbind.sf, args=listOfShp)
#import and merge CSV files into one data frame
folderfiles <- list.files(pattern = 'csv_2006_.*\\.csv$', full.names = TRUE)
csv_data <- folderfiles %>%
  set_names() %>%
  map_dfr(.f = read_delim,
          delim = ";",
          .id = "file_name")
new_shp_2006 <- merge(combinedShp, csv_data, by = "ID") %>%
  filter(label %in% c("AR45T", "GK879"))
st_write(new_shp_2006, "new_shp_2006.shp", overwrite = TRUE)
setwd("C:/Users/Downloads/global_data/dataset_2007")
shp2007 <- list.files(pattern = 'data_2007.*\\.shp$', full.names = TRUE)
listOfShp <- lapply(shp2007, st_read)
combinedShp <- do.call(what = sf:::rbind.sf, args=listOfShp)
#import and merge CSV files into one data frame
folderfiles <- list.files(pattern = 'csv_2007_.*\\.csv$', full.names = TRUE)
csv_data <- folderfiles %>%
  set_names() %>%
  map_dfr(.f = read_delim,
          delim = ";",
          .id = "file_name")
new_shp_2007 <- merge(combinedShp, csv_data, by = "ID") %>%
  filter(label %in% c("AR45T", "GK879"))
st_write(new_shp_2007, "new_shp_2007.shp", overwrite = TRUE)
This is easy to achieve with a for-loop that loops over the directories. To allow wildcards, we can also add the function Sys.glob():
myfunction <- function(directories) {
  for (dir in Sys.glob(directories)) {
    # do something with a single dir
    print(dir)
  }
}
# you can specify multiple directories manually:
myfunction(c('C:/Users/Downloads/global_data/dataset_2006',
             'C:/Users/Downloads/global_data/dataset_2007'))
# or use a wildcard to automatically get all files/directories that match the pattern:
myfunction('C:/Users/Downloads/global_data/dataset_200*')
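For the scripts in the question, here is a sketch of what the loop body could look like, folding the repeated 2006/2007 blocks into one function. It assumes each directory name ends in the year and that the file-name patterns follow it, as in the question; paths are illustrative.
library(dplyr)
library(readr)
library(purrr)
library(sf)

process_dataset <- function(directories) {
  for (dir in Sys.glob(directories)) {
    year <- sub(".*_(\\d{4})$", "\\1", dir)   # "dataset_2006" -> "2006"
    # combine all shapefiles for that year
    shp_files <- list.files(dir, pattern = paste0("data_", year, ".*\\.shp$"),
                            full.names = TRUE)
    combinedShp <- do.call(rbind, lapply(shp_files, st_read))
    # import and merge the CSV files into one data frame
    csv_data <- list.files(dir, pattern = paste0("csv_", year, "_.*\\.csv$"),
                           full.names = TRUE) %>%
      set_names() %>%
      map_dfr(read_delim, delim = ";", .id = "file_name")
    # merge, filter and write, as in the original script
    new_shp <- merge(combinedShp, csv_data, by = "ID") %>%
      filter(label %in% c("AR45T", "GK879"))
    st_write(new_shp, file.path(dir, paste0("new_shp_", year, ".shp")),
             append = FALSE)  # append = FALSE overwrites an existing layer
  }
}

process_dataset('C:/Users/Downloads/global_data/dataset_200*')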

Import multiple CSV files and use file name as column [duplicate]

This question already has answers here:
Importing multiple .csv files into R and adding a new column with file name
(2 answers)
Closed 14 days ago.
I have numerous csv files in multiple directories that I want to read into an R tibble or data.table. I use "list.files()" with the recursive argument set to TRUE to create a list of file names and paths, then use "lapply()" to read in multiple csv files, and then "bind_rows()" to stick them all together:
filenames <- list.files(path, full.names = TRUE, pattern = fileptrn, recursive = TRUE)
tbl <- lapply(filenames, read_csv) %>%
  bind_rows()
This approach works fine. However, I need to extract a substring from each file name and add it as a column to the final table. I can get the substring I need with "str_extract()" like this:
sites <- str_extract(filenames, "[A-Z]{2}-[A-Za-z0-9]{3}")
I am stuck however on how to add the extracted substring as a column as lapply() runs through read_csv() for each file.
I generally use the following approach, based on dplyr/tidyr:
data = tibble(File = files) %>%
  extract(File, "Site", "([A-Z]{2}-[A-Za-z0-9]{3})", remove = FALSE) %>%
  mutate(Data = lapply(File, read_csv)) %>%
  unnest(Data) %>%
  select(-File)
tidyverse approach:
Update:
readr 2.0 (and beyond) now has built-in support for reading a list of files with the same columns into one output table in a single command. Just pass a vector of filenames to the reading function. For example, reading in csv files:
(files <- fs::dir_ls("D:/data", glob="*.csv"))
dat <- read_csv(files, id="path")
Alternatively using map_dfr with purrr:
Add the filename using the .id = "source" argument in purrr::map_dfr()
An example loading .csv files:
# specify the directory, then read a list of files
library(here)
data_dir <- here("file/path")
data_list <- fs::dir_ls(data_dir, regexp = "\\.csv$")
# return a single data frame w/ purrr::map_dfr
my_data = data_list %>%
  purrr::map_dfr(read_csv, .id = "source")
# Alternatively, rename source from the file path to the file name
my_data = data_list %>%
  purrr::map_dfr(read_csv, .id = "source") %>%
  dplyr::mutate(source = stringr::str_replace(source, "file/path", ""))
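A slightly more portable sketch of that last renaming step: strip the directory with basename() instead of hard-coding the path.
my_data = data_list %>%
  purrr::map_dfr(read_csv, .id = "source") %>%
  dplyr::mutate(source = basename(source))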
You could use purrr::map2 here, which works similarly to mapply
filenames <- list.files(path, full.names = TRUE, pattern = fileptrn, recursive = TRUE)
sites <- str_extract(filenames, "[A-Z]{2}-[A-Za-z0-9]{3}") # same length as filenames
library(purrr)
library(dplyr)
library(readr)
stopifnot(length(filenames)==length(sites)) # returns error if not the same length
ans <- map2(filenames, sites, ~read_csv(.x) %>% mutate(id = .y)) # .x is element in filenames, and .y is element in sites
The output of map2() is a list, similar to lapply().
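So one extra step (a sketch) collapses it into a single table:
tbl <- dplyr::bind_rows(ans)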
If you have a development version of purrr, you can use imap, which is a wrapper for map2 with an index
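For example, a sketch of the same pipeline with imap(), naming the vector first so the names carry the site labels:
ans <- imap(set_names(filenames, sites),
            ~ read_csv(.x) %>% mutate(id = .y))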
data.table approach:
If you name the list, then you can use this name to add to the data.table when binding the list together.
Workflow:
library(data.table)
files <- list.files( whatever... )
# read the files from the list
l <- lapply( files, fread )
# name the list using the basename of `files`
# this is also the step where you can manipulate the file names however you like
names(l) <- basename( files )
# bind the rows from the list together, putting the file names into the column "id"
dt <- rbindlist( l, idcol = "id" )
You just need to write your own function that reads the csv and adds the column you want, before combining them.
my_read_csv <- function(x) {
  out <- read_csv(x)
  site <- str_extract(x, "[A-Z]{2}-[A-Za-z0-9]{3}")
  cbind(Site = site, out)
}
filenames <- list.files(path, full.names = TRUE, pattern = fileptrn, recursive = TRUE)
tbl <- lapply(filenames, my_read_csv) %>% bind_rows()
You can build a file_names vector based on sites with the exact same length as tbl and then combine the two using cbind:
### Get file names
filenames <- list.files(path, full.names = TRUE, pattern = fileptrn, recursive = TRUE)
sites <- str_extract(filenames, "[A-Z]{2}-[A-Za-z0-9]{3}")
### Get length of each csv
file_lengths <- unlist(lapply(lapply(filenames, read_csv), nrow))
### Repeat sites using lengths
file_names <- rep(sites, file_lengths)
### Create table
tbl <- lapply(filenames, read_csv) %>%
  bind_rows()
### Combine file_names and tbl
tbl <- cbind(tbl, filename = file_names)
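Note that this approach reads every file twice (once to count rows, once to build tbl). A one-pass sketch with Map(), using the same filenames and sites vectors:
tbl <- dplyr::bind_rows(Map(function(f, s) {
  out <- readr::read_csv(f)
  out$filename <- s   # tag each file's rows with its site label
  out
}, filenames, sites))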

Open .mol files and compile information

I'm trying to create a program that opens a lot of files (.mol), copies specific information from those files, and saves it into a spreadsheet (TAB-delimited files, '\t').
I have 10000 mol files on my computer, named SN00000001 SN00000002 SN00000003 ... SN00010000.
(download link => http://bioinf-applied.charite.de/supernatural_new/src/download_mol.php?sn_id=SN00000001)
I have two questions:
I already tried the functions load.molecules (rcdk) and loadsdf (ChemmineR), but I did not succeed in opening a .mol file in R.
Is it possible to open each .mol file and save specific information such as "ID", "Name", and "Molecular Formula" into a single spreadsheet using R?
I hope this works; I only tested it with 2 mol files. I used read.SDFset from the ChemmineR package to read all the mol files. The tidyverse package is used to work with tibbles; tibbles are data frames with some extra properties and functionalities.
library(tidyverse)
library(ChemmineR)
# get the full paths of your mol files
# (specify your folder here; on Windows also add the drive letter,
#  e.g. "c:/users/path/to/my/mol_files")
mol_files <- list.files(path = "/home/rico/r-stuff/temp",
                        pattern = "\\.mol$",
                        full.names = TRUE)
# create a tibble with the filenames (incl. the full path)
df <- tibble(filenames = mol_files)
# create a function to extract all the information
extract_info <- function(sdfset) {
  # extract information from an SDFset (ChemmineR)
  # this only works if there is one molecule in the sdfset
  ID <- sdfset@SDF[[1]]@datablock["SNID"]
  Name <- sdfset@SDF[[1]]@header["Molecule_Name"]
  Molecular_Formula <- sdfset@SDF[[1]]@datablock["Molecular Formula"]
  sdf_info <- tibble(SNID = ID,
                     Name = Name,
                     MolFormula = Molecular_Formula)
  return(sdf_info)
}
# read all files and extract info
df <- df %>%
  mutate(sdf_data = map(.x = filenames,
                        .f = ~ read.SDFset(sdfstr = .x)),
         info = map(.x = sdf_data,
                    .f = ~ extract_info(sdfset = .x)))
# make a nice tibble with only the info you want
all_info <- df %>%
  select(info) %>%
  unnest(info)
# write to file
write_delim(x = all_info,
            file = file.path(getwd(), "temp", "test.tsv"),
            delim = "\t")
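As a side note, ChemmineR also exports accessor functions (header(), datablock()), so extract_info() could arguably be written without reaching into the slots directly; a sketch, assuming a single-molecule SDFset as above:
extract_info <- function(sdfset) {
  sdf <- sdfset@SDF[[1]]
  # header() and datablock() return named character vectors for an SDF
  tibble(SNID = datablock(sdf)["SNID"],
         Name = header(sdf)["Molecule_Name"],
         MolFormula = datablock(sdf)["Molecular Formula"])
}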
