I'm trying to create a program that opens a lot of files (.mol), and copies specific information from those files and saves it into a spreadsheet (TAB delimited files '\t').
I have 10000 mol files on my computer that look like SN00000001 SN00000002 SN00000003 ... SN00010000.
(download link => http://bioinf-applied.charite.de/supernatural_new/src/download_mol.php?sn_id=SN00000001)
I have two questions:
I already tried to use the function load.molecules (rcdk) and ChemmineR (loadsdf), but I did not succeed in opening a .mol file in R.
Is it possible to open each .mol file and save specific information such as "ID", "Name", and "Molecular Formula" into a single spreadsheet using R?
Ok, I will send you the code
# get the full path of your mol files
# The folder is already an absolute path, so it is used as-is: prefixing it
# with getwd() or letting the string wrap onto a second line would point
# list.files() at a folder that does not exist.
# NOTE(review): the original literal was wrapped between "Download" and
# "SuperNatural II"; a single space is assumed here - confirm the folder name.
mol_files <- list.files(
  path = "/Users/189919604/Desktop/Download SuperNatural II/SN00000001", # specify your folder here
  # `pattern` is a regular expression, not a shell glob:
  # keep only names that end in ".mol"
  pattern = "\\.mol$",
  ignore.case = TRUE,
  full.names = TRUE
)
# create tibble, with filenames (incl. the full path)
df <- tibble(filenames = mol_files)
# create function to extract all the information

#' Extract the ID, name and molecular formula from an SDFset (ChemmineR)
#'
#' Only the first molecule of the set is read, so this only works as
#' intended if there is one molecule in the sdfset.
#'
#' @param sdfset An SDFset object, e.g. the result of read.SDFset().
#' @return A tibble with the columns SNID, Name and MolFormula.
extract_info <- function(sdfset) {
  # S4 slots are accessed with `@`. A `#` in that position starts a comment,
  # which would silently assign the whole sdfset to every field.
  first_mol <- sdfset@SDF[[1]]
  ID <- first_mol@datablock["SNID"]
  Name <- first_mol@header["Molecule_Name"]
  Molecular_Formula <- first_mol@datablock["Molecular Formula"]
  tibble(SNID = ID,
         Name = Name,
         MolFormula = Molecular_Formula)
}
# load every mol file, then pull the wanted fields out of each loaded set
df <- df %>%
  mutate(
    # one SDFset per file path
    sdf_data = map(filenames, function(mol_path) read.SDFset(sdfstr = mol_path)),
    # one small tibble of extracted fields per SDFset
    info = map(sdf_data, function(one_set) extract_info(sdfset = one_set))
  )
# make a nice tibble with only the info you want
# `info` is the list-column that holds the extracted fields; the data has no
# column called `molecule`, so selecting that name would fail.
all_info <- df %>%
  select(info) %>%
  unnest(info)
# write to file (tab-delimited)
# The output location is passed by position: readr 1.4.0 renamed this
# argument from `path` to `file`, and the second position works with both.
write_delim(all_info,
            file.path(getwd(), "test.tsv"),
            delim = "\t")
I hope this works; I only tested it with 2 mol files. I used read.SDFset from the ChemmineR package to read all the mol files. I use the tidyverse package to work with tibbles. Tibbles are actually data frames with some extra properties / functionalities.
library(tidyverse)
library(ChemmineR)
# get the full path of your mol files
# specify your folder here; in case of windows also add your drive letter,
# e.g.: "c:/users/path/to/my/mol_files"
mol_files <- list.files(
  path = "/home/rico/r-stuff/temp",
  # `pattern` is a regular expression, not a shell glob:
  # keep only names that end in ".mol"
  pattern = "\\.mol$",
  ignore.case = TRUE,
  full.names = TRUE
)
# create tibble, with filenames (incl. the full path)
df <- tibble(filenames = mol_files)
# create function to extract all the information

#' Extract the ID, name and molecular formula from an SDFset (ChemmineR)
#'
#' Only the first molecule of the set is read, so this only works as
#' intended if there is one molecule in the sdfset.
#'
#' @param sdfset An SDFset object, e.g. the result of read.SDFset().
#' @return A tibble with the columns SNID, Name and MolFormula.
extract_info <- function(sdfset) {
  # S4 slots are accessed with `@`. A `#` in that position starts a comment,
  # which would silently assign the whole sdfset to every field.
  first_mol <- sdfset@SDF[[1]]
  ID <- first_mol@datablock["SNID"]
  Name <- first_mol@header["Molecule_Name"]
  Molecular_Formula <- first_mol@datablock["Molecular Formula"]
  tibble(SNID = ID,
         Name = Name,
         MolFormula = Molecular_Formula)
}
# load every mol file and pull the wanted fields out of each loaded set
df <- df %>%
  mutate(
    sdf_data = map(filenames, function(mol_path) read.SDFset(sdfstr = mol_path)),
    info = map(sdf_data, function(one_set) extract_info(sdfset = one_set))
  )
# keep just the extracted fields and flatten the list-column into
# ordinary columns (one row per file)
all_info <- unnest(select(df, info), info)
# write to file (tab-delimited)
# Make sure the target folder exists first; writing into a missing folder
# would fail.
out_dir <- file.path(getwd(), "temp")
dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
# The output location is passed by position: readr 1.4.0 renamed this
# argument from `path` to `file`, and the second position works with both.
write_delim(all_info,
            file.path(out_dir, "test.tsv"),
            delim = "\t")
Related
I am trying to write an R script that will match the file names inside a directory against the file names listed in a csv file, so I can tell which files have already been downloaded and which data I still need to download. I have written code that reads the files from the directory and lists them as a df, and that reads in the csv file. However, I am having trouble changing the file name to pull out the string I want, as well as matching the file name with the name column in the csv file. Ideally, I would also like to create a new spreadsheet that tells me which files match, so I know what has been downloaded. This is what I have so far.
# read files from directory and list as df
file_names <- list.files(path = "peaches/",
                         pattern = "jpg",
                         all.files = TRUE,
                         full.names = TRUE,
                         recursive = TRUE) %>%
  # turn into df: the piped vector of paths becomes the single column `paths`
  # (`file_names` does not exist yet at this point, so it cannot be passed
  # as an argument here)
  data.frame(paths = .)
# read in xl file
name_data <- read_excel("peaches/all_data.xlsx")
# change the file_name from the string peaches//fruit/1234/12pink.jpg.txt to -> 12pink
# match the file name with the name column in name_data
# create a new spread sheet that pulls the id and row if it has been downloaded [enter image description here][1]
Example files/directory
Let's create an example directory with some example files. This will let us prove that the solution works and is key to a reproducible solution.
library(dplyr)
library(writexl)
library(readxl)
# Example directory with example files.
# The names follow the "<name>.jpg.<ext>" layout from the question, so a
# search for ".jpg" has something to find: only the two "foo*.jpg.csv" files
# contain ".jpg"; the other three are decoys that must not match.
# showWarnings = FALSE lets the example be re-run when "peaches" already exists.
dir.create(path = "peaches", showWarnings = FALSE)
write.csv(data.frame(x = 5), file = "peaches/foo.csv")
write.csv(data.frame(x = 20), file = "peaches/foo.jpg.csv")
write.csv(data.frame(x = 1), file = "peaches/foo2.jpg.csv")
write.csv(data.frame(z = 2), file = "peaches/bar.csv")
write.csv(data.frame(z = 5), file = "peaches/bar.jgp.csv")
# Example Excel file: a single `name` column with one name that has a
# matching file ("foo") and one that does not ("hotdog")
write_xlsx(
  data.frame(name = c("foo", "hotdog")),
  path = "peaches/all_data.xlsx"
)
Solution
We can now use our example files and directory to show a solution to the problem.
# Collect the paths of all files under "peaches/" whose name contains ".jpg"
# and keep them in a one-column data.frame. data.frame() is called directly
# (instead of as.data.frame()) so the column gets a proper name, and the
# period is escaped with \\ so the regular expression matches it literally.
file_names <- data.frame(
  paths = list.files(
    path = "peaches/",
    pattern = "\\.jpg",
    all.files = TRUE,
    full.names = TRUE,
    recursive = TRUE
  )
)
# Add a column holding the part of each file name that comes before ".jpg":
# basename() strips the directory, the regular expression strips ".jpg" and
# everything after it.
file_names$match_string <- gsub(
  pattern = "\\.jpg.*",
  replacement = "",
  x = basename(file_names$paths)
)
file_names$match_string
#> [1] "foo" "foo2"
# Load the spreadsheet that lists the names to look for
name_data <- read_excel("peaches/all_data.xlsx")
name_data$name
#> [1] "foo" "hotdog"
# Flag each name that has a matching file (1 = found, 0 = not found)
# and record the row number
name_data <- mutate(
  name_data,
  matched = if_else(name %in% file_names$match_string, 1, 0),
  rowID = row_number()
)
# Save the rows that were matched (i.e. already downloaded) as a new spreadsheet
write_xlsx(
  filter(name_data, matched == 1),
  path = "peaches/already_downloaded.xlsx"
)
I have a list of data frames (df_list) and I want a tidyverse approach to write all the csv files from my list while maintaining their original file names.
So far i did:
# paths of every .csv file in the working directory
df <- dir(pattern = "\\.csv$", full.names = TRUE)
# one (initially empty) slot per file
df_list <- vector("list", length(df))
# iterate over the files that were actually listed (`df`)
for (i in seq_along(df)) {
  df_list[[i]] <- read.csv(df[[i]], sep = ";")
}
# df_list carries no names, so imap() hands the position to .y
imap(df_list, ~ write_csv(.x, paste0(.y, ".csv")))
my current output is:
1.csv; 2.csv; 3.csv ...
The below will read in a set of files from an example directory, apply a function to those files, then save the files with the exact same names.
library(purrr)
library(dplyr)
# Set up an "example" directory holding two small .csv files to work with
dir.create(path = "example")
write.csv(data.frame(x1 = letters), file = "example/example1.csv")
write.csv(data.frame(x2 = 1:20), file = "example/example2.csv")
# Relative paths (searched recursively from the working directory) of every
# file whose path matches "example.*csv", kept as a list
path_list <- as.list(
  list.files(pattern = "example.*csv", recursive = TRUE)
)
# One data frame per path
file_list <- map(path_list, function(csv_path) read.csv(csv_path, sep = ","))
# Apply some change to every data frame (here: add a constant column)
file_list_updated <- map(file_list, function(one_df) mutate(one_df, foo = 5))
# Write the updated files to the old file names.
# walk2() is used because only the side effect (the written file) matters,
# so no list of NULLs is returned and printed. row.names = FALSE keeps
# write.csv() from adding one more index column on every read/write cycle.
walk2(
  .x = file_list_updated,
  .y = path_list,
  .f = ~ write.csv(x = .x, file = .y, row.names = FALSE)
)
I have a script that merges all csv files in a folder.
My problem is that a new column named "...20" is created with empty data. How can I avoid that?
Thanks for helping
My script :
# Full paths of every file on the share whose name ends in ".csv"
folderfiles <- list.files(path = "//myserver/Depots/",
pattern = "\\.csv$",
full.names = TRUE)
# Name each path after itself, read every file as ";"-delimited and
# row-bind the results into one data frame.
# NOTE(review): an auto-named column such as "...20" presumably comes from a
# trailing ";" at the end of the lines in the files - confirm against the data.
data_csv <- folderfiles %>%
set_names() %>%
map_dfr(.f = read_delim,
delim = ";",
)
and the message :
It's difficult to debug this without access to specific files. However, you can attempt to specify the columns you want to read using the cols_only function. For example, let's assume that you only want to read the mpg column. You can do that in the following manner:
library("fs")
library("readr")
library("tidyverse")
# Create ten identical sample files, cars1.csv ... cars10.csv, in a
# "cars" folder inside the session's temporary directory
temp_dir_files <- path_temp("cars")
dir_create(temp_dir_files)
for (file_no in seq_len(10)) {
  target <- path(temp_dir_files, paste0("cars", file_no, ".csv"))
  write_csv(mtcars, file = target)
}
# Import only the selected column from all sample files at once:
# read_*() accepts a vector of paths, cols_only() drops every column that
# is not listed, and `id` adds a column naming the file each row came from
read_csv(
  file = dir_ls(temp_dir_files, glob = "*.csv"),
  col_types = cols_only(mpg = col_double()),
  id = "input_file"
)
The cols_only specification passed to read_csv will force the read_csv to skip the remaining columns and only import the column with the matching name.
I have multiple csv files in a folder none of which have a header. I want to preserve the order set out by the number at the end of the file. The file names are "output-1.csv", "output-2.csv" and so on. Is there a way to include the file name of each csv so I know which data corresponds to which file. The answer [here][1] gets close to what I want.
library(tidyverse)
#' Load the data ----
mydata <-
  list.files(path = "C:\\Users\\Documents\\Manuscripts\\experiment1\\output",
             # regular expression (not a glob): names ending in ".csv"
             pattern = "\\.csv$",
             # keep the folder in each path so read_csv() can find the file
             # regardless of the current working directory
             full.names = TRUE) %>%
  # the files have no header row, so let readr generate the column names
  map_df(~ read_csv(., col_names = FALSE))
mydata
You can use:
library(tidyverse)
# Full paths of all files whose name ends in ".csv" (the dot is escaped so
# it is matched literally)
csv_paths <- list.files("C:\\Users\\Documents\\Manuscripts\\experiment1\\output",
                        pattern = "\\.csv$", full.names = TRUE)
# list.files() sorts alphabetically ("output-10" before "output-2"), so
# order the paths by the number at the end of the file name instead
file_number <- as.integer(str_extract(basename(csv_paths), "\\d+(?=\\.csv$)"))
mydata <- csv_paths[order(file_number)] %>%
  # name each path after its file name without the ".csv" extension;
  # map_dfr() stores that name in the `file` column
  set_names(str_sub(basename(.), 1, -5)) %>%
  # the files have no header row, so the first line must be read as data
  map_dfr(read_csv, col_names = FALSE, .id = "file")
Sample
# Two sample inputs; some values carry a stray "*" and are therefore text.
# stringsAsFactors = FALSE keeps them as character in every R version.
fileABC <- data.frame("id" = c(1:10),
                      "var1" = c("5*", 8, 1, 4, "3*", 5, 6, 7, "7*", 1),
                      "var2" = c(7, 4, 6, "1*", "8*", 2, 0, 7, "0*", 1),
                      stringsAsFactors = FALSE)
fileQWE <- data.frame("id" = c(1:10),
                      "var1" = c(7, "5*", 8, 1, 4, "3*", 5, "7*", 6, 2),
                      "var2" = c("8*", 2, 0, 7, 7, 4, 6, "1*", "0*", 1),
                      stringsAsFactors = FALSE)
# Stack the columns of both inputs
var1 <- c(fileABC$var1, fileQWE$var1)
var2 <- c(fileABC$var2, fileQWE$var2)
# Combined layout: both inputs stacked, plus a `source` column naming the
# input each row came from (a comma is required after every argument)
WANT <- data.frame("id" = c(1:10, 1:10),
                   "var1" = var1,
                   "var2" = var2,
                   "source" = c(rep("fileABC", 10), rep("fileQWE", 10)),
                   stringsAsFactors = FALSE)
I have fileABC and fileQWE. First of all, these files contain random * characters that I want to eliminate. Secondly, I want to read both of the files into R and save them as fileABC and fileQWE. Then I want to create the data frame WANT, which combines var1 from both files and var2 from both files and adds a new variable "source" that equals fileABC or fileQWE depending on where the row comes from. I included my attempt, but as you can see if you run it, it does not work.
We can use bind_rows and get the numeric elements extracted with parse_number
library(tidyverse)
# Stack both data frames; lst() names the list elements after the objects,
# and those names end up in the `source` column.
# Every column whose name starts with "var" is then converted to numeric:
# parse_number() extracts the number and drops the stray "*".
# across() is the current replacement for the superseded mutate_at().
bind_rows(lst(fileABC, fileQWE), .id = "source") %>%
  mutate(across(starts_with("var"),
                ~ readr::parse_number(as.character(.x))))
Or using base R
# Gather every object whose name starts with "file" into a named list
lst1 <- mget(ls(pattern = "^file"))
# Tag each data frame with its own name in a `source` column, then stack them
tagged <- Map(function(one_df, df_name) cbind(one_df, source = df_name),
              lst1, names(lst1))
out <- do.call(rbind, tagged)
# rbind() builds row names from the list names; reset them
row.names(out) <- NULL
# Columns 2 and 3: remove the literal "*" and convert to numeric
out[2:3] <- lapply(out[2:3], function(x) {
  as.numeric(sub("*", "", x, fixed = TRUE))
})
If we want to read directly from .csv file (assuming files are in the working directory)
library(readr)
# .csv files in the working directory; the dot is escaped and the pattern
# anchored so only names that really end in ".csv" are picked up
files <- list.files(pattern = "\\.csv$")
# name each path after its file name without the extension; imap_dfr()
# passes that name as the second argument
names(files) <- str_remove(files, "\\.csv$")
imap_dfr(files, function(csv_path, src) {
  read.csv(csv_path) %>%
    mutate(
      # as.character() first: read.csv() returns a numeric column when a
      # file happens to contain no "*", and parse_number() only accepts text
      across(starts_with("var"), ~ readr::parse_number(as.character(.x))),
      source = src
    )
})