Performing the same operations on multiple datasets in R - r

I'm trying to make a function in R, that performs some specific operations on a lot of different data sets, with the following code:
library(parallel)
cluster = makeCluster(2)
setwd("D:\\Speciale")
# Question code (as posted): meant to process one municipality per call.
# NOTE(review): the argument `kommune` is never used in the body -- the file
# names below contain the literal text "kommune", which matches the reported
# error "File 'Adresser og distancer\kommune.csv' does not exist".
data_func <- function(kommune) {
# NOTE(review): inside a function ls() lists the function's own environment,
# so this removes the `kommune` argument instead of clearing the workspace.
rm(list=ls())
# Packages are attached inside the function, i.e. on the process that runs it.
library(dplyr)
library(data.table)
library (tidyr)
#Load address and turbine datasets
# Both paths are relative, so they are resolved against the working directory
# of the process that executes this function.
distances <- fread(file="Adresser og distancer\\kommune.csv", header=TRUE, sep=",", colClasses = c("longitude" = "character", "latitude" = "character", "min_distance" = "character", "distance_turbine" = "character", "id_turbine" = "character"), encoding="Latin-1")
turbines <- fread(file="turbines_DK.csv", header=TRUE, sep=",", colClasses = c("lon" = "character", "lat" = "character", "id_turbine" = "character", "total_height" = "character", "location" = "character"), encoding="Latin-1")
Some cleaning of the data and construction of new variables
#write out the dataset
# NOTE(review): this changes the working directory only after the two reads
# above; the output name is again the literal "final_kommune.csv".
setwd("D:\\Speciale\\Analysedata")
# NOTE(review): `mock_final` is presumably created by the elided cleaning step.
fwrite(mock_final, file = "final_kommune.csv", row.names = FALSE)
}
do.call(rbind, parLapply(cl = cluster, c("Albertslund", "Alleroed"), data_func))
When I do this, I get the following error message:
Error in checkForRemoteErrors(val) :
2 nodes produced errors; first error: File 'Adresser og distancer\kommune.csv' does not exist or is non-readable. getwd()=='C:/Users/KSAlb/OneDrive/Dokumenter'
I need it to change the name of the files. Here it should insert Albertslund instead of kommune in the file names, perform the operations, write out a CSV file (changing "final_kommune.csv" to "final_Albertslund.csv"), clear the environment and then move on to the next data set, Alleroed.
Albertslund and Alleroed are just examples, there is a total of 98 data sets I need to process.

Maybe something like the code below can be of help. Untested, since there are no data.
library(parallel)
library(dplyr)
library(data.table)
library(tidyr)
# Process one municipality: read "<inpath>/<kommune>.csv", build the analysis
# data set and write it to "<outpath>/final_<kommune>.csv".
#
# Arguments:
#   kommune  - municipality name; used to build the input and output file names.
#   inpath   - directory holding the per-municipality address/distance files.
#              A relative path is resolved against the working directory of
#              the process running the function; under parLapply() that is
#              the worker process, not the master session.
#   turbines - turbine table, read once by the caller and passed in.
#   outpath  - directory the result file is written to.
#
# data.table functions are namespace-qualified on purpose: parLapply() runs
# this function on worker processes, where packages attached with library()
# in the master session are not attached. Any dplyr/tidyr calls added to the
# cleaning step need the same treatment (or must be loaded on the workers).
data_func <- function(kommune, inpath = "Adresser og distancer",
                      turbines, outpath = "D:/Speciale/Analysedata") {
  infile <- file.path(inpath, paste0(kommune, ".csv"))
  # Load address and turbine datasets
  distances <- data.table::fread(
    file = infile,
    header = TRUE,
    sep = ",",
    colClasses = c("longitude" = "character", "latitude" = "character", "min_distance" = "character", "distance_turbine" = "character", "id_turbine" = "character"),
    encoding = "Latin-1"
  )
  # Some cleaning of the data and construction of new variables
  # (elided in the post; this step is expected to create `mock_final`)
  # Write out the dataset
  outfile <- file.path(outpath, paste0("final_", kommune, ".csv"))
  data.table::fwrite(mock_final, file = outfile, row.names = FALSE)
}
# Driver: read the shared turbine table once, then process every municipality
# in parallel on a two-worker cluster.
base_dir <- "D:/Speciale"
setwd(base_dir)
cluster <- makeCluster(2)
# Workers are separate R processes. The error reported in the question shows
# a worker whose getwd() differs from the directory set in the master, so the
# working directory and the packages are set up on every worker explicitly.
invisible(clusterCall(cluster, setwd, base_dir))
invisible(clusterEvalQ(cluster, {
  library(dplyr)
  library(data.table)
  library(tidyr)
}))
# Read turbines file just once
turbines <- fread(
  file = "turbines_DK.csv",
  header = TRUE,
  sep = ",",
  colClasses = c("lon" = "character", "lat" = "character", "id_turbine" = "character", "total_height" = "character", "location" = "character"),
  encoding = "Latin-1"
)
kommune_vec <- c("Albertslund", "Alleroed")
# Always release the workers, even when one of the tasks fails.
results <- tryCatch(
  parLapply(cl = cluster, kommune_vec, data_func, turbines = turbines),
  finally = stopCluster(cluster)
)
do.call(rbind, results)

Related

data.table::fread error when converting MAF files to data table

I want to merge the 50 MAF files with the sample information so that I can read it as a data.table and subset it.
library(maftools)
# Load MAF files
# NOTE(review): system.file() looks for the named files inside the installed
# maftools package (its "extdata" folder), not inside the local "mafs/"
# directory, and yields "" when nothing is found -- consistent with the
# "File '' does not exist" error reported for this code.
maf = system.file("extdata", list.files(path="mafs/"), package="maftools")
# Load sample information
# NOTE(review): same lookup rule as above; "sample-information.tsv" would have
# to be shipped with maftools for this to return a path.
si <- system.file("extdata", "sample-information.tsv", package="maftools")
# Read the MAF data together with the clinical/sample annotation.
d = read.maf(maf=maf, clinicalData=si)
Traceback:
Error in data.table::fread(file = maf, sep = "\t", stringsAsFactors = FALSE, :
File '' does not exist or is non-readable. getwd()=='C:/Users/User/Documents/VanAllen'
> traceback()
3: stop("File '", file, "' does not exist or is non-readable. getwd()=='",
getwd(), "'")
2: data.table::fread(file = maf, sep = "\t", stringsAsFactors = FALSE,
verbose = FALSE, data.table = TRUE, showProgress = TRUE,
header = TRUE, fill = TRUE, skip = "Hugo_Symbol", quote = "")
1: read.maf(maf = maf, clinicalData = si)
1: data.table::fread(input = maf)
Maftools documentation:
https://www.bioconductor.org/packages/release/bioc/manuals/maftools/man/maftools.pdf
When I run your code, maf is indeed an empty string (""), which of course cannot be read by fread. system.file() returns "" when it cannot find the requested file inside the installed package. However, when I try
fread("R/x86_64-pc-linux-gnu-library/3.6/maftools/extdata/brca.maf.gz")
it works as expected.

Error in as.data.frame.default(x[[i]], optional = TRUE) : cannot coerce class ‘"function"’ to a data.frame

I have been trying to combine files, and I keep getting a variety of errors each time I try to run it:
Error in list.files(path = "~/Documents", full.names = FALSE) %>% lapply(read_csv) %>% : could not find function "%>%"
> write.csv(data, file = "Fecundity.csv", row_names = FALSE)
Error in utils::write.table(data, file = "Fecundity.csv", row_names = FALSE, :
unused argument (row_names = FALSE)
> write.csv(data, file = "Fecundity.csv", row.names = FALSE)
Error in as.data.frame.default(x[[i]], optional = TRUE) :
cannot coerce class ‘"function"’ to a data.frame
> write.csv(data, file = "Fecundity.csv", row.names = TRUE)
Error in as.data.frame.default(x[[i]], optional = TRUE) :
cannot coerce class ‘"function"’ to a data.frame
this is the code
library(data.table)
setwd(/cloud/project/Data sheets)
files <- list.files(pattern = ".cvs")
temp <- lapply(files, fread, sep= ".")
data <- rbindlist(temp)
write.csv(data, file = "Fecundity.csv", row.names = FALSE)
Do it this way.
library(data.table)
library(fs)
# Combine every CSV file in "csvFiles" into a single "csvFiles/Fecundity.csv".
out_file <- "csvFiles/Fecundity.csv"
files <- dir_ls("csvFiles", regexp = "\\.csv$")
# The output lives in the input directory; skip the result of a previous run
# so it is not merged into itself.
files <- files[basename(files) != basename(out_file)]
if (length(files) > 0) {
  # `file =` reads the file stored at that path; `text =` would treat the
  # path string itself as the data to parse.
  tables <- lapply(files, function(path) fread(file = path))
  # Stack the tables by column name; columns missing from a file become NA.
  combined <- rbindlist(tables, use.names = TRUE, fill = TRUE)
  # Write once, with a single header row.
  fwrite(combined, out_file)
}
This is the fastest way I know. This code will read thousands of files in seconds!

R asks for a list which seems to be a list according to is.list (=TRUE)

I am using the RAM package.
The function I use is very simple for diversity index, adding up a column in my metadata ;
outname <-OTU.diversity(data=OTUtables, meta=metatables)
(Arguments: data a list of OTU tables.
meta the metadata to append the outputs)
I am looping it but I get this error:
please provide otu tables as list; see ?RAM.input.formatting
So I go to that help menu and read this:
one data set:
data=list(data=otu)
multiple data sets:
data=list(data1=otu1, data2=otu2, data3=otu3)
here is my code:
# Question code (as posted): for each row of metadataMasterTax, read a
# metadata table and an OTU table, run OTU.diversity() and write the result.
# NOTE(review): this assignment is redundant; the for loop sets i itself.
i <- 1
for(i in 1:nrow(metadataMasterTax)){
# Metadata table for row i (path and file name are pasted together).
temp <- read.table(paste(metadataMasterTax$DataAnFilePath[i], metadataMasterTax$meta[i], sep = ""),
sep = "\t", header = TRUE, dec = ".", comment.char = "", quote = "", stringsAsFactors = TRUE,
as.is = TRUE)
temp2 <- temp
temp2$row.names <- NULL #to unactivate numbers generated in the margin
# OTU table for row i; check.names = FALSE keeps the column names as read.
trans <- read.table(paste(metadataMasterTax$taxPath[i], metadataMasterTax$taxName[i], sep = ""),
sep = "\t", header = TRUE, dec = ".", comment.char = "", quote = "", stringsAsFactors = TRUE,
as.is = TRUE, check.names = FALSE)
trans2 <- trans
trans2$row.names <- NULL #to unactivate numbers generated in the margin
# NOTE(review): trans2[i] selects column i of the OTU table (a one-column
# data frame), not "the table for row i"; the list built here is also not
# the object passed to OTU.diversity() on the next line.
data=list(data=trans2[i])
# NOTE(review): OTU.diversity() receives trans2[i] and temp2[i] directly,
# whereas the help text quoted in the post asks for data = list(data = otu).
temp2[i] <- OTU.diversity(data=trans2[i], meta=temp2[i])
# Error in OTU.diversity(trans2, temp2) :
# please provide otu tables as list; see ?RAM.input.formatting
# is.list(trans2)
# [1] TRUE
# is.list(data)
# [1] TRUE
temp$taxonomy <- temp2$taxonomy
# Write the metadata for row i to "<pathDataAn>diversityDir/<ShortName>.meta.div.tsv".
write.table(temp, file=paste(pathDataAn, "diversityDir/", metadataMasterTax$ShortName[i], ".meta.div.tsv", sep = ""),
append = FALSE,
sep = "\t",
row.names = FALSE)
}
Can anyone help me please....
thanks a lot
Because the main problem appears to be getting the OTU.diversity function to work, I focus on this issue. The code snippet below runs OTU.diversity without any problems, using the Google sheets data provided by OP.
library(gsheet)
library(RAM)
for (i in 1:2) {
  # Metadata sheet, downloaded as a plain data frame.
  meta_url <- "https://drive.google.com/open?id=1hF47MbYZ1MG6RzGW-fF6tbMT3z4AxbGN5sAOxL4E8xM"
  meta_tbl <- as.data.frame(gsheet2tbl(meta_url))
  meta_tbl$row.names <- NULL
  # OTU sheet, downloaded the same way.
  otu_url <- "https://drive.google.com/open?id=1gOaEjDcs58T8v1GA-OKhnUsyRDU8Jxt2lQZuPWo6XWU"
  otu_tbl <- as.data.frame(gsheet2tbl(otu_url))
  otu_tbl$row.names <- NULL
  # Label the metadata rows with every OTU column name except the last one
  # (presumably the taxonomy column -- confirm against the sheet).
  sample_ids <- colnames(otu_tbl)[-ncol(otu_tbl)]
  rownames(meta_tbl) <- sample_ids
  # The OTU table is wrapped in a named list, as OTU.diversity() requires.
  meta_div <- OTU.diversity(data = list(data = otu_tbl), meta = meta_tbl)
  out_name <- paste0("file", i, ".meta.div.tsv") # replace
  write.table(
    meta_div,
    file = out_name,
    append = FALSE,
    sep = "\t",
    row.names = FALSE
  )
}
Replace for (i in 1:2) with for(i in 1:nrow(metadataMasterTax)), as.data.frame(gsheet2tbl(...)) with read.table(...), and the file argument in write.table with the appropriate string.

Simplify R code to import big data as character

I am currently using the code below very often to import a big dataset into R, forcing it to treat everything as character in order to avoid the truncation of rows. The code seems to work well, but I was wondering whether any of you knows how it could be simplified or improved so that it doesn't get so repetitive each time I need to do it.
library(readr)
library(stringr)
# Let the user pick one file interactively.
dataset.path <- choose.files(caption = "Select dataset", multi = FALSE)
# Read only the header (n_max = 0) to find out how many columns there are.
data.columns <- read_delim(dataset.path, delim = '\t', col_names = TRUE, n_max = 0)
# Build a col_types string with one "c" per column, e.g. "ccc" for 3 columns.
data.coltypes <- c(rep("c", ncol(data.columns)))
data.coltypes <- str_c(data.coltypes, collapse = "")
# Read the whole file with every column forced to character.
dataset <- read_delim(dataset.path, delim = '\t', col_names = TRUE, col_types = data.coltypes)
As @Roland has suggested, you should write a function. Here is one possibility:
# Read a delimited text file with every column forced to character.
#
# Arguments:
#   path  - file to read. When NULL (the default) a file-selection dialog is
#           shown, so calling foo() with no arguments works as before.
#   delim - field delimiter; defaults to a tab.
#
# Returns the data set produced by readr::read_delim(). Stops with an error
# when readr is not installed or when no file is selected.
foo <- function(path = NULL, delim = "\t") {
  # require() only returns FALSE when the package is missing; fail loudly.
  if (!requireNamespace("readr", quietly = TRUE)) {
    stop("Package 'readr' is required but not installed.", call. = FALSE)
  }
  if (is.null(path)) {
    # choose.files() exists on Windows only; file.choose() is the portable
    # fallback.
    path <- if (.Platform$OS.type == "windows") {
      utils::choose.files(caption = "Select dataset", multi = FALSE)
    } else {
      file.choose()
    }
  }
  # A cancelled dialog yields an empty result.
  if (length(path) != 1L || !nzchar(path)) {
    stop("No dataset selected.", call. = FALSE)
  }
  # Read only the header to learn the number of columns.
  data.columns <- readr::read_delim(path, delim = delim, col_names = TRUE, n_max = 0)
  # One "c" (character) per column, collapsed into a single col_types string.
  data.coltypes <- paste(rep("c", ncol(data.columns)), collapse = "")
  # The last expression is the return value, so `foo()` shows the data and
  # `x <- foo()` stores it.
  readr::read_delim(path, delim = delim, col_names = TRUE, col_types = data.coltypes)
}
You can then just call foo() whenever you need to read a dataset in using this method.
your two liner:
data.coltypes <- c(rep("c", ncol(data.columns)))
data.coltypes <- str_c(data.coltypes, collapse = "")
can be collapsed into a single line that uses base R paste() instead of str_c() from the stringr package.

Can I nest apply functions in R?

I have a series of CSV files that I want to prepare to append together. My appended file will be large, so I'd like to convert some string variables to numeric and date formats in the individual files rather than the larger appended file.
With other software, I would have one for loop that opens the file and nested for loops that would iterate over certain groups of variables. For this project, I am attempting to use R and apply functions.
I have mapply and lapply functions that work independently. I'm now trying to figure out how to combine them. Can I nest them? (See below for the independent parts and the nesting.)
(This code references code in the answer to How do I update data frame variables with sapply results?)
(Is it customary to provide an example CSV to give a reproducible example? Does R have built-in example CSVs?)
These work separately:
# Read "<fileroot>.csv", add a `division` column holding `divisionname`, and
# save the result as "<fileroot>_adj3.csv" (row names are not written).
insert.division <- function(fileroot, divisionname) {
  suffix <- ".csv"
  source_path <- paste0(fileroot, suffix)
  target_path <- paste0(fileroot, "_adj3", suffix)
  records <- read.csv(source_path, header = TRUE, stringsAsFactors = FALSE)
  # A single value is recycled over all rows.
  records[["division"]] <- divisionname
  write.csv(records, file = target_path, row.names = FALSE)
}
# Base names (without extension) of the CSV files to process, and the
# division number that belongs to each of them.
files <- paste0("file", 1:5)
divisions <- 1:5
# Open the files, insert division name, save new versions
mapply(insert.division, fileroot = files, divisionname = divisions)
# Change currency variables from string to numeric
currency.vars <- c("Price", "RetailPrice")
# Drop "$", "," and a trailing ")", then turn a leading "(" into a minus sign.
parse_currency <- function(x) {
  stripped <- gsub("[$,]|\\)$","", x)
  as.numeric(sub("^\\(","-", stripped))
}
df[currency.vars] <- lapply(df[currency.vars], parse_currency)
Combined version:
# Question code (as posted): read "<fileroot>.csv", add the division, convert
# the currency columns to numeric and save "<fileroot>_adj.csv".
file.prep <- function(fileroot, divisionname, currency.vars){
ext <- ".csv"
file <- paste(fileroot, ext, sep = "")
data <- read.csv(file, header = TRUE, stringsAsFactors = FALSE)
data$division <- divisionname
# NOTE(review): the file was read into `data`, but the conversion below reads
# and assigns `df`, which is not defined inside this function; the converted
# columns therefore never reach the `data` object written out afterwards.
df[currency.vars] <- lapply(
df[currency.vars],
# Drop "$", "," and a trailing ")", then turn a leading "(" into "-".
function(x) as.numeric(sub("^\\(","-", gsub("[$,]|\\)$","", x)))
)
write.csv(data, file = paste(fileroot, "_adj", ext, sep = ""),
row.names = FALSE)
}
#Open the files, insert division name, change the currency variables,
#save new versions
# NOTE(review): mapply() iterates over every named argument, so
# `currency.vars = df[currency.vars]` hands one column of `df` to each call
# rather than the vector of column names.
mapply(file.prep, fileroot = files, divisionname = divisions,
currency.vars = df[currency.vars])
I'm not really sure why you're writing it back to file after changing the data, but here's an example of how I might approach the problem.
## Set up three csv files
set.seed(1)
# Draw the random columns in the order w, y, z so the generated values match
# a data.frame() call that evaluates its arguments left to right.
w_col <- paste0("($", sample(1500, 30) / 100, ")")
x_col <- Sys.Date() + 0:29
y_col <- sample(letters, 30, TRUE)
z_col <- paste0("($", sample(1500, 30) / 100, ")")
DF <- data.frame(w = w_col, x = x_col, y = y_col, z = z_col)
fnames <- paste0("file", 1:3, ".csv")
# The grouping vector is recycled over the 30 rows, giving three pieces of
# ten rows each; every piece is written to its own file without row names.
pieces <- split(DF, c(1, 10, 20))
Map(write.csv, pieces, fnames, row.names = FALSE)
Using your file.prep() function, you could adjust it a little and do
# Read "<fileroot>.csv", tag every row with a division and convert the given
# currency columns to numeric, then save the result as "<fileroot>_adj.csv".
#
# Arguments:
#   fileroot - path of the input file without the ".csv" extension.
#   divname  - value stored in the new `division` column.
#   vars     - names of the columns holding currency strings.
#
# Currency strings are converted the same way as in the question's own
# conversion: "$" and "," are dropped and a value wrapped in parentheses,
# e.g. "($12.50)", becomes negative (-12.5). Values that are still not
# numeric after cleaning become NA (with a warning from as.numeric()).
file.prep <- function(fileroot, divname, vars) {
  ext <- ".csv"
  file <- paste0(fileroot, ext)
  data <- read.csv(file, stringsAsFactors = FALSE)
  data$division <- divname
  data[vars] <- lapply(data[vars], function(x) {
    # Remove "$", thousands separators and a closing ")" ...
    cleaned <- gsub("[$,]|\\)$", "", trimws(x))
    # ... and turn the opening "(" into a minus sign before converting.
    as.numeric(sub("^\\(", "-", cleaned))
  })
  write.csv(data, row.names = FALSE, file = paste0(fileroot, "_adj", ext))
}
# One division number per input file; the file roots are "file1" .. "file3".
divname <- seq_len(3)
fnames <- paste0("file", divname)
# `vars` is the same for every file, so it is passed once through MoreArgs
# instead of being iterated over.
shared_args <- list(vars = c("w", "z"))
Map(file.prep, fnames, divname, MoreArgs = shared_args)

Resources