Read many files in parallel and extract data in R

I have 1000 JSON files that I would like to read in parallel. I have 4 CPU cores.
I have a character vector holding the names of all the files, as follows:
cik_files <- list.files("./data/", pattern = "\\.json$")
Using this vector I load each file, extract the data, and add it to the following list:
data <- list()
Below is the code for extracting the data (fromJSON() is from the jsonlite package):
library(jsonlite)

for(i in 1:1000){
  data1 <- fromJSON(paste0("./data/", cik_files[i]), flatten = TRUE)
  if("NetIncomeLoss" %in% names(data1$facts$`us-gaap`)){
    data1 <- data1$facts$`us-gaap`$NetIncomeLoss$units$USD
    data1 <- data1[grep("CY20[0-9]{2}$", data1$frame), c(3, 9)]
    try({
      if(nrow(data1) > 0){
        data1$cik <- strtrim(cik_files[i], 13)
        data[[length(data) + 1]] <- data1
      }
    }, silent = TRUE)
  }
}
This, however, takes quite a lot of time, so I was wondering how I can run the code inside the for loop in parallel.
Thanks in advance.

Here is an attempt to solve the problem in the question. Untested, since there is no data.
Step 1
First of all, rewrite the loop in the question as a function.
f <- function(i, path = "./data", cik_files){
  filename <- file.path(path, cik_files[i])
  data1 <- fromJSON(filename, flatten = TRUE)
  if("NetIncomeLoss" %in% names(data1$facts$`us-gaap`)){
    data1 <- data1$facts$`us-gaap`$NetIncomeLoss$units$USD
    found <- grep("CY20[0-9]{2}$", data1$frame)
    if(length(found) > 0){
      tryCatch({
        out <- data1[found, c(3, 9)]
        out$cik <- strtrim(cik_files[i], 13)
        out
      },
      error = function(e) e,
      warning = function(w) w)
    } else NULL
  } else NULL
}
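Before parallelising, it can help to sanity-check the function sequentially on one file; a quick sketch, untested since there is no data:
library(jsonlite)
cik_files <- list.files("./data/", pattern = "\\.json$")
f(1, cik_files = cik_files) # should return a data frame, NULL, or a condition object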
Step 2
Now load the parallel package and run one of the following, depending on your OS.
library(parallel)
library(jsonlite)

# Not on Windows: fork-based parallelism
json_list <- mclapply(seq_along(cik_files), f, cik_files = cik_files, mc.cores = 4L)

# Windows: socket cluster
ncores <- detectCores()
cl <- makeCluster(ncores - 1L)
clusterExport(cl, "cik_files")
clusterEvalQ(cl, library(jsonlite))
json_list <- parLapply(cl, seq_along(cik_files), f, cik_files = cik_files)
stopCluster(cl)
Step 3
Extract the data from the returned list json_list.
err <- sapply(json_list, inherits, "error")
warn <- sapply(json_list, inherits, "warning")
ok <- !(err | warn)
json_list[ok] # correctly read in
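If the surviving elements are per-file data frames with matching columns (an assumption; untested without the data), the NULL entries can be dropped as well and the rest stacked into one table:
keep <- ok & !sapply(json_list, is.null)
all_data <- do.call(rbind, json_list[keep])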

Related

Problem with accessing elements from parLapply() output

I have a problem accessing elements from the output of parLapply(). When I use the non-parallel lapply() function, I can access the elements with the following code:
out_list <- lapply(list, function)
out_list[[2]][1:5, 1:5] # out_list[[2]] is a matrix in my specific case
But when I try to do the same with the output of parLapply(), I get an error.
The code:
out_list <- parLapply(cl = cluster, list, function)
out_list[[2]][1:5, 1:5]
The error message:
Error in extract_matrix(x, i, j, ...) :
  out_list instance has been unmapped.
Here is the full code:
#!/usr/bin/Rscript
path_to_files = '***********'
file.names <- list.files(path = path_to_files, pattern = "*.bed", full.names = TRUE, recursive = FALSE) # making a list of the desired files
# sequentially ------------------------------------------------------------
library(BGData)
print("Executing lapply...")
example_BEDMatrix_list <- lapply(file.names, BEDMatrix)
print("lapply() done.")
example_BEDMatrix_list[[4]][1:5, 1:5]
#---------------------------------------------------------------------------
# parallel ----------------------------------------------------------------
library(BGData)
library(parallel)
print("Creating cluster...")
copies_of_r <- detectCores() - 5
cluster <- makeCluster(copies_of_r)
clusterExport(cl=cluster, c('file.names'))
print("Cluster created")
print("Executing parLapply()...")
BEDMatrix_list <- parLapply(cluster, file.names[2:4], BEDMatrix)
BEDMatrix_list[[2]][1:5, 1:5]
print("parLapply() executed")
print("stopping cluster...")
stopCluster(cluster)
print("Cluster stopped")
How can I fix this?

R programming (beginner): Combining two lists -> dataframe -> csv

I tried to combine two lists into one dataframe:
all_stas <- list()
for(i in vid_id){
  stas <- get_stats(video_id = i)
  all_stas <- rbind(all_stas, stas)
}
View(all_stas)

all_detail <- list()
for(i in vid_id){
  detail1 <- get_video_details(video_id = i)
  all_detail <- rbind(all_detail, detail1)
}
View(all_detail)

df <- data.frame(all_stas, all_detail)
write.csv(df, file = "new_file.csv")
Afterwards I would like to store it in a CSV file.
When I run it, I get the following warning message:
Warning message:
In rbind(all_stas, stas) :
  number of columns of result is not a multiple of vector length (arg 2)
Does anyone know how I can make the code work?
The block below is what triggers the warning:
all_stas <- list()
for(i in vid_id){
  stas <- get_stats(video_id = i)
  all_stas <- rbind(all_stas, stas)
}
If I understand your code correctly, you can get around it like this:
all_stas <- list()
for(i in vid_id){
  all_stas[[i]] <- get_stats(video_id = i)
}
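From there, assuming get_stats() and get_video_details() each return a one-row data frame per video (untested without the API), you can build all_detail the same way, stack both lists, and write the CSV:
stas_df <- do.call(rbind, all_stas)
detail_df <- do.call(rbind, all_detail)
df <- data.frame(stas_df, detail_df)
write.csv(df, file = "new_file.csv", row.names = FALSE)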

%dopar% error after several iterations

I've been trying to get a parallelized foreach loop running in R. It works fine for approximately ten iterations but then crashes, showing the error:
Error in { : task 7 failed - "missing value where TRUE/FALSE needed"
Calls: %dopar% -> <Anonymous>
Execution halted
I append the results of each loop to a file, which does show the output to be as expected. My script is as follows, using the combn_sub function from this post:
library(data.table)

LBRA <- fread(
  input = "LBRA.012",
  data.table = FALSE)
str_bra <- nrow(LBRA)
br1sums <- colSums(LBRA)
b1non <- which(br1sums == 0)
LBRA_trim <- LBRA[, -b1non]
library(foreach)
library(doMC)
registerDoMC(28)
foreach(X = seq(2, (nrow(LBRA) - 1))) %dopar% {
  com <- combn_sub(
    x = nrow(LBRA),
    m = X,
    nset = 1000)
  out_in <- matrix(
    ncol = 2,
    nrow = 1)
  colnames(out_in) <- c("SNPs", "k")
  for (A in seq(1, ncol(com))){
    rowselect <- com[, A]
    sub <- LBRA_trim[rowselect, ]
    subsum <- colSums(sub)
    length <- length(which(subsum != 0)) - 1
    out_in <- rbind(out_in, c(length, X))
  }
  write.table(
    file = "plateau.csv",
    sep = "\t",
    x = out_in,
    append = TRUE)
}
I had a similar problem with my foreach call...
tmpcol <- foreach(j = idxs:idxe, .combine=cbind) %dopar% { imp(j) }
Error in { : task 18 failed - "missing value where TRUE/FALSE needed"
Changing the .errorhandling parameter only ignores the error
tmpcol <- foreach(j = idxs:idxe, .combine=cbind, .errorhandling="pass") %dopar% { imp(j) }
Warning message:
In fun(accum, result.18) :
number of rows of result is not a multiple of vector length (arg 2)
I suggest running the function from your foreach call directly for X = 7. The problem in my case was that my function, imp(j), was itself throwing an error (for j = 18, it was hitting an NA in a calculation), which resulted in the vague output from foreach.
As @Roland points out, it's a very bad idea to write to a file within a foreach loop. Even when writing in append mode, the individual workers will attempt to write to the file simultaneously and may clobber each other's output. Instead, capture the results of the foreach statement using the .combine="rbind" option and then write to file after the loop:
library(doParallel)

cluster <- makeCluster(28, outfile = "MulticoreLogging.txt")
registerDoParallel(cluster)

foreach_outcome_table <- foreach(X = seq(2, (nrow(LBRA) - 1)), .combine = "rbind") %dopar% {
  cat(paste(Sys.info()[["nodename"]], Sys.getpid(), sep = "-"), "now performing loop", X, "\n")
  com <- combn_sub(x = nrow(LBRA), m = X, nset = 1000)
  out_in <- matrix(ncol = 2, nrow = 1)
  colnames(out_in) <- c("SNPs", "k")
  for (A in seq(1, ncol(com))){
    rowselect <- com[, A]
    sub <- LBRA_trim[rowselect, ]
    subsum <- colSums(sub)
    length <- length(which(subsum != 0)) - 1
    out_in <- rbind(out_in, c(length, X))
  }
  out_in
}
write.table(file = "plateau.csv", sep = "\t", x = foreach_outcome_table, append = TRUE)
stopCluster(cluster)
Further, you could replace the inner for loop with a nested foreach loop which would probably be more efficient.
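For reference, nested foreach loops are written with the %:% operator. Here is a generic, self-contained sketch; the arithmetic body is a placeholder, not the question's workload:
library(foreach)
library(doParallel)
registerDoParallel(2)
# %:% merges the two loops into a single stream of tasks, so the backend
# can schedule work across both levels at once.
res <- foreach(b = 1:4, .combine = "cbind") %:%
  foreach(a = 1:3, .combine = "c") %dopar% {
    a * b # placeholder for the real inner-loop work
  }
res # a 3 x 4 matrix
stopImplicitCluster()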
There could be many reasons for the error, "missing value where TRUE/FALSE needed".
What helped for me was to remove the %dopar% and run the same code on a single item. This revealed more and clearer error messages, which I think get lost when running in parallel. My error had nothing to do with %dopar% itself.
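Concretely, swapping %dopar% for %do% runs the loop sequentially in the current session, where the full error message and traceback() are available; a minimal illustration with a placeholder body:
library(foreach)
# %do% evaluates the body in the main session, so an error surfaces
# immediately instead of being wrapped in a vague "task N failed" message.
res <- foreach(X = 7) %do% {
  X^2 # replace with the body that fails under %dopar%
}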

R jsonlite: How to tackle the error below?

See the R code below; I'm using the jsonlite package to scrape data from a website:
library(jsonlite)
url <- "http://fantasy.premierleague.com/web/api/elements/"
seasonsdata <- data.frame(matrix(NA,nrow=1,ncol=20))
seasonsdata <- seasonsdata[-1,]
fetchData <- function(i) {
  res <- try(a <- fromJSON(paste0(url, i)))
  if(!inherits(res, "try-error")) {
    b <- data.frame(a[1], a[20], a[21], as.data.frame(a$season_history))
  }
}
seasonsdata <- lapply(1:696, fetchData)
seasonsdata <- do.call(rbind, lapply(seasonsdata, data.frame, stringsAsFactors = FALSE))
The code works fine for 'i' up to 10 at least; I'm getting the desired output. However, as I increase 'i' to 696, I get the error:
Error in data.frame(a[1], a[20], a[21], as.data.frame(a$season_history)) :
arguments imply differing number of rows: 1, 0
Any advice?
If a$season_history is empty (page 57 is an example), then when you do data.frame(a[1], a[20], a[21], as.data.frame(a$season_history)) the first three elements have one row (they are scalars) and the last element has zero rows. In your function you can first check whether a$season_history is there; if it's not, you can create a row of NAs in its place.
However, there is another problem with your code that you may not be aware of yet: not every page up to 696 exists, and you get a 404 error when you try to pull data from those that don't. I added some steps to remove those pages before the final do.call(rbind, ...) step.
library(jsonlite)
url <- "http://fantasy.premierleague.com/web/api/elements/"
seasonsdata <- data.frame(matrix(NA, nrow = 1, ncol = 20))
seasonsdata <- seasonsdata[-1, ]
fetchData <- function(i) {
  res <- try(a <- fromJSON(paste0(url, i)))
  if (!inherits(res, "try-error")) {
    if (nrow(as.data.frame(a$season_history)) == 0) {
      b <- data.frame(a[1], a[20], a[21], as.data.frame(matrix(NA, ncol = 17)))
    } else {
      b <- data.frame(a[1], a[20], a[21], as.data.frame(a$season_history))
    }
  }
}
seasonsdata <- lapply(1:696, fetchData)
seasonsdata <- seasonsdata[!sapply(seasonsdata, is.null)]
seasonsdata <- seasonsdata[sapply(seasonsdata, is.data.frame)]
seasonsdata <- do.call(rbind, lapply(seasonsdata, data.frame, stringsAsFactors = FALSE))

how to save the output of a foreach loop in R

I am having trouble saving my data output after a foreach loop.
Here is the function that reads my data and processes it:
readFiles <- function(x){
  data <- read.table("filelist",
                     skip = grep('# Begin: Data Text', readLines(filelist)),
                     na.strings = c("NA", "-", "?"),
                     colClasses = "numeric")
  my <- as.matrix(data[1:57600, 2])
  mesh <- array(my, dim = c(120, 60, 8))
  Ms <- 1350*10^3 # A/m
  asd2 <- (mesh[70:75, 24:36, 2])/Ms # in A/m
  ort_my <- mean(asd2)
  return(ort_my)
}
And here is the code for doing the processing in parallel:
# R code to run functions in parallel
detectCores() # this will tell you how many cores are available
library(foreach)
library(parallel)
library(doParallel)
#library(doMC) # this is for Linux
#registerDoMC(12) # register the parallel backend
cl <- makeCluster(4)
registerDoParallel(cl) # register 4 cores for the parallel backend
OutputList <- foreach(i = 1:length(filelist),
                      .combine = 'c', .packages = c("data.table")) %dopar% (readFiles)
#registerDoSEQ() # very important to close out the parallel backend
aa <- OutputList
stopCluster(cl)
print(Sys.time() - strt)
write.table(aa, file = "D:/ads.txt", sep = '\t')
Everything goes smoothly, but when I check OutputList, all I see is function(x). I want to write ort_my for each file in filelist. Here is what I see:
[[70]]
function (x)
{
data <- read.table("filelist", skip = grep("# Begin: Data Text",
readLines(filelist)), na.strings = c("NA", "-", "?"),
colClasses = "numeric")
my <- as.matrix(data[1:57600, 2])
mesh <- array(my, dim = c(120, 60, 8))
Ms <- 1350 * 10^3
asd2 = (mesh[70:75, 24:36, 2])/Ms
ort_my <- mean(asd2)
return(ort_my)
}
<environment: 0x00000000151aef20>
How can I do that? Best regards.
Now I have used the doSNOW package to do the same thing:
library(foreach)
library(doSNOW)
getDoParWorkers()
getDoParName()
registerDoSNOW(makeCluster(8, type = "SOCK"))
getDoParWorkers()
getDoParName()
strt <- Sys.time()
data1 <- list() # creates a list
filelist <- dir(pattern = "*.omf") # creates the list of all the .omf files in the directory
i = 1:length(filelist)
readFiles <- function(m){
  for (k in 1:length(filelist))
    data[[k]] <- read.csv(filelist[k], sep = "", as.is = TRUE, comment.char = "", skip = 37) # skip = 37 skips the 37-line header of the .omf files
  my <- as.matrix(data[[k]][1:57600, 2])
  mesh <- array(my, dim = c(120, 60, 8))
  Ms <- 1350*10^3 # A/m
  asd2 <- (mesh[70:75, 24:36, 2])/Ms # in A/m
  ort_my <- mean(asd2)
  return(ort_my)
}
out <- foreach(m = 1:i, .combine = rbind, .verbose = T) %dopar% readFiles(m)
print(Sys.time() - strt)
I get the following error messages:
Error in readFiles(m) :
task 1 failed - "object of type 'closure' is not subsettable"
In addition: Warning message:
In 1:i : numerical expression has 70 elements: only the first used
As ?"%dopar%" states, in obj %dopar% ex, ex is an R expression to evaluate. If your free variable in foreach is i, you should use readFiles(i). Currently, you're in fact returning a function object.
BTW, you have some mess in the code. For example, I think that readFiles is independent of x (even if it has x as a formal argument)... Shouldn't it be readLines(filelist[[x]])?
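For what it's worth, here is a minimal corrected sketch of the first attempt, assuming readFiles() is rewritten to take a file name; untested without the .omf files:
library(parallel)
library(doParallel)
library(foreach)
readFiles <- function(filename){
  # skip the 37-line header as in the question, then average the selected sub-array
  data <- read.csv(filename, sep = "", as.is = TRUE, comment.char = "", skip = 37)
  my <- as.matrix(data[1:57600, 2])
  mesh <- array(my, dim = c(120, 60, 8))
  Ms <- 1350*10^3 # A/m
  mean(mesh[70:75, 24:36, 2]/Ms)
}
cl <- makeCluster(4)
registerDoParallel(cl)
filelist <- dir(pattern = "\\.omf$")
OutputList <- foreach(i = seq_along(filelist), .combine = 'c') %dopar% readFiles(filelist[i])
stopCluster(cl)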
