RHadoop MapReduce for multiple input files in R

I'm building a MapReduce program in R that extracts the relevant features from a dataset using a genetic algorithm. I need to pass many files as input to my MapReduce job. The code below is my MapReduce program, but it only works for one input file (data.csv).
library(caret)
library(dplyr)
library(rmr2)
Sys.setenv(HADOOP_CMD="/home/rania/hadoop-2.7.3/bin/hadoop")
Sys.getenv("HADOOP_CMD")
Sys.setenv(HADOOP_STREAMING="/home/rania/hadoop-streaming-2.7.3.jar")
library(rhdfs)
hdfs.init()
rmr.options(backend = "hadoop")
hdfs.mkdir("/user/rania/genetic")
hdfs.mkdir("/user/rania/genetic/data")
I put my files in one folder in HDFS:
hadoop fs -copyFromLocal /home/rania/Downloads/matrices/*.csv /user/rania/genetic/data/
This is the map function
mon.map <- function(., data) {
  data <- read.csv("/home/rania/Downloads/dataset.csv", header = TRUE, sep = ";")
  y <- c(1,0,1,0,1,1,1,1,0,0,1,0,1)
  ga_ctrl <- gafsControl(functions = rfGA,  # assess fitness with random forests
                         method = "cv")     # 10-fold cross-validation
  set.seed(10)
  lev <- c("1","0")
  rf_ga3 <- gafs(x = data, y = y,
                 iters = 10,     # 10 generations of the algorithm
                 popSize = 4,    # population size for each generation
                 levels = lev,
                 gafsControl = ga_ctrl)
  keyval(rf_ga3$ga$final, data[names(data) %in% rf_ga3$ga$final])
}
This is the reduce function
mon.reduce <- function(k, v) {
  keyval(k, v)
}
Now I apply the MapReduce job:
hdfs.root <- 'genetic'
hdfs.data <- file.path(hdfs.root, 'data')
hdfs.out  <- file.path(hdfs.root, 'out')
csv.format <- make.output.format("csv")
genetic <- function(input, output) {
  mapreduce(input = input, output = output,
            input.format = "csv", output.format = csv.format,
            map = mon.map, reduce = mon.reduce)
}
out <- genetic(hdfs.data, hdfs.out)
Then we print the result from HDFS:
results <- from.dfs(out, format="csv")
print(results)
OR
hdfs.cat("/genetic/out/part-00000")
I tried to change the map function to make it work for many files, but it failed:
mon.map <- function(., data) {
  data <- list.files(path = "/home/rania/Downloads/matrices/",
                     full.names = TRUE, pattern = "\\.csv") %>%
    lapply(read.csv, header = TRUE, sep = ",")
  y <- c(1,0,1,0,1,1,1,1,0,0,1,0,1)
  for (i in 1:4) {
    ga_ctrl <- gafsControl(functions = rfGA,  # assess fitness with random forests
                           method = "cv")     # 10-fold cross-validation
    set.seed(10)
    lev <- c("1","0")
    rf_ga3 <- gafs(x = data[[i]], y = y,
                   iters = 10,     # 10 generations of the algorithm
                   popSize = 4,    # population size for each generation
                   levels = lev,
                   gafsControl = ga_ctrl)
  }
  keyval(rf_ga3$ga$final, do.call(cbind, Map(`[`, data, c(rf_ga3$ga$final))))
}
What can I change in the previous map function to make it work for many input files? Thanks.
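For reference, one possible direction as a rough, untested sketch (not a verified solution): run gafs() once per file and emit one key/value pair per file, instead of letting the loop overwrite rf_ga3, then concatenate the pairs with rmr2's c.keyval(). The local folder path and the y vector below are simply the ones from the question.
mon.map <- function(., data) {
  # rough sketch only: one keyval pair per input file, concatenated at the end
  files <- list.files(path = "/home/rania/Downloads/matrices/",
                      full.names = TRUE, pattern = "\\.csv")
  datasets <- lapply(files, read.csv, header = TRUE, sep = ",")
  y   <- c(1,0,1,0,1,1,1,1,0,0,1,0,1)
  lev <- c("1", "0")
  ga_ctrl <- gafsControl(functions = rfGA, method = "cv")
  pairs <- lapply(datasets, function(d) {
    set.seed(10)
    rf_ga <- gafs(x = d, y = y, iters = 10, popSize = 4,
                  levels = lev, gafsControl = ga_ctrl)
    # key: selected feature names, value: the selected columns of this file
    keyval(rf_ga$ga$final, d[names(d) %in% rf_ga$ga$final])
  })
  # c.keyval() concatenates a list of keyval objects into one set of pairs
  do.call(c.keyval, pairs)
}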

Related

Using "lapply" in R to create multiple raster files from folder with lidar data

How can I read all files in a folder, run a script on each, and create separate outputs that keep the original file names? I have a folder with .las files and I need to create corresponding .asc files from them. My script is below:
library(lidR)
# Path to data
LASfile <- "path/1234.las"
# Sorting out points in point cloud data, keeping vegetation and ground point classes.
las <- readLAS(LASfile, filter = "-keep_class 1 2") # Keep high vegetation and ground point classes
# Normalizing ground points to 0 elevation (IDW interpolation), instead of meters above sea level.
dtm <- grid_terrain(las, algorithm = knnidw(k = 8, p = 2))
las_normalized <- normalize_height(las, dtm)
# Create a filter to remove points above the 95th percentile of height
lasfilternoise <- function(las, sensitivity) {
  p95 <- grid_metrics(las, ~quantile(Z, probs = 0.95), 10)
  las <- merge_spatial(las, p95, "p95")
  las <- filter_poi(las, Z < p95 * sensitivity)
  las$p95 <- NULL
  return(las)
}
# Generating a pit-free canopy height model without null values (Khosravipour et al., 2014)
las_denoised <- lasfilternoise(las_normalized, sensitivity = 1.2)
chm <- grid_canopy(las_denoised, 0.32, pitfree(c(0, 2, 5, 10, 15), c(3, 1.5), subcircle = 0.2))
# Applying a median filter with a 3x3 moving window to smooth the image and remove noise
ker <- matrix(1, 3, 3)
chms <- raster::focal(chm, w = ker, fun = median)
plot(chms)
library(raster)
# Writing the output file
writeRaster(chms, filename = "path/1234.asc", format = "ascii", overwrite = TRUE) # change to the relevant path for each run
citation("lidR")
I tried using lapply, but I don't know how to use it in the right way.
It must be something like this to read all the files in the folder: list.files("path", pattern = "*.las", full.names = TRUE)
and something like this to write the output files: lapply(r, writeRaster, filename = paste0(f, ".asc"), format = "ascii")
But I cannot get it right.
An example of my LAZ to LAS+Index conversion:
convertLAZ <- function(lazfile, outdir = "") {
  if (!dir.exists({{outdir}})) { dir.create({{outdir}}, recursive = TRUE) }
  print(lazfile)
  las <- lidR::readLAS(files = {{lazfile}}, filter = "-keep_class 2 9")
  .file <- stringi::stri_replace_all_regex({{lazfile}}, "^.*/", "")
  lidR::writeLAS(las, file = paste0({{outdir}}, "/", stringi::stri_replace_all_fixed(.file, "laz", "las")), index = TRUE)
}
f <- list.files("data/laz", pattern = "*.laz", full.names = TRUE)
lapply(f, convertLAZ, outdir = "data/las22")
You can expand it to rasterization, normalization, etc. and to saving as .asc, but I would encourage you to have a look at https://r-lidar.github.io/lidRbook/engine.html. In short: process your LAZ/LAS files as a LAScatalog, then tile the resulting raster and save it to .asc.
And here is an example of how to use parallel processing (3+1 processes in the example below). Please note that it can be memory hungry, so be careful with the number of workers and processing parameters like opt_chunk_buffer.
library(future)
options(parallelly.availableCores.methods = "mc.cores")
options(mc.cores = 3)
plan(multisession)
parallelly::availableWorkers()
library(lidR)
myPath <- "data/las"
ctg <- readLAScatalog(myPath)
crs(ctg) <- "EPSG:2180"
ctg@output_options$drivers$SpatRaster$param$overwrite <- TRUE
opt_output_files(ctg) <- "data/dtm2/barycz__{XLEFT}_{YBOTTOM}"
opt_chunk_size(ctg) <- 500
opt_chunk_buffer(ctg) <- 600
opt_filter(ctg) <- "-keep_class 2 9"
summary(ctg)
vr <- rasterize_terrain(ctg, 0.25, tin())
plot(vr)
Solved it now
.libPaths( c( "C:/Users/Public/R/win-library/4.2" , .libPaths() ) )
library(lidR)
createASCI <- function(lasfile, outdir = "") {
  if (!dir.exists({{outdir}})) { dir.create({{outdir}}, recursive = TRUE) }
  print(lasfile)
  las <- lidR::readLAS(files = {{lasfile}}, filter = "-keep_class 1 2 3 4 5")
  .file <- stringi::stri_replace_all_regex({{lasfile}}, "^.*/", "")
  # Normalizing ground points to 0 elevation (IDW interpolation), instead of meters above sea level.
  dtm <- grid_terrain(las, algorithm = knnidw(k = 8, p = 2))
  las_normalized <- normalize_height(las, dtm)
  # Create a filter to remove points above the 95th percentile of height
  lasfilternoise <- function(las, sensitivity) {
    p95 <- grid_metrics(las, ~quantile(Z, probs = 0.95), 10)
    las <- merge_spatial(las, p95, "p95")
    las <- filter_poi(las, Z < p95 * sensitivity)
    las$p95 <- NULL
    return(las)
  }
  # Generating a pit-free canopy height model without null values (Khosravipour et al., 2014)
  las_denoised <- lasfilternoise(las_normalized, sensitivity = 1.2)
  chm <- grid_canopy(las_denoised, 0.32, pitfree(c(0, 2, 5, 10, 15), c(3, 1.5), subcircle = 0.2))
  # Applying a median filter with a 3x3 moving window to smooth the image and remove noise
  ker <- matrix(1, 3, 3)
  chms <- raster::focal(chm, w = ker, fun = median)
  writeRaster(chms, file = paste0({{outdir}}, "/", stringi::stri_replace_all_fixed(.file, "las", "asc")), index = TRUE)
}
f <- list.files("C:/Lasdata", pattern = "*.las", full.names = TRUE)
lapply(f, createASCI, outdir = "C:/Lasdata/nytt")

Get multiple values out of parallel foreach loop

I've tried searching but haven't located anything that's gotten me all the way.
I'm running an occupancy prediction model on a stack of three rasters. Due to the large amount of processing that needs to happen I'm using a parallel foreach loop.
I need to retrieve three variables from the results of the loop: test, na, and pred. I need those three values to fill in the new raster values and maintain the same extent, unless someone knows a way to fill in the gaps created by NA values during processing?
Below is the code I've been trying to use based on posts I've found.
I also tried nesting foreach loops, but I'm not sure I understand how those work, or if that would achieve my ends.
library(parallel)
library(doSNOW)
multiResultClass <- function(test = NULL, tmp = NULL, na = NULL, pred = NULL) {
  results <- list(
    test = test,
    tmp = tmp,
    na = na,
    pred = pred
  )
  class(results) <- append(class(results), "multiResultClass")
  return(results)
}
nc<- detectCores()-1
cl<- makeCluster(nc)
registerDoSNOW(cl)
predicts <- foreach(i = 1:nrow(pm), .multicombine = T, .maxcombine = 1000,
                    .packages = c("unmarked", "raster"), .verbose = T) %dopar% {
  results <- multiResultClass()
  test <- cellFromRow(pm, i)
  tmp <- data.frame(pm[test])
  na <- any(is.na(tmp[i, ]))
  if (length(which(na) != nrow(tmp))) {
    pred <- predict(fmBest, "state", tmp)
  }
  results$test <- test
  results$tmp <- tmp
  results$na <- na
  results$pred <- pred
  return(results)
}
foreach(i = 1:nrow(pm)) %do% {
  test <- predicts[[i]]$test
  na <- predicts[[i]]$na
  pred <- predicts[[i]]$pred
}
stopCluster(cl)
I have a working foreach loop that gets me the pred values, but without test and na I haven't found a way to properly fill in the raster template the data needs to go into. That foreach loop is below:
library(parallel)
library(doSNOW)
ns<- detectCores()-1
cl<- makeCluster(ns);cl
registerDoSNOW(cl)
predictions <-
  foreach(i = 1:nrow(pm), .multicombine = T, .maxcombine = 5000,
          .packages = c("unmarked", "raster"), .verbose = T) %dopar% {
    test <- cellFromRow(pm, i)
    tmp <- data.frame(pm[test])
    na <- any(is.na(tmp[i, ]))
    if (length(which(na) != nrow(tmp))) {
      predict(fmBest, "state", tmp)
    }
  }
stopCluster(cl)
I finally found a combine function I could make work. Below is the code I used that returns values for test, na, and pred all in a large list.
library(doSNOW)
nc<- detectCores()-1
cl<- makeCluster(nc);cl
registerDoSNOW(cl)
comb <- function(...) {
  mapply('rbind', ..., SIMPLIFY = F)
}
predictions <- foreach(i = 1:nrow(pm), .combine = 'comb', .multicombine = T,
                       .maxcombine = 200, .packages = c("unmarked", "raster"),
                       .verbose = T, .inorder = F) %dopar% {
  # get cell number values from the raster stack
  test <- cellFromRow(pm, i)
  # make into a data.frame for prediction
  tmp <- data.frame(pm[test])
  # test which are NA
  na <- any(is.na(tmp[i, ]))
  # avoid NA values entering the predict function
  if (length(which(na)) != nrow(tmp)) {
    # predict the new data
    pred <- predict(fmBest, "state", tmp)
  }
  list(test, na, pred)
}
stopCluster(cl)
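If the end goal is to push those values back into a raster with the same extent as pm, one possible pattern is sketched below. It assumes (and this is only an assumption, since it depends on .inorder and on every iteration actually producing a pred) that the collected cell numbers in test line up row for row with the rows of pred, and that predict() on the unmarked model returns a Predicted column.
library(raster)
out <- raster(pm[[1]])                    # empty RasterLayer with pm's extent and resolution (no values copied)
cells <- as.vector(t(predictions[[1]]))   # cell numbers gathered from `test`, iteration by iteration
vals  <- predictions[[3]]$Predicted       # predicted occupancy values
out[cells] <- vals                         # cells whose predictors were all NA simply stay NA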

Scoping issue when using doParallel

I am trying to estimate multiple nonparametric models using the doParallel package. My problem though seems to be related to the np package.
Take a look at this reproducible example:
library(np)
library(doParallel)
df <- data.frame(Y = runif(100, 0, 10), X = rnorm(100))
models <- list(as.formula(Y ~ X))
npestimate <- function(m, data) {
  LCLS <- npregbw(m, data = data, regtype = "lc", bwmethod = "cv.ls")
  LLLS <- npregbw(m, data = data, regtype = "ll", bwmethod = "cv.ls")
  # sigt <- npsigtest(LCLS, boot.method = "wild", boot.type = "I")
  return(list(LCLS = LCLS, LLLS = LLLS))
}
cl <- makeCluster(length(models))
registerDoParallel(cl)
results <- foreach(m = models, .packages = "np", .verbose = T) %dopar%
npestimate(m, data = df)
stopCluster(cl)
As you can see, I created a function called npestimate() in order to compute different things for each model. I commented out the line where I want to run significance tests using npsigtest. Usually, npsigtest finds the data that was used by looking in the environment where npregbw was called.
But that does not work here. I am not sure why, but npsigtest just cannot find the data that was used in the two lines of code right above it.
The data is automatically exported to the nodes, so using .export in foreach is redundant.
Any suggestions on how to make this work?
npsigtest copies pretty much the approach used within lm and functions for lm objects. It thus has the same potential scoping pitfalls. The issue is the environment associated with the formula:
environment(models[[1]])
#<environment: R_GlobalEnv>
It's easy to fix:
npestimate <- function(m, data) {
  environment(m) <- environment()
  LCLS <- npregbw(m, data = data, regtype = "lc", bwmethod = "cv.ls")
  LLLS <- npregbw(m, data = data, regtype = "ll", bwmethod = "cv.ls")
  sigt <- npsigtest(LCLS, boot.method = "wild", boot.type = "I")
  return(list(LCLS = LCLS, LLLS = LLLS))
}
I actually often prefer eval(bquote()) constructs because of such issues.
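For illustration, a sketch of what that could look like here (this is my reading of the idea, not code tested against np): splice the formula and data objects into the call with bquote() before evaluating it, so the call stored in the fitted object no longer depends on where the formula was created.
npestimate <- function(m, data) {
  # .(m) and .(data) insert the actual objects into the recorded call,
  # so downstream functions that re-evaluate or inspect the call can find them
  LCLS <- eval(bquote(npregbw(.(m), data = .(data), regtype = "lc", bwmethod = "cv.ls")))
  LLLS <- eval(bquote(npregbw(.(m), data = .(data), regtype = "ll", bwmethod = "cv.ls")))
  sigt <- npsigtest(LCLS, boot.method = "wild", boot.type = "I")
  return(list(LCLS = LCLS, LLLS = LLLS, sigt = sigt))
}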

Import 2nd and 3rd columns from multiple .csv files into R code and write the output in a single excel sheet

I need to import the 2nd and 3rd columns from a folder of multiple .csv files, perform an operation on these data (the operation will be called from another, separate R script), and write the results to a single Excel sheet (either .xlsx or .csv). I have made many attempts with various commands, but it still doesn't work. I am attaching the code below:
library(doParallel)
cl <- makeCluster(2)
registerDoParallel(cl)
library(foreach)
# load SSR functions
source("H:\\Users\\Deep\\Desktop\\SHHS\\SSR_functions.R")
# CCM analysis for 2sp model time
# load data
x <- read.table("H:\\Users\\Deep\\Desktop\\SHHS\\rawRR2.txt",fill=TRUE)
y <- read.table("H:\\Users\\Deep\\Desktop\\SHHS\\rawQT2.txt",fill=TRUE)
nc <- NCOL(x)
foreach(i = 97:nc) %dopar% {
  temp_xv <- x[, i]
  temp_yv <- y[, i]
  xv <- temp_xv[is.na(temp_xv) == FALSE]
  yv <- temp_yv[is.na(temp_yv) == FALSE]
  F <- length(xv)
  lib <- c(1, F)
  pred <- c(1, F)
  lib_sizes <- c(F/100, F/50, F/25, F/12.5, F/6.25, F/3.125, F)
  E <- 3
  x_xmap_y <- ccm(xv, yv, lib_sizes, lib, pred, E)
  y_xmap_x <- ccm(yv, xv, lib_sizes, lib, pred, E)
  # compute mean rhos at each L
  x_xmap_y$L <- as.factor(x_xmap_y$L)
  x_xmap_y_means <- do.call(rbind, lapply(split(x_xmap_y, x_xmap_y$L), function(x) { max(0, mean(x$rho)) }))
  y_xmap_x$L <- as.factor(y_xmap_x$L)
  y_xmap_x_means <- do.call(rbind, lapply(split(y_xmap_x, y_xmap_x$L), function(x) { max(0, mean(x$rho)) }))
  output <- cbind(x_xmap_y_means, y_xmap_x_means)
  write.table(output, file = paste("H:\\Users\\Deep\\Desktop\\SHHS\\output\\m3_", i, ".csv", sep = ""), col.names = FALSE)
}
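For the multi-file part of the question, here is a minimal sketch of one common pattern (the folder path, the my_operation() placeholder, and the output file name are assumptions, not from the original code): read only the 2nd and 3rd columns of every .csv in a folder, apply the operation, and write everything to a single .csv.
files <- list.files("H:\\Users\\Deep\\Desktop\\SHHS\\csvdata", pattern = "\\.csv$", full.names = TRUE)
# placeholder for the operation sourced from the separate R script
my_operation <- function(d) d
results <- lapply(files, function(f) {
  d <- read.csv(f)[, 2:3]            # keep only the 2nd and 3rd columns
  out <- my_operation(d)             # apply the external operation
  cbind(file = basename(f), out)     # record which file each row came from
})
write.csv(do.call(rbind, results),
          "H:\\Users\\Deep\\Desktop\\SHHS\\output\\combined_output.csv",
          row.names = FALSE)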

How to use a distinct data set per chain in Stan?

I have a data set with many missing observations and I used the Amelia package to create imputed data sets. I'd like to know if it's possible to run the same model in parallel with a different data set per chain and combine the results into a single Stan object.
# Load packages
library(Amelia)
library(rstan)
# Load built-in data
data(freetrade)
# Create 2 imputed data sets (polity is an ordinal variable)
df.imp <- amelia(freetrade, m = 2, ords = "polity")
# Check the first data set
head(df.imp$imputations[[1]])
# Run the model in Stan
code <- '
data {
  int<lower=0> N;
  vector[N] tariff;
  vector[N] polity;
}
parameters {
  real b0;
  real b1;
  real<lower=0> sigma;
}
model {
  b0 ~ normal(0, 100);
  b1 ~ normal(0, 100);
  tariff ~ normal(b0 + b1 * polity, sigma);
}
'
# Create a list from the first and second data sets
df1 <- list(N = nrow(df.imp$imputations[[1]]),
            tariff = df.imp$imputations[[1]]$tariff,
            polity = df.imp$imputations[[1]]$polity)
df2 <- list(N = nrow(df.imp$imputations[[2]]),
            tariff = df.imp$imputations[[2]]$tariff,
            polity = df.imp$imputations[[2]]$polity)
# Run the model
m1 <- stan(model_code = code, data = df1, chains = 1, iter = 1000)
My question is how to run the last line of code on both data sets at the same time, running 2 chains and combining the output with the same stan() function. Any suggestions?
You can run the models separately, and then combine them using sflist2stanfit().
E.g.
seed <- 12345
s1 <- stan_model(model_code = code)  # compile the model
m1 <- sampling(object = s1, data = df1, chains = 1,
               seed = seed, chain_id = 1, iter = 1000)
m2 <- sampling(object = s1, data = df2, chains = 1,
               seed = seed, chain_id = 2, iter = 1000)
f12 <- sflist2stanfit(list(m1, m2))
You will have to use one of the packages for parallel computing in R.
According to this post, it should then work: Will RStan run on a supercomputer?
Here is an example that may work (I use this code with JAGS and will test it with Stan later):
library(doParallel)
cl <- makeCluster(2)  # for 2 processes
registerDoParallel(cl)
library(rstan)
# make a function to combine the results
stan.combine <- function(...) { return(sflist2stanfit(list(...))) }
mydatalist <- list(df1, df2)
myseeds <- c(123, 456)
# now start the chains
nchains <- 2
m_both <- foreach(i = 1:nchains,
                  .packages = c("rstan"),
                  .combine = "stan.combine") %dopar% {
  result <- stan(model_code = code,
                 data = mydatalist[[i]],  # use the right dataset
                 seed = myseeds[i],       # use different seeds
                 chains = 1, iter = 1000)
  return(result)
}
Let me know whether it works with Stan. As I said, I haven't tested it yet.
