Using R on a Hadoop-installed cluster

I am an R user. I know very little about Linux commands, PuTTY, or Hadoop/Hive, so please correct me if I am wrong.
I am now working with a team that has a running Ubuntu system on a cluster. I can use PuTTY to access this Ubuntu system and browse the data files with:
user$hadoop fs -ls /datafolder/
or by using hive:
user$hive
hive>use datafolder;
hive>show tables;
On the other hand, the team I am working with knows very little about R, so they want me to do the R part. I have installed R on the cluster, along with rJava, RHive, and other packages. (I am not sure I did this correctly, but R seems to be running OK.)
Now I can do some testing. I can run the following code in R on the cluster:
user$R
>install.packages(c('Rcpp', 'RJSONIO', 'bitops', 'digest', 'functional', 'stringr', 'plyr', 'reshape2','caTools'))
>Sys.setenv(HADOOP_CMD="/opt/cloudera/bin/hadoop")
>Sys.setenv(HADOOP_HOME="/opt/cloudera/lib/hadoop")
>Sys.setenv(HADOOP_STREAMING="/opt/cloudera/lib/hadoop-mapreduce/hadoop-streaming.jar")
>library(rmr2)
>library(rhdfs)
>hdfs.init()
Testing:
>ints = to.dfs(1:10)
>calc = mapreduce(input = ints, map = function(k,v) cbind(v, v/2, 2*v))
>test <- from.dfs(calc)
>test
I can successfully load "test" back using from.dfs. It seems I can save a dummy dataset to Hadoop and get it back successfully (correct?)
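One way to sanity-check the round trip (a minimal sketch; a single local map task usually preserves row order, but that is not guaranteed in general):
str(test)                              # keyval list with $key (NULL here) and $val
all.equal(sort(test$val[, 1]), 1:10)   # first column should recover the original 1:10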
Now, my question is: how do I get R to import the datasets that I can see via
user$hadoop fs -ls /datafolder/
or
hive> use datafolder;

Here is an example of a wordcount job that loads the result back into R:
Sys.setenv(HADOOP_CMD="/usr/bin/hadoop")
Sys.setenv(HADOOP_STREAMING="/opt/cloudera/parcels/CDH-5.1.0-1.cdh5.1.0.p0.53/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming.jar")
Sys.setenv(JAVA_HOME="/usr/java/jdk1.7.0_55-cloudera")
Sys.setenv(HADOOP_COMMON_LIB_NATIVE_DIR="/opt/cloudera/parcels/CDH-5.1.0-1.cdh5.1.0.p0.53/lib/hadoop/lib/native")
Sys.setenv(HADOOP_OPTS="-Djava.library.path=HADOOP_HOME/lib")
library(rhdfs)
hdfs.init()
library(rmr2)
## space and word delimiter
map <- function(k, lines) {
  words.list <- strsplit(lines, '\\s')
  words <- unlist(words.list)
  return(keyval(words, 1))
}
reduce <- function(word, counts) {
  keyval(word, sum(counts))
}
wordcount <- function(input, output = NULL) {
  mapreduce(input = input, output = output, input.format = "text", map = map, reduce = reduce)
}
## variables
hdfs.root <- '/user/node'
hdfs.data <- file.path(hdfs.root, 'data')
hdfs.out <- file.path(hdfs.root, 'out')
## run mapreduce job
##out <- wordcount(hdfs.data, hdfs.out)
system.time(out <- wordcount(hdfs.data, hdfs.out))
## fetch results from HDFS
results <- from.dfs(out)
results.df <- as.data.frame(results, stringsAsFactors=F)
colnames(results.df) <- c('word', 'count')
##head(results.df)
## sorted output TOP10
head(results.df[order(-results.df$count),],10)
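To pull one of the existing datasets under /datafolder directly into R, one possible sketch (file and table names below are placeholders, and the RHive calls assume the package is configured against your Hive server):
library(rhdfs)
hdfs.init()
library(rmr2)
# Option 1: read a delimited file that already sits in HDFS via rmr2
csv.fmt <- make.input.format("csv", sep = ",")    # use "\t" for tab-delimited data
dat <- from.dfs("/datafolder/somefile.csv", format = csv.fmt)
head(dat$val)
# Option 2: query a Hive table with RHive
# library(RHive)
# rhive.init()
# rhive.connect(host = "localhost")
# df <- rhive.query("SELECT * FROM datafolder.sometable LIMIT 100")
# rhive.close()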

Related

How to fix "C function R_nc4_get_vara_double returned error" in ncdf4 parallel processing in R

I want to download NetCDF (.nc) data through OPeNDAP from remote storage. I use a parallel backend with a foreach %dopar% loop, as follows:
# INPUTS
inputs=commandArgs(trailingOnly = T)
interimpath=as.character(inputs[1])
gcm=as.character(inputs[2])
period=as.character(inputs[3])
var=as.character(inputs[4])
source='MACAV2'
cat('\n\n EXTRACTING DATA FOR',var, gcm, period, '\n\n')
# CHANGING LIBRARY PATHS
.libPaths("/storage/home/htn5098/local_lib/R40") # local library for packages
setwd('/storage/work/h/htn5098/DataAnalysis')
source('./src/Rcodes/CWD_function_package.R') # Calling the function Rscript
# CALLING PACKAGES
library(foreach)
library(doParallel)
library(parallel)
library(filematrix)
# REGISTERING CORES FOR PARALLEL PROCESSING
no_cores <- detectCores()
cl <- makeCluster(no_cores)
registerDoParallel(cl)
invisible(clusterEvalQ(cl,.libPaths("/storage/home/htn5098/local_lib/R40"))) # Really have to import library paths into the workers
invisible(clusterEvalQ(cl, c(library(ncdf4))))
# EXTRACTING DATA FROM THE .NC FILES TO MATRIX FORM
url <- readLines('./data/external/MACAV2_OPENDAP_allvar_allgcm_allperiod.txt')
links <- grep(x = url,pattern = paste0('.*',var,'.*',gcm,'_.*',period), value = T)
start=c(659,93,1) # lon, lat, time
count=c(527,307,-1)
spfile <- read.csv('./data/external/SERC_MACAV2_Elev.csv',header = T)
grids <- sort(unique(spfile$Grid))
clusterExport(cl,list('ncarray2matrix','start','count','grids')) #exporting data into clusters for parallel processing
cat('\nChecking when downloading all grids\n')
# k <- foreach(x = links, .packages = c('ncdf4')) %dopar% {
#   nc <- nc_open(x)
#   nc.var <- ncvar_get(nc, varid = names(nc$var), start = start, count = count)
#   return(nc.var)
#   nc_close(nc)
# }
k <- foreach(x = links, .packages = c('ncdf4'), .errorhandling = 'pass') %dopar% {
  nc <- nc_open(x)
  print(nc)
  nc.var <- ncvar_get(nc, varid = names(nc$var), start = c(659,93,1), count = c(527,307,-1))
  nc_close(nc)
  return(dim(nc.var))
  Sys.sleep(10)
}
# k <- parSapply(cl, links, function(x) {
#   nc <- nc_open(x)
#   nc.var <- ncvar_get(nc, varid = names(nc$var), start = start, count = count)
#   nc_close(nc)
#   return(nc.var)
# })
print(k)
However, I keep getting this error:
<simpleError in ncvar_get_inner(ncid2use, varid2use, nc$var[[li]]$missval, addOffset, scaleFact, start = start, count = count, verbose = verbose, signedbyte = signedbyte, collapse_degen = collapse_degen): C function R_nc4_get_vara_double returned error>
What could be the reason for this problem? Can you recommend a solution for this that is time-efficient (I have to repeat this for about 20 files)?
Thank you.
I had the same error in my code. The problem was not the code itself; it was one of the files that I wanted to read. Something was wrong with it, so R couldn't open it. I identified the file, downloaded it again, and the same code worked perfectly.
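A small sketch of how one might locate the offending file, reusing the links vector from the question (this only tries to open each file serially; the ncvar_get call could be wrapped the same way):
library(ncdf4)
bad <- character(0)
for (x in links) {
  ok <- tryCatch({
    nc <- nc_open(x)
    nc_close(nc)
    TRUE
  }, error = function(e) FALSE)
  if (!ok) bad <- c(bad, x)
}
bad   # URLs/files that could not be opened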
I also encountered the same error. For me, restarting the R session did the trick.

Parallel package for windows 10 in R

I have a dataset that I'm trying to parse in R. The data come from HMDB and the dataset is called Serum Metabolites (an XML file). The XML file contains about 25K metabolite nodes, each of which I want to parse into sub-nodes.
I have a code that parses the XML file to a list object in R.
Since the XML file is quite big and there are about 12 sub-nodes I want for each metabolite, parsing the file takes a long time: about 3 hours per 1,000 metabolites.
I'm trying to use the parallel package but receive an error.
The packages:
library("XML")
library("xml2")
library( "magrittr" ) #for pipe operator %>%
library("pbapply") # to track on progress
library("parallel")
The function:
# The function receives an XML file (its location) and returns a list of nodes
Short_Parser_HMDB <- function(xml.file_location){
start.time<- Sys.time()
# Read as xml file
doc <- read_xml( xml.file_location )
#get metabolite nodes (only first three used in this sample)
met.nodes <- xml_find_all( doc, ".//d1:metabolite" ) [1:1000] # [(i*1000+1):(1000*i+1000)] # [1:3]
#list of data.frame
xpath_child.v <- c( "./d1:accession",
"./d1:name" ,
"./d1:description",
"./d1:synonyms/d1:synonym" ,
"./d1:chemical_formula" ,
"./d1:smiles" ,
"./d1:inchikey" ,
"./d1:biological_properties/d1:pathways/d1:pathway/d1:name" ,
"./d1:diseases/d1:disease/d1:name" ,
"./d1:diseases/d1:disease/d1:references",
"./d1:kegg_id" ,
"./d1:meta_cyc_id"
)
child.names.v <- c( "accession",
"name" ,
"description" ,
"synonyms" ,
"chemical_formula" ,
"smiles" ,
"inchikey" ,
"pathways_names" ,
"diseases_name",
"references",
"kegg_id" ,
"meta_cyc_id"
)
#first, loop over the met.nodes
L.sec_acc <- parLapply(cl, met.nodes, function(x) { # pblapply to track progress, or lapply (slows the function down dramatically); parLapply for parallel
#second, loop over the xpath desired child-nodes
temp <- parLapply(cl, xpath_child.v, function(y) {
xml_find_all(x, y ) %>% xml_text(trim = T) %>% data.frame( value = .)
})
#set their names
names(temp) = child.names.v
return(temp)
})
end.time<- Sys.time()
total.time<- end.time-start.time
print(total.time)
return(L.sec_acc )
}
Now create the environment:
# select the location where the XML file is
location= "D:/path/to/file//HMDB/DataSets/serum_metabolites/serum_metabolites.xml"
cl <- makeCluster(detectCores(), type = "PSOCK")
clusterExport(cl, c("Short_Parser_HMDB", "cl"))
clusterEvalQ(cl, {
  library("parallel")
  library("magrittr")
  library("XML")
  library("xml2")
})
And execute :
Short_outp<-Short_Parser_HMDB(location)
stopCluster(cl)
The error received:
> Short_outp<-Short_Parser_HMDB(location)
Error in checkForRemoteErrors(val) :
one node produced an error: invalid connection
Based on those links, I tried to implement the parallel processing:
Parallel Processing in R
How to call global function from the parLapply function?
Error in R parallel:Error in checkForRemoteErrors(val) : 2 nodes produced errors; first error: cannot open the connection
but couldn't find "invalid connection" mentioned as an error in any of them.
I'm using Windows 10 and the latest R version, 4.0.2 (not sure if that's enough information).
Any hint or idea will be appreciated.
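One likely cause of "invalid connection" in this pattern is that connection-backed and external-pointer objects, such as the cluster object cl and xml2 nodes, cannot be serialized to PSOCK workers, so the nested parLapply inside the worker function fails. A sketch of one alternative structure, reusing location, xpath_child.v and child.names.v from above: each worker re-reads the file and processes its own index range with a plain lapply inside.
library(parallel)
library(xml2)
library(magrittr)
parse_chunk <- function(xml.file_location, idx, xpaths, names.v) {
  doc <- read_xml(xml.file_location)                 # each worker opens its own copy
  nodes <- xml_find_all(doc, ".//d1:metabolite")[idx]
  lapply(nodes, function(x) {
    temp <- lapply(xpaths, function(y) {
      xml_find_all(x, y) %>% xml_text(trim = TRUE) %>% data.frame(value = .)
    })
    names(temp) <- names.v
    temp
  })
}
cl <- makeCluster(detectCores(), type = "PSOCK")
clusterEvalQ(cl, { library(xml2); library(magrittr) })
chunks <- parallel::splitIndices(1000, length(cl))   # one index range per worker
res <- parLapply(cl, chunks, parse_chunk,
                 xml.file_location = location,
                 xpaths = xpath_child.v, names.v = child.names.v)
stopCluster(cl)
L.sec_acc <- do.call(c, res)                         # flatten back to one list of metabolites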

How to apply rma() normalization to a single CEL file?

I have implemented an R script that performs batch correction on a gene expression dataset. To do the batch correction, I first need to normalize the data in each CEL file through Bioconductor's Affy rma() function.
If I run it on the GSE59867 dataset obtained from GEO, everything works.
I define a batch as the data collection date: I put all the CEL files having the same date into a specific folder, and then consider that date/folder as a specific batch.
On the GSE59867 dataset, a batch/folder contains only 1 CEL file. Nonetheless, the rma() function works on it perfectly.
However, if I try to run my script on another dataset (GSE36809), I run into trouble: if I apply the rma() function to a batch/folder containing only one file, I get the following error:
Error in `colnames<-`(`*tmp*`, value = "GSM901376_c23583161.CEL.gz") :
attempt to set 'colnames' on an object with less than two dimensions
Here's my specific R code so you can see what I mean.
You first have to download the file GSM901376_c23583161.CEL.gz:
setwd(".")
options(stringsAsFactors = FALSE)
fileURL <- "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM901nnn/GSM901376/suppl/GSM901376%5Fc23583161%2ECEL%2Egz"
fileDownloadCommand <- paste("wget ", fileURL, " ", sep="")
system(fileDownloadCommand)
Library installation:
source("https://bioconductor.org/biocLite.R")
list.of.packages <- c("easypackages")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
listOfBiocPackages <- c("oligo", "affyio","BiocParallel")
bioCpackagesNotInstalled <- which( !listOfBiocPackages %in% rownames(installed.packages()) )
cat("package missing listOfBiocPackages[", bioCpackagesNotInstalled, "]: ", listOfBiocPackages[bioCpackagesNotInstalled], "\n", sep="")
if( length(bioCpackagesNotInstalled) ) {
biocLite(listOfBiocPackages[bioCpackagesNotInstalled])
}
library("easypackages")
libraries(list.of.packages)
libraries(listOfBiocPackages)
Application of rma()
thisFileDate <- "GSM901376_c23583161.CEL.gz"
thisDateRawData <- read.celfiles(thisFileDate)  # read the single CEL file downloaded above
thisDateNormData <- rma(thisDateRawData)
After the call to rma(), I get the error.
How can I solve this problem?
I also tried to skip this normalization by saving the thisDateRawData object directly, but then I cannot combine this thisDateRawData (an ExpressionFeatureSet) with the outputs of rma() (which are ExpressionSet objects).
(EDIT: I extensively edited the question and added a piece of R code you should be able to run on your PC.)
Hmm, this is a puzzling problem. The oligo::rma() function might be buggy for class GeneFeatureSet with single samples. I got it to work with a single sample by using lower-level functions, but that means I also had to create the ExpressionSet from scratch by specifying the slots:
# source("https://bioconductor.org/biocLite.R")
# biocLite("GEOquery")
# biocLite("pd.hg.u133.plus.2")
# biocLite("pd.hugene.1.0.st.v1")
library(GEOquery)
library(oligo)
# # Instead of using .gz files, I extracted the actual CELs.
# # This is just to illustrate how I read in the files; your usage will differ.
# projectDir <- "" # Path to .tar files here
# setwd(projectDir)
# untar("GSE36809_RAW.tar", exdir = "GSE36809")
# untar("GSE59867_RAW.tar", exdir = "GSE59867")
# setwd("GSE36809"); gse3_cels <- dir()
# sapply(paste(gse3_cels, sep = "/"), gunzip); setwd(projectDir)
# setwd("GSE59867"); gse5_cels <- dir()
# sapply(paste(gse5_cels, sep = "/"), gunzip); setwd(projectDir)
#
# Read in CEL
#
# setwd("GSE36809"); gse3_cels <- dir()
# gse3_efs <- read.celfiles(gse3_cels[1])
# # Assuming you've read in the CEL files as a GeneFeatureSet or
# # ExpressionFeatureSet object (i.e. gse3_efs in this example),
# # you can now fit the RMA and create an ExpressionSet object with it:
exprsData <- basicRMA(exprs(gse3_efs), pnVec = featureNames(gse3_efs))
gse3_expset <- new("ExpressionSet")
slot(gse3_expset, "assayData") <- assayDataNew(exprs = exprsData)
slot(gse3_expset, "phenoData") <- phenoData(gse3_efs)
slot(gse3_expset, "featureData") <- annotatedDataFrameFrom(attr(gse3_expset,
'assayData'), byrow = TRUE)
slot(gse3_expset, "protocolData") <- protocolData(gse3_efs)
slot(gse3_expset, "annotation") <- slot(gse3_efs, "annotation")
Hopefully the above approach will work in your code.
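A couple of quick checks on the hand-built object can confirm the slots line up (this assumes the gse3_* objects from the commented-out steps above exist):
dim(exprs(gse3_expset))    # probes x 1 sample after basicRMA on a single CEL
validObject(gse3_expset)   # should return TRUE if the slots are consistent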

R: Memory Management during xmlEventParse of Huge (>20GB) files

Building on this previous question (see here), I am attempting to read in many large XML files via xmlEventParse while saving node-varying data. Working with this sample XML: https://www.nlm.nih.gov/databases/dtd/medsamp2015.xml.
The code below uses xpathSapply to extract the necessary values and a series of if statements to combine the values in a way that matches the unique value (PMID) to each of the non-unique values (LastName) within a record, for which there may be no LastNames. The goal is to write a series of small CSVs along the way (here, after every 1000 LastNames) to minimize the amount of memory used.
When run on the full-sized data set, the code successfully outputs files in batches; however, something is still being stored in memory that eventually causes a system error once all RAM is used. I've watched the task manager while the code runs and can see R's memory grow as the program progresses. If I stop the program mid-run and then clear the R workspace, including hidden items, the memory still appears to be in use by R. It is not until I shut down R that the memory is freed again.
Run this a few times yourself and you'll see R's memory usage grow even after clearing the workspace.
Please help! This problem appears to be common to others reading in large XML files in this manner (See for example comments in this question).
My code is as follows:
library(XML)
filename <- "~/Desktop/medsamp2015.xml"
tempdat <- data.frame(pmid = as.numeric(),
                      lname = character(),
                      stringsAsFactors = FALSE)
cnt <- 1
branchFunction <- function() {
  func <- function(x, ...) {
    v1 <- xpathSApply(x, path = "//PMID", xmlValue)
    v2 <- xpathSApply(x, path = "//Author/LastName", xmlValue)
    print(cbind(c(rep(v1, length(v2))), v2))
    #below is where I store/write the temp data along the way
    #but even without doing this, memory is used (even after clearing)
    tempdat <<- rbind(tempdat, cbind(c(rep(v1, length(v2))), v2))
    if (nrow(tempdat) > 1000) {
      outname <- paste0("~/Desktop/outfiles", cnt, ".csv")
      write.csv(tempdat, outname, row.names = F)
      tempdat <<- data.frame(pmid = as.numeric(),
                             lname = character(),
                             stringsAsFactors = FALSE)
      cnt <<- cnt + 1
    }
  }
  list(MedlineCitation = func)
}
myfunctions <- branchFunction()
#RUN
xmlEventParse(
  file = filename,
  handlers = NULL,
  branches = myfunctions
)
Here is an example: we have a launch script, invoke.sh, that calls an R script and passes the URL and filename as parameters. In this case, I had previously downloaded the test file medsamp2015.xml and put it in the ./data directory.
My sense would be to create a loop in the invoke.sh script and iterate through the list of target file names. For each file, you invoke an R instance, download it, process the file, and move on to the next.
Caveat: I didn't check or change your function against any other download files or formats. I would turn off the printing of the output by removing the print() wrapper on line 62.
print( cbind(c(rep(v1, length(v2))), v2))
See: runtime.txt for print out.
The output .csv files are placed in the ./data directory.
Note: This is a derivative of a previous answer provided by me on this subject:
R memory not released in Windows. I hope it helps by way of example.
Launch Script
#!/usr/local/bin/bash -x

R --no-save -q --slave < ./47162861.R --args "https://www.nlm.nih.gov/databases/dtd" "medsamp2015.xml"
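If you would rather drive the loop from R than from bash, a hypothetical equivalent (assuming Rscript is on the PATH and that files is replaced with your real list of target names) could spawn one fresh R process per file, so each child returns its memory to the OS when it exits:
files <- c("medsamp2015.xml")    # placeholder list of target files
for (f in files) {
  system2("Rscript", args = c("47162861.R",
                              "https://www.nlm.nih.gov/databases/dtd", f))
}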
R File - 47162861.R
# Set working directory
projectDir <- "~/dev/stackoverflow/47162861"
setwd(projectDir)
# -----------------------------------------------------------------------------
# Load required Packages...
requiredPackages <- c("XML")
ipak <- function(pkg) {
  new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
  if (length(new.pkg))
    install.packages(new.pkg, dependencies = TRUE)
  sapply(pkg, require, character.only = TRUE)
}
ipak(requiredPackages)
# -----------------------------------------------------------------------------
# Load required Files
# trailingOnly=TRUE means that only your arguments are returned
args <- commandArgs(trailingOnly = TRUE)
if ( length(args) != 0 ) {
  dataDir  <- file.path(projectDir, "data")
  fileUrl  <- args[1]
  fileName <- args[2]
} else {
  dataDir  <- file.path(projectDir, "data")
  fileUrl  <- "https://www.nlm.nih.gov/databases/dtd"
  fileName <- "medsamp2015.xml"
}
# -----------------------------------------------------------------------------
# Download file
# Does the directory exist? If it doesn't, create it
if (!file.exists(dataDir)) {
  dir.create(dataDir)
}
# Now we check if we have already downloaded the data; if not, we download it
if (!file.exists(file.path(dataDir, fileName))) {
  download.file(fileUrl, file.path(dataDir, fileName), method = "wget")
}
# -----------------------------------------------------------------------------
# Now we extract the data
tempdat <- data.frame(pmid = as.numeric(), lname = character(),
                      stringsAsFactors = FALSE)
cnt <- 1
branchFunction <- function() {
  func <- function(x, ...) {
    v1 <- xpathSApply(x, path = "//PMID", xmlValue)
    v2 <- xpathSApply(x, path = "//Author/LastName", xmlValue)
    print(cbind(c(rep(v1, length(v2))), v2))
    # below is where I store/write the temp data along the way
    # but even without doing this, memory is used (even after clearing)
    tempdat <<- rbind(tempdat, cbind(c(rep(v1, length(v2))), v2))
    if (nrow(tempdat) > 1000) {
      outname <- file.path(dataDir, paste0(cnt, ".csv"))  # Create file name
      write.csv(tempdat, outname, row.names = F)          # Write file to created directory
      tempdat <<- data.frame(pmid = as.numeric(), lname = character(),
                             stringsAsFactors = FALSE)
      cnt <<- cnt + 1
    }
  }
  list(MedlineCitation = func)
}
myfunctions <- branchFunction()
# -----------------------------------------------------------------------------
# RUN
xmlEventParse(file = file.path(dataDir, fileName),
              handlers = NULL,
              branches = myfunctions)
Test File and output
~/dev/stackoverflow/47162861/data/medsamp2015.xml
$ ll
total 2128
drwxr-xr-x# 7 hidden staff 238B Nov 10 11:05 .
drwxr-xr-x# 9 hidden staff 306B Nov 10 11:11 ..
-rw-r--r--# 1 hidden staff 32K Nov 10 11:12 1.csv
-rw-r--r--# 1 hidden staff 20K Nov 10 11:12 2.csv
-rw-r--r--# 1 hidden staff 23K Nov 10 11:12 3.csv
-rw-r--r--# 1 hidden staff 37K Nov 10 11:12 4.csv
-rw-r--r--# 1 hidden staff 942K Nov 10 11:05 medsamp2015.xml
Runtime Output
> ./invoke.sh > runtime.txt
+ R --no-save -q --slave --args https://www.nlm.nih.gov/databases/dtd medsamp2015.xml
Loading required package: XML
File: runtime.txt

Parallel processing in R doParallel foreach save data

Progress has been made on getting the parallel processing part working, but saving the vector with the fetch distances is not working properly. The error I get is:
df_Test_Fetch <- data.frame(x_lake_length)
Error in data.frame(x_lake_length) : object 'x_lake_length' not found
write.table(df_Test_Fetch,file="C:/tempTest_Fetch.csv",row.names=TRUE,col.names=TRUE, sep=",")
Error in is.data.frame(x) : object 'df_Test_Fetch' not found
I have tried altering the code below so that the foreach step is output to x_lake_length, but that did not output the vector as I hoped. How can I get the actual results saved to a CSV file? I am running a Windows 8 computer with R x64 3.3.0.
Thank you in advance,
Jen
Here is the full code.
# make sure there is no preexisting data
rm(x_lake_length)
# Libraries ---------------------------------------------------------------
if (!require("pacman")) install.packages("pacman")
pacman::p_load(lakemorpho,rgdal,maptools,sp,doParallel,foreach,
doParallel)
# HPC ---------------------------------------------------------------------
cores_2_use <- detectCores() - 2
cl <- makeCluster(cores_2_use, useXDR = F)
clusterSetRNGStream(cl, 9956)
registerDoParallel(cl, cores_2_use)
# Data --------------------------------------------------------------------
ogrDrivers()
dsn <- system.file("vectors", package = "rgdal")[1]
# the line below is commented out, but when I run the script on my own data it is what I use instead of the one above,
# making the name changes as needed
# dsn<-setwd("J:\\Elodea\\ByHUC6\\")
ogrListLayers(dsn)
ogrInfo(dsn=dsn, layer="trin_inca_pl03")
owd <- getwd()
setwd(dsn)
ogrInfo(dsn="trin_inca_pl03.shp", layer="trin_inca_pl03")
setwd(owd)
x <- readOGR(dsn=dsn, layer="trin_inca_pl03")
summary(x)
# Analysis ----------------------------------------------------------------
myfun <- function(x, i) {
  tmp <- lakeMorphoClass(x[i, ], NULL, NULL, NULL)
  x_lake_length <- vector("numeric", length = nrow(x))
  x_lake_length[i] <- lakeMaxLength(tmp, 200)
  print(i)
  Sys.sleep(0.1)
}
foreach(i = 1:nrow(x), .combine = cbind, .packages = c("lakemorpho", "rgdal")) %dopar% (
  myfun(x, i)
)
options(digits=10)
df_Test_Fetch <- data.frame(x_lake_length)
write.table(df_Test_Fetch,file="C:/temp/Test_Fetch.csv",row.names=TRUE,col.names=TRUE, sep=",")
print(proc.time())
I think this is what you want, though without understanding the subject matter I can't be 100% sure.
What I did was add a return() to your parallelized function and assign the value of the returned object to x_lake_length when you call foreach. But I'm only guessing that that's what you were trying to do, so please correct me if I'm wrong.
# make sure there is no preexisting data
rm(x_lake_length)
# Libraries ---------------------------------------------------------------
if (!require("pacman")) install.packages("pacman")
pacman::p_load(lakemorpho,rgdal,maptools,sp,doParallel,foreach,
doParallel)
# HPC ---------------------------------------------------------------------
cores_2_use <- detectCores() - 2
cl <- makeCluster(cores_2_use, useXDR = F)
clusterSetRNGStream(cl, 9956)
registerDoParallel(cl, cores_2_use)
# Data --------------------------------------------------------------------
ogrDrivers()
dsn <- system.file("vectors", package = "rgdal")[1]
# the line below is commented out, but when I run the script on my own data it is what I use instead of the one above,
# making the name changes as needed
# dsn<-setwd("J:\\Elodea\\ByHUC6\\")
ogrListLayers(dsn)
ogrInfo(dsn=dsn, layer="trin_inca_pl03")
owd <- getwd()
setwd(dsn)
ogrInfo(dsn="trin_inca_pl03.shp", layer="trin_inca_pl03")
setwd(owd)
x <- readOGR(dsn=dsn, layer="trin_inca_pl03")
summary(x)
# Analysis ----------------------------------------------------------------
myfun <- function(x, i) {
  tmp <- lakeMorphoClass(x[i, ], NULL, NULL, NULL)
  x_lake_length <- vector("numeric", length = nrow(x))
  x_lake_length[i] <- lakeMaxLength(tmp, 200)
  print(i)
  Sys.sleep(0.1)
  return(x_lake_length)
}
x_lake_length <- foreach(i = 1:nrow(x), .combine = cbind, .packages = c("lakemorpho", "rgdal")) %dopar% (
  myfun(x, i)
)
options(digits=10)
df_Test_Fetch <- data.frame(x_lake_length)
write.table(df_Test_Fetch,file="C:/temp/Test_Fetch.csv",row.names=TRUE,col.names=TRUE, sep=",")
print(proc.time())
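A possible simplification of the same idea (a sketch, not the accepted code above): have each iteration return only its single fetch value and combine with c(), so foreach yields one numeric vector instead of an nrow(x) by nrow(x) matrix of mostly empty columns.
myfun2 <- function(x, i) {
  tmp <- lakeMorphoClass(x[i, ], NULL, NULL, NULL)
  lakeMaxLength(tmp, 200)
}
x_lake_length <- foreach(i = 1:nrow(x), .combine = c,
                         .packages = c("lakemorpho", "rgdal")) %dopar% {
  myfun2(x, i)
}
df_Test_Fetch <- data.frame(x_lake_length)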
