R shortest.paths for loops

I have a task that calls the function shortest.paths repeatedly, but my input is too large and I want to know how to make it faster. As far as I know, igraph is the best choice. My code is as follows.
First, I read a real network and 1000 random networks into a list:
library(igraph)
# real ppi network
REAL.PPI <- paste0(RANDM, "real.ppi.txt")
real.ppi <- graph.data.frame(read.table(REAL.PPI, header = F), directed = F)
ppi.gs <- V(real.ppi)$name
# random network
random_net_names <- dir(paste0(RANDM, "randomnetwork"))
random_nets <- lapply(random_net_names, function(x){
  path <- paste(RANDM, "randomnetwork/", x, sep = "")
  rn <- read.table(path, header = F)
  graph.data.frame(rn, directed = F)
})
Then I have to compare the node sets' shortest paths in the real and random networks. For this I chose a for-loop rather than an apply function, because the latter would not be any faster.
The input format is:
hsa-let-7a-2-3p hsa-let-7a-3p GO:0001702 4040 10818 4089
hsa-let-7a-2-3p hsa-let-7a-3p GO:0001764 27185 2625 5048 429 6695
My code is as follows:
# input
ovr <- strsplit(readLines("ovr.txt"), '\t')
# for-loop
OUT.final <- file("out.txt", "w")
for(i in 1:length(ovr)){
  hyper.ppi.ovr <- ovr[[i]][-c(1, 2, 3)]
  # sum of shortest-path lengths among the node set in the real network
  D1 <- sum(shortest.paths(real.ppi, hyper.ppi.ovr, hyper.ppi.ovr))
  # the same quantity in each of the 1000 random networks
  CPLs <- sapply(random_nets, function(r_net){
    sum(shortest.paths(r_net, hyper.ppi.ovr, hyper.ppi.ovr))
  })
  D2.p <- sum(CPLs < D1) / 1000
  if(D2.p > .01) next
  # output
  out.value <- ovr[[i]]
  cat(out.value, sep = "\t", file = OUT.final)
  cat("\n", file = OUT.final)
}
close(OUT.final)
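A minimal sketch of one way to cut the run time, assuming the bottleneck is the 1000 shortest.paths calls per row of ovr: evaluate the random networks in parallel with the parallel package, as a drop-in replacement for the sapply inside the loop (mclapply forks, so it needs Linux/macOS; the core count below is an assumption to adjust):
library(parallel)
n.cores <- 4  # assumption: set to the number of available cores
CPLs <- unlist(mclapply(random_nets, function(r_net){
  sum(shortest.paths(r_net, hyper.ppi.ovr, hyper.ppi.ovr))
}, mc.cores = n.cores))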

Related

Using "lapply" in R to create multiple raster files from folder with lidar data

How can I read all files in a folder, run a script on each, and create separate outputs named after the original files? I have a folder with .las files and I need to create corresponding .asc files from them. My script is below:
library(lidR)
# Path to data
LASfile <- ("path/1234.las")
# Sorting out points in point cloud data, keeping vegetation and ground point classes.
las <- readLAS(LASfile, filter="-keep_class 1 2") # Keep high vegetation and ground point classes
# Normalizing ground points to 0 elevation (idwinterpolation), instead of meters above sea level.
dtm <- grid_terrain(las, algorithm = knnidw(k = 8, p = 2))
las_normalized <- normalize_height(las, dtm)
# Create a filter to remove points above 95th percentile of height
lasfilternoise = function(las, sensitivity)
{
p95 <- grid_metrics(las, ~quantile(Z, probs = 0.95), 10)
las <- merge_spatial(las, p95, "p95")
las <- filter_poi(las, Z < p95*sensitivity)
las$p95 <- NULL
return(las)
}
# Generating a pitfree canopy height model without null values (Khosravipour et al., 2014)
las_denoised <- lasfilternoise(las_normalized, sensitivity = 1.2)
chm <- grid_canopy(las_denoised, 0.32, pitfree(c(0,2,5,10,15), c(3,1.5), subcircle = 0.2))
# Applying a median filter with a 3x3 moving window to smooth the image and remove noise
ker <- matrix(1,3,3)
chms <- raster::focal(chm, w = ker, fun = median)
plot(chms)
library(raster)
# Writing output file
writeRaster(chms, filename="path/1234.asc", format="ascii", overwrite=TRUE) # Change to the relevant name for each run
citation("lidR")
I tried using lapply but I don't know how to use it the right way.
It must be something like this to read all files in the folder: list.files("path", pattern = "*.las", full.names = TRUE)
and something like this to write the output files: lapply(r, writeRaster, filename = paste0(f, ".asc"), format = "ascii")
But I cannot get it right.
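For reference, the general shape of that lapply pattern is sketched below; process_one is a hypothetical wrapper around the script above, and the answers that follow give complete, working versions:
# hypothetical wrapper: run the processing steps above for a single .las file
process_one <- function(lasfile, outdir) {
  las <- lidR::readLAS(lasfile, filter = "-keep_class 1 2")
  # ... grid_terrain, normalize_height, lasfilternoise, grid_canopy, focal as above ...
  out <- file.path(outdir, sub("\\.las$", ".asc", basename(lasfile)))
  # writeRaster(chms, filename = out, format = "ascii", overwrite = TRUE)
  out
}
f <- list.files("path", pattern = "\\.las$", full.names = TRUE)
lapply(f, process_one, outdir = "path/out")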
An example of my LAZ to LAS+Index conversion:
convertLAZ <- function(lazfile, outdir = "") {
if(!dir.exists({{outdir}})) { dir.create({{outdir}}, recursive = TRUE)}
print(lazfile)
las <- lidR::readLAS(files = {{lazfile}}, filter = "-keep_class 2 9")
.file <- stringi::stri_replace_all_regex({{lazfile}}, "^.*/", "")
lidR::writeLAS(las, file = paste0({{outdir}}, "/", stringi::stri_replace_all_fixed(.file, "laz", "las")), index = TRUE)
}
f <- list.files("data/laz", pattern = "*.laz", full.names = TRUE)
lapply(f, convertLAZ, outdir = "data/las22")
You can expand it to rasterization, normalization, etc., and saving as .asc. But I would encourage you to have a look at https://r-lidar.github.io/lidRbook/engine.html. In short: process your LAZ/LAS files as a LAScatalog, then tile the resulting raster and save it to .asc.
And here is an example of how to use parallel processing (3+1 processes in the example below). Please note it can be memory hungry, so be careful with the number of workers and with processing parameters like opt_chunk_buffer.
library(future)
options(parallelly.availableCores.methods = "mc.cores")
options(mc.cores = 3)
plan(multisession)
parallelly::availableWorkers()
library(lidR)
myPath <- "data/las"
ctg <- readLAScatalog(myPath)
crs(ctg) <- "EPSG:2180"
ctg@output_options$drivers$SpatRaster$param$overwrite <- TRUE
opt_output_files(ctg) <- "data/dtm2/barycz__{XLEFT}_{YBOTTOM}"
opt_chunk_size(ctg) <- 500
opt_chunk_buffer(ctg) <- 600
opt_filter(ctg) <- "-keep_class 2 9"
summary(ctg)
vr <- rasterize_terrain(ctg, 0.25, tin())
plot(vr)
Solved it now
.libPaths( c( "C:/Users/Public/R/win-library/4.2" , .libPaths() ) )
library(lidR)
createASCI <- function(lasfile, outdir = "") {
if(!dir.exists({{outdir}})) { dir.create({{outdir}}, recursive = TRUE)}
print(lasfile)
las <- lidR::readLAS(files = {{lasfile}}, filter = "-keep_class 1 2 3 4 5")
.file <- stringi::stri_replace_all_regex({{lasfile}}, "^.*/", "")
# Normalizing ground points to 0 elevation (idwinterpolation), instead of meters above sea level.
dtm <- grid_terrain(las, algorithm = knnidw(k = 8, p = 2))
las_normalized <- normalize_height(las, dtm)
# Create a filter to remove points above 95th percentile of height
lasfilternoise = function(las, sensitivity)
{
p95 <- grid_metrics(las, ~quantile(Z, probs = 0.95), 10)
las <- merge_spatial(las, p95, "p95")
las <- filter_poi(las, Z < p95*sensitivity)
las$p95 <- NULL
return(las)
}
# Generating a pitfree canopy height model without null values (Khosravipour et al., 2014)
las_denoised <- lasfilternoise(las_normalized, sensitivity = 1.2)
chm <- grid_canopy(las_denoised, 0.32, pitfree(c(0,2,5,10,15), c(3,1.5), subcircle = 0.2))
# Applying a median filter with a 3x3 moving window to smooth the image and remove noise
ker <- matrix(1,3,3)
chms <- raster::focal(chm, w = ker, fun = median)
writeRaster(chms, filename = paste0({{outdir}}, "/", stringi::stri_replace_all_fixed(.file, "las", "asc")))
}
f <- list.files("C:/Lasdata", pattern = "*.las", full.names = TRUE)
lapply(f, createASCI, outdir = "C:/Lasdata/nytt")
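If the run over many files is slow, a hedged variant is to swap lapply for future.apply::future_lapply, reusing the future plan idea shown earlier (this assumes the future.apply package is installed; adjust the worker count to the available memory):
library(future)
library(future.apply)
plan(multisession, workers = 3)  # assumption: 3 workers
f <- list.files("C:/Lasdata", pattern = "*.las", full.names = TRUE)
future_lapply(f, createASCI, outdir = "C:/Lasdata/nytt", future.seed = TRUE)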

Creating a loop to create NDVI images from raster stacks and naming them from the file name

I have been trying to write a loop to go through two folders of Sentinel-2 satellite images (Band 4 and Band 5) and get an NDVI for each date.
A stack is created for each band, with some cropping and resampling, before finally proceeding to the NDVI calculation. I struggle with integrating the NDVI calculation into the loop and with creating the file names.
I'd simply want my loop to generate x files for x dates and then give each NDVI image the date as a name ("YYYY/MM/DD.tif") extracted from the file name, but I can't think of a way to do so after a lot of unsuccessful trial and error.
#list files
files4 <- list.files(path4, pattern = "jp2$", full.names = TRUE)
files5 <- list.files(path5, pattern = "jp2$", full.names = TRUE)
ms5 <- stack()
ms4 <- stack()
for (f in files4){
# loading a raster
r4 <- raster(f)
proj4string(r4)
proj4string(emprise)
emprise <- spTransform(emprise, proj4string(r4))
r4b <- crop(r4, emprise)
ms4<- stack(ms4,r4b)
#extract the date from the file name to give a name to the final NDVI image (I have to get rid of everything but the date)
x <- gsub("[A-z //.//(//)]", "", r4)
y <- substr(x, 4, 11)
}
for (f in files5){
# load the raster
r5 <- raster(f)
proj4string(r5)
proj4string(emprise)
emprise <- spTransform(emprise, proj4string(r5))
r5b <- crop(r5, emprise)
ms5<- stack(ms5,r5b)
}
#Resampling : setting the Band 5 to the same resolution as Band 4
b5_resamp <- resample(ms5, ms4)
Have you considered looping over dates rather than files? I can't give more specific advice without example data, but here is the general idea:
# List files
files4 <- list.files("./band4", pattern = ".tif", full.names = TRUE)
#> "band4/T31UDR_20170126T105321_B04.tif" "band4/T31UDR_20180126T105321_B04.tif"
files5 <- list.files("./band5", pattern = ".tif", full.names = TRUE)
#> "./band5/T31UDR_20170126T105321_B05.tif" "./band5/T31UDR_20180126T105321_B05.tif"
# Get dates
dates <- unique(gsub(pattern = ".*_(\\d{8}).*", replacement = "\\1", x = c(files4, files5)))
#> "20170126" "20180126"
# Define empty stacks
ms5 <- stack()
ms4 <- stack()
for(date in dates){
## Band 4
f4 <- list.files("./band4", pattern = date, full.names = TRUE)
# loading a raster
r4 <- raster(f4)
proj4string(r4)
proj4string(emprise)
emprise <- spTransform(emprise, proj4string(r4))
r4b <- crop(r4, emprise)
ms4 <- stack(ms4,r4b)
## Band 5
f5 <- list.files("./band5", pattern = date, full.names = TRUE)
# load the raster
r5 <- raster(f5)
proj4string(r5)
proj4string(emprise)
emprise <- spTransform(emprise, proj4string(r5))
r5b <- crop(r5, emprise)
ms5<- stack(ms5,r5b)
## Resampling : setting the Band 5 to the same resolution as Band 4
b5_resamp <- resample(ms5, ms4)
## Write to file
writeRaster(b5_resamp, filename = paste0(date, ".tif"))
}
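The loop above only writes the resampled Band 5. If the aim is the NDVI itself, one hedged extension is to replace the last two steps inside the loop with something like the following, working on the single-date layers r4b and r5b rather than the accumulated stacks (this keeps the question's assumption that Band 5 plays the NIR role and Band 4 the red role):
## Resample the single-date Band 5 layer to the Band 4 resolution
b5_resamp <- resample(r5b, r4b)
## NDVI = (NIR - Red) / (NIR + Red)
ndvi <- (b5_resamp - r4b) / (b5_resamp + r4b)
## Write to file, named after the date extracted from the file names
writeRaster(ndvi, filename = paste0(date, "_NDVI.tif"))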

R create Random Networks according data frame

I have a data frame "ref.df" that has info about 12 networks. I want to create 100 random networks for each subject according to their node and edge numbers.
I've tried this code, but it didn't work well:
library(igraph)
random.networks <- list()
for(i in ref.df$subject){
cat("...")
for( j in 1:100){
random.networks[[j]] <- sample_gnm(n=ref.df$node,m=ref.df$edge, directed = TRUE, loops = FALSE)
}
cat(i,"\n")
}
This code generates 100 random networks only for the first subject.
Thanks for your time and advice in advance.
You can reproduce my data frame:
ref.df <- data.frame(subject=c("Civil.Liberties","Foreign.Policy","Women.s.Rights","Workers..Rights",
"Political.Polarisation","Kurdish.Peace.Process","Parallel.State",
"HDP.Election.Slogans","Related.With.Election","CHP.Election.Slogans",
"AKP.Election.Slogans","MHP.Election.Slogans"),
group=c(298,1150,474,2522,0,2570,718,2736,0,1661,2175,1460),
mod=c(0.77,0.73,0.84,0.78,0,0.72,0.66,0.62,0,0.68,0.76,0.66),
node=c(13524,68792,21925,87094,195678,98008,28499,93024,201342,61539,91640,63035),
edge=c(18694,183932,27120,143032,710044,249267,108352,255615,579919,17590,3313147,213367))
If the problem is that you want 12 x 100 networks and you are only getting a list of 100, with a minimal modification to your code, you can do as follows:
random.networks <- list()
for (subj in ref.df$subject){
cat("...")
for (i in 1:100) {
tmp <- sample_gnm(n=ref.df$node[ref.df$subject == subj],
m=ref.df$edge[ref.df$subject == subj],
directed = TRUE, loops = FALSE)
random.networks[[(length(random.networks) + 1)]] <- tmp
names(random.networks)[length(random.networks)] <- paste(as.vector(subj), i, sep = "_")
}
cat(as.vector(subj),"\n")
}
random.networks
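A quick check on the result (names are built as subject_replicate):
length(random.networks)       # 12 subjects x 100 replicates = 1200 networks
head(names(random.networks))  # "Civil.Liberties_1" "Civil.Liberties_2" ...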
If you want to make sure that the random networks you generate are different, you may want to try the following approach, but because of the cross comparisons this will be very slow.
random.networks <- list()
look.up <- list()
for (subj in ref.df$subject){
cat("...")
for (i in 1:100) {
tmp <- NA
# enforce uniqueness
while(is.na(tmp)|
as.character(tmp)[4] %in% look.up) {
tmp <- sample_gnm(n=ref.df$node[ref.df$subject == subj],
m=ref.df$edge[ref.df$subject == subj],
directed = TRUE, loops = FALSE)
}
random.networks[[(length(random.networks) + 1)]] <- tmp
look.up[[(length(look.up) + 1)]] <- as.character(tmp)[4]
names(random.networks)[length(random.networks)] <- paste(as.vector(subj), i, sep = "_")
}
cat(as.vector(subj),"\n")
}
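A more compact equivalent of the first approach, sketched with nested lapply (not part of the original answer; it builds the same named 12 x 100 list without growing it element by element):
random.networks <- unlist(
  lapply(seq_len(nrow(ref.df)), function(r) {
    nets <- lapply(1:100, function(i)
      sample_gnm(n = ref.df$node[r], m = ref.df$edge[r],
                 directed = TRUE, loops = FALSE))
    setNames(nets, paste(ref.df$subject[r], 1:100, sep = "_"))
  }),
  recursive = FALSE
)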

Rhadoop mapreduce for multiple input files

I'm building a mapreduce program, using R, that extracts the relevant features from a set of features in a dataset using a genetic algorithm. I need to pass many files as input to my mapreduce job. The code below is my mapreduce program, but it works only for one input file (data.csv).
library(caret)
library(dplyr)
library(rmr2)
Sys.setenv(HADOOP_CMD="/home/rania/hadoop-2.7.3/bin/hadoop")
Sys.getenv("HADOOP_CMD")
Sys.setenv(HADOOP_STREAMING="/home/rania/hadoop-streaming-2.7.3.jar")
library(rhdfs)
hdfs.init()
rmr.options(backend = "hadoop")
hdfs.mkdir("/user/rania/genetic")
hdfs.mkdir("/user/rania/genetic/data")
I put my files in one folder in HDFS:
hadoop fs -copyFromLocal /home/rania/Downloads/matrices/*.csv /user/rania/genetic/data/
This is the map function
mon.map <- function(.,data){
data <- read.csv("/home/rania/Downloads/dataset.csv", header = T, sep = ";")
y <- c(1,0,1,0,1,1,1,1,0,0,1,0,1)
ga_ctrl <- gafsControl(functions = rfGA, # Assess fitness with RF
method = "cv") # 10 fold cross validation
set.seed(10)
lev <- c("1","0")
rf_ga3 <- gafs(x = data, y = y,
iters = 10, # 100 generations of algorithm
popSize = 4, # population size for each generation
levels = lev,
gafsControl = ga_ctrl)
keyval(rf_ga3$ga$final, data[names(data) %in% rf_ga3$ga$final] )
}
This is the reduce function
mon.reduce <- function(k,v){
keyval(k,v) }
Now I apply the mapreduce job:
hdfs.root = 'genetic'
hdfs.data = file.path(hdfs.root, 'data')
hdfs.out = file.path(hdfs.root, 'out')
csv.format <- make.output.format("csv")
genetic = function (input, output) {mapreduce(input=input, output=output, input.format="csv",output.format=csv.format, map=mon.map,reduce=mon.reduce)}
out = genetic(hdfs.data, hdfs.out)
Then we print the result from HDFS:
results <- from.dfs(out, format="csv")
print(results)
OR
hdfs.cat("/genetic/out/part-00000")
I tried to change the map function to make it work for many files, but it failed:
mon.map <- function(.,data){
data <- list.files(path="/home/rania/Downloads/matrices/", full.names=TRUE, pattern="\\.csv") %>% lapply(read.csv, header=TRUE, sep=",")
y <- c(1,0,1,0,1,1,1,1,0,0,1,0,1)
for (i in 1:4){
ga_ctrl <- gafsControl(functions = rfGA, # Assess fitness with RF
method = "cv") # 10 fold cross validation
set.seed(10)
lev <- c("1","0")
rf_ga3 <- gafs(x = data[[i]], y = y,
iters = 10, # 100 generations of algorithm
popSize = 4, # population size for each generation
levels = lev,
gafsControl = ga_ctrl)
}
keyval(rf_ga3$ga$final, do.call(cbind, Map(`[`, data, c(rf_ga3$ga$final))) )
}
What can I change in the previous map function to make it work for many input files? Thanks.
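One direction to try (a sketch only, not a verified solution): with input.format = "csv", rmr2 streams each chunk of every file under the HDFS input directory into the map function's second argument, so the map function can work on data directly instead of re-reading a fixed local file. The y vector is kept from the question and would still have to match the rows of each chunk:
mon.map <- function(., data){
  # 'data' is the chunk handed in by rmr2 from whichever input file is being processed
  y <- c(1,0,1,0,1,1,1,1,0,0,1,0,1)  # from the question; must match nrow(data)
  ga_ctrl <- gafsControl(functions = rfGA, # Assess fitness with RF
                         method = "cv")    # 10 fold cross validation
  set.seed(10)
  lev <- c("1","0")
  rf_ga3 <- gafs(x = data, y = y,
                 iters = 10, popSize = 4,
                 levels = lev, gafsControl = ga_ctrl)
  keyval(rf_ga3$ga$final, data[names(data) %in% rf_ga3$ga$final])
}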

R: Row resampling loop speed improvement

I'm subsampling rows from a dataframe with columns c("x","y","density") at a variety of c("s_size","reps"). reps = number of replicates, s_size = number of rows subsampled from the whole dataframe.
> head(data_xyz)
x y density
1 6 1 0
2 7 1 17600
3 8 1 11200
4 12 1 14400
5 13 1 0
6 14 1 8000
#Subsampling###################
subsample_loop <- function(s_size, reps, int) {
tm1 <- system.time( #start timer
{
subsample_bound = data.frame()
#Perform Subsampling of the general
for (s_size in seq(1,s_size,int)){
for (reps in 1:reps) {
subsample <- sample.df.rows(s_size, data_xyz)
assign(paste("sample" ,"_","n", s_size, "_", "r", reps , sep=""), subsample)
subsample_replicate <- subsample[,] #temporary variable
subsample_replicate <- cbind(subsample, rep(s_size,(length(subsample_replicate[,1]))),
rep(reps,(length(subsample_replicate[,1]))))
subsample_bound <- rbind(subsample_bound, subsample_replicate)
}
}
}) #end timer
colnames(subsample_bound) <- c("x","y","density","s_size","reps")
subsample_bound
} #end function
Here's the function call:
source("R/functions.R")
subsample_data <- subsample_loop(s_size=206, reps=5, int=10)
Here's the row subsample function:
# Samples a number of rows in a dataframe, outputs a dataframe of the same # of columns
# df Data Frame
# N number of samples to be taken
sample.df.rows <- function (N, df, ...)
{
df[sample(nrow(df), N, replace=FALSE,...), ]
}
It's way too slow; I've tried a few times with apply functions and had no luck. I'll be doing somewhere around 1,000-10,000 replicates for each s_size from 1:250.
Let me know what you think! Thanks in advance.
=========================================================================
UPDATE EDIT: Sample data from which to sample:
https://www.dropbox.com/s/47mpo36xh7lck0t/density.csv
Joran's code in a function (in a sourced function.R file):
foo <- function(i,j,data){
res <- data[sample(nrow(data),i,replace = FALSE),]
res$s_size <- i
res$reps <- rep(j,i)
res
}
resampling_custom <- function(dat, s_size, int, reps) {
ss <- rep(seq(1,s_size,by = int),each = reps)
id <- rep(seq_len(reps),times = s_size/int)
out <- do.call(rbind,mapply(foo,i = ss,j = id,MoreArgs = list(data = dat),SIMPLIFY = FALSE))
}
Calling the function
set.seed(2)
out <- resampling_custom(dat=retinal_xyz, s_size=206, int=5, reps=10)
outputs data, unfortunately with this warning message:
Warning message:
In mapply(foo, i = ss, j = id, MoreArgs = list(data = dat), SIMPLIFY = FALSE) :
longer argument not a multiple of length of shorter
I put very little thought into actually optimizing this, I was just concentrating on doing something that's at least reasonable while matching your procedure.
Your big problem is that you are growing objects via rbind and cbind. Basically, any time you see someone write data.frame() or c() and then expand that object using rbind, cbind or c, you can be very sure that the resulting code will essentially be the slowest possible way of doing whatever task is being attempted.
This version is around 12-13 times faster, and I'm sure you could squeeze some more out of this if you put some real thought into it:
s_size <- 200
int <- 10
reps <- 30
ss <- rep(seq(1,s_size,by = int),each = reps)
id <- rep(seq_len(reps),times = s_size/int)
foo <- function(i,j,data){
res <- data[sample(nrow(data),i,replace = FALSE),]
res$s_size <- i
res$reps <- rep(j,i)
res
}
out <- do.call(rbind,mapply(foo,i = ss,j = id,MoreArgs = list(data = dat),SIMPLIFY = FALSE))
The best part about R is that not only is this way, way faster, it's also way less code.
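A side note on the warning quoted in the update: it appears whenever s_size/int is not a whole number (e.g. s_size = 206, int = 5), so ss and id end up with different lengths. One way to avoid it is to derive both vectors from the same sequence:
sizes <- seq(1, s_size, by = int)
ss <- rep(sizes, each = reps)
id <- rep(seq_len(reps), times = length(sizes))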
