Optimize the for loop in R

DUMMY DATA SET (the difference from my real data set is that item_code is a string in my case):

library(dplyr)  # for filter() and bind_rows() used below

in_cluster <- data.frame(item_code = c(1:500))
in_cluster$cluster <-
  sample(5, size = nrow(in_cluster), replace = TRUE)
real_sales <- data.frame(item_code = numeric(0), sales = numeric(0))
real_sales <-
  data.frame(
    item_code = sample(500, size = 100000, replace = TRUE),
    sales = sample(500, size = 100000, replace = TRUE)
  )
mean_trajectory <- data.frame(sales = c(1:52))
mean_trajectory$sales <- sample(500, size = 52, replace = TRUE)
training_df <- data.frame(
  LTF_t_minus_1 = numeric(0),
  LTF_t = numeric(0),
  LTF_t_plus_1 = numeric(0),
  RS_t_minus_1 = numeric(0),
  RS_t = numeric(0),
  STF_t_plus_1 = numeric(0)
)
training_df[nrow(training_df) + 1, ] <-
  c(0, 0, mean_trajectory$sales[[1]], 0, 0, 19) # week 0
week <- 2
I have a simple loop in R in which all I do is the following:
system.time({
  for (r in 1:nrow(in_cluster)) {
    item <- in_cluster[r, ]
    sale_row <-
      dplyr::filter(real_sales, item_code == item$item_code)
    if (nrow(sale_row) > 2) {
      new_df <- data.frame(
        LTF_t_minus_1 = mean_trajectory$sales[[week - 1]],
        LTF_t = mean_trajectory$sales[[week]],
        LTF_t_plus_1 = mean_trajectory$sales[[week + 1]],
        RS_t_minus_1 = sale_row$sales[[week - 1]],
        RS_t = sale_row$sales[[week]],
        STF_t_plus_1 = sale_row$sales[[week + 1]]
      )
      training_df <-
        bind_rows(training_df, new_df)
    }
  }
})
I am quite new to R, and this seems really strange given how small the data is: it takes 421.59 seconds to loop through the 500 rows of in_cluster.

EDIT (IMPORTANT): For the dummy data set given above, however, it took only 1.10 seconds to get the output. Could this be because item_code is a string in my real data? Does processing a string item_code really take that much longer? (I didn't use strings in the dummy data set because I don't know how to generate 500 unique strings for item_code in in_cluster and reuse the same strings as item_code in real_sales.)
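For reference, a string-keyed version of this dummy data could be built as in the sketch below (the ITEM_ prefix and the sprintf format are arbitrary choices):

# sketch: give each of the 500 items a unique string code and reuse those codes in real_sales
in_cluster <- data.frame(item_code = sprintf("ITEM_%03d", 1:500),
                         stringsAsFactors = FALSE)
in_cluster$cluster <- sample(5, size = nrow(in_cluster), replace = TRUE)
real_sales <- data.frame(
  item_code = sample(in_cluster$item_code, size = 100000, replace = TRUE),
  sales = sample(500, size = 100000, replace = TRUE),
  stringsAsFactors = FALSE
)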
I read through a few other articles that suggested ways to optimize R code, and I used bind_rows instead of rbind, or alternatively assigned the row directly:
training_df[nrow(training_df) + 1, ] <-
  c(mean_trajectory$sales[[week - 1]], mean_trajectory$sales[[week]], mean_trajectory$sales[[week + 1]],
    sale_row$sales[[week - 1]], sale_row$sales[[week]], sale_row$sales[[week + 1]])
Using bind_rows seems to have improved performance by about 36 seconds when looping through the 500 rows of the in_cluster data frame.
Is it possible to use lapply in this scenario? I tried the code below and got an error:

Error in filter_impl(.data, dots) : $ operator is invalid for atomic vectors
myfun <- function(item, sales, mean_trajectory, week) {
  sale_row <- filter(sales, item_code == item$item_code)
  data.frame(
    LTF_t_minus_1 = mean_trajectory$sales[[week - 1]],
    LTF_t = mean_trajectory$sales[[week]],
    LTF_t_plus_1 = mean_trajectory$sales[[week + 1]],
    RS_t_minus_1 = sale_row$sales[[week - 1]],
    RS_t = sale_row$sales[[week]],
    STF_t_plus_1 = sale_row$sales[[week + 1]]
  )
}
system.time({
  lapply(in_cluster, myfun, sales = sales, mean_trajectory = mean_trajectory) %>% bind_rows()
})
Help with lapply would be appreciated; however, my main goal is to speed up the loop.

OK, so there are a lot of bad practices in your code:
- You are operating per row.
- You are creating 2(!) new data frames per row (very expensive).
- You are growing an object in a loop: that training_df <- bind_rows(training_df, new_df) keeps growing on each iteration while running a pretty expensive operation (bind_rows). See the sketch just after this list.
- You are running the same operation over and over again when you could run it just once. Why are you computing mean_trajectory$sales[[week - 1]] and the like per row when mean_trajectory has nothing to do with the loop? You could just assign those values afterwards.
- And the list goes on...
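To illustrate the object-growing point, a common pattern (sketched here, still keeping the row-wise logic from the question) is to collect the per-iteration results in a pre-allocated list and bind once at the end:

# sketch: accumulate rows in a list, bind once at the end instead of growing training_df
rows <- vector("list", nrow(in_cluster))
for (r in 1:nrow(in_cluster)) {
  item <- in_cluster[r, , drop = FALSE]
  sale_row <- dplyr::filter(real_sales, item_code == item$item_code)
  if (nrow(sale_row) > 2) {
    rows[[r]] <- data.frame(
      LTF_t_minus_1 = mean_trajectory$sales[[week - 1]],
      LTF_t = mean_trajectory$sales[[week]],
      LTF_t_plus_1 = mean_trajectory$sales[[week + 1]],
      RS_t_minus_1 = sale_row$sales[[week - 1]],
      RS_t = sale_row$sales[[week]],
      STF_t_plus_1 = sale_row$sales[[week + 1]]
    )
  }
}
training_df <- dplyr::bind_rows(rows)  # NULL entries are dropped by bind_rows

This alone removes the quadratic cost of re-copying training_df on every iteration, although the per-row dplyr::filter() remains the dominant cost.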
I would suggest an alternative, simple data.table solution that will perform much better. The idea is to first do a binary join between in_cluster and real_sales (and run all the operations while joining, without creating extra data frames and then binding them). Then run all the mean_trajectory-related lines only once. (I ignored the training_df[nrow(training_df) + 1, ] <- c(0, 0, mean_trajectory$sales[[1]], 0, 0, 19) initialization as it's irrelevant here; you can just add it afterwards using rbind.)
library(data.table) # v1.10.4

## First step
res <-
  setDT(real_sales)[setDT(in_cluster),                              # binary join
                    if (.N > 2) .(RS_t_minus_1 = sales[week - 1],   # the stuff you want to do
                                  RS_t = sales[week],               # per the condition
                                  STF_t_plus_1 = sales[week + 1]),
                    on = "item_code",                               # the join key
                    by = .EACHI]                                    # do the operations per each join

## Second step (run the `mean_trajectory` lines only once)
res[, `:=`(LTF_t_minus_1 = mean_trajectory$sales[week - 1],
           LTF_t = mean_trajectory$sales[week],
           LTF_t_plus_1 = mean_trajectory$sales[week + 1])]
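For comparison, since the question already uses dplyr, a roughly equivalent (untested, and slower) dplyr formulation of the same join-then-summarise idea might be:

library(dplyr)
res_dplyr <- real_sales %>%
  semi_join(in_cluster, by = "item_code") %>%   # keep only items that appear in in_cluster
  group_by(item_code) %>%
  filter(n() > 2) %>%                           # same "> 2 sales rows" condition
  summarise(RS_t_minus_1 = sales[week - 1],
            RS_t         = sales[week],
            STF_t_plus_1 = sales[week + 1]) %>%
  mutate(LTF_t_minus_1 = mean_trajectory$sales[week - 1],  # constants, computed once
         LTF_t         = mean_trajectory$sales[week],
         LTF_t_plus_1  = mean_trajectory$sales[week + 1])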
Some benchmarks:
### Creating your data sets
set.seed(123)
N <- 1e5
N2 <- 5e7
in_cluster <- data.frame(item_code = c(1:N))
real_sales <-
  data.frame(
    item_code = sample(N, size = N2, replace = TRUE),
    sales = sample(N, size = N2, replace = TRUE)
  )
mean_trajectory <- data.frame(sales = sample(N, size = 25, replace = TRUE))
training_df <- data.frame(
  LTF_t_minus_1 = numeric(0),
  LTF_t = numeric(0),
  LTF_t_plus_1 = numeric(0),
  RS_t_minus_1 = numeric(0),
  RS_t = numeric(0),
  STF_t_plus_1 = numeric(0)
)
week <- 2
###############################
################# Your solution
system.time({
  for (r in 1:nrow(in_cluster)) {
    item <- in_cluster[r, , drop = FALSE]
    sale_row <-
      dplyr::filter(real_sales, item_code == item$item_code)
    if (nrow(sale_row) > 2) {
      new_df <- data.frame(
        LTF_t_minus_1 = mean_trajectory$sales[[week - 1]],
        LTF_t = mean_trajectory$sales[[week]],
        LTF_t_plus_1 = mean_trajectory$sales[[week + 1]],
        RS_t_minus_1 = sale_row$sales[[week - 1]],
        RS_t = sale_row$sales[[week]],
        STF_t_plus_1 = sale_row$sales[[week + 1]]
      )
      training_df <-
        bind_rows(training_df, new_df)
    }
  }
})
### Ran forever - I killed it after half an hour
######################
########## My solution
library(data.table)
system.time({
  res <-
    setDT(real_sales)[setDT(in_cluster),
                      if (.N > 2) .(RS_t_minus_1 = sales[week - 1],
                                    RS_t = sales[week],
                                    STF_t_plus_1 = sales[week + 1]),
                      on = "item_code",
                      by = .EACHI]
  res[, `:=`(LTF_t_minus_1 = mean_trajectory$sales[week - 1],
             LTF_t = mean_trajectory$sales[week],
             LTF_t_plus_1 = mean_trajectory$sales[week + 1])]
})
#    user  system elapsed
#    2.42    0.05    2.47
So for 50 million rows, the data.table solution ran in about 2 seconds, while your solution ran endlessly until I killed it after half an hour.
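A note on the lapply error in the question: lapply() over a data frame iterates over its columns (atomic vectors), not its rows, which is why item$item_code fails with "$ operator is invalid for atomic vectors". A minimal row-wise sketch (it will still be slow, for the same reasons as the for loop) would be to iterate over the item codes themselves:

# sketch: apply over item codes instead of over the data frame's columns
myfun <- function(code, sales, mean_trajectory, week) {
  sale_row <- dplyr::filter(sales, item_code == code)
  if (nrow(sale_row) <= 2) return(NULL)
  data.frame(
    LTF_t_minus_1 = mean_trajectory$sales[[week - 1]],
    LTF_t = mean_trajectory$sales[[week]],
    LTF_t_plus_1 = mean_trajectory$sales[[week + 1]],
    RS_t_minus_1 = sale_row$sales[[week - 1]],
    RS_t = sale_row$sales[[week]],
    STF_t_plus_1 = sale_row$sales[[week + 1]]
  )
}
training_list <- lapply(in_cluster$item_code, myfun,
                        sales = real_sales, mean_trajectory = mean_trajectory, week = week)
training_df <- dplyr::bind_rows(training_list)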

Related

Using app function from {terra} package on raster stacks? (in parallel)

I have four high-resolution rasters for a country. I have split each raster into tiles and done some other processing on them. I now want to apply a function to each cell of each 'stack' of raster tiles to produce one set of output tiles. The function is a little complex. I have tried to synthesise some data below to reproduce my current approach. It works (ish), but I'm convinced there's a better way to do this. To use parallel processing on my Unix box, I simply swap mapply for mcmapply, but I haven't done that in the example below as I presume many will be working on Windows machines. I'd welcome ideas on my approach, and particularly on optimisation.
library("terra")
library("glue")
## Make some toy data
dir.create("temp_folder")
dir.create("result_folder")
x <- rast(ncols = 10, nrows = 10)
a <- rast(ncol = 100, nrow = 100)
some_values <- as.integer(runif(10000, min = 1, max = 100))
ind <- which(some_values %in% sample(some_values, 15))
some_values[ind] <- NA
values(a) <- some_values
a_tiles <- makeTiles(a, x, glue("temp_folder/tile_a_{1:100}.tif"), overwrite = TRUE)
b <- rast(ncol = 100, nrow = 100)
some_values <- as.integer(runif(10000, min = 1, max = 100))
ind <- which(some_values %in% sample(some_values, 15))
some_values[ind] <- NA
values(b) <- some_values
b_tiles <- makeTiles(b, x, glue("temp_folder/tile_b_{1:100}.tif"), overwrite = TRUE)
c <- rast(ncol = 100, nrow = 100)
some_values <- as.integer(runif(10000, min = 1, max = 100))
ind <- which(some_values %in% sample(some_values, 15))
some_values[ind] <- NA
values(c) <- some_values
c_tiles <- makeTiles(c, x, glue("temp_folder/tile_c_{1:100}.tif"), overwrite = TRUE)
d <- rast(ncol = 100, nrow = 100)
some_values <- as.integer(runif(10000, min = 1, max = 100))
ind <- which(some_values %in% sample(some_values, 15))
some_values[ind] <- NA
values(d) <- some_values
d_tiles <- makeTiles(d, x, glue("temp_folder/tile_d_{1:100}.tif"), overwrite = TRUE)
## Outer function so that this can be used in parallel? But maybe this is a silly way to do it?
outer_function <- function(a_tiles, b_tiles, c_tiles, d_tiles, output_files) {
  one_a_tile <- rast(unlist(a_tiles))
  one_b_tile <- rast(unlist(b_tiles))
  one_c_tile <- rast(unlist(c_tiles))
  one_d_tile <- rast(unlist(d_tiles))
  output_file <- output_files
  # I replace any NAs with 0 as an NA will break the 'if' statements of the inner_function.
  # Otherwise I get: Error in if (z["a"] <= z["b"]) { : missing value where TRUE/FALSE needed
  one_a_tile[is.na(one_a_tile)] <- 0
  one_b_tile[is.na(one_b_tile)] <- 0
  one_c_tile[is.na(one_c_tile)] <- 0
  one_d_tile[is.na(one_d_tile)] <- 0
  z <- sds(one_a_tile, one_b_tile, one_c_tile, one_d_tile)
  ## Inner function that actually does the work I want doing
  inner_function <- function(z) {
    names(z) <- c('a', 'b', 'c', 'd')
    if (z['a'] <= z['b']) {
      y <- rowSums(cbind((z['c'] + z['a'] * 10),
                         (z['c'] + z['a'] * 20)))
    }
    if (z['a'] >= z['b']) {
      y <- rowSums(cbind((z['c'] + z['a'] * 40),
                         (z['c'] + z['a'] * 10)))
    }
    if (z['a'] == z['b']) {
      y <- rowSums(cbind((z['c'] + z['a'] * 60),
                         (z['c'] + z['a'] * 10)))
    }
    y <- ifelse(y == 0, NA, y)
    return(y)
  }
  app(z,
      inner_function,
      filename = output_file,
      overwrite = TRUE,
      wopt = list(datatype = "INT4U"))
  return(output_file)
}
results <- mapply(outer_function,
                  a_tiles = a_tiles,
                  b_tiles = b_tiles,
                  c_tiles = c_tiles,
                  d_tiles = d_tiles,
                  output_files = output_files <- glue("result_folder/result_tile_{1:length(d_tiles)}.tif"))
names(results) <- NULL
unlink("temp_folder", recursive = TRUE)
unlink("result_folder", recursive = TRUE)

Parallel file processing with two tasks using the foreach package in R

I have 300 files of 500 MB each. I want to read them in parallel and filter them down based on some arguments.
To replicate the problem, let's assume I have 3 monthly files:
library(data.table)
library(foreach)
library(doParallel)
set.seed(1)
FirstStatus <- data.frame(IDs = sample(1:200, 100, replace = F))
month_1 <- data.table(IDs = sample(1:200, 100, replace = F),
                      Month = rep(1, 100),
                      Status = rep(c('a','b','c','d'), 25))
month_2 <- data.table(IDs = sample(1:200, 100, replace = F),
                      Month = rep(2, 100),
                      Status = rep(c('a','b','c','d'), 25))
month_3 <- data.table(IDs = sample(1:200, 100, replace = F),
                      Month = rep(3, 100),
                      Status = rep(c('a','b','c','d'), 25))
Here FirstStatus is the full ID population. I used the function below to read the monthly files and collect the data into one data frame.
Execute <- function(FirstStatus, Start = 1, End = 3){
  for(i in Start:End){
    print(paste0('Read month ', i))               # which is in our case month_1, month_2, month_3
    month <- fread(paste0('month_', i, '.txt'))   # monthly files generated above
    month <- month[IDs %in% FirstStatus$IDs, .(IDs, Month, Status)]
    month <- month[Status %in% c('a','c','d')]
    if(i == Start){
      FirstMonth <- month
    } else {
      FirstMonth <- rbind(FirstMonth, month)
    }
    FirstStatus <- FirstStatus[!(IDs %in% FirstMonth$IDs)]
  }
  return(list('FirstStatus' = FirstStatus, 'FirstMonth' = FirstMonth))
}
res <- Execute(FirstStatus = FirstStatus, Start = 1, End = 3)
This works fine, but it takes a long time when looping over 300 files. I'm trying to optimize this for-loop using the foreach package in R, but I'm struggling a bit. First I read the first file to extract FirstMonth and to filter down FirstStatus:
cl <- makeCluster(3)
registerDoParallel(cl)
month <- fread(paste0('month_', 1 ,'.txt'))
month <- month[IDs %in% FirstStatus$IDs, .(IDs, Month, Status)]
month <- month[Status %in% c('a','c','d')]
FirstMonth <- month
FirstStatus <- FirstStatus[!(IDs %in% FirstMonth$IDs)]
Next, I used foreach to read the remaining files in parallel, although I'm struggling with how to update FirstStatus and FirstMonth:
ExecutePar <- function(FirstStatus, FirstMonth, Start = 2, End = 3){
  foreach(i = Start:End, .combine = 'rbind', .packages = c('data.table', 'foreach'),
          .export = c('FirstStatus', 'FirstMonth')) %dopar% {
    month <- fread(paste0('month_', i, '.txt'))
    month <- month[IDs %in% FirstStatus$IDs, .(IDs, Month, Status)]
    month <- month[Status %in% c('a','c','d')]
    FirstStatus <- FirstStatus[!(IDs %in% c(FirstMonth, month)$IDs)]
    c(FirstMonth, month)
  }
}
res <- ExecutePar(FirstStatus, FirstMonth, Start = 2, End = 3)
Any advice on how to improve the for-loop is appreciated.
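One way this might be restructured (only a sketch, and it assumes the FirstStatus filtering can be deferred until after all files are read, i.e. each ID keeps its rows from the earliest month in which it appears with a kept status):

library(data.table)
library(foreach)
library(doParallel)

cl <- makeCluster(3)
registerDoParallel(cl)

## read + status-filter every file in parallel (no cross-file dependency here)
all_months <- foreach(i = 1:3, .combine = 'rbind', .packages = 'data.table') %dopar% {
  month <- fread(paste0('month_', i, '.txt'))
  month[Status %in% c('a', 'c', 'd'), .(IDs, Month, Status)]
}
stopCluster(cl)

## sequential part: per ID from the population, keep only its earliest month's rows
FirstMonth  <- all_months[IDs %in% FirstStatus$IDs][, .SD[Month == min(Month)], by = IDs]
FirstStatus <- FirstStatus[!(FirstStatus$IDs %in% FirstMonth$IDs), , drop = FALSE]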

What would be the best way to improve calculation performance in a big data.table?

In a single data.table, I have many calculations to perform. They are simple, but they combine many configurations: creating X variables from Y others, making groups based on X different variables, etc.
Step by step, I manage to perform all the calculations I need (with my knowledge of data.table), but my real challenge is performance. My data.table contains millions of rows, and the calculations are made on dozens and dozens of columns.
What I would like to know:
- Is there a better way to write this code to improve performance?
- Some of my options do not work (1.3 and 2.2, tagged KO): is the approach good, and how should they be written?
- My microbenchmark seems to show that the best option depends on the number of rows. Is that right?
Here is my code with a reprex:
library(data.table)
library(stringr)
library(microbenchmark)

n.row <- 1e5

foo <- data.table(id = 101:(101 + n.row - 1),
                  crit = rep(c('fr', 'ca', 'al', 'se', 'is'), 5),
                  val_1 = round(runif(n.row, 0.5, 50), digits = 2),
                  val_2 = round(runif(n.row, 1, 20), digits = 0),
                  val_3 = round(runif(n.row, 1, 5), digits = 0),
                  eff = 28500,
                  num = sample(0:1, n.row, replace = TRUE),
                  num_2 = round(runif(n.row, 1, 10), digits = 1),
                  num_17 = round(runif(n.row, 1, 10), digits = 1),
                  num_69 = round(runif(n.row, 0, 1), digits = 2),
                  num_5 = round(runif(n.row, 10, 20), digits = 0),
                  cof = round(runif(n.row, 0.1, 2), digits = 5),
                  ToDo = rep(1, n.row),
                  grp_1 = sample(LETTERS[c(1,3)], n.row, replace = TRUE))

foo[, c("grp_2", "grp_3") := {
  grp_2 = fcase(grp_1 %in% LETTERS[c(1)], sample(LETTERS[c(5,8,9)], n.row, replace = TRUE),
                grp_1 %in% LETTERS[c(3)], sample(LETTERS[c(14,16)], n.row, replace = TRUE))
  grp_3 = fcase(grp_1 %in% LETTERS[c(1)], sample(LETTERS[c(20:23)], n.row, replace = TRUE),
                grp_1 %in% LETTERS[c(3)], sample(LETTERS[c(24:26)], n.row, replace = TRUE))
  list(grp_2, grp_3)
}]
# Calculate sd and qa
foo[, sd := (val_1 * cof)]
foo[num == 1, qa := (val_2 * cof)]
foo[num != 1, qa := (val_3 * cof)]

foo1 <- copy(foo)
foo2 <- copy(foo)
foo3 <- copy(foo)

# Calculate qa_X
var.calc <- names(foo)[str_which(names(foo), "^num.\\d+$")]
# 1.1
for (j in var.calc){
  foo1[, paste0("qa_", str_extract(j, "\\d+$")) := qa * get(j)]
}
# 1.2
setDT(foo2)[, paste0("qa_", str_extract(var.calc, "\\d+$")) := lapply(.SD, function(x) x * qa), .SDcols = var.calc]
# 1.3 KO
for (j in var.calc){ set(foo3, paste0("qa_", str_extract(j, "\\d+$")) := qa * get(j)) }

# Comparison
mbm <- microbenchmark(
  Test.for = for (j in var.calc){ foo1[, paste0("qa_", str_extract(j, "\\d+$")) := qa * get(j)] },
  Test.set = setDT(foo2)[, paste0("qa_", str_extract(var.calc, "\\d+$")) := lapply(.SD, function(x) x * qa), .SDcols = var.calc],
  times = 10
)
mbm

# Calculation by groups
var.grp <- names(foo)[grepl("^grp.\\d+$", names(foo))]
# 2.1
for (j in var.grp) {
  foo1[, paste0("s.sd.", j) := sum(sd, na.rm = TRUE), by = get(j)]
  foo1[, paste0("s.qa.", j) := sum(qa, na.rm = TRUE), by = get(j)]
}
# 2.2 KO
setDT(foo2)[, paste0("s.sd.", var.grp) := lapply(.SD, function(x) sum(x)), .SDcols = var.calc, by = .SD]
Many thanks for any help or suggestions. (If I have to split my request, I will.)
For question 2, I would use:

for (j in var.calc) set(foo3, j = paste0("qa_", str_extract(j, "\\d+$")), value = foo3$qa * foo3[[j]])

(a fixed version of the 1.3 example)

For question 3: 2.1 seems fine.

Notes:
- you don't need to keep calling setDT(foo2)
- read the data.table documentation; there are lots of useful examples etc.: https://rdatatable.gitlab.io/data.table/
- don't rely too much on microbenchmarks: try the code on your real data and time that, because the results (timings) will be different and the overhead that some of data.table's functions have will become insignificant.
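For 2.2, one possible way to write the grouped sums without repeating the two lines per grouping variable is sketched below; it is essentially option 2.1 expressed with lapply/.SDcols, so whether it is actually faster needs to be checked on the real data:

# sketch: one := call per grouping variable, summing both sd and qa at once
for (j in var.grp) {
  foo2[, paste0(c("s.sd.", "s.qa."), j) := lapply(.SD, sum, na.rm = TRUE),
       .SDcols = c("sd", "qa"), by = c(j)]
}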

Issue with disag_model() function from the disaggregation R package

I was trying the disaggregation package to evaluate whether it could be used on the dataset I have. My original data are disaggregated, so I've aggregated them in order to use the disag_model function from the disaggregation package and compare the "fitted values" with the actual values.
However, when I run the function the R session aborts.
I tried to execute the disag_model function step by step and I saw that the problem is due to the use of nlminb() to optimize the a posteriori density function, but I cannot understand why it's happening or how to solve it.
Thanks for your help.
You can find the data I used at this link: https://www.dropbox.com/sh/au7l0e11trzfo19/AACpfRSUpd4gRCveUsh5JX6Ea?dl=0 (please download the folder to run the code).
This is the code I used:
library(tidyverse)
library(raster)
library(disaggregation)
library(sp)
path<- "yourPath/Data"
load(file.path(path, "myRS"))
load(file.path(path, "RAST"))
Data <- read.csv(file = paste(path, "/sim_data.csv", sep = ""))
Data$HasRes <- ifelse(Data$PN50 > runif(nrow(Data)), 1, 0)
for (i in 1:nlayers(myRS)) {
  myRS@layers[[i]]@file@name <- file.path(path, "predStackl10")
}
DFCov <-
as.data.frame(raster::extract(myRS, Data[c("XCoord", "YCoord")]))
Data <- cbind(Data, DFCov)
# Remove NA
NAs <- which(is.na(rowSums(Data[names(myRS)])))
Data <- Data[-NAs, ]
Data$ISO3 <- as.factor(Data$ISO3)
world_shape <-
  shapefile(file.path(path, "World.shp"))
lmic_shape <-
  world_shape[(world_shape@data$ISO3 %in% levels(Data$ISO3)), ]
plot(lmic_shape)
# I would like to convert Data in a SpatialPointsDataFrame object
PN50 <- Data
coordinates(PN50) <- c("XCoord", "YCoord")
is.projected(PN50) # see if a projection is defined
proj4string(PN50) <- CRS("+proj=longlat +datum=WGS84")
# compute the mean P50 within each state
PN50_mean <- aggregate(x = PN50,
                       by = list(Data$ISO3),
                       FUN = mean)
# compute the centroid of the observations coordinates for each state
PN50_centroid <-
Data %>% group_by(ISO3) %>% summarise(meanX = mean(XCoord), meanY = mean(YCoord))
# assign to each mean the centroid coordinates
PN50_agg <-
  as.data.frame(
    cbind(
      PN50_mean = PN50_mean@data$PN50,
      XCoord = PN50_centroid$meanX,
      YCoord = PN50_centroid$meanY
    )
  )
PN50_agg$XCoord <- as.numeric(PN50_agg$XCoord)
PN50_agg$YCoord <- as.numeric(PN50_agg$YCoord)
PN50_agg$ISO3 <- as.character(PN50_centroid$ISO3)
samsiz <-
Data %>% group_by(ISO3) %>% summarise(sz = sum(SampleSize))
PN50_agg$sample_size <- as.numeric(samsiz$sz)
PN50_agg$case <- round(PN50_agg$PN50_mean * PN50_agg$sample_size)
# I would like having data in a SpatialPolygonsDataFrame format to use the disaggrgation package
library(sp)
coordinates(PN50_agg) <- c("XCoord", "YCoord")
proj4string(PN50_agg) <- CRS("+proj=longlat +datum=WGS84")
PN50_polyg <- lmic_shape
PN50_polyg@data <-
  full_join(PN50_polyg@data, PN50_agg@data, by = "ISO3")
# covariates raster
covariate_stack <-
getCovariateRasters(path, shape = raster(x = paste0(path, '/multi.tif')))
names(covariate_stack)
covariate_stack2 <- dropLayer(covariate_stack, nlayers(covariate_stack))
names(covariate_stack2)
plot(covariate_stack2)
covariate_stack2 <- raster::stack(covariate_stack2)
covariate_stack2 <- brick(covariate_stack2)
# population raster
extracted <- raster::extract(raster(x = paste0(path, '/multi.tif')), PN50_polyg)
n_cells <- sapply(extracted, length)
PN50_polyg@data$pop_per_cell <- PN50_polyg@data$sample_size / n_cells
population_raster <-
  rasterize(PN50_polyg, covariate_stack2, field = 'pop_per_cell')
# prepare data for disag_model()
dis_data <- prepare_data(
  polygon_shapefile = PN50_polyg,
  covariate_rasters = covariate_stack2,
  aggregation_raster = population_raster,
  mesh.args = list(
    max.edge = c(5, 40),
    cut = 0.0005,
    offset = 1
  ),
  id_var = "ISO3",
  response_var = "case",
  sample_size_var = "sample_size",
  na.action = TRUE,
  ncores = 8
)
# Rho and p(Rho<Rho_min)
dist <- pointDistance(PN50_agg@coords, lonlat = F, allpairs = T)
rownames(dist) <- PN50_agg$ISO3
colnames(dist) <- PN50_agg$ISO3
flattenDist <- function(dist) {
  up <- upper.tri(dist)
  flat <- data_frame(row = rownames(dist)[row(dist)[up]],
                     column = rownames(dist)[col(dist)[up]],
                     dist = dist[up])
  return(flat)
}
pair_dist <- flattenDist(dist)
d <- pair_dist$dist
k <- 0.036
CorMatern <- k * d * besselK(k * d, 1)
limits <- sp::bbox(PN50_polyg)
hypotenuse <-
  sqrt((limits[1, 2] - limits[1, 1])^2 + (limits[2, 2] - limits[2, 1])^2)
prior_rho <- hypotenuse / 3
p_rho <- sum(d[CorMatern <= 0.1] < prior_rho) / length(d[CorMatern <= 0.1])
# sigma and p(sigma>sigma_max)
library(boot)
sigma_boost <- function(data, i) {
  sd(data[i] / mean(data[i]))
}
sigma <-
  boot(data = dis_data$polygon_data$response,
       statistic = sigma_boost,
       R = 10000)
prior_sigma <- sigma$t0
p_sigma <- sum(sigma$t >= sigma$t0) / length(sigma$t)
default_priors <-
  list(
    priormean_intercept = 0,
    priorsd_intercept = 4,
    priormean_slope = 0,
    priorsd_slope = 2,
    prior_rho_min = prior_rho,
    prior_rho_prob = p_rho,
    prior_sigma_max = prior_sigma,
    prior_sigma_prob = p_sigma,
    prior_iideffect_sd_max = 0.1,
    prior_iideffect_sd_prob = 0.01
  )
fitted_model <- disag_model(
  data = dis_data,
  iterations = 1000,
  family = "binomial",
  link = "logit",
  # priors = default_priors,
  field = TRUE,
  iid = TRUE,
  silent = TRUE
)
I was able to run the disag_model function using your dis_data object. There were no errors or crashes. I ran the following lines:

fitted_model <- disag_model(
  data = dis_data,
  iterations = 1000,
  family = "binomial",
  link = "logit",
  field = TRUE,
  iid = TRUE,
  silent = TRUE
)

I am running on a Windows machine with 64 GB of RAM and 8 cores. It took over an hour, used all of my RAM for a while and up to 50% of my CPU, which is not surprising as you are fitting 5.5M pixels over the whole world. I therefore suspect your crash is related to your computer running out of resources. I suggest you try a smaller example to test it out first: fewer polygons and fewer pixels in each polygon.
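For example, a smaller test case could be built by subsetting the polygons and coarsening the rasters before calling prepare_data(). The sketch below reuses the objects and arguments from the question; the choices of 3 polygons, aggregation factor 10, and ncores = 2 are arbitrary, and mesh.args is omitted for brevity:

# assumption: keep just a few polygons and coarsen the covariate/population rasters
small_polyg <- PN50_polyg[1:3, ]                                 # first 3 polygons, arbitrary
small_covs  <- raster::aggregate(covariate_stack2, fact = 10)    # roughly 100x fewer pixels
small_pop   <- raster::aggregate(population_raster, fact = 10, fun = sum)

small_data <- prepare_data(
  polygon_shapefile = small_polyg,
  covariate_rasters = small_covs,
  aggregation_raster = small_pop,
  id_var = "ISO3",
  response_var = "case",
  sample_size_var = "sample_size",
  na.action = TRUE,
  ncores = 2
)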

Apply a function based on column name in data.tables R

I'm looking to apply a user-defined function based on the name given to a column.
library(data.table)
dt <- data.table(gr_id = 1, id = seq(1, 10),
                 min_c = runif(10, 10, 30),
                 ml_c = runif(10, 30, 50),
                 mx_c = runif(10, 50, 100),
                 min_t = runif(10, 10, 20),
                 ml_t = runif(10, 20, 25),
                 mx_t = runif(10, 25, 30))
I would like to apply a function that calculates (min(min) + min(ml)) / min(mx) for both the "c" columns and the "t" columns. Currently I've done it as follows; however, this becomes hard to maintain when I want to add more column groups (let's say, "a").
dt[, {
  temp1 = min(min_c)
  temp2 = min(ml_c)
  temp3 = min(mx_c)
  score_c = (temp1 + temp2) / temp3
  temp4 = min(min_t)
  temp5 = min(ml_t)
  temp6 = min(mx_t)
  score_t = (temp4 + temp5) / temp6
  list(score_c = score_c,
       score_t = score_t)
}, by = gr_id]
I think this will work. The basic idea is to use get().
# the original code could be simplified to:
dt[, .(
  score_c = (min(min_c) + min(ml_c)) / min(mx_c),
  score_t = (min(min_t) + min(ml_t)) / min(mx_t)
), by = gr_id]
#    gr_id   score_c score_t
# 1:     1 0.9051556 1.28054

# using `get`
cols <- c('c', 't')
dt[, {
  res <- lapply(cols, function(i){
    vars <- paste(c('min', 'ml', 'mx'), i, sep = '_')
    (min(get(vars[1])) + min(get(vars[2]))) / min(get(vars[3]))
  })
  names(res) <- paste('score', cols, sep = '_')
  res
}, by = gr_id]
#    gr_id   score_c score_t
# 1:     1 0.9051556 1.28054
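A nice property of this approach is that adding another column group only means extending cols, assuming the corresponding min_/ml_/mx_ columns exist (the "a" columns below are hypothetical):

# hypothetical: if dt also had min_a, ml_a and mx_a columns, only this line changes
cols <- c('c', 't', 'a')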
