Use apply() with a simple features (sf) function - R

I've written a function to calculate the maximum distance between a centroid and the edge of its polygon, but I can't figure out how to run it on each individual polygon of a simple features ("sf") data.frame.
library(sf)
distance.func <- function(polygon){
  max(st_distance(st_cast(polygon, "POINT"), st_centroid(polygon)))
}
If I test the function on a single polygon it works. (The warning messages are irrelevant to the current issue).
nc <- st_read(system.file("shape/nc.shp", package="sf")) # built in w/package
nc.1row <- nc[c(1),] # Just keep the first polygon
distance.func(nc.1row)
24309.07 m
Warning messages:
1: In st_cast.sf(polygon, "POINT") :
repeating attributes for all sub-geometries for which they may not be constant
2: In st_centroid.sfc(st_geometry(x), of_largest_polygon = of_largest_polygon) :
st_centroid does not give correct centroids for longitude/latitude data
The problem is applying this function to the entire data.frame.
nc$distance <- apply(nc, 1, distance.func)
Error in UseMethod("st_cast") :
no applicable method for 'st_cast' applied to an object of class "list"
What can I do to run this function (or one like it) for each individual polygon in an object of class "sf"?

The problem here is that using apply-like functions directly on an sf object is problematic, because the geometry column is a list-column, which does not interact well with apply constructs.
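As a quick check of that point (assuming the nc dataset shipped with sf), you can verify that the geometry column is an ordinary list underneath, which is why the lapply()/map() approaches further down work on it directly:
library(sf)
nc <- st_read(system.file("shape/nc.shp", package = "sf"), quiet = TRUE)
class(st_geometry(nc))    # "sfc_MULTIPOLYGON" "sfc"
is.list(st_geometry(nc))  # TRUE: an sfc is a plain list of sfg geometries underneath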
The simplest workaround could be to just use a for loop:
library(sf)
nc <- st_transform(st_read(system.file("shape/nc.shp", package = "sf")), 3857)
distance.func <- function(polygon){
  max(st_distance(st_cast(polygon, "POINT"), st_centroid(polygon)))
}
dist <- list()
for (i in seq_along(nc[[1]])) dist[[i]] <- distance.func(nc[i,])
head(unlist(dist))
# [1] 30185.34 27001.39 34708.57 52751.61 57273.54 34598.17
However, this is quite slow.
To be able to use apply-like functions, you need to pass to the function only the geometry column of the object. Something like this would work:
library(purrr)
distance.func_lapply <- function(polygon){
  polygon <- st_sfc(polygon)
  max(st_distance(st_cast(polygon, "POINT"), st_centroid(polygon)))
}
dist_lapply <- lapply(st_geometry(nc), distance.func_lapply)
dist_map <- purrr::map(st_geometry(nc), distance.func_lapply)
all.equal(dist, dist_lapply)
# [1] TRUE
all.equal(dist, dist_map)
# [1] TRUE
Note, however, that I had to slightly modify the distance function by adding an st_sfc() call; otherwise you get a lot of "In st_cast.MULTIPOLYGON(polygon, "POINT") : point from first coordinate only" warnings and the results are not correct (I did not investigate the reason; apparently st_cast behaves differently on sfg objects than on sfc ones).
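To see the difference the st_sfc() wrapper makes, here is a small check (a sketch, assuming the projected nc object from above):
g <- st_geometry(nc)[[1]]            # a single sfg (a MULTIPOLYGON)
length(st_cast(st_sfc(g), "POINT"))  # one POINT per vertex, as intended
# st_cast(g, "POINT") on the bare sfg instead returns a single point (the first
# coordinate only), which is what the warning above is complaining about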
In terms of speed, both the lapply and the map solutions outperform the for loop by almost an order of magnitude:
microbenchmark::microbenchmark(
forloop = {for (i in seq_along(nc[[1]])) dist[[i]] <- distance.func(nc[i,])},
map = {dist_map <- purrr::map(st_geometry(nc), distance.func_lapply)},
lapply = {dist_lapply <- lapply(st_geometry(nc), distance.func_lapply)}, times = 10)
Unit: milliseconds
expr min lq mean median uq max neval
forloop 904.8827 919.5636 936.2214 920.7451 929.7186 1076.9646 10
map 122.7597 124.9074 126.1796 126.3326 127.6940 128.7551 10
lapply 122.9131 125.3699 126.9642 126.8100 129.3791 131.2675 10
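If the goal is the nc$distance column from the question, attaching the results back is then just a matter of unlisting (a minimal sketch, assuming the dist_lapply list computed above; as.numeric() drops the units class):
nc$distance <- as.numeric(unlist(dist_lapply))
# or compute and attach in one step
nc$distance <- as.numeric(sapply(st_geometry(nc), distance.func_lapply))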

There is another way to apply over simple features, albeit not really better than using a for loop. You can first create a list of single-feature sf objects with lapply before applying your distance function.
distance.func <- function(polygon){
  max(st_distance(st_cast(polygon, "POINT"), st_centroid(polygon)))
}
distance.func.ls_sf <- function(sf){
  ls_sf <- lapply(1:nrow(sf), function(x, sf) {sf[x, ]}, sf)
  lapply(ls_sf, distance.func)
}
dist_lapply_ls_sf <- distance.func.ls_sf(nc)
all.equal(dist, dist_lapply_ls_sf)
# [1] TRUE
The performance is almost as poor as a for loop... and it even seems that 4 years later (now R 4.1.1 with sf 1.0-3), it's almost two orders of magnitude worse (instead of one) than lapply or map using st_geometry(nc)...
microbenchmark::microbenchmark(
forloop = {for (i in seq_along(nc[[1]])) dist[[i]] <- distance.func(nc[i,])},
map = {dist_map <- purrr::map(st_geometry(nc), distance.func_lapply)},
lapply = {dist_lapply <- lapply(st_geometry(nc), distance.func_lapply)},
ls_sf = {dist_lapply_ls_sf <- distance.func.ls_sf(nc)},
times = 10)
Unit: milliseconds
expr min lq mean median uq max neval
forloop 7726.9337 7744.7534 7837.6937 7781.2301 7850.7447 8221.2092 10
map 124.1067 126.2212 135.1502 128.4745 130.2372 182.1479 10
lapply 122.0224 125.6585 130.6488 127.9388 134.1495 147.9301 10
ls_sf 7722.1066 7733.8204 7785.8104 7775.5011 7814.3849 7911.3466 10
So it's a bad solution unless the function you are applying to the simple feature object takes much more time to compute than st_distance().
What if you need the attributes?
If your function needs both the geometries and the attributes of the sf object, mapply is a good way to go. Here is an example computing the Sudden Infant Death density (SID/km²) with three methods:
a for loop
extracting each feature before using lapply
mapply
microbenchmark::microbenchmark(
  forLoop = {
    sid.density.for <- vector("list", nrow(nc))
    for (i in seq(nrow(nc))) sid.density.for[[i]] <- nc[i, ][["SID74"]] / st_area(nc[i, ]) / 1000^2
  },
  list_nc = {
    list_nc <- lapply(seq(nrow(nc)), function(x, nc) { nc[x, ] }, nc)
    sid.density.lapply <- lapply(list_nc, function(x) { x[["SID74"]] / as.numeric(st_area(x)) / 1000^2 })
  },
  mapply = {
    sid.density.func <- function(geometry, attribute) { attribute / st_area(geometry) / 1000^2 }
    sid.density.mapply <- mapply(sid.density.func, st_geometry(nc), nc[["SID74"]], SIMPLIFY = FALSE)
  },
  times = 10)
Unit: milliseconds
expr min lq mean median uq max neval
forLoop 4511.7203 4515.5997 4557.73503 4542.75200 4560.5508 4707.2877 10
list_nc 4356.3801 4400.5640 4455.35743 4440.38775 4475.2213 4717.5218 10
mapply 17.4783 17.6885 18.20704 17.99295 18.3078 20.1121 10
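As with the distance example, the mapply result can be attached back to the sf object if needed (a minimal sketch assuming sid.density.mapply from the benchmark above; sid_density is just an illustrative column name, and as.numeric() drops the units class):
nc$sid_density <- as.numeric(unlist(sid.density.mapply))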

Related

Looping a function through a list of dataframes is very slow

I have a list which contains 4438 dataframes of different sizes. I am not sure how to make a reproducible example, but the way I obtained the list is by using the expand.grid function to get a dataframe with all the possible combinations of elements:
citation <- citation %>%
map_depth(., 1, expand.grid)
(Screenshots in the original post show the list before and after applying expand.grid.)
What I want to achieve is, for each dataframe, to count the number of unique values per row and then find the minimum of those counts.
First, I write the function below
fun1 <- function(res){
  min(apply(res, 1, function(x) length(unique(x))))
}
Then, apply the function to each dataframe:
library(furrr)
plan(multisession, workers = 4)
min_set <- c()
min_set <- citation %>% future_map_dbl(fun1)
However, the calculation is very slow, taking almost 30 minutes to complete. I would like to find another way to improve the performance. Looking forward to hearing your solutions. Thank you in advance.
To speed up the current approach of enumerating the combinations, use rowTabulate from the Rfast package (or rowTabulates from the matrixStats package).
However, it will be much faster to get the desired results with the setcover function in the adagio package, which solves the set cover problem directly (i.e., without the use of expand.grid) via integer linear programming with lp from the lpSolve package.
library(Rfast) # for the rowTabulate function
library(adagio) # for the setcover function
# reproducible example data
set.seed(1141593349)
citation1 <- list(
lapply(c(5,2,8,12,6,38), function(size) sample(50, size)),
lapply(c(5,2,8,12,7), function(size) sample(50, size))
)
# get all combinations of the indices of the unique values for each list in citation1
citation2 <- lapply(citation1, function(x) expand.grid(lapply(x, match, table = unique(unlist(x)))))
# original solution
fun1 <- function(res) min(apply(res, 1, function(x) length(unique(x))))
# faster version of the original solution
fun2 <- function(res) min(rowsums(rowTabulate(as.matrix(res)) > 0L))
# linear programming solution (uses citation1 rather than citation2)
fun3 <- function(res) {
  v <- unlist(res)
  m <- matrix(0L, max(v), length(res))
  m[cbind(v, rep.int(seq_along(res), lengths(res)))] <- 1L
  setcover(m)$objective
}
microbenchmark::microbenchmark(fun1 = sapply(citation2, fun1),
fun2 = as.integer(sapply(citation2, fun2)),
fun3 = as.integer(sapply(citation1, fun3)),
times = 10,
check = "identical")
#> Unit: milliseconds
#> expr min lq mean median uq max
#> fun1 1110.4976 1162.003601 1217.049501 1204.608151 1281.121601 1331.057001
#> fun2 101.5173 113.123501 142.265371 145.964502 165.788700 187.196301
#> fun3 1.4038 1.461101 1.734781 1.850701 1.870801 1.888702
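In practice the big win of fun3 is that the memory-hungry expand.grid step is not needed at all, so for the real data you can run it straight on the raw lists (a sketch; citation1 here stands in for your original pre-expand.grid citation list):
min_set <- sapply(citation1, fun3)
# or in parallel with the furrr workflow from the question:
# min_set <- furrr::future_map_dbl(citation1, fun3)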

Function that computes the average of 20 rolls several thousand times and estimates the expectation and the variance of Y

I need to write a function according to the info in the title. I'm trying to do that with the following code:
my.function <- function(x = 1:6, Nsample = 20, prob1 = NULL) {
  rolling.die <- sample(x, size = Nsample, replace = TRUE, prob = prob1)
  for (die in 1:10000) {
    die.sum <- sum(rolling.die)
    average <- die.sum/Nsample
  }
  return(var(average))
}
my.function()
But I always get NA as a result. Could you please help me understand what I am doing wrong?
You need replicate() -
set.seed(2)
test <- replicate(1000, mean(sample(1:6, 20, replace = T)))
# for expectation
mean(test)
[1] 3.50025
# for variance
var(test)
[1] 0.147535
average is a single number, and it does not make sense to calculate the variance of a single number. What is the variance of 5? Variance applies to a collection of numbers, so your average must be a vector.
A more efficient approach is to generate all your data ahead of time. As long as you have the memory, this would be a very fast approach:
# sim parameters
n_rolls <- 20L #L means integer variables
n_sim <- 10000L
n_sides <- 6L
#generate data
set.seed(2)
sims <- sample(n_sides, n_rolls * n_sim, replace = T)
#make into matrix of n_sims x n_rolls
mat <- matrix(sims, ncol = n_rolls)
#mean of each simulation
rowMeans(mat)
#var of everything
var(rowMeans(mat))
This is around 14x faster than using replicate as this approach calls sample() once.
Unit: milliseconds
expr min lq mean median uq max neval
shree_replic 137.7283 138.9809 145.78485 142.34755 147.2499 172.4633 10
cole_samp_mat 11.3998 11.4477 11.57025 11.52105 11.7628 11.8218 10
As far as your current function goes, it doesn't make sense: the loop doesn't do anything, it just repeats the same calculation 10,000 times, and, as @user31264 points out, it then tries to calculate the var of a scalar after the loop. I think you meant to do something like:
my.function2 <- function(x = 1:6, Nsample = 20, prob1 = NULL) {
  rolling.die <- sample(x, size = Nsample, replace = TRUE, prob = prob1)
  return(mean(rolling.die))
}
means <- vector(mode = 'double', length = n_sim)
for (i in 1:n_sim){
  means[i] <- my.function2()
}
#which is equivalent to
means <- sapply(1:n_sim, my.function2)
#which is also equivalent to
means <- replicate(n_sim, my.function2())
var(means)
And @shree has a much more succinct version of your function.

Improve speed of color conversion from sRGB to Lab

I need to convert high-resolution ortho-mosaic photos from sRGB to Lab color space. I've tried using the base R function convertColor() but have never managed to complete even a single conversion (images cover more than 10 hectares at 5 cm pixel resolution, roughly 500,000,000 pixels).
I tried the patchPlot package, which computes faster, but considering the size of my images I'm looking for a better way to do it.
Is there a package / function / method to speed this computation up?
Example test with convertColor and patchPlot::RGB2Lab:
library(raster)
library(patchPlot)
library(microbenchmark)
r <- stack(system.file("external/rlogo.grd", package="raster"))
microbenchmark(baseR = convertColor(color = values(r), from = 'sRGB', to = 'Lab'),
patchPlot = RGB2Lab(values(r)))
## Unit: milliseconds
## expr min lq mean median uq max neval cld
## baseR 261.702873 282.60345 316.76008 310.31006 327.05536 550.07653 100 b
## patchPlot 8.335807 9.58279 11.53369 10.11684 11.69073 46.78427 100 a
This is far from a perfect solution, but we can try to improve on it. The thing you can play around with is nMatrix (into how many matrices you want to split the original RGB matrix).
library(microbenchmark)
library(parallel)
library(patchPlot)
library(raster)
# How many matrices we want to have
nMatrix <- 4
# Load raster
r <- stack(system.file("external/rlogo.grd", package = "raster"))
# Extract value matrix
rValues <- values(r)
n <- nrow(rValues)
# Groups to split rValues into nMatrix parts
foo <- rep(1:nMatrix, each = ceiling(n / nMatrix))
# If group vector exceeds number of rows in matrix then trim it
if (length(foo) > n) {
foo <- foo[1:n]
}
# Splitted matrices
rValuesSplit <- lapply(split(rValues, foo), matrix, ncol = 3)
microbenchmark(do.call(rbind,mclapply(rValuesSplit, RGB2Lab, mc.cores = 1)))
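To actually use the split in parallel and reassemble the result, something like the following (a sketch: mc.cores is fixed at 1 in the benchmark above, mclapply only forks on unix-alikes, so on Windows you would need parallel::parLapply with a cluster, and rbind recovers the original pixel order because the split was made by contiguous blocks of rows):
labValues <- do.call(rbind, mclapply(rValuesSplit, RGB2Lab, mc.cores = nMatrix))
dim(labValues)  # one row per pixel, Lab channels in the columns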

Optimising sapply() or for(), paste(), to efficiently transform sparse triplet matrix to a libsvm format

I have a piece of R code I want to optimise for speed when working with larger datasets. It currently depends on sapply cycling through a vector of numbers (which correspond to rows of a sparse matrix). The reproducible example below gets at the nub of the problem; it is the three-line function expensive() that chews up the time, and it's obvious why (lots of matching of big vectors to each other, and two nested paste statements for each cycle of the loop). Before I give up and start struggling with doing this bit of the work in C++, is there something I'm missing? Is there a way to vectorize the sapply call that will make it an order of magnitude or three faster?
library(microbenchmark)
# create an example object like a simple_triplet_matrix
# number of rows and columns in sparse matrix:
n <- 2000 # real number is about 300,000
ncols <- 1000 # real number is about 80,000
# number of non-zero values, about 10 per row:
nonzerovalues <- n * 10
stm <- data.frame(
  i = sample(1:n, nonzerovalues, replace = TRUE),
  j = sample(1:ncols, nonzerovalues, replace = TRUE),
  v = sample(rpois(nonzerovalues, 5), replace = TRUE)
)
# It seems to save about 3% of time to have i, j and v as objects in their own right
i <- stm$i
j <- stm$j
v <- stm$v
expensive <- function(){
  sapply(1:n, function(k){
    # microbenchmarking suggests it's quicker to use which() rather than a logical vector of TRUE and FALSE:
    whichi <- which(i == k)
    paste(paste(j[whichi], v[whichi], sep = ":"), collapse = " ")
  })
}
microbenchmark(expensive())
The output of expensive is a character vector, of n elements, that looks like this:
[1] "344:5 309:3 880:7 539:6 338:1 898:5 40:1"
[2] "307:3 945:2 949:1 130:4 779:5 173:4 974:7 566:8 337:5 630:6 567:5 750:5 426:5 672:3 248:6 300:7"
[3] "407:5 649:8 507:5 629:5 37:3 601:5 992:3 377:8"
For what it's worth, the motivation is to efficiently write data from a sparse matrix format - either from slam or Matrix, but starting with slam - into libsvm format (which is the format above, but with each row beginning with a number representing a target variable for a support vector machine - omitted in this example as it's not part of the speed problem). I'm trying to improve on the answers to this question. I forked one of the repositories referred to from there and adapted its approach to work with sparse matrices with these functions. The tests show that it works fine, but it doesn't scale up.
Use package data.table. Its by, combined with fast keyed sorting, saves you from having to find the indices of equal i values.
res1 <- expensive()
library(data.table)
cheaper <- function() {
  setDT(stm)
  res <- stm[, .(i, jv = paste(j, v, sep = ":"))
             ][, .(res = paste(jv, collapse = " ")), keyby = i][["res"]]
  setDF(stm) # clean-up, which might not be necessary
  res
}
res2 <- cheaper()
all.equal(res1, res2)
#[1] TRUE
microbenchmark(expensive(),
cheaper())
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# expensive() 127.63343 135.33921 152.98288 136.13957 138.87969 222.36417 100 b
# cheaper() 15.31835 15.66584 16.16267 15.98363 16.33637 18.35359 100 a
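To get from there to an actual libsvm file with the target variable prepended, one more line is enough (a sketch: y is a hypothetical target vector with one label per row of the sparse matrix, and it assumes every row of stm has at least one non-zero entry so that cheaper() returns exactly n strings in row order):
y <- sample(0:1, n, replace = TRUE)          # hypothetical target labels
writeLines(paste(y, cheaper()), con = "out.libsvm")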

Efficiently building a large (200 MM line) dataframe

I am attempting to build a large (~200 MM line) dataframe in R. Each entry in the dataframe will consist of approximately 10 digits (e.g. 1234.12345). The code is designed to walk through a list, subtracting the item in position [i] from every item after [i], but not from the items before [i] (if I were putting the output into a matrix it would be a triangular matrix). The code is simple and works fine on smaller lists, but I am wondering if there is a faster or more efficient way to do this? I assume the first part of the answer will be "don't use a nested for loop," but I am not sure what the alternatives are.
The idea is that this will be an "edge list" for a social network analysis graph. Once I have 'outlist' I will reduce the number of edges based on some criteria (<, >, ==) so the final list (and graph) won't be quite so ponderous.
#Fake data of same approximate dimensions as real data
dlist <- sample(1:20, 20, replace = FALSE)
#purge the output list before running the loop
rm(outlist)
outlist <- data.frame()
for(i in 1:(length(dlist)-1)){
  for(j in (i+1):length(dlist)){
    outlist <- rbind(outlist, c(dlist[i], dlist[j], dlist[j]-dlist[i]))
  }
}
IIUC your final dataset will be ~200 million rows by 3 columns, all of type numeric, which takes a total space of:
200e6 (rows) * 3 (cols) * 8 (bytes) / (1024 ^ 3)
# ~ 4.5GB
That's quite a lot of data, where it's essential to avoid copies wherever possible.
Here's a method that uses the data.table package's unexported (internal) vecseq function (written in C, and fast and memory efficient) and makes use of its assignment-by-reference operator :=, to avoid copies.
fn1 <- function(x) {
  require(data.table) ## 1.9.2
  lx = length(x)
  vx = as.integer(lx * (lx-1)/2)
  # R v3.1.0 doesn't copy on doing list(.) - so it should be even faster there
  ans = setDT(list(v1 = rep.int(head(x, -1L), (lx-1L):1L),
                   v2 = x[data.table:::vecseq(2:lx, (lx-1L):1, vx)]))
  ans[, v3 := v2-v1]
}
Benchmarking:
I'll benchmark against the functions from the other answers, on your data dimensions. Note that my benchmark is on R v3.0.2, but fn1() should give better performance (both speed and memory) on R v3.1.0, because list(.) no longer results in a copy.
fn2 <- function(x) {
  diffmat <- outer(x, x, "-")
  ss <- which(upper.tri(diffmat), arr.ind = TRUE)
  data.frame(v1 = x[ss[,1]], v2 = x[ss[,2]], v3 = diffmat[ss])
}
fn3 <- function(x) {
  idx <- combn(seq_along(x), 2)
  out2 <- data.frame(v1 = x[idx[1, ]], v2 = x[idx[2, ]])
  out2$v3 <- out2$v2 - out2$v1
  out2
}
set.seed(45L)
x = runif(20e3L)
system.time(ans1 <- fn1(x)) ## 18 seconds + ~8GB (peak) memory usage
system.time(ans2 <- fn2(x)) ## 158 seconds + ~19GB (peak) memory usage
system.time(ans3 <- fn3(x)) ## 809 seconds + ~12GB (peak) memory usage
Note that fn2(), due to its use of outer, requires quite a lot of memory (peak usage was >= 19GB) and is slower than fn1(). fn3() is just very, very slow (due to combn and unnecessary copies).
Another way to create that data is
#Sample Data
N <- 20
set.seed(15) #for reproducibility
dlist <- sample(1:N,N, replace=FALSE)
we could do
idx <- combn(1:N,2)
out2 <- data.frame(i=dlist[idx[1, ]], j=dlist[idx[2, ]])
out2$dist <- out2$j-out2$i
This uses combn to create all pairs of indices in the data set rather than using loops. This allows us to build the data.frame all at once rather than adding one row at a time.
We compare that to
out1 <- data.frame()
for(i in 1:(length(dlist)-1)){
  for(j in (i+1):length(dlist)){
    out1 <- rbind(out1, c(dlist[i], dlist[j], dlist[j]-dlist[i]))
  }
}
we see that
all(out1==out2)
# [1] TRUE
Plus, if we compare with microbenchmark (wrapping the loop approach and the combn approach above as loops() and combdata()), we see that
microbenchmark(loops(), combdata())
# Unit: microseconds
# expr min lq median uq max neval
# loops() 30888.403 32230.107 33764.7170 34821.2850 82891.166 100
# combdata() 684.316 800.384 873.5015 940.9215 4285.627 100
The method that doesn't use loops is much faster.
You can always start with a triangular matrix and then make your dataframe directly from that:
vec <- 1:10
diffmat <- outer(vec,vec,"-")
ss <- which(upper.tri(diffmat),arr.ind = TRUE)
data.frame(one = vec[ss[,1]],
two = vec[ss[,2]],
diff = diffmat[ss])
You need to preallocate your output list; this will significantly increase the speed of your code. By preallocating I mean creating an output structure that already has the desired size but is filled with, for example, NAs.
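A minimal sketch of that idea, keeping the nested loop from the question but filling a preallocated matrix instead of growing a data.frame with rbind (the column names are illustrative):
n <- length(dlist)
out <- matrix(NA_real_, nrow = n * (n - 1) / 2, ncol = 3)  # preallocated output
k <- 0L
for (i in 1:(n - 1)) {
  for (j in (i + 1):n) {
    k <- k + 1L
    out[k, ] <- c(dlist[i], dlist[j], dlist[j] - dlist[i])
  }
}
outlist <- setNames(as.data.frame(out), c("from", "to", "diff"))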
