I have some R code that puts together demographic data from the Census for all states in the US into a list object. The block-level code can take a week to run as a sequential loop, since there are ~11M blocks, so I am trying to parallelize the loop over states to make it faster. I have accomplished this goal with this:
states <- c("AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA","HI",
"ID","IL","IN","IA","KS","KY","LA","ME","MD","MA","MI",
"MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY","NC",
"ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT",
"VT","VA","WA","WV","WI","WY","DC","PR")
library(future.apply)
plan(multiprocess)
ptm <- proc.time()
CensusObj_block_age_sex <- list()
CensusObj_block_age_sex[states] <- future_lapply(states, function(s) {
  county <- census_geo_api(key = "XXX", state = s, geo = "county", age = TRUE, sex = TRUE)
  tract <- census_geo_api(key = "XXX", state = s, geo = "tract", age = TRUE, sex = TRUE)
  block <- census_geo_api(key = "XXX", state = s, geo = "block", age = TRUE, sex = TRUE)
  # return the per-state result rather than assigning into a global object
  list(state = s, age = TRUE, sex = TRUE, block = block, tract = tract, county = county)
})
However, I need to make it more robust. Sometimes there are problems with the Census API, so I would like CensusObj to be updated at each state iteration so that I don't lose my completed data if something goes wrong. That way I can restart the loop over the remaining states if something does go wrong (like if I spell "WY" as "WU").
Would it be possible to accomplish this somehow? I am open to other methods of parallelization.
The code above runs, but it seems to run into memory issues:
Error: Failed to retrieve the value of MultisessionFuture (future_lapply-3) from cluster RichSOCKnode #3 (PID 80363 on localhost ‘localhost’). The reason reported was ‘vector memory exhausted (limit reached?)’. Post-mortem diagnostic: A process with this PID exists, which suggests that the localhost worker is still alive.
I have R_MAX_VSIZE = 8Gb in my .Renviron, but I am not sure how that would get divided between the 8 cores on my machine. This all suggests that I need to store the results of each iteration rather than try to keep it all in memory, and then append the objects together at the end.
Here is a solution that uses doParallel (with the options for UNIX systems, but you can also use it on Windows, see here) and foreach. It stores the results for every state separately, then afterwards reads in the single files and combines them into a list.
library(doParallel)
library(foreach)
path_results <- "my_path"
ncpus = 8L
registerDoParallel(cores = ncpus)
states <- c("AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA","HI",
"ID","IL","IN","IA","KS","KY","LA","ME","MD","MA","MI",
"MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY","NC",
"ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT",
"VT","VA","WA","WV","WI","WY","DC","PR")
results <- foreach(state = states) %dopar% {
  county <- census_geo_api(key = "XXX", state = state, geo = "county", age = TRUE, sex = TRUE)
  tract <- census_geo_api(key = "XXX", state = state, geo = "tract", age = TRUE, sex = TRUE)
  block <- census_geo_api(key = "XXX", state = state, geo = "block", age = TRUE, sex = TRUE)
  results <- list(state = state, age = TRUE, sex = TRUE, block = block, tract = tract, county = county)
  # store the results as rds
  saveRDS(results,
          file = paste0(path_results, "/", state, ".Rds"))
  # remove the results and free memory on the worker
  rm(county, tract, block, results)
  gc()
  # just return a string
  paste0("done with ", state)
}
library(purrr)
# combine the results into a list; build the file names from `states` so that
# names and files stay aligned (list.files() would return them alphabetically,
# while `states` is not in alphabetical order)
CensusObj_block_age_sex <- set_names(states) %>%
  map(~ readRDS(file = paste0(path_results, "/", .x, ".Rds")))
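The per-state files also make restarts cheap after a failure: only rerun the states whose file does not exist yet. A minimal sketch, assuming the same path_results and <state>.Rds naming as above:
# states already completed in a previous run
done <- sub("\\.Rds$", "", list.files(path_results, pattern = "\\.Rds$"))
remaining <- setdiff(states, done)
# then rerun the foreach loop with `state = remaining` instead of `state = states`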
You could use a tryCatch inside future_lapply to relaunch the calculation in case of an API error, up to a maximum of maxtrials attempts.
In the resulting list, you get for each state the number of trials and the final status, OK or Error:
states <- c("AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA","HI",
"ID","IL","IN","IA","KS","KY","LA","ME","MD","MA","MI",
"MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY","NC",
"ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT",
"VT","VA","WA","WV","WI","WY","DC","PR")
library(future.apply)
#> Loading required package: future
plan(multiprocess)
ptm <- proc.time()
maxtrials <- 3
# mock version of census_geo_api for this reprex; it just returns a label
# instead of calling the API
census_geo_api <-
  function(key = "XXX",
           state = "AL",
           geo = "county",
           age = TRUE,
           sex = TRUE) {
    paste(state, '-', geo)
  }
CensusObj_block_age_sex <- future_lapply(states, function(s) {
  ntrials <- 1
  while (ntrials <= maxtrials) {
    hasError <- tryCatch({
      # simulate a random API error
      if (runif(1) > 0.3) stop("API failed")
      county <- census_geo_api(key = "XXX", state = s, geo = "county", age = TRUE, sex = TRUE)
      tract <- census_geo_api(key = "XXX", state = s, geo = "tract", age = TRUE, sex = TRUE)
      block <- census_geo_api(key = "XXX", state = s, geo = "block", age = TRUE, sex = TRUE)
    },
    error = function(e) e)
    if (inherits(hasError, "error")) {
      ntrials <- ntrials + 1
    } else {
      break
    }
  }
  if (ntrials > maxtrials) {
    res <- list(state = s, status = 'Error', ntrials = ntrials - 1, age = NA, sex = NA, block = NA, tract = NA, county = NA)
  } else {
    res <- list(state = s, status = 'OK', ntrials = ntrials, age = TRUE, sex = TRUE, block = block, tract = tract, county = county)
  }
  res
})
CensusObj_block_age_sex[[1]]
#> $state
#> [1] "AL"
#>
#> $status
#> [1] "OK"
#>
#> $ntrials
#> [1] 3
#>
#> $age
#> [1] TRUE
#>
#> $sex
#> [1] TRUE
#>
#> $block
#> [1] "AL - block"
#>
#> $tract
#> [1] "AL - tract"
#>
#> $county
#> [1] "AL - county"
Created on 2020-08-19 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0)
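If the API failures are transient, it can also help to wait before retrying. A hypothetical extension of the retry loop above (the backoff schedule is my assumption, not part of the answer):
if (inherits(hasError, "error")) {
  # assumed exponential backoff: wait 2, 4, 8, ... seconds between attempts
  Sys.sleep(2 ^ ntrials)
  ntrials <- ntrials + 1
} else {
  break
}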
One possible solution I have is to log the value of censusObj to a text file, i.e., print censusObj in each iteration. The doSNOW package can be used for this kind of logging, for example:
library(doSNOW)
cl <- makeCluster(1, outfile = "abc.out")
registerDoSNOW(cl)
states <- c("AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA","HI",
"ID","IL","IN","IA","KS","KY","LA","ME","MD","MA","MI",
"MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY","NC",
"ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT",
"VT","VA","WA","WV","WI","WY","DC","PR")
foreach(i = seq_along(states), .combine = rbind, .inorder = TRUE) %dopar% {
  county <- "A"
  tract <- "B"
  block <- "C"
  censusObj <- data.frame(state = states[i], age = TRUE, sex = TRUE, block = block, tract = tract, county = county)
  # edit: print json objects to easily extract from the file
  cat(sprintf("%s\n", rjson::toJSON(censusObj)))
}
stopCluster(cl)
This logs the value of censusObj to abc.out. If the program crashes, the error is logged as well, and you still have the latest values of censusObj that were written to abc.out.
Here is the output of the last iteration from the log file:
Type: EXEC {"state":"PR","age":true,"sex":true,"block":"C","tract":"B","county":"A"} Type: DONE
Type: EXEC means that the iteration has started and Type: DONE means that its execution is completed. The output of cat appears between these two statements for each iteration. The value of censusObj can then be extracted from the log file as shown below:
Lines <- readLines("abc.out")
results <- list()
for (i in Lines) {
  # skip processing logs created by doSNOW
  if (!startsWith(i, "starting") && !startsWith(i, "Type:")) {
    results <- rlist::list.append(results, jsonlite::fromJSON(i))
  }
}
results will contain all the values printed in abc.out.
> head(results, 1)
[[1]]
[[1]]$state
[1] "AL"
[[1]]$age
[1] TRUE
[[1]]$sex
[1] TRUE
[[1]]$block
[1] "C"
[[1]]$tract
[1] "B"
[[1]]$county
[1] "A"
It is not a very clean solution, but it works.
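If you want to look results up by state code, as in the original CensusObj_block_age_sex, you can name the list afterwards. A small sketch, assuming every logged line parsed successfully:
# name each element by its state code for direct lookup
names(results) <- vapply(results, function(x) x$state, character(1))
results[["AL"]]$county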
Consider this simple example
library(tibble)
bogustib <- tibble(doc_id = c(1, 2, 3),
                   text = c('bug', 'one love', '838383838'))
# A tibble: 3 x 2
doc_id text
<dbl> <chr>
1 1 bug
2 2 one love
3 3 838383838
This tibble is called bogustib because I know spacyr will fail on row 3.
> spacy_parse('838383838', lemma = FALSE, entity = TRUE, nounphrase = TRUE)
Error in `$<-.data.frame`(`*tmp*`, "doc_id", value = "text1") :
replacement has 1 row, data has 0
so, naturally, feeding the tibble to spacyr will fail as well
spacy_parse(bogustib, lemma = FALSE, entity = TRUE, nounphrase = TRUE)
Error in `$<-.data.frame`(`*tmp*`, "doc_id", value = "3") :
replacement has 1 row, data has 0
My question is: I think I can avoid this issue by calling spacy_parse row by row.
However, that looks inefficient, and I would like to use the multithread argument of spacyr to speed up the computation over my large tibble.
Is there any solution here?
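For reference, this is roughly what I mean by row by row, wrapping each call in tryCatch so one bad document does not stop the loop (a sketch only, and it loses the multithread speedup):
library(purrr)
parsed <- map(seq_len(nrow(bogustib)), function(i) {
  tryCatch(
    spacy_parse(bogustib[i, ], lemma = FALSE, entity = TRUE, nounphrase = TRUE),
    error = function(e) NULL  # drop the rows that make spacyr fail
  )
})
parsed_df <- dplyr::bind_rows(parsed)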
Thanks!
Actually, it does not happen in my environment. The output I get is:
library(tidyverse)
library(spacyr)
bogustib <- tibble(doc_id = c(1,2,3),
text = c('bug', 'one love', '838383838'))
spacy_parse(bogustib)
spacy_parse('838383838', lemma = FALSE, entity = TRUE, nounphrase = TRUE)
## No noun phrase found in documents.
## doc_id sentence_id token_id token pos entity
## 1 text1 1 1 838383838 NUM CARDINAL_B
To get this result, I used the latest master on GitHub. However, I was able to reproduce your error when I ran the CRAN version of spacyr. I'm sure that I fixed the bug a while ago, but that does not seem to be reflected in the CRAN version yet. We will try to update CRAN asap.
In the meantime, you can:
devtools::install_github('quanteda/spacyr')
Or download the repo as a zip and run:
devtools::install('******')
where ****** is the path to the unzipped repository.
How can one add a new node to a SpatialLinesNetwork?
Context of my problem: I have a shapefile of a bus route and another shapefile of bus stops. I want to calculate the distance between stops along the bus route. Ideally, each stop would be a node and I would use stplanr::sum_network_routes() to calculate the distance between them. The problem is that when I convert the bus route into a SpatialLinesNetwork, the network only has a few nodes, which are far from each other and unrelated to bus stop locations.
Reproducible dataset:
# load library and data
library(stplanr)
library(sf)
# get road data
data(routes_fast)
rnet <- overline(routes_fast, attrib = "length")
# convert to sf obj
rnet <- st_as_sf(rnet)
# convert SpatialLinesDataFrame into SpatialLinesNetwork
sln <- SpatialLinesNetwork(rnet)
# identify nodes
sln_nodes = sln2points(sln)
# Here is a bus stop which should be added as a node
new_point_coordinates = c(-1.535, 53.809)
p = sf::st_sf(geometry = sf::st_sfc(sf::st_point(new_point_coordinates)), crs = st_crs(rnet))
# plot
plot(sln, col = "gray") # network
plot(sln_nodes, col="red", add = TRUE) # nodes
plot(p, add=T, col="blue") # stop to be added as a new node
This doesn't answer your question as stated at the outset, but I believe it does resolve your "context of my problem" by showing how your desired network distances can be calculated. This can be done with dodgr (latest dev version) like this:
library (dodgr)
library (stplanr)
library (sf)
library (sp)
dat <- st_as_sf (routes_fast)
net <- weight_streetnet (dat, wt_profile = 1)
The net object is a simple data.frame containing all edges and vertices of the network. Then adapt your code above to get the routing points as a simple matrix:
rnet <- overline(routes_fast, attrib = "length")
SLN <- SpatialLinesNetwork(rnet)
sln_nodes <- sln2points(SLN)
xy <- coordinates(sln_nodes)
colnames(xy) <- c("x", "y")
Note that sln2points simply returns "nodes" (in stplanr terminology), which are junction points. You can instead replace these with the coordinates of bus stops, or simply add those to this matrix, as shown next.
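For example, hypothetical bus stop coordinates (the values here are made up for illustration) can be appended before the matching step:
# hypothetical bus stop coordinates, in the same (x, y) = (lon, lat) order
bus_stops <- matrix(c(-1.535, 53.809,
                      -1.540, 53.810),
                    ncol = 2, byrow = TRUE,
                    dimnames = list(NULL, c("x", "y")))
xy <- rbind(xy, bus_stops)
The following three lines then convert those coordinates to unique (nearest) vertex IDs of the dodgr net object: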
v <- dodgr_vertices (net)
pts <- match_pts_to_graph (v, xy)
pts <- v$id [pts]
To calculate distances between those pts on the network, just
d <- dodgr_dists (net, from = pts, to = pts)
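The result is a square matrix of network distances between all pairs of routing points, so for example (assuming at least two points):
d[1, 2]  # network distance from the first to the second routing point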
Thanks for the question. Thanks to it, and to subsequent collaboration with Andrea Gilardi, I'm happy to announce that it is now possible to add new nodes to sfNetwork objects with a new function, sln_add_node().
See below, and please try the reproducible code that demonstrates how it works:
devtools::install_github("ropensci/stplanr")
#> Skipping install of 'stplanr' from a github remote, the SHA1 (33158a5b) has not changed since last install.
#> Use `force = TRUE` to force installation
library(stplanr)
#> Registered S3 method overwritten by 'R.oo':
#> method from
#> throw.default R.methodsS3
#> Warning in fun(libname, pkgname): rgeos: versions of GEOS runtime 3.7.1-CAPI-1.11.1
#> and GEOS at installation 3.7.0-CAPI-1.11.0differ
sample_routes <- routes_fast_sf[2:6, NULL]
sample_routes$value <- rep(1:3, length.out = 5)
rnet <- overline2(sample_routes, attrib = "value")
#> 2019-09-26 16:06:18 constructing segments
#> 2019-09-26 16:06:18 building geometry
#> 2019-09-26 16:06:18 simplifying geometry
#> 2019-09-26 16:06:18 aggregating flows
#> 2019-09-26 16:06:18 rejoining segments into linestrings
plot(sample_routes["value"], lwd = sample_routes$value, main = "Routes")
plot(rnet["value"], lwd = rnet$value, main = "Route network")
sln <- SpatialLinesNetwork(rnet)
#> Linking to GEOS 3.7.1, GDAL 2.4.0, PROJ 5.2.0
new_point_coordinates <- c(-1.540, 53.826)
crs <- sf::st_crs(rnet)
p <- sf::st_sf(geometry = sf::st_sfc(sf::st_point(new_point_coordinates)), crs = crs)
p_dest <- sln2points(sln)[9, ]
# We can identify the nearest point on the network at this point
# and use that to split the associated linestring:
sln_new <- sln_add_node(sln = sln, p = p)
#> although coordinates are longitude/latitude, st_nearest_feature assumes that they are planar
route_new <- route_local(sln = sln_new, from = p, to = p_dest)
plot(sln_new)
plot(p, add = TRUE)
plot(route_new, lwd = 5, add = TRUE)
#> Warning in plot.sf(route_new, lwd = 5, add = TRUE): ignoring all but the
#> first attribute
Created on 2019-09-26 by the reprex package (v0.3.0)
In case it's of use/interest, see the source code of the new small family of functions that support this new functionality here: https://github.com/ropensci/stplanr/blob/master/R/node-funs.R