Printing a single tibble row over multiple lines of text

It is sometimes desirable to print a string in a tibble over multiple lines. Example: https://github.com/ropensci/drake/issues/489. drake plans with long commands are hard to read.
library(drake)
pkgconfig::set_config("drake::strings_in_dots" = "literals")
drake_plan(
  u_auckland = make_place(
    Name = "University of Auckland",
    Latitude = -36.8521369,
    Longitude = 174.7688785
  ),
  shapefile = {
    file_out("u-auckland.prj", "u-auckland.shx", "u-auckland.dbf")
    st_write(
      obj = u_auckland,
      dsn = file_out("u-auckland.shp"),
      driver = "ESRI Shapefile",
      delete_dsn = TRUE
    )
  }
)
#> # A tibble: 2 x 2
#>   target     command
#> * <chr>      <chr>
#> 1 u_auckland "make_place(Name = \"University of Auckland\", Latitude = -3…
#> 2 shapefile  "{\n    file_out(\"u-auckland.prj\", \"u-auckland.shx\", \"u…
Can pillar::pillar_shaft() or a similar tool be used to achieve something nicer? I am mainly concerned with line breaks and indentation (possibly with styler), but I am also interested in syntax highlighting, possibly with highlight and crayon. Something like this:
# A tibble: 2 x 2
  target     command
* <chr>      <drake_cmd>
1 u_auckland make_place(
                 Name = "University of Auckland",
                 Latitude = -36.8521369,
                 Longitude = 174.7688785
               )
2 shapefile  {
                 file_out(
                   "u-auckland.prj",
                   "u-auckland.shx",
                   "u-auckland.dbf"
                 )
                 st_write(
                   obj = u_auckland,
                   dsn = file_out("u-auckland.shp"),
                   driver = "ESRI Shapefile",
                   delete_dsn = TRUE
                 )
               }

So, apparently this is currently not possible as such, but a great workaround was suggested for my original use case.
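For reference, here is a minimal sketch of that kind of workaround: print each command below its target name, re-indented with styler, outside the tibble display. It assumes the plan stores commands as language objects (as recent drake versions do); print_plan_commands() is a hypothetical helper, not part of drake.
library(drake)
library(styler)
# Hypothetical helper: pretty-print each command below its target name.
print_plan_commands <- function(plan) {
  for (i in seq_len(nrow(plan))) {
    cat(plan$target[i], ":\n", sep = "")
    code <- paste(deparse(plan$command[[i]]), collapse = "\n")
    print(style_text(code))  # style_text() re-indents the code for readability
    cat("\n")
  }
}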

Related

Reading large shapefile in R using using sf package

I have a shapefile of points with 32M observations. I want to load it into R, and I have tried read_sf and st_read, but my R session keeps crashing. Another way that came to my mind was to write a for loop, subsetting the columns that I want, reading a specific number of rows at a time, and then rbinding them, but I cannot figure out how to make R understand the query. Here's what I have so far, which is not working:
for (i in 1:10) {
  j <- i - 1
  jj <- i + 1
  print(i)
  print(j)
  print(jj)
  # build the SQL string with paste0() rather than bare quotes
  query <- paste0("SELECT FID, CENTROID_I, LOC_ID FROM join_L3_Mad_Addresses ",
                  "WHERE FID < ", jj, " AND FID > ", j)
  A <- read_sf("C:\\Users\\...parcels-20210802T125336Z-001\\parcels\\join_L3_Mad_Addresses.shp",
               query = query)
  # note: A is overwritten on every iteration, so only the last chunk survives
}
I think you can adapt the following code.
Load packages
library(sf)
#> Linking to GEOS 3.9.0, GDAL 3.2.1, PROJ 7.2.1
Define path to a shapefile
dsn <- system.file("shape/nc.shp", package="sf")
Count the number of features in dsn
st_layers(dsn, do_count = TRUE)
#> Driver: ESRI Shapefile
#> Available layers:
#>   layer_name geometry_type features fields
#> 1         nc       Polygon      100     14
Start a for loop to read 10 features at a time. Add the data to a list
shp_data_list <- list()
i <- 1
for (offset in seq(10, 100, by = 10)) {
  query <- paste0("SELECT * FROM nc LIMIT ", 10, " OFFSET ", offset - 10)
  shp_data_list[[i]] <- st_read(dsn, query = query, quiet = TRUE)
  gc(verbose = FALSE)
  i <- i + 1
}
Rbind the objects
shp_data <- do.call(rbind, shp_data_list)
Add an ID column (just for plotting)
shp_data$ID <- as.character(rep(1:10, each = 10))
plot(shp_data["ID"])
The only problem is that this process may not preserve the geometry type. For example,
unique(st_geometry_type(shp_data))
#> [1] MULTIPOLYGON POLYGON
#> 18 Levels: GEOMETRY POINT LINESTRING POLYGON MULTIPOINT ... TRIANGLE
while
unique(st_geometry_type(st_read(dsn, quiet = TRUE)))
#> [1] MULTIPOLYGON
#> 18 Levels: GEOMETRY POINT LINESTRING POLYGON MULTIPOINT ... TRIANGLE
You can change the geometry type with st_cast().
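For example, a one-liner (promoting everything to MULTIPOLYGON to match the nc data above):
# promote every feature to MULTIPOLYGON so the geometry type is uniform
shp_data <- sf::st_cast(shp_data, "MULTIPOLYGON")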
Created on 2021-08-03 by the reprex package (v2.0.0)

Parallelizing a loop with updating during each iteration

I have some R code that puts together demographic data from the Census for all states in the US into a list object. The block-level code can take a week to run as a sequential loop, since there are ~11M blocks, so I am trying to parallelize the loop over states to make it faster. I have accomplished this goal with this:
states <- c("AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA","HI",
            "ID","IL","IN","IA","KS","KY","LA","ME","MD","MA","MI",
            "MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY","NC",
            "ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT",
            "VT","VA","WA","WV","WI","WY","DC","PR")
library(future.apply)
plan(multiprocess)
ptm <- proc.time()
CensusObj_block_age_sex <- list()
CensusObj_block_age_sex[states] <- future_lapply(states, function(s){
  county <- census_geo_api(key = "XXX", state = s, geo = "county", age = TRUE, sex = TRUE)
  tract  <- census_geo_api(key = "XXX", state = s, geo = "tract",  age = TRUE, sex = TRUE)
  block  <- census_geo_api(key = "XXX", state = s, geo = "block",  age = TRUE, sex = TRUE)
  # return the per-state list directly; assigning into a global does not work on workers
  list(state = s, age = TRUE, sex = TRUE, block = block, tract = tract, county = county)
})
However, I need to make it more robust. Sometimes there are problems with the Census API, so I would like CensusObj to be updated at each state iteration so that I don't lose my completed data if something goes wrong. That way I can restart the loop over the remaining states if something does go wrong (like if I spell "WY" as "WU").
Would it be possible to accomplish this somehow? I am open to other methods of parallelization.
The code above runs, but it seems to run into memory issues:
Error: Failed to retrieve the value of MultisessionFuture (future_lapply-3) from cluster RichSOCKnode #3 (PID 80363 on localhost ‘localhost’). The reason reported was ‘vector memory exhausted (limit reached?)’. Post-mortem diagnostic: A process with this PID exists, which suggests that the localhost worker is still alive.
I have R_MAX_VSIZE = 8Gb in my .Renviron, but I am not sure how that gets divided between the 8 cores on my machine. This all suggests that I need to store the results of each iteration rather than trying to keep them all in memory, and then append the objects together at the end.
Here is a solution that uses doParallel (with the options for UNIX systems, but you can also use it on Windows, see here) and foreach. It stores the results for every state separately, then reads the single files back in and combines them into a list.
library(doParallel)
library(foreach)
path_results <- "my_path"
ncpus <- 8L
registerDoParallel(cores = ncpus)
states <- c("AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA","HI",
            "ID","IL","IN","IA","KS","KY","LA","ME","MD","MA","MI",
            "MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY","NC",
            "ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT",
            "VT","VA","WA","WV","WI","WY","DC","PR")
results <- foreach(state = states) %dopar% {
  county <- census_geo_api(key = "XXX", state = state, geo = "county", age = TRUE, sex = TRUE)
  tract  <- census_geo_api(key = "XXX", state = state, geo = "tract",  age = TRUE, sex = TRUE)
  block  <- census_geo_api(key = "XXX", state = state, geo = "block",  age = TRUE, sex = TRUE)
  results <- list(state = state, age = TRUE, sex = TRUE, block = block, tract = tract, county = county)
  # store the results as rds
  saveRDS(results, file = paste0(path_results, "/", state, ".Rds"))
  # remove the results and free memory
  rm(county, tract, block, results)
  gc()
  # just return a string
  paste0("done with ", state)
}
library(purrr)
# combine the results into a list; build the file names from `states` so the
# names stay aligned with the right files (list.files() would sort them alphabetically)
CensusObj_block_age_sex <- set_names(paste0(path_results, "/", states, ".Rds"), states) %>%
  map(readRDS)
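Because each state is saved to its own .Rds file, a restart only needs the states that have no file yet. A small sketch, assuming the naming scheme above:
# states already done have a <state>.Rds file in path_results
done <- sub("\\.Rds$", "", list.files(path_results, pattern = "\\.Rds$"))
remaining <- setdiff(states, done)
# rerun the foreach loop over `remaining` instead of `states`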
You could use a tryCatch inside future_lapply to try to relaunch the calculation in case of an API error, up to a maximum of maxtrials attempts.
In the resulting list, you get for each calculation the number of trials and the final status, OK or Error:
states <- c("AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA","HI",
            "ID","IL","IN","IA","KS","KY","LA","ME","MD","MA","MI",
            "MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY","NC",
            "ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT",
            "VT","VA","WA","WV","WI","WY","DC","PR")
library(future.apply)
#> Loading required package: future
plan(multiprocess)
ptm <- proc.time()
maxtrials <- 3
# mock version of census_geo_api, just for this example
census_geo_api <- function(key = "XXX",
                           state = "AL",
                           geo = "county",
                           age = TRUE,
                           sex = TRUE) {
  paste(state, '-', geo)
}
CensusObj_block_age_sex <- future_lapply(states, function(s) {
  ntrials <- 1
  while (ntrials <= maxtrials) {
    hasError <- tryCatch({
      # simulate a random API error
      if (runif(1) > 0.3) { stop("API failed") }
      county <- census_geo_api(key = "XXX", state = s, geo = "county", age = TRUE, sex = TRUE)
      tract  <- census_geo_api(key = "XXX", state = s, geo = "tract",  age = TRUE, sex = TRUE)
      block  <- census_geo_api(key = "XXX", state = s, geo = "block",  age = TRUE, sex = TRUE)
    },
    error = function(e) e)
    if (inherits(hasError, "error")) {
      ntrials <- ntrials + 1
    } else {
      break
    }
  }
  if (ntrials > maxtrials) {
    res <- list(state = s, status = 'Error', ntrials = ntrials - 1, age = NA, sex = NA, block = NA, tract = NA, county = NA)
  } else {
    res <- list(state = s, status = 'OK', ntrials = ntrials, age = TRUE, sex = TRUE, block = block, tract = tract, county = county)
  }
  res
})
CensusObj_block_age_sex[[1]]
#> $state
#> [1] "AL"
#>
#> $status
#> [1] "OK"
#>
#> $ntrials
#> [1] 3
#>
#> $age
#> [1] TRUE
#>
#> $sex
#> [1] TRUE
#>
#> $block
#> [1] "AL - block"
#>
#> $tract
#> [1] "AL - tract"
#>
#> $county
#> [1] "AL - county"
Created on 2020-08-19 by the reprex package (v0.3.0)
One possible solution is to log the value of censusObj to a text file, i.e., print censusObj in each iteration. The doSNOW package can be used for logging, for example:
library(doSNOW)
cl <- makeCluster(1, outfile = "abc.out")
registerDoSNOW(cl)
states <- c("AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA","HI",
            "ID","IL","IN","IA","KS","KY","LA","ME","MD","MA","MI",
            "MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY","NC",
            "ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT",
            "VT","VA","WA","WV","WI","WY","DC","PR")
foreach(i = 1:length(states), .combine = rbind, .inorder = TRUE) %dopar% {
  county <- "A"
  tract <- "B"
  block <- "C"
  censusObj <- data.frame(state = states[i], age = TRUE, sex = TRUE, block = block, tract = tract, county = county)
  # print json objects so they are easy to extract from the log file
  cat(sprintf("%s\n", rjson::toJSON(censusObj)))
}
stopCluster(cl)
stopCluster(cl)
This logs the value of censusObj in abc.out, and it also logs the error if the program crashes, so you will still have the latest value of censusObj in abc.out.
Here is the output of the last iteration from the log file:
Type: EXEC {"state":"PR","age":true,"sex":true,"block":"C","tract":"B","county":"A"} Type: DONE
Type: EXEC means that the iteration has started, and Type: DONE means that execution is completed. The result of cat will be present between these two statements for each iteration. Now the value of censusObj can be extracted from the log file as shown below:
Lines <- readLines("abc.out")
results <- list()
for (i in Lines) {
  # skip processing logs created by doSNOW
  if (!startsWith(i, "starting") && !startsWith(i, "Type:")) {
    results <- rlist::list.append(results, jsonlite::fromJSON(i))
  }
}
results will contain all the values printed in abc.out.
> head(results, 1)
[[1]]
[[1]]$state
[1] "AL"
[[1]]$age
[1] TRUE
[[1]]$sex
[1] TRUE
[[1]]$block
[1] "C"
[[1]]$tract
[1] "B"
[[1]]$county
[1] "A"
It is not a very clean solution, but it works.

Using code_to_plan and target(..., format = "fst") in drake

I really like using the code_to_plan function when constructing drake plans. I also really like using target(..., format = "fst") for big files. However, I am struggling to combine these two workflows. For example, if I have this _drake.R file:
# Data --------------------------------------------------------------------
data_plan = code_to_plan("code/01-data/data.R")
join_plan = code_to_plan("code/01-data/merging.R")
# Cleaning ----------------------------------------------------------------
cleaning_plan = code_to_plan("code/02-cleaning/remove_na.R")
# Model -------------------------------------------------------------------
model_plan = code_to_plan("code/03-model/model.R")
# Combine Plans
dplan = bind_plans(
  data_plan,
  join_plan,
  cleaning_plan,
  model_plan
)
config <- drake_config(dplan)
This works fine when called with r_make(r_args = list(show = TRUE))
As I understand it, though, target() can only be used within drake_plan(). If I try something like this:
dplan2 <- drake_plan(full_plan = target(dplan, format = "fst"))
config <- drake_config(dplan2)
I get an r_make error like this:
target full_plan
Error in fst::write_fst(x = value$value, path = tmp) :
Unknown type found in column.
In addition: Warning message:
You selected fst format for target full_plan, so drake will convert it from class c("drake_plan", "tbl_df", "tbl", "data.frame") to a plain data frame.
Error:
-->
in process 18712
See .Last.error.trace for a stack trace.
So ultimately my question is where does one specify special data formats for targets when you are using code_to_plan?
Edit
Using @landau's helpful suggestion, I defined this function:
add_target_format <- function(plan) {
  # Get a list of named commands.
  commands <- plan$command
  names(commands) <- plan$target
  # Turn it into a good plan.
  do.call(drake_plan, commands)
}
So that this would work:
dplan = bind_plans(
  data_plan,
  join_plan,
  cleaning_plan,
  model_plan
) %>%
  add_target_format()
It is possible, but not convenient. Here is a workaround.
writeLines(
  c(
    "x <- small_data()",
    "y <- target(large_data(), format = \"fst\")"
  ),
  "script.R"
)
cat(readLines("script.R"), sep = "\n")
#> x <- small_data()
#> y <- target(large_data(), format = "fst")
library(drake)
# Produces a plan, but does not process target().
bad_plan <- code_to_plan("script.R")
bad_plan
#> # A tibble: 2 x 2
#>   target command
#>   <chr>  <expr>
#> 1 x      small_data()
#> 2 y      target(large_data(), format = "fst")
# Get a list of named commands.
commands <- bad_plan$command
names(commands) <- bad_plan$target
# Turn it into a good plan.
good_plan <- do.call(drake_plan, commands)
good_plan
#> # A tibble: 2 x 3
#>   target command      format
#>   <chr>  <expr>       <chr>
#> 1 x      small_data() <NA>
#> 2 y      large_data() fst
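The resulting good_plan can then be supplied to drake_config() in _drake.R (or run directly with make()), just as in the question.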
Created on 2019-12-18 by the reprex package (v0.3.0)

how to feed a tibble to spacyr?

Consider this simple example
bogustib <- tibble(doc_id = c(1, 2, 3),
                   text = c('bug', 'one love', '838383838'))
# A tibble: 3 x 2
  doc_id text
   <dbl> <chr>
1      1 bug
2      2 one love
3      3 838383838
This tibble is called bogustib because I know spacyr will fail on row 3.
> spacy_parse('838383838', lemma = FALSE, entity = TRUE, nounphrase = TRUE)
Error in `$<-.data.frame`(`*tmp*`, "doc_id", value = "text1") :
replacement has 1 row, data has 0
so, naturally, feeding the tibble to spacyr will fail as well
spacy_parse(bogustib, lemma = FALSE, entity = TRUE, nounphrase = TRUE)
Error in `$<-.data.frame`(`*tmp*`, "doc_id", value = "3") :
replacement has 1 row, data has 0
My question is: I think I can avoid this issue by calling spacy_parse row by row.
However, that looks inefficient, and I would like to use the multithread argument of spacyr to speed up the computation over my large tibble.
Is there any solution here?
Thanks!
Actually, this error does not occur in my environment; the output looks like this:
library(tidyverse)
library(spacyr)
bogustib <- tibble(doc_id = c(1, 2, 3),
                   text = c('bug', 'one love', '838383838'))
spacy_parse(bogustib)
spacy_parse('838383838', lemma = FALSE, entity = TRUE, nounphrase = TRUE)
## No noun phrase found in documents.
##   doc_id sentence_id token_id     token pos     entity
## 1  text1           1        1 838383838 NUM CARDINAL_B
To get this result, I used the latest master on GitHub. However, I was able to reproduce your error when I ran the CRAN version of spacyr. I'm sure that I fixed this bug a while ago, but the fix seems not to be reflected in the CRAN version yet. We will try to update CRAN asap.
In the meantime, you can:
devtools::install_github('quanteda/spacyr')
Or download a zip of the repo and run:
devtools::install('******')
where ****** is the path to the unzipped repository.
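In the meantime, if you are stuck on the CRAN version, a row-by-row fallback with tryCatch is possible. This is just a sketch (not part of spacyr): it gives up multithreading, but it skips the rows that trigger the bug instead of failing on the whole tibble.
library(spacyr)
library(purrr)
library(dplyr)
# parse one document at a time; return NULL for rows that error out
safe_parse <- function(txt, id) {
  one_row <- data.frame(doc_id = as.character(id), text = txt,
                        stringsAsFactors = FALSE)
  tryCatch(
    spacy_parse(one_row, lemma = FALSE, entity = TRUE, nounphrase = TRUE),
    error = function(e) NULL
  )
}
parsed <- map2(bogustib$text, bogustib$doc_id, safe_parse) %>%
  bind_rows()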

Add new node to SpatialLinesNetwork in stplanr

How can one add a new node to a SpatialLinesNetwork?
Context of my problem: I have a shapefile of a bus route and another shapefile of bus stops. I want to calculate the distance between stops along the bus route. Ideally, each stop would be a node and I would use stplanr::sum_network_routes() to calculate the distance between them. The problem is that when I convert the bus route into a SpatialLinesNetwork, the network only has a few nodes, which are far from each other and unrelated to the bus stop locations.
reproducible dataset:
# load library and data
library(stplanr)
library(sf)
# get road data
data(routes_fast)
rnet <- overline(routes_fast, attrib = "length")
# convert to sf obj
rnet <- st_as_sf(rnet)
# convert SpatialLinesDataFrame into SpatialLinesNetwork
sln <- SpatialLinesNetwork(rnet)
# identify nodes
sln_nodes = sln2points(sln)
# Here is a bus stop which should be added as a node
new_point_coordinates = c(-1.535, 53.809)
p = sf::st_sf(geometry = sf::st_sfc(sf::st_point(new_point_coordinates)), crs = st_crs(rnet))
# plot
plot(sln, col = "gray") # network
plot(sln_nodes, col="red", add = TRUE) # nodes
plot(p, add=T, col="blue") # stop to be added as a new node
This doesn't answer your question at the outset, but I believe it does resolve your "Context" by showing how your desired network distances can be calculated. This can be done with dodgr (latest dev version) like this:
library (dodgr)
library (stplanr)
library (sf)
library (sp)
dat <- st_as_sf (routes_fast)
net <- weight_streetnet (dat, wt_profile = 1)
The net object is a simple data.frame containing all edges and vertices of the network. Then adapt your code above to get the routing points as a simple matrix:
rnet <- overline(routes_fast, attrib = "length")
SLN <- SpatialLinesNetwork(rnet)
sln_nodes = sln2points(SLN)
xy <- coordinates (sln_nodes)
colnames (xy) <- c ("x", "y")
Note that sln2points simply returns "nodes" (in stplanr terminology), which are junction points. You can instead replace these with the coordinates of your bus stops, or simply add them to this matrix, for example like this (a sketch, assuming `stops` is an sf POINT layer of your bus stops in the same CRS):
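# assumption: `stops` holds the bus stops; append their coordinates to xy
xy_stops <- sf::st_coordinates (stops)
colnames (xy_stops) <- c ("x", "y")
xy <- rbind (xy, xy_stops)
The following three lines then convert those coordinates to unique (nearest) vertex IDs of the dodgr net object: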
v <- dodgr_vertices (net)
pts <- match_pts_to_graph (v, xy)
pts <- v$id [pts]
To calculate distances between those pts on the network, just
d <- dodgr_dists (net, from = pts, to = pts)
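The result d should then be a square matrix of pairwise network distances between the pts (in metres when the input coordinates are longitude/latitude, as here).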
Thanks for the question. Thanks to it, and to subsequent collaboration with Andrea Gilardi, I'm happy to announce that it is now possible to add new nodes to sfNetwork objects with a new function, sln_add_node().
See below for reproducible code that demonstrates how it works; please try it out:
devtools::install_github("ropensci/stplanr")
#> Skipping install of 'stplanr' from a github remote, the SHA1 (33158a5b) has not changed since last install.
#> Use `force = TRUE` to force installation
library(stplanr)
#> Registered S3 method overwritten by 'R.oo':
#> method from
#> throw.default R.methodsS3
#> Warning in fun(libname, pkgname): rgeos: versions of GEOS runtime 3.7.1-CAPI-1.11.1
#> and GEOS at installation 3.7.0-CAPI-1.11.0differ
sample_routes <- routes_fast_sf[2:6, NULL]
sample_routes$value <- rep(1:3, length.out = 5)
rnet <- overline2(sample_routes, attrib = "value")
#> 2019-09-26 16:06:18 constructing segments
#> 2019-09-26 16:06:18 building geometry
#> 2019-09-26 16:06:18 simplifying geometry
#> 2019-09-26 16:06:18 aggregating flows
#> 2019-09-26 16:06:18 rejoining segments into linestrings
plot(sample_routes["value"], lwd = sample_routes$value, main = "Routes")
plot(rnet["value"], lwd = rnet$value, main = "Route network")
sln <- SpatialLinesNetwork(rnet)
#> Linking to GEOS 3.7.1, GDAL 2.4.0, PROJ 5.2.0
new_point_coordinates <- c(-1.540, 53.826)
crs <- sf::st_crs(rnet)
p <- sf::st_sf(geometry = sf::st_sfc(sf::st_point(new_point_coordinates)), crs = crs)
p_dest <- sln2points(sln)[9, ]
# We can identify the nearest point on the network at this point
# and use that to split the associated linestring:
sln_new <- sln_add_node(sln = sln, p = p)
#> although coordinates are longitude/latitude, st_nearest_feature assumes that they are planar
route_new <- route_local(sln = sln_new, from = p, to = p_dest)
plot(sln_new)
plot(p, add = TRUE)
plot(route_new, lwd = 5, add = TRUE)
#> Warning in plot.sf(route_new, lwd = 5, add = TRUE): ignoring all but the
#> first attribute
Created on 2019-09-26 by the reprex package (v0.3.0)
In case it's of use/interest, see the source code of the new small family of functions that support this new functionality here: https://github.com/ropensci/stplanr/blob/master/R/node-funs.R
