I have a GIS question that has been stumping me for some time now. The end goal is to extract the density of lines per pixel/voxel/polygon using the tidyverse/sf packages. As of now I have a function that works when I execute it line-by-line, but not as a function. The ultimate goal is to use this function with sfLapply from the snowfall package so it can run in parallel. Any help on getting this to work as a function would be greatly appreciated! The data involved can be found here:
https://www.dropbox.com/s/zg9o2b5x4wizafo/hexagons.gpkg?dl=0
https://www.dropbox.com/s/x2gxx36pjkutxzm/railroad_lines.gpkg?dl=0
The function that I created, which, again, works line-by-line but not as a function, is:
length_in_poly <- function(fishnet, spatial_lines) {
  require(sf)
  require(tidyverse)
  require(magrittr)
  fishnet <- st_as_sf(do.call(rbind, fishnet))
  spatial_lines <- st_as_sf(do.call(rbind, spatial_lines))
  fish_length <- list()
  for (i in 1:nrow(fishnet)) {
    split_lines <- spatial_lines %>%
      st_cast(., "MULTILINESTRING", group_or_split = FALSE) %>%
      st_intersection(., fishnet[i, ]) %>%
      mutate(lineid = row_number())
    fish_length[[i]] <- split_lines %>%
      mutate(length = sum(st_length(.)))
  }
  fish_length <- do.call(rbind, fish_length) %>%
    group_by(hexid4k) %>%
    summarize(length = sum(length))
  fishnet <- fishnet %>%
    st_join(., fish_length, join = st_intersects) %>%
    mutate(hexid4k = hexid4k.x,
           length = ifelse(is.na(length), 0, length),
           pixel_area = as.numeric(st_area(geom)),
           density = length / pixel_area)
}
To prep the data:
library(sf)
library(tidyverse)
library(snowfall)
input_hexagons <- st_read("hexagons.gpkg")
input_rail_lines <- st_read("railroad_lines.gpkg")
Using some code from here:
faster_as_tibble <- function(x) {
  structure(x, class = c("tbl_df", "tbl", "data.frame", "sfc"),
            row.names = as.character(seq_along(x[[1]])))
}
split_fast_tibble <- function(x, f, drop = FALSE, ...) {
  lapply(split(x = seq_len(nrow(x)), f = f, ...),
         function(ind) faster_as_tibble(lapply(x, "[", ind)))
}
Create a state-wise list:
sub_hexnet <- split_fast_tibble(input_hexagons, input_hexagons$STUSPS) %>%
  lapply(st_as_sf)
Finally, to run just as a single-core process:
test <- lapply(fishnet = as.list(sub_hexnet),
               FUN = length_in_poly,
               spatial_lines = input_rail_lines)
Or, in the perfect world, a multi-core process:
sfInit(parallel = TRUE, cpus = parallel::detectCores())
sfExport(list = c("sub_hexnet", "mask_rails"))
extractions <- sfLapply(fishnet = sub_hexnet,
                        fun = length_in_poly,
                        spatial_lines = input_rail_lines)
sfStop()
Thanks in advance for any help - I am completely stumped!
After messing around with this for a while I finally figured out a solution.
The key helper functions used:
load_data <- function(url, dir, layer, outname) {
  file <- paste0(dir, "/", layer, ".shp")
  # Download and unzip only if the shapefile is not already on disk
  if (!file.exists(file)) {
    download.file(url, destfile = paste0(dir, ".zip"))
    unzip(paste0(dir, ".zip"), exdir = dir)
    unlink(paste0(dir, ".zip"))
  }
  sf::st_read(dsn = dir, layer = layer)
}
get_density <- function(x, grids, lines) {
  require(tidyverse)
  require(lubridate)
  require(sf)
  # Subset the grid and the lines to the single hexagon id `x`
  sub_grids <- grids %>%
    dplyr::filter(hexid4k == x)
  single_lines_hexid <- lines %>%
    dplyr::filter(hexid4k == x) %>%
    sf::st_intersection(., sub_grids) %>%
    dplyr::select(hexid4k, STUSPS) %>%
    dplyr::mutate(length_line = st_length(.),
                  length_line = ifelse(is.na(length_line), 0, length_line))
  # Sum line length within the hexagon and convert to a density
  sub_grids <- sub_grids %>%
    sf::st_join(., single_lines_hexid, join = st_intersects) %>%
    dplyr::mutate(hexid4k = hexid4k.x) %>%
    dplyr::group_by(hexid4k) %>%
    dplyr::summarize(length_line = sum(length_line)) %>%
    dplyr::mutate(pixel_area = as.numeric(st_area(geom)),
                  density = length_line / pixel_area) %>%
    dplyr::select(hexid4k, length_line, density, pixel_area)
  return(sub_grids)
}
Prep the input data:
usa_shp <- load_data(url = "https://www2.census.gov/geo/tiger/GENZ2016/shp/cb_2016_us_state_20m.zip",
                     dir = "data",
                     layer = "cb_2016_us_state_20m",
                     outname = "usa") %>%
  sf::st_transform(p4string_ea) %>%  # p4string_ea: an equal-area CRS string defined elsewhere (not shown)
  dplyr::filter(!STUSPS %in% c("HI", "AK", "PR"))
usa_shp$STUSPS <- droplevels(usa_shp$STUSPS)
library(sp)  # spsample() and HexPoints2SpatialPolygons() come from sp
hex_points <- spsample(as(usa_shp, "Spatial"), type = "hexagonal", cellsize = 4000)
hex_grid <- HexPoints2SpatialPolygons(hex_points, dx = 4000)
hexnet_4k <- st_as_sf(hex_grid) %>%
  mutate(hexid4k = row_number()) %>%
  st_intersection(., st_union(usa_shp)) %>%
  st_join(., usa_shp, join = st_intersects) %>%
  dplyr::select(hexid4k, STUSPS)
transmission_lines_hex <- load_data(url = "https://hifld-dhs-gii.opendata.arcgis.com/datasets/75af06441c994aaf8e36208b7cd44014_0.zip",
                                    dir = "data",
                                    layer = "Electric_Power_Transmission_Lines",
                                    outname = "tl") %>%
  dplyr::select(LINEARID, STUSPS) %>%
  st_join(., hexnet_4k, join = st_intersects) %>%
  mutate(STUSPS = STUSPS.x) %>%
  dplyr::select(LINEARID, hexid4k, STUSPS)
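With the inputs prepared, get_density() can be sanity-checked on a single hexagon before going parallel (a quick sketch, not part of the original pipeline):
get_density(x = hexnet_4k$hexid4k[1],
            grids = hexnet_4k,
            lines = transmission_lines_hex)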
The resulting parallel process is as follows:
hexnet_list <- hexnet_4k %>%
  split(., .$STUSPS)
sfInit(parallel = TRUE, cpus = num_cores)  # num_cores defined earlier, e.g. parallel::detectCores()
sfExport("transmission_lines_hex")
sfSource("src/functions/helper_functions.R")
transmission_lines_density <- sfLapply(hexnet_list,
  function(input_list) {
    require(tidyverse)
    require(magrittr)
    require(lubridate)
    require(sf)
    sub_grid <- dplyr::bind_cols(input_list)
    unique_ids <- unique(sub_grid$hexid4k)
    state_name <- unique(sub_grid$STUSPS)[1]
    print(paste0("Working on ", state_name))
    got_density <- lapply(unique_ids,
                          FUN = get_density,
                          grids = sub_grid,
                          lines = transmission_lines_hex)
    print(paste0("Finishing ", state_name))
    return(got_density)
  })
sfStop()
I hope some of this may be useful to you and, as always, suggestions on optimization would be welcomed.
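On the optimization front, one possibility: since sf's st_intersection() and st_length() are vectorized, the per-hexid lapply() can be replaced by a single intersection plus a grouped summary. A rough sketch (untested against the data above; it assumes hexnet_4k and transmission_lines_hex as built earlier, and a reasonably recent sf for st_drop_geometry()):
pieces <- transmission_lines_hex %>%
  dplyr::select(LINEARID) %>%
  sf::st_intersection(hexnet_4k) %>%  # one intersection for all hexagons
  dplyr::mutate(length_line = as.numeric(sf::st_length(.))) %>%
  sf::st_drop_geometry() %>%
  dplyr::group_by(hexid4k) %>%
  dplyr::summarize(length_line = sum(length_line))
density_hex <- hexnet_4k %>%
  dplyr::left_join(pieces, by = "hexid4k") %>%
  dplyr::mutate(length_line = ifelse(is.na(length_line), 0, length_line),
                pixel_area = as.numeric(sf::st_area(.)),
                density = length_line / pixel_area)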
I need to perform a conceptually straightforward double left-join followed by a simple series of matching functions (see: Straightforward Solution). However, given that the tables I have to merge are large, I tried to unpack the merging procedure with a for-loop that does the trick but is inefficient to say the least (see: For-loops Solution).
Is there a solution based on splitting and naming at least the largest table?
Below is a toy example.
For reference, in my data:
db_m1: ~50k rows (~5k unique m1)
db_m2: ~25k rows (~5k unique m1 and m2)
db_p: ~100m rows
set.seed(0)
db_m1 <- data.frame(
  y = rep(1, 20),
  id = sort(rep(paste0("id_", c(letters[1:4])), 5)),
  m1 = rep(c(1, 2), 10),
  x1 = sample(LETTERS, 20, TRUE),
  x2 = sample(LETTERS, 20, TRUE))
set.seed(0)
db_m2 <- data.frame(
  y = rep(1, 20),
  m1 = sample(c(1:5), 20, TRUE),
  m2 = sample(c(6:10), 20, TRUE))
set.seed(0)
db_p <- data.frame(
  m2 = sample(c(6:10), 100, TRUE),
  y1 = sample(LETTERS, 100, TRUE),
  y2 = sample(LETTERS, 10, TRUE))
Straightforward Solution:
final_dplyr <- db_m1 %>%
  dplyr::left_join(db_m2) %>%
  dplyr::left_join(db_p) %>%
  dplyr::mutate(match_1 = ifelse(x1 == y1 | x1 == y2, 1, 0),
                match_2 = ifelse(x2 == y1 | x2 == y2, 1, 0),
                sum_matches = mapply(sum, match_1, match_2),
                final_1 = ifelse(as.numeric(sum_matches) >= 1, 1, 0),
                final_2 = ifelse(as.numeric(sum_matches) >= 2, 1, 0)) %>%
  group_by(id, m2) %>%
  dplyr::mutate(n_p = n(),
                n_p = ifelse(all(is.na(y1)), NA, n_p)) %>%
  group_by(y, id, m1, m2, n_p) %>%
  dplyr::summarise(match_1 = sum(match_1, na.rm = TRUE),
                   match_2 = sum(match_2, na.rm = TRUE),
                   final_1 = sum(final_1),
                   final_2 = sum(final_2))
For-loops Solution:
fn_final <- function(db_m1, db_m2, db_p) {
  matches_final <- vector("list", length = length(unique(db_m1$y)))
  for (i in 1:length(unique(db_m1$y))) {
    matches <- vector("list", length = length(unique(db_m1$m1)))
    for (j in 1:length(unique(db_m1$m1))) {
      temp_db_m1 <- db_m1 %>% dplyr::filter(y == unique(db_m1$y)[i], m1 == unique(db_m1$m1)[j])
      temp_db_m2 <- db_m2 %>% dplyr::filter(y == unique(db_m1$y)[i], m1 == unique(db_m1$m1)[j])
      m_vector <- unique(temp_db_m2$m2)
      temp_db_p <- db_p %>%
        dplyr::filter(m2 %in% m_vector)
      final <- temp_db_m1 %>%
        dplyr::left_join(temp_db_m2) %>%
        dplyr::left_join(temp_db_p) %>%
        dplyr::mutate(match_1 = ifelse(x1 == y1 | x1 == y2, 1, 0),
                      match_2 = ifelse(x2 == y1 | x2 == y2, 1, 0),
                      sum_matches = mapply(sum, match_1, match_2),
                      final_1 = ifelse(as.numeric(sum_matches) >= 1, 1, 0),
                      final_2 = ifelse(as.numeric(sum_matches) >= 2, 1, 0)) %>%
        group_by(id, m2) %>%
        dplyr::mutate(n_p = n(),
                      n_p = ifelse(all(is.na(y1)), NA, n_p)) %>%
        group_by(y, id, m1, m2, n_p) %>%
        dplyr::summarise(match_1 = sum(match_1, na.rm = TRUE),
                         match_2 = sum(match_2, na.rm = TRUE),
                         final_1 = sum(final_1),
                         final_2 = sum(final_2))
      matches[[j]] <- final
    }
    matches_all <- do.call(rbind, matches)
    matches_final[[i]] <- matches_all
  }
  final <- do.call(rbind, matches_final) %>%
    dplyr::filter(!is.na(n_p)) %>%
    unique()
  return(final)
}
final_for <- fn_final(db_m1, db_m2, db_p)
This is a possible solution; can it be optimized further?
db_m1_s <- split(db_m1, f = list(db_m1$y, db_m1$m1))  # elements named "y.m1", e.g. "1.1"
db_m2_s <- split(db_m2, f = list(db_m2$y, db_m2$m1))
db_p_s  <- split(db_p, f = list(db_p$m2))             # elements named by m2 value, e.g. "6"
match_fn <- function(temp_db_m1, temp_db_m2, temp_db_p) {
  final <- temp_db_m1 %>%
    dplyr::left_join(temp_db_m2) %>%
    dplyr::left_join(temp_db_p) %>%
    dplyr::mutate(match_1 = ifelse(x1 == y1 | x1 == y2, 1, 0),
                  match_2 = ifelse(x2 == y1 | x2 == y2, 1, 0),
                  sum_matches = mapply(sum, match_1, match_2),
                  final_1 = ifelse(as.numeric(sum_matches) >= 1, 1, 0),
                  final_2 = ifelse(as.numeric(sum_matches) >= 2, 1, 0)) %>%
    group_by(id, m2) %>%
    dplyr::mutate(n_p = n(),
                  n_p = ifelse(all(is.na(y1)), NA, n_p)) %>%
    group_by(y, id, m1, m2, n_p) %>%
    dplyr::summarise(match_1 = sum(match_1, na.rm = TRUE),
                     match_2 = sum(match_2, na.rm = TRUE),
                     final_1 = sum(final_1),
                     final_2 = sum(final_2))
  return(final)
}
fn_final <- function(db_m1, db_m1_s, db_m2_s, db_p_s) {
  m <- names(db_m1_s)
  matches_1 <- vector("list", length = length(m))
  for (i in 1:length(m)) {
    temp_db_m1 <- db_m1_s[[m[i]]]
    temp_db_m2 <- db_m2_s[[m[i]]]
    n <- as.character(sort(unique(temp_db_m2$m2)))
    matches_2 <- vector("list", length = length(n))
    for (j in 1:length(n)) {
      temp_db_p <- db_p_s[[n[j]]]
      final <- match_fn(temp_db_m1, temp_db_m2, temp_db_p)
      matches_2[[j]] <- final
    }
    matches_all <- do.call(rbind, matches_2)
    matches_1[[i]] <- matches_all
  }
  matches_0 <- do.call(rbind, matches_1) %>%
    dplyr::filter(!is.na(n_p)) %>%
    unique()
  return(matches_0)
}
final_for <- fn_final(db_m1, db_m1_s, db_m2_s, db_p_s)
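If helpful, the two explicit loops can also be collapsed with lapply() over the shared split names; a sketch equivalent to fn_final() above (fn_final2 is a hypothetical name):
fn_final2 <- function(db_m1_s, db_m2_s, db_p_s) {
  res <- lapply(names(db_m1_s), function(nm) {
    temp_db_m2 <- db_m2_s[[nm]]
    keys <- as.character(sort(unique(temp_db_m2$m2)))
    # reuse match_fn() on each (y.m1, m2) pair, then bind within the block
    do.call(rbind, lapply(keys, function(k)
      match_fn(db_m1_s[[nm]], temp_db_m2, db_p_s[[k]])))
  })
  do.call(rbind, res) %>%
    dplyr::filter(!is.na(n_p)) %>%
    unique()
}
final_lapply <- fn_final2(db_m1_s, db_m2_s, db_p_s)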
Not sure if you will be able to help me without reproducible example data, but I have a problem running the code below. I am attempting to use the multidplyr package, but it doesn't seem to find my columns. I am running the code below:
cl <- detectCores()  # from the parallel package
cl
models_prep <- bookings_prep %>%
  inner_join(pipeline_prep_, by = c("booking_type", "group")) %>%
  crossing(biz_day) %>%
  left_join(closed_pipeline, by = c("booking_type", "group")) %>%
  select(-opportunity_forecast_category)
group1 <- rep(1:cl, length.out = nrow(models_prep))
models_prep1 <- bind_cols(tibble(group1), models_prep)
cluster <- new_cluster(cl)
cluster %>% cluster_library("tidyr")
cluster %>% cluster_library("purrr")
cluster %>% cluster_library("plyr")
cluster %>% cluster_library("dplyr")
cluster_copy(cluster, "rmf")
cluster_copy(cluster, "fc_xreg")
#cluster_assign(cluster, "rmf")
#cluster_copy(cluster, c("rmf", "fc_xreg"))
by_group <- models_prep %>%
  group_by(group) %>%
  partition(cluster)
by_group1 <- models_prep1 %>%
  group_by(group1) %>%
  partition(cluster)
models <- by_group %>%
  mutate(
    xreg_arima = pmap(list(data = pipeline, h = 1, name = group, bookings = bookings, type = booking_type,
                           biz_day = biz_day, no_bookings = no_bookings,
                           sparse_pipeline = sparse_pipeline,
                           closed_forecast_cat = pipeline_amount, FUN = "fc_xreg"), rmf))
Everything runs correctly up to the models <- step, but it fails there saying it cannot find the object group.
Sometimes arguments just need to be quoted, particularly in dplyr-ish situations.
models <- by_group %>%
  mutate(
    xreg_arima = pmap(list(data = pipeline, h = 1, name = "group", bookings = "bookings", type = "booking_type",
                           biz_day = "biz_day", no_bookings = "no_bookings",
                           sparse_pipeline = "sparse_pipeline",
                           closed_forecast_cat = "pipeline_amount", FUN = "fc_xreg"), rmf))
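For reference, a minimal self-contained multidplyr pattern (a sketch with hypothetical toy data and a stand-in helper, assuming multidplyr >= 0.1.0): copy the helper to the workers, partition, compute, then collect():
library(dplyr)
library(multidplyr)
toy_fun <- function(x) mean(x)  # hypothetical stand-in for rmf/fc_xreg
cluster <- new_cluster(2)
cluster_copy(cluster, "toy_fun")  # make the function visible on every worker
tibble(group = rep(1:2, each = 5), value = rnorm(10)) %>%
  group_by(group) %>%
  partition(cluster) %>%
  summarise(result = toy_fun(value)) %>%
  collect()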
When I run the code with a save-as-CSV step at the end, everything works: I filter my dataset by country, do some computations, and save the result as a country file.
When I try to rbind my datasets instead, I get nothing back. I have tried the different solutions I found, but nothing works; clearly I do not understand why I cannot rbind.
The code:
library(dplyr)
library(readxl)
setwd("Z:/Reporting_Private/Tableau")
dataupl <- read_excel("Analysis Map_Tableau - Ready.xlsm", sheet = "Data")
df = dataupl
#select right columns
df = df[,1:6]
colnames(df)=c("Office", "Employee","Territiry","Sales","Leads","Act")
#change n/a to zero
df[is.na(df)]=0
countries = df %>% select(Office) %>% distinct()
countries = as.data.frame(countries)
engine <- function(input){
  df = df %>% filter(Office == input)
  SCALESALES = scale(df$Sales)
  SCALELEADS = scale(df$Leads)
  SCALEACT = scale(df$Act)
  df = df %>% mutate(SCALESALES = SCALESALES, SCALELEADS = SCALELEADS, SCALEACT = SCALEACT)
  df$SLegend = ave(df$Sales, df$SalesLegend, FUN = min)
  df$SLegend = ifelse(df$SLegend > 0, df$SLegend, 0)
  df$LLegend = ave(df$Leads, df$LeadsLegend, FUN = min)
  df$ALegend = ave(df$Act, df$ActLegend, FUN = min)
  #write.csv(final, file = paste0(input,".csv"))
  dftotal = data.frame()
  dftotal = rbind(dftotal, df)
}
for (i in 1:nrow(countries)){
  input = countries[i,]
  engine(input)
}
It seems that you can split your data frame with L <- split(df, df$Office) and then lapply(L, ...) over the pieces. Instead of write.csv(...), your function has to return the data frame: return(df).
So, something like this:
engine <- function(dfi) {
  SCALESALES = scale(dfi$Sales)
  SCALELEADS = scale(dfi$Leads)
  SCALEACT = scale(dfi$Act)
  dfi = dfi %>% mutate(SCALESALES = SCALESALES, SCALELEADS = SCALELEADS, SCALEACT = SCALEACT)
  dfi$SLegend = ave(dfi$Sales, dfi$SalesLegend, FUN = min)
  dfi$SLegend = ifelse(dfi$SLegend > 0, dfi$SLegend, 0)
  dfi$LLegend = ave(dfi$Leads, dfi$LeadsLegend, FUN = min)
  dfi$ALegend = ave(dfi$Act, dfi$ActLegend, FUN = min)
  return(dfi)
}
L <- split(df, df$Office)
Lnew <- lapply(L, engine)
dftotal <- Lnew[[1]]
for (i in 2:length(Lnew)) dftotal <- rbind(dftotal, Lnew[[i]])
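Equivalently, the final loop can be collapsed into a single call:
dftotal <- do.call(rbind, Lnew)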
I'm collecting time series data from Wikipedia and want to run a change-point analysis on each time series using dplyr. But when I do so I get an error saying the data need to be numeric, even though class() says they are numeric. I hope you can help.
library(changepoint)
library(dplyr)
library(pageviews)
library(data.table)
articles <- c("Rugby_union", "Football")
foo <- function(x) {
  article_pageviews(project = "en.wikipedia",
                    article = x,
                    start = as.Date("2017-01-01"),
                    end = as.Date("2017-12-31"),
                    user_type = "user",
                    platform = "mobile-web")
}
output <- articles %>% foo
output %>%
  select(article, views) %>%
  do(cpt.mean(.))
class(output$views)
The problem is that cpt.mean() expects a numeric vector (or ts/matrix), but do() hands it the whole two-column data frame, whose article column is character. Extract the views vector for each article and pass that instead:
library(changepoint)
library(dplyr)
library(pageviews)
articles <- c("Rugby_union", "Football")
foo <- function(x) {
  article_pageviews(project = "en.wikipedia", article = x,
                    start = as.Date("2017-01-01"),
                    end = as.Date("2017-12-31"),
                    user_type = "user", platform = "mobile-web")
}
output <- articles %>%
  foo
df <- as.data.frame(table(output$article))
output1 <- output %>%
  dplyr::select(article, views) %>%
  dplyr::filter(article == df[1, 1])
output2 <- output %>%
  dplyr::select(article, views) %>%
  dplyr::filter(article == df[2, 1])
q <- floor((min(length(output1$views), length(output2$views))) / 2 + 1)
cp1 <- changepoint::cpt.mean(data = output1$views, Q = q, method = "BinSeg", penalty = "SIC")
plot(cp1)
cp2 <- changepoint::cpt.mean(data = output2$views, Q = q, method = "BinSeg", penalty = "SIC")
plot(cp2)
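To scale past two articles, the same idea can be written with split() and lapply(), one numeric vector per article (a sketch, assuming the same output object as above):
cps <- lapply(split(output$views, output$article), function(v) {
  changepoint::cpt.mean(v, Q = floor(length(v) / 2 + 1),
                        method = "BinSeg", penalty = "SIC")
})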
I have this code that works, but I would like to learn how to replace it with a function.
library(tidyverse)
l1_1617 <- read.csv("http://www.football-data.co.uk/mmz4281/1617/F1.csv", stringsAsFactors = FALSE)
l1_1516 <- read.csv("http://www.football-data.co.uk/mmz4281/1516/F1.csv", stringsAsFactors = FALSE)
l1_1415 <- read.csv("http://www.football-data.co.uk/mmz4281/1415/F1.csv", stringsAsFactors = FALSE)
l1_1314 <- read.csv("http://www.football-data.co.uk/mmz4281/1314/F1.csv", stringsAsFactors = FALSE)
l1_1617_sel <- l1_1617 %>%
  select(Date:AST) %>%
  mutate(season = 1617)
l1_1516_sel <- l1_1516 %>%
  select(Date:AST) %>%
  mutate(season = 1516)
l1_1415_sel <- l1_1415 %>%
  select(Date:AST) %>%
  mutate(season = 1415)
l1_1314_sel <- l1_1314 %>%
  select(Date:AST) %>%
  mutate(season = 1314)
l1_1317 <- bind_rows(l1_1617_sel, l1_1516_sel, l1_1415_sel, l1_1314_sel)
For the first step I have tried something like this but it obviously failed:
dl_l1 <- function(x) {
  df_x <- read.csv("http://www.football-data.co.uk/mmz4281/x/F1.csv", stringsAsFactors = FALSE)
}
dl_l1(1617)
You need to use paste0() to concatenate the pieces of the URL: inside a quoted string, x is literal text, so the value of the argument is never substituted. The code below should work.
dl_l1 <- function(x) {
  read.csv(paste0("http://www.football-data.co.uk/mmz4281/", x, "/F1.csv"), stringsAsFactors = FALSE) %>%
    select(Date:AST) %>%
    mutate(season = x)
}
dl_l1(1617)
#final output
l1_1317 <- bind_rows(dl_l1(1617), dl_l1(1516), dl_l1(1415), dl_l1(1314))
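Equivalently, with purrr (loaded by the tidyverse) you can map over the seasons in one step:
l1_1317 <- purrr::map_dfr(c(1617, 1516, 1415, 1314), dl_l1)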
library(tidyverse)
ids <- as.character(c(1617, 1516, 1415, 1314))
data <- lapply(ids, function(i) {
  read.csv(paste0("http://www.football-data.co.uk/mmz4281/", i, "/F1.csv"), stringsAsFactors = FALSE) %>%
    select(Date:AST) %>%
    mutate(season = i)  # note: season is stored as character here, since ids is character
})
data <- do.call(rbind, data)
I would put a for loop in a function so you can iterate through a vector of numbers:
Create a function football() that takes a number or a vector of numbers and starts from an empty data.frame. For each number in the vector, paste it into the URL, read the CSV, and mutate the year into that data frame; then bind_rows it onto the accumulator. At the end, return football_df, the row-bound combination of all of them; see the code below.
library(dplyr)
football <- function(numbers) {
  football_df <- data.frame()
  for (i in seq_along(numbers)) {
    df <- read.csv(paste0("http://www.football-data.co.uk/mmz4281/", numbers[i], "/F1.csv"), stringsAsFactors = FALSE) %>%
      mutate(year = numbers[i])
    football_df <- bind_rows(football_df, df)
  }
  return(football_df)
}
years <- c(1617, 1415, 1314)
final_df <- football(years)