How to run a changepoint analysis on multiple time series with dplyr in R

I'm collecting time series data from Wikipedia and want to run a change-point analysis on each time series using dplyr. But when I do so I get an error saying the data need to be numeric, even though class() reports that the views column is numeric. Hope you can help.
library(changepoint)
library(dplyr)
library(pageviews)
library(data.table)

articles <- c("Rugby_union", "Football")

foo <- function(x) {
  article_pageviews(project = "en.wikipedia",
                    article = x,
                    start = as.Date("2017-01-01"),
                    end = as.Date("2017-12-31"),
                    user_type = "user",
                    platform = "mobile-web")
}

output <- articles %>% foo

# This step errors, complaining the data need to be numeric
output %>%
  select(article, views) %>%
  do(cpt.mean(.))

class(output$views)

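The error comes from the do() step: select(article, views) hands cpt.mean() a two-column data frame (including the character column article) rather than a numeric vector, which is why it complains even though output$views itself is numeric. One workaround is to split the data by article and run cpt.mean() on each views vector separately: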
library(changepoint)
library(dplyr)
library(pageviews)

articles <- c("Rugby_union", "Football")

foo <- function(x) {
  article_pageviews(project = "en.wikipedia",
                    article = x,
                    start = as.Date("2017-01-01"),
                    end = as.Date("2017-12-31"),
                    user_type = "user",
                    platform = "mobile-web")
}

output <- articles %>% foo

df <- as.data.frame(table(output$article))

output1 <- output %>%
  dplyr::select(article, views) %>%
  dplyr::filter(article == df[1, 1])

output2 <- output %>%
  dplyr::select(article, views) %>%
  dplyr::filter(article == df[2, 1])

# Upper bound on the number of changepoints passed to BinSeg
q <- floor(min(length(output1$views), length(output2$views)) / 2 + 1)

cp1 <- changepoint::cpt.mean(data = output1$views, Q = q,
                             method = "BinSeg", penalty = "SIC")
plot(cp1)

cp2 <- changepoint::cpt.mean(data = output2$views, Q = q,
                             method = "BinSeg", penalty = "SIC")
plot(cp2)
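A tidier alternative, sketched here with the output object from above, is to keep one fitted cpt object per article in a list-column, so the split does not have to be done by hand:

library(changepoint)
library(dplyr)

# One cpt.mean() fit per article, stored in a list-column
results <- output %>%
  group_by(article) %>%
  summarise(cpt = list(cpt.mean(views, method = "BinSeg", penalty = "SIC")))

# Inspect or plot a single fit:
plot(results$cpt[[1]])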

Related

Combine outputs of a function for each index in a for loop in R

I have created a function which completes a calculation based on data from two data frames for an individual.
I want to run that function for each individual, combine all the outputs in a data frame, and export it to .csv.
Currently the output .csv only has data for person 34, none of the others.
I've noticed that when I run this it creates an object ID, which is just the numeric 34. It seems to be holding on to only the last ID in data$ID.
How can I create an output with results for all persons?
library(dplyr)
library(lubridate)
library(date)

screen_function = function(x) {
  # Select each person and get the necessary inputs
  data = data %>% filter(ID == x)
  demogs = demogs %>% filter(P_ID == x)
  data$Age = demogs$Age
  data$result = data$test * data$Age
  data$Date = as.Date(data$Date, format = "%d/%m/%Y")  # ensure the date column is in the correct format

  # Only include tests from the most recent 12-24 months, and only proceed if there is a test in the last 12 months
  Recent_12m = data %>% filter(between(Date, today() - years(1), today()))
  Recent_24m = data %>% filter(between(Date, today() - years(2), today()))

  if (nrow(Recent_12m) > 0) {
    data = rbind(Recent_12m, Recent_24m)
    Recent_12m$min_date = min(Recent_12m$Date)
    Recent_12m$Date = as.Date(Recent_12m$Date, format = "%d/%m/%Y")
    Recent_12m$min_date = as.Date(Recent_12m$min_date, format = "%d/%m/%Y")
    Recent_24m$min_date = min(Recent_24m$Date)
    Recent_24m$Date = as.Date(Recent_24m$Date, format = "%d/%m/%Y")
    Recent_24m$min_date = as.Date(Recent_24m$min_date, format = "%d/%m/%Y")
    Recent_12m$Period = interval(Recent_12m$min_date, Recent_12m$Date)
    Recent_12m$Years = as.numeric(Recent_12m$Period, unit = "years")
    Recent_24m$Period = interval(Recent_24m$min_date, Recent_24m$Date)
    Recent_24m$Years = as.numeric(Recent_24m$Period, unit = "years")

    # Latest result
    Last = filter(Recent_12m, Date == max(Date))

    # Linear regression models
    Reg_12month <- lm(result ~ Years, data = Recent_12m)
    Reg_24month <- lm(result ~ Years, data = Recent_24m)

    info = c(x,
             round(Last$result, digits = 1),
             round(Reg_12month$coefficients["Years"], digits = 1),
             round(Reg_24month$coefficients["Years"], digits = 1))
    newdf = data.frame(matrix(0, ncol = 4))
    colnames(newdf) = c("ID", "Latest result", "Trend 12month", "Trend 24 month")
    newdf = rbind(newdf, info)
    write.csv(newdf, "filepath.csv")
  }
}

Date = sample(seq(as.Date("2019/11/01"), as.Date("2020/11/01"), by = "day"), 12)
ID = c(12, 12, 12, 450, 450, 450, 1, 1, 1, 34, 34, 34)
test = rnorm(12, mean = 150, sd = 60)
data = data.frame(ID, Date, test)

P_ID = c(1, 12, 34, 450)
Age = c(50, 45, 60, 72)
demogs = data.frame(P_ID, Age)

persons = unique(data$ID)
for (ID in persons) {
  screen_function(paste("", ID, "", sep = ""))
}
Created on 2020-11-16 by the reprex package (v0.3.0)
So, I've got around this by using a pre-made .csv instead of creating a new data frame. The .csv just contains a single row of 4 columns, with a random entry in each cell.
newdf = read.csv(file = "filepath.csv")
info = c(x,
         round(Last$result, digits = 1),
         round(Reg_12month$coefficients["Years"], digits = 1),
         round(Reg_24month$coefficients["Years"], digits = 1))
newdf = rbind(newdf, info)
colnames(newdf) = c("ID", "Latest result", "Trend 12month", "Trend 24 month")
newdf = distinct(newdf, ID, .keep_all = TRUE)
write.csv(newdf, "filepath.csv", row.names = FALSE)
}
}
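The root cause, for what it's worth, is that write.csv() inside screen_function() overwrites filepath.csv on every call, so only the last person processed (ID 34, the last element of persons) survives; newdf is also rebuilt from scratch on each call. A cleaner pattern is to have the function return its one-row result and to bind and write once at the end. A minimal sketch, assuming screen_function() is modified to end with return(newdf) instead of write.csv():

library(dplyr)

# Collect one result per person, then bind and write a single file
results <- bind_rows(lapply(persons, screen_function))
write.csv(results, "filepath.csv", row.names = FALSE)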

Error in is.data.frame(.l) : object 'group' not found

Not sure if you will be able to help me without reproducible example data, but I have a problem running the code below. I am attempting to use the multidplyr package, but it doesn't seem to find my columns:
library(parallel)     # detectCores()
library(multidplyr)
library(dplyr)
library(tidyr)
library(purrr)
# (bookings_prep, pipeline_prep_, closed_pipeline, biz_day, rmf and fc_xreg come from the poster's environment)

cl <- detectCores()
cl

models_prep <- bookings_prep %>%
  inner_join(pipeline_prep_, by = c("booking_type", "group")) %>%
  crossing(biz_day) %>%
  left_join(closed_pipeline, by = c("booking_type", "group")) %>%
  select(-opportunity_forecast_category)

group1 <- rep(1:cl, length.out = nrow(models_prep))
models_prep1 <- bind_cols(tibble(group1), models_prep)

cluster <- new_cluster(cl)
cluster %>% cluster_library("tidyr")
cluster %>% cluster_library("purrr")
cluster %>% cluster_library("plyr")
cluster %>% cluster_library("dplyr")
cluster_copy(cluster, "rmf")
cluster_copy(cluster, "fc_xreg")
# cluster_assign(cluster, "rmf")
# cluster_copy(cluster, c("rmf", "fc_xreg"))

by_group <- models_prep %>%
  group_by(group) %>%
  partition(cluster)

by_group1 <- models_prep1 %>%
  group_by(group1) %>%
  partition(cluster)

models <- by_group %>%
  mutate(
    xreg_arima = pmap(list(data = pipeline, h = 1, name = group,
                           bookings = bookings, type = booking_type,
                           biz_day = biz_day, no_bookings = no_bookings,
                           sparse_pipeline = sparse_pipeline,
                           closed_forecast_cat = pipeline_amount,
                           FUN = "fc_xreg"), rmf))
Everything up to the models <- assignment runs correctly, but that step fails saying it cannot find the object group.
Sometimes arguments just need to be quoted, particularly in dplyr-ish situations.
models <- by_group %>%
  mutate(
    xreg_arima = pmap(list(data = pipeline, h = 1, name = "group",
                           bookings = "bookings", type = "booking_type",
                           biz_day = "biz_day", no_bookings = "no_bookings",
                           sparse_pipeline = "sparse_pipeline",
                           closed_forecast_cat = "pipeline_amount",
                           FUN = "fc_xreg"), rmf))
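As a toy illustration of what quoting changes (hypothetical data, no multidplyr involved): a bare name inside list() is evaluated as a column of the data frame, while a quoted name is passed to the function as a plain string.

library(dplyr)
library(purrr)

df <- tibble(group = c("a", "b"))

# Bare name: pmap() receives the values of the group column
df %>% mutate(out = pmap_chr(list(g = group), function(g) paste0("got ", g)))

# Quoted name: pmap() receives the literal string "group"
df %>% mutate(out = pmap_chr(list(g = "group"), function(g) paste0("got ", g)))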

Append to dataframe through for loop and function

When I run the code with a save-to-csv step at the end, everything works: I filter my dataset by country, do some computations, and save the result as a per-country file.
But when I try to rbind my datasets instead, I get nothing back. I have tried different solutions that I found, but nothing works; clearly I do not understand why I cannot rbind.
The code:
library(dplyr)
library(readxl)

setwd("Z:/Reporting_Private/Tableau")
dataupl <- read_excel("Analysis Map_Tableau - Ready.xlsm", sheet = "Data")
df = dataupl

# Select the right columns
df = df[, 1:6]
colnames(df) = c("Office", "Employee", "Territiry", "Sales", "Leads", "Act")

# Change NA to zero
df[is.na(df)] = 0

countries = df %>% select(Office) %>% distinct()
countries = as.data.frame(countries)

engine <- function(input) {
  df = df %>% filter(Office == input)
  SCALESALES = scale(df$Sales)
  SCALELEADS = scale(df$Leads)
  SCALEACT = scale(df$Act)
  df = df %>% mutate(SCALESALES = SCALESALES, SCALELEADS = SCALELEADS, SCALEACT = SCALEACT)
  df$SLegend = ave(df$Sales, df$SalesLegend, FUN = min)
  df$SLegend = ifelse(df$SLegend > 0, df$SLegend, 0)
  df$LLegend = ave(df$Leads, df$LeadsLegend, FUN = min)
  df$ALegend = ave(df$Act, df$ActLegend, FUN = min)
  # write.csv(final, file = paste0(input, ".csv"))
  dftotal = data.frame()
  dftotal = rbind(dftotal, df)
}

for (i in 1:nrow(countries)) {
  input = countries[i, ]
  engine(input)
}
You get nothing because dftotal is re-created as an empty data frame inside engine() on every call, and the for loop discards the function's return value, so nothing ever accumulates. It seems that you can split your data frame with L <- split(df, df$Office) and then lapply(L, ...). Instead of write.csv(...), you have to return the data frame from your function: return(df).
So, something like this:
engine <- function(dfi) {
  SCALESALES = scale(dfi$Sales)
  SCALELEADS = scale(dfi$Leads)
  SCALEACT = scale(dfi$Act)
  dfi = dfi %>% mutate(SCALESALES = SCALESALES, SCALELEADS = SCALELEADS, SCALEACT = SCALEACT)
  dfi$SLegend = ave(dfi$Sales, dfi$SalesLegend, FUN = min)
  dfi$SLegend = ifelse(dfi$SLegend > 0, dfi$SLegend, 0)
  dfi$LLegend = ave(dfi$Leads, dfi$LeadsLegend, FUN = min)
  dfi$ALegend = ave(dfi$Act, dfi$ActLegend, FUN = min)
  return(dfi)
}

L <- split(df, df$Office)
Lnew <- lapply(L, engine)

dftotal <- Lnew[[1]]
for (i in 2:length(Lnew)) dftotal <- rbind(dftotal, Lnew[[i]])
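As a side note, the final rbind loop can be collapsed into a single call; both forms below work on a list of data frames:

dftotal <- do.call(rbind, Lnew)
# or, with dplyr loaded:
dftotal <- bind_rows(Lnew)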

How to fill a vector from a for-loop containing a numeric sequence with a given step?

I have the following piece of code:
library(dplyr)

Q = 10000
span = 1995:2016
time = rep(span, times = Q, each = Q)
id = rep(1:Q, times = length(span))
s1 = rep(rnorm(Q, 0, 1), times = length(span))
gdp = rep(rnorm(Q, 0, 1), times = length(span))
e = rep(rnorm(Q, 0, 1), times = length(span))
dfA = data.frame(id, time, s1, e, gdp)

mgr = double()
stp = 10

for (K in seq(10, Q, stp)) {
  gr = double()
  for (t in span) {
    # note: the joins below use a "name" column that exists in the real data
    # but not in this example dfA
    wt1 = dfA %>% filter(time == t - 1) %>%
      arrange(desc(s1)) %>% mutate(w = s1 / gdp)
    zt1 = dfA %>% filter(time == t - 1) %>% mutate(z1 = log(s1 / e))
    zt = dfA %>% filter(time == t) %>% mutate(z = log(s1 / e))
    gt = left_join(zt1, zt, by = "name") %>%
      mutate(g = z - z1) %>% select(name, g) %>% na.omit()
    a = left_join(wt1, gt, by = "name") %>% na.omit()
    a = a %>% mutate(id = 1:length(a$name)) %>%
      filter(id <= Q) %>% mutate(gbar = mean(g)) %>%
      filter(id <= K) %>% mutate(sck = g - gbar,
                                 gamma = w * sck)
    gr = append(gr, sum(a$gamma))
  }
  mgr = append(mgr, mean(gr))
}
where dfA is a data frame containing an id variable and a time variable, among others. Since the time variable ranges from 1995 to 2016 and K is a sequence with step 10, I resorted to append() to store gr and mgr, respectively. The problem is that it takes too long to compute.
So my question is: Is there any way to avoid using append() to fill the vectors gr and mgr and thus reduce the time spent to compute the code?
You could initialize the 'gr' and 'mgr' vectors with a set length rather than starting them as zero-length doubles and having R grow them on every iteration. The advantage is that the memory for each vector is allocated up front, so R does not have to copy and re-extend the whole vector each time append() is called.
## Initialize vectors with a fixed length
mgr <- double(length = length(seq(10, Q, stp)))
gr <- double(length = length(span))   # 22 years, 1995:2016

# Fill positions by index in each iteration
outerIteration <- (K - 10) / stp + 1
innerIteration <- t - 1994
gr[innerIteration] <- sum(a$gamma)

# After the inner loop, take the mean of the 22 yearly values for this K
mgr[outerIteration] <- mean(gr)
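Slotting those pieces into the original loops gives a skeleton like the following (the dplyr pipeline that builds a inside the inner loop is unchanged from the question and elided here):

mgr <- double(length(seq(10, Q, stp)))  # one slot per value of K
k_idx <- 0

for (K in seq(10, Q, stp)) {
  k_idx <- k_idx + 1
  gr <- double(length(span))            # one slot per year
  for (t in span) {
    # ... same dplyr pipeline as in the question, producing a ...
    gr[t - 1994] <- sum(a$gamma)
  }
  mgr[k_idx] <- mean(gr)
}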

Create line density per polygon using tidyverse + sf R

I have a GIS question that has been stumping me for some time now. The end goal is to extract the density of lines per pixel/voxel/polygon using the tidyverse/sf packages. As of now I have a function that works when I execute it line by line, but not as a function. The ultimate goal would be to use this function in sfLapply() from the snowfall package so it runs in parallel. Any help getting this to work as a function would be greatly appreciated! The data involved can be found here:
https://www.dropbox.com/s/zg9o2b5x4wizafo/hexagons.gpkg?dl=0
https://www.dropbox.com/s/x2gxx36pjkutxzm/railroad_lines.gpkg?dl=0
The function that I created, which, again, works line by line but not as a function, is as follows:
length_in_poly <- function(fishnet, spatial_lines) {
  require(sf)
  require(tidyverse)
  require(magrittr)
  fishnet <- st_as_sf(do.call(rbind, fishnet))
  spatial_lines <- st_as_sf(do.call(rbind, spatial_lines))
  fish_length <- list()
  for (i in 1:nrow(fishnet)) {
    split_lines <- spatial_lines %>%
      st_cast(., "MULTILINESTRING", group_or_split = FALSE) %>%
      st_intersection(., fishnet[i, ]) %>%
      mutate(lineid = row_number())
    fish_length[[i]] <- split_lines %>%
      mutate(length = sum(st_length(.)))
  }
  fish_length <- do.call(rbind, fish_length) %>%
    group_by(hexid4k) %>%
    summarize(length = sum(length))
  fishnet <- fishnet %>%
    st_join(., fish_length, join = st_intersects) %>%
    mutate(hexid4k = hexid4k.x,
           length = ifelse(is.na(length), 0, length),
           pixel_area = as.numeric(st_area(geom)),
           density = length / pixel_area)
}
To prep the data:
library(sf)
library(tidyverse)
library(snowfall)
input_hexagons <- st_read("hexagons.gpkg")
input_rail_lines <- st_read("railroad_lines.gpkg")
Using some code from here:
faster_as_tibble <- function(x) {
  structure(x, class = c("tbl_df", "tbl", "data.frame", "sfc"),
            row.names = as.character(seq_along(x[[1]])))
}

split_fast_tibble <- function(x, f, drop = FALSE, ...) {
  lapply(split(x = seq_len(nrow(x)), f = f, ...),
         function(ind) faster_as_tibble(lapply(x, "[", ind)))
}
Create a state-wise list:
sub_hexnet <- split_fast_tibble(input_hexagons, input_hexagons$STUSPS) %>%
  lapply(st_as_sf)
Finally, to run just as a single-core process:
test <- lapply(fishnet = as.list(sub_hexnet),
               FUN = length_in_poly,
               spatial_lines = input_rail_lines)
Or, in the perfect world, a multi-core process:
sfInit(parallel = TRUE, cpus = parallel::detectCores())
sfExport(list = c("sub_hexnet", "mask_rails"))
extractions <- sfLapply(fishnet = sub_hexnet,
                        fun = length_in_poly,
                        spatial_lines = input_rail_lines)
sfStop()
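(As an aside, lapply()'s first argument is the list itself, so passing it by name as fishnet = sends it into ... and leaves X missing; the same holds for sfLapply() and its first argument x. A call matching the signature would look like lapply(sub_hexnet, length_in_poly, spatial_lines = input_rail_lines).)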
Thanks in advance for any help - I am completely stumped!
After messing around with this for a while I finally figured out a solution.
The key helper functions used:
load_data <- function(url, dir, layer, outname) {
  file <- paste0(dir, "/", layer, ".shp")
  if (!file.exists(file)) {
    download.file(url, destfile = paste0(dir, ".zip"))
    unzip(paste0(dir, ".zip"), exdir = dir)
    unlink(paste0(dir, ".zip"))
  }
  name <- paste0(outname, "_shp")  # note: immediately overwritten by st_read() below
  name <- sf::st_read(dsn = dir, layer = layer)
  name
}
get_density <- function(x, grids, lines) {
  require(tidyverse)
  require(lubridate)
  require(sf)
  sub_grids <- grids %>%
    dplyr::filter(hexid4k == x)
  single_lines_hexid <- lines %>%
    dplyr::filter(hexid4k == x) %>%
    sf::st_intersection(., sub_grids) %>%
    dplyr::select(hexid4k, STUSPS) %>%
    dplyr::mutate(length_line = st_length(.),
                  length_line = ifelse(is.na(length_line), 0, length_line))
  sub_grids <- sub_grids %>%
    sf::st_join(., single_lines_hexid, join = st_intersects) %>%
    dplyr::mutate(hexid4k = hexid4k.x) %>%
    dplyr::group_by(hexid4k) %>%
    dplyr::summarize(length_line = sum(length_line)) %>%
    dplyr::mutate(pixel_area = as.numeric(st_area(geom)),
                  density = length_line / pixel_area) %>%
    dplyr::select(hexid4k, length_line, density, pixel_area)
  return(sub_grids)
}
Prep the input data:
# p4string_ea (an equal-area projection string) is defined elsewhere in the
# original script; library(sp) is needed for spsample() and HexPoints2SpatialPolygons()
library(sp)

usa_shp <- load_data(url = "https://www2.census.gov/geo/tiger/GENZ2016/shp/cb_2016_us_state_20m.zip",
                     dir = "data",
                     layer = "cb_2016_us_state_20m",
                     outname = "usa") %>%
  sf::st_transform(p4string_ea) %>%
  dplyr::filter(!STUSPS %in% c("HI", "AK", "PR"))
usa_shp$STUSPS <- droplevels(usa_shp$STUSPS)

hex_points <- spsample(as(usa_shp, "Spatial"), type = "hexagonal", cellsize = 4000)
hex_grid <- HexPoints2SpatialPolygons(hex_points, dx = 4000)

hexnet_4k <- st_as_sf(hex_grid) %>%
  mutate(hexid4k = row_number()) %>%
  st_intersection(., st_union(usa_shp)) %>%
  st_join(., usa_shp, join = st_intersects) %>%
  dplyr::select(hexid4k, STUSPS)

transmission_lines_hex <- load_data(url = "https://hifld-dhs-gii.opendata.arcgis.com/datasets/75af06441c994aaf8e36208b7cd44014_0.zip",
                                    dir = "data",
                                    layer = "Electric_Power_Transmission_Lines",
                                    outname = "tl") %>%
  dplyr::select(LINEARID, STUSPS) %>%
  st_join(., hexnet_4k, join = st_intersects) %>%
  mutate(STUSPS = STUSPS.x) %>%
  dplyr::select(LINEARID, hexid4k, STUSPS)
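With the inputs prepared, a single cell can be sanity-checked directly; the id below is hypothetical, any value present in hexnet_4k$hexid4k works:

one_cell <- get_density(x = 1, grids = hexnet_4k, lines = transmission_lines_hex)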
The resulting parallel process is as follows:
hexnet_list <- hexnet_4k %>%
  split(., .$STUSPS)

sfInit(parallel = TRUE, cpus = num_cores)  # num_cores: defined elsewhere, e.g. parallel::detectCores()
sfExport("transmission_lines_hex")
sfSource("src/functions/helper_functions.R")

# sfLapply() (rather than plain lapply()) so the state-wise chunks actually run on the cluster
transmission_lines_density <- sfLapply(hexnet_list,
  function(input_list) {
    require(tidyverse)
    require(magrittr)
    require(lubridate)
    require(sf)
    sub_grid <- dplyr::bind_cols(input_list)
    unique_ids <- unique(sub_grid$hexid4k)
    state_name <- unique(sub_grid$STUSPS)[1]
    print(paste0("Working on ", state_name))
    got_density <- lapply(unique_ids,
                          FUN = get_density,
                          grids = sub_grid,
                          lines = transmission_lines_hex)
    print(paste0("Finishing ", state_name))
    return(got_density)
  })
sfStop()
I hope some of this may be useful to you and, as always, suggestions on optimization would be welcomed.
