Interaction and first differences in Zelig - R

I have a dataset with this structure:
# libraries
library(Zelig) # 5.0-12
library(data.table)
# create data
time <- factor(rep(-12:12, 50))
treatment <- rbinom(length(time), 1, .75)
outcome <- rnorm(length(time), 1, 3) + 3 * treatment
dat <- data.table(outcome, time, treatment)
dat
outcome time treatment
1: 5.2656458 -12 0
2: 4.8888805 -11 1
3: 2.6322592 -10 1
4: 8.2449092 -9 1
5: 0.5752739 -8 0
---
1246: 2.1865924 8 0
1247: 1.6028838 9 1
1248: 2.4056725 10 1
1249: 2.0257008 11 1
1250: 6.1503307 12 1
I run a LS model interacting time and treatment:
z <- zls$new()
z$zelig(outcome ~ time * treatment, data = dat)
summary(z)
Here is a trimmed output:
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.40264 0.71552 3.358 0.00081
time-11 -1.61292 1.08177 -1.491 0.13622
time-10 -1.03283 0.99850 -1.034 0.30116
time-9 -1.47934 1.02667 -1.441 0.14987
time-8 -0.35614 1.02667 -0.347 0.72874
time-7 -1.05803 1.04304 -1.014 0.31061
time-6 -2.25316 1.16178 -1.939 0.05269
....
treatment 1.28097 0.89440 1.432 0.15234
time-11:treatment 2.86965 1.30927 2.192 0.02859
time-10:treatment 1.69479 1.25788 1.347 0.17813
time-9:treatment 1.78684 1.27330 1.403 0.16078
time-8:treatment 0.82332 1.27330 0.647 0.51801
time-7:treatment 1.62808 1.28334 1.269 0.20482
time-6:treatment 2.64653 1.36895 1.933 0.05344
time-5:treatment 3.08572 1.36895 2.254 0.02437
....
I would like to estimate the first differences (treatment = 1 vs. treatment = 0) at each time point so that I can plot the effects by time.
Any ideas?
Thank you in advance

Here is a solution using a loop.
m <- zelig(outcome ~ time * treatment, model = "ls", data = dat)
output <- NULL
for (i in unique(dat$time)) {
  t0 <- setx(m, treatment = 0, time = i)    # control scenario
  t1 <- setx(m, treatment = 1, time = i)    # treatment scenario
  ss <- sim(m, x = t0, x1 = t1, num = 10000)
  fd <- unlist(ss$sim.out[["x1"]][["fd"]])  # simulated first differences
  r <- data.table(time = i, mean = mean(fd),
                  low = quantile(fd, .025), high = quantile(fd, .975))
  output <- rbind(output, r)
}
output
time mean low high
1: -12 1.506365 -0.30605416 3.347631
2: -11 1.013915 -0.83479749 2.817791
3: -10 2.673004 0.72371241 4.645537
4: -9 1.291547 -0.62162353 3.183365
5: -8 2.985348 0.59834003 5.351312
6: -7 3.911258 1.95825840 5.878157
7: -6 4.222870 1.86773822 6.567400
8: -5 3.152967 0.81620039 5.483884
9: -4 3.893867 1.77629999 6.003647
10: -3 2.319123 0.35445149 4.278032
11: -2 1.942848 0.03771276 3.844245
12: -1 3.879313 1.92915419 5.852765
13: 0 1.388601 -0.93881332 3.703387
14: 1 3.576107 1.54679622 5.567298
15: 2 2.413652 0.58863014 4.225094
16: 3 2.160988 0.03251586 4.266438
17: 4 2.203825 0.28985053 4.080361
18: 5 4.445642 2.40569051 6.510071
19: 6 1.504513 -0.27797349 3.251175
20: 7 2.542558 0.77794333 4.269277
21: 8 2.682681 0.93322199 4.449863
22: 9 4.271228 2.39189897 6.137469
23: 10 2.540004 0.66875643 4.454354
24: 11 3.454584 1.54938921 5.340096
25: 12 3.682521 1.85539403 5.501669
time mean low high
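The output table can then be fed straight into ggplot2 to plot the effects by time. A minimal sketch, assuming the output object from the loop above (as.character() is needed because time was stored as a factor):
library(ggplot2)
ggplot(output, aes(x = as.integer(as.character(time)), y = mean)) +
  geom_pointrange(aes(ymin = low, ymax = high)) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(x = "time", y = "first difference (treatment 1 vs. 0)")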

Possible to avoid a FOR loop in this very simple R code?

The answers below are very helpful, but I oversimplified my original question. I figured I would learn more if I oversimplified and then adapted the answers to my actual need, but now I am stuck. There are other factors that drive the amortization; see the more complete code below. I like the response using "amort$end_bal <- begin_bal * (1 - mpr)^amort$period" and "amort$pmt <- c(0, diff(amort$end_bal))* -1", but in addition npr increases the ending balances and ch_off decreases them. Here's the more complete code:
n_periods <- 8
begin_bal <- 10000
yld <- .20
npr <- .09
mpr <- .10
co <- .10
period <- seq(0, n_periods, 1)
fin <- 0
pur <- 0
pmt <- 0
ch_off <- 0
end_bal <- begin_bal
for (i in 1:n_periods) {
  fin[i+1] <- end_bal[i] * yld/12
  pur[i+1] <- end_bal[i] * npr
  pmt[i+1] <- end_bal[i] * mpr
  ch_off[i+1] <- end_bal[i] * co/12
  end_bal[i+1] <- end_bal[i] + pur[i+1] - pmt[i+1] - ch_off[i+1]
}
amort <- data.frame(period, fin, pur, pmt, ch_off, end_bal)
Which gives the correct output below:
print(amort,row.names=FALSE)
period fin pur pmt ch_off end_bal
0 0.0000 0.0000 0.0000 0.00000 10000.000
1 166.6667 900.0000 1000.0000 83.33333 9816.667
2 163.6111 883.5000 981.6667 81.80556 9636.694
3 160.6116 867.3025 963.6694 80.30579 9460.022
4 157.6670 851.4020 946.0022 78.83351 9286.588
5 154.7765 835.7929 928.6588 77.38823 9116.334
6 151.9389 820.4700 911.6334 75.96945 8949.201
7 149.1534 805.4281 894.9201 74.57668 8785.132
8 146.4189 790.6619 878.5132 73.20944 8624.072
I'm new to R, and I understand one of its features is matrix/vector manipulation. In the example below I amortize an asset over 8 months, where each payment ("pmt") is 10% ("mpr") of the prior period's balance ("end_bal"). The code below works fine; I used a FOR loop. I understand FOR loops can be slow in large models and that a better solution is to use R's abundant vector/matrix functions, but I didn't know how to do that in my example, since each monthly payment is calculated by referencing the prior period's ending balance.
So my questions are:
Is there a more efficient way to do the below?
How do I replace the 0 for pmt in period 0 with an empty space?
R code:
n_periods <- 8
begin_bal <- 100
mpr <- .10
# Example loan amortization
pmt <- 0
end_bal <- begin_bal
for (i in 1:n_periods) {
  pmt[i+1] <- end_bal[i] * mpr
  end_bal[i+1] <- end_bal[i] - pmt[i+1]
}
amort <- data.frame(period = 0:n_periods, pmt, end_bal)
amort
Results, which are correct:
> amort
period pmt end_bal
1 0 0.000000 100.00000
2 1 10.000000 90.00000
3 2 9.000000 81.00000
4 3 8.100000 72.90000
5 4 7.290000 65.61000
6 5 6.561000 59.04900
7 6 5.904900 53.14410
8 7 5.314410 47.82969
9 8 4.782969 43.04672
Use R's vectorised calculations
n_periods <- 8
begin_bal <- 100
mpr <- .10
amort <- data.frame(period = seq(0, n_periods, 1))
amort$end_bal <- begin_bal * (1 - mpr)^amort$period
amort$pmt <- c(0, diff(amort$end_bal))* -1
amort
#> period end_bal pmt
#> 1 0 100.00000 0.000000
#> 2 1 90.00000 10.000000
#> 3 2 81.00000 9.000000
#> 4 3 72.90000 8.100000
#> 5 4 65.61000 7.290000
#> 6 5 59.04900 6.561000
#> 7 6 53.14410 5.904900
#> 8 7 47.82969 5.314410
#> 9 8 43.04672 4.782969
Created on 2021-05-12 by the reprex package (v2.0.0)
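Regarding the second question (a blank instead of the 0 for pmt in period 0): a numeric column cannot hold an empty string, so the usual options are an NA, or converting the column to character for display purposes. A sketch:
# NA prints instead of 0 (column stays numeric)
amort$pmt[1] <- NA
# or, for a truly blank cell (display only; column becomes character):
# amort$pmt <- ifelse(amort$period == 0, "", format(amort$pmt))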
A dplyr way for a different case (say):
n_periods <- 15
begin_bal <- 1000
mpr <- .07
library(dplyr)
seq(0, n_periods, 1) %>%
  as.data.frame() %>%
  setNames('period') %>%
  mutate(end_bal = begin_bal * (1 - mpr)^period,
         pmt = -1 * c(0, diff(end_bal)))
#> period end_bal pmt
#> 1 0 1000.0000 0.00000
#> 2 1 930.0000 70.00000
#> 3 2 864.9000 65.10000
#> 4 3 804.3570 60.54300
#> 5 4 748.0520 56.30499
#> 6 5 695.6884 52.36364
#> 7 6 646.9902 48.69819
#> 8 7 601.7009 45.28931
#> 9 8 559.5818 42.11906
#> 10 9 520.4111 39.17073
#> 11 10 483.9823 36.42878
#> 12 11 450.1035 33.87876
#> 13 12 418.5963 31.50725
#> 14 13 389.2946 29.30174
#> 15 14 362.0439 27.25062
#> 16 15 336.7009 25.34308
Created on 2021-05-12 by the reprex package (v2.0.0)
Though the OP has asked another question in the edited scenario, here's the suggested approach (for future reference):
n_periods <- 8
begin_bal <- 10000
yld <- .20
npr <- .09
mpr <- .10
co <- .10
library(dplyr)
seq(0, n_periods, 1) %>%
  as.data.frame() %>%
  setNames('period') %>%
  mutate(end_bal = begin_bal * (1 - (mpr + co/12 - npr))^period,
         fin = c(0, (end_bal * yld/12)[-nrow(.)]),
         pur = c(0, (end_bal * npr)[-nrow(.)]),
         pmt = c(0, (end_bal * mpr)[-nrow(.)]),
         ch_off = c(0, (end_bal * co/12)[-nrow(.)]))
#> period end_bal fin pur pmt ch_off
#> 1 0 10000.000 0.0000 0.0000 0.0000 0.00000
#> 2 1 9816.667 166.6667 900.0000 1000.0000 83.33333
#> 3 2 9636.694 163.6111 883.5000 981.6667 81.80556
#> 4 3 9460.022 160.6116 867.3025 963.6694 80.30579
#> 5 4 9286.588 157.6670 851.4020 946.0022 78.83351
#> 6 5 9116.334 154.7765 835.7929 928.6588 77.38823
#> 7 6 8949.201 151.9389 820.4700 911.6334 75.96945
#> 8 7 8785.132 149.1534 805.4281 894.9201 74.57668
#> 9 8 8624.072 146.4189 790.6619 878.5132 73.20944
Created on 2021-05-13 by the reprex package (v2.0.0)
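The loop's balance update is end_bal[i+1] = end_bal[i] * (1 + npr - mpr - co/12), which is why the closed form above works. As a sanity check, the closed-form balances can be compared with the loop output from the question (a sketch, assuming the amort from the more complete loop is still in the workspace):
g <- 1 + npr - mpr - co/12              # per-period balance growth factor
end_bal_closed <- begin_bal * g^(0:n_periods)
all.equal(end_bal_closed, amort$end_bal)
#> [1] TRUE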
If you are "lazy" (don't want to formulate the general expression of pmt and end_bal), you can define a recursive function f like blow
f <- function(k) {
  if (k == 1) {
    return(data.frame(pmt = 100 * mpr, end_bal = 100))
  }
  u <- f(k - 1)
  end_bal <- with(tail(u, 1), end_bal - pmt)
  pmt <- mpr * end_bal
  rbind(u, data.frame(pmt, end_bal))
}
n_periods <- 8
res <- transform(
  cbind(period = 0:n_periods, f(n_periods + 1)),
  pmt = c(0, head(pmt, -1))
)
and you will see
> res
period pmt end_bal
1 0 0.000000 100.00000
2 1 10.000000 90.00000
3 2 9.000000 81.00000
4 3 8.100000 72.90000
5 4 7.290000 65.61000
6 5 6.561000 59.04900
7 6 5.904900 53.14410
8 7 5.314410 47.82969
9 8 4.782969 43.04672
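Another loop-free option for this kind of state-dependent recurrence is Reduce() with accumulate = TRUE, which carries the balance forward one period at a time and keeps every intermediate value; a sketch for the simple case:
n_periods <- 8
begin_bal <- 100
mpr <- .10
# accumulate = TRUE returns the full sequence of balances, not just the last
end_bal <- Reduce(function(bal, i) bal * (1 - mpr),
                  seq_len(n_periods), init = begin_bal, accumulate = TRUE)
data.frame(period = 0:n_periods, pmt = c(0, -diff(end_bal)), end_bal)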

For loop with a function for a moving/rolling average?

Essentially (in R), I want to apply a moving-average function over a period of time (e.g. date and time variables) to see how a particular metric changes over time. However, the metric itself is a function. The scores can be 1 (pro), 0 (neutral), or -1 (neg). The function for the metric is:
metric <- function(pro, neg, total) {
  x <- (pro / total) * 100   # percentage of 1s
  y <- (neg / total) * 100   # percentage of -1s
  x - y
}
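For example (the function is named metric above for illustration), 6 pros and 2 negs out of 10 scores give 60 - 20 = 40:
metric(pro = 6, neg = 2, total = 10)
# [1] 40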
So the percentage of 1's minus the percentage of -1's is the metric value.
Given timestamps for each recorded score, I want to evaluate the metric as a moving average across all rows. I assumed that a for loop would be the best way to apply this, but I am stuck on how to do it.
Does anyone have any thoughts / advice?
As mentioned in the comments, rollapply() from zoo is a good option. I took the liberty of generating some example data; apologies if it doesn't resemble yours.
library(zoo)
f <- function(x, l) {
  p <- sum(x == 1) / l
  n <- sum(x == -1) / l
  (p - n) * 100
}
# Or more efficiently: with scores in {-1, 0, 1}, sum(x) equals the
# count of 1s minus the count of -1s
f <- function(x, l = length(x)) {
  (sum(x) / l) * 100
}
set.seed(1)
N <- 25
dtf <- data.frame(time = as.Date(15000 + (1:N), origin = "1970-01-01"),
                  score = sample(-1:1, N, rep = TRUE))
score <- read.zoo(dtf)
l <- 8
zts <- cbind(score, rolling=rollapply(score, l, f, l, fill=NA))
zts
# score rolling
# 2011-01-27 -1 NA
# 2011-01-28 0 NA
# 2011-01-29 0 NA
# 2011-01-30 1 12.5
# 2011-01-31 -1 25.0
# 2011-02-01 1 12.5
# 2011-02-02 1 0.0
# 2011-02-03 0 -25.0
# 2011-02-04 0 0.0
# 2011-02-05 -1 -12.5
# 2011-02-06 -1 -12.5
# 2011-02-07 -1 -12.5
# 2011-02-08 1 0.0
# 2011-02-09 0 25.0
# 2011-02-10 1 37.5
# 2011-02-11 0 62.5
# 2011-02-12 1 62.5
# 2011-02-13 1 50.0
# 2011-02-14 0 37.5
# 2011-02-15 1 25.0
# 2011-02-16 1 0.0
# 2011-02-17 -1 NA
# 2011-02-18 0 NA
# 2011-02-19 -1 NA
# 2011-02-20 -1 NA
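Note that rollapply() centers the window by default; for a trailing moving average that only uses the current and past scores, align = "right" can be passed through:
# trailing 8-observation window instead of a centered one
zts_trail <- cbind(score, rolling = rollapply(score, l, f, l, fill = NA, align = "right"))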

Rolling subsetting of a time series data frame based on index values (time points), not number of observations, in R

I have a data frame with one timescale and multiple time series. I would like to subset it in a dynamically rolling way with the width of the window set to e.g. 10 yrs. Because the time series is not equidistantly sampled the number of rows in the window will change as it rolls along the data frame.
The calculations should be done based on time values and not number of observations.
For example, in the following data.frame:
time var1 var2
5262 -8.981 -0.011
5263.2 -8.993 -0.012
5264.4 -8.978 0.015
5265.6 -9.169 -0.191
5266.8 -8.897 0.272
5268 -9.024 -0.127
5269.2 -8.996 0.028
5270.46 -8.979 0.017
5271.84 -9.004 -0.025
5273.22 -9.01 -0.006
5274.6 -9.106 -0.096
5275.98 -8.971 0.135
5277.36 -8.996 -0.025
5278.74 -8.956 0.04
5280.12 -8.981 -0.025
5281.5 -8.982 -0.001
5282.88 -9.042 -0.06
5284.26 -9.091 -0.049
5285.64 -9.066 0.025
5287.02 -9.03 0.036
5288.4 -9.031 -0.001
5289.78 -9.028 0.003
5291.16 -9.164 -0.136
5294.72 -9.034 0.13
5297.3 -9.296 -0.262
5299.88 -9.097 0.199
5302.46 -8.995 0.102
5305.04 -9.084 -0.089
5307.62 -9.047 0.037
5310.2 -9.066 -0.019
5312.78 -9.07 -0.004
5315.36 -9 0.07
5317.94 -9.057 -0.057
5320.52 -9.219 -0.162
5323.1 -9.084 0.135
5325.68 -9.034 0.05
5328.26 -9.147 -0.113
5330.84 -9.169 -0.022
5333.42 -9.143 0.026
5336 -9.211 -0.068
5338.58 -9.061 0.15
5341.16 -9.1 -0.039
5343.74 -9.094 0.006
5346.32 -9.104 -0.01
5348.9 -9.089 0.015
5351.48 -9.127 -0.038
5354.06 -8.973 0.154
5356.64 -9.009 -0.036
5359.22 -8.966 0.043
5361.8 -8.996 -0.03
5364.38 -8.877 0.119
5366.96 -8.962 -0.085
5369.54 -8.902 0.06
5372.12 -8.915 -0.013
5374.7 -8.913 0.002
5377.28 -8.834 0.079
5379.86 -8.91 -0.076
5382.44 -8.742 0.168
5385.02 -8.877 -0.135
5387.6 -8.743 0.134
5390.18 -8.898 -0.155
5392.76 -8.77 0.128
5395.34 -8.97 -0.2
5397.92 -8.849 0.121
5400.5 -8.846 0.003
5403.08 -8.865 -0.019
5405.66 -8.865 0
5408.24 -8.876 -0.011
5410.82 -8.775 0.101
5413.4 -8.842 -0.067
5415.98 -8.821 0.021
5418.56 -8.85 -0.029
What I did before was subsetting the df by referring to the row numbers with the following code, and performing a linear regression:
data.column <- 2
time.column <- 1
n_rows <- dim(data)[1]
window <- 10
adj_r_sqr <- matrix(0, nrow = n_rows, ncol = 1)
colnames(adj_r_sqr) <- window
for (i in 1:(n_rows - window)) {
  x <- data[i:(i + window), time.column]
  y <- data[i:(i + window), data.column]
  lmodel <- lm(y ~ x)
  adj_r_sqr[i + floor(window/2) - 1, 1] <- summary(lmodel)$adj.r.squared
}
But this will not account for the varying time intervals.
What I would need is a tweak that screens the data frame based on the first column and subsets it in a rolling way, so that the subset covers the chosen window and gives an NA if the number of rows in that window is < 5.
An additional question could be to subset the data not in a rolling but in a spliced way, again using the time variable.
Previously I managed to extract not only the adj. R², but p-values, slopes, and other statistics as well using:
RMSE <- sqrt(mean((summary(lmodel)$residuals)^2))
p_val_y <- summary(lmodel)$coefficients[2, 4]
p_val_intercept <- summary(lmodel)$coefficients[1, 4]
slope <- summary(lmodel)$coefficients[2, 1]
but with the old windowing; unfortunately I cannot implement these in the query suggested by @Uwe, because of my incompetence.
A test data set can be found on the link below:
test_data.csv
Rolling window
This can be solved by aggregating in a non-equi join which aggregates a varying number of rows which cover a given time period.
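The code below assumes the sample data is available as a data frame dat with columns time, var1 and var2; a minimal sketch for loading the linked file, assuming it sits in the working directory:
# read the sample data (file name taken from the link above)
dat <- read.csv("test_data.csv")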
library(data.table)
# define parameters
time_1 <- -10
time_2 <- 10
n_min <- 5L
# create helper columns
setDT(dat)[, `:=`(join = time, start = time + time_1, end = time + time_2)][
  # non-equi join and aggregate
  dat, on = .(join >= start, join <= end), by = .EACHI, {
    lmodel <- lm(var1 ~ time)
    lsumm <- summary(lmodel)
    .(time = i.time,
      N = .N,
      adj_r_sqr = lsumm$adj.r.squared,
      RMSE = sqrt(mean(lsumm$residuals^2)),
      p_val_y = if (.N > 1) lsumm$coefficients[2, 4] else NA_real_,
      p_val_intercept = lsumm$coefficients[1, 4],
      slope = coef(lmodel)[2]
    )
  }]
join join time N adj_r_sqr RMSE p_val_y p_val_intercept slope
1: 5252.00 5272.00 5262.00 9 -1.412484e-01 0.06749996 0.923658051 0.76541424 8.050483e-04
2: 5253.20 5273.20 5263.20 9 -1.412484e-01 0.06749996 0.923658051 0.76541424 8.050483e-04
3: 5254.40 5274.40 5264.40 10 -1.248329e-01 0.06411770 0.973340896 0.77035143 2.202740e-04
4: 5255.60 5275.60 5265.60 11 -5.914522e-02 0.06631161 0.523022100 0.72831553 -3.713582e-03
5: 5256.80 5276.80 5266.80 12 -8.934954e-02 0.06570860 0.760946792 0.96376799 -1.488205e-03
6: 5258.00 5278.00 5268.00 13 -8.696209e-02 0.06341811 0.845238030 0.82437256 -7.998616e-04
7: 5259.20 5279.20 5269.20 14 -8.098718e-02 0.06260179 0.874476744 0.52663574 5.619086e-04
8: 5260.46 5280.46 5270.46 15 -6.931305e-02 0.06060738 0.765814583 0.39825585 9.125789e-04
9: 5261.84 5281.84 5271.84 16 -5.793624e-02 0.05873142 0.679041724 0.29974544 1.102731e-03
10: 5263.22 5283.22 5273.22 15 -6.443192e-02 0.06113970 0.702430340 0.34981142 1.151509e-03
11: 5264.60 5284.60 5274.60 15 -7.684134e-02 0.06444578 0.975417917 0.56686522 9.651049e-05
12: 5265.98 5285.98 5275.98 15 1.462585e-01 0.04608930 0.088169168 0.30888780 -4.011513e-03
13: 5267.36 5287.36 5277.36 15 1.964299e-02 0.04086278 0.278246773 0.81455430 -2.166657e-03
14: 5268.74 5288.74 5278.74 15 6.215962e-02 0.04008533 0.188319558 0.64301698 -2.594692e-03
15: 5270.12 5290.12 5280.12 15 4.288832e-02 0.04025402 0.224394742 0.72203401 -2.388716e-03
16: 5271.50 5291.50 5281.50 15 1.512386e-01 0.04851035 0.084258193 0.28641439 -4.218427e-03
17: 5272.88 5292.88 5282.88 14 1.381176e-01 0.05002305 0.104568029 0.29409267 -4.558051e-03
18: 5274.26 5294.26 5284.26 13 1.327825e-01 0.05157048 0.120223456 0.28797520 -5.072464e-03
19: 5275.64 5295.64 5285.64 13 3.596767e-01 0.04138389 0.017834204 0.06667444 -6.361901e-03
20: 5277.02 5297.02 5287.02 12 2.873660e-01 0.04305408 0.041945918 0.12120605 -6.247512e-03
21: 5278.40 5298.40 5288.40 12 5.028991e-01 0.05961926 0.005895787 0.01376478 -1.191625e-02
22: 5279.78 5299.78 5289.78 11 4.293597e-01 0.06222915 0.017042382 0.03427994 -1.172318e-02
23: 5281.16 5301.16 5291.16 11 2.515343e-01 0.06779510 0.066374606 0.13205602 -8.296797e-03
24: 5284.72 5304.72 5294.72 9 -1.061158e-01 0.08743787 0.644376920 0.85394202 -2.841506e-03
25: 5287.30 5307.30 5297.30 8 -1.646716e-01 0.09175885 0.922552618 0.87761939 -6.640669e-04
26: 5289.88 5309.88 5299.88 7 1.448103e-02 0.08454602 0.344665870 0.25432610 7.335472e-03
27: 5292.46 5312.46 5302.46 7 -7.071101e-02 0.08530518 0.472291173 0.35934829 5.744740e-03
28: 5295.04 5315.04 5305.04 7 1.737202e-01 0.07307195 0.192955618 0.13615980 9.523810e-03
29: 5297.62 5317.62 5307.62 7 -8.646937e-02 0.03513349 0.502173104 0.25578453 2.200997e-03
30: 5300.20 5320.20 5310.20 7 -1.900736e-01 0.03206618 0.846235515 0.69966581 -5.675526e-04
31: 5302.78 5322.78 5312.78 7 5.536354e-05 0.05732965 0.363144502 0.54063521 -4.969546e-03
32: 5305.36 5325.36 5315.36 7 5.335182e-02 0.05578091 0.299620417 0.45814873 -5.592470e-03
33: 5307.94 5327.94 5317.94 7 -1.657721e-01 0.06294568 0.717357983 0.94666422 -2.090255e-03
34: 5310.52 5330.52 5320.52 7 -6.028327e-02 0.06414699 0.453853207 0.63535624 -4.512735e-03
35: 5313.10 5333.10 5323.10 7 8.740895e-02 0.06389452 0.264987111 0.38753689 -6.949059e-03
36: 5315.68 5335.68 5325.68 7 -1.197917e-01 0.05898249 0.575617471 0.80276014 -3.059247e-03
37: 5318.26 5338.26 5328.26 7 -1.150258e-01 0.05925244 0.564069556 0.78843841 -3.169989e-03
38: 5320.84 5340.84 5330.84 7 -5.959161e-02 0.05513800 0.452664410 0.66756091 -3.889812e-03
39: 5323.42 5343.42 5333.42 7 -1.914314e-01 0.05727449 0.857057704 0.88289558 -9.413068e-04
40: 5326.00 5346.00 5336.00 7 1.980666e-01 0.03842399 0.175981115 0.09097818 5.246401e-03
41: 5328.58 5348.58 5338.58 7 2.435057e-01 0.03768830 0.147555108 0.07564568 5.592470e-03
42: 5331.16 5351.16 5341.16 7 1.506886e-01 0.03812754 0.210258699 0.10821693 4.748062e-03
43: 5333.74 5353.74 5343.74 7 -8.457396e-02 0.04203885 0.498438355 0.28478388 2.657807e-03
44: 5336.32 5356.32 5346.32 7 -7.010584e-02 0.04407872 0.471193766 0.27563851 2.976190e-03
45: 5338.90 5358.90 5348.90 7 3.356662e-01 0.03913660 0.100918300 0.05396996 6.810631e-03
46: 5341.48 5361.48 5351.48 7 5.571886e-01 0.03769020 0.032860686 0.01839957 9.551495e-03
47: 5344.06 5364.06 5354.06 7 5.543731e-01 0.03777052 0.033425890 0.01873703 9.523810e-03
48: 5346.64 5366.64 5356.64 7 6.568462e-01 0.04090850 0.016679391 0.01024635 1.252769e-02
49: 5349.22 5369.22 5359.22 7 4.263635e-01 0.04784925 0.066663521 0.04074626 9.689922e-03
50: 5351.80 5371.80 5361.80 7 2.805982e-01 0.03460683 0.127153338 0.06281590 5.481728e-03
51: 5354.38 5374.38 5364.38 7 3.716517e-01 0.03324983 0.086098144 0.04219813 6.146179e-03
52: 5356.96 5376.96 5366.96 7 1.502772e-01 0.03293035 0.210579418 0.09951647 4.097453e-03
53: 5359.54 5379.54 5369.54 7 3.375742e-01 0.03655294 0.100088760 0.05199627 6.381506e-03
54: 5362.12 5382.12 5372.12 7 -1.007327e-01 0.03472663 0.531670980 0.27614749 2.021041e-03
55: 5364.70 5384.70 5374.70 7 5.143262e-01 0.04270061 0.042184448 0.02514076 1.003599e-02
56: 5367.28 5387.28 5377.28 7 1.180375e-01 0.05043403 0.237081066 0.14603312 5.869324e-03
57: 5369.86 5389.86 5379.86 7 3.526957e-01 0.05256444 0.093690478 0.05958580 9.413068e-03
58: 5372.44 5392.44 5382.44 7 -1.143248e-01 0.06697669 0.562404900 0.40786204 3.599114e-03
59: 5375.02 5395.02 5385.02 7 -1.380467e-01 0.06582006 0.624143540 0.45527492 2.976190e-03
60: 5377.60 5397.60 5387.60 7 -1.437437e-01 0.08277209 0.640981907 0.80007275 -3.557586e-03
61: 5380.18 5400.18 5390.18 7 6.865351e-02 0.07101868 0.283553003 0.39287969 -7.392027e-03
62: 5382.76 5402.76 5392.76 7 -1.557062e-01 0.06968790 0.679826634 0.87463729 -2.643965e-03
63: 5385.34 5405.34 5395.34 7 -5.672857e-02 0.06614976 0.447786680 0.61411355 -4.720377e-03
64: 5387.92 5407.92 5397.92 7 -1.978539e-01 0.05568786 0.928270615 0.68172236 4.568106e-04
65: 5390.50 5410.50 5400.50 7 -1.682576e-01 0.05373328 0.727529980 0.98771567 -1.716501e-03
66: 5393.08 5413.08 5403.08 7 3.659413e-01 0.03870980 0.088336383 0.04816083 7.087486e-03
67: 5395.66 5415.66 5405.66 7 -5.189744e-02 0.02893483 0.439709327 0.19602199 2.104097e-03
68: 5398.24 5418.24 5408.24 7 6.657153e-02 0.02820253 0.285688876 0.12137978 2.920819e-03
69: 5400.82 5420.82 5410.82 7 -3.417829e-02 0.02978965 0.411610972 0.18697576 2.311739e-03
70: 5403.40 5423.40 5413.40 6 -1.689106e-01 0.03205016 0.626213128 0.38442443 1.915836e-03
71: 5405.98 5425.98 5415.98 5 -3.324952e-01 0.03383253 0.968080223 0.75067625 2.325581e-04
72: 5408.56 5428.56 5418.56 4 4.196229e-01 0.01811905 0.217004528 0.29310481 -7.906977e-03
join join time N adj_r_sqr RMSE p_val_y p_val_intercept slope
EDIT: The OP has posted a link to another sample dataset which ran into an error. The reason is that some group sizes are too small, consisting of only one data point, so the linear model has no slope.
The updated version of the code catches this situation and prevents an out-of-bounds error.
The first two columns show the range of years which is covered; they can be removed if no longer needed.
N is the number of rows included in the computation of lm(). The OP has requested to return NA if N < 5. This also can be done afterwards.
# define parameters
time_1 <- -10
time_2 <- 10
n_min <- 5L
# coerce to data.table
result <- setDT(dat)[
  # create helper columns
  , `:=`(join = time, start = time + time_1, end = time + time_2)][
  # non-equi join and aggregate each interval
  dat, on = .(join >= start, join <= end), by = .EACHI, {
    # do computations within interval
    lmodel <- lm(var1 ~ time)
    lsumm <- summary(lmodel)
    # create list of results, finally
    .(time = i.time,
      N = .N,
      adj_r_sqr = lsumm$adj.r.squared,
      RMSE = sqrt(mean(lsumm$residuals^2)),
      p_val_y = if (.N > 1) lsumm$coefficients[2, 4] else NA_real_,
      p_val_intercept = lsumm$coefficients[1, 4],
      slope = coef(lmodel)[2]
    )
  }]
# clean-up result
computed_cols <- setdiff(names(result), c(names(dat), "N"))
result[
  # remove join columns
  , -(1:2)][
  # put NA if too few data points
  N < n_min, (computed_cols) := NA][]
time N adj_r_sqr RMSE p_val_y p_val_intercept slope
1: 5262.00 9 -1.412484e-01 0.06749996 0.923658051 0.76541424 8.050483e-04
2: 5263.20 9 -1.412484e-01 0.06749996 0.923658051 0.76541424 8.050483e-04
3: 5264.40 10 -1.248329e-01 0.06411770 0.973340896 0.77035143 2.202740e-04
...
70: 5413.40 6 -1.689106e-01 0.03205016 0.626213128 0.38442443 1.915836e-03
71: 5415.98 5 -3.324952e-01 0.03383253 0.968080223 0.75067625 2.325581e-04
72: 5418.56 4 NA NA NA NA NA
time N adj_r_sqr RMSE p_val_y p_val_intercept slope
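To see how the fit quality evolves, the rolling statistics can be plotted against time; a minimal ggplot2 sketch, assuming the result table computed above:
library(ggplot2)
# rolling adjusted R^2 over time (rows with N < n_min plot as gaps)
ggplot(result, aes(x = time, y = adj_r_sqr)) +
  geom_line() +
  geom_point() +
  labs(x = "time", y = "rolling adjusted R-squared")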
Splitting by fixed intervals
The OP has also asked
An additional question could be to subset the data, but not in a
rolling, rather in a spliced way again using the time variable.
# define parameters
n_min <- 5L
t_len <- 20
# create "pretty" breaks
breaks <- setDT(dat)[, seq(floor(min(time)/t_len)*t_len, max(time) + t_len, t_len)]
dat[, {
  lmodel <- lm(var1 ~ time)
  lsumm <- summary(lmodel)
  .(t_min = min(time),
    t_max = max(time),
    N = .N,
    adj_r_sqr = lsumm$adj.r.squared,
    RMSE = sqrt(mean(lsumm$residuals^2)),
    p_val_y = if (.N > 1) lsumm$coefficients[2, 4] else NA_real_,
    p_val_intercept = lsumm$coefficients[1, 4],
    slope = coef(lmodel)[2]
  )
}, by = .(cut(time, breaks))]
cut t_min t_max N adj_r_sqr RMSE p_val_y p_val_intercept slope
1: (5.26e+03,5.28e+03] 5262.00 5278.74 14 -0.08098718 0.06260179 0.87447674 0.52663574 0.0005619086
2: (5.28e+03,5.3e+03] 5280.12 5299.88 12 0.33144858 0.06512008 0.02934449 0.06866916 -0.0087040163
3: (5.3e+03,5.32e+03] 5302.46 5317.94 7 -0.19007362 0.03206618 0.84623551 0.69966581 -0.0005675526
4: (5.32e+03,5.34e+03] 5320.52 5338.58 8 -0.16348201 0.06360759 0.90221583 0.62280945 0.0005629384
5: (5.34e+03,5.36e+03] 5341.16 5359.22 8 0.54042068 0.03778046 0.02285248 0.01024298 0.0079272794
6: (5.36e+03,5.38e+03] 5361.80 5379.86 8 0.20369592 0.03803705 0.14585425 0.06090703 0.0043881506
7: (5.38e+03,5.4e+03] 5382.44 5397.92 7 0.06865351 0.07101868 0.28355300 0.39287969 -0.0073920266
8: (5.4e+03,5.42e+03] 5400.50 5418.56 8 -0.04065894 0.02837687 0.42672518 0.14267960 0.0016703581

Trying to add breakpoint lines from strucchange to a plot by "lines" command

This is my first time with strucchange so bear with me. The problem I'm having seems to be that strucchange doesn't recognize my time series correctly but I can't figure out why and haven't found an answer on the boards that deals with this. Here's a reproducible example:
require(strucchange)
# time series
nmreprosuccess <- c(0,0.50,NA,0.,NA,0.5,NA,0.50,0.375,0.53,0.846,0.44,1.0,0.285,
0.75,1,0.4,0.916,1,0.769,0.357)
dat.ts <- ts(nmreprosuccess, frequency=1, start=c(1996,1))
str(dat.ts)
Time-Series [1:21] from 1996 to 2016: 0 0.5 NA 0 NA 0.5 NA 0.5 0.375 0.53 ...
To me this means that the time series looks OK to work with.
# obtain breakpoints
bp.NMSuccess <- breakpoints(dat.ts~1)
summary(bp.NMSuccess)
Which gives:
Optimal (m+1)-segment partition:
Call:
breakpoints.formula(formula = dat.ts ~ 1)
Breakpoints at observation number:
m = 1 6
m = 2 3 7
m = 3 3 14 16
m = 4 3 7 14 16
m = 5 3 7 10 14 16
m = 6 3 7 10 12 14 16
m = 7 3 5 7 10 12 14 16
Corresponding to breakdates:
m = 1 0.333333333333333
m = 2 0.166666666666667 0.388888888888889
m = 3 0.166666666666667
m = 4 0.166666666666667 0.388888888888889
m = 5 0.166666666666667 0.388888888888889 0.555555555555556
m = 6 0.166666666666667 0.388888888888889 0.555555555555556 0.666666666666667
m = 7 0.166666666666667 0.277777777777778 0.388888888888889 0.555555555555556 0.666666666666667
m = 1
m = 2
m = 3 0.777777777777778 0.888888888888889
m = 4 0.777777777777778 0.888888888888889
m = 5 0.777777777777778 0.888888888888889
m = 6 0.777777777777778 0.888888888888889
m = 7 0.777777777777778 0.888888888888889
Fit:
m 0 1 2 3 4 5 6 7
RSS 1.6986 1.1253 0.9733 0.8984 0.7984 0.7581 0.7248 0.7226
BIC 14.3728 12.7421 15.9099 20.2490 23.9062 28.7555 33.7276 39.4522
Here's where I start having the problem. Instead of reporting the actual breakdates it reports numbers, which makes it impossible to plot the break lines onto a graph because they're not at the breakdate (2002) but at 0.333.
plot.ts(dat.ts, main="Natural Mating")
lines(fitted(bp.NMSuccess, breaks = 1), col = 4, lwd = 1.5)
Nothing shows up for me in this graph (I think because it's so small for the scale of the graph).
In addition, when I try fixes that may possibly work around this problem,
fm1 <- lm(dat.ts ~ breakfactor(bp.NMSuccess, breaks = 1))
I get:
Error in model.frame.default(formula = dat.ts ~ breakfactor(bp.NMSuccess, :
variable lengths differ (found for 'breakfactor(bp.NMSuccess, breaks = 1)')
I get errors because of the NA values in the data: the length of dat.ts is 21, while the length of breakfactor(bp.NMSuccess, breaks = 1) is 18 (missing the 3 NAs).
Any suggestions?
The problem occurs because breakpoints() currently can only (a) cope with NAs by omitting them, and (b) cope with times/dates through the ts class. This creates the conflict: when you omit internal NAs from a ts it loses its ts property, and hence breakpoints() cannot infer the correct times.
The "obvious" way around this would be to use a time series class that can cope with this, namely zoo. However, I just never got round to fully integrating zoo support into breakpoints() because it would likely break some of the current behavior.
To cut a long story short: your best choice at the moment is to do the book-keeping about the times yourself and not expect breakpoints() to do it for you. The additional work is not huge. First, we create a data frame with the response and the time vector and omit the NAs:
d <- na.omit(data.frame(success = nmreprosuccess, time = 1996:2016))
d
## success time
## 1 0.000 1996
## 2 0.500 1997
## 4 0.000 1999
## 6 0.500 2001
## 8 0.500 2003
## 9 0.375 2004
## 10 0.530 2005
## 11 0.846 2006
## 12 0.440 2007
## 13 1.000 2008
## 14 0.285 2009
## 15 0.750 2010
## 16 1.000 2011
## 17 0.400 2012
## 18 0.916 2013
## 19 1.000 2014
## 20 0.769 2015
## 21 0.357 2016
Then we can estimate the breakpoint(s) and afterwards transform the "number" of observations back to the time scale. Note that I'm setting the minimal segment size h explicitly here because the default of 15% is probably somewhat small for this short series. 4 is still small but possibly enough for estimating a constant mean.
bp <- breakpoints(success ~ 1, data = d, h = 4)
bp
## Optimal 2-segment partition:
##
## Call:
## breakpoints.formula(formula = success ~ 1, h = 4, data = d)
##
## Breakpoints at observation number:
## 6
##
## Corresponding to breakdates:
## 0.3333333
We ignore the break "date" at 1/3 of the observations but simply map back to the original time scale:
d$time[bp$breakpoints]
## [1] 2004
To re-estimate the model with nicely formatted factor levels, we could do:
lab <- c(
  paste(d$time[c(1, bp$breakpoints)], collapse = "-"),
  paste(d$time[c(bp$breakpoints + 1, nrow(d))], collapse = "-")
)
d$seg <- breakfactor(bp, labels = lab)
lm(success ~ 0 + seg, data = d)
## Call:
## lm(formula = success ~ 0 + seg, data = d)
##
## Coefficients:
## seg1996-2004 seg2005-2016
## 0.3125 0.6911
Or for visualization:
plot(success ~ time, data = d, type = "b")
lines(fitted(bp) ~ time, data = d, col = 4, lwd = 2)
abline(v = d$time[bp$breakpoints], lty = 2)
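strucchange can also report a confidence interval for the break position via confint(); like the breakpoint itself it is returned in observation numbers, so it needs the same mapping back to the time scale. A sketch, assuming the bp and d objects from above:
# confidence interval for the break position (in observation numbers)
ci <- confint(bp)
# map lower bound, estimate, and upper bound back to years
d$time[ci$confint]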
One final remark: For such short time series where just a simple shift in the mean is needed, one could also consider conditional inference (aka permutation tests) rather than the asymptotic inference employed in strucchange. The coin package provides the maxstat_test() function exactly for this purpose (= short series where a single shift in the mean is tested).
library("coin")
maxstat_test(success ~ time, data = d, dist = approximate(99999))
## Approximative Generalized Maximally Selected Statistics
##
## data: success by time
## maxT = 2.3953, p-value = 0.09382
## alternative hypothesis: two.sided
## sample estimates:
## "best" cutpoint: <= 2004
This finds the same breakpoint and provides a permutation test p-value. If however, one has more data and needs multiple breakpoints and/or further regression coefficients, then strucchange would be needed.
