I am trying to use the group= option in geom_boxplot, and it works for one grouping function but not for another. The first plot runs; the second and third plots (really the same, just called differently) both fail to produce two-month boxplots before 2017 and one-month boxplots in 2017, as the grouper intends. For the grouper function, ggplot gives "Warning message: position_dodge requires non-overlapping x intervals", but the x values are the same across graphs. This is clearly related to my groupdates function, yet the groups appear to be constructed properly. Suggestions welcome. With thanks.
library(tidyverse)
library(lubridate)
# I want two month groups before 2017, and one-month groups in 2017
groupdates <- function(date) {
  month_candidate <- case_when(
    year(date) < 2017 ~ paste0(year(date), "-", (floor(((0:11)/12)*6)*2)+1),
    TRUE ~ paste0(year(date), "_", month(date))
  )
  month_candidate2 <- case_when(
    (str_length(month_candidate) == 6) ~ paste0(str_sub(month_candidate, 1, 5), "0", str_sub(month_candidate, 6)),
    TRUE ~ month_candidate
  )
  return(month_candidate2)
}
generate_fake_date_time <- function(N, st="2015/01/02", et="2017/02/28") {
st <- as.POSIXct(as.Date(st))
et <- as.POSIXct(as.Date(et))
dt <- as.numeric(difftime(et,st,unit="sec"))
ev <- sort(runif(N, 0, dt))
rt <- st + ev
}
n <- 5000
set.seed(250)
test <- as.data.frame(generate_fake_date_time(n))
colnames(test) <- "posixctdate"
test$ranvalue <- month(test$posixctdate) + runif(length(test), 0, 1)
test$grouped_time <- groupdates(test$posixctdate)
table(test$grouped_time)
ggplot(test)+geom_boxplot(aes(x=posixctdate, y=ranvalue, group=paste0(year(posixctdate), "_", month(posixctdate))))
ggplot(test)+geom_boxplot(aes(x=posixctdate, y=ranvalue, group=grouped_time))
ggplot(test)+geom_boxplot(aes(x=posixctdate, y=ranvalue, group=groupdates(posixctdate)))
sessionInfo()
If I correctly understood your problem, you should modify your groupdates function.
I only changed the third line of the function, using:
ceiling instead of floor
month(date) instead of 0:11
(With 0:11 the label depends on a fixed, recycled 0-11 sequence rather than on each row's actual month, so rows end up in essentially arbitrary groups.) Resulting in:
groupdates <- function(date) {
  month_candidate <- case_when(
    year(date) < 2017 ~ paste0(year(date), "-", (ceiling(((month(date))/12)*6)*2)+1),
    TRUE ~ paste0(year(date), "_", month(date))
  )
  month_candidate2 <- case_when(
    (str_length(month_candidate) == 6) ~ paste0(str_sub(month_candidate, 1, 5), "0", str_sub(month_candidate, 6)),
    TRUE ~ month_candidate
  )
  return(month_candidate2)
}
I also modified the computation of ranvalue to get a better distribution; I bet you wanted nrow instead of length (length() on a data frame returns the number of columns, so runif() produced a single value that was recycled for every row):
test$ranvalue <- month(test$posixctdate) + runif(nrow(test), 0, 1)
test$grouped_time <- groupdates(test$posixctdate)
table(test$grouped_time)
And the plot call (unchanged):
ggplot(test)+geom_boxplot(aes(x=posixctdate, y=ranvalue, group=grouped_time))
I need help solving this error. I am not sure how, but when I run "dp_stat" at the end, an error appears in my OLS model. Essentially I try to find the cumulative difference between a benchmark and a given individual predictive model. Here dp is my independent variable and sg is my dependent variable; datanu is my Excel data. I'm not sure how to attach the data here, but here is a link to the Excel file and the code: https://drive.google.com/drive/folders/12BOuNBODURIP7CQIBZWMmHFc1d7zXHxN?usp=sharing If anyone has a fix it would mean the world!
Error in lag(): ! n must be a positive integer, not a double vector of length 1.
rm(list= ls()) # Clear global environment
invisible(gc()) # Free up unused R-occupied memory
cat("\014") # Clear console output: equivalent to ctrl + L
library("tseries")
library("readxl")
library("Metrics")
library("lubridate")
library("ggplot2")
library("data.table")
library("dyn")
library("reshape2")
# header TRUE because the first row contains the column names
datanu <- read_xlsx("~/Documents/6.semester/Bachelor/Data/datanu.xlsx",
na = "NaN",
sheet = "datax",
)
myts <- ts(datanu, start=c(1872, 1), end=c(2020, 12), frequency=12)
plot(myts[, c("dp", "dy", "ep", "de")])
get_statistics <- function(myts, dp, sg, h=1, start=1872, end=2020, est_periods_OOS = 20) {
  #### IS ANALYSIS
  # 1. Historical mean model for a portfolio
  avg <- mean(window(myts, start, end)[, sg], na.rm=TRUE)
  IS_error_N <- (window(myts, start, end)[, sg] - avg)
  # 2. OLS model
  #reg <- dyn$lm(sg ~ lag(as.numeric(dp), 1), data=window(myts, start, end))
  reg <- dyn$lm(eval(parse(text=sg)) ~ lag(eval(parse(text=dp)), -1),
                data=window(myts, start, end))
  # Error in lag(): ! n must be a positive integer, not a double vector of length 1.
  IS_error_A <- reg$residuals
  #### OOS ANALYSIS
  OOS_error_N <- numeric(end - start - est_periods_OOS)
  OOS_error_A <- numeric(end - start - est_periods_OOS)
  # use only information available up to the time the forecast is made
  j <- 0
  for (i in (start + est_periods_OOS):(end-1)) {
    j <- j + 1
    # Get the actual ERP that you want to predict
    actual_ERP <- as.numeric(window(myts, i+1, i+1)[, sg])
    # 1. Historical mean model
    OOS_error_N[j] <- actual_ERP - mean(window(myts, start, i)[, sg], na.rm=TRUE)
    # 2. OLS model
    reg_OOS <- dyn$lm(eval(parse(text=sg)) ~ lag(eval(parse(text=dp)), -1),
                      data=window(myts, start, i))
    # Compute error
    df <- data.frame(x=as.numeric(window(myts, i, i)[, dp]))
    names(df) <- dp
    pred_ERP <- predict.lm(reg_OOS, newdata=df)
    OOS_error_A[j] <- pred_ERP - actual_ERP
  }
  # Compute statistics
  MSE_N <- mean(OOS_error_N^2)
  MSE_A <- mean(OOS_error_A^2)
  T <- sum(!is.na(myts[, sg]))  # count of non-missing observations
  OOS_R2 <- 1 - MSE_A/MSE_N
  # Is the -1 enough (maybe -2 needed because of the lag)?
  OOS_oR2 <- OOS_R2 - (1-OOS_R2)*(reg$df.residual)/(T - 1)
  dRMSE <- sqrt(MSE_N) - sqrt(MSE_A)
  #### CREATE PLOT
  IS <- cumsum(IS_error_N[2:length(IS_error_N)]^2) - cumsum(IS_error_A^2)
  OOS <- cumsum(OOS_error_N^2) - cumsum(OOS_error_A^2)
  df <- data.frame(x=seq.int(from=start + 1 + est_periods_OOS, to=end),
                   IS=IS[(1 + est_periods_OOS):length(IS)],
                   OOS=OOS)  # because you lose one observation due to the lag
  # Shift IS errors vertically so that the IS line begins at zero on the date of
  # the first OOS prediction (see Goyal/Welch (2008, p. 1465))
  df$IS <- df$IS - df$IS[1]
  df <- melt(df, id.var="x")
  plotGG <- ggplot(df) +
    geom_line(aes(x=x, y=value, color=variable)) +
    geom_rect(data=data.frame(),  # needed by ggplot2, otherwise not transparent
              aes(xmin=2008, xmax=2010, ymin=-0.2, ymax=0.2),
              fill='red',
              alpha=0.1) +
    scale_y_continuous('Cumulative SSE Difference', limits=c(-0.2, 0.2)) +
    scale_x_continuous('Year')
  return(list(IS_error_N = IS_error_N,
              IS_error_A = reg$residuals,
              OOS_error_N = OOS_error_N,
              OOS_error_A = OOS_error_A,
              IS_R2 = summary(reg)$r.squared,
              IS_aR2 = summary(reg)$adj.r.squared,
              OOS_R2 = OOS_R2,
              OOS_oR2 = OOS_oR2,
              dRMSE = dRMSE,
              plotGG = plotGG))
}
dp_stat <- get_statistics(myts, "dp", "sg", start=1872)
dp_stat$plotGG
As the error message states, n must be a positive integer, not a double vector of length 1. The error comes from providing n = -1 (a negative number) as the argument. I assume the idea is to lag by a negative number of positions; however, lag() only accepts a positive number of lag positions. Instead of lag(), use lead() with n = 1 to achieve the desired result.
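For reference, a minimal sketch of that substitution on the in-sample regression line (untested against the linked data; the reg_OOS call inside the loop needs the same change):
reg <- dyn$lm(eval(parse(text = sg)) ~ lead(eval(parse(text = dp)), 1),
              data = window(myts, start, end))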
I want to perform an expanding regression at monthly frequency using daily data. The model is:
ret = \beta_0 + \beta_1 X + \varepsilon
Sample data and my attempt:
library(zoo)
library(dplyr)  # for mutate() and the pipe
df = data.frame(date = seq(as.Date('2011-01-01'), as.Date('2011-03-31'), by = 1),
                ret = rnorm(90, 0, 1), X = rnorm(90, 0, 1))
roll = function(data, n = nrow(data)) {
  rollapplyr(1:n, 1:n, function(x) coef(lm(ret ~ X, data, subset = x))[[2]])
}
output = df %>%
  mutate(coefficient = roll(data.frame(ret, X)))
The code above runs an expanding regression by row, and I could extract just the last value in each month to get the coefficients for that month (i.e., in this example, I only need the coefficients estimated on Jan-31, Feb-28 and Mar-31).
However, I need to apply this code to a large dataset, and to save time I only want the regressions to run on the last day of each month, still in expanding style (i.e., not run a regression every day). I'd appreciate it if someone could point out a way to improve the code here.
Create a function coef2 that, given a yearmon object, computes the second regression coefficient using the data up to and including that year and month.
library(zoo)
coef2 <- function(ym, data) {
  coef(lm(ret ~ X, data, subset = as.yearmon(date) <= ym))[[2]]
}
yearmonth <- unique(as.yearmon(df$date))
data.frame(yearmonth, slope = sapply(yearmonth, coef2, data = df))
## yearmonth slope
## 1 Jan 2011 0.2208940
## 2 Feb 2011 0.1792896
## 3 Mar 2011 0.1180308
If performance is an issue, you could try this alternative version of coef2, which avoids lm entirely: for a single regressor, the OLS slope equals cov(ret, X) / var(X).
coef2 <- function(ym, data) {
  with(subset(data, as.yearmon(date) <= ym), cov(ret, X) / var(X))
}
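It is a drop-in replacement, so the same sapply driver as above produces the slope table:
data.frame(yearmonth, slope = sapply(yearmonth, coef2, data = df))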
As shown in the following example, what I want to achieve is to run the regression many times, with R recording the estimates of did in one data.frame.
Each time I change the year condition in ifelse, e.g. ifelse(mydata$year >= 1993, 1, 0), so each run is a different regression.
mydata$time = ifelse(mydata$year >= 1994, 1, 0)
Can anyone help? My basic code is below (the data can be downloaded through a browser if R returns errors):
library(foreign)
mydata = read.dta("http://dss.princeton.edu/training/Panel101.dta")
mydata$time = ifelse(mydata$year >= 1994, 1, 0)
mydata$treated = ifelse(mydata$country == "E" | mydata$country == "F" | mydata$country == "G", 1, 0)
mydata$did = mydata$time * mydata$treated  # treated must be defined before did
didreg = lm(y ~ treated + time + did, data = mydata)
summary(didreg)
Generally, if you want to repeat a process many times with different input each time, you need a function. The following function takes a scalar value year_value as its input, creates local variables for the regression, and returns the estimates for the model term did.
foo <- function (year_value) {
  ## create local variables from `mydata`
  y <- mydata$y
  treated <- as.numeric(mydata$country %in% c("E", "F", "G"))  ## use `%in%`
  time <- as.numeric(mydata$year >= year_value)  ## use `year_value`
  did <- time * treated
  ## run regression using local variables
  didreg <- lm(y ~ treated + time + did)
  ## return estimate for model term `did`
  coef(summary(didreg))["did", ]
}
foo(1993)
# Estimate Std. Error t value Pr(>|t|)
#-2.784222e+09 1.504349e+09 -1.850782e+00 6.867661e-02
Note there are several places where your original code can be improved: using %in% instead of multiple |, and using as.numeric instead of ifelse to coerce a boolean to numeric.
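For instance, the two constructions produce the same dummy (a small illustrative check):
treated_a <- as.numeric(mydata$country %in% c("E", "F", "G"))
treated_b <- ifelse(mydata$country == "E" | mydata$country == "F" | mydata$country == "G", 1, 0)
identical(treated_a, treated_b)  # TRUE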
Now you need something like a loop to iterate this function over several different values of year_value. I would use lapply.
## raw list of result from `lapply`
year_of_choice <- 1993:1994 ## taken for example
result <- lapply(year_of_choice, foo)
## rbind them into a matrix
data.frame(year = year_of_choice, do.call("rbind", result), check.names = FALSE)
# year Estimate Std. Error t value Pr(>|t|)
#1 1993 -2784221881 1504348732 -1.850782 0.06867661
#2 1994 -2519511630 1455676087 -1.730819 0.08815711
Note: don't include year 1990 (the minimum of variable year) as a choice, otherwise time will be a vector of 1s, the same as the intercept. The resulting model is rank-deficient and you will get a "subscript out of bounds" error. R versions since 3.5.0 have a new complete argument to the generic function coef, so for stability we may use
coef(summary(didreg), complete = TRUE)["did", ]
But you should see all NA or NaN for year 1990.
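A quick illustrative check of the degeneracy (1990 being the sample minimum):
table(as.numeric(mydata$year >= 1990))
# every observation equals 1, i.e. `time` is collinear with the intercept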
Here is another option: create a matrix covering all the years, join it to mydata, gather to long format, nest by grouping, then run the regressions and extract the estimates. Note that "gt_et_**" stands for "greater than or equal to ...":
library(foreign)
library(dplyr)
library(tidyr)
library(purrr)
mydata = read.dta("http://dss.princeton.edu/training/Panel101.dta")
mtrx <- matrix(0, length(min(mydata$year):max(mydata$year)), length(min(mydata$year):max(mydata$year)))
mtrx[lower.tri(mtrx, diag = TRUE)] <- 1
df <- mtrx %>% as.data.frame() %>% mutate(year = min(mydata$year):max(mydata$year))
colnames(df) <- c(paste0("gt_et_", df$year), "year")
models <- df %>%
  full_join(., mydata, by = "year") %>%
  gather(mod, time, gt_et_1990:gt_et_1999) %>%
  nest(-mod) %>%
  mutate(data = map(data, ~mutate(.x, treated = ifelse(country == "E" | country == "F" | country == "G", 1, 0),
                                  did = time * treated)),
         mods = map(data, ~lm(y ~ treated + time + did, data = .x) %>% summary() %>% coef())) %>%
  unnest(mods %>% map(broom::tidy)) %>%
  filter(.rownames == "did") %>%
  select(-.rownames)
models
#> mod Estimate Std..Error t.value Pr...t..
#> 1 gt_et_1991 -2309823993 2410140350 -0.95837738 0.34137018
#> 2 gt_et_1992 -2036098728 1780081308 -1.14382344 0.25682856
#> 3 gt_et_1993 -2784221881 1504348732 -1.85078222 0.06867661
#> 4 gt_et_1994 -2519511630 1455676087 -1.73081886 0.08815711
#> 5 gt_et_1995 -2357323806 1455203186 -1.61992760 0.11001662
#> 6 gt_et_1996 250180589 1511322882 0.16553749 0.86902697
#> 7 gt_et_1997 405842197 1619653548 0.25057346 0.80292231
#> 8 gt_et_1998 -75683039 1852314277 -0.04085864 0.96753194
#> 9 gt_et_1999 2951694230 2452126428 1.20372840 0.23299421
Created on 2018-09-01 by the reprex package (v0.2.0).
I am working on a large dataset with 19 subcohorts for which I want to run a linear regression model to estimate BMI.
One of the covariates I am using is sex, but some subcohorts consist only of men, which causes problems in my loop.
If I try to run a linear regression model, I get the following error:
Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) :
  contrasts can be applied only to factors with 2 or more levels
I have found a solution to this problem by running separate loops for subcohorts with only men and subcohorts with both men and women, using the following (simplified) code:
men <- c(1, 6, 15)  # cohort numbers that only contain men
menandwomen <- c(2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19)
trenddpmodelm <- list()
for (i in men) {
  trenddpmodelm[[i]] <- lm(BMI ~ age,  # sex omitted: these cohorts contain only men
                           data = subcohort[subcohort$centre_a == i, ])
}
trenddpmodelmw <- list()
for (i in menandwomen) {
  trenddpmodelmw[[i]] <- lm(BMI ~ age + sex,
                            data = subcohort[subcohort$centre_a == i, ])
}
trenddpmodel <- c(list(trenddpmodelm[[1]]), list(trenddpmodelmw[[2]]), list(trenddpmodelmw[[3]]), list(trenddpmodelmw[[4]]), list(trenddpmodelmw[[5]]), list(trenddpmodelm[[6]]), list(trenddpmodelmw[[7]]), list(trenddpmodelmw[[8]]), list(trenddpmodelmw[[9]]), list(trenddpmodelmw[[10]]), list(trenddpmodelmw[[11]]), list(trenddpmodelmw[[12]]), list(trenddpmodelmw[[13]]), list(trenddpmodelmw[[14]]), list(trenddpmodelm[[15]]), list(trenddpmodelmw[[16]]), list(trenddpmodelmw[[17]]), list(trenddpmodelmw[[18]]), list(trenddpmodelmw[[19]]))
After this step, I extract the relevant information from the summaries and put it in a data frame to export to Excel.
My problem is that I will be running quite a lot of analyses, which will result in pages and pages of code.
My question is therefore: is there a setting in R that allows non-varying factors to be dropped from my linear regression model in the subcohorts where this applies? (Similar to what happens in coxph: R warns that the factor does not always vary, but the loop does run.)
It is not as if I cannot continue working without a solution, but I have been trying to find an answer to this question for days without success, and I think it must be possible somehow. Any advice is much appreciated :)
I would recommend building your formula dynamically within the loop.
DF <- list(Cohort1 = data.frame(bmi = rnorm(25, 24, 1),
                                age = rnorm(25, 50, 3),
                                sex = factor(sample(c("F", "M"), 25, replace = TRUE))),
           Cohort2 = data.frame(bmi = rnorm(15, 24, 1),
                                age = rnorm(15, 55, 4),
                                sex = factor(rep("M", 15))))
# sex is stored as a factor so that nlevels() below works in R >= 4.0,
# where data.frame() no longer converts strings to factors by default
candidate_vars <- c("age", "sex")
Models <- vector("list", length(DF))
for (i in seq_along(DF)) {
  # Determine if the variables are either numeric, or factor with more than 1 level
  indep <- vapply(X = DF[[i]][candidate_vars],
                  FUN = function(x) {
                    if (is.numeric(x)) return(TRUE)
                    else return(nlevels(x) > 1)
                  },
                  FUN.VALUE = logical(1))
  # Write the formula
  form <- paste("bmi ~ ", paste(candidate_vars[indep], collapse = " + "))
  # Create the model
  Models[[i]] <- lm(as.formula(form), data = DF[[i]])
}
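Since the eventual goal is to extract the relevant information from the summaries into a data frame for export, here is one possible sketch of that step (object and column names are illustrative, not from the original post):
results <- do.call(rbind, lapply(seq_along(Models), function(i) {
  est <- as.data.frame(coef(summary(Models[[i]])))  # Estimate, Std. Error, t value, Pr(>|t|)
  est$term <- rownames(est)
  est$cohort <- names(DF)[i]
  rownames(est) <- NULL
  est
}))
results  # one row per model term, ready for write.csv() or similar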
Using R I would like to compare the RMSE (root mean square error) from two prediction models. The first model uses estimates from 1966 to 2000 to predict 2001 and then uses estimates from 1966 to 2001 to predict 2002 and so on up to 2015. The second model uses estimates from 1991 to 2000 to predict 2001 and then uses estimates from 1992 to 2001 to predict 2002 and so on up to 2015. This problem has me really stumped and I truly appreciate any help.
DF <- data.frame(YEAR=1966:2015, TEMP=rnorm(50), PRESSURE=rnorm(50), RAINFALL=rnorm(50))
lmod <- lm(TEMP ~ PRESSURE + RAINFALL, data = DF)
rmse <- function(error) sqrt(mean(error^2))
rmse(lmod$residuals)
You can loop it:
Method 1:
pred1 <- numeric(0)
rmse1 <- numeric(0)
for (i in 1:15) {
  DF.train1 <- DF[DF$YEAR < 2000 + i, ]
  DF.test1 <- DF[DF$YEAR == 2000 + i, ]
  lmod1 <- lm(TEMP ~ PRESSURE + RAINFALL, data = DF.train1)
  pred1[i] <- predict(lmod1, newdata = DF.test1)
  rmse1[i] <- sqrt(mean((DF.test1$TEMP - pred1[i])^2))
}
pred1
rmse1
mean(rmse1)
Method 2:
pred2 <- numeric(0)
rmse2 <- numeric(0)
for (i in 1:15) {
  DF.train2 <- DF[DF$YEAR < 2000 + i & DF$YEAR > 1989 + i, ]
  DF.test2 <- DF[DF$YEAR == 2000 + i, ]
  lmod2 <- lm(TEMP ~ PRESSURE + RAINFALL, data = DF.train2)
  pred2[i] <- predict(lmod2, newdata = DF.test2)
  rmse2[i] <- sqrt(mean((DF.test2$TEMP - pred2[i])^2))
}
pred2
rmse2
mean(rmse2)
Comparing the individual components of rmse1 and rmse2, as well as their respective means, should be useful. The vectors pred1 and pred2 contain the individual TEMP predictions for each year (2001-2015) for their respective methods.
Edit: this should be working now, and Method 2 trains on a rolling 10-year window. Also, I take RMSE to be the square root of the MSE as defined for predictors in this article.
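For instance, a quick side-by-side view reusing the vectors computed above:
comparison <- data.frame(YEAR = 2001:2015, pred1, pred2, rmse1, rmse2)
comparison
c(method1 = mean(rmse1), method2 = mean(rmse2))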
Here is another solution, where the simulations are wrapped in a function.
The interest of this solution is that it makes it easy to modify the model specifications.
For example, if you want to try model2 with a range of 15 years instead of 10, just change the input of the function (range = 15). This also lets you do a light sensitivity analysis.
compare_models <- function(DF, start = 1966, end = 2000, range = 10)
{
  require(hydroGOF)
  # pre-allocate the prediction columns so the subset-assignments below work
  DF$model1_sim <- NA_real_
  DF$model2_sim <- NA_real_
  for (i in (end + 1):max(DF$YEAR))
  {
    # model1: expanding window from `start`
    lmod_1 = lm(TEMP ~ PRESSURE + RAINFALL, data = DF[DF$YEAR >= start & DF$YEAR < i, ])
    DF$model1_sim[DF$YEAR == i] <- predict(lmod_1, newdata = DF[DF$YEAR == i, ])
    # model2: rolling window of `range` years
    lmod_2 = lm(TEMP ~ PRESSURE + RAINFALL, data = DF[DF$YEAR >= i - range & DF$YEAR < i, ])
    DF$model2_sim[DF$YEAR == i] <- predict(lmod_2, newdata = DF[DF$YEAR == i, ])
  }
  return(DF)
}
I used the hydroGOF package to compute RMSE and NSE, a common indicator of model efficiency (see Nash and Sutcliffe, 1970; 11,528 citations at the moment).
output = compare_models(DF)
require(hydroGOF) # compute RMSE and NSE
# RMSE
rmse(output$model1_sim,output$TEMP)
rmse(output$model2_sim,output$TEMP)
# Nash-Sutcliffe efficiency
NSE(output$model1_sim,output$TEMP, na.rm = T)
NSE(output$model2_sim,output$TEMP, na.rm = T)
And a simple simulated/observed plot to inspect the model predictions:
# melting data for plot
library(reshape2)  # for melt()
library(ggplot2)
output_melt = melt(output[, c("TEMP", "model1_sim", "model2_sim")], id = "TEMP")
# Plot
ggplot(output_melt, aes(x = TEMP, y = value, color = variable)) +
  theme_bw() + geom_point() + geom_abline(slope = 1, intercept = 0) +
  xlim(-2, 2) + ylim(-2, 2) + xlab("Measured") + ylab("Simulated")
Here's yet another solution:
train.models <- function(year, time.frame) {
  predictions <- sapply(year:(max(DF$YEAR) - 1),
                        function(year) {
                          lmod <- lm(TEMP ~ PRESSURE + RAINFALL, DF,
                                     subset = with(DF, YEAR %in% (year - time.frame + 1):year))
                          pred <- predict(lmod, newdata = DF[DF$YEAR == (year + 1), ])
                          names(pred) <- year + 1
                          return (pred)
                        })
  return (predictions)
}
models1 <- train.models(2000, 35)
models2 <- train.models(2001, 10)
rmse(models1 - DF$TEMP[DF$YEAR %in% names(models1)])
rmse(models2 - DF$TEMP[DF$YEAR %in% names(models2)])