rolling regression with dplyr - r

I have a dataframe of "date", "company" and "return", reproducible by the code below:
library(dplyr)
n.dates <- 60
n.stocks <- 2
date <- seq(as.Date("2011-07-01"), by=1, len=n.dates)
symbol <- replicate(n.stocks, paste0(sample(LETTERS, 5), collapse = ""))
x <- expand.grid(date, symbol)
x$return <- rnorm(n.dates*n.stocks, 0, sd = 0.05)
names(x) <- c("date", "company", "return")
With this dataframe, I can calculate the daily market average return and add that result into a new column "market.ret".
x <- group_by(x, date)
x <- mutate(x, market.ret = mean(x$return, na.rm = TRUE))
Now I want to group all my data by different companies (2 in this case).
x <- group_by(x, company)
After doing this, I would like to fit "return" by "market.ret" and calculate the linear regression coefficient and store the slopes in a new column. If I want to do the fitting for the whole data set within a given company, then I can simply call lm():
group_by(x, company) %>%
  do(data.frame(beta = coef(lm(return ~ market.ret, data = .))[2])) %>%
  left_join(x, .)
However, I actually want to do the linear regression on a "rolling" basis, i.e. for each day separately over a 20-day trailing period. I want to use rollapply() but do not know how to pass two columns into the function. Any help or suggestion is greatly appreciated.
Note: Below is the code that I used for calculating 20-day rolling standard deviation of returns which might be helpful:
library(zoo)  # rollapply() comes from zoo
sdnoNA <- function(x){ return(sd(x, na.rm = TRUE)) }
x <- mutate(x, sd.20.0.d = rollapply(return, FUN = sdnoNA, width = 20, fill = NA))

## lms calculates the linear regression slope; for a simple regression the
## OLS slope is cov(x, y)/var(x), so this equals coef(lm(y ~ x))[2]
lms <- function(y, x){
  s <- which(is.finite(x * y))
  y <- y[s]
  x <- x[s]
  return(cov(x, y)/var(x))
}
## z is a dataframe which stores our final result
z <- data.frame()
## x has to be ungrouped
x <- ungroup(x)
## subset with "filter" and roll with "rollapply";
## by.column = FALSE hands each 20-row window of both columns to FUN as a matrix
symbols <- unique(x$company)
for(i in seq_along(symbols)){
  temp <- filter(x, company == symbols[i])
  z <- rbind(z, mutate(temp, beta = rollapply(temp[, c(3, 4)],  # "return" and "market.ret"
                                              FUN = function(x) lms(x[, 1], x[, 2]),
                                              width = 20, fill = NA,
                                              by.column = FALSE, align = "right")))
}
## final result
print(z)
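A quick sanity check (a sketch, not part of the original answer): on the full sample, lms() should match lm()'s slope, since cov(x, y)/var(x) is exactly the OLS estimate for a simple regression.
## lms(y, x) takes the response first; compare against lm()'s slope coefficient
all.equal(lms(x$return, x$market.ret),
          unname(coef(lm(return ~ market.ret, data = x))[2]))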

Here is a dplyr solution
#####
# setup data as OP (notice the fix when computing the market return)
library(dplyr)
set.seed(41797642)
n.dates <- 60
n.stocks <- 2
date <- seq(as.Date("2011-07-01"), by=1, len=n.dates)
symbol <- replicate(n.stocks, paste0(sample(LETTERS, 5), collapse = ""))
x <- expand.grid(date, symbol)
x$return <- rnorm(n.dates*n.stocks, 0, sd = 0.05)
names(x) <- c("date", "company", "return")
x <- x %>%
  group_by(date) %>%
  mutate(market.ret = mean(return))  # the fix: mean(return), not mean(x$return)
#####
# compute coefs using rollRegres
library(rollRegres)
func <- . %>% {
  roll_regres.fit(x = cbind(1, .$market.ret),
                  y = .$return, width = 20L)$coefs }
out <- x %>%
  group_by(company) %>%
  # make it explicit that data needs to be sorted
  arrange(date, .by_group = TRUE) %>%
  do(cbind(reg_col = select(., market.ret, return) %>% func,
           date_col = select(., date))) %>%
  ungroup
head(out[!is.na(out$reg_col.1), ], 5)
#R # A tibble: 5 x 4
#R   company reg_col.1 reg_col.2 date
#R   <fct>       <dbl>     <dbl> <date>
#R 1 SNXAD    -0.0104      0.746 2011-07-20
#R 2 SNXAD    -0.00953     0.755 2011-07-21
#R 3 SNXAD    -0.0124      0.784 2011-07-22
#R 4 SNXAD    -0.0167      0.709 2011-07-23
#R 5 SNXAD    -0.0148      0.691 2011-07-24
tail(out[!is.na(out$reg_col.1), ], 5)
#R # A tibble: 5 x 4
#R   company reg_col.1  reg_col.2 date
#R   <fct>       <dbl>      <dbl> <date>
#R 1 UYLTS   -0.00276      0.837  2011-08-25
#R 2 UYLTS    0.0000438    0.928  2011-08-26
#R 3 UYLTS    0.000250     0.936  2011-08-27
#R 4 UYLTS   -0.000772     0.886  2011-08-28
#R 5 UYLTS    0.00173      0.902  2011-08-29
It is very close to the answer here, which in turn is fairly close to this answer, though it uses the rollRegres package.
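If you would rather stay with zoo, here is a minimal sketch of the rollapply() idea wrapped in a grouped mutate (assuming x is the data frame built above, with market.ret already computed):
library(dplyr)
library(zoo)
## slope of return ~ market.ret over each 20-day trailing window;
## by.column = FALSE hands the whole two-column window to FUN
roll_beta <- function(w) coef(lm(return ~ market.ret, data = as.data.frame(w)))[2]
x %>%
  group_by(company) %>%
  arrange(date, .by_group = TRUE) %>%
  mutate(beta = rollapplyr(cbind(return, market.ret), width = 20,
                           FUN = roll_beta, by.column = FALSE, fill = NA)) %>%
  ungroup()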

Related

Loop over columns in a data.frame, extract model beta and SE into a new data.frame

I raised a similar question previously, which was adequately answered: Nested loop with simple linear regression (thank you!).
This time, I am seeking a method to loop over columns in a data frame and save the output in a new data frame where each row of the new output data frame represents one iteration of the loop (namely a column from the original data frame we are looping over).
Here is my attempt so far:
library(tidyverse)
set.seed(42)
n <- 1000
dat <- data.frame(id=1:n,
                  q=runif(n, min=45, max=85),
                  r=runif(n, min=2.4, max=6.0),
                  s=runif(n, min=24, max=60),
                  t=runif(n, min=0.28, max=1.73))
vd <- runif(n, min=15, max=125)
my_models <- list()
x <- 1
for (i in 1:colnames(dat)){
  model <- lm(paste(dat[[i]], "~", vd, "-1"))
  my_models[[x]] <- data.frame(ModelNo=i, coefficients(summary(model)))
  x <- x+1
}
output <- do.call(rbind,my_models)
reg.tbl <- output %>%
  mutate(beta=(round(exp(Estimate), digits=3)),
         lower=(round(exp(Estimate-1.96*Std..Error), digits=3)),
         upper=(round(exp(Estimate+1.96*Std..Error), digits=3))) %>%
  select(-Estimate, -Std..Error, -t.value, -Pr...t..) %>%
  mutate(coef=paste(beta, " ", "(", lower, " ", "to", " ", upper, ")")) %>%
  mutate(outcome=if_else(ModelNo==1, "MI", "Stroke")) %>%
  select(-ModelNo, -beta, -lower, -upper)
The reason I have not applied the previous solution to my current data is that I have over 40 variables to run regressions on; saving each as an object and then into a 'predictorlist' as per my previous solution is time consuming and defeats the purpose of looping to automate the analysis, which is the very reason I switched over to R ~18 months ago.
Any assistance is, naturally, greatly appreciated.
I don't understand why you want the final coefficients and CIs as character strings, but the code below produces them like that. Three changes relative to the question: I have changed the for loop, since the question's didn't work; the regression formulas are created with reformulate, not with paste; and in the pipe the numbers are not rounded, since sprintf takes care of that.
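For example, reformulate builds the formula directly from strings (a quick illustration, using the first response column q):
reformulate(termlabels = c(-1, "vd"), response = "q")
#> q ~ -1 + vd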
suppressPackageStartupMessages(library(tidyverse))
set.seed(42)
n <- 1000
dat <- data.frame(id=1:n,
q=runif(n, min=45, max=85),
r=runif(n, min=2.4, max=6.0),
s=runif(n, min=24, max=60),
t=runif(n, min=0.28, max=1.73))
vd <- runif(n, min=15, max=125)
my_models <- vector("list", length = ncol(dat) - 1L)
for(i in seq_along(names(dat))[-1]){
  resp <- names(dat)[i]
  fmla <- reformulate(termlabels = c(-1, "vd"), response = resp)
  model <- lm(fmla, data = dat)
  smry <- summary(model)
  my_models[[i - 1L]] <- data.frame(ModelNo = i - 1L, coefficients(smry), check.names = FALSE)
}
output <- do.call(rbind, my_models)
row.names(output) <- NULL
output %>%
  mutate(beta = exp(Estimate),
         lower = exp(Estimate - 1.96*`Std. Error`),
         upper = exp(Estimate + 1.96*`Std. Error`)) %>%
  select(-Estimate, -`Std. Error`, -`t value`, -`Pr(>|t|)`) %>%
  mutate(coef = sprintf("%.03g (%.03g to %.03g)", beta, lower, upper)) %>%
  mutate(outcome = if_else(ModelNo == 1, "MI", "Stroke")) %>%
  select(-ModelNo, -beta, -lower, -upper)
#>                  coef outcome
#> 1 2.13 (2.08 to 2.18)      MI
#> 2 1.05 (1.05 to 1.05)  Stroke
#> 3 1.64 (1.61 to 1.66)  Stroke
#> 4 1.01 (1.01 to 1.01)  Stroke
Created on 2022-03-07 by the reprex package (v2.0.1)
Edit
Here is another way. At first it seems more complicated but it simplifies the main loop, making it a simple purrr::map call.
Write a regression function reg_fun taking care of everything, the regression and the computation of the confidence intervals lower and upper bounds.
Then have it return an S3 class object sub-classing class "data.frame". I have called this custom class "Sandro", feel free to change this.
The custom class serves a purpose: to have its own print method. This way the regression's return values are still numbers and can be extracted with the standard extraction operators, but they print in the desired format.
Then call purrr::map_dfr in a much simpler pipe.
suppressPackageStartupMessages(library(tidyverse))
set.seed(42)
n <- 1000
dat <- data.frame(id=1:n,
                  q=runif(n, min=45, max=85),
                  r=runif(n, min=2.4, max=6.0),
                  s=runif(n, min=24, max=60),
                  t=runif(n, min=0.28, max=1.73))
vd <- runif(n, min=15, max=125)
reg_fun <- function(y, x, data, conf = 0.95){
  alpha <- qnorm(1 - (1 - conf)/2)
  fit <- lm(y ~ 0 + x, data = data)
  smry <- summary(fit)
  estimate <- coef(smry)[, 1]
  se <- coef(smry)[, 2]
  out <- data.frame(
    beta = exp(estimate),
    lower = exp(estimate - alpha * se),
    upper = exp(estimate + alpha * se)
  )
  class(out) <- c("Sandro", class(out))
  out
}
# custom print method
print.Sandro <- function(x, digits = 3){
outcome <- rep("Stroke", nrow(x))
outcome[1] <- "MI"
fmt <- paste0("%.0", digits, "g")
fmt <- paste0(fmt, " (", fmt, " to ", fmt, ")")
out <- data.frame(
coef = sprintf(fmt, x[["beta"]], x[["lower"]], x[["upper"]]),
outcome = outcome
)
print.data.frame(out)
}
dat %>%
  select(-id) %>%
  map_dfr(reg_fun, x = vd, data = dat)
#>                  coef outcome
#> 1 2.13 (2.08 to 2.18)      MI
#> 2 1.05 (1.05 to 1.05)  Stroke
#> 3 1.64 (1.61 to 1.66)  Stroke
#> 4 1.01 (1.01 to 1.01)  Stroke
Created on 2022-03-07 by the reprex package (v2.0.1)
Edit 2
To have the responses also print, change the print method and the pipe to the following.
print.Sandro <- function(x, digits = 3){
  fmt <- paste0("%.0", digits, "g")
  fmt <- paste0(fmt, " (", fmt, " to ", fmt, ")")
  out <- data.frame(
    coef = sprintf(fmt, x[["beta"]], x[["lower"]], x[["upper"]])
  )
  print.data.frame(out)
}
dat %>%
  select(-id) %>%
  map_dfr(reg_fun, x = vd, data = dat) %>%
  mutate(outcome = names(dat)[-1]) %>%
  relocate(outcome, .before = "beta")
And to see the return data.frame with the coefficients, lower and upper bounds, end the pipe with a call to as.data.frame.
dat %>%
  select(-id) %>%
  map_dfr(reg_fun, x = vd, data = dat) %>%
  mutate(outcome = names(dat)[-1]) %>%
  relocate(outcome, .before = "beta") %>%
  as.data.frame()
#>   outcome     beta    lower    upper
#> 1       q 2.128118 2.079911 2.177443
#> 2       r 1.050421 1.048759 1.052085
#> 3       s 1.638222 1.612149 1.664716
#> 4       t 1.012015 1.011534 1.012497

Regression imputation with dplyr in R

I want to do regression imputation with dplyr in R efficiently. Here is my problem: I have a data set with many missing values for one column - let's call it p. Now I want to estimate the missing values of p with a regression imputation approach. For that I regress p on a set of variables with OLS using uncensored data (a subset of the data set without missing values for p). Then I use the estimated coefficients to calculate the missing values of p.
My data set looks like that:
df = data.frame(
  id = c(1, 1, 1, 2, 2, 2),
  group = c(1, 1, 2, 1, 1, 2),
  sub_group = c(1, 2, 3, 1, 2, 3),
  p = c(4.3, 5.7, NA, NA, NA, 10),
  var1 = c(0.3, 0.1, 0.4, 0.9, 0.1, 0.2),
  var2 = c(0, 0, 0, 1, 1, 1)
)
where id represent individuals, which buy goods from a group (e.g. "food") with subgroups (like "bread"). p is the price, while var1 and var2 are some demographic variables (like "education" and "age").
What I've done so far:
library(dplyr)
df <- as_tibble(df)
# Create uncensored data
uncensored_df <- df %>%
  filter(!is.na(p))
# Run regression on uncensored data
imp_model <- lm(p ~ var1 + var2, data = uncensored_df)
# Get the coefficients of the fitted model
coefs <- unname(imp_model$coefficients)
# Use coefficients to compute missing values of p
censored_df <- df %>%
  filter(is.na(p)) %>%
  group_by(id, group, sub_group) %>%
  mutate(p = coefs[1] + coefs[2] * var1 + coefs[3] * var2)
# And finally combine the two subsets
bind_rows(uncensored_df, censored_df) %>% arrange(id, group, sub_group)
As I use more than var1 and var2 in my actual problem (about 30 variables), what is a better way to do regression imputation with dplyr? (I'm also open for non-dplyr solutions, though.)
library(dplyr)
fit <- lm(p ~ ., data = select(df, p, starts_with("var")))
df %>%
  rowwise() %>%
  mutate(p = ifelse(is.na(p), predict(fit, newdata = across()), p)) %>%
  ungroup()
How it works
For starters, when fitting your model, you can subset your data frame using select and any of the tidyselect helpers to pick the predictor variables (here starts_with("var") is used). This subset data frame then lets you use the p ~ . notation, which means regress p on everything else in the subset data frame.
Next you create a row-wise data frame and use your model to predict where p is missing. In this instance across() turns each row into a 1x6 tibble that you can pass to the newdata argument. predict() then uses the model fit and this new data to predict a value of p.
Output
     id group sub_group     p  var1  var2
  <dbl> <dbl>     <dbl> <dbl> <dbl> <dbl>
1     1     1         1  4.3    0.3     0
2     1     1         2  5.7    0.1     0
3     1     2         3  3.60   0.4     0
4     2     1         1  5.10   0.9     1
5     2     1         2 10.7    0.1     1
6     2     2         3 10      0.2     1
Benchmarking
As mentioned in the comments, for large data frames the rowwise operation takes significantly longer than some other options:
library(microbenchmark)
set.seed(1)
df1 <- df %>%
  slice_sample(n = 1E5, replace = T)
fit <- lm(p ~ ., data = select(df1, p, starts_with("var")))
dplyr_rowwise <- function(){
  df1 %>%
    rowwise() %>%
    mutate(p = ifelse(is.na(p), predict(fit, newdata = across()), p)) %>%
    ungroup()
}
dplyr_coalesce <- function(){
  df1 %>%
    mutate(p = coalesce(p, predict(fit, newdata = df1)))
}
base_index <- function(){
  isna <- is.na(df1$p)
  df1$p[isna] <- predict(fit, newdata = subset(df1, isna))
}
microbenchmark(
  dplyr_rowwise(),
  dplyr_coalesce(),
  base_index(),
  times = 10L
)
Unit: milliseconds
             expr        min         lq        mean      median         uq
  dplyr_rowwise() 63739.9512 64441.0800 66926.46041 65513.51785 66923.0241
 dplyr_coalesce()     6.5901     6.9037     8.55971     7.21125     7.7157
     base_index()    13.0368    13.1790    15.73682    13.53310    19.3004
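For reference, a minimal sketch applying the fastest option to the original six-row df: predict() returns a value for every row, and coalesce() keeps the observed p wherever it is not NA.
library(dplyr)
fit <- lm(p ~ ., data = select(df, p, starts_with("var")))
df %>%
  mutate(p = coalesce(p, predict(fit, newdata = df)))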

How to nest tables in a column of a dataframe?

I read that it is possible to store dataframes in a column of a dataframe with nest:
https://tidyr.tidyverse.org/reference/nest.html
Is it also possible to store tables in a column of a dataframe? The reason I ask is that I would like to calculate the Kappa for every subgroup of a dataframe with caret; however, caret::confusionMatrix(t) expects a table as input.
In the example-code below this works fine if I calculate the Kappa for the complete dataframe at once:
library(tidyverse)
library(caret)
# generate some sample data:
n <- 100L
x1 <- rnorm(n, 1.0, 2.0)
x2 <- rnorm(n, -1.0, 0.5)
y <- rbinom(n, 1L, plogis(1 * x1 + 1 * x2))
my_factor <- rep( c('A','B','C','D'), 25 )
df <- cbind(x1, x2, y, my_factor)
# fit a model and make predictions:
mod <- glm(y ~ x1 + x2, "binomial")
probs <- predict(mod, type = "response")
# confusion matrix
probs_round <- round(probs)
t <- table(factor(probs_round, c(1,0)), factor(y, c(1,0)))
ccm <- caret::confusionMatrix(t)
# extract Kappa:
ccm$overall[2]
> Kappa
> 0.5232
However, if I try group_by to generate the Kappa for every factor as a subgroup (see code below), it does not succeed. I suppose I need to nest t in a certain way in df, although I don't know how:
# extract Kappa for every subgroup with same factor (NOT WORKING CODE):
df <- cbind(df, probs_round)
df <- as.data.frame(df)
output <- df %>%
  dplyr::group_by(my_factor) %>%
  dplyr::mutate(t = table(factor(probs_round, c(1,0)), factor(y, c(1,0)))) %>%
  summarise(caret::confusionMatrix(t))
Expected output:
>my_factor Kappa
>1 A 0.51
>2 B 0.52
>3 C 0.53
>4 D 0.54
Is this correct and is this possible?
(the exact values for Kappa will be different due to the randomness in the sample data)
Thanks a lot!
You could skip the intermediate mutate() that's giving you trouble and do:
library(dplyr)
library(caret)
df %>%
  group_by(my_factor) %>%
  summarize(t = confusionMatrix(table(factor(probs_round, c(1,0)),
                                      factor(y, c(1,0))))$overall[2])
Returns:
# A tibble: 4 x 2
  my_factor     t
  <chr>     <dbl>
1 A         0.270
2 B         0.513
3 C         0.839
4 D         0.555
The above approach is the easiest way to get the desired results. But just to show what's possible, we can use your approach with dplyr::nest_by(), which nests the data and groups the resulting data set rowwise.
In the approach below we calculate a separate glm for each subgroup. I'm not sure if that's what you want to do.
library(tidyverse)
library(caret)
# generate some sample data:
n <- 1000L
df <- tibble(x1 = rnorm(n, 1.0, 2.0),
             x2 = rnorm(n, -1.0, 0.5),
             y = rbinom(n, 1L, plogis(x1 + 1 * x1 + 1 * x2)),
             my_factor = rep(c('A','B','C','D'), 250))
output <- df %>%
  nest_by(my_factor) %>%
  mutate(y = list(data$y),
         mod = list(glm(y ~ x1 + x2,
                        family = "binomial",
                        data = data)),
         probs = list(predict(mod, type = "response")),
         probs_round = list(round(probs)),
         t = list(table(factor(probs_round, c(1, 0)),
                        factor(y, c(1, 0)))),
         ccm = caret::confusionMatrix(t)$overall[2])
output %>%
  pull(ccm)
#>     Kappa     Kappa     Kappa     Kappa
#> 0.7743682 0.7078112 0.7157761 0.7549340
Created on 2021-06-23 by the reprex package (v0.3.0)
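And to answer the title question directly: yes, tables can be stored in a list-column. A minimal sketch (assuming the df, probs_round and y from the question):
library(dplyr)
## one confusion table per subgroup in a list-column; rowwise() then
## unwraps each list element when it is accessed inside mutate()
nested <- df %>%
  group_by(my_factor) %>%
  summarise(t = list(table(factor(probs_round, c(1, 0)),
                           factor(y, c(1, 0)))), .groups = "drop")
nested %>%
  rowwise() %>%
  mutate(Kappa = caret::confusionMatrix(t)$overall[2]) %>%
  ungroup()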

Why is lm() not showing some output in R?

I was wondering why lm() says "5 coefs not defined because of singularities" and then gives all NAs in the summary output for those 5 coefficients.
Note that all my predictors are categorical.
Is there anything wrong with my data on these 5 coefficients or code? How can I possibly fix this?
d <- read.csv("https://raw.githubusercontent.com/rnorouzian/m/master/v.csv", h = T) # Data
nms <- c("Age","genre","Length","cf.training","error.type","cf.scope","cf.type","cf.revision")
d[nms] <- lapply(d[nms], as.factor) # make factor
vv <- lm(dint~Age+genre+Length+cf.training+error.type+cf.scope+cf.type+cf.revision, data = d)
summary(vv)
First 6 lines of output:
Coefficients: (5 not defined because of singularities)
             Estimate Std. Error t value Pr(>|t|)
(Intercept)   0.17835    0.63573   0.281 0.779330
Age1         -0.04576    0.86803  -0.053 0.958010
Age2          0.46431    0.87686   0.530 0.596990
Age99        -1.64099    1.04830  -1.565 0.118949
genre2        1.57015    0.55699   2.819 0.005263 **
genre4             NA         NA      NA       NA  ## for example, this row is all NAs; there are 4 more like it!
As others noted, one problem is that you seem to have multicollinearity. Another is that there are missing values in your dataset. The missing values should probably just be removed. As for correlated variables, you should inspect your data to identify the collinearity and remove it. Deciding which variables to remove and which to retain is a very domain-specific topic. However, you could, if you wish, use regularisation and fit a model while retaining all variables. This also allows you to fit a model when n (number of samples) is less than p (number of predictors).
The code below demonstrates how to examine the correlation structure within your data and identify which variables are most correlated (thanks to this answer). I've also included an example of fitting such a model using L2 regularisation (commonly known as ridge regression).
d <- read.csv("https://raw.githubusercontent.com/rnorouzian/m/master/v.csv", h = T) # Data
nms <- c("Age","genre","Length","cf.training","error.type","cf.scope","cf.type","cf.revision")
d[nms] <- lapply(d[nms], as.factor) # make factor
vv <- lm(dint~Age+genre+Length+cf.training+error.type+cf.scope+cf.type+cf.revision, data = d)
df <- d
df[] <- lapply(df, as.numeric)
cor_mat <- cor(as.matrix(df), use = "complete.obs")
library("gplots")
heatmap.2(cor_mat, trace = "none")
## https://stackoverflow.com/questions/22282531/how-to-compute-correlations-between-all-columns-in-r-and-detect-highly-correlate
library("tibble")
library("dplyr")
library("tidyr")
d2 <- df %>%
  as.matrix() %>%
  cor(use = "complete.obs") %>%
  ## Set diag (a vs a) to NA, then remove
  (function(x) {
    diag(x) <- NA
    x
  }) %>%
  as.data.frame %>%
  rownames_to_column(var = 'var1') %>%
  gather(var2, value, -var1) %>%
  filter(!is.na(value)) %>%
  ## Sort by decreasing absolute correlation
  arrange(-abs(value))
## 2 pairs of variables are almost exactly correlated!
head(d2)
#>         var1       var2     value
#> 1         id study.name 0.9999430
#> 2 study.name         id 0.9999430
#> 3   Location      timed 0.9994082
#> 4      timed   Location 0.9994082
#> 5        Age   ed.level 0.7425026
#> 6   ed.level        Age 0.7425026
## Remove some variables here, or maybe try regularized regression (see below)
library("glmnet")
## glmnet requires matrix input
X <- d[, c("Age", "genre", "Length", "cf.training", "error.type", "cf.scope", "cf.type", "cf.revision")]
X[] <- lapply(X, as.numeric)
X <- as.matrix(X)
ind_na <- apply(X, 1, function(row) any(is.na(row)))
X <- X[!ind_na, ]
y <- d[!ind_na, "dint"]
glmnet <- glmnet(
  x = X,
  y = y,
  ## alpha = 0 is ridge regression
  alpha = 0)
plot(glmnet)
Created on 2019-11-08 by the reprex package (v0.3.0)
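As a quick aside (a sketch, not part of the original workflow): base R's alias() reports which terms in the fitted model are linear combinations of the others, which is often the fastest way to decide what to drop.
## the terms listed here correspond to the NA rows in summary(vv);
## removing one variable from each dependent set lets lm() estimate the rest
alias(vv)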
In such a situation you can use the "olsrr" package in R for stepwise regression analysis. Here is sample code for stepwise regression analysis in R:
library("olsrr")
#Load the data
d <- read.csv("https://raw.githubusercontent.com/rnorouzian/m/master/v.csv", h = T)
# stepwise regression
vv <- lm(dint ~ Age + genre + Length + cf.training + error.type + cf.scope + cf.type + cf.revision, data = d)
summary(vv)
k <- ols_step_both_p(vv, pent = 0.05, prem = 0.1)
# stepwise regression plot
plot(k)
# final model
k$model
It will provide essentially the same output as SPSS's stepwise procedure.

How to add a column of fitted values to a data frame by group?

Say I have a data frame like this:
X <- data_frame(
  x = rep(seq(from = 1, to = 10, by = 1), 3),
  y = 2*x + rnorm(length(x), sd = 0.5),
  g = rep(LETTERS[1:3], each = length(x)/3))
How can I fit a regression y~x grouped by variable g and add the values from the fitted and resid generic methods to the data frame?
I know I can do:
A <- X[X$g == "A",]
mA <- with(A, lm(y ~ x))
A$fit <- fitted(mA)
A$res <- resid(mA)
B <- X[X$g == "B",]
mB <- with(B, lm(y ~ x))
B$fit <- fitted(mB)
B$res <- resid(mB)
C <- X[X$g == "C",]
mC <- with(C, lm(y ~ x))
C$fit <- fitted(mC)
C$res <- resid(mC)
And then rbind(A, B, C). However, in real life I am not using lm (I'm using rqss in the quantreg package). The method occasionally fails, so I need error handling, where I'd like to place NA in all the rows of the groups that failed. Also, there are way more than 3 groups, so I don't want to just keep copying and pasting code for each group.
I tried using dplyr with do but didn't make any progress. I was thinking it might be something like:
make_qfits <- function(data) {
  data %>%
    group_by(g) %>%
    do(failwith(NULL, rqss), formula = y ~ qss(x, lambda = 3))
}
Would this be easy to do by that approach? Is there another way in base R?
You can use do on grouped data for this task, fitting the model in each group in do and putting the model residuals and fitted values into a data.frame. To add these to the original data, just include the . that represents the data going into do in the output data.frame.
In your simple case, this would look like this:
library(quantreg)  # for rqss()
X %>%
  group_by(g) %>%
  do({
    model = rqss(y ~ qss(x, lambda = 3), data = .)
    data.frame(., residuals = resid.rqss(model), fitted = fitted(model))
  })
Source: local data frame [30 x 5]
Groups: g

    x         y g     residuals    fitted
1   1  1.509760 A -1.368963e-08  1.509760
2   2  3.576973 A -8.915993e-02  3.666133
3   3  6.239950 A  4.174453e-01  5.822505
4   4  7.978878 A  4.130033e-09  7.978878
5   5 10.588367 A  4.833475e-01 10.105020
6   6 11.786445 A -3.807876e-01 12.167232
7   7 14.646221 A  4.167763e-01 14.229445
8   8 15.938253 A -3.534045e-01 16.291658
9   9 19.114927 A  7.610560e-01 18.353871
10 10 19.574449 A -8.416343e-01 20.416083
.. ..       ... .           ...       ...
Things will look more complicated if you need to catch errors. Here is what it would look like using try and filling the residuals and fitted columns with NA if fit attempt for the group results in an error.
X[9:30,] %>%
  group_by(g) %>%
  do({
    catch = try(rqss(y ~ qss(x, lambda = 3), data = .))
    if(class(catch) == "try-error"){
      data.frame(., residuals = NA, fitted = NA)
    } else {
      model = rqss(y ~ qss(x, lambda = 3), data = .)
      data.frame(., residuals = resid.rqss(model), fitted = fitted(model))
    }
  })
Source: local data frame [22 x 5]
Groups: g

    x         y g     residuals    fitted
1   9 19.114927 A            NA        NA
2  10 19.574449 A            NA        NA
3   1  2.026199 B -4.618675e-01  2.488066
4   2  4.399768 B  1.520739e-11  4.399768
5   3  6.167690 B -1.437800e-01  6.311470
6   4  8.642481 B  4.193089e-01  8.223172
7   5 10.255790 B  1.209160e-01 10.134874
8   6 12.875674 B  8.290981e-01 12.046576
9   7 13.958278 B -4.803891e-10 13.958278
10  8 15.691032 B -1.789479e-01 15.869980
.. ..       ... .           ...       ...
For the lm models you could try
library(nlme) # lmList to do lm by group
library(ggplot2) # fortify to get out the fitted/resid data
do.call(rbind, lapply(lmList(y ~ x | g, data=X), fortify))
This gives you the residual and fitted data in ".resid" and ".fitted" columns as well as a bunch of other fit data. By default the rownames will be prefixed with the letters from g.
With the rqss models that might fail
do.call(rbind, lapply(split(X, X$g), function(z) {
  fit <- tryCatch({
    rqss(y ~ x, data=z)
  }, error=function(e) NULL)
  if (is.null(fit)) data.frame(resid=numeric(0), fitted=numeric(0))
  else data.frame(resid=fit$resid, fitted=fitted(fit))
}))
Here's a version that works with base R:
modelit <- function(df) {
  mB <- with(df, lm(y ~ x, na.action = na.exclude))
  df$fit <- fitted(mB)
  df$res <- resid(mB)
  return(df)
}
dfs.with.preds <- lapply(split(X, as.factor(X$g)), modelit)
output <- Reduce(function(x, y) { rbind(x, y) }, dfs.with.preds)
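If you prefer a tidyverse route, here is a minimal sketch using broom::augment (assuming the broom package is acceptable; .fitted and .resid are the column names augment() produces):
library(dplyr)
library(broom)
## fit per group and bind fitted values and residuals back on;
## augment() returns the original columns plus .fitted and .resid
X %>%
  group_by(g) %>%
  group_modify(~ augment(lm(y ~ x, data = .x))) %>%
  ungroup()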
