I have a large data-set with multiple target variables. Currently, I am having issues in writing code/loop for one of the part of the model i.e
mod <- list(ah=ah,bn=bn).
#Detailed code is as follows:
jk<- data.frame(y=runif(40), l=runif(40), m=runif(40), p=runif(40))
ah <- lm(l ~ p, jk)
bn <- lm(m ~ y, jk)
mod <- list(ah=ah,bn=bn)
for (i in names(mod))
{
jk[[i]] <- predict(mod[[i]], jk)
}
Problem is that if there are 200 models then it will be cumbersome task to write ah=ah, bn=bn for 200 times. Therefore, I need a loop to run the same so as to use in below predict function.
If we are only concerned about getting the 'mod' in a list, create the objects within a new environment and get the values using mget after listing the objects (ls()) from the environment
e1 <- new.env()
e1$ah <- lm(l ~ p, jk)
e1$bn <- lm(m ~ y, jk)
mod <- mget(ls(envir=e1), envir = e1)
mod
#$ah
#Call:
#lm(formula = l ~ p, data = jk)
#Coefficients:
#(Intercept) p
# 0.4800 0.0145
#$bn
#Call:
#lm(formula = m ~ y, data = jk)
#Coefficients:
#(Intercept) y
# 0.37895 -0.02564
Or another option is using paste
mod1 <- mget(paste0(c("a", "b"), c("h", "n")), envir = e1)
names(mod1)
#[1] "ah" "bn"
This will be useful if there are many objects and we want to return them in a sequence i.e. suppose we have 'ah1', 'ah2', ... in an environment
e2 <- new.env()
e2$ah1 <- 1:5
e2$ah2 <- 1:6
e2$ah3 <- 3:5
new1 <- mget(paste0("ah", 1:3), envir = e2)
new1
#$ah1
#[1] 1 2 3 4 5
#$ah2
#[1] 1 2 3 4 5 6
#$ah3
#[1] 3 4 5
Now, applying the loop to get the predict based on the 'mod'
for (i in names(mod)){
jk[[i]] <- predict(mod[[i]], jk)
}
head(jk)
# y l m p ah bn
#1 0.2925740 0.47038243 0.5268515 0.9267596 0.4934493 0.3714515
#2 0.2248911 0.37568719 0.1203445 0.5141895 0.4874671 0.3731871
#3 0.7042230 0.27253736 0.5068240 0.6584371 0.4895587 0.3608958
#4 0.5188971 0.21981567 0.2168941 0.7158389 0.4903910 0.3656480
#5 0.6626196 0.04366575 0.3655512 0.3298476 0.4847942 0.3619626
#6 0.9204438 0.07509480 0.3494581 0.7410798 0.4907570 0.3553514
data
set.seed(24)
jk<- data.frame(y=runif(40), l=runif(40), m=runif(40), p=runif(40))
Related
I want to write function combinations_features(y, x) which go through all combinations containing three variables and will output r squared, adjusted r squared, AIC and BIC for each combination.
My solution
combinations_features <- function(y, x) {
# Define empty vectors to store statistics
feature_vec_1 <- feature_vec_2 <-
feature_vec_3 <- feature_vec_4 <- c()
# Obtaining all combinations containing three variables
comb_names <- utils::combn(colnames(x), 3)
# For each combination obtain wanted statistics
for (i in 1:ncol(comb_names)) {
feature_vec_1 <- append(
feature_vec_1, summary(lm(y ~ ., data = x[, comb_names[, i]]))$adj.r.squared
)
feature_vec_2 <- append(
feature_vec_2, summary(lm(y ~ ., data = x[, comb_names[, i]]))$r.squared
)
feature_vec_3 <- append(
feature_vec_3, AIC(lm(y ~ ., data = x[, comb_names[, i]]))
)
feature_vec_4 <- append(
feature_vec_4, BIC(lm(y ~ ., data = x[, comb_names[, i]]))
)
}
# Assign everything into data frame
data.frame(
"Adj R2" = feature_vec_1, "R2" = feature_vec_2,
"AIC" = feature_vec_3, "BIC" = feature_vec_4
)
}
Let's see how it works - define some artificial data and give it to the function.
set.seed(42)
predictors <- data.frame(rnorm(100), runif(100), rexp(100), rpois(100, 1))
dependent <- rnorm(100)
> combinations_features(dependent, predictors)
Adj.R2 R2 AIC BIC
1 -0.0283756015 0.002787295 276.2726 289.2985
2 0.0000677269 0.030368705 273.4678 286.4937
3 -0.0011990695 0.029140296 273.5944 286.6203
4 0.0015404392 0.031796789 273.3204 286.3463
However I find this code very inefficient due to these two things:
(1) Loop - I looped it over columns of matrices comb_names, I wonder if it can be omitted somehow
(2) Length of the code - This code is huge! Due to the fact that I define feature_vec for each statistics and append to them separately. I wonder if assigning to them can be done somehow by one command.
Could you please give me hand with improving my code by telling if it's possible to apply (1) or (2) ?
How about this, which relies on bind_rows() from tidyverse? I don't think there's a way to avoid looping over the combinations, but lapply makes everything a little neater, IMHO.
combinations_features1 <- function(y, x) {
comb_names <- utils::combn(colnames(x), 3)
bind_rows(
lapply(
1:ncol(comb_names),
function(z) {
m <- lm(y ~ ., data = x[, comb_names[,z]])
s <- summary(m)
tibble(Adj.R2=s$adj.r.squared, R2=s$r.squared, AIC=AIC(m), BIC=BIC(m))
}
)
)
}
combinations_features1(dependent, predictors)
# A tibble: 4 x 4
Adj.R2 R2 AIC BIC
<dbl> <dbl> <dbl> <dbl>
1 -0.0284 0.00279 276. 289.
2 0.0000677 0.0304 273. 286.
3 -0.00120 0.0291 274. 287.
4 0.00154 0.0318 273. 286.
bind_rows(), if given a list, binds the elements of the list into a single data.frame.
Same idea as above, just directly applying lapply to the list of combinations would also work:
combinations_features <- function(y,x){
do.call(rbind, lapply(utils::combn(colnames(x), 3, simplify=FALSE),
function(i){
f1 <- lm(y ~ ., data=x[, i])
data.frame(Adj.R2=summary(f1)$adj.r.squared,
R2=summary(f1)$r.squared,
AIC=AIC(f1), BIC=BIC(f1))
}))
}
I'm trying to create a R function for a package that will take user data and (the right hand side of) a formula, do some processing, and return a model. But, I'm having trouble when the user data or formula contain variables with the same name as I use internally. A reproducible example,
(Note that updating the formula's environment is required to keep R from looking in the user's R_GlobalEnv for my variable y.)
# R Version 3.6.2
my_function <- function(user_data, user_formula){
y <- as.numeric(user_data[,1] > mean(user_data[,1]))
my_formula <- update.formula(user_formula, y ~ .)
environment(my_formula) <- environment()
my_model <- lm(my_formula, data = user_data, model = TRUE)
return(my_model)
}
some_data <- data.frame(x1 = c(1,2,3,3))
some_formula <- response ~ x1
my_function(some_data, some_formula)
The above is what I want to run, and it works as long as there isn't variable in user_formula or user_data with the name "y". But when the user_data contains a variable with the same name, the model will use that variable instead of mine.
some_data <- data.frame(x1 = c(1,2,3,3), y = c(6,7,5,6))
some_formula <- response ~ x1 + y
my_function(some_data, some_formula)$model
# y x1
# 1 6 1
# 2 7 2
# 3 5 3
# 4 6 3
# Warning messages:
# 1: In model.matrix.default(mt, mf, contrasts) :
# the response appeared on the right-hand side and was dropped
# 2: In model.matrix.default(mt, mf, contrasts) :
# problem with term 2 in model.matrix: no columns are assigned
I tried forcing R to search the function's environment for y by using get(),
my_function <- function(user_data, user_formula){
y <- as.numeric(user_data[,1] > mean(user_data[,1]))
e1 <- environment()
my_formula <- update.formula(user_formula, get("y", e1) ~ .)
environment(my_formula) <- environment()
my_model <- lm(my_formula, data = user_data, model = TRUE)
return(my_model)
}
some_data <- data.frame(x1 = c(1,2,3,3), y = c(6,7,5,6))
some_formula <- response ~ x1 + y
my_function(some_data, some_formula)$model
# get("y", e1) x1 y
# 1 0 1 6
# 2 0 2 7
# 3 1 3 5
# 4 1 3 6
But this also fails if the user data has a variable with the same name as my internal environment name,
some_data <- data.frame(x1 = c(1,2,3,3), y = c(6,7,5,6), e1 = c(1,2,3,4))
some_formula <- response ~ x1 + y + e1
my_function(some_data, some_formula)$model
# Error in get("y", e1) : invalid 'envir' argument
What is the proper way to avoid overlapping my internal variables with user-supplied variable names? I'd prefer a method for base R if possible.
Per docs of lm, the data argument handles variables in formula in two ways that are NOT mutually exclusive:
data
an optional data frame, list or environment (or object coercible by as.data.frame to a data frame) containing the variables in the model. If not found in data, the variables are taken from environment(formula), typically the environment from which lm is called.
Specifically, the calculated y vector in function runs into a name collision with potential y column in data frame. As you noticed and as emphasized in docs above, lm will default to y column before the y vector.
To avoid naming conflict, consider adding a suitable placeholder like response or dependent_variable and raise a tryCatch warning if user supply a data frame with same column name. This approach also allows you to avoid resetting the environment(formula).
my_function <- function(user_data, user_formula){
tryCatch({
user_data$response <- as.numeric(user_data[,1] > mean(user_data[,1]))
my_formula <- update.formula(user_formula, response ~ .)
my_model <- lm(my_formula, data = user_data, model = TRUE)
return(my_model)
}, warning = function(w) message("Please rename column 'response' in your data frame")
, error = function(e) print(e)
)
}
Output
some_data <- data.frame(x1 = c(1,2,3,3))
some_formula <- response ~ x1
my_function(some_data, some_formula)
# Call:
# lm(formula = my_formula, data = user_data, model = TRUE)
# Coefficients:
# (Intercept) x1
# -0.7273 0.5455
some_data <- data.frame(x1 = c(1,2,3,3), y = c(6,7,5,6), e1 = c(1,2,3,4))
some_formula <- response ~ x1 + y + e1
my_function(some_data, some_formula)
# Call:
# lm(formula = my_formula, data = user_data, model = TRUE)
# Coefficients:
# (Intercept) x1 y e1
# 1.667e+00 3.850e-16 -3.333e-01 3.333e-01
some_data <- data.frame(x1 = c(1,2,3,3), y = c(6,7,5,6), response=c(1,1,1,1))
some_formula <- response ~ x1 + y + response
my_function(some_data, some_formula)
# Please rename column 'response' in your data frame
Say I have a data frame like this:
X <- data_frame(
x = rep(seq(from = 1, to = 10, by = 1), 3),
y = 2*x + rnorm(length(x), sd = 0.5),
g = rep(LETTERS[1:3], each = length(x)/3))
How can I fit a regression y~x grouped by variable g and add the values from the fitted and resid generic methods to the data frame?
I know I can do:
A <- X[X$g == "A",]
mA <- with(A, lm(y ~ x))
A$fit <- fitted(mA)
A$res <- resid(mA)
B <- X[X$g == "B",]
mB <- with(B, lm(y ~ x))
B$fit <- fitted(mB)
B$res <- resid(mB)
C <- X[X$g == "C",]
mC <- with(B, lm(y ~ x))
C$fit <- fitted(mC)
C$res <- resid(mC)
And then rbind(A, B, C). However, in real life I am not using lm (I'm using rqss in the quantreg package). The method occasionally fails, so I need error handling, where I'd like to place NA all the rows that failed. Also, there are way more than 3 groups, so I don't want to just keep copying and pasting code for each group.
I tried using dplyr with do but didn't make any progress. I was thinking it might be something like:
make_qfits <- function(data) {
data %>%
group_by(g) %>%
do(failwith(NULL, rqss), formula = y ~ qss(x, lambda = 3))
}
Would this be easy to do by that approach? Is there another way in base R?
You can use do on grouped data for this task, fitting the model in each group in do and putting the model residuals and fitted values into a data.frame. To add these to the original data, just include the . that represents the data going into do in the output data.frame.
In your simple case, this would look like this:
X %>%
group_by(g) %>%
do({model = rqss(y ~ qss(x, lambda = 3), data = .)
data.frame(., residuals = resid.rqss(model), fitted = fitted(model))
})
Source: local data frame [30 x 5]
Groups: g
x y g residuals fitted
1 1 1.509760 A -1.368963e-08 1.509760
2 2 3.576973 A -8.915993e-02 3.666133
3 3 6.239950 A 4.174453e-01 5.822505
4 4 7.978878 A 4.130033e-09 7.978878
5 5 10.588367 A 4.833475e-01 10.105020
6 6 11.786445 A -3.807876e-01 12.167232
7 7 14.646221 A 4.167763e-01 14.229445
8 8 15.938253 A -3.534045e-01 16.291658
9 9 19.114927 A 7.610560e-01 18.353871
10 10 19.574449 A -8.416343e-01 20.416083
.. .. ... . ... ...
Things will look more complicated if you need to catch errors. Here is what it would look like using try and filling the residuals and fitted columns with NA if fit attempt for the group results in an error.
X[9:30,] %>%
group_by(g) %>%
do({catch = try(rqss(y ~ qss(x, lambda = 3), data = .))
if(class(catch) == "try-error"){
data.frame(., residuals = NA, fitted = NA)
}
else{
model = rqss(y ~ qss(x, lambda = 3), data = .)
data.frame(., residuals = resid.rqss(model), fitted = fitted(model))
}
})
Source: local data frame [22 x 5]
Groups: g
x y g residuals fitted
1 9 19.114927 A NA NA
2 10 19.574449 A NA NA
3 1 2.026199 B -4.618675e-01 2.488066
4 2 4.399768 B 1.520739e-11 4.399768
5 3 6.167690 B -1.437800e-01 6.311470
6 4 8.642481 B 4.193089e-01 8.223172
7 5 10.255790 B 1.209160e-01 10.134874
8 6 12.875674 B 8.290981e-01 12.046576
9 7 13.958278 B -4.803891e-10 13.958278
10 8 15.691032 B -1.789479e-01 15.869980
.. .. ... . ... ...
For the lm models you could try
library(nlme) # lmList to do lm by group
library(ggplot2) # fortify to get out the fitted/resid data
do.call(rbind, lapply(lmList(y ~ x | g, data=X), fortify))
This gives you the residual and fitted data in ".resid" and ".fitted" columns as well as a bunch of other fit data. By default the rownames will be prefixed with the letters from g.
With the rqss models that might fail
do.call(rbind, lapply(split(X, X$g), function(z) {
fit <- tryCatch({
rqss(y ~ x, data=z)
}, error=function(e) NULL)
if (is.null(fit)) data.frame(resid=numeric(0), fitted=numeric(0))
else data.frame(resid=fit$resid, fitted=fitted(fit))
}))
Here's a version that works with base R:
modelit <- function(df) {
mB <- with(df, lm(y ~ x, na.action = na.exclude))
df$fit <- fitted(mB)
df$res <- resid(mB)
return(df)
}
dfs.with.preds <- lapply(split(X, as.factor(X$g)), modelit)
output <- Reduce(function(x, y) { rbind(x, y) }, dfs.with.preds)
It is possible to use a shortcut for formula in lm()
m <- matrix(rnorm(100), ncol=5)
lm(m[,1] ~ m[,2:5]
here it would be the same as
lm(m[,1] ~ m[,2] + m[,3] + m[,4] + m[,5]
but in the case when variables are not of the same level (at least this is my assumption for now) this does not work and I get the error:
Error in model.frame.default(formula = hm[, 1] ~ hm[, 2:4], drop.unused.levels = TRUE) :
invalid type (list) for variable 'hm[, 2:4]'
Data (hm):
N cor.distance switches time
1 50 0.04707842 2 0.003
2 100 -0.10769441 2 0.004
3 200 -0.01278359 2 0.004
4 300 0.04229509 5 0.008
5 500 -0.04490092 6 0.010
6 1000 0.01939561 4 0.007
Is there some shortcut still possible to avoid having to write the long formula?
Try lm(y ~ ., data) where . means "every other column in data besides y.
m <- matrix(rnorm(100), ncol =5)
m <- as.data.frame(m)
names(m) <- paste("m", 1:5, sep="")
lm(m1 ~., data=m)
You can reassign m to include only the columns you as the predictors
m <- m[ ,2:4]
lm(m1 ~ ., data=m)
There is another one shortcut for the cases when a dependent variable is in the first column:
data <- data.frame(y = rnorm(10), x1 = rnorm(10), x2 = rnorm(10))
lm(data)
It is possible to use a shortcut for formula in lm()
m <- matrix(rnorm(100), ncol=5)
lm(m[,1] ~ m[,2:5]
here it would be the same as
lm(m[,1] ~ m[,2] + m[,3] + m[,4] + m[,5]
but in the case when variables are not of the same level (at least this is my assumption for now) this does not work and I get the error:
Error in model.frame.default(formula = hm[, 1] ~ hm[, 2:4], drop.unused.levels = TRUE) :
invalid type (list) for variable 'hm[, 2:4]'
Data (hm):
N cor.distance switches time
1 50 0.04707842 2 0.003
2 100 -0.10769441 2 0.004
3 200 -0.01278359 2 0.004
4 300 0.04229509 5 0.008
5 500 -0.04490092 6 0.010
6 1000 0.01939561 4 0.007
Is there some shortcut still possible to avoid having to write the long formula?
Try lm(y ~ ., data) where . means "every other column in data besides y.
m <- matrix(rnorm(100), ncol =5)
m <- as.data.frame(m)
names(m) <- paste("m", 1:5, sep="")
lm(m1 ~., data=m)
You can reassign m to include only the columns you as the predictors
m <- m[ ,2:4]
lm(m1 ~ ., data=m)
There is another one shortcut for the cases when a dependent variable is in the first column:
data <- data.frame(y = rnorm(10), x1 = rnorm(10), x2 = rnorm(10))
lm(data)