Function which outputs statistics for each variable combination - r

I want to write function combinations_features(y, x) which go through all combinations containing three variables and will output r squared, adjusted r squared, AIC and BIC for each combination.
My solution
combinations_features <- function(y, x) {
# Define empty vectors to store statistics
feature_vec_1 <- feature_vec_2 <-
feature_vec_3 <- feature_vec_4 <- c()
# Obtaining all combinations containing three variables
comb_names <- utils::combn(colnames(x), 3)
# For each combination obtain wanted statistics
for (i in 1:ncol(comb_names)) {
feature_vec_1 <- append(
feature_vec_1, summary(lm(y ~ ., data = x[, comb_names[, i]]))$adj.r.squared
)
feature_vec_2 <- append(
feature_vec_2, summary(lm(y ~ ., data = x[, comb_names[, i]]))$r.squared
)
feature_vec_3 <- append(
feature_vec_3, AIC(lm(y ~ ., data = x[, comb_names[, i]]))
)
feature_vec_4 <- append(
feature_vec_4, BIC(lm(y ~ ., data = x[, comb_names[, i]]))
)
}
# Assign everything into data frame
data.frame(
"Adj R2" = feature_vec_1, "R2" = feature_vec_2,
"AIC" = feature_vec_3, "BIC" = feature_vec_4
)
}
Let's see how it works - define some artificial data and give it to the function.
set.seed(42)
predictors <- data.frame(rnorm(100), runif(100), rexp(100), rpois(100, 1))
dependent <- rnorm(100)
> combinations_features(dependent, predictors)
Adj.R2 R2 AIC BIC
1 -0.0283756015 0.002787295 276.2726 289.2985
2 0.0000677269 0.030368705 273.4678 286.4937
3 -0.0011990695 0.029140296 273.5944 286.6203
4 0.0015404392 0.031796789 273.3204 286.3463
However I find this code very inefficient due to these two things:
(1) Loop - I looped it over columns of matrices comb_names, I wonder if it can be omitted somehow
(2) Length of the code - This code is huge! Due to the fact that I define feature_vec for each statistics and append to them separately. I wonder if assigning to them can be done somehow by one command.
Could you please give me hand with improving my code by telling if it's possible to apply (1) or (2) ?

How about this, which relies on bind_rows() from tidyverse? I don't think there's a way to avoid looping over the combinations, but lapply makes everything a little neater, IMHO.
combinations_features1 <- function(y, x) {
comb_names <- utils::combn(colnames(x), 3)
bind_rows(
lapply(
1:ncol(comb_names),
function(z) {
m <- lm(y ~ ., data = x[, comb_names[,z]])
s <- summary(m)
tibble(Adj.R2=s$adj.r.squared, R2=s$r.squared, AIC=AIC(m), BIC=BIC(m))
}
)
)
}
combinations_features1(dependent, predictors)
# A tibble: 4 x 4
Adj.R2 R2 AIC BIC
<dbl> <dbl> <dbl> <dbl>
1 -0.0284 0.00279 276. 289.
2 0.0000677 0.0304 273. 286.
3 -0.00120 0.0291 274. 287.
4 0.00154 0.0318 273. 286.
bind_rows(), if given a list, binds the elements of the list into a single data.frame.

Same idea as above, just directly applying lapply to the list of combinations would also work:
combinations_features <- function(y,x){
do.call(rbind, lapply(utils::combn(colnames(x), 3, simplify=FALSE),
function(i){
f1 <- lm(y ~ ., data=x[, i])
data.frame(Adj.R2=summary(f1)$adj.r.squared,
R2=summary(f1)$r.squared,
AIC=AIC(f1), BIC=BIC(f1))
}))
}

Related

Manually created formula seems to not work...but why?

Let's consider data following :
library(pglm)
data('UnionWage', package = 'pglm')
df1 <- data.frame(UnionWage[1:2],'wage' = UnionWage$wage, 'exper' = UnionWage$exper)
head(df1)
id year wage exper
1 13 1980 1.197540 1
2 13 1981 1.853060 2
3 13 1982 1.344462 3
4 13 1983 1.433213 4
5 13 1984 1.568125 5
6 13 1985 1.699891 6
I want to write a function which will fit logistic random panel regression to data :
Function
fit_panel_binary <- function(y, x) {
x <- cbind(x, y)
#Taking varnames since 3 (first and second are respectively id and year)
varnames <- names(x)[3:(length(x))]
#Excluding dependent variable from varnames
varnames <- varnames[!(varnames == names(y))]
#Creating formula of independent variables (sum)
form <- paste0(varnames, collapse = "+")
x_copy <- data.frame(x)
#Performing panel random regression
model <- pglm(as.formula(paste0(names(y), "~", form)), data = as.matrix(x_copy),
model = 'random', family = binomial(link = 'logit'))
}
And error occrus :
fit_panel_binary(UnionWage['union'],df1)
Error in if (!id.name %in% names(x)) stop(paste("variable ", id.name, :
argument is of length zero
I though that I put formula incorrectly so I changed my function to outputs formula paste0(names(y), "~", form)
and this is what I got :
"union~wage+exper"
It seems that formula is correct, but regression says something different. DO you know where is the problem ?
Unfortunately, because of the way pglm captures its arguments using non-standard evaluation, it doesn't like being passed formulas that have to be evaluated (i.e. formulas that aren't typed in situ). You can get round this by calling it with do.call, but even then you have to be careful that the data is available in the frame in which pglm is called. So you have to do something like:
fit_panel_binary <- function(y, x) {
x <- cbind(x, y)
varnames <- names(x)[3:(length(x))]
varnames <- varnames[!(varnames == names(y))]
form <- paste0(varnames, collapse = "+")
x_copy <- data.frame(x)
form <- as.formula(paste(names(y), "~", form))
params <- list(formula = form, data = x_copy, model = "random",
family = binomial(link = "logit"))
pglm_env <- list2env(params, envir = new.env())
do.call("pglm", params, envir = pglm_env)
}
Allowing:
fit_panel_binary(UnionWage['union'], df1)
#> Maximum Likelihood estimation
#> Newton-Raphson maximisation, 6 iterations
#> Return code 1: gradient close to zero
#> Log-Likelihood: -1655.081 (4 free parameter(s))
#> Estimate(s): -3.428254 0.8305558 -0.06590541 4.2625
Created on 2020-12-08 by the reprex package (v0.3.0)

R Loop For Extracting Values

hsb2 <- read.csv("https://stats.idre.ucla.edu/stat/data/hsb2.csv")
names(hsb2)
varlist <- names(hsb2)[8:11]
models <- lapply(varlist, function(x) {
lm(substitute(read ~ i, list(i = as.name(x))), data = hsb2)
})
## look at the first element of the list, model 1
models[[1]]
The code above generates a series of simple regression models for different independent variables. My priority is to then extract the coefficient and standard error for each of the variables listed in varlist. My attempt shows below.
ATTEMPT = lapply(1:length(models), function(x) {
cbind(cov, coef(summary(models[[x]]))[2,1:2])})
My hopeful output will show three columns--variable, coefficient, std. error:
How about:
ATTEMPT2 = lapply(1:length(models), function(x) {
cf <- coef(summary(models[[x]]))
data.frame(Variable=rownames(cf)[2],
Estimate=cf[2,1],
Std.Error=cf[2,2])})
(df2 <- do.call("rbind", ATTEMPT2))
# Variable Estimate Std.Error
# 1 write 0.6455300 0.06168323
# 2 math 0.7248070 0.05827449
# 3 science 0.6525644 0.05714318
# 4 socst 0.5935322 0.05317162

Rolling regression and prediction with lm() and predict()

I need to apply lm() to an enlarging subset of my dataframe dat, while making prediction for the next observation. For example, I am doing:
fit model predict
---------- -------
dat[1:3, ] dat[4, ]
dat[1:4, ] dat[5, ]
. .
. .
dat[-1, ] dat[nrow(dat), ]
I know what I should do for a particular subset (related to this question: predict() and newdata - How does this work?). For example to predict the last row, I do
dat1 = dat[1:(nrow(dat)-1), ]
dat2 = dat[nrow(dat), ]
fit = lm(log(clicks) ~ log(v1) + log(v12), data=dat1)
predict.fit = predict(fit, newdata=dat2, se.fit=TRUE)
How can I do this automatically for all subsets, and potentially extract what I want into a table?
From fit, I'd need the summary(fit)$adj.r.squared;
From predict.fit I'd need predict.fit$fit value.
Thanks.
(Efficient) solution
This is what you can do:
p <- 3 ## number of parameters in lm()
n <- nrow(dat) - 1
## a function to return what you desire for subset dat[1:x, ]
bundle <- function(x) {
fit <- lm(log(clicks) ~ log(v1) + log(v12), data = dat, subset = 1:x, model = FALSE)
pred <- predict(fit, newdata = dat[x+1, ], se.fit = TRUE)
c(summary(fit)$adj.r.squared, pred$fit, pred$se.fit)
}
## rolling regression / prediction
result <- t(sapply(p:n, bundle))
colnames(result) <- c("adj.r2", "prediction", "se")
Note I have done several things inside the bundle function:
I have used subset argument for selecting a subset to fit
I have used model = FALSE to not save model frame hence we save workspace
Overall, there is no obvious loop, but sapply is used.
Fitting starts from p, the minimum number of data required to fit a model with p coefficients;
Fitting terminates at nrow(dat) - 1, as we at least need the final column for prediction.
Test
Example data (with 30 "observations")
dat <- data.frame(clicks = runif(30, 1, 100), v1 = runif(30, 1, 100),
v12 = runif(30, 1, 100))
Applying code above gives results (27 rows in total, truncated output for 5 rows)
adj.r2 prediction se
[1,] NaN 3.881068 NaN
[2,] 0.106592619 3.676821 0.7517040
[3,] 0.545993989 3.892931 0.2758347
[4,] 0.622612495 3.766101 0.1508270
[5,] 0.180462206 3.996344 0.2059014
The first column is the adjusted-R.squared value for fitted model, while the second column is the prediction. The first value for adj.r2 is NaN, because the first model we fit has 3 coefficients for 3 data points, hence no sensible statistics is available. The same happens to se as well, as the fitted line has no 0 residuals, so prediction is done without uncertainty.
I just made up some random data to use for this example. I'm calling the object data because that was what it was called in the question at the time that I wrote this solution (call it anything you like).
(Efficient) Solution
data <- data.frame(v1=rnorm(100),v2=rnorm(100),clicks=rnorm(100))
data1 = data[1:(nrow(data)-1), ]
data2 = data[nrow(data), ]
for(i in 3:nrow(data)){
nam <- paste("predict", i, sep = "")
nam1 <- paste("fit", i, sep = "")
nam2 <- paste("summary_fit", i, sep = "")
fit = lm(clicks ~ v1 + v2, data=data[1:i,])
tmp <- predict(fit, newdata=data2, se.fit=TRUE)
tmp1 <- fit
tmp2 <- summary(fit)
assign(nam, tmp)
assign(nam1, tmp1)
assign(nam2, tmp2)
}
All of the results you want will be stored in the data objects this creates.
For example:
> summary_fit10$r.squared
[1] 0.3087432
You mentioned in the comments that you'd like a table of results. You can programmatically create tables of results from the 3 types of output files like this:
rm(data,data1,data2,i,nam,nam1,nam2,fit,tmp,tmp1,tmp2)
frames <- ls()
frames.fit <- frames[1:98] #change index or use pattern matching as needed
frames.predict <- frames[99:196]
frames.sum <- frames[197:294]
fit.table <- data.frame(intercept=NA,v1=NA,v2=NA,sourcedf=NA)
for(i in 1:length(frames.fit)){
tmp <- get(frames.fit[i])
fit.table <- rbind(fit.table,c(tmp$coefficients[[1]],tmp$coefficients[[2]],tmp$coefficients[[3]],frames.fit[i]))
}
fit.table
> fit.table
intercept v1 v2 sourcedf
2 -0.0647017971121678 1.34929652763687 -0.300502017324518 fit10
3 -0.0401617893034109 -0.034750571912636 -0.0843076273486442 fit100
4 0.0132968863522573 1.31283604433593 -0.388846211083564 fit11
5 0.0315113918953643 1.31099122173898 -0.371130010135382 fit12
6 0.149582794027583 0.958692838785998 -0.299479715938493 fit13
7 0.00759688947362175 0.703525856001948 -0.297223988673322 fit14
8 0.219756240025917 0.631961979610744 -0.347851129205841 fit15
9 0.13389223748979 0.560583832333355 -0.276076134872669 fit16
10 0.147258022154645 0.581865844000838 -0.278212722024832 fit17
11 0.0592160359650468 0.469842498721747 -0.163187274356457 fit18
12 0.120640756525163 0.430051839741539 -0.201725012088506 fit19
13 0.101443924785995 0.34966728554219 -0.231560038360121 fit20
14 0.0416637001406594 0.472156988919337 -0.247684504074867 fit21
15 -0.0158319749710781 0.451944113682333 -0.171367482879835 fit22
16 -0.0337969739950376 0.423851304105399 -0.157905431162024 fit23
17 -0.109460218252207 0.32206642419212 -0.055331391802687 fit24
18 -0.100560410735971 0.335862465403716 -0.0609509815266072 fit25
19 -0.138175283219818 0.390418411384468 -0.0873106257144312 fit26
20 -0.106984355317733 0.391270279253722 -0.0560299858019556 fit27
21 -0.0740684978271464 0.385267011513678 -0.0548056844433894 fit28

R: assigning objects to vector in loop

I'm generating regressions of 30 subpopulations in a for-loop and assigning them to successive elements of a list (matrix?). It seems like only the first component of each regression is making it into the list. The simple version goes like this:
i <- 30
num30 <- with(gapMeans, lm(DHt[Gap==i] ~ Time[Gap==i]))
Works just as I want. The loop version goes:
gmRegs <- NULL
for (i in 1:30){
gmRegs[i] <- with(gapMeans,
if(Ht[Gap==i][1] > 1){
lm(DHt[Gap==i] ~ Time[Gap==i])
} else {NULL}
)}
That runs correctly but:
num30
# Call:
# lm(formula = DHt[Gap == i] ~ Time[Gap == i])
#
# Coefficients:
# (Intercept) Time[Gap == i]
# 24.56874 -0.01546
gmRegs[30]
# [[1]]
# (Intercept) Time[Gap == i]
# 24.56874082 -0.01546019
And str() describes num30 as a list of 13 while gmRegs[30] is a list of 1, and when I try to do abline(reg=gmRegs[30]) it won't. So it seems like my assignment is doing only thing1[1] <- thing2[1], or something to that effect -- I just can't figure out how to properly box up the lm() object to all fit in the list slot.
When you save an lm as an item to a list, the lm itself is a structured element in R. As you have noted, running str(num30) returns a list of 13 things. If you want to save each lm as an element in a list, you can do the following:
# generate random data
response <- runif(90,0,1)
time <- runif(90,10,20)
gap <- rep(1:30,3)
gapMeans <- data.frame(gap,response,time)
Now, head(gapMeans) returns
gap response time
1 1 0.6809973 12.66655
2 2 0.5473042 11.73821
3 3 0.6095777 18.96527
4 4 0.3081830 15.62343
5 5 0.1640612 13.42454
6 6 0.8473997 12.83730
As Richard pointed above, you can rewrite your with call as the following lm:
num30 <- lm(response[gap==30] ~ time[gap==30], data = gapMeans)
Now for your loop you can simply write the following:
gmRegs <- NULL
for(i in 1:30){
gmRegs[[i]] <- lm(response[gap==i] ~ time[gap==i], data= gapMeans)
}
Now each element of gmRegs, accessed via gmRegs[[30]] is itself a lm object.
plot(gapMeans$time[gapMeans$gap==30], gapMeans$response[gapMeans$gap==30], xlab = 'time', ylab = 'response')
abline(gmRegs[[30]]$coefficients, col = "red")

Create and Call Linear Models from List

So I'm trying to compare different linear models in order to determine if one is better than another. However I have several models, so I want to create an list of models and then call on them. Is that possible?
Models <- list(lm(y~a),lm(y~b),lm(y~c)
Models2 <- list(lm(y~a+b),lm(y~a+c),lm(y~b+c))
anova(Models2[1],Models[1])
Thank you for your help!
If you have two lists of models, and you want to compare each pair of models, then you want Map:
models1 <- list(lm(y ~ a), lm(y ~ b), lm(y ~ c)
models2 <- list(lm(y ~ a + b), lm(y ~ a + c), lm(y ~ b + c))
Map(anova, models1, models2)
This is basically equivalent to the following for loop:
out <- vector("list", length(models1))
for (i in seq_along(out) {
out[[i]] <- anova(models1[[i]], models2[[i]])
}
Map is an example of a functional, and you can find out more about them at https://github.com/hadley/devtools/wiki/Functionals
You can use do.call to convert a list of any length into a call suitable for a function taking .... The only trick here is that anova expects the first model to be named--that's what the Curry handles by creating a new function which already has its first argument specified.
Put everything except the first model (call it lm1) into one list called Models.
Then:
library(functional)
do.call( Curry(anova, object=lm1), Models )
Example:
> Models <- list( lm(runif(10)~rnorm(10)),lm(runif(10)~rnorm(10)),lm(runif(10)~rnorm(10)) )
> lm1 <- lm(runif(10)~rnorm(10))
> do.call( Curry(anova, object=lm1), Models )
Analysis of Variance Table
Model 1: runif(10) ~ rnorm(10)
Model 2: runif(10) ~ rnorm(10)
Model 3: runif(10) ~ rnorm(10)
Model 4: runif(10) ~ rnorm(10)
Res.Df RSS Df Sum of Sq F Pr(>F)
1 8 0.46614
2 8 0.59522 0 -0.12908
3 8 1.00869 0 -0.41346
4 8 0.81686 0 0.19182
x <- rnorm(100,0,1)
y <- rnorm(100,5,2)
z <- rnorm(100,8,1)
models <- list(y.x = lm(y~x), y.z = lm(y~z))
anova(models[[1]],models[[2]])
This worked for me.

Resources