I currently have two data tables one of which contains independent and control variables in columns while the other contains rows of dependent variables.
Can anybody help in creating a method to do linear models from the two tables that repeats for each row in the dependent value table?
You haven't provided nor a reproducible example, nor the desired output, so I'll have to guess
If this is your column names vector
vec <- LETTERS[1:3]
And this is your data set
set.seed(1)
df <- data.frame(A = sample(10, 10),
B = sample(20, 10),
C = sample(30, 10))
Then you can try something like
lapply(vec,
function(x) lm(as.formula(paste(x, "~",
paste(setdiff(names(df), x),
collapse = "+"))),
data = df))
Which will give
# [[1]]
#
# Call:
# lm(formula = as.formula(paste(x, "~", paste(setdiff(names(df),
# x), collapse = "+"))), data = df)
#
# Coefficients:
# (Intercept) B C
# 4.9687 0.2410 -0.1565
#
#
# [[2]]
#
# Call:
# lm(formula = as.formula(paste(x, "~", paste(setdiff(names(df),
# x), collapse = "+"))), data = df)
#
# Coefficients:
# (Intercept) A C
# 2.7975 0.8182 0.2775
#
#
# [[3]]
#
# Call:
# lm(formula = as.formula(paste(x, "~", paste(setdiff(names(df),
# x), collapse = "+"))), data = df)
#
# Coefficients:
# (Intercept) A B
# 13.200 -1.675 0.875
Related
I generated something that takes data, which adds all column names to the linear regression, adds their interactions and their second degree of that variable, and uses step function to finds out the best linear regression model. I added both all column names and interactions but here I have a problem adding a second degree of the variables. I use the above-given code but it gives an
Error in (colnames(dpt)[i])^2 : non-numeric argument to binary operator" error
Could anyone help? I really appreciate it.
for (i in colnames(dpt)) {
lmod <- lmod %>% update(paste('. ~ . + I((colnames(dpt)[i])^2)'),
data = dpt)
}
Using sprintf.
fit <- lm(mpg ~ cyl, mtcars)
v <- names(mtcars)[-(1:2)]
res <- lapply(v, \(x) update(fit, sprintf('. ~ . + I(%s^2)', x)))
Or, if you are into pipes:
res <- lapply(v, \(x) fit |> update(sprintf('. ~ . + I(%s^2)', x)))
Or if you cannot do without the for loop:
res <- vector(mode='list', length=length(v))
for (i in seq_along(v)) {
res[[i]] <- fit |> update(sprintf('. ~ . + I(%s^2)', v[i]))
}
Gives
res
# [[1]]
#
# Call:
# lm(formula = mpg ~ cyl + I(disp^2), data = mtcars)
#
# Coefficients:
# (Intercept) cyl I(disp^2)
# 3.543e+01 -2.246e+00 -2.115e-05
#
#
# [[2]]
#
# Call:
# lm(formula = mpg ~ cyl + I(hp^2), data = mtcars)
#
# Coefficients:
# (Intercept) cyl I(hp^2)
# 3.705e+01 -2.641e+00 -2.374e-05
#
#
# [[3]]
#
# Call:
# lm(formula = mpg ~ cyl + I(drat^2), data = mtcars)
#
# Coefficients:
# (Intercept) cyl I(drat^2)
# 31.6655 -2.4568 0.2745
#
#
# [[4]]
#
# Call:
# lm(formula = mpg ~ cyl + I(wt^2), data = mtcars)
#
# Coefficients:
# (Intercept) cyl I(wt^2)
# 35.8818 -1.9282 -0.3423
#
#
# [[5]]
#
# Call:
# lm(formula = mpg ~ cyl + I(qsec^2), data = mtcars)
#
# Coefficients:
# (Intercept) cyl I(qsec^2)
# 43.73138 -3.15264 -0.01285
#
#
# [[6]]
#
# Call:
# lm(formula = mpg ~ cyl + I(vs^2), data = mtcars)
#
# Coefficients:
# (Intercept) cyl I(vs^2)
# 39.6250 -3.0907 -0.9391
#
#
# [[7]]
#
# Call:
# lm(formula = mpg ~ cyl + I(am^2), data = mtcars)
#
# Coefficients:
# (Intercept) cyl I(am^2)
# 34.522 -2.501 2.567
#
#
# [[8]]
#
# Call:
# lm(formula = mpg ~ cyl + I(gear^2), data = mtcars)
#
# Coefficients:
# (Intercept) cyl I(gear^2)
# 36.03695 -2.76010 0.08013
#
#
# [[9]]
#
# Call:
# lm(formula = mpg ~ cyl + I(carb^2), data = mtcars)
#
# Coefficients:
# (Intercept) cyl I(carb^2)
# 37.55404 -2.74723 -0.04454
That is not a duplicate of this (Edit and reuse the formula part of the call for a model in R) but rather an extension.
So let's say I have several regression models to test. I also have a set of socio demographic variables that I want to add to each model to control for them.
m1 <- lm(a~b)
m1_full <- lm(a~b+age+gender+income)
m2 <- lm(c~d)
m2_full <- lm(c~d+age+gender+income)
Is there a way to store (age+gender+income) and add it to some models?
Something like that (in pseudocode):
ses <- age+gender+income
m1 <- lm(a~b)
m1_full <- lm(a~b+... ses)
m2 <- lm(c~d)
m2_full <- lm(c~d+...ses)
I guess you can just paste everything together into a formula. Here's a function that would do the whole process:
set.seed(0)
df <- data.frame(age = rpois(200,45),
gender = rbinom(200,1,0.5),
income = rnorm(200,30000,10000),
a = rnorm(200),
b = rnorm(200),
c = rnorm(200),
d = rnorm(200))
ses <- c("age", "gender", "income")
get_lm_model <- function(df, outcome_var, pred_var, ses) {
fm <- as.formula(paste(outcome_var, "~", pred_var, "+", paste(ses, collapse = " + ")))
lm(fm, data = df)
}
get_lm_model(df, "a", "b", ses)
#>
#> Call:
#> lm(formula = fm, data = df)
#>
#> Coefficients:
#> (Intercept) b age gender income
#> 3.345e-01 -9.516e-02 -3.748e-03 -7.033e-02 -6.718e-06
get_lm_model(df, "c", "d", ses)
#>
#> Call:
#> lm(formula = fm, data = df)
#>
#> Coefficients:
#> (Intercept) d age gender income
#> 2.775e-01 5.570e-02 -4.564e-03 -5.359e-02 -5.898e-06
Created on 2021-10-26 by the reprex package (v2.0.1)
I have a dataset similar to the below structure that I'd like to run several single regressions with:
example <- tibble(
id = paste0('ID', runif(100,0,10)),
response = runif(100,0,10),
x1 = runif(100,0,10),
x2 = factor(rep(seq(1,5),20)))
regression_1 <- lm(response ~ x1, data = example)
regression_2 <- lm(response ~ x2, data = example)
And so on for n predictors. I've tried a variety of approaches but I keep running into an error and cannot figure out for the life of me any neat and easy way to do this.
Appreciate any help
Using reformulate.
lapply(c("x1", "x2", "xn"), function(x) lm(reformulate(x, response="response"), d))
# [[1]]
#
# Call:
# lm(formula = reformulate(x, response = "response"), data = d)
#
# Coefficients:
# (Intercept) x1
# 0.03567 0.03603
#
#
# [[2]]
#
# Call:
# lm(formula = reformulate(x, response = "response"), data = d)
#
# Coefficients:
# (Intercept) x2
# 0.03098 -0.14824
#
#
# [[3]]
#
# Call:
# lm(formula = reformulate(x, response = "response"), data = d)
#
# Coefficients:
# (Intercept) xn
# 0.02961 0.08823
Data:
set.seed(42)
d <- data.frame(matrix(rnorm(400), 100, 4, dimnames=list(NULL, c("response", "x1", "x2", "xn"))))
We can subset the data and use ~ .
lapply(c("x1", "x2", "xn"), function(x) lm(response ~ ., df1[c('response', x)]))
I create random data in R like that:
data<-matrix(rnorm(100*5,mean=0,sd=1), 100, 5)
colnames(data) <- c("X1", "X2", "X3", "X4", "X5")
data <- as.data.frame(data)
a <- 5
b <- 0.8
c <- 100
and then i want to "play" with the correlations of those data and do something like the following
data[,2] <- a*data[,1] - b*rnorm(c)
data[,3] <- a*data[,1] + b*rnorm(c)
data[,4] <- a*data[,1] - b*rnorm(c)
after that i perform the following code
data<-matrix(rnorm(100*5,mean=0,sd=1), 100, 5)
colnames(data) <- c("X1", "X2", "X3", "X4", "X5")
data <- as.data.frame(data)
a <- 5
b <- 0.8
c <- 100
data[,2] <- a*data[,1] - b*rnorm(c)
data[,3] <- a*data[,1] + b*rnorm(c)
data[,4] <- a*data[,1] - b*rnorm(c)
library(glmnet)
library(coefplot)
A <- as.matrix(data)
set.seed(1)
results <- lapply(seq_len(ncol(A)), function(i) {
list(
cvfit = cv.glmnet(A[, -i] , A[, i] , standardize = TRUE , type.measure = "mse" , nfolds = 10 , alpha = 1)
)
})
lam <- as.data.frame(`names<-`(
lapply(results, function(x) (x$cvfit$lambda.min)),
paste0("X", seq_along(results))
))
sigma<- matrix(rnorm(1*5,mean=0,sd=1), 1, 5)
colnames(sigma) <- c("X1", "X2", "X3", "X4", "X5")
as.vector(sigma)
sub1.sigma <- subset(sigma, select = sigma <= sum(lam))
sub2.sigma <- subset(sigma, select = sigma <= 2*sum(lam))
sub3.sigma <- subset(sigma, select = sigma <= 3*sum(lam))
which results in a vector 1x5 called sigma and 3 vectors sub1.sigma, sub2.sigma, sub3.sigma like the following
> sigma
X1 X2 X3 X4 X5
38.64019 624.4896 0 0 0
> sub1.sigma
X1 X3 X4 X5
1 38.64019 0 0 0
> sub2.sigma
X1 X3 X4 X5
1 38.64019 0 0 0
> sub3.sigma
X1 X3 X4 X5
1 38.64019 0 0 0
The generated data are random and i usually use a set.seed() to produce the same results. I want, if it's possible without modify the main code, to run my code 100 times (with different data each time) and save in 4 dataframes the correspanding results sigma sub1.sigma, sub2.sigma, sub3.sigma in order to compare the them. Is there any way to achieve that in R?
Based on comments i manage to create the following but still doesn't seem to give the desired results. FIrst of all code[1:10] display 10 vectors which represent what? the sigma? are those the sigma of each run? how can i make it calculate the sub.sigma also?
set.seed(2021)
code <- replicate(10,{
data<-matrix(rnorm(100*5,mean=0,sd=1), 100, 5)
colnames(data) <- c("X1", "X2", "X3", "X4", "X5")
data <- as.data.frame(data)
a <- 5
b <- 0.8
c <- 100
data[,2] <- a*data[,1] - b*rnorm(c)
data[,3] <- a*data[,1] + b*rnorm(c)
data[,4] <- a*data[,1] - b*rnorm(c)
library(glmnet)
library(coefplot)
A <- as.matrix(data)
set.seed(1)
results <- lapply(seq_len(ncol(A)), function(i) {
list(
cvfit = cv.glmnet(A[, -i] , A[, i] , standardize = TRUE , type.measure = "mse" , nfolds = 10 , alpha = 1)
)
})
lam <- as.data.frame(`names<-`(
lapply(results, function(x) (x$cvfit$lambda.min)),
paste0("X", seq_along(results))
))
sigma<- matrix(rnorm(1*5,mean=0,sd=1), 1, 5)
colnames(sigma) <- c("X1", "X2", "X3", "X4", "X5")
as.vector(sigma)
sub1.sigma <- subset(sigma, select = sigma <= sum(lam))
sub2.sigma <- subset(sigma, select = sigma <= 2*sum(lam))
sub3.sigma <- subset(sigma, select = sigma <= 3*sum(lam))
}, simplify = FALSE)
code[1:10]
sigmas <- as.data.frame(do.call(rbind,lapply(code, sigma)))
I'm a fan of keeping models around for various reasons, so I'll start with a list of models-run. In this case, replicate(n, ..., simplify=FALSE) returns a list of whatever we need it to. (For the record, that is equivalent to lapply(seq_len(n), function(ign) ...).)
(Side note: I don't have glmnet installed, so I'll mimic that with a simple/absurd lm model.)
set.seed(2021)
models <- replicate(10, {
zany_numbers <- runif(32) # nrow(mtcars)
lm(zany_numbers ~ mpg + disp + cyl, data = mtcars)
}, simplify = FALSE)
models[1:2]
# [[1]]
# Call:
# lm(formula = zany_numbers ~ mpg + disp + cyl, data = mtcars)
# Coefficients:
# (Intercept) mpg disp cyl
# 8.85e-02 2.01e-02 -5.29e-05 2.41e-02
# [[2]]
# Call:
# lm(formula = zany_numbers ~ mpg + disp + cyl, data = mtcars)
# Coefficients:
# (Intercept) mpg disp cyl
# -0.52302 0.02485 -0.00122 0.13071
From here, we can make any sort of frame you want.
coefs <- as.data.frame(do.call(rbind, lapply(models, coef)))
coefs
# (Intercept) mpg disp cyl
# 1 0.0885 0.020114 -5.29e-05 0.0241
# 2 -0.5230 0.024847 -1.22e-03 0.1307
# 3 0.0856 0.014215 1.18e-03 -0.0225
# 4 0.4899 0.012013 1.08e-03 -0.0876
# 5 -0.6926 0.024653 -1.16e-03 0.1499
# 6 0.0862 0.010497 -5.02e-04 0.0389
# 7 0.8358 -0.008419 -6.64e-04 -0.0141
# 8 0.3679 -0.000198 1.44e-03 -0.0391
# 9 0.4360 0.011994 -9.59e-05 -0.0303
# 10 0.2276 0.003659 -1.14e-03 0.0651
(You might need to clean up names there.)
You can replace do.call(rbind, ...) with data.table::rbindlist(...) or dplyr::bind_rows(...) if you prefer.
From this models, and using the same list-of-frames do.call(rbind, ...) follow-up, you can generate companion frames, such as
otherstats <- as.data.frame(do.call(rbind, lapply(models, function(mdl) summary(mdl)[c("r.squared", "adj.r.squared")])))
otherstats
# r.squared adj.r.squared
# 1 0.104 0.00745
# 2 0.144 0.0523
# 3 0.044 -0.0584
# 4 0.202 0.117
# 5 0.149 0.0573
# 6 0.0639 -0.0364
# 7 0.0586 -0.0422
# 8 0.137 0.0446
# 9 0.241 0.16
# 10 0.0814 -0.017
I have a data frame that I split based on a vector of factors. I'm trying to create a model for each data set and then create a set of predicted values from them.
I'm trying to span the predicted values over a large number of values (e.g. length.out = 500), but when I feed the predict function with a new data set with 500 rows, it still spits out a predicted data frame that's the same length as the original data frame fed into the model.
data(mtcars)
rownames(mtcars) <- NULL #I've ran this code with and without this line, both times it gave the same result
mtcars.split <- split(mtcars, mtcars$cyl)
mtcars.split <- lapply(mtcars.split, function(x){
rownames(x) <- NULL
x <- droplevels(x)
return(x)
})
mtcars.lm <- lapply(mtcars.split, function(x){
lm(disp ~ wt, data = x)
})
mtcars.fitted <- mapply(x = mtcars.lm, y = mtcars.split, function(x, y){
newdata = data.frame(wt = seq(min(y$wt), max(y$wt), length.out = 500))
fitted <- as.data.frame(predict(x, new.data = newdata, se = T))
return(fitted)
}, SIMPLIFY = F)
lapply(mtcars.fitted, nrow)
lapply(mtcars.split, nrow)
I tried running the linear model for the entire data set and it did the same thing.
mtcars.lm.all <- lm(disp ~ wt, data = mtcars)
newdata <- data.frame(wt = seq(min(mtcars$wt), max(mtcars$wt), length.out = 500))
nrow(as.data.frame(predict(mtcars.lm.all, new.data = newdata, se = T)))
Even attempting to subset the data set didn't make any difference.
mtcars.head <- head(mtcars, n = 16)
mtcars.head.lm <- lm(disp ~ wt, data = mtcars.head)
predict.mtcars <- as.data.frame(predict(mtcars.head.lm,
new.data = data.frame(wt = seq(min(mtcars.head$wt),
max(mtcars.head$wt),
length.out = 500)),
se = T))
nrow(predict.mtcars)
Am I missing something here? This used to work but it doesn't seem to work now. Even restarting the R session or my computer doesn't seem to make it work.
the argument in the predict function is not new.data but newdata.
Attached the desired result.
data(mtcars)
rownames(mtcars) <- NULL #I've ran this code with and without this line, both times it gave the same result
mtcars.split <- split(mtcars, mtcars$cyl)
mtcars.split <- lapply(mtcars.split, function(x){
rownames(x) <- NULL
x <- droplevels(x)
return(x)
})
mtcars.lm <- lapply(mtcars.split, function(x){
lm(disp ~ wt, data = x)
})
mtcars.fitted <- mapply(x = mtcars.lm, y = mtcars.split, function(x, y){
newdata = data.frame(wt = seq(min(y$wt), max(y$wt), length.out = 500))
fitted <- as.data.frame(predict(x, newdata = newdata, se = T))
return(fitted)
}, SIMPLIFY = F)
lapply(mtcars.fitted, nrow)
#> $`4`
#> [1] 500
#>
#> $`6`
#> [1] 500
#>
#> $`8`
#> [1] 500
lapply(mtcars.split, nrow)
#> $`4`
#> [1] 11
#>
#> $`6`
#> [1] 7
#>
#> $`8`
#> [1] 14
mtcars.lm.all <- lm(disp ~ wt, data = mtcars)
newdata <- data.frame(wt = seq(min(mtcars$wt), max(mtcars$wt), length.out = 500))
nrow(as.data.frame(predict(mtcars.lm.all, newdata = newdata, se = T)))
#> [1] 500
Created on 2020-07-22 by the reprex package (v0.3.0)