I have the data like this:
# Simulated panel: 50 ids x 5 years = 250 rows.
# NOTE(review): y is continuous on (0, 1), not binary, so the probit
# glm() below will emit "non-integer #successes" warnings -- confirm a
# fractional/proportion response is intended for the real data.
df <- tibble::tibble(
id = rep(c(1:50), each = 5),
y = runif(250,min = 0, max = 1),
x1 = rnorm(250, mean = 0, sd=1),
x2 = rnorm(250, mean = 0, sd=1),
x3 = rnorm(250, mean = 0, sd=1),
x4 = rnorm(250, mean = 0, sd=1),
) %>%  # trailing comma above is tolerated by tibble(), not by data.frame()
group_by(id) %>%
mutate(year = rep(c(2001:2005)))  # each id gets years 2001-2005
I would like to estimate the probit model for every year to get (1) coefficient estimates, (2) predicted values of y, and (3) the number of observations used to estimate each model:
# Fit a probit regression of y on x1..x5.
#
# @param df A data frame containing columns y, x1, x2, x3, x4, x5.
# @return A fitted "glm" object (binomial family, probit link).
probit_model <- function(df) {
  glm(
    y ~ x1 + x2 + x3 + x4 + x5,
    family = binomial(link = "probit"),
    data = df
  )
}
Do you know how we can get the coefficient estimates, predicted value for every year and then combine them with the original data (that is df) here? I know what we can do with OLS model (by using map function to estimate for multiple models). But I do not know how to do with probit regression.
Thank you so much.
I think you need to do this; I used this post as a reference.
library(dplyr)
df <- tibble::tibble(
id = rep(c(1:50), each = 5),
y = runif(250,min = 0, max = 1),
x1 = rnorm(250, mean = 0, sd=1),
x2 = rnorm(250, mean = 0, sd=1),
x3 = rnorm(250, mean = 0, sd=1),
x4 = rnorm(250, mean = 0, sd=1),
x5 = rnorm(250, mean = 0, sd=1),
) %>%
group_by(id) %>%
mutate(year = rep(c(2001:2005)))
# Fit one probit model per year. do() returns a rowwise tibble with the
# grouping column `year` plus a list-column `model` holding one glm fit
# per year. (do() is superseded in dplyr >= 1.0, but still works here.)
fitted_models = df %>% group_by(year) %>% do(model = glm(y ~ x1 + x2 + x3 + x4+ x5,
family = binomial(link = "probit"),
data = .))
#fitted_models$year
#fitted_models$model[1]
fitted_models %>% summarise(broom::tidy(model))
## A tibble: 30 x 5
#term estimate std.error statistic p.value
#<chr> <dbl> <dbl> <dbl> <dbl>
# 1 (Intercept) -0.160 0.187 -0.856 0.392
#2 x1 0.0860 0.230 0.375 0.708
#3 x2 0.0657 0.187 0.351 0.725
#4 x3 0.0472 0.160 0.296 0.767
#5 x4 0.216 0.191 1.13 0.257
#6 x5 -0.159 0.263 -0.604 0.546
#7 (Intercept) -0.0792 0.182 -0.434 0.664
#8 x1 0.0314 0.170 0.185 0.853
#9 x2 -0.0320 0.194 -0.164 0.869
#10 x3 0.167 0.218 0.763 0.445
## ... with 20 more rows
fitted_models %>% summarise(broom::glance(model))
## A tibble: 5 x 8
#null.deviance df.null logLik AIC BIC deviance df.residual nobs
#<dbl> <int> <dbl> <dbl> <dbl> <dbl> <int> <int>
# 1 21.7 49 -32.5 77.0 88.5 19.7 44 50
#2 16.4 49 -33.4 78.8 90.3 15.7 44 50
#3 15.5 49 -34.5 81.1 92.5 15.2 44 50
#4 16.6 49 -32.4 76.7 88.2 15.0 44 50
#5 19.6 49 -33.3 78.6 90.0 19.1 44 50
fitted_models %>% summarise(broom::augment(model, type.predict = "response"))
## A tibble: 250 x 12
#y x1 x2 x3 x4 x5 .fitted .resid .std.resid .hat .sigma .cooksd
#<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 0.819 0.0246 0.0176 0.280 0.192 0.840 0.407 0.846 0.875 0.0665 0.664 0.00894
#2 0.0418 1.41 0.297 1.15 -1.41 0.347 0.372 -0.792 -0.853 0.137 0.665 0.0144
#3 0.119 -0.265 -0.158 -1.37 -2.48 -0.504 0.237 -0.300 -0.327 0.156 0.676 0.00284
#4 0.0282 -0.836 -0.442 -1.63 0.506 0.910 0.355 -0.808 -0.858 0.114 0.665 0.0112
#5 0.893 -0.481 -0.384 -0.974 0.897 -0.662 0.510 0.819 0.850 0.0703 0.665 0.00792
#6 0.865 0.417 -0.0233 0.841 -0.268 -0.140 0.451 0.865 0.883 0.0395 0.664 0.00494
#7 0.809 1.30 -0.469 1.01 -0.0913 -0.106 0.486 0.669 0.702 0.0921 0.669 0.00778
#8 0.0220 0.119 -0.580 -0.533 -1.09 0.0142 0.326 -0.780 -0.801 0.0522 0.666 0.00406
#9 0.722 0.194 -1.50 -0.395 1.65 -0.868 0.592 0.271 0.297 0.167 0.676 0.00281
#10 0.131 1.24 0.600 1.14 -1.17 0.370 0.392 -0.579 -0.618 0.122 0.671 0.00756
## ... with 240 more rows
A similar answer to @cdcarrion's, from the same post, but using map() (a slightly newer approach than do()):
fit the models
library(broom)
# One row per year with a nested data tibble; map() fits a probit glm
# to each year's rows and stores it in the `model` list-column.
models <- (df
%>% group_by(year)
%>% nest()
%>% mutate(model = map(data, glm,
formula = y ~ x1 + x2 + x3 + x4+ x5,
family = binomial(link = "probit")))
)
get coefficients
# Per-year coefficient table: broom::tidy() each fitted model, then
# unnest so every (year, term) pair becomes one row.
coefs <- (models
%>% mutate(cc = map(model, tidy))
%>% select(year, cc)
%>% unnest(cols = cc)
)
get predictions
# Per-year fitted values on the response (probability) scale, joined
# back to the predictor columns via broom::augment().
preds <- (models
%>% mutate(aug = map(model, augment, type.predict = "response"))
%>% select(year, aug)
%>% unnest(cols = aug)
%>% select(year:x5, .fitted)
)
Related
I want to tidy up a dataframe and automate the process. Given the following data.frame:
library(survival)
library(rms)
library(broom)
library(tidyverse)
res.cox <- coxph(Surv(time, status) ~ rcs(age, 3) + sex + ph.ecog +
rcs(meal.cal, 4), data = lung)
output <- tidy(res.cox)
output
# term estimate std.error statistic p.value
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 rcs(age, 3)age -0.00306 0.0219 -0.140 0.889
# 2 rcs(age, 3)age' 0.0154 0.0261 0.592 0.554
# 3 sex -0.525 0.192 -2.74 0.00620
# 4 ph.ecog 0.421 0.131 3.22 0.00128
# 5 rcs(meal.cal, 4)meal.cal -0.000416 0.00104 -0.400 0.689
# 6 rcs(meal.cal, 4)meal.cal' 0.00118 0.00232 0.509 0.611
# 7 rcs(meal.cal, 4)meal.cal'' -0.00659 0.0114 -0.577 0.564
I want to remove the rcs-spline information from term variable and be left with:
# term estimate std.error statistic p.value
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 s1 age -0.00306 0.0219 -0.140 0.889
# 2 s2 age 0.0154 0.0261 0.592 0.554
# 3 sex -0.525 0.192 -2.74 0.00620
# 4 ph.ecog 0.421 0.131 3.22 0.00128
# 5 s1 meal.cal -0.000416 0.00104 -0.400 0.689
# 6 s2 meal.cal 0.00118 0.00232 0.509 0.611
# 7 s3 meal.cal -0.00659 0.0114 -0.577 0.564
I want the solution to easily work for other cases too so when you increase the number of knots:
res.cox2 <- coxph(Surv(time, status) ~ rcs(age, 4) + rcs(meal.cal, 6) +
sex + ph.ecog, data = lung)
output2 <- tidy(res.cox2)
output2
# term estimate std.error statistic p.value
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 rcs(age, 4)age 0.0419 0.0403 1.04 0.298
# 2 rcs(age, 4)age' -0.101 0.0806 -1.26 0.208
# 3 rcs(age, 4)age'' 0.569 0.388 1.47 0.142
# 4 rcs(meal.cal, 6)meal.cal -0.000974 0.00155 -0.631 0.528
# 5 rcs(meal.cal, 6)meal.cal' 0.00751 0.0115 0.655 0.512
# 6 rcs(meal.cal, 6)meal.cal'' -0.0217 0.0358 -0.607 0.544
# 7 rcs(meal.cal, 6)meal.cal''' 0.0614 0.123 0.501 0.616
# 8 rcs(meal.cal, 6)meal.cal'''' -0.0775 0.163 -0.475 0.634
# 9 sex -0.552 0.195 -2.83 0.00465
# 10 ph.ecog 0.440 0.132 3.34 0.000835
you would be left with:
# term estimate std.error statistic p.value
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 s1 age 0.0419 0.0403 1.04 0.298
# 2 s2 age -0.101 0.0806 -1.26 0.208
# 3 s3 age 0.569 0.388 1.47 0.142
# 4 s1 meal.cal -0.000974 0.00155 -0.631 0.528
# 5 s2 meal.cal 0.00751 0.0115 0.655 0.512
# 6 s3 meal.cal -0.0217 0.0358 -0.607 0.544
# 7 s4 meal.cal 0.0614 0.123 0.501 0.616
# 8 s5 meal.cal -0.0775 0.163 -0.475 0.634
# 9 sex -0.552 0.195 -2.83 0.00465
# 10 ph.ecog 0.440 0.132 3.34 0.000835
etc...
My attempt so far gets me some of the way, but I am not sure of the best way to deal with the trailing ' and '' characters (note that the first spline term does not contain a '):
output %>%
mutate(rcs_indicator = str_detect(term, fixed("rcs(")),
term = str_replace_all(term, "rcs\\(.+?\\)", ""))
# term estimate std.error statistic p.value rcs_indicator
# <chr> <dbl> <dbl> <dbl> <dbl> <lgl>
# 1 age -0.00306 0.0219 -0.140 0.889 TRUE
# 2 age' 0.0154 0.0261 0.592 0.554 TRUE
# 3 sex -0.525 0.192 -2.74 0.00620 FALSE
# 4 ph.ecog 0.421 0.131 3.22 0.00128 FALSE
# 5 meal.cal -0.000416 0.00104 -0.400 0.689 TRUE
# 6 meal.cal' 0.00118 0.00232 0.509 0.611 TRUE
# 7 meal.cal'' -0.00659 0.0114 -0.577 0.564 TRUE
It might be useful to just work with the terms I need to change directly:
unique(str_subset(output$term, fixed("rcs(")) %>%
str_replace_all("'", ""))
# [1] "rcs(age, 3)age" "rcs(meal.cal, 4)meal.cal"
I feel there is a way to do this in a simpler way than the steps I am doing.
Any suggestions?
Thanks
This one is clunky but it should work:
library(dplyr)
library(stringr)
# Rewrite rcs() spline terms as "s<k> <var>" (e.g. "rcs(age, 3)age'" -> "s2 age").
output %>%
# group terms by spline variable so row_number() restarts per spline;
# non-spline terms (sex, ph.ecog) fall into the NA group untouched
group_by(group =str_extract(term, 'rcs\\(.')) %>%
mutate(row = row_number()) %>%
# replace the "rcs(" prefix with "s<k> ", k = within-group position
mutate(term = str_replace_all(term, 'rcs\\(', paste0("s",row, " "))) %>%
# for spline rows, keep only "s<k> <var> " (drops knot count and quotes)
mutate(term = ifelse(str_detect(term, 's\\d'),
str_extract(term, '.\\d\\s.*\\s'), term)) %>%
mutate(term = str_trim(term)) %>%
mutate(term = str_replace_all(term, '\\,', '')) %>%  # drop leftover comma
ungroup() %>%
select(-c(group, row))
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 s1 age -0.00306 0.0219 -0.140 0.889
2 s2 age 0.0154 0.0261 0.592 0.554
3 sex -0.525 0.192 -2.74 0.00620
4 ph.ecog 0.421 0.131 3.22 0.00128
5 s1 meal.cal -0.000416 0.00104 -0.400 0.689
6 s2 meal.cal 0.00118 0.00232 0.509 0.611
7 s3 meal.cal -0.00659 0.0114 -0.577 0.564
This is also less elegant than desired, but should work for multiple knots
# Rewrite rcs() spline terms as "s<k> <var>"; non-spline terms pass through.
# Fix: the knot count must match \\d+ (one or more digits) -- the original
# \\d matched a single digit only, so e.g. rcs(var, 10) was silently skipped.
output %>%
  mutate(
    # TRUE for terms like "rcs(age, 3)age''"
    is_spline = grepl("^rcs\\(.*?, \\d+\\)", term),
    # spline order: number of trailing quotes + 1 ("age" -> 1, "age'" -> 2, ...)
    n_term = str_count(term, "'") + 1,
    pre = ifelse(is_spline, paste0('s', n_term, ' '), ""),
    # strip the "rcs(<var>, <k>)" prefix and any trailing quotes
    term = paste0(pre, gsub("(^rcs\\(.*?, \\d+\\))|(\\'+$)", "", term))
  ) %>%
  select(-is_spline, -n_term, -pre)
#> # A tibble: 7 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 s1 age -0.00306 0.0219 -0.140 0.889
#> 2 s2 age 0.0154 0.0261 0.592 0.554
#> 3 sex -0.525 0.192 -2.74 0.00620
#> 4 ph.ecog 0.421 0.131 3.22 0.00128
#> 5 s1 meal.cal -0.000416 0.00104 -0.400 0.689
#> 6 s2 meal.cal 0.00118 0.00232 0.509 0.611
#> 7 s3 meal.cal -0.00659 0.0114 -0.577 0.564
I have a dataset with multiple columns that follow a name pattern, and I need to calculate new columns, each derived from two other columns. I am looking for a tidyverse option, but I want to avoid pivot_longer as the dataset has >1 million rows.
Example dataset
library(dplyr)
df <- tibble(
jan_mean = runif(10),
feb_mean = runif(10),
mar_mean = runif(10),
jan_sd = runif(10),
feb_sd = runif(10),
mar_sd = runif(10),
)
I can do it manually like this:
df2 <- df %>%
mutate(jan_cv= jan_mean/jan_sd,
feb_cv= feb_mean/feb_sd,
mar_cv= mar_mean/mar_sd
)
This is a simple example, but I have similar operations for monthly values.
EDIT 1
I need to do this for large datasets and I was worried that pivot_longer would be quite consuming, so I did a quick comparison of the three methods.
Method 1 is the manual way, Method 2 is the short version suggested by #Tarjae, and Method 3 is using pivot longer:
tic("Method 1: manual option")
df2 <- df %>%
mutate(jan_cv= jan_mean/jan_sd,
feb_cv= feb_mean/feb_sd,
mar_cv= mar_mean/mar_sd
)
toc()
tic("Method 2: Short option")
df2 <- df %>%
mutate(across(ends_with('_mean'), ~ . /
get(str_replace(cur_column(), "mean$", "sd")), .names = "{.col}_cv")) %>%
rename_at(vars(ends_with('cv')), ~ str_remove(., "\\_mean"))
toc()
tic("Method 3: pivot wider option")
df2 <- df %>%
mutate(id = row_number()) %>%
pivot_longer(-id, names_to = c("month", ".value"), names_sep = "_") %>%
mutate(cv = mean / sd) %>%
pivot_wider(names_from = "month", values_from = c(mean, sd, cv), names_glue = "{month}_{.value}") %>%
select(-id)
toc()
The results are:
Method 1: manual option: 0.05 sec elapsed
Method 2: Short option: 0.01 sec elapsed
Method 3: pivot wider option: 0.19 sec elapsed
So method 2 is even faster than manually doing each column
We could use across in this situation with some string manipulation with stringr:
library(dplyr)
library(stringr)
# For every *_mean column, divide by the matching *_sd column (looked up
# with get() + cur_column()), creating "<col>_cv" via .names; then strip
# "_mean" from the new names so e.g. "jan_mean_cv" becomes "jan_cv".
df %>%
mutate(across(ends_with('_mean'), ~ . /
get(str_replace(cur_column(), "mean$", "sd")), .names = "{.col}_cv")) %>%
rename_at(vars(ends_with('cv')), ~ str_remove(., "\\_mean"))
jan_mean feb_mean mar_mean jan_sd feb_sd mar_sd jan_cv feb_cv mar_cv
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.838 0.401 0.131 0.329 0.0292 0.911 2.55 13.7 0.144
2 0.595 0.173 0.0935 0.313 0.105 0.247 1.90 1.64 0.378
3 0.0546 0.934 0.983 0.536 0.618 0.292 0.102 1.51 3.36
4 0.543 0.802 0.569 0.585 0.901 0.742 0.928 0.891 0.766
5 0.899 0.761 0.245 0.932 0.506 0.526 0.965 1.50 0.466
6 0.832 0.875 0.947 0.390 0.613 0.607 2.13 1.43 1.56
7 0.268 0.421 0.930 0.869 0.873 0.612 0.308 0.483 1.52
8 0.475 0.217 0.330 0.0473 0.826 0.903 10.0 0.262 0.366
9 0.379 0.425 0.479 0.931 0.381 0.223 0.407 1.12 2.15
10 0.616 0.922 0.707 0.976 0.241 0.619 0.631 3.82 1.14
One option to achieve your desired result would be to convert your data to long format which makes it easy to do the computations per month and if desired convert back to wide format afterwards. To this end I first added an identifier column to your data:
library(dplyr)
library(tidyr)
set.seed(42)
df %>%
mutate(id = row_number()) %>%
pivot_longer(-id, names_to = c("month", ".value"), names_sep = "_") %>%
mutate(cv = mean / sd) %>%
pivot_wider(names_from = "month", values_from = c(mean, sd, cv), names_glue = "{month}_{.value}") %>%
select(-id)
#> # A tibble: 10 × 9
#> jan_mean feb_mean mar_mean jan_sd feb_sd mar_sd jan_cv feb_cv mar_cv
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0.915 0.458 0.904 0.738 0.380 0.333 1.24 1.21 2.71
#> 2 0.937 0.719 0.139 0.811 0.436 0.347 1.16 1.65 0.400
#> 3 0.286 0.935 0.989 0.388 0.0374 0.398 0.737 25.0 2.48
#> 4 0.830 0.255 0.947 0.685 0.974 0.785 1.21 0.262 1.21
#> 5 0.642 0.462 0.0824 0.00395 0.432 0.0389 163. 1.07 2.12
#> 6 0.519 0.940 0.514 0.833 0.958 0.749 0.623 0.982 0.687
#> 7 0.737 0.978 0.390 0.00733 0.888 0.677 100. 1.10 0.576
#> 8 0.135 0.117 0.906 0.208 0.640 0.171 0.648 0.184 5.29
#> 9 0.657 0.475 0.447 0.907 0.971 0.261 0.725 0.489 1.71
#> 10 0.705 0.560 0.836 0.612 0.619 0.514 1.15 0.905 1.63
I am extracting the regression results for two different groups as shown in this example below. In the temp data.frame I get the estimate, std.error, statistic and p-value. However, I don't get the confidence intervals. Is there a simple way to extract them as well?
df <- tibble(
a = rnorm(1000),
b = rnorm(1000),
c = rnorm(1000),
d = rnorm(1000),
group = rbinom(n=1000, size=1, prob=0.5)
)
df$group = as.factor(df$group)
temp <- df %>%
group_by(group) %>%
do(model1 = tidy(lm(a ~ b + c + d, data = .))) %>%
gather(model_name, model, -group) %>%
unnest()
You are doing tidy on a lm object. If you check the help page, there is an option to include the confidence interval, conf.int=TRUE:
# Per-group OLS with confidence intervals: conf.int = TRUE makes
# broom::tidy() add conf.low/conf.high columns. gather()/unnest()
# flatten the resulting list-column into one row per (group, term).
temp <- df %>%
group_by(group) %>%
do(model1 = tidy(lm(a ~ b + c + d, data = . ), conf.int=TRUE)) %>%
gather(model_name, model, -group) %>%
unnest()
# A tibble: 8 x 9
group model_name term estimate std.error statistic p.value conf.low conf.high
<fct> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0 model1 (Int… 0.0616 0.0423 1.46 0.146 -0.0215 0.145
2 0 model1 b 0.00178 0.0421 0.0424 0.966 -0.0808 0.0844
3 0 model1 c -0.00339 0.0431 -0.0787 0.937 -0.0881 0.0813
4 0 model1 d -0.0537 0.0445 -1.21 0.228 -0.141 0.0337
5 1 model1 (Int… -0.0185 0.0454 -0.408 0.683 -0.108 0.0707
6 1 model1 b 0.00128 0.0435 0.0295 0.976 -0.0842 0.0868
7 1 model1 c -0.0972 0.0430 -2.26 0.0244 -0.182 -0.0126
8 1 model1 d 0.0734 0.0457 1.60 0.109 -0.0165 0.163
If your version of dplyr is higher than 1.0.0, you can use:
# dplyr >= 1.0: summarise() may return a data frame, so the tidy()
# output expands directly into rows -- no do()/gather()/unnest() needed.
df %>%
group_by(group) %>%
summarise(tidy(lm(a ~ b + c + d), conf.int = TRUE), .groups = "drop")
#> # A tibble: 8 x 8
#> group term estimate std.error statistic p.value conf.low conf.high
#> <fct> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0 (Intercept) 0.0734 0.0468 1.57 0.117 -0.0185 0.165
#> 2 0 b -0.101 0.0461 -2.19 0.0292 -0.191 -0.0102
#> 3 0 c 0.0337 0.0464 0.726 0.468 -0.0575 0.125
#> 4 0 d -0.101 0.0454 -2.23 0.0265 -0.190 -0.0118
#> 5 1 (Intercept) -0.0559 0.0468 -1.20 0.232 -0.148 0.0360
#> 6 1 b -0.0701 0.0474 -1.48 0.140 -0.163 0.0230
#> 7 1 c 0.0319 0.0477 0.668 0.504 -0.0619 0.126
#> 8 1 d -0.0728 0.0466 -1.56 0.119 -0.164 0.0188
I need to sum columns in a dataframe where the columns that need to be summed are defined in a separate data frame. Reproducible example below.
dataset <- tibble(L1 = runif(100, 0, 1),
L2 = runif(100, 0, 1),
L3 = runif(100, 0, 1),
L4 = runif(100, 0, 1))
cols_to_sum <- tibble(col1 = c("L1","L2"),
col2 = c("L3","L4"))
In the example above I need to create two additional columns in dataset, one called "L1L3" which is the sum of L1 and L3 and similar for L2 and L4. The desired output should look like the dataframe below. The cols_to_sum dataframe could have any number of rows and the dataset could have any number of columns.
dataset <- tibble(L1 = runif(100, 0, 1),
L2 = runif(100, 0, 1),
L3 = runif(100, 0, 1),
L4 = runif(100, 0, 1)) %>%
mutate(L1L3 = L1 + L3,
L2L4 = L2 + L4)
One option involving dplyr and purrr could be:
# For each row of cols_to_sum (asplit(..., 1) yields one character vector
# per row), compute rowSums() over the named columns, name the result
# "<v1>_<v2>_sum", keep only the new column, then bind all new columns
# back onto the original dataset.
map_dfc(.x = asplit(cols_to_sum, 1), ~ dataset %>%
mutate(!!paste(paste(.x, collapse = "_"), "sum", sep = "_") := rowSums(select(., .x))) %>%
select(ends_with("sum"))) %>%
bind_cols(dataset)
L1_L3_sum L2_L4_sum L1 L2 L3 L4
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1.42 1.79 0.621 0.878 0.802 0.908
2 0.944 1.39 0.135 0.527 0.809 0.864
3 1.16 0.859 0.607 0.361 0.555 0.498
4 1.71 1.10 0.982 0.853 0.729 0.252
5 0.856 0.950 0.287 0.0234 0.568 0.927
6 0.235 1.16 0.00368 0.363 0.232 0.801
7 1.27 1.24 0.516 0.601 0.755 0.637
8 1.37 1.38 0.486 0.914 0.882 0.465
9 0.368 1.12 0.168 0.642 0.200 0.482
10 0.341 1.33 0.317 0.477 0.0240 0.857
More sequentally you can create a function to pass the character evaluation you want to evaluate, as in here. The code would be as follows:
library(tidyverse)
library(rlang)
library(dplyr)
library(tidyr)
# You create the function
# Add a column to `df` whose name is given by the string `new_var` and
# whose value is the result of evaluating the string `expression`
# (e.g. "L1+L3") within the data frame.
#
# @param df A data frame.
# @param new_var Character scalar: name of the column to create.
# @param expression Character scalar: R expression to evaluate per row.
# @return `df` with the new column appended.
example_fun <- function(df, new_var, expression) {
  parsed <- parse_expr(expression)
  mutate(df, !!new_var := !!parsed)
}
# usage (once df, new_var and expression exist): example_fun(df, new_var, expression)
dataset <- tibble(L1 = runif(100, 0, 1),
L2 = runif(100, 0, 1),
L3 = runif(100, 0, 1),
L4 = runif(100, 0, 1))
#Transform it to dataframe
cols_to_sum <- tibble(col1 = c("L1","L2"),
col2 = c("L3","L4"))%>% as.data.frame()
# apply by column the rule of summing
# For each column of cols_to_sum, build the sum expression (e.g. "L1+L3")
# and the new column name (e.g. "L1L3"), then append it via example_fun().
# seq_len() is safe when cols_to_sum has zero columns (1:ncol would
# wrongly iterate over c(1, 0)); the dead `sep=""` (ignored when
# `collapse` is given) is dropped.
for (i in seq_len(ncol(cols_to_sum))) {
  vars <- as.character(cols_to_sum[, i])
  expressionsum <- paste(vars, collapse = "+")
  Newvar <- paste(vars, collapse = "")
  dataset <- example_fun(dataset, Newvar, expressionsum)
}
dataset
# # A tibble: 100 x 6
# L1 L2 L3 L4 L1L2 L3L4
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 0.550 0.209 0.331 0.000826 0.759 0.332
# 2 0.503 0.587 0.918 0.0305 1.09 0.948
# 3 0.0269 0.223 0.310 0.539 0.250 0.850
# 4 0.622 0.0543 0.887 0.322 0.676 1.21
# 5 0.748 0.784 0.830 0.0694 1.53 0.899
# 6 0.374 0.416 0.688 0.520 0.791 1.21
# 7 0.524 0.603 0.884 0.0563 1.13 0.941
# 8 0.774 0.640 0.117 0.0622 1.41 0.180
# 9 0.954 0.868 0.809 0.429 1.82 1.24
# 10 0.606 0.833 0.310 0.894 1.44 1.20
# # … with 90 more rows
Here is one base R solution which combines the columns you want to sum for the column names, and uses subsetting and rowSums() within lapply() to add up your columns:
dataset[sapply(cols_to_sum, paste0, collapse = "")] <- lapply(cols_to_sum, function(x) rowSums(dataset[x]))
dataset
# A tibble: 100 x 6
L1 L2 L3 L4 L1L2 L3L4
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.915 0.626 0.885 0.484 1.54 1.37
2 0.937 0.217 0.517 0.445 1.15 0.962
3 0.286 0.217 0.852 0.0604 0.503 0.912
4 0.830 0.389 0.443 0.328 1.22 0.770
5 0.642 0.942 0.158 0.878 1.58 1.04
6 0.519 0.963 0.442 0.931 1.48 1.37
7 0.737 0.740 0.968 0.392 1.48 1.36
8 0.135 0.733 0.485 0.159 0.868 0.643
9 0.657 0.536 0.252 0.320 1.19 0.572
10 0.705 0.00227 0.260 0.307 0.707 0.567
Data:
set.seed(42)
dataset <- tibble(L1 = runif(100, 0, 1),
L2 = runif(100, 0, 1),
L3 = runif(100, 0, 1),
L4 = runif(100, 0, 1))
cols_to_sum <- tibble(col1 = c("L1","L2"),
col2 = c("L3","L4"))
I have 3 exposure variables x1-x3, 10 outcome variables y1-y10 and 3 covariates cv1-cv3.
I would like to regress each outcome on each exposure adjusted for all covariates. Then I would like the model estimates, i.e. beta, SE, and p-value, placed in a dataframe. Is there a way to automate this in R? Thank you!
The models i want to run look like this:
y1 ~ x1+cv1+cv2+cv3 ... y10 ~ x1+cv1+cv2+cv3
y1 ~ x2+cv1+cv2+cv3 ... y10 ~ x2+cv1+cv2+cv3
y1 ~ x3+cv1+cv2+cv3 ... y10 ~ x3+cv1+cv2+cv3
Without data and a reproducible example, it is hard to help you, but here's an example with simulated data. First, create a fake dataset, called data:
library(tidyverse)
# Simulate one outcome's worth of data: 100 rows with the outcome label
# in `y_var`, a standard-normal response `y_i`, three exposures x1-x3,
# and three uniform covariates cv1-cv3.
#
# @param y_i Character label for the outcome (stored in column y_var).
# @return A 100-row tibble.
make_df <- function(y_i) {
  # data_frame() was deprecated in tibble 1.4 and later removed;
  # tibble() is the drop-in replacement.
  tibble(y_var = y_i, y_i = rnorm(100),
         x1 = rnorm(100), x2 = rnorm(100), x3 = rnorm(100),
         cv1 = runif(100), cv2 = runif(100), cv3 = runif(100))
}
ys <- paste0("Y_", sprintf("%02d", 1:10))
ys
#> [1] "Y_01" "Y_02" "Y_03" "Y_04" "Y_05" "Y_06" "Y_07" "Y_08" "Y_09" "Y_10"
data <-
ys %>%
map_dfr(make_df)
data
#> # A tibble: 1,000 x 8
#> y_var y_i x1 x2 x3 cv1 cv2 cv3
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Y_01 0.504 0.892 -0.806 -1.56 0.145 0.436 0.701
#> 2 Y_01 0.967 1.24 -1.19 0.920 0.866 0.00100 0.567
#> 3 Y_01 -0.824 -0.729 -0.0855 -1.06 0.0665 0.780 0.471
#> 4 Y_01 0.294 2.37 -0.514 -0.955 0.397 0.0462 0.209
#> 5 Y_01 -0.893 0.0298 0.0369 0.0787 0.640 0.709 0.0485
#> 6 Y_01 0.670 -0.347 1.56 2.11 0.843 0.542 0.793
#> 7 Y_01 -1.59 1.04 0.228 0.573 0.185 0.151 0.558
#> 8 Y_01 -2.04 0.289 -0.435 -0.113 0.833 0.0898 0.653
#> 9 Y_01 -0.637 0.818 -0.454 0.606 0.294 0.378 0.315
#> 10 Y_01 -1.61 -0.628 -2.75 1.06 0.353 0.0863 0.332
#> # ... with 990 more rows
At this point, you have options. One way is to use the group_by %>% do(tidy(*)) recipe:
# Stack the three exposures into long form (x_var/x_value), then fit one
# lm per (outcome, exposure) pair and tidy the coefficients.
# NOTE(review): gather() and do() are superseded (pivot_longer() and
# nest()/map() in current tidyverse) but still work.
data %>%
gather(x_var, x_value, -c(y_var, y_i, cv1:cv3)) %>%
group_by(y_var, x_var) %>%
do(broom::tidy(lm(y_i ~ x_value + cv1 + cv2 + cv3, data = .)))
#> # A tibble: 150 x 7
#> # Groups: y_var, x_var [30]
#> y_var x_var term estimate std.error statistic p.value
#> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 Y_01 x1 (Intercept) -0.111 0.344 -0.324 0.747
#> 2 Y_01 x1 x_value -0.0440 0.111 -0.396 0.693
#> 3 Y_01 x1 cv1 0.286 0.372 0.769 0.444
#> 4 Y_01 x1 cv2 0.0605 0.379 0.160 0.873
#> 5 Y_01 x1 cv3 -0.0690 0.378 -0.182 0.856
#> 6 Y_01 x2 (Intercept) -0.146 0.336 -0.434 0.665
#> 7 Y_01 x2 x_value 0.117 0.105 1.12 0.265
#> 8 Y_01 x2 cv1 0.287 0.362 0.793 0.430
#> 9 Y_01 x2 cv2 0.0564 0.376 0.150 0.881
#> 10 Y_01 x2 cv3 0.0125 0.379 0.0330 0.974
#> # ... with 140 more rows
Another approach is to use a split variable and then a map function from purrr:
# Same models via split() + purrr::map(): one lm per y_var/x_var combo,
# returned as a named list of lm objects rather than a tidy tibble.
data %>%
gather(x_var, x_value, -c(y_var, y_i, cv1:cv3)) %>%
mutate(y_var_x_var = paste0(y_var, x_var)) %>%
split(.$y_var_x_var) %>%
map(~ lm(y_i ~ x_value + cv1 + cv2 + cv3, data = .))
#> $Y_01x1
#>
#> Call:
#> lm(formula = y_i ~ x_value + cv1 + cv2 + cv3, data = .)
#>
#> Coefficients:
#> (Intercept) x_value cv1 cv2 cv3
#> -0.11144 -0.04396 0.28585 0.06051 -0.06896
#>
#>
#> $Y_01x2
#>
#> Call:
#> lm(formula = y_i ~ x_value + cv1 + cv2 + cv3, data = .)
#>
#> Coefficients:
#> (Intercept) x_value cv1 cv2 cv3
#> -0.14562 0.11732 0.28726 0.05642 0.01249
#>
#>
# ...and so on...
#>
#>
#> $Y_10x2
#>
#> Call:
#> lm(formula = y_i ~ x_value + cv1 + cv2 + cv3, data = .)
#>
#> Coefficients:
#> (Intercept) x_value cv1 cv2 cv3
#> -0.45689 -0.02530 0.61375 0.34377 -0.02357
#>
#>
#> $Y_10x3
#>
#> Call:
#> lm(formula = y_i ~ x_value + cv1 + cv2 + cv3, data = .)
#>
#> Coefficients:
#> (Intercept) x_value cv1 cv2 cv3
#> -0.44423 -0.18377 0.64739 0.27688 -0.02013