Applying function to multiple dataframe columns in R - r

I have a function that I want to apply to a range of columns in my dataframe.
My dataframe has 111 columns and I want to de-mean each column within its ID group (i.e. subtract the per-ID mean from each value).
I have the following code for the column named esgscore
demeaned_df <- df %>%
group_by(ID) %>%
mutate(avg_esgscore = mean(esgscore)) %>%
mutate(demeaned_esgscore = esgscore - avg_esgscore)
This does exactly what I want, so I am happy with it, but I essentially want to apply this function to columns 3:111 in my dataframe (df) and put the results into a new dataframe called demeaned_df.
Any help would be greatly appreciated

Here is one idea. We can use across to apply the function to multiple columns. In this example, I applied the demean operation to columns from disp to carb.
library(dplyr)
mtcars2 <- mtcars %>%
group_by(cyl) %>%
mutate(across(disp:carb, .fns = function(x) x - mean(x)))
mtcars2
# # A tibble: 32 x 11
# # Groups: cyl [3]
# mpg cyl disp hp drat wt qsec vs am gear carb
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 21 6 -23.3 -12.3 0.314 -0.497 -1.52 -0.571 0.571 0.143 0.571
# 2 21 6 -23.3 -12.3 0.314 -0.242 -0.957 -0.571 0.571 0.143 0.571
# 3 22.8 4 2.86 10.4 -0.221 0.0343 -0.527 0.0909 0.273 -0.0909 -0.545
# 4 21.4 6 74.7 -12.3 -0.506 0.0979 1.46 0.429 -0.429 -0.857 -2.43
# 5 18.7 8 6.90 -34.2 -0.0793 -0.559 0.248 0 -0.143 -0.286 -1.5
# 6 18.1 6 41.7 -17.3 -0.826 0.343 2.24 0.429 -0.429 -0.857 -2.43
# 7 14.3 8 6.90 35.8 -0.0193 -0.429 -0.932 0 -0.143 -0.286 0.5
# 8 24.4 4 41.6 -20.6 -0.381 0.904 0.863 0.0909 -0.727 -0.0909 0.455
# 9 22.8 4 35.7 12.4 -0.151 0.864 3.76 0.0909 -0.727 -0.0909 0.455
# 10 19.2 6 -15.7 0.714 0.334 0.323 0.323 0.429 -0.429 0.143 0.571
# # ... with 22 more rows

Related

Apply function to two columns at a time with purrr

I have a simplified tibble where I select two columns (manually) and pass them to a custom function, but in this case just using sum. Any ideas on how I could expand this to accommodate any number of ko columns? In this case there are only 2, but let's say there were 5.
library(dplyr)
library(purrr)
df <- tibble(l2fc_ko1 = rnorm(1:10), l2fc_ko2 = rnorm(1:10), ctrl_ko1 = rnorm(1:10), ctrl_ko2 = rnorm(1:10))
df %>% mutate(ko1_sum = map2_dbl(ctrl_ko1, l2fc_ko1, sum),
ko2_sum = map2_dbl(ctrl_ko2, l2fc_ko2, sum))
We can use pivot_longer to reshape the data into long format, then pivot_wider so that each group (l2fc, ctrl) gets its own column with one row per ko level. Compute the sum, then pivot_wider again to get back to your original wide format:
library(tidyverse)
df %>%
mutate(idx = row_number()) %>%
pivot_longer(-idx, names_sep = '_', names_to = c('group', 'ko')) %>%
pivot_wider(names_from = group, values_from = value) %>%
mutate(sum = l2fc + ctrl) %>%
pivot_wider(names_from = ko, values_from = c(l2fc, ctrl, sum))
idx l2fc_ko1 l2fc_ko2 ctrl_ko1 ctrl_ko2 sum_ko1 sum_ko2
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 -1.04 -0.710 -0.288 -1.65 -1.33 -2.36
2 2 0.0338 0.400 -0.850 0.319 -0.816 0.719
3 3 2.08 0.723 0.325 0.314 2.40 1.04
4 4 0.740 -0.411 -0.307 1.77 0.433 1.36
5 5 0.347 -1.57 -0.153 0.657 0.195 -0.915
6 6 -0.998 -0.145 0.265 -1.95 -0.733 -2.09
7 7 2.05 -0.0876 -0.909 -0.190 1.14 -0.278
8 8 0.0735 -0.134 -2.04 -0.832 -1.96 -0.966
9 9 1.52 2.37 1.53 -0.596 3.05 1.78
10 10 1.42 -0.753 -1.61 1.84 -0.194 1.09
If you have a dynamic number of paired ctrl_/l2fc_ columns, then try this:
Ensure we have all ctrl_ that have a corresponding l2fc_ (and vice versa):
ctrls <- grep("^ctrl_ko", names(df), value = TRUE)
l2fcs <- gsub("^ctrl", "l2fc", ctrls)
ctrls <- ctrls[ l2fcs %in% names(df) ]
l2fcs <- l2fcs[ l2fcs %in% names(df) ] # or intersect(l2fcs, names(df))
Combine these into one vector (we'll split on it later) and convert this to the new _sum names we'll need.
nms <- c(l2fcs, ctrls)
nms
# [1] "l2fc_ko1" "l2fc_ko2" "ctrl_ko1" "ctrl_ko2"
newnms <- gsub("ctrl_(.*)", "\\1_sum", ctrls)
newnms
# [1] "ko1_sum" "ko2_sum"
Using split.default (which will split the df into groups of columns) and rowSums, we can devise two _sum columns:
setNames(as.data.frame(lapply(split.default(df[nms], gsub(".*_ko", "", nms)), rowSums)), newnms)
# ko1_sum ko2_sum
# 1 1.0643199 1.7603198
# 2 -2.3460066 2.9914827
# 3 0.1912111 -0.3537572
# 4 1.8475373 -0.8877151
# 5 2.2994618 0.3716338
# 6 -0.5365936 -1.0810583
# 7 1.2542526 -1.0687119
# 8 -1.8578221 -3.5073630
# 9 2.4785211 -4.8546746
# 10 -0.7027090 1.3562360
We can cbind/bind_cols those in, or we can mutate them just as well. For the latter, we'll replace df with cur_data() within the mutate environment (and we'll need to add as.data.frame).
Choose one of the following, all producing effectively the same results:
cbind(df, setNames(lapply(split.default(df[nms], gsub(".*_ko", "", nms)), rowSums), newnms))
bind_cols(df, setNames(lapply(split.default(df[nms], gsub(".*_ko", "", nms)), rowSums), newnms))
df %>%
mutate(
setNames(
as.data.frame(
lapply(split.default(cur_data()[nms], gsub(".*_ko", "", nms)), rowSums)),
newnms)
)
# # A tibble: 10 x 6
# l2fc_ko1 l2fc_ko2 ctrl_ko1 ctrl_ko2 ko1_sum ko2_sum
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1.37 1.30 -0.307 0.455 1.06 1.76
# 2 -0.565 2.29 -1.78 0.705 -2.35 2.99
# 3 0.363 -1.39 -0.172 1.04 0.191 -0.354
# 4 0.633 -0.279 1.21 -0.609 1.85 -0.888
# 5 0.404 -0.133 1.90 0.505 2.30 0.372
# 6 -0.106 0.636 -0.430 -1.72 -0.537 -1.08
# 7 1.51 -0.284 -0.257 -0.784 1.25 -1.07
# 8 -0.0947 -2.66 -1.76 -0.851 -1.86 -3.51
# 9 2.02 -2.44 0.460 -2.41 2.48 -4.85
# 10 -0.0627 1.32 -0.640 0.0361 -0.703 1.36
How about rowwise? You can specify the columns you want with c or c_across.
df %>%
rowwise() %>%
mutate(total = sum(c_across(ends_with("ko1"))))
# A tibble: 10 x 5
# Rowwise:
l2fc_ko1 l2fc_ko2 ctrl_ko1 ctrl_ko2 total
<dbl> <dbl> <dbl> <dbl> <dbl>
1 -0.179 0.496 -1.10 -0.375 -1.27
2 -0.0887 -0.873 0.613 -0.348 0.525
3 -2.33 -0.322 -0.515 3.03 -2.84
4 -0.602 -0.0387 0.704 -0.118 0.102
5 -0.389 -0.00801 0.276 0.500 -0.113
6 -2.18 0.648 -0.485 -0.243 -2.66
7 0.0529 0.237 -0.371 -0.0382 -0.318
8 0.818 -0.181 1.11 -1.25 1.93
9 -0.271 -0.883 0.480 -0.296 0.209
10 -0.208 -1.11 1.09 -0.528 0.882

How do I subtract each element from the column average and divide it by the column standard deviation

I am quite new to R so I needed some help working out this problem. I have a data frame for daily rainfall values for different regions (AEZ).
The output needs to be another table that takes the (individual rainfall - column average)/column standard deviation.
For example, in the table below, for `01-Jan` and AEZ 3 it should compute (0.0402 - Average(`01-Jan`)) / SD(`01-Jan`). This calculation needs to be run for each AEZ, and the output will then be another table with the results of these calculations.
AEZ `01-Jan` `02-Jan` `03-Jan` `04-Jan` `05-Jan` `06-Jan` `07-Jan`
1 3 0.0402 0.0044 0.0998 0.142 0.0061 0.0267 0.0351
2 12 0.0143 0.0027 0.0027 0.0029 0.0317 0.0012 0.0012
3 48 0 0 0.0026 0.0015 0.0019 0 0
4 77 0 0 0.0059 0.0124 0.0048 0.0009 0
5 160 0.0261 0.0173 0.057 0.0221 0.0892 0 0.0003
6 162 0.167 0.0037 0.0041 0.0683 0.102 0.199 0.0308
7 178 0.0062 0.0033 0.0808 0.101 0.0033 0.0023 0.0315
This will standardise (center and scale) the original dataframe.
df[,-1] <- scale(df[,-1], center = TRUE, scale = TRUE)
To scale a copy do:
foo <- df
foo[,-1] <- scale(foo[,-1], center = TRUE, scale = TRUE)
We could use dplyr:
library(dplyr)
data %>%
mutate(across(-AEZ, ~ (.x - mean(.x)) / sd(.x)))
which returns
# A tibble: 7 x 8
AEZ `\`01-Jan\`` `\`02-Jan\`` `\`03-Jan\`` `\`04-Jan\`` `\`05-Jan\`` `\`06-Jan\`` `\`07-Jan\``
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 3 0.0663 -0.0145 1.51 1.67 -0.647 -0.0835 1.22
2 12 -0.369 -0.302 -0.793 -0.857 -0.0563 -0.429 -0.751
3 48 -0.610 -0.759 -0.795 -0.882 -0.744 -0.445 -0.821
4 77 -0.610 -0.759 -0.717 -0.684 -0.677 -0.433 -0.821
5 160 -0.171 2.17 0.495 -0.508 1.27 -0.445 -0.804
6 162 2.20 -0.133 -0.760 0.332 1.56 2.25 0.969
7 178 -0.505 -0.201 1.06 0.926 -0.711 -0.414 1.01

Performing a linear model in R of a single response with a single predictor from a large dataframe and repeat for each column

It might not be very clear from the title but what I wish to do is:
I have a dataframe df with, say, 200 columns: the first 80 columns are response variables (y1, y2, y3, ...) and the remaining 120 are predictors (x1, x2, x3, ...).
I wish to compute a linear model for each pair – lm(yi ~ xi, data = df).
Many problems and solutions I have looked through online have either a fixed response vs. many predictors or the other way around, using lapply() and its related functions.
Could anyone who is familiar with it point me to the right step?
Use the tidyverse. The example below fits one model for every combination of response and predictor:
library(tidyverse)
library(broom)
df <- mtcars
y <- names(df)[1:3]
x <- names(df)[4:7]
result <- expand_grid(x, y) %>%
rowwise() %>%
mutate(frm = list(reformulate(x, y)),
model = list(lm(frm, data = df)))
result$model <- purrr::set_names(result$model, nm = paste0(result$y, " ~ ", result$x))
result$model[1:2]
#> $`mpg ~ hp`
#>
#> Call:
#> lm(formula = frm, data = df)
#>
#> Coefficients:
#> (Intercept) hp
#> 30.09886 -0.06823
#>
#>
#> $`cyl ~ hp`
#>
#> Call:
#> lm(formula = frm, data = df)
#>
#> Coefficients:
#> (Intercept) hp
#> 3.00680 0.02168
map_df(result$model, tidy)
#> # A tibble: 24 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 30.1 1.63 18.4 6.64e-18
#> 2 hp -0.0682 0.0101 -6.74 1.79e- 7
#> 3 (Intercept) 3.01 0.425 7.07 7.41e- 8
#> 4 hp 0.0217 0.00264 8.23 3.48e- 9
#> 5 (Intercept) 21.0 32.6 0.644 5.25e- 1
#> 6 hp 1.43 0.202 7.08 7.14e- 8
#> 7 (Intercept) -7.52 5.48 -1.37 1.80e- 1
#> 8 drat 7.68 1.51 5.10 1.78e- 5
#> 9 (Intercept) 14.6 1.58 9.22 2.93e-10
#> 10 drat -2.34 0.436 -5.37 8.24e- 6
#> # ... with 14 more rows
map_df(result$model, glance)
#> # A tibble: 12 x 12
#> r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0.602 0.589 3.86 45.5 1.79e- 7 1 -87.6 181. 186.
#> 2 0.693 0.683 1.01 67.7 3.48e- 9 1 -44.6 95.1 99.5
#> 3 0.626 0.613 77.1 50.1 7.14e- 8 1 -183. 373. 377.
#> 4 0.464 0.446 4.49 26.0 1.78e- 5 1 -92.4 191. 195.
#> 5 0.490 0.473 1.30 28.8 8.24e- 6 1 -52.7 111. 116.
#> 6 0.504 0.488 88.7 30.5 5.28e- 6 1 -188. 382. 386.
#> 7 0.753 0.745 3.05 91.4 1.29e-10 1 -80.0 166. 170.
#> 8 0.612 0.599 1.13 47.4 1.22e- 7 1 -48.3 103. 107.
#> 9 0.789 0.781 57.9 112. 1.22e-11 1 -174. 355. 359.
#> 10 0.175 0.148 5.56 6.38 1.71e- 2 1 -99.3 205. 209.
#> 11 0.350 0.328 1.46 16.1 3.66e- 4 1 -56.6 119. 124.
#> 12 0.188 0.161 114. 6.95 1.31e- 2 1 -196. 398. 402.
#> # ... with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
Created on 2020-12-11 by the reprex package (v0.3.0)

Error in using tidyverse function pivot_wider

Dear all,
I have a very large file (14,566,680 records) with 2 variables (ID and A).
The first variable (ID) is the individual (n=258) and each individual has 56,460 records (A)
I would like to write out a "transposed" file (i.e. 258 lines & 56,460 columns).
When I execute the following code:
system.time(snp1 %>%
#filter(`Sample ID`=='8362974') %>%
select(`Sample ID`,A) %>%
mutate(id = row_number()) %>%
#head(n=nsnp) %>%
pivot_wider(names_from=id,
values_from = A)->T)
I got the following error:
Error in rep_len(NA_integer_, n) : invalid 'length.out' value
In addition: Warning message:
In nrow * ncol : NAs produced by integer overflow
Timing stopped at: 28.73 0.62 29.36
If I use only 1 ID it works correctly
Best
Stefano
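Does it work if you group the records by individual before calculating the row_number (record ID)? Without grouping, id runs from 1 to 14,566,680, so pivot_wider tries to create one column per record; 258 rows times that many columns exceeds the integer limit, which is the overflow in the warning. With grouping, id restarts at 1 for each individual and only 56,460 value columns are created.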
# made up sample
df <- tibble(`Sample ID` = rep(1:258, each = 56460)) %>%
mutate(A = rnorm(nrow(.)))
df %>%
group_by(`Sample ID`) %>%
mutate(id = row_number()) %>%
pivot_wider(names_from=id,
values_from = A)
# A tibble: 258 x 56,461
# Groups: Sample ID [258]
`Sample ID` `1` `2` `3` `4` `5` `6` `7`
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1.49 0.546 0.0517 -0.480 -0.500 0.266 -1.52
2 2 -0.391 -0.855 -1.28 -0.0277 -0.999 0.617 -0.415
3 3 0.200 0.484 1.08 -0.568 1.16 1.75 -0.143
4 4 0.212 0.371 0.674 0.0481 -1.09 -1.07 0.160
5 5 0.409 1.54 0.931 -0.280 1.27 0.0447 0.426
6 6 -0.936 0.903 -0.0408 0.590 -1.52 -1.14 -0.600
7 7 -1.97 0.336 -0.233 0.488 0.995 -0.933 -1.90
8 8 -0.396 2.12 1.10 0.304 0.290 0.595 -1.32
9 9 -1.31 -0.124 -0.804 -0.447 1.12 -0.721 0.378
10 10 0.977 0.818 1.51 -0.258 -0.00794 0.0386 2.03
# ... with 248 more rows, and 56,453 more variables: ...

Calculate all possible interactions in model_matrix

I'm simulating data with a fluctuating number of variables. As part of the simulation, I need to calculate a model matrix with all possible interactions. See the following reprex for an example. I am able to get all two-way interactions by specifying the formula as ~ .*.. However, this particular dataset has 3 variables (ndim <- 3). I can get all two- and three-way interactions by specifying the formula as ~ .^3. The issue is that there may be 4+ variables that I need to calculate, so I would like to be able to generalize this. I have tried specifying the formula as ~ .^ndim, but this throws an error.
Is there a way to define the power in the formula with a variable?
library(tidyverse)
library(mvtnorm)
library(modelr)
ndim <- 3
data <- rmvnorm(100, mean = rep(0, ndim)) %>%
as_tibble(.name_repair = ~ paste0("dim_", seq_len(ndim)))
model_matrix(data, ~ .*.)
#> # A tibble: 100 x 7
#> `(Intercept)` dim_1 dim_2 dim_3 `dim_1:dim_2` `dim_1:dim_3`
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 -0.775 0.214 0.111 -0.166 -0.0857
#> 2 1 1.25 -0.0636 1.40 -0.0794 1.75
#> 3 1 1.07 -0.361 0.976 -0.384 1.04
#> 4 1 2.08 0.381 0.593 0.793 1.24
#> 5 1 -0.197 0.382 -0.257 -0.0753 0.0506
#> 6 1 0.266 -1.82 0.00411 -0.485 0.00109
#> 7 1 3.09 2.57 -0.612 7.96 -1.89
#> 8 1 2.03 0.247 0.112 0.501 0.226
#> 9 1 -0.397 0.204 1.55 -0.0810 -0.614
#> 10 1 0.597 0.335 0.533 0.200 0.319
#> # … with 90 more rows, and 1 more variable: `dim_2:dim_3` <dbl>
model_matrix(data, ~ .^3)
#> # A tibble: 100 x 8
#> `(Intercept)` dim_1 dim_2 dim_3 `dim_1:dim_2` `dim_1:dim_3`
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 -0.775 0.214 0.111 -0.166 -0.0857
#> 2 1 1.25 -0.0636 1.40 -0.0794 1.75
#> 3 1 1.07 -0.361 0.976 -0.384 1.04
#> 4 1 2.08 0.381 0.593 0.793 1.24
#> 5 1 -0.197 0.382 -0.257 -0.0753 0.0506
#> 6 1 0.266 -1.82 0.00411 -0.485 0.00109
#> 7 1 3.09 2.57 -0.612 7.96 -1.89
#> 8 1 2.03 0.247 0.112 0.501 0.226
#> 9 1 -0.397 0.204 1.55 -0.0810 -0.614
#> 10 1 0.597 0.335 0.533 0.200 0.319
#> # … with 90 more rows, and 2 more variables: `dim_2:dim_3` <dbl>,
#> # `dim_1:dim_2:dim_3` <dbl>
model_matrix(data, ~.^ndim)
#> Error in terms.formula(object, data = data): invalid power in formula
Created on 2019-02-15 by the reprex package (v0.2.1)
You can use as.formula with paste0 to build the formula before passing it to model_matrix:
model_matrix(data, as.formula(paste0("~ .^", ndim)))

Resources