Apply function to two columns at a time with purrr - r

I have a simplified tibble where I select two columns (manually) and pass them to a custom function, but in this case just using sum. Any ideas on how I could expand this to accommodate any number of ko. In this case there's only 2, but let's say there were 5?
library(dplyr)
library(purrr)
df <- tibble(l2fc_ko1 = rnorm(1:10), l2fc_ko2 = rnorm(1:10), ctrl_ko1 = rnorm(1:10), ctrl_ko2 = rnorm(1:10))
df %>% mutate(ko1_sum = map2_dbl(ctrl_ko1, l2fc_ko1, sum),
ko2_sum = map2_dbl(ctrl_ko2, l2fc_ko2, sum))

We can use pivot_longer to reshape the data, creating a column for each level of ko. Compute the sum, then pivot_wider to get back to your original format:
library(tidyverse)
df %>%
mutate(idx = row_number()) %>%
pivot_longer(-idx, names_sep = '_', names_to = c('group', 'ko')) %>%
pivot_wider(names_from = group, values_from = value) %>%
mutate(sum = l2fc + ctrl) %>%
pivot_wider(names_from = ko, values_from = c(l2fc, ctrl, sum))
idx l2fc_ko1 l2fc_ko2 ctrl_ko1 ctrl_ko2 sum_ko1 sum_ko2
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 -1.04 -0.710 -0.288 -1.65 -1.33 -2.36
2 2 0.0338 0.400 -0.850 0.319 -0.816 0.719
3 3 2.08 0.723 0.325 0.314 2.40 1.04
4 4 0.740 -0.411 -0.307 1.77 0.433 1.36
5 5 0.347 -1.57 -0.153 0.657 0.195 -0.915
6 6 -0.998 -0.145 0.265 -1.95 -0.733 -2.09
7 7 2.05 -0.0876 -0.909 -0.190 1.14 -0.278
8 8 0.0735 -0.134 -2.04 -0.832 -1.96 -0.966
9 9 1.52 2.37 1.53 -0.596 3.05 1.78
10 10 1.42 -0.753 -1.61 1.84 -0.194 1.09

If you have a dynamic number of paired ctrl_/l2fc_ columns, then try this:
Ensure we have all ctrl_ that have a corresponding l2fc_ (and vice versa):
ctrls <- grep("^ctrl_ko", names(df), value = TRUE)
l2fcs <- gsub("^ctrl", "l2fc", ctrls)
ctrls <- ctrls[ l2fcs %in% names(df) ]
l2fcs <- l2fcs[ l2fcs %in% names(df) ] # or intersect(l2fcs, names(df))
Combine these into one vector (we'll split on it later) and convert this to the new _sum names we'll need.
nms <- c(l2fcs, ctrls)
nms
# [1] "l2fc_ko1" "l2fc_ko2" "ctrl_ko1" "ctrl_ko2"
newnms <- gsub("ctrl_(.*)", "\\1_sum", ctrls)
newnms
# [1] "ko1_sum" "ko2_sum"
Using split.default (which will split the df into groups of columns) and rowSums, we can devise two _sum columns:
setNames(as.data.frame(lapply(split.default(df[nms], gsub(".*_ko", "", nms)), rowSums)), newnms)
# ko1_sum ko2_sum
# 1 1.0643199 1.7603198
# 2 -2.3460066 2.9914827
# 3 0.1912111 -0.3537572
# 4 1.8475373 -0.8877151
# 5 2.2994618 0.3716338
# 6 -0.5365936 -1.0810583
# 7 1.2542526 -1.0687119
# 8 -1.8578221 -3.5073630
# 9 2.4785211 -4.8546746
# 10 -0.7027090 1.3562360
We can cbind/bind_cols those in, or we can mutate them just as well. For the latter, we'll replace df with cur_data() for within the mutate environment, and we'll need to add as.data.frame)
Choose one of the following, all producing effectively the same results:
cbind(df, setNames(lapply(split.default(df[nms], gsub(".*_ko", "", nms)), rowSums), newnms))
bind_cols(df, setNames(lapply(split.default(df[nms], gsub(".*_ko", "", nms)), rowSums), newnms))
df %>%
mutate(
setNames(
as.data.frame(
lapply(split.default(cur_data()[nms], gsub(".*_ko", "", nms)), rowSums)),
newnms)
)
# # A tibble: 10 x 6
# l2fc_ko1 l2fc_ko2 ctrl_ko1 ctrl_ko2 ko1_sum ko2_sum
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1.37 1.30 -0.307 0.455 1.06 1.76
# 2 -0.565 2.29 -1.78 0.705 -2.35 2.99
# 3 0.363 -1.39 -0.172 1.04 0.191 -0.354
# 4 0.633 -0.279 1.21 -0.609 1.85 -0.888
# 5 0.404 -0.133 1.90 0.505 2.30 0.372
# 6 -0.106 0.636 -0.430 -1.72 -0.537 -1.08
# 7 1.51 -0.284 -0.257 -0.784 1.25 -1.07
# 8 -0.0947 -2.66 -1.76 -0.851 -1.86 -3.51
# 9 2.02 -2.44 0.460 -2.41 2.48 -4.85
# 10 -0.0627 1.32 -0.640 0.0361 -0.703 1.36

How about rowwise? You can specify the columns you want with c or c_across.
df %>%
rowwise() %>%
mutate(total = sum(c_across(ends_with("ko1"))))
# A tibble: 10 x 5
# Rowwise:
l2fc_ko1 l2fc_ko2 ctrl_ko1 ctrl_ko2 total
<dbl> <dbl> <dbl> <dbl> <dbl>
1 -0.179 0.496 -1.10 -0.375 -1.27
2 -0.0887 -0.873 0.613 -0.348 0.525
3 -2.33 -0.322 -0.515 3.03 -2.84
4 -0.602 -0.0387 0.704 -0.118 0.102
5 -0.389 -0.00801 0.276 0.500 -0.113
6 -2.18 0.648 -0.485 -0.243 -2.66
7 0.0529 0.237 -0.371 -0.0382 -0.318
8 0.818 -0.181 1.11 -1.25 1.93
9 -0.271 -0.883 0.480 -0.296 0.209
10 -0.208 -1.11 1.09 -0.528 0.882

Related

r function similar to pmin() but finds nth lowest value across columns in dataframe?

I would like a function that would find the nth lowest value across columns. In other words, a function that is similar to pmin() but rather than finding the lowest, I am hoping it returns the nth lowest. Thank you in advance!
df %>%
rowid_to_column() %>%
pivot_longer(-rowid)%>%
arrange(value)%>% #You could arrange with decreasing to find max
group_by(rowid) %>%
summarise(value = nth(value, 2)) # Find the second minimum
# A tibble: 10 x 2
rowid value
<int> <dbl>
1 1 -0.560
2 2 -0.218
3 3 0.401
4 4 0.0705
5 5 -0.556
6 6 1.72
7 7 0.498
8 8 -1.27
9 9 -0.687
10 10 -0.446
Here is a simple one (it could be modified to deal with NAs):
nth_lowest <- function(x,n) x[order(x)[n]]
Apply it to a data frame, using rowwise() and c_across() from the dplyr package.
df %>%
rowwise() %>%
mutate( second_lowest = f(c_across(x:z),2))
Output:
x y z second_lowest
<dbl> <dbl> <dbl> <dbl>
1 -0.560 1.22 -1.07 -0.560
2 -0.230 0.360 -0.218 -0.218
3 1.56 0.401 -1.03 0.401
4 0.0705 0.111 -0.729 0.0705
5 0.129 -0.556 -0.625 -0.556
6 1.72 1.79 -1.69 1.72
7 0.461 0.498 0.838 0.498
8 -1.27 -1.97 0.153 -1.27
9 -0.687 0.701 -1.14 -0.687
10 -0.446 -0.473 1.25 -0.446
Input:
set.seed(123)
df <- data.frame(x=rnorm(10), y=rnorm(10), z=rnorm(10))
We may also do this with pmap and nth
library(purrr)
library(dplyr)
pmap_dbl(df, ~ nth(sort(c(...)), n = 2))

How do I ask R to subtract adjacent columns in a dataframe?

Basically, I have 14 columns of numerical variables (say v1 to v14), and I want R to do the following mutation:
v1=v1-v2
v3=v3-v4
v5=v5-v6
...
Here is what I tried. I called all the odd-column variables 'source_vars' and
the even-column variables 'use_vars', and tried the following
# Defining source and use vars
source_vars<-c("ofc_s","privnonfin_s","hh_s",
"ROW_s", "total_s", "banking_s", "totgov_s")
use_vars<-c("ofc_u","privnonfin_u","hh_u",
"ROW_u", "total_u", "banking_u", "totgov_u")
new<-sw_flows%>%filter(sector=="Total")%>%
mutate(sector="Source-Use", source_vars=source_vars-use_vars)
This didnt work.
Is there an efficient way to do this without having to name the variables?
Here is a purrr one-liner.
I first create a test data set, then get the odd and even column numbers with the modulus operator and finally use map2 to apply function - to the columns.
set.seed(2022)
df1 <- matrix(rnorm(10*14), ncol = 14)
df1 <- as.data.frame(df1)
source_vars <- which(seq.int(ncol(df1)) %% 2 == 1)
use_vars <- seq.int(ncol(df1))[-source_vars]
purrr::map2_dfc(df1[source_vars], df1[use_vars], `-`)
#> # A tibble: 10 x 7
#> V1 V3 V5 V7 V9 V11 V13
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 -0.106 0.247 -1.96 -1.20 -1.76 -0.861 -1.38
#> 2 -0.988 -0.448 -0.188 -2.62 1.61 -0.221 0.636
#> 3 0.0843 2.67 -1.31 -0.746 0.648 1.25 -1.27
#> 4 -1.54 1.43 -2.67 0.0901 -0.706 0.667 1.43
#> 5 -0.278 0.469 -0.249 -2.14 -5.08 1.00 2.69
#> 6 -2.82 -1.94 -1.43 0.185 1.14 2.47 -0.0427
#> 7 -0.405 -0.430 0.317 -2.16 3.22 0.448 -2.23
#> 8 1.23 0.186 -1.44 -0.294 0.667 -0.749 0.205
#> 9 -0.270 -0.675 -0.851 0.838 2.50 -0.597 -0.246
#> 10 -0.617 -0.0703 -0.0478 -0.464 -0.513 2.17 1.94
Created on 2022-02-26 by the reprex package (v2.0.1)
And a base R way.
res <- Map(\(x, y) x - y, df1[source_vars], df1[use_vars])
do.call(cbind.data.frame, res)
Created on 2022-02-26 by the reprex package (v2.0.1)
If we want to subtract odd from even column sequence columns (assuming there are equal number of even/odd columns
sw_flows[c(TRUE, FALSE)] - sw_flows[c(FALSE, TRUE)]

How to mutate multiple columns following name pattern?

I have a dataset with multiple columns that follow a name pattern, and I need to calculate new columns that is the product of two other columns. I am looking for a tidyverse option, but I would want to avoid to do a pivot_longer as the dataset has >million rows.
Example dataset
library(dplyr)
df <- tibble(
jan_mean = runif(10),
feb_mean = runif(10),
mar_mean = runif(10),
jan_sd = runif(10),
feb_sd = runif(10),
mar_sd = runif(10),
)
I can do it manually like this:
df2 <- df %>%
mutate(jan_cv= jan_mean/jan_sd,
feb_cv= feb_mean/feb_sd,
mar_cv= mar_mean/mar_sd
)
This is a simple example, but I have similar operations for monthly values.
EDIT 1
I need to do this for large datasets and I was worried that pivot_longer would be quite consuming, so I did a quick comparison of the three methods.
Method 1 is the manual way, Method 2 is the short version suggested by #Tarjae, and Method 3 is using pivot longer:
tic("Method 1: manual option")
df2 <- df %>%
mutate(jan_cv= jan_mean/jan_sd,
feb_cv= feb_mean/feb_sd,
mar_cv= mar_mean/mar_sd
)
toc()
tic("Method 2: Short option")
df2 <- df %>%
mutate(across(ends_with('_mean'), ~ . /
get(str_replace(cur_column(), "mean$", "sd")), .names = "{.col}_cv")) %>%
rename_at(vars(ends_with('cv')), ~ str_remove(., "\\_mean"))
toc()
tic("Method 3: pivot wider option")
df2 <- df %>%
mutate(id = row_number()) %>%
pivot_longer(-id, names_to = c("month", ".value"), names_sep = "_") %>%
mutate(cv = mean / sd) %>%
pivot_wider(names_from = "month", values_from = c(mean, sd, cv), names_glue = "{month}_{.value}") %>%
select(-id)
toc()
The results are:
Method 1: manual option: 0.05 sec elapsed
Method 2: Short option: 0.01 sec elapsed
Method 3: pivot wider option: 0.19 sec elapsed
So method 2 is even faster than manually doing each column
We could use across in this situation with some string manipulation with stringr:
library(dplyr)
library(stringr)
df %>%
mutate(across(ends_with('_mean'), ~ . /
get(str_replace(cur_column(), "mean$", "sd")), .names = "{.col}_cv")) %>%
rename_at(vars(ends_with('cv')), ~ str_remove(., "\\_mean"))
jan_mean feb_mean mar_mean jan_sd feb_sd mar_sd jan_cv feb_cv mar_cv
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.838 0.401 0.131 0.329 0.0292 0.911 2.55 13.7 0.144
2 0.595 0.173 0.0935 0.313 0.105 0.247 1.90 1.64 0.378
3 0.0546 0.934 0.983 0.536 0.618 0.292 0.102 1.51 3.36
4 0.543 0.802 0.569 0.585 0.901 0.742 0.928 0.891 0.766
5 0.899 0.761 0.245 0.932 0.506 0.526 0.965 1.50 0.466
6 0.832 0.875 0.947 0.390 0.613 0.607 2.13 1.43 1.56
7 0.268 0.421 0.930 0.869 0.873 0.612 0.308 0.483 1.52
8 0.475 0.217 0.330 0.0473 0.826 0.903 10.0 0.262 0.366
9 0.379 0.425 0.479 0.931 0.381 0.223 0.407 1.12 2.15
10 0.616 0.922 0.707 0.976 0.241 0.619 0.631 3.82 1.14
One option to achieve your desired result would be to convert your data to long format which makes it easy to do the computations per month and if desired convert back to wide format afterwards. To this end I first added an identifier column to your data:
library(dplyr)
library(tidyr)
set.seed(42)
df %>%
mutate(id = row_number()) %>%
pivot_longer(-id, names_to = c("month", ".value"), names_sep = "_") %>%
mutate(cv = mean / sd) %>%
pivot_wider(names_from = "month", values_from = c(mean, sd, cv), names_glue = "{month}_{.value}") %>%
select(-id)
#> # A tibble: 10 × 9
#> jan_mean feb_mean mar_mean jan_sd feb_sd mar_sd jan_cv feb_cv mar_cv
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0.915 0.458 0.904 0.738 0.380 0.333 1.24 1.21 2.71
#> 2 0.937 0.719 0.139 0.811 0.436 0.347 1.16 1.65 0.400
#> 3 0.286 0.935 0.989 0.388 0.0374 0.398 0.737 25.0 2.48
#> 4 0.830 0.255 0.947 0.685 0.974 0.785 1.21 0.262 1.21
#> 5 0.642 0.462 0.0824 0.00395 0.432 0.0389 163. 1.07 2.12
#> 6 0.519 0.940 0.514 0.833 0.958 0.749 0.623 0.982 0.687
#> 7 0.737 0.978 0.390 0.00733 0.888 0.677 100. 1.10 0.576
#> 8 0.135 0.117 0.906 0.208 0.640 0.171 0.648 0.184 5.29
#> 9 0.657 0.475 0.447 0.907 0.971 0.261 0.725 0.489 1.71
#> 10 0.705 0.560 0.836 0.612 0.619 0.514 1.15 0.905 1.63

Can I overlook a missing variable in a summing part of a function?

This is a shortened version of my real df. I have a function (called: calc) which creates a new variable called 'total', for simplicity this adds up three variables: a, b, c. When I add a dataframe, to that function, that does not feature one variable (say c) so only has a & b, the function falls over. Is there a 'function' / simple way that counts the variables regardless if they are missing?
calc <- function(x) {x %>% mutate(total = a + b + c)}
data.2 has two columns a & b with many rows of values, but when running that in the function it cannot find c so does not calculate.
new.df <- calc(data.2)
Many thanks.
If you want to perform rowwise sum or mean they have na.rm argument which you can use to ignore NA values.
library(dplyr)
calc <- function(x) {x %>% mutate(total = rowSums(select(., a:c), na.rm = TRUE))}
In general case if you are not able to find a function which gives you an out-of-box solution you can replace NA values with 0 maybe and then perform the operation that you want to perform.
calc <- function(x) {
x %>%
mutate(across(a:c, tidyr::replace_na, 0),
total = a + b + c)
}
You can use rowwise() and c_across() with any_of() (or any other tidyselect function) from dplyr (>= 1.0.0).
library(dplyr)
df <- data.frame(a = rnorm(10), b = rnorm(10))
dfc <- data.frame(a = rnorm(10), b = rnorm(10), c = rnorm(10))
calc <- function(x) {
x %>%
rowwise() %>%
mutate(total = sum(c_across(any_of(c("a", "b", "c"))))) %>%
ungroup()
}
calc(df)
#> # A tibble: 10 x 3
#> a b total
#> <dbl> <dbl> <dbl>
#> 1 -0.884 0.851 -0.0339
#> 2 -1.56 -0.464 -2.02
#> 3 -0.884 0.815 -0.0689
#> 4 -1.46 -0.259 -1.71
#> 5 0.211 -0.528 -0.317
#> 6 1.85 0.190 2.04
#> 7 -1.31 -0.921 -2.23
#> 8 0.450 0.394 0.845
#> 9 -1.14 0.428 -0.714
#> 10 -1.11 0.417 -0.698
calc(dfc)
#> # A tibble: 10 x 4
#> a b c total
#> <dbl> <dbl> <dbl> <dbl>
#> 1 -0.0868 0.632 1.81 2.36
#> 2 0.568 -0.523 0.240 0.286
#> 3 -0.0325 0.377 -0.437 -0.0921
#> 4 0.660 0.456 1.28 2.39
#> 5 -0.123 1.75 -1.03 0.599
#> 6 0.641 1.39 0.902 2.93
#> 7 0.266 0.520 0.904 1.69
#> 8 -1.53 0.319 0.439 -0.776
#> 9 0.942 0.468 -1.69 -0.277
#> 10 0.254 -0.600 -0.196 -0.542
If you want to be able to generalize beyond those 3 variables you can use any tidyselect methodology.
df <- data.frame(a = rnorm(10), b = rnorm(10))
dfc <- data.frame(a = rnorm(10), b = rnorm(10), c = rnorm(10))
calc <- function(x) {
x %>%
rowwise() %>%
mutate(total = sum(c_across(everything()))) %>%
ungroup()
}
calc(df)
#> # A tibble: 10 x 3
#> a b total
#> <dbl> <dbl> <dbl>
#> 1 0.775 1.17 1.95
#> 2 -1.05 1.21 0.155
#> 3 2.07 -0.264 1.81
#> 4 1.11 0.793 1.90
#> 5 -0.700 -0.216 -0.916
#> 6 -1.04 -1.03 -2.07
#> 7 -0.525 1.60 1.07
#> 8 0.354 0.828 1.18
#> 9 0.126 0.110 0.236
#> 10 -0.0954 -0.603 -0.698
calc(dfc)
#> # A tibble: 10 x 4
#> a b c total
#> <dbl> <dbl> <dbl> <dbl>
#> 1 -0.616 0.767 0.0462 0.196
#> 2 -0.370 -0.538 -0.186 -1.09
#> 3 0.337 1.11 -0.700 0.751
#> 4 -0.993 -0.531 -0.984 -2.51
#> 5 0.0538 1.50 -0.0808 1.47
#> 6 -0.907 -1.54 -0.734 -3.18
#> 7 -1.65 -0.242 1.43 -0.455
#> 8 -0.166 0.447 -0.281 -0.000524
#> 9 0.0637 -0.0185 0.754 0.800
#> 10 1.81 -1.09 -2.15 -1.42
Created on 2020-09-10 by the reprex package (v0.3.0)

Error in using tidyverse function pivot_wider

[enter image description here][1]Dear all,
I have a very large file (14,566,680 records) with 2 variables (ID and A).
The first variable (ID) is the individual (n=258) and each individual has 56,460 records (A)
I would like to write out a "transpose" file (i.e. 258 lines & 54460 columns).
When I execute the following code:
system.time(snp1 %>%
#filter(`Sample ID`=='8362974') %>%
select(`Sample ID`,A) %>%
mutate(id = row_number()) %>%
#head(n=nsnp) %>%
pivot_wider(names_from=id,
values_from = A)->T)
I got the following error:
Error in rep_len(NA_integer_, n) : invalid 'length.out' value
In addition: Warning message:
In nrow * ncol : NAs produced by integer overflow
Timing stopped at: 28.73 0.62 29.36
If I use only 1 ID it works correctly
Best
Stefano
Does it work if you group the records by individual before calculating the row_number (record ID)?
# made up sample
df <- tibble(`Sample ID` = rep(1:258, each = 56460)) %>%
mutate(A = rnorm(nrow(.)))
df %>%
group_by(`Sample ID`) %>%
mutate(id = row_number()) %>%
pivot_wider(names_from=id,
values_from = A)
# A tibble: 258 x 56,461
# Groups: Sample ID [258]
`Sample ID` `1` `2` `3` `4` `5` `6` `7`
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1.49 0.546 0.0517 -0.480 -0.500 0.266 -1.52
2 2 -0.391 -0.855 -1.28 -0.0277 -0.999 0.617 -0.415
3 3 0.200 0.484 1.08 -0.568 1.16 1.75 -0.143
4 4 0.212 0.371 0.674 0.0481 -1.09 -1.07 0.160
5 5 0.409 1.54 0.931 -0.280 1.27 0.0447 0.426
6 6 -0.936 0.903 -0.0408 0.590 -1.52 -1.14 -0.600
7 7 -1.97 0.336 -0.233 0.488 0.995 -0.933 -1.90
8 8 -0.396 2.12 1.10 0.304 0.290 0.595 -1.32
9 9 -1.31 -0.124 -0.804 -0.447 1.12 -0.721 0.378
10 10 0.977 0.818 1.51 -0.258 -0.00794 0.0386 2.03
# ... with 248 more rows, and 56,453 more variables: ...

Resources