change pattern of cols with same name in purrr::map_dfc R - r

I have a function that generates a dataframe with 2 cols (X and Y).
I want to use map_dfc but I would like to change the suffixes "...1", "...2" and so on that appear because the col names are the same
I would like something as (X_df1, Y_df1, X_df2, Y_df2, ...). Is there a suffix parameter? I've read the documentation and couldn't find
I don't want to use map_dfr because I need the dataframe to be wide.
example_function <- function(n1,n2){
tibble(X = n1+n2,
Y = n1*n2)
}
values <- tibble(n1 = c(1,2),
n2 = c(5,6))
map2_dfc(values$n1, values$n2, example_function)
gives me
A tibble: 1 x 4
X...1 Y...2 X...3 Y...4
<dbl> <dbl> <dbl> <dbl>
1 6 5 8 12
And I want
A tibble: 1 x 4
X_df1 Y_df1 X_df2 Y_df2
<dbl> <dbl> <dbl> <dbl>
1 6 5 8 12
Thanks!

If we don't want to change the function, we can rename before binding the cols - use pmap to loop over the rows the data, apply the function (example_function), loop over the list with imap, rename all the columns of the list of tibbles with the list index and then use bind_cols
library(dplyr)
library(purrr)
library(stringr)
pmap(values, example_function) %>%
imap(~ {nm1 <- str_c('_df', .y)
rename_with(.x, ~ str_c(., nm1), everything())
}) %>%
bind_cols
-output
# A tibble: 1 × 4
X_df1 Y_df1 X_df2 Y_df2
<dbl> <dbl> <dbl> <dbl>
1 6 5 8 12

Or you could just build the new names first and apply them after you call map2_dfc():
library(purrr)
library(tibble)
example_function <- function(n1,n2){
tibble(X = n1+n2,
Y = n1*n2)
}
values <- tibble(n1 = c(1,2),
n2 = c(5,6))
new_names <- lapply(seq_len(ncol(values)), function(x) paste0(c("X", "Y"), "_df", x)) %>%
unlist()
map2_dfc(values$n1, values$n2, example_function) %>%
setNames(new_names)
#> New names:
#> * X -> X...1
#> * Y -> Y...2
#> * X -> X...3
#> * Y -> Y...4
#> # A tibble: 1 x 4
#> X_df1 Y_df1 X_df2 Y_df2
#> <dbl> <dbl> <dbl> <dbl>
#> 1 6 5 8 12
Created on 2022-04-08 by the reprex package (v2.0.1)

Related

How do I map a function to each row in a tibble in R?

I am trying to map a function to each row in a tibble. Please see code below. My desired workflow is as follows -
Convert a list with sub lists to a tibble
Map each row the tibble to a function
My desired output should be a list with a tibble as output for each row mapped to the function. See full code below -
# Packages
library(tidyverse)
library(purrr)
# Function i want to map
sample_func <- function(tib){
a <- tib$name
b <- tib$qty
c <- tib$price
d <- tib$add
e <- b+c+d
t <- tibble(e = c(e), stock = c(a))
return(t)
}
# Define the list with multiple sublists
lst <- list(c( "CHR1", 15, 222.14, 6), c( "CHR2", 10, 119.20, 10))
# Convert each sublist to a tibble and bind the rows
tib <- bind_rows(lapply(lst, function(x) {
tibble(name = x[1], qty = x[2] %>% as.numeric(), price = x[3] %>% as.numeric(),
add = x[4] %>% as.numeric())
}))
# Apply the function to each row in the tibble using map()
result <- tib %>%
rowwise() %>%
mutate(temp = map(list(name, qty, price, add), sample_func)) %>%
unnest(temp)
My desired output should be -
[[1]]
# A tibble: 1 × 2
e name
<dbl> <chr>
1 243. CHR1
[[2]]
# A tibble: 1 × 2
e name
<dbl> <chr>
1 139. CHR2
However when the final rowwise mapping, I get the following error -
Error in `mutate()`:
! Problem while computing `temp = map(list(name, qty, price, add), sample_func)`.
ℹ The error occurred in row 1.
Caused by error in `map()`:
ℹ In index: 1.
Caused by error in `tib$name`:
! $ operator is invalid for atomic vectors
What am I doing wrong here?
An alternative approach is to change the inputs of the sample_func function to be the names of the columns instead of the tibble, then you can do this with pmap():
# Function i want to map
sample_func <- function(name, qty, price, add){
a <- name
b <- qty
c <- price
d <- add
e <- b+c+d
t <- tibble(e = c(e), stock = c(a))
return(t)
}
# Define the list with multiple sublists
lst <- list(c( "CHR1", 15, 222.14, 6), c( "CHR2", 10, 119.20, 10))
# Convert each sublist to a tibble and bind the rows
tib <- bind_rows(lapply(lst, function(x) {
tibble(name = x[1], qty = x[2] %>% as.numeric(), price = x[3] %>% as.numeric(),
add = x[4] %>% as.numeric())
}))
# Apply the function to each row in the tibble using map()
pmap(tib, sample_func)
Instead of passing a tibble to the function you may pass columns of the tibble as vector.
library(dplyr)
library(purrr)
sample_func <- function(name, qty, price, add){
res <- tibble(e = qty + price + add, stock = name)
return(res)
}
You may then use pmap -
out <- tib %>%
mutate(res = pmap(list(name, qty, price, add), sample_func))
out
# A tibble: 2 × 5
# name qty price add res
# <chr> <dbl> <dbl> <dbl> <list>
#1 CHR1 15 222. 6 <tibble [1 × 2]>
#2 CHR2 10 119. 10 <tibble [1 × 2]>
out$res
#[[1]]
# A tibble: 1 × 2
# e stock
# <dbl> <chr>
#1 243. CHR1
#[[2]]
# A tibble: 1 × 2
# e stock
# <dbl> <chr>
#1 139. CHR2
You may use unnest to get separate columns.
out %>% unnest(res)
# name qty price add e stock
# <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#1 CHR1 15 222. 6 243. CHR1
#2 CHR2 10 119. 10 139. CHR2
We could just apply the sample_func on the picked dataset and unnest
library(dplyr)
library(tidyr)
tib %>%
transmute(temp = sample_func(pick(everything()))) %>%
unnest(where(is_tibble))
-output
# A tibble: 2 × 2
e stock
<dbl> <chr>
1 243. CHR1
2 139. CHR2
If we want it as a list of tibbles
tib %>%
rowwise %>%
reframe(temp = list(sample_func(pick(everything())))) %>%
pull(temp)
-output
[[1]]
# A tibble: 1 × 2
e stock
<dbl> <chr>
1 243. CHR1
[[2]]
# A tibble: 1 × 2
e stock
<dbl> <chr>
1 139. CHR2
To get your desired output and without changing your function or tibble we can use dplyr::rowwise() and dplyr::group_map().
With rowwise we tell 'dplyr' to treat each row as a group. With group_map we apply a function to each group (in our case row) and the function takes the data.frame of each group as input .x which fits your sample_func() perfectly.
library(dplyr)
tib %>%
rowwise() %>%
group_map(~ sample_func(.x))
#> [[1]]
#> # A tibble: 1 × 2
#> e stock
#> <dbl> <chr>
#> 1 243. CHR1
#>
#> [[2]]
#> # A tibble: 1 × 2
#> e stock
#> <dbl> <chr>
#> 1 139. CHR2
Data from OP
library(tidyverse)
# Function i want to map
sample_func <- function(tib){
a <- tib$name
b <- tib$qty
c <- tib$price
d <- tib$add
e <- b+c+d
t <- tibble(e = c(e), stock = c(a))
return(t)
}
# Define the list with multiple sublists
lst <- list(c( "CHR1", 15, 222.14, 6), c( "CHR2", 10, 119.20, 10))
# Convert each sublist to a tibble and bind the rows
tib <- bind_rows(lapply(lst, function(x) {
tibble(name = x[1], qty = x[2] %>% as.numeric(), price = x[3] %>% as.numeric(),
add = x[4] %>% as.numeric())
}))
Created on 2023-02-12 with reprex v2.0.2

How to use vector of column names as input into dplyr::group_by()?

I want to create a function based on dplyr that performs certain operations on subsets of data. The subsets are defined by values of one or more key columns in the dataset. When only one column is used to identify subsets, my code works fine:
set.seed(1)
df <- tibble(
g1 = c(1, 1, 2, 2, 2),
g2 = c(1, 2, 1, 2, 1),
a = sample(5)
)
group_key <- "g1"
aggregate <- function(df, by) {
df %>% group_by(!!sym(by)) %>% summarize(a = mean(a))
}
aggregate(df, by = group_key)
This works as expected and returns something like this:
# A tibble: 2 x 2
g1 a
<dbl> <dbl>
1 1 1.5
2 2 4
Unfortunately everything breaks down if I change group_key:
group_key <- c("g1", "g2")
aggregate(df, by = group_key)
I get an error: Only strings can be converted to symbols, which I think comes from rlang::sym(). Replacing it with syms() does not work since I get a list of names, on which group_by() chokes.
Any suggestions would be appreciated!
You need to use the unquote-splice operator !!!:
aggregate <- function(df, by) {
df %>% group_by(!!!syms(by)) %>% summarize(a = mean(a))
}
group_key <- c("g1", "g2")
aggregate(df, by = group_key)
## A tibble: 4 x 3
## Groups: g1 [2]
# g1 g2 a
# <dbl> <dbl> <dbl>
#1 1 1 1
#2 1 2 4
#3 2 1 2.5
#4 2 2 5
Alternatively, you can use dplyr::group_by_at:
agg <- function(df, by) {
require(dplyr)
df %>% group_by_at(vars(one_of(by))) %>% summarize(a = mean(a))}
group_key <- "g1"
group_keys <- c("g1","g2")
agg(df, by = group_key)
#> # A tibble: 2 x 2
#> g1 a
#> <dbl> <dbl>
#> 1 1 2.5
#> 2 2 3.33
agg(df, by = group_keys)
#> # A tibble: 4 x 3
#> # Groups: g1 [2]
#> g1 g2 a
#> <dbl> <dbl> <dbl>
#> 1 1 1 1
#> 2 1 2 4
#> 3 2 1 2.5
#> 4 2 2 5
Update with dplyr 1.0.0
The new across() allows tidyselect functions like all_of which replaces the quote-unqote procedure of NSE. The code looks a bit simpler with that:
aggregate <- function(df, by) {
df %>%
group_by(across(all_of(by))) %>%
summarize(a = mean(a))
}
df %>% aggregate(group_key)

Omitting columns instead of dropping them in purrr

I need to calculate an index for multiple lists. However, I can only do this if I drop some columns (here represented by "w" and "x"). For ex.
library(tidyverse)
lists<- list(
l1=tribble(
~w, ~x, ~y, ~z,
#--|--|--|----
12, "a", 2, 1,
12, "a",5, 3,
12, "a",6, 2),
l2=tribble(
~w, ~x, ~y, ~z,
#--|--|--|----
13,"b", 5, 7,
13,"b", 4, 6,
13,"b", 3, 2))
lists %>%
map(~ .x %>%
#group_by(w,x) %>%
select(-w,-x) %>%
mutate(row_sums = rowSums(.)))
Instead of dropping those columns I would like to keep/omit them and calculate the index only for "y" and "z".
I manage to do this by first extracting those columns and binding them again afterward. For ex.
select.col<-lists %>%
map_dfr(~ .x %>%
select(w,x))
lists %>%
map_dfr(~ .x %>%
select(-w,-x) %>%
mutate(row_sums = rowSums(.))) %>%
bind_cols(select.col)
However, this is not so elegant and I had to bind the lists (map_dfr), I would like to keep them as a list though.
Probably, another approach would be to use select_if(., is.numeric), but as I have some numeric columns I need to omit, I'm not sure whether this is the best option.
I'm certain there is a simple solution to this problem. Can anyone take a look at it?
Instead of dropping the columns, you can select the columns for which you want to take the sum.
You can select by name
library(dplyr)
library(purrr)
lists %>% map(~ .x %>% mutate(row_sums = rowSums(.[c("y", "z")])))
#$l1
# A tibble: 3 x 5
# w x y z row_sums
# <dbl> <chr> <dbl> <dbl> <dbl>
#1 12 a 2 1 3
#2 12 a 5 3 8
#3 12 a 6 2 8
#$l2
# A tibble: 3 x 5
# w x y z row_sums
# <dbl> <chr> <dbl> <dbl> <dbl>
#1 13 b 5 7 12
#2 13 b 4 6 10
#3 13 b 3 2 5
Or also by position of columns
lists %>% map(~ .x %>% mutate(row_sums = rowSums(.[3:4])))
Here is a tidyverse approach to get the row sums
library(tidyverse)
lists %>%
map(~ .x %>%
mutate(row_sums = select(., y:z) %>%
reduce(`+`)))
#$l1
# A tibble: 3 x 5
# w x y z row_sums
# <dbl> <chr> <dbl> <dbl> <dbl>
#1 12 a 2 1 3
#2 12 a 5 3 8
#3 12 a 6 2 8
#$l2
# A tibble: 3 x 5
# w x y z row_sums
# <dbl> <chr> <dbl> <dbl> <dbl>
#1 13 b 5 7 12
#2 13 b 4 6 10
#3 13 b 3 2 5
Or using base R
lapply(lists, transform, row_sums = y + z)

When I don't know column names in data.frame, when I use dplyr mutate function

I like to know how I can use dplyr mutate function when I don't know column names. Here is my example code;
library(dplyr)
w<-c(2,3,4)
x<-c(1,2,7)
y<-c(1,5,4)
z<-c(3,2,6)
df <- data.frame(w,x,y,z)
df %>% rowwise() %>% mutate(minimum = min(x,y,z))
Source: local data frame [3 x 5]
Groups: <by row>
# A tibble: 3 x 5
w x y z minimum
<dbl> <dbl> <dbl> <dbl> <dbl>
1 2 1 1 3 1
2 3 2 5 2 2
3 4 7 4 6 4
This code is finding minimum value in row-wise. Yes, "df %>% rowwise() %>% mutate(minimum = min(x,y,z))" works because I typed column names, x, y, z. But, let's assume that I have a really big data.frame with several hundred columns, and I don't know all of the column names. Or, I have multiple data sets of data.frame, and they have all different column names; I just want to find a minimum value from 10th column to 20th column in each row and in each data.frame.
In this example data.frame I provided above, let's assume that I don't know column names, but I just want to get minimum value from 2nd column to 4th column in each row. Of course, this doesn't work, because 'mutate' doesn't work with vector;
df %>% rowwise() %>% mutate(minimum=min(df[,2],df[,3], df[,4]))
Source: local data frame [3 x 5]
Groups: <by row>
# A tibble: 3 x 5
w x y z minimum
<dbl> <dbl> <dbl> <dbl> <dbl>
1 2 1 1 3 1
2 3 2 5 2 1
3 4 7 4 6 1
These two codes below also don't work.
df %>% rowwise() %>% mutate(average=min(colnames(df)[2], colnames(df)[3], colnames(df)[4]))
df %>% rowwise() %>% mutate(average=min(noquote(colnames(df)[2]), noquote(colnames(df)[3]), noquote(colnames(df)[4])))
I know that I can get minimum value by using apply or different method when I don't know column names. But, I like to know whether dplyr mutate function can be able to do that without known column names.
Thank you,
With apply:
library(dplyr)
library(purrr)
df %>%
mutate(minimum = apply(df[,2:4], 1, min))
or with pmap:
df %>%
mutate(minimum = pmap(.[2:4], min))
Also with by_row from purrrlyr:
df %>%
purrrlyr::by_row(~min(.[2:4]), .collate = "rows", .to = "minimum")
Output:
# tibble [3 x 5]
w x y z minimum
<dbl> <dbl> <dbl> <dbl> <dbl>
1 2 1 1 3 1
2 3 2 5 2 2
3 4 7 4 6 4
A vectorized option would be pmin. Convert the column names to symbols with syms and evaluate (!!!) to return the values of the columns on which pmin is applied
library(dplyr)
df %>%
mutate(minimum = pmin(!!! rlang::syms(names(.)[2:4])))
# w x y z minimum
#1 2 1 1 3 1
#2 3 2 5 2 2
#3 4 7 4 6 4
Here is a tidyeval approach along the lines of the suggestion from aosmith. If you don't know the column names, you can make a function that accepts the desired positions as inputs and finds the columns names itself. Here, rlang::syms() takes the column names as strings and turns them into symbols, !!! unquotes and splices the symbols into the function.
library(dplyr)
w<-c(2,3,4)
x<-c(1,2,7)
y<-c(1,5,4)
z<-c(3,2,6)
df <- data.frame(w,x,y,z)
rowwise_min <- function(df, min_cols){
cols <- df[, min_cols] %>% colnames %>% rlang::syms()
df %>%
rowwise %>%
mutate(minimum = min(!!!cols))
}
rowwise_min(df, 2:4)
#> Source: local data frame [3 x 5]
#> Groups: <by row>
#>
#> # A tibble: 3 x 5
#> w x y z minimum
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 2 1 1 3 1
#> 2 3 2 5 2 2
#> 3 4 7 4 6 4
rowwise_min(df, c(1, 3))
#> Source: local data frame [3 x 5]
#> Groups: <by row>
#>
#> # A tibble: 3 x 5
#> w x y z minimum
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 2 1 1 3 1
#> 2 3 2 5 2 3
#> 3 4 7 4 6 4
Created on 2018-09-04 by the reprex package (v0.2.0).

dplyr: passing a grouped tibble to a custom function

(The following scenario simplifies my actual situation)
My data comes from villages, and I would like to summarize an outcome variable by a village variable.
> data
village A Z Y
<chr> <int> <int> <dbl>
1 a 1 1 500
2 a 1 1 400
3 a 1 0 800
4 b 1 0 300
5 b 1 1 700
For example, I would like to calculate the mean of Y only using Z==z by villages. In this case, I want to have (500 + 400)/2 = 450 for village "a" and 700 for village "b".
Please note that the actual situation is more complicated and I cannot directly use this answer, but the point is I need to pass a grouped tibble and a global variable (z) to my function.
z <- 1 # z takes 0 or 1
data %>%
group_by(village) %>% # grouping by village
summarize(Y_village = Y_hat_village(., z)) # pass a part of tibble and a global variable
Y_hat_village <- function(data_village, z){
# This function takes a part of tibble (`data_village`) and a variable `z`
# Calculate the mean for a specific z in a village
data_z <- data_village %>% filter(Z==get("z"))
return(mean(data_z$Y))
}
However, I found . passes entire tibble and the code above returns the same values for all groups.
There are a couple things you can simplify. One is in your function: since you're passing in a value z to the function, you don't need to use get("z"). You have a z in the global environment that you pass in; or, more safely, assign your z value to a variable with some other name so you don't run into scoping issues, and pass that in to the function. In this case, I'm calling it z_val.
library(tidyverse)
z_val <- 1
Y_hat_village2 <- function(data, z) {
data_z <- data %>% filter(Z == z)
return(mean(data_z$Y))
}
You can make the function call on each group using do, which will get you a list-column, and then unnesting that column. Again note that I'm passing in the variable z_val to the argument z.
df %>%
group_by(village) %>%
do(y_hat = Y_hat_village2(., z = z_val)) %>%
unnest()
#> # A tibble: 2 x 2
#> village y_hat
#> <chr> <dbl>
#> 1 a 450
#> 2 b 700
However, do is being deprecated in favor of purrr::map, which I am still having trouble getting the hang of. In this case, you can group and nest, which gives a column of data frames called data, then map over that column and again supply z = z_val. When you unnest the y_hat column, you still have the original data as a nested column, since you wanted access to the rest of the columns still.
df %>%
group_by(village) %>%
nest() %>%
mutate(y_hat = map(data, ~Y_hat_village2(., z = z_val))) %>%
unnest(y_hat)
#> # A tibble: 2 x 3
#> village data y_hat
#> <chr> <list> <dbl>
#> 1 a <tibble [3 × 3]> 450
#> 2 b <tibble [2 × 3]> 700
Just to check that everything works okay, I also passed in z = 0 to check for 1. scoping issues, and 2. that other values of z work.
df %>%
group_by(village) %>%
nest() %>%
mutate(y_hat = map(data, ~Y_hat_village2(., z = 0))) %>%
unnest(y_hat)
#> # A tibble: 2 x 3
#> village data y_hat
#> <chr> <list> <dbl>
#> 1 a <tibble [3 × 3]> 800
#> 2 b <tibble [2 × 3]> 300
As an extension/modification to #patL's answer, you can also wrap the tidyverse solution within purrr:map to return a list of two tibbles, one for each z value:
z <- c(0, 1);
map(z, ~df %>% filter(Z == .x) %>% group_by(village) %>% summarise(Y.mean = mean(Y)))
#[[1]]
## A tibble: 2 x 2
# village Y.mean
# <fct> <dbl>
#1 a 800.
#2 b 300.
#
#[[2]]
## A tibble: 2 x 2
# village Y.mean
# <fct> <dbl>
#1 a 450.
#2 b 700.
Sample data
df <- read.table(text =
" village A Z Y
1 a 1 1 500
2 a 1 1 400
3 a 1 0 800
4 b 1 0 300
5 b 1 1 700 ", header = T)
You can use dplyr to accomplish it:
library(dplyr)
df %>%
group_by(village) %>%
filter(Z == 1) %>%
summarise(Y_village = mean(Y))
## A tibble: 2 x 2
# village Y_village
# <chr> <dbl>
#1 a 450
#2 b 700
To get all columns:
df %>%
group_by(village) %>%
filter(Z == 1) %>%
mutate(Y_village = mean(Y)) %>%
distinct(village, A, Z, Y_village)
## A tibble: 2 x 4
## Groups: village [2]
# village A Z Y_village
# <chr> <dbl> <dbl> <dbl>
#1 a 1 1 450
#2 b 1 1 700
data
df <- data_frame(village = c("a", "a", "a", "b", "b"),
A = rep(1, 5),
Z = c(1, 1, 0, 0, 1),
Y = c(500, 400, 800, 30, 700))

Resources