Provide tibble names in purrr - r

I would like to know if it is possible to provide column names in the as_tibble function. I know that I could use the rename function to change column names, but I would like to save the number of lines I write. Lets say I want my column names to be a1, a2, a3.
> library(purrr)
> library(tidyverse)
> 1:3 %>%
+ map(~ rnorm(104, .x)) %>%
+ map_dfc(~as_tibble(.x))
# A tibble: 104 x 3
value value1 value2
<dbl> <dbl> <dbl>
1 2.91139409 1.44646163 1.298360
2 0.87725704 4.05341889 3.892296
3 0.73230088 2.72506579 3.520865
4 1.02862344 2.09576397 4.009980
5 0.49159059 -1.23746772 3.172201
6 0.24665840 1.80876495 2.927716
7 0.75112051 2.22486452 2.896452
8 -0.06036349 3.63503054 3.218324
9 1.84431314 1.88562406 2.398761
10 0.70866474 0.08947359 3.954770
# ... with 94 more rows

We can put as_tibble with map_dfc, and then use setNames(paste0("a", seq_len(ncol(.)))) to change column name based on the number of columns.
library(tidyverse)
set.seed(123)
1:3 %>%
map_dfc(~as_tibble(rnorm(104, .x))) %>%
setNames(paste0("a", seq_len(ncol(.))))
# A tibble: 104 x 3
a1 a2 a3
<dbl> <dbl> <dbl>
1 0.440 1.05 4.65
2 0.770 1.95 2.95
3 2.56 1.22 3.12
4 1.07 0.332 3.24
5 1.13 1.62 4.23
6 2.72 2.92 2.48
7 1.46 1.42 2.01
8 -0.265 2.61 4.68
9 0.313 0.382 2.56
10 0.554 1.94 2.28
# ... with 94 more rows

Related

avoid repeated unquoting in dplyr non standard evaluation

Suppose we have the following data:
tib <- tibble::tibble(x = 1:10)
Then, suppose we want to make a function that takes a column as input and returns a tibble with several added columns such as:
library(dplyr)
generate_transformations <- function(data, column){
transform <- sym(column)
data %>%
mutate(
sqrt = sqrt(!!transform),
recip = 1 / !!transform,
log = log(!!transform)
)
}
# Usage is great:
tib %>%
generate_transformations('x')
# A tibble: 10 x 4
x sqrt recip log
<int> <dbl> <dbl> <dbl>
1 1 1 1 0
2 2 1.41 0.5 0.693
3 3 1.73 0.333 1.10
4 4 2 0.25 1.39
5 5 2.24 0.2 1.61
6 6 2.45 0.167 1.79
7 7 2.65 0.143 1.95
8 8 2.83 0.125 2.08
9 9 3 0.111 2.20
10 10 3.16 0.1 2.30
Now my question is, is there a way to avoid unquoting (!!) transform repeatedly?
Yes, I could, e.g., temporarily rename column and then rename it back after I am done, but that is not my interest in this question.
I am interested if there is a way to produce a variable that does not need the !!.
While it does not work, I was looking for something like:
generate_transformations <- function(data, column){
transform <- !!sym(column) # cannot unquote here :(
data %>%
mutate(
sqrt = sqrt(transform),
recip = 1 / transform,
log = log(transform)
)
}
Convert to string and subset from the data and use transform
generate_transformations <- function(data, column){
transform <- data[[rlang::as_string(ensym(column))]]
data %>%
mutate(
sqrt = sqrt(transform),
recip = 1 / transform,
log = log(transform)
)
}
-testing
tib %>%
generate_transformations('x')
# A tibble: 10 × 4
x sqrt recip log
<int> <dbl> <dbl> <dbl>
1 1 1 1 0
2 2 1.41 0.5 0.693
3 3 1.73 0.333 1.10
4 4 2 0.25 1.39
5 5 2.24 0.2 1.61
6 6 2.45 0.167 1.79
7 7 2.65 0.143 1.95
8 8 2.83 0.125 2.08
9 9 3 0.111 2.20
10 10 3.16 0.1 2.30
Or create a temporary column and remove it later
generate_transformations <- function(data, column){
data %>%
mutate(transform = !! rlang::ensym(column),
sqrt = sqrt(transform),
recip = 1 / transform,
log = log(transform),
transform = NULL
)
}
-testing
tib %>%
generate_transformations('x')
# A tibble: 10 × 4
x sqrt recip log
<int> <dbl> <dbl> <dbl>
1 1 1 1 0
2 2 1.41 0.5 0.693
3 3 1.73 0.333 1.10
4 4 2 0.25 1.39
5 5 2.24 0.2 1.61
6 6 2.45 0.167 1.79
7 7 2.65 0.143 1.95
8 8 2.83 0.125 2.08
9 9 3 0.111 2.20
10 10 3.16 0.1 2.30
You can do it in one, if you swap !! for {{}} and use across:
data_transformations <- function(d, col, funs=list(sqrt=sqrt, log=log, recip=~1/.)) {
d %>% mutate(across({{col}}, .fns=funs))
}
d %>% data_transformations(x)
# A tibble: 10 × 4
x x_sqrt x_log x_recip
<int> <dbl> <dbl> <dbl>
1 1 1 0 1
2 2 1.41 0.693 0.5
3 3 1.73 1.10 0.333
4 4 2 1.39 0.25
5 5 2.24 1.61 0.2
6 6 2.45 1.79 0.167
7 7 2.65 1.95 0.143
8 8 2.83 2.08 0.125
9 9 3 2.20 0.111
10 10 3.16 2.30 0.1
To restore your original column names, use
data_transformations <- function(d, col, funs=list(sqrt=sqrt, log=log, recip=~1/.)) {
d %>% mutate(across({{col}}, .fns=funs, .names="{.fn}"))
}
d %>% data_transformations(x)
# A tibble: 10 × 4
x sqrt log recip
<int> <dbl> <dbl> <dbl>
1 1 1 0 1
2 2 1.41 0.693 0.5
3 3 1.73 1.10 0.333
4 4 2 1.39 0.25
5 5 2.24 1.61 0.2
6 6 2.45 1.79 0.167
7 7 2.65 1.95 0.143
8 8 2.83 2.08 0.125
9 9 3 2.20 0.111
10 10 3.16 2.30 0.1
To handle multiple columns:
data_transformations <- function(d, cols, funs=list(sqrt=sqrt, log=log, recip=~1/.)) {
d %>% mutate(across({{cols}}, .fns=funs))
}
d1 <- tibble(x=1:10, y=seq(2, 20, 2))
d1 %>% data_transformations(c(x, y), list(sqrt=sqrt, log=log))
A tibble: 10 × 6
x y x_sqrt x_log y_sqrt y_log
<int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2 1 0 1.41 0.693
2 2 4 1.41 0.693 2 1.39
3 3 6 1.73 1.10 2.45 1.79
4 4 8 2 1.39 2.83 2.08
5 5 10 2.24 1.61 3.16 2.30
6 6 12 2.45 1.79 3.46 2.48
7 7 14 2.65 1.95 3.74 2.64
8 8 16 2.83 2.08 4 2.77
9 9 18 3 2.20 4.24 2.89
10 10 20 3.16 2.30 4.47 3.00

r function similar to pmin() but finds nth lowest value across columns in dataframe?

I would like a function that would find the nth lowest value across columns. In other words, a function that is similar to pmin() but rather than finding the lowest, I am hoping it returns the nth lowest. Thank you in advance!
df %>%
rowid_to_column() %>%
pivot_longer(-rowid)%>%
arrange(value)%>% #You could arrange with decreasing to find max
group_by(rowid) %>%
summarise(value = nth(value, 2)) # Find the second minimum
# A tibble: 10 x 2
rowid value
<int> <dbl>
1 1 -0.560
2 2 -0.218
3 3 0.401
4 4 0.0705
5 5 -0.556
6 6 1.72
7 7 0.498
8 8 -1.27
9 9 -0.687
10 10 -0.446
Here is a simple one (it could be modified to deal with NAs):
nth_lowest <- function(x,n) x[order(x)[n]]
Apply it to a data frame, using rowwise() and c_across() from the dplyr package.
df %>%
rowwise() %>%
mutate( second_lowest = f(c_across(x:z),2))
Output:
x y z second_lowest
<dbl> <dbl> <dbl> <dbl>
1 -0.560 1.22 -1.07 -0.560
2 -0.230 0.360 -0.218 -0.218
3 1.56 0.401 -1.03 0.401
4 0.0705 0.111 -0.729 0.0705
5 0.129 -0.556 -0.625 -0.556
6 1.72 1.79 -1.69 1.72
7 0.461 0.498 0.838 0.498
8 -1.27 -1.97 0.153 -1.27
9 -0.687 0.701 -1.14 -0.687
10 -0.446 -0.473 1.25 -0.446
Input:
set.seed(123)
df <- data.frame(x=rnorm(10), y=rnorm(10), z=rnorm(10))
We may also do this with pmap and nth
library(purrr)
library(dplyr)
pmap_dbl(df, ~ nth(sort(c(...)), n = 2))

pivot_longer on a mix of matrix columns and regular vector columns

I have a tibble where some columns are matrices. Here's a toy example:
library(dplyr)
library(tidyr)
dat <- structure(list(id = 0:5, matrix_column = structure(c(-1.34333431222985,
-1.54123232044003, -1.7260282725816, -1.8924463753132, -2.0376516335872,
-2.16069643164938, -0.250406602741403, -0.287716094522968, -0.32269823315914,
-0.354360193430544, -0.382155662949252, -0.405883260458378, 1.53709630050992,
1.76715755374983, 1.98313378488307, 2.17881959842109, 2.35072520728221,
2.4974704619887), .Dim = c(6L, 3L)), vector_column = c(10.453112322311,
10.3019556236512, 10.1273409693709, 9.91474471968391, 9.65093549479026,
9.32601906868098)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
The tibble looks like this.
> dat
# A tibble: 6 x 3
id matrix_column[,1] [,2] [,3] vector_column
<int> <dbl> <dbl> <dbl> <dbl>
1 0 -1.34 -0.250 1.54 10.5
2 1 -1.54 -0.288 1.77 10.3
3 2 -1.73 -0.323 1.98 10.1
4 3 -1.89 -0.354 2.18 9.91
5 4 -2.04 -0.382 2.35 9.65
6 5 -2.16 -0.406 2.50 9.33
If I apply pivot_longer from tidyr to the non-id columns, the values in vector_column get replicated to fill the two additional columns required to accommodate matrix_column.
dat %>%
pivot_longer(cols = -id, values_to = "new_column")
# A tibble: 12 x 3
id name new_column[,1] [,2] [,3]
<int> <chr> <dbl> <dbl> <dbl>
1 0 matrix_column -1.34 -0.250 1.54
2 0 vector_column 10.5 10.5 10.5
3 1 matrix_column -1.54 -0.288 1.77
4 1 vector_column 10.3 10.3 10.3
5 2 matrix_column -1.73 -0.323 1.98
6 2 vector_column 10.1 10.1 10.1
7 3 matrix_column -1.89 -0.354 2.18
8 3 vector_column 9.91 9.91 9.91
9 4 matrix_column -2.04 -0.382 2.35
10 4 vector_column 9.65 9.65 9.65
11 5 matrix_column -2.16 -0.406 2.50
12 5 vector_column 9.33 9.33 9.33
Is there a way to have the [,2] and the [,3] columns of new_column to be NA (instead of the same value of [,1]) when name equals vector_column?
Something like
# A tibble: 12 x 3
id name new_column[,1] [,2] [,3]
<int> <chr> <dbl> <dbl> <dbl>
1 0 matrix_column -1.34 -0.250 1.54
2 0 vector_column 10.5 NA NA
3 1 matrix_column -1.54 -0.288 1.77
4 1 vector_column 10.3 NA NA
My real life data have dozens of matrix columns and vector columns.
If you continue with the format of data that you currently have (having dataframe and matrix together) you'll keep on running into trouble to work with it. I would suggest to convert the matrix into dataframe and add them as their separate columns.
library(dplyr)
library(tidyr)
dat$matrix_column %>%
data.frame() %>%
bind_cols(dat %>% select(-matrix_column)) %>%
pivot_longer(cols = -id, values_to = "new_column")
# id name new_column
# <int> <chr> <dbl>
# 1 0 X1 -1.34
# 2 0 X2 -0.250
# 3 0 X3 1.54
# 4 0 vector_column 10.5
# 5 1 X1 -1.54
# 6 1 X2 -0.288
# 7 1 X3 1.77
# 8 1 vector_column 10.3
# 9 2 X1 -1.73
#10 2 X2 -0.323
# … with 14 more rows

Using pivot_longer in tidyr with a complex separator [duplicate]

This question already has an answer here:
How to use Pivot_longer to reshape from wide-type data to long-type data with multiple variables
(1 answer)
Closed 2 years ago.
In a previous post here I tried to get the equivalent of an rbind using tidyr::pivotlonger(). This is the data and the solution.
set.seed(1)
df1 <- data.frame(group = rep(letters[1:2],each=3),
day = rep(1:3,2),
var1_mean = round(rnorm(6),2),
var1_sd = round(rnorm(6,5),2),
var2_mean = round(rnorm(6),2),
var2_sd = round(rnorm(6,5),2))
# group day var1_mean var1_sd var2_mean var2_sd
# 1 a 1 -0.63 5.49 -0.62 5.82
# 2 a 2 0.18 5.74 -2.21 5.59
# 3 a 3 -0.84 5.58 1.12 5.92
# 4 b 1 1.60 4.69 -0.04 5.78
# 5 b 2 0.33 6.51 -0.02 5.07
# 6 b 3 -0.82 5.39 0.94 3.01
df1 %>%
pivot_longer(cols = starts_with('var'),
names_to = c('grp', '.value'),
names_sep="_")
# group day grp mean sd
# <fct> <int> <chr> <dbl> <dbl>
# 1 a 1 var1 -0.63 5.49
# 2 a 1 var2 -0.62 5.82
# 3 a 2 var1 0.18 5.74
# 4 a 2 var2 -2.21 5.59
# 5 a 3 var1 -0.84 5.58
# 6 a 3 var2 1.12 5.92
# 7 b 1 var1 1.6 4.69
# 8 b 1 var2 -0.04 5.78
# 9 b 2 var1 0.33 6.51
# 10 b 2 var2 -0.02 5.07
# 11 b 3 var1 -0.82 5.39
# 12 b 3 var2 0.94 3.01
This solution is quite contingent on the naming convention used for the mean and sd variables. If there is a different naming convention, with a more complex separator between the two important nodes of the column names, like so...
df2 <- data.frame(group = rep(letters[1:2],each=3),
day = rep(1:3,2),
mean_var_1 = round(rnorm(6),2),
sd_var_1 = round(rnorm(6,5),2),
mean_var_2 = round(rnorm(6),2),
sd_var_2 = round(rnorm(6,5),2))
df2
# group day mean_var_1 sd_var_1 mean_var_2 sd_var_2
# 1 a 1 0.62 6.36 -0.39 5.70
# 2 a 2 -0.06 4.90 -0.06 5.56
# 3 a 3 -0.16 5.39 1.10 4.31
# 4 b 1 -1.47 4.95 0.76 4.29
# 5 b 2 -0.48 3.62 -0.16 5.36
# 6 b 3 0.42 4.59 -0.25 5.77
How would I achieve a similar result to the first example, with a single mean and sd column and with var_1 and var_2 as the grouping variable?
If you have names that are complicated you can use names_pattern argument where you can specify how each part of column name would be used to get data in long format.
tidyr::pivot_longer(df2,
cols = contains('var'),
names_to = c('.value', 'grp'),
names_pattern = '(.*?)_(.*)')
# group day grp mean sd
# <chr> <int> <chr> <dbl> <dbl>
# 1 a 1 var_1 0.62 6.36
# 2 a 1 var_2 -0.39 5.7
# 3 a 2 var_1 -0.06 4.9
# 4 a 2 var_2 -0.06 5.56
# 5 a 3 var_1 -0.16 5.39
# 6 a 3 var_2 1.1 4.31
# 7 b 1 var_1 -1.47 4.95
# 8 b 1 var_2 0.76 4.29
# 9 b 2 var_1 -0.48 3.62
#10 b 2 var_2 -0.16 5.36
#11 b 3 var_1 0.42 4.59
#12 b 3 var_2 -0.25 5.77
'(.*?)_(.*)' uses two groups of data where the first group is everything until the first underscore ((.*?)) in the column name and the second group is everything after the underscore following the first group ((.*)).

change several column names() in data.frame() with str_replace_all()

I read this this question and practiced matching patterns, but I am still not figuring it.
I have a panel with the same measure, several times per year. Now, I want to rename them in a logical way. My raw data looks a bit like this,
set.seed(667)
dta <- data.frame(id = 1:6,
R1213 = runif(6),
R1224 = runif(6, 1, 2),
R1255 = runif(6, 2, 3),
R1235 = runif(6, 3, 4))
# install.packages(c("tidyverse"), dependencies = TRUE)
require(tidyverse)
(tbl <- dta %>% as_tibble())
#> # A tibble: 6 x 5
#> id R1213 R1224 R1255 R1235
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 0.488 1.60 2.07 3.07
#> 2 2 0.692 1.42 2.76 3.19
#> 3 3 0.262 1.34 2.33 3.82
#> 4 4 0.330 1.77 2.61 3.93
#> 5 5 0.582 1.92 2.15 3.86
#> 6 6 0.930 1.88 2.56 3.59
Now, I use str_replace_all() to rename them, here with only one variable in where I use pate, and everything is fine (it might also be possible to optimize this in other ways, if so please feel to let me know),
names(tbl) <- tbl %>% names() %>%
str_replace_all('^R1.[125].$', 'A') %>%
str_replace_all('^R1.[3].$', paste0('A.2018.', 1))
tbl
#> # A tibble: 6 x 5
#> id A A A A.2018.1
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 0.488 1.60 2.07 3.07
#> 2 2 0.692 1.42 2.76 3.19
#> 3 3 0.262 1.34 2.33 3.82
#> 4 4 0.330 1.77 2.61 3.93
#> 5 5 0.582 1.92 2.15 3.86
#> 6 6 0.930 1.88 2.56 3.59
Eveything call A is actually from the same year, let's say 2017, but with the suffix .1, .2, etc. need to appended. I start over and again use paste0('A.2017.', 1:3), but this time with three suffices,
tbl <- dta %>% as_tibble()
names(tbl) <- tbl %>% names() %>%
str_replace_all('^R1.[125].$', paste0('A.2017.', 1:3)) %>%
str_replace_all('^R1.[7].$', paste0('A.2018.', 1))
tbl
#> Warning message:
#> In stri_replace_all_regex(string, pattern, fix_replacement(replacement), :
#> longer object length is not a multiple of shorter object length
#> > tbl
#> # A tibble: 6 x 5
#> id A.2017.2 A.2017.3 A.2017.1 R1235
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 0.488 1.60 2.07 3.07
#> 2 2 0.692 1.42 2.76 3.19
#> 3 3 0.262 1.34 2.33 3.82
#> 4 4 0.330 1.77 2.61 3.93
#> 5 5 0.582 1.92 2.15 3.86
#> 6 6 0.930 1.88 2.56 3.59
this does come out, but the order is reversed and I am told longer object length is not a multiple of shorter object length, but isen't 3 the right length? I am looking to do this in a cleaner and simpler way. Also, I don't really like names(tbl) <-, if that can be done in a more elegant way.
Building on David's suggestion - how about something like the following using dplyr::rename_at?
library(dplyr)
## Get data
set.seed(667)
dta <- data.frame(id = 1:6,
R1213 = runif(6),
R1224 = runif(6, 1, 2),
R1255 = runif(6, 2, 3),
R1235 = runif(6, 3, 4)) %>%
as_tibble()
## Rename
dta <- dta %>%
rename_at(.vars = grep('^R1.[125].$', names(.)),
.funs = ~paste0("A.2017.", 1:length(.)))
dta
#> # A tibble: 6 x 5
#> id A.2017.1 A.2017.2 A.2017.3 R1235
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 0.196 1.74 2.51 3.49
#> 2 2 0.478 1.85 2.06 3.69
#> 3 3 0.780 1.32 2.21 3.26
#> 4 4 0.705 1.49 2.49 3.33
#> 5 5 0.942 1.59 2.66 3.58
#> 6 6 0.906 1.90 2.87 3.93
Vectorised solution for multiple patterns
For a complete solution that can be used for multiple patterns and replacements, we can make use of purr::map2_dfc as follows.
library(dplyr)
library(purrr)
## Get data
set.seed(667)
dta <- data.frame(id = 1:6,
R1213 = runif(6),
R1224 = runif(6, 1, 2),
R1255 = runif(6, 2, 3),
R1235 = runif(6, 3, 4)) %>%
as_tibble()
## Define a function to keep a hold out data set, then rename iteratively for each pattern and replacement.
rename_multiple_years <- function(df, patterns,
replacements,
hold_out_var = "id") {
hold_out_df <- df %>%
select_at(.vars = hold_out_var)
rename_df <- map2_dfc(patterns, replacements, function(pattern, replacement) {
df %>%
rename_at(.vars = grep(pattern, names(.)),
.funs = ~paste0(replacement, 1:length(.))) %>%
select_at(.vars = grep(replacement, names(.)))
})
final_df <- bind_cols(hold_out_df, rename_df)
return(final_df)
}
## Call function on specified patterns and replacements
renamed_dta <- dta %>%
rename_multiple_years(patterns = c("^R1.[125].$", "^R1.[3].$"),
replacements = c("A.2017.", "A.2018."))
renamed_dta
#> # A tibble: 6 x 5
#> id A.2017.1 A.2017.2 A.2017.3 A.2018.1
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 0.196 1.74 2.51 3.49
#> 2 2 0.478 1.85 2.06 3.69
#> 3 3 0.780 1.32 2.21 3.26
#> 4 4 0.705 1.49 2.49 3.33
#> 5 5 0.942 1.59 2.66 3.58
#> 6 6 0.906 1.90 2.87 3.93
Towards tidy data
Now that the variables have been renamed you might find it useful to have your data in a tidy format. The following using tidyr::gather might be useful.
library(tidyr)
library(dplyr)
#Use tidy dataframe gather all variables, split by "." and drop A column (or keep if a measurement id)
renamed_dta %>%
gather(key = "measure", value = "value", -id) %>%
separate(measure, c("A", "year", "measure"), "[[.]]") %>%
select(-A)
#> # A tibble: 24 x 4
#> id year measure value
#> <int> <chr> <chr> <dbl>
#> 1 1 2017 1 0.196
#> 2 2 2017 1 0.478
#> 3 3 2017 1 0.780
#> 4 4 2017 1 0.705
#> 5 5 2017 1 0.942
#> 6 6 2017 1 0.906
#> 7 1 2017 2 1.74
#> 8 2 2017 2 1.85
#> 9 3 2017 2 1.32
#> 10 4 2017 2 1.49
#> # ... with 14 more rows

Resources