Function for dplyr with argument that defaults to "."

Let's say I want to sum over all columns in a tibble to create a new column called "total". I could do:
library(tibble)
library(dplyr)
set.seed(42)
N <- 10
Df <- tibble(p_1 = rnorm(N),
             p_2 = rnorm(N),
             q_1 = rnorm(N),
             q_2 = rnorm(N))
# Works fine
Df %>% mutate(total = apply(., 1, sum))
I could make a helper function like so,
myfun <- function(Df) {
  apply(Df, 1, sum)
}
# Works fine
Df %>% mutate(total = myfun(.))
But let's say this myfun was usually going to be used in this way, i.e. within a dplyr verb. In that case the "." referencing the data frame is a bit superfluous, and it would be nice if myfun could fall back to a default value for it. I'd like something like this:
myfun2 <- function(Df = .) {
  apply(Df, 1, sum)
}
which does not work.
Df %>% mutate(total = myfun2())
Error in mutate_impl(.data, dots) :
Evaluation error: object '.' not found.
Because I am not even sure how the "." works, I don't think I can formulate the question better, but basically I want to know: is there a way of saying, in effect, "if Df is not supplied to myfun2, use the data frame that is normally referenced by ."?

One option would be to have the function return a quoted expression and then evaluate it with !!
library(tidyverse)
myfun <- function() {
  quote(reduce(., `+`))
}
r1 <- Df %>%
  mutate(total = !! myfun())
r1
# A tibble: 10 x 5
# p_1 p_2 q_1 q_2 total
# <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1.37 1.30 -0.307 0.455 2.82
# 2 -0.565 2.29 -1.78 0.705 0.645
# 3 0.363 -1.39 -0.172 1.04 -0.163
# 4 0.633 -0.279 1.21 -0.609 0.960
# 5 0.404 -0.133 1.90 0.505 2.67
# 6 -0.106 0.636 -0.430 -1.72 -1.62
# 7 1.51 -0.284 -0.257 -0.784 0.186
# 8 -0.0947 -2.66 -1.76 -0.851 -5.37
# 9 2.02 -2.44 0.460 -2.41 -2.38
#10 -0.0627 1.32 -0.640 0.0361 0.654
Note that reduce was used to be more in line with the tidyverse, but the OP's function can also be quoted to get the same result:
myfun2 <- function() {
  quote(apply(., 1, sum))
}
r2 <- Df %>%
  mutate(total = !! myfun2())
all.equal(r2$total, r1$total)
#[1] TRUE
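Another possibility is a sketch that assumes a recent dplyr (>= 1.0.0, where cur_data() exists; it is superseded by pick() in 1.1.0 but still works): make the default argument ask dplyr for the data of the current mutate()/summarise() context, so nothing needs to be passed at the call site.
myfun3 <- function(d = dplyr::cur_data()) {
  # cur_data() returns the data frame of the surrounding dplyr verb
  # (per group, excluding grouping columns), so "." is not needed here
  rowSums(d)
}
Df %>% mutate(total = myfun3())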

Related

Merge lists with a dataframe

I have a list like out below. Each element is a data frame with the same structure, i.e. the same dimensions and variable names (id/period/pred_dif):
id <- c(1,2,3,4,5,1,2,3,4,5)
period <- c(01,09,12,01,08, 02,08,12,11,12)
pred_dif <- c(0.5,0.1,0.15,0.23,0.75,0.6,0.49,0.81,0.37,0.14)
list_1 <- data.frame(id, period, pred_dif)
pred_dif <- c(0.45,0.18,0.35,0.63,0.25,0.63,0.29,0.11,0.17,0.24)
list_2 <- data.frame(id, period, pred_dif)
pred_dif <- c(0.58,0.13,0.55,0.13,0.76,0.3,0.29,0.81,0.27,0.04)
list_3 <- data.frame(id, period, pred_dif)
pred_dif <- c(0.3,0.61,0.18,0.29,0.85,0.76,0.56,0.91,0.48,0.91)
list_4 <- data.frame(id, period, pred_dif)
out <- list(list_1, list_2, list_3, list_4)
I want to:
1. Merge the list out with a data frame df of the same structure:
pred_second <- c(0.4,0.71,0.28,0.39,0.95,0.86,0.66,0.81,0.58,0.81)
df <- data.frame(id, period, pred_second)
I would proceed (in a dplyr environment) as follows:
out <- merge(out, df, by = c("id", "period"), all.x = T)
2. Create a list containing an OLS (lm) regression capturing the effect of the variable "period" on "pred_dif". For a single data frame it would be something like:
ols <- summary(lm(formula = pred_dif ~ as.factor(period) - 1, data = out))
3. Create a list or data frame (preferred) registering the estimates and standard errors of the regressions from point 2 (it is fine if points 2 and 3 happen together).
Any idea on how to solve this in an iterative and fast way for all lists?
We could do this in the tidyverse:
Loop over the list with map
Do the left_join
Build the lm model in summarise
Convert the output to a tidy dataset with broom::tidy
unnest the list of tibbles and store the result in a list (or use the _dfr variant of map to return a single data frame with .id specified as an identifier; a sketch of that variant follows the output below)
library(purrr)
library(dplyr)
library(broom)
library(tidyr)
lmout_lst <- map(out,
                 ~ left_join(.x, df, by = c('id', 'period')) %>%
                     summarise(new = list(lm(pred_dif ~ as.factor(period) - 1) %>%
                                            broom::tidy(.))) %>%
                     unnest(new))
It can be converted to a single dataset with bind_rows as well
lmout <- bind_rows(lmout_lst, .id = 'categ')
Output:
lmout
# A tibble: 24 x 6
categ term estimate std.error statistic p.value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 1 as.factor(period)1 0.365 0.223 1.63 0.153
2 1 as.factor(period)2 0.6 0.316 1.90 0.106
3 1 as.factor(period)8 0.62 0.223 2.78 0.0321
4 1 as.factor(period)9 0.1 0.316 0.317 0.762
5 1 as.factor(period)11 0.37 0.316 1.17 0.286
6 1 as.factor(period)12 0.412 0.141 2.92 0.0267
7 2 as.factor(period)1 0.54 0.0789 6.85 0.000478
8 2 as.factor(period)2 0.63 0.112 5.65 0.00132
9 2 as.factor(period)8 0.27 0.0789 3.42 0.0141
10 2 as.factor(period)9 0.18 0.112 1.61 0.158
# … with 14 more rows
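Here is a sketch of the map_dfr() variant mentioned above (it still works, although purrr 1.0.0 prefers map() followed by list_rbind()). It assumes the same out and df objects and collects everything into one tibble directly:
library(purrr)
library(dplyr)
library(broom)
lmout2 <- map_dfr(out, function(x) {
  # join the lookup data, fit the per-period model, and tidy the result
  joined <- left_join(x, df, by = c("id", "period"))
  tidy(lm(pred_dif ~ as.factor(period) - 1, data = joined))
}, .id = "categ")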

How to pass tibble of variable names and function calls to tibble

I'm trying to go from a tibble of variable names and functions like this:
N <- 100
dat <-
  tibble(
    variable_name = c("a", "b"),
    variable_value = c("rnorm(N)", "rnorm(N)")
  )
to a tibble with two variables a and b of length N
dat2 <-
  tibble(
    a = rnorm(N),
    b = rnorm(N)
  )
is there a !!! or rlang-y way to accomplish this?
We can evaluate the strings:
library(dplyr)
library(purrr)
library(tibble)
deframe(dat) %>%
  map_dfc(~ eval(rlang::parse_expr(.x)))
Output:
# A tibble: 100 x 2
a b
<dbl> <dbl>
1 0.0750 2.55
2 -1.65 -1.48
3 1.77 -0.627
4 0.766 -0.0411
5 0.832 0.200
6 -1.91 -0.533
7 -0.0208 -0.266
8 -0.409 1.08
9 -1.38 -0.181
10 0.727 0.252
# … with 90 more rows
Here is a base R way, finished off with a pipe and an as_tibble call.
Map(function(x) eval(str2lang(x)), setNames(dat$variable_value, dat$variable_name)) %>%
  as_tibble()
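For the !!!/rlang route the question asks about, one sketch (assuming dat and N from the question live in the global environment) is to parse each string into a quosure and splice the named list straight into tibble():
library(rlang)
library(purrr)
library(tibble)
# one quosure per string, named after the target column
exprs <- set_names(
  map(dat$variable_value, parse_quo, env = globalenv()),
  dat$variable_name
)
dat2 <- tibble(!!!exprs)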

Using group_modify with selected columns (retaining whole data frame and order)

I have run out of R power on this one. I appreciate any help, it is probably quite simple for someone with more experience.
I have a data frame (tibble) with some numerical columns, a group column, and some other columns with other information. I want to do operations on the numerical columns, by group, but still retain all the columns.
I've put an example below: I am replacing the NAs with the group mean, for each column. The columns to replace the NAs are specified by the df_names variable.
It basically works, except that it removes all columns other than the numerical ones AND reorders everything, which makes it hard to reassemble. I could work around this, but I have a feeling there must be a simpler way to point group_modify at specified columns while retaining the other columns and keeping the order.
Can anyone help? Thanks so much in advance!
Will
library("tidyverse")
# create tibble
df <- tibble(
  name = letters[1:10],
  csize = c("L", "S", "S", "L", "L", "S", "L", "S", "L", "S"),
  v1 = rnorm(10),
  v2 = rnorm(10),
  v3 = rnorm(10)
)
# introduce some missing data
df$v1[3] <- NA
df$v1[6] <- NA
df$v1[7] <- NA
df$v3[2] <- NA
# these are the cols where I want to replace the NAs
df_names <- c("v1","v2","v3")
# this is the grouping variable (has to be stored as a string, since it is an input to the function)
groupvar <- "csize"
# now I want to replace the NAs with column means, restricted to their group
# the following line works, but the problem is that it removes the name column, and reorders the rows...
df_imp <- df %>%
  group_by(.dots = groupvar) %>%
  select(df_names) %>%
  group_modify(~ { replace_na(.x, as.list(colMeans(.x, na.rm = TRUE))) })
group_modify is overkill in this case; mutate(across()) is your friend here:
df %>%
  group_by(.dots = groupvar) %>%
  mutate(across(all_of(df_names), ~ if_else(is.na(.x), mean(.x, na.rm = TRUE), .x)))
Result:
# A tibble: 10 x 5
# Groups: csize [2]
name csize v1 v2 v3
<chr> <chr> <dbl> <dbl> <dbl>
1 a L -1.22 1.48 -0.628
2 b S -1.17 0.0890 -0.130
3 c S -0.422 -0.0956 -0.0271
4 d L -0.265 0.180 -0.786
5 e L -0.491 0.509 -0.359
6 f S -0.422 -0.712 0.232
7 g L -0.400 -1.13 1.13
8 h S -0.538 -0.0785 0.690
9 i L 0.373 0.308 0.252
10 j S 0.445 0.743 -1.41
Does this work:
> library(dplyr)
> library(tidyr)  # for replace_na()
> df %>% group_by(csize) %>% mutate(across(v1:v3, ~ replace_na(., mean(., na.rm = TRUE))))
# A tibble: 10 x 5
# Groups: csize [2]
name csize v1 v2 v3
<chr> <chr> <dbl> <dbl> <dbl>
1 a L 1.57 0.310 -1.76
2 b S -0.705 0.0655 0.577
3 c S -1.05 1.28 1.82
4 d L 0.958 -2.09 -0.371
5 e L -0.712 0.247 -1.13
6 f S -1.05 -0.516 -0.107
7 g L 0.403 1.79 0.128
8 h S -0.793 1.52 1.07
9 i L -0.206 -0.369 -1.77
10 j S -1.65 -0.992 -0.476
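Both answers above either lean on the soft-deprecated .dots argument of group_by() or hard-code the grouping column. A sketch that keeps everything string-based with across(all_of()) (assuming dplyr >= 1.0.0 and tidyr loaded for replace_na()) would be:
library(dplyr)
library(tidyr)
df_imp <- df %>%
  group_by(across(all_of(groupvar))) %>%   # group by the column named in groupvar
  mutate(across(all_of(df_names),          # only touch the selected columns
                ~ replace_na(.x, mean(.x, na.rm = TRUE)))) %>%
  ungroup()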

Vectorised column operations in dplyr

I am looking for a tidy way of incorporating vectorised operations on columns using dplyr.
Basically, having a simple df as follows:
library(dplyr)
df <- data.frame("X" = runif(1:10),
                 "Y" = runif(1:10),
                 "Z" = runif(1:10)) %>%
  tbl_df()
I am now looking to apply the following vectorised formula:
Formula <- "X / Y + lag(Z)"
Of course the following won't work as it is looking for a column 'X / Y + lag(Z)':
df %>% mutate(Result := !!sym(Formula))
Can anyone suggest a simple way of applying a formula from a vector directly in a pipe on columns to achieve:
df %>% mutate(Result = X/Y+lag(Z))
Is this what you're looking for?
set.seed(1)
df <- data.frame("X" = runif(1:10),
                 "Y" = runif(1:10),
                 "Z" = runif(1:10)) %>%
  tbl_df()
Formula <- "X / Y + lag(Z)"
df <- df %>% mutate(Result = eval(parse(text = Formula)))
X Y Z Result
<dbl> <dbl> <dbl> <dbl>
1 0.153 0.0158 0.527 NA
2 0.322 0.231 0.327 1.93
3 0.479 0.0958 0.365 5.33
4 0.764 0.537 0.105 1.79
5 0.180 0.223 0.0243 0.913
6 0.178 0.538 0.975 0.355
7 0.869 0.820 0.845 2.03
8 0.356 0.263 0.0628 2.20
9 0.0399 0.710 0.968 0.119
10 0.863 0.422 0.825 3.02
We parse the string into an unevaluated expression, then evaluate it.
With the tidyverse, parse_expr can be used:
library(dplyr)
df <- df %>%
  mutate(Calc_Col = !! rlang::parse_expr(Formula))
and if we need to pass the new column name as a variable as well, use := (as @Nick mentioned in the comments):
Name <- "Calc_Col"
df <- df %>%
  mutate(!!Name := !!rlang::parse_expr(Formula))
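If there are several such formula strings, a hedged extension of the same idea (the Formulas vector and the extra Ratio column below are made-up examples) is to parse them all with parse_exprs() and splice the named list into mutate(), so the names become the new column names:
Formulas <- c(Result = "X / Y + lag(Z)", Ratio = "X / Z")
df %>%
  mutate(!!!rlang::set_names(rlang::parse_exprs(Formulas), names(Formulas)))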

Order data frame by the last column with dplyr

library(dplyr)
df <- tibble(
  a = rnorm(10),
  b = rnorm(10),
  c = rnorm(10),
  d = rnorm(10)
)
df %>%
  arrange(colnames(df) %>% tail(1) %>% desc())
I am looping over a list of data frames. There are different columns in the data frames and the last column of each may have a different name.
I need to arrange every data frame by its last column. The simple case looks like the above code.
Using arrange_at and ncol:
df %>% arrange_at(ncol(.), desc)
As arrange_at will be deprecated in the future, you could also use:
# option 1
df %>% arrange(desc(.[ncol(.)]))
# option 2
df %>% arrange(across(ncol(.), desc))
If we need to arrange by the last column, either use the column name as a string:
df %>%
  arrange_at(vars(last(names(.))), desc)
Or specify the index
df %>%
  arrange_at(ncol(.), desc)
The new dplyr way (I guess from 1.0.0 on) would be using across(last_col()):
library(dplyr)
df <- tibble(
  a = rnorm(10),
  b = rnorm(10),
  c = rnorm(10),
  d = rnorm(10)
)
df %>%
  arrange(across(last_col(), desc))
#> # A tibble: 10 x 4
#> a b c d
#> <dbl> <dbl> <dbl> <dbl>
#> 1 -0.283 0.443 1.30 0.910
#> 2 0.797 -0.0819 -0.936 0.828
#> 3 0.0717 -0.858 -0.355 0.671
#> 4 -1.38 -1.08 -0.472 0.426
#> 5 1.52 1.43 -0.0593 0.249
#> 6 0.827 -1.28 1.86 0.0824
#> 7 -0.448 0.0558 -1.48 -0.143
#> 8 0.377 -0.601 0.238 -0.918
#> 9 0.770 1.93 1.23 -1.43
#> 10 0.0532 -0.0934 -1.14 -2.08
> packageVersion("dplyr")
#> [1] ‘1.0.4’
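One more sketch (assuming the same df): look the last column up by name through the .data pronoun, which sidesteps both the superseded arrange_at() and the across() detour.
df %>%
  arrange(desc(.data[[last(names(df))]]))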
