Mutate dataframes in a nested list without for loop - r

I have a list of dataframes (and parameters for sensitivity analyses for a study), and I want to mutate each dataframe in the same way. The expected output is generated by the code below (a new column x2). Is there a way to assign the resulting dataframes (newdfs) to the list without using a for loop?
models <- list(m1 = list('params' = list('start'='2014-01-01'),
'data' = data.frame(y=c(1,2,3), x1=c(4,5,6))),
m2 = list('params' = list('start'='2017-01-01'),
'data' = data.frame(y=c(1,2,3), x1=c(7,8,9))))
newdfs <- lapply(models, function(z) {z$data$x2 <- z$data$x1 + 1
z$data})
# Can I do this without "for"?
for(x in 1:length(models)) models[[x]]$data <- newdfs[[x]]

You can try this:
newdfs <- lapply(models, function(z) {z$data$x2 <- z$data$x1 + 1
return(z)})
$m1
$m1$params
$m1$params$start
[1] "2014-01-01"
$m1$data
y x1 x2
1 1 4 5
2 2 5 6
3 3 6 7
$m2
$m2$params
$m2$params$start
[1] "2017-01-01"
$m2$data
y x1 x2
1 1 7 8
2 2 8 9
3 3 9 10

Revise the function in lapply() to return z instead of z$data:
lapply(models, function(z) {z$data$x2 <- z$data$x1 + 1 ; z})
To make this question complete, here are two purrr solutions:
library(purrr)
map() + map_at()
map(models, map_at, "data", transform, x2 = x1 + 1)
transpose() + map()
models %>%
transpose %>%
`[[<-`(., "data", map(.$data, transform, x2 = x1 + 1)) %>%
transpose
Output
$m1
$m1$params
$m1$params$start
[1] "2014-01-01"
$m1$data
y x1 x2
1 1 4 5
2 2 5 6
3 3 6 7
$m2
$m2$params
$m2$params$start
[1] "2017-01-01"
$m2$data
y x1 x2
1 1 7 8
2 2 8 9
3 3 9 10

Related

Creating a list with column-wise partitions of a data.frame

I have a data.frame with a single "identifier" column and many additional columns. I am interested in turning this data.frame into a list of length K, whose elements are sets of columns partitioning the data.frame.
For example, given the below data.frame:
# Example data.frame
df <- data.frame(id = 1:10,
x1 = rnorm(10),
x2 = rnorm(10),
x3 = rnorm(10),
x4 = rnorm(10))
I'd like to have some function that converts it into this:
# Partitioning function
foo(df, partitions = 3)
# Expected output
list(data.frame(id = df$id, x1 = df[ ,2]),
data.frame(id = df$id, x2 = df[ ,3]),
data.frame(id = df$id, x3 = df[ ,4], x4 = df[ ,5]),
Bonus points if you can extend this so that you can specify how many non-id columns each element of the list should contain by passing a numeric vector. Imagine the same output with an input that looks like this or equivalent.
columns_per_element <- c(1,1,2)
foo(df, columns_per_element)
It is actually easier to define a function with the splitting sequence. The key functions here are repand split.default i.e.
f2 <- function(df, n, split){
i1 <- rep(seq(n), split)
res_list <- split.default(df[-1], i1)
return(lapply(res_list, function(i)cbind.data.frame(ID = df$id, i)))
}
f2(df, 3, c(1, 1, 2))
$`1`
ID x1
1 1 1.54960977
2 2 -1.59144017
3 3 0.02853548
4 4 -0.14231391
5 5 1.26989801
6 6 0.87495876
7 7 0.27373774
8 8 -0.75600983
9 9 0.32216493
10 10 -1.05113771
$`2`
ID x2
1 1 0.8529416
2 2 0.4555094
3 3 -0.3620756
4 4 1.4779813
5 5 -1.6484066
6 6 -0.5697431
7 7 -0.2139384
8 8 0.1619074
9 9 -0.5390306
10 10 -0.2228809
$`3`
ID x3 x4
1 1 -0.2579865 1.185526074
2 2 -0.0519554 -0.388179976
3 3 2.5350092 -0.675504829
4 4 -1.7051955 0.073448252
5 5 0.6207733 -0.637220508
6 6 0.3015831 -1.324024114
7 7 -0.5647717 0.969025962
8 8 0.1404714 -1.575383604
9 9 1.3049560 -1.846413101
10 10 -0.6716643 0.008675125
f2(df, 3, c(1, 2, 1))
$`1`
ID x1
1 1 1.54960977
2 2 -1.59144017
3 3 0.02853548
4 4 -0.14231391
5 5 1.26989801
6 6 0.87495876
7 7 0.27373774
8 8 -0.75600983
9 9 0.32216493
10 10 -1.05113771
$`2`
ID x2 x3
1 1 0.8529416 -0.2579865
2 2 0.4555094 -0.0519554
3 3 -0.3620756 2.5350092
4 4 1.4779813 -1.7051955
5 5 -1.6484066 0.6207733
6 6 -0.5697431 0.3015831
7 7 -0.2139384 -0.5647717
8 8 0.1619074 0.1404714
9 9 -0.5390306 1.3049560
10 10 -0.2228809 -0.6716643
$`3`
ID x4
1 1 1.185526074
2 2 -0.388179976
3 3 -0.675504829
4 4 0.073448252
5 5 -0.637220508
6 6 -1.324024114
7 7 0.969025962
8 8 -1.575383604
9 9 -1.846413101
10 10 0.008675125
Here is solution with two parameters in the function with a vectorized column select. note this assumes the first column is id and is called id. second if the sum of the vector is greater than ncol(df)-1 (this will be your input df) it will throw an error.
f2 <- function(x,y){
#keep id
id <- x[,"id" , drop = FALSE]
#keep all other variables
df2 <- x[,-1]
#get sequence for columns
y2 <- lapply(cumsum(y), function(x){sequence(x)})
#grab correct columns
y3 <- c(y2[1],mapply(dplyr::setdiff ,y2[2:length(y2)],y2[1:2]))
#recreate df
lapply(y3,
function(x){
cbind.data.frame(id, df2[,x, drop = FALSE])
})
}
f2(df, c(1,1,2))

Custom Function Mutate that mutates column

I'm having trouble with a function that I'm trying to write. I'd like to be able to change the column via the function so that I can do some mutations inside of a pipe. I'm afraid my R is a bit rusty, so I would appreciate some help.
# Create some fake data
dat <- data.frame(x = 1:4,
y = 5:8,
z = 9:12)
# Write the function
my_func <- function(data, col1, number){
x <- data %>%
# This is where I run into trouble
mutate(col4 = [[col1]] + number)
return(x)
}
# Try to call the function - gets error
df <- my_func(dat, x, 5)
I've tried all sorts of stuff with this, but nothing works or even gets close.
Use curly-curly operator ({{}}) if we are passing unquoted column name
my_func <- function(data, col1, number){
x <- data %>%
mutate(col4 = {{col1}} + number)
return(x)
}
-testing
> my_func(dat, x, 5)
x y z col4
1 1 5 9 6
2 2 6 10 7
3 3 7 11 8
4 4 8 12 9
For more flexibility i.e. using either quoted or unquoted, convert to symbol with ensym and evaluate (!!)
my_func <- function(data, col1, number){
x <- data %>%
mutate(col4 = !! rlang::ensym(col1) + number)
return(x)
}
-testing
> my_func(dat, "x", 5)
x y z col4
1 1 5 9 6
2 2 6 10 7
3 3 7 11 8
4 4 8 12 9
> my_func(dat, x, 5)
x y z col4
1 1 5 9 6
2 2 6 10 7
3 3 7 11 8
4 4 8 12 9

Selection of argument within a function based on the comparison of two vectors

Given is a dataframe with the vectors x1 and y1:
x1 <- c(1,1,2,2,3,4)
y1 <- c(0,0,1,1,2,2)
df1 <- data.frame(x1,y1)
Also, I have a dataframe with the different values from the vector y1 and a corresponding probability:
y <- c(0,1,2)
p <- c(0.1,0.6,0.9)
df2 <- data.frame(y,p)
The following function compares a given probability (p) with a random number (runif(1)). Based on the result of the comparison, the value of df$x1 changes and is stored in df$x2 (for each value of x1 a new random number has to be drawn):
example_function <- function(x,p){
if(runif(1) <= p) return(x + 1)
return(x)
}
set.seed(123)
df1$x2 <- unlist(lapply(df1$x1,example_function,0.5))
> df1$x2
[1] 2 1 3 2 3 5
Here is my problem: In the example above I chose 0.5 for the argument "p" (manually). Instead, I would like to select a probability p from df2 based on the values for y1 associated with x1 in df1. Accordingly, I want p in
df1$x2 <- unlist(lapply(df1$x1,example_function,p))
to be derived from df2.
For example, df$x1[3], which is a 2, belongs to df$y1[3], which is a 1. df2 shows, that a 1 for y is associated with p = 0.6. In that case, the argument p for df1$x1[3] in "example_function" should be 0.6. How can this kind of a query for the value p be integrated into the described function?
df1$x2 <- unlist(lapply(df1$x1,
function(z) {
example_function(z, df2$p[df2$y == df1$y1[df1$x1 == z][1])
}))
df1
# x1 y1 x2
# 1 1 0 1
# 2 2 0 2
# 3 3 1 4
# 4 4 1 4
# 5 5 2 6
# 6 6 2 7
There is no need to do anything complicated here. You can get what you want using vector-expressions.
To pick your probabilities given p and y1, simply subscript:
> p[y1]
[1] 0.1 0.1 0.6 0.6
and then pick your x2 from x1 and the sample like this:
> ifelse(runif(1) <= p[y1], x1, x1 + 2)
[1] 3 4 3 4
One way to solve the problem is working with "merge" and "mapply" instead of "lapply":
df_new <- merge(df1, df2, by.x = 'y1', by.y = 'y')
set.seed(123)
df1$x2 <- mapply(example_function,df1$x1,df_new$p)
> df1
x1 y1 x2
1 1 0 1
2 1 0 1
3 2 1 3
4 2 1 2
5 3 2 3
6 4 2 5

Create a new variable from the minimum in R

The data contains four fields: id, x1, x2, and x3.
id <- c(1,2,3,4,5,6,7,8,9,10)
x1 <- c(2,4,5,3,6,4,3,6,7,7)
x2 <- c(0,1,2,6,7,6,0,8,2,2)
x3 <- c(5,3,4,5,8,3,4,2,5,6)
DF <- data.frame(id, x1,x2,x3)
Before I ask the question, let me create a new field (minX) which is the min of (x1,x2,x3)
DF$minX <- pmin(DF$x1, DF$x2, DF$x3)
I need to create a new field, y, that is defined as follows
if min(x1,x2,x3) = x1, then y = "x1"
if min(x1,x2,x3) = x2, then y = "x2"
if min(x1,x2,x3) = x3, then y = "x3"
Note: we assume no ties.
As a simply solution, do:
VARS <- colnames(DF)[-1]
y <- VARS[apply(DF[, -1], MARGIN = 1, FUN = which.min)]
DF$y <- y
The function which.min returns the index of the minimum. If the minimum is not unique it returns the first one. Since you guarantee that there is no tie, this is not an issue here.
Finally, you should be familiar with apply, right? MARGIN = 1 means applying function FUN row-wise, while MARGIN = 2 means applying FUN column-wise. This is an useful function to avoid the need for a for loop when dealing with matrix. Since your data frame only contains numerical/integer values, it is like a matrix hence we can use apply.
Here is another option using pmin and max.col
library(data.table)
setDT(DF)[, c("minx", "y") := list(do.call(pmin, .SD),
names(.SD)[max.col(-1*.SD)]), .SDcols= x1:x3]
DF
# id x1 x2 x3 minx y
# 1: 1 2 0 5 0 x2
# 2: 2 4 1 3 1 x2
# 3: 3 5 2 4 2 x2
# 4: 4 3 6 5 3 x1
3 5: 5 6 7 8 6 x1
# 6: 6 4 6 3 3 x3
# 7: 7 3 0 4 0 x2
# 8: 8 6 8 2 2 x3
# 9: 9 7 2 5 2 x2
#10: 10 7 2 6 2 x2
a data.table solution:
# create variables
id <- c(1,2,3,4,5,6,7,8,9,10)
x1 <- c(2,4,5,3,6,4,3,6,7,7)
x2 <- c(0,1,2,6,7,6,0,8,2,2)
x3 <- c(5,3,4,5,8,3,4,2,5,6)
DF <- data.frame(id, x1,x2,x3)
# load package and set data table, calculating min
library(data.table)
setDT(DF)[, minx := apply(.SD, 1, min), .SDcols=c("x1", "x2", "x3")]
# Create variable with name of minimum
DF[, y := apply(.SD, 1, function(x) names(x)[which.min(x)]), .SDcols = c("x1", "x2", "x3")]
# call result
DF
## id x1 x2 x3 minx y
1: 1 2 0 5 0 x2
2: 2 4 1 3 1 x2
3: 3 5 2 4 2 x2
4: 4 3 6 5 3 x1
5: 5 6 7 8 6 x1
6: 6 4 6 3 3 x3
7: 7 3 0 4 0 x2
8: 8 6 8 2 2 x3
9: 9 7 2 5 2 x2
10: 10 7 2 6 2 x2
The last step can be called directly, without the need to calculate minx.
Please notice that data.table is particularily fast in large data sets.
######## EDIT TO ADD: DPLYR METHOD #########
For completeness, this would be a dplyr method to produce the same (final) result. This solution is credited to #eipi10 in a question I started out of this problem (see here):
DF %>% mutate(y = apply(.[,2:4], 1, function(x) names(x)[which.min(x)]))
This solution takes about the same time as the data.table one provided in the original answer, when applyed to a 1e6 rows data frame (about 17 secs in my sony laptop).

Why is using list() critical for .dots = setNames() uses in dplyr?

I am calling mutate using dynamic variable names. An example that mostly works is:
df <- data.frame(a = 1:5, b = 1:5)
func <- function(a,b){
return(a+b)
}
var1 = 'a'
var2 = 'b'
expr <- interp(~func(x, y), x = as.name(var1), y = as.name(var2))
new_name <- "dynamically_created_name"
temp <- df %>% mutate_(.dots = setNames(expr, nm = new_name))
Which produces
temp
a b func(a, b)
1 1 1 2
2 2 2 4
3 3 3 6
4 4 4 8
5 5 5 10
This is mostly fine except that set names ignored the nm key. This is solved by wrapping my function in list():
temp <- df %>% mutate_(.dots = setNames(list(expr), nm = new_name))
temp
a b dynamically_created_name
1 1 1 2
2 2 2 4
3 3 3 6
4 4 4 8
5 5 5 10
My question is why is setNames ignoring it's key in the first place, and how does list() solve this problem?
As noted in the other answer, the .dots argument is assumed to be a list, and setNames is a convenient way to rename elements in a list.
What is the .dots argument doing? Let's first think about the actual dots ... argument. It is a series of expressions to be evaluated. Below the dots ... are the two named expressions c = ~ a * scale1 and d = ~ a * scale2.
scale1 <- -1
scale2 <- -2
df %>%
mutate_(c = ~ a * scale1, d = ~ a * scale2)
#> a b c d
#> 1 1 1 -1 -2
#> 2 2 2 -2 -4
#> 3 3 3 -3 -6
#> 4 4 4 -4 -8
#> 5 5 5 -5 -10
We could just bundle those expressions together beforehand in a list. That's where .dots comes in. That parameter lets us tell mutate_ to evaluate the expressions in the list.
bundled <- list(
c2 = ~ a * scale1,
d2 = ~ a * scale2
)
df %>%
mutate_(.dots = bundled)
#> a b c2 d2
#> 1 1 1 -1 -2
#> 2 2 2 -2 -4
#> 3 3 3 -3 -6
#> 4 4 4 -4 -8
#> 5 5 5 -5 -10
If we want to programmatically update the names of the expressions in the list, then setNames is a convenient way to do that. If we want to programmatically mix and match constants and variable names when making expressions, then the lazyeval package provides convenient ways to do that. Below I do both to create a list of expressions, name them, and evaluate them with mutate_
# Imagine some dropdown boxes in a Shiny app, and this is what user requested
selected_func1 <- "min"
selected_func2 <- "max"
selected_var1 <- "a"
selected_var2 <- "b"
# Assemble expressions from those choices
bundled2 <- list(
interp(~fun(x), fun = as.name(selected_func1), x = as.name(selected_var1)),
interp(~fun(x), fun = as.name(selected_func2), x = as.name(selected_var2))
)
bundled2
#> [[1]]
#> ~min(a)
#>
#> [[2]]
#> ~max(b)
# Create variable names
exp_name1 <- paste0(selected_func1, "_", selected_var1)
exp_name2 <- paste0(selected_func2, "_", selected_var2)
bundled2 <- setNames(bundled2, c(exp_name1, exp_name2))
bundled2
#> $min_a
#> ~min(a)
#>
#> $max_b
#> ~max(b)
# Evaluate the expressions
df %>%
mutate_(.dots = bundled2)
#> a b min_a max_b
#> 1 1 1 1 5
#> 2 2 2 1 5
#> 3 3 3 1 5
#> 4 4 4 1 5
#> 5 5 5 1 5
From vignettes("nse"):
If you also want to output variables to vary, you need to pass a list of quoted objects to the .dots argument
So perhaps the reason why
temp <- df %>% mutate_(.dots = setNames(expr, nm = new_name))
Doesn't do what you want is, while you successfully set the name attribute here, expr is still a formula, not a list:
foo <- setNames(expr, nm = new_name)
names(foo) #"dynamically_created_name" ""
class(foo) #"formula"
So if you make it a list, it works as expected:
expr <- interp(~func(x, y), x = as.name(var1),
y = as.name(var2))
df %>% mutate_(.dots = list(new_name = expr))
a b new_name
1 1 1 2
2 2 2 4
3 3 3 6
4 4 4 8
5 5 5 10

Resources