Merge lists with a dataframe - r

I have a list of lists like out. In each list I have a dataframe (with the same structure, i.e. same dimensions and variable names (id/period/pred_dif):
id <- c(1,2,3,4,5,1,2,3,4,5)
period <- c(01,09,12,01,08, 02,08,12,11,12)
pred_dif <- c(0.5,0.1,0.15,0.23,0.75,0.6,0.49,0.81,0.37,0.14)
list_1 <- data.frame(id, period, pred_dif)
pred_dif <- c(0.45,0.18,0.35,0.63,0.25,0.63,0.29,0.11,0.17,0.24)
list_2 <- data.frame(id, period, pred_dif)
pred_dif <- c(0.58,0.13,0.55,0.13,0.76,0.3,0.29,0.81,0.27,0.04)
list_3 <- data.frame(id, period, pred_dif)
pred_dif <- c(0.3,0.61,0.18,0.29,0.85,0.76,0.56,0.91,0.48,0.91)
list_4 <- data.frame(id, period, pred_dif)
out <- list(list_1, list_2, list_3, list_4)
I want to:
Merge list out with dataframe df of same structure
pred_second <- c(0.4,0.71,0.28,0.39,0.95,0.86,0.66,0.81,0.58,0.81)
df <- data.frame(id, period, pred_second)
I would proceed (in a dplyr environment) as follows:
out <- merge(out, df, by = c("id", "period"), all.x = T)
Create a list containing an OLS (lm) regression capturing the effect of variable "period" on "pred_dif". In a dataframe environment would be something like:
ols <- summary(lm(formula = pred_dif ~ as.factor(period) - 1, data = out))
Create a list or dataframe (preferred) registering the estimates and standard errors of the regressions of point 2 (it is ok if points 2/3 happen together)
Any idea on how to solve this in an iterative and fast way for all lists?

We could do this in tidyverse
Loop over the list with map
Do the left_join
Build the lm model in summarise
Convert the output to a tidy dataset
unnest the list of tibble and store in the list (or use _dfr in map to return a single data with .id specified as identifier)
library(purrr)
library(dplyr)
library(broom)
library(tidyr)
lmout_lst <- map(out,
~ left_join(.x, df, by = c('id', 'period')) %>%
summarise(new = list(lm(pred_dif ~ as.factor(period) - 1) %>%
broom::tidy(.))) %>%
unnest(new))
It can be converted to a single dataset with bind_rows as well
lmout <- bind_rows(lmout_lst, .id = 'categ')
-output
lmout
# A tibble: 24 x 6
categ term estimate std.error statistic p.value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 1 as.factor(period)1 0.365 0.223 1.63 0.153
2 1 as.factor(period)2 0.6 0.316 1.90 0.106
3 1 as.factor(period)8 0.62 0.223 2.78 0.0321
4 1 as.factor(period)9 0.1 0.316 0.317 0.762
5 1 as.factor(period)11 0.37 0.316 1.17 0.286
6 1 as.factor(period)12 0.412 0.141 2.92 0.0267
7 2 as.factor(period)1 0.54 0.0789 6.85 0.000478
8 2 as.factor(period)2 0.63 0.112 5.65 0.00132
9 2 as.factor(period)8 0.27 0.0789 3.42 0.0141
10 2 as.factor(period)9 0.18 0.112 1.61 0.158
# … with 14 more rows

Related

How to pass tibble of variable names and function calls to tibble

I'm trying to go from a tibble of variable names and functions like this:
N <- 100
dat <-
tibble(
variable_name = c("a", "b"),
variable_value = c("rnorm(N)", "rnorm(N)")
)
to a tibble with two variables a and b of length N
dat2 <-
tibble(
a = rnorm(N),
b = rnorm(N)
)
is there a !!! or rlang-y way to accomplish this?
We can evalutate the string
library(dplyr)
library(purrr)
library(tibble)
deframe(dat) %>%
map_dfc(~ eval(rlang::parse_expr(.x)))
-output
# A tibble: 100 x 2
a b
<dbl> <dbl>
1 0.0750 2.55
2 -1.65 -1.48
3 1.77 -0.627
4 0.766 -0.0411
5 0.832 0.200
6 -1.91 -0.533
7 -0.0208 -0.266
8 -0.409 1.08
9 -1.38 -0.181
10 0.727 0.252
# … with 90 more rows
Here is a base way with a pipe and a as_tibble call.
Map(function(x) eval(str2lang(x)), setNames(dat$variable_value, dat$variable_name)) %>%
as_tibble

Rank and choose the 5 models that showed the best result for each group

This is an example data.
ind1 <- rnorm(99)
ind2 <- rnorm(99)
ind3 <- rnorm(99)
ind4 <- rnorm(99)
ind5 <- rnorm(99)
dep <- rnorm(99, mean=ind1)
group <- rep(c("A", "B", "C"), each=33)
df <- data.frame(dep,group, ind1, ind2, ind3, ind4, ind5)
Here simple linear regression model has been fitted on every combination of variables in df after grouped by categorical variable. The result is satisfied. But my original data has much more than 5 variables. It is hard to see and compare the results in this list. So I would like to choose the best 5 models for each group from the resulting list (tibble_list) based on AIC value. It will be highly appreciated if someone could help me to do so.
indvar_list <- lapply(1:5, function(x)
combn(paste0("ind", 1:5), x, , simplify = FALSE))
formulas_list <- rapply(indvar_list, function(x)
as.formula(paste("dep ~", paste(x, collapse="+"))))
run_model <- function(f) {
df %>%
nest(-group) %>%
mutate(fit = map(data, ~ lm(f, data = .)),
results1 = map(fit, glance),
results2 = map(fit, tidy)) %>%
unnest(results1) %>%
unnest(results2) %>%
select(group, term, estimate, r.squared, p.value, AIC) %>%
mutate(estimate = exp(estimate))
}
tibble_list <- lapply(formulas_list, run_model)
tibble_list
An option would be to bind the rows into a single dataset with a .id column, then arrange by 'group', 'AIC', grouped by 'group', filter the rows having the first five unique 'index'
library(tidyverse)
bind_rows(tibble_list, .id = 'index') %>%
arrange(group, AIC) %>%
group_by(group) %>%
filter(index %in% head(unique(index), 5))
# A tibble: 51 x 7
# Groups: group [3]
# index group term estimate r.squared p.value AIC
# <chr> <fct> <chr> <dbl> <dbl> <dbl> <dbl>
# 1 1 A (Intercept) 0.897 0.319 0.000620 79.5
# 2 1 A ind1 2.07 0.319 0.000620 79.5
# 3 7 A (Intercept) 0.883 0.358 0.00129 79.5
# 4 7 A ind1 2.14 0.358 0.00129 79.5
# 5 7 A ind3 0.849 0.358 0.00129 79.5
# 6 8 A (Intercept) 0.890 0.351 0.00153 79.9
# 7 8 A ind1 2.12 0.351 0.00153 79.9
# 8 8 A ind4 0.860 0.351 0.00153 79.9
# 9 19 A (Intercept) 0.877 0.387 0.00237 80.0
#10 19 A ind1 2.18 0.387 0.00237 80.0
## … with 41 more rows

R - use group_by() and mutate() in dplyr to apply function that returns a vector the length of groups

Take the following example data:
set.seed(1)
foo <- data.frame(x=rnorm(10, 0, 10), y=rnorm(10, 0, 10), fac = c(rep("A", 5), rep("B", 5)))
I want to split the dataframe "foo" by the variable "fac" into A's and B's, apply a function (mahalanobis distance) that returns a vector of the length of each subgroup, and then mutate the output back on to the original dataframe. For example:
auto.mahalanobis <- function(x) {
temp <- x[, c("x", "y")]
return(mahalanobis(temp, center = colMeans(temp, na.rm=T), cov = cov(temp,
use="pairwise.complete.obs")))
}
foo %>% group_by(fac) %>%
mutate(mahal = auto.mahalanobis(.))
Which gives an error. Obviously this procedure can be done manually by splitting the dataset, applying the function, and adding the output as a column before putting it all back together again. But there must be a more efficient way to do this (perhaps this is a misuse of dplyr?).
How about making use of nest instead:
foo %>%
group_by(fac) %>%
nest() %>%
mutate(mahal = map(data, ~mahalanobis(
.x,
center = colMeans(.x, na.rm = T),
cov = cov(.x, use = "pairwise.complete.obs")))) %>%
unnest()
## A tibble: 10 x 4
# fac mahal x y
# <fct> <dbl> <dbl> <dbl>
# 1 A 1.02 -6.26 15.1
# 2 A 0.120 1.84 3.90
# 3 A 2.81 -8.36 -6.21
# 4 A 2.84 16.0 -22.1
# 5 A 1.21 3.30 11.2
# 6 B 2.15 -8.20 -0.449
# 7 B 2.86 4.87 -0.162
# 8 B 1.23 7.38 9.44
# 9 B 0.675 5.76 8.21
#10 B 1.08 -3.05 5.94
Here you avoid an explicit "x", "y" filter of the form temp <- x[, c("x", "y")], as you nest relevant columns after grouping by fac. Applying mahalanobis is then straight-forward.
Update
To respond to your comment, here is a purrr option. Since it's easy to loose track of what's going on, let's go step-by-step:
Generate sample data with one additional column.
set.seed(1)
foo <- data.frame(
x = rnorm(10, 0, 10),
y = rnorm(10, 0, 10),
z = rnorm(10, 0, 10),
fac = c(rep("A", 5), rep("B", 5)))
We now store the columns which define the subset of the data to be used for the calculation of the Mahalanobis distance in a list
cols <- list(cols1 = c("x", "y"), cols2 = c("y", "z"))
So we will calculate the Mahalanobis distance (per fac) for the subset of data in columns x+y and then separately for y+z. The names of cols will be used as the column names of the two distance vectors.
Now for the actual purrr command:
imap_dfc(cols, ~nest(foo %>% group_by(fac), .x, .key = !!.y) %>% select(!!.y)) %>%
mutate_all(function(lst) map(lst, ~mahalanobis(
.x,
center = colMeans(.x, na.rm = T),
cov = cov(., use = "pairwise.complete.obs")))) %>%
unnest() %>%
bind_cols(foo, .)
# x y z fac cols1 cols2
#1 -6.264538 15.1178117 9.1897737 A 1.0197542 1.3608052
#2 1.836433 3.8984324 7.8213630 A 0.1199607 1.1141352
#3 -8.356286 -6.2124058 0.7456498 A 2.8059562 1.5099574
#4 15.952808 -22.1469989 -19.8935170 A 2.8401953 3.0675228
#5 3.295078 11.2493092 6.1982575 A 1.2141337 0.9475794
#6 -8.204684 -0.4493361 -0.5612874 B 2.1517055 1.2284793
#7 4.874291 -0.1619026 -1.5579551 B 2.8626501 1.1724828
#8 7.383247 9.4383621 -14.7075238 B 1.2271316 2.5723023
#9 5.757814 8.2122120 -4.7815006 B 0.6746788 0.6939081
#10 -3.053884 5.9390132 4.1794156 B 1.0838341 2.3328276
In short, we
loop over entries in cols,
nest data in foo per fac based on columns defined in cols,
apply mahalanobis on the nested and grouped data generating as many distance columns with nested data as we have entries in cols (i.e. subsets), and
finally unnest the distance data and column-bind it to the original foo data.
You can simply do -
foo %>% group_by(fac) %>%
mutate(mahal = auto.mahalanobis(data.frame(x, y)))
# A tibble: 10 x 4
# Groups: fac [2]
x y fac mahal
<dbl> <dbl> <fct> <dbl>
1 - 6.26 15.1 A 1.02
2 1.84 3.90 A 0.120
3 - 8.36 - 6.21 A 2.81
4 16.0 -22.1 A 2.84
5 3.30 11.2 A 1.21
6 - 8.20 - 0.449 B 2.15
7 4.87 - 0.162 B 2.86
8 7.38 9.44 B 1.23
9 5.76 8.21 B 0.675
10 - 3.05 5.94 B 1.08
You can remove temp <- x[, c("x", "y")] from your function and simply use temp instead of x as function argument.
Cleaned-up function -
auto.mahalanobis <- function(temp) {
mahalanobis(temp,
center = colMeans(temp, na.rm=T),
cov = cov(temp, use="pairwise.complete.obs")
)
}
Btw, great job on your first post!
Alternative: group_modify
There is also the way of using the group_modify function of dplyr what makes the code shorter.
Extension 1: Select specific columns
I have used the Mahalanobis distance to identify multivariate outliers. Here it makes sense to add variable names that should be considered (This is also an answer to #TKraft). You can specify now the columns that should be by cols=c("x","y")
Extension 2: Determine threshold for multivariate outliers
Moreover, the distance alone cannot be used directly to filter out possible outliers, since one needs a threshold. This threshold could be determined by the chi-square distribution, where the degrees of freedom are equal to the number of variables to be considered.
auto.mahalanobis <- function(temp, cols, chisq.prob=0.95) {
temp_sel <- temp %>% select(any_of(cols))
temp$mah <- mahalanobis(temp_sel,
center = colMeans(temp_sel, na.rm=T),
cov = cov(temp_sel, use="pairwise.complete.obs")
)
threshold <- qchisq(chisq.prob, length(cols))
temp$mah_chisq <- temp$mah > threshold
return(temp)}
foo %>%
group_by(fac) %>%
group_modify(~ auto.mahalanobis(temp=.x, cols=c("x","y"), chisq.prob = .975))
# A tibble: 10 x 5
# Groups: fac [2]
fac x y mah mah_chisq
<chr> <dbl> <dbl> <dbl> <lgl>
1 A -6.26 15.1 1.02 FALSE
2 A 1.84 3.90 0.120 FALSE
3 A -8.36 -6.21 2.81 FALSE
4 A 16.0 -22.1 2.84 FALSE
5 A 3.30 11.2 1.21 FALSE
6 B -8.20 -0.449 2.15 FALSE
7 B 4.87 -0.162 2.86 FALSE
8 B 7.38 9.44 1.23 FALSE
9 B 5.76 8.21 0.675 FALSE
10 B -3.05 5.94 1.08 FALSE

R summarise with multiple evalution metric functions that use actual and predicted from a data frame

I want to calculate multiple model evaluation metrics by groups for a data set. Each metric requires the input of actual (observed) and predicted values. These are columns in my data frame. My groups are represented by the variables iTime and an_id.
I can do the necessary calculations with summarise and much redundant typing, but there must be a purrr way to do this. I am trying to master purrr. I have tried invoke_map and pmap but could not figure out how to refer to the columns "actual" and "predicted" in my data frame.
A short example - there are more metrics needed:
library(Metrics)
df <- data.frame(an_id = c('G','J','J', 'J', 'G','G','J','G'),
iTime = c(1,1,2,2,1,2,1,2),
actual = c(1.28, 2.72,.664,.927,.711,1.16,.727,.834),
predicted = c(1.14,1.61,.475,.737,.715,1.15,.725,.90))
dataMetrics <- df %>%
group_by(an_id, iTime) %>%
summarise(vmae = mae(actual, predicted),
rae = rae(actual, predicted),
vrmse = rmse(actual, predicted))
> dataMetrics
A tibble: 4 x 5
an_id iTime vmae rae vrmse
<chr> <dbl> <dbl> <dbl> <dbl>
1 G 1 0.072 0.253 0.0990
2 G 2 0.038 0.233 0.0472
3 J 1 0.556 0.558 0.785
4 J 2 0.190 1.44 0.190
I don't know where mae, mase and rmse come from, which regrettably makes your example not reproducible. It's important to always explicitly state which packages you're using.
invoke_map is the way to map multiple functions to the same data. We can then combine that with nesting data and mapping invoke_map over the nested data.
I'll demonstrate with the sample data you give and by defining two functions f1 and f2:
f1 <- function(x, y) sum(abs(x - y))
f2 <- function(x, y) sum((x - y)^2)
library(tidyverse)
df %>%
group_by(an_id, iTime) %>%
nest() %>%
mutate(tmp = map(data, ~invoke_map_dfc(
list(f1 = f1, f2 = f2),
x = .x$actual, y = .x$predicted))) %>%
select(-data) %>%
unnest()
## A tibble: 4 x 4
# an_id iTime f1 f2
# <fct> <int> <dbl> <dbl>
#1 G 1 0.144 0.0196
#2 J 1 1.11 1.23
#3 J 2 0.381 0.0718
#4 G 2 0.01 0.0001
Explanation: We group observations by an_id and iTime, then nest the remaining data and use invoke_map_dfc inside map to apply f1 and f2 to data and store the result in columns of a nested tibble. The last step is removing the data column and un-nesting the summary stats.
Update
To reproduce your expected output
library(Metrics)
df %>%
group_by(an_id, iTime) %>%
nest() %>%
mutate(tmp = map(data, ~invoke_map_dfc(
list(vmae = mae, rae = rae, vrmse = rmse),
actual = .x$actual, predicted = .x$predicted))) %>%
select(-data) %>%
unnest()
## A tibble: 4 x 5
# an_id iTime vmae rae vrmse
# <fct> <dbl> <dbl> <dbl> <dbl>
#1 G 1 0.072 0.253 0.0990
#2 J 1 0.556 0.558 0.785
#3 J 2 0.190 1.44 0.190
#4 G 2 0.038 0.233 0.0472
Sample data
df <- read.table(text =
"an_id iTime actual predicted
G 1 1.28 1.14
J 1 2.72 1.61
J 2 0.664 0.475
J 2 0.927 0.737
G 1 0.711 0.715
G 2 1.16 1.15
J 2 0.727 0.725", header = T)

function for dplyr with argument that defaults to "."

Let's say I want to sum over all columns in a tibble to create a new column called "total". I could do:
library(tibble)
library(dplyr)
set.seed(42)
N <- 10
Df <- tibble(p_1 = rnorm(N),
p_2 = rnorm(N),
q_1 = rnorm(N),
q_2 = rnorm(N))
# Works fine
Df %>% mutate(total = apply(., 1, sum))
I could make a helper function like so,
myfun <- function(Df){
apply(Df, 1, sum)
}
# Works fine
Df %>% mutate(total = myfun(.))
But let's say this myfun was usually going to be used in this way, i.e. within a dplyr verb function, then the "." referencing the data frame is a but superfluous, and it would be nice if the myfun function could replace this with a default value. I'd like something like this:
myfun2 <- function(Df=.){
apply(Df, 1, sum)
}
which does not work.
Df %>% mutate(total = myfun2())
Error in mutate_impl(.data, dots) :
Evaluation error: object '.' not found.
Because I am not even sure how the "." works, I don't think I can formulate the question better, but basically, I want to know if there a way of saying, in effect, if the Df is not defined in myfun2, get the data-frame that is normally referenced by "."?
One option would be to quote the function and then evaluate with !!
library(tidyverse)
myfun <- function() {
quote(reduce(., `+`))
}
r1 <- Df %>%
mutate(total = !! myfun())
r1
# A tibble: 10 x 5
# p_1 p_2 q_1 q_2 total
# <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1.37 1.30 -0.307 0.455 2.82
# 2 -0.565 2.29 -1.78 0.705 0.645
# 3 0.363 -1.39 -0.172 1.04 -0.163
# 4 0.633 -0.279 1.21 -0.609 0.960
# 5 0.404 -0.133 1.90 0.505 2.67
# 6 -0.106 0.636 -0.430 -1.72 -1.62
# 7 1.51 -0.284 -0.257 -0.784 0.186
# 8 -0.0947 -2.66 -1.76 -0.851 -5.37
# 9 2.02 -2.44 0.460 -2.41 -2.38
#10 -0.0627 1.32 -0.640 0.0361 0.654
Note that the reduce was used to be more in align with tidyverse, but the OP's function can also be quoted and get the same result
myfun2 <- function() {
quote(apply(., 1, sum ))
}
r2 <- Df %>%
mutate(total = !! myfun2())
all.equal(r2$total, r1$total)
#[1] TRUE

Resources