I need to perform an analysis with glmer on many different subgroups of a large dataset and only extract the estimate and z-value of each model. This works perfectly fine if I only use a small subset of my data (or some dummy data, as attached below), but when I try to include the whole data set, it takes forever. Currently I am using this bit of code:
slope_range <- df %>%
group_by(region, year, species) %>%
summarise(slope = coef(summary(glmer(presence ~ transect + (1 | road), family = "binomial")))[2],
p_val = coef(summary(glmer(presence ~ transect + (1 | road), family = "binomial")))[6])
As I said, this works fine, but very slow on a large data set. I'm aware that I could also just write multiple loops, but I assume this would take even longer. Does anyone have a better solution of what could be done to make it faster? Thanks!
Dummy data:
> dput(df)
structure(list(region = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("ARG", "CHE"), class = "factor"),
transect = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L), presence = c(1L, 1L,
1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 0L,
0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 1L,
0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L,
1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 0L,
1L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L,
1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L,
0L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L,
0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L,
0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L), year = c(2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L), species = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("a", "b"), class = "factor"),
road = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("FG", "MK", "PL", "XY"), class = "factor")), class = "data.frame", row.names = c(NA,
-160L))
You are calling coef(summary(glmer(...))) twice for each group, so you can cut the execution time roughly in half by fitting the model and extracting the coefficients once for each group. The following code will extract all the coefficients and their Z and p-values, not just the two values you specified, which I think is preferable if you might end up needing them later. Of course it can be easily modified to discard the other coefficients and keep only the two you specified.
code
library(tidyverse)
library(lme4)
df %>%
group_by(region, year, species) %>%
group_modify(~ data.frame(variable = c('Intercept', 'transect'),
coef(summary(glmer(presence ~ transect + (1 | road), family = "binomial", data = .)))))
output
# A tibble: 16 x 8
# Groups: region, year, species [8]
region year species variable Estimate Std..Error z.value Pr...z..
<fct> <int> <fct> <fct> <dbl> <dbl> <dbl> <dbl>
1 ARG 2007 a Intercept 6.11 2.81 2.17 0.0300
2 ARG 2007 a transect -0.743 0.361 -2.06 0.0398
3 ARG 2007 b Intercept 1.91 1.22 1.57 0.116
4 ARG 2007 b transect -0.396 0.208 -1.90 0.0570
5 ARG 2017 a Intercept 3.95 1.73 2.28 0.0223
6 ARG 2017 a transect -0.654 0.275 -2.38 0.0174
7 ARG 2017 b Intercept 2.44 1.33 1.83 0.0668
8 ARG 2017 b transect -0.396 0.208 -1.90 0.0570
9 CHE 2007 a Intercept 3.95 1.73 2.28 0.0223
10 CHE 2007 a transect -0.654 0.275 -2.38 0.0174
11 CHE 2007 b Intercept 2.44 1.33 1.83 0.0668
12 CHE 2007 b transect -0.396 0.208 -1.90 0.0570
13 CHE 2017 a Intercept 6.11 2.81 2.17 0.0300
14 CHE 2017 a transect -0.743 0.361 -2.06 0.0398
15 CHE 2017 b Intercept 1.91 1.22 1.57 0.116
16 CHE 2017 b transect -0.396 0.208 -1.90 0.0570
You could use a parallel approach as suggested earlier, e.g. with parallel::mclapply (on my 6-core machine using more than 4 cores gave only marginal improvements, though).
You could speed up glmer using nAGQ=0, at the cost of precision (see https://stats.stackexchange.com/questions/132841/default-lme4-optimizer-requires-lots-of-iterations-for-high-dimensional-data).
Example code with benchmarks:
invisible(lapply(c("lme4", "data.table", "tidyverse", "parallel", "microbenchmark"),
require, character.only = TRUE))
#> Loading required package: lme4
#> Loading required package: Matrix
#> Loading required package: data.table
#> Loading required package: tidyverse
#> Loading required package: parallel
#> Loading required package: microbenchmark
df <- structure(list(region = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("ARG", "CHE"), class = "factor"),
transect = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L), presence = c(1L, 1L,
1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 0L,
0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 1L,
0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L,
1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 0L,
1L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L,
1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L,
0L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L,
0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L,
0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L), year = c(2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L), species = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("a", "b"), class = "factor"),
road = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("FG", "MK", "PL", "XY"), class = "factor")), class = "data.frame", row.names = c(NA,
-160L))
## Your function for comparison
tidy_fun <- function(){
df %>%
group_by(region, year, species) %>%
summarise(slope = coef(summary(glmer(presence ~ transect + (1 | road), family = "binomial")))[2],
p_val = coef(summary(glmer(presence ~ transect + (1 | road), family = "binomial")))[6])
}
gf2 <- function(presence, transect, road, nAGQ = 1L) {
res <- coef(summary(glmer(presence ~ transect + (1 | road), family = "binomial", nAGQ=nAGQ)))
return(data.table(slope=res[2], p_val=res[6]))
}
parLM <- function(mc.cores=4L, nAGQ=1L){
DT <- data.table(df, key = c("region","year","species"))
iDT <- DT[,by=.(region, year, species),.(irange=.(range(.I)))]
result <- mclapply(seq(nrow(iDT)),
function(x) DT[do.call(seq, as.list(iDT[x, irange][[1]])),
.(gf2(presence, transect, road, nAGQ=nAGQ))], mc.cores=mc.cores)
return(cbind(iDT, rbindlist(result))[,-4])
}
microbenchmark(
original = suppressMessages(tidy_fun()),
multicore = parLM(mc.cores = 4L, nAGQ = 1L),
singlecore.nAGQ0 = parLM(mc.cores = 1L, nAGQ = 0L),
multicore.nAGQ0 = parLM(mc.cores = 4L, nAGQ = 0L),
times=10L)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> original 898.2732 925.0621 963.7452 940.9577 973.0648 1157.0030 10
#> multicore 319.1234 334.4151 347.8024 344.1370 362.6539 373.8189 10
#> singlecore.nAGQ0 237.4782 245.4084 262.6290 268.1308 274.8516 280.7944 10
#> multicore.nAGQ0 132.3356 132.9963 137.2777 135.8659 141.5145 144.2564 10
#> cld
#> d
#> c
#> b
#> a
I have a data on countries and want to summarize it and create a table.
> head(data)
country year score members
A 1989 0 7
A 1990 0 7
A 1991 0 7
A 1992 0 7
A 1993 0 7
A 1994 0 7
The table should show the relationship between country "score" and the number of "members" – put differently, I want to see how many states with score 0,1 or 2 have "members"(ranging from 1 to 7).
I want to set it like this:
score members==1 members==2 members==3 members==4 members==5 members==6 members==7
0 1 0
1 2 0
2 0 1 and so on..
To do this I run the following:
library(dplyr)
table <- data %>%
group_by(score) %>%
summarise(
m1 = sum(members==1, na.rm=TRUE),
m2 = sum(members==2, na.rm=TRUE),
m3 = sum(members==3, na.rm=TRUE),
m4 = sum(members==4, na.rm=TRUE),
m5 = sum(members==5, na.rm=TRUE),
m6 = sum(members==6, na.rm=TRUE),
m7 = sum(members==7, na.rm=TRUE)
)
This gives:
score m1 m2 m3 m4 m5 m6 m7
0 0 2 0 0 0 3 30
1 15 3 11 11 3 18 3
2 3 0 2 2 0 6 9
.
.
I need a little help here. As you see it has calculated the total number of observations, whereas I want to count each country only once.
How do I summarize this data to have the total number of countries for each members-level?
Here's a sample of my data for reproducibility:
data <-
structure(list(country = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L), .Label = c("A", "B", "C", "D", "E", "F"), class = "factor"),
year = c(1989L, 1990L, 1991L, 1992L, 1993L, 1994L, 1995L,
1996L, 1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L, 2004L,
2005L, 2006L, 2007L, 2008L, 2010L, 1989L, 1990L, 1991L, 1992L,
1993L, 1994L, 1995L, 1996L, 1997L, 1998L, 1999L, 2000L, 2001L,
2002L, 2003L, 2004L, 2005L, 2006L, 2007L, 2008L, 2009L, 2010L,
2011L, 1989L, 1991L, 1993L, 1994L, 1995L, 1996L, 1997L, 1999L,
2000L, 2001L, 2002L, 2003L, 2004L, 2005L, 2006L, 2007L, 2008L,
2010L, 1989L, 1990L, 1991L, 1992L, 1993L, 1994L, 1995L, 1996L,
1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L, 2004L, 2005L,
2006L, 2007L, 2008L, 2009L, 2010L, 2011L, 1991L, 1992L, 1993L,
1994L, 1995L, 1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L,
2004L, 2005L, 2006L, 2007L, 2008L, 2010L, 1991L, 1992L, 1993L,
1994L, 1995L, 1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L,
2004L, 2005L, 2006L, 2007L, 2008L, 2010L), score = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 1L,
1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 2L, 2L,
2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
2L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), members = c(7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L,
4L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L)), .Names = c("country", "year", "score",
"members"), class = "data.frame", row.names = c(NA, -121L))
I believe you need this:
library(reshape2)
dcast(aggregate(country~score+members, data=data, FUN=function(x) length(unique(x))),
score~members, value.var="country", fill=0L)
# score 1 2 3 4 5 6 7
#1 0 0 1 0 0 0 1 2
#2 1 1 1 2 2 1 3 2
#3 2 1 0 1 2 0 1 1
Or, to put it the dplyr/tidyr way:
data %>%
group_by(members, score) %>%
summarise(n=n_distinct(country)) %>%
spread(members, n, fill=0L)
## A tibble: 3 x 8
# score 1 2 3 4 5 6 7
#* <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 0 0 1 0 0 0 1 2
#2 1 1 1 2 2 1 3 2
#3 2 1 0 1 2 0 1 1
As the OP is using dplyr methods, we can do this by grouping with 'score', 'members' to get the number of elements (n()), and then spread (from tidyr) to reshape it to 'wide' format.
library(dplyr)
library(tidyr)
data %>%
group_by(score, members) %>%
summarise(n = n()) %>%
mutate(members = paste0("m", members)) %>%
spread(members, n, fill = 0)
# score m1 m2 m3 m4 m5 m6 m7
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 0 0 2 0 0 0 3 30
#2 1 15 3 11 11 3 18 3
#3 2 3 0 2 2 0 6 9
If we need to also get the counts by 'country', just add 'country' in the group_by
data %>%
group_by(country, score, members) %>%
summarise(n = n()) %>%
mutate(members = paste0("m", members)) %>%
spread(members, n, fill = 0)
If the expected output is the one showed in the other posts, an option using data.table would be to convert the 'data.frame' to 'data.table' (setDT(data), and dcast from 'long' to 'wide' specifying the fun.aggregate as uniqueN of the 'value.var' variable i.e. 'country' where uniqueN returns the length of unique elements in the 'country' column. The fill=0 specifies to occupy 0 for those combinations that are not available. By default, it returns as NA.
library(data.table)
dcast(setDT(data), score~members, value.var= 'country', fun.aggregate = uniqueN, fill = 0)
# score 1 2 3 4 5 6 7
#1: 0 0 1 0 0 0 1 2
#2: 1 1 1 2 2 1 3 2
#3: 2 1 0 1 2 0 1 1
It seems the crux of the issue is having the duplicated rows for each year? In which case you can remove them with distinct, then it's a simple crosstab. You could use the %$% exposition pipe from magrittr:
library(dplyr)
library(magrittr)
data %>%
distinct(country, score, members) %$%
table(score, members)
members
score 1 2 3 4 5 6 7
0 0 1 0 0 0 1 2
1 1 1 2 2 1 3 2
2 1 0 1 2 0 1 1
Or a regular pipe and tabyl from the janitor package:
library(dplyr)
library(janitor)
data %>%
distinct(country, score, members) %>%
tabyl(score, members)
score 1 2 3 4 5 6 7
0 0 1 0 0 0 1 2
1 1 1 2 2 1 3 2
2 1 0 1 2 0 1 1