Related
I need to perform an analysis with glmer on many different subgroups of a large dataset and only extract the estimate and z-value of each model. This works perfectly fine if I only use a small subset of my data (or some dummy data, as attached below), but when I try to include the whole data set, it takes forever. Currently I am using this bit of code:
# Fit one binomial GLMM per region/year/species group and keep two cells of its
# coefficient table, selected by linear (column-major) index.
# With the usual 2-row table (intercept, transect), [2] is the transect
# estimate and [6] is the transect z value.
# NOTE(review): the same model is fitted twice per group, once per output column.
slope_range <- df %>%
group_by(region, year, species) %>%
summarise(slope = coef(summary(glmer(presence ~ transect + (1 | road), family = "binomial")))[2],
p_val = coef(summary(glmer(presence ~ transect + (1 | road), family = "binomial")))[6])
As I said, this works fine, but it is very slow on a large data set. I'm aware that I could also just write multiple loops, but I assume this would take even longer. Does anyone have a better solution for making it faster? Thanks!
Dummy data:
> dput(df)
structure(list(region = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("ARG", "CHE"), class = "factor"),
transect = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L), presence = c(1L, 1L,
1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 0L,
0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 1L,
0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L,
1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 0L,
1L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L,
1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L,
0L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L,
0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L,
0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L), year = c(2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L), species = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("a", "b"), class = "factor"),
road = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("FG", "MK", "PL", "XY"), class = "factor")), class = "data.frame", row.names = c(NA,
-160L))
You are calling coef(summary(glmer(...))) twice for each group, so you can cut the execution time roughly in half by fitting the model and extracting the coefficients once for each group. The following code will extract all the coefficients and their Z and p-values, not just the two values you specified, which I think is preferable if you might end up needing them later. Of course it can be easily modified to discard the other coefficients and keep only the two you specified.
code
library(tidyverse)
library(lme4)
# Fit the mixed model once per region/year/species group and return its whole
# coefficient table (two rows per group, labelled by coefficient).
df %>%
  group_by(region, year, species) %>%
  group_modify(function(group_rows, group_key) {
    model <- glmer(presence ~ transect + (1 | road),
                   family = "binomial", data = group_rows)
    coef_table <- coef(summary(model))
    data.frame(variable = c('Intercept', 'transect'), coef_table)
  })
output
# A tibble: 16 x 8
# Groups: region, year, species [8]
region year species variable Estimate Std..Error z.value Pr...z..
<fct> <int> <fct> <fct> <dbl> <dbl> <dbl> <dbl>
1 ARG 2007 a Intercept 6.11 2.81 2.17 0.0300
2 ARG 2007 a transect -0.743 0.361 -2.06 0.0398
3 ARG 2007 b Intercept 1.91 1.22 1.57 0.116
4 ARG 2007 b transect -0.396 0.208 -1.90 0.0570
5 ARG 2017 a Intercept 3.95 1.73 2.28 0.0223
6 ARG 2017 a transect -0.654 0.275 -2.38 0.0174
7 ARG 2017 b Intercept 2.44 1.33 1.83 0.0668
8 ARG 2017 b transect -0.396 0.208 -1.90 0.0570
9 CHE 2007 a Intercept 3.95 1.73 2.28 0.0223
10 CHE 2007 a transect -0.654 0.275 -2.38 0.0174
11 CHE 2007 b Intercept 2.44 1.33 1.83 0.0668
12 CHE 2007 b transect -0.396 0.208 -1.90 0.0570
13 CHE 2017 a Intercept 6.11 2.81 2.17 0.0300
14 CHE 2017 a transect -0.743 0.361 -2.06 0.0398
15 CHE 2017 b Intercept 1.91 1.22 1.57 0.116
16 CHE 2017 b transect -0.396 0.208 -1.90 0.0570
You could use a parallel approach as suggested earlier, e.g. with parallel::mclapply (on my 6-core machine using more than 4 cores gave only marginal improvements, though).
You could speed up glmer using nAGQ=0, at the cost of precision (see https://stats.stackexchange.com/questions/132841/default-lme4-optimizer-requires-lots-of-iterations-for-high-dimensional-data).
Example code with benchmarks:
invisible(lapply(c("lme4", "data.table", "tidyverse", "parallel", "microbenchmark"),
require, character.only = TRUE))
#> Loading required package: lme4
#> Loading required package: Matrix
#> Loading required package: data.table
#> Loading required package: tidyverse
#> Loading required package: parallel
#> Loading required package: microbenchmark
df <- structure(list(region = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("ARG", "CHE"), class = "factor"),
transect = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L), presence = c(1L, 1L,
1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 0L,
0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 1L,
0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L,
1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 0L,
1L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L,
1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L,
0L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L,
0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L,
0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L), year = c(2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L), species = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("a", "b"), class = "factor"),
road = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("FG", "MK", "PL", "XY"), class = "factor")), class = "data.frame", row.names = c(NA,
-160L))
## Your function for comparison
tidy_fun <- function() {
  # Benchmark baseline: reproduces the question's approach, including the
  # duplicated model fit per group. Reads the global `df`.
  by_group <- group_by(df, region, year, species)
  summarise(
    by_group,
    slope = coef(summary(glmer(presence ~ transect + (1 | road), family = "binomial")))[2],
    p_val = coef(summary(glmer(presence ~ transect + (1 | road), family = "binomial")))[6]
  )
}
# Fit the binomial GLMM for one group and return the transect effect as a
# one-row data.table.
#
# Args:
#   presence, transect, road: vectors for one group (response, fixed-effect
#     covariate, random-intercept grouping factor).
#   nAGQ: passed through to glmer(); 0L is faster at the cost of precision.
#
# Returns a data.table with columns
#   slope - estimate of the transect coefficient
#   p_val - z value of the transect coefficient. The column name is kept for
#           compatibility with the original code, but it is NOT a p-value
#           (that is the "Pr(>|z|)" column of the coefficient table).
#   Both are NA when the model has no transect coefficient.
gf2 <- function(presence, transect, road, nAGQ = 1L) {
  fit <- glmer(presence ~ transect + (1 | road), family = "binomial", nAGQ = nAGQ)
  res <- coef(summary(fit))
  # Index by name rather than by linear position (res[2], res[6]): positional
  # indexing only hits the intended cells when the table has exactly two rows
  # and silently returns other cells otherwise.
  if (!"transect" %in% rownames(res)) {
    return(data.table(slope = NA_real_, p_val = NA_real_))
  }
  data.table(
    slope = res["transect", "Estimate"],
    p_val = res["transect", "z value"]
  )
}
# Fit gf2() for every region/year/species group of the global `df`, optionally
# in parallel, and return one row per group.
#
# Args:
#   mc.cores: number of worker processes handed to parallel::mclapply().
#             mclapply() relies on forking, so values above 1 are not
#             supported on Windows.
#   nAGQ:     forwarded to gf2() / glmer().
#
# Returns a data.table holding the grouping columns followed by the model
# results produced by gf2().
parLM <- function(mc.cores = 4L, nAGQ = 1L) {
  # Keying sorts DT by group, so each group occupies one contiguous row range.
  DT <- data.table(df, key = c("region", "year", "species"))
  # One row per group; irange stores c(first_row, last_row) of the group in DT.
  iDT <- DT[, by = .(region, year, species), .(irange = .(range(.I)))]
  fit_group <- function(x) {
    bounds <- iDT[x, irange][[1]]
    DT[seq(bounds[1L], bounds[2L]),
       .(gf2(presence, transect, road, nAGQ = nAGQ))]
  }
  # seq_len() instead of seq(nrow()): the latter yields c(1, 0) for zero groups.
  result <- mclapply(seq_len(nrow(iDT)), fit_group, mc.cores = mc.cores)
  # Column 4 is the helper irange column of iDT; drop it from the output.
  cbind(iDT, rbindlist(result))[, -4]
}
# Time the question's approach against parLM() with 1 or 4 workers and with
# nAGQ = 1 versus nAGQ = 0, evaluating each expression 10 times.
microbenchmark(
original = suppressMessages(tidy_fun()),
multicore = parLM(mc.cores = 4L, nAGQ = 1L),
singlecore.nAGQ0 = parLM(mc.cores = 1L, nAGQ = 0L),
multicore.nAGQ0 = parLM(mc.cores = 4L, nAGQ = 0L),
times=10L)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> original 898.2732 925.0621 963.7452 940.9577 973.0648 1157.0030 10
#> multicore 319.1234 334.4151 347.8024 344.1370 362.6539 373.8189 10
#> singlecore.nAGQ0 237.4782 245.4084 262.6290 268.1308 274.8516 280.7944 10
#> multicore.nAGQ0 132.3356 132.9963 137.2777 135.8659 141.5145 144.2564 10
#> cld
#> d
#> c
#> b
#> a
I have a question that is somewhat similar to others that have been posted, but after looking thoroughly at several posts, I can't get the code to work. Any help would be much appreciated.
My data frame looks like, this:
'data.frame': 501 obs. of 5 variables:
$ Tattoo.MUM : Factor w/ 250 levels "1004","1007",..: 76 76 76 81 81 81 85 85 85 85 ...
$ OffspringMUMs: int 4 4 4 4 4 4 11 11 11 11 ...
$ YearBIRTH.CUB: int 1988 1990 1991 1988 1991 2007 1989 1991 1992 1993 ...
$ YearBIRTH.MUM: int 1991 1991 NA NA NA NA 1987 1987 1987 1987 ...
$ OFFSpYR : int 2 1 1 1 2 1 1 4 3 3 ...
A few lines here:
structure(list(Tattoo.MUM = structure(c(6L, 6L, 6L, 6L, 7L, 7L,
7L, 8L, 9L, 11L, 11L, 11L, 11L, 5L, 1L, 4L, 2L, 3L, 3L, 10L,
10L, 10L, 10L, 10L), .Label = c("10454", "1045A", "1045X", "12392",
"1601", "22", "27", "29", "41", "424X", "60"), class = "factor"),
OffspringMUMs = c(11L, 11L, 11L, 11L, 5L, 5L, 5L, 1L, 3L,
7L, 7L, 7L, 7L, 1L, 2L, 1L, 1L, 4L, 4L, 6L, 6L, 6L, 6L, 6L
), YearBIRTH.CUB = c(1989L, 1991L, 1992L, 1993L, 1990L, 1991L,
1993L, 1989L, 1988L, 1988L, 1989L, 1991L, 1994L, 2015L, 2012L,
2015L, 2005L, 2009L, 2010L, 1996L, 1998L, 2000L, 2001L, 2006L
), YearBIRTH.MUM = c(1987L, 1987L, 1987L, 1987L, NA, NA,
NA, NA, NA, 1987L, 1987L, 1987L, 1987L, NA, NA, NA, NA, 2005L,
2005L, 1994L, 1994L, 1994L, 1994L, 1994L), OFFSpYR = c(1L,
4L, 3L, 3L, 1L, 1L, 3L, 1L, 3L, 3L, 1L, 2L, 1L, 1L, 2L, 1L,
1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L)), .Names = c("Tattoo.MUM",
"OffspringMUMs", "YearBIRTH.CUB", "YearBIRTH.MUM", "OFFSpYR"), class = "data.frame", row.names = c(NA,
-24L))
I want to add new rows for all missing years (YearBIRTH.CUB) in Tattoo.MUM keeping the rest of the values the same and adding '0' to OFFSpYR.
Like so:
structure(list(Tattoo.MUM = structure(c(6L, 6L, 6L, 6L, 6L, 7L,
7L, 7L, 7L, 8L, 9L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 5L, 1L,
4L, 2L, 3L, 3L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L), .Label = c("10454", "1045A", "1045X", "12392", "1601",
"22", "27", "29", "41", "424X", "60"), class = "factor"), OffspringMUMs = c(11L,
11L, 11L, 11L, 11L, 5L, 5L, 5L, 5L, 1L, 3L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 1L, 2L, 1L, 1L, 4L, 4L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L), YearBIRTH.CUB = c(1989L, 1990L, 1991L, 1992L, 1993L,
1990L, 1991L, 1992L, 1993L, 1989L, 1988L, 1988L, 1989L, 1990L,
1991L, 1992L, 1993L, 1994L, 2015L, 2012L, 2015L, 2005L, 2009L,
2010L, 1996L, 1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L,
2004L, 2005L, 2006L), YearBIRTH.MUM = c(1987L, 1987L, 1987L,
1987L, 1987L, NA, NA, NA, NA, NA, NA, 1987L, 1987L, 1987L, 1987L,
1987L, 1987L, 1987L, NA, NA, NA, NA, 2005L, 2005L, 1994L, 1994L,
1994L, 1994L, 1994L, 1994L, 1994L, 1994L, 1994L, 1994L, 1994L
), OFFSpYR = c(1L, 0L, 4L, 3L, 3L, 1L, 1L, 0L, 3L, 1L, 3L, 3L,
1L, 0L, 2L, 0L, 0L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 0L, 1L, 0L,
1L, 2L, 0L, 0L, 0L, 0L, 1L)), .Names = c("Tattoo.MUM", "OffspringMUMs",
"YearBIRTH.CUB", "YearBIRTH.MUM", "OFFSpYR"), class = "data.frame", row.names = c(NA,
-35L))
I've tried:
library(tidyr)
library(dplyr)
# Attempt 1 (tidyr): expand each mother's cub birth years to a full yearly sequence.
# NOTE(review): fill() propagates neighbouring values rather than inserting a
# constant; presumably the 0 was meant for the `fill` argument of complete().
df1 <- pedMUM %>% group_by(Tattoo.MUM, OffspringMUMs) %>% complete(YearBIRTH.CUB = full_seq(YearBIRTH.CUB,1)) %>% fill(OFFSpYR=0)
library(data.table)
# Attempt 2 (data.table): rolling join of pedMUM onto a cross join (CJ) of the
# mother attributes with every year between the minimum and maximum cub birth year.
# NOTE(review): CJ() crosses the columns independently and the year range is
# taken over the whole column, not per Tattoo.MUM - presumably why the result
# contains unwanted rows.
df1 <- setDT(pedMUM)[CJ(Tattoo.MUM=Tattoo.MUM, OffspringMUMs=OffspringMUMs, YearBIRTH.MUM=YearBIRTH.MUM, YearBIRTH.CUB=seq(min(YearBIRTH.CUB), max(YearBIRTH.CUB)), unique=TRUE),
on=.(Tattoo.MUM, OffspringMUMs, YearBIRTH.CUB), roll=T]
I am obviously using tidyr, dplyr, and data.table wrongly because none have given me the results I want.
I've had a look at the following posts:
Add rows with missing years by group
Adding rows with values of "0" to a dataframe with missing data
Find missing month after grouping with dplyr
And even tried loops:
R code - clever loop to add rows
but I get confused when I try to determine the year sequence for each Tattoo.MUM within the loop.
Would anyone be able to point me in the right direction?
I haven't used complete() before, but the following seems to work. nesting() allows you to keep two variables together, full_seq() allows you to expand the values of a variable, and fill=list() allows you to fill in blanks.
pedMUM <- structure(list(Tattoo.MUM = structure(c(6L, 6L, 6L, 6L, 7L, 7L,
7L, 8L, 9L, 11L, 11L, 11L, 11L, 5L, 1L, 4L, 2L, 3L, 3L, 10L,
10L, 10L, 10L, 10L), .Label = c("10454", "1045A", "1045X", "12392",
"1601", "22", "27", "29", "41", "424X", "60"), class = "factor"),
OffspringMUMs = c(11L, 11L, 11L, 11L, 5L, 5L, 5L, 1L, 3L,
7L, 7L, 7L, 7L, 1L, 2L, 1L, 1L, 4L, 4L, 6L, 6L, 6L, 6L, 6L
), YearBIRTH.CUB = c(1989L, 1991L, 1992L, 1993L, 1990L, 1991L,
1993L, 1989L, 1988L, 1988L, 1989L, 1991L, 1994L, 2015L, 2012L,
2015L, 2005L, 2009L, 2010L, 1996L, 1998L, 2000L, 2001L, 2006L
), YearBIRTH.MUM = c(1987L, 1987L, 1987L, 1987L, NA, NA,
NA, NA, NA, 1987L, 1987L, 1987L, 1987L, NA, NA, NA, NA, 2005L,
2005L, 1994L, 1994L, 1994L, 1994L, 1994L), OFFSpYR = c(1L,
4L, 3L, 3L, 1L, 1L, 3L, 1L, 3L, 3L, 1L, 2L, 1L, 1L, 2L, 1L,
1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L)), .Names = c("Tattoo.MUM",
"OffspringMUMs", "YearBIRTH.CUB", "YearBIRTH.MUM", "OFFSpYR"), class = "data.frame", row.names = c(NA,
-24L))
library(tidyr)
library(dplyr)
# Insert a zero-offspring row for every year missing between each mother's
# first and last cub birth year, keeping her other attributes unchanged.
df1 <- pedMUM %>%
  group_by(Tattoo.MUM) %>% # find min and max year for each mum
  mutate(
    minyear = min(YearBIRTH.CUB, na.rm = TRUE),
    maxyear = max(YearBIRTH.CUB, na.rm = TRUE)
  ) %>%
  # Drop the grouping before complete(): completion on a grouped data frame
  # happens within each group, and current tidyr refuses to complete on a
  # grouping column such as Tattoo.MUM.
  ungroup() %>%
  complete( # complete table
    nesting(Tattoo.MUM, minyear, maxyear, OffspringMUMs, YearBIRTH.MUM),
    YearBIRTH.CUB = full_seq(YearBIRTH.CUB, 1),
    fill = list(OFFSpYR = 0L) # integer zero keeps OFFSpYR an integer column
  ) %>%
  # The sequence above spans all mothers, so trim it to each mother's own range.
  filter(YearBIRTH.CUB >= minyear & YearBIRTH.CUB <= maxyear) %>% # remove unwanted years
  select(names(pedMUM)) # return original column order
I want to generate the following endogenous lag (Y) variable:
set Y=1 in the current routine year, if submission==1 and routineyear==1 in the previous routine year
set Y=2 in the current routine year, if sub==0 and routineyear==1 in the previous routine year
Otherwise=0
Note though that "previous routine year" is not the previous year; the intervals between routine years vary. This is actually what makes it hard for me to generate this variable.
Basically, I want to generate an endogenous variable that would capture state's behavior in their LAST routineyear.
To illustrate what I want to do:
Assume that country A had its routine year in 1990 - the same year the submission variable was also =1. This would generate Y=1.
Now, the next routineyear for country A is in 1992, where the submission=1 and routineyear=1 in that year. The endogenous lag in this should indicate A's previous behavior as in 1990 (Y=1).
Then, the next routineyear is in 1996 where submission=0 while routineyear=1. The endogenous lag in this case would be the value of A's previous behavior in 1992 (Y=1).
Then again, next routineyear is in 1998, where submission=1 and routineyear=1. The endogenous lag here should indicate A's previous behavior in the last routineyear, in 1996. that is: Y=2!.
This is what the endogenous lag should look like (based on the example above):
country year submission routineyear Y(endo lag)
A 1990 1 1 1
A 1991 0 0 0
A 1992 1 1 1
A 1993 1 0 0
A 1994 0 0 0
A 1995 0 0 0
A 1996 0 1 1
A 1997 0 0 0
A 1998 1 1 2
A 1999 0 0 0
A 2000 0 0 0
A 2001 0 1 1
A 2002 0 0 0
A 2003 1 1 2
I've been trying to do this using different logics but without success. One of the biggest problems is that routine year is different for each country, the intervals are not stable.
I believe that someone who can write proper code/functions in R would be able to solve this puzzle. If not, I would appreciate any recommendations on how to proceed from here.
A sample from my real data:
structure(list(ccode = c(31L, 31L, 31L, 31L, 31L, 31L, 31L, 31L, 31L,
31L, 31L, 31L, 31L, 31L, 31L, 31L, 31L, 31L, 31L, 31L, 31L, 31L, 40L,
40L, 40L, 40L, 40L, 40L, 40L, 40L, 40L, 40L, 40L, 40L, 40L, 40L, 40L,
40L, 40L, 40L, 40L, 40L, 40L, 40L, 41L, 41L, 41L, 41L, 41L, 41L, 41L,
41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L,
41L, 42L, 42L, 42L, 42L, 42L, 42L, 42L, 42L, 42L, 42L, 42L, 42L, 42L,
42L, 42L, 42L, 42L, 42L, 42L, 42L, 42L, 42L, 51L, 51L, 51L, 51L, 51L,
51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L,
51L, 51L, 51L, 51L, 52L, 52L, 52L, 52L, 52L, 52L, 52L, 52L, 52L, 52L,
52L, 52L, 52L, 52L, 52L, 52L, 52L, 52L, 52L, 52L, 52L, 52L, 53L, 53L,
53L, 53L, 53L, 53L, 53L, 53L, 53L, 53L, 53L, 53L, 53L, 53L, 53L, 53L,
53L, 53L, 53L, 53L, 53L, 53L, 54L, 54L, 54L, 54L, 54L, 54L, 54L, 54L,
54L, 54L, 54L, 54L, 54L, 54L, 54L, 54L, 54L, 54L, 54L, 54L, 54L, 54L,
70L, 70L, 70L, 70L, 70L, 70L, 70L, 70L, 70L, 70L, 70L, 70L, 70L, 70L,
70L, 70L, 70L, 70L, 70L, 70L, 70L, 70L, 80L, 80L, 80L, 80L, 80L, 80L,
80L, 80L, 80L, 80L, 80L, 80L, 80L, 80L, 80L, 80L, 80L, 80L, 80L, 80L,
80L, 80L, 90L, 90L, 90L, 90L, 90L, 90L, 90L, 90L, 90L, 90L, 90L, 90L,
90L, 90L, 90L, 90L, 90L, 90L, 90L, 90L, 90L, 90L), year = c(1990L,
1991L, 1992L, 1993L, 1994L, 1995L, 1996L, 1997L, 1998L, 1999L, 2000L,
2001L, 2002L, 2003L, 2004L, 2005L, 2006L, 2007L, 2008L, 2009L, 2010L,
2011L, 1990L, 1991L, 1992L, 1993L, 1994L, 1995L, 1996L, 1997L, 1998L,
1999L, 2000L, 2001L, 2002L, 2003L, 2004L, 2005L, 2006L, 2007L, 2008L,
2009L, 2010L, 2011L, 1990L, 1991L, 1992L, 1993L, 1994L, 1995L, 1996L,
1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L, 2004L, 2005L, 2006L,
2007L, 2008L, 2009L, 2010L, 2011L, 1990L, 1991L, 1992L, 1993L, 1994L,
1995L, 1996L, 1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L, 2004L,
2005L, 2006L, 2007L, 2008L, 2009L, 2010L, 2011L, 1990L, 1991L, 1992L,
1993L, 1994L, 1995L, 1996L, 1997L, 1998L, 1999L, 1999L, 2000L, 2001L,
2002L, 2003L, 2004L, 2005L, 2006L, 2007L, 2008L, 2009L, 2010L, 2011L,
1990L, 1991L, 1992L, 1993L, 1994L, 1995L, 1996L, 1997L, 1998L, 1999L,
2000L, 2001L, 2002L, 2003L, 2004L, 2005L, 2006L, 2007L, 2008L, 2009L,
2010L, 2011L, 1990L, 1991L, 1992L, 1993L, 1994L, 1995L, 1996L, 1997L,
1998L, 1999L, 2000L, 2001L, 2002L, 2003L, 2004L, 2005L, 2006L, 2007L,
2008L, 2009L, 2010L, 2011L, 1990L, 1991L, 1992L, 1993L, 1994L, 1995L,
1996L, 1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L, 2004L, 2005L,
2006L, 2007L, 2008L, 2009L, 2010L, 2011L, 1990L, 1991L, 1992L, 1993L,
1994L, 1995L, 1996L, 1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L,
2004L, 2005L, 2006L, 2007L, 2008L, 2009L, 2010L, 2011L, 1990L, 1991L,
1992L, 1993L, 1994L, 1995L, 1996L, 1997L, 1998L, 1999L, 2000L, 2001L,
2002L, 2003L, 2004L, 2005L, 2006L, 2007L, 2008L, 2009L, 2010L, 2011L,
1990L, 1991L, 1992L, 1993L, 1994L, 1995L, 1996L, 1997L, 1998L, 1999L,
2000L, 2001L, 2002L, 2003L, 2004L, 2005L, 2006L, 2007L, 2008L, 2009L,
2010L, 2011L), country = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L,
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L), .Label = c("Bahamas", "Barbados",
"Belize", "Cuba", "Dominica", "Dominican Republic", "Guatemala",
"Haiti", "Jamaica", "Mexico", "Trinidad and Tobago"), class =
"factor"),
submission = c(1L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L,
1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 0L,
1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L,
1L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L,
0L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L,
0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 1L,
0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L,
1L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 1L,
0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 0L,
1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L,
0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 0L,
0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L,
1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L,
0L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 0L,
1L, 0L, 1L, 0L, 1L, 0L, 0L), routineyear = c(1L, 0L, 0L,
1L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L,
0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L,
1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 1L,
0L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 0L,
0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 1L,
0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L,
1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L,
0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L,
0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L,
0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L
)), .Names = c("ccode", "year", "country", "submission", "routineyear"), class = "data.frame", row.names = c(NA, -243L ))
Using data.table:
library(data.table)
# Convert DF in place, then derive Y by reference:
#   0 outside routine years;
#   within routine years, 1 when the previous routine year of the country had
#   a submission (or there is no previous routine year), 2 when it had none.
setDT(DF)
DF[, Y := 0]
DF[routineyear == 1,
   Y := 1 + (shift(submission, fill = 1) == 0),
   by = country]
DF[]
which gives (first 15 rows shown):
> DF
ccode year country submission routineyear Y
1: 31 1990 Bahamas 1 1 1
2: 31 1991 Bahamas 0 0 0
3: 31 1992 Bahamas 0 0 0
4: 31 1993 Bahamas 0 1 1
5: 31 1994 Bahamas 0 0 0
6: 31 1995 Bahamas 1 0 0
7: 31 1996 Bahamas 0 0 0
8: 31 1997 Bahamas 1 1 2
9: 31 1998 Bahamas 0 0 0
10: 31 1999 Bahamas 1 1 1
11: 31 2000 Bahamas 0 0 0
12: 31 2001 Bahamas 1 1 1
13: 31 2002 Bahamas 0 0 0
14: 31 2003 Bahamas 1 1 1
15: 31 2004 Bahamas 0 0 0
........
What this does:
setDT(DF) converts your dataframe to a data.table
Y := 0 sets Y to 0 by reference first
Filter for routineyear == 1
Update Y by reference such that Y is set to 1 if previous submission is 1 and to 2 if previous submission is 0
library(dplyr)
# Endogenous lag per country: for each routine year, look at the submission
# value of the previous routine year (1 -> Y = 1, 0 -> Y = 2); the first
# routine year of a country defaults to Y = 1 and every other year gets Y = 0.
# Y is computed in place within each country, so the rows keep the order of
# dat2 regardless of the dplyr version; joining the routine-year rows back
# with right_join() does not guarantee that order.
# Rows are assumed to be sorted by year within country.
dat2 %>%
  select(-Y) %>%
  group_by(country) %>%
  mutate(
    Y = replace(
      rep(0L, n()),
      routineyear %in% 1L,
      2L - lag(submission[routineyear %in% 1L], default = 1L)
    )
  ) %>%
  ungroup() %>%
  mutate(Y = replace(Y, is.na(Y), 0L))
# # A tibble: 14 x 5
# country year submission routineyear Y
# <fct> <int> <int> <int> <int>
# 1 A 1990 1 1 1
# 2 A 1991 0 0 0
# 3 A 1992 1 1 1
# 4 A 1993 1 0 0
# 5 A 1994 0 0 0
# 6 A 1995 0 0 0
# 7 A 1996 0 1 1
# 8 A 1997 0 0 0
# 9 A 1998 1 1 2
# 10 A 1999 0 0 0
# 11 A 2000 0 0 0
# 12 A 2001 0 1 1
# 13 A 2002 0 0 0
# 14 A 2003 1 1 2
all.equal(.Last.value, dat2)
# [1] TRUE
where dat2 is:
dat2 <- read.table(text =
"country year submission routineyear Y
A 1990 1 1 1
A 1991 0 0 0
A 1992 1 1 1
A 1993 1 0 0
A 1994 0 0 0
A 1995 0 0 0
A 1996 0 1 1
A 1997 0 0 0
A 1998 1 1 2
A 1999 0 0 0
A 2000 0 0 0
A 2001 0 1 1
A 2002 0 0 0
A 2003 1 1 2
", header = TRUE)
I have data on countries and want to summarize it and create a table.
> head(data)
country year score members
A 1989 0 7
A 1990 0 7
A 1991 0 7
A 1992 0 7
A 1993 0 7
A 1994 0 7
The table should show the relationship between country "score" and the number of "members" – put differently, I want to see how many states with score 0,1 or 2 have "members"(ranging from 1 to 7).
I want to set it like this:
score members==1 members==2 members==3 members==4 members==5 members==6 members==7
0 1 0
1 2 0
2 0 1 and so on..
To do this I run the following:
library(dplyr)
# For each score, count how many rows (country-year observations) have each
# `members` value from 1 to 7.
# NOTE(review): the result is stored under the name `table`, the same name as
# base R's table() function.
table <- data %>%
group_by(score) %>%
summarise(
m1 = sum(members==1, na.rm=TRUE),
m2 = sum(members==2, na.rm=TRUE),
m3 = sum(members==3, na.rm=TRUE),
m4 = sum(members==4, na.rm=TRUE),
m5 = sum(members==5, na.rm=TRUE),
m6 = sum(members==6, na.rm=TRUE),
m7 = sum(members==7, na.rm=TRUE)
)
This gives:
score m1 m2 m3 m4 m5 m6 m7
0 0 2 0 0 0 3 30
1 15 3 11 11 3 18 3
2 3 0 2 2 0 6 9
.
.
I need a little help here. As you see it has calculated the total number of observations, whereas I want to count each country only once.
How do I summarize this data to have the total number of countries for each members-level?
Here's a sample of my data for reproducibility:
data <-
structure(list(country = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L), .Label = c("A", "B", "C", "D", "E", "F"), class = "factor"),
year = c(1989L, 1990L, 1991L, 1992L, 1993L, 1994L, 1995L,
1996L, 1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L, 2004L,
2005L, 2006L, 2007L, 2008L, 2010L, 1989L, 1990L, 1991L, 1992L,
1993L, 1994L, 1995L, 1996L, 1997L, 1998L, 1999L, 2000L, 2001L,
2002L, 2003L, 2004L, 2005L, 2006L, 2007L, 2008L, 2009L, 2010L,
2011L, 1989L, 1991L, 1993L, 1994L, 1995L, 1996L, 1997L, 1999L,
2000L, 2001L, 2002L, 2003L, 2004L, 2005L, 2006L, 2007L, 2008L,
2010L, 1989L, 1990L, 1991L, 1992L, 1993L, 1994L, 1995L, 1996L,
1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L, 2004L, 2005L,
2006L, 2007L, 2008L, 2009L, 2010L, 2011L, 1991L, 1992L, 1993L,
1994L, 1995L, 1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L,
2004L, 2005L, 2006L, 2007L, 2008L, 2010L, 1991L, 1992L, 1993L,
1994L, 1995L, 1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L,
2004L, 2005L, 2006L, 2007L, 2008L, 2010L), score = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 1L,
1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 2L, 2L,
2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
2L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), members = c(7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L,
4L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L)), .Names = c("country", "year", "score",
"members"), class = "data.frame", row.names = c(NA, -121L))
I believe you need this:
library(reshape2)
# Step 1: number of distinct countries for every observed score/members pair.
country_counts <- aggregate(
  country ~ score + members,
  data = data,
  FUN = function(x) length(unique(x))
)
# Step 2: one column per members value; unobserved pairs become 0.
dcast(country_counts, score ~ members, value.var = "country", fill = 0L)
# score 1 2 3 4 5 6 7
#1 0 0 1 0 0 0 1 2
#2 1 1 1 2 2 1 3 2
#3 2 1 0 1 2 0 1 1
Or, to put it the dplyr/tidyr way:
# Count the distinct countries per (members, score) group, then give every
# members value its own column; combinations that do not occur become 0L.
data %>%
  group_by(members, score) %>%
  summarise(n = n_distinct(country)) %>%
  spread(key = members, value = n, fill = 0L)
## A tibble: 3 x 8
# score 1 2 3 4 5 6 7
#* <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 0 0 1 0 0 0 1 2
#2 1 1 1 2 2 1 3 2
#3 2 1 0 1 2 0 1 1
As the OP is using dplyr methods, we can do this by grouping with 'score', 'members' to get the number of elements (n()), and then spread (from tidyr) to reshape it to 'wide' format.
library(dplyr)
library(tidyr)
# Tally the rows in each (score, members) group, prefix the members value
# with "m", and reshape so that each members value becomes a column
# (combinations that do not occur are filled with 0).
data %>%
  group_by(score, members) %>%
  summarise(n = n()) %>%
  mutate(members = paste0("m", members)) %>%
  spread(key = members, value = n, fill = 0)
# score m1 m2 m3 m4 m5 m6 m7
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 0 0 2 0 0 0 3 30
#2 1 15 3 11 11 3 18 3
#3 2 3 0 2 2 0 6 9
If we need to also get the counts by 'country', just add 'country' in the group_by
# Same tally as the (score, members) version, but additionally split by
# country: rows are counted per (country, score, members) group before the
# members values are spread into "m"-prefixed columns.
data %>%
  group_by(country, score, members) %>%
  summarise(n = n()) %>%
  mutate(members = paste0("m", members)) %>%
  spread(key = members, value = n, fill = 0)
If the expected output is the one shown in the other posts, an option using data.table would be to convert the 'data.frame' to a 'data.table' (setDT(data)), and dcast from 'long' to 'wide', specifying the fun.aggregate as uniqueN of the 'value.var' variable, i.e. 'country', where uniqueN returns the number of unique elements in the 'country' column. The fill=0 fills in 0 for those combinations that are not available; by default, they are returned as NA.
library(data.table)
# Convert `data` to a data.table and cast it to wide form: one row per score,
# one column per members value, each cell holding the number of unique
# countries (uniqueN); combinations that do not occur are filled with 0.
dcast(
  setDT(data),
  score ~ members,
  value.var = "country",
  fun.aggregate = uniqueN,
  fill = 0
)
# score 1 2 3 4 5 6 7
#1: 0 0 1 0 0 0 1 2
#2: 1 1 1 2 2 1 3 2
#3: 2 1 0 1 2 0 1 1
It seems the crux of the issue is having the duplicated rows for each year? In which case you can remove them with distinct, then it's a simple crosstab. You could use the %$% exposition pipe from magrittr:
library(dplyr)
library(magrittr)
# Keep one row per (country, score, members) combination, then expose the
# remaining columns to table() via the %$% pipe to cross-tabulate
# score against members.
data %>%
  distinct(country, score, members) %$%
  table(score, members)
members
score 1 2 3 4 5 6 7
0 0 1 0 0 0 1 2
1 1 1 2 2 1 3 2
2 1 0 1 2 0 1 1
Or a regular pipe and tabyl from the janitor package:
library(dplyr)
library(janitor)
# Keep one row per (country, score, members) combination, then build the
# score-by-members cross-tabulation with janitor's tabyl().
data %>%
  distinct(country, score, members) %>%
  tabyl(score, members)
score 1 2 3 4 5 6 7
0 0 1 0 0 0 1 2
1 1 1 2 2 1 3 2
2 1 0 1 2 0 1 1
I'm having trouble writing a for loop in which I ask glmulti to find the best model. I have the following example data set:
dput(Data)
structure(list(Studbook.ID = structure(c(16L, 16L, 16L, 16L,
16L, 16L, 16L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L,
17L, 17L, 17L, 17L, 30L, 30L, 30L), .Label = c("230", "298",
"308", "329", "357", "358", "374", "382", "385", "394", "397",
"399", "404", "413", "414", "418", "432", "433", "434", "437",
"439", "444", "446", "455", "458", "460", "473", "475", "476",
"477", "478", "492", "495", "496", "499", "503"), class = "factor"),
Season = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = c("Breeding", "Nonbreeding"), class = "factor"),
Year = c(1999L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L,
2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L,
2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L,
2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L,
2000L, 2000L, 2000L, 2000L, 2000L, 2000L), Age.Class = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("Adult",
"Sub-Adult"), class = "factor"), Sex = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Female", "Male"
), class = "factor"), Captive_Wild = structure(c(2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Captive", "Wild"
), class = "factor"), C.SA.F = c(0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), C.HA.F = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), W.MW.F = c(0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L
), W.MW.DUR = c(0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 0L,
1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L),
C.CHEW.F = c(0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 0L,
1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 1L)), .Names =c("Studbook.ID",
"Season", "Year", "Age.Class", "Sex", "Captive_Wild", "C.SA.F",
"C.HA.F", "W.MW.F", "W.MW.DUR", "C.CHEW.F"), row.names = c(NA,
40L), class = "data.frame")
Code for my total loop is:
# lmer wrapper for use as glmulti's fitting function.
# The fixed-effects `formula` is turned into text, the random-effects term
# supplied as a string in `random` is appended, and the combined model
# specification is fitted with lmer() on `data`. Extra arguments in `...`
# are forwarded to lmer(); `family` is accepted but not used in the body.
lmer.glmulti <- function(formula, data, family = binomial, random, ...) {
  model_spec <- paste(deparse(formula), random)
  lmer(model_spec, data = data, ...)
}
# Make the list of dependent variables for the loop: every column of Bamboo
# except the first six (in the example data those are Studbook.ID, Season,
# Year, Age.Class, Sex and Captive_Wild).
dep_list <- colnames(Bamboo)
dep_list <- dep_list[-c(1:6)]
outglm <- c()
outdesc <- c()
# Start loop coding: one glmulti model search and one output file per
# dependent variable.
for (depend in dep_list) {
  # NOTE(review): `y` is a loop-local vector rather than a column of Bamboo,
  # yet the formula below refers to it by name while `data = Bamboo` is
  # passed; this is the call that raises the error quoted after this code.
  y <- Bamboo[, depend]
  # glmulti loop
  # glm full model (substitute behavioral variables in place of 'depend')
  glmmod <- y ~ Captive_Wild + Sex + Age.Class + Season
  glm.glmulti <- glmulti(glmmod, random = "+(1|Studbook.ID)", data = Bamboo,
                         fitfunc = lmer.glmulti, family = binomial, level = 2)
  # Make and print table for final best model. The fitted models live in the
  # `objects` slot of the glmulti result, so it must be accessed with `@`.
  htmlreg(glm.glmulti@objects[[1]], file = paste0(depend, ".doc"),
          caption = depend, caption.above = TRUE)
}
It's hanging up on the glmulti code where it gives me this error:
Error in model.frame.default(as.formula(paste(y, "~", paste(x, sep = "", :
variable lengths differ (found for 'Captive_Wild')
And traceback looks like:
8 model.frame.default(as.formula(paste(y, "~", paste(x, sep = "",
collapse = "+"), sep = "")), data = data)
7 model.frame(as.formula(paste(y, "~", paste(x, sep = "", collapse = "+"),
sep = "")), data = data)
6 glmulti(y = "y", data = Bamboo, level = 2, fitfunction = lmer.glmulti,
random = "+(1|Studbook.ID)", xr = c("Sex", "Season"), exclude = 1)
5 glmulti(y = "y", data = Bamboo, level = 2, fitfunction = lmer.glmulti,
random = "+(1|Studbook.ID)", xr = c("Sex", "Season"), exclude = 1)
4 eval(expr, envir, enclos)
3 eval(call)
2 glmulti(y ~ Sex + Season, random = "+(1|Studbook.ID)", data = Bamboo,
fitfunc = lmer.glmulti, level = 2)
1 glmulti(y ~ Sex + Season, random = "+(1|Studbook.ID)", data = Bamboo,
fitfunc = lmer.glmulti, level = 2)
I've also tried running the variables through by hand one-by-one, and then glmulti works just fine. When I remove Captive_Wild (which of course I don't want to do), it gives me the same error with Sex, and ditto with Season. I've checked all variable lengths and they are the same.
This implies to me that glmulti is having a problem with the for loop somewhere but I'm not sure where. Can anyone suggest fixes? This is my first attempt at for loops so any and all help would be much appreciated!
After a little more fooling around I found two problems in the code:
1) The lmer wrapper is outdated, so a glmer wrapper needs to be defined and used instead:
# glmer wrapper for use as glmulti's fitting function.
#
# Arguments:
#   formula  fixed-effects model formula handed over by the caller.
#   data     data frame the model is fitted on.
#   family   error family forwarded to glmer() (default: binomial).
#   random   random-effects term as text, e.g. "+(1|Studbook.ID)"; it is
#            appended to the deparsed fixed-effects formula.
#   ...      further arguments passed on to glmer().
# Returns whatever glmer() returns for the combined model.
glmer.glmulti <- function(formula, data, family = binomial, random, ...) {
  # deparse() may split a long formula over several strings; rejoin them so
  # that exactly one model specification is built.
  fixed_part <- paste(deparse(formula), collapse = " ")
  # `family` is a named parameter of this wrapper, so it never reaches `...`;
  # it has to be forwarded explicitly or glmer() would not see it.
  glmer(paste(fixed_part, random), data = data, family = family, ...)
}
and 2) using the alternate form of calling glmulti in the for-loop like so:
# For every dependent variable name in dep_list, run glmulti using the
# character interface (response name plus a vector of predictor names) so the
# response is looked up as a column of Bamboo, then write the first model in
# the result's `objects` slot to "<depend>.doc".
for (depend in dep_list) {
  glm.glmulti <- glmulti(
    depend,
    c("Captive_Wild", "Sex", "Age.Class", "Season"),
    random = "+(1|Studbook.ID)",
    data = Bamboo,
    # Use the glmer-based wrapper defined just before this loop instead of the
    # older lmer one; the argument name is spelled out in full.
    fitfunction = glmer.glmulti,
    family = binomial,
    level = 2
  )
  # Make and print table for final best model (`@` accesses the S4 slot).
  htmlreg(glm.glmulti@objects[[1]], file = paste0(depend, ".doc"),
          caption = depend, caption.above = TRUE)
}