Count combinations of two variables excluding rows that repeat ID - r

I have a data on countries and want to summarize it and create a table.
> head(data)
country year score members
A 1989 0 7
A 1990 0 7
A 1991 0 7
A 1992 0 7
A 1993 0 7
A 1994 0 7
The table should show the relationship between country "score" and the number of "members" – put differently, I want to see how many states with score 0,1 or 2 have "members"(ranging from 1 to 7).
I want to set it like this:
score members==1 members==2 members==3 members==4 members==5 members==6 members==7
0 1 0
1 2 0
2 0 1 and so on..
To do this I run the following:
library(dplyr)
table <- data %>%
group_by(score) %>%
summarise(
m1 = sum(members==1, na.rm=TRUE),
m2 = sum(members==2, na.rm=TRUE),
m3 = sum(members==3, na.rm=TRUE),
m4 = sum(members==4, na.rm=TRUE),
m5 = sum(members==5, na.rm=TRUE),
m6 = sum(members==6, na.rm=TRUE),
m7 = sum(members==7, na.rm=TRUE)
)
This gives:
score m1 m2 m3 m4 m5 m6 m7
0 0 2 0 0 0 3 30
1 15 3 11 11 3 18 3
2 3 0 2 2 0 6 9
.
.
I need a little help here. As you see it has calculated the total number of observations, whereas I want to count each country only once.
How do I summarize this data to have the total number of countries for each members-level?
Here's a sample of my data for reproducibility:
data <-
structure(list(country = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L), .Label = c("A", "B", "C", "D", "E", "F"), class = "factor"),
year = c(1989L, 1990L, 1991L, 1992L, 1993L, 1994L, 1995L,
1996L, 1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L, 2004L,
2005L, 2006L, 2007L, 2008L, 2010L, 1989L, 1990L, 1991L, 1992L,
1993L, 1994L, 1995L, 1996L, 1997L, 1998L, 1999L, 2000L, 2001L,
2002L, 2003L, 2004L, 2005L, 2006L, 2007L, 2008L, 2009L, 2010L,
2011L, 1989L, 1991L, 1993L, 1994L, 1995L, 1996L, 1997L, 1999L,
2000L, 2001L, 2002L, 2003L, 2004L, 2005L, 2006L, 2007L, 2008L,
2010L, 1989L, 1990L, 1991L, 1992L, 1993L, 1994L, 1995L, 1996L,
1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L, 2004L, 2005L,
2006L, 2007L, 2008L, 2009L, 2010L, 2011L, 1991L, 1992L, 1993L,
1994L, 1995L, 1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L,
2004L, 2005L, 2006L, 2007L, 2008L, 2010L, 1991L, 1992L, 1993L,
1994L, 1995L, 1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L,
2004L, 2005L, 2006L, 2007L, 2008L, 2010L), score = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 1L,
1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 2L, 2L,
2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
2L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), members = c(7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L,
4L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L)), .Names = c("country", "year", "score",
"members"), class = "data.frame", row.names = c(NA, -121L))

I believe you need this:
library(reshape2)
dcast(aggregate(country~score+members, data=data, FUN=function(x) length(unique(x))),
score~members, value.var="country", fill=0L)
# score 1 2 3 4 5 6 7
#1 0 0 1 0 0 0 1 2
#2 1 1 1 2 2 1 3 2
#3 2 1 0 1 2 0 1 1
Or, to put it the dplyr/tidyr way:
data %>%
group_by(members, score) %>%
summarise(n=n_distinct(country)) %>%
spread(members, n, fill=0L)
## A tibble: 3 x 8
# score 1 2 3 4 5 6 7
#* <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 0 0 1 0 0 0 1 2
#2 1 1 1 2 2 1 3 2
#3 2 1 0 1 2 0 1 1

As the OP is using dplyr methods, we can do this by grouping with 'score', 'members' to get the number of elements (n()), and then spread (from tidyr) to reshape it to 'wide' format.
library(dplyr)
library(tidyr)
data %>%
group_by(score, members) %>%
summarise(n = n()) %>%
mutate(members = paste0("m", members)) %>%
spread(members, n, fill = 0)
# score m1 m2 m3 m4 m5 m6 m7
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 0 0 2 0 0 0 3 30
#2 1 15 3 11 11 3 18 3
#3 2 3 0 2 2 0 6 9
If we need to also get the counts by 'country', just add 'country' in the group_by
data %>%
group_by(country, score, members) %>%
summarise(n = n()) %>%
mutate(members = paste0("m", members)) %>%
spread(members, n, fill = 0)
If the expected output is the one showed in the other posts, an option using data.table would be to convert the 'data.frame' to 'data.table' (setDT(data), and dcast from 'long' to 'wide' specifying the fun.aggregate as uniqueN of the 'value.var' variable i.e. 'country' where uniqueN returns the length of unique elements in the 'country' column. The fill=0 specifies to occupy 0 for those combinations that are not available. By default, it returns as NA.
library(data.table)
dcast(setDT(data), score~members, value.var= 'country', fun.aggregate = uniqueN, fill = 0)
# score 1 2 3 4 5 6 7
#1: 0 0 1 0 0 0 1 2
#2: 1 1 1 2 2 1 3 2
#3: 2 1 0 1 2 0 1 1

It seems the crux of the issue is having the duplicated rows for each year? In which case you can remove them with distinct, then it's a simple crosstab. You could use the %$% exposition pipe from magrittr:
library(dplyr)
library(magrittr)
data %>%
distinct(country, score, members) %$%
table(score, members)
members
score 1 2 3 4 5 6 7
0 0 1 0 0 0 1 2
1 1 1 2 2 1 3 2
2 1 0 1 2 0 1 1
Or a regular pipe and tabyl from the janitor package:
library(dplyr)
library(janitor)
data %>%
distinct(country, score, members) %>%
tabyl(score, members)
score 1 2 3 4 5 6 7
0 0 1 0 0 0 1 2
1 1 1 2 2 1 3 2
2 1 0 1 2 0 1 1

Related

speed up modelling of subgroups in large data frame

I need to perform an analysis with glmer on many different subgroups of a large dataset and only extract the estimate and z-value of each model. This works perfectly fine if I only use a small subset of my data (or some dummy data, as attached below), but when I try to include the whole data set, it takes forever. Currently I am using this bit of code:
slope_range <- df %>%
group_by(region, year, species) %>%
summarise(slope = coef(summary(glmer(presence ~ transect + (1 | road), family = "binomial")))[2],
p_val = coef(summary(glmer(presence ~ transect + (1 | road), family = "binomial")))[6])
As I said, this works fine, but very slow on a large data set. I'm aware that I could also just write multiple loops, but I assume this would take even longer. Does anyone have a better solution of what could be done to make it faster? Thanks!
Dummy data:
> dput(df)
structure(list(region = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("ARG", "CHE"), class = "factor"),
transect = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L), presence = c(1L, 1L,
1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 0L,
0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 1L,
0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L,
1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 0L,
1L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L,
1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L,
0L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L,
0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L,
0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L), year = c(2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L), species = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("a", "b"), class = "factor"),
road = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("FG", "MK", "PL", "XY"), class = "factor")), class = "data.frame", row.names = c(NA,
-160L))
You are calling coef(summary(glmer(...))) twice for each group, so you can cut the execution time roughly in half by fitting the model and extracting the coefficients once for each group. The following code will extract all the coefficients and their Z and p-values, not just the two values you specified, which I think is preferable if you might end up needing them later. Of course it can be easily modified to discard the other coefficients and keep only the two you specified.
code
library(tidyverse)
library(lme4)
df %>%
group_by(region, year, species) %>%
group_modify(~ data.frame(variable = c('Intercept', 'transect'),
coef(summary(glmer(presence ~ transect + (1 | road), family = "binomial", data = .)))))
output
# A tibble: 16 x 8
# Groups: region, year, species [8]
region year species variable Estimate Std..Error z.value Pr...z..
<fct> <int> <fct> <fct> <dbl> <dbl> <dbl> <dbl>
1 ARG 2007 a Intercept 6.11 2.81 2.17 0.0300
2 ARG 2007 a transect -0.743 0.361 -2.06 0.0398
3 ARG 2007 b Intercept 1.91 1.22 1.57 0.116
4 ARG 2007 b transect -0.396 0.208 -1.90 0.0570
5 ARG 2017 a Intercept 3.95 1.73 2.28 0.0223
6 ARG 2017 a transect -0.654 0.275 -2.38 0.0174
7 ARG 2017 b Intercept 2.44 1.33 1.83 0.0668
8 ARG 2017 b transect -0.396 0.208 -1.90 0.0570
9 CHE 2007 a Intercept 3.95 1.73 2.28 0.0223
10 CHE 2007 a transect -0.654 0.275 -2.38 0.0174
11 CHE 2007 b Intercept 2.44 1.33 1.83 0.0668
12 CHE 2007 b transect -0.396 0.208 -1.90 0.0570
13 CHE 2017 a Intercept 6.11 2.81 2.17 0.0300
14 CHE 2017 a transect -0.743 0.361 -2.06 0.0398
15 CHE 2017 b Intercept 1.91 1.22 1.57 0.116
16 CHE 2017 b transect -0.396 0.208 -1.90 0.0570
You could use a parallel approach as suggested earlier, e.g. with parallel::mclapply (on my 6-core machine using more than 4 cores gave only marginal improvements, though).
You could speed up glmer using nAGQ=0, at the cost of precision (see https://stats.stackexchange.com/questions/132841/default-lme4-optimizer-requires-lots-of-iterations-for-high-dimensional-data).
Example code with benchmarks:
invisible(lapply(c("lme4", "data.table", "tidyverse", "parallel", "microbenchmark"),
require, character.only = TRUE))
#> Loading required package: lme4
#> Loading required package: Matrix
#> Loading required package: data.table
#> Loading required package: tidyverse
#> Loading required package: parallel
#> Loading required package: microbenchmark
df <- structure(list(region = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("ARG", "CHE"), class = "factor"),
transect = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L), presence = c(1L, 1L,
1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 0L,
0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 1L,
0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L,
1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 0L,
1L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L,
1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L,
0L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L,
0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L,
0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L), year = c(2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L), species = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("a", "b"), class = "factor"),
road = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("FG", "MK", "PL", "XY"), class = "factor")), class = "data.frame", row.names = c(NA,
-160L))
## Your function for comparison
tidy_fun <- function(){
df %>%
group_by(region, year, species) %>%
summarise(slope = coef(summary(glmer(presence ~ transect + (1 | road), family = "binomial")))[2],
p_val = coef(summary(glmer(presence ~ transect + (1 | road), family = "binomial")))[6])
}
gf2 <- function(presence, transect, road, nAGQ = 1L) {
res <- coef(summary(glmer(presence ~ transect + (1 | road), family = "binomial", nAGQ=nAGQ)))
return(data.table(slope=res[2], p_val=res[6]))
}
parLM <- function(mc.cores=4L, nAGQ=1L){
DT <- data.table(df, key = c("region","year","species"))
iDT <- DT[,by=.(region, year, species),.(irange=.(range(.I)))]
result <- mclapply(seq(nrow(iDT)),
function(x) DT[do.call(seq, as.list(iDT[x, irange][[1]])),
.(gf2(presence, transect, road, nAGQ=nAGQ))], mc.cores=mc.cores)
return(cbind(iDT, rbindlist(result))[,-4])
}
microbenchmark(
original = suppressMessages(tidy_fun()),
multicore = parLM(mc.cores = 4L, nAGQ = 1L),
singlecore.nAGQ0 = parLM(mc.cores = 1L, nAGQ = 0L),
multicore.nAGQ0 = parLM(mc.cores = 4L, nAGQ = 0L),
times=10L)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> original 898.2732 925.0621 963.7452 940.9577 973.0648 1157.0030 10
#> multicore 319.1234 334.4151 347.8024 344.1370 362.6539 373.8189 10
#> singlecore.nAGQ0 237.4782 245.4084 262.6290 268.1308 274.8516 280.7944 10
#> multicore.nAGQ0 132.3356 132.9963 137.2777 135.8659 141.5145 144.2564 10
#> cld
#> d
#> c
#> b
#> a

adding rows with values of '0' for missing years

I have a question that is somewhat similar to others that have been posted, but after looking thoroughly at several posts, I can't get the code to work. Any help would be much appreciated.
My data frame looks like, this:
'data.frame': 501 obs. of 5 variables:
$ Tattoo.MUM : Factor w/ 250 levels "1004","1007",..: 76 76 76 81 81 81 85 85 85 85 ...
$ OffspringMUMs: int 4 4 4 4 4 4 11 11 11 11 ...
$ YearBIRTH.CUB: int 1988 1990 1991 1988 1991 2007 1989 1991 1992 1993 ...
$ YearBIRTH.MUM: int 1991 1991 NA NA NA NA 1987 1987 1987 1987 ...
$ OFFSpYR : int 2 1 1 1 2 1 1 4 3 3 ...
A few lines here:
structure(list(Tattoo.MUM = structure(c(6L, 6L, 6L, 6L, 7L, 7L,
7L, 8L, 9L, 11L, 11L, 11L, 11L, 5L, 1L, 4L, 2L, 3L, 3L, 10L,
10L, 10L, 10L, 10L), .Label = c("10454", "1045A", "1045X", "12392",
"1601", "22", "27", "29", "41", "424X", "60"), class = "factor"),
OffspringMUMs = c(11L, 11L, 11L, 11L, 5L, 5L, 5L, 1L, 3L,
7L, 7L, 7L, 7L, 1L, 2L, 1L, 1L, 4L, 4L, 6L, 6L, 6L, 6L, 6L
), YearBIRTH.CUB = c(1989L, 1991L, 1992L, 1993L, 1990L, 1991L,
1993L, 1989L, 1988L, 1988L, 1989L, 1991L, 1994L, 2015L, 2012L,
2015L, 2005L, 2009L, 2010L, 1996L, 1998L, 2000L, 2001L, 2006L
), YearBIRTH.MUM = c(1987L, 1987L, 1987L, 1987L, NA, NA,
NA, NA, NA, 1987L, 1987L, 1987L, 1987L, NA, NA, NA, NA, 2005L,
2005L, 1994L, 1994L, 1994L, 1994L, 1994L), OFFSpYR = c(1L,
4L, 3L, 3L, 1L, 1L, 3L, 1L, 3L, 3L, 1L, 2L, 1L, 1L, 2L, 1L,
1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L)), .Names = c("Tattoo.MUM",
"OffspringMUMs", "YearBIRTH.CUB", "YearBIRTH.MUM", "OFFSpYR"), class = "data.frame", row.names = c(NA,
-24L))
I want to add new rows for all missing years (YearBIRTH.CUB) in Tattoo.MUM keeping the rest of the values the same and adding '0' to OFFSpYR.
Like so:
structure(list(Tattoo.MUM = structure(c(6L, 6L, 6L, 6L, 6L, 7L,
7L, 7L, 7L, 8L, 9L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 5L, 1L,
4L, 2L, 3L, 3L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L), .Label = c("10454", "1045A", "1045X", "12392", "1601",
"22", "27", "29", "41", "424X", "60"), class = "factor"), OffspringMUMs = c(11L,
11L, 11L, 11L, 11L, 5L, 5L, 5L, 5L, 1L, 3L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 1L, 2L, 1L, 1L, 4L, 4L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L), YearBIRTH.CUB = c(1989L, 1990L, 1991L, 1992L, 1993L,
1990L, 1991L, 1992L, 1993L, 1989L, 1988L, 1988L, 1989L, 1990L,
1991L, 1992L, 1993L, 1994L, 2015L, 2012L, 2015L, 2005L, 2009L,
2010L, 1996L, 1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L,
2004L, 2005L, 2006L), YearBIRTH.MUM = c(1987L, 1987L, 1987L,
1987L, 1987L, NA, NA, NA, NA, NA, NA, 1987L, 1987L, 1987L, 1987L,
1987L, 1987L, 1987L, NA, NA, NA, NA, 2005L, 2005L, 1994L, 1994L,
1994L, 1994L, 1994L, 1994L, 1994L, 1994L, 1994L, 1994L, 1994L
), OFFSpYR = c(1L, 0L, 4L, 3L, 3L, 1L, 1L, 0L, 3L, 1L, 3L, 3L,
1L, 0L, 2L, 0L, 0L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 0L, 1L, 0L,
1L, 2L, 0L, 0L, 0L, 0L, 1L)), .Names = c("Tattoo.MUM", "OffspringMUMs",
"YearBIRTH.CUB", "YearBIRTH.MUM", "OFFSpYR"), class = "data.frame", row.names = c(NA,
-35L))
I've tried:
library(tidyr)
library(dplyr)
df1 <- pedMUM %>% group_by(Tattoo.MUM, OffspringMUMs) %>% complete(YearBIRTH.CUB = full_seq(YearBIRTH.CUB,1)) %>% fill(OFFSpYR=0)
library(data.table)
df1 <- setDT(pedMUM)[CJ(Tattoo.MUM=Tattoo.MUM, OffspringMUMs=OffspringMUMs, YearBIRTH.MUM=YearBIRTH.MUM, YearBIRTH.CUB=seq(min(YearBIRTH.CUB), max(YearBIRTH.CUB)), unique=TRUE),
on=.(Tattoo.MUM, OffspringMUMs, YearBIRTH.CUB), roll=T]
I am obviously using tidyr, dplyr, and data.table wrongly because none have given me the results I want.
I've had a look at the following posts:
Add rows with missing years by group
Adding rows with values of "0" to a dataframe with missing data
Find missing month after grouping with dplyr
And even tried loops:
R code - clever loop to add rows
but I get confused when I try to determine the year sequence for each Tattoo.MUM within the loop.
Would anyone be able to point me in the right direction?
I haven't used complete() before, but the following seems to work. nesting() allows you to keep two variables together, =full_seq() allows you to expand the values of a variable, fill=list() allows you to fill in blanks.
pedMUM <- structure(list(Tattoo.MUM = structure(c(6L, 6L, 6L, 6L, 7L, 7L,
7L, 8L, 9L, 11L, 11L, 11L, 11L, 5L, 1L, 4L, 2L, 3L, 3L, 10L,
10L, 10L, 10L, 10L), .Label = c("10454", "1045A", "1045X", "12392",
"1601", "22", "27", "29", "41", "424X", "60"), class = "factor"),
OffspringMUMs = c(11L, 11L, 11L, 11L, 5L, 5L, 5L, 1L, 3L,
7L, 7L, 7L, 7L, 1L, 2L, 1L, 1L, 4L, 4L, 6L, 6L, 6L, 6L, 6L
), YearBIRTH.CUB = c(1989L, 1991L, 1992L, 1993L, 1990L, 1991L,
1993L, 1989L, 1988L, 1988L, 1989L, 1991L, 1994L, 2015L, 2012L,
2015L, 2005L, 2009L, 2010L, 1996L, 1998L, 2000L, 2001L, 2006L
), YearBIRTH.MUM = c(1987L, 1987L, 1987L, 1987L, NA, NA,
NA, NA, NA, 1987L, 1987L, 1987L, 1987L, NA, NA, NA, NA, 2005L,
2005L, 1994L, 1994L, 1994L, 1994L, 1994L), OFFSpYR = c(1L,
4L, 3L, 3L, 1L, 1L, 3L, 1L, 3L, 3L, 1L, 2L, 1L, 1L, 2L, 1L,
1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L)), .Names = c("Tattoo.MUM",
"OffspringMUMs", "YearBIRTH.CUB", "YearBIRTH.MUM", "OFFSpYR"), class = "data.frame", row.names = c(NA,
-24L))
library(tidyr)
library(dplyr)
df1 <- pedMUM %>%
group_by(Tattoo.MUM) %>% # find min and max year for each mum
mutate(
minyear=min(YearBIRTH.CUB, na.rm=TRUE),
maxyear=max(YearBIRTH.CUB, na.rm=TRUE)
) %>%
complete( # complete table
nesting(Tattoo.MUM, minyear, maxyear, OffspringMUMs, YearBIRTH.MUM),
YearBIRTH.CUB=full_seq(YearBIRTH.CUB, 1),
fill=list(OFFSpYR=0)
) %>%
filter(YearBIRTH.CUB>=minyear & YearBIRTH.CUB<=maxyear) %>% # remove unwanted years
select(names(pedMUM)) # return original column order

Automate coding (sum) in R

First at all I would like to apologise if I did not use the correct jargon.
I have the dataset as below which contains a wide range of categories
Here some excerpt from dput (using droplevels)
structure(list(
x = c(2010L, 2010L, 2010L, 2010L, 2010L, 2010L,
2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L,
2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L,
2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L,
2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L,
2010L, 2010L), *[ME: there are more years than 2010...]*
y = c(7.85986, 185.81068, 107.24097, 7094.74649,
1.4982, 185.77319, 5090.79354, 167.58584, 4189.64609, 157.08277,
3927.06932, 2.86732, 71.683, 4.70123, 117.53085, 2.93452, 73.36292,
1.4982, 18.18734, 901.14744, 0.90268, 13.77532, 613.38298, 0.01845,
0.0681, 7.19925, 3.75315, 0.14333, 136.54008, 0.04766, 0.59077,
28.97255, 0.38608, 115.05258, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
x1 = structure(c(4L, 2L, 3L, 1L, 4L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 4L, 2L, 1L, 4L, 2L, 1L, 4L, 2L,
1L, 2L, 4L, 1L, 4L, 2L, 1L, 4L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L), .Label = c("All greenhouse gases - (CO2 equivalent)",
"CH4", "CO2", "N2O"), class = "factor"),
x2 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Austria",
class = "factor"),
x4 = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 4L,
4L, 5L, 5L, 6L, 6L, 7L, 7L, 8L, 8L, 8L, 9L, 9L, 9L, 10L,
10L, 10L, 11L, 11L, 11L, 12L, 12L, 12L, 13L, 13L, 14L, 14L,
15L, 15L, 16L, 16L, 17L, 17L, 18L, 18L), .Label = c("3",
"3.1", "3.A", "3.A.1", "3.A.2", "3.A.3", "3.A.4", "3.B",
"3.B.1", "3.B.2", "3.B.3", "3.B.4", "3.B.5", "3.C", "3.C.1",
"3.C.2", "3.C.3", "3.C.4"), class = "factor")), class = "data.frame",
row.names = c(NA,
-44L))
I want to know whether the of the sum of subcategories in x4 (e.g. 3.B.1+3.B.2+...+3.B.n) equal the figure stated in the parent category (e.g. 3.B). (i.e. the in the csv stated sum) for a given year and country. I want to verify the sums.
For get the sum of the subcategories I have this
sum(df$y[df$x4 %in% c("3.A.1", "3.A.2", "3.A.3", "3.A.4") & x ==
"2010" & x2 == "Austria"])
To receive the sum of the parent category I have this
sum(df$y[df$x4 %in% c("3.A") & x == "2010" & x2 == "Austria"])
Next I would need an operation which checks whether the results of both codes are equal (True/false). However, I have more than 20 countries, 20 years, dozens of categories to check. With my newby approach I would be writing code for ages...
is there anyway to automate this? Basically, I am looking for a code which is able to do the following
1) Run for one category, go to next one
2) once done with categories change year and start again with categories
3) ... same for countries....
Any sort of help would be appreciated and even a suggestions how to use the right jargon in the title. Thanks in any case
Here's a potential solution using dplyr (might require some tweaking based on the full dataset):
require(dplyr)
# Create two columns - one that shows only the parent category number, and one that tells you if it's a parent or child; note that the regex here makes some assumptions on the format of your data.
mutate(df,parent=gsub("(.?\\..?)\\..*", "\\1", df$x4),
type=ifelse(parent==x4,"Parent","Child")) %>%
# Sum the children y's by category, year and country
group_by(parent, type, x, x2) %>%
summarize(sum(y)) %>%
# See if the sum of the children is equal to the parent y
tidyr::spread(type,`sum(y)`) %>%
mutate(equals=isTRUE(all.equal(Child,Parent)))
Result using your (new) data:
parent x x2 Child Parent equals
<chr> <int> <fct> <dbl> <dbl> <lgl>
1 3 2010 Austria NA 7396. FALSE
2 3.1 2010 Austria NA 5278. FALSE
3 3.A 2010 Austria 4357. 4357. TRUE
4 3.B 2010 Austria 921. 921. TRUE
5 3.C 2010 Austria 0 0 TRUE
I can see from your new data that you have two levels of parents. My solution will only work for the second level (e.g. 3.1 and its children), but can be easily tweaked to also work for the top level.

Conditional dummy for specific years in R

I am trying to generate a conditional dummy variable ”X" with the following rule
set X=1 if Y is =1, two years prior to the NA.
In other words, X=1/0 depending on [0/1=year1,0/1=year2,NA].
For example, as seen below, if the pattern for Y is 0,0,NA then the X variable is =0 for all the two years prior to the NA. If the pattern for Y is 0,1,NA or 1,0,NA then the X =1 . To be clear, if 1,1,NA then the X=1 that first specific year, it should only count once (X=1), not twice.
The code that I have now (thanks #Auréle, from my previous question here) is the closest that I have to generate it.
dat2 <- dat1 %>%
group_by(country) %>%
group_by(grp = cumsum(is.na(lag(Y))), add = TRUE) %>%
mutate(first_year_at_1 = match(1, Y) * any(is.na(Y)) * any(tail(Y, 3) == 1L),
X = {x <- integer(length(Y)) ; x[first_year_at_1] <- 1L ; x}) %>%
ungroup()
However, it doesn’t really generate what I described above. Any help here would be much appreciated.
Below you can see my sample data with the desired outcome ”X” dummy in it.
data <- structure(list(year = c(1991L, 1992L, 1993L, 1994L, 1995L, 1996L,
1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L, 2004L, 2005L,
2006L, 2007L, 2008L, 2009L, 2010L, 2011L, 1990L, 1991L, 1992L,
1993L, 1994L, 1995L, 1996L, 1997L, 1998L, 1999L, 2000L, 2001L,
2002L, 2003L, 2004L, 2005L, 2006L, 2007L, 2008L, 2009L, 2010L,
2011L, 1990L, 1991L, 1992L, 1993L, 1994L, 1995L, 1996L, 1997L,
1998L, 1999L, 2000L, 2001L, 2002L, 2003L, 2004L, 2005L, 2006L,
2007L, 2008L, 2009L, 2010L, 2011L, 1990L, 1991L, 1992L, 1993L,
1994L, 1995L, 1996L, 1997L, 1998L, 1999L, 2000L, 2001L, 2002L,
2003L, 2004L, 2005L, 2006L, 2007L, 2008L, 2009L, 2010L, 2011L,
1990L, 1991L, 1992L, 1993L, 1994L, 1995L, 1996L, 1997L, 1998L,
1999L, 1999L, 2000L, 2001L, 2002L, 2003L, 2004L, 2005L, 2006L,
2007L, 2008L, 2009L, 2010L, 2011L), country = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), .Label = c("Canada",
"Cuba", "Dominican Republic", "Haiti", "Jamaica"), class = "factor"),
Y = c(1L, NA, 1L, 1L, 1L, NA, 1L, NA, 1L, NA, 1L, NA, 1L,
1L, NA, 1L, NA, 1L, NA, 1L, NA, NA, 1L, 1L, NA, NA, 1L, NA,
1L, NA, 1L, NA, 1L, 1L, 1L, 1L, NA, 1L, NA, 1L, NA, 1L, NA,
NA, 1L, NA, 1L, 0L, 0L, 0L, 1L, NA, 0L, 1L, 0L, 0L, 0L, 0L,
0L, 1L, NA, 0L, 1L, 1L, NA, 0L, 1L, NA, 1L, NA, 1L, NA, 1L,
NA, 1L, NA, 1L, 1L, 1L, 1L, NA, 1L, NA, 1L, NA, 1L, NA, 1L,
0L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, NA, 0L, 1L, 1L, 1L,
NA, 1L, NA, 0L, 1L, 1L, NA), X = c(1L, 0L, 0L, 1L, 0L, 0L,
1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L,
0L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 0L,
0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L,
1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L,
1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L)), class = "data.frame", row.names = c(NA,
-110L))
To be honest the question is not 100% clear, but I thought I'd give it a shot, so here it goes:
data_new <- data_1 %>%
mutate(Y_2 = ifelse(is.na(Y), -1, Y)) %>%
group_by(country) %>%
mutate(X_2 = ifelse((Y_2==1 &
lead(Y_2, 1) == -1 &
(lag(Y_2,1)!=1 | is.na(lag(Y_2,1)))) |
(Y_2==1 & lead(Y_2, 2) == -1 ),
1, 0))
basically I formulated the condition as follows:
X is 1 in two cases:
if Y == 1 and Y after two years is NA
or if (Y == 1) and (Y next year is NA) and (Y on the year before is not 1)
A couple of notes:
Since we can't use NAs in comparisons, I used the column Y_2 to replace the NAs with the value -1, and then used it in the comparison
The condition (Y on the year before is not 1) also might cause problems in the first recorded row (year) of each group (country) when Y == 1, which is why I included this case also in the condition (i.e (lag(Y_2,1)!=1 | is.na(lag(Y_2,1))))
Like mentioned in the comment by #andrew_reece, the pattern you're trying to get has a lot of edge cases, only one of which is in the point above, other example might be if Y == 1 in the last couple of years for some country, how would you handle that?
Try considering a more specified description of your conditions based on the data you have
hope this helps

How to change the order of the bars in accordance with the group variable in a barplot using lattice in R?

I am making a bar plot using lattice in R where I have data for 4 different years on sources of irrigation for different states. using my code, the bar plot is coming fine but I wish the bar corresponding to the year 1996 to be plotted first followed by the bar corresponding to year 2001 etc. so as to show the increasing area being irrigated by tube-wells. However, I am unable to change the ordering. Here is my data and the R code. Many thanks for your help.
# sample data
irr_atlas <- structure(list(state = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), .Label = c("ANDHRA PRADESH",
"KARNATAKA", "MADHYA PRADESH", "RAJASTHAN"), class = "factor"),
st_code = c(28L, 28L, 28L, 28L, 28L, 28L, 28L, 28L, 28L,
28L, 28L, 28L, 28L, 28L, 28L, 28L, 29L, 29L, 29L, 29L, 29L,
29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 23L,
23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L,
23L, 23L, 23L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L), year = c(1996L, 1996L, 1996L, 1996L,
2001L, 2001L, 2001L, 2001L, 2006L, 2006L, 2006L, 2006L, 2011L,
2011L, 2011L, 2011L, 1996L, 1996L, 1996L, 1996L, 2001L, 2001L,
2001L, 2001L, 2006L, 2006L, 2006L, 2006L, 2011L, 2011L, 2011L,
2011L, 1996L, 1996L, 1996L, 1996L, 2001L, 2001L, 2001L, 2001L,
2006L, 2006L, 2006L, 2006L, 2011L, 2011L, 2011L, 2011L, 1996L,
1996L, 1996L, 1996L, 2001L, 2001L, 2001L, 2001L, 2006L, 2006L,
2006L, 2006L, 2011L, 2011L, 2011L, 2011L), irr_area = c(1.84066,
0.942819, 0.82886, 0.853502, 1.54922, 0.825659, 0.542492,
1.53412, 1.72969, 0.70271, 0.637221, 1.53894, 1.99893, 0.678425,
0.819829, 1.70708, 0.921594, 0.231669, 0.316999, 0.358529,
0.91339, 0.207157, 0.426549, 0.481061, 0.921255, 0.18192,
0.426145, 0.547193, 0.930802, 0.148065, 0.377149, 1.51843,
1.59425, 0.112145, 2.67683, 0.540054, 1.48056, 0.030502,
1.63696, 0.563948, 1.12595, 0.058667, 2.46494, 1.15004, 1.10444,
0.157069, 2.64378, 2.14177, 1.55814, 0.106623, 2.71347, 0.644683,
1.35746, 0.030586, 2.41845, 0.935234, 1.76933, 0.054374,
2.46197, 1.76918, 1.62587, 0.050299, 2.14737, 2.82708),irr_source = structure(c(1L,2L, 4L, 3L, 1L, 2L, 4L, 3L, 1L, 2L, 4L, 3L, 1L, 2L, 4L, 3L,
1L, 2L, 4L, 3L, 1L, 2L, 4L, 3L, 1L, 2L, 4L, 3L, 1L, 2L, 4L,
3L, 1L, 2L, 4L, 3L, 1L, 2L, 4L, 3L, 1L, 2L, 4L, 3L, 1L, 2L,
4L, 3L, 1L, 2L, 4L, 3L, 1L, 2L, 4L, 3L, 1L, 2L, 4L, 3L, 1L,
2L, 4L, 3L), .Label = c("Canal", "Tank", "Tube", "Well"), class = "factor")), .Names = c("state","st_code", "year", "irr_area", "irr_source"), class = "data.frame", row.names = c(NA, -64L))
Code for plot...
library(lattice)
barchart(~irr_area | factor(state) + factor(irr_source),
group=year, data=irr_atlas, auto.key=list(space="right"))
As mentioned, ordering of groups in R graphics is usually determined by the ordering of the factor variable. So, you can reorder your factors with factor and its levels argument.
library(lattice)
barchart(~irr_area | factor(state) + factor(irr_source),
group=factor(year, levels=sort(unique(year), decreasing=T)), # change the order of years
data=irr_atlas, auto.key=list(space="right"))
You can switch it back the other way by changing decreasing=F.

Resources