Percentage of recurring observations by group in R

I have a data set of firm-employee observations through time that looks like this:
data.table(firm = c(rep("A", 8), rep("B", 8)),
           employee = c(1, 2, 3, 4, 1, 2, 3, NA, 5, 6, NA, NA, 5, 6, 7, 8),
           year = c(rep(1, 4), rep(2, 4)))
firm employee year
A 1 1
A 2 1
A 3 1
A 4 1
A 1 2
A 2 2
A 3 2
A NA 2
B 5 1
B 6 1
B NA 1
B NA 1
B 5 2
B 6 2
B 7 2
B 8 2
I want to calculate the percentage of employees from year==1 that were still working in year==2, for each firm.
The output should be like this
firm year continued_employees
A 2 0.75
B 2 1
I can do it in a loop for each year, using
sum(employee[year == 1] %in% employee[year == 2]) / length(employee[year == 1])
but I have around 40k firms and 10 years of observations. Any thoughts on how to do it in dplyr or data.table syntax?
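Spelled out, the loop I have in mind looks like this on the sample data (a sketch; the name DT is used here for illustration, and NA employee codes are dropped, which is what the expected output for firm B implies):
library(data.table)
DT <- data.table(firm = c(rep("A", 8), rep("B", 8)),
                 employee = c(1, 2, 3, 4, 1, 2, 3, NA, 5, 6, NA, NA, 5, 6, 7, 8),
                 year = c(rep(1, 4), rep(2, 4)))
out <- do.call(rbind, lapply(unique(DT$firm), function(f) {
  e1 <- na.omit(DT[firm == f & year == 1, employee])
  e2 <- na.omit(DT[firm == f & year == 2, employee])
  data.frame(firm = f, year = 2,
             continued_employees = sum(e1 %in% e2) / length(e1))
}))
out
#   firm year continued_employees
# 1    A    2                0.75
# 2    B    2                1.00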

Here's a not-so-pretty data.table approach you could use for any number of firms and years:
# dt holds the OP's sample table from the question
years <- head(sort(unique(dt$year)), -1)
setNames(lapply(years, function(y) {
  dt[dt[(year == y), .(firm, employee)], on = .(firm, employee)][
    !is.na(employee), all(c(y, y + 1) %in% year), by = .(employee, firm)][,
    .(continued = mean(V1), year = y + 1), by = firm]
}), paste("Year", years, sep = "-"))
#$`Year-1`
# firm continued year
#1: A 0.75 2
#2: B 1.00 2
Since you only have two years in your sample data, you only get a single list element in return.
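If you want all year pairs in one table rather than a list, the same list can be fed to rbindlist() (a sketch; the name res_list and the period column are ours):
res_list <- setNames(lapply(years, function(y) {
  dt[dt[(year == y), .(firm, employee)], on = .(firm, employee)][
    !is.na(employee), all(c(y, y + 1) %in% year), by = .(employee, firm)][,
    .(continued = mean(V1), year = y + 1), by = firm]
}), paste("Year", years, sep = "-"))
rbindlist(res_list, idcol = "period")  # the list names become a 'period' column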

Join with shifted year
This is an approach using a kind of self join with a shifted year:
library(data.table)
options(datatable.print.class = TRUE)
# DT holds the OP's sample table from the question
DT <- data.table(firm = c(rep("A", 8), rep("B", 8)),
                 employee = c(1, 2, 3, 4, 1, 2, 3, NA, 5, 6, NA, NA, 5, 6, 7, 8),
                 year = c(rep(1, 4), rep(2, 4)))
# self join with shifted year
DT[.(firm = firm, employee = employee, year = year - 1),
   on = .(firm, employee, year), cont := TRUE][]
# aggregate
DT[!is.na(employee), sum(cont, na.rm = TRUE) / .N, by = .(firm, year = year + 1)][
  # beautify result: drop the spurious year beyond the data range
  year <= max(DT$year)]
firm year V1
<char> <num> <num>
1: A 2 0.75
2: B 2 1.00
The first expression modifies DT to indicate continued employees:
firm employee year cont
<char> <num> <num> <lgcl>
1: A 1 1 TRUE
2: A 2 1 TRUE
3: A 3 1 TRUE
4: A 4 1 NA
5: A 1 2 NA
6: A 2 2 NA
7: A 3 2 NA
8: A NA 2 NA
9: B 5 1 TRUE
10: B 6 1 TRUE
11: B NA 1 NA
12: B NA 1 NA
13: B 5 2 NA
14: B 6 2 NA
15: B 7 2 NA
16: B 8 2 NA
Using shift()
Alternatively, the shift() function can be used to compute the cont column. The aggregation part is the same as with the join approach above. shift() requires the data to be ordered by year, hence the order(year).
DT[order(year), cont := shift(year, type = "lead") == year + 1, by = .(firm, employee)][
  !is.na(employee), sum(cont, na.rm = TRUE) / .N, by = .(firm, year = year + 1)][
  year <= max(DT$year)]
Benchmark
At the time of writing, three approaches have been proposed in addition to OP's own attempt using loops:
by docendo discimus
join with shifted year
using shift()
Jean Vuda's answer is not considered in the benchmark, as it is limited to two years.
According to the OP, the production data set consists of 40 k firms and 10 years of data. For a realistic benchmark, a sample data set of similar size is created:
n_firm <- 40000L
max_employee <- 10L
fluctuation_rate <- 0.2
n_year <- 10L
start_year <- 2001L
DT0 <- CJ(firm = sprintf("%06i", seq_len(n_firm)),
          employee = seq_len(max_employee),
          year = seq(start_year, length.out = n_year))
set.seed(123L)
n_row <- nrow(DT0)
DT0[sample.int(n_row, fluctuation_rate * n_row), employee := NA]
The sample data set consists of 4 M rows and can be visualised best after reshaping from long to wide format:
dcast(DT0[!is.na(employee)], firm + employee ~ year)
Using 'year' as value column. Use 'value.var' to override
firm employee 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010
<char> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1: 000001 1 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010
2: 000001 2 2001 2002 2003 NA 2005 2006 2007 NA 2009 NA
3: 000001 3 2001 2002 NA NA 2005 2006 2007 2008 2009 2010
4: 000001 4 2001 NA NA NA 2005 2006 2007 2008 NA 2010
---
399996: 040000 6 2001 2002 NA 2004 2005 NA NA NA 2009 2010
399997: 040000 7 NA 2002 NA NA 2005 2006 2007 2008 2009 2010
399998: 040000 8 2001 2002 2003 NA NA NA 2007 NA NA 2010
399999: 040000 9 2001 2002 2003 NA 2005 2006 2007 2008 2009 NA
400000: 040000 10 2001 2002 2003 NA NA 2006 2007 2008 2009 2010
For benchmarking, the microbenchmark package is used because a check function can be passed to verify that the results are identical. The check reshapes each result to a common wide format first, since the approaches return their rows in different orders:
my_check <- function(values) {
  values <- lapply(values, function(x) x[, dcast(.SD, firm ~ year, value.var = "continued")])
  all(sapply(values[-1], function(x) identical(values[[1]], x)))
}
The benchmark code:
microbenchmark::microbenchmark(
  dd = {
    dt <- copy(DT0)
    years <- head(sort(unique(dt$year)), -1)
    rbindlist(
      setNames(lapply(years, function(y) {
        dt[dt[(year == y), .(firm, employee)], on = .(firm, employee)][
          !is.na(employee), all(c(y, y + 1) %in% year), by = .(employee, firm)][
          , .(continued = mean(V1), year = y + 1), by = firm]
      }), paste("Year", years, sep = "-"))
    )
  },
  join = {
    DT <- copy(DT0)
    DT[.(firm = firm, employee = employee, year = year - 1),
       on = .(firm, employee, year), cont := TRUE][
      !is.na(employee), .(continued = sum(cont, na.rm = TRUE) / .N),
      by = .(firm, year = year + 1)][
      year <= max(DT$year)]
  },
  shift = {
    DT <- copy(DT0)
    DT[order(year), cont := shift(year, type = "lead") == year + 1,
       by = .(firm, employee)][
      !is.na(employee), .(continued = sum(cont, na.rm = TRUE) / .N),
      by = .(firm, year = year + 1)][
      year <= max(DT$year)]
  },
  check = my_check,
  times = 3L
)
The benchmark results show that the join approach is roughly 5 times faster than the shift approach and about 9 times faster than docendo discimus's approach.
Unit: seconds
  expr       min        lq      mean    median        uq       max neval cld
    dd 11.756114 11.919959 12.083042 12.083805 12.246506 12.409207     3   c
  join  1.054293  1.239829  1.303971  1.425366  1.428810  1.432254     3  a
 shift  6.105725  6.105906  6.148136  6.106087  6.169342  6.232596     3  b

Here is a slightly different approach to do it:
# dat holds the OP's sample table; note this assumes exactly two years
# 1. collect each firm-year's unique employees in a list column
dt <- dat[, list(all = .(unique(employee))), by = list(year, firm)]
# 2. split that list into one column per year
dt <- dt[, list(year1 = sapply(list(all), `[`, 1),
                year2 = sapply(list(all), `[`, 2)), by = firm]
# 3. unique employees present in both years / unique non-NA employees in year 1
dt[, uniqueN(mapply(intersect, year1, year2)) / uniqueN(na.omit(unlist(year1))), by = firm]

Related

R subset data.table by condition, getting nearest value row if missing

I would like to subset a data.table by condition "Year". Basically I want the data from the dt that matches a given year, per group. However, some groups do not have a complete timeline across all years, so I would like to return the nearest year's data for every group, so that data for every group are present for any year chosen (whether or not that is exactly the right year).
library(data.table)
# make dummy data
dt <- data.table(Group = c(rep("A", 5), rep("B", 3), rep("C", 5), rep("D", 2)),
                 x = sample(1:10, 15, rep = TRUE),
                 Year = c(2011:2015, 2013:2015, 2011:2015, 2014:2015))
# subset by, e.g., Year == 2015 is fine, but I want a full result for ANY
# year chosen, such as 2012, by using the closest entry in time, per group.
# Attempt;
y <- 2012
dt[Year == which.min(abs(Year - y)), .SD, by = Group]
Empty data.table (0 rows and 3 cols): Group,x,Year
The result in this example should be;
Group x Year
1: A 4 2012
2: B 7 2013
3: C 2 2012
4: D 3 2014
You are close: the use of which.min(abs(Year - y)) is good, but it needs to be inside the .SD subset in the j argument.
dt[, .SD[which.min(abs(Year - y))], by = Group]
# Group x Year
# <char> <int> <int>
# 1: A 5 2012
# 2: B 4 2013
# 3: C 8 2012
# 4: D 5 2014
Reproducible data
set.seed(42)
dt <- data.table(Group = c(rep("A", 5), rep("B", 3), rep("C", 5), rep("D", 2)),
                 x = sample(1:10, 15, rep = TRUE),
                 Year = c(2011:2015, 2013:2015, 2011:2015, 2014:2015))
dt
# Group x Year
# <char> <int> <int>
# 1: A 1 2011
# 2: A 5 2012
# 3: A 1 2013
# 4: A 9 2014
# 5: A 10 2015
# 6: B 4 2013
# 7: B 2 2014
# 8: B 10 2015
# 9: C 1 2011
# 10: C 8 2012
# 11: C 7 2013
# 12: C 4 2014
# 13: C 9 2015
# 14: D 5 2014
# 15: D 4 2015
y <- 2012
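An equivalent idiom that often scales better on large tables is to compute the matching row index per group with .I, then subset once (a sketch; same result as above):
idx <- dt[, .I[which.min(abs(Year - y))], by = Group]$V1
dt[idx]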

How to speed up a nested for-loop in R

I have two datasets, and one of them is very big. I'm trying to run the following loop to create a treatment column, treatment, in the dataset a. However, it is way too slow. I looked for ways to speed up for-loops, like vectorization or defining conditions outside the loops, but I'm having a hard time applying those methods since I have two datasets I'm conditioning on.
Here is my code:
library(dplyr)  # for case_when()
reform_loop <- function(a, b) {
  for (i in 1:nrow(a)) {
    for (j in 1:nrow(b)) {
      if (!is.na(a[i, "treatment"])) { break }
      a[i, "treatment"] <- case_when(
        a[i, "country_code"] == b[j, "country_code"] &
          a[i, "birth_year"] >= b[j, "cohort"] &
          a[i, "birth_year"] <= b[j, "upper_cutoff"] ~ 1,
        a[i, "country_code"] == b[j, "country_code"] &
          a[i, "birth_year"] < b[j, "cohort"] &
          a[i, "birth_year"] >= b[j, "lower_cutoff"] ~ 0)
    }
  }
  return(a)
}
a <- reform_loop(a, b)
You can find a sample dataset below. Dataset a is an individual-level dataset with birth year information, and dataset b is country-level data with some country reform information. treatment should be 1 if the birth_year is between cohort and upper_cutoff, and 0 if it is between lower_cutoff and cohort, in a specific country (which means the country_code variables should also match). Anything else should be NA.
# individual-level data, birth years
a <- data.frame(country_code = c(2, 2, 2, 10, 10, 10, 10, 8),
                birth_year = c(1920, 1930, 1940, 1970, 1980, 1990, 2000, 1910))
# country-level reform info with affected cohorts
b <- data.frame(country_code = c(2, 10, 10, 11),
                lower_cutoff = c(1928, 1975, 1907, 1934),
                upper_cutoff = c(1948, 1995, 1927, 1948),
                cohort = c(1938, 1985, 1917, 1942))
The following is the result I want to get:
treatment <- c(NA, 0, 1, NA, 0, 1, NA, NA)
Unfortunately, I cannot merge these two datasets since most of the countries in my dataset have more than one reform.
Any ideas on how I can speed up this code? Thank you so much in advance!
This is a range-based non-equi join. As such, this can be done with data.table or fuzzyjoin or sqldf.
data.table
library(data.table)
setDT(a)
setDT(b)
b[, treatment := 1L]
a[b, treatment := i.treatment, on = .(country_code, birth_year >= lower_cutoff, birth_year <= upper_cutoff)]
a[is.na(treatment), treatment := 0L]
a
# country_code birth_year treatment
# <num> <num> <int>
# 1: 2 1920 0
# 2: 2 1930 1
# 3: 2 1940 1
# 4: 10 1970 0
# 5: 10 1980 1
# 6: 10 1990 1
# 7: 10 2000 0
# 8: 8 1910 0
sqldf
out <- sqldf::sqldf(
  "select a.*, b.treatment
   from a left join b
     on a.country_code = b.country_code
    and a.birth_year between b.lower_cutoff and b.upper_cutoff")
out$treatment[is.na(out$treatment)] <- 0L
out
# country_code birth_year treatment
# 1 2 1920 0
# 2 2 1930 1
# 3 2 1940 1
# 4 10 1970 0
# 5 10 1980 1
# 6 10 1990 1
# 7 10 2000 0
# 8 8 1910 0
fuzzyjoin
fuzzyjoin::fuzzy_left_join(
  a, b,
  by = c("country_code" = "country_code",
         "birth_year" = "lower_cutoff",
         "birth_year" = "upper_cutoff"),
  match_fun = list(`==`, `>=`, `<=`))
# country_code.x birth_year country_code.y lower_cutoff upper_cutoff cohort treatment
# 1 2 1920 NA NA NA NA NA
# 2 2 1930 2 1928 1948 1938 1
# 3 2 1940 2 1928 1948 1938 1
# 4 10 1970 NA NA NA NA NA
# 5 10 1980 10 1975 1995 1985 1
# 6 10 1990 10 1975 1995 1985 1
# 7 10 2000 NA NA NA NA NA
# 8 8 1910 NA NA NA NA NA
and then you need to clean up the extra columns (and fill 0 for NA).
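A minimal sketch of that cleanup (res is our name for the join result; b still carries the treatment column added in the data.table section above, as the output shows):
res <- fuzzyjoin::fuzzy_left_join(
  a, b,
  by = c("country_code" = "country_code",
         "birth_year" = "lower_cutoff",
         "birth_year" = "upper_cutoff"),
  match_fun = list(`==`, `>=`, `<=`))
res <- res[c("country_code.x", "birth_year", "treatment")]  # drop the b columns
names(res)[1] <- "country_code"
res$treatment[is.na(res$treatment)] <- 0L                   # fill 0 for NA
res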

Running a function in sapply is returning faulty results when first ID is all missing

I'm having a very odd, specific problem that I'm struggling to google, so I'm hoping I can just show someone.
I've written a function that will fill in some missing data according to a few conditions. For example, for panel data like this:
library(tidyverse)
library(data.table)
dt <- data.frame(id = c(rep('a', 5),
                        rep('b', 5),
                        rep('c', 5)),
                 var1 = c(rep('', 4), 'bonjour',
                          'bye', NA, 'bye', 'bye', NA,
                          'hi', 'hi', NA, 'hi', 'hi'),
                 year = c(2005:2009,
                          1995:1998, 2002,
                          1995:1999))
dt
id var1 year
1: a 2005
2: a 2006
3: a 2007
4: a 2008
5: a bonjour 2009
6: b bye 1995
7: b <NA> 1996
8: b bye 1997
9: b bye 1998
10: b <NA> 2002
11: c hi 1995
12: c hi 1996
13: c <NA> 1997
14: c hi 1998
15: c hi 1999
I use the following function to update some of the missing values:
fill.in <- function(var, yr, finyr) {
  leadv <- lead(var, n = 1, order_by = yr)
  lagv <- lag(var, n = 1, order_by = yr)
  leadyr <- lead(yr, n = 1, order_by = yr)
  lagyr <- lag(yr, n = 1, order_by = yr)
  # ------- build the updated var w/ sequential conditions
  # keep the var as it is if not missing
  try1 <- ifelse(test = !is.na(var),
                 yes = var,
                 no = NA)
  # fill in if the lead and lag match and no more than 2 missing years
  try2 <- ifelse(test = is.na(try1) & leadv == lagv &
                   abs(leadyr - lagyr) <= 3 &
                   !is.na(leadv),
                 yes = leadv,
                 no = try1)
  # fill in with the lag if it's the final year of observed data
  ifelse(test = is.na(try2) & yr == finyr &
           abs(yr - lagyr) <= 3 & !is.na(lagv),
         yes = lagv,
         no = try2)
}
After a little bit of set-up, by and large I get good results:
# ------------ Set-up
# real data is big so use data.table
setDT(dt)
dt[, finalyr := max(year), by = id]
# don't want to fill in factor values
dt$var1 <- as.character(dt$var1)
# make empty strings NAs
dt[, var1 := na_if(var1, '')]
# useful for when i'm filling in many variables
fill.in.vs <- c('var1')
fixed.vnames <- paste0('fixed.', fill.in.vs)
# ------------ Call the function and results
dt[, (fixed.vnames) := sapply(.SD,
                              FUN = fill.in,
                              year,
                              finalyr,
                              simplify = FALSE, USE.NAMES = FALSE),
   by = id, .SDcols = fill.in.vs]
# this gives me what I want:
dt
id var1 year finalyr fixed.var1
1: a <NA> 2005 2009 <NA>
2: a <NA> 2006 2009 <NA>
3: a <NA> 2007 2009 <NA>
4: a <NA> 2008 2009 <NA>
5: a bonjour 2009 2009 bonjour
6: b bye 1995 2002 bye
7: b <NA> 1996 2002 bye
8: b bye 1997 2002 bye
9: b bye 1998 2002 bye
10: b <NA> 2002 2002 <NA>
11: c hi 1995 1999 hi
12: c hi 1996 1999 hi
13: c <NA> 1997 1999 hi
14: c hi 1998 1999 hi
15: c hi 1999 1999 hi
The problem is that when the first set of IDs (e.g. all the 'a' values) has only empty strings that I turn into NAs, all values of the "fixed" variable end up NA as well.
So using that same code but with the following data, I get all NAs in the new variable:
# id of 'a' now is all empty strings in var1:
dt <- data.frame(id = c(rep('a', 5),
                        rep('b', 5),
                        rep('c', 5)),
                 var1 = c(rep('', 5),
                          'bye', NA, 'bye', 'bye', NA,
                          'hi', 'hi', NA, 'hi', 'hi'),
                 year = c(2005:2009,
                          1995:1998, 2002,
                          1995:1999))
# which results in this final data after running the same code above:
dt
id var1 year finalyr fixed.var1
1: a <NA> 2005 2009 NA
2: a <NA> 2006 2009 NA
3: a <NA> 2007 2009 NA
4: a <NA> 2008 2009 NA
5: a <NA> 2009 2009 NA
6: b bye 1995 2002 NA
7: b <NA> 1996 2002 NA
8: b bye 1997 2002 NA
9: b bye 1998 2002 NA
10: b <NA> 2002 2002 NA
11: c hi 1995 1999 NA
12: c hi 1996 1999 NA
13: c <NA> 1997 1999 NA
14: c hi 1998 1999 NA
15: c hi 1999 1999 NA
For brevity I won't show you all the things I've tried, but a few observations about when it happens:
Having all empty strings in the first ID isn't a problem if I do not convert the empty strings to NA.
I only get this result if it's the first ID that has all empty strings; if it's the second set, the results are fine.
How I convert "" to NA doesn't matter, i.e. it's not an issue with na_if because it also happens when I use ifelse.
Overall I'm pretty stumped as to what's happening or how to investigate it further. I would really appreciate any help.
When I run your code, I get this warning:
1: In [.data.table(dt, , :=((fixed.vnames), sapply(.SD, FUN = fill.in, : Coercing 'character' RHS to 'logical' to match the type of the target column (column 0 named '').
I get it twice: once for the second group and once for the third.
As the warning says, the variable fixed.var1 is initialised as a logical variable (for the group id=='a'); values that are added later are then coerced to the same class, 'logical'.
The major culprit here is your function fill.in(), since e.g.
logicalVar <- fill.in(var = rep(NA, 5), yr = 2005:2009, finyr = rep(2009, 5))
class(logicalVar)
returns a logical variable.
So all you need to do is to wrap as.character() around the return of your function.
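A minimal sketch of that fix, changing only the last expression of fill.in() so the function always returns character:
# last expression of fill.in(), coerced so an all-NA first group can no
# longer pin the new column to class 'logical':
as.character(
  ifelse(test = is.na(try2) & yr == finyr &
           abs(yr - lagyr) <= 3 & !is.na(lagv),
         yes = lagv,
         no = try2)
)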

Remove discontinuous time points in longitudinal data

I have firm-year longitudinal data, but the years are not continuous for some firms. For example:
library(data.table)
dt = data.table(firm_id = c(rep(1, 5), rep(2, 5)),
                year = c(1990, 1991, 1999, 2000, 2001, 1995, 1997, 2008, 2009, 2010))
For each firm, I want to keep the observations in the most recent continuous years and remove the other observations. For example, Firm 1 has five yearly observations, in (1990, 1991, 1999, 2000, 2001), and I want to keep (1999, 2000, 2001).
I can think of some awkward approaches to solve this issue but I am wondering if there is an easy way to solve it.
Enlightened by the comments, I am also wondering if there is any way to keep the longest continuous block of years. For example, with
library(data.table)
dt = data.table(firm_id = c(rep(1, 5), rep(2, 5)),
                year = c(1990, 1991, 1992, 2000, 2001, 1995, 1997, 2008, 2009, 2010))
The result would be
   firm_id  year
1:       1  1990
2:       1  1991
3:       1  1992
4:       2  2008
5:       2  2009
6:       2  2010
library(data.table)
DT2 <- setorder(dt, firm_id, year)[
  , d := cumsum(c(TRUE, diff(year) > 1)), by = .(firm_id)][
  , n := .N, by = .(firm_id, d)]
DT2
# firm_id year d n
# <num> <num> <int> <int>
# 1: 1 1990 1 3
# 2: 1 1991 1 3
# 3: 1 1992 1 3
# 4: 1 2000 2 2
# 5: 1 2001 2 2
# 6: 2 1995 1 1
# 7: 2 1997 2 1
# 8: 2 2008 3 3
# 9: 2 2009 3 3
# 10: 2 2010 3 3
From here, if you want runs of 3 consecutive years or more, then
DT2[ (n > 2), ]
If you want the longest run for each firm_id, then
DT2[, .SD[n == max(n),], by = .(firm_id) ]
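And for the original ask, the most recent continuous block per firm, keep the last run; since the data are ordered by year, the run counter d is largest for the most recent block:
DT2[, .SD[d == max(d)], by = .(firm_id)]
#    firm_id  year     d     n
#      <num> <num> <int> <int>
# 1:       1  2000     2     2
# 2:       1  2001     2     2
# 3:       2  2008     3     3
# 4:       2  2009     3     3
# 5:       2  2010     3     3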

Speeding up rowwise comparison

I have a data.frame like the following:
id year x y v1
1 2006 12 1 0.8510703
1 2007 12 1 0.5954527
1 2008 12 2 -1.9312854
1 2009 12 1 0.1558393
1 2010 8 1 0.9051487
2 2001 12 2 -0.5480566
2 2002 12 2 -0.7607420
2 2003 3 2 -0.8094283
2 2004 3 2 -0.1732794
I would like to sum up (grouped by id) v1 of consecutive years (so 2010 and 2009, 2009 and 2008 and so on) only if x and y match. Expected output:
id year res
1 2010 NA
1 2009 NA
1 2008 NA
1 2007 1.4465230
2 2004 -0.9827077
2 2003 NA
2 2002 -1.3087987
The oldest year per id is removed, as there is no preceding year.
I have a slow lapply solution in place but would like to speed things up, as my data is rather large.
Data:
set.seed(1)
dat <- data.frame(id = c(rep(1, 5), rep(2, 4)),
                  year = c(2006:2010, 2001:2004),
                  x = c(12, 12, 12, 12, 8, 12, 12, 3, 3),
                  y = c(1, 1, 2, 1, 1, 2, 2, 2, 2),
                  v1 = rnorm(9))
Current Solution:
require(dplyr)
myfun <- function(dat) {
  do.call(rbind, lapply(rev(unique(dat$year)[-1]), function(z)
    inner_join(dat[dat$year == z, 2:5],
               dat[dat$year == z - 1, 2:5],
               by = c("x", "y")) %>%
      summarise(year = z, res = ifelse(nrow(.) < 1, NA, sum(v1.x, v1.y)))))
}
dat %>% group_by(id) %>% do(myfun(.))
Here is a data.table solution, I think.
datNew <- setDT(dat)[, .(year = year,
                         res = (v1 + shift(v1)) * NA^(x != shift(x) | y != shift(y))),
                     by = id][, .SD[-1], by = id][]
   id year        res
1:  1 2007 -0.4428105
2:  1 2008         NA
3:  1 2009         NA
4:  1 2010         NA
5:  2 2002 -0.3330393
6:  2 2003         NA
7:  2 2004  1.3141061
Here, the j statement contains a list with two elements, the year and the computed res. The res sums each value with its lagged value, using shift(), and multiplies by NA or 1 depending on whether x and y match their lagged values. This calculation is performed by id. The output is fed to a second chain, which drops the first observation of each id; that observation is always NA because there is no preceding year.
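The NA^ part works because R defines NA^0 as 1 while NA^1 is NA, with FALSE/TRUE coercing to 0/1:
NA^c(FALSE, TRUE)
# [1]  1 NA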
You can adjust the order efficiently using setorder if desired.
setorder(datNew, id, -year)
datNew
   id year        res
1:  1 2010         NA
2:  1 2009         NA
3:  1 2008         NA
4:  1 2007 -0.4428105
5:  2 2004  1.3141061
6:  2 2003         NA
7:  2 2002 -0.3330393
Assuming the years are sorted within id, as in the example:
dat %>%
  group_by(id) %>%
  mutate(res = v1 + lag(v1),  # sum v1 with the previous year's v1
         res = ifelse(x == lag(x) & y == lag(y), res, NA)) %>%  # NA if x and y don't match
  slice(-1)  # drop the first year
You can use %>% select(id, year, res), and %>% arrange(id, desc(year)) at the end if you want.
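Put together, a sketch of the full pipeline with those optional steps:
library(dplyr)
dat %>%
  group_by(id) %>%
  mutate(res = v1 + lag(v1),
         res = ifelse(x == lag(x) & y == lag(y), res, NA)) %>%
  slice(-1) %>%
  ungroup() %>%
  select(id, year, res) %>%
  arrange(id, desc(year))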
