Diff-in-diff estimation with resampling from large dataset - r

I have a large dataset on which to perform a diff-in-diff estimation. Given the nature of the dataset my t-statistics denominators are inflated and coefficient are (surreptitiously) statistically significant.
I want to step-by-step reducing the number of element in the database, and for each step resample a large number of times and re-estimating each time interaction coefficient and standard errors.
Then I want to take all the averages estimates and standard error, and plot them on a graph, to show at what point (if any) they are not statistically different from zero.
My code follows with a toy example.
I am not sure this is the most efficient way to tackle the problem
I cannot retrieve and thus plot the confidence interval
I am not sure the sampling is representative given the existence of different groups.
Toy example (Creds Torres-Reyna - ‎2015)
library(foreign)
library(dplyr)
library(ggplot2)
df_0 <- NULL
for (i in 1:length(seq(5,nrow(mydata)-1,5))){
index <- seq(5,nrow(mydata),5)[i]
df_1 <- NULL
for (j in 1:10){
mydata_temp <- mydata[sample(nrow(mydata), index), ]
didreg = lm(y ~ treated + time + did, data = mydata_temp)
out <- summary(didreg)
new_line <- c(out$coefficients[,1][4], out$coefficients[,2][4], index)
new_line <- data.frame(t(new_line))
names(new_line) <- c("c","s","i")
df_1 <- rbind(df_1,new_line)
}
df_0 <- rbind(df_0,df_1)
}
df_0 <- df_0 %>% group_by(i) %>% summarise(coefficient <- mean(c, na.rm = T),
standard_error <- mean(s, na.rm = T))
names(df_0) <- c("i","c","s")
View(df_0)

Consider the following refactored code using base R functions: within, %in%, nested lapply, setNames, aggregate, and do.call. This approach avoids calling rbind in a loop and compactly re-writes code without constantly using $ column referencing.
library(foreign)
mydata = read.dta("http://dss.princeton.edu/training/Panel101.dta")
mydata <- within(mydata, {
time <- ifelse(year >= 1994, 1, 0)
treated <- ifelse(country %in% c("E", "F", "G"), 1, 0)
did <- time * treated
})
# OUTER LIST OF DATA FRAMES
df_0_list <- lapply(1:length(seq(5,nrow(mydata)-1,5)), function(i) {
index <- seq(5,nrow(mydata),5)[i]
# INNER LIST OF DATA FRAMES
df_1_list <- lapply(1:100, function(j) {
mydata_temp <- mydata[sample(nrow(mydata), index), ]
didreg <- lm(y ~ treated + time + did, data = mydata_temp)
out <- summary(didreg)
new_line <- c(out$coefficients[,1][4], out$coefficients[,2][4], index)
new_line <- setNames(data.frame(t(new_line)), c("c","s","i"))
})
# APPEND ALL INNER DFS
df <- do.call(rbind, df_1_list)
return(df)
})
# APPEND ALL OUTER DFS
df_0 <- do.call(rbind, df_0_list)
# AGGREGATE WITH NEW COLUMNS
df_0 <- within(aggregate(cbind(c, s) ~ i, df_0, function(x) mean(x, na.rm=TRUE)), {
upper = c + s
lower = c - s
})
# RUN PLOT
within(df_0, {
plot(i, c, ylim=c(min(c)-5000000000, max(c)+5000000000), type = "l",
cex.lab=0.75, cex.axis=0.75, cex.main=0.75, cex.sub=0.75)
polygon(c(i, rev(i)), c(lower, rev(upper)),
col = "grey75", border = FALSE)
lines(i, c, lwd = 2)
})

In the end I solved it like this:
Is this the most efficient way?
library(foreign)
library(dplyr)
mydata = read.dta("http://dss.princeton.edu/training/Panel101.dta")
mydata$time = ifelse(mydata$year >= 1994, 1, 0)
mydata$treated = ifelse(mydata$country == "E" |
mydata$country == "F" |
mydata$country == "G", 1, 0)
mydata$did = mydata$time * mydata$treated
df_0 <- NULL
for (i in 1:length(seq(5,nrow(mydata)-1,5))){
index <- seq(5,nrow(mydata),5)[i]
df_1 <- NULL
for (j in 1:100){
mydata_temp <- mydata[sample(nrow(mydata), index), ]
didreg = lm(y ~ treated + time + did, data = mydata_temp)
out <- summary(didreg)
new_line <- c(out$coefficients[,1][4], out$coefficients[,2][4], index)
new_line <- data.frame(t(new_line))
names(new_line) <- c("c","s","i")
df_1 <- rbind(df_1,new_line)
}
df_0 <- rbind(df_0,df_1)
}
df_0 <- df_0 %>% group_by(i) %>% summarise(c = mean(c, na.rm = T), s =
mean(s, na.rm = T))
df_0 <- df_0 %>% group_by(i) %>% mutate(upper = c+s, lower = c-s)
df <- df_0
plot(df$i, df$c, ylim=c(min(df_0$c)-5000000000, max(df_0$c)+5000000000), type = "l")
polygon(c(df$i,rev(df$i)),c(df$lower,rev(df$upper)),col = "grey75", border = FALSE)
lines(df$i, df$c, lwd = 2)

Related

Multiple linear regression by group in a rolling window in R

My dataframe looks like this:
Date = c(rep(as.Date(seq(15000,15012)),2))
Group = c(rep("a",13),rep("b",13))
y = c(seq(1,26,1))
x1 = c(seq(0.01,0.26,0.01))
x2 = c(seq(0.02,0.26*2,0.02))
df = data.frame(Group,Date,y,x1,x2)
head(df,3)
Group
Date
y
x1
x2
a
2011-01-26
1
0.01
0.02
a
2011-01-27
2
0.02
0.04
a
2011-01-28
3
0.03
0.06
And I would like to do multiple regression by group (y as the dependent variable and x1, x2 as the independent variables) in a rolling window i.e. 3.
I have tried to achieve this using packages tidyverse and zoo with following codes but failed.
## define multi-var-linear regression function and get the residual
rsd <- function(df){
lm(formula = y~x1+x2, data = as.data.frame(df), na.action = na.omit) %>%
resid() %>%
return()
}
## apply it by group with rolling window
x <- df %>% group_by(Group) %>%
rollapplyr(. , width = 3, FUN = rsd)
The output of this code is not what I acutually want.
Does anyone know how to do multiple regression by group in a rolling window?
Thanks in advance, Giselle
Thank Grothendieck and Marcus for your codes!
It really helped me a lot:)
I now appened them here:
# Grothendieck method
rsd <- function(df){
lm(formula = y~x1+x2, data = as.data.frame(df), na.action = na.omit) %>%
resid() %>%
return()
}
width <- 5
df_m2 <-
df %>%
group_by(Group) %>%
group_modify(~ {
cbind(., rollapplyr(.[c("y", "x1", "x2")], width, rsd, fill = NA,
by.column = FALSE))
}) %>%
ungroup %>%
select(c("Group","Date","5")) %>%
dplyr::rename(residual_m2 = "5")
# Marcus method
output <- data.frame()
for (i in unique(df$Group)) {
a = df%>% subset(Group==i)
a[,"residual"] = NA
max = nrow(a)
if(max<5){
next
}
for (j in seq(5,max,by=1)) {
b = a %>% slice((j-4):j)
lm_ = lm(y~x1+x2, data = b)
a[j,]$residual = residuals(lm_)[5]
}
output <-
output %>%
rbind(a)
}
Use group_modify and use rollapplyr with the by.column = FALSE argument so that rsd is applied to all columns at once rather than one at a time.
Note that if you use width 3 with two predictors and an intercept the residuals will necessarily be all zero so we changed the width to 5.
library(dplyr, exclude = c("lag", "filter"))
library(zoo)
width <- 5
df %>%
group_by(Group) %>%
group_modify(~ {
cbind(., rollapplyr(.[c("y", "x1", "x2")], width, rsd, fill = NA,
by.column = FALSE))
}) %>%
ungroup
A good old-fashioned for-loop here could be:
for (i in unique(df$Group)){
for (j in (seq(15000,15012, 3))){
lm_ <- lm(formula = df[df$Group== i & df$Date %in% c(j, j+1, j+2), 3] ~ df[df$Group== i & df$Date %in% c(j, j+1, j+2), 4] + df[df$Group== i & df$Date %in% c(j, j+1, j+2), 5], na.action = na.omit)
print(paste('Group', i, 'Dates from', j, 'to', j+3, residuals(lm_)))
}
}

Loop over string characters in R

I'm trying to loop over a list of characters:
covariate_names <- c("Age_50.", "Age_30.49", "Age_18.29", "Income_High", "Income_Medium", "Income_Low",
"DemographicSegment_Value.Seeker", "DemographicSegment_Established", "DemographicSegment_Planner", "DemographicSegment_Affluent",
"DemographicSegment_Digital.Native","Gender_F..", "Gender_M..", "TransactionAmount",
"TechTxns", "FashionTxns", "TravelTxns", "GamerTxns",
"Recency", "Frequency", "Monetary", "Breadth", "Consistency", "is_donor", "donation_amt")
pairs <- function(df, covariate, group){
pwc <- pairwise_t_test(data = df,
formula = as.formula(paste0(covariate,"~",group)),
paired = FALSE )
pwc <- as.data.frame(pwc)
pwc <- pwc %>% rename(GroupA = group1,
GroupB = group2,
N_grpA = n1, N_grpB = n2)
pwc <- pwc[,1:7]
pwc <- pwc %>%
mutate_if(is.numeric, round, digits = 6)
#print(pwc)
}
for (i in covariate_names){
pwc_i <- pairs(df = df_test_TS, covariate = i, group = "w.contextual")
}
But the pairs function returns a df so I don't know if this is possible to use a loop for. I just want to run the pairs function for all of my covariates in the list, and be able to call to the output from each of the individual iteration of the pairs function.
You can use the function :
library(dplyr)
library(purrr)
pairs <- function(df, covariate, group){
pwc <- pairwise_t_test(data = df,
formula = reformulate(group, covariate),
paired = FALSE )
pwc <- as.data.frame(pwc)
pwc %>%
rename(GroupA = group1,
GroupB = group2,
N_grpA = n1, N_grpB = n2) %>%
select(1:7) %>%
mutate(across(where(is.numeric), round, digits = 6))
#For older dplyr
#mutate_if(is.numeric, round, digits = 6)
}
and then use map/lapply to get list as output :
result <- map(covariate_names, ~pairs(df_test_TS, .x, "w.contextual"))
If you want to combine the results into one dataframe use map_df :
result <- map_df(covariate_names, ~pairs(df_test_TS, .x, "w.contextual"))
You would need to store your results in a list, using the loop. You can try next code:
#Data
covariate_names <- c("Age_50.", "Age_30.49", "Age_18.29", "Income_High", "Income_Medium", "Income_Low",
"DemographicSegment_Value.Seeker", "DemographicSegment_Established", "DemographicSegment_Planner", "DemographicSegment_Affluent",
"DemographicSegment_Digital.Native","Gender_F..", "Gender_M..", "TransactionAmount",
"TechTxns", "FashionTxns", "TravelTxns", "GamerTxns",
"Recency", "Frequency", "Monetary", "Breadth", "Consistency", "is_donor", "donation_amt")
#Function
pairs <- function(df, covariate, group){
pwc <- pairwise_t_test(data = df,
formula = as.formula(paste0(covariate,"~",group)),
paired = FALSE )
pwc <- as.data.frame(pwc)
pwc <- pwc %>% rename(GroupA = group1,
GroupB = group2,
N_grpA = n1, N_grpB = n2)
pwc <- pwc[,1:7]
pwc <- pwc %>%
mutate_if(is.numeric, round, digits = 6)
#print(pwc)
}
Here the changes:
#List to store results
List <- list()
#Loop
for (i in 1:length(covariate_names)){
List[[i]] <- pairs(df = df_test_TS, covariate = covariate_names[i], group = "w.contextual")
}

variable length df subsampling function r

I need to write a function involving subsetting a df by a variable n bins. Like, if n is 2, then subsample the df some number of times in two bins (from the first half, then from the second half). If n is 3, subsample in 3 bins (first 1/3, second 1/3, third 1/3). I've been doing this for different lengths of n manually so far, and I know there must be a better way to do it. I want to write it into a function with n as an input, but I can't make it work so far. Code below.
# create df
df <- data.frame(year = c(1:46),
sample = seq(from=10,to=30,length.out = 46) + rnorm(46,mean=0,sd=2) )
# real df has some NAs, so we'll add some here
df[c(20,32),2] <- NA
this df is 46 years of sampling. I want to pretend instead of 46 samples, I only took 2, but at one random year in the first half (1:23), and one random year in the second half (24:46).
# to subset in 2 groups, say, 200 times
# I'll make a df of elements to sample
samplelist <- data.frame(firstsample = sample(1:(nrow(df)/2),200,replace = T), # first sample in first half of vector
secondsample = sample((nrow(df)/2):nrow(df),200, replace = T) )# second sample in second half of vector
samplelist <- as.matrix(samplelist)
# start a df to add to
plot_df <- df %>% mutate(first='all',
second = 'all',
group='full')
# fill the df using coords from expand.grid
for(i in 1:nrow(samplelist)){
plot_df <<- rbind(plot_df,
df[samplelist[i,] , ] %>%
mutate(
first = samplelist[i,1],
second = samplelist[i,2],
group = i
))
print(i)
}
(If we can make it skip samples on "NA" sample years, that would be extra good).
So, if I wanted to do this for three points instead of two, I'd repeat the process like this:
# to subset in 3 groups 200 times
# I'll make a df of elements to sample
samplelist <- data.frame(firstsample = sample(1:(nrow(df)/3),200,replace = T), # first sample in first 1/3
secondsample = sample(round(nrow(df)/3):round(nrow(df)*(2/3)),200, replace = T), # second sample in second 1/3
thirdsample = sample(round(nrow(df)*(2/3)):nrow(df), 200, replace=T) # third sample in last 1/3
)
samplelist <- as.matrix(samplelist)
# start a df to add to
plot_df <- df %>% mutate(first='all',
second = 'all',
third = 'all',
group='full')
# fill the df using coords from expand.grid
for(i in 1:nrow(samplelist)){
plot_df <<- rbind(plot_df,
df[samplelist[i,] , ] %>%
mutate(
first = samplelist[i,1],
second = samplelist[i,2],
third = samplelist[i,3],
group = i
))
print(i)
}
but, I want to do this many times, sampling up to ~20 times (so in 20 bins), so this manual method is not sustainable. Can you help me write a function to say "pick one sample from n bins x times"?
btw, this is the plot I am making with the complete df:
plot_df %>%
ggplot(aes(x=year,y=sample)) +
geom_point(color="grey40") +
stat_smooth(geom="line",
method = "lm",
alpha=.3,
aes(color=group,
group=group),
se=F,
show.legend = F) +
geom_line(color="grey40") +
geom_smooth(data = plot_df %>% filter(group %in% c("full")),
method = "lm",
alpha=.7,
color="black",
size=2,
#se=F,
# fill="grey40
show.legend = F
) +
theme_classic()
If I got you right, the following function splits your df in n bins, draws x samples from each and puts the results back into cols of a df:
library(tidyverse)
set.seed(42)
df <- data.frame(year = c(1:46),
sample = seq(from=10,to=30,length.out = 46) + rnorm(46,mean=0,sd=2) )
get_df_sample <- function(df, n, x) {
df %>%
# bin df in n bins of (approx.) equal length
mutate(bin = ggplot2::cut_number(seq_len(nrow(.)), n, labels = seq_len(n))) %>%
# split by bin
split(.$bin) %>%
# sample x times from each bin
map(~ .x[sample(seq_len(nrow(.x)), x, replace = TRUE),]) %>%
# keep only column "sample"
map(~ select(.x, sample)) %>%
# Rename: Add number of df-bin from which sample is drawn
imap(~ rename(.x, !!sym(paste0("sample_", .y)) := sample)) %>%
# bind
bind_cols() %>%
# Add group = rownames
rownames_to_column(var = "group")
}
get_df_sample(df, 3, 200) %>%
head()
#> sample_1 sample_2 sample_3 group
#> 1 12.58631 18.27561 24.74263 1
#> 2 19.46218 24.24423 23.44881 2
#> 3 12.92179 18.47367 27.40558 3
#> 4 15.22020 18.47367 26.29243 4
#> 5 12.58631 24.24423 24.43108 5
#> 6 19.46218 23.36464 27.40558 6
Created on 2020-03-24 by the reprex package (v0.3.0)
Here's a function using loops, closer to what you started doing:
df <- data.frame(year = c(1:46),
sample = seq(from=10, to=30, length.out = 46) +
rnorm(46,mean=0,sd=2))
df[c(20,32), 2] <- NA
my_function <- function(n, sample_size, data = df) {
plot_df <- data %>% mutate(group = 'full')
sample_matrix <- matrix(data = NA, nrow = sample_size, ncol = n)
first_row <- 1 # First subset has 1 as first row, no matter how many subsets
for (i in 1:n) {
last_row <- round(first_row + nrow(df)/n - 1) # Determine last row of i-th subset
sample_matrix[, i] <- sample(first_row:last_row, sample_size, replace = T) # Store sample directly in matrix
first_row <- i + last_row # Determine first row for next i
group_name <- paste("group", i, sep = "_") # Column name for i-th group
plot_df[[group_name]] <- "all" # Column for i-th group
}
for (j in 1:sample_size) {
# Creating a new data frame for new observations
new_obs <- df[sample_matrix[j,], ]
new_obs[["group"]] <- j
for (group_n in 1:n) {
new_obs[[paste0("group_", group_n)]] <- sample_matrix[j, group_n]
}
plot_df <- rbind(plot_df, new_obs)
plot_df <<- plot_df
}
}
my_function(2, 200, data = df)

rowwise filtering in dplyr

I wanted to use dplyr instead of apply,1 in order to filter a dataset rowwise according to a logical expression, ie for this example I´d like to remove all rows that have one or more values of 99.
However, I was surprised by the poor performance in dplyr. Any ideas if I can speed this up in dplyr? Also, I would have thought that the rowwise function would pipe the individual rows, but apparently not (see below). How can I use the rowwise function?
library(tidyverse)
s <- tibble(rows = seq(from = 250, to = 5000, by = 250)) #my original dataset has 400K rows...
s$num <- map(s$rows, ~ rnorm(.x * 6))
s$num <-
map(s$num, ~ replace(.x, sample(1:length(.x), size = length(.x) / 20), 99))
s$mat <- map(s$num, ~ as_data_frame(matrix(.x, ncol = 6)))
help_an <- function(vec) {
browser()
return(!any(vec == 99))
}
help_dp_t <- function(df) {
clo1 <- proc.time()
a <- as_data_frame(t(df)) %>% summarise_all(help_an)
df2 <- filter(df, t(a)[, 1])
b <- tibble(time = (proc.time() - clo1)[3], df = list(df2))
return(b)
}
s$dplyr <- map(s$mat, ~ dplyr::mutate(help_dp_t(.x)))
help_lap <- function(df) {
clo1 <- proc.time()
a_base <- df[apply(df, 1, function(x)
! any(x == 99)), ]
b <- tibble(time = (proc.time() - clo1)[3], df = list(a_base))
return(b)
}
s$lapply <- map(s$mat, ~ mutate(help_lap(.x)))
s$equal_dplyr_lapply <-
map2_lgl(s$dplyr, s$lapply, ~ all.equal(.x$df, .y$df))
s$dplyr_time <- map_dbl(s$dplyr, "time")
s$lapply_time <- map_dbl(s$lapply, "time")
ggplot(gather(s, ... = c(7, 8)), aes(x = rows, y = value, color = key)) +
geom_line()
I tried the following with rowwise, but the rowwise pipe does not send a vector, but the entire df to the help_an function.
help_dp_r <- function(df) {
clo1 <- proc.time()
df2 <-
df %>% rowwise() %>% mutate(cond = help_an(.)) ### . is not passed on as a vector, but the entire df??
b <- tibble(time = (proc.time() - clo1)[3], df = list(df2))
}
s$dplyr_r <- map(s$mat, ~ dplyr::mutate(help_dp_r(.x)))

randomize observations by groups (blocks)

I have a data frame with I obsevations, and each observation belongs to one of g categories.
set.seed(9782)
I <- 500
g <- 10
library(dplyr)
anon_id <- function(n = 1, length = 12) {
randomString <- c(1:n)
for (i in 1:n)
{
randomString[i] <- paste(sample(c(0:9, letters, LETTERS),
length, replace = TRUE),
collapse = "")
}
return(randomString)
}
df <- data.frame(id = anon_id(n = I, length = 16),
group = sample(1:g, I, T))
I want to randomly assign each observation to one of J "urns", given some vector of probabilities p. That is the probability of being assign to urn J=1 is p[1]. The added complexity is that I want to do this block by block.
If I ignore the blocks, I can do this easily:
J <- 3
p <- c(0.25, 0.5, 0.25)
df1 <- df %>% mutate(urn = sample(x = c(1:J), size = I, replace = T, prob = p))
I thought about this method to do it by "block"
# Block randomization
randomize_block <- function(g) {
df1 <- df %>% filter(group==g)
size <- nrow(df1)
df1 <- df1 %>% mutate(urn = sample(x = c(1:J),
size = size,
replace = T,
prob = p))
return(df1)
}
df2 <- lapply(1:g, randomize_block)
df2 <- data.table::rbindlist(df2)
Is there a better way?
Not sure if this is better, but here is a base R technique with data.frame df, that has group name "group" as well as urn assignments 1:J with assignment probabilities in vector p of length J.
# get urn assignment
urnAssignment <- lapply(unique(df$group),
function(i) sample(1:J, nrow(df[group==i,]), replace =T, prob=p))
# get a list that collects position of observations
obsOrder <- lapply(unique(df$group),
function(i) which(df$group == i))
df$urnAssignment <- unlist(urnAssignment)[unlist(obsOrder)]
randomizr::block_ra does exactly what you want.
library(randomizr)
library(janitor) #just for the tabyl function
block_rand <- as.tibble(randomizr::block_ra(blocks = df$group, conditions = c("urn_1","urn_2","urn_3")))
df2 <- as.tibble(bind_cols(df, block_rand))
df2 %>% janitor::tabyl(group, value)
This does the trick using dplyr:
randomize <- function(data, groups=2, block_id = NULL, p=NULL, seed=9782) {
if(is.null(p)) p <- rep(1/groups, groups)
if(is.null(block_id)){
df1 <- data %>%
mutate(Treatment = sample(x = c(1:groups),
size = n(),
replace = T,
prob = p))
return(df1)
}else{
df1 <- data %>% group_by_(block_id) %>%
mutate(Treatment = sample(x = c(1:groups),
size = n(),
replace = T,
prob = p))
}
}
df1 <- randomize(data = df, groups = J, block_id = "group", p = p, seed = 9782)

Resources