I'm trying to loop over a list of characters:
covariate_names <- c("Age_50.", "Age_30.49", "Age_18.29", "Income_High", "Income_Medium", "Income_Low",
"DemographicSegment_Value.Seeker", "DemographicSegment_Established", "DemographicSegment_Planner", "DemographicSegment_Affluent",
"DemographicSegment_Digital.Native","Gender_F..", "Gender_M..", "TransactionAmount",
"TechTxns", "FashionTxns", "TravelTxns", "GamerTxns",
"Recency", "Frequency", "Monetary", "Breadth", "Consistency", "is_donor", "donation_amt")
pairs <- function(df, covariate, group){
pwc <- pairwise_t_test(data = df,
formula = as.formula(paste0(covariate,"~",group)),
paired = FALSE )
pwc <- as.data.frame(pwc)
pwc <- pwc %>% rename(GroupA = group1,
GroupB = group2,
N_grpA = n1, N_grpB = n2)
pwc <- pwc[,1:7]
pwc <- pwc %>%
mutate_if(is.numeric, round, digits = 6)
#print(pwc)
}
for (i in covariate_names){
pwc_i <- pairs(df = df_test_TS, covariate = i, group = "w.contextual")
}
But the pairs function returns a df so I don't know if this is possible to use a loop for. I just want to run the pairs function for all of my covariates in the list, and be able to call to the output from each of the individual iteration of the pairs function.
You can use the function :
library(dplyr)
library(purrr)
pairs <- function(df, covariate, group){
pwc <- pairwise_t_test(data = df,
formula = reformulate(group, covariate),
paired = FALSE )
pwc <- as.data.frame(pwc)
pwc %>%
rename(GroupA = group1,
GroupB = group2,
N_grpA = n1, N_grpB = n2) %>%
select(1:7) %>%
mutate(across(where(is.numeric), round, digits = 6))
#For older dplyr
#mutate_if(is.numeric, round, digits = 6)
}
and then use map/lapply to get list as output :
result <- map(covariate_names, ~pairs(df_test_TS, .x, "w.contextual"))
If you want to combine the results into one dataframe use map_df :
result <- map_df(covariate_names, ~pairs(df_test_TS, .x, "w.contextual"))
You would need to store your results in a list, using the loop. You can try next code:
#Data
covariate_names <- c("Age_50.", "Age_30.49", "Age_18.29", "Income_High", "Income_Medium", "Income_Low",
"DemographicSegment_Value.Seeker", "DemographicSegment_Established", "DemographicSegment_Planner", "DemographicSegment_Affluent",
"DemographicSegment_Digital.Native","Gender_F..", "Gender_M..", "TransactionAmount",
"TechTxns", "FashionTxns", "TravelTxns", "GamerTxns",
"Recency", "Frequency", "Monetary", "Breadth", "Consistency", "is_donor", "donation_amt")
#Function
pairs <- function(df, covariate, group){
pwc <- pairwise_t_test(data = df,
formula = as.formula(paste0(covariate,"~",group)),
paired = FALSE )
pwc <- as.data.frame(pwc)
pwc <- pwc %>% rename(GroupA = group1,
GroupB = group2,
N_grpA = n1, N_grpB = n2)
pwc <- pwc[,1:7]
pwc <- pwc %>%
mutate_if(is.numeric, round, digits = 6)
#print(pwc)
}
Here the changes:
#List to store results
List <- list()
#Loop
for (i in 1:length(covariate_names)){
List[[i]] <- pairs(df = df_test_TS, covariate = covariate_names[i], group = "w.contextual")
}
Related
My dataframe looks like this:
Date = c(rep(as.Date(seq(15000,15012)),2))
Group = c(rep("a",13),rep("b",13))
y = c(seq(1,26,1))
x1 = c(seq(0.01,0.26,0.01))
x2 = c(seq(0.02,0.26*2,0.02))
df = data.frame(Group,Date,y,x1,x2)
head(df,3)
Group
Date
y
x1
x2
a
2011-01-26
1
0.01
0.02
a
2011-01-27
2
0.02
0.04
a
2011-01-28
3
0.03
0.06
And I would like to do multiple regression by group (y as the dependent variable and x1, x2 as the independent variables) in a rolling window i.e. 3.
I have tried to achieve this using packages tidyverse and zoo with following codes but failed.
## define multi-var-linear regression function and get the residual
rsd <- function(df){
lm(formula = y~x1+x2, data = as.data.frame(df), na.action = na.omit) %>%
resid() %>%
return()
}
## apply it by group with rolling window
x <- df %>% group_by(Group) %>%
rollapplyr(. , width = 3, FUN = rsd)
The output of this code is not what I acutually want.
Does anyone know how to do multiple regression by group in a rolling window?
Thanks in advance, Giselle
Thank Grothendieck and Marcus for your codes!
It really helped me a lot:)
I now appened them here:
# Grothendieck method
rsd <- function(df){
lm(formula = y~x1+x2, data = as.data.frame(df), na.action = na.omit) %>%
resid() %>%
return()
}
width <- 5
df_m2 <-
df %>%
group_by(Group) %>%
group_modify(~ {
cbind(., rollapplyr(.[c("y", "x1", "x2")], width, rsd, fill = NA,
by.column = FALSE))
}) %>%
ungroup %>%
select(c("Group","Date","5")) %>%
dplyr::rename(residual_m2 = "5")
# Marcus method
output <- data.frame()
for (i in unique(df$Group)) {
a = df%>% subset(Group==i)
a[,"residual"] = NA
max = nrow(a)
if(max<5){
next
}
for (j in seq(5,max,by=1)) {
b = a %>% slice((j-4):j)
lm_ = lm(y~x1+x2, data = b)
a[j,]$residual = residuals(lm_)[5]
}
output <-
output %>%
rbind(a)
}
Use group_modify and use rollapplyr with the by.column = FALSE argument so that rsd is applied to all columns at once rather than one at a time.
Note that if you use width 3 with two predictors and an intercept the residuals will necessarily be all zero so we changed the width to 5.
library(dplyr, exclude = c("lag", "filter"))
library(zoo)
width <- 5
df %>%
group_by(Group) %>%
group_modify(~ {
cbind(., rollapplyr(.[c("y", "x1", "x2")], width, rsd, fill = NA,
by.column = FALSE))
}) %>%
ungroup
A good old-fashioned for-loop here could be:
for (i in unique(df$Group)){
for (j in (seq(15000,15012, 3))){
lm_ <- lm(formula = df[df$Group== i & df$Date %in% c(j, j+1, j+2), 3] ~ df[df$Group== i & df$Date %in% c(j, j+1, j+2), 4] + df[df$Group== i & df$Date %in% c(j, j+1, j+2), 5], na.action = na.omit)
print(paste('Group', i, 'Dates from', j, 'to', j+3, residuals(lm_)))
}
}
I have a large dataset on which to perform a diff-in-diff estimation. Given the nature of the dataset my t-statistics denominators are inflated and coefficient are (surreptitiously) statistically significant.
I want to step-by-step reducing the number of element in the database, and for each step resample a large number of times and re-estimating each time interaction coefficient and standard errors.
Then I want to take all the averages estimates and standard error, and plot them on a graph, to show at what point (if any) they are not statistically different from zero.
My code follows with a toy example.
I am not sure this is the most efficient way to tackle the problem
I cannot retrieve and thus plot the confidence interval
I am not sure the sampling is representative given the existence of different groups.
Toy example (Creds Torres-Reyna - 2015)
library(foreign)
library(dplyr)
library(ggplot2)
df_0 <- NULL
for (i in 1:length(seq(5,nrow(mydata)-1,5))){
index <- seq(5,nrow(mydata),5)[i]
df_1 <- NULL
for (j in 1:10){
mydata_temp <- mydata[sample(nrow(mydata), index), ]
didreg = lm(y ~ treated + time + did, data = mydata_temp)
out <- summary(didreg)
new_line <- c(out$coefficients[,1][4], out$coefficients[,2][4], index)
new_line <- data.frame(t(new_line))
names(new_line) <- c("c","s","i")
df_1 <- rbind(df_1,new_line)
}
df_0 <- rbind(df_0,df_1)
}
df_0 <- df_0 %>% group_by(i) %>% summarise(coefficient <- mean(c, na.rm = T),
standard_error <- mean(s, na.rm = T))
names(df_0) <- c("i","c","s")
View(df_0)
Consider the following refactored code using base R functions: within, %in%, nested lapply, setNames, aggregate, and do.call. This approach avoids calling rbind in a loop and compactly re-writes code without constantly using $ column referencing.
library(foreign)
mydata = read.dta("http://dss.princeton.edu/training/Panel101.dta")
mydata <- within(mydata, {
time <- ifelse(year >= 1994, 1, 0)
treated <- ifelse(country %in% c("E", "F", "G"), 1, 0)
did <- time * treated
})
# OUTER LIST OF DATA FRAMES
df_0_list <- lapply(1:length(seq(5,nrow(mydata)-1,5)), function(i) {
index <- seq(5,nrow(mydata),5)[i]
# INNER LIST OF DATA FRAMES
df_1_list <- lapply(1:100, function(j) {
mydata_temp <- mydata[sample(nrow(mydata), index), ]
didreg <- lm(y ~ treated + time + did, data = mydata_temp)
out <- summary(didreg)
new_line <- c(out$coefficients[,1][4], out$coefficients[,2][4], index)
new_line <- setNames(data.frame(t(new_line)), c("c","s","i"))
})
# APPEND ALL INNER DFS
df <- do.call(rbind, df_1_list)
return(df)
})
# APPEND ALL OUTER DFS
df_0 <- do.call(rbind, df_0_list)
# AGGREGATE WITH NEW COLUMNS
df_0 <- within(aggregate(cbind(c, s) ~ i, df_0, function(x) mean(x, na.rm=TRUE)), {
upper = c + s
lower = c - s
})
# RUN PLOT
within(df_0, {
plot(i, c, ylim=c(min(c)-5000000000, max(c)+5000000000), type = "l",
cex.lab=0.75, cex.axis=0.75, cex.main=0.75, cex.sub=0.75)
polygon(c(i, rev(i)), c(lower, rev(upper)),
col = "grey75", border = FALSE)
lines(i, c, lwd = 2)
})
In the end I solved it like this:
Is this the most efficient way?
library(foreign)
library(dplyr)
mydata = read.dta("http://dss.princeton.edu/training/Panel101.dta")
mydata$time = ifelse(mydata$year >= 1994, 1, 0)
mydata$treated = ifelse(mydata$country == "E" |
mydata$country == "F" |
mydata$country == "G", 1, 0)
mydata$did = mydata$time * mydata$treated
df_0 <- NULL
for (i in 1:length(seq(5,nrow(mydata)-1,5))){
index <- seq(5,nrow(mydata),5)[i]
df_1 <- NULL
for (j in 1:100){
mydata_temp <- mydata[sample(nrow(mydata), index), ]
didreg = lm(y ~ treated + time + did, data = mydata_temp)
out <- summary(didreg)
new_line <- c(out$coefficients[,1][4], out$coefficients[,2][4], index)
new_line <- data.frame(t(new_line))
names(new_line) <- c("c","s","i")
df_1 <- rbind(df_1,new_line)
}
df_0 <- rbind(df_0,df_1)
}
df_0 <- df_0 %>% group_by(i) %>% summarise(c = mean(c, na.rm = T), s =
mean(s, na.rm = T))
df_0 <- df_0 %>% group_by(i) %>% mutate(upper = c+s, lower = c-s)
df <- df_0
plot(df$i, df$c, ylim=c(min(df_0$c)-5000000000, max(df_0$c)+5000000000), type = "l")
polygon(c(df$i,rev(df$i)),c(df$lower,rev(df$upper)),col = "grey75", border = FALSE)
lines(df$i, df$c, lwd = 2)
Suppose I have the following code that makes multiple regressions and stores the lm and lm with stepwise selection models in tibbles:
library(dplyr)
library(tibble)
library(MASS)
set.seed(1)
df <- data.frame(A = sample(3, 10, replace = T),
B = sample(100, 10, replace = T),
C = sample(100, 10, replace = T))
df <- df %>% arrange(A)
formula_df <- as.tibble(NA)
aic_df <- as.tibble(NA)
for (i in unique(df$A)){
temp <- df %>% filter(A == i)
formula_df[i, 1] <- temp %>%
do(model = lm(B ~ C, data = .))
aic_df[i, 1] <- temp %>%
do(model = stepAIC(formula_df[[1,1]], direction = "both", trace = F))
}
Is it possible to vectorize to make it faster, for example using the *pply functions? The loop becomes extremely slow when the data gets larger. Thank you in advance.
You could try something like:
model <- df %>% group_by(A) %>%
summarise(formula_model = list(lm(B ~ C))) %>%
mutate(aic_model = list(stepAIC(.[[1,2]], direction = "both", trace = F)))
#Inputs:
n1 = c(5,6,7)
n2 = c(1,2,3)
list1 = data.frame(n1,n2)
list2 = data.frame(n1,n2)
listx = list(list1,list2)
n1 = c(5,6,7,8)
n2 = c(6,7,8,9)
list3 = data.frame(n1,n2)
list4 = data.frame(n1,n2)
list5 = data.frame(n1,n2)
listy = list(list3,list4,list5)
list6 = list(listx,listy)
#Code:
z <- list()
for(i in 1:length(list6)){
w <- data.frame(x=c(rep(0, nrow(list6[[i]][[1]])))) #init 0,0,0,0...
for(j in 1:length(list6[[i]])){
w[,1] <- w[,1] + list6[[i]][[j]]$n1
z[[i]] <- w
}
}
z
I believe there's a more efficient coding method instead of using double for-loop, would like lapply/sapply type equivalent (or any?). Many thanks
lapply(list6,function(x) Reduce("+",x)[,1,drop=FALSE])
This should do the job given list6.
With tidyverse, if there are no missing elements i.e NA, we can use the reduce approach
library(dplyr)
library(purrr)
list6 %>%
map(~ .x %>%
reduce(`+`) %>%
select(1))
Or in general, it can be done with group_by sum
list6 %>%
map(~ bind_rows(.x, .id = 'grp') %>%
group_by(grp) %>%
group_by(grp1 = row_number()) %>%
summarise_at(2, sum, na.rm = TRUE) %>%
select(-grp1) )
I have a data frame with I obsevations, and each observation belongs to one of g categories.
set.seed(9782)
I <- 500
g <- 10
library(dplyr)
anon_id <- function(n = 1, length = 12) {
randomString <- c(1:n)
for (i in 1:n)
{
randomString[i] <- paste(sample(c(0:9, letters, LETTERS),
length, replace = TRUE),
collapse = "")
}
return(randomString)
}
df <- data.frame(id = anon_id(n = I, length = 16),
group = sample(1:g, I, T))
I want to randomly assign each observation to one of J "urns", given some vector of probabilities p. That is the probability of being assign to urn J=1 is p[1]. The added complexity is that I want to do this block by block.
If I ignore the blocks, I can do this easily:
J <- 3
p <- c(0.25, 0.5, 0.25)
df1 <- df %>% mutate(urn = sample(x = c(1:J), size = I, replace = T, prob = p))
I thought about this method to do it by "block"
# Block randomization
randomize_block <- function(g) {
df1 <- df %>% filter(group==g)
size <- nrow(df1)
df1 <- df1 %>% mutate(urn = sample(x = c(1:J),
size = size,
replace = T,
prob = p))
return(df1)
}
df2 <- lapply(1:g, randomize_block)
df2 <- data.table::rbindlist(df2)
Is there a better way?
Not sure if this is better, but here is a base R technique with data.frame df, that has group name "group" as well as urn assignments 1:J with assignment probabilities in vector p of length J.
# get urn assignment
urnAssignment <- lapply(unique(df$group),
function(i) sample(1:J, nrow(df[group==i,]), replace =T, prob=p))
# get a list that collects position of observations
obsOrder <- lapply(unique(df$group),
function(i) which(df$group == i))
df$urnAssignment <- unlist(urnAssignment)[unlist(obsOrder)]
randomizr::block_ra does exactly what you want.
library(randomizr)
library(janitor) #just for the tabyl function
block_rand <- as.tibble(randomizr::block_ra(blocks = df$group, conditions = c("urn_1","urn_2","urn_3")))
df2 <- as.tibble(bind_cols(df, block_rand))
df2 %>% janitor::tabyl(group, value)
This does the trick using dplyr:
randomize <- function(data, groups=2, block_id = NULL, p=NULL, seed=9782) {
if(is.null(p)) p <- rep(1/groups, groups)
if(is.null(block_id)){
df1 <- data %>%
mutate(Treatment = sample(x = c(1:groups),
size = n(),
replace = T,
prob = p))
return(df1)
}else{
df1 <- data %>% group_by_(block_id) %>%
mutate(Treatment = sample(x = c(1:groups),
size = n(),
replace = T,
prob = p))
}
}
df1 <- randomize(data = df, groups = J, block_id = "group", p = p, seed = 9782)