Randomization/ permutation test - r

I have a panel data that looks like this:
x <- structure(list(id2 = c(1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3,
4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8,
8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 10, 11, 11, 11), private = c(1,
1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
0, 0, 0, 0, 1, 1, 1), capex = c(-0.003423963, -0.028064674, -0.03058208,
-0.00186256, -0.010839419, 0.052905358, 0.058931317, 0.065547734,
0.007644231, -0.025942514, 0.00163772, -0.007530502, 0.010706151,
0.025040116, 0.035105374, 0.036772128, 0.03886272, 0.045399148,
0.042642809, 0.080788992, 0.080848917, 0.057645567, 0.057636742,
0.046084184, 0.041080192, 0.05690382, 0.057325598, 0.051791377,
0.070084445, 0.069627948, 0.077849329, 0.077247024, 0.081251919,
0.071702167, 0.078424804, 0.077482991, 0.078969546, 0.076208547,
0.059055354, 0.056043826, 0.029450929, 0.016044363, 0.048353843,
0.047607671, 0.046497576, 0.047454875, 0.050881654, 0.047155183,
0.055546004, 0.057564467), roa = c(-0.003078416, -0.035302367,
-0.01884984, 0.002839225, -0.001113289, 0.024291474, 0.040153231,
0.051482957, 0.026102768, 0.005372915, 0.004466314, -0.025178509,
-0.002043936, -0.069235161, 0.023604594, 0.010512878, 0.021912357,
0.016721437, -0.09472625, 0.04061316, 0.074661337, 0.0214584,
0.047743626, 0.013149141, -0.010418181, 0.025671346, 0.031785361,
0.084893651, 0.018490626, 0.024941774, 0.023567598, 0.031878859,
0.029931206, 0.043837443, 0.041305128, 0.041293647, 0.039307728,
0.046259467, 0.017479861, 0.029429797, 0.023826957, 0.00763736,
0.017485917, 0.017156925, 0.006504506, 0.021350464, 0.032917287,
0.036106978, 0.04545372, 0.049348988), year = c(2011, 2012, 2013,
2014, 2015, 2011, 2012, 2013, 2011, 2012, 2013, 2014, 2015, 2011,
2012, 2013, 2014, 2015, 2011, 2012, 2013, 2014, 2015, 2011, 2012,
2013, 2014, 2015, 2011, 2012, 2013, 2014, 2015, 2011, 2012, 2013,
2014, 2015, 2011, 2012, 2013, 2014, 2011, 2012, 2013, 2014, 2015,
2011, 2012, 2013)), row.names = c(NA, -50L), class = c("tbl_df",
"tbl", "data.frame"))
Where id2 is firm ID and private is an indicator for private/public status. My goal is to run a randomization test for r-squared as follows:
regress roa on capex for private firms (i.e. private==1) and public (private==0) separately and get the observed difference in R-squared
randomly assign firms to private-public status (note that the data is panel)
rerun the regression and get the difference in r-squared for the random sample
repeat this, say, 1000 times
measure the p-value as the number of times that the randomly generated difference in R2 is larger than the observed difference divided by the number of iterations (1,000)
My issue is that this code takes ages to run, it will be great if someone has an idea of a better way to do this.
you will need estimatr and tidyverse packages to run this code
library(estimatr)
library(tidyverse)
# run model 1
mod1 <-lm_robust(roa ~ capex,
cluster=id2,
se_type = "stata",
data=x,private==0)
# run model 2
mod2 <-lm_robust(roa ~ capex,
cluster=id2,
se_type = "stata",
data=x,private==1)
#obtain the observed difference in R2
R2.obs1 <- summary(mod1)$adj.r.squared
R2.obs2 <- summary(mod2)$adj.r.squared
diff_r2_obs <- R2.obs2 - R2.obs1
]
#create a list for the simulated differnce in r2
simulated_r2 <- list()
# prepare the loop
set.seed(8)
nreps = 1000
for(i in 1:nreps){
x1 <- x %>% # randamize the variable private taking into acount each id appears a number of times
distinct(id2, private) %>%
mutate(private1=sample(private), replace=T) %>%
left_join(x, by="id2")
model_m <-lm_robust(roa ~ capex,
cluster=id2,
se_type = "stata",
data = x1,
subset=private1==0)
R2.obs_m <- summary(model_m)$adj.r.squared
model_f <-lm_robust(roa ~ capex,
cluster=id2,
se_type = "stata",
data = x1,
subset=private1==1)
R2.obs_f <- summary(model_f)$adj.r.squared
r2_diff_sim <- R2.obs_f - R2.obs_m
simulated_r2[i] <- r2_diff_sim
}
simulated_r2 <- unlist(simulated_r2)
exceed_count <- length(simulated_r2[simulated_r2 >=
diff_r2_obs])
p_val <- exceed_count / nreps
p_val

I initialized simulated_r2 as a vector rather than a list. We benefit a lot when we predefine it's length=. Also I replaced the dplyr code with an transform approach which appears to be faster. Runs quite fast with 1000 reps on this data actually. (Note, that in your code there was a mistake when using sample, the replace= argument is outside the call but should be inside!)
nreps <- 1000
simulated_r2 <- numeric(length=nreps)
set.seed(8)
system.time(
for (i in seq_len(nreps)) {
x1 <- transform(subset(x, !duplicated(id2)),
private=sample(private, replace=T))
model_m <- estimatr::lm_robust(roa ~ capex, clusters=id2, se_type="stata",
data=x1, subset=private == 0)
R2.obs_m <- summary(model_m)$adj.r.squared
model_f <- estimatr::lm_robust(roa ~ capex, clusters=id2, se_type="stata",
data=x1, subset=private == 1)
R2.obs_f <- summary(model_f)$adj.r.squared
r2_diff_sim <- R2.obs_f - R2.obs_m
simulated_r2[i] <- r2_diff_sim
}
)
# user system elapsed
# 8.262 0.000 8.267
(p_val <- length(simulated_r2[simulated_r2 >= diff_r2_obs])/nreps)
# [1] 0.273
Note, that warnings occur if you set replace=TRUE, probably when the nobs of one of the subsets on 0/1 get to small. You should rethink your approach a little.
Parallelize
If this is too slow, you could parallelize the code.
library(parallel)
cl <- makeCluster(detectCores() - 1)
clusterExport(cl, 'x')
clusterSetRNGStream(cl, 8) ## set seed
nreps <- 10000
system.time(
simulated_r2 <- parSapplyLB(cl, seq_len(nreps), \(i) {
x1 <- transform(subset(x, !duplicated(id2)),
private=sample(private, replace=F))
model_m <- estimatr::lm_robust(roa ~ capex, clusters=id2, se_type="stata",
data=x1, subset=private == 0)
R2.obs_m <- summary(model_m)$adj.r.squared
model_f <- estimatr::lm_robust(roa ~ capex, clusters=id2, se_type="stata",
data=x1, subset=private == 1)
R2.obs_f <- summary(model_f)$adj.r.squared
R2.obs_f - R2.obs_m
})
)
# user system elapsed
# 0.067 0.058 17.820
(p_val <- length(simulated_r2[simulated_r2 >= diff_r2_obs])/nreps)
# [1] 0.257
stopCluster(cl)

This takes a different approach to jay.sf's nice solution. Here, I use data.table, create the random permutations of private/public status, use a small function to get difference(s) in r-sq:
library(data.table)
setDT(x)
set.seed(8)
nperms=1000
# get permutations of public/private status
perms = cbind(unique(x[,.(id2)]), unique(x[,.(id2,private)])[,lapply(1:nperms, function(p) sample(private, replace=F))])
# function to get R-sq differences
obsR <- function(r,c,id,p) {
r0 = lm_robust(r[p==0]~c[p==0],clusters=id[p==0],se_type="stata")$adj.r.squared
r1 = lm_robust(r[p==1]~c[p==1],clusters=id[p==1],se_type="stata")$adj.r.squared
r1-r0
}
# empirical r-sq diff
empirR = x[, obsR(roa,capex,id2,private)]
# simulated r-sq differences
simR = x[perms, on=.(id2)][, sapply(.SD, function(v) obsR(roa,capex,id2,v)), .SDcols = patterns("V")]
# pvalue
sum(simR>=empirR)/nperms
With the small provided dataset, this can do a thousand estimates and get the p-value in under 5 seconds on my machine.
user system elapsed
4.56 0.14 4.70

Related

Compute value based on multiple conditions

I have a dataframe of children with their month and year of birth as well as air quality data their location from 1999 to 2013. I want to replicate a method used in a paper (https://www.nature.com/articles/s41586-018-0263-3#Sec2) where the authors compute the in utero exposure. Their calculation covers both air quality exposure for two years if the pregnancy to birth period occurred in two-year period. Using a direct quote from the paper "For instance, a child born in the third month of year t would be assigned an in utero pollution exposure of 2/9(exposure in year t) + 7/9(exposure in year t−1)". exposure in year t−1 means the year prior to the birth year.So for a child born in se month 9 to month 12 in year t, the child's exposure is the mean value for just year t.
Is there a way to capture this computing formula in R such that if the birth month is less than 9, then the formula above is applied. Else, the value for the birth year is returned.
The structure of a sample data is
structure(list(CaseID = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
44, 45, 46, 47, 48, 49, 50), Birth_month = c(1, 5, 7, 3, 12,
12, 11, 8, 8, 6, 9, 2, 10, 4, 8, 12, 9, 9, 1, 5, 7, 3, 10, 9,
11, 8, 8, 3, 9, 2, 8, 9, 8, 5, 6, 8, 9, 1, 5, 7, 3, 10, 9, 11,
8, 8, 3, 9, 2, 11), Birth_Year = c(2000, 2003, 2010, 2008, 2006,
2001, 2012, 2013, 2007, 2007, 2008, 2000, 2013, 2004, 2001, 2010,
2008, 2006, 2001, 2012, 2001, 2012, 2013, 2007, 2007, 2008, 2000,
2013, 2013, 2004, 2001, 2010, 2008, 2006, 2001, 2012, 2001, 2012,
2013, 2006, 2001, 2012, 2013, 2007, 2007, 2008, 2000, 2013, 2004,
2001), AIR1999 = c(8.96798959699599, 6.80355783305597, 5.89652403522655,
6.08351130989008, 0.421893172909738, 9.04891222893749, 7.32566080174991,
11.9538120562027, 5.83821052537696, 3.05275316117657, 7.56280655869865,
14.8201468275825, 7.64593596096383, 13.4841890986795, 14.0470039087897,
7.70797096865252, 1.61059988426883, 3.80031918734312, 14.9501510635975,
10.0847597081028, 10.1585740911858, 14.173089028311, 6.63212439636118,
5.61902280925075, 0.986908540996723, 3.13157703826507, 4.89343957488122,
5.31442170692771, 12.6929693986033, 1.26105029549589, 5.4180377367679,
4.25665508354944, 14.3792099452973, 5.24041748730466, 6.07755938634626,
4.95787754927552, 9.61953763780231, 3.75130929742288, 7.53080678060791,
12.3820129775514, 13.6887069176028, 4.13298569958261, 14.1752688808057,
3.52867313173367, 11.8386358425156, 1.79493586691422, 11.4111429566422,
14.4466513980518, 10.9929119145086, 0.526445570948068), AIR2000 = c(11.5400193935689,
4.51400380016048, 13.0405851599751, 9.26140852396632, 8.20474216026161,
13.7864063032873, 2.26955144127086, 2.73370701770671, 1.31484532203898,
5.95023887719191, 1.35904698184086, 6.38629071258241, 10.6814684475625,
11.7245391233198, 6.32023995462316, 14.6061184533106, 13.1787409855174,
2.6639327040026, 9.77708519725292, 14.2537217202925, 0.251322827515658,
11.6949597093156, 0.901117715214845, 8.8839327113477, 7.87421084115212,
11.7988941842713, 11.5616693068156, 13.6192478532903, 10.2098129178141,
2.6167536478648, 1.54635387455602, 12.0428954083114, 4.64425452833646,
15.1004807784159, 8.74940761103621, 6.54845786664169, 3.74066828807676,
13.8062808303186, 3.14344133563456, 4.87711996067478, 1.83335648309905,
0.564922403780743, 4.75278873524722, 14.1680012285325, 10.9841650458858,
4.35061422900041, 9.69220729148341, 12.3134085446505, 4.58245493312832,
6.09497011553915), AIR2001 = c(2.34047661832464, 10.1636542402387,
5.63355084510311, 4.85862207704014, 9.23495329037285, 14.7018283497058,
3.34896555680828, 10.7035308826005, 9.70681187241594, 5.94983366167778,
10.6982700353058, 8.20110325993877, 5.47815369644901, 2.29662297346932,
5.81311151243444, 5.86180122796237, 3.29303425198002, 12.6395572883144,
9.19196868231753, 6.9857222145889, 11.3394421438701, 11.047885820206,
10.3115117570383, 6.52485884934803, 4.32370917450194, 4.76700344993779,
2.66265303340764, 9.34482253280724, 8.82734048097441, 12.4377567085049,
13.4038470998239, 6.85719530528947, 13.0154083911972, 13.9397814449291,
6.42288194378652, 11.5360288507433, 4.05413083475339, 3.52317486527632,
5.33977231099782, 2.62500006449549, 6.07684988212213, 9.66904514889885,
4.74849246345088, 4.30533116448019, 7.45883561741118, 6.83800770166353,
13.7860890292185, 8.95913024599384, 4.22882908951701, 11.9366229237847
), AIR2002 = c(5.5421209915732, 8.82941941950074, 13.9415065737988,
4.05088145747734, 0.634888762937393, 8.94379655820853, 14.6699177745578,
2.92273129951465, 3.32000594967115, 9.6626486005201, 7.67113231661101,
5.73763535333215, 6.66896355409967, 4.76775433524861, 11.7307171144853,
11.9204512965262, 4.30270969435317, 6.07337335666409, 0.895004365468398,
7.92298369263555, 0.373079233546509, 11.6968179701357, 2.86150118232053,
2.24216720518936, 11.7806082410973, 9.40187198882131, 12.0526066867323,
11.9956986808735, 3.58023497230932, 8.58664103748742, 7.57416980166524,
4.31572127419198, 4.87601018757676, 1.42685255046957, 0.610385331556434,
4.55194799008779, 14.0502073411413, 9.80004817027273, 8.61341614048323,
0.900034241871675, 1.41049400813086, 1.55246704871324, 4.72272099354747,
0.316862174819224, 2.93053907830617, 8.05078771834052, 5.28952837263816,
10.1443750119205, 9.46636938612279, 7.2068673761792), AIR2003 = c(11.6987492112564,
8.03123494261876, 14.6267426478136, 10.1608755875826, 12.2925055704629,
4.41903535030968, 1.32192490517627, 14.9171937333997, 5.989500815548,
1.33018757369556, 11.5268041675435, 10.0354422837207, 12.2193238575859,
8.34460034461785, 3.19971360721812, 4.72411370138545, 8.99170322034135,
8.84431456421316, 9.19228873900324, 11.6073826548178, 5.04809568638727,
2.44113702619448, 1.3020562816998, 6.58069895481877, 12.9026523987483,
7.16368647065572, 8.66070749361161, 6.00576582145039, 2.10032442033477,
6.07895397265628, 9.84188887039386, 8.72011480981018, 8.3973909978196,
2.93135377582256, 2.07479393241927, 4.33787543469854, 11.5524645352829,
6.32281940148678, 13.4578647589665, 8.25475389447622, 6.92853975148313,
13.9737096681725, 3.20975688825082, 5.83681199098192, 14.7013523747697,
12.8483105113003, 14.7442567509897, 11.3872388804331, 11.9289726106934,
11.3432943127351), AIR2004 = c(12.3298210280007, 13.1959769331908,
12.370661520134, 15.0868251372818, 8.94963014865853, 12.2191067473528,
5.22362188581377, 0.720411225163843, 12.4820522407312, 0.351069664391922,
1.30037323312717, 4.37076402375568, 3.62288978581387, 8.48409274117183,
0.679597671930213, 7.34746075248183, 13.1647057766567, 11.1547359531655,
4.96984202732425, 8.96129352980759, 9.53370854761545, 5.8134242443291,
8.71016911426559, 11.8883539470122, 12.2352502209791, 4.68252076998213,
5.95696850650571, 13.8056899585128, 7.50516538352217, 5.02250328123337,
13.487732587669, 14.5949376791599, 14.9951364343211, 6.59220350365457,
9.48408624811447, 1.41794018899766, 14.1069514028479, 9.36621385829034,
13.7439532769257, 10.4824247457942, 14.7214399003615, 3.88337102833367,
6.97264864603989, 8.02159295712691, 12.6082874644615, 13.1953726709504,
13.6941493970903, 8.22842252415093, 2.50750512640784, 3.58666553804674
), AIR2005 = c(14.1543983449384, 0.207154301401461, 10.3945003528385,
1.55163006050745, 11.7088354445258, 13.8307717176049, 1.60947553726472,
12.4807727501071, 11.4753113418682, 12.1466166828242, 4.28278952114563,
15.0142466179228, 10.775123869299, 8.21507695954968, 10.2687020091594,
1.73548990486166, 5.87357538677612, 0.838787299717544, 2.09888134363526,
1.10793127552466, 12.8584094551371, 5.82341710817441, 7.35206087073311,
2.75513321271562, 2.28206509921863, 14.7161662780128, 1.92734397216351,
8.52569639326329, 0.435341671429342, 14.8357928042437, 11.0187966747552,
6.7161015153341, 0.729511158074718, 3.54536445078091, 0.25745916891587,
13.7526868569555, 12.4909417931621, 2.44651768055302, 4.29353768908698,
4.3296417530498, 0.332072943038773, 3.9897366906961, 7.34801499286247,
9.05279436282767, 7.81229433161113, 14.6314884596826, 13.0453833885309,
13.3968611858496, 7.20212454602541, 13.6024225526371), AIR2006 = c(14.3602117700654,
9.67784631192451, 3.55228421162465, 9.17853459700709, 3.97472074926249,
0.453478662259644, 13.5944658882257, 5.97668834996084, 5.33786357147782,
7.47689095921, 2.81935510349181, 3.96459207065962, 13.239876144056,
8.31305088601867, 4.70401683545741, 3.55927178151719, 0.198072984460741,
12.4664548987749, 3.95155436665518, 1.47646058510151, 1.15288747034175,
10.5564422610337, 11.799767498377, 9.69579476557509, 13.4778237678527,
0.578315307238372, 7.05817080976465, 12.4450234571598, 3.42532579906844,
8.00017632821924, 8.99522336013103, 10.3307407885047, 7.55966268972564,
2.25758463139716, 0.838791949320352, 14.5796161984191, 0.842886504037306,
12.3815860850157, 7.64115155479219, 8.93122855890589, 0.309247846194077,
9.11649027255387, 14.334240223086, 11.4479621604206, 0.662708246090217,
11.4913576022733, 7.95924178698263, 2.67515668715094, 0.879071030008839,
13.6501796660786), AIR2007 = c(9.97003772936854, 5.66991338106431,
13.1535815664255, 6.65941205831734, 12.9431154270126, 7.54975869239541,
2.68487146483106, 8.1954225972998, 0.989637708235998, 15.0157662873652,
14.2492329591238, 8.72830057256878, 10.6656589124098, 14.407073575431,
5.56657257661223, 12.0258530469052, 14.3806802332073, 6.06250679987832,
12.6181324996231, 13.7419910761765, 7.96911312058102, 4.95833557256381,
4.3284153526728, 4.37892194461683, 8.58931198042166, 15.086752575278,
6.84238949001231, 3.06172467645654, 13.959445785162, 4.88097198925563,
8.14034017345565, 14.5361199092346, 14.4077074720624, 8.95354569882294,
2.42978998625372, 3.71779510277393, 8.94922842727159, 14.5382596874342,
11.9046104310593, 3.08895008495753, 12.1694276854037, 12.2801624915064,
3.20018387928163, 1.09386935686017, 2.04407673706114, 5.70477642147499,
15.0334163082221, 6.06478327849694, 1.98094456569944, 1.69397633470688
), AIR2008 = c(9.30397066498711, 14.7486538500078, 5.58379651584826,
8.80245459234202, 13.487498242771, 4.53696579690417, 14.4067800776965,
6.79924817802059, 13.2960046531346, 14.6843470196975, 1.25236799454479,
5.97676394814788, 8.07777813938539, 8.63516177444276, 8.9589754464475,
15.0031446587539, 14.8031015943952, 2.49399273799104, 10.7569668015961,
4.28002286700346, 10.1122482336515, 5.05124714686349, 11.0521815955096,
0.343769731794717, 13.0853167199786, 12.1844864826184, 7.30493155200058,
8.15045584558858, 7.54400607235706, 2.49430188038922, 0.753580058788415,
13.3526555226133, 6.2869325775844, 8.3021644556059, 10.9850292504223,
2.18342414438305, 7.04020942923496, 11.017099474912, 8.27777373495162,
2.20133028454101, 8.49576657339185, 1.58340844516177, 2.63671668942692,
12.1220992218638, 1.75073969954345, 1.62113385793217, 14.099990950956,
6.91643273566617, 4.30560507998336, 3.77403995886794), AIR2009 = c(1.67957183947507,
7.36369897405664, 6.58358088548272, 2.38956139467959, 7.1751103291281,
6.10804192248383, 14.9931828636669, 1.0084126098291, 12.1872471913092,
5.01698368997965, 13.3619180300804, 12.8454065388709, 13.4218508905321,
14.4554111795065, 13.0900773408331, 12.3451018773937, 11.6958484717617,
7.24591008744668, 8.89023789232783, 11.9545650390091, 10.6377289210549,
1.51871868283534, 12.5126682352189, 5.65189430709695, 13.4598202915611,
14.3695526043761, 7.51904321175604, 6.24126743773976, 14.8900968077758,
8.72267893032101, 12.6833998839031, 2.2834426388687, 8.63599658885389,
9.25064845107519, 2.92453739979817, 10.8969492889207, 8.18858585808612,
1.62620084687509, 7.34673358480399, 12.0605738303089, 14.7037668900241,
12.6395342878401, 3.43311224376853, 3.07568217523932, 12.7473398134939,
12.403539704418, 11.9232473302584, 10.1782277375013, 4.20707396323467,
0.415611640287098), AIR2010 = c(3.43646944312239, 1.35975490884157,
0.983912079614354, 0.778802652686601, 12.3345025972561, 1.00613869713736,
6.00807040606486, 10.3557605237481, 0.571828635439742, 3.23014963916549,
1.50621501078014, 5.46978982399334, 10.487337793163, 2.86241729479888,
1.20100536868256, 9.26454035470425, 0.789336679036962, 1.52654933029041,
8.34533523020451, 11.7671594076965, 8.67667731308495, 11.9934830903618,
3.69538885993906, 9.75540565486182, 9.09214047452505, 1.95581501926063,
14.9897193048559, 14.5229993574421, 9.34056145941513, 1.41195704022213,
9.19928426172957, 11.7689157741664, 6.50435117529007, 3.27914538083738,
1.54789654616197, 10.6469015548979, 12.8618307883253, 8.4587018141062,
11.5180247743961, 2.29130000634049, 2.89561315982137, 2.7726630775854,
6.8413915040032, 13.9176417988634, 13.7705030146381, 0.52759597543464,
1.77941870173556, 5.53348983248253, 12.8605902424706, 3.84327436685492
), AIR2011 = c(7.77058110130485, 6.32489416138083, 9.22097524081613,
12.5675709558835, 8.91335506780934, 0.296842172110919, 14.3338035923715,
4.95561812325357, 1.608350189402, 9.59421682450897, 0.325252260173904,
4.3207606499726, 3.37387573508476, 7.52029055199213, 13.879214577225,
0.226798813565169, 14.1581023259959, 4.70900534654991, 14.7834656228917,
4.33595546064293, 9.77514042361919, 0.503774107655976, 7.74240870135883,
4.71506228298857, 14.9198252162652, 7.62993979265424, 10.4375572762149,
11.1040101385666, 3.69114950193861, 4.54456000300357, 8.70021347533702,
6.00627810265892, 0.358524227888556, 5.78310163486213, 6.06518945520674,
0.372255713034654, 1.26057865931257, 12.8861343483718, 13.7745616754233,
11.8614557243602, 5.64862275314773, 6.31275415731431, 8.65507386001036,
13.9926604995227, 0.355387645868585, 8.69261395941931, 13.1685344892796,
6.71539060721896, 6.13620514613669, 11.5668249191546), AIR2012 = c(6.27081028353097,
7.86677249507676, 0.58618860890693, 12.388112353907, 2.85958583392995,
7.95952853473043, 2.56079553408665, 10.8089659682196, 5.11167549905134,
11.0882686589232, 5.7707566303697, 11.3495220269873, 2.28377063603839,
0.545485086908331, 3.47129222193733, 6.00050472526765, 9.87287770145945,
4.24923844732647, 4.15209205533913, 14.5653932706246, 9.64327895222511,
5.11627035313752, 13.1698418143084, 0.588214227333898, 5.74336776575609,
3.65524465449248, 6.91148347930191, 9.29653071753797, 14.6478107533806,
9.5541726365569, 3.80763447697437, 2.96324152937718, 2.91963071328681,
6.08352127633477, 13.1309727081126, 10.8486344935496, 2.23741842607246,
3.65041686724569, 1.81148035473563, 0.905375592498109, 13.6041364369688,
10.7900264257432, 6.13362577195535, 6.43496705320152, 10.7790867917938,
0.281412984936498, 5.93390544060129, 4.08022078832751, 14.0153999755471,
5.40949646908627), AIR2013 = c(8.27712373652565, 9.04319920654944,
6.74612877325271, 13.8873930297748, 11.9153593588832, 4.61324733532476,
8.4878295564407, 5.77498647962371, 4.68663368575624, 9.53798435499915,
13.8239222256639, 12.4329902363573, 11.6120311678429, 10.0747855773047,
3.92196284314175, 5.33750072185858, 3.57540085327323, 9.85522755583306,
11.9822010349946, 3.45481815994019, 10.892755780485, 6.48081650614436,
3.70137969797198, 12.7409640677895, 10.6433459473609, 2.62831938764267,
14.5458863536206, 4.11003716498474, 3.62549759677309, 13.7243097189411,
14.4954679398946, 6.75792811234971, 4.24549357121275, 7.19747523671994,
10.0870223054055, 12.3274281337378, 12.2477434444681, 10.0755658776555,
3.12831819514418, 6.52412743031723, 8.59926303841453, 10.9694159899252,
6.06990504534845, 2.95378935882496, 1.91351455361326, 13.3501023647671,
6.91474291581591, 8.28943165152357, 10.4379813756945, 7.7918838394885
)), row.names = c(NA, -50L), class = c("tbl_df", "tbl", "data.frame"
))
Using the sample data, my formula in simple terms would be
*exp = x/9(t) + y/9(t-1) if Birth_month < 9 #else
exp = t
where x= Birth_month - 1; y = 9 - x; t = air quality value for the year of birth.*
Thank you
There could be a more elegant way, but I would break it in 3 parts:
Getting a table of cases with the years and months (numeric) of conception and birth.
Getting a tidy table of caseID-year pairs of pollution values.
Join 1 and 2 to get pollution values for the conception and birth years, and then run the arithmetic.
library(tidyverse)
df_cases <- df %>%
group_by(CaseID) %>%
select(Birth_month, Birth_Year) %>%
mutate(conception_year = if_else(Birth_month <= 9, true = Birth_Year - 1, false = Birth_Year),
months_in_birth_year = min(9, Birth_month),
months_in_conception_year = 9 - months_in_birth_year ) %>% ungroup()
df_airquality <-
df %>% select(CaseID, AIR1999:AIR2013) %>%
pivot_longer(names_to = "year", names_prefix = "AIR", values_to = "airquality", cols = !CaseID) %>% mutate(year = as.numeric(year))
df_exposure <-
df_cases %>%
left_join(df_airquality, by = c( "CaseID", "Birth_Year" = "year")) %>%
left_join(df_airquality, by = c( "CaseID", "conception_year" = "year")) %>%
mutate(utero_exposure = ((months_in_birth_year - 1)/9*airquality.x) +
((months_in_conception_year + 1)/9*airquality.y) ) %>%
select(CaseID, Birth_month,Birth_Year, airquality.x, airquality.y, utero_exposure)
df_exposure
# A tibble: 50 x 6
CaseID Birth_month Birth_Year airquality.x airquality.y utero_exposure
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 2000 11.5 8.97 8.97
2 2 5 2003 8.03 8.83 8.47
3 3 7 2010 0.984 6.58 2.85
4 4 3 2008 8.80 6.66 7.14
5 5 12 2006 3.97 3.97 3.97
6 6 12 2001 14.7 14.7 14.7
7 7 11 2012 2.56 2.56 2.56
8 8 8 2013 5.77 10.8 6.89
# ... with 42 more rows

Negative binomial regression of month trend

I read a paper about negative binomial regression:"We modelled the number of Ecoli bloodstream infections and E coli UTIs per month using negative-binomial regression (incorporating overdispersion), assuming the same underlying population(no offset)." The figure as the followings
I also have a set of data, want to figure the infection like the figure with month/year, how can I do that? thank you very much
df <- structure(list(Year = c(2013, 2013, 2013, 2013, 2013, 2013, 2013,
2013, 2013, 2013, 2013, 2013, 2014, 2014, 2014, 2014, 2014, 2014,
2014, 2014, 2014, 2014, 2014, 2014, 2015, 2015, 2015, 2015, 2015,
2015, 2015, 2015, 2015, 2015, 2015, 2015), Month = c(1, 2, 3,
4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12), Incidence = c(2.25538216197745,
3.49502862307924, 2.76311704439615, 2.9836483329794, 3.09375,
3.0368028900429, 3.82920688208141, 3.9154960734432, 3.33517393705135,
3.54593329432417, 3.27586206896552, 3.25655281969817, 3.35912052117264,
3.21672101986362, 2.78237182605312, 2.58435732397113, 2.72516428295323,
3.1227603153476, 2.6300688599847, 2.66324718879463, 2.62653374233129,
2.45256358498183, 2.39520958083832, 3.58683926645092, 3.41995942421022,
3.61001317523057, 2.62718158187895, 2.86944045911047, 2.77978993118435,
2.89282762420792, 2.69410829432029, 3.22232223222322, 3.39818882811799,
3.36725958337297, 2.90030211480363, 3.20789124668435), Inpatient = c(8779,
6638, 9663, 9418, 9600, 8858, 9532, 9041, 9055, 8545, 9280, 10072,
9824, 6746, 10279, 10254, 10348, 9767, 10456, 10138, 10432, 9908,
9853, 11124, 10351, 7590, 10772, 11152, 11044, 10889, 11321,
11110, 11153, 10513, 11585, 12064), infection = c(198, 232, 267,
281, 297, 269, 365, 354, 302, 303, 304, 328, 330, 217, 286, 265,
282, 305, 275, 270, 274, 243, 236, 399, 354, 274, 283, 320, 307,
315, 305, 358, 379, 354, 336, 387)), row.names = c(NA, -36L), class = c("tbl_df",
"tbl", "data.frame"))
reference:
Vihta K D, Stoesser N, Llewelyn M J, et al. Trends over time in Escherichia coli bloodstream infections, urinary tract infections, and antibiotic susceptibilities in Oxfordshire, UK, 1998–2016: a study of electronic health records[J]. The Lancet Infectious Diseases, 2018, 18(10): 1138-1149.
Using the data above, one can do the following:
library(MASS) # for function glm.nb
library(ggplot2)
library(broom) # for tidy model outputs
Create a date, to make plotting easy
df$t <- as.Date(paste("01", df$Month, df$Year, sep = "-"), format = "%d-%m-%Y")
Plot the data. geom_smooth adds the trend line and confidence intervals, using the date as the predictor.
p <- ggplot(data = df, aes(x = t, y = infection)) +
geom_point() +
geom_smooth(method = "glm.nb")
p
To perform the regression, set the count of infections as the dependent variable and the nth month as the independent variable, below month_as_integer.
df$month_as_integer <- seq_along(df$Month)
m1 <- glm.nb(infection ~ month_as_integer, data = df)
using tidy from the broom package, one can get the estimate and confidence intervals as a data frame.
out1 <- as.data.frame(tidy(m1, exponentiate = TRUE, conf.int = TRUE) )
out1
term estimate std.error statistic p.value conf.low conf.high
1 (Intercept) 264.44399 0.048006493 116.184897 0.000000000 240.943378 290.556355
2 month_as_integer 1.00697 0.002250993 3.085763 0.002030303 1.002569 1.011394

Weighted moving average for time series in R

I'm quite new in time series and I'm wondering, is there any similar function as sma() from smooth package to fit weighted moving average (WMA) for my time serie?
I would like to fit a WMA model with weights
weights <- (1/35)*c(-3, 12, 17, 12, -3)
I'm able to calculate the values of WMA with filter() function but I would love to get output similar to sma() function (including e.g. residuals, BIC, ...)
Income <- c(44649, 47507, 49430, 51128, 54453, 58712, 60533,
63091, 63563, 62857, 62481, 63685, 65596)
Year <- c(2000, 2001, 2002, 2003, 2004, 2005, 2006,
2007, 2008, 2009, 2010, 2011, 2012)
df <- data.frame(Year, Income)
library(smooth)
# simple moving average model
(sma5_fit <- sma(df$Income, order = 5))
# weighted moving average
wma5 <- filter(df$Income, filter = (1/35)*c(-3, 12, 17, 12, -3), sides = 2)
Any suggestions welcomed!
EDIT:
It would be also nice to calculate first 2 and last 2 values of weighted moving average. Now, I have to calculate them by hand with following code (weights come from Kendall's Time Series book):
n <- length(Income)
wma5[n-1] <- sum(1/35 * c(2, -8, 12, 27, 2) * c(Income[(n-4):(n)]))
wma5[n] <- sum(1/70 * c(-1, 4, -6, 4, 69) * c(Income[(n-4):(n)]))
wma5[2] <- sum(1/35 * c(2, 27, 12, -8, 2) * c(Income[1:5]))
wma5[1] <- sum(1/70 * c(69, 4, -6, 4, -1) * c(Income[1:5]))

Stargazer Confidence Interval Incorrect?

So I am really fond of the stargazer package for displaying the statistics for regression models. I've been using R and Stata together to complete some problems in a textbook. One issue that I have found is that the confidence interval printed by the stargazer package does not correspond to the confidence interval by stata. I determined that the CI in stata is the correct one after doing it by hand.
Because the issue might may possibly lie in how I am handling the data, I offer it here as an optional choice. My primary concern is to determine why the CI's do not respond. From a previous post, here is one possible way of finding the data I am using;
install.packages("devtools") # if not already installed
library(devtools)
install_git("https://github.com/ccolonescu/PoEdata")
library(PoEdata) # loads the package in memory
library(multcomp) # for hypo testing
data(fair4) # loads the data set of interest
In Stata, the name of the dataset I am using is called fair4.dta. For the data itself, you can use it manually,
structure(list(year = structure(c(1880, 1884, 1888, 1892, 1896,
1900, 1904, 1908, 1912, 1916, 1920, 1924, 1928, 1932, 1936, 1940,
1944, 1948, 1952, 1956, 1960, 1964, 1968, 1972, 1976, 1980, 1984,
1988, 1992, 1996, 2000, 2004, 2008), label = "year", format.stata = "%9.0g"),
vote = structure(c(50.2200012207031, 49.8460006713867, 50.4140014648438,
48.2680015563965, 47.7599983215332, 53.1710014343262, 60.0060005187988,
54.4830017089844, 54.7080001831055, 51.681999206543, 36.1189994812012,
58.2439994812012, 58.8199996948242, 40.8409996032715, 62.4580001831055,
54.9990005493164, 53.773998260498, 52.3699989318848, 44.5950012207031,
57.7639999389648, 49.9129981994629, 61.3440017700195, 49.5960006713867,
61.7890014648438, 48.9480018615723, 44.6969985961914, 59.1699981689453,
53.9020004272461, 46.5449981689453, 54.7360000610352, 50.2649993896484,
51.2330017089844, 46.5999984741211), label = "Incumbent share of the two-party presidential vote", format.stata = "%9.0g"),
party = structure(c(-1, -1, 1, -1, 1, -1, -1, -1, -1, 1,
1, -1, -1, -1, 1, 1, 1, 1, 1, -1, -1, 1, 1, -1, -1, 1, -1,
-1, -1, 1, 1, -1, -1), label = "= 1 if Democratic incumbent at election time; -1 if a Republican incumbent", format.stata = "%9.0g"),
person = structure(c(0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1,
0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0,
1, 0), label = "= 1 if incumbent is running for election and 0 otherwise", format.stata = "%9.0g"),
duration = structure(c(1.75, 2, 0, 0, 0, 0, 1, 1.25, 1.5,
0, 1, 0, 1, 1.25, 0, 1, 1.25, 1.5, 1.75, 0, 1, 0, 1, 0, 1,
0, 0, 1, 1.25, 0, 1, 0, 1), label = "number of terms incumbent administration in power", format.stata = "%9.0g"),
war = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0), label = "= 1 for elections of 1920, 1944, and 1948 and 0 otherwise.", format.stata = "%9.0g"),
growth = structure(c(3.87899994850159, 1.58899998664856,
-5.55299997329712, 2.76300001144409, -10.0240001678467, -1.42499995231628,
-2.4210000038147, -6.2810001373291, 4.16400003433228, 2.22900009155273,
-11.4630002975464, -3.87199997901917, 4.6230001449585, -14.4989995956421,
11.7650003433228, 3.90199995040894, 4.27899980545044, 3.5789999961853,
0.690999984741211, -1.45099997520447, 0.377000004053116,
5.10900020599365, 5.04300022125244, 5.91400003433228, 3.75099992752075,
-3.59699988365173, 5.44000005722046, 2.17799997329712, 2.66199994087219,
3.12100005149841, 1.21899998188019, 2.69000005722046, 0.219999998807907
), label = "growth rate GDP in first three quarters of the election year", format.stata = "%9.0g"),
inflation = structure(c(1.97399997711182, 1.05499994754791,
0.603999972343445, 2.2739999294281, 3.41000008583069, 2.54800009727478,
1.44200003147125, 1.87899994850159, 2.17199993133545, 4.2519998550415,
0, 5.16099977493286, 0.18299999833107, 7.19999980926514,
2.49699997901917, 0.0810000002384186, 0, 0, 2.36199998855591,
1.93499994277954, 1.96700000762939, 1.25999999046326, 3.13899993896484,
4.81500005722046, 7.63000011444092, 7.83099985122681, 5.25899982452393,
2.90599989891052, 3.27999997138977, 2.06200003623962, 1.60500001907349,
2.32500004768372, 2.88000011444092), label = "growth rate of GDP deflator during first 15 quarters of admin", format.stata = "%9.0g"),
goodnews = structure(c(9, 2, 3, 7, 6, 7, 5, 8, 8, 3, 0, 10,
7, 4, 9, 8, 0, 0, 7, 5, 5, 10, 7, 4, 5, 5, 8, 4, 2, 4, 8,
1, 3), label = "number of quarters in first 15 with real GDP per capita growth > 3.2", format.stata = "%9.0g")), notes = c("more complete variable definitions in fair.def",
"1"), .Names = c("year", "vote", "party", "person", "duration",
"war", "growth", "inflation", "goodnews"), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -33L))
So here is the stargazer code that is giving me trouble:
presidential <- read_dta("~/Directory/fair4.dta")
pres.lm = lm(vote ~ growth, data = subset(presidential,
presidential$year >= 1916)
stargazer(pres.lm,
type = "text",
intercept.bottom = T,
digits = 5,
report = "vc*stp",
ci = T
)
confint(pres.lm, level = 0.95)
Consider the difference in the confidence intervals.
(0.52948, 1.24241) # in R, Stargazer
0.5087671 1.263126 # in R, confint(pres.lm)
.5087671 1.263126 # in Stata
I also calculated by hand for the confidence intervals and the confit() and the Stata numbers check out. The t-critical value for this dataset should be t_(N-2 , prob) = t(22,.0025) = -2.073873.
In addition, I made sure to create an entirely new data frame. That is, instead of subsetting within the the lm() argument, I subset it first. When comparing this method to the previous one, I still get the same exact (incorrect) confidence intervals.
# subset into a new dataframe
presidential.1 = subset(presidential, presidential$year >= 1916)
# create the model
pres.lm.2 = lm(vote ~ growth, data = presidential.1)
# compare the two
stargazer(pres.lm,pres.lm.2,
type = "text",
intercept.bottom = F,
digits = 5,
report = "vc*stp",
ci = T,
t.auto = T)
(1) (2)
-----------------------------------------------------------------------
Constant 50.84840*** 50.84840***
(48.86384, 52.83295) (48.86384, 52.83295)
t = 50.21835 t = 50.21835
p = 0.00000 p = 0.00000
growth 0.88595*** 0.88595***
(0.52948, 1.24241) (0.52948, 1.24241)
t = 4.87126 t = 4.87126
p = 0.00008 p = 0.00008
# correct intervals from Stata and R's confint()
growth 0.5087671 1.263126
Am I running the code incorrectly? It really isn't a big deal for me to run the stargazer command and print only the coefficients and the t-stats, but it is kind of disappointing that I would have to run confint() as a separate command given that the output for Stargazer is gorgeous. It is quite odd because the coefficient estimates and the t-statistics are perfect. The confidence intervals are off by varying degrees, and I would like to know what the cause of this might be. Any advice would be greatly appreciated.
The simple answer is that stata and confint calculate confidence intervals using the t-distribution, while stargazer's internal method uses the normal distribution. The result is that the former two are more conservative in their estimates and thus have wider CI compared to stargazer. (Well, I'm assuming with stata here, but since it gives the same results as confint I feel it is a safe assumption).
Looking deep into the source code for stargazer (line 688ff) we can find how CIs are calculated:
z.value <- qnorm((1 + .format.ci.level.use)/2)
coef <- .global.coefficients[.global.coefficient.variables[which.variable],i]
se <- .global.std.errors[.global.coefficient.variables[which.variable],i]
ci.lower.bound <- coef - z.value * se
ci.upper.bound <- coef + z.value * se
It uses qnorm to set the critical value.
Compare to confint:
a <- (1 - level)/2
a <- c(a, 1 - a)
fac <- qt(a, object$df.residual) ##Relevant line, uses T-distribution
pct <- format.perc(a, 3)
ci <- array(NA, dim = c(length(parm), 2L), dimnames = list(parm,
pct))
ses <- sqrt(diag(vcov(object)))[parm]
ci[] <- cf[parm] + ses %o% fac
Compare:
#Using normal/z distribution
> pres.lm$coefficients[2] + sqrt(diag(vcov(pres.lm)))[2] %o% c(-qnorm((1 + 0.95)/2), qnorm((1 + 0.95)/2))
[,1] [,2]
growth 0.5294839 1.242409
#Using t-distribution with df degrees of freedom
> df <- pres.lm$df.residual
> pres.lm$coefficients[2] + sqrt(diag(vcov(pres.lm)))[2] %o% c(-qt((1 + 0.95)/2, df), qt((1 + 0.95)/2, df))
[,1] [,2]
growth 0.5087671 1.263126
Probably the easiest way to handle this if you are committed to stargazer is to use the ci.custom argument:
> stargazer(pres.lm, type = "text", ci.custom = list(confint(pres.lm)))
===============================================
Dependent variable:
---------------------------
vote
-----------------------------------------------
growth 0.886***
(0.509, 1.263)
Constant 50.848***
(48.749, 52.948)
-----------------------------------------------
Observations 24
R2 0.519
Adjusted R2 0.497
Residual Std. Error 4.798 (df = 22)
F Statistic 23.729*** (df = 1; 22)
===============================================
Note: *p<0.1; **p<0.05; ***p<0.01
Once the sample size is sufficiently large, the t-distribution converges on the z-distribution and the differences between the CIs become much smaller.
set.seed(432)
x1 <- rnorm(10000, 100, 50)
u <- 2 * rnorm(10000)
y <- 50 + x1 * 0.752 * u
fit <- lm(y ~ x1)
> confint(fit)
2.5 % 97.5 %
(Intercept) 39.29108955 54.1821315
x1 -0.02782141 0.1061173
> stargazer(fit, type= "text", ci = T)
===============================================
Dependent variable:
---------------------------
y
-----------------------------------------------
x1 0.039
(-0.028, 0.106)
Constant 46.737***
(39.292, 54.181)
-----------------------------------------------
Observations 10,000
R2 0.0001
Adjusted R2 0.00003
Residual Std. Error 168.194 (df = 9998)
F Statistic 1.313 (df = 1; 9998)
===============================================
Note: *p<0.1; **p<0.05; ***p<0.01
With a sample size of 24, the t-distribution with 22 degrees of freedom has much fatter tails than the z!

Error in svd(ctr) : infinite or missing values in 'x'

I'm trying to run svd on some data and I'm getting an error. I saw another post suggesting that this might happen when one or more of the columns are all 0, but this is not the case here. Can someone please explain what is going on and how to fix this? Note, that this is a subset of a much larger data-set. Thank you.
year <- c(2015, 2015, 2015, 2015, 2015, 2015)
week <- c(1, 1, 1, 1, 1, 1)
flight_type_name <- c("Commercial", "Filler", "Label", "Commercial", "Filler", "Filler")
userdata_country <- c("NO", "SG", "NI", "None", "CA", "GT")
platform <- c("iphone", "linux", "iphone", "linux", "web", "ipad")
num_users <- c("26726, 2, 161, 1, 4316, 577")
impressions <- c(135019, 0, 312, 0, 37014, 11492)
clicks <- c(407, 2, 2, 2, 59, 25)
ctr <- data.frame(year, week, flight_type_name, userdata_country, platform, num_users, impressions, clicks)
svd(ctr)

Resources