Compute value based on multiple conditions - r

I have a dataframe of children with their month and year of birth as well as air quality data their location from 1999 to 2013. I want to replicate a method used in a paper (https://www.nature.com/articles/s41586-018-0263-3#Sec2) where the authors compute the in utero exposure. Their calculation covers both air quality exposure for two years if the pregnancy to birth period occurred in two-year period. Using a direct quote from the paper "For instance, a child born in the third month of year t would be assigned an in utero pollution exposure of 2/9(exposure in year t) + 7/9(exposure in year t−1)". exposure in year t−1 means the year prior to the birth year.So for a child born in se month 9 to month 12 in year t, the child's exposure is the mean value for just year t.
Is there a way to capture this computing formula in R such that if the birth month is less than 9, then the formula above is applied. Else, the value for the birth year is returned.
The structure of a sample data is
structure(list(CaseID = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
44, 45, 46, 47, 48, 49, 50), Birth_month = c(1, 5, 7, 3, 12,
12, 11, 8, 8, 6, 9, 2, 10, 4, 8, 12, 9, 9, 1, 5, 7, 3, 10, 9,
11, 8, 8, 3, 9, 2, 8, 9, 8, 5, 6, 8, 9, 1, 5, 7, 3, 10, 9, 11,
8, 8, 3, 9, 2, 11), Birth_Year = c(2000, 2003, 2010, 2008, 2006,
2001, 2012, 2013, 2007, 2007, 2008, 2000, 2013, 2004, 2001, 2010,
2008, 2006, 2001, 2012, 2001, 2012, 2013, 2007, 2007, 2008, 2000,
2013, 2013, 2004, 2001, 2010, 2008, 2006, 2001, 2012, 2001, 2012,
2013, 2006, 2001, 2012, 2013, 2007, 2007, 2008, 2000, 2013, 2004,
2001), AIR1999 = c(8.96798959699599, 6.80355783305597, 5.89652403522655,
6.08351130989008, 0.421893172909738, 9.04891222893749, 7.32566080174991,
11.9538120562027, 5.83821052537696, 3.05275316117657, 7.56280655869865,
14.8201468275825, 7.64593596096383, 13.4841890986795, 14.0470039087897,
7.70797096865252, 1.61059988426883, 3.80031918734312, 14.9501510635975,
10.0847597081028, 10.1585740911858, 14.173089028311, 6.63212439636118,
5.61902280925075, 0.986908540996723, 3.13157703826507, 4.89343957488122,
5.31442170692771, 12.6929693986033, 1.26105029549589, 5.4180377367679,
4.25665508354944, 14.3792099452973, 5.24041748730466, 6.07755938634626,
4.95787754927552, 9.61953763780231, 3.75130929742288, 7.53080678060791,
12.3820129775514, 13.6887069176028, 4.13298569958261, 14.1752688808057,
3.52867313173367, 11.8386358425156, 1.79493586691422, 11.4111429566422,
14.4466513980518, 10.9929119145086, 0.526445570948068), AIR2000 = c(11.5400193935689,
4.51400380016048, 13.0405851599751, 9.26140852396632, 8.20474216026161,
13.7864063032873, 2.26955144127086, 2.73370701770671, 1.31484532203898,
5.95023887719191, 1.35904698184086, 6.38629071258241, 10.6814684475625,
11.7245391233198, 6.32023995462316, 14.6061184533106, 13.1787409855174,
2.6639327040026, 9.77708519725292, 14.2537217202925, 0.251322827515658,
11.6949597093156, 0.901117715214845, 8.8839327113477, 7.87421084115212,
11.7988941842713, 11.5616693068156, 13.6192478532903, 10.2098129178141,
2.6167536478648, 1.54635387455602, 12.0428954083114, 4.64425452833646,
15.1004807784159, 8.74940761103621, 6.54845786664169, 3.74066828807676,
13.8062808303186, 3.14344133563456, 4.87711996067478, 1.83335648309905,
0.564922403780743, 4.75278873524722, 14.1680012285325, 10.9841650458858,
4.35061422900041, 9.69220729148341, 12.3134085446505, 4.58245493312832,
6.09497011553915), AIR2001 = c(2.34047661832464, 10.1636542402387,
5.63355084510311, 4.85862207704014, 9.23495329037285, 14.7018283497058,
3.34896555680828, 10.7035308826005, 9.70681187241594, 5.94983366167778,
10.6982700353058, 8.20110325993877, 5.47815369644901, 2.29662297346932,
5.81311151243444, 5.86180122796237, 3.29303425198002, 12.6395572883144,
9.19196868231753, 6.9857222145889, 11.3394421438701, 11.047885820206,
10.3115117570383, 6.52485884934803, 4.32370917450194, 4.76700344993779,
2.66265303340764, 9.34482253280724, 8.82734048097441, 12.4377567085049,
13.4038470998239, 6.85719530528947, 13.0154083911972, 13.9397814449291,
6.42288194378652, 11.5360288507433, 4.05413083475339, 3.52317486527632,
5.33977231099782, 2.62500006449549, 6.07684988212213, 9.66904514889885,
4.74849246345088, 4.30533116448019, 7.45883561741118, 6.83800770166353,
13.7860890292185, 8.95913024599384, 4.22882908951701, 11.9366229237847
), AIR2002 = c(5.5421209915732, 8.82941941950074, 13.9415065737988,
4.05088145747734, 0.634888762937393, 8.94379655820853, 14.6699177745578,
2.92273129951465, 3.32000594967115, 9.6626486005201, 7.67113231661101,
5.73763535333215, 6.66896355409967, 4.76775433524861, 11.7307171144853,
11.9204512965262, 4.30270969435317, 6.07337335666409, 0.895004365468398,
7.92298369263555, 0.373079233546509, 11.6968179701357, 2.86150118232053,
2.24216720518936, 11.7806082410973, 9.40187198882131, 12.0526066867323,
11.9956986808735, 3.58023497230932, 8.58664103748742, 7.57416980166524,
4.31572127419198, 4.87601018757676, 1.42685255046957, 0.610385331556434,
4.55194799008779, 14.0502073411413, 9.80004817027273, 8.61341614048323,
0.900034241871675, 1.41049400813086, 1.55246704871324, 4.72272099354747,
0.316862174819224, 2.93053907830617, 8.05078771834052, 5.28952837263816,
10.1443750119205, 9.46636938612279, 7.2068673761792), AIR2003 = c(11.6987492112564,
8.03123494261876, 14.6267426478136, 10.1608755875826, 12.2925055704629,
4.41903535030968, 1.32192490517627, 14.9171937333997, 5.989500815548,
1.33018757369556, 11.5268041675435, 10.0354422837207, 12.2193238575859,
8.34460034461785, 3.19971360721812, 4.72411370138545, 8.99170322034135,
8.84431456421316, 9.19228873900324, 11.6073826548178, 5.04809568638727,
2.44113702619448, 1.3020562816998, 6.58069895481877, 12.9026523987483,
7.16368647065572, 8.66070749361161, 6.00576582145039, 2.10032442033477,
6.07895397265628, 9.84188887039386, 8.72011480981018, 8.3973909978196,
2.93135377582256, 2.07479393241927, 4.33787543469854, 11.5524645352829,
6.32281940148678, 13.4578647589665, 8.25475389447622, 6.92853975148313,
13.9737096681725, 3.20975688825082, 5.83681199098192, 14.7013523747697,
12.8483105113003, 14.7442567509897, 11.3872388804331, 11.9289726106934,
11.3432943127351), AIR2004 = c(12.3298210280007, 13.1959769331908,
12.370661520134, 15.0868251372818, 8.94963014865853, 12.2191067473528,
5.22362188581377, 0.720411225163843, 12.4820522407312, 0.351069664391922,
1.30037323312717, 4.37076402375568, 3.62288978581387, 8.48409274117183,
0.679597671930213, 7.34746075248183, 13.1647057766567, 11.1547359531655,
4.96984202732425, 8.96129352980759, 9.53370854761545, 5.8134242443291,
8.71016911426559, 11.8883539470122, 12.2352502209791, 4.68252076998213,
5.95696850650571, 13.8056899585128, 7.50516538352217, 5.02250328123337,
13.487732587669, 14.5949376791599, 14.9951364343211, 6.59220350365457,
9.48408624811447, 1.41794018899766, 14.1069514028479, 9.36621385829034,
13.7439532769257, 10.4824247457942, 14.7214399003615, 3.88337102833367,
6.97264864603989, 8.02159295712691, 12.6082874644615, 13.1953726709504,
13.6941493970903, 8.22842252415093, 2.50750512640784, 3.58666553804674
), AIR2005 = c(14.1543983449384, 0.207154301401461, 10.3945003528385,
1.55163006050745, 11.7088354445258, 13.8307717176049, 1.60947553726472,
12.4807727501071, 11.4753113418682, 12.1466166828242, 4.28278952114563,
15.0142466179228, 10.775123869299, 8.21507695954968, 10.2687020091594,
1.73548990486166, 5.87357538677612, 0.838787299717544, 2.09888134363526,
1.10793127552466, 12.8584094551371, 5.82341710817441, 7.35206087073311,
2.75513321271562, 2.28206509921863, 14.7161662780128, 1.92734397216351,
8.52569639326329, 0.435341671429342, 14.8357928042437, 11.0187966747552,
6.7161015153341, 0.729511158074718, 3.54536445078091, 0.25745916891587,
13.7526868569555, 12.4909417931621, 2.44651768055302, 4.29353768908698,
4.3296417530498, 0.332072943038773, 3.9897366906961, 7.34801499286247,
9.05279436282767, 7.81229433161113, 14.6314884596826, 13.0453833885309,
13.3968611858496, 7.20212454602541, 13.6024225526371), AIR2006 = c(14.3602117700654,
9.67784631192451, 3.55228421162465, 9.17853459700709, 3.97472074926249,
0.453478662259644, 13.5944658882257, 5.97668834996084, 5.33786357147782,
7.47689095921, 2.81935510349181, 3.96459207065962, 13.239876144056,
8.31305088601867, 4.70401683545741, 3.55927178151719, 0.198072984460741,
12.4664548987749, 3.95155436665518, 1.47646058510151, 1.15288747034175,
10.5564422610337, 11.799767498377, 9.69579476557509, 13.4778237678527,
0.578315307238372, 7.05817080976465, 12.4450234571598, 3.42532579906844,
8.00017632821924, 8.99522336013103, 10.3307407885047, 7.55966268972564,
2.25758463139716, 0.838791949320352, 14.5796161984191, 0.842886504037306,
12.3815860850157, 7.64115155479219, 8.93122855890589, 0.309247846194077,
9.11649027255387, 14.334240223086, 11.4479621604206, 0.662708246090217,
11.4913576022733, 7.95924178698263, 2.67515668715094, 0.879071030008839,
13.6501796660786), AIR2007 = c(9.97003772936854, 5.66991338106431,
13.1535815664255, 6.65941205831734, 12.9431154270126, 7.54975869239541,
2.68487146483106, 8.1954225972998, 0.989637708235998, 15.0157662873652,
14.2492329591238, 8.72830057256878, 10.6656589124098, 14.407073575431,
5.56657257661223, 12.0258530469052, 14.3806802332073, 6.06250679987832,
12.6181324996231, 13.7419910761765, 7.96911312058102, 4.95833557256381,
4.3284153526728, 4.37892194461683, 8.58931198042166, 15.086752575278,
6.84238949001231, 3.06172467645654, 13.959445785162, 4.88097198925563,
8.14034017345565, 14.5361199092346, 14.4077074720624, 8.95354569882294,
2.42978998625372, 3.71779510277393, 8.94922842727159, 14.5382596874342,
11.9046104310593, 3.08895008495753, 12.1694276854037, 12.2801624915064,
3.20018387928163, 1.09386935686017, 2.04407673706114, 5.70477642147499,
15.0334163082221, 6.06478327849694, 1.98094456569944, 1.69397633470688
), AIR2008 = c(9.30397066498711, 14.7486538500078, 5.58379651584826,
8.80245459234202, 13.487498242771, 4.53696579690417, 14.4067800776965,
6.79924817802059, 13.2960046531346, 14.6843470196975, 1.25236799454479,
5.97676394814788, 8.07777813938539, 8.63516177444276, 8.9589754464475,
15.0031446587539, 14.8031015943952, 2.49399273799104, 10.7569668015961,
4.28002286700346, 10.1122482336515, 5.05124714686349, 11.0521815955096,
0.343769731794717, 13.0853167199786, 12.1844864826184, 7.30493155200058,
8.15045584558858, 7.54400607235706, 2.49430188038922, 0.753580058788415,
13.3526555226133, 6.2869325775844, 8.3021644556059, 10.9850292504223,
2.18342414438305, 7.04020942923496, 11.017099474912, 8.27777373495162,
2.20133028454101, 8.49576657339185, 1.58340844516177, 2.63671668942692,
12.1220992218638, 1.75073969954345, 1.62113385793217, 14.099990950956,
6.91643273566617, 4.30560507998336, 3.77403995886794), AIR2009 = c(1.67957183947507,
7.36369897405664, 6.58358088548272, 2.38956139467959, 7.1751103291281,
6.10804192248383, 14.9931828636669, 1.0084126098291, 12.1872471913092,
5.01698368997965, 13.3619180300804, 12.8454065388709, 13.4218508905321,
14.4554111795065, 13.0900773408331, 12.3451018773937, 11.6958484717617,
7.24591008744668, 8.89023789232783, 11.9545650390091, 10.6377289210549,
1.51871868283534, 12.5126682352189, 5.65189430709695, 13.4598202915611,
14.3695526043761, 7.51904321175604, 6.24126743773976, 14.8900968077758,
8.72267893032101, 12.6833998839031, 2.2834426388687, 8.63599658885389,
9.25064845107519, 2.92453739979817, 10.8969492889207, 8.18858585808612,
1.62620084687509, 7.34673358480399, 12.0605738303089, 14.7037668900241,
12.6395342878401, 3.43311224376853, 3.07568217523932, 12.7473398134939,
12.403539704418, 11.9232473302584, 10.1782277375013, 4.20707396323467,
0.415611640287098), AIR2010 = c(3.43646944312239, 1.35975490884157,
0.983912079614354, 0.778802652686601, 12.3345025972561, 1.00613869713736,
6.00807040606486, 10.3557605237481, 0.571828635439742, 3.23014963916549,
1.50621501078014, 5.46978982399334, 10.487337793163, 2.86241729479888,
1.20100536868256, 9.26454035470425, 0.789336679036962, 1.52654933029041,
8.34533523020451, 11.7671594076965, 8.67667731308495, 11.9934830903618,
3.69538885993906, 9.75540565486182, 9.09214047452505, 1.95581501926063,
14.9897193048559, 14.5229993574421, 9.34056145941513, 1.41195704022213,
9.19928426172957, 11.7689157741664, 6.50435117529007, 3.27914538083738,
1.54789654616197, 10.6469015548979, 12.8618307883253, 8.4587018141062,
11.5180247743961, 2.29130000634049, 2.89561315982137, 2.7726630775854,
6.8413915040032, 13.9176417988634, 13.7705030146381, 0.52759597543464,
1.77941870173556, 5.53348983248253, 12.8605902424706, 3.84327436685492
), AIR2011 = c(7.77058110130485, 6.32489416138083, 9.22097524081613,
12.5675709558835, 8.91335506780934, 0.296842172110919, 14.3338035923715,
4.95561812325357, 1.608350189402, 9.59421682450897, 0.325252260173904,
4.3207606499726, 3.37387573508476, 7.52029055199213, 13.879214577225,
0.226798813565169, 14.1581023259959, 4.70900534654991, 14.7834656228917,
4.33595546064293, 9.77514042361919, 0.503774107655976, 7.74240870135883,
4.71506228298857, 14.9198252162652, 7.62993979265424, 10.4375572762149,
11.1040101385666, 3.69114950193861, 4.54456000300357, 8.70021347533702,
6.00627810265892, 0.358524227888556, 5.78310163486213, 6.06518945520674,
0.372255713034654, 1.26057865931257, 12.8861343483718, 13.7745616754233,
11.8614557243602, 5.64862275314773, 6.31275415731431, 8.65507386001036,
13.9926604995227, 0.355387645868585, 8.69261395941931, 13.1685344892796,
6.71539060721896, 6.13620514613669, 11.5668249191546), AIR2012 = c(6.27081028353097,
7.86677249507676, 0.58618860890693, 12.388112353907, 2.85958583392995,
7.95952853473043, 2.56079553408665, 10.8089659682196, 5.11167549905134,
11.0882686589232, 5.7707566303697, 11.3495220269873, 2.28377063603839,
0.545485086908331, 3.47129222193733, 6.00050472526765, 9.87287770145945,
4.24923844732647, 4.15209205533913, 14.5653932706246, 9.64327895222511,
5.11627035313752, 13.1698418143084, 0.588214227333898, 5.74336776575609,
3.65524465449248, 6.91148347930191, 9.29653071753797, 14.6478107533806,
9.5541726365569, 3.80763447697437, 2.96324152937718, 2.91963071328681,
6.08352127633477, 13.1309727081126, 10.8486344935496, 2.23741842607246,
3.65041686724569, 1.81148035473563, 0.905375592498109, 13.6041364369688,
10.7900264257432, 6.13362577195535, 6.43496705320152, 10.7790867917938,
0.281412984936498, 5.93390544060129, 4.08022078832751, 14.0153999755471,
5.40949646908627), AIR2013 = c(8.27712373652565, 9.04319920654944,
6.74612877325271, 13.8873930297748, 11.9153593588832, 4.61324733532476,
8.4878295564407, 5.77498647962371, 4.68663368575624, 9.53798435499915,
13.8239222256639, 12.4329902363573, 11.6120311678429, 10.0747855773047,
3.92196284314175, 5.33750072185858, 3.57540085327323, 9.85522755583306,
11.9822010349946, 3.45481815994019, 10.892755780485, 6.48081650614436,
3.70137969797198, 12.7409640677895, 10.6433459473609, 2.62831938764267,
14.5458863536206, 4.11003716498474, 3.62549759677309, 13.7243097189411,
14.4954679398946, 6.75792811234971, 4.24549357121275, 7.19747523671994,
10.0870223054055, 12.3274281337378, 12.2477434444681, 10.0755658776555,
3.12831819514418, 6.52412743031723, 8.59926303841453, 10.9694159899252,
6.06990504534845, 2.95378935882496, 1.91351455361326, 13.3501023647671,
6.91474291581591, 8.28943165152357, 10.4379813756945, 7.7918838394885
)), row.names = c(NA, -50L), class = c("tbl_df", "tbl", "data.frame"
))
Using the sample data, my formula in simple terms would be
*exp = x/9(t) + y/9(t-1) if Birth_month < 9 #else
exp = t
where x= Birth_month - 1; y = 9 - x; t = air quality value for the year of birth.*
Thank you

There could be a more elegant way, but I would break it in 3 parts:
Getting a table of cases with the years and months (numeric) of conception and birth.
Getting a tidy table of caseID-year pairs of pollution values.
Join 1 and 2 to get pollution values for the conception and birth years, and then run the arithmetic.
library(tidyverse)
df_cases <- df %>%
group_by(CaseID) %>%
select(Birth_month, Birth_Year) %>%
mutate(conception_year = if_else(Birth_month <= 9, true = Birth_Year - 1, false = Birth_Year),
months_in_birth_year = min(9, Birth_month),
months_in_conception_year = 9 - months_in_birth_year ) %>% ungroup()
df_airquality <-
df %>% select(CaseID, AIR1999:AIR2013) %>%
pivot_longer(names_to = "year", names_prefix = "AIR", values_to = "airquality", cols = !CaseID) %>% mutate(year = as.numeric(year))
df_exposure <-
df_cases %>%
left_join(df_airquality, by = c( "CaseID", "Birth_Year" = "year")) %>%
left_join(df_airquality, by = c( "CaseID", "conception_year" = "year")) %>%
mutate(utero_exposure = ((months_in_birth_year - 1)/9*airquality.x) +
((months_in_conception_year + 1)/9*airquality.y) ) %>%
select(CaseID, Birth_month,Birth_Year, airquality.x, airquality.y, utero_exposure)
df_exposure
# A tibble: 50 x 6
CaseID Birth_month Birth_Year airquality.x airquality.y utero_exposure
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 2000 11.5 8.97 8.97
2 2 5 2003 8.03 8.83 8.47
3 3 7 2010 0.984 6.58 2.85
4 4 3 2008 8.80 6.66 7.14
5 5 12 2006 3.97 3.97 3.97
6 6 12 2001 14.7 14.7 14.7
7 7 11 2012 2.56 2.56 2.56
8 8 8 2013 5.77 10.8 6.89
# ... with 42 more rows

Related

Randomization/ permutation test

I have a panel data that looks like this:
x <- structure(list(id2 = c(1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3,
4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8,
8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 10, 11, 11, 11), private = c(1,
1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
0, 0, 0, 0, 1, 1, 1), capex = c(-0.003423963, -0.028064674, -0.03058208,
-0.00186256, -0.010839419, 0.052905358, 0.058931317, 0.065547734,
0.007644231, -0.025942514, 0.00163772, -0.007530502, 0.010706151,
0.025040116, 0.035105374, 0.036772128, 0.03886272, 0.045399148,
0.042642809, 0.080788992, 0.080848917, 0.057645567, 0.057636742,
0.046084184, 0.041080192, 0.05690382, 0.057325598, 0.051791377,
0.070084445, 0.069627948, 0.077849329, 0.077247024, 0.081251919,
0.071702167, 0.078424804, 0.077482991, 0.078969546, 0.076208547,
0.059055354, 0.056043826, 0.029450929, 0.016044363, 0.048353843,
0.047607671, 0.046497576, 0.047454875, 0.050881654, 0.047155183,
0.055546004, 0.057564467), roa = c(-0.003078416, -0.035302367,
-0.01884984, 0.002839225, -0.001113289, 0.024291474, 0.040153231,
0.051482957, 0.026102768, 0.005372915, 0.004466314, -0.025178509,
-0.002043936, -0.069235161, 0.023604594, 0.010512878, 0.021912357,
0.016721437, -0.09472625, 0.04061316, 0.074661337, 0.0214584,
0.047743626, 0.013149141, -0.010418181, 0.025671346, 0.031785361,
0.084893651, 0.018490626, 0.024941774, 0.023567598, 0.031878859,
0.029931206, 0.043837443, 0.041305128, 0.041293647, 0.039307728,
0.046259467, 0.017479861, 0.029429797, 0.023826957, 0.00763736,
0.017485917, 0.017156925, 0.006504506, 0.021350464, 0.032917287,
0.036106978, 0.04545372, 0.049348988), year = c(2011, 2012, 2013,
2014, 2015, 2011, 2012, 2013, 2011, 2012, 2013, 2014, 2015, 2011,
2012, 2013, 2014, 2015, 2011, 2012, 2013, 2014, 2015, 2011, 2012,
2013, 2014, 2015, 2011, 2012, 2013, 2014, 2015, 2011, 2012, 2013,
2014, 2015, 2011, 2012, 2013, 2014, 2011, 2012, 2013, 2014, 2015,
2011, 2012, 2013)), row.names = c(NA, -50L), class = c("tbl_df",
"tbl", "data.frame"))
Where id2 is firm ID and private is an indicator for private/public status. My goal is to run a randomization test for r-squared as follows:
regress roa on capex for private firms (i.e. private==1) and public (private==0) separately and get the observed difference in R-squared
randomly assign firms to private-public status (note that the data is panel)
rerun the regression and get the difference in r-squared for the random sample
repeat this, say, 1000 times
measure the p-value as the number of times that the randomly generated difference in R2 is larger than the observed difference divided by the number of iterations (1,000)
My issue is that this code takes ages to run, it will be great if someone has an idea of a better way to do this.
you will need estimatr and tidyverse packages to run this code
library(estimatr)
library(tidyverse)
# run model 1
mod1 <-lm_robust(roa ~ capex,
cluster=id2,
se_type = "stata",
data=x,private==0)
# run model 2
mod2 <-lm_robust(roa ~ capex,
cluster=id2,
se_type = "stata",
data=x,private==1)
#obtain the observed difference in R2
R2.obs1 <- summary(mod1)$adj.r.squared
R2.obs2 <- summary(mod2)$adj.r.squared
diff_r2_obs <- R2.obs2 - R2.obs1
]
#create a list for the simulated differnce in r2
simulated_r2 <- list()
# prepare the loop
set.seed(8)
nreps = 1000
for(i in 1:nreps){
x1 <- x %>% # randamize the variable private taking into acount each id appears a number of times
distinct(id2, private) %>%
mutate(private1=sample(private), replace=T) %>%
left_join(x, by="id2")
model_m <-lm_robust(roa ~ capex,
cluster=id2,
se_type = "stata",
data = x1,
subset=private1==0)
R2.obs_m <- summary(model_m)$adj.r.squared
model_f <-lm_robust(roa ~ capex,
cluster=id2,
se_type = "stata",
data = x1,
subset=private1==1)
R2.obs_f <- summary(model_f)$adj.r.squared
r2_diff_sim <- R2.obs_f - R2.obs_m
simulated_r2[i] <- r2_diff_sim
}
simulated_r2 <- unlist(simulated_r2)
exceed_count <- length(simulated_r2[simulated_r2 >=
diff_r2_obs])
p_val <- exceed_count / nreps
p_val
I initialized simulated_r2 as a vector rather than a list. We benefit a lot when we predefine it's length=. Also I replaced the dplyr code with an transform approach which appears to be faster. Runs quite fast with 1000 reps on this data actually. (Note, that in your code there was a mistake when using sample, the replace= argument is outside the call but should be inside!)
nreps <- 1000
simulated_r2 <- numeric(length=nreps)
set.seed(8)
system.time(
for (i in seq_len(nreps)) {
x1 <- transform(subset(x, !duplicated(id2)),
private=sample(private, replace=T))
model_m <- estimatr::lm_robust(roa ~ capex, clusters=id2, se_type="stata",
data=x1, subset=private == 0)
R2.obs_m <- summary(model_m)$adj.r.squared
model_f <- estimatr::lm_robust(roa ~ capex, clusters=id2, se_type="stata",
data=x1, subset=private == 1)
R2.obs_f <- summary(model_f)$adj.r.squared
r2_diff_sim <- R2.obs_f - R2.obs_m
simulated_r2[i] <- r2_diff_sim
}
)
# user system elapsed
# 8.262 0.000 8.267
(p_val <- length(simulated_r2[simulated_r2 >= diff_r2_obs])/nreps)
# [1] 0.273
Note, that warnings occur if you set replace=TRUE, probably when the nobs of one of the subsets on 0/1 get to small. You should rethink your approach a little.
Parallelize
If this is too slow, you could parallelize the code.
library(parallel)
cl <- makeCluster(detectCores() - 1)
clusterExport(cl, 'x')
clusterSetRNGStream(cl, 8) ## set seed
nreps <- 10000
system.time(
simulated_r2 <- parSapplyLB(cl, seq_len(nreps), \(i) {
x1 <- transform(subset(x, !duplicated(id2)),
private=sample(private, replace=F))
model_m <- estimatr::lm_robust(roa ~ capex, clusters=id2, se_type="stata",
data=x1, subset=private == 0)
R2.obs_m <- summary(model_m)$adj.r.squared
model_f <- estimatr::lm_robust(roa ~ capex, clusters=id2, se_type="stata",
data=x1, subset=private == 1)
R2.obs_f <- summary(model_f)$adj.r.squared
R2.obs_f - R2.obs_m
})
)
# user system elapsed
# 0.067 0.058 17.820
(p_val <- length(simulated_r2[simulated_r2 >= diff_r2_obs])/nreps)
# [1] 0.257
stopCluster(cl)
This takes a different approach to jay.sf's nice solution. Here, I use data.table, create the random permutations of private/public status, use a small function to get difference(s) in r-sq:
library(data.table)
setDT(x)
set.seed(8)
nperms=1000
# get permutations of public/private status
perms = cbind(unique(x[,.(id2)]), unique(x[,.(id2,private)])[,lapply(1:nperms, function(p) sample(private, replace=F))])
# function to get R-sq differences
obsR <- function(r,c,id,p) {
r0 = lm_robust(r[p==0]~c[p==0],clusters=id[p==0],se_type="stata")$adj.r.squared
r1 = lm_robust(r[p==1]~c[p==1],clusters=id[p==1],se_type="stata")$adj.r.squared
r1-r0
}
# empirical r-sq diff
empirR = x[, obsR(roa,capex,id2,private)]
# simulated r-sq differences
simR = x[perms, on=.(id2)][, sapply(.SD, function(v) obsR(roa,capex,id2,v)), .SDcols = patterns("V")]
# pvalue
sum(simR>=empirR)/nperms
With the small provided dataset, this can do a thousand estimates and get the p-value in under 5 seconds on my machine.
user system elapsed
4.56 0.14 4.70

How do I get the aggregate number of a variable against another variable in R? Both these variables are non-numeric

I have this dataset, and I am trying to create a new variable (n_commitments) that will give me an aggregate number of paragraphs per country. I know this is super basic but I have somehow been stuck for an hour now. I think it is something to do with the fact that both variables are character classes and I want a numeric as an output.
Please help so I can finally move on. Thank you.
structure(list(country = c("Afghanistan", "Afghanistan"), paragraphs = c("The representative of Afghanistan confirmed that his Government would ensure the transparency of its ongoing privatization programme. He stated that his Government would provide reports to WTO Members on developments in its privatisation programme, periodically and upon request, as long as the programme would be in existence, and along the lines of the information already provided to the Working Party during the accession process. The Working Party took note of this commitment. ",
"The representative of Afghanistan confirmed that from the date of accession, State-trading enterprises (including State-owned and State-controlled enterprises, enterprises with special or exclusive privileges, and unitary enterprises) in Afghanistan would make any purchases or sales, which were not for the Government's own use or consumption, solely in accordance with commercial considerations, including price, quality, availability, marketability, transportation and other conditions of purchase or sale. He further confirmed that these State trading enterprises would afford the enterprises of other Members adequate opportunity, in accordance with customary business practice, to compete for participation in purchases from or sales to Afghanistan's State enterprises. The Working Party took note of these commitments. "
)), row.names = 1:2, class = "data.frame")
Columns: 8
$ country <chr> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanis…
$ category <chr> "State Ownership and Privatization; State-Trading Entities", "State Ownership and Pr…
$ paragraphs <chr> "The representative of Afghanistan confirmed that his Government would ensure the tr…
$ year_complete <int> 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, …
$ year_start <int> 2003, 2003, 2003, 2003, 2003, 2003, 2003, 2003, 2003, 2003, 2003, 2003, 2003, 2003, …
$ accession_duration <int> 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, …
$ wto <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ n_commitments <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
Here's how to count the unique paragraphs by country:
df %>%
group_by(country) %>%
summarize(n_unique_paragraphs = n_distinct(paragraphs))
If, as you say, "each row of the data is a unique paragraph", then we can simplify and just count rows:
df %>% group_by(country) %>%
summarize(n = n())
There's also built-in utility function for this:
df %>% count(country)

mapping over lists with `ends_with` to apply a custom error function

I have a list which looks like:
I am trying to map over it and use the mutate function to apply a custom function. The list is called results and I want to compute an error between the preds and another column in the data frame. The common theme of that column in all the lists is the 1 at the very end of one of the columns.
How can I compute my custom function using contain, ends_with or something similar? The column preds is the same in all data frames.
rse <- function(x, y){
sqrt((x - y)**2)
}
x <- map(results, ~mutate(
error = rse(ends_with("1"), preds)
))
Data:
list(`c(5, 19)` = structure(list(date = structure(c(16801, 16802,
16803, 16804, 16805, 16806), class = "Date"), year = c(2016,
2016, 2016, 2016, 2016, 2016), c_farolillo = c(17, 9, 8, 3, 4,
4), plaza_eliptica = c(25, 29, 18, 11, 13, 9), c_farolillo1 = c(17,
9, 8, 3, 4, 4), preds = c(7.08282661437988, 9.66606140136719,
5.95918273925781, 3.81649804115295, 4.26900291442871, 3.38829565048218
)), row.names = c(NA, 6L), class = "data.frame"), `c(7, 1, 2, 18)` = structure(list(
date = structure(c(16801, 16802, 16803, 16804, 16805, 16806
), class = "Date"), year = c(2016, 2016, 2016, 2016, 2016,
2016), pza_del_carmen = c(12, 10, 10, 6, 8, 4), pza_de_espana = c(28,
21, 14, 8, 10, 6), escuelas_aguirre = c(17, 24, 19, 20, 22,
16), retiro = c(6, 5, 7, 3, 2, 2), pza_del_carmen1 = c(12,
10, 10, 6, 8, 4), preds = c(15.3020477294922, 16.007848739624,
15.3953952789307, 9.59985256195068, 9.85349082946777, 8.42792892456055
)), row.names = c(NA, 6L), class = "data.frame"))
We loop over the list of data.frames ('results') with map, then use mutate_at to modify the columns with names that ends_with "1" by applying rse function while speciying the 'y' as 'preds' column
library(dplyr)
library(purrr)
results <- map(results, ~ .x %>%
mutate_at(vars(ends_with("1")), list(new = ~ rse(., y = preds))))

Negative binomial regression of month trend

I read a paper about negative binomial regression:"We modelled the number of Ecoli bloodstream infections and E coli UTIs per month using negative-binomial regression (incorporating overdispersion), assuming the same underlying population(no offset)." The figure as the followings
I also have a set of data, want to figure the infection like the figure with month/year, how can I do that? thank you very much
df <- structure(list(Year = c(2013, 2013, 2013, 2013, 2013, 2013, 2013,
2013, 2013, 2013, 2013, 2013, 2014, 2014, 2014, 2014, 2014, 2014,
2014, 2014, 2014, 2014, 2014, 2014, 2015, 2015, 2015, 2015, 2015,
2015, 2015, 2015, 2015, 2015, 2015, 2015), Month = c(1, 2, 3,
4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12), Incidence = c(2.25538216197745,
3.49502862307924, 2.76311704439615, 2.9836483329794, 3.09375,
3.0368028900429, 3.82920688208141, 3.9154960734432, 3.33517393705135,
3.54593329432417, 3.27586206896552, 3.25655281969817, 3.35912052117264,
3.21672101986362, 2.78237182605312, 2.58435732397113, 2.72516428295323,
3.1227603153476, 2.6300688599847, 2.66324718879463, 2.62653374233129,
2.45256358498183, 2.39520958083832, 3.58683926645092, 3.41995942421022,
3.61001317523057, 2.62718158187895, 2.86944045911047, 2.77978993118435,
2.89282762420792, 2.69410829432029, 3.22232223222322, 3.39818882811799,
3.36725958337297, 2.90030211480363, 3.20789124668435), Inpatient = c(8779,
6638, 9663, 9418, 9600, 8858, 9532, 9041, 9055, 8545, 9280, 10072,
9824, 6746, 10279, 10254, 10348, 9767, 10456, 10138, 10432, 9908,
9853, 11124, 10351, 7590, 10772, 11152, 11044, 10889, 11321,
11110, 11153, 10513, 11585, 12064), infection = c(198, 232, 267,
281, 297, 269, 365, 354, 302, 303, 304, 328, 330, 217, 286, 265,
282, 305, 275, 270, 274, 243, 236, 399, 354, 274, 283, 320, 307,
315, 305, 358, 379, 354, 336, 387)), row.names = c(NA, -36L), class = c("tbl_df",
"tbl", "data.frame"))
reference:
Vihta K D, Stoesser N, Llewelyn M J, et al. Trends over time in Escherichia coli bloodstream infections, urinary tract infections, and antibiotic susceptibilities in Oxfordshire, UK, 1998–2016: a study of electronic health records[J]. The Lancet Infectious Diseases, 2018, 18(10): 1138-1149.
Using the data above, one can do the following:
library(MASS) # for function glm.nb
library(ggplot2)
library(broom) # for tidy model outputs
Create a date, to make plotting easy
df$t <- as.Date(paste("01", df$Month, df$Year, sep = "-"), format = "%d-%m-%Y")
Plot the data. geom_smooth adds the trend line and confidence intervals, using the date as the predictor.
p <- ggplot(data = df, aes(x = t, y = infection)) +
geom_point() +
geom_smooth(method = "glm.nb")
p
To perform the regression, set the count of infections as the dependent variable and the nth month as the independent variable, below month_as_integer.
df$month_as_integer <- seq_along(df$Month)
m1 <- glm.nb(infection ~ month_as_integer, data = df)
using tidy from the broom package, one can get the estimate and confidence intervals as a data frame.
out1 <- as.data.frame(tidy(m1, exponentiate = TRUE, conf.int = TRUE) )
out1
term estimate std.error statistic p.value conf.low conf.high
1 (Intercept) 264.44399 0.048006493 116.184897 0.000000000 240.943378 290.556355
2 month_as_integer 1.00697 0.002250993 3.085763 0.002030303 1.002569 1.011394

forecast model giving odd MAPE values, can some one please tell me if this is correct?

I ran this script as part of a forecasting project for school, but I got some odd results especially with the MAPE values. What it's supposed to do is predict international terrorism incident for the next 12 months. Can anyone tell me if this report is accurate or if I missed something? I tried to include the graphs, but I don't think they can be posted here.
Thanks
library(ggplot2)
library(forecast)
library(tseries)
library(reprex)
terror <- tibble::tribble(
~imonth, ~iyear, ~monthly,
1, 2015, 1534,
2, 2015, 1295,
3, 2015, 1183,
4, 2015, 1277,
5, 2015, 1316,
6, 2015, 1168,
7, 2015, 1263,
8, 2015, 1290,
9, 2015, 1107,
10, 2015, 1269,
11, 2015, 1172,
12, 2015, 1091,
1, 2016, 1162,
2, 2016, 1153,
3, 2016, 1145,
4, 2016, 1120,
5, 2016, 1353,
6, 2016, 1156,
7, 2016, 1114,
8, 2016, 1162,
9, 2016, 1045,
10, 2016, 1140,
11, 2016, 1114,
12, 2016, 923,
1, 2017, 879,
2, 2017, 879,
3, 2017, 961,
4, 2017, 856,
5, 2017, 1081,
6, 2017, 1077,
7, 2017, 994,
8, 2017, 968,
9, 2017, 838,
10, 2017, 805,
11, 2017, 804,
12, 2017, 749
)
# aggregated data
terror_byMonth_Train = ts(data = terror$monthly,
start = c(2015,1),
end = c(2016,12),
frequency=12)
terror_byMonth_Test = ts(data = terror$monthly,
start = c(2017,1),
end = c(2017,12),
frequency=12)
# arima instead of exp smooth
m_arima <- auto.arima(terror_byMonth_Train)
#> Warning in value[[3L]](cond): The chosen test encountered an error, so no
#> seasonal differencing is selected. Check the time series data.
# fit exp smooth model
m_ets = ets(terror_byMonth_Train)
# Get length of terror_byMonth_Test set
size <- length(terror_byMonth_Test)
# forecast for 2017 using multiple forecast (Davis Style)
f_arima_multi <- m_arima %>%
forecast(h = size)
f_arima_multi %>%
autoplot()
# forecast ARIMA 2017 (Orininal Style)
f_arima<-forecast(m_arima,h=12)
f_arima %>%
autoplot()
# forecast ETS 2017
f_ets = forecast(m_ets, h=12)
f_ets %>%
autoplot()
# check accuracy ETS
acc_ets <- accuracy(m_ets)
#check accuracy ARIMA, between train and test sets
acc_arima_TrainVSTest <- accuracy(f_arima_multi, x = terror_byMonth_Test)
# check accuarcy ARIMA
acc_arima <- accuracy(f_arima)
# MAPE(ETS)= 20.03 < MAPE(ARIMA) = 22.05
# ETS model chosen
# Compare to 2017 data
accuracy(f_ets, terror_byMonth_Test)
#> ME RMSE MAE MPE MAPE MASE
#> Training set -14.30982 90.08823 70.06438 -1.606862 5.900178 0.5790445
#> Test set 303.53575 316.03133 303.53575 23.986363 23.986363 2.5085599
#> ACF1 Theil's U
#> Training set 0.0008690031 NA
#> Test set -0.2148651254 2.356116
Created on 2019-02-13 by the reprex package (v0.2.1)
The issue is in how you defined terror_byMonth_Test. It should be, e.g.,
terror_byMonth_Test <- ts(data = tail(terror$monthly, 12),
start = c(2017, 1),
end = c(2017, 12),
frequency = 12)
That is, simply providing start and end dates isn't enough for ts to know which 12 observations out of 24 in terror$monthly to take. This reduces MAPE to 10.4%.

Resources