Error bars on double Y-axis graph--ggplot2 - r

I am trying to add error bars to my double y-axis graph, but when I run the code, the error bars completely ruin the graph. I attached a picture below and also added my code.
If you need the full data set, let me know! Thank you so much in advance!
# Ratio of the largest PRCP value to the largest Temp_C value; temperature is
# multiplied by this factor so it can be drawn against the precipitation axis.
scalefactor <- max(Complete_Seasonality_Data$PRCP)/max(Complete_Seasonality_Data$Temp_C)
p <- ggplot(Complete_Seasonality_Data, aes(x = NewMonths5))
# Precipitation series, drawn in primary-axis units.
p <- p + geom_point(aes(y = PRCP, colour = "Precipitation"))
p <- p + geom_line(aes(y = PRCP, colour = "Precipitation", group=1))
# Temperature series, multiplied by scalefactor before plotting.
p <- p + geom_point(aes(y = Temp_C*scalefactor, colour = "Temperature"))
p <- p + geom_line(aes(y = Temp_C*scalefactor, colour = "Temperature", group=1))
# The secondary axis divides by scalefactor, undoing the scaling for its labels.
p <- p + scale_y_continuous(sec.axis = sec_axis(~./scalefactor, name = ylabseasonality))
p <- p + scale_colour_manual(values = c("blue", "red"))
p <- p + labs(y = "Precipitation (in)",
x = "Month",
colour = "Parameter")
p <- p + theme_bw()
p <- p + theme(axis.text.x = element_text(angle = 90), legend.position = c(.99, .01))
# NOTE(review): these limits are in unscaled Temp_C units, whereas the
# temperature points above are multiplied by scalefactor.
p <- p + geom_errorbar(aes(ymin = TempSummary$mean - StdErrorTemp, ymax = TempSummary$mean + StdErrorTemp), position=position_dodge(.9), width=0.2)
# NOTE(review): ymin is built from PrecipSummary$mean but ymax from
# TempSummary$mean.
p <- p + geom_errorbar(aes(ymin = PrecipSummary$mean - StdErrorPrecip, ymax = TempSummary$mean + StdErrorPrecip), position=position_dodge(.9), width=0.2)
p
How I computed the Std Errors
TempSummary<- Summarize(Temp_C~ Month,
data=Chara_Data,
digits=3)
View(TempSummary)
StdErrorTemp<- (TempSummary$sd)/ (sqrt(TempSummary$n))
View(StdErrorTemp)
PrecipSummary<- Summarize(PRCP ~ Group.1,
data=Complete_Seasonality_Data,
digits=3)
StdErrorPrecip<- (PrecipSummary$sd/ sqrt(PrecipSummary$n))
Complete data set!
structure(list(Group.1 = c("April", "August", "December", "February",
"January", "July", "June", "March", "May", "November", "October",
"September"), Season = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), Month = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), Year = c(2017.05882352941, 2016.6, 2016.6, 2017.6,
2017.6, 2016.6, 2017.05882352941, 2017, 2017.05882352941, 2016.6,
2016.6, 2016.6), Date = structure(c(1494315952.94118, 1490691600,
1500316560, 1506183120, 1504163520, 1487501280, 1499108611.76471,
1489840800, 1496798682.35294, 1498314240, 1496087280, 1493421840
), class = c("POSIXct", "POSIXt")), Site = c(8.17647058823529,
8.125, 7.775, 7.775, 6.375, 6.375, 8.20588235294118, 6.80555555555556,
6.55882352941176, 6.375, 8.1, 6.375), PercentCover = c(0.765882352941176,
0.7125, 0.7505, 0.7775, 0.8625, 0.867, 0.763529411764706, 0.83,
0.850588235294118, 0.848, 0.7065, 0.834), AveHt = c(60.1684438927086,
50.2311192279942, 58.9048701298701, 57.3448097041847, 55.2253291847042,
64.6965656565657, 57.9602622867329, 56.672138047138, 64.4076426024955,
57.1465322871573, 54.3781565656566, 58.3185831529582), SE = c(7.07246013321596,
7.79305525403115, 7.00224498332823, 6.46671176266333, 6.32495719718401,
7.04611575726224, 8.09695750051648, 5.65899377193264, 7.28959135811987,
6.24571692582705, 7.32819802238581, 7.05669314452393), MaxHt = c(88.3823529411765,
81.625, 87.75, 85, 85.875, 96.425, 92.9117647058823, 82.5, 98.6764705882353,
88.125, 79.75, 89.65), green = c(0.350962665193537, 0.278211058736042,
0.183934291894458, 0.197711422851132, 0.179043270311077, 0.335751664926552,
0.186533536107468, 0.256634190010066, 0.319397625619223, 0.204519948331115,
0.249063275007846, 0.277894684744482), yellow = c(0.556643767952726,
0.569690303836593, 0.686152813243381, 0.654331042886853, 0.594548585049017,
0.554485584960289, 0.581008683220038, 0.609988063809375, 0.594827659217835,
0.620510694031593, 0.633793562346056, 0.600527348262596), brown = c(0.0923935668537371,
0.14983619398845, 0.122185622134889, 0.145933312808728, 0.226114026992848,
0.10976275011316, 0.229212761734686, 0.132653108499399, 0.0857747151629417,
0.174675239990233, 0.114398064606882, 0.121577966992922), Temp = c(78.4411764705882,
82.975, 75.65, 74.75, 74.3, 82.2051282051282, 81.0882352941177,
75.8333333333333, 79.8823529411765, 78.6, 80.1944444444444, 83
), Temp_C = c(25.8006535947712, 28.3194444444444, 24.25, 23.75,
23.5, 27.8917378917379, 27.2712418300654, 24.3518518518519, 26.6013071895425,
25.8888888888889, 26.7746913580247, 28.3333333333333), Vis = c(1.98823529411765,
2.12820512820513, 2.2125, 2.07, 2.1625, 2.07179487179487, 2.05,
2.02777777777778, 2.11764705882353, 2.205, 2.11, 2.17375), Nests = c(12.4117647058824,
17.1, 7.1, 6.275, 4, 8.9, 13.8787878787879, 4.88888888888889,
7.38235294117647, 2.8, 13.025, 5.6), SickorDeadFish = c(0.0882352941176471,
0.2, 0.175, 0.075, 0.05, 0.117647058823529, 0.0882352941176471,
0.166666666666667, 0.0294117647058824, 0.25, 0.333333333333333,
0.275), Cladophora = c(0.0866666666666667, 0.0492857142857143,
0.0471428571428571, 0.0907142857142857, 0.0264285714285714, 0.0154545454545455,
0.0380952380952381, 0.0295238095238095, 0.0161904761904762, 0.0178571428571429,
0.0407142857142857, 0.03), Comments = c(NaN, NaN, NaN, NaN, NaN,
NaN, NaN, NaN, NaN, NaN, NaN, NaN), STATION = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), NAME = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_), DATE = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), MONTH = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), AWND = c(6.52626966292135, 5.97866090712743, 5.85811926605505,
6.31656097560976, 6.181, 6.1103908045977, 6.23947727272727, 6.5154211663067,
6.0985313174946, 5.64997635933806, 5.43263157894737, 5.54940639269406
), FMTM = c(1412.13333333333, 1431.1935483871, 1411.77419354839,
1535.16666666667, 1339.24137931034, 1439.77419354839, 1378.3,
1398.8064516129, 1353.12903225806, 1362.96666666667, 1408.45161290323,
1381.46666666667), PGTM = c(1394.1095890411, 1394.96774193548,
1306.83333333333, 1412.0511627907, 1327.90350877193, 1435.51769911504,
1372.37674418605, 1389.12328767123, 1376.75576036866, 1373.45341614907,
1346.2774566474, 1396), PRCP = c(0.0205869074492099, 0.0248701298701299,
0.0663425925925926, 0.0481472684085511, 0.0360991379310345, 0.0101144164759725,
0.00790067720090293, 0.0762693156732892, 0.0298491379310345,
0.0472985781990521, 0.034965034965035, 0.0243778801843318), SNOW = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), SNWD = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0), TAVG = c(78.5333333333333, NaN, NaN, 61.1052631578947,
68.6333333333333, 80.2903225806452, 79.4, 72.5161290322581, 77.8709677419355,
NaN, NaN, NaN), TMAX = c(83.6826484018265, 88.8509719222462,
81.4940617577197, 80.6938271604938, 80.8072562358277, 88.1520737327189,
86.8795454545455, 81.3290043290043, 84.6048034934498, 83.8289786223278,
86.3615560640732, 88.1009174311927), TMIN = c(67.5423340961098,
72.5917926565875, 66.4394299287411, 64.9283950617284, 64.5600907029478,
71.9654377880184, 70.6772727272727, 65.7597402597403, 68.6527472527472,
68.9643705463183, 70.558352402746, 71.7821100917431), TSUN = c(NaN,
NaN, NaN, 0, 0, NaN, NaN, NaN, NaN, NaN, NaN, NaN), WDF2 = c(115.538116591928,
100.905172413793, 133.577981651376, 143.965936739659, 149.438444924406,
91.141876430206, 99.5022624434389, 131.612903225806, 124.279569892473,
109.693396226415, 119.450800915332, 115.068493150685), WDF5 = c(107.545045045045,
97.6077586206897, 124.528735632184, 133.031784841076, 140.826086956522,
82.5229357798165, 90.972850678733, 120.634573304158, 115.714285714286,
103.720379146919, 109.266055045872, 104.736842105263), WSF2 = c(15.2026905829596,
14.8530172413793, 14.6919724770642, 15.4111922141119, 15.1332613390929,
14.9070938215103, 15.083257918552, 15.4161290322581, 14.8625806451613,
14.322641509434, 14.3432494279176, 14.5600456621005), WSF5 = c(22.1105855855856,
21.9961206896552, 20.8029885057471, 20.8081145584726, 20.4824675324675,
22.4052752293578, 22.2158371040724, 21.9317286652079, 21.130303030303,
20.8722748815166, 20.493119266055, 21.0052511415525), WT01 = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), WT02 = c(NaN, 1, NaN, 1, 1,
NaN, NaN, 1, 1, NaN, 1, NaN), WT08 = c(1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1), WT10 = c(NaN, NaN, NaN, NaN, NaN, NaN, 1, NaN, NaN,
NaN, NaN, NaN), NewMonths2 = structure(c(17295, 17253, 17364,
17432, 17409, 17216, 17350, 17243, 17324, 17341, 17315, 17284
), class = "Date")), row.names = c(NA, -12L), class = "data.frame")
**Edited to add complete data set and how I did std error
Temp Summary
structure(list(Month = c("April", "August", "December", "February",
"January", "July", "June", "March", "May", "November", "October",
"September"), n = c(34, 40, 40, 40, 40, 40, 34, 36, 34, 40, 40,
40), nvalid = c(34, 40, 40, 40, 40, 39, 34, 36, 34, 40, 36, 40
), mean = c(25.801, 28.319, 24.25, 23.75, 23.5, 27.892, 27.271,
24.352, 26.601, 25.889, 26.775, 28.333), sd = c(0.478, 0.978,
0.921, 0.793, 0.551, 0.463, 0.632, 1.47, 0.905, 0.763, 0.928,
0.534), min = c(25, 26.667, 22.778, 21.667, 21.667, 27.222, 26.111,
22.778, 25, 25, 25.556, 27.222), Q1 = c(25.556, 27.778, 23.889,
23.333, 23.333, 27.778, 27.222, 23.333, 26.111, 25.556, 25.556,
27.778), median = c(25.556, 27.778, 23.889, 23.889, 23.333, 27.778,
27.222, 23.889, 26.667, 25.556, 27.222, 28.333), Q3 = c(25.972,
28.889, 25, 24.444, 23.889, 28.333, 27.639, 24.583, 27.222, 26.111,
27.361, 28.889), max = c(26.667, 30, 25.556, 25, 24.444, 28.889,
28.889, 27.222, 27.778, 27.778, 28.333, 29.444)), class = "data.frame", row.names = c(NA,
-12L))
Precip Summary
structure(list(MONTH = c("April", "August", "December", "February",
"January", "July", "June", "March", "May", "November", "October",
"September"), n = c(446, 464, 436, 422, 465, 437, 444, 465, 465,
424, 438, 439), nvalid = c(443, 462, 432, 421, 464, 437, 443,
453, 464, 422, 429, 434), mean = c(0.021, 0.025, 0.066, 0.048,
0.036, 0.01, 0.008, 0.076, 0.03, 0.047, 0.035, 0.024), sd = c(0.094,
0.184, 0.342, 0.211, 0.142, 0.047, 0.047, 0.343, 0.14, 0.24,
0.243, 0.112), min = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Q1 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), median = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0), Q3 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
max = c(1.1, 3.06, 4.94, 2.61, 1.5, 0.47, 0.76, 3.32, 1.43,
3.29, 3.64, 1.25), percZero = c(81.264, 87.662, 76.389, 75.534,
77.802, 86.728, 86.682, 75.717, 84.267, 77.962, 83.916, 81.797
)), class = "data.frame", row.names = c(NA, -12L))
Temp Summary Results
enter image description here
Precip Summary Results
enter image description here

I would suggest the following approach. Just be careful with the values of your error bars. The scaling factor must also be applied to the error bars of the rescaled series; that is why you got a messy plot. Here is the code using the data you added:
library(ggplot2)

# Create a Date-class x variable from the POSIXct `Date` column.
Complete_Seasonality_Data$NewMonths5 <- as.Date(Complete_Seasonality_Data$Date)

# Standard error of the mean per month: sd / sqrt(n).
StdErrorTemp <- TempSummary$sd / sqrt(TempSummary$n)
StdErrorPrecip <- PrecipSummary$sd / sqrt(PrecipSummary$n)

# Scale factor that maps temperature onto the precipitation (primary) axis.
scalefactor <- max(Complete_Seasonality_Data$PRCP) /
  max(Complete_Seasonality_Data$Temp_C)

# Error-bar limits, expressed in primary-axis units.
# Precipitation is already on the primary axis, so no scaling is needed.
precip_lower <- PrecipSummary$mean - StdErrorPrecip
precip_upper <- PrecipSummary$mean + StdErrorPrecip
# Temperature is drawn as Temp_C * scalefactor, so the whole interval
# (mean AND standard error) has to be multiplied by scalefactor. Scaling only
# the mean leaves the bar length in degrees C on an axis measured in inches.
temp_lower <- (TempSummary$mean - StdErrorTemp) * scalefactor
temp_upper <- (TempSummary$mean + StdErrorTemp) * scalefactor

# NOTE(review): the limits are matched to rows by position, which assumes
# TempSummary and PrecipSummary list the months in the same order as
# Complete_Seasonality_Data -- confirm before reusing with other data.
p <- ggplot(Complete_Seasonality_Data, aes(x = NewMonths5)) +
  geom_point(aes(y = PRCP, colour = "Precipitation")) +
  geom_line(aes(y = PRCP, colour = "Precipitation", group = 1)) +
  geom_errorbar(
    aes(ymin = precip_lower, ymax = precip_upper),
    position = position_dodge(.9), width = 0.2
  ) +
  geom_point(aes(y = Temp_C * scalefactor, colour = "Temperature")) +
  geom_line(aes(y = Temp_C * scalefactor, colour = "Temperature", group = 1)) +
  # The secondary axis undoes the scaling so its labels read in degrees C.
  scale_y_continuous(
    sec.axis = sec_axis(~ . / scalefactor, name = "Temperature")
  ) +
  geom_errorbar(
    aes(ymin = temp_lower, ymax = temp_upper),
    position = position_dodge(.9), width = 0.2
  ) +
  scale_colour_manual(values = c("blue", "red")) +
  labs(
    y = "Precipitation (in)",
    x = "Month",
    colour = "Parameter"
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90),
    legend.position = c(.99, .01)
  )
p
Output:

Related

How to estimate a population proportion that has a certain disease

I have this data (listed as reproducible):
structure(list(age = c(62.84998, 60.33899, 52.74698, 42.38498
), death = c(0, 1, 1, 1), sex = c("male", "female", "female",
"female"), hospdead = c(0, 1, 0, 0), slos = c(5, 4, 17, 3), d.time = c(2029,
4, 47, 133), dzgroup = c("Lung Cancer", "Cirrhosis", "Cirrhosis",
"Lung Cancer"), dzclass = c("Cancer", "COPD/CHF/Cirrhosis", "COPD/CHF/Cirrhosis",
"Cancer"), num.co = c(0, 2, 2, 2), edu = c(11, 12, 12, 11), income = c("$11-$25k",
"$11-$25k", "under $11k", "under $11k"), scoma = c(0, 44, 0,
0), charges = c(9715, 34496, 41094, 3075), totcst = c(NA_real_,
NA_real_, NA_real_, NA_real_), totmcst = c(NA_real_, NA_real_,
NA_real_, NA_real_), avtisst = c(7, 29, 13, 7), race = c("other",
"white", "white", "white"), sps = c(33.8984375, 52.6953125, 20.5,
20.0976562), aps = c(20, 74, 45, 19), surv2m = c(0.262939453,
0.0009999275, 0.790893555, 0.698974609), surv6m = c(0.0369949341,
0, 0.664916992, 0.411987305), hday = c(1, 3, 4, 1), diabetes = c(0,
0, 0, 0), dementia = c(0, 0, 0, 0), ca = c("metastatic", "no",
"no", "metastatic"), prg2m = c(0.5, 0, 0.75, 0.899999619), prg6m = c(0.25,
0, 0.5, 0.5), dnr = c("no dnr", NA, "no dnr", "no dnr"), dnrday = c(5,
NA, 17, 3), meanbp = c(97, 43, 70, 75), wblc = c(6, 17.0976562,
8.5, 9.09960938), hrt = c(69, 112, 88, 88), resp = c(22, 34,
28, 32), temp = c(36, 34.59375, 37.39844, 35), pafi = c(388,
98, 231.65625, NA), alb = c(1.7998047, NA, NA, NA), bili = c(0.19998169,
NA, 2.19970703, NA), crea = c(1.19995117, 5.5, 2, 0.79992676),
sod = c(141, 132, 134, 139), ph = c(7.459961, 7.25, 7.459961,
NA), glucose = c(NA_real_, NA_real_, NA_real_, NA_real_),
bun = c(NA_real_, NA_real_, NA_real_, NA_real_), urine = c(NA_real_,
NA_real_, NA_real_, NA_real_), adlp = c(7, NA, 1, 0), adls = c(7,
1, 0, 0), sfdm2 = c(NA, "<2 mo. follow-up", "<2 mo. follow-up",
"no(M2 and SIP pres)"), adlsc = c(7, 1, 0, 0)), row.names = c(NA,
4L), class = "data.frame")
I am wanting to estimate the population proportion of individuals who had lung cancer listed as their primary disease group (dzgroup). How would I do this? My original thought was to just divide the total number that have lung cancer by the whole dataset population, but I do not believe this is correct.
If we want to get the proportion on the whole data, create a logical vector and get the mean as TRUE -> 1 and FALSE -> 0, the mean will be the proportion of 1s and multiplying by 100 gives the percentage
# Percentage of rows whose dzgroup equals "Lung Cancer": the comparison yields
# a logical vector, mean() gives the share of TRUE values (NA comparisons are
# dropped by na.rm = TRUE), and the result is rounded to 2 decimals.
round(100 * mean(df1$dzgroup == "Lung Cancer", na.rm = TRUE), 2)

How to calculate a proportion in R

I have this reproducible DataFrame:
structure(list(age = c(62.84998, 60.33899, 52.74698, 42.38498,
79.88495, 93.01599, 62.37097, 86.83899, 85.65594, 42.25897),
death = c(0, 1, 1, 1, 0, 1, 1, 1, 1, 1), sex = c("male",
"female", "female", "female", "female", "male", "male", "male",
"male", "female"), hospdead = c(0, 1, 0, 0, 0, 1, 0, 0, 0,
0), slos = c(5, 4, 17, 3, 16, 4, 9, 7, 12, 8), d.time = c(2029,
4, 47, 133, 2029, 4, 659, 142, 63, 370), dzgroup = c("Lung Cancer",
"Cirrhosis", "Cirrhosis", "Lung Cancer", "ARF/MOSF w/Sepsis",
"Coma", "CHF", "CHF", "Lung Cancer", "Colon Cancer"), dzclass = c("Cancer",
"COPD/CHF/Cirrhosis", "COPD/CHF/Cirrhosis", "Cancer", "ARF/MOSF",
"Coma", "COPD/CHF/Cirrhosis", "COPD/CHF/Cirrhosis", "Cancer",
"Cancer"), num.co = c(0, 2, 2, 2, 1, 1, 1, 3, 2, 0), edu = c(11,
12, 12, 11, NA, 14, 14, NA, 12, 11), income = c("$11-$25k",
"$11-$25k", "under $11k", "under $11k", NA, NA, "$25-$50k",
NA, NA, "$25-$50k"), scoma = c(0, 44, 0, 0, 26, 55, 0, 26,
26, 0), charges = c(9715, 34496, 41094, 3075, 50127, 6884,
30460, 30460, NA, 9914), totcst = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), totmcst = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_
), avtisst = c(7, 29, 13, 7, 18.666656, 5, 8, 6.5, 8.5, 8
), race = c("other", "white", "white", "white", "white",
"white", "white", "white", "black", "hispanic"), sps = c(33.8984375,
52.6953125, 20.5, 20.0976562, 23.5, 19.3984375, 17.296875,
21.5976562, 15.8984375, 2.2998047), aps = c(20, 74, 45, 19,
30, 27, 46, 53, 17, 9), surv2m = c(0.262939453, 0.0009999275,
0.790893555, 0.698974609, 0.634887695, 0.284973145, 0.892944336,
0.670898438, 0.570922852, 0.952880859), surv6m = c(0.0369949341,
0, 0.664916992, 0.411987305, 0.532958984, 0.214996338, 0.820922852,
0.498962402, 0.24899292, 0.887939453), hday = c(1, 3, 4,
1, 3, 1, 1, 1, 1, 1), diabetes = c(0, 0, 0, 0, 0, 0, 0, 1,
0, 0), dementia = c(0, 0, 0, 0, 0, 0, 0, 0, 1, 0), ca = c("metastatic",
"no", "no", "metastatic", "no", "no", "no", "no", "metastatic",
"metastatic"), prg2m = c(0.5, 0, 0.75, 0.899999619, 0.899999619,
0, NA, 0.799999714, 0.049999982, NA), prg6m = c(0.25, 0,
0.5, 0.5, 0.8999996, 0, 0.6999998, 0.3999999, 0.0001249999,
NA), dnr = c("no dnr", NA, "no dnr", "no dnr", "no dnr",
"no dnr", "no dnr", "no dnr", "dnr after sadm", "no dnr"),
dnrday = c(5, NA, 17, 3, 16, 4, 9, 7, 2, 8), meanbp = c(97,
43, 70, 75, 59, 110, 78, 72, 97, 84), wblc = c(6, 17.0976562,
8.5, 9.09960938, 13.5, 10.3984375, 11.6992188, 13.5996094,
9.69921875, 11.2988281), hrt = c(69, 112, 88, 88, 112, 101,
120, 100, 56, 94), resp = c(22, 34, 28, 32, 20, 44, 28, 26,
20, 20), temp = c(36, 34.59375, 37.39844, 35, 37.89844, 38.39844,
37.39844, 37.59375, 36.59375, 38.19531), pafi = c(388, 98,
231.65625, NA, 173.3125, 266.625, 309.5, 404.75, 357.125,
NA), alb = c(1.7998047, NA, NA, NA, NA, NA, 4.7998047, NA,
NA, 4.6992188), bili = c(0.19998169, NA, 2.19970703, NA,
NA, NA, 0.39996338, NA, 0.39996338, 0.19998169), crea = c(1.19995117,
5.5, 2, 0.79992676, 0.79992676, 0.69995117, 1.59985352, 2,
1, 0.79992676), sod = c(141, 132, 134, 139, 143, 140, 132,
139, 143, 139), ph = c(7.459961, 7.25, 7.459961, NA, 7.509766,
7.65918, 7.479492, 7.509766, 7.449219, NA), glucose = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), bun = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_), urine = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), adlp = c(7, NA, 1, 0, NA, NA, 0, NA, NA, 0), adls = c(7,
1, 0, 0, 2, 1, 1, 0, 7, NA), sfdm2 = c(NA, "<2 mo. follow-up",
"<2 mo. follow-up", "no(M2 and SIP pres)", "no(M2 and SIP pres)",
"<2 mo. follow-up", "no(M2 and SIP pres)", NA, NA, NA), adlsc = c(7,
1, 0, 0, 2, 1, 1, 0, 7, 0.4947999)), row.names = c(NA, 10L
), class = "data.frame")
I am needing to calculate the proportion of patients who died in the hospital in patients with an active DNR order on day 3 and in patients without an active DNR order on day 3. To group which patients had an active DNR on day 3 and which did not, I used the subset function below:
SB_xlsx1 = SB_xlsx[!is.na(SB_xlsx$dnrday), ]
YesDNR = subset(SB_xlsx1, dnrday <= 3)
NoDNR = subset(SB_xlsx1, dnrday > 3)
However, I don't know how to calculate the proportion of patients that died in the hospital for those with a DNR and without a DNR. The 'hospdead' variable has all 0s and 1s, where 0 = not dead and 1 = dead. However, I don't know how to get the proportion that died for having a DNR at day 3 and did not have a DNR at day 3. What code could I use for my desired result. SB_xlsx also just represents my DataFrame name.
There's a few ways to do this but the simplest is probably via the aggregate function.
> aggregate( hospdead ~ (dnrday<=3) , SB_xlsx1 , mean)
dnrday <= 3 hospdead
1 FALSE 0.1428571
2 TRUE 0.0000000
You may use xtabs to sum the deaths by the condition dnrday <= 3, i.e. with an active DNR on day 3, and proportions to convert those sums into each group's share of all deaths.
(res <- proportions(xtabs(death ~ dnrday <= 3, SB_xlsx)))
# dnrday <= 3
# FALSE TRUE
# 0.7142857 0.2857143
where
sum(res)
# [1] 1
EDIT: I apologize; I misread your post when providing my original answer. I've revised it below.
You referred to the hospdead variable, but in the toy data set it has just one nonzero entry, so I'm using the death variable instead to demonstrate the principle.
First, a base R approach:
# Among rows with death == 1, the share whose dnrday is at most 3 ...
mean(SB_xlsx1[SB_xlsx1$death == 1, ]$dnrday <= 3)
# ... and the share whose dnrday is greater than 3.
mean(SB_xlsx1[SB_xlsx1$death == 1, ]$dnrday > 3)
The idea is to restrict to the subset of rows for which a death occurred, then perform a logical check to see which entries have dnrday greater than 3.
Note that if you have NA entries in death, you'll want to remove them first as you did with those in dnrday.
For a dplyr approach:
library(dplyr)
SB_xlsx1 %>%
filter(death == 1) %>%
summarize(mean(dnrday <= 3), mean(dnrday > 3))
or, for a slightly nicer-looking table,
# Keep rows with death == 1, group them by whether dnrday <= 3, and report each
# group's row count divided by nrow(.).
# NOTE(review): `.` is presumably the magrittr placeholder for the data piped
# into summarize() (all filtered rows, not just the current group) -- confirm.
SB_xlsx1 %>%
filter(death == 1) %>%
group_by(dnrday <= 3) %>%
summarize(prop = n() / nrow(.))

Layering ggplot

I have 3 piece of data that I need to layer onto one plot. The first time series layer is coded:
p<-ggplot(MI_FL_Data, aes(realdate, FLday))+geom_line()
The next layer adds two geom_hlines at yintercept=15000 and 17000 respectively. This layer is coded:
q<-ggplot(MI_FL_Data, aes( realdate, FL_Actions))+geom_point(na.rm = TRUE)
The final layer plots the points based on a categorical variable FL_Actions at the yintercept produced in the second code. This code is:
r<-ggplot(MI_FL_Data, aes(realdate, FLday))+
geom_hline(data = MI_FL_Data %>% filter(FL_Actions == 1), aes(yintercept = 15000), linetype=5, na.rm=TRUE)+
geom_hline(data = MI_FL_Data %>% filter(FL_Actions == 2), aes(yintercept = 17000), linetype=1, na.rm=TRUE))
Now I need to layer each of these saved vectors on top of each other in one graph. When I use the code:
ggplot(MI_FL_Data, aes(realdate, FLday))+
geom_hline(data=r)+
geom_point(data=r)
I get an error: data must be a data frame, or other object coercible by fortify(), not an S3 object with class gg/ggplot. I thought by saving each layer it would be fairly simple to just add them together. Any advice? I'm a bit of a novice with ggplot but what I want to do seem fairly intuitive so I'm stumped.
I've added images of each layer just in case.
# data
structure(list(Date = c("1/22/20", "1/23/20", "1/24/20", "1/25/20",
"1/26/20", "1/27/20", "1/28/20", "1/29/20", "1/30/20", "1/31/20",
"2/1/20", "2/2/20", "2/3/20", "2/4/20", "2/5/20", "2/6/20", "2/7/20",
"2/8/20", "2/9/20", "2/10/20", "2/11/20", "2/12/20", "2/13/20",
"2/14/20", "2/15/20", "2/16/20", "2/17/20", "2/18/20", "2/19/20",
"2/20/20"), Date2 = c("1/22/20", "1/23/20", "1/24/20", "1/25/20",
"1/26/20", "1/27/20", "1/28/20", "1/29/20", "1/30/20", "1/31/20",
"2/1/20", "2/2/20", "2/3/20", "2/4/20", "2/5/20", "2/6/20", "2/7/20",
"2/8/20", "2/9/20", "2/10/20", "2/11/20", "2/12/20", "2/13/20",
"2/14/20", "2/15/20", "2/16/20", "2/17/20", "2/18/20", "2/19/20",
"2/20/20"), Date3 = c("1/22/20", "1/23/20", "1/24/20", "1/25/20",
"1/26/20", "1/27/20", "1/28/20", "1/29/20", "1/30/20", "1/31/20",
"2/1/20", "2/2/20", "2/3/20", "2/4/20", "2/5/20", "2/6/20", "2/7/20",
"2/8/20", "2/9/20", "2/10/20", "2/11/20", "2/12/20", "2/13/20",
"2/14/20", "2/15/20", "2/16/20", "2/17/20", "2/18/20", "2/19/20",
"2/20/20"), FLORIDA = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), FLday = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0), MICHIGAN = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0), MIday = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), FL_Actions = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), MI_Actions = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), realdate = structure(c(18283,
18284, 18285, 18286, 18287, 18288, 18289, 18290, 18291, 18292,
18293, 18294, 18295, 18296, 18297, 18298, 18299, 18300, 18301,
18302, 18303, 18304, 18305, 18306, 18307, 18308, 18309, 18310,
18311, 18312), class = "Date")), row.names = c(NA, -30L), class = c("tbl_df",
"tbl", "data.frame"))
NOTE that FL_Actions shows up as NA in this snippet of the data. This is because policy actions did not occur until March and continued through November of 2020.
This is the current ggplot created with suggested code:
ggplot(MI_FL_Data, aes(realdate, FLday)) +
geom_line()+ geom_label(data=MI_FL_Data, aes(label=FL_Actions), nudge_x = 0.50, nudge_y=.25, size=2, na.rm=TRUE)+
geom_point(na.rm = TRUE) +
geom_point(na.rm = TRUE) +
geom_hline(data = MI_FL_Data %>% filter(FL_Actions == 1),aes(yintercept = 15000), linetype=5, na.rm=TRUE) +
geom_hline(data = MI_FL_Data %>% filter(FL_Actions == 2),aes(yintercept = 17000), linetype=1, na.rm=TRUE) +
labs(x=NULL, y="Number of Reported Daily COVID Cases", title="State of Florida",caption="1= closing actions, 2= opening actions")+theme_classic()
Updating with suggestion. This is the code: ggplot(MI_FL_Data, aes(realdate, FLday)) + geom_line()+ geom_label(data=MI_FL_Data, aes(label=FL_Actions), nudge_x = 0.50, nudge_y=.25, size=2, na.rm=TRUE, y=15000)+geom_point(aes(realdate, 17000),na.rm = TRUE) + geom_point(aes(realdate, 15000), na.rm = TRUE) + geom_hline(data = MI_FL_Data %>% filter(FL_Actions == 1),aes(yintercept = 15000), linetype=5, na.rm=TRUE) +geom_hline(data = MI_FL_Data %>% filter(FL_Actions == 2),aes(yintercept = 17000), linetype=1, na.rm=TRUE)+labs(x=NULL, y="Number of Reported Daily COVID Cases", title="State of Florida",caption="1= closing actions, 2= opening actions")+theme_classic() and this is the resulting graph:
You have 3 plots, not 3 layers. Every time you use ggplot(), you're creating a new plot. The layers are just the geoms. You need to add only the layers together, not the full plots:
# One ggplot() call creates the plot; every geom_*() added with `+` is a layer.
ggplot(MI_FL_Data, aes(realdate, FL_Actions)) +
  # A single point layer: the original listed this identical layer twice,
  # which only re-draws the same points on top of themselves.
  geom_point(na.rm = TRUE) +
  # Horizontal line at 15000, drawn from the rows where FL_Actions == 1.
  geom_hline(
    data = MI_FL_Data %>% filter(FL_Actions == 1),
    aes(yintercept = 15000), linetype = 5, na.rm = TRUE
  ) +
  # Horizontal line at 17000, drawn from the rows where FL_Actions == 2.
  # The call is closed exactly once; the original had a stray trailing `)`
  # that made the whole expression fail to parse.
  geom_hline(
    data = MI_FL_Data %>% filter(FL_Actions == 2),
    aes(yintercept = 17000), linetype = 1, na.rm = TRUE
  )
I think the above should work. If it gives you trouble, please post a reproducible example - say 10 rows of data shared with dput, e.g., dput(MI_FL_Data[1:10, ]).
The code used to produce the following graph is: ggplot(MI_FL_Data, aes(realdate, FLday)) + geom_line()+ geom_label(data=MI_FL_Data, aes(label=FL_Actions), na.rm=TRUE, y=15500)+ geom_point(aes(realdate, 15000), na.rm = TRUE) + geom_hline(data = MI_FL_Data %>% filter(FL_Actions >= 1),aes(yintercept = 15000), linetype=5, na.rm=TRUE)+labs(x=NULL, y="Number of Reported Daily COVID Cases", title="State of Florida",caption="1= closing actions, 2= opening actions")+theme_classic()
However, the size of the hline is still concerning and I'm not sure exactly how to get the bolded overlay to go away. Any suggestions on this are welcome.

How to make one table of proportions of demographic variables in R

I'm new to R and am having trouble with a simple command. How do I find the proportion of demographic variables (for example, proportion of English speakers in my population, or proportion of White respondents)?
I'd like to create a large table with all of the proportions, and would hopefully include mean age and median education level, but am having trouble finding the command. This is what I've tried:
table2 <- table(VR_Data$English)
prop.table(table2)
table3 <- table(VR_Data$race)
prop.table(table3)
table4 <- table(VR_Data$male)
prop.table(table4)
If it helps, this is my data:
structure(list(study = c(4, 4, 4, 1, 1, 1), TREATMENT = c(0,
0, 0, 0, 0, 0), TREATMENT4 = c(0, 0, 0, 0, 0, 0), TREATMENT2 = c(0,
0, 0, 0, 0, 0), TREATMENT3 = c(0, 0, 0, 0, 0, 0), order = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), id = c(279,
238, 239, 135, 143, 138), treatment = c(0, 0, 0, 0, 0, 0), treatment_condition = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), control_condition = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), m_check1 = c(1,
1, 1, 1, 1, 1), relationship = c(NA, NA, NA, 7, 6, 5), payment = c(NA,
NA, NA, 10, 3, 3), educ_level = c(14, 14, 12, 16, 16, 18), golf = c(3,
5, 3, 3, 2, 3), male = c(1, 0, 1, 0, 0, 1), Asian = c(0, 1, 0,
0, 0, 0), Black = c(0, 0, 0, 0, 0, 0), Latino = c(1, 0, 0, 0,
0, 0), White = c(0, 0, 1, 1, 1, 1), age = c(27, 53, 49, 25, 28,
24), English = c(1, 1, 1, 1, 1, 1), education = c(16, 16, 14,
14, 14, 16), enjoy = c(4, 1, 3.5, 4.25, 3.25, 3.5), RELATIONSHIP = c(4.33333349227905,
1, 4.33333349227905, 3.66666674613953, 3.5, 3.66666674613953),
anxiety = c(3, 3.40000009536743, 2.20000004768372, 1.25,
2, 1.25), BEH_SIM = c(3, 1, 3.75, 2.75, 2.5, 1.75), sptconf = c(3.33333325386047,
1.5, 4, 4.83333349227905, 4, 3.66666674613953), NEG_EFFICACY = c(4,
1.16666662693024, 3.66666674613953, 4.83333349227905, 4.16666650772095,
4.5), spteffort = c(3.16666674613953, 3.5, 4.16666650772095,
3.16666674613953, 3.16666674613953, 3.5), SPTEFFORT_OTHER = c(3.16666674613953,
3.5, 3.5, 3.16666674613953, 3, 3.33333325386047), SIM_VALUES = c(3.75,
1, 3.75, 3.75, 1.5, 2.25), COOP_MOTIV = c(2.33333325386047,
3, 2.66666674613953, 5, 2.5, 2.66666674613953), COMP_MOTIV = c(5,
5, 3.20000004768372, 4.40000009536743, 2.40000009536743,
4.40000009536743), presence = c(NA, NA, NA, 2.79999995231628,
1.79999995231628, 2.59999990463257), environ = c(NA, NA,
NA, 3, 4, 3), openresponse = c(NA, NA, NA, 94.25, 86, 60),
TotalOwnerCommission = c(300, 266.666656494141, 258.333343505859,
266.666656494141, 383.333343505859, 325), TotalRangerComm = c(258.333343505859,
233.33332824707, 291.666656494141, 258.333343505859, 175,
166.66667175293), TotalComm = c(279.166687011719, 250, 275,
262.5, 279.166687011719, 245.833343505859), merge = c(1,
1, 1, 0, 0, 0), Control = c(1, 1, 1, NA, NA, NA), treatment_Shoes = c(0,
0, 0, NA, NA, NA), treatment_Instructions_Only = c(0, 0,
0, NA, NA, NA), treatment_Info_Only = c(0, 0, 0, NA, NA,
NA), treatment_Info_Instructions = c(0, 0, 0, NA, NA, NA),
group = c("OwnerOnly", "OwnerOnly", "OwnerOnly", "", "",
""), race = c(4, 2, 5, NA, NA, NA), race_a = c("", "", "",
"", "", ""), RELATIONSHIP_2 = c(9.02055358886719, 1, 9.02055358886719,
7.02113246917725, 6.54790019989014, 7.02113246917725), TotalOwnerCommission_2 = c(5196.15234375,
4354.64794921875, 4152.12744140625, 4354.64794921875, 7505.24560546875,
5859.02099609375)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
How can I put all of these proportions into one table, with mean and medians? Is this possible? Thank you so much in advance.
If I understand your question correctly, this should help you.
library(dplyr)

# One-row summary of VR_Data: the share of rows flagged 1 for English, White
# and male (sum of the 0/1 indicator divided by the row count), plus the mean
# of age and of education.
summarise(
  VR_Data,
  English_prop = sum(English) / n(),
  White_prop = sum(White) / n(),
  male_prop = sum(male) / n(),
  age_avg = mean(age),
  education_avg = mean(education)
)
Should give you this...
# A tibble: 1 x 5
English_prop White_prop male_prop age_avg education_avg
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 0.667 0.5 34.3 15

r for loop with names mutate

The goal is to replace NAs with 0 values in a set of variables using a loop function. Obviously, this is a super simple loop function, but I have no idea why this is not doing what it should.
two additional preferences, suggestions that use the variable names (as opposed to column numbers) and use dplyr are preferred.
library
library(plyr)
library(dplyr)
sample data
y <- structure(list(pid = c(1002L, 1002L, 1002L, 1002L, 1002L, 1002L,1002L, 1002L, 1002L, 1002L), year = 1968:1977, weeks_hd_e = c(3,0, 50, 49, 50, 50, 50, 50, 50, 49), weeks_wf_e = c(4, 6, 0, 0,0, 0, 0, 0, 0, 0), weeks_hd_u = c(NA, NA, 0, 0, 0, 0, 0, 0, 0,0), weeks_hd = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), weeks_wf_u = c(NA,NA, NA, NA, NA, NA, NA, NA, 0, NA), weeks_wf = c(NA_real_, NA_real_,NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,NA_real_)), .Names = c("pid", "year", "weeks_hd_e", "weeks_wf_e","weeks_hd_u", "weeks_hd", "weeks_wf_u", "weeks_wf"), row.names = c(NA,10L), class = "data.frame")
this command works
y <- mutate(y, i = ifelse(!is.na(i), i, 0))
this loop does not
vars <- c("weeks_hd_e", "weeks_hd_u", "weeks_wf_e", "weeks_wf_u", "weeks_hd", "weeks_wf")
for (i in names(vars)) {
y <- mutate(y, i = ifelse(!is.na(i), i, 0))
}
View(y)
i have been given two excellent answers from friends:
# Replace NA with 0 in every column of `y` named in `vars`.
# Looping over the names themselves (rather than 1:length(vars)) makes the
# loop a no-op when `vars` is empty; 1:0 would iterate over c(1, 0) and index
# `y` with NA / character(0).
for (v in vars) {
  y[[v]][is.na(y[[v]])] <- 0
}
or
# Replace NA with 0 in every column of `y` named in `vars`, column by column.
# `y[vars]` always stays a data frame (unlike `y[, vars]`, which drops to a
# plain vector for a single column and makes apply() fail), and lapply() keeps
# each column's own type instead of coercing the whole selection to a matrix.
y[vars] <- lapply(y[vars], function(x) replace(x, is.na(x), 0))
The replace_na command from the tidyr package does exactly what you want.
Use it like this:
install.packages("tidyr")
library(tidyr)
# your data
y <- structure(list(pid = c(1002L, 1002L, 1002L, 1002L, 1002L, 1002L,1002L, 1002L, 1002L, 1002L), year = 1968:1977, weeks_hd_e = c(3,0, 50, 49, 50, 50, 50, 50, 50, 49), weeks_wf_e = c(4, 6, 0, 0,0, 0, 0, 0, 0, 0), weeks_hd_u = c(NA, NA, 0, 0, 0, 0, 0, 0, 0,0), weeks_hd = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), weeks_wf_u = c(NA,NA, NA, NA, NA, NA, NA, NA, 0, NA), weeks_wf = c(NA_real_, NA_real_,NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,NA_real_)), .Names = c("pid", "year", "weeks_hd_e", "weeks_wf_e","weeks_hd_u", "weeks_hd", "weeks_wf_u", "weeks_wf"), row.names = c(NA,10L), class = "data.frame")
# replacing NAs in your dataframe
# specify the variables you want to replace NAs in and the replacement in the `replace` = list argument
y <- replace_na(y, replace = list(weeks_hd_e = 0, weeks_hd_u = 0, weeks_wf_e = 0, weeks_wf_u = 0, weeks_hd = 0, weeks_wf = 0))
Note that this meets your preference to specify the variables by name and is more flexible in terms of replacement, i.e. you can replace NAs in numeric and character variables in the same command.

Resources