How to calculate a proportion in R - r

I have this reproducible DataFrame:
structure(list(age = c(62.84998, 60.33899, 52.74698, 42.38498,
79.88495, 93.01599, 62.37097, 86.83899, 85.65594, 42.25897),
death = c(0, 1, 1, 1, 0, 1, 1, 1, 1, 1), sex = c("male",
"female", "female", "female", "female", "male", "male", "male",
"male", "female"), hospdead = c(0, 1, 0, 0, 0, 1, 0, 0, 0,
0), slos = c(5, 4, 17, 3, 16, 4, 9, 7, 12, 8), d.time = c(2029,
4, 47, 133, 2029, 4, 659, 142, 63, 370), dzgroup = c("Lung Cancer",
"Cirrhosis", "Cirrhosis", "Lung Cancer", "ARF/MOSF w/Sepsis",
"Coma", "CHF", "CHF", "Lung Cancer", "Colon Cancer"), dzclass = c("Cancer",
"COPD/CHF/Cirrhosis", "COPD/CHF/Cirrhosis", "Cancer", "ARF/MOSF",
"Coma", "COPD/CHF/Cirrhosis", "COPD/CHF/Cirrhosis", "Cancer",
"Cancer"), num.co = c(0, 2, 2, 2, 1, 1, 1, 3, 2, 0), edu = c(11,
12, 12, 11, NA, 14, 14, NA, 12, 11), income = c("$11-$25k",
"$11-$25k", "under $11k", "under $11k", NA, NA, "$25-$50k",
NA, NA, "$25-$50k"), scoma = c(0, 44, 0, 0, 26, 55, 0, 26,
26, 0), charges = c(9715, 34496, 41094, 3075, 50127, 6884,
30460, 30460, NA, 9914), totcst = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), totmcst = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_
), avtisst = c(7, 29, 13, 7, 18.666656, 5, 8, 6.5, 8.5, 8
), race = c("other", "white", "white", "white", "white",
"white", "white", "white", "black", "hispanic"), sps = c(33.8984375,
52.6953125, 20.5, 20.0976562, 23.5, 19.3984375, 17.296875,
21.5976562, 15.8984375, 2.2998047), aps = c(20, 74, 45, 19,
30, 27, 46, 53, 17, 9), surv2m = c(0.262939453, 0.0009999275,
0.790893555, 0.698974609, 0.634887695, 0.284973145, 0.892944336,
0.670898438, 0.570922852, 0.952880859), surv6m = c(0.0369949341,
0, 0.664916992, 0.411987305, 0.532958984, 0.214996338, 0.820922852,
0.498962402, 0.24899292, 0.887939453), hday = c(1, 3, 4,
1, 3, 1, 1, 1, 1, 1), diabetes = c(0, 0, 0, 0, 0, 0, 0, 1,
0, 0), dementia = c(0, 0, 0, 0, 0, 0, 0, 0, 1, 0), ca = c("metastatic",
"no", "no", "metastatic", "no", "no", "no", "no", "metastatic",
"metastatic"), prg2m = c(0.5, 0, 0.75, 0.899999619, 0.899999619,
0, NA, 0.799999714, 0.049999982, NA), prg6m = c(0.25, 0,
0.5, 0.5, 0.8999996, 0, 0.6999998, 0.3999999, 0.0001249999,
NA), dnr = c("no dnr", NA, "no dnr", "no dnr", "no dnr",
"no dnr", "no dnr", "no dnr", "dnr after sadm", "no dnr"),
dnrday = c(5, NA, 17, 3, 16, 4, 9, 7, 2, 8), meanbp = c(97,
43, 70, 75, 59, 110, 78, 72, 97, 84), wblc = c(6, 17.0976562,
8.5, 9.09960938, 13.5, 10.3984375, 11.6992188, 13.5996094,
9.69921875, 11.2988281), hrt = c(69, 112, 88, 88, 112, 101,
120, 100, 56, 94), resp = c(22, 34, 28, 32, 20, 44, 28, 26,
20, 20), temp = c(36, 34.59375, 37.39844, 35, 37.89844, 38.39844,
37.39844, 37.59375, 36.59375, 38.19531), pafi = c(388, 98,
231.65625, NA, 173.3125, 266.625, 309.5, 404.75, 357.125,
NA), alb = c(1.7998047, NA, NA, NA, NA, NA, 4.7998047, NA,
NA, 4.6992188), bili = c(0.19998169, NA, 2.19970703, NA,
NA, NA, 0.39996338, NA, 0.39996338, 0.19998169), crea = c(1.19995117,
5.5, 2, 0.79992676, 0.79992676, 0.69995117, 1.59985352, 2,
1, 0.79992676), sod = c(141, 132, 134, 139, 143, 140, 132,
139, 143, 139), ph = c(7.459961, 7.25, 7.459961, NA, 7.509766,
7.65918, 7.479492, 7.509766, 7.449219, NA), glucose = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), bun = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_), urine = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), adlp = c(7, NA, 1, 0, NA, NA, 0, NA, NA, 0), adls = c(7,
1, 0, 0, 2, 1, 1, 0, 7, NA), sfdm2 = c(NA, "<2 mo. follow-up",
"<2 mo. follow-up", "no(M2 and SIP pres)", "no(M2 and SIP pres)",
"<2 mo. follow-up", "no(M2 and SIP pres)", NA, NA, NA), adlsc = c(7,
1, 0, 0, 2, 1, 1, 0, 7, 0.4947999)), row.names = c(NA, 10L
), class = "data.frame")
I am needing to calculate the proportion of patients who died in the hospital in patients with an active DNR order on day 3 and in patients without an active DNR order on day 3. To group which patients had an active DNR on day 3 and which did not, I used the subset function below:
# Drop patients whose DNR day is missing, then split the remainder on
# whether the DNR order was in place by day 3 (YesDNR) or came later (NoDNR).
SB_xlsx1 <- SB_xlsx[!is.na(SB_xlsx$dnrday), ]
YesDNR <- SB_xlsx1[SB_xlsx1$dnrday <= 3, ]
NoDNR <- SB_xlsx1[SB_xlsx1$dnrday > 3, ]
However, I don't know how to calculate the proportion of patients that died in the hospital for those with a DNR and without a DNR. The 'hospdead' variable has all 0s and 1s, where 0 = not dead and 1 = dead. However, I don't know how to get the proportion that died for having a DNR at day 3 and did not have a DNR at day 3. What code could I use for my desired result. SB_xlsx also just represents my DataFrame name.

There are a few ways to do this, but the simplest is probably the aggregate function.
> aggregate( hospdead ~ (dnrday<=3) , SB_xlsx1 , mean)
dnrday <= 3 hospdead
1 FALSE 0.1428571
2 TRUE 0.0000000

You may use tapply to group deaths by the condition dnrday <= 3, i.e. with an active DNR on day 3 and calculate the mean.
# In-hospital death rate within each DNR group. hospdead is coded 0/1, so its
# mean inside a group is the proportion of that group who died in hospital.
# Rows whose grouping value (dnrday) is NA are left out by tapply().
(res <- tapply(SB_xlsx$hospdead, SB_xlsx$dnrday <= 3, mean))
#     FALSE      TRUE
# 0.1428571 0.0000000
# FALSE = no active DNR order by day 3, TRUE = active DNR order by day 3.
# These are per-group rates, so they are not expected to sum to 1.

EDIT: I apologize; I misread your post when providing my original answer. I've revised it below.
You referred to the hospdead variable, but in the toy data set it has just one nonzero entry, so I'm using the death variable instead to demonstrate the principle.
First, a base R approach:
# Among patients who died: the share whose DNR day is <= 3, then the share
# whose DNR day is > 3.
mean(SB_xlsx1$dnrday[SB_xlsx1$death == 1] <= 3)
mean(SB_xlsx1$dnrday[SB_xlsx1$death == 1] > 3)
The idea is to restrict to the subset of rows for which a death occurred, then perform a logical check to see which entries have dnrday greater than 3.
Note that if you have NA entries in death, you'll want to remove them first as you did with those in dnrday.
For a dplyr approach:
library(dplyr)
# Keep only the rows with death == 1, then report in a single row the share
# of them with dnrday <= 3 and the share with dnrday > 3.
SB_xlsx1 %>%
filter(death == 1) %>%
summarize(mean(dnrday <= 3), mean(dnrday > 3))
or, for a slightly nicer-looking table,
# Same shares as one row per `dnrday <= 3` group: the number of rows in each
# group (n()) divided by nrow(.).
# NOTE(review): `.` is presumably the magrittr placeholder for the grouped
# data piped into summarize(), i.e. all rows with death == 1 - confirm.
SB_xlsx1 %>%
filter(death == 1) %>%
group_by(dnrday <= 3) %>%
summarize(prop = n() / nrow(.))

Related

How to estimate a population proportion that has a certain disease

I have this data (listed as reproducible):
structure(list(age = c(62.84998, 60.33899, 52.74698, 42.38498
), death = c(0, 1, 1, 1), sex = c("male", "female", "female",
"female"), hospdead = c(0, 1, 0, 0), slos = c(5, 4, 17, 3), d.time = c(2029,
4, 47, 133), dzgroup = c("Lung Cancer", "Cirrhosis", "Cirrhosis",
"Lung Cancer"), dzclass = c("Cancer", "COPD/CHF/Cirrhosis", "COPD/CHF/Cirrhosis",
"Cancer"), num.co = c(0, 2, 2, 2), edu = c(11, 12, 12, 11), income = c("$11-$25k",
"$11-$25k", "under $11k", "under $11k"), scoma = c(0, 44, 0,
0), charges = c(9715, 34496, 41094, 3075), totcst = c(NA_real_,
NA_real_, NA_real_, NA_real_), totmcst = c(NA_real_, NA_real_,
NA_real_, NA_real_), avtisst = c(7, 29, 13, 7), race = c("other",
"white", "white", "white"), sps = c(33.8984375, 52.6953125, 20.5,
20.0976562), aps = c(20, 74, 45, 19), surv2m = c(0.262939453,
0.0009999275, 0.790893555, 0.698974609), surv6m = c(0.0369949341,
0, 0.664916992, 0.411987305), hday = c(1, 3, 4, 1), diabetes = c(0,
0, 0, 0), dementia = c(0, 0, 0, 0), ca = c("metastatic", "no",
"no", "metastatic"), prg2m = c(0.5, 0, 0.75, 0.899999619), prg6m = c(0.25,
0, 0.5, 0.5), dnr = c("no dnr", NA, "no dnr", "no dnr"), dnrday = c(5,
NA, 17, 3), meanbp = c(97, 43, 70, 75), wblc = c(6, 17.0976562,
8.5, 9.09960938), hrt = c(69, 112, 88, 88), resp = c(22, 34,
28, 32), temp = c(36, 34.59375, 37.39844, 35), pafi = c(388,
98, 231.65625, NA), alb = c(1.7998047, NA, NA, NA), bili = c(0.19998169,
NA, 2.19970703, NA), crea = c(1.19995117, 5.5, 2, 0.79992676),
sod = c(141, 132, 134, 139), ph = c(7.459961, 7.25, 7.459961,
NA), glucose = c(NA_real_, NA_real_, NA_real_, NA_real_),
bun = c(NA_real_, NA_real_, NA_real_, NA_real_), urine = c(NA_real_,
NA_real_, NA_real_, NA_real_), adlp = c(7, NA, 1, 0), adls = c(7,
1, 0, 0), sfdm2 = c(NA, "<2 mo. follow-up", "<2 mo. follow-up",
"no(M2 and SIP pres)"), adlsc = c(7, 1, 0, 0)), row.names = c(NA,
4L), class = "data.frame")
I am wanting to estimate the population proportion of individuals who had lung cancer listed as their primary disease group (dzgroup). How would I do this? My original thought was to just divide the total number that have lung cancer by the whole dataset population, but I do not believe this is correct.
If we want the proportion over the whole data set, create a logical vector and take its mean: since TRUE counts as 1 and FALSE as 0, the mean is the proportion of TRUEs, and multiplying by 100 gives the percentage.
# Percentage of rows whose dzgroup equals "Lung Cancer", rounded to 2 decimals;
# rows where the comparison is NA are dropped from the mean (na.rm = TRUE).
round(100 * mean(df1$dzgroup == "Lung Cancer", na.rm = TRUE), 2)

Error bars on double Y-axis graph--ggplot2

I am trying to add error bars to my double y axis graph, but when ran, it completely ruins the graph. I attached a picture below. I also added my code.
If you need the full data set, let me know! Thank you so much in advance!
# Ratio used to draw temperature on the precipitation (primary) axis.
scalefactor <- max(Complete_Seasonality_Data$PRCP)/max(Complete_Seasonality_Data$Temp_C)
p <- ggplot(Complete_Seasonality_Data, aes(x = NewMonths5))
# Precipitation is plotted in its own units.
p <- p + geom_point(aes(y = PRCP, colour = "Precipitation"))
p <- p + geom_line(aes(y = PRCP, colour = "Precipitation", group=1))
# Temperature is multiplied by scalefactor so it shares the primary axis.
p <- p + geom_point(aes(y = Temp_C*scalefactor, colour = "Temperature"))
p <- p + geom_line(aes(y = Temp_C*scalefactor, colour = "Temperature", group=1))
# The secondary axis divides by scalefactor to show temperature units again.
p <- p + scale_y_continuous(sec.axis = sec_axis(~./scalefactor, name = ylabseasonality))
p <- p + scale_colour_manual(values = c("blue", "red"))
p <- p + labs(y = "Precipitation (in)",
x = "Month",
colour = "Parameter")
p <- p + theme_bw()
p <- p + theme(axis.text.x = element_text(angle = 90), legend.position = c(.99, .01))
# NOTE(review): these temperature bars use the unscaled TempSummary$mean and
# StdErrorTemp, while the temperature points above are drawn at
# Temp_C*scalefactor, so bars and points are on different scales.
p <- p + geom_errorbar(aes(ymin = TempSummary$mean - StdErrorTemp, ymax = TempSummary$mean + StdErrorTemp), position=position_dodge(.9), width=0.2)
# NOTE(review): ymin is built from PrecipSummary$mean but ymax from
# TempSummary$mean, so the upper end of each precipitation bar is a
# temperature value.
p <- p + geom_errorbar(aes(ymin = PrecipSummary$mean - StdErrorPrecip, ymax = TempSummary$mean + StdErrorPrecip), position=position_dodge(.9), width=0.2)
p
How I computed the Std Errors
TempSummary<- Summarize(Temp_C~ Month,
data=Chara_Data,
digits=3)
View(TempSummary)
StdErrorTemp<- (TempSummary$sd)/ (sqrt(TempSummary$n))
View(StdErrorTemp)
PrecipSummary<- Summarize(PRCP ~ Group.1,
data=Complete_Seasonality_Data,
digits=3)
StdErrorPrecip<- (PrecipSummary$sd/ sqrt(PrecipSummary$n))
Complete data set!
structure(list(Group.1 = c("April", "August", "December", "February",
"January", "July", "June", "March", "May", "November", "October",
"September"), Season = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), Month = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), Year = c(2017.05882352941, 2016.6, 2016.6, 2017.6,
2017.6, 2016.6, 2017.05882352941, 2017, 2017.05882352941, 2016.6,
2016.6, 2016.6), Date = structure(c(1494315952.94118, 1490691600,
1500316560, 1506183120, 1504163520, 1487501280, 1499108611.76471,
1489840800, 1496798682.35294, 1498314240, 1496087280, 1493421840
), class = c("POSIXct", "POSIXt")), Site = c(8.17647058823529,
8.125, 7.775, 7.775, 6.375, 6.375, 8.20588235294118, 6.80555555555556,
6.55882352941176, 6.375, 8.1, 6.375), PercentCover = c(0.765882352941176,
0.7125, 0.7505, 0.7775, 0.8625, 0.867, 0.763529411764706, 0.83,
0.850588235294118, 0.848, 0.7065, 0.834), AveHt = c(60.1684438927086,
50.2311192279942, 58.9048701298701, 57.3448097041847, 55.2253291847042,
64.6965656565657, 57.9602622867329, 56.672138047138, 64.4076426024955,
57.1465322871573, 54.3781565656566, 58.3185831529582), SE = c(7.07246013321596,
7.79305525403115, 7.00224498332823, 6.46671176266333, 6.32495719718401,
7.04611575726224, 8.09695750051648, 5.65899377193264, 7.28959135811987,
6.24571692582705, 7.32819802238581, 7.05669314452393), MaxHt = c(88.3823529411765,
81.625, 87.75, 85, 85.875, 96.425, 92.9117647058823, 82.5, 98.6764705882353,
88.125, 79.75, 89.65), green = c(0.350962665193537, 0.278211058736042,
0.183934291894458, 0.197711422851132, 0.179043270311077, 0.335751664926552,
0.186533536107468, 0.256634190010066, 0.319397625619223, 0.204519948331115,
0.249063275007846, 0.277894684744482), yellow = c(0.556643767952726,
0.569690303836593, 0.686152813243381, 0.654331042886853, 0.594548585049017,
0.554485584960289, 0.581008683220038, 0.609988063809375, 0.594827659217835,
0.620510694031593, 0.633793562346056, 0.600527348262596), brown = c(0.0923935668537371,
0.14983619398845, 0.122185622134889, 0.145933312808728, 0.226114026992848,
0.10976275011316, 0.229212761734686, 0.132653108499399, 0.0857747151629417,
0.174675239990233, 0.114398064606882, 0.121577966992922), Temp = c(78.4411764705882,
82.975, 75.65, 74.75, 74.3, 82.2051282051282, 81.0882352941177,
75.8333333333333, 79.8823529411765, 78.6, 80.1944444444444, 83
), Temp_C = c(25.8006535947712, 28.3194444444444, 24.25, 23.75,
23.5, 27.8917378917379, 27.2712418300654, 24.3518518518519, 26.6013071895425,
25.8888888888889, 26.7746913580247, 28.3333333333333), Vis = c(1.98823529411765,
2.12820512820513, 2.2125, 2.07, 2.1625, 2.07179487179487, 2.05,
2.02777777777778, 2.11764705882353, 2.205, 2.11, 2.17375), Nests = c(12.4117647058824,
17.1, 7.1, 6.275, 4, 8.9, 13.8787878787879, 4.88888888888889,
7.38235294117647, 2.8, 13.025, 5.6), SickorDeadFish = c(0.0882352941176471,
0.2, 0.175, 0.075, 0.05, 0.117647058823529, 0.0882352941176471,
0.166666666666667, 0.0294117647058824, 0.25, 0.333333333333333,
0.275), Cladophora = c(0.0866666666666667, 0.0492857142857143,
0.0471428571428571, 0.0907142857142857, 0.0264285714285714, 0.0154545454545455,
0.0380952380952381, 0.0295238095238095, 0.0161904761904762, 0.0178571428571429,
0.0407142857142857, 0.03), Comments = c(NaN, NaN, NaN, NaN, NaN,
NaN, NaN, NaN, NaN, NaN, NaN, NaN), STATION = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), NAME = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_), DATE = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), MONTH = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), AWND = c(6.52626966292135, 5.97866090712743, 5.85811926605505,
6.31656097560976, 6.181, 6.1103908045977, 6.23947727272727, 6.5154211663067,
6.0985313174946, 5.64997635933806, 5.43263157894737, 5.54940639269406
), FMTM = c(1412.13333333333, 1431.1935483871, 1411.77419354839,
1535.16666666667, 1339.24137931034, 1439.77419354839, 1378.3,
1398.8064516129, 1353.12903225806, 1362.96666666667, 1408.45161290323,
1381.46666666667), PGTM = c(1394.1095890411, 1394.96774193548,
1306.83333333333, 1412.0511627907, 1327.90350877193, 1435.51769911504,
1372.37674418605, 1389.12328767123, 1376.75576036866, 1373.45341614907,
1346.2774566474, 1396), PRCP = c(0.0205869074492099, 0.0248701298701299,
0.0663425925925926, 0.0481472684085511, 0.0360991379310345, 0.0101144164759725,
0.00790067720090293, 0.0762693156732892, 0.0298491379310345,
0.0472985781990521, 0.034965034965035, 0.0243778801843318), SNOW = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), SNWD = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0), TAVG = c(78.5333333333333, NaN, NaN, 61.1052631578947,
68.6333333333333, 80.2903225806452, 79.4, 72.5161290322581, 77.8709677419355,
NaN, NaN, NaN), TMAX = c(83.6826484018265, 88.8509719222462,
81.4940617577197, 80.6938271604938, 80.8072562358277, 88.1520737327189,
86.8795454545455, 81.3290043290043, 84.6048034934498, 83.8289786223278,
86.3615560640732, 88.1009174311927), TMIN = c(67.5423340961098,
72.5917926565875, 66.4394299287411, 64.9283950617284, 64.5600907029478,
71.9654377880184, 70.6772727272727, 65.7597402597403, 68.6527472527472,
68.9643705463183, 70.558352402746, 71.7821100917431), TSUN = c(NaN,
NaN, NaN, 0, 0, NaN, NaN, NaN, NaN, NaN, NaN, NaN), WDF2 = c(115.538116591928,
100.905172413793, 133.577981651376, 143.965936739659, 149.438444924406,
91.141876430206, 99.5022624434389, 131.612903225806, 124.279569892473,
109.693396226415, 119.450800915332, 115.068493150685), WDF5 = c(107.545045045045,
97.6077586206897, 124.528735632184, 133.031784841076, 140.826086956522,
82.5229357798165, 90.972850678733, 120.634573304158, 115.714285714286,
103.720379146919, 109.266055045872, 104.736842105263), WSF2 = c(15.2026905829596,
14.8530172413793, 14.6919724770642, 15.4111922141119, 15.1332613390929,
14.9070938215103, 15.083257918552, 15.4161290322581, 14.8625806451613,
14.322641509434, 14.3432494279176, 14.5600456621005), WSF5 = c(22.1105855855856,
21.9961206896552, 20.8029885057471, 20.8081145584726, 20.4824675324675,
22.4052752293578, 22.2158371040724, 21.9317286652079, 21.130303030303,
20.8722748815166, 20.493119266055, 21.0052511415525), WT01 = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), WT02 = c(NaN, 1, NaN, 1, 1,
NaN, NaN, 1, 1, NaN, 1, NaN), WT08 = c(1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1), WT10 = c(NaN, NaN, NaN, NaN, NaN, NaN, 1, NaN, NaN,
NaN, NaN, NaN), NewMonths2 = structure(c(17295, 17253, 17364,
17432, 17409, 17216, 17350, 17243, 17324, 17341, 17315, 17284
), class = "Date")), row.names = c(NA, -12L), class = "data.frame")
**Edited to add complete data set and how I did std error
Temp Summary
structure(list(Month = c("April", "August", "December", "February",
"January", "July", "June", "March", "May", "November", "October",
"September"), n = c(34, 40, 40, 40, 40, 40, 34, 36, 34, 40, 40,
40), nvalid = c(34, 40, 40, 40, 40, 39, 34, 36, 34, 40, 36, 40
), mean = c(25.801, 28.319, 24.25, 23.75, 23.5, 27.892, 27.271,
24.352, 26.601, 25.889, 26.775, 28.333), sd = c(0.478, 0.978,
0.921, 0.793, 0.551, 0.463, 0.632, 1.47, 0.905, 0.763, 0.928,
0.534), min = c(25, 26.667, 22.778, 21.667, 21.667, 27.222, 26.111,
22.778, 25, 25, 25.556, 27.222), Q1 = c(25.556, 27.778, 23.889,
23.333, 23.333, 27.778, 27.222, 23.333, 26.111, 25.556, 25.556,
27.778), median = c(25.556, 27.778, 23.889, 23.889, 23.333, 27.778,
27.222, 23.889, 26.667, 25.556, 27.222, 28.333), Q3 = c(25.972,
28.889, 25, 24.444, 23.889, 28.333, 27.639, 24.583, 27.222, 26.111,
27.361, 28.889), max = c(26.667, 30, 25.556, 25, 24.444, 28.889,
28.889, 27.222, 27.778, 27.778, 28.333, 29.444)), class = "data.frame", row.names = c(NA,
-12L))
Precip Summary
structure(list(MONTH = c("April", "August", "December", "February",
"January", "July", "June", "March", "May", "November", "October",
"September"), n = c(446, 464, 436, 422, 465, 437, 444, 465, 465,
424, 438, 439), nvalid = c(443, 462, 432, 421, 464, 437, 443,
453, 464, 422, 429, 434), mean = c(0.021, 0.025, 0.066, 0.048,
0.036, 0.01, 0.008, 0.076, 0.03, 0.047, 0.035, 0.024), sd = c(0.094,
0.184, 0.342, 0.211, 0.142, 0.047, 0.047, 0.343, 0.14, 0.24,
0.243, 0.112), min = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Q1 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), median = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0), Q3 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
max = c(1.1, 3.06, 4.94, 2.61, 1.5, 0.47, 0.76, 3.32, 1.43,
3.29, 3.64, 1.25), percZero = c(81.264, 87.662, 76.389, 75.534,
77.802, 86.728, 86.682, 75.717, 84.267, 77.962, 83.916, 81.797
)), class = "data.frame", row.names = c(NA, -12L))
Temp Summary Results
enter image description here
Precip Summary Results
enter image description here
I would suggest the following approach. Just be careful with the values of your error bars: the scaling factor must also be applied to the error bars of the series drawn on the secondary axis, which is why you got a messy plot. Here is the code using the data you added:
library(ggplot2)

# x variable: calendar date taken from the Date column
Complete_Seasonality_Data$NewMonths5 <- as.Date(Complete_Seasonality_Data$Date)

# Standard error of each monthly mean: sd / sqrt(n)
StdErrorTemp <- TempSummary$sd / sqrt(TempSummary$n)
StdErrorPrecip <- PrecipSummary$sd / sqrt(PrecipSummary$n)

# Factor that maps temperature onto the precipitation (primary) axis; the
# secondary axis divides by it again to label the values in temperature units.
scalefactor <- max(Complete_Seasonality_Data$PRCP) /
  max(Complete_Seasonality_Data$Temp_C)

# The error bars take their values from TempSummary / PrecipSummary by
# position, so both summaries must list the months in the same row order as
# Complete_Seasonality_Data (all three are alphabetical by month here).
p <- ggplot(Complete_Seasonality_Data, aes(x = NewMonths5)) +
  geom_point(aes(y = PRCP, colour = "Precipitation")) +
  geom_line(aes(y = PRCP, colour = "Precipitation", group = 1)) +
  # Precipitation is already on the primary axis: no rescaling needed.
  geom_errorbar(
    aes(
      ymin = PrecipSummary$mean - StdErrorPrecip,
      ymax = PrecipSummary$mean + StdErrorPrecip
    ),
    position = position_dodge(.9), width = 0.2
  ) +
  geom_point(aes(y = Temp_C * scalefactor, colour = "Temperature")) +
  geom_line(aes(y = Temp_C * scalefactor, colour = "Temperature", group = 1)) +
  # The mean AND its standard error are both in temperature units, so the
  # whole interval is rescaled; scaling only the mean leaves the bar length
  # in degrees, far too long on the precipitation axis.
  geom_errorbar(
    aes(
      ymin = (TempSummary$mean - StdErrorTemp) * scalefactor,
      ymax = (TempSummary$mean + StdErrorTemp) * scalefactor
    ),
    position = position_dodge(.9), width = 0.2
  ) +
  scale_y_continuous(
    sec.axis = sec_axis(~ . / scalefactor, name = 'Temperature')
  ) +
  scale_colour_manual(values = c("blue", "red")) +
  labs(
    y = "Precipitation (in)",
    x = "Month",
    colour = "Parameter"
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90),
    legend.position = c(.99, .01)
  )
p
Output:

How to make one table of proportions of demographic variables in R

I'm new to R and am having trouble with a simple command. How do I find the proportion of demographic variables (for example, proportion of English speakers in my population, or proportion of White respondents)?
I'd like to create a large table with all of the proportions, and would hopefully include mean age and median education level, but am having trouble finding the command. This is what I've tried:
table2 <- table(VR_Data$English)
prop.table(table2)
table3 <- table(VR_Data$race)
prop.table(table3)
table4 <- table(VR_Data$male)
prop.table(table4)
If it helps, this is my data:
structure(list(study = c(4, 4, 4, 1, 1, 1), TREATMENT = c(0,
0, 0, 0, 0, 0), TREATMENT4 = c(0, 0, 0, 0, 0, 0), TREATMENT2 = c(0,
0, 0, 0, 0, 0), TREATMENT3 = c(0, 0, 0, 0, 0, 0), order = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), id = c(279,
238, 239, 135, 143, 138), treatment = c(0, 0, 0, 0, 0, 0), treatment_condition = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), control_condition = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), m_check1 = c(1,
1, 1, 1, 1, 1), relationship = c(NA, NA, NA, 7, 6, 5), payment = c(NA,
NA, NA, 10, 3, 3), educ_level = c(14, 14, 12, 16, 16, 18), golf = c(3,
5, 3, 3, 2, 3), male = c(1, 0, 1, 0, 0, 1), Asian = c(0, 1, 0,
0, 0, 0), Black = c(0, 0, 0, 0, 0, 0), Latino = c(1, 0, 0, 0,
0, 0), White = c(0, 0, 1, 1, 1, 1), age = c(27, 53, 49, 25, 28,
24), English = c(1, 1, 1, 1, 1, 1), education = c(16, 16, 14,
14, 14, 16), enjoy = c(4, 1, 3.5, 4.25, 3.25, 3.5), RELATIONSHIP = c(4.33333349227905,
1, 4.33333349227905, 3.66666674613953, 3.5, 3.66666674613953),
anxiety = c(3, 3.40000009536743, 2.20000004768372, 1.25,
2, 1.25), BEH_SIM = c(3, 1, 3.75, 2.75, 2.5, 1.75), sptconf = c(3.33333325386047,
1.5, 4, 4.83333349227905, 4, 3.66666674613953), NEG_EFFICACY = c(4,
1.16666662693024, 3.66666674613953, 4.83333349227905, 4.16666650772095,
4.5), spteffort = c(3.16666674613953, 3.5, 4.16666650772095,
3.16666674613953, 3.16666674613953, 3.5), SPTEFFORT_OTHER = c(3.16666674613953,
3.5, 3.5, 3.16666674613953, 3, 3.33333325386047), SIM_VALUES = c(3.75,
1, 3.75, 3.75, 1.5, 2.25), COOP_MOTIV = c(2.33333325386047,
3, 2.66666674613953, 5, 2.5, 2.66666674613953), COMP_MOTIV = c(5,
5, 3.20000004768372, 4.40000009536743, 2.40000009536743,
4.40000009536743), presence = c(NA, NA, NA, 2.79999995231628,
1.79999995231628, 2.59999990463257), environ = c(NA, NA,
NA, 3, 4, 3), openresponse = c(NA, NA, NA, 94.25, 86, 60),
TotalOwnerCommission = c(300, 266.666656494141, 258.333343505859,
266.666656494141, 383.333343505859, 325), TotalRangerComm = c(258.333343505859,
233.33332824707, 291.666656494141, 258.333343505859, 175,
166.66667175293), TotalComm = c(279.166687011719, 250, 275,
262.5, 279.166687011719, 245.833343505859), merge = c(1,
1, 1, 0, 0, 0), Control = c(1, 1, 1, NA, NA, NA), treatment_Shoes = c(0,
0, 0, NA, NA, NA), treatment_Instructions_Only = c(0, 0,
0, NA, NA, NA), treatment_Info_Only = c(0, 0, 0, NA, NA,
NA), treatment_Info_Instructions = c(0, 0, 0, NA, NA, NA),
group = c("OwnerOnly", "OwnerOnly", "OwnerOnly", "", "",
""), race = c(4, 2, 5, NA, NA, NA), race_a = c("", "", "",
"", "", ""), RELATIONSHIP_2 = c(9.02055358886719, 1, 9.02055358886719,
7.02113246917725, 6.54790019989014, 7.02113246917725), TotalOwnerCommission_2 = c(5196.15234375,
4354.64794921875, 4152.12744140625, 4354.64794921875, 7505.24560546875,
5859.02099609375)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
How can I put all of these proportions into one table, with mean and medians? Is this possible? Thank you so much in advance.
If I understand your question correctly, this should help you.
library(dplyr)
# One-row summary of VR_Data: for each indicator column, sum / n() is the
# column total divided by the number of rows (the proportion coded 1 when the
# column holds only 0/1); age and education are averaged with mean().
# NOTE(review): no na.rm is used, so a missing value in any of these columns
# would make that summary NA - confirm the columns are complete.
VR_Data %>%
summarize(English_prop = sum(English) / n(),
White_prop = sum(White) / n(),
male_prop = sum(male) / n(),
age_avg = mean(age),
education_avg = mean(education))
Should give you this...
# A tibble: 1 x 5
English_prop White_prop male_prop age_avg education_avg
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 0.667 0.5 34.3 15

Create table using kable in R

This is the code that I used (with a lot of help from the StackOverflow community!) to create a simpler table using the same data:
library(here)
ANOVA_Relationship_Subset_sum <- ANOVA_Relationship_Subset %>%
dplyr::group_by(treatment) %>%
dplyr::summarize(
n=n(),
mean=mean(TotalComm),
`std. dev` = sd(TotalComm)
)
ANOVA_Relationship_Subset_sum
Now I'm on to something a little more complicated; how can I create a table like this:
If it helps, this is my data:
structure(list(study = c(4, 4, 4, 1, 1, 1), TREATMENT = c(0,
0, 0, 0, 0, 0), TREATMENT4 = c(0, 0, 0, 0, 0, 0), TREATMENT2 = c(0,
0, 0, 0, 0, 0), TREATMENT3 = c(0, 0, 0, 0, 0, 0), order = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), id = c(279,
238, 239, 135, 143, 138), treatment = c(0, 0, 0, 0, 0, 0), treatment_condition = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), control_condition = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), m_check1 = c(1,
1, 1, 1, 1, 1), relationship = c(NA, NA, NA, 7, 6, 5), payment = c(NA,
NA, NA, 10, 3, 3), educ_level = c(14, 14, 12, 16, 16, 18), golf = c(3,
5, 3, 3, 2, 3), male = c(1, 0, 1, 0, 0, 1), Asian = c(0, 1, 0,
0, 0, 0), Black = c(0, 0, 0, 0, 0, 0), Latino = c(1, 0, 0, 0,
0, 0), White = c(0, 0, 1, 1, 1, 1), age = c(27, 53, 49, 25, 28,
24), English = c(1, 1, 1, 1, 1, 1), education = c(16, 16, 14,
14, 14, 16), enjoy = c(4, 1, 3.5, 4.25, 3.25, 3.5), RELATIONSHIP = c(4.33333349227905,
1, 4.33333349227905, 3.66666674613953, 3.5, 3.66666674613953),
anxiety = c(3, 3.40000009536743, 2.20000004768372, 1.25,
2, 1.25), BEH_SIM = c(3, 1, 3.75, 2.75, 2.5, 1.75), sptconf = c(3.33333325386047,
1.5, 4, 4.83333349227905, 4, 3.66666674613953), NEG_EFFICACY = c(4,
1.16666662693024, 3.66666674613953, 4.83333349227905, 4.16666650772095,
4.5), spteffort = c(3.16666674613953, 3.5, 4.16666650772095,
3.16666674613953, 3.16666674613953, 3.5), SPTEFFORT_OTHER = c(3.16666674613953,
3.5, 3.5, 3.16666674613953, 3, 3.33333325386047), SIM_VALUES = c(3.75,
1, 3.75, 3.75, 1.5, 2.25), COOP_MOTIV = c(2.33333325386047,
3, 2.66666674613953, 5, 2.5, 2.66666674613953), COMP_MOTIV = c(5,
5, 3.20000004768372, 4.40000009536743, 2.40000009536743,
4.40000009536743), presence = c(NA, NA, NA, 2.79999995231628,
1.79999995231628, 2.59999990463257), environ = c(NA, NA,
NA, 3, 4, 3), openresponse = c(NA, NA, NA, 94.25, 86, 60),
TotalOwnerCommission = c(300, 266.666656494141, 258.333343505859,
266.666656494141, 383.333343505859, 325), TotalRangerComm = c(258.333343505859,
233.33332824707, 291.666656494141, 258.333343505859, 175,
166.66667175293), TotalComm = c(279.166687011719, 250, 275,
262.5, 279.166687011719, 245.833343505859), merge = c(1,
1, 1, 0, 0, 0), Control = c(1, 1, 1, NA, NA, NA), treatment_Shoes = c(0,
0, 0, NA, NA, NA), treatment_Instructions_Only = c(0, 0,
0, NA, NA, NA), treatment_Info_Only = c(0, 0, 0, NA, NA,
NA), treatment_Info_Instructions = c(0, 0, 0, NA, NA, NA),
group = c("OwnerOnly", "OwnerOnly", "OwnerOnly", "", "",
""), race = c(4, 2, 5, NA, NA, NA), race_a = c("", "", "",
"", "", ""), RELATIONSHIP_2 = c(9.02055358886719, 1, 9.02055358886719,
7.02113246917725, 6.54790019989014, 7.02113246917725), TotalOwnerCommission_2 = c(5196.15234375,
4354.64794921875, 4152.12744140625, 4354.64794921875, 7505.24560546875,
5859.02099609375)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
Briefly, I do want to thank the SO community for all their help with R. I don't know how I would have gotten this far without all of your help.
Try the apaTables Package! Format your data as per the example, and use the apa.aov.table() function to transform your table to APA style.

r for loop with names mutate

The goal is to replace NAs with 0 values in a set of variables using a loop. Obviously, this is a super simple loop, but I have no idea why it is not doing what it should.
Two additional preferences: suggestions that use the variable names (as opposed to column numbers) and that use dplyr are preferred.
library
library(plyr)
library(dplyr)
sample data
y <- structure(list(pid = c(1002L, 1002L, 1002L, 1002L, 1002L, 1002L,1002L, 1002L, 1002L, 1002L), year = 1968:1977, weeks_hd_e = c(3,0, 50, 49, 50, 50, 50, 50, 50, 49), weeks_wf_e = c(4, 6, 0, 0,0, 0, 0, 0, 0, 0), weeks_hd_u = c(NA, NA, 0, 0, 0, 0, 0, 0, 0,0), weeks_hd = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), weeks_wf_u = c(NA,NA, NA, NA, NA, NA, NA, NA, 0, NA), weeks_wf = c(NA_real_, NA_real_,NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,NA_real_)), .Names = c("pid", "year", "weeks_hd_e", "weeks_wf_e","weeks_hd_u", "weeks_hd", "weeks_wf_u", "weeks_wf"), row.names = c(NA,10L), class = "data.frame")
this command works
y <- mutate(y, i = ifelse(!is.na(i), i, 0))
this loop does not
vars <- c("weeks_hd_e", "weeks_hd_u", "weeks_wf_e", "weeks_wf_u", "weeks_hd", "weeks_wf")
for (i in names(vars)) {
y <- mutate(y, i = ifelse(!is.na(i), i, 0))
}
View(y)
I have been given two excellent answers from friends:
# Replace NA with 0 in every column named in `vars`, addressing columns by
# name. Iterating over `vars` itself (rather than 1:length(vars)) means an
# empty `vars` runs zero iterations instead of looping over c(1, 0).
for (v in vars) {
  y[[v]][is.na(y[[v]])] <- 0
}
or
# Replace NA with 0 column by column. lapply() works on each column as its
# own vector, and y[vars] stays a data frame even when `vars` names a single
# column. apply() on y[, vars] would coerce the columns to one matrix and
# fails when only one column is selected (y[, vars] drops to a plain vector).
y[vars] <- lapply(y[vars], function(x) replace(x, is.na(x), 0))
The replace_na command from the tidyr package does exactly what you want.
Use it like this:
install.packages("tidyr")
library(tidyr)
# your data
y <- structure(list(pid = c(1002L, 1002L, 1002L, 1002L, 1002L, 1002L,1002L, 1002L, 1002L, 1002L), year = 1968:1977, weeks_hd_e = c(3,0, 50, 49, 50, 50, 50, 50, 50, 49), weeks_wf_e = c(4, 6, 0, 0,0, 0, 0, 0, 0, 0), weeks_hd_u = c(NA, NA, 0, 0, 0, 0, 0, 0, 0,0), weeks_hd = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), weeks_wf_u = c(NA,NA, NA, NA, NA, NA, NA, NA, 0, NA), weeks_wf = c(NA_real_, NA_real_,NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,NA_real_)), .Names = c("pid", "year", "weeks_hd_e", "weeks_wf_e","weeks_hd_u", "weeks_hd", "weeks_wf_u", "weeks_wf"), row.names = c(NA,10L), class = "data.frame")
# replacing NAs in your dataframe
# specify the variables you want to replace NAs in and the replacement in the `replace` = list argument
y <- replace_na(y, replace = list(weeks_hd_e = 0, weeks_hd_u = 0, weeks_wf_e = 0, weeks_wf_u = 0, weeks_hd = 0, weeks_wf = 0))
Note that this meets your preference to specify the variables by name and is more flexible in terms of replacement, i.e. you can replace NAs in numeric and character variables in the same command.

Resources