Boxplot with two levels and multiple data.frames - r

I have 4 data.frames with two factor levels in each data.frame. df1 is reproduced below. Please duplicate df1 to produce df2...df4.
How can I produce boxplots with ggplot2 such that my final figure looks very similar to the figure below? The seasons in the figure represent the data.frame names, while present and future represent the factor level names, and the legend represents heavy, heavier, heaviest in the data reproduced here.
Ignore the dotted horizontal red line.
df1= structure(list(id = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("NN", "SS"), class = "factor"),
heavy = c(0.136230125, 0.136281211, 0.136038018, 0.135392862,
0.137088902, 0.136028293, 0.13640057, 0.135317058, 0.13688615,
0.136448994, 0.137089424, 0.136810847, 0.135865471, 0.136130096,
0.136361327, 0.137796714, 0.136052839, 0.135892646, 0.13544437,
0.136452363, 0.135367421, 0.135617509, 0.138202559, 0.135396942,
0.135930092, 0.135661805, 0.135666, 0.135860128, 0.137648687,
0.136057353, 0.136057731, 0.135162399, 0.136080113, 0.135285036,
0.136204839, 0.138058091, 0.137215664, 0.135696637, 0.135863902,
0.135733243, 0.138274445, 0.136632122, 0.137787919, 0.135033093,
0.136926798, 0.136766413, 0.13690947, 0.135203152, 0.138370968,
0.136862356, 0.136083112, 0.138212845, 0.135964773, 0.13583601,
0.134923731, 0.135828965, 0.136272539, 0.138127602, 0.137028323,
0.136526836, 0.136407397, 0.137025373, 0.138358757, 0.137858521,
0.135464076, 0.136302506, 0.135528362, 0.137540677, 0.136455865,
0.138470144, 0.137227895, 0.136296955, 0.136792631, 0.135875782,
0.13815733, 0.136383864, 0.136696618, 0.13857652, 0.136700903,
0.136743873, 0.136033619, 0.135970522, 0.135816385, 0.136003984,
0.136583925, 0.136768202, 0.136292002, 0.136316737, 0.136540075,
0.136051218, 0.135924119, 0.136736303, 0.136946894, 0.136266073,
0.136263692, 0.136399301, 0.13611577, 0.135857095, 0.136769488,
0.136072466, 0.135564224, 0.136496131, 0.137659507, 0.136704681,
0.136542173, 0.136777403, 0.135771538, 0.13665463, 0.136984748,
0.137717859, 0.138195237, 0.136232227, 0.135956814), heavier = c(0.227332679,
0.227200132, 0.227299118, 0.227289816, 0.22724478, 0.227082442,
0.227861315, 0.227055561, 0.227112284, 0.228651438, 0.228158412,
0.228789678, 0.227188949, 0.228850198, 0.227246991, 0.227359368,
0.227359531, 0.227310607, 0.229490445, 0.227295226, 0.227958185,
0.228104958, 0.227254823, 0.22715392, 0.228062515, 0.227509559,
0.227143662, 0.230048719, 0.227860836, 0.228467792, 0.227263728,
0.227222794, 0.227165592, 0.227140611, 0.228424335, 0.227356425,
0.227243374, 0.228936267, 0.227320467, 0.22738371, 0.227694891,
0.227270428, 0.227751798, 0.228803279, 0.227330453, 0.229679261,
0.228999206, 0.227227604, 0.227247085, 0.227198567, 0.229234921,
0.227211613, 0.23007234, 0.226793036, 0.226474338, 0.226654333,
0.229964991, 0.22880328, 0.22700099, 0.226640822, 0.227522393,
0.227463578, 0.227832692, 0.227293936, 0.230154101, 0.229813709,
0.22761097, 0.227445308, 0.228669159, 0.22660539, 0.229017398,
0.230421347, 0.227041103, 0.227583471, 0.229547568, 0.22676335,
0.226737661, 0.229922588, 0.226907188, 0.227102239, 0.226469073,
0.230680908, 0.227763879, 0.226882448, 0.226741993, 0.226693024,
0.22671415, 0.226773662, 0.227795194, 0.226983096, 0.226647946,
0.226799552, 0.226759218, 0.22692942, 0.226601519, 0.227098192,
0.226886889, 0.226959012, 0.226552119, 0.226809761, 0.226786285,
0.226709252, 0.226834015, 0.228033943, 0.226693494, 0.22748613,
0.227608804, 0.22685023, 0.226586619, 0.227718907, 0.228890098,
0.226701909, 0.230919944), heaviest = c(0.316870607, 0.316772978,
0.316851707, 0.317017543, 0.316673994, 0.317224709, 0.319234458,
0.31861305, 0.319804304, 0.318605816, 0.316930034, 0.31688398,
0.316789552, 0.320783976, 0.317094325, 0.31809319, 0.317134565,
0.318173976, 0.317213167, 0.317084404, 0.321712205, 0.317128056,
0.316866913, 0.3170489, 0.31712423, 0.31684494, 0.319497635,
0.316932301, 0.316864646, 0.317279005, 0.316887692, 0.317134437,
0.316792589, 0.320894499, 0.319883014, 0.316924639, 0.316575642,
0.31686389, 0.316985994, 0.321566256, 0.316683995, 0.320299883,
0.317308965, 0.318151948, 0.316479828, 0.319857732, 0.317171909,
0.322137849, 0.316526917, 0.316870364, 0.322205784, 0.317055758,
0.320329144, 0.318015397, 0.318719989, 0.317910658, 0.317292016,
0.321348723, 0.319915048, 0.317160762, 0.318773245, 0.319627925,
0.31869767, 0.322422407, 0.32082693, 0.318034899, 0.318760783,
0.318325502, 0.320739086, 0.317216142, 0.32284544, 0.319466593,
0.318740499, 0.317489944, 0.319064923, 0.322014928, 0.317353897,
0.318904583, 0.317931141, 0.323295254, 0.318924712, 0.318965677,
0.317700019, 0.31793468, 0.317699508, 0.317168657, 0.318903983,
0.317493401, 0.317511406, 0.317483897, 0.31748495, 0.317776804,
0.318893431, 0.317663608, 0.316978585, 0.317473467, 0.317500429,
0.317144259, 0.317330826, 0.317610353, 0.317881476, 0.31707787,
0.317728374, 0.317452137, 0.31938939, 0.317199373, 0.31898747,
0.318878952, 0.317987024, 0.318951952, 0.318419561, 0.319568088,
0.321165413)), .Names = c("id", "heavy", "heavier", "heaviest"
), class = "data.frame", row.names = c(NA, -113L))

## Packages used below: reshape2 for melt(), ggplot2 for the plot itself
library(ggplot2)
library(reshape2)

## create some data.frames: this results in a list of four dfs
## (createDF is kept as a quoted expression so that every eval() draws a
## fresh random sample)
createDF <- quote(data.frame(
  id = sample(c("NN", "SS"), 100, replace = TRUE),
  heavy = runif(100),
  heavier = runif(100),
  heaviest = runif(100)
))
dfs <- lapply(1:4, function(i) eval(createDF))

## join and shape them
dat <- do.call(rbind, dfs)
## label every row with the data.frame it came from ("df 1" ... "df 4")
dat$dfid <- paste(
  "df",
  rep(seq_along(dfs), times = vapply(dfs, nrow, integer(1)))
)
## long format: one row per (id, dfid, variable) observation
dat <- melt(dat, id.vars = c("id", "dfid"))

## one facet per data.frame, boxes grouped by id and filled by variable
ggplot(dat, aes(id, value, group = interaction(variable, id), fill = variable)) +
  geom_boxplot() +
  facet_grid(~dfid)

Something like this?
library(reshape2)
library(ggplot2)

# Tag every data frame with the season it stands for.
df1$season <- "winter"
df2$season <- "spring"
df3$season <- "summer"
df4$season <- "fall"

# Reshape one season's data frame to long format.
# The former measurement columns (heavy / heavier / heaviest) go into
# `weightCat` and their values into `weight`; the two names must differ,
# and `weightCat` is the column the plot below uses for `fill`.
melt_season <- function(df) {
  melt(
    df,
    id.vars = c("id", "season"),
    variable.name = "weightCat",
    value.name = "weight"
  )
}

df1.m <- melt_season(df1)
df2.m <- melt_season(df2)
df3.m <- melt_season(df3)
df4.m <- melt_season(df4)
df.all <- rbind(df1.m, df2.m, df3.m, df4.m)

# One panel per season, boxes per id level, filled by weight category.
ggplot(df.all, aes(x = id, y = weight, fill = weightCat)) +
  geom_boxplot() +
  facet_grid(. ~ season)

Related

Form groups using block random assignment on two covariates

I often have groups of people who differ in their nationality and their status. They have to work in groups, and I would like to use block random assignment to create groups of a maximum of 5 individuals. Each group should have at least one person who is "foreign" and one who is "female". I have found the library randomizr which is supposedly able to do block random assignments, but my code does not work as intended.
An example dataset would be:
structure(list(Student = c("Susan", "Ciara", "Carl",
"Paula", "Emil", "Tammy", "Logan", "Anna", "Victor",
"Felix", "Federica", "Jesus", "Jens", "Samira", "Berit", "Yi",
"Lea", "Gordon", "Boris", "Silvester", "Celine", "Thomas", "Eduardo",
"RoY", "Marlene", "Amelie", "Claudius", "Herbert", "Cynthia", "Melanie",
"Leander", "Leona", "Tobias", "Leander", "Peter",
"Lilly", "Roxy", "Joachim"), Nationality = structure(c(2L, 2L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L,
1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L,
1L, 1L, 2L, 2L), levels = c("Non-foreign", "Foreign"), class = "factor"),
Gender = structure(c(1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L,
1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L), levels = c("female",
"male"), class = "factor")), class = "data.frame", row.names = c(NA,
-38L))
UPDATE: I have carefully read the vignette for the randomizr package again. I found that it is possible to create blocks with more than one covariate. I am now looking to see if I can assign these blocks to the students to get block random groups. I need to test if the code below works as intended.
# One block label per student, pasted together from both covariates
# (for example "Foreign_female").
blocks <- paste(data$Nationality, data$Gender, sep = "_")
# Assign students to one of 6 arms, using the labels above as blocks.
Z <- block_ra(blocks = blocks, num_arms = 6)
# Cross-tabulate each student against the arm they were assigned to.
table(data$Student, Z)

tidy eval ggplot2 NSE not rendering correctly

I'm trying to write a function to pass quoted items for constructing multiple ggplots. The following code works great and does what I want.
# Reference plot: mean BALF_Protein per Surgery level drawn as bars with
# standard-error bars, dodged and filled by Exposure, one facet column per
# Duration.
ggplot(fig2.data, aes(x = Surgery, y = BALF_Protein, fill = Exposure)) +
  stat_summary(geom = "errorbar", fun.data = mean_se, position = "dodge") +
  stat_summary(geom = "bar", fun = mean, position = "dodge") +
  theme_classic() +
  scale_fill_manual(values = c("lightgrey", "darkgrey")) +
  facet_grid(cols = vars(Duration))
Using this guide I constructed the following function and called the function.
# Draw dodged mean bars with standard-error bars from `fig2.data`.
#
# Arguments (each one is captured unevaluated and injected into the plot):
#   x     - column mapped to the x axis
#   y     - column summarised on the y axis
#   fill  - column mapped to the fill colour
#   facet - column that defines the facet columns
# Pass bare column names, e.g. plotf(x = Surgery, ...). The quoted-string
# call shown in this thread renders without error but not as the same plot.
# Returns the ggplot object.
plotf <- function(x, y, fill, facet) {
  # `{{ arg }}` captures the caller's expression and unquotes it in place.
  mapping <- aes(x = {{ x }}, y = {{ y }}, fill = {{ fill }})
  facet_cols <- vars({{ facet }})

  ggplot(fig2.data, mapping) +
    stat_summary(geom = "errorbar", fun.data = mean_se, position = "dodge") +
    stat_summary(geom = "bar", fun = mean, position = "dodge") +
    theme_classic() +
    scale_fill_manual(values = c("lightgrey", "darkgrey")) +
    facet_grid(cols = facet_cols)
}
plotf(x = "Surgery", y = "BALF_Protein", fill = "Exposure", facet = "Duration")
My graph rendered without errors, but it is not rendered the same way.
What am I doing wrong?
Thank you @Stefan
I don't understand why, but calling it as you suggested worked. How is that going to work when I want to loop over a vector of variable names to call the function, given that those names are going to be passed as quoted strings? Should I use syms()?
plotf(x = Surgery, y = BALF_Protein, fill = Exposure, facet = Duration)
ReproData here with some rnorm() so your plot might be slightly different heights.
fig2.data <- structure(list(Surgery = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("SHAM", "HEP VAG"
), class = "factor"), Exposure = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Air",
"Ozone"), class = "factor"), Duration = structure(c(2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1d",
"2d"), class = "factor"), BALF_Protein = c(64.2302655135303,
75.8662498743628, 66.944160651771, 64.3494818599307, 93.5733806883362,
93.9843061725941, 94.9296956493259, 85.5985055395191, 80.4974511604734,
70.6316004306272, 85.3439438112908, 79.4666853120619, 84.7319693413318,
224.606438793638, 78.4487502522719, 78.2128699744882, 92.0151032176434,
79.2127901600167, 83.0909690767245, 92.0325415462662, 60.6200784843927,
97.7183404856683, 68.7510921525122, 41.9625493809036, 311.769822036931,
450.597937801349, 283.639976251784, 190.840750069959, 187.810222461528,
203.735530975931, 547.003463243173, 517.871472878502, 164.167773487012,
202.777306107217, 666.896662547508, 361.46103562071, 270.119121964956,
234.635143377769, 94.4541075117046, 91.1060986818939, 142.774777316869,
300.021992736686, 279.775933301683, 246.554185364089, 298.964364163939,
193.737945537319, 232.918974192744, 150.384203703162)), row.names = c(NA,
-48L), class = "data.frame")

Stacked barplot using ggplot2 - data visualisation

I have very little experience with R and am trying to make a stacked barplot using ggplot2.
I have 2 groups - control and experimental, and 2 choices - red and green. I'm not sure how to organise my data.
There were 80 animals in my trial (control n=40, experimental n=40) and they were given the choice of red and green substrate, I noted which substrate they chose, and that's the data I'm trying to plot.
I would essentially want 'Experimental' and 'Control' on the x-axis, and the number of choices on the y-axis (e.g. Control, Red n=20, Control, Green = 12 etc).
Any help would be appreciated!
Edited to add:
This is the graph it's outputting
This is the code I'm using (including suggested adjustments):
# Reproducer from the question: simulated data with 40 rows per group and a
# substrate drawn at random (with replacement) for each of the 80 rows.
df <- data.frame(group = rep(c("control", "experimental"), each = 40),
substrate = sample (c("red","green"), 80, TRUE))
# NOTE(review): `substrate` is mapped to y and drawn with stat = "identity",
# so the bars stack the substrate values themselves rather than row counts.
ggplot(df, aes(x = group, y = substrate, fill = substrate)) +
geom_bar(stat = "identity") +
# NOTE(review): the colours are unnamed; presumably they are matched to the
# fill levels in order (green, red), which would swap them -- confirm.
scale_fill_manual(values = c("red", "green"))
This is the output:
structure(list(group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("control", "experimental"
), class = "factor"), substrate = structure(c(1L, 2L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L,
2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L,
1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L), .Label = c("green",
"red"), class = "factor")), class = "data.frame", row.names = c(NA,
-80L))
Output from dput(behaviour) - original dataframe
structure(list(group = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Control", "Experimental"
), class = "factor"), substrate = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Green",
"Red"), class = "factor")), class = "data.frame", row.names = c(NA,
-80L))
Your data:
behaviour=structure(list(group = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Control", "Experimental"
), class = "factor"), substrate = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Green",
"Red"), class = "factor")), class = "data.frame", row.names = c(NA,
-80L))
We can tabulate your data:
table(behaviour$group,behaviour$substrate)
Green Red
Control 10 30
Experimental 27 13
So you can only specify fill or y with geom_bar. In your case, you specify the fill, the geom_bar() function will do the counting for you:
# Stacked bars: geom_bar() counts the rows in each group/substrate cell,
# so no y aesthetic is supplied.
ggplot(data = behaviour, mapping = aes(x = group, fill = substrate)) +
  geom_bar(stat = "count") +
  scale_fill_manual(values = c("#29c7ac", "#c02739"))
You could have your data like this, with one row for each observation (i.e. each animal), with the group and the substrate recorded for each:
# One row per animal: 40 control rows followed by 40 experimental rows.
# Substrate choices are laid out as 10 green + 30 red for control and
# 27 green + 13 red for experimental.
df <- data.frame(
  group = rep(c("control", "experimental"), times = c(40, 40)),
  substrate = c(
    rep("green", 10), rep("red", 30),
    rep("green", 27), rep("red", 13)
  )
)
Now define your plot using ggplot, specifying group as your x axis, and ..count.. as your y axis. Use geom_bar to get the stacked bars you are looking for, and finally use scale_fill_manual to set the colours:
library(ggplot2)
# Stacked bar chart: x is the group, y is the per-cell row count computed by
# geom_bar(), bars are outlined in black and filled by substrate.
# NOTE(review): `..count..` is the older spelling of after_stat(count);
# newer ggplot2 releases may warn about it -- confirm against your version.
ggplot(data = df, mapping = aes(group, ..count.., fill = substrate)) +
  geom_bar(colour = "black") +
  scale_fill_manual(values = c("green", "red"))

Error when running poisson regression with a binary outcome

I am trying to run a poisson regression to predict a common binary outcome.
This is my first attempt at using dput - if I have used it inappropriately, please let me know so I can correct it.
Example data:
df <- structure(list(id = 1:30, sex = structure(c(1L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 1L), .Label = c("Female", "Male"
), class = "factor"), migStat = structure(c(1L, 2L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L,
1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L), .Label = c("Australian-born",
"Migrant"), class = "factor"), mhAreaBi = structure(c(1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L,
1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L), .Label = c("Metropolitan",
"Regional"), class = "factor"), empStatBi = structure(c(2L, 2L,
1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L,
2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Student / employed",
"Unemployed"), class = "factor"), pensBenBi = structure(c(1L,
2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L,
1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L), .Label = c("No benefit",
"In receipt of pension benefit"), class = "factor"), maritStatBi = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L), .Label = c("Married (including de facto)",
"Not married"), class = "factor"), cto = structure(c(1L, 2L,
2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L,
2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L), .Label = c("No",
"Yes"), class = "factor")), .Names = c("id", "sex", "migStat",
"mhAreaBi", "empStatBi", "pensBenBi", "maritStatBi", "cto"), row.names = c(NA,
-30L), class = "data.frame")
When running the regression using glm in R, I receive an error:
fit <- glm(cto ~ sex + migStat + mhAreaBi + empStatBi + pensBenBi + maritStatBi, df, family = poisson)
Error in if (any(y < 0)) stop("negative values not allowed for the 'Poisson' family") :
missing value where TRUE/FALSE needed
In addition: Warning message:
In Ops.factor(y, 0) : ‘<’ not meaningful for factors
The same error has been explained briefly in this thread:
Because the "<" operator is not defined for factors the result that is
passed to if is of length 0. Setting the factor variable on the RHS
and using the integer values on the LHS succeeds.
The error does not appear when I convert the outcome to an integer; however, this:
seems to defeat the purpose of predicting a binary outcome (unless a numeric variable with range 0-1 is treated the same as a factor variable with two levels); and
does not seem necessary (at least according to this post, which uses geeglm from geepack to predict a binary outcome [unfortunately, I receive the same error when I adapt the code to my own dataset])
Questions:
Could I receive further explanation of the error?
If I convert my outcome to an integer with range 0-1, will glm treat it the same as a binary variable? If not, is there an approach better suited to running a regression for a common binary outcome?
I think the best option here is:
# Encode the outcome explicitly: 1 where cto is "Yes", 0 otherwise, so the
# coding does not depend on the order of the factor levels.
df[["cto_binary"]] <- as.numeric(df[["cto"]] == "Yes")

# Poisson regression of the 0/1 outcome on the six binary predictors.
fit <- glm(
  formula = cto_binary ~ sex + migStat + mhAreaBi + empStatBi + pensBenBi +
    maritStatBi,
  data = df,
  family = poisson
)
As this way you explicitly show in your code what will be a 1/success in your binary outcome and don't get tripped up by things like the ordering of factor levels. Note that in R as.numeric(c(FALSE, TRUE)) gives c(0, 1), so you always know what you're going to get from a logical comparison.

Standard evaluation with mutate_ to calculate percentages by group

I am trying to use standard evaluation with dplyr to calculate percents as a function of two grouping variables. The problem is in my mutate_ statement.
Here is a dataset:
structure(list(
var1 = structure(c(2L, 1L, 1L, 2L, 1L, 2L, 1L,
2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L,
2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L,
2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L,
1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L,
2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L
),
.Label = c("No", "Yes"), class = "factor"),
var2 = structure(c(2L, 2L, 1L, 2L,
2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L,
1L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L,
2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L,
2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L
),
.Label = c("Female", "Male"), class = "factor")),
.Names = c("var1", "var2"), row.names = c(NA, -100L), class = "data.frame")
Here is the code I am working with:
# Question version (reproduces the error quoted after this block).
# Groups `data` by the columns passed in `var1` and `var2`, counts the rows
# per group into `n_in_group`, and then tries to add a percentage column.
for_plots = function(data, var1, var2){
grouped_data = data %>% group_by_(var1, var2) %>%
summarise_(n_in_group = ~n()) %>%
# NOTE(review): setNames() receives only the list and no names argument,
# which matches the 'argument "nm" is missing' error reported below.
mutate_(.dots = setNames(list(
interp(quote(n_in_group / sum(n_in_group, na.rm = TRUE) * 100),
# NOTE(review): `n_in_group` is passed to as.name() unquoted and no object of
# that name is created in this function -- presumably a string was intended.
n_in_group = as.name(n_in_group)))
))
return(grouped_data)
}
When I run the code, I receive an error:
Error in setNames(list(interp(quote(n_in_group/sum(n_in_group, na.rm = TRUE) * :
argument "nm" is missing, with no default
Any thoughts?
Here is some code based on @Frank's response:
# Count the rows for every var1/var2 combination and express each count as a
# percentage.
#
# Arguments:
#   data - data frame that contains the grouping columns
#   var1 - name of the first grouping column, given as a string
#   var2 - name of the second grouping column, given as a string
# Returns a data frame with one row per var1/var2 combination and the
# columns `n_in_group` and `percent`.
for_plots <- function(data, var1, var2) {
  data %>%
    # syms() + !!! turn the column-name strings into grouping variables;
    # this replaces the deprecated underscore verbs group_by_()/summarise_().
    group_by(!!!rlang::syms(c(var1, var2))) %>%
    summarise(n_in_group = n()) %>%
    # summarise() peels off the last grouping level (var2), so the sum below
    # is taken over the rows that share the same var1 value.
    mutate(percent = (n_in_group / sum(n_in_group, na.rm = TRUE)) * 100)
}

Resources