Related
Why can't I run a Breusch-Pagan test, bptest(), on a linear mixed-effects model fitted with lmer() in order to test for heteroscedasticity? The bptest() function works fine on models built with lm and glmer, but not lmer. Is there a different function I should be using?
error message
Error: $ operator not defined for this S4 class
data <- structure(list(Mn_new = c(3.90508190744665, 3.41518826685297,
3.98107659173858, 4.06706444435455, 2.40431879320057, 3.8090250549363,
3.72177711209025, 2.93248691964847, 4.10035133820019, 4.20508065155943,
3.64103189844949, 4.24257964492719, 4.20182664641102, 3.41263061412322,
4.04144915900294, 4.28185091235415, 3.09415352803393, 3.67021392570071,
3.56418529613595, 3.21715355220772, 3.21429992539095, 3.54553486317315,
4.03025205893711, 2.97382166830262, 3.80757707518732, 3.78523559035143,
3.41487105608904, 2.75799799020337, 3.06834870580776, 3.30533869585591,
2.8380338262522, 2.65147541433061, 3.53356800468757, 2.51733199167976,
3.16115687664055, 3.64858366279116, 3.48272937241829, 2.91621249433787,
3.26028181088023, 3.49589461456199, 2.82832109354896, 3.40328200399306,
3.28568362736306, 2.87324453863543, 3.10651957200347, 2.81769064140214,
2.57165695575711, 2.97592292304521, 3.18174081921005, 3.54312301316704,
2.70447719350618, 3.48454089015539, 3.39666701335652, 3.03088932872189,
3.1057376517166, 2.91083893666025, 3.18752169045788, 3.04054322208808,
3.04284811683015, 3.53376439846743, 3.57155887085371, 2.67921235204479,
3.24539585432457, 3.32270430796322, 3.75933211625452, 3.30303225771367,
2.94140225772847, 3.22916966186489, 3.45512223500913, 2.89996056576201,
3.19536565883228, 2.49108662931588, 2.55337036896523, 2.98316003461686,
3.58241577241437, 3.40385600372579, 3.66136967423154, 3.71807222845311,
3.73004186004765, 4.10988004656572, 3.90759927253415, 2.86608298949975,
3.61450793458081, 3.85162032119424, 4.44992983828838, 3.19109366840847,
3.09329595776341, 3.69955310870145, 4.47202033690943, 3.61326633240611,
3.64532602062922, 3.33230174866167, 2.74653680127074, 3.61473897523957
), SEX = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("F", "M"), class = "factor"), S_M = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("AFTER",
"BEFORE"), class = "factor"), ID = structure(c(43L, 40L, 25L,
17L, 1L, 20L, 4L, 13L, 45L, 32L, 28L, 5L, 14L, 21L, 44L, 9L,
16L, 42L, 18L, 35L, 22L, 10L, 8L, 36L, 37L, 15L, 19L, 43L, 40L,
25L, 17L, 1L, 20L, 4L, 13L, 45L, 32L, 28L, 5L, 14L, 21L, 44L,
9L, 16L, 42L, 18L, 35L, 22L, 10L, 8L, 36L, 37L, 15L, 19L, 47L,
46L, 34L, 38L, 29L, 41L, 33L, 26L, 23L, 27L, 24L, 11L, 7L, 3L,
6L, 12L, 30L, 39L, 2L, 31L, 47L, 46L, 34L, 38L, 29L, 41L, 33L,
26L, 23L, 27L, 24L, 11L, 7L, 3L, 6L, 12L, 30L, 39L, 2L, 31L), .Label = c("BLA1",
"BLA10", "BLA14", "BLA16", "BLA17", "BLA2", "BLA20", "BLA202",
"BLA203", "BLA205", "BLA21", "BLA211", "BLA213", "BLA214", "BLA215",
"BLA216", "BLA217", "BLA219", "BLA221", "BLA224", "BLA228", "BLA23",
"BLA238", "BLA24", "BLA248", "BLA25", "BLA27", "BLA270", "BLA283",
"BLA294", "BLA296", "BLA300", "BLA307", "BLA31", "BLA33", "BLA36",
"BLA38", "BLA42", "BLA47", "BLA48", "BLA5", "BLA53", "BLA60",
"BLA61", "BLA74", "BLA79", "BLA80"), class = "factor")), class = "data.frame", row.names = c(NA,
-94L))
code for lmer
# Reproducible example: fit a random-intercept mixed model and then attempt a
# Breusch-Pagan test on it. The response column in the supplied `data` is
# `Mn_new` (there is no `Mg_new` column), so the model is fitted to that.
library(lme4)    # provides lmer()
library(lmtest)  # provides bptest()

# Fixed effects: SEX, S_M and their interaction; random intercept per ID.
Mn_model <- lmer(Mn_new ~ SEX * S_M + (1 | ID), data = data)
summary(Mn_model)

# This is the call that raises the "$ operator not defined for this S4 class"
# error reported below.
bptest(Mn_model)
error
Error: $ operator not defined for this S4 class
The Breusch-Pagan test "fits a linear regression model to the residuals of a linear regression model ... By default the same explanatory variables are taken as in the main regression model".
The version in the lmtest package (bptest()) "works" for lm and glm models, but I wouldn't trust it for glm models — as far as I know the test doesn't apply; it's just that the generic functions it uses also work for glm objects. (Contrary to your question, it throws an error for glmer fits — maybe you meant to say glm?)
I don't know offhand if the B-P test has been extended to cover the LMM case. If you had continuous predictors it would be tricky, but as you only have factors you can use a Levene's test as in this answer:
library(lme4)
library(broom.mixed)
library(ggplot2)

# Random-intercept model: SEX-by-S_M fixed effects, one intercept per ID.
Mn_model <- lmer(Mn_new ~ SEX * S_M + (1 | ID), data = data)

# Augmented data frame; its .resid column is used by the plot and the test.
aa <- augment(Mn_model, .data = data)

# Visual check: spread of the residuals within each S_M x SEX cell.
ggplot(data = aa, mapping = aes(x = interaction(S_M, SEX), y = .resid)) +
  geom_boxplot()

# Levene's test for equal residual variance across the S_M x SEX cells.
car::leveneTest(.resid ~ S_M * SEX, data = aa)
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value  Pr(>F)
## group  3   2.271 0.08566 .
##       90
I was following this post, but I don't understand how to apply it to my data.
My plot looks like:
And I would like the "strings" (the flows) to have the same color as the corresponding box in the 2nd column, i.e. for ESR1 I would like an orange string, and for PIK3CA a green one.
Any idea how I can do this with scale_fill_manual or any other argument?
Thanks!
My code:
# One fill colour per stratum box, supplied positionally (17 entries): only
# the third and fourth entries are coloured, every other box stays white.
colorfill <- c("white", "white", "darkgreen", "orange", rep("white", 13))

ggplot(
  data = Allu,
  mapping = aes(axis1 = Gene_mut, axis2 = Metastasis_Location, y = Freq)
) +
  # Flows are filled by gene.
  geom_alluvium(mapping = aes(fill = Gene_mut), curve_type = "quintic") +
  # Stratum boxes take their fill from the fixed vector, outside aes().
  geom_stratum(width = 1 / 4, fill = colorfill) +
  # Label each box with its stratum name.
  geom_text(
    mapping = aes(label = after_stat(stratum)),
    stat = "stratum",
    size = 3
  ) +
  scale_x_discrete(
    limits = c("Metastasis_Location", "Gene_mut"),
    expand = c(0.05, .05)
  ) +
  theme_void()
My data:
structure(list(Metastasis_Location = structure(c(1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 9L, 9L, 9L, 10L,
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L, 11L,
11L, 11L), .Label = c("adrenal", "bone", "breast", "liver", "lung",
"muscle", "node", "pancreatic", "peritoneum", "pleural", "skin"
), class = "factor"), T0_T2_THERAPY_COD = structure(c(2L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("A",
"F"), class = "factor"), T0_T2_PD_event = structure(c(2L, 2L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L,
2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("No Progression",
"Progression"), class = "factor"), Gene_mut = structure(c(4L,
5L, 1L, 3L, 4L, 1L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 5L, 5L, 5L, 6L, 3L, 6L, 6L, 6L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L,
5L, 6L, 2L, 3L, 4L, 4L, 3L, 3L, 3L, 4L, 5L, 6L, 3L, 6L, 3L, 3L,
3L, 3L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 3L, 4L, 4L, 5L, 6L,
1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L,
5L, 5L, 5L, 3L, 4L, 3L, 4L, 5L, 6L, 3L, 3L, 4L, 5L, 6L, 6L, 6L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 3L, 4L, 3L, 4L, 5L,
6L, 3L, 4L, 5L, 6L, 3L, 4L, 5L, 6L, 1L, 6L, 3L, 3L, 4L, 4L, 5L
), .Label = c("AKT1", "ERBB2", "ESR1", "PIK3CA", "TP53", "WT"
), class = "factor"), LABO_ID = structure(c(45L, 8L, 13L, 11L,
11L, 26L, 7L, 15L, 23L, 26L, 35L, 39L, 7L, 19L, 26L, 32L, 33L,
35L, 39L, 15L, 19L, 35L, 1L, 37L, 34L, 43L, 47L, 3L, 10L, 18L,
20L, 28L, 31L, 36L, 42L, 9L, 10L, 14L, 18L, 20L, 28L, 31L, 36L,
44L, 45L, 8L, 10L, 18L, 28L, 42L, 2L, 7L, 39L, 7L, 39L, 3L, 4L,
42L, 5L, 42L, 6L, 21L, 1L, 10L, 22L, 28L, 46L, 9L, 10L, 14L,
28L, 46L, 10L, 28L, 48L, 25L, 23L, 32L, 33L, 40L, 43L, 24L, 3L,
18L, 24L, 28L, 31L, 36L, 42L, 18L, 27L, 28L, 31L, 36L, 45L, 18L,
24L, 27L, 28L, 42L, 16L, 16L, 18L, 18L, 18L, 29L, 23L, 39L, 39L,
40L, 1L, 12L, 47L, 3L, 18L, 20L, 28L, 31L, 36L, 38L, 42L, 5L,
18L, 20L, 27L, 28L, 31L, 36L, 38L, 41L, 45L, 8L, 18L, 27L, 28L,
42L, 48L, 6L, 17L, 30L, 31L, 31L, 18L, 18L, 18L, 29L, 39L, 39L,
40L, 43L, 31L, 31L, 48L, 30L, 13L, 34L, 18L, 36L, 18L, 36L, 18L
), .Label = c("ER-11", "ER-19", "ER-21", "ER-22", "ER-29", "ER-30",
"ER-31", "ER-32", "ER-33", "ER-38", "ER-40", "ER-43", "ER-49",
"ER-8", "ER-AZ-04", "ER-AZ-05", "ER-AZ-06", "ER-AZ-07", "ER-AZ-08",
"ER-AZ-10", "ER-AZ-11", "ER-AZ-11=ER-47", "ER-AZ-13", "ER-AZ-14",
"ER-AZ-15", "ER-AZ-16", "ER-AZ-17", "ER-AZ-18", "ER-AZ-20", "ER-AZ-20=ER-27",
"ER-AZ-21", "ER-AZ-23", "ER-AZ-23=ER-52", "ER-AZ-24", "ER-AZ-29",
"ER-AZ-31", "ER-AZ-33", "ER-AZ-35", "ER-AZ-37", "ER-AZ-38", "ER-AZ-39",
"ER-AZ-40", "ER-AZ-43", "ER-AZ-44", "ER-AZ-45", "ER-AZ-49", "ER-AZ-51",
"ER-AZ-53"), class = "factor"), Freq = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L)), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -161L), groups = structure(list(
Metastasis_Location = structure(c(1L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 8L, 8L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 11L, 11L, 11L, 11L, 11L), .Label = c("adrenal",
"bone", "breast", "liver", "lung", "muscle", "node", "pancreatic",
"peritoneum", "pleural", "skin"), class = "factor"), T0_T2_THERAPY_COD = structure(c(2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L), .Label = c("A",
"F"), class = "factor"), T0_T2_PD_event = structure(c(2L,
2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L,
2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L,
2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L), .Label = c("No Progression",
"Progression"), class = "factor"), Gene_mut = structure(c(4L,
5L, 1L, 3L, 4L, 1L, 2L, 3L, 4L, 5L, 6L, 3L, 6L, 3L, 4L, 5L,
6L, 2L, 3L, 4L, 3L, 4L, 5L, 6L, 3L, 6L, 3L, 4L, 5L, 6L, 3L,
4L, 5L, 6L, 1L, 3L, 4L, 5L, 3L, 4L, 3L, 4L, 5L, 6L, 3L, 4L,
5L, 6L, 6L, 3L, 4L, 5L, 6L, 3L, 4L, 3L, 4L, 5L, 6L, 3L, 4L,
5L, 6L, 3L, 4L, 5L, 6L, 1L, 6L, 3L, 4L, 5L), .Label = c("AKT1",
"ERBB2", "ESR1", "PIK3CA", "TP53", "WT"), class = "factor"),
.rows = structure(list(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8:12,
13:19, 20:22, 23L, 24L, 25:27, 28:35, 36:45, 46:50, 51L,
52L, 53L, 54:55, 56:58, 59L, 60L, 61L, 62L, 63L, 64:67,
68:72, 73:75, 76L, 77L, 78:79, 80L, 81L, 82L, 83:89,
90:95, 96:100, 101L, 102L, 103L, 104L, 105L, 106L, 107:108,
109L, 110L, 111:112, 113L, 114:121, 122:131, 132:137,
138:140, 141L, 142L, 143L, 144L, 145L, 146L, 147L, 148L,
149L, 150L, 151L, 152L, 153L, 154L, 155L, 156L, 157:158,
159:160, 161L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -72L), .drop = TRUE))
You're right to think of scale_fill_manual(). I think this is the more programmable alternative to passing a vector like colorfill to an aesthetic outside aes(). The following plot uses your data and color vector to control how the fill aesthetic is coded throughout the plot, and notice that fill is passed the same variable, Gene_mut, in both layers (alluvium and stratum):
# Map each Gene_mut level to its colour BY NAME. An unnamed vector is matched
# to the factor levels in order, which would give ESR1 "darkgreen" and PIK3CA
# "orange" -- the reverse of what was asked for. Naming the values makes the
# mapping independent of level order and of the order strata are drawn in.
gene_colors <- c(
  AKT1   = "white",
  ERBB2  = "white",
  ESR1   = "orange",
  PIK3CA = "darkgreen",
  TP53   = "white",
  WT     = "white"
)

ggplot(data = Allu,
       aes(axis1 = Gene_mut, axis2 = Metastasis_Location, y = Freq)) +
  # Flows and stratum boxes use the same fill variable, so the single fill
  # scale below colours both consistently.
  geom_alluvium(aes(fill = Gene_mut),
                curve_type = "quintic") +
  geom_stratum(aes(fill = Gene_mut), width = 1/4) +
  scale_fill_manual(values = gene_colors) +
  geom_text(stat = "stratum", size = 3,
            aes(label = after_stat(stratum))) +
  # Axis labels listed in the same order as axis1, axis2 (they are hidden by
  # theme_void(), but stay correct if the theme is changed).
  scale_x_discrete(limits = c("Gene_mut", "Metastasis_Location"),
                   expand = c(0.05, .05)) +
  theme_void()
Since Metastasis_Location takes different values than Gene_mut, fill treats those strata as having missing values, which by default are colored grey. You can change that behavior by passing a color string to the na.value parameter of scale_fill_manual().
I am using the rms library and the lrm function to do a penalized logistic regression.
Here is a sample of my data:
> dput(cs_data_train[1:50,])
structure(list(DataCRMSanoflore.Year_Sales = structure(c(1L,
2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
2L), .Label = c("2015", "2016", "2017"), class = "factor"), DataCRMSanoflore.HOURS_INSCR = c(14L,
18L, 17L, 16L, 11L, 22L, 23L, 17L, 9L, 21L, 18L, 19L, 12L, 11L,
17L, 16L, 21L, 20L, 14L, 19L, 22L, 17L, 22L, 13L, 19L, 13L, 21L,
16L, 23L, 19L, 11L, 21L, 11L, 22L, 20L, 13L, 11L, 17L, 15L, 12L,
15L, 21L, 17L, 14L, 10L, 17L, 10L, 12L, 18L, 13L), DataCRMSanoflore.Month_Sales = structure(c(9L,
2L, 5L, 9L, 4L, 7L, 3L, 9L, 7L, 12L, 3L, 3L, 12L, 3L, 3L, 6L,
3L, 4L, 5L, 8L, 8L, 1L, 4L, 10L, 9L, 5L, 4L, 9L, 2L, 12L, 9L,
4L, 4L, 3L, 6L, 8L, 6L, 4L, 12L, 5L, 6L, 9L, 7L, 9L, 1L, 9L,
7L, 11L, 11L, 4L), .Label = c("01", "02", "03", "04", "05", "06",
"07", "08", "09", "10", "11", "12"), class = "factor"), DataCRMSanoflore.Date_Sales = structure(c(3L,
10L, 22L, 23L, 26L, 13L, 12L, 2L, 25L, 11L, 10L, 9L, 4L, 10L,
18L, 9L, 9L, 1L, 14L, 24L, 4L, 2L, 2L, 22L, 17L, 4L, 14L, 22L,
2L, 5L, 29L, 13L, 2L, 10L, 25L, 5L, 10L, 1L, 6L, 20L, 7L, 9L,
1L, 3L, 17L, 22L, 3L, 9L, 20L, 13L), .Label = c("01", "02", "03",
"04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14",
"15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25",
"26", "27", "28", "29", "30", "31"), class = "factor"), DataCRMSanoflore.HOURS_INSCR.1 = c(14L,
18L, 17L, 16L, 11L, 22L, 23L, 17L, 9L, 21L, 18L, 19L, 12L, 11L,
17L, 16L, 21L, 20L, 14L, 19L, 22L, 17L, 22L, 13L, 19L, 13L, 21L,
16L, 23L, 19L, 11L, 21L, 11L, 22L, 20L, 13L, 11L, 17L, 15L, 12L,
15L, 21L, 17L, 14L, 10L, 17L, 10L, 12L, 18L, 13L), DataCRMSanoflore.Year_Creation_Sales = structure(c(1L,
2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L,
2L), .Label = c("2015", "2016", "2017"), class = "factor"), DataCRMSanoflore.Month_Creation_Sales = structure(c(9L,
2L, 10L, 10L, 9L, 7L, 12L, 9L, 7L, 12L, 3L, 4L, 2L, 6L, 3L, 6L,
10L, 4L, 5L, 8L, 3L, 1L, 4L, 11L, 9L, 5L, 4L, 9L, 2L, 12L, 10L,
4L, 4L, 3L, 10L, 8L, 6L, 4L, 12L, 8L, 6L, 2L, 10L, 5L, 1L, 9L,
8L, 11L, 11L, 4L), .Label = c("01", "02", "03", "04", "05", "06",
"07", "08", "09", "10", "11", "12"), class = "factor"), DataCRMSanoflore.Day_Creation_Sales = structure(c(11L,
15L, 2L, 31L, 26L, 23L, 5L, 2L, 25L, 16L, 10L, 13L, 7L, 3L, 18L,
9L, 8L, 27L, 18L, 24L, 6L, 2L, 4L, 16L, 17L, 12L, 15L, 22L, 10L,
5L, 1L, 14L, 2L, 10L, 5L, 5L, 10L, 25L, 6L, 5L, 28L, 8L, 10L,
18L, 17L, 22L, 31L, 9L, 21L, 22L), .Label = c("01", "02", "03",
"04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14",
"15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25",
"26", "27", "28", "29", "30", "31"), class = "factor"), DataCRMSanoflore.Year_Validation_Sales = structure(c(1L,
2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L,
2L), .Label = c("2015", "2016", "2017"), class = "factor"), DataCRMSanoflore.Month_Validation_Sales = structure(c(9L,
2L, 10L, 11L, 10L, 7L, 12L, 9L, 7L, 12L, 3L, 4L, 2L, 6L, 3L,
6L, 10L, 4L, 5L, 8L, 3L, 1L, 4L, 11L, 9L, 5L, 4L, 9L, 2L, 12L,
10L, 4L, 4L, 3L, 10L, 8L, 6L, 4L, 12L, 8L, 6L, 2L, 10L, 5L, 1L,
9L, 9L, 11L, 11L, 4L), .Label = c("01", "02", "03", "04", "05",
"06", "07", "08", "09", "10", "11", "12"), class = "factor"),
DataCRMSanoflore.Day_Validation_Sales = structure(c(14L,
16L, 3L, 3L, 1L, 27L, 6L, 5L, 27L, 21L, 19L, 27L, 8L, 5L,
21L, 10L, 9L, 30L, 26L, 27L, 7L, 4L, 15L, 17L, 18L, 13L,
20L, 29L, 11L, 7L, 2L, 16L, 3L, 20L, 6L, 6L, 13L, 29L, 8L,
6L, 30L, 9L, 12L, 20L, 18L, 29L, 1L, 10L, 23L, 25L), .Label = c("01",
"02", "03", "04", "05", "06", "07", "08", "09", "10", "11",
"12", "13", "14", "15", "16", "17", "18", "19", "20", "21",
"22", "23", "24", "25", "26", "27", "28", "29", "30", "31"
), class = "factor"), DataCRMSanoflore.AGE_CUSTUMER = c(37L,
23L, 34L, 32L, 45L, 52L, 44L, 55L, 37L, 29L, 33L, 29L, 30L,
37L, 56L, 48L, 44L, 42L, 45L, 33L, 37L, 53L, 55L, 60L, 57L,
33L, 51L, 32L, 35L, 54L, 41L, 47L, 59L, 33L, 45L, 35L, 36L,
28L, 42L, 24L, 32L, 39L, 33L, 36L, 49L, 56L, 45L, 39L, 54L,
55L), DataCRMSanoflore.MEAN_PURCHASE = c(71.75, 50.7142857142857,
18.6666666666667, 0, 0, 54.7, 0.666666666666667, 38, 6.5,
0, 83.3333333333333, 44.3333333333333, 25.7777777777778,
24.1818181818182, 23.3846153846154, 35.5294117647059, 21.6363636363636,
1.125, 6, 8.66666666666667, 18.4, 16.9285714285714, 0, 0,
36.5, 21.5, 18.5714285714286, 28.125, 101.333333333333, 0,
2, 0, 20.9166666666667, 69.1428571428571, 16.6666666666667,
1.5, 87.1666666666667, 48.25, 13.3333333333333, 20.5833333333333,
12, 0, 23, 15.1428571428571, 0, 30.4375, 30.3076923076923,
24.625, 23.4285714285714, 20.0833333333333), DataCRMSanoflore.NUMBER_GIFTS = c(1L,
1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 3L, 4L, 3L,
4L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 3L, 1L, 3L, 2L, 1L, 1L, 1L,
1L, 2L, 2L, 1L, 1L, 1L, 2L, 3L, 1L, 3L, 1L, 4L, 1L, 1L, 1L,
2L, 5L, 2L, 2L), SENSIBILITE = c(4L, 4L, 1L, 3L, 1L, 1L,
2L, 1L, 1L, 1L, 4L, 1L, 3L, 1L, 3L, 3L, 4L, 1L, 1L, 1L, 4L,
1L, 1L, 4L, 1L, 3L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 4L, 1L, 1L,
1L, 4L, 1L, 3L, 2L, 1L, 3L, 4L, 1L, 1L, 4L, 3L, 1L, 4L),
IMPERFECTIONS = c(4L, 3L, 1L, 2L, 1L, 1L, 4L, 1L, 1L, 1L,
3L, 1L, 2L, 1L, 3L, 2L, 3L, 1L, 1L, 1L, 3L, 1L, 1L, 3L, 1L,
3L, 3L, 3L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 3L, 1L, 2L,
3L, 1L, 2L, 2L, 1L, 1L, 3L, 3L, 1L, 3L), BRILLANCE = c(2L,
2L, 1L, 4L, 1L, 1L, 4L, 1L, 1L, 1L, 4L, 1L, 4L, 1L, 4L, 4L,
4L, 1L, 1L, 1L, 4L, 1L, 1L, 3L, 1L, 4L, 4L, 4L, 4L, 1L, 1L,
1L, 1L, 4L, 1L, 1L, 1L, 4L, 1L, 4L, 4L, 1L, 4L, 4L, 1L, 1L,
4L, 4L, 1L, 4L), GRAIN_PEAU = c(4L, 4L, 1L, 4L, 1L, 1L, 2L,
1L, 1L, 1L, 4L, 1L, 2L, 1L, 2L, 4L, 4L, 1L, 1L, 1L, 3L, 1L,
1L, 2L, 1L, 2L, 4L, 4L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
2L, 1L, 4L, 4L, 1L, 2L, 4L, 1L, 1L, 4L, 3L, 1L, 4L), RIDES_VISAGE = c(2L,
2L, 1L, 4L, 1L, 1L, 4L, 1L, 1L, 1L, 4L, 1L, 2L, 1L, 4L, 2L,
4L, 1L, 1L, 1L, 4L, 1L, 1L, 4L, 1L, 2L, 4L, 2L, 2L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 4L, 1L, 2L, 4L, 1L, 2L, 4L, 1L, 1L,
4L, 4L, 1L, 4L), ALLERGIES = c(2L, 2L, 1L, 2L, 1L, 1L, 2L,
1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L,
1L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 3L, 2L, 1L, 2L), MAINS = c(4L,
4L, 1L, 4L, 1L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 3L, 1L, 3L, 3L,
3L, 1L, 1L, 1L, 4L, 1L, 1L, 4L, 1L, 3L, 4L, 4L, 3L, 1L, 1L,
1L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 4L, 3L, 1L, 3L, 4L, 1L, 1L,
3L, 3L, 1L, 4L), PEAU_CORPS = c(3L, 3L, 1L, 2L, 1L, 1L, 2L,
1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 3L, 1L, 1L, 1L, 2L, 1L,
1L, 3L, 1L, 3L, 3L, 2L, 3L, 1L, 1L, 1L, 1L, 4L, 1L, 1L, 1L,
3L, 1L, 3L, 2L, 1L, 2L, 4L, 1L, 1L, 3L, 3L, 1L, 3L), INTERET_ALIM_NATURELLE = c(4L,
4L, 1L, 2L, 1L, 1L, 4L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 4L, 2L,
2L, 1L, 1L, 1L, 2L, 1L, 1L, 4L, 1L, 4L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 4L, 4L, 1L, 4L, 2L, 1L, 1L,
4L, 2L, 1L, 2L), INTERET_ORIGINE_GEO = c(4L, 2L, 1L, 2L,
1L, 1L, 5L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 5L, 2L, 1L, 1L,
1L, 2L, 1L, 1L, 2L, 1L, 2L, 5L, 2L, 2L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 2L, 1L, 5L, 5L, 1L, 4L, 2L, 1L, 1L, 2L, 2L, 1L,
2L), INTERET_VACANCES = c(4L, 2L, 1L, 3L, 1L, 1L, 2L, 1L,
1L, 1L, 3L, 1L, 2L, 1L, 3L, 4L, 3L, 1L, 1L, 1L, 2L, 1L, 1L,
3L, 1L, 4L, 3L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L,
1L, 2L, 2L, 1L, 4L, 3L, 1L, 1L, 2L, 2L, 1L, 2L), INTERET_ENVIRONNEMENT = c(5L,
5L, 1L, 5L, 1L, 1L, 5L, 1L, 1L, 1L, 3L, 1L, 3L, 1L, 3L, 3L,
3L, 1L, 1L, 1L, 3L, 1L, 1L, 3L, 1L, 3L, 3L, 3L, 3L, 1L, 1L,
1L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 3L, 5L, 1L, 5L, 3L, 1L, 1L,
3L, 5L, 1L, 3L), INTERET_COMPOSITION = c(2L, 2L, 1L, 4L,
1L, 1L, 4L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L,
1L, 2L, 1L, 1L, 4L, 1L, 4L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 4L, 1L, 2L, 4L, 1L, 4L, 2L, 1L, 1L, 2L, 2L, 1L,
2L), DataCRMSanoflore.Nb_achats = c(4, 7, 3, 3, 4, 10, 3,
4, 14, 4, 6, 6, 9, 22, 26, 17, 22, 8, 3, 9, 10, 14, 3, 7,
12, 6, 14, 16, 3, 3, 3, 3, 12, 7, 3, 6, 6, 12, 18, 12, 15,
6, 21, 7, 6, 16, 13, 16, 14, 12), OUTCOME = structure(c(1L,
2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L), .Label = c("0", "1"), class = "factor")), .Names = c("DataCRMSanoflore.Year_Sales",
"DataCRMSanoflore.HOURS_INSCR", "DataCRMSanoflore.Month_Sales",
"DataCRMSanoflore.Date_Sales", "DataCRMSanoflore.HOURS_INSCR.1",
"DataCRMSanoflore.Year_Creation_Sales", "DataCRMSanoflore.Month_Creation_Sales",
"DataCRMSanoflore.Day_Creation_Sales", "DataCRMSanoflore.Year_Validation_Sales",
"DataCRMSanoflore.Month_Validation_Sales", "DataCRMSanoflore.Day_Validation_Sales",
"DataCRMSanoflore.AGE_CUSTUMER", "DataCRMSanoflore.MEAN_PURCHASE",
"DataCRMSanoflore.NUMBER_GIFTS", "SENSIBILITE", "IMPERFECTIONS",
"BRILLANCE", "GRAIN_PEAU", "RIDES_VISAGE", "ALLERGIES", "MAINS",
"PEAU_CORPS", "INTERET_ALIM_NATURELLE", "INTERET_ORIGINE_GEO",
"INTERET_VACANCES", "INTERET_ENVIRONNEMENT", "INTERET_COMPOSITION",
"DataCRMSanoflore.Nb_achats", "OUTCOME"), row.names = c(22L,
33L, 40L, 48L, 54L, 59L, 74L, 78L, 87L, 89L, 104L, 115L, 121L,
141L, 159L, 161L, 163L, 165L, 196L, 202L, 211L, 222L, 272L, 300L,
318L, 325L, 327L, 349L, 374L, 380L, 392L, 393L, 394L, 398L, 427L,
440L, 449L, 456L, 470L, 477L, 479L, 490L, 505L, 508L, 514L, 520L,
528L, 531L, 534L, 543L), class = "data.frame")
Then when I want to fit the model using this code:
# Fit OUTCOME against every other column of cs_data_train (formula `. - 1`).
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned, and `<-` is used for assignment.
fit <- lrm(OUTCOME ~ . - 1, data = cs_data_train, x = TRUE, y = TRUE)
It gives an error:
singular information matrix in lrm.fit (rank= 148 ). Offending
variable(s): DataCRMSanoflore.HOURS_INSCR.1 Error in lrm(OUTCOME ~ .
- 1, data = cs_data_train, x = T, y = T) : Unable to fit model using “lrm.fit”
I searched but I could not resolve this issue. Thank you for your help!
EDIT:
As said in the comment below, I need to remove one variable from each pair of highly correlated variables, so I wrote this code:
> highlyCorrelated <- findCorrelation(correlationMatrix, cutoff=(0.7),verbose = FALSE)
> print(highlyCorrelated)
[1] 21 20 26 15 18 17 22 16 25 19 23 24 6 9 7 10 28 2
> important_var=colnames(DATA_BASE[,-highlyCorrelated])
> important_var
[1] "DataCRMSanoflore.Year_Sales" "DataCRMSanoflore.Date_Sales" "DataCRMSanoflore.HOURS_INSCR.1"
[4] "DataCRMSanoflore.Day_Creation_Sales" "DataCRMSanoflore.MEAN_PURCHASE" "OUTCOME"
> DATA_BASE<-DATA_BASE[,-highlyCorrelated]
> str(DATA_BASE)
'data.frame': 5775 obs. of 6 variables:
$ DataCRMSanoflore.Year_Sales : num 2 1 2 1 2 1 1 1 1 2 ...
$ DataCRMSanoflore.Date_Sales : num 13 3 10 22 23 26 13 1 12 2 ...
$ DataCRMSanoflore.HOURS_INSCR.1 : num 17 14 18 17 16 11 22 14 23 17 ...
$ DataCRMSanoflore.Day_Creation_Sales: num 13 11 15 2 31 26 23 1 5 2 ...
$ DataCRMSanoflore.MEAN_PURCHASE : num 0 71.8 50.7 18.7 0 ...
$ OUTCOME : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 2 2 1 1 ...
But I then get the same error:
Error in lrm(OUTCOME ~ . - 1, data = train, x = T, y = T) : Unable
to fit model using “lrm.fit”
This is really weird!
How can I resolve this, please?
I have the following
t <- structure(list(name = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), .Label = c("Alice", "Bob",
"Jane Doe", "John Doe"), class = "factor"), school = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), .Label = c("Alice School",
"Bob School", "Someother School", "Someschool College"), class = "factor"),
group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("A", "B"), class = "factor"),
question = structure(c(2L, 4L, 6L, 8L, 1L, 3L, 5L, 7L, 2L,
4L, 6L, 8L, 1L, 3L, 5L, 7L, 2L, 4L, 6L, 8L, 1L, 3L, 5L, 7L,
2L, 4L, 6L, 8L, 1L, 3L, 5L, 7L), .Label = c("q1", "q2", "q3",
"q4", "q5", "q6", "q7", "q8"), class = "factor"), mark = c(0L,
0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L,
1L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 1L,
1L), subject = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c("C", "M"), class = "factor")), .Names = c("name",
"school", "group", "question", "mark", "subject"), row.names = c(7L,
15L, 23L, 31L, 3L, 11L, 19L, 27L, 8L, 16L, 24L, 32L, 4L, 12L,
20L, 28L, 6L, 14L, 22L, 30L, 2L, 10L, 18L, 26L, 5L, 13L, 21L,
29L, 1L, 9L, 17L, 25L), class = "data.frame")
and I need to produce a data frame in which each student has one combined mark for each subject. The combination is simply a sum of the marks on each question. So, for example, Jane Doe will have 3 on subject C and 2 on subject M. I've been banging my head for long enough with Reduce and other approaches. I could possibly solve this in a very procedural way, but if I could do that with a one-liner (or close approximation), I'd be happier. I'm sure it can be done...
You said it in your question; you want to group_by student and subject and compute the sum
library(tidyverse)

# `t` is the data frame from the question. The result has one row per
# student and subject, with `score` the sum of the marks on that subject's
# questions. ungroup() drops the grouping that summarise() leaves behind.
t %>%
  group_by(name, subject) %>%
  summarise(score = sum(mark)) %>%
  ungroup()
Here is a data.table solution:
library(data.table)

# as.data.table() works on a copy, so the data frame `t` is left untouched
# (setDT() would convert it to a data.table in place). The summed column is
# named explicitly instead of receiving the default name V1.
as.data.table(t)[, .(score = sum(mark)), by = .(name, subject)]
And just for completeness, base R:
# Sum `mark` within each name-by-subject combination of `t`.
aggregate(mark ~ name + subject, data = t, FUN = sum)
This says "aggregate the response variable mark by the grouping variables name and subject, using sum as the aggregation function".
I am trying to connect sets of (two) points at each level of x, in each facet. Here is a reproducible example:
datum <- structure(list(frequency = c(8L, 7L, 6L, 18L, 5L, 11L, 16L, 15L,
9L, 8L, 8L, 10L, 2L, 20L, 14L, 3L, 6L, 2L, 2L, 11L, 10L, 6L,
15L, 19L, 18L, 18L, 8L, 2L, 10L, 15L, 12L, 17L, 1L, 18L, 7L,
8L, 16L, 4L, 9L, 2L, 7L, 3L, 16L, 7L, 18L, 20L, 9L, 10L, 13L,
2L, 15L, 7L, 3L, 20L, 4L, 15L, 5L, 7L, 9L, 16L, 5L, 8L, 10L,
10L, 7L, 10L, 10L, 17L, 7L, 8L, 13L, 13L, 16L, 5L, 20L, 18L,
13L, 19L, 3L, 8L, 14L, 12L, 20L, 2L, 9L, 13L, 7L, 2L, 5L, 5L,
13L, 9L, 13L, 7L, 9L, 4L, 4L, 20L, 1L, 4L), band = structure(c(2L,
4L, 2L, 3L, 2L, 1L, 4L, 1L, 2L, 1L, 3L, 4L, 2L, 4L, 3L, 4L, 3L,
2L, 3L, 2L, 2L, 4L, 2L, 1L, 1L, 2L, 1L, 4L, 4L, 1L, 4L, 4L, 2L,
1L, 4L, 4L, 3L, 4L, 1L, 1L, 3L, 4L, 1L, 3L, 4L, 1L, 2L, 1L, 1L,
2L, 2L, 1L, 3L, 4L, 2L, 1L, 2L, 4L, 2L, 2L, 4L, 4L, 2L, 4L, 4L,
1L, 1L, 4L, 2L, 3L, 4L, 1L, 2L, 4L, 1L, 2L, 4L, 1L, 1L, 3L, 4L,
4L, 2L, 2L, 2L, 1L, 3L, 2L, 2L, 2L, 3L, 3L, 1L, 3L, 4L, 3L, 3L,
1L, 3L, 4L), .Label = c("1", "2", "3", "4"), class = "factor"),
test = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L,
2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L,
1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L,
2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L
), .Label = c("1", "2"), class = "factor"), knowledge = structure(c(2L,
3L, 1L, 3L, 1L, 1L, 3L, 3L, 1L, 3L, 1L, 3L, 2L, 2L, 1L, 1L,
1L, 1L, 3L, 3L, 1L, 2L, 3L, 1L, 1L, 2L, 2L, 1L, 1L, 3L, 2L,
3L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 3L, 3L, 1L, 1L, 2L, 3L,
3L, 2L, 2L, 3L, 1L, 1L, 2L, 2L, 2L, 3L, 1L, 3L, 1L, 1L, 2L,
1L, 1L, 2L, 3L, 1L, 1L, 1L, 1L, 3L, 2L, 2L, 1L, 2L, 3L, 2L,
1L, 2L, 3L, 3L, 2L, 1L, 3L, 1L, 3L, 2L, 1L, 3L, 2L, 2L, 3L,
1L, 1L, 2L, 1L, 2L, 3L, 1L, 3L, 1L), .Label = c("1", "2",
"3"), class = "factor")), .Names = c("frequency", "band",
"test", "knowledge"), row.names = c(NA, -100L), class = "data.frame")
Here is the code I have so far:
# Mean frequency for each knowledge level and test, drawn as dodged points,
# with one facet per band.
ggplot(datum, aes(knowledge, frequency, color = test)) +
  # `fun` replaces the `fun.y` argument, which was deprecated in ggplot2 3.3.0.
  stat_summary(fun = mean, geom = "point",
               position = position_dodge(width = 0.9), size = 3) +
  facet_grid(~ band) +
  labs(y = "number of words (max = 20)", x = "self-report knowledge") +
  scale_x_discrete(labels = c("none", "form", "meaning"))
Looking at the left-most facet ('1') in the graph, I would like a line to connect the pretest to posttest in the none column, another line connecting pretest to posttest in the form column, and a line connecting the pretest to the posttest in the meaning column. I would like this done in each facet.
I hope that makes sense, and thanks!
I find relying on ggplot too much for data manipulation/summarizing can hurt more than it helps. I have no idea how to connect the position-dodged points with a line. Instead, I'd do something like this:
library(dplyr)

# Summarise outside ggplot: one mean per band x knowledge x test cell, plus a
# labelled copy of `knowledge` for the facet strips.
datsum <- datum %>%
  group_by(band, knowledge, test) %>%
  summarize(mean = mean(frequency)) %>%
  ungroup() %>%
  mutate(
    knowledge_fac = factor(knowledge, labels = c("none", "form", "meaning"))
  )

ggplot(data = datsum, mapping = aes(x = test, y = mean)) +
  # One path per band x knowledge cell, joining its two test means.
  geom_path(mapping = aes(group = band:knowledge)) +
  geom_point(mapping = aes(color = factor(test))) +
  # Rows are bands, columns are knowledge levels.
  facet_grid(band ~ knowledge_fac) +
  labs(y = "number of words (max = 20)", x = "self-report knowledge")
Borrowing from Gregor's work in munging the data, I think this does what was requested. The mutate() chunk creates Test to be a numeric offset of -0.1 for test 1 and 0.1 for test 2. This is then added to the numeric value of knowledge. The result is the numeric x passed to ggplot2. Gregor correctly defined the groups, so the rest is straightforward.
library(dplyr)

# Cell means, plus a numeric x position: the knowledge level (1-3) shifted by
# -0.1 for test 1 and +0.1 for test 2, so the two tests sit side by side.
datsum <- datum %>%
  group_by(band, knowledge, test) %>%
  summarize(mean = mean(frequency)) %>%
  mutate(
    Test = ifelse(test == 2, 0.1, -0.1),
    Knowledge = as.numeric(knowledge) + Test
  ) %>%
  ungroup()

ggplot(data = datsum, mapping = aes(x = Knowledge, y = mean, color = test)) +
  # Black segment joining the two test means within each band x knowledge cell.
  geom_path(mapping = aes(group = band:knowledge), color = "black") +
  geom_point(size = 3) +
  facet_wrap(~ band, nrow = 1) +
  labs(y = "number of words (max = 20)", x = "self-report knowledge") +
  scale_color_manual(values = c("orange", "blue")) +
  # Relabel the numeric axis with the knowledge level names.
  scale_x_continuous(
    limits = c(0.5, 3.5),
    breaks = 1:3,
    labels = c("none", "form", "meaning")
  )