I have the following dataset and I would like to calculate the adjusted R-squared based on it.
I have the formula for adjusted R-squared: Adjusted R2 = 1 - [(1 - R2) * (n - 1) / (n - k - 1)],
where:
R2: the R-squared
n: the number of observations, in this case the number of "DV.obs" values
k: the number of predictor variables, in this case "nParam" (which is either 0, 1, 2, or 3)
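For example, plugging in n = 31 observations, k = 2, and R2 = 0.334 (the iteration-3 values that appear in the output further below) gives roughly 0.286:
R2 <- 0.334   # R-squared for iteration 3
n  <- 31      # observations in that iteration
k  <- 2       # nParam for iteration 3
1 - (1 - R2) * (n - 1) / (n - k - 1)
#> [1] 0.2864286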
The R code to calculate it is below. The data are grouped by "ITER" (iteration) and there are 4 iterations, so the idea is to calculate the adjusted R-squared per iteration: for iteration 1, nParam should only be 0; for iteration 2, nParam should only be 1; and so on, instead of using every nParam value in the dataset, since nParam is exactly the same within each iteration.
The output should be only 4 rows (one for every iteration, as it is grouped by ITER) and 2 columns (R2 and adjusted R-squared), not a result for every row in the data.
I hope I have explained myself well.
library(dplyr)

ff <- df %>%
  group_by(ITER) %>%
  summarise(
    Rsq = cor(x = DV.obs, y = DV.sim)^2,
    adjRsq = 1 - ((1 - Rsq) * (length(DV.obs) - 1) / (length(DV.obs) - nParam - 1))
  )
ff
However, this formula goes through every nParam value within the group, so it returns a result for every row instead of a single value per iteration.
df<-structure(list(CASE = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), ITER = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L), nParam = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L
), DV.obs = c(0.101483807, 0.069196694, 0.053869542, 0.043831971,
0.030330271, 0.023612088, 0.01978679, 0.014310351, 0.01164389,
0.007267871, 0.004536453, 0.002873573, 0.002408037, 0.001417053,
0.001136154, 0.101483807, 0.069196694, 0.053869542, 0.043831971,
0.030330271, 0.023612088, 0.01978679, 0.014310351, 0.01164389,
0.007267871, 0.004536453, 0.002873573, 0.002408037, 0.001417053,
0.001136154, 0.101483807, 0.069196694, 0.053869542, 0.043831971,
0.030330271, 0.023612088, 0.01978679, 0.014310351, 0.01164389,
0.007267871, 0.004536453, 0.002873573, 0.002408037, 0.001417053,
0.001136154, 0.101483807, 0.069196694, 0.053869542, 0.043831971,
0.030330271, 0.023612088, 0.01978679, 0.014310351, 0.01164389,
0.007267871, 0.004536453, 0.002873573, 0.002408037, 0.001417053,
0.001136154, 0.000116054, 0.003829787, 0.01206963, 0.02088975,
0.027388781, 0.03423598, 0.037833661, 0.037369438, 0.035164408,
0.034584139, 0.02947776, 0.023210831, 0.014622821, 0.009632495,
0.006731141, 0.0027853, 0.000116054, 0.003829787, 0.01206963,
0.02088975, 0.027388781, 0.03423598, 0.037833661, 0.037369438,
0.035164408, 0.034584139, 0.02947776, 0.023210831, 0.014622821,
0.009632495, 0.006731141, 0.0027853, 0.000116054, 0.003829787,
0.01206963, 0.02088975, 0.027388781, 0.03423598, 0.037833661,
0.037369438, 0.035164408, 0.034584139, 0.02947776, 0.023210831,
0.014622821, 0.009632495, 0.006731141, 0.0027853, 0.000116054,
0.003829787, 0.01206963, 0.02088975, 0.027388781, 0.03423598,
0.037833661, 0.037369438, 0.035164408, 0.034584139, 0.02947776,
0.023210831, 0.014622821, 0.009632495, 0.006731141, 0.0027853
), DV.sim = c(0, 0.0889808909410658, 0.0947484349571132, 0.0798169790285827,
0.0574006922793388, 0.0505799935506284, 0.0468774569150804, 0.0417447990739346,
0.0375742405164242, 0.0306761993989349, 0.0251120797996223, 0.0205737193532288,
0.0168649279846251, 0.0138327510148287, 0.0113531698574871, 0,
0.0829660195227578, 0.0876380159497916, 0.0723450386112931, 0.0464863987773657,
0.0380595525625348, 0.0343245102453232, 0.0307144539731741, 0.0283392784461379,
0.0245820489723981, 0.0214487023548782, 0.0187365858632326, 0.0163729577744008,
0.0143107050991059, 0.0125108672587574, 0, 0.0762191578459362,
0.0737615750578683, 0.0549565160764756, 0.0280085518714786, 0.0206076781625301,
0.0172540310333669, 0.0134899928846955, 0.0108952926749736, 0.00728254194885496,
0.00491441482789815, 0.00332488210681827, 0.00225250494349749,
0.00152820673925803, 0.00103880306820386, 0, 0.0329456788891303,
0.0365534415712808, 0.03318406650424, 0.0278133129626513, 0.0238151342895627,
0.0205330317793787, 0.0155563822799921, 0.0119589968463779, 0.0072024345056713,
0.00437676923945547, 0.00266755578568207, 0.00162810577310623,
0.000994532813206324, 0.000607859854716811, 0, 0.00238890872602278,
0.02000716184065, 0.0509446502289174, 0.0907202677155637, 0.173563302880525,
0.223891823887825, 0.2226231635499, 0.19175603264451, 0.168494781267643,
0.150974664176703, 0.136206244819164, 0.111464575245381, 0.0913691590994598,
0.0749306779146197, 0.0504548476848009, 0, 0.00141190656836649,
0.0124264488774641, 0.0328390336436031, 0.0603613019163447, 0.123470497330427,
0.172404586815834, 0.178024356626272, 0.151606226187945, 0.130227694458962,
0.117105708281994, 0.107832603356838, 0.0935153502613309, 0.081651206263304,
0.0713645335614684, 0.0545446672743561, 0, 0.00122455342249632,
0.00957195676775054, 0.0233009280455857, 0.0398901057214595,
0.069490838356018, 0.0753487069702148, 0.0619427798080445, 0.0388082119899989,
0.0282194718351961, 0.0223033058814705, 0.0181158699408174, 0.012206885059923,
0.00828045272134247, 0.00562572468560191, 0.00260434861259537,
0, 0.00337575118759914, 0.0123247819279197, 0.0212808990854769,
0.0292664165479362, 0.0407316533482074, 0.0457373328155279, 0.0440263413557409,
0.0350818961969019, 0.0268987657874823, 0.0206920115460456, 0.0160182394650579,
0.00970028643496338, 0.00590740063816313, 0.00360522091817113,
0.00134665597468616)), row.names = c(NA, 124L), class = "data.frame")
You could add distinct(ITER, .keep_all = TRUE)
library(tidyverse)

df %>%
  group_by(ITER) %>%
  summarise(
    Rsq = cor(x = DV.obs, y = DV.sim)^2,
    adjRsq = 1 - ((1 - Rsq) * (length(DV.obs) - 1) / (length(DV.obs) - nParam - 1))
  ) %>%
  distinct(ITER, .keep_all = TRUE)
#> `summarise()` has grouped output by 'ITER'. You can override using the
#> `.groups` argument.
#> # A tibble: 4 × 3
#> # Groups: ITER [4]
#> ITER Rsq adjRsq
#> <int> <dbl> <dbl>
#> 1 1 0.113 0.113
#> 2 2 0.116 0.0858
#> 3 3 0.334 0.286
#> 4 4 0.268 0.187
The issue is that you get a value per row because you are using the nParam column to compute the adjusted R^2 without any aggregating operation. This can be fixed by using unique(nParam) to "aggregate" nParam to just one value per group:
library(dplyr)

df %>%
  group_by(ITER) %>%
  summarise(
    Rsq = cor(x = DV.obs, y = DV.sim)^2,
    adjRsq = 1 - ((1 - Rsq) * (n() - 1) / (n() - unique(nParam) - 1))
  )
#> # A tibble: 4 × 3
#> ITER Rsq adjRsq
#> <int> <dbl> <dbl>
#> 1 1 0.113 0.113
#> 2 2 0.116 0.0858
#> 3 3 0.334 0.286
#> 4 4 0.268 0.187
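If you would rather not rely on unique() (which errors if nParam were ever not constant within a group), a sketch of an equivalent variant using first(nParam) instead; this is my variation, not part of the original answer:
library(dplyr)

df %>%
  group_by(ITER) %>%
  summarise(
    Rsq    = cor(DV.obs, DV.sim)^2,
    adjRsq = 1 - ((1 - Rsq) * (n() - 1) / (n() - first(nParam) - 1))
  )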
Problem
I would like to plot estimated marginal means from a three-way factorial experiment with letters indicating significantly different means, adjusted for multiple comparisons. My current workflow is to fit the model with lmer(), calculate estimated marginal means with emmeans(), then implement the compact letter display algorithm with cld().
My problem is that the graph is too busy when you plot all three-way interactions on the same plot. So I would like to split up the plot and generate different sets of letters for each subplot, starting with "a". The problem is that when I use the by argument in cld to split it up, it does a separate correction for multiple comparisons within each by group. Because there are now fewer tests within each group, this results in a less conservative correction. But if I try to manually split up the output of cld() without a by group, I would have to manually re-implement the letter algorithm for each subplot. I guess I could do that but it seems cumbersome. I am trying to share this code with a client for him to modify later, so that solution would probably be too complex. Does anyone have an easy way to either:
1. Get the output of cld() to use one combined correction for all by groups, or
2. Reduce the compact letter display for each subgroup to the minimal necessary number of letters, using a relatively simple method.
Reproducible example
Load packages and data.
library(lme4)
library(emmeans)
library(multcomp)
dat <- structure(list(y = c(2933.928571, 930.3571429, 210.7142857, 255.3571429,
2112.5, 1835.714286, 1358.928571, 1560.714286, 9192.857143, 3519.642857,
2771.428571, 7433.928571, 4444.642857, 3025, 3225, 2103.571429,
3876.785714, 925, 1714.285714, 3225, 1783.928571, 2223.214286,
2537.5, 2251.785714, 7326.785714, 5130.357143, 2539.285714, 6116.071429,
5808.928571, 3341.071429, 2212.5, 7562.5, 3907.142857, 3241.071429,
1294.642857, 4325, 4487.5, 2551.785714, 5648.214286, 3198.214286,
1075, 335.7142857, 394.6428571, 1605.357143, 658.9285714, 805.3571429,
1580.357143, 1575, 2037.5, 1721.428571, 1014.285714, 2994.642857,
2116.071429, 800, 2925, 3955.357143, 9075, 3917.857143, 2666.071429,
6141.071429, 3925, 1626.785714, 2864.285714, 7271.428571, 3432.142857,
1826.785714, 514.2857143, 1319.642857, 1782.142857, 2637.5, 1355.357143,
3328.571429, 1914.285714, 817.8571429, 1896.428571, 2121.428571,
521.4285714, 360.7142857, 1114.285714, 1139.285714, 7042.857143,
2371.428571, 2287.5, 4967.857143, 2180.357143, 1944.642857, 2408.928571,
5289.285714, 7028.571429, 3080.357143, 5394.642857, 5973.214286,
7323.214286, 1419.642857, 1455.357143, 4657.142857, 7069.642857,
2451.785714, 4319.642857, 5562.5, 3953.571429, 1182.142857, 1957.142857,
3796.428571, 1773.214286, 400, 871.4285714, 842.8571429, 657.1428571,
1360.714286, 1853.571429, 1826.785714, 3405.357143, 2605.357143,
5983.928571, 4935.714286, 4105.357143, 7666.071429, 3619.642857,
5085.714286, 1592.857143, 1751.785714, 5992.857143, 2987.5, 794.6428571,
3187.5, 825, 3244.642857), f1 = structure(c(4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("A",
"B", "C", "D"), class = "factor"), f2 = structure(c(2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c("foo",
"bar"), class = "factor"), f3 = structure(c(4L, 3L, 2L, 1L, 3L,
4L, 1L, 2L, 4L, 2L, 1L, 3L, 3L, 2L, 4L, 1L, 3L, 1L, 4L, 2L, 2L,
4L, 3L, 1L, 2L, 4L, 1L, 3L, 2L, 3L, 1L, 4L, 3L, 4L, 1L, 2L, 3L,
2L, 4L, 1L, 2L, 1L, 3L, 4L, 1L, 2L, 4L, 3L, 2L, 1L, 3L, 4L, 3L,
1L, 4L, 2L, 4L, 2L, 3L, 1L, 1L, 3L, 2L, 4L, 3L, 4L, 1L, 2L, 1L,
4L, 3L, 2L, 3L, 1L, 4L, 2L, 1L, 3L, 4L, 2L, 4L, 3L, 1L, 2L, 1L,
3L, 4L, 2L, 3L, 1L, 4L, 2L, 4L, 1L, 3L, 2L, 2L, 3L, 4L, 1L, 4L,
1L, 2L, 3L, 4L, 1L, 3L, 2L, 1L, 2L, 4L, 3L, 1L, 2L, 4L, 3L, 1L,
4L, 2L, 3L, 1L, 3L, 4L, 2L, 1L, 3L, 2L, 4L), .Label = c("L1",
"L2", "L3", "L4"), class = "factor"), block = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), .Label = c("1",
"2", "3", "4"), class = "factor")), row.names = c(NA, -128L), class = "data.frame")
Fit model and get estimated marginal means.
fit <- lmer(log10(y) ~ f1 * f2 * f3 + (1 | block), data = dat)
emm <- emmeans(fit, ~ f1 + f2 + f3, mode = 'Kenward-Roger', type = 'response')
Version 1
In this version, I take the CLD as a whole, which correctly uses the Sidak adjustment for 496 tests. However, let's say I wanted to plot only those rows where f2 == 'bar'. The letters are no longer right for that subset because some are redundant (fewer than 8 are needed). Is there any function that can reduce the letters down?
cldisplay1 <- cld(emm, adjust = 'sidak', Letters = letters)
subset(as.data.frame(cldisplay1), f2 == 'bar') # correct comparisons but contains redundant letters
output
f1 f2 f3 response SE df lower.CL upper.CL .group
8 D bar L1 365.6732 76.1231 96 185.9699 719.0244 a
24 D bar L3 582.8573 121.3349 96 296.4229 1146.0742 ab
16 D bar L2 682.9238 142.1659 96 347.3136 1342.8353 ab
7 C bar L1 898.1560 186.9714 96 456.7740 1766.0470 abcd
6 B bar L1 1627.7069 338.8438 96 827.8006 3200.5652 bcdefg
15 C bar L2 1635.4393 340.4534 96 831.7330 3215.7694 bcdefg
32 D bar L4 1746.6052 363.5951 96 888.2685 3434.3552 bcdefg
31 C bar L4 2348.6629 488.9270 96 1194.4562 4618.1832 cdefgh
21 A bar L3 2499.6772 520.3640 96 1271.2573 4915.1230 cdefgh
5 A bar L1 2545.4594 529.8946 96 1294.5407 5005.1448 cdefgh
23 C bar L3 2561.0138 533.1326 96 1302.4512 5035.7294 cdefgh
30 B bar L4 3158.6969 657.5538 96 1606.4140 6210.9556 efgh
22 B bar L3 3364.9438 700.4887 96 1711.3047 6616.4994 efgh
14 B bar L2 3411.4009 710.1598 96 1734.9313 6707.8482 efgh
13 A bar L2 3769.4223 784.6900 96 1917.0098 7411.8269 efgh
29 A bar L4 7006.3740 1458.5342 96 3563.2217 13776.6551 h
Version 2
In this version, I use the by argument to cld() to split by f2. This reduces the letters within each group, but the Sidak adjustment is now less conservative. For example, rows 8 and 16 are not significantly different at the adjusted alpha level used above, but now they appear different. I do not want to change the tests used, just to plot a subset of the data. Is there a way to specify the total number of tests being adjusted for, even though cld() is split up with by groups?
cldisplay2 <- cld(emm, adjust = 'sidak', by = 'f2', Letters = letters)
subset(as.data.frame(cldisplay2), f2 == 'bar')
output
f1 f2 f3 response SE df lower.CL upper.CL .group
8 D bar L1 365.6732 76.1231 96 185.9699 719.0244 a
24 D bar L3 582.8573 121.3349 96 296.4229 1146.0742 ab
16 D bar L2 682.9238 142.1659 96 347.3136 1342.8353 abc
7 C bar L1 898.1560 186.9714 96 456.7740 1766.0470 abcd
6 B bar L1 1627.7069 338.8438 96 827.8006 3200.5652 bcde
15 C bar L2 1635.4393 340.4534 96 831.7330 3215.7694 bcde
32 D bar L4 1746.6052 363.5951 96 888.2685 3434.3552 cde
31 C bar L4 2348.6629 488.9270 96 1194.4562 4618.1832 de
21 A bar L3 2499.6772 520.3640 96 1271.2573 4915.1230 def
5 A bar L1 2545.4594 529.8946 96 1294.5407 5005.1448 def
23 C bar L3 2561.0138 533.1326 96 1302.4512 5035.7294 def
30 B bar L4 3158.6969 657.5538 96 1606.4140 6210.9556 ef
22 B bar L3 3364.9438 700.4887 96 1711.3047 6616.4994 ef
14 B bar L2 3411.4009 710.1598 96 1734.9313 6707.8482 ef
13 A bar L2 3769.4223 784.6900 96 1917.0098 7411.8269 ef
29 A bar L4 7006.3740 1458.5342 96 3563.2217 13776.6551 f
With the two separate tables (or plots?) you are displaying a total of 90 + 90 = 180 comparisons. If you want an overall multiplicity adjustment for all of these 180 comparisons, you need to be considerably less conservative than for 496 comparisons. However, it is possible to specify a different value of alpha so that the Sidak adjustment works out correctly. For example, if you want the overall alpha to be 0.05, use
cld(emm, adjust = 'sidak', by = 'f2', Letters = letters,
alpha = 1 - sqrt(0.95))
With this, you are specifying alpha = 0.02532. Note that if
p.adj = 1 - (1 - p)^90 < 1 - sqrt(.95)
then
(1 - p)^90 > sqrt(.95)
so that
(1 - p)^180 > .95
thus
1 - (1 - p)^180 < .05
That is, by splitting the CLD table into two parts showing 90 comparisons each, we correctly apply the Sidak adjustment to correct for the 180 comparisons total at a significance level of .05.
Enhancement
Another idea based on this that results in a less conservative adjustment is to specify the Tukey adjustment instead:
cld(emm, adjust = 'tukey', by = 'f2', Letters = letters,
alpha = 1 - sqrt(0.95))
Thus, each separate table has an exact familywise error rate of 1 - sqrt(0.95); and we used the Sidak adjustment (slightly conservative) across the two tables, so that the error rate for the whole family of 180 tests is less than 0.05.
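(A quick numeric check of this argument, not from the original answer: with a per-table alpha of 1 - sqrt(0.95), the Sidak combination over the two tables comes back to an overall 0.05.)
alpha_sub <- 1 - sqrt(0.95)   # per-table significance level
alpha_sub                     # ~0.02532
1 - (1 - alpha_sub)^2         # combined error bound across the two tables: 0.05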
I wanted to make plots that look like figure 1 (source: link).
In figure 1, they plotted the regression analysis against one-year yield variability. In my case, I would like to plot the variability across two locations and 4 blocks for each treatment group. So the plot I want would have three facets, one for each level of variable (B.glucosidase, Protein, POX.C), and four colors for the treatment factor. Also, my current plot has a legend for both block and treatment; it should only show treatment, because block should be used to build the error bars for the variability.
I tried with this code, which obviously doesn't work for what I want. (Data for df.melted included below.)
ggplot(df.melted, aes(x = value, y = yield, color = as.factor(treatment))) +
  geom_point(aes(shape = as.factor(block))) +
  stat_smooth(method = "lm", formula = y ~ x, col = "darkslategrey", se = FALSE) +
  stat_poly_eq(formula = y ~ x,
               # aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")),
               aes(label = ..rr.label..),
               parse = TRUE) +
  theme_classic() +
  geom_errorbar(aes(ymax = df.melted$yield + sd(df.melted$yield),
                    ymin = df.melted$yield - sd(df.melted$yield)),
                width = 0.05) +
  facet_wrap(~ variable)
Data:
df.melted <- structure(list(Location = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("M", "U"), class = "factor"),
treatment = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 3L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L), .Label = c("CC",
"CCS", "CS", "SCS"), class = "factor"), block = c(1L, 2L,
3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L,
1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L,
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L,
3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L,
1L, 2L, 3L, 4L), yield = c(5156L, 5157L, 5551L, 5156L, 4804L,
4720L, 4757L, 5021L, 4826L, 4807L, 4475L, 4596L, 4669L, 4588L,
4542L, 4592L, 5583L, 5442L, 5693L, 5739L, 5045L, 4902L, 5006L,
5086L, 4639L, 4781L, 4934L, 4857L, 4537L, 4890L, 4842L, 4608L,
5156L, 5157L, 5551L, 5156L, 4804L, 4720L, 4757L, 5021L, 4826L,
4807L, 4475L, 4596L, 4669L, 4588L, 4542L, 4592L, 5583L, 5442L,
5693L, 5739L, 5045L, 4902L, 5006L, 5086L, 4639L, 4781L, 4934L,
4857L, 4537L, 4890L, 4842L, 4608L, 5156L, 5157L, 5551L, 5156L,
4804L, 4720L, 4757L, 5021L, 4826L, 4807L, 4475L, 4596L, 4669L,
4588L, 4542L, 4592L, 5583L, 5442L, 5693L, 5739L, 5045L, 4902L,
5006L, 5086L, 4639L, 4781L, 4934L, 4857L, 4537L, 4890L, 4842L,
4608L), variable = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("B.glucosidase",
"Protein", "POX.C"), class = "factor"), value = c(1.600946,
1.474084, 1.433078, 1.532492, 1.198667, 1.193193, 1.214941,
1.360981, 1.853056, 1.690117, 1.544357, 1.825132, 1.695409,
1.764123, 1.903743, 1.538684, 0.845077, 1.011463, 0.857032,
0.989803, 0.859022, 0.919467, 1.01717, 0.861689, 0.972332,
0.952922, 0.804431, 0.742634, 1.195837, 1.267285, 1.08571,
1.20097, 6212.631579, 5641.403509, 4392.280702, 7120.701754,
5305.964912, 4936.842105, 5383.157895, 6077.894737, 5769.122807,
5016.842105, 5060.350877, 5967.017544, 5576.842105, 5174.035088,
5655.438596, 5468.77193, 7933.333333, 7000, 6352.982456,
8153.684211, 6077.894737, 4939.649123, 5002.807018, 6489.122807,
4694.035088, 5901.052632, 4303.859649, 6768.421053, 6159.298246,
6090.526316, 4939.649123, 5262.45614, 810.3024, 835.5242,
856.206, 759.8589, 726.2298, 792.6472, 724.7165, 699.3266,
500.9153, 634.8698, 637.9536, 648.8814, 641.0357, 623.3822,
555.2834, 520.8119, 683.3528, 595.9173, 635.4315, 672.4234,
847.2944, 745.5665, 778.3548, 735.8141, 395.2647, 570.4148,
458.0383, 535.3851, 678.0293, 670.7419, 335.2923, 562.5674
)), row.names = c(NA, -96L), class = "data.frame")
library(dplyr)
library(ggplot2)
library(ggpmisc)
Summarize the data frame (this could also be done with stat_summary(), but it's often clearer/more transparent to do it explicitly up front). (I think that because your data set is balanced you could collapse/average over the block structure first and then do your whole plot with the reduced data set - it shouldn't change the outcome of the linear regressions at all, at least not the mean values ... and any statistical comparisons should probably be done on block-level summaries anyway ...)
df.sum <- (df.melted
  %>% group_by(Location, treatment, variable)
  %>% summarise(value = mean(value),
                yield_sd = sd(yield),
                ## collapse yield to mean *after* computing sd!
                yield = mean(yield))
)
Plot:
(ggplot(df.melted,
        aes(x = value, y = yield, color = treatment))
  + stat_smooth(method = "lm", col = "darkslategrey", se = FALSE)
  + stat_poly_eq(
      formula = y ~ x,
      ## aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")),
      aes(group = 1, label = ..rr.label..),
      parse = TRUE)
  + theme_classic()
  + scale_shape(guide = FALSE)
  + geom_point(data = df.sum)
  + geom_errorbar(data = df.sum,
                  aes(ymax = yield + yield_sd, ymin = yield - yield_sd),
                  width = 0.05)
  + facet_wrap(~ variable, scale = "free_x")
)
(adding group=1 to the stat_poly_eq() aesthetics means we only plot a single R^2 value per facet)
Since you're no longer using the shape aesthetic for anything, you could consider using it to show the Location variable ...
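A sketch of what that might look like (my variation on the answer above, reusing df.sum from the summary step; shape is assumed to encode Location):
(ggplot(df.melted, aes(x = value, y = yield, color = treatment))
  + stat_smooth(method = "lm", col = "darkslategrey", se = FALSE)
  + geom_point(data = df.sum, aes(shape = Location), size = 2)
  + geom_errorbar(data = df.sum,
                  aes(ymax = yield + yield_sd, ymin = yield - yield_sd),
                  width = 0.05)
  + theme_classic()
  + facet_wrap(~ variable, scale = "free_x")
)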
I have been struggling with ggplot to display these plots how I would like. My data have 2 factors, quarter and species. Station will be on the x-axis, value on the y-axis, and the constituent will be used with the facet_wrap. I want quarter differentiated with shapes, and species with colors.
The issue is I'm trying to replicate a figure done in SigmaPlot. It is a 4x4 grid of plots, with the first two rows of the first column left empty to allow for placement of the legend. My original plan was to make two separate faceted plots with facet_wrap() and combine them; however, this doesn't maintain the 4x4 arrangement. It turns it into a 1x2 layout, which ruins the alignment of the plots and shrinks the larger faceted grid.
My next thought was to create each plot individually, then arrange them in a grid using cowplot. This presents the plots arranged how I'd like, but I can't figure out how to have two y-axis labels (the units differ): one label centered on the two leftmost plots, and one centered to the left of the next column of 4 plots.
I'm trying to use this code (just copy the example data below, and run):
library(ggplot2)
library(gridExtra)

test.data1 <- test.data[1:95, ]
test.data2 <- test.data[96:111, ]

testplot1 <- ggplot(test.data1, aes(Station, value)) +
  geom_point(aes(shape = factor(quarter), fill = Species)) +
  scale_shape_manual(values = c(21, 22)) +
  labs(x = "Station", y = "Unit a", shape = "Sampling Quarter", fill = "Species") +
  theme(legend.position = "none", legend.title = element_blank()) +
  guides(fill = guide_legend(override.aes = list(shape = 21), nrow = 2, byrow = TRUE),
         shape = guide_legend(nrow = 2, byrow = TRUE)) +
  facet_wrap(~ constituent, ncol = 3, scales = "free_y")

testplot2 <- ggplot(test.data2, aes(Station, value)) +
  geom_point(aes(shape = factor(quarter), fill = Species)) +
  scale_shape_manual(values = c(21, 22)) +
  labs(x = "Station", y = "Unit b", shape = "Sampling Quarter", fill = "Species") +
  theme(legend.position = "top", legend.title = element_blank()) +
  guides(fill = guide_legend(override.aes = list(shape = 21), nrow = 2, byrow = TRUE),
         shape = guide_legend(nrow = 2, byrow = TRUE)) +
  facet_wrap(~ constituent, ncol = 1, scales = "free_y")

grid.arrange(testplot2, testplot1, ncol = 2)
Which generates this:
But I want it to be arranged like this, where the XX and YY plots from above are normalized in size with the other plots (this was done using individual plots, and using plot_grid):
Example data from a larger set:
test.data <- structure(list(Station = structure(c(1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c("StA", "StB"), class = "factor"),
CollectionDate = structure(c(3L, 2L, 3L, 1L, 3L, 1L, 3L,
1L, 3L, 2L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 2L, 3L, 1L, 3L, 1L,
3L, 1L, 3L, 2L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 2L, 3L, 1L, 3L,
1L, 3L, 1L, 3L, 2L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 2L, 3L, 1L,
3L, 1L, 3L, 1L, 3L, 2L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 2L, 3L,
1L, 3L, 1L, 3L, 1L, 3L, 2L, 3L, 1L, 3L, 1L, 1L, 3L, 2L, 3L,
1L, 3L, 1L, 3L, 1L, 3L, 2L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 2L,
3L, 1L, 3L, 1L, 3L, 1L, 3L, 2L, 3L, 1L, 3L, 1L, 3L, 1L), .Label = c("10/1/2017",
"10/16/2017", "4/1/2017"), class = "factor"), Species = structure(c(1L,
2L, 2L, 3L, 1L, 2L, 2L, 3L, 1L, 2L, 2L, 3L, 1L, 2L, 2L, 3L,
1L, 2L, 2L, 3L, 1L, 2L, 2L, 3L, 1L, 2L, 2L, 3L, 1L, 2L, 2L,
3L, 1L, 2L, 2L, 3L, 1L, 2L, 2L, 3L, 1L, 2L, 2L, 3L, 1L, 2L,
2L, 3L, 1L, 2L, 2L, 3L, 1L, 2L, 2L, 3L, 1L, 2L, 2L, 3L, 1L,
2L, 2L, 3L, 1L, 2L, 2L, 3L, 1L, 2L, 2L, 3L, 1L, 2L, 2L, 3L,
1L, 2L, 3L, 1L, 2L, 2L, 3L, 1L, 2L, 2L, 3L, 1L, 2L, 2L, 3L,
1L, 2L, 2L, 3L, 1L, 2L, 2L, 3L, 1L, 2L, 2L, 3L, 1L, 2L, 2L,
3L, 1L, 2L, 2L, 3L), .Label = c("SpA", "SpB", "SpC"), class = "factor"),
quarter = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("2017 Q2",
"2017 Q4"), class = "factor"), constituent = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 10L, 10L, 10L,
10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L,
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 13L, 13L, 13L, 13L,
13L, 13L, 13L, 13L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L
), .Label = c("A", "B", "C", "D", "E", "F", "G", "H", "I",
"J", "K", "L", "XX", "YY"), class = "factor"), value = c(16,
35, 46, 23, 40, 19, 9, 50, 0.2, 1, 0.5698, 0.322, 1, 0.45,
0.322, 0.5, 16, 9, 6, 19, 14, 13, 16, 9, 0, 0.004, 0, 0.004,
1, 0.32, 1, 0.678, 0, 0.39, 0.23, 0, 0, 1.1, 0.5, 0.5, 9,
4.9, 7, 4.768, 9, 8.65, 4.768, 6.54, 195, 195, 46, 46, 124,
124, 218, 218, 2, 1, 1, 1, 1, 2, 1, 1, 0.1, 0.4, 0.22, 0.4,
0.22, 0.4, 0.22, 0.1, 0.99, 0.99, 1.2, 0.45, 0.765, 0.99,
0.99, 0.99, 0.99, 1.2, 4.3, 0.98, 0.99, 1.2, 1.2, 34, 34,
65, 98, 150, 34, 65, 65, 2, 0, 4, 1.3, 5, 3.3, 1.56, 1, 9,
0.36, 4, 4, 11, 2, 2.22, 11)), class = "data.frame", row.names = c(NA,
-111L))
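One possible way to get a separate y-axis label for each column (a sketch on my part, not a tested answer): drop the y titles from the individual plots and wrap each column in arrangeGrob() with its own rotated textGrob() before combining:
library(grid)       # textGrob()
library(gridExtra)  # arrangeGrob(), grid.arrange()

# assumes testplot1 / testplot2 from above, built with y = NULL in labs()
col_left  <- arrangeGrob(testplot2, left = textGrob("Unit b", rot = 90))
col_right <- arrangeGrob(testplot1, left = textGrob("Unit a", rot = 90))
grid.arrange(col_left, col_right, ncol = 2)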
I am trying to connect sets of (two) points at each level of x, in each facet. Here is a reproducible example:
datum <- structure(list(frequency = c(8L, 7L, 6L, 18L, 5L, 11L, 16L, 15L,
9L, 8L, 8L, 10L, 2L, 20L, 14L, 3L, 6L, 2L, 2L, 11L, 10L, 6L,
15L, 19L, 18L, 18L, 8L, 2L, 10L, 15L, 12L, 17L, 1L, 18L, 7L,
8L, 16L, 4L, 9L, 2L, 7L, 3L, 16L, 7L, 18L, 20L, 9L, 10L, 13L,
2L, 15L, 7L, 3L, 20L, 4L, 15L, 5L, 7L, 9L, 16L, 5L, 8L, 10L,
10L, 7L, 10L, 10L, 17L, 7L, 8L, 13L, 13L, 16L, 5L, 20L, 18L,
13L, 19L, 3L, 8L, 14L, 12L, 20L, 2L, 9L, 13L, 7L, 2L, 5L, 5L,
13L, 9L, 13L, 7L, 9L, 4L, 4L, 20L, 1L, 4L), band = structure(c(2L,
4L, 2L, 3L, 2L, 1L, 4L, 1L, 2L, 1L, 3L, 4L, 2L, 4L, 3L, 4L, 3L,
2L, 3L, 2L, 2L, 4L, 2L, 1L, 1L, 2L, 1L, 4L, 4L, 1L, 4L, 4L, 2L,
1L, 4L, 4L, 3L, 4L, 1L, 1L, 3L, 4L, 1L, 3L, 4L, 1L, 2L, 1L, 1L,
2L, 2L, 1L, 3L, 4L, 2L, 1L, 2L, 4L, 2L, 2L, 4L, 4L, 2L, 4L, 4L,
1L, 1L, 4L, 2L, 3L, 4L, 1L, 2L, 4L, 1L, 2L, 4L, 1L, 1L, 3L, 4L,
4L, 2L, 2L, 2L, 1L, 3L, 2L, 2L, 2L, 3L, 3L, 1L, 3L, 4L, 3L, 3L,
1L, 3L, 4L), .Label = c("1", "2", "3", "4"), class = "factor"),
test = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L,
2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L,
1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L,
2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L
), .Label = c("1", "2"), class = "factor"), knowledge = structure(c(2L,
3L, 1L, 3L, 1L, 1L, 3L, 3L, 1L, 3L, 1L, 3L, 2L, 2L, 1L, 1L,
1L, 1L, 3L, 3L, 1L, 2L, 3L, 1L, 1L, 2L, 2L, 1L, 1L, 3L, 2L,
3L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 3L, 3L, 1L, 1L, 2L, 3L,
3L, 2L, 2L, 3L, 1L, 1L, 2L, 2L, 2L, 3L, 1L, 3L, 1L, 1L, 2L,
1L, 1L, 2L, 3L, 1L, 1L, 1L, 1L, 3L, 2L, 2L, 1L, 2L, 3L, 2L,
1L, 2L, 3L, 3L, 2L, 1L, 3L, 1L, 3L, 2L, 1L, 3L, 2L, 2L, 3L,
1L, 1L, 2L, 1L, 2L, 3L, 1L, 3L, 1L), .Label = c("1", "2",
"3"), class = "factor")), .Names = c("frequency", "band",
"test", "knowledge"), row.names = c(NA, -100L), class = "data.frame")
Here is the code I have so far:
library(ggplot2)

ggplot(datum, aes(knowledge, frequency, color = test)) +
  stat_summary(fun.y = 'mean', geom = 'point', position = position_dodge(width = .9), size = 3) +
  facet_grid(~ band) +
  labs(y = 'number of words (max = 20)', x = 'self-report knowledge') +
  scale_x_discrete(labels = c('none', 'form', 'meaning'))
Looking at the left-most facet ('1') in the graph, I would like a line to connect the pretest to posttest in the none column, another line connecting pretest to posttest in the form column, and a line connecting the pretest to the posttest in the meaning column. I would like this done in each facet.
I hope that makes sense, and thanks!
I find relying on ggplot too much for data manipulation/summarizing can hurt more than it helps. I have no idea how to connect the position-dodged points with a line. Instead, I'd do something like this:
library(dplyr)

datsum <- datum %>%
  group_by(band, knowledge, test) %>%
  summarize(mean = mean(frequency)) %>%
  ungroup() %>%
  mutate(knowledge_fac = factor(knowledge, labels = c('none', 'form', 'meaning')))

ggplot(datsum, aes(x = test, y = mean)) +
  geom_path(aes(group = band:knowledge)) +
  geom_point(aes(color = factor(test))) +
  facet_grid(band ~ knowledge_fac) +
  labs(y = 'number of words (max = 20)', x = 'self-report knowledge')
Borrowing from Gregor's work in munging the data, I think this does what was requested. The mutate() chunk creates Test as a numeric offset: -0.1 for test 1 and 0.1 for test 2. This is then added to the numeric value of knowledge, and the result is the numeric x passed to ggplot2. Gregor correctly defined the groups, so the rest is straightforward.
library(dplyr)

datsum <- datum %>%
  group_by(band, knowledge, test) %>%
  summarize(mean = mean(frequency)) %>%
  mutate(Test = 0.1 * (2 * (test == 2) - 1),
         Knowledge = as.numeric(knowledge) + Test) %>%
  ungroup()

ggplot(datsum, aes(x = Knowledge, y = mean, color = test)) +
  geom_path(aes(group = band:knowledge), color = "black") +
  geom_point(size = 3) +
  facet_wrap(~ band, nrow = 1) +
  labs(y = 'number of words (max = 20)', x = 'self-report knowledge') +
  scale_color_manual(values = c("orange", "blue")) +
  scale_x_continuous(limits = c(0.5, 3.5), breaks = 1:3,
                     labels = c("none", "form", "meaning"))