Tukey test after LMM keeping contrasts - r

I want to test a 2x3 factorial design and contrasted the variables like this
library(lme4)
library(emmeans)
my.helmert = matrix(c(2, -1, -1, 0, -1, 1), ncol = 2)
contrasts(Target3$mask) = my.helmert
contrasts(Target3$length)
So for mask I want to compare the first group with the average of the two other groups and in a second step the second with the third group.
This works fine in my LMM
Target3.2_TT.lmer = lmer(logTotalTime ~ mask*length+ (1+length|Subject) +(1|Trialnum), data = Target3)
There is a significant interaction between mask and length, that´s why I want to take a look at this effect and calculate a post hoc test (Turkey) like this:
emmeans(Target3.2_TT.lmer, pairwise ~ mask : length)
This also works pretty fine with one problem: now my contrasts are gone. The text calculates the differences for all masks and not just 1 vs. 2 and 3 and 2 vs. 3. Is there a possibility to keep my contrasts in the Post hoc test?
This is what the data looks like:
> dput(Target3)
structure(list(mask = structure(c(2L, 1L, 2L, 3L, 1L, 2L, 3L,
2L, 1L, 3L, 3L, 3L, 3L, 2L, 2L, 3L, 3L, 1L, 3L, 1L, 1L, 2L, 3L,
3L, 2L, 1L, 3L, 2L, 3L, 2L), contrasts = structure(c(2, -1, -1,
0, -1, 1), .Dim = c(3L, 2L), .Dimnames = list(c("keine Maske",
"syntaktisch\n korrekt", "syntaktisch \n inkorrekt"), NULL)), .Label = c("keine Maske",
"syntaktisch\n korrekt", "syntaktisch \n inkorrekt"), class = "factor"),
length = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 1L,
2L, 2L, 1L, 1L, 2L, 1L), .Label = c("kurzes \n N+1", "langes\n N+1"
), class = "factor"), logTotalTime = c(4.969813299576, 5.37989735354046,
5.14166355650266, 5.40717177146012, 5.27299955856375, 5.72358510195238,
5.4249500174814, 6.18001665365257, 5.67675380226828, 5.44241771052179,
5.66988092298052, 5.04985600724954, 5.78996017089725, 5.03043792139244,
5.92958914338989, 5.15329159449778, 6.11146733950268, 5.26269018890489,
5.17614973257383, 6.18001665365257, 6.03068526026126, 5.68697535633982,
5.17614973257383, 5.19849703126583, 5.29330482472449, 5.89989735358249,
5.73979291217923, 5.65599181081985, 5.94017125272043, 5.72031177660741
)), .Names = c("mask", "length", "logTotalTime"), row.names = c(2L,
4L, 6L, 8L, 9L, 11L, 13L, 15L, 16L, 18L, 20L, 22L, 27L, 29L,
31L, 33L, 35L, 37L, 39L, 41L, 42L, 44L, 47L, 49L, 51L, 54L, 55L,
57L, 59L, 61L), class = "data.frame")

Well, if you ask for pairwise comparisons, that’s what you get, and Helmert contrasts are not the same as pairwise comparisons. Further, the Tukey (not Turkey) method applies only to pairwise comparisons, not to other types of contrasts.
Here’s something to try that may give you what you want.
emm = emmeans(Target3.2_TT.lmer,
~ mask | length)
contrast(emm, list(
c1 = c(2, -1, -1)/2,
c2 = c(0, 1, -1)),
adjust = “mvt”)
This will work independently of whatever parameterization (i.e., contrasts settings) were used when fitting the model. The model parametrrization affects how the model matrix is set up and the interpretation of the coefficients, but does not affect the results from emmeans or its relatives.

Related

How to run a Breusch-Pagan Test for heteroskedasticity on lmer() model?

Why cant I run a Breusch-Pagan Test bptest() on a linear mixed effect model lmer() in order to test for heteroscedasticity? The bptest function works fine on models built with lm and glmer but not lmer. Is there a different function I should be using?
error message
Error: $ operator not defined for this S4 class
data <- structure(list(Mn_new = c(3.90508190744665, 3.41518826685297,
3.98107659173858, 4.06706444435455, 2.40431879320057, 3.8090250549363,
3.72177711209025, 2.93248691964847, 4.10035133820019, 4.20508065155943,
3.64103189844949, 4.24257964492719, 4.20182664641102, 3.41263061412322,
4.04144915900294, 4.28185091235415, 3.09415352803393, 3.67021392570071,
3.56418529613595, 3.21715355220772, 3.21429992539095, 3.54553486317315,
4.03025205893711, 2.97382166830262, 3.80757707518732, 3.78523559035143,
3.41487105608904, 2.75799799020337, 3.06834870580776, 3.30533869585591,
2.8380338262522, 2.65147541433061, 3.53356800468757, 2.51733199167976,
3.16115687664055, 3.64858366279116, 3.48272937241829, 2.91621249433787,
3.26028181088023, 3.49589461456199, 2.82832109354896, 3.40328200399306,
3.28568362736306, 2.87324453863543, 3.10651957200347, 2.81769064140214,
2.57165695575711, 2.97592292304521, 3.18174081921005, 3.54312301316704,
2.70447719350618, 3.48454089015539, 3.39666701335652, 3.03088932872189,
3.1057376517166, 2.91083893666025, 3.18752169045788, 3.04054322208808,
3.04284811683015, 3.53376439846743, 3.57155887085371, 2.67921235204479,
3.24539585432457, 3.32270430796322, 3.75933211625452, 3.30303225771367,
2.94140225772847, 3.22916966186489, 3.45512223500913, 2.89996056576201,
3.19536565883228, 2.49108662931588, 2.55337036896523, 2.98316003461686,
3.58241577241437, 3.40385600372579, 3.66136967423154, 3.71807222845311,
3.73004186004765, 4.10988004656572, 3.90759927253415, 2.86608298949975,
3.61450793458081, 3.85162032119424, 4.44992983828838, 3.19109366840847,
3.09329595776341, 3.69955310870145, 4.47202033690943, 3.61326633240611,
3.64532602062922, 3.33230174866167, 2.74653680127074, 3.61473897523957
), SEX = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("F", "M"), class = "factor"), S_M = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("AFTER",
"BEFORE"), class = "factor"), ID = structure(c(43L, 40L, 25L,
17L, 1L, 20L, 4L, 13L, 45L, 32L, 28L, 5L, 14L, 21L, 44L, 9L,
16L, 42L, 18L, 35L, 22L, 10L, 8L, 36L, 37L, 15L, 19L, 43L, 40L,
25L, 17L, 1L, 20L, 4L, 13L, 45L, 32L, 28L, 5L, 14L, 21L, 44L,
9L, 16L, 42L, 18L, 35L, 22L, 10L, 8L, 36L, 37L, 15L, 19L, 47L,
46L, 34L, 38L, 29L, 41L, 33L, 26L, 23L, 27L, 24L, 11L, 7L, 3L,
6L, 12L, 30L, 39L, 2L, 31L, 47L, 46L, 34L, 38L, 29L, 41L, 33L,
26L, 23L, 27L, 24L, 11L, 7L, 3L, 6L, 12L, 30L, 39L, 2L, 31L), .Label = c("BLA1",
"BLA10", "BLA14", "BLA16", "BLA17", "BLA2", "BLA20", "BLA202",
"BLA203", "BLA205", "BLA21", "BLA211", "BLA213", "BLA214", "BLA215",
"BLA216", "BLA217", "BLA219", "BLA221", "BLA224", "BLA228", "BLA23",
"BLA238", "BLA24", "BLA248", "BLA25", "BLA27", "BLA270", "BLA283",
"BLA294", "BLA296", "BLA300", "BLA307", "BLA31", "BLA33", "BLA36",
"BLA38", "BLA42", "BLA47", "BLA48", "BLA5", "BLA53", "BLA60",
"BLA61", "BLA74", "BLA79", "BLA80"), class = "factor")), class = "data.frame", row.names = c(NA,
-94L))
code for lmer
#Mg
Mg_model <- lmer(Mg_new ~ SEX * S_M + (1|ID), data=data)
summary(Mg_model)
library(lmtest)
bptest(Mg_model)
error
Error: $ operator not defined for this S4 class
The Breusch-Pagan test "fits a linear regression model to the residuals of a linear regression model ... By default the same explanatory variables are taken as in the main regression model".
The version in base R "works" for lm and glm models, but I wouldn't trust it for glm models — as far as I know the test doesn't apply, it's just that the generic functions it uses also work for glm objects. (Contrary to your question, it throws an error for glmer fits - maybe you meant to say glm?)
I don't know offhand if the B-P test has been extended to cover the LMM case. If you had continuous predictors it would be tricky, but as you only have factors you can use a Levene's test as in this answer:
library(lme4)
library(broom.mixed)
library(ggplot2)
Mn_model <- lmer(Mn_new ~ SEX * S_M + (1|ID), data=data)
aa <- augment(Mn_model, .data = data)
ggplot(aa, aes(x = interaction(S_M,SEX), y = .resid)) + geom_boxplot()
car::leveneTest(.resid ~ S_M*SEX, data = aa)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 3 2.271 0.08566 .
## 90

labelling of ordered factor variable

I am trying to produce a univariate output table using the gtsummary package.
structure(list(id = 1:10, age = structure(c(3L, 3L, 2L, 3L, 2L,
2L, 2L, 1L, 1L, 1L), .Label = c("c", "b", "a"), class = c("ordered",
"factor")), sex = structure(c(2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 2L), .Label = c("F", "M"), class = "factor"), country = structure(c(1L,
1L, 1L, 1L, 3L, 3L, 3L, 2L, 2L, 2L), .Label = c("eng", "scot",
"wale"), class = "factor"), edu = structure(c(1L, 1L, 1L, 2L,
2L, 2L, 3L, 3L, 3L, 3L), .Label = c("x", "y", "z"), class = "factor"),
lungfunction = c(45L, 23L, 25L, 45L, 70L, 69L, 90L, 50L,
62L, 45L), ivdays = c(15L, 26L, 36L, 34L, 2L, 4L, 5L, 8L,
9L, 15L), no2 = c(40L, 70L, 50L, 60L, 30L, 25L, 80L, 89L,
10L, 40L), pm25 = c(15L, 20L, 36L, 48L, 25L, 36L, 28L, 15L,
25L, 15L)), row.names = c(NA, 10L), class = "data.frame")
...
library(gtsummary)
publication_dummytable1_sum %>%
select(sex,age,lungfunction,ivdays) %>%
tbl_uvregression(
method =lm,
y = lungfunction,
pvalue_fun = ~style_pvalue(.x, digits = 3)
) %>%
add_global_p() %>% # add global p-value
bold_p() %>% # bold p-values under a given threshold
bold_labels()
...
When I run this code I get the output below. The issue is the labeling of the ordered factor variable (age). R chooses its own labeling for the ordered factor variable. Is it possible to tell R not to choose its own labeling for ordered factor variables?
I want output like the following:
Like many other people, I think you might be misunderstanding the meaning of an "ordered" factor in R. All factors in R are ordered, in a sense; the estimates etc. are typically printed, plotted, etc. in the order of the levels vector. Specifying that a factor is of type ordered has two major effects:
it allows you to evaluate inequalities on the levels of the factor (e.g. you can filter(age > "b"))
the contrasts are set by default to orthogonal polynomial contrasts, which is where the L (linear) and Q (quadratic) labels come from: see e.g. this CrossValidated answer for more details.
If you want this variable treated in the same way a regular factor (so that the estimates are made for differences of groups from the baseline level, i.e. treatment contrasts), you can:
convert back to an unordered factor (e.g. factor(age, ordered=FALSE))
specify that you want to use treatment contrasts in your model (in base R you would specify contrasts = list(age = "contr.treatment"))
set options(contrasts = c(unordered = "contr.treatment", ordered = "contr.treatment")) (the default for ordered is "contr.poly")
If you have an unordered ("regular") factor and the levels are not in the order you want, you can reset the level order by specifying the levels explicitly, e.g.
mutate(across(age, factor,
levels = c("0-10 years", "11-20 years", "21-30 years", "30-40 years")))
R sets the factors in alphabetical order by default, which is sometimes not what you want (but I can't think of a case where the order would be 'random' ...)
The easiest way to remove the odd labelling for the ordered variables, is to remove the ordered class from these factor variables. Example below!
library(gtsummary)
library(tidyverse)
packageVersion("gtsummary")
#> [1] '1.4.2'
publication_dummytable1_sum <-
structure(list(id = 1:10, age = structure(c(3L, 3L, 2L, 3L, 2L,
2L, 2L, 1L, 1L, 1L), .Label = c("c", "b", "a"), class = c("ordered",
"factor")), sex = structure(c(2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 2L), .Label = c("F", "M"), class = "factor"), country = structure(c(1L,
1L, 1L, 1L, 3L, 3L, 3L, 2L, 2L, 2L), .Label = c("eng", "scot",
"wale"), class = "factor"), edu = structure(c(1L, 1L, 1L, 2L,
2L, 2L, 3L, 3L, 3L, 3L), .Label = c("x", "y", "z"), class = "factor"),
lungfunction = c(45L, 23L, 25L, 45L, 70L, 69L, 90L, 50L,
62L, 45L), ivdays = c(15L, 26L, 36L, 34L, 2L, 4L, 5L, 8L,
9L, 15L), no2 = c(40L, 70L, 50L, 60L, 30L, 25L, 80L, 89L,
10L, 40L), pm25 = c(15L, 20L, 36L, 48L, 25L, 36L, 28L, 15L,
25L, 15L)), row.names = c(NA, 10L), class = "data.frame") |>
as_tibble()
# R labels the order factors like this in lm()
lm(lungfunction ~ age, publication_dummytable1_sum)
#>
#> Call:
#> lm(formula = lungfunction ~ age, data = publication_dummytable1_sum)
#>
#> Coefficients:
#> (Intercept) age.L age.Q
#> 51.17 -10.37 -15.11
tbl <-
publication_dummytable1_sum %>%
# remove ordered class
mutate(across(where(is.ordered), ~factor(., ordered = FALSE))) %>%
select(sex,age,lungfunction,ivdays) %>%
tbl_uvregression(
method =lm,
y = lungfunction,
pvalue_fun = ~style_pvalue(.x, digits = 3)
)
Created on 2021-07-22 by the reprex package (v2.0.0)

Subsetting a function variable within another variable

I have the following function that performs a step-wise linear regression, and it works well with numerical and integer values, although, when I have factors as independent variables, I get the following error:
Error in [.data.frame(d, , names(resul0)) : undefined columns selected
The layout of the function:
stepfor(bird$Richness, data.frame(GARDENSIZE, Site, season), alfa = 0.2)
I have figured out a way that splits the factors into columns and assigns them respective values following the comments, given by this:
x <- function(x) {x %>%
select(where(negate(is.numeric))) %>%
map_dfc(~ model.matrix(~ .x -1) %>%
as_tibble) %>%
rename_all(~ str_remove(., "\\.x"))
}
Though, I'm not sure how I can include it into the function below, so that x can be implemented with the function below by calling it stepfor likeso:
stepfor(bird$Richness, data.frame(x(bird)), alfa = 0.2)
I just want to know how to include the function x within the function below to have it work like above. And if there aren't any factors in the data, then set the function as FALSE so it doesn't return an error like x is missing.
Here is my function:
stepfor<-function (y = y, d = d, alfa = 0.05)
{
pval <- NULL
design <- NULL
j = 1
resul0 <- summary(lm(y ~ ., data = d))$coefficients[, 4]
d <- as.data.frame(d[, names(resul0)][-1])
for (i in 1:ncol(d)) {
sub <- cbind(design, d[, i])
sub <- as.data.frame(sub)
lm2 <- lm(y ~ ., data = sub)
result <- summary(lm2)
pval[i] <- result$coefficients[, 4][j + 1]
}
min <- min(pval)
while (min < alfa) {
b <- pval == min
c <- c(1:length(pval))
pos <- c[b]
pos <- pos[!is.na(pos)][1]
design <- cbind(design, d[, pos])
design <- as.data.frame(design)
colnames(design)[j] <- colnames(d)[pos]
j = j + 1
d <- as.data.frame(d[, -pos])
pval <- NULL
if (ncol(d) != 0) {
for (i in 1:ncol(d)) {
sub <- cbind(design, d[, i])
sub <- as.data.frame(sub)
lm2 <- lm(y ~ ., data = sub)
result <- summary(lm2)
pval[i] <- result$coefficients[, 4][j + 1]
}
min <- min(pval, na.rm = TRUE)
}
else min <- 1
}
if (is.null(design)) {
lm1 <- lm(y ~ 1)
}
else {
lm1 <- lm(y ~ ., data = design)
}
return(lm1)
}
Reproducible code:
bird<- structure(list(season = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
1L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L), .Label = c("Summer",
"Winter"), class = "factor"), Richness = c(20L, 17L, 18L, 19L,
11L, 15L, 17L, 15L, 15L, 9L, 13L, 14L, 12L, 18L, 30L, 30L, 17L,
25L, 32L, 32L, 29L, 29L, 27L, 18L, 25L, 24L, 15L, 18L, 23L, 22L,
25L, 22L, 22L, 23L, 17L, 22L, 7L, 15L, 16L, 20L, 24L, 21L, 22L,
39L, 17L, 17L, 13L, 26L, 25L, 20L), GARDEN_SIZE = structure(c(1L,
1L, 2L, 2L, 3L, 3L, 2L, 2L, 2L, 2L, 3L, 3L, 2L, 3L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 3L, 3L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 3L, 3L, 2L, 2L,
1L), .Label = c("L", "M", "S"), class = "factor"), Site = structure(c(1L,
1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("R", "S", "U"), class = "factor")), row.names = c(NA,
50L), class = "data.frame")
Consider this:
stepfor<-function (y = y, d = d, alfa = 0.05)
{
# split the incoming data to give non-numeric the factor treatment:
x1 <- d %>% select(where(negate(is.numeric))) %>%
map_dfc(~ model.matrix(~ .x -1) %>%
as_tibble) %>%
rename_all(~ str_remove(., "\\.x"))
x2 <- d %>% select(where(is.numeric))
d <- cbind( x1, x2 )
pval <- NULL
design <- NULL
j = 1
resul0 <- summary(lm(y ~ ., data = d))$coefficients[, 4][-1]
d <- as.data.frame(d[, names(resul0)])
# rest of function body as is
}
eg. move the [-1] from the 5th line to the 4th to remove the intercept term earlier. The reamining coefficients shouldnow match the ones you have in your data.frame and names(resul0) should all exist in your data.frame
There is a problem with your approach to tackle this. You do:
d <- as.data.frame( d[, names(resul0) ] [-1] )
This code tries to look up all of the names(resul0) inside the d data.frame. This includes the intercept term, and this therefore fails. (And at this point its too late to remove the intercept afterwards as the damage has already been done)
You need to remove the intercept before looking up the names inside d. Then the name-error won't happen.
The body of the x function can be inserted in there, quite straight forward.

Comparing two groups with the linear model

I want to do a regression when parendiv is my Dependent variable and routine1997 is my Independent variable, and compare males to females. The code is like this:
structure(list(gender = structure(c(2L, 1L, 2L, 1L, 2L, 1L, 1L,
2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L), .Label = c("male",
"female"), class = "factor"), parent = structure(c(2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = c("intact", "parentaldivorce"), class = "factor"),
routine = structure(c(1L, 1L, 1L, 1L, NA, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 1L, 2L, 3L, 2L, 1L, 3L, 3L), .Label = c("Med",
"High", "Low"), class = "factor")), row.names = c(3L, 5L,
6L, 7L, 8L, 9L, 10L, 11L, 16L, 18L, 19L, 21L, 22L, 23L, 24L,
25L, 28L, 29L, 30L, 34L), class = "data.frame")
This is the code and I want to specifically compare coefficient among men and women.
lm(parent~routine, data=nlsy97, subset=gender)
There are two ways to compare the coefficients.
The easiest way would be to code gender as dummy (0/1) and include an interaction term in the model. Then, you get the difference the gender makes for the coefficient, complete with a p-value:
out = lm(parent ~ routine + gender + routine*gender, data=nlsy97)
The other way would be to use a multigroup regression and comparing the pooled regression model (all genders included) with the unpooled models (seperate slopes or intercepts or both for genders). The model with the smallest AIC fits the data best. If your random slope model yields the lowest AIC, you have gender differences in your effect. If the random intercept is best, you just have level differences between the genders but may assume equal effects.
library(lme4)
pooled = lm(parent ~ routine, data=nlsy97)
r.inter = lmer(parent ~ routine + (1|gender), data=nlsy97)
r.slope = lmer(parent ~ routine + (routine|gender), data=nlsy97)
r.unpooled = lmer(parent ~ routine + (1+routine|gender), data=nlsy97)
AIC(pooled)
AIC(r.inter)
AIC(r.slope)
AIC(r.unpooled)
Using the method coefficients() on the model with the lowest AIC provides you with the exact coefficients for the individual groups.
EDIT: I just noticed that you just have 20 cases in total. If this is your whole dataset you should probably not do any statistical analyses at all.

ggplot2 loop graph with conditional subsets

Data description:
I have a data set that is in long format with multiple different grouping variables (in data example: StandID and simID)
What I am trying to do:
I need to create simple scatter plots (x=predicted, y=observed) from this dataset for multiple columns based on a unique grouping variable.
An example of what I am trying to do using just standard plot is
obs=subset(example,simID=="OBS_OBS_OBS")
csfnw=example[example$simID== "CS_F_NW",]
plot(obs$X1HR,csfnw$X1HR)
I would need to do this for all simID and columns 9-14. (12 graphs total from data example)
What I have tried:
The problem I am running into is the y axis needs to remain the same, while cycling through the different subsets for the x axis.
I will admit up front, I have no idea what would be the best approach for this... I thought this would be easy for a split second because the data is already in long format and I would just be pointing to a subset of the data.
1) My original approach was to try and just splice up the data so that each simID had its own data frame, and compare it against the observation dataframe but I don't know how I would then pass it to ggplot.
2) My second idea was to make some kind of makeGraph function containing all the aesthetics I wanted essentially and use some kind of apply on it to pass everything through the function, but I could get neither to work.
makePlot=function(dat,x,y) {
ggplot(data=dat,aes(x=x,y=y))+geom_point(shape=Treat)+theme_bw()
}
What I could get to work was just breaking down the dataframe into the vectors of the variables I would then pass to some kind of loop/apply
sims=levels(example$simID)
sims2=sims[sims != "OBS_OBS_OBS"]
fuel_classes=colnames(example)[9:14]
Thank you
Data example:
example=structure(list(Year = structure(c(7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L), .Label = c("2001", "2002", "2003", "2004", "2005",
"2013", "2014", "2015"), class = "factor"), StandID = structure(c(10L,
2L, 6L, 22L, 14L, 18L, 34L, 26L, 30L, 10L, 2L, 6L, 22L, 14L,
18L, 34L, 26L, 30L, 10L, 2L, 6L, 22L, 14L, 18L, 34L, 26L, 30L
), .Label = c("1NB", "1NC", "1NT", "1NTB", "1RB", "1RC", "1RT",
"1RTB", "1SB", "1SC", "1ST", "1STB", "2NB", "2NC", "2NT", "2NTB",
"2RB", "2RC", "2RT", "2RTB", "2SB", "2SC", "2ST", "2STB", "3NB",
"3NC", "3NT", "3NTB", "3RB", "3RC", "3RT", "3RTB", "3SB", "3SC",
"3ST", "3STB"), class = "factor"), Block = structure(c(1L, 1L,
1L, 2L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L,
1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L), .Label = c("1", "2", "3"
), class = "factor"), Aspect = structure(c(3L, 1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L, 1L, 2L, 3L, 1L, 2L), .Label = c("N", "R", "S"), class = "factor"),
Treat = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), .Label = c("B", "C", "T", "TB"), class = "factor"),
Variant = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("CS", "OBS", "SN"), class = "factor"),
Fuels = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), .Label = c("F", "NF", "OBS"), class = "factor"),
Weather = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("NW", "OBS", "W"), class = "factor"),
X1HR = c(0.321666667, 0.177777778, 0.216111111, 0.280555556,
0.255555556, 0.251666667, 0.296666667, 0.231111111, 0.22,
0.27556628, 0.298042506, 0.440185249, 0.36150676, 0.398630172,
0.367523015, 0.345717251, 0.349305987, 0.412227929, 0.242860824,
0.258737177, 0.394024998, 0.287317872, 0.321927488, 0.281322986,
0.313588411, 0.303123146, 0.383658946), X10HR = c(0.440555556,
0.32, 0.266666667, 0.292222222, 0.496666667, 0.334444444,
0.564444444, 0.424444444, 0.432777778, 0.775042951, 0.832148314,
1.08174026, 1.023838878, 0.976997674, 0.844206274, 0.929837704,
1.0527215, 1.089246511, 0.88642776, 0.920596302, 1.209707737,
1.083737493, 1.077612877, 0.92481339, 1.041637182, 1.149550319,
1.229776621), X100HR = c(0.953888889, 1.379444444, 0.881666667,
1.640555556, 2.321666667, 1.122222222, 1.907777778, 1.633888889,
1.208333333, 1.832724094, 2.149356842, 2.364475727, 2.493232965,
2.262988567, 1.903909683, 2.135747433, 2.256677628, 2.288722038,
1.997704744, 2.087135553, 2.524872541, 2.34671092, 2.338253498,
2.06796217, 2.176314831, 2.580271006, 2.857197046), X1000HR = c(4.766666667,
8.342222222, 3.803333333, 8.057777778, 10.11444444, 6.931111111,
6.980555556, 13.20611111, 1.853333333, 3.389177084, 4.915714741,
2.795267582, 2.48227787, 2.218413353, 1.64684248, 2.716156483,
2.913746119, 2.238629341, 3.449863434, 3.432626724, 3.617531776,
3.641639471, 3.453454971, 3.176793337, 3.459602833, 3.871166945,
2.683447838), LITTER = c(2.4, 2.219444444, 2.772222222, 2.596666667,
2.693888889, 2.226111111, 2.552222222, 3.109444444, 2.963333333,
2.882233381, 3.025934696, 3.174396992, 3.291081667, 2.897673607,
2.737119675, 2.987895727, 3.679605484, 2.769756079, 2.882241249,
3.02594161, 3.174404144, 3.291091681, 2.897681713, 2.737129688,
2.987901449, 3.679611444, 2.769766569), DUFF = c(1.483333333,
1.723888889, 0.901666667, 1.520555556, 1.49, 1.366111111,
0.551666667, 1.056111111, 0.786111111, 2.034614563, 2.349547148,
1.685223818, 2.301301956, 2.609308243, 2.21895647, 2.043699026,
2.142618418, 0.953421116, 4.968493462, 4.990526676, 5.012362003,
5.023665905, 4.974074364, 4.947199821, 4.976779461, 5.082509995,
3.55211544), simID = structure(c(5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L), .Label = c("CS_F_NW", "CS_F_W",
"CS_NF_NW", "CS_NF_W", "OBS_OBS_OBS", "SN_F_NW", "SN_F_W",
"SN_NF_NW", "SN_NF_W"), class = "factor")), .Names = c("Year",
"StandID", "Block", "Aspect", "Treat", "Variant", "Fuels", "Weather",
"X1HR", "X10HR", "X100HR", "X1000HR", "LITTER", "DUFF", "simID"
), row.names = c(37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L, 45L,
82L, 83L, 84L, 85L, 86L, 87L, 88L, 89L, 90L, 127L, 128L, 129L,
130L, 131L, 132L, 133L, 134L, 135L), class = "data.frame")
You were actually on the right track. If all plots are the same, just make one function and then use loops to loop over the subsets. For your example this can be done like this:
library(ggplot2)
# the plot function
plotFun = function(dat, title) {
ggplot(data=dat) +
geom_point(aes(x = x, y = y), shape=18) +
ggtitle(title) +
theme_bw()
}
# columns of interest
colIdx = 9:14
# split on all values of simID
dfList = split(example, example$simID)
# simID has never appearing factors. These are removed
dfList = dfList[lapply(dfList, nrow) != 0]
# make empty array for saving plots
plotList = array(list(), dim = c(length(dfList), length(dfList), length(colIdx)),
dimnames = list(names(dfList), names(dfList), names(example)[colIdx]))
# the first two loops loop over all unique combinations of dfList
for (i in 2:length(dfList)) {
for (j in 1:(i-1)) {
# loop over target variables
for (k in seq_along(colIdx)) {
# store variables to plot in a temporary dataframe
tempDf = data.frame(x = dfList[[i]][, colIdx[k]],
y = dfList[[j]][, colIdx[k]])
# add a title so we can see in the plot what is plotted vs what
title = paste0(names(dfList)[i], ":", names(dfList[[i]])[colIdx[k]], " VS ",
names(dfList)[j], ":", names(dfList[[j]])[colIdx[k]])
# make and save plot
plotList[[i, j, k]] = plotFun(tempDf, title)
}
}
}
# call the plots like this
plotList[[2, 1, 4]]
# Note that we only filled the lower triangle of combinations
# therefore indexing with [[1, 1, 1]] just returns NULL
plotList[, , 1]
This process can probably be more optimized, but when creating graphs I would go for clarity above speed since speed usually isn't an issue.

Resources