using predict() with a continuous by continuous interaction - r

I have a mixed model with an interaction of two continuous variables. I understand how to use predict() for a continuous by categorical interaction, but can't find any information on how to use predict() to generate graphs of continuous by continuous interactions. So far I have:
#the data
mydata<-structure(list(Week = c(3L, 3L, 6L, 6L, 5L, 6L, 3L, 1L, 4L, 3L,
1L, 2L, 6L, 6L, 6L, 6L), X2 = c(20.8, 21.4, 22.2, 21.9, 21, 21.8,
16.6, 15.6, 21.9, 19.8, 17.5, 12.5, 20.1, 20.5, 21.7, 22.3),
X1 = c(78L, 90L, 81L, 44L, 9L, 35L, 99L, 17L, 1L, 7L, 23L,
14L, 9L, 77L, 84L, 1L), Y = c(14.97469781, 19.88267242, 15.59780954,
9.633809968, 15.12038794, 10.43636012, 10.7436911, 16.71840387,
12.43274774, 10.90741585, 8.79514591, 14.1932374, 8.776376951,
9.995133069, 12.38314719, 9.611533444)), class = "data.frame", row.names = c(NA,
-16L))
#assigning 'Week' as a factor
mydata$Week<-as.factor(mydata$Week)
#the model
model1<-glmer(Y~X1*X2+(1|Week),data=mydata, family=Gamma(link='log'))
NEWDATA <-
expand.grid(
X1 = seq(1, 99, length = 100),
X2 = seq(12.5, 22.3, length = 100),
Week = levels(mydata$Week)
)
# Predict on the response scale so the values are comparable with the observed
# Y: predict() on a glmer fit returns the link (here log) scale by default.
# re.form = ~ (1 | Week) keeps the Week random intercept in the predictions.
PREDMASS <-
  predict(model1,
          newdata = NEWDATA,
          re.form = ~ (1 | Week),
          type = "response")
PREDSFRAME <- cbind(NEWDATA, PREDMASS)
head(PREDSFRAME)
If the interaction were between a continuous and a categorical variable, I would then use the code below, but this doesn't work:
ggplot(PREDSFRAME, aes(x = X1, y = PREDMASS)) +
geom_line() +
geom_point(data = mydata,
facet_grid(. ~ X2) +
aes(y = Y),
alpha = 0.3)
Any suggestions?

I think you actually want the facet_grid outside the geom_point() function. If you run it that way you won't get an error.
ggplot(PREDSFRAME, aes(x = X1, y = PREDMASS)) +
geom_line() +
geom_point(data = mydata,
aes(y = Y),
alpha = 0.3)+
facet_grid(. ~ X2)
But what you get is a grid of plots for every value of X2 (because it is continuous), which is also not what you want.
What you need to do is specify a few X2 values for which you want separate plots (or regression lines), since I am assuming (perhaps incorrectly) that you don't want to plot every possible combination (which is a 2D plane, as @HongOoi suggests).
I know you ask for a solution using predict, which perhaps you can solve with the above information, but I offer this "pre-made" solution from sjPlot that I find quick and helpful:
library(sjPlot)
plot_model(model1, type="pred",terms=c("X1","X2"))

Related

How to make a lineplot with specific values out of a dataframe

I have a df as follow:
Variable Value
G1_temp_0 37.9
G1_temp_5 37.95333333
G1_temp_10 37.98333333
G1_temp_15 38.18666667
G1_temp_20 38.30526316
G1_temp_25 38.33529412
G1_mean_Q1 38.03666667
G1_mean_Q2 38.08666667
G1_mean_Q3 38.01
G1_mean_Q4 38.2
G2_temp_0 37.9
G2_temp_5 37.95333333
G2_temp_10 37.98333333
G2_temp_15 38.18666667
G2_temp_20 38.30526316
G2_temp_25 38.33529412
G2_mean_Q1 38.53666667
G2_mean_Q2 38.68666667
G2_mean_Q3 38.61
G2_mean_Q4 38.71
I like to make a lineplot with two lines which reflects the values "G1_mean_Q1 - G1_mean_Q4" and "G2_mean_Q1 - G2_mean_Q4"
In the end it should more or less look like this, the x axis should represent the different variables:
The main problem I have is, how to get a basic line plot with this df.
I've tried something like this,
ggplot(df, aes(x = c(1:4), y = Value) + geom_line()
but I have always some errors. It would be great if someone could help me. Thanks
Please post your data with dput(data) next time. It makes it easier to read your data into R.
You need to tell ggplot which are the groups. You can do this with aes(group = Sample). For this purpose, you need to restructure your dataframe a bit and separate the Variable into different columns.
library(tidyverse)
dat <- structure(list(Variable = structure(c(5L, 10L, 6L, 7L, 8L, 9L,
1L, 2L, 3L, 4L, 15L, 20L, 16L, 17L, 18L, 19L, 11L, 12L, 13L,
14L), .Label = c("G1_mean_Q1", "G1_mean_Q2", "G1_mean_Q3", "G1_mean_Q4",
"G1_temp_0", "G1_temp_10", "G1_temp_15", "G1_temp_20", "G1_temp_25",
"G1_temp_5", "G2_mean_Q1", "G2_mean_Q2", "G2_mean_Q3", "G2_mean_Q4",
"G2_temp_0", "G2_temp_10", "G2_temp_15", "G2_temp_20", "G2_temp_25",
"G2_temp_5"), class = "factor"), Value = c(37.9, 37.95333333,
37.98333333, 38.18666667, 38.30526316, 38.33529412, 38.03666667,
38.08666667, 38.01, 38.2, 37.9, 37.95333333, 37.98333333, 38.18666667,
38.30526316, 38.33529412, 38.53666667, 38.68666667, 38.61, 38.71
)), class = "data.frame", row.names = c(NA, -20L))
# Keep only the "mean" rows, then split names such as "G1_mean_Q1" on "_"
# into the Sample / mean / time columns.
dat <- separate(
  filter(dat, str_detect(Variable, "mean")),
  Variable,
  into = c("Sample", "mean", "time"),
  sep = "_"
)
# One line per Sample; the group aesthetic tells ggplot which points to join.
g <- ggplot(data = dat, aes(x = time, y = Value, group = Sample)) +
  geom_line(aes(colour = Sample))
g
Created on 2020-07-20 by the reprex package (v0.3.0)

Labeling a single point with ggrepel

I am trying to use geom_label_repel to add labels to a couple of data points on a plot. In this case, they happen to be outliers on box plots. I've got most of the code working, I can label the outlier, but for some reason I am getting multiple labels (equal to my sample size for the entire data set) mapped to that point. I'd like just one label for this outlier.
Example:
Here is my data:
dput(sus_dev_data)
structure(list(time_point = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L), .Label = c("3", "8", "12"), class = "factor"),
days_to_pupation = c(135L, 142L, 143L, 155L, 149L, 159L,
153L, 171L, 9L, 67L, 53L, 49L, 72L, 67L, 55L, 64L, 60L, 122L,
53L, 51L, 49L, 53L, 50L, 56L, 44L, 47L, 60L)), row.names = c(1L,
2L, 3L, 4L, 5L, 6L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L,
17L, 18L, 20L, 21L, 22L, 23L, 24L, 26L, 27L, 28L, 29L, 30L), class = "data.frame")
and my code...
####################################################################################################
# Time to pupation statistical analysis
####################################################################################################
## linear model
pupation_Model=lm(sus_dev_data$days_to_pupation~sus_dev_data$time_point)
pupationANOVA=aov(pupation_Model)
summary(pupationANOVA)
# Tukey test to study each pair of treatment :
pupationTUKEY <- TukeyHSD(x=pupationANOVA, which = 'sus_dev_data$time_point',
conf.level=0.95)
## Function to generate significance labels on box plot
#
# Arguments:
#   pupationTUKEY  result of TukeyHSD()
#   variable       name of the model term whose comparison table is used
# Returns a data frame with a `Letters` column (compact letter display from
# multcompLetters(), multcompView package) and a `treatment` column holding
# the factor level, sorted by `treatment`.
generate_label_df <- function(pupationTUKEY, variable) {
  # Fourth column of the Tukey table is the adjusted p-value. Names are
  # restored explicitly because `[, 4]` on a one-row table (only two groups)
  # would drop the "a-b" comparison names that multcompLetters() relies on.
  comparisons <- pupationTUKEY[[variable]]
  Tukey.levels <- setNames(comparisons[, 4], rownames(comparisons))
  Tukey.labels <- data.frame(multcompLetters(Tukey.levels, reversed = TRUE)["Letters"])
  # Put the labels in the same order as the boxes in the boxplot.
  Tukey.labels$treatment <- rownames(Tukey.labels)
  Tukey.labels[order(Tukey.labels$treatment), ]
}
#generate labels using function
labels<-generate_label_df(pupationTUKEY , "sus_dev_data$time_point")
#rename columns for merging
names(labels)<-c('Letters','time_point')
# obtain letter position for y axis using the per-group maximum (not the mean)
pupationyvalue<-aggregate(.~time_point, data=sus_dev_data, max)
#merge dataframes (joined on the shared time_point column)
pupationfinal<-merge(labels,pupationyvalue)
####################################################################################################
# Time to pupation plot
####################################################################################################
# Plot of data
(pupation_plot <- ggplot(sus_dev_data, aes(time_point, days_to_pupation)) +
Alex_Theme +
geom_boxplot(fill = "grey80", outlier.size = 0.75) +
geom_text(data = pupationfinal, aes(x = time_point, y = days_to_pupation,
label = Letters),vjust=-2,hjust=.5, size = 4) +
#ggtitle(expression(atop("Days to pupation"))) +
labs(y = 'Days to pupation', x = 'Weeks post-hatch') +
scale_y_continuous(limits = c(0, 200)) +
scale_x_discrete(labels=c("3" = "13", "8" = "18",
"12" = "22")) +
geom_label_repel(aes(x = 1, y = 9),
label = '1')
)
Here's a shorter example to demonstrate what is going on. Essentially, your labels are being recycled to be the same length as the data.
df = data.frame(x=1:5, y=1:5)
ggplot(df, aes(x,y, color=x)) +
geom_point() +
geom_label_repel(aes(x = 1, y = 1), label = '1')
You can override this by providing new data for the ggrepel
ggplot(df, aes(x,y, color=x)) +
geom_point() +
geom_label_repel(data = data.frame(x=1, y=1), label = '1')
Based on your data, you have 3 outliers (one in each group), you can manually identify them by applying the classic definition of outliers by John Tukey (Upper: Q3+1.5*IQR and Lower: Q1-1.5*IQR) (but you are free to set your own rules to define an outlier). You can use the function quantile and IQR to get those points.
Here, I incorporated them in a sequence of pipe using dplyr package:
library(tidyverse)
# Within each time point, flag observations beyond the 1.5 * IQR fences
# (above Q3 or below Q1) and keep only the flagged rows.
Outliers <- sus_dev_data %>%
  group_by(time_point) %>%
  mutate(
    Out_up = ifelse(days_to_pupation > quantile(days_to_pupation,0.75)+1.5*IQR(days_to_pupation), "Out","In"),
    Out_Down = ifelse(days_to_pupation < quantile(days_to_pupation,0.25)-1.5*IQR(days_to_pupation), "Out","In")
  ) %>%
  filter(Out_up == "Out" | Out_Down == "Out")
# A tibble: 3 x 4
# Groups: time_point [3]
time_point days_to_pupation Out_up Out_Down
<fct> <int> <chr> <chr>
1 3 9 In Out
2 8 122 Out In
3 12 60 Out In
As mentioned by @dww, you need to pass a new dataframe to geom_label_repel if you want each outlier to get a single label. So, here we use the dataframe Outliers to feed the geom_label_repel function:
library(ggplot2)
library(ggrepel)
ggplot(sus_dev_data, aes(time_point, days_to_pupation)) +
#Alex_Theme +
geom_boxplot(fill = "grey80", outlier.size = 0.75) +
geom_text(data = pupationfinal, aes(x = time_point, y = days_to_pupation,
label = Letters),vjust=-2,hjust=.5, size = 4) +
#ggtitle(expression(atop("Days to pupation"))) +
labs(y = 'Days to pupation', x = 'Weeks post-hatch') +
scale_y_continuous(limits = c(0, 200)) +
scale_x_discrete(labels=c("3" = "13", "8" = "18",
"12" = "22")) +
geom_label_repel(inherit.aes = FALSE,
data = Outliers,
aes(x = time_point, y = days_to_pupation, label = "Out"))
And you get the following graph:
I hope it helps you figure out how to label all your outliers.

Add age adjustment to geom_smooth

I need to include age adjustment in the geom_smooth line I am adding to my ggscatter plot.
my data looks like~
table link
structure(list(Time = c(0L, 0L, 0L, 0L, 6L, 12L, 18L, 18L, 0L,
12L, 18L, 6L), group = structure(c(1L, 1L, 2L, 2L, 1L, 3L, 3L,
3L, 3L, 4L, 4L, 1L), .Label = c("A", "B", "C", "D"), class = "factor"),
Age = c(77, 70.2, 69.9, 65.7, 66.2, 66.7, 67.2, 67.7, 66.8,
67.8, 68.3, 68.8), Average = c(96L, 90L, 94L, 94L, 96L, 96L,
92L, 120L, 114L, 109L, 113L, 103L)), row.names = c(NA, 12L
), class = "data.frame")
What I currently have (the "Average" value depends on age):
ggscatter(dtable, "Time","Average",conf.int = TRUE)+theme_bw()+
geom_smooth(aes(group=1),method='lm')+facet_wrap(~groups)
What I would like to have is something like:
ggscatter(dtable, "Time","Average",conf.int = TRUE)+theme_bw()+
geom_smooth(aes(group=1),method='lm', adjust= ~age)+facet_wrap(~groups)
With adjustment per each group mean age
Any suggestions?
Here is I think what you are after.
First, we need to fit the more complicated model because ggplot does not have a functionality for multivariable models (yet)
fit <- lm(Average ~ Time + group + Age, data = tdata)
Then we can use some functionality from the broom package to add the predictions and associated standard errors. With these in hand we can manually build the plot using the geom_line and geom_ribbon geoms
library(broom)
# augment() already returns the model data alongside .fitted, so binding it
# back onto tdata would only duplicate the Time/Average/group/Age columns.
# se_fit = TRUE requests the .se.fit column (broom >= 0.7.0 — TODO confirm
# the installed version).
augment(fit, data = tdata, se_fit = TRUE) %>%
  ggplot(aes(Time, Average)) +
  geom_point() +
  geom_line(aes(x = Time, y = .fitted), size = 2, color = "blue") +
  # Band of fitted value -/+ 2 standard errors; ymin is the lower edge.
  geom_ribbon(aes(ymin = .fitted - .se.fit * 2, ymax = .fitted + .se.fit * 2), alpha = .2) +
  facet_wrap(~group) +
  theme_bw()
Additionally, if you wanted to look at pooled vs non-pooled estimates
# The non-pooled model includes group as a predictor; the pooled model omits it.
fit_no_pool <- lm(Average ~ Time + group + Age, data = tdata)
fit_complete_pool <- lm(Average ~ Time + Age, data = tdata)
library(broom)
tdata %>%
  # Prefix each model's augment() columns so both sets can sit next to tdata.
  # se_fit = TRUE requests .se.fit (broom >= 0.7.0 — TODO confirm version).
  bind_cols(augment(fit_no_pool, se_fit = TRUE) %>% setNames(sprintf("no_pool%s", names(.)))) %>%
  bind_cols(augment(fit_complete_pool, se_fit = TRUE) %>% setNames(sprintf("pool%s", names(.)))) %>%
  ggplot(aes(Time, Average)) +
  geom_point() +
  # Non-Pooled Estimates
  # The colour aesthetic carries the legend key; the actual colours are set
  # by name in scale_color_manual() so label and line cannot get out of step.
  geom_line(aes(x = Time, y = no_pool.fitted, color = "Non-Pooled"), size = 2) +
  geom_ribbon(aes(ymin = no_pool.fitted - no_pool.se.fit * 2,
                  ymax = no_pool.fitted + no_pool.se.fit * 2), alpha = .2) +
  # Pooled Estimates
  geom_line(aes(x = Time, y = pool.fitted, color = "Pooled"), size = 2) +
  geom_ribbon(aes(ymin = pool.fitted - pool.se.fit * 2,
                  ymax = pool.fitted + pool.se.fit * 2), alpha = .2) +
  facet_wrap(~group) +
  scale_color_manual(name = "Regression",
                     values = c("Non-Pooled" = "blue", "Pooled" = "orange")) +
  theme_bw()
One way to go is to run your model with Age as an additional predictor in your model. then use predict to get the predicted value with CIs. Append to your data then use ggplot to plot. I know you want to facet by group, so it might be worth putting it into your model as well. Just a thought. The steps would be the same.
df <- structure(list(Time = c(0L, 0L, 0L, 0L, 6L, 12L, 18L, 18L, 0L,
12L, 18L, 6L), group = structure(c(1L, 1L, 2L, 2L, 1L, 3L, 3L,
3L, 3L, 4L, 4L, 1L), .Label = c("A", "B", "C", "D"), class = "factor"),
Age = c(77, 70.2, 69.9, 65.7, 66.2, 66.7, 67.2, 67.7, 66.8,
67.8, 68.3, 68.8), Average = c(96L, 90L, 94L, 94L, 96L, 96L,
92L, 120L, 114L, 109L, 113L, 103L)), row.names = c(NA, 12L
), class = "data.frame")
#model adjusted for age
mod <- lm(Average ~ Time + Age, data = df)
#get fitted values with confidence intervals (columns fit, lwr, upr);
#the argument is spelled out in full instead of relying on partial matching
premod <- predict(mod, interval = "confidence")
#append to data
df2 <- cbind(df, premod)
#add prediction to ggplot with scatter plot
ggplot(df2) +
  geom_point(aes(x = Time, y = Average)) +
  geom_line(aes(x = Time, y = fit)) +
  geom_ribbon(aes(x = Time, ymin = lwr, ymax = upr), alpha = .1) +
  facet_wrap(~group) +
  theme_bw()

Multiple Regression - Error in model.frame.default variable lengths differ

I'm trying to run a multiple regression with 3 independent variables, and 3 dependent variables. The question is based on how water quality influences plankton abundance in and between 3 different locations aka guzzlers. With water quality variables being pH, phosphates, and nitrates. Dependent/response variables would be the plankton abundance in each 3 locations.
Here is my code:
model1 <- lm(cbind(Abundance[Guzzler.. == 1], Abundance[Guzzler.. == 2],
Abundance[Guzzler.. == 3]) ~ Phospates + Nitrates + pH,
data=WQAbundancebyGuzzler)
And this is the error message I am getting:
Error in model.frame.default(formula = cbind(Abundance[Guzzler.. == 1], :
variable lengths differ (found for 'Phospates')
I think it has to do with how my data is set up but I'm not sure how to go about changing this to get the model to run. What I'm trying to see is how these water quality variables are affecting the abundance in the different locations and how they vary between. So it doesn't seem quite logical to try multiple models which was my only other thought.
Here is the output from dput(head(WQAbundancebyGuzzler)):
structure(list(ï..Date = structure(c(2L, 4L, 1L, 3L, 5L, 2L), .Label = c("11/16/2018",
"11/2/2018", "11/30/2018", "11/9/2018", "12/7/2018"), class = "factor"),
Guzzler.. = c(1L, 1L, 1L, 1L, 1L, 2L), Phospates = c(2L,
2L, 2L, 2L, 2L, 1L), Nitrates = c(0, 0.3, 0, 0.15, 0, 0),
pH = c(7.5, 8, 7.5, 7, 7, 8), Air.Temp..C. = c(20.8, 25.4,
20.9, 16.8, 19.4, 27.4), Relative.Humidity... = c(62L, 31L,
41L, 59L, 59L, 43L), DO2.Concentration..mg.L. = c(3.61, 4.48,
3.57, 5.65, 2.45, 5.86), Water.Temp..C. = c(14.1, 11.5, 11.8,
13.9, 11.1, 17.8), Abundance = c(98L, 43L, 65L, 55L, 54L,
29L)), .Names = c("ï..Date", "Guzzler..", "Phospates", "Nitrates",
"pH", "Air.Temp..C.", "Relative.Humidity...", "DO2.Concentration..mg.L.",
"Water.Temp..C.", "Abundance"), row.names = c(NA, 6L), class = "data.frame")
I think the problem here is more theoretical: You say that you have three dependent variables that you want to enter into a multiple linear regression. However, at least in classic linear regression, there can only be one dependent variable. There might be ways around this, but I think in your case, one dependent variable works just fine: It's `Abundance`. Now, you have sampled three different locations: One solution to account for this could be to just enter the location as a categorical independent variable. So I would propose the following model:
# Make sure that Guzzler is not treated as numeric.
# The source column is named "Guzzler.." in the data; it is referenced by its
# full name so the code does not depend on `$` partial matching.
WQAbundancebyGuzzler$Guzzler <- as.factor(WQAbundancebyGuzzler[["Guzzler.."]])
# Model with 4 independent variables
model1 <- lm(Abundance ~ Guzzler + Phospates + Nitrates + pH,
             data = WQAbundancebyGuzzler)
It's probably also wise to think about possible interactions here, especially between Guzzler and the other independent variables.
The reason for your error is that you subset only "Abundance" but not the other variables, so their lengths differ. You need to subset the whole data, e.g.
lm(Abundance ~ Phospates + Nitrates + pH,
data=WQAbundancebyGuzzler[WQAbundancebyGuzzler$Abundance %in% c(1, 2, 3), ])
With given head(WQAbundancebyGuzzler)
lm(Abundance ~ Phospates + Nitrates + pH,
data=WQAbundancebyGuzzler[WQAbundancebyGuzzler$Abundance %in% c(29, 43, 65), ])
results in
# Call:
# lm(formula = Abundance ~ Phospates + Nitrates + pH, data = WQAbundancebyGuzzler
# [WQAbundancebyGuzzler$Abundance %in%
# c(29, 43, 65), ])
#
# Coefficients:
# (Intercept) Phospates Nitrates pH
# -7.00 36.00 -73.33 NA

arrange by two factors

I have been trying the whole day to arrange two factor levels called "type" and "name" by a numeric value called "score", and plot by category type (with color determined by type) ordered by score. I am also trying to get the group called "ALL" on top so it is separated by the other 3 categories in "type". My attempts until now have been very unsuccessful, I don't get why I can't even get the reordering correctly. Any help is very appreciated.
This is my data:
df = structure(list(score = c(12, 12.2, 12.5, 12.3, 12.2, 12.4, 12.5, 12.7, 12.1, 12.8, 12.4, 12.3, 12.2, 12.6, 12.8, 12.1, 12.5), range1 = c(0.003356, 1.20497, -0.128138, -42.6093, -41.1975, -44.706, -20, -46.4245, -0.543379, 2.09828, -20, -20, -44.2262, -46.6559, -20, -20, 2.37709), point = c(1.56805, 2.11176, 0.1502, -22.6093, -21.1975, -24.706, -0.491829, -26.4245, 2.49973, 2.94457, 0.0443572, 0.0208999, -24.2262, -26.6559, 2.69408, 3.22951, 3.33255), range2 = c(2.3767, 2.73239, 0.430373, 4.34247, 4.96875, 3.78027, 1.91331, 4.07937, 3.54538, 3.5491, 1.87162, 2.41067, 5.26578, 4.50965, 4.55967, 5.05772, 3.97742), type = structure(c(1L, 1L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), .Label = c("ALL", "A", "B", "C"), class = "factor"), name = structure(c(13L, 14L, 15L, 1L, 4L, 5L, 6L, 8L, 12L, 17L, 2L, 3L, 7L, 9L, 10L, 11L, 16L), .Label = c("B_vision1", "C_vision2", "C_vision3", "B_vision4", "B_vision5", "A_vision2", "C_vision4", "B_vision6", "C_vision6", "C_vision5", "C_vision1", "B_vision7", "B_ALL", "C_ALL", "A", "C_vision7", "B_vision3"), class = "factor")), .Names = c("score", "range1", "point", "range2", "type", "name"), row.names = c(NA, -17L), class = "data.frame")
I have tried all these options:
df$name2 = reorder(df$name, -df$score)
# df$name <- reorder(df$name, -df$score)
df <- transform(df, category2 = factor(paste(name, type)))
df <- transform(df, category2 = reorder(category2, score))
#library(plyr)
#df = arrange(df,type, name)
ggplot(df, aes(x=name, y=point, ymin=range1, ymax=range2, colour=type)) +
geom_pointrange() +
coord_flip()
or
ggplot(df, aes(x=category2, y=point, ymin=range1, ymax=range2, colour=type)) +
geom_pointrange() +
coord_flip()
I am trying to get something similar to the grouped forest plot on this question but with each group defined by names and reordered by score.
I think I've interpreted what you're trying to do correctly, but I might have got it wrong.
The names (and scores) can be ordered by the sorted list of scores as
# Names and scores, both arranged by increasing score.
ordered.names <- as.character(df$name[order(df$score)])
ordered.scores <- as.character(df$score[order(df$score)])
Re-ordering the name levels (with the score annotated) is then
df$name <- factor(df$name, levels=ordered.names, labels=paste(ordered.names, "(", ordered.scores, ")"))
Plotting these with ggplot:
# The package is called ggplot2; library(ggplot) fails with "no package".
library(ggplot2)
# Point with range1..range2 interval per name, coloured by type.
ggplot(df, aes(x = name, y = point, ymin = range1, ymax = range2, group = type, color = type)) +
  geom_pointrange() +
  theme(axis.text.x = element_text(angle = 90, hjust = 0))
produces
If you want this split up by type as well, you can facet the plot
# Same plot, one panel per type; each panel keeps only its own names on x.
ggplot(df, aes(x = name, y = point, ymin = range1, ymax = range2, group = type, color = type)) +
  geom_pointrange() +
  theme(axis.text.x = element_text(angle = 90, hjust = 0)) +
  # `scales` is spelled out in full rather than relying on partial matching.
  facet_wrap(~type, ncol = 4, scales = "free_x")

Resources