Plotting proper mixed models regression slope - r

I have a data set to which I am fitting a mixed models regression with lme4.
dat <- structure(list(dv280 = c(41L, 68L, 0L, 6L, 20L, 30L, 8L, 1L,
15L, NA, 59L, 5L, 21L, 41L, 11L, 14L, -2L, 20L, 25L, 33L, 32L,
30L, 68L, 16L, 11L, -1L, 8L, 0L), v0 = c(55L, 90L, 30L, 23L,
74L, 48L, 25L, 25L, 46L, NA, 60L, 69L, 55L, 41L, 34L, 41L, 53L,
76L, 72L, 64L, 34L, 37L, 75L, 21L, 26L, 14L, 24L, 19L), treatment = structure(c(2L,
1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L,
2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L), .Label = c("hc",
"nhc"), class = "factor"), cse = structure(c(2, 2, 6, 6, -4,
-4, 5, 5, NA, NA, -4, -4, -3, -3, -2, -2, 3, 3, 2, 2, -4, -4,
-7, -7, 4, 4, 2, 2), .Dim = c(28L, 1L)), pp = structure(c(1L,
1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 7L, 7L, 8L, 8L, 9L,
9L, 10L, 10L, 11L, 11L, 12L, 12L, 13L, 13L, 14L, 14L), .Label = c("j.1",
"j.3", "j.6", "j.11", "j.13", "j.16", "j.17", "j.18", "j.19",
"j.22", "j.24", "j.30", "j.32", "j.36"), class = "factor")), .Names = c("dv280",
"v0", "treatment", "cse", "pp"), row.names = c(NA, 28L), class = "data.frame")
head(dat)
require(lme4)
m <- lmer(dv280 ~ 1 + v0:treatment + cse + (0 + v0 | pp), data=dat, REML=TRUE)
summary(m)
Next I would like to plot the resulting fit, so using the example code from http://glmm.wikidot.com/faq I generated predictions for the original values.
newdat <- data.frame(
v0=dat$v0,
treatment=dat$treatment,
cse=dat$cse,
dv280=0)
newdat <- newdat[-c(9,10),]
mm <- model.matrix(terms(m), newdat)
newdat$dv280 <- mm %*% fixef(m)
pvar1 <- diag(mm %*% tcrossprod(vcov(m), mm))
tvar1 <- pvar1 + VarCorr(m)$pp[1]
newdat <- data.frame(newdat, plo=newdat$dv280 - 2 * sqrt(pvar1),
phi=newdat$dv280 + 2 * sqrt(pvar1),
tlo=newdat$dv280 - 2 * sqrt(tvar1),
thi=newdat$dv280 + 2 * sqrt(tvar1))
This is easy to plot with ggplot:
p <- ggplot(data=newdat, mapping=aes(x=v0, y=dv280, colour=treatment)) +
geom_point() +
geom_smooth(method='lm', se=TRUE) +
scale_colour_discrete(guide=guide_legend(title.position='left', title.hjust=1))
p + .mytheme + coord_cartesian(xlim=c(-20,100)) +
geom_hline(yintercept=0, colour='gray35', linetype='dashed') +
geom_vline(xintercept=0, colour='gray35', linetype='dashed')
But as the image shows, the intercepts of the regression lines plotted by ggplot are off compared to the estimated intercept of the regression; at least this is the case for the nhc line which clearly goes to a negative intercept whereas the common estimated intercept is 0.54.
My mistake is easy (I presume): using geom_smooth results in a different fit than what the regression said given that geom_smooth takes the data separately per treatment and also at face value without the context of the fitted model. But I am at a loss how to get the lines properly plotted.

You need to consider the effect of cse. If I set cse=0 in newdat, the lines go through the intercept:
p <- ggplot(data=newdat, mapping=aes(x=v0, y=dv280, colour=treatment)) +
geom_point() +
geom_smooth(method='lm', se=TRUE, fullrange=TRUE) +
scale_colour_discrete(guide=guide_legend(title.position='left', title.hjust=1))
p + theme_bw() + coord_cartesian(xlim=c(-1,1), ylim=c(0.4,0.65)) +
geom_hline(yintercept=0, colour='gray35', linetype='dashed') +
geom_vline(xintercept=0, colour='gray35', linetype='dashed')
Note that it is normally not advisable to include an interaction in the model without the corresponding main effects.
Edit:
According to your comments you might want something like this (with your original newdat):
p <- ggplot(data=newdat, mapping=aes(x=v0, y=dv280, ymin=tlo, ymax=thi, colour=treatment, fill=treatment)) +
geom_ribbon(alpha=0.2, aes(colour=NULL)) +
geom_point() +
geom_line() +
scale_colour_discrete(guide=guide_legend(title.position='left', title.hjust=1))
p + theme_bw() +
geom_hline(yintercept=0, colour='gray35', linetype='dashed') +
geom_vline(xintercept=0, colour='gray35', linetype='dashed')
However, this plot doesn't seem very useful (in particular the lines between points) as it only depicts the projection on the dv280-v0-plane and you can't even see the linear relationship this way.

Related

starting values for nls() to solve singular gradient error in R

I am attempting to fit nls() for 520 users to achieve the coefficients of the exponential fitting. The following is a small representation of my data.
dput(head(Mfrq.df.2))
structure(list(User.ID = c("37593", "38643", "49433", "60403",
"70923", "85363"), V1 = c(9L, 3L, 4L, 80L, 19L, 0L), V2 = c(10L,
0L, 29L, 113L, 21L, 1L), V3 = c(5L, 2L, 17L, 77L, 7L, 2L), V4 = c(2L,
2L, 16L, 47L, 4L, 3L), V5 = c(2L, 10L, 16L, 40L, 1L, 8L), V6 = c(4L,
0L, 9L, 22L, 1L, 7L), V7 = c(6L, 8L, 9L, 8L, 0L, 6L), V8 = c(2L,
17L, 16L, 24L, 2L, 1L), V9 = c(3L, 20L, 7L, 30L, 0L, 4L), V10 = c(2L,
11L, 5L, 11L, 2L, 3L)), row.names = c(NA, 6L), class = "data.frame")
Finally, I found two ways of doing this. However for both, I get an error stating singular gradient.
#Way I
x=1:10
Mfrq.df.2_long <- pivot_longer(Mfrq.df.2, matches("V\\d{1,2}"), names_to = NULL, values_to = "Value")
Mfrq.df.2_long %>%
group_by(User.ID) %>%
mutate(fit = nls(Value ~ A * exp(-k * x), start = c(A =2, k = 0.01)) %>% list())
#Way2
L1 = c()
for (i in unique(Mfrq.df.2$User.ID)) {L1[[as.character(i)]]=seq(1,10)}
length(L1) #520 users
dput(head(L1))
list(`37593` = 1:10, `38643` = 1:10, `49433` = 1:10, `60403` = 1:10,
`70923` = 1:10, `85363` = 1:10)
#Way 2 Continue
L2=list.ids.RecSOC.2
length(L2) #520 users
dput(head(L2))
list(`37593` = c(9L, 10L, 5L, 2L, 2L, 4L, 6L, 2L, 3L, 2L), `38643` = c(3L,
0L, 2L, 2L, 10L, 0L, 8L, 17L, 20L, 11L), `49433` = c(4L, 29L,
17L, 16L, 16L, 9L, 9L, 16L, 7L, 5L), `60403` = c(80L, 113L, 77L,
47L, 40L, 22L, 8L, 24L, 30L, 11L), `70923` = c(19L, 21L, 7L,
4L, 1L, 1L, 0L, 2L, 0L, 2L), `85363` = c(0L, 1L, 2L, 3L, 8L,
7L, 6L, 1L, 4L, 3L))
#Way 2 Continue
control=nls.control(maxiter=1000)
res <- mapply(function(x,y){
nls(y~A*(exp(-k*x)),
start=list(A=100, k=0.01), control=control,
trace= TRUE, data=data.frame(x, y))},L1,L2, SIMPLIFY=FALSE)
To the best of my understanding, it has something to do with the starting values. I find it hard to find starting values that would work for all 520. Especially knowing not all of them are following the defined curve. I still need all 520 coefficients (A&k) to do my further analyses.
Any recommendations? Thanks
The starting values are very important with nls() so you need to spend some time getting them right. First let's extract the first group:
Value <- unlist(Mfrq.df.2_long[Mfrq.df.2_long$User.ID==37593, "Value"])
x <- 1:10
plot(x, Value)
This plot shows declining values from about 10 to 2 on the Y axis as the x values range from 1 to 10. What do your starting values look like on this plot?
Y <- 2 * exp(.01 * x)
lines(x, Y)
A very slightly increasing value from 2 to 2.2 over the range of x. Clearly the value inside exp() must be negative and greater than .01 since the decline is much greater than your starting values and the starting value for A should be more like 10:
Y <- 10 * exp(-.1 * x)
lines(x, Y, col="red")
This is much better. Now nls():
# fit = nls(Value ~ A * exp(k * x), start = c(A = 10, k = -.1))
# fit
# Nonlinear regression model
# model: Value ~ A * exp(k * x)
# data: parent.frame()
# A k
# 10.8628 -0.1901
# residual sum-of-squares: 33.66
#
# Number of iterations to convergence: 7
# Achieved convergence tolerance: 9.325e-06
Plotting the result:
xfit <- seq(1, 10, by=.1)
yfit <- predict(fit, list(x=xfit))
lines(xfit, yfit, col="blue")
Once you get the first group to work, it should be straightforward to apply it to the rest of the groups.
One option to estimate the coefficients is to perform a linear regression on a transformed data set.
y = A*exp(kx)
can be transformed to
ln(y) = kx + B
Mfrq.df.2_long <- pivot_longer(Mfrq.df.2, matches("V\\d{1,2}"), names_to = NULL, values_to = "Value")
data <- Mfrq.df.2_long[Mfrq.df.2_long$User.ID==37593, ]
#Add the independent value to data frame
data$x <- 1:10
#fit linear model ln(y)= A*x +B
model <- lm(log(Value) ~ x, data)
#predict(model, data.frame(x))
#plot original data
plot(data$x, data$Value)
#plot linear model
#taking the exp of the prediction since the model was to preduct ln(y)
lines(data$x, exp(predict(model, data.frame(x=1:10))), col="red")
#Extract out the coefficients A=exp(intercept) from linear model
# And set the start values
sA <- exp(model$coefficients[1])
sk <- model$coefficients[2]
model2 <- nls(Value ~ A * exp(k * x), data= data, start = list(A = sA, k = sk))
#plot the nls model
lines(data$x, predict(model2, data.frame(x=1:10)), col="blue")
Note: the equations are slightly different be the least squares is typing to minimize the differences between different values y and ln(y).

Bars in geom_bar have unwanted different widths when using facet_wrap

I can'd find a solution for the following problem(s). I would appreciate some help a lot!
The following code produces bar charts using facet. However, due to "extra space" ggplot2 has in some groups it makes the bars much wider, even if I specify a width of 0.1 or similar. I find that very annoying since it makes it look very unprofessional. I want all the bars to look the same (except for the fill). I hope somebody can tell me how to fix this.
Secondly, how can I reorder the different classes in the facet windows so that the order is always C1, C2 ... C5, M, F, All where applicable. I tried it with ordering the levels of the factor, but since not all classes are present in every graph part it did not work, or at least I assume that was the reason.
Thirdly, how can I reduce the space between the bars? So that the whole graph is more compressed. Even if I make the image smaller for exporting, R will scale the bars smaller but the spaces between the bars are still huge.
I would appreciate feedback for any of those answers!
My Data:
http://pastebin.com/embed_iframe.php?i=kNVnmcR1
My Code:
library(dplyr)
library(gdata)
library(ggplot2)
library(directlabels)
library(scales)
all<-read.xls('all_auto_visual_c.xls')
all$station<-as.factor(all$station)
#all$group.new<-factor(all$group, levels=c('C. hyperboreus','C. glacialis','Special Calanus','M. longa','Pseudocalanus sp.','Copepoda'))
allp <- ggplot(data = all, aes(x=shortname2, y=perc_correct, group=group,fill=sample_size)) +
geom_bar(aes(fill=sample_size),stat="identity", position="dodge", width=0.1, colour="NA") + scale_fill_gradient("Sample size (n)",low="lightblue",high="navyblue")+
facet_wrap(group~station,ncol=2,scales="free_x")+
xlab("Species and stages") + ylab("Automatic identification and visual validation concur (%)") +
ggtitle("Visual validation of predictions") +
theme_bw() +
theme(plot.title = element_text(lineheight=.8, face="bold", size=20,vjust=1), axis.text.x = element_text(colour="grey20",size=12,angle=0,hjust=.5,vjust=.5,face="bold"), axis.text.y = element_text(colour="grey20",size=12,angle=0,hjust=1,vjust=0,face="bold"), axis.title.x = element_text(colour="grey20",size=15,angle=0,hjust=.5,vjust=0,face="bold"), axis.title.y = element_text(colour="grey20",size=15,angle=90,hjust=.5,vjust=1,face="bold"),legend.position="none", strip.text.x = element_text(size = 12, face="bold", colour = "black", angle = 0), strip.text.y = element_text(size = 12, face="bold", colour = "black"))
allp
#ggsave(allp, file="auto_visual_stackover.jpeg", height= 11, width= 8.5, dpi= 400,)
The current graph that needs some fixing:
Thanks a lot!
Here what I did after suggestion from Gregor. Using geom_segment and geom_point makes a nice graph as I think.
library(ggplot2)
all<-read.xls('all_auto_visual_c.xls')
all$station<-as.factor(all$station)
all$group.new<-factor(all$group, levels=c('C. hyperboreus','C. glacialis','Combined','M. longa','Pseudocalanus sp.','Copepoda'))
all$shortname2.new<-factor(all$shortname2, levels=c('All','F','M','C5','C4','C3','C2','C1','Micro', 'Oith','Tric','Cegg','Cnaup','C3&2','C2&1'))
allp<-ggplot(all, aes(x=perc_correct, y=shortname2.new)) +
geom_segment(aes(yend=shortname2.new), xend=0, colour="grey50") +
geom_point(size=4, aes(colour=sample_size)) +
scale_colour_gradient("Sample size (n)",low="lightblue",high="navyblue") +
geom_text(aes(label = perc_correct, hjust = -0.5)) +
theme_bw() +
theme(panel.grid.major.y = element_blank()) +
facet_grid(group.new~station,scales="free_y",space="free") +
xlab("Automatic identification and visual validation concur (%)") + ylab("Species and stages")+
ggtitle("Visual validation of predictions")+
theme_bw() +
theme(plot.title = element_text(lineheight=.8, face="bold", size=20,vjust=1), axis.text.x = element_text(colour="grey20",size=12,angle=0,hjust=.5,vjust=.5,face="bold"), axis.text.y = element_text(colour="grey20",size=12,angle=0,hjust=1,vjust=0,face="bold"), axis.title.x = element_text(colour="grey20",size=15,angle=0,hjust=.5,vjust=0,face="bold"), axis.title.y = element_text(colour="grey20",size=15,angle=90,hjust=.5,vjust=1,face="bold"),legend.position="none", strip.text.x = element_text(size = 12, face="bold", colour = "black", angle = 0), strip.text.y = element_text(size = 8, face="bold", colour = "black"))
allp
ggsave(allp, file="auto_visual_no_label.jpeg", height= 11, width= 8.5, dpi= 400,)
This is what it produces!
Assuming the bar widths are inversely proportional to the number of x-breaks, an appropriate scaling factor can be entered as a width aesthetic to control the width of the bars. But first, calculate the number of x-breaks in each panel, calculate the scaling factor, and put them back into the "all" data frame.
Updating to ggplot2 2.0.0 Each column mentioned in facet_wrap gets its own line in the strip. In the edit, a new label variable is setup in the dataframe so that the strip label remains on one line.
library(ggplot2)
library(plyr)
all = structure(list(station = structure(c(2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Station 101",
"Station 126"), class = "factor"), shortname2 = structure(c(2L,
7L, 8L, 11L, 1L, 5L, 7L, 8L, 11L, 1L, 2L, 3L, 5L, 7L, 8L, 12L,
11L, 1L, 6L, 8L, 15L, 14L, 9L, 10L, 4L, 6L, 2L, 7L, 8L, 11L,
1L, 5L, 7L, 8L, 11L, 1L, 2L, 3L, 5L, 7L, 8L, 12L, 11L, 1L, 8L,
11L, 1L, 15L, 14L, 13L, 9L, 10L), .Label = c("All", "C1", "C2",
"C2&1", "C3", "C3&2", "C4", "C5", "Cegg", "Cnaup", "F", "M",
"Micro", "Oith", "Tric"), class = "factor"), color = c(1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L,
18L, 19L, 21L, 26L, 30L, 31L, 33L, 34L, 20L, 21L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L,
19L, 26L, 28L, 29L, 30L, 31L, 32L, 33L, 34L), group = structure(c(1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 6L, 5L, 3L, 3L, 3L, 3L, 6L, 6L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 3L, 3L,
3L, 3L, 3L), .Label = c("cgla", "Chyp", "Cope", "mlong", "pseudo",
"specC"), class = "factor"), sample_size = c(11L, 37L, 55L, 16L,
119L, 21L, 55L, 42L, 40L, 158L, 24L, 16L, 17L, 27L, 14L, 45L,
98L, 241L, 30L, 34L, 51L, 22L, 14L, 47L, 13L, 41L, 24L, 41L,
74L, 20L, 159L, 18L, 100L, 32L, 29L, 184L, 31L, 17L, 27L, 23L,
21L, 17L, 49L, 185L, 30L, 16L, 46L, 57L, 16L, 12L, 30L, 42L),
perc_correct = c(91L, 78L, 89L, 81L, 85L, 90L, 91L, 93L,
80L, 89L, 75L, 75L, 76L, 81L, 86L, 76L, 79L, 78L, 90L, 97L,
75L, 86L, 93L, 74L, 85L, 88L, 88L, 90L, 92L, 90L, 91L, 89L,
89L, 91L, 90L, 89L, 81L, 88L, 74L, 78L, 90L, 82L, 84L, 82L,
90L, 94L, 91L, 81L, 69L, 83L, 90L, 81L)), class = "data.frame", row.names = c(NA,
-52L))
all$station <- as.factor(all$station)
# Calculate scaling factor and insert into data frame
library(plyr)
N = ddply(all, .(station, group), function(x) length(row.names(x)))
N$Fac = N$V1 / max(N$V1)
all = merge(all, N[,-3], by = c("station", "group"))
all$label = paste(all$group, all$station, sep = ", ")
allp <- ggplot(data = all, aes(x=shortname2, y=perc_correct, group=group, fill=sample_size, width = .5*Fac)) +
geom_bar(stat="identity", position="dodge", colour="NA") +
scale_fill_gradient("Sample size (n)",low="lightblue",high="navyblue")+
facet_wrap(~label,ncol=2,scales="free_x") +
xlab("Species and stages") + ylab("Automatic identification and visual validation concur (%)") +
ggtitle("Visual validation of predictions") +
theme_bw() +
theme(plot.title = element_text(lineheight=.8, face="bold", size=20,vjust=1),
axis.text.x = element_text(colour="grey20",size=12,angle=0,hjust=.5,vjust=.5,face="bold"),
axis.text.y = element_text(colour="grey20",size=12,angle=0,hjust=1,vjust=0,face="bold"),
axis.title.x = element_text(colour="grey20",size=15,angle=0,hjust=.5,vjust=0,face="bold"),
axis.title.y = element_text(colour="grey20",size=15,angle=90,hjust=.5,vjust=1,face="bold"),
legend.position="none",
strip.text.x = element_text(size = 12, face="bold", colour = "black", angle = 0),
strip.text.y = element_text(size = 12, face="bold", colour = "black"))
allp

Variable label position in ggplot line chart

I have the following data frame summary created with dplyr
structure(list(maxrep = c(7L, 7L, 8L, 8L, 9L, 9L, 10L, 10L, 11L,
11L, 12L, 12L, 13L, 13L, 14L, 14L, 15L, 15L, 16L, 16L, 17L, 17L,
18L, 18L, 19L, 19L, 20L, 20L, 21L, 21L, 22L, 22L, 23L, 23L, 24L,
24L, 26L, 26L), div = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("Premier Division",
"Second Division"), class = "factor"), freq = c(1L, 10L, 4L,
39L, 26L, 89L, 73L, 146L, 107L, 162L, 117L, 133L, 121L, 125L,
116L, 91L, 110L, 65L, 95L, 43L, 75L, 38L, 43L, 24L, 38L, 16L,
36L, 5L, 15L, 2L, 9L, 7L, 9L, 1L, 3L, 3L, 2L, 1L)), .Names = c("maxrep",
"div", "freq"), class = c("grouped_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -38L))
My intention is to use ggplot2 to plot line graphs of 2 lines with different colour with text labels for each value.
What I did was
ggplot(df, aes(x=maxrep, y=freq, colour=div)) +
geom_line() +
geom_text(aes(label=freq), vjust=-.5)
The result was
Now my question: All the labels in the chart are above the points in respective lines. I want to have the labels for the different colours to be in different relative position, e.g. labels for cyan above the line, and labels for red below the line (i.e. variable vjust). Is there a way to do that?
Also, is there a way to get read of the letter a in the colour legend on the right?
What about plotting the lines separately wich differing vjust values? You can get rid of a in the legend setting show_guide = FALSE.
ggplot(df, aes(x=maxrep, y=freq, colour=div, label = freq)) +
geom_line() +
geom_text(data = df[df$div == "Second Division",], vjust=2, show_guide = FALSE) + geom_text(data = df[df$div == "Premier Division",], vjust=-2, show_guide = FALSE)
Which returns:
Create a new variable in the data.frame holding the vjust adjustment parameter:
df$pos <- c(2, -2)[(df$div == "Premier Division")+1]
And you could call vjust inside aes with the new pos vector:
ggplot(df, aes(x=maxrep, y=freq, colour=div)) +
geom_line() +
geom_text(aes(label=freq, vjust=pos))

issue when reordering factor variable by numeric

require(ggplot2)
The data: It's shark incidents grouped by shark species. It's actually a real dataset, already summarized.
D <- structure(list(FL_FATAL = structure(c(2L, 2L, 2L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L), .Label = c("FATAL",
"NO FATAL"), class = "factor"), spec = structure(c(26L, 24L,
6L, 26L, 25L, 16L, 2L, 11L, 27L, 5L, 24L, 29L, 12L, 21L, 13L,
15L, 28L, 1L, 17L, 19L, 8L, 3L, 6L, 13L, 22L, 18L, 27L, 14L,
23L, 20L, 7L, 4L, 8L, 9L, 10L), .Label = c("blacknose", "blacktip",
"blue", "bonnethead", "bronze", "bull", "caribbean", "draughtsboard",
"dusky", "galapagos", "ganges", "hammerhead", "involve", "leon",
"mako", "nurse", "porbeagle", "recovered", "reef", "sand", "sandtiger",
"sevengill", "spinner", "tiger", "unconfired", "white", "whitespotted",
"whitetip", "wobbegong"), class = "factor"), N = c(368L, 169L,
120L, 107L, 78L, 77L, 68L, 59L, 56L, 53L, 46L, 42L, 35L, 35L,
33L, 30L, 29L, 29L, 26L, 25L, 25L, 25L, 24L, 24L, 21L, 21L, 20L,
20L, 17L, 16L, 16L, 15L, 11L, 11L, 11L)), .Names = c("FL_FATAL",
"spec", "N"), row.names = c(NA, -35L), class = "data.frame")
.
head(D)
# FL_FATAL spec N Especies
# 1 NO FATAL white 368 white
# 2 NO FATAL tiger 169 tiger
# 3 NO FATAL bull 120 bull
# 4 FATAL white 107 white
# 5 NO FATAL unconfired 78 unconfired
# 6 NO FATAL nurse 77 nurse
Reordering a factor variable by a numeric making a new variable.
# Re-order spec creating Especies variable ordered by D$N
D$Especies <- factor(D$spec, levels = unique(D[order(D$N), "spec"]))
# This two plots work as spected
ggplot(D, aes(x=N, y=Especies)) +
geom_point(aes(size = N, color = FL_FATAL))
ggplot(D, aes(x=N, y=Especies)) +
geom_point(aes(size = N, color = FL_FATAL)) +
facet_grid(. ~ FL_FATAL)
Reordering using reorder()
# Using reorder isn't working or am i missing something?
ggplot(D, aes(x=N, y=reorder(D$spec, D$N))) +
geom_point(aes(size = N, color = FL_FATAL))
# adding facets makes it worse
ggplot(D, aes(x=N, y=reorder(D$spec, D$N))) +
geom_point(aes(size = N, color = FL_FATAL)) +
facet_grid(. ~ FL_FATAL)
Which would be the correct approach for producing the plots with reorder()?
The problem is that by using D$ in your reorder call, you're reordering spec independent of the data frame, so the values no longer match up with the corresponding x values. You need to use it directly on the variables:
ggplot(D, aes(x=N, y=reorder(spec, N, sum))) +
geom_point(aes(size = N, color = FL_FATAL)) +
facet_grid(. ~ FL_FATAL)
I'm surprised you like your first way--it's a happy coincidence that worked out. Most of your species have one N value (NO_FATAL only), but you have a few that have both FATAL and NO_FATAL. Whenever there are more than two numeric rows corresponding to a factor, reorder uses a function of those numerics to do the final sort. The default function is mean, but you probably want sum, to sort by the total number of incidents.
D$spec_order <- reorder(D$spec, D$N, sum)
ggplot(D, aes(x=N, y=spec_order)) +
geom_point(aes(size = N, color = FL_FATAL))
ggplot(D, aes(x=N, y=spec_order)) +
geom_point(aes(size = N, color = FL_FATAL)) +
facet_grid(. ~ FL_FATAL)

Adding legend and structuring data for ggplot

In the data included below I have three sites (AAA,BBB,CCC) and individuals within each site (7, 12, 7 respectively). For each individual I have observed values (ObsValues) and three sets of predicted values each with a standard error. I have 26 rows (i.e. 26 individuals) and 9 columns.
The data is included here through dput()
help <- structure(list(StudyArea = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
3L, 3L, 3L, 3L), .Label = c("AAA", "BBB", "CCC"), class = "factor"),
Ind = structure(1:26, .Label = c("AAA_F01", "AAA_F17", "AAA_F33",
"AAA_F49", "AAA_F65", "AAA_F81", "AAA_F97", "BBB_P01", "BBB_P02",
"BBB_P03", "BBB_P04", "BBB_P05", "BBB_P06", "BBB_P07", "BBB_P08",
"BBB_P09", "BBB_P10", "BBB_P11", "BBB_P12", "CCC_F02", "CCC_F03",
"CCC_F04", "CCC_F05", "CCC_F06", "CCC_F07", "CCC_F08"), class = "factor"),
ObsValues = c(22L, 50L, 8L, 15L, 54L, 30L, 11L, 90L, 6L,
53L, 9L, 42L, 72L, 40L, 60L, 58L, 1L, 20L, 37L, 2L, 50L,
68L, 20L, 19L, 58L, 5L), AAAPred = c(28L, 52L, 6L, 15L, 35L,
31L, 13L, 79L, 6L, 58L, 5L, 42L, 88L, 49L, 68L, 60L, 1L,
26L, 46L, 0L, 34L, 71L, 20L, 15L, 35L, 5L), AAAPredSE = c(3.5027829,
4.7852191, 1.231803, 2.5244013, 4.873907, 3.8854192, 2.3532752,
6.3444402, 1.7387295, 5.605111, 1.667818, 4.4709107, 7.0437967,
5.447496, 6.0840486, 5.4371275, 0.8156916, 3.5153847, 4.698754,
0, 3.8901103, 5.993616, 3.1720272, 2.6777869, 4.5647313,
1.4864128), BBBPred = c(14L, 43L, 5L, 13L, 26L, 32L, 14L,
80L, 5L, 62L, 4L, 44L, 67L, 44L, 55L, 42L, 1L, 20L, 47L,
0L, 26L, 51L, 15L, 16L, 34L, 6L), BBBPredSE = c(3.1873435,
4.8782831, 1.3739863, 2.5752273, 4.4155679, 3.8102168, 2.3419518,
6.364606, 1.7096028, 5.6333421, 1.5861323, 4.4951428, 6.6046699,
5.302902, 5.9244328, 5.1887055, 0.8268689, 3.4014041, 4.6600598,
0, 3.8510512, 5.5776686, 3.0569531, 2.6358433, 4.5273782,
1.4263518), CCCPred = c(29L, 53L, 7L, 15L, 44L, 32L, 15L,
86L, 8L, 61L, 5L, 46L, 99L, 54L, 74L, 67L, 1L, 30L, 51L,
1L, 37L, 94L, 21L, 17L, 36L, 6L), CCCPredSE = c(3.4634488,
4.7953389, 0.9484051, 2.5207022, 5.053452, 3.8072731, 2.2764727,
6.3605968, 1.6044067, 5.590048, 1.6611899, 4.4183913, 7.0124638,
5.6495918, 6.1091934, 5.4797929, 0.8135164, 3.4353934, 4.6261147,
0.8187396, 3.7936333, 5.6512378, 3.1686123, 2.633179, 4.5841921,
1.3989955)), .Names = c("StudyArea", "Ind", "ObsValues",
"AAAPred", "AAAPredSE", "BBBPred", "BBBPredSE", "CCCPred", "CCCPredSE"
), class = "data.frame", row.names = c(NA, -26L))
The head() and dim() of help are below too
head(help)
StudyArea Ind ObsValues AAAPred AAAPredSE BBBPred BBBPredSE CCCPred CCCPredSE
1 AAA AAA_F01 22 28 3.502783 14 3.187343 29 3.4634488
2 AAA AAA_F17 50 52 4.785219 43 4.878283 53 4.7953389
3 AAA AAA_F33 8 6 1.231803 5 1.373986 7 0.9484051
4 AAA AAA_F49 15 15 2.524401 13 2.575227 15 2.5207022
5 AAA AAA_F65 54 35 4.873907 26 4.415568 44 5.0534520
6 AAA AAA_F81 30 31 3.885419 32 3.810217 32 3.8072731
dim(help)
> dim(help)
[1] 26 9
I am a relative newcomer to ggplot and am trying to make a plot that displays the observed and predicted values for each individual with a different color for each StudyArea. I can manually add points and force the color with the code below, however this feel rather clunky and also does not produce a legend as I have not specified color in aes().
require(ggplot2)
ggplot(help, aes(x=Ind, y=ObsValues))+
geom_point(color="red", pch = "*", cex = 10)+
geom_point(aes(y = AAAPred), color="blue")+
geom_errorbar(aes(ymin=AAAPred-AAAPredSE, ymax=AAAPred+AAAPredSE), color = "blue")+
geom_point(aes(y = BBBPred), color="darkgreen")+
geom_errorbar(aes(ymin=BBBPred-BBBPredSE, ymax=BBBPred+BBBPredSE), color = "darkgreen")+
geom_point(aes(y = CCCPred), color="black")+
geom_errorbar(aes(ymin=CCCPred-CCCPredSE, ymax=CCCPred+CCCPredSE), color = "black")+
theme(axis.text.x=element_text(angle=30, hjust=1))
In the figure above, the asterisks are the observed values and the values are the predicted values, one from each StudyArea.
I tried to melt() the data, but ran into more problems plotting. That being said, I suspect melt()ing or reshape()ing is the best option.
Any suggestions on how to best alter/restructure the help data so that I can plot the observed and predicted values for each individual with a different color for each StudyArea would be greatly appreciated.
I also hope to produce a legend - the likely default once the data is correctly formatted
Note: Indeed the resulting figure is very busy will likely be simplified once I get a better handle on ggplot.
thanks in advance.
Try this:
library(reshape2)
x.value <- melt(help,id.vars=1:3, measure.vars=c(4,6,8))
x.se <- melt(help,id.vars=1:3, measure.vars=c(5,7,9))
gg <- data.frame(x.value,se=x.se$value)
ggplot(gg)+
geom_point(aes(x=Ind, y=ObsValues),size=5,shape=18)+
geom_point(aes(x=Ind, y=value, color=variable),size=3, shape=1)+
geom_errorbar(aes(x=Ind, ymin=value-se, ymax=value+se, color=variable))+
theme(axis.text.x=element_text(angle=-90))
Produces this:
Edit:: Response to #B.Davis' questions below:
You have to group the ObsValues by StudyArea, not variable. But when you do that you get six colors, three for StudyArea and three for the predictor groups (variable). If we give the predictor groups (e.g., AAAPred, etc.) the same names as the StudyArea groups (e.g. AAA, etc.), then ggplot just generates three colors.
gg$variable <- substring(gg$variable,1,3) # removes "Pred" from group names
ggplot(gg)+
geom_point(aes(x=Ind, y=ObsValues, color=StudyArea),size=5,shape=18)+
geom_point(aes(x=Ind, y=value, color=variable),size=3, shape=1)+
geom_errorbar(aes(x=Ind, ymin=value-se, ymax=value+se, color=variable))+
theme(axis.text.x=element_text(angle=-90))
Produces this:
Similar to #jlhoward solution but I choose to treat ObsValues as a variable to get it in the legend.
help <- dat
x.value <- melt(help,id.vars=1:2, measure.vars=c(3,4,6,8))
x.se <- melt(help,id.vars=1:2, measure.vars=c(3,5,7,9))
gg <- data.frame(x.value,se=x.se$value)
ggplot(gg)+
geom_point(aes(x=Ind, y=value, color=variable),size=3, shape=1)+
geom_errorbar(data= subset(gg,variable!='ObsValues'),
aes(x=Ind, ymin=value-se, ymax=value+se, color=variable))+
theme(axis.text.x=element_text(angle=-90))
This is a little clumsy, but gets you what you want:
# jlhoward's melting is more elegant.
require(reshape2)
melted.points<-melt(help[,c('Ind','ObsValues','AAAPred','BBBPred','CCCPred')])
melted.points$observed<-ifelse(melted.points$variable=='ObsValues','observed','predicted')
melted.points.se<-melt(help[,c('Ind','AAAPredSE','BBBPredSE','CCCPredSE')])
melted.points.se$variable<-gsub('SE','',melted.points.se$variable,)
help2<-merge(melted.points,melted.points.se,by=c('Ind','variable'),all.x=TRUE)
help2<-rename(help2,c(value.x='value',value.y='se'))
And now the actual plot:
ggplot(help2,aes(x=Ind,y=value,color=variable,size=observed,shape=observed,ymin=value-se,ymax=value+se)) +
geom_point() +
geom_errorbar(size=1) +
scale_colour_manual(values = c("red","blue","darkgreen", "black")) +
scale_size_manual(values=c(observed=4,predicted=3)) +
scale_shape_manual(values=c(observed=8,predicted=16))

Resources