How to show directlabels after geom_smooth and not after geom_line? - r

I'm using directlabels to annotate my plot. As you can see in this picture the labels are after geom_line but I want them after geom_smooth. Is this supported by directlabels? Or any other ideas how to achieve this? Thanks in advance!
This is my code:
library(ggplot2)
library(directlabels)
set.seed(124234345)
# Generate data
df.2 <- data.frame("n_gram" = c("word1"),
"year" = rep(100:199),
"match_count" = runif(100 ,min = 1000 , max = 2000))
df.2 <- rbind(df.2, data.frame("n_gram" = c("word2"),
"year" = rep(100:199),
"match_count" = runif(100 ,min = 1000 , max = 2000)) )
# plot
ggplot(df.2, aes(year, match_count, group=n_gram, color=n_gram)) +
geom_line(alpha = I(7/10), color="grey", show_guide=F) +
stat_smooth(size=2, span=0.3, se=F, show_guide=F) +
geom_dl(aes(label=n_gram), method = "last.bumpup", show_guide=F) +
xlim(c(100,220))

This answer takes the basic concept of #celt-Ail's answer, and rather than function, base R, and direct label, attempts a tidyverse approach, stealing some code from here for the multiple loess models.
Happy to hear suggested improvements.
set.seed(124234345)
# Generate data
df.2 <- data.frame("n_gram" = c("word1"),
"year" = rep(100:199),
"match_count" = runif(100 ,min = 1000 , max = 2000))
df.2 <- rbind(df.2, data.frame("n_gram" = c("word2"),
"year" = rep(100:199),
"match_count" = runif(100 ,min = 1000 , max = 2000)) )
#example of loess for multiple models
#https://stackoverflow.com/a/55127487/4927395
library(dplyr)
library(tidyr)
library(purrr)
library(ggplot2)
models <- df.2 %>%
tidyr::nest(-n_gram) %>%
dplyr::mutate(
# Perform loess calculation on each CpG group
m = purrr::map(data, loess,
formula = match_count ~ year, span = .3),
# Retrieve the fitted values from each model
fitted = purrr::map(m, `[[`, "fitted")
)
# Apply fitted y's as a new column
results <- models %>%
dplyr::select(-m) %>%
tidyr::unnest()
#find final x values for each group
my_last_points <- results %>% group_by(n_gram) %>% summarise(year = max(year, na.rm=TRUE))
#Join dataframe of predictions to group labels
my_last_points$pred_y <- left_join(my_last_points, results)
# Plot with loess line for each group
ggplot(results, aes(x = year, y = match_count, group = n_gram, colour = n_gram)) +
geom_line(alpha = I(7/10), color="grey", show.legend=F) +
#stat_smooth(size=2, span=0.3, se=F, show_guide=F)
geom_point() +
geom_line(aes(y = fitted))+
geom_text(data = my_last_points, aes(x=year+5, y=pred_y$fitted, label = n_gram))

# use stat smooth with geom_dl to get matching direct labels.
span <- 0.3
ggplot(df.2, aes(year, match_count, group=n_gram, color=n_gram)) +
geom_line(alpha = I(7/10), color="grey") +
stat_smooth(size=2, span=span, se=F) +
geom_dl(aes(label=n_gram), method = "last.qp", stat="smooth", span=span) +
xlim(c(100,220))+
guides(colour="none")

This is not what you asked for as I don't know how to do that, but this might be more useful to you as you will lose less plotting area to labels:
PLOT <- ggplot(df.2, aes(year, match_count, group=n_gram, color=n_gram)) +
geom_line(alpha = I(7/10), color="grey", show_guide=F) +
stat_smooth(size=2, span=0.3, se=F, show_guide=F)
mymethod <- list(
"top.points",
dl.move("word1", hjust=-6.65, vjust=13),
dl.move("word2", hjust =-7.9, vjust=20.25)
)
direct.label(PLOT, mymethod)
which yields:
You could also try:
mymethod <- list(
"top.points",
dl.move("word1", hjust=-6, vjust=14),
dl.move("word2", hjust =-7.1, vjust=19.5)
)
ggplot(df.2, aes(year, match_count, group=n_gram, color=n_gram)) +
geom_line(alpha = I(7/10), color="grey", show_guide=F) +
xlim(c(100,220))+
stat_smooth(size=2, span=0.3, se=F, show_guide=F) +
geom_dl(aes(label=n_gram), method = mymethod, show_guide=F)
which yields:
NOTE: to print to other graphics devices (this was the windows rgui) you'll need to tweak the vjust and hjust to suit. But if there's a more direct way that would be nicer.

I'm gonna answer my own question here, since I figured it out thanks to a response from Tyler Rinker.
This is how I solved it using loess() to get label positions.
# Function to get last Y-value from loess
funcDlMove <- function (n_gram) {
model <- loess(match_count ~ year, df.2[df.2$n_gram==n_gram,], span=0.3)
Y <- model$fitted[length(model$fitted)]
Y <- dl.move(n_gram, y=Y,x=200)
return(Y)
}
index <- unique(df.2$n_gram)
mymethod <- list(
"top.points",
lapply(index, funcDlMove)
)
# Plot
PLOT <- ggplot(df.2, aes(year, match_count, group=n_gram, color=n_gram)) +
geom_line(alpha = I(7/10), color="grey", show_guide=F) +
stat_smooth(size=2, span=0.3, se=F, show_guide=F)
direct.label(PLOT, mymethod)
Which will generate this plot: http://i.stack.imgur.com/FGK1w.png

Related

How to hide (or remove) dots in the boxplot graph?

I have a question about how to hide(or remove) dots in the boxplot graph.
This is code what I implemented.
install.packages("randomForestSRC")
install.packages("ggRandomForests")
library(randomForestSRC)
library(ggRandomForests)
data(pbc, package="randomForestSRC")
pbc.na <- na.omit(pbc)
set.seed(123)
rsf <- rfsrc(Surv(days,status)~., data=pbc.na, ntree=500, importance=T)
gg_v <- gg_variable(rsf, time = c(2000, 4000),
time.labels = c("2000 days", "4000 days"))
gg_v$stage <- as.factor(gg_v$stage)
plot(gg_v, xvar="stage", panel=T, points=F)+
ggplot2::theme_bw() +
ggplot2::geom_boxplot(outlier.shape=NA)+
ggplot2::labs(y="Survival (%)")+
ggplot2::coord_cartesian(ylim=c(-.01, 1.02))
So I would like to hide(or remove) all of the event's dots (both of False and True).
However, I have no information about what I want.
Please let me know how to do it.
Thanks always.
I am not familiar how ggRandomForests work. But using the data frame gg_v, we can directly do the plotting in ggplot2.
ggplot(gg_v, aes(stage, yhat, group = stage)) +
geom_boxplot(outlier.shape = NA) +
facet_wrap(~time, nrow = 2, strip.position = "right") +
ylab("Survival (%)") +
theme_bw()
You can also use the function "geom_boxplot2" from github ("Ipaper")
# devtools::install_github('kongdd/Ipaper')
library(Ipaper)
library(ggplot2)
ggplot(gg_v, aes(stage, yhat, group = stage)) +
geom_boxplot2(width = 0.8, width.errorbar = 0.5)+
facet_wrap(~time, nrow = 2, strip.position = "right") +
ylab("Survival (%)") +
theme_bw()

position_dodge when using separate datasets

I am attempting to produce a graph that shows two groups of error bars, but the different error bars represent different estimates of central tendency/variability (e.g., mean with sd and median with quantiles). I'm trying to use position_dodge, but it's not working, and I suspect this is because I'm feeding it values from a different dataset. Here's a reproducible example:
#### simulate dosages
dose = factor(rep(c("small", "medium", "large"), times=10))
dose = relevel(dose, "small")
#### simulate fevers, based on dosage (but highly skewed)
fever = rnorm(length(dose), 100, 1)
betas = matrix(c(0, -3, -6), nrow=1)
fever = fever + as.numeric(betas%*%t(model.matrix(fever~dose)))
#### put into data frame
d = data.frame(dose=dose, fever=fever)
#### compute means and standard errors
means = d %>% group_by(dose) %>% summarise(mean=mean(fever), lower=mean - sd(fever), upper = mean + sd(fever))
medians = d %>% group_by(dose) %>% summarise(median=median(fever), lower=quantile(fever, .25), upper = quantile(fever, .75))
#### put all into a ggplot
ggplot(d, aes(x=dose, y=fever)) +
geom_jitter(alpha=.2, width=.2) +
geom_point(data=means, aes(x=dose, y=mean)) +
geom_point(data=medians, aes(x=dose, y=median), col="red") +
geom_errorbar(data=means, aes(y=mean, ymin=lower, ymax=upper), width=.2, position=position_dodge(width=.2)) +
geom_errorbar(data= medians, aes(y=median, ymin=lower, ymax=upper), width=.2, position=position_dodge(width=.2), col="red")
Which gives the results of the following image:
Notice dodging isn't working.
Let's assume I can't just use stat_summary (I can't...I'm actually comparing means with some robust estimates from another package). Is there any way to offset the error bars/dots so they can be better seen?
Combine your dataframes for both statistics so you can map the kind of statistic on group:
means <- df %>%
group_by(dose) %>%
summarise(Statistic = "Mean", Value = mean(fever), lower=mean(fever) - sd(fever), upper = mean(fever) + sd(fever))
medians <- df %>%
group_by(dose) %>%
summarise(Statistic = "Median", Value = median(fever), lower=quantile(fever, 0.25), upper = quantile(fever, 0.75))
df2 <- bind_rows(means, medians)
#### put all into a ggplot
ggplot(df, aes(x = dose, y = fever)) +
geom_jitter(alpha = .2, width = .2) +
geom_point(data = df2, aes(x = dose, y = Value, color = Statistic)) +
geom_errorbar(data = df2, aes(y = Value, ymin = lower, ymax = upper,
group = Statistic, color = Statistic),
width=.2, position = position_dodge(width = .2))

Create a matrix of residual plots using purrr and ggplot

Suppose I have the following dataframe:
library(tidyverse)
fit <- lm(speed ~ dist, data = cars)
select(broom::augment(fit), .fitted:.std.resid) -> dt
names(dt) <- substring(names(dt), 2)
I would like to create a grid of residuals plots using purrr. For example, I have the formulas for 2 diagnostic plots so far:
residual <- function(model) {ggplot(model, aes(fitted, resid)) +
geom_point() +
geom_hline(yintercept = 0) +
geom_smooth(se = FALSE)}
stdResidual <- function(model) {ggplot(model, aes(fitted, std.resid)) +
geom_point() +
geom_hline(yintercept = 0) +
geom_smooth(se = FALSE)}
And I am storing the formulas in a list that I plan to run against the fortified dataset dt.
formulas <- tibble(charts = list(residual, stdResidual))
# A tibble: 2 x 1
charts
<list>
1 <fun>
2 <fun>
Now I need to pass dt to each element of the column chart in formulas. I am actually also trying to combine both using gridExtra, but for now I would be satisfied if I could at least render both of them. I think I should run something like
pwalk(list(dt, formulas), ???)
But I have no idea what function I should use in ??? to render the plots.
Set up functions to plot each one, just like you did above:
diagplot_resid <- function(df) {
ggplot(df, aes(.fitted, .resid)) +
geom_hline(yintercept = 0) +
geom_point() +
geom_smooth(se = F) +
labs(x = "Fitted", y = "Residuals")
}
diagplot_stdres <- function(df) {
ggplot(df, aes(.fitted, sqrt(.std.resid))) +
geom_hline(yintercept = 0) +
geom_point() +
geom_smooth(se = F) +
labs(x = "Fitted", y = expression(sqrt("Standardized residuals")))
}
diagplot_qq <- function(df) {
ggplot(df, aes(sample = .std.resid)) +
geom_abline(slope = 1, intercept = 0, color = "black") +
stat_qq() +
labs(x = "Theoretical quantiles", y = "Standardized residuals")
}
Then call each in a list, with the dataframe as your second argument. Here you're invokeing a list of functions, and parallel-ly applying them to a list of function arguments. Since there's only one element to the second list, invoke_map loops over them.
fit <- lm(mpg~wt, mtcars)
df_aug <- augment(fit)
purrr::invoke_map(.f = list(diagplot_resid, diagplot_stdres, diagplot_qq),
.x = list(list(df_aug))) %>%
gridExtra::grid.arrange(grobs = ., ncol = 2,
top = paste("Diagnostic plots for",
as.expression(fit$call)))

Plotting lmer() in ggplot2

I'm trying to adjust my graph to make it suitable for a scientific report. See example below (from here: http://glmm.wikidot.com/faq).
How do I change the ggplot settings so the lines are displayed in greyscale?
library("lme4")
library("ggplot2") # Plotting
data("Orthodont",package="MEMSS")
fm1 <- lmer(
formula = distance ~ age*Sex + (age|Subject)
, data = Orthodont
)
newdat <- expand.grid(
age=c(8,10,12,14)
, Sex=c("Female","Male")
, distance = 0
)
mm <- model.matrix(terms(fm1),newdat)
newdat$distance <- predict(fm1,newdat,re.form=NA)
pvar1 <- diag(mm %*% tcrossprod(vcov(fm1),mm))
tvar1 <- pvar1+VarCorr(fm1)$Subject[1]
cmult <- 2 ## could use 1.96
newdat <- data.frame(
newdat
, plo = newdat$distance-cmult*sqrt(pvar1)
, phi = newdat$distance+cmult*sqrt(pvar1)
, tlo = newdat$distance-cmult*sqrt(tvar1)
, thi = newdat$distance+cmult*sqrt(tvar1)
)
g0 <- ggplot(newdat, aes(x=age, y=distance, colour=Sex))+geom_point()
g0 + geom_errorbar(aes(ymin = plo, ymax = phi))+
labs(title="CI based on fixed-effects uncertainty ONLY") + theme_bw()
I'm also unsure why sqrt() is used in this line of code:
plo = newdat$distance-cmult*sqrt(pvar1)
Thanks
#aosmith is right - scale_color_grey is probably what you're looking for.
g0 <- ggplot(newdat, aes(x=age, y=distance, colour=Sex))+geom_point()
g0 + geom_errorbar(aes(ymin = plo, ymax = phi)) +
labs(title="CI based on fixed-effects uncertainty ONLY") +
theme_bw() + scale_color_grey(start = 0.2, end = 0.5)
If you're able to (which you are here), it's generally best to use redundant encoding, i.e., encoding sex with two variables (such as color and linetype). It makes it easier to perceive the difference between the two.
g0 <- ggplot(newdat, aes(x=age, y=distance, colour=Sex, linetype = Sex)) + geom_point()
g0 + geom_errorbar(aes(ymin = plo, ymax = phi)) +
labs(title="CI based on fixed-effects uncertainty ONLY") +
theme_bw() + scale_color_grey(start = 0.2, end = 0.5) + scale_linetype()

Combining 2 different graph outputs in R into one graph

So I used the following code, to generate graphs, where appl and apple generate 2 different graphs and now I want to combine them into a single graph
data <- ddply(data, .(Value), summarise,
N = length(means),
mean = mean(means),
sd = sd(means),
se = sd(means) / sqrt(length(means)) )
apple=ggplot(data, aes(x=Value, y=mean)) +
geom_errorbar(aes(ymin=mean-se, ymax=mean+se), width=.1) +
geom_ribbon(aes(ymin=mean-se, ymax=mean+se),alpha=0.5) +
geom_line() +
geom_point()
dat <- ddply(dat1, .(Value), summarise,
N = length(means),
mean = mean(means),
sd = sd(means),
se = sd(means) / sqrt(length(means)))
appl=ggplot(dat, aes(x=Value, y=mean)) +
geom_errorbar(aes(ymin=mean-se, ymax=mean+se), width=.1) +
geom_ribbon(aes(ymin=mean-se, ymax=mean+se),alpha=0.5) +
geom_line() +
geom_point()
The answer involves combining the datasets into one big one, with an additional column specifying to which dataset that subset belonged. There is no need for creating plots separately and combining them. Let's assume that column is named id, then you can use an additional argument in aes to get the plot to work, i.e. aes(x=Value, y=mean, color=id). Combining the datasets can be done using rbind.
A code example:
df1 = data.frame(Value = sample(LETTERS[1:8], 1000, replace = TRUE),
means = runif(1000))
df2 = data.frame(Value = sample(LETTERS[1:8], 1000, replace = TRUE),
means = runif(1000) + 0.5)
df1 = ddply(df1, .(Value), summarise,
N = length(means),
mean = mean(means),
sd = sd(means),
se = sd(means) / sqrt(length(means)))
df1$id = "ID1"
df2 = ddply(df2, .(Value), summarise,
N = length(means),
mean = mean(means),
sd = sd(means),
se = sd(means) / sqrt(length(means)))
df2$id = "ID2"
df_all = rbind(df1, df2)
ggplot(df_all, aes(x=Value, y=mean, color = id)) +
geom_errorbar(aes(ymin=mean-se, ymax=mean+se), width=.1) +
geom_ribbon(aes(ymin=mean-se, ymax=mean+se),alpha=0.5) +
geom_line() +
geom_point()
Which results in the following graph:
Note that I have had to invent some data due to lack of example data form your side, so this might not entirely fit your situation. However, it nicely illustrates the approach.

Resources