Most succinct way to label/annotate extreme values with ggplot? - r

I'd like to annotate all y-values greater than a y-threshold using ggplot2.
When you plot(lm(y~x)), using the base package, the second graph that pops up automatically is Residuals vs Fitted, the third is qqplot, and the fourth is Scale-location. Each of these automatically label your extreme Y values by listing their corresponding X value as an adjacent annotation. I'm looking for something like this.
What's the best way to achieve this base-default behavior using ggplot2?

Updated scale_size_area() in place of scale_area()
You might be able to take something from this to suit your needs.
library(ggplot2)
#Some data
df <- data.frame(x = round(runif(100), 2), y = round(runif(100), 2))
m1 <- lm(y ~ x, data = df)
df.fortified = fortify(m1)
names(df.fortified) # Names for the variables containing residuals and derived qquantities
# Select extreme values
df.fortified$extreme = ifelse(abs(df.fortified$`.stdresid`) > 1.5, 1, 0)
# Based on examples on page 173 in Wickham's ggplot2 book
plot = ggplot(data = df.fortified, aes(x = x, y = .stdresid)) +
geom_point() +
geom_text(data = df.fortified[df.fortified$extreme == 1, ],
aes(label = x, x = x, y = .stdresid), size = 3, hjust = -.3)
plot
plot1 = ggplot(data = df.fortified, aes(x = .fitted, y = .resid)) +
geom_point() + geom_smooth(se = F)
plot2 = ggplot(data = df.fortified, aes(x = .fitted, y = .resid, size = .cooksd)) +
geom_point() + scale_size_area("Cook's distance") + geom_smooth(se = FALSE, show_guide = FALSE)
library(gridExtra)
grid.arrange(plot1, plot2)

Related

Is there an equivalent to points() on ggplot2

I'm working with stock prices and trying to plot the price difference.
I created one using autoplot.zoo(), my question is, how can I manage to change the point shapes to triangles when they are above the upper threshold and to circles when they are below the lower threshold. I understand that when using the basic plot() function you can do these by calling the points() function, wondering how I can do this but with ggplot2.
Here is the code for the plot:
p<-autoplot.zoo(data, geom = "line")+
geom_hline(yintercept = threshold, color="red")+
geom_hline(yintercept = -threshold, color="red")+
ggtitle("AAPL vs. SPY out of sample")
p+geom_point()
We can't fully replicate without your data, but here's an attempt with some sample generated data that should be similar enough that you can adapt for your purposes.
# Sample data
data = data.frame(date = c(2001:2020),
spread = runif(20, -10,10))
# Upper and lower threshold
thresh <- 4
You can create an additional variable that determines the shape, based on the relationship in the data itself, and pass that as an argument into ggplot.
# Create conditional data
data$outlier[data$spread > thresh] <- "Above"
data$outlier[data$spread < -thresh] <- "Below"
data$outlier[is.na(data$outlier)] <- "In Range"
library(ggplot2)
ggplot(data, aes(x = date, y = spread, shape = outlier, group = 1)) +
geom_line() +
geom_point() +
geom_hline(yintercept = c(thresh, -thresh), color = "red") +
scale_shape_manual(values = c(17,16,15))
# If you want points just above and below# Sample data
data = data.frame(date = c(2001:2020),
spread = runif(20, -10,10))
thresh <- 4
data$outlier[data$spread > thresh] <- "Above"
data$outlier[data$spread < -thresh] <- "Below"
ggplot(data, aes(x = date, y = spread, shape = outlier, group = 1)) +
geom_line() +
geom_point() +
geom_hline(yintercept = c(thresh, -thresh), color = "red") +
scale_shape_manual(values = c(17,16))
Alternatively, you can just add the points above and below the threshold as individual layers with manually specified shapes, like this. The pch argument points to shape type.
# Another way of doing this
data = data.frame(date = c(2001:2020),
spread = runif(20, -10,10))
# Upper and lower threshold
thresh <- 4
ggplot(data, aes(x = date, y = spread, group = 1)) +
geom_line() +
geom_point(data = data[data$spread>thresh,], pch = 17) +
geom_point(data = data[data$spread< (-thresh),], pch = 16) +
geom_hline(yintercept = c(thresh, -thresh), color = "red") +
scale_shape_manual(values = c(17,16))

why geom_smooth not showing the smooth line?

I am having a problem where geom_smooth() is not working on my ggplot2.
But instead of a smooth curve, there is a fold.
My X-axis variable is the factor variable(I've tried to convert it to a numerical variable, but it didn't work), and Y-axis is numeric variable.
My data.frame is that
ggplot(tmp, aes(x = x, y = y))+
geom_point()+
geom_smooth(formula = y ~ x, method = "loess", stat = "identity", se = T, group = "")
I hope to get a pic like this.
A quick fix will be to wrap the group inside aes. Generated a data similar to the structure you have (a factor x variable and a numeric y var).
set.seed(777)
x <- rep(c(LETTERS[1:7]), 3)
y <- rnorm(21, mean = 0, sd = 1)
tmp <- data.frame(x,y)
# -------------------------------------------------------------------------
base <- ggplot(tmp, aes(x = x, y = y))+geom_point()
base + geom_smooth(formula = y ~ x, method = "loess",se = TRUE, aes(group = "" ), level = 0.95) + theme_bw()
If you want to use a different level of confidence interval, you can change the value of level (which is a 95% by default).
Output

Transfer calculated value from stat_smooth to other geom like linerange

I have a question about ggplot2.
I want to connect data point with ols result via vertical line, like the code listed below.
Can I transfer ..y.., the value calculated by stat_smooth, to geom_linerange directly?
I tried stat_smooth(..., geom = "linerange", mapping(aes(ymin=pmin(myy, ..y..), ymax=pmax(myy,..y..)) but it is not the result I want.
library(ggplot2)
df <- data.frame(myx = 1:10,
myy = c(1:10) * 5 + 2 * rnorm(10, 0, 1))
lm.fit <- lm("myy~myx", data = df)
pred <- predict(lm.fit)
ggplot(df, aes(myx, myy)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
geom_linerange(mapping = aes(ymin = pmin(myy, pred),
ymax = pmax(myy, pred)))
stat_smooth evaluates the values at n evenly spaced points, with n = 80 by default. These points may not coincide with the original x values in your data frame.
Since you are calculating predicted values anyway, it would probably be more straightforward to add that back to your data frame and plot all geom layers based on that as your data source, for example:
df$pred <- pred
ggplot(df, aes(myx, myy)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
geom_linerange(aes(ymin = myy, ymax = pred))

Adjusting axis in ggtern ternary plots

I am trying to generate a ternary plot using ggtern.
My data ranges from 0 - 1000 for x, y,and z variables. I wondered if it is possible to extend the axis length above 100 to represent my data.
#Nevrome is on the right path, your points will still be plotted as 'compositions', ie, concentrations sum to unity, but you can change the labels of the axes, to indicate a range from 0 to 1000.
library(ggtern)
set.seed(1)
df = data.frame(x = runif(10)*1000,
y = runif(10)*1000,
z = runif(10)*1000)
breaks = seq(0,1,by=0.2)
ggtern(data = df, aes(x, y, z)) +
geom_point() +
limit_tern(breaks=breaks,labels=1000*breaks)
I think there is no direct solution to do this with ggtern. But an easy workaround could look like this:
library(ggtern)
df = data.frame(x = runif(50)*1000,
y = runif(50)*1000,
z = runif(50)*1000,
Group = as.factor(round(runif(50,1,2))))
ggtern() +
geom_point(data = df, aes(x/10, y/10, z/10, color = Group)) +
labs(x="X", y="Y", z="Z", title="Title") +
scale_T_continuous(breaks = seq(0,1,0.2), labels = 1000*seq(0,1,0.2)) +
scale_L_continuous(breaks = seq(0,1,0.2), labels = 1000*seq(0,1,0.2)) +
scale_R_continuous(breaks = seq(0,1,0.2), labels = 1000*seq(0,1,0.2))

Barplot with horizontal marker and asterisks in R (ggplot etc)

Using R's standard plot or better still GGPLOT,
is there a way to create a plot like this?
Note especially the horizontal lines across selected bar
with asterisk on top of it.
I don't know of an easy way to annotate graphs like this in ggplot2. Here's a relatively generic approach to make the data you'd need to plot. You can use a similar approach to annotate the relationships as necessary. I'll use the iris dataset as an example:
library(ggplot2)
library(plyr) #for summarizing data
#summarize average sepal length by species
dat <- ddply(iris, "Species", summarize, length = mean(Sepal.Length))
#Create the data you'll need to plot for the horizontal lines
horzlines <- data.frame(x = 1,
xend = seq_along(dat$Species)[-1],
y = seq(from = max(dat$length), by = 0.5, length.out = length(unique(dat$Species))-1),
yend = seq(from = max(dat$length), by = 0.5, length.out = length(unique(dat$Species))-1),
label = c("foo", "bar")
)
ggplot() +
geom_histogram(data = dat, aes(Species, length), stat = "identity") +
geom_segment(data = horzlines, aes(x = x, xend = xend, y = y, yend = yend)) +
geom_text(data = horzlines, aes(x = (x + xend)/2, y = y + .25, label = label))
Giving you something like this:

Resources