Related
I was trying to recreate this plot:
using the following code -
library(tidyverse)
set.seed(0); r <- rnorm(10000);
df <- as.data.frame(r)
avg <- round(mean(r),2)
SD <- round(sd(r),2)
x.scale <- seq(from = avg - 3*SD, to = avg + 3*SD, by = SD)
x.lab <- c("-3SD", "-2SD", "-1SD", "Mean", "1SD", "2SD", "3SD")
df %>% ggplot(aes(r)) +
geom_histogram(aes(y=..density..), bins = 20,
colour="black", fill="lightblue") +
geom_density(alpha=.2, fill="darkblue") +
scale_x_continuous(breaks = x.scale, labels = x.lab) +
labs(x = "")
Using the code I plotted this:
,
but this isn't near to the plot that I am trying to create. How do I make an additional axis with the X axis? How do I add the lines to automatically show the percentage of observations? Is there any way, that I can create the plot as nearly identical as possible using ggplot2?
Welcome to SO. Excellent first question!
It's actually quite tricky. You'd need to create a second plot (the second x axis) but it's not the most straight forward to align both perfectly.
I will be using Z.lin's amazing modification of the cowplot package.
I am not using the reprex package, because I think I'd need to define every single function (and I don't know how to use trace within reprex.)
library(tidyverse)
library(cowplot)
set.seed(0); r <- rnorm(10000);
foodf <- as.data.frame(r)
avg <- round(mean(r),2)
SD <- round(sd(r),2)
x.scale <- round(seq(from = avg - 3*SD, to = avg + 3*SD, by = SD), 1)
x.lab <- c("-3SD", "-2SD", "-1SD", "Mean", "1SD", "2SD", "3SD")
x2lab <- -3:3
# calculate the density manually
dens_r <- density(r)
# for each x value, calculate the closest x value in the density object and get the respective y values
y_dens <- dens_r$y[sapply(x.scale, function(x) which.min(abs(dens_r$x - x)))]
# added annotation for segments and labels.
# Arrow segments can be added in a similar way.
p1 <-
ggplot(foodf, aes(r)) +
geom_histogram(aes(y=..density..), bins = 20,
colour="black", fill="lightblue") +
geom_density(alpha=.2, fill="darkblue") +
scale_x_continuous(breaks = x.scale, labels = x.lab) +
labs(x = NULL) +# use NULL here
annotate(geom = "segment", x = x.scale, xend = x.scale,
yend = 1.1 * max(dens_r$y), y = y_dens, lty = 2 ) +
annotate(geom = "text", label = x.lab,
x = x.scale, y = 1.2 * max(dens_r$y))
p2 <-
ggplot(foodf, aes(r)) +
scale_x_continuous(breaks = x.scale, labels = x2lab) +
labs(x = NULL) +
theme_classic() +
theme(axis.line.y = element_blank())
# This is with the modified plot_grid() / align_plot() function!!!
plot_grid(p1, p2, ncol = 1, align = "v", rel_heights = c(1, 0.1))
I'm trying to graph multiple nonlinear least squares regression in r in different colors based on the value of a variable.
However, I also display the equation of the last one, and I would like the color in the nonlinear regression corresponding to the equation to be black as well.
What I've tried is shown in the geom_smooth() layer - I tried to include an ifelse() statement, but this doesn't work because of reasons described here: Different between colour argument and aes colour in ggplot2?
test <- function() {
require(ggplot2)
set.seed(1);
master <- data.frame(matrix(NA_real_, nrow = 0, ncol = 3))
for( i in 1:5 ) {
df <- data.frame(matrix(NA_real_, nrow = 50, ncol = 3))
colnames(df) <- c("xdata", "ydata", "test")
df$xdata = as.numeric(sample(1:100, size = nrow(df), replace = FALSE))
df$ydata = as.numeric(sample(1:3, size = nrow(df), prob=c(.60, .25, .15), replace = TRUE))
# browser()
df$test = i
master <- rbind(master, df)
}
df <- master
last <- 5
# based on https://stackoverflow.com/questions/18305852/power-regression-in-r-similar-to-excel
power_eqn = function(df, start = list(a=300,b=1)) {
m = nls(as.numeric(reorder(xdata,-ydata)) ~ a*ydata^b, start = start, data = df)
# View(summary(m))
# browser()
# eq <- substitute(italic(hat(y)) == a ~italic(x)^b*","~~italic(r)^2~"="~r2*","~~p~"="~italic(pvalue),
eq <- substitute(italic(y) == a ~italic(x)^b*","~~italic('se')~"="~se*","~~italic(p)~"="~pvalue,
list(a = format(coef(m)[1], digits = 6), # a
b = format(coef(m)[2], digits = 6), # b
# r2 = format(summary(m)$r.squared, digits = 3),
se = format(summary(m)$parameters[2,'Std. Error'], digits = 6), # standard error
pvalue = format(summary(m)$coefficients[2,'Pr(>|t|)'], digits=6) )) # p value (based on t statistic)
as.character(as.expression(eq))
}
plot1 <- ggplot(df, aes(x = as.numeric(reorder(xdata,-ydata)), y = ydata ) ) +
geom_point(color="black", shape=1 ) +
# PROBLEM LINE
stat_smooth(aes(color=ifelse(test==5, "black", test)), method = 'nls', formula = 'y~a*x^b', method.args = list(start= c(a =1,b=1)),se=FALSE, fullrange=TRUE) +
geom_text(x = quantile(df$xdata)[4], y = max(df$ydata), label = power_eqn(df), parse = TRUE, size=4, color="black") + # make bigger? add border around?
theme(legend.position = "none", axis.ticks.x = element_blank() ) + #, axis.title.x = "family number", axis.title.y = "number of languages" ) # axis.text.x = element_blank(),
labs( x = "xdata", y = "ydata", title="test" )
plot1
}
test()
This is the graph I got.
I would like the line corresponding to the points and equation to be black as well. Does anyone know how to do this?
I do not want to use a scale_fill_manual, etc., because my real data would have many, many more lines - unless the scale_fill_manual/etc. can be randomly generated.
You could use scale_color_manual using a custom created palette where your level of interest (in your example where test equals 5) is set to black. Below I use palettes from RColorBrewer, extend them if necessary to the number of levels needed and sets the last color to black.
library(RColorBrewer) # provides several great palettes
createPalette <- function(n, colors = 'Greens') {
max_colors <- brewer.pal.info[colors, ]$maxcolors # Get maximum colors in palette
palette <- brewer.pal(min(max_colors, n), colors) # Get RColorBrewer palette
if (n > max_colors) {
palette <- colorRampPalette(palette)(n) # make it longer i n > max_colros
}
# assume that n-th color should be black
palette[n] <- "#000000"
# return palette
palette[1:n]
}
# create a palette with 5 levels using the Spectral palette
# change from 5 to the needed number of levels in your real data.
mypalette <- createPalette(5, 'Spectral') # palettes from RColorBrewer
We can then use mypalette with scale_color_manual(values=mypalette) to color points and lines according to the test variable.
Please note that I have updated geom_point and stat_smooth to so that they use aes(color=as.factor(test)). I have also changed the call to power_eqn to only use data points where df$test==5. The black points, lines and equation should now be based on the same data.
plot1 <- ggplot(df, aes(x = as.numeric(reorder(xdata,-ydata)), y = ydata )) +
geom_point(aes(color=as.factor(test)), shape=1) +
stat_smooth(aes(color=as.factor(test)), method = 'nls', formula = 'y~a*x^b', method.args = list(start= c(a =1,b=1)),se=FALSE, fullrange=TRUE) +
geom_text(x = quantile(df$xdata)[4], y = max(df$ydata), label = power_eqn(df[df$test == 5,]), parse = TRUE, size=4, color="black") +
theme(legend.position = "none", axis.ticks.x = element_blank() ) +
labs( x = "xdata", y = "ydata", title="test" ) +
scale_color_manual(values = mypalette)
plot1
See resulting figure here (not reputation enough to include them)
I hope you find my answer useful.
The idea is to combine R packages ClustOfVar and ggdendro to give a visual summary of variable clustering.
When there are few columns in the data, the result is pretty good except that there are areas not covered(as circled in the chart below). Using mtcars for example:
library(plyr)
library(ggplot2)
library(gtable)
library(grid)
library(gridExtra)
library(ClustOfVar)
library(ggdendro)
fit = hclustvar(X.quanti = mtcars)
labels = cutree(fit,k = 5)
labelx = data.frame(Names=names(labels),group = paste("Group",as.vector(labels)),num=as.vector(labels))
p1 = ggdendrogram(as.dendrogram(fit), rotate=TRUE)
df2<-data.frame(cluster=cutree(fit, k =5), states=factor(fit$labels,levels=fit$labels[fit$order]))
df3<-ddply(df2,.(cluster),summarise,pos=mean(as.numeric(states)))
p2 = ggplot(df2,aes(states,y=1,fill=factor(cluster)))+geom_tile()+
scale_y_continuous(expand=c(0,0))+
theme(axis.title=element_blank(),
axis.ticks=element_blank(),
axis.text=element_blank(),
legend.position="none")+coord_flip()+
geom_text(data=df3,aes(x=pos,label=cluster))
gp1<-ggplotGrob(p1)
gp2<-ggplotGrob(p2)
maxHeight = grid::unit.pmax(gp1$heights[2:5], gp2$heights[2:5])
gp1$heights[2:5] <- as.list(maxHeight)
gp2$heights[2:5] <- as.list(maxHeight)
grid.arrange(gp2, gp1, ncol=2,widths=c(1/6,5/6))
When there are a large number of columns, another issue occurs. That is, the height of the color tiles part does not match the height the dendrogram.
library(ClustOfVar)
library(ggdendro)
X = data.frame(mtcars,mtcars,mtcars,mtcars,mtcars,mtcars)
fit = hclustvar(X.quanti = X)
labels = cutree(fit,k = 5)
labelx = data.frame(Names=names(labels),group = paste("Group",as.vector(labels)),num=as.vector(labels))
p1 = ggdendrogram(as.dendrogram(fit), rotate=TRUE)
df2<-data.frame(cluster=cutree(fit, k =5), states=factor(fit$labels,levels=fit$labels[fit$order]))
df3<-ddply(df2,.(cluster),summarise,pos=mean(as.numeric(states)))
p2 = ggplot(df2,aes(states,y=1,fill=factor(cluster)))+geom_tile()+
scale_y_continuous(expand=c(0,0))+
theme(axis.title=element_blank(),
axis.ticks=element_blank(),
axis.text=element_blank(),
legend.position="none")+coord_flip()+
geom_text(data=df3,aes(x=pos,label=cluster))
gp1<-ggplotGrob(p1)
gp2<-ggplotGrob(p2)
maxHeight = grid::unit.pmax(gp1$heights[2:5], gp2$heights[2:5])
gp1$heights[2:5] <- as.list(maxHeight)
gp2$heights[2:5] <- as.list(maxHeight)
grid.arrange(gp2, gp1, ncol=2,widths=c(1/6,5/6))
#Sandy Muspratt has actually provided an excellent solution to this IF we have the R upgraded to version 3.3.1.
R: ggplot slight adjustment for clustering summary
But since I cannot change the version of the R deployed in the corporate server, I wonder if there is any other workaround that can align these two parts.
As far as I can tell, your code is not far wrong. The problem is that you are trying to match a continuous scale to a discrete scale when you merge the two plots. Also, it appears that ggdendrogram() adds additional space to the y-axis.
library(plyr)
library(ggplot2)
library(gtable)
library(grid)
library(gridExtra)
library(ClustOfVar)
library(ggdendro)
# Data
X = data.frame(mtcars,mtcars,mtcars,mtcars,mtcars,mtcars)
# Cluster analysis
fit = hclustvar(X.quanti = X)
# Labels data frames
df2 <- data.frame(cluster = cutree(fit, k =5),
states = factor(fit$labels, levels = fit$labels[fit$order]))
df3 <- ddply(df2, .(cluster), summarise, pos = mean(as.numeric(states)))
# Dendrogram
# scale_x_continuous() for p1 should match scale_x_discrete() from p2
# scale_x_continuous strips off the labels. I grab them from df2
# scale _y_continuous() puts a little space between the labels and the dendrogram
p1 <- ggdendrogram(as.dendrogram(fit), rotate = TRUE) +
scale_x_continuous(expand = c(0, 0.5), labels = levels(df2$states), breaks = 1:length(df2$states)) +
scale_y_continuous(expand = c(0.02, 0))
# Tiles and labels
p2 <- ggplot(df2,aes(states, y = 1, fill = factor(cluster))) +
geom_tile() +
scale_y_continuous(expand = c(0, 0)) +
scale_x_discrete(expand = c(0, 0)) +
geom_text(data = df3, aes(x = pos, label = cluster)) +
coord_flip() +
theme(axis.title = element_blank(),
axis.ticks = element_blank(),
axis.text = element_blank(),
legend.position = "none")
# Get the ggplot grobs
gp1 <- ggplotGrob(p1)
gp2 <- ggplotGrob(p2)
# Make sure the heights match
maxHeight <- unit.pmax(gp1$heights, gp2$heights)
gp1$heights <- as.list(maxHeight)
gp2$heights <- as.list(maxHeight)
# Combine the two plots
grid.arrange(gp2, gp1, ncol = 2,widths = c(1/6, 5/6))
How should I change the size of symbols in legends? I checked the document of theme but found no answer.
Here is an example:
library(ggplot2);library(grid)
set.seed(1000)
x <- 1:6
mu <- sin(x)
observed <- mu + rnorm(length(x), 0, 0.5*sd(mu))
data <- data.frame(
t=rep(x, 2),
value=c(mu, observed) - min(mu, observed) + 0.5,
class = rep(c("mu", "observed"), each=length(x)))
mu <- data$value[1:length(x)]
observed <- data$value[1:length(x) + length(x)]
mu.min <- mu - 3 * 0.5 * sd(mu)
mu.max <- mu + 3 * 0.5 * sd(mu)
g <- ggplot(data=data)
g <- g + geom_point(aes(x=value, y=t, shape=class, size=24)) + scale_size(guide="none")
g <- g + scale_shape(name="", labels=expression(paste(S[u](t), ", the observation at time ", t), paste(mu[u](t), ", the mean of ", tilde(S)[u](t), " ")))
stat_function.color <- gray(0.5)
g <- g + geom_segment(aes(y=1:6, yend=1:6, x=mu.min, xend=mu.max, linetype="2", alpha = 1), color=stat_function.color) + scale_alpha(guide="none") + scale_linetype(name= "", labels=expression(paste("probability density function (pdf) of ", tilde(S)[u], " at time ", t)))
for(i in 1:length(x)) {
g <- g + stat_function(fun=function(x, i) {
ifelse( x <= mu.max[i] & x >= mu.min[i], dnorm(x, mu[i], sd(mu)) + i, NA)
}, color=stat_function.color, args=list(i=i))
}
background.color <- gray(0.75)
g <- g + theme(
axis.text=element_blank(),
title=element_text(size=rel(1.5)),
legend.text=element_text(size=rel(1.5)),
legend.position="top",
legend.direction="vertical",
# legend.key.size = unit(2, "cm"),
panel.background=element_rect(fill=background.color),
panel.grid.major=element_line(color=background.color),
panel.grid.minor=element_line(color=background.color)
) + coord_flip()
plot(g)
You should use:
theme(legend.key.size = unit(3,"line"))
You can make these kinds of changes manually using the override.aes argument to guide_legend():
g <- g + guides(shape = guide_legend(override.aes = list(size = 5)))
print(g)
Marius's answer did not work for me as of R version 3.2.2. You can still call guide_legend() with the same override.aes argument but you will need to specify color instead of shape in the wrapper function.
So if you're running a later version of R, try this instead:
g + guides(color = guide_legend(override.aes = list(size=5)))
EDIT
As pointed out by #Ibo in the comment, this may have been due to the color scale in the ggplot object. If the ggplot object contains a color scale, the mapping of size (size=5) has to be set on the color instead.
If you want to change the sizes of 2 components of a legend independently, it gets trickier, but it can be done by manually editing the individual components of the plot using the grid package.
Example based on this SO answer:
set.seed(1)
dat <- data.frame(x = runif(n = 100),
x2 = factor(rep(c('first', 'second'), each = 50)))
set.seed(1)
dat$y = 5 + 1.8 * as.numeric(dat$x2) + .3 * dat$x + rnorm(100)
# basic plot
g <- ggplot(data = dat,
aes(x = x, y = y, color = x2))+
theme_bw()+
geom_point()+
geom_smooth(method = 'lm')
# make the size of the points & lines in the legend larger
g + guides(color = guide_legend(override.aes = list(size = 2)))
# Make JUST the legend points larger without changing the size of the legend lines:
# To get a list of the names of all the grobs in the ggplot
g
grid::grid.ls(grid::grid.force())
# Set the size of the point in the legend to 2 mm
grid::grid.gedit("key-[-0-9]-1-1", size = unit(4, "mm"))
# save the modified plot to an object
g2 <- grid::grid.grab()
ggsave(g2, filename = 'g2.png')
As of 12/1/2022, in ggplot2 version 3.4.0, the argument:
guides(shape = guide_legend(override.aes = list(size = 5)))
no longer works....instead, replace "size = 5" with "linewidth = 5"
Using ggplot2 I am plotting several functions and a series of points. I cannot figure out how to represent the points on the legend. I realize I need to use an aes() function, but I don't fully understand how to do this. I apologize that the example is so long, but I don't know how else to illustrate it.
## add ggplot2
library(ggplot2)
# Declare Chart values
y_label = expression("y_axis"~~bgroup("(",val / km^{2},")"))
x_label = "x_axis"
#############################
## Define functions
# Create a list to hold the functions
funcs <- list()
funcs[]
# loop through to define functions
for(k in 1:21){
# Make function name
funcName <- paste('func', k, sep = '' )
# make function
func = paste('function(x){exp(', k, ') * exp(x*0.01)}', sep = '')
funcs[[funcName]] = eval(parse(text=func))
}
# Specify values
yval = c(1:20)
xval = c(1:20)
# make a dataframe
d = data.frame(xval,yval)
# Specify Range
x_range <- range(1,51)
# make plot
p <-qplot(data = d,
x=xval,y=yval,
xlab = x_label,
ylab = y_label,
xlim = x_range
)+ geom_point(colour="green")
for(j in 1:length(funcs)){
p <- p + stat_function(aes(y=0),fun = funcs[[j]], colour="blue", alpha=I(1/5))
}
# make one function red
p <- p + stat_function(fun = funcs[[i]], aes(color="red"), size = 1) +
scale_colour_identity("", breaks=c("red", "green","blue"),
labels=c("Fitted Values", "Measured values","All values"))
# position legend and make remove frame
p <- p + opts(legend.position = c(0.85,0.7), legend.background = theme_rect(col = 0))
print(p)
Thank you in advance - I have learned I a lot from this community over the last few days.
See below for a solution. The main idea is the following: imagine the points having an invisible line under them, and the lines having invisible points. So each "series" gets color and shape and linetype attributes, and at the end we will manually set them to invisible values (0 for lines, NA for points) as necessary. ggplot2 will merge the legends for the three attributes automatically.
# make plot
p <- qplot(data = d, x=xval, y=yval, colour="Measured", shape="Measured",
linetype="Measured", xlab = x_label, ylab = y_label, xlim = x_range,
geom="point")
#add lines for functions
for(j in 1:length(funcs)){
p <- p + stat_function(aes(colour="All", shape="All", linetype="All"),
fun = funcs[[j]], alpha=I(1/5), geom="line")
}
# make one function special
p <- p + stat_function(fun = funcs[[1]], aes(colour="Fitted", shape="Fitted",
linetype="Fitted"), size = 1, geom="line")
# modify look
p <- p + scale_colour_manual("", values=c("green", "blue", "red")) +
scale_shape_manual("", values=c(19,NA,NA)) +
scale_linetype_manual("", values=c(0,1,1))
print(p)
Setting the colour aesthetic for each geom to a constant may help. Here is a small example:
require(ggplot2)
set.seed(666)
N<-20
foo<-data.frame(x=1:N,y=runif(N),z=runif(N))
p<-ggplot(foo)
p<-p+geom_line(aes(x,y,colour="Theory"))
p<-p+geom_point(aes(x,z,colour="Practice"))
#Optional, if you want your own colours
p<-p+scale_colour_manual("Source",c('blue','red'))
print(p)
This isn't supported natively in ggplot2, but I'm hoping I'll figure out how for a future version.