I am trying to overlay a plot and a histogram in R, using the ggplot2 package.
The plot contains a set of curves (visualized as straight lines due to the logarithmic axis) and a horizontal line.
I would like to plot on the same image a histogram showing the density distribution of the crossing points between the curves and the horizontal line. I can plot the histogram alone but not on the graph, because the aes length is not the same (the last intersection is at x = 800, while the x axis is much longer).
the code I wrote is:
# Attempt to draw the histogram of the crossing points on top of the base plot.
# NOTE(review): in the reproducible example below, `baseplot` is created with a
# plot-level mapping that includes `y = horiz`; a layer added here inherits that
# mapping unless it is given `inherit.aes = FALSE` -- presumably the source of
# the aesthetics error; confirm.
baseplot +
geom_histogram(data = timesdf, aes(v)) + xlim(0,2000)
where v contains the intersections between the curves and the dashed line.
Any ideas?
edited: as suggested I wrote a little reproducible example:
library(ggplot2)
# Integer grid on which the sample curves are evaluated.
xvalues <- 0:100

# Four polynomial curves that each rise through the reference level.
yvalues1 <- xvalues^2 - 1000
yvalues2 <- xvalues^3 - 100
yvalues3 <- xvalues^4 - 10
yvalues4 <- xvalues^5 - 50

# One row per grid point: the x value, the constant horizontal level (5),
# and the four curves.
plotdf <- data.frame(
  xvalues = xvalues,
  horiz = 5,
  vert1 = yvalues1,
  vert2 = yvalues2,
  vert3 = yvalues3,
  vert4 = yvalues4
)
# Dashed horizontal reference level plus the four curves. Every curve layer
# inherits the data and the x mapping from the plot and only overrides y.
# coord_cartesian() crops the view without dropping any data.
baseplot <- ggplot(plotdf, aes(x = xvalues, y = horiz)) +
  geom_line(linetype = "dashed", size = 1) +
  geom_line(aes(y = vert1)) +
  geom_line(aes(y = vert2)) +
  geom_line(aes(y = vert3)) +
  geom_line(aes(y = vert4)) +
  coord_cartesian(xlim = c(0, 100), ylim = c(0, 1000))
baseplot
# For each curve column, record the last grid x at which the curve is still
# below the horizontal level, i.e. the grid point just before the crossing.
# Only the curve columns are visited: `xvalues` is the grid itself and `horiz`
# is never below its own level, so including them (as a loop over every column
# would) yields a meaningless entry and an NA with a max() warning.
# A curve that is never below the level gives NA rather than a warning.
v <- vapply(
  setdiff(names(plotdf), c("xvalues", "horiz")),
  function(curve) {
    below <- which(plotdf[[curve]] < plotdf$horiz)
    if (length(below) == 0L) {
      return(NA_integer_)
    }
    as.integer(plotdf$xvalues[max(below)])
  },
  integer(1),
  USE.NAMES = FALSE
)
# Single-column data frame (column `v`) consumed by the histogram layer.
timesdf <- as.data.frame(v)
# my wish: visualize baseplot and histplot on the same image
# Stand-alone histogram of the crossing points, cropped to the same x range
# as the base plot.
histplot <- ggplot() +
  geom_histogram(mapping = aes(v), data = timesdf) +
  coord_cartesian(xlim = c(0, 100), ylim = c(0, 10))
I'm trying to produce a scatter plot with geom_point where the points are circumscribed by a smoothed polygon, with geom_polygon.
Here's my point data:
# Three clusters of 30 points each. The x and y means shift per cluster,
# `val` is standard normal, and `cluster` holds the (numeric) cluster id.
# Columns are drawn in the order x, y, val so the random stream is consumed
# in the same sequence as three separate 30-value draws per coordinate.
set.seed(1)
df <- data.frame(
  x = rnorm(90, mean = rep(c(-0.1, 0, 0.1), each = 30), sd = 0.1),
  y = rnorm(90, mean = rep(c(-1, 0, 1), each = 30), sd = 0.1),
  val = rnorm(90),
  cluster = rep(c(1, 2, 3), each = 30),
  stringsAsFactors = FALSE
)
I color each point according to the interval that df$val falls in. Here's the interval data:
# Lookup table of value intervals: display label, numeric bounds and colour.
# The final row ("not expressed") has no bounds and serves as the catch-all.
intervals.df <- data.frame(
  interval = c(
    "(-3,-2]", "(-2,-0.999]", "(-0.999,0]",
    "(0,1.96]", "(1.96,3.91]", "(3.91,5.87]",
    "not expressed"
  ),
  start = c(-3, -2, -0.999, 0, 1.96, 3.91, NA),
  end = c(-2, -0.999, 0, 1.96, 3.91, 5.87, NA),
  col = c(
    "#2f3b61", "#436CE8", "#E0E0FF",
    "#7d4343", "#C74747", "#EBCCD6",
    "#D3D3D3"
  ),
  stringsAsFactors = FALSE
)
Assigning colors and intervals to the points:
# Attach a colour and an interval label to every point.
#
# Each value is matched against the half-open intervals (start, end], which is
# what the interval labels describe. Matching with both ends closed would let a
# value that sits exactly on a shared boundary (e.g. 0) hit two rows and break
# the row count of the result.
# NA values get the last row of intervals.df (the catch-all entry).
# A non-NA value that falls outside every interval stops with an explicit
# message instead of failing later with a row-count mismatch.
# The row indices are collected first and the two columns are built once,
# rather than rbind-ing one small data frame per point.
df <- cbind(df, local({
  catch_all <- nrow(intervals.df)
  idx <- vapply(df$val, function(x) {
    if (is.na(x)) {
      return(catch_all)
    }
    # Rows with NA bounds compare to NA and are ignored by which().
    hit <- which(intervals.df$start < x & x <= intervals.df$end)
    if (length(hit) == 0L) {
      stop("value ", x, " does not fall in any interval of intervals.df",
           call. = FALSE)
    }
    hit[1L]
  }, integer(1))
  data.frame(
    col = intervals.df$col[idx],
    interval = intervals.df$interval[idx],
    stringsAsFactors = FALSE
  )
}))
Preparing the colors for the legend, which will show each interval:
# Make the interval a factor whose level order follows intervals.df, so the
# legend lists the intervals in table order.
df$interval <- factor(df$interval, levels = intervals.df$interval)

# Named colour vector (interval label -> colour) for the manual colour scale.
colors <- setNames(intervals.df$col, intervals.df$interval)
Here's where I constructed the smoothed polygons (using a function courtesy of this link):
# Distinct cluster ids found in df, in ascending order.
clusters <- sort(unique(df$cluster))
# Outline colours for the hull polygons; looked up by position in `clusters`
# when the hulls are built further down.
cluster.cols <- c("#ff00ff","#088163","#ccbfa5")
# Smooth a closed polygon by fitting splines through its vertices.
#
# xy:       n-by-2 matrix of polygon vertices (x in column 1, y in column 2);
#           the caller must supply n >= k, this is not checked.
# vertices: number of points requested from each spline over the padded
#           vertex sequence.
# k:        number of vertices wrapped around each end so the curve closes
#           smoothly; values below 1 skip the wrapping step.
# ...:      forwarded to both spline() calls.
#
# Returns a two-column matrix (columns "x1", "x2") with the spline points that
# belong to the original, un-padded part of the vertex sequence.
splinePolygon <- function(xy, vertices, k = 3, ...) {
  n <- nrow(xy)

  # Pad with the last k and the first k vertices to emulate a periodic curve.
  padded <- if (k >= 1) {
    rbind(xy[(n - k + 1):n, ], xy, xy[1:k, ])
  } else {
    xy
  }

  # Both coordinates are splined against the same vertex index.
  knots <- 1:(n + 2 * k)
  fit_x <- spline(knots, padded[, 1], n = vertices, ...)
  fit_y <- spline(knots, padded[, 2], n = vertices, ...)

  # Discard the parts of the curve that belong to the wrapped padding.
  inside <- fit_x$x > k & fit_x$x <= n + k
  cbind(x1 = fit_x$y, x2 = fit_y$y)[inside, ]
}
library(data.table)
# Build one smoothed convex-hull outline per cluster and stack them.
#
# For every cluster: take its points, keep the convex-hull vertices (in hull
# order), smooth them with splinePolygon() into 100 requested spline points,
# and return them with the same columns as df so the result can be plotted
# alongside it (val and interval are placeholders, hence NA).
# seq_along() is used so an empty `clusters` yields no iterations, and plain
# data-frame indexing replaces the data.table round-trip, which was only used
# to select rows.
hulls.df <- do.call(rbind, lapply(seq_along(clusters), function(l) {
  members <- df[which(df$cluster == clusters[l]), ]
  hull <- members[chull(members$x, members$y), ]
  spline.hull <- splinePolygon(cbind(hull$x, hull$y), 100)
  data.frame(
    x = spline.hull[, 1],
    y = spline.hull[, 2],
    val = NA,
    cluster = clusters[l],
    col = cluster.cols[l],
    interval = NA,
    stringsAsFactors = FALSE
  )
}))
# Cluster as a factor with levels in `clusters` order.
hulls.df$cluster <- factor(hulls.df$cluster, levels = clusters)
And here's my ggplot command:
library(ggplot2)
# Scatter plot of the points coloured by interval, with the smoothed cluster
# hulls drawn as hollow outlines in their per-row colours.
p <- ggplot(df, aes(x = x, y = y, colour = interval)) +
  geom_point(cex = 2, shape = 1, stroke = 1) +
  labs(x = "X", y = "Y") +
  theme_bw() +
  theme(
    legend.key = element_blank(),
    panel.border = element_blank(),
    strip.background = element_blank()
  ) +
  scale_color_manual(drop = FALSE, values = colors, name = "DE") +
  geom_polygon(
    data = hulls.df,
    aes(x = x, y = y, group = cluster),
    color = hulls.df$col,
    fill = NA
  )
which produces:
My question is: how do I add a legend for the polygons under the legend for the points? I want it to be a legend with 3 lines colored according to the cluster colors, with the corresponding cluster number beside each line.
Slightly different output, only changing the last line of your code, it may solve your purpose:
# Draw the hulls as translucent filled shapes keyed by cluster, which gives
# them their own fill legend.
p +
  geom_polygon(
    data = hulls.df,
    mapping = aes(x = x, y = y, group = cluster, fill = cluster),
    alpha = 0.1
  )
Say, you want to add a legend of the_factor. My basic idea is,
(1) put the_factor into mapping by using unused aes arguments; aes(xx = the_factor)
(2) if (1) affects something, delete the effect by using scale_xx_manual()
(3) modify the legend by using guides(xx = guide_legend(override.aes = list()))
In your case, aes(fill) and aes(alpha) are unused. The former is the better choice because mapping it has no visible effect on the points. So I used aes(fill=as.factor(cluster)).
# Legend trick: map the cluster onto the otherwise unused fill aesthetic so a
# second legend appears, then restyle that legend's keys with the hull colours.
p <- ggplot(
  df,
  aes(x = x, y = y, colour = interval, fill = as.factor(cluster))  # fill added
) +
  geom_point(cex = 2, shape = 1, stroke = 1) +
  labs(x = "X", y = "Y", fill = "cluster") +  # legend title for fill
  theme_bw() +
  theme(
    legend.key = element_blank(),
    panel.border = element_blank(),
    strip.background = element_blank()
  ) +
  scale_color_manual(drop = FALSE, values = colors, name = "DE") +
  # keys of the fill legend: open squares outlined in the cluster colours
  guides(
    fill = guide_legend(override.aes = list(colour = cluster.cols, pch = 0))
  ) +
  geom_polygon(
    data = hulls.df,
    aes(x = x, y = y, group = cluster),
    color = hulls.df$col,
    fill = NA
  )
Of course, you can make the same graph by using aes(alpha = the_factor). Because alpha does affect how the points are drawn, you need to neutralise it with scale_alpha_manual().
# Same legend trick using alpha instead of fill: the cluster is mapped to
# alpha on the point layer, the alpha scale is pinned to 1 for every level so
# the points look unchanged, and the alpha legend keys are restyled with the
# hull colours.
g <- ggplot(df, aes(x = x, y = y, colour = interval)) +
  geom_point(cex = 2, shape = 1, stroke = 1,
             aes(alpha = as.factor(cluster))) +  # alpha carries the cluster
  labs(x = "X", y = "Y", alpha = "cluster") +    # legend title for alpha
  theme_bw() +
  theme(
    legend.key = element_blank(),
    panel.border = element_blank(),
    strip.background = element_blank()
  ) +
  scale_color_manual(drop = FALSE, values = colors, name = "DE") +
  scale_alpha_manual(values = c(1, 1, 1)) +      # cancel the visual effect
  guides(
    alpha = guide_legend(override.aes = list(colour = cluster.cols, pch = 0))
  )
# The hulls are added to `g` itself; adding them to `p` would silently discard
# the alpha-based plot built above.
g <- g +
  geom_polygon(
    data = hulls.df,
    aes(x = x, y = y, group = cluster),
    color = hulls.df$col,
    fill = NA
  )
What you are asking for is two colour scales. My understanding is that this is not possible. But you can give the impression of having two colour scales with a bit of a cheat and using the filled symbols (shapes 21 to 25).
# Points carry the interval on the fill scale (shape 21 is one of the filled
# symbols), which leaves the colour scale free for the cluster outlines.
p <- ggplot(df, aes(x = x, y = y, fill = interval)) +
# colour = NA is set as a constant for the points, so only the fill is mapped
geom_point(cex = 2, shape = 21, stroke = 1, colour = NA)+
labs(x = "X", y = "Y") +
theme_bw() +
theme(legend.key = element_blank(), panel.border = element_blank(), strip.background = element_blank()) +
# interval colours now go through the fill scale; drop = FALSE keeps unused levels
scale_fill_manual(drop=FALSE, values=colors, name="DE") +
# hull outlines are mapped to colour by cluster; fill = NA keeps them hollow
geom_polygon(data = hulls.df, aes(x = x, y = y, colour = cluster), fill = NA) +
scale_colour_manual(values = cluster.cols)
# display the plot
p
Alternatively, use a filled polygon with a low alpha
# Alternative: keep the interval on the colour scale for the points and give
# the hulls a translucent fill keyed by cluster.
p <- ggplot(df,aes(x=x,y=y,colour=interval))+
geom_point(cex=2,shape=1,stroke=1)+
labs(x="X", y="Y")+
theme_bw() +
theme(legend.key = element_blank(),panel.border=element_blank(), strip.background=element_blank()) +
# override.aes sets fill = NA for the keys of the colour legend
scale_color_manual(drop=FALSE,values=colors,name="DE", guide = guide_legend(override.aes = list(fill = NA))) +
# hulls: fill mapped to cluster, drawn at alpha 0.2, legend explicitly requested
geom_polygon(data=hulls.df,aes(x=x,y=y,group=cluster, fill = cluster), alpha = 0.2, show.legend = TRUE) +
scale_fill_manual(values = cluster.cols)
# display the plot
p
But this might make the point colours difficult to see.
I am very new to R and ggplot2. I am trying to create a grid of plots of correlations, together with their trailing max and min values, using a for loop. The plots are then saved as PDFs to a directory. When they are saved, the blue lines (min and max) are plotted correctly. However, when I then use do.call(grid.arrange,t), or any other call to the plots in the list, I do not get the correct blue lines: the last plot's blue lines populate all of the plots.
I don't understand how this can plot and save the PDF correctly but not store the ggplot object correctly in the list t, or whether there is some confusion in the rendering done by do.call(grid.arrange,t). How can the original (black) line plot correctly while the geom_line additions do not? I am really confused.
If someone could kindly help me check this code and find out how to plot all lines correctly then place them in a grid that would be great.
Reproducible code below, using random data:
require(TTR)
require(ggplot2)
library(gridExtra)
# Build one plot per column of random data: the series itself plus its rolling
# maximum and minimum, save each plot as a PDF, keep it in a list, and finally
# arrange all stored plots in a grid.
set.seed(12345)
# Directory the PDFs are written to.
filelocation = "c:/"
# 500 observations of 5 random series.
values <- as.data.frame(matrix( rnorm(5*500,mean=0,sd=3), 500, 5))
# List that collects the plot objects, one per column.
t <- list()
# Window length for the rolling max/min.
rollLength = 25
for( i in 1:(ncol(values)))
{
p <- ggplot(data=values, aes(x = index(values)) )
# The series itself: the column is selected by its name, passed as a string.
p <- p + geom_line(data=values, aes_string(y = colnames(values)[i]))
# NOTE(review): `values[,i]` below sits inside aes(), which stores the
# expression rather than its value; ggplot2 evaluates it only when the plot is
# built. print()/ggsave() inside the loop build it while `i` is current, but the
# plots stored in `t` are built after the loop, when `i` is the last column --
# presumably why every grid panel shows the last column's blue lines. Confirm.
p <- p + geom_line(data = values, aes(x = index(values), y = runMax(values[,i], n = rollLength) ), colour = "blue", linetype = "longdash" )
p <- p + geom_line(data = values, aes(x = index(values), y = runMin(values[,i], n = rollLength) ), colour = "blue", linetype = "longdash" )
p <- p + ggtitle(colnames(values)[i]) + xlab("Date") + ylab("Pearson Correlation")
# Draw now and save to "<column name>.pdf" in filelocation.
print(p)
ggsave( file = paste(colnames(values)[i],".pdf",sep = "") , path = filelocation)
# Also expose the plot as a variable p1, p2, ... and store it in the list.
assign(paste("p", i, sep = ""), p)
t[[i]] <- p
}
# Arrange all stored plots in one grid.
do.call(grid.arrange,t)
Hmm, this isn't exactly what you want I think, but close, and less code
require(TTR)
require(ggplot2)
# Faceted version: one panel per series showing the series and its rolling
# maximum / minimum.
set.seed(12345)
values <- as.data.frame(matrix(rnorm(5 * 500, mean = 0, sd = 3), 500, 5))
rollLength <- 25
library(reshape2)

# Long format: one row per (series, observation); `variable` names the series.
dfmelt <- melt(values)

# Rolling extremes are computed separately for each series. Running them over
# the whole melted column would let a window at the start of one series include
# the tail of the previous one.
dfmelt$max <- ave(dfmelt$value, dfmelt$variable,
                  FUN = function(s) runMax(s, n = rollLength))
dfmelt$min <- ave(dfmelt$value, dfmelt$variable,
                  FUN = function(s) runMin(s, n = rollLength))

# Observation number within each series, so every panel shares the same x range
# instead of continuing the row count of the stacked data.
dfmelt$row <- ave(seq_along(dfmelt$value), dfmelt$variable, FUN = seq_along)

ggplot(dfmelt, aes(x = row, y = value)) +
  geom_line() +
  geom_line(aes(y = max), colour = "blue", linetype = "longdash") +
  geom_line(aes(y = min), colour = "blue", linetype = "longdash") +
  facet_wrap(~ variable, scales = "free")
I have following code to draw my logistic distribution:
# Density of a logistic distribution (location 200, scale 400 / log(10))
# evaluated on a grid of 1000 points.
x <- seq(-2000, 2000, length.out = 1000)
dat <- data.frame(x = x)
dat$value <- dlogis(x, location = 200, scale = 400 / log(10))
dat$type <- "Expected score"

# Density curve, cropped to the region of interest with ticks every 100.
p <- ggplot(data = dat, aes(x = x, y = value)) +
  geom_line(col = "blue", size = 1) +
  coord_cartesian(xlim = c(-500, 900), ylim = c(0, 0.0016)) +
  scale_x_continuous(breaks = seq(-500, 800, 100))

# Dashed vertical marker at x = 0 from y = 0 up to y = 0.0011. It is drawn as
# an annotation because it is not part of `dat`: passing two-element vectors
# through aes() only lines up with the 1000-row data by recycling, which
# ggplot2 no longer accepts.
pp <- p +
  annotate("segment", x = 0, xend = 0, y = 0, yend = 0.0011,
           size = 0.9, colour = "green", linetype = 2, alpha = 0.7)
Now what I would like to do is to highlight the area to the left of x = 0.
I tried to do it like this:
# Attempt to shade the area under the curve to the left of x = 0.
# Ten x positions from -500 to 0 and the density at each of them.
x = seq(-500, 0, length=10)
y = dlogis(x,location=200,scale=400/log(10))
pol <- data.frame(x = x, y = y)
# NOTE(review): `data=pol` is written inside aes(), so it is passed as an
# aesthetic mapping rather than as the layer's `data` argument -- presumably
# the reason this call fails; confirm.
# NOTE(review): `pol` only contains points on the curve; a polygon for the
# shaded area would also need the points along y = 0 to close the outline.
pp + geom_polygon(aes(data=pol,x=x, y=y), fill="light blue", alpha=0.6)
But this does not work. Not sure what I am doing wrong. Any help?
I haven't diagnosed the problem with your polygon (although I think you would need to give the full path around the outside, i.e. attach rep(0,length(x)) to the end of y and rev(x) to the end of x), but geom_ribbon (as in Shading a kernel density plot between two points. ) seems to do the trick:
# Shade the band between y = 0 and the curve over the x values in `x`.
# y is unmapped (NULL) for this layer; the upper edge comes from ymax and the
# lower edge is the constant ymin = 0.
pp +
  geom_ribbon(
    data = data.frame(x = x, y = y),
    mapping = aes(ymax = y, x = x, y = NULL),
    ymin = 0,
    fill = "light blue",
    alpha = 0.5
  )