Related
I am trying to make a plot to show the intuition behind logistic (or probit) regression. How would I make a plot that looks something like this in ggplot?
(Wolf & Best, The Sage Handbook of Regression Analysis and Causal Inference, 2015, p. 155)
Actually, what I would rather even do is have one single normal distribution displayed along the y axis with mean = 0, and a specific variance, so that I can draw horizontal lines going from the linear predictor to the y axis and sideways normal distribution. Something like this:
What this is supposed to show (assuming I haven't misunderstood something) is . I haven't had much success so far...
library(ggplot2)
x <- seq(1, 11, 1)
y <- x*0.5
x <- x - mean(x)
y <- y - mean(y)
df <- data.frame(x, y)
# Probability density function of a normal logistic distribution
pdfDeltaFun <- function(x) {
prob = (exp(x)/(1 + exp(x))^2)
return(prob)
}
# Tried switching the x and y to be able to turn the
# distribution overlay 90 degrees with coord_flip()
ggplot(df, aes(x = y, y = x)) +
geom_point() +
geom_line() +
stat_function(fun = pdfDeltaFun)+
coord_flip()
I think this comes pretty close to the first illustration you give. If this is a thing you don't need to repeat many times, it is probably best to compute the density curves prior to plotting and use a seperate dataframe to plot these.
library(ggplot2)
x <- seq(1, 11, 1)
y <- x*0.5
x <- x - mean(x)
y <- y - mean(y)
df <- data.frame(x, y)
# For every row in `df`, compute a rotated normal density centered at `y` and shifted by `x`
curves <- lapply(seq_len(NROW(df)), function(i) {
mu <- df$y[i]
range <- mu + c(-3, 3)
seq <- seq(range[1], range[2], length.out = 100)
data.frame(
x = -1 * dnorm(seq, mean = mu) + df$x[i],
y = seq,
grp = i
)
})
# Combine above densities in one data.frame
curves <- do.call(rbind, curves)
ggplot(df, aes(x, y)) +
geom_point() +
geom_line() +
# The path draws the curve
geom_path(data = curves, aes(group = grp)) +
# The polygon does the shading. We can use `oob_squish()` to set a range.
geom_polygon(data = curves, aes(y = scales::oob_squish(y, c(0, Inf)),group = grp))
The second illustration is pretty close to your code. I simplified your density function by the standard normal density function and added some extra paramters to stat function:
library(ggplot2)
x <- seq(1, 11, 1)
y <- x*0.5
x <- x - mean(x)
y <- y - mean(y)
df <- data.frame(x, y)
ggplot(df, aes(x, y)) +
geom_point() +
geom_line() +
stat_function(fun = dnorm,
aes(x = after_stat(-y * 4 - 5), y = after_stat(x)),
xlim = range(df$y)) +
# We fill with a polygon, squishing the y-range
stat_function(fun = dnorm, geom = "polygon",
aes(x = after_stat(-y * 4 - 5),
y = after_stat(scales::oob_squish(x, c(-Inf, -1)))),
xlim = range(df$y))
I have a graph of vertices and edges which I'd like to plot using a fruchtermanreingold layout.
Here's the graph edges matrix:
edge.mat <- matrix(as.numeric(strsplit("3651,0,0,1,0,0,0,0,2,0,11,2,0,0,0,300,0,1,0,0,66,0,78,9,0,0,0,0,0,0,11690,0,1,0,0,0,0,0,0,0,0,493,1,1,0,4288,5,0,0,36,0,9,7,3,0,6,1,0,1,7,490,0,0,0,6,0,0,628,6,12,0,0,0,0,0,641,0,0,4,0,0,0,0,0,0,66,0,0,0,0,3165,0,281,0,0,0,0,0,0,0,0,45,1,0,0,35248,0,1698,2,0,1,0,2,99,0,0,6,29,286,0,31987,0,1,10,0,8,0,16,0,21,1,0,0,1718,0,51234,0,0,17,3,12,0,0,7,0,0,0,1,0,2,16736,0,0,0,3,0,0,4,630,0,0,0,9,0,0,29495,53,6,0,0,0,0,5,0,0,0,0,3,0,19,186,0,0,0,482,8,12,0,1,0,7,1,0,6,0,26338",
split = ",")[[1]]),
nrow = 14,
dimnames = list(LETTERS[1:14], LETTERS[1:14]))
I then create an igraph object from that using:
gr <- igraph::graph_from_adjacency_matrix(edge.mat, mode="undirected", weighted=T, diag=F)
And then use ggnetwork to convert gr to a data.frame, with specified vertex colors:
set.seed(1)
gr.df <- ggnetwork::ggnetwork(gr,
layout="fruchtermanreingold",
weights="weight",
niter=50000,
arrow.gap=0)
And then I plot it using ggplot2 and ggnetwork:
vertex.colors <- strsplit("#00BE6B,#DC2D00,#F57962,#EE8044,#A6A400,#62B200,#FF6C91,#F77769,#EA8332,#DA8E00,#C59900,#00ACFC,#C49A00,#DC8D00",
split=",")[[1]]
library(ggplot2)
library(ggnetwork)
ggplot(gr.df, aes(x = x, y = y, xend = xend, yend = yend)) +
geom_edges(color = "gray", aes(size = weight)) +
geom_nodes(color = "black")+
geom_nodelabel(aes(label = vertex.names),
color = vertex.colors, fontface = "bold")+
theme_minimal() +
theme(axis.text=element_blank(),
axis.title=element_blank(),
legend.position="none")
In my case each vertex actually represents many points, where each vertex has a different number of points. Adding that information to gr.df:
gr.df$n <- NA
gr.df$n[which(is.na(gr.df$weight))] <- as.integer(runif(length(which(is.na(gr.df$weight))), 100, 500))
What I'd like to do is add to the plot gr.df$n radially jittered points around each vertex (i.e., with its corresponding n), with the same vertex.colors coding. Any idea how to do that?
I think sampling and then plotting with geom_point is a reasonable strategy. (otherwise you could create your own geom).
Here is some rough code, starting from the relevant bit of your question
gr.df$n <- 1
gr.df$n[which(is.na(gr.df$weight))] <- as.integer(runif(length(which(is.na(gr.df$weight))), 100, 500))
# function to sample
# https://stackoverflow.com/questions/5837572/generate-a-random-point-within-a-circle-uniformly
circSamp <- function(x, y, R=0.1){
n <- length(x)
A <- a <- runif(n,0,1)
b <- runif(n,0,1)
ind <- b < a
a[ind] <- b[ind]
b[ind] <- A[ind]
xn = x+b*R*cos(2*pi*a/b)
yn = y+b*R*sin(2*pi*a/b)
cbind(x=xn, y=yn)
}
# sample
d <- with(gr.df, data.frame(vertex.names=rep(vertex.names, n),
circSamp(rep(x,n), rep(y,n))))
# p is your plot
p + geom_point(data=d, aes(x, y, color = vertex.names),
alpha=0.1, inherit.aes = FALSE) +
scale_color_manual(values = vertex.colors)
Giving
I would like to add an image to two ggplots on the top-right. My problem is that the scales are different which is why I have to specify xmin, xmax... for each plot, seperately:
library(ggplot2)
library(png)
library(grid)
# subsetting
am0 <- subset(mtcars, am == 0)
am1 <- subset(mtcars, am == 1)
# ggplot
plt <- function(dat){
p <- ggplot(dat, aes(mpg,disp)) +
geom_point()
return(p)
}
p0 <- plt(am0)
p1 <- plt(am1)
# reading image
img <- readPNG(system.file("img", "Rlogo.png", package = "png"))
# adding image
p0 + annotation_custom(rasterGrob(img), xmin = 20, xmax = 25, ymin = 400, ymax = 450)
p1 + annotation_custom(rasterGrob(img), xmin = 30, xmax = 35, ymin = 300, ymax = 350)
leads to the following results:
My question is: Is there a way to add images so that the size and the position of the image is constant and not dependent on the scale of the plot?
it sould look something like this:
We can automate the process of specifying the location and scales, so that you don't need to change the locations manually, as shown in the following example:
get.xy <- function(p) {
g_data <- ggplot_build(p)
data.frame(xmax = max(g_data$data[[1]]$x),
ymax = max(g_data$data[[1]]$y),
xmin = min(g_data$data[[1]]$x),
ymin = min(g_data$data[[1]]$y))
}
# this returns the dataframe with required x, y params for annotation_custom,
# ensuring the size and position of the image constant
get.params.df <- function(p0, p1, width, height) {
df0 <- cbind.data.frame(get.xy(p0), width=width, height=height)
df1 <- cbind.data.frame(get.xy(p1))
df1$width <- df0$width*(df1$xmax-df1$xmin)/(df0$xmax-df0$xmin)
df1$height <- df0$height*(df1$ymax-df1$ymin)/(df0$ymax-df0$ymin)
df <- rbind(df0, df1)
return(data.frame(xmin=df$xmax-df$width, xmax=df$xmax+df$width, ymin=df$ymax-df$height, ymax=df$ymax+df$height))
}
p0 <- plt(am0)
p1 <- plt(am1)
df <- get.params.df(p0, p1, width=10, height=10)
# adding image
library(gridExtra)
grid.arrange(
p0 + annotation_custom(rasterGrob(img), xmin=df[1,1],xmax=df[1,2], ymin=df[1,3], ymax=df[1,4]),
p1 + annotation_custom(rasterGrob(img), xmin=df[2,1],xmax=df[2,2], ymin=df[2,3], ymax=df[2,4])
)
If you want bigger image change the width height parameter only, everything else remains unchanged.
df <- get.params.df(p0, p1, width=25, height=25)
library(gridExtra)
grid.arrange(
p0 + annotation_custom(rasterGrob(img), xmin=df[1,1],xmax=df[1,2], ymin=df[1,3], ymax=df[1,4]),
p1 + annotation_custom(rasterGrob(img), xmin=df[2,1],xmax=df[2,2], ymin=df[2,3], ymax=df[2,4])
)
Suppose I want to plot the following data:
# First set of X coordinates
x <- seq(0, 10, by = 0.2)
# Angles from 0 to 90 degrees
angles <- seq(0, 90, length.out = 10)
# Convert to radian
angles <- deg2rad(angles)
# Create an empty data frame
my.df <- data.frame()
# For each angle, populate the data frame
for (theta in angles) {
y <- sin(x + theta)
tmp <- data.frame(x = x, y = y, theta = as.factor(theta))
my.df <- rbind(my.df, tmp)
}
x1 <- seq(0, 12, by = 0.3)
y1 <- sin(x1 - 0.5)
tmp <- data.frame(x = x1, y = y1, theta = as.factor(-0.5))
my.df <- rbind(my.df, tmp)
ggplot(my.df, aes(x, y, color = theta)) + geom_line()
That gives me a nice plot:
Now I want to draw a heat map out of this data set. There are tutorials here and there that do it using geom_tile to do it.
So, let's try:
# Convert the angle values from factors to numerics
my.df$theta <- as.numeric(levels(my.df$theta))[my.df$theta]
ggplot(my.df, aes(theta, x)) + geom_tile(aes(fill = y)) + scale_fill_gradient(low = "blue", high = "red")
That does not work, and the reason is that my x coordinates do not have the same step:
x <- seq(0, 10, by = 0.2) vs x1 <- seq(0, 12, by = 0.3)
But as soon as I use the same step x1 <- seq(0, 12, by = 0.2), it works:
I real life, my data sets are not regularly spaced (these are experimental data), but I still need to display them as a heat map. How can I do?
You can use akima to interpolate the function into a form suitable for heat map plots.
library(akima)
library(ggplot2)
my.df.interp <- interp(x = my.df$theta, y = my.df$x, z = my.df$y, nx = 30, ny = 30)
my.df.interp.xyz <- as.data.frame(interp2xyz(my.df.interp))
names(my.df.interp.xyz) <- c("theta", "x", "y")
ggplot(my.df.interp.xyz, aes(x = theta, y = x, fill = y)) + geom_tile() +
scale_fill_gradient(low = "blue", high = "red")
If you wish to use a different resolution you can change the nx and ny arguments to interp.
Another way to do it with just ggplot2 is to use stat_summary_2d.
library(ggplot2)
ggplot(my.df, aes(x = theta, y = x, z = y)) + stat_summary_2d(binwidth = 0.3) +
scale_fill_gradient(low = "blue", high = "red")
I want to add shaded areas to a chart to help people understand where bad, ok, and good points can fit.
Good = x*y>=.66
Ok = x*y>=.34
Bad = x*y<.34
Generating the right sequence of data to correctly apply the curved boundaries to the chart is proving tough.
What is the most elegant way to generate the curves?
Bonus Q: How would you do this to produce non-overlapping areas so that different colours could be used?
Updates
I've managed to do in a rather hacky way the drawing of the circle segments. I updated the MRE to use the revised segMaker function.
MRE
library(ggplot2)
pts<-seq(0,1,.02)
x<-sample(pts,50,replace=TRUE)
y<-sample(pts,50,replace=TRUE)
# What function will generate correct sequence of values as these are linear?
segMaker<-function(x,by){
# Original
# data.frame(x=c(seq(0,x,by),0)
# ,y=c(seq(x,0,-by),0)
# )
zero <- data.frame(x = 0, y = 0)
rs <- seq(0, pi, by)
xc <- x * cos(rs)
yc <- x * sin(rs)
gr <- data.frame(x = xc, y = yc)
gr <- rbind(gr[gr$x >= 0, ], zero)
return(gr)
}
firstSeg <-segMaker(.34,0.02)
secondSeg <-segMaker(.66,0.02)
thirdSeg <-segMaker(1,0.02)
ggplot(data.frame(x,y),aes(x,y, colour=x*y))+
geom_point() +
geom_polygon(data=firstSeg, fill="blue", alpha=.25)+
geom_polygon(data=secondSeg, fill="blue", alpha=.25)+
geom_polygon(data=thirdSeg, fill="blue", alpha=.25)
Current & desired shadings
You can create a data frame with the boundaries between each region and then use geom_ribbon to plot it. Here's an example using the conditions you supplied (which result in boundaries that are the reciprocal function, rather than circles, but the idea is the same, whichever function you use for the boundaries):
library(ggplot2)
# Fake data
pts<-seq(0,1,.02)
set.seed(19485)
x<-sample(pts,50,replace=TRUE)
y<-sample(pts,50,replace=TRUE)
df = data.frame(x,y)
# Region boundaries
x = seq(0.001,1.1,0.01)
bounds = data.frame(x, ymin=c(-100/x, 0.34/x, 0.66/x),
ymax=c(0.34/x, 0.66/x, 100/x),
g=rep(c("Bad","OK","Good"), each=length(x)))
bounds$g = factor(bounds$g, levels=c("Bad","OK","Good"))
ggplot() +
coord_cartesian(ylim=0:1, xlim=0:1) +
geom_ribbon(data=bounds, aes(x, ymin=ymin, ymax=ymax, fill=g), colour="grey50", lwd=0.2) +
geom_point(data=df, aes(x,y), colour="grey20") +
scale_fill_manual(values=hcl(c(15, 40, 240), 100, 80)) +
#scale_fill_manual(values=hcl(c(15, 40, 240), 100, 80, alpha=0.25)) + # If you want the fill colors to be transparent
labs(fill="") +
guides(fill=guide_legend(reverse=TRUE))
For circular boundaries, assuming we want boundaries at r=1/3 and r=2/3:
# Calculate y for circle, given r and x
cy = function(r, x) {sqrt(r^2 - x^2)}
n = 200
x = unlist(lapply(c(1/3,2/3,1), function(to) seq(0, to, len=n)))
bounds = data.frame(x, ymin = c(rep(0, n),
cy(1/3, seq(0, 1/3, len=n/2)), rep(0, n/2),
cy(2/3, seq(0, 2/3, len=2*n/3)), rep(0, n/3)),
ymax = c(cy(1/3, seq(0,1/3,len=n)),
cy(2/3, seq(0,2/3,len=n)),
rep(1,n)),
g=rep(c("Bad","OK","Good"), each=n))
bounds$g = factor(bounds$g, levels=c("Bad","OK","Good"))
If you can use a github package, ggforce adds geom_arc_bar():
# devtools::install_github('thomasp85/ggforce')
library(ggplot2)
library(ggforce)
pts<-seq(0,1,.02)
x<-sample(pts,50,replace=TRUE)
y<-sample(pts,50,replace=TRUE)
arcs <- data.frame(
x0 = 0,
y0 = 0,
start = 0,
end = pi / 2,
r0 = c(0, 1/3, 2/3),
r = c(1/3, 2/3, 1),
fill = c("bad", "ok", "good")
)
ggplot() +
geom_arc_bar(data = arcs,
aes(x0 = x0, y0 = y0, start = start, end = end, r0 = r0, r = r,
fill = fill), alpha = 0.6) +
geom_point(data = data.frame(x = x, y = y),
aes(x = x, y = y))
Based on #eipi10's great answer, to do the product component (basically ends up with the same thing) I did:
library(ggplot2)
library(data.table)
set.seed(19485)
pts <- seq(0, 1, .001)
x <- sample(pts, 50, replace = TRUE)
y <- sample(pts, 50, replace = TRUE)
df <- data.frame(x,y)
myRibbon<-CJ(pts,pts)
myRibbon[,prod:=V1 * V2]
myRibbon[,cat:=ifelse(prod<=1/3,"bad",
ifelse(prod<=2/3,"ok","good"))]
myRibbon<-myRibbon[
,.(ymin=min(V2),ymax=max(V2))
,.(cat,V1)]
ggplot() +
geom_ribbon(data=myRibbon
, aes(x=V1, ymin=ymin,ymax=ymax
, group=cat, fill=cat),
colour="grey90", lwd=0.2, alpha=.5)+
geom_point(data=df, aes(x,y), colour="grey20") +
theme_minimal()
This doesn't do anything fancy but works out for each value of x, what the smallest and largest values were that could give rise to a specific banding.
If I had just wanted arcs, the use of ggforce (#GregF) would be really great- it tucks away all the complexity.