Related
I am trying to make a plot to show the intuition behind logistic (or probit) regression. How would I make a plot that looks something like this in ggplot?
(Wolf & Best, The Sage Handbook of Regression Analysis and Causal Inference, 2015, p. 155)
Actually, what I would rather even do is have one single normal distribution displayed along the y axis with mean = 0, and a specific variance, so that I can draw horizontal lines going from the linear predictor to the y axis and sideways normal distribution. Something like this:
What this is supposed to show (assuming I haven't misunderstood something) is . I haven't had much success so far...
library(ggplot2)
x <- seq(1, 11, 1)
y <- x*0.5
x <- x - mean(x)
y <- y - mean(y)
df <- data.frame(x, y)
# Probability density function of a normal logistic distribution
pdfDeltaFun <- function(x) {
prob = (exp(x)/(1 + exp(x))^2)
return(prob)
}
# Tried switching the x and y to be able to turn the
# distribution overlay 90 degrees with coord_flip()
ggplot(df, aes(x = y, y = x)) +
geom_point() +
geom_line() +
stat_function(fun = pdfDeltaFun)+
coord_flip()
I think this comes pretty close to the first illustration you give. If this is a thing you don't need to repeat many times, it is probably best to compute the density curves prior to plotting and use a seperate dataframe to plot these.
library(ggplot2)
x <- seq(1, 11, 1)
y <- x*0.5
x <- x - mean(x)
y <- y - mean(y)
df <- data.frame(x, y)
# For every row in `df`, compute a rotated normal density centered at `y` and shifted by `x`
curves <- lapply(seq_len(NROW(df)), function(i) {
mu <- df$y[i]
range <- mu + c(-3, 3)
seq <- seq(range[1], range[2], length.out = 100)
data.frame(
x = -1 * dnorm(seq, mean = mu) + df$x[i],
y = seq,
grp = i
)
})
# Combine above densities in one data.frame
curves <- do.call(rbind, curves)
ggplot(df, aes(x, y)) +
geom_point() +
geom_line() +
# The path draws the curve
geom_path(data = curves, aes(group = grp)) +
# The polygon does the shading. We can use `oob_squish()` to set a range.
geom_polygon(data = curves, aes(y = scales::oob_squish(y, c(0, Inf)),group = grp))
The second illustration is pretty close to your code. I simplified your density function by the standard normal density function and added some extra paramters to stat function:
library(ggplot2)
x <- seq(1, 11, 1)
y <- x*0.5
x <- x - mean(x)
y <- y - mean(y)
df <- data.frame(x, y)
ggplot(df, aes(x, y)) +
geom_point() +
geom_line() +
stat_function(fun = dnorm,
aes(x = after_stat(-y * 4 - 5), y = after_stat(x)),
xlim = range(df$y)) +
# We fill with a polygon, squishing the y-range
stat_function(fun = dnorm, geom = "polygon",
aes(x = after_stat(-y * 4 - 5),
y = after_stat(scales::oob_squish(x, c(-Inf, -1)))),
xlim = range(df$y))
I have made a contour plot in R with the following code:
library(mvtnorm)
# Define the parameters for the multivariate normal distribution
mu = c(0,0)
sigma = matrix(c(1,0.2,0.2,3),nrow = 2)
# Make a grid in the x-y plane centered in mu, +/- 3 standard deviations
xygrid = expand.grid(x = seq(from = mu[1]-3*sigma[1,1], to = mu[1]+3*sigma[1,1], length.out = 100),
y = seq(from = mu[2]-3*sigma[2,2], to = mu[2]+3*sigma[2,2], length.out = 100))
# Use the mvtnorm library to calculate the multivariate normal density for each point in the grid
distribution = as.matrix(dmvnorm(x = xygrid, mean = mu, sigma = sigma))
# Plot contours
df = as.data.frame(cbind(xygrid, distribution))
myPlot = ggplot() + geom_contour(data = df,geom="polygon",aes( x = x, y = y, z = distribution))
myPlot
I want to illustrate cumulative probability by shading/colouring certain parts of the plot, for instance everything in the region {x<0, y<0} (or any other self defined region).
Is there any way of achieving this in R with ggplot?
So you are able to get the coordinates used to draw the circles in the plot using ggplot_build. Subsequently you could try to use these coordinates in combination with geom_polygon to shade a particular region. My best try:
library(dplyr)
data <- ggplot_build(myPlot)$data[[1]]
xCoor <- 0
yCoor <- 0
df <- data %>% filter(group == '-1-001', x <= xCoor, y <= yCoor) %>% select(x,y)
# Insert the [0,0] coordinate in the right place
index <- which.max(abs(diff(rank(df$y))))
df <- rbind( df[1:index,], data.frame(x=xCoor, y=yCoor), df[(index+1):nrow(df),] )
myPlot + geom_polygon(data = df, aes(x=x, y=y), fill = 'red', alpha = 0.5)
As you can see it's not perfect because the [x,0] and [0,y] coordinates are not included in the data, but it's a start.
I want to add shaded areas to a chart to help people understand where bad, ok, and good points can fit.
Good = x*y>=.66
Ok = x*y>=.34
Bad = x*y<.34
Generating the right sequence of data to correctly apply the curved boundaries to the chart is proving tough.
What is the most elegant way to generate the curves?
Bonus Q: How would you do this to produce non-overlapping areas so that different colours could be used?
Updates
I've managed to do in a rather hacky way the drawing of the circle segments. I updated the MRE to use the revised segMaker function.
MRE
library(ggplot2)
pts<-seq(0,1,.02)
x<-sample(pts,50,replace=TRUE)
y<-sample(pts,50,replace=TRUE)
# What function will generate correct sequence of values as these are linear?
segMaker<-function(x,by){
# Original
# data.frame(x=c(seq(0,x,by),0)
# ,y=c(seq(x,0,-by),0)
# )
zero <- data.frame(x = 0, y = 0)
rs <- seq(0, pi, by)
xc <- x * cos(rs)
yc <- x * sin(rs)
gr <- data.frame(x = xc, y = yc)
gr <- rbind(gr[gr$x >= 0, ], zero)
return(gr)
}
firstSeg <-segMaker(.34,0.02)
secondSeg <-segMaker(.66,0.02)
thirdSeg <-segMaker(1,0.02)
ggplot(data.frame(x,y),aes(x,y, colour=x*y))+
geom_point() +
geom_polygon(data=firstSeg, fill="blue", alpha=.25)+
geom_polygon(data=secondSeg, fill="blue", alpha=.25)+
geom_polygon(data=thirdSeg, fill="blue", alpha=.25)
Current & desired shadings
You can create a data frame with the boundaries between each region and then use geom_ribbon to plot it. Here's an example using the conditions you supplied (which result in boundaries that are the reciprocal function, rather than circles, but the idea is the same, whichever function you use for the boundaries):
library(ggplot2)
# Fake data
pts<-seq(0,1,.02)
set.seed(19485)
x<-sample(pts,50,replace=TRUE)
y<-sample(pts,50,replace=TRUE)
df = data.frame(x,y)
# Region boundaries
x = seq(0.001,1.1,0.01)
bounds = data.frame(x, ymin=c(-100/x, 0.34/x, 0.66/x),
ymax=c(0.34/x, 0.66/x, 100/x),
g=rep(c("Bad","OK","Good"), each=length(x)))
bounds$g = factor(bounds$g, levels=c("Bad","OK","Good"))
ggplot() +
coord_cartesian(ylim=0:1, xlim=0:1) +
geom_ribbon(data=bounds, aes(x, ymin=ymin, ymax=ymax, fill=g), colour="grey50", lwd=0.2) +
geom_point(data=df, aes(x,y), colour="grey20") +
scale_fill_manual(values=hcl(c(15, 40, 240), 100, 80)) +
#scale_fill_manual(values=hcl(c(15, 40, 240), 100, 80, alpha=0.25)) + # If you want the fill colors to be transparent
labs(fill="") +
guides(fill=guide_legend(reverse=TRUE))
For circular boundaries, assuming we want boundaries at r=1/3 and r=2/3:
# Calculate y for circle, given r and x
cy = function(r, x) {sqrt(r^2 - x^2)}
n = 200
x = unlist(lapply(c(1/3,2/3,1), function(to) seq(0, to, len=n)))
bounds = data.frame(x, ymin = c(rep(0, n),
cy(1/3, seq(0, 1/3, len=n/2)), rep(0, n/2),
cy(2/3, seq(0, 2/3, len=2*n/3)), rep(0, n/3)),
ymax = c(cy(1/3, seq(0,1/3,len=n)),
cy(2/3, seq(0,2/3,len=n)),
rep(1,n)),
g=rep(c("Bad","OK","Good"), each=n))
bounds$g = factor(bounds$g, levels=c("Bad","OK","Good"))
If you can use a github package, ggforce adds geom_arc_bar():
# devtools::install_github('thomasp85/ggforce')
library(ggplot2)
library(ggforce)
pts<-seq(0,1,.02)
x<-sample(pts,50,replace=TRUE)
y<-sample(pts,50,replace=TRUE)
arcs <- data.frame(
x0 = 0,
y0 = 0,
start = 0,
end = pi / 2,
r0 = c(0, 1/3, 2/3),
r = c(1/3, 2/3, 1),
fill = c("bad", "ok", "good")
)
ggplot() +
geom_arc_bar(data = arcs,
aes(x0 = x0, y0 = y0, start = start, end = end, r0 = r0, r = r,
fill = fill), alpha = 0.6) +
geom_point(data = data.frame(x = x, y = y),
aes(x = x, y = y))
Based on #eipi10's great answer, to do the product component (basically ends up with the same thing) I did:
library(ggplot2)
library(data.table)
set.seed(19485)
pts <- seq(0, 1, .001)
x <- sample(pts, 50, replace = TRUE)
y <- sample(pts, 50, replace = TRUE)
df <- data.frame(x,y)
myRibbon<-CJ(pts,pts)
myRibbon[,prod:=V1 * V2]
myRibbon[,cat:=ifelse(prod<=1/3,"bad",
ifelse(prod<=2/3,"ok","good"))]
myRibbon<-myRibbon[
,.(ymin=min(V2),ymax=max(V2))
,.(cat,V1)]
ggplot() +
geom_ribbon(data=myRibbon
, aes(x=V1, ymin=ymin,ymax=ymax
, group=cat, fill=cat),
colour="grey90", lwd=0.2, alpha=.5)+
geom_point(data=df, aes(x,y), colour="grey20") +
theme_minimal()
This doesn't do anything fancy but works out for each value of x, what the smallest and largest values were that could give rise to a specific banding.
If I had just wanted arcs, the use of ggforce (#GregF) would be really great- it tucks away all the complexity.
Can ggplot2 be used to produce a so-called topoplot (often used in neuroscience)?
Sample data:
label x y signal
1 R3 0.64924459 0.91228430 2.0261520
2 R4 0.78789621 0.78234410 1.7880972
3 R5 0.93169511 0.72980685 0.9170998
4 R6 0.48406513 0.82383895 3.1933129
Full sample data.
Rows represent individual electrodes. Columns x and y represent the projection into 2D space and the column signal is essentially the z-axis representing voltage measured at a given electrode.
stat_contour doesn't work, apparently due to unequal grid.
geom_density_2d only provides a density estimation of x and y.
geom_raster is one not fitted for this task or I must be using it incorrectly since it quickly runs out of memory.
Smoothing (like in the image on the right) and head contours (nose, ears) aren't necessary.
I want to avoid Matlab and transforming the data so that it fits this or that toolbox… Many thanks!
Update (26 January 2016)
The closest I've been able to get to my objective is via
library(colorRamps)
ggplot(channels, aes(x, y, z = signal)) + stat_summary_2d() + scale_fill_gradientn(colours=matlab.like(20))
which produces an image like this:
Update 2 (27 January 2016)
I've tried #alexforrence's approach with full data and this is the result:
It's a great start but there is a couple of issues:
The last call (ggplot()) takes about 40 seconds on an Intel i7 4790K while Matlab toolboxes manage to generate these almost instantly; my ‘emergency solution’ above takes about a second.
As you can see, the upper and lower border of the central part appear to be ‘sliced’ – I'm not sure what causes this but it could be the third issue.
I'm getting these warnings:
1: Removed 170235 rows containing non-finite values (stat_contour).
2: Removed 170235 rows containing non-finite values (stat_contour).
Update 3 (27 January 2016)
Comparison between two plots produced with different interp(xo, yo) and stat_contour(binwidth) values:
Ragged edges if one chooses low interp(xo, yo), in this case xo/yo = seq(0, 1, length = 100):
Here's a potential start:
First, we'll attach some packages. I'm using akima to do linear interpolation, though it looks like EEGLAB uses some sort of spherical interpolation here? (the data was a little sparse to try it).
library(ggplot2)
library(akima)
library(reshape2)
Next, reading in the data:
dat <- read.table(text = " label x y signal
1 R3 0.64924459 0.91228430 2.0261520
2 R4 0.78789621 0.78234410 1.7880972
3 R5 0.93169511 0.72980685 0.9170998
4 R6 0.48406513 0.82383895 3.1933129")
We'll interpolate the data, and stick that in a data frame.
datmat <- interp(dat$x, dat$y, dat$signal,
xo = seq(0, 1, length = 1000),
yo = seq(0, 1, length = 1000))
datmat2 <- melt(datmat$z)
names(datmat2) <- c('x', 'y', 'value')
datmat2[,1:2] <- datmat2[,1:2]/1000 # scale it back
I'm going to borrow from some previous answers. The circleFun below is from Draw a circle with ggplot2.
circleFun <- function(center = c(0,0),diameter = 1, npoints = 100){
r = diameter / 2
tt <- seq(0,2*pi,length.out = npoints)
xx <- center[1] + r * cos(tt)
yy <- center[2] + r * sin(tt)
return(data.frame(x = xx, y = yy))
}
circledat <- circleFun(c(.5, .5), 1, npoints = 100) # center on [.5, .5]
# ignore anything outside the circle
datmat2$incircle <- (datmat2$x - .5)^2 + (datmat2$y - .5)^2 < .5^2 # mark
datmat2 <- datmat2[datmat2$incircle,]
And I really liked the look of the contour plot in R plot filled.contour() output in ggpplot2, so we'll borrow that one.
ggplot(datmat2, aes(x, y, z = value)) +
geom_tile(aes(fill = value)) +
stat_contour(aes(fill = ..level..), geom = 'polygon', binwidth = 0.01) +
geom_contour(colour = 'white', alpha = 0.5) +
scale_fill_distiller(palette = "Spectral", na.value = NA) +
geom_path(data = circledat, aes(x, y, z = NULL)) +
# draw the nose (haven't drawn ears yet)
geom_line(data = data.frame(x = c(0.45, 0.5, .55), y = c(1, 1.05, 1)),
aes(x, y, z = NULL)) +
# add points for the electrodes
geom_point(data = dat, aes(x, y, z = NULL, fill = NULL),
shape = 21, colour = 'black', fill = 'white', size = 2) +
theme_bw()
With improvements mentioned in the comments (setting extrap = TRUE and linear = FALSE in the interp call to fill in gaps and do a spline smoothing, respectively, and removing NAs before plotting), we get:
mgcv can do spherical splines. This replaces akima (the chunk containing interp() isn't necessary).
library(mgcv)
spl1 <- gam(signal ~ s(x, y, bs = 'sos'), data = dat)
# fine grid, coarser is faster
datmat2 <- data.frame(expand.grid(x = seq(0, 1, 0.001), y = seq(0, 1, 0.001)))
resp <- predict(spl1, datmat2, type = "response")
datmat2$value <- resp
I have a set of (2-dimensional) data points that I run through a classifier that uses higher order polynomial transformations. I want to visualize the results as a 2 dimensional scatterplot of the points with the classifier superimbosed on top, preferably using ggplot2 as all other visualizations are made by this. Pretty much like this one that was used in the ClatechX online course on machine learning (the background color is optional).
I can display the points with colors and symbols and all, that's easy but I can't figure out how to draw anything like the classifiers (the intersection of the classifiing hyperplane with the plane representing my threshold). The only thing I found was stat_function and that only takes a function with a single argument.
Edit:
The example that was asked for in the comments:
sample data:
"","x","y","x","x","y","value"
"1",4.17338115745224,0.303530843229964,1.26674990184152,17.4171102853774,0.0921309727918932,-1
"2",4.85514814266935,3.452660451876,16.7631779801937,23.5724634872656,11.9208641959486,1
"3",3.51938610081561,3.41200957307592,12.0081790673332,12.3860785266141,11.6418093267617,1
"4",3.18545089452527,0.933340128976852,2.97310914874565,10.1470974014319,0.87112379635852,-16
"5",2.77556006214581,2.49701633118093,6.93061880335166,7.70373365857888,6.23509055818427,-1
"6",2.45974169578403,4.56341833807528,11.2248303614692,6.05032920997851,20.8247869282818,1
"7",2.73947941488586,3.35344674880616,9.18669833727041,7.50474746458339,11.2456050970786,-1
"8",2.01721803518012,3.55453519499861,7.17027250203368,4.06916860145595,12.6347204524838,-1
"9",3.52376445778646,1.47073399974033,5.1825201951431,12.4169159539591,2.1630584979922,-1
"10",3.77387718763202,0.509284208528697,1.92197605658768,14.2421490273294,0.259370405056702,-1
"11",4.15821685106494,1.03675272315741,4.31104264382058,17.2907673804804,1.0748562089743,-1
"12",2.57985028671101,3.88512040604837,10.0230289934507,6.65562750184287,15.0941605694935,1
"13",3.99800728890114,2.39457673509605,9.5735352407471,15.9840622821066,5.73399774026327,1
"14",2.10979392635636,4.58358959294856,9.67042948411309,4.45123041169019,21.0092935565863,1
"15",2.26988795562647,2.96687697409652,6.73447830932721,5.15239133109813,8.80235897942413,-1
"16",1.11802248633467,0.114183261757717,0.127659454208164,1.24997427994995,0.0130378172656312,-1
"17",0.310411276295781,2.09426849964075,0.650084557879535,0.0963551604515758,4.38596054858751,-1
"18",1.93197490065359,1.72926536411978,3.340897280049,3.73252701675543,2.99035869954433,-1
"19",3.45879891654477,1.13636834081262,3.93046958599847,11.9632899450912,1.29133300600123,-1
"20",0.310697768582031,0.730971727753058,0.227111284709427,0.0965331034018534,0.534319666774291,-1
"21",3.88408110360615,0.915658151498064,3.55649052359657,15.0860860193904,0.838429850404852,-1
"22",0.287852146429941,2.16121324687265,0.622109872005114,0.0828588582043242,4.67084269845782,-1
"23",2.80277011333965,1.22467750683427,3.4324895146344,7.85552030822994,1.4998349957458,-1
"24",0.579150241101161,0.57801398797892,0.334756940497835,0.335415001767533,0.334100170299295-,1
"25",2.37193428212777,1.58276639413089,3.7542178708388,5.62607223873297,2.50514945839009,-1
"26",0.372461311053485,2.51207412336953,0.935650421453748,0.138727428231681,6.31051640130279,-1
"27",3.56567220995203,1.03982002707198,3.70765737388213,12.7140183088242,1.08122568869998,-1
"28",0.634770628530532,2.26303249713965,1.43650656059435,0.402933750845047,5.12131608311011,-1
"29",2.43812176748179,1.91849716124125,4.67752968967431,5.94443775306852,3.68063135769073,-1
"30",1.08741064323112,3.01656032912433,3.28023980783858,1.18246190701233,9.0996362192467,-1
"31",0.98,2.74,2.6852,0.9604,7.5076,1
"32",3.16,1.78,5.6248,9.9856,3.1684,1
"33",4.26,4.28,18.2328,18.1476,18.3184,-1
The code to generate a classifier:
perceptron_train <- function(data, maxIter=10000) {
set.seed(839)
X <- as.matrix(data[1:5])
Y <- data["value"]
d <- dim(X)
X <- cbind(rep(1, d[1]), X)
W <- rep(0, d[2] + 1)
count <- 0
while (count < maxIter){
H <- sign(X %*% W)
indexs <- which(H != Y)
if (length(indexs) == 0){
break
} else {
i <- sample(indexs, 1)
W <- W + 0.1 * (X[i,] * Y[i,])
}
count <- count + 1
point <- as.data.frame(data[i,])
plot_it(data, point, W, paste("plot", sprintf("%05d", count), ".png", sep=""))
}
W
}
The code to generate the plot:
plot_it <- function(data, point, weights, name = "plot.png") {
line <- weights_to_line(weights)
point <- point
png(name)
p = ggplot() + geom_point(data = data, aes(x, y, color = value, size = 2)) + theme(legend.position = "none")
p = p + geom_abline(intercept = line[2], slope = line[1])
print(p)
dev.off()
}
This was solved using material from the question and answers from Issues plotting a fitted SVM model's decision boundary using ggplot2's stat_contour(). I skipped the call to geom_point for the grid-entires and some of the aesthetical definitions like scale_fill_manual and scale_colour_manual. Removing the dots for the grid entries solved the problem with the vanishing contour-line in my case.
train_and_plot_svm <- function(train, kernel = "sigmoid", type ="C", cost, gamma) {
fit <- svm(as.factor(value) ~ x + y, data = train, kernel = kernel, type = type, cost = cost)
grid <- expand.grid (x = seq(from = -0.1, to = 15, length = 100), y = seq(from = -0.1, to = 15, length = 100))
decisionValues <- as.vector(attributes(predict(fit, grid, decision.values = TRUE))$decision)
p <- predict(fit, grid)
grid$value <- p
grid$z <- decisionValues
p <- ggplot() + stat_contour(data = grid, aes(x = x, y = y, z = z), breaks = c(0))
p <- p + geom_point(data = train, aes(x, y, colour = as.factor(value)), alpha = 0.7)
p <- p + xlim(0,15) + ylim(0,15) + theme(legend.position="none")
}
Note that this function doesn't return the result of the svm training but the ggplot2 object.
This is, what I got: