Add OLS squares to ggplot2 graph - r

I'm trying to plot a graph with points, OLS line and I want to add a square to every point to explain how OLS work. I am doing so using for loop which is not the best in the first place, but it is what it is. My code is as follows:
library(ggplot2)
set.seed(0)
x = sort(rnorm(20, 5, 5))
y = sort(rnorm(20, 5, 10))
lmod = lm(y ~ x)
subor <- data.frame(prva = x, druha = y)
a = ggplot(subor, aes_string(x = x, y = y)) +
geom_point() +
geom_abline(slope = lmod$coefficients[2], intercept = lmod$coefficients[1]) +
coord_fixed()
for(i in 1:20) {
a = a + geom_rect(aes(xmin = ifelse(y[i] > lmod$fitted.values[i], x[i] - (y[i] - lmod$fitted.values[i]), x[i]),
ymin = min(y[i], lmod$fitted.values[i]), ymax = max(y[i], lmod$fitted.values[i]),
xmax = ifelse(y[i] > lmod$fitted.values[i], x[i], x[i] + (lmod$fitted.values[i] - y[i]))))
}
a
But instead of getting all the squares I only get the last one.

You don't really need a loop here. Use the vectorized pmin and pmax functions to work out the minimum and maximum edges of your squares from the fitted values and residuals:
a + geom_rect(alpha = 0.2, col = 'gray60',
aes(ymin = pmin(druha, fitted(lmod)),
ymax = pmax(druha, fitted(lmod)),
xmin = pmin(prva, prva - resid(lmod)),
xmax = pmax(prva, prva - resid(lmod))))

Related

Plot logistic regression using parameters in ggplot2

I would like to plot a logistic regression directly from the parameter estimates using ggplot2, but not quite sure how to do it.
For example, if I had 1500 draws of alpha and beta parameter estimates, I could plot each of the lines thus:
alpha_post = rnorm(n=1500,mean=1.1,sd = .15)
beta_post = rnorm(n=1500,mean=1.8,sd = .19)
X_lim = seq(from = -3,to = 2,by=.01)
for (i in 1:length(alpha_post)){
print(i)
y = exp(alpha_post[i] + beta_post[i]*X_lim)/(1+ exp(alpha_post[i] + beta_post[i]*X_lim) )
if (i==1){plot(X_lim,y,type="l")}
else {lines(X_lim,y,add=T)}
}
How would I do this in ggplot2? I know how to use geom_smooth(), but this is a little different.
As always in ggplot, you want to make a data.frame with all data that needs to be plotted:
d <- data.frame(
alpha_post = alpha_post,
beta_post = beta_post,
X_lim = rep(seq(from = -3,to = 2,by=.01), each = length(alpha_post))
)
d$y <- with(d, exp(alpha_post + beta_post * X_lim) / (1 + exp(alpha_post + beta_post * X_lim)))
Then the plotting itself becomes quite easy:
ggplot(d, aes(X_lim, y, group = alpha_post)) + geom_line()
If you want to be more fancy, add a summary line with e.g. the mean:
ggplot(d, aes(X_lim, y)) +
geom_line(aes(group = alpha_post), alpha = 0.3) +
geom_line(size = 1, color = 'firebrick', stat = 'summary', fun.y = 'mean')

Plot for 100 random surveys

I am attempting to reproduce the following graph in R:
This is meant to represent 100 random polls confidence intervals, with a mean of 42.9. I made some progress with qplot, however there are some things that I still couldn't get.
library(ggplot2)
polls <- replicate(100, rnorm(100, mean = 30, sd=3))
# Calculate 90% confidence intervals for each row.
tint <- matrix(NA, nrow = dim(polls)[2], ncol = 2)
for (i in 1:dim(polls)[2]) {
temp <- t.test(polls[, i], conf.level = 0.9)
tint[i, ] <- temp$conf.int
}
colnames(tint) <- c("lcl", "ucl")
# The width of each confidence interval:
width <- apply(tint, 1, diff)
tint <- cbind(tint, width)
tint <- data.frame(tint)
And with the command:
qplot(tint$width, y=30, geom="pointrange",ymin = tint$lcl, ymax = tint$ucl) + coord_flip() +
theme_bw()
I get:
Questions:
How do I change y to represent each poll?
How to draw the line at the intended mean (30, in this case)?
Not a very elegant solution but it works:
ggplot(data = tint, aes(x = ucl - width/2, y = seq(1:100))) +
geom_point() +
geom_errorbarh(aes(xmin = lcl, xmax = ucl)) +
geom_vline(xintercept = 30, color = "red")

Use ggplot to plot partial effects obtained with effects library

I would like to use ggplot to replicate the plots partial effects (with partial residuals), as obtained with the "effect" package. To do this I need to retrieve some information.
This is the plot I want to replicate with ggplot.
library(effects)
mod <- lm(log(prestige) ~ income:type + education, data=Prestige)
eff = effect("education", mod, partial.residuals=T)
plot(eff)
From the eff object I am able to retrieve the partial residuals, as eff$residuals, but they are not sufficient to replicate the plot. I think that what I need is the both the residuals, AND the marginal predicted effect. However I was not able to retrieve them from my eff object.
Otherwise I only have the residuals scores that cannot be plotted against the line of the marginal effect.
Any hint on how to retrieve this information?
You have almost all the information available. This would take some more time to generalize, but here's some code that results in a figure approximately like from the effects package. Notice that the smoother is off, but I didn't bother to dig up why.
The code should be self explanatory. I only copied function closest from the package.
mod <- lm(log(prestige) ~ income:type + education, data=Prestige)
eff = effect("education", mod, partial.residuals=T)
library(ggplot2)
library(gridExtra)
closest <- function(x, x0) apply(outer(x, x0, FUN=function(x, x0) abs(x - x0)), 1, which.min)
x.fit <- unlist(eff$x.all)
trans <- I
x <- data.frame(lower = eff$lower, upper = eff$upper, fit = eff$fit, education = eff$x$education)
xy <- data.frame(x = x.fit, y = x$fit[closest(trans(x.fit), x$education)] + eff$residuals)
g <- ggplot(x, aes(x = education, y = fit)) +
theme_bw() +
geom_line(size = 1) +
geom_point(data = xy, aes(x = x, y = y), shape = 1, col = "blue", size = 2) +
geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.5) +
geom_smooth(data = xy, aes(x = trans(x), y = y),
method = "loess", span = 2/3, linetype = "dashed", se = FALSE)
grid.arrange(plot(eff), g, ncol = 2)

Add segments of circles to ggplot based on product of x & y

I want to add shaded areas to a chart to help people understand where bad, ok, and good points can fit.
Good = x*y>=.66
Ok = x*y>=.34
Bad = x*y<.34
Generating the right sequence of data to correctly apply the curved boundaries to the chart is proving tough.
What is the most elegant way to generate the curves?
Bonus Q: How would you do this to produce non-overlapping areas so that different colours could be used?
Updates
I've managed to do in a rather hacky way the drawing of the circle segments. I updated the MRE to use the revised segMaker function.
MRE
library(ggplot2)
pts<-seq(0,1,.02)
x<-sample(pts,50,replace=TRUE)
y<-sample(pts,50,replace=TRUE)
# What function will generate correct sequence of values as these are linear?
segMaker<-function(x,by){
# Original
# data.frame(x=c(seq(0,x,by),0)
# ,y=c(seq(x,0,-by),0)
# )
zero <- data.frame(x = 0, y = 0)
rs <- seq(0, pi, by)
xc <- x * cos(rs)
yc <- x * sin(rs)
gr <- data.frame(x = xc, y = yc)
gr <- rbind(gr[gr$x >= 0, ], zero)
return(gr)
}
firstSeg <-segMaker(.34,0.02)
secondSeg <-segMaker(.66,0.02)
thirdSeg <-segMaker(1,0.02)
ggplot(data.frame(x,y),aes(x,y, colour=x*y))+
geom_point() +
geom_polygon(data=firstSeg, fill="blue", alpha=.25)+
geom_polygon(data=secondSeg, fill="blue", alpha=.25)+
geom_polygon(data=thirdSeg, fill="blue", alpha=.25)
Current & desired shadings
You can create a data frame with the boundaries between each region and then use geom_ribbon to plot it. Here's an example using the conditions you supplied (which result in boundaries that are the reciprocal function, rather than circles, but the idea is the same, whichever function you use for the boundaries):
library(ggplot2)
# Fake data
pts<-seq(0,1,.02)
set.seed(19485)
x<-sample(pts,50,replace=TRUE)
y<-sample(pts,50,replace=TRUE)
df = data.frame(x,y)
# Region boundaries
x = seq(0.001,1.1,0.01)
bounds = data.frame(x, ymin=c(-100/x, 0.34/x, 0.66/x),
ymax=c(0.34/x, 0.66/x, 100/x),
g=rep(c("Bad","OK","Good"), each=length(x)))
bounds$g = factor(bounds$g, levels=c("Bad","OK","Good"))
ggplot() +
coord_cartesian(ylim=0:1, xlim=0:1) +
geom_ribbon(data=bounds, aes(x, ymin=ymin, ymax=ymax, fill=g), colour="grey50", lwd=0.2) +
geom_point(data=df, aes(x,y), colour="grey20") +
scale_fill_manual(values=hcl(c(15, 40, 240), 100, 80)) +
#scale_fill_manual(values=hcl(c(15, 40, 240), 100, 80, alpha=0.25)) + # If you want the fill colors to be transparent
labs(fill="") +
guides(fill=guide_legend(reverse=TRUE))
For circular boundaries, assuming we want boundaries at r=1/3 and r=2/3:
# Calculate y for circle, given r and x
cy = function(r, x) {sqrt(r^2 - x^2)}
n = 200
x = unlist(lapply(c(1/3,2/3,1), function(to) seq(0, to, len=n)))
bounds = data.frame(x, ymin = c(rep(0, n),
cy(1/3, seq(0, 1/3, len=n/2)), rep(0, n/2),
cy(2/3, seq(0, 2/3, len=2*n/3)), rep(0, n/3)),
ymax = c(cy(1/3, seq(0,1/3,len=n)),
cy(2/3, seq(0,2/3,len=n)),
rep(1,n)),
g=rep(c("Bad","OK","Good"), each=n))
bounds$g = factor(bounds$g, levels=c("Bad","OK","Good"))
If you can use a github package, ggforce adds geom_arc_bar():
# devtools::install_github('thomasp85/ggforce')
library(ggplot2)
library(ggforce)
pts<-seq(0,1,.02)
x<-sample(pts,50,replace=TRUE)
y<-sample(pts,50,replace=TRUE)
arcs <- data.frame(
x0 = 0,
y0 = 0,
start = 0,
end = pi / 2,
r0 = c(0, 1/3, 2/3),
r = c(1/3, 2/3, 1),
fill = c("bad", "ok", "good")
)
ggplot() +
geom_arc_bar(data = arcs,
aes(x0 = x0, y0 = y0, start = start, end = end, r0 = r0, r = r,
fill = fill), alpha = 0.6) +
geom_point(data = data.frame(x = x, y = y),
aes(x = x, y = y))
Based on #eipi10's great answer, to do the product component (basically ends up with the same thing) I did:
library(ggplot2)
library(data.table)
set.seed(19485)
pts <- seq(0, 1, .001)
x <- sample(pts, 50, replace = TRUE)
y <- sample(pts, 50, replace = TRUE)
df <- data.frame(x,y)
myRibbon<-CJ(pts,pts)
myRibbon[,prod:=V1 * V2]
myRibbon[,cat:=ifelse(prod<=1/3,"bad",
ifelse(prod<=2/3,"ok","good"))]
myRibbon<-myRibbon[
,.(ymin=min(V2),ymax=max(V2))
,.(cat,V1)]
ggplot() +
geom_ribbon(data=myRibbon
, aes(x=V1, ymin=ymin,ymax=ymax
, group=cat, fill=cat),
colour="grey90", lwd=0.2, alpha=.5)+
geom_point(data=df, aes(x,y), colour="grey20") +
theme_minimal()
This doesn't do anything fancy but works out for each value of x, what the smallest and largest values were that could give rise to a specific banding.
If I had just wanted arcs, the use of ggforce (#GregF) would be really great- it tucks away all the complexity.

How to visualise the difference between probability distribution functions? [closed]

Closed. This question is opinion-based. It is not currently accepting answers.
Want to improve this question? Update the question so it can be answered with facts and citations by editing this post.
Closed 7 years ago.
Improve this question
I try to visualise the difference between two histograms of distribution functions such as the difference in following two curves :
When the difference is big, you could just plot two curves on top of each other and fill the difference as denoted above, though when the difference becomes very small, this is cumbersome. Another way to plot this, is plotting the difference itself as follows :
However, this seems very hard to read for everyone seeing such a graph for the first time, so i was wondering: is there any other way you can visualise the difference between two distribution functions ?
I thought that maybe it might be an option to simply combine your two propositions, while scaling up the differences to make them visible.
What follows is an attempt to do this with ggplot2. Actually it was quite a bit more involved to do this than I initially thought, and I'm definitely not a hundred percent satisfied with the result; but maybe it helps nevertheless. Comments and improvements very welcome.
library(ggplot2)
library(dplyr)
## function that replicates default ggplot2 colors
## taken from [1]
gg_color_hue <- function(n) {
hues = seq(15, 375, length=n+1)
hcl(h=hues, l=65, c=100)[1:n]
}
## Set up sample data
set.seed(1)
n <- 2000
x1 <- rlnorm(n, 0, 1)
x2 <- rlnorm(n, 0, 1.1)
df <- bind_rows(data.frame(sample=1, x=x1), data.frame(sample=2, x=x2)) %>%
mutate(sample = as.factor(sample))
## Calculate density estimates
g1 <- ggplot(df, aes(x=x, group=sample, colour=sample)) +
geom_density(data = df) + xlim(0, 10)
gg1 <- ggplot_build(g1)
## Use these estimates (available at the same x coordinates!) for
## calculating the differences.
## Inspired by [2]
x <- gg1$data[[1]]$x[gg1$data[[1]]$group == 1]
y1 <- gg1$data[[1]]$y[gg1$data[[1]]$group == 1]
y2 <- gg1$data[[1]]$y[gg1$data[[1]]$group == 2]
df2 <- data.frame(x = x, ymin = pmin(y1, y2), ymax = pmax(y1, y2),
side=(y1<y2), ydiff = y2-y1)
g2 <- ggplot(df2) +
geom_ribbon(aes(x = x, ymin = ymin, ymax = ymax, fill = side, alpha = 0.5)) +
geom_line(aes(x = x, y = 5 * abs(ydiff), colour = side)) +
geom_area(aes(x = x, y = 5 * abs(ydiff), fill = side, alpha = 0.4))
g3 <- g2 +
geom_density(data = df, size = 1, aes(x = x, group = sample, colour = sample)) +
xlim(0, 10) +
guides(alpha = FALSE, colour = FALSE) +
ylab("Curves: density\n Shaded area: 5 * difference of densities") +
scale_fill_manual(name = "samples", labels = 1:2, values = gg_color_hue(2)) +
scale_colour_manual(limits = list(1, 2, FALSE, TRUE), values = rep(gg_color_hue(2), 2))
print(g3)
Sources: SO answer 1, SO answer 2
As suggested by #Gregor in the comments, here's a version that does two separate plots below eachother but sharing the same x axis scaling. At least the legends should obviously be tweaked.
library(ggplot2)
library(dplyr)
library(grid)
## function that replicates default ggplot2 colors
## taken from [1]
gg_color_hue <- function(n) {
hues = seq(15, 375, length=n+1)
hcl(h=hues, l=65, c=100)[1:n]
}
## Set up sample data
set.seed(1)
n <- 2000
x1 <- rlnorm(n, 0, 1)
x2 <- rlnorm(n, 0, 1.1)
df <- bind_rows(data.frame(sample=1, x=x1), data.frame(sample=2, x=x2)) %>%
mutate(sample = as.factor(sample))
## Calculate density estimates
g1 <- ggplot(df, aes(x=x, group=sample, colour=sample)) +
geom_density(data = df) + xlim(0, 10)
gg1 <- ggplot_build(g1)
## Use these estimates (available at the same x coordinates!) for
## calculating the differences.
## Inspired by [2]
x <- gg1$data[[1]]$x[gg1$data[[1]]$group == 1]
y1 <- gg1$data[[1]]$y[gg1$data[[1]]$group == 1]
y2 <- gg1$data[[1]]$y[gg1$data[[1]]$group == 2]
df2 <- data.frame(x = x, ymin = pmin(y1, y2), ymax = pmax(y1, y2),
side=(y1<y2), ydiff = y2-y1)
g2 <- ggplot(df2) +
geom_ribbon(aes(x = x, ymin = ymin, ymax = ymax, fill = side, alpha = 0.5)) +
geom_density(data = df, size = 1, aes(x = x, group = sample, colour = sample)) +
xlim(0, 10) +
guides(alpha = FALSE, fill = FALSE)
g3 <- ggplot(df2) +
geom_line(aes(x = x, y = abs(ydiff), colour = side)) +
geom_area(aes(x = x, y = abs(ydiff), fill = side, alpha = 0.4)) +
guides(alpha = FALSE, fill = FALSE)
## See [3]
grid.draw(rbind(ggplotGrob(g2), ggplotGrob(g3), size="last"))
... or with abs(ydiff) replaced by ydiff in the construction of the second plot:
Source: SO answer 3

Resources