Use coefficients to draw a curve in ggplot - r

I have the coefficients for a curve and would like to draw the curve in ggplot2. My formula is just a polynomial:
y = a * x^2 + b * x + c
I have these coefficients:
a <- 0.000000308
b <- -0.0168
c <- 437
I don't know if these points will fall near the line, but say we are plotting this df:
df <- data.frame(group = c("a", "b", "c"),
x_variable = c(20000, 32000, 48000),
y_variable = c( 175, 200, 250))
Here's what I tried:
ggplot(df, aes(x = x_variable, y = y_variable)) +
geom_point() +
# this next line doesn't work, is it close?
# geom_smooth(method = 'lm', formula = y ~ 0.000000308 * x^2 + -0.0168 * x + 437)

One option is to use stat_function(), which evaluates a function on a grid of x values spanning the plotting area:
ggplot(df, aes(x = x_variable, y = y_variable)) +
geom_point() +
stat_function(fun = function(x){0.000000308 * x^2 + -0.0168 * x + 437})
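If you would rather keep the coefficients in variables, stat_function() also accepts an args list that is passed on to fun, which keeps the formula readable (a small variation on the answer above, reusing a, b, c and df from the question):
a <- 0.000000308
b <- -0.0168
c <- 437
ggplot(df, aes(x = x_variable, y = y_variable)) +
  geom_point() +
  stat_function(fun = function(x, a, b, c) a * x^2 + b * x + c,
                args = list(a = a, b = b, c = c))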

Not sure if I'm totally misunderstanding your question, but I would just create some sample points:
library(data.table)
DT = data.table(x=1:100000)
DT[,y := a * x^2 + b * x + c]
ggplot(DT,aes(x=x,y=y))+geom_smooth()
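Since DT already holds the exact curve values (computed with the a, b, c from the question), geom_line() draws the curve directly instead of fitting another smoother; a minor alternative to the geom_smooth() call above:
ggplot(DT, aes(x = x, y = y)) + geom_line()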

Generate a function grid first and then plot the function along with your points:
library(ggplot2)
a <- 0.000000308
b <- -0.0168
c <- 437
grid <- data.frame( x = seq(15000, 60000, 1))
grid$y <- a*grid$x^2 + b*grid$x +c
points <- data.frame(x_variable = c(20000, 32000, 48000),
y_variable = c( 175, 200, 250))
ggplot() +
geom_line(data = grid, aes(x,y), color = "red") +
geom_point(data = points, aes(x_variable,y_variable))
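If you prefer not to hard-code the 15000-60000 range, a small variation (assuming the curve should span roughly the observed x range of the points) is to build the grid from the data:
rng <- range(points$x_variable)
grid <- data.frame(x = seq(rng[1], rng[2], length.out = 500))
grid$y <- a * grid$x^2 + b * grid$x + c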

Related

How to deal with vertical asymptotes in ggplot2

Consider three simple mathematical functions:
f1 <- function(x) 1/x
f2 <- function(x) tan(x)
f3 <- function(x) 1 / sin(x)
Each has one or more vertical asymptotes, i.e. f(x) tends to infinity as x approaches certain values. I plot these three functions with ggplot2::stat_function():
# x is between -5 to 5
ggplot(data.frame(x = c(-5, 5)), aes(x)) +
stat_function(fun = f1, n = 1000) +
coord_cartesian(ylim = c(-50, 50))
# x is between -2*pi to 2*pi
ggplot(data.frame(x = c(-2*pi, 2*pi)), aes(x)) +
stat_function(fun = f2, n = 1000) +
coord_cartesian(ylim = c(-50, 50))
# x is between -2*pi to 2*pi
ggplot(data.frame(x = c(-2*pi, 2*pi)), aes(x)) +
stat_function(fun = f3, n = 1000) +
coord_cartesian(ylim = c(-50, 50))
The asymptotes are located, respectively, at:
x1 <- 0
x2 <- c(-3/2*pi, -1/2*pi, 1/2*pi, 3/2*pi)
x3 <- c(-pi, 0, pi)
These vertical lines do not actually exist, but ggplot draws them because it connects points on either side of each asymptote. I attempted to use geom_vline() to cover them, namely:
+ geom_vline(xintercept = x1, color = "white")
+ geom_vline(xintercept = x2, color = "white")
+ geom_vline(xintercept = x3, color = "white")
The output looks rough, and faint black marks are still visible. Are there any more robust methods?
Here is a solution related to #Mojoesque's comments. It is not perfect, but it is relatively simple, with two minor shortcomings: you need to know the asymptotes (x1, x2, x3), and you may need to reduce the range of y.
eps <- 0.01
f1 <- function(x) if(min(abs(x - x1)) < eps) NA else 1/x
f2 <- function(x) if(min(abs(x - x2)) < eps) NA else tan(x)
f3 <- function(x) if(min(abs(x - x3)) < eps) NA else 1 / sin(x)
ggplot(data.frame(x = c(-5, 5)), aes(x)) +
stat_function(fun = Vectorize(f1), n = 1000) +
coord_cartesian(ylim = c(-30, 30))
ggplot(data.frame(x = c(-2*pi, 2*pi)), aes(x)) +
stat_function(fun = Vectorize(f2), n = 1000) +
coord_cartesian(ylim = c(-30, 30))
ggplot(data.frame(x = c(-2*pi, 2*pi)), aes(x)) +
stat_function(fun = Vectorize(f3), n = 1000) +
coord_cartesian(ylim = c(-30, 30))
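A related sketch that avoids hard-coding the asymptote locations (assuming it is acceptable to mask by magnitude instead): set any value whose magnitude exceeds the plotted y range to NA, so the line simply breaks there rather than shooting across the asymptote.
f2_masked <- function(x) {
  y <- tan(x)
  y[abs(y) > 30] <- NA   # values outside the plotted range break the line
  y
}
ggplot(data.frame(x = c(-2*pi, 2*pi)), aes(x)) +
  stat_function(fun = f2_masked, n = 1000) +
  coord_cartesian(ylim = c(-30, 30))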
This solution is also based on #Mojoesque's comment: partition the x-axis into subintervals between the asymptotes, then add one stat_function() layer per subinterval with purrr::reduce(). The constraint is that the asymptotes must be known.
Take tan(x) for example:
f <- function(x) tan(x)
asymp <- c(-3/2*pi, -1/2*pi, 1/2*pi, 3/2*pi)
left <- -2 * pi # left border
right <- 2 * pi # right border
d <- 0.001
interval <- data.frame(x1 = c(left, asymp + d),
x2 = c(asymp - d, right))
interval # divide the entire x-axis into 5 sections
# x1 x2
# 1 -6.283185 -4.713389
# 2 -4.711389 -1.571796
# 3 -1.569796 1.569796
# 4 1.571796 4.711389
# 5 4.713389 6.283185
library(tidyverse)
pmap(interval, function(x1, x2) {
stat_function(fun = f, xlim = c(x1, x2), n = 1000)
}) %>% reduce(.f = `+`,
.init = ggplot(data.frame(x = c(left, right)), aes(x)) +
coord_cartesian(ylim = c(-50, 50)))

Adding orthogonal regression line in ggplot

I have plotted a scatter graph in R, comparing expected to observed values, using the following script:
library(ggplot2)
library(dplyr)
r<-read_csv("Uni/MSci/Project/DATA/new data sheets/comparisons/for comarison
graphs/R Regression/GAcAs.csv")
x<-r[1]
y<-r[2]
ggplot()+geom_point(aes(x=x,y=y))+
scale_size_area() +
xlab("Expected") +
ylab("Observed") +
ggtitle("G - As x Ac")+ xlim(0, 40)+ylim(0, 40)
My plot is as follows:
I then want to add an orthogonal regression line (as there could be errors in both the expected and observed values). I have calculated the beta value using the following:
v <- prcomp(cbind(x,y))$rotation
beta <- v[2,1]/v[1,1]
Is there a way to add an orthogonal regression line to my plot?
Borrowed from this blog post & this answer. Basically, you will need the Deming() function from MethComp, or prcomp() from stats, together with a custom function perp.segment.coord(). Below is an example taken from the above-mentioned blog post.
library(ggplot2)
library(MethComp)
data(airquality)
airquality <- na.exclude(airquality)
# Orthogonal, total least squares or Deming regression
deming <- Deming(y=airquality$Wind, x=airquality$Temp)[1:2]
deming
#> Intercept Slope
#> 24.8083259 -0.1906826
# Check with prcomp {stats}
r <- prcomp( ~ airquality$Temp + airquality$Wind )
slope <- r$rotation[2,1] / r$rotation[1,1]
slope
#> [1] -0.1906826
intercept <- r$center[2] - slope*r$center[1]
intercept
#> airquality$Wind
#> 24.80833
# https://stackoverflow.com/a/30399576/786542
perp.segment.coord <- function(x0, y0, ortho){
# finds endpoint for a perpendicular segment from the point (x0,y0) to the line
# defined by ortho as y = a + b*x
a <- ortho[1] # intercept
b <- ortho[2] # slope
x1 <- (x0 + b*y0 - a*b)/(1 + b^2)
y1 <- a + b*x1
list(x0=x0, y0=y0, x1=x1, y1=y1)
}
perp.segment <- perp.segment.coord(airquality$Temp, airquality$Wind, deming)
perp.segment <- as.data.frame(perp.segment)
# plot
plot.y <- ggplot(data = airquality, aes(x = Temp, y = Wind)) +
geom_point() +
geom_abline(intercept = deming[1],
slope = deming[2]) +
geom_segment(data = perp.segment,
aes(x = x0, y = y0, xend = x1, yend = y1),
colour = "blue") +
theme_bw()
Created on 2018-03-19 by the reprex package (v0.2.0).
The MethComp package seems to be no longer maintained (was removed from CRAN).
Russel88/COEF allows using stat_/geom_smooth() with method = "tls" to add an orthogonal regression line.
Based on this and wikipedia:Deming_regression, I created the following functions, which allow noise ratios other than 1:
deming.fit <- function(x, y, noise_ratio = sd(y)/sd(x)) {
if(missing(noise_ratio) || is.null(noise_ratio)) noise_ratio <- eval(formals(sys.function(0))$noise_ratio) # this is just a complicated way to write `sd(y)/sd(x)`
delta <- noise_ratio^2
x_name <- deparse(substitute(x))
s_yy <- var(y)
s_xx <- var(x)
s_xy <- cov(x, y)
beta1 <- (s_yy - delta*s_xx + sqrt((s_yy - delta*s_xx)^2 + 4*delta*s_xy^2)) / (2*s_xy)
beta0 <- mean(y) - beta1 * mean(x)
res <- c(beta0 = beta0, beta1 = beta1)
names(res) <- c("(Intercept)", x_name)
class(res) <- "Deming"
res
}
deming <- function(formula, data, R = 100, noise_ratio = NULL, ...){
ret <- boot::boot(
data = model.frame(formula, data),
statistic = function(data, ind) {
data <- data[ind, ]
args <- rlang::parse_exprs(colnames(data))
names(args) <- c("y", "x")
rlang::eval_tidy(rlang::expr(deming.fit(!!!args, noise_ratio = noise_ratio)), data, env = rlang::current_env())
},
R=R
)
class(ret) <- c("Deming", class(ret))
ret
}
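# Note (an assumption about ggplot2 internals): geom_smooth()/stat_smooth()
# call an unexported generic predictdf() to turn the fitted model into the
# line/ribbon data frame, so defining predictdf.Deming() below is what lets
# geom_smooth(method = deming) work; this may change between ggplot2 versions.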
predictdf.Deming <- function(model, xseq, se, level) {
pred <- as.vector(tcrossprod(model$t0, cbind(1, xseq)))
if(se) {
preds <- tcrossprod(model$t, cbind(1, xseq))
data.frame(
x = xseq,
y = pred,
ymin = apply(preds, 2, function(x) quantile(x, probs = (1-level)/2)),
ymax = apply(preds, 2, function(x) quantile(x, probs = 1-((1-level)/2)))
)
} else {
return(data.frame(x = xseq, y = pred))
}
}
# unrelated helper function to create a nicer plot:
fix_plot_limits <- function(p) {
  p + coord_cartesian(xlim = ggplot_build(p)$layout$panel_params[[1]]$x.range,
                      ylim = ggplot_build(p)$layout$panel_params[[1]]$y.range)
}
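As a quick sanity check (assuming the airquality example from the first answer above as a reference): with noise_ratio = 1, Deming regression reduces to orthogonal/total least squares, so deming.fit() should reproduce the prcomp slope of roughly -0.19:
aq <- na.exclude(airquality)
deming.fit(aq$Temp, aq$Wind, noise_ratio = 1)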
Demonstration:
library(ggplot2)
#devtools::install_github("Russel88/COEF")
library(COEF)
fix_plot_limits(
ggplot(data.frame(x = (1:5) + rnorm(100), y = (1:5) + rnorm(100)*2), mapping = aes(x=x, y=y)) +
geom_point()
) +
geom_smooth(method=deming, aes(color="deming"), method.args = list(noise_ratio=2)) +
geom_smooth(method=lm, aes(color="lm")) +
geom_smooth(method = COEF::tls, aes(color="tls"))
Created on 2019-12-04 by the reprex package (v0.3.0)
I'm not sure I completely understand the question, but if you want line segments to show errors along both the x and y axes, you can do this using geom_segment().
Something like this:
library(ggplot2)
df <- data.frame(x = rnorm(10), y = rnorm(10), w = rnorm(10, sd=.1))
ggplot(df, aes(x = x, y = y, xend = x, yend = y)) +
geom_point() +
geom_segment(aes(x = x - w, xend = x + w)) +
geom_segment(aes(y = y - w, yend = y + w))

Add segments of circles to ggplot based on product of x & y

I want to add shaded areas to a chart to help people understand where bad, ok, and good points can fit.
Good = x*y>=.66
Ok = x*y>=.34
Bad = x*y<.34
Generating the right sequence of data to correctly apply the curved boundaries to the chart is proving tough.
What is the most elegant way to generate the curves?
Bonus Q: How would you do this to produce non-overlapping areas so that different colours could be used?
Updates
I've managed, in a rather hacky way, to draw the circle segments. I updated the MRE to use the revised segMaker function.
MRE
library(ggplot2)
pts<-seq(0,1,.02)
x<-sample(pts,50,replace=TRUE)
y<-sample(pts,50,replace=TRUE)
# What function will generate correct sequence of values as these are linear?
segMaker<-function(x,by){
# Original
# data.frame(x=c(seq(0,x,by),0)
# ,y=c(seq(x,0,-by),0)
# )
zero <- data.frame(x = 0, y = 0)
rs <- seq(0, pi, by)
xc <- x * cos(rs)
yc <- x * sin(rs)
gr <- data.frame(x = xc, y = yc)
gr <- rbind(gr[gr$x >= 0, ], zero)
return(gr)
}
firstSeg <-segMaker(.34,0.02)
secondSeg <-segMaker(.66,0.02)
thirdSeg <-segMaker(1,0.02)
ggplot(data.frame(x,y),aes(x,y, colour=x*y))+
geom_point() +
geom_polygon(data=firstSeg, fill="blue", alpha=.25)+
geom_polygon(data=secondSeg, fill="blue", alpha=.25)+
geom_polygon(data=thirdSeg, fill="blue", alpha=.25)
Current & desired shadings
You can create a data frame with the boundaries between each region and then use geom_ribbon to plot it. Here's an example using the conditions you supplied (which result in boundaries that are the reciprocal function, rather than circles, but the idea is the same, whichever function you use for the boundaries):
library(ggplot2)
# Fake data
pts<-seq(0,1,.02)
set.seed(19485)
x<-sample(pts,50,replace=TRUE)
y<-sample(pts,50,replace=TRUE)
df = data.frame(x,y)
# Region boundaries
x = seq(0.001,1.1,0.01)
bounds = data.frame(x, ymin=c(-100/x, 0.34/x, 0.66/x),
ymax=c(0.34/x, 0.66/x, 100/x),
g=rep(c("Bad","OK","Good"), each=length(x)))
bounds$g = factor(bounds$g, levels=c("Bad","OK","Good"))
ggplot() +
coord_cartesian(ylim=0:1, xlim=0:1) +
geom_ribbon(data=bounds, aes(x, ymin=ymin, ymax=ymax, fill=g), colour="grey50", lwd=0.2) +
geom_point(data=df, aes(x,y), colour="grey20") +
scale_fill_manual(values=hcl(c(15, 40, 240), 100, 80)) +
#scale_fill_manual(values=hcl(c(15, 40, 240), 100, 80, alpha=0.25)) + # If you want the fill colors to be transparent
labs(fill="") +
guides(fill=guide_legend(reverse=TRUE))
For circular boundaries, assuming we want boundaries at r=1/3 and r=2/3:
# Calculate y for circle, given r and x
cy = function(r, x) {sqrt(r^2 - x^2)}
n = 200
x = unlist(lapply(c(1/3,2/3,1), function(to) seq(0, to, len=n)))
bounds = data.frame(x, ymin = c(rep(0, n),
cy(1/3, seq(0, 1/3, len=n/2)), rep(0, n/2),
cy(2/3, seq(0, 2/3, len=2*n/3)), rep(0, n/3)),
ymax = c(cy(1/3, seq(0,1/3,len=n)),
cy(2/3, seq(0,2/3,len=n)),
rep(1,n)),
g=rep(c("Bad","OK","Good"), each=n))
bounds$g = factor(bounds$g, levels=c("Bad","OK","Good"))
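The same ggplot() call used above for the reciprocal boundaries can then be reused unchanged with this circular bounds data frame:
ggplot() +
  coord_cartesian(ylim=0:1, xlim=0:1) +
  geom_ribbon(data=bounds, aes(x, ymin=ymin, ymax=ymax, fill=g), colour="grey50", lwd=0.2) +
  geom_point(data=df, aes(x,y), colour="grey20") +
  scale_fill_manual(values=hcl(c(15, 40, 240), 100, 80)) +
  labs(fill="") +
  guides(fill=guide_legend(reverse=TRUE))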
If you can use a github package, ggforce adds geom_arc_bar():
# devtools::install_github('thomasp85/ggforce')
library(ggplot2)
library(ggforce)
pts<-seq(0,1,.02)
x<-sample(pts,50,replace=TRUE)
y<-sample(pts,50,replace=TRUE)
arcs <- data.frame(
x0 = 0,
y0 = 0,
start = 0,
end = pi / 2,
r0 = c(0, 1/3, 2/3),
r = c(1/3, 2/3, 1),
fill = c("bad", "ok", "good")
)
ggplot() +
geom_arc_bar(data = arcs,
aes(x0 = x0, y0 = y0, start = start, end = end, r0 = r0, r = r,
fill = fill), alpha = 0.6) +
geom_point(data = data.frame(x = x, y = y),
aes(x = x, y = y))
Based on #eipi10's great answer, to do the product component (basically ends up with the same thing) I did:
library(ggplot2)
library(data.table)
set.seed(19485)
pts <- seq(0, 1, .001)
x <- sample(pts, 50, replace = TRUE)
y <- sample(pts, 50, replace = TRUE)
df <- data.frame(x,y)
myRibbon<-CJ(pts,pts)
myRibbon[,prod:=V1 * V2]
myRibbon[,cat:=ifelse(prod<=1/3,"bad",
ifelse(prod<=2/3,"ok","good"))]
myRibbon<-myRibbon[
,.(ymin=min(V2),ymax=max(V2))
,.(cat,V1)]
ggplot() +
geom_ribbon(data=myRibbon
, aes(x=V1, ymin=ymin,ymax=ymax
, group=cat, fill=cat),
colour="grey90", lwd=0.2, alpha=.5)+
geom_point(data=df, aes(x,y), colour="grey20") +
theme_minimal()
This doesn't do anything fancy, but for each value of x it works out the smallest and largest y values that could give rise to each specific banding.
If I had just wanted arcs, the use of ggforce (#GregF) would be really great; it tucks away all the complexity.

How to visualise the difference between probability distribution functions? [closed]

I am trying to visualise the difference between two distribution functions, such as the difference between the following two curves:
When the difference is big, you could just plot the two curves on top of each other and fill the area between them as shown above, but when the difference becomes very small this is cumbersome. Another way is to plot the difference itself, as follows:
However, this seems very hard to read for anyone seeing such a graph for the first time, so I was wondering: is there any other way to visualise the difference between two distribution functions?
I thought it might be an option to simply combine your two propositions, scaling up the differences to make them visible.
What follows is an attempt to do this with ggplot2. It was actually quite a bit more involved than I initially thought, and I'm definitely not a hundred percent satisfied with the result, but maybe it helps nevertheless. Comments and improvements are very welcome.
library(ggplot2)
library(dplyr)
## function that replicates default ggplot2 colors
## taken from [1]
gg_color_hue <- function(n) {
hues = seq(15, 375, length=n+1)
hcl(h=hues, l=65, c=100)[1:n]
}
## Set up sample data
set.seed(1)
n <- 2000
x1 <- rlnorm(n, 0, 1)
x2 <- rlnorm(n, 0, 1.1)
df <- bind_rows(data.frame(sample=1, x=x1), data.frame(sample=2, x=x2)) %>%
mutate(sample = as.factor(sample))
## Calculate density estimates
g1 <- ggplot(df, aes(x=x, group=sample, colour=sample)) +
geom_density(data = df) + xlim(0, 10)
gg1 <- ggplot_build(g1)
## Use these estimates (available at the same x coordinates!) for
## calculating the differences.
## Inspired by [2]
x <- gg1$data[[1]]$x[gg1$data[[1]]$group == 1]
y1 <- gg1$data[[1]]$y[gg1$data[[1]]$group == 1]
y2 <- gg1$data[[1]]$y[gg1$data[[1]]$group == 2]
df2 <- data.frame(x = x, ymin = pmin(y1, y2), ymax = pmax(y1, y2),
side=(y1<y2), ydiff = y2-y1)
g2 <- ggplot(df2) +
geom_ribbon(aes(x = x, ymin = ymin, ymax = ymax, fill = side, alpha = 0.5)) +
geom_line(aes(x = x, y = 5 * abs(ydiff), colour = side)) +
geom_area(aes(x = x, y = 5 * abs(ydiff), fill = side, alpha = 0.4))
g3 <- g2 +
geom_density(data = df, size = 1, aes(x = x, group = sample, colour = sample)) +
xlim(0, 10) +
guides(alpha = FALSE, colour = FALSE) +
ylab("Curves: density\n Shaded area: 5 * difference of densities") +
scale_fill_manual(name = "samples", labels = 1:2, values = gg_color_hue(2)) +
scale_colour_manual(limits = list(1, 2, FALSE, TRUE), values = rep(gg_color_hue(2), 2))
print(g3)
Sources: SO answer 1, SO answer 2
As suggested by #Gregor in the comments, here's a version that draws two separate plots below each other while sharing the same x-axis scaling. The legends, at least, obviously still need to be tweaked.
library(ggplot2)
library(dplyr)
library(grid)
## function that replicates default ggplot2 colors
## taken from [1]
gg_color_hue <- function(n) {
hues = seq(15, 375, length=n+1)
hcl(h=hues, l=65, c=100)[1:n]
}
## Set up sample data
set.seed(1)
n <- 2000
x1 <- rlnorm(n, 0, 1)
x2 <- rlnorm(n, 0, 1.1)
df <- bind_rows(data.frame(sample=1, x=x1), data.frame(sample=2, x=x2)) %>%
mutate(sample = as.factor(sample))
## Calculate density estimates
g1 <- ggplot(df, aes(x=x, group=sample, colour=sample)) +
geom_density(data = df) + xlim(0, 10)
gg1 <- ggplot_build(g1)
## Use these estimates (available at the same x coordinates!) for
## calculating the differences.
## Inspired by [2]
x <- gg1$data[[1]]$x[gg1$data[[1]]$group == 1]
y1 <- gg1$data[[1]]$y[gg1$data[[1]]$group == 1]
y2 <- gg1$data[[1]]$y[gg1$data[[1]]$group == 2]
df2 <- data.frame(x = x, ymin = pmin(y1, y2), ymax = pmax(y1, y2),
side=(y1<y2), ydiff = y2-y1)
g2 <- ggplot(df2) +
geom_ribbon(aes(x = x, ymin = ymin, ymax = ymax, fill = side, alpha = 0.5)) +
geom_density(data = df, size = 1, aes(x = x, group = sample, colour = sample)) +
xlim(0, 10) +
guides(alpha = FALSE, fill = FALSE)
g3 <- ggplot(df2) +
geom_line(aes(x = x, y = abs(ydiff), colour = side)) +
geom_area(aes(x = x, y = abs(ydiff), fill = side, alpha = 0.4)) +
guides(alpha = FALSE, fill = FALSE)
## See [3]
grid.draw(rbind(ggplotGrob(g2), ggplotGrob(g3), size="last"))
... or with abs(ydiff) replaced by ydiff in the construction of the second plot:
Source: SO answer 3
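As an aside, if the patchwork package is available, stacking the two plots with their panel areas aligned can be written more compactly (a hedged alternative to the gtable/grid approach above):
library(patchwork)
g2 / g3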

Computing weighted average using lowess method in R

I am trying to use the lowess method from R to compute the weighted average of a data set which is not uniformly distributed along the x axis. For example, the first 5 data points look like this, where the first column is x and the second is y.
375.0 2040.0
472.0 5538.0
510.0 4488.0
573.0 2668.0
586.0 7664.0
I used the following command in R:
x<-read.table(add,header=FALSE,sep="\t")
y<-lowess(x[,1],x[,2],f=0.01)
write.table(y, file = results , sep = "\t", col.names =FALSE, row.names =FALSE)
The output looks like this:
The green line shows the average computed by the smooth function in matlab (tri-cubic kernel), and the red line is the average line computed by lowess method in R. The blue dots are the data points.
I can't figure out why the method in R does not work. Do you have any idea?
Here is a link to part of the data.
Thanks a lot for your help.
The smooth function in MATLAB is a moving-average filter:
yy = smooth(y)
yy(1) = y(1)
yy(2) = (y(1) + y(2) + y(3))/3
yy(3) = (y(1) + y(2) + y(3) + y(4) + y(5))/5 ## convolution of size 5
yy(4) = (y(2) + y(3) + y(4) + y(5) + y(6))/5
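For reference, here is a hedged R sketch of that same centred moving average using stats::filter() (note that MATLAB's smooth() additionally shrinks the window near the ends, whereas filter() leaves those positions NA):
y <- c(2040, 5538, 4488, 2668, 7664)   # the first five y values from the question
as.numeric(stats::filter(y, rep(1/5, 5), sides = 2))
# [1]     NA     NA 4479.6     NA     NA   -- only the middle point has a full 5-point window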
I think it is better to do a simple smooth here.
Here are some attempts using loess (via geom_smooth), lowess with f = 0.2 (1/5), and smooth.spline.
I am using ggplot2 to plot (so I can use geom_jitter with some alpha).
library(ggplot2)
dat <- subset(data, V2 < 5000)
#dat <- data
xy <- lowess(dat$V1,dat$V2,f = 0.8)
xy <- as.data.frame(do.call(cbind,xy))
p1<- ggplot(data = dat, aes(x= V1, y = V2))+
geom_jitter(position = position_jitter(width = .2), alpha= 0.1)+
geom_smooth()
xy <- lowess(dat$V1,dat$V2,f = 0.2)
xy <- as.data.frame(do.call(cbind,xy))
xy.smooth <- smooth.spline(dat$V1,dat$V2)
xy.smooth <- data.frame(x= xy.smooth$x,y = xy.smooth$y)
p2 <- ggplot(data = dat, aes(x= V1, y = V2))+
geom_jitter(position = position_jitter(width = .2), alpha= 0.1)+
geom_line(data = xy, aes(x=x, y = y, group = 1 ), color = 'red')+
geom_line(data = xy.smooth, aes(x=x, y = y, group = 1 ), color = 'blue')
library(gridExtra)
grid.arrange(p1,p2)
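A hedged note on the original call: f = 0.01 tells lowess to use only about 1% of the points for each local fit, which makes the result very local and noisy; the wider spans used above (f = 0.2 or f = 0.8) behave much more like MATLAB's smooth(). Applied to the question's own variables:
y <- lowess(x[,1], x[,2], f = 0.2)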
