ggplot2 - Shade area above line - r

I have some data that is constrained below a 1:1 line. I would like to demonstrate this on a plot by lightly shading the area ABOVE the line, to draw the viewer's attention to the area beneath it.
I'm using qplot to generate the graphs. Briefly, I have:
qplot(x,y)+geom_abline(slope=1)
but for the life of me I can't figure out how to easily shade the area above the line without plotting a separate object. Is there an easy fix for this?
EDIT
Ok, Joran, here is an example data set:
df=data.frame(x=runif(6,-2,2),y=runif(6,-2,2),
var1=rep(c("A","B"),3),var2=rep(c("C","D"),3))
df_poly=data.frame(x=c(-Inf, Inf, -Inf),y=c(-Inf, Inf, Inf))
and here is the code that I'm using to plot it (I took your advice and have been looking up ggplot()):
ggplot(df,aes(x,y,color=var1))+
facet_wrap(~var2)+
geom_abline(slope=1,intercept=0,lwd=0.5)+
geom_point(size=3)+
scale_color_manual(values=c("red","blue"))+
geom_polygon(data=df_poly,aes(x,y),fill="blue",alpha=0.2)
The error kicked back is "object 'var1' not found". Something tells me that I'm implementing the argument incorrectly...

Building on @Andrie's answer, here is a more (but not completely) general solution that handles shading above or below a given line in most cases.
I did not use the method that @Andrie referenced here, since I ran into issues with ggplot's tendency to automatically extend the plot extents when you add points near the edges. Instead, this builds the polygon points manually, using Inf and -Inf as needed. A few notes:
The points have to be in the 'correct' order in the data frame, since ggplot draws the polygon in the order the points appear. So it's not enough to get the vertices of the polygon; they must also be ordered, either clockwise or counterclockwise. (See the small illustration after these notes.)
This solution assumes that the line you are plotting does not itself cause ggplot to extend the plot range. You'll see in my example that I pick a line to draw by randomly choosing two points in the data and drawing the line through them. If you try to draw a line too far away from the rest of your points, ggplot will automatically alter the plot ranges, and it becomes hard to predict what they will be.
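As a minimal illustration of the ordering point (my addition, not part of the original answer): the same four corner points produce a filled square when listed in order around the perimeter, but a self-intersecting "bow tie" when two of them are swapped.
library(ggplot2)
# Same four corners, two different row orders
sq  <- data.frame(x = c(0, 1, 1, 0), y = c(0, 0, 1, 1))  # ordered around the square
bow <- data.frame(x = c(0, 1, 0, 1), y = c(0, 0, 1, 1))  # two vertices swapped
ggplot(sq,  aes(x, y)) + geom_polygon(alpha = 0.2)  # clean filled square
ggplot(bow, aes(x, y)) + geom_polygon(alpha = 0.2)  # self-intersecting shape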
First, here's the function that builds the polygon data frame:
buildPoly <- function(xr, yr, slope = 1, intercept = 0, above = TRUE){
  #Assumes ggplot default of expand = c(0.05,0)
  xrTru <- xr + 0.05*diff(xr)*c(-1,1)
  yrTru <- yr + 0.05*diff(yr)*c(-1,1)
  #Find where the line crosses the plot edges
  yCross <- (yrTru - intercept) / slope
  xCross <- (slope * xrTru) + intercept
  #Build polygon by cases
  if (above & (slope >= 0)){
    rs <- data.frame(x=-Inf,y=Inf)
    if (xCross[1] < yrTru[1]){
      rs <- rbind(rs,c(-Inf,-Inf),c(yCross[1],-Inf))
    } else {
      rs <- rbind(rs,c(-Inf,xCross[1]))
    }
    if (xCross[2] < yrTru[2]){
      rs <- rbind(rs,c(Inf,xCross[2]),c(Inf,Inf))
    } else {
      rs <- rbind(rs,c(yCross[2],Inf))
    }
  }
  if (!above & (slope >= 0)){
    rs <- data.frame(x= Inf,y= -Inf)
    if (xCross[1] > yrTru[1]){
      rs <- rbind(rs,c(-Inf,-Inf),c(-Inf,xCross[1]))
    } else {
      rs <- rbind(rs,c(yCross[1],-Inf))
    }
    if (xCross[2] > yrTru[2]){
      rs <- rbind(rs,c(yCross[2],Inf),c(Inf,Inf))
    } else {
      rs <- rbind(rs,c(Inf,xCross[2]))
    }
  }
  if (above & (slope < 0)){
    rs <- data.frame(x=Inf,y=Inf)
    if (xCross[1] < yrTru[2]){
      rs <- rbind(rs,c(-Inf,Inf),c(-Inf,xCross[1]))
    } else {
      rs <- rbind(rs,c(yCross[2],Inf))
    }
    if (xCross[2] < yrTru[1]){
      rs <- rbind(rs,c(yCross[1],-Inf),c(Inf,-Inf))
    } else {
      rs <- rbind(rs,c(Inf,xCross[2]))
    }
  }
  if (!above & (slope < 0)){
    rs <- data.frame(x= -Inf,y= -Inf)
    if (xCross[1] > yrTru[2]){
      rs <- rbind(rs,c(-Inf,Inf),c(yCross[2],Inf))
    } else {
      rs <- rbind(rs,c(-Inf,xCross[1]))
    }
    if (xCross[2] > yrTru[1]){
      rs <- rbind(rs,c(Inf,xCross[2]),c(Inf,-Inf))
    } else {
      rs <- rbind(rs,c(yCross[1],-Inf))
    }
  }
  return(rs)
}
It expects the x and y ranges of your data (as in range()), the slope and intercept of the line you are going to plot, and whether you want to shade above or below the line. Here's the code I used to generate the following four examples:
#Generate some data
dat <- data.frame(x=runif(10),y=runif(10))
#Select two of the points to define the line
pts <- dat[sample(1:nrow(dat),size=2,replace=FALSE),]
#Slope and intercept of line through those points
sl <- diff(pts$y) / diff(pts$x)
int <- pts$y[1] - (sl*pts$x[1])
#Build the polygon
datPoly <- buildPoly(range(dat$x),range(dat$y),
                     slope=sl,intercept=int,above=FALSE)
#Make the plot
p <- ggplot(dat,aes(x=x,y=y)) +
    geom_point() +
    geom_abline(slope=sl,intercept=int) +
    geom_polygon(data=datPoly,aes(x=x,y=y),alpha=0.2,fill="blue")
print(p)
And here are some examples of the results. If you find any bugs, of course, let me know so that I can update this answer...
EDIT
Updated to illustrate solution using OP's example data:
set.seed(1)
dat <- data.frame(x=runif(6,-2,2),y=runif(6,-2,2),
                  var1=rep(c("A","B"),3),var2=rep(c("C","D"),3))
#Create polygon data frame
df_poly <- buildPoly(range(dat$x),range(dat$y))

ggplot(data=dat,aes(x,y)) +
    facet_wrap(~var2) +
    geom_abline(slope=1,intercept=0,lwd=0.5) +
    geom_point(aes(colour=var1),size=3) +
    scale_color_manual(values=c("red","blue")) +
    geom_polygon(data=df_poly,aes(x,y),fill="blue",alpha=0.2)
and this produces the following output:

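An aside on the error in the question: "object 'var1' not found" occurs because geom_polygon() inherits the top-level aes(x, y, color = var1) while df_poly has no var1 column. Mapping colour inside geom_point() (as above) avoids it; another option (just a sketch, not part of the original answer) is to keep the original mapping and opt the polygon layer out of the inherited aesthetics:
# Hypothetical variant of the call from the question; inherit.aes = FALSE stops
# geom_polygon() from looking for var1 in df_poly
ggplot(df, aes(x, y, color = var1)) +
    facet_wrap(~var2) +
    geom_abline(slope = 1, intercept = 0, lwd = 0.5) +
    geom_point(size = 3) +
    scale_color_manual(values = c("red", "blue")) +
    geom_polygon(data = df_poly, aes(x, y), fill = "blue", alpha = 0.2,
                 inherit.aes = FALSE)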
As far as I know there is no way to do this other than creating a polygon with an alpha-blended fill. For example:
df <- data.frame(x=1, y=1)
df_poly <- data.frame(
x=c(-Inf, Inf, -Inf),
y=c(-Inf, Inf, Inf)
)
ggplot(df, aes(x, y)) +
geom_blank() +
geom_abline(slope=1, intercept=0) +
geom_polygon(data=df_poly, aes(x, y), fill="blue", alpha=0.2) +

One easy way to do this is to use geom_ribbon with the ymax value set to Inf, and the ymin value calculated by stat_function:
library(ggplot2)
myfun <- function(x) x
myfun2 <- function(x) x^2
ggplot() +
  geom_function(fun = myfun) +
  geom_ribbon(stat = 'function', fun = myfun,
              mapping = aes(ymin = after_stat(y), ymax = Inf),
              fill = 'lightblue', alpha = 0.5)

ggplot() +
  geom_function(fun = myfun2) +
  geom_ribbon(stat = 'function', fun = myfun2,
              mapping = aes(ymin = after_stat(y), ymax = Inf),
              fill = 'lightblue', alpha = 0.5)
Created on 2022-05-26 by the reprex package (v2.0.1)

Based on a minimally modified version of @joran's answer:
library(ggplot2)
library(tidyr)
library(dplyr)
buildPoly <- function(slope, intercept, above, xr, yr){
  # By Joran Elias, @joran https://stackoverflow.com/a/6809174/1870254
  #Find where the line crosses the plot edges
  yCross <- (yr - intercept) / slope
  xCross <- (slope * xr) + intercept
  #Build polygon by cases
  if (above & (slope >= 0)){
    rs <- data.frame(x=-Inf,y=Inf)
    if (xCross[1] < yr[1]){
      rs <- rbind(rs,c(-Inf,-Inf),c(yCross[1],-Inf))
    } else {
      rs <- rbind(rs,c(-Inf,xCross[1]))
    }
    if (xCross[2] < yr[2]){
      rs <- rbind(rs,c(Inf,xCross[2]),c(Inf,Inf))
    } else {
      rs <- rbind(rs,c(yCross[2],Inf))
    }
  }
  if (!above & (slope >= 0)){
    rs <- data.frame(x= Inf,y= -Inf)
    if (xCross[1] > yr[1]){
      rs <- rbind(rs,c(-Inf,-Inf),c(-Inf,xCross[1]))
    } else {
      rs <- rbind(rs,c(yCross[1],-Inf))
    }
    if (xCross[2] > yr[2]){
      rs <- rbind(rs,c(yCross[2],Inf),c(Inf,Inf))
    } else {
      rs <- rbind(rs,c(Inf,xCross[2]))
    }
  }
  if (above & (slope < 0)){
    rs <- data.frame(x=Inf,y=Inf)
    if (xCross[1] < yr[2]){
      rs <- rbind(rs,c(-Inf,Inf),c(-Inf,xCross[1]))
    } else {
      rs <- rbind(rs,c(yCross[2],Inf))
    }
    if (xCross[2] < yr[1]){
      rs <- rbind(rs,c(yCross[1],-Inf),c(Inf,-Inf))
    } else {
      rs <- rbind(rs,c(Inf,xCross[2]))
    }
  }
  if (!above & (slope < 0)){
    rs <- data.frame(x= -Inf,y= -Inf)
    if (xCross[1] > yr[2]){
      rs <- rbind(rs,c(-Inf,Inf),c(yCross[2],Inf))
    } else {
      rs <- rbind(rs,c(-Inf,xCross[1]))
    }
    if (xCross[2] > yr[1]){
      rs <- rbind(rs,c(Inf,xCross[2]),c(Inf,-Inf))
    } else {
      rs <- rbind(rs,c(yCross[1],-Inf))
    }
  }
  return(rs)
}
You can also extend ggplot like this:
GeomSection <- ggproto("GeomSection", GeomPolygon,
  default_aes = list(fill="blue", size=0, alpha=0.2, colour=NA, linetype="dashed"),
  required_aes = c("slope", "intercept", "above"),
  draw_panel = function(data, panel_params, coord) {
    ranges <- coord$backtransform_range(panel_params)
    data$group <- seq_len(nrow(data))
    data <- data %>%
      group_by_all() %>%
      do(buildPoly(.$slope, .$intercept, .$above, ranges$x, ranges$y)) %>%
      unnest()
    GeomPolygon$draw_panel(data, panel_params, coord)
  }
)

geom_section <- function (mapping = NULL, data = NULL, ..., slope, intercept, above,
                          na.rm = FALSE, show.legend = NA) {
  if (missing(mapping) && missing(slope) && missing(intercept) && missing(above)) {
    slope <- 1
    intercept <- 0
    above <- TRUE
  }
  if (!missing(slope) || !missing(intercept) || !missing(above)) {
    if (missing(slope))
      slope <- 1
    if (missing(intercept))
      intercept <- 0
    if (missing(above))
      above <- TRUE
    data <- data.frame(intercept = intercept, slope = slope, above = above)
    mapping <- aes(intercept = intercept, slope = slope, above = above)
    show.legend <- FALSE
  }
  layer(data = data, mapping = mapping, stat = StatIdentity,
        geom = GeomSection, position = PositionIdentity, show.legend = show.legend,
        inherit.aes = FALSE, params = list(na.rm = na.rm, ...))
}
To be able to use it as easily as a geom_abline:
set.seed(1)
dat <- data.frame(x=runif(6,-2,2),y=runif(6,-2,2),
                  var1=rep(c("A","B"),3),var2=rep(c("C","D"),3))

ggplot(data=dat,aes(x,y)) +
    facet_wrap(~var2) +
    geom_abline(slope=1,intercept=0,lwd=0.5) +
    geom_point(aes(colour=var1),size=3) +
    scale_color_manual(values=c("red","blue")) +
    geom_section(slope=1, intercept=0, above=TRUE)
This variant has the additional advantage that it also works with multiple slopes and non-default limit expansions.
ggplot(data=dat,aes(x,y)) +
    facet_wrap(~var2) +
    geom_abline(slope=1,intercept=0,lwd=0.5) +
    geom_point(aes(colour=var1),size=3) +
    scale_color_manual(values=c("red","blue")) +
    geom_section(data=data.frame(slope=c(-1,1), above=c(FALSE,TRUE),
                                 selected=c("selected","selected 2")),
                 aes(slope=slope, above=above, intercept=0, fill=selected), size=1) +
    expand_limits(x=3)

Related

deSolve: differential equations with two consecutive dynamics

I am simulating a ring tube with flowing water and a temperature gradient using deSolve::ode(). The ring is modelled as a vector where each element has a temperature value and position.
I am modelling the heat diffusion formula:
1) dT/dt = alpha * d2T/dx2
But I'm struggling with also moving the water along the ring. In theory, it's just about substituting the temperature at the element i in the tube vector with that at the element s places earlier. Since s may not be an integer, it can be separated into the integer part (n) and the fractional part (p): s=n+p. Consequently, the change in temperature due to the water moving becomes:
2) dT_i = T_(i-n) + p * (T_(i-n-1) - T_(i-n)) - T_i
The problem is that s equals the water velocity v multiplied by the dt evaluated at each iteration of the ode solver.
My idea is to treat the phenomena as additive, that is, to first compute (1), then (2), and finally add them together. I'm worried, though, about the effect of time: the ode solver with implicit methods chooses the time step automatically and scales the unit change delta down linearly.
My question is whether just returning (1) + (2) in the derivative function is correct or if I should break the two processes apart and compute the derivatives separately. In the second case, what would be the suggested approach?
EDIT:
Following the suggestion by @tpetzoldt, I tried to implement the water flow using ReacTran::advection.1D(). My model has multiple sources of temperature variation: the spontaneous symmetric heat diffusion; the water flow; a heat source that is turned on if the temperature near a sensor (placed before the heat source) drops below a lower threshold and turned off if it rises above an upper threshold; and a constant heat dispersion determined by a cyclical external temperature.
Below the "Moving water" comment my previous version of the code is still visible (commented out), now replaced by ReacTran::advection.1D().
The plot_type argument allows visualizing either a time sequence of the temperature in the water tube ("pipe"), or the temperature sequence at the sensors (before and after the heater).
library(deSolve)
library(dplyr)
library(ggplot2)
library(tidyr)
library(ReacTran)
test <- function(simTime = 5000, vel = 1, L = 500, thresh = c(16, 25), heatT = 25,
                 heatDisp = .0025, baseTemp = 15, alpha = .025,
                 adv_method = 'up', plot_type = c('pipe', 'sensors')) {
  plot_type <- match.arg(plot_type)
  thresh <- c(16, 25)
  sensorP <- round(L/2)
  vec <- c(rep(baseTemp, L), 0)

  eventfun <- function(t, y, pars) {
    heat <- y[L + 1] > 0
    if (y[sensorP] < thresh[1] & heat == FALSE) { # if heat is FALSE -> T was above the threshold
      #browser()
      y[L + 1] <- heatT
    }
    if (y[sensorP] > thresh[2] & heat == TRUE) { # if heat is TRUE -> T was below the threshold
      #browser()
      y[L + 1] <- 0
    }
    return(y)
  }

  rootfun <- function (t, y, pars) {
    heat <- y[L + 1] > 0
    trigger_root <- 1
    if (y[sensorP] < thresh[1] & heat == FALSE & t > 1) { # if heat is FALSE -> T was above the threshold
      #browser()
      trigger_root <- 0
    }
    if (y[sensorP] > thresh[2] & heat == TRUE & t > 1) { # if heat is TRUE -> T was below the threshold
      #browser()
      trigger_root <- 0
    }
    return(trigger_root)
  }

  roll <- function(x, n) {
    x[((1:length(x)) - (n + 1)) %% length(x) + 1]
  }

  fun <- function(t, y, pars) {
    v <- y[1:L]

    # Heat diffusion: dT/dt = alpha * d2T/d2X
    d2Td2X <- c(v[2:L], v[1]) + c(v[L], v[1:(L - 1)]) - 2 * v
    dT_diff <- pars * d2Td2X

    # Moving water
    # nS <- floor(vel)
    # pS <- vel - nS
    #
    # v_shifted <- roll(v, nS)
    # nS1 <- nS + 1
    # v_shifted1 <- roll(v, nS + 1)
    #
    # dT_flow <- v_shifted + pS * (v_shifted1 - v_shifted) - v
    dT_flow <- advection.1D(v, v = vel, dx = 1, C.up = v[L], C.down = v[1],
                            adv.method = adv_method)$dC

    dT <- dT_flow + dT_diff

    # heating of the ring after the sensor
    dT[sensorP + 1] <- dT[sensorP + 1] + y[L + 1]

    # heat dispersion
    dT <- dT - heatDisp * (v - baseTemp + 2.5 * sin(t/(60*24) * pi * 2))

    return(list(c(dT, 0)))
  }

  out <- ode.1D(y = vec, times = 1:simTime, func = fun, parms = alpha, nspec = 1,
                events = list(func = eventfun, root = TRUE),
                rootfunc = rootfun)

  if (plot_type == 'sensors') {
    ## Trend of the temperature at the sensors levels
    out %>%
      {.[, c(1, sensorP + 1, sensorP + 3, L + 2)]} %>%
      as.data.frame() %>%
      setNames(c('time', 'pre', 'post', 'heat')) %>%
      mutate(Amb = baseTemp + 2.5 * sin(time/(60*24) * pi * 2)) %>%
      pivot_longer(-time, values_to = "val", names_to = "trend") %>%
      ggplot(aes(time, val)) +
      geom_hline(yintercept = thresh) +
      geom_line(aes(color = trend)) +
      theme_minimal() +
      theme(panel.spacing = unit(0, "lines")) +
      labs(x = 'time', y = 'T°', color = 'sensor')
  } else {
    ## Trend of the temperature in the whole pipe
    out %>%
      as.data.frame() %>%
      pivot_longer(-time, values_to = "val", names_to = "x") %>%
      filter(time %in% round(seq.int(1, simTime, length.out = 40))) %>%
      ggplot(aes(as.numeric(x), val)) +
      geom_hline(yintercept = thresh) +
      geom_line(alpha = .5, show.legend = FALSE) +
      geom_point(aes(color = val)) +
      scale_color_gradient(low = "#56B1F7", high = "red") +
      facet_wrap(~ time) +
      theme_minimal() +
      theme(panel.spacing = unit(0, "lines")) +
      labs(x = 'x', y = 'T°', color = 'T°')
  }
}
It's interesting that with a higher number of segments (L = 500) and a high speed (vel = 2) it's possible to observe a spiking sequence at the post-heating sensor. Also, the processing time increases drastically, but more as an effect of the increased velocity than of the increased pipe resolution.
My biggest doubt now is whether ReacTran::advection.1D() makes sense in my context, since I'm modelling water temperature, while this function seems more related to the concentration of a solute in flowing water.
The problem looks like a PDE example with a mobile and a fixed phase. A good introduction to the "method of lines" (MOL) approach with R/deSolve can be found in the paper about ReacTran by Soetaert and Meysman (2012), doi.org/10.1016/j.envsoft.2011.08.011.
An example PDE can be found on slide 55 of some workshop slides, and more in the teaching package RTM.
R/deSolve/ReacTran tries to make ODEs/PDEs easy, but pitfalls remain. If numerical dispersion or oscillations occur, it can be caused by violating the Courant–Friedrichs–Lewy condition.
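As a rough illustration of that check (my addition, not from the original answer): for the advection term the Courant number is C = v * dt / dx, and keeping it at or below 1 for the step size actually used is the usual rule of thumb.
# Sketch: Courant number for a given velocity, time step and grid spacing
courant <- function(vel, dt, dx = 1) vel * dt / dx
courant(vel = 1, dt = 1)   # 1: borderline
courant(vel = 2, dt = 1)   # 2: expect dispersion/oscillations unless dx or dt is refined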

Add a Passing-Bablok regression line

I have to perform many comparisons between different measurement methods and I have to use the Passing-Bablok regression approach.
I would like to take advantage of ggplot2 and faceting, but I don't know how to add a geom_smooth layer based on the Passing-Bablok regression.
I was thinking about something like: https://stackoverflow.com/a/59173260/2096356
Furthermore, I would also need to show the regression line equation, with confidence interval for intercept and slope parameters, in each plot.
Edit with partial solution
I've found a partial solution combining the code provided in this post and in this answer.
## Regression algorithm
passing_bablok.fit <- function(x, y) {
  x_name <- deparse(substitute(x))
  lx <- length(x)
  l <- lx*(lx - 1)/2
  k <- 0
  S <- rep(NA, lx)
  for (i in 1:(lx - 1)) {
    for (j in (i + 1):lx) {
      k <- k + 1
      S[k] <- (y[i] - y[j])/(x[i] - x[j])
    }
  }
  S.sort <- sort(S)
  N <- length(S.sort)
  neg <- length(subset(S.sort, S.sort < 0))
  K <- floor(neg/2)
  if (N %% 2 == 1) {
    b <- S.sort[(N + 1)/2 + K]
  } else {
    b <- sqrt(S.sort[N/2 + K] * S.sort[N/2 + K + 1])
  }
  a <- median(y - b * x)
  res <- as.vector(c(a, b))
  names(res) <- c("(Intercept)", x_name)
  class(res) <- "Passing_Bablok"
  res
}

## Computing confidence intervals
passing_bablok <- function(formula, data, R = 100, weights = NULL){
  ret <- boot::boot(
    data = model.frame(formula, data),
    statistic = function(data, ind) {
      data <- data[ind, ]
      args <- rlang::parse_exprs(colnames(data))
      names(args) <- c("y", "x")
      rlang::eval_tidy(rlang::expr(passing_bablok.fit(!!!args)), data, env = rlang::current_env())
    },
    R = R
  )
  class(ret) <- c("Passing_Bablok", class(ret))
  ret
}

## Plotting confidence bands
predictdf.Passing_Bablok <- function(model, xseq, se, level) {
  pred <- as.vector(tcrossprod(model$t0, cbind(1, xseq)))
  if (se) {
    preds <- tcrossprod(model$t, cbind(1, xseq))
    data.frame(
      x = xseq,
      y = pred,
      ymin = apply(preds, 2, function(x) quantile(x, probs = (1 - level)/2)),
      ymax = apply(preds, 2, function(x) quantile(x, probs = 1 - ((1 - level)/2)))
    )
  } else {
    return(data.frame(x = xseq, y = pred))
  }
}
An example of usage:
z <- data.frame(x = rnorm(100, mean = 100, sd = 5),
                y = rnorm(100, mean = 110, sd = 8))

ggplot(z, aes(x, y)) +
  geom_point() +
  geom_smooth(method = passing_bablok) +
  geom_abline(slope = 1, intercept = 0)
So far, I haven't been able to show the regression line equation, with confidence interval for intercept and slope parameters (as +- or in parentheses).
You've arguably done the difficult part with the PaBa regression.
Here's a basic solution using your passing_bablok.fit function:
z <- data.frame(x = 101:200 + rnorm(100, sd = 10),
                y = 101:200 + rnorm(100, sd = 8))

mycoefs <- as.numeric(passing_bablok.fit(x = z$x, y = z$y))

paba_eqn <- function(thecoefs) {
  l <- list(m = format(thecoefs[2], digits = 2),
            b = format(abs(thecoefs[1]), digits = 2))
  if (thecoefs[1] >= 0) {
    eq <- substitute(italic(y) == m %.% italic(x) + b, l)
  } else {
    eq <- substitute(italic(y) == m %.% italic(x) - b, l)
  }
  as.character(as.expression(eq))
}

library(ggplot2)

ggplot(z, aes(x, y)) +
  geom_point() +
  geom_smooth(method = passing_bablok) +
  geom_abline(slope = 1, intercept = 0) +
  annotate("text", x = 110, y = 220, label = paba_eqn(mycoefs), parse = TRUE)
Note the equation will vary because of rnorm in the data creation.
The solution could definitely be made more slick and robust, but it works for both positive and negative intercepts.
Equation concept sourced from: https://stackoverflow.com/a/13451587/2651663
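To also get confidence intervals for the intercept and slope into the label (the part still missing above), one option is a percentile interval from the bootstrap replicates; here is a sketch that assumes the passing_bablok() boot wrapper defined in the question:
# Sketch: percentile CIs from the boot object returned by passing_bablok()
pb  <- passing_bablok(y ~ x, data = z, R = 999)
est <- pb$t0                                              # c(intercept, slope)
ci  <- apply(pb$t, 2, quantile, probs = c(0.025, 0.975))  # col 1 = intercept, col 2 = slope
lab <- sprintf("y = %.2f (%.2f to %.2f) x + %.2f (%.2f to %.2f)",
               est[2], ci[1, 2], ci[2, 2],
               est[1], ci[1, 1], ci[2, 1])
ggplot(z, aes(x, y)) +
  geom_point() +
  geom_smooth(method = passing_bablok) +
  annotate("text", x = 110, y = 220, label = lab)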

R Knitr not printing model results to file

I have an R script that is generating a number of plots as well as assessing a few linear models. For some reason, when I try to print out the plots and linear model statistics in a loop they don't end up in the file, but when I remove the loop they get printed to file.
R/knitr markup that doesn't work (the lines I want printed are marked with "# I want this to print"):
library('RODBC')
library('ggplot2')
library('dplyr')
library('reshape2')
con <- odbcConnect('yield_model')
sql <- "SELECT DISTINCT [grouping],[group],regionId,class,finalPk,avgArea,layer,runTime,random,name FROM dbo.pk p LEFT OUTER JOIN dred.dbo.yasMap y ON p.class = y.id WHERE random=1"
values <- sqlQuery(con,sql,stringsAsFactors = FALSE)
values$model = "NM"
groupings <- unique(select(values,regionId,class,layer))
groupings <- groupings[order(groupings$layer,groupings$regionId,groupings$class),]
for(i in 1:nrow(groupings)) {
data <- subset(values,regionId == groupings$regionId[i] & class == groupings$class[i] & layer == groupings$layer[i])
layer <- unique(select(data,layer))
region <- unique(select(data,regionId))
defectNo <- unique(select(data,class))
defectName <- unique(select(data,name))
count <- length(unique(data$avgArea))
average <- mean(data$finalPk)
myPlot <- ggplot(data=data,aes(x=avgArea,y=finalPk)) +
geom_point(size=4,color="red") +
ggtitle(paste("Defect=",defectName$name,"(", defectNo$class,"), Region=",region$regionId, ", Layer=", layer$layer,sep="")) +
geom_abline(intercept=average,slope=0,size=1,aes(color="mean"))
if(count > 1) {
myPlot <- myPlot + stat_smooth(method ="lm",formula = y ~ x, se = FALSE,size=1,aes(color="linear"))
}
if (count > 3) {
myPlot <- myPlot + stat_smooth(method = "lm", formula = y ~ log(x), se = FALSE, size=1, aes(color="log"))
myPlot <- myPlot + stat_smooth(method = "lm", formula = y ~ poly(x,2), se = FALSE, size=1, aes(color="poly"))
}
if(count > 1 & count < 4) {
myPlot <- myPlot + scale_colour_manual(name="",values=c("mean" = "orange","linear" = "green"))
} else if(count > 3) {
myPlot <- myPlot + scale_colour_manual(name="",values=c("mean" = "orange","linear" = "green","log" = "red","poly" = "blue"))
} else if (count == 2 | count == 3) {
}
plot(myPlot)
paste("Average Pk=",average,sep="") # I want this to print
if(count > 1) {
linMod <- lm(data$finalPk ~ data$avgArea)
coef(linMod) # I want this to print
summary(linMod) # I want this to print
}
if (count > 3) {
linModLog <- lm(data$finalPk ~ log(data$avgArea))
coef(linModLog) # I want this to print
summary(linModLog) # I want this to print
linModPoly <- lm(data$finalPk ~ poly(data$avgArea,2))
coef(linModPoly) # I want this to print
summary(linModPoly) # I want this to print
}
}
R/Knitr that does print:
library('RODBC')
library('ggplot2')
library('dplyr')
library('reshape2')
con <- odbcConnect('yield_model')
sql <- "SELECT DISTINCT [grouping],[group],regionId,class,finalPk,avgArea,layer,runTime,random,name FROM dbo.pk p LEFT OUTER JOIN dred.dbo.yasMap y ON p.class = y.id WHERE random=1"
values <- sqlQuery(con,sql,stringsAsFactors = FALSE)
values$model = "NM"
groupings <- unique(select(values,regionId,class,layer))
groupings <- groupings[order(groupings$layer,groupings$regionId,groupings$class),]
#for(i in 1:nrow(groupings)) {
i <- 1
data <- subset(values,regionId == groupings$regionId[i] & class == groupings$class[i] & layer == groupings$layer[i])
layer <- unique(select(data,layer))
region <- unique(select(data,regionId))
defectNo <- unique(select(data,class))
defectName <- unique(select(data,name))
count <- length(unique(data$avgArea))
average <- mean(data$finalPk)
myPlot <- ggplot(data=data,aes(x=avgArea,y=finalPk)) +
geom_point(size=4,color="red") +
ggtitle(paste("Defect=",defectName$name,"(", defectNo$class,"), Region=",region$regionId, ", Layer=", layer$layer,sep="")) +
geom_abline(intercept=average,slope=0,size=1,aes(color="mean"))
if(count > 1) {
myPlot <- myPlot + stat_smooth(method ="lm",formula = y ~ x, se = FALSE,size=1,aes(color="linear"))
}
if (count > 3) {
myPlot <- myPlot + stat_smooth(method = "lm", formula = y ~ log(x), se = FALSE, size=1, aes(color="log"))
myPlot <- myPlot + stat_smooth(method = "lm", formula = y ~ poly(x,2), se = FALSE, size=1, aes(color="poly"))
}
if(count > 1 & count < 4) {
myPlot <- myPlot + scale_colour_manual(name="",values=c("mean" = "orange","linear" = "green"))
} else if(count > 3) {
myPlot <- myPlot + scale_colour_manual(name="",values=c("mean" = "orange","linear" = "green","log" = "red","poly" = "blue"))
} else if (count == 2 | count == 3) {
}
plot(myPlot)
paste("Average Pk=",average,sep="") # I want this to print
if(count > 1) {
linMod <- lm(data$finalPk ~ data$avgArea)
coef(linMod) # I want this to print
summary(linMod) # I want this to print
}
if (count > 3) {
linModLog <- lm(data$finalPk ~ log(data$avgArea))
coef(linModLog) # I want this to print
summary(linModLog) # I want this to print
linModPoly <- lm(data$finalPk ~ poly(data$avgArea,2))
coef(linModPoly) # I want this to print
summary(linModPoly) # I want this to print
}
#}
My header values (I also tried with no results portion at all):
{r echo=FALSE,results='asis'}
Hopefully somebody has some advice for me. Appreciate the help!
Given that it's everything after the plot(myPlot) statement which is giving you problems in the loop, I'm thinking the answer here might apply to you.
Try adding a plot.new call after the plot(myPlot) statement.
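For reference, a sketch of what that might look like at the end of the loop body (my addition; note also that values returned by paste() and summary() are not auto-printed from inside a for loop, so wrapping them in print() is usually needed as well):
plot(myPlot)
plot.new()   # as suggested above
print(paste("Average Pk=", average, sep = ""))
if (count > 1) {
  linMod <- lm(data$finalPk ~ data$avgArea)
  print(coef(linMod))
  print(summary(linMod))
}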

How to plot the intersection of a hyperplane and a plane in R

I have a set of (2-dimensional) data points that I run through a classifier that uses higher order polynomial transformations. I want to visualize the results as a 2-dimensional scatterplot of the points with the classifier superimposed on top, preferably using ggplot2, as all other visualizations are made with it. Pretty much like this one that was used in the CaltechX online course on machine learning (the background color is optional).
I can display the points with colors and symbols and all, that's easy, but I can't figure out how to draw anything like the classifiers (the intersection of the classifying hyperplane with the plane representing my threshold). The only thing I found was stat_function, and that only takes a function with a single argument.
Edit:
The example that was asked for in the comments:
sample data:
"","x","y","x","x","y","value"
"1",4.17338115745224,0.303530843229964,1.26674990184152,17.4171102853774,0.0921309727918932,-1
"2",4.85514814266935,3.452660451876,16.7631779801937,23.5724634872656,11.9208641959486,1
"3",3.51938610081561,3.41200957307592,12.0081790673332,12.3860785266141,11.6418093267617,1
"4",3.18545089452527,0.933340128976852,2.97310914874565,10.1470974014319,0.87112379635852,-16
"5",2.77556006214581,2.49701633118093,6.93061880335166,7.70373365857888,6.23509055818427,-1
"6",2.45974169578403,4.56341833807528,11.2248303614692,6.05032920997851,20.8247869282818,1
"7",2.73947941488586,3.35344674880616,9.18669833727041,7.50474746458339,11.2456050970786,-1
"8",2.01721803518012,3.55453519499861,7.17027250203368,4.06916860145595,12.6347204524838,-1
"9",3.52376445778646,1.47073399974033,5.1825201951431,12.4169159539591,2.1630584979922,-1
"10",3.77387718763202,0.509284208528697,1.92197605658768,14.2421490273294,0.259370405056702,-1
"11",4.15821685106494,1.03675272315741,4.31104264382058,17.2907673804804,1.0748562089743,-1
"12",2.57985028671101,3.88512040604837,10.0230289934507,6.65562750184287,15.0941605694935,1
"13",3.99800728890114,2.39457673509605,9.5735352407471,15.9840622821066,5.73399774026327,1
"14",2.10979392635636,4.58358959294856,9.67042948411309,4.45123041169019,21.0092935565863,1
"15",2.26988795562647,2.96687697409652,6.73447830932721,5.15239133109813,8.80235897942413,-1
"16",1.11802248633467,0.114183261757717,0.127659454208164,1.24997427994995,0.0130378172656312,-1
"17",0.310411276295781,2.09426849964075,0.650084557879535,0.0963551604515758,4.38596054858751,-1
"18",1.93197490065359,1.72926536411978,3.340897280049,3.73252701675543,2.99035869954433,-1
"19",3.45879891654477,1.13636834081262,3.93046958599847,11.9632899450912,1.29133300600123,-1
"20",0.310697768582031,0.730971727753058,0.227111284709427,0.0965331034018534,0.534319666774291,-1
"21",3.88408110360615,0.915658151498064,3.55649052359657,15.0860860193904,0.838429850404852,-1
"22",0.287852146429941,2.16121324687265,0.622109872005114,0.0828588582043242,4.67084269845782,-1
"23",2.80277011333965,1.22467750683427,3.4324895146344,7.85552030822994,1.4998349957458,-1
"24",0.579150241101161,0.57801398797892,0.334756940497835,0.335415001767533,0.334100170299295-,1
"25",2.37193428212777,1.58276639413089,3.7542178708388,5.62607223873297,2.50514945839009,-1
"26",0.372461311053485,2.51207412336953,0.935650421453748,0.138727428231681,6.31051640130279,-1
"27",3.56567220995203,1.03982002707198,3.70765737388213,12.7140183088242,1.08122568869998,-1
"28",0.634770628530532,2.26303249713965,1.43650656059435,0.402933750845047,5.12131608311011,-1
"29",2.43812176748179,1.91849716124125,4.67752968967431,5.94443775306852,3.68063135769073,-1
"30",1.08741064323112,3.01656032912433,3.28023980783858,1.18246190701233,9.0996362192467,-1
"31",0.98,2.74,2.6852,0.9604,7.5076,1
"32",3.16,1.78,5.6248,9.9856,3.1684,1
"33",4.26,4.28,18.2328,18.1476,18.3184,-1
The code to generate a classifier:
perceptron_train <- function(data, maxIter = 10000) {
  set.seed(839)
  X <- as.matrix(data[1:5])
  Y <- data["value"]
  d <- dim(X)
  X <- cbind(rep(1, d[1]), X)
  W <- rep(0, d[2] + 1)
  count <- 0
  while (count < maxIter) {
    H <- sign(X %*% W)
    indexs <- which(H != Y)
    if (length(indexs) == 0) {
      break
    } else {
      i <- sample(indexs, 1)
      W <- W + 0.1 * (X[i,] * Y[i,])
    }
    count <- count + 1
    point <- as.data.frame(data[i,])
    plot_it(data, point, W, paste("plot", sprintf("%05d", count), ".png", sep = ""))
  }
  W
}
The code to generate the plot:
plot_it <- function(data, point, weights, name = "plot.png") {
  line <- weights_to_line(weights)
  point <- point
  png(name)
  p <- ggplot() + geom_point(data = data, aes(x, y, color = value, size = 2)) + theme(legend.position = "none")
  p <- p + geom_abline(intercept = line[2], slope = line[1])
  print(p)
  dev.off()
}
This was solved using material from the question and answers of Issues plotting a fitted SVM model's decision boundary using ggplot2's stat_contour(). I skipped the call to geom_point for the grid entries and some of the aesthetic definitions like scale_fill_manual and scale_colour_manual. Removing the dots for the grid entries solved the problem with the vanishing contour line in my case.
library(e1071)    # for svm(); not shown in the original snippet
library(ggplot2)

train_and_plot_svm <- function(train, kernel = "sigmoid", type = "C", cost, gamma) {
  fit <- svm(as.factor(value) ~ x + y, data = train, kernel = kernel, type = type, cost = cost)
  grid <- expand.grid(x = seq(from = -0.1, to = 15, length = 100),
                      y = seq(from = -0.1, to = 15, length = 100))
  decisionValues <- as.vector(attributes(predict(fit, grid, decision.values = TRUE))$decision)
  p <- predict(fit, grid)
  grid$value <- p
  grid$z <- decisionValues
  p <- ggplot() + stat_contour(data = grid, aes(x = x, y = y, z = z), breaks = c(0))
  p <- p + geom_point(data = train, aes(x, y, colour = as.factor(value)), alpha = 0.7)
  p <- p + xlim(0, 15) + ylim(0, 15) + theme(legend.position = "none")
}
Note that this function doesn't return the result of the svm training but the ggplot2 object.
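A hypothetical usage sketch (assuming the sample data above has been loaded into a data frame named train with columns x, y and value):
# 'train' is assumed to hold the sample data; the returned ggplot object must be printed
p <- train_and_plot_svm(train, kernel = "sigmoid", type = "C", cost = 1, gamma = 0.1)
print(p)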
This is what I got:

How to get something like Matplotlib's symlog scale in ggplot or lattice?

For very heavy-tailed data of both positive and negative sign, I sometimes like to see all the data on a plot without hiding structure in the unit interval.
When plotting with Matplotlib in Python, I can achieve this by selecting a symlog scale, which uses a logarithmic transform outside some interval, and linear plotting inside it.
Previously in R I have constructed similar behavior by transforming the data with an arcsinh on a one-off basis. However, tick labels and the like are very tricky to do right (see below).
Now, I am faced with a bunch of data where the subsetting in lattice or ggplot would be highly convenient. I don't want to use Matplotlib because of the subsetting, but I sure am missing symlog!
Edit:
I see that ggplot uses a package called scales, which solves a lot of this problem (if it works). Automatically choosing tick mark and label placing still looks pretty hard to do nicely though. Some combination of log_breaks and cbreaks perhaps?
Edit 2:
The following code is not too bad
library(scales)  # for log_breaks() and trans_new()

sinh.scaled <- function(x, scale = 1) { sinh(x) * scale }
asinh.scaled <- function(x, scale = 1) { asinh(x / scale) }

asinh_breaks <- function(n = 5, scale = 1, base = 10) {
  function(x) {
    log_breaks.callable <- log_breaks(n = n, base = base)
    rng <- range(x, na.rm = TRUE)
    minx <- floor(rng[1])
    maxx <- ceiling(rng[2])
    if (maxx == minx)
      return(sinh.scaled(minx, scale = scale))
    big.vals <- 0
    if (minx < (-scale)) {
      big.vals <- big.vals + 1
    }
    if (maxx > scale) {
      big.vals <- big.vals + 1
    }
    brk <- c()
    if (minx < (-scale)) {
      rbrk <- log_breaks.callable(c(-min(maxx, -scale), -minx))
      rbrk <- -rev(rbrk)
      brk <- c(brk, rbrk)
    }
    if (!(minx > scale | maxx < (-scale))) {
      rng <- c(max(minx, -scale), min(maxx, scale))
      minc <- floor(rng[1])
      maxc <- ceiling(rng[2])
      by <- floor((maxc - minc)/(n - big.vals)) + 1
      cb <- seq(minc, maxc, by = by)
      brk <- c(brk, cb)
    }
    if (maxx > scale) {
      brk <- c(brk, log_breaks.callable(c(max(minx, scale), maxx)))
    }
    brk
  }
}

asinh_trans <- function(scale = 1) {
  trans <- function(x) asinh.scaled(x, scale)
  inv <- function(x) sinh.scaled(x, scale)
  trans_new(paste0("asinh-", format(scale)), trans, inv,
            asinh_breaks(scale = scale),
            domain = c(-Inf, Inf))
}
A solution based on the package scales and inspired by Brian Diggs' post mentioned by @Dennis:
symlog_trans <- function(base = 10, thr = 1, scale = 1){
  trans <- function(x)
    ifelse(abs(x) < thr, x, sign(x) *
             (thr + scale * suppressWarnings(log(sign(x) * x / thr, base))))

  inv <- function(x)
    ifelse(abs(x) < thr, x, sign(x) *
             base^((sign(x) * x - thr) / scale) * thr)

  breaks <- function(x){
    sgn <- sign(x[which.max(abs(x))])
    if(all(abs(x) < thr))
      pretty_breaks()(x)
    else if(prod(x) >= 0){
      if(min(abs(x)) < thr)
        sgn * unique(c(pretty_breaks()(c(min(abs(x)), thr)),
                       log_breaks(base)(c(max(abs(x)), thr))))
      else
        sgn * log_breaks(base)(sgn * x)
    } else {
      if(min(abs(x)) < thr)
        unique(c(sgn * log_breaks()(c(max(abs(x)), thr)),
                 pretty_breaks()(c(sgn * thr, x[which.min(abs(x))]))))
      else
        unique(c(-log_breaks(base)(c(thr, -x[1])),
                 pretty_breaks()(c(-thr, thr)),
                 log_breaks(base)(c(thr, x[2]))))
    }
  }
  trans_new(paste("symlog", thr, base, scale, sep = "-"), trans, inv, breaks)
}
I am not sure whether the impact of the scale parameter is the same as in Python, but here are a couple of comparisons (see the Python version here):
data <- data.frame(x = seq(-50, 50, 0.01), y = seq(0, 100, 0.01))
data$y2 <- sin(data$x / 3)

# symlog x
ggplot(data, aes(x, y)) + geom_line() + theme_bw() +
  scale_x_continuous(trans = symlog_trans())

# symlog y
ggplot(data, aes(y, x)) + geom_line() + theme_bw() +
  scale_y_continuous(trans = "symlog")

# symlog both, threshold = 0.015 for y
# not too pretty because of too many breaks in a short interval
ggplot(data, aes(x, y2)) + geom_line() + theme_bw() +
  scale_y_continuous(trans = symlog_trans(thr = 0.015)) +
  scale_x_continuous(trans = "symlog")

# Again symlog both, threshold = 0.15 for y
ggplot(data, aes(x, y2)) + geom_line() + theme_bw() +
  scale_y_continuous(trans = symlog_trans(thr = 0.15)) +
  scale_x_continuous(trans = "symlog")
