Can GGPLOT make 2D summaries of data? - r

I wish to plot mean (or other function) of reaction time as a function of the location of the target in the x y plane.
As test data:
library(ggplot2)
xs <- runif(100,-1,1)
ys <- runif(100,-1,1)
rts <- rnorm(100)
testDF <- data.frame("x"=xs,"y"=ys,"rt"=rts)
I know I can do this:
p <- ggplot(data = testDF,aes(x=x,y=y))+geom_bin2d(bins=10)
What I would like to be able to do, is the same thing but plot a function of the data in each bin rather than counts. Can I do this?
Or do I need to generate the conditional means first in R (e.g. drt <- tapply(testDF$rt,list(cut(testDF$x,10),cut(testDF$y,10)),mean)) and then plot that?
Thank you.

Update With the release of ggplot2 0.9.0, much of this functionality is covered by the new additions of stat_summary2d and stat_summary_bin.
here is a gist for this answer: https://gist.github.com/1341218
here is a slight modification of stat_bin2d so as to accept arbitrary function:
StatAggr2d <- proto(Stat, {
objname <- "aggr2d"
default_aes <- function(.) aes(fill = ..value..)
required_aes <- c("x", "y", "z")
default_geom <- function(.) GeomRect
calculate <- function(., data, scales, binwidth = NULL, bins = 30, breaks = NULL, origin = NULL, drop = TRUE, fun = mean, ...) {
range <- list(
x = scales$x$output_set(),
y = scales$y$output_set()
)
# Determine binwidth, if omitted
if (is.null(binwidth)) {
binwidth <- c(NA, NA)
if (is.integer(data$x)) {
binwidth[1] <- 1
} else {
binwidth[1] <- diff(range$x) / bins
}
if (is.integer(data$y)) {
binwidth[2] <- 1
} else {
binwidth[2] <- diff(range$y) / bins
}
}
stopifnot(is.numeric(binwidth))
stopifnot(length(binwidth) == 2)
# Determine breaks, if omitted
if (is.null(breaks)) {
if (is.null(origin)) {
breaks <- list(
fullseq(range$x, binwidth[1]),
fullseq(range$y, binwidth[2])
)
} else {
breaks <- list(
seq(origin[1], max(range$x) + binwidth[1], binwidth[1]),
seq(origin[2], max(range$y) + binwidth[2], binwidth[2])
)
}
}
stopifnot(is.list(breaks))
stopifnot(length(breaks) == 2)
stopifnot(all(sapply(breaks, is.numeric)))
names(breaks) <- c("x", "y")
xbin <- cut(data$x, sort(breaks$x), include.lowest=TRUE)
ybin <- cut(data$y, sort(breaks$y), include.lowest=TRUE)
if (is.null(data$weight)) data$weight <- 1
ans <- ddply(data.frame(data, xbin, ybin), .(xbin, ybin), function(d) data.frame(value = fun(d$z)))
within(ans,{
xint <- as.numeric(xbin)
xmin <- breaks$x[xint]
xmax <- breaks$x[xint + 1]
yint <- as.numeric(ybin)
ymin <- breaks$y[yint]
ymax <- breaks$y[yint + 1]
})
}
})
stat_aggr2d <- StatAggr2d$build_accessor()
and usage:
ggplot(data = testDF,aes(x=x,y=y, z=rts))+stat_aggr2d(bins=3)
ggplot(data = testDF,aes(x=x,y=y, z=rts))+
stat_aggr2d(bins=3, fun = function(x) sum(x^2))
As well, here is a slight modification of stat_binhex:
StatAggrhex <- proto(Stat, {
objname <- "aggrhex"
default_aes <- function(.) aes(fill = ..value..)
required_aes <- c("x", "y", "z")
default_geom <- function(.) GeomHex
calculate <- function(., data, scales, binwidth = NULL, bins = 30, na.rm = FALSE, fun = mean, ...) {
try_require("hexbin")
data <- remove_missing(data, na.rm, c("x", "y"), name="stat_hexbin")
if (is.null(binwidth)) {
binwidth <- c(
diff(scales$x$input_set()) / bins,
diff(scales$y$input_set() ) / bins
)
}
try_require("hexbin")
x <- data$x
y <- data$y
# Convert binwidths into bounds + nbins
xbnds <- c(
round_any(min(x), binwidth[1], floor) - 1e-6,
round_any(max(x), binwidth[1], ceiling) + 1e-6
)
xbins <- diff(xbnds) / binwidth[1]
ybnds <- c(
round_any(min(y), binwidth[1], floor) - 1e-6,
round_any(max(y), binwidth[2], ceiling) + 1e-6
)
ybins <- diff(ybnds) / binwidth[2]
# Call hexbin
hb <- hexbin(
x, xbnds = xbnds, xbins = xbins,
y, ybnds = ybnds, shape = ybins / xbins,
IDs = TRUE
)
value <- tapply(data$z, hb#cID, fun)
# Convert to data frame
data.frame(hcell2xy(hb), value)
}
})
stat_aggrhex <- StatAggrhex$build_accessor()
and usage:
ggplot(data = testDF,aes(x=x,y=y, z=rts))+stat_aggrhex(bins=3)
ggplot(data = testDF,aes(x=x,y=y, z=rts))+
stat_aggrhex(bins=3, fun = function(x) sum(x^2))

This turned out to be harder than I expected.
You can almost trick ggplot into doing this, by providing a weights aesthetic, but that only gives you the sum of the weights in the bin, not the mean (and you have to specify drop=FALSE to retain negative bin values). You can also retrieve either counts or density within a bin, but neither of those really solves the problem.
Here's what I ended up with:
## breaks vector (slightly coarser than the 10x10 spec above;
## even 64 bins is a lot for binning only 100 points)
bvec <- seq(-1,1,by=0.25)
## helper function
tmpf <- function(x,y,z,FUN=mean,breaks) {
midfun <- function(x) (head(x,-1)+tail(x,-1))/2
mids <- list(x=midfun(breaks$x),y=midfun(breaks$y))
tt <- tapply(z,list(cut(x,breaks$x),cut(y,breaks$y)),FUN)
mt <- melt(tt)
## factor order gets scrambled (argh), reset it
mt$X1 <- factor(mt$X1,levels=rownames(tt))
mt$X2 <- factor(mt$X2,levels=colnames(tt))
transform(X,
x=mids$x[mt$X1],
y=mids$y[mt$X2])
}
ggplot(data=with(testDF,tmpf(x,y,rt,breaks=list(x=bvec,y=bvec))),
aes(x=x,y=y,fill=value))+
geom_tile()+
scale_x_continuous(expand=c(0,0))+ ## expand to fill plot region
scale_y_continuous(expand=c(0,0))
This assumes equal bin widths, etc., could be extended ... it really is too bad that (as far as I can tell) stat_bin2d doesn't accept a user-specified function.

Related

Cannot update/edit ggplot2 object exported from a package (`gratia`) in R

I hope I am missing something painfully obvious here.
I wish to update (e.g., fix title, labs, etc.) on a ggplot object produced from gratia::draw(). Not really sure why I am unable to update the object.
Is there a simple solution?
# devtools::install_github('gavinsimpson/gratia')
library('mgcv')
library('gratia')
dat <- gamSim(1, n = 400, dist = "normal", scale = 2, verbose = FALSE)
mod <- gam(y ~ s(x0), data = dat, method = "REML")
draw(mod)
p = draw(mod)
# P is a ggobject.
class(p)
#> [1] "gg" "ggplot"
So, why can't I update p?
p + ggtitle("My title")
Created on 2019-02-26 by the reprex package (v0.2.1)
The object returned by draw.gam is the output of cowplot::plot_grid (not a pure ggplot2 graphical object).
I made a small change into draw.gam function of gratia. .
Now the p object can be manipulated:
# The modified draw.gam function
mydraw.gam <- function (object, parametric = TRUE, select = NULL, scales = c("free",
"fixed"), align = "hv", axis = "lrtb", n = 100, unconditional = FALSE,
overall_uncertainty = TRUE, dist = 0.1, ...)
{
scales <- match.arg(scales)
S <- smooths(object)
select <- gratia:::check_user_select_smooths(smooths = S, select = select)
d <- gratia:::smooth_dim(object)
take <- d <= 2L
select <- select[take]
S <- S[take]
d <- d[take]
is_re <- vapply(object[["smooth"]], gratia:::is_re_smooth, logical(1L))
is_by <- vapply(object[["smooth"]], gratia:::is_by_smooth, logical(1L))
if (any(is_by)) {
S <- vapply(strsplit(S, ":"), `[[`, character(1L), 1L)
}
npara <- 0
nsmooth <- length(S)
if (isTRUE(parametric)) {
terms <- parametric_terms(object)
npara <- length(terms)
p <- vector("list", length = npara)
}
g <- l <- vector("list", length = nsmooth)
for (i in unique(S)) {
eS <- evaluate_smooth(object, smooth = i, n = n, unconditional = unconditional,
overall_uncertainty = overall_uncertainty, dist = dist)
l[S == i] <- split(eS, eS[["smooth"]])
}
l <- l[select]
d <- d[select]
g <- g[select]
if (length(g) == 0L) {
message("Unable to draw any of the model terms.")
return(invisible(g))
}
for (i in seq_along(l)) {
g[[i]] <- draw(l[[i]])
}
if (isTRUE(parametric)) {
for (i in seq_along(terms)) {
p[[i]] <- evaluate_parametric_term(object, term = terms[i])
g[[i + length(g)]] <- draw(p[[i]])
}
}
if (isTRUE(identical(scales, "fixed"))) {
wrapper <- function(x) {
range(x[["est"]] + (2 * x[["se"]]), x[["est"]] -
(2 * x[["se"]]))
}
ylims <- range(unlist(lapply(l, wrapper)))
if (isTRUE(parametric)) {
ylims <- range(ylims, unlist(lapply(p, function(x) range(x[["upper"]],
x[["lower"]]))))
}
gg <- seq_along(g)[c(d == 1L, rep(TRUE, npara))]
for (i in gg) {
g[[i]] <- g[[i]] + lims(y = ylims)
}
}
g
}
# Example no. 1
dat <- gamSim(1, n = 400, dist = "normal", scale = 2, verbose = FALSE)
mod <- gam(y ~ s(x0), data = dat, method = "REML")
p <- mydraw.gam(mod)
p[[1]] + ggtitle("My title")
# Example no. 2
mod <- gam(y ~ s(x0) + x1, data = dat, method = "REML")
p <- mydraw.gam(mod)
# Plot graphs separately
p[[1]] + ggtitle("My title")
p[[2]] + ggtitle("My title")
# Arrange the two plots on the same figure
cowplot::plot_grid(plotlist = p)
Reposts from Gavin Simpson and Hao Ye, respectively:
I think the only way to change the title(s) on the individual plots of smooths would be to use draw(evaluate_smooth(model, "smooth"), title = "My title") individually at the moment.
You might be able to hack a title in a different way:
draw(mod) +
ggplot2::coord_cartesian(clip = "off") +
ggplot2::theme(plot.margin = ggplot2::unit(c(0.05, 0, 0, 0), "npc")) +
ggplot2::annotate("text", x = 0.5, y = 1, vjust = 0, label = "TITLE", size = 6)

How do I speed up my function, specifically the ggplot commands?

I put together a function to identify outliers. It takes a dataframe and then shows plots of the data with lines to indicate potential outliers. It'll give a table with outliers marked, too.
But, it is SLOOOW. The problem is it takes a really long time for the plots to load.
I was curious if you might have advice on how to speed this up.
Related: Is the default plotting system faster than ggplot?
I'll start with the dependencies
#These next four functions are not mine. They're used in GetOutliers()
ExtractDetails <- function(x, down, up){
outClass <- rep("N", length(x))
indexLo <- which(x < down)
indexHi <- which(x > up)
outClass[indexLo] <- "L"
outClass[indexHi] <- "U"
index <- union(indexLo, indexHi)
values <- x[index]
outClass <- outClass[index]
nOut <- length(index)
maxNom <- max(x[which(x <= up)])
minNom <- min(x[which(x >= down)])
outList <- list(nOut = nOut, lowLim = down,
upLim = up, minNom = minNom,
maxNom = maxNom, index = index,
values = values,
outClass = outClass)
return(outList)
}
Hampel <- function(x, t = 3){
#
mu <- median(x, na.rm = TRUE)
sig <- mad(x, na.rm = TRUE)
if (sig == 0){
message("Hampel identifer implosion: MAD scale estimate is zero")
}
up<-mu+t*sig
down<-mu-t*sig
out <- list(up = up, down = down)
return(out)
}
ThreeSigma <- function(x, t = 3){
#
mu <- mean(x, na.rm = TRUE)
sig <- sd(x, na.rm = TRUE)
if (sig == 0){
message("All non-missing x-values are identical")
}
up<-mu+t* sig
down<-mu-t * sig
out <- list(up = up, down = down)
return(out)
}
BoxplotRule <- function(x, t = 1.5){
#
xL <- quantile(x, na.rm = TRUE, probs = 0.25, names = FALSE)
xU <- quantile(x, na.rm = TRUE, probs = 0.75, names = FALSE)
Q<-xU-xL
if(Q==0){
message("Boxplot rule implosion: interquartile distance is zero")
}
up<-xU+t*Q
down<-xU-t*Q
out <- list(up = up, down = down)
return(out)
}
FindOutliers <- function(x, t3 = 3, tH = 3, tb = 1.5){
threeLims <- ThreeSigma(x, t = t3)
HampLims <- Hampel(x, t = tH)
boxLims <- BoxplotRule(x, t = tb)
n <- length(x)
nMiss <- length(which(is.na(x)))
threeList <- ExtractDetails(x, threeLims$down, threeLims$up)
HampList <- ExtractDetails(x, HampLims$down, HampLims$up)
boxList <- ExtractDetails(x, boxLims$down, boxLims$up)
sumFrame <- data.frame(method = "ThreeSigma", n = n,
nMiss = nMiss, nOut = threeList$nOut,
lowLim = threeList$lowLim,
upLim = threeList$upLim,
minNom = threeList$minNom,
maxNom = threeList$maxNom)
upFrame <- data.frame(method = "Hampel", n = n,
nMiss = nMiss, nOut = HampList$nOut,
lowLim = HampList$lowLim,
upLim = HampList$upLim,
minNom = HampList$minNom,
maxNom = HampList$maxNom)
sumFrame <- rbind.data.frame(sumFrame, upFrame)
upFrame <- data.frame(method = "BoxplotRule", n = n,
nMiss = nMiss, nOut = boxList$nOut,
lowLim = boxList$lowLim,
upLim = boxList$upLim,
minNom = boxList$minNom,
maxNom = boxList$maxNom)
sumFrame <- rbind.data.frame(sumFrame, upFrame)
threeFrame <- data.frame(index = threeList$index,
values = threeList$values,
type = threeList$outClass)
HampFrame <- data.frame(index = HampList$index,
values = HampList$values,
type = HampList$outClass)
boxFrame <- data.frame(index = boxList$index,
values = boxList$values,
type = boxList$outClass)
outList <- list(summary = sumFrame, threeSigma = threeFrame,
Hampel = HampFrame, boxplotRule = boxFrame)
return(outList)
}
#strip non-numeric variables out of a dataframe
num_vars <- function(df){
X <- which(sapply(df, is.numeric))
num_vars <- df[names(X)]
return(num_vars)
}
This is the function
GetOutliers <- function(df){
library('dplyr')
library('ggplot2')
#strip out the non-numeric columns
df_out <- num_vars(df)
#initialize the data frame
df_out$Hampel <- NA
df_out$threeSigma <- NA
df_out$boxplotRule <- NA
df_out_id <- df_out
#identify outliers for each column
for (i in 1:length(names(num_vars(df)))){
#find the outliers
Outs <- FindOutliers(df_out[[i]])
OutsSum <- Outs$summary
#re-enter the outlier status
df_out$Hampel <- NA
df_out$threeSigma <- NA
df_out$boxplotRule <- NA
ifelse(is.na(Outs$Hampel), print(), df_out[unlist(Outs$Hampel[1]),]$Hampel <- TRUE)
ifelse(is.na(Outs$threeSigma), print(), df_out[unlist(Outs$threeSigma[1]),]$threeSigma <- TRUE)
ifelse(is.na(Outs$boxplotRule), print(), df_out[unlist(Outs$boxplotRule[1]),]$boxplotRule <- TRUE)
#visualize the outliers and print outlier information
Temp <- df_out
A <- colnames(Temp)[i]
AA <- paste(A,"Index")
colnames(Temp)[i] <- 'curr_column'
#table with outlier status
X <- arrange(subset(Temp,Hampel == TRUE | boxplotRule == TRUE | threeSigma == TRUE), desc(curr_column))
#scatterplot with labels
Y <- ggplot(Temp,aes(seq_along(curr_column),curr_column)) + geom_point() +
geom_hline(yintercept=OutsSum$lowLim[1],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$lowLim[2],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$lowLim[3],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[1],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[2],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[3],linetype = 'dashed') +
geom_text(aes(40,OutsSum$lowLim[1],label="ThreeSigma Lower",vjust=-1)) +
geom_text(aes(40,OutsSum$lowLim[2],label="Hampel Lower",vjust=-1)) +
geom_text(aes(40,OutsSum$lowLim[3],label="Boxplot Lower",vjust=-1)) +
geom_text(aes(40,OutsSum$upLim[1],label="ThreeSigma Upper",vjust=-1)) +
geom_text(aes(40,OutsSum$upLim[2],label="Hampel Upper",vjust=-1)) +
geom_text(aes(40,OutsSum$upLim[3],label="Boxplot Upper",vjust=-1)) +
xlab(AA) + ylab(A)
#scatterplot without labels
Z <- ggplot(Temp,aes(seq_along(curr_column),curr_column)) + geom_point() +
geom_hline(yintercept=OutsSum$lowLim[1],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$lowLim[2],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$lowLim[3],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[1],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[2],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[3],linetype = 'dashed') +
xlab(AA) + ylab(A)
U <- ggplot(Temp,aes(curr_column)) + geom_density() + xlab(A)
print(A)
print(X)
print(OutsSum)
print(Z)
print(Y)
print(U)
#mark the extreme outliers, the rest are reasonable outliers
A <- colnames(df_out_id[i])
Q <- as.numeric(readline(prompt="Enter the index for final Extreme value on the upper limit (if none, enter 0): "))
W <- as.numeric(readline(prompt="Enter the index for first Extreme value on the lower limit (if none, enter 0): "))
col <- df_out_id[i]
df_out_id[i] <- sapply(col[[1]], function(x){
if(Q>1 & x %in% X$curr_column[1:Q]) return('Extreme')
if(W>1 & x %in% X$curr_column[W:length(X$curr_column)]) return('Extreme')
else if (x %in% X$curr_column[Q+1:length(X$curr_column)]) return('Reasonable')
else return('Non-Outlier')
})
}
#return a dataframe with outlier status, excluding the outlier ID columns
summary(df_out_id)
return(df_out_id[1:(length(names(df_out_id))-3)])
}
Example
library('ISLR')
data(Carseats)
GetOutliers(Carseats)
It'll show you the outliers for each numeric variable.
It'll plot the variable density and then a scatterplot with identifier lines
It will also accept input so you can mark some outliers as reasonable and other as extreme

R Programming other alternatives for plot

I wonder how you can simplify these two :
plot (payroll,wins)
id = identify(payroll, wins,labels = code, n = 5)
plot (payroll,wins)
with(data, text(payroll, wins, labels = code, pos = 1, cex=0.5))
using other alternatives - pch() dan as.numeric()?
Not sure it's easier but you change pch during identification as below (taken from the R-help). Every time you click empty point change to filled-in dot.
# data simulation
data <- data.frame(payroll = rnorm(10), wins = rnorm(10), code = letters[1:10])
identifyPch <- function(x, y = NULL, n = length(x), plot = FALSE, pch = 19, ...)
{
xy <- xy.coords(x, y)
x <- xy$x
y <- xy$y
sel <- rep(FALSE, length(x))
while (sum(sel) < n) {
ans <- identify(x[!sel], y[!sel], labels = which(!sel), n = 1, plot = plot, ...)
if(!length(ans)) {
break
}
ans <- which(!sel)[ans]
points(x[ans], y[ans], pch = pch)
sel[ans] <- TRUE
}
## return indices of selected points
which(sel)
}
if(dev.interactive()) { ## use it
with(data, plot(payroll,wins))
id = with(data, identifyPch(payroll, wins))
}

Ternary plots with binned means/medians

I am looking to generate a ternary plot with binned polygons (either triangle or hex, preferably in a ggplot framework) where the color of the polygon is a binned mean or median of selected values.
This script gets very close, but triangle cell color is representative of a number of observations, rather than a mean value of observations contained within the triangle cell.
So rather than soley providing X,Y, and Z; I would provide a fourth fill/value variable is provided from which binned means or medians are calculated and represented as a color on a gradient.
Akin to the below image, though in a ternary framework with an additional axis.
Image of stat_summary_hex() plot with color as binned mean value
I appreciate the help. Thank you.
Dummy data to begin with:
#load libraries
devtools::install_git('https://bitbucket.org/nicholasehamilton/ggtern')
library(ggtern)
library(ggplot)
# example data
sig <- matrix(c(3,0,0,2),2,2)
data <- data.frame(mvrnorm(n=10000, rep(2, 2), sig))
data$X1 <- data$X1/max(data$X1)
data$X2 <- data$X2/max(data$X2)
data$X1[which(data$X1<0)] <- runif(length(data$X1[which(data$X1<0)]))
data$X2[which(data$X2<0)] <- runif(length(data$X2[which(data$X2<0)]))
data$X3 <- with(data, 1-X1-X2)
data <- data[data$X3 >= 0,]
data$X4 <- rnorm(dim(data)[1])
data <- data.frame(X = data$X1, Y = data$X2, Z = data$X3, fill_variable = data$X4)
str(data)
# simple ternary plot where color of point is the fill variable value
ggtern(data,aes(X,Y,Z, color = fill_variable))+geom_point()
# 2D example, not a ternary though. Keep in mind in geom_hex Z is the fill, not the additional axis like ggtern
ggplot(data,aes(X,Y))+stat_summary_hex(aes(z = fill_variable))
This code isn't cleaned up, but it's a good jumping off point. Credit for original goes the OP referenced in the first question.
I made some minor adjustments to the count_bin function to instead of doing bin counts, it does bin medians. Use at your own risk and please point out any bugs. For my implementation this reports 0 for NA bins.
Example:
Function for binned median (pardon the name, just saves time):
count_bin <- function(data, minT, maxT, minR, maxR, minL, maxL) {
ret <- data
ret <- with(ret, ret[minT <= X1 & X1 < maxT,])
ret <- with(ret, ret[minL <= X2 & X2 < maxL,])
ret <- with(ret, ret[minR <= X3 & X3 < maxR,])
if(is.na(median(ret$VAR))) {
ret <- 0
} else {
ret <- median(ret$VAR)
}
ret
}
Modified heatmap function:
heatmap3d <- function(data, inc, logscale=FALSE, text=FALSE, plot_corner=TRUE) {
# When plot_corner is FALSE, corner_cutoff determines where to stop plotting
corner_cutoff = 1
# When plot_corner is FALSE, corner_number toggles display of obervations in the corners
# This only has an effect when text==FALSE
corner_numbers = TRUE
count <- 1
points <- data.frame()
for (z in seq(0,1,inc)) {
x <- 1- z
y <- 0
while (x>0) {
points <- rbind(points, c(count, x, y, z))
x <- round(x - inc, digits=2)
y <- round(y + inc, digits=2)
count <- count + 1
}
points <- rbind(points, c(count, x, y, z))
count <- count + 1
}
colnames(points) = c("IDPoint","T","L","R")
#str(points)
#str(count)
# base <- ggtern(data=points,aes(L,T,R)) +
# theme_bw() + theme_hidetitles() + theme_hidearrows() +
# geom_point(shape=21,size=10,color="blue",fill="white") +
# geom_text(aes(label=IDPoint),color="blue")
# print(base)
polygons <- data.frame()
c <- 1
# Normal triangles
for (p in points$IDPoint) {
if (is.element(p, points$IDPoint[points$T==0])) {
next
} else {
pL <- points$L[points$IDPoint==p]
pT <- points$T[points$IDPoint==p]
pR <- points$R[points$IDPoint==p]
polygons <- rbind(polygons,
c(c,p),
c(c,points$IDPoint[abs(points$L-pL) < inc/2 & abs(points$R-pR-inc) < inc/2]),
c(c,points$IDPoint[abs(points$L-pL-inc) < inc/2 & abs(points$R-pR) < inc/2]))
c <- c + 1
}
}
#str(c)
# Upside down triangles
for (p in points$IDPoint) {
if (!is.element(p, points$IDPoint[points$T==0])) {
if (!is.element(p, points$IDPoint[points$L==0])) {
pL <- points$L[points$IDPoint==p]
pT <- points$T[points$IDPoint==p]
pR <- points$R[points$IDPoint==p]
polygons <- rbind(polygons,
c(c,p),
c(c,points$IDPoint[abs(points$T-pT) < inc/2 & abs(points$R-pR-inc) < inc/2]),
c(c,points$IDPoint[abs(points$L-pL) < inc/2 & abs(points$R-pR-inc) < inc/2]))
c <- c + 1
}
}
}
#str(c)
# IMPORTANT FOR CORRECT ORDERING.
polygons$PointOrder <- 1:nrow(polygons)
colnames(polygons) = c("IDLabel","IDPoint","PointOrder")
df.tr <- merge(polygons,points)
Labs = ddply(df.tr,"IDLabel",function(x){c(c(mean(x$T),mean(x$L),mean(x$R)))})
colnames(Labs) = c("Label","T","L","R")
#str(Labs)
#triangles <- ggtern(data=df.tr,aes(L,T,R)) +
# geom_polygon(aes(group=IDLabel),color="black",alpha=0.25) +
# geom_text(data=Labs,aes(label=Label),size=4,color="black") +
# theme_bw()
# print(triangles)
bins <- ddply(df.tr, .(IDLabel), summarize,
maxT=max(T),
maxL=max(L),
maxR=max(R),
minT=min(T),
minL=min(L),
minR=min(R))
#str(bins)
count <- ddply(bins, .(IDLabel), summarize,
N=count_bin(data, minT, maxT, minR, maxR, minL, maxL)
#N=mean(data)
)
df <- join(df.tr, count, by="IDLabel")
str(count)
Labs = ddply(df,.(IDLabel,N),function(x){c(c(mean(x$T),mean(x$L),mean(x$R)))})
colnames(Labs) = c("Label","N","T","L","R")
if (plot_corner==FALSE){
corner <- ddply(df, .(IDPoint, IDLabel), summarize, maxperc=max(T,L,R))
corner <- corner$IDLabel[corner$maxperc>=corner_cutoff]
df$N[is.element(df$IDLabel, corner)] <- 0
if (text==FALSE & corner_numbers==TRUE) {
Labs$N[!is.element(Labs$Label, corner)] <- ""
text=TRUE
}
}
heat <- ggtern(data=df,aes(L,T,R)) +
geom_polygon(aes(fill=N,group=IDLabel),color="black",alpha=1, size = 0.1,show.legend = F)
if (logscale == TRUE) {
heat <- heat + scale_fill_gradient(name="Observations", trans = "log",
low=palette[2], high=palette[4])
} else {
heat <- heat + scale_fill_distiller(name="Median Value",
palette = "Spectral")
}
heat <<- heat +
Tlab("x") +
Rlab("y") +
Llab("z") +
theme_bw() +
theme(axis.tern.arrowsep=unit(0.02,"npc"), #0.01npc away from ticks ticklength
axis.tern.arrowstart=0.25,axis.tern.arrowfinish=0.75,
axis.tern.text=element_text(size=12),
axis.tern.arrow.text.T=element_text(vjust=-1),validate = F,
axis.tern.arrow.text.R=element_text(vjust=2),
axis.tern.arrow.text.L=element_text(vjust=-1),
#axis.tern.arrow.text=element_text(size=12),
axis.tern.title=element_text(size=15),
axis.tern.text=element_blank(),
axis.tern.arrow.text=element_blank())
if (text==FALSE) {
print(heat)
} else {
print(heat + geom_text(data=Labs,aes(label=N),size=3,color="white"))
}
}
Dummy example:
# dummy example
sig <- matrix(c(3,3,3,3),3,3)
data <- data.frame(mvrnorm(n=10000, rep(2, 2), sig))
data$X1[which(data$X1<0)] <- runif(length(data$X1[which(data$X1<0)]))
data$X2[which(data$X2<0)] <- runif(length(data$X2[which(data$X2<0)]))
data$X3 <- with(data, 1-X1-X2)
data <- data[data$X3 >= 0,]
data$VAR <- rnorm(dim(data)[1])
data <- data.frame(X = data$X1, Y = data$X2, Z = data$X3, fill_variable = data$X4)
str(data)
ggtern(data,aes(X1,
X2,
X3, color = VAR))+geom_point(size = 5)+scale_color_distiller(palette = "Spectral")
heatmap3d(data,.05)

How do I add shading and color to the confidence intervals in ggplot 2 generated Kaplan-Meier plot? [duplicate]

This question already has answers here:
Create a ggplot2 survival curve with censored table
(2 answers)
Closed 7 years ago.
I would like to have the shading of the confidence intervals for the survival estimates. Now I have black lines.
library(survival)
library(ggplot2)
library(GGally)
data(lung)
sf.sex <- survfit(Surv(time, status) ~ sex, data = lung)
pl.sex <- ggsurv(sf.sex, CI = TRUE)
pl.sex
Here is an easy, almost-right solution:
pl.sex <- ggsurv(sf.sex, CI = FALSE) +
geom_ribbon(aes(ymin=low,ymax=up,fill=group),alpha=0.3)
Set CI = FALSE to get rid of the dashed-line CI bands then add geom_ribbon() to get the confidence bands you want.
This is only mostly right, though, since we want the confidence bands to use the step function as well. To get exactly what we want, we can use some code from this link to create a new stat for geom_ribbon called "stepribbon" as follows:
library(proto)
stairstepn <- function( data, direction="hv", yvars="y" ) {
direction <- match.arg( direction, c( "hv", "vh" ) )
data <- as.data.frame( data )[ order( data$x ), ]
n <- nrow( data )
if ( direction == "vh" ) {
xs <- rep( 1:n, each = 2 )[ -2 * n ]
ys <- c( 1, rep( 2:n, each = 2 ) )
} else {
ys <- rep( 1:n, each = 2 )[ -2 * n ]
xs <- c( 1, rep( 2:n, each = 2))
}
data.frame(
x = data$x[ xs ]
, data[ ys, yvars, drop=FALSE ]
, data[ xs, setdiff( names( data ), c( "x", yvars ) ), drop=FALSE ]
)
}
stat_stepribbon <- function( mapping=NULL, data=NULL, geom="ribbon", position="identity" ) {
StatStepribbon$new( mapping=mapping, data=data, geom=geom, position=position )
}
StatStepribbon <- proto(ggplot2:::Stat, {
objname <- "stepribbon"
desc <- "Stepwise area plot"
desc_outputs <- list(
x = "stepped independent variable",
ymin = "stepped minimum dependent variable",
ymax = "stepped maximum dependent variable"
)
required_aes <- c( "x", "ymin", "ymax" )
default_geom <- function(.) GeomRibbon
default_aes <- function(.) aes( x=..x.., ymin = ..y.., ymax=Inf )
calculate <- function( ., data, scales, direction = "hv", yvars = c( "ymin", "ymax" ), ...) {
stairstepn( data = data, direction = direction, yvars = yvars )
}
examples <- function(.) {
DF <- data.frame( x = 1:3, ymin = runif( 3 ), ymax=rep( Inf, 3 ) )
ggplot( DF, aes( x=x, ymin=ymin, ymax=ymax ) ) + stat_stepribbon()
}
})
With that new stat you can get the solution I think you were really looking for:
pl.sex <- ggsurv(sf.sex, CI = FALSE) +
geom_ribbon(aes(ymin=low,ymax=up,fill=group),stat="stepribbon",alpha=0.3) +
guides(fill=guide_legend("sex"))
I found a better solution. You just use the Rcmdrplugin KMggplot2 which also has the optional features of adding the median line and number at risk table.

Resources