Condition layer across panels in lattice - r

I would like to plot individual subject means for two different conditions in a lattice stripplot with two panels. I would also like to add within-subject confidence intervals that I have calculated and stored in separate data frame. I am trying to overlay these confidence intervals with latticeExtra's layer function. When I add the layer, either both sets of intervals display on both panels (as illustrated in code and first image below) or both sets of intervals display on only the first panel if I add [subscripts] to the x's and y's in the layer command (illustrated in second code clip and image below). How do I get the appropriate intervals to display on the appropriate panel?
library(latticeExtra)
raw_data <- data.frame(subject = rep(1:6, 4), cond1 = as.factor(rep(1:2, each = 12)), cond2 = rep(rep(c("A", "B"), each = 6), 2), response = c(2:7, 6:11, 3:8, 7:12))
summary_data <- data.frame(cond1 = as.factor(rep(1:2, each = 2)), cond2 = rep(c("A", "B"), times = 2), mean = aggregate(response ~ cond2 * cond1, raw_data, mean)$response, within_ci = c(0.57, 0.54, 0.6, 0.63))
summary_data$lci <- summary_data$mean - summary_data$within_ci
summary_data$uci <- summary_data$mean + summary_data$within_ci
subject_stripplot <- stripplot(response ~ cond1 | cond2, groups = subject, data = raw_data,
panel = function(x, y, ...) {
panel.stripplot(x, y, type = "b", lty = 2, ...)
panel.average(x, y, fun = mean, lwd = 2, col = "black", ...) # plot line connecting means
}
)
addWithinCI <- layer(panel.segments(x0 = cond1, y0 = lci, x1 = cond1, y1 = uci, subscripts = TRUE), data = summary_data, under = FALSE)
plot(subject_stripplot + addWithinCI)
Stripplot with both sets of intervals on both panels:
addWithinCI2 <- layer(panel.segments(x0 = cond1[subscripts], y0 = lci[subscripts], x1 = cond1[subscripts], y1 = uci[subscripts], subscripts = TRUE), data = summary_data, under = FALSE)
plot(subject_stripplot + addWithinCI2)
Stripplot with both sets of intervals on only the first panel

One possible solution would be to print the stripplot (e.g., inside a png or any other graphics device) and subsequently modify each sub-panel using trellis.focus.
## display stripplot
print(subject_stripplot)
## loop over grops
for (i in c("A", "B")) {
# subset of current group
dat <- subset(summary_data, cond2 == i)
# add intervals to current panel
trellis.focus(name = "panel", column = ifelse(i == "A", 1, 2), row = 1)
panel.segments(x0 = dat$cond1, y0 = dat$lci,
x1 = dat$cond1, y1 = dat$uci, subscripts = TRUE)
trellis.unfocus()
}
Another (possibly more convenient) solution would be to create a separate xyplot and set the lower and upper y values (y0, y1) passed on to panel.segments manually in dependence of the current panel.number. In contrast to the initial approach using trellis.focus, the thus created plot can be stored in a variable and is hence available for subsequent processing inside R.
p_seg <- xyplot(lci ~ cond1 | cond2, data = summary_data, ylim = c(1, 13),
panel = function(...) {
# lower and upper y values
y0 <- list(summary_data$lci[c(1, 3)], summary_data$lci[c(2, 4)])
y1 <- list(summary_data$uci[c(1, 3)], summary_data$uci[c(2, 4)])
# insert vertical lines depending on current panel
panel.segments(x0 = 1:2, x1 = 1:2,
y0 = y0[[panel.number()]],
y1 = y1[[panel.number()]])
})
p_comb <- subject_stripplot +
as.layer(p_seg)
# print(p_comb)

Another solution that does not require latticeExtra (from Duncan Mackay):
summary_data$cond3 <- sapply(summary_data$cond2, pmatch, LETTERS)
mypanel <- function(x, y, ..., lci, uci, scond1, scond3, groups, type, lty){
pnl = panel.number()
panel.xyplot(x, y, ..., groups = groups, type = type, lty = lty)
panel.average(x, y, horizontal = FALSE, col = "black", lwd = 3)
panel.segments(x0 = scond1[scond3 == pnl],
y0 = lci[scond3 == pnl],
x1 = scond1[scond3 == pnl],
y1 = uci[scond3 == pnl])
}
with(summary_data,
stripplot(response ~ cond1 | cond2, data = raw_data,
groups = subject,
lci = lci,
uci = uci,
scond1 = summary_data$cond1,
scond3 = cond3,
type = "b",
lty = 2,
panel = mypanel)
)

Related

Coloring plotly's boxplot box lines by factor

I have a data.frame with two factor variables (type and age in df below) and a single numeric variable (value in df) that I'd like to plot using R's plotly package as a grouped boxplot.
Here's the data.frame:
set.seed(1)
df <- data.frame(type = c(rep("t1", 1000), rep("t2", 1000), rep("t3", 1000), rep("t4", 1000), rep("t5", 1000), rep("t6", 1000)),
age = rep(c(rep("y", 500),rep("o", 500)), 6),
value = rep(c(runif(500, 5, 10), runif(500, 7.5, 12.5)), 6),
stringsAsFactors = F)
df$age <- factor(df$age, levels = c("y", "o"), ordered = T)
Here's how I'm currently plotting it:
library(plotly)
library(dplyr)
plot_ly(x = df$type, y = df$value, name = df$age, color = df$type, type = "box", showlegend = F) %>%
layout(yaxis = list(title = "Diversity"), boxmode = "group", boxgap = 0, boxgroupgap = 0)
Which gives:
My question is whether it is possible to color the lines of the boxes by df$age?
I know that for coloring all the boxes with a single color (e.g., #AFB1B5) I can add to the plot_ly function:
line = list(color = "#AFB1B5")
But that would color all box lines similarly whereas what I'm trying to do is to color them differently by df$age.
There is a way to do this that's not that too complicated, but rather ugly. Or something that is over the top cumbersome (I didn't realize how far I was digging until I was done...)
Before I go too far... I noticed that there is a ton of white space and that you have gaps set to zero. You can add the parameter offsetgroup and get rid of a lot more whitespace. Check it out:
plot_ly(data = df,
x = ~type, y = ~value, name = ~age, offsetgroup = ~type, # <- I'm new!
color = ~type, type = "box", showlegend = F) %>%
layout(yaxis = list(title = "Diversity"),
boxmode = "group", boxgap = 0, boxgroupgap = 0)
With the not-too-complicated-but-kind-of-ugly method
The line is the box outline, the median line, the lines from Q1 to the lower fence, from Q3 to the upper fence, and the whiskers.
I assigned the plot to the object plt for this code. When I checked the object, it didn't have the data element, so I built the plot next.
plt <- plotly_build(plt)
Then I added colors with lapply.
# this looks ugly!
lapply(1:12,
function(i){
nm = plt$x$data[[i]]$name
cr = ifelse(nm == "o",
"#66FF66", "black")
plt$x$data[[i]]$line$color <<- cr # change graph by age
}
)
plt
With the ridiculous-amount-of-code-for-a-few-lines-but-looks-better method
I guess it isn't a few lines. It's 48 lines.
For this method, you need to build the plot like I did in the before (plotly_build), so that the data element is in the plt object.
Then you have to determine the first and third quantile, the IQR, the max and min values between the quantiles and 1.5 * IQR for each type and age grouping so that you have the y values for the lines.
I wrote a function to get the upper and lower fences.
fen <- function(vals){
iq = 1.5 * IQR(vals)
q3 = quantile(vals, 3/4) # top of the box
uf = q3 + iq # top of the fence
vt = max(vals[vals > q3 & vals < uf]) # max value in range
q1 = quantile(vals, 1/4) # btm of the box
bf = q1 - iq # btm of the fence
vb = min(vals[vals < q1 & vals > bf]) # min value in range
sz = function(no){
if(length(no) > 1) {no = no[[1]]}
return(no)
}
vt = sz(vt)
vb = sz(vb)
return(list(vt, vb))
}
Then I used this function and the data to determine the remaining values needed to draw the lines.
df1 <- df %>%
# have to reverse the order or it won't line up
mutate(age = factor(age, levels = c("o", "y"), ordered = T)) %>%
group_by(type, age) %>%
summarise(ufen = fen(value)[[1]], # top of the fence
q3 = quantile(value, 3/4), # top of the box
q1 = quantile(value, 1/4), # btm of the box
dfen = fen(value)[[2]]) # btm of the fence
To plot these new lines, I used shapes which is equivalent to ggplot2 annotations. (annotations in Plotly is primarily for text.)
There are several steps to drawing these lines. First I've started with some things that are essentially the same in every line. After that is a vector that helps place the lines on the x-axis.
# line shape basics; the same for every line
tellMe <- function(shade){
list(type = "line",
line = list(color = shade),
xref = "paper",
yref = "y")
}
# setup for placing lines on the x-axis; these are % of space
xers = c(rep(.0825, 4), rep(.083 * 3, 4), rep(.083 * 5, 4))
Now four lapply statements: the upper fences, the lower fences, the upper whiskers, and the lower whiskers.
lns <- lapply(1:12,
function(i) { # upper fence lines
nm = ifelse(df1[i, ]$age == "o",
"#66FF66", "black")
xb = 1/12 * (i - 1)
xn = xb + (1/6 * xers[[i]])
more = tellMe(nm)
c(x0 = xn, x1 = xn,
y0 = df1[i, ]$q3[[1]], # it's named; this makes it val only
y1 = df1[i, ]$ufen, more)
})
mlns <- lapply(1:12,
function(i) { # lower fence lines
nm = ifelse(df1[i, ]$age == "o",
"#66FF66", "black")
xb = 1/12 * (i - 1)
xn = xb + (1/6 * xers[[i]])
more = tellMe(nm)
c(x0 = xn, x1 = xn,
y0 = df1[i, ]$q1[[1]], # it's named; this makes it val only
y1 = df1[i, ]$dfen, more)
})
# default whisker width is 1/2 the width of the box
# current boxes of 1/4 of the space by type
# with domain [0, 1], the box width is 1/12 * .5, so
# the whisker width is
ww = 1/12 * .5 *.5
# already have the center, so half on each side...
ww = ww * .5
wwlns <- lapply(1:12,
function(i) { # upper fence whisker
nm = ifelse(df1[i, ]$age == "o",
"#66FF66", "black")
xb = 1/12 * (i - 1)
xn = xb + (1/6 * xers[[i]])
more = tellMe(nm)
c(x0 = xn - ww, x1 = xn + ww,
y0 = df1[i, ]$ufen, y1 = df1[i, ]$ufen,
more)
})
wwm <- lapply(1:12,
function(i) { # lower fence whisker
nm = ifelse(df1[i, ]$age == "o",
"#66FF66", "black")
xb = 1/12 * (i - 1)
xn = xb + (1/6 * xers[[i]])
more = tellMe(nm)
c(x0 = xn - ww, x1 = xn + ww,
y0 = df1[i, ]$dfen, y1 = df1[i, ]$dfen,
more)
})
Now you have to concatenate the lists and add them to the plot.
# combine shapes
shp <- append(lns, mlns)
shp <- append(shp, wwlns)
shp <- append(shp, wwm)
plt %>% layout(shapes = shp)
There are OBVIOUSLY better color choices out there.

How to plot grouped, stacked bar charts in R to show if the proportions of data are homogeneous across groups

Suppose you have 3 sets, numbered from 1 to 3.
Each set contains unique ID's associated to categorical active/inactive labels for variables A, B, C, D.
You want to make plots that show, for each variable, the proportions of active/inactive labels side to side in the 3 sets, to show if they are homogeneous or not.
The only way I could come up with to do this was the following:
# Simulate data: 3 different sets, each with 4 different variables, each with different proportions of labels
sets = c("1", "2", "3")
variables = c("A", "B", "C", "D")
labs = c("active", "inactive")
N = 10000
set.seed(1325)
d = data.frame("set" = sample(sets, N, replace = TRUE, prob = c(0.1, 0.2, 0.7)),
"variable" = sample(variables, N, replace = TRUE, prob = c(0.15, 0.25, 0.2, 0.4)))
d["label"] = "x"
for (v in variables) {
vw = which(d[["variable"]] == v)
vp = runif(1, 0.1, 0.6)
d[vw, "label"] = sample(labs, length(vw), replace = TRUE, prob = c(vp, 1 - vp))
}
d["ID"] <- 1:N
s = aggregate(ID ~ set + variable + label, d, length)
s.l = aggregate(ID ~ set + variable, d, length)
colnames(s.l)[3] <- "ID.l"
s = merge(s, s.l)
s["frac"] = with(s, ID / ID.l)
op = par()
par(mfrow = c(2,2))
for (v in variables) {
barplot(frac ~ label + set, s, subset = variable == v, col = c("blue", "orange"), main = v)
}
par(op)
Given how the labels are assigned in the code, their proportions are different for the different variables, but homogeneous across the sets.
To show what happens when the proportions are not homogeneous:
# change the proportion of labels for one set
sw = which(d[["set"]] == 1)
d.u = d
d.u[sw, "label"] = sample(labs, length(sw), replace = TRUE, prob = c(0.05, 1 - 0.05))
s.u = aggregate(ID ~ set + variable + label, d.u, length)
s.u.l = aggregate(ID ~ set + variable, d.u, length)
colnames(s.u.l)[3] <- "ID.l"
s.u = merge(s.u, s.u.l)
s.u["frac"] = with(s.u, ID / ID.l)
op = par()
par(mfrow = c(2,2))
for (v in variables) {
barplot(frac ~ label + set, s.u, subset = variable == v, col = c("blue", "orange"), main = v)
}
par(op)
Question: do you think this can be done or represented better / more efficiently?
In particular, I would have thought that the aggregate and division part might be already built-in for some type of plot.
And I am wondering if using mfrow and plotting an array of separate plots is any good, or if there is some way to make a more cohesive lattice or grid of plots by using variable as one of the parameters.
Any ideas?
Instead of the double aggregate calls, consider by to split data frame by variable and run xtabs + proportions on each subset:
boxplot formula style
op <- par(mfrow = c(2,2))
tbls <- by(d, d$variable, FUN=function(sub) {
tbl <- xtabs(~ label + set, sub)
props <- data.frame(proportions(tbl, 2))
barplot(Freq ~ label + set, props,
col = c("blue", "orange"),
main = sub$variable[1])
})
par(op)
barplot matrix style
op <- par(mfrow = c(2,2))
tbls <- by(d, d$variable, FUN=function(sub) {
tbl <- xtabs(~ label + set, sub)
props <- proportions(tbl, 2)
barplot(props, xlab = "set", ylab = "frac",
col = c("blue", "orange"),
main = sub$variable[1])
})
par(op)

Print segments for factor levels into stripchart in base R

I have a dataframe with a numerical variable and a factor variable, like this:
set.seed(123)
df <- data.frame(
numbers = c(rnorm(50, 3), runif(50)),
levels = sample(LETTERS[1:5], 100, replace = T)
)
What I'd like to do is a stripchart that plots df$numbersagainst df$levels and inserts vertical segment lines representing the mean for each level.
stripchart(df$numbers ~ df$levels, method = "jitter")
Obviously, I could insert the means line for each level separately, e.g.:
segments(x0 = mean(df$numbers[df$levels=="A"]), y0 = 1-0.3, y1 = 1+0.3, col = "red" )
And so on for all other levels, which is tedious if you have multiple levels. So I've tried this forloop:
for(i in seq(unique(df$levels))){
segments(x0 = mean(df$numbers[df$levels==i]),
y0 = i - 0.3,
y1 = i + 0.3,
col = "red", lty = 3, lwd = 2)
}
But that doesn't print anything (and doesn't throw an error either). What's the cleanest and simplest code to insert the means segments?
As the 'levels' column is factor, use levels to get the levels of the factor 'un1', then loop over the sequence of unique elements, get the mean of the 'numbers' where the levels column is the unique value to create the segments
un1 <- levels(df$levels)
for(i in seq_along(un1)){
segments(x0 = mean(df$numbers[df$levels==un1[i]]),
y0 = i - 0.3,
y1 = i + 0.3,
col = "red", lty = 3, lwd = 2)
}
-checking the mean
with(df, tapply(numbers, levels, FUN = mean))
# A B C D E
#1.390202 1.541655 2.086605 2.377122 1.663159

Calculate 5th quantile of curve generated from vectors of X, Y points

I have these curves below:
These curves were generated using a library called discreteRV.
library(discreteRV)
placebo.rate <- 0.5
mmm.rate <- 0.3
mmm.power <- power.prop.test(p1 = placebo.rate, p2 = mmm.rate, power = 0.8, alternative = "one.sided")
n <- as.integer(ceiling(mmm.power$n))
patients <- seq(from = 0, to = n, by = 1)
placebo_distribution <- dbinom(patients, size = n, prob = placebo.rate)
mmm_distribution <- dbinom(patients, size = n, prob = mmm.rate)
get_pmf <- function(p1, p2) {
X1 <- RV(patients,p1, fractions = F)
X2 <- RV(patients,p2, fractions = F)
pmf <- joint(X1, X2, fractions = F)
return(pmf)
}
extract <- function(string) {
ints <- unlist(strsplit(string,","))
x1 <- as.integer(ints[1])
x2 <- as.integer(ints[2])
return(x1-x2)
}
diff_prob <- function(pmf) {
diff <- unname(sapply(outcomes(pmf),FUN = extract)/n)
probabilities <- unname(probs(pmf))
df <- data.frame(diff,probabilities)
df <- aggregate(. ~ diff, data = df, FUN = sum)
return(df)
}
most_likely_rate <- function(x) {
x[which(x$probabilities == max(x$probabilities)),]$diff
}
mmm_rate_diffs <- diff_prob(get_pmf(mmm_distribution,placebo_distribution))
placebo_rate_diffs <- diff_prob(get_pmf(placebo_distribution,placebo_distribution))
plot(mmm_rate_diffs$diff,mmm_rate_diffs$probabilities * 100, type = "l", lty = 2, xlab = "Rate difference", ylab = "# of trials per 100", main = paste("Trials with",n,"patients per treatment arm",sep = " "))
lines(placebo_rate_diffs$diff, placebo_rate_diffs$probabilities * 100, lty = 1, xaxs = "i")
abline(v = c(most_likely_rate(placebo_rate_diffs), most_likely_rate(mmm_rate_diffs)), lty = c(1,2))
legend("topleft", legend = c("Alternative hypothesis", "Null hypothesis"), lty = c(2,1))
Basically, I took two binomial discrete random variables, created a joint probability mass function, determined the probability of any given rate difference then plotted them to demonstrate a distribution of those rate differences if the null hypothesis was true or if the alternative hypothesis was true over 100 identical trials.
Now I want to illustrate the 5% percentile on the null hypothesis curve. Unfortunately, I don't know how to do this. If I simply use quantile(x = placebo_rate_diffs$diff, probs = 0.05, I get -0.377027. This can't be correct looking at the graph. I want to calculate the 5th percentile like I would using pbinom() but I don't know how to do that with a graph created from essentially what are just x and y vectors.
Maybe I can approximate these two curves as binomial since they appear to be, but I am still not sure how to do this.
Any help would be appreciated.

Interpolate curved line betweenstart and end points for ggplot2

I'd like to create a sankey-like plot that I can create in ggplot2 where there are curved lines between my start and end locations. Currently, I have data that looks like this:
df <- data.frame(Line = rep(letters[1:4], 2),
Location = rep(c("Start", "End"), each=4),
X = rep(c(1, 10), each = 4),
Y = c(c(1,3, 5, 15), c(9,12, 14, 6)),
stringsAsFactors = F)
ex:
Line Location X Y
1 a Start 1 1
2 a End 10 9
and creates a plot that looks something like this:
library(ggplot2)
ggplot(df) +
geom_path(aes(x= X, y= Y, group = Line))
I would like to see the data come out like this:
This is another option for setting up the data:
df2 <- data.frame(Line = letters[1:4],
Start.X= rep(1, 4),
Start.Y = c(1,3,5,15),
End.X = rep(10, 4),
End.Y = c(9,12,14,6))
ex:
Line Start.X Start.Y End.X End.Y
1 a 1 1 10 9
I can find examples of how to add a curve to the graphics of base R but these examples don't demonstrate how to get a data frame of the points in between in order to draw that curve. I would prefer to use dplyr for data manipulation. I imagine this will require a for-loop to build a table of the interpolated points.
These examples are similar but do not produce an s-shaped curve:
Plotting lines on map - gcIntermediate
http://flowingdata.com/2011/05/11/how-to-map-connections-with-great-circles/
Thank you in advance!
The code below creates curved lines via a logistic function. You could use whatever function you like instead, but this is the main idea. I should note that for other than graphical purposes, creating a curved line out of 2 points is a bad idea. It implies that the data show a certain type of relation while it actually doesn't imply that relation.
df <- data.frame(Line = rep(letters[1:4], 2),
Location = rep(c("Start", "End"), each=4),
X = rep(c(1, 10), each = 4),
Y = c(c(1,3, 5, 15), c(9,12, 14, 6)),
stringsAsFactors = F)
# logistic function for curved lines
logistic = function(x, y, midpoint = mean(x)) {
ry = range(y)
if (y[1] < y[2]) {
sign = 2
} else {
sign = -2
}
steepness = sign*diff(range(x)) / diff(ry)
out = (ry[2] - ry[1]) / (1 + exp(-steepness * (x - midpoint))) + ry[1]
return(out)
}
# an example
x = c(1, 10)
y = c(1, 9)
xnew = seq(1, 10, .5)
ynew = logistic(xnew, y)
plot(x, y, type = 'b', bty = 'n', las = 1)
lines(xnew, ynew, col = 2, type = 'b')
# applying the function to your example
xnew = seq(min(df$X), max(df$X), .1) # new x grid
m = matrix(NA, length(xnew), 4) # matrix to store results
uniq = unique(df$Line) # loop over all unique values in df$Line
for (i in seq_along(uniq)) {
m[, i] = logistic(xnew, df$Y[df$Line == uniq[i]])
}
# base R plot
matplot(xnew, m, type = 'b', las = 1, bty = 'n', pch = 1)
# put stuff in a dataframe for ggplot
df2 = data.frame(x = rep(xnew, ncol(m)),
y = c(m),
group = factor(rep(1:ncol(m), each = nrow(m))))
library(ggplot2)
ggplot(df) +
geom_path(aes(x= X, y= Y, group = Line, color = Line)) +
geom_line(data = df2, aes(x = x, y = y, group = group, color = group))

Resources