Using panel.linejoin with missing data - r

This question is closely related to the question and answers received here, where @Mr. Flick helped me with a question I had regarding xyplot in the lattice package. But since I'm now troubleshooting some code, I thought I'd ask the "broader public" for some help.
I've been asked by the reviewers of our paper, to present patient body mass index follow-up data similarly to the way we presented their intraoperative data in the link I provided above.
When I plot the data in an analogous fashion, the black line representing the mean stops at three months, but I want it to go through all time points. See image below.
Here's my data called bmi_data
dput(bmi_data)
structure(list(StudyID = structure(c(1L, 2L, 3L, 4L, 5L, 6L,
7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L), .Label = c("P1",
"P2", "P3", "P4", "P5", "P6", "P7"), class = "factor"), BMI = c(37.5,
43.82794785, 48.87848306, 39.93293705, 42.76788399, 39.44207394,
50.78043704, 25.61728395, 37.91099773, 39.02185224, 36.00823045,
37.75602259, 34.06360931, 39.12591051, 25.98765432, 34.89937642,
32.95178633, 35.62719098, 35.75127802, 32.27078777, NA, 23.61111111,
32.34835601, NA, 34.33165676, NA, 26.53375883, 35.79604579, 23.20987654,
31.71060091, NA, 34.29355281, NA, NA, NA), BMITIME2 = structure(c(5L,
5L, 5L, 5L, 5L, 5L, 5L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L,
4L, 4L), .Label = c("12 months FU", "3 months FU", "6 months FU",
"Over 12 months FU", "Preoperative BMI"), class = "factor"),
TIME2 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), .Label = c("Preoperative BMI",
"3 months FU", "6 months FU", "12 months FU", "Over 12 months FU"
), class = "factor")), .Names = c("StudyID", "BMI", "BMITIME2",
"TIME2"), class = "data.frame", row.names = c(NA, -35L))
Some data.frame manipulation to get the right order of my time-points.
bmi_data$TIME2 <- factor(bmi_data$BMITIME2, unique(bmi_data$BMITIME2))
And now my code that doesn't seem to be working properly.
require(lattice)
# Standard error of the mean of `x`, computed from the non-missing values only.
# NOTE: this name masks base::stderr() for the rest of the session.
stderr <- function(x) {
  n_obs <- sum(!is.na(x))
  sqrt(var(x, na.rm = TRUE) / n_obs)
}
# Lattice panel function that shades a band of mean +/- one standard error
# of `y` at each distinct value of `x`.
#
# x, y     : panel data as passed in by the calling panel function.
# col.se   : fill colour of the band; defaults to the current "plot.line"
#            colour from the trellis settings.
# alpha.se : transparency of the band.
# ...      : accepted so the function can be called like other panel
#            functions; not used here.
panel.sem <- function(x, y, col.se = plot.line$col, alpha.se = .10, ...) {
  # Must be assigned before `col.se` is first used: the default argument is
  # evaluated lazily and refers to this local variable.
  plot.line <- trellis.par.get("plot.line")
  # x positions walked forwards (upper edge) and then backwards (lower edge)
  # so the polygon closes on itself.
  xs <- if (is.factor(x)) {
    factor(c(levels(x), rev(levels(x))), levels = levels(x))
  } else {
    xx <- sort(unique(x))
    c(xx, rev(xx))
  }
  means <- tapply(y, x, mean, na.rm = TRUE)
  # Stored under a distinct name so the stderr() helper is not shadowed.
  sems <- tapply(y, x, stderr)
  panel.polygon(xs, c(means + sems, rev(means - sems)),
                col = col.se, alpha = alpha.se)
}
xyplot(BMI~bmi_data$TIME2, groups=StudyID, data=bmi_data, ty=c("l", "p"),
panel = function(x, y, ...) {
panel.sem(x,y, col.se="grey")
panel.xyplot(x, y, ...)
panel.linejoin(x, y, horizontal = FALSE ,..., col="black", lty=1, lwd=4)}
,xlab="Measurement Time Point",
ylab=expression("BMI"~"(kg/m^2)"))
Which results in this plot:
Any help for solving this question is greatly appreciated!!!

The problem is that you have missing (NA) values in this data set. panel.linejoin() calls mean() over the observations at each x, and if there are NA values, the mean will by default be NA and no line is drawn there. To change that, you can pass a wrapper function to panel.linejoin via its fun argument. Try
# Individual patient trajectories (lines + points), a shaded mean +/- SE band,
# and a thick black line joining the per-time-point means.
xyplot(BMI ~ TIME2, groups = StudyID, data = bmi_data, type = c("l", "p"),
  panel = function(x, y, ...) {
    panel.sem(x, y, col.se = "grey")
    panel.xyplot(x, y, ...)
    # The summary function is replaced with an NA-tolerant mean so that time
    # points containing missing BMI values still get a point on the line.
    panel.linejoin(x, y, horizontal = FALSE, ..., col = "black",
                   lty = 1, lwd = 4,
                   fun = function(v) mean(v, na.rm = TRUE))
  },
  xlab = "Measurement Time Point",
  ylab = expression("BMI"~"(kg/m^2)")
)

Here's an approach using ggplot + dplyr but don't know lattice:
if (!require("pacman")) install.packages("pacman")
pacman::p_load(ggplot2, dplyr)

# Mean BMI per time point, ignoring missing values. The constant `ave` column
# gives geom_path() a single group so all means are joined into one line.
ave_data <- bmi_data %>%
  group_by(TIME2) %>%
  summarize(BMI = mean(BMI, na.rm = TRUE)) %>%
  mutate(ave = TRUE)

ggplot(bmi_data, aes(y = BMI, x = TIME2)) +
  geom_point(aes(color = StudyID), shape = 21) +
  geom_smooth(aes(group = 1), alpha = .1) +
  geom_line(size = .8, aes(group = StudyID, color = StudyID)) +
  geom_path(data = ave_data, color = "black", size = 1.2, aes(group = ave)) +
  xlab("Measurement Time Point") + theme_bw() +
  ylab(expression("BMI"~"(kg/m^2)")) +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        legend.position = c(.87, .70)
  ) +
  # StudyID is mapped to `color` (not `fill`), so the legend title has to be
  # set on the colour guide to take effect.
  guides(color = guide_legend(title = "ID"))

Related

multi panel plot show x-axis labels while minimizing white space base r

I used the answer from this question: Avoid wasting space when placing multiple aligned plots onto one page
# Global graphics settings for the multi-panel figure.
par(mfrow = c(5, 2), # 5 rows x 2 columns of panels, filled row by row
oma = c(2, 2, 0, 0), # two rows of text at the outer left and bottom margin
mar = c(1,6,2,1), # per-panel margins in lines: bottom 1, left 6, top 2, right 1
mgp = c(2, 1, 0), # axis label at 2 rows distance, tick labels at 1 row
xpd = NA) # allow content to protrude into outer margin (and beyond)
and I get very close with the plot I have:
But the x-axis labels are cut off in the final plot. I need them to be seen, and they need to stay rotated.
I tried
par(mar = c(1,6,2,1)+1)
before the final 2 plots for the panel but that changes the size of the actual plot instead of making the x-axis labels visible.
How can I make the x-axis labels visible?
If you can use the data from the linked example that would work fine.
set.seed(42)
catA <- factor(c("m100", "m500", "m1000", "m2000", "m3000", "m5000"))
catB <- factor(20:28)
samples <- 100
rsample <- function(v) v[ceiling(runif(samples, max=length(v)))]
Tab <- data.frame(catA = rsample(catA),
catB = rsample(catB),
valA = rnorm(samples, 150, 8),
valB = pmin(1,pmax(0,rnorm(samples, 0.5, 0.3))))
# 2 x 2 grid of plots sharing outer axis labels; `op` keeps the previous
# graphics settings so they can be restored at the end.
op <- par(mfrow = c(2,2),
oma = c(5,4,0,0) + 0.1,
mar = c(0,0,1,1) + 0.1)
for (i in 0:3) {
# i %% 2 selects the column of the grid: Tab[[1]] (catA) or Tab[[2]] (catB)
x <- Tab[[1 + i %% 2]]
# i %/% 2 selects the row of the grid: Tab[[3]] (valA) or Tab[[4]] (valB)
plot(x, Tab[[3 + i %/% 2]], axes = FALSE)
# x axis: ticks on every panel, tick labels only on the bottom row
axis(side = 1,
at=1:nlevels(x),
labels = if (i %/% 2 == 1) levels(x) else FALSE)
# y axis: tick labels only on the left column
axis(side = 2, labels = (i %% 2 == 0))
box(which = "plot", bty = "l")
}
# Shared axis titles drawn in the outer margin
title(xlab = "Some Categories",
ylab = "Some Values",
outer = TRUE, line = 3)
# Restore the graphics settings saved in `op`
par(op)
Labels get automatically hidden according to the margin size. You are probably using the RStudio "Plots" tab, and if you resize the window, more labels show up. It is better to use another device, e.g. the pdf or png device, where you can define a fixed size so that the output is always the same.
You could use case handling via modulo for the entire axes, not just the labels. Further, you could set the las parameter, which rotates the tick labels, also using modulo, yielding 1 (always horizontal) or 2 (always perpendicular to the axis) depending on the case (here long or short labels). Slightly expand the second oma value to show the y-axis label.
# Draw the 2 x 2 panel figure into a fixed-size PNG so the result does not
# depend on the size of an interactive plot window.
png('plot1.png', width=600, height=600) ## open device
op <- par(mfrow=c(2, 2), oma=c(6, 6, 0, 0) + 0.1, mar=c(0, 0, 1, 1) + 0.1)
for (i in 0:3) {
  # i %% 2 picks the grid column (Tab[[1]] or Tab[[2]] as x variable),
  # i %/% 2 picks the grid row (Tab[[3]] or Tab[[4]] as y variable).
  x <- Tab[[1 + i %% 2]]
  plot(x, Tab[[3 + i %/% 2]], axes=FALSE)
  if (i %/% 2 == 1) {
    # Bottom row only: draw the x axis. las is 2 (perpendicular) for the
    # left panel and 1 (horizontal) for the right panel.
    axis(side=1, at=seq_len(nlevels(x)), labels=levels(x),
         las=(1 - i %% 2) + 1)
  }
  if (i %% 2 == 0) {
    # Left column only: draw the y axis with perpendicular tick labels.
    axis(side=2, labels=TRUE, las=2)
  }
  box()
}
title(xlab="Some Categories", ylab="Some Values", outer=TRUE, line=4)
par(op)
dev.off() ## close device (plot is saved in wd)
I assumed you only wanted to show axis ticks and labels at the outer margins, otherwise, please comment.
Data:
Tab <- structure(list(catA = structure(c(6L, 6L, 5L, 4L, 3L, 3L, 4L,
1L, 3L, 4L, 2L, 4L, 6L, 5L, 2L, 6L, 6L, 1L, 2L, 3L, 6L, 1L, 6L,
6L, 1L, 3L, 2L, 6L, 2L, 6L, 4L, 4L, 2L, 4L, 1L, 4L, 1L, 5L, 6L,
3L, 2L, 2L, 1L, 6L, 2L, 6L, 6L, 3L, 6L, 3L, 2L, 2L, 2L, 4L, 1L,
4L, 4L, 5L, 5L, 3L, 4L, 6L, 4L, 3L, 6L, 5L, 5L, 4L, 4L, 5L, 1L,
1L, 5L, 2L, 5L, 4L, 1L, 2L, 3L, 1L, 3L, 1L, 2L, 3L, 4L, 3L, 5L,
1L, 1L, 5L, 4L, 1L, 5L, 6L, 6L, 4L, 5L, 3L, 4L, 3L), levels = c("m100",
"m1000", "m2000", "m3000", "m500", "m5000"), class = "factor"),
catB = structure(c(6L, 2L, 2L, 4L, 9L, 9L, 7L, 7L, 5L, 1L,
6L, 8L, 7L, 5L, 5L, 5L, 1L, 4L, 6L, 8L, 4L, 4L, 6L, 6L, 7L,
4L, 9L, 9L, 3L, 7L, 9L, 6L, 6L, 9L, 8L, 6L, 8L, 2L, 7L, 6L,
2L, 1L, 5L, 8L, 7L, 8L, 2L, 9L, 3L, 2L, 7L, 3L, 8L, 4L, 7L,
7L, 2L, 1L, 2L, 7L, 9L, 5L, 6L, 2L, 5L, 2L, 5L, 3L, 2L, 2L,
7L, 4L, 4L, 5L, 4L, 2L, 8L, 6L, 8L, 7L, 9L, 8L, 3L, 3L, 7L,
7L, 9L, 8L, 2L, 3L, 2L, 8L, 2L, 2L, 1L, 1L, 5L, 2L, 7L, 7L
), levels = c("20", "21", "22", "23", "24", "25", "26", "27",
"28"), class = "factor"), valA = c(159.607723004788, 158.358008697342,
141.974330825281, 164.787855213382, 144.665812729937, 150.844110499649,
146.621952945049, 149.02119862436, 151.505544276012, 150.953287663976,
149.799259593061, 150.864581823536, 146.116518113227, 145.966262954497,
136.711207360681, 146.941330185009, 145.898797936978, 171.615128002758,
139.103070150482, 151.098049748469, 138.05099946147, 138.236514068506,
150.997619089576, 142.026886920928, 149.985419085562, 146.573928948593,
145.090627148404, 133.802577236647, 140.20201639712, 151.436131528943,
154.540964755388, 146.056981171572, 150.000503072523, 158.98311714704,
161.51884594381, 141.223089852753, 149.061443517999, 159.611987207358,
146.24216335547, 149.580244120488, 149.311141614103, 142.898567856749,
146.442527960922, 149.764440967294, 146.689049207537, 158.907088186946,
146.152057266768, 146.534647739194, 155.574900612417, 141.549052694633,
149.67441219879, 137.587641421219, 159.337356393885, 147.810834389007,
146.257237402622, 140.09398137611, 149.937903729781, 143.597742576387,
145.732061360397, 160.301401964677, 148.595793038063, 141.425740926795,
151.305655059739, 147.098092674976, 154.720108383899, 161.459375421848,
142.058459911124, 153.637202380642, 150.679184469428, 157.164524658116,
148.16177488843, 156.692952547685, 136.039553109306, 163.515671370507,
156.918223828149, 148.793792088914, 138.407942958887, 155.144069600336,
153.865550910518, 149.949154988629, 151.211647142899, 145.327128237202,
152.950453861042, 152.357234717756, 147.765925013259, 139.310106760855,
155.60599054752, 154.433572978192, 143.309547257589, 137.24329470395,
151.639668644701, 147.239296176217, 152.020893626916, 139.647980276124,
142.326636444957, 158.686198829439, 153.230199237726, 154.691900293754,
164.521827569232, 151.030571428819), valB = c(0, 0.600133159230071,
0.851397538207638, 1, 0.0869415205278437, 0.154743330311868,
0.288253581571964, 0.183783265376843, 0.306276883057253,
0.444386609697049, 0.139633384778004, 1, 0.532332423465664,
0.474767569848326, 0.648685892481378, 0.51122455583539, 0.460373588913227,
0.943036227065629, 0.434890936972369, 0.114919338772333,
0.615700367133021, 0.394546137941272, 0.343461171993119,
0.179560639793849, 0.628509770980008, 0.447794529671902,
0.654700318594409, 0.429690416808224, 0.302448972253469,
0.875070981223615, 0.418470885466581, 0.784385598762559,
0.139525270967319, 0.360165171087349, 0.419194581454046,
0.382710377560742, 0.904612103597514, 0.493170589610476,
0.573267755331035, 0.217288487640823, 0.281234817047128,
0.799420672566455, 0.877544499378668, 0.874659106643029,
0.085808885142401, 1, 0.805061848939862, 0.491984760758523,
0.711082333639478, 0.208584431254441, 0.171153127532343,
0.514715135281279, 0.140451243032666, 0.557005699567924,
0.8893117698814, 0.189837883110575, 0.278467773736081, 0.513969181835259,
0.194721164052168, 0.385014812034117, 0.761826623501419,
0.790863504191035, 0.615153999508128, 0, 0.48380097895215,
0.819431964301485, 0.743958511223481, 0.442755057768715,
0, 0.518289991640335, 0.672125509245073, 0.51374107391753,
0.547223762052763, 0.629469611860631, 0.381035079184936,
0.892993467725465, 0.641118019962183, 0.12719891882762, 0.91447263691566,
0.861337681102947, 0.747222189104564, 0.00121117934432169,
0.329208096918356, 0.690654145186858, 0.513116602273357,
0.604403691098392, 1, 0.254485902680099, 0, 0.582108581729124,
0.293720947626976, 0.63381231588856, 0.256284582862821, 1,
0.462888208526771, 0.356799348190984, 0.450121552554018,
0.758769015087929, 0.529202145560846, 0.0123149782366136)), class = "data.frame", row.names = c(NA,
-100L))

`non-finite value supplied` in ggstatsplot

I am working with ggstatsplot to get visual representations of my statistical analyses.
I have numerous datasets, all very similar in make-up. Some work just fine, while others don't. data1 is a working example, and data2 doesn't work.
data1 <- structure(list(
treatment = structure(c(1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L),
.Label = c("negative_ctrl", "positive_ctrl", "treatmentA", "treatmentB", "treatmentC", "treatmentD"), class = "factor"),
value = c(1.74501, 2.04001, 1.89501, 1.84001,
1.89501, 9.75001, 8.50001, 8.80001, 11.50001, 10.25001, 7.90001,
9.25001, 11.45001, 7.75001, 7.75001, 7.55001, 8.70001, 8.20001,
6.95001, 6.60001, 7.40001, 7.15001, 8.25001, 9.20001, 8.95001,
6.45001, 6.05001, 5.40001, 7.95001, 6.80001, 4.65001, 6.40001,
6.40001, 6.70001, 5.40001, 3.20001, 2.70001, 4.30001, 4.10001,
3.60001, 4.00001, 3.00001, 4.70001, 3.10001, 3.50001, 6.45001,
5.45001, 4.90001, 7.25001, 4.55001, 4.70001, 6.25001, 5.65001,
6.00001, 5.10001)),
row.names = c(NA, -55L), class = c("tbl_df", "tbl", "data.frame"))
data2 <- structure(list(
treatment = structure(c(1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L),
.Label = c("negative_ctrl", "positive_ctrl", "treatmentA", "treatmentB", "treatmentC", "treatmentD"), class = "factor"),
value = c(1.00001, 1.00001, 1.00001, 1.00001, 1.00001, 6.77501,
5.68751, 5.99201, 8.24501, 7.01251, 4.79501, 5.99126, 8.26276,
5.35376, 5.38751, 4.60251, 5.38901, 4.85201, 4.44401, 5.20501,
6.20701, 5.77001, 4.05201, 3.65126, 3.02401, 4.68351, 3.90001,
2.56951, 3.70001, 3.61901, 3.96401, 2.93601, 1.53901, 1.40801,
2.05601, 2.08501, 1.89701, 1.79501, 1.50001, 2.09151, 1.53551,
1.57501, 3.88851, 3.09151, 2.75501, 4.40626, 2.42001, 2.60951,
3.83501, 3.37151, 3.70001, 2.92701)),
row.names = c(NA, -52L), class = c("tbl_df", "tbl", "data.frame"))
I call the most basic analysis for both datasets:
library(Rmpfr)
library(ggstatsplot)
ggstatsplot::ggbetweenstats(
data = data1,
x = treatment,
y = value,
messages = FALSE )
ggstatsplot::ggbetweenstats(
data = data2,
x = treatment,
y = value,
messages = FALSE )
For data1 I get this:
for data2 I get:
> Error in stats::optim(par = 1.1 * rep(lambda, 2), fn = function(x) { : non-finite value supplied by optim
At first I thought the issue might be a few zeros that I passed on in the negative control, but I first upped them by a tiny amount and then by 1 to make sure the range of the values is not an issue. The only discrepancy I can see is that I only have 7 instead of 10 measurements for treatmentA (level 3) in data2 but 10 in data1 (had to remove a few NAs due to sample failure). However, in both cases the negative control (level 1) only has 5 values, and I don't think that in this type of analysis there is an issue with different sample sizes between the groups.
It's a good idea to try basic plots out in these cases eg isolate the boxplots:
So comparing the two datasets:
boxplot(value ~ treatment, data=data1)
boxplot(value ~ treatment, data=data2)
data2 has a treatment with no variability ("negative_ctrl"), 0 SD. I'm guessing this function is doing some tests that require variation. You will need to read the documentation for the function to see if this is brought up but you can get views either by removing these treatments, or forcing a very small amount of variation eg
# Workaround 1: drop the zero-variance group before plotting
ggstatsplot::ggbetweenstats(
  data = subset(data2, treatment != "negative_ctrl"),
  x = treatment,
  y = value,
  messages = FALSE
)

# Workaround 2 (a hack): nudge the first negative_ctrl observation so the
# group is no longer perfectly constant
data3 <- data2
data3$value[match("negative_ctrl", data3$treatment)] <- 1.0001
ggstatsplot::ggbetweenstats(
  data = data3,
  x = treatment,
  y = value,
  messages = FALSE
)

ggplot2 dotplot how to create empty x axis categories

I have some data in a CSV file that I made up in order to create dot plots of different distributions.
These are the made-up data:
structure(list(uniform = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3,
4, 4, 4, 4, 5, 5, 5, 5), left_skew = c(1L, 2L, 2L, 3L, 3L, 3L,
4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), right_skew = c(5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 4L, 4L, 4L, 4L, 4L, 4L, 3L, 3L, 3L,
2L, 2L, 1L), trunc_uni_left = c(3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), trunc_uni_right = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L,
3L, 3L, 3L), trunc_norm_left = c(3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L), trunc_norm_right = c(1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L), bimodal = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), extreme_left = c(3L,
3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L), extreme_right = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L)), row.names = c(NA,
-20L), class = "data.frame")
The dot-plot works when there are 'observations' in each of the five categories on the x-axis. However, if there are values missing then it only reflects those categories. For instance, in one plot there are no 1s and 2s so the plot only shows categories 3, 4, and 5.
I've tried using scale_x_discrete to set the limits and breaks but this doesn't work.
Here is the code I used to plot the data:
ggplot(df, aes(x = trunc_uni_left))+
geom_point()+
geom_dotplot(method = "histodot", binwidth = 0.25, fill = 'red', dotsize = 0.75)+
labs(x = 'Rating Categories', y = 'Rating Frequency')+
theme_bw()+
ylim(0 , 20)+
scale_x_discrete(breaks = c ("0.5", "1", "1.5", "2", "2.5"),
labels = c ("1", "2", '3', '4', '5'),
limits = c ("1", "2", "3", "4", "5"))+
theme(panel.grid = element_blank(),
text = element_text(size = 16),
axis.text.x = element_text(size = 16),
axis.title.x = element_text(size = 16, margin = margin(t = 20)),
axis.title.y = element_text(size = 16, margin = margin(r = 20)),
legend.title= element_text(size = 16))
Is there something I can do in ggplot to achieve this? Or alternatively, can I create a data frame in R that would allow me to do this?
I'm not the best coder in the world as you may be able to tell so would much appreciate the help.
Thanks!
Your breaks don't match the data. The breaks should be 1:5 which are the numbers in your df and supply new labels if required. However, I'm guessing you don't want new labels (please correct) and you just want to control the x-axis limits? In which case you can just supply the limits while changing trunc_uni_left to a factor:
# Treat the ratings as categories and force all five categories onto the
# x axis, including those with no observations. A discrete scale expects
# discrete (character) limits that match the factor levels.
ggplot(df, aes(as.factor(trunc_uni_left))) +
  geom_dotplot(method = "histodot", binwidth = 0.25, fill = 'red',
               dotsize = 0.75) +
  labs(x = 'Rating Categories', y = 'Rating Frequency') +
  theme_bw() +
  scale_x_discrete(limits = as.character(1:5))
If you did want to re-label the x-axis with bespoke labels make sure you match the breaks to what is actually in your data:
# Same plot with custom tick labels. `limits` and `breaks` are given as
# character values matching the factor levels; `labels` are paired with
# `breaks` by position.
ggplot(df, aes(as.factor(trunc_uni_left))) +
  geom_dotplot(method = "histodot", binwidth = 0.25, fill = 'red',
               dotsize = 0.75) +
  labs(x = 'Rating Categories', y = 'Rating Frequency') +
  theme_bw() +
  scale_x_discrete(limits = as.character(1:5),
                   breaks = as.character(1:5),
                   labels = paste0("my_lab_", 1:5))
In this example you don't need the breaks as the data happens to be ordered because it's numeric. But if you had some string as the input you would need to match the breaks and labels in the order you want them.

Remove three sides of border around ggplot facet strip label

I have the following graph:
And would like to make what I thought would be a very simple change: I would like to remove the top, right and bottom sides of the left facet label border lines.
How do I remove those lines, or draw the equivalent of the right hand lines? I would rather not muck about with grobs, if possible, but won't say no to any solution that works.
Graph code:
library(ggplot2)
library(dplyr)
library(forcats)
posthoc1 %>%
mutate(ordering = -as.numeric(Dataset) + Test.stat,
Species2 = fct_reorder(Species2, ordering, .desc = F)) %>%
ggplot(aes(x=Coef, y=Species2, reorder(Coef, Taxa), group=Species2, colour=Taxa)) +
geom_point(size=posthoc1$Test.stat*.25, show.legend = FALSE) +
ylab("") +
theme_classic(base_size = 20) +
facet_grid(Taxa~Dataset, scales = "free_y", space = "free_y", switch = "y") +
geom_vline(xintercept = 0) +
theme(axis.text.x=element_text(colour = "black"),
strip.placement = "outside",
strip.background.x=element_rect(color = NA, fill=NA),
strip.background.y=element_rect(color = "black", fill=NA)) +
coord_cartesian(clip = "off") +
scale_x_continuous(limits=NULL)
Data:
structure(list(Dataset = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 5L, 5L, 5L, 5L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L), .Label = c("All.habitat", "Aut.habitat", "Habitat.season",
"Lit.season", "Spr.habitat"), class = "factor"), Species = structure(c(1L,
2L, 3L, 5L, 6L, 10L, 11L, 12L, 13L, 1L, 3L, 5L, 6L, 13L, 1L,
2L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 13L), .Label = c("Ar.sp1",
"Ar.sp2", "Arc.sp1", "B.pus", "Dal.sp1.bumps", "Dip.unID", "I.palladium",
"Pale", "Ph.sp3", "Port", "Somethus", "sty", "Sty.sp1"), class = "factor"),
Species2 = structure(c(2L, 9L, 1L, 4L, 5L, 7L, 11L, 12L,
13L, 2L, 1L, 4L, 5L, 13L, 2L, 9L, 4L, 5L, 6L, 10L, 8L, 7L,
11L, 13L), .Label = c("Arcitalitrus sp1", "Armadillidae sp1 ",
"Brachyiulus pusillus ", "Dalodesmidae sp1", "Diplopoda",
"Isocladosoma pallidulum ", "Ommatoiulus moreleti ", "Philosciidae sp2",
"Porcellionidae sp1", "Siphonotidae sp2", "Somethus sp1",
"Styloniscidae ", "Styloniscidae sp1"), class = "factor"),
Taxa = structure(c(3L, 3L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
1L, 2L, 2L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 3L, 2L, 2L, 3L), .Label = c("Amphipoda",
"Diplopoda", "Isopoda"), class = "factor"), Variable = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Autumn", "Litter",
"Spring", "Summer"), class = "factor"), Coef = c(1.911502938,
2.086917154, 1.571872993, 12.61184801, 15.6161116, -1.430032837,
-12.51944478, 12.33934516, -8.040249562, 8.08258816, 1.780142396,
12.88982576, 16.78107544, -13.22641153, 1.68810887, 2.093965381,
12.27209197, 15.08328526, -6.334640911, -11.29985948, -11.62658947,
-1.676293808, -6.246555908, -3.470297147), SE = c(0.403497472,
2.21607562, 0.348600794, 2.423896379, 0.509468128, 3.423013791,
2.382857733, 1.775086895, 2.087788334, 2.23631504, 0.33402261,
2.518562443, 0.459720131, 1.950974996, 0.2476205, 0.235648095,
1.815155489, 0.325804415, 2.564680067, 2.437104984, 2.212583358,
2.677618401, 2.324019051, 0.420436743), Test.stat = c(18.36532749,
13.27324683, 13.29039037, 20.50277493, 44.06097153, 10.55234932,
14.64951518, 13.22575401, 20.16415411, 16.55627107, 11.81407568,
15.15213717, 40.67205188, 12.62233207, 37.60085488, 16.90879258,
20.20215107, 80.30520371, 13.35250626, 13.01692428, 17.52987519,
20.03658771, 12.02467914, 53.5052683)), row.names = 10:33, class = "data.frame")
This solution is based on grobs: find positions of "strip-l" (left strips) and then substitute the rect grobs with line grobs.
p <- posthoc1 %>%
mutate(ordering = -as.numeric(Dataset) + Test.stat,
Species2 = fct_reorder(Species2, ordering, .desc = F)) %>%
ggplot(aes(x=Coef, y=Species2, reorder(Coef, Taxa), group=Species2, colour=Taxa)) +
geom_point(size=posthoc1$Test.stat*.25, show.legend = FALSE) +
ylab("") +
theme_classic(base_size = 20) +
facet_grid(Taxa~Dataset, scales = "free_y", space = "free_y", switch = "y") +
geom_vline(xintercept = 0) +
theme(axis.text.x=element_text(colour = "black"),
strip.placement = "outside",
#strip.background.x=element_rect(color = "white", fill=NULL),
strip.background.y=element_rect(color = NA)
) +
coord_cartesian(clip = "off") +
scale_x_continuous(limits=NULL)
library(grid)
# Convert the ggplot object `p` into its table of grobs so that individual
# pieces can be edited.
q <- ggplotGrob(p)
# A vertical line along the left edge (x = 0 npc) spanning the full height
# (y from 0 to 1 npc), drawn in red with line width 4.
lg <- linesGrob(x=unit(c(0,0),"npc"), y=unit(c(0,1),"npc"),
gp=gpar(col="red", lwd=4))
# For every layout entry whose name contains "strip-l" (the left-hand strips),
# overwrite the first child of its first grob with the line grob.
# NOTE(review): this assumes the first child is the strip background rect --
# confirm against the installed ggplot2 version's strip grob structure.
for (k in grep("strip-l",q$layout$name)) {
q$grobs[[k]]$grobs[[1]]$children[[1]] <- lg
}
# Render the modified grob table.
grid.draw(q)

Create balloon plot with ggplot2: use ..count.. to adjust size of geom_point?

I want to essentially create a balloon plot with ggplot2 where the size of the points are the frequency of data at a given coordinate.
Given the data.frame d:
d = structure(list(value.x = structure(c(2L, 2L, 3L, 2L, 3L, 2L, 2L, 2L, 3L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L), .Label = c("Not at all Knowledgeable", "Somewhat Knowledgeable", "Very Knowledgeable"), class = c("ordered", "factor")), value.y = structure(c(5L, 5L, 3L, 5L, 5L, 5L, 5L, 5L, 4L, 4L, 5L, 4L, 4L, 4L, 5L, 4L, 5L, 5L, 4L, 4L), .Label = c("Much less knowledgeable", "Less knowledgeable", "Same as before workshop", "More knowledgeable", "Much more knowledgeable"), class = c("ordered", "factor"))), .Names = c("value.x", "value.y"), row.names = c(NA, 20L), class = "data.frame")
I want to do something like:
ggplot(d,aes(value.x,value.y,size=..count..))+geom_point()
where the data points are proportional to how many times data occur, but I cannot figure out how to properly set the size of the points for what I want.
Importantly, I would like to avoid creating a new column in d just for counts of data as has been done with other datasets (e.g. http://www.r-bloggers.com/balloon-plot-using-ggplot2/). This seems messy and I would like to utilize ggplot2's power if I can.
Per @BenBolker's suggestion, I found a solution using stat_sum():
ggplot(d, aes(value.x, value.y, size = ..n..)) + stat_sum()

Resources