ggplot axis order (factor) changes when using last_plot() - r

I've been able to successfully create a dotpot in ggplot for percentages across gender. But, I want to highlight the significant differences. I thought I could do this with a combination of subsetting and the use of last_plot().
Here’s my data:
require(ggplot2)
require(reshape2)
prog <- c("Honors", "Academic", "Social", "Media")
m <- c(30,35,40,23)
f <- c(25,40,45,15)
s <- c(0.7, 0.4, 0.1, 0.03)
temp <- as.data.frame(cbind(prog, m, f, s), stringsAsFactors=FALSE)
first <- temp[,1:3]
first.melt <- melt(first, id.vars = 'prog', variable.name = 'Gender', value.name = 'Percent')
first.melt <- as.data.frame(cbind(first.melt,temp[,4]), , stringsAsFactors=FALSE)
names(first.melt) <- c("program", "Gender", "Percent", "sig")
first.melt$program <- as.factor(first.melt$program)
Here’s where I reverse order my Program variable, so that when graphed if will be alphabetical from top to bottom.
first.melt[,1] = with(first.melt, factor(first.melt[,1], levels = rev(levels(first.melt[,1]))))
first.melt$sig <- as.numeric(as.character(first.melt$sig))
first.melt$Percent <- as.numeric(as.character(first.melt$Percent))
Now, I subset...
first.melt.ns <- subset(first.melt,sig > 0.05)
first.melt.sig <- subset(first.melt,sig <= 0.05)
ggplot(first.melt.ns, aes(program, y=Percent, shape=Gender)) +
geom_point(size=3) +
coord_flip() +
scale_shape_manual(values=c("m"=1, "f"=5))
The first run at ggplot get’s me my non-significant Program pairs – and it’s in the right order – so, I add my the two new points for male and female (making them solid, to draw attention as a significant pair):
last_plot() +
geom_point(data=first.melt.sig, aes(program[Gender=="m"], y=Percent[Gender=="m"]), size=3, shape=19) +
geom_point(data=first.melt.sig, aes(program[Gender=="f"], y=Percent[Gender=="f"]),size=4, shape=18)
The points get added just fine – ggplot works. But notice my Program axis – it’s correct, but reversed now.

First, you really should avoid as.data.frame(cbind(...)). It is dramatically increasing the amount of work necessary to prepare your data. The function for creating data frames is (naturally) data.frame. Use it!
What you're doing here is basically trying to get around the limitation of only having one shape scale. It's probably easiest to just do this:
temp <- data.frame(prog,m,f,s)
first <- temp[,1:3]
first.melt <- melt(first, id.vars = 'prog', variable.name = 'Gender', value.name = 'Percent')
first.melt$sig <- rep(temp$s,times = 2)
first.melt[,1] = with(first.melt, factor(first.melt[,1], levels = rev(levels(first.melt[,1]))))
first.melt.sig <- subset(first.melt,sig < 0.05)
first.melt$Percent[first.melt$sig < 0.05] <- NA
ggplot() +
geom_point(data = first.melt,aes(x = prog,y = Percent,shape = Gender),size = 3) +
geom_point(data = first.melt.sig[1,],aes(x = prog,y = Percent),shape = 19) +
geom_point(data = first.melt.sig[2,],aes(x = prog,y = Percent),shape = 18) +
coord_flip() +
scale_shape_manual(values=c("m"=1, "f"=5))
In general, work to structure your ggplot code so that you're subsetting data frames, not variables inside of aes. That gets both tricky and dangerous, because ggplot is assuming certain things about what you pass inside of aes in order for the evaluation to work properly.

Related

Plotting geom_segment with position_dodge

I have a data set with information of where individuals work at over time. More specifically, I have information on the interval at which individuals work in a given workplace.
library('tidyverse')
library('lubridate')
# individual A
a_id <- c(rep('A',1))
a_start <- c(201201)
a_end <- c(201212)
a_workplace <-c(1)
# individual B
b_id <- c(rep('B',2))
b_start <- c(201201, 201207)
b_end <- c(201206, 201211)
b_workplace <-c(1, 2)
# individual C
c_id <- c(rep('C',2))
c_start <- c(201201, 201202)
c_end <- c(201204, 201206)
c_workplace <-c(1, 2)
# individual D
d_id <- c(rep('D',1))
d_start <- c(201201)
d_end <- c(201201)
d_workplace <-c(1)
# final data frame
id <- c(a_id, b_id, c_id, d_id)
start <- c(a_start, b_start, c_start, d_start)
end <- c(a_end, b_end, c_end, d_end)
workplace <- as.factor(c(a_workplace, b_workplace, c_workplace, d_workplace))
mydata <- data.frame(id, start, end, workplace)
mydata_ym <- mydata %>%
mutate(ymd_start = as.Date(paste0(start, "01"), format = "%Y%m%d"),
ymd_end0 = as.Date(paste0(end, "01"), format = "%Y%m%d"),
day_end = as.numeric(format(ymd_end0 + months(1) - days(1), format = "%d")),
ymd_end = as.Date(paste0(end, day_end), format = "%Y%m%d")) %>%
select(-ymd_end0, -day_end)
I would like a plot where I can see the patterns of how long each individual works at each workplace as well as how they move around. I tried plotting a geom_segment as I have information of start and end date the individual works in each place. Besides, because the same individual may work in more than one place during the same month, I would like to use position_dodge to make it visible when there is overlap of different workplaces for the same id-time. This was suggested in this post here: Ggplot (geom_line) with overlaps
ggplot(mydata_ym) +
geom_segment(aes(x = id, xend = id, y = ymd_start, yend = ymd_end),
position = position_dodge(width = 0.1), size = 2) +
scale_x_discrete(limits = rev) +
coord_flip() +
theme(panel.background = element_rect(fill = "grey97")) +
labs(y = "time", title = "Work affiliation")
The problem I am having is that: (i) the position_dodge doesn't seem to be working, (ii) I don't know why all the segments are being colored in black. I would expect each workplace to have a different color and a legend to show up.
If you include colour = workplace in the aes() mapping for geom_segment you get colours and a legend and some dodging, but it doesn't work quite right (it looks like position_dodge only applies to x and not xend ... ? this seems like a bug, or at least an "infelicity", in position_dodge ...
However, replacing geom_segment with an appropriate use of geom_linerange does seem to work:
ggplot(mydata_ym) +
geom_linerange(aes(x = id, ymin = ymd_start, ymax = ymd_end, colour = workplace),
position = position_dodge(width = 0.1), size = 2) +
scale_x_discrete(limits = rev) +
coord_flip()
(some tangential components omitted).
A similar approach is previously documented here — a near-duplicate of your question once the colour= mapping is taken care of ...

R control jitter function - avoid overplotting / non-random jitter

My problems seems simple, I am using ggplot2 with geom_jitter() to plot a variable. (take my picture as an example)
Jitter now adds some random noise to the variable (the variable is just called "1" in this example) to prevent overplotting. So I have now random noise in the y-direction and clearly what otherwise would be completely overplotted is now better visible.
But here is my question:
As you can see, there are still some points, that overplot each other. In my example here, this could be easily prevented, if it wouldn't be random noise in y-direction... but somehow more strategically placed offsets.
Can I somehow alter the geom_jitter() behavior or is there a similar function in ggplot2 that does exactly this?
Not really a minimal example, but also not too long:
library("imputeTS")
library("ggplot2")
data <- tsAirgap
# 2.1 Create required data
# Get all indices of the data that comes directly before and after an NA
na_indx_after <- which(is.na(data[1:(length(data) - 1)])) + 1
# starting from index 2 moves all indexes one in front, so no -1 needed for before
na_indx_before <- which(is.na(data[2:length(data)]))
# Get the actual values to the indices and put them in a data frame with a label
before <- data.frame(id = "1", type = "before", input = na_remove(data[na_indx_before]))
after <- data.frame(id = "1", type = "after", input = na_remove(data[na_indx_after]))
all <- data.frame(id = "1", type = "source", input = na_remove(data))
# Get n values for the plot labels
n_before <- length(before$input)
n_all <- length(all$input)
n_after <- length(after$input)
# 2.4 Create dataframe for ggplot2
# join the data together in one dataframe
df <- rbind(before, after, all)
# Create the plot
gg <- ggplot(data = df) +
geom_jitter(mapping = aes(x = id, y = input, color = type, alpha = type), width = 0.5 , height = 0.5)
gg <- gg + ggplot2::scale_color_manual(
values = c("before" = "skyblue1", "after" = "yellowgreen","source" = "gray66"),
)
gg <- gg + ggplot2::scale_alpha_manual(
values = c("before" = 1, "after" = 1,"source" = 0.3),
)
gg + ggplot2::theme_linedraw() + theme(aspect.ratio = 0.5) + ggplot2::coord_flip()
So many good suggestions...here is what Bens suggestion would look like for my example:
I changed parts of my code to:
gg <- ggplot(data = df, aes(x = input, color = type, fill = type, alpha = type)) +
geom_dotplot(binwidth = 15)
Would basically also work as intended for me. ggbeeplot as suggested by Jon also worked great for my purpose.
I thought of a hack I really like, using ggrepel. It's normally used for labels, but nothing preventing you from making the label into a point.
df <- data.frame(x = rnorm(200),
col = sample(LETTERS[1:3], 200, replace = TRUE),
y = 1)
ggplot(df, aes(x, y, label = "●", color = col)) + # using unicode black circle
ggrepel::geom_text_repel(segment.color = NA,
box.padding = 0.01, key_glyph = "point")
A downside of this method is that ggrepel can take a lot time for a large number of points, and will recalculate differently each time you change the plot size. A faster alternative would be to use ggbeeswarm::geom_quasirandom, which uses a deterministic process to define jitter that looks random.
ggplot(df, aes(x,y, color = col)) +
ggbeeswarm::geom_quasirandom(groupOnX = FALSE)

How to add inbetween space in nested boxplots ggplot2

I would like to added a marginal space between groups of box plots by using the stats_summary method.
Here is a small example of my problem
library(ggplot2)
library(reshape2)
data1 <- (lapply(letters[1:5], function(l1) return(matrix(rt(5*3, 1), nrow = 5, ncol = 3, dimnames = list(cat2=letters[6:10], cat3=letters[11:13])))))
names(data1) <- letters[1:5]
data2 <- melt(data1)
customstats <- function(x) {
xs <- sort(x)
return(c(ymin=min(x), lower= mean(xs[xs < mean(x)]), middle = mean(x) , upper = mean(xs[xs > mean(x)]), ymax=max(x)))
}
ggplot(data2, aes(x=cat2, y=value, fill=cat3), width=2) +
stat_summary(fun.data = customstats, geom = "boxplot",
alpha = 0.5, position = position_dodge(1), mapping = aes(fill=cat3))
The result is the following picture.
I would like to achieve a visual separation for each "cat2" and add a "space" between the group of boxplots (I'm retricted to using the stats_summary since I have a custom statistic). How can I do it?
I have fixed a similar problem in an ugly (but effective for me) way by creating a dataframe with the same plotting variables as my original data, but with x (or y) positioned or factored that it fits between the two points I want to separate and missing values for y (or x). For your problem, I added the following code and got an image with spacial separation of clusters.
library(plyr)
empties <- data.frame(cat2_orig=unique(data2$cat2)[-length(unique(data2$cat2))])
#no extra space needed between last cluster and edge of plot
empties$cat2 <- paste0(empties$cat2_orig,empties$cat2_orig)
empties$value <- NA
data2_space <- rbind.fill(data2,empties)
ggplot(data2_space, aes(x=cat2, y=value, fill=cat3), width=2) +
stat_summary(fun.data = customstats, geom = "boxplot",
alpha = 0.5, position = position_dodge(1), mapping = aes(fill=cat3)) +
#remove tickmarks for non-interesting points on x-axis
scale_x_discrete(breaks=unique(data2$cat2))
Before & after

ggplot in a function: variable not found

I have an issue trying to create a function to creat a plot using ggplot. Here is some code:
y1<- sample(1:30,45,replace = T)
x1 <- rep(rep(c("a1","a2","a3","a4","a5"),3),each=3)
x2 <- rep(rep(c("b1","b2","b3","b4","b5"),3),each=3)
df <- data.frame(y1,x1,x2)
library(Rmisc)
dfsum <- summarySE(data=df, measurevar="y1",groupvars=c("x1","x2"))
myplot <- function(d,v, w,g) {
pd <- position_dodge(.1)
localenv <- environment()
ggplot(data=d, aes(x=v,y=w,group=g),environment = localenv) +
geom_errorbar(data=d,aes(ymin=d$w-d$se, ymax=d$w+d$se,col=d$g), width=.4, position=pd,environment = localenv) +
geom_line(position=pd,linetype="dotted") +
geom_point(data=d,position=pd,aes(col=g))
}
myplot(dfsum,x1,y1,x2)
As I was looking for similar questions, I found that specifying the local environment should solve the issue. However it did not help in my case.
Thank you
Preliminary Note
When looking at your data.frame, the group variable does not make any sense, as it is perfectly confounded with the x variable. Hence I adapted your data a bit, to show a full example:
Data
library(Rmisc)
library(ggplot2)
d <- expand.grid(x1 = paste0("a", 1:5),
x2 = paste0("b", 1:5))
d <- d[rep(1:NROW(d), each = 3), ]
d$y1 <- rnorm(NROW(d))
dfsum <- summarySE(d, measurevar = "y1", groupvars = paste0("x", 1:2))
Plot Function
myplot <- function(mydat, xvar, yvar, grpvar) {
mydat$ymin <- mydat[[yvar]] - mydat$se
mydat$ymax <- mydat[[yvar]] + mydat$se
pd <- position_dodge(width = .5)
ggplot(mydat, aes_string(x = xvar, y = yvar, group = grpvar,
ymin = "ymin", ymax = "ymax", color = grpvar)) +
geom_errorbar(width = .4, position = pd) +
geom_point(position = pd) +
geom_line(position = pd, linetype = "dashed")
}
myplot(dfsum, "x1", "y1", "x2")
Explanation
Your problem occurs because the scope of x1 x2 and y1 was ambiguous. As you defined these variables also at the top environmnet, R did not complain in the first place. If you had added a rm(x1, x2, y1)in your original code right after you created your data.frame you would have seen the problem already eralier.
ggplot looks in the data.frame you provide for all the variables you want to map to certain aesthetics. If you want to create a function, where you specify the name of the aesthatics as arguments, you should use aes_string instead of aes, as the former expects a string giving the name of the variable rather than the variable itself.
With this approach however, you cannot do calculations on the spot, so you need to create the variables yminand ymaxbeforehand in your data.frame. Furthermore, you do not need to provide the data argument for each geom if it is the same as provided to ggplot.
I've got it plotting something, let me know if this isn't the expected output.
The changes I've made to the code to get it working are:
Load the ggplot2 library
Remove the d$ from the geom_errorbar call to w and g, as these are function arguments rather than columns in d.
I've also removed the data=d calls from all layers except the main ggplot one as these aren't necessary.
library(ggplot2)
myplot <- function(d,v, w,g) {
pd <- position_dodge(.1)
localenv <- environment()
ggplot(data=d, aes(x=v,y=w,group=g),environment = localenv) +
geom_errorbar(aes(ymin=w-se, ymax=w+se,col=g), width=.4,
position=pd,environment = localenv) +
geom_line(position=pd,linetype="dotted") +
geom_point(position=pd,aes(col=g))
}
myplot(dfsum,x1,y1,x2)

Custom scatterplot matrix using facet_grid in ggplot2

I'm trying to write a custom scatterplot matrix function in ggplot2 using facet_grid. My data have two categorical variables and one numeric variable.
I'd like to facet (make the scatterplot rows/cols) according to one of the categorical variables and change the plotting symbol according to the other categorical.
I do so by first constructing a larger dataset that includes all combinations (combs) of the categorical variable from which I'm creating the scatterplot panels.
My questions are:
How to use geom_rect to white-out the diagonal and upper panels in facet_grid (I can only make the middle ones black so far)?
How can you move the titles of the facets to the bottom and left hand sides respectively?
How does one remove tick axes and labels for the top left and bottom right facets?
Thanks in advance.
require(ggplot2)
# Data
nC <- 5
nM <- 4
dat <- data.frame(
Control = rep(LETTERS[1:nC], nM),
measure = rep(letters[1:nM], each = nC),
value = runif(nC*nM))
# Change factors to characters
dat <- within(dat, {
Control <- as.character(Control)
measure <- as.character(measure)
})
# Check, lapply(dat, class)
# Define scatterplot() function
scatterplotmatrix <- function(data,...){
controls <- with(data, unique(Control))
measures <- with(data, unique(measure))
combs <- expand.grid(1:length(controls), 1:length(measures), 1:length(measures))
# Add columns for values
combs$value1 = 1
combs$value2 = 0
for ( i in 1:NROW(combs)){
combs[i, "value1"] <- subset(data, subset = Control==controls[combs[i,1]] & measure == measures[combs[i,2]], select = value)
combs[i, "value2"] <- subset(data, subset = Control==controls[combs[i,1]] & measure == measures[combs[i,3]], select = value)
}
for ( i in 1:NROW(combs)){
combs[i,"Control"] <- controls[combs[i,1]]
combs[i,"Measure1"] <- measures[combs[i,2]]
combs[i,"Measure2"] <- measures[combs[i,3]]
}
# Final pairs plot
plt <- ggplot(combs, aes(x = value1, y = value2, shape = Control)) +
geom_point(size = 8, colour = "#F8766D") +
facet_grid(Measure2 ~ Measure1) +
ylab("") +
xlab("") +
scale_x_continuous(breaks = c(0,0.5,1), labels = c("0", "0.5", "1"), limits = c(-0.05, 1.05)) +
scale_y_continuous(breaks = c(0,0.5,1), labels = c("0", "0.5", "1"), limits = c(-0.05, 1.05)) +
geom_rect(data = subset(combs, subset = Measure1 == Measure2), colour='white', xmin = -Inf, xmax = Inf,ymin = -Inf,ymax = Inf)
return(plt)
}
# Call
plt1 <- scatterplotmatrix(dat)
plt1
I'm not aware of a way to move the panel strips (the labels) to the bottom or left. Also, it's not possible to format the individual panels separately (e.g., turn off the tick marks for just one facet). So if you really need these features, you will probably have to use something other than, or in addition to ggplot. You should really look into GGally, although I've never had much success with it.
As far as leaving some of the panels blank, here is a way.
nC <- 5; nM <- 4
set.seed(1) # for reproducible example
dat <- data.frame(Control = rep(LETTERS[1:nC], nM),
measure = rep(letters[1:nM], each = nC),
value = runif(nC*nM))
scatterplotmatrix <- function(data,...){
require(ggplot2)
require(data.table)
require(plyr) # for .(...)
DT <- data.table(data,key="Control")
gg <- DT[DT,allow.cartesian=T]
setnames(gg,c("Control","H","x","V","y"))
fmt <- function(x) format(x,nsmall=1)
plt <- ggplot(gg, aes(x,y,shape = Control)) +
geom_point(subset=.(as.numeric(H)<as.numeric(V)),size=5, colour="#F8766D") +
facet_grid(V ~ H) +
ylab("") + xlab("") +
scale_x_continuous(breaks=c(0,0.5,1), labels=fmt, limits=c(-0.05, 1.05)) +
scale_y_continuous(breaks=c(0,0.5,1), labels=fmt, limits=c(-0.05, 1.05))
return(plt)
}
scatterplotmatrix(dat)
The main feature of this is the use of subset=.(as.numeric(H)<as.numeric(V)) in the call to geom_point(...). This subsets the dataset so you only get a point layer when the condition is met, e.g. in facets where is.numeric(H)<is.numeric(V). This works because I've left the H and V columns as factors and is.numeric(...) operating on a factor returns the levels, not the names.
The rest is just a more compact (and much faster) way of creating what you called comb.

Resources