Related
I am interested in showing how the proportions of cells change from health to disease. I wanted to show a 'flow' from health to disease rather than just have two separate stacked bar charts, but I'm unsure if this type of visualization has a name and I have not been able to find many examples online. I would like to do this in R. It's almost a mix between a sankey diagram and a chord diagram.
I was hoping some of you would have some ideas on which packages I could use to achieve this in R.
As Ian Campbell points out in the comments, this is called an alluvial plot, and you can probably get quite close with the ggalluvial package. However, it is possible to get a near-identical recreation of your plot using just geom_ribbon and geom_text from ggplot2:
However, it's a bit tricky to do. First we need a way of producing those nice smooth curves that go from one side to the other. The following function takes the starting and ending levels (as numbers between 0 and 1). It also allows an optional increasing or decreasing the width of the columns on either side:
ribbon_line <- function(p1, p2, width = 10, len = 100)
{
if (width > 50) width <- 50
if (width < 0) width <- 0
if (p1 < 0) p1 <- 0
if (p1 > 1) p1 <- 1
if (p2 < 0) p2 <- 0
if (p2 > 1) p2 <- 1
yvals <- c(p1, p1, pnorm(seq(-2.5, 2.5, length.out = len)) * (p2 - p1) + p1, p2, p2)
xvals <- c(0, seq(width, 100 - width, length.out = len + 2), 100)
list(x = xvals, y = yvals)
}
Now we need a way of combining two lines into a data frame with co-ordinates we can plot:
ribbon_df <- function(uppers, lowers, group, width = 10)
{
data.frame(x = ribbon_line(uppers[1], uppers[2], width)$x,
ymax = ribbon_line(uppers[1], uppers[2], width)$y,
ymin = ribbon_line(lowers[1], lowers[2], width)$y,
group = group, stringsAsFactors = FALSE)
}
Next, we need a method of taking a simple input and turning it into a group of these ribbons, plus left and right columns, plus text labels:
multi_ribbons <- function(left_bottom, right_bottom, left_top, right_top,
groups, width = 10)
{
if (length(left_bottom) != length(right_bottom) |
length(left_bottom) != length(left_top) |
length(left_top) != length(right_top))
stop("Left and right columns different length")
if (length(groups) != length(left_bottom))
stop("Group length has to be same length as columns")
d <- lapply(seq_along(groups), function(i) {
ribbon_df(c(left_top[i], right_top[i]),
c(left_bottom[i], right_bottom[i]),
groups[i], width)})
left_cols <- lapply(d, function(x) x[1:2,])
right_cols <- lapply(d, function(x) x[nrow(x) - 1:0,])
res <- list( left = do.call(rbind, left_cols),
right = do.call(rbind, right_cols),
bands = do.call(rbind, d))
text_y <- c((res$left$ymax + res$left$ymin)/2,
(res$right$ymax + res$right$ymin)/2)
text_x <- c(rep(width / 2, length(res$left$x)),
rep(100 - width/2, length(res$left$x)))
text_labels <- paste0(round(c(res$left$ymax - res$left$ymin,
res$right$ymax - res$right$ymin), 3) * 100, "%")
res$text <- data.frame(x = text_x, y = text_y, labels = text_labels)
res
}
Finally, we want a way of taking our data as a simple pair of factor vectors and using the above functions to plot them:
alluvial <- function(yvar, xvar, width = 20)
{
tab <- table(yvar, xvar)
x_labs <- rownames(tab)
y_labs <- colnames(tab)
left <- tab[1,]/sum(tab[1,])
left <- cumsum(sort(left))
right <- tab[2,]/sum(tab[2,])
right <- cumsum(sort(right))
left_lower <- c(0, left[-length(left)])
names(left_lower) <- names(left)
right_lower <- c(0, right[-length(right)])
names(right_lower) <- names(right)
right <- right[match(names(left), names(right))]
right_lower <- right_lower[match(names(left), names(right_lower))]
df_list <- multi_ribbons(left_lower, right_lower, left, right,
names(left), width = 20)
ggplot(df_list$bands, aes(x = x, ymin = ymin, ymax = ymax, fill = group)) +
geom_ribbon(alpha = 0.5) +
geom_ribbon(alpha = 1, data = df_list$left) +
geom_ribbon(alpha = 1, data = df_list$right) +
geom_text(data = df_list$text, inherit.aes = FALSE, colour = "white",
aes(x = x, y = y, label = labels), size = 8) +
geom_text(data = data.frame(x = c(width / 2, 100 - width /2), y = c(1.05, 1.05),
labels = factor(x_labs, levels = x_labs)),
inherit.aes = FALSE,
mapping = aes(x = x, y = y, label = labels), size = 12) +
geom_text(data = data.frame(x = rep(-5, length(y_labs)),
y = unique(df_list$text$y[1:(nrow(df_list$text)/2)]),
labs = unique(df_list$bands$group)),
mapping = aes(x = x, y = y, colour = labs, label = labs),
inherit.aes = FALSE, size = 8, hjust = 1) +
scale_fill_manual(values = c("#e64b35", "#806249", "#00a087", "#3c5488")) +
scale_colour_manual(values = c("#e64b35", "#806249", "#00a087", "#3c5488")) +
coord_cartesian(xlim = c(-15, 101)) +
theme_void() + theme(legend.position = "none")
}
So, if we you data frame is in a format like this:
head(df, 20)
#> condition variable
#> 110 Disease Immune
#> 149 Disease Fibroblast
#> 133 Disease Immune
#> 184 Disease Endothelial
#> 137 Disease Immune
#> 200 Disease Endothelial
#> 30 Health Immune
#> 11 Health Immune
#> 63 Health Fibroblast
#> 88 Health Endothelial
#> 42 Health Fibroblast
#> 38 Health Fibroblast
#> 106 Disease Immune
#> 139 Disease Immune
#> 6 Health Epithelial
#> 21 Health Immune
#> 27 Health Immune
#> 181 Disease Endothelial
#> 95 Health Endothelial
#> 108 Disease Immune
You can just do:
alluvial(df$condition, df$variable)
To get the above plot, or, for something more random:
set.seed(69)
alluvial(sample(c(TRUE, FALSE), 200, replace = TRUE),
sample(LETTERS[1:4], 200, replace = TRUE))
If you want more than four colour or fill levels, you can remove or adjust the scale_colour_manual and scale_fill_manual calls, to get, for example:
set.seed(69)
alluvial(sample(c(TRUE, FALSE), 200, replace = TRUE),
sample(LETTERS[1:20], 200, replace = TRUE))
Having the following sample dataset:
set.seed(20)
N <- 20
df1 <- data.frame(x = rnorm(N),
y = rnorm(N),
grp = paste0('grp_', sample(1:500, N, T)),
lab = sample(letters, N, T))
# x y grp lab
# 1 1.163 0.237 grp_104 w
# 2 -0.586 -0.144 grp_448 y
# 3 1.785 0.722 grp_31 m
# 4 -1.333 0.370 grp_471 z
# 5 -0.447 -0.242 grp_356 o
I want to plot all points but label only subset of them (say, those df1$x>0). It works fine when I use the same color=grp aesthetics for both geom_point and geom_text:
ggplot(df1, aes(x=x,y=y,color=grp))+
geom_point(size=4) +
geom_text(aes(label=lab),data=df1[df1$x>1,],size=5,hjust=1,vjust=1)+
theme(legend.position="none")
But if I want to change points design to fill=grp, colors of labels do not match anymore:
ggplot(df1, aes(x=x,y=y))+
geom_point(aes(fill=grp),size=4,shape=21) +
geom_text(aes(label=lab,color=grp),data=df1[df1$x>1,],size=5,hjust=1,vjust=1)+
theme(legend.position="none")
I understand palette is different because levels of the subset are not the same as levels of the whole dataset. But what would be the simplest solution to enforce using the same palette?
The issue arises from different factor levels for the text and fill colours. We can avoid dropping unused factor levels by using drop = FALSE inside scale_*_discrete:
ggplot(df1, aes(x=x,y=y))+
geom_point(aes(fill=grp),size=4,shape=21) +
geom_text(aes(label=lab,color=grp),data=df1[df1$x>1,],size=5,hjust=1,vjust=1)+
theme(legend.position="none") +
scale_fill_discrete(drop = F) +
scale_colour_discrete(drop = F)
Update
With your real data we need to make sure that grp is in fact a factor.
# Load sample data
load("df1.Rdat")
# Make sure `grp` is a factor
library(tidyverse)
df1 <- df1 %>% mutate(grp = factor(grp))
# Or in base R
# df1$grp = factor(df1$grp)
# Same as before
ggplot(df1, aes(x=x,y=y))+
geom_point(aes(fill=grp),size=4,shape=21) +
geom_text(aes(label=lab,color=grp),data=df1[df1$x>1,],size=5,hjust=1,vjust=1)+
theme(legend.position="none") +
scale_fill_discrete(drop = F) +
scale_colour_discrete(drop = F)
One way is to leave the colour / fill palettes alone, & set all unwanted labels to be transparent instead:
ggplot(df1, aes(x = x, y = y)) +
geom_point(aes(fill = grp), size = 4, shape = 21) +
geom_text(aes(label = lab, color = grp,
alpha = x > 1),
size = 5, hjust = 1, vjust = 1) +
scale_alpha_manual(values = c("TRUE" = 1, "FALSE" = 0)) +
theme(legend.position = "none")
I want to plot a heatmap in R from a set of points.
I have a data frame like
X Y col
1 2 1
1 1 4
2 4 9
.......
I want to have a heatmap from this, with X and Y being the coordinates of the point, and col can be from 0 to 40. I tried to plot in points or using melt(), but with no luck.
I can plot some points with geom_point(), but I'd like to have a smooth transition from one color to another, some probably this is not the reight thing to do.
set.seed(1)
library(ggplot2)
df <- as.data.frame(expand.grid(1:50, 1:50))
df$col <- sample(0:40, size = nrow(df), replace = TRUE)
ggplot(df, aes(x = Var1, y = Var2, colour = col, fill = col )) +
geom_tile()
produces:
Edit:
And this
set.seed(1)
library(ggplot2)
df <- as.data.frame(expand.grid(1:50, 1:50))
df$col <- sample(0:40, size = nrow(df), replace = TRUE)
df <- df[sample(1:nrow(df), nrow(df) * .2, replace = FALSE), ] # make holes
df <- df[rep(1:nrow(df), df$col), -3]
ggplot(df, aes(x = Var1, y = Var2)) +
geom_point() +
stat_density2d(aes(fill=..density..), geom = "tile", contour = FALSE) +
scale_fill_gradient2(low = "white", high = "red")
produces
severity <- c("Major","Serious","Minor","Negligible")
probability <- c("Highly Probable","Probable","Possible","Remote","Unlikely","Impossible")
df <- expand.grid(x=severity,y=probability)
df$x <- factor(df$x, levels=rev(unique(df$x)))
df$y <- factor(df$y, levels=rev(unique(df$y)))
df$color <- c(1,1,2,2,1,2,2,2,2,2,2,3,2,2,3,3,2,3,3,3,3,3,3,3)
ggplot(df,aes(x,y,fill=factor(color)))+
geom_tile(color="black")+
scale_fill_manual(guide="none",values=c("red","yellow","green"))+
scale_x_discrete(expand=c(0,0))+scale_y_discrete(expand=c(0,0))+
labs(x="",y="")
Produces a risk assesssment score card chart. I want to add points by using a csv file by adding a record. Each record has 3 fields, a item name, x, and y coordinate. x= severity and y = probability.
da <- data.frame(list(name=c("ENVIRONMENTAL","COSTS","SUPPLY","HEALTH"),
severity=c("Major","Serious","Minor","Serious"),
probability=c("Probable","Possible","Probable","Unlikely")))
da
name severity probability
1 ENVIRONMENTAL Major Probable
2 COSTS Serious Possible
3 SUPPLY Minor Probable
4 HEALTH Serious Unlikely
> p1 <- p + data.frame(da, aes(severity, probability)) + geom_point()
Error in as.data.frame.default(x[[i]], optional = TRUE, stringsAsFactors = stringsAsFactors) :
cannot coerce class ""uneval"" to a data.frame
>
> d <- data.frame(list(name=c("ENVIRONMENTAL","COSTS","SUPPLY","HEALTH"),
severity=c(2,3,4,1),probability=c(3,5,4,6)))
> d
name severity probability
1 ENVIRONMENTAL 2 3
2 COSTS 3 5
3 SUPPLY 4 4
4 HEALTH 1 6
> ggplot(d,x=severity, y=probability)+ geom_point()
Error in exists(name, envir = env, mode = mode) :
argument "env" is missing, with no default
How can I add points to the ggplot / geom_tile graph?
You can't add a data.frame to a plot (not like that, at least...). What you can do is add a new layer, geom_point(), and specify the data.frame it comes from. To make things work, you should have the columns from any aesthetics you still want to use (here, x and y) have the same names in both data.frames.
# It's better practice to modify your data
# then to convert to factor within the plot
df$color <- factor(c(1,1,2,2,1,2,2,2,2,2,2,3,2,2,3,3,2,3,3,3,3,3,3,3))
# get some meaningful names, that match da and d
names(df)[1:2] <- c("severity", "probability")
p <- ggplot(df, aes(x = severity, y = probability)) +
# moved fill to the geom_tile layer, because it's only used there
geom_tile(color = "black", aes(fill = color)) +
scale_fill_manual(guide = "none", values = c("red", "yellow", "green")) +
scale_x_discrete(expand = c(0, 0)) +
scale_y_discrete(expand = c(0, 0)) +
labs(x = "", y = "")
# alsonoticehowaddingspacesmakesiteasiertoread
# Using the same column names? Yup! Now it's this easy:
p + geom_point(data = da) +
geom_point(data = d, color = "dodgerblue4")
I'd like to write some conditional stats in my graph if the data is bigger than a certain value.
With the kind help of Jack Ryan (Cut data and access groups to draw percentile lines), I could create the following script that groups data into hours and plots the result:
# Read example data
A <- read.csv(url('http://people.ee.ethz.ch/~hoferr/download/data-20130812.csv'))
# Libraries
library(doBy)
library(ggplot2)
library(plyr)
library(reshape2)
library(MASS)
library(scales)
# Sample size function
give.n <- function(x){
return(c(y = min(x) - 0.2, label = length(x)))
}
# Calculate gaps
gaps <- rep(NA, length(A$Timestamp))
times <- A$Timestamp
loss <- A$pingLoss
gap.start <- 1
gap.end <- 1
for(i in 2:length(A$Timestamp))
{ #For all rows
if(is.na(A$pingRTT.ms.[i]))
{ #Currently no connection
if(!is.na(A$pingRTT.ms.[i-1]))
{ #Connection lost now
gap.start <- i
}
if(!is.na(A$pingRTT.ms.[i+1]))
{ # Connection restores next time
gap.end <- i+1
gaps[gap.start] <- as.numeric(A$Timestamp[gap.end]-A$Timestamp[gap.start], units="secs")
loss[gap.start] <- gap.end - gap.start
}
}
}
H <- data.frame(times, gaps, loss)
H <- H[complete.cases(H),]
C <- H
C$dates <- strptime(C$times, "%Y-%m-%d %H:%M:%S")
C$h1 <- C$dates$hour
# Calculate percentiles
cuts <- c(1, .75, .5, .25, 0)
c <- ddply(C, .(h1), function (x) { summarise(x, y = quantile(x$gaps, cuts)) } )
c$cuts <- cuts
c <- dcast(c, h1 ~ cuts, value.var = "y")
c.melt <- melt(c, id.vars = "h1")
p <- ggplot(c.h1.melt, aes(x = h1, y = value, color = variable)) +
geom_point(size = 4) +
stat_summary(fun.data = max.n, geom = "text", fun.y = max, colour = "red", angle = 90, size=4) +
scale_colour_brewer(palette="RdYlBu", name="Percentile", guide = guide_legend(reverse=TRUE)) +
scale_x_continuous(breaks=0:23, limits = c(0,23)) +
annotation_logticks(sides = "lr") +
theme_bw() +
scale_y_log10(breaks=c(1e0,1e1,1e2,1e3,1e4), labels = trans_format("log10", math_format(10^.x)), limits=c(1e0,1e4)) +
xlab("Hour of day") + ylab("Ping gaps [s]")
p
p <- ggplot(c.m1.melt, aes(x = m1/60, y = value, color = variable)) +
geom_point(size = 1) +
stat_summary(fun.data = give.n, geom = "text", fun.y = median, angle = 90, size=4) +
stat_summary(fun.data = max.n, geom = "text", fun.y = max, colour = "red", angle = 90, size=4) +
scale_colour_brewer(palette="RdYlBu", name="Percentile", guide = guide_legend(reverse=TRUE)) +
scale_x_continuous(breaks=0:23, limits = c(0,24)) +
annotation_logticks(sides = "lr") +
theme_bw() +
scale_y_log10(breaks=c(1e0,1e1,1e2,1e3,1e4), labels = trans_format("log10", math_format(10^.x)), limits=c(1e0,1e4)) +
xlab("Time of day") + ylab("Ping gaps [s]")
p
This creates an hourly grouped plot of gaps with the length of the longest gaps written right next to the data points:
Below is the minutely grouped plot. The number are unreadable why I'd like to add conditional stats if the gap is longer than 5 minutes or only for the ten longest gaps or something like this.
I tried to just change the stat function to
max.n.filt <- function(x){
filter = 300
if ( x > filter ) {
return(c(y = max(x) + 0.4, label = round(max(10^x),2)))
} else {
return(c(y=x, label = ""))
}
}
and use this for the minutely grouped plot. But I got this error:
Error in list_to_dataframe(res, attr(.data, "split_labels")) :
Results do not have equal lengths
In addition: There were 50 or more warnings (use warnings() to see the first 50)
Error in if (nrow(layer_data) == 0) return() : argument is of length zero
Calls: print ... print.ggplot -> ggplot_gtable -> Map -> mapply -> <Anonymous>
In addition: Warning message:
Removed 6 rows containing missing values (geom_point).
In addition, in the hourly plot, I'd like to write the number of samples per hour right next to the length of the gaps. I think I can add a new column to the c data frame, but unfortunately I can't find a way to do this.
Any help is very much appreciated.
See ?stat_summary.
fun.data : Complete summary function. Should take data frame as input
and return data frame as output
Your function max.n.filt uses an if() statement that tries to evaluate the condition x > filter. But when length(x) > 1, the if() statement only evaluates the condition for the first value of x. When used on a data frame, this will return a list cobbled together from the original input x and whatever label the if() statement returns.
> max.n.filt(data.frame(x=c(10,15,400)))
$y.x
[1] 10 15 400
$label
[1] ""
Try a function that uses ifelse() instead:
max.n.filt2 <- function(x){
filter = 300 # whatever threshold
y = ifelse( x > filter, max(x) + 1, x[,1] )
label = ifelse( x > filter, round(max(x),2), NA )
return(data.frame(y=y[,1], label=label[,1]))
}
> max.n.filt2(data.frame(x=c(10,15,400)))
y label
1 10 NA
2 15 NA
3 401 400
Alternatively, you might just find it easier to use geom_text(). I can't reproduce your example, but here's a simulated dataset:
set.seed(101)
sim_data <- expand.grid(m1=1:1440, variable=factor(c(0,0.25,0.5,0.75,1)))
sim_data$sample_size <- sapply(1:1440, function(.) sample(1:25, 1, replace=T))
sim_data$value = t(sapply(1:1440, function(.) quantile(rgamma(sim_data$sample_size, 0.9, 0.5),c(0,0.25,0.5,0.75,1))))[1:(1440*5)]
Just use the subset argument in geom_text() to select those points you wish to label:
ggplot(sim_data, aes(x = m1/60, y = value, color = variable)) +
geom_point(size = 4) + geom_text(aes(label=round(value)), subset = .(variable == 1 & value > 25), angle = 90, size = 4, colour = "red", hjust = -0.5)
If you have a column of sample sizes, those can be incorporated into label with paste():
ggplot(sim_data, aes(x = m1/60, y = value, color = variable)) +
geom_point(size = 4) + geom_text(aes(label=paste(round(value),", N=",sample_size)), subset = .(variable == 1 & value > 25), angle = 90, size = 4, colour = "red", hjust = -0.25)
(or create a separate column in your data with whatever labels you want.) If you're asking about how to retrieve the sample sizes, you could modify your call to ddply() like this:
...
c2 <- ddply(C, .(h1), function (x) { cbind(summarise(x, y = quantile(x$gaps, cuts)), n=nrow(x)) } )
c2$cuts <- cuts
c2 <- dcast(c2, h1 + n ~ cuts, value.var = "y")
c2.h1.melt <- melt(c2, id.vars = c("h1","n"))
...