Jitter dots without overlap - r

My data:
a <- sample(1:5, 100, replace = TRUE)
b <- sample(1:5, 100, replace = TRUE)
c <- sample(1:10, 100, replace = TRUE)
d <- sample(1:40, 100, replace = TRUE)
df <- data.frame(a, b, c, d)
Using ggplot2, I have created scatterplot over x = a and y = b, weighted in two dimension (by colour = c and size = d). Note that x and y are intentionally 1:5.
Obviously, the points of different sizes and colors therefore overlap, so I tried jitter to avoid overlapping:
ggplot(df, aes(a, b, colour = c, size = d)) +
geom_point(position = position_jitter())
Now I would like the dots clustering closer together, so I tried several
combinations of height and width for the jitter function, such as
ggplot(df, aes(a, b, colour = c, size = d)) +
geom_point(position = position_jitter(width = 0.2, height = 0.2))
Jitter makes the dots still overlap and also distributes them to randomly on the given area.
Is there a way to have the dots not overlapping at all, yet clustered as close together as possible, maybe even touching and also not "side by side" or stacked? (In a way, creating kind of bubbles with smaller dots)?
Thanks!

According to #Tjebo's suggestions I have arranged dots in "heaps".
set.seed(1234)
n <- 100
a <- sample(1:5,n,rep=TRUE)
b <- sample(1:5,n,rep=TRUE)
c <- sample(1:10,n,rep=TRUE)
d <- sample(1:40,n,rep=TRUE)
df0 <- data.frame(a,b,c,d)
# These parameters need carefully tuning
minr <- 0.05
maxr <- 0.2
# Order circles by dimension
ord <- FALSE
df1 <- df0
df1$d <- minr+(maxr-minr)*(df1$d-min(df1$d))/(max(df1$d)-min(df1$d))
avals <- unique(df1$a)
bvals <- unique(df1$b)
for (k1 in seq_along(avals)) {
for (k2 in seq_along(bvals)) {
print(paste(k1,k2))
subk <- (df1$a==avals[k1] & df1$b==bvals[k2])
if (sum(subk)>1) {
subdfk <- df1[subk,]
if (ord) {
idx <- order(subdfk$d)
subdfk <- subdfk[idx,]
}
subdfk.mod <- subdfk
posmx <- which.max(subdfk$d)
subdfk1 <- subdfk[posmx,]
subdfk2 <- subdfk[-posmx,]
angsk <- seq(0,2*pi,length.out=nrow(subdfk2)+1)
subdfk2$a <- subdfk2$a+cos(angsk[-length(angsk)])*(subdfk1$d+subdfk2$d)/2
subdfk2$b <- subdfk2$b+sin(angsk[-length(angsk)])*(subdfk1$d+subdfk2$d)/2
subdfk.mod[posmx,] <- subdfk1
subdfk.mod[-posmx,] <- subdfk2
df1[subk,] <- subdfk.mod
}
}
}
library(ggplot2)
library(ggforce)
ggplot(df1, aes()) +
geom_circle(aes(x0=a, y0=b, r=d/2, fill=c), alpha=0.7)+ coord_fixed()

An interesting visualization tool is the beeswarm plot.
In R the beeswarm and the ggbeeswarm packages implement this kind of plot.
Here is an example with ggbeeswarm:
set.seed(1234)
a <- sample(1:5,100,rep=TRUE)
b <- sample(1:5,100,rep=TRUE)
c <- sample(1:10,100,rep=TRUE)
d <- sample(1:40,100,rep=TRUE)
df <- data.frame(a,b,c,d)
library(ggbeeswarm)
ggplot(aes(x=a, y=b, col=c, size=d), data = df)+
geom_beeswarm(priority='random',cex=3.5, groupOnX=T)+coord_flip()
I hope this can help you.

Here is another possibile solution to the jittering problem of #Tjebo.
The parameter dst needs some tuning.
set.seed(1234)
a <- sample(1:5,100,rep=TRUE)
b <- sample(1:5,100,rep=TRUE)
c <- sample(1:10,100,rep=TRUE)
d <- sample(1:40,100,rep=TRUE)
df <- data.frame(a,b,c,d)
dst <- .2
df.mod <- df
avals <- unique(df$a)
bvals <- unique(df$b)
for (k1 in seq_along(avals)) {
for (k2 in seq_along(bvals)) {
subk <- (df$a==avals[k1] & df$b==bvals[k2])
if (sum(subk)>1) {
subdf <- df[subk,]
angsk <- seq(0,2*pi,length.out=nrow(subdf)+1)
ak <- subdf$a+cos(angsk[-1])*dst
bk <- subdf$b+sin(angsk[-1])*dst
df.mod[subk,c("a","b")] <- cbind(ak,bk)
}
}
}
library(ggplot2)
ggplot(df.mod, aes(a, b, colour = c, size = d)) + geom_point()

Related

Plot data based on three objects as z-scores

I'm using the package "networktools" in R (https://cran.r-project.org/web/packages/networktools/networktools.pdf).
I've created three "bridge"-objects: DataT5_SDQ_network_b, DataT6_SDQ_network_b, and DataT7_SDQ_network_b.
The three bridge-objects are downloadable here: https://drive.google.com/file/d/12Hgq78RjuXXRLplIXJw6SNU4NoZbULcc/view?usp=sharing.
The code which generates a bridge-object (using networktools):
DataT5_SDQ_network_b <- bridge(DataT5_SDQ_network,
communities=SDQ_communitiesSG, directed=FALSE, nodes =
DataT5_SDQ_list$names)
I have successfully plotted the three "bridge"-objects in the same plot (with legend) using this code:
p <- lapply(list(DataT5_SDQ_network_b,
DataT6_SDQ_network_b,
DataT7_SDQ_network_b), function(x) suppressWarnings(plot(x)))
p <- Map(function(a, b) { a$data$Class <- b; a}, a = p, b = c("T5", "T6", "T7"))
p[[1]]$data <- do.call(rbind, lapply(p, function(x) x$data))
p <- p[[1]] + aes(color = Class, group = Class)
p
The result:
Questions:
How can I plot the data as z-scores?
How can I get the plot only showing Bridge Expected Influence (1-step)?
I found out:
First, plot each plot:
gg1 <- plot(DataT5_SDQ_network_b, include=c("Bridge Expected Influence (1-step)"), theme_bw=FALSE, zscore=TRUE)
gg2 <- plot(DataT6_SDQ_network_b, include=c("Bridge Expected Influence (1-step)"), theme_bw=FALSE, zscore=TRUE)
gg3 <- plot(DataT7_SDQ_network_b, include=c("Bridge Expected Influence (1-step)"), theme_bw=FALSE, zscore=TRUE)
Then combine plots
p <- list(gg1, gg2, gg3)
p <- Map(function(a, b) { a$data$Class <- b; a}, a = p, b = c("T5", "T6", "T7"))
p[[1]]$data <- do.call(rbind, lapply(p, function(x) x$data))
p <- p[[1]] + aes(color = Class, group = Class)
p

Q-Q plot with ggplot2::stat_qq, colours, multiple groups with Q-Q lines

I need to do something similar to what's shown in this excellent question:
Q-Q plot with ggplot2::stat_qq, colours, single group
but unfortunately there's a slight difference which is blocking me. Unlike the original question, I do want to separate the quantile computations by group, but I also want to add a QQ-line for each group. Following the OP's code, I can create the quantile-quantile plots by group:
library(dplyr)
library(ggplot2)
library(broom) ## for augment()
set.seed(1001)
N <- 1000
G <- 10
dd <- data_frame(x = runif(N),
group = factor(sample(LETTERS[1:G], size=N, replace=TRUE)),
y = rnorm(N) + 2*x + as.numeric(group))
m1 <- lm(y~x, data=dd)
dda <- cbind(augment(m1), group=dd$group)
sample_var <- "y"
group_var <- "group"
p <- ggplot(dda)+stat_qq(aes_string(sample=sample_var, colour=group_var))
p
How can I add the quantile-quantile lines for each group? NOTE: ideally I would like to specify the sample column and the group column at runtime. That's why I used aes_string.
EDIT to better clarify my problem, I add code to compute quantile-quantile lines when there's only one group. I need to generalize the code to multiple groups.
library(dplyr)
library(ggplot2)
library(broom) ## for augment()
# this section of the code is the same as before, EXCEPT G = 1, because for
# now the code only works for 1 group
set.seed(1001)
N <- 1000
G <- 1
dd <- data_frame(x = runif(N),
group = factor(sample(LETTERS[1:G], size=N, replace=TRUE)),
y = rnorm(N) + 2*x + as.numeric(group))
m1 <- lm(y~x, data=dd)
dda <- cbind(augment(m1), group=dd$group)
sample_var <- "y"
group_var <- "group"
# code to compute the slope and the intercept of the qq-line: basically,
# I would need to compute the slopes and the intercepts of the qq-lines
# for each group
vec <- dda[, sample_var]
y <- quantile(vec[!is.na(vec)], c(0.25, 0.75))
x <- qnorm(c(0.25, 0.75))
slope <- diff(y)/diff(x)
int <- y[1] - slope * x[1]
# now plot with ggplot2
p <- ggplot(dda)+stat_qq(aes_string(sample=sample_var, colour=group_var))+geom_abline(slope = slope, intercept = int)
p
Turning the code to calculate the qqlines into a function and then using lapply to create a separate data.frame for your qqlines is one approach.
library(dplyr)
library(ggplot2)
library(broom) ## for augment()
set.seed(1001)
N <- 1000
G <- 3
dd <- data_frame(x = runif(N),
group = factor(sample(LETTERS[1:G], size=N, replace=TRUE)),
y = rnorm(N) + 2*x + as.numeric(group))
m1 <- lm(y~x, data=dd)
dda <- cbind(augment(m1), group=dd$group)
sample_var <- "y"
group_var <- "group"
# code to compute the slope and the intercept of the qq-line
qqlines <- function(vec, group) {
x <- qnorm(c(0.25, 0.75))
y <- quantile(vec[!is.na(vec)], c(0.25, 0.75))
slope <- diff(y)/diff(x)
int <- y[1] - slope * x[1]
data.frame(slope, int, group)
}
slopedf <- do.call(rbind,lapply(unique(dda$group), function(grp) qqlines(dda[dda$group == grp,sample_var], grp)))
# now plot with ggplot2
p <- ggplot(dda)+stat_qq(aes_string(sample=sample_var, colour=group_var)) +
geom_abline(data = slopedf, aes(slope = slope, intercept = int, colour = group))
p
A more concise alternative. In ggplot2 v.3.0.0 and above you can use stat_qq_line:
ggplot(dda, aes(sample = y, colour = group)) +
stat_qq() +
stat_qq_line()
Output:
Data, from Jeremy Voisey's answer:
library(ggplot2)
library(broom)
set.seed(1001)
N <- 1000
G <- 3
dd <- data_frame(
x = runif(N),
group = factor(sample(LETTERS[1:G], size = N, replace = TRUE)),
y = rnorm(N) + 2 * x + as.numeric(group)
)
m1 <- lm(y ~ x, data = dd)
dda <- cbind(augment(m1), group = dd$group)

Cumulative Density Plots with ggplot and plotly

When we take the following example from ggplot2 docs
df <- data.frame(x = c(rnorm(100, 0, 3), rnorm(100, 0, 10)),
g = gl(2, 100))
library(ggplot2)
p <- ggplot(df, aes(x, colour = g)) +
stat_ecdf(geom = "step", na.rm = T) + # interchange point and step
theme_bw()
p
We can create a standard cdf plot. Now if we want to play with the plot in plotly, I obtain a very confusing image when I use the step command. See below. However, when I use the point command plotly behaves like it should. What is happening with the step command? Why can't I recreate the image from using ggplot only?
library(plotly)
ggplotly(p)
I found the solution here https://community.plotly.com/t/bug-with-ggplot2-stat-ecdf-function/1187/3.
You should reorder the dataframe along x.
df <- dplyr::arrange(df, x)
library(ggplot2)
p <- ggplot(df, aes(x, colour = g)) +
stat_ecdf(geom = "step", na.rm = T) +
theme_bw()
p
library(plotly)
ggplotly(p)
This can be solved using ecdf() function.
## ecdf function to get y and 1-y
rcdf <- function (x) {
cdf <- ecdf(x)
y1 <- cdf(x)
y <- unique(y1)
# xrcdf <- 1-y ## to get reverse cdf
xrcdf <- y ## to get cdf
}
ug <- unique(df$g)
ng <- length(ug)
xll <- min(df$x)
xul <- max(df$x)
adr <- data.frame(myxx=c(), myyy=c(), mygg=c())
lapply(1:ng, function(i){
ad2r <- subset(df, g==ug[i])
myx1 <- unique(ad2r$x)
myxx <- c(xll,myx1,xul) ## add lowest value - dummy to assign 100%
myy1 <- rcdf(ad2r$x)
# myyy <- c(1.0,myy1,0.0) ## add 100% to get reverse cdf
myyy <- c(0.0,myy1,1.0) ## add 0% to get cdf
mygg <- ug[i]
ad2rf <- data.frame(myxx,myyy,mygg)
adr <<- rbind(adr,ad2rf)
})
adf <- adr[order(adr$myxx),]
pp <- ggplot(data=adf,
aes_(x=adf$myxx, y=100*adf$myyy, col=adf$mygg, group=adf$mygg)) +
geom_step() +
labs(title="CDF", y = "Y", x = "X", col=NULL)
ppp <- ggplotly(pp, tooltip=c("x","y"))
ppp
This gives the following output:
CDF

geom_raster faceted plot with ggplot2: control row height

In the example below I have a dataset containing two experiments F1 and F2. A classification is performed based on F1 signal, and both F1 and F2 values are ordered accordingly. In this diagram, each facet has the same dimension although the number of rows is not the same (e.g class #7 contains only few elements compare to the other classes). I would like to modify the code to force row height to be the same across facets (facets would thus have various blank space below). Any hints would be greatly appreciated.
Thank you
library(ggplot2)
library(reshape2)
set.seed(123)
# let's create a fake dataset
nb.experiment <- 4
n.row <- 200
n.col <- 5
nb.class <- 7
d <- matrix(round(runif(n.row * n.col),2), nc=n.col)
colnames(d) <- sprintf("%02d", 1:5)
# These strings will be the row names of each heatmap
# in the subsequent facet plot
elements <- sample(replicate(n.row/2, rawToChar(as.raw(sample(65:90, 6, replace=T)))))
# let's create a data.frame d
d <- data.frame(d,
experiment = sort(rep(c("F1","F2"), n.row/2)),
elements= elements)
# Now we split the dataset by experiments
d.split <- split(d, d$experiment)
# Now we create classes (here using hierarchical clustering )
# based on F1 experiment
dist.mat <- as.dist(1-cor(t(d.split$F1[,1:5]), method="pearson"))
hc <- hclust(dist.mat)
cuts <- cutree(hc, nb.class)
levels(cuts) <- sprintf("Class %02d", 1:nb.experiment)
# We split F1 and F2 based on classification result
for(s in names(d.split)){
d.split[[s]] <- split(d.split[[s]], cuts)
}
# Data are melt (their is perhaps a better solution...)
# in order to use the ggplot function
dm <- melt(do.call('rbind',lapply(d.split, melt)), id.var=c( "experiment", "elements", "variable", "L1"))
dm <- dm[, -5]
colnames(dm) <- c("experiment","elements", "pos", "class", "exprs")
dm$class <- as.factor(dm$class)
levels(dm$class) <- paste("Class", levels(dm$class))
# Now we plot the data
p <- ggplot(dm, aes(x = pos, y = elements, fill = exprs))
p <- p + geom_raster()
p <- p + facet_wrap(~class +experiment , scales = "free", ncol = 2)
p <- p + theme_bw()
p <- p + theme(text = element_text(size=4))
p <- p + theme(text = element_text(family = "mono", face = "bold"))
print(p)
Use facet_grid instead of facet_wrap and set the space attribute:
ggplot(dm, aes(x = pos, y = elements, fill = exprs)) +
geom_raster() +
facet_grid(rowMeanClass ~ experiment , scales = "free", space = "free_y") +
theme_bw()

Matrix of density plots with each plot overlaying two distributions

I have a data.frame with 5 columns and I'd like to generate a matrix of density plots, such that each density plot is an overlay of two density plots. (This is akin to plotmatrix, except that in my case, the number of non-NA value in each column differ from column to column and I want overlaid distributions rather than scatter plots).
My first attempt, which didn't work, is given below:
library(ggplot2)
library(reshape)
tmp1 <- data.frame(do.call(cbind, lapply(1:5, function(x) {
r <- rnorm(100)
r[sample(1:100, 20)] <- NA
return(r)
})))
ggplot( melt(tmp1), aes(x=value, fill=variable))+
geom_density(alpha=0.2, position="identity")+opts(legend.position = "none")+
facet_grid(variable ~ variable)
My second approach got me nearly there, but instead of 5 different colors, I only want to use two colors across all the plots. And, I'm sure there is a more elegant way to construct this expanded matrix:
tmp2 <- do.call(rbind, lapply(1:5, function(i) {
do.call(rbind, lapply(1:5, function(j) {
r <- rbind(data.frame(var=sprintf('X%d', i), val=tmp1[,i]),
data.frame(var=sprintf('X%d', j), val=tmp1[,j]))
r <- data.frame(xx=sprintf('X%d', i), yy=sprintf('X%d', j), r)
return(r)
}))
}))
ggplot(tmp2, aes(x=val, fill=var))+
geom_density(alpha=0.2, position="identity")+opts(legend.position = "none")+
facet_grid(xx ~ yy)
My solution was to manually loop through the pairs of columns and generate the overlaid density plots by hand, saving them to a list. I then arranged them in a grid using `grid.arrange' giving the image below.
But is it possible to achieve this using facet_grid instead?
The easiest way is to reshape your data with all permutations (5 * 5 = 25 of them).
require(gregmisc)
perm <- permutations(5, 2, paste0("X", 1:5), repeats.allowed=TRUE)
# instead of gregmisc + permutations, you can use expand.grid from base as:
# perm <- expand.grid(paste0("X", 1:5), paste0("X", 1:5))
o <- apply(perm, 1, function(idx) {
t <- tmp1[idx]
names(t) <- c("A", "B")
t$id1 <- idx[1]
t$id2 <- idx[2]
t
})
require(ggplot2)
require(reshape2)
o <- do.call(rbind, o)
o.m <- melt(o, c("id1", "id2"))
o.m$id1 <- factor(o.m$id1)
o.m$id2 <- factor(o.m$id2)
p <- ggplot(o.m, aes(x = value))
p <- p + geom_density(alpha = 0.2, position = "identity", aes(fill = variable))
p <- p + theme(legend.position = "none")
p <- p + facet_grid(id1 ~ id2)
p

Resources