dot plot different indicators, depending on the value, in R - r

I am visualising odds ratios.
You can find fake data and a plot below
Data <- data.frame(
odds = sample(0:9),
pvalue = c(0.1,0.04,0.02,0.03,0.2,0.5,0.03,
0.12,0.12,0.014),
Y = sample(c("a", "b"), 5, replace = TRUE),
letters = letters[1:10]
)
library(lattice)
dotplot(letters ~ odds| Y, data =Data,
aspect=0.5, layout = c(1,2), ylab=NULL)
I would like to show solid circles for p-values greater than 0.05, and empty circles if values are less than 0.05.

We could specify the pch with values 1/20 for empty/solid circles based on the 'pvalue' column.
dotplot(letters ~ odds| Y, data=Data, aspect= 0.5, layout= c(1,2),
ylab=NULL, pch= ifelse(Data$pvalue > 0.05, 20, 1))

The group argument together with pch should also do the job:
dotplot(letters ~ odds| Y, data =Data,
aspect=0.5, layout = c(1,2), ylab=NULL,
groups = pvalue <= 0.05,
pch = c(19, 21))

This is easy to create with ggplot2:
library(ggplot2)
Data$significant <- Data$pvalue > 0.05
ggplot(Data, aes(x=odds, y=letters, shape=significant)) +
geom_point(size=4) +
scale_x_continuous(breaks = seq(from=0, to= 8, by=2)) +
scale_shape_manual(values=c(1, 16)) +
ylab("") +
facet_wrap(~ Y, ncol = 1, nrow = 2) +
theme_bw()

Related

How to make beautiful ROC curves for two models in the same plot?

I've trained two xgboost models, say model1 and model2. I have the AUC scores for each model and I want them to appear in the plot. I want to make beautiful ROC curves for both models in the same plot. Something like this:
How can I do that?
I usually use the library pROC, and I know I need to extract the scores, and the truth from each model, right?
so something like this maybe:
roc1 = roc(model1$truth, model1$scores)
roc2 = roc(model2$truth, model2$scores)
I also need the fpr and tpr for each model:
D1 = data.frame = (fpr = 1 - roc1$specificities, tpr = roc1$sensitivities)
D2 = data.frame = (fpr = 1 - roc2$specificities, tpr = roc2$sensitivities)
Then I can maybe add arrows to point out which curve is which:
arrows = tibble(x1 = c(0.5, 0.13) , x2 = c(0.32, 0.2), y1 = c(0.52, 0.83), y2 = c(0.7,0.7) )
And finally ggplot: (this part is missing)
ggplot(data = D1, aes(x = fpr, y = tpr)) +
geom_smooth(se = FALSE) +
geom_smooth(data = D2, color = 'red', se = FALSE) +
annotate("text", x = 0.5, 0.475, label = 'score of model 1') +
annotate("text", x = 0.13, y = 0.9, label = scores of model 2') +
So I need help with two things:
How do I get the right information out from the models, to make ROC curves? How do I get the truth and the prediction scores? The truth are just the labels of the target feature in the training set maybe?
How do I continue the code? and is my code right so far?
You can get the sensitivity and specifity in a data frame using coords from pROC. Just rbind the results for the two models after first attaching a column labelling each set as model 1 or model 2. To get the smooth-looking ROC with automatic labels you can use geom_textsmooth from the geomtextpath package:
library(pROC)
library(geomtextpath)
roc1 <- roc(model1$truth, model1$scores)
roc2 <- roc(model2$truth, model2$scores)
df <- rbind(cbind(model = "Model 1", coords(roc1)),
cbind(model = "Model 2", coords(roc2)))
ggplot(df, aes(1 - specificity, sensitivity, color = model)) +
geom_textsmooth(aes(label = model), size = 7, se = FALSE, span = 0.2,
textcolour = "black", vjust = 1.5, linewidth = 1,
text_smoothing = 50) +
geom_abline() +
scale_color_brewer(palette = "Set1", guide = "none", direction = -1) +
scale_x_continuous("False Positive Rate", labels = scales::percent) +
scale_y_continuous("True Positive Rate", labels = scales::percent) +
coord_equal(expand = FALSE) +
theme_classic(base_size = 20) +
theme(plot.margin = margin(10, 30, 10, 10))
Data used
set.seed(2023)
model1 <- model2 <- data.frame(scores = rep(1:100, 50))
p1 <- model2$scores + rnorm(5000, 0, 20)
p2 <- model1$scores/100
model1$truth <- rbinom(5000, 1, (p1 - min(p1))/diff(range(p1)))
model2$truth <- rbinom(5000, 1, p2)

Illustrate standard deviation in histogram

Consider the following simple example:
# E. Musk in Grunheide
set.seed(22032022)
# generate random numbers
randomNumbers <- rnorm(n = 1000, mean = 10, sd = 10)
# empirical sd
sd(randomNumbers)
#> [1] 10.34369
# histogram
hist(randomNumbers, probability = TRUE, main = "", breaks = 50)
# just for illusatration purpose
###
# empirical density
lines(density(randomNumbers), col = 'black', lwd = 2)
# theortical density
curve(dnorm(x, mean = 10, sd = 10), col = "blue", lwd = 2, add = TRUE)
###
Created on 2022-03-22 by the reprex package (v2.0.1)
Question:
Is there a nice way to illustrate the empirical standard deviation (sd) in the histogram by colour?
E.g. representing the inner bars by a different color, or indicating the range of the sd by an interval, i.e., [mean +/- sd], on the x-axis?
Note, if ggplot2 provides an easy solution, suggesting this would be also much appreciated.
This is similar ggplot solution to Benson's answer, except we precompute the histogram and use geom_col, so that we don't get any of the unwelcome stacking at the sd boundary:
# E. Musk in Grunheide
set.seed(22032022)
# generate random numbers
randomNumbers <- rnorm(n=1000, mean=10, sd=10)
h <- hist(randomNumbers, breaks = 50, plot = FALSE)
lower <- mean(randomNumbers) - sd(randomNumbers)
upper <- mean(randomNumbers) + sd(randomNumbers)
df <- data.frame(x = h$mids, y = h$density,
fill = h$mids > lower & h$mids < upper)
library(ggplot2)
ggplot(df) +
geom_col(aes(x, y, fill = fill), width = 1, color = 'black') +
geom_density(data = data.frame(x = randomNumbers),
aes(x = x, color = 'Actual density'),
key_glyph = 'path') +
geom_function(fun = function(x) {
dnorm(x, mean = mean(randomNumbers), sd = sd(randomNumbers)) },
aes(color = 'theoretical density')) +
scale_fill_manual(values = c(`TRUE` = '#FF374A', 'FALSE' = 'gray'),
name = 'within 1 SD') +
scale_color_manual(values = c('black', 'blue'), name = 'Density lines') +
labs(x = 'Value of random number', y = 'Density') +
theme_minimal()
Here is a ggplot solution. First calculate mean and sd, and save the values in different vectors. Then use an ifelse statement to categorise the values into "Within range" and "Outside range", fill them with different colours.
Blue line represents the normal distribution stated in your question, and black line represents the density graph of the histogram we're plotting.
library(ggplot2)
set.seed(22032022)
# generate random numbers
randomNumbers <- rnorm(n=1000, mean=10, sd=10)
randomNumbers_mean <- mean(randomNumbers)
randomNumbers_sd <- sd(randomNumbers)
ggplot(data.frame(randomNumbers = randomNumbers), aes(randomNumbers)) +
geom_histogram(aes(
fill = ifelse(
randomNumbers > randomNumbers_mean + randomNumbers_sd |
randomNumbers < randomNumbers_mean - randomNumbers_sd,
"Outside range",
"Within range"
)
),
binwidth = 1, col = "gray") +
geom_density(aes(y = ..count..)) +
stat_function(fun = function(x) dnorm(x, mean = 10, sd = 10) * 1000,
color = "blue") +
labs(fill = "Data")
Created on 2022-03-22 by the reprex package (v2.0.1)
data.frame(rand = randomNumbers,
cut = {
sd <- sd(randomNumbers)
mn <- mean(randomNumbers)
cut(randomNumbers, c(-Inf, mn -sd, mn +sd, Inf))
}) |>
ggplot(aes(x = rand, fill = cut ) ) +
geom_histogram()

Nonparametric regression ggplot

I'm trying to plot some nonparametric regression curves with ggplot2. I achieved It with the base plot()function:
library(KernSmooth)
set.seed(1995)
X <- runif(100, -1, 1)
G <- X[which (X > 0)]
L <- X[which (X < 0)]
u <- rnorm(100, 0 , 0.02)
Y <- -exp(-20*L^2)-exp(-20*G^2)/(X+1)+u
m <- lm(Y~X)
plot(Y~X)
abline(m, col="red")
m2 <- locpoly(X, Y, bandwidth = 0.05, degree = 0)
lines(m2$x, m2$y, col = "red")
m3 <- locpoly(X, Y, bandwidth = 0.15, degree = 0)
lines(m3$x, m3$y, col = "black")
m4 <- locpoly(X, Y, bandwidth = 0.3, degree = 0)
lines(m4$x, m4$y, col = "green")
legend("bottomright", legend = c("NW(bw=0.05)", "NW(bw=0.15)", "NW(bw=0.3)"),
lty = 1, col = c("red", "black", "green"), cex = 0.5)
With ggplot2 have achieved plotting the linear regression:
With this code:
ggplot(m, aes(x = X, y = Y)) +
geom_point(shape = 1) +
geom_smooth(method = lm, se = FALSE) +
theme(axis.line = element_line(colour = "black", size = 0.25))
But I dont't know how to add the other lines to this plot, as in the base R plot. Any suggestions? Thanks in advance.
Solution
The shortest solution (though not the most beautiful one) is to add the lines using the data= argument of the geom_line function:
ggplot(m, aes(x = X, y = Y)) +
geom_point(shape = 1) +
geom_smooth(method = lm, se = FALSE) +
theme(axis.line = element_line(colour = "black", size = 0.25)) +
geom_line(data = as.data.frame(m2), mapping = aes(x=x,y=y))
Beautiful solution
To get beautiful colors and legend, use
# Need to convert lists to data.frames, ggplot2 needs data.frames
m2 <- as.data.frame(m2)
m3 <- as.data.frame(m3)
m4 <- as.data.frame(m4)
# Colnames are used as names in ggplot legend. Theres nothing wrong in using
# column names which contain symbols or whitespace, you just have to use
# backticks, e.g. m2$`NW(bw=0.05)` if you want to work with them
colnames(m2) <- c("x","NW(bw=0.05)")
colnames(m3) <- c("x","NW(bw=0.15)")
colnames(m4) <- c("x","NW(bw=0.3)")
# To give the different kernel density estimates different colors, they must all be in one data frame.
# For merging to work, all x columns of m2-m4 must be the same!
# the merge function will automatically detec columns of same name
# (that is, x) in m2-m4 and use it to identify y values which belong
# together (to the same x value)
mm <- Reduce(x=list(m2,m3,m4), f=function(a,b) merge(a,b))
# The above line is the same as:
# mm <- merge(m2,m3)
# mm <- merge(mm,m4)
# ggplot needs data in long (tidy) format
mm <- tidyr::gather(mm, kernel, y, -x)
ggplot(m, aes(x = X, y = Y)) +
geom_point(shape = 1) +
geom_smooth(method = lm, se = FALSE) +
theme(axis.line = element_line(colour = "black", size = 0.25)) +
geom_line(data = mm, mapping = aes(x=x,y=y,color=kernel))
Solution which will settle this for everyone and for eternity
The most beautiful and reproducable way though will be to create a custom stat in ggplot2 (see the included stats in ggplot).
There is this vignette of the ggplot2 team to this topic: Extending ggplot2. I have never undertaken such a heroic endeavour though.

Manually assigning colors with scale_fill_manual only works for certain hexagon sizes

I am trying to create a scatterplot that is summarized by hexagon bins of counts. I would like the user to be able to define the count breaks for the color scale. I have this working, using scale_fill_manual(). Oddly, however, it only works sometimes. In the MWE below, using the given seed value, if xbins=10, there are issues resulting in a plot as follows:
However, if xbins=20 or 40, for example, the plot doesn't seem to have problems:
My MWE is as follows:
library(ggplot2)
library(hexbin)
library(RColorBrewer)
set.seed(1)
xbins <- 20
x <- abs(rnorm(10000))
y <- abs(rnorm(10000))
minVal <- min(x, y)
maxVal <- max(x, y)
maxRange <- c(minVal, maxVal)
buffer <- (maxRange[2] - maxRange[1]) / (xbins / 2)
h <- hexbin(x = x, y = y, xbins = xbins, shape = 1, IDs = TRUE,
xbnds = maxRange, ybnds = maxRange)
hexdf <- data.frame (hcell2xy(h), hexID = h#cell, counts = h#count)
my_breaks <- c(2, 4, 6, 8, 20, 1000)
clrs <- brewer.pal(length(my_breaks) + 3, "Blues")
clrs <- clrs[3:length(clrs)]
hexdf$countColor <- cut(hexdf$counts, breaks = c(0, my_breaks, Inf),
labels = rev(clrs))
ggplot(hexdf, aes(x = x, y = y, hexID = hexID, fill = countColor)) +
scale_fill_manual(values = levels(hexdf$countColor)) +
geom_hex(stat = "identity") +
geom_abline(intercept = 0, color = "red", size = 0.25) +
coord_fixed(xlim = c(-0.5, (maxRange[2] + buffer)),
ylim = c(-0.5, (maxRange[2] + buffer))) +
theme(aspect.ratio=1)
My goal is to tweak this code so that the plot does not have problems (where suddenly certain hexagons are different sizes and shapes than the rest) regardless of the value assigned to xbins. However, I am puzzled what may be causing this problem for certain xbins values. Any advice would be greatly appreciated.
EDIT:
I am updating the example code after taking into account comments by #bdemarest and #Axeman. I followed the most popular answer in the link #Axeman recommends, and believe it is more useful when you are working with scale_fill_continuous() on an integer vector. Here, I am working on scale_fill_manual() on a factor vector. As a result, I am still unable to get this goal to work. Thank you.
library(ggplot2)
library(hexbin)
library(RColorBrewer)
set.seed(1)
xbins <- 10
x <- abs(rnorm(10000))
y <- abs(rnorm(10000))
minVal <- min(x, y)
maxVal <- max(x, y)
maxRange <- c(minVal, maxVal)
buffer <- (maxRange[2] - maxRange[1]) / (xbins / 2)
bindata = data.frame(x=x,y=y,factor=as.factor(1))
h <- hexbin(bindata, xbins = xbins, IDs = TRUE, xbnds = maxRange, ybnds = maxRange)
counts <- hexTapply (h, bindata$factor, table)
counts <- t (simplify2array (counts))
counts <- melt (counts)
colnames (counts) <- c ("factor", "ID", "counts")
counts$factor =as.factor(counts$factor)
hexdf <- data.frame (hcell2xy (h), ID = h#cell)
hexdf <- merge (counts, hexdf)
my_breaks <- c(2, 4, 6, 8, 20, 1000)
clrs <- brewer.pal(length(my_breaks) + 3, "Blues")
clrs <- clrs[3:length(clrs)]
hexdf$countColor <- cut(hexdf$counts, breaks = c(0, my_breaks, Inf), labels = rev(clrs))
ggplot(hexdf, aes(x = x, y = y, fill = countColor)) +
scale_fill_manual(values = levels(hexdf$countColor)) +
geom_hex(stat = "identity") +
geom_abline(intercept = 0, color = "red", size = 0.25) +
coord_cartesian(xlim = c(-0.5, maxRange[2]+buffer), ylim = c(-0.5, maxRange[2]+ buffer)) + theme(aspect.ratio=1)
you can define colors in 'geom' instead of 'scale' that modifies the scale of plot:
ggplot(hexdf, aes(x = x, y = y)) +
geom_hex(stat = "identity",fill =hexdf$countColor)

R: How to add the noise cluster into DBSCAN plot

I'm trying to plot DBSCAN results. This is what I have done so far. My distance matrix is here.
dbs55_CR_EUCL = dbscan(writeCRToMatrix,eps=0.006, MinPts = 4, method = "dist")
plot(writeCRToMatrix[dbs55_CR_EUCL$cluster>0,],
col=dbs55_CR_EUCL$cluster[dbs55_CR_EUCL$cluster>0],
main="DBSCAN Clustering K = 4 \n (EPS=0.006, MinPts=4) without noise",
pch = 20)
This is the plot:
When I tried plotting all the clusters including the noise cluster I could only see 2 points in my plot.
What I'm looking for are
To add the points in the noise cluster to the plot but with a different symbol. Something similar to the following picture
Shade the cluster areas like in the following picture
Noise clusters have an id of 0. R plots usually ignore a color of 0 so if you want to show the noise points (as black) then you need to do the following:
plot(writeCRToMatrix,
col=dbs55_CR_EUCL$cluster+1L,
main="DBSCAN Clustering K = 4 \n (EPS=0.006, MinPts=4) with noise",
pch = 20)
If you want a different symbol for noise then you could do the following (adapted from the man page):
library(dbscan)
n <- 100
x <- cbind(
x = runif(10, 0, 10) + rnorm(n, sd = 0.2),
y = runif(10, 0, 10) + rnorm(n, sd = 0.2)
)
res <- dbscan::dbscan(x, eps = .2, minPts = 4)
plot(x, col=res$cluster, pch = 20)
points(x[res$cluster == 0L], col = "grey", pch = "+")
Here is code that will create a shaded convex hull for each cluster
library(ggplot2)
library(data.table)
library(dbscan)
dt <- data.table(x, level=as.factor(res$cluster), key = "level")
hulls <- dt[, .SD[chull(x, y)], by = level]
### get rid of hull for noise
hulls <- hulls[level != "0",]
cols <- c("0" = "grey", "1" = "red", "2" = "blue")
ggplot(dt, aes(x=x, y=y, color=level)) +
geom_point() +
geom_polygon(data = hulls, aes(fill = level, group = level),
alpha = 0.2, color = NA) +
scale_color_manual(values = cols) +
scale_fill_manual(values = cols)
Hope this helps.

Resources