violin plot with constant data? - r

I have some weird behaviour of violin plots, when the data is (in parts) constant.
If I check for constant data and add some small errors artificially (e.g. by adding runif( N, min = -0.001, max = 0.001 ), the script will run. However, that distorts the other violin plot(s) to vertical line(s) (see 1), while it should look something like 2
Question:
Is it possible (when the partial data for a violin plot is constant) to
display a simple horizontal line for the respective constant data
display the other violin plots, as if the constant data wasn't present?
R code:
library(ggplot2)
library(grid)
library(gridExtra)
N <- 20
test_data <- data.frame(
idx <- c( 1:N, 1:N ),
vals <- c( runif(N, 0, 1),
rep( 0.5, N)), # <- R script won't run
#rep( 0.5, N) + runif( N, min = -0.001, max = 0.001 )), # <- delivers graphic (distorted)
type <- c( rep("range", N),
rep("const", N))
)
grid.arrange(
ggplot( test_data, aes( x = idx, y = vals)) +
geom_line( aes(colour = type)),
ggplot( test_data, aes( x = type, y = vals)) +
geom_violin( aes( fill = type),
position = position_dodge(width = 1))
)

I finally managed to get a violin plot with some group(s) having zero variance (standard deviation)
to display a flat line for 0-variance groups
display normal violin plots for other groups
In my example I have 3 groups of data - two without zero variance and the third is constant.
While accumulating the groups, I calculate the standard deviation (variance would be same functionality)
library(ggplot2)
library(gridExtra)
N <- 20
test_data <- data.frame()
# random data from range
for( grp_id in 1:2)
{
group_data <- data.frame(
idx = 1:N,
vals = runif(N, grp_id, grp_id + 1),
type = paste("range", grp_id)
)
group_data$sd_group <- sd( group_data$vals)
test_data = rbind( test_data, group_data)
}
# constant data
group_data = data.frame(
idx = 1:N,
vals = rep( 0.5, N),
type = "const"
)
group_data$sd_group <- sd( group_data$vals)
as suggested I add a little offset to obtain a violin plot for group 'const'
# add a little jittering to get the flat line
if( 0 == group_data$sd_group[1])
{
group_data$vals[1] = group_data$vals[1] + 0.00001
}
test_data = rbind( test_data, group_data)
Only thing now left to do is to scale all violin plots to the same width
grid.arrange(
ggplot( test_data, aes( x = idx)) +
geom_line( aes( y = vals, colour = type)),
ggplot( test_data, aes( x = type, y = vals, fill = type)) +
geom_violin( scale = "width"),
ncol = 1
)

Related

ggplot2 adding label to geom_area

I'm teaching undergrad statistics and trying to make a useful little R script to help my students understand calculating probabilities in the standard normal distribution. I have this script, which takes zscore breakpoints, calculates the fraction of data between each breakpoint, and colors each breakpoint section:
library(tidyverse)
library(ggplot2)
library(magrittr)
sim_dat = data.frame(z = seq(-5,5, length.out = 1001))
sim_dat$y = dnorm(sim_dat$z, mean = 0, sd=1)
#fill in z-score bkpts, excluding zero: 0 will always be included
zscores <- c(-1,1.5)
zscores <- sort( setdiff(zscores,0) )
bkpoints <- sort( c(-Inf, zscores,0, Inf))
#find pct data between brekpoints
pctdata <- numeric(length=length(bkpoints)-1)
interval <- character(length=length(bkpoints)-1)
for(i in 1:length(pctdata)){
pctdata[i] <- plyr::round_any( pnorm(q=bkpoints[i+1]) - pnorm(q=bkpoints[i]) , 0.0001)
interval[i] <- paste0(bkpoints[i],",",bkpoints[i+1])
}
pctdata_df <- cbind.data.frame(interval,pctdata,stringsAsFactors=FALSE)
sim_dat$standard_normal_sections = cut(sim_dat$z, breaks = bkpoints)
p1 <- ggplot2::ggplot(sim_dat, aes(z, y, fill = standard_normal_sections)) + geom_area() +
scale_x_continuous(breaks= c(seq(-5,5,1), zscores))
p1
pctdata_df
I'd like to use pctdata_df$pctdata(vector of how much data is in section of p1) as labels. I'm finding very little on how to add labels to geom_area. Any help is appreciated!
There is nothing special about geom_area. If you want to add labels you could do so with geom_text where you pass your pctdata_df to the data argument. As you gave no information on where you want to add your labels I have put them beneath the area chart.
Note: There is no need for a for loop. You could simply pass a vector to pnorm or paste.
library(scales)
library(ggplot2)
# find pct data between brekpoints
lower <- bkpoints[1:(length(bkpoints) - 1)]
upper <- bkpoints[2:length(bkpoints)]
pctdata <- pnorm(q = upper) - pnorm(q = lower)
interval <- paste0(lower, ",", upper)
pctdata_df <- data.frame(interval, lower, upper, pctdata)
pctdata_df$x_label <- with(pctdata_df, ifelse(is.infinite(lower), upper - 1, .5 * (lower + upper)))
pctdata_df$x_label <- with(pctdata_df, ifelse(is.infinite(upper), lower + 1, x_label))
sim_dat$standard_normal_sections <- cut(sim_dat$z, breaks = bkpoints)
ggplot(sim_dat, aes(z, y)) +
geom_area(aes(fill = standard_normal_sections)) +
geom_text(data = pctdata_df, aes(x = x_label, y = 0, label = scales::number(pctdata, .01)),
vjust = 1, size = 8 / .pt, nudge_y = -.01) +
scale_x_continuous(breaks = c(seq(-5, 5, 1), zscores))

Manually Set Scale of contour plot using geom_contour_filled

I would like manually adjust the scales of two contour plots such that each have the same scale even though they contain different ranges of values in the z-direction.
For instance, lets say that I want to make contour plots of z1 and z2:
x = 1:15
y = 1:15
z1 = x %*% t(y)
z2 = 50+1.5*(x %*% t(y))
data <- data.frame(
x = as.vector(col(z1)),
y = as.vector(row(z1)),
z1 = as.vector(z1),
z2 = as.vector(z2)
)
ggplot(data, aes(x, y, z = z1)) +
geom_contour_filled(bins = 8)
ggplot(data, aes(x, y, z = z2)) +
geom_contour_filled(bins = 8)
Is there a way I can manually adjust the scale of each plot such that each contain the same number of levels (in this case bins = 8), the minimum is the same for both (in this case min(z1)), and the max is the same for both (max(z2))?
One can manually define a vector of desired breaks points and then pass the vector to the "breaks" option in the geom_contour_filled() function.
In the below script, finds 8 break intervals between the grand minimum and the grand maximum of the dataset.
Also there are 2 functions defined to create the palette and label names for the legend.
#establish the min and max of scale
grandmin <- min(z1, z2)-1
grandmax <- max(z2, z2)
#define the number of breaks. In this case 8 +1
mybreaks <- seq(grandmin, ceiling(round(grandmax, 0)), length.out = 9)
#Function to return the dersired number of colors
mycolors<- function(x) {
colors<-colorRampPalette(c("darkblue", "yellow"))( 8 )
colors[1:x]
}
#Function to create labels for legend
breaklabel <- function(x){
labels<- paste0(mybreaks[1:8], "-", mybreaks[2:9])
labels[1:x]
}
ggplot(data, aes(x, y, z = z1)) +
geom_contour_filled(breaks= mybreaks, show.legend = TRUE) +
scale_fill_manual(palette=mycolors, values=breaklabel(8), name="Value", drop=FALSE) +
theme(legend.position = "right")
ggplot(data, aes(x, y, z = z2)) +
geom_contour_filled(breaks= mybreaks, show.legend = TRUE) +
scale_fill_manual(palette=mycolors, values=breaklabel(8), name="Value", drop=FALSE)

Plot one data frame column against all other columns using ggplots and showing densities in R

I have a data frame with 20 columns, and I want to plot one specific column (called BB) against each single column in the data frame. The plots I need are probability density plots, and I’m using the following code to generate one plot (plotting columns BB vs. AA as an example):
mydata = as.data.frame(fread("filename.txt")) #read my data as data frame
#function to calculate density
get_density <- function(x, y, n = 100) {
dens <- MASS::kde2d(x = x, y = y, n = n)
ix <- findInterval(x, dens$x)
iy <- findInterval(y, dens$y)
ii <- cbind(ix, iy)
return(dens$z[ii])
}
set.seed(1)
#define the x and y of the plot; x = column called AA; y = column called BB
xy1 <- data.frame(
x = mydata$AA,
y = mydata$BB
)
#call function get_density to calculate density for the defined x an y
xy1$density <- get_density(xy1$x, xy1$y)
#Plot
ggplot(xy1) + geom_point(aes(x, y, color = density), size = 3, pch = 20) + scale_color_viridis() +
labs(title = "BB vs. AA") +
scale_x_continuous(name="AA") +
scale_y_continuous(name="BB")
Would appreciate it if someone can suggest a method to produce multiple plot of BB against every other column, using the above density function and ggplot command. I tried adding a loop, but found it too complicated especially when defining the x and y to be plotted or calling the density function.
Since you don't provide sample data, I'll demo on mtcars. We convert the data to long format, calculate the densities, and make a faceted plot. We plot the mpg column against all others.
library(dplyr)
library(tidyr)
mtlong = gather(mtcars, key = "var", value = "value", -mpg) %>%
group_by(var) %>%
mutate(density = get_density(value, mpg))
ggplot(mtlong, aes(x = value, y = mpg, color = density)) +
geom_point(pch = 20, size = 3) +
labs(x = "") +
facet_wrap(~ var, scales = "free")

List of plots generated in ggplot2 using scale_color_gradientn have wrong coloring

I'm attempting to use library(scales) and scale_color_gradientn() to create a custom mapping of colors to a continuous variable, in an attempt to limit the effect of outliers on the coloring of my plot. This works for a single plot, but does not work when I use a loop to generate several plots and store them in a list.
Here is a minimal working example:
library(ggplot2)
library(scales)
data1 <- as.data.frame(cbind(x = rnorm(100),
y = rnorm(100),
v1 = rnorm(100, mean = 2, sd = 1),
v2 = rnorm(100, mean = -2, sd = 1)))
#add outliers
data1[1,"v1"] <- 200
data1[2,"v1"] <- -200
data1[1,"v2"] <- 50
data1[2,"v2"] <- -50
#define color palette
cols <- colorRampPalette(c("#3540FF","black","#FF3535"))(n = 100)
#simple color scale
col2 <- scale_color_gradient2(low = "#3540FF",
mid = "black",
high = "#FF3535"
)
#outlier-adjusted color scale
{
aa <- min(data1$v1)
bb <- quantile(data1$v1, 0.05)
cc <- quantile(data1$v1, 0.95)
dd <- max(data1$v1)
coln <- scale_color_gradientn(colors = cols[c(1,5,95,100)],
values = rescale(c(aa,bb,cc,dd),
limits = c(aa,dd))
)
}
Plots:
1. Plot with simple scales - outliers cause scales to stretch out.
ggplot(data1, aes(x = x, y = y, color = v1))+
geom_point()+
col2
2. Plot with outlier-adjusted scales - correct color scaling.
ggplot(data1, aes(x = x, y = y, color = v1))+
geom_point()+
coln
3. The scales for v1 do not work for v2 as the data is different.
ggplot(data1, aes(x = x, y = y, color = v2))+
geom_point()+
coln
#loop to produce list of plots each with own scale
{
plots <- list()
k <- 1
for (i in c("v1","v2")){
aa <- min(data1[,i])
bb <- quantile(data1[,i],0.05)
cc <- quantile(data1[,i], 0.95)
dd <- max(data1[,i])
colm <- scale_color_gradientn(colors = cols[c(1,5,95,100)],
values = rescale(c(aa,bb,cc,dd),
limits = c(aa,dd)))
plots[[k]] <- ggplot(data1, aes_string(x = "x",
y = "y",
color = i
))+
geom_point()+
colm
k <- k + 1
}
}
4. First plot has the wrong scales.
plots[[1]]
5. Second plot has the correct scales.
plots[[2]]
So I'm guessing this has something to do with the scale_color_gradientn() function being called when the plotting takes place, rather than within the loop.
If anyone can help with this, it'd be much appreciated. In base R I would bin the continuous data and assigning hex colors into a vector used for fill color, but I'm unsure how I can apply this within ggplot.
You need to use a closure (function with associated environment):
{
plots <- list()
k <- 1
for (i in c("v1", "v2")){
colm <- function() {
aa <- min(data1[, i])
bb <- quantile(data1[, i], 0.05)
cc <- quantile(data1[, i], 0.95)
dd <- max(data1[, i])
scale_color_gradientn(colors = cols[c(1, 5, 95, 100)],
values = rescale(c(aa, bb, cc, dd),
limits = c(aa, dd)))
}
plots[[k]] <- ggplot(data1, aes_string(x = "x",
y = "y",
color = i)) +
geom_point() +
colm()
k <- k + 1
}
}
plots[[1]]
plots[[2]]

Subset of data for specific plot layer

I want to
plot all data on some layers (here: geom_point)
plot only a subset on some other layers (here: geom_text for type "range")
However, I'm getting the text labels for the whole data, while they should only be added for the turquoise points.
I tried subsetting the data, but the output is not the desired. Still, the object sub_data holds only the wanted data.
Any suggestions?
R code:
library(ggplot2)
N <- 10
# create 20 = 2*10 data points
test_data <- data.frame(
idx <- c( 1:N, 1:N ),
vals <- c( runif(N, 0, 1),
rep( 0.5, N)),
type <- c( rep("range", N),
rep("const", N))
)
# this subsets to the 10 data points of type "range"
sub_data <- subset( test_data, type == "range")
ggplot( test_data, aes( x = idx, y = vals)) +
geom_point( aes( colour = type)) +
geom_text( data = sub_data, aes( x = idx + 0.1, label = idx ), size = 3.5)
output:
Change the <- to = inside your data.frame command, like this:
test_data <- data.frame(
idx = c(1:N, 1:N),
vals = c(runif(N, 0, 1), rep( 0.5, N)),
type = c(rep("range", N), rep("const", N))
)
Then execute your plot code and you should get the desired result.
An alternative to creating a dataframe in a correct way is:
idx <- c(1:N, 1:N),
vals <- c(runif(N, 0, 1), rep( 0.5, N)),
type <- c(rep("range", N), rep("const", N))
test_data <- data.frame(idx, vals, type)
For more background on the difference between the <- and the = assignment operators, see the answers to this question

Resources