Removing points from plot generated with stat_summary - r

I've been asked to remove points from a plot that I've made with ggplot2. I'm attaching a MWE:
require(ggplot2)
require(Hmisc)
x = 5
k = 50
kx = k*5
data.A.1 = data.frame(n = rep(sample(1:100, k, replace=FALSE), x), v = rnorm(kx, 1, 2) + 0)
data.B.1 = data.frame(n = rep(sample(1:100, k, replace=FALSE), x), v = rnorm(kx, 1, 2) + 1)
data.C.1 = data.frame(n = rep(sample(1:100, k, replace=FALSE), x), v = rnorm(kx, 1, 2) + 2)
data.A.2 = data.frame(n = rep(sample(1:100, k, replace=FALSE), x), v = rnorm(kx, 1, 2) + 3)
data.B.2 = data.frame(n = rep(sample(1:100, k, replace=FALSE), x), v = rnorm(kx, 1, 2) + 4)
data.C.2 = data.frame(n = rep(sample(1:100, k, replace=FALSE), x), v = rnorm(kx, 1, 2) + 5)
multiple.plot.6x3.interval <- function(D, L) {
data = data.frame()
# join all the data in D into 'data'
e = 0
NN = ""
for (i in seq(1, length(D))) {
lidx = i%%3
if (lidx == 0) { lidx = 3 }
if (lidx == 1) {
e = e + 1
NN = paste0("10^", e)
}
n.obs = length(D[[i]]$n)
D[[i]]$lang.name = rep(L[lidx], n.obs)
D[[i]]$N = rep(NN, n.obs)
data = rbind(data, D[[i]])
}
# make the plot
g <- ggplot(data, aes(x=n, y=v)) +
stat_summary( # plot confidence interval
fun.data = mean_cl_boot, fun.args = (conf.int = 0.99),
geom = "ribbon", fill = "darkgrey"
) +
stat_summary( # plot maximum and minimum bars
fun = mean, fun.min = min, fun.max = max,
colour = "red", size = 0.15
) +
stat_summary(
fun = mean,
geom = "line", linetype = "solid", size = 0.4, color = "black"
) +
coord_cartesian(xlim=c(1, 100)) +
scale_x_continuous(breaks=seq(1, 101, 10)-1) +
facet_grid(
N ~ lang.name, labeller = "label_parsed"
) +
labs(
x=bquote("X"),
y=bquote("Y")
) +
theme(text = element_text(size = 20))
return (g)
}
g <- multiple.plot.6x3.interval(
list(
data.A.1, data.B.1, data.C.1,
data.A.2, data.B.2, data.C.2
),
c("A", "B", "C")
)
plot(g)
The result of this code is the one I want, but with the exception that I've been asked to remove the points that this
stat_summary( # plot maximum and minimum bars
fun = mean, fun.min = min, fun.max = max,
colour = "red", size = 0.15
) +
generates while keeping the bars.
This is what I get, and I would like to remove the red points (not the red bars).
Using size = 0 will make the bars completely invisible. I haven't been able to do this myself. I wonder: can this be done? If so, how? Any help will be appreciated.
Thank you all.

Try geom = "errorbar" as an argument to stat_summary:
require(ggplot2)
require(Hmisc)
x = 5
k = 50
kx = k*5
data.A.1 = data.frame(n = rep(sample(1:100, k, replace=FALSE), x), v = rnorm(kx, 1, 2) + 0)
data.B.1 = data.frame(n = rep(sample(1:100, k, replace=FALSE), x), v = rnorm(kx, 1, 2) + 1)
data.C.1 = data.frame(n = rep(sample(1:100, k, replace=FALSE), x), v = rnorm(kx, 1, 2) + 2)
data.A.2 = data.frame(n = rep(sample(1:100, k, replace=FALSE), x), v = rnorm(kx, 1, 2) + 3)
data.B.2 = data.frame(n = rep(sample(1:100, k, replace=FALSE), x), v = rnorm(kx, 1, 2) + 4)
data.C.2 = data.frame(n = rep(sample(1:100, k, replace=FALSE), x), v = rnorm(kx, 1, 2) + 5)
multiple.plot.6x3.interval <- function(D, L) {
data = data.frame()
# join all the data in D into 'data'
e = 0
NN = ""
for (i in seq(1, length(D))) {
lidx = i%%3
if (lidx == 0) { lidx = 3 }
if (lidx == 1) {
e = e + 1
NN = paste0("10^", e)
}
n.obs = length(D[[i]]$n)
D[[i]]$lang.name = rep(L[lidx], n.obs)
D[[i]]$N = rep(NN, n.obs)
data = rbind(data, D[[i]])
}
# make the plot
g <- ggplot(data, aes(x=n, y=v)) +
stat_summary( # plot confidence interval
fun.data = mean_cl_boot, fun.args = (conf.int = 0.99),
geom = "ribbon", fill = "darkgrey"
) +
stat_summary( # plot maximum and minimum bars
fun = mean, fun.min = min, fun.max = max,
geom = "errorbar", ### HERE
colour = "red", size = 0.15
) +
stat_summary(
fun = mean,
geom = "line", linetype = "solid", size = 0.4, color = "black"
) +
coord_cartesian(xlim=c(1, 100)) +
scale_x_continuous(breaks=seq(1, 101, 10)-1) +
facet_grid(
N ~ lang.name, labeller = "label_parsed"
) +
labs(
x=bquote("X"),
y=bquote("Y")
) +
theme(text = element_text(size = 20))
return (g)
}
g <- multiple.plot.6x3.interval(
list(
data.A.1, data.B.1, data.C.1,
data.A.2, data.B.2, data.C.2
),
c("A", "B", "C")
)
plot(g)

Related

Changing the color of the contour curve

I'd like to change the color of the contour curve from z variable. My MWE can be seen below.
library(ggplot2)
library(tidyverse)
rosenbrock <- function(x){
d <- length(x)
out <- 0
for(i in 1 : (d - 1)){
out <- out + 100 * ( x[i]^2 - x[i + 1] )^2 + (x[i] - 1)^2
}
out
}
set.seed(1)
coord <- matrix(runif(2000, -50, 50), byrow = TRUE, ncol = 2)
graph <- apply(coord, 1, rosenbrock)
results <- data.frame(x = coord[, 1], y = coord[, 2], z = graph) %>%
arrange(x, y)
set.seed(2020)
n <- 5
x1 <- matrix(c(round(rnorm(n, -12, 5), 2), 0, round(rnorm(n, -6, 5), 2), 0), byrow = F, ncol = 2)
y1 <- apply(x1, 1, function(x) rosenbrock(x))
test_points <- data.frame(x = x1[, 1], y = x1[, 2],
z = y1)
results %>%
ggplot(aes(x = x, y = y, z = z)) +
stat_density2d() +
geom_point(data = test_points, aes(colour = z), size = 2.0, shape = 19) +
scale_colour_gradientn(colours=rainbow(4)) +
theme_light() +
labs(colour = 'Fitness')
Something like this?
results %>%
ggplot(aes(x = x, y = y, z = z)) +
stat_density2d(aes(fill = stat(level)), geom = "polygon") +
geom_point(data = test_points, aes(colour = z), size = 2.0, shape = 19) +
scale_colour_gradientn(colours=rainbow(4)) +
theme_light() +
labs(colour = 'Fitness')
The last few examples at https://ggplot2.tidyverse.org/reference/geom_density_2d.html might be what you're looking for

Generating multiple plots containing functions in ggplot2

I am trying to make a composite plot in R using the packages ggplot2 and ggpubr.
I have no problem in making the composite plots except each plot has a normal distribution curve specific to that dataset. When I generate the composite plot, both plots have the same curve, that of the last dataset.
How can I generate the composite plot with each plot having its own specific normal distribution curve?
CODE AND OUTPUT PLOTS
## PLOT 1 ##
results_matrix_C <- data.frame(matrix(rnorm(20), nrow=20))
colnames(results_matrix_C) <- c("X")
m <- mean(results_matrix_C$X)
sd <- sd(results_matrix_C$X)
dnorm_C <- function(x){
norm_C <- dnorm(x, m, sd)
return(norm_C)
}
e = 1
dnorm_one_sd_C <- function(x){
norm_one_sd_C <- dnorm(x, m, sd)
# Have NA values outside interval x in [e]:
norm_one_sd_C[x <= e] <- NA
return(norm_one_sd_C)
}
C <- ggplot(results_matrix_C, aes(x = results_matrix_C$X)) +
geom_histogram(aes(y=..density..), bins = 10, colour = "black", fill = "white") +
stat_function(fun = dnorm_one_sd_C, geom = "area", fill = "#CE9A05", color = "#CE9A05", alpha = 0.25, size = 1) +
stat_function(fun = dnorm_C, colour = "#CE0539", size = 1) +
theme_classic()
## PLOT 2 ##
results_matrix_U <- data.frame(matrix(rnorm(20)+1, nrow=20))
colnames(results_matrix_U) <- c("X")
m <- mean(results_matrix_U$X)
sd <- sd(results_matrix_U$X)
dnorm_U <- function(x){
norm_U <- dnorm(x, m, sd)
return(norm_U)
}
e = 2
dnorm_one_sd_U <- function(x){
norm_one_sd_U <- dnorm(x, m, sd)
# Have NA values outside interval x in [e]:
norm_one_sd_U[x <= e] <- NA
return(norm_one_sd_U)
}
U <- ggplot(results_matrix_U, aes(x = results_matrix_U$X)) +
geom_histogram(aes(y=..density..), bins = 10, colour = "black", fill = "white") +
stat_function(fun = dnorm_one_sd_U, geom = "area", fill = "#CE9A05", color = "#CE9A05", alpha = 0.25, size = 1) +
stat_function(fun = dnorm_U, colour = "#CE0539", size = 1) +
theme_classic()
library(ggpubr)
ggarrange(C, U,
nrow = 1, ncol = 2)
As you can see in the composite plot, the first one has taken the normal distribution curve of the second plot rather than its own one from my initial plot (Plot 1).
UPDATE
Variable "e" refers to the shaded area which is related to the distribution curve.
m = mean of the dataset
sd = standard deviation of the dataset
m and sd are used to generate the normal distribution curves
SOLVED
By inserting the function in full into the stat_function section of the ggplot2 code, this has worked
i.e:
## PLOT 1 ##
results_matrix_C <- data.frame(matrix(rnorm(20), nrow=20))
colnames(results_matrix_C) <- c("X")
mean <- mean(results_matrix_C$X)
sd <- sd(results_matrix_C$X)
e = 1
C <- ggplot(results_matrix_C, aes(x = results_matrix_C$X)) +
geom_histogram(aes(y=..density..), bins = 10, colour = "black", fill = "white") +
stat_function(
fun = function(x, mean, sd, e){
norm_one_sd_C <- dnorm(x, mean, sd)
norm_one_sd_C[x <= e] <- NA
return(norm_one_sd_C)},
args = c(mean = mean, sd = sd, e = e), geom = "area", fill = "#CE9A05", color = "#CE9A05", alpha = 0.25, size = 1) +
stat_function(
fun = function(x, mean, sd){
dnorm(x = x, mean = mean, sd = sd)},
args = c(mean = mean, sd = sd), colour = "#CE0539", size = 1) +
theme_classic()
## PLOT 2 ##
results_matrix_U <- data.frame(matrix(rnorm(20)+1, nrow=20))
colnames(results_matrix_U) <- c("X")
mean <- mean(results_matrix_U$X)
sd <- sd(results_matrix_U$X)
e = 2
U <- ggplot(results_matrix_U, aes(x = results_matrix_U$X)) +
geom_histogram(aes(y=..density..), bins = 10, colour = "black", fill = "white") +
stat_function(
fun = function(x, mean, sd, e){
norm_one_sd_U <- dnorm(x, mean, sd)
norm_one_sd_U[x <= e] <- NA
return(norm_one_sd_U)},
args = c(mean = mean, sd = sd, e = e), geom = "area", fill = "#CE9A05", color = "#CE9A05", alpha = 0.25, size = 1) +
stat_function(
fun = function(x, mean, sd){
dnorm(x = x, mean = mean, sd = sd)},
args = c(mean = mean, sd = sd), colour = "#CE0539", size = 1) +
theme_classic()
library(ggpubr)
ggarrange(C, U,
nrow = 1, ncol = 2)

plotting density cauchy distribution in R

Just curious how can you generate the dcauchy distribution from Wikipedia:
Normally, you have
dcauchy(x, location = 0, scale = 1, log = FALSE)
for one line density p(x) v.s x
I assume in order to generate the diagram from wiki, a data.frame involves?
cauchy_dist <- data.frame(cauchy1 = rcauchy(10, location = 0, scale = 1, log = FALSE), cauchy2 = ....... , cauchy3 = ..... )
or you just need to
plot(x, P(x))
and then add lines to it?
You can use ggplot2's stat_function:
ggplot(data.frame(x = c(-5, 5)), aes(x)) +
stat_function(fun = dcauchy, n = 1e3, args = list(location = 0, scale = 0.5), aes(color = "a"), size = 2) +
stat_function(fun = dcauchy, n = 1e3, args = list(location = 0, scale = 1), aes(color = "b"), size = 2) +
stat_function(fun = dcauchy, n = 1e3, args = list(location = 0, scale = 2), aes(color = "c"), size = 2) +
stat_function(fun = dcauchy, n = 1e3, args = list(location = -2, scale = 1), aes(color = "d"), size = 2) +
scale_x_continuous(expand = c(0, 0)) +
scale_color_discrete(name = "",
labels = c("a" = expression(x[0] == 0*","~ gamma == 0.5),
"b" = expression(x[0] == 0*","~ gamma == 1),
"c" = expression(x[0] == 0*","~ gamma == 2),
"d" = expression(x[0] == -2*","~ gamma == 1))) +
ylab("P(x)") +
theme_bw(base_size = 24) +
theme(legend.position = c(0.8, 0.8),
legend.text.align = 0)
You could create the data as follows:
location <- c(0, 0, 0, -2)
scale <- c(0.5, 1, 2, 1)
x <- seq(-5, 5, by = 0.1)
cauchy_data <- Map(function(l, s) dcauchy(x, l, s), location, scale)
names(cauchy_data) <- paste0("cauchy", seq_along(location))
cauchy_tab <- data.frame(x = x, cauchy_data)
head(cauchy_tab)
## x cauchy1 cauchy2 cauchy3 cauchy4
## 1 -5.0 0.006303166 0.01224269 0.02195241 0.03183099
## 2 -4.9 0.006560385 0.01272730 0.02272830 0.03382677
## 3 -4.8 0.006833617 0.01324084 0.02354363 0.03600791
## 4 -4.7 0.007124214 0.01378562 0.02440091 0.03839685
## 5 -4.6 0.007433673 0.01436416 0.02530285 0.04101932
## 6 -4.5 0.007763656 0.01497929 0.02625236 0.04390481
Map is used to apply a function of multiple variables to just as many vectors element by element. Thus, the first list element of cauchy_data will contain the following
dcauchy(x, location[1], scale[1])
and so on. I then put the Cauchy data in a data frame together with the vector of x coordinates, x. So you have the desired data table.
There are, of course, many ways to plot this. I prefer to use ggplot and show you how to plot as an example:
library(tidyr)
library(ggplot2)
curve_labs <- paste(paste("x0 = ", location), paste("gamma = ", scale), sep = ", ")
plot_data <- gather(cauchy_tab, key = curve, value = "P", -x )
ggplot(plot_data, aes(x = x, y = P, colour = curve)) + geom_line() +
scale_colour_discrete(labels = curve_labs)
You could tweak the plot in many ways to get something that more closely resembles the plot from Wikipedia.

KMggplot2 plugin only seems to work with package example

I am trying to use the KMggplot2 plugin for Rcmdr. It only seems to work with the built it data set dataKm. When I try to use other data sets such as lung, I get no plot - just the error message "numbers of columns of arguments do not match. Here is the code for each plot attempt.
Here is the code when I try using the lung data even without attempting to have a number at risk list.
library(survival, pos=17)
data(lung, package="survival")
sapply(c("ggplot2", "grid"), require, character.only = TRUE)
Loading required package: ggplot2
Loading required package: grid
ggplot2 grid
TRUE TRUE
.df <- data.frame(x = lung$time, y = lung$status, z = factor("At risk"))
.df <- .df[do.call(order, .df[, c("z", "x"), drop = FALSE]), , drop = FALSE]
.fit <- survival::survfit(survival::Surv(time = x, event = y, type = "right") ~ z, .df)
.fit <- data.frame(x = .fit$time, y = .fit$surv, nrisk = .fit$n.risk, nevent = .fit$n.event, ncensor= .fit$n.censor, upper = .fit$upper, lower = .fit$lower)
.df <- unique(.df)
.df <- .fit <- data.frame(.fit, .df[, c("z"), drop = FALSE])
Error in data.frame(.fit, .df[, c("z"), drop = FALSE]) : arguments imply differing number of rows: 186, 199
.df <- .fit <- rbind(unique(data.frame(x = 0, y = 1, nrisk = NA, nevent = NA, ncensor = NA, upper = 1, lower = 1, .df[, c("z"), drop = FALSE])), .fit)
Error in rbind(deparse.level, ...) : numbers of columns of arguments do not match
.cens <- subset(.fit, ncensor == 1)
.plot <- ggplot(data = .fit, aes(x = x, y = y, colour = z)) + geom_step(data = subset(.fit, !is.na(upper)), aes(y = upper), size = 1, lty = 2, alpha = 0.5, show_guide = FALSE, na.rm = FALSE) +
geom_step(data = subset(.fit, !is.na(lower)), + aes(y = lower), size = 1, lty = 2, alpha = 0.5, show_guide = FALSE, na.rm = FALSE) +
geom_step(size =1.5)+ geom_linerange(data = .cens, aes(x = x, ymin = y, ymax = y + 0.02), size = 1.5) +
scale_x_continuous(breaks = seq(0, 900, by = 300), limits = c(0, 900)) + scale_y_continuous(limits = c(0, 1), expand = c(0.01, 0)) +
scale_colour_brewer(palette = "Set1") + xlab("Time from entry") +
ylab("Proportion of survival") +
theme_gray(base_size = 14, base_family = "serif")
Error in +geom_step(size = 1.5) : invalid argument to unary operator+
theme(legend.position = "none")
Error in inherits(x, "theme") : argument "e2" is missing, with no default
print(.plot)
Error in eval(expr, envir, enclos) : object 'z' not found
I just received this response from the developer who said that an update would be uploaded by the end of the month:
"We found a bug was caused by a tie data handling.The following code can be used."
sapply(c("ggplot2", "grid"), require, character.only = TRUE)
.df <- data.frame(x = lung$time, y = lung$status, z = factor("At risk"))
.df <- .df[do.call(order, .df[, c("z", "x"), drop = FALSE]), , drop = FALSE]
.fit <- survival::survfit(survival::Surv(time = x, event = y, type = "right") ~ z, .df)
.fit <- data.frame(x = .fit$time, y = .fit$surv, nrisk = .fit$n.risk,
nevent = .fit$n.event, ncensor= .fit$n.censor, upper = .fit$upper, lower = .fit$lower)
.df <- .df[!duplicated(.df$x), ]
.df <- .fit <- data.frame(.fit, .df[, c("z"), drop = FALSE])
.df <- .fit <- rbind(unique(data.frame(x = 0, y = 1, nrisk = NA, nevent = NA, ncensor = NA, upper = 1, lower = 1, .df[, c("z"), drop = FALSE])), .fit)
.cens <- subset(.fit, ncensor == 1) .plot <- ggplot(data = .fit, aes(x = x, y = y, colour = z)) +
geom_step(data = subset(.fit, !is.na(upper)), aes(y = upper), size = 1, lty = 2, alpha = 0.5, show_guide = FALSE, na.rm = FALSE) +
geom_step(data = subset(.fit, !is.na(lower)), aes(y = lower), size = 1, lty = 2, alpha = 0.5, show_guide = FALSE, na.rm = FALSE) +
geom_step(size = 1.5) +
geom_linerange(data = .cens, aes(x = x, ymin = y, ymax = y + 0.02), size = 1.5) +
scale_x_continuous(breaks = seq(0, 900, by = 300), limits = c(0, 900)) +
scale_y_continuous(limits = c(0, 1), expand = c(0.01, 0)) +
scale_colour_brewer(palette = "Set1") +
xlab("Time from entry") +
ylab("Proportion of survival") +
theme_gray(base_size = 14, base_family = "serif") +
theme(legend.position = "none")
print(.plot)

How can you limit range of stat_function plots with ggplot2?

I am trying to create a figure to show different saturation levels and their effect on sampling dynamics for a talk using the following code:
max <- 2
decay <- function(x, k, C) {
C * (1 - exp(-k*x))
}
require("ggplot2")
ggplot(NULL, aes(x=x, colour = C)) +
stat_function(data = data.frame(x = 0:max, C = factor(1)), fun = function(x) { decay(x, k=10, C=1e1) }) +
stat_function(data = data.frame(x = 0:max, C = factor(2)), fun = function(x) { decay(x, k=10, C=1e2) }) +
stat_function(data = data.frame(x = 0:max, C = factor(3)), fun = function(x) { decay(x, k=10, C=1e3) }) +
stat_function(data = data.frame(x = 0:max, C = factor(4)), fun = function(x) { decay(x, k=10, C=1e4) }) +
stat_function(data = data.frame(x = 0:max, C = factor(5)), fun = function(x) { decay(x, k=10, C=1e5) }) +
stat_function(data = data.frame(x = 0:max, C = factor(6)), fun = function(x) { decay(x, k=10, C=1e6) }) +
scale_colour_manual(values = c("red", "orange", "yellow", "green", "blue", "violet"), labels = c(1, 2, 3, 4, 5, 6)) + scale_colour_discrete(name=expression(paste(C, " value"))) +
ylab(label="count") + ylim(0, 100)
The intention is to show that for the high C value cases the curve will appear linear. However, the ylim prevents any curve from being shown where it has a value greater than the max for the ylim when I would expect it to merely truncate the curve at the max value.
How do I get the desired behaviou?
You have noticed the difference between limiting the scale(using scale_y_continuous(limits=...))
or limiting the coordinate space (using coord_cartesian(ylim=...).
When you call ylim it uses the equivalent of scale_y_continuous and drops observations not in the range
The help for ylim and xlim describe this (and point to coord_cartesian as an alternative)
# here is your example rewritten
ggplot(data = NULL, aes(x=x,colour=C)) +
lapply(1:6, function(y){
stat_function(data = data.frame(x=0:max,C=factor(y)),
fun = function(x) decay(x,k=10, C = 10^y))) +
scale_colour_manual(values = c("red", "orange", "yellow", "green", "blue", "violet"),
labels = c(1, 2, 3, 4, 5, 6)) +
scale_colour_discrete(name=expression(paste(C, " value"))) +
ylab(label="count") +
coord_cartesian(ylim = c(0,100))

Resources