Fit grouped curves by label in ggplot2 - r

While making a nomogram of Remotion related to Depth and Time of sedimentation, I need to fit curves (as paraboles) to remotion labels if they are lower than its upper ten (7 ceils to 10, and 18 to 20). This is very close to what I need.
data.frame(
depth=rep(seq(0.5, 3.5, 0.5), each=8),
time=rep(seq(0, 280, 40), times=7),
ss = c(
820,369,238,164,107,66,41,33,
820,224,369,279,213,164,115,90,
820,631,476,361,287,230,180,148,
820,672,558,426,353,287,238,187,
820,713,590,492,402,344,262,230,
820,722,615,533,460,394,320,262,
820,738,656,574,492,418,360,303)
) %>%
transmute(
depth = depth,
time = time,
R = 100*(1- ss/820)
) %>%
mutate(G=factor(round(R, digits=-1))) %>%
ggplot(aes(x=time, y=depth, colour=time))+
geom_label(aes(label=round(R)))+
scale_y_continuous(trans = "reverse")+
geom_path(aes(group=G))
But it is not getting parabolical curves. How can I smooth them under the tens condition?

I'm not sure if this is what you're looking for. I separated the data and the plot and applied stat_smooth for each group. Unfortunately, the smoothed lines do not follow the color scheme. You will also see several warnings do to the method in which this creates the splines.
plt <- ggplot(df1, aes(x=time, y=depth, colour = time)) +
geom_label(aes(label=round(R))) +
scale_y_continuous(trans = "reverse") +
geom_path(aes(group=G), size = .6, alpha = .5)
lapply(1:length(unique(df1$G)),
function(i){
df2 <- df1 %>% filter(G == unique(G)[i])
plt <<- plt +
stat_smooth(data = df2, size = .5,
aes(x = time, y = depth),
se = F, method = lm, color = "darkred",
formula = y ~ splines::bs(x, knots = nrow(df2)))
})
You can extend this further with additional parameters. I'm just not sure exactly what you're expecting.
plt <- ggplot(df1, aes(x=time, y=depth, colour = time)) +
geom_label(aes(label=round(R))) +
scale_y_continuous(trans = "reverse") +
geom_path(aes(group=G), size = .6, alpha = .5)
lapply(1:length(unique(df1$G)),
function(i){
df2 <- df1 %>% filter(G == unique(G)[i])
# u <- df1 %>% {nrow(unique(.[,c(1:2)]))}
plt <<- plt +
stat_smooth(
data = df2, size = .5,
aes(x = time, y = depth),
se = F, method = lm, color = "darkred",
formula = y ~ splines::bs(x, knots = nrow(df2),
degree = ifelse(nrow(df2) <= 4,
3, nrow(df2) - 2)))
})

Related

How to plot stat_mean for scatterplot in R ggplot2?

For each treatment tmt, I want to plot the means using stat_summary in ggplot2 with different colour size. I find that the there are mulitple means being plotted over the current points. Not sure how to rectify it.
df <- data.frame(x = rnorm(12, 4,1), y = rnorm(12, 6,4), tmt = rep(c("A","B","C"), each = 4))
ggplot(aes(x = x, y = y, fill = tmt), data = df) +
geom_point(shape=21, size=5, alpha = 0.6) +
scale_fill_manual(values=c("pink","blue", "purple")) +
stat_summary(aes(fill = tmt), fun = 'mean', geom = 'point', size = 5) +
scale_fill_manual(values=c("pink","blue", "purple"))
Plot without the last two lines of code
Plot with the entire code
Using stat_summary you compute the mean of y for each pair of x and tmt. If you want the mean of x and the mean of y per tmt I would suggest to manually compute the means outside of ggplot and use a second geom_point to plot the means. In my code below I increased the size and used rectangles for the means:
df <- data.frame(x = rnorm(12, 4,1), y = rnorm(12, 6,4), tmt = rep(c("A","B","C"), each = 4))
library(ggplot2)
library(dplyr)
df_mean <- df |>
group_by(tmt) |>
summarise(across(c(x, y), mean))
ggplot(aes(x = x, y = y, fill = tmt), data = df) +
geom_point(shape=21, size=5, alpha = 0.6) +
geom_point(data = df_mean, shape=22, size=8, alpha = 0.6) +
scale_fill_manual(values=c("pink","blue", "purple"))

How to allow selected points and confidence intervals to display on graph?

I have a dataset with species SP1 and SP2 and abiotic variables AB1 and AB2. I want to plot all of them on the graph but with modifications
Points: Display on SP1 and SP2 but NOT the AB1 and AB2 points on the graph
Lines: The AB lines should be black, dashed (see code attempt that did not work)
Confidence intervals: Only the SP lines need to have confidence intervals. The confidence intervals should have alpha = 0.2
set.seed(111)
var <- rep(c("SP1","SP2","AB1","AB2"), times = 5)
var.val <- rnorm(20,5,1)
level <- rep(c(100,200,300,400), each = 5)
df <- data.frame(var, var.val, level)
ggplot(df, aes(x = level, y = var.val, col = var, group = var)) +
geom_point(aes(fill = var),colour="white",pch=21, size=4, stroke = 1, alpha = 0.7, position = pd) + theme_classic() +
geom_smooth(method="lm", formula = y ~ x ) +
scale_linetype_manual(values = c("dashed", "dashed","solid", "solid")) +
scale_colour_manual(values = c("black","black","red","blue"))
You can achieve this by passing filtered versions of the data frame to each layer:
ggplot(df, aes(x = level, y = var.val, col = var, group = var)) +
geom_point(aes(fill = var), colour="white",pch=21, size=4, stroke = 1,
alpha = 0.7, data = df[df$var %in% c("SP1", "SP2"),]) +
theme_classic() +
geom_smooth(data = df[df$var %in% c("SP1", "SP2"),],
method = "lm", formula = y ~ x, alpha = 0.2) +
geom_smooth(data = df[!df$var %in% c("SP1", "SP2"),],
method = "lm", formula = y ~ x, se = FALSE, linetype = 2 ) +
scale_linetype_manual(values = c("dashed", "dashed","solid", "solid")) +
scale_colour_manual(values = c("black","black","red","blue"))
Though if AB and SP are conceptually two different variables, you might want to consider pivoting to a wider format.

Fitting Rayleigh in R

This code
library(ggplot2)
library(MASS)
# Generate gamma rvs
x <- rgamma(100000, shape = 2, rate = 0.2)
den <- density(x)
dat <- data.frame(x = den$x, y = den$y)
ggplot(data = dat, aes(x = x, y = y)) +
geom_point(size = 3) +
theme_classic()
# Fit parameters (to avoid errors, set lower bounds to zero)
fit.params <- fitdistr(estimate, "gamma", lower = c(0, 0))
# Plot using density points
ggplot(data = dat, aes(x = x,y = y)) +
geom_point(size = 3) +
geom_line(aes(x=dat$x, y=dgamma(dat$x,fit.params$estimate["shape"], fit.params$estimate["rate"])),
color="red", size = 1) +
theme_classic()
fits and plots the distribution of series x. The resulting plot is:
Packages stats and MASS seem not to support the Rayleigh distribution. How can I extend the previous code to the Rayleigh distribution?
In the code below I start by recreating the vector x, this time setting the RNG seed, in order to make the results reproducible. Then a data.frame dat with only that vector is also recreated.
The density functions of the Gamma and Rayleigh distributions are fit to the histogram of x by first estimating their parameters and with stat_function.
library(ggplot2)
library(MASS)
library(extraDistr) # for the Rayleigh distribution functions
# Generate gamma rvs
set.seed(2020)
x <- rgamma(100000, shape = 2, rate = 0.2)
dat <- data.frame(x)
# Fit parameters (to avoid errors, set lower bounds to zero)
fit.params <- fitdistr(dat$x, "gamma", lower = c(0, 0))
ggplot(data = dat, aes(x = x)) +
geom_histogram(aes(y = ..density..), bins = nclass.Sturges(x)) +
stat_function(fun = dgamma,
args = list(shape = fit.params$estimate["shape"],
rate = fit.params$estimate["rate"]),
color = "red", size = 1) +
ggtitle("Gamma density") +
theme_classic()
fit.params.2 <- fitdistrplus::fitdist(dat$x, "rayleigh", start = list(sigma = 1))
fit.params.2$estimate
ggplot(data = dat, aes(x = x)) +
geom_histogram(aes(y = ..density..), bins = nclass.Sturges(x)) +
stat_function(fun = drayleigh,
args = list(sigma = fit.params.2$estimate),
color = "blue", size = 1) +
ggtitle("Rayleigh density") +
theme_classic()
To plot points and lines like in the question, not histograms, use the code below.
den <- density(x)
orig <- data.frame(x = den$x, y = den$y)
ggplot(data = orig, aes(x = x)) +
geom_point(aes(y = y), size = 3) +
geom_line(aes(y = dgamma(x, fit.params$estimate["shape"], fit.params$estimate["rate"])),
color="red", size = 1) +
geom_line(aes(y = drayleigh(x, fit.params.2$estimate)),
color="blue", size = 1) +
theme_classic()

Exclude a particular area from geom_smooth fit automatically

I am plotting different plots in my shiny app.
By using geom_smooth(), I am fitting a smoothing curve on a scatterplot.
I am plotting these plots with ggplot() and rendering with ggplotly().
Is there any way, I can exclude a particular data profile from geom_smooth().
For e.g.:
It can be seen in the fit, the fit is getting disturbed and which is not desirable. I have tried plotly_click(), plotly_brush(), plotly_select(). But, I don't want user's interference when plotting this fit, this makes the process much slower and inaccurate.
Here is my code to plot this:
#plot
g <- ggplot(data = d_f4, aes_string(x = d_f4$x, y = d_f4$y)) + theme_bw() +
geom_point(colour = "blue", size = 0.1)+
geom_smooth(formula = y ~ splines::bs(x, df = 10), method = "lm", color = "green3", level = 1, size = 1)
Unfortunately, I can not include my dataset in my question, because the dataset is quite big.
You can make an extra data.frame without the "outliers" and use this as the input for geom_smooth:
set.seed(8)
test_data <- data.frame(x = 1:100)
test_data$y <- sin(test_data$x / 10) + rnorm(100, sd = 0.1)
test_data[60:65, "y"] <- test_data[60:65, "y"] + 1
data_plot <- test_data[-c(60:65), ]
library(ggplot2)
ggplot(data = test_data, aes(x = x, y = y)) + theme_bw() +
geom_point(colour = "blue", size = 0.1) +
geom_smooth(formula = y ~ splines::bs(x, df = 10), method = "lm", color = "green3", level = 1, size = 1)
ggplot(data = test_data, aes(x = x, y = y)) + theme_bw() +
geom_point(colour = "blue", size = 0.1) +
geom_smooth(data = data_plot, formula = y ~ splines::bs(x, df = 10), method = "lm", color = "green3", level = 1, size = 1)
Created on 2020-11-27 by the reprex package (v0.3.0)
BTW: you don't need aes_string (which is deprecated) and d_f4$x, you can just use aes(x = x)

ggplot2: histogram with normal curve

I've been trying to superimpose a normal curve over my histogram with ggplot 2.
My formula:
data <- read.csv (path...)
ggplot(data, aes(V2)) +
geom_histogram(alpha=0.3, fill='white', colour='black', binwidth=.04)
I tried several things:
+ stat_function(fun=dnorm)
....didn't change anything
+ stat_density(geom = "line", colour = "red")
...gave me a straight red line on the x-axis.
+ geom_density()
doesn't work for me because I want to keep my frequency values on the y-axis, and want no density values.
Any suggestions?
Solution found!
+geom_density(aes(y=0.045*..count..), colour="black", adjust=4)
Think I got it:
library(ggplot2)
set.seed(1)
df <- data.frame(PF = 10*rnorm(1000))
ggplot(df, aes(x = PF)) +
geom_histogram(aes(y =..density..),
breaks = seq(-50, 50, by = 10),
colour = "black",
fill = "white") +
stat_function(fun = dnorm, args = list(mean = mean(df$PF), sd = sd(df$PF)))
This has been answered here and partially here.
The area under a density curve equals 1, and the area under the histogram equals the width of the bars times the sum of their height ie. the binwidth times the total number of non-missing observations. To fit both on the same graph, one or other needs to be rescaled so that their areas match.
If you want the y-axis to have frequency counts, there are a number of options:
First simulate some data.
library(ggplot2)
set.seed(1)
dat_hist <- data.frame(
group = c(rep("A", 200), rep("B",150)),
value = c(rnorm(200, 20, 5), rnorm(150,25,10)))
# Set desired binwidth and number of non-missing obs
bw = 2
n_obs = sum(!is.na(dat_hist$value))
Option 1: Plot both histogram and density curve as density and then rescale the y axis
This is perhaps the easiest approach for a single histogram.
Using the approach suggested by Carlos, plot both histogram and density curve as density
g <- ggplot(dat_hist, aes(value)) +
geom_histogram(aes(y = ..density..), binwidth = bw, colour = "black") +
stat_function(fun = dnorm, args = list(mean = mean(dat_hist$value), sd = sd(dat_hist$value)))
And then rescale the y axis.
ybreaks = seq(0,50,5)
## On primary axis
g + scale_y_continuous("Counts", breaks = round(ybreaks / (bw * n_obs),3), labels = ybreaks)
## Or on secondary axis
g + scale_y_continuous("Density", sec.axis = sec_axis(
trans = ~ . * bw * n_obs, name = "Counts", breaks = ybreaks))
Option 2: Rescale the density curve using stat_function
With code tidied as per PatrickT's answer.
ggplot(dat_hist, aes(value)) +
geom_histogram(colour = "black", binwidth = bw) +
stat_function(fun = function(x)
dnorm(x, mean = mean(dat_hist$value), sd = sd(dat_hist$value)) * bw * n_obs)
Option 3: Create an external dataset and plot using geom_line.
Unlike the above options, this one works with facets. (EDITED to provide dplyr rather than plyr based solution). Note, the summarised dataset is being used as the primary, and the raw passed in for the histogram only.
library(tidyverse)
dat_hist %>%
group_by(group) %>%
nest(data = c(value)) %>%
mutate(y = map(data, ~ dnorm(
.$value, mean = mean(.$value), sd = sd(.$value)
) * bw * sum(!is.na(.$value)))) %>%
unnest(c(data,y)) %>%
ggplot(aes(x = value)) +
geom_histogram(data = dat_hist, binwidth = bw, colour = "black") +
geom_line(aes(y = y)) +
facet_wrap(~ group)
Option 4: Create external functions to edit the data on the fly
A bit over the top perhaps, but might be useful for someone?
## Function to create scaled dnorm data along full x axis range
dnorm_scaled <- function(data, x = NULL, binwidth = 1, xlim = NULL) {
.x <- na.omit(data[,x])
if(is.null(xlim))
xlim = c(min(.x), max(.x))
x_range = seq(xlim[1], xlim[2], length.out = 101)
setNames(
data.frame(
x = x_range,
y = dnorm(x_range, mean = mean(.x), sd = sd(.x)) * length(.x) * binwidth),
c(x, "y"))
}
## Function to apply over groups
dnorm_scaled_group <- function(data, x = NULL, group = NULL, binwidth = NULL, xlim = NULL) {
dat_hists <- lapply(
split(data, data[, group]), dnorm_scaled,
x = x, binwidth = binwidth, xlim = xlim)
for(g in names(dat_hists))
dat_hists[[g]][, "group"] <- g
setNames(do.call(rbind, dat_hists), c(x, "y", group))
}
## Single histogram
ggplot(dat_hist, aes(value)) +
geom_histogram(binwidth = bw, colour = "black") +
geom_line(data = ~ dnorm_scaled(., "value", binwidth = bw),
aes(y = y))
## With a single faceting variable
ggplot(dat_hist, aes(value)) +
geom_histogram(binwidth = 2, colour = "black") +
geom_line(data = ~ dnorm_scaled_group(
., x = "value", group = "group", binwidth = 2, xlim = c(0,50)),
aes(y = y)) +
facet_wrap(~ group)
This is an extended comment on JWilliman's answer. I found J's answer very useful. While playing around I discovered a way to simplify the code. I'm not saying it is a better way, but I thought I would mention it.
Note that JWilliman's answer provides the count on the y-axis and a "hack" to scale the corresponding density normal approximation (which otherwise would cover a total area of 1 and have therefore a much lower peak).
Main point of this comment: simpler syntax inside stat_function, by passing the needed parameters to the aesthetics function, e.g.
aes(x = x, mean = 0, sd = 1, binwidth = 0.3, n = 1000)
This avoids having to pass args = to stat_function and is therefore more user-friendly. Okay, it's not very different, but hopefully someone will find it interesting.
# parameters that will be passed to ``stat_function``
n = 1000
mean = 0
sd = 1
binwidth = 0.3 # passed to geom_histogram and stat_function
set.seed(1)
df <- data.frame(x = rnorm(n, mean, sd))
ggplot(df, aes(x = x, mean = mean, sd = sd, binwidth = binwidth, n = n)) +
theme_bw() +
geom_histogram(binwidth = binwidth,
colour = "white", fill = "cornflowerblue", size = 0.1) +
stat_function(fun = function(x) dnorm(x, mean = mean, sd = sd) * n * binwidth,
color = "darkred", size = 1)
This code should do it:
set.seed(1)
z <- rnorm(1000)
qplot(z, geom = "blank") +
geom_histogram(aes(y = ..density..)) +
stat_density(geom = "line", aes(colour = "bla")) +
stat_function(fun = dnorm, aes(x = z, colour = "blabla")) +
scale_colour_manual(name = "", values = c("red", "green"),
breaks = c("bla", "blabla"),
labels = c("kernel_est", "norm_curv")) +
theme(legend.position = "bottom", legend.direction = "horizontal")
Note: I used qplot but you can use the more versatile ggplot.
Here's a tidyverse informed version:
Setup
library(tidyverse)
Some data
d <- read_csv("https://vincentarelbundock.github.io/Rdatasets/csv/openintro/speed_gender_height.csv")
Preparing data
We'll use a "total" histogram for the whole sample, to that end, we'll need to remove the grouping information from the data.
d2 <-
d |>
select(-gender)
Here's a data set with summary data:
d_summary <-
d %>%
group_by(gender) %>%
summarise(height_m = mean(height, na.rm = T),
height_sd = sd(height, na.rm = T))
d_summary
Plot it
d %>%
ggplot() +
aes() +
geom_histogram(aes(y = ..density.., x = height, fill = gender)) +
facet_wrap(~ gender) +
geom_histogram(data = d2, aes(y = ..density.., x = height),
alpha = .5) +
stat_function(data = d_summary %>% filter(gender == "female"),
fun = dnorm,
#color = "red",
args = list(mean = filter(d_summary,
gender == "female")$height_m,
sd = filter(d_summary,
gender == "female")$height_sd)) +
stat_function(data = d_summary %>% filter(gender == "male"),
fun = dnorm,
#color = "red",
args = list(mean = filter(d_summary,
gender == "male")$height_m,
sd = filter(d_summary,
gender == "male")$height_sd)) +
theme(legend.position = "none",
axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()) +
labs(title = "Facetted histograms with overlaid normal curves",
caption = "The grey histograms shows the whole distribution (over) both groups, i.e. females and men") +
scale_fill_brewer(type = "qual", palette = "Set1")

Resources