Add a fitting function to histogram - r

library(ggplot2)
library(fitdistrplus)
set.seed(1)
dat <- data.frame(n = rlnorm(1000))
# binwidth
bw = 0.2
# fit a lognormal distribution
fit_params <- fitdistr(dat$n,"lognormal")
ggplot(dat, aes(n)) +
geom_histogram(aes(y = ..density..), binwidth = bw, colour = "black") +
stat_function(fun = dlnorm, size = 1, color = 'gray',
args = list(mean = fit_params$estimate[1], sd = fit_params$estimate[2]))
# my defined function
myfun <- function(x, a, b) 1/(sqrt(2*pi*b(x-1)))*exp(-0.5*((log(x-a)/b)^2)) # a and b are meanlog and sdlog resp.
I'd like to fit a modified lognormal defined by myfun to a density histogram. How do I add this function?

Maybe you are looking for this. Some values can not appear because of the domain of your myfun:
library(ggplot2)
library(fitdistrplus)
set.seed(1)
dat <- data.frame(n = rlnorm(1000))
# binwidth
bw = 0.2
# fit a lognormal distribution
fit_params <- fitdistr(dat$n,"lognormal")
# my defined function
myfun <- function(x, a, b) 1/(sqrt(2*pi*b*(x-1)))*exp(-0.5*((log(x-a)/b)^2))
# a and b are meanlog and sdlog resp.
#Plot
ggplot(dat, aes(n)) +
geom_histogram(aes(y = ..density..), binwidth = bw, colour = "black") +
stat_function(fun = myfun, size = 1, color = 'gray',
args = list(a = fit_params$estimate[1], b = fit_params$estimate[2]))
Output:

Related

How to add name labels to a graph using ggplot2 in R?

I have the following code:
plot <- ggplot(data = df_sm)+
geom_histogram(aes(x=simul_means, y=..density..), binwidth = 0.20, fill="slategray3", col="black", show.legend = TRUE)
plot <- plot + labs(title="Density of 40 Means from Exponential Distribution", x="Mean of 40 Exponential Distributions", y="Density")
plot <- plot + geom_vline(xintercept=sampl_mean,size=1.0, color="black", show.legend = TRUE)
plot <- plot + stat_function(fun=dnorm,args=list(mean=sampl_mean, sd=sampl_sd),color = "dodgerblue4", size = 1.0)
plot <- plot+ geom_vline(xintercept=th_mean,size=1.0,color="indianred4",linetype = "longdash")
plot <- plot + stat_function(fun=dnorm,args=list(mean=th_mean, sd=th_mean_sd),color = "darkmagenta", size = 1.0)
plot
I want to show the legends of each layer, I tried show.legend = TRUE but it does nothing.
All my data frame is means from exponential distribution simulations, also I have some theoretical values from the distribution (mean and standard deviation) which I describe as th_mean and th_mean_sd.
The code for my simulation is the following:
lambda <- 0.2
th_mean <- 1/lambda
th_sd <- 1/lambda
th_var <- th_sd^2
n <- 40
th_mean_sd <- th_sd/sqrt(n)
th_mean_var <- th_var/sqrt(n)
simul <- 1000
simul_means <- NULL
for(i in 1:simul) {
simul_means <- c(simul_means, mean(rexp(n, lambda)))
}
sampl_mean <- mean(simul_means)
sampl_sd <- sd(simul_means)
df_sm<-data.frame(simul_means)
If you want to get a legend you have to map on aesthetics instead of setting the color, fill, ... as parameter, i.e. move color=... inside aes(...) and make use of scale_color/fill_manual to set the color values. Personally I find it helpful to make use of some meaningful labels, e.g. in case of your histogram I map the label "hist" on the fill but you could whatever label you like:
set.seed(123)
lambda <- 0.2
th_mean <- 1 / lambda
th_sd <- 1 / lambda
th_var <- th_sd^2
n <- 40
th_mean_sd <- th_sd / sqrt(n)
th_mean_var <- th_var / sqrt(n)
simul <- 1000
simul_means <- NULL
for (i in 1:simul) {
simul_means <- c(simul_means, mean(rexp(n, lambda)))
}
sampl_mean <- mean(simul_means)
sampl_sd <- sd(simul_means)
df_sm <- data.frame(simul_means)
library(ggplot2)
ggplot(data = df_sm) +
geom_histogram(aes(x = simul_means, y = ..density.., fill = "hist"), binwidth = 0.20, col = "black") +
labs(title = "Density of 40 Means from Exponential Distribution", x = "Mean of 40 Exponential Distributions", y = "Density") +
stat_function(fun = dnorm, args = list(mean = sampl_mean, sd = sampl_sd), aes(color = "sampl_mean"), size = 1.0) +
stat_function(fun = dnorm, args = list(mean = th_mean, sd = th_mean_sd), aes(color = "th_dens"), size = 1.0) +
geom_vline(size = 1.0, aes(xintercept = sampl_mean, color = "sampl_mean")) +
geom_vline(size = 1.0, aes(xintercept = th_mean, color = "th_mean"), linetype = "longdash") +
scale_fill_manual(values = c(hist = "slategray3")) +
scale_color_manual(values = c(sampl_dens = "dodgerblue4", th_dens = "darkmagenta", th_mean = "indianred4", sampl_mean = "black"))

R function to find suitable values for fitting constants

library(ggplot2)
set.seed(1)
dataset <- data.frame(X = rnorm(1000))
dfun <- function(x, a, b) 1/(sqrt(2*pi)*b)*exp(-0.5*((x-a)^2/(2*b^2)))
ggplot(dataset, aes(x = X)) +
geom_histogram(aes(y = ..density..), binwidth = 0.5)+
stat_function(fun = dfun,
args = list(a = , b = ))
How can I calculate suitable values of a and b in case like this?
You can compute values for the arguments a and b with nls. Something like the following.
dens <- density(dataset$X, n = nrow(dataset))
df_dens <- data.frame(x = dens$x, y = dens$y)
a0 <- mean(dataset$X)
b0 <- sd(dataset$X)
fit <- nls(y ~ dfun(x, a, b), data = df_dens, start = list(a = a0, b = b0))
coef(fit)
# a b
#-0.007006625 0.97518478
Now plot the histogram and the function with these values for a and b.
ggplot(dataset, aes(x = X)) +
geom_histogram(aes(y = ..density..), binwidth = 0.5)+
stat_function(fun = dfun,
args = list(a = coef(fit)[1], b = coef(fit)[2]))

How to draw multiple contours in the same

I am trying to get two contours in the same plot using ggplot2 in R.
Here is a reproducible example:
library(MASS)
library(ggplot2)
# first contour
m <- c(.0, -.0)
sigma <- matrix(c(1,.5,.5,1), nrow=2)
data.grid <- expand.grid(s.1 = seq(-3, 3, length.out=200), s.2 = seq(-3, 3, length.out=200))
q.samp <- cbind(data.grid, prob = mvtnorm::dmvnorm(data.grid, mean = m, sigma = sigma))
plot1 <- ggplot(q.samp, aes(x = s.1, y = s.2, z = prob)) +
stat_contour(color = 'green')
# second contour
m1 <- c(1, 1)
sigma1 <- matrix(c(1,-.5,-.5,1), nrow=2)
set.seed(10)
data.grid1 <- expand.grid(s.1 = seq(-3, 3, length.out=200), s.2 = seq(-3, 3, length.out=200))
q.samp1 <- cbind(data.grid1, prob = mvtnorm::dmvnorm(data.grid1, mean = m1, sigma = sigma1))
plot2 <- ggplot(q.samp1, aes(x = s.1, y = s.2, z = prob)) +
stat_contour(color = 'red')
However, trying plot1 + plot2 also does not work. Is there a way to get the two contours on the same plot.
What about including another stat_contour with different data?
ggplot(q.samp1, aes(x = s.1, y = s.2, z = prob)) +
stat_contour(color = 'red') +
stat_contour(data = q.samp, aes(x = s.1, y = s.2, z = prob), color = 'green')

Generating multiple plots containing functions in ggplot2

I am trying to make a composite plot in R using the packages ggplot2 and ggpubr.
I have no problem in making the composite plots except each plot has a normal distribution curve specific to that dataset. When I generate the composite plot, both plots have the same curve, that of the last dataset.
How can I generate the composite plot with each plot having its own specific normal distribution curve?
CODE AND OUTPUT PLOTS
## PLOT 1 ##
results_matrix_C <- data.frame(matrix(rnorm(20), nrow=20))
colnames(results_matrix_C) <- c("X")
m <- mean(results_matrix_C$X)
sd <- sd(results_matrix_C$X)
dnorm_C <- function(x){
norm_C <- dnorm(x, m, sd)
return(norm_C)
}
e = 1
dnorm_one_sd_C <- function(x){
norm_one_sd_C <- dnorm(x, m, sd)
# Have NA values outside interval x in [e]:
norm_one_sd_C[x <= e] <- NA
return(norm_one_sd_C)
}
C <- ggplot(results_matrix_C, aes(x = results_matrix_C$X)) +
geom_histogram(aes(y=..density..), bins = 10, colour = "black", fill = "white") +
stat_function(fun = dnorm_one_sd_C, geom = "area", fill = "#CE9A05", color = "#CE9A05", alpha = 0.25, size = 1) +
stat_function(fun = dnorm_C, colour = "#CE0539", size = 1) +
theme_classic()
## PLOT 2 ##
results_matrix_U <- data.frame(matrix(rnorm(20)+1, nrow=20))
colnames(results_matrix_U) <- c("X")
m <- mean(results_matrix_U$X)
sd <- sd(results_matrix_U$X)
dnorm_U <- function(x){
norm_U <- dnorm(x, m, sd)
return(norm_U)
}
e = 2
dnorm_one_sd_U <- function(x){
norm_one_sd_U <- dnorm(x, m, sd)
# Have NA values outside interval x in [e]:
norm_one_sd_U[x <= e] <- NA
return(norm_one_sd_U)
}
U <- ggplot(results_matrix_U, aes(x = results_matrix_U$X)) +
geom_histogram(aes(y=..density..), bins = 10, colour = "black", fill = "white") +
stat_function(fun = dnorm_one_sd_U, geom = "area", fill = "#CE9A05", color = "#CE9A05", alpha = 0.25, size = 1) +
stat_function(fun = dnorm_U, colour = "#CE0539", size = 1) +
theme_classic()
library(ggpubr)
ggarrange(C, U,
nrow = 1, ncol = 2)
As you can see in the composite plot, the first one has taken the normal distribution curve of the second plot rather than its own one from my initial plot (Plot 1).
UPDATE
Variable "e" refers to the shaded area which is related to the distribution curve.
m = mean of the dataset
sd = standard deviation of the dataset
m and sd are used to generate the normal distribution curves
SOLVED
By inserting the function in full into the stat_function section of the ggplot2 code, this has worked
i.e:
## PLOT 1 ##
results_matrix_C <- data.frame(matrix(rnorm(20), nrow=20))
colnames(results_matrix_C) <- c("X")
mean <- mean(results_matrix_C$X)
sd <- sd(results_matrix_C$X)
e = 1
C <- ggplot(results_matrix_C, aes(x = results_matrix_C$X)) +
geom_histogram(aes(y=..density..), bins = 10, colour = "black", fill = "white") +
stat_function(
fun = function(x, mean, sd, e){
norm_one_sd_C <- dnorm(x, mean, sd)
norm_one_sd_C[x <= e] <- NA
return(norm_one_sd_C)},
args = c(mean = mean, sd = sd, e = e), geom = "area", fill = "#CE9A05", color = "#CE9A05", alpha = 0.25, size = 1) +
stat_function(
fun = function(x, mean, sd){
dnorm(x = x, mean = mean, sd = sd)},
args = c(mean = mean, sd = sd), colour = "#CE0539", size = 1) +
theme_classic()
## PLOT 2 ##
results_matrix_U <- data.frame(matrix(rnorm(20)+1, nrow=20))
colnames(results_matrix_U) <- c("X")
mean <- mean(results_matrix_U$X)
sd <- sd(results_matrix_U$X)
e = 2
U <- ggplot(results_matrix_U, aes(x = results_matrix_U$X)) +
geom_histogram(aes(y=..density..), bins = 10, colour = "black", fill = "white") +
stat_function(
fun = function(x, mean, sd, e){
norm_one_sd_U <- dnorm(x, mean, sd)
norm_one_sd_U[x <= e] <- NA
return(norm_one_sd_U)},
args = c(mean = mean, sd = sd, e = e), geom = "area", fill = "#CE9A05", color = "#CE9A05", alpha = 0.25, size = 1) +
stat_function(
fun = function(x, mean, sd){
dnorm(x = x, mean = mean, sd = sd)},
args = c(mean = mean, sd = sd), colour = "#CE0539", size = 1) +
theme_classic()
library(ggpubr)
ggarrange(C, U,
nrow = 1, ncol = 2)

operation between stat_summary_hex plots made in ggplot2

I have two populations A and B distributed spatially with one character Z, I want to be able to make an hexbin substracting the proportion of the character in each hexbin. Here I have the code for two theoretical populations A and B
library(hexbin)
library(ggplot2)
set.seed(2)
xA <- rnorm(1000)
set.seed(3)
yA <- rnorm(1000)
set.seed(4)
zA <- sample(c(1, 0), 20, replace = TRUE, prob = c(0.2, 0.8))
hbinA <- hexbin(xA, yA, xbins = 40, IDs = TRUE)
A <- data.frame(x = xA, y = yA, z = zA)
set.seed(5)
xB <- rnorm(1000)
set.seed(6)
yB <- rnorm(1000)
set.seed(7)
zB <- sample(c(1, 0), 20, replace = TRUE, prob = c(0.4, 0.6))
hbinB <- hexbin(xB, yB, xbins = 40, IDs = TRUE)
B <- data.frame(x = xB, y = yB, z = zB)
ggplot(A, aes(x, y, z = z)) + stat_summary_hex(fun = function(z) sum(z)/length(z), alpha = 0.8) +
scale_fill_gradientn(colours = c("blue","red")) +
guides(alpha = FALSE, size = FALSE)
ggplot(B, aes(x, y, z = z)) + stat_summary_hex(fun = function(z) sum(z)/length(z), alpha = 0.8) +
scale_fill_gradientn (colours = c("blue","red")) +
guides(alpha = FALSE, size = FALSE)
here is the two resulting graphs
My goal is to make a third graph with hexbins with the values of the difference between hexbins at the same coordinates but I don't even know how to start to do it, I have done something similar in the raster Package, but I need it as hexbins
Thanks a lot
You need to make sure that both plots use the exact same binning. In order to achieve this, I think it is best to do the binning beforehand and then plot the results with stat_identity / geom_hex. With the variables from your code sample you ca do:
## find the bounds for the complete data
xbnds <- range(c(A$x, B$x))
ybnds <- range(c(A$y, B$y))
nbins <- 30
# function to make a data.frame for geom_hex that can be used with stat_identity
makeHexData <- function(df) {
h <- hexbin(df$x, df$y, nbins, xbnds = xbnds, ybnds = ybnds, IDs = TRUE)
data.frame(hcell2xy(h),
z = tapply(df$z, h#cID, FUN = function(z) sum(z)/length(z)),
cid = h#cell)
}
Ahex <- makeHexData(A)
Bhex <- makeHexData(B)
## not all cells are present in each binning, we need to merge by cellID
byCell <- merge(Ahex, Bhex, by = "cid", all = T)
## when calculating the difference empty cells should count as 0
byCell$z.x[is.na(byCell$z.x)] <- 0
byCell$z.y[is.na(byCell$z.y)] <- 0
## make a "difference" data.frame
Diff <- data.frame(x = ifelse(is.na(byCell$x.x), byCell$x.y, byCell$x.x),
y = ifelse(is.na(byCell$y.x), byCell$y.y, byCell$y.x),
z = byCell$z.x - byCell$z.y)
## plot the results
ggplot(Ahex) +
geom_hex(aes(x = x, y = y, fill = z),
stat = "identity", alpha = 0.8) +
scale_fill_gradientn (colours = c("blue","red")) +
guides(alpha = FALSE, size = FALSE)
ggplot(Bhex) +
geom_hex(aes(x = x, y = y, fill = z),
stat = "identity", alpha = 0.8) +
scale_fill_gradientn (colours = c("blue","red")) +
guides(alpha = FALSE, size = FALSE)
ggplot(Diff) +
geom_hex(aes(x = x, y = y, fill = z),
stat = "identity", alpha = 0.8) +
scale_fill_gradientn (colours = c("blue","red")) +
guides(alpha = FALSE, size = FALSE)

Resources