This is really basic. Still hope I can get your help. I need to superimpose two density plots. The first is a generated normal density plot given mean and sd of AAPL. >
x <- seq(-20, 20, length.out = 5113)
normAAPL<-data.frame(x, f = dnorm(x,mean = meanAAPL, sd = sdAAPL)) %>%
ggplot(aes(x, f)) +
geom_line() +
stat_function(fun=dnorm, geom="line", col=2, lty=2)+
ylim(0,0.2)
> meanAAPL
[1] 0.101133
> sdAAPL
[1] 2.461525
The next is the actual distribution
dAAPL <-density(oldandnew$AAPL)
Where the 20 first AAPL data is
c(-8.810021, 1.45281, -9.051401, 4.628075, -1.774445, -5.25055,
-6.181806, 10.40407, 3.74302, 3.425328, 2.48944, 6.309463, -1.948374,
-4.652429, 5.493372, -1.852238, -0.1725783, -7.924, 2.074379,
-3.431709)
Do I need to combine the data in one data frame to plot them in the same ggplot?
Hope you can help me out.
df <- data.frame(x = seq(-20, 20, length.out = 5113),
f = dnorm(x))
df2 <- data.frame(x = c(-8.810021, 1.45281, -9.051401, 4.628075, -1.774445, -5.25055,
-6.181806, 10.40407, 3.74302, 3.425328, 2.48944, 6.309463, -1.948374,
-4.652429, 5.493372, -1.852238, -0.1725783, -7.924, 2.074379,
-3.431709))
ggplot() +
geom_line(data = df, aes(x, f, colour = "Normal")) +
geom_density(data = df2, aes(x, colour = "Actual")) +
ylim(0,0.2) +
scale_color_manual(name = "Distribution", values = c("Normal" = "Blue", "Actual" = "Red")) +
theme_minimal() + theme(legend.position = "top", aspect.ratio = 1)
Produces:
Related
I have the following graph and code:
Graph
ggplot(long2, aes(x = DATA, y = value, fill = variable)) + geom_area(position="fill", alpha=0.75) +
scale_y_continuous(labels = scales::comma,n.breaks = 5,breaks = waiver()) +
scale_fill_viridis_d() +
scale_x_date(date_labels = "%b/%Y",date_breaks = "6 months") +
ggtitle("Proporcions de les visites, només 9T i 9C") +
xlab("Data") + ylab("% visites") +
theme_minimal() + theme(legend.position="bottom") + guides(fill=guide_legend(title=NULL)) +
annotate("rect", fill = "white", alpha = 0.3,
xmin = as.Date.character("2020-03-16"), xmax = as.Date.character("2020-06-22"),
ymin = 0, ymax = 1)
But it has some sawtooth, how am I supposed to smooth it out?
I believe your situation is roughly analogous to the following, wherein we have missing x-positions for one group, but not the other at the same position. This causes spikes if you set position = "fill".
library(ggplot2)
x <- seq_len(100)
df <- data.frame(
x = c(x[-c(25, 75)], x[-50]),
y = c(cos(x[-c(25, 75)]), sin(x[-50])) + 5,
group = rep(c("A", "B"), c(98, 99))
)
ggplot(df, aes(x, y, fill = group)) +
geom_area(position = "fill")
To smooth out these spikes, it has been suggested to linearly interpolate the data at the missing positions.
# Find all used x-positions
ux <- unique(df$x)
# Split data by group, interpolate data groupwise
df <- lapply(split(df, df$group), function(xy) {
approxed <- approx(xy$x, xy$y, xout = ux)
data.frame(x = ux, y = approxed$y, group = xy$group[1])
})
# Recombine data
df <- do.call(rbind, df)
# Now without spikes :)
ggplot(df, aes(x, y, fill = group)) +
geom_area(position = "fill")
Created on 2022-06-17 by the reprex package (v2.0.1)
P.S. I would also have expected a red spike at x=50, but for some reason this didn't happen.
I've plotted a confusion matrix (predicting 5 outcomes) in R using ggplot and scales for geom_text labeling.
The way geom_text(aes(label = percent(Freq/sum(Freq))) is written in code, it's showing Frequency of each box divided by sum of all observations, but what I want to do is get Frequency of each box divided by sum Frequency for each Reference.
In other words, instead of A,A = 15.8%,
it should be A,A = 15.8%/(0.0%+0.0%+0.0%+0.0%+15.8%%) = 100.0%
library(ggplot2)
library(scales)
valid_actual <- as.factor(c("A","B","B","C","C","C","E","E","D","D","A","A","A","E","E","D","D","C","B"))
valid_pred <- as.factor(c("A","B","C","C","E","C","E","E","D","B","A","B","A","E","D","E","D","C","B"))
cfm <- confusionMatrix(valid_actual, valid_pred)
ggplotConfusionMatrix <- function(m){
mytitle <- paste("Accuracy", percent_format()(m$overall[1]),
"Kappa", percent_format()(m$overall[2]))
p <-
ggplot(data = as.data.frame(m$table) ,
aes(x = Reference, y = Prediction)) +
geom_tile(aes(fill = log(Freq)), colour = "white") +
scale_fill_gradient(low = "white", high = "green") +
geom_text(aes(x = Reference, y = Prediction, label = percent(Freq/sum(Freq)))) +
theme(legend.position = "none") +
ggtitle(mytitle)
return(p)
}
ggplotConfusionMatrix(cfm)
The problem is that, as far as I know, ggplot is not able to do group calculation. See this recent post for similar question.
To solve your problem you should take advantage of the dplyrpackage.
This should work
library(ggplot2)
library(scales)
library(caret)
library(dplyr)
valid_actual <- as.factor(c("A","B","B","C","C","C","E","E","D","D","A","A","A","E","E","D","D","C","B"))
valid_pred <- as.factor(c("A","B","C","C","E","C","E","E","D","B","A","B","A","E","D","E","D","C","B"))
cfm <- confusionMatrix(valid_actual, valid_pred)
ggplotConfusionMatrix <- function(m){
mytitle <- paste("Accuracy", percent_format()(m$overall[1]),
"Kappa", percent_format()(m$overall[2]))
data_c <- mutate(group_by(as.data.frame(m$table), Reference ), percentage =
percent(Freq/sum(Freq)))
p <-
ggplot(data = data_c,
aes(x = Reference, y = Prediction)) +
geom_tile(aes(fill = log(Freq)), colour = "white") +
scale_fill_gradient(low = "white", high = "green") +
geom_text(aes(x = Reference, y = Prediction, label = percentage)) +
theme(legend.position = "none") +
ggtitle(mytitle)
return(p)
}
ggplotConfusionMatrix(cfm)
And the result:
Is there a method to overlay something analogous to a density curve when the vertical axis is frequency or relative frequency? (Not an actual density function, since the area need not integrate to 1.) The following question is similar:
ggplot2: histogram with normal curve, and the user self-answers with the idea to scale ..count.. inside of geom_density(). However this seems unusual.
The following code produces an overinflated "density" line.
df1 <- data.frame(v = rnorm(164, mean = 9, sd = 1.5))
b1 <- seq(4.5, 12, by = 0.1)
hist.1a <- ggplot(df1, aes(v)) +
stat_bin(aes(y = ..count..), color = "black", fill = "blue",
breaks = b1) +
geom_density(aes(y = ..count..))
hist.1a
#joran's response/comment got me thinking about what the appropriate scaling factor would be. For posterity's sake, here's the result.
When Vertical Axis is Frequency (aka Count)
Thus, the scaling factor for a vertical axis measured in bin counts is
In this case, with N = 164 and the bin width as 0.1, the aesthetic for y in the smoothed line should be:
y = ..density..*(164 * 0.1)
Thus the following code produces a "density" line scaled for a histogram measured in frequency (aka count).
df1 <- data.frame(v = rnorm(164, mean = 9, sd = 1.5))
b1 <- seq(4.5, 12, by = 0.1)
hist.1a <- ggplot(df1, aes(x = v)) +
geom_histogram(aes(y = ..count..), breaks = b1,
fill = "blue", color = "black") +
geom_density(aes(y = ..density..*(164*0.1)))
hist.1a
When Vertical Axis is Relative Frequency
Using the above, we could write
hist.1b <- ggplot(df1, aes(x = v)) +
geom_histogram(aes(y = ..count../164), breaks = b1,
fill = "blue", color = "black") +
geom_density(aes(y = ..density..*(0.1)))
hist.1b
When Vertical Axis is Density
hist.1c <- ggplot(df1, aes(x = v)) +
geom_histogram(aes(y = ..density..), breaks = b1,
fill = "blue", color = "black") +
geom_density(aes(y = ..density..))
hist.1c
Try this instead:
ggplot(df1,aes(x = v)) +
geom_histogram(aes(y = ..ncount..)) +
geom_density(aes(y = ..scaled..))
library(ggplot2)
smoothedHistogram <- function(dat, y, bins=30, xlabel = y, ...){
gg <- ggplot(dat, aes_string(y)) +
geom_histogram(bins=bins, center = 0.5, stat="bin",
fill = I("midnightblue"), color = "#E07102", alpha=0.8)
gg_build <- ggplot_build(gg)
area <- sum(with(gg_build[["data"]][[1]], y*(xmax - xmin)))
gg <- gg +
stat_density(aes(y=..density..*area),
color="#BCBD22", size=2, geom="line", ...)
gg$layers <- gg$layers[2:1]
gg + xlab(xlabel) +
theme_bw() + theme(axis.title = element_text(size = 16),
axis.text = element_text(size = 12))
}
dat <- data.frame(x = rnorm(10000))
smoothedHistogram(dat, "x")
I would like to plot a barplot but I have dates on the x axis and I want those dates to be correctly spaced (as it is NON categorical)
set.seed(1)
m = matrix(abs(rnorm(6)),3,2)
rownames(m) = as.Date(c('2011-01-01','2011-01-03','2011-01-10'))
barplot(t(m),beside=T,col=c('red','blue'),las=2)
On this example I would like 14984 to be offset on the right.
I'd rather a graphics solution but ggplot2 is fine too
Would you mind to use ´ggplot´ instead?
library(ggplot2)
set.seed(1)
df <- data.frame(y=abs(rnorm(6)),
x=rep(as.Date(c('2011-01-01','2011-01-03','2011-01-10')),
times = 2),
g = factor(rep(c(1,2), each = 3)))
ggplot(aes(x=x, y=y, group = g, fill = g), data = df) +
geom_bar(stat = 'identity', position = 'dodge')
You can improve axis formatting with `scale_x_date´
library(scales)
ggplot(aes(x=x, y=y, group = g, fill = g), data = df) +
geom_bar(stat = 'identity', position = 'dodge') +
scale_x_date(breaks = '1 day') +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5))
And customize it to your purpose
ggplot(aes(x=x, y=y, group = g, fill = g), data = df) +
geom_bar(stat = 'identity', position = 'dodge') +
scale_x_date(breaks = '1 day') +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
scale_fill_manual('My\nclasses', values = c('1'='red', '2' = 'blue')) +
labs(list(title = 'Barplot\n', x = ('Date'), y = 'Values'))
With graphics, you probably have to prepare the data appropriately (with missing values for dates you don't consider) in order to do this. Then you can use barplot.
# matrix definition
set.seed(1)
m = matrix(abs(rnorm(6)),3,2)
rownames(m) = as.Date(c('2011-01-01','2011-01-03','2011-01-10'))
# get all dates in between
dts <- do.call(":", as.list(range(rownames(m))))
dts <- dts[!dts%in%rownames(m)]
mat <- matrix(NA, nrow=length(dts), ncol=2, dimnames=list(dts, NULL))
# combine with original matrix
m <- rbind(m, mat)
m <- m[order(rownames(m)), ]
which(!is.na(m[,1]))
# plot
barplot(t(m), beside=T, col=c('red','blue'),las=2, axes=FALSE, axisnames=FALSE)
axis(2)
axis(1, at=3*which(!is.na(m[,1]))-1, labels=rownames(m[!is.na(m[,1]),]))
I've been trying to superimpose a normal curve over my histogram with ggplot 2.
My formula:
data <- read.csv (path...)
ggplot(data, aes(V2)) +
geom_histogram(alpha=0.3, fill='white', colour='black', binwidth=.04)
I tried several things:
+ stat_function(fun=dnorm)
....didn't change anything
+ stat_density(geom = "line", colour = "red")
...gave me a straight red line on the x-axis.
+ geom_density()
doesn't work for me because I want to keep my frequency values on the y-axis, and want no density values.
Any suggestions?
Solution found!
+geom_density(aes(y=0.045*..count..), colour="black", adjust=4)
Think I got it:
library(ggplot2)
set.seed(1)
df <- data.frame(PF = 10*rnorm(1000))
ggplot(df, aes(x = PF)) +
geom_histogram(aes(y =..density..),
breaks = seq(-50, 50, by = 10),
colour = "black",
fill = "white") +
stat_function(fun = dnorm, args = list(mean = mean(df$PF), sd = sd(df$PF)))
This has been answered here and partially here.
The area under a density curve equals 1, and the area under the histogram equals the width of the bars times the sum of their height ie. the binwidth times the total number of non-missing observations. To fit both on the same graph, one or other needs to be rescaled so that their areas match.
If you want the y-axis to have frequency counts, there are a number of options:
First simulate some data.
library(ggplot2)
set.seed(1)
dat_hist <- data.frame(
group = c(rep("A", 200), rep("B",150)),
value = c(rnorm(200, 20, 5), rnorm(150,25,10)))
# Set desired binwidth and number of non-missing obs
bw = 2
n_obs = sum(!is.na(dat_hist$value))
Option 1: Plot both histogram and density curve as density and then rescale the y axis
This is perhaps the easiest approach for a single histogram.
Using the approach suggested by Carlos, plot both histogram and density curve as density
g <- ggplot(dat_hist, aes(value)) +
geom_histogram(aes(y = ..density..), binwidth = bw, colour = "black") +
stat_function(fun = dnorm, args = list(mean = mean(dat_hist$value), sd = sd(dat_hist$value)))
And then rescale the y axis.
ybreaks = seq(0,50,5)
## On primary axis
g + scale_y_continuous("Counts", breaks = round(ybreaks / (bw * n_obs),3), labels = ybreaks)
## Or on secondary axis
g + scale_y_continuous("Density", sec.axis = sec_axis(
trans = ~ . * bw * n_obs, name = "Counts", breaks = ybreaks))
Option 2: Rescale the density curve using stat_function
With code tidied as per PatrickT's answer.
ggplot(dat_hist, aes(value)) +
geom_histogram(colour = "black", binwidth = bw) +
stat_function(fun = function(x)
dnorm(x, mean = mean(dat_hist$value), sd = sd(dat_hist$value)) * bw * n_obs)
Option 3: Create an external dataset and plot using geom_line.
Unlike the above options, this one works with facets. (EDITED to provide dplyr rather than plyr based solution). Note, the summarised dataset is being used as the primary, and the raw passed in for the histogram only.
library(tidyverse)
dat_hist %>%
group_by(group) %>%
nest(data = c(value)) %>%
mutate(y = map(data, ~ dnorm(
.$value, mean = mean(.$value), sd = sd(.$value)
) * bw * sum(!is.na(.$value)))) %>%
unnest(c(data,y)) %>%
ggplot(aes(x = value)) +
geom_histogram(data = dat_hist, binwidth = bw, colour = "black") +
geom_line(aes(y = y)) +
facet_wrap(~ group)
Option 4: Create external functions to edit the data on the fly
A bit over the top perhaps, but might be useful for someone?
## Function to create scaled dnorm data along full x axis range
dnorm_scaled <- function(data, x = NULL, binwidth = 1, xlim = NULL) {
.x <- na.omit(data[,x])
if(is.null(xlim))
xlim = c(min(.x), max(.x))
x_range = seq(xlim[1], xlim[2], length.out = 101)
setNames(
data.frame(
x = x_range,
y = dnorm(x_range, mean = mean(.x), sd = sd(.x)) * length(.x) * binwidth),
c(x, "y"))
}
## Function to apply over groups
dnorm_scaled_group <- function(data, x = NULL, group = NULL, binwidth = NULL, xlim = NULL) {
dat_hists <- lapply(
split(data, data[, group]), dnorm_scaled,
x = x, binwidth = binwidth, xlim = xlim)
for(g in names(dat_hists))
dat_hists[[g]][, "group"] <- g
setNames(do.call(rbind, dat_hists), c(x, "y", group))
}
## Single histogram
ggplot(dat_hist, aes(value)) +
geom_histogram(binwidth = bw, colour = "black") +
geom_line(data = ~ dnorm_scaled(., "value", binwidth = bw),
aes(y = y))
## With a single faceting variable
ggplot(dat_hist, aes(value)) +
geom_histogram(binwidth = 2, colour = "black") +
geom_line(data = ~ dnorm_scaled_group(
., x = "value", group = "group", binwidth = 2, xlim = c(0,50)),
aes(y = y)) +
facet_wrap(~ group)
This is an extended comment on JWilliman's answer. I found J's answer very useful. While playing around I discovered a way to simplify the code. I'm not saying it is a better way, but I thought I would mention it.
Note that JWilliman's answer provides the count on the y-axis and a "hack" to scale the corresponding density normal approximation (which otherwise would cover a total area of 1 and have therefore a much lower peak).
Main point of this comment: simpler syntax inside stat_function, by passing the needed parameters to the aesthetics function, e.g.
aes(x = x, mean = 0, sd = 1, binwidth = 0.3, n = 1000)
This avoids having to pass args = to stat_function and is therefore more user-friendly. Okay, it's not very different, but hopefully someone will find it interesting.
# parameters that will be passed to ``stat_function``
n = 1000
mean = 0
sd = 1
binwidth = 0.3 # passed to geom_histogram and stat_function
set.seed(1)
df <- data.frame(x = rnorm(n, mean, sd))
ggplot(df, aes(x = x, mean = mean, sd = sd, binwidth = binwidth, n = n)) +
theme_bw() +
geom_histogram(binwidth = binwidth,
colour = "white", fill = "cornflowerblue", size = 0.1) +
stat_function(fun = function(x) dnorm(x, mean = mean, sd = sd) * n * binwidth,
color = "darkred", size = 1)
This code should do it:
set.seed(1)
z <- rnorm(1000)
qplot(z, geom = "blank") +
geom_histogram(aes(y = ..density..)) +
stat_density(geom = "line", aes(colour = "bla")) +
stat_function(fun = dnorm, aes(x = z, colour = "blabla")) +
scale_colour_manual(name = "", values = c("red", "green"),
breaks = c("bla", "blabla"),
labels = c("kernel_est", "norm_curv")) +
theme(legend.position = "bottom", legend.direction = "horizontal")
Note: I used qplot but you can use the more versatile ggplot.
Here's a tidyverse informed version:
Setup
library(tidyverse)
Some data
d <- read_csv("https://vincentarelbundock.github.io/Rdatasets/csv/openintro/speed_gender_height.csv")
Preparing data
We'll use a "total" histogram for the whole sample, to that end, we'll need to remove the grouping information from the data.
d2 <-
d |>
select(-gender)
Here's a data set with summary data:
d_summary <-
d %>%
group_by(gender) %>%
summarise(height_m = mean(height, na.rm = T),
height_sd = sd(height, na.rm = T))
d_summary
Plot it
d %>%
ggplot() +
aes() +
geom_histogram(aes(y = ..density.., x = height, fill = gender)) +
facet_wrap(~ gender) +
geom_histogram(data = d2, aes(y = ..density.., x = height),
alpha = .5) +
stat_function(data = d_summary %>% filter(gender == "female"),
fun = dnorm,
#color = "red",
args = list(mean = filter(d_summary,
gender == "female")$height_m,
sd = filter(d_summary,
gender == "female")$height_sd)) +
stat_function(data = d_summary %>% filter(gender == "male"),
fun = dnorm,
#color = "red",
args = list(mean = filter(d_summary,
gender == "male")$height_m,
sd = filter(d_summary,
gender == "male")$height_sd)) +
theme(legend.position = "none",
axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()) +
labs(title = "Facetted histograms with overlaid normal curves",
caption = "The grey histograms shows the whole distribution (over) both groups, i.e. females and men") +
scale_fill_brewer(type = "qual", palette = "Set1")