Not smooth density plot using ggplot2 - r

When I try to plot the density of some numerical data either using geom_density() or stat_density(), I get a non-smooth curve. Using adjust do not change this.
Here I've used facet_zoom(), but also coord_cartesian(xlim = c(...)) produces this non-smooth curve. Pretty weird in my opinion. Any suggestions what's going on?
https://drive.google.com/file/d/1PjQp7XkY5G21NoIo8y8lyeaXKvuvrqVk/view?usp=sharing
Edit: I have uploaded 50000 rows of the original data. To reproduce the plot (not using ggforce), use the code:
data <- read.table("rep.txt")
(
ggplot(data, aes(x = x))
+ geom_density(adjust = 1, fill = "grey")
+ coord_cartesian(xlim = c(-50000,50000))
+ labs(x = "", y = "")
+ theme_bw()
)

I reproduced your code but was unable to reproduce the exact image in your original question. Are you concerned about the lack of smoothness at the very tip of the geom_density plot? There are other arguments you can try like kernel and bw, but the sheer number of zeroes in your data will make it hard to achieve a smooth curve (unless you ramp up your adjust value).
library(tidyverse)
options(scipen = 999999)
# https://stackoverflow.com/questions/33135060/read-csv-file-hosted-on-google-drive
id <- "1PjQp7XkY5G21NoIo8y8lyeaXKvuvrqVk" # google file ID
data <- read.table(sprintf("https://docs.google.com/uc?id=%s&export=download", id)) %>%
rownames_to_column(var = "var")
ggplot(data, aes(x = x)) +
geom_density(
adjust = 10,
fill = "grey",
kernel = "cosine",
bw = "nrd0") +
coord_cartesian(xlim = c(-50000,50000)) +
labs(x = "", y = "") + theme_bw()
# I didn't export images for these, but they showcase how many zeroes you have
ggplot(data, aes(x = x)) +
geom_histogram(bins = 1000) +
coord_cartesian(xlim = c(0,50000)) +
labs(x = "", y = "") + theme_bw()
ggplot(data, aes(x = x)) +
geom_freqpoly(bins = 1000) +
coord_cartesian(xlim = c(0,50000)) +
labs(x = "", y = "") + theme_bw()

Related

R Windrose percent label on figure

I am using the windrose function posted here: Wind rose with ggplot (R)?
I need to have the percents on the figure showing on the individual lines (rather than on the left side), but so far I have not been able to figure out how. (see figure below for depiction of goal)
Here is the code that makes the figure:
p.windrose <- ggplot(data = data,
aes(x = dir.binned,y = (..count..)/sum(..count..),
fill = spd.binned)) +
geom_bar()+
scale_y_continuous(breaks = ybreaks.prct,labels=percent)+
ylab("")+
scale_x_discrete(drop = FALSE,
labels = waiver()) +
xlab("")+
coord_polar(start = -((dirres/2)/360) * 2*pi) +
scale_fill_manual(name = "Wind Speed (m/s)",
values = spd.colors,
drop = FALSE)+
theme_bw(base_size = 12, base_family = "Helvetica")
I marked up the figure I have so far with what I am trying to do! It'd be neat if the labels either auto-picked the location with the least wind in that direction, or if it had a tag for the placement so that it could be changed.
I tried using geom_text, but I get an error saying that "aesthetics must be valid data columns".
Thanks for your help!
One of the things you could do is to make an extra data.frame that you use for the labels. Since the data isn't available from your question, I'll illustrate with mock data below:
library(ggplot2)
# Mock data
df <- data.frame(
x = 1:360,
y = runif(360, 0, 0.20)
)
labels <- data.frame(
x = 90,
y = scales::extended_breaks()(range(df$y))
)
ggplot(data = df,
aes(x = as.factor(x), y = y)) +
geom_point() +
geom_text(data = labels,
aes(label = scales::percent(y, 1))) +
scale_x_discrete(breaks = seq(0, 1, length.out = 9) * 360) +
coord_polar() +
theme(axis.ticks.y = element_blank(), # Disables default y-axis
axis.text.y = element_blank())
#teunbrand answer got me very close! I wanted to add the code I used to get everything just right in case anyone in the future has a similar problem.
# Create the labels:
x_location <- pi # x location of the labels
# Get the percentage
T_data <- data %>%
dplyr::group_by(dir.binned) %>%
dplyr::summarise(count= n()) %>%
dplyr::mutate(y = count/sum(count))
labels <- data.frame(x = x_location,
y = scales::extended_breaks()(range(T_data$y)))
# Create figure
p.windrose <- ggplot() +
geom_bar(data = data,
aes(x = dir.binned, y = (..count..)/sum(..count..),
fill = spd.binned))+
geom_text(data = labels,
aes(x=x, y=y, label = scales::percent(y, 1))) +
scale_y_continuous(breaks = waiver(),labels=NULL)+
scale_x_discrete(drop = FALSE,
labels = waiver()) +
ylab("")+xlab("")+
coord_polar(start = -((dirres/2)/360) * 2*pi) +
scale_fill_manual(name = "Wind Speed (m/s)",
values = spd.colors,
drop = FALSE)+
theme_bw(base_size = 12, base_family = "Helvetica") +
theme(axis.ticks.y = element_blank(), # Disables default y-axis
axis.text.y = element_blank())

ggplot2 confusion matrix geom_text labeling

I've plotted a confusion matrix (predicting 5 outcomes) in R using ggplot and scales for geom_text labeling.
The way geom_text(aes(label = percent(Freq/sum(Freq))) is written in code, it's showing Frequency of each box divided by sum of all observations, but what I want to do is get Frequency of each box divided by sum Frequency for each Reference.
In other words, instead of A,A = 15.8%,
it should be A,A = 15.8%/(0.0%+0.0%+0.0%+0.0%+15.8%%) = 100.0%
library(ggplot2)
library(scales)
valid_actual <- as.factor(c("A","B","B","C","C","C","E","E","D","D","A","A","A","E","E","D","D","C","B"))
valid_pred <- as.factor(c("A","B","C","C","E","C","E","E","D","B","A","B","A","E","D","E","D","C","B"))
cfm <- confusionMatrix(valid_actual, valid_pred)
ggplotConfusionMatrix <- function(m){
mytitle <- paste("Accuracy", percent_format()(m$overall[1]),
"Kappa", percent_format()(m$overall[2]))
p <-
ggplot(data = as.data.frame(m$table) ,
aes(x = Reference, y = Prediction)) +
geom_tile(aes(fill = log(Freq)), colour = "white") +
scale_fill_gradient(low = "white", high = "green") +
geom_text(aes(x = Reference, y = Prediction, label = percent(Freq/sum(Freq)))) +
theme(legend.position = "none") +
ggtitle(mytitle)
return(p)
}
ggplotConfusionMatrix(cfm)
The problem is that, as far as I know, ggplot is not able to do group calculation. See this recent post for similar question.
To solve your problem you should take advantage of the dplyrpackage.
This should work
library(ggplot2)
library(scales)
library(caret)
library(dplyr)
valid_actual <- as.factor(c("A","B","B","C","C","C","E","E","D","D","A","A","A","E","E","D","D","C","B"))
valid_pred <- as.factor(c("A","B","C","C","E","C","E","E","D","B","A","B","A","E","D","E","D","C","B"))
cfm <- confusionMatrix(valid_actual, valid_pred)
ggplotConfusionMatrix <- function(m){
mytitle <- paste("Accuracy", percent_format()(m$overall[1]),
"Kappa", percent_format()(m$overall[2]))
data_c <- mutate(group_by(as.data.frame(m$table), Reference ), percentage =
percent(Freq/sum(Freq)))
p <-
ggplot(data = data_c,
aes(x = Reference, y = Prediction)) +
geom_tile(aes(fill = log(Freq)), colour = "white") +
scale_fill_gradient(low = "white", high = "green") +
geom_text(aes(x = Reference, y = Prediction, label = percentage)) +
theme(legend.position = "none") +
ggtitle(mytitle)
return(p)
}
ggplotConfusionMatrix(cfm)
And the result:

Set specific color in ggplot2 using scale_fill

I am visualizing missing data in R using this method which uses ggplot2:
library(reshape2)
library(ggplot2)
ggplot_missing <- function(x){
x %>%
is.na %>%
melt %>%
ggplot(data = .,
aes(x = Var2,
y = Var1)) +
geom_raster(aes(fill = value)) +
scale_fill_grey(name = "", labels = c("Present","Missing")) +
theme_minimal() +
theme(axis.text.x = element_text(angle=45, vjust=0.5)) +
labs(x = "Columns / Attributes",
y = "Rows / Observations")
}
The scale_fill_grey method uses black and grey. How can I change the color of the cells to a specific color, say "red"?
I have tried:
scale_fill_brewer(name = "", labels = c("Present","Missing"), na.val="red")
Also,
scale_fill_gradient(name = "", labels = c("Present","Missing"), low = "#FF69B4", high = "#FF0000")
But I get the error:
Error: Discrete value supplied to continuous scale
I got it to work by replacing scale_fill_grey with the following:
scale_fill_manual(name = "", values = c('my_color_1', 'my_color_2'), labels = c("Present","Missing")) +

Directlabels package-- labels do not fit in plot area

I want to explore the directlabels package with ggplot. I am trying to plot labels at the endpoint of a simple line chart; however, the labels are clipped by the plot panel. (I intend to plot about 10 financial time series in one plot and I thought directlabels would be the best solution.)
I would imagine there may be another solution using annotate or some other geoms. But I would like to solve the problem using directlabels. Please see code and image below. Thanks.
library(ggplot2)
library(directlabels)
library(tidyr)
#generate data frame with random data, for illustration and plot:
x <- seq(1:100)
y <- cumsum(rnorm(n = 100, mean = 6, sd = 15))
y2 <- cumsum(rnorm(n = 100, mean = 2, sd = 4))
data <- as.data.frame(cbind(x, y, y2))
names(data) <- c("month", "stocks", "bonds")
tidy_data <- gather(data, month)
names(tidy_data) <- c("month", "asset", "value")
p <- ggplot(tidy_data, aes(x = month, y = value, colour = asset)) +
geom_line() +
geom_dl(aes(colour = asset, label = asset), method = "last.points") +
theme_bw()
On data visualization principles, I would like to avoid extending the x-axis to make the labels fit--this would mean having data space with no data. Rather, I would like the labels to extend toward the white space beyond the chart box/panel (if that makes sense).
In my opinion, direct labels is the way to go. Indeed, I would position labels at the beginning and at the end of the lines, creating space for the labels using expand(). Also note that with the labels, there is no need for the legend.
This is similar to answers here and here.
library(ggplot2)
library(directlabels)
library(grid)
library(tidyr)
x <- seq(1:100)
y <- cumsum(rnorm(n = 100, mean = 6, sd = 15))
y2 <- cumsum(rnorm(n = 100, mean = 2, sd = 4))
data <- as.data.frame(cbind(x, y, y2))
names(data) <- c("month", "stocks", "bonds")
tidy_data <- gather(data, month)
names(tidy_data) <- c("month", "asset", "value")
ggplot(tidy_data, aes(x = month, y = value, colour = asset, group = asset)) +
geom_line() +
scale_colour_discrete(guide = 'none') +
scale_x_continuous(expand = c(0.15, 0)) +
geom_dl(aes(label = asset), method = list(dl.trans(x = x + .3), "last.bumpup")) +
geom_dl(aes(label = asset), method = list(dl.trans(x = x - .3), "first.bumpup")) +
theme_bw()
If you prefer to push the labels into the plot margin, direct labels will do that. But because the labels are positioned outside the plot panel, clipping needs to be turned off.
p1 <- ggplot(tidy_data, aes(x = month, y = value, colour = asset, group = asset)) +
geom_line() +
scale_colour_discrete(guide = 'none') +
scale_x_continuous(expand = c(0, 0)) +
geom_dl(aes(label = asset), method = list(dl.trans(x = x + .3), "last.bumpup")) +
theme_bw() +
theme(plot.margin = unit(c(1,4,1,1), "lines"))
# Code to turn off clipping
gt1 <- ggplotGrob(p1)
gt1$layout$clip[gt1$layout$name == "panel"] <- "off"
grid.draw(gt1)
This effect can also be achieved using geom_text (and probably also annotate), that is, without the need for direct labels.
p2 = ggplot(tidy_data, aes(x = month, y = value, group = asset, colour = asset)) +
geom_line() +
geom_text(data = subset(tidy_data, month == 100),
aes(label = asset, colour = asset, x = Inf, y = value), hjust = -.2) +
scale_x_continuous(expand = c(0, 0)) +
scale_colour_discrete(guide = 'none') +
theme_bw() +
theme(plot.margin = unit(c(1,3,1,1), "lines"))
# Code to turn off clipping
gt2 <- ggplotGrob(p2)
gt2$layout$clip[gt2$layout$name == "panel"] <- "off"
grid.draw(gt2)
Since you didn't provide a reproducible example, it's hard to say what the best solution is. However, I would suggest trying to manually adjust the x-scale. Use a "buffer" increase the plot area.
#generate data frame with random data, for illustration and plot:
p <- ggplot(tidy_data, aes(x = month, y = value, colour = asset)) +
geom_line() +
geom_dl(aes(colour = asset, label = asset), method = "last.points") +
theme_bw() +
xlim(minimum_value, maximum_value + buffer)
Using scale_x_discrete() or scale_x_continuous() would likely also work well here if you want to use the direct labels package. Alternatively, annotate or a simple geom_text would also work well.

How do I add vline to all the graphs in facet_grid?

I am working on the Boston data set and trying to see where the 8 room data are on each graph. I have commented out the line which is giving me the error.
I have to add vertical lines at all the points corresponding to rm = 8, to see the spread of data, in every graph of the grid. I want to know:
1. what I have done wrong.
2. A better way to find/represent data points where rm = 8.
library(ggplot2)
library(reshape2)
library(MASS)
library(data.table)
data("Boston")
Boston <- as.data.table(Boston)
molten_boston <- Boston[, `:=`(rm = round(rm),
nox = nox * 100,
chas = chas * 10)]
molten_boston <- melt(data = molten_boston, id.vars = "rm")
comments_bar <- ggplot(molten_boston) +
geom_bar(binwidth = 1, aes(x = value), color = "black", fill = "salmon") +
# geom_vline(data = molten_boston[rm == 8, .SD, by = variable, .SDcols = "value"], aes(xintercept = value)) +
facet_wrap(~ variable, scales = "free")
print(comments_bar)
One other visualization would be stacked bars, it looks ok when large:
molten_boston$EightRooms <- as.factor(molten_boston$rm == 8)
molten_boston$EightRooms <- relevel(molten_boston$EightRooms, 2)
ggplot(molten_boston, aes(x = value, fill = EightRooms)) +
geom_bar(binwidth = 1, color = "black") +
facet_wrap(~ variable, scales = "free")
Using a density plot in the background would be nice, but is a bit tricky in this case because of the changing y-axis. You probably have to do some pre-calculation. Here's my best attempt:
ggplot(molten_boston, aes(x = value)) +
geom_density(data = subset(molten_boston, rm == 8), aes(y =..density.. * 300),
fill = 'blue', alpha = 0.5) +
geom_bar(binwidth = 1, color = "black", fill = "salmon", alpha = 0.5) +
facet_wrap(~ variable, scales = "free")
Another way to do this would be using a rug-plot, with the rugs at the top. This is more or less the same as using geom_vline(...) but the lines don't extend all the way down, obscuring the bars. Also, I don't see why you want to use binwidth=1.
ggplot(molten_boston) +
geom_bar(aes(x = value), color = "grey50", fill = "salmon") +
geom_rug(data=molten_boston[rm==8,value, by=variable],
aes(x=value), sides="t", color="blue") +
facet_wrap(~ variable, scales = "free")
I don't have package data.table, so I can't tell if the problem lies in the data.table part of the code or not. But you need a single value for each room size, so
Boston$rm = round(Boston$rm)
molten_boston <- melt(data =Boston, id.vars = "rm")
rm.means = aggregate.data.frame(molten_boston$value,by=molten_boston[,1:2],FUN=mean)
comments_bar <- ggplot(molten_boston) +
geom_bar(binwidth = 1, aes(x = value), color = "black", fill = "salmon") +
geom_vline(data = rm.means[rm.means$rm==8,], aes(xintercept = x)) +
facet_wrap(~ variable, scales = "free")
print(comments_bar)
seems to work.

Resources