predefine bins in geom_histogram - r

How do I pre-define bins for the histogram? For e.g.
predefine_bin_edges <- seq(0, 10, 1)
Can someone please use this predefine bin edges to update the example.
Edited later -
Also, Is there a way to include the extreme points, that are outside the binrange, in the corner bins for a complete picture of data distribution.
library(tidyverse)
# data
x <- rnorm(n = 1000, mean = 5, sd = 3)
tbl <- tibble(x)
# geom_histogram()
ggplot(data = tbl,
aes(x = x)) +
geom_histogram()

Do you mean like this?
library(tidyverse)
# data
x <- rnorm(n = 1000, mean = 5, sd = 3)
tbl <- tibble(x)
# geom_histogram()
ggplot(data = tbl,
aes(x = x)) +
geom_histogram(breaks = seq(0, 10, 1))
I don't think there is an argument to geom histogram that can include the values outside the range in the corner bins. However, you can squish the data at the aes() level.
ggplot(data = tbl,
aes(x = pmax(pmin(x, 10), 0))) +
geom_histogram(breaks = seq(0, 10, 1))
Or if you're uncomfortable with that, you can set the limits + oob arguments in the scale.
ggplot(data = tbl,
aes(x = x)) +
geom_histogram(breaks = seq(0, 10, 1)) +
scale_x_continuous(limits = c(0, 10),
oob = scales::oob_squish)

Related

How to produce neat label positions in the ggplot2 line chart?

I have a line chart built using ggplot2. It looks following:
Lines are close to each other and data labels are overlapping. It is not convenient. It would be better if light red labels were below the line and green labels where there is room for them. Something of the sort:
This post is helpful. However, I do not know in advance for which line it would be better to put labels above and for which it would be better to keep them below. Therefore I am looking for a generic solution.
ggrepel does a great job in organizing labels. But cannot figure out how to make it work in my case. I tried different parameters. Here is one of the simplest variants (not the best looking):
Questions:
Is there any way to make in R the chart look like on the 2nd picture?
I think ggrepel computes the best label position taking into account the size of the chart. If I export the chart to PowerPoint, for example, the size of the PowerPoint chart might be different from the size used to get optimal data label positions. Is there any way to pass the size of the chart to ggrepel?
Here is a code I used to generate data and charts:
library(ggplot2)
library(ggrepel)
set.seed(1)
x = rep(1:20, 3)
y = c(runif(20, 10, 11),
runif(20, 11, 12),
runif(20, 12, 13))
z = rep(c("a", "b", "c"), each = 20)
df = data.frame(x = x, y = y, z = z)
ggplot(data = df, aes(x = x, y = y, group = z, color = z)) +
geom_line() +
geom_text(aes(label = round(y, 1)), nudge_y = 1) +
ylim(c(0, 20))
ggplot(data = df, aes(x = x, y = y, group = z, color = z)) +
geom_line() +
geom_text_repel(aes(label = round(y, 1)), nudge_y = 1) +
ylim(c(0, 20))
Changing the theme to theme_bw() and removing gridlines from {ggExtra}'s removeGridX() gets the plot closer your second image. I also increased the size of the lines, limited the axes, and changed geom_text_repel to geom_label_repel to improve readability.
library(ggplot2)
library(ggrepel)
library(ggExtra)
set.seed(1)
x = rep(1:20, 3)
y = c(runif(20, 10, 11),
runif(20, 11, 12),
runif(20, 12, 13))
z = rep(c("a", "b", "c"), each = 20)
df = data.frame(x = x, y = y, z = z)
ggplot(data = df, aes(x = x, y = y, group = z, color = z)) +
theme_bw() + removeGridX() +
geom_line(size = 2) +
geom_label_repel(aes(label = round(y, 1)),
nudge_y = 0.5,
point.size = NA,
segment.color = NA,
min.segment.length = 0.1,
key_glyph = draw_key_path) +
scale_x_continuous(breaks=seq(0,20,by=1)) +
scale_y_continuous(breaks = seq(0, 14, 2), limits = c(0, 14))

Remove points with 0 density (no data) in stat_density_2d(geom = 'point')

I have two dataframes, one which I want to make a stat_density_2d plot using a 'raster' geom and one in which I want to use a 'point' geom. For the point geom I want to remove any point where there is no data though, as measured by a point size of 0.
The following is my code:
library(tidyverse)
set.seed(1)
#tibble for raster density plot
df <- tibble(x = runif(1000000, min = -7, max = 5),
y = runif(1000000, min = 0, max = 1000))
#tibble for point density plot
df2 <- tibble(x = runif(20000, min = -2, max = 2),
y = runif(20000, min = 0, max = 500))
#create the density plot
p1 <- ggplot(NULL, aes(x=x, y=y) ) +
stat_density_2d(data = df, aes(fill = stat(density)), geom = "raster", contour = FALSE) +
scale_fill_gradient(low="transparent", high="red") +
stat_density_2d(data = df2, geom = "point", aes(size = ..density..), n = 40, contour = FALSE) +
theme_bw() +
theme(text=element_text(size=18)) +
ylim(0, 1000) + xlim(-7, 5)
p1
which returns:
But where the points are smallest (outside the bounds specified in the df2 tibble) I don't want any density points to be shown. Is there anyway to remove these?
Here's a hack, though I don't know how robust it is to differences in data.
BLUF: add scale_radius(range=c(-1,6)).
I reduced your data a lot so that it doesn't take 5 minutes to render.
set.seed(1)
df <- tibble(x = runif(1000, min = -7, max = 5),
y = runif(1000, min = 0, max = 1000))
df2 <- tibble(x = runif(20, min = -2, max = 2),
y = runif(20, min = 0, max = 500))
Four plots:
Your code (my data), no other change;
scale_radius();
scale_radius(range = c(-0.332088004, 6)); and
scale_radius(range = c(-1, 6)).
This is surely a hack, and I don't know how to find a more precise way of filtering out specific levels.
The modified code:
p1 <- ggplot(NULL, aes(x=x, y=y) ) +
stat_density_2d(data = df, aes(fill = stat(density)), geom = "raster", contour = FALSE) +
scale_fill_gradient(low="transparent", high="red") +
stat_density_2d(data = df2, geom = "point", aes(size = ..density..), n = 40, contour = FALSE) +
theme_bw() +
# scale_radius() +
# scale_radius(range = c(-0.332088004, 6)) +
scale_radius(range = c(-1, 6)) +
theme(text=element_text(size=18)) +
ylim(0, 1000) + xlim(-7, 5)

Show multiple histogram using facet_wrap

Sample data
df <- data.frame(id = rep(1:6, each = 50), x = rnorm(50*6, mean = 10, sd = 5),
y = rnorm(50*6, mean = 20, sd = 10),
z = rnorm(50*6, mean = 30, sd = 15))
ggplot(df, aes(x)) + geom_histogram() + facet_wrap(~id)
How do I show x, y, z in the same plot for each id in different colours
It's best to reshape data from wide to long first, and then add a fill aesthetic to map what (i.e. x, y, z) to different fill colours:
library(tidyverse)
df %>%
gather(what, val, -id) %>%
ggplot(aes(val, fill = what)) + geom_histogram() + facet_wrap(~id)

Add hline with population median for each facet

I'd like to plot a horizontal facet-wide line with the population median of that facet.
I tried the approach without creating a dummy summary table with the following code:
require(ggplot2)
dt = data.frame(gr = rep(1:2, each = 500),
id = rep(1:5, 2, each = 100),
y = c(rnorm(500, mean = 0, sd = 1), rnorm(500, mean = 1, sd = 2)))
ggplot(dt, aes(x = as.factor(id), y = y)) +
geom_boxplot() +
facet_wrap(~ gr) +
geom_hline(aes(yintercept = median(y), group = gr), colour = 'red')
However, the line is drawn for the median of the entire dataset instead of the median separately for each facet:
In the past, a solution has been suggested to use
geom_line(stat = "hline", yintercept = "median")
but it's been discontinued (produces the error "No stat called StatHline").
Another solution suggested
geom_errorbar(aes(ymax=..y.., ymin=..y.., y = mean))
but it generates
Error in data.frame(y = function (x, ...) :
arguments imply differing number of rows: 0, 1000
Finally, there's a way to plot the median by creating a dummy table with the desired stats but I'd like to avoid it.
You could create an extra column in dt for median per facet.
library(dplyr) # With dplyr for example
dt <- dt %>% group_by(gr) %>%
mutate(med = median(y))
# Rerun ggplot line with yintercept = med
ggplot(dt, aes(x = as.factor(id), y = y)) +
geom_boxplot() +
facet_wrap(~ gr) +
geom_hline(aes(yintercept = med, group = gr), colour = 'red')
If you don't want to add a new column with the computed median, you can add a geom_smooth using a quantile regression :
library(ggplot2)
library(quantreg)
set.seed(1234)
dt <- data.frame(gr = rep(1:2, each = 500),
id = rep(1:5, 2, each = 100),
y = c(rnorm(500, mean = 0, sd = 1),
rnorm(500, mean = 1, sd = 2)))
ggplot(dt, aes(y = y)) +
geom_boxplot(aes(x = as.factor(id))) +
geom_smooth(aes(x = id), method = "rq", formula = y ~ 1, se = FALSE) +
facet_wrap(~ gr)

structure diagram where each members of group are connected to center and all cluster grand center in r

I am trying to create a structure diagram from the data like the following:
mydf <- data.frame ( group = rep (1:5, each = 20), z = rnorm (20, 10, 1),
x = c(rnorm (20, 2, 0.5), rnorm (20, 2, 0.5),
rnorm (20, 9, 0.5), rnorm (20, 9, 0.5),rnorm (20, 5, 0.5)),
y = c(rnorm (20, 2, 0.5), rnorm (20, 9, 0.5), rnorm (20, 2, 0.5),
rnorm (20, 9, 0.5), rnorm (20, 2, 0.5)))
means <- aggregate(. ~ group, data = mydf, mean)
gmx <-mean (mydf$x)
gmy <- mean (mydf$y)
library(ggplot2)
ggplot(mydf, aes(x, y)) +
geom_point(aes(colour= factor (group), size=z)) + theme_bw()
I want make connect every points within each cluster to its center and then the cluster center to grad mean. This will be produce a plot like the following (just rough sketch where two cluster are connected to the center, in real all cluster have the same):.........
(I would like to use the line segments of same color as of cluster if possible)
Here is an example:
library(plyr)
ms <- ddply(mydf, .(group), colwise(mean))
mydf2ms <- merge(mydf, ms, by = "group")
gm <- ddply(mydf, NULL, colwise(mean))
ms2gm <- data.frame(ms, gm)
ci <- expand.grid(1:3*2, seq(0, 2*pi, length = 180))
ci <- transform(ci, x = cos(Var2) * Var1 + gm$x, y = sin(Var2) * Var1 + gm$y)
library(ggplot2)
ggplot(mydf, aes(x, y)) +
geom_point(aes(colour= factor (group), size=z)) +
geom_segment(data = mydf2ms, mapping = aes(x = x.x, y = y.x, xend = x.y, yend = y.y, colour = factor(group))) +
geom_segment(data = ms2gm, mapping = aes(x = x, y = y, xend = x.1, yend = y.1)) +
geom_point(data = ms, colour = "black", size = 10, shape = 4) +
geom_point(data = gm, colour = "red", size = 10, shape = 4) +
geom_path(data = ci, mapping = aes(group = Var1), colour = "pink")

Resources