showing count on x-axis for dot plot - r

I'd like to have a dot plot that shows the count on the x-axis. How can you get the dotplot below to show the count on the x-asix?
Thank you.
date = seq(as.Date("2016/1/5"), as.Date("2016/1/11"), "day")
value = c(11,11,12,12,13,14,14)
dat =data.frame(date = date, value = value)
dat
library(ggplot2)
library(ggplot2)
ggplot(dat, aes(x = value)) + geom_dotplot(binwidth = .8) +
scale_y_discrete(breaks= seq(1,max(table(dat$value))+2,1),
labels = seq(1,max(table(dat$value))+2,1) ) #tried using scale_y discrete but it does nothing

ylim(0, A) gives what you want, where A is the number of stacked dots necessary to count 1.00 density. We can calculate the exact value of A (but a little complexly ; Dialogical approach gives you approximate value).
(I reffered to post1, post2, and post3)
library(ggplot2); library(grid)
date = seq(as.Date("2016/1/5"), as.Date("2016/1/12"), "day")
value = c(11,11,12,12,13,14,14,14)
dat =data.frame(date = date, value = value)
### base plot
g <- ggplot(dat, aes(x = value)) + geom_dotplot(binwidth = 0.8) + coord_flip()
g # output to read parameter
### calculation of width and height of panel
grid.ls(view=TRUE,grob=FALSE)
seekViewport('panel.3-4-3-4')
real_width <- convertWidth(unit(1,'npc'), 'inch', TRUE)
real_height <- convertHeight(unit(1,'npc'), 'inch', TRUE)
### calculation of other values
height_coordinate_range <- diff(ggplot_build(g)$panel$ranges[[1]]$y.range)
real_binwidth <- real_height / height_coordinate_range * 0.8 # 0.8 is the argument binwidth
num_balls <- real_width / 1.1 / real_binwidth # the number of stacked balls. 1.1 is expanding value.
g + ylim(0, num_balls)
# The dirty balls border probably comes from my environment.

You can add coord_flip() to switch the x and y axes in ggplot. Here's an example with your script:
date = seq(as.Date("2016/1/5"), as.Date("2016/1/11"), "day")
value = c(11,11,12,12,13,14,14)
dat =data.frame(date = date, value = value)
dat
Edit, count on x-axis:
This will give a dotplot with simplified commands, and the counts as labels on the x-axis. Note: The binwidth has been changed from 0.8 to 1 to accommodate the use of ylim rather than scales.
library(ggplot2)
ggplot(dat, aes(x = value)) +
geom_dotplot(binwidth = 1) +
coord_flip() +
ylim(0,max(table(dat$value))+2)
Edit, count on y-axis:
library(ggplot2)
ggplot(dat, aes(x = value)) +
geom_dotplot(binwidth = 1) +
ylim(0,max(table(dat$value))+2)

Related

How do I add data labels to a ggplot histogram with a log(x) axis?

I am wondering how to add data labels to a ggplot showing the true value of the data points when the x-axis is in log scale.
I have this data:
date <- c("4/3/2021", "4/7/2021","4/10/2021","4/12/2021","4/13/2021","4/13/2021")
amount <- c(105.00, 96.32, 89.00, 80.84, 121.82, 159.38)
address <- c("A","B","C","D","E","F")
df <- data.frame(date, amount, address)
And I plot it in ggplot2:
plot <- ggplot(df, aes(x = log(amount))) +
geom_histogram(binwidth = 1)
plot + theme_minimal() + geom_text(label = amount)
... but I get the error
"Error: geom_text requires the following missing aesthetics: y"
I have 2 questions as a result:
Why am I getting this error with geom_histogram? Shouldn't it assume to use count as the y value?
Will this successfully show the true values of the data points from the 'amount' column despite the plot's log scale x-axis?
Perhaps like this?
ggplot(df, aes(x = log(amount), y = ..count.., label = ..count..)) +
geom_histogram(binwidth = 1) +
stat_bin(geom = "text", binwidth = 1, vjust = -0.5) +
theme_minimal()
ggplot2 layers do not (at least in any situations I can think of) take the summary calculations of other layers, so I think the simplest thing would be to replicate the calculation using stat_bin(geom = "text"...
Or perhaps simpler, you could pre-calculate the numbers:
library(dplyr)
df %>%
count(log_amt = round(log(amount))) %>%
ggplot(aes(log_amt, n, label = n)) +
geom_col(width = 1) +
geom_text(vjust = -0.5)
EDIT -- to show buckets without the log transform we could use:
df %>%
count(log_amt = round(log(amount))) %>%
ggplot(aes(log_amt, n, label = n)) +
geom_col(width = 0.5) +
geom_text(vjust = -0.5) +
scale_x_continuous(labels = ~scales::comma(10^.),
minor_breaks = NULL)

ggplot2 - a custom histogram with a rug plot

I am trying to create a custom histogram with a rug plot showing the original values on the X axis.
I am going to use the mtcars dataset to illustrate. Its not be best dataset for this question...but hopefully the reader will understand what I am trying to achieve...
Below shows the basic histogram, without any rug plot attempt.
I want to create the histogram using geom_bar as this allows for more flexibility with custom bins.
I also want a small gap between the histgram bars (i.e width = 0.95) .... which adds to this
problem's complexity.
library(dplyr)
library(ggplot2)
# create custom bins
vct_seq <- c(seq(from = 10, to = 25, by = 5), 34)
mtcars$bin <- cut(mtcars$mpg, breaks = vct_seq)
# create data.frame for the ggplot graph..using bins above
df_mtcars_count <- mtcars %>% group_by(bin) %>% summarise(count = n())
# indicative labels
vct_labels <- c("bin 1", "bin 2", "bin 3", "bin 4")
# attempt 1 - basic plot -- no rug plot
p <- ggplot(data = df_mtcars_count, aes(x = bin, y = count))
p <- p + geom_bar(stat = "identity", width = 0.95)
p <- p + geom_text(aes(label = count), vjust = -0.5)
p <- p + scale_x_discrete("x title to go here", labels = df_mtcars_count$bin, breaks = df_mtcars_count$bin)
p
Next, try and add a basic rug plot on the X axis. This obviously doesn't work as the geom_bar and geom_rug have completely different scales.
# attempt 2 with no scaling.... doesn't work as x scale for ordinal (bins) and
# x scale for continuous (mpg) do not match
p <- ggplot(data = df_mtcars_count, aes(x = bin, y = count))
p <- p + geom_bar(stat = "identity", width = 0.95)
p <- p + geom_text(aes(label = count), vjust = -0.5)
p <- p + scale_x_discrete("x title to go here", labels = df_mtcars_count$bin, breaks = df_mtcars_count$bin)
p <- p + geom_rug(data = mtcars, aes(x = mpg), inherit.aes = F, alpha = 0.3)
p
Now, try and rescale the mpg column to match with the ordinal scale....
First define a linear mapping function...
fn_linear_map <- function(vct_existing_val, vct_new_range) {
# example....converts 1:20 into the range 1 to 10 like this:
# fn_linear_map(1:20, c(1, 10))
fn_r_diff <- function(x) x %>% range() %>% diff()
flt_ratio <- fn_r_diff(vct_new_range) / fn_r_diff(vct_existing_val)
vct_old_min_offset <- vct_existing_val - min(vct_existing_val)
vct_new_range_val <- (vct_old_min_offset * flt_ratio) + min(vct_new_range)
return(vct_new_range_val)
}
Now apply the function...we try and map mpg to the range 1 to 4 (which is an attempt to match
the ordinal scale)
mtcars$mpg_remap <- fn_linear_map(mtcars$mpg, c(1, 4))
Try the plot again.... getting closer ... but not really accurate...
# attempt 3: getting closer but doesn't really match the ordinal scale
p <- ggplot(data = df_mtcars_count, aes(x = bin, y = count))
p <- p + geom_bar(stat = "identity", width = 0.95)
p <- p + geom_text(aes(label = count), vjust = -0.5)
p <- p + scale_x_discrete("x title to go here", labels = df_mtcars_count$bin, breaks = df_mtcars_count$bin)
p <- p + geom_rug(data = mtcars, aes(x = mpg_remap), inherit.aes = F, alpha = 0.3)
p
The graph above is getting close to what I want....but rug plot does not line up
with the actual data ... example the max observation (33.9) should be displayed
almost aligning with the right hand side of the bar.. see below:
mtcars %>% filter(bin == "(25,34]") %>% arrange(mpg) %>% dplyr::select(mpg, mpg_remap)
Your scale makes no sense to me, as you are showing a bin that is twice as wide using the same bar width. Doing that in combination with a rug strikes me as confusing as best and misleading at worst. I suggest you plot the bars with their correct widths, after which the rug is trivial.
I think the best solution is to just use geom_histogram:
ggplot(mtcars, aes(mpg)) +
geom_histogram(breaks = vct_seq, col = 'grey80') +
geom_rug(aes(mpg, y = NULL))
If you really want the gaps between the bars you'll have to do more work:
library(tidyr)
d <- mtcars %>%
count(bin) %>%
separate(bin, c('min', 'max'), sep = ',', remove = FALSE) %>%
mutate_at(vars('min', 'max'), readr::parse_number) %>%
mutate(
middle = min + (max - min) / 2,
width = 0.9 * (max - min)
)
ggplot(d, aes(middle, n)) +
geom_col(width = d$width) +
geom_rug(aes(mpg, y = NULL), mtcars)

How to expand ggplot y axis limits to include maximum value

Often in plots the Y axis value label is chopped off below the max value being plotted.
For example:
library(tidyverse)
mtcars %>% ggplot(aes(x=mpg, y = hp))+geom_point()
I know of scale_y_continous - but I can't figure out a smart way to do this. Maybe I'm just overthinking things. I don't wish to mess up the 'smart' breaks that are generated automatically.
I might try to go about this manually...
mtcars %>% ggplot(aes(x=mpg, y=hp, color=as.factor(carb)))+geom_point() + scale_y_continuous(limits = c(0,375))
But this doesn't work like I mentioned above because of the 'smart breaks'. Is there anyway for me to extend the default break interval to 1 more, so that in this case it would be 400? Of course I would want this to be flexible for whatever dataset I am working with.
You can use expand_limits() to increase the maximum y-axis value. You can also ensure that the maximum y-axis value is rounded up to the next highest value on the scale of the data, e.g., next highest tens value, next highest hundreds value, etc., depending on the whether the highest value in the data is within the tens, hundreds, etc.
For example, the function below finds the base 10 log of the maximum y value and rounds it down. This gives us the base ten scale of the maximum y value (e.g., tens, hundreds, thousands, etc.). It then rounds the maximum y-axis value up to the nearest ten, hundred, etc., that is higher than the maximum y value.
expandy = function(vec, ymin=NULL) {
max.val = max(vec, na.rm=TRUE)
min.log = floor(log10(max.val))
expand_limits(y=c(ymin, ceiling(max.val/10^min.log)*10^min.log))
}
p = mtcars %>% ggplot(aes(x=mpg, y = hp)) +
geom_point()
p + expandy(mtcars$hp)
p + expandy(mtcars$hp, 0)
Or, to make things a bit easier, you could set up the function so that the y-range data is collected directly from the plot:
library(gridExtra)
expandy = function(plot, ymin=0) {
max.y = max(layer_data(plot)$y, na.rm=TRUE)
min.log = floor(log10(max.y))
expand_limits(y=c(ymin, ceiling(max.y/10^min.log)*10^min.log))
}
p = mtcars %>% ggplot(aes(x=mpg, y = hp)) +
geom_point()
grid.arrange(p, p + expandy(p), ncol=2)
p = iris %>% ggplot(aes(x=Sepal.Width, y=Petal.Width)) +
geom_point()
grid.arrange(p, p + expandy(p), ncol=2)
Choosing a step for breaking the y axis you can use the ceiling() function
library(gridExtra)
p1 <- mtcars %>% ggplot(aes(x=mpg, y = hp)) + geom_point()
p2 <- p1 +
scale_y_continuous(
limits = c(0, ceiling(max(mtcars$hp)/50)*50),
breaks = seq(0, ceiling(max(mtcars$hp)/50)*50, 50)
)
p3 <- p1 + scale_y_continuous(
limits = c(0, ceiling(max(mtcars$hp)/100)*100),
breaks = seq(0, ceiling(max(mtcars$hp)/100)*100, 100)
)
grid.arrange(p1, p2, p3, ncol=3)
For the p2 the ste is 50 while for p3 the step is 100
Here a solution that allow any kind of numeric scales:
expandy <- function(y, base, v_min = NULL) {
max.val <- max(y, na.rm = TRUE)
expand_limits(
y = c(
v_min,
base * (max.val %/% base + as.logical(max.val %% base))
)
)
}
here is a rather simple answer, just set one limit to NA:
mtcars %>%
ggplot(aes(x=mpg, y=hp, color=as.factor(carb))) +
geom_point() +
scale_y_continuous(limits = c(0, NA))

Format axis and label for line graph using ggplot2

Here is my sample data:
Singer <- c("A","B","C","A","B","C")
Rank <- c(1,2,3,3,2,1)
Episode <- c(1,1,1,2,2,2)
Votes <- c(0.3,0.28,0.11,0.14,0.29,0.38)
data <- data_frame(Episode,Singer,Rank,Votes)
data$Episode <- as.character(data$Episode)
I would like to make a line graph to show the performance of each singer.
I tried to use ggplot2 like below:
ggplot(data,aes(x=Episode,y=Votes,group = Singer)) + geom_line()
I have two questions:
How can I format the y-axis as percentage?
How can I label each dot in this line graph as the values of "Rank", which allows me to show rank and votes in the same graph?
To label each point use:
geom_label(aes(label = Rank))
# or
geom_text(aes(label = Rank), nudge_y = .01, nudge_x = 0)
To format the axis labels use:
scale_y_continuous(labels = scales::percent_format())
# or without package(scales):
scale_y_continuous(breaks = (seq(0, .4, .2)), labels = sprintf("%1.f%%", 100 * seq(0, .4, .2)), limits = c(0,.4))
Complete code:
library(ggplot2)
library(scales)
ggplot(data, aes(x = factor(Episode), y = Votes, group = Singer)) +
geom_line() +
geom_label(aes(label = Rank)) +
scale_y_continuous(labels = scales::percent_format())
Data:
Singer <- c("A","B","C","A","B","C")
Rank <- c(1,2,3,3,2,1)
Episode <- c(1,1,1,2,2,2)
Votes <- c(0.3,0.28,0.11,0.14,0.29,0.38)
data <- data_frame(Episode,Singer,Rank,Votes)
# no need to transform to character bc we use factor(Episode) in aes(x=..)

Align discrete and continuous axes with ggplot2 and grid

I'm attempting to display a grid figure of summarized weekly data of several variables. The two components of this graph that are most pertinent are a distributional summary graph (so box plot or violin plot) of the values that a certain variables took over a given week and a cumulative count graph of an integer variable accumulating over weeks (so a step plot). I would like to plot these two graphs in on an aligned x-axis using grid. I'll be using ggplot2 to make the individual graphs, because I've got a crush on Hadley Wickham (j/k, ggplot is just really, really nice).
The problem is that geom_boxplot only takes factors for x-axis and the geom_step only takes continuous data for the x-axis. These don't necessarily align even if you force similar x-limits with coord_cartesian or scale_x_....
I've cobbled together a hack using geom_rect that will work for this specific application, but that will be a pain to adapt if, for example, I have some other factor that results in multiple boxes for a single week.
The obligatory reproducible:
library(ggplot2)
library(grid)
var1 <- data.frame(val = rnorm(300),
week = c(rep(25, 100),
rep(26, 100),
rep(27, 100))
)
var2 <- data.frame(cumul = cumsum(c(0, rpois(2, 15))),
week = c(25, 26, 27)
)
g1 <- ggplot(var1, aes(x = factor(week), y = val)) +
geom_boxplot()
g2 <- ggplot(var2, aes(x = week, y = cumul)) +
geom_step() + scale_x_continuous(breaks = 25:27)
grid.newpage()
grid.draw(rbind(ggplotGrob(g1),
ggplotGrob(g2),
size = "last"))
And the kludge:
library(dplyr)
chiggity_check <- var1 %>%
group_by(week) %>%
summarise(week.avg = mean(val),
week.25 = quantile(val)[2],
week.75 = quantile(val)[4],
week.05 = quantile(val)[1],
week.95 = quantile(val)[5])
riggity_rect <- ggplot(chiggity_check) +
geom_rect(aes(xmin = week - 0.25, xmax = week + 0.25,
ymin = week.25,
ymax = week.75)) +
geom_segment(aes(x = week - 0.25, xend = week + 0.25,
y = week.avg, yend=week.avg),
color = "white") +
geom_segment(aes(x = week, xend = week ,
y = week.25, yend=week.05)) +
geom_segment(aes(x = week, xend = week ,
y = week.75, yend=week.95)) +
coord_cartesian(c(24.5,27.5)) +
scale_x_continuous(breaks = 25:27)
grid.newpage()
grid.draw(rbind(ggplotGrob(riggity_rect),
ggplotGrob(g2 + coord_cartesian(c(24.5,27.5))),
size = "last"))
So the question(s) is/are: is there a way to force geom_boxplot to a continuous axis or geom_step to a factor axis? Or is there some other implementation, perhaps stat_summary that will be a bit more flexible so that I can align axes and also potentially easily add in things like grouping color variables?
One approach is to plot the two charts on an x-axis set up with factor(week), but in the g2 plot (the step plot) do so in geom_blank() so that the scale is set up. Then in geom_step(), plot on a numeric scale: as.numeric(factor(week))
library(ggplot2)
library(grid)
# Your data
var1 <- data.frame(val = rnorm(300),
week = c(rep(25, 100),
rep(26, 100),
rep(27, 100))
)
var2 <- data.frame(cumul = cumsum(c(0, rpois(2, 15))),
week = c(25, 26, 27)
)
# Your g1
g1 <- ggplot(var1, aes(x = factor(week), y = val)) +
geom_boxplot()
# Modified g2
g2 <- ggplot(var2) + geom_blank(aes(x = factor(week), y = cumul)) +
geom_step(aes(x = as.numeric(as.factor(week)), y = cumul))
grid.newpage()
grid.draw(gridExtra::rbind.gtable(ggplotGrob(g1),
ggplotGrob(g2),
size = "last"))

Resources