Plotting a Chart when Data isn't there - r

I'm trying to plot monthly data in which two months have no data. My code is below; the values for April and May are missing (NA). However, when I try to plot the chart, I get this error:
Error: (converted from warning) Removed 2 rows containing non-finite values (stat_boxplot).
If I just add a zero in each entry (instead of NA) I get the plot, but now with a value when there shouldn't be any (see https://i.stack.imgur.com/CHI51.png). If I add na.omit(df) it just removes the two months. Could someone assist me please?
# Cost deltas, aligned position-by-position with Month below;
# the two NA entries are the values for "Apr" and "May".
Cost_Delta<-c(85000,-32672.62,28335.64,-85000,30963.5,-28335.64,NA,NA,
-85000,32672.62,85000,-32672.62,-85000,-32672.62,85000,
-32672.62,-85000,32672.62,85000,32672.62,-85000,-32672.62)
Month<-c("Jan","Jan","Feb","Feb","Mar","Mar","Apr","May","Jun","Jun",
"Jul","Jul","Aug","Aug","Sep","Sep","Oct","Oct","Nov","Nov","Dec","Dec")
df<-data.frame(Cost_Delta,Month)
# Use the order of first appearance as the factor level order.
df$Month <- as.character(df$Month)
df$Month <- factor(df$Month, levels=unique(df$Month))
library(ggplot2)
# Jittered points per month with an unfilled boxplot drawn on top.
# Neither layer sets na.rm, so the NA rows are still present when the layers are built.
p<-ggplot(df, aes(x=Month, y=Cost_Delta)) +
geom_point(aes(fill=Month), size=2, shape=21, colour="grey20",
position=position_jitter(width=0.2, height=0.1)) +
geom_boxplot(outlier.colour=NA, fill=NA, colour="grey20") +
scale_y_continuous(labels=scales::comma,breaks=seq(-300000,400000,50000)) +
labs(x="Month-Year", y="Cost Delta (Demand-Mean Forecast)")
p

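ggplot2 warns when a layer has to drop NA values, and the "(converted from warning)" prefix in your message indicates that your session turns warnings into errors, so the warning aborts the plot. If the missing values are expected, pass na.rm = TRUE to each layer so the NA rows are dropped silently.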
# Cost deltas, aligned position-by-position with Month below;
# the two NA entries are the values for "Apr" and "May".
Cost_Delta <- c(
  85000, -32672.62, 28335.64, -85000, 30963.5, -28335.64, NA, NA,
  -85000, 32672.62, 85000, -32672.62, -85000, -32672.62, 85000,
  -32672.62, -85000, 32672.62, 85000, 32672.62, -85000, -32672.62
)
Month <- c(
  "Jan", "Jan", "Feb", "Feb", "Mar", "Mar", "Apr", "May", "Jun", "Jun",
  "Jul", "Jul", "Aug", "Aug", "Sep", "Sep", "Oct", "Oct", "Nov", "Nov",
  "Dec", "Dec"
)
df <- data.frame(Cost_Delta, Month)
# Use the order of first appearance as the factor level order, so the
# months are not sorted alphabetically.
df$Month <- as.character(df$Month)
df$Month <- factor(df$Month, levels = unique(df$Month))
library(ggplot2)
p <- ggplot(df, aes(x = Month, y = Cost_Delta)) +
  # na.rm = TRUE drops the NA rows of this layer without a warning
  geom_point(
    aes(fill = Month),
    size = 2,
    shape = 21,
    colour = "grey20",
    position = position_jitter(width = 0.2, height = 0.1),
    na.rm = TRUE
  ) +
  geom_boxplot(
    outlier.colour = NA,
    fill = NA,
    colour = "grey20",
    na.rm = TRUE
  ) +
  # The argument is spelled `labels`; `label` only worked through partial
  # argument matching.
  scale_y_continuous(
    labels = scales::comma,
    breaks = seq(-300000, 400000, 50000)
  ) +
  labs(x = "Month-Year", y = "Cost Delta (Demand-Mean Forecast)")
p

Related

How to smooth out a time-series geom_area with fill in ggplot?

I have the following graph and code:
Graph
# Area chart of `value` over `DATA`, one fill colour per `variable`;
# position = "fill" rescales the stacked areas to proportions.
ggplot(long2, aes(x = DATA, y = value, fill = variable)) +
  geom_area(position = "fill", alpha = 0.75) +
  scale_y_continuous(labels = scales::comma, n.breaks = 5) +
  scale_fill_viridis_d() +
  scale_x_date(date_labels = "%b/%Y", date_breaks = "6 months") +
  labs(
    title = "Proporcions de les visites, només 9T i 9C",
    x = "Data",
    y = "% visites"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom") +
  guides(fill = guide_legend(title = NULL)) +
  # Translucent white rectangle over 2020-03-16 .. 2020-06-22, full height
  annotate(
    "rect",
    fill = "white", alpha = 0.3,
    xmin = as.Date("2020-03-16"), xmax = as.Date("2020-06-22"),
    ymin = 0, ymax = 1
  )
But it has some sawtooth, how am I supposed to smooth it out?
I believe your situation is roughly analogous to the following, wherein we have missing x-positions for one group, but not the other at the same position. This causes spikes if you set position = "fill".
library(ggplot2)
# Two groups on the grid 1..100, each with different x-positions left out:
# group "A" lacks x = 25 and x = 75, group "B" lacks x = 50.
x <- seq_len(100)
x_a <- x[-c(25, 75)]
x_b <- x[-50]
df <- data.frame(
  x = c(x_a, x_b),
  y = c(cos(x_a), sin(x_b)) + 5,
  # one label per row of each group
  group = rep(c("A", "B"), times = c(length(x_a), length(x_b)))
)
ggplot(df, aes(x, y, fill = group)) +
geom_area(position = "fill")
To smooth out these spikes, it has been suggested to linearly interpolate the data at the missing positions.
# Collect every x-position that occurs in any group
ux <- unique(df$x)
# Linearly interpolate one group's y-values at all positions in `ux`
fill_gaps <- function(xy) {
  data.frame(
    x = ux,
    y = approx(xy$x, xy$y, xout = ux)$y,
    group = xy$group[1]
  )
}
# Interpolate groupwise, then stack the pieces back into one data frame
df <- do.call(rbind, lapply(split(df, df$group), fill_gaps))
# Now without spikes :)
ggplot(df, aes(x, y, fill = group)) +
geom_area(position = "fill")
Created on 2022-06-17 by the reprex package (v2.0.1)
P.S. I would also have expected a red spike at x=50, but for some reason this didn't happen.

How to get a complete vector of breaks from the scale of a plot in R?

I am trying to add captions outside the plots using the solutions from this post and this one.
I think I managed to get what I want, but I am trying to automatize the code if the data changes. Now my problem is that I need a way to get the vector of all the values/breaks from the y-axis from the plot. I don't want to change the y-axis and I don't want to get only the range (I found this post to get the ranges, but I don't want only that)
On the other hand, I found this post, but the solution doesn't work for new versions of ggplot2 (mine is 3.3.5).
This is my example:
library(ggplot2)
library(dplyr)
# DATA
val1 <- c(2.1490626,2.2035281,1.5927854,3.1399245,2.3967338,3.7915825,4.6691277,3.0727319,2.9230937,2.6239759,3.7664386,4.0160378,1.2500835,4.7648343,0.0000000,5.6740227,2.7510256,3.0709322,2.7998003,4.0809085,2.5178086,5.9713330,2.7779843,3.6724801,4.2648527,3.6841084,2.5597235,3.8477471,2.6587736,2.2742209,4.5862788,6.1989269,4.1167091,3.1769325,4.2404515,5.3627032,4.1576810,4.3387921,1.4024381,0.0000000,4.3999099,3.4381837,4.8269218,2.6308474,5.3481382,4.9549753,4.5389650,1.3002293,2.8648220,2.4015338,2.0962332,2.6774765,3.0581759,2.5786137,5.0539080,3.8545796,4.3429043,4.2233248,2.0434363,4.5980727)
val2 <- c(3.7691229,3.6478055,0.5435826,1.9665861,3.0802654,1.2248374,1.7311236,2.2492826,2.2365337,1.5726119,2.0147144,2.3550348,1.9527204,3.3689502,1.7847986,3.5901329,1.6833872,3.4240479,1.8372175,0.0000000,2.5701453,3.6551315,4.0327091,3.8781182)
val3 <- c(2.1490626,2.2035281,1.5927854,3.1399245,2.3967338,3.7915825,4.6691277,3.0727319,2.9230937,2.6239759,3.7664386,4.0160378,1.2500835,4.7648343,0.0000000,5.6740227,2.7510256,3.0709322,2.7998003,4.0809085,2.5178086,5.9713330,2.7779843,3.6724801,4.2648527,3.6841084,2.5597235,3.8477471,2.6587736,2.2742209,4.5862788,6.1989269,4.1167091,3.1769325,4.2404515,5.3627032,4.1576810,4.3387921,1.4024381,0.0000000,4.3999099,3.4381837,4.8269218,2.6308474,5.3481382,4.9549753,4.5389650,1.3002293,2.8648220,2.4015338,2.0962332,2.6774765,3.0581759,2.5786137,5.0539080,3.8545796,4.3429043,4.2233248,2.0434363,4.5980727)
df1 <- data.frame(value = val1)
df2 <- data.frame(value = val2)
df3 <- data.frame(value = val3)
data <- bind_rows(lst(df1, df2, df3), .id = 'id')
data$Sex <- rep(c("Male", "Female"), times=72)
data$d <- "ff"
data <- as.data.frame(unclass(data), stringsAsFactors = TRUE)
# PLOT
p <- data %>%
ggplot(aes(value)) +
geom_density(lwd = 1.2, colour="red", show.legend = FALSE) +
geom_histogram(aes(y=..density.., fill = id), bins=10, col="black", alpha=0.2) +
facet_grid(id ~ Sex ) +
xlab("type_data") +
ylab("Density") +
ggtitle("title") +
guides(fill=guide_legend(title="legend_title")) +
theme(strip.text.y = element_blank())
p
# ADD CAPTION
caption_df = data.frame(value = c(min(data$value), max(data$value)), id = c(rep(tail(levels(data$id), n=1), times=length(levels(data$Sex)))),
Sex = c(levels(data$Sex)))
p + coord_cartesian(clip = "off",
ylim = layer_scales(p)$y$range$range,
xlim = layer_scales(p)$x$range$range) +
geom_text(data = caption_df,
aes(y = -0.15, label = c(levels(data$Sex))))
Before adding the caption:
After the caption:
The idea is that I want to avoid having to set the y parameter by hand every time the data changes. Imagine that the y-axis is different (something like 0.0000, 0.0005, 0.0010, 0.0015). In that case the appropriate y would be -0.0005, because the step between breaks is 0.0005 and I just have to make it negative.
For that reason, I was wondering if it is possible to get the COMPLETE vector of values from the y-axis.
For example, if we want to get all the values/breaks of the y-axis from the previous images would be: c(0.0, 0.2, 0.4, 0.6).
Does anyone know if I can get ALL the values from the y-axis of a plot?
Thanks in advance
You can get the y axis breaks from the p object like this:
as.numeric(na.omit(layer_scales(p)$y$break_positions()))
#> [1] 0.0 0.2 0.4 0.6
However, if you want the labels to be a fixed distance below the panel regardless of the y axis scale, it would be best to use a fixed fraction of the entire panel range rather than the breaks:
# Range of the y scale of plot `p`
yrange <- layer_scales(p)$y$range$range
# Caption position: 20% of the y range below the range minimum
ypos <- min(yrange) - 0.2 * diff(yrange)
# Keep the x/y limits at the plot's own scale ranges and switch clipping off,
# then add one text label per row of caption_df at y = ypos.
p + coord_cartesian(clip = "off",
ylim = layer_scales(p)$y$range$range,
xlim = layer_scales(p)$x$range$range) +
geom_text(data = caption_df,
aes(y = ypos, label = c(levels(data$Sex))))
For example, suppose you had a y scale that was twice the size:
p <- data %>%
ggplot(aes(value)) +
geom_density(lwd = 1.2, colour="red", show.legend = FALSE) +
geom_histogram(aes(y= 2 * ..density.., fill = id), bins=10, col="black", alpha=0.2) +
facet_grid(id ~ Sex ) +
xlab("type_data") +
ylab("Density") +
ggtitle("title") +
guides(fill=guide_legend(title="legend_title")) +
theme(strip.text.y = element_blank())
Then the exact same code would give you the exact same label placement, without any reference to breaks:
yrange <- layer_scales(p)$y$range$range
ypos <- min(yrange) - 0.2 * diff(yrange)
p + coord_cartesian(clip = "off",
ylim = layer_scales(p)$y$range$range,
xlim = layer_scales(p)$x$range$range) +
geom_text(data = caption_df,
aes(y = ypos, label = c(levels(data$Sex))))

Color outlier dots above a specific value in R

How do I color outliers that are above a specific value using ggplot2 in R?
Sorry for the seemingly easy question; I am a beginner. The reason I ask is that some of my values have a frequency of 0, and I transform this column by taking -log10(), so anything with a frequency of 0 becomes Inf. Attached is a screenshot of my plot; essentially, I want all the outlier points above 10 on the y axis to be a different color.
# Draw, print and save a boxplot of -log10(frequency) for three groups.
#
# df: data frame with a `frequency` column and a `Type` column.
#
# Side effects: assigns the stacked plotting data to `plot.data` and the
# finished plot to `s` outside the function (via `<<-`), prints the plot and
# writes it to "frequency_by_type.png". The value of the ggsave() call is the
# function's result.
boxplots <- function(df) {
  # Missing frequencies are replaced by 0 before the transform
  df$frequency[is.na(df$frequency)] <- 0.00
  df$`-log10(frequency)` <- -log10(df$frequency)

  neg_log_freq <- df$`-log10(frequency)`
  type <- df$Type

  # One block of rows per group, stacked in the order x, y, z.
  # NOTE(review): group "z" is filled from rows whose Type is "c=z" -- confirm
  # that this is the intended Type value.
  plot.data <<- rbind(
    data.frame(group = "x", value = neg_log_freq[type == "x"]),
    data.frame(group = "y", value = neg_log_freq[type == "y"]),
    data.frame(group = "z", value = neg_log_freq[type == "c=z"])
  )

  # NOTE(review): the first axis label is "z", not "x" -- confirm.
  axis_labels <- c("z", "y", "z")

  # NOTE(review): the y scale is limited to 0..10; values outside that range
  # (such as the Inf produced by a frequency of 0) lie beyond the limits --
  # confirm this is intended.
  box_plot <- ggplot(plot.data, aes(x = group, y = value, fill = group)) +
    geom_boxplot() +
    scale_fill_viridis(discrete = TRUE, alpha = 0.6) +
    geom_jitter(color = "black", size = 0.4, alpha = 0.9) +
    theme_ipsum() +
    theme(
      legend.position = "none",
      plot.title = element_text(size = 11)
    ) +
    labs(
      title = "Distribution of -log10(frequency) by Type",
      x = "Type",
      y = "-log10(frequency)"
    ) +
    scale_x_discrete(labels = axis_labels) +
    scale_y_continuous(limits = c(0, 10), breaks = seq(0, 10, by = 2))

  print(box_plot)
  s <<- box_plot
  ggsave("frequency_by_type.png", plot = box_plot)
}
You could create a new column indicating whether each point is an outlier and map it to the colour aesthetic of geom_jitter. I reduced the answer to a smaller example, but you should be able to adapt it to your data:
library(ggplot2)
library(viridis)
plot.data <- data.frame(
  group = c("1", "1", "1", "1", "1", "2", "2", "2", "2", "2"),
  value = c(1, 5, 10, 6, 3, 1, 5, 10, 6, 3)
)
# Flag every value above the threshold. Base transform() is used because this
# snippet only loads ggplot2 and viridis, so `%>%` and mutate() would not be
# found.
flagged <- transform(plot.data, outlier = ifelse(value > 9, "YES", "NO"))
t <- ggplot(flagged, aes(x = group, y = value, fill = group)) +
  geom_boxplot() +
  # x and y are inherited from ggplot(); only the point colour is mapped here
  geom_jitter(aes(color = outlier), size = 2, alpha = 0.9) +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6)
t
library(ggplot2)
# `dose` is stored as a number in ToothGrowth; convert it to a factor so that
# geom_boxplot() draws one box per dose instead of a single box.
ToothGrowth$dose <- as.factor(ToothGrowth$dose)
# Basic box plot
p <- ggplot(ToothGrowth, aes(x = dose, y = len)) +
  geom_boxplot()
p
# Rotate the box plot
p + coord_flip()
# Notched box plot
ggplot(ToothGrowth, aes(x = dose, y = len)) +
  geom_boxplot(notch = TRUE)
# Change outlier, color, shape and size
ggplot(ToothGrowth, aes(x = dose, y = len)) +
  geom_boxplot(
    outlier.colour = "red",
    outlier.shape = 8,
    outlier.size = 4
  )

Highlight top 3 values

I have a plot like this with the following code:
# Column chart of V1 (coerced to numeric) against date, built from daily_cases
aus_cases <- ggplot(data = daily_cases,aes(x= date, as.numeric(V1)))+
geom_col(fill = 'blue', alpha= 0.6)+
# NOTE(review): theme_bw() is added again further down the chain and presumably
# replaces this theme, including base_size -- confirm which theme is intended.
theme_minimal(base_size =14)+
# Both axis titles are removed
xlab(NULL)+
ylab(NULL)+
theme_bw()+
# Axis dates are formatted as day/month/year
scale_x_date(date_labels = "%d/%m/%Y")
I want to highlight the top 3 values in the plot and also show their dates in the plot. I was thinking of using gghighlight but am not sure how to do it.
Using the ggplot2::economics dataset as example data you can highlight and label the top 3 values like so:
Add an indicator for the top3 values to your df using e.g. the rank function.
To highlight, map the top3 indicator on fill.
To add the dates use geom_text to add labels only for the top3 values
Try this:
library(ggplot2)
library(dplyr)
# Example data
d <- filter(economics, date >= as.Date("2010-01-01"))
# Add top3 indicator. ties.method = "min" keeps the ranks whole numbers, so
# tied values are not skipped (the default averages tied ranks, e.g. 2.5, and
# such values would never match 1, 2 or 3).
d <- mutate(d, top3 = rank(-psavert, ties.method = "min") <= 3)
ggplot(data = d, aes(date, psavert, fill = top3)) +
  geom_col(alpha = 0.6) +
  # Label only the highlighted bars; all other labels are empty strings
  geom_text(aes(label = ifelse(top3, as.character(date), "")), nudge_y = .1) +
  scale_fill_manual(values = c("TRUE" = "red", "FALSE" = "blue")) +
  # theme_bw() is a complete theme, so a theme_minimal() added before it would
  # be replaced entirely; only theme_bw() is kept.
  theme_bw() +
  xlab(NULL) +
  ylab(NULL) +
  scale_x_date(date_labels = "%d/%m/%Y")
Here is one way to do it. You didn't dput your data, so I used this test data.
library(lubridate)
library(tidyverse)
library(gghighlight)
daily_cases <- data.frame(V1 = c(10,20,30, 10, 5, 10, 10, 40, 50, 10),
date = ymd("2020-02-01", "2020-02-02",
"2020-02-03","2020-02-04",
"2020-02-05","2020-02-06",
"2020-02-07","2020-02-08",
"2020-02-09","2020-02-10"))
First I stored the top 3 values and their dates in top. Then I used this information in gghighlight (to highlight the three bars) and in scale_x_date (to show only the dates of the highlighted bars).
# The three rows with the largest V1
top <- daily_cases %>%
  arrange(desc(V1)) %>%
  slice(1:3)
aus_cases <- ggplot(data = daily_cases, aes(x = date, as.numeric(V1))) +
  geom_col(fill = "blue", alpha = 0.6) +
  # Highlight every bar at least as tall as the smallest of the top three
  gghighlight(V1 >= min(top$V1)) +
  # theme_bw() is a complete theme, so a theme_minimal() added before it would
  # be replaced entirely; only theme_bw() is kept.
  theme_bw() +
  xlab(NULL) +
  ylab(NULL) +
  # Put axis breaks only at the dates of the top three rows
  scale_x_date(breaks = top$date, date_labels = "%d/%m/%Y")
# Assigning the plot does not draw it; print it explicitly
aus_cases
Here is the plot.

R bubble plot using ggplot manually selecting the colour and axis names

I am using ggplot to create a bubble plot with this code:
ggplot(df, aes(x = order, y = mean, size = n, fill = name)) +
geom_point(shape = 21) +
theme_bw() +
theme() +
scale_size(range = c(1, 50)) +
ylim(0,100)
It is working perfectly apart from 2 things:
For each name (fill) I would like to manually specify the colour used (via a dataframe that maps name to colour) - this is to provide consistency across multiple figures.
I would like to substitute the numbers on the y for text labels (for several reasons I cannot use the text labels from the outset due to ordering issues)
I have tried several methods using scale_color_manual() and scale_y_continuous respectively and I am getting nowhere! Any help would be very gratefully received!
Thanks
Since you have not specified an example df, I created one of my own.
To manually specify the color, you have to use scale_fill_manual with a named vector as the argument of values.
Edit 2
This appears to do what you want. We use scale_y_continuous. The breaks argument specifies the vector of positions, while the labels argument specifies the labels which should appear at those positions. Since we already created the vectors when creating the data frame, we simply pass those vectors as arguments.
# Bubble plot whose y-axis shows text labels instead of numbers.
# Assumes df carries the `mean` and `order_label` columns, as the example data
# frame built with order_label does.
ggplot(df, aes(x = order, y = mean, size = n, fill = name)) +
  geom_point(shape = 21) +
  scale_fill_manual(values = gcolors) +
  scale_size(limits = range(df$n)) +
  # Read break positions and labels from df itself. A bare `mean` resolves to
  # the function base::mean whenever no vector of that name exists in the
  # session.
  scale_y_continuous(breaks = df$mean, labels = df$order_label)
Edit 1
From your comment, it appears that you want to label the circles. One option would be to use geom_text. Code below. You may need to experiment with values of nudge_y to get the position correct.
# Example data: one row per bubble, with a text label for each row
order <- c(1, 2)
mean <- c(0.75, 0.3)
n <- c(180, 200)
name <- c("a", "b")
order_label <- c("New York", "London")
df <- data.frame(
  order = order,
  mean = mean,
  n = n,
  name = name,
  order_label = order_label,
  stringsAsFactors = FALSE
)
# Lookup table pairing each name with its colour
color <- c("blue", "red")
name_color <- data.frame(name = name, color = color, stringsAsFactors = FALSE)
# Named vector: names are the group names, values are the colours
gcolors <- setNames(name_color$color, name_color$name)
ggplot(df, aes(x = order, y = mean, size = n, fill = name)) +
geom_point(shape = 21) +
geom_text(aes(label = order_label), size = 3, hjust = "inward",
nudge_y = 0.03) +
scale_fill_manual(values = gcolors) +
scale_size(limits = c(min(df$n), max(df$n))) +
theme(axis.text.y = element_blank(),
axis.ticks.y = element_blank()) +
ylab(NULL)
Original Answer
It is not clear what you mean by "substitute the numbers on the y for text labels". In the example below, I have formatted the y-axis as a percentage using the scales::percent_format() function. Is this similar to what you want?
# Example data: one row per bubble
order <- c(1, 2)
mean <- c(0.75, 0.3)
n <- c(180, 200)
name <- c("a", "b")
df <- data.frame(
  order = order,
  mean = mean,
  n = n,
  name = name,
  stringsAsFactors = FALSE
)
# Lookup table pairing each name with its colour
color <- c("blue", "red")
name_color <- data.frame(name = name, color = color, stringsAsFactors = FALSE)
# Named vector: names are the group names, values are the colours
gcolors <- setNames(name_color$color, name_color$name)
ggplot(df, aes(x = order, y = mean, size = n, fill = name)) +
geom_point(shape = 21) +
scale_fill_manual(values = gcolors) +
scale_size(limits = c(min(df$n), max(df$n))) +
scale_y_continuous(labels = scales::percent_format())
Thanks, for all your help, this worked perfectly:
ggplot(df, aes(x = order, y = mean, size = n, fill = name)) +
geom_point(shape = 21) +
scale_fill_manual(values = gcolors) +
scale_size(limits = c(min(df$n), max(df$n))) +
scale_x_continuous(breaks = order, labels = order_label)

Resources