The data I have contain four columns: x, y_cnt, y1_rate, y2_rate.
# Reproducible toy data: an index column, one count series and two rate series.
set.seed(123)
x <- seq_len(10)
y_cnt <- rnorm(n = 10, mean = 200, sd = 50)
y1_rate <- runif(n = 10, min = 0, max = 1)
y2_rate <- runif(n = 10, min = 0, max = 1)
df <- data.frame(
  x = x,
  y_cnt = y_cnt,
  y1_rate = y1_rate,
  y2_rate = y2_rate
)
I need to produce a plot such that x is on the x-axis, both y1_rate and y2_rate are on the main y-axis, and y_cnt on the secondary y-axis.
Here is how it looks in Excel:
Update:
This is what I have so far. It seems that the figure below shows y1_rate only.
# Scaling factor that maps the y1_rate range onto the y_cnt range, so the
# rates can be drawn against the count axis.
transf_fact <- max(df$y_cnt)/max(df$y1_rate)
# Plot: counts as red bars on the primary axis, rescaled rates as lines; the
# secondary axis divides by transf_fact to show the rates in their own units.
ggplot(data = df,
mapping = aes(x = as.factor(x),
y = y_cnt)) +
geom_col(fill = 'red') +
# group = 1 puts all points of this series into a single group
geom_line(aes(y = transf_fact * y1_rate), group = 1) +
# NOTE(review): no `group` is given here. With a discrete x each level
# presumably forms its own one-point group, which would explain why the
# y2_rate line does not show up in the figure -- confirm.
geom_line(aes(y = transf_fact * y2_rate)) +
scale_y_continuous(sec.axis = sec_axis(trans = ~ . / transf_fact,
name = "Rate"))+
labs(x = "X")
Here's an approach that adjusts the scaling of the rate variables, then gathers all the series into long form, and then shows the variables with their respective geoms.
library(tidyverse) # Using v1.2.1

# Ratio that stretches the rate series onto the scale of the counts
transf_fact <- max(df$y_cnt) / max(df$y1_rate)

df %>%
  # Multiply every column whose name contains "rate" by the scaling ratio
  mutate_at(vars(matches("rate")), ~ . * transf_fact) %>%
  # Reshape to long form: `y_var` names the series, `val` holds its value
  gather(y_var, val, -x) %>%
  # Hand over to ggplot; x, y and fill are shared by every layer
  ggplot(aes(x = as.factor(x), y = val, fill = y_var)) +
  # Bars are drawn from the "y_cnt" rows only
  geom_col(data = . %>% filter(y_var == "y_cnt")) +
  # Lines are drawn from all remaining rows, one line per series
  geom_line(
    data = . %>% filter(y_var != "y_cnt"),
    aes(color = y_var, group = y_var),
    size = 1.2
  ) +
  # One discrete scale drives both colour (lines) and fill (bars)
  scale_fill_discrete(aesthetics = c("colour", "fill")) +
  # Secondary axis undoes the scaling so it reads in rate units
  scale_y_continuous(
    sec.axis = sec_axis(trans = ~ . / transf_fact, name = "Rate")
  ) +
  labs(x = "X")
Related
I am working on percentage changes between periods and struggling with the logarithmic transformation of labels. Here is an example based on the storms dataset:
library(dplyr)
library(ggplot2)
library(scales)
# Mean wind speed per year, the previous year's mean, and the relative
# change between the two; the first year (no predecessor) is dropped.
df <- storms |>
  group_by(year) |>
  summarise(wind = mean(wind)) |>
  mutate(
    lag = lag(wind, n = 1),
    perc = wind / lag - 1
  ) |>
  tidyr::drop_na()
I want to visualize the distribution of percentages, making the percentage change symmetrical (log difference) with log1p.
# Five-bin histogram of the log-difference, log1p(perc).
ggplot(data = df, mapping = aes(x = log1p(perc))) +
  geom_histogram(bins = 5)
x-axis with log1p values
At this point I wanted to transform the x-axis label back to the original percentage value.
I tried to create my own transformation with trans_new, and applied it to the labels in scale_x_continuous, but I can't make it work.
# Asker's first attempt at a custom transformation. This version fails with
# the error quoted below it.
trans_perc <- trans_new(
name = "trans_perc",
# NOTE(review): this passes the result of calling log1p_trans(), not a plain
# function such as log1p -- presumably not what `transform` expects; confirm
# against the scales documentation.
transform = log1p_trans(),
inverse = function(x)
expm1(x),
breaks = breaks_log(),
format = percent_format(),
domain = c(-Inf, Inf)
)
ggplot(df, aes(x = log1p(perc))) +
geom_histogram(bins = 5) +
# NOTE(review): the transformation object is handed to `labels` here, not to
# `trans`; this looks like the source of the length-mismatch error -- confirm.
scale_x_continuous(labels = trans_perc)
Currently, the result is:
Error in get_labels():
! breaks and labels are different lengths
Run rlang::last_error() to see where the error occurred.
Thanks!
EDIT
I am adding details on the different output I am getting from Alan's first answer:
# log1p / expm1 transformation with pretty breaks and percent-formatted labels.
trans_perc <- trans_new(
  name = "trans_perc",
  transform = log1p,
  inverse = expm1,
  breaks = pretty_breaks(n = 5),
  format = percent_format(),
  domain = c(-Inf, Inf)
)

library(ggpubr)

# a: log applied to the data only, default axis
a <- ggplot(df, aes(x = log1p(perc))) +
  geom_histogram(bins = 5)

# b: log applied to the data and again through the axis transformation
b <- ggplot(df, aes(x = log1p(perc))) +
  geom_histogram(bins = 5) +
  scale_x_continuous(trans = trans_perc)

# c: raw data, log applied through the axis transformation only
c <- ggplot(df, aes(x = perc)) +
  geom_histogram(bins = 5) +
  scale_x_continuous(trans = trans_perc)

# Show the three variants side by side
ggarrange(
  a, b, c,
  ncol = 3,
  labels = c(
    "Log on Value only",
    "Log on Value and X",
    "Log on X only"
  )
)
[different outcomes](https://i.stack.imgur.com/dCW2m.png)
If I understand you correctly, you want to keep the shape of the histogram, but change the labels so that they reflect the value of the perc column rather than the transformed log1p(perc) value. If that is the case, there is no need for a transformer object. You can simply put the reverse transformation (plus formatting) as a function into the labels argument of scale_x_continuous.
# Keep the log1p-shaped histogram, but place breaks at pretty values of the
# raw percentages and label them by back-transforming with expm1().
ggplot(data = df, mapping = aes(x = log1p(perc))) +
  geom_histogram(bins = 5) +
  scale_x_continuous(
    name = "Percentage Change",
    breaks = log1p(pretty(df$perc, 5)),
    labels = ~ percent(expm1(.x))
  )
Note that although the histogram remains symmetrical in shape, the axis labels represent the back-transformed values of the original axis labels.
The point of a transformer object is to do all this for you without having to pass a transformed data set (i.e. without having to pass log1p(perc)). So in your case, you could do:
# Transformer object: log1p forwards, expm1 backwards, percent-formatted labels.
trans_perc <- trans_new(
  name = "trans_perc",
  transform = log1p,
  inverse = expm1,
  format = percent_format(),
  domain = c(-Inf, Inf)
)

# Plot the raw percentages and let the scale apply the transformation.
ggplot(data = df, mapping = aes(x = perc)) +
  geom_histogram(bins = 5) +
  scale_x_continuous(trans = trans_perc)
Which gives essentially the same result
Is there a way to set a constant width for geom_bar() in the event of missing data in the time series example below? I've tried setting width in aes() with no luck. Compare May '11 to June '11 width of bars in the plot below the code example.
# Fill colours used for the species bars.
colours <- c("#FF0000", "#33CC33", "#CCCCCC", "#FFA500", "#000000")

# Spread the 150 iris rows over ten consecutive months (Jan-Oct 2011).
iris$Month <- rep(
  seq(from = as.Date("2011-01-01"), to = as.Date("2011-10-01"), by = "month"),
  15
)

# Total Sepal.Length per month and species (30 rows), plus a quota series.
d <- aggregate(iris$Sepal.Length, by = list(iris$Month, iris$Species), sum)
d$quota <- seq(from = 2000, to = 60000, by = 2000)
# NOTE: the summed Sepal.Length column is labelled "Sepal.Width" here.
colnames(d) <- c("Month", "Species", "Sepal.Width", "Quota")
d$Sepal.Width <- d$Sepal.Width * 1000
# Quota drawn as a line over the months.
g1 <- ggplot(data = d, aes(x = Month, y = Quota, color = "Quota")) +
  geom_line(size = 1)

# Dodged bars per species on top; the first five rows are left out so that
# some months have fewer bars than others.
g1 +
  geom_bar(
    data = d[-(1:5), ],
    aes(x = Month, y = Sepal.Width, width = 10, group = Species, fill = Species),
    stat = "identity",
    position = "dodge"
  ) +
  scale_fill_manual(values = colours)
Some new options for position_dodge() and the new position_dodge2(), introduced in ggplot2 3.0.0 can help.
You can use preserve = "single" in position_dodge() to base the widths off a single element, so the widths of all bars will be the same.
# preserve = "single" sizes every bar as if its group were fully populated.
ggplot(data = d, aes(x = Month, y = Quota, color = "Quota")) +
  geom_line(size = 1) +
  geom_col(
    data = d[-(1:5), ],
    aes(y = Sepal.Width, fill = Species),
    position = position_dodge(preserve = "single")
  ) +
  scale_fill_manual(values = colours)
Using position_dodge2() changes the way things are centered, centering each set of bars at each x axis location. It has some padding built in, so use padding = 0 to remove.
# position_dodge2() centres each set of bars; padding = 0 removes the gaps.
ggplot(data = d, aes(x = Month, y = Quota, color = "Quota")) +
  geom_line(size = 1) +
  geom_col(
    data = d[-(1:5), ],
    aes(y = Sepal.Width, fill = Species),
    position = position_dodge2(preserve = "single", padding = 0)
  ) +
  scale_fill_manual(values = colours)
The easiest way is to supplement your data set so that every combination is present, even if it has NA as its value. Taking a simpler example (as yours has a lot of unneeded features):
# 3 x 3 grid of a/b combinations with the second row removed, so that one
# combination is missing.
dat <- data.frame(
  a = rep(LETTERS[1:3], 3),
  b = rep(letters[1:3], each = 3),
  v = 1:9
)[-2, ]

# Dodged bars: the x position with a missing combination gets wider bars.
ggplot(data = dat, mapping = aes(x = a, y = v, colour = b)) +
  geom_bar(aes(fill = b), stat = "identity", position = "dodge")
This shows the behavior you are trying to avoid: in group "B", there is no group "a", so the bars are wider. Supplement dat with a dataframe with all the combinations of a and b:
# Append every a/b combination with a missing value so that each x position
# has a slot for every group. unique() is used rather than levels() because
# levels() returns NULL for character columns (data.frame() no longer creates
# factors by default since R 4.0), which would leave nothing to expand.
dat.all <- rbind(
  dat,
  cbind(
    expand.grid(a = unique(dat$a), b = unique(dat$b), stringsAsFactors = FALSE),
    v = NA
  )
)
ggplot(dat.all, aes(x = a, y = v, colour = b)) +
  geom_bar(aes(fill = b), stat = "identity", position = "dodge")
I had the same problem but was looking for a solution that works with the pipe (%>%). Using tidyr::spread and tidyr::gather from the tidyverse does the trick. I use the same data as @Brian Diggs, but with uppercase variable names so as not to end up with duplicate variable names when transforming to wide:
library(tidyverse)
# Same example data with upper-case column names (one combination missing).
dat <- data.frame(
  A = rep(LETTERS[1:3], 3),
  B = rep(letters[1:3], each = 3),
  V = 1:9
)[-2, ]

dat %>%
  # to wide form; fill = NA creates the missing cell explicitly
  spread(key = B, value = V, fill = NA) %>%
  # back to long form, now including the missing combination
  gather(key = B, value = V, -A) %>%
  ggplot(mapping = aes(x = A, y = V, fill = B)) +
  geom_col(position = position_dodge())
Edit:
There actually is an even simpler solution to that problem in combination with the pipe. Using tidyr::complete gives the same result in one line:
# complete() adds the missing A/B combinations (with NA values) in one step.
dat %>%
  complete(A, B) %>%
  ggplot(mapping = aes(x = A, y = V, fill = B)) +
  geom_col(position = position_dodge())
The following data frame is represented in a tile plot. Group B data has a different scale and therefore the Y-axis must be free. The plot is separated by facets according to the group. However, the group B tile plot appears as thin bands rather than looking like the group A plot. How can I make it so that, despite the Y-axis being free, the plot of group B fills all the white space as in group A?
library(ggplot2)
# Example data: a 3 x 3 grid per group, where group B's Y values are ten
# times those of group A.
X <- seq_len(3)
Y1 <- seq_len(3)
Y2 <- seq(10, 30, by = 10)
Y <- c(rep(Y1, times = 3), rep(Y2, times = 3))
Grid <- seq_len(3)
Group <- c("A", "B")

# Grid varies fastest in expand.grid(), so it lines up with Y; it is only
# needed to generate the rows and is dropped afterwards.
DF <- expand.grid(Grid = Grid, X = X, Group = Group)
DF$Y <- Y
DF$Grid <- NULL
DF$Z <- seq_len(18)
# Tile plot per group with independent y axes.
ggplot(data = DF, mapping = aes(x = X, y = Y, fill = Z)) +
  geom_tile() +
  facet_wrap(~ Group, scales = "free_y")
One option to achieve your desired result would be to convert Y to a factor:
library(ggplot2)
# Treat Y as discrete so every tile gets the same height within its facet.
ggplot(data = DF, mapping = aes(x = X, y = factor(Y), fill = Z)) +
  geom_tile() +
  facet_wrap(~ Group, scales = "free_y")
library(tidyverse)
DF %>%
  # add every X / Y (1..30) / Group combination, with Z missing where absent
  complete(
    X = 1:3,
    Y = 1:30,
    Group = c("A", "B"),
    fill = list(Z = NA_real_)
  ) %>%
  # keep the padded rows for group B only; group A keeps just its real rows
  filter(Group != "A" | !is.na(Z)) %>%
  ggplot(mapping = aes(x = X, y = Y, fill = Z)) +
  geom_tile(col = "white") +
  facet_wrap(~ Group, scales = "free_y")
I'm really new to R and I'm trying to plot air pollution (NOx) data from 5 different locations (I have monthly averages from every location from 01-1996 to 12-2019). Each plot line should represent a different location.
I've created a ggplot but I find it really unclear. I would like to ask you about your tips to make that plot better to read (It will be no bigger than A4, because it will be included in my work and printed). I would also like to have more years on X axis (1996, 1997, 1998)
# Monthly NOx series, one CSV file per measuring station.
ALIBA <- read_csv("ALIBA_Praha/NOx/all_sorted.csv")
BMISA <- read_csv("BMISA_Mikulov/NOx/all_sorted.csv")
CCBDA <- read_csv("CCBDA_CB/NOx/all_sorted.csv")
TKARA <- read_csv("TKARA_Karvina/NOx/all_sorted.csv")
UULKA <- read_csv("UULKA_UnL/NOx/all_sorted.csv")

# One line layer per station, each with its own data set and fixed colour.
ggplot() +
  geom_line(data = ALIBA, mapping = aes(x = START_TIME, y = VALUE), color = "blue") +
  geom_line(data = BMISA, mapping = aes(x = START_TIME, y = VALUE), color = "red") +
  geom_line(data = CCBDA, mapping = aes(x = START_TIME, y = VALUE), color = "yellow") +
  geom_line(data = TKARA, mapping = aes(x = START_TIME, y = VALUE), color = "green") +
  geom_line(data = UULKA, mapping = aes(x = START_TIME, y = VALUE), color = "pink")
all csv files are in format:
START_TIME,VALUE
1996-01-01T00:00:00Z,61.3049451304964
1996-02-01T00:00:00Z,47.7234010245664
1996-03-01T00:00:00Z,33.083512309072
1996-04-01T00:00:00Z,47.771166691758
1996-05-01T00:00:00Z,24.7022422574005
1996-06-01T00:00:00Z,25.4495954480684
1996-07-01T00:00:00Z,23.301224242488
...
Thanks
First, I would paste all data sets together:
# Read each station's monthly NOx series and tag every row with its station
# code, so the series can be stacked into one data frame and told apart by
# colour.
ALIBA <- read_csv("ALIBA_Praha/NOx/all_sorted.csv")
ALIBA$Location <- "ALIBA"
BMISA <- read_csv("BMISA_Mikulov/NOx/all_sorted.csv")
BMISA$Location <- "BMISA"
CCBDA <- read_csv("CCBDA_CB/NOx/all_sorted.csv")
CCBDA$Location <- "CCBDA"
TKARA <- read_csv("TKARA_Karvina/NOx/all_sorted.csv")
TKARA$Location <- "TKARA"
UULKA <- read_csv("UULKA_UnL/NOx/all_sorted.csv")
UULKA$Location <- "UULKA"

# All frames now share the same columns, so they can be row-bound.
df <- rbind(ALIBA, BMISA, CCBDA, TKARA, UULKA)

# A single line layer; colour is mapped to the station code.
ggplot(data = df, aes(x = START_TIME, y = VALUE, color = Location)) +
  geom_line(size = 1) + # play with the stroke thickness
  scale_color_brewer(palette = "Set1") # many other palettes are available
How would you like to add more years? In the same graph (everything will be tiny) or in separate "windows" (= facets, better)?
I want to create a function that makes a heatmap where the y axis will have unique breaks, but repeated and ordered labels. I know that this might not be a great practice. I am also aware that similar questions have been asked before. For example: ggplot in R, reordering the bars. But I want to achieve these repeated and ordered labels through sorting within a function, not by typing them manually. I am aware of solutions for reordering axes based on the values of a factor (e.g., Order Bars in ggplot2 bar graph), but I don't think they apply or can't see how to apply these to my case, where the breaks are unique but the labels repeat.
Here is some code to reproduce the problem and some of my attempts:
Libraries and data
library(ggplot2)
library(dplyr)
library(tidyr)
# Example data: ten unique ids, each with a (possibly repeated) label, a
# random value from 1 to 6 and a 0/1 flag for values of at least 4.
set.seed(4)
id <- LETTERS[1:10]
lab <- sample(
  paste(c("AB", "CD"), 1:5, sep = "_"),
  size = 10,
  replace = TRUE
)
val <- sample.int(n = 6, size = 10, replace = TRUE)
tes <- as.numeric(val >= 4)
dat <- data.frame(id = id, lab = lab, val = val, tes = tes)
A heatmap with unique breaks on the y axis
# Long form: one row per id and measure (val / tes).
dat2 <- gather(dat, kind, value, val:tes)

# Heatmap with the unique ids on the y axis.
ggplot(data = dat2) +
  geom_tile(
    mapping = aes(x = kind, y = id, fill = value),
    color = "white",
    size = 1
  )
A heatmap where the y axis is labeled with repeated labels instead of the unique breaks
This works, to the point that labels are used instead of unique ids, but the y axis is not ordered by the labels. Also, I am not sure about setting breaks and labels from the data frame in wide format (dat), rather than the data frame in long format used by ggplot (dat2).
# Long form: one row per id and measure (val / tes).
dat2 <- gather(dat, kind, value, val:tes)

# Same heatmap, with each id break relabelled by its (repeating) label.
ggplot(data = dat2) +
  geom_tile(
    mapping = aes(x = kind, y = id, fill = value),
    color = "white",
    size = 1
  ) +
  scale_y_discrete(breaks = dat$id, labels = dat$lab)
Mapping the vector with repeated values on the y axis obviously doesn't work
# Long form: one row per id and measure (val / tes).
dat2 <- gather(dat, kind, value, val:tes)

# Heatmap with the non-unique label mapped directly to the y axis.
ggplot(data = dat2) +
  geom_tile(
    mapping = aes(x = kind, y = lab, fill = value),
    color = "white",
    size = 1
  )
Repeated and ordered labels, try 1
As expected, merely sorting the input data by the non-unique lab variable does not work.
# Long form, with rows sorted by the (non-unique) label.
dat2 <- dat %>%
  gather(kind, value, val:tes) %>%
  arrange(lab)

# Heatmap on the ids, relabelled with the global `id` / `lab` vectors.
ggplot(dat2) +
  geom_tile(aes(x = kind, y = id, fill = value), color = "white", size = 1) +
  # `labels` is spelled out in full rather than relying on partial argument
  # matching of an abbreviated name.
  scale_y_discrete(breaks = id, labels = lab)
Repeated and ordered labels, try 2
Try to create a named breaks vector ordered by the (repeating) labels. This gets me nowhere. Half the labels are missing and they are still not sorted.
# Long form: one row per id and measure (val / tes).
dat2 <- gather(dat, kind, value, val:tes)

# ids named by their labels, then indexed by the sorted label names.
brks <- setNames(dat$id, dat$lab)[sort(dat$lab)]

ggplot(data = dat2) +
  geom_tile(
    mapping = aes(x = kind, y = id, fill = value),
    color = "white",
    size = 1
  ) +
  scale_y_discrete(breaks = brks, labels = names(brks))
Repeated and ordered labels, try 3
Starting with the data frame sorted by label, try to create an ordered factor for lab. Then sort the table by this ordered factor. No luck.
# Long form sorted by label, with an ordered-factor copy of the label, then
# sorted again by that factor.
dat2 <- dat %>%
  gather(kind, value, val:tes) %>%
  arrange(lab) %>%
  mutate(lab_f = factor(lab, levels = sort(unique(lab)), ordered = TRUE)) %>%
  arrange(lab_f)

# check
dat2$lab_f

ggplot(data = dat2) +
  geom_tile(
    mapping = aes(x = kind, y = id, fill = value),
    color = "white",
    size = 1
  ) +
  scale_y_discrete(breaks = dat2$id, labels = dat2$lab_f)
A workaround, which I can use if I have to, but I am trying to avoid
We can create a combination of id and lab which will be unique and use it for the y axis
# Long form plus a unique key built from label and id.
dat2 <- dat %>%
  gather(kind, value, val:tes) %>%
  mutate(id_lab = paste(lab, id, sep = "_"))

# Heatmap on the combined key, which is unique and sorts by label first.
ggplot(data = dat2) +
  geom_tile(
    mapping = aes(x = kind, y = id_lab, fill = value),
    color = "white",
    size = 1
  )
I must be missing something. Any help is much appreciated.
The goal is to have a function that will take an arbitrarily long table and plot a y axis with unique breaks but (possibly) repeated and ordered labels.
# Draw a heatmap of the `val` and `tes` columns of `dat`, one row per `id`.
#
# dat: data frame with columns `id`, `val` and `tes` (columns from `val`
#      through `tes` are stacked into long form).
# Returns the ggplot object.
heat <- function(dat) {
  long_dat <- gather(dat, kind, value, val:tes)
  # any other manipulation here
  tile_layer <- geom_tile(
    aes(x = kind, y = id, fill = value),
    color = "white",
    size = 1
  )
  ggplot(long_dat) + tile_layer
  # scale_y_discrete() (if needed)
}
The plot I am looking for is something like this (created in inkscape)
Using limits instead of breaks sets the order:
# `limits` fixes the order of the ids on the y axis (ids sorted by their
# label); `labels` then supplies the sorted labels in that same order.
ggplot(data = dat2) +
  geom_tile(
    mapping = aes(x = kind, y = id, fill = value),
    color = "white",
    size = 1
  ) +
  # print each id inside the first column so the rows can be checked
  geom_text(mapping = aes(x = 1, y = id, label = id), col = "white") +
  scale_y_discrete(
    limits = dat$id[order(dat$lab)],
    labels = sort(dat$lab)
  )