insert labels in proportional bar chart with ggplot2 and geom_text - r

I am trying to insert labels into a proportional barchart: one label per segment, with as text the percentage of each segment. With the help of thothal I managed to do this:
var1 <- factor(as.character(c(1,1,2,3,1,4,3,2,3,2,1,4,2,3,2,1,4,3,1,2)))
var2 <- factor(as.character(c(1,4,2,3,4,2,1,2,3,4,2,1,1,3,2,1,2,4,3,2)))
data <- data.frame(var1, var2)
dat <- ddply(data, .(var1), function(.) {
res <- cumsum(prop.table(table(factor(.$var2))))
data.frame(lab = names(res), y = c(res))
})
ggplot(data, aes(x = var1)) + geom_bar(aes(fill = var2), position = 'fill') +
geom_text(aes(label = lab, x = var1, y = y), data = dat)
I would like to have for labels the percentage of each level, and not the level name.
Any help appreciated!

You are telling geom_text to use var2 as your y variable. That is in fact as.numeric(data$var2), which translates to a range of 1-4. However, your barplot uses the cumulative percentages.
Hence you have to calculate these positions before:
library(ggplot2)
library(plyr) # just for convenience
var1 <- factor(as.character(c(1,1,2,3,1,4,3,2,3,2,1,4,2,3,2,1,4,3,1,2)))
var2 <- factor(as.character(c(1,4,2,3,4,2,1,2,3,4,2,1,1,3,2,1,2,4,3,2)))
data <- data.frame(var1, var2)
dat <- ddply(data, .(var1), function(.) {
res <- cumsum(prop.table(table(factor(.$var2)))) # re-factor to use only used levels
res2 <- prop.table(table(factor(.$var2))) # re-factor to use only used levels
data.frame(lab = names(res), y = c(res), lab2 = c(res2))
})
ggplot(data, aes(x = var1)) + geom_bar(aes(fill = var2), position = 'fill') +
geom_text(aes(label = round(lab2, 2), x = var1, y = y), data = dat)
This places the labs at the end of each bar. If you want to have them slightly offset, you should play arround in the creation of dat.

Another way to get non-cumulative percentage plus centering the labels, for future reference:
dat <- ddply(data, .(var1), function(.) {
good <- prop.table(table(factor(.$var2)))
res <- cumsum(prop.table(table(factor(.$var2))))
data.frame(lab = names(res), y = c(res), good = good, pos = cumsum(good) - 0.5*good)
})
ggplot(data, aes(x = var1)) + geom_bar(aes(fill = var2), position = 'fill') +
geom_text(aes(label = round(good.Freq, 2), x = var1, y = pos.Freq), data = dat)

I used the following code and work well for me, give it a try.
geom_text(aes(label = paste(round(dat2$value,0), "%"),
vjust = ifelse(value >= 0, -0.05, 1.15)
),
size = 4, position = position_stack(vjust=0.5)
)
Basically, you need label = paste(y value, "%"). In my code, dat2 is the data file name; value is the Y value in the figure. In this case, I rounded up the number with 0 decimal.Good luck.

Related

for loop run over factor levels to plot several plots not working

I am trying to run a for loop over a factor level (treatment in this case) to plot graphs for each of the levels using a function. My goal is to obtain several graphs on my wd(), one for each treatment level.
Problem: The outcome is always one single messed up barplot with all the variables and errorbars included.
dataset looks something like this:
set.seed(108) test <- data.frame(
n = 1:12,
treatment = factor(paste("trt", 1:2)),
rep= factor(paste("rep", 1:2)),
type = sample(LETTERS, 3),
mean= sample(1:100, 12),
sd= sample(1:50, 12),
var3 = sample(1:100, 12),
var4 = sample(1:100, 12))
I believe that I'm missing something on my for loop code:
df$treatment<- as.factor(df$treatment)
treatment_levels<- unique(levels(df$treatment))
for(i in 1:length(treatment_levels)){
df <- df[treatment_levels[i],]
x <- df$type
avg <- df$mean
sd <- df$sd
grp<- df$rep
title<- treatment_levels[i]
xtitle<- "type"
ytitle<- " "
fig_name <- paste(title,"_bp")
bpfunction(df, x, avg, sd, grp, title, xtitle, ytitle, fig_name)
}
my function to plot a barplot is:
bpfunction(df, x, avg, sd, grp, title, xtitle, ytitle, fig_name)
{
bp <- ggplot(df, aes(x = x, y = avg, fill = grp)) +
geom_bar(stat = 'identity', aes(fill = grp), size = 1) +
geom_errorbar(aes(ymin=avg-sd , ymax=avg +sd))+
labs(x = x, y = avg, title = title)
ggsave(paste(fig_name, "png", sep = "."), plot = bp)
}
Several issues with your implementation:
Currently, your row indexing is not logical but factor value:
df <- df[treatment_levels[i],]
Therefore, adjust to proper filtering by column:
sub_df <- df[df$treatment == treatment_levels[i],]
Better yet, avoid bookkeeping of unique treatment factor levels and use by (object-oriented wrapper to tapply) or split + lapply.
Reassigning df in a for loop. After first iteration, df can no longer be subsetted for other treatment values. Therefore, use a different object name. (Actually, avoid df altogether for more substantive name).
Using a numeric vector of many values as labels per your labs argument.
Avoid passing vectors pointing to data frame columns into aes. Instead, pass string variables to be dynamically rendered with .data[[]] or double curly brace {{}} (ggplot2 v3.0.0+):
bpfunction(treatment_df, x, avg, sd, grp, title, xtitle, ytitle, fig_name)
{
bp <- ggplot(treatment_df, aes(x = .data[[x]], y = .data[[avg]], fill = .data[[grp]])) +
geom_bar(stat = 'identity', aes(fill = .data[[grp]]), size = 1) +
geom_errorbar(aes(ymin={{avg}}-{{sd}}, ymax={{avg}}+{{sd}})) +
labs(x = x, y = avg, title = title)
ggsave(fig_name, plot = bp)
return(bp)
}
# REFACTOR USING by
treatment_plots <- by(df, df$treatment, function(sub_df)
bpfunction(
sub_df,
x = "type",
avg = "mean",
sd = "sd",
grp = "rep",
title sub_df$treatment[1],
xtitle = "type",
ytitle = " ",
fig_name = paste0(sub_df$treatment[1], "_bp.png")
)
)
# REFACTOR USING split + lapply
treatment_plots <- split(df, df$treatment) |> lapply(
function(sub_df) bp_function(...same as above...)
)

How to draw a multi-colored dashed line (alternating colors for visual effect) [duplicate]

This question already has answers here:
Alternating color of individual dashes in a geom_line
(4 answers)
Closed 8 months ago.
I was wondering if it is possible to create a multicolored dashed line in ggplot.
Basically I have a plot displaying savings based on two packages.
A orange line with savings based on package A
A green line with savings based on package B
I also have a third line and I would like that one to be dashed alterenating between orange and green. Is that something that somebody has been able to do?
Here is an example:
library(tidyverse)
S <- seq(0, 5, by = 0.05)
a <- S ^ 2
b <- S
a_b = a + b #This data should have the dashed multicolor line, since it is the sum of the other two lines.
S <- data.frame(S)
temp <- cbind(S, a, b, a_b)
temp <- gather(temp, variable, value, -S)
desiredOrder <- c("a", "b", "a_b")
temp$variable <- factor(temp$variable, levels = desiredOrder)
temp <- temp[order(temp$variable),]
p <- ggplot(temp, aes(x = S, y = value, colour = variable)) +
theme_minimal() +
geom_line(size = 1) +
scale_color_manual(name = "Legend", values = c("orange", "green", "#0085bd"),
breaks = c("a", "b", "a_b"))
p
I basically want to have a multicolored (dashed or dotted) line for "c"
This is, to my best knowledge, currently only possible via creation of new segments for each alternate color. This is fiddly.
Below I've tried a largely programmatic approach in which you can define the size of the repeating segment (based on your x unit). The positioning of y values is slightly convoluted and it will also result in slightly irregular segment lengths when dealing with different slopes. I also haven't tested it on many data, either. But I guess it's a good start :)
For the legend, I'm taking the same approach, by creating a fake legend and stitching it onto the other plot. The challenges here include:
positioning of legend elements relative to the plot
relative distance between the legend elements
update
For a much neater way to create those segments and a Stat implementation see this thread
library(tidyverse)
library(patchwork)
S <- seq(0, 5, by = 0.05)
a <- S^2
b <- S
a_b <- a + b
df <- data.frame(x = S, a, b, a_b) %>%
pivot_longer(-x, names_to = "variable", values_to = "value")
## a function to create modifiable cuts in order to get segments.
## this looks convoluted - and it is! there are a few if/else statements.
## Why? The assigment of new y to x values depends on how many original values
## you have.
## There might be more direct ways to get there
alt_colors <- function(df, x, y, seg_length, my_cols) {
x <- df[[x]]
y <- df[[y]]
## create new x for each tiny segment
length_seg <- seg_length / length(my_cols)
new_x <- seq(min(x, na.rm = TRUE), x[length(x)], length_seg)
## now we need to interpolate y values for each new x
## This is different depending on how many x and new x you have
if (length(new_x) < length(x)) {
ind_int <- findInterval(new_x, x)
new_y <- sapply(seq_along(ind_int), function(i) {
if (y[ind_int[i]] == y[ind_int[length(ind_int)]]) {
y[ind_int[i]]
} else {
seq_y <- seq(y[ind_int[i]], y[ind_int[i] + 1], length.out = length(my_cols))
head(seq_y, -1)
}
})
} else {
ind_int <- findInterval(new_x, x)
rle_int <- rle(ind_int)
new_y <- sapply(rle_int$values, function(i) {
if (y[i] == y[max(rle_int$values)]) {
y[i]
} else {
seq_y <- seq(y[i], y[i + 1], length.out = rle_int$lengths[i] + 1)
head(seq_y, -1)
}
})
}
## THis is also a bit painful and might cause other bugs that I haven't
## discovered yet.
if (length(unlist(new_y)) < length(new_x)) {
newdat <- data.frame(
x = new_x,
y = rep_len(unlist(new_y), length.out = length(new_x))
)
} else {
newdat <- data.frame(x = new_x, y = unlist(new_y))
}
newdat <- newdat %>%
mutate(xend = lead(x), yend = lead(y)) %>%
drop_na(xend)
newdat$color <- my_cols
newdat
}
## the below is just a demonstration of how the function would work
## using different segment widths
df_alt1 <-
df %>%
filter(variable == "a_b") %>%
alt_colors("x", "value", 1, c("orange", "green"))
df_alt.5 <-
df %>%
filter(variable == "a_b") %>%
alt_colors("x", "value", .5, c("orange", "green"))
df_ab <-
df %>%
filter(variable != "a_b") %>%
# for the identity mapping
mutate(color = ifelse(variable == "a", "green", "orange"))
## create data frame for the legend, also using the alt_colors function as per above
## the amount of x is a bit of trial and error, this is just a quick hack
## this is a trick to center the legend more or less relative to the main plot
y_leg <- ceiling(mean(range(df$value, na.rm = TRUE)))
dist_y <- 2
df_legend <-
data.frame(
variable = rep(unique(df$variable), each = 2),
x = 1:2,
y = rep(seq(y_leg - dist_y, y_leg + dist_y, by = dist_y), each = 2)
)
df_leg_onecol <-
df_legend %>%
filter(variable != "a_b") %>%
mutate(color = ifelse(variable == "a", "green", "orange"))
df_leg_alt <-
df_legend %>%
filter(variable == "a_b") %>%
alt_colors("x", "y", .5, c("orange", "green"))
## I am mapping the colors globally using identity mapping (see scale_identity).
p1 <-
ggplot(mapping = aes(x, value, colour = color)) +
theme_minimal() +
geom_line(data = df_ab, size = 1) +
geom_segment(data = df_alt1, aes(y = y, xend = xend, yend = yend), size = 1) +
scale_color_identity() +
ggtitle("alternating every 1 unit")
p.5 <-
ggplot(mapping = aes(x, value, colour = color)) +
theme_minimal() +
geom_line(data = df_ab, size = 1) +
geom_segment(data = df_alt.5, aes(y = y, xend = xend, yend = yend), size = 1) +
scale_color_identity() +
ggtitle("alternating every .5 unit")
p_leg <-
ggplot(mapping = aes(x, y, colour = color)) +
theme_void() +
geom_line(data = df_leg_onecol, size = 1) +
geom_segment(data = df_leg_alt, aes(xend = xend, yend = yend), size = 1) +
scale_color_identity() +
annotate(
geom = "text", y = unique(df_legend$y), label = unique(df_legend$variable),
x = max(df_legend$x + 1), hjust = 0
)
## set y limits to the range of the main plot
## in order to make the labels visible you need to adjust the plot margin and
## turn clipping off
p1 + p.5 +
(p_leg + coord_cartesian(ylim = range(df$value), clip = "off") +
theme(plot.margin = margin(r = 20, unit = "pt"))) +
plot_layout(widths = c(1, 1, .2))
Created on 2022-01-18 by the reprex package (v2.0.1)
(Copied this over from Alternating color of individual dashes in a geom_line)
Here's a ggplot hack that is simple, but works for two colors only. It results in two lines being overlayed, one a solid line, the other a dashed line.
library(dplyr)
library(ggplot2)
library(reshape2)
# Create df
x_value <- 1:10
group1 <- c(0,1,2,3,4,5,6,7,8,9)
group2 <- c(0,2,4,6,8,10,12,14,16,18)
dat <- data.frame(x_value, group1, group2) %>%
mutate(group2_2 = group2) %>% # Duplicate the column that you want to be alternating colors
melt(id.vars = "x_value", variable.name = "group", value.name ="y_value") # Long format
# Put in your selected order
dat$group <- factor(dat$group, levels=c("group1", "group2", "group2_2"))
# Plot
ggplot(dat, aes(x=x_value, y=y_value)) +
geom_line(aes(color=group, linetype=group), size=1) +
scale_color_manual(values=c("black", "red", "black")) +
scale_linetype_manual(values=c("solid", "solid", "dashed"))
Unfortunately the legend still needs to be edited by hand. Here's the example plot.

Can't print all ggplot charts I need

There is a good discussion about using ggplot in loop and other creative ways at Looping over variables in ggplot. However, the discussion does not quite solve my problem.
I have a vertical dataset that I need to create plots from in a loop. There is no error in the code but my code only prints the last plot. Can't figure out why. Here is a reproducible example:
df <- cbind.data.frame(var = sample(c('a','b'), size = 100, replace = TRUE),
grp = sample(c('x','y'), size = 100, replace = TRUE), value = rnorm(100))
for (i in 2) {
plot.df <- df[which(df$var == c('a','b')[i]),]
print(ggplot(plot.df, aes(x = 1:nrow(plot.df), y = value, color = grp)) +
geom_line() + ggtitle(c('a','b')[i]))
}
As an alternative, you might also consider using lapply, as it makes the code a lot more readable.
If I am not mistaken you want to produce plots for each of the levels of the variable var.
You can firstly define your function, and then apply it to all levels
my_plot <- function(x){
# debug: x <- "a"
plot.df <- df[df$var %in% x,]
ggplot(plot.df, aes(x = 1:nrow(plot.df), y = value, color = grp)) +
geom_line() + ggtitle(x)
}
lapply(unique(df$var), my_plot)
The comment by #EJJ is correct, your loop isn't you need something like
for (i in seq_along(1:nlevels(factor(df$var))))
library(ggplot2)
library(dplyr)
df <- cbind.data.frame(var = sample(c('a','b'), size = 100, replace = TRUE),
grp = sample(c('x','y'), size = 100, replace = TRUE), value = rnorm(100))
for (i in seq_along(1:nlevels(factor(df$var)))) {
plot.df <- df[which(df$var == c('a','b')[i]),]
print(ggplot(plot.df, aes(x = 1:nrow(plot.df), y = value, color = grp)) +
geom_line() + ggtitle(c('a','b')[i]))
}

How to control legend with many groups

I have a plot like this:
Which was created with this code:
# Make data:
set.seed(42)
n <- 1000
df <- data.frame(values = sample(0:5, size = n, replace = T, prob = c(9/10, rep(0.0167,5))),
group = rep(1:100, each = 10),
fill2 = rep(rnorm(10), each = 100),
year = rep(2001:2010, times = 100)
)
df$values <- ifelse(df$year %in% 2001:2007 == T, 0, df$values)
# Plot
require(ggplot2)
p <- ggplot(data = df, aes(x = year, y = values, colour = as.factor(group))) + geom_line()
p
Since there are so many groups, the legend is really not helpfull.
Ideally I would like just two elements in the legend, one for group = 1 and for all the other groups (they should all have the same color). Is there a way to force this?
you can define a new variable that has only two values, but still plot lines according to their original group,
ggplot(data = df, aes(x = year, y = values, group = group,
colour = ifelse(group == 1, "1", "!1"))) +
geom_line() +
scale_colour_brewer("groups", palette="Set1")

ggplot2 histogram binwidth [duplicate]

This question already has answers here:
Different breaks per facet in ggplot2 histogram
(4 answers)
Closed 8 years ago.
I would like to create multiple histograms within one plot (using facet_wrap).
This could be an example code:
df <- data.frame(p1 = rnorm(100,5,2), p2 = rnorm(100,80,20), group = rep(LETTERS[1:4],25))
library(ggplot2)
library(reshape)
plotData <- melt(df, id.vars = "group", measure.vars = c("p1","p2") )
m <- ggplot(plotData, aes(x = value, color = group, fill = group, group = group))
m <- m + geom_bar(position=position_dodge())
m <- m + facet_wrap( ~ variable,scales = "free_x")
print(m)
Now, I would like to modify the plot that it creates per parameter ("p1,"p2") let's say 10 bins.
Up to now, I could not find a way to do this as binwidth/breaks calculation should be dependent on a subset of data.
Is it possible at all?
I want to share my solution (taken from the answered question linked above) extended by the possibility to overlay the histograms with density curves scaled to histogram counts:
df <- data.frame(p1 = rnorm(1000,5,2), p2 = rnorm(1000,80,20), group = rep(LETTERS[1:4],25))
library(ggplot2)
library(reshape)
library(plyr)
plotData <- melt(df, id.vars = "group", measure.vars = c("p1","p2") )
nBins <- 10
groupedData <- dlply(plotData, .(variable))
groupedBinWidth <- llply(groupedData, .fun = function(data, nBins) {
r <- range(data$value, na.rm = TRUE, finite = TRUE)
widthOfBins = (r[2] - r[1])/nBins
if (is.na(widthOfBins) || is.infinite(widthOfBins) || (widthOfBins <= 0)) widthOfBins <- NULL
widthOfBins
}, nBins = nBins)
densData <- dlply(plotData, .(variable, group), .fun = function(subData){
param <- subData$variable[1]
group <- subData$group[1]
d <- density(subData$value)
bw <- groupedBinWidth[[param]]
data.frame(x = d$x, y = d$y * nrow(subData) * bw , group = group, variable = param)
})
hls <- mapply(function(x, b) geom_bar(aes(x = value), position = position_dodge(), data = x, binwidth = b),
groupedData, groupedBinWidth)
dLay <- mapply(function(data) geom_density(data = data, aes(x = x, y = y), stat = "identity", fill = NA, size = 1),
densData)
m <- ggplot(plotData, aes(x = value, color = group, fill = group, group = group))
m <- m + hls
m <- m + dLay
m <- m + facet_wrap( ~ variable,scales = "free")
print(m)
Try this - really ugly code, but works if I understand you correctly. You might want to play with geom_density and maybe remove fill to make it more readable.
nbin<- 5
m <- ggplot(plotData, aes(x = value, color = group, fill = group, group = group))
m <- m + geom_histogram(data = subset(plotData, variable == "p1"), binwidth=diff(range(subset(plotData, variable == "p1")$value))/nbin)
m <- m + geom_histogram(data = subset(plotData, variable == "p2"), binwidth=diff(range(subset(plotData, variable == "p2")$value))/nbin)
m <- m + facet_wrap( ~ variable,scales = "free_x")
print(m)

Resources