Related
This is not my data (for confidentiality reasons), but I have tried to create a reproducible example using a dataset included in the ggplot2 library. I have a histogram summarizing the value of some variable by group (a factor with 2 levels). First, I did not want counts but proportions of the total, so I used this code:
library(ggplot2)
library(dplyr)
# Keep only the two cut levels being compared.
df_example <- diamonds %>%
  as.data.frame() %>%
  filter(cut %in% c("Premium", "Ideal"))

# Histogram of z per cut, one panel per cut, with bar heights expressed as a
# share of the panel's observations instead of raw counts.
ggplot(df_example, aes(x = z, fill = cut)) +
  geom_histogram(
    # width * density is the proportion of the group's observations in a bin
    aes(y = after_stat(width * density)),
    binwidth = 1, center = 0.5, col = "black"
  ) +
  facet_wrap(~cut) +
  scale_x_continuous(breaks = seq(0, 9, by = 1)) +
  # accuracy = 1 labels whole percents; the "%" suffix is dropped because the
  # unit is stated in the axis title.
  scale_y_continuous(
    labels = scales::percent_format(accuracy = 1, suffix = "")
  ) +
  scale_fill_manual(values = c("#CC79A7", "#009E73")) +
  # The axis shows percentages, so it must not be titled "Count".
  labs(x = "Depth (mm)", y = "%") +
  theme_bw() +
  theme(legend.position = "none")
It gave me this as a result.
enter image description here
The issue is that I would like to print the numeric percentages on top of the bins, and I haven't found a way to do so.
As I saw it done for printing counts elsewhere, I attempted to print them using stat_bin(), including the same y and label values as the y in geom_histogram, thinking it would print the right numbers:
# Attempt at labelling the bars: the histogram layer is binned with
# binwidth/center, while the text layer below is given no binning arguments,
# so the two layers are binned independently of each other.
ggplot(data = df_example, mapping = aes(x = z, fill = cut)) +
  geom_histogram(
    mapping = aes(y = after_stat(width * density)),
    binwidth = 1,
    center = 0.5,
    colour = "black"
  ) +
  stat_bin(
    mapping = aes(
      y = after_stat(width * density),
      label = after_stat(width * density * 100)
    ),
    geom = "text",
    vjust = -0.5
  ) +
  facet_wrap(~cut) +
  scale_fill_manual(values = c("#CC79A7", "#009E73")) +
  scale_x_continuous(breaks = seq(0, 9, by = 1)) +
  scale_y_continuous(
    labels = scales::percent_format(accuracy = 2, suffix = "")
  ) +
  labs(x = "Depth (mm)", y = "%") +
  theme_bw() +
  theme(legend.position = "none")
However, it prints far more values than there are bins, the values are not consistent with the bar heights, and they are not positioned according to vjust=-.5, which should make them appear slightly above the bars.
enter image description here
What am I missing here? I know that if there was no grouping variable/facet_wrap, I could use after_stat(count/sum(count)) instead of after_stat(width*density) and it seems that it would have fixed my issue. But I need the histograms for both groups to appear next to each other. Thanks in advance!
You have to use the same arguments in stat_bin as for the histogram when adding your labels to get same binning for both layers and to align the labels with the bars:
library(ggplot2)
library(dplyr)
# Subset of diamonds restricted to the Premium and Ideal cuts.
df_example <- diamonds %>%
  as.data.frame() %>%
  filter(cut %in% c("Premium", "Ideal"))

# Per-cut histogram of z as a share of each panel, with the share printed
# above every bar. Both layers receive the same binwidth/center so that the
# labels are computed on the same bins as the bars.
ggplot(data = df_example, mapping = aes(x = z, fill = cut)) +
  geom_histogram(
    mapping = aes(y = after_stat(width * density)),
    binwidth = 1,
    center = 0.5,
    colour = "black"
  ) +
  stat_bin(
    mapping = aes(
      y = after_stat(width * density),
      # format the share as a whole-number percentage
      label = after_stat(
        scales::number(width * density, scale = 100, accuracy = 1)
      )
    ),
    geom = "text",
    binwidth = 1,
    center = 0.5,
    vjust = -0.25
  ) +
  facet_wrap(~cut) +
  scale_fill_manual(values = c("#CC79A7", "#009E73")) +
  scale_x_continuous(breaks = 0:9) +
  scale_y_continuous(labels = scales::number_format(scale = 100)) +
  labs(x = "Depth (mm)", y = "%") +
  theme_bw() +
  theme(legend.position = "none")
Here I have an interactive barplot produced by ggplotly. The only issue is that when I hover the mouse over the bars, the "model" field shows a strange number instead of A or B (see the picture). Is it possible to customize plotly's popup windows?
# Sales per model and year; change, percent and info are only filled in for
# the 2022 rows.
df <- data.frame(
  model = c("A", "A", "B", "B"),
  year = c("2022", "2021", "2022", "2021"),
  sale = c(350, 170, 300, 150),
  change = c(180, NA, 150, NA),
  percent = c(105.8, NA, 100, NA),
  info = c("180, 105.8%", NA, "300,100%", NA)
)

# Horizontal dodged bars (one per year within each model), with the info
# text drawn to the right of the longest bar.
plot <- ggplot(data = df, mapping = aes(x = sale, y = model, fill = year)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(
    mapping = aes(x = 1.11 * max(sale), label = info),
    fontface = "bold"
  ) +
  # leave room on the right for the text labels
  xlim(0, 1.2 * max(df$sale)) +
  scale_fill_brewer(palette = "Paired") +
  labs(fill = " ") +
  theme(legend.position = "bottom")

# Convert the static plot to an interactive plotly widget.
ggplotly(plot)
Personally, I avoid using ggplotly(), as it more often than not formats the visuals in a way that I do not want.
A full plotly approach could look like this:
# Native plotly version of the sales chart: one bar trace per year, a text
# trace carrying the info labels, and a custom hover template.
plot_ly(
  df,
  type = "bar",
  x = ~sale,
  y = ~model,
  color = ~year,
  text = ~year
) %>%
  # info labels, placed at 1.1 times the largest sale value
  add_trace(
    type = "scatter",
    mode = "text",
    x = ~max(df$sale) * 1.1,
    y = ~model,
    text = ~info,
    showlegend = FALSE
  ) %>%
  # hover text: one line each for sale, model and year
  style(
    hovertemplate = paste(
      c("Sale: %{x}", "Model: %{y}", "Year: %{text}"),
      collapse = "<br>"
    )
  )
You could also try to append the style() object to your ggplotly() object. I am not sure if this will work however.
For some reason, it works better if you use x=model and flip the axes:
# Same chart built with model on x and sale on y, then flipped with
# coord_flip() so the bars are still horizontal.
plot <- ggplot(data = df, mapping = aes(x = model, y = sale, fill = year)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(
    mapping = aes(y = 1.11 * max(sale), label = info),
    fontface = "bold"
  ) +
  # leave room past the longest bar for the text labels
  ylim(0, 1.2 * max(df$sale)) +
  scale_fill_brewer(palette = "Paired") +
  labs(fill = " ") +
  theme(legend.position = "bottom") +
  coord_flip()

ggplotly(plot)
I am currently working on a hate crime case study. For my plot I am using one zip code as my y-axis and plotting how many crimes occurred and which group is being targeted on the x-axis using geom_col. The problem is that my y-axis is adding the zip codes together rather than counting how many times the zip code shows up. Here is what my dataset looks like:
# Five-row sample data frame with columns ID, CRIME_TYPE,
# BIAS_MOTIVATION_GROUP and ZIP_CODE (appears to be dput() output).
# Note: the BIAS_MOTIVATION_GROUP strings end in trailing whitespace, and
# ZIP_CODE is stored as integer.
structure(list(ID = 1:5, CRIME_TYPE = c("VANDALISM", "ASSAULT", "VANDALISM", "ASSAULT",
"OTHER"), BIAS_MOTIVATION_GROUP = c("ANTI-BLACK ",
"ANTI-BLACK ", "ANTI-FEMALE HOMOSEXUAL (LESBIAN) ",
"ANTI-MENTAL DISABILITY ", "ANTI-JEWISH "),
ZIP_CODE = c(40291L, 40219L, 40243L, 40212L, 40222L
)), row.names = c(NA, 5L), class = "data.frame")
Here is my code:
library(ggplot2)
# Load the incident data and keep the records for a single ZIP code.
df <- read.csv("LMPD_OP_BIAS.csv", header = TRUE)
library(tidyverse)
hate_crime <- filter(df, ZIP_CODE == "40245")

# Bars per bias-motivation group. ZIP_CODE is mapped to y, so geom_col()
# stacks (adds up) the ZIP code values within each group.
hate_crime_plot <- ggplot(
  data = hate_crime,
  mapping = aes(
    x = BIAS_MOTIVATION_GROUP,
    y = ZIP_CODE,
    fill = BIAS_MOTIVATION_GROUP
  )
) +
  geom_col() +
  labs(x = "BIAS_MOTIVATION_GROUP", fill = "BIAS_MOTIVATION_GROUP") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(hate_crime_plot)

# Same plot, broken down by offence description (UOR_DESC) instead.
hate_crime_ploter <- ggplot(
  data = hate_crime,
  mapping = aes(x = UOR_DESC, y = ZIP_CODE, fill = UOR_DESC)
) +
  geom_col() +
  labs(x = "UOR_DESC", fill = "UOR_DESC") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(hate_crime_ploter)
For full data visit here: visit site to download data set
Alright, I think you've got a couple of issues here. What's happening in your code is that you're asking ggplot to make a bar plot with a categorical variable (BIAS_MOTIVATION_GROUP or UOR_DESC) on the x-axis and a numeric variable (ZIP_CODE) on the y-axis. Since there is more than one row per x value, geom_col stacks the bars, so the ZIP_CODE values are added together for each x value, which is what you'd expect from a bar plot. Long story short, I wonder if what you actually want is a bar chart of counts (the categorical equivalent of a histogram). Your dataset (hate_crime) only has one value of ZIP_CODE, so I'm not sure what plotting ZIP_CODE on the y-axis is supposed to visualize. A count plot would look like this:
# Number of records per UOR_DESC category in the filtered data.
# geom_bar() counts the rows for each x value by default, which is the
# intended statistic for a categorical x; geom_histogram() is meant for
# binning continuous variables.
hate_crime %>%
  ggplot(aes(x = UOR_DESC, fill = UOR_DESC)) +
  geom_bar() +
  labs(x = "UOR_DESC", fill = "UOR_DESC") +
  theme_minimal() +
  # rotate the category labels so long descriptions stay readable
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
If, instead, you're trying to visualize how often each ZIP code shows up in each category, you'd have to approach things differently. Perhaps you're looking for something like this?
# Number of records per UOR_DESC category across all ZIP codes, with each
# bar split by ZIP code. ZIP_CODE is converted to a factor so that it is
# treated as a discrete fill variable rather than a continuous one.
# geom_bar() counts rows per x value by default.
df %>%
  ggplot(aes(x = UOR_DESC, fill = factor(ZIP_CODE))) +
  geom_bar() +
  # rotate the category labels so long descriptions stay readable
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
I'm a little bit stuck with ggplot2, trying to plot several data frames in one plot.
I have several data frames; here I'll present just two examples.
The data frames have the same header but contain different values. Let's say that I want to count the balls that I have in 2 boxes.
# Ball colours and the number of balls of each colour in the two boxes.
name <- c("red", "blue", "green", "purple", "white", "black")
value1 <- c(2, 3, 4, 2, 6, 8)
value2 <- c(1, 5, 7, 3, 4, 2)

# One data frame per box, both with the columns Color and Count.
test1 <- data.frame(Color = name, Count = value1)
test2 <- data.frame(Color = name, Count = value2)
What I'm trying to do it's to make a bar plot of my count.
At the moment what I did it's :
# Two bar layers sharing the same x/y mapping, one per data frame. Both
# layers are drawn at the same x positions, so the bars overlap.
plot_test <- ggplot(NULL, aes(x = Color, y = Count)) +
  geom_bar(data = test1, stat = "identity", colour = "green") +
  geom_bar(data = test2, stat = "identity", colour = "blue")
print(plot_test)
I want to have x=Color and y=Count, with the bars of the test2 data frame next to those of test1. At the moment they overlap each other. So each name will appear twice on the x-axis, but I want to plot the data frames in different colors and show their names in the legend.
For example "Green bar" = test1
"Blue bar" = test2
Thank you for your time and your help.
Best regards
You have two options here:
Either tweak the size and position of the bars
# Draw each data frame as its own layer of narrow bars, nudged left/right of
# the category position so the two sets sit side by side.
# `fill` (not `colour`) is mapped to the data frame name: colour only affects
# the bar outline, whereas fill colours the bar itself and yields the legend.
ggplot(NULL, aes(x = Color, y = Count)) +
  geom_col(
    data = test1, aes(fill = "test1"),
    width = 0.4, position = position_nudge(x = -0.2)
  ) +
  geom_col(
    data = test2, aes(fill = "test2"),
    width = 0.4, position = position_nudge(x = 0.2)
  ) +
  # named values pin each data frame to its requested colour
  scale_fill_manual(name = NULL, values = c(test1 = "green", test2 = "blue"))
or what I recommend is join the two data frames together and then plot
library(dplyr)
# Stack the two data frames into one long table; `.id` records which data
# frame each row came from ("test1" / "test2"). This needs only dplyr and
# gives the legend meaningful labels instead of join suffixes such as
# "Count.x" / "Count.y".
bind_rows(test1 = test1, test2 = test2, .id = "source") %>%
  ggplot(aes(x = Color, y = Count, fill = source)) +
  # dodge places the bars of the two data frames next to each other
  geom_col(position = "dodge")
Try this:
# Ball colours and the number of balls of each colour in the two boxes.
name <- c("red", "blue", "green", "purple", "white", "black")
value1 <- c(2, 3, 4, 2, 6, 8)
value2 <- c(1, 5, 7, 3, 4, 2)
test1 <- data.frame(Color = name, Count = value1)
test2 <- data.frame(Color = name, Count = value2)

# Tag every row with the data frame it came from, then stack the two.
test1$var <- "test1"
test2$var <- "test2"
test_all <- rbind(test1, test2)

# Side-by-side bars per colour. `fill` is mapped instead of `colour`, because
# colour only changes the bar outline; the named values tie each data frame
# to its requested colour regardless of level order.
plot_test <- ggplot(test_all, aes(x = Color, y = Count, fill = var)) +
  geom_col(position = position_dodge(1)) +
  scale_fill_manual(values = c(test1 = "green", test2 = "blue"))
plot_test
This will do what you were trying to do:
# Long-format table: one row per (colour, box) combination.
balls <- data.frame(
  count = c(2, 3, 4, 2, 6, 8, 1, 5, 7, 3, 4, 2),
  colour = rep(
    c("red", "blue", "green", "purple", "white", "black"),
    times = 2
  ),
  box = rep(c("1", "2"), each = 6)
)

# geom_col() stacks bars by default; position = "dodge" puts the bars of the
# two boxes next to each other, as requested.
ggplot(balls, aes(x = colour, y = count, fill = box)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("1" = "green", "2" = "blue"))
This is better because it facilitates comparisons between the box counts:
# One panel per box, stacked vertically in a single column, with the panel
# strips relabelled from "1"/"2" to "Box #1"/"Box #2".
ggplot(data = balls, mapping = aes(x = colour, y = count)) +
  geom_col() +
  facet_wrap(
    vars(box),
    ncol = 1,
    labeller = as_labeller(c("1" = "Box #1", "2" = "Box #2"))
  )
I have been trying to plot a graph of two sets of data with different point symbols and connecting lines with different colors using the R package ggplot2, but for the life of me, I have not been able to get the legend to correctly distinguish between the two curves by showing the associated data point symbol for each curve.
I can get the legend to show different line colors. But I have not been able to make the legend to show different data point symbols for each set of data.
The following code:
# Run times measured at each thread count, on 4 and on 8 cores.
df <- data.frame(
  thrd_cnt = c(1, 2, 4, 8, 16),
  runtime4 = c(53, 38, 31, 41, 54),
  runtime8 = c(54, 35, 31, 35, 44)
)
library("ggplot2")

# One line + point layer pair per series. Only colour is mapped to the series
# name; the point shapes (21 and 23) are fixed per layer rather than mapped,
# so the legend is built from the colour scale alone.
runtime_plot <- ggplot(data = df, mapping = aes(x = thrd_cnt)) +
  geom_line(aes(y = runtime4, color = "4 cores")) +
  geom_point(
    aes(y = runtime4, color = "4 cores"),
    shape = 21, size = 3, fill = "white"
  ) +
  geom_line(aes(y = runtime8, color = "8 cores")) +
  geom_point(
    aes(y = runtime8, color = "8 cores"),
    shape = 23, size = 3, fill = "white"
  ) +
  scale_x_continuous(breaks = c(1, 2, 4, 8, 16)) +
  xlab("Number of Threads") +
  ylab(substitute(paste("Execution Time, ", italic(milisec)))) +
  labs(color = "# cores") +
  theme(legend.position = c(0.3, 0.8))
print(runtime_plot)

## save a pdf and a png
# ggsave() writes the most recently displayed plot.
ggsave("runtime.pdf", width = 5, height = 3.5)
ggsave("runtime.png", width = 5, height = 3.5)
outputs this graph:
plot
But the data point symbols in the legend are not distinguishable. The legend shows the same symbol for both graphs (which is formed of both data point symbols on top of each other).
One possible solution is to define the number of threads as a factor, then I might be able to get the data point symbols on the legend right, but still I don't know how to do that.
Any help would be appreciated.
As noted, you need to gather the data into a long format so you can map the cores variable to colour and shape. To keep the same choices of shape and fill as in your original plot, use scale_shape_manual to set the shape corresponding to each level of cores. Note that you need to set the name for both the colour and shape legends in labs() to ensure they coincide and don't produce two legends. I also used mutate so that the levels of cores don't confusingly include the word runtime.
# Run times measured at each thread count, on 4 and on 8 cores.
df <- data.frame(
  thrd_cnt = c(1, 2, 4, 8, 16),
  runtime4 = c(53, 38, 31, 41, 54),
  runtime8 = c(54, 35, 31, 35, 44)
)
library(tidyverse)

# Reshape to long format: one row per (thread count, core count).
# pivot_longer() replaces the superseded gather(); names_prefix strips the
# "runtime" prefix, so core counts with more than one digit are kept whole.
runtime_long <- df %>%
  pivot_longer(
    cols = c(runtime4, runtime8),
    names_to = "cores",
    names_prefix = "runtime",
    values_to = "runtime"
  ) %>%
  mutate(cores = str_c(cores, " cores"))

ggplot(
  data = runtime_long,
  mapping = aes(x = thrd_cnt, y = runtime, colour = cores)
) +
  geom_line() +
  # shape is mapped to the same variable as colour so the legend keys differ
  geom_point(aes(shape = cores), size = 3, fill = "white") +
  scale_x_continuous(breaks = c(1, 2, 4, 8, 16)) +
  scale_shape_manual(values = c("4 cores" = 21, "8 cores" = 23)) +
  theme(legend.position = c(0.3, 0.8)) +
  # identical titles for colour and shape make ggplot2 merge the two legends
  labs(
    x = "Number of Threads",
    y = "Execution Time (millisec)",
    colour = "# cores",
    shape = "# cores"
  )
Created on 2018-04-10 by the reprex package (v0.2.0).
or shape is fine too, and if you're doing more stuff with df, might make sense to convert and keep it in long, 'tidy' format.
library("ggplot2")
# tidyr provides the reshaping function; ggplot2 alone does not attach it.
library("tidyr")

# Run times measured at each thread count, on 4 and on 8 cores.
df <- data.frame(
  thrd_cnt = c(1, 2, 4, 8, 16),
  runtime4 = c(53, 38, 31, 41, 54),
  runtime8 = c(54, 35, 31, 35, 44)
)

# Long format: the series name goes to `runtime`, the measurement to
# `millisec`.
df <- pivot_longer(
  df,
  cols = c(runtime4, runtime8),
  names_to = "runtime",
  values_to = "millisec"
)

# Mapping both colour and shape to the series gives distinct legend keys.
ggplot(
  data = df,
  aes(x = thrd_cnt, y = millisec, color = runtime, shape = runtime)
) +
  geom_line() +
  geom_point()
after gathering into a "long" formatted data frame, you pass colour and shape (pch) to the aesthetics arguments:
library(tidyverse)

# Run times measured at each thread count, on 4 and on 8 cores.
df <- data.frame(
  thrd_cnt = c(1, 2, 4, 8, 16),
  runtime4 = c(53, 38, 31, 41, 54),
  runtime8 = c(54, 35, 31, 35, 44)
)

# Gather every column except thrd_cnt into key/value pairs (run, time), then
# map the series to both point shape and colour.
gather(df, key = run, value = time, -thrd_cnt) %>%
  ggplot(mapping = aes(x = thrd_cnt, y = time, shape = run, colour = run)) +
  geom_line() +
  geom_point()
(Notice how brief the code is, compared to the original post)