I have 16000 ish missing persons data that I am trying to order by Count and then plot on a graph. this is the code i am using. I am wanting to plot only the top ten.
mp.city <-mp.All %>%
group_by(State, City, Sex) %>%
summarise(Count = n())
mp.city %>%
arrange(desc(Count)) %>%
slice(1:10) %>%
ggplot(aes(y = City)) +
geom_bar()
the code will run but the plot is garbage. Any help would be amazing thank you!
I think you can manage it con head():
url<-'https://raw.githubusercontent.com/kitapplegate/fall2020/master/mpAll.csv'
mp.All<-read.csv(url)
library(ggplot2)
library(dplyr)
mp.city <-mp.All %>%
group_by(State, City, Sex) %>%
summarise(Count = n())
mp.city %>%
# sort
arrange(desc(Count)) %>%
# top 10 overall
head(10) %>%
# plot ordered
ggplot(aes(x = reorder(City,Count), y = Count))+
geom_bar( stat = "identity") +
# flipped
coord_flip() +
# label for x axis (flipped)
xlab("City")
P.S.
Next time try to share your data with dput(head(yourdata)) and posting the result, it's way better.
Related
I am working on the Tidy Tuesday data this week and ran into my geom_area doing what I think is overlapping the data. If I facet_wrap the data then there are no missing values in any year, but as soon as I make an area plot and fill it the healthcare/education data seems to disappear.
Below are example plots of what I mean.
library(tidyverse)
chain_investment <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-08-10/chain_investment.csv')
plottable_investment <- chain_investment %>%
filter(group_num == c(12,17)) %>%
mutate(small_cat = case_when(
group_num == 12 ~ "Transportation",
group_num == 17 ~ "Education/Health"
)) %>%
group_by(small_cat, year, category) %>%
summarise(sum(gross_inv_chain)) %>%
ungroup %>%
rename(gross_inv_chain = 4)
# This plot shows that there is NO missing education, health, or highway data
# Goal is to combine the data on one plot and fill based on the category
plottable_investment %>%
ggplot(aes(year, gross_inv_chain)) +
geom_area() +
facet_wrap(~category)
# Some of the data in the health category gets lost? disappears? unknown
plottable_investment %>%
ggplot(aes(year, gross_inv_chain, fill = category)) +
geom_area()
# Something is going wrong here?
plottable_investment %>%
filter(category %in% c("Education","Health")) %>%
ggplot(aes(year, gross_inv_chain, fill = category)) +
geom_area(position = "identity")
# The data is definitely there
plottable_investment %>%
filter(category %in% c("Education","Health")) %>%
ggplot(aes(year, gross_inv_chain)) +
geom_area() +
facet_wrap(~category)
The issue is that you filtered your data using == instead of using %in%.
In your case using == has the subtle side effect that for some categories (e.g. Health) your filtered data contains only obs for even years, while for others (e.g. Education) we end up with obs for only uneven years. As a result you end up with "two" area charts which overlap each other.
This could be easily seen by switching to geom_col which gives you a "dodged" bar plot as we have only one category per year.
plottable_investment %>%
filter(category %in% c("Education","Health")) %>%
ggplot(aes(year, gross_inv_chain, fill = category)) +
geom_col()
Using %in% instead gives the desired stacked area chart with all observations per category:
plottable_investment1 <- chain_investment %>%
filter(group_num %in% c(12,17)) %>%
mutate(small_cat = case_when(
group_num == 12 ~ "Transportation",
group_num == 17 ~ "Education/Health"
)) %>%
group_by(small_cat, year, category) %>%
summarise(gross_inv_chain = sum(gross_inv_chain)) %>%
ungroup()
#> `summarise()` has grouped output by 'small_cat', 'year'. You can override using the `.groups` argument.
plottable_investment1 %>%
filter(category %in% c("Education","Health")) %>%
ggplot(aes(year, gross_inv_chain, fill = category)) +
geom_area()
I am trying to create a "order" stacked bar chart that each stack is colored by one variable and ordered by another variable, please find my example as below:
library(ggplot2)
library(dplyr)
data(iris)
chart.df.st00 <- iris %>%
as_tibble %>%
mutate(`Sepal.Length`=round(`Sepal.Length`)) %>%
count(Species,`Sepal.Length`) %>%
mutate(`Sepal.Length`=as.character(`Sepal.Length`)) %>%
group_by(Species) %>%
mutate(percent=n/sum(n)*100) %>%
arrange(desc(n)) %>%
mutate(rank=1:n()) %>%
ungroup %>%
mutate(rank=paste(Species,rank,sep='-'))
chart.df.st01 <- chart.df.st00 %>%
left_join(chart.df.st00 %>%
distinct(`Sepal.Length`) %>%
mutate(color=colorRampPalette(
RColorBrewer::brewer.pal(length(unique(chart.df.st00$`Sepal.Length`)),'Set1'))(length(unique(chart.df.st00$`Sepal.Length`)))))
chart.color1.st00 <- chart.df.st01 %>%
distinct(rank,color) %>%
arrange(rank)
chart.color1.st01 <- chart.color1.st00$color
names(chart.color1.st01) <- chart.color1.st00$rank
chart1 <- ggplot(data=chart.df.st01,
aes(x=1,y=percent)) +
geom_bar(aes(fill=rank),stat='identity') +
scale_fill_manual(values=chart.color1.st01) +
facet_wrap(.~Species,ncol = 1) +
scale_y_reverse(breaks=c(0,25,50,75,100),labels=c(100,75,50,25,0)) +
coord_flip()
chart.color2.st00 <- chart.df.st01 %>%
distinct(color,Sepal.Length) %>%
arrange(Sepal.Length)
chart.color2.st01 <- chart.color2.st00$color
names(chart.color2.st01) <- chart.color2.st00$`Sepal.Length`
chart2 <- ggplot(data=chart.df,
aes(x=1,y=percent)) +
geom_bar(aes(fill=`Sepal.Length`),stat='identity') +
scale_fill_manual(values=chart.color2.st01) +
facet_wrap(.~Species,ncol = 1) +
coord_flip()
In my example, each stack is filled by Sepal.Length, and order by rank, chart1 has the ordering of the stacks I want, but not the legend, while chart2 has the legend I want, but not the ordering of the stacks.
Is there a way to have a single chart with the stacked bar of chart1 and the legend of chart2?
Thanks!
Using the code for your second chart this could be achieved by additionally mapping rank on the group aes:
library(ggplot2)
library(dplyr)
data(iris)
chart.df.st00 <- iris %>%
as_tibble %>%
mutate(`Sepal.Length`=round(`Sepal.Length`)) %>%
count(Species,`Sepal.Length`) %>%
mutate(`Sepal.Length`=as.character(`Sepal.Length`)) %>%
group_by(Species) %>%
mutate(percent=n/sum(n)*100) %>%
arrange(desc(n)) %>%
mutate(rank=1:n()) %>%
ungroup %>%
mutate(rank=paste(Species,rank,sep='-'))
chart.df.st01 <- chart.df.st00 %>%
left_join(chart.df.st00 %>%
distinct(`Sepal.Length`) %>%
mutate(color=colorRampPalette(
RColorBrewer::brewer.pal(length(unique(chart.df.st00$`Sepal.Length`)),'Set1'))(length(unique(chart.df.st00$`Sepal.Length`)))))
#> Joining, by = "Sepal.Length"
chart.color2.st00 <- chart.df.st01 %>%
distinct(color,Sepal.Length) %>%
arrange(Sepal.Length)
chart.color2.st01 <- chart.color2.st00$color
names(chart.color2.st01) <- chart.color2.st00$`Sepal.Length`
ggplot(data=chart.df.st01,
aes(x=1,y=percent)) +
geom_bar(aes(fill=`Sepal.Length`, group = rank), stat='identity') +
scale_fill_manual(values = chart.color2.st01) +
facet_wrap(.~Species,ncol = 1) +
scale_y_reverse(breaks=c(0,25,50,75,100),labels=c(100,75,50,25,0)) +
coord_flip()
Let's say I'm working with proportions, I have two main variables (sex and pain_level). It's not difficult to plot them:
With tidyverse and broom (and thanks for this link here: Calling prop.test function in R with dplyr) I can compare if the proportions are statistically different.
Now comes the question!
I want to add to the plot, the error bar. I know it's not as difficult as I'm thinking, but I could not find a way to do it. I've tried to replicate this link here (http://www.andrew.cmu.edu/user/achoulde/94842/labs/lab07_solution.html) but I'm trying to stay at tidyverse environment.
The desired output should be something like that:
Please feel free to use the script/syntax below that simulate the original dataset.
library(tidyverse)
ds <- data.frame(sex = rep(c("M","F"), 18),
pain_level = c("High","Moderate","low"))
#plot
ds %>%
group_by(pain_level, sex) %>%
summarise(n=n()) %>%
mutate(prop = n/sum(n)*100) %>%
ggplot(., aes(x = sex, fill = pain_level, y = prop)) +
geom_bar(stat = "summary") +
facet_wrap( ~ pain_level) +
theme(legend.position = "none")
#p values of proportion test
ds %>%
rowwise %>%
group_by(pain_level, sex) %>%
summarise(cases = n()) %>%
mutate(pop = sum(cases)) %>% #compute totals
distinct(., pain_level, .keep_all= TRUE) %>% #keep only one value of the row
mutate(tst = list(broom::tidy(prop.test(cases, pop, conf.level=0.95)))) %>%
tidyr::unnest(tst)
I think the following might roughly resemble your desired output:
ds %>%
group_by(pain_level, sex) %>%
summarise(cases = n()) %>%
mutate(pop = sum(cases)) %>%
rowwise() %>%
mutate(tst = list(broom::tidy(prop.test(cases, pop, conf.level=0.95)))) %>%
tidyr::unnest(tst) %>%
ggplot(aes(sex, estimate, group = pain_level)) +
geom_col(aes(fill = pain_level)) +
geom_errorbar(aes(ymin = conf.low, ymax = conf.high)) +
facet_wrap(~ pain_level)
on my research on how to plot multiple line charts I came across the following paper:
https://arxiv.org/pdf/1808.06019.pdf
It is showing a way on how huge amount of time series data is displayed by combining every line chart with a common headmap, the result looks like kind of equal to this representation:
I was looking for an R package (but could not find anything) or a nice implementation for ggplot to achieve the same result. So I am able to plot a lot of geom_lines and color them differently but I do not know how to actually apply the headmap to it.
Does anyone has a hint/idea for me?
Thanks!
Stephan
library(tidyverse)
datasets::ChickWeight # from Base R
ggplot(ChickWeight, aes(Time, weight, group = Chick)) + geom_line()
The wrangling here counts how many readings in each time / weight bucket, and normalizes to "share of most common reading" for each Time.
ChickWeight %>%
count(Time, weight = 10*floor(weight/10)) %>%
complete(Time, weight = 10*0:30, fill = list(n = 0)) %>%
group_by(Time) %>%
mutate(share = n / max(n)) %>% # weighted for num as % of max for that Time
ungroup() %>%
ggplot(aes(Time, weight, fill = share)) +
geom_tile(width = 2) +
scale_fill_viridis_c(direction = -1)
If your data has sparse time readings, it might be useful to interpolate your lines to get more resolution for binning:
ChickWeight %>%
group_by(Chick) %>%
arrange(Time) %>%
padr::pad_int("Time", step = 0.5) %>%
mutate(weight_approx = approx(Time, weight, Time)$y) %>%
ungroup() %>%
count(Time, weight_approx = 10*floor(weight_approx/10)) %>%
complete(Time, weight_approx = 10*0:60, fill = list(n = 0)) %>%
group_by(Time) %>%
mutate(share = n / sum(n)) %>% # Different weighting option
ungroup() %>%
ggplot(aes(Time, weight_approx, fill = share)) +
geom_tile() +
scale_fill_viridis_c(direction = -1)
Although my query shows me values in descending order, ggplot then displays them alphabetically instead of ascending order.
Known solutions to this problem haven't seem to work. They suggest using Reorder or factor for values, which didn't work in this case
This is my code:
boxoffice %>%
group_by(studio) %>%
summarise(movies_made = n()) %>%
arrange(desc(movies_made)) %>%
top_n(10) %>%
arrange(desc(movies_made)) %>%
ggplot(aes(x = studio, y = movies_made, fill = studio, label = as.character(movies_made))) +
geom_bar(stat = 'identity') +
geom_label(label.size = 1, size = 5, color = "white") +
theme(legend.position = "none") +
ylab("Movies Made") +
xlab("Studio")
for those wanting a more complete example, here's where I got:
library(dplyr)
library(ggplot2)
# get some dummy data
boxoffice = boxoffice::boxoffice(dates=as.Date("2017-1-1"))
df <- (
boxoffice %>%
group_by(distributor) %>%
summarise(movies_made = n()) %>%
mutate(studio=reorder(distributor, -movies_made)) %>%
top_n(10))
ggplot(df, aes(x=distributor, y=movies_made)) + geom_col()
You'll need to convert boxoffice$studio to an ordered factor. ggplot will then respect the order of rows in the data set, rather than alphabetizing. Your dplyr chain will look like this:
boxoffice %>%
group_by(studio) %>%
summarise(movies_made = n()) %>%
arrange(desc(movies_made)) %>%
ungroup() %>% # ungroup
mutate(studio = factor(studio, studio, ordered = T)) %>% # convert variable
top_n(10) %>%
arrange(desc(movies_made)) %>%
ggplot(aes(x = studio, y... (rest of plotting code)