Get row columns by group for geom_col in ggplot - r

I am trying to calculate row percentages by demographics of various score levels--in my data, that would be what % of white people (or % of black people, or % male, or % who have education level 2, and so on) have a score of 0 (or 1, 2, or 3)--and then use that to create a big plot.
So in my example data below, 8.33% of race == 1 (which is white) have a score of 0, 25% have a score of 1, 25% have a score of 2, and 41.67% have a score of 3.
Then the ultimate end goal would be to get some type of bar plot where the 4 levels of 'score' are across the x axis, and the various comparisons of demographics run down the y axis. Something that looks visually sort of like this, but with the levels of 'score' across the top instead of education levels: .
I already have some code to make the actual figure, which I've done in other instances but with externally/already-calculated percentages:
ggplot(data, aes(x = percent, y = category, fill = group)) +
geom_col(orientation = "y", width = .9) +
facet_grid(group~score_var,
scales = "free_y", space = "free_y") +
labs(title = "Demographic breakdown of 'Score'") +
theme_bw()
I am just struggling to figure out the best way to calculate these row percentages, presumably using group_by() and summarize and then storing or configuring them in a way that they can be plotted. Thank you.
d <- structure(list(race = c(1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1,
1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1, 1, 2, 2,
3, 3), gender = c(0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1,
0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1
), education = c(1, 3, 3, 2, 1, 3, 2, 3, 4, 4, 2, 3, 3, 2, 3,
4, 1, 3, 1, 3, 3, 2, 1, 3, 2, 3, 4, 4, 2, 3, 3, 2, 3, 4, 1, 3
), score = c(1, 2, 2, 1, 2, 3, 3, 2, 0, 0, 1, 2, 1, 3, 0, 0,
3, 3, 3, 3, 3, 3, 3, 3, 2, 1, 2, 3, 1, 3, 3, 0, 1, 2, 2, 0)), row.names = c(NA,
-36L), spec = structure(list(cols = list(race = structure(list(), class = c("collector_double",
"collector")), gender = structure(list(), class = c("collector_double",
"collector")), education = structure(list(), class = c("collector_double",
"collector")), score = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), delim = ","), class = "col_spec"), problems = <pointer: 0x000001bd978b0df0>, class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"))

This may get you started:
library(dplyr)
library(ggplot2)
prop <- data %>%
mutate(race = factor(race, levels = c(1, 2, 3), labels = c("White", "Black", "Others"))) %>%
group_by(race) %>%
mutate(race_n = n()) %>%
group_by(race, score) %>%
summarise(percent = round(100*n()/race_n[1], 1))
prop %>%
ggplot(aes(x = percent, y = score, fill = race)) +
geom_col(orientation = "y", width = .9) +
geom_text(aes(label = percent), hjust = 1)+
facet_grid(~race) +
labs(title = "Demographic breakdown of 'Score'") +
theme_bw()
Edit
To put all characters together, you can get a bigger graph:
df <- data %>% mutate(
gender = factor(2-gender),
race = factor(race),
education = factor(education)) %>%
pivot_longer(!score, names_to = "character", values_to = "levels")
df %>% group_by(character, levels) %>%
mutate(group_n = n()) %>%
group_by(character, levels, score) %>%
summarise(percent = round(100*n()/group_n[1], 1)) %>%
ggplot(aes(x = percent, y = score, fill = character)) +
geom_col(orientation = "y", width = .9) +
geom_text(aes(label = percent), hjust = 1)+
facet_grid(character ~ levels) +
labs(title = "Demographic breakdown of 'Score'") +
theme_bw()
please note: I have changed the code for gender.

Taking inspiration from #Zhiqiang Wang's excellent first pass at this, I finally figured out a solution. I still need to change the order of the labels (to put the education levels in order, and move the race variables to the top of the figure) but this is basically what I was envisioning.
d_test <- d %>% mutate(
gender = factor(2-gender),
race = factor(race),
education = factor(education)) %>%
pivot_longer(!score, names_to = "group", values_to = "levels")
d_test <- d_test %>% group_by(group, levels) %>%
mutate(group_n = n()) %>%
group_by(group, levels, score) %>%
summarise(percent = round(100*n()/group_n[1], 1))
d_test <- d_test %>%
mutate(var = case_when(group == "gender" & levels == 1 ~ "female",
group == "gender" & levels == 2 ~ "male",
group == "race" & levels == 1 ~ "white",
group == "race" & levels == 2 ~ "black",
group == "race" & levels == 3 ~ "hispanic",
group == "education" & levels == 1 ~ "dropout HS",
group == "education" & levels == 2 ~ "grad HS",
group == "education" & levels == 3 ~ "some coll",
group == "education" & levels == 4 ~ "grad coll"))
ggplot(d_test, aes(x = percent, y = var, fill = group)) +
geom_col(orientation = "y", width = .9) +
facet_grid(group ~ score,
scales = "free_y", space = "free_y") +
labs(title = "Demographic breakdown of 'Score'",
y = "",
x = "Percent") +
theme_minimal() +
theme(legend.position = "none",
strip.text.y = element_blank())

Related

Alluvial diagram with varying aesthetic / colors over one flow in R based on ggplot2

I have created the following alluvial diagram in R as follows:
df <- data.frame(Variable = c("X1", "X2", "X3", "X4", "X5", "X6"),
Pearson1 = c(6, 3, 2, 5, 4, 1),
Spearman1 = c(6, 5, 1, 2, 3, 4),
Kendall1 = c(6, 5, 1, 2, 3, 4),
Pearson2 = c(6, 5, 1, 2, 3, 4),
Spearman2 = c(6, 5, 1, 2, 4, 3),
Kendall2 = c(6, 5, 1, 2, 3, 4))
df$freq<-1
alluvial(df[1:7], freq=df$freq, cex = 0.7,col= "red")
which results in
How can I set some specific lines to have different col than red? e.g. X1 from Variables to Pearson1, and then again from Kendall1 to Spearman2 and X3 in all states? I see I can't do that based on alluvial(). How can I recreate the above alluvial based on another function??
ggalluvial allows for varying aesthetics over one "flow" (or alluvium). The documentation provides a trick to use geom_flow with stat = "alluvium" and to specify "lode.guidance = "frontback".
The actual aesthetic (color) will need to be added to the data. geom_flow and geom_stratum will require different columns for the aesthetic, (try what happens when you use the same for both). I am passing the color directly and using scale_identity, but you can of course also use random values and then define your colors with scale_manual.
library(ggalluvial)
#> Loading required package: ggplot2
library(tidyverse)
df <- data.frame(Variable = c("X1", "X2", "X3", "X4", "X5", "X6"),
Pearson1 = c(6, 3, 2, 5, 4, 1),
Spearman1 = c(6, 5, 1, 2, 3, 4),
Kendall1 = c(6, 5, 1, 2, 3, 4),
Pearson2 = c(6, 5, 1, 2, 3, 4),
Spearman2 = c(6, 5, 1, 2, 4, 3),
Kendall2 = c(6, 5, 1, 2, 3, 4))
df_long <-
df %>%
## reshape your data in order to bring it to the right shape
mutate(across(everything(), as.character)) %>%
rownames_to_column("ID") %>%
pivot_longer(-ID) %>%
## correct order of your x
mutate(
name = factor(name, levels = names(df)),
## now hard code where you want to change the color.
## lodes need a different highlighting then your strata
## there are of course many ways to add this information, I am using case_when here
## you could also create separate vectors and add them to your data frame
highlight_lode = case_when(
ID == 3 ~ "blue",
ID == 1 & name %in% c("Variable", "Kendall1", "Pearson2") ~ "orange",
TRUE ~ "red"
),
highlight_stratum = case_when(
ID == 3 ~ "blue",
ID == 1 & name %in% c(
"Variable", "Pearson1", "Kendall1", "Pearson2",
"Spearman2"
) ~ "orange",
TRUE ~ "red"
)
)
ggplot(df_long,
## now use different color aesthetics in geom_flow and geom_stratum
aes(x = name, stratum = value, alluvium = ID, label = value)) +
## I took this trick with lode guidance from the documentation - this allows varying aesthetics over one flow.
geom_flow(aes(fill = highlight_lode), stat = "alluvium", lode.guidance = "frontback", color = "darkgray") +
geom_stratum(aes(fill = highlight_stratum)) +
geom_text(stat = "stratum") +
## as I have named the colors directly, it is appropriate to use scale_identity
scale_fill_identity()
#> Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
#> ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.
Created on 2023-01-29 with reprex v2.0.2

Why does plot_ly pie chart always turn 5th value text from white to black in R?

I'm making a pie chart using the plotly package in R, and whenever I have 5 choices to visualize, it always changes the 5th one to a brown/black color, despite me overriding the color to be white. In case it has something to do with the value being NA, how do I work around that? I have data that is labeled as NA, so I need to keep it named that. For what it's worth, I tried renaming to "NA ", but it still appeared the same.
Why is this happening and how do I fix it?
library(plotly)
library(dplyr)
data <- tibble(employee = c("Justin", "Corey","Sibley", "Justin", "Corey","Sibley", "Lisa", "NA"),
education = c("graudate", "student", "student", "graudate", "student", "student", "nurse", "doctor"),
fte_max_capacity = c(1, 2, 3, 1, 2, 3, 4, 5),
project = c("big", "medium", "small", "medium", "small", "small", "medium", "medium"),
aug_2021 = c(1, 1, 1, 1, 1, 1, 2, 5),
sep_2021 = c(1, 1, 1, 1, 1, 1, 2, 5),
oct_2021 = c(1, 1, 1, 1, 1, 1, 2, 5),
nov_2021 = c(1, 1, 1, 1, 1, 1, 2, 5))
data2 <- data %>%
dplyr::group_by(employee) %>%
mutate(sum = sum(rowSums(select(cur_data_all(), contains("_20"))))) %>%
dplyr::select(employee, sum) %>%
distinct()
my_colors <- c("#CA001B", "#1D28B0", "#D71DA4", "#00A3AD", "#FF8200", "#753BBD", "#00B5E2", "#008578", "#EB6FBD", "#FE5000", "#6CC24A", "#D9D9D6", "#AD0C27", "#950078")
fig <- plot_ly(type='pie', labels=data2$employee, values=data2$sum,
textinfo='label+percent', marker = list(colors = my_colors),
insidetextorientation='horizontal')
fig
t <- list(
family = "Arial",
size = 18,
color = 'white')
fig %>% layout(font=t, showlegend = FALSE)
Set color in insidetextfont -
library(plotly)
fig <- plot_ly(type='pie', labels=data2$employee, values=data2$sum,
textinfo='label+percent', insidetextfont = list(color = '#FFFFFF'),
marker = list(colors = my_colors),
insidetextorientation='horizontal')
fig

Double index/category bar plot in R? [duplicate]

For a sample dataframe:
df <- structure(list(year = c(1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3,
3, 3, 4, 4, 4, 4, 4), imd.quintile = c(1, 2, 3, 4, 5, 1, 2, 3,
4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5), average_antibiotic = c(1.17153515458827,
1.11592565388857, 1.09288449967773, 1.07442652168281, 1.06102887394413,
1.0560582933182, 1.00678980505929, 0.992997489072538, 0.978343676071694,
0.967900478870214, 1.02854157116164, 0.98339099101476, 0.981198852494798,
0.971392872980818, 0.962289579742817, 1.00601488964457, 0.951187417739673,
0.950706064156994, 0.939174499710836, 0.934948233015044)), .Names = c("year",
"imd.quintile", "average_antibiotic"), row.names = c(NA, -20L
), vars = "year", drop = TRUE, class = c("grouped_df", "tbl_df",
"tbl", "data.frame"))
I want to produce a grouped bar chart, very similar to this post.
I want year on the x axes, and average_antibiotic on the y axes. I want the five bar charts (for each imd.quintile - which is the legend).
I have tried a couple of options (based on the post and elsewhere), but can't make it work.
ggplot(df, aes(x = imd.quintile, y = average_antibiotic)) +
geom_col() +
facet_wrap(~ year)
ggplot(df, aes(x = imd.quintile, y = average_antibiotic)) +
geom_bar(aes(fill = imd.quintile), position = "dodge", stat="identity")
Any ideas?
I believe you are looking for something like this:
library(ggplot2)
ggplot(df ) +
geom_col(aes(x = year, y = average_antibiotic, group=imd.quintile, fill=imd.quintile), position = "dodge" )

Plot the sum of all but one variable

I have a dataframe with data on the number of TVs and radios owned by survey respondents in three different countries (Canada, Mexico, US) at two different points in time (now and before):
DF <- data.frame(TV_now = as.numeric(c(4, 9, 1, 0, 4, NA)),
TV_before = as.numeric(c(4, 1, 2, 4, 5, 2)),
Radio_now = as.numeric(c(4, 5, 1, 5, 6, 9)),
Radio_before = as.numeric(c(6, 5, 3, 6, 7, 10)),
Country = as.factor(c("Mexico", "Canada", "US", "US", "Canada", "US")))
I want to sum the total value of each variable and then create a barplot that shows the number of TVs and radios owned by survey respondents now and before per country.
Now, if my dataframe didn't contain the Country factor, I could generate the plot in this way:
library(tidyverse)
library(ggplot2)
DF %>% mutate_all(funs(sum), na.rm = TRUE) %>%
gather(key=Device, value=Number) %>%
ggplot(aes(x=Device,fill=Device)) +
geom_bar(aes(x = Device, y = Number), position = "dodge", stat = "identity")
However, the variation
DF %>% mutate_all(funs(sum), na.rm = TRUE) %>%
gather(key=Device, value=Number, -Country) %>%
ggplot(aes(x=Device,fill=Device)) +
geom_bar(aes(x = Device, y = Number), position = "dodge", stat = "identity") +
facet_wrap(~Country)
results in the error:
Error in mutate_impl(.data, dots) :
Evaluation error: ‘sum’ not meaningful for factors.
Is there a way to exclude the factor from sum, or another way to generate the intended plot?
You can use the summarise function to sum up the different columns. Below I have summed up the numeric columns using dplyr's summarise_if() function.
DF <- data.frame(TV_now = as.numeric(c(4, 9, 1, 0, 4, NA)),
TV_before = as.numeric(c(4, 1, 2, 4, 5, 2)),
Radio_now = as.numeric(c(4, 5, 1, 5, 6, 9)),
Radio_before = as.numeric(c(6, 5, 3, 6, 7, 10)),
Country = as.factor(c("Mexico", "Canada", "US", "US", "Canada", "US")))
DF %>%
group_by(Country) %>%
summarise_if(is.numeric,sum,na.rm=TRUE) %>%
gather(key=Device, value=Number, -Country) %>%
ggplot(aes(x=Device,fill=Device)) +
geom_bar(aes(x = Device, y = Number),position = "dodge", stat = "identity") +
facet_wrap(~Country)
The result is:

Creating arrow head matching size (or lwd) in ggplot2

To draw arrows in ggplot, I use geom_segment and arrow=arrow().
I would like the arrow head size to match the segment width (or size).
However, arrow does not recognize variables directly from the data argument in ggplot and one must specify data.frame containing the variable using the $ operator. This causes a disjunct between the values used for plotting the line and those used for plotting the arrow head (the largest arrow head can be on the thinest segment).
Example:
d <- structure(list(Process = structure(c(2L, 1L, 1L, 1L, 2L, 2L,
1L, 1L, 2L, 1L, 2L), .Label = c("First", "Second"), class = "factor"),
x.sink = c(1, 3, 1, 2, 2, 3, 3, 2, 2, 2, 2), y.sink = c(1,
1, 1, 2, 2, 1, 1, 1, 1, 2, 2), x.source = c(2, 2, 2, 2, 2,
2, 2, 1, 1, 1, 3), y.source = c(2, 2, 2, 1, 1, 1, 1, 1, 1,
2, 1), offset = c(1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1),
Std.Flux = c(0.179487179487179, 0.170940170940171, 0.944444444444444,
0.0854700854700855, 0.726495726495726, 0.128205128205128,
0.213675213675214, 0.213675213675214, 0.128205128205128,
0.106837606837607, 1)), .Names = c("Process", "x.sink", "y.sink",
"x.source", "y.source", "offset", "Std.Flux"), class = "data.frame", row.names = c(NA,
-11L))
p <- qplot(data=d,
#alpha=I(0.4),
colour=Process,
size=Std.Flux,
xlim=c(0,4),
ylim=c(0,3),
x=x.source+as.numeric(Process)/10,
y=y.source+as.numeric(Process)/10,
xend=x.sink+as.numeric(Process)/10,
yend=y.sink+as.numeric(Process)/10,
geom="segment",
arrow = arrow(type="closed",
length = unit(d$Std.Flux,"cm")))
print(p)
Any suggestions?
Here's one way:
require(ggplot2)
df <- mtcars
arrow_pos <- data.frame(y = 250)
ggplot(df, aes(x=factor(cyl), y=mpg)) +
geom_bar(width = .4, stat="identity", fill="darkblue") +
geom_segment(data=arrow_pos,
aes(x=1.526, xend=1.01, y=y + 90.02, yend=y + 0.25),
arrow=arrow(length=unit(4.2, "mm")), lwd=2,
color="black") +
geom_segment(data=arrow_pos,
aes(x=1.525, xend=1.01, y=y + 90, yend=y + 0.25),
arrow=arrow(length=unit(4, "mm")), lwd=1,
color="gold2") +
annotate("text", x=2.39, y=360,
label='This arrow points to the highest MPG.') +
scale_y_continuous(limits = c(0,400)) +
xlab('CYL') + ylab('MPG')
Output:
Must have been fixed in the last 8 years :)
Here translated into a call to ggplot()
library(ggplot2)
ggplot(d, aes(colour=Process, size=Std.Flux)) +
geom_segment(aes(x=x.source+as.numeric(Process)/10,
y=y.source+as.numeric(Process)/10,
xend=x.sink+as.numeric(Process)/10,
yend=y.sink+as.numeric(Process)/10),
arrow = arrow(type="closed",
length = unit(d$Std.Flux,"cm")))

Resources