plot multiple datasets and compare categories with barplots - r

I have three datasets with the same variables, and I want to compare one variable across 29 different categories between the three datasets. The example below should work as a reproducible example. I have already tried to plot it, but the output was not as expected. I would like to have the three bars next to each other, with a small plot (facet) for every category.
# Three snapshots of the same tracker list. All three have identical columns:
# tracker name, number of trackers, tracker category and source URL.
# NOTE(review): the name of the first column was lost in the original post;
# `name` is assumed here -- confirm against the real data.
List_before <- data.frame(
  name = c("Tracker1", "Tracker2", "Tracker3", "Tracker4", "Tracker5", "Tracker6"),
  number_trackers = c(1, 2, 3, 4, 5, 6),
  category = c("Ads", "Analytics", "Ads", "Analytics", "Ads", "Ads"),
  c4 = c("url1.com", "ur2.com", "url3.com", "url4.com", "url5.com", "url6.com")
)
List_short_after <- data.frame(
  name = c("Tracker1", "Tracker2", "Tracker3", "Tracker4", "Tracker5", "Tracker6"),
  number_trackers = c(1, 2, 3, 4, 5, 6),
  category = c("Ads", "Analytics", "Ads", "Analytics", "Ads", "Ads"),
  c4 = c("url1.com", "ur2.com", "url3.com", "url4.com", "url5.com", "url6.com")
)
List_after <- data.frame(
  name = c("Tracker1", "Tracker2", "Tracker3", "Tracker4", "Tracker5", "Tracker6"),
  number_trackers = c(1, 2, 3, 4, 5, 6),
  category = c("Ads", "Analytics", "Ads", "Analytics", "Ads", "Ads"),
  c4 = c("url1.com", "ur2.com", "url3.com", "url4.com", "url5.com", "url6.com")
)
# Draw the three datasets as separate column layers on shared axes, one colour
# per dataset, with one facet panel per category.
ggplot(mapping = aes(x = category, y = number_trackers)) +
  geom_col(data = List_before, colour = "#ca93ef", fill = "#ca93ef") +
  geom_col(data = List_short_after, colour = "#5034c4", fill = "#5034c4") +
  geom_col(data = List_after, colour = "#795fc6", fill = "#795fc6") +
  facet_wrap(vars(category)) +
  theme_minimal() +
  theme(
    text = element_text(color = "#795fc6", size = 12, face = "bold"),
    axis.text = element_text(color = "#795fc6", size = 14, face = "bold")
  ) +
  labs(x = "Categories", y = "Number Trackers")
[![This is how the plot should look, just with 3 bars instead of 2][1]][1]
[1]: https://i.stack.imgur.com/nDq36.png

Here's code that may help you reach your goal. Note that I took some liberties with your input data because it seems to be incomplete in your question.
library(ggplot2)
# Fix the RNG seed so the sample() calls below give the same values on every
# run and the example is reproducible.
set.seed(42)

# Build one tracker snapshot. The three snapshots differ only in their
# `list_id` label and in the (shuffled) `number_trackers` values.
#
# list_id: character label identifying the snapshot.
# Returns a data.frame with columns list_id, name, number_trackers, category
# and c4 (six rows).
make_tracker_list <- function(list_id) {
  data.frame(
    list_id = list_id,
    name = c("Tracker1", "Tracker2", "Tracker3", "Tracker4", "Tracker5", "Tracker6"),
    number_trackers = sample(c(1, 2, 3, 4, 5, 6)),
    category = c("Ads", "Analytics", "Other 1", "Other 2", "Other 3", "Other 4"),
    c4 = c("url1.com", "ur2.com", "url3.com", "url4.com", "url5.com", "url6.com")
  )
}

List_before <- make_tracker_list("list_before")
List_short_after <- make_tracker_list("list_short_after")
List_after <- make_tracker_list("list_after")

# Stack the snapshots into one long data frame; `list_id` tells them apart.
df <- rbind(List_before, List_short_after, List_after)

# Set the levels explicitly: as.factor() would sort them alphabetically
# (after, before, short_after) and the bars would not be in time order.
df$list_id <- factor(
  df$list_id,
  levels = c("list_before", "list_short_after", "list_after")
)
df$category <- as.factor(df$category)
# One panel per category; inside each panel, one bar per list, filled by list.
# The x axis only separates the lists, so its labels and ticks are hidden and
# the fill legend identifies the bars instead.
ggplot(df, aes(x = list_id, y = number_trackers)) +
  geom_col(aes(fill = list_id), position = position_dodge()) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  ) +
  facet_grid(cols = vars(category)) +
  labs(x = NULL, y = "Number of Trackers")

Related

Get row columns by group for geom_col in ggplot

I am trying to calculate row percentages by demographics of various score levels--in my data, that would be what % of white people (or % of black people, or % male, or % who have education level 2, and so on) have a score of 0 (or 1, 2, or 3)--and then use that to create a big plot.
So in my example data below, 8.33% of race == 1 (which is white) have a score of 0, 25% have a score of 1, 25% have a score of 2, and 41.67% have a score of 3.
Then the ultimate end goal would be to get some type of bar plot where the 4 levels of 'score' are across the x axis, and the various comparisons of demographics run down the y axis. Something that looks visually sort of like this, but with the levels of 'score' across the top instead of education levels: .
I already have some code to make the actual figure, which I've done in other instances but with externally/already-calculated percentages:
# Horizontal bars of pre-computed percentages: one column of panels per value
# of `score_var`, one row of panels per `group`. The y scale and panel height
# are free so each group only shows its own categories.
ggplot(data, aes(x = percent, y = category, fill = group)) +
  geom_col(orientation = "y", width = 0.9) +
  facet_grid(
    rows = vars(group),
    cols = vars(score_var),
    scales = "free_y",
    space = "free_y"
  ) +
  labs(title = "Demographic breakdown of 'Score'") +
  theme_bw()
I am just struggling to figure out the best way to calculate these row percentages, presumably using group_by() and summarize and then storing or configuring them in a way that they can be plotted. Thank you.
# Example data: 36 respondents with numeric codes for race, gender, education
# and a score from 0 to 3.
# The original dput() output also carried readr's `spec` and `problems`
# attributes; `problems` was an external pointer (`<pointer: 0x...>`), which is
# not valid R syntax, so both parser-only attributes are dropped here.
d <- structure(
  list(
    race = c(
      1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3,
      1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3
    ),
    gender = c(
      0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
      1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1
    ),
    education = c(
      1, 3, 3, 2, 1, 3, 2, 3, 4, 4, 2, 3, 3, 2, 3, 4, 1, 3,
      1, 3, 3, 2, 1, 3, 2, 3, 4, 4, 2, 3, 3, 2, 3, 4, 1, 3
    ),
    score = c(
      1, 2, 2, 1, 2, 3, 3, 2, 0, 0, 1, 2, 1, 3, 0, 0, 3, 3,
      3, 3, 3, 3, 3, 3, 2, 1, 2, 3, 1, 3, 3, 0, 1, 2, 2, 0
    )
  ),
  row.names = c(NA, -36L),
  class = c("tbl_df", "tbl", "data.frame")
)
This may get you started:
library(dplyr)
library(ggplot2)
# Row percentages of `score` within each race: for every race, the share of
# its respondents (in percent, one decimal) that has each score.
# Reads the example data `d`; a bare `data` would refer to the utils::data()
# function, not to a data frame.
prop <- d %>%
  mutate(
    race = factor(race, levels = c(1, 2, 3), labels = c("White", "Black", "Others"))
  ) %>%
  group_by(race) %>%
  mutate(race_n = n()) %>%          # denominator: respondents per race
  group_by(race, score) %>%
  summarise(
    percent = round(100 * n() / race_n[1], 1),
    .groups = "drop"                # return an ungrouped result
  )
# Horizontal bars of the percentages, one panel per race, with each bar
# labelled by its percentage.
ggplot(prop, aes(x = percent, y = score, fill = race)) +
  geom_col(orientation = "y", width = 0.9) +
  geom_text(aes(label = percent), hjust = 1) +
  facet_grid(cols = vars(race)) +
  labs(title = "Demographic breakdown of 'Score'") +
  theme_bw()
Edit
To put all characters together, you can get a bigger graph:
# Reshape the example data `d` to long form: every column except `score` is
# stacked, with the variable name in `character` and its value in `levels`.
# pivot_longer() is namespace-qualified because tidyr is not attached here.
df <- d %>%
  mutate(
    gender = factor(2 - gender),   # recode 0/1 to 2/1
    race = factor(race),
    education = factor(education)
  ) %>%
  tidyr::pivot_longer(!score, names_to = "character", values_to = "levels")

# Percentage of each score within every (character, levels) group, drawn as
# a grid: one row of panels per characteristic, one column per level.
df %>%
  group_by(character, levels) %>%
  mutate(group_n = n()) %>%        # denominator: rows per characteristic level
  group_by(character, levels, score) %>%
  summarise(
    percent = round(100 * n() / group_n[1], 1),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = percent, y = score, fill = character)) +
  geom_col(orientation = "y", width = 0.9) +
  geom_text(aes(label = percent), hjust = 1) +
  facet_grid(character ~ levels) +
  labs(title = "Demographic breakdown of 'Score'") +
  theme_bw()
please note: I have changed the code for gender.
Taking inspiration from @Zhiqiang Wang's excellent first pass at this, I finally figured out a solution. I still need to change the order of the labels (to put the education levels in order, and move the race variables to the top of the figure) but this is basically what I was envisioning.
# Build the plotting table in one pipeline:
#   1. recode the demographics as factors and stack them into long form
#      (`group` = variable name, `levels` = its value);
#   2. compute, per (group, levels), the percentage of rows with each score;
#   3. attach a readable label `var` for every (group, levels) pair.
# pivot_longer() is namespace-qualified because tidyr is not attached here,
# and the summary is returned ungrouped via `.groups = "drop"`.
d_test <- d %>%
  mutate(
    gender = factor(2 - gender),   # recode 0/1 to 2/1
    race = factor(race),
    education = factor(education)
  ) %>%
  tidyr::pivot_longer(!score, names_to = "group", values_to = "levels") %>%
  group_by(group, levels) %>%
  mutate(group_n = n()) %>%        # denominator: rows per group level
  group_by(group, levels, score) %>%
  summarise(
    percent = round(100 * n() / group_n[1], 1),
    .groups = "drop"
  ) %>%
  mutate(
    var = case_when(
      group == "gender" & levels == 1 ~ "female",
      group == "gender" & levels == 2 ~ "male",
      group == "race" & levels == 1 ~ "white",
      group == "race" & levels == 2 ~ "black",
      group == "race" & levels == 3 ~ "hispanic",
      group == "education" & levels == 1 ~ "dropout HS",
      group == "education" & levels == 2 ~ "grad HS",
      group == "education" & levels == 3 ~ "some coll",
      group == "education" & levels == 4 ~ "grad coll"
    )
  )
# Final figure: score levels across the columns, demographic groups down the
# rows. The legend and the row strip labels are suppressed.
ggplot(d_test, aes(x = percent, y = var, fill = group)) +
  geom_col(orientation = "y", width = 0.9) +
  facet_grid(
    rows = vars(group),
    cols = vars(score),
    scales = "free_y",
    space = "free_y"
  ) +
  labs(
    title = "Demographic breakdown of 'Score'",
    x = "Percent",
    y = ""
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    strip.text.y = element_blank()
  )

Difficulty getting a legend to add to my graph

I have tried many ways to add a legend to the bottom two graphs but for some reason it either gives an error or doesn't show the legend.
Here is my code:
# Logistic function: exp(a + b * x) / (1 + exp(a + b * x)).
#
# x: numeric vector of evaluation points.
# a: intercept (default 1).
# b: slope (default 1).
# Returns a numeric vector of values in [0, 1], the same length as `x`.
#
# plogis() evaluates the same expression but stays finite for large
# arguments; the literal formula yields Inf / Inf = NaN once exp() overflows.
lg <- function(x, a = 1, b = 1) {
  plogis(a + b * x)
}
# Attempt 1: empty canvas, then one logistic curve per slope b (a fixed at 1).
# NOTE(review): legend(...) is passed as an argument to plot(), and the plot(
# call itself is never closed, so this statement does not parse as written.
plot(NA, xlim=c(-5,5), ylim=c(0,1), xlab = "x", ylab = "y", legend(2, 0.4, legend=c("b = 1", "b = 2", "b = 3", "b = 4", "b = 5"))
for (b in c(1:5)){
curve(expr = lg(x, 1, b), from = -5, to = 5, n = 100, add= TRUE, col = b)
}
# Attempt 2: same layout, varying the intercept a (b fixed at 1).
# NOTE(review): same problem as above -- legend(...) sits inside an unclosed
# plot( call.
plot(NA, xlim=c(-5,5), ylim=c(0,1), xlab = "x", ylab = "y", legend(2, 0.4, legend=c("a = 1", "a = 2", "a = 3", "a = 4", "a = 5"))
for (a in c(1:5)){
curve(expr = lg(x, a, 1), from = -5, to = 5, n = 100, add= TRUE, col = a)
}
Is there something wrong with my placement of the legend within the code?
Call legend() as a separate statement on its own line after plot(), not as an argument to plot() (so there is no comma before it).
Also close the parenthesis of the plot() call:
# Empty canvas, then one logistic curve per slope b = 1..5 (intercept a = 1).
plot(NA, xlim = c(-5, 5), ylim = c(0, 1), xlab = "x", ylab = "y")
# legend() is its own call. `col` and `lty` give every entry a line sample in
# the colour of its curve; without them only the text labels are drawn.
legend(2, 0.4, legend = paste("b =", 1:5), col = 1:5, lty = 1)
for (b in 1:5) {
  curve(expr = lg(x, 1, b), from = -5, to = 5, n = 100, add = TRUE, col = b)
}
same for plot2
# Empty canvas, then one logistic curve per intercept a = 1..5 (slope b = 1).
plot(NA, xlim = c(-5, 5), ylim = c(0, 1), xlab = "x", ylab = "y")
# legend() is its own call. `col` and `lty` give every entry a line sample in
# the colour of its curve; without them only the text labels are drawn.
legend(2, 0.4, legend = paste("a =", 1:5), col = 1:5, lty = 1)
for (a in 1:5) {
  curve(expr = lg(x, a, 1), from = -5, to = 5, n = 100, add = TRUE, col = a)
}

R ggraph/ggplot2 color legend not displayed properly

I'm pretty new to ggraph and struggling a bit to get the legend to properly display the colors of the nodes.
I have the following example data set:
# Node table: one row per vertex, with its id, class label, a 0/1 flag and an
# asset value.
nodes <- data.frame(
  ID = c(2, 3, 4, 5, 6, 7),
  cl = c("A", "B", "A", "A", "C", "B"),
  ty = c(1, 1, 0, 0, 0, 1),
  assets = c(2e7, 1e4, 5e5, 1e4, 150, 50)
)

# Edge table: one row per link between two node ids, with a weight `we` and a
# 0/1 flag `pa`.
edges <- data.frame(
  from = c(2, 5, 4, 6, 7, 4, 3),
  to = c(3, 4, 3, 5, 5, 3, 2),
  we = c(1, 1, 3, 2, 1, 1, 3),
  pa = c(0, 0, 1, 0, 1, 0, 0)
)
Based on these data I tried to plot the graph:
library( 'tidygraph' )
library( 'igraph' )
library( 'ggraph' )
# Build a directed graph from the edge list, attach the node attributes, and
# convert it to a tidygraph object.
graph <- as_tbl_graph(
  graph_from_data_frame(edges, vertices = nodes, directed = TRUE)
)

ggraph(graph, layout = "fr") +
  # Edges: width from `we`, grey shade from `pa`, closed arrow head at the end
  geom_edge_link0(
    aes(width = we, color = factor(pa)),
    arrow = arrow(
      angle = 10,
      length = unit(0.15, "inches"),
      ends = "last",
      type = "closed"
    )
  ) +
  scale_edge_width(range = c(0.2, 2.2)) +
  scale_edge_color_grey(start = 0.4, end = 0.8) +
  # Nodes: shape from `ty`, fill from `cl`, size from log(assets)
  geom_node_point(aes(shape = factor(ty), fill = cl, size = log(assets))) +
  # Titles, legend styling and the scales behind each legend
  labs(edge_width = "Power", edge_color = "Ownertype") +
  ggtitle("Title") +
  theme(
    legend.key = element_rect(fill = "white", colour = "black"),
    legend.title = element_text(face = "bold")
  ) +
  scale_size_continuous(name = "Assets", range = c(3, 6), breaks = c(5, 10, 15)) +
  scale_shape_manual(name = "Same branch", values = c(21, 23), labels = c("no", "yes")) +
  scale_fill_brewer(name = "Sector", palette = "Dark2")
I have two issues with the legend under heading 'Sector':
The color keys are not displayed; they are all black. This happens every time I let both the color and the shape of the nodes vary.
The color keys are too small, so that it is really hard to distinguish the colors (once they are there).
Unfortunately all my attempts to solve these two problems have been unsuccessful.
By default, the legend guide for points doesn't use a shape that supports a fill color. You need to set such a shape for the guide:
# Append this to the plot: draw the keys of the fill legend with point shape
# 21 at size 5 instead of the default legend shape.
+ guides(fill = guide_legend(override.aes = list(size = 5, shape = 21)))

Highchart stacked columns

I have the following graph:
# Stacked column chart with two stacks ("A" and "B") per x category.
# Each hc_add_series() call creates its own series, so every type is added
# once per stack.
highchart() %>%
  hc_title(text = "Composition") %>%
  hc_subtitle(text = "Subtitle") %>%
  hc_chart(type = "column", polar = FALSE) %>%
  hc_xAxis(categories = c("A", "B", "C", "D", "E")) %>%
  # stack "A"
  hc_add_series(name = "Type A", data = c(5, 3, 4, 7, 2), stack = "A") %>%
  hc_add_series(name = "Type B", data = c(5, 3, 4, 7, 2), stack = "A") %>%
  hc_add_series(name = "Type C", data = c(5, 3, 4, 7, 2), stack = "A") %>%
  hc_add_series(name = "Type D", data = c(5, 3, 4, 7, 2), stack = "A") %>%
  hc_add_series(name = "Type E", data = c(5, 3, 4, 7, 2), stack = "A") %>%
  # stack "B"
  hc_add_series(name = "Type A", data = c(2, 2, 3, 2, 1), stack = "B") %>%
  hc_add_series(name = "Type B", data = c(2, 2, 3, 2, 1), stack = "B") %>%
  hc_add_series(name = "Type C", data = c(2, 2, 3, 2, 1), stack = "B") %>%
  hc_add_series(name = "Type D", data = c(2, 2, 3, 2, 1), stack = "B") %>%
  hc_add_series(name = "Type E", data = c(2, 2, 3, 2, 1), stack = "B") %>%
  hc_plotOptions(
    column = list(
      dataLabels = list(enabled = FALSE),
      stacking = "normal"
    )
  )
The graph is creating different variables and also duplicating the legend for each stack = "A" and stack = "B". Then, we have Type A in the legend two times, Type B in the legend two times, etc. How can I stack different groups and have only one variable for each group?
I add a picture:
EDIT: I add some data
# Example data in long form: 2 types x 3 sub-types x 5 periods = 30 rows.
# Rows are ordered by TYPE, then SUB_TYPE, then PERIOD.
df <- data.frame(
  TYPE = rep(c("TYPE_A", "TYPE_B"), each = 15),
  SUB_TYPE = rep(rep(c("Sub_A", "Sub_B", "Sub_C"), each = 5), times = 2),
  PERIOD = rep(c("curr", "t0", "t1", "t2", "t3"), times = 6),
  VALUE = c(
    # TYPE_A: Sub_A, Sub_B, Sub_C
    296.6954, 352.8885, 360.5875, 375.2185, 389.8869,
    1409.0355, 1470.8772, 1537.3365, 1590.0784, 1650.0942,
    115.7838, 117.8871, 122.5989, 133.0147, 140.5065,
    # TYPE_B: Sub_A, Sub_B, Sub_C
    296.6954, 352.8885, 360.5875, 437.319, 382.3504,
    1409.0355, 1470.8772, 1440.1987, 1289.093, 1244.0955,
    115.7838, 117.8871, 169.3969, 113.2784, 98.1415
  ),
  stringsAsFactors = FALSE
)
TYPE gives the categories on the X axis, SUB_TYPE gives the stacked values within a column, and PERIOD gives the columns within each category (TYPE).
When you add series "Type A" the second time, it is considered as a different series even though it has the same name. What we can do here is link the second to the first to attach it to the same legend entry (see API reference) and manually set the same color.
library(highcharter)
# Colours assigned by hand so that both series of one type look the same.
default_colors <- c(
  "#7cb5ec", "#434348", "#90ed7d", "#f7a35c", "#8085e9",
  "#f15c80", "#e4d354", "#2b908f", "#f45b5b", "#91e8e1"
)

# Stack "A" series carry an `id`; the matching stack "B" series point at that
# id through `linkedTo` and reuse the same colour.
highchart() %>%
  hc_title(text = "Composition") %>%
  hc_subtitle(text = "Subtitle") %>%
  hc_chart(type = "column", polar = FALSE) %>%
  hc_xAxis(categories = c("A", "B", "C", "D", "E")) %>%
  # stack "A": the series that own the ids
  hc_add_series(
    name = "Type A", data = c(5, 3, 4, 7, 2), stack = "A",
    id = "AA", color = default_colors[[1]]
  ) %>%
  hc_add_series(
    name = "Type B", data = c(5, 3, 4, 7, 2), stack = "A",
    id = "BA", color = default_colors[[2]]
  ) %>%
  hc_add_series(
    name = "Type C", data = c(5, 3, 4, 7, 2), stack = "A",
    id = "CA", color = default_colors[[3]]
  ) %>%
  hc_add_series(
    name = "Type D", data = c(5, 3, 4, 7, 2), stack = "A",
    id = "DA", color = default_colors[[4]]
  ) %>%
  hc_add_series(
    name = "Type E", data = c(5, 3, 4, 7, 2), stack = "A",
    id = "EA", color = default_colors[[5]]
  ) %>%
  # stack "B": linked to the stack "A" series of the same type
  hc_add_series(
    name = "Type A", data = c(2, 2, 3, 2, 1), stack = "B",
    linkedTo = "AA", color = default_colors[[1]]
  ) %>%
  hc_add_series(
    name = "Type B", data = c(2, 2, 3, 2, 1), stack = "B",
    linkedTo = "BA", color = default_colors[[2]]
  ) %>%
  hc_add_series(
    name = "Type C", data = c(2, 2, 3, 2, 1), stack = "B",
    linkedTo = "CA", color = default_colors[[3]]
  ) %>%
  hc_add_series(
    name = "Type D", data = c(2, 2, 3, 2, 1), stack = "B",
    linkedTo = "DA", color = default_colors[[4]]
  ) %>%
  hc_add_series(
    name = "Type E", data = c(2, 2, 3, 2, 1), stack = "B",
    linkedTo = "EA", color = default_colors[[5]]
  ) %>%
  hc_plotOptions(
    column = list(
      dataLabels = list(enabled = FALSE),
      stacking = "normal"
    )
  )
UPD
If the number of series/categories is not known beforehand, we can still create the plot by defining the list of series with all the options and passing this list to hc_add_series_list. The purrr package comes in handy when we want to transform a data.frame into a complex list.
library(dplyr)
library(purrr)
# Categories shown on the x axis, in order of first appearance in `df`.
x_cats <- unique(df$TYPE)

default_colors <- c(
  "#7cb5ec", "#434348", "#90ed7d", "#f7a35c", "#8085e9",
  "#f15c80", "#e4d354", "#2b908f", "#f45b5b", "#91e8e1"
)

# One colour per SUB_TYPE. rep_len() recycles the palette, so more than ten
# sub-types still get a colour instead of NA.
colors_df <- tibble(SUB_TYPE = unique(df$SUB_TYPE)) %>%
  mutate(color = rep_len(default_colors, n()))

# One series per (SUB_TYPE, PERIOD): PERIOD is the stack, and every series is
# linked to the "curr" series of the same SUB_TYPE.
series <- df %>%
  left_join(colors_df, by = "SUB_TYPE") %>%
  group_by(SUB_TYPE, color, PERIOD) %>%
  group_split() %>%
  map(~ list(
    name = .x$SUB_TYPE[[1]],
    stack = .x$PERIOD[[1]],
    color = .x$color[[1]],
    # NOTE(review): relies on the rows of each group following the order of
    # `x_cats` -- confirm for real data
    data = .x$VALUE,
    id = paste(.x$SUB_TYPE[[1]], .x$PERIOD[[1]], sep = "-"),
    linkedTo = paste(.x$SUB_TYPE[[1]], "curr", sep = "-")
  )) %>%
  # A "curr" series must not link to itself. Drop the entry by name rather
  # than by position so the result does not depend on the field order above.
  modify_if(~ .x$stack == "curr", ~ .x[names(.x) != "linkedTo"])

highchart() %>%
  hc_chart(type = "column") %>%
  hc_plotOptions(
    column = list(
      dataLabels = list(enabled = FALSE),
      stacking = "normal"
    )
  ) %>%
  hc_xAxis(categories = x_cats) %>%
  hc_add_series_list(series)

Count all values in a column based on string in another column in R for a Venn diagram

I have a file that I converted to a dataframe that looks as follows:
# V1: "_"-separated labels of the sets a group belongs to.
# V2: the count observed for exactly that combination.
D <- data.frame(
  V1 = c(
    "B", "A_B", "A_B_C", "C_D", "A_C",
    "C_B_D", "C", "C_A_B_D", "B_C", "C_A_D",
    "A_D", "D", "A", "B_D", "A_B_D"
  ),
  V2 = c(
    15057, 5, 9, 1090, 4,
    1250, 3943, 11, 2517, 5,
    5, 2280, 5, 1735, 4
  )
)
I need to convert this dataframe into a list of numbers that I can use to create a 4-way venn plot. In this example the values are the correct values if added correctly. I did this manually but since I need to create several similar plots I would like to find a way to do this more efficiently.
library("VennDiagram")
# Four-set Venn diagram with every area entered by hand.
venn.plot <- draw.quad.venn(
  # totals per set
  area1 = 48,
  area2 = 20588,
  area3 = 8829,
  area4 = 6380,
  # pairwise overlaps
  n12 = 29,
  n13 = 29,
  n14 = 25,
  n23 = 3787,
  n24 = 3000,
  n34 = 2356,
  # three-way overlaps
  n123 = 20,
  n124 = 15,
  n134 = 16,
  n234 = 1261,
  # overlap of all four sets
  n1234 = 11,
  # labels and appearance
  category = c("A", "B", "C", "D"),
  fill = c("orange", "red", "green", "blue"),
  lty = "dashed",
  cex = 2,
  cat.cex = 2,
  cat.col = c("orange", "red", "green", "blue")
)
In this case I would need to count up all values from D$V2 that has an "A" in column V1 and so on. Then I would need to order appropriately for the venn plot function.
Here's what I would do
# setup
# setup: the set labels, in the order the Venn areas are numbered
myset <- LETTERS[1:4]

# create dummies: one logical column per set, TRUE where V1 mentions that set
# (fixed = TRUE because the labels are literal strings, not regular
# expressions)
D[, myset] <- lapply(myset, grepl, x = D$V1, fixed = TRUE)

# construct counts: for every combination of 1, 2, ..., myn sets (singles
# first, then pairs, then triples, ...), sum V2 over the rows that belong to
# all sets of the combination. The result is one flat numeric vector.
myn <- length(myset)
mynums <- unlist(lapply(seq_len(myn), function(n) {
  # one combination per column; the full set is built explicitly as a
  # one-column matrix
  combos <- if (n == myn) matrix(seq_len(myn)) else combn(myn, n)
  apply(combos, 2, function(x) {
    in_all <- Reduce(`&`, D[myset[x]])   # rows that are in every chosen set
    sum(D$V2[in_all])
  })
}))
# pass counts to plotter
# pass counts to plotter: the unnamed counts fill draw.quad.venn()'s leading
# positional arguments; the named list supplies labels and styling
do.call(
  draw.quad.venn,
  c(
    as.list(unname(mynums)),
    list(
      category = myset,
      fill = c("orange", "red", "green", "blue"),
      lty = "dashed",
      cex = 2,
      cat.cex = 2,
      cat.col = c("orange", "red", "green", "blue")
    )
  )
)

Resources