ggplot boxplot not plotting when supplying breaks - r

I have 7 values that I am trying to plot using geom_boxpot and I wish to only show certain values in the legend so am using the breaks argument in the scale_linetype_manual function but it doesn't plot those boxplots at all,
Boxplot using scale_linetype_manual
However, if I use the same code with scale_linetype_discrete it works fine and plots and only gives me the required values in the legend. But, I can't control the linetype in the function using the values argument. Is there a way to add values to the scale_linetype_discrete function?
Boxplot using scale_linetype_discrete
EDIT - UPDATED with fake data + code
> head(debug)
# A tibble: 6 × 4
subj time cond y
<dbl> <chr> <chr> <dbl>
1 1 one one_A 2
2 1 two two_A 1
3 1 two two_B 5
4 1 two two_C 0
5 1 three three_A 4
6 1 four four_A 4
> dput(debug)
structure(list(subj = c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4), time = c("one",
"two", "two", "two", "three", "four", "four", "one", "two", "two",
"two", "three", "four", "four", "one", "two", "two", "two", "three",
"four", "four", "one", "two", "two", "two", "three", "four",
"four"), cond = c("one_A", "two_A", "two_B", "two_C", "three_A",
"four_A", "four_B", "one_A", "two_A", "two_B", "two_C", "three_A",
"four_A", "four_B", "one_A", "two_A", "two_B", "two_C", "three_A",
"four_A", "four_B", "one_A", "two_A", "two_B", "two_C", "three_A",
"four_A", "four_B"), y = c(2, 1, 5, 0, 4, 4, 2, 2, 4, 3, 0, 1,
5, 3, 1, 5, 4, 2, 0, 4, 4, 0, 0, 4, 2, 1, 5, 5)), row.names = c(NA,
-28L), spec = structure(list(cols = list(subj = structure(list(), class = c("collector_double",
"collector")), time = structure(list(), class = c("collector_character",
"collector")), cond = structure(list(), class = c("collector_character",
"collector")), y = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), delim = ","), class = "col_spec"), problems = <pointer: 0x600004737380>, class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"))
# Scale_linetype_discrete (would like to add linetype values to this)
ggplot() +
geom_boxplot(debug,
mapping = aes(x = time,
y = y,
linetype = cond),
show.legend = TRUE) +
scale_x_discrete(limits = c("one","two","three","four"),
labels = c("one","two","three","four")) +
scale_linetype_discrete(name = "My legend",
breaks = c("two_A","two_B","two_C","four_A","four_B"),
labels = c("two_A","two_B","two_C","four_A","four_B"))
# Scale_linetype_manual (would like 'one' and 'three' to be actually plotted but not showing in my legend)
ggplot() +
geom_boxplot(debug,
mapping = aes(x = time,
y = y,
linetype = cond),
show.legend = TRUE) +
scale_x_discrete(limits = c("one","two","three","four"),
labels = c("one","two","three","four")) +
scale_linetype_manual(name = "My legend",
breaks = c("two_A","two_B","two_C","four_A","four_B"),
labels = c("two_A","two_B","two_C","four_A","four_B"),
values = c("solid","dashed","solid","dashed","solid", "dashed","solid"))

Not sure about the reasons why your code does not work. But one option to fix your issue would be to make use of a named vector to assign linetypes to categories of you cond variable:
library(ggplot2)
lty <- c("solid", "dashed", "solid", "dashed", "solid", "dashed", "solid")
names(lty) <- c("four_A", "four_B", "one_A", "three_A", "two_A", "two_B", "two_C")
ggplot() +
geom_boxplot(debug,
mapping = aes(
x = time,
y = y,
linetype = cond
)
) +
scale_x_discrete(
limits = c("one", "two", "three", "four"),
labels = c("one", "two", "three", "four")
) +
scale_linetype_manual(
name = "My legend",
breaks = c("two_A", "two_B", "two_C", "four_A", "four_B"),
values = lty
)

Related

Get row columns by group for geom_col in ggplot

I am trying to calculate row percentages by demographics of various score levels--in my data, that would be what % of white people (or % of black people, or % male, or % who have education level 2, and so on) have a score of 0 (or 1, 2, or 3)--and then use that to create a big plot.
So in my example data below, 8.33% of race == 1 (which is white) have a score of 0, 25% have a score of 1, 25% have a score of 2, and 41.67% have a score of 3.
Then the ultimate end goal would be to get some type of bar plot where the 4 levels of 'score' are across the x axis, and the various comparisons of demographics run down the y axis. Something that looks visually sort of like this, but with the levels of 'score' across the top instead of education levels: .
I already have some code to make the actual figure, which I've done in other instances but with externally/already-calculated percentages:
ggplot(data, aes(x = percent, y = category, fill = group)) +
geom_col(orientation = "y", width = .9) +
facet_grid(group~score_var,
scales = "free_y", space = "free_y") +
labs(title = "Demographic breakdown of 'Score'") +
theme_bw()
I am just struggling to figure out the best way to calculate these row percentages, presumably using group_by() and summarize and then storing or configuring them in a way that they can be plotted. Thank you.
d <- structure(list(race = c(1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1,
1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1, 1, 2, 2,
3, 3), gender = c(0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1,
0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1
), education = c(1, 3, 3, 2, 1, 3, 2, 3, 4, 4, 2, 3, 3, 2, 3,
4, 1, 3, 1, 3, 3, 2, 1, 3, 2, 3, 4, 4, 2, 3, 3, 2, 3, 4, 1, 3
), score = c(1, 2, 2, 1, 2, 3, 3, 2, 0, 0, 1, 2, 1, 3, 0, 0,
3, 3, 3, 3, 3, 3, 3, 3, 2, 1, 2, 3, 1, 3, 3, 0, 1, 2, 2, 0)), row.names = c(NA,
-36L), spec = structure(list(cols = list(race = structure(list(), class = c("collector_double",
"collector")), gender = structure(list(), class = c("collector_double",
"collector")), education = structure(list(), class = c("collector_double",
"collector")), score = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), delim = ","), class = "col_spec"), problems = <pointer: 0x000001bd978b0df0>, class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"))
This may get you started:
library(dplyr)
library(ggplot2)
prop <- data %>%
mutate(race = factor(race, levels = c(1, 2, 3), labels = c("White", "Black", "Others"))) %>%
group_by(race) %>%
mutate(race_n = n()) %>%
group_by(race, score) %>%
summarise(percent = round(100*n()/race_n[1], 1))
prop %>%
ggplot(aes(x = percent, y = score, fill = race)) +
geom_col(orientation = "y", width = .9) +
geom_text(aes(label = percent), hjust = 1)+
facet_grid(~race) +
labs(title = "Demographic breakdown of 'Score'") +
theme_bw()
Edit
To put all characters together, you can get a bigger graph:
df <- data %>% mutate(
gender = factor(2-gender),
race = factor(race),
education = factor(education)) %>%
pivot_longer(!score, names_to = "character", values_to = "levels")
df %>% group_by(character, levels) %>%
mutate(group_n = n()) %>%
group_by(character, levels, score) %>%
summarise(percent = round(100*n()/group_n[1], 1)) %>%
ggplot(aes(x = percent, y = score, fill = character)) +
geom_col(orientation = "y", width = .9) +
geom_text(aes(label = percent), hjust = 1)+
facet_grid(character ~ levels) +
labs(title = "Demographic breakdown of 'Score'") +
theme_bw()
please note: I have changed the code for gender.
Taking inspiration from #Zhiqiang Wang's excellent first pass at this, I finally figured out a solution. I still need to change the order of the labels (to put the education levels in order, and move the race variables to the top of the figure) but this is basically what I was envisioning.
d_test <- d %>% mutate(
gender = factor(2-gender),
race = factor(race),
education = factor(education)) %>%
pivot_longer(!score, names_to = "group", values_to = "levels")
d_test <- d_test %>% group_by(group, levels) %>%
mutate(group_n = n()) %>%
group_by(group, levels, score) %>%
summarise(percent = round(100*n()/group_n[1], 1))
d_test <- d_test %>%
mutate(var = case_when(group == "gender" & levels == 1 ~ "female",
group == "gender" & levels == 2 ~ "male",
group == "race" & levels == 1 ~ "white",
group == "race" & levels == 2 ~ "black",
group == "race" & levels == 3 ~ "hispanic",
group == "education" & levels == 1 ~ "dropout HS",
group == "education" & levels == 2 ~ "grad HS",
group == "education" & levels == 3 ~ "some coll",
group == "education" & levels == 4 ~ "grad coll"))
ggplot(d_test, aes(x = percent, y = var, fill = group)) +
geom_col(orientation = "y", width = .9) +
facet_grid(group ~ score,
scales = "free_y", space = "free_y") +
labs(title = "Demographic breakdown of 'Score'",
y = "",
x = "Percent") +
theme_minimal() +
theme(legend.position = "none",
strip.text.y = element_blank())

How to set the stat_function in for loop to plot two graphs with normal distribution, central and variance parameters

I would like to create the following plots in parallel
I have used the following code using the wide format dataset:
sumstatz_1 <- data.frame(whichstat = c("mean",
"sd upr",
"sd lwr",
"median"),
value = c(mean(data$score),
mean(data$score)+sd(data$score),
mean(data$score)-sd(data$score),
median(data$score)))
plot2 = ggplot(data, aes(x = score)) +
geom_histogram(aes(y =..density..),
breaks = seq(0, max(data$score), by = 5),
colour = "black",
fill = "white") + stat_function(fun = dnorm,
args = list(mean = mean(data$score, na.rm = TRUE),
sd = sd(data$score, na.rm = TRUE)),
colour = 'black', size = 1) +
labs(title='score', x='score', y= 'Distribution') +
geom_vline(data=sumstatz_1,aes(xintercept = value,
linetype = whichstat,
col = whichstat),size=1)
I have taken it by changing just the variable of interest to create the second graph. Anyway, I would like to create the same result by using an interactive graph. Here I have set up the following code that I have converted into a long format for convenience and then I have coded the following for loop:
for (i in 101:ncol(long)) {
p <- ggplot(long, aes(x = points)) +
geom_histogram(aes(y =..density..),
breaks = seq(0, 50, by = 3),
colour = "black",
fill = "white") + facet_grid(.~ score)
} for (j in seq_along(long$score)){
p +
stat_function(fun = dnorm[???],
args = list(mean = mean(long$points[long$score == 'j'], na.rm = TRUE),
sd = mean(long$points[long$score == 'j'], na.rm = TRUE)),
colour = 'black', size = 1)
}
print(p)
But I have no clue how to set parameters in stat_function() nor wether it is possible to use in a for loop or another iterative method. Would you have possibly any suggestion?
Here the dataset
structure(list(ID = c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7,
7, 8, 8, 9, 9, 10, 10), score = structure(list(MM_score = c("score_2",
"score_1", "score_2", "score_1", "score_2", "score_1", "score_2",
"score_1", "score_2", "score_1", "score_2", "score_1", "score_2",
"score_1", "score_2", "score_1", "score_2", "score_1", "score_2",
"score_1")), row.names = c(NA, -20L), class = c("tbl_df", "tbl",
"data.frame")), points = c(53, 13.25, 17.5, 1.59090909090909,
48.5, 6.92857142857143, 40, 3.63636363636364, 46, 7.07692307692308,
38, 4.47058823529412, 14.5, 1.61111111111111, 19.5, 3.54545454545455,
37.5, 3.40909090909091, 5.5, 0.916666666666667)), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -20L), groups = structure(list(
ID = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), .rows = structure(list(
1:2, 3:4, 5:6, 7:8, 9:10, 11:12, 13:14, 15:16, 17:18,
19:20), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -10L), .drop = TRUE))

R: non-numeric arguments to binary operators

I am working with the R programming language. I am trying to make a "parallel coordinates plot" using some fake data:
library(MASS)
a = rnorm(100, 10, 10)
b = rnorm(100, 10, 5)
c = rnorm(100, 5, 10)
d = matrix(a, b, c)
parcoord(d[, c(3, 1, 2)], col = 1 + (0:149) %/% 50)
However, a problem arises when I try to mix numeric and factor variables together:
group <- sample( LETTERS[1:4], 100, replace=TRUE, prob=c(0.25, 0.25, 0.25, 0.25) )
d = matrix(a,b, group)
parcoord(d[, c(3, 1, 2)], col = 1 + (0:149) %/% 50)
Error in x - min(x, na.rm = TRUE): non-numeric argument to binary operator
I am just curious. Can this problem be resolved? Or is it simply impossible to make such a plot using numeric and factor variables together?
I saw a previous stackoverflow post over here where a similar plot is made using numeric and factor variables: How to plot parallel coordinates with multiple categorical variables in R
However, I am using a computer with no USB port or internet access - I have a pre-installed version of R with limited libraries (I have plotly, ggplot2, dplyr, MASS ... I don't have ggally or tidyverse) and was looking for a way to do this only with the parcoord() function.
Does anyone have any ideas if this can be done?
Thanks
Thanks
One option is to label rows of the matrix using a factor and use that on the plot, e.g.
library(MASS)
set.seed(300)
par(xpd=TRUE)
par(mar=c(4, 4, 4, 6))
a = rnorm(12, 10, 10)
b = rnorm(12, 10, 5)
c = rnorm(12, 5, 10)
group <- sample(c("#FF9289", "#FF8AFF", "#00DB98", "#00CBFF"),
12, replace=TRUE)
d = cbind(a, b, c)
rownames(d) <- group
parcoord(d[, c(3, 1, 2)], col = group)
title(main = "Plot", xlab = "Variable", ylab = "Values")
axis(side = 2, at = seq(0, 1, 0.1),
tick = TRUE, las = 1)
legend(3.05, 1, legend = c("A", "B", "C", "D"), lty = 1,
col = c("#FF9289", "#FF8AFF", "#00DB98", "#00CBFF"))
EDIT
Thanks for the additional explanation. What you want does make sense, but unfortunately it doesn't look like it will work as I expected. I tried to make a plot using an ordered factor as the middle variable (per https://pasteboard.co/JKK4AUD.jpg) but got the same error ("non-numeric argument to binary operator").
One way I thought of doing it is to recode the factor as a number (e.g. "Var_1" -> 0.2, "Var_2" -> 0.4) as below:
library(MASS)
set.seed(123)
par(xpd=TRUE)
par(mar=c(4, 4, 4, 6))
a = rnorm(12, 10, 10)
b = c(rep("Var_1", 3),
rep("Var_2", 3),
rep("Var_3", 3),
rep("Var_4", 3))
c = rnorm(12, 5, 10)
group <- c(rep("#FF9289", 3),
rep("#FF8AFF", 3),
rep("#00DB98", 3),
rep("#00CBFF", 3))
d = data.frame("A" = a,
"Factor" = b,
"C" = c,
"Group" = group)
d$Factor <- sapply(d$Factor, switch,
"Var_1" = 0.8,
"Var_2" = 0.6,
"Var_3" = 0.4,
"Var_4" = 0.2)
parcoord(d[, c(1, 2, 3)], col = group)
title(main = "Plot", xlab = "Variable", ylab = "Values")
axis(side = 2, at = seq(0, 1, 0.1),
tick = TRUE, las = 1)
legend(3.05, 1, legend = c("A", "B", "C", "D"), lty = 1,
col = c("#FF9289", "#FF8AFF", "#00DB98", "#00CBFF"))
mtext(text = "Var 1", side = 1, adj = 0.6, padj = -30)
mtext(text = "Var 3", side = 1, adj = 0.6, padj = -12)
mtext(text = "Var 2", side = 1, adj = 0.6, padj = -21)
mtext(text = "Var 4", side = 1, adj = 0.6, padj = -3)

R ggraph/ggplot2 color legend not displayed properly

I'm pretty new to ggraph and strugling a bit to get the legend to properly display the colors of the nodes.
I have the following example data set:
nodes <- data.frame( ID = c( 2, 3, 4, 5, 6, 7 ),
cl = c( "A", "B", "A", "A", "C", "B" ),
ty = c( 1, 1, 0, 0, 0, 1 ),
assets = c( 20000000, 10000, 500000, 10000, 150, 50 )
)
edges <- data.frame( from = c( 2, 5, 4, 6, 7, 4, 3 ),
to = c( 3, 4, 3, 5, 5, 3, 2 ),
we = c( 1, 1, 3, 2, 1, 1, 3 ),
pa = c( 0, 0, 1, 0, 1, 0, 0 ))
Based on these data I tried to plot the graph:
library( 'tidygraph' )
library( 'igraph' )
library( 'ggraph' )
graph <- graph_from_data_frame( edges, vertices = nodes, directed = TRUE ) %>% as_tbl_graph()
ggraph( graph, layout = 'fr' ) +
# Create edge layer
geom_edge_link0( aes( width = we, color = factor( pa )),
arrow = arrow( angle = 10, length = unit( 0.15, "inches" ),
ends = "last", type = "closed" )) +
scale_edge_width( range = c( 0.2, 2.2 )) +
scale_edge_color_grey( start = 0.4, end = 0.8 ) +
# Create node layer
geom_node_point( aes( shape = factor( ty ), fill = cl, size = log( assets ))) +
# Title and legend
labs( edge_width = "Power", edge_color = "Ownertype" ) +
ggtitle( "Title" ) +
theme( legend.key = element_rect( fill = "white", colour = "black" ),
legend.title = element_text(face = "bold" )) +
scale_size_continuous( name = "Assets", range = c( 3, 6 ), breaks = c( 5, 10, 15 )) +
scale_shape_manual( name = "Same branch", values = c( 21, 23 ), labels = c( "no", "yes" )) +
scale_fill_brewer( name = "Sector", palette = "Dark2" )
I have two issues with the legend under heading 'Sector':
The color keys are not displayed, they are all black. This happens everytime I let both the color and the shape of the nodes vary.
The color keys are too small, so that it is really hard to distinguish the colors (once they are there).
Unfortunately all my attempts to solve these two problems have been unsuccessful.
By default, the legend guide for points doesn't use a shape that supports a fill color. You need to set such a shape for the guide:
+ guides(fill = guide_legend(override.aes = list(size = 5, shape = 21)))

Merge contents within list of list by duplicate name

Given a list of lists as such, is there an elegant way to convert original to treated? I'm using simple values like 1,2,3, but values could be data frames or whatever. The goal is not to de-dupe the contents of each unique named, simply to de-dupe names by merging contents.
original = structure(list(name1 = structure(list(one = 1, two = 2, three = 3), .Names = c("one",
"two", "three")), name2 = structure(list(a = 9), .Names = "a"),
name1 = structure(list(four = 4, five = 5, six = 6), .Names = c("four",
"five", "six")), name2 = structure(list(b = 8), .Names = "b")), .Names = c("name1",
"name2", "name1", "name2"))
treated = structure(list(name1 = structure(list(one = 1, two = 2, three = 3,
four = 4, five = 5, six = 6), .Names = c("one", "two", "three",
"four", "five", "six")), name2 = structure(list(a = 9, b = 8), .Names = c("a",
"b"))), .Names = c("name1", "name2"))
Here is a solution using plyr.
require(plyr)
lnames = names(original)
ulnames = unique(lnames)
treated = plyr::llply(ulnames, function(x) original[lnames == x])
treated = llply(treated, unlist, recursive = F)
names(treated) = ulnames

Resources