Different legend positions on plot with multiple legends - r

When making a geom_point() that has both color = column_1 and size = column_2 options passed through, ggplot provides two separate legends. One for the color column and one for the size. This is great.
I would like to split the two legends so the bit which maps onto color is shown across the top horizontally and the bit that maps onto size is shown on the right-hand side of the plotregion vertically.
The data and code below reproduce the graph shown below. In that graph I would like the size legend to be shown vertically on the right-hand side of the graph, and the legend that maps onto the actor's name to be shown along the top as it is.
Is this kind of thing possible? I've found ways to put both of them on the left-hand side but that's not really what I want as you read the actor's name left to right in the plot, and you read size top to bottom, so I want the legends to display in the same way the reader would naturally read the data.
df <- structure(list(count = c(1025, 360, 625, 1108, 3018, 7376, 16318,
19114, 16947, 21532, 2088, 923, 1109, 1751, 3710, 7160, 13904,
20096, 17049, 24597, 2094, 607, 817, 1340, 2909, 6667, 13870,
18657, 17502, 34533, 1132, 447, 606, 940, 2038, 4564, 12141,
19197, 18426, 31272, 1144, 387, 646, 1081, 2164, 5451, 12343,
16194, 16783, 24880, 1450, 549, 759, 1278, 2568, 5623, 11406,
15957, 16445, 22850, 1707, 788, 1023, 1594, 3292, 6852, 14749,
18550, 13815, 19754, 1977, 819, 1051, 1522, 2873, 5469, 10692,
14740, 12352, 16335, 1256, 554, 633, 946, 1780, 3301, 6260, 10608,
11575, 20720, 1365, 547, 565, 1066, 2177, 4650, 9590, 11570,
8160, 11119, 13175, 3088, 2869, 3375, 5123, 7292, 9714, 9088,
5927, 10775, 8387, 1954, 1817, 1996, 2776, 3972, 5746, 5968,
3965, 5969), doctor = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L), .Label = c("Christopher Eccleston", "David Tennant", "Matt Smith",
"Peter Capaldi", "Jodie Whitaker"), class = "factor"), rating = c(1,
2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1,
2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1,
2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1,
2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1,
2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1,
2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), season_num = c(27L,
27L, 27L, 27L, 27L, 27L, 27L, 27L, 27L, 27L, 28L, 28L, 28L, 28L,
28L, 28L, 28L, 28L, 28L, 28L, 29L, 29L, 29L, 29L, 29L, 29L, 29L,
29L, 29L, 29L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L,
31L, 31L, 31L, 31L, 31L, 31L, 31L, 31L, 31L, 31L, 32L, 32L, 32L,
32L, 32L, 32L, 32L, 32L, 32L, 32L, 33L, 33L, 33L, 33L, 33L, 33L,
33L, 33L, 33L, 33L, 34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L,
34L, 35L, 35L, 35L, 35L, 35L, 35L, 35L, 35L, 35L, 35L, 36L, 36L,
36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L, 37L, 37L, 37L, 37L, 37L,
37L, 37L, 37L, 37L, 37L, 38L, 38L, 38L, 38L, 38L, 38L, 38L, 38L,
38L, 38L)), row.names = c(NA, -120L), groups = structure(list(
season_num = 27:38, .rows = structure(list(1:10, 11:20, 21:30,
31:40, 41:50, 51:60, 61:70, 71:80, 81:90, 91:100, 101:110,
111:120), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -12L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
# Bubble chart of rating counts per season: point size encodes `count` and
# point colour encodes `doctor`. Both legends are placed above the panel and
# their titles are hidden.
df %>%
  ggplot(mapping = aes(x = factor(season_num), y = rating)) +
  geom_point(mapping = aes(size = count, color = doctor)) +
  labs(
    title = "IMDb ratings distributions by Season",
    x = "Season",
    y = "Rating (1-10)"
  ) +
  theme(
    legend.position = "top",
    legend.title = element_blank(),
    plot.title = element_text(size = 10),
    axis.title.x = element_text(size = 10),
    axis.title.y = element_text(size = 10)
  ) +
  scale_size_continuous(range = c(1, 8)) +
  scale_y_continuous(breaks = seq(1, 10, by = 1), limits = c(1, 10)) +
  scale_x_discrete(breaks = seq(27, 38, by = 1)) +
  scale_color_brewer(palette = "Dark2")

I do not think this is possible with ggplot2-only functions. However, a common trick is:
to make a plot without the legend,
make other plots with target legends,
extract the legends from these plots,
arrange everything in a grid using packages like cowplot or gridExtra
You can find some examples of this process on SO:
ggplot - Multiple legends arrangement
How to place legends at different sides of plot (bottom and right side) with ggplot2?
How do I position two legends independently in ggplot
Here is an example with the provided data, I have not put much effort in arranging the grid because it can change a lot depending on the package you choose in the end. It is just to showcase the process.
library(cowplot)
library(ggplot2)

# Main panel: the full bubble chart with every legend suppressed.
main_plot <- ggplot(df, aes(x = factor(season_num), y = rating)) +
  geom_point(aes(size = count, color = doctor)) +
  labs(
    title = "IMDb ratings distributions by Season",
    x = "Season",
    y = "Rating (1-10)"
  ) +
  theme(
    legend.position = "none",
    legend.title = element_blank(),
    plot.title = element_text(size = 10),
    axis.title.x = element_text(size = 10),
    axis.title.y = element_text(size = 10)
  ) +
  scale_size_continuous(range = c(1, 8)) +
  scale_y_continuous(breaks = seq(1, 10, by = 1), limits = c(1, 10)) +
  scale_x_discrete(breaks = seq(27, 38, by = 1)) +
  scale_color_brewer(palette = "Dark2")

# Throw-away plot that maps only colour; its legend is laid out along the top
# and then pulled out as a stand-alone grob.
color_plot <- ggplot(df) +
  geom_point(aes(x = factor(season_num), y = rating, color = doctor)) +
  theme(legend.position = "top", legend.title = element_blank()) +
  scale_color_brewer(palette = "Dark2")
color_legend <- cowplot::get_legend(color_plot)

# Throw-away plot that maps only size; its legend is laid out on the right
# and extracted the same way.
size_plot <- ggplot(df) +
  geom_point(aes(x = factor(season_num), y = rating, size = count)) +
  theme(legend.position = "right", legend.title = element_blank()) +
  scale_size_continuous(range = c(1, 8))
size_legend <- cowplot::get_legend(size_plot)

# Assemble the four cells in the order given: colour legend, an empty cell,
# the main panel, and the size legend. The relative heights and widths give
# the main panel most of the space.
cowplot::plot_grid(
  plotlist = list(color_legend, NULL, main_plot, size_legend),
  rel_heights = c(1, 5),
  rel_widths = c(4, 1)
)
Output:

Related

R: Trying to remove NAs from a boxplot

I am trying to do a basic boxplot, and I can't get NA values away from it. I have tried many tricks for this issue. For example this one.
# Box plot of `sum_variable` by `gender`, keeping only rows whose
# `sum_variable` is not NA. The data has to be supplied through `data =`:
# ggplot() has no `df` argument, so with `df = ...` the aes() call is matched
# positionally to `data` and the call fails.
# Note that rows with a missing `gender` are still kept by this subset.
ggplot(
  data = subset(df, !is.na(sum_variable)),
  mapping = aes(x = gender, y = sum_variable, fill = gender)
) +
  stat_boxplot(geom = "errorbar", width = 0.5) +
  geom_boxplot(fill = "light blue") +
  # `fun` is the current name of the summary-function argument (`fun.y` is
  # deprecated).
  stat_summary(fun = mean, geom = "point", shape = 10, size = 3.5, color = "black") +
  ggtitle("Title") +
  theme_bw() +
  theme(legend.position = "none")
And this ggplot(na.omit(data), aes(x=luse, y=rich)) +
And none of these solve the issue. What would you recommend?
Data↓
structure(list(gender = structure(c(2L, 2L, NA, 2L, 1L, 1L,
1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L,
2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 2L, 1L, 1L), .Label = c("1", "2"), class = "factor"),
sum_variable = c(9, 6, 13, 3, 4, 3, 12, 2, 7, 8, 7, 4, 5,
10, 2, 5, 4, NA, 14, 9, 2, 5, 7, 3, NA, 3, 5, 7, 3, 8, 3,
3, 4, 8, 10, 9, 5, 7, 8, 4, 9, NA, 10, 14, 10, 3, 4, 10,
3, NA, 5, 3, 4, 4, NA, 5, 4, 6, 6, 9, 6, 2, 3, NA, 4, NA,
2, 2, 6, 5, 5, 3, 5, NA, 4, 4)), class = c("rowwise_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -76L), groups = structure(list(
.rows = structure(list(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L,
21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L,
32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L,
43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 52L, 53L,
54L, 55L, 56L, 57L, 58L, 59L, 60L, 61L, 62L, 63L, 64L,
65L, 66L, 67L, 68L, 69L, 70L, 71L, 72L, 73L, 74L, 75L,
76L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -76L), class = c("tbl_df",
"tbl", "data.frame")))
You can filter your NA values before creating the plot:
# Drop rows with a missing `gender` before plotting so no NA category appears
# on the x axis.
df %>%
  filter(!is.na(gender)) %>%
  ggplot(aes(x = gender, y = sum_variable, fill = gender)) +
  stat_boxplot(geom = "errorbar", width = 0.5) +
  geom_boxplot(fill = "light blue") +
  # `fun` is the current name of the summary-function argument (`fun.y` is
  # deprecated).
  stat_summary(fun = mean, geom = "point", shape = 10, size = 3.5, color = "black") +
  ggtitle("Title") +
  theme_bw() +
  theme(legend.position = "none")
Try removing NAs first before passing in the dataset.
# Remove every row that contains an NA in any column, then plot what is left.
sub_dta <- na.omit(dta)
ggplot(
  data = sub_dta,
  mapping = aes(x = gender, y = sum_variable, fill = gender)
) +
  stat_boxplot(geom = "errorbar", width = 0.5) +
  geom_boxplot(fill = "light blue") +
  stat_summary(fun = mean, geom = "point", shape = 10, size = 3.5, color = "black") +
  ggtitle("Title") +
  theme_bw() +
  theme(legend.position = "none")
One solution is to use complete.cases
# Keep only the rows with no missing value in any column, then draw the
# box plot.
ggplot(
  df[complete.cases(df), ],
  aes(x = gender, y = sum_variable, fill = gender)
) +
  stat_boxplot(geom = "errorbar", width = 0.5) +
  geom_boxplot(fill = "light blue") +
  # `fun` is the current name of the summary-function argument (`fun.y` is
  # deprecated).
  stat_summary(fun = mean, geom = "point", shape = 10, size = 3.5, color = "black") +
  ggtitle("Title") +
  theme_bw() +
  theme(legend.position = "none")
Data
df <- structure(list(gender = structure(c(2L, 2L, NA, 2L, 1L, 1L, 1L,
2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 1L, 2L, 1L, 1L), levels = c("1", "2"), class = "factor"),
sum_variable = c(9, 6, 13, 3, 4, 3, 12, 2, 7, 8, 7, 4, 5,
10, 2, 5, 4, NA, 14, 9, 2, 5, 7, 3, NA, 3, 5, 7, 3, 8, 3,
3, 4, 8, 10, 9, 5, 7, 8, 4, 9, NA, 10, 14, 10, 3, 4, 10,
3, NA, 5, 3, 4, 4, NA, 5, 4, 6, 6, 9, 6, 2, 3, NA, 4, NA,
2, 2, 6, 5, 5, 3, 5, NA, 4, 4)), class = c("rowwise_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -76L), groups = structure(list(
.rows = structure(list(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L,
21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L,
32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L,
43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 52L, 53L,
54L, 55L, 56L, 57L, 58L, 59L, 60L, 61L, 62L, 63L, 64L,
65L, 66L, 67L, 68L, 69L, 70L, 71L, 72L, 73L, 74L, 75L,
76L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -76L), class = c("tbl_df",
"tbl", "data.frame")))

how to assign groupings based on attributes?

Imagine, I have a list of 51 personas, each of them has a standardized value inherent to their 6 skills.
Now, I am wondering if there is a programmable way to accurately and equally assign those individuals into equal teams, with the skill levels in mind. I wasn't sure which format of the data is more suitable, but intuitively I decided long dataset will make it easier:
df <- structure(list(unique_id = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L,
5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L,
7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 10L,
10L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L, 11L, 11L, 12L, 12L,
12L, 12L, 12L, 12L, 13L, 13L, 13L, 13L, 13L, 13L, 14L, 14L, 14L,
14L, 14L, 14L, 15L, 15L, 15L, 15L, 15L, 15L, 16L, 16L, 16L, 16L,
16L, 16L, 17L, 17L, 17L, 17L, 17L, 17L, 18L, 18L, 18L, 18L, 18L,
18L, 19L, 19L, 19L, 19L, 19L, 19L, 20L, 20L, 20L, 20L, 20L, 20L,
21L, 21L, 21L, 21L, 21L, 21L, 22L, 22L, 22L, 22L, 22L, 22L, 23L,
23L, 23L, 23L, 23L, 23L, 24L, 24L, 24L, 24L, 24L, 24L, 25L, 25L,
25L, 25L, 25L, 25L, 26L, 26L, 26L, 26L, 26L, 26L, 27L, 27L, 27L,
27L, 27L, 27L, 28L, 28L, 28L, 28L, 28L, 28L, 29L, 29L, 29L, 29L,
29L, 29L, 30L, 30L, 30L, 30L, 30L, 30L, 31L, 31L, 31L, 31L, 31L,
31L, 32L, 32L, 32L, 32L, 32L, 32L, 33L, 33L, 33L, 33L, 33L, 33L,
34L, 34L, 34L, 34L, 34L, 34L, 35L, 35L, 35L, 35L, 35L, 35L, 36L,
36L, 36L, 36L, 36L, 36L, 37L, 37L, 37L, 37L, 37L, 37L, 38L, 38L,
38L, 38L, 38L, 38L, 39L, 39L, 39L, 39L, 39L, 39L, 40L, 40L, 40L,
40L, 40L, 40L, 41L, 41L, 41L, 41L, 41L, 41L, 42L, 42L, 42L, 42L,
42L, 42L, 43L, 43L, 43L, 43L, 43L, 43L, 44L, 44L, 44L, 44L, 44L,
44L, 45L, 45L, 45L, 45L, 45L, 45L, 46L, 46L, 46L, 46L, 46L, 46L,
47L, 47L, 47L, 47L, 47L, 47L, 48L, 48L, 48L, 48L, 48L, 48L, 49L,
49L, 49L, 49L, 49L, 49L, 50L, 50L, 50L, 50L, 50L, 50L, 51L, 51L,
51L, 51L, 51L, 51L), attribute = structure(c(2L, 1L, 3L, 4L,
5L, 6L, 2L, 1L, 3L, 4L, 5L, 6L, 2L, 1L, 3L, 4L, 5L, 6L, 2L, 1L,
3L, 4L, 5L, 6L, 2L, 1L, 3L, 4L, 5L, 6L, 2L, 1L, 3L, 4L, 5L, 6L,
2L, 1L, 3L, 4L, 5L, 6L, 2L, 1L, 3L, 4L, 5L, 6L, 2L, 1L, 3L, 4L,
5L, 6L, 2L, 1L, 3L, 4L, 5L, 6L, 2L, 1L, 3L, 4L, 5L, 6L, 2L, 1L,
3L, 4L, 5L, 6L, 2L, 1L, 3L, 4L, 5L, 6L, 2L, 1L, 3L, 4L, 5L, 6L,
2L, 1L, 3L, 4L, 5L, 6L, 2L, 1L, 3L, 4L, 5L, 6L, 2L, 1L, 3L, 4L,
5L, 6L, 2L, 1L, 3L, 4L, 5L, 6L, 2L, 1L, 3L, 4L, 5L, 6L, 2L, 1L,
3L, 4L, 5L, 6L, 2L, 1L, 3L, 4L, 5L, 6L, 2L, 1L, 3L, 4L, 5L, 6L,
2L, 1L, 3L, 4L, 5L, 6L, 2L, 1L, 3L, 4L, 5L, 6L, 2L, 1L, 3L, 4L,
5L, 6L, 2L, 1L, 3L, 4L, 5L, 6L, 2L, 1L, 3L, 4L, 5L, 6L, 2L, 1L,
3L, 4L, 5L, 6L, 2L, 1L, 3L, 4L, 5L, 6L, 2L, 1L, 3L, 4L, 5L, 6L,
2L, 1L, 3L, 4L, 5L, 6L, 2L, 1L, 3L, 4L, 5L, 6L, 2L, 1L, 3L, 4L,
5L, 6L, 2L, 1L, 3L, 4L, 5L, 6L, 2L, 1L, 3L, 4L, 5L, 6L, 2L, 1L,
3L, 4L, 5L, 6L, 2L, 1L, 3L, 4L, 5L, 6L, 2L, 1L, 3L, 4L, 5L, 6L,
2L, 1L, 3L, 4L, 5L, 6L, 2L, 1L, 3L, 4L, 5L, 6L, 2L, 1L, 3L, 4L,
5L, 6L, 2L, 1L, 3L, 4L, 5L, 6L, 2L, 1L, 3L, 4L, 5L, 6L, 2L, 1L,
3L, 4L, 5L, 6L, 2L, 1L, 3L, 4L, 5L, 6L, 2L, 1L, 3L, 4L, 5L, 6L,
2L, 1L, 3L, 4L, 5L, 6L, 2L, 1L, 3L, 4L, 5L, 6L, 2L, 1L, 3L, 4L,
5L, 6L, 2L, 1L, 3L, 4L, 5L, 6L, 2L, 1L, 3L, 4L, 5L, 6L), .Label = c("Analytics",
"Communication", "Creativity", "Problem solving", "Programming",
"Project management"), class = "factor"), skill_level = c(1,
1, 2, 1, 0, 0, 1, 2, 1, 1, 1, 1, 4, 2, 2, 3, 2, 4, 2, 1, 1, 2,
2, 2, 2, 0, 0, 3, 0, 0, 2, 3, 3, 2, 2, 1, 2, 1, 1, 2, 2, 2, 2,
3, 3, 3, 3, 1, 3, 3, 3, 3, 1, 3, 1, 1, 1, 2, 2, 2, 4, 0, 0, 2,
0, 0, 3, 2, 3, 3, 2, 1, 1, 3, 4, 4, 4, 3, 3, 2, 3, 3, 3, 1, 2,
2, 1, 3, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 0, 2, 2, 2, 2, 3,
1, 2, 1, 1, 1, 0, 0, 0, 3, 2, 2, 3, 4, 3, 2, 2, 2, 2, 0, 2, 2,
2, 1, 2, 0, 0, 3, 3, 4, 3, 2, 3, 2, 1, 0, 3, 0, 2, 2, 1, 1, 2,
1, 1, 2, 1, 1, 2, 0, 1, 2, 3, 3, 3, 2, 2, 2, 2, 1, 2, 1, 1, 2,
1, 1, 2, 1, 1, 0, 1, 2, 2, 0, 2, 2, 1, 2, 2, 2, 2, 1, 1, 1, 2,
1, 2, 1, 1, 1, 1, 1, 0, 1, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 1, 3,
2, 2, 3, 0, 1, 3, 2, 3, 2, 3, 2, 1, 1, 1, 2, 0, 2, 2, 2, 2, 2,
2, 1, 2, 2, 2, 2, 2, 0, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2,
2, 2, 3, 2, 1, 2, 2, 1, 2, 1, 2, 2, 1, 2, 2, 2, 0, 2, 1, 2, 2,
2, 1, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 1, 4, 3, 3, 3, 2, 3, 2,
2, 2, 3, 1, 2, 2, 3, 2, 3, 1, 3)), class = c("spec_tbl_df", "tbl_df",
"tbl", "data.frame"), row.names = c(NA, -306L))
My idea was to somehow focus on running averages in each skill group, but I have no clue where to start.
Perhaps, I am over complicating the problem, and it may be achieved through a specific set of grouping and sorting operations. Frankly, I am not even sure how to search for some existing assignment problems like that, which is slowing me down.
Thank you.
What you describe sounds like you want to do cluster analysis. Here is one using k-means clustering with 4 groups (finding the right number of clusters is a longer story; I'm just guessing here):
library(tidyr)
library(dplyr)
library(broom)
# kmeans needs wide format: one row per person, one column per skill
mat <- df %>%
  pivot_wider(id_cols = unique_id, names_from = attribute, values_from = skill_level)
# kmeans() chooses its starting centres at random, so fix the seed to get the
# same group labels on every run (the printed output below came from an
# unseeded run, so its labels may differ)
set.seed(42)
# for the clustering we remove the id as it would be seen as a variable
clust <- mat %>%
  select(-unique_id) %>%
  kmeans(centers = 4)
# we can attach group membership back to the data
df_new <- mat %>%
  mutate(group = clust$cluster)
df_new %>%
  select(unique_id, group)
#> # A tibble: 51 x 2
#> unique_id group
#> <int> <int>
#> 1 1 3
#> 2 2 3
#> 3 3 4
#> 4 4 2
#> 5 5 2
#> 6 6 1
#> 7 7 2
#> 8 8 1
#> 9 9 4
#> 10 10 2
#> # ... with 41 more rows
# and also obtain group averages
# Ask tidy() for the skill names directly instead of renaming from the
# placeholder columns x1..x6: those placeholders are only the default in older
# broom releases, so a rename() from them fails on current versions. Passing
# `col.names` explicitly gives the same column names either way.
group_average <- clust %>%
  tidy(col.names = colnames(clust$centers))
group_average
#> # A tibble: 4 x 9
#> Communication Analytics Creativity `Problem solvin~ Programming
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 2.11 1.94 2.22 2.33 1.94
#> 2 2 1.22 0.944 2.22 0.667
#> 3 0.833 1.33 1.5 1 0.5
#> 4 2.78 2.67 2.89 3 2.33
#> # ... with 4 more variables: `Project management` <dbl>, size <int>,
#> # withinss <dbl>, cluster <fct>
Now the groups are pretty homogeneous, meaning people in each group have relatively similar skill values. If your intention is to get groups that are equally strong, you could randomly select people from the different clusters so that each group has the same number of people from cluster 1,2,3 and 4.

Cannot use self-starting models when manually defining maxiter for nls()?

Data:
structure(list(ID = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L,
24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L,
37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L,
50L, 51L, 52L, 53L, 54L, 55L, 56L, 57L, 58L, 59L, 59L, 60L, 61L,
62L, 63L, 64L, 65L, 66L, 67L, 68L, 69L, 70L), Stage = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 3L, 3L, 5L, 5L, 5L, 1L, 1L, 6L, 6L,
4L, 4L, 2L, 2L, 7L, 7L), .Label = c("milpa", "robir", "jurup che",
"pak che kor", "mehen che", "nu kux che", "tam che"), class = "factor"),
Time.Since.Burn = c(4, 2, 0.21, 2, 0.42, 4, 0.33, 0.33, 3,
6, 2.5, 5, 4, 5, 1.5, 6, 4, 6, 3, 6.5, 6.5, 6, 4, 2.5, 12,
10, 8, 18, 5, 10, 8, 16, 28, 22, 22, 21, 20, 18, 30, 27,
30, 36, 36, 40, 32, 28, 50, 32, 60, 60, 60, 60, 60, 60, 60,
60, 6, 6, 24, 26, 22, 2, 1, 50, 45, 10, 10, 4, 4, 60, 60),
meandec = c(0.3625, 0.3025, 0.275, 0.1075, 0.26, 0.395, 0.265,
0.4075, 0.9, 0.9275, 0.7075, 0.9625, 0.7725, 0.9325, 0.9875,
0.81, 0.575, 0.3075, 0.4675, 0.6975, 0.33, 0.8725, 0.46,
0.19, 0.495, 0.3825, 0.58, 0.2275, 0.45, 0.3925, 0.605, 0.515,
0.425, 0.34, 0.2475, 0.1375, 0.4225, 0.505, 0.36, 0.4325,
0.26, 0.1575, 0.125, 0.3125, 0.1725, 0.3175, 0.43, 0.3475,
0.2025, 0.395, 0.12, 0.1625, 0.3175, 0.1975, 0.1525, 0.2775,
0.4975, 0.725, 0.04, 0.326666666666667, 0.1425, 0.445, 0.4725,
0.3775, 0.27, 0.2225, 0.23, 0.3275, 0.9725, 0.215, 0.2325
)), row.names = c(NA, -71L), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"), vars = c("ID", "Stage"), drop = TRUE)
Problem:
I'm trying to run an exponential decay model on these data. I've done it with similar data, but when I try to do it on this particular dataset, it says that the number of max iterations has been exceeded without convergence.
nonlinmod6<-nls(meandec~SSasymp(Time.Since.Burn, Asym,R0,lrc),data=averaged_perherb)
Error in nls(y ~ cbind(1 - exp(-exp(lrc) * x), exp(-exp(lrc) * x)), data = xy, : number of iterations exceeded maximum of 50
So, I tried to manually increase the maximum number of iterations using the code below:
nonlinmod6<-nls(meandec~SSasymp(Time.Since.Burn, Asym,R0,lrc),data=averaged_perherb,nls.control(maxiter=500))
but it then gives me an error saying that :
Error in nls(meandec ~ SSasymp(Time.Since.Burn, Asym, R0, lrc), data =
averaged_perherb,: parameters without starting value in 'data': Asym, R0, lrc
which I don't think should be the case given that I'm using a self-starting function to identify the starting parameters. Is there any way to resolve this?
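The problem is that the SSasymp initialization routine itself uses nls, and it is that hidden invocation of nls that fails to converge. (The second error is a separate issue: the unnamed nls.control(...) is matched positionally to the start argument; it should be passed as control = nls.control(maxiter = 500). Even then, the control setting does not reach the hidden nls call inside the initialization routine.)
You are going to have to hack the initialization routine. Make a new copy of SSasymp called SSasymp2, grab its initialization routine and call it SSasymp2Init, say. Then use trace to insert into the initialization a new version of nls having the required control argument. To do that we use the partial function in the pryr package. Replace the initialization routine with the hacked one and then run nls.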
library(pryr)
# Work on a copy of the self-starting model so SSasymp itself is not modified.
SSasymp2 <- SSasymp
# The initialization routine is stored in the "initial" attribute of the model.
SSasymp2Init <- attr(SSasymp2, "initial")
# On entry to the initializer, define a local `nls` that is stats::nls with
# control = nls.control(maxiter = 500) pre-filled, so that calls to `nls`
# made inside the initializer pick up the larger iteration limit.
trace(SSasymp2Init,
quote(nls <- partial(stats::nls, control = nls.control(maxiter = 500))))
# Put the traced initializer back onto the copied model.
attr(SSasymp2, "initial") <- SSasymp2Init
# Fit using the patched self-starter.
nls(meandec ~ SSasymp2(Time.Since.Burn, Asym, R0, lrc), data = averaged_perherb)
giving:
Tracing (attr(object, "initial"))(mCall = mCall, data = data, LHS = LHS) on entry
Nonlinear regression model
model: meandec ~ SSasymp2(Time.Since.Burn, Asym, R0, lrc)
data: averaged_perherb
Asym R0 lrc
0.1641 0.5695 -3.4237
residual sum-of-squares: 2.977
Number of iterations to convergence: 15
Achieved convergence tolerance: 5.875e-06

Set the thickness of geom_line based on frequency (like geom_count)

I would like to set the thickness of geom_line to the proportion of data that follows that path, in the same way that geom_count sets the size of points based on the proportion of data that overlap at that point, or find a function that will allow me to do this.
I would also be happy if I could do this as a count rather than a proportion - either would work. I have attached the graph; the grey lines represent connections between the same ID (i.e. the same individual in different categories). If I could set the thickness of the lines, I could show the most common connection pathways.
My current code is:
# One grey line per individual across categories, points sized by the
# proportion of observations at each (Category, Metric) position, and an
# overall linear trend line.
# NOTE(review): the posted `dat` names the identifier column `IDD`, while this
# code groups by `ID` - confirm which name is intended.
ggplot(dat, aes(x = Category, y = Metric, group = ID)) +
  geom_line(aes(group = ID), colour = "gray59") +
  geom_count(aes(size = ..prop.., group = 1), colour = "gray59") +
  scale_size_area(max_size = 5) +
  theme_bw() +
  # spell out FALSE: `F` is an ordinary variable that can be reassigned
  geom_smooth(method = "lm", se = FALSE, colour = "black",
              aes(group = 1), linetype = "dotdash") +
  xlab("Category") +
  ylab("Metric") +
  theme(text = element_text(size = 16))
This is the resulting graph, point size shows the proportion of data that overlaps at that point, I would like to do the same with line thickness if possible:
My searching has so far turned up nothing helpful but maybe I am searching the wrong terms. Any help would be much appreciated!
Here is the data - unsure how to upload it as a file
dat <- structure(list(IDD = structure(c(1L, 1L, 1L, 1L, 3L, 3L, 4L,
4L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 2L, 2L, 2L, 2L, 7L, 7L, 7L,
8L, 8L, 8L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 11L, 11L, 12L,
12L, 13L, 13L, 13L, 13L, 14L, 14L, 15L, 15L, 15L, 15L, 16L, 16L,
16L, 16L, 17L, 17L, 18L, 18L, 18L, 18L, 19L, 19L, 20L, 20L, 21L,
21L, 21L, 22L, 22L, 23L, 23L, 24L, 24L, 25L, 25L, 25L, 26L, 26L,
26L, 26L, 27L, 27L, 28L, 28L, 29L, 29L, 29L, 30L, 30L, 30L, 31L,
31L, 31L, 31L, 32L, 32L, 33L, 33L, 33L, 34L, 34L, 34L, 34L, 35L,
35L, 36L, 36L, 36L, 37L, 37L, 37L, 37L, 38L, 38L, 38L, 39L, 39L,
39L, 40L, 40L, 40L, 41L, 41L, 42L, 42L, 43L, 43L, 44L, 44L, 44L,
44L, 45L, 45L, 45L, 46L, 46L, 46L, 47L, 47L, 47L, 48L, 48L, 49L,
49L, 50L, 50L, 51L, 51L, 51L, 51L, 52L, 52L, 53L, 53L, 54L, 54L,
55L, 55L, 56L, 56L, 57L, 57L, 57L, 58L, 58L, 59L, 59L, 59L, 59L
), .Label = c("ID005", "ID040", "ID128", "ID131", "ID133", "ID134",
"ID147", "ID149", "ID166", "ID167", "ID175", "ID181", "ID191",
"ID198", "ID213", "ID235", "ID254", "ID257", "ID259", "ID273",
"ID279", "ID287", "ID292", "ID299", "ID300", "ID321", "ID334",
"ID348", "ID349", "ID354", "ID359", "ID377", "ID379", "ID383",
"ID390", "ID395", "ID409", "ID445", "ID467", "ID469", "ID482",
"ID492", "ID496", "ID524", "ID526", "ID527", "ID534", "ID535",
"ID538", "ID545", "ID564", "ID576", "ID578", "ID579", "ID600",
"ID610", "ID622", "ID631", "ID728"), class = "factor"), Category = c(2L,
4L, 5L, 5L, 2L, 4L, 1L, 3L, 3L, 4L, 4L, 2L, 4L, 5L, 5L, 5L, 2L,
5L, 5L, 5L, 3L, 2L, 5L, 4L, 5L, 5L, 4L, 4L, 5L, 5L, 3L, 4L, 5L,
5L, 2L, 4L, 2L, 5L, 3L, 4L, 5L, 5L, 4L, 5L, 3L, 4L, 5L, 5L, 3L,
4L, 5L, 5L, 5L, 5L, 2L, 3L, 4L, 4L, 5L, 5L, 5L, 5L, 4L, 4L, 5L,
5L, 5L, 3L, 4L, 5L, 5L, 4L, 5L, 5L, 1L, 3L, 4L, 4L, 3L, 5L, 3L,
5L, 2L, 3L, 4L, 3L, 4L, 4L, 3L, 3L, 4L, 4L, 3L, 5L, 3L, 4L, 4L,
3L, 3L, 4L, 5L, 2L, 3L, 2L, 3L, 4L, 2L, 2L, 3L, 4L, 4L, 5L, 5L,
2L, 3L, 4L, 2L, 3L, 4L, 3L, 4L, 4L, 5L, 3L, 4L, 1L, 2L, 3L, 4L,
1L, 3L, 4L, 1L, 3L, 4L, 1L, 3L, 4L, 3L, 4L, 3L, 3L, 2L, 3L, 2L,
2L, 3L, 3L, 2L, 3L, 2L, 3L, 3L, 4L, 3L, 4L, 3L, 4L, 1L, 2L, 3L,
2L, 3L, 1L, 3L, 4L, 4L), Metric = c(2, 2, 3.5, 4, 2, 1.5, 2,
2, 3, 3, 2, 2, 2, 2, 3.5, 3.5, 2, 3, 3.5, 4, 2, 2, 3, 2, 3, 3,
2, 3, 3, 2.5, 1.5, 3, 3.5, 4, 2, 2, 1.5, 2, 1.5, 2, 2, 2, 2.5,
3, 2.5, 3.5, 3.5, 3.5, 1.5, 2, 2.5, 2.5, 3.5, 4, 2, 2, 1.5, 3,
3.5, 3, 3, 3, 3.5, 2.5, 3, 3, 3, 2, 3, 2.5, 2.5, 2, 2, 2, 2,
2, 2, 2, 2.5, 2.5, 2, 3, 2.5, 2, 2.5, 2, 2.5, 2.5, 2, 2, 2.5,
3.5, 2, 2.5, 2.5, 2.5, 2.5, 2, 2, 2, 2.5, 2, 2, 1.5, 2, 2, 2.5,
2, 2, 2.5, 2, 2, 2.5, 2.5, 2.5, 3, 2.5, 2.5, 2.5, 2, 2, 2.5,
2.5, 2, 2, 2, 2, 1.5, 2, 1.5, 2, 2, 2, 1.5, 2, 2, 2.5, 2.5, 1.5,
1.5, 2, 2.5, 2, 2, 2, 2, 2.5, 2, 1.5, 2, 2.5, 2, 1.5, 1.5, 1.5,
2, 2, 2, 2, 2, 1.5, 2, 2.5, 2, 2, 2.5, 2.5)), .Names = c("IDD",
"Category", "Metric"), class = "data.frame", row.names = c(NA,
-167L))
I am a bit confused about how you want to scale different line segments, but I was able to create a proportional variable within dat and then plot that as an argument to geom_line():
# Within each `Metric` value, express every row's `Category` as a share of
# that group's `Category` total; the result is used as the line width.
dat$thickness <- with(dat, ave(Category, Metric, FUN = prop.table))
# NOTE(review): the posted `dat` names the identifier column `IDD`, while this
# code groups by `ID` - confirm which name is intended.
ggplot(dat, aes(x = Category, y = Metric, group = ID)) +
  # `size` is set outside aes(), so the values are used as given
  geom_line(aes(group = ID), colour = "gray59", size = dat$thickness) +
  geom_count(aes(size = ..prop.., group = 1), colour = "gray59") +
  scale_size_area(max_size = 5) +
  theme_bw() +
  # spell out FALSE: `F` is an ordinary variable that can be reassigned
  geom_smooth(method = "lm", se = FALSE, colour = "black",
              aes(group = 1), linetype = "dotdash") +
  xlab("Category") +
  ylab("Metric") +
  theme(text = element_text(size = 16))
Which yields this plot:

ggmap with ggsubplot creates blank map

I am trying to place some plots on a map but nothing appears on the map. Here is a reproducible example. The first plot shows how each subplot should look. The second excludes the map but the subplot sizes are too large. The last is one attempt at the final product. I have tried many permutations but this has me stuck. Thanks in advance.
library(ggplot2)
library(ggmap)
library(ggsubplot)
pDat <- structure(list(Location = structure(c(13L, 12L, 14L, 14L, 15L, 15L, 16L, 16L, 17L, 17L, 18L, 19L, 32L, 19L, 19L, 20L, 20L, 20L, 21L, 21L, 21L, 22L, 22L, 22L, 23L, 23L, 24L, 25L, 25L, 26L, 27L, 28L, 28L, 29L, 30L, 30L, 31L), .Label = c("PW-29", "PW-31", "PW-32", "PW-33", "PW-35", "PW-36", "PW-37", "PW-38", "PW-39", "PW-40", "PW29", "SD-03", "SD-03a", "SD-12", "SD-18", "SD-19", "SD-27", "SD-29", "SD-30", "SD-31", "SD-32", "SD-33", "SD-35", "SD-36", "SD-37", "SD-38", "SD-40", "SD-41", "SD-42", "SD-43", "SD-44", "SD30"), class = "factor"), Lat = c(47.292351, 47.292351, 47.289376, 47.289376, 47.288299, 47.288299, 47.288014, 47.288014, 47.287338, 47.287338, 47.29476, 47.293246, 47.293246, 47.293246, 47.293246, 47.293259, 47.293259, 47.293259, 47.292206, 47.292206, 47.292206, 47.291523, 47.291523, 47.291523, 47.290496, 47.290496, 47.289826, 47.288262, 47.288262, 47.287735, 47.286672, 47.290059, 47.290059, 47.290482, 47.28852, 47.28852, 47.288377), Long = c(-73.098418, -73.098418, -73.101282, -73.101282, -73.102558, -73.102558, -73.102178, -73.102178, -73.103016, -73.103016, -73.096432, -73.096412, -73.096412, -73.096412, -73.096412, -73.098245, -73.098245, -73.098245, -73.097552, -73.097552, -73.097552, -73.100022, -73.100022, -73.100022, -73.099395, -73.099395, -73.100051, -73.101199, -73.101199, -73.101895, -73.102629, -73.100954, -73.100954, -73.100184, -73.102246, -73.102246, -73.101477), SBD_ft = c(0, 2, 0, 7, 0, 10, 0, 6, 2, 5, 0, 0.5, 0.5, 0, 2.5, 0.5, 0, 3, 0.5, 0, 2.5, 0.5, 0, 2.5, 0.5, 0, 0, 0.5, 0, 0, 0, 2, 5, 3, 0, 6, 0), SED_ft = c(20, 4, 2, 9, 2, 12, 2, 8, 4, 7, 0.5, 2.5, 2.5, 0.5, 4.5, 2.5, 0.5, 5, 2.5, 0.5, 4.5, 2.5, 0.5, 3.5, 2.5, 0.5, 0.5, 2.5, 0.5, 0.5, 0.5, 4, 7, 5, 2, 8, 2), Cluster = structure(c(3L, 3L, 3L, 4L, 5L, 5L, 2L, 2L, 4L, 5L, 1L, 6L, 6L, 6L, 6L, 1L, 1L, 1L, 6L, 1L, 6L, 4L, 1L, 6L, 1L, 1L, 1L, 5L, 1L, 4L, 1L, 3L, 4L, 3L, 4L, 4L, 4L), .Label = c("1", "2", "3", "4", "5", "6"), class = "factor")), .Names = c("Location", "Lat", "Long", 
"SBD_ft", "SED_ft", "Cluster"), row.names = 5:41, class = "data.frame")
# Bounding box in (left, bottom, right, top) order. The original box ran from
# -73.01 to -73.1, i.e. with the western edge larger than the eastern one and
# centred well east of the sampling sites (whose longitudes lie between about
# -73.103 and -73.096), so the fetched map did not contain the data. This box
# brackets all the sites.
# NOTE(review): exact extent chosen to enclose pDat - adjust as needed.
BBox <- c(left = -73.11, bottom = 47.28, right = -73.09, top = 47.30)
#Base <-get_map(BBox,zoom=13,source='google',maptype = 'hybrid')
Base_z <- get_map(BBox, zoom = 15, source = "google", maptype = "hybrid")
# Base map layer with pDat's Long/Lat as the default x/y mapping, so layers
# added to fm0 are placed in map coordinates.
fm0 <- ggmap(
  Base_z,
  legend = "none",
  base_layer = ggplot(aes(x = Long, y = Lat), data = pDat)
)
# Example subplots
# One facet per Location; each rectangle spans SBD_ft to SED_ft on a reversed
# y axis and is filled by Cluster.
ggplot(
  pDat,
  aes(xmin = 0, xmax = 1, ymin = SBD_ft, ymax = SED_ft, fill = Cluster)
) +
  geom_rect() +
  facet_wrap(~Location) +
  scale_y_reverse()

# TEST 1, need to control size of subplots
# Same rectangles embedded as subplots positioned at each Location's Long/Lat,
# without the map underneath.
ggplot(pDat) +
  geom_subplot(
    aes(
      x = Long,
      y = Lat,
      group = Location,
      subplot = geom_rect(
        data = pDat,
        aes(xmin = 0, xmax = 1, ymin = SBD_ft, ymax = SED_ft, fill = Cluster)
      )
    )
  )

# Final , does not work
# The same subplot layer added on top of the ggmap base layer.
fm0 +
  geom_subplot(
    aes(
      x = Long,
      y = Lat,
      group = Location,
      subplot = geom_rect(
        data = pDat,
        aes(xmin = 0, xmax = 1, ymin = SBD_ft, ymax = SED_ft, fill = Cluster)
      )
    )
  )

Resources