Altering thickness of line graph based on counts - r

Dataframe "id" has the columns year, id, and matriline, where each row is an incident. I wanted to count the number of incidents by matriline per year, so I did:
# Number of incident rows for each year/matriline pair. `.drop = FALSE` keeps
# matriline factor levels that have no incidents, so they appear with n = 0.
events.bymatr <- dplyr::count(id, year, matr, .drop = FALSE)
events.bymatr
I plotted a line graph of the number of incidents over time, by matriline.
# One coloured line per matriline showing its incident count in each year.
ggplot(events.bymatr, aes(x = year, y = n, group = matr, colour = matr)) +
  geom_line()
My question is twofold:
Is there a way I could recreate this line graph where the thickness of the lines is determined by how many IDs there were, per matriline? I imagine this would involve reshaping my data above but when I tried to group_by(year,matr,id,.drop=FALSE) my data came out all wonky.
I want to change the color palette so that each color is very distinct - how do I attach a new color palette? I tried using this c25 palette with this code but it makes all my lines disappear.
# Attempt to apply the c25 palette; this is the version reported to make the lines disappear.
# NOTE(review): `c25` is not a column of events.bymatr (its columns are year, matr and n);
# presumably it is the standalone palette vector mentioned above - TODO confirm how it is defined.
ggplot(events.bymatr, aes(x=year, y=n, group=matr)) + geom_line(aes(color=c25))
Thanks so much in advance!
Output of "id" (shortened to just the first five rows per column):
> dput(id)
structure(list(date = structure(c(8243, 8243, 8243, 8248, 8947,
class = "Date"), year = c(1992L, 1992L, 1992L, 1992L, 1994L),
event.id = c(8L, 8L, 8L, 10L, 11L), id = structure(c(51L, 55L, 59L,
46L, 51L), .Label = c("J11", "J16", "J17", "J2", "J22"),
class = "factor"), sex = structure(c(1L, 2L, 2L, 1L, 1L),
.Label = c("0", "1"), class = "factor"), age = c(28L, 12L, 6L, 42L,
30L), matr = structure(c(20L, 20L, 20L, 11L, 20L), .Label = c("J2",
"J4", "J7", "J9", "K11"), class = "factor"),
matralive = structure(c(2L, 2L, 2L, 2L, 2L),
.Label = c("0", "1"), class = "factor"), pod = structure(c(3L, 3L,
3L, 3L, 3L), .Label = c("J", "K", "L"), class = "factor")),
row.names = c(NA, -134L), class = c("tbl_df", "tbl", "data.frame"))
Output of events.bymatr:
> dput(events.bymatr)
structure(list(year = c(1992L, 1992L, 1992L, 1992L, 1992L),
matr = structure(c(1L, 2L, 3L, 4L, 5L), .Label = c("J2", "J4",
"J7", "J9", "K11"), class = "factor"), n = c(0L, 0L, 0L, 0L, 0L)),
row.names = c(NA, -380L), class = c("tbl_df", "tbl",
"data.frame"))

As @r2evans noted, it is surprisingly hard to distinguish clearly among more than a handful of colors. I used an example 20-color scale here that does a pretty good job, but even so a few can be tricky to distinguish. Here's an attempt using the storms dataset included with dplyr.
library(dplyr)
library(ggplot2) # ggplot(), geom_line() and guides() come from ggplot2, not dplyr

# Example: one line per storm name, coloured by name, with line thickness
# mapped to that name's overall number of observations.
storms %>%
  group_by(name, year) %>%
  summarize(n = n(), .groups = "drop") %>% # n = number of rows per name per year
  # Add the name/year combinations that have no rows, with n = 0, so every
  # line spans the full 1975-2015 range.
  tidyr::complete(name, year = 1975:2015, fill = list(n = 0)) %>%
  group_by(name) %>%
  mutate(total = sum(n)) %>% # total = number of rows per name overall
  ungroup() %>%
  filter(total %% 12 == 0) %>% # Arbitrary, to reduce scope of data for example
  # `size` drives the line thickness here. NOTE(review): ggplot2 >= 3.4.0
  # prefers `linewidth` for lines; `size` is kept so older versions still work.
  ggplot(aes(year, n, color = name, size = total, group = name)) +
  geom_line() +
  # Draw the colour legend keys at a fixed thickness so they stay readable.
  guides(color = guide_legend(override.aes = list(size = 3))) +
  ggthemes::scale_color_tableau(palette = "Tableau 20")

Related

Ordering bars according to the values of y [duplicate]

This question already has answers here:
Set the order of a stacked bar chart by the value of one of the variables
(2 answers)
Closed 9 months ago.
Using the code below, I have created the below chart. To make it easier for people to see the pattern, I'd like to order states from left to right according to the y values (Dx) by age 65.
Thanks,
NM
Here is my data:
structure(list(Age = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c("30", "50", "65"), class = "factor"), Dx = c(3.057, 7.847, 17.157, 2.851, 8.861, 21.885, 2.521, 7.889, 21.328), PopName = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L), .Label = c("AK", "AL", "AR"), class = "factor")), row.names = c(NA, -9L), class = c("tbl_df", "tbl", "data.frame"))
library(tidyverse)

# Horizontal stacked bars of Dx by state (PopName), stacked by Age, with the
# states ordered by their total Dx.
CAPS_2019 %>%
  # Group by state only: in the sample data each Age/PopName pair is a single
  # row, so grouping by both would make sum(Dx) just repeat the row's own Dx.
  group_by(PopName) %>%
  mutate(PopName1 = sum(Dx)) %>%
  ungroup() %>%
  ggplot(aes(x = fct_reorder(PopName, PopName1), y = Dx, fill = factor(as.character(Age)))) +
  geom_col(position = position_stack(reverse = TRUE)) +
  theme_classic() +
  coord_flip() +
  # `face = "bold"` was dropped: labs() only takes labels, so it styled nothing.
  labs(x = "State", y = "Deaths (%)", caption = "", fill = "Age")
Update 2: Try this with your new dataset. Age and PopName are already factors there, so this should work as expected:
# Horizontal stacked bars of Dx by state (PopName), stacked by Age, with the
# states ordered by their total Dx.
CAPS_2019_data %>%
  # Group by state only so that sum(Dx) is the state total.
  # NOTE(review): assumes one row per Age/PopName pair, as in the sample data;
  # with that shape, grouping by Age as well reduces the sum to each row's Dx.
  group_by(PopName) %>%
  mutate(PopName1 = sum(Dx)) %>%
  ungroup() %>%
  ggplot(aes(x = reorder(PopName, PopName1), y = Dx, fill = Age)) +
  geom_col(position = position_stack(reverse = TRUE)) +
  theme_classic() +
  coord_flip() +
  # `face = "bold"` was dropped: labs() only takes labels, so it styled nothing.
  labs(x = "State", y = "Deaths (%)", caption = "", fill = "Age")
Update:
data:
CAPS_2019 <- structure(list(Age = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L,
2L, 3L), .Label = c("30", "50", "65"), class = "factor"), Dx = c(3.057,
7.847, 17.157, 2.851, 8.861, 21.885, 2.521, 7.889, 21.328), PopName = structure(c(1L,
1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L), .Label = c("AK", "AL", "AR"), class = "factor")), row.names = c(NA,
-9L), class = c("tbl_df", "tbl", "data.frame"))
To get the stacks ordered use position = position_stack(reverse = TRUE)
To order y axis do some preprocessing with group_by and sum and use fct_reorder from forcats package (it is in tidyverse)
library(tidyverse)

# Horizontal stacked bars of Dx by state (PopName), stacked by Age, with the
# states ordered by their total Dx.
CAPS_2019 %>%
  # Group by state only: each Age/PopName pair is a single row in CAPS_2019,
  # so grouping by both would make sum(Dx) just repeat the row's own Dx.
  group_by(PopName) %>%
  mutate(PopName1 = sum(Dx)) %>%
  ungroup() %>%
  # fct_reorder() sorts the states by the (constant per state) total.
  ggplot(aes(x = fct_reorder(PopName, PopName1), y = Dx, fill = factor(as.character(Age)))) +
  # reverse = TRUE flips the stacking order of the Age segments within a bar.
  geom_col(position = position_stack(reverse = TRUE)) +
  theme_classic() +
  coord_flip() +
  # `face = "bold"` was dropped: labs() only takes labels, so it styled nothing.
  labs(x = "State", y = "Deaths (%)", caption = "", fill = "Age")

Add blank spaces in x-categorical axis

I'm plotting a monthly time series of abundance using a bar chart, and there are some months where I don't have data to show. Is it possible to add blank spaces in the categorical x-axis?
This is my data:
library(dplyr)
library(ggplot2)
structure(list(spp = structure(c(9L, 10L, 1L, 2L, 3L), .Label = c("sp10", "sp15", "sp16", "sp20", "sp21", "sp22", "sp23", "sp24", "sp8", "sp9"), class = "factor"), abundance = c(0, 0, 0, 0, 11.19404656), estation = c(5L, 5L, 5L,5L, 5L), year = c(1995L, 1995L, 1995L, 1995L, 1995L), month = structure(c(2L, 2L, 2L, 2L, 2L), .Label = c("abr", "dic", "ene", "feb"), class = "factor"), date = structure(c(2L, 2L, 2L, 2L, 2L), .Label = c("abr-96", "dic-95", "ene-96", "feb-96"), class = "factor")), row.names = c(NA, 5L), class = "data.frame")
# Total abundance per date and species.
# NOTE(review): the structure(...) call above is not assigned to a name, so `subset` presumably refers to that data frame under another assignment - confirm (unassigned, `subset` is the base R function).
subset_group <- subset %>% group_by(date,spp) %>% summarize(sum_pl = sum(abundance))
# Stacked bars of summed abundance per date, filled by species.
# NOTE(review): the labels vector mixes named entries with one unnamed entry ("M"), and no "mar-96" level exists in the date factor shown above - this is the attempt reported as not working.
ggplot(subset_group, aes(x = date, y = sum_pl, fill = spp)) + geom_bar(stat = "identity") + scale_x_discrete(labels = c("dic-95" = "D", "ene-96" = "E", "feb-96" = "F", "M", "abr-96" = "A"))
I used scale_x_discrete(), but with no positive results.

Multiple vertical shaded area

I am plotting the proportion of deep sleep (y axis) vs days (x axis). I would like to add vertical shaded area for a better understanding (e.g. grey for week-ends, orange for sick period...).
I have tried using geom_ribbon (I created a variable taking the value 30, which is the top of my y axis, when the data point falls on a weekend - information given in another column), but instead of getting rectangles, I get trapezoids.
In another post, someone proposed the use of "geom_rect", or "annotate" if one knows the x and y coordinates, but I don't see how to adapt it to my case, where I want the colored area repeated for every weekend (it is not exactly every 7 days because some data are missing).
Do you have any idea ?
Many thanks in advance !
# Deep-sleep proportion per day, one facet row per MONTH, with reference lines
# at 15 and 20 and a ribbon between the `min` and `max.WE` columns.
ggplot(Sleep.data, aes(x = DATEID)) +
  geom_line(aes(y = P.DEEP, group = 1), colour = "deepskyblue3") +
  geom_point(aes(y = P.DEEP, group = 1, colour = Sign.deep)) +
  guides(colour = FALSE) +
  # The ribbon is what produces the shaded shapes described in the question.
  geom_ribbon(aes(ymin = min, ymax = max.WE), fill = "#6495ED80") +
  facet_grid(MONTH ~ .) +
  geom_hline(yintercept = 15, colour = "forestgreen") +
  geom_hline(yintercept = 20, colour = "forestgreen", linetype = "dashed") +
  # Vertical guides every 7 days.
  geom_vline(xintercept = seq(7, 28, by = 7), colour = "grey") +
  scale_x_continuous(breaks = seq(0, 28, by = 7)) +
  scale_y_continuous(breaks = seq(0, 30, by = 5)) +
  labs(x = "Days", y = "Proportion of deep sleep stage", title = "Deep sleep")
Proportion of deep sleep vs time
Head(Sleep.data)
> dput(head(Sleep.data))
structure(list(DATE = structure(c(1L, 4L, 7L, 10L, 13L, 16L), .Label = c("01-Dec-17",
"01-Feb-18", "01-Jan-18", "02-Dec-17", "02-Feb-18", "02-Jan-18",
"03-Dec-17", "03-Feb-18", "03-Jan-18", "04-Dec-17", "04-Feb-18",
"04-Jan-18", "05-Dec-17", "05-Feb-18", "05-Jan-18", "06-Dec-17",
"06-Feb-18", "06-Jan-18", "07-Dec-17", "07-Feb-18", "07-Jan-18",
"08-Dec-17", "08-Jan-18", "09-Dec-17", "09-Feb-18", "09-Jan-18",
"10-Dec-17", "10-Jan-18", "11-Dec-17", "11-Feb-18", "11-Jan-18",
"12-Dec-17", "12-Jan-18", "13-Dec-17", "13-Feb-18", "13-Jan-18",
"14-Dec-17", "14-Feb-18", "14-Jan-18", "15-Dec-17", "15-Jan-18",
"16-Dec-17", "16-Jan-18", "17-Dec-17", "17-Jan-18", "18-Dec-17",
"18-Jan-18", "19-Dec-17", "19-Jan-18", "20-Dec-17", "21-Dec-17",
"21-Jan-18", "22-Dec-17", "22-Jan-18", "23-Dec-17", "23-Jan-18",
"24-Dec-17", "24-Jan-18", "25-Dec-17", "25-Jan-18", "26-Dec-17",
"26-Jan-18", "27-Dec-17", "27-Jan-18", "28-Dec-17", "28-Jan-18",
"29-Dec-17", "29-Jan-18", "30-Dec-17", "30-Jan-18", "31-Dec-17",
"31-Jan-18"), class = "factor"), DATEID = 1:6, MONTH = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("Decembre", "Janvier", "FĂ©vrier"
), class = "factor"), DURATION = c(8.08, 7.43, 6.85, 6.23, 7.27,
6.62), D.DEEP = c(1.67, 1.37, 1.62, 1.75, 1.95, 0.9), P.DEEP = c(17L,
17L, 21L, 24L, 25L, 12L), STIMS = c(0L, 0L, 0L, 0L, 390L, 147L
), D.REM = c(1.7, 0.95, 0.95, 1.43, 1.47, 0.72), P.REM = c(17L,
11L, 12L, 20L, 19L, 9L), D.LIGHT = c(4.7, 5.12, 4.27, 3.05, 3.83,
4.98), P.LIGHT = c(49L, 63L, 55L, 43L, 49L, 66L), D.AWAKE = c(1.45,
0.58, 0.47, 0.87, 0.37, 0.85), P.AWAKE = c(15L, 7L, 6L, 12L,
4L, 11L), WAKE.UP = c(-2L, 0L, 2L, -1L, 3L, 1L), AGITATION = c(-1L,
-3L, -1L, -2L, 2L, -1L), FRAGMENTATION = c(1L, -2L, 2L, 1L, 0L,
-1L), PERIOD = structure(c(3L, 3L, 4L, 4L, 4L, 4L), .Label = c("HOLIDAYS",
"SICK", "WE", "WORK"), class = "factor"), SPORT = structure(c(2L,
1L, 2L, 2L, 2L, 1L), .Label = c("", "Day", "Evening"), class = "factor"),
ACTIVITY = structure(c(6L, 1L, 3L, 4L, 5L, 1L), .Label = c("",
"Bkool", "eBike", "Gym", "Natation", "Run"), class = "factor"),
TABLETS = c(0.5, 0.5, 0.5, 0.5, 0.5, 0.5), Ratio = c(1.15,
2.36, 3.45, 2.01, 5.27, 1.06), Sign = structure(c(2L, 2L,
2L, 2L, 2L, 2L), .Label = c("0", "1"), class = "factor"),
Sign.ratio = structure(c(2L, 2L, 2L, 2L, 2L, 2L), .Label = c("0",
"1"), class = "factor"), Sign.deep = structure(c(2L, 2L,
2L, 2L, 2L, 1L), .Label = c("0", "1"), class = "factor"),
Sign.awake = structure(c(1L, 2L, 2L, 1L, 2L, 1L), .Label = c("0",
"1"), class = "factor"), Sign.light = structure(c(2L, 1L,
1L, 2L, 2L, 1L), .Label = c("0", "1"), class = "factor"),
index = structure(c(1L, 1L, 1L, 1L, 2L, 1L), .Label = c("0",
"1"), class = "factor"), min = c(0, 0, 0, 0, 0, 0), max.WE = c(30,
30, 0, 0, 0, 0)), .Names = c("DATE", "DATEID", "MONTH", "DURATION",
"D.DEEP", "P.DEEP", "STIMS", "D.REM", "P.REM", "D.LIGHT", "P.LIGHT",
"D.AWAKE", "P.AWAKE", "WAKE.UP", "AGITATION", "FRAGMENTATION",
"PERIOD", "SPORT", "ACTIVITY", "TABLETS", "Ratio", "Sign", "Sign.ratio",
"Sign.deep", "Sign.awake", "Sign.light", "index", "min", "max.WE"
), row.names = c(NA, 6L), class = "data.frame")
Thanks for adding the data, that makes it easier to understand exactly what you're working with and to confirm that an answer actually addresses your question.
I thought it would be helpful to make a separate table with just the start and end of each contiguous set of rows with the same PERIOD. I did this using dplyr::case_when, assuming we should mark dates as a "start" if they are the first row in the table (row_number() == 1), or they have a different PERIOD value than the prior row. I mark dates as an "end" if they are the last row of the table, or have a different PERIOD than the next row. I only keep the starts and ends, and spread these into new columns called start and end.
library(tidyverse)
# Collapse Sleep.data into one row per contiguous run of identical PERIOD
# values, recording the DATEID of the first (`start`) and last (`end`) row of
# each run. PERIOD_NUM numbers the runs in table order.
Period_ranges <- Sleep.data %>%
  # A new run begins on the first row and wherever PERIOD differs from the
  # previous row; the running count of those beginnings is the run number.
  mutate(PERIOD_NUM = cumsum(row_number() == 1 | PERIOD != lag(PERIOD))) %>%
  group_by(PERIOD, PERIOD_NUM) %>%
  # Summarising each run (instead of tagging rows as "start" or "end") gives a
  # one-row run both bounds; tagging left its `end` missing, which dropped
  # its rectangle from the plot.
  summarise(end = last(DATEID), start = first(DATEID)) %>%
  ungroup() %>%
  as.data.frame()
# Output based on sample data only. If there's a problem with the full data, please add more. To share full data, use `dput(Sleep.data)` or to share 20 rows use `dput(head(Sleep.data, 20))`.
>Period_ranges
PERIOD PERIOD_NUM end start
1 WE 1 2 1
2 WORK 2 6 3
We can now use that in the plot. If you want to toggle the inclusion of different PERIOD types, or adjust their appearance separately, you could replace Period_ranges in the code below with, for example, Period_ranges %>% filter(PERIOD == "WE"):
ggplot(Sleep.data, aes(x = DATEID)) +
  # This layer uses its own data (Period_ranges) and ignores the plot-level
  # aesthetics. Each rectangle starts half a day before and ends half a day
  # after its run, so adjacent periods fill the space between days.
  geom_rect(data = Period_ranges, inherit.aes = FALSE,
            aes(xmin = start - 0.5, xmax = end + 0.5,
                ymin = 0, ymax = 30,
                fill = PERIOD), alpha = 0.5) +
  # Shading colour for each PERIOD level. All four levels of the PERIOD factor
  # are listed so none of them falls back to the scale's NA colour.
  scale_fill_manual(values = c(
    "WE" = '#6495ED80',
    "WORK" = "gray60",
    "SICK" = "orange",
    "HOLIDAYS" = "palegreen3"
  )) +
  # rest of your code
Chart based on data sample:

ggplot add aggregated summaries to a bar plot

I have the following data frame:
structure(list(StepsGroup = structure(c(1L, 1L, 1L, 2L, 2L, 2L,
3L, 3L, 3L), .Label = c("(-Inf,3e+03]", "(3e+03,1.2e+04]", "(1.2e+04, Inf]"
), class = "factor"), GlucoseGroup = structure(c(1L, 2L, 3L,
1L, 2L, 3L, 1L, 2L, 3L), .Label = c("<100", "100-180", ">180"
), class = "factor"), n = c(396L, 1600L, 229L, 787L, 4182L, 375L,
110L, 534L, 55L), freq = c(0.177977528089888, 0.719101123595506,
0.102921348314607, 0.147267964071856, 0.782559880239521, 0.0701721556886228,
0.157367668097282, 0.763948497854077, 0.0786838340486409)), class =
c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -9L), vars = "StepsGroup",
labels = structure(list(
StepsGroup = structure(1:3, .Label = c("(-Inf,3e+03]", "(3e+03,1.2e+04]",
"(1.2e+04, Inf]"), class = "factor")), class = "data.frame", row.names =
c(NA, -3L), vars = "StepsGroup", drop = TRUE), indices = list(0:2,
3:5, 6:8), drop = TRUE, group_sizes = c(3L, 3L, 3L), biggest_group_size =
3L)
I would like to create a stacked bar plot, and add a summary of each StepsGroup on top of each bar. So the first group will have 2225, the second 5344 and the third 699.
I am using the following script:
# Stacked bars of freq per StepsGroup, filled by GlucoseGroup.
ggplot(d_stepsFastingSummary , aes(y = freq, x = StepsGroup, fill =
GlucoseGroup)) + geom_bar(stat = "identity") +
# NOTE(review): this geom_text() layer is the part reported to fail; it calls n() as a function inside aes() although `n` is a column of the data shown above.
geom_text(aes(label = sum(n()), vjust = 0))
The part until before the geom_text works, but for the last bit I get the following error:
Error: This function should not be called directly
Any idea how to add the aggregated quantity?
We could create a new dataframe stacked_df which would have sum for each StepsGroup
# Total of n within each StepsGroup; used as the label drawn above each bar.
stacked_df <- summarise(group_by(df, StepsGroup), nsum = sum(n))

# Stacked frequency bars plus one total label per StepsGroup at y = 1.1.
ggplot(df) +
  geom_col(aes(x = StepsGroup, y = freq, fill = GlucoseGroup)) +
  geom_text(aes(x = StepsGroup, y = 1.1, label = nsum), data = stacked_df)

ggvis: x-axis with whole number scale

I want to get whole numbers for x-axis for ggvis plot.
MWE
df <-
structure(list(Factor = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L), .Label = c("A", "B", "C"), class = "factor"),
X = c(15.5133333333333, 14.63, 14.41, 14.1266666666667, 13.1833333333333,
12.9466666666667, 13.6133333333333, 13.55, 13.5333333333333,
11.5566666666667, 11.3066666666667, 11.4566666666667), Y = c(20L,
20L, 20L, 40L, 40L, 40L, 70L, 70L, 70L, 100L, 100L, 100L)), .Names = c("Factor",
"X", "Y"), row.names = c(NA, -12L), class = "data.frame")
library(ggvis)
# Points and connecting paths of Y against X, one series per Factor.
ggvis(data=df
, x= ~X
, y= ~Y
, fill= ~Factor
, stroke = ~Factor) %>%
arrange(Y) %>%
group_by(Factor) %>%
layer_points(shape=~Factor) %>%
layer_paths(fill := NA) %>%
# NOTE(review): format='####' is the attempt at whole-number x-axis labels that the question is asking about.
add_axis('x', orient=c('bottom'), format='####')
One possibility is to use values=seq(from=10, to=16, by=1) in add_axis(). But this approach is not automated.
Setting the format argument to 'd' will display only integer values in the axis label:
library(ggvis)
library(dplyr)
##
# Points and connecting paths of Y against X, one series per Factor.
df %>%
  ggvis(x = ~X, y = ~Y, fill = ~Factor, stroke = ~Factor) %>%
  arrange(Y) %>%
  group_by(Factor) %>%
  layer_points(shape = ~Factor) %>%
  layer_paths(fill := NA) %>%
  # format = "d" makes the x-axis labels display integer values only.
  add_axis("x", orient = "bottom", format = "d")
More information on d3 formatting specifications is available on this page, as mentioned in the Common Properties section of this ggvis guide.

Resources