I'm working on a heatmap and following along this tutorial:
https://www.r-graph-gallery.com/283-the-hourly-heatmap/
To save a click, here's the code block to reproduce:
library(ggplot2)
library(dplyr) # easier data wrangling
library(viridis) # colour blind friendly palette, works in B&W also
library(Interpol.T) # will generate a large dataset on initial load
library(lubridate) # for easy date manipulation
library(ggExtra) # because remembering ggplot theme options is beyond me
library(tidyr)
data<- data(Trentino_hourly_T,package = "Interpol.T")
names(h_d_t)[1:5]<- c("stationid","date","hour","temp","flag")
df<- tbl_df(h_d_t) %>%
filter(stationid =="T0001")
df<- df %>% mutate(year = year(date),
month = month(date, label=TRUE),
day = day(date))
df$date<-ymd(df$date) # not necessary for plot but
#useful if you want to do further work with the data
#cleanup
rm(list=c("h_d_t","mo_bias","Tn","Tx",
"Th_int_list","calibration_l",
"calibration_shape","Tm_list"))
#create plotting df
df <-df %>% select(stationid,day,hour,month,year,temp)
Then a heatmap is made:
p <-ggplot(df,aes(day,hour,fill=temp))+
geom_tile(color= "white",size=0.1) +
scale_fill_viridis(name="Hrly Temps C",option ="C")
p <-p + facet_grid(year~month)
p <-p + scale_y_continuous(trans = "reverse", breaks = unique(df$hour))
So far so good, I can recreate this. However my own dataset is website visit data at the visit level, so many visits in a given day and hour. In addition to visits I also have a timeOnPage metric.
Sample of data below with dput.
I would like to heatmap the average hourly visits or timeOnPage. Here's what I tried.
Sample of my data:
> dput(sam)
structure(list(Day = structure(c(4L, 4L, 4L, 5L, 3L, 2L, 3L,
6L, 2L, 2L, 4L, 2L, 3L, 3L, 6L, 1L, 4L, 2L, 3L, 5L, 2L, 5L, 4L,
2L, 5L, 2L, 7L, 5L, 6L, 2L, 2L, 6L, 4L, 6L, 2L, 2L, 2L, 5L, 5L,
2L, 6L, 5L, 3L, 5L, 3L, 2L, 6L, 4L, 2L, 5L, 2L, 5L, 4L, 2L, 6L,
2L, 7L, 2L, 2L, 2L, 5L, 6L, 3L, 2L, 3L, 4L, 4L, 3L, 6L, 2L, 5L,
3L, 4L, 4L, 3L, 2L, 5L, 5L, 5L, 3L, 5L, 2L, 4L, 5L, 5L, 2L, 3L,
6L, 2L, 2L, 5L, 4L, 6L, 7L, 3L, 3L, 4L, 4L, 2L, 6L), .Label = c("Sun",
"Mon", "Tues", "Wed", "Thurs", "Fri", "Sat"), class = c("ordered",
"factor")), Hour = c(18L, 7L, 3L, 22L, 11L, 11L, 9L, 16L, 16L,
13L, 18L, 18L, 10L, 19L, 7L, 13L, 18L, 14L, 10L, 20L, 17L, 6L,
21L, 15L, 18L, 7L, 12L, 10L, 16L, 14L, 18L, 13L, 17L, 10L, 19L,
20L, 14L, 16L, 10L, 9L, 16L, 9L, 8L, 13L, 17L, 17L, 11L, 15L,
22L, 17L, 18L, 17L, 7L, 19L, 12L, 2L, 12L, 15L, 7L, 17L, 17L,
18L, 13L, 10L, 19L, 9L, 13L, 13L, 17L, 21L, 23L, 4L, 17L, 12L,
12L, 9L, 17L, 19L, 7L, 4L, 5L, 17L, 6L, 23L, 3L, 14L, 19L, 13L,
7L, 11L, 9L, 13L, 9L, 19L, 11L, 5L, 20L, 20L, 19L, 11L), sessionID = c("1508980591045.l027p6mt",
"1510155616668.57i2wj1", "1510140439620.qu19kyo", "1510296404412.xasqfwqd10v1qdtl6jemi",
"1510082622485.szj2ja1e", "1511204933263.mq9bvi0d", "1511285142249.vp2fyfd9",
"1510965282725.x04h1dko", "1508801295434.e056cpef", "1508790369346.ly63bjgr",
"1509585154520.3usd036k", "1511834881064.e6f5evp", "1509471114265.2u807dwo",
"1507688054076.9dls0jk", "1509721031589.ho125mpb", "1510521845178.99j1ibkr",
"1510194555297.ioepfjgr", "1508793469455.hkc3xwa8", "1511288175700.62n5oc5",
"1510287319653.7ye9sjc", "1511227016523.yyn1of99", "1511448209341.1u5vir5p",
"1510205972493.qvu4ev7o", "1510615247987.swxhwct", "1508463701266.p52sdjzp",
"1510588449881.d6ffruv9", "1507404213416.rovwmmge", "1510857718956.2z57w2vr",
"1510360661780.19hznp3m78pvi", "1511820500742.48cyvo2a", "1508809029952.up0wqq5h",
"1508533120441.gdvhacjr7jswiquwuyp66r", "1509583258224.j8krac0sz5kx8pxohl4n29",
"1511549442901.5vm7na1l", "1508811367845.7b36epqk", "1509421407861.om0ydylt",
"1508794534361.p3gcoa0e", "1510877729807.viad220f", "1511460355269.omwvd00l",
"1508775703610.usuk2akm", "1510964376869.7e2crw9d", "1510247098808.np9ia23",
"1508860753512.3z4182b", "1510868797935.3nmpvkri", "1510105270807.4evhpys",
"1511831565084.27izf13f", "1510340973580.l9qj5drou5wmi", "1508364715184.14l4ikj",
"1509426566404.9qnp0m3", "1510275972333.hhqu0exc", "1510625679744.jk3vvt1v",
"1510881839700.c34skful", "1511365134270.57thqyir", "1509416741055.1f2cnmrp",
"1509738404263.8ajwpij", "1510570338116.h9a5j88", "1511640706961.qw8q1eh",
"1510011913201.eqd54kw", "1508769010911.wrpb329", "1508803518777.56b2ej2l",
"1509670743316.yhncp17j", "1511576965410.y47g0wgj", "1508876390209.wem8i3lh",
"1508779846415.hyx8qar", "1511322782502.s835px9", "1509554323957.osxgi0em",
"1510176829762.jncm9xwb", "1509482328620.sqdbob0u", "1508545652936.a5hqcmp1fw29",
"1508817816447.6mbdldxb", "1510297785623.33i6yhko", "1508843299131.3m26sqf5",
"1510191633431.cl5fh9ik", "1509565114633.bd5yrkf5", "1510690660714.818yxn5o",
"1507567660773.ybpbfgn", "1509667501973.1a9f9pyp", "1509674601865.yqvmcclv",
"1511450423709.s149r25q", "1511267096892.n5u1d0nv", "1509624499459.u57lgtt8",
"1510019204298.ka4w9kfh", "1511362131909.t26h6ig", "1510904968660.eowoea2q",
"1510225256391.4dk073ej", "1510006654569.reo2eili", "1509501692686.ng48bwnz",
"1509741958143.bxbf325r", "1508770633217.33ymrfgc", "1511810438817.zcgpr6vj",
"1510852180447.wywsj7f", "1510176833767.nev0iaec", "1509727547082.53van2sr",
"1507430914148.niu297m", "1508868705810.akd7r18h", "1510060231388.mz9ojf6g",
"1509592760232.qtrlxye8", "1509592651211.1r82ucw4", "1508812928318.f3st4004",
"1509734102140.leol1dnw"), uniquePageviews = c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), timeOnPage = c(359, 149, 69, 146, 147, 119, 168, 69, 29, 0,
1542, 148, 242, 49, 457, 175, 175, 97, 79, 12, 0, 1141, 150,
236, 74, 128, 23, 147, 172, 223, 225, 88, 69, 156, 0, 49, 110,
150, 70, 123, 30, 145, 1629, 1, 119, 169, 48, 136, 529, 130,
149, 124, 281, 2483, 0, 60, 149, 50, 29, 124, 149, 0, 92, 149,
915, 47, 50, 89, 143, 84, 129, 147, 138, 80, 33, 226, 70, 146,
177, 98, 150, 32, 148, 149, 12, 338, 146, 204, 149, 148, 26,
149, 1110, 148, 23, 151, 0, 100, 0, 28)), row.names = c(20219L,
42612L, 42149L, 46707L, 40122L, 57449L, 60878L, 56707L, 11725L,
10102L, 29911L, 71743L, 25952L, 1492L, 35570L, 48411L, 43917L,
10530L, 61004L, 46446L, 58846L, 65695L, 44287L, 49341L, 2999L,
48502L, 627L, 54118L, 48148L, 70166L, 13346L, 4770L, 29745L,
67979L, 13832L, 24814L, 10692L, 54744L, 65995L, 8216L, 56683L,
44920L, 18121L, 54499L, 41155L, 71353L, 47606L, 1900L, 25023L,
45811L, 49937L, 54904L, 63607L, 24571L, 36060L, 48479L, 69086L,
37708L, 7353L, 12117L, 33912L, 68752L, 19081L, 8768L, 62647L,
28317L, 43172L, 26286L, 6359L, 14907L, 46733L, 16418L, 43797L,
28637L, 51671L, 1273L, 33677L, 34226L, 65759L, 60247L, 31739L,
38171L, 63497L, 55589L, 44462L, 37454L, 27141L, 36178L, 7543L,
69636L, 54030L, 43173L, 35743L, 852L, 18784L, 39283L, 30672L,
30663L, 14142L, 35933L), class = "data.frame", .Names = c("Day",
"Hour", "sessionID", "uniquePageviews", "timeOnPage"))
It looks like this:
> head(sam)
Day Hour sessionID uniquePageviews timeOnPage
20219 Wed 18 1508980591045.l027p6mt 1 359
42612 Wed 7 1510155616668.57i2wj1 1 149
42149 Wed 3 1510140439620.qu19kyo 1 69
46707 Thurs 22 1510296404412.xasqfwqd10v1qdtl6jemi 1 146
40122 Tues 11 1510082622485.szj2ja1e 1 147
57449 Mon 11 1511204933263.mq9bvi0d 1 119
> glimpse(sam)
Observations: 100
Variables: 5
$ Day <ord> Wed, Wed, Wed, Thurs, Tues, Mon, Tues, Fri, Mon, Mon, Wed, Mon, Tues, Tues, Fri, Sun, Wed, M...
$ Hour <int> 18, 7, 3, 22, 11, 11, 9, 16, 16, 13, 18, 18, 10, 19, 7, 13, 18, 14, 10, 20, 17, 6, 21, 15, 1...
$ sessionID <chr> "1508980591045.l027p6mt", "1510155616668.57i2wj1", "1510140439620.qu19kyo", "1510296404412.x...
$ uniquePageviews <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
$ timeOnPage <dbl> 359, 149, 69, 146, 147, 119, 168, 69, 29, 0, 1542, 148, 242, 49, 457, 175, 175, 97, 79, 12, ...
Metric uniquePageviews will always be 1 or 0 and in a heatmap it doesn't look great. Since it's session level data there are multiple entries for each day / hour. For timeOnPage I would like to heatmap the mean time on page for a given hour and day of week combination.
So, as far as I can tell ggplot is summing everything whereas I want mean().
My initial code block:
# creates the initial heatmap
p <- ggplot(sam, aes(x = Day, y = Hour, fill = uniquePageviews)) +
geom_tile(color = "white", size = 0.1) +
scale_fill_viridis(name = "TimeOnPage", option ="C")
# order by hour of day going top to bottom asc
p <-p + scale_y_continuous(trans = "reverse", breaks = unique(df$hour))
I tried changing it to this but the results look the exact same:
# gets the initial heatmap
p <- ggplot(sam, aes(x = Day, y = Hour, fill = uniquePageviews),
stat = "summary", fun.y = "mean") +
geom_tile(color = "white", size = 0.1) +
scale_fill_viridis(name = "Mean TimeOnPage", option ="C")
# order by hour of day going top to bottom asc
p <-p + scale_y_continuous(trans = "reverse", breaks = unique(df$hour))
I could do some dplyr group by transformations on the dataframe sam but I was not sure if ggplot::geom_tile() takes care of that or not?
How can I create a heatmap with ggplot where the fill is based on mean? Also, can someone clarify what exactly it's showing now? Total sum?
Not sure if I get your problem but you can try following:
library(tidyverse)
library(viridis)

# Aggregate to one row per Day/Hour cell *before* plotting: geom_tile() does
# not summarise overlapping observations for you (tiles drawn later simply
# overplot earlier ones), so the mean must be computed explicitly.
# Note: the asker's data frame is `sam` (the original answer referenced a
# non-existent `d`).
sam %>%
  group_by(Day, Hour) %>%
  summarise(Mean = mean(timeOnPage)) %>%
  ggplot(aes(x = Day, y = Hour, fill = Mean)) +
  geom_tile(color = "white", size = 0.1) +
  # Legend titled for what is actually shown: the per-cell mean.
  scale_fill_viridis(name = "Mean TimeOnPage", option = "C")
this will calculate the mean timeOnPage per Day and Hour and plot it as a heatmap.
Related
I am comparing the frequency of acute surgical procedures per week before and during the Covid-19 pandemic. I have a simple, linear regression in which each observation correspond to a week and is represented in lin.model$cons_week. There are 221 observation, corresponding to 221 successive weeks since 2017-01-02 (first monday of 2017) and up to 2021-04-05
mod <- fortify(lm(n ~ cons_week * corona, data = lin.model))
With n being number of surgical procedures and corona indicates which time period cons_week belongs to.
I have
mod %>%
ggplot(aes(y = .cooksd)) +
geom_col(data = filter(mod, corona == "Normal") %>% droplevels(),
aes(seq_along(.cooksd)),
color = "#6DBCC3", fill = alpha("#6DBCC3", .2)) +
geom_col(data = filter(mod, corona == "C19") %>% droplevels(),
aes(seq_along(.cooksd)+167),
color = "#8B3A62", fill = alpha("#8B3A62", .2)) +
geom_hline(yintercept = 4/nrow(lin.model), col = "black", lty = 2) +
geom_vline(xintercept = 167, color = "red", lty = 2) +
scale_y_continuous(name = "Cook's Distance")
Giving
However, I would like the x-axis to show some sort of sensible time line as each observation in Cook's correspond to one week.
I tried scale_x_date():
mod %>%
mutate(cons_week_dt = as.Date("2017-01-02") + cons_week*7) %>%
ggplot(aes(x = cons_week_dt, y = .cooksd)) +
geom_col(data = filter(mod, corona == "Normal") %>% droplevels(),
aes(seq_along(.cooksd)),
color = "#6DBCC3", fill = alpha("#6DBCC3", .2)) +
geom_col(data = filter(mod, corona == "C19") %>% droplevels(),
aes(seq_along(.cooksd)+167),
color = "#8B3A62", fill = alpha("#8B3A62", .2)) +
geom_hline(yintercept = 4/nrow(lin.model), col = "black", lty = 2) +
geom_vline(xintercept = 167, color = "red", lty = 2) +
scale_y_continuous(name = "Cook's Distance") +
scale_x_date(name = "",
date_breaks = "3 months", date_labels = "%B-%Y", expand = c(0.01, 0))
But that returns an error:
Error: Invalid input: date_trans works with objects of class Date only
lin.model <- structure(list(corona = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("Normal", "C19"), class = "factor"), cons_week = c(56,
6, 150, 87, 16, 100, 16, 149, 62, 38, 74, 3, 80, 64, 72, 80,
71, 25, 100, 159, 80, 72, 35, 14, 55, 162, 14, 4, 100, 34, 81,
59, 156, 84, 165, 129, 70, 140, 113, 13, 65, 76, 33, 7, 93, 40,
107, 72, 43, 123, 142, 65, 83, 119, 63, 116, 18, 48, 97, 6, 93,
79, 81, 158, 8, 45, 143, 114, 83, 96, 26, 91, 113, 22, 54, 126,
93, 39, 33, 132, 97, 126, 136, 145, 35, 30, 109, 160, 20, 74,
162, 26, 112, 145, 28, 22, 28, 153, 131, 136, 209, 203, 188,
180, 214, 180, 176, 203, 219, 172, 222, 212, 183, 200, 218, 208,
198, 169, 210, 222, 207, 184, 175, 185, 198, 192, 206, 195, 219,
193, 197, 217, 204, 193, 193, 182, 175, 169, 170, 208, 192, 181,
186, 186, 209, 212, 200, 194, 213, 178, 213, 212, 209, 205, 183,
206, 199, 181, 187, 174, 177, 215, 193, 207, 207, 204, 187, 195,
174, 184, 171, 218, 188, 181, 197, 180, 208, 203, 192, 173, 218,
173, 196, 185, 212, 201, 194, 221, 205, 210, 213, 174, 207, 181,
189, 179, 200, 196, 216, 201), n = c(9L, 14L, 11L, 15L, 19L,
12L, 19L, 20L, 12L, 17L, 9L, 13L, 7L, 6L, 12L, 7L, 11L, 15L,
12L, 8L, 7L, 12L, 15L, 13L, 10L, 11L, 13L, 20L, 12L, 10L, 11L,
11L, 16L, 17L, 13L, 12L, 15L, 6L, 13L, 14L, 14L, 16L, 25L, 15L,
11L, 19L, 22L, 12L, 18L, 18L, 12L, 14L, 11L, 18L, 14L, 11L, 14L,
14L, 15L, 14L, 11L, 15L, 11L, 15L, 16L, 14L, 11L, 12L, 11L, 18L,
19L, 16L, 13L, 10L, 14L, 19L, 11L, 12L, 25L, 9L, 15L, 19L, 15L,
19L, 15L, 17L, 11L, 11L, 17L, 9L, 11L, 19L, 16L, 19L, 17L, 10L,
17L, 14L, 12L, 15L, 15L, 12L, 14L, 10L, 13L, 10L, 9L, 12L, 18L,
15L, 20L, 17L, 13L, 10L, 14L, 13L, 17L, 15L, 14L, 20L, 16L, 10L,
11L, 9L, 17L, 15L, 15L, 9L, 18L, 12L, 14L, 10L, 16L, 12L, 12L,
16L, 11L, 15L, 8L, 13L, 15L, 13L, 19L, 19L, 15L, 17L, 10L, 8L,
10L, 12L, 10L, 17L, 15L, 19L, 13L, 15L, 17L, 13L, 15L, 13L, 11L,
16L, 12L, 16L, 16L, 16L, 15L, 9L, 13L, 10L, 11L, 14L, 14L, 13L,
14L, 10L, 13L, 12L, 15L, 22L, 14L, 22L, 22L, 9L, 17L, 15L, 8L,
9L, 19L, 14L, 10L, 13L, 16L, 13L, 12L, 15L, 10L, 22L, 14L, 15L
)), row.names = c(NA, -200L), groups = structure(list(corona = structure(1:2, .Label = c("Normal",
"C19"), class = "factor"), .rows = structure(list(1:100, 101:200), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -2L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
The main issue with your code is that you mapped seq_along(.cooksd) on x. To solve this issue simply prepare your data as a separate step so that all layers make use of the same data. Also, there is no need for subsetting and two geom_col. Only one is needed while mapping your var corona on color and fill and setting the colors via scale_color/fill_manual
library(ggplot2)
library(dplyr)

mod <- fortify(lm(n ~ cons_week * corona, data = lin.model))

# Build the date column in a separate, explicit step so that every layer
# draws from the same data (mapping seq_along(.cooksd) on x inside the
# individual geom_col() calls was the original problem).
mod1 <- mod %>%
  mutate(cons_week_dt = as.Date("2017-01-02") + cons_week * 7)

cols <- c(Normal = "#6DBCC3", C19 = "#8B3A62")

# The x axis is now a Date scale, so the reference line must be a Date too.
# A bare numeric 167 would be read as "days since 1970-01-01" and the line
# would be drawn far outside the plotted range.
covid_start <- as.Date("2017-01-02") + 167 * 7

ggplot(mod1, aes(x = cons_week_dt, y = .cooksd)) +
  # One geom_col() suffices: map corona on color/fill and set the palette
  # via scale_color/fill_manual instead of subsetting twice.
  geom_col(aes(color = corona, fill = corona)) +
  # Conventional influence cutoff: 4 / n observations.
  geom_hline(yintercept = 4 / nrow(lin.model), col = "black", lty = 2) +
  geom_vline(xintercept = covid_start, color = "red", lty = 2) +
  scale_color_manual(values = cols) +
  scale_fill_manual(values = alpha(cols, .2)) +
  scale_y_continuous(name = "Cook's Distance") +
  scale_x_date(name = "",
               date_breaks = "3 months", date_labels = "%B-%Y",
               expand = c(0.01, 0)) +
  guides(color = "none", fill = "none")
I have the following data with three main variables:
i) education (factor): represents three different level of education(1, 2, 3)
ii) share (numeric): represents the % of people in the country for each level of education
iii) country (factor): represents 30 countries.
The aim is to plot the share of education in each country by re-ordering the highest level of education (3) from the countries that have the lowest share in it to the ones that have the highest. The issue is that I lose the country labels by the end because I have to transform the variable to numeric in order to reorder it. After plotting the graph, countries are labelled as (25, 6, 26, 17) instead of the correct labels (PT, CZ, RO, IT). I have read different threads on Stack Overflow but none have solved the issue. Is there a way to keep the labels of countries after re-ordering so I don't have to enter them manually later on?
library(forcats)
library(ggplot2)
library(dplyr)
x$country = as.numeric(x$cntry2)
x$educ = as.integer(x$educ)
x$educ = as.factor(x$educ)
country_order <- x %>%
filter(educ == 3) %>%
mutate(country = fct_reorder(factor(country), share, .desc = FALSE)) %>%
pull(country) %>%
levels()
df2 <- x %>%
mutate(country = fct_relevel(factor(country), country_order))
ggplot(df2, aes(x=country, y=share)) +
geom_col(aes(fill=educ), color = "black") +
labs(fill= "Education") +
theme_classic() +
xlab("Country")
This is the data below:
structure(list(educ = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L,
1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c("1", "2",
"3"), class = "factor"), cntry2 = structure(c(1L, 1L, 1L, 2L,
2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L,
7L, 8L, 8L, 8L, 9L, 9L, 9L, 10L, 10L, 10L, 11L, 11L, 11L, 12L,
12L, 12L, 13L, 13L, 13L, 14L, 14L, 14L, 15L, 15L, 15L, 16L, 16L,
16L, 17L, 17L, 17L, 18L, 18L, 18L, 19L, 19L, 19L, 20L, 20L, 20L,
21L, 21L, 21L, 22L, 22L, 22L, 23L, 23L, 23L, 24L, 24L, 24L, 25L,
25L, 25L, 26L, 26L, 26L, 27L, 27L, 27L), .Label = c("AU", "BE",
"BG", "CH", "CZ", "DK", "EE", "ES", "FI", "FR", "GR", "HU", "IE",
"IS", "IT", "LT", "LU", "LV", "NL", "NO", "PO", "PT", "RO", "SE",
"SK", "SV", "UK"), class = "factor"), share = c(14.9585723390695,
64.8311026131294, 20.2103250478011, 20.3203525363306, 37.9050825638106,
41.7745648998589, 20.5482068669118, 58.6719831908696, 20.7798099422186,
11.0478359908884, 52.7334851936219, 36.2186788154898, 8.1806499751285,
77.2156358812801, 14.6037141435914, 18.43684842358, 44.6831364124597,
36.8800151639603, 13.0425889732285, 58.1996272896687, 28.7577837371029,
42.6625051189251, 24.1934234264148, 33.1440714546602, 16.4821228232769,
46.3050582898395, 37.2128188868836, 22.0117072122872, 47.7342785027657,
30.2540142849471, 31.6958715347475, 40.8370856615852, 27.4670428036673,
15.620426612099, 63.1486925776748, 21.2308808102263, 27.79203576455,
33.4878715125424, 38.7200927229075, 29.0666986564299, 41.950575815739,
28.9827255278311, 36.0270124068613, 47.1984225312789, 16.7745650618598,
8.20398339670027, 60.9892218075273, 30.8067947957724, 37.0050817095017,
37.4766935985084, 25.5182246919899, 15.7399902739504, 59.1482759419216,
25.111733784128, 19.2624176167015, 43.4944817814291, 37.2431006018693,
17.6501727404436, 44.6784798840967, 37.6713473754597, 10.0098831213475,
69.2849776555517, 20.7051392231007, 64.5019644095216, 21.5391726369309,
13.9588629535475, 21.8434913468774, 62.6661650363682, 15.4903436167545,
11.4840104928012, 55.3435190932938, 33.172470413905, 4.23006072183939,
74.1147574537763, 21.6551818243843, 15.6869892409901, 61.3851490387442,
22.9278617202657, 14.2357801080394, 49.3703276303246, 36.393892261636
), country = c(1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 6, 6, 6, 7,
7, 7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 12, 14,
14, 14, 15, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19,
19, 20, 20, 20, 22, 22, 22, 23, 23, 23, 24, 24, 24, 25, 25, 25,
26, 26, 26, 27, 27, 27, 28, 28, 28, 29, 29, 29, 30, 30, 30)), row.names = c(NA,
-81L), class = c("tbl_df", "tbl", "data.frame"))
Try this tropical approach:
library(ggplot2)
library(dplyr)
#Data
# Keep the original country code as a character label *before* cntry2 is
# converted to a numeric code, so the readable labels can be matched back
# onto the axis after reordering.
x$lab <- as.character(x$cntry2)
x$country = as.numeric(x$cntry2)
x$educ = as.integer(x$educ)
# integer -> factor so ggplot treats the education levels as a discrete fill.
x$educ = as.factor(x$educ)
Now, we store the labels:
#Labels
# Lookup table with one row per country: the numeric code and its original
# readable label. (`lab` is a pure function of `country`, so de-duplicating
# the two-column subset keeps exactly the first occurrence of each country.)
labs <- unique(x[, c("country", "lab")])
Then more data process:
#Data
# Derive the country ordering from the share of the highest education level
# (educ == 3): reorder the factor by share ascending, then extract the
# resulting level order. forcats:: is namespaced explicitly because only
# ggplot2 and dplyr are loaded above — without it these calls would fail
# unless forcats happened to be attached already.
country_order <- x %>%
  filter(educ == 3) %>%
  mutate(country = forcats::fct_reorder(factor(country), share, .desc = FALSE)) %>%
  pull(country) %>%
  levels()

# Apply the same level order to the full data set used for plotting.
df2 <- x %>%
  mutate(country = forcats::fct_relevel(factor(country), country_order))
And the plot:
#Plot
# Stacked education shares per country. The x axis holds the reordered
# numeric codes, so the tick labels are replaced with the stored country
# abbreviations, matched into the same order as the reordered levels
# (match() coerces country_order back to numeric for the lookup).
ggplot(df2, aes(x=country, y=share)) +
geom_col(aes(fill=educ), color = "black") +
labs(fill= "Education") +
theme_classic() +
xlab("Country")+
scale_x_discrete(labels=labs$lab[match(country_order,labs$country)])
Output:
I'm very new to R and am trying to facet_wrap raincloud plots. I am trying to facet_wrap by Hypothesis chosen (which has been binary coded), so ideally would like to plot proportion of confirmatory and disconfirmatory leads chosen by hypothesis.
Here is what I have so far:
my_data2 <- melt(my_data, id.vars = c("ID"),
measure.vars = c("Proportion.of.Disconfirmatory.Leads.Chosen","Proportion.of.Confirmatory.Leads.Chosen", "Hypothesis"),
variable.name = "Leads", "Hyp",
value.name = "Proportion")
plot3 <- ggplot(data = my_data2, aes(y = Proportion, x = Leads, fill = Leads)) +
geom_flat_violin(position = position_nudge(x = .2, y = 0), alpha = .8) +
geom_point(aes(y = Proportion, color = Leads), position = position_jitter(width = .15), size = .5, alpha = 0.8) +
geom_boxplot(width = .1, guides = FALSE, outlier.shape = NA, alpha = 0.5) +
facet_wrap(vars(Hypothesis), nrow = 2)+
expand_limits(x = 5.25) +
guides(fill = FALSE) +
guides(color = FALSE) +
scale_color_brewer(palette = "Spectral") +
scale_fill_brewer(palette = "Spectral") +
coord_flip() +
theme_bw()
plot3
However, I am receiving this error:
"Error: At least one layer must contain all faceting variables: `Hypothesis`.
* Plot is missing `Hypothesis`
* Layer 1 is missing `Hypothesis`
* Layer 2 is missing `Hypothesis`
* Layer 3 is missing `Hypothesis`
* Layer 4 is missing `Hypothesis`"
> dput(my_data)
structure(list(ID = c(2L, 5L, 23L, 34L, 35L, 48L, 53L, 59L, 71L,
76L, 1L, 3L, 4L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L,
17L, 18L, 19L, 20L, 21L, 22L, 24L, 25L, 26L, 27L, 28L, 29L, 30L,
31L, 32L, 33L, 36L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L, 45L,
46L, 47L, 49L, 50L, 51L, 52L, 54L, 55L, 56L, 57L, 58L, 60L, 61L,
62L, 63L, 64L, 65L, 66L, 67L, 68L, 69L, 70L, 72L, 73L, 74L, 75L,
78L), Hypothesis = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), Sum.of.Disconfirmatory.Leads.Chosen = c(9L, 7L, 0L,
3L, 4L, 1L, 2L, 3L, 6L, 3L, 2L, 3L, 5L, 3L, 4L, 3L, 3L, 5L, 0L,
5L, 5L, 1L, 4L, 5L, 6L, 4L, 5L, 2L, 6L, 4L, 6L, 1L, 4L, 4L, 8L,
3L, 4L, 2L, 5L, 2L, 4L, 7L, 1L, 1L, 2L, 3L, 5L, 2L, 5L, 8L, 0L,
5L, 4L, 7L, 3L, 4L, 6L, 1L, 1L, 4L, 4L, 8L, 7L, 3L, 4L, 6L, 2L,
5L, 2L, 5L, 5L, 8L, 2L, 4L, 5L, 7L), Sum.of.Confirmatory.Leads.Chosen = c(5L,
2L, 2L, 2L, 8L, 3L, 4L, 5L, 4L, 2L, 4L, 6L, 3L, 7L, 4L, 3L, 2L,
3L, 3L, 7L, 4L, 5L, 2L, 3L, 6L, 4L, 9L, 6L, 5L, 5L, 1L, 1L, 3L,
6L, 6L, 3L, 7L, 1L, 2L, 3L, 6L, 8L, 2L, 2L, 6L, 9L, 5L, 6L, 5L,
4L, 6L, 6L, 2L, 3L, 2L, 5L, 6L, 4L, 5L, 4L, 5L, 4L, 5L, 7L, 4L,
5L, 4L, 4L, 3L, 5L, 5L, 7L, 6L, 4L, 3L, 7L), Proportion.of.Disconfirmatory.Leads.Chosen = c(64.28571429,
77.77777778, 0, 60, 33.33333333, 25, 33.33333333, 37.5, 60, 60,
33.33333333, 33.33333333, 62.5, 30, 50, 50, 60, 62.5, 0, 41.66666667,
55.55555556, 16.66666667, 66.66666667, 62.5, 50, 50, 35.71428571,
25, 54.54545455, 44.44444444, 85.71428571, 50, 57.14285714, 40,
57.14285714, 50, 36.36363636, 66.66666667, 71.42857143, 40, 40,
46.66666667, 33.33333333, 33.33333333, 25, 25, 50, 25, 50, 66.66666667,
0, 45.45454545, 66.66666667, 70, 60, 44.44444444, 50, 20, 16.66666667,
50, 44.44444444, 66.66666667, 58.33333333, 30, 50, 54.54545455,
33.33333333, 55.55555556, 40, 50, 50, 53.33333333, 25, 50, 62.5,
50), Proportion.of.Confirmatory.Leads.Chosen = c(35.71428571,
22.22222222, 100, 40, 66.66666667, 75, 66.66666667, 62.5, 40,
40, 66.66666667, 66.66666667, 37.5, 70, 50, 50, 40, 37.5, 100,
58.33333333, 44.44444444, 83.33333333, 33.33333333, 37.5, 50,
50, 64.28571429, 75, 45.45454545, 55.55555556, 14.28571429, 50,
42.85714286, 60, 42.85714286, 50, 63.63636364, 33.33333333, 28.57142857,
60, 60, 53.33333333, 66.66666667, 66.66666667, 75, 75, 50, 75,
50, 33.33333333, 100, 54.54545455, 33.33333333, 30, 40, 55.55555556,
50, 80, 83.33333333, 50, 55.55555556, 33.33333333, 41.66666667,
70, 50, 45.45454545, 66.66666667, 44.44444444, 60, 50, 50, 46.66666667,
75, 50, 37.5, 50)), class = "data.frame", row.names = c(NA, -76L
))
> head(my_data)
ID Hypothesis Sum.of.Disconfirmatory.Leads.Chosen Sum.of.Confirmatory.Leads.Chosen
1 2 0 9 5
2 5 0 7 2
3 23 0 0 2
4 34 0 3 2
5 35 0 4 8
6 48 0 1 3
Proportion.of.Disconfirmatory.Leads.Chosen Proportion.of.Confirmatory.Leads.Chosen
1 64.28571 35.71429
2 77.77778 22.22222
3 0.00000 100.00000
4 60.00000 40.00000
5 33.33333 66.66667
6 25.00000 75.00000
I suspect that I have introduced the variable Hypothesis incorrectly in the code, however I have no idea where or how! I have tried to include it in sumld however am receiving this error when doing so:
Error in fs[[1]](x, ...) : attempt to apply non-function
Thank you all in advance for your help.
Hypothesis is not an independent column in your melted data frame my_data2, and cannot be used for faceting. You have included Hypothesis in measure.vars and it has been converted to one of the categories of Leads in my_data2.
my_data2 %>% group_by(Leads) %>% summarize(n=n())
# A tibble: 3 x 2
Leads n
<fct> <int>
1 Proportion.of.Disconfirmatory.Leads.Chosen 76
2 Proportion.of.Confirmatory.Leads.Chosen 76
3 Hypothesis 76
If you want to use it for faceting, include Hypothesis in id.vars instead.
# Keep Hypothesis as an id variable so it survives the melt as its own
# column (usable for faceting); only the two proportion columns are stacked
# into the Leads/Proportion long format.
my_data2 <- melt(my_data, id.vars = c("ID", "Hypothesis"),
measure.vars = c("Proportion.of.Disconfirmatory.Leads.Chosen",
"Proportion.of.Confirmatory.Leads.Chosen"),
variable.name = "Leads",
value.name = "Proportion")
head(my_data2)
ID Hypothesis Leads Proportion
1 2 0 Proportion.of.Disconfirmatory.Leads.Chosen 64.28571
2 5 0 Proportion.of.Disconfirmatory.Leads.Chosen 77.77778
3 23 0 Proportion.of.Disconfirmatory.Leads.Chosen 0.00000
4 34 0 Proportion.of.Disconfirmatory.Leads.Chosen 60.00000
5 35 0 Proportion.of.Disconfirmatory.Leads.Chosen 33.33333
6 48 0 Proportion.of.Disconfirmatory.Leads.Chosen 25.00000
Now you can use Hypothesis for faceting:
library(ggplot2)
# geom_flat_violin() is not part of ggplot2; it is sourced from this gist.
source("https://gist.githubusercontent.com/benmarwick/2a1bb0133ff568cbe28d/raw/fb53bd97121f7f9ce947837ef1a4c65a73bffb3f/geom_flat_violin.R")

# Raincloud plot: half-violin + jittered points + narrow boxplot per Leads
# category, faceted by the (now independent) Hypothesis column.
plot3 <- ggplot(data = my_data2, aes(y = Proportion, x = Leads, fill = Leads)) +
  geom_flat_violin(position = position_nudge(x = .2, y = 0), alpha = .8) +
  geom_point(aes(y = Proportion, color = Leads),
             position = position_jitter(width = .15), size = .5, alpha = 0.8) +
  # `guides` is not an argument of geom_boxplot() (it was silently ignored
  # with a warning), so it has been dropped; legends are suppressed via
  # guides() below instead.
  geom_boxplot(width = .1, outlier.shape = NA, alpha = 0.5) +
  facet_wrap(~Hypothesis, nrow = 2) +
  expand_limits(x = 5.25) +
  # "none" is the current (non-deprecated) way to remove a guide.
  guides(fill = "none", color = "none") +
  scale_color_brewer(palette = "Spectral") +
  scale_fill_brewer(palette = "Spectral") +
  coord_flip() +
  theme_bw()
plot3
Edited: Solution to follow-up question on how to modifying variable names on axis labels. One approach is by factoring variable and assigning labels to factors.
# Relabel the Leads factor for readable axis text. The labels vector must be
# in the SAME order as the levels vector: the original answer paired the
# "Disconfirmatory" level with the "Confirmatory" label (and vice versa),
# which silently mislabelled the two groups on the plot. Fixed here.
my_data2$Leads <- factor(my_data2$Leads,
                         levels = c("Proportion.of.Disconfirmatory.Leads.Chosen",
                                    "Proportion.of.Confirmatory.Leads.Chosen"),
                         labels = c("Proportion of Disconfirmatory Leads Chosen",
                                    "Proportion of Confirmatory Leads Chosen"))
Rerun ggplot code to produce this:
I am trying to display percentages in ggplot2 using geom_line and geom_point.
My code is:
print(ggplot(data=dfComb, aes(x=hour_adj, y=(..count..)/sum(..count..), group=word)) +
geom_line(aes(colour=dfComb$word)) +
geom_point(aes(colour=dfComb$word))
+ ggtitle(paste("Hourly Frequencies of Tweets")) +
xlab("Hour of Day") +
ylab("Count") +
scale_colour_discrete("Word", breaks=c("A","B"), labels=c("yid", "abbo")) +
scale_y_continuous(labels = scales::percent)
)
This errors:
Error in FUN(X[[i]], ...) : object 'count' not found
because the ..count.. variable is only created by geom_histogram (I think!) and not geom_line.
Is there an easy way to use percentages with geom_line?
FYI: EDIT, my data is:
dput(dfComb)
structure(list(hour_adj = c(0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L,
22L, 23L, 0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L,
13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L), count = c(44,
24, 22, 36, 26, 18, 39, 35, 50, 46, 46, 41, 57, 49, 34, 56, 54,
54, 49, 45, 36, 49, 43, 47, 35, 20, 18, 10, 10, 25, 25, 26, 32,
25, 29, 39, 37, 45, 52, 43, 46, 67, 38, 69, 108, 80, 73, 48),
word = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("A", "B"), class = "factor")), .Names = c("hour_adj",
"count", "word"), row.names = c(NA, -48L), class = "data.frame")
You can calculate percentage in the dataframe first.
Also, as per Roman Lustrik's comment, it's better to call variable by name from within aes().
library(dplyr)
# ggplot2 was not loaded anywhere in this answer; make the snippet
# self-contained.
library(ggplot2)

# sample data
set.seed(1)
dfComb <- data.frame(hour_adj = rep(0:4, 2),
                     count = sample(10:50, 10, replace = TRUE),
                     word = c(rep("A", 5), rep("B", 5)))

# geom_line() has no ..count.. statistic (that is computed by stat_count /
# stat_bin geoms), so each hour's share of its word's total is computed up
# front as an ordinary column.
ggplot(dfComb %>%
         group_by(word) %>%
         mutate(perc = count / sum(count)) %>%
         ungroup(),
       aes(x = hour_adj, y = perc, group = word, colour = word)) +
  geom_line() +
  geom_point() +
  ggtitle(paste("Hourly Frequencies of Tweets")) +
  xlab("Hour of Day") +
  # The axis shows shares, not raw counts, so label it accordingly.
  ylab("Percentage") +
  scale_colour_discrete("Word", breaks = c("A", "B"), labels = c("yid", "abbo")) +
  scale_y_continuous(labels = scales::percent)
I am trying to plot count v/s month
ggplot(dat, aes(x=month, y=count,group=region)) +
geom_line(data=mcount[mcount$region == "West coast", ],colour="black",stat="identity", position="dodge")+
geom_point(data=mcount[mcount$region == "West coast", ],colour="black", size=2, shape=21, fill="white")+
theme_bw()+
theme(legend.key = element_rect(colour = "black")) +
guides(fill = guide_legend(override.aes = list(colour = NULL)))+
ggsave("test.png",width=6, height=4,dpi=300)
But I want to order the months chronologically from Jan to Dec. How can I do this short of writing all the months out?
dput
structure(list(region = structure(c(6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L, 3L, 3L,
3L), .Label = c("West coast", "Arizona", "Front range", "Flash flood alley",
"Mississippi valley", "Appalachians"), class = "factor"), month = structure(c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 4L, 12L, 11L,
5L, 2L, 9L, 8L, 6L, 10L, 3L, 7L, 8L, 10L, 5L, 1L, 6L, 7L, 4L,
6L, 8L, 2L, 1L, 7L, 5L, 3L, 11L, 12L, 9L, 10L, 2L, 7L, 3L, 6L,
12L, 11L, 10L, 9L, 4L, 1L, 11L, 4L, 2L, 1L, 12L, 9L, 3L, 8L,
5L, 6L, 10L, 7L, 5L, 8L, 11L, 12L, 4L, 3L, 9L, 2L), .Label = c("Apr",
"Dec", "Oct", "Mar", "May", "Jul", "Sep", "Jun", "Nov", "Aug",
"Jan", "Feb"), class = "factor"), count = c(566, 545, 427, 751,
357, 399, 568, 433, 454, 347, 511, 251, 267, 207, 167, 142, 417,
109, 117, 373, 207, 130, 125, 145, 7, 14, 2, 2, 7, 3, 107, 74,
135, 48, 80, 53, 117, 125, 59, 53, 103, 30, 21, 18, 8, 22, 26,
37, 20, 5, 11, 1, 96, 29, 109, 8, 33, 53, 6, 1, 5, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0)), .Names = c("region", "month", "count"), row.names = c(NA,
-72L), class = c("data.table", "data.frame"))
Use the built-in month.name or month.abb variable to specify the levels of your factor in the correct order. In this case, you have abbreviations so month.abb is appropriate.
# month.abb is a built-in constant holding the English month abbreviations in
# calendar order, so the factor levels (and hence the axis order) run
# Jan..Dec without typing them all out.
your_data$month = factor(your_data$month, levels = month.abb)
I think creating the factor in the correct order is the best way, but you can also just order the axis using the limits argument of the discrete scale (see ?discrete_scale for more info).
+ scale_x_discrete(limits = month.abb)
Locales
If you are in a non-English locale, you can construct your own month name constants with a little date formatting (basically stolen from Brian Ripley in this R-Help thread):
# Month names and abbreviations in the *current* locale: format the first day
# of each month of an arbitrary year using the full ("%B") and abbreviated
# ("%b") month-name conversion specifications.
month.name.loc <- format(ISOdate(2004, 1:12, 1), "%B")
month.abb.loc <- format(ISOdate(2004, 1:12, 1), "%b")
If you want to use month names/abbreviations from a different locale than you're in, the withr package is useful.