Related
I'm working on a heatmap and following along this tutorial:
https://www.r-graph-gallery.com/283-the-hourly-heatmap/
To save a click, here's the code block to reproduce:
library(ggplot2)
library(dplyr) # easier data wrangling
library(viridis) # colour blind friendly palette, works in B&W also
library(Interpol.T) # will generate a large dataset on initial load
library(lubridate) # for easy date manipulation
library(ggExtra) # because remembering ggplot theme options is beyond me
library(tidyr)
# Load the hourly temperature data shipped with Interpol.T. data() is called
# only for its side effect of loading objects (h_d_t and the helpers removed
# below); its return value is just the dataset name, so it is not assigned.
data(Trentino_hourly_T, package = "Interpol.T")
names(h_d_t)[1:5] <- c("stationid", "date", "hour", "temp", "flag")

# Keep a single station and derive the calendar fields used by the plot.
df <- as_tibble(h_d_t) %>%
  filter(stationid == "T0001") %>%
  mutate(
    year = year(date),
    month = month(date, label = TRUE),
    day = day(date)
  )
df$date <- ymd(df$date) # not necessary for plot but
# useful if you want to do further work with the data

# cleanup
rm(list = c(
  "h_d_t", "mo_bias", "Tn", "Tx",
  "Th_int_list", "calibration_l",
  "calibration_shape", "Tm_list"
))

# create plotting df
df <- df %>% select(stationid, day, hour, month, year, temp)
Then a heatmap is made:
# Hourly temperature heatmap: one tile per day/hour, one facet per
# year/month, a reversed hour axis and a break at every hour in the data.
p <- ggplot(df, aes(x = day, y = hour, fill = temp)) +
  geom_tile(color = "white", size = 0.1) +
  scale_fill_viridis(name = "Hrly Temps C", option = "C") +
  facet_grid(year ~ month) +
  scale_y_continuous(trans = "reverse", breaks = unique(df$hour))
So far so good, I can recreate this. However my own dataset is website visit data at the visit level, so many visits in a given day and hour. In addition to visits I also have a timeOnPage metric.
Sample of data below with dput.
I would like to heatmap the average hourly visits or timeOnPage. Here's what I tried.
Sample of my data:
> dput(sam)
structure(list(Day = structure(c(4L, 4L, 4L, 5L, 3L, 2L, 3L,
6L, 2L, 2L, 4L, 2L, 3L, 3L, 6L, 1L, 4L, 2L, 3L, 5L, 2L, 5L, 4L,
2L, 5L, 2L, 7L, 5L, 6L, 2L, 2L, 6L, 4L, 6L, 2L, 2L, 2L, 5L, 5L,
2L, 6L, 5L, 3L, 5L, 3L, 2L, 6L, 4L, 2L, 5L, 2L, 5L, 4L, 2L, 6L,
2L, 7L, 2L, 2L, 2L, 5L, 6L, 3L, 2L, 3L, 4L, 4L, 3L, 6L, 2L, 5L,
3L, 4L, 4L, 3L, 2L, 5L, 5L, 5L, 3L, 5L, 2L, 4L, 5L, 5L, 2L, 3L,
6L, 2L, 2L, 5L, 4L, 6L, 7L, 3L, 3L, 4L, 4L, 2L, 6L), .Label = c("Sun",
"Mon", "Tues", "Wed", "Thurs", "Fri", "Sat"), class = c("ordered",
"factor")), Hour = c(18L, 7L, 3L, 22L, 11L, 11L, 9L, 16L, 16L,
13L, 18L, 18L, 10L, 19L, 7L, 13L, 18L, 14L, 10L, 20L, 17L, 6L,
21L, 15L, 18L, 7L, 12L, 10L, 16L, 14L, 18L, 13L, 17L, 10L, 19L,
20L, 14L, 16L, 10L, 9L, 16L, 9L, 8L, 13L, 17L, 17L, 11L, 15L,
22L, 17L, 18L, 17L, 7L, 19L, 12L, 2L, 12L, 15L, 7L, 17L, 17L,
18L, 13L, 10L, 19L, 9L, 13L, 13L, 17L, 21L, 23L, 4L, 17L, 12L,
12L, 9L, 17L, 19L, 7L, 4L, 5L, 17L, 6L, 23L, 3L, 14L, 19L, 13L,
7L, 11L, 9L, 13L, 9L, 19L, 11L, 5L, 20L, 20L, 19L, 11L), sessionID = c("1508980591045.l027p6mt",
"1510155616668.57i2wj1", "1510140439620.qu19kyo", "1510296404412.xasqfwqd10v1qdtl6jemi",
"1510082622485.szj2ja1e", "1511204933263.mq9bvi0d", "1511285142249.vp2fyfd9",
"1510965282725.x04h1dko", "1508801295434.e056cpef", "1508790369346.ly63bjgr",
"1509585154520.3usd036k", "1511834881064.e6f5evp", "1509471114265.2u807dwo",
"1507688054076.9dls0jk", "1509721031589.ho125mpb", "1510521845178.99j1ibkr",
"1510194555297.ioepfjgr", "1508793469455.hkc3xwa8", "1511288175700.62n5oc5",
"1510287319653.7ye9sjc", "1511227016523.yyn1of99", "1511448209341.1u5vir5p",
"1510205972493.qvu4ev7o", "1510615247987.swxhwct", "1508463701266.p52sdjzp",
"1510588449881.d6ffruv9", "1507404213416.rovwmmge", "1510857718956.2z57w2vr",
"1510360661780.19hznp3m78pvi", "1511820500742.48cyvo2a", "1508809029952.up0wqq5h",
"1508533120441.gdvhacjr7jswiquwuyp66r", "1509583258224.j8krac0sz5kx8pxohl4n29",
"1511549442901.5vm7na1l", "1508811367845.7b36epqk", "1509421407861.om0ydylt",
"1508794534361.p3gcoa0e", "1510877729807.viad220f", "1511460355269.omwvd00l",
"1508775703610.usuk2akm", "1510964376869.7e2crw9d", "1510247098808.np9ia23",
"1508860753512.3z4182b", "1510868797935.3nmpvkri", "1510105270807.4evhpys",
"1511831565084.27izf13f", "1510340973580.l9qj5drou5wmi", "1508364715184.14l4ikj",
"1509426566404.9qnp0m3", "1510275972333.hhqu0exc", "1510625679744.jk3vvt1v",
"1510881839700.c34skful", "1511365134270.57thqyir", "1509416741055.1f2cnmrp",
"1509738404263.8ajwpij", "1510570338116.h9a5j88", "1511640706961.qw8q1eh",
"1510011913201.eqd54kw", "1508769010911.wrpb329", "1508803518777.56b2ej2l",
"1509670743316.yhncp17j", "1511576965410.y47g0wgj", "1508876390209.wem8i3lh",
"1508779846415.hyx8qar", "1511322782502.s835px9", "1509554323957.osxgi0em",
"1510176829762.jncm9xwb", "1509482328620.sqdbob0u", "1508545652936.a5hqcmp1fw29",
"1508817816447.6mbdldxb", "1510297785623.33i6yhko", "1508843299131.3m26sqf5",
"1510191633431.cl5fh9ik", "1509565114633.bd5yrkf5", "1510690660714.818yxn5o",
"1507567660773.ybpbfgn", "1509667501973.1a9f9pyp", "1509674601865.yqvmcclv",
"1511450423709.s149r25q", "1511267096892.n5u1d0nv", "1509624499459.u57lgtt8",
"1510019204298.ka4w9kfh", "1511362131909.t26h6ig", "1510904968660.eowoea2q",
"1510225256391.4dk073ej", "1510006654569.reo2eili", "1509501692686.ng48bwnz",
"1509741958143.bxbf325r", "1508770633217.33ymrfgc", "1511810438817.zcgpr6vj",
"1510852180447.wywsj7f", "1510176833767.nev0iaec", "1509727547082.53van2sr",
"1507430914148.niu297m", "1508868705810.akd7r18h", "1510060231388.mz9ojf6g",
"1509592760232.qtrlxye8", "1509592651211.1r82ucw4", "1508812928318.f3st4004",
"1509734102140.leol1dnw"), uniquePageviews = c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), timeOnPage = c(359, 149, 69, 146, 147, 119, 168, 69, 29, 0,
1542, 148, 242, 49, 457, 175, 175, 97, 79, 12, 0, 1141, 150,
236, 74, 128, 23, 147, 172, 223, 225, 88, 69, 156, 0, 49, 110,
150, 70, 123, 30, 145, 1629, 1, 119, 169, 48, 136, 529, 130,
149, 124, 281, 2483, 0, 60, 149, 50, 29, 124, 149, 0, 92, 149,
915, 47, 50, 89, 143, 84, 129, 147, 138, 80, 33, 226, 70, 146,
177, 98, 150, 32, 148, 149, 12, 338, 146, 204, 149, 148, 26,
149, 1110, 148, 23, 151, 0, 100, 0, 28)), row.names = c(20219L,
42612L, 42149L, 46707L, 40122L, 57449L, 60878L, 56707L, 11725L,
10102L, 29911L, 71743L, 25952L, 1492L, 35570L, 48411L, 43917L,
10530L, 61004L, 46446L, 58846L, 65695L, 44287L, 49341L, 2999L,
48502L, 627L, 54118L, 48148L, 70166L, 13346L, 4770L, 29745L,
67979L, 13832L, 24814L, 10692L, 54744L, 65995L, 8216L, 56683L,
44920L, 18121L, 54499L, 41155L, 71353L, 47606L, 1900L, 25023L,
45811L, 49937L, 54904L, 63607L, 24571L, 36060L, 48479L, 69086L,
37708L, 7353L, 12117L, 33912L, 68752L, 19081L, 8768L, 62647L,
28317L, 43172L, 26286L, 6359L, 14907L, 46733L, 16418L, 43797L,
28637L, 51671L, 1273L, 33677L, 34226L, 65759L, 60247L, 31739L,
38171L, 63497L, 55589L, 44462L, 37454L, 27141L, 36178L, 7543L,
69636L, 54030L, 43173L, 35743L, 852L, 18784L, 39283L, 30672L,
30663L, 14142L, 35933L), class = "data.frame", .Names = c("Day",
"Hour", "sessionID", "uniquePageviews", "timeOnPage"))
It looks like this:
> head(sam)
Day Hour sessionID uniquePageviews timeOnPage
20219 Wed 18 1508980591045.l027p6mt 1 359
42612 Wed 7 1510155616668.57i2wj1 1 149
42149 Wed 3 1510140439620.qu19kyo 1 69
46707 Thurs 22 1510296404412.xasqfwqd10v1qdtl6jemi 1 146
40122 Tues 11 1510082622485.szj2ja1e 1 147
57449 Mon 11 1511204933263.mq9bvi0d 1 119
> glimpse(sam)
Observations: 100
Variables: 5
$ Day <ord> Wed, Wed, Wed, Thurs, Tues, Mon, Tues, Fri, Mon, Mon, Wed, Mon, Tues, Tues, Fri, Sun, Wed, M...
$ Hour <int> 18, 7, 3, 22, 11, 11, 9, 16, 16, 13, 18, 18, 10, 19, 7, 13, 18, 14, 10, 20, 17, 6, 21, 15, 1...
$ sessionID <chr> "1508980591045.l027p6mt", "1510155616668.57i2wj1", "1510140439620.qu19kyo", "1510296404412.x...
$ uniquePageviews <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
$ timeOnPage <dbl> 359, 149, 69, 146, 147, 119, 168, 69, 29, 0, 1542, 148, 242, 49, 457, 175, 175, 97, 79, 12, ...
Metric uniquePageviews will always be 1 or 0, and in a heatmap it doesn't look great. Since it's session-level data there are multiple entries for each day / hour. For timeOnPage I would like to heatmap the mean time on page for a given hour and day of week combination.
So, as far as I can tell ggplot is summing everything whereas I want mean().
My initial code block:
# creates the initial heatmap: the session-level rows of sam are passed to
# geom_tile() as they are (no aggregation is requested in this call)
# NOTE(review): fill is mapped to uniquePageviews while the legend title
# below reads "TimeOnPage" - confirm which metric is intended
p <- ggplot(sam, aes(x = Day, y = Hour, fill = uniquePageviews)) +
geom_tile(color = "white", size = 0.1) +
scale_fill_viridis(name = "TimeOnPage", option ="C")
# order by hour of day going top to bottom asc
# NOTE(review): the breaks come from df$hour (the tutorial data frame), not
# from sam$Hour - confirm this is intended
p <-p + scale_y_continuous(trans = "reverse", breaks = unique(df$hour))
I tried changing it to this but the results look the exact same:
# gets the initial heatmap, this time trying to request a mean
# NOTE(review): stat and fun.y are given to ggplot() itself rather than to
# the geom_tile() layer; the reported result is a plot identical to the first
# attempt, so they presumably have no effect here - verify against the
# ggplot() documentation
p <- ggplot(sam, aes(x = Day, y = Hour, fill = uniquePageviews),
stat = "summary", fun.y = "mean") +
geom_tile(color = "white", size = 0.1) +
scale_fill_viridis(name = "Mean TimeOnPage", option ="C")
# order by hour of day going top to bottom asc
# NOTE(review): the breaks come from df$hour (the tutorial data frame), not
# from sam$Hour - confirm this is intended
p <-p + scale_y_continuous(trans = "reverse", breaks = unique(df$hour))
I could do some dplyr group by transformations on the dataframe sam but I was not sure if ggplot::geom_tile() takes care of that or not?
How can I create a heatmap with ggplot where the fill is based on mean? Also, can someone clarify what exactly it's showing now? Total sum?
Not sure if I get your problem but you can try following:
library(tidyverse)
library(viridis)
# Average timeOnPage for every Day/Hour cell of the question's data frame
# (sam), then draw one tile per cell coloured by that mean.
sam %>%
  group_by(Day, Hour) %>%
  summarise(Mean = mean(timeOnPage)) %>%
  # summarise() only peels off the last grouping level; drop the rest so a
  # plain (ungrouped) summary is handed to ggplot
  ungroup() %>%
  ggplot(aes(x = Day, y = Hour, fill = Mean)) +
  geom_tile(color = "white", size = 0.1) +
  scale_fill_viridis(name = "TimeOnPage", option = "C")
This will calculate the mean timeOnPage per Day and Hour and plot it as a heatmap.
I'm trying to figure out what I'm doing wrong passing arguments to ggplot. I've come a long way with existing posts, but have hit a wall here. Probably something stupid, but here goes (I'm leaving out some of the plot formatting since that is not where the problem is):
melted data set "lagres" is the same in both scenarios.
> str(lagres)
'data.frame': 30 obs. of 4 variables:
$ ST : Factor w/ 3 levels
$ year : Factor w/ 6 levels
$ variable: Factor w/ 2 levels
$ value : num
The first plotting call works great:
# Reference call with unquoted column names: lines are grouped by the
# ERTp/variable combination, with linetype and shape following variable and
# colour following ERTp. No geoms are added in this excerpt.
ggplot(lagres, aes(quarter, value, group = interaction(ERTp, variable), linetype = variable, color = ERTp, shape = variable ))
Trying to convert this to accept arguments and be re-used in a for-loop script does NOT work, even though the structure is really the same:
# Column names are held as strings so the plot can be driven from a loop.
timevar <- "quarter"
grpvar <- "ERTp"
# Parameterised version of the plot: timevar and grpvar are column names
# given as strings, ylb is the y-axis label and tlb is used in the title.
# NOTE(review): the closing brace of this function is missing in the excerpt.
fplot <- function(lagres, timevar, grpvar, ylb, tlb){
# NOTE(review): interaction() is called here on two character strings, so it
# is evaluated on the spot instead of being mapped to the data columns; the
# str() output quoted for this mapping shows a 1-level factor for group.
plot <- ggplot(lagres, aes_string(x=timevar, y="value", group = interaction("variable", grpvar), linetype = "variable", color = grpvar, shape = "variable")) +
geom_line(size = 0.5) + geom_point(size = 3) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) + labs(y = ylb) +
# NOTE(review): `today` is not defined anywhere in this excerpt - confirm
# where it comes from
ggtitle(paste(tlb, grpvar, today, sep = ", ")) +
theme(plot.title = element_text(lineheight = .8, face = "bold", hjust = 0.5))
# NOTE(review): ylb and tlb are not supplied in this call
fplot(lagres, timevar, grpvar)
Error: geom_path: If you are using dotted or dashed lines, colour,
size and linetype must be constant over the line
The problem seems to lie with the "linetype" arg, as removing this results in an appropriate graph in terms of values/colors, but the lines connected wrong and obviously no separate line for each variable/grp.
Trying to analyze the problem further by looking at the structure of the argument, it looks like aes() and aes_string() parse the group interaction differently. Maybe this is the problem. Parsing the "aes()" formulation with raw variables, I get:
> str(aes(quarter, value, group = interaction(ERTp, variable), linetype = variable, color = ERTp, shape = variable ))
List of 6
$ x : symbol quarter
$ y : symbol value
$ group : language interaction(ERTp, variable)
$ linetype: symbol variable
$ colour : symbol ERTp
$ shape : symbol variable
Then, the "aes_string()" method with referenced arguments:
> str(aes_string(timevar, "value", group = interaction(grpvar, "variable"), linetype = "variable", color = grpvar, shape = "variable" ))
List of 6
$ group : Factor w/ 1 level "ST.variable": 1
$ linetype: symbol variable
$ colour : symbol ST
$ shape : symbol variable
$ x : symbol quarter
$ y : symbol value
So, having the group be either a "language interaction" vs. a 1-level factor, would make a difference? Can't figure out what to do about that parsing issue so the group interaction comes out properly. Saw somewhere that "paste()" could be used, but, no, that does not work. Passing ALL arguments (thus, no quoted text in the aes_string() formula) does not help either.
> dput(lagres)
structure(list(ST = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 3L, 1L, 2L,
3L, 2L, 3L, 1L, 3L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c("GeraghtyM",
"Other", "WeenJ"), class = "factor"), quarter = structure(c(1L,
1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L,
6L, 7L, 7L, 7L, 1L, 2L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L,
6L, 7L, 7L, 7L), .Label = c("2015-Q2", "2015-Q3", "2015-Q4",
"2016-Q1", "2016-Q2", "2016-Q3", "2016-Q4"), class = "factor"),
variable = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("ScanLag",
"TPADoorToLag"), class = "factor"), value = c(45.3333333333333,
60.2857142857143, 37.6, 0, 51.375, 95.4166666666667, 26.8,
42.75, 200, 28, 134, 68.2941176470588, 29, 42.8, 140.7, 0,
49.2222222222222, 103.833333333333, 0, 20.125, 0, 67.75,
48, 87, 93, 78, 49.5, 55, 65.6, 83, 59, 54, 153, 114, 111,
83, 8.66666666666667)), .Names = c("ST", "quarter", "variable",
"value"), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 24L, 25L,
26L, 27L, 29L, 30L, 31L, 33L, 35L, 36L, 37L, 38L, 39L, 40L, 41L,
42L), class = "data.frame", na.action = structure(c(22L, 23L,
28L, 32L, 34L), .Names = c("22", "23", "28", "32", "34"), class = "omit"))
aes_string isn't reading the interaction code that you are using. One way to avoid this is to simply make a new "interaction" variable in your dataset within the function prior to plotting.
For example:
# Line-and-point plot of `value`, with one line per variable/group
# combination.
#
# lagres:  melted data frame with at least the columns "variable" and "value"
# timevar: name (string) of the column mapped to the x axis
# grpvar:  name (string) of the column mapped to colour
#
# The combined grouping factor is stored in the data as column "combine" so
# that aes_string() can refer to it by name like every other column.
# Returns the ggplot object.
fplot <- function(lagres, timevar, grpvar) {
  lagres[["combine"]] <- interaction(lagres[["variable"]], lagres[[grpvar]])
  mapping <- aes_string(
    x = timevar, y = "value",
    group = "combine", linetype = "variable",
    color = grpvar, shape = "variable"
  )
  ggplot(lagres, mapping) +
    geom_line(size = 0.5) +
    geom_point(size = 3)
}
I am trying to plot count vs. month:
# Monthly counts for the West coast region, drawn as a line with open points.
# Both layers use the same subset of mcount, so it is computed once and used
# as the plot data.
west_coast <- mcount[mcount$region == "West coast", ]

p <- ggplot(west_coast, aes(x = month, y = count, group = region)) +
  geom_line(colour = "black") +
  geom_point(colour = "black", size = 2, shape = 21, fill = "white") +
  theme_bw() +
  theme(legend.key = element_rect(colour = "black")) +
  guides(fill = guide_legend(override.aes = list(colour = NULL)))
print(p)

# ggsave() is a function call, not a plot layer: it is run on the finished
# plot and told explicitly which plot to write, instead of being added
# with `+` (where it would save whatever plot was displayed last).
ggsave("test.png", plot = p, width = 6, height = 4, dpi = 300)
But I want to order the months chronologically from Jan to Dec. How can I do this short of writing all the months out?
dput
structure(list(region = structure(c(6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L, 3L, 3L,
3L), .Label = c("West coast", "Arizona", "Front range", "Flash flood alley",
"Mississippi valley", "Appalachians"), class = "factor"), month = structure(c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 4L, 12L, 11L,
5L, 2L, 9L, 8L, 6L, 10L, 3L, 7L, 8L, 10L, 5L, 1L, 6L, 7L, 4L,
6L, 8L, 2L, 1L, 7L, 5L, 3L, 11L, 12L, 9L, 10L, 2L, 7L, 3L, 6L,
12L, 11L, 10L, 9L, 4L, 1L, 11L, 4L, 2L, 1L, 12L, 9L, 3L, 8L,
5L, 6L, 10L, 7L, 5L, 8L, 11L, 12L, 4L, 3L, 9L, 2L), .Label = c("Apr",
"Dec", "Oct", "Mar", "May", "Jul", "Sep", "Jun", "Nov", "Aug",
"Jan", "Feb"), class = "factor"), count = c(566, 545, 427, 751,
357, 399, 568, 433, 454, 347, 511, 251, 267, 207, 167, 142, 417,
109, 117, 373, 207, 130, 125, 145, 7, 14, 2, 2, 7, 3, 107, 74,
135, 48, 80, 53, 117, 125, 59, 53, 103, 30, 21, 18, 8, 22, 26,
37, 20, 5, 11, 1, 96, 29, 109, 8, 33, 53, 6, 1, 5, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0)), .Names = c("region", "month", "count"), row.names = c(NA,
-72L), class = c("data.table", "data.frame"))
Use the built-in month.name or month.abb variable to specify the levels of your factor in the correct order. In this case, you have abbreviations so month.abb is appropriate.
# Re-create month as a factor whose levels run Jan..Dec (month.abb), so
# plots order the months chronologically instead of alphabetically.
your_data$month <- factor(your_data$month, levels = month.abb)
I think creating the factor in the correct order is the best way, but you can also just order the axis using the limits argument of the discrete scale (see ?discrete_scale for more info).
+ scale_x_discrete(limits = month.abb)
Locales
If you are in a non-English locale, you can construct your own month name constants with a little date formatting (basically stolen from Brian Ripley in this R-Help thread):
# Full and abbreviated month names as formatted in the current locale,
# January through December. The year and day given to ISOdate() are
# arbitrary; only the month varies.
month.name.loc <- format(
  ISOdate(year = 2004, month = seq_len(12), day = 1),
  format = "%B"
)
month.abb.loc <- format(
  ISOdate(year = 2004, month = seq_len(12), day = 1),
  format = "%b"
)
If you want to use month names/abbreviations from a different locale than you're in, the withr package is useful.
I need the 11 bars in the following stacked barplot to be reordered by the sum of the first two segments of each bar, i.e. sorted by the (red+green) segments in the plot.
> dput(q1m.bl)
structure(list(ItemA = structure(c(1L, 2L, 3L, 4L, 1L, 2L, 3L,
4L, 1L, 2L, 3L, 4L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L,
1L, 2L, 3L, 4L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L), .Label = c("sehr wichtig", "wichtig", "unwichtig",
"keine Angabe"), class = "factor"), ItemQ = structure(c(1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L,
5L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 9L, 9L, 9L, 9L,
10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L), .Label = c("PUSHERS_AA",
"PUSHERS_COM", "PUSHERS_BED", "PUSHERS_SEC", "PUSHERS_STAB",
"PUSHERS_COST", "PUSHERS_INNO", "PUSHERS_VAL", "PUSHERS_INDEP",
"PUSHERS_STDS", "PUSHERS_SRC"), class = "factor"), Counts = c(1L,
3L, 4L, 1L, 3L, 3L, 2L, 1L, 4L, 2L, 2L, 1L, 3L, 5L, 1L, 1L, 1L,
6L, 1L, 5L, 1L, 2L, 1L, 1L, 1L, 6L, 1L, 2L, 6L, 1L, 2L, 4L, 2L,
1L, 3L, 3L, 2L, 1L, 2L, 1L, 5L, 1L), blpos = c(0.111111111111111,
0.444444444444444, 0.888888888888889, 1, 0.333333333333333, 0.666666666666667,
0.888888888888889, 1, 0.444444444444444, 0.666666666666667, 0.888888888888889,
1, 0.333333333333333, 0.888888888888889, 1, 0.111111111111111,
0.222222222222222, 0.888888888888889, 1, 0.555555555555556, 0.666666666666667,
0.888888888888889, 1, 0.111111111111111, 0.222222222222222, 0.888888888888889,
1, 0.222222222222222, 0.888888888888889, 1, 0.222222222222222,
0.666666666666667, 0.888888888888889, 1, 0.333333333333333, 0.666666666666667,
0.888888888888889, 1, 0.222222222222222, 0.333333333333333, 0.888888888888889,
1)), .Names = c("ItemA", "ItemQ", "Counts", "blpos"), row.names = c(NA,
-42L), class = "data.frame")
The plot ...
# Horizontal stacked bars, one per ItemQ, with each bar rescaled to full
# length (position "fill") and split by ItemA. The Counts labels are placed
# at the precomputed blpos positions.
ggplot(data = q1m.bl, mapping = aes(x = ItemQ, y = Counts, fill = ItemA)) +
  geom_bar(stat = "identity", position = "fill") +
  geom_text(mapping = aes(y = blpos, label = Counts), hjust = 1) +
  coord_flip() +
  theme(
    text = element_text(size = 10),
    axis.text.x = element_text(angle = 90, hjust = 0)
  )
Ugh, not enough rep points to embed images. Sorry for the inconvenience. Plot is here: http://i.stack.imgur.com/am0Ud.png
I played around with arrange() and after checking the data frame itself, I thought the following sorting should do the trick. (Note: blpos means "bar label position" and are the positions of the various numbers in the plot.) But plotting this "sorted" data frame leads to the identical plot as above. I do not understand which information to change to change the plotting order of the ItemQ column.
# Attempt: sort the rows by ItemA, then by descending label position.
# NOTE(review): this changes the row order only; the reported outcome is an
# identical plot, so the bar order presumably does not depend on row order.
q1m.bl.s <- arrange(q1m.bl, ItemA, desc(blpos))
ggplot(q1m.bl.s, ....
What's the best approach anyway? Should I manipulate the df (using ddply/arrange/reorder/etc.) prior to plotting? Because I tend to think this is a presentation issue and should be done inside ggplot. Does it even matter? The "ggplot ordered barchart" questions I found on SO seem to use both approaches; yet none I found was referring to stacked bar segments and using factor data... hence this new question.
Thank you very much for enlightening me!
It's all about re-ordering the factor levels of the ItemQ variable.
# Order the ItemQ levels by the combined count of the two top answers
# ("sehr wichtig" + "wichtig"), largest first; ties keep their current
# relative order.
top_answers <- c("sehr wichtig", "wichtig")
# Counts of the other answers are zeroed rather than their rows dropped, so
# an item without any top answer still gets a total (0) and keeps its level
# instead of turning into NA in the factor() call below.
top_counts <- q1m.bl$Counts * (q1m.bl$ItemA %in% top_answers)
totals <- aggregate(top_counts, list(ItemQ = q1m.bl$ItemQ), sum)
ItemQ.order <- as.character(totals$ItemQ[order(-totals$x)])
q1m.bl$ItemQ <- factor(q1m.bl$ItemQ, levels = ItemQ.order)
Then you should be able to run the code exactly as you provided it and it will produce this:
EDIT (digisus): konvas, I am just re-adding your first answer showing the use of ddply because, even though I do not feel comfortable with it/do not fully get it, I am sure others can benefit from it. :-) So, with your permission I repost it here:
library(plyr)
# The pipe and the verbs used below (group_by, filter, summarise, arrange,
# pull) come from dplyr; it is loaded after plyr so that its versions of the
# functions both packages define are the ones in effect.
library(dplyr)
# Order the ItemQ levels by the combined count of the two top answers
# ("sehr wichtig" + "wichtig"), largest first.
ItemQ.order <- q1m.bl %>%
  filter(ItemA %in% c("sehr wichtig", "wichtig")) %>%
  group_by(ItemQ) %>%
  summarise(total = sum(Counts)) %>%
  arrange(desc(total)) %>%
  pull(ItemQ) %>%
  as.character()
q1m.bl$ItemQ <- factor(q1m.bl$ItemQ, levels = ItemQ.order)
library(ggplot2)
# Return the letters "a" to "d" in a shuffled order that is reproducible for
# a given seed. Calling it resets the global random number generator state.
fac_ord <- function(seed) {
  set.seed(seed)
  sample(letters[seq_len(4)])
}
# this seed simulates arbitrary sortings
seed <- 2
fac_ord(seed)

val <- c(1, 2, 3, 4, 2, 2, 2, 2)
# The levels carry the shuffled order; this demo shows that the bars follow
# the level order rather than the order of the values.
fac <- factor(
  c("a", "b", "c", "d", "a", "b", "c", "d"),
  levels = fac_ord(seed)
)
dif <- rep(c("x", "y"), each = 4)
# dif is stored in the data frame so that fill = dif is resolved from the
# plot data rather than from a variable that happens to exist in the
# calling environment.
df <- data.frame(val = val, fac = fac, dif = dif)

ggplot(df, aes(x = fac, y = val, fill = dif)) +
  geom_bar(stat = "identity") +
  labs(
    title = sprintf(
      "seed = %d / %s", seed, paste(fac_ord(seed), collapse = ",")
    )
  )
As the example shows, ggplot uses the same ordering for fac in the plot as the internal (level) order of fac. So to influence the plotted order you have to write a function which returns the intended order - depending on whatever facts and values you need - use it to create the factor fac, and then use this properly ordered factor for the plotting.
The intended result can also be reached by application of reorder() for reordering the levels of the factor.
Consider the following data frame df, which has an X column and 3 related value columns (Y1..Y3), and additional columns not used in this graph.
My question: How/Can I use the data from df to create a geom_area(fill) plot?
All examples I can find only apply if I create another data frame with the data in separate rows, like in df2. Simplified data is given below.
head(df, 3)
nth tot y1 y2 y3 other1 other2
1 1 1.9449 0.8724 0.1070 0.9655 31 63
2 2 1.2693 0.4519 0.5235 0.2939 46 67
3 3 2.6845 0.8147 0.8963 0.9735 46 42
head(df2,3)
nth tag val
1 1 y1 0.8724
2 2 y1 0.4519
3 3 y1 0.8147
If the answer is that reformatting to df2 is required, what is the "best" way to create df2?
df <- structure(list(x = 1:15, tot = c(1.9449, 1.2693, 2.6845, 1.3311, 1.0887, 1.7291, 1.8173, 1.6097, 1.9690, 1.4961, 1.5411, 1.5308, 1.5634, 1.3179, 1.1292), y1 = c(0.8724, 0.4519, 0.8147, 0.9769, 0.3094, 0.0342, 0.8947, 0.9457, 0.9295, 0.5742, 0.2235, 0.2140, 0.0472, 0.8690, 0.2460), y2 = c(0.1070, 0.5235, 0.8963, 0.0852, 0.0193, 0.8287, 0.7999, 0.5966, 0.4534, 0.8592, 0.3610, 0.9233, 0.7566, 0.1929, 0.5331), y3 = c(0.9655, 0.2939, 0.9735, 0.2690, 0.7600, 0.8662, 0.1227, 0.0674, 0.5861, 0.0627, 0.9566, 0.3935, 0.7596, 0.2560, 0.3501), other1 = c(31, 46, 46, 41, 32, 22, 49, 35, 41, 27, 37, 26, 20, 44, 30), other2 = c(63, 67, 42, 55, 73, 30, 75, 76, 53, 38, 69, 52, 30, 78, 63)), .Names = c("x", "tot", "y1", "y2", "y3", "other1", "other2"), row.names = c(NA, -15L), class = "data.frame")
df2 <- structure(list(x = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L), tag = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("y1", "y2", "y3"), class = "factor"), val = c(0.8724, 0.4519, 0.8147, 0.9769, 0.3094, 0.0342, 0.8947, 0.9457, 0.9295, 0.5742, 0.2235, 0.2140, 0.0472, 0.8690, 0.2460, 0.1070, 0.5235, 0.8963, 0.0852, 0.0193, 0.8287, 0.7999, 0.5966, 0.4534, 0.8592, 0.3610, 0.9233, 0.7566, 0.1929, 0.5331, 0.9655, 0.2939, 0.9735, 0.2690, 0.7600, 0.8662, 0.1227, 0.0674, 0.5861, 0.0627, 0.9566, 0.3935, 0.7596, 0.2560, 0.3501)), .Names = c("x", "tag", "val"), row.names = c(NA, -45L), class = "data.frame")
library(ggplot2)
# Stacked area chart of val over x with one band per tag; position "fill"
# rescales the stack to full height at every x.
ggplot(df2, aes(x = x, y = val, colour = tag, fill = tag)) +
  geom_area(position = "fill")
The idiomatic way to produce df2 from df is using melt(...) in the reshape2 package.
library(reshape2)
# Reshape df from wide to long: x is kept as the identifier, and y1..y3 are
# stacked into the columns `variable` (source column name) and `value`.
df3 <- melt(
  df,
  id.vars = "x",
  measure.vars = c("y1", "y2", "y3")
)
# Same stacked, full-height area chart as before, now built from df3.
ggplot(df3, aes(x = x, y = value, colour = variable, fill = variable)) +
  geom_area(position = "fill")
So melt(...) takes a data frame in "wide" format (data in different columns) and converts it to "long" format (data in 1 column, with original column names in a separate column). You identify the columns which are repeated using id.vars=..., and the columns containing the data using measure.vars=.... Then, melt(...) produces a new data frame with the repeating columns, the data in a column named value, and the original column names in a column named variable.