Grouped barplot with two factors - r

I would like to create a barplot with ggplot2 which has a factor variable on the x-axis, a numeric value on the y-axis and multiple bars for each x-axis tick.
This is a tibble similar to the one that I'm using:
A tibble: 26 x 4
Param Value Res Comp
<fctr> <dbl> <fctr> <chr>
1 Par1 13.45 Re1 a
2 Par2 10.86 Re1 a
3 Par3 10.32 Re2 a
4 Par2 23.62 Re2 a
5 Par1 19.43 Re2 a
6 <NA> 0.00 Re3 b
7 <NA> 0.00 Re4 b
8 Par1 27.44 Re5 b
9 <NA> 0.00 Re6 b
10 <NA> 0.00 Re6 b
Here's the data:
structure(list(Param = c("Par1", "Par2", "Par3", "Par1",
"Par4", NA, NA, "Par5", NA, NA, "Par6", "Par3", "Par7", "Par8",
"Par3", "Par3", "Par6", "Par3", "Par7", "Par8", "Par3", "Par3",
"Par6", "Par3", "Par7", "Par8"), Value = c(13.45, 10.86, 10.32,
23.62, 19.43, 0, 0, 27.44, 0, 0, 11.37, 37.94, 11.23, 22.8, 19.25,
22.26, 11.36, 19.03, 11.94, 22.79, 14.22, 17.21, 11.66, 23.93,
12.33, 23.39), Res = c("Re1", "Re1", "Re2", "Re2", "Re2",
"Re3", "Re4", "Re5", "Re6", "Re6_DOC", "Re7", "Re7", "Re7",
"Re7", "Re6", "Re6_1", "Re7", "Re7", "Re7", "Re7", "Re6",
"Re6_1", "Re7", "Re7", "Re7", "Re7"), Comp = c("Comp1",
"Comp1", "Comp1", "Comp1",
"Comp1", "Comp1", "Comp1",
"Comp1", "Comp2", "Comp2", "Comp2",
"Comp2", "Comp2", "Comp2", "Comp3",
"Comp3", "Comp3", "Comp3", "Comp3", "Comp3",
"Comp4", "Comp4", "Comp4", "Comp4", "Comp4",
"Comp4")), .Names = c("Param", "Value", "Res", "Comp"), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -26L))
So I'd like to have Res on the x-axis, Value on the y-axis and, in case multiple Params are present for each Res there should be different columns. I want also that in the facet wrap, if a Res is not present it should not be on the x-axis (for example Re1 should not be in the b facet).
For now I've done this:
# Current attempt: a single dodged bar layer of Value against Res, one panel per Comp.
# No aesthetic maps Param here, and facet_wrap() is called with its default scales.
tibble %>%
ggplot(aes(x=Res,y=Value)) +
geom_bar(position="dodge",stat = "identity") +
facet_wrap(~Comp)
But a single bar appears for every Res and on the x-axis of the various facets empty ticks appear.
Does anyone know how to solve this?
Thanks

The code for the final image is then:
# Grouped bar chart of Value by Res: bars are filled and labelled by Param,
# with one panel per Comp. Expects `df` to hold the data shown above;
# brewer.pal() comes from RColorBrewer, which must be attached.
df %>%
  ggplot(aes(x = Res, y = Value, fill = Param, label = Param)) +
  # preserve = "single" keeps all bars the same width, however many Params
  # share a Res.
  geom_bar(
    position = position_dodge2(width = 0.9, preserve = "single"),
    stat = "identity"
  ) +
  # scales = "free_x" lets each panel show only the Res values it contains.
  facet_wrap(~Comp, scales = "free_x", as.table = FALSE) +
  # Param names are written vertically (angle = 90) on the bars.
  geom_text(
    position = position_dodge(width = 0.9),
    hjust = "top",
    angle = 90,
    size = 5
  ) +
  # "none" hides the fill legend; guides(fill = FALSE) is deprecated.
  guides(fill = "none") +
  scale_fill_manual(values = brewer.pal(8, "Paired")) +
  theme(axis.title.x = element_blank()) +
  scale_y_continuous("Percentage")

Related

Can't display lines and points with ggplot2

I have a dataframe df as follows:
structure(list(date = structure(c(1L, 13L, 16L, 19L, 22L, 25L,
28L, 31L, 34L, 4L, 7L, 10L, 2L, 14L, 17L, 20L, 23L, 26L, 29L,
32L, 35L, 5L, 8L, 11L, 3L, 15L, 18L, 21L, 24L, 27L, 30L, 33L,
36L, 6L, 9L, 12L), .Label = c("1/1/2010", "1/1/2011", "1/1/2012",
"10/1/2010", "10/1/2011", "10/1/2012", "11/1/2010", "11/1/2011",
"11/1/2012", "12/1/2010", "12/1/2011", "12/1/2012", "2/1/2010",
"2/1/2011", "2/1/2012", "3/1/2010", "3/1/2011", "3/1/2012", "4/1/2010",
"4/1/2011", "4/1/2012", "5/1/2010", "5/1/2011", "5/1/2012", "6/1/2010",
"6/1/2011", "6/1/2012", "7/1/2010", "7/1/2011", "7/1/2012", "8/1/2010",
"8/1/2011", "8/1/2012", "9/1/2010", "9/1/2011", "9/1/2012"), class = "factor"),
a = c(NA, 365.07, 653.19, 980.72, 1455.6, 1867.07, 2036.92,
2372.84, 2693.96, 2973.04, 3227.23, 3678.01, NA, 555.51,
1058.18, 1539.01, 2102.23, 2769.65, 3146.88, 3604.71, 4043.18,
4438.55, 4860.76, 5360.94, NA, 594.67, 1287.05, 1666.5, 2362.27,
2818.16, 3226, 3924.67, 4295.79, 4751.97, 5410.37, 5986.46
), b = c(NA, 158.18, 268.53, 331.81, 434.19, 538.49, 606.62,
651.46, 736.55, 890.81, 981.65, 1748.44, NA, 227.68, 366.95,
486.41, 614.75, 729.44, 836.46, 929.72, 1092.73, 1222.48,
1409.07, 2179.42, NA, 172.99, 359.8, 478.05, 597.88, 660.4,
823.61, 924.57, 1020.33, 1189.15, 1347.44, 2315.36), ratio_a = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 52.17, 62,
56.93, 44.42, 48.34, 54.49, 51.92, 50.08, 49.29, 50.62, 45.76,
NA, 7.05, 21.63, 8.28, 12.37, 1.75, 2.51, 8.88, 6.25, 7.06,
11.31, 11.67), ratio_b = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, 43.94, 36.65, 46.59, 41.59, 35.46, 37.89,
42.71, 48.36, 37.23, 43.54, 24.65, NA, -24.02, -1.95, -1.72,
-2.74, -9.46, -1.54, -0.55, -6.63, -2.73, -4.37, 6.24)), class = "data.frame", row.names = c(NA,
-36L))
Out:
date a b ratio_a ratio_b
0 1/1/2010 NaN NaN NaN NaN
1 2/1/2010 365.07 158.18 NaN NaN
2 3/1/2010 653.19 268.53 NaN NaN
3 4/1/2010 980.72 331.81 NaN NaN
4 5/1/2010 1455.60 434.19 NaN NaN
5 6/1/2010 1867.07 538.49 NaN NaN
6 7/1/2010 2036.92 606.62 NaN NaN
7 8/1/2010 2372.84 651.46 NaN NaN
8 9/1/2010 2693.96 736.55 NaN NaN
9 10/1/2010 2973.04 890.81 NaN NaN
10 11/1/2010 3227.23 981.65 NaN NaN
11 12/1/2010 3678.01 1748.44 NaN NaN
12 1/1/2011 NaN NaN NaN NaN
13 2/1/2011 555.51 227.68 52.17 43.94
14 3/1/2011 1058.18 366.95 62.00 36.65
15 4/1/2011 1539.01 486.41 56.93 46.59
16 5/1/2011 2102.23 614.75 44.42 41.59
17 6/1/2011 2769.65 729.44 48.34 35.46
18 7/1/2011 3146.88 836.46 54.49 37.89
19 8/1/2011 3604.71 929.72 51.92 42.71
20 9/1/2011 4043.18 1092.73 50.08 48.36
21 10/1/2011 4438.55 1222.48 49.29 37.23
22 11/1/2011 4860.76 1409.07 50.62 43.54
23 12/1/2011 5360.94 2179.42 45.76 24.65
24 1/1/2012 NaN NaN NaN NaN
25 2/1/2012 594.67 172.99 7.05 -24.02
26 3/1/2012 1287.05 359.80 21.63 -1.95
27 4/1/2012 1666.50 478.05 8.28 -1.72
28 5/1/2012 2362.27 597.88 12.37 -2.74
29 6/1/2012 2818.16 660.40 1.75 -9.46
30 7/1/2012 3226.00 823.61 2.51 -1.54
31 8/1/2012 3924.67 924.57 8.88 -0.55
32 9/1/2012 4295.79 1020.33 6.25 -6.63
33 10/1/2012 4751.97 1189.15 7.06 -2.73
34 11/1/2012 5410.37 1347.44 11.31 -4.37
35 12/1/2012 5986.46 2315.36 11.67 6.24
I'm trying to use the code below to plot a and b for y axis left with barchart, ratio_a and ratio_b for y axis right with lines and point:
# Attempt at a dual-axis chart: a and b as dodged bars on the left axis,
# ratio_a and ratio_b as lines and points read against the right axis.
# NOTE(review): melt() and date_breaks() are not exported by ggplot2 or dplyr;
# presumably reshape2 and scales are attached elsewhere - confirm.
library(ggplot2)
library(dplyr)
# Convert the month/day/year labels to Date so scale_x_date() can be used.
df$date <- as.Date(df$date, format = "%m/%d/%Y")
# Long format: one row per (date, variable) pair.
df_m <- melt(df, id.vars='date')
# Rows drawn as bars.
df_m_x <- df_m %>%
filter(variable %in% c("a", 'b'))
# Rows drawn as lines/points; values are multiplied by 80 so they can be
# plotted against the same primary axis as the bars.
df_m_ratio_x <- df_m %>%
filter(variable %in% c("ratio_a", 'ratio_b')) %>%
mutate(value = value * 80)
# Inverse of the factor above; used to label the secondary axis.
coeff = 1/80
ggplot() +
geom_bar(data = df_m_x, aes(x = date, y = value, fill = variable), alpha = 0.6, position = 'dodge', stat = 'identity') +
geom_line(data = df_m_ratio_x, aes(x = date, y = value, linetype = variable, col = variable), alpha = 1, size = 1.5) +
geom_point(data = df_m_ratio_x, aes(x = date, y = value, col = variable), size = 3) +
scale_y_continuous(
name = "㎡",
sec.axis = sec_axis(~.*coeff, name = "%")) +
# NOTE(review): the colour values are keyed by "a" and "b", but the layers that
# map colour use df_m_ratio_x, whose variable is "ratio_a" / "ratio_b".
scale_color_manual(values = c("a" = "#E7B800", "b" = "#FC4E07")) +
theme(
legend.title = element_blank(),
legend.position = "bottom",
panel.grid.major = element_line(colour = "grey99"),
panel.border = element_rect(colour = "grey95", fill=NA),
panel.background = element_blank(),
legend.text = element_text(size = 18),
) +
scale_x_date(breaks = date_breaks("6 months"), date_labels = "%Y-%m")
Out:
Removed 6 rows containing missing values (geom_bar).
Removed 72 row(s) containing missing values (geom_path).
Removed 72 rows containing missing values (geom_point).
It works for the bar chart, but it doesn't display the lines and points for ratio_a and ratio_b. Can anyone help me find out why this happens? Thanks a lot.
The issue is that you used the wrong labels in scale_color_manual. Instead of a and b you have to use ratio_a/b as these are the values of variable in your dataset. Put differently ggplot2 finds no values for ratio_a/b in the color scale and hence they are removed:
library(ggplot2)
library(reshape2)
library(dplyr)
library(scales)

# Convert the month/day/year labels to Date so scale_x_date() can be used.
df$date <- as.Date(df$date, format = "%m/%d/%Y")

# Long format: one row per (date, variable) pair.
df_m <- melt(df, id.vars = "date")

# Rows drawn as bars against the primary (left) axis.
df_m_x <- df_m %>%
  filter(variable %in% c("a", "b"))

# Rows drawn as lines and points. Values are multiplied by 80 so they share
# the primary axis with the bars; `coeff` is the inverse factor used to label
# the secondary (right) axis.
df_m_ratio_x <- df_m %>%
  filter(variable %in% c("ratio_a", "ratio_b")) %>%
  mutate(value = value * 80)
coeff <- 1 / 80

ggplot() +
  geom_bar(
    data = df_m_x,
    aes(x = date, y = value, fill = variable),
    alpha = 0.6, position = "dodge", stat = "identity"
  ) +
  geom_line(
    data = df_m_ratio_x,
    aes(x = date, y = value, linetype = variable, col = variable),
    alpha = 1, size = 1.5
  ) +
  geom_point(
    data = df_m_ratio_x,
    aes(x = date, y = value, col = variable),
    size = 3
  ) +
  scale_y_continuous(
    # "\u33a1" is the square-metre sign; written as an escape so the source
    # stays ASCII-only.
    name = "\u33a1",
    sec.axis = sec_axis(~ . * coeff, name = "%")
  ) +
  # The names must match the values of `variable` in the layers that map
  # colour (ratio_a / ratio_b), otherwise those rows get no colour.
  scale_color_manual(
    values = c("ratio_a" = "#E7B800", "ratio_b" = "#FC4E07")
  ) +
  theme(
    legend.title = element_blank(),
    legend.position = "bottom",
    panel.grid.major = element_line(colour = "grey99"),
    panel.border = element_rect(colour = "grey95", fill = NA),
    panel.background = element_blank(),
    legend.text = element_text(size = 18)
  ) +
  scale_x_date(breaks = date_breaks("6 months"), date_labels = "%Y-%m")
#> Warning: Removed 6 rows containing missing values (geom_bar).
#> Warning: Removed 26 row(s) containing missing values (geom_path).
#> Warning: Removed 28 rows containing missing values (geom_point).

Mutate a column to take the minimum of non-NA values within a group, when can't use group_by

I am trying to take a column of data that has a mix of values and NAs, and replace values with the minimum values in a group. The challenge is I haven't figured out a way to use group_by in this case because there is not a unique grouping in the data set.
What I am looking to do is to say: if the value in the value column is a number, then use that number, unless the previous value is less than the current value. If it is not a number, then leave the value as NA.
I tried group_by but realized that wouldn't work as described above. Then I tried an if_else, but I think this fails because is.numeric is not vectorized.
The final data frame is what I am trying to achieve.
Example data
library(dplyr)
# Initial
initial <- structure(list(dates = structure(c(17532, 17539, 17546, 17553,
17560, 17567, 17574, 17581, 17588, 17595, 17602, 17609, 17616,
17623, 17630, 17637, 17644, 17651, 17658, 17665, 17672, 17679
), class = "Date"), values = c(10, 10, 10, 11, NA, NA, NA, NA,
NA, 20, 20, 21, 22, NA, NA, NA, NA, NA, 30, 30, 31, NA)), class = "data.frame", row.names = c(NA,
-22L))
# Final
final <- structure(list(dates = structure(c(17532, 17539, 17546, 17553,
17560, 17567, 17574, 17581, 17588, 17595, 17602, 17609, 17616,
17623, 17630, 17637, 17644, 17651, 17658, 17665, 17672, 17679
), class = "Date"), values = c(10, 10, 10, 11, NA, NA, NA, NA,
NA, 20, 20, 21, 22, NA, NA, NA, NA, NA, 30, 30, 31, NA), desired = c(10,
10, 10, 10, NA, NA, NA, NA, NA, 20, 20, 20, 20, NA, NA, NA, NA,
NA, 30, 30, 30, NA)), class = "data.frame", row.names = c(NA,
-22L))
This, and other attempts, did not work. I suspect because is.numeric is not vectorized. I tried some mutate_at versions as well but could not get that to work either.
# Did not work
# NOTE(review): is.numeric() reports the type of the whole vector, so it yields a
# single TRUE/FALSE here rather than one test result per row.
initial %>%
mutate(desired = ifelse(is.numeric(values), ifelse(is.numeric(lag(values)), lag(values), values), values))
We can use data.table::rleid to create the groups and select the min value from each.
library(dplyr)
initial %>%
# rleid() numbers consecutive runs of equal is.na(values), so each unbroken
# stretch of non-NA (or NA) rows becomes its own group.
group_by(group = data.table::rleid(is.na(values))) %>%
# min() is called without na.rm, so the all-NA runs stay NA.
mutate(ans = min(values)) %>%
ungroup() %>%
# Drop the helper grouping column.
select(-group)
# A tibble: 22 x 3
# dates values ans
# <date> <dbl> <dbl>
# 1 2018-01-01 10 10
# 2 2018-01-08 10 10
# 3 2018-01-15 10 10
# 4 2018-01-22 11 10
# 5 2018-01-29 NA NA
# 6 2018-02-05 NA NA
# 7 2018-02-12 NA NA
# 8 2018-02-19 NA NA
# 9 2018-02-26 NA NA
#10 2018-03-05 20 20
# … with 12 more rows
For a purely dplyr solution, we can replace group_by statement with
group_by(group = cumsum(is.na(values) != lag(is.na(values), default = FALSE))) %>%

Issue with ggplot with discrete x-axis in date format (yyyy-mm-dd)

I have a data frame like below. I need to plot a line plot using ggplot each line showing for each year (i.e. 2014, 2015 etc) with different colors for each year.
I cannot connect the points by a line with my code showing below. But, it does plot the points only with different colors for each year. This may be a simple thing but I just cannot figure this out.
library(reshape2)
library(ggplot2)
# Long format: one row per (Day Obs, year column) pair.
plot.data <- melt(Table_1, id.vars = 'Day Obs')
# x and y come from the base mapping; each layer only adds colour by year.
ggplot(plot.data, aes(x = `Day Obs`, y = value)) +
  geom_line(aes(colour = variable), size = 1.0) +
  geom_point(aes(colour = variable), size = 2.3)
Table_1:
Day Obs 2014 2015 2016 2017 2018
2018-08-01 NA NA NA NA 1.002
2018-08-03 NA 0.85 NA NA NA
2018-08-06 NA NA NA NA 0.9
2018-08-07 NA NA 0.78 0.88 NA
.
.
The issue is that you have missing values in between observations, and geom_line then doesn't connect the points (as information is missing). This can be seen for the year 2018:
library(reshape2)
library(ggplot2)
# Long format: one row per (Day Obs, year column) pair.
plot.data <- melt(Table_1, id.vars = 'Day Obs')
# Show only the melted rows that came from the 2018 column.
plot.data[plot.data$variable == 2018, ]
# Day Obs variable value
# 17 2018-08-01 2018 1.002
# 18 2018-08-03 2018 NA
# 19 2018-08-06 2018 0.900
# 20 2018-08-07 2018 NA
Here the information for 2018-08-03 is explicitly missing. Therefore there is no connection between the points if we plot them.
# Plot the melted data as-is (NA rows included); colour is set in the base
# mapping, so both layers inherit it.
ggplot(plot.data, aes(x = `Day Obs`, y = value, colour = variable)) +
geom_line(size = 1.0) +
geom_point(size = 2.3)
You can remove the explicit missing values and it works:
# Same plot, built only from the rows whose value is not NA.
ggplot(
  subset(plot.data, !is.na(value)),
  aes(x = `Day Obs`, y = value, colour = variable)
) +
  geom_line(size = 1.0) +
  geom_point(size = 2.3)
Data
Table_1 <- structure(list(`Day Obs` = structure(c(17744, 17746, 17749, 17750), class = "Date"),
`2014` = c(NA, NA, NA, NA),
`2015` = c(NA, 0.85, NA, NA),
`2016` = c(NA, NA, NA, 0.78),
`2017` = c(NA, NA, NA, 0.88),
`2018` = c(1.002, NA, 0.9, NA)),
row.names = c(NA, -4L), class = "data.frame")

split, transpose and gather data frame

I need to gather and transpose the data in data frame, hourly values should be in one column. First column should be date with hour and second transposed hourly values.
Sample of data:
# Sample data: one row per (Year, JDay) with 24 hourly columns Hour_1..Hour_24.
structure(list(Year = c(2016L, 2016L), JDay = 1:2, Hour_1 = c(2.59,
5.95), Hour_2 = c(2.19, 5.84), Hour_3 = c(1.84, 5.75), Hour_4 = c(1.51,
5.66), Hour_5 = c(1.21, 5.58), Hour_6 = c(0.94, 5.5), Hour_7 = c(0.69,
5.43), Hour_8 = c(0.45, 5.37), Hour_9 = c(0.23, 5.31), Hour_10 = c(2.18,
6.19), Hour_11 = c(4.39, 7.16), Hour_12 = c(6.29, 8), Hour_13 = c(7.76,
8.65), Hour_14 = c(8.68, 9.06), Hour_15 = c(9, 9.2), Hour_16 = c(8.68,
9.06), Hour_17 = c(7.76, 8.65), Hour_18 = c(7.8, 8.52), Hour_19 = c(7.21,
7.57), Hour_20 = c(6.85, 6.99), Hour_21 = c(6.59, 6.57), Hour_22 = c(6.39,
6.25), Hour_23 = c(6.22, 5.98), Hour_24 = c(6.08, 5.75)), .Names = c("Year",
"JDay", "Hour_1", "Hour_2", "Hour_3", "Hour_4", "Hour_5", "Hour_6",
"Hour_7", "Hour_8", "Hour_9", "Hour_10", "Hour_11", "Hour_12",
"Hour_13", "Hour_14", "Hour_15", "Hour_16", "Hour_17", "Hour_18",
"Hour_19", "Hour_20", "Hour_21", "Hour_22", "Hour_23", "Hour_24"
), row.names = c(NA, -2L), class = c("tbl_df", "tbl", "data.frame"
))
Using gather just gives me all the Hour_1 values in order...
gather(OP_daily[, c(5:28)], time,temp, Hour_1:Hour_24)
Example output:
date temp
2016-1-1 1:00 2.59
2016-1-1 2:00 2.19
This sounds like the gather you are looking for:
# Reshape the hourly columns to long format and build a date-time per row.
df %>%
# Stack every column except Year and JDay into Hour/temp pairs.
gather(-c(Year,JDay), key = "Hour", value = "temp") %>%
# Paste Year, JDay and Hour into one string; unite() joins with "_" by
# default, which is what the format string below expects.
unite(date,Year,JDay,Hour) %>%
# NOTE(review): no tz is passed here, so the session time zone is used.
mutate(date=as.POSIXct(date,format='%Y_%j_Hour_%H')) %>%
arrange(date)
date temp
<time> <dbl>
1 2016-01-01 01:00:00 2.592221
2 2016-01-01 02:00:00 2.193009
3 2016-01-01 03:00:00 1.835225
4 2016-01-01 04:00:00 1.511071
5 2016-01-01 05:00:00 1.214767
6 2016-01-01 06:00:00 0.941902
EDIT
To see how many observations per day:
# Same reshaping as above, stored in `res`, with the time zone fixed to GMT.
res <- df %>%
gather(-c(Year,JDay), key = "Hour", value = "temp") %>%
unite(date,Year,JDay,Hour) %>%
mutate(date=as.POSIXct(date,format='%Y_%j_Hour_%H',tz = "GMT")) %>%
arrange(date)
# Count how many rows fall on each calendar day.
res%>%
mutate(date_only=as.Date(date))%>%
group_by(date_only)%>%
summarise(count=n())
date_only count
<date> <int>
1 2016-01-01 23
2 2016-01-02 24
3 2016-01-03 1

insert a column into a tabular object

I found this blog on getting a nested table into LaTeX format(Blog Link). I like the outcome but want to insert a column into the object at the beginning after the rownames. I'm used to dealing with data frames so dealing with this beast is more difficult than typical column indexing.
Here's what I have now:
pre post
approach mean sd mean sd
1 24.17 8.310 54.33 11.01
2 25.50 9.434 65.25 16.32
3 26.33 9.139 63.17 12.53
And here's what I'd like it to look like:
pre post
approach n mean sd mean sd
1 12 24.17 8.310 54.33 11.01
2 12 25.50 9.434 65.25 16.32
3 12 26.33 9.139 63.17 12.53
Here's the dput of z and also the column of n's I'd like to insert.
Thank you in advance.
z <- structure(list(24.1666666666667, 25.5, 26.3333333333333, 8.31027111835746,
9.4339811320566, 9.13866245766587, 54.3333333333333, 65.25,
63.1666666666667, 11.0068848977136, 16.3157759685081, 12.5323822978956), .Dim = 3:4, .Dimnames = list(
NULL, c("term", "term", "term", "term")), rowLabels = structure(c("1",
"2", "3"), .Dim = c(3L, 1L), .Dimnames = list(NULL, "approach"), justification = structure(c(NA_character_,
NA_character_, NA_character_), .Dim = c(3L, 1L)), colnamejust = NA_character_, justify = NA, suppress = 0), colLabels = structure(c("pre",
"mean", NA, "sd", "post", "mean", NA, "sd"), .Dim = c(2L, 4L), justification = structure(c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_), .Dim = c(2L, 4L)), colnamejust = character(0), justify = NA, suppress = 0), table = value *
v * approach ~ variable2 * result_variable, formats = structure(c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Dim = 3:4, .Dimnames = list(
NULL, c("format", "format", "format", "format"))), justification = structure(c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Dim = 3:4, .Dimnames = list(
NULL, c("justification", "justification", "justification",
"justification"))), class = "tabular")
structure(c(12L, 12L, 12L), .Names = c("1", "2", "3"))
The only (known ?) way is to re-assign re-ordered:
R> mockup <- data.frame(B=21:23, C=31:33)
R> mockup
B C
1 21 31
2 22 32
3 23 33
R>
Now add column A:
R> mockup[,"A"] <- 1:3
R> mockup
B C A
1 21 31 1
2 22 32 2
3 23 33 3
R>
And reorder:
R> mockup <- mockup[,c("A", "B", "C")]
R> mockup
A B C
1 1 21 31
2 2 22 32
3 3 23 33
R>
Presto. New column at the beginning.
Something like this:
# Simulated data: 3 approaches x 12 observations with pre and post scores.
z <- data.frame(
  approach = gl(3, 12),
  pre = rnorm(36) * 50,
  post = rnorm(36) * 60
)
library(tables)
# data = z tells tabular() where to find approach, pre and post.
tabular(approach ~ (pre + post) * (mean + sd), data = z)
pre post
approach mean sd mean sd
1 -5.431 61.01 3.766 54.76
2 20.408 29.14 -9.261 54.58
3 -7.854 53.55 -30.046 62.41
# (n = 1) adds a count column labelled "n"; data = z supplies the variables.
tabular(approach ~ (n = 1) + (pre + post) * (mean + sd), data = z)
pre post
approach n mean sd mean sd
1 12 -5.431 61.01 3.766 54.76
2 12 20.408 29.14 -9.261 54.58
3 12 -7.854 53.55 -30.046 62.41
# "+ 1" on the row side appends an "All" row; data = z supplies the variables.
tabular(approach + 1 ~ (n = 1) + (pre + post) * (mean + sd), data = z)
pre post
approach n mean sd mean sd
1 12 -5.431 61.01 3.766 54.76
2 12 20.408 29.14 -9.261 54.58
3 12 -7.854 53.55 -30.046 62.41
All 36 2.374 50.06 -11.847 57.46
For more details see vignette of the tables package.

Resources