Related
I have generated the following plot, which groups the data based on drug. I then created another data frame which summarized the main data by calculating the mean. These mean values(grouped by drug) are then added to the plot in triangular shapes. Using stat_summary connected the means. Now I would like to know how to add the tooltips for the means data (including x and y values with custom labels) to the plot using ggplotly. The current tooltip only includes the info for the main data
data:
Day,Drug,Sex,Y
1,A,Female,2.192306074
1,B,Male,4.551912798
1,B,Female,1.574070652
1,C,Female,-0.143946163
1,A,Male,5.144422967
1,C,Male,5.724705829
2,A,Male,2.691617258
2,B,Female,-3.0289955
2,C,Male,0.338102762
2,A,Female,-0.558581233
2,B,Female,-2.942620032
2,C,Male,1.024670497
3,A,Male,2.264980803
3,C,Female,2.103722883
3,A,Female,2.091621938
3,B,Male,1.535299922
3,B,Male,1.618399767
3,C,Female,0.136160703
After copying you may need to run the following command to convert it to the dataframe:
df <- read.delim("clipboard", sep = ",")
my variables are in string:
grouping_var <- "Drug"
x <- "Day"
y <- "Y"
color <- "Sex"
Here is the code for that the plot:
text <- paste("X:", df[[grouping_var]], "<br>",
"Y:", df[[y]], "<br>",
"group:", grouping_var, "<br>",
"color:", color)
mean_df <- df %>% group_by(get(x), get(grouping_var)) %>% summarise(Mean = mean(get(y)), .groups = "drop") %>%
rename(x = 1, grouping_var = 2)
p <- df %>% ggplot(aes(x = get(x), y = get(y), text = text)) +
geom_point(aes_string(group = grouping_var, color = color)) +
stat_summary(fun=mean, geom="line", aes_string(group = grouping_var), color = "red", lty = 2) +
geom_point(data = mean_df, aes(x = x, y = Mean, group = grouping_var, color = grouping_var),
pch = 2, size = 3,inherit.aes = F) +
xlab(x)+
ylab(y)
ggplotly(p, tooltip = c("text"))
The output:
The general approach to achieve your desired result and add a tooltip to the summary points would be to map on the text aes in your second geom_point too. However, IMHO you could simplify your code a bit and as a side effect get rid of the aes_string (which was deprecated in ggplot2 3.4.0 and should therefore be replaced by aes). To this end you could simply rename the original variables as x, y, group and color. Second, I use some small custom functions to create the tooltip texts. Finally, I replaced the stat_summary with a geom_line.
library(ggplot2)
library(plotly)
text1 <- function(x, y, group, color) {
paste0(
"X: ", x, "<br>",
"Y: ", y, "<br>",
"group (Drug): ", group, "<br>",
"color (Sex): ", color
)
}
text2 <- function(x, y, group, color) {
paste0(
"X: ", x, "<br>",
"Y: ", y, "<br>",
"group (Drug): ", group, "<br>"
)
}
df <- df %>%
rename(x = x, y = y, group = grouping_var, color = color) %>%
mutate(text = text1(x, y, group, color))
mean_df <- df %>%
group_by(x, group) %>%
summarise(y = mean(y), .groups = "drop") %>%
mutate(text = text2(x, y, group, color))
p <- df %>% ggplot(aes(
x = x, y = y, text = text
)) +
geom_point(aes(group = group, color = color)) +
geom_line(
data = mean_df,
aes(group = group),
color = "red", lty = 2
) +
geom_point(
data = mean_df,
aes(color = group, text = text),
pch = 2, size = 3
) +
xlab(x) +
ylab(y)
ggplotly(p, tooltip = c("text"))
Given two monthly time series data sample from this link.
I will need to create one plot containing 3 subplots: plot1 for the original values, plot2 for month over month changes, and plot3 for year over year changes.
I'm able to draw the plot with code below, but the code is too redundant. So my question is how could achieve that in a concise way? Thanks.
library(xlsx)
library(ggplot2)
library(reshape)
library(dplyr)
library(tidyverse)
library(lubridate)
library(cowplot)
library(patchwork)
df <- read.xlsx('./sample_data.xlsx', 'Sheet1')
colnames(df)
# df
cols <- c('food_index', 'energy_index')
df <- df %>% mutate(date=as.Date(date)) %>%
mutate(across(-contains('date'), as.numeric)) %>%
mutate(date= floor_date(date, 'month')) %>%
group_by(date) %>%
summarise_at(vars(cols), funs(mean(., na.rm=TRUE))) %>%
mutate(across(cols, list(yoy = ~(. - lag(., 12))/lag(., 12)))*100) %>%
mutate(across(cols, list(mom = ~(. - lag(., 1))/lag(., 1)))*100) %>%
filter(date >= '2018-01-01' & date <= '2021-12-31') %>%
as.data.frame()
df1 <- df %>%
select(!grep('mom|yoy', names(df)))
df1_long <- melt(df1, id.vars = 'date')
plot1 <- ggplot(df1_long[!is.na(df1_long$value), ],
aes(x = date,
y = value,
col = variable)) +
geom_line(size=0.6, alpha=0.5) +
geom_point(size=1, alpha=0.8) +
labs(
x='',
y='Unit: $'
)
# MoM changes
df2 <- df %>%
select(grep('date|mom', names(df)))
df2_long <- melt(df2, id.vars = 'date')
plot2 <- ggplot(df2_long[!is.na(df2_long$value), ],
aes(x = date,
y = value,
col = variable)) +
geom_line(size=0.6, alpha=0.5) +
geom_point(size=1, alpha=0.8) +
labs(
x='',
y='Unit: %'
)
# YoY changes
df3 <- df %>%
select(grep('date|yoy', names(df)))
df3_long <- melt(df3, id.vars = 'date')
plot3 <- ggplot(df3_long[!is.na(df3_long$value), ],
aes(x = date,
y = value,
col = variable)) +
geom_line(size=0.6, alpha=0.5) +
geom_point(size=1, alpha=0.8) +
labs(
x='',
y='Unit: %'
)
plot <- plot1 + plot2 + plot3 + plot_layout(ncol=1)
# plot <- plot_grid(plot1, plot2, plot3, labels = c('Value', 'MoM', 'YoY'), label_size = 12)
plot
Out:
The expected result will be similar to the plot below (the upper plot will display the original data, the middle plot will display the mom changes data, and the lower plot will display the yoy changes data):
References:
https://waterdata.usgs.gov/blog/beyond-basic-plotting/
http://www.sthda.com/english/articles/24-ggpubr-publication-ready-plots/81-ggplot2-easy-way-to-mix-multiple-graphs-on-the-same-page/
Side-by-side plots with ggplot2
Maybe this is what you are looking for? By reshaping your data to the right shape, using a plot function and e.g. purrr::map2 you could achieve your desired result without duplicating your code like so.
Using some fake random example data to mimic your true data:
library(tidyr)
library(dplyr)
library(ggplot2)
df_long <- df |>
rename(food_index_raw = food_index, energy_index_raw = energy_index) |>
pivot_longer(-date, names_to = c("variable", ".value"), names_pattern = "^(.*?_index)_(.*)$")
plot_fun <- function(x, y, ylab) {
x <- x |>
select(date, variable, value = .data[[y]]) |>
filter(!is.na(value))
ggplot(
x,
aes(
x = date,
y = value,
col = variable
)
) +
geom_line(size = 0.6, alpha = 0.5) +
geom_point(size = 1, alpha = 0.8) +
labs(
x = "",
y = ylab
)
}
yvars <- c("raw", "mom", "yoy")
ylabs <- paste0("Unit: ", c("$", "%", "%"))
plots <- purrr::map2(yvars, ylabs, plot_fun, x = df_long)
library(patchwork)
wrap_plots(plots) + plot_layout(ncol = 1)
DATA
set.seed(123)
date <- seq.POSIXt(as.POSIXct("2017-01-31"), as.POSIXct("2022-12-31"), by = "month")
food_index <- runif(length(date))
energy_index <- runif(length(date))
df <- data.frame(date, food_index, energy_index)
EDIT Adding subtitles to each plot when using patchwork is (as of the moment) a bit tricky. What I would do in this case would be to use a faceting "hack". To this end I slightly adjusted the function to take a subtitle argument and switched to purrr::pmap:
library(tidyr)
library(dplyr)
library(ggplot2)
df_long <- df |>
rename(food_index_raw = food_index, energy_index_raw = energy_index) |>
pivot_longer(-date, names_to = c("variable", ".value"), names_pattern = "^(.*?_index)_(.*)$")
plot_fun <- function(x, y, ylab, subtitle) {
x <- x |>
select(date, variable, value = .data[[y]]) |>
filter(!is.na(value))
ggplot(
x,
aes(
x = date,
y = value,
col = variable
)
) +
geom_line(size = 0.6, alpha = 0.5) +
geom_point(size = 1, alpha = 0.8) +
facet_wrap(~.env$subtitle) +
labs(
x = "",
y = ylab
) +
theme(strip.background = element_blank(), strip.text.x = element_text(hjust = 0))
}
yvars <- c("raw", "mom", "yoy")
ylabs <- paste0("Unit: ", c("$", "%", "%"))
subtitle <- c("Original", "Month-to-Month", "Year-to-Year")
plots <- purrr::pmap(list(y = yvars, ylab = ylabs, subtitle = subtitle), plot_fun, x = df_long)
library(patchwork)
wrap_plots(plots) + plot_layout(ncol = 1)
The target output is done with facets rather than stitching plots together. You could do this too if you like, but it requires reshaping your data in a different way. Which approach you take is really a matter of taste.
library(ggplot2)
library(dplyr)
yoy <- function(x) 100 * (x - lag(x, 13)) / lag(x, 12)
mom <- function(x) 100 * (x - lag(x)) / lag(x)
df %>%
mutate(date = as.Date(date, origin = "1899-12-30"),
`Actual value (Dollars).Food Index` = food_index,
`Month-on-month change (%).Food Index` = mom(food_index),
`Year-on-year change (%).Food Index` = yoy(food_index),
`Actual value (Dollars).Energy Index` = energy_index,
`Month-on-month change (%).Energy Index` = mom(energy_index),
`Year-on-year change (%).Energy Index` = yoy(energy_index)) %>%
select(-food_index, -energy_index) %>%
tidyr::pivot_longer(-1) %>%
filter(date > as.Date("2018-01-01")) %>%
tidyr::separate(name, into = c("series", "index"), sep = "\\.") %>%
ggplot(aes(date, value, color = index)) +
geom_point(na.rm = TRUE) +
geom_line() +
facet_grid(series~., scales = "free_y") +
theme_bw(base_size = 16)
Reproducible data taken from link in question
df <- structure(list(date = c(42766, 42794, 42825, 42855, 42886, 42916,
42947, 42978, 43008, 43039, 43069, 43100, 43131, 43159, 43190,
43220, 43251, 43281, 43312, 43343, 43373, 43404, 43434, 43465,
43496, 43524, 43555, 43585, 43616, 43646, 43677, 43708, 43738,
43769, 43799, 43830, 43861, 43890, 43921, 43951, 43982, 44012,
44043, 44074, 44104, 44135, 44165, 44196, 44227, 44255, 44286,
44316, 44347, 44377, 44408, 44439, 44469, 44500, 44530, 44561
), food_index = c(58.53, 61.23, 55.32, 55.34, 61.73, 56.91, 54.27,
59.08, 60.11, 66.01, 60.11, 63.41, 69.8, 72.45, 81.11, 89.64,
88.64, 88.62, 98.27, 111.11, 129.39, 140.14, 143.44, 169.21,
177.39, 163.88, 135.07, 151.28, 172.81, 143.82, 162.13, 172.22,
176.67, 179.3, 157.27, 169.12, 192.51, 194.2, 179.4, 169.1, 193.17,
174.92, 181.92, 188.41, 192.14, 203.41, 194.19, 174.3, 174.86,
182.33, 182.82, 185.36, 192.41, 195.59, 202.6, 201.51, 225.01,
243.78, 270.67, 304.57), energy_index = c(127.36, 119.87, 120.96,
112.09, 112.19, 109.24, 109.56, 106.89, 109.35, 108.35, 112.39,
117.77, 119.52, 122.24, 120.91, 125.41, 129.72, 135.25, 139.33,
148.6, 169.62, 184.23, 204.38, 198.55, 189.29, 202.47, 220.23,
240.67, 263.12, 249.74, 240.84, 243.42, 261.2, 256.76, 258.69,
277.98, 289.63, 293.46, 310.81, 318.68, 310.04, 302.17, 298.62,
260.92, 269.29, 258.84, 241.68, 224.18, 216.36, 226.57, 235.98,
253.86, 267.37, 261.99, 273.37, 280.91, 291.84, 297.88, 292.78,
289.79)), row.names = c(NA, 60L), class = "data.frame")
I would like to show the mean of two groups in a scatterplot. I have sorted the data so the groups are next to each other. Group 1 is the first 11 records and group2 is the next 133. How can I tell ggplot to draw one line across the range for the first group (House 1-11) and a second line for the second (House 12-133).
Here is what I have so far:
And the code is here:
library(tidyverse)
library(tidymodels)
data(ames)
ames <- AmesHousing::make_ames()
set.seed(1)
split <- initial_split(ames, prop = 0.95, strata = "Sale_Price")
ames_plot <- testing(split)
model1 <- lm(Sale_Price ~ Central_Air, data = ames_plot)
p1 <- model1 %>%
broom::augment() %>%
arrange(Central_Air) %>%
mutate(House = row_number()) %>%
ggplot(aes(House, Sale_Price, color = Central_Air)) +
geom_point(size = 1, alpha = 0.3) +
geom_segment(aes(x = 1, y = .fitted, xend = 144, yend =.fitted)) +
scale_y_continuous(labels = scales::dollar)
p1
Using geom_smooth(formula = 'y ~ x', se = FALSE, method = "lm") instead of geom_segment() gets me close to what I want but I want to show the actual predicted values coming form the lm().
It would be best just to summarize your data for that layer. For example
model1 %>%
broom::augment() %>%
arrange(Central_Air) %>%
mutate(House = row_number()) %>%
ggplot(aes(House, Sale_Price, color = Central_Air)) +
geom_point(size = 1, alpha=.3) +
geom_segment(aes(x = first, y = .fitted, xend = last, yend =.fitted),
data = function(x) {
x %>%
group_by(Central_Air) %>%
summarize(first=first(House), last=last(House), .fitted=mean(.fitted), .groups="drop_last")
}) +
scale_y_continuous(labels = scales::dollar)
I have a dataset that looks like this:
test<-data.frame("M"=c("a","b","c","a","b","b","c","a","b","c"),
"N"=c(1,3,4,6,6,7,7,8,8,8),
"X"=c(0,1,0,1,1,0,1,0,1,1),
"Y"=c(1,1,0,0,1,0,1,1,1,0))
I'm making a simple plot where I want X and Y on the y axis, M on the x axis, each grid colored if the value of X or Y is 1 and empty if the value of X or Y is 0. I'm repeating this for each categories in N (the categories of N are 1 to 5, 6, 7, 8), then stacking all plots together. Right now, I'm doing this with the following code.
test <- test[order(test$N),]
test1 <- test[c(1:3),]
test2 <- test[c(4:5),]
test3 <- test[c(6:7),]
test4 <- test[c(8:10),] # I'm doing this to "separate" categories of `N` manually
p1 <- test1[,c(1,3:4)] %>%
gather(col_name, value, -M) %>%
ggplot(aes(factor(M), col_name, fill = value == 1))+
geom_tile(colour = 'black')+
scale_fill_manual(values = c('FALSE' = 'white', 'TRUE' = 'red'))
p2 <- test2[,c(1,3:4)] %>%
gather(col_name, value, -M) %>%
ggplot(aes(factor(M), col_name, fill = value == 1))+
geom_tile(colour = 'black')+
scale_fill_manual(values = c('FALSE' = 'white', 'TRUE' = 'yellow'))
p3 <- test3[,c(1,3:4)] %>%
gather(col_name, value, -M) %>%
ggplot(aes(factor(M), col_name, fill = value == 1))+
geom_tile(colour = 'black')+
scale_fill_manual(values = c('FALSE' = 'white', 'TRUE' = 'green'))
p4 <- test4[,c(1,3:4)] %>%
gather(col_name, value, -M) %>%
ggplot(aes(factor(M), col_name, fill = value == 1))+
geom_tile(colour = 'black')+
scale_fill_manual(values = c('FALSE' = 'white', 'TRUE' = 'blue'))
grid.arrange(p1, p2, p3, p4, ncol = 1)
I'm attaching an image of what I have right now. I want to fix these plots so that I would have the same factors of M for all four plots (right now, only p1 and p4 have all three factors (a, b and c) in the x axis but I want to add factor c to p2 and a to p3 so that all x axes are identical to each other. Can anyone give me suggestions on how to do this?
(Also, I'm suspecting that the current way I'm plotting things is probably not the most quickest/easiest way to go, if anyone has suggestions on how to improve things it'd be really helpful!)
To continue using grid.arrange(), instead of facet_wrap(), do the following:
Make M a factor:
test$M <- factor(test$M)
Add the following to each of your plots:
scale_x_discrete(limits = levels(test$M))
Maybe one approach I can suggest you is using facets after applying a smart trick to group your values and avoid splitting in different dataframes. Here the code as an option for you (The colors will be the same across the facets in base of TRUE/FALSE values):
library(tidyverse)
#Code
test %>% mutate(Var=lead(N)) %>%
mutate(Diff=Var-N,Diff=ifelse(row_number()==1,0,Diff)) %>%
mutate(Group=ifelse(Diff==0,N,NA)) %>%
fill(Group) %>% select(-c(N,Var,Diff)) %>%
group_by(Group) %>% mutate(NG=paste0('p',cur_group_id())) %>% ungroup() %>%
select(-Group) %>%
pivot_longer(cols = -c(NG,M)) %>%
ggplot(aes(factor(M), name, fill = value == 1,group=value))+
geom_tile(colour = 'black')+
facet_wrap(.~NG,ncol = 1)+
scale_fill_manual('value',values=c('tomato','cyan3'))+
xlab('M')
Output:
The othe option would be patchwork with a customized function:
library(tidyverse)
library(patchwork)
#Code
data <- test %>% mutate(Var=lead(N)) %>%
mutate(Diff=Var-N,Diff=ifelse(row_number()==1,0,Diff)) %>%
mutate(Group=ifelse(Diff==0,N,NA)) %>%
fill(Group) %>% select(-c(N,Var,Diff)) %>%
group_by(Group) %>% mutate(NG=paste0('p',cur_group_id())) %>% ungroup() %>%
select(-Group) %>%
mutate(M=factor(M,levels = unique(M),ordered = T)) %>%
pivot_longer(cols = -c(NG,M))
#List
List <- split(data,data$NG)
#Function
myfun <- function(x)
{
#Test for color
val <- unique(x$NG)
#Conditioning for color
if(val=='p1') {vcolor=c('FALSE' = 'white', 'TRUE' = 'red')} else
if(val=='p2') {vcolor=c('FALSE' = 'white', 'TRUE' = 'yellow')} else
if(val=='p3') {vcolor=c('FALSE' = 'white', 'TRUE' = 'green')} else
{vcolor=c('FALSE' = 'white', 'TRUE' = 'blue')}
#Update data
x <- x %>% mutate(M=factor(M,levels = c('a','b','c'),ordered = T)) %>% complete(M=M)
#Plot
G <- ggplot(x,aes(factor(M), name, fill = (value == 1 & !is.na(value))))+
geom_tile(colour = 'black')+
scale_fill_manual('value',values=vcolor)+
xlab('M')+
scale_y_discrete(limits=c('X','Y'))+
theme_bw()+
ggtitle(val)
return(G)
}
#Apply
Lplot <- lapply(List,myfun)
#Wrap
GF <- wrap_plots(Lplot,ncol = 1)
Output:
Something like this?
test<-data.frame("M"=c("a","b","c","a","b","b","c","a","b","c"),
"N"=c(1,3,4,6,6,7,7,8,8,8),
"X"=c(0,1,0,1,1,0,1,0,1,1),
"Y"=c(1,1,0,0,1,0,1,1,1,0))
library(tidyverse)
test = mutate(test, N2 = cut(N, breaks = c(0,5:100)))
m = pivot_longer(test, c(X, Y))
ggplot(m, aes(M, name,fill=factor(value))) +
geom_tile(colour = 'black') +
facet_wrap(~N2, scales = 'free') +
scale_fill_manual(values = c(`0` = 'white', `1` = 'red'))
I've got a grouped plot in which I've already labelled the peak with it's x-axis value. However, I'd like another label in the top right of the plot with another measure, but I can't get it to work. Here's an example.
d <- data.frame(USArrests)
nrow(d)
d$predictor <- factor(c(rep(c(1, 2), times = 25)))
label <- d %>%
group_by(predictor) %>%
filter(Assault == max(Assault))
label$measure1 <- c(1.45, 5.67)
label$measure2 <- c(4.55, 6.11)
library(ggplot2)
ggplot(d, aes(x=UrbanPop, y=Assault, fill=predictor)) +
geom_col(position=position_dodge(width = 0, preserve = "single"), width = 5) +
geom_text(data = label, aes(label = UrbanPop)) +
geom_text(data = label, aes(label = measure), hjust="right", vjust="top")
I want the second label in the top right to say in red "measure1 = 1.45" then on a new line "measure2 = 4.55", then below it in green "measure1 = 5.67" and on a new line "measure2=6.11". Obviously "measures" are dynamic objects, so I don't just want to insert a static caption. Any help much appreciated!
It's a bit clunky, but you could create the full labels in a data.frame and force them to be on different lines with \n.
d <- data.frame(USArrests)
d$predictor <- factor(c(rep(c(1, 2), times = 25)))
label <- d %>%
group_by(predictor) %>%
filter(Assault == max(Assault))
label$measure1 <- c(1.45, 5.67)
label$measure2 <- c(4.55, 6.11)
label2 <- label %>%
pivot_longer(measure1:measure2, 'measure', 'value') %>%
mutate(label = case_when(
predictor == 1 & measure == 'measure1' ~ paste0(measure, ' = ', value),
predictor == 1 & measure == 'measure2' ~ paste0('\n', measure, ' = ', value),
predictor == 2 & measure == 'measure1' ~ paste0('\n\n', measure, ' = ', value),
predictor == 2 & measure == 'measure2' ~ paste0('\n\n\n', measure, ' = ', value)
))
ggplot(d, aes(x=UrbanPop, y=Assault, fill=predictor)) +
geom_col(position=position_dodge(width = 0, preserve = "single"), width = 5) +
geom_text(data = label, aes(label = UrbanPop)) +
geom_text(data = label2,
aes(x = Inf, y = Inf, label = label, color = predictor),
hjust="right", vjust="top")
E.g., use x and y = Inf. In your example, faceting makes sense, because you pass the label to different data-sets.
library(tidyverse)
d <- data.frame(USArrests)
d$predictor <- factor(c(rep(c(1, 2), times = 25)))
label <- d %>%
group_by(predictor) %>%
filter(Assault == max(Assault))
label$measure1 <- c(1.45, 5.67)
label$measure2 <- c(4.55, 6.11)
ggplot(d, aes(x=UrbanPop, y=Assault, fill=predictor)) +
geom_col(position=position_dodge(width = 0, preserve = "single"), width = 5) +
geom_text(data = label, aes(label = UrbanPop)) +
geom_text(data = label, aes(x = Inf, y = Inf, label = measure2), hjust="right", vjust="top")+
facet_grid(~predictor)
#> Warning: position_dodge requires non-overlapping x intervals
#> Warning: position_dodge requires non-overlapping x intervals
Created on 2020-04-17 by the reprex package (v0.3.0)