Related
I have a large dataset in which I want to group similar resistance patterns together. A plot to visualize similarity of resistance pattern is needed.
dat <- read.table(text="Id Resistance.Pattern
A SSRRSSSSR
B SSSRSSSSR
C RRRRSSRRR
D SSSSSSSSS
E SSRSSSSSR
F SSSRRSSRR
G SSSSR
H SSSSSSRRR
I RRSSRRRSS", header=TRUE)
I would separate out the values into a wider dataframe and then make a heatmap and dendrogram to compare sillimanites in patterns:
library(tidyverse)
library(ggdendro)
recode_dat <- dat |>
mutate(pat = str_split(Resistance.Pattern, "")) |>
unnest_wider(pat, names_sep = "_") |>
select(starts_with("pat_")) |>
mutate(across(everything(), ~case_when(. == "S" ~ 1, . == "R" ~ 2, is.na(.) ~0)))
rownames(recode_dat) <- dat$Id
dendro <- as.dendrogram(hclust(d = dist(x = scale(recode_dat))))
dendro_plot <- ggdendrogram(data = dendro, rotate = TRUE)
heatmap_plot <- dat |>
mutate(pat = str_split(Resistance.Pattern, "")) |>
unnest_wider(pat, names_sep = "_") |>
pivot_longer(cols = starts_with("pat_"), names_to = "pattern_position") |>
mutate(Id = factor(Id, levels = dat$Id[order.dendrogram(dendro)])) |>
ggplot(aes(pattern_position, Id))+
geom_tile(aes(fill = value))+
scale_x_discrete(labels = \(x) sub(".*_(\\d+$)", "\\1", x))+
theme(legend.position = "top")
cowplot::plot_grid(heatmap_plot, dendro_plot,nrow = 1, align = "h", axis = "tb")
It sounds as though the second column of your data frame represents sensitivity (S) and resistance (R), presumably to antibiotics (though this is not clear in your question). That being the case, you are presumably looking for something like this:
library(tidyverse)
p <- strsplit(dat$Resistance.Pattern, "")
do.call(rbind, lapply(p, \(x) c(x, rep(NA, max(lengths(p)) - length(x))))) %>%
as.data.frame() %>%
cbind(Id = dat$Id) %>%
mutate(Id = factor(Id, rev(Id))) %>%
pivot_longer(V1:V9) %>%
ggplot(aes(name, Id, fill = value)) +
geom_tile(col = "white", size = 2) +
coord_equal() +
scale_fill_manual(values = c("#e02430", "#d8d848"),
labels = c("Resistant", "Sensitive"),
na.value = "gray95") +
scale_x_discrete(name = "Antibiotic", position = "top",
labels = 1:9) +
labs(fill = "Resistance", y = "ID") +
theme_minimal(base_size = 20) +
theme(text = element_text(color = "gray30"))
I'd separate the entries by character, convert the binary data to numeric and plot the matrix as a heatmap and show the character string as rownames.
Whether to use a row and/or column clustering depends on whats desired.
library(dplyr)
library(tidyr) # for unnest_wider
library(gplots) # for heatmap.2
mm <-
dat %>%
group_by(Resistance.Pattern) %>%
summarize(Id, Resistance.Pattern) %>%
mutate(binary = strsplit(Resistance.Pattern, "")) %>%
unnest_wider(binary, names_sep="") %>%
mutate(across(starts_with("binary"), ~ as.numeric(c(R = 1, S = 0)[.x])))
mm2 <- as.matrix(mm[, -c(1,2)]) |> unname() # the numeric part
rownames(mm2) <- apply(as.matrix(mm[,1:2]), 1, paste, collapse=" ")
heatmap.2(mm2, trace="none", Colv="none", dendrogram="row",
col=c("green", "darkgreen"), margins=c(10,10))
Given two monthly time series data sample from this link.
I will need to create one plot containing 3 subplots: plot1 for the original values, plot2 for month over month changes, and plot3 for year over year changes.
I'm able to draw the plot with code below, but the code is too redundant. So my question is how could achieve that in a concise way? Thanks.
library(xlsx)
library(ggplot2)
library(reshape)
library(dplyr)
library(tidyverse)
library(lubridate)
library(cowplot)
library(patchwork)
df <- read.xlsx('./sample_data.xlsx', 'Sheet1')
colnames(df)
# df
cols <- c('food_index', 'energy_index')
df <- df %>% mutate(date=as.Date(date)) %>%
mutate(across(-contains('date'), as.numeric)) %>%
mutate(date= floor_date(date, 'month')) %>%
group_by(date) %>%
summarise_at(vars(cols), funs(mean(., na.rm=TRUE))) %>%
mutate(across(cols, list(yoy = ~(. - lag(., 12))/lag(., 12)))*100) %>%
mutate(across(cols, list(mom = ~(. - lag(., 1))/lag(., 1)))*100) %>%
filter(date >= '2018-01-01' & date <= '2021-12-31') %>%
as.data.frame()
df1 <- df %>%
select(!grep('mom|yoy', names(df)))
df1_long <- melt(df1, id.vars = 'date')
plot1 <- ggplot(df1_long[!is.na(df1_long$value), ],
aes(x = date,
y = value,
col = variable)) +
geom_line(size=0.6, alpha=0.5) +
geom_point(size=1, alpha=0.8) +
labs(
x='',
y='Unit: $'
)
# MoM changes
df2 <- df %>%
select(grep('date|mom', names(df)))
df2_long <- melt(df2, id.vars = 'date')
plot2 <- ggplot(df2_long[!is.na(df2_long$value), ],
aes(x = date,
y = value,
col = variable)) +
geom_line(size=0.6, alpha=0.5) +
geom_point(size=1, alpha=0.8) +
labs(
x='',
y='Unit: %'
)
# YoY changes
df3 <- df %>%
select(grep('date|yoy', names(df)))
df3_long <- melt(df3, id.vars = 'date')
plot3 <- ggplot(df3_long[!is.na(df3_long$value), ],
aes(x = date,
y = value,
col = variable)) +
geom_line(size=0.6, alpha=0.5) +
geom_point(size=1, alpha=0.8) +
labs(
x='',
y='Unit: %'
)
plot <- plot1 + plot2 + plot3 + plot_layout(ncol=1)
# plot <- plot_grid(plot1, plot2, plot3, labels = c('Value', 'MoM', 'YoY'), label_size = 12)
plot
Out:
The expected result will be similar to the plot below (the upper plot will display the original data, the middle plot will display the mom changes data, and the lower plot will display the yoy changes data):
References:
https://waterdata.usgs.gov/blog/beyond-basic-plotting/
http://www.sthda.com/english/articles/24-ggpubr-publication-ready-plots/81-ggplot2-easy-way-to-mix-multiple-graphs-on-the-same-page/
Side-by-side plots with ggplot2
Maybe this is what you are looking for? By reshaping your data to the right shape, using a plot function and e.g. purrr::map2 you could achieve your desired result without duplicating your code like so.
Using some fake random example data to mimic your true data:
library(tidyr)
library(dplyr)
library(ggplot2)
df_long <- df |>
rename(food_index_raw = food_index, energy_index_raw = energy_index) |>
pivot_longer(-date, names_to = c("variable", ".value"), names_pattern = "^(.*?_index)_(.*)$")
plot_fun <- function(x, y, ylab) {
x <- x |>
select(date, variable, value = .data[[y]]) |>
filter(!is.na(value))
ggplot(
x,
aes(
x = date,
y = value,
col = variable
)
) +
geom_line(size = 0.6, alpha = 0.5) +
geom_point(size = 1, alpha = 0.8) +
labs(
x = "",
y = ylab
)
}
yvars <- c("raw", "mom", "yoy")
ylabs <- paste0("Unit: ", c("$", "%", "%"))
plots <- purrr::map2(yvars, ylabs, plot_fun, x = df_long)
library(patchwork)
wrap_plots(plots) + plot_layout(ncol = 1)
DATA
set.seed(123)
date <- seq.POSIXt(as.POSIXct("2017-01-31"), as.POSIXct("2022-12-31"), by = "month")
food_index <- runif(length(date))
energy_index <- runif(length(date))
df <- data.frame(date, food_index, energy_index)
EDIT Adding subtitles to each plot when using patchwork is (as of the moment) a bit tricky. What I would do in this case would be to use a faceting "hack". To this end I slightly adjusted the function to take a subtitle argument and switched to purrr::pmap:
library(tidyr)
library(dplyr)
library(ggplot2)
df_long <- df |>
rename(food_index_raw = food_index, energy_index_raw = energy_index) |>
pivot_longer(-date, names_to = c("variable", ".value"), names_pattern = "^(.*?_index)_(.*)$")
plot_fun <- function(x, y, ylab, subtitle) {
x <- x |>
select(date, variable, value = .data[[y]]) |>
filter(!is.na(value))
ggplot(
x,
aes(
x = date,
y = value,
col = variable
)
) +
geom_line(size = 0.6, alpha = 0.5) +
geom_point(size = 1, alpha = 0.8) +
facet_wrap(~.env$subtitle) +
labs(
x = "",
y = ylab
) +
theme(strip.background = element_blank(), strip.text.x = element_text(hjust = 0))
}
yvars <- c("raw", "mom", "yoy")
ylabs <- paste0("Unit: ", c("$", "%", "%"))
subtitle <- c("Original", "Month-to-Month", "Year-to-Year")
plots <- purrr::pmap(list(y = yvars, ylab = ylabs, subtitle = subtitle), plot_fun, x = df_long)
library(patchwork)
wrap_plots(plots) + plot_layout(ncol = 1)
The target output is done with facets rather than stitching plots together. You could do this too if you like, but it requires reshaping your data in a different way. Which approach you take is really a matter of taste.
library(ggplot2)
library(dplyr)
yoy <- function(x) 100 * (x - lag(x, 13)) / lag(x, 12)
mom <- function(x) 100 * (x - lag(x)) / lag(x)
df %>%
mutate(date = as.Date(date, origin = "1899-12-30"),
`Actual value (Dollars).Food Index` = food_index,
`Month-on-month change (%).Food Index` = mom(food_index),
`Year-on-year change (%).Food Index` = yoy(food_index),
`Actual value (Dollars).Energy Index` = energy_index,
`Month-on-month change (%).Energy Index` = mom(energy_index),
`Year-on-year change (%).Energy Index` = yoy(energy_index)) %>%
select(-food_index, -energy_index) %>%
tidyr::pivot_longer(-1) %>%
filter(date > as.Date("2018-01-01")) %>%
tidyr::separate(name, into = c("series", "index"), sep = "\\.") %>%
ggplot(aes(date, value, color = index)) +
geom_point(na.rm = TRUE) +
geom_line() +
facet_grid(series~., scales = "free_y") +
theme_bw(base_size = 16)
Reproducible data taken from link in question
df <- structure(list(date = c(42766, 42794, 42825, 42855, 42886, 42916,
42947, 42978, 43008, 43039, 43069, 43100, 43131, 43159, 43190,
43220, 43251, 43281, 43312, 43343, 43373, 43404, 43434, 43465,
43496, 43524, 43555, 43585, 43616, 43646, 43677, 43708, 43738,
43769, 43799, 43830, 43861, 43890, 43921, 43951, 43982, 44012,
44043, 44074, 44104, 44135, 44165, 44196, 44227, 44255, 44286,
44316, 44347, 44377, 44408, 44439, 44469, 44500, 44530, 44561
), food_index = c(58.53, 61.23, 55.32, 55.34, 61.73, 56.91, 54.27,
59.08, 60.11, 66.01, 60.11, 63.41, 69.8, 72.45, 81.11, 89.64,
88.64, 88.62, 98.27, 111.11, 129.39, 140.14, 143.44, 169.21,
177.39, 163.88, 135.07, 151.28, 172.81, 143.82, 162.13, 172.22,
176.67, 179.3, 157.27, 169.12, 192.51, 194.2, 179.4, 169.1, 193.17,
174.92, 181.92, 188.41, 192.14, 203.41, 194.19, 174.3, 174.86,
182.33, 182.82, 185.36, 192.41, 195.59, 202.6, 201.51, 225.01,
243.78, 270.67, 304.57), energy_index = c(127.36, 119.87, 120.96,
112.09, 112.19, 109.24, 109.56, 106.89, 109.35, 108.35, 112.39,
117.77, 119.52, 122.24, 120.91, 125.41, 129.72, 135.25, 139.33,
148.6, 169.62, 184.23, 204.38, 198.55, 189.29, 202.47, 220.23,
240.67, 263.12, 249.74, 240.84, 243.42, 261.2, 256.76, 258.69,
277.98, 289.63, 293.46, 310.81, 318.68, 310.04, 302.17, 298.62,
260.92, 269.29, 258.84, 241.68, 224.18, 216.36, 226.57, 235.98,
253.86, 267.37, 261.99, 273.37, 280.91, 291.84, 297.88, 292.78,
289.79)), row.names = c(NA, 60L), class = "data.frame")
I have seen the answer Pie charts in geom_scatterpie overlapping.
But I do not want to reset the overlapped pie chart coordinate manually.
Is there some function in ggplot2 or something else can avoid overlap in geom_scatterpie() function?
Any help will be highly appreciated!
Reproducible code is at here:
library(scatterpie)
library(tidyverse)
library(geosphere)
library(ggnewscale)
us <- map_data('state') %>% as_tibble()
n = length(unique(us$region))
# creat fake mapping data
temperature_data <- tibble(region = unique(us$region),
temp = rnorm(n = n))
coords <- us %>% select(long, lat, region) %>% distinct(region, .keep_all = T)
category_data <- tibble(region = unique(us$region),
cat_1 = sample(1:100, size = n),
cat_2 = sample(1:100, size = n),
cat_3 = sample(1:100, size = n)) %>% left_join(coords)
us <- left_join(us, temperature_data)
p +
geom_map(map = us, aes(map_id = region, fill = temp), color = 'grey') +
new_scale('fill') +
geom_scatterpie(data = category_data,
aes(long, lat),
cols = c("cat_1", "cat_2", "cat_3"),
alpha = 0.5)
I have a dataset that looks like this:
test<-data.frame("M"=c("a","b","c","a","b","b","c","a","b","c"),
"N"=c(1,3,4,6,6,7,7,8,8,8),
"X"=c(0,1,0,1,1,0,1,0,1,1),
"Y"=c(1,1,0,0,1,0,1,1,1,0))
I'm making a simple plot where I want X and Y on the y axis, M on the x axis, each grid colored if the value of X or Y is 1 and empty if the value of X or Y is 0. I'm repeating this for each categories in N (the categories of N are 1 to 5, 6, 7, 8), then stacking all plots together. Right now, I'm doing this with the following code.
test <- test[order(test$N),]
test1 <- test[c(1:3),]
test2 <- test[c(4:5),]
test3 <- test[c(6:7),]
test4 <- test[c(8:10),] # I'm doing this to "separate" categories of `N` manually
p1 <- test1[,c(1,3:4)] %>%
gather(col_name, value, -M) %>%
ggplot(aes(factor(M), col_name, fill = value == 1))+
geom_tile(colour = 'black')+
scale_fill_manual(values = c('FALSE' = 'white', 'TRUE' = 'red'))
p2 <- test2[,c(1,3:4)] %>%
gather(col_name, value, -M) %>%
ggplot(aes(factor(M), col_name, fill = value == 1))+
geom_tile(colour = 'black')+
scale_fill_manual(values = c('FALSE' = 'white', 'TRUE' = 'yellow'))
p3 <- test3[,c(1,3:4)] %>%
gather(col_name, value, -M) %>%
ggplot(aes(factor(M), col_name, fill = value == 1))+
geom_tile(colour = 'black')+
scale_fill_manual(values = c('FALSE' = 'white', 'TRUE' = 'green'))
p4 <- test4[,c(1,3:4)] %>%
gather(col_name, value, -M) %>%
ggplot(aes(factor(M), col_name, fill = value == 1))+
geom_tile(colour = 'black')+
scale_fill_manual(values = c('FALSE' = 'white', 'TRUE' = 'blue'))
grid.arrange(p1, p2, p3, p4, ncol = 1)
I'm attaching an image of what I have right now. I want to fix these plots so that I would have the same factors of M for all four plots (right now, only p1 and p4 have all three factors (a, b and c) in the x axis but I want to add factor c to p2 and a to p3 so that all x axes are identical to each other. Can anyone give me suggestions on how to do this?
(Also, I'm suspecting that the current way I'm plotting things is probably not the most quickest/easiest way to go, if anyone has suggestions on how to improve things it'd be really helpful!)
To continue using grid.arrange(), instead of facet_wrap(), do the following:
Make M a factor:
test$M <- factor(test$M)
Add the following to each of your plots:
scale_x_discrete(limits = levels(test$M))
Maybe one approach I can suggest you is using facets after applying a smart trick to group your values and avoid splitting in different dataframes. Here the code as an option for you (The colors will be the same across the facets in base of TRUE/FALSE values):
library(tidyverse)
#Code
test %>% mutate(Var=lead(N)) %>%
mutate(Diff=Var-N,Diff=ifelse(row_number()==1,0,Diff)) %>%
mutate(Group=ifelse(Diff==0,N,NA)) %>%
fill(Group) %>% select(-c(N,Var,Diff)) %>%
group_by(Group) %>% mutate(NG=paste0('p',cur_group_id())) %>% ungroup() %>%
select(-Group) %>%
pivot_longer(cols = -c(NG,M)) %>%
ggplot(aes(factor(M), name, fill = value == 1,group=value))+
geom_tile(colour = 'black')+
facet_wrap(.~NG,ncol = 1)+
scale_fill_manual('value',values=c('tomato','cyan3'))+
xlab('M')
Output:
The othe option would be patchwork with a customized function:
library(tidyverse)
library(patchwork)
#Code
data <- test %>% mutate(Var=lead(N)) %>%
mutate(Diff=Var-N,Diff=ifelse(row_number()==1,0,Diff)) %>%
mutate(Group=ifelse(Diff==0,N,NA)) %>%
fill(Group) %>% select(-c(N,Var,Diff)) %>%
group_by(Group) %>% mutate(NG=paste0('p',cur_group_id())) %>% ungroup() %>%
select(-Group) %>%
mutate(M=factor(M,levels = unique(M),ordered = T)) %>%
pivot_longer(cols = -c(NG,M))
#List
List <- split(data,data$NG)
#Function
myfun <- function(x)
{
#Test for color
val <- unique(x$NG)
#Conditioning for color
if(val=='p1') {vcolor=c('FALSE' = 'white', 'TRUE' = 'red')} else
if(val=='p2') {vcolor=c('FALSE' = 'white', 'TRUE' = 'yellow')} else
if(val=='p3') {vcolor=c('FALSE' = 'white', 'TRUE' = 'green')} else
{vcolor=c('FALSE' = 'white', 'TRUE' = 'blue')}
#Update data
x <- x %>% mutate(M=factor(M,levels = c('a','b','c'),ordered = T)) %>% complete(M=M)
#Plot
G <- ggplot(x,aes(factor(M), name, fill = (value == 1 & !is.na(value))))+
geom_tile(colour = 'black')+
scale_fill_manual('value',values=vcolor)+
xlab('M')+
scale_y_discrete(limits=c('X','Y'))+
theme_bw()+
ggtitle(val)
return(G)
}
#Apply
Lplot <- lapply(List,myfun)
#Wrap
GF <- wrap_plots(Lplot,ncol = 1)
Output:
Something like this?
test<-data.frame("M"=c("a","b","c","a","b","b","c","a","b","c"),
"N"=c(1,3,4,6,6,7,7,8,8,8),
"X"=c(0,1,0,1,1,0,1,0,1,1),
"Y"=c(1,1,0,0,1,0,1,1,1,0))
library(tidyverse)
test = mutate(test, N2 = cut(N, breaks = c(0,5:100)))
m = pivot_longer(test, c(X, Y))
ggplot(m, aes(M, name,fill=factor(value))) +
geom_tile(colour = 'black') +
facet_wrap(~N2, scales = 'free') +
scale_fill_manual(values = c(`0` = 'white', `1` = 'red'))
Does anyone know if there is a way to add variable labels to the ggparcoord function in GGally? I've tried numerous ways with geom_text, but nothing is yielding results.
To be more explicit, I am looking to pass the row.names(mtcars) through geom_text. The only way that I can distinguish the car is passing row.names(mtcars) through the groupColumn argument, but I don't like the way this looks.
Doesn't work:
mtcars$carName <- row.names(mtcars) # This becomes column 12
library(GGally)
# Attempt 1
ggparcoord(mtcars,
columns = c(12, 1, 6),
groupColumn = 1) +
geom_text(aes(label = carName))
# Attempt 2
ggparcoord(mtcars,
columns = c(12, 1, 6),
groupColumn = 1,
mapping = aes(label = carName))
Any ideas would be appreciated!
Solution 1: If you want to stick close to your original attempt, you can calculate the appropriate y coordinates for the car names, & add that as a separate data source. Use inherit.aes = FALSE so that this geom_text layer doesn't inherit anything from the ggplot object created using ggparcoord():
library(dplyr)
p1 <- ggparcoord(mtcars,
columns = c(12, 1, 6),
groupColumn = 1) +
geom_text(data = mtcars %>%
select(carName) %>%
mutate(x = 1,
y = scale(as.integer(factor(carName)))),
aes(x = x, y = y, label = carName),
hjust = 1.1,
inherit.aes = FALSE) +
# optional: remove "carName" from x-axis labels
scale_x_discrete(labels = function(x) c("", x[-1])) +
# also optional: hide legend, which doesn't really seem relevant here
theme(legend.position = "none")
p1
Solution 2: This alternative uses carName as the group column, & doesn't pass it as one of the parallel coordinate columns. (which I think this might be closer to the use cases intended by this function...) Specifying carName as the group column allows the car name values to be captured in the data slot of the ggplot object created by ggparcoord() this time, so our geom_text label can inherit it directly, & even filter only for rows corresponding to variable == "mpg" (or whatever the first of the parallel coordinate columns is named, in the actual use case). The y coordinates are not as evenly spread out as above, but geom_text_repel from the ggrepel package does a decent job at shifting overlapping text labels away from one another.
library(dplyr)
library(ggrepel)
p2 <- ggparcoord(mtcars,
columns = c(1, 6),
groupColumn = "carName") +
geom_text_repel(data = . %>%
filter(variable == "mpg"),
aes(x = variable, y = value, label = carName),
xlim = c(NA, 1)) + # limit repel region to the left of the 1st column
theme(legend.position = "none") # as before, hide legend since the labels
# are already in the plot
p2
Solution 3 / 4: You can actually plot the same with ggplot(), without relying on extensions that may do unexpected stuff behind the scenes:
library(dplyr)
library(tidyr)
library(ggrepel)
# similar output to solution 1
p3 <- mtcars %>%
select(carName, mpg, wt) %>%
mutate(carName.column = as.integer(factor(carName))) %>%
gather(variable, value, -carName) %>%
group_by(variable) %>%
mutate(value = scale(value)) %>%
ungroup() %>%
ggplot(aes(x = variable, y = value, label = carName, group = carName)) +
geom_line() +
geom_text(data = . %>% filter(variable == "carName.column"),
hjust = 1.1) +
scale_x_discrete(labels = function(x) c("", x[-1]))
p3
# similar output to solution 2
p4 <- mtcars %>%
select(carName, mpg, wt) %>%
gather(variable, value, -carName) %>%
group_by(variable) %>%
mutate(value = scale(value)) %>%
ungroup() %>%
ggplot(aes(x = variable, y = value, label = carName, group = carName)) +
geom_line() +
geom_text_repel(data = . %>% filter(variable == "mpg"),
xlim = c(NA, 1))
p4
Edit
You can add text labels on the right as well, for each of the above. Do note that the location for labels may not be nicely spaced out, since they are positioned according to wt's scaled values:
p1 +
geom_text(data = mtcars %>%
select(carName, wt) %>%
mutate(x = 3,
y = scale(wt)),
aes(x = x, y = y, label = carName),
hjust = -0.1,
inherit.aes = FALSE)
p2 +
geom_text_repel(data = . %>%
filter(variable == "wt"),
aes(x = variable, y = value, label = carName),
xlim = c(2, NA))
p3 +
geom_text(data = . %>% filter(variable == "wt"),
hjust = -0.1)
p4 +
geom_text_repel(data = . %>% filter(variable == "wt"),
xlim = c(2, NA))