Datapoints on top of complex barplots in R-software - r

I have this fake data frame. I am looking at a quicker vectorization to add data points over the barplot of means. My solution would be hard to apply when many columns are present. My problem is that only a vector and not a matrix is allowed in the "points" functions. Do you have a smart solution?
df <- data.frame(Test = 1:5,
Factor= c("A","A","B","B","A"),
V1=c(3.2,5.4,6.0,6.5,2),
V2=c(5,5,8.6,7,1))
str(df, list.len=ncol(df))
colnames(df)
dim(df)
df.agg <- aggregate(df[c(3,4)], by = list(Factor = df$Factor), mean)
df.agg <- df.agg[order(df.agg$Factor),]
df.agg
mat.agg <- as.matrix(df.agg[c(2,3)])
barx <- barplot(mat.agg,
beside = T,
ylim = c(0, 1.3*max(mat.agg)),
col = colors()[c(5,16)][df.agg$Factor],
legend.text = as.character(df.agg$Factor))
barx
barx <- as.vector(barx)
barx
points(
rep(barx[1], length(df[df$Factor == levels(df$Factor)[1], "V1"])),
df[df$Factor == levels(df$Factor)[1], "V1"])
points(
rep(barx[2], length(df[df$Factor == levels(df$Factor)[2], "V1"])),
df[df$Factor == levels(df$Factor)[2], "V1"])
points(
rep(barx[3], length(df[df$Factor == levels(df$Factor)[1], "V2"])),
df[df$Factor == levels(df$Factor)[1], "V2"])
points(
rep(barx[4], length(df[df$Factor == levels(df$Factor)[2], "V2"])),
df[df$Factor == levels(df$Factor)[2], "V2"])

You can try to use the tidyverse:
library(tidyverse)
df %>%
gather(key, value, -Test, -Factor) %>%
ggplot(aes(x = key, y = value, fill=Factor)) +
geom_bar(stat = "summary", fun.y = "mean",position = "dodge") +
geom_point(position=position_dodge(0.9))
In base R I would do:
library(reshape2)
df_wide <- melt(df[,-1]) # make your data wide
df_wide <- df_wide[ order(df_wide$variable,df_wide$Factor),] # order appropriate
# add the x-positions using interaction
df_wide$X <- barx[as.numeric(interaction(df_wide$Factor, df_wide$variable))]
# add the points to the bars
points(df_wide$X, df_wide$value)

Related

How to visualize similar resistance pattern in a plot using R

I have a large dataset in which I want to group similar resistance patterns together. A plot to visualize similarity of resistance pattern is needed.
dat <- read.table(text="Id Resistance.Pattern
A SSRRSSSSR
B SSSRSSSSR
C RRRRSSRRR
D SSSSSSSSS
E SSRSSSSSR
F SSSRRSSRR
G SSSSR
H SSSSSSRRR
I RRSSRRRSS", header=TRUE)
I would separate out the values into a wider dataframe and then make a heatmap and dendrogram to compare sillimanites in patterns:
library(tidyverse)
library(ggdendro)
recode_dat <- dat |>
mutate(pat = str_split(Resistance.Pattern, "")) |>
unnest_wider(pat, names_sep = "_") |>
select(starts_with("pat_")) |>
mutate(across(everything(), ~case_when(. == "S" ~ 1, . == "R" ~ 2, is.na(.) ~0)))
rownames(recode_dat) <- dat$Id
dendro <- as.dendrogram(hclust(d = dist(x = scale(recode_dat))))
dendro_plot <- ggdendrogram(data = dendro, rotate = TRUE)
heatmap_plot <- dat |>
mutate(pat = str_split(Resistance.Pattern, "")) |>
unnest_wider(pat, names_sep = "_") |>
pivot_longer(cols = starts_with("pat_"), names_to = "pattern_position") |>
mutate(Id = factor(Id, levels = dat$Id[order.dendrogram(dendro)])) |>
ggplot(aes(pattern_position, Id))+
geom_tile(aes(fill = value))+
scale_x_discrete(labels = \(x) sub(".*_(\\d+$)", "\\1", x))+
theme(legend.position = "top")
cowplot::plot_grid(heatmap_plot, dendro_plot,nrow = 1, align = "h", axis = "tb")
It sounds as though the second column of your data frame represents sensitivity (S) and resistance (R), presumably to antibiotics (though this is not clear in your question). That being the case, you are presumably looking for something like this:
library(tidyverse)
p <- strsplit(dat$Resistance.Pattern, "")
do.call(rbind, lapply(p, \(x) c(x, rep(NA, max(lengths(p)) - length(x))))) %>%
as.data.frame() %>%
cbind(Id = dat$Id) %>%
mutate(Id = factor(Id, rev(Id))) %>%
pivot_longer(V1:V9) %>%
ggplot(aes(name, Id, fill = value)) +
geom_tile(col = "white", size = 2) +
coord_equal() +
scale_fill_manual(values = c("#e02430", "#d8d848"),
labels = c("Resistant", "Sensitive"),
na.value = "gray95") +
scale_x_discrete(name = "Antibiotic", position = "top",
labels = 1:9) +
labs(fill = "Resistance", y = "ID") +
theme_minimal(base_size = 20) +
theme(text = element_text(color = "gray30"))
I'd separate the entries by character, convert the binary data to numeric and plot the matrix as a heatmap and show the character string as rownames.
Whether to use a row and/or column clustering depends on whats desired.
library(dplyr)
library(tidyr) # for unnest_wider
library(gplots) # for heatmap.2
mm <-
dat %>%
group_by(Resistance.Pattern) %>%
summarize(Id, Resistance.Pattern) %>%
mutate(binary = strsplit(Resistance.Pattern, "")) %>%
unnest_wider(binary, names_sep="") %>%
mutate(across(starts_with("binary"), ~ as.numeric(c(R = 1, S = 0)[.x])))
mm2 <- as.matrix(mm[, -c(1,2)]) |> unname() # the numeric part
rownames(mm2) <- apply(as.matrix(mm[,1:2]), 1, paste, collapse=" ")
heatmap.2(mm2, trace="none", Colv="none", dendrogram="row",
col=c("green", "darkgreen"), margins=c(10,10))

Using assign within ggplot loop gives incorrect plots

I'm creating three plots in a loop over I and using assign to save each plot. The y variable is scaled by the loop index. The scaling should ensure that the final panel of plots each has y going from 0 to 1. This isn't happening and the plots seem to be being changed as the loop runs. I'd be grateful if someone could explain this apparently odd behaviour.
library(dplyr)
library(ggplot2)
library(gridExtra)
loci = c(1,2,3)
x <- seq(0,1,0.01)
df <- expand.grid(x = x, loci = loci)
df <- df %>% mutate(y = loci * x)
cols = c("red", "blue", "green")
for (i in loci){
plot_this <- df %>% filter(loci == i)
my_plot = ggplot(plot_this) +
geom_point( aes( x = x, y = y/i), colour = cols[i]) +
ylim(0,3) + ggtitle(paste0("i = ", i))
assign(paste0("plot_", i), my_plot)
print(plot_1)
}
grid.arrange(plot_1, plot_2, plot_3, ncol = 3)
It's due to the lazy evaluation nature of ggplot, and more explanation can be found in this post.
"Looping" with lapply avoids the problem.
Data
library(ggplot2)
library(gridExtra)
library(dplyr)
loci = c(1,2,3)
x <- seq(0,1,0.01)
df <- expand.grid(x = x, loci = loci)
df <- df %>% mutate(y = loci * x)
cols = c("red", "blue", "green")
Code
my_plot <- lapply(loci, function(i) {
df %>%
filter(loci == i) %>%
ggplot() +
geom_point(aes(x = x, y = y/i), colour = cols[i]) +
ylim(0,3) +
ggtitle(paste0("i = ", i))
})
grid.arrange(my_plot[[1]], my_plot[[2]], my_plot[[3]], ncol = 3)
Created on 2022-04-26 by the reprex package (v2.0.1)

Adding the median value to every box (not ggplot)

I have a boxplot from the code below and i want to add median values.
boxplot(ndvi_pct_sep~edge_direction, data= data_sample, subset = edge_direction %in% c(64,4, 1,16),ylab="NDVI2028-2016", xlab="Forest edge direction",names=c("north", "south", "east", "west"))
.
I want to add the median values to the boxplots, any idea how to do it?
It will likely involve using legends - since I don't have your data I cant make it perfect, but the below code should get you started using the ToothGrowth data contained in R. I am showing a base R and ggplot example (I know you said no ggplot, but others may use it).
# Load libraries
library(dplyr); library(ggplot2)
# get median data
mediandata <- ToothGrowth %>% group_by(dose) %>% summarise(median = median(len, na.rm = TRUE))
l <- unname(unlist(mediandata))
tg <- ToothGrowth # for convenience
tg$dose <- as.factor(tg$dose)
### Base R approach
boxplot(len ~ dose, data = tg,
main = "Guinea Pigs' Tooth Growth",
xlab = "Vitamin C dose mg",
ylab = "tooth length", col = "red")
for (i in 1:3){
legend(i-0.65,l[i+3]+5, legend = paste0("Median: ",l[i+3]), bty = "n")
}
### ggplot approach
ggplot(data = tg, aes(dose, len)) +
theme_classic() + theme(legend.position = "none") +
geom_boxplot()+
annotate("text",
x = c(1,2,3),
y = l[4:6]+1, # shit so you can read it
label = l[4:6])
Base R:
ggplot:
Here's a straightforward solution with text and without forloop:
Toy data:
set.seed(12)
df <- data.frame(
var1 = sample(LETTERS[1:4], 100, replace = TRUE),
var2 = rnorm(100)
)
Calculate the medians:
library(dplyr)
med <- df %>%
group_by(var1) %>%
summarise(medians = median(var2)) %>%
pull(medians)
Alternatively, in base R:
bx <- boxplot(df$var2 ~ df$var1)
med <- bx$stats[3,1:4]
Boxplot:
boxplot(df$var2 ~ df$var1)
Annotate boxplots:
text(1:4, med, round(med,3), pos = 3, cex = 0.6)
You can do
b <- boxplot(count ~ spray, data = InsectSprays, col = "lightgray", boxwex=.2)
s <- b$stats
text(1:ncol(s)+.4, s[3,], round(s[3,],1), col="red")

Plot pvalue information for mean comparisons by grouping variable

I've put together a plot to view groups separately but now want to include significance levels for mean pairwise comparison in the plot. While I can do the comparison outside of the plot I'm wondering what the most efficient way of including the comparison in the plot would be?
Current Plot
library(tidyverse)
dsub <- diamonds[ sample(nrow(diamonds), 10000), ]
dsub <- dsub %>%
filter(clarity %in% c('VS2', 'VS1', 'VVS2'))
ggplot(dsub, aes(x = cut, y = carat, fill = clarity)) +
geom_boxplot(outlier.size = 0) +
geom_point(pch = 21, position = position_jitterdodge())
Now I want to add the comparisons within each level of the cut variable between all levels of the clarity variable. I prefer using ggpubr but couldn't see where this could be achieved.
EDITED to take OP preference for output into account
Ahhhh... okay well let me at least save you a bunch of vertical space and neaten things up by overcoming the fact that rstatix doesn't honor the order of your factors and ggpubr wants its groups as character not factor.
library(ggplot2)
library(dplyr)
dsub <- diamonds[ sample(nrow(diamonds), 10000), ]
dsub <- dsub %>%
filter(clarity %in% c('VS2', 'VS1', 'VVS2'))
dsub <- droplevels(dsub)
dsub_stats <-
dsub %>%
group_by(cut) %>%
rstatix::wilcox_test(carat~clarity) %>%
mutate(group1 = factor(group1,
ordered = TRUE,
levels = c("VS2", "VS1", "VVS2"))) %>%
arrange(cut, group1) %>%
mutate(group1 = as.character(group1)) %>%
rstatix::add_xy_position(x='cut')
ggpubr::ggboxplot(dsub, x = "cut", y = "carat",
color = "clarity",
add='jitter') +
ggpubr::stat_pvalue_manual(dsub_stats,
label = "p.adj.signif",
tip.length = 0.01)
Created on 2020-09-24 by the reprex package (v0.3.0)
library(tidyverse)
library(rstatix)
library(ggpubr)
dsub <- diamonds[ sample(nrow(diamonds), 10000), ]
dsub <- dsub %>%
filter(clarity %in% c('VS2', 'VS1', 'VVS2'))
dsub_stats <- dsub %>%
group_by(cut) %>%
wilcox_test(carat~clarity) %>% add_xy_position(x='cut')
ggboxplot(dsub, x = "cut", y = "carat",
color = "clarity",
add='jitter'
) +
stat_pvalue_manual(dsub_stats, label = "p.adj.signif", tip.length = 0.01)

Automatic index for x-axis in ggplot2

I want to use automatic index for x-axis in ggplot2.
My data set is followed:
library(tidyverse)
library(ElemStatLearn)
phoneme <- as_tibble(phoneme)
aa = phoneme %>%
filter(g == "aa")
This is a phoneme data, recording 256 frequencies for each 695 data.
With base code, I can do as follows:
(Let's do only 15 of 695 data)
aa[1:15, 1:256]
min_l = min( aa[1:15, 1:256] )
max_l = max( aa[1:15, 1:256] )
ii=1
plot( as.double(aa[ii, 1:256]), ylim=c(min_l,max_l), type="l", col="green", xlab="Frequency")
for( ii in 2:15 ){
lines( as.double(aa[ii,]), col="green" )
}
But when I try to do it using ggplot2, I'm confused.
What should I put in aes?
ggplot(data = aa, aes(x = 1:256, y = aa[1, 1:256])) + geom_line()
returns an error. How can I deal with it?
library(tidyverse)
library(ElemStatLearn)
phoneme <- as_tibble(phoneme)
aa = phoneme %>%
filter(g == "aa")
aa[1:15, 1:256]
min_l = min( aa[1:15, 1:256] )
max_l = max( aa[1:15, 1:256] )
ii=1
plot( as.double(aa[ii, 1:256]), ylim=c(min_l,max_l), type="l", col="green",
xlab="Frequency")
for( ii in 2:15 ){
lines( as.double(aa[ii,]), col="green" )
}
library(reshape2)
aa2 <- aa %>%
dplyr::slice(1:15) %>%
dplyr::select(-g, -speaker) %>%
t %>% as.data.frame() %>%
dplyr::add_rownames() %>%
dplyr::select(-rowname) %>%
dplyr::mutate(id = 1:256) %>%
reshape2::melt(id.vars = "id")
ggplot2::ggplot(aa2) +
geom_line(aes(x = id, y = value, col = variable), show.legend = F) +
scale_x_continuous(breaks = seq(0, 250, 50)) +
scale_y_continuous(limits = c(min_l,max_l)) +
scale_color_manual(values = rep("green", 256)) +
xlab("Frequency") +
theme_classic()
Comment:
When manipulated the dataframe to perform the transposed matrix, the object under manipulation acquires the names of the varibles in each row (rownames). So, to make the plot easier and make the df more elegant I think it is interesting to remove the names of the rows.
So it was necessary at first to include the names in the df and (dplyr::add_rownames()) later to remove the column with the names of the rows (dplyr::select(-rowname)) .
This gives a false illusion of error, but I performed in a redundant way to avoid using NULL. See link.
Editing by Gregor's comment:
aa2 <- aa %>%
dplyr::slice(1:15) %>%
dplyr::select(-g, -speaker) %>%
t %>% as.data.frame() %>%
tibble::remove_rownames() %>% # Comment
dplyr::mutate(id = 1:256) %>%
reshape2::melt(id.vars = "id")

Resources