I want to use automatic index for x-axis in ggplot2.
My data set is followed:
library(tidyverse)
library(ElemStatLearn)
phoneme <- as_tibble(phoneme)
aa = phoneme %>%
filter(g == "aa")
This is a phoneme data, recording 256 frequencies for each 695 data.
With base code, I can do as follows:
(Let's do only 15 of 695 data)
aa[1:15, 1:256]
min_l = min( aa[1:15, 1:256] )
max_l = max( aa[1:15, 1:256] )
ii=1
plot( as.double(aa[ii, 1:256]), ylim=c(min_l,max_l), type="l", col="green", xlab="Frequency")
for( ii in 2:15 ){
lines( as.double(aa[ii,]), col="green" )
}
But when I try to do it using ggplot2, I'm confused.
What should I put in aes?
ggplot(data = aa, aes(x = 1:256, y = aa[1, 1:256])) + geom_line()
returns an error. How can I deal with it?
library(tidyverse)
library(ElemStatLearn)
phoneme <- as_tibble(phoneme)
aa = phoneme %>%
filter(g == "aa")
aa[1:15, 1:256]
min_l = min( aa[1:15, 1:256] )
max_l = max( aa[1:15, 1:256] )
ii=1
plot( as.double(aa[ii, 1:256]), ylim=c(min_l,max_l), type="l", col="green",
xlab="Frequency")
for( ii in 2:15 ){
lines( as.double(aa[ii,]), col="green" )
}
library(reshape2)
aa2 <- aa %>%
dplyr::slice(1:15) %>%
dplyr::select(-g, -speaker) %>%
t %>% as.data.frame() %>%
dplyr::add_rownames() %>%
dplyr::select(-rowname) %>%
dplyr::mutate(id = 1:256) %>%
reshape2::melt(id.vars = "id")
ggplot2::ggplot(aa2) +
geom_line(aes(x = id, y = value, col = variable), show.legend = F) +
scale_x_continuous(breaks = seq(0, 250, 50)) +
scale_y_continuous(limits = c(min_l,max_l)) +
scale_color_manual(values = rep("green", 256)) +
xlab("Frequency") +
theme_classic()
Comment:
When manipulated the dataframe to perform the transposed matrix, the object under manipulation acquires the names of the varibles in each row (rownames). So, to make the plot easier and make the df more elegant I think it is interesting to remove the names of the rows.
So it was necessary at first to include the names in the df and (dplyr::add_rownames()) later to remove the column with the names of the rows (dplyr::select(-rowname)) .
This gives a false illusion of error, but I performed in a redundant way to avoid using NULL. See link.
Editing by Gregor's comment:
aa2 <- aa %>%
dplyr::slice(1:15) %>%
dplyr::select(-g, -speaker) %>%
t %>% as.data.frame() %>%
tibble::remove_rownames() %>% # Comment
dplyr::mutate(id = 1:256) %>%
reshape2::melt(id.vars = "id")
Related
I have a large dataset in which I want to group similar resistance patterns together. A plot to visualize similarity of resistance pattern is needed.
dat <- read.table(text="Id Resistance.Pattern
A SSRRSSSSR
B SSSRSSSSR
C RRRRSSRRR
D SSSSSSSSS
E SSRSSSSSR
F SSSRRSSRR
G SSSSR
H SSSSSSRRR
I RRSSRRRSS", header=TRUE)
I would separate out the values into a wider dataframe and then make a heatmap and dendrogram to compare sillimanites in patterns:
library(tidyverse)
library(ggdendro)
recode_dat <- dat |>
mutate(pat = str_split(Resistance.Pattern, "")) |>
unnest_wider(pat, names_sep = "_") |>
select(starts_with("pat_")) |>
mutate(across(everything(), ~case_when(. == "S" ~ 1, . == "R" ~ 2, is.na(.) ~0)))
rownames(recode_dat) <- dat$Id
dendro <- as.dendrogram(hclust(d = dist(x = scale(recode_dat))))
dendro_plot <- ggdendrogram(data = dendro, rotate = TRUE)
heatmap_plot <- dat |>
mutate(pat = str_split(Resistance.Pattern, "")) |>
unnest_wider(pat, names_sep = "_") |>
pivot_longer(cols = starts_with("pat_"), names_to = "pattern_position") |>
mutate(Id = factor(Id, levels = dat$Id[order.dendrogram(dendro)])) |>
ggplot(aes(pattern_position, Id))+
geom_tile(aes(fill = value))+
scale_x_discrete(labels = \(x) sub(".*_(\\d+$)", "\\1", x))+
theme(legend.position = "top")
cowplot::plot_grid(heatmap_plot, dendro_plot,nrow = 1, align = "h", axis = "tb")
It sounds as though the second column of your data frame represents sensitivity (S) and resistance (R), presumably to antibiotics (though this is not clear in your question). That being the case, you are presumably looking for something like this:
library(tidyverse)
p <- strsplit(dat$Resistance.Pattern, "")
do.call(rbind, lapply(p, \(x) c(x, rep(NA, max(lengths(p)) - length(x))))) %>%
as.data.frame() %>%
cbind(Id = dat$Id) %>%
mutate(Id = factor(Id, rev(Id))) %>%
pivot_longer(V1:V9) %>%
ggplot(aes(name, Id, fill = value)) +
geom_tile(col = "white", size = 2) +
coord_equal() +
scale_fill_manual(values = c("#e02430", "#d8d848"),
labels = c("Resistant", "Sensitive"),
na.value = "gray95") +
scale_x_discrete(name = "Antibiotic", position = "top",
labels = 1:9) +
labs(fill = "Resistance", y = "ID") +
theme_minimal(base_size = 20) +
theme(text = element_text(color = "gray30"))
I'd separate the entries by character, convert the binary data to numeric and plot the matrix as a heatmap and show the character string as rownames.
Whether to use a row and/or column clustering depends on whats desired.
library(dplyr)
library(tidyr) # for unnest_wider
library(gplots) # for heatmap.2
mm <-
dat %>%
group_by(Resistance.Pattern) %>%
summarize(Id, Resistance.Pattern) %>%
mutate(binary = strsplit(Resistance.Pattern, "")) %>%
unnest_wider(binary, names_sep="") %>%
mutate(across(starts_with("binary"), ~ as.numeric(c(R = 1, S = 0)[.x])))
mm2 <- as.matrix(mm[, -c(1,2)]) |> unname() # the numeric part
rownames(mm2) <- apply(as.matrix(mm[,1:2]), 1, paste, collapse=" ")
heatmap.2(mm2, trace="none", Colv="none", dendrogram="row",
col=c("green", "darkgreen"), margins=c(10,10))
I have several sets of data that I calculate binned normalized differences for. The results I want to plot within a single line plot using ggplot. The lines representing different combinations of the paired differences are supposed to be distinguished by colors and line types.
I am stuck on taking the computed values from the bins (would be y-axis values now), and plotting these onto an x-axis.
Below is the code I use for importing the data and calculating the normalized differences.
# Read data from column 3 as data table for different number of rows
# you could use replicate here for test
# dat1 <- data.frame(replicate(1,sample(25:50,10000,rep=TRUE)))
# dat2 <- data.frame(replicate(1,sample(25:50,9500,rep=TRUE)))
dat1 <- fread("/dir01/a/dat01.txt", header = FALSE, data.table=FALSE, select=c(3))
dat2 <- fread("/dir02/c/dat02.txt", header = FALSE, data.table=FALSE, select=c(3))
# Change column names
colnames(dat1) <- c("Dat1")
colnames(dat2) <- c("Dat2")
# Perhaps there is a better way to compute the following as all-in-one? I have broken these down step by step.
# 1) Sum for each bin
bin1 = cut(dat1$Dat1, breaks = seq(25, 50, by = 2))
sum1 = tapply(dat1$Dat1, bin1, sum)
bin2 = cut(dat2$Dat2, breaks = seq(25, 50, by = 2))
sum2 = tapply(dat2$Dat2, bin2, sum)
# 2) Total sum of all bins
sumt1 = sum(sum1)
sumt2 = sum(sum2)
# 3) Divide each bin by total sum of all bins
sumn1 = lapply(sum1, `/`, sumt1)
sumn2 = lapply(sum2, `/`, sumt2)
# 4) Convert to data frame as I'm not sure how to difference otherwise
df_sumn1 = data.frame(sumn1)
df_sumn2 = data.frame(sumn2)
# 5) Difference between the two as percentage
dbin = (df_sumn1 - df_sumn2)*100
How can I plot those results using ggplot() and geom_line()?
I want
dbin values on the x-axis ranging from 25-50
different colors and line types for the lines
Here is what I tried:
p1 <- ggplot(dbin, aes(x = ?, color=Data, linetype=Data)) +
geom_line() +
scale_linetype_manual(values=c("solid")) +
scale_x_continuous(limits = c(25, 50)) +
scale_color_manual(values = c("#000000"))
dput(dbin) outputs:
structure(list(X.25.27. = -0.0729132928804117, X.27.29. = -0.119044772581772,
X.29.31. = 0.316016473225017, X.31.33. = -0.292812782147632,
X.33.35. = 0.0776336591308158, X.35.37. = 0.0205584754637611,
X.37.39. = -0.300768421159599, X.39.41. = -0.403235174844081,
X.41.43. = 0.392510458816457, X.43.45. = 0.686758883448307,
X.45.47. = -0.25387105113263, X.47.49. = -0.0508324553382303), class = "data.frame", row.names = c(NA,
-1L))
Edit
The final piece of code that works, using only the dbin and plots multiple dbins:
dat1 <- data.frame(a = replicate(1,sample(25:50,10000,rep=TRUE, prob = 25:0/100)))
dat2 <- data.frame(a = replicate(1,sample(25:50,9500,rep=TRUE, prob = 0:25/100)))
dat3 <- data.frame(a = replicate(1,sample(25:50,9500,rep=TRUE, prob = 12:37/100)))
dat4 <- data.frame(a = replicate(1,sample(25:50,9500,rep=TRUE, prob = 37:12/100)))
calc_bin_props <- function(data) {
as_tibble(data) %>%
mutate(bin = cut(a, breaks = seq(25, 50, by = 2))) %>%
group_by(bin) %>%
summarise(sum = sum(a), .groups = "drop") %>%
filter(!is.na(bin)) %>%
ungroup() %>%
mutate(sum = sum / sum(sum))
}
diff_data <-
full_join(
calc_bin_props(data = dat1),
calc_bin_props(dat2),
by = "bin") %>%
separate(bin, c("trsh", "bin", "trshb", "trshc")) %>%
mutate(dbinA = (sum.x - sum.y * 100)) %>%
select(-starts_with("trsh"))
diff_data2 <-
full_join(
calc_bin_props(data = dat3),
calc_bin_props(dat4),
by = "bin") %>%
separate(bin, c("trsh", "bin", "trshb", "trshc")) %>%
mutate(dbinB = (sum.x - sum.y * 100)) %>%
select(-starts_with("trsh"))
# Combine two differences, and remove sum.x and sum.y
full_data <- cbind(diff_data, diff_data2[,4])
full_data <- full_data[,-c(2:3)]
# Melt the data to plot more than 1 variable on a plot
m <- melt(full_data, id.vars="bin")
theme_update(plot.title = element_text(hjust = 0.5))
ggplot(m, aes(as.numeric(bin), value, col=variable, linetype = variable)) +
geom_line() +
scale_linetype_manual(values=c("solid", "longdash")) +
scale_color_manual(values = c("black", "black"))
dev.off()
library(tidyverse)
Creating example data as shown in question, but adding different probabilities to the two sample() calls, to create so visible difference
between the two sets of randomized data.
dat1 <- data.frame(a = replicate(1,sample(25:50,10000,rep=TRUE, prob = 25:0/100))) %>% as_tibble()
dat2 <- data.frame(a = replicate(1,sample(25:50,9500,rep=TRUE, prob = 0:25/100))) %>% as_tibble()
Using dplyr we can handle this within data.frames (tibbles) without
the need to switch to other datatypes.
Let’s define a function that can be applied to both datasets to get
the preprocessing done.
We use base::cut() to create
a new column that pairs each value with its bin. We then group the data
by bin, calculate the sum for each bin and finally divide the bin sums
by the total sum.
calc_bin_props <- function(data) {
as_tibble(data) %>%
mutate(bin = cut(a, breaks = seq(25, 50, by = 2), labels = seq(25, 48, by = 2))) %>%
group_by(bin) %>%
summarise(sum = sum(a), .groups = "drop") %>%
filter(!is.na(bin)) %>%
ungroup() %>%
mutate(sum = sum / sum(sum))
}
Now we call calc_bin_props() on both datasets and join them by bin.
This gives us a dataframe with the columns bin, sum.x and sum.y.
The latter two are correspond to the bin sums derived from dat1 and
dat2. With the mutate() line we calculate the differences between the
two columns.
diff_data <-
full_join(
calc_bin_props(data = dat1),
calc_bin_props(dat2),
by = "bin") %>%
mutate(dbin = (sum.x - sum.y),
bin = as.numeric(as.character(bin))) %>%
select(-starts_with("trsh"))
Before we feed the data into ggplot() we convert it to the long
format using pivot_longer() this allows us to instruct ggplot() to
plot the results for sum.x, sum.y and dbin as separate lines.
diff_data %>%
pivot_longer(-bin) %>%
ggplot(aes(as.numeric(bin), value, color = name, linetype = name)) +
geom_line() +
scale_linetype_manual(values=c("longdash", "solid", "solid")) +
scale_color_manual(values = c("black", "purple", "green"))
I've put together a plot to view groups separately but now want to include significance levels for mean pairwise comparison in the plot. While I can do the comparison outside of the plot I'm wondering what the most efficient way of including the comparison in the plot would be?
Current Plot
library(tidyverse)
dsub <- diamonds[ sample(nrow(diamonds), 10000), ]
dsub <- dsub %>%
filter(clarity %in% c('VS2', 'VS1', 'VVS2'))
ggplot(dsub, aes(x = cut, y = carat, fill = clarity)) +
geom_boxplot(outlier.size = 0) +
geom_point(pch = 21, position = position_jitterdodge())
Now I want to add the comparisons within each level of the cut variable between all levels of the clarity variable. I prefer using ggpubr but couldn't see where this could be achieved.
EDITED to take OP preference for output into account
Ahhhh... okay well let me at least save you a bunch of vertical space and neaten things up by overcoming the fact that rstatix doesn't honor the order of your factors and ggpubr wants its groups as character not factor.
library(ggplot2)
library(dplyr)
dsub <- diamonds[ sample(nrow(diamonds), 10000), ]
dsub <- dsub %>%
filter(clarity %in% c('VS2', 'VS1', 'VVS2'))
dsub <- droplevels(dsub)
dsub_stats <-
dsub %>%
group_by(cut) %>%
rstatix::wilcox_test(carat~clarity) %>%
mutate(group1 = factor(group1,
ordered = TRUE,
levels = c("VS2", "VS1", "VVS2"))) %>%
arrange(cut, group1) %>%
mutate(group1 = as.character(group1)) %>%
rstatix::add_xy_position(x='cut')
ggpubr::ggboxplot(dsub, x = "cut", y = "carat",
color = "clarity",
add='jitter') +
ggpubr::stat_pvalue_manual(dsub_stats,
label = "p.adj.signif",
tip.length = 0.01)
Created on 2020-09-24 by the reprex package (v0.3.0)
library(tidyverse)
library(rstatix)
library(ggpubr)
dsub <- diamonds[ sample(nrow(diamonds), 10000), ]
dsub <- dsub %>%
filter(clarity %in% c('VS2', 'VS1', 'VVS2'))
dsub_stats <- dsub %>%
group_by(cut) %>%
wilcox_test(carat~clarity) %>% add_xy_position(x='cut')
ggboxplot(dsub, x = "cut", y = "carat",
color = "clarity",
add='jitter'
) +
stat_pvalue_manual(dsub_stats, label = "p.adj.signif", tip.length = 0.01)
Here's a code block:
# scale the log of price per group (cut)
my_diamonds <- diamonds %>%
mutate(log_price = log(price)) %>%
group_by(cut) %>%
mutate(scaled_log_price = scale(log_price) %>% as.numeric) %>% # scale within each group as opposed to overall
nest() %>%
mutate(mean_log_price = map_dbl(data, ~ .x$log_price %>% mean)) %>%
mutate(sd_log_price = map_dbl(data, ~ .x$log_price %>% sd)) %>%
unnest %>%
select(cut, price, price_scaled:sd_log_price) %>%
ungroup
# for each cut, find the back transformed actual values (exp) of each unit of zscore between -3:3
for (i in -3:3) {
my_diamonds <- my_diamonds %>%
mutate(!! paste0('mean_', ifelse(i < 0 , 'less_', 'plus_'), abs(i), 'z') := map2(.x = mean_log_price, .y = sd_log_price, ~ (.x + (i * .y)) %>% exp) %>% unlist)
}
my_diamonds_split <- my_diamonds %>% group_split(cut)
split_names <- my_diamonds %>% mutate(cut = as.character(cut)) %>% group_keys(cut) %>% pull(cut)
names(my_diamonds_split) <- split_names
I now have a variable my_diamonds_split that is a list of data frames. I would like to loop over these data frames and each time create a new ggplot.
I can use a custom labeller function with a single df, but I don't know how to do this within a loop:
labeller <- function(x) {
paste0(x,"\n", scales::dollar(sd(ex_df$price) * x + mean(ex_df$price)))
}
ex_df <- my_diamonds_split$Ideal
ex_df %>%
ggplot(aes(x = scaled_log_price)) +
geom_density() +
scale_x_continuous(label = labeller, limits = c(-3, 3))
This creates a plot for the 'Ideal' cut of diamonds. I also get two data points on the x axis, the zscore values at -2, 0 and 2 as well as the raw dollar values of 3.8K, 3.9K and 11.8K.
When I define the labeller function, I must specify the df to scale with. Tried instead with placing the dot instead of my_df, hoping that on each iteration ggplot would get the value of the df on any iteration:
labeller <- function(x) {
paste0(x,"\n", scales::dollar(sd(.$price) * x + mean(.$price)))
}
ex_df <- my_diamonds_split$Ideal
ex_df %>%
ggplot(aes(x = scaled_log_price)) +
geom_density() +
scale_x_continuous(label = labeller, limits = c(-3, 3))
Returns:
Error in is.data.frame(x) : object '.' not found
I then tried writing the function to accept an argument for the df to scale with:
labeller <- function(x, df) {
paste0(x,"\n", scales::dollar(sd(df$price) * x + mean(df$price)))
}
ex_df <- my_diamonds_split$Ideal
ex_df %>%
ggplot(aes(x = scaled_log_price)) +
geom_density() +
scale_x_continuous(label = labeller(df = ex_df), limits = c(-3, 3)) # because when it comes to running in real life, I will try something like labeller(df = my_diamonds_split[[i]])
Error in paste0(x, "\n", scales::dollar(sd(df$price) * x + mean(df$price))) :
argument "x" is missing, with no default
Bearing in mind that the scaling must be done per iteration, how could I loop over my_diamonds_split, and on each iteration generate a ggplot per above?
labeller <- function(x) {
# how can I make df variable
paste0(x,"\n", scales::dollar(sd(df$price) * x + mean(df$price)))
}
for (i in split_names) {
my_diamonds_split[[i]] %>%
ggplot(aes(x = scaled_log_price)) +
geom_density() +
scale_x_continuous(label = labeller, # <--- here, labeller must be defined with df$price except that will difer on each iteration
limits = c(-3, 3))
}
There's a hacky way to get this result in facets. Basically, after converting to z scores, you add different amounts (say, multiples of 1000) to each group's z scores. Then you set all the breaks to this collection of points and label them with pre-calculated labels.
library(ggplot2)
library(dplyr)
f <- function(x) {
y <- diamonds$price[diamonds$cut == x]
paste(seq(-3, 3), scales::dollar(round(mean(y) + seq(-3, 3) * sd(y))), sep = "\n")
}
breaks <- as.vector(sapply(levels(diamonds$cut), f))
diamonds %>%
group_by(cut) %>%
mutate(z = scale(price) + 3 + 1000 * as.numeric(cut)) %>%
ggplot(aes(z)) +
geom_point(aes(x = z - 2, y = 1), alpha = 0) +
geom_density() +
scale_x_continuous(breaks = as.vector(sapply(1:5 * 1000, "+", 0:6)),
labels = breaks) +
facet_wrap(vars(cut), scales = "free_x") +
theme(text = element_text(size = 16),
axis.text.x = element_text(size = 6))
You would have to increase the plot size to make the dollar values more visible of course.
Created on 2020-08-04 by the reprex package (v0.3.0)
I have this fake data frame. I am looking at a quicker vectorization to add data points over the barplot of means. My solution would be hard to apply when many columns are present. My problem is that only a vector and not a matrix is allowed in the "points" functions. Do you have a smart solution?
df <- data.frame(Test = 1:5,
Factor= c("A","A","B","B","A"),
V1=c(3.2,5.4,6.0,6.5,2),
V2=c(5,5,8.6,7,1))
str(df, list.len=ncol(df))
colnames(df)
dim(df)
df.agg <- aggregate(df[c(3,4)], by = list(Factor = df$Factor), mean)
df.agg <- df.agg[order(df.agg$Factor),]
df.agg
mat.agg <- as.matrix(df.agg[c(2,3)])
barx <- barplot(mat.agg,
beside = T,
ylim = c(0, 1.3*max(mat.agg)),
col = colors()[c(5,16)][df.agg$Factor],
legend.text = as.character(df.agg$Factor))
barx
barx <- as.vector(barx)
barx
points(
rep(barx[1], length(df[df$Factor == levels(df$Factor)[1], "V1"])),
df[df$Factor == levels(df$Factor)[1], "V1"])
points(
rep(barx[2], length(df[df$Factor == levels(df$Factor)[2], "V1"])),
df[df$Factor == levels(df$Factor)[2], "V1"])
points(
rep(barx[3], length(df[df$Factor == levels(df$Factor)[1], "V2"])),
df[df$Factor == levels(df$Factor)[1], "V2"])
points(
rep(barx[4], length(df[df$Factor == levels(df$Factor)[2], "V2"])),
df[df$Factor == levels(df$Factor)[2], "V2"])
You can try to use the tidyverse:
library(tidyverse)
df %>%
gather(key, value, -Test, -Factor) %>%
ggplot(aes(x = key, y = value, fill=Factor)) +
geom_bar(stat = "summary", fun.y = "mean",position = "dodge") +
geom_point(position=position_dodge(0.9))
In base R I would do:
library(reshape2)
df_wide <- melt(df[,-1]) # make your data wide
df_wide <- df_wide[ order(df_wide$variable,df_wide$Factor),] # order appropriate
# add the x-positions using interaction
df_wide$X <- barx[as.numeric(interaction(df_wide$Factor, df_wide$variable))]
# add the points to the bars
points(df_wide$X, df_wide$value)