How to visualize k-means cluster using R? - r

How can I make k-means clustering for my following log2 transformed data set, something like attached image.
My sample df is like :
set.seed(5)
cnt_log2 = data.frame(replicate(6, runif(1000,0,20)), 1:10)
names(cnt_log2) = c(paste0("Col",1:6),"geneID")

I have done it using:
res_km <- kmeans(df, 5, nstart = 10)
data_plot <- data.table(melt(data.table(class = as.factor(res_km$cluster), df)))
data_plot[, Time := rep(1:ncol(df), each = nrow(df))]
data_plot[, ID := rep(1:nrow(df), ncol(df))]
head(data_plot)
# prepare centroids
centers <- data.table(melt(res_km$centers))
setnames(centers, c("Var1", "Var2"), c("class", "Time"))
centers[, ID := class]
centers[, gr := as.numeric(as.factor(Time))]
head(centers)
head(data_plot)
# plot the results
ggplot(data_plot, aes(variable, value, group = ID)) +
facet_wrap(~class, ncol = 2, scales = "free_y") +
geom_line(color = "grey10", alpha = 0.65) +
geom_line(data = centers, aes(gr, value),
color = "firebrick1", alpha = 0.80, size = 1.2) +
labs(x = "Time", y = "Load (normalised)") +
theme_bw()

Related

How to change the colors of the dots in the graph? ggpubr package

I am using the ggerrorplot () function of the ggpubr package to create the graph below. My question is whether there is any way to change the colors of the dots without changing the color of the point that represents the mean and standard deviation? Observe the image:
My code:
# loading packages
library(ggpubr)
# Create data frame
GROUP <- c()
TEST <- c()
VALUE <- c()
for (i in 0:100) {
gp <- c('Group1','Group2','Group1 and Group2')
ts <- c('Test1','Test2')
GROUP <- append(GROUP, sample(gp, 1))
TEST <- append(TEST, sample(ts, 1))
VALUE <- append(VALUE, sample(1:200, 1))
}
df <- data.frame(GROUP, TEST, VALUE)
# Seed
set.seed(123)
# Plot
ggerrorplot(df, x = "GROUP", y = "VALUE",
desc_stat = "mean_sd",
add = c("jitter"),
color = "TEST",
palette = "jco",
add.params = list(size = 0.2),
order = c('Group1','Group2','Group1 and Group2')
) +
labs(x = '', y = 'Values\n') +
theme(legend.title = element_blank())
Can you accomplish this by simply passing in color to add.params?
# loading packages
library(ggpubr)
#> Loading required package: ggplot2
# Create data frame
GROUP <- c()
TEST <- c()
VALUE <- c()
for (i in 0:100) {
gp <- c('Group1','Group2','Group1 and Group2')
ts <- c('Test1','Test2')
GROUP <- append(GROUP, sample(gp, 1))
TEST <- append(TEST, sample(ts, 1))
VALUE <- append(VALUE, sample(1:200, 1))
}
df <- data.frame(GROUP, TEST, VALUE)
# Seed
set.seed(123)
# Plot
ggerrorplot(df, x = "GROUP", y = "VALUE",
desc_stat = "mean_sd",
add = c("jitter"),
color = "TEST",
palette = "jco",
add.params = list(size = 0.2, color = "red"),
order = c('Group1','Group2','Group1 and Group2')
) +
labs(x = '', y = 'Values\n') +
theme(legend.title = element_blank())
Created on 2021-03-10 by the reprex package (v0.3.0)
Another potential workaround - replicate the plot using ggplot() and geom_linerange(), e.g.
library(ggpubr)
library(ggsci)
library(cowplot)
# Create data frame
GROUP <- c()
TEST <- c()
VALUE <- c()
for (i in 0:100) {
gp <- c('Group1','Group2','Group1 and Group2')
ts <- c('Test1','Test2')
GROUP <- append(GROUP, sample(gp, 1))
TEST <- append(TEST, sample(ts, 1))
VALUE <- append(VALUE, sample(1:200, 1))
}
df <- data.frame(GROUP, TEST, VALUE)
# Seed
set.seed(123)
data_summary <- function(data, varname, groupnames){
require(plyr)
summary_func <- function(x, col){
c(mean = mean(x[[col]], na.rm=TRUE),
sd = sd(x[[col]], na.rm=TRUE))
}
data_sum<-ddply(data, groupnames, .fun=summary_func,
varname)
data_sum <- rename(data_sum, c("mean" = varname))
return(data_sum)
}
df2 <- data_summary(df, varname = "VALUE", groupnames = c("TEST", "GROUP"))
# Plot
p1 <- ggplot(df, aes(x = factor(GROUP, levels = c('Group1','Group2','Group1 and Group2')),
y = VALUE, color = TEST)) +
geom_jitter(shape = 21, fill = "black", stroke = 0,
position = position_jitterdodge(jitter.width = 0.2)) +
geom_linerange(data = df2, aes(ymin=VALUE-sd, ymax=VALUE+sd),
position=position_dodge(width = .75)) +
geom_point(data = df2, aes(y = VALUE), size = 3,
position = position_dodge(width = 0.75)) +
scale_color_jco() +
labs(x = '', y = 'Values\n') +
theme_classic(base_size = 14) +
theme(legend.title = element_blank(),
legend.position = "top")
p2 <- ggerrorplot(df, x = "GROUP", y = "VALUE",
desc_stat = "mean_sd",
add = c("jitter"),
color = "TEST",
palette = "jco",
add.params = list(size = 0.2),
order = c('Group1','Group2','Group1 and Group2')
) +
labs(x = '', y = 'Values\n') +
theme(legend.title = element_blank())
cowplot::plot_grid(p1, p2, nrow = 1, ncol = 2, labels = "AUTO")
When you plot them side-by-side you can see that they aren't exactly the same, but this might work for you nonetheless.
Edit
An advantage of this approach is that you can adjust the 'fill' scale separately if you don't want all the dots to be the same colour, but you do want them to be different to the lines, e.g.
p1 <- ggplot(df, aes(x = factor(GROUP, levels = c('Group1','Group2','Group1 and Group2')),
y = VALUE, color = TEST)) +
geom_jitter(aes(fill = TEST), shape = 21, stroke = 0,
position = position_jitterdodge(jitter.width = 0.2)) +
geom_linerange(data = df2, aes(ymin=VALUE-sd, ymax=VALUE+sd),
position=position_dodge(width = .75)) +
geom_point(data = df2, aes(y = VALUE), size = 3,
position = position_dodge(width = 0.75)) +
scale_color_jco() +
scale_fill_npg() +
labs(x = '', y = 'Values\n') +
theme_classic(base_size = 14) +
theme(legend.title = element_blank(),
legend.position = "top")
p2 <- ggerrorplot(df, x = "GROUP", y = "VALUE",
desc_stat = "mean_sd",
add = c("jitter"),
color = "TEST",
palette = "jco",
add.params = list(size = 0.2),
order = c('Group1','Group2','Group1 and Group2')
) +
labs(x = '', y = 'Values\n') +
theme(legend.title = element_blank())
cowplot::plot_grid(p1, p2, nrow = 1, ncol = 2, labels = "AUTO")

Is there a way to label each cluster generated by lda()

using lda() and ggplot2 I can make a canonical plot with confidence ellipses. Is there a way to add labels for each group on the plot (labeling each cluster with a group from figure legend)?
# for the universality lda(Species~., data=iris) would be analogous
m.lda <- lda(Diet ~ ., data = b)
m.sub <- b %>% dplyr::select(-Diet) %>% as.matrix
CVA.scores <- m.sub %*% m.lda$scaling
m.CV <- data.frame(CVA.scores)
m.CV$Diet <- b$Diet
m.cva.plot <-
ggplot(m.CV, aes(x = LD1, y = LD2)) +
geom_point(aes(color=Diet), alpha=0.5) +
labs(x = "CV1", y = "CV2") +
coord_fixed(ratio=1)
chi2 = qchisq(0.05,2, lower.tail=FALSE)
CIregions.mean.and.pop <-
m.CV %>%
group_by(Diet) %>%
summarize(CV1.mean = mean(LD1),
CV2.mean = mean(LD2),
mean.radii = sqrt(chi2/n()),
popn.radii = sqrt(chi2))
m.cva.plot2 <-
m.cva.plot +
geom_circle(data = CIregions.mean.and.pop,
mapping = aes(x0 = CV1.mean, y0 = CV2.mean, r = mean.radii),
inherit.aes = FALSE) +
geom_circle(data = CIregions.mean.and.pop,
mapping = aes(x0 = CV1.mean, y0 = CV2.mean, r = popn.radii),
linetype = "dashed",
inherit.aes = FALSE)
The labels can be placed with either geom_text or geom_label. In the case below I will use geom_label, with the y coordinate adjusted by adding popn.radii the radii of the outer circles.
The code in the question is adapted to use built-in data set iris, like the question itself says.
m.cva.plot2 +
geom_label(data = CIregions.mean.and.pop,
mapping = aes(x = CV1.mean,
y = CV2.mean + popn.radii,
label = Species),
label.padding = unit(0.20, "lines"),
label.size = 0)
Reproducible code
library(dplyr)
library(ggplot2)
library(ggforce)
library(MASS)
b <- iris
m.lda <- lda(Species~., data=iris) #would be analogous
#m.lda <- lda(Diet ~ ., data = b)
m.sub <- b %>% dplyr::select(-Species) %>% as.matrix
CVA.scores <- m.sub %*% m.lda$scaling
m.CV <- data.frame(CVA.scores)
m.CV$Species <- b$Species
m.cva.plot <-
ggplot(m.CV, aes(x = LD1, y = LD2)) +
geom_point(aes(color=Species), alpha=0.5) +
labs(x = "CV1", y = "CV2") +
coord_fixed(ratio=1)
chi2 = qchisq(0.05,2, lower.tail=FALSE)
CIregions.mean.and.pop <-
m.CV %>%
group_by(Species) %>%
summarize(CV1.mean = mean(LD1),
CV2.mean = mean(LD2),
mean.radii = sqrt(chi2/n()),
popn.radii = sqrt(chi2))
m.cva.plot2 <-
m.cva.plot +
geom_circle(data = CIregions.mean.and.pop,
mapping = aes(x0 = CV1.mean, y0 = CV2.mean, r = mean.radii),
inherit.aes = FALSE) +
geom_circle(data = CIregions.mean.and.pop,
mapping = aes(x0 = CV1.mean, y0 = CV2.mean, r = popn.radii),
linetype = "dashed",
inherit.aes = FALSE)

Mutiple functions and ggplots using a for loop

I want to generate a number of plots of linear regressions (bacterial OTUs plotted against temperature) using ggplot. I want the titles of the plots to be the linear regression equation, which I am determining with a function. The code works when I make the plots individually but not when I use a for loop.
I keep getting the following error:
Error in model.frame.default(formula = taxa_list[i] ~ Temperature, data = dataframe, :
variable lengths differ (found for 'Temperature')
See below for my code. Do I need a nested for loop to make this work?
taxa_list <- c("Vibrio","Salmonella","Campylobacter","Listeria","Streptococcus","Legionella")
taxa_list <- sort(taxa_list)
for (i in seq_along(taxa_list)) {
lm_eqn <- function(dataframe) {
m <- lm(taxa_list[i] ~ Temperature, dataframe)
p <- summary(m)
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2 %.% italic(x)*","~~italic(p)~"="~p0,
list(a = format(unname(coef(m)[1]), digits = 2),
b = format(unname(coef(m)[2]), digits = 2),
r2 = format(summary(m)$r.squared, digits = 3),
p0 = format(p$coefficients[8], digits = 3)))
as.expression(eq);
}
plot <- ggplot(data = all_data, aes(x = Temperature, y = taxa_list[i], fill = taxa_list[i])) +
geom_point(data = all_data, aes(x = Temperature, y = taxa_list[i]), color = "black", size = 3) +
geom_smooth(method = "lm", size = 1, color = "black", fill = "gray") +
labs(title = lm_eqn(dataframe = all_data), subtitle = "") + xlab("Temperature") + ylab("Number of OTUs")
print(plot)
}
I tried to rewrite your code to make it more readable, efficient and maintainable. I used tidyverse choices. I believe there was an extra * x in your original eq function that I removed.
library(dplyr)
library(ggplot2)
library(purrr)
library(broom)
taxa_list <- c("Vibrio","Salmonella","Campylobacter","Listeria","Streptococcus","Legionella")
taxa_list <- sort(taxa_list)
MyFunctionNew <- function(data, bacteria, temperature)
{
my_lm <- lm(as.formula(paste(bacteria, "~", temperature)), data = data)
terms_info <- broom::tidy(my_lm)
model_info <- broom::glance(my_lm)
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2 *","~~italic(p)~"="~p0,
list(a = format(terms_info$estimate[1], digits = 2),
b = format(terms_info$estimate[2], digits = 2),
r2 = format(model_info$r.squared, digits = 3),
p0 = format(model_info$p.value, digits = 3)))
plot <- ggplot(data = data, aes_string(x = temperature, y = bacteria, fill = bacteria)) +
geom_point(size = 3, show.legend = TRUE) +
geom_smooth(method = "lm", size = 1, color = "black", fill = "gray") +
labs(title = eq, subtitle = "") + xlab("Temperature") + ylab("Number of OTUs")
return(plot)
}
MyFunctionNew(dat1, "Vibrio", "Temperature")
#> `geom_smooth()` using formula 'y ~ x'
purrr::map(taxa_list, ~ MyFunctionNew(dat1, .x, "Temperature"))
#> [[1]]
#> `geom_smooth()` using formula 'y ~ x'
Here's some made up data that should more or less be close enough
set.seed(1111)
dat1 <- data.frame(Temperature = runif(200, min = 32, max = 100),
Vibrio = rnorm(200),
Salmonella = rnorm(200),
Campylobacter = rnorm(200),
Listeria = rnorm(200),
Streptococcus = rnorm(200),
Legionella = rnorm(200)
)

Significance annotation in facets

I am trying to annotate the plot below in a pairwise fashion - in each facet compare corresponding samples in the variable. Essentially comparing CTR from pos to CTR from neg and so on. I can't seem to get it to work.
Here is my data and plots:
library(ggpubr)
#data.frame
samples <- rep(c('LA', 'EA', 'CTR'), 300)
variable <- sample(c('pos', 'neg'), 900, replace = T)
stim <- rep(c('rp','il'), 450)
population <- sample(c('EM','CM','TEMRA'), 900, replace = T)
values <- runif(900, min = 0, max = 100)
df <- data.frame(samples, variable, stim, population, values)
#test and comparisons
test_comparisons <- list(c('neg', 'pos'))
test <- compare_means(values ~ variable, data = df, method = 'wilcox.test',
group.by = c('samples', 'stim', 'population'))
#plot
ggplot(aes(x= variable, y = values, fill = samples), data = df) +
geom_boxplot(position = position_dodge(0.85)) +
geom_dotplot(binaxis='y', stackdir='center', position =
position_dodge(0.85), dotsize = 1.5) +
facet_grid(population ~ stim, scales = 'free_x') +
stat_compare_means(comparisons = test_comparisons, label = 'p.signif') +
theme_bw()
This only produces 1 comparison per facet between pos and neg instead of 3...What am I doing wrong?
You can use the following code:
samples <- rep(c('LA', 'EA', 'CTR'), 300)
variable <- sample(c('pos', 'neg'), 900, replace = T)
stim <- rep(c('rp','il'), 450)
population <- sample(c('EM','CM','TEMRA'), 900, replace = T)
values <- runif(900, min = 0, max = 100)
df <- data.frame(samples, variable, stim, population, values)
#test and comparisons
test_comparisons <- list(c('neg', 'pos'))
test <- compare_means(values ~ variable, data = df, method = 'wilcox.test',
group.by = c('samples', 'stim', 'population'))
#plot
ggplot(aes(x= variable, y = values, fill = samples), data = df) +
geom_boxplot(position = position_dodge(0.85)) +
geom_dotplot(binaxis='y', stackdir='center', position =
position_dodge(0.85), dotsize = 1.5) +
facet_grid(population ~ stim+samples, scales = 'free_x') +
stat_compare_means(comparisons = test_comparisons, label = 'p.signif') +
theme_bw()
Hope this will rectify your problem

Create dynamic labels for geom_smooth lines

I have a changing df and I am grouping different values c.
With ggplot2 I plot them with the following code to get a scatterplott with multiple linear regression lines (geom_smooth)
ggplot(aes(x = a, y = b, group = c)) +
geom_point(shape = 1, aes(color = c), alpha = alpha) +
geom_smooth(method = "lm", aes(group = c, color = c), se = F)
Now I want to display on each geom_smooth line in the plot a label with the value of the group c.
This has to be dynamic, because I can not write new code when my df changes.
Example: my df looks like this
a b c
----------------
1.6 24 100
-1.4 43 50
1 28 100
4.3 11 50
-3.45 5.2 50
So in this case I would get 3 geom_smooth lines in the plot with different colors.
Now I simply want to add a text label to the plot with "100" next to the geom_smooth with the group c = 100 and a text label with "50"to the line for the group c = 50, and so on... as new groups get introduced in the df, new geom_smooth lines are plotted and need to be labeled.
the whole code for the plot:
ggplot(aes(x = a, y = b, group = c), data = df, na.rm = TRUE) +
geom_point(aes(color = GG, size = factor(c)), alpha=0.3) +
scale_x_continuous(limits = c(-200,2300))+
scale_y_continuous(limits = c(-1.8,1.5))+
geom_hline(yintercept=0, size=0.4, color="black") +
scale_color_distiller(palette="YlGnBu", na.value="white") +
geom_smooth(method = "lm", aes(group = factor(GG), color = GG), se = F) +
geom_label_repel(data = labelInfo, aes(x= max, y = predAtMax, label = label, color = label))
You can probably do it if you pick the location you want the lines labelled. Below, I set them to label at the far right end of each line, and used ggrepel to avoid overlapping labels:
library(ggplot2)
library(ggrepel)
library(dplyr)
set.seed(12345)
df <-
data.frame(
a = rnorm(100,2,0.5)
, b = rnorm(100, 20, 5)
, c = factor(sample(c(50,100,150), 100, TRUE))
)
labelInfo <-
split(df, df$c) %>%
lapply(function(x){
data.frame(
predAtMax = lm(b~a, data=x) %>%
predict(newdata = data.frame(a = max(x$a)))
, max = max(x$a)
)}) %>%
bind_rows
labelInfo$label = levels(df$c)
ggplot(
df
, aes(x = a, y = b, color = c)
) +
geom_point(shape = 1) +
geom_smooth(method = "lm", se = F) +
geom_label_repel(data = labelInfo
, aes(x= max
, y = predAtMax
, label = label
, color = label))
This method might work for you. It uses ggplot_build to access the rightmost point in the actual geom_smooth lines to add a label by it. Below is an adaptation that uses Mark Peterson's example.
library(ggplot2)
library(ggrepel)
library(dplyr)
set.seed(12345)
df <-
data.frame(
a = rnorm(100,2,0.5)
, b = rnorm(100, 20, 5)
, c = factor(sample(c(50,100,150), 100, TRUE))
)
p <-
ggplot(df, aes(x = a, y = b, color = c)) +
geom_point(shape = 1) +
geom_smooth(method = "lm", se = F)
p.smoothedmaxes <-
ggplot_build(p)$data[[2]] %>%
group_by( group) %>%
filter( x == max(x))
p +
geom_text_repel( data = p.smoothedmaxes,
mapping = aes(x = x, y = y, label = round(y,2)),
col = p.smoothedmaxes$colour,
inherit.aes = FALSE)
This came up for me today and I landed on this solution with data = ~fn()
library(tidyverse)
library(broom)
mpg |>
ggplot(aes(x = displ, y = hwy, colour = class, label = class)) +
geom_count(alpha = 0.1) +
stat_smooth(alpha = 0.6, method = lm, geom = "line", se = FALSE) +
geom_text(
aes(y = .fitted), size = 3, hjust = 0, nudge_x = 0.1,
data = ~{
nest_by(.x, class) |>
summarize(broom::augment(lm(hwy ~ displ, data = data))) |>
slice_max(order_by = displ, n = 1)
}
) +
scale_x_continuous(expand = expansion(add = c(0, 1))) +
theme_minimal()
Or do it with a function
#' #examples
#' last_lm_points(df = mpg, formula = hwy~displ, group = class)
last_lm_points <- function(df, formula, group) {
# df <- mpg; formula <- as.formula(hwy~displ); group <- sym("class");
x_arg <- formula[[3]]
df |>
nest_by({{group}}) |>
summarize(broom::augment(lm(formula, data = data))) |>
slice_max(order_by = get(x_arg), n = 1)
}
mpg |>
ggplot(aes(displ, hwy, colour = class, label = class)) +
geom_count(alpha = 0.1) +
stat_smooth(alpha = 0.6, method = lm, geom = "line", se = FALSE) +
geom_text(
aes(y = .fitted), size = 3, hjust = 0, nudge_x = 0.1,
data = ~last_lm_points(.x, hwy~displ, class)
) +
scale_x_continuous(expand = expansion(add = c(0, 1))) +
theme_minimal()

Resources