Mutiple functions and ggplots using a for loop - r

I want to generate a number of plots of linear regressions (bacterial OTUs plotted against temperature) using ggplot. I want the titles of the plots to be the linear regression equation, which I am determining with a function. The code works when I make the plots individually but not when I use a for loop.
I keep getting the following error:
Error in model.frame.default(formula = taxa_list[i] ~ Temperature, data = dataframe, :
variable lengths differ (found for 'Temperature')
See below for my code. Do I need a nested for loop to make this work?
taxa_list <- c("Vibrio","Salmonella","Campylobacter","Listeria","Streptococcus","Legionella")
taxa_list <- sort(taxa_list)
for (i in seq_along(taxa_list)) {
lm_eqn <- function(dataframe) {
m <- lm(taxa_list[i] ~ Temperature, dataframe)
p <- summary(m)
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2 %.% italic(x)*","~~italic(p)~"="~p0,
list(a = format(unname(coef(m)[1]), digits = 2),
b = format(unname(coef(m)[2]), digits = 2),
r2 = format(summary(m)$r.squared, digits = 3),
p0 = format(p$coefficients[8], digits = 3)))
as.expression(eq);
}
plot <- ggplot(data = all_data, aes(x = Temperature, y = taxa_list[i], fill = taxa_list[i])) +
geom_point(data = all_data, aes(x = Temperature, y = taxa_list[i]), color = "black", size = 3) +
geom_smooth(method = "lm", size = 1, color = "black", fill = "gray") +
labs(title = lm_eqn(dataframe = all_data), subtitle = "") + xlab("Temperature") + ylab("Number of OTUs")
print(plot)
}

I tried to rewrite your code to make it more readable, efficient and maintainable. I used tidyverse choices. I believe there was an extra * x in your original eq function that I removed.
library(dplyr)
library(ggplot2)
library(purrr)
library(broom)
taxa_list <- c("Vibrio","Salmonella","Campylobacter","Listeria","Streptococcus","Legionella")
taxa_list <- sort(taxa_list)
MyFunctionNew <- function(data, bacteria, temperature)
{
my_lm <- lm(as.formula(paste(bacteria, "~", temperature)), data = data)
terms_info <- broom::tidy(my_lm)
model_info <- broom::glance(my_lm)
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2 *","~~italic(p)~"="~p0,
list(a = format(terms_info$estimate[1], digits = 2),
b = format(terms_info$estimate[2], digits = 2),
r2 = format(model_info$r.squared, digits = 3),
p0 = format(model_info$p.value, digits = 3)))
plot <- ggplot(data = data, aes_string(x = temperature, y = bacteria, fill = bacteria)) +
geom_point(size = 3, show.legend = TRUE) +
geom_smooth(method = "lm", size = 1, color = "black", fill = "gray") +
labs(title = eq, subtitle = "") + xlab("Temperature") + ylab("Number of OTUs")
return(plot)
}
MyFunctionNew(dat1, "Vibrio", "Temperature")
#> `geom_smooth()` using formula 'y ~ x'
purrr::map(taxa_list, ~ MyFunctionNew(dat1, .x, "Temperature"))
#> [[1]]
#> `geom_smooth()` using formula 'y ~ x'
Here's some made up data that should more or less be close enough
set.seed(1111)
dat1 <- data.frame(Temperature = runif(200, min = 32, max = 100),
Vibrio = rnorm(200),
Salmonella = rnorm(200),
Campylobacter = rnorm(200),
Listeria = rnorm(200),
Streptococcus = rnorm(200),
Legionella = rnorm(200)
)

Related

How to create unique legend/color label for multiple variables and plots in ggplot

I'm trying to create a multiple plot page in order to compare multiple variables within disease variables and patient status (i.e. deceased or recovered) over time.
Here's my code
p1 <- g + geom_smooth(data = sofa_vivo_vs_mortos, aes(x = days, y = sofa_score, color = outcome, group = outcome)) + scale_x_continuous(breaks = sofa_vivo_vs_mortos$days)
+ geom_smooth(data = sofa_vivo_vs_mortos, aes(x = days, y = resp_score, color = outcome, group = outcome)) + values = c("blue", "red")) + labs(x="Days after admission")
p2 <- g + geom_smooth(data = sofa_vivo_vs_mortos, aes(x = days, y = sofa_score, color = outcome, group = outcome)) + scale_x_continuous(breaks = sofa_vivo_vs_mortos$days)
+ geom_smooth(data = sofa_vivo_vs_mortos, aes(x = days, y = coag_score, color = outcome, group = outcome)) + labs(x="Days after admission")
ggarrange(p1, p2, labels = c("A", "B"), ncol = 2)
Which yields the following plot:
Since it's not distinguishable which variable is which on the plots, I would like my code to yield:
1-A unique legend location for the whole page
2-A color legend for each variable not based only on it's grouping variable (in my code, the outcome variable), but also on the name of the variable itself (i.e. one colour + legend for sofa_score variable in which outcome = deceased and another for sofa_score in which outcome = recovered, combined in the same plot as the second variable under analysis (i.e. variable resp_score with same outcome stratification)
Similar desired result:
Edit for sample data:
df2 <- data.frame(ID = seq(1,32, by=1), sofa_score = sample(1:8, 8, replace = TRUE), resp_score = sample(1:8, 8, replace = TRUE),
outcome = c('deceased', 'recovered'),
days = sample(1:20, 32, replace = TRUE), coag_score = sample(1:8, 8, replace = TRUE))
I generally find that the patchwork package is great for plot composition, including collecting the legends from multiple plots. An example:
library(ggplot2)
library(patchwork)
set.seed(42)
df2 <- data.frame(
ID = seq(1,32, by=1),
sofa_score = sample(1:8, 8, replace = TRUE),
resp_score = sample(1:8, 8, replace = TRUE),
outcome = c('deceased', 'recovered'),
days = sample(1:20, 32, replace = TRUE),
coag_score = sample(1:8, 8, replace = TRUE)
)
p1 <- ggplot(df2, aes(days)) +
geom_smooth(aes(y = coag_score, colour = outcome, group = outcome)) +
scale_colour_manual(
values = c("tomato", "dodgerblue"),
name = "Coag Score"
)
p2 <- ggplot(df2, aes(days)) +
geom_smooth(aes(y = resp_score, colour = outcome, group = outcome)) +
scale_colour_manual(
values = c("limegreen", "orchid"),
name = "Resp Score"
)
p1 + p2 + plot_layout(guides = "collect")
#> `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#> `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Created on 2021-03-17 by the reprex package (v0.3.0)

Is there a way to label each cluster generated by lda()

using lda() and ggplot2 I can make a canonical plot with confidence ellipses. Is there a way to add labels for each group on the plot (labeling each cluster with a group from figure legend)?
# for the universality lda(Species~., data=iris) would be analogous
m.lda <- lda(Diet ~ ., data = b)
m.sub <- b %>% dplyr::select(-Diet) %>% as.matrix
CVA.scores <- m.sub %*% m.lda$scaling
m.CV <- data.frame(CVA.scores)
m.CV$Diet <- b$Diet
m.cva.plot <-
ggplot(m.CV, aes(x = LD1, y = LD2)) +
geom_point(aes(color=Diet), alpha=0.5) +
labs(x = "CV1", y = "CV2") +
coord_fixed(ratio=1)
chi2 = qchisq(0.05,2, lower.tail=FALSE)
CIregions.mean.and.pop <-
m.CV %>%
group_by(Diet) %>%
summarize(CV1.mean = mean(LD1),
CV2.mean = mean(LD2),
mean.radii = sqrt(chi2/n()),
popn.radii = sqrt(chi2))
m.cva.plot2 <-
m.cva.plot +
geom_circle(data = CIregions.mean.and.pop,
mapping = aes(x0 = CV1.mean, y0 = CV2.mean, r = mean.radii),
inherit.aes = FALSE) +
geom_circle(data = CIregions.mean.and.pop,
mapping = aes(x0 = CV1.mean, y0 = CV2.mean, r = popn.radii),
linetype = "dashed",
inherit.aes = FALSE)
The labels can be placed with either geom_text or geom_label. In the case below I will use geom_label, with the y coordinate adjusted by adding popn.radii the radii of the outer circles.
The code in the question is adapted to use built-in data set iris, like the question itself says.
m.cva.plot2 +
geom_label(data = CIregions.mean.and.pop,
mapping = aes(x = CV1.mean,
y = CV2.mean + popn.radii,
label = Species),
label.padding = unit(0.20, "lines"),
label.size = 0)
Reproducible code
library(dplyr)
library(ggplot2)
library(ggforce)
library(MASS)
b <- iris
m.lda <- lda(Species~., data=iris) #would be analogous
#m.lda <- lda(Diet ~ ., data = b)
m.sub <- b %>% dplyr::select(-Species) %>% as.matrix
CVA.scores <- m.sub %*% m.lda$scaling
m.CV <- data.frame(CVA.scores)
m.CV$Species <- b$Species
m.cva.plot <-
ggplot(m.CV, aes(x = LD1, y = LD2)) +
geom_point(aes(color=Species), alpha=0.5) +
labs(x = "CV1", y = "CV2") +
coord_fixed(ratio=1)
chi2 = qchisq(0.05,2, lower.tail=FALSE)
CIregions.mean.and.pop <-
m.CV %>%
group_by(Species) %>%
summarize(CV1.mean = mean(LD1),
CV2.mean = mean(LD2),
mean.radii = sqrt(chi2/n()),
popn.radii = sqrt(chi2))
m.cva.plot2 <-
m.cva.plot +
geom_circle(data = CIregions.mean.and.pop,
mapping = aes(x0 = CV1.mean, y0 = CV2.mean, r = mean.radii),
inherit.aes = FALSE) +
geom_circle(data = CIregions.mean.and.pop,
mapping = aes(x0 = CV1.mean, y0 = CV2.mean, r = popn.radii),
linetype = "dashed",
inherit.aes = FALSE)

Continuous value supplied to discrete scale ggplot2

when I tried to plot a graph of decision boundary in R, I met some problem and it returned a error "Continuous value supplied to discrete scale". I think the problem happened in the scale_colur_manual but I don't know how to fix it. Below is the code attached.
library(caTools)
set.seed(123)
split = sample.split(df$Purchased,SplitRatio = 0.75)
training_set = subset(df,split==TRUE)
test_set = subset(df,split==FALSE)
# Feature Scaling
training_set[,1:2] = scale(training_set[,1:2])
test_set[,1:2] = scale(test_set[,1:2])
# Fitting logistic regression to the training set
lr = glm(formula = Purchased ~ .,
family = binomial,
data = training_set)
#Predicting the test set results
prob_pred = predict(lr,type = 'response',newdata = test_set[-3])
y_pred = ifelse(prob_pred > 0.5, 1, 0)
#Making the Confusion Matrix
cm = table(test_set[,3],y_pred)
cm
#Visualizing the training set results
library(ggplot2)
set = training_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('Age', 'EstimatedSalary')
prob_set = predict(lr, type = 'response', newdata = grid_set)
y_grid = ifelse(prob_set > 0.5, 1,0)
ggplot(grid_set) +
geom_tile(aes(x = Age, y = EstimatedSalary, fill = factor(y_grid)),
show.legend = F) +
geom_point(data = set, aes(x = Age, y = EstimatedSalary, color = Purchased),
show.legend = F) +
scale_fill_manual(values = c("orange", "springgreen3")) +
scale_colour_manual(values = c("red3", "green4")) +
scale_x_continuous(breaks = seq(floor(min(X1)), ceiling(max(X2)), by = 1)) +
labs(title = "Logistic Regression (Training set)",
ylab = "Estimated Salary", xlab = "Age")
Is your Purchased variable a factor? If not, it has to be. Try this:
grid_set %>%
mutate(Purchased=factor(Purchased)) %>%
ggplot() +
geom_tile(aes(x = Age, y = EstimatedSalary, fill = factor(y_grid)),
show.legend = F) + ... # add the rest of your commands.

How to visualize k-means cluster using R?

How can I make k-means clustering for my following log2 transformed data set, something like attached image.
My sample df is like :
set.seed(5)
cnt_log2 = data.frame(replicate(6, runif(1000,0,20)), 1:10)
names(cnt_log2) = c(paste0("Col",1:6),"geneID")
I have done it using:
res_km <- kmeans(df, 5, nstart = 10)
data_plot <- data.table(melt(data.table(class = as.factor(res_km$cluster), df)))
data_plot[, Time := rep(1:ncol(df), each = nrow(df))]
data_plot[, ID := rep(1:nrow(df), ncol(df))]
head(data_plot)
# prepare centroids
centers <- data.table(melt(res_km$centers))
setnames(centers, c("Var1", "Var2"), c("class", "Time"))
centers[, ID := class]
centers[, gr := as.numeric(as.factor(Time))]
head(centers)
head(data_plot)
# plot the results
ggplot(data_plot, aes(variable, value, group = ID)) +
facet_wrap(~class, ncol = 2, scales = "free_y") +
geom_line(color = "grey10", alpha = 0.65) +
geom_line(data = centers, aes(gr, value),
color = "firebrick1", alpha = 0.80, size = 1.2) +
labs(x = "Time", y = "Load (normalised)") +
theme_bw()

Create dynamic labels for geom_smooth lines

I have a changing df and I am grouping different values c.
With ggplot2 I plot them with the following code to get a scatterplott with multiple linear regression lines (geom_smooth)
ggplot(aes(x = a, y = b, group = c)) +
geom_point(shape = 1, aes(color = c), alpha = alpha) +
geom_smooth(method = "lm", aes(group = c, color = c), se = F)
Now I want to display on each geom_smooth line in the plot a label with the value of the group c.
This has to be dynamic, because I can not write new code when my df changes.
Example: my df looks like this
a b c
----------------
1.6 24 100
-1.4 43 50
1 28 100
4.3 11 50
-3.45 5.2 50
So in this case I would get 3 geom_smooth lines in the plot with different colors.
Now I simply want to add a text label to the plot with "100" next to the geom_smooth with the group c = 100 and a text label with "50"to the line for the group c = 50, and so on... as new groups get introduced in the df, new geom_smooth lines are plotted and need to be labeled.
the whole code for the plot:
ggplot(aes(x = a, y = b, group = c), data = df, na.rm = TRUE) +
geom_point(aes(color = GG, size = factor(c)), alpha=0.3) +
scale_x_continuous(limits = c(-200,2300))+
scale_y_continuous(limits = c(-1.8,1.5))+
geom_hline(yintercept=0, size=0.4, color="black") +
scale_color_distiller(palette="YlGnBu", na.value="white") +
geom_smooth(method = "lm", aes(group = factor(GG), color = GG), se = F) +
geom_label_repel(data = labelInfo, aes(x= max, y = predAtMax, label = label, color = label))
You can probably do it if you pick the location you want the lines labelled. Below, I set them to label at the far right end of each line, and used ggrepel to avoid overlapping labels:
library(ggplot2)
library(ggrepel)
library(dplyr)
set.seed(12345)
df <-
data.frame(
a = rnorm(100,2,0.5)
, b = rnorm(100, 20, 5)
, c = factor(sample(c(50,100,150), 100, TRUE))
)
labelInfo <-
split(df, df$c) %>%
lapply(function(x){
data.frame(
predAtMax = lm(b~a, data=x) %>%
predict(newdata = data.frame(a = max(x$a)))
, max = max(x$a)
)}) %>%
bind_rows
labelInfo$label = levels(df$c)
ggplot(
df
, aes(x = a, y = b, color = c)
) +
geom_point(shape = 1) +
geom_smooth(method = "lm", se = F) +
geom_label_repel(data = labelInfo
, aes(x= max
, y = predAtMax
, label = label
, color = label))
This method might work for you. It uses ggplot_build to access the rightmost point in the actual geom_smooth lines to add a label by it. Below is an adaptation that uses Mark Peterson's example.
library(ggplot2)
library(ggrepel)
library(dplyr)
set.seed(12345)
df <-
data.frame(
a = rnorm(100,2,0.5)
, b = rnorm(100, 20, 5)
, c = factor(sample(c(50,100,150), 100, TRUE))
)
p <-
ggplot(df, aes(x = a, y = b, color = c)) +
geom_point(shape = 1) +
geom_smooth(method = "lm", se = F)
p.smoothedmaxes <-
ggplot_build(p)$data[[2]] %>%
group_by( group) %>%
filter( x == max(x))
p +
geom_text_repel( data = p.smoothedmaxes,
mapping = aes(x = x, y = y, label = round(y,2)),
col = p.smoothedmaxes$colour,
inherit.aes = FALSE)
This came up for me today and I landed on this solution with data = ~fn()
library(tidyverse)
library(broom)
mpg |>
ggplot(aes(x = displ, y = hwy, colour = class, label = class)) +
geom_count(alpha = 0.1) +
stat_smooth(alpha = 0.6, method = lm, geom = "line", se = FALSE) +
geom_text(
aes(y = .fitted), size = 3, hjust = 0, nudge_x = 0.1,
data = ~{
nest_by(.x, class) |>
summarize(broom::augment(lm(hwy ~ displ, data = data))) |>
slice_max(order_by = displ, n = 1)
}
) +
scale_x_continuous(expand = expansion(add = c(0, 1))) +
theme_minimal()
Or do it with a function
#' #examples
#' last_lm_points(df = mpg, formula = hwy~displ, group = class)
last_lm_points <- function(df, formula, group) {
# df <- mpg; formula <- as.formula(hwy~displ); group <- sym("class");
x_arg <- formula[[3]]
df |>
nest_by({{group}}) |>
summarize(broom::augment(lm(formula, data = data))) |>
slice_max(order_by = get(x_arg), n = 1)
}
mpg |>
ggplot(aes(displ, hwy, colour = class, label = class)) +
geom_count(alpha = 0.1) +
stat_smooth(alpha = 0.6, method = lm, geom = "line", se = FALSE) +
geom_text(
aes(y = .fitted), size = 3, hjust = 0, nudge_x = 0.1,
data = ~last_lm_points(.x, hwy~displ, class)
) +
scale_x_continuous(expand = expansion(add = c(0, 1))) +
theme_minimal()

Resources