Significance annotation in facets - r

I am trying to annotate the plot below in a pairwise fashion - in each facet compare corresponding samples in the variable. Essentially comparing CTR from pos to CTR from neg and so on. I can't seem to get it to work.
Here is my data and plots:
library(ggpubr)
#data.frame
samples <- rep(c('LA', 'EA', 'CTR'), 300)
variable <- sample(c('pos', 'neg'), 900, replace = T)
stim <- rep(c('rp','il'), 450)
population <- sample(c('EM','CM','TEMRA'), 900, replace = T)
values <- runif(900, min = 0, max = 100)
df <- data.frame(samples, variable, stim, population, values)
#test and comparisons
test_comparisons <- list(c('neg', 'pos'))
test <- compare_means(values ~ variable, data = df, method = 'wilcox.test',
group.by = c('samples', 'stim', 'population'))
#plot
ggplot(aes(x= variable, y = values, fill = samples), data = df) +
geom_boxplot(position = position_dodge(0.85)) +
geom_dotplot(binaxis='y', stackdir='center', position =
position_dodge(0.85), dotsize = 1.5) +
facet_grid(population ~ stim, scales = 'free_x') +
stat_compare_means(comparisons = test_comparisons, label = 'p.signif') +
theme_bw()
This only produces 1 comparison per facet between pos and neg instead of 3...What am I doing wrong?

You can use the following code:
samples <- rep(c('LA', 'EA', 'CTR'), 300)
variable <- sample(c('pos', 'neg'), 900, replace = T)
stim <- rep(c('rp','il'), 450)
population <- sample(c('EM','CM','TEMRA'), 900, replace = T)
values <- runif(900, min = 0, max = 100)
df <- data.frame(samples, variable, stim, population, values)
#test and comparisons
test_comparisons <- list(c('neg', 'pos'))
test <- compare_means(values ~ variable, data = df, method = 'wilcox.test',
group.by = c('samples', 'stim', 'population'))
#plot
ggplot(aes(x= variable, y = values, fill = samples), data = df) +
geom_boxplot(position = position_dodge(0.85)) +
geom_dotplot(binaxis='y', stackdir='center', position =
position_dodge(0.85), dotsize = 1.5) +
facet_grid(population ~ stim+samples, scales = 'free_x') +
stat_compare_means(comparisons = test_comparisons, label = 'p.signif') +
theme_bw()
Hope this will rectify your problem

Related

Fit Surface Plasmon Resonance (SPR) data in ggplot

I am trying to fit association-dissociation SPR kinetics data for a protein and small molecule for two concentrations using ggplot2. The data is here.
The time variable indicates the time in seconds, the sample variable indicates the two concentrations (32nM and 8nM), and the values variable is the readout.
I have imported the data and running the following code to plot:
# LINE PLOT
ggplot(data) +
geom_point(aes(x = time, y = values), size = 1, color = "black") +
geom_smooth(aes(x = time, y = values, color = sample), method = "loess", se = F) +
scale_x_continuous(expand = c(0, 0), limits = c(0, NA)) +
#scale_y_continuous(expand = c(0, 0), limits = c(0, 60)) +
scale_color_npg(breaks = c("2nM", "4nM", "8nM", "16nM", "32nM")) +
theme_linedraw() +
labs(x = "Time (seconds)",
y = "Response Units") +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
Here is the plot:
As you can see that the fit did not work using method = "loess". I need something like this(there are 5 concentrations here):
The fitting requires 1:1 Langmuir model but I am not sure how I can do that in ggplot. Can someone please help me?
Here is the equation:
This is from the pbm package that fits this kind of plots.
Your data are smooth enough that you need only use geom_line, not geom_smooth:
df %>%
ggplot(aes(time, values, color = sample)) +
geom_line(size = 2, na.rm = TRUE) +
geom_point(color = 'black', size = 1) +
theme_linedraw(base_size = 16) +
xlim(c(0, 400))
Edit
It is possible to fit the results to the data using non-linear least squares, employing the binding1to1 function from pbm, but it requires a bit of method tweaking to get the model to fit. It would probably be better to create a model then plot the predictions rather than using geom_smooth. However, if you really wanted to, you could do:
df %>%
ggplot(aes(time, values, color = sample)) +
geom_smooth(method = nls, se = FALSE, n = 1000,
formula = y ~ binding1to1(x, 123, 32e-9, kon, koff, rmax),
method.args = list(
start = list(kon = 2000, koff = 0.02, rmax = 2e4),
control = nls.control(minFactor = 1e-6, maxiter = 1000)
),
data = df[df$time > 0 & df$sample == "32nM",]) +
geom_smooth(method = nls, se = FALSE, n = 1000,
formula = y ~ binding1to1(x, 123, 8e-9, kon, koff, rmax),
method.args = list(
start = list(kon = 3000, koff = 0.02, rmax = 2e4),
control = nls.control(minFactor = 1e-9, maxiter = 10000)
),
data = df[df$time > 0 & df$sample == "8nM",]) +
geom_point(color = 'black', size = 1) +
theme_linedraw(base_size = 16) +
xlim(c(0, 400))
If you want to actually fit a model from which to extract the parameters and plot, you can do:
library(tidyverse)
library(pbm)
df <- read.csv("SPR.csv") %>%
filter(time >= 0) %>%
mutate(sample = as.numeric(gsub("\\D+", "", sample)) * 1e-9,
values = values * 1e-3) %>%
group_by(sample) %>%
mutate(tmax = time[which.max(values)])
fit_fun <- function(time, tmax, sample, kon, koff, rmax) {
unlist(Map(function(time, tmax, sample) {
binding1to1(time, tmax, sample, kon, koff, rmax)
}, time, tmax, sample))
}
mod <- nls(values ~ fit_fun(time, tmax, sample, kon, koff, rmax),
data = df,
start = list(kon = 3000, koff = 0.02, rmax = 2),
control = nls.control(minFactor = 1e-9, maxiter = 10000))
This gives us a model with the best fitting values for the various parameters:
mod
#> Nonlinear regression model
#> model: values ~ fit_fun(time, tmax, sample, kon, koff, rmax)
#> data: df
#> kon koff rmax
#> 8.925e+05 2.521e-03 5.445e-02
#> residual sum-of-squares: 5.219e-05
#>
#> Number of iterations to convergence: 536
#> Achieved convergence tolerance: 5.155e-07
We can then predict the output of the model over the range of our input variables:
pred_df <- expand.grid(time = 0:400, sample = c(8, 32) * 1e-9,
tmax = df$tmax[1])
pred_df$values <- predict(mod, pred_df)
And we can plot it like this:
df %>%
ggplot(aes(time, values, color = factor(sample))) +
geom_line(data = pred_df, size = 1) +
geom_point(color = 'black', size = 1) +
theme_linedraw(base_size = 16) +
xlim(c(0, 400))

Individual levels of slopes per lmer and equations

I would like to ask for some help with depicting the slopes generated by a lmer() model.
The data that I have is the mass volume of different rats across different days. Each rat has different time points where they took the measurement of that volume.
For rat 1 I have volume c(78,304,352,690,952,1250) at days c(89,110,117,124,131,138) that belong to country Chile
For rat 2 I have volume c(202,440,520,870,1380) at days c(75,89,96,103,110) that belong to country Chile.
For rat 3 I have volume c(186,370,620,850,1150) at days c(75,89,96,103,110) that belong to country Chile.
For rat 4 I have volume c(92,250,430,450,510,850,1000,1200) at days c(47,61,75,82,89,97,103,110) that belong to country England.
For rat 5 I have volume c(110,510,710,1200) at days c(47,61,75,82) that belong to country England.
For rat 6 I have volume c(115,380,480,540,560,850,1150,1350) at days c(47,61,75,82,89,97,103,110) that belong to country England.
The lmer model is:
m1 <- lmer(lVolume ~ Country*Day + (1|Rat))
I managed to plot the curves of my model by using:
m1%>%
augment() %>%
clean_names() %>%
ggplot(data = .,
mapping = aes(x = day,
y = exp(l_volume),
group = rat)) +
geom_point(alpha = 0.5) +
geom_line(alpha = 0.5) +
geom_point(aes(y = exp(fitted)),
color = "red") +
geom_line(aes(y = exp(fitted)),
color = "red") +
expand_limits(x = 0 , y = 0)
This model gave me predictions for new data points based on the model m1 for each of the rats across country.
From this lmer() I have one slope across the whole measurements, this is:
And by exp(predicted):
However, I would like to plot this in a different way. I would like to plot the slope generated by each of the levels of country that I have.
The red lines would be the exp(slopes) generating by Chile, and England, but also depict the exp(slope) of the whole model containing both levels.
So, initially I thought that creating three lmer() models:
m1 <- lmer(lVolume ~ Country*Day + (1|Rat))
m2 <- lmer(lVolume ~ Day + (1|Rat)) (Rats in Chile)
m3 <- lmer(lVolume ~ Day + (1|Rat)) (Rats in England)
But I noticed that m2 and m3 are quite different models because they do not have the interaction from Country that is something that I would like to check. So, I don't know what to do here.
Update
I tried this and kind of worked:
Final.Fixed<-effect(c("Country*Day"), m1,
xlevels=list(Day=seq(0,168,14)))
Final.Fixed<-as.data.frame(Final.Fixed)
Final.Fixed.Plot <-ggplot(data = Final.Fixed, aes(x = Day, y =exp(fit), group=Country))+
coord_cartesian(xlim=c(0,170),ylim = c(0,8000))+
geom_line(aes(color=Country), size=2)+
geom_ribbon(aes(ymin=exp(fit-se), ymax=exp(fit+se),fill=Country),alpha=.2)+
xlab("Day")+
ylab("Volume")+
scale_color_manual(values=c("blue", "red"))+
scale_fill_manual(values=c("blue", "red"))+
theme_bw()+
theme(text=element_text(face="bold", size=12),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_rect(fill = NA, colour = "NA"),
axis.line = element_line(size = 1, colour = "grey80"),
legend.title=element_blank(),
legend.position = c(.2, .92))
Final.Fixed.Plot
Is this ok ? I think that I am still cosnidering the m1 with the country*Day interaction. Correct me if I am worng, please! Also, I don't know how I can add the exp(fit) curve for the whole model and the raw data points in this plot.
Could I get some hint/help, please ?
Clean summary on top
The first code chunk contains a cleaned up version that addresses all points of the question, using some input from the comments. I've left the original answer below which step by step builds to the final plot.
library(tidyverse)
library(lme4)
library(broom.mixed)
library(ggeffects)
m1 <- lme4::lmer(lVolume ~ Country*Day + (1|Rat), data = df_rats %>%
dplyr::mutate(lVolume = log(Volume)))
# predictions for each country
syn_df <- tidyr::expand_grid(
Day = 1:170,
Country = c("Chile", "England")
) %>%
dplyr::mutate(lVolume = predict(m1, ., re.form = ~0))
# marginal effects for variable "Day"
df_day_marginal <- ggeffect(model = m1, terms = "Day", type = "fe") %>%
as.data.frame() %>%
dplyr::rename(Day = x, lVolume = predicted) %>%
dplyr::mutate(Country = "overall")
#combine prediction curves
df_preds <- bind_rows(syn_df, df_day_marginal)
# manually assemble formulas [units missing]
y0 <- round(fixef(m1)[["(Intercept)"]], 2)
beta_day <- round(fixef(m1)[["Day"]], 3)
beta_englday <- round(fixef(m1)[["CountryEngland:Day"]], 3)
beta_engl <- round(fixef(m1)[["CountryEngland"]], 2)
f_chile <- paste0("volume = exp(", y0, " + ", beta_day, " * days)")
f_england <- paste0("volume = exp(", y0 + beta_engl , " + ", beta_day + beta_englday, " * days)")
df_labels <- data.frame(
x = c(50, 50),
y = c(1300, 1400),
form = c(f_chile, f_england),
country = c("Chile", "England")
)
m1 %>%
broom.mixed::augment()%>%
ggplot(aes(x = Day, y = exp(lVolume), color = Country)) +
geom_ribbon(data = df_preds, aes(ymin = exp(conf.low), ymax = exp(conf.high), color = NULL, fill = Country), alpha = 0.3) +
geom_line(data = df_preds, size = 1.5) +
geom_line(aes(group = Rat)) +
geom_point() +
coord_cartesian(ylim = c(0, 1500), xlim = c(0, 150)) +
geom_text(data = df_labels, aes(x = x, y = y, label = form, color = country)) +
labs(x = "days", y = "volume")
original answer
I've tried to stay as close as possible to your initial code for the first part of the question.
The first chunk trains the model and makes population-level predictions for Chile and England over the specified days. (using the re.form = ~0 argument as explained e.g. here)
library(tidyverse)
library(lme4)
library(broom.mixed)
#helpful to specify in that `lVolume` is the log of the data you provid in the question
m1 <- lme4::lmer(lVolume ~ Country*Day + (1|Rat), data = df_rats %>%
dplyr::mutate(lVolume = log(Volume)))
days <- seq(0,168,14)
syn_df <- tidyr::expand_grid(
Day = 1:170,
Country = c("Chile", "England")
)
syn_df <- syn_df %>%
dplyr::mutate(l_volume = predict(m1, syn_df, re.form = ~0)) %>%
janitor::clean_names()
This can then be added to your original plot with minor modifications:
m1 %>%
broom.mixed::augment() %>%
janitor::clean_names() %>%
ggplot(data = .,
mapping = aes(x = day,
y = exp(l_volume),
color = country)) +
geom_point(alpha = 0.7) +
geom_line(aes(group = rat), alpha = 0.7) +
expand_limits(x = 0 , y = 0) +
geom_line(data = syn_df, alpha = 1, size = 1.5) +
coord_cartesian(ylim = c(NA, 1500), xlim = c(NA, 150))
Added
In addition, we can add marginal effect for days to the plot.
df_day_marginal <- ggeffect(model = m1, terms = "Day", type = "fe")
m1 %>%
broom.mixed::augment() %>%
janitor::clean_names() %>%
ggplot() +
geom_ribbon(data = df_day_marginal, aes(x = x, ymin = exp(conf.low), ymax = exp(conf.high)), alpha = 0.3) +
geom_line(data = syn_df, aes(x = day, y = exp(l_volume), color = country), size = 1.5) +
geom_line(data = df_day_marginal, aes(x = x, y = exp(predicted)), size = 1.5) +
geom_point(aes(x = day, y = exp(l_volume), color = country), alpha = 0.7) +
geom_line(aes(x = day, y = exp(l_volume), color = country, group = rat), alpha = 0.7) +
expand_limits(x = 0 , y = 0) +
coord_cartesian(ylim = c(NA, 1500), xlim = c(NA, 150)) +
labs(x = "days", y = "volume")

How to visualize k-means cluster using R?

How can I make k-means clustering for my following log2 transformed data set, something like attached image.
My sample df is like :
set.seed(5)
cnt_log2 = data.frame(replicate(6, runif(1000,0,20)), 1:10)
names(cnt_log2) = c(paste0("Col",1:6),"geneID")
I have done it using:
res_km <- kmeans(df, 5, nstart = 10)
data_plot <- data.table(melt(data.table(class = as.factor(res_km$cluster), df)))
data_plot[, Time := rep(1:ncol(df), each = nrow(df))]
data_plot[, ID := rep(1:nrow(df), ncol(df))]
head(data_plot)
# prepare centroids
centers <- data.table(melt(res_km$centers))
setnames(centers, c("Var1", "Var2"), c("class", "Time"))
centers[, ID := class]
centers[, gr := as.numeric(as.factor(Time))]
head(centers)
head(data_plot)
# plot the results
ggplot(data_plot, aes(variable, value, group = ID)) +
facet_wrap(~class, ncol = 2, scales = "free_y") +
geom_line(color = "grey10", alpha = 0.65) +
geom_line(data = centers, aes(gr, value),
color = "firebrick1", alpha = 0.80, size = 1.2) +
labs(x = "Time", y = "Load (normalised)") +
theme_bw()

Labelling outliers with ggplot

I am trying to label outliers with ggplot. Regarding my code, I have two questions:
Why does it not label outliers below 1.5*IQR?
Why does it not label outliers based on the group they are in but instead apparently refers to the overall mean of the data? I would like to label outliers for each box plot individually. I.e. the outliers for Country A in Wave 1 (of a survey), etc.
A sample of my code:
PERCENT <- rnorm(50, sd = 3)
WAVE <- sample(6, 50, replace = TRUE)
AGE_GROUP <- rep(c("21-30", "31-40", "41-50", "51-60", "61-70"), 10)
COUNTRY <- rep(c("Country A", "Country B"), 25)
N <- rnorm(50, mean = 200, sd = 2)
df <- data.frame(PERCENT, WAVE, AGE_GROUP, COUNTRY, N)
ggplot(df, aes(x = factor(WAVE), y = PERCENT, fill = factor(COUNTRY))) +
geom_boxplot(alpha = 0.3) +
geom_point(aes(color = AGE_GROUP, group = factor(COUNTRY)), position = position_dodge(width=0.75)) +
geom_text(aes(label = ifelse(PERCENT > 1.5*IQR(PERCENT)|PERCENT < -1.5*IQR(PERCENT), paste(AGE_GROUP, ",", round(PERCENT, 1), "%, n =", round(N, 0)),'')), hjust = -.3, size = 3)
A picture of what I have so far:
I appreciate your help!
If you want IQR to be calculated by country, you need to group the data. You could probably do it globally(i.e. before you send the data to ggplot) or locally in the layer.
library(dplyr)
library(ggplot2)
ggplot(df, aes(x = as.factor(WAVE), y = PERCENT, fill = COUNTRY)) +
geom_boxplot(alpha = 0.3) +
geom_point(aes(color = AGE_GROUP, group = COUNTRY), position = position_dodge(width=0.75)) +
geom_text(aes(group = COUNTRY, label = ifelse(!between(PERCENT,-1.3*IQR(PERCENT), 1.3*IQR(PERCENT)),
paste(" ",COUNTRY, ",", AGE_GROUP, ",", round(PERCENT, 1), "%, n =", round(N, 0)),'')),
position = position_dodge(width=0.75),
hjust = "left", size = 3)
Adding the group aesthetic to geom_text and modifying the ifelse test should do what you want.
Setting group = interaction(WAVE, COUNTRY) will restrict the calculations to within each boxplot, and the outliner test needs to include a call to median(PERCENT).
library(ggplot2)
set.seed(42)
PERCENT <- rnorm(50, sd = 3)
WAVE <- sample(6, 50, replace = TRUE)
AGE_GROUP <- rep(c("21-30", "31-40", "41-50", "51-60", "61-70"), 10)
COUNTRY <- rep(c("Country A", "Country B"), 25)
N <- rnorm(50, mean = 200, sd = 2)
df <- data.frame(PERCENT, WAVE, AGE_GROUP, COUNTRY, N)
ggplot(df) +
aes(x = factor(WAVE),
y = PERCENT,
fill = factor(COUNTRY)) +
geom_boxplot(alpha = 0.3) +
geom_point(aes(color = AGE_GROUP, group = factor(COUNTRY)), position = position_dodge(width=0.75)) +
geom_text(aes(group = interaction(WAVE, COUNTRY),
label = ifelse(test = PERCENT > median(PERCENT) + 1.5*IQR(PERCENT)|PERCENT < median(PERCENT) -1.5*IQR(PERCENT),
yes = paste(AGE_GROUP, ",", round(PERCENT, 1), "%, n =", round(N, 0)),
no = '')),
position = position_dodge(width = 0.75),
hjust = -.2,
size = 3)

Create dynamic labels for geom_smooth lines

I have a changing df and I am grouping different values c.
With ggplot2 I plot them with the following code to get a scatterplott with multiple linear regression lines (geom_smooth)
ggplot(aes(x = a, y = b, group = c)) +
geom_point(shape = 1, aes(color = c), alpha = alpha) +
geom_smooth(method = "lm", aes(group = c, color = c), se = F)
Now I want to display on each geom_smooth line in the plot a label with the value of the group c.
This has to be dynamic, because I can not write new code when my df changes.
Example: my df looks like this
a b c
----------------
1.6 24 100
-1.4 43 50
1 28 100
4.3 11 50
-3.45 5.2 50
So in this case I would get 3 geom_smooth lines in the plot with different colors.
Now I simply want to add a text label to the plot with "100" next to the geom_smooth with the group c = 100 and a text label with "50"to the line for the group c = 50, and so on... as new groups get introduced in the df, new geom_smooth lines are plotted and need to be labeled.
the whole code for the plot:
ggplot(aes(x = a, y = b, group = c), data = df, na.rm = TRUE) +
geom_point(aes(color = GG, size = factor(c)), alpha=0.3) +
scale_x_continuous(limits = c(-200,2300))+
scale_y_continuous(limits = c(-1.8,1.5))+
geom_hline(yintercept=0, size=0.4, color="black") +
scale_color_distiller(palette="YlGnBu", na.value="white") +
geom_smooth(method = "lm", aes(group = factor(GG), color = GG), se = F) +
geom_label_repel(data = labelInfo, aes(x= max, y = predAtMax, label = label, color = label))
You can probably do it if you pick the location you want the lines labelled. Below, I set them to label at the far right end of each line, and used ggrepel to avoid overlapping labels:
library(ggplot2)
library(ggrepel)
library(dplyr)
set.seed(12345)
df <-
data.frame(
a = rnorm(100,2,0.5)
, b = rnorm(100, 20, 5)
, c = factor(sample(c(50,100,150), 100, TRUE))
)
labelInfo <-
split(df, df$c) %>%
lapply(function(x){
data.frame(
predAtMax = lm(b~a, data=x) %>%
predict(newdata = data.frame(a = max(x$a)))
, max = max(x$a)
)}) %>%
bind_rows
labelInfo$label = levels(df$c)
ggplot(
df
, aes(x = a, y = b, color = c)
) +
geom_point(shape = 1) +
geom_smooth(method = "lm", se = F) +
geom_label_repel(data = labelInfo
, aes(x= max
, y = predAtMax
, label = label
, color = label))
This method might work for you. It uses ggplot_build to access the rightmost point in the actual geom_smooth lines to add a label by it. Below is an adaptation that uses Mark Peterson's example.
library(ggplot2)
library(ggrepel)
library(dplyr)
set.seed(12345)
df <-
data.frame(
a = rnorm(100,2,0.5)
, b = rnorm(100, 20, 5)
, c = factor(sample(c(50,100,150), 100, TRUE))
)
p <-
ggplot(df, aes(x = a, y = b, color = c)) +
geom_point(shape = 1) +
geom_smooth(method = "lm", se = F)
p.smoothedmaxes <-
ggplot_build(p)$data[[2]] %>%
group_by( group) %>%
filter( x == max(x))
p +
geom_text_repel( data = p.smoothedmaxes,
mapping = aes(x = x, y = y, label = round(y,2)),
col = p.smoothedmaxes$colour,
inherit.aes = FALSE)
This came up for me today and I landed on this solution with data = ~fn()
library(tidyverse)
library(broom)
mpg |>
ggplot(aes(x = displ, y = hwy, colour = class, label = class)) +
geom_count(alpha = 0.1) +
stat_smooth(alpha = 0.6, method = lm, geom = "line", se = FALSE) +
geom_text(
aes(y = .fitted), size = 3, hjust = 0, nudge_x = 0.1,
data = ~{
nest_by(.x, class) |>
summarize(broom::augment(lm(hwy ~ displ, data = data))) |>
slice_max(order_by = displ, n = 1)
}
) +
scale_x_continuous(expand = expansion(add = c(0, 1))) +
theme_minimal()
Or do it with a function
#' #examples
#' last_lm_points(df = mpg, formula = hwy~displ, group = class)
last_lm_points <- function(df, formula, group) {
# df <- mpg; formula <- as.formula(hwy~displ); group <- sym("class");
x_arg <- formula[[3]]
df |>
nest_by({{group}}) |>
summarize(broom::augment(lm(formula, data = data))) |>
slice_max(order_by = get(x_arg), n = 1)
}
mpg |>
ggplot(aes(displ, hwy, colour = class, label = class)) +
geom_count(alpha = 0.1) +
stat_smooth(alpha = 0.6, method = lm, geom = "line", se = FALSE) +
geom_text(
aes(y = .fitted), size = 3, hjust = 0, nudge_x = 0.1,
data = ~last_lm_points(.x, hwy~displ, class)
) +
scale_x_continuous(expand = expansion(add = c(0, 1))) +
theme_minimal()

Resources