How to loop through third variable? - r

I have adapted the codes below which I referred from https://statsandr.com/blog/how-to-do-a-t-test-or-anova-for-many-variables-at-once-in-r-and-communicate-the-results-in-a-better-way/#to-go-even-further into my dataset:
Day<-c(1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2)
Group<-c("A","A","A","B","B","B","C","C","C","A","A","A","A","B","B","B","C","C","C")
Rain<-c(4,4,6,5,3,4,5,5,3,6,6,6,5,3,3,3,2,5,2)
UV<-c(6,6,7,8,5,6,5,6,6,6,7,7,8,8,5,6,8,5,7)
dat<-data.frame(Day,Group,Rain,UV)
x <- which(names(dat) == "Group")
y <- which(names(dat) == "Rain"
| names(dat) == "UV")
method1 <- "kruskal.test"
method2 <- "wilcox.test"
my_comparisons <- list(c("A", "B"), c("A", "C"), c("B", "C")) # comparisons for post-hoc test
library(ggpubr)
for (i in y) {
for (j in x) {
p <- ggboxplot(dat,
x = colnames(dat[j]), y = colnames(dat[i]),
color = colnames(dat[j]),
legend = "none",
palette = "npg",
add = "jitter"
)
print(
p + stat_compare_means(aes(label = paste0(..method.., ", p-value = ", ..p.format..)),
method = method1, label.y = max(dat[, i], na.rm = TRUE)
)
+ stat_compare_means(comparisons = my_comparisons, method = method2, label = "p.format")
)
}
}
How do I further repeat this function through different "Day"? Thanks.

I think you want to see the results for each day, right? You can add a third loop like this:
for (h in unique(dat$Day)) {
for (i in y) {
for (j in x) {
dat_tmp <- dat[dat$Day == h,] # create a subset of the data for each day
p <- ggboxplot(dat_tmp,
x = colnames(dat_tmp[j]), y = colnames(dat_tmp[i]),
color = colnames(dat_tmp[j]),
legend = "none",
palette = "npg",
add = "jitter"
)
print(
p + stat_compare_means(aes(label = paste0(..method.., ", p-value = ", ..p.format..)),
method = method1, label.y = max(dat_tmp[, i], na.rm = TRUE)
)
+ stat_compare_means(comparisons = my_comparisons, method = method2, label = "p.format")
)
}
}
}
I added a third loop to your code and created dat_tmp inside the loop, which becomes the dataset that you use for the analyses of each day.

Related

Issues with {ggpval} package in R

I am currently working in R to create bar charts. I was asked to add p-values on each bar charts. I found how to do it with the package {ggpval}. My issue now is that I cannot change the font size of it. The function used is add_pval(), which has an option for adjusting the font size, called textsize. But, it does not work. I can change the value of textsize, but nothing happens. Any idea? Please find below a reproducible example.
# Create a dataframe
df <- data.frame(A = runif(5),
B = runif(5),
G = c("Group1", "Group2", "Group3", "Group4", "Group5"))
# Melt the dataframe to be used for ggplot2
df_melt <- reshape2::melt(df, id.vars = "G")
# Create a list of p-values
pvalues <- list("p < 0.001", "p < 0.001", "'p = 0.123'", "'p = 0.813'", "'p = 0.043'")
# Create the plot
library(ggplot2)
library(ggpval)
bar_plot <- ggplot(data = df_melt, aes(x = variable, y = value, fill = variable)) + geom_bar(stat = "identity", position = "dodge") +
facet_grid(.~G) +
theme_bw() +
scale_y_continuous(labels = scales::percent_format(), limits = c(0, 1.05))
# Add p-values
add_pval(bar_plot, pairs = list(c(1, 2)), annotation = pvalues, textsize = 5)
System information
R version 4.1.1 (2021-08-10)
R Studio Version: 1.4.1717
OS: Ubuntu 20.04.3 LTS
Platform: x86_64-pc-linux-gnu (64-bit)
Package ggplot2: version 3.3.5
Package ggpval: version 0.2.4
The add_pvalue function has a bug; textsize is not used in the code. Below you can find a modified version, called my_add_pvalue (see the last rows of the code where I added size=textsize).
my_add_pval <- function (ggplot_obj, pairs = NULL, test = "wilcox.test", heights = NULL,
barheight = NULL, textsize = 5, pval_text_adj = NULL, annotation = NULL,
log = FALSE, pval_star = FALSE, plotly = FALSE, fold_change = FALSE,
parse_text = NULL, response = "infer", ...)
{
if (is.null(pairs)) {
total_groups <- length(unique(ggplot_obj$data[[ggpval:::get_in_parenthesis(as.character(ggplot_obj$mapping[1]))]]))
if (total_groups == 2) {
pairs <- list(c(1, 2))
}
else {
pairs <- lapply(2:total_groups, function(x) c(1,
x))
}
}
if (is.null(parse_text)) {
if (is.null(annotation)) {
parse_text <- TRUE
}
else {
parse_text <- FALSE
}
}
facet <- NULL
n_facet <- 1
ggplot_obj$data <- data.table(ggplot_obj$data)
if (class(ggplot_obj$facet)[1] != "FacetNull") {
if (class(ggplot_obj$facet)[1] == "FacetGrid") {
facet <- c(names(ggplot_obj$facet$params$cols), names(ggplot_obj$facet$params$rows))
}
else {
facet <- names(ggplot_obj$facet$params$facets)
}
if (length(facet) > 1) {
facet_ <- NULL
ggplot_obj$data[, `:=`(facet_, paste0(get(facet[1]),
get(facet[2])))]
comb <- expand.grid(levels(as.factor(ggplot_obj$data[,
get(facet[1])])), levels(as.factor(ggplot_obj$data[,
get(facet[2])])))
facet_level <- paste0(comb[, 1], comb[, 2])
facet <- "facet_"
}
else {
facet_level <- levels(as.factor(ggplot_obj$data[,
get(facet)]))
}
n_facet <- length(unique(ggplot_obj$data[, get(facet)]))
}
if (!is.null(heights)) {
if (length(pairs) != length(heights)) {
pairs <- rep_len(heights, length(pairs))
}
}
ggplot_obj$data$group__ <- ggplot_obj$data[, get(ggpval:::get_in_parenthesis(as.character(ggplot_obj$mapping[1])))]
ggplot_obj$data$group__ <- factor(ggplot_obj$data$group__)
if (response == "infer") {
response_ <- ggpval:::infer_response(ggplot_obj)
}
else {
response_ <- response
}
ggplot_obj$data$response <- ggplot_obj$data[, get(response_)]
y_range <- layer_scales(ggplot_obj)$y$range$range
if (is.null(barheight)) {
barheight <- (y_range[2] - y_range[1])/20
}
if (is.null(heights)) {
heights <- y_range[2] + barheight
heights <- rep(heights, length = length(pairs))
}
if (length(barheight) != length(pairs)) {
barheight <- rep(barheight, length = length(pairs))
}
if (is.null(pval_text_adj)) {
pval_text_adj <- barheight * 0.5
}
if (length(pval_text_adj) != length(pairs)) {
pval_text_adj <- rep(pval_text_adj, length = length(pairs))
}
if (!is.null(annotation)) {
if ((length(annotation) != length(pairs)) && length(annotation) !=
n_facet) {
annotation <- rep(annotation, length = length(pairs))
}
if (is.list(annotation)) {
if (length(annotation[[1]]) != length(pairs)) {
annotation <- lapply(annotation, function(a) rep(a,
length = length(pairs)))
}
}
annotation <- data.frame(annotation)
}
if (log) {
barheight <- exp(log(heights) + barheight) - heights
pval_text_adj <- exp(log(heights) + pval_text_adj) -
heights
}
V1 <- aes <- annotate <- geom_line <- group__ <- response <- labs <- NULL
for (i in seq(length(pairs))) {
if (length(unique(pairs[[1]])) != 2) {
stop("Each vector in pairs must have two different groups to compare, e.g. c(1,2) to compare first and second box.")
}
test_groups <- levels(ggplot_obj$data$group__)[pairs[[i]]]
data_2_test <- ggplot_obj$data[ggplot_obj$data$group__ %in%
test_groups, ]
if (!is.null(facet)) {
pval <- data_2_test[, lapply(.SD, function(i) get(test)(response ~
as.character(group__), ...)$p.value), by = facet,
.SDcols = c("response", "group__")]
pval <- pval[, `:=`(facet, factor(get(facet), levels = facet_level))][order(facet),
group__]
}
else {
pval <- get(test)(data = data_2_test, response ~
group__, ...)$p.value
if (fold_change) {
fc <- data_2_test[, median(response), by = group__][order(group__)][,
.SD[1]/.SD[2], .SDcols = "V1"][, V1]
fc <- paste0("FC=", round(fc, digits = 2))
pval <- paste(pval, fc)
}
}
if (pval_star & is.null(annotation)) {
pval <- pvars2star(pval)
annotation <- t(t(pval))
}
height <- heights[i]
df_path <- data.frame(group__ = rep(pairs[[i]], each = 2),
response = c(height, height + barheight[i], height +
barheight[i], height))
ggplot_obj <- ggplot_obj + geom_line(data = df_path,
aes(x = group__, y = response), inherit.aes = F)
if (is.null(annotation)) {
labels <- sapply(pval, function(i) format_pval(i,
plotly))
}
else {
labels <- unlist(annotation[i, ])
}
if (is.null(facet)) {
anno <- data.table(x = (pairs[[i]][1] + pairs[[i]][2])/2,
y = height + barheight[i] + pval_text_adj[i],
labs = labels)
}
else {
anno <- data.table(x = rep((pairs[[i]][1] + pairs[[i]][2])/2,
n_facet), y = rep(height + barheight[i] + pval_text_adj[i],
n_facet), labs = labels, facet = facet_level)
setnames(anno, "facet", eval(facet))
}
labs <- geom_text <- x <- y <- NULL
# Added here: size=textsize
ggplot_obj <- ggplot_obj + geom_text(data = anno, aes(x = x,
y = y, label = labs), size=textsize, parse = !pval_star & !plotly,
inherit.aes = FALSE)
}
ggplot_obj
}
Try it using:
my_add_pval(bar_plot, pairs = list(c(1, 2)), annotation = pvalues, textsize = 10)

Subsetting within R function

I have a function that generates a figure of a table:
plot_covariate_means_by_ntile <- function(.df, .ntile = "ntile", n_top = 10, directory) {
.df <- as.data.frame(.df)
covariate_names <- covariate_names
#.df[, .ntile] <- as.factor(.df[, .ntile])
.df[, .ntile] <- as_factor(.df[, .ntile], levels = "both")
# Regress each covariate on ntile/subgroup assignment to means p
cov_means <- lapply(covariate_names, function(covariate) {
lm_robust(as.formula(paste0(covariate, " ~ 0 + ", .ntile)), data = .df, se_type = "stata")
})
# Extract the mean and standard deviation of each covariate per ntile/subgroup
cov_table <- lapply(cov_means, function(cov_mean) {
means <- as.data.frame(t(coef(summary(cov_mean))[,c("Estimate", "Std. Error")]))
means
})
# Preparation to color the chart
temp_standardized <- sapply(seq_along(covariate_names), function(j) {
covariate_name <- covariate_names[j]
.mean <- mean(.df[, covariate_name], na.rm = TRUE)
.sd <- sd(.df[, covariate_name], na.rm = TRUE)
m <- as.matrix(round(signif(cov_table[[j]], digits=4), 3))
.standardized <- (m["Estimate",] - .mean) / .sd
.standardized
})
colnames(temp_standardized) <- covariate_names
ordering <- order(apply(temp_standardized, MARGIN = 2, function(x) {.range <- range(x); abs(.range[2] - .range[1])}), decreasing = TRUE)
# fwrite(tibble::rownames_to_column(as.data.frame(t(temp_standardized)[ordering,])),
# paste0(directory$data, "/covariate_standardized_means_by_", .ntile, ".csv"))
color_scale <- max(abs(c(max(temp_standardized, na.rm = TRUE), min(temp_standardized, na.rm = TRUE))))
color_scale <- color_scale * c(-1,1)
max_std_dev <- floor(max(color_scale))
breaks <- -max_std_dev:max_std_dev
labels <- c(" ", breaks, " ")
breaks <- c(min(color_scale), breaks, max(color_scale))
# Little trick to display the standard errors
table <- lapply(seq_along(covariate_names), function(j) {
covariate_name <- covariate_names[j]
.mean <- mean(.df[, covariate_name], na.rm = TRUE)
.sd <- sd(.df[, covariate_name], na.rm = TRUE)
m <- as.matrix(round(signif(cov_table[[j]], digits=4), 3))
.standardized <- (m["Estimate",] - .mean) / .sd
return(data.frame(covariate = covariate_name,
group = c(1,2,5) ,
estimate = m["Estimate",], std.error = m["Std. Error",],
standardized = .standardized))
})
# table <- do.call(rbind, table)
table <- rbindlist(table)
setnames(table, "group", .ntile)
table[, covariate := factor(covariate, levels = rev(covariate_names[ordering]), ordered = TRUE)]
table[covariate %in% head(covariate_names[ordering], n_top)] %>%
mutate(info = paste0(estimate, "\n(", std.error, ")")) %>%
ggplot(aes_string(x = .ntile, y = "covariate")) +
# Add coloring
geom_raster(aes(fill = standardized)
, alpha = 0.9
) +
scale_fill_distiller(palette = "RdBu",
direction = 1,
breaks = breaks,
labels = labels,
limits = color_scale,
name = "Standard\nDeviation on\nNormalized\nDistribution"
) +
# add numerics
geom_text(aes(label = info), size=2.1) +
# reformat
labs(title = paste0("Covariate averages within ", ifelse(tolower(.ntile) == "leaf", .ntile, "Assigned Group")),
y = "within covariate") +
scale_x_continuous(position = "top") #+
#cowplot::theme_minimal_hgrid(16)
}
But the output shows all 5 columns, I want it to show only 1 , 2 and 5.
I can adjust the line
groups = 1:ncol(m)
But then that incorrectly labels the groups, the third column is actually group 5:
Is there any way to adjust the function to present the correct columns and the correct labels for them?
Maybe you could use facet_wrap as a workaround?
library(tidyverse)
data.frame(X = rep(1:5, each = 25),
Y = rep(factor(rev(LETTERS[-26]),
levels = rev(LETTERS[-26])), 5),
Z = rnorm(125, 5, 1)) %>%
mutate(X = ifelse(X %in% c(1,2,5), X, NA)) %>%
na.omit() %>%
ggplot(aes(x = X, y = Y, fill = Z)) +
geom_raster() +
facet_wrap(~X, ncol=3, scales="free_x") +
theme_minimal() +
theme(axis.text.x = element_blank())
I tried to figure out a solution using scale_x_discrete (e.g. something like scale_x_discrete(limits = c("1", "2", "5"), breaks = c("1", "2", "5"))) and it 'feels' like it could work, but I gave up - maybe something worth pursuing.

How to plot two different user defined functions in the same plot in R

I need to plot 2 different user defined function in the same R plot.
I vectorize each of them:
Vectorize creates a function wrapper that vectorizes the action of its argument FUN. Vectorize(FUN, vectorize.args = arg.names, SIMPLIFY = TRUE,USE.NAMES = TRUE)
If I plot them separately I get the correct plot, however if I try to plot both functions in the same graph, it does not work.
Here is what I did:
1) first function:
payoff_call <- function(S, K){
if(S < 0 | K < 0){
return(print("The input S and K must be > 0"))
}else{
return(max(S-K,0))
}
}
2) second function:
myBlackScholes <- function(S, K, tau, r, sigma, type = c("call", "put")) {
if(S < 0 | K < 0 | tau < 0 | sigma < 0) {
return(print("The input S , K , tau and sigma must be > 0"))
} else
{
d1 <- (log(S/K) + (r + 0.5*sigma^2)*tau)/(sigma*sqrt(tau))
d2 <- d1 - sigma*sqrt(tau)
if(type == "call"){
output <- cbind(
V_BS_Call = S*pnorm(d1) - K*exp(-r*(tau))*pnorm(d2), #fair value call
delta_call = pnorm(d1), #delta call
vega_call = S*sqrt(tau)*dnorm(d1), #vega call
theta_call = -S*dnorm(d1)*sigma/(2*sqrt(tau)) - r*K*exp(-r*tau)*pnorm(d2), #theta call
rho_call = K*tau*exp(-r*tau)*pnorm(d2), #rho call
kappa_call = -exp(-r*tau)*(pnorm(-d2)-1), #kappa call
gamma_call = dnorm(d1)/(S*sigma*sqrt(tau)))#gamma call
return(output)
} else if(type == "put"){
output <- cbind(
V_BS_Put = K*exp(-r*(tau))*pnorm(-d2) - S*pnorm(-d1), #fair value put
delta_put = pnorm(d1)-1, #delta put
vega_put = S*sqrt(tau)*dnorm(d1), #vega put same as vega call
theta_put = -S*dnorm(d1)*sigma/(2*sqrt(tau)) + r*K*exp(-r*tau)*pnorm(-d2), #theta put
rho_put = -K*tau*exp(-r*tau)*pnorm(-d2), #rho put
kappa_put = exp(-r*tau)*pnorm(-d2), #kappa put
gamma_put = dnorm(d1)/(S*sigma*sqrt(tau))) #gamma put
return(output)
} else{
return(print("Wrong type in input"))
}
}
}
3) I vectorize each function:
vect_payoff_call <- Vectorize(payoff_call)
vect_myBlackScholes <- Vectorize(myBlackScholes)
4) I plot the 2 functions, for S starting at 0 to 100:
plot(x = 0:100, y = vect_payoff_call(0:100, 50),
type="l", col="blue", lty = 1, lwd = 1,
main = "Long Call Option Payoff function", xlab = "S", ylab = expression(f(S)))
plot(x = 0:100, y = vect_myBlackScholes(0:100,50, 1, 0.12, 0.3, "call")[1,], type="l", col="green", lty = 1, lwd = 1, add=TRUE)
The first plot is correct, but the second is not.
Any suggestion?
Here is how. Note that I use ggplot2 in my example:
library(ggplot2)
x <- seq(0,2, by=0.1)
my_square <- function(x) x^2
my_cube <- function(x) x^3
my_data <- data.frame(argx = x, my_square = my_square(x),
my_cube = my_cube(x))
ggplot(my_data) +
geom_point(aes(argx, my_square, color = 'x^2')) +
geom_line(aes(argx, my_square, color = 'x^2')) +
geom_point(aes(argx, my_cube, color = 'x^3')) +
geom_line(aes(argx, my_cube, color = 'x^3')) +
theme_bw() +
labs(x = 'x', y = 'y') +
scale_color_manual(values = c('x^2' = 'red', 'x^3' = 'green'), name = 'function')
Output

Cannot update/edit ggplot2 object exported from a package (`gratia`) in R

I hope I am missing something painfully obvious here.
I wish to update (e.g., fix title, labs, etc.) on a ggplot object produced from gratia::draw(). Not really sure why I am unable to update the object.
Is there a simple solution?
# devtools::install_github('gavinsimpson/gratia')
library('mgcv')
library('gratia')
dat <- gamSim(1, n = 400, dist = "normal", scale = 2, verbose = FALSE)
mod <- gam(y ~ s(x0), data = dat, method = "REML")
draw(mod)
p = draw(mod)
# P is a ggobject.
class(p)
#> [1] "gg" "ggplot"
So, why can't I update p?
p + ggtitle("My title")
Created on 2019-02-26 by the reprex package (v0.2.1)
The object returned by draw.gam is the output of cowplot::plot_grid (not a pure ggplot2 graphical object).
I made a small change into draw.gam function of gratia. .
Now the p object can be manipulated:
# The modified draw.gam function
mydraw.gam <- function (object, parametric = TRUE, select = NULL, scales = c("free",
"fixed"), align = "hv", axis = "lrtb", n = 100, unconditional = FALSE,
overall_uncertainty = TRUE, dist = 0.1, ...)
{
scales <- match.arg(scales)
S <- smooths(object)
select <- gratia:::check_user_select_smooths(smooths = S, select = select)
d <- gratia:::smooth_dim(object)
take <- d <= 2L
select <- select[take]
S <- S[take]
d <- d[take]
is_re <- vapply(object[["smooth"]], gratia:::is_re_smooth, logical(1L))
is_by <- vapply(object[["smooth"]], gratia:::is_by_smooth, logical(1L))
if (any(is_by)) {
S <- vapply(strsplit(S, ":"), `[[`, character(1L), 1L)
}
npara <- 0
nsmooth <- length(S)
if (isTRUE(parametric)) {
terms <- parametric_terms(object)
npara <- length(terms)
p <- vector("list", length = npara)
}
g <- l <- vector("list", length = nsmooth)
for (i in unique(S)) {
eS <- evaluate_smooth(object, smooth = i, n = n, unconditional = unconditional,
overall_uncertainty = overall_uncertainty, dist = dist)
l[S == i] <- split(eS, eS[["smooth"]])
}
l <- l[select]
d <- d[select]
g <- g[select]
if (length(g) == 0L) {
message("Unable to draw any of the model terms.")
return(invisible(g))
}
for (i in seq_along(l)) {
g[[i]] <- draw(l[[i]])
}
if (isTRUE(parametric)) {
for (i in seq_along(terms)) {
p[[i]] <- evaluate_parametric_term(object, term = terms[i])
g[[i + length(g)]] <- draw(p[[i]])
}
}
if (isTRUE(identical(scales, "fixed"))) {
wrapper <- function(x) {
range(x[["est"]] + (2 * x[["se"]]), x[["est"]] -
(2 * x[["se"]]))
}
ylims <- range(unlist(lapply(l, wrapper)))
if (isTRUE(parametric)) {
ylims <- range(ylims, unlist(lapply(p, function(x) range(x[["upper"]],
x[["lower"]]))))
}
gg <- seq_along(g)[c(d == 1L, rep(TRUE, npara))]
for (i in gg) {
g[[i]] <- g[[i]] + lims(y = ylims)
}
}
g
}
# Example no. 1
dat <- gamSim(1, n = 400, dist = "normal", scale = 2, verbose = FALSE)
mod <- gam(y ~ s(x0), data = dat, method = "REML")
p <- mydraw.gam(mod)
p[[1]] + ggtitle("My title")
# Example no. 2
mod <- gam(y ~ s(x0) + x1, data = dat, method = "REML")
p <- mydraw.gam(mod)
# Plot graphs separately
p[[1]] + ggtitle("My title")
p[[2]] + ggtitle("My title")
# Arrange the two plots on the same figure
cowplot::plot_grid(plotlist = p)
Reposts from Gavin Simpson and Hao Ye, respectively:
I think the only way to change the title(s) on the individual plots of smooths would be to use draw(evaluate_smooth(model, "smooth"), title = "My title") individually at the moment.
You might be able to hack a title in a different way:
draw(mod) +
ggplot2::coord_cartesian(clip = "off") +
ggplot2::theme(plot.margin = ggplot2::unit(c(0.05, 0, 0, 0), "npc")) +
ggplot2::annotate("text", x = 0.5, y = 1, vjust = 0, label = "TITLE", size = 6)

factoextra::fviz_gap_stat() versus factoextra::fviz_nbclust(df, method = "gap_stat")

I'm trying to figure out why these two functions from the factoextra package with seemingly similar parameters (e.g. kmeans, gap_stat,k.maxandB`) are yielding different results.
library(cluster)
library(cluster.datasets)
library(tidyverse)
library(factoextra)
# load data and scale it
data("all.mammals.milk.1956")
mammals <- all.mammals.milk.1956 %>% select(-name)
mammals_scaled <- scale(mammals)
The first method uses factoextra::clusGap() and factoextra::fviz_gap_stat()
gap_stat <- clusGap(mammals_scaled, FUN = kmeans, K.max = 24, B = 50)
fviz_gap_stat(gap_stat) + theme_minimal() + ggtitle("fviz_gap_stat: Gap Statistic")
The second method uses factoextra::fviz_nbclust() which
fviz_nbclust(mammals_scaled, kmeans, method = "gap_stat", k.max = 24, nboot = 50) + theme_minimal() + ggtitle("fviz_nbClust_gap_stat: Gap Statistic")
I thought it could possibly be the nstart option from clusGap() but when I use the jimhester/lookup to read the source code of fviz_nbclust() with the following code I couldn't find what the issue was:
devtools::install_github("jimhester/lookup")
lookup::lookup(fviz_nbclust)
function (x, FUNcluster = NULL, method = c("silhouette", "wss",
"gap_stat"), diss = NULL, k.max = 10, nboot = 100, verbose = interactive(),
barfill = "steelblue", barcolor = "steelblue", linecolor = "steelblue",
print.summary = TRUE, ...)
{
set.seed(123)
if (k.max < 2)
stop("k.max must bet > = 2")
method = match.arg(method)
if (!inherits(x, c("data.frame", "matrix")) & !("Best.nc" %in%
names(x)))
stop("x should be an object of class matrix/data.frame or ",
"an object created by the function NbClust() [NbClust package].")
if (inherits(x, "list") & "Best.nc" %in% names(x)) {
best_nc <- x$Best.nc
if (class(best_nc) == "numeric")
print(best_nc)
else if (class(best_nc) == "matrix")
.viz_NbClust(x, print.summary, barfill, barcolor)
}
else if (is.null(FUNcluster))
stop("The argument FUNcluster is required. ", "Possible values are kmeans, pam, hcut, clara, ...")
else if (method %in% c("silhouette", "wss")) {
if (is.data.frame(x))
x <- as.matrix(x)
if (is.null(diss))
diss <- stats::dist(x)
v <- rep(0, k.max)
if (method == "silhouette") {
for (i in 2:k.max) {
clust <- FUNcluster(x, i, ...)
v[i] <- .get_ave_sil_width(diss, clust$cluster)
}
}
else if (method == "wss") {
for (i in 1:k.max) {
clust <- FUNcluster(x, i, ...)
v[i] <- .get_withinSS(diss, clust$cluster)
}
}
df <- data.frame(clusters = as.factor(1:k.max), y = v)
ylab <- "Total Within Sum of Square"
if (method == "silhouette")
ylab <- "Average silhouette width"
p <- ggpubr::ggline(df, x = "clusters", y = "y", group = 1,
color = linecolor, ylab = ylab, xlab = "Number of clusters k",
main = "Optimal number of clusters")
if (method == "silhouette")
p <- p + geom_vline(xintercept = which.max(v), linetype = 2,
color = linecolor)
return(p)
}
else if (method == "gap_stat") {
extra_args <- list(...)
gap_stat <- cluster::clusGap(x, FUNcluster, K.max = k.max,
B = nboot, verbose = verbose, ...)
if (!is.null(extra_args$maxSE))
maxSE <- extra_args$maxSE
else maxSE <- list(method = "firstSEmax", SE.factor = 1)
p <- fviz_gap_stat(gap_stat, linecolor = linecolor,
maxSE = maxSE)
return(p)
}
}
The difference is right at the beginning of the fviz_nbclust function. In line 6 the random seed is set:
set.seed(123)
Because the kmeans algorithm uses a random start the results can be different in repeated runs. For example, I used your data with two different random seeds to arrive at slightly different results.
set.seed(123)
gap_stat <- cluster::clusGap(mammals_scaled, FUN = kmeans, K.max = 24, B = 50)
fviz_gap_stat(gap_stat) + theme_minimal() + ggtitle("fviz_gap_stat: Gap Statistic")
seed 123 gap stat
set.seed(42)
gap_stat <- cluster::clusGap(mammals_scaled, FUN = kmeans, K.max = 24, B = 50)
fviz_gap_stat(gap_stat) + theme_minimal() + ggtitle("fviz_gap_stat: Gap Statistic")
seed 42 gap stat
I'm not entirely sure why the seed 123 results are not the same but I think it is connected to the fact that in my code it is executed right above the clusGap function and in Fviz_nbclust several other commands are evaluated in between.

Resources