Many "duplicated levels in factors are deprecated" warning with ggplot2 - r

I have made a function that can plot the loadings from many factor analyses at once, also when their variables do not overlap perfectly (or at all). It works fine, except that it generates a number of "duplicated levels in factors are deprecated" warning, and I don't understand why.
The code allow should be reproducible.
library(devtools)
source_url("https://raw.githubusercontent.com/Deleetdk/psych2/master/psych2.R")
loadings.plot2 = function(fa.objects, fa.names="") {
fa.num = length(fa.objects) #number of fas
if (fa.names=="") {
fa.names = str_c("fa.", 1:fa.num)
}
if (length(fa.names) != fa.num) {
stop("Names vector does not match the number of factor analyses.")
}
#merge into df
d = data.frame() #to merge into
for (fa.idx in 1:fa.num) { #loop over fa objects
loads = fa.objects[[fa.idx]]$loadings
rnames = rownames(loads)
loads = as.data.frame(as.vector(loads))
rownames(loads) = rnames
colnames(loads) = fa.names[fa.idx]
d = merge.datasets(d, loads, 1)
}
#reshape to long form
d2 = reshape(d,
varying = 1:fa.num,
direction="long",
ids = rownames(d))
d2$time = as.factor(d2$time)
d2$id = as.factor(d2$id)
print(d2)
print(levels(d2$time))
print(levels(d2$id))
#plot
g = ggplot(reorder_by(id, ~ fa, d2), aes(x=fa, y=id, color=time)) +
geom_point() +
xlab("Loading") + ylab("Indicator") +
scale_color_discrete(name="Analysis",
labels=fa.names)
return(g)
}
fa1 = fa(iris[-5])
fa2 = fa(iris[-c(1:50),-5])
fa3 = fa(ability)
fa4 = fa(ability[1:50,])
loadings.plot2(list(fa1))
loadings.plot2(list(fa1,fa2))
loadings.plot2(list(fa1,fa2,fa3))
loadings.plot2(list(fa1,fa2,fa3,fa4))
Plotting different numbers of factors give different numbers of errors.
I have tried setting the variables as.factor before giving them to ggplot, but it didn't change anything.
Any ideas? Perhaps related to reorder_by()? This function is needed to sort the data.frame, otherwise ggplot sorts them alphabetically, which is useless.

As mentioned in the comments, this warning is caused by using the reorder_by() function but only in conjunction with ggplot2. Specifically, the levels are for some reason duplicated:
#> levels(d2$id)
[1] "Sepal.Width" "Sepal.Width" "Sepal.Length" "Sepal.Length" "Petal.Width" "Petal.Width" "Petal.Length"
[8] "Petal.Length"
ggplot2 does not like duplicate levels, and so gives the warning.
In case anyone is interested, I wrote new code to do the re-leveling myself to avoid this problem and to avoid the dependency on the plotflow package.
The new function is this:
#' Plot multiple factor loadings in one plot.
#'
#' Returns a ggplot2 plot with sorted loadings colored by the analysis they belong to. Supports reversing óf any factors that are reversed. Dodges to avoid overplotting. Only works for factor analyses with 1 factor solutions!
#' #param fa_objects (list of fa-class objects) Factor analyses objects from the fa() function from the \code{\link{psych}} package.
#' #param fa_labels (chr vector) Names of the analyses. Defaults to fa.1, fa.2, etc..
#' #param reverse_vector (num vector) Vector of numbers to use for reversing factors. Use e.g. c(1, -1) to reverse the second factor. Defaults not reversing.
#' #param reorder (chr scalar or NA) Which factor analysis to order the loadings by. Can be integers that reprensent each factor analysis. Can also be "mean", "median" to use the means and medians of the loadings. Use "all" for the old method. Default = "mean".
#' #export
#' #examples
#' library(psych)
#' plot_loadings_multi(fa(iris[-5])) #extract a factor and reverse
plot_loadings_multi = function (fa_objects, fa_labels, reverse_vector = NA, reorder = "mean") {
library("stringr")
library("ggplot2")
library("plyr")
fa_num = length(fa_objects)
fa_names = str_c("fa.", 1:fa_num)
if (!is.list(fa_objects)) {
stop("fa_objects parameter is not a list.")
}
if (class(fa_objects) %in% c("psych", "fa")) {
fa_objects = list(fa_objects)
fa_num = length(fa_objects)
fa_names = str_c("fa.", 1:fa_num)
}
if (missing("fa_labels")) {
if (!is.null(names(fa_objects))) {
fa_labels = names(fa_objects)
}
else {
fa_labels = fa_names
}
}
if (length(fa_labels) != fa_num) {
stop("Factor analysis labels length is not identical to number of analyses.")
}
if (all(is.na(reverse_vector))) {
reverse_vector = rep(1, fa_num)
}
else if (length(reverse_vector) != fa_num) {
stop("Length of reversing vector does not match number of factor analyses.")
}
d = data.frame()
for (fa.idx in 1:fa_num) {
loads = fa_objects[[fa.idx]]$loadings * reverse_vector[fa.idx]
rnames = rownames(loads)
loads = as.data.frame(as.vector(loads))
rownames(loads) = rnames
colnames(loads) = fa_names[fa.idx]
suppressor({
d = merge_datasets(d, loads, 1)
})
}
d2 = reshape(d, varying = 1:fa_num, direction = "long", ids = rownames(d))
d2$time = as.factor(d2$time)
d2$id = as.factor(d2$id)
colnames(d2)[2] = "fa"
#reorder factor?
if (!is.na(reorder)) {
if (reorder == "all") {
library("plotflow")
silence({
d2 = reorder_by(id, ~fa, d2)
})
} else if (reorder == "mean") {
v_aggregate_values = daply(d2, .(id), function(x) {
mean(x$fa)
})
#re-level
d2$id = factor(d2$id, levels = names(sort(v_aggregate_values, decreasing = F)))
} else if (reorder == "median") {
v_aggregate_values = daply(d2, .(id), function(x) {
median(x$fa)
})
#re-level
d2$id = factor(d2$id, levels = names(sort(v_aggregate_values, decreasing = F)))
} else {
d2_sub = d2[d2$time == reorder, ] #subset the analysis whose loading is to be used for the reorder
silence({
d2_sub = reorder_by(id, ~fa, d2_sub)
})
library(gdata)
d2$id = reorder.factor(d2$id, new.order = levels(d2_sub$id))
}
}
#plot
g = ggplot(d2, aes(x = id, y = fa, color = time, group = time)) +
geom_point(position = position_dodge(width = 0.5)) +
ylab("Loading") + xlab("Indicator") + scale_color_discrete(name = "Analysis",
labels = fa_labels) + coord_flip()
return(g)
}
library(psych)
fa_1 = fa(iris[-5])
fa_2 = fa(iris[1:125, -5])
plot_loadings_multi(list(fa_1, fa_2), reorder = "mean")
Which produces the following plot without warnings:
The code is from my personal package.

Related

Cant save ggplot graphs while using a loop in R

For some reason, I can't figure out, why when I run a ggplot loop to create multiple graphs I don't see them in the environment and hence can't further display the graphs.
Data sample.
db = data.frame(exposure = sample(1:100, 100),
exposure2 = sample(-90:100,100),
outcome = sample(200:1000,100))
exposure_vector = c("exposure","exposure2")
exposure_title = c("Pesticide","Apple")
for (i in 1:length(exposure_vector)) {
current_exposure = db[[exposure_vector[i]]]
title = exposure_title[i]
graph_name = paste0(title,"_","Graph")
graph_name=ggplot(db,aes(x=current_exposure,y=outcome))+geom_smooth()+
theme_bw()+ylab("outcome")+xlab("exposure")+ggtitle(title)
print(graph_name)
}
This is probably a better way to do what you are trying to do. You can mapply over your vectors of titles and exposures, which will return a list of graphs you can then refer to by name:
graphs <- mapply(X=exposure_title,Y=exposure_vector, function(X,Y){
ggplot(db,aes(x=.data[[Y]],y=outcome))+
geom_smooth()+
theme_bw()+
ylab("outcome")+
xlab("exposure")+
ggtitle(X)
}, SIMPLIFY = FALSE )
graphs$Pesticide
graphs$Apple
The graphname is out of scope.
You need to declare it outside the loop.
For example
db = data.frame(exposure = sample(1:100, 100),
exposure2 = sample(-90:100,100),
outcome = sample(200:1000,100))
exposure_vector = c("exposure","exposure2")
exposure_title = c("Pesticide","Apple")
plot <- list() #declare
for (i in 1:length(exposure_vector)) {
current_exposure = db[[exposure_vector[i]]]
title = exposure_title[i]
graph_name = paste0(title,"_","Graph")
graph_name=ggplot(db,aes(x=current_exposure,y=outcome))+geom_smooth()+
theme_bw()+ylab("outcome")+xlab("exposure")+ggtitle(title)
plot[[i]] <- graph_name #write
print(graph_name)
}
I assume that you want to assign to a variable, whose name is paste0(title, "_", "Graph"), the value of the plot. If this is correct, you should use assign
library(ggplot2)
db <- data.frame(exposure = sample(1:100, 100),
exposure2 = sample(-90:100,100),
outcome = sample(200:1000,100))
exposure_vector <- c("exposure","exposure2")
exposure_title <- c("Pesticide","Apple")
for (i in 1:length(exposure_vector)) {
current_exposure <- db[[exposure_vector[i]]]
title <- exposure_title[i]
graph_name <- paste0(title,"_","Graph")
p <- ggplot(db,aes(x=current_exposure,y=outcome))+
geom_smooth()+
theme_bw()+
ylab("outcome")+
xlab("exposure")+
ggtitle(title)
assign(graph_name, p)
print(p)
}
ls()
##> [1] "Apple_Graph" "current_exposure" "db" "exposure_title"
##> [5] "exposure_vector" "graph_name" "i" "p"
##> [9] "Pesticide_Graph" "title"

Iterating through a vector of names using for loop in R

library(sjPlot)
I am using sjPlot to create quick xtabs:
sjPlot::tab_xtab(var.row = mtcars$mpg ,var.col =mtcars$wt ,show.row.prc = TRUE)
But since I am running it over many cols, I want to set up a loop:
cols <- names(mtcars)
for (i in cols) {
sjPlot::tab_xtab(
var.row = mtcars$mpg,
var.col = mtcars$i,
show.row.prc = TRUE
)
}
But this throws an error:
Error in table(x_full, grp_full) :
all arguments must have the same length
Why is this happening, and can someone please explain subsetting through iteration, or point me to a good resource?
FYI ---
This does not return anything.
for (i in cols) {
i <- 'cyl'
sjPlot::tab_xtab(
var.row = mtcars$mpg,
var.col = mtcars[[i]],
show.row.prc = TRUE
)
}
But this does return the chart for mpg - cyl:
for (i in cols) {
sjPlot::tab_xtab(
var.row = mtcars$mpg,
var.col = mtcars$cyl,
show.row.prc = TRUE
)
}
Thanks for the comments, this is the solution, but still looking for some understanding. Why mtcars[[i]] instead of mtars[i]? Why doesn't mrcars$i work?
for (i in cols) {
plt <- sjPlot::tab_xtab(
var.row = mtcars$mpg,
var.col = mtcars[[i]],
show.row.prc = TRUE
)
print(plt)
}

R gt: color a column by another column's value

I would like to create a gt table where I display numeric values from two columns together in a single cell, but color the cells based on just one of the column's values.
For example using the ToothGrowth example data I'd like to put the len and dose columns together in a single cell but color the cell backgrounds by the value of dose.
I tried to manually create a vector of colors to color the len_dose column but this does not work because it seems like it is reapplying the color vector to each different level of len_dose, not dose. I guess you could manually format the cells with tab_style() but that seems inefficient and does not give you the nice feature where the text color changes to maximize contrast with background. I don't know an efficient way to do this.
What I tried:
library(gt)
library(dplyr)
library(scales)
library(glue)
# Manually map dose to color
dose_colors <- col_numeric(palette = 'Reds', domain = range(ToothGrowth$dose))(ToothGrowth$dose)
ToothGrowth %>%
mutate(len_dose = glue('{len}: ({dose})')) %>%
gt(rowname_col = 'supp') %>%
cols_hide(c(len, dose)) %>%
data_color(len_dose, colors = dose_colors)
Output (not good because not colored by dose):
Not sure if you found a solution to this yet but here is what I did:
If you use tab_style() you don't need to try and create the vector of colors and can instead set the background color you want based on the dose column. If you want to color values differently based on dose, in addition to what I've colored here, then create another tab_style() for the desired value.
library(gt)
library(dplyr)
library(scales)
library(glue)
ToothGrowth %>%
mutate(len_dose = glue('{len}: ({dose})')) %>%
gt(rowname_col = 'supp') %>%
tab_style(
style = cell_fill(color = "palegreen"),
location = cells_body(
columns = len_dose,
rows = dose >= 1.0
)
) %>%
cols_hide(c(len, dose))
I faced the same issue and adjusted the gt::data_color function to accept separate source and target columns - with that, the following should work to produce your desired output.
# Distinguish SOURCE_columns and TARGET_columns
my_data_color <- function (data, SOURCE_columns, TARGET_columns, colors, alpha = NULL, apply_to = c("fill",
"text"), autocolor_text = TRUE)
{
stop_if_not_gt(data = data)
apply_to <- match.arg(apply_to)
colors <- rlang::enquo(colors)
data_tbl <- dt_data_get(data = data)
colors <- rlang::eval_tidy(colors, data_tbl)
resolved_source_columns <- resolve_cols_c(expr = {
{
SOURCE_columns
}
}, data = data)
resolved_target_columns <- resolve_cols_c(expr = {
{
TARGET_columns
}
}, data = data)
rows <- seq_len(nrow(data_tbl))
data_color_styles_tbl <- dplyr::tibble(locname = character(0),
grpname = character(0), colname = character(0), locnum = numeric(0),
rownum = integer(0), colnum = integer(0), styles = list())
for (i in seq_along(resolved_source_columns)) {
data_vals <- data_tbl[[resolved_source_columns[i]]][rows]
if (inherits(colors, "character")) {
if (is.numeric(data_vals)) {
color_fn <- scales::col_numeric(palette = colors,
domain = data_vals, alpha = TRUE)
}
else if (is.character(data_vals) || is.factor(data_vals)) {
if (length(colors) > 1) {
nlvl <- if (is.factor(data_vals)) {
nlevels(data_vals)
}
else {
nlevels(factor(data_vals))
}
if (length(colors) > nlvl) {
colors <- colors[seq_len(nlvl)]
}
}
color_fn <- scales::col_factor(palette = colors,
domain = data_vals, alpha = TRUE)
}
else {
cli::cli_abort("Don't know how to map colors to a column of class {class(data_vals)[1]}.")
}
}
else if (inherits(colors, "function")) {
color_fn <- colors
}
else {
cli::cli_abort("The `colors` arg must be either a character vector of colors or a function.")
}
color_fn <- rlang::eval_tidy(color_fn, data_tbl)
color_vals <- color_fn(data_vals)
color_vals <- html_color(colors = color_vals, alpha = alpha)
color_styles <- switch(apply_to, fill = lapply(color_vals,
FUN = function(x) cell_fill(color = x)), text = lapply(color_vals,
FUN = function(x) cell_text(color = x)))
data_color_styles_tbl <- dplyr::bind_rows(data_color_styles_tbl,
generate_data_color_styles_tbl(column = resolved_target_columns[i], rows = rows,
color_styles = color_styles))
if (apply_to == "fill" && autocolor_text) {
color_vals <- ideal_fgnd_color(bgnd_color = color_vals)
color_styles <- lapply(color_vals, FUN = function(x) cell_text(color = x))
data_color_styles_tbl <- dplyr::bind_rows(data_color_styles_tbl,
generate_data_color_styles_tbl(column = resolved_target_columns[i],
rows = rows, color_styles = color_styles))
}
}
dt_styles_set(data = data, styles = dplyr::bind_rows(dt_styles_get(data = data),
data_color_styles_tbl))
}
# Add function into gt namespace (so that internal gt functions can be called)
library(gt)
tmpfun <- get("data_color", envir = asNamespace("gt"))
environment(my_data_color) <- environment(tmpfun)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(glue)
# Map dose to color
ToothGrowth %>%
mutate(len_dose = glue('{len}: ({dose})')) %>%
gt(rowname_col = 'supp') %>%
cols_hide(c(len, dose)) %>%
my_data_color(SOURCE_columns = "dose", TARGET_columns = "len_dose",
colors = scales::col_numeric(palette = c("red", "green"), domain = c(min(ToothGrowth$dose), max(ToothGrowth$dose))))
Created on 2022-11-03 with reprex v2.0.2

How to change plotting parameters of a function within a wrapper (R)

I'm trying to generate heatmaps by using cellrangerRkit package. The function within this package refers to pheatmap function featured in pheatmap library as seen below:
gbm_pheatmap
function (gbm, genes_to_plot, cells_to_plot, n_genes = 5, colour = NULL,
limits = c(-3, 3))
{
if (!is.list(genes_to_plot)) {
cat("Plotting one gene set instead of multiple cluster-specific gene sets\n")
gene_indices <- sapply(genes_to_plot, function(x) get_gene_index(gbm,
x))
gene_annotation <- NULL
}
else {
if ("significant" %in% names(genes_to_plot[[1]])) {
gene_indices <- unlist(lapply(genes_to_plot, function(x) with(x,
head(ix[significant], n_genes))))
gene_grouping <- unlist(lapply(names(genes_to_plot),
function(nm) rep(nm, with(genes_to_plot[[nm]],
length(head(ix[significant], n_genes))))))
}
else {
gene_indices <- unlist(lapply(genes_to_plot, function(x) x$ix[1:n_genes]))
gene_grouping <- rep(names(genes_to_plot), each = n_genes)
}
gene_annotation <- data.frame(ClusterID = as.factor(gene_grouping))
}
cell_indices <- unlist(lapply(cells_to_plot, function(x) x$ix))
value <- t(scale(t(as.matrix(exprs(gbm))[gene_indices, cell_indices])))
value[value < limits[1]] <- limits[1]
value[value > limits[2]] <- limits[2]
rownames(value) <- make.unique(fData(gbm)$symbol[gene_indices])
cell_grouping <- unlist(lapply(1:length(cells_to_plot), function(x) {
rep(names(cells_to_plot)[x], length(cells_to_plot[[x]]$barcode))
}))
cell_annotation <- data.frame(ClusterID = as.factor(cell_grouping))
rownames(cell_annotation) <- colnames(value)
if (!is.null(gene_annotation)) {
rownames(gene_annotation) <- rownames(value)
}
if (is.null(colour)) {
anno_colors <- NULL
}
else {
names(colour) <- names(cells_to_plot)
anno_colors <- list(ClusterID = colour)
}
pheatmap(value, cluster_rows = FALSE, cluster_cols = FALSE,
show_colnames = FALSE, annotation_row = gene_annotation,
annotation_col = cell_annotation, annotation_names_row = FALSE,
annotation_names_col = FALSE, annotation_colors = anno_colors)
}
<bytecode: 0x00000000507b7970>
<environment: namespace:cellrangerRkit>
My problem is that, when I plot my heatmap, the annotation on the right side of the plot is overlapping due to large font size (see below)
The wrapper function gbm_heatmap doesn't have a fontsize option, preventing me from simply passing an argument when calling it. How I can change the plotting behavior within this wrapper?
Appreciate all the input, thanks!

accessing eigenvalues in RSSA package in R

I am using RSSA package in R and I need to access the eigenvalues.
using the following code I can plot the components. However, I need to access all eigenvalues as numbers.
require(Rssa)
t=ssa(co2)
plot(t)
I know almost nothing about this package. I'm taking from context that you want the values that are plotted on the y-axis of that graphic. Lacking a reproducible example, I turn to the ?ssa help page and use the first example:
> s <- ssa(co2)
>
> plot(s)
So that looks like your plot: Then I look at the code
> getAnywhere(plot.ssa)
A single object matching ‘plot.ssa’ was found
It was found in the following places
registered S3 method for plot from namespace Rssa
namespace:Rssa
with value
function (x, type = c("values", "vectors", "paired", "series",
"wcor"), ..., vectors = c("eigen", "factor"), plot.contrib = TRUE,
numvalues = nsigma(x), numvectors = min(nsigma(x), 10), idx = 1:numvectors,
idy, groups)
{
type <- match.arg(type)
vectors <- match.arg(vectors)
if (identical(type, "values")) {
.plot.ssa.values(x, ..., numvalues = numvalues)
}
else if (identical(type, "vectors")) {
.plot.ssa.vectors(x, ..., what = vectors, plot.contrib = plot.contrib,
idx = idx)
}
else if (identical(type, "paired")) {
if (missing(idy))
idy <- idx + 1
.plot.ssa.paired(x, ..., what = vectors, plot.contrib = plot.contrib,
idx = idx, idy = idy)
}
else if (identical(type, "series")) {
if (missing(groups))
groups <- as.list(1:min(nsigma(x), nu(x)))
.plot.ssa.series(x, ..., groups = groups)
}
else if (identical(type, "wcor")) {
if (missing(groups))
groups <- as.list(1:min(nsigma(x), nu(x)))
plot(wcor(x, groups = groups), ...)
}
else {
stop("Unsupported type of SSA plot!")
}
}
<environment: namespace:Rssa>
So then I look at the function called when the default arguments are used:
> getAnywhere(.plot.ssa.values)
A single object matching ‘.plot.ssa.values’ was found
It was found in the following places
namespace:Rssa
with value
function (x, ..., numvalues, plot.type = "b")
{
dots <- list(...)
d <- data.frame(A = 1:numvalues, B = x$sigma[1:numvalues])
dots <- .defaults(dots, type = plot.type, xlab = "Index",
ylab = "norms", main = "Component norms", grid = TRUE,
scales = list(y = list(log = TRUE)), par.settings = list(plot.symbol = list(pch = 20)))
do.call("xyplot", c(list(x = B ~ A, data = d, ssaobj = x),
dots))
}
<environment: namespace:Rssa>
So the answer appears to be:
s$sigma
[1] 78886.190749 329.031810 327.198387 184.659743 88.695271 88.191805
[7] 52.380502 40.527875 31.329930 29.409384 27.157698 22.334446
[13] 17.237926 14.175096 14.111402 12.976716 12.943775 12.216524
[19] 11.830642 11.614243 11.226010 10.457529 10.435998 9.774000
[25] 9.710220 9.046872 8.995923 8.928725 8.809155 8.548962
[31] 8.358872 7.699094 7.266915 7.243014 7.164837 6.203210
[37] 6.085105 6.064150 6.035110 6.028446 5.845783 5.808865
[43] 5.770708 5.753422 5.680897 5.672330 5.650324 5.612606
[49] 5.599314 5.572931

Resources