R: How do you order error plots by variance? - r

Is it possible to order error plots in R by their variance? So that they are from greatest variance to least?
Code:
library(ggplot2)
df <- ToothGrowth
df$dose <- as.factor(df$dose)
head(df, 3)
library(dplyr)
df.summary <- df %>%
group_by(dose) %>%
summarise(
sd = sd(len, na.rm = TRUE),
len = mean(len)
)
df.summary
f <- ggplot(
df.summary,
aes(x = dose, y = len, ymin = len-sd, ymax = len+sd)
)
f + geom_pointrange()
# Standard error bars
f + geom_errorbar(width = 0.2) +
geom_point(size = 1.5)
Any help at all would be greatly appreciated!

Continue the pipe, coercing dose to an ordered factor whose levels are ordered by the numeric sd (largest first). Then plot as in the question. All that needs to change is the mutate below.
# Summarise tooth length per dose, then turn `dose` into an ordered factor
# whose levels run from the largest to the smallest standard deviation, so
# the groups are laid out in that order on a discrete axis.
df.summary <- df %>%
  group_by(dose) %>%
  summarise(
    sd = sd(len, na.rm = TRUE),
    # `len` is replaced by its mean last, so `sd` above is computed on raw values.
    len = mean(len)
  ) %>%
  mutate(
    dose = ordered(dose, levels = dose[order(sd, decreasing = TRUE)])
  )

Related

How to visualize similar resistance pattern in a plot using R

I have a large dataset in which I want to group similar resistance patterns together. A plot to visualize similarity of resistance pattern is needed.
dat <- read.table(text="Id Resistance.Pattern
A SSRRSSSSR
B SSSRSSSSR
C RRRRSSRRR
D SSSSSSSSS
E SSRSSSSSR
F SSSRRSSRR
G SSSSR
H SSSSSSRRR
I RRSSRRRSS", header=TRUE)
I would separate out the values into a wider dataframe and then make a heatmap and dendrogram to compare similarities in patterns:
library(tidyverse)
library(ggdendro)
recode_dat <- dat |>
mutate(pat = str_split(Resistance.Pattern, "")) |>
unnest_wider(pat, names_sep = "_") |>
select(starts_with("pat_")) |>
mutate(across(everything(), ~case_when(. == "S" ~ 1, . == "R" ~ 2, is.na(.) ~0)))
rownames(recode_dat) <- dat$Id
dendro <- as.dendrogram(hclust(d = dist(x = scale(recode_dat))))
dendro_plot <- ggdendrogram(data = dendro, rotate = TRUE)
heatmap_plot <- dat |>
mutate(pat = str_split(Resistance.Pattern, "")) |>
unnest_wider(pat, names_sep = "_") |>
pivot_longer(cols = starts_with("pat_"), names_to = "pattern_position") |>
mutate(Id = factor(Id, levels = dat$Id[order.dendrogram(dendro)])) |>
ggplot(aes(pattern_position, Id))+
geom_tile(aes(fill = value))+
scale_x_discrete(labels = \(x) sub(".*_(\\d+$)", "\\1", x))+
theme(legend.position = "top")
cowplot::plot_grid(heatmap_plot, dendro_plot,nrow = 1, align = "h", axis = "tb")
It sounds as though the second column of your data frame represents sensitivity (S) and resistance (R), presumably to antibiotics (though this is not clear in your question). That being the case, you are presumably looking for something like this:
library(tidyverse)
p <- strsplit(dat$Resistance.Pattern, "")
do.call(rbind, lapply(p, \(x) c(x, rep(NA, max(lengths(p)) - length(x))))) %>%
as.data.frame() %>%
cbind(Id = dat$Id) %>%
mutate(Id = factor(Id, rev(Id))) %>%
pivot_longer(V1:V9) %>%
ggplot(aes(name, Id, fill = value)) +
geom_tile(col = "white", size = 2) +
coord_equal() +
scale_fill_manual(values = c("#e02430", "#d8d848"),
labels = c("Resistant", "Sensitive"),
na.value = "gray95") +
scale_x_discrete(name = "Antibiotic", position = "top",
labels = 1:9) +
labs(fill = "Resistance", y = "ID") +
theme_minimal(base_size = 20) +
theme(text = element_text(color = "gray30"))
I'd separate the entries by character, convert the binary data to numeric and plot the matrix as a heatmap and show the character string as rownames.
Whether to use row and/or column clustering depends on what's desired.
library(dplyr)
library(tidyr) # for unnest_wider
library(gplots) # for heatmap.2
mm <-
dat %>%
group_by(Resistance.Pattern) %>%
summarize(Id, Resistance.Pattern) %>%
mutate(binary = strsplit(Resistance.Pattern, "")) %>%
unnest_wider(binary, names_sep="") %>%
mutate(across(starts_with("binary"), ~ as.numeric(c(R = 1, S = 0)[.x])))
mm2 <- as.matrix(mm[, -c(1,2)]) |> unname() # the numeric part
rownames(mm2) <- apply(as.matrix(mm[,1:2]), 1, paste, collapse=" ")
heatmap.2(mm2, trace="none", Colv="none", dendrogram="row",
col=c("green", "darkgreen"), margins=c(10,10))

Logit-Normal distribution in R

I am trying to replicate the published code for ML-NMR from the multinma package, which is published here:
https://cran.r-project.org/web/packages/multinma/vignettes/example_plaque_psoriasis.html#ref-methods_paperl.
When I get to the following steps, I ran into problems applying the dlogitnorm function from the logitnorm package. I assume this is due to package dependencies. I've replaced the last line of the code since the function takes as arguments 'mu' and 'sigma', instead of 'mean' and 'sd'. However, when I produce the histogram, it doesn't match the published one. Any ideas how to match the published histogram?
library(multinma)
library(logitnorm) # required to use logit-Normal distribution
# Get mean, sd, min and max of the covariates in each study, then build one
# density curve per study/covariate from an assumed distribution.
ipd_summary <- pso_ipd %>%
  group_by(studyc) %>%
  summarise_at(vars(weight, durnpso, bsa), list(mean = mean, sd = sd, min = min, max = max)) %>%
  pivot_longer(weight_mean:bsa_max, names_sep = "_", names_to = c("covariate", ".value")) %>%
  # Assign distributions: the name of the density function to call per covariate
  mutate(dist = recode(covariate,
                       bsa = "dlogitnorm",
                       durnpso = "dgamma",
                       weight = "dgamma")) %>%
  # Compute density curves on a 101-point grid per study/covariate
  group_by(studyc, covariate) %>%
  mutate(value = if_else(dist == "dlogitnorm",
                         list(seq(0, 1, length.out = 101)),
                         list(seq(min*0.8, max*1.2, length.out = 101)))) %>%
  unnest(cols = value) %>%
  # Note: the next step was edited from the original code to work around an
  # error from dlogitnorm(), which takes 'mu' and 'sigma' as arguments.
  # Original line:
  # mutate(dens = eval(call(first(dist), x = value, mean = first(mean), sd = first(sd))))
  #
  # ifelse() only evaluates its `yes` argument when at least one test is TRUE,
  # so within a group only the call matching that group's distribution runs.
  #
  # NOTE(review): mu/sigma of a logit-Normal are parameters on the logit scale,
  # whereas first(mean)/first(sd) are summaries of the raw bsa values. This is
  # presumably why the curve differs from the published plot -- confirm, e.g.
  # against a dlogitnorm() that accepts mean/sd directly.
  mutate(
    dens = ifelse(dist != "dlogitnorm",
                  eval(call(first(dist), x = value, mean = first(mean), sd = first(sd))),
                  NA),
    dens = ifelse(dist == "dlogitnorm",
                  eval(call(first(dist), x = value, mu = first(mean), sigma = first(sd), log = FALSE)),
                  dens)
  )
# Plot histograms and assumed densities
pso_ipd %>%
pivot_longer(c(weight, durnpso, bsa), names_to = "covariate", values_to = "value") %>%
ggplot(aes(x = value)) +
geom_histogram(aes(y = stat(density)),
binwidth = function(x) diff(range(x)) / nclass.Sturges(x),
boundary = 0,
fill = "grey50") +
geom_line(aes(y = dens), data = ipd_summary,
colour = "darkred", size = 0.5) +
facet_wrap(~studyc + covariate, scales = "free", ncol = 3) +
theme_multinma()

How to plot line graph of normalized differences from binned data with ggplot?

I have several sets of data that I calculate binned normalized differences for. The results I want to plot within a single line plot using ggplot. The lines representing different combinations of the paired differences are supposed to be distinguished by colors and line types.
I am stuck on taking the computed values from the bins (would be y-axis values now), and plotting these onto an x-axis.
Below is the code I use for importing the data and calculating the normalized differences.
# Read data from column 3 as data table for different number of rows
# you could use replicate here for test
# dat1 <- data.frame(replicate(1,sample(25:50,10000,rep=TRUE)))
# dat2 <- data.frame(replicate(1,sample(25:50,9500,rep=TRUE)))
dat1 <- fread("/dir01/a/dat01.txt", header = FALSE, data.table=FALSE, select=c(3))
dat2 <- fread("/dir02/c/dat02.txt", header = FALSE, data.table=FALSE, select=c(3))
# Change column names
colnames(dat1) <- c("Dat1")
colnames(dat2) <- c("Dat2")
# Perhaps there is a better way to compute the following as all-in-one? I have broken these down step by step.
# 1) Sum for each bin
bin1 = cut(dat1$Dat1, breaks = seq(25, 50, by = 2))
sum1 = tapply(dat1$Dat1, bin1, sum)
bin2 = cut(dat2$Dat2, breaks = seq(25, 50, by = 2))
sum2 = tapply(dat2$Dat2, bin2, sum)
# 2) Total sum of all bins
sumt1 = sum(sum1)
sumt2 = sum(sum2)
# 3) Divide each bin by total sum of all bins
sumn1 = lapply(sum1, `/`, sumt1)
sumn2 = lapply(sum2, `/`, sumt2)
# 4) Convert to data frame as I'm not sure how to difference otherwise
df_sumn1 = data.frame(sumn1)
df_sumn2 = data.frame(sumn2)
# 5) Difference between the two as percentage
dbin = (df_sumn1 - df_sumn2)*100
How can I plot those results using ggplot() and geom_line()?
I want
dbin values on the x-axis ranging from 25-50
different colors and line types for the lines
Here is what I tried:
p1 <- ggplot(dbin, aes(x = ?, color=Data, linetype=Data)) +
geom_line() +
scale_linetype_manual(values=c("solid")) +
scale_x_continuous(limits = c(25, 50)) +
scale_color_manual(values = c("#000000"))
dput(dbin) outputs:
structure(list(X.25.27. = -0.0729132928804117, X.27.29. = -0.119044772581772,
X.29.31. = 0.316016473225017, X.31.33. = -0.292812782147632,
X.33.35. = 0.0776336591308158, X.35.37. = 0.0205584754637611,
X.37.39. = -0.300768421159599, X.39.41. = -0.403235174844081,
X.41.43. = 0.392510458816457, X.43.45. = 0.686758883448307,
X.45.47. = -0.25387105113263, X.47.49. = -0.0508324553382303), class = "data.frame", row.names = c(NA,
-1L))
Edit
The final piece of code that works, using only the dbin and plots multiple dbins:
dat1 <- data.frame(a = replicate(1,sample(25:50,10000,rep=TRUE, prob = 25:0/100)))
dat2 <- data.frame(a = replicate(1,sample(25:50,9500,rep=TRUE, prob = 0:25/100)))
dat3 <- data.frame(a = replicate(1,sample(25:50,9500,rep=TRUE, prob = 12:37/100)))
dat4 <- data.frame(a = replicate(1,sample(25:50,9500,rep=TRUE, prob = 37:12/100)))
# Share of the binned total that falls into each bin.
#
# data: a data frame (anything as_tibble() accepts) with a numeric column `a`.
# Returns a tibble with `bin` (interval label produced by cut()) and `sum`
# (the bin's sum of `a` divided by the sum over all retained bins).
calc_bin_props <- function(data) {
  binned <- mutate(as_tibble(data), bin = cut(a, breaks = seq(25, 50, by = 2)))
  per_bin <- summarise(group_by(binned, bin), sum = sum(a), .groups = "drop")
  # cut() returns NA for values outside the breaks; its intervals are
  # right-closed by default, so with these breaks a value of exactly 25 (and
  # anything above 49) ends up in the NA row that is discarded here.
  per_bin <- filter(per_bin, !is.na(bin))
  # Normalise so the retained bins add up to 1.
  mutate(per_bin, sum = sum / sum(sum))
}
# Difference between the normalised bin shares of dat1 and dat2, as a
# percentage.
diff_data <-
  full_join(
    calc_bin_props(data = dat1),
    calc_bin_props(dat2),
    by = "bin") %>%
  # Interval labels look like "(25,27]"; separate() splits on runs of
  # non-alphanumeric characters, giving an empty piece, the lower bound, the
  # upper bound and another empty piece. Only the lower bound is kept as `bin`.
  separate(bin, c("trsh", "bin", "trshb", "trshc")) %>%
  # Subtract first, then scale: `sum.x - sum.y * 100` would multiply only sum.y.
  mutate(dbinA = (sum.x - sum.y) * 100) %>%
  select(-starts_with("trsh"))
# Same computation for the second pair of data sets (dat3 vs. dat4).
diff_data2 <-
  full_join(
    calc_bin_props(data = dat3),
    calc_bin_props(dat4),
    by = "bin") %>%
  separate(bin, c("trsh", "bin", "trshb", "trshc")) %>%
  mutate(dbinB = (sum.x - sum.y) * 100) %>%
  select(-starts_with("trsh"))
# Combine two differences, and remove sum.x and sum.y
full_data <- cbind(diff_data, diff_data2[,4])
full_data <- full_data[,-c(2:3)]
# Melt the data to plot more than 1 variable on a plot
m <- melt(full_data, id.vars="bin")
theme_update(plot.title = element_text(hjust = 0.5))
ggplot(m, aes(as.numeric(bin), value, col=variable, linetype = variable)) +
geom_line() +
scale_linetype_manual(values=c("solid", "longdash")) +
scale_color_manual(values = c("black", "black"))
dev.off()
library(tidyverse)
Creating example data as shown in the question, but adding different probabilities to the two sample() calls, to create a visible difference
between the two sets of randomized data.
dat1 <- data.frame(a = replicate(1,sample(25:50,10000,rep=TRUE, prob = 25:0/100))) %>% as_tibble()
dat2 <- data.frame(a = replicate(1,sample(25:50,9500,rep=TRUE, prob = 0:25/100))) %>% as_tibble()
Using dplyr we can handle this within data.frames (tibbles) without
the need to switch to other datatypes.
Let’s define a function that can be applied to both datasets to get
the preprocessing done.
We use base::cut() to create
a new column that pairs each value with its bin. We then group the data
by bin, calculate the sum for each bin and finally divide the bin sums
by the total sum.
# Share of the binned total that falls into each bin.
#
# data: a data frame (anything as_tibble() accepts) with a numeric column `a`.
# Returns a tibble with `bin` (a factor labelled by each interval's lower
# bound) and `sum` (the bin's sum of `a` divided by the sum over all retained
# bins).
calc_bin_props <- function(data) {
  binned <- mutate(
    as_tibble(data),
    bin = cut(a, breaks = seq(25, 50, by = 2), labels = seq(25, 48, by = 2))
  )
  per_bin <- summarise(group_by(binned, bin), sum = sum(a), .groups = "drop")
  # Values outside the breaks get an NA bin from cut(); drop that row before
  # normalising.
  per_bin <- filter(per_bin, !is.na(bin))
  mutate(per_bin, sum = sum / sum(sum))
}
Now we call calc_bin_props() on both datasets and join them by bin.
This gives us a dataframe with the columns bin, sum.x and sum.y.
The latter two correspond to the bin sums derived from dat1 and
dat2. With the mutate() line we calculate the differences between the
two columns.
# Join the normalised bin shares of both data sets and add their difference.
# `bin` is a factor labelled with numbers, so it is converted via character
# to obtain the label values rather than the internal level codes.
bin_props_1 <- calc_bin_props(data = dat1)
bin_props_2 <- calc_bin_props(dat2)
diff_data <- full_join(bin_props_1, bin_props_2, by = "bin") %>%
  mutate(
    dbin = (sum.x - sum.y),
    bin = as.numeric(as.character(bin))
  )
Before we feed the data into ggplot() we convert it to the long
format using pivot_longer(). This allows us to instruct ggplot() to
plot the results for sum.x, sum.y and dbin as separate lines.
diff_data %>%
pivot_longer(-bin) %>%
ggplot(aes(as.numeric(bin), value, color = name, linetype = name)) +
geom_line() +
scale_linetype_manual(values=c("longdash", "solid", "solid")) +
scale_color_manual(values = c("black", "purple", "green"))

Create all possible geom_col charts of the dataset using rowwise and rlang [closed]

Closed. This question needs details or clarity. It is not currently accepting answers.
Want to improve this question? Add details and clarify the problem by editing this post.
Closed 2 years ago.
Improve this question
I want to create and save all the possible geom_col charts of a dataset with categorical variables on Y axis and average (grouping variable being y) value of numeric variables on x axis.
I borrowed ideas from this and this link and created the following code, but struggling to make it work. Please advise.
library(tidyverse)
library(skimr)
cat_vars <- skim(mpg) %>% as.data.frame() %>% filter(skim_type == "character", character.n_unique <= 16) %>% pull(skim_variable)
num_vars <- skim(mpg) %>% as.data.frame() %>% filter(skim_type == "numeric") %>% pull(skim_variable)
vars <- cross_df(list(y= cat_vars, x = num_vars))
plots <- bind_rows(rep(list(mpg), nrow(vars)), .id = "grp") %>%
nest_by(grp) %>%
bind_cols(vars) %>%
rowwise() %>%
mutate(plot = list(~(data %>%
group_by(y) %>%
summarise("{x}" = mean(.data[[x]], na.rm = T)) %>%
ungroup()) %>%
ggplot() +
geom_col(aes(x = .data[[x]],
y = fct_reorder(.data[[y]], .data[[x]], .fun = sum, na.rm = T),
fill = .data[[y]]), width = 0.8) +
xlab(paste0("Avg. ", x)) +
ylab(y) +
theme_classic()))),
filename = paste0(x, "_by_", y, ".pdf")) %>%
select(filename, plot)
pwalk(plots, ggsave, path = getwd())
There is no need to bind_rows, nest, ... Instead:
Put your plotting code in a helper function instead of wrapping everything in a pipeline. This allows for much easier debugging as well as cleaner and clearer code.
Use map2 to loop over the columns of your df vars
One issue with your code was summarise("{x}" = mean(.data[[x]], na.rm = T)) which will create a variable named {x}. Instead use e.g. !!sym(x) := ... to assign the value back to x if x is a string.
library(tidyverse)
library(skimr)
cat_vars <- skim(mpg) %>% as.data.frame() %>% filter(skim_type == "character", character.n_unique <= 16) %>% pull(skim_variable)
num_vars <- skim(mpg) %>% as.data.frame() %>% filter(skim_type == "numeric") %>% pull(skim_variable)
vars <- cross_df(list(y= cat_vars, x = num_vars))
# Horizontal bar chart of the mean of a numeric column for each level of a
# categorical column, with bars ordered by that mean.
#
# data: data frame containing both columns.
# x:    name (string) of the numeric column to average.
# y:    name (string) of the categorical column to group by.
# Returns a ggplot object.
make_plot <- function(data, x, y) {
  data <- data %>%
    group_by(across(all_of(y))) %>%
    # `!!sym(x) :=` writes the mean back under the column name held in `x`;
    # `.groups = "drop"` already returns an ungrouped tibble.
    summarise(!!sym(x) := mean(.data[[x]], na.rm = TRUE), .groups = "drop")

  ggplot(data) +
    geom_col(aes(x = .data[[x]],
                 y = fct_reorder(.data[[y]], .data[[x]], .fun = sum, na.rm = TRUE),
                 fill = .data[[y]]), width = 0.8) +
    xlab(paste0("Avg. ", x)) +
    ylab(y) +
    theme_classic()
}
plots <- map2(vars$x, vars$y, make_plot, data = mpg)
length(plots)
#> [1] 25
plots[[1]]
# Export
pwalk(list(x = vars$x, y = vars$y, p = plots), function(x, y, p) ggsave(filename = paste0(x, "_by_", y, ".pdf"), plot = p, path = getwd()))

How can we data wrangling to obtain shown ratio/proportion chart shown

Goal is to produce a visualization indicating ratio.
How can we produce such a ratio chart (highlighted) in R?
library(tidyverse)
# Dataset creation
df <- data.frame(cls = c(rep("A",4),rep("B",4)),
grd = c("A1",rep("A2",3),rep(c("B1","B2"), 2)),
typ = c(rep("m",2),rep("o",2),"m","n",rep("p",2)),
pnts = c(rep(1:4,2)))
df
#### Data wrangling
df1 <- df %>%
group_by(cls) %>%
summarise(cls_pct = sum(pnts))
df1
df2 <- df %>%
group_by(cls,grd) %>%
summarize(grd_pct = sum(pnts))
df2
df3 <- df %>%
group_by(cls,grd,typ) %>%
summarise(typ_pct = sum(pnts))
df3
#### Attempt to combine all df1,df2,df3
# but mutate and summarise are mixing up leading to wrong results
df3 %>%
group_by(cls,grd) %>%
mutate(grd_pct = sum(typ_pct)) %>%
group_by(cls) %>%
mutate(cls_pct = sum(grd_pct))
Attempt to visualize all the ratios in 1 chart
data %>%
pivot_longer(cols = -c(cls:pnts),
names_to = "per_cat",
values_to = "percent") %>%
ggplot(aes(cls,percent, col = typ, fill = grd)) +
geom_bar(stat = "identity") +
coord_flip() +
theme_bw()
plot of the same.
EDIT -- added formula version with more useful output for visualization.
ORIG: At this point it may be worth making a function to reduce copying and pasting, but this may get you what you need:
library(tidyverse)
df %>%
group_by(cls) %>%
mutate(per1 = sum(pnts),
per1_pct = per1 / sum(per1)) %>%
group_by(cls, grd) %>%
mutate(per2 = sum(pnts),
per2_pct = per2 / sum(per2)) %>%
group_by(cls, grd, typ) %>%
mutate(per3 = sum(pnts),
per3_pct = per3 / sum(per3)) %>%
ungroup()
EDIT: Here's a general function to calculate the stats for a given grouping, making it easier to combine a few groupings together in long format better suited for visualization.
# Totals and shares of `pnts` for one grouping depth.
#
# df:    data frame with a numeric `pnts` column and the grouping columns.
# level: value stored in the `level` column of the result, identifying the
#        grouping depth (e.g. 1, 2, 3).
# ...:   grouping columns, outermost first.
# Returns an ungrouped tibble with the grouping columns, `grp_ttl` (sum of
# `pnts` in the group), `ttl` (sum of `grp_ttl` over the parent group, or over
# everything when there is a single grouping column), `pct` (grp_ttl / ttl)
# and `level`.
df_sum <- function(df, level, ...) {
  df %>%
    group_by(...) %>%
    # Peel off only the innermost grouping so that the mutate() below computes
    # `ttl` within the parent group; stated explicitly rather than relying on
    # the default (which also prints a regrouping message).
    summarize(grp_ttl = sum(pnts), .groups = "drop_last") %>%
    mutate(ttl = sum(grp_ttl),
           pct = grp_ttl / ttl) %>%
    ungroup() %>%
    mutate(level = {{ level }})
}
# Stack the summaries for the three grouping depths and give every row a
# display label: the most specific grouping value available for that row.
df_summed <- bind_rows(
  df_sum(df, level = 1, cls),
  df_sum(df, level = 2, cls, grd),
  df_sum(df, level = 3, cls, grd, typ)
) %>%
  mutate(
    # coalesce() takes the first non-NA value: typ, else grd, else cls.
    label = coalesce(
      as.character(typ),
      as.character(grd),
      as.character(cls)
    )
  )
df_summed %>%
ggplot(aes(level, grp_ttl)) +
geom_col(color = "white") +
geom_text(aes(label = paste0(label, "\n", grp_ttl, "/", ttl)),
color = "white",
position = position_stack(vjust = 0.5)) +
scale_x_reverse() + # To make level 1 at the top
coord_flip() # To switch from vertical to horizontal orientation

Resources