Can you facet on a indicator variable in ggplot2? - r

types = c("A", "B", "C")
df = data.frame(n = rnorm(100), type=sample(types, 100, replace = TRUE))
ggplot(data=df, aes(n)) + geom_histogram() + facet_grid(~type)
Above is how I normally used facetting. But can I use it when instead of a categorical variable I have a set of columns that are indicator variables such as:
df = data.frame(n = rnorm(100), A=rbinom(100, 1, .5), B=rbinom(100, 1, .5), C=rbinom(100, 1, .5))
Now the "Type" variable from my previous example isn't mutually exclusive. An observation can be "A and B" or "A and B and C" for example. However, I'd still like an individual histogram for any observation that has the presence of A, B, or C?

I would reshape the data with tidyr so that data in more that one category are duplicated. filter to remove unwanted cases.
df <- data.frame(
n = rnorm(100),
A = rbinom(100, 1, .5),
B = rbinom(100, 1, .5),
C = rbinom(100, 1, .5)
)
library("tidyr")
library("dplyr")
library("ggplot2")
df %>% gather(key = "type", value = "value", -n) %>%
filter(value == 1) %>%
ggplot(aes(x = n)) +
geom_histogram() +
facet_wrap(~type)

I've always despised gather, so I'll add another method and one for the data.table fans.
library(data.table)
DT <- melt(setDT(df), id= "n", variable = "type")[value > 0]
ggplot(DT,aes(n)) + geom_histogram() + facet_grid(~type)
#tidyland
library(reshape2)
library(dplyr)
library(ggplot2)
df %>%
melt(id = "n", variable = "type") %>%
filter(value > 0) %>%
ggplot(aes(n)) + geom_histogram() + facet_grid(~type)

Related

How to visualize similar resistance pattern in a plot using R

I have a large dataset in which I want to group similar resistance patterns together. A plot to visualize similarity of resistance pattern is needed.
dat <- read.table(text="Id Resistance.Pattern
A SSRRSSSSR
B SSSRSSSSR
C RRRRSSRRR
D SSSSSSSSS
E SSRSSSSSR
F SSSRRSSRR
G SSSSR
H SSSSSSRRR
I RRSSRRRSS", header=TRUE)
I would separate out the values into a wider dataframe and then make a heatmap and dendrogram to compare sillimanites in patterns:
library(tidyverse)
library(ggdendro)
recode_dat <- dat |>
mutate(pat = str_split(Resistance.Pattern, "")) |>
unnest_wider(pat, names_sep = "_") |>
select(starts_with("pat_")) |>
mutate(across(everything(), ~case_when(. == "S" ~ 1, . == "R" ~ 2, is.na(.) ~0)))
rownames(recode_dat) <- dat$Id
dendro <- as.dendrogram(hclust(d = dist(x = scale(recode_dat))))
dendro_plot <- ggdendrogram(data = dendro, rotate = TRUE)
heatmap_plot <- dat |>
mutate(pat = str_split(Resistance.Pattern, "")) |>
unnest_wider(pat, names_sep = "_") |>
pivot_longer(cols = starts_with("pat_"), names_to = "pattern_position") |>
mutate(Id = factor(Id, levels = dat$Id[order.dendrogram(dendro)])) |>
ggplot(aes(pattern_position, Id))+
geom_tile(aes(fill = value))+
scale_x_discrete(labels = \(x) sub(".*_(\\d+$)", "\\1", x))+
theme(legend.position = "top")
cowplot::plot_grid(heatmap_plot, dendro_plot,nrow = 1, align = "h", axis = "tb")
It sounds as though the second column of your data frame represents sensitivity (S) and resistance (R), presumably to antibiotics (though this is not clear in your question). That being the case, you are presumably looking for something like this:
library(tidyverse)
p <- strsplit(dat$Resistance.Pattern, "")
do.call(rbind, lapply(p, \(x) c(x, rep(NA, max(lengths(p)) - length(x))))) %>%
as.data.frame() %>%
cbind(Id = dat$Id) %>%
mutate(Id = factor(Id, rev(Id))) %>%
pivot_longer(V1:V9) %>%
ggplot(aes(name, Id, fill = value)) +
geom_tile(col = "white", size = 2) +
coord_equal() +
scale_fill_manual(values = c("#e02430", "#d8d848"),
labels = c("Resistant", "Sensitive"),
na.value = "gray95") +
scale_x_discrete(name = "Antibiotic", position = "top",
labels = 1:9) +
labs(fill = "Resistance", y = "ID") +
theme_minimal(base_size = 20) +
theme(text = element_text(color = "gray30"))
I'd separate the entries by character, convert the binary data to numeric and plot the matrix as a heatmap and show the character string as rownames.
Whether to use a row and/or column clustering depends on whats desired.
library(dplyr)
library(tidyr) # for unnest_wider
library(gplots) # for heatmap.2
mm <-
dat %>%
group_by(Resistance.Pattern) %>%
summarize(Id, Resistance.Pattern) %>%
mutate(binary = strsplit(Resistance.Pattern, "")) %>%
unnest_wider(binary, names_sep="") %>%
mutate(across(starts_with("binary"), ~ as.numeric(c(R = 1, S = 0)[.x])))
mm2 <- as.matrix(mm[, -c(1,2)]) |> unname() # the numeric part
rownames(mm2) <- apply(as.matrix(mm[,1:2]), 1, paste, collapse=" ")
heatmap.2(mm2, trace="none", Colv="none", dendrogram="row",
col=c("green", "darkgreen"), margins=c(10,10))

How to specify multiple xlims for facetted data in ggplot2 R?

The data is facetted by two variables (see graph). Each variable has a different range. I want to specify the range so that all plots in var1 and vae2 are bound by the min and max values of those variables. See sample code attached. I don't want to use setscales = "free" on facet_wrap.
var1 <- rnorm(100, 6, 2)
var2 <- rnorm(100,15,2)
spp.val <- rnorm(100,10,2)
spp <- rep(c("A","B","C","D"), 25)
df <- data.frame(var1, var2,spp, spp.val)
df <- gather(df,
key = "var",
value = "var.val",
var1,var2)
df$var <- as.factor(as.character(df$var))
df$spp <- as.factor(as.character(df$spp))
ggplot(aes(x = var.val, y = spp.val), data = df) +
geom_point() +
facet_grid(spp~var)
#I want the limits for each facet_grid to be set as follows
xlim(min(df[df$var == "var1",]), max(df[df$var == "var1",])
xlim(min(df[df$var == "var2",]), max(df[df$var == "var2",])
Is this what you want?
library(tidyverse)
tibble(
var1 = rnorm(100, 6, 2),
var2 = rnorm(100, 15, 2),
spp.val = rnorm(100, 10, 2),
spp = rep(c("A", "B", "C", "D"), 25)
) |>
pivot_longer(starts_with("var"), names_to = "var", values_to = "var.val") |>
mutate(across(c(spp, var), factor)) |>
ggplot(aes(var.val, spp.val)) +
geom_point() +
facet_grid(spp ~var, scales = "free_x")
Created on 2022-04-23 by the reprex package (v2.0.1)

Normalize data per row in R

How can I scale/normalize my data per row (Observations)? Something like [-1:1] like a z score?
I have seen previous post which involve normalization of the whole dataset like this https://stats.stackexchange.com/questions/178626/how-to-normalize-data-between-1-and-1
, but id like to normalise per row so they can be plotted in a same box plot as they all show same pattern across x-axis.
Obs <- c("A", "B", "C")
count1 <- c(100,15,3)
count2 <- c(250, 30, 5)
count3 <- c(290, 20, 8)
count4<- c(80,12, 2 )
df <- data.frame(Obs, count1, count2, count3, count4)
dff<- df %>% pivot_longer(cols = !Obs, names_to = 'count', values_to = 'Value')
ggplot(dff, aes(x = count, y = Value)) +
geom_jitter(alpha = 0.1, color = "tomato") +
geom_boxplot()
Based on the link you shared, you can use apply to use the corresponding function to rescale dataframe over [-1,1].
library(scales)
library(ggplot2)
library(tidyr)
Obs <- c("A", "B", "C")
count1 <- c(100,15,3)
count2 <- c(250, 30, 5)
count3 <- c(290, 20, 8)
count4<- c(80,12, 2 )
df <- data.frame(count1, count2, count3, count4)
df <- as.data.frame(t(apply(df, 1, function(x)(2*(x-min(x))/(max(x)-min(x)))- 1)))
df <- cbind(Obs, df)
dff<- df %>%
tidyr::pivot_longer(cols = !Obs, names_to = 'count', values_to = 'Value')
ggplot(dff, aes(x = count, y = Value)) +
geom_jitter(alpha = 0.1, color = "tomato") +
geom_boxplot()
Console output:
If you pivot it longer, you can group by your observations and scale:
df %>%
pivot_longer(cols = !Obs, names_to = 'count', values_to = 'Value') %>% group_by(Obs) %>%
mutate(z=as.numeric(scale(Value))) %>%
ggplot(aes(x=count,y=z))+geom_boxplot()
Or in base R, just do:
boxplot(t(scale(t(df[,-1]))))

Is it possible to use geom_table() along facet_grid()?

Recently I discovered the function geom_table(), from ggpmisc package, which allows you to put a table inside a plot. But I don't know how to put different tables into a grid plot.
I have this df and plot:
library(lubridate)
library(ggplot2)
library(ggpmisc)
Date <- c("2010-01-28", "2010-02-28", "2010-03-28",
"2010-04-28", "2010-05-28", "2010-06-28",
"2010-07-28", "2010-08-28", "2010-09-28",
"2010-10-28")
Date <- as_date(Date)
Country <- rep("Japan", 10)
A <- runif(10, min=30, max=90)
B <- runif(10, min = 1, max = 15)
df <- data.frame(Date, Country, A, B)
df %>% pivot_longer(-c(Date, Country)) %>%
ggplot(aes(x=Date,y=value,group=1,color=Country))+
geom_line(size = 0.9) +
facet_grid(name~Country, scales = "free", switch = "y")
I also have these two tables, tableA and tableB:
Time <- c("Today", "Yesterday", "One week ago")
Value_A <- 10:12
Value_B <- 1:3
tableA <- data.frame(Time, Value_A)
tableB <- data.frame(Time, Value_B)
How I put tableA in the top graph and tableB in the bottom graph?
I appreciate it if someone can help :)
You need to create a little data frame that hosts your tableA and tableB in a list column:
d <- tibble(x = c(0.95, 0.95), y = c(0.95, 0.95),
name = c("A", "B"), tb = list(tableA, tableB))
df %>% pivot_longer(-c(Date, Country)) %>%
ggplot(aes(x=Date,y=value,group=1,color=Country))+
geom_line(size = 0.9) +
geom_table_npc(data = d, aes(npcx = x, npcy = y, label = tb)) +
facet_grid(name~Country, scales = "free", switch = "y")

Mapping function by groups in ggplot

The same regression model has been estimated on several groups using dplyr::group_by() and broom::tidy(). The estimates should be used to plot the regression function for each group in ggplot.
The following code works for base r curve().
library(tidyverse)
my_tbl <- tibble::tribble(
~Col_1, ~Col_2, ~Col_3,
"A", "(Intercept)", 30,
"A", "x", 10,
"A", "x2", -2,
"B", "(Intercept)", 40,
"B", "x", 20,
"B", "x2", -1
)
my_tbl %>%
split(.$Col_1) %>%
map( ~curve(.$Col_3[1] + .$Col_3[2] * x + .$Col_3[3] * x^2,
1,
30,
main = paste(.$Col_1[1]),
ylab = "y"))
The stat_function() is not able to find the parameter values in my_tbl.
my_tbl %>%
nest(-Col_1) %>%
mutate(plot = map(data, ~ggplot(data = data.frame(x = c(1, 30)),
mapping = aes(x = x)) +
stat_function()))
Is there any particular reason why you went with nest instead of split (as per the base R approach)? Because that works fine with ggplot, and the map logic matches that of the base R solution.
With nest, since the Col_1 column isn't part of the nested data, I used map2 instead of map in order to pass both data & Col_1 to ggplot.
result1 <- my_tbl %>%
split(.$Col_1) %>%
map(~ ggplot(data.frame(x = seq(0, 30)), aes(x)) +
ggtitle(.$Col_1[1]) +
stat_function(fun = function(x) .x$Col_3[1] + .x$Col_3[2] * x + .x$Col_3[3] * x^2))
result2 <- my_tbl %>%
nest(-Col_1) %>%
mutate(plot = map2(data, Col_1,
~ ggplot(data.frame(x = seq(0, 30)), aes(x)) +
ggtitle(.y) +
stat_function(fun = function(x) .x$Col_3[1] + .x$Col_3[2] * x + .x$Col_3[3] * x^2)))
# resulting plots are the same
cowplot::plot_grid(plotlist = result1)
cowplot::plot_grid(plotlist = result2$plot)

Resources