Plot data from list using ggplot2 - r

I have a list of 4 matrices of different lengths. I wish to plot them as a set of time series like in the example below, except that the x-axis is a running number (e.g. 1:75) and the y-axis is the matrix value (e.g. sin(1:75)).
(https://homepage.divms.uiowa.edu/~luke/classes/STAT4580/timeseries_files/figure-html/unnamed-chunk-39-2.png).
I know that ggplot2 does not handle lists, so any idea how to proceed?
Script:
# Four single-column matrices of different lengths, gathered into one list
# whose element names match the matrix variable names.
mat1 <- matrix(cos(1:50), ncol = 1)
mat2 <- matrix(sin(1:75), ncol = 1)
mat3 <- matrix(tan(1:50), ncol = 1)
mat4 <- matrix(1:100, ncol = 1)
myList <- list(mat1 = mat1, mat2 = mat2, mat3 = mat3, mat4 = mat4)

Something like this?
library(tidyverse)

# Stack the matrices into one long data frame (the list names become `id`),
# number the rows within each id, then draw one free-scaled panel per matrix.
myList %>%
  map_dfr(as.data.frame, .id = "id") %>%
  group_by(id) %>%
  mutate(n = seq_len(n())) %>%
  ungroup() %>%
  mutate(id = as.factor(id)) %>%
  ggplot(aes(x = n, y = V1, colour = id)) +
  geom_line() +
  facet_wrap(~ id, scales = "free")
Explanation: We first convert all matrices to data.frames and bind all rows together into a single data.frame including an id which derives from the list names; we can then number rows by id and then plot the row number vs. the single column.
Here is the same code "un-piped" and "uglified"
library(tidyverse)

# Step 1: list of matrices -> long data.frame, `id` taken from the list names
df.long <- map_dfr(myList, as.data.frame, .id = "id")

# Step 2: group the rows by id
df.long <- group_by(df.long, id)

# Step 3: running row number inside each group
df.long <- mutate(df.long, n = seq_len(n()))

# Step 4: drop the grouping again
df.long <- ungroup(df.long)

# Step 5: store id as a factor
df.long <- mutate(df.long, id = as.factor(id))

# Step 6: plot row number against the single value column, one panel per id
ggplot(df.long, aes(x = n, y = V1, colour = id)) +
  geom_line() +
  facet_wrap(~ id, scales = "free")
It's easy to see how %>% takes the left object and uses it as the first argument of the function on the right; so f(x) would become x %>% f().

library(tidyverse)

# Turn the named list into a long tibble (`name` = list name, `value` = matrix
# entry), number the rows within each matrix and plot one panel per matrix.
enframe(myList) %>%
  # Flatten each one-column matrix to a plain vector so that `value` unnests
  # into an ordinary numeric column.
  mutate(value = map(value, as.vector)) %>%
  unnest(cols = value) %>%
  group_by(name) %>%
  # rowid_to_column() ignores grouping and would number all rows 1..N across
  # the whole table; row_number() restarts at 1 for every matrix.
  mutate(rowid = row_number()) %>%
  ungroup() %>%
  ggplot(aes(rowid, value)) +
  geom_line() +
  facet_wrap(~name, scales = "free")

Related

Error in is.finite(x); need to add an additional line to a line chart (dplyr)

I have already attempted to search for this problem to no result. Have managed to reproduce the error below.
The problem: I'm trying to add a fourth line which represents the aggregate mean of all letters for each year. So far, I'm only able to generate the mean values for each letter. Everything runs fine until the last geom_line(), which is meant to draw the aggregate mean but raises the error. I've also tried inserting abline(). One other consideration is adding the "Mean" values under Letters so that they are generated anyway, but I believe there is a simpler method.
# Reproducible example of the failing plot: per-letter yearly means plus an
# attempted extra line for the mean across all letters.
library(tidyverse)
Letters <- rep(c("A","B","C"),20)
Years <- rep(c(1990:1999),6)
Numbers <- runif(60, min = 0, max = 20)
# One row per Letters/Years combination holding the mean of Numbers.
df <- data.frame(Letters, Years, Numbers) %>%
group_by(Letters,Years) %>%
summarise(Letter_Mean= mean(Numbers),.groups = 'drop')
# NOTE(review): df only has Letters, Years and Letter_Mean at this point, so
# mean(Numbers) below appears to pick up the global Numbers vector rather
# than a column of df -- confirm. Dropping Years afterwards leaves a
# one-column data frame (all_mean only).
meanallletters <- df %>%
group_by(Years) %>%
summarise(all_mean = mean(Numbers),.groups = 'drop') %>%
select(-Years)
# Base plot: one colour per letter.
lineplotsample <- df %>%
ggplot(aes(x=Years, y=Letter_Mean, color = Letters))
## this doesn't work
# (meanallletters is a whole data frame, not a column of the plotted data,
# and it is what gets mapped to y in the last geom_line())
lineplotsample + geom_line() + geom_point() + geom_line(aes(Years, y= meanallletters))
## this works, but missing the line representing aggregate mean
lineplotsample + geom_line() + geom_point()
I would summarize the data and then bind it to the bottom of the original data, like this:
library(tidyverse)

Letters <- rep(c("A", "B", "C"), 20)
Years <- rep(1990:1999, 6)
Numbers <- runif(60, min = 0, max = 20)

# Mean per letter and year
df <- data.frame(Letters, Years, Numbers) %>%
  group_by(Letters, Years) %>%
  summarise(Letter_Mean = mean(Numbers), .groups = "drop")

# Yearly mean across the letters, labelled "All" and appended below the
# per-letter rows; "All" goes last in the factor levels.
meanallletters <- df %>%
  group_by(Years) %>%
  summarise(
    Letters = "All",
    Letter_Mean = mean(Letter_Mean)
  ) %>%
  bind_rows(df, .) %>%
  ungroup() %>%
  mutate(Letters = factor(Letters, levels = c("A", "B", "C", "All")))

meanallletters %>%
  ggplot(aes(x = Years, y = Letter_Mean, colour = Letters)) +
  geom_line() +
  geom_point()
Created on 2023-02-12 by the reprex package (v2.0.1)
Here's a more general way of specifying the levels. It also deals with the situation where Letters is initially a factor.
library(tidyverse)

Letters <- rep(LETTERS, 20)
Years <- rep(1990:1999, 26)
Numbers <- runif(26 * 10, min = 0, max = 20)

# Mean per letter and year
df <- data.frame(Letters, Years, Numbers) %>%
  group_by(Letters, Years) %>%
  summarise(Letter_Mean = mean(Numbers), .groups = "drop")

# Yearly mean across the letters, labelled "All" and appended below the
# per-letter rows. Letters is converted to character first so the "All"
# label can be used; the final levels are the original ones followed by "All".
meanallletters <- df %>%
  mutate(Letters = as.character(Letters)) %>%
  group_by(Years) %>%
  summarise(
    Letters = "All",
    Letter_Mean = mean(Letter_Mean)
  ) %>%
  bind_rows(df, .) %>%
  ungroup() %>%
  mutate(Letters = factor(Letters, levels = c(levels(as.factor(df$Letters)), "All")))

meanallletters %>%
  ggplot(aes(x = Years, y = Letter_Mean, colour = Letters)) +
  geom_line() +
  geom_point()
Created on 2023-02-12 by the reprex package (v2.0.1)

How to visualize similar resistance pattern in a plot using R

I have a large dataset in which I want to group similar resistance patterns together. A plot to visualize similarity of resistance pattern is needed.
dat <- read.table(text="Id Resistance.Pattern
A SSRRSSSSR
B SSSRSSSSR
C RRRRSSRRR
D SSSSSSSSS
E SSRSSSSSR
F SSSRRSSRR
G SSSSR
H SSSSSSRRR
I RRSSRRRSS", header=TRUE)
I would separate out the values into a wider dataframe and then make a heatmap and dendrogram to compare similarities in patterns:
library(tidyverse)
library(ggdendro)

# Split each pattern into one column per position and recode to numbers
# (S = 1, R = 2, missing position = 0) so that distances can be computed.
# The result is converted to a plain data.frame because setting row names
# on a tibble is deprecated, and the row names carry the Ids into the
# clustering labels.
recode_dat <- dat |>
  mutate(pat = str_split(Resistance.Pattern, "")) |>
  unnest_wider(pat, names_sep = "_") |>
  select(starts_with("pat_")) |>
  mutate(across(everything(), ~ case_when(. == "S" ~ 1, . == "R" ~ 2, is.na(.) ~ 0))) |>
  as.data.frame()
rownames(recode_dat) <- dat$Id

# Hierarchical clustering on the scaled codes; drawn sideways so it can sit
# next to the heatmap rows.
dendro <- as.dendrogram(hclust(d = dist(x = scale(recode_dat))))
dendro_plot <- ggdendrogram(data = dendro, rotate = TRUE)

# Heatmap of the raw S/R values with rows in dendrogram order.
heatmap_plot <- dat |>
  mutate(pat = str_split(Resistance.Pattern, "")) |>
  unnest_wider(pat, names_sep = "_") |>
  pivot_longer(cols = starts_with("pat_"), names_to = "pattern_position") |>
  mutate(Id = factor(Id, levels = dat$Id[order.dendrogram(dendro)])) |>
  ggplot(aes(pattern_position, Id)) +
  geom_tile(aes(fill = value)) +
  # Show only the position number on the axis ("pat_3" -> "3").
  scale_x_discrete(labels = \(x) sub(".*_(\\d+$)", "\\1", x)) +
  theme(legend.position = "top")

cowplot::plot_grid(heatmap_plot, dendro_plot, nrow = 1, align = "h", axis = "tb")
It sounds as though the second column of your data frame represents sensitivity (S) and resistance (R), presumably to antibiotics (though this is not clear in your question). That being the case, you are presumably looking for something like this:
library(tidyverse)

# One character vector per isolate; n_pos is the length of the longest
# pattern, so the code is not tied to exactly nine positions.
p <- strsplit(dat$Resistance.Pattern, "")
n_pos <- max(lengths(p))

# Pad shorter patterns with NA, reshape to long format and draw the tiles.
do.call(rbind, lapply(p, \(x) c(x, rep(NA, n_pos - length(x))))) %>%
  as.data.frame() %>%
  cbind(Id = dat$Id) %>%
  # Reverse the levels so the first Id ends up at the top of the plot.
  mutate(Id = factor(Id, rev(Id))) %>%
  pivot_longer(-Id) %>%
  # Keep positions in numeric order (V1, V2, ..., V10) rather than
  # alphabetical order, which matters once there are 10 or more positions.
  mutate(name = factor(name, levels = paste0("V", seq_len(n_pos)))) %>%
  ggplot(aes(name, Id, fill = value)) +
  geom_tile(col = "white", size = 2) +
  coord_equal() +
  scale_fill_manual(values = c("#e02430", "#d8d848"),
                    labels = c("Resistant", "Sensitive"),
                    na.value = "gray95") +
  scale_x_discrete(name = "Antibiotic", position = "top",
                   labels = seq_len(n_pos)) +
  labs(fill = "Resistance", y = "ID") +
  theme_minimal(base_size = 20) +
  theme(text = element_text(color = "gray30"))
I'd separate the entries by character, convert the binary data to numeric and plot the matrix as a heatmap and show the character string as rownames.
Whether to use row and/or column clustering depends on what's desired.
library(dplyr)
library(tidyr) # for unnest_wider
library(gplots) # for heatmap.2

# Sort by pattern and keep pattern + Id as the first two columns, then
# split each pattern into one numeric column per position (R = 1, S = 0,
# missing position = NA). arrange() + select() replace the former grouped
# summarize(Id, Resistance.Pattern), which relied on summarise() returning
# one row per input row (deprecated in current dplyr) and left the result
# grouped.
mm <-
  dat %>%
  arrange(Resistance.Pattern) %>%
  select(Resistance.Pattern, Id) %>%
  mutate(binary = strsplit(Resistance.Pattern, "")) %>%
  unnest_wider(binary, names_sep = "") %>%
  mutate(across(starts_with("binary"), ~ as.numeric(c(R = 1, S = 0)[.x])))

mm2 <- as.matrix(mm[, -c(1, 2)]) |> unname() # the numeric part
# Row labels: "<pattern> <Id>"
rownames(mm2) <- apply(as.matrix(mm[, 1:2]), 1, paste, collapse = " ")

heatmap.2(mm2, trace = "none", Colv = "none", dendrogram = "row",
          col = c("green", "darkgreen"), margins = c(10, 10))

How to plot line graph of normalized differences from binned data with ggplot?

I have several sets of data that I calculate binned normalized differences for. The results I want to plot within a single line plot using ggplot. The lines representing different combinations of the paired differences are supposed to be distinguished by colors and line types.
I am stuck on taking the computed values from the bins (would be y-axis values now), and plotting these onto an x-axis.
Below is the code I use for importing the data and calculating the normalized differences.
# Load column 3 of each input file as a one-column data frame; the files
# have different numbers of rows.
# For a quick test without the files, random data can be substituted:
# dat1 <- data.frame(replicate(1,sample(25:50,10000,rep=TRUE)))
# dat2 <- data.frame(replicate(1,sample(25:50,9500,rep=TRUE)))
dat1 <- fread("/dir01/a/dat01.txt", header = FALSE, data.table = FALSE, select = c(3))
dat2 <- fread("/dir02/c/dat02.txt", header = FALSE, data.table = FALSE, select = c(3))

# Give the single column a meaningful name
colnames(dat1) <- "Dat1"
colnames(dat2) <- "Dat2"

# Perhaps there is a better way to compute the following as all-in-one?
# The steps are kept separate on purpose.

# 1) Sum of the values falling into each bin
bin1 <- cut(dat1$Dat1, breaks = seq(25, 50, by = 2))
bin2 <- cut(dat2$Dat2, breaks = seq(25, 50, by = 2))
sum1 <- tapply(dat1$Dat1, bin1, sum)
sum2 <- tapply(dat2$Dat2, bin2, sum)

# 2) Grand total over all bins
sumt1 <- sum(sum1)
sumt2 <- sum(sum2)

# 3) Each bin as a fraction of the grand total
sumn1 <- lapply(sum1, function(s) s / sumt1)
sumn2 <- lapply(sum2, function(s) s / sumt2)

# 4) One-row data frames (one column per bin) so they can be subtracted
df_sumn1 <- data.frame(sumn1)
df_sumn2 <- data.frame(sumn2)

# 5) Difference between the two, expressed as a percentage
dbin <- (df_sumn1 - df_sumn2) * 100
How can I plot those results using ggplot() and geom_line()?
I want
dbin values on the x-axis ranging from 25-50
different colors and line types for the lines
Here is what I tried:
p1 <- ggplot(dbin, aes(x = ?, color=Data, linetype=Data)) +
geom_line() +
scale_linetype_manual(values=c("solid")) +
scale_x_continuous(limits = c(25, 50)) +
scale_color_manual(values = c("#000000"))
dput(dbin) outputs:
structure(list(X.25.27. = -0.0729132928804117, X.27.29. = -0.119044772581772,
X.29.31. = 0.316016473225017, X.31.33. = -0.292812782147632,
X.33.35. = 0.0776336591308158, X.35.37. = 0.0205584754637611,
X.37.39. = -0.300768421159599, X.39.41. = -0.403235174844081,
X.41.43. = 0.392510458816457, X.43.45. = 0.686758883448307,
X.45.47. = -0.25387105113263, X.47.49. = -0.0508324553382303), class = "data.frame", row.names = c(NA,
-1L))
Edit
The final piece of code that works, using only the dbin and plots multiple dbins:
# Four example samples of the values 25..50 with different sampling weights.
dat1 <- data.frame(a = replicate(1, sample(25:50, 10000, replace = TRUE, prob = 25:0 / 100)))
dat2 <- data.frame(a = replicate(1, sample(25:50, 9500, replace = TRUE, prob = 0:25 / 100)))
dat3 <- data.frame(a = replicate(1, sample(25:50, 9500, replace = TRUE, prob = 12:37 / 100)))
dat4 <- data.frame(a = replicate(1, sample(25:50, 9500, replace = TRUE, prob = 37:12 / 100)))

# For a data frame with a numeric column `a`: sum `a` within 2-unit bins and
# return each bin's share of the total (columns `bin` and `sum`).
# Values that fall outside the bins (bin is NA) are dropped before
# normalising.
calc_bin_props <- function(data) {
  as_tibble(data) %>%
    mutate(bin = cut(a, breaks = seq(25, 50, by = 2))) %>%
    group_by(bin) %>%
    summarise(sum = sum(a), .groups = "drop") %>%
    filter(!is.na(bin)) %>%
    mutate(sum = sum / sum(sum))
}

# Difference between dat1 and dat2 per bin, as a percentage.
diff_data <-
  full_join(
    calc_bin_props(data = dat1),
    calc_bin_props(dat2),
    by = "bin") %>%
  # "(25,27]" -> "", "25", "27", "": keep the lower bound as `bin`
  separate(bin, c("trsh", "bin", "trshb", "trshc")) %>%
  # The parentheses must wrap the subtraction; `sum.x - sum.y * 100` would
  # multiply only sum.y by 100.
  mutate(dbinA = (sum.x - sum.y) * 100) %>%
  select(-starts_with("trsh"))

# Same for dat3 versus dat4.
diff_data2 <-
  full_join(
    calc_bin_props(data = dat3),
    calc_bin_props(dat4),
    by = "bin") %>%
  separate(bin, c("trsh", "bin", "trshb", "trshc")) %>%
  mutate(dbinB = (sum.x - sum.y) * 100) %>%
  select(-starts_with("trsh"))

# Combine two differences, and remove sum.x and sum.y
full_data <- cbind(diff_data, diff_data2[, 4])
full_data <- full_data[, -c(2:3)]

# Melt the data to plot more than 1 variable on a plot
# NOTE(review): melt() is not defined in this snippet; presumably it comes
# from reshape2 or data.table loaded elsewhere -- confirm.
m <- melt(full_data, id.vars = "bin")

theme_update(plot.title = element_text(hjust = 0.5))
ggplot(m, aes(as.numeric(bin), value, col = variable, linetype = variable)) +
  geom_line() +
  scale_linetype_manual(values = c("solid", "longdash")) +
  scale_color_manual(values = c("black", "black"))
# NOTE(review): assumes a graphics device was opened before this snippet.
dev.off()
library(tidyverse)
Creating example data as shown in the question, but adding different probabilities to the two sample() calls to create a visible difference
between the two sets of randomized data.
# Two random samples of 25..50 with opposite sampling weights, each stored
# as a one-column tibble with column `a`.
dat1 <- tibble(a = sample(25:50, 10000, replace = TRUE, prob = 25:0 / 100))
dat2 <- tibble(a = sample(25:50, 9500, replace = TRUE, prob = 0:25 / 100))
Using dplyr we can handle this within data.frames (tibbles) without
the need to switch to other datatypes.
Let’s define a function that can be applied to both datasets to get
the preprocessing done.
We use base::cut() to create
a new column that pairs each value with its bin. We then group the data
by bin, calculate the sum for each bin and finally divide the bin sums
by the total sum.
# Bin column `a` of `data` into 2-unit intervals (labelled by their lower
# bound), sum `a` per bin, drop values outside the bins, and return each
# bin's share of the overall sum in column `sum`.
calc_bin_props <- function(data) {
  binned <- mutate(
    as_tibble(data),
    bin = cut(a, breaks = seq(25, 50, by = 2), labels = seq(25, 48, by = 2))
  )
  bin_sums <- summarise(group_by(binned, bin), sum = sum(a), .groups = "drop")
  bin_sums <- filter(bin_sums, !is.na(bin))
  mutate(bin_sums, sum = sum / sum(sum))
}
Now we call calc_bin_props() on both datasets and join them by bin.
This gives us a dataframe with the columns bin, sum.x and sum.y.
The latter two correspond to the bin sums derived from dat1 and
dat2. With the mutate() line we calculate the differences between the
two columns.
# Join the per-bin proportions of both datasets (sum.x from dat1, sum.y
# from dat2), add their difference as `dbin`, and turn the bin labels into
# numbers. The former select(-starts_with("trsh")) step was removed: no
# column with that prefix is created here, so it did nothing.
diff_data <-
  full_join(
    calc_bin_props(data = dat1),
    calc_bin_props(dat2),
    by = "bin") %>%
  mutate(dbin = (sum.x - sum.y),
         bin = as.numeric(as.character(bin)))
Before we feed the data into ggplot() we convert it to the long
format using pivot_longer() this allows us to instruct ggplot() to
plot the results for sum.x, sum.y and dbin as separate lines.
# Long format: one row per bin and series (dbin, sum.x, sum.y), then one
# line per series distinguished by colour and line type.
diff_long <- pivot_longer(diff_data, -bin)

ggplot(diff_long, aes(x = as.numeric(bin), y = value, colour = name, linetype = name)) +
  geom_line() +
  scale_linetype_manual(values = c("longdash", "solid", "solid")) +
  scale_color_manual(values = c("black", "purple", "green"))

Use scale_x_continuous with labeller function that also takes a data frame as an argument as well as default breaks

Here's a code block:
# scale the log of price per group (cut)
my_diamonds <- diamonds %>%
mutate(log_price = log(price)) %>%
group_by(cut) %>%
mutate(scaled_log_price = scale(log_price) %>% as.numeric) %>% # scale within each group as opposed to overall
nest() %>%
mutate(mean_log_price = map_dbl(data, ~ .x$log_price %>% mean)) %>%
mutate(sd_log_price = map_dbl(data, ~ .x$log_price %>% sd)) %>%
unnest %>%
select(cut, price, price_scaled:sd_log_price) %>%
ungroup
# for each cut, find the back transformed actual values (exp) of each unit of zscore between -3:3
# For every whole z-score from -3 to 3, add a column with the
# back-transformed (exp) price at that z-score for the row's cut. Columns
# are named mean_less_<k>z for negative z and mean_plus_<k>z otherwise.
for (i in -3:3) {
  my_diamonds <- mutate(
    my_diamonds,
    !!paste0("mean_", if (i < 0) "less_" else "plus_", abs(i), "z") :=
      exp(mean_log_price + (i * sd_log_price))
  )
}
# One data frame per cut, in the order of the levels of `cut`.
my_diamonds_split <- my_diamonds %>% group_split(cut)
# Take the names from the same grouping (factor-level order) and convert to
# character afterwards. Converting `cut` to character before grouping would
# sort the keys alphabetically and attach the wrong name to some of the
# data frames.
split_names <- my_diamonds %>%
  group_by(cut) %>%
  group_keys() %>%
  pull(cut) %>%
  as.character()
names(my_diamonds_split) <- split_names
I now have a variable my_diamonds_split that is a list of data frames. I would like to loop over these data frames and each time create a new ggplot.
I can use a custom labeller function with a single df, but I don't know how to do this within a loop:
labeller <- function(x) {
paste0(x,"\n", scales::dollar(sd(ex_df$price) * x + mean(ex_df$price)))
}
ex_df <- my_diamonds_split$Ideal
ex_df %>%
ggplot(aes(x = scaled_log_price)) +
geom_density() +
scale_x_continuous(label = labeller, limits = c(-3, 3))
This creates a plot for the 'Ideal' cut of diamonds. I also get two data points on the x axis, the zscore values at -2, 0 and 2 as well as the raw dollar values of 3.8K, 3.9K and 11.8K.
When I define the labeller function, I must specify the df to scale with. I tried using the dot in place of ex_df, hoping that ggplot would pick up the data frame being piped in on each iteration:
labeller <- function(x) {
paste0(x,"\n", scales::dollar(sd(.$price) * x + mean(.$price)))
}
ex_df <- my_diamonds_split$Ideal
ex_df %>%
ggplot(aes(x = scaled_log_price)) +
geom_density() +
scale_x_continuous(label = labeller, limits = c(-3, 3))
Returns:
Error in is.data.frame(x) : object '.' not found
I then tried writing the function to accept an argument for the df to scale with:
labeller <- function(x, df) {
paste0(x,"\n", scales::dollar(sd(df$price) * x + mean(df$price)))
}
ex_df <- my_diamonds_split$Ideal
ex_df %>%
ggplot(aes(x = scaled_log_price)) +
geom_density() +
scale_x_continuous(label = labeller(df = ex_df), limits = c(-3, 3)) # because when it comes to running in real life, I will try something like labeller(df = my_diamonds_split[[i]])
Error in paste0(x, "\n", scales::dollar(sd(df$price) * x + mean(df$price))) :
argument "x" is missing, with no default
Bearing in mind that the scaling must be done per iteration, how could I loop over my_diamonds_split, and on each iteration generate a ggplot per above?
labeller <- function(x) {
# how can I make df variable
paste0(x,"\n", scales::dollar(sd(df$price) * x + mean(df$price)))
}
for (i in split_names) {
my_diamonds_split[[i]] %>%
ggplot(aes(x = scaled_log_price)) +
geom_density() +
scale_x_continuous(label = labeller, # <--- here, labeller must be defined with df$price except that will difer on each iteration
limits = c(-3, 3))
}
There's a hacky way to get this result in facets. Basically, after converting to z scores, you add different amounts (say, multiples of 1000) to each group's z scores. Then you set all the breaks to this collection of points and label them with pre-calculated labels.
library(ggplot2)
library(dplyr)

# Axis labels for one cut: the z-scores -3..3, each followed on a new line
# by the dollar price at that z-score for this cut.
f <- function(x) {
  prices <- diamonds$price[diamonds$cut == x]
  z_steps <- seq(-3, 3)
  dollars <- scales::dollar(round(mean(prices) + z_steps * sd(prices)))
  paste(z_steps, dollars, sep = "\n")
}

# Labels for all cuts, concatenated in level order (7 per cut).
breaks <- unlist(lapply(levels(diamonds$cut), f), use.names = FALSE)

diamonds %>%
  group_by(cut) %>%
  # Shift each cut's z-scores into its own range (1000s, 2000s, ...) so a
  # single axis can carry separate breaks and labels per facet.
  mutate(z = scale(price) + 3 + 1000 * as.numeric(cut)) %>%
  ggplot(aes(z)) +
  # Invisible points that extend the x range of each panel to the left
  geom_point(aes(x = z - 2, y = 1), alpha = 0) +
  geom_density() +
  scale_x_continuous(
    breaks = as.vector(outer(0:6, 1:5 * 1000, "+")),
    labels = breaks
  ) +
  facet_wrap(vars(cut), scales = "free_x") +
  theme(
    text = element_text(size = 16),
    axis.text.x = element_text(size = 6)
  )
You would have to increase the plot size to make the dollar values more visible of course.
Created on 2020-08-04 by the reprex package (v0.3.0)

How can we wrangle data to obtain the ratio/proportion chart shown

Goal is to produce a visualization indicating ratio.
How can we produce such a ratio chart (highlighted) in R?
library(tidyverse)
# Dataset creation
df <- data.frame(cls = c(rep("A",4),rep("B",4)),
grd = c("A1",rep("A2",3),rep(c("B1","B2"), 2)),
typ = c(rep("m",2),rep("o",2),"m","n",rep("p",2)),
pnts = c(rep(1:4,2)))
df
#### Data wrangling
df1 <- df %>%
group_by(cls) %>%
summarise(cls_pct = sum(pnts))
df1
df2 <- df %>%
group_by(cls,grd) %>%
summarize(grd_pct = sum(pnts))
df2
df3 <- df %>%
group_by(cls,grd,typ) %>%
summarise(typ_pct = sum(pnts))
df3
#### Attempt to combine all df1,df2,df3
# but mutate and summarise are mixing up leading to wrong results
df3 %>%
group_by(cls,grd) %>%
mutate(grd_pct = sum(typ_pct)) %>%
group_by(cls) %>%
mutate(cls_pct = sum(grd_pct))
Attempt to visualize all the ratios in 1 chart
data %>%
pivot_longer(cols = -c(cls:pnts),
names_to = "per_cat",
values_to = "percent") %>%
ggplot(aes(cls,percent, col = typ, fill = grd)) +
geom_bar(stat = "identity") +
coord_flip() +
theme_bw()
plot of the same.
EDIT -- added formula version with more useful output for visualization.
ORIG: At this point it may be worth making a function to reduce copying and pasting, but this may get you what you need:
library(tidyverse)
df %>%
group_by(cls) %>%
mutate(per1 = sum(pnts),
per1_pct = per1 / sum(per1)) %>%
group_by(cls, grd) %>%
mutate(per2 = sum(pnts),
per2_pct = per2 / sum(per2)) %>%
group_by(cls, grd, typ) %>%
mutate(per3 = sum(pnts),
per3_pct = per3 / sum(per3)) %>%
ungroup()
EDIT: Here's a general function to calculate the stats for a given grouping, making it easier to combine a few groupings together in long format better suited for visualization.
# Summarise `pnts` for one grouping of `df`.
#   df    : data frame with a `pnts` column and the grouping columns
#   level : value stored in the `level` column of the result
#   ...   : grouping columns, passed on to group_by()
# Returns one row per group with grp_ttl (sum of pnts in the group), ttl
# (sum of grp_ttl over the grouping left after summarize()), pct
# (grp_ttl / ttl) and level.
df_sum <- function(df, level, ...) {
  grouped <- group_by(df, ...)
  totals <- summarize(grouped, grp_ttl = sum(pnts))
  totals <- mutate(totals, ttl = sum(grp_ttl), pct = grp_ttl / ttl)
  mutate(ungroup(totals), level = {{ level }})
}
# Stack the three grouping levels and label each row with its most
# specific available key (typ, else grd, else cls).
df_summed <- bind_rows(
  df_sum(df, level = 1, cls),
  df_sum(df, level = 2, cls, grd),
  df_sum(df, level = 3, cls, grd, typ)
) %>%
  mutate(label = coalesce(as.character(typ), # first non-NA of the three
                          as.character(grd),
                          as.character(cls)))

# One stacked bar per level, each segment labelled "<key>\n<group>/<total>".
df_summed %>%
  ggplot(aes(x = level, y = grp_ttl)) +
  geom_col(color = "white") +
  geom_text(
    aes(label = paste0(label, "\n", grp_ttl, "/", ttl)),
    color = "white",
    position = position_stack(vjust = 0.5)
  ) +
  scale_x_reverse() + # level 1 at the top
  coord_flip() # horizontal bars

Resources