Related
I am trying to adapt the approach from (ggplot2 multiple sub groups of a bar chart) but something is not as it should be.
The code is:
library(grid)
MethodA= rep(c("ARIMA"), 6)
MethodB=rep(c("LSTM"), 6)
MethodC = rep(c("ARIMA-LSTM"),6)
MethodD=rep(c("SSA"),6)
Method=c(MethodA, MethodB, MethodC, MethodD)
Measure = rep(c("RMSE", "RMSE", "MAE", "MAE", "MAPE", "MAPE"), 4)
trtest=rep(c("train", "test"), 12)
Value=sample(x = 4000:7000, size = 24, replace = TRUE)
df2 <- data.frame(Method, Measure, trtest, Value)
dodge <- position_dodge(width = 0.9)
g1 <- ggplot(data = df, aes(x = interaction(Variety, Trt), y = yield, fill = factor(geno))) +
geom_bar(stat = "identity", position = position_dodge()) +
#geom_errorbar(aes(ymax = yield + SE, ymin = yield - SE), position = dodge, width = 0.2) +
coord_cartesian(ylim = c(0, 7500)) +
annotate("text", x = 1:6, y = - 10,
label = rep(c("Variety 1", "Variety 2", "Variety 3"), 2)) +
annotate("text", c(1.5, 3.5), y = - 20, label = c("Irrigated", "Dry")) +
theme_classic() +
theme(plot.margin = unit(c(1, 1, 4, 1), "lines"),
axis.title.x = element_blank(),
axis.text.x = element_blank())
# remove clipping of x axis labels
g2 <- ggplot_gtable(ggplot_build(g1))
g2$layout$clip[g2$layout$name == "panel"] <- "off"
grid.draw(g2)
The problem is aslo in a sequence that interaction function generates - the sequences are not by the order - ARIMA - RMSE, MAE, MAPE, then LSTM - RMSE, MAE, MAPE ...
I would appreciate for any help.
Best,
Nikola
Instead of using interaction, it might be a lot clearer if you use facets.
Note that your example is not reproducible (your sample data has different variable names from the ones you use in your plotting code, so I had to guess which you meant to substitute):
ggplot(data = df2, aes(x = Measure, y = Value, fill = trtest)) +
geom_bar(stat = "identity", position = position_dodge()) +
coord_cartesian(ylim = c(0, 7500)) +
facet_grid(.~Method, switch = 'x') +
theme_classic() +
theme(strip.placement = 'outside',
strip.background = element_blank(),
strip.text = element_text(face = 'bold', size = 16),
panel.spacing.x = unit(0, 'mm'),
panel.border = element_rect(fill = NA, color = 'gray'))
I'm trying to plot 2 normal distribution density plots for null and alternative hazard ratios of 1 and 0.65, respectively, to replicate an example (plot attached). Here's my code so far but it doesn't makes sense to me to have negative values for hazard ratios, but when I don't have negative values, the distributions are cut off. So I know I'm doing something wrong here. Thanks!
x <- seq(-2, 2, length.out = 100000)
df <- do.call(rbind,
list(data.frame(x=x, y=dnorm(x, mean = log(1), sd = sqrt(1/60 + 1/60)), id="H0, HR = 1"),
data.frame(x=x, y=dnorm(x, mean = log(0.65), sd = sqrt(1/60 + 1/60)), id="H1, HR = 0.65")))
vline <- 0.65
p1 <- ggplot(df, aes(x, y, group = id, color = id)) +
geom_line() +
geom_area(aes(fill = id),
data = ~ subset(., (id == "H1, HR = 0.65" & x > (vline)) | (id == "H0, HR = 1" & x < (vline))),
alpha = 0.3) +
geom_vline(xintercept = vline, linetype = "dashed") +
labs(x = "log(Hazard Ratio)", y = 'Density') + xlim(-2, 2) +
guides(fill = "none", color = guide_legend(override.aes = list(fill = "white"))) +
theme_classic() +
theme(legend.title=element_text(size=10), legend.position = c(0.8, 0.4),
legend.text = element_text(size = 10),
axis.line.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()
) +
scale_color_manual(name = '', values = c('red', 'blue')) +
scale_fill_manual(values = c('red', 'blue'))
The plot I'm trying to replicate
This gets reasonably close to the image that you have posted.
You should not use the log() of the means, but rather the means as is. Moreover if you use the normal distribution, you assume that parameters can take any value between -Inf and Inf, albeit with very small densities far from the mean. Therefore, you cannot expect all values to be positive. If you would like your values to be bounded by 0, then you should use a gamma distribution instead.
x <- seq(-2, 2, length.out = 1000)
df <- do.call(rbind,
list(data.frame(x=x, y=dnorm(x, mean = 1, sd = sqrt(1/50)), id="H0, HR = 1"),
data.frame(x=x, y=dnorm(x, mean = 0.65, sd = sqrt1/50)), id="H1, HR = 0.65")))
vline <- 0.65
ggplot(df, aes(x, y, group = id, color = id)) +
geom_line() +
geom_area(aes(fill = id),
data = ~ subset(., (id == "H1, HR = 0.65" & x > (vline)) | (id == "H0, HR = 1" & x < (vline))),
alpha = 0.3) +
geom_vline(xintercept = vline, linetype = "dashed") +
labs(x = "log(Hazard Ratio)", y = 'Density') + xlim(-2, 2) +
guides(fill = "none", color = guide_legend(override.aes = list(fill = "white"))) +
theme_classic() +
theme(legend.title=element_text(size=10), legend.position = c(0.8, 0.4),
legend.text = element_text(size = 10),
axis.line.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()
) +
scale_color_manual(name = '', values = c('red', 'blue')) +
scale_fill_manual(values = c('red', 'blue')) +
scale_x_continuous(breaks = seq(-0.3, 2.1, 0.3),
limits = c(-0.3, 2.1))
I am trying to create a plot like the one below. I'd like the order the points in each category in such a way that they form an s-shape. Is it possible to do this in ggplot?
Similar data available here
What I have so far:
somatic.variants <- read.delim("data/Lawrence.S2.txt", stringsAsFactors=T)
cancer_rates <- tapply(somatic.variants$logn_coding_mutations, somatic.variants$tumor_type, median)
cancer_rates <- cancer_rates[order(cancer_rates, decreasing=F)]
somatic.variants$tumor_type <- factor(somatic.variants$tumor_type, levels = names(cancer_rates))
library(ggplot2)
library(GGally)
ggplot(data = somatic.variants,
mapping = aes(x = tumor_type,
y = log10(n_coding_mutations))) +
geom_point(position = position_dodge2()) +
scale_x_discrete(position = "top") +
scale_y_continuous(labels = c(0,10,100,1000,10000), expand = c(0,0)) +
geom_stripped_cols() +
theme_bw() +
theme(axis.title.x = element_blank(),
axis.text.x = element_text(angle = 315, hjust = 1, size = 12),
panel.grid = element_blank()) +
labs(y = "Coding mutations count") +
stat_summary(fun = median,
geom="crossbar",
size = 0.25,
width = 0.9,
group = 1,
show.legend = FALSE,
color = "#FF0000")
This could be achieved by
grouping the data by x-axis categories
arranging by the y-axis value
which ensures that the points are plotted in ascending order of the values for each category.
somatic.variants <- read.delim("https://gist.githubusercontent.com/wudustan/57deecdaefa035c1ecabf930afde295a/raw/1594d51a1e3b52f674ff746caace3231fd31910a/Lawrence.S2.txt", stringsAsFactors=T)
cancer_rates <- tapply(somatic.variants$logn_coding_mutations, somatic.variants$tumor_type, median)
cancer_rates <- cancer_rates[order(cancer_rates, decreasing=F)]
somatic.variants$tumor_type <- factor(somatic.variants$tumor_type, levels = names(cancer_rates))
library(ggplot2)
library(GGally)
library(dplyr)
somatic.variants <- somatic.variants %>%
group_by(tumor_type) %>%
arrange(n_coding_mutations)
ggplot(data = somatic.variants,
mapping = aes(x = tumor_type,
y = log10(n_coding_mutations))) +
geom_point(position = position_dodge2(.9), size = .25) +
scale_x_discrete(position = "top") +
scale_y_continuous(labels = c(0,10,100,1000,10000), expand = c(0,0)) +
geom_stripped_cols() +
theme_bw() +
theme(axis.title.x = element_blank(),
axis.text.x = element_text(angle = 315, hjust = 1, size = 12),
panel.grid = element_blank()) +
labs(y = "Coding mutations count") +
stat_summary(fun = median,
geom="crossbar",
size = 0.25,
width = 0.9,
group = 1,
show.legend = FALSE,
color = "#FF0000")
#> Warning: Removed 29 rows containing non-finite values (stat_summary).
I am a novice coder and have been trying to understand the code posted here: Forest plot with table ggplot coding
I am hoping to use the script to display my own univariate analysis results for a project. I want the script to read the data from a csv file with the columns: "Predictor", "N", "rr", "rrlow", "rrhigh", and "arr". There are in total 19 variables ("Predictors") that I need to display. I have altered the script to read in the values into a single dataframe (rather than having a separate forestdf and fplottable like in the linked thread). However, I am getting multiple "replacement has x rows, data has y".
Here is the code in question:
###dataframe
library(ggplot2)
library(tidyr)
library(grid)
library(gridExtra)
library(forcats)
forestdf<- read.csv("UnivariateAnalysis2.csv",header=T)
forestdf$Predictor <- factor(forestdf$Predictor,levels = forestdf$Predictor)
levels(forestdf$Predictor)
forestdf$colour <- rep(c("white", "gray95"), length.out = 19)
p <- ggplot(forestdf, aes(x = rr, y = Predictor, xmin = rrlow, xmax = rrhigh)) +
geom_hline(aes(yintercept = 1, colour = colour), size = 7) +
geom_pointrange(shape = 22, fill = "black") +
geom_vline(xintercept = 1, linetype = 3) +
xlab("Variable") +
ylab("Hazard Ratio with 95% Confidence Interval") +
theme_classic() +
scale_colour_identity() +
scale_y_discrete(limits = rev(forestdf$Predictor)) +
scale_x_log10(limits = c(0.25, 4),
breaks = c(0.25, 0.5, 1, 2, 4),
labels = c("0.25", "0.5", "1", "2", "4"), expand = c(0,0)) +
theme(axis.text.y = element_blank(), axis.title.y = element_blank())
forestdf$Predictor <- factor(forestdf$Predictor, rev(levels(forestdf$Predictor)))
forestdf$colour <- rep(c("white", "gray95"), length.out=19)
data_table <- ggplot(data = forestdf, aes(y = Predictor)) +
geom_hline(aes(yintercept = 1, colour = colour), size = 7) +
geom_text(aes(x = 0, label = Predictor), hjust = 0) +
geom_text(aes(x = 5, label = N)) +
geom_text(aes(x = 7, label = arr), hjust = 1) +
scale_colour_identity() +
theme_void() +
theme(plot.margin = margin(5, 0, 35, 0))
grid.arrange(data_table,p, ncol = 2)
And the errors I have been receiving:
> ###dataframe
> library(ggplot2)
> library(tidyr)
> library(grid)
> library(gridExtra)
> library(forcats)
>
> forestdf<- read.csv("UnivariateAnalysis2.csv",header=T)
> forestdf$Predictor <- factor(forestdf$Predictor,levels = forestdf$Predictor)
Error in `$<-.data.frame`(`*tmp*`, Predictor, value = integer(0)) :
replacement has 0 rows, data has 19
> levels(forestdf$Predictor)
NULL
> forestdf$colour <- rep(c("white", "gray95"), length.out = 19)
> p <- ggplot(forestdf, aes(x = rr, y = Predictor, xmin = rrlow, xmax = rrhigh)) +
+ geom_hline(aes(yintercept = 1, colour = colour), size = 7) +
+ geom_pointrange(shape = 22, fill = "black") +
+ geom_vline(xintercept = 1, linetype = 3) +
+ xlab("Variable") +
+ ylab("Hazard Ratio with 95% Confidence Interval") +
+ theme_classic() +
+ scale_colour_identity() +
+ scale_y_discrete(limits = rev(forestdf$Predictor)) +
+ scale_x_log10(limits = c(0.25, 4),
+ breaks = c(0.25, 0.5, 1, 2, 4),
+ labels = c("0.25", "0.5", "1", "2", "4"), expand = c(0,0)) +
+ theme(axis.text.y = element_blank(), axis.title.y = element_blank())
>
> forestdf$Predictor <- factor(forestdf$Predictor, rev(levels(forestdf$Predictor)))
Error in `$<-.data.frame`(`*tmp*`, Predictor, value = integer(0)) :
replacement has 0 rows, data has 19
> forestdf$colour <- rep(c("white", "gray95"), length.out=19)
>
> data_table <- ggplot(data = forestdf, aes(y = Predictor)) +
+ geom_hline(aes(yintercept = 1, colour = colour), size = 7) +
+ geom_text(aes(x = 0, label = Predictor), hjust = 0) +
+ geom_text(aes(x = 5, label = N)) +
+ geom_text(aes(x = 7, label = arr), hjust = 1) +
+ scale_colour_identity() +
+ theme_void() +
+ theme(plot.margin = margin(5, 0, 35, 0))
>
> grid.arrange(data_table,p, ncol = 2)
Error in FUN(X[[i]], ...) : object 'Predictor' not found
I greatly appreciate any help or suggestions you may provide.
Thanks!
EDIT:
###dataframe
library(ggplot2)
library(tidyr)
library(grid)
library(gridExtra)
library(forcats)
forestdf<- read.csv("UnivariateAnalysis2.csv",header=TRUE)
names(forestdf)[1]<-"Predictor"
forestdf$Predictor <- factor(forestdf$Predictor)
forestdf$colour <- rep(c("white", "gray95"), length.out = length(unique(unlist(forestdf[c("Predictor")]))))
p <- ggplot(forestdf, aes(x = rr, y = Predictor, xmin = rrlow, xmax = rrhigh)) +
geom_hline(aes(yintercept = Predictor, colour = colour), size = 7) +
geom_pointrange(shape = 22, fill = "black") +
geom_vline(xintercept = 1, linetype = 3, colour = "red") +
xlab("Hazard Ratio") +
ylab("Hazard Ratio with 95% Confidence Interval") +
theme_classic() +
scale_colour_identity() +
scale_y_discrete(limits = rev(forestdf$Predictor)) +
scale_x_log10(limits = c(0.25, 4),
breaks = c(0.25, 0.5, 1, 2, 4),
labels = c("0.25", "0.5", "1", "2", "4"), expand = c(0,0)) +
theme(axis.text.y = element_blank(), axis.title.y = element_blank())
forestdf$Predictor <- factor(forestdf$Predictor, rev(levels(forestdf$Predictor)))
data_table <- ggplot(data = forestdf, aes(y = Predictor)) +
geom_hline(aes(yintercept = Predictor, colour = colour), size = 7) +
geom_text(aes(x = 0, label = Predictor), hjust = 0) +
geom_text(aes(x = 3, label = N)) +
geom_text(aes(x = 7, label = arr), hjust = 1) +
scale_colour_identity() +
theme_void() +
theme(plot.margin = margin(5, 0, 35, 0))
grid.arrange(data_table,p, ncol = 2)
I have made some changes as per IRTFM (thank you!) and it now produces a plot and table. I'm not sure why but it wasn't reading the csv correctly. My main issues now are the following:
The alternating grey and white bars do not alternate correctly on the table side
The header for the columns does not show up on the table
The table is not aligned with the forestplot (ie. top row's forest plot is not the correct forest plot for Albumin) Example Plot
EDIT2:
I was able to fix the alternating colours and alignment with the forestplot. My issue now is that the column titles I've made are now cut off: New Plot. Also, how would I go about only bolding the values with an asterisk?
###dataframe
library(ggplot2)
library(tidyr)
library(grid)
library(gridExtra)
library(forcats)
forestdf<- read.csv("UnivariateAnalysis2.csv",header=TRUE)
names(forestdf)[1]<-"Predictor"
forestdf$Predictor <- rev(factor(forestdf$Predictor))
forestdf$colour <- rep(c("white", "gray95"), length.out = length(unique(unlist(forestdf[c("Predictor")]))))
p <- ggplot(forestdf, aes(x = rr, y = Predictor, xmin = rrlow, xmax = rrhigh)) +
geom_hline(aes(yintercept = Predictor, colour = colour), size = 7) +
geom_pointrange(shape = 22, fill = "black") +
geom_vline(xintercept = 1, linetype = 3, colour = "red") +
xlab("Hazard Ratio") +
ylab("Hazard Ratio with 95% Confidence Interval") +
theme_classic() +
scale_colour_identity() +
scale_y_discrete(limits = forestdf$Predictor) +
scale_x_log10(limits = c(0.25, 4),
breaks = c(0.25, 0.5, 1, 2, 4),
labels = c("0.25", "0.5", "1", "2", "4"), expand = c(0,0)) +
theme(axis.text.y = element_blank(), axis.title.y = element_blank())
#forestdf$Predictor <- factor(forestdf$Predictor, rev(levels(forestdf$Predictor)))
data_table <- ggplot(data = forestdf, aes(y = rev(factor(Predictor)))) +
geom_hline(aes(yintercept = Predictor, colour = colour), size = 7) +
geom_text(aes(x = 0, label = Predictor), show.legend=TRUE, hjust = 0) +
geom_text(aes(x = 3, label = N)) +
geom_text(aes(x = 5.5, label = arr), hjust = 1) +
geom_text(aes(x = 7, label = PVALUE), hjust = 1) +
geom_text(aes(x = 0, y = 20, label = "Predictor"), hjust = 0) +
geom_text(aes(x = 3, y= 20, label = "N")) +
geom_text(aes(x = 5, y= 20, label = "95% CI"), hjust = 1) +
geom_text(aes(x = 7, y= 20, label = "P Value"), hjust = 1) +
scale_colour_identity() +
theme_void() +
theme(plot.margin = margin(5, 0, 35, 0))
grid.arrange(data_table,p, ncol = 2)
Thanks!
I am trying to shade the 0.025 and 0.975 quantiles on this graph that has three lines. I have tried geom_area, geom_ribbon, and I cannot highlight every quantile in every line.
Please note that "y" was ignored in this density graph.
example <-data.frame(source=c("Leaflitter","Leaflitter","Leaflitter","Leaflitter",
"Leaflitter","Leaflitter","Leaflitter","Leaflitter","Leaflitter","Leaflitter",
"Biofilm","Biofilm","Biofilm","Biofilm","Biofilm","Biofilm","Biofilm","Biofilm",
"Biofilm","Biofilm","Algae","Algae","Algae","Algae","Algae","Algae","Algae","Algae",
"Algae","Algae"), n=c(1,2,3,4,5,6,7,8,9,10,1,2,3,4,5,6,7,8,9,10,1,2,3,4,5,6,7,8,9,10),
density=c(0.554786934, 0.650578421, 0.039317168, 0.53537613,0.435081982,0.904056941,0.556284164,0.855319434,
0.399169622,0.570246304,0.076722032,0.257427999,0.172736928,0.447424473,0.520976948,0.011720494,0.311348655,
0.120698996,0.016336661,0.331741377, 0.368491034,0.09199358,0.787945904,0.017199397,0.04394107,
0.084222564,0.132367181,0.023981569,0.584493716,0.098012319))
example
One subgroup and quantiles
L <- filter(QPA_G_Feb17, source == "Leaflitter")
L <-as.data.frame(L)
Lq025 <- quantile(L$density, .025)
Lq975 <- quantile(L$density, .975)
ggplot(QPA_G_Feb17, aes(x=density, color=source)) +
labs(y="Density", x="Sorce contribution") +
geom_density(aes(linetype = source), size=1.2) +
scale_color_manual(values=c("#31a354", "#2c7fb8", "#d95f0e")) +
scale_linetype_manual(values = c("solid", "dotted", "longdash")) +
theme_classic()+
ylim(0, 5)+
theme(axis.text.y=element_text(angle=0, size=12, vjust=0.5, color="black")) +
theme(axis.text.x =element_text(angle=0, size=12, vjust=0.5, color="black")) +
theme(axis.title.x = element_text(color="black", size=14))+
theme(axis.title.y = element_text(color="black", size=14))
I would appreciate your help since I have looked in other forums, and there is information to highlight when there is only 1 line.
I think this data is a bit more representative of the data displayed in your plot:
set.seed(50)
QPA_G_Feb17 <- data.frame(density = c(rgamma(400, 2, 10),
rgamma(400, 2.25, 9),
rgamma(400, 5, 7)),
source = rep(c("Algae", "Biofilm", "Leaflitter"),
each = 400))
I find that when you are trying to do something complex or non-standard in ggplot, the best thing to do is calculate the data you wish to plot ahead of time. In this case, we can calculate the density curves and the cumulative densities, including their 0.025 and 0.975 quantiles, and putting them all in a data frame like this:
dens <- lapply(split(QPA_G_Feb17, QPA_G_Feb17$source),
function(x) density(x$density, from = 0, to = 1))
df <- do.call(rbind, mapply(function(x, y) {
data.frame(x = x$x, y = x$y, source = y)
}, dens, names(dens), SIMPLIFY = FALSE))
df <- df %>%
group_by(source) %>%
mutate(cdf = cumsum(y * mean(diff(x))),
lower = cdf < 0.025,
upper = cdf > 0.975)
Now it is easy to plot using geom_area:
ggplot(df, aes(x, y, color = source)) +
geom_area(data = df[df$lower,], aes(fill = source), alpha = 0.5,
position = "identity") +
geom_area(data = df[df$upper,], aes(fill = source), alpha = 0.5,
position = "identity") +
labs(y = "Density", x = "Source contribution") +
geom_line(aes(linetype = source), size = 1.2) +
scale_fill_manual(values = c("#31a354", "#2c7fb8", "#d95f0e")) +
scale_color_manual(values = c("#31a354", "#2c7fb8", "#d95f0e")) +
scale_linetype_manual(values = c("solid", "dotted", "longdash")) +
theme_classic() +
ylim(0, 5) +
xlim(0, 1) +
theme(axis.text.y = element_text(size = 12, vjust = 0.5),
axis.text.x = element_text(size = 12, vjust = 0.5),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14))
Here, the 2.5% and 97.5% extremeties of each density curve are shaded below each line. The exception is in the "Leaflitter` line, which clearly extends out of the 0-1 range that has been plotted in your example.