Related
Data
Here is the dput of my data:
heart <- structure(list(died = structure(c(2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L), levels = c("Survive",
"Died"), class = c("labelled", "factor"), label = "Death", format = "%8.0g", value.label.table = structure(0:1, names = c("Survive",
"Died"))), procedure = structure(c(2L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L,
1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L), levels = c("PTCA",
"CABG"), class = c("labelled", "factor"), label = "1=CABG; 0=PTCA", format = "%8.0g", value.label.table = structure(0:1, names = c("PTCA",
"CABG"))), age = structure(c(65L, 69L, 76L, 65L, 69L, 67L, 69L,
66L, 74L, 67L, 76L, 76L, 68L, 77L, 77L, 70L, 68L, 69L, 68L, 70L,
71L, 75L, 69L, 72L, 66L, 77L, 68L, 78L, 77L, 69L, 65L, 70L, 65L,
70L), label = "PATIENT AGE", class = c("labelled", "integer"), format = "%8.0g"),
gender = structure(c(2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L,
2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L), levels = c("Female",
"Male"), class = c("labelled", "factor"), label = "Gender", format = "%8.0g", value.label.table = structure(0:1, names = c("Female",
"Male"))), los = structure(c(10L, 7L, 7L, 8L, 1L, 7L, 2L,
9L, 3L, 1L, 6L, 14L, 4L, 13L, 10L, 4L, 2L, 2L, 10L, 2L, 6L,
12L, 4L, 2L, 2L, 14L, 3L, 2L, 5L, 9L, 3L, 3L, 3L, 2L), label = "LOS", class = c("labelled",
"integer"), format = "%8.0g"), type = structure(c(1L, 2L,
2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L,
2L, 1L), levels = c("Elective", "Emer/Urg"), class = c("labelled",
"factor"), label = "Severity", format = "%8.0g", value.label.table = structure(0:1, names = c("Elective",
"Emer/Urg")))), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-34L), stata.info = list(datalabel = "AZ 1991; CABG(106,107) & PTCA(112)",
version = 12L, time.stamp = "17 Feb 2015 12:45", val.labels = c("diedlb",
"pxllb", "", "sexllb", "", "typellb"), label.table = list(
pxllb = structure(0:1, names = c("PTCA", "CABG")), typellb = structure(0:1, names = c("Elective",
"Emer/Urg")), sexllb = structure(0:1, names = c("Female",
"Male")), diedlb = structure(0:1, names = c("Survive",
"Died")))))
Problem
So far I have fit multiple models like this using tensor splines to encode main and interaction effects:
#### Load Libraries ####
library(oddsratio)
library(mgcv)
library(splines)
library(tidyverse)
#### Fit GAM With Tensor Splines ####
fit <- gam(factor(died)
~ ti(age)
+ ti(los)
+ ti(age,los),
data = heart,
family = binomial)
However, when I try to plot the main effects:
plot_gam(model = fit,
pred = "age")
I get this odd warning:
Error in plot_df[[set_pred]]$x : $ operator is invalid for atomic vectors
I tried calling the function plot_gam explicitly, which look like this, but I'm a bit fuzzy on why this code is causing the issue. Calling whatever gam_to_df is also didn't seem to clarify the problem:
function (model = NULL, pred = NULL, col_line = "blue", ci_line_col = "black",
ci_line_type = "dashed", ci_fill = "grey", ci_alpha = 0.4,
ci_line_size = 0.8, sm_fun_size = 1.1, title = NULL, xlab = NULL,
ylab = NULL, limits_y = NULL, breaks_y = NULL)
{
df <- gam_to_df(model, pred)
if (is.null(xlab)) {
xlab <- df[[pred]]$xlab
}
if (is.null(ylab)) {
ylab <- df[[pred]]$ylab
}
plot_gam <- ggplot(df, aes_(~x, ~y)) + geom_line(colour = col_line,
size = sm_fun_size) + geom_line(aes_(~x, ~se_upr), linetype = ci_line_type,
colour = ci_line_col, size = ci_line_size) + geom_line(aes_(~x,
~se_lwr), linetype = ci_line_type, colour = ci_line_col,
size = ci_line_size) + geom_ribbon(aes_(x = ~x, ymin = ~se_lwr,
ymax = ~se_upr), fill = ci_fill, alpha = ci_alpha) +
ylab(ylab) + xlab(xlab)
if (!is.null(limits_y) & !is.null(breaks_y)) {
plot_gam <- plot_gam + scale_y_continuous(breaks = c(breaks_y),
limits = c(limits_y))
}
else if (!is.null(limits_y) & is.null(breaks_y)) {
plot_gam <- plot_gam + scale_y_continuous(limits = limits_y)
}
else if (is.null(limits_y) & !is.null(breaks_y)) {
plot_gam <- plot_gam + scale_y_continuous(breaks = c(breaks_y))
}
if (!is.null(title)) {
plot_gam <- plot_gam + ggtitle(title)
}
return(plot_gam)
}
I would greatly appreciate a resolution to this issue. The plot_gam function is super useful for what I'm trying to achieve.
I was inspecting the code. Error is caused by a fairly naïve implementation of oddsratio::gam_to_df function.
gam_to_df <- function (model = NULL, pred = NULL)
{
plot_df <- no_plot(model)
set_pred <- grep(paste0("\\b", pred, "\\b"), plot_df)
df <- data.frame(x = plot_df[[set_pred]]$x, se_upr = plot_df[[set_pred]]$fit +
plot_df[[set_pred]]$se, se_lwr = plot_df[[set_pred]]$fit -
plot_df[[set_pred]]$se, y = plot_df[[set_pred]]$fit)
return(df)
}
no_plot <- function (model = NULL)
{
png("temp.xyz")
plot_df <- plot(model, pages = 1)
dev.off()
file.remove("temp.xyz")
return(invisible(plot_df))
}
The function first create a plot and return it (it is a list) then extract the element that contains "age" somewhere in one of the elements, searching it via grep. Since two of three elements contains "age" it returns a vector of two elelments c(1,3). Thus plot_df[[set_pred]] instead of selecting the first element, selects the first of the list and then the third element inside of the list, which is a vector.
Hence the error "operator is invalid for atomic vectors"
Appears to be a bug.
EDIT
Filled an issue at https://github.com/pat-s/oddsratio/issues/54
I have a data set that uses two factor variables to create a bar graph in ggplot. A factor variable of 5 levels provides the bar graph distinctions, while the factor variable of two levels provides the mean/average of each condition. An example graph looks like this :
The code to produce this is
plot <- ggplot(data = testdf, aes(x = condition, fill = DV)) + geom_bar(position = "fill", na.rm = TRUE) + theme_bw()
I would like to add error bars onto each of the bars, using 95% confidence intervals.
I've tried converting the DV variable to a numeric 1 or 0 and then analyzing using summarySE() to get CIs for each bar, like so:
se_test <- summarySE(testdf, measurevar = "numericDV", groupvars = c("condition"))
. I then change the ggplot function to read:
plot <- ggplot(data = testdf, aes(x = condition, fill = DV)) +
geom_bar(position = "fill", na.rm = TRUE) + theme_bw() +
geom_errorbar(aes(ymin = (DV - se_test$ci), ymax = (DV - se_test$ci)))
This leads to an error for the - and + to not be meaningful for factors. So the data is still being considered as a factor. Is there a way to keep this graph, while implementing the CI error bars? I'd like for the average displayed by the bars to act as the middle of the confidence intervals, while keeping the aesthetics of the fill conditions.
Thanks in advance.
Sample Data W/ Numeric DV:
testdf <- structure(list(condition = structure(c(4L, 3L, 3L, 2L, 5L, 1L,
5L, 4L, 4L, 3L, 4L, 1L, 2L, 5L, 5L, 1L, 4L, 3L, 2L, 5L, 4L, 3L,
3L, 2L, 2L, 3L, 2L, 3L, 5L, 1L, 3L, 3L, 3L, 4L, 4L, 1L, 4L, 2L,
4L, 3L), .Label = c("0", "1", "2", "3", "4"), class = "factor"),
DV = structure(c(1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L,
1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L,
2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L
), .Label = c("No", "Yes"), class = "factor"), numericDV = c(0,
1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1,
0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1,
1)), row.names = c(2L, 4L, 9L, 12L, 16L, 17L, 22L, 24L, 30L,
31L, 35L, 40L, 41L, 42L, 43L, 45L, 46L, 47L, 49L, 50L, 52L, 57L,
64L, 66L, 67L, 73L, 76L, 77L, 78L, 79L, 84L, 86L, 90L, 100L,
103L, 105L, 107L, 108L, 112L, 113L), class = "data.frame")
ggplot2 let's you combine several data frames in one 'screen-space' using just the variable names and values - that is you can add a layer to your plot which has a different data source.
testdf %>%
ggplot(aes(x = condition)) +
geom_bar(aes(fill = DV), position = "fill", na.rm = TRUE) +
geom_errorbar(aes(
ymin = numericDV - ci,
ymax = numericDV + ci),
data = Rmisc::summarySE(testdf, measurevar = "numericDV", groupvars = "condition")) +
theme_bw()
I'm not sure if the result looks really nice with the bars exceeding the 0-1 interval, but numerically it looks like what you wanted. I moved the fill aesthetic to the geom_bar layer, as DV is missing in the summarySE output.
you can try
library(tidyverse)
se_test <- Rmisc::summarySE(testdf, measurevar = "numericDV", groupvars = c("condition"))
testdf %>%
count(condition, DV) %>%
ggplot(aes(condition, n)) +
geom_col(aes( fill =DV)) +
geom_errorbar(data=se_test, aes(y=N, ymin = N - ci, ymax = N + ci))
The part of dataset is like this:
Treatment Status gene1 gene2
1 Both Deceased 3.1934860 63.8697194
2 Both Deceased 0.0000000 11.3436426
3 Chemo Deceased 7.2186817 35.0621681
4 Both Deceased 7.2186817 23.7185255
5 Chemo Deceased 0.8049256 17.7083638
6 Chemo Censored 0.8250437 0.8250437
7 Chemo Censored 3.4136505 23.895533
8 Radio Censored 0.9428735 4.7143673
9 None Censored 3.3001750 10.7255686
I want to make compare each gene expression in "deceased" vs "censored" for each treatment. I only could make one gene expression for now, which is like this:
ggboxplot(df, x="Treatment", y= "gene1", fill = "Status")
Is there any way I can combine two genes' boxplots in one graph? Or any other better way to show these genes expression level difference between deceased vs censored in each group?
We may use boxplot() in base R, where we need to use reshape() first to get a long format.
boxplot(gene ~ Status + time + Treatment,
reshape(cbind(id=rownames(dat), dat), 4:5, sep="", direction="long"),
border=1:2)
However, this yields a quite crowded plot. We could do separate boxplots for e.g. each treatment group using sapply().
par(mfrow=c(2, 2))
sapply(unique(dat$Treatment), function(x) {
boxplot(value ~ Status + gene,
reshape(cbind(id=rownames(dat[dat$Treatment == x, ]), dat[dat$Treatment == x, ]),
4:5, sep="", direction="long", v.names="value", timevar="gene"),
at=c(1:2, 4:5),
main=x,
border=1:2)
})
Result
Data
dat <- structure(list(Treatment = structure(c(1L, 1L, 1L, 2L, 2L, 2L,
3L, 3L, 3L, 4L, 4L, 4L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L,
4L, 4L), .Label = c("Both", "Chemo", "None", "Radio"), class = "factor"),
Status = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("Censored", "Deceased"), class = "factor"),
gene1 = c(2.83185327992901, 5.21658677992433, 9.36719279899948,
1.77809421116808, 6.39453760571561, 3.08376117126782, -1.99524072673447,
0.380722587753265, -0.947148460332481, 1.73014054712629,
0.855919162512028, 0.501667581598007, 0.0638735169737497,
10.1712355237258, 5.34317645471502, -7.96626158445742, -0.0781613844302278,
5.59930916967042, -0.725717330717595, 0.492793009977729,
-0.546677404630108, 0.290301979542245, 2.83540215865274,
-1.25738031049913), gene2 = c(6.97361394841868, -6.86012827859373,
-0.193731972798249, -5.64669185350061, -20.6664537342379,
32.5477488386544, 12.6210452154023, 6.56845245925654, 13.5491140544121,
-2.9113829554538, 2.90958200298303, -6.56806056188421, 50.2577234864485,
17.0734922804668, 49.0769939658538, -2.0186433516603, 32.3823429023035,
17.7654319738005, 12.2884241568455, 21.7600566866782, 19.68978862329,
-12.6277420840716, 27.555120882401, 17.5164450232983)), row.names = c(3L,
23L, 13L, 44L, 34L, 50L, 90L, 67L, 62L, 100L, 95L, 96L, 132L,
144L, 124L, 174L, 171L, 168L, 196L, 205L, 207L, 233L, 229L, 212L
), class = "data.frame")
using the data from jay.sf you can try a 'ggplot'. I'm using the tidyverse, but this is not required.
library(tidyverse)
dat %>%
as_tibble() %>%
gather(gene, mRNA, -Treatment, -Status) %>%
ggplot(aes(Status, mRNA, fill =gene)) +
geom_boxplot() +
facet_wrap(~Treatment, ncol = 2, scales = "free_y")
and with facet_grid you can add significance levels automatically
dat %>%
as_tibble() %>%
gather(gene, mRNA, -Treatment, -Status) %>%
ggplot(aes(gene, mRNA, fill =gene)) +
geom_boxplot(show.legend = F) +
ggbeeswarm::geom_beeswarm(show.legend = F) +
ggsignif::geom_signif(comparisons = list(c("gene1", "gene2"))) +
facet_grid(Status~Treatment, scales = "free_y")
I can't figure out how to display labels centered on each dodged bar in a barplot in ggplot2.
I know that I can dodge the bars using position = "dodge" and I know that in order for the labels to show up centered on each bar I need to add position = position_dodge(width = 1) in the geom_text() or geom_label() command.
But for some reason it doesn't work (see Figure below). I also added my data and code.
df <- structure(list(Measure = structure(c(2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1988",
"2017"), class = "factor"), Province = structure(c(1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 10L), .Label = c("BC", "AB", "SK", "MB", "ON", "QC", "NB",
"PE", "NS", "NL"), class = "factor"), Value = c(363L, 61L, NA,
69L, NA, NA, 127L, 12L, 92L, 18L, 178L, 29L, 41L, 92L, 284L,
1019L, 267L, 27L, 77L, 22L)), .Names = c("Measure", "Province",
"Value"), row.names = 41:60, class = "data.frame")
ggplot(df, aes(x=Province, y=Value)) + geom_bar(aes(fill=Measure), position="dodge",
stat="identity") + geom_label(aes(label=Value), position = position_dodge(width=1))
I just realized (thanks to #aelwan answer) that the only thing I had to do is adding group=Measure in the aes() function, i.e.
ggplot(df, aes(x=Province, y=Value, group=Measure)) +
geom_bar(aes(fill=Measure),position="dodge", stat="identity") +
geom_label(aes(label=Value),position = position_dodge(width=1))
That gives:
Try this
ggplot(df, aes(x=Province, y=Value, group = Measure)) +
geom_col(aes(fill=Measure),
position ="dodge", width = 0.4)+
geom_text(aes(label= Value,
group = Measure
),vjust= 0, position = position_dodge(0.4) , color="black" )
This is a late answer, but the geom_text is not perfectly centered over the bars. One way to fix this is by also specifying the width of the bars position=position_dodge(width = 1).
geom_bar(aes(fill=Measure),position=position_dodge(width = 1), stat="identity")
I am trying to manually reorder the x-axis labels within each facet.
The data are as follows:
df = structure(list(block = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), .Label = c("1",
"2", "3", "4", "5"), class = "factor"), item = structure(c(14L,
15L, 28L, 29L, 30L, 31L, 32L, 15L, 16L, 17L, 18L, 19L, 20L, 21L,
15L, 22L, 23L, 24L, 25L, 26L, 27L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
1L, 8L, 9L, 10L, 11L, 12L, 13L), .Label = c("p00e00d00", "p00e00d11",
"p00e00d12", "p00e00d13", "p00e00d21", "p00e00d22", "p00e00d23",
"p00e11d00", "p00e12d00", "p00e13d00", "p00e21d00", "p00e22d00",
"p00e23d00", "p01e00d00", "p11e00d00", "p11e00d11", "p11e00d12",
"p11e00d13", "p11e00d21", "p11e00d22", "p11e00d23", "p11e11d00",
"p11e12d00", "p11e13d00", "p11e21d00", "p11e22d00", "p11e23d00",
"p12e00d00", "p13e00d00", "p14e00d00", "p21e00d00", "p22e00d00"
), class = "factor"), response = structure(c(2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("2",
"1"), class = "factor"), n = c(345L, 511L, 583L, 613L, 612L,
222L, 142L, 531L, 546L, 589L, 636L, 478L, 364L, 313L, 502L, 533L,
587L, 603L, 385L, 298L, 263L, 518L, 546L, 563L, 593L, 435L, 351L,
310L, 478L, 579L, 629L, 646L, 357L, 307L, 230L), freq = c(0.408284023668639,
0.604733727810651, 0.689940828402367, 0.725443786982249, 0.724260355029586,
0.262721893491124, 0.168047337278107, 0.628402366863905, 0.646153846153846,
0.697041420118343, 0.752662721893491, 0.565680473372781, 0.430769230769231,
0.370414201183432, 0.594082840236686, 0.630769230769231, 0.694674556213018,
0.713609467455621, 0.455621301775148, 0.352662721893491, 0.311242603550296,
0.61301775147929, 0.646153846153846, 0.666272189349112, 0.701775147928994,
0.514792899408284, 0.415384615384615, 0.366863905325444, 0.565680473372781,
0.685207100591716, 0.744378698224852, 0.764497041420118, 0.422485207100592,
0.363313609467456, 0.272189349112426)), class = c("tbl_df", "tbl",
"data.frame"), row.names = c(NA, -35L), .Names = c("block", "item",
"response", "n", "freq"))
There are five blocks, each block contains 7 items, and some items have the same names across blocks. I can therefore facet by block as follows:
df %>%
ggplot(aes(x = item, y = freq)) +
geom_bar(stat = "identity", position = "dodge", color = "black") +
facet_grid(.~block, scales = "free") +
coord_cartesian(ylim = c(0, 1), expand = F) + # need to add expanse = F to prevent zooming away
scale_y_continuous(labels = scales::percent) +
theme(axis.text.x = element_text(angle=45, hjust=1, vjust=1))
I also have vectors which states for each block the order that items should appear in. For example:
block_3_order = c("p11e13d00","p11e12d00", "p11e11d00", "p11e00d00", "p11e21d00", "p11e22d00","p11e23d00")
)
block_4_order = c("p00e00d13", "p00e00d12", "p00e00d11", "p00e00d00", "p00e00d21","p00e00d22","p00e00d23")
)
I tried to reorder the "item" factor, but to get the desired effect I would need to split the dataframe into subsets representing blocks. Otherwise I am having trouble grasping how you can integrate the ordering of factors with the ggplot treatment of item as a single factor across facets.
Any help is greatly appreciated.
To get a different custom axis order in each facet, you can create each "facet" as a separate plot and then lay them out together as if they were a single faceted plot.
library(tidyverse)
#devtools::install_github("baptiste/egg")
library(egg)
library(gridExtra)
library(grid)
theme_set(theme_bw())
First, create the custom orderings. The ones that are NULL will just be sorted alphabetically in the final plot.
b.order = list(b1 = NULL,
b2 = NULL,
b3 = c("p11e13d00","p11e12d00", "p11e11d00", "p11e00d00", "p11e21d00", "p11e22d00","p11e23d00"),
b4 = c("p00e00d13", "p00e00d12", "p00e00d11", "p00e00d00", "p00e00d21","p00e00d22","p00e00d23"),
b5 = NULL)
Create a list of plots, one for each block. We do this by splitting df by block. To get the custom ordering, we use factor to set the custom order based on the list b.order.
plist = map2(split(df, df$block), b.order,
~ .x %>% group_by(block) %>%
mutate(item = factor(item, levels=if(is.null(.y)) sort(unique(item)) else .y)) %>%
ggplot(aes(x = item, y = freq)) +
geom_bar(stat = "identity", position = "dodge", color = "black") +
facet_grid(.~block, scales = "free") +
coord_cartesian(ylim = c(0, 1), expand = F) + # need to add expanse = F to prevent zooming away
scale_y_continuous(labels = scales::percent) +
theme(axis.text.x = element_text(angle=45, hjust=1, vjust=1),
plot.margin=margin(b=-5)) +
labs(x=""))
Remove y-axis labels, title, and ticks from all but the left-most plot:
plist[2:length(plist)] = plist[2:length(plist)] %>%
map(~ .x + theme(axis.text.y=element_blank(),
axis.title.y=element_blank(),
axis.ticks.y=element_blank()))
Arrange the plots. We use ggarrange from the egg package in order to ensure that the plot panels all have the same horizontal width. We also need to add the Item label beneath the plot. However, ggarrange prints the plot to the output device, even inside arrangeGrob. So we create the object p, clear the device and then redraw the final plot.
p = arrangeGrob(ggarrange(plots=plist, ncol=length(plist)),
textGrob("Item"), heights=c(20,1))
grid.newpage()
grid.draw(p)