How to count and display factors on the x/y scale? - r

Problem
I am trying to count the data factor-wise and display it on the scale of an axis.
My closest solution is the following:
aes(x=(paste(A_REF,"(n=", length(A_REF), ")"))
n is the number displaying how many occurrences of the factor exist in the data field.
Edit: How do I achieve that the first and fifth factor of V43 show up? --> forgot to library("foreign")
Current State: Solved
My Code so far
# Load libraries & packages =================================
library("ggplot2")
library("scales")
library("dplyr")
library("foreign")
# Data setup =================================
spss_file_path <- "D:\\Programming\\Testing\\2017-03-15_data_import&ggplot2\\Beispieldatensatz(fiktiv).sav"
exampledata <- read.spss(spss_file_path, use.value.labels = TRUE,
to.data.frame = TRUE, reencode = TRUE)
# Column names must be character strings; bare symbols would be evaluated as
# (non-existent) objects and the assignment would fail.
names(exampledata) <- c("V101", "A_REF", "V43")
exampledata$V43 <- factor(exampledata$V43,
levels = c(1,2,3,4,5),
labels = c("1 Sehr zufrieden","2","3","4", "5 Sehr unzufrieden"))
exampledata$V43 <- factor(exampledata$V43, levels = rev(unique(levels(exampledata$V43))))
exampledata$A_REF <- factor(exampledata$A_REF, levels = rev(unique(levels(exampledata$A_REF))))
exampledata$V101 <- factor(exampledata$V101, levels = rev(unique(levels(exampledata$V101))))
labels <- exampledata %>%
filter(!is.na(V101), !is.na(V43)) %>%
count(A_REF) %>%
mutate(labels = paste(A_REF,"(n=", n, ")")) %>%
select(A_REF, labels)
plot_data <- exampledata %>%
filter(!is.na(V101), !is.na(V43)) %>%
left_join(labels, by = "A_REF")
# Plot =================================
ggplot(plot_data, aes(x = labels, fill = V43)) +
geom_bar(position = "fill") +
scale_y_continuous(labels = scales::percent, breaks = c(0, 0.2, 0.4, 0.6, 0.8, 1)) +
labs(y=NULL, x=NULL, fill=NULL) +
ggtitle(paste(attr(exampledata, "variable.labels")[77])) +
theme_classic() +
geom_text(stat="count",aes(label = scales::percent((..count..)/sum(..count..))), position = position_fill(vjust=0.5)) +
coord_flip()
Data
structure(list(exampledata.V101 = structure(c(2L, NA, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, NA, 2L, 2L, 2L, 1L, 2L, NA,
NA, NA, 1L, 1L, 2L, NA, 2L, 2L, 2L, NA, 2L, 2L, NA, NA, 1L, NA,
2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, NA, NA, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, NA, 1L, NA, 1L, NA,
1L, 2L, NA, NA, 2L, NA, 1L, 2L, 2L, NA, 2L, NA, 2L, 2L, 1L, 2L,
1L, 2L, 1L, 1L, 2L, 1L, NA, 2L, 2L, 2L, 2L, NA, 2L, 1L, 2L, 2L
), .Label = c("Weiblich", "Männlich"), class = "factor"), exampledata.A_REF = structure(c(18L,
18L, 18L, 18L, 18L, 17L, 18L, 18L, 18L, 18L, 18L, 18L, 16L, 18L,
18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 16L, 18L, 18L, 16L, 18L,
16L, 18L, 18L, 17L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L,
16L, 18L, 18L, 17L, 18L, 18L, 18L, 18L, 18L, 18L, 17L, 16L, 18L,
18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 17L, 18L, 18L,
16L, 18L, 16L, 18L, 18L, 16L, 16L, 18L, 18L, 18L, 18L, 18L, 18L,
18L, 17L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 16L, 18L,
16L, 16L, 18L, 18L, 18L, 17L, 16L, 18L), .Label = c("Zertifikat eines Aufbau- oder Ergänzungsstudiums",
"LA Berufliche Schulen", "LA Sonderschule", "LA Gymnasium", "LA Haupt- und Realschule",
"LA Grundschule", "Künstlerischer/musischer Abschluss", "Kirchlicher Abschluss",
"Staatsexamen (ohne Lehramt)", "Diplom Fachhochschule, Diplom I an Gesamthochschulen",
"Diplom Universität, Diplom II an Gesamthochschulen", "Sonstiges",
"Promotion", "Staatsexamen", "Magister", "Diplom", "Master",
"Bachelor"), class = "factor"), exampledata.V43 = structure(c(3L,
5L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 4L, 3L, 3L, 2L, NA, 4L, 5L, 5L,
4L, 4L, 4L, 4L, NA, 2L, 4L, 3L, 5L, 4L, 4L, 4L, NA, 4L, 4L, NA,
NA, 3L, 5L, 2L, 4L, 5L, 4L, 4L, 5L, 5L, 4L, NA, NA, 4L, NA, 3L,
4L, 5L, 5L, 2L, 4L, 4L, 3L, 4L, 4L, 4L, 3L, 5L, 4L, 5L, NA, 4L,
NA, 4L, NA, 4L, 5L, 4L, NA, 5L, NA, 4L, 4L, 4L, NA, 4L, NA, 5L,
4L, 4L, 4L, 4L, 4L, 3L, 3L, 4L, 2L, 4L, 4L, 4L, 3L, 4L, NA, 4L,
5L, 5L, 4L), .Label = c("5 Sehr unzufrieden", "4", "3", "2",
"1 Sehr zufrieden"), class = "factor")), .Names = c("exampledata.V101",
"exampledata.A_REF", "exampledata.V43"), row.names = c(NA, 100L
), class = "data.frame")

I think the easiest way is to compute the labels outside of ggplot.
Note that with your data, the 5th level of V43 doesn't show up.
library(ggplot2)
library(dplyr)
names(exampledata) <- c("V101", "A_REF", "V43")
I count A_REF and then apply your formula to compute the labels.
# Axis labels: one row per A_REF level among the rows where both V101 and V43
# are present, formatted as "<level> (n= <count> )".
labels <- exampledata %>%
  filter(!(is.na(V101) | is.na(V43))) %>%
  count(A_REF) %>%
  transmute(A_REF, labels = paste(A_REF, "(n=", n, ")"))
I then join the labels to the data
plot_data <- exampledata %>%
filter(!is.na(V101), !is.na(V43)) %>%
left_join(labels, by = "A_REF")
And finally, here is the plot. Note that the title doesn't show up as well.
ggplot(plot_data, aes(x = labels, fill = V43)) +
geom_bar(position = "fill") +
scale_y_continuous(labels = scales::percent, breaks = c(0, 0.2, 0.4, 0.6, 0.8, 1)) +
labs(y=NULL, x=NULL, fill=NULL) +
ggtitle(paste(attr(exampledata, "variable.labels")[77])) +
theme_classic() +
geom_text(stat="count",aes(label = scales::percent((..count..)/sum(..count..))), position = position_fill(vjust=0.5)) +
coord_flip()

Related

Change x axis labels to hours (time) on geom_tile()

Here is a geom_tile displaying hours and days of the week. How can it be made to display each hour (i.e. 00:00 through to 23:00) on the x axis?
library(tidyverse)
df %>%
ggplot(aes(hour, day, fill = value)) +
geom_tile(colour = "ivory")
Currently it displays every fifth hour:
I have tried a bunch of different things, and would prefer a 'best practice' way (i.e. without manually generating labels), but in case labels are needed, here's one way to produce them hour_labs <- 0:23 %>% { ifelse(nchar(.) == 1, paste0("0", .), .) } %>% paste0(., ":00")
Data for reproducible example
df <- structure(list(day = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L), .Label = c("Sunday",
"Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"
), class = c("ordered", "factor")), hour = c(0L, 2L, 3L, 5L,
6L, 7L, 8L, 10L, 11L, 12L, 13L, 18L, 21L, 22L, 23L, 0L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 20L, 21L, 22L,
23L, 0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L,
20L, 21L, 22L, 23L, 0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
11L, 13L, 14L, 20L, 21L, 22L, 23L, 0L, 1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L, 11L, 12L, 13L, 15L, 20L, 21L, 22L, 23L, 0L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 11L, 13L, 14L, 15L, 16L,
19L, 21L, 0L, 1L, 2L, 3L, 7L, 8L, 10L, 13L, 14L, 22L, 23L), value = c(1L,
1L, 1L, 2L, 1L, 3L, 1L, 1L, 2L, 1L, 3L, 1L, 2L, 13L, 13L, 24L,
39L, 21L, 17L, 25L, 22L, 27L, 28L, 19L, 6L, 2L, 2L, 1L, 2L, 2L,
7L, 23L, 38L, 18L, 26L, 21L, 20L, 31L, 40L, 35L, 22L, 5L, 3L,
2L, 7L, 4L, 3L, 3L, 3L, 17L, 13L, 23L, 24L, 19L, 31L, 13L, 35L,
50L, 22L, 13L, 7L, 2L, 1L, 1L, 1L, 1L, 3L, 14L, 17L, 33L, 32L,
32L, 25L, 29L, 27L, 38L, 26L, 11L, 8L, 4L, 5L, 5L, 3L, 1L, 1L,
3L, 14L, 21L, 24L, 22L, 25L, 26L, 23L, 58L, 36L, 26L, 6L, 3L,
1L, 5L, 3L, 1L, 1L, 3L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L,
1L, 1L)), row.names = c(NA, -116L), groups = structure(list(day = structure(1:7, .Label = c("Sunday",
"Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"
), class = c("ordered", "factor")), .rows = structure(list(1:15,
16:33, 34:51, 52:69, 70:88, 89:105, 106:116), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr"))), row.names = c(NA, 7L), class = c("tbl_df", "tbl",
"data.frame"), .drop = TRUE), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"))
Here's one way using sprintf to construct labels.
library(dplyr)
library(ggplot2)
# Tile plot that uses zero-padded "HH:00" strings as a discrete x axis.
ggplot(
  mutate(df, lab = sprintf('%02d:00', hour)),
  aes(x = lab, y = day, fill = value)
) +
  geom_tile(colour = "ivory") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
To complete the missing times apart from #Eric Watt's suggestion we can also use complete.
df %>%
mutate(lab = sprintf('%02d:00', hour)) %>%
tidyr::complete(lab = sprintf('%02d:00', 0:23)) %>%
ggplot() + aes(lab, day, fill = value) +
geom_tile(colour = "ivory") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
I would suggest making sure your data type is correctly representing your data. If your hour column is representing time in hours, then it should be a time based structure. For example:
df$hour <- as.POSIXct(as.character(df$hour), format = "%H", tz = "UTC")
Then you can tell ggplot that the x axis is a datetime variable using scale_x_datetime.
# Tile plot with a datetime x axis labelled as HH:MM.
# date_format() comes from the scales package; it is namespace-qualified so
# this snippet does not depend on scales having been attached.
ggplot(df, aes(hour, day, fill = value)) +
  geom_tile(colour = "ivory") +
  scale_x_datetime(labels = scales::date_format("%H:%M")) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
If you want a break for every hour, you can input that as breaks:
# Same tile plot, with an explicit axis break for each hour 0..23.
# date_format() comes from the scales package; it is namespace-qualified so
# this snippet does not depend on scales having been attached.
ggplot(df, aes(hour, day, fill = value)) +
  geom_tile(colour = "ivory") +
  scale_x_datetime(
    breaks = as.POSIXct(as.character(0:23), format = "%H", tz = "UTC"),
    labels = scales::date_format("%H:%M")
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
You can also use the scales package which has handy formatting options such as date_breaks:
library(scales)
ggplot(df, aes(hour, day, fill = value)) +
geom_tile(colour = "ivory") +
scale_x_datetime(breaks = date_breaks("1 hour"),
labels = date_format("%H:%M")) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

How to create histograms for each unique combination of levels from two factors?

I cannot figure out how to use a loop to plot one histogram for each unique combination of levels from TWO factors.
Here is my data: https://www.dropbox.com/sh/exsjhu23fnpwf4r/AABvitLBN1nRMpXcyYMVIOIDa?dl=0
# perhaps need to have factors
df$freq <- as.factor(df$freq)
df$time <- as.factor(df$time)
I learned how to use a loop to plot histograms for ONE factor levels:
# space for plots
windows(width=19, height=10)
par(las=1, cex.lab=0.75, cex.axis=0.6, bty="n", mgp=c(1, 0.6, 0),
oma=c(2, 4, 2, 0) + 0.1, mar=c(4, 0, 3, 3) + 0.1)
# 3 x 7 grid of plot regions numbered 1..21, filled row by row.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
a <- layout(matrix(1:21, nrow = 3, ncol = 7, byrow = TRUE))
layout.show(a)
# loop: draw one histogram of `thr` for each distinct value of `freq`
freq_values <- unique(df$freq)  # computed once instead of on every iteration
# seq_along() yields an empty sequence for empty input, whereas 1:length(x)
# would iterate over c(1, 0).
for (i in seq_along(freq_values)) {
  value <- subset(df, freq == freq_values[i])
  hist(value$thr, main = paste0("freq: ", freq_values[i]))
}
I tried variations of this loop for TWO factors but that unfortunately does not work:
for (i in 1:length(unique(df[c("freq", "time")]))) {
value <- subset(df, freq == unique (df$freq)[i] & time == unique(df$time)[i])
hist(value$thr, main=paste0("freq: ", unique(df$freq)[i]))
}
I would also like to learn how to label each histogram based on the levels of TWO factors (not just one)...
It's more convenient to use by here.
For the titles we prefer characters to factors.
df1[c("freq", "time")] <- lapply(df1[c("freq", "time")], as.character)
Then open windows,
windows(width=19, height=10)
par(las=1, cex.lab=0.75, cex.axis=0.6, bty="n", mgp=c(1, 0.6, 0),
oma=c(2, 4, 2, 0) + 0.1, mar=c(4, 0, 3, 3) + 0.1)
a <- layout(matrix(1:21, 3, 7))
layout.show(a)
and plot.
by(df1, df1[c("freq", "time")], function(x)
hist(x$thr, main=paste("freq:", paste(x[1, c(1, 3)], collapse=","))))
Result
Edit
To get the specific order we probably have to do some more stuff.
df1[c("freq", "time")] <- lapply(df1[c("freq", "time")], as.character)
windows(width=19, height=10)
par(las=1, cex.lab=0.75, cex.axis=0.6, bty="n", mgp=c(1, 0.6, 0),
oma=c(2, 4, 2, 0) + 0.1, mar=c(4, 0, 3, 3) + 0.1)
a <- layout(matrix(1:21, 3, 7, byrow=TRUE)) # with byrow
layout.show(a)
l <- split(df1, df1[c("freq", "time")])
m <- t(sapply(l, function(x) x[1, c(1, 3)])) # matrix of first rows of each subset
m[, 2] <- sub("m", "", m[, 2]) # use the values...
m <- apply(m, 1:2, as.numeric) # ... make numeric
Now we obtain the histograms within a lapply over the list ordered by m.
lapply(l[order(m[, 2], m[, 1])], function(x)
hist(x$thr, main=paste("freq:", paste(x[1, c(1, 3)], collapse=","))))
New Result
Data
df1 <- structure(list(freq = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L,
5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L,
7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L,
5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L,
7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L,
5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L,
7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L), .Label = c("4",
"8", "12.5", "16", "20", "25", "31.5"), class = "factor"), thr = c(60L,
25L, 20L, 15L, 15L, 30L, 35L, 60L, 25L, 10L, 15L, 15L, 30L, 35L,
55L, 30L, 15L, 15L, 10L, 25L, 40L, 50L, 25L, 15L, 10L, 15L, 20L,
40L, 50L, 30L, 10L, 15L, 15L, 20L, 25L, 50L, 25L, 10L, 10L, 10L,
20L, 25L, 45L, 20L, 10L, 10L, 10L, 20L, 25L, 45L, 15L, 10L, 10L,
10L, 20L, 30L, 60L, 30L, 10L, 10L, 10L, 15L, 30L, 50L, 25L, 10L,
10L, 10L, 20L, 30L, 45L, 25L, 15L, 10L, 15L, 30L, 35L, 50L, 25L,
15L, 10L, 15L, 25L, 35L, 60L, 25L, 10L, 10L, 15L, 20L, 30L, 60L,
25L, 5L, 5L, 10L, 20L, 30L, 45L, 20L, 5L, 10L, 10L, 20L, 30L,
45L, 20L, 10L, 10L, 10L, 20L, 30L, 60L, 30L, 15L, 10L, 15L, 25L,
30L, 55L, 25L, 10L, 10L, 10L, 20L, 30L, 55L, 35L, 10L, 10L, 10L,
20L, 30L, 60L, 35L, 15L, 10L, 10L, 15L, 25L, 50L, 30L, 10L, 10L,
10L, 20L, 25L, 55L, 25L, 10L, 10L, 15L, 25L, 25L, 65L, 30L, 10L,
10L, 15L, 20L, 30L, 60L, 30L, 15L, 15L, 15L, 15L, 30L, 55L, 35L,
15L, 15L, 15L, 25L, 35L, 55L, 35L, 15L, 15L, 15L, 25L, 35L, 60L,
35L, 15L, 15L, 15L, 25L, 35L, 60L, 30L, 10L, 10L, 15L, 25L, 35L,
55L, 30L, 15L, 10L, 10L, 25L, 30L, 50L, 25L, 10L, 10L, 10L, 20L,
30L, 55L, 30L, 10L, 10L, 15L, 20L, 30L, 55L, 30L, 10L, 15L, 20L,
25L, 35L, 55L, 25L, 15L, 15L, 15L, 25L, 40L, 50L, 20L, 10L, 10L,
20L, 30L, 40L, 45L, 25L, 10L, 10L, 10L, 20L, 30L, 50L, 25L, 10L,
10L, 10L, 20L, 25L, 55L, 20L, 10L, 10L, 15L, 25L, 35L, 50L, 20L,
10L, 10L, 15L, 25L, 30L, 45L, 20L, 15L, 10L, 10L, 20L, 30L, 50L,
20L, 15L, 15L, 15L, 20L, 30L, 60L, 35L, 15L, 10L, 15L, 25L, 30L,
60L, 35L, 15L, 15L, 15L, 30L, 35L, 55L, 25L, 10L, 15L, 15L, 25L,
35L, 50L, 30L, 10L, 15L, 15L, 25L, 35L, 55L, 25L, 20L, 15L, 15L,
25L, 30L, 55L, 25L, 15L, 15L, 15L, 30L, 35L), time = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L), .Label = c("3m", "6m", "9m"), class = "factor")), row.names = c(NA,
-322L), class = "data.frame")

plot group and category means with group_by

I am new to R and trying to figure out a way to plot means for individual samples as well as group means with ggplot.
I am following this article on R-bloggers (last paragraph):
https://www.r-bloggers.com/plotting-individual-observations-and-group-means-with-ggplot2/
This is my code:
gd <- meanplot1 %>%
group_by(treatment, value) %>%
summarise(measurement = mean(measurement))
ggplot(meanplot1, aes(x=value, y=measurement, color=treatment)) +
geom_line(aes(group=sample), alpha=0.3) +
geom_line(data=gd, size=3, alpha=0.9) +
theme_bw()
Whilst the sample means are being shown, the group means aren't. I get the error
geom_path: Each group consists of only one observation. Do you need
to adjust the group aesthetic?
Upon adding group=1, I get a weirdly mixed category mean, but not what I am looking for..
I scrolled through a lot of articles already, but couldn't find an answer - I would be so happy if somebody could help me out here!! :)
My data (meanplot1) is formatted like this:
treatment sample value measurement
1 control, control 1, initial, 20,
2 control, control 1, 26, NA,
3 control, control 1, 26', 28,
12 control, control 2, initial, 22,
13 control control 2, 26, NA,
14 control control 2, 26', 36,
15 control control 2, 28, 45,
67 stressed, stress 1, initial, 37,
68 stressed, stress 1, 26, NA,
69 stressed, stress 1, 26', 17,
78 stressed, stress 2, initial, 36,
79 stressed, stress 2, 26, NA,
80 stressed, stress 2, 26', 25,
I am hoping to see 6 lines, one mean for stress 1, stress 2, control 1 and control 2, and one mean for all treatment=control, and one for all treatment=stressed
output dput(gd):
structure(list(treatment = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("control", "stressed"), class = "factor"), value = structure(c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 11L), .Label = c("26", "26'", "28", "28'",
"30", "30'", "32", "32'", "34", "34'", "initial"), class = "factor"),
measurement = c(NA, 32.3333333333333, 39.5, 30.3333333333333,
31.8333333333333, 31.8333333333333, NA, 36, 34.6666666666667,
36, 24.6666666666667, NA, 25.3333333333333, 33.3333333333333,
32, 50.1666666666667, 39.1666666666667, NA, 33.5, 24.3333333333333,
27.3333333333333, 36)), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"), row.names = c(NA, -22L), vars = list(treatment), drop = TRUE, .Names = c("treatment",
"value", "measurement"))
output dput(meanplot1):
structure(list(treatment = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("control",
"stressed"), class = "factor"), sample = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 11L,
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 12L, 12L, 12L,
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L), .Label = c("control 1",
"control 2", "control 3", "control 4", "control 5", "control 6",
"stress 1", "stress 2", "stress 3", "stress 4", "stress 5", "stress 6"
), class = "factor"), value = structure(c(11L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L, 11L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
11L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 11L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 1L, 2L, 3L, 4L,
5L, 6L, 7L, 8L, 9L, 10L, 11L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 10L, 11L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L), .Label = c("26", "26'",
"28", "28'", "30", "30'", "32", "32'", "34", "34'", "initial"
), class = "factor"), measurement = c(20L, NA, 28L, 18L, 17L,
19L, 34L, NA, 23L, 29L, 27L, 22L, NA, 36L, 45L, 31L, 40L, 44L,
NA, 49L, 40L, 39L, 32L, NA, 35L, 57L, 30L, 37L, 29L, NA, 44L,
37L, 46L, 20L, NA, 39L, 27L, 30L, 40L, 25L, NA, 29L, 50L, 30L,
26L, NA, 28L, 45L, 47L, 27L, 35L, NA, 24L, 22L, 35L, 28L, NA,
28L, 45L, 27L, 28L, 24L, NA, 47L, 30L, 39L, 37L, NA, 17L, 29L,
29L, 31L, 29L, NA, 37L, 21L, 27L, 36L, NA, 25L, 41L, 51L, 66L,
50L, NA, 33L, 25L, 22L, 36L, NA, 33L, 45L, 26L, 72L, 59L, NA,
33L, 26L, 25L, 33L, NA, 21L, 33L, 25L, 29L, 21L, NA, 26L, 20L,
16L, 22L, NA, 30L, 27L, 28L, 57L, 41L, NA, 28L, 23L, 17L, 52L,
NA, 26L, 25L, 33L, 46L, 35L, NA, 44L, 31L, 57L)), .Names = c("treatment",
"sample", "value", "measurement"), class = "data.frame", row.names = c(NA,
-132L))
I suppose you are aiming to plot the treatment means.
By default, since you are using a categorical x-axis, the grouping is set to the interaction between x and color. You only want to group by treatment, however. So we'll add the correct grouping to the call.
ggplot(meanplot1, aes(x = value, y = measurement, color=treatment)) +
geom_line(aes(group=sample), alpha=0.3) +
geom_line(aes(group = treatment), gd, size=3, alpha=0.9) +
theme_bw()
Also note that
ggplot(meanplot1, aes(x=value, y=measurement, color=treatment)) +
geom_line(aes(group=sample), alpha=0.3) +
stat_summary(aes(group = treatment), fun.y = mean, geom = 'line', size=3, alpha=0.9) +
theme_bw()
Gives the same plot, without the interruption.

How to calculate percentages in a stacked barplot bar-wise?

Problem
The current percentages in the bars are calculated from the total amount of data. I want each stacked bar to add up to a full 100%. (Solved)
Also the percentages should be rounded to the nearest integer. (Solved)
Edit: Remove all percentages below or equal to 1. (Solved)
Edit2: Make sure no labels are overlapping.
I've been googling for a while now. It seems like there isn't a proper way to prevent labels overlapping.
Possible solutions I discovered:
Flip the plot
Add angle() to rotate the labels
"Manually" calculate each position
Make use of check_overlap = TRUE
Current State
My Code so far
# Load libraries & packages =================================
library("ggplot2")
library("scales")
library("dplyr")
library("foreign")
library("tidyverse")
library("forcats")
# Data setup =================================
spss_file_path <- "D:\\Programming\\Testing\\2017-03-15_data_import&ggplot2\\Beispieldatensatz(fiktiv).sav"
exampledata <- read.spss(spss_file_path, use.value.labels = TRUE,
to.data.frame = TRUE, reencode = TRUE)
exampledata$V43 <- factor(exampledata$V43,
levels = c(1,2,3,4,5),
labels = c("1 Sehr zufrieden","2","3","4", "5 Sehr unzufrieden"))
exampledata$V43 <- factor(exampledata$V43, levels = rev(unique(levels(exampledata$V43))))
exampledata$A_REF <- factor(exampledata$A_REF, levels = rev(unique(levels(exampledata$A_REF))))
exampledata$V101 <- factor(exampledata$V101, levels = rev(unique(levels(exampledata$V101))))
labels <- exampledata %>%
filter(!is.na(V101), !is.na(V43)) %>%
count(A_REF) %>%
mutate(labels = paste(A_REF,"(n=", n, ")")) %>%
select(A_REF, labels)
plot_data <- exampledata %>%
filter(!is.na(V101), !is.na(V43)) %>%
left_join(labels, by = "A_REF")
plot_data <- plot_data %>%
group_by(labels) %>%
summarize(`5 Sehr unzufrieden` = sum(ifelse(V43 == "5 Sehr unzufrieden", 1, 0)) / n(),
`4` = sum(ifelse(V43 == "4", 1, 0)) / n(),
`3` = sum(ifelse(V43 == "3", 1, 0)) / n(),
`2` = sum(ifelse(V43 == "2", 1, 0)) / n(),
`1 Sehr zufrieden` = sum(ifelse(V43 == "1 Sehr zufrieden", 1, 0)) / n()) %>%
gather(key = Rating, value = prop, -labels)
plot_data$labels <- factor(plot_data$labels)
plot_data$Rating <- factor(plot_data$Rating) %>% fct_rev()
# Plot =================================
# Stacked bars of the per-group proportions in `prop`; slices at or below the
# threshold get no text label.
ggplot(plot_data, aes(x = labels, y = prop, fill = Rating)) +
  geom_col() +
  scale_y_continuous(labels = scales::percent, breaks = c(0, 0.2, 0.4, 0.6, 0.8, 1)) +
  labs(y = NULL, x = NULL, fill = NULL) +
  ggtitle(paste(attr(exampledata, "variable.labels")[77])) +
  theme_classic() +
  # NA_character_ (rather than NULL) is the typed "no label" value: if_else()
  # expects `false` to be a vector compatible with `true`.
  geom_text(
    aes(label = if_else(prop > 0.02, scales::percent(round(prop, 2)), NA_character_)),
    position = position_fill(vjust = 0.5)
  ) +
  coord_flip()
Data
structure(list(exampledata.V101 = structure(c(2L, NA, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, NA, 2L, 2L, 2L, 1L, 2L, NA,
NA, NA, 1L, 1L, 2L, NA, 2L, 2L, 2L, NA, 2L, 2L, NA, NA, 1L, NA,
2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, NA, NA, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, NA, 1L, NA, 1L, NA,
1L, 2L, NA, NA, 2L, NA, 1L, 2L, 2L, NA, 2L, NA, 2L, 2L, 1L, 2L,
1L, 2L, 1L, 1L, 2L, 1L, NA, 2L, 2L, 2L, 2L, NA, 2L, 1L, 2L, 2L
), .Label = c("Weiblich", "Männlich"), class = "factor"), exampledata.A_REF = structure(c(18L,
18L, 18L, 18L, 18L, 17L, 18L, 18L, 18L, 18L, 18L, 18L, 16L, 18L,
18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 16L, 18L, 18L, 16L, 18L,
16L, 18L, 18L, 17L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L,
16L, 18L, 18L, 17L, 18L, 18L, 18L, 18L, 18L, 18L, 17L, 16L, 18L,
18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 17L, 18L, 18L,
16L, 18L, 16L, 18L, 18L, 16L, 16L, 18L, 18L, 18L, 18L, 18L, 18L,
18L, 17L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 16L, 18L,
16L, 16L, 18L, 18L, 18L, 17L, 16L, 18L), .Label = c("Zertifikat eines Aufbau- oder Ergänzungsstudiums",
"LA Berufliche Schulen", "LA Sonderschule", "LA Gymnasium", "LA Haupt- und Realschule",
"LA Grundschule", "Künstlerischer/musischer Abschluss", "Kirchlicher Abschluss",
"Staatsexamen (ohne Lehramt)", "Diplom Fachhochschule, Diplom I an Gesamthochschulen",
"Diplom Universität, Diplom II an Gesamthochschulen", "Sonstiges",
"Promotion", "Staatsexamen", "Magister", "Diplom", "Master",
"Bachelor"), class = "factor"), exampledata.V43 = structure(c(3L,
5L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 4L, 3L, 3L, 2L, NA, 4L, 5L, 5L,
4L, 4L, 4L, 4L, NA, 2L, 4L, 3L, 5L, 4L, 4L, 4L, NA, 4L, 4L, NA,
NA, 3L, 5L, 2L, 4L, 5L, 4L, 4L, 5L, 5L, 4L, NA, NA, 4L, NA, 3L,
4L, 5L, 5L, 2L, 4L, 4L, 3L, 4L, 4L, 4L, 3L, 5L, 4L, 5L, NA, 4L,
NA, 4L, NA, 4L, 5L, 4L, NA, 5L, NA, 4L, 4L, 4L, NA, 4L, NA, 5L,
4L, 4L, 4L, 4L, 4L, 3L, 3L, 4L, 2L, 4L, 4L, 4L, 3L, 4L, NA, 4L,
5L, 5L, 4L), .Label = c("5 Sehr unzufrieden", "4", "3", "2",
"1 Sehr zufrieden"), class = "factor")), .Names = c("exampledata.V101",
"exampledata.A_REF", "exampledata.V43"), row.names = c(NA, 100L
), class = "data.frame")
It's usually preferable to manipulate your data into summarized data before charting it. I find that trying to have ggplot2 do the summarization for you is either limited or difficult to have it shown the way you want.
library(tidyverse)
library(forcats)
Because it's best to summarize your data before plotting it in ggplot2, the following bit of code calculates the proportion within each label group that selected a particular answer on the scale. In the final step I turned the data from wide to long, so that all the proportions to be charted are in the same variable (which I call prop).
plot_data <- plot_data %>% group_by(labels) %>%
summarize(`5 Sehr unzufrieden` = sum(ifelse(V43 == "5 Sehr unzufrieden", 1, 0)) / n(),
`4` = sum(ifelse(V43 == "4", 1, 0)) / n(),
`3` = sum(ifelse(V43 == "3", 1, 0)) / n(),
`2` = sum(ifelse(V43 == "2", 1, 0)) / n(),
`1 Sehr zufrieden` = sum(ifelse(V43 == "1 Sehr zufrieden", 1, 0)) / n()) %>%
gather(key = Rating, value = prop, -labels)
It's preferable that categorical variables are set as factors for manipulating, say, the order and the colours, so this is what the following does. Initially, my code had the scale labels (which I called Rating in the gather function above) go in the reverse order than what you had, so I'm using fct_rev from the forcats package to reverse it back.
plot_data$labels <- factor(plot_data$labels)
plot_data$Rating <- factor(plot_data$Rating) %>% fct_rev()
For the chart below, I just made a couple of changes. The most notable is that I'm using geom_col instead of geom_bar. In the background, geom_col is the same as geom_bar(stat = "identity") - it's just quicker to type. We're essentially telling ggplot2 to chart the data as is instead of treating it like raw data. However, I do need to specify the y aesthetic to indicate what data I want charted, so I'm specifying to use the prop variable in the initial ggplot call.
# Plot =================================
# Stacked bars of the per-group proportions in `prop`; slices of 1% or less
# get no text label.
ggplot(plot_data, aes(x = labels, y = prop, fill = Rating)) +
  geom_col() +
  scale_y_continuous(labels = scales::percent, breaks = c(0, 0.2, 0.4, 0.6, 0.8, 1)) +
  labs(y = NULL, x = NULL, fill = NULL) +
  ggtitle(paste(attr(exampledata, "variable.labels")[77])) +
  theme_classic() +
  # NA_character_ (rather than NULL) is the typed "no label" value: if_else()
  # expects `false` to be a vector compatible with `true`.
  geom_text(
    aes(label = if_else(prop > 0.01, scales::percent(round(prop, 2)), NA_character_)),
    position = position_fill(vjust = 0.5)
  ) +
  coord_flip()
The only other line I changed is the geom_text call above. I added an if_else function so that it either shows the label (if it's above 1%) or not (1% or less). Also, I rounded the percentage using the round function so that you don't have any decimals. Remember that you need to round to 2 decimal places, because prop is a proportion between 0 and 1: for example, 0.347 rounds to 0.35, which is displayed as 35%.
Not sure if this will get you towards where you want to go, but here's a simple version based on some code I made a little way back. Didn't include all the ggplot2 bits as I agree with #Phil that the summary should be done before plotting.
devtools::install_github("ekstroem/MESS")
x <- c(35, 34.6, 12, 5, .1, .99, 1.2, 11.11) # Input percentages
round_percent(x)
which gives
[1] 35 35 12 5 0 1 1 11
or you could have
round_percent(x[x>1])
which gives
[1] 36 35 12 5 1 11
You'd need to make sure the colouring matches the remaining groups tho' so there is still some work left.

Automatically add p-values to facet plot

I have made a facet plot below using the following command:
# Box plot of logFC for each Length, faceted by X (rows) and Modification
# (columns). Layers are chained with a trailing `+`; the stray commas and
# leading `+` continuation prompts made the original unparseable.
ggplot(data, aes(factor(Length), logFC)) +
  geom_boxplot(fill = "grey90") +
  coord_cartesian(ylim = c(-5, 5)) +
  facet_grid(X ~ Modification)
Is there a way to compute p-values for each boxplot and add them as geom_text above each boxplot. I want to compute a t-test and compare against y=0.
My data looks like this:
X Length logFC Modification
Daub 26 -0.7307060811 NTA
Daub 22 -0.3325621272 NTA
Daub 22 -2.0579390395 NTA
Daub 25 2.7199391457 NTA
Daub 23 -0.0009869389 NTA
Daub 25 -0.3318842493 NTA
...
My error message:
> data <- structure(list(Experiment = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
+ 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
+ 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
+ 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
+ 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
+ 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
+ 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
+ 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
+ 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
+ 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L,
+ 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
+ 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
+ 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
+ 3L, 3L, 3L, 3L, 3L,
+ 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
+ 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("Daub", "Marie",
+ "Meister"), class = "factor"), Length = c(26L, 22L, 22L, 25L,
+ 23L, 25L, 23L, 25L, 24L, 23L, 24L, 26L, 24L, 21L, 20L, 21L, 22L,
+ 22L, 21L, 21L, 21L, 22L, 21L, 22L, 21L, 21L, 20L, 20L, 21L, 25L,
+ 20L, 22L, 24L, 22L, 23L, 24L, 23L, 23L, 22L, 22L, 22L, 22L, 21L,
+ 19L, 21L, 20L, 20L, 20L, 19L, 19L, 19L, 22L, 23L, 23L, 22L, 23L,
+ 22L, 20L, 21L, 24L, 24L, 24L, 25L, 24L, 21L, 20L, 23L, 23L, 20L,
+ 23L, 23L, 24L, 20L, 21L, 22L, 24L, 23L, 22L, 23L, 22L, 23L, 23L,
+ 19L, 21L, 23L, 24L, 22L, 23L, 23L, 21L, 22L, 20L, 22L, 23L, 25L,
+ 22L, 22L, 23L, 22L, 23L, 25L, 25L, 24L, 24L, 23L, 22L, 22L, 25L,
+ 23L, 24L, 23L, 23L, 22L, 22L, 25L, 23L, 22L, 25L, 21L, 19L, 21L,
+ 23L, 22L, 22L, 20L, 20L, 20L, 23L, 22L, 21L, 21L, 23L, 23L, 23L,
+ 21L, 25L, 23L, 24L, 24L, 23L, 23L, 23L, 21L, 22L, 21L, 21L, 23L,
+ 23L, 22L, 22L, 21L, 22L, 22L, 25L, 24L, 24L, 22L, 24L, 24L, 23L,
+ 22L, 21L, 22L, 23L, 20L, 22L, 23L, 24L, 25L, 24L, 25L, 22L, 23L,
+ 24L, 21L, 25L, 23L, 19L, 21L, 21L, 22L, 20L, 21L, 18L, 20L, 20L,
+ 21L, 20L, 23L, 19L, 19L, 22L, 22L, 22L, 22L, 22L, 21L, 22L, 24L,
+ 20L, 21L, 22L, 22L, 21L, 21L, 21L, 21L, 21L, 23L, 23L, 23L, 25L,
+ 25L, 25L, 23L, 24L, 24L, 24L, 24L, 24L, 24L, 25L, 25L), logFC = c(-0.7307060811,
+ -0.3325621272, -2.0579390395, 2.7199391457, -0.0009869389, -0.3318842493,
+ -2.1922199037, -1.8907961065, -1.9059255014, -0.2815081355, -0.2040330335,
+ 3.661469505, 0.6489955587, -0.0261245467, -1.4312409441, -1.1199604078,
+ -1.6528592355, -2.8208936451, -0.7207549269, -1.6528592355, -1.2540377475,
+ -2.1088724443, -2.1088724443, -1.5556550771, -1.5556550771, -0.2899601367,
+ 0.36449851, -1.7787723427, -1.5556550771, -1.5556550771, -1.5556550771,
+ -2.1092566794, 0.0417776477, -3.0768675589, -4.2573082637, -1.5556550771,
+ -1.8493703566, -0.7310899725, -2.8201262449, -0.7203706918, -2.1088724443,
+ -3.5714106365, -1.5556550771, -1.2144625017, 1.6608916211, -0.3147141406,
+ 1.2344697053, 1.2303596917, 1.2138067782, 0.9409846988, 0.5270928206,
+ -1.0435216994, -1.4320081419, -1.1644217165, -1.1478237529, -0.9941196613,
+ 0.0762668692, 1.0076747803, 0.0679302699, -0.4852244221, 0.7792467457,
+ 0.4902414285, 1.6172022872, 0.5270928206, -1.5403877099, -0.3322684844,
+ 0.0965099283, 0.8067662712, -0.3322684844, -1.2928579903, 0.6067208763,
+ 0.0247576412, -0.0291609233, -0.4737578429, 0.0743062433, 0.1126554177,
+ -0.0156954476, 1.1069888258, -0.956482117, -0.2829742145, 0.8511530937,
+ -0.1571780266, -1.2033199926, -1.1883052896, -0.0619556757, -0.7813018565,
+ 2.2467468049, 2.8382841074, 0.5658773933, -0.4461699001, -0.7409548873,
+ -0.992979577, -1.0966445642, -0.8035321174, 0.4586171366, -0.2760821893,
+ 0.0585422656, 0.0328935437, 0.3858231436, -0.4374188039, 1.1166538873,
+ -1.6539303789, 0.2027459981, -0.2193112677, -0.3939953745, -1.6726108643,
+ 1.1518720793, 2.2517568637, -0.561147283, -2.1625509666, -1.65562751,
+ -0.9048469063, -1.0759388341, 0.4938537603, 1.8754485108, -1.5944759871,
+ 1.0688499798, 2.6559945275, -1.908097968, -1.9214219995, -2.9675169126,
+ 0.0365892303, -0.8345258687, -1.0535567925, -2.0036191122, -1.6843791204,
+ -2.5554312825, -1.5778268888, -1.576142107, -0.9398408101, 2.4453250675,
+ -1.5434092122, -0.794414515, -0.6200158513, 0.5556353409, -1.0772272444,
+ -0.8720587283, -0.8082062813, -0.7353916189, 0.1072543637, 0.5658773933,
+ 0.13043531, -0.0154958912, -0.868710614, -0.1922496916, 1.0682890388,
+ -1.673413308, -0.9581901784, -1.9575141988, -1.8973257122, 1.4967046965,
+ -2.456068976, -1.4577030552, -4.2692094743, -1.9124787897, -1.4993411082,
+ -0.6409837734, 0.6369441273, -0.9960964825, -5.9703084924, -1.97960268,
+ -1.2422870608, -1.5170124157, -1.9021683731, 3.4029417731, 0.1812972171,
+ -1.6370149729, -1.749015407, -2.1677341592, -1.4942545905, -1.1137758818,
+ -1.2428452903, -1.3014446584, 0.0287537402, -0.8721416458, -2.4062762035,
+ -4.0278899462, -2.2229120764, -1.5950383235, -3.6098212725, -2.5979636046,
+ 0.3631424981, 1.1377073609, 0.5151459494, 0.0640542096, -0.7715375264,
+ -1.0361077101, -0.2462753448, -2.3058140776, -0.0847179004, -0.518970228,
+ 0.8519432911, 1.9516260022, -0.5706154628, 1.240812729, 0.336736001,
+ 2.2509464232, -0.322918086, -4.4019571741, -0.5618441487, 3.4700721641,
+ -3.9220135953, -2.1968879291, -0.1362995026, 2.164094913, -1.0688563363,
+ 0.4302583643, 2.6411096027, -3.020513717, -1.5395519303, -2.2219591633,
+ -3.8891956255, 0.9602784132, -0.6470571429, 1.853151793, -0.3271268741,
+ -0.9870872828, -2.516770073, -1.2898235194, -1.7246627604, -0.61328192,
+ -3.5457352204, -2.5068717697), Modification = structure(c(1L,
+ 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L,
+ 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L,
+ 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
+ 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L,
+ 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
+ 1L, 1L, 1L, 1L, 1L, 1L, 1L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
+ 5L, 5L, 5L, 5L, 5L, 5L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
+ 4L, 4L, 4L, 4L, 4L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
+ 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
+ 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
+ 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
+ 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 3L, 3L,
+ 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L,
+ 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("NTA",
+ "t3-d", "t3-u", "t5-d", "t5-u"), class = "factor")), .Names = c("Experiment",
+ "Length", "logFC", "Modification"), class = "data.frame", row.names = c(NA,
+ -223L))
> library(dplyr)
> pvalues <- data %>% group_by(Experiment, Modification, Length) %>%
+ filter(n() > 1) %>%
+ summarize(p.value = (t.test(logFC, mu = 0)$p.value))
Error in t.test(logFC, mu = 0) : object 'logFC' not found
You can do this by summarizing the data into a table of p-values. This can be done using dplyr:
library(dplyr)
# One-sample t-test of logFC against 0 for every
# Experiment / Modification / Length combination.
pvalues <- data %>%
  group_by(Experiment, Modification, Length) %>%
  # groups with a single observation are dropped before testing
  filter(n() > 1) %>%
  summarize(p.value = t.test(logFC, mu = 0)$p.value)
(The line filter(n() > 1) is to get rid of any groups of size 1, for which a p-value cannot be calculated). This produces a table that looks like:
# Experiment Modification Length p.value
# 1 Daub NTA 22 0.3980043
# 2 Daub NTA 23 0.3535590
# 3 Daub NTA 24 0.5831962
# 4 Daub NTA 25 0.9137644
# 5 Daub NTA 26 0.6254004
# 6 Daub t3-d 20 0.1493108
Now you can add that text to your plot using a geom_text layer, choosing some y such as y = 3:
library(ggplot2)
# Boxplots per Length, faceted by Experiment (rows) and Modification (columns),
# with each group's p-value drawn as text at y = 3.
ggplot(data, aes(x = factor(Length), y = logFC)) +
  geom_boxplot(fill = "grey90") +
  coord_cartesian(ylim = c(-5, 5)) +
  facet_grid(Experiment ~ Modification) +
  geom_text(data = pvalues, mapping = aes(y = 3, label = p.value), size = 1)
You will probably have to manipulate the size (and possibly angle) of your geom_text to make the plot readable. Note also that since you are performing many tests, you should probably look at the adjusted p-values rather than the raw p-values. You can compute that column with
# summarize() leaves pvalues grouped by Experiment and Modification, so ungroup
# first; otherwise the Bonferroni correction is applied separately within each
# group instead of across all tests.
pvalues <- pvalues %>%
  ungroup() %>%
  mutate(p.adjusted = p.adjust(p.value, method = "bonferroni"))
The function format.pval will also come in handy, especially if some of your p-values are close to 0.

Resources