Related
I have a dataframe where the first column has different types of bacteria, and the rest of the columns the samples, each sample belong to a specific time (T0, T1...) and the last 39 columns are the control group.
What I pretend is to plot each bacteria in one plot. And the plot must contain the different times in x-axis and the value in the y-axis (I was thinking in a bar plot or box plot with the errors coef.
Any idea about how can I group the data for different times and for different bacteria?
Here a small example of the data:
thanks!
structure(list(Bacteria = c("Methanobrevibacter", "Methanosphaera",
"Methanomassiliicoccus"), PIE2001_T0_TORUNDA = c(2.279974027,
0.670536115, -0.022611066), PIE2001_T1_TORUNDA = c(2.021643324,
-0.057798217, -0.057798217), PIE2001_T5_COMPL = c(2.788566988,
0.648500825, -0.044646356), PIE2006_T0_TORUNDA = c(0.07550014,
1.684938052, 0.07550014), PIE2007_T0_TORUNDA = c(2.072075243,
1.261145027, -0.125149334), PIE2007_T1_TORUNDA = c(2.601582257,
1.279826417, -0.106467944), PIE2007_T2 = c(2.81564899, 1.765826865,
-0.180083284), PIE2007_T3 = c(0.639040509, 3.081387545, -0.054106671
), PIE2013_T0_COMPLETA = c(2.683794403, -0.024255798, -0.024255798
), PIE2013_T1_COMPLETA = c(2.614756053, -0.024301277, -0.024301277
), PIE2013_T4_COMP = c(2.653056483, 0.013999154, 0.013999154),
PIE2013_T5_COMPL = c(1.861263144, -0.084647005, -0.084647005
), PIE2014_COMP = c(2.304771706, 1.005488722, -0.093123567
), PIE2016_T0_COMPLETA = c(-0.141271428, -0.141271428, -0.141271428
), PIE2016_T1_COMPLETA = c(-0.081696055, -0.081696055, -0.081696055
), PIE2016_T3 = c(-0.019385468, -0.019385468, -0.019385468
), PIE2016_T3_TOR = c(0.045856809, 0.045856809, 0.045856809
), PIE2017_T0_COMPLETA = c(4.493506636, 0.189441543, 0.189441543
), PIE2017_T1_COMPLETA = c(5.001671041, 0.71808448, 0.024937299
), PIE2017_T2_TOR = c(5.887191114, 0.672255357, -0.020891824
), PIE2017_T3 = c(3.306066839, 0.703377154, 0.010229973),
PIE2017_T4_COMP = c(5.560847286, 1.371192544, -0.015101817
), PIE2017_T5_COMPL = c(5.688626959, -0.025105846, -0.025105846
), PIE2018_T1 = c(0.158551089, 0.158551089, 0.158551089),
PIE2019_T1_COMPL = c(6.659430141, 0.833430034, 0.140282853
)), row.names = c(NA, 3L), class = "data.frame")
Script updated:
colnames(df)[363:401] <- gsub("T0", "T6", colnames(df)[363:401])
df %>%
pivot_longer(-Bacteria) %>%
mutate(group = gsub('_.*$', '', name),
time = gsub('^.*_(T\\d+).*$', '\\1', name)) %>%
filter(grepl('T\\d+', time)) %>%
ggplot(aes(time, value, fill = Bacteria)) +
geom_bar(stat = 'summary', fun = 'mean', position = 'dodge') +
stat_summary(fun.data = "mean_se", geom = "errorbar", width = 0.2, position = position_dodge(0.9)) +
theme_minimal() +
facet_grid(Bacteria ~ ., scale = 'free_y') +
scale_fill_brewer(palette = 'Set1') +
theme(panel.border = element_rect(fill = NA, color = 'gray75'))
You need to reshape your data. You can then do a comparative boxplot:
library(tidyverse)
df %>%
pivot_longer(-Bacteria) %>%
mutate(group = gsub('_.*$', '', name),
time = gsub('^.*_(T\\d+).*$', '\\1', name)) %>%
filter(grepl('T\\d+', time)) %>%
ggplot(aes(time, value, fill = Bacteria)) +
geom_boxplot() +
theme_minimal() +
facet_grid(Bacteria ~ ., scale = 'free_y') +
scale_fill_brewer(palette = 'Set2') +
theme(panel.border = element_rect(fill = NA, color = 'gray75'))
Pivot your data to long format, then facet by bacterium:
library(tidyr)
library(dplyr)
library(forcats)
library(ggplot2)
dat_long <- dat %>%
pivot_longer(!Bacteria, names_to = "sample") %>%
mutate(sample = fct_inorder(sample))
ggplot(dat_long, aes(sample, value)) +
geom_col() +
facet_wrap(vars(Bacteria), ncol = 1) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
Or make a line graph with bacteria mapped to color:
ggplot(dat_long, aes(sample, value)) +
geom_line(aes(color = Bacteria, group = Bacteria)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
For a heatmap made using ggplot and geom_tile, how would you "cross off" a tile based on a conditional value?
The heatmap shows counts of the number of times an animal performed a behavior between 1990-2020.
Rows are animal IDs, columns are years.
Years go from 1990-2020 but not all animals are alive throughout that time frame (ie, some born later than 1990 or die earlier than 2020)
So I want to cross off any tiles where an animal isn't alive, or before it was born.
Data look like this (shortened to 5 rows for brevity):
data <- data.frame(date = structure(c(8243, 8243, 8243, 8248, 8947), class = "Date"),
year = c("1992", "1992", "1992", "1992", "1994"),
event.id = c(8L, 8L, 8L, 10L, 11L),
id = c("L5", "L58", "L73", "L21", "L5"),
birth = c(1964L, 1980L, 1986L, 1950L, 1964L),
death = c(2012L, 2003L, NA, NA, 2012L))
NA means the animal is still alive and it wouldn't be crossed off since before it was born.
Any help to create this is greatly appreciated!
Code looks like this:
heatmap <- data %>%
mutate(x = case_when(year %in% 1990:1999 ~ "1990-1999",
TRUE ~ year)) %>%
mutate(y = paste(id)) %>%
group_by(x, y, .drop = FALSE) %>%
summarize(count = n()) %>%
arrange(y)
ggplot(data = heatmap, aes(x, y, fill = count)) +
geom_tile()
EDIT
Current heat map.
Here's how you could use color to indicate NA, like suggested by #Gregor Thomas.
Transforming your data to "complete":
library(dplyr)
library(tidyr)
library(ggplot2)
hm <- dat %>%
mutate(x = case_when(year < 1960 ~ "Pre-1960",
year %in% 1960:1969 ~ "1960-1969",
year %in% 1970:1979 ~ "1970-1979",
year %in% 1980:1989 ~ "1980-1989",
year %in% 1990:1999 ~ "1990-1999",
TRUE ~ year)) %>%
mutate(y = paste(matriline, id)) %>%
group_by(x, y, .drop = FALSE) %>%
summarize(count = n()) %>%
ungroup() %>%
tidyr::complete(x, y) %>%
arrange(y) %>%
tidyr::separate(y, into = c("ym", "yid"), sep = " ", remove = FALSE)
Then define a color for NA:
ggplot(data = hm, aes(x, yid, fill = count)) +
geom_tile() +
scale_fill_gradient(low = "white", high = "red", na.value = "grey50") +
scale_x_discrete(position = "top", drop = FALSE) +
scale_y_discrete(limits=rev) +
labs(x = NULL, y = NULL) +
facet_wrap( ~ ym, strip.position = "left", dir = "v", ncol = 1) +
theme(panel.spacing = unit(0.2, "lines"),
strip.background = element_blank(),
strip.placement = "outside",
axis.text.x = element_text(angle = 45, hjust = -0.02))
Data:
ids <- c("J11", "J16", "J17", "J02", "J22", "J26", "J27", "J30")
matrilines <- c("J02","J04", "K11", "L20", "P90", "K100", "R22")
dat <- data.frame(year = as.character(sample(1960:2018, 1000, replace = TRUE)),
id = sample(ids, 1000, replace = TRUE),
matriline = sample(matrilines, 1000, replace = TRUE))
I am trying to produce a graph for a categorical variable with three sub-groups, but I would like to strictly present the results for two groups. In Stata, this can be done while producing a graph by adding something like, but I am not sure if there is an R equivalent?
drop if sentiment== "neutral"
Here is the a data example:
dput(head(sample_graph, 5))
(list(sentiment = structure(c(3L, 2L, 4L, NA, 2L), .Label = c("meg",
"negative", "neutral", "positive"), class = "factor"), treatment_announcement = c("pre",
"pre", "pre", "pre", "post"), n = c(78L, 150L, 87L, 1L, 829L),
sentiment_percentage = c(0.246835443037975, 0.474683544303797,
0.275316455696203, 0.00316455696202532, 0.490822972172883
), am = structure(c(2L, 2L, 2L, 2L, 1L), .Label = c("post",
"pre"), class = "factor")), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"), row.names = c(NA, -5L), groups = structure(list(
treatment_announcement = c("post", "pre"), .rows = structure(list(
5L, 1:4), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -2L), .drop = TRUE))
I have used this code before, which works well but it drops all observations under this category, but I only want to drop them for visualization purposes, not all rows in the df itself.
For instance, after running the code below, my observations declined from 8000 to 6323.
sample_graph<- sample_graph %>%
drop_na() %>%
filter(sentiment != "neutral")
Therefore, I have attempted dropping the specific subgroup within the ggplot itself, but I am facing an error: "Problem with filter() input ..2.
i Input ..2 is aes(x = treatment_announcement, fill = sentiment, y = sentiment_percentage)."
ggplot(sample_graph %>% filter(sentiment != "neutral", aes(x = treatment_announcement, fill = sentiment, y = sentiment_percentage))) +
geom_bar(stat = "identity", position=position_dodge()) +
scale_fill_grey() +
ylab("percentage") +
theme(text=element_text(size=20)) +
scale_fill_manual(values = c("positive" = "green",
"negative" = "red")) +
theme(plot.title = element_text(size = 18, face = "bold")) +
scale_x_discrete(limits = c("pre", "post")) +
theme_bw()
Following Allen's advice below, I tried the following:
twitter_posts |>
drop_na() |>
filter(sentiment != "neutral") |>
select(sentiment, treatment_announcement) |> # we're only interested in sentiment & treatment_announcement
group_by(sentiment) %>% # group data and
add_count(treatment_announcement) |> # add count of treatment_announcement
unique() |> # remove duplicates
ungroup() |> # remove grouping
group_by(treatment_announcement) |> # group by treatment_announcement
mutate(sentiment_percentage = n/sum(n)) |> # ...calculating percentage
mutate(sentiment = as.factor(sentiment)) |> # change to factors so that ggplot treats...
mutate(am = as.factor(treatment_announcement)) |>
twitter_posts (data = teacher_posts, aes(x = treatment_announcement, fill = sentiment, y = sentiment_percentage)) +
geom_bar(stat = "identity", position=position_dodge()) +
scale_fill_grey() +
xlab("Treatment refers to the implementation of the wage subsidy program targeted at jobless teachers") +
ylab("percentage") +
theme(text=element_text(size=20)) +
scale_fill_manual(values = c("positive" = "green",
"negative" = "red")) +
theme(plot.title = element_text(size = 18, face = "bold")) +
scale_x_discrete(limits = c("pre", "post")) +
theme_bw()
And I am receiving this error "Mapping should be created with aes() or aes_()." although I have the aes mapping for the plot.
You can do some version of this via piping to ggplot or using filter in the data argument
library(tidyverse)
library(palmerpenguins)
penguins <- penguins
penguins |>
drop_na() |>
filter(species != "Adelie") |>
ggplot(aes(x = bill_length_mm, y = body_mass_g)) +
geom_point()
ggplot(data = filter(penguins,species != "Adelie"), aes(x = bill_length_mm, y = body_mass_g)) +
geom_point()
#> Warning: Removed 1 rows containing missing values (geom_point).
Created on 2022-07-18 by the reprex package (v2.0.1)
So taking the code you provided it would look something like this
twitter_posts |>
drop_na() |>
filter(sentiment != "neutral") |>
select(sentiment, treatment_announcement) |> # we're only interested in sentiment & treatment_announcement
group_by(sentiment) %>% # group data and
add_count(treatment_announcement) |> # add count of treatment_announcement
unique() |> # remove duplicates
ungroup() |> # remove grouping
group_by(treatment_announcement) |> # group by treatment_announcement
mutate(sentiment_percentage = n/sum(n)) |> # ...calculating percentage
mutate(sentiment = as.factor(sentiment)) |> # change to factors so that ggplot treats...
mutate(am = as.factor(treatment_announcement)) |>
ggplot(aes(x = treatment_announcement, fill = sentiment, y = sentiment_percentage)) +
geom_bar(stat = "identity", position=position_dodge()) +
scale_fill_grey() +
xlab("Treatment refers to the implementation of the wage subsidy program targeted at jobless teachers") +
ylab("percentage") +
theme(text=element_text(size=20)) +
scale_fill_manual(values = c("positive" = "green",
"negative" = "red")) +
theme(plot.title = element_text(size = 18, face = "bold")) +
scale_x_discrete(limits = c("pre", "post")) +
theme_bw()
So you would be doing your data cleaning and then plotting it. Because you are piping it you do not need to include the data argument.
If I were you, I would just create a new dataframe by filtering your original one with
newdataframe <- originaldataframe %>%
filter(variable==)
or something in this style.
From there generating the new graph should be trivial if you already have a working code.
Maybe is not the most polished way to do it, but its fast and effective.
Hope it helps.
I have a dataset that looks something like this:
a b a_total b_total
dog 3 5 10 8
cat 6 2 12 13
pig 9 3 15 9
I'm trying to make a stacked barplot using ggplot to have "a_total" on the bottom and "a" on top for each animal. I tried this but doesn't work.
ggplot(df, aes(x = "", y = c("a", "a_total")) + geom_bar(stat= "identity")
How should I go about this?
We create a column from the row names (rownames_to_column), select the 'a' columns along with the new column, reshape to 'long' format (pivot_longer) and do the plotting
library(dplyr)
library(tidyr)
library(ggplot2)
df %>%
rownames_to_column('animal') %>%
select(animal, a, a_total) %>%
pivot_longer(cols = -animal) %>%
ggplot(aes(x = animal, y = value, fill = name)) +
geom_bar(stat = 'identity') +
theme_bw()
-output
Also, this can be done for both 'a' and 'b' in a facet_wrap
df %>%
rownames_to_column('animal') %>%
pivot_longer(cols = -animal) %>%
mutate(abgrp = substr(name, 1, 1)) %>%
ggplot(aes(x = animal, y = value, fill = name)) +
geom_bar(stat = 'identity') +
theme_bw() +
facet_wrap(~ abgrp)
In base R, we can use barplot
barplot(t(df[c('a', 'a_total')]), col = c('red', 'blue'), legend = TRUE)
data
df <- structure(list(a = c(3L, 6L, 9L), b = c(5L, 2L, 3L), a_total = c(10L,
12L, 15L), b_total = c(8L, 13L, 9L)), class = "data.frame", row.names = c("dog",
"cat", "pig"))
My current chart looks like this:
What I want to achieve in the scale is something like this:
So basically show the date and day in the middle of the interval and all single hours on those minor breaks.
Here is a reproducible example of the plot:
# Input load. Please do not change #
`dataset` = structure(list(JobName = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "Job 1", class = "factor"),
RunDateTime = structure(c(1479195000, 1479196617, 1479281400,
1479281851, 1479367800, 1479368235), class = c("POSIXct",
"POSIXt"), tzone = ""), EndRunDateTime = structure(c(1479195855,
1479197916, 1479283032, 1479283032, 1479369407, 1479369407
), class = c("POSIXct", "POSIXt"), tzone = ""), Status = structure(c(1L,
2L, 2L, 2L, 2L, 2L), .Label = c("failed", "successfull"), class = "factor"),
GraphicColor = structure(c(2L, 1L, 1L, 1L, 1L, 1L), .Label = c("green",
"red"), class = "factor")), .Names = c("JobName", "RunDateTime",
"EndRunDateTime", "Status", "GraphicColor"), row.names = c(NA,
6L), class = "data.frame")
# Original Script. Please update your script content here and once completed copy below section back to the original editing window #
vars <- c("RunDateTime", "EndRunDateTime")
dataset[vars] <- lapply(dataset[vars], as.POSIXct, format = "%Y-%m-%dT%H:%M:%S")
df <- with(dataset, data.frame(Name = JobName, Start = RunDateTime, End = EndRunDateTime, Status = Status))
library(ggplot2)
lims <- with(df, c(min(Start), max(End)))
ggplot(df, aes(color = Status)) +
geom_segment(aes(x = Start, xend = End, y = Name, yend = Name), size = 3) +
scale_x_datetime(date_breaks = "1 day", date_minor_breaks = "1 hour", limits = lims, labels = function(x) paste(format(x, "%a \n %d.%m.%y"))) +
xlab(NULL) +
ylab(NULL) +
scale_colour_manual(values = c("successfull" = "#a1d99b", "failed" = "red", "repeated" = "yellow", "canceled" = "grey")) +
theme_bw()
Would love to hear suggestions on how to get a scale like this!
SOLUTION:
I've chosen #Marijn Stevering solution and rearanged the code a bit:
vars <- c("RunDateTime", "EndRunDateTime")
dataset[vars] <- lapply(dataset[vars], as.POSIXct, format = "%Y-%m-%dT%H:%M:%S")
df <- with(dataset, data.frame(Name = JobName, Start = RunDateTime, End = EndRunDateTime, Status = Status))
library(ggplot2)
lims <- with(df, c(min(Start), max(End)))
library(dplyr)
library(lubridate)
dates <- data.frame(Date = df$Start)
minute(dates$Date) <- 0
second(dates$Date) <- 0
hour(dates$Date) <- 12
dates <- distinct(dates) %>%
mutate(Label = paste(format(Date, "%a - %d.%m.%y")),
DateRound = Date)
hour(dates$DateRound) <- 0
ggplot(df) +
geom_segment(aes(x = Start, xend = End, y = Name, yend = Name, color = Status), size = 3) +
# Add the major labels as a geom, this does limit to the plot area so have to put them above axis
geom_text(data = dates, aes(x = dates$Date, label = dates$Label, y = 0, vjust = -0.5),check_overlap = TRUE, size = 3.5) +
# Add vertical lines to separate the days visually
geom_vline(data = dates, aes(xintercept = as.numeric(dates$DateRound)),linetype = "longdash") +
scale_x_datetime(date_breaks = "2 hour", date_minor_breaks = "1 hour", limits = lims, labels = function(x) paste(format(x, "%H"))) +
xlab(NULL) +
ylab(NULL) +
scale_colour_manual(values = c("successfull" = "#a1d99b", "failed" = "red", "repeated" = "yellow", "canceled" = "grey")) +
theme_bw() +
theme(axis.text.x = element_text(size = 7))
So now my scale looks like this, with which I can live very well.
I tried solving it by making the axis labels the hours and then manually adding in the date labels. I did this using a geom_text but that is limited to the plot area, so it appears above the axis. Also the way I create the data for the geom_text is probably sub-optimal. The plot then looks like this:
Generated with this code:
# Fiddling with dates, there is probably a better way to do this
library(dplyr)
library(lubridate)
dates <- data.frame(Date = df$Start)
minute(dates$Date) <- 0
second(dates$Date) <- 0
hour(dates$Date) <- 12
dates <- distinct(dates) %>%
mutate(Label = paste(format(Date, "%a \n %d.%m.%y")),
DateRound = Date)
hour(dates$DateRound) <- 0
ggplot(df) +
geom_segment(aes(x = Start, xend = End, y = Name, yend = Name, color = Status), size = 3) +
# Add the major labels as a geom, this does limit to the plot area so have to put them above axis
geom_text(data = dates, aes(x = Date, label = Label, y = 0, vjust = -0.2)) +
# Add vertical lines to separate the days visually
geom_vline(data = dates, aes(xintercept = as.numeric(DateRound))) +
scale_x_datetime(date_breaks = "1 hour", date_minor_breaks = "1 hour", limits = lims, labels = function(x) paste(format(x, "%H"))) +
xlab(NULL) +
ylab(NULL) +
scale_colour_manual(values = c("successfull" = "#a1d99b", "failed" = "red", "repeated" = "yellow", "canceled" = "grey")) +
theme_bw()
I think facets might help you achieve what you want.
# Original Script. Please update your script content here and once completed copy below section back to the original editing window #
vars <- c("RunDateTime", "EndRunDateTime")
dataset[vars] <- lapply(dataset[vars], as.POSIXct, format = "%Y-%m-%dT%H:%M:%S")
df <- with(dataset, data.frame(Name = JobName, Start = RunDateTime, End = EndRunDateTime, Status = Status))
library(ggplot2)
library(lubridate)
df <- transform(df,
Date = as.Date(Start),
DateStart = as.Date(Start),
DateEnd = as.Date(End),
HourStart = hour(Start),
HourEnd = hour(End))
if(sum(df$DateStart != df$DateEnd) != 0){
stop("At least one job runs in two different days.")
}
ggplot(df, aes(color = Status)) +
geom_segment(aes(x = HourStart, xend = HourEnd, y = Name, yend = Name), size = 3) +
xlab(NULL) +
ylab(NULL) +
scale_x_continuous(breaks = 0:24, limits = c(0,24)) +
scale_colour_manual(values = c("successfull" = "#a1d99b",
"failed" = "red", "repeated" = "yellow", "canceled" = "grey")) +
theme_bw()+
facet_grid(~Date)
This lets you group every day in one column and then you have the hours as labels on the axis. I know that it is not exactly how you drew it but it is how I would display it.
Watch out for the treatment of jobs that start in one day and finish in another.
I hope this helps.