EDITED: following the recommendation
I am trying to create a simple plot of the frequency of consecutive events based on this post: ggplot sequence patterns
I would like to pile up the same plot but for others subjects ( different sequences)
Subjects "aa", "bb", "cc"
The example dataset is as follow:
subject <- c("aa","aa","aa","aa","aa","aa","aa", "bb","bb","bb","bb","bb","bb","bb","cc","cc","cc","cc","cc","cc","cc")
event <- c("P","C","D","C","P","E","D","C","P","E","D","P","C","E","P","E","D","P","C","x","x")
freq <- c(3,2,1,4,2,1,0,3,4,1,3,3,1,2,1,3,2,1,4,0,0))
dfx <- data.frame(subject, event, freq)
The result I get is:
Using this code, based on the original post:
library(ggplot2)
dfx$type <- factor(dfx$type)
dfx$ymin <- c(0,1,2)
dfx$ymax <- c(1,2,3)
dfx$xmax <- cumsum(dfx$count)
dfx$xmin <- c(0, head(dfx$xmax, n=-1))
plot_x <- ggplot(dfx,
aes(xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax,fill=type)) +geom_rect(colour="grey40", size=0.5)
png("plot_x.png", height=200, width=800)
print(plot_x)
dev.off()
I have created this image with what I would like to plot. ( it is handmade in excel). In this case, we have 3 subjects, 4 events (C,P, D,E)+1 dummy event(X), necessary to create the data frame. As you can see, the total number of events is not necessary equal on each subject.
Try using facet_grid or facet_wrap
library(ggplot2)
p1 <-
ggplot(dfx, aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax, fill = event))+
theme_bw() +
geom_rect(colour = "black", size = 0.5) +
facet_grid(subject ~ .) +
theme(axis.ticks.y=element_blank(), axis.text.y=element_blank())
p1
DATA:
dfx <- data.frame(
subject = c("aa","aa","aa","aa","aa","aa","aa", "bb","bb","bb","bb","bb","bb","bb","cc","cc","cc","cc","cc","cc","cc"),
event = c("P","C","D","C","P","E","D","C","P","E","D","P","C","E","P","E","D","P","C","x","x"),
freq = c(3,2,1,4,2,1,0,3,4,1,3,3,1,2,1,3,2,1,4,0,0),
ymin = 0,
ymax = 1)
TEMP <- tapply(dfx$freq,dfx$subject,cumsum)
dfx$xmax <- unlist(TEMP)
dfx$xmin <- c(0, head(TEMP$aa, -1), 0, head(TEMP$bb, -1), 0, head(TEMP$cc, -1))
Related
How can I reformat this ridgeline plot so that is a vertical ridgeline plot?
My real dataset is the actual PDF. For a minimum reproducible example, I generate distributions and extract the PDFs to use in a dummy function. The dataframe has a model name (for grouping), x values paired with PDF ordinates, and an id field that separates the different ridgeline levels (i.e., ridgeline y axis).
set.seed(123)
makedfs <- function(name, id, mu, sig) {
vals <- exp(rnorm(1000, mean=mu, sd=sig))
pdf <-density(vals)
model <- rep(name, length(pdf$x))
prox <- rep(id, length(pdf$x))
df <- data.frame(model, prox, pdf$x, pdf$y)
colnames(df) <- c("name", "id", "x", "pdf")
return(df)
}
df1 <- makedfs("model1", 0, log(1), 1)
df2 <- makedfs("model2", 0, log(0.5), 2)
df3 <- makedfs("model1", 1, log(0.2), 0.8)
df4 <- makedfs("model2", 1, log(1), 1)
df <- rbind(df1, df2, df3, df4)
From this answer, R Ridgeline plot with multiple PDFs can be overlayed at same level, I have a standard joyplot:
ggplot(df, aes(x=x, y=id, height = pdf, group = interaction(name, id), fill = name)) +
geom_ridgeline(alpha = 0.5, scale = .5) +
scale_y_continuous(limits = c(0, 5)) +
scale_x_continuous(limits = c(-6, 6))
I am trying the code below based on https://wilkelab.org/ggridges/reference/geom_vridgeline.html but it throws an error on the width parameter.
p <- ggplot(df, aes(x=id, y=x, width = ..density.., fill=id)) +
geom_vridgeline(stat="identity", trim=FALSE, alpha = 0.85, scale = 2)
Error in `f()`:
! Aesthetics must be valid computed stats. Problematic aesthetic(s): width = ..density...
Did you map your stat in the wrong layer?
If you wanted the same graph, just vertically oriented, you need to use the same parameters when you use geom_vridgeline.
I swapped the limits you originally set so you can see that it's the same.
ggplot(df, aes(x = id, y = x, width = pdf, fill = name,
group = interaction(name, id))) +
geom_vridgeline(alpha = 0.85, scale = .5) +
scale_x_continuous(limits = c(0, 5)) + # <-- note that the x & y switched
scale_y_continuous(limits = c(-6, 6))
I would like to plot rectangles between specific values listed in a data frame, such as:
Region <- c("A","B","A","B","A","C","B","C","A"),
Lon <- c(31.03547, 37.25443, 65.97450, 69.90290, 101.77630,
105.32550, 148.86270, 147.72010, 146.10420)
var1 <- rnorm(n = 9, mean = 15, sd = 100)
regions <- data.frame(Region, Lon, var1)
This is an example where I show the region limits using geom_vline:
ggplot(NULL)+
geom_vline(data = regions, aes(xintercept=Lon,
linetype=region,
color = region),
size=0.6)+
geom_point(data = regions, aes(x=Lon, y=var1, color=Region))+
theme_bw()
I want to plot background rectangles that would be limited by those verticle lines.
I tried to look at this previous question:
How to find the start and the end of sequences automatically in R for rectangles in ggplot
However, it does not satisfy completely my needs, because I would like to plot rectangles for every region.
# Convert to runlength encoding
rle <- rle(regions$Region == "B")
# Determine starts and ends
starts <- {ends <- cumsum(rle$lengths)} - rle$lengths + 1
# Build a data.frame from the rle
dfrect <- data.frame(
xmin = regions$Lon[starts],
# We have to +1 the ends, because the linepieces end at the next datapoint
# Though we should not index out-of-bounds, so we need to cap at the last end
xmax = regions$Lon[pmin(ends + 1, max(ends))],
fill = rle$values
)
ggplot(NULL)+
geom_vline(data = regions, aes(xintercept=Lon,
linetype=region,
color = region),
size=0.6)+
geom_rect(data = dfrect,
aes(xmin = xmin, xmax = xmax, ymin = -Inf, ymax = Inf,
fill = fill),
alpha = 0.4) +
geom_point(data = regions, aes(x=Lon, y=var1, color=Region))+
theme_bw()
How can I define the rectangles for A and C too? consider that I have multiple regions, not only 3.
sample data
Region <- c("A","B","A","B","A","C","B","C","A")
Lon <- c(31.03547, 37.25443, 65.97450, 69.90290, 101.77630,
105.32550, 148.86270, 147.72010, 146.10420)
var1 <- rnorm(n = 9, mean = 15, sd = 100)
regions <- data.frame(Region, Lon, var1)
code
library(data.table)
# Make regions a data.table
setDT(regions)
# first sort by lon, to avoind overlap in rectangles
setkey(regions, Lon)
# create boundaries of rectangles
regions[, Lon_end := data.table::shift(Lon, type = "lead", fill = Inf)][]
# plot
ggplot(data = regions) +
geom_vline(aes(xintercept = Lon, linetype = Region, color = Region), size = 0.6) +
geom_rect(mapping = aes(xmin = Lon, xmax = Lon_end, ymin = 0, ymax = 1, fill = Region), alpha = 0.1)
output
I have a plot made in R similar to the one below. I have some values plotted in addition to the mean of those values. I want to draw a rectangle around values that are consecutively 5 times on either side of the mean. I'm having a hard time accomplishing this, any insight would be appreciated.
EDIT: I should clarify that I'm trying to have this done automatically, I don't want to manually set the coordinates of the rectangles.
Reproducible example
library(ggplot2)
mydata <- data.frame(
id = c(1:20),
result = c(102,99,102,99,102, rep.int(105,5), 102,99,102,99,102, rep.int(95,5))
)
mymean <- 100
ggplot(mydata, aes(x = id, y = result)) +
geom_point() +
geom_hline(yintercept = mymean)
Current Plot:
Desired Plot:
You will first need an algorithm to go through the data and classify it into different groups depending on your criteria (the hardest part of this). Then, you can take the result of that and use geom_rect() to add the rectangles to the chart. The function classify_data() below does the classification, and the rectangles are added with geom_rect(). If you have points that fall on the mean and you don't want to group them in a rectangle, you can add a condition to test for that.
classify_groups <- function(df, id_col = "id", val_col = "result") {
id <- df[[id_col]]
values <- df[[val_col]]
current_ids <- c()
current_values <- c()
result <- data.frame(id = numeric(),
xmin = numeric(),
xmax = numeric(),
y = numeric())
for (i in 1:(length(id))) {
if (length(current_values) == 0 | values[i] %in% current_values) {
current_values <- c(current_values, values[i])
current_ids <- c(current_ids, id[i])
} else {
current_ids <- c(id[i])
current_values <- c(values[i])
}
if (length(current_values) == 5) {
result <- result %>%
add_row(id = i,
xmin = min(current_ids),
xmax = max(current_ids),
y = max(current_values))
current_ids <-c()
current_values <- c()
}
}
result
}
mydata <- data.frame(
id = c(1:20),
result = c(102,99,102,99,102, rep.int(105,5), 102,99,102,99,102, rep.int(95,5))
)
mymean = 100
groups <- classify_groups(mydata)
ggplot(mydata, aes(x = id, y = result)) +
geom_point() +
geom_hline(yintercept = mymean) +
geom_rect(data = groups,
aes(xmin = xmin - 0.5,
xmax = xmax + 0.5,
ymin = y - 0.5,
ymax = y + 0.5,
group = id),
alpha = 0,
color = 'darkorange',
size=1,
inherit.aes = FALSE)
Result:
Attached to the post you see a density plot. I am asked to compute the volume of bike rentals during weather levels, named weathersit (on the right). I am further to assign names, say Bad, Very_Bad, God_Awful and I_Wont_Even_Bother. Preferably the names should be near or in the blue circle, seen in the attached Image.
library(ggplot2)
library(hrbrthemes)
library(dplyr)
library(tidyr)
library(viridis)
#plotting the bike density
ggplot(Bikes_Washington, aes(x=cnt, group=weathersit, fill = weathersit)) +
geom_density(color = "darkblue", alpha=0.2)
Our plot
In day data, there only three categories exists, so I recode 1, 2, and 3 as Bad, Very Bad, God Awful. Also, it's recommended to change categorical variable as factor using factor or as.factor.
Bikes_Washington <- read.csv("D:/Prac/day.csv")
Bikes_Washington <- Bikes_Washington %>%
mutate(weathersit = as.factor(weathersit)) %>%
mutate(weathersit = recode_factor(weathersit, "1" = "Bad", "2" = "Very_Bad", "3" = "God_Awful"))
sp <- split(Bikes_Washington$cnt, Bikes_Washington$weathersit)
a <- lapply(seq_along(sp), function(i){
d <- density(sp[[i]])
k <- which.max(d$y)
data.frame(weathersit = names(sp)[i], xmax = d$x[k], ymax = d$y[k])
})
a <- do.call(rbind, a)
Bikes_Washington %>%
mutate(weathersit = as.factor(weathersit)) %>%
ggplot(aes(x=cnt, group=weathersit, fill = weathersit)) +
geom_density(color = "darkblue", alpha=0.2) +
geom_text(data = a,
aes(x = xmax, y = ymax,
label = weathersit, vjust = -.5))
Forword: I provide a reasonably satisfactory answer to my own question. I understand this is acceptable practice. Naturally my hope is to invite suggestions and improvements.
My purpose is to plot two time series (stored in a dataframe with dates stored as class 'Date') and to fill the area between the data points with two different colors according to whether one is above the other. For instance, to plot an index of Bonds and an index of Stocks, and to fill the area in red when the Stock index is above the bond index, and to fill the area in blue otherwise.
I have used ggplot2 for this purpose, because I am reasonably familiar with the package (author: Hadley Wickham), but feel free to suggest other approaches. I wrote a custom function based on the geom_ribbon() function of the ggplot2 package. Early on I faced problems related to my lack of experience in handling the geom_ribbon() function and objects of class 'Date'. The function below represents my effort to solve these problems, almost surely it is roundabout, unecessarily complicated, clumsy, etc.. So my question is: Please suggest improvements and/or alternative approaches. Ultimately, it would be great to have a general-purpose function made available here.
Data:
set.seed(123456789)
df <- data.frame(
Date = seq.Date(as.Date("1950-01-01"), by = "1 month", length.out = 12*10),
Stocks = 100 + c(0, cumsum(runif(12*10-1, -30, 30))),
Bonds = 100 + c(0, cumsum(runif(12*10-1, -5, 5))))
library('reshape2')
df <- melt(df, id.vars = 'Date')
Custom Function:
## Function to plot geom_ribbon for class Date
geom_ribbon_date <- function(data, group, N = 1000) {
# convert column of class Date to numeric
x_Date <- as.numeric(data[, which(sapply(data, class) == "Date")])
# append numeric date to dataframe
data$Date.numeric <- x_Date
# ensure fill grid is as fine as data grid
N <- max(N, length(x_Date))
# generate a grid for fill
seq_x_Date <- seq(min(x_Date), max(x_Date), length.out = N)
# ensure the grouping variable is a factor
group <- factor(group)
# create a dataframe of min and max
area <- Map(function(z) {
d <- data[group == z,];
approxfun(d$Date.numeric, d$value)(seq_x_Date);
}, levels(group))
# create a categorical variable for the max
maxcat <- apply(do.call('cbind', area), 1, which.max)
# output a dataframe with x, ymin, ymax, is. max 'dummy', and group
df <- data.frame(x = seq_x_Date,
ymin = do.call('pmin', area),
ymax = do.call('pmax', area),
is.max = levels(group)[maxcat],
group = cumsum(c(1, diff(maxcat) != 0))
)
# convert back numeric dates to column of class Date
df$x <- as.Date(df$x, origin = "1970-01-01")
# create and return the geom_ribbon
gr <- geom_ribbon(data = df, aes(x, ymin = ymin, ymax = ymax, fill = is.max, group = group), inherit.aes = FALSE)
return(gr)
}
Usage:
ggplot(data = df, aes(x = Date, y = value, group = variable, colour = variable)) +
geom_ribbon_date(data = df, group = df$variable) +
theme_bw() +
xlab(NULL) +
ylab(NULL) +
ggtitle("Bonds Versus Stocks (Fake Data!)") +
scale_fill_manual('is.max', breaks = c('Stocks', 'Bonds'),
values = c('darkblue','darkred')) +
theme(legend.position = 'right', legend.direction = 'vertical') +
theme(legend.title = element_blank()) +
theme(legend.key = element_blank())
Result:
While there are related questions and answers on stackoverflow, I haven't found one that was sufficiently detailed for my purpose. Here is a selection of useful exchanges:
create-geom-ribbon-for-min-max-range: Asks a similar question, but provides less detail than I was looking for.
possible-bug-in-geom-ribbon: Closely related, but intermediate steps on how to compute max/min are missing.
fill-region-between-two-loess-smoothed-lines-in-r-with-ggplot: Closely related, but focuses on loess lines. Excellent.
ggplot-colouring-areas-between-density-lines-according-to-relative-position : Closely related, but focuses on densities. This post greatly inspired me.
Perhaps I'm not understanding your full problem but it seems that a fairly direct approach would be to define a third line as the minimum of the two time series at each time point. geom_ribbon is then called twice (once for each unique value of Asset) to plot the ribbons formed by each of the series and the minimum line. Code could look like:
set.seed(123456789)
df <- data.frame(
Date = seq.Date(as.Date("1950-01-01"), by = "1 month", length.out = 12*10),
Stocks = 100 + c(0, cumsum(runif(12*10-1, -30, 30))),
Bonds = 100 + c(0, cumsum(runif(12*10-1, -5, 5))))
library(reshape2)
library(ggplot2)
df <- cbind(df,min_line=pmin(df[,2],df[,3]) )
df <- melt(df, id.vars=c("Date","min_line"), variable.name="Assets", value.name="Prices")
sp <- ggplot(data=df, aes(x=Date, fill=Assets))
sp <- sp + geom_ribbon(aes(ymax=Prices, ymin=min_line))
sp <- sp + scale_fill_manual(values=c(Stocks="darkred", Bonds="darkblue"))
sp <- sp + ggtitle("Bonds Versus Stocks (Fake Data!)")
plot(sp)
This produces following chart:
I actually had the same question some time ago and here is the related post. It defines a function finding the intersections between two lines and an other function which takes a dataframe in input and then colors the space between the two columns using matplotand polygon
EDIT
Here is the code, modified a bit to allow the last polygon to be plotted
set.seed(123456789)
dat <- data.frame(
Date = seq.Date(as.Date("1950-01-01"), by = "1 month", length.out = 12*10),
Stocks = 100 + c(0, cumsum(runif(12*10-1, -30, 30))),
Bonds = 100 + c(0, cumsum(runif(12*10-1, -5, 5))))
intersects <- function(x1, x2) {
seg1 <- which(!!diff(x1 > x2)) # location of first point in crossing segments
above <- x2[seg1] > x1[seg1] # which curve is above prior to crossing
slope1 <- x1[seg1+1] - x1[seg1]
slope2 <- x2[seg1+1] - x2[seg1]
x <- seg1 + ((x2[seg1] - x1[seg1]) / (slope1 - slope2))
y <- x1[seg1] + slope1*(x - seg1)
data.frame(x=x, y=y, pindex=seg1, pabove=(1:2)[above+1L])
# pabove is greater curve prior to crossing
}
fillColor <- function(data, addLines=TRUE) {
## Find points of intersections
ints <- intersects(data[,2], data[,3]) # because the first column is for Dates
intervals <- findInterval(1:nrow(data), c(0, ints$x))
## Make plot
matplot(data, type="n", col=2:3, lty=1, lwd=4,xaxt='n',xlab='Date')
axis(1,at=seq(1,dim(data)[1],length.out=12),
labels=data[,1][seq(1,dim(data)[1],length.out=12)])
legend("topright", c(colnames(data)[2], colnames(data)[3]), col=3:2, lty=1, lwd=2)
## Draw the polygons
for (i in seq_along(table(intervals))) {
xstart <- ifelse(i == 1, 0, ints$x[i-1])
ystart <- ifelse(i == 1, data[1,2], ints$y[i-1])
xend <- ints$x[i]
yend <- ints$y[i]
x <- seq(nrow(data))[intervals == i]
polygon(c(xstart, x, xend, rev(x)), c(ystart, data[x,2], yend, rev(data[x,3])),
col=ints$pabove[i]%%2+2)
}
# add end of plot
xstart <- ints[dim(ints)[1],1]
ystart <- ints[dim(ints)[1],2]
xend <- nrow(data)
yend <- data[dim(data)[1],2]
x <- seq(nrow(data))[intervals == max(intervals)]
polygon(c(xstart, x, xend, rev(x)), c(ystart, data[x,2], yend, rev(data[x,3])),
col=ints[dim(ints)[1]-1,4]%%2+2)
## Add lines for curves
if (addLines)
invisible(lapply(1:2, function(x) lines(seq(nrow(data)), data[,x], col=x%%2+2, lwd=2)))
}
## Plot the data
fillColor(dat,FALSE)
and the final result is this (with the same data used for the question)
#walts answer should remain the winner but while implementing his solution, I gave it a tidy update.
library(tidyverse)
set.seed(2345)
# fake data
raw_data <-
tibble(
date = as.Date("2020-01-01") + (1:40),
a = 95 + cumsum(runif(40, min = -20, max = 20)),
b = 55 + cumsum(runif(40, min = -1, max = 1))
)
# the steps
# the 'y' + 'min_line' + 'group' is the right granularity (by date) to
# create 2 separate ribbons
df <-
raw_data %>%
# find min of the two columns
mutate(min_line = pmin(a, b)) %>%
pivot_longer(c(a, b), names_to = "group", values_to = "y") %>%
print()
# the result
ggplot(data = df, aes(x = date, fill = group)) +
geom_ribbon(aes(ymax = y, ymin = min_line)) +
theme_classic()
another option using ggh4x - requires the data to be wide with y for lines 1 and 2 in different columns.
library(ggh4x)
#> Loading required package: ggplot2
set.seed(123456789)
df <- data.frame(
Date = seq.Date(as.Date("1950-01-01"), by = "1 month", length.out = 12*10),
Stocks = 100 + c(0, cumsum(runif(12*10-1, -30, 30))),
Bonds = 100 + c(0, cumsum(runif(12*10-1, -5, 5))))
## The data frame is NOT made long!!
ggplot(data = df, aes(x = Date)) +
stat_difference(aes(ymin = Stocks, ymax = Bonds)) +
scale_fill_brewer(palette = "Set1")
Created on 2022-11-24 with reprex v2.0.2