ggridges with time series - R - r

I have a DF and I wanted to do a density graph with geom_density_ridges from ggridges, but, it's returning the same line in all states. What I'm doing wrong?
I would like to add trim = TRUE like in here, but it returns the following error message:
Ignoring unknown parameters: trim
My code:
library(tidyverse)
library(ggridges)
url <- httr::GET("https://xx9p7hp1p7.execute-api.us-east-1.amazonaws.com/prod/PortalGeral",
httr::add_headers("X-Parse-Application-Id" =
"unAFkcaNDeXajurGB7LChj8SgQYS2ptm")) %>%
httr::content() %>%
'[['("results") %>%
'[['(1) %>%
'[['("arquivo") %>%
'[['("url")
data <- openxlsx::read.xlsx(url) %>%
filter(is.na(municipio), is.na(codmun)) %>%
mutate_at(vars(contains(c("Acumulado", "Novos", "novos"))), ~ as.numeric(.))
data[,8] <- openxlsx::convertToDate(data[,8])
data <- data %>%
mutate(mortalidade = obitosAcumulado / casosAcumulado,
date = data) %>%
select(-data)
ggplot(data = data, aes(x = date, y = estado, heights = casosNovos)) +
geom_density_ridges(trim = TRUE)

You are probably not looking for density ridges but regular ridgelines.
There are a few choices to make in terms of normalisation. If you want to resemble densities, you can devide each group by their sum: height = casosNovos / sum(casosNovos). Next, you can decide that you want each ridge to be scaled to fit in between the lines, which you can do with the scales::rescale() function. It's your decision whether you want to do this per group or for the entire data. I chose the entire data below.
library(tidyverse)
library(ggridges)
url <- httr::GET("https://xx9p7hp1p7.execute-api.us-east-1.amazonaws.com/prod/PortalGeral",
httr::add_headers("X-Parse-Application-Id" =
"unAFkcaNDeXajurGB7LChj8SgQYS2ptm")) %>%
httr::content() %>%
'[['("results") %>%
'[['(1) %>%
'[['("arquivo") %>%
'[['("url")
data <- openxlsx::read.xlsx(url) %>%
filter(is.na(municipio), is.na(codmun)) %>%
mutate_at(vars(contains(c("Acumulado", "Novos", "novos"))), ~ as.numeric(.))
data[,8] <- openxlsx::convertToDate(data[,8])
data <- data %>%
mutate(mortalidade = obitosAcumulado / casosAcumulado,
date = data) %>%
select(-data) %>%
group_by(estado) %>%
mutate(height = casosNovos / sum(casosNovos))
ggplot(data = data[!is.na(data$estado),],
aes(x = date, y = estado, height = scales::rescale(height))) +
geom_ridgeline()

Related

Cumulative hazard curve fitting

How technically it possible to prolong the cumulative hazard curves until day 80 if in my original data I have follow-up time until 50 day? The cumulative results estimates will remain the same just that both lines are the same until day 80.
I used to create survival object object
surv = survfit(Surv(Tstart, Tstop, outcome==1)~T, data = data.long, ctype=1, id=id)
and then created a plot:
palette = c("#FF9E29", "#86AA00"),
risk.table = FALSE,
ylim=c(0,2),
xlim=c(0,70),
fun = "cumhaz")
You can transform the fit into a tibble for manual plotting using ggplot. By adding new rows at the maximal time point with the maximal value, geom_step will be extended as desired:
library(tidyverse)
library(survival)
fit <- survfit(Surv(time, status) ~ sex, data = lung)
max_time <- 3000
data <-
tibble(
cumhaz = fit$cumhaz,
stratum = {
fit$strata %>%
as.numeric() %>%
enframe() %>%
mutate(vec = name %>% map2(value, ~ rep(.x, .y))) %>%
pull(vec) %>%
simplify()
},
time = fit$time
)
data %>%
bind_rows(
data %>% group_by(stratum) %>% summarise(cumhaz = max(cumhaz), time = max_time)
) %>%
mutate(stratum = stratum %>% factor()) %>%
ggplot(aes(time, cumhaz, color = stratum)) +
geom_step() +
scale_x_continuous(limits = c(0, max_time))
Created on 2022-04-14 by the reprex package (v2.0.0)

Plotting the average of multiple time series objects and illustrating the error from that plot

Consider dat created here:
set.seed(123)
ID = factor(letters[seq(6)])
time = c(100, 102, 120, 105, 109, 130)
dat <- data.frame(ID = rep(ID,time), Time = sequence(time))
dat$group <- rep(c("GroupA","GroupB"), c(322,344))
dat$values <- sample(100, nrow(dat), TRUE)
dat contains time series data for 6 individuals (6 IDs), which belong to 2 groups (GroupA and GroupB). Assume that we expect the time series within each group to have similar properties. Also note that the time series for each individual is of different length. We essentially want to create an "average" time series plot of each group, which I have done like this:
library(dplyr)
library(ggplot2)
dat %>%
group_by(ID) %>%
mutate(maxtime = max(Time)) %>%
group_by(group) %>%
mutate(maxtime = min(maxtime)) %>%
group_by(group, Time) %>%
summarize(values = mean(values)) %>%
ggplot(aes(Time, values, colour = group))+
geom_line()+
facet_wrap(.~group)
How can we do this same thing, but add the original plots for each individual behind the "average" plots to illustrate the error associated with each "average"? Note that The way I created the "average plot" was by using the length of the ID with the shortest time series from each group, but when the originals are added, I would like to see the whole plots from the originals if possible (so some will be longer than others)
Using a second geom_line you can plot the "raw" data in the background as e.g. grey lines.
set.seed(123)
ID = factor(letters[seq(6)])
time = c(100, 102, 120, 105, 109, 130)
dat <- data.frame(ID = rep(ID,time), Time = sequence(time))
dat$group <- rep(c("GroupA","GroupB"), c(322,344))
dat$values <- sample(100, nrow(dat), TRUE)
library(dplyr)
library(ggplot2)
d <- dat %>%
group_by(ID) %>%
mutate(maxtime = max(Time)) %>%
group_by(group) %>%
mutate(maxtime = min(maxtime)) %>%
group_by(group, Time) %>%
summarize(values = mean(values))
#> `summarise()` regrouping output by 'group' (override with `.groups` argument)
ggplot()+
geom_line(data = dat, aes(Time, values, group = ID), color = "grey80", alpha = .7) +
geom_line(data = d, aes(Time, values, colour = group)) +
facet_wrap(.~group)
Maybe you are looking for a composed plot like this:
library(dplyr)
library(ggplot2)
library(patchwork)
G1 <- dat %>%
group_by(ID) %>%
mutate(maxtime = max(Time)) %>%
group_by(group) %>%
mutate(maxtime = min(maxtime)) %>%
group_by(group, Time) %>%
summarize(values = mean(values)) %>%
ggplot(aes(Time, values, colour = group))+
geom_line()+
facet_wrap(.~group)+
ylab('Mean')
G2 <- dat %>%
group_by(ID) %>%
mutate(maxtime = max(Time)) %>%
group_by(group) %>%
mutate(maxtime = min(maxtime)) %>%
ggplot(aes(Time, values, colour = group))+
geom_line()+
facet_wrap(.~group)+
ylab('Real Values')
#Compose plots
G3 <- G2/G1+plot_layout(guides = "collect")
Output:

How can we data wrangling to obtain shown ratio/proportion chart shown

Goal is to produce a visualization indicating ratio.
Please help us how can we produce such ratio chart (high lighted) in R ?
library(tidyverse)
# Dataset creation
df <- data.frame(cls = c(rep("A",4),rep("B",4)),
grd = c("A1",rep("A2",3),rep(c("B1","B2"), 2)),
typ = c(rep("m",2),rep("o",2),"m","n",rep("p",2)),
pnts = c(rep(1:4,2)))
df
#### Data wrangling
df1 <- df %>%
group_by(cls) %>%
summarise(cls_pct = sum(pnts))
df1
df2 <- df %>%
group_by(cls,grd) %>%
summarize(grd_pct = sum(pnts))
df2
df3 <- df %>%
group_by(cls,grd,typ) %>%
summarise(typ_pct = sum(pnts))
df3
#### Attempt to combine all df1,df2,df3
# but mutate and summarise are mixing up leading to wrong results
df3 %>%
group_by(cls,grd) %>%
mutate(grd_pct = sum(typ_pct)) %>%
group_by(cls) %>%
mutate(cls_pct = sum(grd_pct))
Attempt to visualize all the ratios in 1 chart
data %>%
pivot_longer(cols = -c(cls:pnts),
names_to = "per_cat",
values_to = "percent") %>%
ggplot(aes(cls,percent, col = typ, fill = grd)) +
geom_bar(stat = "identity") +
coord_flip() +
theme_bw()
plot of the same.
EDIT -- added formula version with more useful output for visualization.
ORIG: At this point it may be worth making a function to reduce copying and pasting, but this may get you what you need:
library(tidyverse)
df %>%
group_by(cls) %>%
mutate(per1 = sum(pnts),
per1_pct = per1 / sum(per1)) %>%
group_by(cls, grd) %>%
mutate(per2 = sum(pnts),
per2_pct = per2 / sum(per2)) %>%
group_by(cls, grd, typ) %>%
mutate(per3 = sum(pnts),
per3_pct = per3 / sum(per3)) %>%
ungroup()
EDIT: Here's a general function to calculate the stats for a given grouping, making it easier to combine a few groupings together in long format better suited for visualization.
df_sum <- function(df, level, ...) {
df %>%
group_by(...) %>%
summarize(grp_ttl = sum(pnts)) %>%
mutate(ttl = sum(grp_ttl),
pct = grp_ttl / ttl) %>%
ungroup() %>%
mutate(level = {{ level }} )
}
df_sum(df, level = 1, cls) %>%
bind_rows(df_sum(df, level = 2, cls, grd)) %>%
bind_rows(df_sum(df, level = 3, cls, grd, typ)) %>%
mutate(label = coalesce(as.character(typ), # This grabs the first non-NA
as.character(grd),
as.character(cls))) -> df_summed
df_summed %>%
ggplot(aes(level, grp_ttl)) +
geom_col(color = "white") +
geom_text(aes(label = paste0(label, "\n", grp_ttl, "/", ttl)),
color = "white",
position = position_stack(vjust = 0.5)) +
scale_x_reverse() + # To make level 1 at the top
coord_flip() # To switch from vertical to horizontal orientation

How to add labels and percentages to printable R sunburst

I'm trying to add labels and percentages to each layer within a sunburst chart using R - so it looks like this Sunburst.
I can create a sunburst chart (using this guide) but I can't figure out how to add the labels or percentages. I also want to be able to print the chart with all labels and percentages.
Here's my code so far.
# libraries
library(dplyr)
library(treemap)
library(sunburstR)
library(readxl)
library(vcd)
## Load Arthritis as example
Data <- data.frame(Arthritis)
Data <- Data %>% select(-ID) %>%
mutate(Age=ifelse(Age<50,"Young","Old")) %>% group_by(Treatment,Sex,Improved,Age) %>%
summarise(Count=n()) %>%
mutate(Path=paste(Treatment,Sex,Improved,Age,sep="-")) %>%
ungroup() %>%
select(Path,Count)
sunburst(Data)
Any help would be great.
Thanks.
I suggest the ggsunburst package https://github.com/didacs/ggsunburst
library(ggsunburst)
library(dplyr)
library(vcd) # just for the Arthritis dataset
Data <- data.frame(Arthritis)
# compute percentage using tally
# add column leaf, with format "name->attribute:value"
# ggsunburst considers everything after "->" as attributes
# the attribute "size" is used as the size of the arc
df <- Data %>%
mutate(Age=ifelse(Age<50,"Young","Old")) %>%
group_by(Treatment,Sex,Improved,Age) %>%
tally() %>%
mutate(percentage = n/nrow(Data)*100,
size=paste("->size:",round(percentage,2),sep=""),
leaf=paste(Improved,size,sep = "")) %>%
ungroup() %>%
select(Treatment,Sex,Age,leaf)
# sunburst_data reads from a file so you need to create one
write.table(df, file = 'data.csv', row.names = F, col.names = F, sep = ",")
# specify node_attributes = "size" to add labels with percentages in terminal nodes
sb <- sunburst_data('data.csv', type = "lineage", sep = ',', node_attributes = "size")
# compute percentages for internal nodes
tre <- Data %>%
group_by(Treatment) %>%
tally() %>%
mutate(percent=n/nrow(Data)*100,
name=Treatment) %>%
ungroup() %>%
select(name,percent)
sex <- Data %>%
group_by(Treatment,Sex) %>%
tally() %>%
mutate(percent=n/nrow(Data)*100,
name=Sex) %>%
ungroup() %>%
select(name,percent)
age <- Data %>%
mutate(Age=ifelse(Age<50,"Young","Old")) %>%
group_by(Treatment,Sex,Age) %>%
tally() %>%
mutate(percent=n/nrow(Data)*100,
name=Age) %>%
ungroup() %>%
select(name,percent)
x <- rbind(tre, sex, age)
# the rows in x are in the same order as sb$node_labels, cbind works here only because of that
x <- cbind(sb$node_labels, round(x[,"percent"],2))
percent <- x %>% mutate(name_percent = paste(label,percent,"%"))
sunburst(sb, node_labels.min = 0) +
geom_text(data = sb$leaf_labels, aes(x=x, y=0.1, label=paste(size,"%"), angle=angle, hjust=hjust), size = 2) +
geom_text(data = percent, aes(x=x, y=y, label=name_percent, angle=pangle), size=2)

ggplot2: stacked barplot over different columns

I have the following sample data with three different cost-types and a year-column:
library(tidyverse)
# Sample data
costsA <- sample(100:200,30, replace=T)
costsB <- sample(100:140,30, replace=T)
costsC <- sample(20:20,30, replace=T)
year <- sample(c("2000", "2010", "2030"), 30, replace=T)
df <- data.frame(costsA, costsB, costsC, year)
My goal is to plot these costs in a stacked barplot, so that I can compare the mean-costs between the three year-categories. In order to do so I aggregated the values:
df %>% group_by(year) %>%
summarise(n=n(),
meanA = mean(costsA),
meanB = mean(costsB),
meanC = mean(costsC)) %>%
ggplot( ... ) + geom_bar()
But how can I plot the graph now? In the x-axis there should be the years and in the y-axis the stacked costs.
You have to make the summarise data into a tidy(-ish) format to generate a plot like the one you posted. In a tidy-verse, you'd do that with gather function where you convert multiple columns into two-columns of key-value pairs. For instance, the following code generates the figure below.
df %>% group_by(year) %>%
summarise(n=n(),
meanA = mean(costsA),
meanB = mean(costsB),
meanC = mean(costsC)) %>%
gather("key", "value", - c(year, n)) %>%
ggplot(aes(x = year, y = value, group = key, fill = key)) + geom_col()
With gather("key", "value", - c(year, n)), three columns (costsA, costsB, costsC) are changed to the key-value pairs.

Resources