Cumulative hazard curve fitting - r

How technically it possible to prolong the cumulative hazard curves until day 80 if in my original data I have follow-up time until 50 day? The cumulative results estimates will remain the same just that both lines are the same until day 80.
I used to create survival object object
surv = survfit(Surv(Tstart, Tstop, outcome==1)~T, data = data.long, ctype=1, id=id)
and then created a plot:
palette = c("#FF9E29", "#86AA00"),
risk.table = FALSE,
ylim=c(0,2),
xlim=c(0,70),
fun = "cumhaz")

You can transform the fit into a tibble for manual plotting using ggplot. By adding new rows at the maximal time point with the maximal value, geom_step will be extended as desired:
library(tidyverse)
library(survival)
fit <- survfit(Surv(time, status) ~ sex, data = lung)
max_time <- 3000
data <-
tibble(
cumhaz = fit$cumhaz,
stratum = {
fit$strata %>%
as.numeric() %>%
enframe() %>%
mutate(vec = name %>% map2(value, ~ rep(.x, .y))) %>%
pull(vec) %>%
simplify()
},
time = fit$time
)
data %>%
bind_rows(
data %>% group_by(stratum) %>% summarise(cumhaz = max(cumhaz), time = max_time)
) %>%
mutate(stratum = stratum %>% factor()) %>%
ggplot(aes(time, cumhaz, color = stratum)) +
geom_step() +
scale_x_continuous(limits = c(0, max_time))
Created on 2022-04-14 by the reprex package (v2.0.0)

Related

How can I select one plot from the partial dependence plot in R?

Here is the code for the partial dependence plot. I use the example data for this. First of all, I made a random forest model. Then I made a partial dependence plot.
rm(list = ls())
library(tidyverse)
library(mlbench)
library(randomForest)
library(caret)
library(edarf)
data("Sonar")
df<-Sonar
rm(Sonar)
# Clean up variable names (becuz I'm a bit OCD)
df <- df %>% rename(V01 = V1, V02 = V2, V03 = V3, V04 = V4,
V05 = V5, V06 = V6, V07 = V7, V08 = V8,
V09 = V9)
# Get minimum class frequency
min <- min(table(df$Class))
set.seed(223)
df_rf <- df %>% na.omit()
fit_rf <- randomForest(data = df_rf,
Class ~ .,
ntree = 500,
importance = TRUE,
sampsize = c(min, min))
# Add predicted values to data frame
df_rf <- df_rf %>%
mutate(predicted = predict(fit_rf))
# Get performance measures
confusionMatrix(df_rf$predicted, df_rf$Class, positive = "R")
# Get variable importance measures
imp_df <- data.frame(importance(fit_rf, scale = FALSE, type = 1))
# Tidy up and sort the data frame
imp_df <- imp_df %>%
mutate(names = rownames(imp_df)) %>%
arrange(desc(MeanDecreaseAccuracy))
# Save top predictor names as character vector
nm <- as.character(imp_df$names)[1:10]
# Get partial depedence values for top predictors
pd_df <- partial_dependence(fit = fit_rf,
vars = nm,
data = df_rf,
n = c(100, 200))
# Plot partial dependence using edarf
plot_pd(pd_df)
Then I got the result as follows.
I successfully got the multiple images that combined as one big plot. However, I need to select any one of these plots. Is there any way I can try?
You could use the dataframe your pd_df where you first have to make it a longer format by the columns M and R to visualize it in ggplot by a variable you want like this with example of V11:
library(ggplot2)
library(tidyr)
library(dplyr)
pd_df %>%
pivot_longer(cols = c(M, R)) %>%
ggplot(aes(x = V11, y = value, color = name)) +
geom_line() +
geom_point() +
labs(x = "value", y = "prediction")
Created on 2023-01-09 with reprex v2.0.2
You can replace V11 with other variables like you want.

Equal sign changes rendering of legend labels in autoplot of a survfit object

I am using the survival package to make Kaplan-Mayer estimates of survival curves by group and then I plot out the said curves using packages ggfortify and survminer. All works fine except the legend labels for plotting. I want to present N sizes of groups in the legend labels. I thought that adding the N size to the grouping variable itself using paste0 was a good way to go. In my case it is easier than to use something like scale_fill_discrete("", labels = legend_labeller_for_plot).
library(dplyr)
library(ggplot2)
library(survival)
library(survminer)
library(ggfortify)
set.seed = 100
data <- data.frame(
time = rlnorm(20),
event = as.integer(runif(20) < 0.5),
group = ifelse(runif(20) > 0.5,
"group A",
"group B")
)
# Plotting survival curves without N sizes in the legend
fit <- survfit(
with(data, Surv(time, event)) ~ group,
data)
autoplot(fit)
# Adding N sizes to the data and plotting
data_new <- data %>%
group_by(group) %>% mutate(N = n()) %>%
ungroup() %>%
mutate(group_with_N = paste0(group, ", N = ", N))
fit_new <- survfit(
with(data, Surv(time, event)) ~ group_with_N,
data_new)
autoplot(fit_new)
When I try to add N sizes to the groups variable, the part with "N =" in the grouping variable disappears, i.e. the group variable isn't displayed on the legend labels as expected.
For comparison, what I expect is something like the following using Iris data:
What is more, I found that that the culprit is the equali sign =. When I remove the = sign, the legend labels correspond to the grouping variable values.
My question is, why does the equal sign cause this?
An option could be using ggsurvplot where you can specify the legend.labs so you can show your size in the legend like this:
library(dplyr)
library(ggplot2)
library(survival)
library(survminer)
library(ggfortify)
set.seed = 100
data <- data.frame(
time = rlnorm(20),
event = as.integer(runif(20) < 0.5),
group = ifelse(runif(20) > 0.5,
"group A",
"group B")
)
# Adding N sizes to the data and plotting
data_new <- data %>%
group_by(group) %>% mutate(N = n()) %>%
ungroup() %>%
mutate(group_with_N = paste0(group, ", N = ", N))
fit_new <- survfit(
with(data, Surv(time, event)) ~ group_with_N,
data_new)
p <- autoplot(fit_new)
p
# ggsurvplot
ggsurvplot(fit_new, data_new,
legend.labs = unique(sort(data_new$group_with_N)),
conf.int = TRUE)
Created on 2022-08-18 with reprex v2.0.2

Dynamic legend labels for ggsurvplot

I have a general question related to dynamic labels and titles for ggsurvplot.
Is it possible to provide dynamic legend labels and legend title to the ggsurvplot function when using it in nested list using map from package purrr?
The dataset I used is the NCCTG Lung Cancer Data lung{survival} and slightly adapted include ECOG scores below 3
The code below dynamically creates survival plots for each of the prognostic factors (sex and ECOG score, defined in the varnames). However, I can't figure out how to add dynamic title,legend.labs and legend.title to the arguments within the ggsurvplot function as they need to be provided as a character vector that should change with each varname (sex and ph.ecog).
Any help would be greatly appreciated.
Thanks!
HTJ
varnames <- c("sex", "ph.ecog")
lung <- lung %>% filter(ph.ecog<3)
res.tabs <- lung %>%
gather(key=VAR, value, varnames) %>%
nest( data = -VAR) %>% mutate(
fit = map(data, ~ coxph(Surv(time, status) ~ value, data = .x)),
survfit = map(data, ~ survfit(Surv(time, status) ~ value , data = .x, conf.type ="log")),
plots = ggsurvplot(survfit, data, conf.int = TRUE, title = "???", legend.labs = "???", legend.title = "???"))

ggridges with time series - R

I have a DF and I wanted to do a density graph with geom_density_ridges from ggridges, but, it's returning the same line in all states. What I'm doing wrong?
I would like to add trim = TRUE like in here, but it returns the following error message:
Ignoring unknown parameters: trim
My code:
library(tidyverse)
library(ggridges)
url <- httr::GET("https://xx9p7hp1p7.execute-api.us-east-1.amazonaws.com/prod/PortalGeral",
httr::add_headers("X-Parse-Application-Id" =
"unAFkcaNDeXajurGB7LChj8SgQYS2ptm")) %>%
httr::content() %>%
'[['("results") %>%
'[['(1) %>%
'[['("arquivo") %>%
'[['("url")
data <- openxlsx::read.xlsx(url) %>%
filter(is.na(municipio), is.na(codmun)) %>%
mutate_at(vars(contains(c("Acumulado", "Novos", "novos"))), ~ as.numeric(.))
data[,8] <- openxlsx::convertToDate(data[,8])
data <- data %>%
mutate(mortalidade = obitosAcumulado / casosAcumulado,
date = data) %>%
select(-data)
ggplot(data = data, aes(x = date, y = estado, heights = casosNovos)) +
geom_density_ridges(trim = TRUE)
You are probably not looking for density ridges but regular ridgelines.
There are a few choices to make in terms of normalisation. If you want to resemble densities, you can devide each group by their sum: height = casosNovos / sum(casosNovos). Next, you can decide that you want each ridge to be scaled to fit in between the lines, which you can do with the scales::rescale() function. It's your decision whether you want to do this per group or for the entire data. I chose the entire data below.
library(tidyverse)
library(ggridges)
url <- httr::GET("https://xx9p7hp1p7.execute-api.us-east-1.amazonaws.com/prod/PortalGeral",
httr::add_headers("X-Parse-Application-Id" =
"unAFkcaNDeXajurGB7LChj8SgQYS2ptm")) %>%
httr::content() %>%
'[['("results") %>%
'[['(1) %>%
'[['("arquivo") %>%
'[['("url")
data <- openxlsx::read.xlsx(url) %>%
filter(is.na(municipio), is.na(codmun)) %>%
mutate_at(vars(contains(c("Acumulado", "Novos", "novos"))), ~ as.numeric(.))
data[,8] <- openxlsx::convertToDate(data[,8])
data <- data %>%
mutate(mortalidade = obitosAcumulado / casosAcumulado,
date = data) %>%
select(-data) %>%
group_by(estado) %>%
mutate(height = casosNovos / sum(casosNovos))
ggplot(data = data[!is.na(data$estado),],
aes(x = date, y = estado, height = scales::rescale(height))) +
geom_ridgeline()

Select variables in one data frame to plot from another

I have large data sets where I am doing exploratory screenings for correlations. I want to do a correlation test to identify significantly related variables, and then plot these variables against each other.
data <- data.frame(a = 1:10, b = c(1.5*(1:9), 10), c = 2*(1:10), d = sample(1:5, 10, replace = T))
cor_data <- corr.test(data)
sig_cor <- ifelse(cor_data$p <0.05, cor_data$r, NA)
sig_cor_long <- sig_cor %>%
data.frame() %>%
mutate(var1 = rownames(sig_cor)) %>%
gather(var2, value = r, -var1) %>%
drop_na(r) %>%
filter(r != 1)
This identifies pairs a-b and b-c as significantly correlated, so I want to plot those. How can I automate this process of selecting the paired variables from sig_cor_long to plot via ggplot from data? An example plot that I want to create for each correlated pair would be:
ggplot(data, aes(a, b)) +
geom_smooth(method = 'lm')+
geom_point(shape = 21, color = 'darkblue', fill = 'white')
I want to have a function to input into ggplot to tell it to plot all the var1 and var2 pairs identified in sig_cor_long for which the raw data are in data.
Ok so, this is one way of plotting e.g. all the plots where there is significant correlation (in a list, so you could do anything with them)
do.call(gridExtra::grid.arrange,
ifelse(cor_data$p <0.05, cor_data$r, NA) %>%
as.data.frame() %>%
rownames_to_column() %>%
gather(pair, val, -rowname) %>%
drop_na() %>%
filter(val != 1) %$%
map2(rowname, pair, ~ggplot() + geom_smooth(method = "lm", aes(data[, .x], data[, .y])) + geom_point(aes(data[, .x], data[, .y])))
)

Resources