Fitting age adjusted model in R - r

I would like to fit a logistic regression model in R but adjust for age only. How can this be done? I know there is a way to fit univariable logistic model using gtsummary tbl_uvregression function. Can we tweak this?
library(tidyverse)
library(gtsummary)
set.seed(100)
trialdata <- tibble(
outcome = factor(rbinom(1000, 1, 0.10),
labels = c("No", "Yes")),
age = floor(rnorm(1000, 16, 3)),
score = floor(runif(1000, 30, 99)),
school = factor(rbinom(1000, 1, 0.35),
labels = c("No", "Yes")),
education = factor(rbinom(1000, 2, 0.20),
labels = c("Primary", "Secondary", "Tertiary"))
)
tbl_uv_ex1 <-
trialdata %>%
tbl_uvregression(
method = glm,
y = outcome,
method.args = list(family = binomial),
exponentiate = TRUE
)
tbl_uv_ex1

Related

Is there a way to report prevalence ratio using gtsummary?

How would I report PR instead of OR?
library(gtsummary)
library(dplyr)
set.seed(2022)
trial_data <- tibble(
"outcome" = factor(rbinom(1000, 1, 0.20),
labels = c("No", "Yes")),
"var1" = factor(rbinom(1000, 2, 0.25),
labels = c("Low", "Middle", "High")),
"var2" = factor(rbinom(1000, 2, 0.20),
labels = c("Primary", "Secondary", "Tertiary")),
"var3" = factor(rbinom(1000, 1, 0.10),
labels = c("No", "Yes")),
"var4" = round(rnorm(1000, 20, 5)),
)
trial_data %>% count(var3)
# Logistic regression with odds ratio
trial_data %>%
tbl_uvregression(
method = glm,
y = "outcome",
method.args = list(family = binomial),
exponentiate = TRUE,
pvalue_fun = function(x) style_pvalue(x, digits = 2)
)

Survival Analysis in R -- why it is always a straight line

I am encountering a problem while trying to generate a survival curve. I don't know and understand why the curve which represents the survival probability of each year is sometimes a straight line. Ideally, it should be an up-and-down line, eventually lying around 55%.
I have included codes below, and please kindly refer to them. I have also attached a sample wrong plot to understand my description. Hopefully, it can help you understand what I am describing.
Any suggestions will be very appreciated. :)
Plot:
Codes:
library(survival)
library(survminer)
library(dplyr)
library(ggplot2)
library(readxl)
library(tidyverse)
data_all <- data.frame(Years_Diff_Surg_Death = c(8.919917864,
8.895277207, 8.881587953, 8.821355236, 8.728268309, 8.709103354), Survival = c(1L, 0L, 1L, 1L, 1L, 1L))
data_all <- data.frame(Years_Diff_Surg_Death = c(8.919917864,
8.895277207, 8.881587953, 8.821355236, 8.728268309, 8.709103354), Survival = c(1L, 0L, 1L, 1L, 1L, 1L))
data_2013 <- data.frame(Years_Diff_Surg_Death = c("36.99383984", "2.584531143", "36.91991786", "36.89527721", "36.88158795", "36.82135524"), YEARS_OF_SURGERY = c("2013","2013","2013","2013","2013","2013"), Survival = c("1","0", "1", "1", "1", "1"))
data_2014 <- data.frame(Years_Diff_Surg_Death = c(0.542094456, 5.196440794, 35.95619439, 35.91786448, 35.86584531, 35.8275154), YEARS_OF_SURGERY = c(2014, 2014, 2015, 2014, 2014, 2014, 2016), Survival = c(0, 0, 1, 1, 1, 1))
data_2015 <- data.frame(Years_Diff_Surg_Death = c(34.4476386, 34.25598905,0.621492129, 34.38740589, 34.33264887, 1.081451061), YEARS_OF_SURGERY = c(2015, 2015, 2015, 2015, 2015, 2015), Survival = c(1, 1, 0, 1, 1, 0))
data_2016 <- data.frame(Years_Diff_Surg_Death = c(2.902121834, 0.950034223, 33.9301848, 33.91101985, 33.87268994, 33.85352498), YEARS_OF_SURGERY = c(2016,2016,2016, 2016, 2016, 2016), Survival = c(0, 0, 1, 1, 1, 1))
data_2017 <- data.frame(Years_Diff_Surg_Death = c(32.99110198, 3.348391513, 32.95277207,32.91170431, 32.87611225, 0.791238877), YEARS_OF_SURGERY = c(2017, 2017, 2017, 2017, 2017, 2017), Survival = c(1, 0, 1, 1, 1, 0))
fit_all <- survfit(Surv(Years_Diff_Surg_Death, Survival) ~ 1, data = data_all)
fit_2013 <- survfit(Surv(Years_Diff_Surg_Death, Survival) ~ YEARS_OF_SURGERY, data = data_2013)
fit_2014 <- survfit(Surv(Years_Diff_Surg_Death, Survival) ~ YEARS_OF_SURGERY, data = data_2014)
fit_2015 <- survfit(Surv(Years_Diff_Surg_Death, Survival) ~ YEARS_OF_SURGERY, data = data_2015)
fit_2016 <- survfit(Surv(Years_Diff_Surg_Death, Survival) ~ YEARS_OF_SURGERY, data = data_2016)
fit_2017 <- survfit(Surv(Years_Diff_Surg_Death, Survival) ~ YEARS_OF_SURVERY, data = data_2017)
fit_comb <- list(s_2013 = fit_2013,
s_2014 = fit_2014,
s_2015 = fit_2015,
s_2016 = fit_2016,
s_2017 = fit_2017,
s_all= fit_all)
ggsurvplot(fit_all, conf.int = TRUE,
ylim = c(0,1),
xlim = c(0,5),
break.x.by = 1,
title = "Years of Death After Surgery via Survival",
xlab = ("Years"),
legend = "none")
ggsurvplot(fit_2013, conf.int = TRUE,
ylim = c(0,1),
xlim = c(0,5),
break.x.by = 1,
title = ("Years of Death After Surgery via Survival"),
xlab = ("Years"),
legend = "none",
risk.table = F)
ggsurvplot(fit_2014, conf.int = TRUE,
ylim = c(0,1),
xlim = c(0,5),
break.x.by = 1,
title = ("Years of Death After Surgery via Survival"),
xlab = ("Years"),
legend = "none",
risk.table = F)
ggsurvplot(fit_2015, conf.int = TRUE,
ylim = c(0,1),
xlim = c(0,5),
break.x.by = 1,
title = ("Years of Death After Surgery via Survival"),
xlab = ("Years"),
legend = "none",
risk.table = F)
ggsurvplot(fit_2016, conf.int = TRUE,
ylim = c(0,1),
xlim = c(0,5),
break.x.by = 1,
title = ("Years of Death After Surgery via Survival"),
xlab = ("Years"),
legend = "none",
risk.table = F)
ggsurvplot(fit_2017, conf.int = TRUE,
ylim = c(0,1),
xlim = c(0,5),
break.x.by = 1,
title = ("Years of Death After Surgery via Survival"),
xlab = ("Years"),
legend = "none",
risk.table = F)
ggsurvplot_combine(fit_comb,
data_ECV,
xlab = ("Years"),
xlim = c(0,5),
break.x.by = 1)

How to overlay prediction intervals over confidence intervals using the 'forestplot' function in R

I want to overlay prediction intervals over confidence intervals in a non-overall forest plot to compare the width between these intervals. The example of the desired forest plot is as follows:
However, I can't seem to find the right function with the 'forestplot' package. Does anyone has any clue as of how to create the analysis in R?
Update:
To illustrate, I made a dummy dataset as follows:
names <- c("Variable", "Diabetes", "Hypertension", "Cancer", "Asthma")
n <- c("N", 5, 5, 5, 5)
coef <- c(1.10, 1.05, 1.20, 1.25)
ci.low <- c(1.05, 1.02, 1.18, 1.21)
ci.high <- c(1.12, 1.07, 1.24, 1.29)
pi.low <- c(0.80, 0.99, 0.97, 0.95)
pi.high <- c(1.45, 1.30, 1.44, 1.66)
boxsize <- c(0.2,0.2,0.2,0.2)
test_data <- data.frame(coef=coef, low=ci.low,high=ci.high, boxsize=boxsize)
test_data <- rbind(NA, test_data)
row_names <- cbind(names, n, c("OR [95% CI]", "1.10 [1.05-1.12]", "1.05 [1.02-1.07]", "1.20 [1.18-1.24]", "1.25 [1.21-1.29]"))
#Forest plot
forestplot(labeltext = row_names,
mean = test_data$coef, upper = test_data$high,
lower = test_data$low,
is.summary=c(TRUE, FALSE, FALSE, FALSE, FALSE, FALSE),
boxsize = test_data$boxsize,
zero = 1,
xlog = FALSE,
xlab = "OR (95% CI)",
clip = c(0,2),
col = fpColors(lines="black", box="black"),
colgap = unit(0.03,'npc'),
hrzl_lines= TRUE,
lineheight = unit(1.1,"cm"),
graphwidth = "auto",
txt_gp=fpTxtGp(label = gpar(cex = 0.8),
title = gpar(cex = 1),
ticks = gpar(cex = 0.7),
xlab = gpar(cex = 0.7)))
Which returns this figure:
However, I still can't seem to find the appropriate function to overlay the prediction intervals over the confidence intervals. Thank you very much in advance.

calculate r square (Multiple determination coefficient) using GBM in R

I perform GBM models.
Data sample
a=structure(list(yield = c(1842L, 2147L, 2444L, 3850L, 1866L, 2897L,
1783L, 2434L, 2650L, 2863L), time.diff = c(122, 186, 177, 168,
162, 186, 161, 125, 187, 185), Biomass = c(18400L, 6400L, 8620L,
12800L, 5400L, 10400L, 6000L, 8800L, 9080L, 60000L)), class = "data.frame", row.names = c(NA,
-10L))
my code
indexes = createDataPartition(a$yield, p = .7, list = F)
train = a[indexes, ]
test = a[-indexes, ]
write.csv(test,"test.csv")
ames_train <- train
ames_test <- test
str(ames_train)
# train GBM model
gbm.fit <- gbm(
formula = yield ~ .,
distribution = "gaussian",
data = ames_train,
n.trees = 10000,
interaction.depth = 1,
shrinkage = 0.001,
cv.folds = 5,
n.cores = NULL, # will use all cores by default
verbose = FALSE
)
# print results
print(gbm.fit)
# get MSE and compute RMSE
sqrt(min(gbm.fit$cv.error))
Here indicated MSE and RMSE
How can i calculate r square (Multiple determination coefficient) for this model?
This way you get the r2 with predictions obtained by cross-validation, which are true predictions.
r <- as.numeric(gbm.fit$cv.statistics[3])
rsq = round(r^2,2)

Negative adjusted R-squared of fixed effect plm function

I am currently writing my thesis and when I use the plm (from package plm) function (pooled, within or random) I get a very low R-squared and negative adjusted R-squared. However, when I run the lm() function with dummy variables, the adjusted R-squared is around 0.4. What am I doing wrong?
This is the first three rows of my data:
structure(list(table.id = c("alameda, ca, 2003", "alameda, ca, 2004",
"alameda, ca, 2005"), location = c("Alameda, CA", "Alameda, CA",
"Alameda, CA"), year = c(2003, 2004, 2005), search.fund = c(0,
0, 0), search.fund.binary = c(0, 0, 0), time.avg = c(0, 0, 0),
distance.avg = c(0, 0, 0), avg.income.capita = c(40266, 41973,
43594), real.gdp = c(86355025, 88443534, 90705419), unemployment = c(6.8,
5.9, 5.1), education.rate = c(34.9, 34.9, 34.9), urban.id = c(1,
1, 1), no.establishments = c(46548.75, 46623.5, 46254.25),
no.building.permit = c(14828, 15239, 14883), population.size = c(1454163,
1445721, 1441545), no.establishments.capita = c(0.0320106824338124,
0.0322493067472908, 0.0320865807172166), no.building.permit.capita = c(0.0101969311555857,
0.0105407613225512, 0.0103243395107333)), row.names = c(NA,
3L), class = "data.frame")
This is my lm model:
sf.lm.fe.nb <- lm(search.fund ~ education.rate + unemployment +
urban.id + no.establishments.capita + no.building.permit.capita +
factor(location) + factor(year), data = df)
summary(sf.lm.fe.nb)
and this is my plm model:
sf.plm.fe.nb <- plm(search.fund ~ education.rate + unemployment +
urban.id + no.establishments.capita + no.building.permit.capita,
data = df, model = "within", effect = "twoways",
index = c("location", "year"))
summary(sf.plm.fe.nb)
Both use the same data sets

Resources