Loop glm for every column in R dataset - r

I have a dataset of 100 patients (7 are shown here), 2 covariates and 50 phenotypes(5 are shown here). I want to perform a multivariable logistic regression for each phenotype using Covariate1 and Covariate2 as covariates to predict the Outcome, I would like to get a table like this, where I have the p-value, OR and confidence interval(CI)per each of the covariates.
I tried:
for (i in df) {
print(i)
model <-glm(Outcome~ x[i] +Covariate1 +Covariate2, family = binomial(link = "logit"), data=df)
I also tried the solution for this question. But x and y a reversed in my question, so it did not work:
R: automate table for results of several multivariable logistic regressions
Thanks very much for your help!
This is an example dataset
df<-structure(list(ID = c(1, 2, 3, 4, 5, 6, 7), Outcome = c(0, 0,
1, 1, 0, 1, 0), Covariate1 = c(1, 2, 3, 4, 5, 6, 7), Covariate2 = c(0,
0, 0, 1, 1, 1, 1), P1 = c(1, 0, 0, 1, 1, 1, 2), P2 = c(0, 2,
0, 1, 1, 1, 1), P3 = c(0, 0, 0, 1, 1, 1, 1), P4 = c(0, 0, 0,
1, 2, 1, 1), P5 = c(0, 0, 0, 1, 1, 1, 2)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -7L))

if I understood correctly
df <- structure(
list(
ID = c(1, 2, 3, 4, 5, 6, 7),
Outcome = c(0, 0, 1, 1, 0, 1, 0),
Covariate1 = c(1, 2, 3, 4, 5, 6, 7),
Covariate2 = c(0, 0, 0, 1, 1, 1, 1),
P1 = c(1, 0, 0, 1, 1, 1, 2),
P2 = c(0, 2, 0, 1, 1, 1, 1),
P3 = c(0, 0, 0, 1, 1, 1, 1),
P4 = c(0, 0, 0, 1, 2, 1, 1),
P5 = c(0, 0, 0, 1, 1, 1, 2)
),
class = c("tbl_df",
"tbl", "data.frame"),
row.names = c(NA,-7L)
)
library(tidyverse)
first_tables <- map(
.x = select(df, starts_with("P")),
.f = ~ glm(
Outcome ~ .x + Covariate1 + Covariate2,
family = binomial(link = "logit"),
data = df
)
) %>%
map(broom::tidy)
map_df(
.x = first_tables,
.f = ~ .x %>% mutate(
p = p.value,
OR = exp(estimate),
CI5 = exp(estimate - 1.96 * std.error),
CI95 = exp(estimate + 1.96 * std.error),
.keep = "unused"
) %>%
select(-statistic),
.id = "phenotype"
) %>%
filter(term == ".x") %>%
select(-term)
#> # A tibble: 5 x 5
#> phenotype p OR CI5 CI95
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 P1 0.997 5.84e-10 0 Inf
#> 2 P2 0.996 1.53e- 4 0 Inf
#> 3 P3 0.824 2.00e+ 0 0.00442 904.
#> 4 P4 0.998 3.66e- 9 0 Inf
#> 5 P5 0.997 2.72e-10 0 Inf
Created on 2023-01-11 with reprex v2.0.2

Related

Odds ratio for 2*5 Table [duplicate]

I have the following dataframe:
structure(list(id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15), var1 = c(1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
1, 1), var2 = c(0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1),
var3 = c(1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1), var4 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1), outcome = c(1,
1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1)), row.names = c(NA,
-15L), class = c("tbl_df", "tbl", "data.frame"))
I would like to arrange a script to calculate all possible odds ratio (using chi square), with 95% CI and p values, between all columns and the column outcome.
How can I do that?
I installed epitools but it seems that I need a 2x2 contingency table and I am not able to apply the function to columns of a dataframe
With mapply, you can use the fisher.test function, which doesn't fail when the odds ratio cannot be calculated.
mapply(fisher.test, x=data[, grep("var", names(data))], y=data[,"outcome"])
But the output is a 7x4 matrix which cannot be tidied into a nice format. However, we can use lapply to perform Fisher's test for each column and then tidy the results with the broom package.
library(broom)
cols <- df1[,grep("var", names(df1))]
res_list <- lapply(as.list(cols), function(x) fisher.test(x, y=df1$outcome))
do.call(rbind, lapply(res_list, broom::tidy))
# A tibble: 4 x 6
estimate p.value conf.low conf.high method alternative
<dbl> <dbl> <dbl> <dbl> <chr> <chr>
1 0 1 0 77.9 Fisher's Exact Test ~ two.sided
2 Inf 0.505 0.204 Inf Fisher's Exact Test ~ two.sided
3 2.13 0.608 0.160 37.2 Fisher's Exact Test ~ two.sided
4 Inf 0.505 0.204 Inf Fisher's Exact Test ~ two.sided
Or using dplyr with map, reshaping first and then splitting on the name.
library(dplyr)
df1 %>%
pivot_longer(cols=starts_with("var")) %>%
split(.$name) %>%
map(~fisher.test(x=.$value, y=.$outcome)) %>%
map(tidy) %>%
map_df(~as_tibble(.))
Data:
df1 <- structure(list(id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15), var1 = c(1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
1, 1), var2 = c(0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1),
var3 = c(1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1), var4 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1), outcome = c(1,
1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1)), row.names = c(NA,
-15L), class = c("tbl_df", "tbl", "data.frame"))
The following code performs the computations as described in the question but 3/4 give errors.
library(epitools)
cols <- grep("var", names(df1), value = TRUE)
res_list <- lapply(cols, function(v){
tbl <- table(df1[, c(v, "outcome")])
tryCatch(oddsratio(x = tbl), error = function(e) e)
})
ok <- !sapply(res_list, inherits, "error")
res_list[ok]
The errors are all this:
simpleError in uniroot(function(or) { 1 - midp(a1, a0, b1, b0, or)
- alpha/2}, interval = interval): f() values at end points not of opposite sign
which can be seen with
res_list[!ok]

Sorting barplot based on multi-categories in r

I am trying to get a bar plot for sentiment scores corrected as per the following order and put into two separate colors:
(NEGATIVE) anger, disgust, fear, sadness, negative --- (POSITIVE) anticipation, joy, surprise, trust, positive.
Below is the code which only gives a decreasing plot.
barplot(sort(colSums(s), decreasing = TRUE),
las = 2,
col = rainbow(2),
ylab = 'Count',
main = 'User Synergies')
> dput(head(s))
structure(list(anger = c(1, 0, 0, 0, 0, 0), anticipation = c(0,
0, 5, 0, 0, 0), disgust = c(0, 0, 0, 0, 0, 0), fear = c(1, 0,
2, 1, 0, 0), joy = c(1, 0, 1, 0, 0, 0), sadness = c(1, 0, 2,
1, 0, 0), surprise = c(0, 0, 2, 1, 0, 0), trust = c(4, 2, 3,
1, 0, 1), negative = c(2, 0, 3, 2, 1, 1), positive = c(4, 4,
7, 1, 0, 2)), row.names = c(NA, 6L), class = "data.frame")
Another way:
positive <- c("anticipation", "joy", "surprise", "trust", "positive")
negative <- c("anger", "disgust", "fear", "sadness", "negative")
barplot(colSums(s[,c(negative, positive)]),
las = 2,
col = c(rep("red", length(negative)), rep("cyan", length(positive))),
ylab = 'Count', ylim = c(0, 20),
main = 'User Synergies')
The result:
Try this ,
df <- structure(list(anger = c(1, 0, 0, 0, 0, 0),
anticipation = c(0, 0, 5, 0, 0, 0),
disgust = c(0, 0, 0, 0, 0, 0),
fear = c(1, 0,2, 1, 0, 0),
joy = c(1, 0, 1, 0, 0, 0),
sadness = c(1, 0, 2, 1, 0, 0),
surprise = c(0, 0, 2, 1, 0, 0),
trust = c(4, 2, 3, 1, 0, 1),
negative = c(2, 0, 3, 2, 1, 1),
positive = c(4, 4,7, 1, 0, 2)),
row.names = c(NA, 6L), class = "data.frame")
pn <- rainbow(2) # "#FF0000" "#00FFFF" one for positive and the other for negative
s <- sort(colSums(df) , decreasing = TRUE)
names(s)
#> [1] "positive" "trust" "negative" "anticipation" "fear"
#> [6] "sadness" "surprise" "joy" "anger" "disgust"
# arrange colors based on names of sorted columns
col <- c(pn[1] , pn[1] , pn[2] , pn[1] , pn[2] ,
pn[2] , pn[1] , pn[1] , pn[2] , pn[2])
barplot(s ,
las = 2,
col = col,
ylab = 'Count',
main = 'User Synergies')
Created on 2022-05-31 by the reprex package (v2.0.1)
You may try
library(dplyr)
library(reshape2)
df <- data.frame(
anger = 200,
disgust = 100,
fear = 900,
sadness = 400,
negative = 1500,
anticipation = 2000,
joy = 1200,
surprise = 300,
trust = 2500,
positive = 5000
)
pall <- c("red", "blue")
colSums(df) %>%
melt %>%
tibble::rownames_to_column(., "sentiments") %>%
mutate(sentiments = factor(sentiments, levels = c("anger", "disgust", "fear", "sadness", "negative", "anticipation", "joy", "surprise", "trust", "positive"))) %>%
mutate(colo = ifelse(sentiments %in% c("anger", "disgust", "fear", "sadness", "negative"), 0, 1) %>% as.factor) %>%
barplot(data = ., value ~ sentiments, col = pall[.$colo], las = 2, xlab = "")
Another approach :
df <- structure(list(anger = c(1, 0, 0, 0, 0, 0),
anticipation = c(0, 0, 5, 0, 0, 0),
disgust = c(0, 0, 0, 0, 0, 0),
fear = c(1, 0,2, 1, 0, 0),
joy = c(1, 0, 1, 0, 0, 0),
sadness = c(1, 0, 2, 1, 0, 0),
surprise = c(0, 0, 2, 1, 0, 0),
trust = c(4, 2, 3, 1, 0, 1),
negative = c(2, 0, 3, 2, 1, 1),
positive = c(4, 4,7, 1, 0, 2)),
row.names = c(NA, 6L), class = "data.frame")
s <- sort(colSums(df) , decreasing = TRUE)
pos <- c("positive" , "trust" , "anticipation" ,
"surprise" , "joy")
col <- names(s)
col <- ifelse(col %in% pos , "cyan" , "red")
barplot(s ,
las = 2,
col = col,
ylab = 'Count',
main = 'User Synergies')
Created on 2022-05-31 by the reprex package (v2.0.1)

Calculating Odds Ratio between multiple columns of a dataframe

I have the following dataframe:
structure(list(id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15), var1 = c(1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
1, 1), var2 = c(0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1),
var3 = c(1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1), var4 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1), outcome = c(1,
1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1)), row.names = c(NA,
-15L), class = c("tbl_df", "tbl", "data.frame"))
I would like to arrange a script to calculate all possible odds ratio (using chi square), with 95% CI and p values, between all columns and the column outcome.
How can I do that?
I installed epitools but it seems that I need a 2x2 contingency table and I am not able to apply the function to columns of a dataframe
With mapply, you can use the fisher.test function, which doesn't fail when the odds ratio cannot be calculated.
mapply(fisher.test, x=data[, grep("var", names(data))], y=data[,"outcome"])
But the output is a 7x4 matrix which cannot be tidied into a nice format. However, we can use lapply to perform Fisher's test for each column and then tidy the results with the broom package.
library(broom)
cols <- df1[,grep("var", names(df1))]
res_list <- lapply(as.list(cols), function(x) fisher.test(x, y=df1$outcome))
do.call(rbind, lapply(res_list, broom::tidy))
# A tibble: 4 x 6
estimate p.value conf.low conf.high method alternative
<dbl> <dbl> <dbl> <dbl> <chr> <chr>
1 0 1 0 77.9 Fisher's Exact Test ~ two.sided
2 Inf 0.505 0.204 Inf Fisher's Exact Test ~ two.sided
3 2.13 0.608 0.160 37.2 Fisher's Exact Test ~ two.sided
4 Inf 0.505 0.204 Inf Fisher's Exact Test ~ two.sided
Or using dplyr with map, reshaping first and then splitting on the name.
library(dplyr)
df1 %>%
pivot_longer(cols=starts_with("var")) %>%
split(.$name) %>%
map(~fisher.test(x=.$value, y=.$outcome)) %>%
map(tidy) %>%
map_df(~as_tibble(.))
Data:
df1 <- structure(list(id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15), var1 = c(1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
1, 1), var2 = c(0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1),
var3 = c(1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1), var4 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1), outcome = c(1,
1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1)), row.names = c(NA,
-15L), class = c("tbl_df", "tbl", "data.frame"))
The following code performs the computations as described in the question but 3/4 give errors.
library(epitools)
cols <- grep("var", names(df1), value = TRUE)
res_list <- lapply(cols, function(v){
tbl <- table(df1[, c(v, "outcome")])
tryCatch(oddsratio(x = tbl), error = function(e) e)
})
ok <- !sapply(res_list, inherits, "error")
res_list[ok]
The errors are all this:
simpleError in uniroot(function(or) { 1 - midp(a1, a0, b1, b0, or)
- alpha/2}, interval = interval): f() values at end points not of opposite sign
which can be seen with
res_list[!ok]

Decision tree and error matrix calculations

I've created a decision tree using rpart and the code below:
res.tree <- rpart(myformula, data = credit_train)
my data has been subset into 2 parts. The training part at 70% and a testing part at 30%.
This part works well and my tree is created. Where I'm getting stuck is with the prediction so that I can calculate my confusion matrix and ROC curves.
I'm using this code tree_pred = predict(res.tree, credit_train, type = "class")
but I get this message:
Error in predict.rpart(res.tree, credit_test, type = "class") : Invalid prediction for "rpart" object
In addition:
Warning message:
'newdata' had 271 rows but variables found have 729 rows
I can't figure out if I don't have a library loaded or what is causing the it not to recognize the type, which is what so many resources say I need to use and why I'm getting a mismatch in the rows.
The 'newdata' at 271 rows is what my testing data set has and my training data-set has 729 rows.
Is the decision tree creation causing my problem or could it be the prediction code?
Responding to comments:
I'm using the following libraries:
library(readxl)
library(dplyr)
library(factoextra)
library(corrplot)
library(rpart)
library(rpart.plot)
library(RColorBrewer)
library(pROC)
library(Hmisc)
library(fBasics)
library(rattle)
library(caret)
A sample of my data:
structure(list(CHK_ACCT = c(0, 1, 0, 0), DURATION = c(6, 48,
42, 24), HISTORY = c(4, 2, 2, 3), NEW_CAR = c(0, 0, 0, 1), USED_CAR = c(0,
0, 0, 0), FURNITURE = c(0, 0, 1, 0), `RADIO/TV` = c(1, 1, 0,
0), EDUCATION = c(0, 0, 0, 0), RETRAINING = c(0, 0, 0, 0), AMOUNT = c(1169,
5951, 7882, 4870), SAV_ACCT = c(4, 0, 0, 0), EMPLOYMENT = c(4,
2, 3, 2), INSTALL_RATE = c(4, 2, 2, 3), MALE_DIV = c(0, 0, 0,
0), MALE_SINGLE = c(1, 0, 1, 1), MALE_MAR_or_WID = c(0, 0, 0,
0), `CO-APPLICANT` = c(0, 0, 0, 0), GUARANTOR = c(0, 0, 1, 0),
PRESENT_RESIDENT = c(4, 2, 4, 4), REAL_ESTATE = c(1, 1, 0,
0), PROP_UNKN_NONE = c(0, 0, 0, 1), AGE = c(67, 22, 45, 53
), OTHER_INSTALL = c(0, 0, 0, 0), RENT = c(0, 0, 0, 0), OWN_RES = c(1,
1, 0, 0), NUM_CREDITS = c(2, 1, 1, 2), JOB = c(2, 2, 2, 2
), NUM_DEPENDENTS = c(1, 1, 2, 2), TELEPHONE = c(1, 0, 0,
0), FOREIGN = c(0, 0, 0, 0), DEFAULT = c(0, 1, 0, 1), CHK_ACCT_rec = c(1,
2, 1, 1), SAV_ACCT_rec = c(0, 1, 1, 1)), .Names = c("CHK_ACCT",
"DURATION", "HISTORY", "NEW_CAR", "USED_CAR", "FURNITURE", "RADIO/TV",
"EDUCATION", "RETRAINING", "AMOUNT", "SAV_ACCT", "EMPLOYMENT",
"INSTALL_RATE", "MALE_DIV", "MALE_SINGLE", "MALE_MAR_or_WID",
"CO-APPLICANT", "GUARANTOR", "PRESENT_RESIDENT", "REAL_ESTATE",
"PROP_UNKN_NONE", "AGE", "OTHER_INSTALL", "RENT", "OWN_RES",
"NUM_CREDITS", "JOB", "NUM_DEPENDENTS", "TELEPHONE", "FOREIGN",
"DEFAULT", "CHK_ACCT_rec", "SAV_ACCT_rec"), row.names = c(NA,
-4L), class = c("tbl_df", "tbl", "data.frame"))
myformula = credit_train$DEFAULT ~ credit_train$CHK_ACCT_rec +
credit_train$DURATION + credit_train$HISTORY + credit_train$NEW_CAR +
credit_train$USED_CAR + credit_train$FURNITURE + credit_train$`RADIO/TV` +
credit_train$EDUCATION + credit_train$RETRAINING + credit_train$AMOUNT +
credit_train$SAV_ACCT_rec + credit_train$EMPLOYMENT +
credit_train$INSTALL_RATE + credit_train$MALE_DIV + credit_train$MALE_SINGLE
+ credit_train$MALE_MAR_or_WID + credit_train$`CO-APPLICANT` +
credit_train$GUARANTOR + credit_train$PRESENT_RESIDENT +
credit_train$REAL_ESTATE + credit_train$PROP_UNKN_NONE + credit_train$AGE +
credit_train$OTHER_INSTALL + credit_train$RENT + credit_train$OWN_RES +
credit_train$NUM_CREDITS + credit_train$JOB + credit_train$NUM_DEPENDENTS +
credit_train$TELEPHONE + credit_train$FOREIGN
#calimo I hope this is what you needed.

Identify correspondence between rows and columns

df <- data.frame("Minute" = c(rep(27, 3), rep(28, 3)),
"ID" <- c(1,2,3,1,2,3),
"dist1" = c(0, 1, 4, 2, 4, 1),
"dist2" = c(1, 0, 0, 0, 1, 0),
"dist3" = c(0, 0, 2, 1, 4, 0))
At minute 27 ID 1 has a value of 0 with dist3. At minute 27 ID 3 has a value of 4 with dist1. I want to know how to write an "if then" statement or something similar to identify if those values 1) are greater than zero or 2) match. If so, I want to replace the values in this data frame with ones. If not, they become zero.
Expected output for minute 27 only:
df2 <- data.frame("Minute" = c(rep(27, 3)),
"ID" = c(1,2,3),
"dist1" = c(0, 1, 0),
"dist2" = c(1, 0, 0),
"dist3" = c(0, 0, 0))
Another example:
df <- data.frame("Minute" = c(rep(28, 3)),
"ID" = c(1,2,3),
"dist1" = c(2, 4, 1),
"dist2" = c(2, 1, 0),
"dist3" = c(1, 4, 5))
Expected output:
df2 <- data.frame("Minute" = c(rep(28, 3)),
"ID" = c(1,2,3),
"dist1" = c(2, 1, 1),
"dist2" = c(1, 1, 0),
"dist3" = c(1, 0, 5))
Notice that ID 1 has a value greater than 0 for dist2 and ID2 has a value greater than 0 for dist1. Those two correspond by being above 0, so I want both of them to become 1.

Resources