ROC function for the iris dataset in R

I'm trying to build a binary classification model on the built-in iris dataset using the glm() function.
First, I filtered the dataset so that only the species versicolor and virginica are used in the binary classification. I am getting warning messages from the code.
Is there a way to fix the code to get rid of these warnings? They appear after the ROC.train call and after the ggplot() call. The code is below:
>library(dplyr)
> library(forcats)
> library(ggplot2)
> iris.small <- datasets::iris %>%
+ dplyr::filter(Species != "setosa") %>%
+ dplyr::mutate(Species = fct_drop(Species)) %>%
+ dplyr::group_by(Species) %>%
+ dplyr::summarize(avg_sl = mean(Sepal.Length),
+ avg_sw = mean(Sepal.Width),
+ avg_pl = mean(Petal.Length),
+ avg_pw = mean(Petal.Width))
> set.seed(2016-11-14)
> iris.big <- data_frame(Species = as.factor(c(rep("versicolor", 500), rep("virginica", 500))),
+ sl = c(rnorm(500, iris.small$avg_sl[1]), rnorm(500, iris.small$avg_sl[2])),
+ sw = c(rnorm(500, iris.small$avg_sw[1]), rnorm(500, iris.small$avg_sw[2])),
+ pl = c(rnorm(500, iris.small$avg_pl[1]), rnorm(500, iris.small$avg_pl[2])),
+ pw = c(rnorm(500, iris.small$avg_pw[1]), rnorm(500, iris.small$avg_pw[2])))
> train_fraction <- 0.5 #fraction of data for training purposes
> n_obs <- nrow(iris.big)
> train_size <- floor(train_fraction * nrow(iris.big))
> train_indices <- sample(n_obs, size=train_size, replace=TRUE)
> # sample(x, size, replace = FALSE, prob = NULL): x is a vector (or a positive
> # integer) to choose from, size is the number of items to choose, replace says
> # whether to sample with replacement, and prob gives optional probability weights
> train_data <- iris.big[train_indices, ]
> test_data <- iris.big[-train_indices, ]
> glm.out.train <- glm(Species ~ sl + sw + pl + pw, data=train_data, family = "binomial")
> test_pred <- predict(glm.out.train, test_data, type='response')
> calc_ROC <- function(probabilities, known_truth, model.name=NULL)
+ {
+ outcome <- as.numeric(factor(known_truth))-1
+ pos <- sum(outcome) # total known positives
+ neg <- sum(1-outcome) # total known negatives
+ pos_probs <- outcome*probabilities # probabilities for known positives
+ neg_probs <- (1-outcome)*probabilities # probabilities for known negatives
+ true_pos <- sapply(probabilities,
+ function(x) sum(pos_probs>=x)/pos) # true pos. rate
+ false_pos <- sapply(probabilities,
+ function(x) sum(neg_probs>=x)/neg)
+ if (is.null(model.name))
+ result <- data.frame(true_pos, false_pos)
+ else
+ result <- data.frame(true_pos, false_pos, model.name)
+ result %>% dplyr::arrange(false_pos, true_pos)
+ }
> ROC.train <- calc_ROC(probabilities=test_pred, known_truth=train_data$Species, model.name="train")
Warning messages:
1: In outcome * probabilities :
longer object length is not a multiple of shorter object length
2: In (1 - outcome) * probabilities :
longer object length is not a multiple of shorter object length
> ROC.test <- calc_ROC(probabilities=test_pred, known_truth=test_data$Species, model.name="test")
> ROCs <- rbind(ROC.train, ROC.test)
> ggplot(ROCs, aes(x=false_pos, y=true_pos, color=model.name)) + geom_line() + xlim(0, 0.25)
Warning message:
Removed 745 rows containing missing values (geom_path).
> ROCs %>% dplyr::group_by(model.name) %>% dplyr::mutate(delta=false_pos-lag(false_pos)) %>% dplyr::summarize(AUC=sum(delta*true_pos, na.rm=T)) %>% dplyr::arrange(desc(AUC))
# A tibble: 2 × 2
model.name AUC
<fctr> <dbl>
1 test 0.8700770
2 train 0.7329557
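Note: the two length warnings come from mixing data sets. ROC.train is computed from test_pred (one probability per row of test_data) together with known_truth = train_data$Species (500 labels), and because train_indices was drawn with replace = TRUE, the test set has more than 500 rows (duplicated training indices shrink the set that gets excluded). The two vectors therefore get recycled against each other. A quick check against the objects above (a sketch, nothing new defined):
length(test_pred)          # one prediction per test row; more than 500 here
length(train_data$Species) # 500 training labels: the recycled shorter object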
Following JackStat's recommendation, I added eps <- 1e-15; test_pred = pmax(pmin(test_pred, 1 - eps), eps) to the code:
[same library calls, simulation setup, train/test split, and calc_ROC() definition as above, down to the last line of the function, where the suggested code was pasted onto the same line:]
+ result %>% dplyr::arrange(false_pos, true_pos) eps <- 1e-15; test_pred = pmax(pmin(test_pred, 1 - eps), eps)
Error: unexpected symbol in:
"result <- data.frame(true_pos, false_pos, model.name)
result %>% dplyr::arrange(false_pos, true_pos) eps"
> }
Error: unexpected '}' in "}"
> ROC.train <- calc_ROC(probabilities=test_pred, known_truth=train_data$Species, model.name="train")
Warning messages:
1: In outcome * probabilities :
longer object length is not a multiple of shorter object length
2: In (1 - outcome) * probabilities :
longer object length is not a multiple of shorter object length
[ROC.test, rbind(), the ggplot() call with its warning, and the AUC table are identical to the first run]
The code is still giving an error message. Am I putting the suggested code in the right place?
eps <- 1e-15; test_pred = pmax(pmin(test_pred, 1 - eps), eps)
I then edited the code to replace exact 0s and 1s in the data, but I'm still getting the same warnings. What can I do now to fix them?
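For reference, the suggested clamp operates on the predicted probabilities, so one natural placement (a sketch, assuming that is what JackStat intended) is on its own lines after predict() and before calc_ROC(), rather than pasted inside the function body:
test_pred <- predict(glm.out.train, test_data, type = "response")
eps <- 1e-15
test_pred <- pmax(pmin(test_pred, 1 - eps), eps) # clamp probabilities away from exactly 0 and 1
ROC.test <- calc_ROC(probabilities = test_pred, known_truth = test_data$Species, model.name = "test")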
[same library calls and iris.big simulation as above, followed by new lines replacing exact 0s and 1s in the predictor columns:]
> iris.big$sl[iris.big$sl==0] <-0.0000000001
> iris.big$sw[iris.big$sw==0] <-0.0000000001
> iris.big$pl[iris.big$pl==0] <-0.0000000001
> iris.big$pw[iris.big$pw==0] <-0.0000000001
> iris.big$sl[iris.big$sl==1] <-0.99999999
> iris.big$sw[iris.big$sw==1] <-0.99999999
> iris.big$pl[iris.big$pl==1] <-0.99999999
> iris.big$pw[iris.big$pw==1] <-0.99999999
[same train/test split, glm() fit, predict() call, and calc_ROC() definition as above]
> ROC.train <- calc_ROC(probabilities=test_pred, known_truth=train_data$Species, model.name="train")
Warning messages:
1: In outcome * probabilities :
longer object length is not a multiple of shorter object length
2: In (1 - outcome) * probabilities :
longer object length is not a multiple of shorter object length
[ROC.test, the plot with its warning, and the AUC table are again identical to the first run]
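Note: neither edit above touches the cause of the warnings, which is that ROC.train pairs test-set predictions with training-set labels of a different length. A sketch of a consistent pairing, also sampling without replacement so the split is a true partition:
train_indices <- sample(n_obs, size = train_size, replace = FALSE) # no duplicated rows
train_data <- iris.big[train_indices, ]
test_data <- iris.big[-train_indices, ]
glm.out.train <- glm(Species ~ sl + sw + pl + pw, data = train_data, family = "binomial")
train_pred <- predict(glm.out.train, train_data, type = "response") # predictions on the training rows
test_pred <- predict(glm.out.train, test_data, type = "response")
ROC.train <- calc_ROC(train_pred, train_data$Species, model.name = "train") # lengths now match
ROC.test <- calc_ROC(test_pred, test_data$Species, model.name = "test")
The ggplot() warning is a separate, harmless matter: xlim(0, 0.25) drops every ROC point whose false_pos exceeds 0.25 before drawing.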

Related

How to run stratified bootstrapped linear regression in R?

In my model, x is a categorical variable with 3 categories: 0, 1 & 2, where 0 is the reference category. However, category 0 has more observations than the others (1, 2), so to avoid a biased sample I want to do stratified bootstrapping, but I could not find any relevant method for that.
df <- data.frame(x = c(0,0,0,0,0,1,1,2,2),
                 y = c(10,11,10,10,12,17,16,20,19),
                 m = c(6,5,6,7,2,10,14,8,11))
df$x <- as.factor(df$x)
df$x <- relevel(df$x,ref = "0")
fit <- lm(y ~ x*m, data = df)
summary(fit)
Expanding on Roland's answer in the comments, you can harvest the confidence intervals from bootstrapping using boot.ci:
library(boot)
b <- boot(df, \(DF, i) coef(lm(y ~ x*m, data = DF[i,])), strata = df$x, R = 999)
result <- do.call(rbind, lapply(seq_along(b$t0), function(i) {
  m <- boot.ci(b, type = 'norm', index = i)$normal
  data.frame(estimate = b$t0[i], lower = m[2], upper = m[3])
}))
result
#> estimate lower upper
#> (Intercept) 12.9189189 10.7166127 15.08403731
#> x1 6.5810811 2.0162637 8.73184665
#> x2 9.7477477 6.9556841 11.37390826
#> m -0.4459459 -0.8010925 -0.07451434
#> x1:m 0.1959459 -0.1842914 0.55627896
#> x2:m 0.1126126 -0.2572955 0.48352616
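The strata = df$x argument makes boot() resample within each level of x, so every bootstrap sample preserves the original group sizes; that is what makes the bootstrap stratified.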
And you can even plot the results like this:
library(ggplot2)
ggplot(within(result, var <- rownames(result)), aes(estimate, var)) +
  geom_vline(xintercept = 0, color = 'gray') +
  geom_errorbarh(aes(xmin = lower, xmax = upper), height = 0.1) +
  geom_point(color = 'red') +
  theme_light()

ggplot - use data passed to ggplot to calculate the mean of the data in subsequent geom calls [duplicate]

I was wondering why the variable mean_y is not recognized by my
geom_hline(yintercept = unique(mean_y)) call.
library(tidyverse)
set.seed(20)
n_groups <- 2
n_in_group <- 20
sd_e <- 2
groups <- gl(n_groups, n_in_group, labels = c("T","C"))
age <- rnorm(length(groups), 25, 3)
betas <- c(5,0,0,2)
dat <- data.frame(groups=groups,age=age)
X <- model.matrix(~ groups * age, data = dat)
lin_pred <- as.vector(X %*% betas)
dat$y <- rnorm(nrow(X), lin_pred, sd_e)
dat %>%
  group_by(groups) %>%
  mutate(mean_y = mean(y)) %>%
  ungroup() %>%
  ggplot() + aes(x = age, y = y) +
  geom_point(aes(color = groups)) +
  geom_hline(yintercept = unique(mean_y)) # Error in unique(mean_y) :
                                          #   object 'mean_y' not found
Data columns are only in scope inside aes(); try:
geom_hline(aes(yintercept = mean_y))
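For completeness, a sketch of the full corrected pipeline (same data as above):
dat %>%
  group_by(groups) %>%
  mutate(mean_y = mean(y)) %>%
  ungroup() %>%
  ggplot(aes(x = age, y = y)) +
  geom_point(aes(color = groups)) +
  geom_hline(aes(yintercept = mean_y)) # mean_y is now looked up in the plot data, one line per group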

Simulation to visualize power does not predict built-in power functions

I built a Shiny web app with a nice interface to calculate the sample size needed to differentiate between test groups. I wanted to build a simulation that visualizes how power works; my understanding is that power is the probability of distinguishing test groups from each other when they are in fact different. Using the built-in power calculator, I find that I need 10 replicates to distinguish two sample groups when %CV = 15, %improvement = 20, and power = 80%. When I try to simulate this using random number generators, I get an average power of 40-55%. See the code below.
library(dplyr)   # needed for the data summaries below
library(ggplot2) # needed for the plot below
cv <- 15 # coefficient of variation: the standard deviation divided by the mean
percent_increase <- 20 # percent increase to detect
mean1 <- 40
mean2 <- mean1 + (mean1 * (percent_increase / 100))
sd1 <- (cv * mean1) / 100
sd2 <- (cv * mean2) / 100
pooled_sd <- sqrt((sd1^2 + sd2^2) / 2)
difference <- (percent_increase / 100) * mean1
pwrt <- power.t.test(delta = difference, sd = sd1, power = 0.8,
                     sig.level = .05, type = "two.sample", alternative = "two.sided")
print(paste("Number of replicates needed is", pwrt$n))
# Simulate sample populations and tag which samples are different from each other.
record_test <- c()
for (i in 1:1000) {
  rep_sequence <- seq(2, 50, by = 4)
  data_list <- list() # empty data list
  for (r in 1:length(rep_sequence)) {
    d  <- rnorm(rep_sequence[r], mean = mean1, sd = pooled_sd)
    d2 <- rnorm(rep_sequence[r], mean = mean2, sd = pooled_sd)
    df  <- data.frame("value" = d,  "sample" = "Sample 1")
    df2 <- data.frame("value" = d2, "sample" = "Sample 2")
    df3 <- rbind(df, df2)
    df3$rep_n <- rep_sequence[r]
    data_list[[r]] <- df3
  }
  all_data <- do.call(rbind, data_list)
  all_data_summ <- all_data %>%
    group_by(sample, rep_n) %>%
    summarise(N = n(),
              mean = mean(value),
              sd = sd(value)) %>%
    ungroup() %>%
    mutate(se = sd / sqrt(N),
           ci_lower = mean - qnorm(0.975) * se,
           ci_upper = mean + qnorm(0.975) * se,
           #ci_lower = mean - qt(1 - (0.05/2), N - 1) * se,
           #ci_upper = mean + qt(1 - (0.05/2), N - 1) * se,
           )
  different_tag <- (all_data_summ %>% filter(sample == 'Sample 1') %>% select(N, ci_upper)) %>% # Sample 1 set
    left_join(all_data_summ %>% filter(sample == 'Sample 2') %>% select(N, ci_lower), by = c("N" = "N")) %>% # Sample 2 set
    rename(ci_upper_s1 = ci_upper, ci_lower_s2 = ci_lower) %>%
    mutate(different = ifelse(ci_lower_s2 > ci_upper_s1, 'yes', 'no'))
  all_data_summ1 <- all_data_summ %>%
    left_join(different_tag %>% select(N, different), by = c("N" = "N"))
  replicate_n <- 10 # at n = 10 the power should be 80%
  test_result <- all_data_summ1[all_data_summ1$rep_n == replicate_n, ]$different[[1]]
  record_test <- c(record_test, test_result)
  ggplot(all_data_summ1, aes(rep_n, mean)) +
    geom_errorbar(aes(x = rep_n, ymin = ci_lower, ymax = ci_upper, group = sample),
                  position = position_dodge(width = 1.5), width = 1.5, size = 1, colour = "red") +
    geom_point(data = filter(all_data_summ1, different == "yes"), colour = "black", size = 8, stroke = 2,
               aes(rep_n, mean, group = sample), position = position_dodge(width = 1.5), shape = 0) +
    geom_point(position = position_dodge(width = 1.5), size = 4, pch = 21, aes(fill = sample)) +
    scale_x_continuous(breaks = rep_sequence) +
    labs(x = "Replication", y = "Average",
         title = paste0('Sample Means with 95% Confidence Interval Bars at CV=', cv, '%')) +
    theme_gray(base_size = 25) +
    theme(plot.title = element_text(hjust = 0.5))
}
print(table(record_test)/length(record_test))
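Note: requiring the two 95% confidence intervals not to overlap is a much stricter criterion than a significant two-sample t-test, which is the decision power.t.test() describes, so the simulated "power" above is expected to come out lower. A sketch that simulates the t-test decision directly, reusing the parameters above (and the common sd = sd1 assumed by the power.t.test() call), should land close to the nominal 80% at n = 10:
set.seed(1)
n <- 10    # replicates per group, as suggested by power.t.test() above
sims <- 5000
p_values <- replicate(sims,
                      t.test(rnorm(n, mean1, sd1),
                             rnorm(n, mean2, sd1))$p.value)
mean(p_values < 0.05) # simulated power; close to 0.8 with these settings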

Extract prediction band from lme fit

I have following model
x <- rep(seq(0, 100, by=1), 10)
y <- 15 + 2*rnorm(1010, 10, 4)*x + rnorm(1010, 20, 100)
id <- NULL
for(i in 1:10){ id <- c(id, rep(i,101)) }
dtfr <- data.frame(x=x,y=y, id=id)
library(nlme)
with(dtfr, summary( lme(y~x, random=~1+x|id, na.action=na.omit)))
model.mx <- with(dtfr, (lme(y~x, random=~1+x|id, na.action=na.omit)))
pd <- predict( model.mx, newdata=data.frame(x=0:100), level=0)
with(dtfr, plot(x, y))
lines(0:100, predict(model.mx, newdata=data.frame(x=0:100), level=0), col="darkred", lwd=7)
With predict() and level=0 I can plot the mean population response. How can I extract and plot the 95% confidence intervals / prediction bands for the whole population from the nlme object?
Warning: Read this thread on r-sig-mixed models before doing this. Be very careful when you interpret the resulting prediction band.
From r-sig-mixed models FAQ adjusted to your example:
set.seed(42)
x <- rep(0:100,10)
y <- 15 + 2*rnorm(1010,10,4)*x + rnorm(1010,20,100)
id<-rep(1:10,each=101)
dtfr <- data.frame(x=x ,y=y, id=id)
library(nlme)
model.mx <- lme(y~x,random=~1+x|id,data=dtfr)
#create data.frame with new values for predictors
#more than one predictor is possible
new.dat <- data.frame(x=0:100)
#predict response
new.dat$pred <- predict(model.mx, newdata=new.dat,level=0)
#create design matrix
Designmat <- model.matrix(eval(eval(model.mx$call$fixed)[-2]), new.dat[-ncol(new.dat)])
#compute standard error for predictions
predvar <- diag(Designmat %*% model.mx$varFix %*% t(Designmat))
new.dat$SE <- sqrt(predvar)
new.dat$SE2 <- sqrt(predvar+model.mx$sigma^2)
library(ggplot2)
p1 <- ggplot(new.dat, aes(x = x, y = pred)) +
  geom_line() +
  geom_ribbon(aes(ymin = pred - 2*SE2, ymax = pred + 2*SE2), alpha = 0.2, fill = "red") +
  geom_ribbon(aes(ymin = pred - 2*SE, ymax = pred + 2*SE), alpha = 0.2, fill = "blue") +
  geom_point(data = dtfr, aes(x = x, y = y)) +
  scale_y_continuous("y")
p1
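Here the blue ribbon (pred ± 2*SE) is the confidence band for the fixed-effects mean, while the red ribbon (pred ± 2*SE2) adds the residual variance model.mx$sigma^2 and so approximates the prediction band for new observations.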
Sorry for coming back to such an old topic, but this might address a comment here:
it would be nice if some package could provide this functionality
This functionality is included in the ggeffects package when you use type = "re" (which then includes the random-effect variances, not only the residual variance; in this particular example the two happen to give the same result).
library(nlme)
library(ggeffects)
x <- rep(seq(0, 100, by = 1), 10)
y <- 15 + 2 * rnorm(1010, 10, 4) * x + rnorm(1010, 20, 100)
id <- NULL
for (i in 1:10) {
id <- c(id, rep(i, 101))
}
dtfr <- data.frame(x = x, y = y, id = id)
m <- lme(y ~ x,
random = ~ 1 + x | id,
data = dtfr,
na.action = na.omit)
ggpredict(m, "x") %>% plot(rawdata = T, dot.alpha = 0.2)
ggpredict(m, "x", type = "re") %>% plot(rawdata = T, dot.alpha = 0.2)
Created on 2019-06-18 by the reprex package (v0.3.0)
