I'm working with a huge data frame with a structure similar to the following. I use output_reg to store the slope and intercept for each treatment, but I need to add the r.squared for each lm(y ~ x) and store it in another column alongside the other two. Any hint on that?
library(plyr)
field <- c('t1','t1','t1', 't2', 't2','t2', 't3', 't3','t3')
predictor <- c(4.2, 5.3, 5.4,6, 7,8.5,9, 10.1,11)
response <- c(5.1, 5.1, 2.4,6.1, 7.7,5.5,1.99, 5.42,2.5)
my_df <- data.frame(field, predictor, response, stringsAsFactors = F)
output_reg <- list()
B <- unique(my_df$field)
for (i in 1:length(B)) {
  index <- my_df[my_df$field == B[i], ]
  x <- index$predictor
  y <- index$response
  output_reg[[i]] <- lm(y ~ x)  # gets estimates for each field
}
Thanks
The r.squared can be accessed via the summary of the model; try this:
m <- lm(y ~ x)
rs <- summary(m)$r.squared
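Applied to the loop output from the question, for example, you could pull the R² for every fitted model at once (a small sketch reusing output_reg from above):
sapply(output_reg, function(m) summary(m)$r.squared)
# [1] 0.31766917 0.13890396 0.03788593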
The summary object of the linear regression result contains almost everything you need:
output_reg <- list()
B <- unique(my_df$field)
for (i in 1:length(B)) {
  index <- my_df[my_df$field == B[i], ]
  x <- index$predictor
  y <- index$response
  m <- lm(y ~ x)
  s <- summary(m)  # get the summary of the model
  # extract everything you need from the summary object
  output_reg[[i]] <- c(s$coefficients[, 'Estimate'], r.squared = s$r.squared)
}
output_reg
#[[1]]
#(Intercept) x r.squared
# 10.7537594 -1.3195489 0.3176692
#[[2]]
#(Intercept) x r.squared
# 8.8473684 -0.3368421 0.1389040
#[[3]]
#(Intercept) x r.squared
#-0.30500000 0.35963455 0.03788593
To bind the result together:
do.call(rbind, output_reg)
# (Intercept) x r.squared
# [1,] 10.753759 -1.3195489 0.31766917
# [2,] 8.847368 -0.3368421 0.13890396
# [3,] -0.305000 0.3596346 0.03788593
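Since the models were fitted in the order of B, you can also bind the field labels back on so each row carries its treatment (a sketch, reusing B and output_reg from the loop above):
out <- data.frame(field = B, do.call(rbind, output_reg), check.names = FALSE)
out
#   field (Intercept)          x  r.squared
# 1    t1   10.753759 -1.3195489 0.31766917
# 2    t2    8.847368 -0.3368421 0.13890396
# 3    t3   -0.305000  0.3596346 0.03788593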
Check out the broom package and sprinkle in some dplyr (see this vignette):
library(broom)
library(dplyr)
my_df %>%
  group_by(field) %>%
  do(glance(lm(predictor ~ response, data = .)))  # also see do(tidy(...))
# field r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int>
# 1 t1 0.31766917 -0.3646617 0.7778175 0.46556474 0.6188153 2 -1.855107 9.710214 7.006051 0.605000 1
# 2 t2 0.13890396 -0.7221921 1.6513038 0.16131065 0.7568653 2 -4.113593 14.227185 11.523022 2.726804 1
# 3 t3 0.03788593 -0.9242281 1.3894755 0.03937779 0.8752903 2 -3.595676 13.191352 10.487189 1.930642 1
Alternatively, save the regressions first:
regressions <- my_df %>% group_by(field) %>% do(fit = lm(predictor ~ response, data = .))
regressions %>% tidy(fit)
regressions %>% glance(fit)
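Note that grouped do() has since been superseded; with current dplyr, tidyr, and purrr the same idea is usually written with nest() and map() (a sketch, assuming tidyr >= 1.0):
library(tidyr)
library(purrr)
my_df %>%
  nest(data = -field) %>%
  mutate(fit     = map(data, ~ lm(predictor ~ response, data = .x)),
         glanced = map(fit, broom::glance)) %>%
  unnest(glanced) %>%
  select(-data, -fit)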
You can do the following using purrr (note that in current releases slice_rows() and by_slice() live in the purrrlyr package, so load that as well):
library(purrr)
library(purrrlyr)
my_df %>%
  slice_rows("field") %>%
  by_slice(partial(lm, predictor ~ response), .labels = FALSE) %>%
  flatten %>%
  map(~ c(coef(.), r.squared = summary(.)$r.squared))
Which gives you:
[[1]]
(Intercept) response r.squared
5.9777778 -0.2407407 0.3176692
[[2]]
(Intercept) response r.squared
9.8195876 -0.4123711 0.1389040
[[3]]
(Intercept) response r.squared
9.68534163 0.10534562 0.03788593
If you want a data.frame back instead, use this as the last line:
map_df(~as.data.frame(t(c(coef(.), r.squared=summary(.)$r.squared))))
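If you'd rather avoid purrrlyr entirely, a roughly equivalent sketch uses base split() plus purrr (assuming the same my_df):
my_df %>%
  split(.$field) %>%
  map(~ lm(predictor ~ response, data = .x)) %>%
  map_df(~ as.data.frame(t(c(coef(.), r.squared = summary(.)$r.squared))))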
You can also create a data frame with model stats by hand, where model is a fitted lm object:
model_stats <- data.frame(model$coefficients)
model_stats <- rbind(model_stats, r.sq = summary(model)$r.squared)
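For example, fitted to the t1 subset of the question's data, this gives one column with the coefficients and the R² stacked as rows (a quick sketch; the model object here is illustrative):
model <- lm(response ~ predictor, data = my_df[my_df$field == "t1", ])
model_stats <- data.frame(model$coefficients)
model_stats <- rbind(model_stats, r.sq = summary(model)$r.squared)
model_stats
#             model.coefficients
# (Intercept)         10.7537594
# predictor           -1.3195489
# r.sq                 0.3176692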
I made an nls loop and get the values printed in the console. Now I want to extract those values, record which values come from which group, and put everything in a data frame to continue working.
My loop so far:
for (i in seq_along(trtlist2)) {
  loopmm.nls <- nls(rate ~ (Vmax * conc / (Km + conc)),
                    data = subset(M3, M3$trtlist == trtlist2[i]),
                    start = list(Km = 200, Vmax = 2), trace = TRUE)
  summary(loopmm.nls)
  print(summary(loopmm.nls))
}
The output in the console (this is what I want to extract and put in a data frame; I have this same "Parameters" block about 20 times):
Parameters:
Estimate Std. Error t value Pr(>|t|)
Km 23.29820 9.72304 2.396 0.0228 *
Vmax 0.10785 0.01165 9.258 1.95e-10 ***
---
Different ways of extracting the data that work on a single fit, but not inside the loop (so far!):
#####extract data in diff ways from nls#####
## extract coefficients as matrix
Kinall <- summary(mm.nls)$parameters
## extract coefficients save as dataframe
Kin <- as.data.frame(Kinall)
colnames(Kin) <- c("values", "SE", "T", "P")
###create Km Vmax df
Kms <- Kin[1, ]
Vmaxs <- Kin[2, ]
#####extract coefficients each manually
Km <- unname(coef(summary(mm.nls))["Km", "Estimate"])
Vmax <- unname(coef(summary(mm.nls))["Vmax", "Estimate"])
KmSE <- unname(coef(summary(mm.nls))["Km", "Std. Error"])
VmaxSE <- unname(coef(summary(mm.nls))["Vmax", "Std. Error"])
KmP <- unname(coef(summary(mm.nls))["Km", "Pr(>|t|)"])
VmaxP <- unname(coef(summary(mm.nls))["Vmax", "Pr(>|t|)"])
KmT <- unname(coef(summary(mm.nls))["Km", "t value"])
VmaxT <- unname(coef(summary(mm.nls))["Vmax", "t value"])
One thing that works is extracting the data through append(), but somehow that only works for the estimates, not the rest:
  # inside the loop:
  Kms <- append(Kms, unname(coef(loopmm.nls)["Km"]))
  Vmaxs <- append(Vmaxs, unname(coef(loopmm.nls)["Vmax"]))
}
Kindf <- data.frame(trt = trtlist2, Vmax = Vmaxs, Km = Kms)
I would just keep everything in the data frame for ease. You can nest by the group, run the regression, and then pull the coefficients out. Just make sure you have the tidyverse and broom packages installed on your computer.
library(tidyverse)
#example
mtcars |>
  nest(data = -cyl) |>
  mutate(model = map(data, ~ nls(mpg ~ hp^b,
                                 data = .x,
                                 start = list(b = 1))),
         clean_mod = map(model, broom::tidy)) |>
  unnest(clean_mod) |>
  select(-c(data, model))
#> # A tibble: 3 x 6
#> cyl term estimate std.error statistic p.value
#> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 6 b 0.618 0.0115 53.6 2.83e- 9
#> 2 4 b 0.731 0.0217 33.7 1.27e-11
#> 3 8 b 0.504 0.0119 42.5 2.46e-15
#what I expect will work for your data
All_M3_models <- M3 |>
  nest(data = -trtlist) |>
  mutate(model = map(data, ~ nls(rate ~ (Vmax * conc / (Km + conc)),
                                 data = .x,
                                 start = list(Km = 200, Vmax = 2))),
         clean_mod = map(model, broom::tidy)) |>
  unnest(clean_mod) |>
  select(-c(data, model))
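If you would rather keep the explicit loop, the same extraction works by collecting each summary's coefficient table into a list and binding at the end (a hedged sketch, using M3 and trtlist2 as named in the question):
results <- vector("list", length(trtlist2))
for (i in seq_along(trtlist2)) {
  loopmm.nls <- nls(rate ~ Vmax * conc / (Km + conc),
                    data = subset(M3, trtlist == trtlist2[i]),
                    start = list(Km = 200, Vmax = 2))
  tab <- as.data.frame(coef(summary(loopmm.nls)))  # Estimate, Std. Error, t value, Pr(>|t|)
  tab$term <- rownames(tab)   # Km, Vmax
  tab$trt <- trtlist2[i]      # which group this fit belongs to
  results[[i]] <- tab
}
Kindf <- do.call(rbind, results)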
I'm struggling with how to obtain the AUC from a logistic regression model using tidymodels.
Here's an example using the built-in mpg dataset.
library(tidymodels)
library(tidyverse)
# Use mpg dataset
df <- mpg
# Create an indicator variable for class="suv"
df$is_suv <- as.factor(df$class == "suv")
# Create the split object
df_split <- initial_split(df, prop=1/2)
# Create the training and testing sets
df_train <- training(df_split)
df_test <- testing(df_split)
# Create workflow
rec <-
  recipe(is_suv ~ cty + hwy + cyl, data = df_train)
glm_spec <-
  logistic_reg() %>%
  set_engine(engine = "glm")
glm_wflow <-
  workflow() %>%
  add_recipe(rec) %>%
  add_model(glm_spec)
# Fit the model
model1 <- fit(glm_wflow, df_train)
# Attach predictions to training dataset
training_results <- bind_cols(df_train, predict(model1, df_train))
# Calculate accuracy
accuracy(training_results, truth = is_suv, estimate = .pred_class)
# Calculate AUC??
roc_auc(training_results, truth = is_suv, estimate = .pred_class)
The last line returns this error:
> roc_auc(training_results, truth = is_suv, estimate = .pred_class)
Error in metric_summarizer(metric_nm = "roc_auc", metric_fn = roc_auc_vec, :
formal argument "estimate" matched by multiple actual arguments
Since you are doing binary classification, roc_auc() is expecting a vector of class probabilities corresponding to the "relevant" class, not the predicted class.
You can get this using predict(model1, df_train, type = "prob"). Alternatively, if you are using workflows version 0.2.2 or newer, you can use augment() to get class predictions and probabilities without using bind_cols().
library(tidymodels)
library(tidyverse)
# Use mpg dataset
df <- mpg
# Create an indicator variable for class="suv"
df$is_suv <- as.factor(df$class == "suv")
# Create the split object
df_split <- initial_split(df, prop=1/2)
# Create the training and testing sets
df_train <- training(df_split)
df_test <- testing(df_split)
# Create workflow
rec <-
  recipe(is_suv ~ cty + hwy + cyl, data = df_train)
glm_spec <-
  logistic_reg() %>%
  set_engine(engine = "glm")
glm_wflow <-
  workflow() %>%
  add_recipe(rec) %>%
  add_model(glm_spec)
# Fit the model
model1 <- fit(glm_wflow, df_train)
# Attach predictions to training dataset
training_results <- augment(model1, df_train)
# Calculate accuracy
accuracy(training_results, truth = is_suv, estimate = .pred_class)
#> # A tibble: 1 x 3
#> .metric .estimator .estimate
#> <chr> <chr> <dbl>
#> 1 accuracy binary 0.795
# Calculate AUC
roc_auc(training_results, truth = is_suv, estimate = .pred_FALSE)
#> # A tibble: 1 x 3
#> .metric .estimator .estimate
#> <chr> <chr> <dbl>
#> 1 roc_auc binary 0.879
Created on 2021-04-12 by the reprex package (v1.0.0)
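Note that the answer passes .pred_FALSE because yardstick treats the first factor level (here FALSE) as the event by default. If you'd rather score the TRUE class, recent yardstick versions let you flip this explicitly (a sketch):
roc_auc(training_results, truth = is_suv, .pred_TRUE, event_level = "second")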
I have the following code, which displays some coefficients from lm:
fit <- lm(Petal.Width ~ Petal.Length, data = iris)
cf <- coef(summary(fit, complete = TRUE))
colnames(cf)[4] <- "pval"
cf <- data.frame(cf)
cf <- cf[cf$pval < 0.05, ]
cf <- cf[order(-cf$pval), ]
head(cf)
cf[1, 1]
I want to extract the names in the left column, i.e. (Intercept) and Petal.Length.
I thought I could use cf[1, 1], but it shows the estimate.
Those are extracted using rownames():
fit <- lm(Petal.Width ~ Petal.Length, data = iris)
cf <- coef(summary(fit, complete = TRUE))
rownames(cf)
#[1] "(Intercept)" "Petal.Length"
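If you want those names to survive the filtering and ordering in your code, copy them into a regular column first (a small sketch built on your cf):
cf <- data.frame(cf)
colnames(cf)[4] <- "pval"
cf$term <- rownames(cf)  # keep the term names as data
cf[cf$pval < 0.05, c("term", "Estimate", "pval")]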
The tidyverse solution would be to use broom:
library(broom)
tidy_fit <- tidy(fit)
Results:
# A tibble: 2 x 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) -0.363 0.0398 -9.13 4.70e-16
2 Petal.Length 0.416 0.00958 43.4 4.68e-86
Then it's easy to extract the components that you want, and the resulting code is more readable; e.g. tidy_fit$term gives the list of terms ((Intercept) and Petal.Length).
I'm using the moderndive package to calculate a linear regression, but through a function. I am trying to create a function where I can just pass in two selected columns (e.g. deaths and cases, the titles of the columns) from my data frame (Rona_2020). Below is the function:
score_model_Fxn <- function(y, x){
  score_mod <- lm(y ~ x, data = Rona_2020)
  Reg_Table <- get_regression_table(score_mod)
  print(paste('The regression table is', Reg_Table))
}
When I run the function:
score_model_Fxn(deaths, cases)
I get ...
Error in eval(predvars, data, env) : object 'deaths' not found
What should I do? I have looked at several similar issues but to no avail.
What you want to do by passing deaths and cases is called non-standard evaluation. You need to combine this with computing on the language if you want to run a model with the correct formula and scoping. Computing on the language can be done with substitute() and bquote().
library(moderndive)
score_model_Fxn <- function(y, x, data){
  # get the symbols passed as arguments:
  data <- substitute(data)
  y <- substitute(y)
  x <- substitute(x)
  # substitute them into the lm call and evaluate the call:
  score_mod <- eval(bquote(lm(.(y) ~ .(x), data = .(data))))
  Reg_Table <- get_regression_table(score_mod)
  message('The regression table is')  # better than your paste solution
  print(Reg_Table)
  invisible(score_mod)  # a function should always return something useful
}
mod <- score_model_Fxn(Sepal.Length, Sepal.Width, iris)
#The regression table is
## A tibble: 2 x 7
# term estimate std_error statistic p_value lower_ci upper_ci
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 intercept 6.53 0.479 13.6 0 5.58 7.47
#2 Sepal.Width -0.223 0.155 -1.44 0.152 -0.53 0.083
print(mod)
#
#Call:
#lm(formula = Sepal.Length ~ Sepal.Width, data = iris)
#
#Coefficients:
#(Intercept) Sepal.Width
# 6.5262 -0.2234
You could have the function return Reg_Table instead if you prefer.
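A shorter variant of the same idea swaps bquote() for reformulate() with deparse(substitute()); the fitted call prints less prettily, but the model is the same (a sketch, not from the original answer):
score_model_Fxn_v2 <- function(y, x, data) {
  # build the formula from the symbols passed in
  f <- reformulate(deparse(substitute(x)), response = deparse(substitute(y)))
  score_mod <- lm(f, data = data)
  print(get_regression_table(score_mod))
  invisible(score_mod)
}
score_model_Fxn_v2(Sepal.Length, Sepal.Width, iris)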
One of the coolest ways of doing this is using the new recipes package to generate the formula for us and then manipulating a tibble to produce our result:
library(tidyverse)
library(recipes)
#>
#> Attaching package: 'recipes'
#> The following object is masked from 'package:stringr':
#>
#> fixed
#> The following object is masked from 'package:stats':
#>
#> step
library(moderndive)
score_model_Fxn <- function(df, x, y){
  formula_1 <- df %>%
    recipe() %>%
    update_role({{x}}, new_role = "outcome") %>%
    update_role({{y}}, new_role = "predictor") %>%
    formula()
  Reg_Table <- mtcars %>%  # note: mtcars is hardcoded here; the follow-up below uses df instead
    summarise(score_mod = list(lm(formula_1, data = .))) %>%
    rowwise() %>%
    mutate(Reg_Table = list(get_regression_table(score_mod))) %>%
    pull(Reg_Table)
  print(paste('The regression table is', Reg_Table))
  Reg_Table
}
k <- mtcars %>%
score_model_Fxn(x = cyl,y = gear)
#> [1] "The regression table is list(term = c(\"intercept\", \"gear\"), estimate = c(10.585, -1.193), std_error = c(1.445, 0.385), statistic = c(7.324, -3.101), p_value = c(0, 0.004), lower_ci = c(7.633, -1.978), upper_ci = c(13.537, -0.407))"
k
#> [[1]]
#> # A tibble: 2 x 7
#> term estimate std_error statistic p_value lower_ci upper_ci
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 intercept 10.6 1.44 7.32 0 7.63 13.5
#> 2 gear -1.19 0.385 -3.10 0.004 -1.98 -0.407
Created on 2020-06-09 by the reprex package (v0.3.0)
For those who might be interested... I modified Bruno's answer.
library(tidyverse); library(recipes); library(moderndive)
score_model_Fxn2 <- function(df, x, y){
  formula_1 <- df %>%
    recipe() %>%
    update_role({{y}}, new_role = "outcome") %>%
    update_role({{x}}, new_role = "predictor") %>%
    formula()
  Reg_Table <- df %>%
    summarise(score_mod = list(lm(formula_1, data = .))) %>%
    rowwise() %>%
    mutate(Reg_Table = list(get_regression_table(score_mod))) %>%
    pull(Reg_Table)
  print(Reg_Table)
}
mtcars %>% score_model_Fxn2(x = cyl, y = gear)
I'm doing cross-validation (five-fold). Then I want to calculate the mean value for each group in the data set I used for that cv. Please note that I need to use the following functions.
data(mpg)
library(modelr)
cv <- crossv_kfold(mpg, k = 5)
models1 <- map(cv$train, ~lm(hwy ~ displ, data = .))
get_pred <- function(model, test_data){
  data <- as.data.frame(test_data)
  pred <- add_predictions(data, model)
  return(pred)
}
pred1 <- map2_df(models1, cv$test, get_pred, .id = "Run")
MSE1 <- pred1 %>%
  group_by(Run) %>%
  summarise(MSE = mean((hwy - pred)^2))
MSE1
My problem lies with the output of 'summarise'. The function should be applied to each group. The result should look something like this:
## # A tibble: 5 x 2
## Run MSE
## <chr> <dbl>
## 1 1 27.889532
## 2 2 8.673054
## 3 3 17.033056
## 4 4 12.552037
## 5 5 9.138741
Unfortunately, I get only one value:
MSE
1 14.77799
How can I get a tibble like that above?
When I run your code, I get the style of output you are expecting (though the numbers differ, since the seed wasn't set in your example); I do not see a summarise problem like you do:
library(ggplot2)
library(modelr)
library(purrr)
library(dplyr)
data(mpg)
cv <- crossv_kfold(mpg, k = 5)
models1 <- map(cv$train, ~lm(hwy ~ displ, data = .))
get_pred <- function(model, test_data){
  data <- as.data.frame(test_data)
  pred <- add_predictions(data, model)
  return(pred)
}
pred1 <- map2_df(models1, cv$test, get_pred, .id = "Run")
MSE1 <- pred1 %>%
  group_by(Run) %>%
  summarise(MSE = mean((hwy - pred)^2))
MSE1
# A tibble: 5 x 2
Run MSE
<chr> <dbl>
1 1 7.80
2 2 12.5
3 3 9.82
4 4 27.3
5 5 17.5
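One common cause of the single-value result you describe is a masking conflict: if plyr is attached after dplyr, plyr::summarise shadows dplyr's version and ignores the grouping. Calling the dplyr version explicitly guards against that (a sketch):
MSE1 <- pred1 %>%
  group_by(Run) %>%
  dplyr::summarise(MSE = mean((hwy - pred)^2))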