The code below works for almost all dates and codes that I choose from my df1 database, however only for the day 09/07, code FGE that doesn't. I would like to solve this problem, could you help me? If you need to, I can explain better what this code does.
library(dplyr)
library(tidyverse)
library(lubridate)
library(stringr)
df1 <- structure(
list(date1= c("2021-06-28","2021-06-28","2021-06-28","2021-06-28","2021-06-28",
"2021-06-28","2021-06-28","2021-06-28"),
date2 = c("2021-06-30","2021-06-30","2021-07-02","2021-07-07","2021-07-07","2021-07-09","2021-07-09","2021-07-09"),
Code = c("FDE","ABC","ABC","ABC","CDE","FGE","ABC","CDE"),
Week= c("Wednesday","Wednesday","Friday","Wednesday","Wednesday","Friday","Friday","Friday"),
DR1 = c(4,1,4,3,3,4,3,5),
DR01 = c(4,1,4,3,3,4,3,6), DR02= c(4,2,6,7,3,2,7,4),DR03= c(9,5,4,3,3,2,1,5),
DR04 = c(5,4,3,3,6,2,1,9),DR05 = c(5,4,5,3,6,2,1,9),
DR06 = c(2,4,3,3,5,6,7,8),DR07 = c(2,5,4,4,9,4,7,8),
DR08 = c(0,0,0,1,2,0,0,0),DR09 = c(0,0,0,0,0,0,0,0),DR010 = c(0,0,0,0,0,0,0,0),DR011 = c(4,0,0,0,0,0,0,0),
DR012 = c(0,0,0,3,0,0,0,5),DR013 = c(0,0,1,0,0,0,2,0),DR014 = c(0,0,0,0,0,2,0,0)),
class = "data.frame", row.names = c(NA, -8L))
dmda<-"2021-07-09"
CodeChosse<-"FGE"
x<-df1 %>% select(starts_with("DR0"))
x<-cbind(df1, setNames(df1$DR1 - x, paste0(names(x), "_PV")))
PV<-select(x, date2,Week, Code, DR1, ends_with("PV"))
med<-PV %>%
group_by(Code,Week) %>%
summarize(across(ends_with("PV"), median))
SPV<-df1%>%
inner_join(med, by = c('Code', 'Week')) %>%
mutate(across(matches("^DR0\\d+$"), ~.x +
get(paste0(cur_column(), '_PV')),
.names = '{col}_{col}_PV')) %>%
select(date1:Code, DR01_DR01_PV:last_col())
SPV<-data.frame(SPV)
mat1 <- df1 %>%
filter(date2 == dmda, Code == CodeChosse) %>%
select(starts_with("DR0")) %>%
pivot_longer(cols = everything()) %>%
arrange(desc(row_number())) %>%
mutate(cs = cumsum(value)) %>%
filter(cs == 0) %>%
pull(name)
(dropnames <- paste0(mat1,"_",mat1, "_PV"))
SPV %>%
filter(date2 == dmda, Code == CodeChosse) %>%
select(-dropnames)
> SPV %>%
+ filter(date2 == dmda, Code == CodeChosse) %>%
+ select(-dropnames)
Error: Can't subset columns that don't exist.
x Column `__PV` doesn't exist.
Run `rlang::last_error()` to see where the error occurred.
For others datas/codes it works, for example:
dmda<-"2021-06-30"
CodeChosse<-"FDE"
> SPV %>%
+ filter(date2 == dmda, Code == CodeChosse) %>%
+ select(-dropnames)
date1 date2 Code DR01_DR01_PV DR02_DR02_PV DR03_DR03_PV DR04_DR04_PV DR05_DR05_PV DR06_DR06_PV DR07_DR07_PV DR08_DR08_PV
1 2021-06-28 2021-06-30 FDE 4 4 4 4 4 4 4 4
DR09_DR09_PV DR010_DR010_PV DR011_DR011_PV
1 4 4 4
dmda<-"2021-07-02"
CodeChosse<-"ABC"
> SPV %>%
+ filter(date2 == dmda, Code == CodeChosse) %>%
+ select(-dropnames)
date1 date2 Code DR01_DR01_PV DR02_DR02_PV DR03_DR03_PV DR04_DR04_PV DR05_DR05_PV DR06_DR06_PV DR07_DR07_PV DR08_DR08_PV
1 2021-06-28 2021-07-02 ABC 4 3 5 4.5 5.5 1.5 2 3.5
DR09_DR09_PV DR010_DR010_PV DR011_DR011_PV DR012_DR012_PV DR013_DR013_PV
1 3.5 3.5 3.5 3.5 3
dmda<-"2021-07-07"
CodeChosse<-"CDE"
> SPV %>%
+ filter(date2 == dmda, Code == CodeChosse) %>%
+ select(-dropnames)
date1 date2 Code DR01_DR01_PV DR02_DR02_PV DR03_DR03_PV DR04_DR04_PV DR05_DR05_PV DR06_DR06_PV DR07_DR07_PV DR08_DR08_PV
1 2021-06-28 2021-07-07 CDE 3 3 3 3 3 3 3 3
The issue in this particular case is that mat1 has no values in it because the 1st row in df1 has non-zero value and filter(cs == 0) drops all the rows.
mat1
#character(0)
You may use any_of which will not give an error if the column does not exist.
library(dplyr)
SPV %>%
filter(date2 == dmda, Code == CodeChosse) %>%
select(-any_of(dropnames))
Related
I have two dataframes :
> df1 <- data.frame(date = as.Date( c( "2021-06-01", "2021-06-02", "2021-06-03", "2021-06-04",
"2021-06-05", "2021-06-06", "2021-06-07", "2021-06-08",
"2021-06-09", "2021-06-10", "2021-06-11", "2021-06-12",
"2021-06-13") ),
temperature = c( 17, 30, 28, 29, 16, 21, 20, 11, 28, 29, 25, 26, 19) )
and
> df2 <- data.frame( ID = c( 1 : 4 ),
date.pose = as.Date(c("2021-06-01", "2021-06-03", "2021-06-06", "2021-06-10") ),
date.withdrawal = as.Date(c("2021-06-02", "2021-06-05", "2021-06-09", "2021-06-13") ) )
I want to store the mean temperature for each period that is in df2 in a new colomn (df2$mean.temperature).
For ID = 1 from df2, the mean temperature would be calculated with the temperatures from 2021-06-01 and 2021-06-02, witch is mean(17, 30)
In other words, I want to get this :
> df2 <- data.frame(ID = c( 1 : 4 ),
date.pose = as.Date( c("2021-06-01", "2021-06-03", "2021-06-06", "2021-06-10") ) ,
date.withdrawal = as.Date( c("2021-06-03", "2021-06-06", "2021-06-10", "2021-06-13") ),
mean.Temperature = c(23.5, 24.3, 20.0, 24.8) )
I'm trying to add the ID from df2 in a new colomn in df1. Once I do that, I could aggregate like this :
> df3 <- aggregate(df1$temperature, list(df1$ID, df2$date.pose), FUN = mean)
I don't know how to add the corresponding ID in df1.
Or maybe there is a better way to do this?
Here's an approach using uncount from tidyr and some joins.
df2 %>%
mutate(days = (date.witdrawal - date.pose + 1) %>% as.integer) %>%
tidyr::uncount(days, .id = "row") %>%
transmute(ID, date = date.pose + row - 1) %>%
left_join(df1) %>%
group_by(ID) %>%
summarize(mean.Temperature = mean(temperature)) %>%
right_join(df2)
Result
# A tibble: 4 × 4
ID mean.Temperature date.pose date.witdrawal
<int> <dbl> <date> <date>
1 1 23.5 2021-06-01 2021-06-02
2 2 24.3 2021-06-03 2021-06-05
3 3 20 2021-06-06 2021-06-09
4 4 24.8 2021-06-10 2021-06-13
Update. thanks to #Jon Spring:
Here is how we could do it:
logic:
join both df's by date after long pivoting df1
arrange by date and fill
then after grouping by ID use summarise with mean()
and re-join finally:
library(dplyr)
library(tidyr)
df2 %>%
pivot_longer(-ID, values_to = "date") %>%
full_join(df1, by= "date") %>%
arrange(date) %>%
fill(ID, .direction = "down") %>%
group_by(ID) %>%
summarise(mean_temp = mean(temperature, na.rm = TRUE)) %>%
left_join(df2, by="ID")
ID mean_temp date.pose date.witdrawal
<int> <dbl> <date> <date>
1 1 23.5 2021-06-01 2021-06-02
2 2 24.3 2021-06-03 2021-06-05
3 3 20 2021-06-06 2021-06-09
4 4 24.8 2021-06-10 2021-06-13
This question is very similar to this one Error in UseMethod("select") : no applicable method for 'select' applied to an object of class "character", what is different is a few things in the return_coef function and it's also that I'm filtering the function by Id, date and Category, and in the answered question, it was just by date and Category.
You will see below that I can generate the coefficients for each Id/date/Category, but when I ask to do it for everyone at once, I get the following error:
Error: Problem with `mutate()` column `coef`.
i `coef = mapply(...)`.
x negative length vectors are not allowed
Executable code:
library(dplyr)
library(tidyverse)
library(lubridate)
library(data.table)
df1<- structure(
list(
Id = c(1, 1, 1, 1),
date1 = c("2022-01-06","2022-01-06","2022-01-06","2022-01-06"),
date2 = c("2022-01-02","2022-01-03","2022-01-09","2022-01-10"),
Week = c("Sunday","Monday","Sunday","Monday"),
Category = c("EFG", "ABC","EFG","ABC"),
DR1 = c(200, 300, 200, 200),
DRM01 = c(300, 300, 300, 300),
DRM02 = c(300, 300, 300, 300),
DRM03 = c(300,300,300,300),
DRM04 = c(300,300,300,300),
DRM05 = c(300,250,350,350)),row.names = c(NA, 4L), class = "data.frame")
return_coef <- function (df1, idd, dmda, CategoryChosse) {
selection = startsWith(names(df1), "DRM0")
df1[selection][is.na(df1[selection])] = 0
data1<-subset(df1,df1$date2<df1$date1)
dt1 <- as.data.table(data1)
cols <- grep("^DRM0", colnames(dt1), value = TRUE)
med <-
dt1[, (paste0(cols, "_PV")) := DR1 - .SD, .SDcols = cols
][, lapply(.SD, median), by = .(Id, Category, Week), .SDcols = paste0(cols, "_PV") ]
SPV<-df1%>%
inner_join(med, by = c('Id','Category', 'Week')) %>%
mutate(across(matches("^DRM0\\d+$"), ~.x +
get(paste0(cur_column(), '_PV')),
.names = '{col}_{col}_PV')) %>%
select(Id:Category, DRM01_DRM01_PV:last_col())%>%
data.frame()
mat1 <- df1 %>%
dplyr::filter(Id==idd, date2 == ymd(dmda), Category == CategoryChosse) %>%
select(starts_with("DRM0")) %>%
pivot_longer(cols = everything()) %>%
arrange(desc(row_number())) %>%
mutate(cs = cumsum(value)) %>%
dplyr::filter(cs == 0) %>%
pull(name)
(dropnames <- paste0(mat1,"_",mat1, "_PV"))
SPV <- SPV %>%
filter(Id==idd,date2 == ymd(dmda), Category == CategoryChosse) %>%
select(-any_of(dropnames))
if(length(grep("DRM0", names(SPV))) == 0) {
SPV[head(mat1,10)] <- NA_real_
}
datas <-SPV %>%
dplyr::filter(Id==idd,date2 == ymd(dmda)) %>%
group_by(Category) %>%
dplyr::summarize(dplyr::across(starts_with("DRM0"), sum)) %>%
pivot_longer(cols= -Category, names_pattern = "DRM0(.+)", values_to = "val") %>%
mutate(name = readr::parse_number(name))
colnames(datas)[-1]<-c("var1","var2")
datas$days <- datas[[as.name("var1")]]
datas$numbers <- datas[[as.name("var2")]]
datas <- datas %>%
group_by(Category) %>%
slice((as.Date(dmda) - min(as.Date(df1$date1) [
df1$Category == first(Category)])):max(days)+1) %>%
ungroup
m<-df1 %>%
group_by(Id,Category,Week) %>%
dplyr::summarize(dplyr::across(starts_with("DR1"), mean), .groups = 'drop')
m<-subset(m, Week == df1$Week[match(ymd(dmda), ymd(df1$date2))] & Category == CategoryChosse)$DR1
if (nrow(datas)<=2){
val<-as.numeric(m)
}
else{
mod <- nls(numbers ~ b1*days^2+b2,start = list(b1 = 0,b2 = 0),data = datas, algorithm = "port")
coef<-coef(mod)[2]
val<-as.numeric(coef(mod)[2])
}
return(val)
}
Find the coef one by one (It works):
return_coef(df1,"1","2022-01-09","EFG")
[1] 200
return_coef(df1,"1","2022-01-10","ABC")
[1] 250
Find all coef at once (Does not work)
subset_df1 <- subset(df1, date2 > date1)
All<-subset_df1%>%
transmute(
Id,date2,Category,
coef = mapply(return_coef, list(cur_data()), Id, as.Date(date2), Category))
Error: Problem with `mutate()` column `coef`.
i `coef = mapply(...)`.
x negative length vectors are not allowed
dput(head(df1))
structure(list(Id = c(1, 1, 1, 1), date1 = structure(c(1641427200,
1641427200, 1641427200, 1641427200), tzone = "UTC", class = c("POSIXct",
"POSIXt")), date2 = structure(c(1641081600, 1641168000, 1641686400,
1641772800), tzone = "UTC", class = c("POSIXct", "POSIXt")),
Week = c("Sunday", "Monday", "Sunday", "Monday"), Category = c("EFG",
"ABC", "EFG", "ABC"), DR1 = c(200, 300, 200, 200), DRM01 = c(300,
300, 300, 300), DRM02 = c(300, 300, 300, 300), DRM03 = c(300,
300, 300, 300), DRM04 = c(300, 300, 300, 300), DRM05 = c(300,
250, 350, 350)), row.names = c(NA, -4L), class = c("tbl_df",
"tbl", "data.frame"))
The function seems to be use df1 as the full data
library(dplyr)
subset_df1 %>%
rowwise %>%
mutate(coef = return_coef(df1, Id, date2, Category)) %>%
ungroup
-output
# A tibble: 2 × 12
Id date1 date2 Week Category DR1 DRM01 DRM02 DRM03 DRM04 DRM05 coef
<dbl> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2022-01-06 2022-01-09 Sunday EFG 200 300 300 300 300 350 200
2 1 2022-01-06 2022-01-10 Monday ABC 200 300 300 300 300 350 250
Update
In the new dataset, the columns 'date1', 'date2' are POSIXct, convert to Date class and it should work
df2 <- df1 %>%
mutate(across(c(date1, date2), as.Date))
subset_df2 <- subset(df2, date2 > date1)
subset_df2 %>%
rowwise %>%
mutate(coef = return_coef(df2, Id, date2, Category)) %>%
ungroup
# A tibble: 2 × 12
Id date1 date2 Week Category DR1 DRM01 DRM02 DRM03 DRM04 DRM05 coef
<dbl> <date> <date> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2022-01-06 2022-01-09 Sunday EFG 200 300 300 300 300 350 200
2 1 2022-01-06 2022-01-10 Monday ABC 200 300 300 300 300 350 250
Or using pmap
library(purrr)
subset_df1 %>%
transmute(Id, date2, Category,
coeff = pmap_dbl(across(c(Id, date2, Category)),
~ return_coef(df1, ..1, ..2, ..3)))
-output
Id date2 Category coeff
3 1 2022-01-09 EFG 200
4 1 2022-01-10 ABC 250
I would like to get PV, but when I run the code it gives the following error appears: Error in FUN(left, right) : non-numeric argument to binary operator when I do cbing. Can you help me?
library(dplyr)
library(tidyverse)
library(lubridate)
df1 <- structure(
list(date1= c("2021-06-28","2021-06-28","2021-06-28","2021-06-28"),
date2 = c("2021-06-30","2021-07-01","2021-07-02","2021-07-02"),
Code = c("FDE","ABC","ABC","FDE"),
Week= c("Wednesday","Wednesday","Friday","Friday"),
DR1 = c(4,1,4,1),
DR01 = c(4,1,4,2), DR02= c(4,2,6,2),DR03= c(9,5,4,2),
DR04 = c(5,4,3,3),DR05 = c(5,4,5,2),
DR06 = c(2,4,3,3),DR07 = c(2,5,4,2),
DR08 = c(0,0,0,4),DR09 = c(0,0,0,2),DR010 = c(0,0,0,2),DR011 = c(4,0,0,2),
DR012 = c(0,0,0,""), DR013 = c(0,0,1,""), DR014 = c(0,0,0,"")),
class = "data.frame", row.names = c(NA, -4L))
x<-df1 %>% select(starts_with("DR0"))
x<-cbind(df1, setNames(df1$DR1 - x, paste0(names(x), "_PV")))
PV<-select(x, date2,Week, Code, DR1, ends_with("PV"))
In x DR012 to DR14 columns as character class. So you can yous type.convert(as.is=TRUE) to solve your problem:
library(dplyr)
library(tidyverse)
library(lubridate)
df1 <- structure(
list(date1= c("2021-06-28","2021-06-28","2021-06-28","2021-06-28"),
date2 = c("2021-06-30","2021-07-01","2021-07-02","2021-07-02"),
Code = c("FDE","ABC","ABC","FDE"),
Week= c("Wednesday","Wednesday","Friday","Friday"),
DR1 = c(4,1,4,1),
DR01 = c(4,1,4,2), DR02= c(4,2,6,2),DR03= c(9,5,4,2),
DR04 = c(5,4,3,3),DR05 = c(5,4,5,2),
DR06 = c(2,4,3,3),DR07 = c(2,5,4,2),
DR08 = c(0,0,0,4),DR09 = c(0,0,0,2),DR010 = c(0,0,0,2),DR011 = c(4,0,0,2),
DR012 = c(0,0,0,""), DR013 = c(0,0,1,""), DR014 = c(0,0,0,"")),
class = "data.frame", row.names = c(NA, -4L))
x<-df1 %>% select(starts_with("DR0")) %>% type.convert(as.is=TRUE)
x<-cbind(df1, setNames(df1$DR1 - x, paste0(names(x), "_PV")))
PV<-select(x, date2,Week, Code, DR1, ends_with("PV"))
PV
output:
date2 Week Code DR1 DR01_PV DR02_PV DR03_PV DR04_PV DR05_PV DR06_PV DR07_PV DR08_PV DR09_PV
1 2021-06-30 Wednesday FDE 4 0 0 -5 -1 -1 2 2 4 4
2 2021-07-01 Wednesday ABC 1 0 -1 -4 -3 -3 -3 -4 1 1
3 2021-07-02 Friday ABC 4 0 -2 0 1 -1 1 0 4 4
4 2021-07-02 Friday FDE 1 -1 -1 -1 -2 -1 -2 -1 -3 -1
DR010_PV DR011_PV DR012_PV DR013_PV DR014_PV
1 4 0 4 4 4
2 1 1 1 1 1
3 4 4 4 3 4
4 -1 -1 NA NA NA
Could you help me to insert the column Category in my generated table? That way I can know specifically the coef for each day and category.
library(purrr)
library(dplyr)
library(tidyverse)
library(lubridate)
df1 <- structure(
list(date1= c("2021-06-28","2021-06-28","2021-06-28","2021-06-28"),
date2 = c("2021-06-30","2021-06-30","2021-07-01","2021-07-01"),
Category = c("FDE","ABC","FDE","ABC"),
Week= c("Wednesday","Wednesday","Friday","Friday"),
DR1 = c(4,1,6,3),
DR01 = c(4,1,4,3), DR02= c(4,2,6,2),DR03= c(9,5,4,7),
DR04 = c(5,4,3,2),DR05 = c(5,4,5,4),
DR06 = c(2,4,3,2),DR07 = c(2,5,4,4),
DR08 = c(3,4,5,4),DR09 = c(2,3,4,4)),
class = "data.frame", row.names = c(NA, -4L))
dates <- subset(df1, date2 > date1, select = date2)$date2
map_dfr(dates, ~ {
datas <- df1 %>%
filter(date2 == ymd(.x)) %>%
summarize(across(starts_with("DR"), sum)) %>%
pivot_longer(everything(), names_pattern = "DR(.+)", values_to = "val") %>%
mutate(name = as.numeric(name))
colnames(datas)<-c("Days","Numbers")
mod <- nls(Numbers ~ b1*Days^2+b2,start = list(b1 = 47,b2 = 0), data = datas)
tibble(dates = .x, coef = coef(mod)[2])
}) %>%
mutate(dates = format(ymd(dates), "%d/%m/%Y"))
# A tibble: 4 x 2
dates coef
<chr> <dbl>
1 30/06/2021 7.89
2 30/06/2021 7.89
3 01/07/2021 7.95
4 01/07/2021 7.95
In this case, it looks like this:
dates Category coef
<chr> <dbl>
1 30/06/2021 FDE 7.89
2 30/06/2021 ABC 7.89
3 01/07/2021 FDE 7.95
4 01/07/2021 ABC 7.95
You can use bind_cols() with a dataset containing the Category column
##Select the Category column as a subset
categories <- subset(df1, date2 > date1, select = Category)
map_dfr(dates, ~ {
datas <- df1 %>%
filter(date2 == ymd(.x)) %>%
summarize(across(starts_with("DR"), sum)) %>%
pivot_longer(everything(), names_pattern = "DR(.+)", values_to = "val") %>%
mutate(name = as.numeric(name))
colnames(datas)<-c("Days","Numbers")
mod <- nls(Numbers ~ b1*Days^2+b2,start = list(b1 = 47,b2 = 0), data = datas)
tibble(dates = .x[[1]], coef = coef(mod)[2])
}) %>% bind_cols(categories) %>% #add the categories to the tibble
mutate(dates = format(ymd(dates), "%d/%m/%Y"))
I built a prediction model using logistic regression which works well. But when I analyze the estimates calculated on the test dataset, I can see the variable I used to stratify the split comes up when I want it to be excluded of the model as a predictor. update_role() doesn't do that...
data_split <- initial_split(mldata, prop = 3/4, strata = strata_var)
# Create training and testing datasets:
train_data <- training(data_split)
test_data <- testing(data_split)
# Build model
mldata_recipe <-
recipe(vital ~ ., data = train_data) %>%
update_role(ids, new_role = "ID") %>%
update_role(strata_var, new_role = "strata") %>%
step_zv(all_predictors()) %>%
step_unknown(all_nominal_predictors()) %>%
step_dummy(all_nominal(), -all_outcomes()) %>%
step_smote(vital)
set.seed(456)
# 10 fold cross validation
mldata_folds <- vfold_cv(train_data, strata = strata_var)
glmnet_spec <-
logistic_reg(penalty = tune(), mixture = tune()) %>%
set_mode("classification") %>%
set_engine("glmnet")
glmnet_workflow <-
workflow() %>%
add_recipe(mldata_recipe) %>%
add_model(glmnet_spec)
glmnet_grid <- tidyr::crossing(penalty = 10^seq(-6, -1, length.out = 20), mixture = c(0, 0.05,
0.2, 0.4, 0.6, 0.8, 1))
set.seed(789)
glmnet_tune <-
tune_grid(glmnet_workflow, resamples = mldata_folds, grid = glmnet_grid)
final_glmnet <- glmnet_workflow %>%
finalize_workflow(select_best(glmnet_tune, "roc_auc"))
glmnet_results <- final_glmnet %>%
fit_resamples(
resamples = mldata_folds,
metrics = metric_set(roc_auc, accuracy, sensitivity, specificity),
control = control_resamples(save_pred = TRUE)
)
set.seed(789)
final_fit <- final_glmnet %>%
last_fit(data_split)
final_fit %>%
pull(.workflow) %>%
pluck(1) %>%
tidy() %>%
filter(term != "(Intercept)") %>%
arrange(desc(abs(estimate))) %>%
filter(abs(estimate) >0) %>%
ggplot(aes(estimate, fct_reorder(term, desc(estimate)), color = estimate > 0))+
geom_vline(xintercept = 0, color = "lightgrey", lty = 2, size = 1.2) +
geom_point() +
scale_color_discrete(name = "Variable Effect \non outcome", labels = c("Deleterious", "Beneficial")) +
theme_minimal()+
ggtitle("Meaningful Parameter Estimate Coefficients using logistic regression model")
In the last plot I can see the strata variable coming up.
You got this result because of the combination of role selection functions you used in step_dummy(). (full reprex at the end of post)
You used the following selections. Which selects all nominal, but not any outcomes. This selected the strata variables because it is both a nominal variable and not an outcome.
all_nominal(), -all_outcomes()
A better option would be to use all_nominal_predictors() which won't select id/strata variables.
library(tidymodels)
data("penguins")
rec_spec1 <- recipe(species ~ island + body_mass_g, data = penguins) %>%
update_role(island, new_role = "strata") %>%
step_dummy(all_nominal(), -all_outcomes())
rec_spec1 %>%
prep() %>%
bake(new_data = NULL)
#> # A tibble: 344 × 4
#> body_mass_g species island_Dream island_Torgersen
#> <int> <fct> <dbl> <dbl>
#> 1 3750 Adelie 0 1
#> 2 3800 Adelie 0 1
#> 3 3250 Adelie 0 1
#> 4 NA Adelie 0 1
#> 5 3450 Adelie 0 1
#> 6 3650 Adelie 0 1
#> 7 3625 Adelie 0 1
#> 8 4675 Adelie 0 1
#> 9 3475 Adelie 0 1
#> 10 4250 Adelie 0 1
#> # … with 334 more rows
rec_spec2 <- recipe(species ~ island + body_mass_g, data = penguins) %>%
update_role(island, new_role = "strata") %>%
step_dummy(all_nominal_predictors())
rec_spec2 %>%
prep() %>%
bake(new_data = NULL)
#> # A tibble: 344 × 3
#> island body_mass_g species
#> <fct> <int> <fct>
#> 1 Torgersen 3750 Adelie
#> 2 Torgersen 3800 Adelie
#> 3 Torgersen 3250 Adelie
#> 4 Torgersen NA Adelie
#> 5 Torgersen 3450 Adelie
#> 6 Torgersen 3650 Adelie
#> 7 Torgersen 3625 Adelie
#> 8 Torgersen 4675 Adelie
#> 9 Torgersen 3475 Adelie
#> 10 Torgersen 4250 Adelie
#> # … with 334 more rows
Full reprex
library(tidymodels)
library(themis)
library(forcats)
data("penguins")
penguins0 <- penguins %>%
mutate(ids = row_number(),
species = factor(species == "Adelie")) %>%
drop_na()
data_split <- initial_split(penguins0, prop = 3/4, strata = island)
# Create training and testing datasets:
train_data <- training(data_split)
test_data <- testing(data_split)
# Build model
mldata_recipe <-
recipe(species ~ ., data = train_data) %>%
update_role(ids, new_role = "ID") %>%
update_role(island, new_role = "strata") %>%
step_zv(all_predictors()) %>%
step_unknown(all_nominal_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
step_smote(species)
set.seed(456)
# 10 fold cross validation
mldata_folds <- vfold_cv(train_data, strata = island)
glmnet_spec <-
logistic_reg(penalty = tune(), mixture = tune()) %>%
set_mode("classification") %>%
set_engine("glmnet")
glmnet_workflow <-
workflow() %>%
add_recipe(mldata_recipe) %>%
add_model(glmnet_spec)
glmnet_grid <- tidyr::crossing(penalty = 10^seq(-6, -1, length.out = 20),
mixture = c(0, 0.05, 0.2, 0.4, 0.6, 0.8, 1))
set.seed(789)
glmnet_tune <-
tune_grid(glmnet_workflow, resamples = mldata_folds, grid = glmnet_grid)
final_glmnet <- glmnet_workflow %>%
finalize_workflow(select_best(glmnet_tune, "roc_auc"))
glmnet_results <- final_glmnet %>%
fit_resamples(
resamples = mldata_folds,
metrics = metric_set(roc_auc, accuracy, sensitivity, specificity),
control = control_resamples(save_pred = TRUE)
)
set.seed(789)
final_fit <- final_glmnet %>%
last_fit(data_split)
final_fit %>%
pull(.workflow) %>%
pluck(1) %>%
tidy() %>%
filter(term != "(Intercept)") %>%
arrange(desc(abs(estimate))) %>%
filter(abs(estimate) >0) %>%
ggplot(aes(estimate, fct_reorder(term, desc(estimate)), color = estimate > 0))+
geom_vline(xintercept = 0, color = "lightgrey", lty = 2, size = 1.2) +
geom_point() +
scale_color_discrete(name = "Variable Effect \non outcome", labels = c("Deleterious", "Beneficial")) +
theme_minimal()+
ggtitle("Meaningful Parameter Estimate Coefficients using logistic regression model")
Created on 2021-08-20 by the reprex package (v2.0.1)