Sparklyr Spark ML Feature Importance after feature transformation

Sparklyr Spark ML Feature Importance after feature transformation - r

How do i chart or extract feature importance after having gone through feature transformation such as below? Or should i have avoided hot encoding? Due to the transformation (hot encoding), i ended up having more variable importance metrics than the columns themselves.
Thanks
Feature Transformation
fea_pipeline <- ml_pipeline(sc) %>%
ft_string_indexer(input_col = "sex", output_col = "sex_indexed") %>%
ft_string_indexer(input_col = "drinks", output_col = "drinks_indexed") %>%
ft_string_indexer(input_col = "drugs", output_col = "drugs_indexed") %>%
ft_one_hot_encoder(
input_cols = c("sex_indexed", "drinks_indexed", "drugs_indexed"),
output_cols = c("sex_encoded", "drinks_encoded", "drugs_encoded")
) %>%
ft_vector_assembler(
input_cols = c("age", "sex_encoded", "drinks_encoded",
"drugs_encoded", "essay_length"),
output_col = "features"
) %>%
ft_standard_scaler(input_col = "features", output_col = "features_scaled",
with_mean = TRUE) %>%
ml_random_forest_classifier(features_col = "features_scaled",
label_col = "not_working")
Hyper Parameter Tuning
# ------ Hyper Param Tuning ---------
grid <- list(
random_forest = list(
num_trees = c(5, 10),
max_depth = c(10, 20)
)
)
cv <- ml_cross_validator(
sc,
estimator = fea_pipeline,
evaluator = ml_binary_classification_evaluator(sc, label_col = "not_working"),
estimator_param_maps = grid,
num_folds = 5)
cv_model <- ml_fit(cv, train_tbl)
Print the metrics
ml_validation_metrics(cv_model)
fitted <- cv_model$best_model
# ------ Variable Importance ------
ml_tree_feature_importance(ml_stage(fitted,7))
output of variable importance gives me this
0.673134090 0.023902744 0.021771300 0.015035223 0.012361712 0.016907567 0.011370478 0.007484832 0.014057235 0.013598873 0.012238969 0.178136976
I clearly have more importance values than the columns after hot encoding the categorical columns
ml_stage(fea_pipeline,5)$param_map$input_cols
as I have only these columns
[1] "age" "sex_encoded" "drinks_encoded" "drugs_encoded" "essay_length"
Scripts to run to reproduce (pre-Feature Transformation step above)
##Data Download
# download.file(
# "https://github.com/r-spark/okcupid/raw/master/profiles.csv.zip",
# "okcupid.zip")
#
# unzip("okcupid.zip", exdir = "data")
# unlink("okcupid.zip")
#load library
library(sparklyr)
library(ggplot2)
library(dbplot)
library(dplyr)
library(tidyr)
# --------- processining of data---------
sc <- spark_connect(master = "local")
okc <- spark_read_csv(
sc,
"data/profiles.csv",
escape = "\"",
memory = FALSE,
options = list(multiline = TRUE)
) %>%
mutate(height = as.numeric(height),
income = ifelse(income == "-1", NA, as.numeric(income))) %>%
mutate(sex = ifelse(is.na(sex), "missing", sex)) %>%
mutate(drinks = ifelse(is.na(drinks), "missing", drinks)) %>%
mutate(drugs = ifelse(is.na(drugs), "missing", drugs)) %>%
mutate(job = ifelse(is.na(job), "missing", job))
okc <- okc %>%
mutate(
not_working = ifelse(job %in% c("student", "unemployed", "retired"), 1 , 0)
)
ethnicities <- c("asian", "middle eastern", "black", "native american", "indian",
"pacific islander", "hispanic / latin", "white", "other")
ethnicity_vars <- ethnicities %>%
purrr::map(~ expr(ifelse(like(ethnicity, !!.x), 1, 0))) %>%
purrr::set_names(paste0("ethnicity_", gsub("\\s|/", "", ethnicities)))
okc <- mutate(okc, !!!ethnicity_vars)
okc <- okc %>%
mutate(
essay_length = char_length(paste(!!!syms(paste0("essay", 0:9))))
) %>%
select(not_working, age, sex, drinks, drugs, essay1:essay9, essay_length)
# --------- pipeline---------
# Partition the data
partition <-
okc %>%
sdf_random_split(train = 0.7, test = 0.3, seed = 1234)
# Create table references
train_tbl <- partition$train
test_tbl <- partition$test

Related

Putting confidence interval on time series graphs

I made time series graph with dygraph package:
library(forecast)
library(dygraphs)
hw <- HoltWinters(ldeaths)
p <- predict(hw, n.ahead = 36, prediction.interval = TRUE)
all <- cbind(ldeaths, p)
f <- forecast(unemp.mod)
dygraph(all, "Deaths from Lung Disease (UK)") %>%
dySeries("ldeaths", label = "Actual") %>%
dySeries(c("p.lwr", "p.fit", "p.upr"), label = "Predicted")
Now I try to add confidence intervals to predictions:
gen_array <- function(forecast_obj){
actuals <- forecast_obj$x
lower <- forecast_obj$lower[,2]
upper <- forecast_obj$upper[,2]
point_forecast <- forecast_obj$mean
cbind(actuals, lower, upper, point_forecast)
}
dygraph(all, main = "graph_title") %>%
dyRangeSelector() %>%
dyRangeSelector(height = 40,
dateWindow = c("2011-04-01", "2019-4-01")) %>%
dySeries(name = "actuals", label = "actual") %>%
dySeries(c("lower","point_forecast","upper"), label = "Predicted") %>%
dyLegend(show = "always", hideOnMouseOut = FALSE) %>%
dyHighlight(highlightCircleSize = 5,
highlightSeriesOpts = list(strokeWidth = 2)) %>%
dyOptions(axisLineColor = "navy", gridLineColor = "grey")
But I get errors:
Error in dySeries(., name = "actuals", label = "actual") :
One or more of the specified series were not found. Valid series names are: ldeaths, p.fit, p.upr, p.lwr
How I can fix it?

How do you pass multiple arguments to a function inside a function?

So I'm trying to make cross tables for multi-response questions with both frequency and counts using expss. I am able to get the result I need by running the following code:
library(expss)
set.seed(1998)
expss_output_viewer()
# Example data set
x <- c("A","B","C")
area <- rep(x,each = 10)
p <- sample(c(0,1),30,replace = T)
q <- sample(c(0,1),30,replace = T)
r <- sample(c(0,1),30,replace = T)
mrdata <- data.frame(area,p,q,r)
# Creating the Table
mrdata %>%
tab_significance_options(keep = "none", sig_labels = NULL, subtable_marks = "greater", mode = "append") %>%
tab_cols(total(), mdset(p,q,r)) %>%
tab_cells(area) %>%
tab_stat_cases(label = "cases") %>%
tab_stat_cpct_responses(label = "%",total_row_position = "none") %>%
tab_pivot(stat_position = "inside_columns") %>% set_caption("Table 1")
However, seeing as this is a lot of code for a single table, I wanted to wrap it into a function to be able to create the tables quickly and without much clutter. I've tried doing it like this:
mrtable <- function(input,rowvar,colvars,capt ="Table 1") {
input %>%
tab_significance_options(keep = "none", sig_labels = NULL, subtable_marks = "greater", mode = "append") %>%
tab_cols(total(), mdset(colvars)) %>%
tab_cells(rowvar) %>%
tab_stat_cases(label = "cases") %>%
tab_stat_cpct_responses(label = "%",total_row_position = "none") %>%
tab_pivot(stat_position = "inside_columns") %>% set_caption(capt)
}
mrtable(input = mrdata,colvars = c(p,q,r),rowvar = area)
Running the function above returns:
Error: 'cro': all variables should be of the same length or length 1.
I can't figure out why it fails. Any help would be appreciated.
EDIT:
got it to work :
mrtable <- function(input,rowvar,...,capt ="Table 1") {
input %>%
tab_significance_options(keep = "none", sig_labels = NULL, subtable_marks = "greater", mode = "append") %>%
tab_cols(total(), mdset(...)) %>%
tab_cells(rowvar) %>%
tab_stat_cases(label = "cases") %>%
tab_stat_cpct_responses(label = "%",total_row_position = "none") %>%
tab_pivot(stat_position = "inside_columns") %>% set_caption(capt)
}
mrtable(input = mrdata,rowvar = area,p,q,r,capt = "Tab")

Is it possible to change the order of columns in heatmaply?

I have created a dendrogram with heatmaply with a dendrogram object from the dendextend package. I was wondering if there is a way to change the order of the columns: vs, am, carb, wt, drat, gear, etc. For example, I want to sort them alphabetically. Is this possible?
library(heatmaply)
library(tidyverse)
library(RColorBrewer)
library(dendextend)
custom_colors <- c("#B6E2D3", "#887BB0", "#FFD53D", "#EF7C8E", "#A1F7A1", "#EFE7BC", "#C26DBC", "#14C2DD",
"#5CD85A", "#AB6B51", "#39918C", "#FFA384", "#57DDF3", "#D0B49F", "#EEB5EB", "#74BDCB")
dend_example <- mtcars %>% dist %>% hclust(method = "ward.D2") %>%
as.dendrogram %>% color_branches(k=5) %>% color_labels
heatmaply_mtcars<- heatmaply(mtcars, hclustfun = hclust, hclust_method = "ward.D2",
Rowv = dend_example,
main = "mtcars",
show_dendrogram = c(TRUE, FALSE),
showticklabels = c(TRUE, TRUE),
scale_fill_gradient_fun = ggplot2::scale_fill_gradient2(high = "#025492"),
file = "heatmaply_mtcars.html",
column_text_angle = 30,
colorbar_thickness = 50)

I included a column dendrogram within heatmaply with the colv argument. The column dendrogram was rotated with the rotate function from the package dendextend:
dend_example <- mtcars %>% dist %>% hclust(method = "ward.D2") %>%
as.dendrogram %>% color_branches(k=5) %>% color_labels
column_dend <- t(mtcars) %>% dist %>% hclust(method = "ward.D2") %>%
as.dendrogram
column_dend <- column_dend %>% rotate(c(vector_with_labels))
heatmaply_mtcars<- heatmaply(mtcars, hclustfun = hclust, hclust_method = "ward.D2",
Rowv = dend_example,
Colv = column_dend,
main = "mtcars",
show_dendrogram = c(TRUE, FALSE),
showticklabels = c(TRUE, TRUE),
scale_fill_gradient_fun = ggplot2::scale_fill_gradient2(high = "#025492"),
file = "heatmaply_mtcars.html",
column_text_angle = 30,
colorbar_thickness = 50)

Creating a user-defined total with grand_summary_rows()

I have a table that looks like this:
category family amount
<chr> <chr> <chr>
1 SALES ONLINE SALES 47
2 SALES IN STORE 72
3 COGS LABOR 28
4 COGS TAXES 35
5 COGS WORKERS COMP 24
6 COGS BENEFITS 33
7 EXPENSE AUTOMOBILE 44
8 EXPENSE RENT 12
9 EXPENSE TELEPHONE 26
I am trying to create a gt table from this so I have created this code:
library(tidyverse)
library(gt)
category <- c(rep("SALES",2),
rep("COGS", 4),
rep("EXPENSE",3)
)
family <- c("ONLINE SALES","IN STORE","LABOR","TAXES","WORKERS COMP","BENEFITS",
"AUTOMOBILE", "RENT","TELEPHONE")
amount <- c(47,72,28,35,24,33,44,12,26)
output <- as_tibble(cbind(category,family,amount)) %>%
mutate(amount= as.numeric(amount)) %>%
gt(rowname_col = 'family',
groupname_col = 'category') %>%
row_group_order(c("SALES","COGS", "EXPENSE")) %>%
summary_rows(groups = TRUE,
columns = 'amount',
fns = list(
Total = ~sum(.,na.rm = TRUE)
))
output
How do you get the overall total of SALES-COGS-EXPENSE using the grand_summary_rows() function while all of the amounts are still positive?

if I have correctly understood your request you can use the code below:
output2 <- as_tibble(cbind(category,family,amount)) %>%
mutate(amount= as.numeric(amount)) %>%
gt(rowname_col = 'family', groupname_col = 'category') %>%
summary_rows(groups = TRUE,
columns = 'amount',
fns = list(
Total = ~sum(.,na.rm = TRUE)
)) %>%
grand_summary_rows(
columns = c("family","category","amount"),
fns = list(
"Grand Total" = ~sum(.,na.rm = TRUE)),
formatter = fmt_number,
use_seps = FALSE
)
output2
################### EDIT ###################
The only way I found from the documentation is to create a custom aggregation function. Here's the full working example with the output printscreen.
customFunc <- function(data) {
salesSum <- sum(subset(data$`_data`[,c("amount")], category == 'SALES'))
cogsSum <- sum(subset(data$`_data`[,c("amount")], category == 'COGS'))
expenseSum <- sum(subset(data$`_data`[,c("amount")], category == 'EXPENSE'))
return (salesSum - cogsSum - expenseSum)
}
data <- as_tibble(cbind(category,family,amount)) %>%
mutate(amount= as.numeric(amount)) %>%
gt(rowname_col = 'family', groupname_col = 'category')
output3 <- data %>%
summary_rows(groups = TRUE,
columns = 'amount',
fns = list(
Total = ~sum(.,na.rm = TRUE)
)) %>%
grand_summary_rows(
columns = c("family","category","amount"),
fns = list(
"Grand Total" = ~customFunc(data)),
formatter = fmt_number,
use_seps = FALSE
)
output3

Argument is not named in hc_add_series

I use the package highcharter to create the plot below with:
library(highcharter)
library(dplyr)
hc <- highchart() %>%
hc_chart(type="column") %>%
hc_xAxis(type="category") %>%
hc_add_series(
name = "Things",
data = list(
list(
name = "Animals",
y = 10,
drilldown = "animals"
),
list(
name = "People",
y = 10,
drilldown = "people"
)
)
)
hc
When I try to create the similar plot fot the sum of Num for every US State I get:
argument is not named in hc_add_series
data
State <- c("ALABAMA", "ALABAMA", "ALASKA", "ALASKA")
Num <- c(5, 6, 7, 8)
d <- data.frame(State, Num)
code
library(highcharter)
library(dplyr)
hc <- highchart() %>%
hc_chart(type="column") %>%
hc_xAxis(type="category") %>%
hc_add_series(
name = "States",
output2 <- d %>% group_by(State) %>%
summarise(Num = sum(Num)) %>%
mutate(drilldown = tolower(State)) %>%
transpose
)
Why does this happen since in both cases Im using a list

I think you can benefit from using hcaes which works like aes in ggplot:
highchart() %>%
hc_chart(type="column") %>%
hc_xAxis(type="category") %>%
hc_add_series(
data = d,
name = "States",
type = "column",
hcaes(x = State, y = Num)
)

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Sparklyr Spark ML Feature Importance after feature transformation - r

Related

Putting confidence interval on time series graphs

How do you pass multiple arguments to a function inside a function?

Is it possible to change the order of columns in heatmaply?

Creating a user-defined total with grand_summary_rows()

Argument is not named in hc_add_series

Categories

Resources