Remove columns with many NA values using mlr3pipelines - r

I am trying to remove columns where the proportion of NA values is greater than the na_cutoff threshold using mlr3pipelines.
Here is my try:
library(mlr3)
library(mlr3pipelines)
# Build a copy of the iris task in which Sepal.Width is missing for the
# first 50 rows.
task = tsk("iris")
dt = task$data()
dt[1:50, Sepal.Width := NA]
task_ = as_task_classif(dt, target = "Species")
# Pipeline: a "removeconstants" step followed by a "select" step whose id is
# "drop_na_cols".
graph = po("removeconstants", id = "removeconstants", ratio = 0.01) %>>%
po("select", id = "drop_na_cols")
# Declare an additional numeric parameter `na_cutoff` in [0, 1] and add it to
# the graph's parameter set. Only a default (0.2) is declared here; no value
# is assigned to it anywhere in this script.
ps = ParamSet$new(list(ParamDbl$new("na_cutoff", lower = 0, upper = 1, default = 0.2)))
graph$param_set$add(ps)
graph$param_set
# Trafo that replaces `na_cutoff` by a selector function for the
# "drop_na_cols" step and then removes `na_cutoff` from the value list.
graph$param_set$trafo = function(x, param_set) {
na_cutoff = x$na_cutoff
print(na_cutoff)
# Selector: keep the feature names whose NA share does not exceed na_cutoff.
x$drop_na_cols.selector = function(task) {
fn = task$feature_names
data = task$data(cols = fn)
drop <- which(colMeans(is.na(data)) > na_cutoff)
# NOTE(review): when no column exceeds the cutoff, `drop` is integer(0) and
# `fn[-drop]` evaluates to character(0), i.e. no feature would be kept.
fn[-drop]
}
x$na_cutoff = NULL
x
}
# Train the graph and show the data produced by the "drop_na_cols" step.
train_res = graph$train(task_)
train_res$drop_na_cols.output$data()
The problem is that the last column is not removed even though it should be.

In general, trafos are not meant for parameter sets.
I.e. internally, when the Graph accesses the parameters, the parameter transformation is not applied.
They are intended to create search spaces for black-box optimization, including hyperparameter optimization of ML models.
Also, modifying the parameter set of an existing Graph is a bad idea.
The way to go I believe is to use the PipeOpSelect with a custom selector: https://mlr3pipelines.mlr-org.com/reference/Selector.html

Following this issue https://github.com/mlr-org/mlr3pipelines/issues/313
I thought the recommended way to do this is through trafo on select pipe.
Nevertheless, I have just created new pipeop that removes columns with many NA values:
library(mlr3pipelines)
library(mlr3verse)
library(mlr3misc)
library(R6)
# PipeOp that removes every feature whose proportion of missing values is
# strictly greater than `cutoff`.
#
# Hyperparameters:
#   cutoff  numeric in [0, 1]; declared default 0.05, but the value is
#           initialised to 0.2 in the constructor.
#
# State (filled by `.get_state()` on the training task):
#   cnames  character vector with the names of the features that are kept.
#
# NOTE(review): presumably PipeOpTaskPreprocSimple applies `.transform()` to
# both the training and the prediction task - confirm against the
# mlr3pipelines documentation.
PipeOpDropNACol = R6::R6Class(
  "PipeOpDropNACol",
  inherit = mlr3pipelines::PipeOpTaskPreprocSimple,
  public = list(
    # id          identifier of the PipeOp inside a Graph.
    # param_vals  named list of hyperparameter values overriding the initial ones.
    initialize = function(id = "drop.nacol", param_vals = list()) {
      ps = ParamSet$new(list(
        ParamDbl$new("cutoff", lower = 0, upper = 1, default = 0.05, tags = c("dropnacol_tag"))
      ))
      ps$values = list(cutoff = 0.2)
      super$initialize(id, param_set = ps, param_vals = param_vals)
    }
  ),
  private = list(
    # Determine which features to keep, based on the NA share in `task`.
    .get_state = function(task) {
      pv = self$param_set$get_values(tags = "dropnacol_tag")
      data = task$data(cols = task$feature_names)
      # One TRUE/FALSE per feature column. isTRUE() turns the NaN comparison
      # of a zero-row column into FALSE, so such a column is kept.
      many_na = vapply(
        data,
        function(column) isTRUE(mean(is.na(column)) > pv$cutoff),
        logical(1)
      )
      # Logical negation is required here: `-many_na` would coerce the flags
      # to 0 / -1 and drop the first column instead of the flagged ones.
      list(cnames = colnames(data)[!many_na])
    },
    # Restrict `task` to the features selected during training.
    .transform = function(task) {
      task$select(self$state$cnames)
    }
  )
)
# no group variable
# Blank out Sepal.Width for the first 50 iris rows and wrap the result in a
# classification task.
dt <- tsk("iris")$data()
dt[seq_len(50), Sepal.Width := NA]
task <- as_task_classif(dt, target = "Species")

# Graph that consists of the NA-column dropper only.
gr <- Graph$new()
gr$add_pipeop(PipeOpDropNACol$new())

# Train, show the transformed training data, then predict on the same task.
result <- gr$train(task)
result[[1]]$data()
gr$predict(task)

Related

mlrCPO - Task conversion TOCPO

I would like to build a CPO for the mlr::makeClassificationViaRegression wrapper. The wrapper builds regression models that predict for the positive class whether a particular example belongs to it (1) or not (-1). It also calculates predicted probabilities using a softmax.
After reading the documentation and vignettes for makeCPOTargetOp, my attempt is as follows:
# Target-operating CPO that converts a two-class classification task into a
# regression task and maps regression predictions back to class predictions.
#
#   cpo.train    stores the task description of the incoming task as control.
#   cpo.retrafo  adds a numeric target column "<positive>.prob" that is 1 for
#                the positive class and -1 otherwise, and builds a regression
#                task on it.
#   cpo.invert   'response': predictions > 0 become the positive class, all
#                others the negative class; otherwise a probability matrix is
#                built with propVectorToMatrix().
cpoClassifViaRegr = makeCPOTargetOp(
  cpo.name = 'ClassifViaRegr',
  dataformat = 'task', #Not sure - will this work if input is df with unknown target values?
  # properties.data = c('numerics', 'factors', 'ordered', 'missings'), #Is this needed?
  properties.adding = 'twoclass', #See https://mlrcpo.mlr-org.com/articles/a_4_custom_CPOs.html#task-type-and-conversion
  properties.needed = character(0),
  properties.target = c('classif', 'twoclass'),
  task.type.out = 'regr',
  predict.type.map = c(response = 'response', prob = 'response'),
  constant.invert = TRUE,
  cpo.train = function(data, target) {
    getTaskDesc(data)
  },
  cpo.retrafo = function(data, target, control) {
    td = getTaskData(target, target.extra = TRUE)
    target.name = paste0(control$positive, ".prob")
    data = td$data
    # Encode the positive class as 1 and everything else as -1. The positive
    # level comes from the task description stored in `control`.
    data[[target.name]] = ifelse(td$target == control$positive, 1, -1)
    makeRegrTask(id = paste0(getTaskId(target), control$positive, '.'),
      data = data,
      target = target.name,
      weights = target$weights,
      blocking = target$blocking)
  },
  cpo.train.invert = NULL, #Since constant.invert = T
  cpo.invert = function(target, control.invert, predict.type) {
    if (predict.type == 'response') {
      # Non-positive predictions must map to the negative class.
      factor(ifelse(target > 0, control.invert$positive, control.invert$negative))
    } else {
      levs = c(control.invert$positive, control.invert$negative)
      # NOTE(review): the function is applied per element of `target`; for a
      # single number x, exp(x) / sum(exp(x)) evaluates to 1 - confirm this
      # is the intended softmax.
      propVectorToMatrix(vnapply(target, function(x) exp(x) / sum(exp(x))), levs)
    }
  })
It seems to work as expected, the demo below shows that the inverted prediction is identical to the prediction obtained using the makeClassificationViaRegr wrapper:
# Compare the ClassificationViaRegression wrapper with the CPO on the sonar
# task: train on rows 1-140, predict rows 141-208, compare the responses.
lrn <- makeLearner("regr.lm")
train.rows <- 1:140
test.rows <- 141:208

# Wrapper -----------------------------------------------------------------
lrn2 <- makeClassificationViaRegressionWrapper(lrn)
model <- train(lrn2, sonar.task, subset = train.rows)
sonar.features <- getTaskData(sonar.task)[test.rows, 1:60]
predictions <- predict(model, newdata = sonar.features)

# CPO ---------------------------------------------------------------------
sonar.train <- subsetTask(sonar.task, train.rows)
sonar.test <- subsetTask(sonar.task, test.rows)
trafd <- sonar.train %>>% cpoClassifViaRegr()
mod <- train(lrn, trafd)
retr <- sonar.test %>>% retrafo(trafd)
pred <- predict(mod, retr)
invpred <- invert(inverter(retr), pred)

# TRUE when both approaches give the same class predictions.
identical(predictions$data$response, invpred$data$response)
The problem is that after the CPO has converted the task from twoclass to regr, there is no way for me to specify predict.type = 'prob'. In the case of the wrapper, the properties of the base regr learner are modified to accept predict.type = prob (see here). But the CPO is unable to modify the learner in this way, so how can I tell my model to return predicted probabilities instead of the predicted response?
I was thinking I could specify a include.prob parameter, i.e. cpoClassifViaRegr(include.prob = T). If set to TRUE, the cpo.invert returns the predicted probabilities in addition to the predicted response. Would something like this work?

Only Table in rpivotTable

I'm using the rpivotTable package in Shiny application and I'd like to have only the choice of 'Table' for the users (no charts)
The rendererName argument is only used to choose the default display...
# Pivot table over iris; "Table" is the renderer selected initially.
output$pivot <- renderRpivotTable({
  rpivotTable(data = iris, rendererName = "Table")
})
Many thanks in advance !
There are multiple issues here.
You can specify renderers via the renderers argument, which rpivotTable() accepts through its `...` argument. I got the JS code from here.
however, there is a bug when only selecting one option. In this case, rpivotTable() wraps the argument in a list again (see the Map() call in the original function code) and the forwarding to JS fails.
Therefore, I accounted for this issue and extended the function a bit. Play around with aggregators/renderers to see how it behaves differently to the original rpivotTable() function.
# define own function
#
# Variant of rpivotTable() that does not re-wrap a length-one list of
# htmlwidgets::JS() objects, so a single custom renderer or aggregator can be
# passed through `...`.
#
# Arguments:
#   data       data.frame, data.table, table, structable or ftable; the last
#              three are converted with as.data.frame().
#   rows, cols, aggregatorName, vals, rendererName, sorter, ...
#              pivot parameters forwarded to the widget; NULL entries are
#              dropped.
#   exclusions, inclusions
#              forwarded unchanged (never wrapped in an extra list).
#   locale, subtotals, width, height, elementId
#              passed on to the widget / htmlwidgets::createWidget().
#
# Returns the object created by htmlwidgets::createWidget().
my_rpivotTable <- function(data, rows = NULL, cols = NULL, aggregatorName = NULL,
                           vals = NULL, rendererName = NULL, sorter = NULL, exclusions = NULL,
                           inclusions = NULL, locale = "en", subtotals = FALSE, ...,
                           width = 800, height = 600, elementId = NULL) {
  if (!inherits(data, c("data.frame", "data.table", "table", "structable", "ftable"))) {
    stop("data should be a data.frame, data.table, or table", call. = FALSE)
  }
  if (inherits(data, c("table", "structable", "ftable"))) {
    data <- as.data.frame(data)
  }
  params <- list(rows = rows, cols = cols, aggregatorName = aggregatorName,
                 vals = vals, rendererName = rendererName, sorter = sorter,
                 ...)
  params <- lapply(params, function(p) {
    # Wrap length-one parameters in a list unless the single element is a
    # JS() object. inherits() is used because comparing class() with `!=`
    # yields a vector for multi-class objects, which `&&` rejects.
    if (length(p) == 1 && !inherits(p[[1]], "JS_EVAL")) {
      p <- list(p)
    }
    p
  })
  par <- list(exclusions = exclusions, inclusions = inclusions)
  params <- c(params, par)
  params <- Filter(Negate(is.null), params)
  x <- list(data = data, params = params, locale = locale,
            subtotals = subtotals)
  htmlwidgets::createWidget(name = "rpivotTable", x, width = width,
                            height = height, elementId = elementId, package = "rpivotTable")
}
# create the pivot table
# Offer only the "Sum" and "Count" aggregators and a single renderer, which is
# the stock "Table" renderer exposed under the name "fancyTable".
offered_aggregators <- list(
  Sum = htmlwidgets::JS('$.pivotUtilities.aggregators["Sum"]'),
  Count = htmlwidgets::JS('$.pivotUtilities.aggregators["Count"]')
)
offered_renderers <- list(
  fancyTable = htmlwidgets::JS('$.pivotUtilities.renderers["Table"]')
)
my_rpivotTable(
  expand.grid(LETTERS, 1:3),
  aggregatorName = "Count",
  aggregators = offered_aggregators,
  rendererName = "fancyTable",
  renderers = offered_renderers
)

Create a script with 2 vectors in R

I'm using Heatmap from the package complexheatmap
in the script, I need to create a variable ha_column that I will incorporate into my script.
# Column annotation: three groups of five columns each, one colour per group.
# `col` is an argument of HeatmapAnnotation() itself, not an element of the
# type1 vector, and each group name needs its own colour entry.
ha_column = HeatmapAnnotation(
  df = data.frame(type1 = c(rep("name1", 5), rep("name2", 5), rep("name3", 5))),
  col = list(type1 = c("name1" = "#DCDCDC", "name2" = "#DC928B", "name3" = "#BA72D3"))
)
I have 2 vectors:
# Group names and their colours; the two vectors are matched by position.
vectors1=c("name1","name2","name3")
vectors2=c("#DCDCDC","#DC928B","#BA72D3")
and the idea is to reproduce the above script with these two vectors.
I tried:
# Attempt at generating the call as text. paste0() is vectorised over
# vectors1 and vectors2, so it returns one string per element instead of a
# single combined expression.
paste0("ha_column = HeatmapAnnotation(df = data.frame(type1 = c(rep(",vectors1,", 5),col = list(type1 = c(",vectors1,"=",vectors2,")))")
but it only pastes line by line, such as:
[1] "ha_column = HeatmapAnnotation(df = data.frame(type1 = c(rep(name1, 5),col = list(type1 = c(name1=#DCDCDC)))"
[2] "ha_column = HeatmapAnnotation(df = data.frame(type1 = c(rep(name2, 5),col = list(type1 = c(name2=#DC928B)))"
[3] "ha_column = HeatmapAnnotation(df = data.frame(type1 = c(rep(name3, 5),col = list(type1 = c(name3=#BA72D3)))"
instead of doing what I want ...
Does anyone have an idea?
Thanks for your time.
It's generally not a good idea to build code as a string. Instead think of building a function to do what you want.
You could do something like
# Build a HeatmapAnnotation in which every entry of `names` labels five
# consecutive columns and is coloured with the entry of `colors` at the same
# position.
ha_column_fun <- function(names, colors) {
  type_labels <- data.frame(type1 = rep(names, each = 5))
  type_palette <- list(type1 = setNames(colors, names))
  HeatmapAnnotation(df = type_labels, col = type_palette)
}
And then you could call it with
# Group names first, their colours second.
ha_column <- ha_column_fun(names = vectors1, colors = vectors2)

Editing a function from a package in R?

I am using the referenceIntervals package in R, to do some data analytics.
In particular I am using the refLimit function, which calculates reference and confidence intervals. I want to edit it to remove certain functionality (for instance, it runs a Shapiro normality test, which stops the entire code if the data has more than 5000 values, and it won't allow you to parametrically test samples smaller than 120). To do this I have been typing refLimit into the terminal, copying the function definition, and then saving it as a separate file (below is the full original definition of the function).
# Compute a reference interval for `data` together with confidence intervals
# around its lower and upper limit.
#
# Arguments:
#   data        numeric vector of observations.
#   dname       label that is stored unchanged in the result.
#   out.method  outlier detection: "dixon", "cook" or "vanderLoo"; any other
#               value uses horn.outliers().
#   out.rm      if TRUE, `data` is replaced by the outlier-free subset.
#   RI          reference interval method: "p" (parametric), "n"
#               (nonparametric) or "r" (robust).
#   CI          confidence interval method: "p", "n" or "boot"; it may be
#               overridden below depending on RI and the sample size.
#   refConf     coverage used for the reference interval.
#   limitConf   confidence level used for the intervals around the limits.
#
# Returns a list of class "interval" with the elements size, dname,
# out.method, out.rm, outliers, methodRI, methodCI, norm, refConf, limitConf,
# Ref_Int and Conf_Int.
#
# The helpers dixon.outliers, cook.outliers, vanderLoo.outliers,
# horn.outliers, nonparRI, robust and the object nonparRanks are not defined
# in this function; they have to be available in the enclosing environment.
singleRefLimit =
function (data, dname = "default", out.method = "horn", out.rm = FALSE,
RI = "p", CI = "p", refConf = 0.95, limitConf = 0.9)
{
# Outlier detection; horn.outliers() is the fallback for unknown methods.
if (out.method == "dixon") {
output = dixon.outliers(data)
}
else if (out.method == "cook") {
output = cook.outliers(data)
}
else if (out.method == "vanderLoo") {
output = vanderLoo.outliers(data)
}
else {
output = horn.outliers(data)
}
# Outliers are only removed on request but are always reported.
if (out.rm == TRUE) {
data = output$subset
}
outliers = output$outliers
# Summary statistics of the (possibly reduced) data. The local variables
# `mean` and `sd` carry the same names as the functions that compute them.
n = length(data)
mean = mean(data, na.rm = TRUE)
sd = sd(data, na.rm = TRUE)
norm = NULL
# Nonparametric reference interval: sort the data and delegate to nonparRI().
if (RI == "n") {
methodRI = "Reference Interval calculated nonparametrically"
data = sort(data)
holder = nonparRI(data, indices = 1:length(data), refConf)
lowerRefLimit = holder[1]
upperRefLimit = holder[2]
# A parametric CI is replaced by the nonparametric one in this case.
if (CI == "p") {
CI = "n"
}
}
# Robust reference interval; the confidence intervals are always bootstrapped.
if (RI == "r") {
methodRI = "Reference Interval calculated using Robust algorithm"
holder = robust(data, 1:length(data), refConf)
lowerRefLimit = holder[1]
upperRefLimit = holder[2]
CI = "boot"
}
# Parametric reference interval: mean +/- normal quantile * sd, with
# confidence limits built from the standard error `se`.
if (RI == "p") {
methodRI = "Reference Interval calculated parametrically"
methodCI = "Confidence Intervals calculated parametrically"
refZ = qnorm(1 - ((1 - refConf)/2))
limitZ = qnorm(1 - ((1 - limitConf)/2))
lowerRefLimit = mean - refZ * sd
upperRefLimit = mean + refZ * sd
se = sqrt(((sd^2)/n) + (((refZ^2) * (sd^2))/(2 * n)))
lowerRefLowLimit = lowerRefLimit - limitZ * se
lowerRefUpperLimit = lowerRefLimit + limitZ * se
upperRefLowLimit = upperRefLimit - limitZ * se
upperRefUpperLimit = upperRefLimit + limitZ * se
# Normality diagnostics, formatted as text for the result.
# NOTE(review): shapiro.test() is called unconditionally here; it rejects
# sample sizes outside its supported range (reported as 5000 in the
# accompanying question) - confirm.
shap_normalcy = shapiro.test(data)
shap_output = paste(c("Shapiro-Wilk: W = ", format(shap_normalcy$statistic,
digits = 6), ", p-value = ", format(shap_normalcy$p.value,
digits = 6)), collapse = "")
ks_normalcy = suppressWarnings(ks.test(data, "pnorm",
m = mean, sd = sd))
ks_output = paste(c("Kolmorgorov-Smirnov: D = ", format(ks_normalcy$statistic,
digits = 6), ", p-value = ", format(ks_normalcy$p.value,
digits = 6)), collapse = "")
# Both branches assign the same value, so the test outcome does not change
# what is stored in `norm`.
if (shap_normalcy$p.value < 0.05 | ks_normalcy$p.value <
0.05) {
norm = list(shap_output, ks_output)
}
else {
norm = list(shap_output, ks_output)
}
}
# Nonparametric confidence intervals need at least 120 observations;
# otherwise the method is switched to bootstrapping.
if (CI == "n") {
if (n < 120) {
cat("\nSample size too small for non-parametric confidence intervals, \n \t\tbootstrapping instead\n")
CI = "boot"
}
else {
methodCI = "Confidence Intervals calculated nonparametrically"
# Look up the rank pair for this sample size in nonparRanks.
ranks = nonparRanks[which(nonparRanks$SampleSize ==
n), ]
lowerRefLowLimit = data[ranks$Lower]
lowerRefUpperLimit = data[ranks$Upper]
upperRefLowLimit = data[(n + 1) - ranks$Upper]
upperRefUpperLimit = data[(n + 1) - ranks$Lower]
}
}
# Bootstrap confidence intervals; only run for the nonparametric and robust
# reference intervals.
if (CI == "boot" & (RI == "n" | RI == "r")) {
methodCI = "Confidence Intervals calculated by bootstrapping, R = 5000"
if (RI == "n") {
bootresult = boot::boot(data = data, statistic = nonparRI,
refConf = refConf, R = 5000)
}
if (RI == "r") {
bootresult = boot::boot(data = data, statistic = robust,
refConf = refConf, R = 5000)
}
# index = 1 refers to the lower reference limit, index = 2 to the upper one.
bootresultlower = boot::boot.ci(bootresult, conf = limitConf,
type = "basic", index = 1)
bootresultupper = boot::boot.ci(bootresult, conf = limitConf,
type = "basic", index = 2)
# presumably elements 4 and 5 of `$basic` are the interval end points -
# verify against the boot.ci documentation.
lowerRefLowLimit = bootresultlower$basic[4]
lowerRefUpperLimit = bootresultlower$basic[5]
upperRefLowLimit = bootresultupper$basic[4]
upperRefUpperLimit = bootresultupper$basic[5]
}
# Assemble the result object.
# NOTE(review): methodRI is only assigned when RI is "n", "r" or "p"; any
# other value leaves it undefined at this point.
RVAL = list(size = n, dname = dname, out.method = out.method,
out.rm = out.rm, outliers = outliers, methodRI = methodRI,
methodCI = methodCI, norm = norm, refConf = refConf,
limitConf = limitConf, Ref_Int = c(lowerRefLimit = lowerRefLimit,
upperRefLimit = upperRefLimit), Conf_Int = c(lowerRefLowLimit = lowerRefLowLimit,
lowerRefUpperLimit = lowerRefUpperLimit, upperRefLowLimit = upperRefLowLimit,
upperRefUpperLimit = upperRefUpperLimit))
class(RVAL) = "interval"
return(RVAL)
}
However when I then execute this file a large number of terms end up being undefined, for instance when I use the function I get 'object 'nonparRanks' not found'.
How do I edit the function in the package? I have looked at trying to import the package namespace and environment but this has not helped. I have also tried to find the actual function in the package files in my directory, but have not been able to.
I am reasonably experienced in R, but I have never had to edit a package before. I am clearly missing something about how functions are defined in packages, but I am not sure what.
In the beginning of the package there is a line
# Load the `sysdata` data set into the environment this line is evaluated in.
data(sysdata, envir=environment())
See here: https://github.com/cran/referenceIntervals/tree/master/data/sysdata.rda
I suspect that "nonparRanks" is defined there as I don't see it defined anywhere else. So perhaps you could download that file, write your own function, then run that same line before running your function and it may work.
EDIT:
Download the file then run:
# Restore the objects stored in the downloaded file; adjust the path to
# wherever sysdata.rda was saved.
load("C:/sysdata.rda")
With your path to the file and then your function will work.
nonparRanks is a data set (a lookup table, not a function) in the referenceIntervals package:
Table that dictates the ranks for the confidence intervals
around the calculated reference interval
Your method of saving and editing the function is fine, but make sure you load all the necessary underlying functions to run it too.
The easiest thing to do might be to:
save your copied and pasted R function as a different name, e.g. singleRefLimit2, then
call library("referenceIntervals"), which will load all the underlying functions you need and then
load your function with source("singleRefLimit2.R"), with whatever edits you choose to make.

Create argument list using lapply for do.call

I'm trying to pass a set of modified arguments from a larger function to arguments in a nested function. This is an argument supplied from the larger function:
# Time-dependent covariates: names are column names, values are display labels.
time_dep_covariates_list <- c(
  therapy_start = "Start of Therapy",
  therapy_end = "End of Therapy"
)
I have these sets of constant arguments:
# Constant tmerge() arguments, kept as unevaluated symbols and calls so that
# they can be handed to do.call() later.
tmerge_args_1 <- list(
  data1 = quote(analytic_dataset),
  data2 = quote(analytic_dataset),
  id = quote(patientid),
  tstop = quote(adv_dx_to_event),
  death_censor = quote(event(adv_dx_to_event))
)
And I want to append these modified arguments to that argument list:
# Attempt to build one "name = tdc(name)" argument per time-dependent
# covariate. The value returned for each element is a character string, not
# a language object.
tmerge_args_2 <- lapply(1:length(time_dep_covariates_list), function(x){
# NOTE(review): `tmerge_args` is not defined in the code shown here; this
# line appends to it in an enclosing environment and does not affect the
# value returned by lapply().
tmerge_args <<- c(tmerge_args, alist('var' = tdc(var)) )
# NOTE(review): the string literal below spans two lines, so it contains a
# line break between " =" and "tdc(".
paste0(names(time_dep_covariates_list[x])," =
tdc(",names(time_dep_covariates_list[x]), ")")
})
> tdc_args
[[1]]
[1] "therapy_start = tdc(therapy_start)"
[[2]]
[1] "therapy_end = tdc(therapy_end)"
I want to create a do.call that handles the arguments like so:
# Desired call: hand the constant and the generated arguments to tmerge()
# as a single argument list.
count_process_form <- do.call(tmerge, args = c(tmerge_args_1,
                                               tmerge_args_2))
That would be identical to the following:
# The explicit call that the do.call() above is meant to reproduce.
tmerge(data1 = analytic_dataset, data2 = analytic_dataset,
       id = patientid, tstop = adv_dx_to_event,
       therapy_start = tdc(therapy_start), therapy_end = tdc(therapy_end))
It works fine with tmerge_args_1 by itself, but as the args_2 are character and not language elements, I get this error:
Error in (function (data1, data2, id, ..., tstart, tstop, options) :
all additional argments [sic] must have a name:
How can I modify the list I'm creating for args_2 so they're stored as arguments that do.call can understand? Or am I approaching this all wrong?
Thanks!
Here is a reproducible example:
# Five randomly generated patients for the reproducible example.
analytic_dataset <- data_frame(
  patientid = sample(1:1000, 5),
  adv_dx_to_event = sample(100:200, 5),
  death_censor = sample(0:1, 5, replace = TRUE),
  therapy_start = sample(1:20, 5),
  therapy_stop = sample(40:100, 5)
)
The below would be passed in from a function:
# Names are the covariate columns, values are display labels.
# NOTE(review): the example data above has a column `therapy_stop`, while
# this vector refers to `therapy_end`.
time_dep_covariates_list = c(therapy_start = "Start of Therapy",
therapy_end = "End of Therapy")
# Constant tmerge() arguments, stored unevaluated.
tmerge_args_1 <- alist(data1 = analytic_dataset,
data2 = analytic_dataset,
id = patientid,
tstop = adv_dx_to_event,
death_censor = event(adv_dx_to_event))
do.call(tmerge,tmerge_args_1) #this works
# Builds one character string per covariate; the strings are neither named
# nor language objects.
tmerge_args_2 <- lapply(1:length(time_dep_covariates_list), function(x){
# NOTE(review): `tmerge_args` is not defined in the code shown here.
tmerge_args <<- c(tmerge_args, alist('var' = tdc(var)) )
paste0(names(time_dep_covariates_list[x])," = tdc(",names(time_dep_covariates_list[x]), ")")
})
# Here tmerge_args_2 is passed as the third positional argument of do.call()
# instead of being combined with tmerge_args_1 into one argument list.
do.call(tmerge,tmerge_args_1,tmerge_args_2) # this doesn't

Resources