Related
I am trying to remove columns where the proportion of NA values is greater than the na_cutoff threshold, using mlr3pipelines.
Here is my try:
library(mlr3)
library(mlr3pipelines)
# Iris task with the first 50 values of Sepal.Width replaced by NA
task <- tsk("iris")
dt <- task$data()
dt[1:50, Sepal.Width := NA]
task_ <- as_task_classif(dt, target = "Species")

# Pipeline: removeconstants followed by a select PipeOp named "drop_na_cols"
graph <- po("removeconstants", id = "removeconstants", ratio = 0.01) %>>%
  po("select", id = "drop_na_cols")

# Extra `na_cutoff` hyperparameter added to the graph's parameter set
ps <- ParamSet$new(list(
  ParamDbl$new("na_cutoff", lower = 0, upper = 1, default = 0.2)
))
graph$param_set$add(ps)
graph$param_set

# Trafo meant to turn `na_cutoff` into a selector for "drop_na_cols":
# the selector returns the feature names left after removing those whose
# NA share is above the cutoff.
graph$param_set$trafo <- function(x, param_set) {
  na_cutoff <- x$na_cutoff
  print(na_cutoff)
  x$drop_na_cols.selector <- function(task) {
    feature_names <- task$feature_names
    feature_data <- task$data(cols = feature_names)
    drop_idx <- which(colMeans(is.na(feature_data)) > na_cutoff)
    feature_names[-drop_idx]
  }
  x$na_cutoff <- NULL
  x
}

# Train the graph and inspect the data leaving the select PipeOp
train_res <- graph$train(task_)
train_res$drop_na_cols.output$data()
The problem is that the last column is not removed even though it should be.
In general, trafos are not meant for the parameter sets of a Graph or PipeOp.
I.e. internally, when the Graph accesses the parameters, the parameter transformation is not applied.
They are intended to create search spaces for black-box optimization, including hyperparameter optimization of ML models.
Also, modifying the parameter set of an existing Graph is a bad idea.
The way to go I believe is to use the PipeOpSelect with a custom selector: https://mlr3pipelines.mlr-org.com/reference/Selector.html
Following this issue https://github.com/mlr-org/mlr3pipelines/issues/313
I thought the recommended way to do this was through a trafo on the select PipeOp.
Nevertheless, I have just created a new PipeOp that removes columns with many NA values:
library(mlr3pipelines)
library(mlr3verse)
library(mlr3misc)
library(R6)
# PipeOp that drops feature columns containing too many missing values.
#
# During training, the share of NA entries is computed for every feature of
# the task. Features whose share is strictly greater than the `cutoff`
# hyperparameter are dropped; the names of the retained features are stored
# in the state (`cnames`) and the same selection is applied in `.transform()`.
#
# Hyperparameter:
#   cutoff  numeric in [0, 1]; maximum tolerated proportion of NA values per
#           feature. Note: the declared default is 0.05, but the value is
#           initialized to 0.2 in the constructor.
PipeOpDropNACol <- R6::R6Class(
  "PipeOpDropNACol",
  inherit = mlr3pipelines::PipeOpTaskPreprocSimple,
  public = list(
    # id:         identifier of the PipeOp.
    # param_vals: named list of hyperparameter values, forwarded to the
    #             parent constructor.
    initialize = function(id = "drop.nacol", param_vals = list()) {
      ps <- ParamSet$new(list(
        ParamDbl$new(
          "cutoff",
          lower = 0, upper = 1, default = 0.05,
          tags = c("dropnacol_tag")
        )
      ))
      ps$values <- list(cutoff = 0.2)
      super$initialize(id, param_set = ps, param_vals = param_vals)
    }
  ),
  private = list(
    # Determine which features to keep; returns list(cnames = <kept names>).
    .get_state = function(task) {
      pv <- self$param_set$get_values(tags = "dropnacol_tag")
      features_names <- task$feature_names
      data <- task$data(cols = features_names)
      # vapply() always yields a numeric vector, even for zero features.
      na_share <- vapply(
        data,
        function(column) mean(is.na(column)),
        numeric(1)
      )
      # A task without rows gives NaN shares; treat those as "not too many".
      many_na <- !is.na(na_share) & na_share > pv$cutoff
      # Logical negation, not `-many_na`: negating a logical vector yields
      # 0/-1 indices, which removes the first column (or every column when
      # nothing is flagged) instead of the flagged ones.
      list(cnames = colnames(data)[!many_na])
    },
    # Restrict the task to the features selected during training.
    .transform = function(task) {
      task$select(self$state$cnames)
    }
  )
)
# no group variable
# Demo data: iris with the first 50 values of Sepal.Width set to NA
task <- tsk("iris")
dt <- task$data()
dt[1:50, Sepal.Width := NA]
task <- as_task_classif(dt, target = "Species")

# Graph consisting only of the NA-column filter
gr <- Graph$new()$add_pipeop(PipeOpDropNACol$new())

# Train, look at the data of the first output, then predict on the same task
result <- gr$train(task)
result[[1]]$data()
gr$predict(task)
I have a complicated equation for which I have written the code as follows:
# Model constants read as globals by f()
sigma <- 1.336449027 # degree of product substitutability
f_t <- 0.500185113
alpha <- 0.364 # elasticity of capital
beta <- 0.115 # elasticity of labor
R <- 3.131696599
chi <- 0.5
M <- log(1056)
W <- log(29448.08908)
P <- 3.0686
aval <- 1.25
# A variable named `c`; a call written as c(...) still resolves to base::c()
c <- 0.5
# Bankruptcy condition as a function of b.
#
# Reads the globals sigma, beta, alpha, R, P, W, aval, f_t, c and chi, plus
# kval, kprimeval and dval, which must be defined before the call.
#
# b: numeric value(s) at which to evaluate the condition.
#
# Returns profit - chi * dval; a root in b is a value where this equals zero.
f <- function(b) {
  loutpow <- sigma / (beta * (sigma - 1) - sigma)
  lconst1 <- sigma / (beta * (sigma - 1))
  lconst2 <- (aval * kval^alpha)^((1 - sigma) / sigma)
  lconst3 <- (R * P^(sigma - 1))^(1 / sigma)
  lval <- (W / b * lconst2 / lconst3 * lconst1)^loutpow

  profit_first_term <- lconst3 *
    (aval * kval^alpha * lval^beta)^(1 - (1 / sigma))
  # The quadratic term is scaled by the coefficient `c`. Writing
  # `c(kval - kprimeval)^2` would call base::c() and drop the coefficient.
  profit_middle_terms <- kval - kprimeval - f_t * kprimeval -
    c * (kval - kprimeval)^2
  profit_last_term <- W / b * lval
  profit <- profit_first_term + profit_middle_terms - profit_last_term

  # Return the value visibly instead of as the result of an assignment.
  profit - chi * dval
}
For a range of kval, kprimeval and dval from 1 to 10000, I want to find the roots of this equation, that is, the values of b for which it equals zero. It is possible that for some values of kval, kprimeval and dval no root exists.
Apparently your function does not have a zero:
# Plot f over the interval [-1, 1e9] to look for a zero crossing
curve(f(x), -1, 1e9)
I have some issues using the package AlgDesign. I want to create a design using the Fedorov exchange algorithm. Unfortunately, I run into a confusing error:
Error in apply(data[, numericColumn], 2, mean) :
dim(X) must have a positive length
The error occurs with different orders of the grid variables.
# Candidate grid with one numeric factor (x1: 0..0.5 divided by 0.5) and
# three categorical factors. This variant triggers the error quoted above.
# NOTE(review): x1 is the only numeric column here, so `data[, numericColumn]`
# in the error message presumably collapses to a plain vector - confirm.
cand.list <- expand.grid(
  x1 = scale(
    as.data.frame(c(0, 0.1, 0.2, 0.3, 0.4, 0.5)),
    center = 0.0,
    scale = 0.5
  ),
  x2 = c("PMX", "MOC", "OC", "OX2", "POS", "CX", "UX"),
  x4 = c("Swap", "Invert", "Memetic 2-opt", "Memetic k-opt", "Memetic VNS"),
  x5 = c("A", "B")
)

# Design search over all main effects and interactions of x1, x2, x5, x4
federovDesign <- optFederov(
  ~ x1 * x2 * x5 * x4,
  data = cand.list,
  nullify = 1,
  nRepeats = 40,
  center = TRUE
)
This code produces the error stated above. If I use the following code, everything works fine.
# Same candidate grid plus a second numeric factor x3; x3 is part of the
# grid only and does not appear in the model formula below.
cand.list <- expand.grid(
  x1 = scale(
    as.data.frame(c(0, 0.1, 0.2, 0.3, 0.4, 0.5)),
    center = 0.0,
    scale = 0.5
  ),
  x2 = c("PMX", "MOC", "OC", "OX2", "POS", "CX", "UX"),
  x3 = c(50, 100, 150, 200),
  x4 = c("Swap", "Invert", "Memetic 2-opt", "Memetic k-opt", "Memetic VNS"),
  x5 = c("A", "B")
)

# Identical design search as before; only the candidate grid differs
federovDesign <- optFederov(
  ~ x1 * x2 * x5 * x4,
  data = cand.list,
  nullify = 1,
  nRepeats = 40,
  center = TRUE
)
I just added another variable (x3). The other variables remain unchanged, but the error disappears. I observe this strange behavior every time I use the package. With a little luck I may get it to work by trying different variable orders within my grid; however, I don't understand the underlying concept.
I looked at the original code of the function on GitHub, https://github.com/jvbraun/AlgDesign/blob/master/R/FederovOpt.R, and ran it step by step with my first cand.list; no error occurs:
# Step-by-step run of the optFederov() body from GitHub on the first
# (failing) candidate grid; x3 stays commented out so x1 is the only numeric
# column.
cand.list = expand.grid(x1 = scale(as.data.frame(c(0, 0.1, 0.2, 0.3, 0.4, 0.5)), center = 0.0, scale = 0.5),
x2 = c("PMX", "MOC","OC","OX2","POS","CX","UX"),
# x3 = c(50, 100,150,200),
x4 = c("Swap","Invert","Memetic 2-opt","Memetic k-opt","Memetic VNS"),
x5 = c("A","B")
)
# Stand-ins for the function arguments `data` and `frml`
data = cand.list
frml<-~x1*x2*x4*x5
# Make sure a random seed exists, then remember it
if (!exists(".Random.seed"))
set.seed(555111666)
seed<-.Random.seed
# Argument checks copied from the function body
if (missing(frml) || !inherits(frml,c("formula","character"))) {
if (missing(data))
stop("frml and data cannot both be missing.")
frml<-~.
}
if (missing(data)) {
# Create a data matrix from the global variables in frml
frmla<-formula(paste("~-1+",paste(all.vars(frml),sep="",collapse="+"),sep=""))
data<-data.frame(model.matrix(frmla,data))
}else {
if (!inherits(data,"data.frame")) {
# to insure the columns are named
data<-data.frame(data)
if (ncol(data)==1)
colnames(data)<-"X1"
}
}
# Logical mask marking the numeric columns of `data`
numericColumn<-sapply(data,is.numeric)
# expand.formula() is an AlgDesign helper; presumably it expands shorthand
# in the formula - TODO confirm
frml<-expand.formula(frml,colnames(data),numerics=numericColumn)
X<-model.matrix(frml,data)
# Result of this call is not assigned
model.matrix.default(frml,data)
# Center the numeric columns on their means. `drop=FALSE` keeps the subset a
# data frame even when only one column is numeric; the error message quoted
# in the question shows the same apply() call without it.
means<-apply(data[,numericColumn,drop=FALSE],2,mean)
data[,numericColumn]<-sweep(data[,numericColumn,drop=FALSE],2,means)
# Rebuild the model matrix from the centered data
frml<-expand.formula(frml,colnames(data),numerics=numericColumn)
X<-model.matrix(frml,data)
# N candidate rows, k model-matrix columns; nTrials is set to k + 5
N <- nrow(X)
k <- ncol(X)
nRound<-0
nTrials<-k+5
if (nTrials<k)
stop("nTrials must be greater than or equal to the number of columns in expanded X")
nTrials<-as.integer(nTrials) # to be safe
rows<-rep(0,nTrials)
# Remaining settings, fixed by hand here instead of passed as arguments
nullify = 1
crit<-0
evaluateI<-FALSE
doSpace=NULL
B<-NULL
RandomStart<-FALSE # this has no effect when approximate!=FALSE since nullify is
augment<-FALSE
approximate=FALSE
proportions<-NULL
maxIteration<-1000
nRepeats<-40
DFrac<-1
CFrac<-1
# Call the compiled "FederovOpt" routine of the AlgDesign package
value<-.Call("FederovOpt", X,as.integer(RandomStart),as.integer(rows),as.integer(nullify),
as.integer(crit),as.integer(evaluateI),as.integer(doSpace),B,as.integer(augment),as.integer(approximate),
as.double(proportions),as.integer(nTrials),as.integer(maxIteration),as.integer(nRepeats),
as.double(DFrac),as.double(CFrac),PACKAGE="AlgDesign")
# Undo the centering by sweeping out the negated means
data[,numericColumn]<-sweep(data[,numericColumn,drop=FALSE],2,-means)
# Map the returned row values to sorted 1-based row numbers and extract the
# corresponding rows of the candidate data
RowNos<-sort(1+((value$rows[1:nTrials])%%N))
Design<-data[RowNos,,drop=FALSE]
So what's the matter? What am I missing?
Thank you for your effort. I have found the cause; it's a bug:
https://github.com/jvbraun/AlgDesign/issues/3
solved close
I am using the referenceIntervals package in R, to do some data analytics.
In particular I am using the refLimit function, which calculates reference and confidence intervals. I want to edit it to remove certain functionality (for instance, it runs a Shapiro-Wilk normality test, which stops the entire code if the data has more than 5000 observations, and it won't allow non-parametric confidence intervals for samples smaller than 120). To do this I have been typing refLimit into the terminal, copying the function definition, and then saving it as a separate file (below is the full original definition of the function).
# Compute a reference interval and the confidence intervals around its two
# limits for a single numeric sample.
#
# Arguments:
#   data       numeric vector of observations.
#   dname      label stored unchanged in the result.
#   out.method outlier detection method: "dixon", "cook", "vanderLoo";
#              any other value (including the default "horn") uses
#              horn.outliers().
#   out.rm     if TRUE, continue with the outlier-free subset.
#   RI         reference interval method: "p" parametric, "n" nonparametric,
#              "r" robust.
#   CI         confidence interval method: "p" parametric, "n" nonparametric,
#              "boot" bootstrap. May be overridden below depending on RI and
#              the sample size.
#   refConf    confidence level of the reference interval.
#   limitConf  confidence level of the intervals around each limit.
#
# Returns a list of class "interval" with sample size, settings, outliers,
# method descriptions, normality test output, Ref_Int and Conf_Int.
#
# The helpers dixon.outliers, cook.outliers, vanderLoo.outliers,
# horn.outliers, nonparRI, robust and the object nonparRanks are not defined
# in this function; they must be available in the enclosing environment.
singleRefLimit =
function (data, dname = "default", out.method = "horn", out.rm = FALSE,
RI = "p", CI = "p", refConf = 0.95, limitConf = 0.9)
{
# Outlier detection; the final else makes Horn's method the fallback
if (out.method == "dixon") {
output = dixon.outliers(data)
}
else if (out.method == "cook") {
output = cook.outliers(data)
}
else if (out.method == "vanderLoo") {
output = vanderLoo.outliers(data)
}
else {
output = horn.outliers(data)
}
# Optionally replace the data by the subset returned by the outlier routine
if (out.rm == TRUE) {
data = output$subset
}
outliers = output$outliers
# Summary statistics; the locals `mean` and `sd` share their names with the
# functions they are computed from
n = length(data)
mean = mean(data, na.rm = TRUE)
sd = sd(data, na.rm = TRUE)
norm = NULL
# Nonparametric reference interval: the data is sorted here, and a requested
# parametric CI is switched to the nonparametric one
if (RI == "n") {
methodRI = "Reference Interval calculated nonparametrically"
data = sort(data)
holder = nonparRI(data, indices = 1:length(data), refConf)
lowerRefLimit = holder[1]
upperRefLimit = holder[2]
if (CI == "p") {
CI = "n"
}
}
# Robust reference interval: the CI method is always forced to bootstrap
if (RI == "r") {
methodRI = "Reference Interval calculated using Robust algorithm"
holder = robust(data, 1:length(data), refConf)
lowerRefLimit = holder[1]
upperRefLimit = holder[2]
CI = "boot"
}
# Parametric reference interval: mean +/- z * sd, with confidence intervals
# around each limit built from the standard error `se`
if (RI == "p") {
methodRI = "Reference Interval calculated parametrically"
methodCI = "Confidence Intervals calculated parametrically"
refZ = qnorm(1 - ((1 - refConf)/2))
limitZ = qnorm(1 - ((1 - limitConf)/2))
lowerRefLimit = mean - refZ * sd
upperRefLimit = mean + refZ * sd
se = sqrt(((sd^2)/n) + (((refZ^2) * (sd^2))/(2 * n)))
lowerRefLowLimit = lowerRefLimit - limitZ * se
lowerRefUpperLimit = lowerRefLimit + limitZ * se
upperRefLowLimit = upperRefLimit - limitZ * se
upperRefUpperLimit = upperRefLimit + limitZ * se
# Normality checks (Shapiro-Wilk and Kolmogorov-Smirnov), formatted as text.
# NOTE(review): shapiro.test() is documented to accept only 3 to 5000
# non-missing values and errors otherwise - confirm against ?shapiro.test
shap_normalcy = shapiro.test(data)
shap_output = paste(c("Shapiro-Wilk: W = ", format(shap_normalcy$statistic,
digits = 6), ", p-value = ", format(shap_normalcy$p.value,
digits = 6)), collapse = "")
ks_normalcy = suppressWarnings(ks.test(data, "pnorm",
m = mean, sd = sd))
ks_output = paste(c("Kolmorgorov-Smirnov: D = ", format(ks_normalcy$statistic,
digits = 6), ", p-value = ", format(ks_normalcy$p.value,
digits = 6)), collapse = "")
# Both branches assign the same value, so the test outcome does not change
# the result
if (shap_normalcy$p.value < 0.05 | ks_normalcy$p.value <
0.05) {
norm = list(shap_output, ks_output)
}
else {
norm = list(shap_output, ks_output)
}
}
# Nonparametric confidence intervals: with fewer than 120 observations the
# method falls back to bootstrapping; otherwise the ranks are looked up in
# nonparRanks by sample size
if (CI == "n") {
if (n < 120) {
cat("\nSample size too small for non-parametric confidence intervals, \n \t\tbootstrapping instead\n")
CI = "boot"
}
else {
methodCI = "Confidence Intervals calculated nonparametrically"
ranks = nonparRanks[which(nonparRanks$SampleSize ==
n), ]
lowerRefLowLimit = data[ranks$Lower]
lowerRefUpperLimit = data[ranks$Upper]
upperRefLowLimit = data[(n + 1) - ranks$Upper]
upperRefUpperLimit = data[(n + 1) - ranks$Lower]
}
}
# Bootstrap confidence intervals (5000 replicates), only for nonparametric
# or robust reference intervals
if (CI == "boot" & (RI == "n" | RI == "r")) {
methodCI = "Confidence Intervals calculated by bootstrapping, R = 5000"
if (RI == "n") {
bootresult = boot::boot(data = data, statistic = nonparRI,
refConf = refConf, R = 5000)
}
if (RI == "r") {
bootresult = boot::boot(data = data, statistic = robust,
refConf = refConf, R = 5000)
}
# index = 1 is the lower reference limit, index = 2 the upper one
bootresultlower = boot::boot.ci(bootresult, conf = limitConf,
type = "basic", index = 1)
bootresultupper = boot::boot.ci(bootresult, conf = limitConf,
type = "basic", index = 2)
lowerRefLowLimit = bootresultlower$basic[4]
lowerRefUpperLimit = bootresultlower$basic[5]
upperRefLowLimit = bootresultupper$basic[4]
upperRefUpperLimit = bootresultupper$basic[5]
}
# Assemble the result object
RVAL = list(size = n, dname = dname, out.method = out.method,
out.rm = out.rm, outliers = outliers, methodRI = methodRI,
methodCI = methodCI, norm = norm, refConf = refConf,
limitConf = limitConf, Ref_Int = c(lowerRefLimit = lowerRefLimit,
upperRefLimit = upperRefLimit), Conf_Int = c(lowerRefLowLimit = lowerRefLowLimit,
lowerRefUpperLimit = lowerRefUpperLimit, upperRefLowLimit = upperRefLowLimit,
upperRefUpperLimit = upperRefUpperLimit))
class(RVAL) = "interval"
return(RVAL)
}
However, when I then execute this file, a large number of terms end up being undefined; for instance, when I use the function I get "object 'nonparRanks' not found".
How do I edit the function in the package? I have looked at trying to import the package namespace and environment, but this has not helped. I have also tried to find the actual function in the package files in my directory, but have not been able to.
I am reasonably experienced in R, but I have never had to edit a package before. I am clearly missing something about how functions are defined in packages, but I am not sure what.
In the beginning of the package there is a line
data(sysdata, envir=environment())
See here: https://github.com/cran/referenceIntervals/tree/master/data/sysdata.rda
I suspect that "nonparRanks" is defined there as I don't see it defined anywhere else. So perhaps you could download that file, write your own function, then run that same line before running your function and it may work.
EDIT:
Download the file then run:
load("C:/sysdata.rda")
With your path to the file and then your function will work.
nonparRanks is a data object (a lookup table, not a function) in the referenceIntervals package; its documentation describes it as:
Table that dictates the ranks for the confidence intervals
around the calculated reference interval
Your method of saving and editing the function is fine, but make sure you load all the necessary underlying functions to run it too.
The easiest thing to do might be to:
save your copied and pasted R function as a different name, e.g. singleRefLimit2, then
call library("referenceIntervals"), which will load all the underlying functions you need and then
load your function with source("singleRefLimit2.R"), with whatever edits you choose to make.
So I am in dire need of help. I have finally managed to construct my R-INLA model and get it to graph as needed, via the code below:
First I create the stacks (note this is the very end of my INLA process, the mesh etc has already been done)
# Stack for the observed presences (y = 1, e = 0)
stk.abdu <- inla.stack(
  data = list(y = 1, e = 0),
  A = list(abdu.mat, 1),
  tag = 'abdu',
  effects = list(
    list(i = 1:sc.mesh.5$n),
    data.frame(
      Intercept = 1,
      dwater = winter.abdu$dwater,
      elev = winter.abdu$elev,
      forest = winter.abdu$forest,
      developed = winter.abdu$developed,
      openwater = winter.abdu$OpenWater,
      barren = winter.abdu$barren,
      shrubland = winter.abdu$shrubland,
      herb = winter.abdu$herb,
      planted = winter.abdu$planted,
      wetland = winter.abdu$wetland,
      dist = winter.abdu$dwater
    )
  )
)

# Stack for the quadrature points (y = 0, e = 0.1)
stk.quad <- inla.stack(
  data = list(y = 0, e = 0.1),
  A = list(quad.mat, 1),
  tag = 'quad',
  effects = list(
    list(i = 1:sc.mesh.5$n),
    data.frame(
      Intercept = 1,
      dwater = dummy$dwater,
      elev = dummy$elev,
      forest = dummy$forest,
      developed = dummy$developed,
      openwater = dummy$openwater,
      barren = dummy$barren,
      shrubland = dummy$shrubland,
      herb = dummy$herb,
      planted = dummy$planted,
      wetland = dummy$wetland,
      dist = dummy$dwater
    )
  )
)

# Stack for the prediction locations (response set to NA)
stk.prd <- inla.stack(
  data = list(y = NA),
  A = list(Aprd, 1),
  tag = 'prd',
  effects = list(
    list(i = 1:sc.mesh.5$n),
    data.frame(
      Intercept = 1,
      dwater = prddf2$dwater,
      elev = prddf2$elev,
      forest = prddf2$forest,
      developed = prddf2$developed,
      openwater = prddf2$openwater,
      barren = prddf2$barren,
      shrubland = prddf2$shrubland,
      herb = prddf2$herb,
      planted = prddf2$planted,
      wetland = prddf2$wetland,
      dist = prddf2$dwater
    )
  )
)

# Combine the three stacks into the one passed to inla()
stk.all.prd <- inla.stack(stk.abdu, stk.quad, stk.prd)
Next I fit my model
# Binomial model: fixed effects, a random-walk (rw1) effect on the grouped
# `dist` covariate, and the SPDE spatial effect on the mesh index `i`
ft.inla.prd <- inla(
  y ~ 0 + Intercept + elev + dwater + forest + developed +
    f(
      inla.group(dist, n = 50, method = "quantile"),
      model = "rw1",
      scale.model = TRUE
    ) +
    f(i, model = sc.spde),
  family = "binomial",
  data = inla.stack.data(stk.all.prd),
  control.predictor = list(
    A = inla.stack.A(stk.all.prd),
    compute = TRUE
  ),
  E = inla.stack.data(stk.all.prd)$e,
  control.compute = list(dic = TRUE),
  control.fixed = list(expand.factor.strategy = "INLA")
)
Then I change the predicted values from logit to probabilities
# Map the posterior mean of the spatial random effect `i` from the logit
# scale to probabilities. plogis() is the inverse logit and stays finite for
# large values, where exp(x) / (1 + exp(x)) evaluates to Inf / Inf = NaN.
ft.inla.prd$newfield <- plogis(ft.inla.prd$summary.random$i$mean)
And finally I use inla.mesh.project and levelplot to create my image
# Project the transformed field onto the projection grid
xmean <- inla.mesh.project(projgrid, ft.inla.prd$newfield)

# Draw the projected values as a level plot without axis annotation
levelplot(
  xmean,
  col.regions = topo.colors(99),
  main = 'Probability of Presence',
  xlab = '',
  ylab = '',
  scales = list(draw = FALSE)
)
So my problem is that I now want to export this data (what is projected as the graph) as a raster so that I can work with it in ArcGIS. However, I have not been able to find a way to do so.
Any input is greatly appreciated