R Optimization over Dataframe - r

I have the following code where I want to find the beste Values for x,y and z.
df <- data.frame(replicate(3,sample(0:100,100,rep=TRUE)))
find_best <- function(xyz) {
x <- xyz[1]
y <- xyz[2]
z <- xyz[3]
nr <- count(df)
val <- count(df[df[, "X1"] < x & df[, "X2"] < y & df[, "X3"] < z, ] )
return(val$n/nr$n)
}
optim(par = c(30,15,15), fn = find_best, lower=c(0,0,0), upper=c(100,100,100), method="L-BFGS-B")
The function does not achieve much at the moment, but I will add constraints later. However if I run this, I only get the value of the initial values back.
$par
[1] 30 15 15
So the question is, how can I get the best values for x,y,z either with optim or with anything else.

Here is an example of how you can use optim for your purpose
set.seed(1)
df <- data.frame(replicate(3,sample(0:100,1e5,rep=TRUE)))
find_best <- function(xyz) {
x <- xyz[1]
y <- xyz[2]
z <- xyz[3]
r <- nrow(subset(df,X1 < x & X2 < y & X3 < z))/nrow(df)
}
res <- optim(par = c(35,15,15), fn = find_best, lower=c(0,0,0), upper=c(100,100,100), control = list(fnscale = -1))
which gives
> res
$par
[1] 35.085 15.205 15.225
$value
[1] 0.00881
$counts
function gradient
2 2
$convergence
[1] 0
$message
[1] "CONVERGENCE: NORM OF PROJECTED GRADIENT <= PGTOL"

Related

floating-point error fligner.test r function?

I have noticed that using the statistical test fligner.test from the r stats package provides different results with a simple transformation, even though this shouldn't be the case.
Here an example (the difference for the original dataset is much more dramatic):
g <- factor(rep(1:2, each=6))
x1 <- c(2,2,6,6,1,4,5,3,5,6,5,5)
x2 <- (x1-1)/5 #> cor(x1,x2) [1] 1
fligner.test(x1,g) # chi-squared = 4.2794, df = 1, p-value = 0.03858
fligner.test(x2,g) # chi-squared = 4.8148, df = 1, p-value = 0.02822
Looking at the function code, I have noticed that the median centering might be causing the issue:
x1 <- x1 - tapply(x1,g,median)[g]
x2 <- x2 - tapply(x2,g,median)[g]
unique(abs(x1)) # 1 3 2 0
unique(abs(x2)) # 0.2 0.6 0.4 0.2 0.0 <- repeated 0.2
Is this a known issue, and how should this inconsistency be resolved?
I think your analysis is correct here. In your example the problem ultimately occurs because (0.8 - 0.6) == 0.2 is FALSE unless rounded to 15 decimal places. You should file a bug report, since this is avoidable.
If you are desperate in the meantime, you can adapt stats:::fligner.test.default by applying a tiny bit of rounding at the median centering stage to remove floating point inequalities:
fligner <- function (x, g, ...)
{
if (is.list(x)) {
if (length(x) < 2L)
stop("'x' must be a list with at least 2 elements")
DNAME <- deparse1(substitute(x))
x <- lapply(x, function(u) u <- u[complete.cases(u)])
k <- length(x)
l <- lengths(x)
if (any(l == 0))
stop("all groups must contain data")
g <- factor(rep(1:k, l))
x <- unlist(x)
}
else {
if (length(x) != length(g))
stop("'x' and 'g' must have the same length")
DNAME <- paste(deparse1(substitute(x)), "and",
deparse1(substitute(g)))
OK <- complete.cases(x, g)
x <- x[OK]
g <- g[OK]
g <- factor(g)
k <- nlevels(g)
if (k < 2)
stop("all observations are in the same group")
}
n <- length(x)
if (n < 2)
stop("not enough observations")
x <- round(x - tapply(x, g, median)[g], 15)
a <- qnorm((1 + rank(abs(x))/(n + 1))/2)
a <- a - mean(a)
v <- sum(a^2)/(n - 1)
a <- split(a, g)
STATISTIC <- sum(lengths(a) * vapply(a, mean, 0)^2)/v
PARAMETER <- k - 1
PVAL <- pchisq(STATISTIC, PARAMETER, lower.tail = FALSE)
names(STATISTIC) <- "Fligner-Killeen:med chi-squared"
names(PARAMETER) <- "df"
METHOD <- "Fligner-Killeen test of homogeneity of variances"
RVAL <- list(statistic = STATISTIC, parameter = PARAMETER,
p.value = PVAL, method = METHOD, data.name = DNAME)
class(RVAL) <- "htest"
return(RVAL)
}
This now returns the correct result for both your vectors:
fligner(x1,g)
#>
#> Fligner-Killeen test of homogeneity of variances
#>
#> data: x1 and g
#> Fligner-Killeen:med chi-squared = 4.2794, df = 1, p-value = 0.03858
fligner(x2,g)
#>
#> Fligner-Killeen test of homogeneity of variances
#>
#> data: x2 and g
#> Fligner-Killeen:med chi-squared = 4.2794, df = 1, p-value = 0.03858

How to set NonConvex = 2 in Gurobi in R?

I get this error when I run the MWE code below. Does anyone know how to resolve this? thanks!
Error: Error 10020: Q matrix is not positive semi-definite (PSD). Set NonConvex parameter to 2 to solve model.
MWE:
library(gurobi)
library(Matrix)
model <- list()
#optimization problem:
# max x + y
# s.t.
# -x + y <= 0
# x^2 - y^2 <= 10
# 0 <= x < = 20
# 0 <= y <= 20
model$obj <- c(1,1)
model$A <- matrix(c(-1,1), nrow=1, byrow=T) # for LHS of linear constraint: -x + y <= 0
model$rhs <- c(0) # for RHS of linear constraint: -x + y <= 0
model$ub[1] = 20 # x < = 20
model$ub[2] = 20 # y < = 20
model$sense <- c('<')
# non-convex quadratic constraint: x^2 - y^2 <= 10
qc1 <- list()
qc1$Qc <- spMatrix(2, 2, c(1, 2), c(1, 2), c(1.0, -1.0))
qc1$rhs <- 10
model$quadcon <- list(qc1)
#the QC constraint is a non-convex quadratic constraint, so set NonConvex = 2
model$params <- list(NonConvex=2)
gurobi_write(model,'quadtest.lp', env)
result <- gurobi(model) # THIS IS WHERE I GET THE ERROR ABOVE
print(result$objval)
print(result$x)
NM...i see that I can fix this by not putting the params as part of the model list, and instead running it as an input to the gurobi(,) call as follows:
params <- list(NonConvex=2)
result <- gurobi(model, params)

Finding parameters of a system of non-linear equations in R using BB package

I am trying to solve a system of equation to find parameters (x[1] and x[2]) using BB package but R outputs Unsuccessful convergence precisely it is:
Iteration: 0 ||F(x0)||: 469.829
iteration: 10 ||F(xn)|| = 0.5105992
iteration: 20 ||F(xn)|| = 0.05530002
$`par`
[1] 26.2772128 -0.9905601
$residual
[1] 0.01809358
$fn.reduction
[1] 664.413
$feval
[1] 31
$iter
[1] 25
$convergence
[1] 3
$message
[1] "Failure: Error in function evaluation"
Warning message:
In dfsane(par = p0, fn = f3) : Unsuccessful convergence.
Simburr.f1 <- function(n, tau) {
u=runif(n)
x<- runif(n)
lambda = exp(1+x)
y= (1/(u^(1/lambda))-1)^(1/tau)
y
}
y = Simburr.f1(300,0.5)
x = runif(300)
f3 <- function(x) {
n=300
x1 <- x[1]
x2 <- x[2]
F <- rep(NA, 2)
F[1] <- n/sum(log(1+y^(x[1])))
F[2] <- n/-sum(log(y))+(x[2]+1)*sum((log(y)*y^x[1])/(1+y^x[1]))
return(F)
}
p0 <- c(0.5,1)
dfsane(par=p0, fn=f3)

Function: sapply in apply, removing outliers

I'm working on a function which will get rid of outliers in a given data set based on 3 sigma rule. My code is presented below. "data" is a data set to be processed.
rm.outlier <- function(data){
apply(data, 2, function(var) {
sigma3.plus <- mean(var) + 3 * sd(var)
sigma3.min <- mean(var) - 3 * sd(var)
sapply(var, function(y) {
if (y > sigma3.plus){
y <- sigma3.plus
} else if (y < sigma3.min){
y <- sigma3.min
} else {y <- y}
})
})
as.data.frame(data)
}
In order to check if the function works I wrote a short test:
set.seed(123)
a <- data.frame("var1" = rnorm(10000, 0, 1))
b <- a
sum(a$var1 > mean(a$var1) + 3 * sd(a$var1)) # number of outliers in a
As a result, I get:
[1] 12
So the variable var1 in the data frame a has 12 outliers. Next, I try to apply my function on this object:
a2 <- rm.outlier(a)
sum(b$var1 - a2$var1)
Unfortunately, it gives 0 which clearly indicates that something does not work. I have already worked out that the implementation of sapply is correct so there must be a mistake in my apply. Any help would be appreciated.
If runtime is important for you, you might consider another approach. You could vectorize this filtering, e.g. by using pmin and pmax which is equally readable and > 15x times faster. If you like it a little bit more complex you could use findInterval and get even more speed:
rm.outlier2 <- function(x) {
## calculate -3/3 * sigma borders
s <- mean(x) + c(-3, 3) * sd(x)
pmin(pmax(x, s[1]), s[2])
}
rm.outlier3 <- function(x) {
## calculate -3/3 * sigma borders
s <- mean(x) + c(-3, 3) * sd(x)
## sorts x into intervals 0 == left of s[1], 2 == right of s[2], 1
## between both s
i <- findInterval(x, s)
## which values are left/right of the interval
j <- which(i != 1L)
## add a value between s to directly use output of findInterval for subsetting
s2 <- c(s[1], 0, s[2])
## replace all values that are left/right of the interval
x[j] <- s2[i[j] + 1L]
x
}
Benchmarking the stuff:
## slightly modified OP version
rm.outlier <- function(x) {
sigma3 <- mean(x) + c(-3,3) * sd(x)
sapply(x, function(y) {
if (y > sigma3[2]){
y <- sigma3[2]
} else if (y < sigma3[1]){
y <- sigma3[1]
} else {y <- y}
})
}
set.seed(123)
a <- rnorm(10000, 0, 1)
# check output
all.equal(rm.outlier(a), rm.outlier2(a))
all.equal(rm.outlier2(a), rm.outlier3(a))
library("rbenchmark")
benchmark(rm.outlier(a), rm.outlier2(a), rm.outlier3(a),
order = "relative",
columns = c("test", "replications", "elapsed", "relative"))
# test replications elapsed relative
#3 rm.outlier3(a) 100 0.028 1.000
#2 rm.outlier2(a) 100 0.102 3.643
#1 rm.outlier(a) 100 1.825 65.179
It seems like you just forgot to assign your results of the apply function to a new dataframe. (Compare the 3rd line with your code)
rm.outlier <- function(data){
# Assign the result to a new dataframe
data_new <- apply(data, 2, function(var) {
sigma3.plus <- mean(var) + 3 * sd(var)
sigma3.min <- mean(var) - 3 * sd(var)
sapply(var, function(y) {
if (y > sigma3.plus){
y <- sigma3.plus
} else if (y < sigma3.min){
y <- sigma3.min
} else {y <- y}
})
})
# Print the new dataframe
as.data.frame(data_new)
}
set.seed(123)
a <- data.frame("var1" = rnorm(10000, 0, 1))
sum(a$var1 > mean(a$var1) + 3 * sd(a$var1)) # number of too big outliers
# 15
sum(a$var1 < mean(a$var1) - 3 * sd(a$var1)) # number of too small outliers
# 13
# Overall 28 outliers
# Check the function for the number of outliers
a2 <- rm.outlier(a)
sum(a2$var1 == a$var1) - length(a$var1)

Triple integral in R (how to specifying the domain)

I would like to compute the triple integral of a function of three variables f(x,y,z) in R. I'm using the package cubature and the function adaptIntegrate(). The integrand is equal to 1 only in a certain domain (x<y<z, 0 otherwise) which I don't know how to specify. I'm trying 2 different implementations of the function, but none of them work:
#First implementation
fxyz <- function(w) {
x <- w[1]
y <- w[2]
z <- w[3]
x*y*z*(x < y)&(y < z)
}
#Second implementation
fxyz <- function(w) {
x <- w[1]
y <- w[2]
z <- w[3]
if(x<y&y<z)
out<-1
else
out<-0
out
}
#Computation of integral
library(cubature)
lower <- rep(0,3)
upper <- rep(1, 3)
adaptIntegrate(f=fxyz, lowerLimit=lower, upperLimit=upper, fDim = 3)
Any idea on how to specify the domain correctly?
I don't know about the cubature package, but you can do this by repeated application of base R's integrate function for one-dimensional integration.
f.xyz <- function(x, y, z) ifelse(x < y & y < z, 1, 0)
f.yz <- Vectorize(function(y, z) integrate(f.xyz, 0, 1, y=y, z=z)$value,
vectorize.args="y")
f.z <- Vectorize(function(z) integrate(f.yz, 0, 1, z=z)$value,
vectorize.args="z")
integrate(f.z, 0, 1)
# 0.1666632 with absolute error < 9.7e-05
You'll probably want to play with the control arguments to set the numeric tolerances; small errors in the inner integration can turn into big ones on the outside.
In your first function the return value is wrong. It should be as.numeric(x<=y)*as.numeric(y<=z). In your second function you should also use <= instead of <, otherwise `adapIntegrate won't work correctly. You also need to specify a maximum number of evaluations. Try this
library(cubature)
lower <- rep(0,3)
upper <- rep(1,3)
# First implementation (modified)
fxyz <- function(w) {
x <- w[1]
y <- w[2]
z <- w[3]
as.numeric(x <= y)*as.numeric(y <= z)
}
adaptIntegrate(f=fxyz,lowerLimit=lower,upperLimit=upper,doChecking=TRUE,
maxEval=2000000,absError=10e-5,tol=1e-5)
#$integral
#[1] 0.1664146
#$error
#[1] 0.0001851699
#$functionEvaluations
#[1] 2000031
#$returnCode
#[1] 0
The domain 0 <= x <= y <= z <= 1 is the "canonical" simplex. To integrate over a simplex, use the SimplicialCubature package.
library(SimplicialCubature)
f <- function(x) 1
S <- CanonicalSimplex(3)
> adaptIntegrateSimplex(function(x) 1, S)
$integral
[1] 0.1666667
$estAbsError
[1] 1.666667e-13
$functionEvaluations
[1] 55
$returnCode
[1] 0
$message
[1] "OK"
Note that integrating the constant function f(x)=1 over the simplex simply gives the volume of the simplex, which is 1/6. The integration is useless for this example.
> SimplexVolume(S)
[1] 0.1666667

Resources