I had asked this question earlier, and wanted to continue with a follow-up since I tried some other things and they didn't quite work out.
I am essentially trying to optimize an NLP type problem in R, which has binary and integer constraints. The code for the same is below :
# Input Data
DTM <- sample(1:30,10,replace=T)
DIM <- rep(30,10)
Price <- 100 - seq(0.4,1,length.out=10)
# Variables that shall be changed to find optimal solution
Hike <- c(1,0,0,1,0,0,0,0,0,1)
Position <- c(0,1,-2,1,0,0,0,0,0,0)
# Bounds for Hikes/Positions
HikeLB <- rep(0,10)
HikeUB <- rep(1,10)
PositionLB <- rep(-2,10)
PositionUB <- rep(2,10)
library(Rsolnp)
# x <- c(Hike, Position)
# Combining two arrays into one since I want
# to optimize using both these variables
opt_func <- function(x) {
Hike <- head(x,length(x)/2)
Position <- tail(x,length(x)/2)
hikes_till_now <- cumsum(Hike) - Hike
PostHike <- numeric(length(Hike))
for (i in seq_along(Hike)){
PostHike[i] <- 99.60 - 0.25*(Hike[i]*(1-DTM[i]/DIM[i]))
if(i>1) {
PostHike[i] <- PostHike[i] - 0.25*hikes_till_now[i]
}
}
Pnl <- Position*(PostHike-Price)
return(-sum(Pnl)) # Since I want to maximize sum(Pnl)
}
#specify the in-equality function for Hike
unequal <- function(x) {
Hike <- head(x,length(x)/2)
return(sum(Hike))
}
#specify the equality function for Position
equal <- function(x) {
Position <- tail(x,length(x)/2)
return(sum(Position))
}
#the optimiser
solnp(c(Hike,Position), opt_func,
eqfun=equal, eqB=0,
ineqfun=unequal, ineqUB=3, ineqLB=1,
LB=c(HikeLB,PositionLB), UB=c(HikeUB,PositionUB))
I get the following warning/error :
# solnp--> Solution not reliable....Problem Inverting Hessian.
What I understand is that the Hessian is a sparse matrix and therefore there might be issues in inverting? Also, might there be some better way to do this optimization, since it doesn't seem like a complicated problem and I feel I am missing something fairly straightforward here!
The description of the problem is given in this question in good detail.
Any help would be greatly appreciated.
I think the algorithm is stuck in a local minimum. I helped the algorithm with a "pre-minimization" procedure with the R package DEoptim and it seems to work. You can check the output below.
# Input Data
DTM <- sample(1 : 30, 10, replace = TRUE)
DIM <- rep(30, 10)
Price <- 100 - seq(0.4, 1, length.out = 10)
# Variables that shall be changed to find optimal solution
Hike <- c(1,0,0,1,0,0,0,0,0,1)
Position <- c(0,1,-2,1,0,0,0,0,0,0)
# Bounds for Hikes/Positions
HikeLB <- rep(0,10)
HikeUB <- rep(1,10)
PositionLB <- rep(-2,10)
PositionUB <- rep(2,10)
# specify the in-equality function for Hike
unequal <- function(x)
{
Hike <- head(x,length(x) / 2)
return(sum(Hike))
}
# specify the equality function for Position
equal <- function(x)
{
Position <- tail(x,length(x) / 2)
return(sum(Position))
}
opt_func <- function(x, const = 10 ^ 30, const_Include = 0)
{
val_Eq <- equal(x)
val_Uneq <- unequal(x)
if(val_Uneq > 3)
{
return(const)
}else if(val_Uneq < 1)
{
return(const)
}else
{
Hike <- head(x,length(x) / 2)
Position <- tail(x,length(x) / 2)
hikes_till_now <- cumsum(Hike) - Hike
PostHike <- numeric(length(Hike))
for(i in seq_along(Hike))
{
PostHike[i] <- 99.60 - 0.25 * (Hike[i] * (1 - DTM[i] / DIM[i]))
if(i > 1)
{
PostHike[i] <- PostHike[i] - 0.25 * hikes_till_now[i]
}
}
Pnl <- Position * (PostHike - Price)
return((-sum(Pnl) + const_Include * 10 ^ 5 * val_Eq ^ 2))
}
}
library(DEoptim)
obj_DEoptIter <- DEoptim(fn = opt_func, lower = c(HikeLB, PositionLB),
upper = c(HikeUB, PositionUB),
list(itermax = 4000), const_Include = 1)
equal(obj_DEoptIter$optim$bestmem)
opt_func(obj_DEoptIter$optim$bestmem, const_Include = 0)
vector_Eta <- c(0.5, 0.25, 0.15, 0.1, 0.05, 0.05)
nb_Eta <- length(vector_Eta)
list_Obj_DEoptim <- list()
list_Obj_DEoptim[[1]] <- obj_DEoptIter
for(i in 1 : nb_Eta)
{
eta <- vector_Eta[i]
obj_DEoptIter1 <- list_Obj_DEoptim[[i]]
lower <- ifelse(obj_DEoptIter1$optim$bestmem < 0, (1 + eta) * obj_DEoptIter1$optim$bestmem, (1 - eta) * obj_DEoptIter1$optim$bestmem)
lower <- pmax(lower, c(HikeLB, PositionLB))
upper <- ifelse(obj_DEoptIter1$optim$bestmem < 0, (1 - eta) * obj_DEoptIter1$optim$bestmem, (1 + eta) * obj_DEoptIter1$optim$bestmem)
upper <- pmin(upper, c(HikeUB, PositionUB))
list_Obj_DEoptim[[i + 1]] <- DEoptim(fn = opt_func, lower = lower, upper = upper, list(itermax = 2000), const_Include = 1)
}
library(Rsolnp)
pars <- list_Obj_DEoptim[[nb_Eta + 1]]$optim$bestmem
pars
par1 par2 par3 par4 par5 par6 par7 par8 par9 par10 par11
1.378436e-05 2.024484e-05 1.770700e-06 2.826411e-06 4.351425e-05 9.483165e-05 6.086782e-04 2.978773e-04 3.993085e-04 9.990947e-01 -1.987184e+00
par12 par13 par14 par15 par16 par17 par18 par19 par20
-1.338216e+00 -1.996457e+00 -8.111605e-01 8.450319e-01 9.434997e-01 1.262152e+00 -8.391519e-01 1.977017e+00 1.944523e+00
solnp(pars, opt_func, eqfun = equal, eqB = 0,
ineqfun = unequal, ineqUB = 3, ineqLB = 1,
LB = c(HikeLB, PositionLB), UB = c(HikeUB,PositionUB))
Iter: 1 fn: -3.3333 Pars: 0.0000002057324 0.0000000422716 0.0000000203609 0.0000000069049 0.0000000042465 0.0000000005265 0.0000000053055 0.0000000110825 0.0000000281918 0.9999998374786 -1.9999999156676 -1.9999999139568 -1.9999998417164 -1.9999997579607 -1.9999976211239 1.9999981198069 1.9999995045018 1.9999997029469 1.9999998414763 1.9999998815401
Iter: 2 fn: -3.3333 Pars: 0.0000002031041 0.0000000416019 0.0000000198673 0.0000000066085 0.0000000039753 0.0000000003308 0.0000000050202 0.0000000107497 0.0000000277094 0.9999998403129 -1.9999999168693 -1.9999999149208 -1.9999998437310 -1.9999997605837 -1.9999976915876 1.9999981741728 1.9999995155922 1.9999997097596 1.9999998444069 1.9999998837611
solnp--> Completed in 2 iterations
$pars
par1 par2 par3 par4 par5 par6 par7 par8 par9 par10 par11
2.031041e-07 4.160187e-08 1.986733e-08 6.608512e-09 3.975269e-09 3.307675e-10 5.020223e-09 1.074966e-08 2.770940e-08 9.999998e-01 -2.000000e+00
par12 par13 par14 par15 par16 par17 par18 par19 par20
-2.000000e+00 -2.000000e+00 -2.000000e+00 -1.999998e+00 1.999998e+00 2.000000e+00 2.000000e+00 2.000000e+00 2.000000e+00
$convergence
[1] 0
$values
[1] -2.355177 -3.333333 -3.333333
$lagrange
[,1]
[1,] -0.2965841
[2,] 0.2187991
$hessian
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11]
[1,] 9.999798e-01 0.02742111 0.03375175 0.03362079 0.01640916 -0.10340640 -3.6590783 -3.9063315 -3.2571705 0.007948037 2.162646e-05
[2,] 2.742111e-02 66.88899630 61.41701189 -1.06357113 -65.20163998 -5.08593318 -15.3210883 24.7495950 95.4918041 -2.592847895 -2.660031e-02
[3,] 3.375175e-02 61.41701189 65.27279520 1.65679327 -64.08506082 -6.26030243 -61.7149713 -33.0447996 34.1906702 -2.048842423 -3.406531e-02
[4,] 3.362079e-02 -1.06357113 1.65679327 2.75444192 1.01778735 -1.88727085 -40.6383890 -21.7241256 -23.0845472 0.273012820 -3.390742e-02
[5,] 1.640916e-02 -65.20163998 -64.08506082 1.01778735 69.06953922 3.45702004 -0.3026398 1.5544219 -70.1962042 2.543536245 -1.651286e-02
[6,] -1.034064e-01 -5.08593318 -6.26030243 -1.88727085 3.45702004 4.31393562 52.9263744 8.0690045 4.1293583 -0.052762820 1.038279e-01
[7,] -3.659078e+00 -15.32108825 -61.71497133 -40.63838903 -0.30263981 52.92637439 1036.8692392 424.2554909 416.5996364 -5.304528929 3.673517e+00
[8,] -3.906331e+00 24.74959499 -33.04479955 -21.72412557 1.55442188 8.06900454 424.2554909 611.1234552 605.6589299 -4.994038897 3.923389e+00
[9,] -3.257170e+00 95.49180407 34.19067019 -23.08454723 -70.19620420 4.12935827 416.5996364 605.6589299 681.7605089 -7.626377583 3.273432e+00
[10,] 7.948037e-03 -2.59284790 -2.04884242 0.27301282 2.54353625 -0.05276282 -5.3045289 -4.9940389 -7.6263776 1.136211890 -8.005497e-03
[11,] 2.162646e-05 -0.02660031 -0.03406531 -0.03390742 -0.01651286 0.10382789 3.6735172 3.9233885 3.2734322 -0.008005497 9.999767e-01
[12,] 2.560506e-04 -1.28441997 -1.03624297 0.12124532 1.25787404 -0.01537463 -3.1259275 -3.7153566 -4.7747487 0.067321907 -3.004553e-04
[13,] 5.123418e-04 -1.43173666 -0.93034646 0.22381646 1.30268827 -0.06388374 -4.8019860 -5.5804183 -6.7831527 0.087150919 -6.003854e-04
[14,] 5.554612e-04 0.98471992 1.35764165 0.19241155 -1.11681403 -0.25008321 -6.2599910 -6.4421787 -4.7866436 -0.004199357 -6.259218e-04
[15,] 4.473410e-04 -0.11869196 0.19819250 0.19407189 0.08815337 -0.17579817 -6.5077248 -7.1035569 -6.4692030 0.040393548 -5.242930e-04
[16,] -2.085873e-04 -1.96550988 -0.68338138 0.45350085 1.39119548 -0.11589229 -6.9686856 -8.8689925 -10.9267740 0.133277513 2.221037e-04
[17,] -2.873929e-04 -0.65632927 0.37181355 0.35099772 0.14218820 -0.14396492 -5.5117954 -6.9879800 -7.6638548 0.070739356 3.402369e-04
[18,] 2.361230e-03 1.85793311 0.66958680 -0.46036814 -1.38871483 0.11073021 8.3464247 11.2659358 12.7138766 -0.143080948 -2.682272e-03
[19,] -4.073214e-04 1.74650856 2.64058481 0.30085238 -2.28580164 -0.29320084 -5.5232047 -6.2299486 -4.3128641 -0.025281367 4.807052e-04
[20,] 6.691791e-04 0.97282574 1.27569962 0.11147845 -1.15542498 -0.06862054 0.1078153 0.3291076 0.9647769 -0.022334033 -7.534958e-04
[21,] 6.199951e-04 0.95352282 1.29179584 0.12737426 -1.14776727 -0.09857535 -0.8077458 -0.5834213 0.1413152 -0.019358184 -6.974727e-04
[,12] [,13] [,14] [,15] [,16] [,17] [,18] [,19] [,20] [,21]
[1,] 0.0002560506 0.0005123418 0.0005554612 0.000447341 -2.085873e-04 -0.0002873929 0.002361230 -0.0004073214 0.0006691791 0.0006199951
[2,] -1.2844199676 -1.4317366600 0.9847199178 -0.118691962 -1.965510e+00 -0.6563292684 1.857933110 1.7465085621 0.9728257362 0.9535228153
[3,] -1.0362429738 -0.9303464586 1.3576416527 0.198192496 -6.833814e-01 0.3718135470 0.669586799 2.6405848119 1.2756996162 1.2917958363
[4,] 0.1212453150 0.2238164645 0.1924115460 0.194071891 4.535009e-01 0.3509977184 -0.460368143 0.3008523758 0.1114784546 0.1273742585
[5,] 1.2578740408 1.3026882661 -1.1168140329 0.088153370 1.391195e+00 0.1421882038 -1.388714829 -2.2858016360 -1.1554249791 -1.1477672657
[6,] -0.0153746283 -0.0638837380 -0.2500832107 -0.175798170 -1.158923e-01 -0.1439649211 0.110730209 -0.2932008389 -0.0686205407 -0.0985753501
[7,] -3.1259275281 -4.8019860058 -6.2599909562 -6.507724752 -6.968686e+00 -5.5117954046 8.346424660 -5.5232046808 0.1078153176 -0.8077458454
[8,] -3.7153565741 -5.5804183495 -6.4421786729 -7.103556916 -8.868992e+00 -6.9879799717 11.265935833 -6.2299486257 0.3291076177 -0.5834213196
[9,] -4.7747486553 -6.7831526938 -4.7866436050 -6.469203037 -1.092677e+01 -7.6638548254 12.713876638 -4.3128641023 0.9647769448 0.1413152244
[10,] 0.0673219073 0.0871509195 -0.0041993566 0.040393548 1.332775e-01 0.0707393560 -0.143080948 -0.0252813670 -0.0223340333 -0.0193581838
[11,] -0.0003004553 -0.0006003854 -0.0006259218 -0.000524293 2.221037e-04 0.0003402369 -0.002682272 0.0004807052 -0.0007534958 -0.0006974727
[12,] 1.0298639515 0.0384261248 -0.0088672552 0.010364778 6.830500e-02 0.0382383110 -0.069344565 -0.0092977594 -0.0116491404 -0.0102362279
[13,] 0.0384261248 1.0557371024 0.0039723626 0.021060697 1.151472e-01 0.0762447552 -0.116140750 0.0240732365 -0.0015522014 0.0012502622
[14,] -0.0088672552 0.0039723626 1.0405811350 0.015598750 5.107293e-02 0.0597045666 -0.056121550 0.0958555701 0.0358818177 0.0381463360
[15,] 0.0103647784 0.0210606967 0.0155987497 1.011768391 7.077174e-02 0.0599946674 -0.087157476 0.0541735129 0.0107826788 0.0132195770
[16,] 0.0683050028 0.1151472294 0.0510729318 0.070771744 4.640564e-01 0.3981262980 0.045718792 0.3317973279 0.0521686203 0.0517223181
[17,] 0.0382383110 0.0762447552 0.0597045666 0.059994667 3.981263e-01 0.3625712839 0.107458944 0.3438763827 0.0629936354 0.0613058394
[18,] -0.0693445652 -0.1161407495 -0.0561215497 -0.087157476 4.571879e-02 0.1074589438 0.557297695 0.1719157196 -0.0771914128 -0.0669185711
[19,] -0.0092977594 0.0240732365 0.0958555701 0.054173513 3.317973e-01 0.3438763827 0.171915720 0.4136119154 0.1004220146 0.0980924411
[20,] -0.0116491404 -0.0015522014 0.0358818177 0.010782679 5.216862e-02 0.0629936354 -0.077191413 0.1004220146 1.0302634816 0.0329624994
[21,] -0.0102362279 0.0012502622 0.0381463360 0.013219577 5.172232e-02 0.0613058394 -0.066918571 0.0980924411 0.0329624994 1.0354545441
$ineqx0
[1] 1
$nfuneval
[1] 1048
$outer.iter
[1] 2
$elapsed
Time difference of 0.1874812 secs
$vscale
[1] 3.33333273 0.00000001 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000
[14] 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000
Related
I want to generat a veusing R.
Is there a way to generate a sequence of POSITIVE numbers that satisfy specific constraints
a mean of 13,
a standard deviation of 30.96 , and
a sample size of 6.
Thank you guys.
Another option (after echoing all the concerns about this being an XY problem):
We can transform n samples of practically any continuous distribution by finding a pair of scale and translation parameters that satisfies the desired constraints.
f <- function(n, mu, sigma) {
x <- rnorm(n) # substitute any continuous distribution here
fn <- function(par) {
y <- exp(x*par[2] + par[1])
log(abs(mean(y) - mu) + abs(sd(y) - sigma))
}
with(optim(c(0, 0), fn), exp(x*par[2] + par[1]))
}
# example usage
f(6L, 13, 30.96)
#> [1] 2.569263e-09 1.912637e-06 9.521086e-05 4.023787e-01 7.618698e+01 1.410541e+00
# take 20 sets of samples
m <- matrix(NA, 20, 8)
for (i in 1:nrow(m)) {
m[i, 1:6] <- sort(f(6L, 13, 30.96))
m[i, 7] <- mean(m[i, 1:6]) - 13 # difference from desired mean
m[i, 8] <- sd(m[i, 1:6]) - 30.96 # difference from desired SD
}
m
#> [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
#> [1,] 2.373175e-12 5.239577e-08 1.612212e-07 9.033024e-01 0.9063398 76.19036 -7.105427e-15 -1.776357e-14
#> [2,] 1.185131e-17 2.420683e-12 1.967461e-11 1.613739e-03 1.8189977 76.17939 -7.105427e-15 3.552714e-15
#> [3,] 4.178030e-03 6.449223e-03 3.329587e-02 7.889274e-02 1.6949839 76.18220 1.776357e-15 0.000000e+00
#> [4,] 1.134263e-19 8.582391e-13 3.675582e-12 7.909737e-06 1.8206428 76.17935 -1.776357e-15 -3.552714e-15
#> [5,] 3.875063e-11 5.453168e-06 1.005380e-05 7.971751e-02 1.7390532 76.18121 -3.552714e-15 0.000000e+00
#> [6,] 2.057442e-10 1.395120e-04 2.825930e-03 8.257653e-01 0.9809560 76.19031 1.065814e-14 0.000000e+00
#> [7,] 3.281160e-02 3.939408e-02 4.617334e-02 6.313399e-01 1.0591873 76.19109 -1.776357e-15 -1.776357e-14
#> [8,] 1.652719e-08 1.700647e-08 2.909621e-05 1.366993e-02 1.8066190 76.17968 -1.065814e-14 -1.065814e-14
#> [9,] 3.231989e-18 1.103402e-08 4.891128e-08 2.246015e-01 1.5912433 76.18416 1.776357e-15 0.000000e+00
#> [10,] 2.147959e-25 1.412579e-21 4.929303e-16 7.013902e-04 1.8199323 76.17937 0.000000e+00 -5.684342e-14
#> [11,] 8.239083e-08 1.516883e-05 4.268010e-01 6.580114e-01 0.7215226 76.19365 1.065814e-14 3.552714e-15
#> [12,] 8.837010e-05 7.983109e-04 3.712888e-03 9.311085e-03 1.8064017 76.17969 0.000000e+00 -7.105427e-15
#> [13,] 3.586152e-178 2.149918e-134 1.681243e-56 2.681863e-09 1.8206509 76.17935 -5.329071e-15 -9.592327e-14
#> [14,] 5.860182e-07 1.645025e-05 1.057840e-03 3.878328e-02 1.7798381 76.18030 -1.776357e-15 -3.552714e-15
#> [15,] 2.363474e-05 1.090204e-03 3.533081e-03 2.924378e-01 1.5174856 76.18543 0.000000e+00 1.421085e-14
#> [16,] 1.666746e-03 6.105670e-02 1.966066e-01 6.731973e-01 0.8746421 76.19283 1.776357e-15 0.000000e+00
#> [17,] 1.633101e-06 5.641356e-05 2.427083e-02 9.874914e-02 1.6947355 76.18219 -1.598721e-14 -7.105427e-15
#> [18,] 2.124617e-72 2.143486e-68 9.930707e-68 1.367184e-08 1.8206509 76.17935 2.842171e-14 -1.776357e-14
#> [19,] 5.846315e-19 1.528350e-06 2.030263e-06 7.959439e-04 1.8198318 76.17937 -1.776357e-15 -1.776357e-14
#> [20,] 5.630657e-03 1.462770e-01 2.914364e-01 6.119128e-01 0.7504916 76.19425 1.776357e-15 0.000000e+00
Inspired by #SamMason's comment, here is an empirical solution. Though I am concerned that providing OP with an answer may actually do them a disservice, since I suspect this may well be an XY problem.
First, establish if it is possible to construct a set of six numbers that satisfy OP's constraints:
f <- function(x) {
y <- c(rep(x, 5), 6*13 - 5*x)
sd(y) - 30.96
}
uniroot(f, c(0, 1))$root
[1] 0.3606329
> y
[1] 0.3606329 0.3606329 0.3606329 0.3606329 0.3606329 76.1968355
> mean(y)
[1] 13
> sd(y)
[1] 30.96
So, yes, it is possible to construct a set of six numbers that meets OP's constraints. Now introduce a (small) degree of variation into the first five elements of the sample, fix the sixth to satisfy the mean constraint and calculate the sample SD. If the SD isn't "close enough" to the target, throw the sample away and try again. Impose an upper limit to the number of tries to prevent infinite looping.
# For reproducibility
set.seed(1234)
f1 <- function(sd) {
y <- rnorm(5, mean=0.3606329, sd=sd)
y[6] <- 6*13 - sum(y)
y
}
findIt <- function(sd, epsilon=0.001, maxIter=1000) {
iteration <- 0
found <- FALSE
while(!found) {
z <- f1(sd)
if (abs(sd(z) - 30.96) < epsilon) {
found <- TRUE
return(z)
}
iteration <- iteration + 1
if (iteration == maxIter) {
warning(paste0("No solution found after ", maxIter, " iterations"))
return(NA)
}
}
}
z <- findIt(0.2)
> z
[1] 0.44505164 0.66907765 0.47566925 0.09247431 0.12141987 76.19630728
> mean(z)
[1] 13
> sd(z)
[1] 30.96053
Note that findIt() is sloppy in that it assumes that the SD is "small enough" that all generated values are positive.
This procedure generates a sample that is "random" in the sense that five of the values are iid N(0.3606329, sigma * sigma) and the fifth is (highly) correlated with the sum of the other five. The joint distribution of all six values in the sample is not obvious to me.
I can't imagine a scenario in which this is a sensible thing to do.
I am given an empirical distribution FXemp of a real-valued random variable X. Given now X1,..., Xn having the same distribution as X and dependencies given by a copula C. I would like now to produce random samples of X1,..., Xn element of R.
E.g. I am given a vector of samples and the corresponding cdf
x <- rnorm(1000)
df <- ecdf(x)
Assume that I pick for a example a t-student or Clayton copula C. How can I produce random samples of for example 10 copies of x, where their dependency is determined by C.
Is there an easy way?
Or are their any packages that can be used here?
You can sample from the copula (with uniform margins) by using the copula package, and then apply the inverse ecdf to each component:
library(copula)
x <- rnorm(100) # sample of X
d <- 5 # desired number of copies
copula <- claytonCopula(param = 2, dim = d)
nsims <- 25 # number of simulations
U <- rCopula(nsims, copula) # sample from the copula (with uniform margins)
# now sample the copies of X ####
Xs <- matrix(NA_real_, nrow = nsims, ncol = d)
for(i in 1:d){
Xs[,i] <- quantile(x, probs = U[,i], type = 1) # type=1 is the inverse ecdf
}
Xs
# [,1] [,2] [,3] [,4] [,5]
# [1,] -0.5692185 -0.9254869 -0.6821624 -1.2148041 -0.682162391
# [2,] -0.4680407 -0.4263257 -0.3456553 -0.6132320 -0.925486872
# [3,] -1.1322063 -1.2148041 -0.8115089 -1.0074435 -1.430405604
# [4,] 0.9760268 1.2600186 1.0731551 1.2369623 0.835024471
# [5,] -1.1280825 -0.8995429 -0.5761037 -0.8115089 -0.543125426
# [6,] -0.1848303 -1.2148041 -0.5692185 0.8974921 -0.613232036
# [7,] -0.5692185 -0.3070884 -0.8995429 -0.8115089 -0.007292346
# [8,] 0.1696306 0.4072428 0.7646646 0.4910863 1.236962330
# [9,] -0.7908557 -1.1280825 -1.2970952 0.3655081 -0.633521404
# [10,] -1.3226053 -1.0074435 -1.6857615 -1.3226053 -1.685761474
# [11,] -2.5410325 -2.3604936 -2.3604936 -2.3604936 -2.360493569
# [12,] -2.3604936 -2.2530003 -1.9311289 -2.2956444 -2.360493569
# [13,] 0.4072428 -0.2150035 -0.3564803 -0.1051930 -0.166434458
# [14,] -0.4680407 -1.0729763 -0.6335214 -0.8995429 -0.899542914
# [15,] -0.9143225 -0.1522242 0.4053462 -1.0729763 -0.158375658
# [16,] -0.4998761 -0.7908557 -0.9813504 -0.1763604 -0.283013334
# [17,] -1.2148041 -0.9143225 -0.5176347 -0.9143225 -1.007443492
# [18,] -0.2150035 0.5675260 0.5214050 0.8310799 0.464151265
# [19,] -1.2148041 -0.6132320 -1.2970952 -1.1685962 -1.132206305
# [20,] 1.4456635 1.0444720 0.7850181 1.0742214 0.785018119
# [21,] 0.3172811 1.2369623 -0.1664345 0.9440006 1.260018624
# [22,] 0.5017980 1.4068250 1.9950305 1.2600186 0.976026807
# [23,] 0.5675260 -1.0729763 -1.2970952 -0.3653535 -0.426325703
# [24,] -2.5410325 -2.2956444 -2.3604936 -2.2956444 -2.253000326
# [25,] 0.4053462 -0.5431254 -0.5431254 0.8350245 0.950891450
I am a beginner in R and I have written a double-for loop for calculating chi2 values for selecting features among 6610 terms and 10 classes.
Here is my for loops:
library(raster)
#for x^2 [n,r] = term n, class r. n starts from col #7 and r starts from col #6617
chi2vals <- matrix(0:0,6610,10)
chi2avgs <- vector("numeric",6610L)
for(r in 1:10){
for(n in 1:6610){
A = sum(data1.sub.added[,6+n]==1 & data1.sub.added[,6616+r]==1)
M = sum(data1.sub.added[,6+n]==1)
P = sum(data1.sub.added[,6616+r]==1)
N = nrow(data1.sub.added)
E = ((A*N)-(M*P))**2
F = (N-P)*(N-M)
chi2vals[n,r] = (N/(P*M))*(E/F) # for term n
}
Prcj = sum(data1.sub.added[,6616+r]==1)/sum(data1.sub.added[,6616:6626]==1) #probability of class c_r
pchi <- Prcj * chi2vals
chi2avgs[n] = rowSums(pchi)[n]
}
The code correctly calculates everything up to the line pchi <- Prcj * chi2vals. The result is a nice matrix of p*chi2 values:
> head(pchi)
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
[1,] 128.36551442 0.239308113 0.683517530 1.5038665 0.6145058 3.656857e-01 1.3311564 2.6977448 0.410702803
[2,] 0.06632758 0.067970859 0.019178551 0.2900692 1.5300639 4.430705e-08 0.2599859 0.6362953 0.098745147
[3,] 1.85641330 1.411925435 3.590747764 7.3018416 38.8044465 4.102248e-01 6.4118078 13.0164994 1.709506238
[4,] 0.11063892 0.005039029 0.244964758 0.1622654 0.1156411 8.274468e+00 0.2564959 0.0577651 0.242946022
[5,] 0.04788648 0.049072885 0.001420669 0.2094211 1.7200152 2.045923e-01 0.1877019 0.1468187 0.005493183
[6,] 5.39946188 6.899336618 60.735646913 7.4351538 10.7005784 9.946261e+00 35.8868899 178.7112406 11.382740754
[,10]
[1,] 0.26436516
[2,] 0.14414444
[3,] 0.90292073
[4,] 0.01168997
[5,] 0.06641298
[6,] 19.68599142
But the final chi2avgs values mostly turn out to be zeros:
> head(chi2avgs)
[1] 0.000000 0.000000 0.000000 0.000000 2.638835 0.000000
However, when aside from the loop I replace n with any number, the last line works well:
chi2avgs[1] = rowSums(pchi)[1]
chi2avgs[2] = rowSums(pchi)[2]
chi2avgs[3] = rowSums(pchi)[3]
chi2avgs[4] = rowSums(pchi)[4]
chi2avgs[5] = rowSums(pchi)[5]
> head(chi2avgs)
[1] 136.476367 3.112781 75.416334 9.481914 2.638835 0.000000
I wonder what causes this problem. Do you have an idea how I can fix it?
You can try directly rowsums without [n]
chi2avgs = rowSums(pchi)
I have two lists of more than 1500 elements, one vector list and one matrix list. Here some example data:
Z_matr <- list("111.2012"= matrix(c(0,0,100,200,0,0,0,0,50,350,0,50,50,200,200,0),
nrow = 4, ncol = 4, byrow = T),
"112.2012"= matrix(c(10,90,0,30,10,90,0,10,200,50,10,350,150,100,200,10),
nrow = 4, ncol = 4, byrow = T))
p <- list("111.2012"=c(200, 1000, 100, 10), "112.2012"=c(300, 900, 50, 100))
On this two list I want to perform the following function, which of course works fine on this data:
kast <- function(Z_matr, p) {
imp <- rowSums(Z_matr)
exp <- colSums(Z_matr)
x = p + imp
ac = p + imp - exp
einsdurchx = 1/as.vector(x)
einsdurchx[is.infinite(einsdurchx)] <- 0
A = Z_matr %*% diag(einsdurchx)
return(A)
}
mapply(kast, Z_matr,p, SIMPLIFY=FALSE)
However, I with my original lists I get an error. What I need is a counting of the list names that already had been calculated before it comes to the error list element (so that I know which of the list combinations creates an error). So, I tryed print(names(A)) however I only get NULL, NULL... How can I get instead this, for this example 111.2012 and 112.2012 with print?
Set it up so you pass the names and use to index the object:
kast <- function(item, p) { print(item)
imp <- rowSums(Z_matr[[item]])
exp <- colSums(Z_matr[[item]])
x = p + imp
ac = p + imp - exp
einsdurchx = 1/as.vector(x)
einsdurchx[is.infinite(einsdurchx)] <- 0
A = Z_matr[[item]] %*% diag(einsdurchx)
return(A)
}
mapply(kast, names(Z_matr),p, SIMPLIFY=FALSE)
The output... obviously you take out the print statement:
[1] "111.2012"
[1] "112.2012"
$`111.2012`
[,1] [,2] [,3] [,4]
[1,] 0.0 0.00 0.1818182 0.4347826
[2,] 0.0 0.00 0.0000000 0.0000000
[3,] 0.1 0.35 0.0000000 0.1086957
[4,] 0.1 0.20 0.3636364 0.0000000
$`112.2012`
[,1] [,2] [,3] [,4]
[1,] 0.02325581 0.08910891 0.00000000 0.05357143
[2,] 0.02325581 0.08910891 0.00000000 0.01785714
[3,] 0.46511628 0.04950495 0.01515152 0.62500000
[4,] 0.34883721 0.09900990 0.30303030 0.01785714
This is a longstanding issue with the use of both s/lapply and mapply. Only the values and not the names of list items are passed to functions. They are only added back after the processing. You can see this if you attempt to print(deparse(substitute(Z_matr))) as the first call inside your example function.
Here is an excerpt of numeric matrix that I have
[1,] 30 -33.129487 3894754.1 -39.701738 -38.356477 -34.220534
[2,] 29 -44.289487 -8217525.9 -44.801738 -47.946477 -41.020534
[3,] 28 -48.439487 -4572815.9 -49.181738 -48.086477 -46.110534
[4,] 27 -48.359487 -2454575.9 -42.031738 -43.706477 -43.900534
[5,] 26 -38.919487 -2157535.9 -47.881738 -43.576477 -46.330534
[6,] 25 -45.069487 -5122485.9 -47.831738 -47.156477 -42.860534
[7,] 24 -46.207487 -2336325.9 -53.131738 -50.576477 -50.410534
[8,] 23 -51.127487 -2637685.9 -43.121738 -47.336477 -47.040534
[9,] 22 -45.645487 3700424.1 -56.151738 -47.396477 -50.720534
[10,] 21 -56.739487 1572594.1 -49.831738 -54.386577 -52.470534
[11,] 20 -46.319487 642214.1 -39.631738 -44.406577 -41.490534
What I want to do now, is to scale the values for each column to have values from 0 to 1.
I tried to accomplish this using the scale() function on my matrix (default parameters), and I got this
[1,] -0.88123100 0.53812440 -1.05963281 -1.031191482 -0.92872324
[2,] -1.17808251 -1.13538649 -1.19575096 -1.289013031 -1.11327085
[3,] -1.28847084 -0.63180980 -1.31265244 -1.292776849 -1.25141017
[4,] -1.28634287 -0.33914007 -1.12182012 -1.175023107 -1.19143220
[5,] -1.03524267 -0.29809911 -1.27795565 -1.171528133 -1.25738083
[6,] -1.19883019 -0.70775576 -1.27662116 -1.267774342 -1.16320727
[7,] -1.22910054 -0.32280189 -1.41807728 -1.359719044 -1.36810940
[8,] -1.35997055 -0.36443973 -1.15091204 -1.272613537 -1.27664977
[9,] -1.21415156 0.51127451 -1.49868058 -1.274226602 -1.37652260
[10,] -1.50924749 0.21727976 -1.33000083 -1.462151358 -1.42401647
[11,] -1.23207969 0.08873245 -1.05776452 -1.193844887 -1.12602635
Which is already close to what I want, but values from 0:1 were even better. I read the help manual of scale(), but I really don't understand how I would do that.
Try the following, which seems simple enough:
## Data to make a minimal reproducible example
m <- matrix(rnorm(9), ncol=3)
## Rescale each column to range between 0 and 1
apply(m, MARGIN = 2, FUN = function(X) (X - min(X))/diff(range(X)))
# [,1] [,2] [,3]
# [1,] 0.0000000 0.0000000 0.5220198
# [2,] 0.6239273 1.0000000 0.0000000
# [3,] 1.0000000 0.9253893 1.0000000
And if you were still to use scale:
maxs <- apply(a, 2, max)
mins <- apply(a, 2, min)
scale(a, center = mins, scale = maxs - mins)
Install the clusterSim package and run the following command:
normX = data.Normalization(x,type="n4");
scales package has a function called rescale:
set.seed(2020)
x <- runif(5, 100, 150)
scales::rescale(x)
#1.0000000 0.5053362 0.9443995 0.6671695 0.0000000
Not the prettiest but this just got the job done, since I needed to do this in a dataframe.
column_zero_one_range_scale <- function(
input_df,
columns_to_scale #columns in input_df to scale, must be numeric
){
input_df_replace <- input_df
columncount <- length(columns_to_scale)
for(i in 1:columncount){
columnnum <- columns_to_scale[i]
if(class(input_df[,columnnum]) !='numeric' & class(input_df[,columnnum])!='integer')
{print(paste('Column name ',colnames(input_df)[columnnum],' not an integer or numeric, will skip',sep='')) }
if(class(input_df[,columnnum]) %in% c('numeric','integer'))
{
vec <- input_df[,columnnum]
rangevec <- max(vec,na.rm=T)-min(vec,na.rm=T)
vec1 <- vec - min(vec,na.rm=T)
vec2 <- vec1/rangevec
}
input_df_replace[,columnnum] <- vec2
colnames(input_df_replace)[columnnum] <- paste(colnames(input_df)[columnnum],'_scaled')
}
return(input_df_replace)
}