I am new to R and need to do pairwise comparison formulas across a set of variables. The number of elements to be compared will by dynamic but here is a hardcoded example with 4 elements, each compared against the other:
#there are 4 choices A, B, C, D -
#they are compared against each other and comparisons are stored:
df1 <- data.frame("A" = c(80),"B" = c(20))
df2 <- data.frame("A" = c(90),"C" = c(10))
df3 <- data.frame("A" = c(95), "D" = c(5))
df4 <- data.frame("B" = c(80), "C" = c(20))
df5 <- data.frame("B" = c(90), "D" = c(10))
df6 <- data.frame("C" = c(80), "D" = c(20))
#show the different comparisons in a matrix
matrixA <- matrix(c("", df1$B[1], df2$C[1], df3$D[1],
df1$A[1], "", df4$C[1], df5$D[1],
df2$A[1], df4$B[1], "", df6$D[1],
df3$A[1], df5$B[1], df6$C[1], ""),
nrow=4,ncol = 4,byrow = TRUE)
dimnames(matrixA) = list(c("A","B","C","D"),c("A","B","C","D"))
#perform calculations on the comparisons
matrixB <- matrix(
c(1, df1$B[1]/df1$A[1], df2$C[1]/df2$A[1], df3$D[1]/df3$A[1],
df1$A[1]/df1$B[1], 1, df4$C[1]/df4$B[1], df5$D[1]/df5$B[1],
df2$A[1]/df2$C[1], df4$B[1]/df4$C[1], 1, df6$D[1]/df6$C[1],
df3$A[1]/df3$D[1], df5$B[1]/df5$D[1], df6$C[1]/df6$D[1], 1),
nrow = 4, ncol = 4, byrow = TRUE)
matrixB <- rbind(matrixB, colSums(matrixB)) #add the sum of the colums
dimnames(matrixB) = list(c("A","B","C","D","Sum"),c("A","B","C","D"))
#so some more calculations that I'll use later on
dfC <- data.frame("AB" = c(matrixB["A","A"] / matrixB["A","B"],
matrixB["B","A"] / matrixB["B","B"],
matrixB["C","A"] / matrixB["C","B"],
matrixB["D","A"] / matrixB["D","B"]),
"BC" = c(matrixB["A","B"] / matrixB["A","C"],
matrixB["B","B"] / matrixB["B","C"],
matrixB["C","B"] / matrixB["C","C"],
matrixB["D","B"] / matrixB["D","C"]
),
"CD" = c(matrixB["A","C"] / matrixB["A","D"],
matrixB["B","C"] / matrixB["B","D"],
matrixB["C","C"] / matrixB["C","D"],
matrixB["D","C"] / matrixB["D","D"]))
dfCMeans <- colMeans(dfC)
#create the normalization matrix
matrixN <- matrix(c(
matrixB["A","A"] / matrixB["Sum","A"], matrixB["A","B"] / matrixB["Sum","B"], matrixB["A","C"] / matrixB["Sum","C"], matrixB["A","D"] / matrixB["Sum","D"],
matrixB["B","A"] / matrixB["Sum","A"], matrixB["B","B"] / matrixB["Sum","B"], matrixB["B","C"] / matrixB["Sum","C"], matrixB["B","D"] / matrixB["Sum","D"],
matrixB["C","A"] / matrixB["Sum","A"], matrixB["C","B"] / matrixB["Sum","B"], matrixB["C","C"] / matrixB["Sum","C"], matrixB["C","D"] / matrixB["Sum","D"],
matrixB["D","A"] / matrixB["Sum","A"], matrixB["D","B"] / matrixB["Sum","B"], matrixB["D","C"] / matrixB["Sum","C"], matrixB["D","D"] / matrixB["Sum","D"]
), nrow = 4, ncol = 4, byrow = TRUE)
Since R is so concise it seems like there should be a much better way to do this, I would like to know an easier way to figure out these type of calculations using R.
OK, I might be starting to piece together something here.
We start with a matrix like so:
A <- structure(
c(NA, 20, 10, 5, 80, NA, 20, 10, 90, 80, NA, 20, 95, 90, 80, NA),
.Dim = c(4, 4),
.Dimnames = list(LETTERS[1:4], LETTERS[1:4]))
A
# A B C D
# A NA 80 90 95
# B 20 NA 80 90
# C 10 20 NA 80
# D 5 10 20 NA
This matrix is the result of a pairwise comparison on a vector of length 4. We know nothing of this vector, and the only thing we know about the function used in the comparison is that it is binary non-commutative, or more precisely: f(x, y) = 100 - f(y, x) and the result is ∈ [0, 100].
matrixB appears to be simply matrixA divided by its own transpose:
B = ATA-1
or if you prefer:
B = (100 - A) / A
Potato patato due to above mentioned properties.
B <- (100 - A) / A
B <- t(A) / A
# fill in the diagonal with 1s
diag(B) <- 1
round(B, 2)
# A B C D
# A 1 0.25 0.11 0.05
# B 4 1.00 0.25 0.11
# C 9 4.00 1.00 0.25
# D 19 9.00 4.00 1.00
The 'normalized' matrix as you call it seems to be simply each column divided by its sum.
B.norm <- t(t(B) / colSums(B))
round(B.norm, 3)
# A B C D
# A 0.030 0.018 0.021 0.037
# B 0.121 0.070 0.047 0.079
# C 0.273 0.281 0.187 0.177
# D 0.576 0.632 0.746 0.707
Related
I have data that looks like this:
data = data.frame(a.coef = c(.14, .15, .16),
b.coef = c(.4, .5, .6),
a.var = c(0.0937, 0.0934, 0.0945),
b.var = c(0.00453, 0.00564, 0.00624),
ab.cov = c(0.000747, 0.000747, 0.000747))
and I would like to run the following code (source: http://www.quantpsy.org/medmc/medmc.htm) on each row of the data set.
require(MASS)
a = data$a.coef
b = data$b.coef
rep = 10000
conf = 95
pest = c(a, b)
acov <- matrix(c(data$a.var, data$ab.cov,
data$ab.cov, data$b.var), 2, 2)
mcmc <- mvrnorm(rep, pest, acov, empirical = FALSE)
ab <- mcmc[ , 1] * mcmc[ , 2]
low = (1 - conf / 100) / 2
upp = ((1 - conf / 100) / 2) + (conf / 100)
LL = quantile(ab, low)
UL = quantile(ab, upp)
LL4 = format(LL, digits = 4)
UL4 = format(UL, digits = 4)
I've created a relatively simple function that takes the data and the row number as inputs:
MCMAM <- function(data_input, row_number) {
data = data_input[row_number, ]
a = data[["a.coef"]]
b = data[["b.coef"]]
rep = 10000
conf = 95
pest = c(a, b)
acov <- matrix(c(data[["a.var"]], data[["ab.cov"]],
data[["ab.cov"]], data[["b.var"]]), 2, 2)
require(MASS)
mcmc <- mvrnorm(rep, pest, acov, empirical = FALSE)
ab <- mcmc[, 1] * mcmc[, 2]
low = (1 - conf / 100) / 2
upp = ((1 - conf / 100) / 2) + (conf / 100)
LL = quantile(ab, low)
UL = quantile(ab, upp)
return(c(LL, UL))
}
MCMAM(data, 1)
2.5% 97.5%
-0.1901272 0.3104614
But it would be great if there was a way to get rid of the row specification and just have the function run through the data set row by row and save the output to a new column in the data set.
I've been experimenting with for loops and apply functions but haven't had any success, largely because both the matrix() and mvrnorm() functions take values rather than vectors.
We can use lapply
do.call(rbind, lapply(seq_len(nrow(data)), MCMAM, data_input = data))
-ouptut
2.5% 97.5%
[1,] -0.1832449 0.3098362
[2,] -0.2260856 0.3856575
[3,] -0.2521126 0.4666583
Or use rowwise
library(dplyr)
library(tidyr)
data %>%
rowwise %>%
mutate(new = list(MCMAM(cur_data(), 1))) %>%
unnest_wider(new)
# A tibble: 3 x 7
# a.coef b.coef a.var b.var ab.cov `2.5%` `97.5%`
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 0.14 0.4 0.0937 0.00453 0.000747 -0.185 0.309
#2 0.15 0.5 0.0934 0.00564 0.000747 -0.219 0.396
#3 0.16 0.6 0.0945 0.00624 0.000747 -0.259 0.472
I trying to find a way to do a nested for loop in r to get every possible correlation combination of this:
cor(y, column1* column2),
cor(y, column1* column3),
cor(y, column1* column4)
and so on
This is what I have tried so far:
for(i in 1:length(dataframe))
{
for(j in 1:length(dataframe))
{
joint_correlation(i,j)=cor(y ~ dataframe(i) * dataframe(j));
}
}
My dataframe has 115 columns like shown with a small sample:
FG_pct FGA FT FT_pct FTA GP GS GmSc MP ORB
0.625 8 0 0.00 0 1 0 6.6 28.4 2
0.500 4 0 0.00 1 2 0 2.1 17.5 0
0.000 1 0 0.00 0 3 0 1.2 6.6 1
0.500 6 0 0.00 0 4 0 3.6 13.7 1
0.500 2 0 0.00 0 5 0 0.9 7.4 1
I want to find the correlation for cor(MP, column1* column2) for every possible combination switched out for column1 and column2. This way, I wouldn't have to do every single one of them separately. If possible, I would like to save the output for each correlation combination cor(MP, column1* column2), cor(MP, column1* column3),cor(MP, column2* column4), etc. in a separate column.
This is an example of what I want:
cor(MP, FG_pct*FT_pct)
Edit: Jean-Claude Arbaut gives a better answers, as commented to this answer. Use cor(df).
Here is my botched answer: Using the library corrgram (Which is mainly a visual tool) we can easily get all combinations of correlations in a dataset. Example:
library(corrgram)
#Example data
df <- data.frame(x = rnorm(50, 5, 5),
y = rnorm(50, 2, 5))
df$z <- df$x / df$y
df$abc <- df$x * df$y * df$z
#panel arguments are necessary if you want to visualize correlations
corr <- corrgram(df,
order = F,
lower.panel = panel.cor,
upper.panel = panel.pts,
text.panel = panel.txt,
diag.panel = panel.minmax,
main = "Correlation")
#call corr gives
corr
x y z abc
x 1.00000000 0.07064179 0.1402051 0.89166002
y 0.07064179 1.00000000 0.2495239 0.08024278
z 0.14020508 0.24952388 1.0000000 0.14649093
abc 0.89166002 0.08024278 0.1464909 1.00000000
There is absolutely a better way for doing this with functions and without a package, but its early here, and if you are desperate to get the results this will probably do you fine.
p.s using the corrgram() function without assigning it will give you a nice visualization of your correlations.
Assuming you want the correlations of every column multiplied by combinations of two of the remaining columns.
We can find the names of according combinations using combn(names(dat), 2) which we put into an lapply.
combs <- do.call(cbind.data.frame,
lapply("MP", rbind, combn(names(dat)[names(dat) != "MP"], 2)))
combs
# 1 2 3
# 1 MP MP MP
# 2 FG_pct FG_pct FGA
# 3 FGA FT FT
In another lapply we subset the data on the name-combinations and calculate cor with formula cor(x1 ~ x2 * x3). Simultaneously we store the names pasted as formula in an attribute, to remember later what we've calculated in each iteration.
res.l <- lapply(combs, function(x) {
`attr<-`(cor(dat[,x[1]], dat[,x[2]]*dat[,x[3]]),
"what", {
paste0(x[1], ", ", paste(x[2], "*", x[3]))})
})
Finally we unlist and setNames according to the attributes.
res <- setNames(unlist(res.l), sapply(res.l, attr, "what"))
res
Result
# MP, FG_pct * FGA MP, FG_pct * FT MP, FGA * FT
# 0.2121374 0.2829003 0.4737892
Check:
(Note, that you can directly put the names, e.g. MP, FG_pct * FGA into the cor function.)
with(dat, cor(MP, FG_pct * FGA))
# [1] 0.2121374
with(dat, cor(MP, FG_pct * FT))
# [1] 0.2829003
with(dat, cor(MP, FGA * FT))
# [1] 0.4737892
To sort, use e.g. sort(res) or rev(sort(res)).
Toy data:
set.seed(42)
dat <- as.data.frame(`colnames<-`(MASS::mvrnorm(n=1e4,
mu=c(0.425, 4.2, 0.2, 3),
Sigma=matrix(c(1, .3, .7, 0,
.3, 1, .5, 0,
.7, .5, 1, 0,
0, 0, 0, 1), nrow=4),
empirical=T), c("FG_pct", "MP", "FGA", "FT")))
Weirdly for this one, I think its easier to start by viewing the df.
#reproducible data
quantiles<-c("50","90")
var=c("w","d")
df=data.frame(a=runif(20,0.01,.5),b=runif(20,0.02,.5),c=runif(20,0.03,.5),e=runif(20,0.04,.5),
q50=runif(20,1,5),q90=runif(20,10,50))
head(df)
I want to automate a function that I've created (below) to calculate vars using different combinations of values from my df.
For example, the calculation of w needs to use a and b, and d needs to use c and e such that w = a *q ^ b and d = c * q ^ e. Further, q is a quantile, so I actually want w50, w90, etc., which will correspond to q50, q90 etc. from the df.
The tricky part as i see it is setting the condition to use a & b vs. c & d without using nested loops.
I have a function to calculate vars using the appropriate columns, however I can't get all the pieces together efficiently.
#function to calculate the w, d
calc_wd <- function(df,col_name,col1,col2,col3){
#Calculate and create new column col_name for each combo of var and quantile, e.g. "w_50", "d_50", etc.
df[[col_name]] <- df[[col1]] * (df[[col2]] ^ (df[[col3]]))
df
}
I can get this to work for a single case, but not by automating the coefficient swap... you'll see I specify "a" and "b" below.
wd<-c("w_","d_")
make_wd_list<-apply(expand.grid(wd, quantiles), 1, paste,collapse="")
calc_wdv(df,make_wd_list[1],"a",paste0("q",sapply(strsplit(make_wd_list[1],"_"),tail,1)),"b")
Alternatively, I have tried to make this work using nested for loops, but can't seem to append the data correctly. And its ugly.
var=c("w","d")
dataf<-data.frame()
for(j in unique(var)){
if(j=="w"){
coeff1="a"
coeff2="b"
}else if(j=="d"){
coeff1="c"
coeff1="e"
}
print(coeff1)
print(coeff2)
for(k in unique(quantiles)){
dataf<-calc_wd(df,paste0(j,k),coeff1,paste0("q",k),coeff2)
dataf[k,j]=rbind(df,dataf) #this aint right. tried to do.call outside, etc.
}
}
In the end, I'm looking to have new columns with w_50, w_90, etc., which use q50, q90 and the corresponding coefficients as defined originally.
One approach I find easy to type is using purrr::pmap. I like this because when you use with(list(...),), you can access the column names of your data.frame by name. Additionally, you can supply additional arguments.
library(purrr)
pmap_df(df, quant = "q90", ~with(list(...),{
list(w = a * get(quant) ^ b, d = c * get(quant) ^ e)
}))
## A tibble: 20 x 2
# w d
# <dbl> <dbl>
# 1 0.239 0.295
# 2 0.152 0.392
# 3 0.476 0.828
# 4 0.344 0.236
# 5 0.439 1.00
You could combine this with for example a second map call to iterate over quantiles.
library(dplyr)
map(setNames(quantiles,quantiles),
~ pmap_df(df, quant = paste0("q",.x),
~ with(list(...),{list(w = a * get(quant) ^ b, d = c * get(quant) ^ e)}))
) %>% do.call(cbind,.)
# 50.w 50.d 90.w 90.d
#1 0.63585897 0.11045837 1.7276019 0.1784987
#2 0.17286184 0.22033649 0.2333682 0.5200265
#3 0.32437528 0.72502654 0.5722203 1.4490065
#4 0.68020897 0.33797621 0.8749206 0.6179557
#5 0.73516886 0.38481785 1.2782923 0.4870877
Then assigning a custom function is trivial.
calcwd <- function(df,quantiles){
map(setNames(quantiles,quantiles),
~ pmap_df(df, quant = paste0("q",.x),
~ with(list(...),{list(w = a * get(quant) ^ b, d = c * get(quant) ^ e)}))
) %>% do.call(cbind,.)
}
I love #Ian's answer for the completeness and the use of classics like with and do.call. I'm late to the scene with my solution but since I have been trying to get better with rowwise operations (including the use of rowwise thought I would offer up a less elegant but simpler and faster solution using just mutate, formula.tools and map_dfc
library(dplyr)
library(purrr)
require(formula.tools)
# same type example data plus a much larger version in df2 for
# performance testing
df <- data.frame(a = runif(20, 0.01, .5),
b = runif(20, 0.02, .5),
c = runif(20, 0.03, .5),
e = runif(20, 0.04, .5),
q50 = runif(20,1,5),
q90 = runif(20,10,50)
)
df2 <- data.frame(a = runif(20000, 0.01, .5),
b = runif(20000, 0.02, .5),
c = runif(20000, 0.03, .5),
e = runif(20000, 0.04, .5),
q50 = runif(20000,1,5),
q90 = runif(20000,10,50)
)
# from your original post
quantiles <- c("q50", "q90")
wd <- c("w_", "d_")
make_wd_list <- apply(expand.grid(wd, quantiles),
1,
paste, collapse = "")
make_wd_list
#> [1] "w_q50" "d_q50" "w_q90" "d_q90"
# an empty list to hold our formulas
eqn_list <- vector(mode = "list",
length = length(make_wd_list)
)
# populate the list makes it very extensible to more outcomes
# or to more quantile levels
for (i in seq_along(make_wd_list)) {
if (substr(make_wd_list[[i]], 1, 1) == "w") {
eqn_list[[i]] <- as.formula(paste(make_wd_list[[i]], "~ a * ", substr(make_wd_list[[i]], 3, 5), " ^ b"))
} else if (substr(make_wd_list[[i]], 1, 1) == "d") {
eqn_list[[i]] <- as.formula(paste(make_wd_list[[i]], "~ c * ", substr(make_wd_list[[i]], 3, 5), " ^ e"))
}
}
# formula.tools helps us grab both left and right sides
add_column <- function(df, equation){
df <- transmute_(df, rhs(equation))
colnames(df)[ncol(df)] <- as.character(lhs(equation))
return(df)
}
result <- map_dfc(eqn_list, ~ add_column(df = df, equation = .x))
#> w_q50 d_q50 w_q90 d_q90
#> 1 0.10580863 0.29136904 0.37839737 0.9014040
#> 2 0.34798729 0.35185585 0.64196417 0.4257495
#> 3 0.79714122 0.37242915 1.57594506 0.6198531
#> 4 0.56446922 0.43432160 1.07458217 1.1082825
#> 5 0.26896574 0.07374273 0.28557366 0.1678035
#> 6 0.36840408 0.72458466 0.72741030 1.2480547
#> 7 0.64484009 0.69464045 1.93290705 2.1663690
#> 8 0.43336109 0.21265672 0.46187366 0.4365486
#> 9 0.61340404 0.47528697 0.89286358 0.5383290
#> 10 0.36983212 0.53292900 0.53996112 0.8488402
#> 11 0.11278412 0.12532491 0.12486156 0.2413191
#> 12 0.03599639 0.25578020 0.04084221 0.3284659
#> 13 0.26308183 0.05322304 0.87057854 0.1817630
#> 14 0.06533586 0.22458880 0.09085436 0.3391683
#> 15 0.11625845 0.32995233 0.12749040 0.4730407
#> 16 0.81584442 0.07733376 2.15108243 0.1041342
#> 17 0.38198254 0.60263861 0.68082354 0.8502999
#> 18 0.51756058 0.43398089 1.06683204 1.3397900
#> 19 0.34490492 0.13790601 0.69168711 0.1580659
#> 20 0.39771037 0.33286225 1.32578056 0.4141457
microbenchmark::microbenchmark(result <- map_dfc(eqn_list, ~ add_column(df = df2, equation = .x)), times = 10)
#> Unit: milliseconds
#> expr min
#> result <- map_dfc(eqn_list, ~add_column(df = df2, equation = .x)) 10.58004
#> lq mean median uq max neval
#> 11.34603 12.56774 11.6257 13.24273 16.91417 10
The mutate and formula solution is about fifty times faster although both rip through 20,000 rows in less than a second
Created on 2020-04-30 by the reprex package (v0.3.0)
I'm working to implement a lpSolve solution to optimizing a hypothetical daily fantasy baseball problem. I'm having trouble applying my last constraint:
position - Exactly 3 outfielders (OF) 2 pitchers (P) and 1 of everything else
cost - Cost less than 200
team - Max number from any one team is 6
team - Minimum number of teams on a roster is 3**
Say for example you have a dataframe of 1000 players with points, cost, position, and team and you're trying to maximize average points:
library(tidyverse)
library(lpSolve)
set.seed(123)
df <- data_frame(avg_points = sample(5:45,1000, replace = T),
cost = sample(3:45,1000, replace = T),
position = sample(c("P","C","1B","2B","3B","SS","OF"),1000, replace = T),
team = sample(LETTERS,1000, replace = T)) %>% mutate(id = row_number())
head(df)
# A tibble: 6 x 5
# avg_points cost position team id
# <int> <int> <chr> <chr> <int>
#1 17 13 2B Y 1
#2 39 45 1B P 2
#3 29 33 1B C 3
#4 38 31 2B V 4
#5 17 13 P A 5
#6 10 6 SS V 6
I've implemented the first 3 constraints with the following code, but i'm having trouble figuring out how to implement the minimum number of teams on a roster. I think I need to add additional variable to the model, but i'm not sure how to do that.
#set the objective function (what we want to maximize)
obj <- df$avg_points
# set the constraint rows.
con <- rbind(t(model.matrix(~ position + 0,df)), cost = df$cost, t(model.matrix(~ team + 0, df)) )
#set the constraint values
rhs <- c(1,1,1,1,3,2,1, # 1. #exactly 3 outfielders 2 pitchers and 1 of everything else
200, # 2. at a cost less than 200
rep(6,26) # 3. max number from any team is 6
)
#set the direction of the constraints
dir <- c("=","=","=","=","=","=","=","<=",rep("<=",26))
result <- lp("max",obj,con,dir,rhs,all.bin = TRUE)
If it helps, i'm trying to replicate This paper (with minor tweaks) which has corresponding julia code here
This might be a solution for your problem.
This is the data I have used (identical to yours):
library(tidyverse)
library(lpSolve)
N <- 1000
set.seed(123)
df <- tibble(avg_points = sample(5:45,N, replace = T),
cost = sample(3:45,N, replace = T),
position = sample(c("P","C","1B","2B","3B","SS","OF"),N, replace = T),
team = sample(LETTERS,N, replace = T)) %>%
mutate(id = row_number())
You want to find x1...xn that maximise the objective function below:
x1 * average_points1 + x2 * average_points1 + ... + xn * average_pointsn
With the way lpSolve works, you will need to express every LHS as the sum over
x1...xn times the vector you provide.
Since you cannot express the number of teams with your current variables, you can introduce new ones (I will call them y1..yn_teams and z1..zn_teams):
# number of teams:
n_teams = length(unique(df$team))
Your new objective function (ys and zs will not influence your overall objective funtion, since the constant is set to 0):
obj <- c(df$avg_points, rep(0, 2 * n_teams))
)
The first 3 constraints are the same, but with the added constants for y and z:
c1 <- t(model.matrix(~ position + 0,df))
c1 <- cbind(c1,
matrix(0, ncol = 2 * n_teams, nrow = nrow(c1)))
c2 = df$cost
c2 <- c(c2, rep(0, 2 * n_teams))
c3 = t(model.matrix(~ team + 0, df))
c3 <- cbind(c3, matrix(0, ncol = 2 * n_teams, nrow = nrow(c3)))
Since you want to have at least 3 teams, you will first use y to count the number of players per team:
This constraint counts the number of players per team. You sum up all players of a team that you have picked and substract the corresponding y variable per team. This should be equal to 0. (diag() creates the identity matrix, we do not worry about z at this point):
# should be x1...xn - y1...n = 0
c4_1 <- cbind(t(model.matrix(~team + 0, df)), # x
-diag(n_teams), # y
matrix(0, ncol = n_teams, nrow = n_teams) # z
) # == 0
Since each y is now the number of players in a team, you can now make sure that z is binary with this constraint:
c4_2 <- cbind(t(model.matrix(~ team + 0, df)), # x1+...+xn ==
-diag(n_teams), # - (y1+...+yn )
diag(n_teams) # z binary
) # <= 1
This is the constraint that ensures that at least 3 teams are picked:
c4_3 <- c(rep(0, nrow(df) + n_teams), # x and y
rep(1, n_teams) # z >= 3
)
You need to make sure that
You can use the big-M method for that to create a constraint, which is:
Or, in a more lpSolve friendly version:
In this case you can use 6 as a value for M, because it is the largest value any y can take:
c4_4 <- cbind(matrix(0, nrow = n_teams, ncol = nrow(df)),
diag(n_teams),
-diag(n_teams) * 6)
This constraint is added to make sure all x are binary:
#all x binary
c5 <- cbind(diag(nrow(df)), # x
matrix(0, ncol = 2 * n_teams, nrow = nrow(df)) # y + z
)
Create the new constraint matrix
con <- rbind(c1,
c2,
c3,
c4_1,
c4_2,
c4_3,
c4_4,
c5)
#set the constraint values
rhs <- c(1,1,1,1,3,2,1, # 1. #exactly 3 outfielders 2 pitchers and 1 of everything else
200, # 2. at a cost less than 200
rep(6, n_teams), # 3. max number from any team is 6
rep(0, n_teams), # c4_1
rep(1, n_teams), # c4_2
3, # c4_3,
rep(0, n_teams), #c4_4
rep(1, nrow(df))# c5 binary
)
#set the direction of the constraints
dir <- c(rep("==", 7), # c1
"<=", # c2
rep("<=", n_teams), # c3
rep('==', n_teams), # c4_1
rep('<=', n_teams), # c4_2
'>=', # c4_3
rep('<=', n_teams), # c4_4
rep('<=', nrow(df)) # c5
)
The problem is almost the same, but I am using all.int instead of all.bin to make sure the counts work for the players in the team:
result <- lp("max",obj,con,dir,rhs,all.int = TRUE)
Success: the objective function is 450
roster <- df[result$solution[1:nrow(df)] == 1, ]
roster
# A tibble: 10 x 5
avg_points cost position team id
<int> <int> <chr> <chr> <int>
1 45 19 C I 24
2 45 5 P X 126
3 45 25 OF N 139
4 45 22 3B J 193
5 45 24 2B B 327
6 45 25 OF P 340
7 45 23 P Q 356
8 45 13 OF N 400
9 45 13 SS L 401
10 45 45 1B G 614
If you change your data to
N <- 1000
set.seed(123)
df <- tibble(avg_points = sample(5:45,N, replace = T),
cost = sample(3:45,N, replace = T),
position = sample(c("P","C","1B","2B","3B","SS","OF"),N, replace = T),
team = sample(c("A", "B"),N, replace = T)) %>%
mutate(id = row_number())
It will now be infeasable, because the number of teams in the data is less then 3.
You can check that it now works:
sort(unique(df$team))[result$solution[1027:1052]==1]
[1] "B" "E" "I" "J" "N" "P" "Q" "X"
sort(unique(roster$team))
[1] "B" "E" "I" "J" "N" "P" "Q" "X"
I have been trying to solve a constrained optimization problem in R using constrOptim() (my first time) but am struggling to set up the constraints for my problem.
The problem is pretty straight forward and i can set up the function ok but am a bit at a loss about passing the constraints in.
e.g. problem i've defined is (am going to start with N fixed at 1000 say so i just want to solve for X ultimately i'd like to choose both N and X that max profit):
so i can set up the function as:
fun <- function(x, N, a, c, s) { ## a profit function
x1 <- x[1]
x2 <- x[2]
x3 <- x[3]
a1 <- a[1]
a2 <- a[2]
a3 <- a[3]
c1 <- c[1]
c2 <- c[2]
c3 <- c[3]
s1 <- s[1]
s2 <- s[2]
s3 <- s[3]
((N*x1*a1*s1)-(N*x1*c1))+((N*x2*a2*s2)-(N*x2*c2))+((N*x3*a3*s3)-(N*x3*c3))
}
The constraints i need to implement are that:
x1>=0.03
x1<=0.7
x2>=0.03
x2<=0.7
x3>=0.03
x2<=0.7
x1+x2+x3=1
The X here represents buckets into which i need to optimally allocate N, so x1=pecent of N to place in bucket 1 etc. with each bucket having at least 3% but no more than 70%.
Any help much appreciated...
e.g. here is an example i used to test the function does what i want:
fun <- function(x, N, a, c, s) { ## profit function
x1 <- x[1]
x2 <- x[2]
x3 <- x[3]
a1 <- a[1]
a2 <- a[2]
a3 <- a[3]
c1 <- c[1]
c2 <- c[2]
c3 <- c[3]
s1 <- s[1]
s2 <- s[2]
s3 <- s[3]
((N*x1*a1*s1)-(N*x1*c1))+((N*x2*a2*s2)-(N*x2*c2))+((N*x3*a3*s3)-(N*x3*c3))
};
x <-matrix(c(0.5,0.25,0.25));
a <-matrix(c(0.2,0.15,0.1));
s <-matrix(c(100,75,50));
c <-matrix(c(10,8,7));
N <- 1000;
fun(x,N,a,c,s);
You can use The lpSolveAPI package.
## problem constants
a <- c(0.2, 0.15, 0.1)
s <- c(100, 75, 50)
c <- c(10, 8, 7)
N <- 1000
## Problem formulation
# x1 >= 0.03
# x1 <= 0.7
# x2 >= 0.03
# x2 <= 0.7
# x3 >= 0.03
# x1 +x2 + x3 = 1
#N*(c1- a1*s1)* x1 + (a2*s2 - c2)* x2 + (a3*s3- c3)* x3
library(lpSolveAPI)
my.lp <- make.lp(6, 3)
The best way to build a model in lp solve is columnwise;
#constraints by columns
set.column(my.lp, 1, c(1, 1, 0, 0, 1, 1))
set.column(my.lp, 2, c(0, 0, 1, 1, 0, 1))
set.column(my.lp, 3, c(0, 0, 0, 0, 1, 1))
#the objective function ,since we need to max I set negtive max(f) = -min(f)
set.objfn (my.lp, -N*c(c[1]- a[1]*s[1], a[2]*s[2] - c[2],a[3]*s[3]- c[3]))
set.rhs(my.lp, c(rep(c(0.03,0.7),2),0.03,1))
#constraint types
set.constr.type(my.lp, c(rep(c(">=","<="), 2),">=","="))
take a look at my model
my.lp
Model name:
Model name:
C1 C2 C3
Minimize 10000 -3250 2000
R1 1 0 0 >= 0.03
R2 1 0 0 <= 0.7
R3 0 1 0 >= 0.03
R4 0 1 0 <= 0.7
R5 1 0 1 >= 0.03
R6 1 1 1 = 1
Kind Std Std Std
Type Real Real Real
Upper Inf Inf Inf
Lower 0 0 0
solve(my.lp)
[1] 0 ## sucess :)
get.objective(my.lp)
[1] -1435
get.constraints(my.lp)
[1] 0.70 0.70 0.03 0.03 0.97 1.00
## the decisions variables
get.variables(my.lp)
[1] 0.03 0.70 0.27
Hi Just in case of use to anyone i also found an answer as below:
First of all, your objective function can be written a lot more concisely using vector operations:
> my_obj_coeffs <- function(N,a,c,s) N*(a*s-c)
> fun <- function(x,N,a,c,s) sum(my_obj_coeffs(N,a,c,s) * x)
You're trying to solve a linear program, so you can use solve it using the simplex algorithm. There's a lightweight implementation of it in the 'boot' package.
> library(boot)
> solution <- function(obj) simplex(obj, diag(3), rep(0.7,3), diag(3), rep(0.03,3), rep(1,3), 1, maxi=TRUE)
Then for the example parameters you used, you can call that solution function:
> a <- c(0.2,0.15,0.1)
> s <- c(100,75,50)
> c <- c(10,8,7)
> N <- 1000
> solution(my_obj_coeffs(N,a,c,s))
Linear Programming Results
Call : simplex(a = obj(N, a, s, c), A1 = diag(3), b1 = rep(0.7, 3),
A2 = diag(3), b2 = rep(0.03, 3), A3 = matrix(1, 1, 3), b3 = 1,
maxi = TRUE)
Maximization Problem with Objective Function Coefficients
[,1]
[1,] 10000
[2,] 3250
[3,] -2000
attr(,"names")
[1] "x1" "x2" "x3"
Optimal solution has the following values
x1 x2 x3
0.70 0.27 0.03
The optimal value of the objective function is 7817.5.