Use combn within a function for nonlinear regression with nlsLM - r

Given is a few rows of a dataframe DATA:
> dput(DATA[c(1,7,20,25,26,53,89),])
structure(list(Lanes = c(3, 3, 3, 3, 3, 3, 3), N_b = c(5, 5,
5, 5, 5, 5, 5), A = c(-12, -12, -15, -9, -9, -15, -9), x.sqr =
c(1440, 1440, 2250, 810, 810, 2250, 810), e_1 = c(21.8, 21.8,
29, 14.6, 14.6, 29, 14.6), e_2 = c(9.8, 9.8, 17, 2.6, 2.6, 17,
2.6), e_3 = c(-2.2, -2.2, 5, -9.4, -9.4, 5, -9.4), e_4 =
c(-14.2, -14.2, -7, 0, 0, -7, 0), e_5 = c(0, 0, -19, 0, 0, -19, 0),
S = c(12, 12, 15, 9, 9, 15, 9), CSi = c(0.59189685884369,
0.574916237257971, 0.644253184434141, 0.474070747691647,
0.492033722080107, 0.644904371480046, 0.49900365977452),
m = c(0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85)), row.names = c(1L,
7L, 20L, 25L, 26L, 53L, 89L), class = "data.frame")
I write the function below to use for nonlinear regression with nlsLM:
library(minpack.lm)
Prposed <- function(N_b,Lanes,m,A,x.sqr,e_1,e_2,e_3,e_4,e_5,S,a) {
e <- data.frame(e_1,e_2,e_3,e_4,e_5)
CSi <- m * ((Lanes/N_b) + (A * combn(e,Lanes,sum) / x.sqr) * (b*S^a))
return(CSi)
}
nlsLM <- nlsLM(CSi ~ Prposed(N_b,Lanes,m,A,x.sqr,e_1,e_2,e_3,e_4,e_5,S,a,b),
data = DATA,
start = c(a = 0.01, b = 0.01))
summary(nlsLM)
I keep getting an error and it is coming from how I am defining the columns e_1, e_2, etc.. with the combn function.
UPDATE
I found another question: Error when running nlsLM but works for nls
which uses a for loop in the original function, and that seems to work fine with the nls2 function from library(nls2). I was wondering if I could get rid of the combn term altogether by going to a for loop instead.

This is not an actual answer since it generates a new error after fixing the combn error but this might give you some direction.
I think you are trying to run nlsLM function for each row in DATA. You need to pass each row separately in Prposed function. Also note that a and b are required in the function to perform calculation so they need to be passed as an argument of the function and I think passing them using start in nlsLM would not work.
So change your function to :
library(minpack.lm)
Prposed <- function(N_b,Lanes,m,A,x.sqr,e_1,e_2,e_3,e_4,e_5,S,a, b) {
e <- data.frame(e_1,e_2,e_3,e_4,e_5)
CSi <- m * ((Lanes/N_b) + (A * combn(e,Lanes,sum) / x.sqr) * (b*S^a))
return(CSi)
}
Now let's run this for first row of DATA :
x <- DATA[1, ]
Prposed(x[[2]], x[[1]], x[[12]], x[[3]], x[[4]], x[[5]], x[[6]], x[[7]], x[[8]],
x[[9]],x[[10]],a = 0.01, b = 0.01)
#[1] 0.5078651 0.5087365 0.5077053 0.5096079 0.5085767 0.5094481 0.5104793
# 0.5094481 0.5103195 0.5111909
I don't know the theory so I don't know if these numbers make sense/are correct. However, when you plug this in nlsLM function it gives an error.
nlsLM(CSi~Prposed(x[[2]],x[[1]],x[[12]], x[[3]],x[[4]],x[[5]],x[[6]],x[[7]],
x[[8]], x[[9]],x[[10]],a = 0.01, b = 0.01),data = DATA)
Error in getInitial.default(func, data, mCall = as.list(match.call(func, :
no 'getInitial' method found for "function" objects
Is this because nlsLM expects a formula object but what we are passing to it is values? I am not sure.
Once you get the above step working you can plug this in an apply and run it as :
apply(DATA, 1, function(x) {
nlsLM(CSi~Prposed(x[[2]],x[[1]],x[[12]], x[[3]],x[[4]],x[[5]],x[[6]],x[[7]],
x[[8]], x[[9]],x[[10]],a = 0.01, b = 0.01), data = DATA)
})
It works and generates numbers without nlsLM function :
apply(DATA, 1, function(x) {
Prposed(x[[2]],x[[1]],x[[12]], x[[3]],x[[4]],x[[5]],x[[6]],x[[7]],x[[8]],
x[[9]],x[[10]],a = 0.01, b = 0.01)
})
# 1 7 20 25 26 53 89
# [1,] 0.5078651 0.5078651 0.5070307 0.5092470 0.5092470 0.5070307 0.5092470
# [2,] 0.5087365 0.5087365 0.5077293 0.5083395 0.5083395 0.5077293 0.5083395
# [3,] 0.5077053 0.5077053 0.5084280 0.5083395 0.5083395 0.5084280 0.5083395
# [4,] 0.5096079 0.5096079 0.5084280 0.5094980 0.5094980 0.5084280 0.5094980
# [5,] 0.5085767 0.5085767 0.5091267 0.5094980 0.5094980 0.5091267 0.5094980
# [6,] 0.5094481 0.5094481 0.5098253 0.5085905 0.5085905 0.5098253 0.5085905
# [7,] 0.5104793 0.5104793 0.5091267 0.5106565 0.5106565 0.5091267 0.5106565
# [8,] 0.5094481 0.5094481 0.5098253 0.5106565 0.5106565 0.5098253 0.5106565
# [9,] 0.5103195 0.5103195 0.5105240 0.5097490 0.5097490 0.5105240 0.5097490
#[10,] 0.5111909 0.5111909 0.5112227 0.5109075 0.5109075 0.5112227 0.5109075

I had to define the by row operation within the original function
Proposed <- function(N_b,Lanes,m,A,x.sqr,e_1,e_2,e_3,e_4,e_5,S,a,b) {
e <- data.frame(e_1,e_2,e_3,e_4,e_5)
CSi <- m * ((Lanes/N_b) + (max(A * combn(seq_along(e), Lanes, FUN = function(i) rowSums(e[i]))) / x.sqr) * (b*S^a))
return(CSi)
}
nlsLM <- nlsLM(CSi ~ Proposed(N_b,Lanes,m,A,x.sqr,e_1,e_2,e_3,e_4,e_5,S,a,b),
data = DATA,
start = c(a = 0.01, b = 0.01))
summary(nlsLM)

Related

Pooled average marginal effects from survey-weighted and multiple-imputed data

I am working with survey data and their associated weights, in addition to missing data that I imputed using mice(). The model I'm eventually running contains complex interactions between variables for which I want the average marginal effect.
This task seems trivial in STATA, but I'd rather stay in R since that's what I know best. It seems easy to retrieve AME's for each separate imputed dataset and average the estimates. However, I need to make use of pool() (from mice) to make sure I'm getting the correct standard errors.
Here is a reproducible example:
library(tidyverse)
library(survey)
library(mice)
library(margins)
df <- tibble(y = c(0, 5, 0, 4, 0, 1, 2, 3, 1, 12), region = c(1, 1, 1, 1, 1, 3, 3, 3, 3, 3),
weight = c(7213, 2142, 1331, 4342, 9843, 1231, 1235, 2131, 7548, 2348),
x1 = c(1.14, 2.42, -0.34, 0.12, -0.9, -1.2, 0.67, 1.24, 0.25, -0.3),
x2 = c(12, NA, 10, NA, NA, 12, 11, 8, 9, 9))
Using margins() on a simple (non-multiple) svyglm works without a hitch. Running svyglm on each imputation using which() and pooling the results also works well.
m <- with(surv_obj, svyglm(y ~ x1 * x2))
pool(m)
However, wrapping margins() into which() returns an error "Error in .svycheck(design) : argument "design" is missing, with no default"
with(surv_obj, margins(svyglm(y ~ x1 * x2), design = surv_obj))
If I specify the design in the svyglm call, I get "Error in UseMethod("svyglm", design) : no applicable method for 'svyglm' applied to an object of class "svyimputationList""
with(surv_obj, margins(svyglm(y ~ x1 * x2, design = surv_obj), design = surv_obj))
If I drop the survey layer, and simply try to run the margins on each imputed set and then pool, I get a warning: "Warning in get.dfcom(object, dfcom) : Infinite sample size assumed.".
m1 <- with(imputed_df, margins(lm(y ~ x1 * x2)))
pool(m1)
This worries me given that pool() may use sample size in its calculations.
Does anyone know of any method to either (a) use which(), margins() and pool() to retrieve the pooled average marginal effects or (b) knows what elements of margins() I should pass to pool() (or pool.scalar()) to achieve the desired result?
Update following Vincent's comment
Wanted to update this post following Vincent's comment and related package marginaleffects() which ended up fixing my issue. Hopefully, this will be helpful to others stuck on similar problems.
I implemented the code in the vignette linked in Vincent's comment, adding a few steps that allow for survey weighting and modeling. It's worth noting that svydesign() will drop any observations missing on clustering/weighting variables, so marginaleffects() can't predict values back unto the original "dat" data and will throw up an error. Pooling my actual data still throws up an "infinite sample size assumed", which (as noted) should be fine but I'm still looking into fixes.
library(tidyverse)
library(survey)
library(mice)
library(marginaleffects)
fit_reg <- function(dat) {
svy <- svydesign(ids = ~ 1, cluster = ~ region, weight = ~weight, data = dat)
mod <- svyglm(y ~ x1 + x2*factor(x3), design = svy)
out <- marginaleffects(mod, newdata = dat)
class(out) <- c("custom", class(out))
return(out)
}
tidy.custom <- function(x, ...) {
out <- marginaleffects:::tidy.marginaleffects(x, ...)
out$term <- paste(out$term, out$contrast)
return(out)
}
df <- tibble(y = c(0, 5, 0, 4, 0, 1, 2, 3, 1, 12), region = c(1, 1, 1, 1, 1, 3, 3, 3, 3, 3),
weight = c(7213, 2142, 1331, 4342, 9843, 1231, 1235, 2131, 7548, 2348),
x1 = c(1.14, 2.42, -0.34, 0.12, -0.9, -1.2, 0.67, 1.24, 0.25, -0.3),
x2 = c(12, NA, 10, NA, NA, 12, 11, 8, 9, 9),
x3 = c(1, 1, 1, 1, 1, 2, 2, 2, 2, 2))
imputed_df <- mice(df, m = 2, seed = 123)
dat_mice <- complete(imputed_df, "all")
mod_imputation <- lapply(dat_mice, fit_reg)
mod_imputation <- pool(mod_imputation)
summary(mod_imputation)

Pooling Survreg Results Across Multiply Imputed Datasets - Error Message: log(1 - 2 * pnorm(width/2)) : NaNs produced

I am trying to run an interval regression using the survival r package (as described here https://stats.oarc.ucla.edu/r/dae/interval-regression/), but I am running into difficulties when trying to pool results across multiply imputed datasets. Specifically, although estimates are returned, I get the following error: log(1 - 2 * pnorm(width/2)) : NaNs produced. The estimates seem reasonable, at face value (no NaNs, very large or small SEs).
I ran the same model on the stacked dataset (ignoring imputations) and on individual imputed datasets, but in either case, I do not get the error. Would someone be able to explain to me what is going on? Is this an ignorable error? If not, is there a workaround that avoids this error?
Thanks so much!
# A Reproducible Example
require(survival)
require(mice)
require(car)
# Create DF
dat <- data.frame(dv = c(1, 1, 2, 1, 0, NA, 1, 4, NA, 0, 3, 1, 3, 0, 2, 1, 4, NA, 2, 4),
catvar1 = factor(c(0, 0, 0, 0, 0, 1, 0, 0, 0, NA, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0)),
catvar2 = factor(c(1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, NA, 0)))
dat_imp <- mice(data = dat)
# Transform Outcome Var for Interval Reg
dat_imp_long <- complete(dat_imp, action = "long", include=TRUE)
# 1-4 correspond to ranges (e.g., 1 = 1 to 2 times...4 = 10 or more)
# create variables that reflect this range
dat_imp_long$dv_low <- car::recode(dat_imp_long$dv, "0 = 0; 1 = 1; 2 = 3; 3 = 6; 4 = 10")
dat_imp_long$dv_high <- car::recode(dat_imp_long$dv, "0 = 0; 1 = 2; 2 = 5; 3 = 9; 4 = 999")
dat_imp_long$dv_high[dat_imp_long$dv_high > 40] <- Inf
# Convert back to mids
dat_mids <- as.mids(dat_imp_long)
# Run Interval Reg
model1 <- with(dat_mids, survreg(Surv(dv_low, dv_high, type = "interval2") ~
catvar1 + catvar2, dist = "gaussian"))
# Warning message for both calls: In log(1 - 2 * pnorm(width/2)) : NaNs produced
# Problem does not only occur with pool, but summary
summary(model1)
summary(pool(model1))
# Run Equivalent Model on Individual Datasets
# No errors produced
imp1 <- subset(dat_imp_long, .imp == 1)
model2 <- survreg(Surv(dv_low, dv_high, type = "interval2") ~
catvar1 + catvar2, dist = "gaussian", data = imp1)
summary(model2)
imp2 <- subset(dat_imp_long, .imp == 2)
model3 <- survreg(Surv(dv_low, dv_high, type = "interval2") ~
catvar1 + catvar2, dist = "gaussian", data = imp2)
summary(model3)
# Equivalent Analysis on Stacked Dataset
# No error
model <- with(dat_imp_long, survreg(Surv(dv_low, dv_high, type = "interval2") ~
catvar1 + catvar2, dist = "gaussian"))
summary(model)

Perform nonlinear regression with nlsLM within a function in r

I want to add a modification factor to an existing equation to fit data. The original equation is defined through a function because the variable N_l is a vector of numbers and the function is selecting the largest outcome of the equation by going through all possible values in the vector N_l. The original function is defined as:
library(utils)
R <- function(x){
N_b <- x[1]
N_l <- x[2]
A <- x[3]
x.sqr <- x[4]
S <- x[10]
e <- x[grepl("e_\\d",names(x))]
f <- sapply(seq(N_l),function(k) max(Multi.Presence$m[k] * ((k/N_b) +
(A * combn(e,k,sum) / x.sqr))))
c(val = max(f), pos = which.max(f))
}
DATA.GIRDER1 <- cbind(DATA.GIRDER1, vars = t(apply(DATA.GIRDER1, 1, R)))
colnames(DATA.GIRDER1)[12:13] <- c("Proposed.Girder1","Lanes")
The equation defines in the function is:
The first 5 rows of the dataframe DATA.GIRDER1 and dataframe Multi.Presence are provided:
> dput(DATA.GIRDER1[(1:5),]
structure(list(N_b = c(5, 5, 5, 5, 5), N_l = c(4, 4, 4, 4, 4),
A = c(-12, -12, -12, -12, -12), x.sqr = c(1440, 1440,
1440, 1440, 1440), e_1 = c(21.8, 21.8, 21.8, 21.8, 21.8),
e_2 = c(9.8, 9.8, 9.8, 9.8, 9.8), e_3 = c(-2.2, -2.2, -2.2,
-2.2, -2.2), e_4 = c(-14.2, -14.2, -14.2, -14.2, -14.2),
e_5 = c(0, 0, 0, 0, 0), S = c(12, 12, 12, 12, 12),
R = c(0.59189685884369, 0.583646426252063,
0.556293941275237, 0.576160481501275, 0.597435112708129)),
row.names = c(NA, 5L), class = "data.frame")
> dput(Multi.Presence)
structure(list(N_l = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), m = c(1.2,
1, 0.85, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65)), row.names = c(NA,
-10L), class = "data.frame")
The theoretical data to fit the equation to is CSi.Girder1. At the moment, the way the function is set up, it calculates the maximum R for each row of dataframe DATA.GIRDER1.
I want to add a regression term based on variable S in dataframe DATA.GIRDER1to the second part of the equation to find parameters a and b to best fit the data in CSi.Girder1. The desired output would implement the equation below:
To use nlsLM I need to define a function for the equation such as:
library(minpack.lm)
Prposed.Girder1 <- function(N_b, N_l,A,x.sqr,e_1,e_2,e_3,e_4,e_5,S,a,b) {
R <- function(x){
N_b <- x[1]
N_l <- x[2]
A <- x[3]
x.sqr <- x[4]
e <- x[grepl("e_\\d",names(x))]
f <- sapply(seq(N_l),function(k) max(Multi.Presence$m[k] * ((k/N_b) +
(A * combn(e,k,sum) / x.sqr) * (b*S^a))))
c(val = max(f), pos = which.max(f))
}
DATA.GIRDER1 <- cbind(DATA.GIRDER1, vars = t(apply(DATA.GIRDER1, 1, R)))
colnames(DATA.GIRDER1)[12:13] <- c("Proposed.Girder1","Lanes")
return(R)
}
Girder1_nlsLM <- nlsLM(R ~ Prposed.Girder1(N_b, N_l,A,x.sqr,e_1,e_2,e_3,e_4,e_5,S,a,b),
data = DATA.GIRDER1,
start = c(a = 0.01, b = 0.01))
summary(Girder1_nlsLM)
But this is not successful and I get the error:
Error in model.frame.default(formula = ~R + N_b + N_l + A + x.sqr + e_1 + :
object is not a matrix
How can I add this modification factor in terms of variable S to solve for the parameters a and b.

R optim() constraint optimization does not find the first best

my problem is summarized in finding a vector X with the best solution to the problem:
L is the profits,
R is the restrictions,
P is a constraint parameters matrix,
max SUM_i (x_i * l_i)
or max(t(L)%*%X)
restriction
SUM_i(x_i*p_ij)<=r_j
or P%*%X <= R.
I find a solution for X, but not the best, which would be
fb = c(.217,0,0,23,2865,0,13,427).
How do I find the best solution?
code:
X<-matrix(rep(1,6),6,1)
P<-matrix(c(
1, 1, 1, 2, 0, 0,
0, 1, 1, 2, 1, 1,
99.4, 37.75, 19.75, 54.40, 74.75, 53,
2.400, 1.540, 0, 0, 0, 0,
2.400, 1.960, 0, 0, 0, 0,
1.800, 3.300, 5.330, 0, 0, 0,
0, 0, 2.070, 0, 8.700, 0,
0, 0, .436, 0, 19.100, 12.363,
0, 3.000, .364, 0, 9.100, 26.737 ),
9,6,1)
L <- matrix(c(83.4, 72.35, 27.3, 72.05, 217.25, 455), 6,1)
R <- matrix(c(60,60,2000,351,448,479,338,424,359),9,1)
farm<- function(par, P,R, L){
trues<- P%*%par<=R
if (min(trues)==1 && min(par)>=0) {
return(-t(L)%*%par)
}
else{
return(0)
}
}
mtds = c("Nelder-Mead", "BFGS", "CG", "L-BFGS-B", "SANN","Brent")
out <- optim(par = X, # initial guess
fn = farm,
P = P,
R = R ,
L = L,
method = mtds[5])
# my result
t(L)%*%out$par
#A matrix: 1 × 1 of type dbl
#7419.596
# the first best
fb<- matrix(c(.217,0,0,23.2865,0,13.427),6,1)
t(L)%*%fb
#A matrix: 1 × 1 of type dbl
#7805.175
I think you can try fmincon from package pracma
library(pracma)
objfun <- function(x) -t(L)%*%x
res <- fmincon(x0 = X,fn = objfun,A = P,b = R,lb = rep(0,length(X)))
and you will see that
> res$par
[1] 4.201711e-16 -1.239088e-15 1.863081e-17 2.310286e+01
[5] 5.566620e-01 1.323762e+01
> -res$value
[,1]
[1,] 7808.615
That looks very much like a model that could be solved by a linear programme.
library("Rglpk")
Rglpk_solve_LP(obj = L,
mat = P,
dir = rep("<=", 9),
rhs = R,
max = TRUE)

Calling ROI "LP "and "QP" functions

I am trying to reproduce some of the examples given by the ROI creators.
For example in http://statmath.wu.ac.at/courses/optimization/Presentations/ROI-2011.pdf (slides 15-17) there is the example:
library("ROI")
#ROI: R Optimization Infrastructure
#Installed solver plugins: cplex, lpsolve, glpk, quadprog, symphony, nlminb.
#Default solver: glpk.
(constr1 <- L_constraint(c(1, 2), "<", 4))
#An object containing 1 linear constraints.
(constr2 <- L_constraint(matrix(c(1:4), ncol = 2), c("<", "<"), c(4, 5)))
#An object containing 2 linear constraints.
rbind(constr1, constr2)
#An object containing 3 linear constraints.
(constr3 <- Q_constraint(matrix(rep(2, 4), ncol = 2), c(1, 2), "<", 5))
#An object containing 1 constraints.
#Some constraints are of type quadratic.
foo <- function(x) {sum(x^3) - seq_along(x) %*% x}
F_constraint(foo, "<", 5)
lp <- LP(objective = c(2, 4, 3), L_constraint(L = matrix(c(3, 2, 1, 4, 1, 3, 2, 2, 2), nrow = 3), dir = c("<=", "<=", "<="), rhs = c(60, 40, 80)), maximum = TRUE)
qp <- QP(Q_objective(Q = diag(1, 3), L = c(0, -5, 0)), L_constraint(L = matrix(c(-4, -3, 0, 2, 1, 0, 0, -2, 1), ncol = 3, byrow = TRUE), dir = rep(">=", 3), rhs = c(-8, 2, 0)))
When I run it I get the errors
Error in LP(objective = c(2, 4, 3), L_constraint(L = matrix(c(3, 2, 1, :
could not find function "LP"
and
Error in QP(Q_objective(Q = diag(1, 3), L = c(0, -5, 0)), L_constraint(L = matrix(c(-4, :
could not find function "QP"
In fact the functions are not in ROI's namespace. e.g.
ROI::LP
Error: 'LP' is not an exported object from 'namespace:ROI'
The same syntax appears in other examples I found on the web but the functions LP and QP are never defined.
I am using ROI 0.3.0
Can someone tell me what is going wrong?
The commands LP and QP were both changed to OP.
library("ROI")
## ROI: R Optimization Infrastructure
## Registered solver plugins: nlminb, alabama, cbc, cccp, clp, deoptim, ecos, glpk, ipop, lpsolve, msbinlp, neos, nloptr, ucminf, spg, cgm, vmm, bobyqa, newuoa, uobyqa, hjk, nmk, lbfgs, optimx, qpoases, quadprog, scs, symphony.
## Default solver: auto.
(constr1 <- L_constraint(c(1, 2), "<", 4))
## An object containing 1 linear constraint.
(constr2 <- L_constraint(matrix(c(1:4), ncol = 2), c("<", "<"), c(4, 5)))
## An object containing 2 linear constraints.
rbind(constr1, constr2)
## An object containing 3 linear constraints.
(constr3 <- Q_constraint(matrix(rep(2, 4), ncol = 2), c(1, 2), "<", 5))
## An object containing 0 linear constraints
## 1 quadratic constraint.
foo <- function(x) {sum(x^3) - seq_along(x) %*% x}
F_constraint(foo, "<", 5)
## An object containing 1 nonlinear constraint.
lp <- OP(objective = c(2, 4, 3),
L_constraint(L = matrix(c(3, 2, 1, 4, 1, 3, 2, 2, 2), nrow = 3),
dir = c("<=", "<=", "<="),
rhs = c(60, 40, 80)), maximum = TRUE)
qp <- OP(Q_objective(Q = diag(1, 3), L = c(0, -5, 0)),
L_constraint(L = matrix(c(-4, -3, 0, 2, 1, 0, 0, -2, 1), ncol = 3, byrow = TRUE),
dir = rep(">=", 3), rhs = c(-8, 2, 0)))
The slides you refer to are outdated. The new documentation is on http://roi.r-forge.r-project.org !

Resources