Restrictions in an optimization using R

Continuing the question asked here, I'd like to add a restriction to the optimization:
a <- c(52.67, 46.80, 41.74, 40.45)
b <- c(1.726219351, 1.842421805, 1.790801758, 1.449997494)
rsq <- function(c) {
  x <- log(a)
  y <- log((c * (a/b)) - 1)
  summary(lm(y ~ x))$r.squared
}
optimise(rsq, maximum = TRUE, interval=c(1, 1000))
The interval for the optimization is 1 to 1000; however, I'd like the interval to start where
(c*(a/b)) - 1 > 0
to avoid problems with log.

Just return -Inf if the log argument would be non-positive. Then it is unnecessary to manipulate the domain.
rsq <- function(c) {
  x <- log(a)
  tmp <- (c * (a/b)) - 1
  if (any(tmp <= 0)) -Inf else summary(lm(log(tmp) ~ x))$r.squared
}
optimise(rsq, maximum = TRUE, interval=c(1, 1000))
giving:
$maximum
[1] 1.082353
$objective
[1] 0.8093781
Update: Fixed.
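As a quick sanity check (an addition, using the a, b and rsq defined above), we can confirm that the log argument stays positive over the whole data at the returned maximum:
c_opt <- optimise(rsq, maximum = TRUE, interval = c(1, 1000))$maximum
all(c_opt * (a/b) - 1 > 0) # TRUE: log() gets a positive argument at the optimum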

Related

How to predict and extract R Squared with .lm.fit?

As the title suggests, I have seen users mention that .lm.fit() has a speed advantage over a regular lm(), but when I look deeper at the help for .lm.fit(), it is meant to be a bare fitter function: it returns a list instead of a model object. That makes me wonder: is it still possible to extract components like R squared and adjusted R squared, and lastly to do a predict() from it?
Below are sample data and timed runs:
library(tictoc)  # provides tic()/toc(); not attached in the original post

test_dat <- data.frame(y = rnorm(780, 20, 10))
for (b in 1:300) {
  name_var <- paste0("x", b)
  test_dat[[name_var]] <- rnorm(780, 0.01 * b, 5)
}

tic()
obj_lm <- lm(y ~ ., data = test_dat)
print(class(obj_lm))
print(summary(obj_lm)$r.squared)
print(summary(obj_lm)$adj.r.squared)
predict(obj_lm)
toc() # approximately 0.4 seconds

tic()
datm <- as.matrix(test_dat)
obj_lm_fit <- .lm.fit(cbind(1, datm[, -1]), datm[, 1])
print(class(obj_lm_fit))
toc() # approximately 0.2 seconds
The functions predict and resid are generic and, since .lm.fit returns an object of class "list", all you have to do is write methods implementing the definitions of what you want. Below are methods to compute fitted values, residuals and R^2.
set.seed(2023) # make the results reproducible
test_dat <- data.frame(y = rnorm(780, 20, 10))
for (b in 1:300) {
  name_var <- paste0("x", b)
  test_dat[[name_var]] <- rnorm(780, 0.01 * b, 5)
}
obj_lm <- lm(y ~ ., data = test_dat)
datm <- as.matrix(test_dat)
obj_lm_fit <- .lm.fit(cbind(1, datm[, -1]), datm[, 1])
#------------------------------------------------------------------------
# the methods for objects of class "list"
#
fitted.list <- function(object, X) {
  X %*% coef(object)
}
resid.list <- residuals.list <- function(object, X, y) {
  y_fitted <- fitted(object, X)
  y - y_fitted
}
rsquared <- function(x, ...) UseMethod("rsquared")
rsquared.default <- function(x, ...) {
  summary(x)$r.squared
}
rsquared.list <- function(object, X, y) {
  e <- resid.list(object, X, y)
  1 - sum(e^2) / sum((y - mean(y))^2)
}
rsquared(obj_lm_fit, cbind(1,datm[,-1]), datm[,1])
#> [1] 0.3948863
rsquared(obj_lm)
#> [1] 0.3948863
Created on 2023-01-03 with reprex v2.0.2
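The question also asks about predict(); a minimal predict method in the same spirit could look like the sketch below, assuming new data are passed as a model matrix that already contains the intercept column:
predict.list <- function(object, X, ...) {
  # coef() extracts the "coefficients" member of the .lm.fit output
  drop(X %*% coef(object))
}
head(predict(obj_lm_fit, cbind(1, datm[, -1])))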
Edit 1
Added a method to also calculate the adjusted R^2:
adj_rsquared_list <- function(object, X, y) {
  r2 <- rsquared.list(object, X, y)
  k <- ncol(X) - 1
  n <- nrow(X)
  rate_of_error <- (1 - r2) * (n - 1) / (n - k - 1)
  adj_r2 <- 1 - rate_of_error
  return(adj_r2)
}
adj_rsquared_list(obj_lm_fit, cbind(1,datm[,-1]), datm[,1])
#> [1] 0.01590061
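For comparison, lm's own adjusted R^2 gives the same value (as the Edit 2 output below also confirms):
summary(obj_lm)$adj.r.squared
#> [1] 0.01590061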
Edit 2
After the edit by Jovan, I have changed fitted.list above to use coef(), a function that extracts the first argument's list member "coefficients", if it exists, and rewrote the default and list methods of rsquared to accept an adj argument. The code to compute the adjusted R^2 is a copy and paste of Jovan's code.
rsquared <- function(x, ...) UseMethod("rsquared")
rsquared.default <- function(x, adj = FALSE, ...) {
  if (adj) {
    summary(x)$adj.r.squared
  } else summary(x)$r.squared
}
rsquared.list <- function(object, X, y, adj = FALSE) {
  e <- resid.list(object, X, y)
  r2 <- 1 - sum(e^2) / sum((y - mean(y))^2)
  if (adj) {
    k <- ncol(X) - 1
    n <- nrow(X)
    rate_of_error <- (1 - r2) * (n - 1) / (n - k - 1)
    adj_r2 <- 1 - rate_of_error
    adj_r2
  } else r2
}
# same as above
rsquared(obj_lm_fit, cbind(1,datm[,-1]), datm[,1])
#> [1] 0.3948863
rsquared(obj_lm)
#> [1] 0.3948863
# new, `adj = TRUE`
rsquared(obj_lm_fit, cbind(1,datm[,-1]), datm[,1], adj = TRUE)
#> [1] 0.01590061
rsquared(obj_lm, adj = TRUE)
#> [1] 0.01590061
Created on 2023-01-03 with reprex v2.0.2

Call nls from within a function in R, passing a user-specified function with any number of arguments

I have a function that uses stats::nls() internally to get parameter estimates for a non-linear model. However, the number of parameters I need the function to estimate is variable and determined by the user. How can I do this? This function is going into an R package, so it is important that it is as flexible as possible.
For example, here is some dummy data and two possible functions that users might use.
## dummy data
set.seed(654)
df <- data.frame(d = runif(50))
df$y <- exp(-df$d/.1)
df$x <- df$y + abs(rnorm(50, sd = .2))

## functions with different numbers of arguments
exp_fun <- function(d, r) {
  return(exp(-d/r))
}
exppow_fun <- function(d, r, a) {
  return(exp(-(d/r)^a))
}
The goal is to have a function called fit_nls(), which takes at least 3 arguments, x, d, and FUN. FUN is a function that takes d as its first argument, but can have any number of additional parameters, and outputs some transformation of d:
# run with any FUN
fit_nls(d = df$d, x = df$x, FUN = "exp_fun", ...)
fit_nls(d = df$d, x = df$x, FUN = "exppow_fun", ...)
fit_nls(d = df$d, x = df$x, FUN = function(d, a, b, c, e){...}, ...)
fit_nls(d = df$d, x = df$x, FUN = function(d){...}, ...)
I can make the function work for a fixed number of arguments:
fit_nls <- function(d, x, FUN, start){
  fit.fun <- match.fun(FUN)
  nls(x ~ fit.fun(d, r = r), start = start)
}
fit_nls(df$d, df$x, "exp_fun", start = list(r = .1))
fit_nls(df$d, df$x, function(d, r){d^r}, start = list(r = .1))
but haven't been able to figure out how to use a variable number of parameters. One thing I've tried is passing a list of arguments using do.call(), but this doesn't work:
fit_nls.multiarg <- function(d, x, FUN, start){
  fit.fun <- match.fun(FUN)
  args <- append(list(d = d), start)
  nls(x ~ do.call(fit.fun, args), start = start)
}
fit_nls.multiarg(df$d, df$x, "exp_fun", list(r = .01)) # error
which isn't really surprising, since it is equivalent to setting the values within the function:
nls(df$x ~ exp_fun(df$d, r = .1), start = list(r = .01)) # equivalent error
So, I tried passing a symbol as a stand-in, without success:
fit_nls.symbol <- function(d, x, FUN, start){
  fit.fun <- match.fun(FUN)
  nam <- names(start)
  args <- append(list(d = d), as.symbol(nam))
  nls(x ~ do.call(fit.fun, args), start = start)
}
fit_nls.symbol(df$d, df$x, "exp_fun", list(r = .01)) # error
I'm open to any kind of solution. If anyone can give me any advice or point me in the right direction, I'd greatly appreciate it.
If a character string or function name is passed, set FUN to the name of the function as a character string; otherwise (an anonymous function), assign it to fit.fun and use the string "fit.fun". Then create the formula argument as a character string, convert it to an actual R formula and run nls.
fit_nls <- function(d, x, FUN, start) {
  FUN <- if (length(match.call()[[4]]) > 1) {
    fit.fun <- match.fun(FUN)
    "fit.fun"
  } else deparse(substitute(FUN))
  fo <- as.formula(sprintf("x ~ %s(d, %s)", FUN, toString(names(start))))
  nls(fo, start = start)
}
Tests
with(df, fit_nls(d, x, "exp_fun", list(r = .01)))
## Nonlinear regression model
## model: x ~ exp_fun(d, r)
## data: parent.frame()
## r
## 0.1968
## residual sum-of-squares: 1.319
##
## Number of iterations to convergence: 11
## Achieved convergence tolerance: 6.254e-06
with(df, fit_nls(d, x, function(d, r){d^r}, start = list(r = .1)))
## Nonlinear regression model
## model: x ~ fit.fun(d, r)
## data: parent.frame()
## r
## 96.73
## residual sum-of-squares: 4.226
##
## Number of iterations to convergence: 22
## Achieved convergence tolerance: 7.429e-06
with(df, fit_nls(d, x, exp_fun, list(r = .01)))
## Nonlinear regression model
## model: x ~ exp_fun(d, r)
## data: parent.frame()
## r
## 0.1968
## residual sum-of-squares: 1.319
##
## Number of iterations to convergence: 11
## Achieved convergence tolerance: 6.254e-06
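Since the formula is built from the names of start, the same approach should extend to functions with more than one parameter, e.g. exppow_fun from the question (the starting values here are illustrative guesses; output omitted):
with(df, fit_nls(d, x, "exppow_fun", list(r = .1, a = 1)))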

Error in z[, 1] : incorrect number of dimensions when doing bootstrapping samples

I am using a ratio of means, T = mean(xbar)/mean(u_bar), to estimate the population ratio with the bootstrap method, and I'm getting this error message: Error in z[, 1] : incorrect number of dimensions.
library(readxl)  # needed for read_xlsx()

pop = read_xlsx("US_pop.xlsx")
attach(pop)
z = cbind(X, U)
T = function(z)
{
  ## ratio of means to estimate the population ratio = mean(xbar)/mean(u_bar)
  T = (mean(z[, 1]) / mean(z[, 2]))
}
T_stat = T(z)
nBoot = 2e5                 # number of bootstrap samples
Tboot = matrix(0, nBoot, 2) # est. T for each sample
set.seed(123)
for (i in 1:nBoot)
{
  Tboot[i] = T(sample(z, replace = TRUE))
}
The error in the code in the question is in
sample(z, replace = TRUE)
z is a matrix, but when sample is applied to it, z is treated as a vector and the return value no longer has class "matrix" and "array":
class(z)
#[1] "matrix" "array"
class(sample(z, replace = TRUE))
#[1] "numeric"
Therefore, subsetting z[, 1] inside the function subsets not the matrix z defined outside but the function argument of the same name, and that argument is what sample returned: a numeric vector without dimensions.
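A tiny illustration with a made-up 3 x 2 matrix:
m <- matrix(1:6, ncol = 2)
sample(m, replace = TRUE) # a plain vector of 6 elements; dim() of the result is NULL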
Here is the function and its call, corrected. Tested with the data at the end.
STAT <- function(z) mean(z[, 1])/mean(z[, 2])
T_stat <- STAT(z)

nBoot <- 2e5
Tboot <- numeric(nBoot)
set.seed(123)
for (i in 1:nBoot) {
  j <- sample(nrow(z), replace = TRUE)
  Tboot[i] <- STAT(z[j, ])
}
T_stat
#[1] 1.826662
mean(Tboot)
#[1] 1.878934
A simpler alternative is the function boot in the boot package. The statistic function is modified in order to have an index variable i as its second argument.
STAT2 <- function(z, i) mean(z[i, 1])/mean(z[i, 2])
set.seed(123)
b <- boot::boot(z, STAT2, R = nBoot)
b
#
#ORDINARY NONPARAMETRIC BOOTSTRAP
#
#
#Call:
#boot::boot(data = z, statistic = STAT2, R = nBoot)
#
#
#Bootstrap Statistics :
# original bias std. error
#t1* 1.826662 0.05249037 0.4181259
mean(b$t)
#[1] 1.879153
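If a confidence interval is also wanted, boot.ci can be applied to the same object (shown here without output):
boot::boot.ci(b, conf = 0.95, type = "perc") # percentile bootstrap CI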
Test data creation code
set.seed(2020)
X <- rexp(20, rate = 1/4)
U <- rexp(20, rate = 1/2)
z <- cbind(X, U)

Why do I get the error "argument is of length 0" for dffits?

I have a problem when I try to run the dffits() function on an object from my own logistic regression.
When I run dffits(log), I get the error message:
Error in if (model$rank == 0) { : argument is of length 0
However, when I use the built-in glm function (family = binomial), dffits(glm) works just fine.
Here is my function for the logistic regression and a short example of my problem:
mydata <- read.csv("https://stats.idre.ucla.edu/stat/data/binary.csv")
mydata$rank <- factor(mydata$rank)
mydata$admit <- factor(mydata$admit)
logRegEst <- function(x, y, threshold = 1e-10, maxIter = 100)
{
  calcPi <- function(x, beta)
  {
    beta <- as.vector(beta)
    return(exp(x %*% beta) / (1 + exp(x %*% beta)))
  }
  beta <- rep(0, ncol(x)) # initial guess for beta
  diff <- 1000
  # initial value bigger than threshold so that we can enter our while loop
  iterCount <- 0
  # counter to ensure we're not stuck in an infinite loop
  while (diff > threshold) # tests for convergence
  {
    pi <- as.vector(calcPi(x, beta))
    # calculate pi by using the current estimate of beta
    W <- diag(pi * (1 - pi)) # calculate matrix of weights W
    beta_change <- solve(t(x) %*% W %*% x) %*% t(x) %*% (y - pi)
    # calculate the change in beta
    beta <- beta + beta_change # new beta
    diff <- sum(beta_change^2)
    # calculate how much we changed beta by in this iteration
    # if this is less than threshold, we'll break the while loop
    iterCount <- iterCount + 1
    # see if we've hit the maximum number of iterations
    if (iterCount > maxIter) {
      stop("This isn't converging.")
    }
    # stop if we have hit the maximum number of iterations
  }
  df <- length(y) - ncol(x)
  # calculating the degrees of freedom by taking the length of y minus
  # the number of x columns
  vcov <- solve(t(x) %*% W %*% x)
  list(coefficients = beta, vcov = vcov, df = df)
  # returning results
}
logReg <- function(formula, data)
{
  mf <- model.frame(formula = formula, data = data)
  # model.frame() returns a data.frame with the variables needed to use the
  # formula.
  x <- model.matrix(attr(mf, "terms"), data = mf)
  # model.matrix() creates a design matrix. That means that, for example, a
  # factor variable is expanded into dummy variables of ones and zeros.
  y <- as.numeric(model.response(mf)) - 1
  # model.response() gives us the response variable.
  est <- logRegEst(x, y)
  # Now we have the starting position to apply our function from above.
  est$formula <- formula
  est$call <- match.call()
  est$data <- data
  # We add the formula and the call to the list.
  est$x <- x
  est$y <- y
  # We add x and y to the list.
  class(est) <- "logReg"
  # defining the class
  est
}
log <- logReg(admit ~ gre + gpa, data = mydata)
glm <- glm(admit ~ gre + gpa, data = mydata, family = binomial)
dffits(glm)
dffits(log)
log$data
glm$data
I don't understand why model$rank should have length 0, because when I look at log$data I see that rank is defined just as in glm$data.
I really appreciate your help!

Perform Deming regression without intercept

I would like to perform Deming regression (or any equivalent regression method with uncertainties in both the X and Y variables, such as York regression).
In my application, I have a very good scientific justification to deliberately set the intercept to zero. However, I can't find a way of setting it to zero. In the R package deming, it gives an error when I use -1 in the formula:
df <- data.frame(x = rnorm(10), y = rnorm(10), sx = runif(10), sy = runif(10))
library(deming)
deming(y ~ x - 1, df, xstd = sy, ystd = sy)
Error in lm.wfit(x, y, wt/ystd^2) : 'x' must be a matrix
In other packages (like mcr::mcreg, IsoplotR::york or MethComp::Deming), the input is two vectors x and y, so there is no way I can input a model matrix or modify the formula.
Do you have any idea on how to achieve that? Thanks.
There is a bug in the function when you remove the intercept; I'm going to report it.
It is easy to fix: you just have to change two lines in the original function.
The print does not work correctly afterwards, but it is possible to interpret the output.
deming.aux <- function (formula, data, subset, weights, na.action, cv = FALSE,
                        xstd, ystd, stdpat, conf = 0.95, jackknife = TRUE, dfbeta = FALSE,
                        x = FALSE, y = FALSE, model = TRUE)
{
  deming.fit1 <- getAnywhere(deming.fit1)[[2]][[1]]
  deming.fit2 <- getAnywhere(deming.fit2)[[2]][[1]]
  Call <- match.call()
  indx <- match(c("formula", "data", "weights", "subset", "na.action",
                  "xstd", "ystd"), names(Call), nomatch = 0)
  if (indx[1] == 0)
    stop("A formula argument is required")
  temp <- Call[c(1, indx)]
  temp[[1]] <- as.name("model.frame")
  mf <- eval(temp, parent.frame())
  Terms <- terms(mf)
  n <- nrow(mf)
  if (n < 3)
    stop("less than 3 non-missing observations in the data set")
  xstd <- model.extract(mf, "xstd")
  ystd <- model.extract(mf, "ystd")
  Y <- as.matrix(model.response(mf, type = "numeric"))
  if (is.null(Y))
    stop("a response variable is required")
  wt <- model.weights(mf)
  if (length(wt) == 0)
    wt <- rep(1, n)
  usepattern <- FALSE
  if (is.null(xstd)) {
    if (!is.null(ystd))
      stop("both of xstd and ystd must be given, or neither")
    if (missing(stdpat)) {
      if (cv)
        stdpat <- c(0, 1, 0, 1)
      else stdpat <- c(1, 0, 1, 0)
    }
    else {
      if (any(stdpat < 0) || all(stdpat[1:2] == 0) || all(stdpat[3:4] == 0))
        stop("invalid stdpat argument")
    }
    if (stdpat[2] > 0 || stdpat[4] > 0)
      usepattern <- TRUE
    else {
      xstd <- rep(stdpat[1], n)
      ystd <- rep(stdpat[3], n)
    }
  }
  else {
    if (is.null(ystd))
      stop("both of xstd and ystd must be given, or neither")
    if (!is.numeric(xstd))
      stop("xstd must be numeric")
    if (!is.numeric(ystd))
      stop("ystd must be numeric")
    if (any(xstd <= 0))
      stop("xstd must be positive")
    if (any(ystd <= 0))
      stop("ystd must be positive")
  }
  if (conf < 0 || conf >= 1)
    stop("invalid confidence level")
  if (!is.logical(dfbeta))
    stop("dfbeta must be TRUE or FALSE")
  if (dfbeta & !jackknife)
    stop("the dfbeta option only applies if jackknife=TRUE")
  X <- model.matrix(Terms, mf)
  if (ncol(X) != (1 + attr(Terms, "intercept")))
    stop("Deming regression requires a single predictor variable")
  xx <- X[, ncol(X), drop = FALSE]
  if (!usepattern)
    fit <- deming.fit1(xx, Y, wt, xstd, ystd, intercept = attr(Terms, "intercept"))
  else fit <- deming.fit2(xx, Y, wt, stdpat, intercept = attr(Terms, "intercept"))
  yhat <- fit$coefficients[1] + fit$coefficients[2] * xx
  fit$residuals <- Y - yhat
  if (x)
    fit$x <- X
  if (y)
    fit$y <- Y
  if (model)
    fit$model <- mf
  na.action <- attr(mf, "na.action")
  if (length(na.action))
    fit$na.action <- na.action
  fit$n <- length(Y)
  fit$terms <- Terms
  fit$call <- Call
  fit
}
deming.aux(y ~ x + 0, df, xstd=sy, ystd=sy)
$`coefficients`
[1] 0.000000 4.324481
$se
[1] 0.2872988 0.7163073
$sigma
[1] 2.516912
$residuals
[,1]
1 9.19499513
2 2.13037957
3 3.00064886
4 2.16751905
5 0.00168729
6 4.75834265
7 3.44108236
8 6.40028085
9 6.63531039
10 -1.48624851
$model
y x (xstd) (ystd)
1 2.1459817 -1.6300251 0.48826221 0.48826221
2 1.3163362 -0.1882407 0.46002166 0.46002166
3 1.5263967 -0.3409084 0.55771660 0.55771660
4 -0.9078000 -0.7111417 0.81145673 0.81145673
5 -1.6768719 -0.3881527 0.01563191 0.01563191
6 -0.6114545 -1.2417205 0.41675425 0.41675425
7 -0.7783790 -0.9757150 0.82498713 0.82498713
8 1.1240046 -1.2200946 0.84072712 0.84072712
9 -0.3091330 -1.6058442 0.35926078 0.35926078
10 0.7215432 0.5105333 0.23674788 0.23674788
$n
[1] 10
$terms
y ~ x + 0
...
To adapt the function, I performed these steps:
1. Load the internal functions of the package:
deming.fit1 <- getAnywhere(deming.fit1)[[2]][[1]]
deming.fit2 <- getAnywhere(deming.fit2)[[2]][[1]]
2. Locate the problem and solve it, executing the function step by step with an example; the changed lines are:
Y <- as.matrix(model.response(mf, type = "numeric"))
...
xx <- X[, ncol(X), drop = FALSE]
3. Fix other possible errors generated by the changes.
In this case, delete the class of the output to avoid an error in the print.
Bug report:
Terry Therneau (the author of deming) uploaded a new version to CRAN, with this problem solved.
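With the updated version on CRAN, the original no-intercept call should therefore work directly (untested sketch, using the df from the question; note that xstd is given the sx column here):
deming(y ~ x + 0, df, xstd = sx, ystd = sy)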
