How to pass loop results as initial parameters for an nlsLM model in R?

I am running a nonlinear regression model that needs initial values to start, but the number of variables I want to include may be too large to type all the starting values by hand, so I am looking for an alternative.
set.seed(12345)
y = rnorm(100, 1000,150)
x1 = rnorm(100,10000,251)
x2 = rnorm(100, 3000,654)
x3 = rnorm(100, 25000,100)
x4 = rnorm(100, 200000,589)
x5 = rnorm(100, 31657,296)
adstock <- function(x,rate=0){
return(as.numeric(stats::filter(x=log(x+1),filter=rate,method="recursive")))
}
library(minpack.lm)
nlsLM(y~b0
+ b1 * adstock(x1, r1)
+ b2 * adstock(x2, r2)
+ b3 * adstock(x3, r3)
+ b4 * adstock(x4, r4)
+ b5 * adstock(x5, r5)
, algorithm = "LM"
# this is where I need to paste the results from the loop
, start = c(b0=1,b1=1,b2=1,b3=1,b4=1,b5=1
,r1=0.1,r2=0.1,r3=0.1,r4=0.1,r5=0.1
)
# end
, control = list(maxiter = 200)
)
My idea was to use a loop to build the start values and pass them to the model, but I can't make it work (the following code is for the b_i coefficients):
test_start <- NULL
for (i in 1:(5+1)) {
  test_start[i] <- paste0("b", i - 1, "=", 1)
}
cat(test_start)
This is the result, which is not exactly what the model expects:
b0=1 b1=1 b2=1 b3=1 b4=1 b5=1
How can I pass the results of the loop to the model?
Also, how can I add r_i start coefficients to b_i start coefficients in the loop?
Any help would be greatly appreciated.
PS: at the moment I want to assign each of b0,b1,...,b5 the same value (in this case, 1) and each of r1,r2,...,r5 the same value (in this case, 0.1).

Define the data as DF and build the formula fo programmatically, then grep the b and r parameters out of it. The line defining v creates a vector of their names, and the line defining st creates a named vector with value 1 for the b's and 0.1 for the r's.
DF <- data.frame(y, x1, x2, x3, x4, x5)
n <- ncol(DF) - 1
# right-hand side: b0 plus one b_i * adstock(x_i, r_i) term per predictor
rhs <- c("b0", sprintf("b%d * adstock(x%d, r%d)", 1:n, 1:n, 1:n))
fo <- reformulate(rhs, "y")
# keep only the parameter names, i.e. the b's and r's
v <- grep("[br]", all.vars(fo), value = TRUE)
# 1 for each b (grepl gives 1/0) plus 0.1 for each r
st <- setNames(grepl("b", v) + 0.1 * grepl("r", v), v)
st
nlsLM(fo, DF, start = st, algorithm = "LM", control = list(maxiter = 200))
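As a side note, if the parameter names are fixed in advance, a named start vector can also be built directly, without going through the formula. A minimal sketch using the values from the question (1 for each b, 0.1 for each r):
st2 <- setNames(c(rep(1, 6), rep(0.1, 5)),
                c(paste0("b", 0:5), paste0("r", 1:5)))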
Regarding the comment, try defining rhs as follows. In the first line take whatever subset of labs you want, e.g. labs <- labels(...)[1:9], or change the formula in the first line, e.g. labs <- labels(terms(y ~ .*(1 + x1), data = DF)).
labs <- labels(terms(y ~ .^2, data = DF))
labs <- sub(":", "*", labs)
n <- length(labs)
rhs <- c("b0", sprintf("b%d * adstock(%s, r%d)", 1:n, labs, 1:n))

Related

How do I add a conditional column based on an inequality with matrix multiplication (in R)?

I have a data frame of values and want to add a column based on an inequality condition that involves matrix multiplication.
The data frame looks like this
# Set possible values for variables
b0 <- b1 <- b2 <- seq(0, 2, by=0.1)
# Create data frame of all the different combos of these variables
df <- setNames(expand.grid(b0, b1, b2), c("b0", "b1", "b2"))
There are a lot of precursor objects I have to define before adding this column:
##### Set n
n = 100
#### Generate (x1i, x2i)
# Install and load the 'MASS' package
#install.packages("MASS")
library("MASS")
# Input univariate parameters
rho <- 0.5
mu1 <- 0; s1 <- 1
mu2 <- 0; s2 <- 1
# Generate parameters for bivariate normal distribution
mu <- c(mu1, mu2)
sigma <- matrix(c(s1^2, s1*s2*rho, s1*s2*rho, s2^2), nrow=2, ncol=2)
# Generate draws from bivariate normal distribution
bvn <- mvrnorm(n, mu=mu, Sigma=sigma ) # from MASS package
x1 <- bvn[, 1]
x2 <- bvn[, 2]
##### Generate error
error <- rnorm(n)
##### Generate dependent variable
y <- 0.5 + x1 + x2 + error
##### Create the model
lm <- lm(y ~ x1 + x2)
# Setup parameters
n <- 100
K <- 3
c <- qf(.95, K, n - K)
# Define necessary objects
sigma_hat_sq <- 1
b0_hat <- summary(lm)$coefficients[1, 1]
b1_hat <- summary(lm)$coefficients[2, 1]
b2_hat <- summary(lm)$coefficients[3, 1]
x <- cbind(1, x1, x2)
I am trying to add this conditional column like this:
# Add a column to the data frame that says whether the condition holds
df <- transform(df, ueq = (
  (1/(K*sigma_hat_sq)) *
    t(matrix(c(b0_hat-b0, b1_hat-b1, b2_hat-b2))) %*%
    t(x) %*% x %*%
    matrix(c(b0_hat-b0, b1_hat-b1, b2_hat-b2))
  <= c
))
...but doing so generates the error message
Error in t(matrix(c(b0_hat - b0, b1_hat - b1, b2_hat - b2))) %*% t(x) :
non-conformable arguments
Mathematically, the condition is (1/(K*sigma_hat^2)) * (Bhat - B)' X'X (Bhat - B) <= c where, for each triple (b0, b1, b2), (Bhat - B) is the 3x1 vector with elements (b0_hat - b0, b1_hat - b1, b2_hat - b2). I'm just not sure how to write this condition in R.
Any help would be greatly appreciated!
In order to work with only one row of df at a time (and get a separate answer for each 1 x 3 vector), you need a loop.
A simple way to do this in R is mapply.
df <- transform(df, ueq = mapply(
  \(b0, b1, b2)
    (1/(K*sigma_hat_sq)) *
      t(c(b0_hat-b0, b1_hat-b1, b2_hat-b2)) %*%
      t(x) %*% x %*%
      c(b0_hat-b0, b1_hat-b1, b2_hat-b2) <= c,
  b0 = b0, b1 = b1, b2 = b2
))
This leads to 91 TRUE rows.
sum(df$ueq)
[1] 91
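As a follow-up, the same condition can also be computed without mapply: the quadratic form d' X'X d can be evaluated row-wise with matrix operations. A sketch, assuming df, x, K, sigma_hat_sq, c and the *_hat objects defined above:
D   <- cbind(b0_hat - df$b0, b1_hat - df$b1, b2_hat - df$b2)  # one row per combo
XtX <- t(x) %*% x
q   <- rowSums((D %*% XtX) * D)  # row-wise quadratic form d' X'X d
df$ueq2 <- q / (K * sigma_hat_sq) <= c
This should agree with the mapply column; identical(df$ueq, df$ueq2) can be used as a check.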

R telling me vectors are not vectors when I map function that includes grf::causal_forest()

I want to pass a vector of column names to purrr::map() and iteratively pass them to the grf::causal_forest() function. In attempting this, I get an error that the values I'm passing to causal_forest() are not vectors (which is required), even though they definitely are.
For example, say I have this df
n <- 500
p <- 5
X <- matrix(rnorm(n * p), n, p)
W <- rbinom(n, 1, 0.5)
Y1 <- pmax(X[, 1], 0) * W + X[, 2] + pmin(X[, 3], 0) + rnorm(n)
Y2 <- pmax(X[, 1], 0) * W + X[, 2] + pmin(X[, 3], 0) + rnorm(n)
Y3 <- pmax(X[, 1], 0) * W + X[, 2] + pmin(X[, 3], 0) + rnorm(n)
df <- data.frame(Y1, Y2, Y3, W, X)
head(df)
Y1 Y2 Y3 W X1 X2 X3 X4 X5
1 0.5457143 1.933581483 2.38474639 1 -0.788463384 0.9146194 0.73684926 -0.51268651 -0.53317046
2 0.9640213 -1.098133573 1.15639726 1 0.008873619 1.1513535 -1.09108874 0.10308198 1.46560149
3 0.8839862 0.005357524 1.26430215 1 1.588380125 -0.9261196 0.35219255 0.81017210 -1.86847771
4 0.1424579 -0.783984941 -0.01038922 0 2.391068797 0.3080699 -0.94651780 1.92707015 0.42646239
5 0.1771250 0.484711614 -1.95481918 1 0.058835623 0.2541232 -0.05696465 0.01781394 -0.07254417
6 -1.8144585 -1.972902090 -1.47101855 1 -0.518724916 -1.1474859 0.94850272 0.80635703 0.72156403
Where Y* are the dependent variables, X* is the covariate matrix, and W is a binary treatment indicator. I can estimate the model with just a single value of Y* like so
library(grf)
c_forest <- causal_forest(
X = X,
Y = df$Y1,
W = df$W)
ate_c_forest <- average_treatment_effect(
c_forest,
target.sample = "overlap")
ate_c_forest
estimate std.err
0.12262543 0.09578717
But I want to iterate over each of Y1, Y2, and Y3 using map(), then extract the estimate and std.err from the output of each call to average_treatment_effect(), and put these inside a tibble. So I wrote this small function:
Y_n <- c("Y1", "Y2", "Y3")
names(Y_n) <- Y_n
grf_fcn <- function(.x){
  Y <- df$.x
  W <- df$W
  c_forest <- causal_forest(
    X = X,
    W = W,
    Y = Y)
  ate_c_forest <- average_treatment_effect(
    c_forest,
    target.sample = "overlap")
}
## call function
library(purrr)
grf_results <- purrr::map(
.x = tidyselect::all_of(Y_n),
.f = grf_fcn)
However, when I attempt to call the function it returns the error "Error in validate_observations(Y, X) : Observations (W, Y, Z or D) must be vectors." I find this curious as Y* and W are vectors. E.g.
> is.vector(df$Y1)
[1] TRUE
> is.vector(df$W)
[1] TRUE
Can anyone see where I'm going wrong here? Or is this a bug of some kind?
To better understand where the problem is in your function, compare the output of the following two calls to map.
This one matches what you are doing; it returns NULL for every element:
purrr::map(tidyselect::all_of(Y_n), function(x) { df$x })
This one uses bracket notation, it will return the expected values:
purrr::map(tidyselect::all_of(Y_n), function(x) { df[[x]] })
This is not really a quirk of map: $ does not evaluate its argument, so df$x looks for a column literally named "x" (and returns NULL), whereas df[[x]] evaluates x and uses its value as the column name. Knowing that, we can modify your function to get your desired results:
grf_fcn <- function(x){
  Y <- df[[x]]
  W <- df$W
  c_forest <- causal_forest(
    X = X,
    W = W,
    Y = Y)
  ate_c_forest <- average_treatment_effect(
    c_forest,
    target.sample = "overlap")
}
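To then collect the estimate and std.err for each outcome into a tibble, as asked, something like the following should work. This is a sketch, assuming average_treatment_effect() returns a named vector with elements "estimate" and "std.err", as in the output shown above:
library(purrr)
library(tibble)
grf_results <- purrr::map(Y_n, grf_fcn)
results_tbl <- purrr::imap_dfr(
  grf_results,
  \(res, nm) tibble(outcome  = nm,
                    estimate = res[["estimate"]],
                    std.err  = res[["std.err"]]))
results_tbl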

Minimum of a multidimensional function

I have a function R^5 -> R and I am interested in its minimum. There are plenty of optimization functions in R, such as optim, optimize, or fminbnd in the pracma package, but they accept the parameters as a single argument, and I don't understand the help page.
mindisturbed <- function(a, d1, d2, d3, p){
  sum((data^(-a) * (d1 + d2*cos(log(data)*2*pi/p) + d3 *
    sin(log(data)*2*pi/p)) - log(j))^2)
}
The "data" and the "j" variable are in my global settings. These are vectors with length k. The arguments of the function are all numeric numbers with length 1. The function is an residual square sum.
So do anyone know how to minimize this function in depend of all its arguments?
Assuming data and j are vectors of the same length, try the following. You may or may not need better starting values.
1) Use optim like this
st <- c(a = 1, d1 = 1, d2 = 1, d3 = 1, p = 1)
f <- function(x) mindisturbed(x[1], x[2], x[3], x[4], x[5])
optim(st, f)
2) or nls with default algorithm where st is from (1)
fo <- log(j) ~ data^(- a) * (d1 + d2*cos(log(data)*2*pi/p) + d3 *
sin(log(data)*2*pi/p))
nls(fo, start = st)
3) or nls with the plinear algorithm. In that case the RHS of the formula is a matrix with column names d1, d2 and d3, such that the first column multiplies d1, the second d2 and the third d3. Only the nonlinear parameters, i.e. a and p, are specified in start.
fo2 <- log(j) ~ data^(-a) * cbind(d1 = 1,
d2 = cos(log(data)*2*pi/p),
d3 = sin(log(data)*2*pi/p))
nls(fo2, start = c(a = 0.1, p = 0.1), algorithm = "plinear")
Note
The question did not include data and j but we can use these to try it out.
set.seed(123)
n <- 100
data <- runif(n, 1, 2)
j <- 1:n
o <- order(data)
j <- j[o]
data <- data[o]
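With this data in place, approach (1) can be run as shown above. As a side note (a sketch, not part of the answer), the wrapper can also be written generically with do.call, since optim retains the names of st when calling the objective, so it adapts if the number of parameters changes:
f2 <- function(x) do.call(mindisturbed, as.list(x))  # match arguments by name
opt <- optim(st, f2)
opt$par    # estimated a, d1, d2, d3, p
opt$value  # residual sum of squares at the optimum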

Replace only certain percentage of value in condition

I would like to change 5% of the good data into outliers. So in this case n=40, which means I need to change 2 y-values to become outliers. But I don't know how to add one more condition to let R know that I only want to change 2 y-values, instead of all y-values where x1 is less than or equal to 2. Following are the R codes that I tried:
set.seed(1001)
x1 <- runif(40,1,10)
y1 <- 5 + x1
x <- 2
for (i in 1:length(x1)){
  if (x1[i] <= x){
    y1[i] <- rnorm(1, 20, 1)
  }
}
You probably want the outliers to be randomly selected.
set.seed(1001)
x1 <- runif(40,1,10)
y1 <- 5 + x1
change_idx <- sample(1:40, 2, replace = FALSE)
y1[change_idx] <- rnorm(2, 20, 1)
The assignment to y1[change_idx] uses vectorized indexing, so there is no need for a loop.
To make the code easier to reuse or modify, you can put the values that appear in several places into variables:
n <- 40
o <- 2
x1 <- runif(n,1,10)
y1 <- 5 + x1
change_idx <- sample(1:n, o, replace = FALSE)
y1[change_idx] <- rnorm(o, 20, 1)
You can now visualise your data and mark the outliers:
group <- rep("normal", n)
group[change_idx] <- "outlier"
ggplot(data.frame(x1, y1, group)) + geom_point(aes(x1, y1, col = group))
Depending on what you want to achieve, I recommend adding some noise to the y1 variable, e.g. y1 <- 5 + x1 + rnorm(n, 0, 1), so that the outliers stand out against otherwise noisy data.
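As a further sketch (pct and o are illustrative names, not from the question), the outlier count itself can be derived from a percentage instead of being hardcoded, which keeps the "5% of the data" rule intact if n changes:
n <- 40
pct <- 0.05                   # fraction of points to turn into outliers
o <- max(1, round(pct * n))   # 5% of 40 -> 2
x1 <- runif(n, 1, 10)
y1 <- 5 + x1 + rnorm(n, 0, 1) # base data with noise, as suggested above
change_idx <- sample(n, o)
y1[change_idx] <- rnorm(o, 20, 1)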

Writing my own MLE command in R is causing issues

I am just getting into writing MLE commands in R that work and look like native R functions. In this attempt I am trying to do a simple MLE with
y=b0 + x*b1 + u
and
u~N(0,sd=s0 + z*s1)
However, even such a simple command I am having difficulty coding. I have written a similar command in Stata in a handful of lines
Here is the code I have written so far in R.
normalreg <- function (beta, sigma=NULL, data, beta0=NULL, sigma0=NULL,
                       con1 = T, con2 = T) {
  # If a formula for sigma is not specified,
  # assume it is the same as the formula for the beta.
  if (is.null(sigma)) sigma <- beta
  # Grab the call expression
  mf <- match.call(expand.dots = FALSE)
  # Find the position of each argument
  m <- match(c("beta", "sigma", "data", "subset", "weights", "na.action",
               "offset"), names(mf), 0L)
  # Adjust names of mf
  mf <- mf[c(1L, m)]
  # Since I have two formulas I will call them both formula
  names(mf)[2:3] <- "formula"
  # Drop unused levels
  mf$drop.unused.levels <- TRUE
  # Divide mf into data1 and data2
  data1 <- data2 <- mf
  data1 <- mf[-3]
  data2 <- mf[-2]
  # Name the first elements model.frame
  data1[[1L]] <- data2[[1L]] <- as.name("model.frame")
  data1 <- as.matrix(eval(data1, parent.frame()))
  data2 <- as.matrix(eval(data2, parent.frame()))
  y <- data1[, 1]
  data1 <- data1[, -1]
  if (con1) data1 <- cbind(data1, 1)
  data2 <- unlist(data2[, -1])
  if (con2) data2 <- cbind(data2, 1)
  data1 <- as.matrix(data1)  # Ensure our data is read as a matrix
  data2 <- as.matrix(data2)  # Ensure our data is read as a matrix
  if (!is.null(beta0)) if (length(beta0) != ncol(data1))
    stop("Length of beta0 must equal the number of ind. variables in the first equation")
  if (!is.null(sigma0)) if (length(sigma0) != ncol(data2))
    stop("Length of sigma0 must equal the number of ind. variables in the second equation")
  # Set initial parameter estimates
  if (is.null(beta0)) beta0 <- rep(1, ncol(data1))
  if (is.null(sigma0)) sigma0 <- rep(1, ncol(data2))
  # Define the maximization function
  normMLE <- function(est=c(beta0,sigma0), data1=data1, data2=data2, y=y) {
    data1est <- as.matrix(est[1:ncol(data1)], nrow=ncol(data1))
    data2est <- as.matrix(est[(ncol(data1)+1):(ncol(data1)+ncol(data2))],
                          nrow=ncol(data1))
    ps <- pnorm(y - data1 %*% data1est,
                sd = data2 %*% data2est)
    # Compute a vector of log likelihoods based on coefficient estimates
    llk <- log(ps)
    -sum(llk)
  }
  results <- optim(c(beta0, sigma0), normMLE, hessian=T,
                   data1=data1, data2=data2, y=y)
  results
}
x <-rnorm(10000)
z<-x^2
y <-x*2 + rnorm(10000, sd=2+z*2) + 10
normalreg(y~x, y~z)
At this point the biggest issue is finding an optimization routine that does not fail when some of the likelihood values return NA because the standard deviation goes negative. Any suggestions? Sorry for the huge amount of code.
Francis
I include a check to see if any of the standard deviations are less than or equal to 0 and return a log-likelihood of log(0) = -Inf in that case. Seems to work for me. You can figure out the details of wrapping it into your function.
#y=b0 + x*b1 + u
#u~N(0,sd=s0 + z*s1)
ll <- function(par, x, z, y){
  b0 <- par[1]
  b1 <- par[2]
  s0 <- par[3]
  s1 <- par[4]
  sds <- s0 + z*s1
  # Guard: if any sd is non-positive, return log(0) = -Inf,
  # the worst possible log-likelihood, so optim moves away.
  if (any(sds <= 0)) {
    return(log(0))
  }
  preds <- b0 + x*b1
  sum(dnorm(y, preds, sds, log = TRUE))
}
n <- 100
b0 <- 10
b1 <- 2
s0 <- 2
s1 <- 2
x <- rnorm(n)
z <- x^2
y <- b0 + b1*x + rnorm(n, sd = s0 + s1*z)
optim(c(1,1,1,1), ll, x=x, z=z,y=y, control = list(fnscale = -1))
With that said, it probably wouldn't be a bad idea to parameterize the standard deviation in such a way that it is impossible for it to go negative...
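For example, modelling the standard deviation on the log scale guarantees positivity. A minimal sketch; note this is a different model (sd = exp(s0 + z*s1) rather than s0 + z*s1), so the s estimates are not comparable to the ones above:
ll_log <- function(par, x, z, y){
  b0 <- par[1]; b1 <- par[2]
  s0 <- par[3]; s1 <- par[4]
  sds <- exp(s0 + z*s1)   # always positive, no guard needed
  preds <- b0 + x*b1
  sum(dnorm(y, preds, sds, log = TRUE))
}
optim(c(1, 1, 0, 0), ll_log, x = x, z = z, y = y, control = list(fnscale = -1))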
