Regression function with variable number of arguments in r - r

I have composed a function to calculate VIF for nls regression models. It looks like this:
function (a,b,c,d,e,f,g) {
VIFa <- 1/(1- (R2 <- summary(lm(a ~ b + c + d + e + f + g))$r.square))
PMa <- ifelse (sqrt(VIFa) > 2, "JE", "NI")
VIFb <- 1/(1- (R2 <- summary(lm(b ~ a + c + d + e + f + g))$r.square))
PMb <- ifelse (sqrt(VIFb) > 2, "JE", "NI")
VIFc <- 1/(1- (R2 <- summary(lm(c ~ a + b + d + e + f + g))$r.square))
PMc <- ifelse (sqrt(VIFc) > 2, "JE", "NI")
VIFd <- 1/(1- (R2 <- summary(lm(d ~ a + b + c + e + f + g))$r.square))
PMd <- ifelse (sqrt(VIFd) > 2, "JE", "NI")
VIFe <- 1/(1- (R2 <- summary(lm(e ~ a + b + c + d + f + g))$r.square))
PMe <- ifelse (sqrt(VIFe) > 2, "JE", "NI")
VIFf <- 1/(1- (R2 <- summary(lm(f ~ a + b + c + d + e + g))$r.square))
PMf <- ifelse (sqrt(VIFf) > 2, "JE", "NI")
VIFg <- 1/(1- (R2 <- summary(lm(g ~ a + b + c + d + e + f))$r.square))
PMg <- ifelse (sqrt(VIFg) > 2, "JE", "NI")
rezultat <- data.frame(c(VIFa, VIFb, VIFc, VIFd, VIFe, VIFf, VIFg),
c(PMa, PMb, PMc, PMd, PMe, PMf, PMg))
names(rezultat) <- c("VIF", "Multikolinearnost")
return(as.matrix.data.frame(rezultat))
}
Where a,b,c,d,e,f,g are variables that are used in certain model. VIFa is Variance Inflation Factor of 'a' variable, and PMa is a logical value showing whether variance inflation can cause major discrepancy in model (JE = yes) or not (NO = not).
My question is how to make this function suitable for any number of arguments?
I already tried to use lapply function, however I could not find a way to use each variable once as dependent and all others as independent (for any number of variables).

Try the following:
regapply <- function(l) {
ids <- names(l)
n <- length(l)
vifs <- numeric(n)
pms <- character(n)
for (i in seq_along(l)) {
f <- parse(text=sprintf("%s ~ %s", ids[i], paste(ids[-i], collapse=" + ")))[[1]]
vifs[i] <- 1/(1- (R2 <- summary(lm(f, data=l))$r.square))
pms[i] <- ifelse (sqrt(vifs[i]) > 2, "JE", "NI")
}
data.frame(var=ids, vif=vifs, pm=pms)
}
Call the above on a named list (e.g. a data frame), for example:
regapply(iris[-5])
## var vif pm
## 1 Sepal.Length 7.072722 JE
## 2 Sepal.Width 2.100872 NI
## 3 Petal.Length 31.261498 JE
## 4 Petal.Width 16.090175 JE
Explanation: parse(text=sprintf("%s ~ %s", ids[i], paste(ids[-i], collapse=" + ")))[[1]] creates a series of formulas. In the above example we have:
Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width
Sepal.Width ~ Sepal.Length + Petal.Length + Petal.Width
Petal.Length ~ Sepal.Length + Sepal.Width + Petal.Width
Petal.Width ~ Sepal.Length + Sepal.Width + Petal.Length

Here is another solution using an ellipsis for the varying number of arguments.
The regression models that you apply contain solely additive terms (see my comment on your question). In this case, you can loop over the variables (e.g. in a data.frame).
vif <- function(...){
dat <- data.frame(...)
n <- ncol(dat)
out <- data.frame(VIF=numeric(n), MK=numeric(n))
for(ii in 1:n){
dv <- colnames(dat)[ii]
iv <- colnames(dat)[-ii]
fml <- as.formula(paste(dv,paste(iv,collapse="+"),sep="~"))
VIF <- 1/(1- (R2 <- summary(lm(fml))$r.square))
KM <- ifelse(sqrt(VIF)>2, "JE", "NI")
out[ii,] <- c(round(VIF,5),KM)
}
return(out)
}
Works for both variables and data.frames as input.
a <- c(1,2,3,4,5)
b <- c(1,3,2,2,4)
x <- c(3,3,2,4,5)
dat <- data.frame(a,b,x)
# > vif(a,b,x)
# VIF MK
# 1 2.47059 NI
# 2 2.06471 NI
# 3 2.06471 NI
# > vif(dat)
# VIF MK
# 1 2.47059 NI
# 2 2.06471 NI
# 3 2.06471 NI
Cheers!

Related

R - How can I make this loop run faster?

The for loop below iterates over nodes in an igraph graph. There are 2048 of these, so it is very slow. I've tried to code as efficiently as possible (for example, by not growing vectors). How can I make the loop run faster?
Edit: I've also thought about writing this in C++ via Rcpp. I just don't know how I would use igraph in that case.
Edit 2: compatible_models actually depends on child_node. What I gave here is an example of what it could be for a particular value of child_node.
library(igraph)
library(Metrics)
set.seed(1234)
N <- 10000
A <- rnorm(N, 10, 2)
B <- rnorm(N, 9, 2)
C <- rnorm(N, 12, 1)
D <- rnorm(N, 7, 3)
Y <- A + B + A*B + D + A^2 + rnorm(N)
data <- data.frame(Y = Y, A = A, B = B, C = C, D = D)
partition <- sort(sample(N, 0.7*N))
data_train <- data[partition, ]
data_test <- data[-partition, ]
g <- make_empty_graph()
g <- g + vertices(1:2049)
generate_edges <- function(start_vertex, end_vertices) {
edges <- c()
for (i in 1:length(end_vertices)) {
edges <- c(edges, start_vertex, end_vertices[i])
}
return(edges)
}
outward_edges <- generate_edges(V(g)[1], V(g)[2:vcount(g)])
g <- g + edges(outward_edges, attr1 = rep(0, length(outward_edges) / 2), attr2 = rep(0, length(outward_edges) / 2))
successors <- matrix(nrow = length(g[[1, ]][[1]]), ncol = 9) # nrow = number of successors of node 1
i <- 1
for (child_node in 2:2049) {
# compatible_models <- lapply(...) # suppose this is a list of "formula" objects
# like:
compatible_models <- list(Y ~ A + B + C, Y ~ I(A^2) + B + C + D, Y ~ B + D)
compatible_models <- lapply(compatible_models, lm, data = data_train)
predictions <- sapply(compatible_models, predict, newdata = data_test)
successors[i, 1:3] <- c(edge_attr(g, name = "attr1", g[[1, V(g)[child_node], edges = TRUE]]), edge_attr(g, name = "attr2", g[[1, V(g)[child_node], edges = TRUE]]),
sum(apply(predictions, 2, rmse, actual = data_test$Y))/length(compatible_models))
i <- i + 1
}
Correct me if i am wrong but i think you could evaluate the first three lines (or any lines that build model objects, but do not evaluate anything) outside of the loop, which ~ triples the performance of the code on my machine:
successors <- matrix(nrow = length(g[[1, ]][[1]]), ncol = 9) # nrow = number of successors of node 1
i <- 1
start_time <- Sys.time()
for (child_node in 2:2049) {
# build models inside loop:
compatible_models <- list(Y ~ A + B + C, Y ~ I(A^2) + B + C + D, Y ~ B + D)
compatible_models <- lapply(compatible_models, lm, data = data_train)
predictions <- sapply(compatible_models, predict, newdata = data_test)
successors[i, 1:3] <- c(edge_attr(g, name = "attr1", g[[1, V(g)[child_node], edges = TRUE]]), edge_attr(g, name = "attr2", g[[1, V(g)[child_node], edges = TRUE]]),
sum(apply(predictions, 2, rmse, actual = data_test$Y))/length(compatible_models))
i <- i + 1
}
Sys.time()-start_time
#Time difference of 26.69914 secs
Optimized code with model creation outside of loop:
## model building:
compatible_models <- list(Y ~ A + B + C, Y ~ I(A^2) + B + C + D, Y ~ B + D)
compatible_models <- lapply(compatible_models, lm, data = data_train)
predictions <- sapply(compatible_models, predict, newdata = data_test)
## initialisation:
successors2 <- matrix(nrow = length(g[[1, ]][[1]]), ncol = 9)
i <- 1
start_time <- Sys.time()
for (child_node in 2:2049) {
successors2[i, 1:3] <- c(edge_attr(g, name = "attr1", g[[1, V(g)[child_node], edges = TRUE]]), edge_attr(g, name = "attr2", g[[1, V(g)[child_node], edges = TRUE]]),
sum(apply(predictions, 2, rmse, actual = data_test$Y))/length(compatible_models))
i <- i + 1
}
Sys.time()-start_time
#Time difference of 8.885826 secs
all.equal(successors,successors2)
# [1] TRUE

Problem with function/ replacement has length zero

I'm trying to run the following function proposed by Lai et al., 2021, that aims to compare fit differences in non-nested models with categorical AVs. The Models look like this:
Mod1 <- '
# Measurement models
# Predictor Variables
A =~ NoEPA2 + ChEPA2 + SvEPA2
N =~ NoEPN2 + ChEPN2 + SvEPN2
# Outcome Variables
I =~ DE_VORH + AN_VORH + AINTMAX
EX =~ C_VORH + O_VORH + AD_MAX
# Control variables
AGE =~ age
age ~~ 0*age
SEX =~ sex
sex ~~ 0*sex
EDU =~ edu
edu ~~ 0* edu
#Error correlation A, N, E
NoEPA1 ~~ NoEPN1
ChEPA1 ~~ ChEPN1
SvEPA1 ~~ SvEPN1
# Correlations DV
A ~~ N
I ~~ EX
# Paths
I ~ A + N + AGE + SEX + EDU
EX ~ A + N + AGE + SEX + EDU
'
Sem2 <- sem(Mod1,
data=a,
estimator = "WLSMV",
conditional.x = FALSE,
mimic = "Mplus",
ordered = c("DE_VORH", "AN_VORH","AINTMAX","O_VORH", "C_VORH","AD_MAX"))
summary(sem2,
fit.measures = TRUE,
standardize = TRUE,
rsquare = TRUE,
estimates = TRUE,
ci = FALSE)
Mod2 <- '
# Measurement models
# Predictor Variables
A =~ NoEPA1 + ChEPA1 + SvEPA1
N =~ NoEPN1 + ChEPN1 + SvEPN1
E =~ MxStEPEM + ChEPEM1 + SvEPEM1
# Outcome Variables
I =~ DE_VORH + AN_VORH + AINTMAX
EX =~ C_VORH + O_VORH + AD_MAX
# Control variables
AGE =~ age
age ~~ 0*age
SEX =~ sex
sex ~~ 0*sex
EDU =~ edu
edu ~~ 0* edu
#Error correlation A, N, E
NoEPA1 ~~ NoEPN1 + MxStEPEM
NoEPN1 ~~ MxStEPEM
ChEPA1 ~~ ChEPN1 + ChEPEM1
ChEPN1 ~~ ChEPEM1
SvEPA1 ~~ SvEPN1 + SvEPEM1
SvEPN1 ~~ SvEPEM1
# Correlations DV
A ~~ N + E
N ~~ E
I ~~ EX
# Paths
I ~ A + N + E + AGE + SEX + EDU
EX ~ A + N + E + AGE + SEX + EDU
'
sem3a <- sem(Mod2,
data=a,
estimator = "WLSMV",
conditional.x = FALSE,
mimic = "Mplus",
ordered = c("DE_VORH", "AN_VORH", "AINTMAX", "O_VORH","C_VORH","AD_MAX"))
summary(sem3a,
fit.measures = TRUE,
standardize = TRUE,
rsquare = TRUE,
estimates = TRUE,
ci = FALSE)
The function I want to apply looks like this:
## The function below returns point estimate and standard error for
## ∆RMSEA, ∆CFI, and ∆SRMR between two competing models A & B given categorical data.
## The two models do not need to be nested.
# fitA = Fitted 'lavaan' model object for Model A
# fitB = Fitted 'lavaan' model object for Model B
# fitZ = Fitted 'lavaan' model object for the baseline model for CFI
fit.diff.cat <- function(fitA, fitB){
######################################
# Internal functions
######################################
# Rearrange the model-implied correlation matrix of 'fitB' so that its columns and rows
# are in the same order as that in the model-implied correlation matrix of 'fitA'
rearrange.P.theta <- function(fitA, fitB){
R <- inspect(fitA, "sampstat")$'cov'
p <- dim(R)[1]
R <- as.matrix(R, p, p)
P.theta.A <- inspect(fitA, "cov.ov")
P.theta.B0 <- inspect(fitB, "cov.ov")
target.var.names <- rownames(R)
current.var.names <- rownames(P.theta.B0)
P.theta.B <- matrix(NA, p, p)
rownames(P.theta.B) <- colnames(P.theta.B) <- target.var.names
for (i.row in 1:p){
for(i.col in 1:p){
row.name <- target.var.names[i.row]
col.name <- target.var.names[i.col]
pick.row <- which(current.var.names==row.name)
pick.col <- which(current.var.names==col.name)
P.theta.B[i.row, i.col] <- P.theta.B0[pick.row, pick.col]
}
}
return(P.theta.B)
}# End of rearrange.P.theta()
# Rearrange the model-implied thresholds of 'fitB' so that its names
# are in the same order as that in the model-implied thresholds 'fitA'
rearrange.thresh <- function(fitA, fitB){
thresh <- inspect(fitA, "sampstat")$th
thresh.B0 <- inspect(fitB, "th")
target.var.names <- names(thresh)
current.var.names <- names(thresh.B0)
n.thresh <- length(thresh)
thresh.B <- rep(NA, n.thresh)
names(thresh.B) <- target.var.names
for (i in 1:n.thresh){
name <- target.var.names[i]
pick.name <- which(current.var.names==name)
thresh.B[i] <- thresh.B0[pick.name]
}
return(thresh.B)
}# End of rearrange.thresh()
# Rearrange the Delta matrix of 'fitB' so that its rows are
# in the same order as that in the Delta matrix of 'fitA'.
# Delta = derivative of P(theta) wrt theta
rearrange.Delta <- function(fitA, fitB){
Delta.B0 <- lavaan:::computeDelta(fitB#Model)[[1]]
n.theta <- dim(Delta.B0)[2]
thresh <- inspect(fitA, "sampstat")$th
thresh.B0 <- inspect(fitB, "th")
target.var.names <- names(thresh)
current.var.names <- names(thresh.B0)
n.thresh <- length(thresh)
Delta.th <- matrix(NA, n.thresh, n.theta)
rownames(Delta.th) <- target.var.names
for (i in 1:n.thresh){
name <- target.var.names[i]
pick.name <- which(current.var.names==name)
Delta.th[i,] <- Delta.B0[pick.name,]
}
P.theta.B0 <- inspect(fitB, "cov.ov")
R <- inspect(fitA, "sampstat")$'cov'
p <- dim(R)[1]
target.var.names <- rownames(R)
current.var.names <- rownames(P.theta.B0)
n.rho <- p*(p-1)/2
current.matrix <- matrix(NA, p, p)
current.matrix[lower.tri(current.matrix, diag=FALSE)] <- 1:n.rho
pick.vech <- rep(NA, n.rho)
j <- 1
for(i.col in 1:(p-1)){
for(i.row in (i.col+1):p){
row.name <- target.var.names[i.row]
col.name <- target.var.names[i.col]
pick.row <- which(current.var.names==row.name)
pick.col <- which(current.var.names==col.name)
if(pick.row >= pick.col) pick.vech[j] <- current.matrix[pick.row, pick.col]
if(pick.row < pick.col) pick.vech[j] <- current.matrix[pick.col, pick.row]
j <- j+1
}
}
Delta.rho <- matrix(NA, n.rho, n.theta)
for(i in 1:n.rho){
pick <- pick.vech[i] + n.thresh
Delta.rho[i,] <- Delta.B0[pick,]
}
Delta.B <- rbind(Delta.th, Delta.rho)
return(Delta.B)
}# End of rearrange.Delta()
######################################
# Main function
######################################
H.A <- inspect(fitA, "hessian")*2
H.B <- inspect(fitB, "hessian")*2
H.A.inv <- try(chol2inv(chol(H.A)), TRUE)
H.B.inv <- try(chol2inv(chol(H.B)), TRUE)
if(class(H.A.inv)=="matrix" & class(H.B.inv)=="matrix"){
n <- inspect(fitA, "nobs")
dA <- as.numeric(fitmeasures(fitA, "df"))
dB <- as.numeric(fitmeasures(fitB, "df"))
P.B <- rearrange.P.theta(fitA, fitB)
p <- dim(P.B)[1]
rho.B <- lav_matrix_vech(P.B, diagonal = FALSE)
thresh.B <- rearrange.thresh(fitA, fitB)
estB <- c(thresh.B, rho.B)
m <- inspect(fitA, "wls.obs")
estA <- inspect(fitA, "wls.est")
eA <- m - estA
eB <- m - estB
Gamma <- inspect(fitA, "gamma")
DeltaA <- lavaan:::computeDelta(fitA#Model)[[1]]
DeltaB <- rearrange.Delta(fitA, fitB)
p1 <- dim(DeltaA)[1]
g.A <- 2*t(eA)
K.A <- (-2)*t(DeltaA)
T.A <- 2*diag(1, p1)
Q.A <- T.A - t(K.A)%*%H.A.inv%*%K.A
G.A <- t(eA) %*% eA
G.A.bc0 <- G.A - sum(diag(Q.A%*%Gamma))/(2*n)
G.A.bc <- ifelse(G.A.bc0 < 0, 0, G.A.bc0)
g.B <- 2*t(eB)
K.B <- (-2)*t(DeltaB)
T.B <- 2*diag(1, p1)
Q.B <- T.B - t(K.B)%*%H.B.inv%*%K.B
G.B <- t(eB) %*% eB
G.B.bc0 <- G.B - sum(diag(Q.B%*%Gamma))/(2*n)
G.B.bc <- ifelse(G.B.bc0 < 0, 0, G.B.bc0)
R <- inspect(fitA, "sampstat")$cov
r <- lav_matrix_vech(R, diagonal = FALSE)
k <- length(r)
G.Z <- t(r) %*% r
G.Z.bc0 <- G.Z - sum(diag(Gamma))/n
G.Z.bc <- ifelse(G.Z.bc0 < 0, 0, G.Z.bc0)
G.A1 <- ifelse(G.A.bc > 0, G.A.bc, G.A)
G.B1 <- ifelse(G.B.bc > 0, G.B.bc, G.B)
G.Z1 <- ifelse(G.Z.bc > 0, G.Z.bc, G.Z)
## RMSEA diff
rmsea.AB <- sqrt(G.A.bc/dA) - sqrt(G.B.bc/dB)
J.rmsea.1 <- cbind( 1/(2*sqrt(dA*G.A1)), -1/(2*sqrt(dB*G.B1)) )
J.rmsea.2 <- rbind(g.A, g.B)
J.rmsea <- J.rmsea.1 %*% J.rmsea.2
var.rmsea.AB <- J.rmsea %*% Gamma %*% t(J.rmsea) / n
se.rmsea.AB <- sqrt(var.rmsea.AB)
## CFI diff
cfi.AB <- (G.B.bc - G.A.bc) / G.Z.bc
n.thresh <- length(fitted(fitA)$th)
r1 <- c(rep(0, n.thresh), r)
J.cfi.1 <- cbind( -1/G.Z1, 1/G.Z1, (G.A1-G.B1)/G.Z1^2 )
J.cfi.2 <- rbind(g.A, g.B, 2*t(r1) )
J.cfi <- J.cfi.1 %*% J.cfi.2
var.cfi.AB <- J.cfi %*% Gamma %*% t(J.cfi) / n
se.cfi.AB <- sqrt(var.cfi.AB)
## SRMR diff
srmr.AB <- sqrt(G.A.bc/k) - sqrt(G.B.bc/k)
J.srmr.1 <- cbind( 1/(2*sqrt(k*G.A1)), -1/(2*sqrt(k*G.B1)) )
J.srmr.2 <- rbind(g.A, g.B)
J.srmr <- J.srmr.1 %*% J.srmr.2
var.srmr.AB <- J.srmr %*% Gamma %*% t(J.srmr) / n
se.srmr.AB <- sqrt(var.srmr.AB)
#######
output <- c(rmsea.AB, se.rmsea.AB,
cfi.AB, se.cfi.AB,
srmr.AB, se.srmr.AB)
names(output) <- c("rmsea.AB", "se.rmsea.AB",
"cfi.AB", "se.cfi.AB",
"srmr.AB", "se.srmr.AB")
}# End of if Hessian is positive definite
else{output <- rep(NA, 6)}
return(output)
}
When I enter my fitted models, the following error is returned:
Error in P.theta.B[i.row, i.col] <- P.theta.B0[pick.row, pick.col] :
replacement has length zero<
Now I tried to run the commands step by step and it seems that this error is produced at the beginning of the internal functions running this step:
P.theta.B[i.row, i.col] <- P.theta.B0[pick.row, pick.col]
Any ideas on how to troubleshoot there? I'm left with a lot of questionmarks.

How can I use `I()` with `paste0`?

I have multiple dataframes and I would like to evaluate (multiple) different models on each. MWE
df1 <- data.frame(A3 = c(-5, 5, 1),
B3 = c(0, 10, 1))
df2 <- data.frame(A4 = c(5, 15, 1))
B4 = c(10, 20, 1))
myfun <- function(arg1, arg2){ # arg1 =1 or 2
if (arg2 == 1){
eqn <- paste0("A", arg1+2) ~ paste0("B", arg1+2) + I(as.name(paste0("B", arg1+2))^2)
} else {
eqn <- paste0("A", arg1+2) ~ paste0("B", arg1+2) + I(as.name(paste0("B", arg1+2))^2) +I(as.name(paste0("B", arg1+2))^3)
}
return (lm(formula = eqn, data = eval(as.name(paste0("df", arg1)))
)
)
}
For example if I run myfun(1,2) I would like to get lm(A4 ~ B4 + I(B4^2) + I(B4^3), data = df2). But whatever I try I get the following error message Error in (paste0("B", arg1 +2))^2 : non-numeric argument to binary operator. From what I read in ?I, I imagine this is because R isolates whatever is passed into I(), so it doesn't realize I am trying to transform a variable: is that what is going on, and is it something I can fix? Also, is there a better way to estimate multiple models quickly? All the similar questions I found used the same data.frame across models, while I have to account for the response (and predictor) variables coming from different dataframes for different models.
Maybe this is what you are looking for:
The issue is that your are doing a math operation on a string, i.e with (paste0("B", arg1 +2))^2 you try to square a string, that's why you get the error. Inytead you can simply glue you formula together as a string an d converted it to a formula via as.formula:
df1 <- data.frame(A3 = c(-5, 5, 1),
B3 = c(0, 10, 1))
df2 <- data.frame(A4 = c(5, 15, 1))
B4 = c(10, 20, 1)
myfun <- function(arg1, arg2){ # arg1 =1 or 2
if (arg2 == 1){
eqn <- paste0("A", arg1+2, " ~ B", arg1+2," + I(B", arg1+2, "^2)")
} else {
eqn <- paste0("A", arg1+2, " ~ B", arg1+2," + I(B", arg1+2, "^2) + I(B", arg1+2, "^3)")
}
return (lm(formula = as.formula(eqn), data = eval(as.name(paste0("df", arg1)))
)
)
}
myfun(2, 1)
#>
#> Call:
#> lm(formula = as.formula(eqn), data = eval(as.name(paste0("df",
#> arg1))))
#>
#> Coefficients:
#> (Intercept) B4 I(B4^2)
#> 0.84795 0.12281 0.02924
An option is also to construct the formula with glue
myfun <- function(arg1, arg2){
eqn <- switch(arg2,
`1` = glue::glue("A{arg1 + 2}~ B{arg1+2} + I(B{arg1+2}^2)"),
glue::glue("A{arg1 + 2}~ B{arg1+2}",
"+ I(B{arg1+2}^2) + I(B{arg1+2}^3)")
)
model <- lm(eqn, data = get(paste0('df', arg1), envir = .GlobalEnv))
model$call <- as.formula(eqn)
return(model)
}
myfun(2, 1)
#Call:
#A4 ~ B4 + I(B4^2)
#Coefficients:
#(Intercept) B4 I(B4^2)
# 0.84795 0.12281 0.02924

Calculate/approach individual face probabilities of 10-faced dice, based on summed 2-roll dice experiment

I have a biochemistry problem , that can be simplified as a two-roll dice experiment (I think...).
Assume there is an uneven dice with 10 faces, i.e. individual face probabilities are not 1/10. We want to know these probabilities.
The given dataset that we have, however, is a histogram of summed faces of rolling the (same) dice twice. So, the range of the observed bins is 2-20 (2 = 1+1; 3 = 1+2, 2+1, 4 = 2+2, 1+3, 3+1; etc.).
The probabilities of summed faces are the product of the individual probabilities (s: observed probabilities of summed faces; p: probabilities of individual faces) and can be written as follows:
s2 ~ p1^2
s3 ~ 2*p1*p2
s4 ~ 2*p1*p3 + p2^2
s5 ~ 2*p1*p4 + 2*p2*p3
s6 ~ 2*p1*p5 + 2*p2*p4 + p3^2
s7 ~ 2*p1*p6 + 2*p2*p5 + 2*p3*p4
s8 ~ 2*p1*p7 + 2*p2*p6 + 2*p3*p5 + p4^2
s9 ~ 2*p1*p8 + 2*p2*p7 + 2*p3*p6 + 2*p4*p5
s10 ~ 2*p1*p9 + 2*p2*p8 + 2*p3*p7 + 2*p4*p6 + p5^2
s11 ~ 2*p1*p10 + 2*p2*p9 + 2*p3*p8 + 2*p4*p7 + 2*p5*p6
s12 ~ 2*p2*p10 + 2*p3*p9 + 2*p4*p8 + 2*p5*p7 + p6^2
s13 ~ 2*p3*p10 + 2*p4*p9 + 2*p5*p8 + 2*p6*p7
s14 ~ 2*p4*p10 + 2*p5*p9 + 2*p6*p8 + p7^2
s15 ~ 2*p5*p10 + 2*p6*p9 + 2*p7*p8
s16 ~ 2*p6*p10 + 2*p7*p9 + p8^2
s17 ~ 2*p7*p10 + 2*p8*p9
s18 ~ 2*p8*p10 + p9^2
s19 ~ 2*p9*p10
s20 ~ p10^2
In this case there are 20-1=19 known variables, and 10 unknowns, so the system is over-determined. It is also easy to solve by hand using algebra. As far as I can remember: quadratic terms will result in 2 possible solutions per individual face. Probabilities are always positive, so practically there should be one solution. Right?
Is there a way to solve this system in R? I am familiar with linear inverse problems in R, but I don't know how to approach this (quadratic?) system in R.
Here is some code to simulate the problem:
options(stringsAsFactors = FALSE)
library(gtools)
library(dplyr)
dice <- data.frame(face = 1:10)
### functions
split_dice_faces <- function(summed_face){
face_face <- strsplit(x = as.character(summed_face),split = "[/_\\|]")[[1]]
names(face_face) <- c("face1","face2")
as.numeric(face_face)
}
sum_dice_faces <- function(face_face){
sapply(face_face, function(face_face_i){
face1 <- split_dice_faces(face_face_i)[1]
face2 <- split_dice_faces(face_face_i)[2]
sum(c(face1[1], face2[1]))
})
}
simulate_2_rolls <- function(dice_pool){
dice_perm <- data.frame(permutations(n = dim(dice_pool)[1], r = 2, v = as.character(dice_pool$face), repeats.allowed = T ))
dice_perm$face_face <- paste(dice_perm[[1]],"|",dice_perm[[2]], sep = "")
dice_perm$prob <- dice_pool$prob[match(dice_perm[[1]], dice_pool$face)]*dice_pool$prob[match(dice_perm[[2]], dice_pool$face)]
dice_perm$summed_face <- sum_dice_faces(dice_perm$face_face)
dice_perm <- dice_perm %>% arrange(summed_face) %>% select(one_of(c("face_face", "summed_face","prob")))
dice_perm
}
summarise_2_rolls_experiment <- function(simulate_2_rolls_df){
simulate_2_rolls_df %>% group_by(summed_face) %>% summarise(prob = sum(prob))
}
from_face_probs_to_summed_observations <- function(face_probs){
face_probs %>%
data.frame(face = dice$face, prob = .) %>%
simulate_2_rolls() %>%
summarise_2_rolls_experiment() %>%
pull(prob)
}
generate_formulas <- function() {
output <-
dice_sum_probs %>% group_by(summed_face) %>% group_split() %>%
sapply(function(i){
left_hand <- paste("s",i$summed_face[1],sep="")
right_hand <-
sapply(strsplit(i$face_face, "\\|") , function(row){
row_i <- as.numeric(row)
row_i <- row_i[order(row_i)]
row_i <- paste("p",row_i,sep = "")
if(row_i[1] == row_i[2]){
paste(row_i[1],"^2",sep="")
} else {
paste(row_i,collapse="*")
}
})
right_hand <-
paste(sapply(unique(right_hand), function(right_hand_i){
fact <- sum(right_hand == right_hand_i)
if(fact > 1){fact <- paste(fact,"*",sep = "")} else {fact <- ""}
paste(fact,right_hand_i,sep = "")
}), collapse = " + ")
paste(left_hand, "~", right_hand)
})
return(output)
}
to simulate a dataset:
### random individual probabilites
dice_probs <- data.frame(face = dice$face,
prob = runif(n = dim(dice)[1]) %>% (function(x){x / sum(x)}))
dice_probs
### simulate infinite amount of trials, observations expressed as probabilities
dice_sum_probs <- simulate_2_rolls(dice_probs)
dice_sum_probs
### sum experiment outcomes with the same sum
dice_sum_probs_summary <- dice_sum_probs %>% group_by(summed_face) %>% summarise(prob = sum(prob))
### plot, this is the given dataset
with(data = dice_sum_probs_summary, barplot(prob, names.arg = summed_face))
### how to calculate / approach p1, p2, ..., p10?
Thanks!
If we create a multiplication table of the probabilities, outer(p, p) and then sum those over constant values of outer(1:10, 1:10, "+") using tapply we get the following nonlinear regression problem:
nls(prob ~ tapply(outer(p, p), outer(1:10, 1:10, `+`), sum),
dice_sum_probs_summary, algorithm = "port",
start = list(p = sqrt(dice_sum_probs_summary$prob[seq(1, 19, 2)])),
lower = numeric(10), upper = rep(1, 10))
giving:
Nonlinear regression model
model: prob ~ tapply(outer(p, p), outer(1:10, 1:10, `+`), sum)
data: dice_sum_probs_summary
p1 p2 p3 p4 p5 p6 p7 p8 p9 p10
0.06514 0.04980 0.14439 0.06971 0.06234 0.19320 0.09491 0.01237 0.11936 0.18878
residual sum-of-squares: 1.33e-30
which is consistent with
> dice_probs
face prob
1 1 0.06513537
2 2 0.04980455
3 3 0.14438749
4 4 0.06971313
5 5 0.06234477
6 6 0.19319613
7 7 0.09491289
8 8 0.01236557
9 9 0.11936244
10 10 0.18877766
We can alternately express it as follows where X is a matrix of zeros and ones having dimension 19 x 100 such that each row corresponds to a possible outcome of rolling the two dice (i.e. 2:20) and each column corresponds to a pair of indexes from 1:10 and 1:10. An entry equals one if the column pair sums to the sum of the two faces represented by its row and zero otherwise.
g <- c(outer(1:10, 1:10, `+`))
X <- + outer(2:20, g, `==`)
nls(prob ~ X %*% kronecker(p, p), dice_sum_probs_summary, alg = "port",
start = list(p = sqrt(dice_sum_probs_summary$prob[seq(1, 19, 2)])),
lower = numeric(10), upper = rep(1, 10))

`nlme` with crossed random effects

I am trying to fit a crossed non-linear random effect model as the linear random effect models as mentioned in this question and in this mailing list post using the nlme package. Though, I get an error regardless of what I try. Here is an example
library(nlme)
#####
# simulate data
set.seed(18112003)
na <- 30
nb <- 30
sigma_a <- 1
sigma_b <- .5
sigma_res <- .33
n <- na*nb
a <- gl(na,1,n)
b <- gl(nb,na,n)
u <- gl(1,1,n)
x <- runif(n, -3, 3)
y_no_noise <- x + sin(2 * x)
y <-
x + sin(2 * x) +
rnorm(na, sd = sigma_a)[as.integer(a)] +
rnorm(nb, sd = sigma_b)[as.integer(b)] +
rnorm(n, sd = sigma_res)
#####
# works in the linear model where we know the true parameter
fit <- lme(
# somehow we found the right values
y ~ x + sin(2 * x),
random = list(u = pdBlocked(list(pdIdent(~ a - 1), pdIdent(~ b - 1)))))
vv <- VarCorr(fit)
vv2 <- vv[c("a1", "b1"), ]
storage.mode(vv2) <- "numeric"
print(vv2,digits=4)
#R Variance StdDev
#R a1 1.016 1.0082
#R b1 0.221 0.4701
#####
# now try to do the same with `nlme`
fit <- nlme(
y ~ c0 + sin(c1),
fixed = list(c0 ~ x, c1 ~ x - 1),
random = list(u = pdBlocked(list(pdIdent(~ a - 1), pdIdent(~ b - 1)))),
start = c(0, 0.5, 1))
#R Error in nlme.formula(y ~ a * x + sin(b * x), fixed = list(a ~ 1, b ~ :
#R 'random' must be a formula or list of formulae
The lme example is similar to the one page 163-166 of "Mixed-effects Models in S and S-PLUS" with only 2 random effects instead of 3.
I should haved used a two-sided formula as written in help("nlme")
fit <- nlme(
y ~ c0 + c1 + sin(c2),
fixed = list(c0 ~ 1, c1 ~ x - 1, c2 ~ x - 1),
random = list(u = pdBlocked(list(pdIdent(c0 ~ a - 1), pdIdent(c1 ~ b - 1)))),
start = c(0, 0.5, 1))
# fixed effects estimates
fixef(fit)
#R c0.(Intercept) c1.x c2.x
#R -0.1788218 0.9956076 2.0022338
# covariance estimates
vv <- VarCorr(fit)
vv2 <- vv[c("c0.a1", "c1.b1"), ]
storage.mode(vv2) <- "numeric"
print(vv2,digits=4)
#R Variance StdDev
#R c0.a1 0.9884 0.9942
#R c1.b1 0.2197 0.4688

Resources