how to solve the NaNs produced when i use the log() - r

> LL_LN<-function(para){
+ ww=para[1] # w
+ alpha=para[2] # alpha
+ beta=para[3]# beta
+ sigma=para[4]
+ n=length(R)
+ psi <- rep(0, n)
+ # read data from file , named as RR
+ psi[1]=0.5 # replace with mean (R) or R(1)
+ sum1 = -n/2*log(2*pi*sigma^2)
+ sum4 = -log(R[1])
+ sum3 = -((log(R[1])-log(psi[1]))+0.5*(sigma^2))^2/(2*(sigma^2))
+ sum=0
+
+ for (i in 2:n) {
+ psi[i]=ww+alpha*psi[i-1]+beta*R[i-1]
+ sum4=sum4-log(R[i])
+ sum3=sum3-((log(R[i])-log(psi[i]))+0.5*(sigma^2))^2/(2*(sigma^2))
+ }
+ sum=sum1+sum3+sum4
+ return(-sum)
+
+ }
> n=10000
> k=5
> sigma=0.25
> mu=-(sigma^2)/2
> result=c(0,0,0,0)
> output<-matrix(0,k,4)
> output<-as.data.frame(output)
> for(j in 1:k){
+ #E<-rexp(n,1)
+ E=rlnorm(n,mu,sigma)
+ w=1
+ a=0.2
+ b=0.3
+ R <- rep(0, n) # To Store R1,R2....R1000
+ X <- rep(0, n) # To Store X1,X2....X1000
+ X[1] = 0.5
+ R[1]=X[1]*E[1]
+
+ for (i in 2:n) # To Find R1,R2....R1000
+ {
+ X[i]=w+a*X[i-1]+b*R[i-1]
+ R[i]=X[i]*E[i]
+ }
+ hist(R)
+ mean(E)
+
+ initial =c(1,0.2,0.3,2)
+ op<-optim(initial,LL_LN)
+ output$V1[j] <- op$par[1]
+ output$V2[j] <- op$par[2]
+ output$V3[j] <- op$par[3]
+ output$V4[j] <- op$par[4]
+
+ print(output)
+ }
V1 V2 V3 V4
1 1.0202418 0.1989128 0.2927711 0.2484346
2 0.9725745 0.2159796 0.2970536 0.2529665
3 1.0648460 0.1692719 0.2987034 0.2492417
4 1.0186746 0.1819587 0.3039954 0.2517418
5 1.0022230 0.2103271 0.2858050 0.2484962
There were 50 or more warnings (use warnings() to see the first 50)
> mean<-cbind(mean(output$V1),mean(output$V2),mean(output$V3),mean(output$V4))
> mean
[,1] [,2] [,3] [,4]
[1,] 1.015712 0.19529 0.2956657 0.2501762

Related

ggplot's stat_function() giving wrong result

I generated some data to perfome a regression on it:
library(tidyverse)
library(nnet)
# Generating the data --------------------------
set.seed(100)
helicopter <- rnorm(20, mean = 35, sd = 3)
car <- rnorm(20, mean = 30, sd = 3)
bus <- rnorm(20, mean = 25, sd = 3)
bike <- rnorm(20, mean = 20, sd = 3)
transportation_data <- data.frame(helicopter, car, bus, bike) %>%
pivot_longer(cols = 1:4, values_to = "income", names_to = "mode")
# Setting up the regression -------------------
transportation_regression <- multinom(mode~income, data = transportation_data)
So far, so good. I now want to plot the regression results (probability of choosing a certain mode of transportation based on income) using stat_function:
ins <- coef(transportation_regression)[1:3]
betas <- coef(transportation_regression)[4:6]
transportation_data %>%
ggplot(aes(x = income))+
stat_function(fun = function(x) { 1 / (1 + sum(exp(ins + betas * x))) }, aes(color = "bike"))+
stat_function(fun = function(x) { exp(ins[1] + betas[1] * x) / (1 + sum(exp(ins + betas * x))) }, aes(color = "bus"))+
stat_function(fun = function(x) { exp(ins[2] + betas[2] * x) / (1 + sum(exp(ins + betas * x))) }, aes(color = "car"))+
stat_function(fun = function(x) { exp(ins[3] + betas[3] * x) / (1 + sum(exp(ins + betas * x))) }, aes(color = "helicopter"))
I get this output, which is obviously wrong, and a warning Warning: longer object length is not a multiple of shorter object length where I don't know what it means.
When I use the same functions, but predict data points first, everything works just fine:
income <- seq(0,50,0.1)
result <- matrix( , nrow = length(income), ncol = 4)
i <- 1
for(x in income){
result[i,1] <- 1 / (1 + sum(exp(ins + betas * x))) # bike
result[i,2] <- exp(ins[1] + betas[1] * x) / (1 + sum(exp(ins + betas * x))) # bus
result[i,3] <- exp(ins[2] + betas[2] * x) / (1 + sum(exp(ins + betas * x))) # car
result[i,4] <- exp(ins[3] + betas[3] * x) / (1 + sum(exp(ins + betas * x))) # helicopter
i <- i + 1
}
cbind(income, as.data.frame(result)) %>%
pivot_longer(cols = V1:V4) %>%
ggplot(aes(x = income, y = value, color = name))+
geom_line()
Why don't the stat_function() in ggplot work?
I think it's just a misunderstanding of how the function works. Here's an example of using stat_function() to generate the right result:
library(tidyverse)
library(nnet)
# Generating the data --------------------------
set.seed(100)
helicopter <- rnorm(20, mean = 35, sd = 3)
car <- rnorm(20, mean = 30, sd = 3)
bus <- rnorm(20, mean = 25, sd = 3)
bike <- rnorm(20, mean = 20, sd = 3)
transportation_data <- data.frame(helicopter, car, bus, bike) %>%
pivot_longer(cols = 1:4, values_to = "income", names_to = "mode")
# Setting up the regression -------------------
transportation_regression <- multinom(mode~income, data = transportation_data)
#> # weights: 12 (6 variable)
#> initial value 110.903549
#> iter 10 value 48.674542
#> iter 20 value 46.980349
#> iter 30 value 46.766625
#> iter 40 value 46.734782
#> iter 50 value 46.732249
#> final value 46.732163
#> converged
ins <- coef(transportation_regression)[1:3]
betas <- coef(transportation_regression)[4:6]
transportation_data %>%
ggplot(aes(x = income))+
stat_function(fun = function(x) { 1 / (1 + exp(ins[1] + betas[1] * x) + exp(ins[2] + betas[2] * x) + exp(ins[3] + betas[3] * x)) }, aes(color = "bike"))+
stat_function(fun = function(x) { exp(ins[1] + betas[1] * x) / (1 + exp(ins[1] + betas[1] * x) + exp(ins[2] + betas[2] * x) + exp(ins[3] + betas[3] * x)) }, aes(color = "bus"))+
stat_function(fun = function(x) { exp(ins[2] + betas[2] * x) / (1 + exp(ins[1] + betas[1] * x) + exp(ins[2] + betas[2] * x) + exp(ins[3] + betas[3] * x)) }, aes(color = "car"))+
stat_function(fun = function(x) { exp(ins[3] + betas[3] * x) / (1 + exp(ins[1] + betas[1] * x) + exp(ins[2] + betas[2] * x) + exp(ins[3] + betas[3] * x)) }, aes(color = "helicopter"))
There were a couple of problems originally. Take, for example, the first instance of stat_function(),
stat_function(fun = function(x) {
1 / (1 + sum(exp(ins + betas * x))) },
aes(color = "bike"))
You're expecting ins + betas * x to be equivalent to ins[1] + betas[1] * x + ins[2] + betas[2] * x + ins[3] + betas[3] * x, but it isn't essentially recycling ins and betas to make them vectors as long as x and then multiplying betas by x and adding ins.
The other problem was the sum() around exp(ins ...) Rather than summing the rows, it's summing all rows and columns of the output, making a scalar value.
You could also make it a bit more general using matrix calculations:
b <- coef(transportation_regression)
transportation_data %>%
ggplot(aes(x = income))+
stat_function(fun = function(x) { 1 / (1 + rowSums(exp(cbind(1, x) %*% t(b)))) }, aes(color = "bike"))+
stat_function(fun = function(x) { exp(ins[1] + betas[1] * x) / (1 + rowSums(exp(cbind(1, x) %*% t(b)))) }, aes(color = "bus"))+
stat_function(fun = function(x) { exp(ins[2] + betas[2] * x) / (1 + rowSums(exp(cbind(1, x) %*% t(b)))) }, aes(color = "car"))+
stat_function(fun = function(x) { exp(ins[3] + betas[3] * x) / (1 + rowSums(exp(cbind(1, x) %*% t(b)))) }, aes(color = "helicopter"))
Created on 2023-02-04 by the reprex package (v2.0.1)

Removing the interaction terms when the main effect is removed

I have a formula in R for example
y ~ x + z + xx + zz + tt + x:xx + x:zz + xx:z + zz:xx + xx:zz:tt
or even more complicated (y~x*z*xx*zz*tt)
Note that the names on the right-hand side of the formula are intentionally selected to be somehow similar to at least one other term.
The question is now how to remove the interaction terms that are related to a specific main effect. For example, if I remove the term x (main effect) I want to remove the interaction terms that also include x, here x:xx.
I have tried grepl() but it would remove any term that contains partially or fully the word. In my example it removes x,xx,x:xx,xx:z,zz:xx,xx:zz:tt
any ideas about a function to do it?
Update:
What I have already tried:
f = y ~ x + z + xx + zz + tt + x:xx + x:zz + xx:z + zz:xx + xx:zz:tt
modelTerms = attr(terms(f) , which = 'term.labels')
modelTerms[!grepl(pattern = 'x', x = modelTerms)]
Use update.formula:
f <- y~x*z*xx*zz*tt
update(f, . ~ . - x - x:.)
#y ~ z + xx + zz + tt + z:xx + z:zz + xx:zz + z:tt + xx:tt + zz:tt +
# z:xx:zz + z:xx:tt + z:zz:tt + xx:zz:tt + z:xx:zz:tt
f <- y ~ x + z + xx + zz + tt + x:xx + x:zz + xx:z + zz:xx + xx:zz:tt
update(f, . ~ . - x - x:.)
#y ~ z + xx + zz + tt + z:xx + xx:zz + xx:zz:tt
Are you looking for this?
> modelTerms[!grepl(pattern='^x\\:x+', x=modelTerms)]
[1] "x" "z" "xx" "zz" "tt" "x:zz" "z:xx" "xx:zz"
[9] "xx:zz:tt"
Simple:
f = y~x*z*xx*zz*tt
modelTerms = attr(terms(f) , which = 'term.labels')
l = sapply(
strsplit(x = modelTerms, split = '[:*]'),
FUN = function(x) {
'x' %in% x
}
)
modelTerms[!l]

Transform Solver Excel program in R

I want to maximize a one variable (sum(DeplaQ1$somme_vrai)) with unknow parameters (alpha,VDT,constVPC,constVPP,a,constTC,const2RM,constTaxi) and I want to determine this parametres with a one constraint x1 <= 10000
I do not find this solution. triplaQ1 is a database with time and number of trip per transport mod.I try to use Optimum or LpSolve but it does not work.**
** My program :
fr <- function(alpha,VDT,constVPC,constVPP,a,constTC,const2RM,constTaxi) {
triplaQ1$u_VPC <- - (alpha * alpha + 0.001)*(triplaQ1$time_VPC*VDT + 0.15*triplaQ1$dist_VPC + constVPC)
triplaQ1$u_VPP <- - (alpha * alpha +0.001) * (triplaQ1$time_VPP * VDT) + constVPP
triplaQ1$u_TC <- - (alpha * alpha +0.001) * ((triplaQ1$time_TC/a)*VDT+constTC)
triplaQ1$u_mp <- - (alpha * alpha +0.001) * (triplaQ1$time_MP * VDT)
triplaQ1$u_2RM <- - (alpha * alpha +0.001) * (triplaQ1$time_2RM * VDT + const2RM)
triplaQ1$u_Taxi <- - (alpha * alpha + 0.001) * (triplaQ1$time_Taxi * VDT + 0.3 * triplaQ1$dist_Taxi + constTaxi)
triplaQ1$simuleVPC <- triplaQ1$trip * ((exp(triplaQ1$u_VPC)) / (exp(triplaQ1$u_VPC) + exp(triplaQ1$u_VPP) + exp(triplaQ1$u_TC) + exp(triplaQ1$u_mp) + exp(triplaQ1$u_2RM) + exp(triplaQ1$u_Taxi)))
triplaQ1$simuleVPP <- triplaQ1$trip * ((exp(triplaQ1$u_VPP)) / (exp(triplaQ1$u_VPC) + exp(triplaQ1$u_VPP) + exp(triplaQ1$u_TC) + exp(triplaQ1$u_mp) + exp(triplaQ1$u_2RM) + exp(triplaQ1$u_Taxi)))
triplaQ1$simuleTC <- triplaQ1$trip * ((exp(triplaQ1$u_TC)) / (exp(triplaQ1$u_VPC) + exp(triplaQ1$u_VPP) + exp(triplaQ1$u_TC) + exp(triplaQ1$u_mp) + exp(triplaQ1$u_2RM) + exp(triplaQ1$u_Taxi)))
triplaQ1$simuleMAP <- triplaQ1$trip * ((exp(triplaQ1$u_mp)) / (exp(triplaQ1$u_VPC) + exp(triplaQ1$u_VPP) + exp(triplaQ1$u_TC) + exp(triplaQ1$u_mp) + exp(triplaQ1$u_2RM) + exp(triplaQ1$u_Taxi)))
triplaQ1$simule2RM <- triplaQ1$trip * ((exp(triplaQ1$u_2RM)) / (exp(triplaQ1$u_VPC) + exp(triplaQ1$u_VPP) + exp(triplaQ1$u_TC) + exp(triplaQ1$u_mp) + exp(triplaQ1$u_2RM) + exp(triplaQ1$u_Taxi)))
triplaQ1$simuleTaxi <- triplaQ1$trip * ((exp(triplaQ1$u_Taxi)) / (exp(triplaQ1$u_VPC) + exp(triplaQ1$u_VPP) + exp(triplaQ1$u_TC) + exp(triplaQ1$u_mp) + exp(triplaQ1$u_2RM) + exp(triplaQ1$u_Taxi)))
triplaQ1$somme_simule <- triplaQ1$simuleVPC + triplaQ1$simuleVPP + triplaQ1$simuleTC + triplaQ1$simuleMAP + triplaQ1$simule2RM + triplaQ1$simuleTaxi
triplaQ1$shsimuleVPC <- ifelse(triplaQ1$trip > 0.001, triplaQ1$simuleVPC/triplaQ1$trip, 0)
triplaQ1$shsimuleVPP <- ifelse(triplaQ1$trip > 0.001, triplaQ1$simuleVPP/triplaQ1$trip, 0)
triplaQ1$shsimuleTC <- ifelse(triplaQ1$trip > 0.001, triplaQ1$simuleTC/triplaQ1$trip, 0)
triplaQ1$shsimuleMAP <- ifelse(triplaQ1$trip > 0.001, triplaQ1$simuleMAP/triplaQ1$trip, 0)
triplaQ1$shsimule2RM <- ifelse(triplaQ1$trip > 0.001, triplaQ1$simule2RM/triplaQ1$trip, 0)
triplaQ1$shsimuleTaxi <- ifelse(triplaQ1$trip > 0.001, triplaQ1$simuleTaxi/triplaQ1$trip, 0)
triplaQ1$shsommesimule <- triplaQ1$shsimuleVPC + triplaQ1$shsimuleVPP + triplaQ1$shsimuleTC + triplaQ1$shsimuleMAP + triplaQ1$shsimule2RM + triplaQ1$shsimuleTaxi
triplaQ1$VraiVPC <- ifelse(triplaQ1$trip > 0,triplaQ1$Tij_vpc * log(triplaQ1$simuleVPC/triplaQ1$trip),0)
triplaQ1$VraiVPP <- ifelse(triplaQ1$trip > 0,triplaQ1$Tij._vpp * log(triplaQ1$simuleVPP/triplaQ1$trip),0)
triplaQ1$VraiTC <- ifelse(triplaQ1$trip > 0,triplaQ1$Tij._tc * log(triplaQ1$simuleTC/triplaQ1$trip),0)
triplaQ1$VraiMAP <- ifelse(triplaQ1$trip > 0,triplaQ1$Tij._mp * log(triplaQ1$simuleMAP/triplaQ1$trip),0)
triplaQ1$Vrai2RM <- ifelse(triplaQ1$trip > 0,triplaQ1$Tij._2RM * log(triplaQ1$simule2RM/triplaQ1$trip),0)
triplaQ1$VraiTaxi <- ifelse(triplaQ1$trip > 0,triplaQ1$Tij_Taxi * log(triplaQ1$simuleTaxi/triplaQ1$trip),0)
triplaQ1$somme_vrai <- triplaQ1$VraiVPC + triplaQ1$VraiVPP + triplaQ1$VraiTC + triplaQ1$VraiMAP + triplaQ1$Vrai2RM + triplaQ1$VraiTaxi
return(sum(triplaQ1$somme_vrai))
}
x1 <- sqrt((sum(DeplaQ1$dep_VPC) - sum(DeplaQ1$simuleVPC))^2 + (sum(DeplaQ1$dep_VPP) - sum(DeplaQ1$simuleVPP))^2 + (sum(DeplaQ1$dep_TC) - sum(DeplaQ1$simuleTC))^2 + (sum(DeplaQ1$dep_2RM) - sum(DeplaQ1$simule2RM))^2 + (sum(DeplaQ1$dep_MP) - sum(DeplaQ1$simuleMAP))^2 + (sum(DeplaQ1$dep_Taxi) - sum(DeplaQ1$simuleTaxi))^2)
thank you in advance
first translate the EXCEL formula you want to find a target value to to a r-function. I did this for a very simple formula x^2+y^2 here, this is my_fun in the example. Then define a function that gives the squared difference (or any other norm you want to use) between target value and function value for all inputs. Then just put everything into the optim function.
my_fun <- function(x){
x[1]^2 + x[2]^2
}
target <- 2
optim_fun <- function(x){
sum((my_fun(x)-target)^2)
}
res <- optim(c(0,0), optim_fun)
res
To deal with constraints you could introduce a penalty term. Some algorithms in optim also support constraints. There are a lot of more specialised optimisation libraries for R if the very basic optim function does not fit your need.

optim in R failing due to NAs

I have been trying to estimate a rather messy nonlinear regression model in R for quite some time now. After countless failed attempts using the nls function, I am now trying my luck with optim, which I have used many times in the past. For this example, I'll use the following data:
x1 <- runif(1000,0,7)
x2 <- runif(1000,0,7)
x3 <- runif(1000,0,7)
y <- log(.5 + .5*x1 + .7*x2 + .4*x3 + .05*x1^2 + .1*x2^2 + .15*x3^2 - .05*x1*x2 - .1*x1*x3 - .07*x2*x3 + .02*x1*x2*x2) + rnorm(1000)
I would like to estimate the parameters in the polynomial expression inside the log() function above, and so I have defined the following function to replicate a nonlinear least squares regression:
g <- function(coefs){
fitted <- coefs[1] + coefs[2]*x1 + coefs[3]*x2 + coefs[4]*x3 + coefs[5]*x1^2 + coefs[6]*x2^2 + coefs[7]*x3^2 + coefs[8]*x1*x2 + coefs[9]*x1*x3 + coefs[10]*x2*x3 + coefs[11]*x1*x2*x3
error <- y - log(fitted)
return(sum(error^2))
}
In order to avoid negative starting values inside the log() expression, I first estimate the linear model below:
lm.1 <- lm(I(exp(y)) ~ x1 + x2 + x3 + I(x1^2) + I(x2^2) + I(x3^2) + I(x1*x2) + I(x1*x3) + I(x2*x3) + I(x1*x2*x3))
intercept.start <- ifelse((min(fitted(lm.1)-lm.1$coefficients[1])) <= 0, -(min(fitted(lm.1)-lm.1$coefficients[1])) + .5, .5)
coefs.start <- c(intercept.start,lm.1$coefficients[-1])
Defining intercept.start above guarantees that the expression inside of log() will be strictly positive at the outset. However, when I run the optim command
nl.model <- optim(coefs.start, g, method="L-BFGS-B")
I get the following error message
Error in optim(coefs.start, g, method = "L-BFGS-B") :
L-BFGS-B needs finite values of 'fn'
In addition: Warning message:
In log(fitted) : NaNs produced
Does anyone know how I can force the optim routine to simply disregard parameter estimates that would produce negative values inside of the log() expression? Thanks in advance.
Here's a slightly different approach.
Aside from the typo mentioned in the comment, if the issue is that the argument to the log(...) is < 0 for certain parameter estimates, you can change the function definition to prevent that.
# just some setup - we'll need this later
set.seed(1)
err <- rnorm(1000, sd=0.1) # note smaller error sd
x1 <- runif(1000,0,7)
x2 <- runif(1000,0,7)
x3 <- runif(1000,0,7)
par <- c(0.5, 0.5, 0.7, 0.4, 0.05, 0.1, 0.15, -0.05, -0.1, -0.07, 0.02)
m <- cbind(1, x1, x2, x3, x1^2, x2^2, x3^2, x1*x2, x1*x3, x2*x3, x1*x2*x3)
y <- as.numeric(log(m %*% par)) + err
# note slight change in the model function definition
g <- function(coefs){
fitted <- coefs[1] + coefs[2]*x1 + coefs[3]*x2 + coefs[4]*x3 + coefs[5]*x1^2 + coefs[6]*x2^2 + coefs[7]*x3^2 + coefs[8]*x1*x2 + coefs[9]*x1*x3 + coefs[10]*x2*x3 + coefs[11]*x1*x2*x3
fitted <- ifelse(fitted<=0, 1, fitted) # ensures fitted > 0
error <- y - log(fitted)
return(sum(error^2))
}
lm.1 <- lm(I(exp(y)) ~ x1 + x2 + x3 + I(x1^2) + I(x2^2) + I(x3^2) + I(x1*x2) + I(x1*x3) + I(x2*x3) + I(x1*x2*x3))
nl.model <- optim(coef(lm.1), g, method="L-BFGS-B", control=list(maxit=1000))
nl.model$par
# (Intercept) x1 x2 x3 I(x1^2) I(x2^2) I(x3^2) I(x1 * x2) I(x1 * x3) I(x2 * x3) I(x1 * x2 * x3)
# 0.40453182 0.50136222 0.71696293 0.45335893 0.05461253 0.10210854 0.14913914 -0.06169715 -0.11195476 -0.08497180 0.02531717
with(nl.model, cat(convergence, message))
# 0 CONVERGENCE: REL_REDUCTION_OF_F <= FACTR*EPSMCH
Note that these estimates are pretty close to the actual values. That's because in the setup I used a smaller error term (sd = 0.2 instead of 1). In your example, the error is large compared to the response (y), so you're basically fitting random error.
If you fit the model using the actual parameter values as starting estimates, you get nearly identical results, no closer to the "true" values.
nl.model <- optim(par, g, method="L-BFGS-B", control=list(maxit=1000))
nl.model$par
# [1] 0.40222956 0.50159930 0.71734810 0.45459606 0.05465654 0.10206887 0.14899640 -0.06177640 -0.11209065 -0.08497423 0.02533085
with(nl.model, cat(convergence, message))
# 0 CONVERGENCE: REL_REDUCTION_OF_F <= FACTR*EPSMCH
Try this with the original error (sd = 1) and see what happens.
Here's a log of my efforts to investigate. I put a maximum on the fitted values and got convergence. I then asked myself if increasing that max would do anything th the estimated parameters and found that there was no change... AND there was no difference from the starting values, so I think you messed up in building the function. Perhaps you can investigate further:
> gp <- function(coefs){
+
+ fitted <- coefs[1] + coefs[2]*x1 + coefs[3]*x2 + coefs[4]*x3 + coefs[5]*x1^2 + coefs[6]*x2^2 + coefs[7]*x3^2 + coefs[8]*x1*x2 + coefs[9]*x1*x3 + coefs[10]*x2*x3 + coefs[11]*x1*x2*x3 }
> describe( gp( coefs.start) ) #describe is from pkg:Hmisc
gp(coefs.start)
n missing unique Info Mean .05 .10 .25 .50 .75
1000 0 1000 1 13.99 2.953 4.692 8.417 12.475 18.478
.90 .95
25.476 28.183
lowest : 0.5000 0.5228 0.5684 0.9235 1.1487
highest: 41.0125 42.6003 43.1457 43.5950 47.2234
> g <- function(coefs){
+
+ fitted <- max( coefs[1] + coefs[2]*x1 + coefs[3]*x2 + coefs[4]*x3 + coefs[5]*x1^2 + coefs[6]*x2^2 + coefs[7]*x3^2 + coefs[8]*x1*x2 + coefs[9]*x1*x3 + coefs[10]*x2*x3 + coefs[11]*x1*x2*x3 , 1000)
+ error <- y - log(fitted)
+ return(sum(error^2))
+ }
> nl.model <- optim(coefs.start, g, method="L-BFGS-B")
> nl.model
$par
x1 x2 x3 I(x1^2)
0.77811231 -0.94586233 -1.33540959 1.65454871 0.31537594
I(x2^2) I(x3^2) I(x1 * x2) I(x1 * x3) I(x2 * x3)
0.45717138 0.11051418 0.59197115 -0.25800792 0.04931727
I(x1 * x2 * x3)
-0.08124126
$value
[1] 24178.62
$counts
function gradient
1 1
$convergence
[1] 0
$message
[1] "CONVERGENCE: NORM OF PROJECTED GRADIENT <= PGTOL"
> g <- function(coefs){
+
+ fitted <- max( coefs[1] + coefs[2]*x1 + coefs[3]*x2 + coefs[4]*x3 + coefs[5]*x1^2 + coefs[6]*x2^2 + coefs[7]*x3^2 + coefs[8]*x1*x2 + coefs[9]*x1*x3 + coefs[10]*x2*x3 + coefs[11]*x1*x2*x3 , 100000)
+ error <- y - log(fitted)
+ return(sum(error^2))
+ }
> nl.model <- optim(coefs.start, g, method="L-BFGS-B")
> nl.model
$par
x1 x2 x3 I(x1^2)
0.77811231 -0.94586233 -1.33540959 1.65454871 0.31537594
I(x2^2) I(x3^2) I(x1 * x2) I(x1 * x3) I(x2 * x3)
0.45717138 0.11051418 0.59197115 -0.25800792 0.04931727
I(x1 * x2 * x3)
-0.08124126
$value
[1] 89493.99
$counts
function gradient
1 1
$convergence
[1] 0
$message
[1] "CONVERGENCE: NORM OF PROJECTED GRADIENT <= PGTOL"
.

operating on pairs of elements in a data frame

I have two data frames, x and weights, in which columns are paired. Here are example data frames:
x = read.table(text = "
yr1 yr2 yr3 yr4
10 15 6 8
10 20 30 NA
NA 5 2 3
100 100 NA NA",
sep = "", header = TRUE)
weights = read.table(text = "
yr1 yr2 yr3 yr4
2 4 1 3
2 2 4 2
3 2 2 3
4 2 2 4",
sep = "", header = TRUE)
The columns yr1 and yr2 are one pair and the columns yr3 and yr4 are another pair. With my actual data the columns go up to yr100 and there are 50 pairs of columns.
If yr1 or yr2 is missing in x I want to fill the missing observation with, for example:
(5 / 2) * 3
Likewise for yr3 or yr4:
(30 / 4) * 2
where 5 (or 30) is the element in the column in x that is not missing for a given pair of elements. The values 2 and 3 for the first example (and the values 4 and 2 in the second example) are the corresponding elements in the weights data frame for a given pair of elements in the x data frame. If both elements in a pair are missing in x I want to leave them as missing.
Here is R code that does the above operations using nested for loops. However, there are 2000 or 3000 rows in my actual data set and the nested for loops have been running now for >10 hours.
for(i in 1: (ncol(x)/2)) {
for(j in 1: nrow(x)) {
if( is.na(x[j,(1 + (i-1)*2)]) & !is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 0)] = (x[j,(1 + ((i-1)*2 + 1))] / weights[j,(1 + ((i-1)*2 + 1))]) * weights[j,(1 + (i-1)*2 + 0)]
if(!is.na(x[j,(1 + (i-1)*2)]) & is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 1)] = (x[j,(1 + ((i-1)*2 + 0))] / weights[j,(1 + ((i-1)*2 + 0))]) * weights[j,(1 + (i-1)*2 + 1)]
if( is.na(x[j,(1 + (i-1)*2)]) & is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 0)] = NA
if( is.na(x[j,(1 + (i-1)*2)]) & is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 1)] = NA
}
}
I have realized that the third and fourth if statements probably are not necessary. Perhaps the time to run this code will be reduced substantially if I simply remove those two if statements.
However, I also came up with the following alternative solution that uses reshape instead of nested for loops:
n.years <- 4
x2 <- reshape(x , direction="long", varying = list(seq(1,(n.years-1),2), seq(2,n.years,2)), v.names = c("yr1", "yr2"), times = c("t1", "t2"))
wt2 <- reshape(weights, direction="long", varying = list(seq(1,(n.years-1),2), seq(2,n.years,2)), v.names = c("yr1", "yr2"), times = c("t1", "t2"))
x2$yr1 <- ifelse(is.na(x2$yr1), (x2$yr2 / wt2$yr2) * wt2$yr1, x2$yr1)
x2$yr2 <- ifelse(is.na(x2$yr2), (x2$yr1 / wt2$yr1) * wt2$yr2, x2$yr2)
x3 <- reshape(x2, direction="wide", varying = list(seq(1,3,2), seq(2,4,2)), v.names = c("yr1", "yr2"), times = c("t1", "t2"))
x3
Before I shut the current R session down and try one of the above approaches please suggest possible alternatives that might be more efficient. I have used microbenchmark a little bit, but have not yet attempted to do so here, partially because writing a function for each possible solution is a little intimidating to me. I also tried coming up with a solution using the apply family of functions, but could not come up with one.
My reshape solution was derived from this question:
Reshaping a data frame with more than one measure variable
In addition to computation time I am also concerned about possible memory exhaustion.
I try hard to stick with base R, but will consider using other options to obtain desired output. Thank you for any suggestions.
Does this work for you?
Note that I didn't use your replacement function because I found it a bit confusing, so you will have to fix how you replace the yr1 and yr2 variables with your formula. Also, you'll probably want to reshape the result if you need to be able to attach it to your original dataframe.
newx <-
reshape(x, direction="long",varying=list(1:50*2-1,1:50*2), v.names=c("v1","v2"))
newwt <-
reshape(weights, direction="long",varying=list(1:50*2-1,1:50*2), v.names=c("w1","w2"))
condwtmean <- function(x,y,wtx,wty){
if(xor(is.na(x),is.na(y))){
if(is.na(x))
x <- y # replacement function
if(is.na(y))
y <- x # replacement function
return(weighted.mean(c(x,y),c(wtx,wty)))
}
else if(!is.na(x) & !is.na(y))
return(weighted.mean(c(x,y),c(wtx,wty)))
else
return(NA)
}
newx$wtmean <- mapply(condwtmean, newx$v1, newx$v2, newwt$w1, newwt$w2)
Thomas's answer is much better than any of the three approaches I tried. Here I compare the four approaches with microbenchmark. I have not yet tried Thomas's answer with the actual data. My original nested for-loops approach is still running after 22 hours.
Unit: milliseconds
expr min lq median uq max neval
fn.1(x, weights) 98.69133 99.47574 100.5313 101.7315 108.8757 20
fn.2(x, weights) 755.51583 758.12175 762.3775 776.0558 801.9615 20
fn.3(x, weights) 564.21423 567.98822 568.5322 571.0975 575.1809 20
fn.4(x, weights) 367.05862 370.52657 371.7439 373.7367 395.0423 20
#########################################################################################
# create data
set.seed(1234)
n.rows <- 40
n.cols <- 40
n.sample <- n.rows * n.cols
x <- sample(20, n.sample, replace=TRUE)
x.NA <- sample(n.rows*n.cols, 10*(n.sample / n.rows), replace=FALSE)
x[x.NA] <- NA
x <- as.data.frame(matrix(x, nrow = n.rows))
weights <- sample(4, n.sample, replace=TRUE)
weights <- as.data.frame(matrix(weights, nrow = n.rows))
weights
#########################################################################################
# Thomas's function
fn.1 <- function(x, weights){
newx <- reshape(x, direction="long", varying = list(seq(1,(n.cols-1),2), seq(2,n.cols,2)), v.names=c("v1", "v2"))
newwt <- reshape(weights, direction="long", varying = list(seq(1,(n.cols-1),2), seq(2,n.cols,2)), v.names=c("w1", "w2"))
condwtmean <- function(x,y,wtx,wty){
if(xor(is.na(x),is.na(y))){
if(is.na(x))
x <- (y / wty) * wtx # replacement function
if(is.na(y))
y <- (x / wtx) * wty # replacement function
return(weighted.mean(c(x,y),c(wtx,wty)))
}
else if(!is.na(x) & !is.na(y))
return(weighted.mean(c(x,y),c(wtx,wty)))
else
return(NA)
}
newx$wtmean <- mapply(condwtmean, newx$v1, newx$v2, newwt$w1, newwt$w2)
newx2 <- reshape(newx[,c(1,4:5)], v.names = "wtmean", timevar = "time", direction = "wide")
newx2 <- newx2[,2:(n.cols/2+1)]
names(newx2) <- paste('X', 1:(n.cols/2), sep = "")
return(newx2)
}
fn.1.output <- fn.1(x, weights)
#########################################################################################
# nested for-loops with 4 if statements
fn.2 <- function(x, weights){
for(i in 1: (ncol(x)/2)) {
for(j in 1: nrow(x)) {
if( is.na(x[j,(1 + (i-1)*2)]) & !is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 0)] = (x[j,(1 + ((i-1)*2 + 1))] / weights[j,(1 + ((i-1)*2 + 1))]) * weights[j,(1 + (i-1)*2 + 0)]
if(!is.na(x[j,(1 + (i-1)*2)]) & is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 1)] = (x[j,(1 + ((i-1)*2 + 0))] / weights[j,(1 + ((i-1)*2 + 0))]) * weights[j,(1 + (i-1)*2 + 1)]
if( is.na(x[j,(1 + (i-1)*2)]) & is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 0)] = NA
if( is.na(x[j,(1 + (i-1)*2)]) & is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 1)] = NA
}
}
x.weights = x * weights
numerator <- sapply(seq(1,ncol(x.weights),2), function(i) {
apply(x.weights[,c(i, i+1)], 1, sum, na.rm=T)
})
denominator <- sapply(seq(1,ncol(weights),2), function(i) {
apply(weights[,c(i, i+1)], 1, sum, na.rm=T)
})
weighted.x <- numerator/denominator
for(i in 1: (ncol(x)/2)) {
for(j in 1: nrow(x) ) {
if( is.na(x[j,(1 + (i-1)*2)]) & !is.na(x[j,(1 + (i-1)*2 + 1)])) weighted.x[j,i] = sum(c(x[j,(1 + ((i-1)*2))], x[j,(1 + ((i-1)*2 + 1))]), na.rm = TRUE)
if(!is.na(x[j,(1 + (i-1)*2)]) & is.na(x[j,(1 + (i-1)*2 + 1)])) weighted.x[j,i] = sum(c(x[j,(1 + ((i-1)*2))], x[j,(1 + ((i-1)*2 + 1))]), na.rm = TRUE)
if( is.na(x[j,(1 + (i-1)*2)]) & is.na(x[j,(1 + (i-1)*2 + 1)])) weighted.x[j,i] = NA
}
}
return(weighted.x)
}
fn.2.output <- fn.2(x, weights)
fn.2.output <- as.data.frame(fn.2.output)
names(fn.2.output) <- paste('X', 1:(n.cols/2), sep = "")
#########################################################################################
# nested for-loops with 2 if statements
fn.3 <- function(x, weights){
for(i in 1: (ncol(x)/2)) {
for(j in 1: nrow(x)) {
if( is.na(x[j,(1 + (i-1)*2)]) & !is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 0)] = (x[j,(1 + ((i-1)*2 + 1))] / weights[j,(1 + ((i-1)*2 + 1))]) * weights[j,(1 + (i-1)*2 + 0)]
if(!is.na(x[j,(1 + (i-1)*2)]) & is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 1)] = (x[j,(1 + ((i-1)*2 + 0))] / weights[j,(1 + ((i-1)*2 + 0))]) * weights[j,(1 + (i-1)*2 + 1)]
}
}
x.weights = x * weights
numerator <- sapply(seq(1,ncol(x.weights),2), function(i) {
apply(x.weights[,c(i, i+1)], 1, sum, na.rm=T)
})
denominator <- sapply(seq(1,ncol(weights),2), function(i) {
apply(weights[,c(i, i+1)], 1, sum, na.rm=T)
})
weighted.x <- numerator/denominator
for(i in 1: (ncol(x)/2)) {
for(j in 1: nrow(x) ) {
if( is.na(x[j,(1 + (i-1)*2)]) & !is.na(x[j,(1 + (i-1)*2 + 1)])) weighted.x[j,i] = sum(c(x[j,(1 + ((i-1)*2))], x[j,(1 + ((i-1)*2 + 1))]), na.rm = TRUE)
if(!is.na(x[j,(1 + (i-1)*2)]) & is.na(x[j,(1 + (i-1)*2 + 1)])) weighted.x[j,i] = sum(c(x[j,(1 + ((i-1)*2))], x[j,(1 + ((i-1)*2 + 1))]), na.rm = TRUE)
if( is.na(x[j,(1 + (i-1)*2)]) & is.na(x[j,(1 + (i-1)*2 + 1)])) weighted.x[j,i] = NA
}
}
return(weighted.x)
}
fn.3.output <- fn.3(x, weights)
fn.3.output <- as.data.frame(fn.3.output)
names(fn.3.output) <- paste('X', 1:(n.cols/2), sep = "")
#########################################################################################
# my reshape solution
fn.4 <- function(x, weights){
new.x <- reshape(x , direction="long", varying = list(seq(1,(n.cols-1),2), seq(2,n.cols,2)), v.names = c("v1", "v2"))
wt <- reshape(weights, direction="long", varying = list(seq(1,(n.cols-1),2), seq(2,n.cols,2)), v.names = c("w1", "w2"))
new.x$v1 <- ifelse(is.na(new.x$v1), (new.x$v2 / wt$w2) * wt$w1, new.x$v1)
new.x$v2 <- ifelse(is.na(new.x$v2), (new.x$v1 / wt$w1) * wt$w2, new.x$v2)
x2 <- reshape(new.x, direction="wide", varying = list(seq(1,3,2), seq(2,4,2)), v.names = c("v1", "v2"))
x <- x2[,2:(n.cols+1)]
x.weights = x * weights
numerator <- sapply(seq(1,ncol(x.weights),2), function(i) {
apply(x.weights[,c(i, i+1)], 1, sum, na.rm=T)
})
denominator <- sapply(seq(1,ncol(weights),2), function(i) {
apply(weights[,c(i, i+1)], 1, sum, na.rm=T)
})
weighted.x <- numerator/denominator
for(i in 1: (ncol(x)/2)) {
for(j in 1: nrow(x) ) {
if( is.na(x[j,(1 + (i-1)*2)]) & !is.na(x[j,(1 + (i-1)*2 + 1)])) weighted.x[j,i] = sum(c(x[j,(1 + ((i-1)*2))], x[j,(1 + ((i-1)*2 + 1))]), na.rm = TRUE)
if(!is.na(x[j,(1 + (i-1)*2)]) & is.na(x[j,(1 + (i-1)*2 + 1)])) weighted.x[j,i] = sum(c(x[j,(1 + ((i-1)*2))], x[j,(1 + ((i-1)*2 + 1))]), na.rm = TRUE)
if( is.na(x[j,(1 + (i-1)*2)]) & is.na(x[j,(1 + (i-1)*2 + 1)])) weighted.x[j,i] = NA
}
}
return(weighted.x)
}
fn.4.output <- fn.4(x, weights)
fn.4.output <- as.data.frame(fn.4.output)
names(fn.4.output) <- paste('X', 1:(n.cols/2), sep = "")
#########################################################################################
rownames(fn.1.output) <- NULL
rownames(fn.2.output) <- NULL
rownames(fn.3.output) <- NULL
rownames(fn.4.output) <- NULL
all.equal(fn.1.output, fn.2.output)
all.equal(fn.1.output, fn.3.output)
all.equal(fn.1.output, fn.4.output)
all.equal(fn.2.output, fn.3.output)
all.equal(fn.2.output, fn.4.output)
all.equal(fn.3.output, fn.4.output)
library(microbenchmark)
microbenchmark(fn.1(x, weights), fn.2(x, weights), fn.3(x, weights), fn.4(x, weights), times=20)
#########################################################################################

Resources