I have a data set and want to essentially fit a linear model with a rolling time window, find the fitted values and calculate the errors in the estimate. I have functions which calculate the error and I have the start of the algorithm, but I keep getting null time series with the algorithm below. Can anybody spot a fix for it?
rollerOLS <- function(data, measure, predict, predictor){
error <- c()
m <- dim(data)[1]
for(i in 1:(floor(m/142)-10)){
data.new <- as.data.frame(data[c((1+(142*(i-1))):((i+9)*142)),])
data.pred <- as.data.frame(data[c((1+(142*(i+9))):((i+10)*142)-1),])
n <- dim(data.new)[1]
k <- dim(data.pred)[1]
x <- data.new[-1,predictor]
y <- data.new[-n, predict]
mod <- lm(y ~ x)
ts <- predict.lm(mod, newdata = data.frame(data.pred[, predictor]), interval="none")
actual <- data.pred[-k,predict]
error[i] <- measure(ts, actual)
}
return(mod)
}
Note that 142 is specific to my data set.
The problem was in the ts line and here is the fix.
rollerOLS <- function(data, measure, predict, predictor){
error <- c()
m <- dim(data)[1]
for(i in 1:(floor(m/142)-10)){
data.new <- as.data.frame(data[c((1+(142*(i-1))):((i+9)*142)),])
data.pred <- as.data.frame(data[c((1+(142*(i+9))):((i+10)*142)-1),])
n <- dim(data.new)[1]
k <- dim(data.pred)[1]
x <- data.new[-1,predictor]
y <- data.new[-n, predict]
mod <- lm(y ~ x)
ts <- mod$coefficients[1] + mod$coefficients[2]*data.pred[-1,predictor]
actual <- data.pred[-k,predict]
error[i] <- measure(ts, actual)
}
return(error)
}
Related
I'm trying to run the following function mentioned below using OptimParallel in R on a certain data set. The code is as follows:
install.packages("optimParallel")
install.packages('parallel')
library(parallel)
library(optimParallel)
library(doParallel)
library(data.table)
library(Rlab)
library(HDInterval)
library(mvtnorm)
library(matrixStats)
library(dplyr)
library(cold)
## Bolus data:
data("bolus")
d1 <- bolus
d1$group <- ifelse(d1$group == "2mg",1,0)
colnames(d1) <- c("index",'group',"time","y")
d2 <- d1 %>% select(index, y, group, time)
colnames(d2) <- c('index','y','x1','x2') ### Final data
## Modification of the objective function:
## Another approach:
dpd_poi <- function(x,fixed = c(rep(FALSE,5))){
params <- fixed
dpd_1 <- function(p){
params[!fixed] <- p
alpha <- params[1]
beta_0 <- params[2]
beta_1 <- params[3]
beta_2 <- params[4]
rho <- params[5]
add_pi <- function(d){
k <- beta_0+(d[3]*beta_1)+(d[4]*beta_2)
k1 <- exp(k) ## for Poisson regression
d <- cbind(d,k1)
}
dat_split <- split(x , f = x$index)
result <- lapply(dat_split, add_pi)
result <- rbindlist(result)
result <- as.data.frame(result)
colnames(result) <- c('index','y','x1','x2','lamb')
result_split <- split(result, f = result$index)
expression <- function(d){
bin <- as.data.frame(combn(d$y , 2))
pr <- as.data.frame(combn(d$lamb , 2))
## Evaluation of the probabilities:
f_jk <- function(u,v){
dummy_func <- function(x,y){
ppois(x, lambda = y)
}
dummy_func_1 <- function(x,y){
ppois(x-1, lambda = y)
}
k <- mapply(dummy_func,u,v)
k_1 <- mapply(dummy_func_1,u,v)
inv1 <- inverseCDF(as.matrix(k), pnorm)
inv2 <- inverseCDF(as.matrix(k_1), pnorm)
mean <- rep(0,2)
lower <- inv2
upper <- inv1
corr <- diag(2)
corr[lower.tri(corr)] <- rho
corr[upper.tri(corr)] <- rho
prob <- pmvnorm(lower = lower, upper = upper, mean = mean, corr = corr)
prob <- (1+(1/alpha))*(prob^alpha)
## First expression: (changes for Poisson regression)
lam <- as.vector(t(v))
v1 <- rpois(1000, lambda = lam[1])
v2 <- rpois(1000, lambda = lam[2])
all_possib <- as.data.frame(rbind(v1,v2))
new_func <- function(u){
k <- mapply(dummy_func,u,v)
k_1 <- mapply(dummy_func_1,u,v)
inv1_1 <- inverseCDF(as.matrix(k), pnorm)
inv2_1 <- inverseCDF(as.matrix(k_1), pnorm)
mean1 <- rep(0,2)
lower1 <- inv2_1
upper1 <- inv1_1
corr1 <- diag(2)
corr1[lower.tri(corr1)] <- rho
corr1[upper.tri(corr1)] <- rho
prob1 <- pmvnorm(lower = lower1, upper = upper1, mean = mean1, corr = corr1)
prob1 <- prob1^(alpha)
}
val <- apply(all_possib, 2, new_func)
val_s <- mean(val) ## approximation
return(val_s - prob)
}
final_res <- mapply(f_jk, bin, pr)
final_value <- sum(final_res)
}
u <- sapply(result_split,expression)
return(sum(u))
}
}
## run the objective function:
cl <- makeCluster(25)
setDefaultCluster(cl=cl)
clusterExport(cl,c('d2','val'))
clusterEvalQ(cl,c(library(data.table), library(Rlab),library(HDInterval),library(mvtnorm),library(matrixStats),library(dplyr),library(cold)))
val <- dpd_poi(d2, c(0.5,FALSE,FALSE,FALSE,FALSE))
optimParallel(par = c(beta_0 =1, beta_1 =0.1 ,beta_2 = 1,rho=0.2),fn = val ,method = "L-BFGS-B",lower = c(-10,-10,-10,0),upper = c(Inf,Inf,Inf,1))
stopCluster(cl)
After running for some time, it returns the following error:
checkForRemoteErrors(val)
9 nodes produced errors; first error: missing value where TRUE/FALSE needed
However, when I make a minor change in the objective function (pick 2 random numbers from rpois instead of 1000) and run the same code using optim, it converges and gives me a proper result. This is a Monte Carlo simulation and it does not make sense to draw so few Poisson variables. I have to use optimParllel, otherwise, it takes way too long to converge. I could also run this code using simulated data.
I'm unable to figure out where the issue truly lies. I truly appreciate any help in this regard.
The below code perfectly provides my gmm estimates for 216 rolling windows.
gmm_list <- lapply(1:(length(ALLX$DE)-24), function(i) {
tmp <- ALLX[i:(i+23),]
DE <- as.matrix(as.numeric(tmp$DE))
p <- nrow(DE)
rmrf_local <- as.matrix(as.numeric(tmp$rmrf_local))
SMB_L <- as.matrix(as.numeric(tmp$SMB_L))
HML_L <- as.matrix(as.numeric(tmp$HML_L))
MOM_L <- as.matrix(as.numeric(tmp$MOM_L))
bond_L <- as.matrix(as.numeric(tmp$bond_L))
h <- cbind(rmrf_local,SMB_L,HML_L,MOM_L,bond_L)
gmm(DE ~ rmrf_local+SMB_L+HML_L+bond_L, x=h)
})
I'm able to extract the coef (estimate, SE, t_value and P_value) with
lapply(gmm_list, function(x) coef(summary(x))[,1:4])
However, i am unable to extract the adj.Rsquared.
I used the code below but i have NULL, and any idea??
> sapply(gmm_list, function(x) summary(x) $r.squared)
NULL
I am modeling the population change in a food web of species, using ODE and deSolve in R. obviously the populations should not be less than zero. therefore I have added an event function and run it as below. although the answers change from when I did nlt used event function, but it still producds negative values. What is wrong?
#using events in a function to distinguish and address the negative abundances
eventfun <- function(t, y, parms){
y[which(y<0)] <- 0
return(y)
}
# =============================== main code
max.time = 100
start.time = 50
initials <- c(N, R)
#parms <- list(webs=webs, a=a, b=b, h=h, m=m, basals=basals, mu=mu, Y=Y, K=K, no.species=no.species, flow=flow,S=S, neighs=neighs$neighs.per, dispers.maps=dispers.maps)
temp.abund <- ode(y=initials, func=solve.model, times=0:max.time, parms=parms, events = list(func = eventfun, time = 0:max.time))
and here is the ODE function(if it helps in finding the problem):
solve.model <- function(t, y, parms){
y <- ifelse(y<1e-6, 0, y)
with(parms,{
# return from vector form into matrix form for calculations
(R <- as.matrix(y[(max(no.species)*length(no.species)+1):length(y)]))
(N <- matrix(y[1:(max(no.species)*length(no.species))], ncol=length(no.species)))
dy1 <- matrix(nrow=max(no.species), ncol=length(no.species))
dy2 <- matrix(nrow=length(no.species), ncol=1)
no.webs <- length(no.species)
for (i in 1:no.webs){
species <- no.species[i]
(abundance <- N[1:species,i])
adj <- as.matrix(webs[[i]])
a.temp <- a[1:species, 1:species]*adj
b.temp <- b[1:species, 1:species]*adj
h.temp <- h[1:species, 1:species]*adj
(sum.over.preys <- abundance%*%(a.temp*h.temp))
(sum.over.predators <- (a.temp*h.temp)%*%abundance)
#Calculating growth of basal
(basal.growth <- basals[,i]*N[,i]*(mu*R[i]/(K+R[i])-m))
# Calculating growth for non-basal species D
no.basal <- rep(1,len=species)-basals[1:species]
predator.growth<- rep(0, max(no.species))
(predator.growth[1:species] <- ((abundance%*%(a.temp*b.temp))/(1+sum.over.preys)-m*no.basal)*abundance)
predation <- rep(0, max(no.species))
(predation[1:species] <- (((a.temp*b.temp)%*%abundance)/t(1+sum.over.preys))*abundance)
(pop <- basal.growth + predator.growth - predation)
dy1[,i] <- pop
dy2[i] <- 0.0005 #to consider a nearly constant value for the resource
}
#Calculating dispersals .they can be easily replaced
# by adjacency maps of connections between food webs arbitrarily!
disp.left <- dy1*d*dispers.maps$left.immig
disp.left <- disp.left[,neighs[,2]]
disp.right <- dy1*d*dispers.maps$right.immig
disp.right <- disp.right[,neighs[,3]]
emig <- dy1*d*dispers.maps$emigration
mortality <- m*dy1
dy1 <- dy1+disp.left+disp.right-emig
return(list(c(dy1, dy2)))
})
}
thank you so much for your help
I have had success using a similar event function defined like this:
eventfun <- function(t, y, parms){
with(as.list(y), {
y[y < 1e-6] <- 0
return(y)
})
}
I am using a similar event function to the one posted by jjborrelli. I wanted to note that for me it is still showing the ode function returning negative values. However, when ode goes to calculate the next step, it is using 0, and not the negative value shown for the current step, so you can basically ignore the negative values and replace with zeros at the end of the simulation.
I'm writing function used for forecasting sales on the base of trend and seasonality. I use dummy variables to represent seasonality and time variable for trend. Here is the code:
forecast<-function(data, time, fn) {
n <- length(data)
seasonal <- factor(cycle(data))
new_data <- seq(from=cycle(data)[n]+1, length=fn, by=1)
new_seasonal <- factor(new_data)
trend <- rep(0, n)
new_trend <- rep(0, n)
if (time > 0) {
trend <- vector()
new_trend <- vector()
ttrend <- seq(from=1, to=n, by=1)
tnew_trend <- seq(from=n+1, length=fn, by=1)
for(i in 1:time) {
trend <- cbind(trend, ttrend^i)
new_trend <- cbind(new_trend, tnew_trend^i)
}
model_trend <- lm(data ~ seasonal + trend)
} else {
model_trend <- lm(data ~ seasonal)
}
df <- data.frame(new_seasonal, new_trend)
p <- predict(model_trend, df)
}
forecast(data = dane.ts[,"SALES"], time=2, fn=5)
However I get warning:
Warning message:
'newdata' had 5 rows but variable(s) found have 104 rows
And it seems that sth is wrong with this function. I would appreciate any help.
I am just really getting into trying to write MLE commands in R that function and look similar to native R functions. In this attempt I am trying to do a simple MLE with
y=b0 + x*b1 + u
and
u~N(0,sd=s0 + z*s1)
However, even such a simple command I am having difficulty coding. I have written a similar command in Stata in a handful of lines
Here is the code I have written so far in R.
normalreg <- function (beta, sigma=NULL, data, beta0=NULL, sigma0=NULL,
con1 = T, con2 = T) {
# If a formula for sigma is not specified
# assume it is the same as the formula for the beta.
if (is.null(sigma)) sigma=beta
# Grab the call expression
mf <- match.call(expand.dots = FALSE)
# Find the position of each argument
m <- match(c("beta", "sigma", "data", "subset", "weights", "na.action",
"offset"), names(mf), 0L)
# Adjust names of mf
mf <- mf[c(1L, m)]
# Since I have two formulas I will call them both formula
names(mf)[2:3] <- "formula"
# Drop unused levels
mf$drop.unused.levels <- TRUE
# Divide mf into data1 and data2
data1 <- data2 <- mf
data1 <- mf[-3]
data2 <- mf[-2]
# Name the first elements model.frame which will be
data1[[1L]] <- data2[[1L]] <- as.name("model.frame")
data1 <- as.matrix(eval(data1, parent.frame()))
data2 <- as.matrix(eval(data2, parent.frame()))
y <- data1[,1]
data1 <- data1[,-1]
if (con1) data1 <- cbind(data1,1)
data2 <- unlist(data2[,-1])
if (con2) data2 <- cbind(data2,1)
data1 <- as.matrix(data1) # Ensure our data is read as matrix
data2 <- as.matrix(data2) # Ensure our data is read as matrix
if (!is.null(beta0)) if (length(beta0)!=ncol(data1))
stop("Length of beta0 need equal the number of ind. data2iables in the first equation")
if (!is.null(sigma0)) if (length(sigma0)!=ncol(data2))
stop("Length of beta0 need equal the number of ind. data2iables in the second equation")
# Set initial parameter estimates
if (is.null(beta0)) beta0 <- rep(1, ncol(data1))
if (is.null(sigma0)) sigma0 <- rep(1, ncol(data2))
# Define the maximization function
normMLE <- function(est=c(beta0,sigma0), data1=data1, data2=data2, y=y) {
data1est <- as.matrix(est[1:ncol(data1)], nrow=ncol(data1))
data2est <- as.matrix(est[(ncol(data1)+1):(ncol(data1)+ncol(data2))],
nrow=ncol(data1))
ps <-pnorm(y-data1%*%data1est,
sd=data2%*%data2est)
# Estimate a vector of log likelihoods based on coefficient estimates
llk <- log(ps)
-sum(llk)
}
results <- optim(c(beta0,sigma0), normMLE, hessian=T,
data1=data1, data2=data2, y=y)
results
}
x <-rnorm(10000)
z<-x^2
y <-x*2 + rnorm(10000, sd=2+z*2) + 10
normalreg(y~x, y~z)
At this point the biggest issue is finding an optimization routine that does not fail when the some of the values return NA when the standard deviation goes negative. Any suggestions? Sorry for the huge amount of code.
Francis
I include a check to see if any of the standard deviations are less than or equal to 0 and return a likelihood of 0 if that is the case. Seems to work for me. You can figure out the details of wrapping it into your function.
#y=b0 + x*b1 + u
#u~N(0,sd=s0 + z*s1)
ll <- function(par, x, z, y){
b0 <- par[1]
b1 <- par[2]
s0 <- par[3]
s1 <- par[4]
sds <- s0 + z*s1
if(any(sds <= 0)){
return(log(0))
}
preds <- b0 + x*b1
sum(dnorm(y, preds, sds, log = TRUE))
}
n <- 100
b0 <- 10
b1 <- 2
s0 <- 2
s1 <- 2
x <- rnorm(n)
z <- x^2
y <- b0 + b1*x + rnorm(n, sd = s0 + s1*z)
optim(c(1,1,1,1), ll, x=x, z=z,y=y, control = list(fnscale = -1))
With that said it probably wouldn't be a bad idea to parameterize the standard deviation in such a way that it is impossible to go negative...