Rolling Dummy Regression in R

I have a question concerning a regression analysis in R.
# Read input file
residual <- read.csv2("E:***Input-R_Renditen.csv", header=TRUE, sep=";")
# Firms (columns 2 to 11)
alist <- list()
for (a in 2:11){
  # Total length of the time series
  t <- 243
  tx <- t-59
  # Length of the regression window
  reglist <- list()
  for (i in 1:tx){
    j <- i+59
    c <- i+54  # (c is not used below)
    # Regression variables
    r <- residual[i:j,a]
    rm <- residual[i:j,12]
    smb <- residual[i:j,13]
    hml <- residual[i:j,14]
    rf <- residual[i:j,15]
    # Excess returns
    er <- r-rf
    erm <- rm-rf
    # Regression
    reg <- lm(er~erm+smb+hml)
    reglist[[i]] <- coef(reg)
  }
  alist[[a]] <- reglist
}
I want to insert a dummy/categorical variable into the regression. Call the dummy d; it should have the value 1 for observations i to j-6 and the value 0 for observations j-5 to j.
I cannot include d in the table that is read in, because the dummy differs for every regression window i. I tried it with ifelse, but I got an error that the length of d differs from that of the other regression variables.
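One way around this (a minimal sketch on my part, assuming the 60-observation windows from the loop above, so i:j always has length 60 and i to j-6 covers the first 54 observations): build d inside the inner loop with rep(), so it is recreated with the correct length for every window:
# Inside the inner loop, after er and erm are computed:
d <- c(rep(1, 54), rep(0, 6))  # 1 for i..j-6 (54 obs.), 0 for j-5..j (6 obs.)
reg <- lm(er ~ erm + smb + hml + d)
reglist[[i]] <- coef(reg)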

Related

How to create the sampling matrices for Sobol sensitivity analysis in R (package "sensitivity")

I would like to perform a Sobol sensitivity analysis in R.
The package "sensitivity" should allow me to do so, but I don't understand how to generate the sampling matrices (X1, X2). I have a model that runs outside of R. I have 6 parameters with uniform distributions.
In my textbook: N = (2k+2)*M; M = 2^b; b = [8,12] (new sampling method: Wu et al. 2012).
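As a quick sanity check of that formula with the values used further down (k = 6 parameters, b = 8):
k <- 6
b <- 8
M <- 2^b             # 256 base samples
N <- (2*k + 2) * M   # 3584 total model runs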
I had the feeling that I should create two sampling matrices, X1_{M,k} and X2_{M,k}, and feed the two to the sobol function.
The dimension of the final sampling matrix x$X is then (k+2)*M, because:
X <- rbind(X1, X2)
for (i in 1:k) {
  Xb <- X1
  Xb[, i] <- X2[, i]
  X <- rbind(X, Xb)
}
How should I conduct my sampling to get the right number of runs, (2*k+2)*M?
This script is for the old method, but does someone know if the new method is already implemented in the sensitivity package? Feel free to comment on this procedure.
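For what it's worth, a minimal sketch of how the run count could reach (2*k+2)*M (my assumption about the Wu et al. scheme, not code from the sensitivity package): in addition to the k blocks above, where X1 receives column i from X2, also build the k mirrored blocks where X2 receives column i from X1. Assuming X1 and X2 each have M rows:
X <- rbind(X1, X2)    # 2*M rows
for (i in 1:k) {
  Xb <- X1
  Xb[, i] <- X2[, i]
  X <- rbind(X, Xb)   # k blocks of M rows each
  Xc <- X2
  Xc[, i] <- X1[, i]
  X <- rbind(X, Xc)   # k mirrored blocks of M rows each
}
nrow(X)               # (2*k + 2) * M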
name <- c("a", "b", "c", "d", "e", "f")
vals <- list(list(var="a", dist="unif", params=list(min=0.1, max=1.5)),
             list(var="b", dist="unif", params=list(min=-0.3, max=0.4)),
             list(var="c", dist="unif", params=list(min=-0.3, max=0.3)),
             list(var="d", dist="unif", params=list(min=0, max=0.5)),
             list(var="e", dist="unif", params=list(min=2.4E-5, max=2.4E-3)),
             list(var="f", dist="unif", params=list(min=3E-5, max=3E-3)))
k <- 6
b <- 8
M <- 2^b
n <- 2*M
X1 <- makeMCSample(n, vals, p = 1)[, -1]  # drop the index column that makeMCSample adds
X2 <- makeMCSample(n, vals, p = 2)[, -1]
x <- sobol2007(model = NULL, X1, X2, nboot = 200)
If I understand correctly, I should provide a y for each x$X sampling combination.
Then I can use the function "tell", which will generate the Sobol' first-order indices as well as the total indices:
tell(x,y)
ggplot(x)
Supplemental R function SobolR
makeMCSample <- function(n, vals, p = 1) {
  # Packages to generate quasi-random sequences
  # and rearrange the data
  require(randtoolbox)
  require(plyr)
  # Generate a Sobol' sequence (a different seed for each of the two matrices)
  if (p == 2){
    sob <- sobol(n, length(vals), seed = 4321, scrambling = 1)
  } else {
    sob <- sobol(n, length(vals), seed = 1234, scrambling = 1)
  }
  # Fill a matrix with the values
  # inverted from uniform values to
  # distributions of choice
  samp <- matrix(rep(0, n*(length(vals)+1)), nrow=n)
  samp[,1] <- 1:n
  for (i in 1:length(vals)) {
    l <- vals[[i]]
    dist <- l$dist
    params <- l$params
    fname <- paste("q", dist, sep="")  # e.g. "qunif"
    samp[,i+1] <- do.call(fname, c(list(p=sob[,i]), params))
  }
  # Convert matrix to data frame and add labels
  samp <- as.data.frame(samp)
  names(samp) <- c("n", laply(vals, function(l) l$var))
  return(samp)
}
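A quick usage check of the function (with the vals list from above; randtoolbox and plyr must be installed):
head(makeMCSample(5, vals, p = 1))  # 5 rows: index column "n" plus columns a..f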
ref: Qiong-Li Wu, Paul-Henry Cournède, Amélie Mathieu, 2012, Efficient computational method for global sensitivity analysis and its application to tree growth modelling

storing data from a loop in a matrix

I would like to code a loop for cross-validation: computing the MSE for one- to four-step forecasts and storing the results in a matrix. The problem I get is that the columns for the 1- to 3-step forecasts get overwritten, so I end up with just the 4-step forecast in all columns. Can anybody help?
k <- 20
n <- length(xy)-1
h <- 4
start <- tsp(xy)[1]+k
j <- n-k
mseQ1 <- matrix(NA,j,h)
colnames(mseQ1) <- paste0('h=',1:h)
for(i in 1:j)
{
  xtrain <- window(xy, end=start+(i-1))
  xvalid <- window(xy, start=start+i, end=start+i)
  qualifiedETS <- ets(xtrain, alpha=NULL, beta=NULL, additive.only=TRUE, opt.crit="mse")
  fcastHW <- forecast(qualifiedETS, h=h)
  mseQ1[i,] <- ((fcastHW[['mean']]-xvalid)^2)
}
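A minimal sketch of one possible fix (an assumption on my part: xy is a ts object with frequency 1 and the forecast package is loaded). In the loop above, xvalid only ever contains a single observation, so every column of a row is a squared error against that same point. Comparing the h-step forecast path with the next h actual observations gives one squared error per horizon:
for(i in 1:(j-h+1))   # stop early so h validation points always exist
{
  xtrain <- window(xy, end=start+(i-1))
  xvalid <- window(xy, start=start+i, end=start+i+h-1)  # next h observations
  fit <- ets(xtrain, additive.only=TRUE, opt.crit="mse")
  fc <- forecast(fit, h=h)
  mseQ1[i,] <- (fc[['mean']]-xvalid)^2                  # one squared error per horizon
}
colMeans(mseQ1, na.rm=TRUE)  # cross-validated MSE per horizon (last h-1 rows stay NA)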

Plotting specific variable in runjags output

When plotting runjags output, how does one plot a single specific variable when many other variables have similar names? Providing a quoted variable name with the vars argument doesn't seem to do it (it still returns all partial matches).
Here is a simple reproducible example.
N <- 200
nobs <- 3
psi <- 0.35
p <- 0.45
z <- rbinom(n=N, size=1,prob=psi)
y <- rbinom(n=N, size=nobs,prob=p*z)
sink("model.txt")
cat("
model {
  for (i in 1:N){
    z[i] ~ dbern(psi)
    pz[i] <- z[i]*p
    y[i] ~ dbin(pz[i], nobs)
  } #i
  psi ~ dunif(0,1)
  p ~ dunif(0,1)
}
", fill = TRUE)
sink()
m <-list(y=y,N=N,nobs=nobs)
inits <- function(){list(psi=runif(1),p=runif(1),z=as.numeric(y>0))}
parameters <- c("p","psi")
ni <- 1000
nt <- 1
nb <- 200
nc <- 3
ad <- 100
library(runjags)
out <- run.jags(model="model.txt",monitor=parameters,data=m,n.chains=nc,inits=inits,burnin=nb,
sample=ni,adapt=ad,thin=nt,modules=c("glm","dic"),method="parallel")
windows(9,4)
plot(out,plot.type=c("trace","histogram"),vars="p",layout=c(1,2),new.window=FALSE)
It should be possible to double-quote variables to get an exact match, but this seems to be broken. It should also be possible to specify a logical vector for vars, but this seems to be broken for the plot method... how embarrassing. The following does work, though:
# Generate a logical vector to use with matching variable names:
variables <- extract(out, 'stochastic')
variables['psi'] <- FALSE
# Add summary statistics only for the specified variables and pre-draw plots:
out2 <- add.summary(out, vars=variables, plots=TRUE)
plot(out2, plot.type=c("trace","histogram"))
I will fix the other issues for the next release.
Matt

Non-Deterministic behaviour of svm{e1071}

I noticed that svm, when fed with decision.values=TRUE (plus a sigmoid to get probabilities), produces non-deterministic results when I permute the data frame under analysis. Does anyone have any idea why? Please try the code yourself:
install.packages("e1071")
library(e1071)
A <- cbind(rnorm(20,1,1), rnorm(20,1,1), rep(1,20))
B <- cbind(rnorm(20,9,1), rnorm(20,9,1), rep(0,20))
dataframe <- as.data.frame(rbind(A,B))
predc <- rep(0, length(dataframe[,1]))
K <- length(dataframe[1,])
permutator <- sample(nrow(dataframe))
dataframe$V3 <- factor(dataframe$V3)
dataframe <- dataframe[permutator, ]
# Leave-one-out loop: train on all rows except i, predict row i
for(i in 1:length(dataframe[,1])) {
  frm <- as.formula(object=paste("V", as.character(K), " ~ .", sep=""))
  r <- svm(formula=frm, data=(dataframe[-i,]))
  predicted <- predict(r, newdata=dataframe[i,], decision.values=TRUE)
  predc[i] <- sigmoid(attr(predicted, 'decision.values')[1])  # sigmoid() from e1071
}
plot(sort(predc))

Writing my own MLE command in R is causing issues

I am just getting into writing MLE commands in R that work and look similar to native R functions. In this attempt I am trying to do a simple MLE for
y = b0 + x*b1 + u
with
u ~ N(0, sd = s0 + z*s1)
However, I am having difficulty coding even such a simple command; I have written a similar command in Stata in a handful of lines.
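Spelled out, the log-likelihood being maximized is (my reading of the model above, with \(\phi\) the standard normal density):
\[
\ell(b_0, b_1, s_0, s_1) = \sum_{i=1}^{n} \left[ \log \phi\!\left( \frac{y_i - b_0 - b_1 x_i}{s_0 + s_1 z_i} \right) - \log\left( s_0 + s_1 z_i \right) \right]
\]
which is only defined where s0 + z*s1 > 0; that constraint is exactly what causes the NA issue described below.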
Here is the code I have written so far in R.
normalreg <- function (beta, sigma=NULL, data, beta0=NULL, sigma0=NULL,
                       con1 = T, con2 = T) {
  # If a formula for sigma is not specified,
  # assume it is the same as the formula for the beta.
  if (is.null(sigma)) sigma=beta
  # Grab the call expression
  mf <- match.call(expand.dots = FALSE)
  # Find the position of each argument
  m <- match(c("beta", "sigma", "data", "subset", "weights", "na.action",
               "offset"), names(mf), 0L)
  # Adjust names of mf
  mf <- mf[c(1L, m)]
  # Since I have two formulas I will call them both formula
  names(mf)[2:3] <- "formula"
  # Drop unused levels
  mf$drop.unused.levels <- TRUE
  # Divide mf into data1 and data2
  data1 <- data2 <- mf
  data1 <- mf[-3]
  data2 <- mf[-2]
  # Name the first element model.frame, which will be evaluated
  data1[[1L]] <- data2[[1L]] <- as.name("model.frame")
  data1 <- as.matrix(eval(data1, parent.frame()))
  data2 <- as.matrix(eval(data2, parent.frame()))
  y <- data1[,1]
  data1 <- data1[,-1]
  if (con1) data1 <- cbind(data1,1)
  data2 <- unlist(data2[,-1])
  if (con2) data2 <- cbind(data2,1)
  data1 <- as.matrix(data1) # Ensure our data is read as matrix
  data2 <- as.matrix(data2) # Ensure our data is read as matrix
  if (!is.null(beta0)) if (length(beta0)!=ncol(data1))
    stop("Length of beta0 must equal the number of ind. variables in the first equation")
  if (!is.null(sigma0)) if (length(sigma0)!=ncol(data2))
    stop("Length of sigma0 must equal the number of ind. variables in the second equation")
  # Set initial parameter estimates
  if (is.null(beta0)) beta0 <- rep(1, ncol(data1))
  if (is.null(sigma0)) sigma0 <- rep(1, ncol(data2))
  # Define the function to minimize (the negative log-likelihood)
  normMLE <- function(est=c(beta0,sigma0), data1=data1, data2=data2, y=y) {
    data1est <- as.matrix(est[1:ncol(data1)], nrow=ncol(data1))
    data2est <- as.matrix(est[(ncol(data1)+1):(ncol(data1)+ncol(data2))],
                          nrow=ncol(data2))
    # Normal density of the residuals (dnorm, not pnorm, gives the likelihood)
    ps <- dnorm(y-data1%*%data1est,
                sd=data2%*%data2est)
    # Estimate a vector of log likelihoods based on coefficient estimates
    llk <- log(ps)
    -sum(llk)
  }
  results <- optim(c(beta0,sigma0), normMLE, hessian=T,
                   data1=data1, data2=data2, y=y)
  results
}
x <-rnorm(10000)
z<-x^2
y <-x*2 + rnorm(10000, sd=2+z*2) + 10
normalreg(y~x, y~z)
At this point the biggest issue is finding an optimization routine that does not fail when some of the values return NA because the standard deviation goes negative. Any suggestions? Sorry for the huge amount of code.
Francis
I include a check to see if any of the standard deviations are less than or equal to 0 and return a likelihood of 0 if that is the case. Seems to work for me. You can figure out the details of wrapping it into your function.
# y = b0 + x*b1 + u
# u ~ N(0, sd = s0 + z*s1)
ll <- function(par, x, z, y){
  b0 <- par[1]
  b1 <- par[2]
  s0 <- par[3]
  s1 <- par[4]
  sds <- s0 + z*s1
  if(any(sds <= 0)){
    return(log(0))  # likelihood of 0 (-Inf on the log scale) when any sd is non-positive
  }
  preds <- b0 + x*b1
  sum(dnorm(y, preds, sds, log = TRUE))
}
n <- 100
b0 <- 10
b1 <- 2
s0 <- 2
s1 <- 2
x <- rnorm(n)
z <- x^2
y <- b0 + b1*x + rnorm(n, sd = s0 + s1*z)
optim(c(1,1,1,1), ll, x=x, z=z,y=y, control = list(fnscale = -1))
With that said, it probably wouldn't be a bad idea to parameterize the standard deviation in such a way that it is impossible for it to go negative...
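For instance, a minimal sketch of that reparameterization (a variant of the ll() above; note it changes the model to sd = exp(s0 + z*s1), so the scale parameters live on the log scale):
llpos <- function(par, x, z, y){
  b0 <- par[1]
  b1 <- par[2]
  s0 <- par[3]
  s1 <- par[4]
  sds <- exp(s0 + z*s1)  # exp() keeps every sd strictly positive
  preds <- b0 + x*b1
  sum(dnorm(y, preds, sds, log = TRUE))
}
optim(c(1,1,0,0), llpos, x=x, z=z, y=y, control = list(fnscale = -1))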
