I am working with a matrix of 35 rows and 16 columns. I am trying to run a Bayesian multistate model, but something in my model code prevents it from running. When I run the code in R, I get the error message:
Error in checkForRemoteErrors(val) :
3 nodes produced errors; first error: RUNTIME ERROR:
Dimension mismatch in values supplied for betaA
Any help is appreciated and my code is below:
# psi = movement probability
# phi = apparent survival
# p = detection probability
# o = occurrence probability
# load libraries
library(jagsUI)
library(lattice)
library(coda)
library("R2WinBUGS")
library("R2jags")
library(zoo)
devtools::install_github("bstaton1/postpack")
# initializing functions####
known.state.ms <- function(ms, notseen){
#notseen: label for 'not seen'
state <- ms
state[state==notseen] <- NA
for (i in 1:dim(ms)[1]){
m <- min(which(!is.na(state[i,])))
state[i,m] <- NA
}
return(state)
}
#i = 1
#ch = CHY[i,]
#first = f[i]
z_inits = function(ch, first) {
nt = length(ch)
to_fill = which(ch == 4 & 1:nt >= first)
to_keep = which(ch != 4 & 1:nt >= first)
known = ch; known[to_fill] = NA
unknown = rep(NA, nt)
known_alive = rep(NA, nt)
unknown[to_fill] = 2
for (t in 1:nt) {
known_alive[t] = ifelse(any(!is.na(known[t:nt])), 1, 0)
}
last_known_alive = max(which(known_alive == 1))
if (last_known_alive < 16) {
dead = rep(0, nt)
for (t in (last_known_alive + 1):nt) {
dead[t] = sample(c(0,1), size = 1, prob = matrix(c(0.9, 0.1, 0, 1), 2,
2, byrow = T)[dead[t-1] + 1,])
}
unknown[dead == 1] = 4
}
unknown
}
# import data
dat <- read.csv("bass_encounter_history_0.csv")
covs <- read.csv("depth.csv")
depth = covs[,1]
histories <- unlist(lapply(dat$history, function(x) strsplit(x,split="")))
CH <- t(matrix(histories,nrow=16,ncol=35))
CH <- gsub("0",4,CH)
CH <- gsub("A",1,CH)
CH <- gsub("B",2,CH)
CH <- gsub("C",3,CH)
CH <- matrix(as.numeric(CH),nrow=35,ncol=16)
# Build the model ####
nind= nrow(CH)
n.occasions = ncol(CH)
f=c(1,1,1,2,2,2,2,2,3,3,3,4,4,4,4,4,4,1,1,1,3,1,1,1,3,3,3,3,5,3,5,6,6,6,6) # initial tagging week
jags_model = function() {
# -------------------------------------------------
# Parameters:
# phiA: survival probability at site A
# phiB: survival probability at site B
# phiC: survival probability at site C
# psiA[1,t]: probability of staying in site A
# psiA[2,t]: movement probability from site A to site B
# psiB[1,t]: movement probability from site B to site A
# psiB[2,t]: probability of staying in site B
# psiB[3,t]: movement probability from site B to site C
# psiC[1,t]: probability of staying in site C
# psiC[2,t]: movement probability from site C to site B
# betaA[i]: the effect of standardized flow on movement probabilities at site A
# betaB[i]: the effect of standardized flow on movement probabilities at site B
# betaC[i]: the effect of standardized flow on movement probabilities at site C
# wA,B,C: the variable weight of the betas, 1 = essential, 0 = insignificant
# pA: recapture probability at site A
# pB: recapture probability at site B
# pC: recapture probability at site C
# -------------------------------------------------
# States (S):
# 1 alive at A
# 2 alive at B
# 3 alive at C
# 4 dead
# Observations (O):
# 1 seen at A
# 2 seen at B
# 3 seen at C
# 4 not seen
# -------------------------------------------------
# Priors and constraints
# Survival and recapture: uniform
phiA ~ dunif(0, 1)
phiB ~ dunif(0, 1)
phiC ~ dunif(0, 1)
pA ~ dunif(0, 1)
pB ~ dunif(0, 1)
pC ~ dunif(0, 1)
wA ~ dbern(.5)
for(i in 1:3){
wB[i] ~ dbern(.5)
}
wC ~ dbern(.5)
for(t in 1:(n.occasions-1)){
logit(psiA[1,t]) <- muA + wA*betaA*x[t]
psiA[2,t] <- 1 - psiA[1,t]
logit(psiC[1,t]) <- muC + wC*betaC*x[t]
psiC[2,t] <- 1 - psiC[1,t]
for(i in 1:3){
b[i,t] <- exp(muB[i] + wB[i]*betaB[i]*x[t])
psiB[i,t] <- b[i,t]/sum(b[,t])
}
}
muA ~ dt(0, 1/1.566^2, 7.763)
muC ~ dt(0, 1/1.566^2, 7.763)
mean.psiA <- 1/(1+exp(-muA))
#it's not really the mean - it's the probability of staying in A at mean value of x (only b/c x is z- transformed)
mean.psiC <- 1/(1+exp(-muC))
betaA ~ dt(0, 1/1.566^2, 7.763)
betaC ~ dt(0, 1/1.566^2, 7.763)
for(i in 1:2){
muB[i] ~ dt(0, 1/1.566^2, 7.763)
betaB[i] ~ dt(0, 1/1.566^2, 7.763)
}
muB[3] <- 0
betaB[3] <- 0
# PREDICTED TRANSITION PROBS FOR PLOTTING
for(r in 1:n.depth){
for(i in 1:3){
pred.b[i,r] <- exp(muB[i] + wB[i]*betaB[i]*depthseq[r])
pred.psiB[i,r] <- pred.b[i,r]/sum(pred.b[,r])
}
logit(pred.psiA[1,r]) <- muA + wA*betaA*depthseq[r]
pred.psiA[2,r] <- 1 - pred.psiA[1,r]
logit(pred.psiC[1,r]) <- muC + wC*betaC*depthseq[r]
pred.psiC[2,r] <- 1 - pred.psiC[1,r]
}
# Define probabilities of state S(t+1) given S(t)
for (t in 1:(n.occasions-1)){
ps[1,t,1] <- phiA * psiA[1,t]
ps[1,t,2] <- phiA * psiA[2,t]
ps[1,t,3] <- 0
ps[1,t,4] <- 1-phiA
ps[2,t,1] <- phiB * psiB[1,t]
ps[2,t,2] <- phiB * psiB[2,t]
ps[2,t,3] <- phiB * psiB[3,t]
ps[2,t,4] <- 1-phiB
ps[3,t,1] <- 0
ps[3,t,2] <- phiC * psiC[2,t]
ps[3,t,3] <- phiC * psiC[1,t] # switch these so the coefs talk about prob(stay in C)
ps[3,t,4] <- 1-phiC
ps[4,t,1] <- 0
ps[4,t,2] <- 0
ps[4,t,3] <- 0
ps[4,t,4] <- 1
# Define probabilities of O(t) given S(t)
po[1,t,1] <- pA
po[1,t,2] <- 0
po[1,t,3] <- 0
po[1,t,4] <- 1-pA
po[2,t,1] <- 0
po[2,t,2] <- pB
po[2,t,3] <- 0
po[2,t,4] <- 1-pB
po[3,t,1] <- 0
po[3,t,2] <- 0
po[3,t,3] <- pC
po[3,t,4] <- 1-pC
po[4,t,1] <- 0
po[4,t,2] <- 0
po[4,t,3] <- 0
po[4,t,4] <- 1
} #t
# Likelihood
for (i in 1:nind){
# Define latent state at first capture
z[i,f[i]] <- y[i,f[i]]
for (t in (f[i]+1):n.occasions){
# State process: draw S(t) given S(t-1)
z[i,t] ~ dcat(ps[z[i,t-1], t-1,])
# Observation process: draw O(t) given S(t)
y[i,t] ~ dcat(po[z[i,t], t-1,])
} #t
} #i
}
jags_file = "invasiondepthmodel.txt"
postpack::write_model(jags_model, jags_file)
# Configure the model settings and initial values ####
depthseq = seq(min(depth),max(depth),length.out=100)
n.depth=length(depthseq)
#compile jags data object
jags_data <- list(y = CH, x= depth, depthseq=depthseq, n.depth=n.depth, f = f,
n.occasions = n.occasions, nind = nind, z = known.state.ms(CH, 4))
#specify initial values
jags_inits <- function(i){list(
muA = runif(1,-1,1),
muB = c(runif(2,-1,1),NA),
muC = runif(1,-1,1),
wA= rbinom(3, 1, 0.5),
wB= rbinom(3, 1, 0.5),
wC= rbinom(3, 1, 0.5),
betaA = runif(2,-1,1),
betaB = c(runif(2,-1,1),NA),
betaC = runif(2,-1,1),
phiA = runif(1, 0.5, 1),
phiB = runif(1, 0.5, 1),
phiC = runif(1, 0.5, 1),
pA = runif(1, 0.5, 1),
pB = runif(1, 0.5, 1),
pC = runif(1, 0.5, 1),
z = t(sapply(1:nind, function(i) z_inits(CH[i,], f[i])))
)
}
# Parameters monitored
jags_params <- c("phiA","phiB","phiC",
"psiA","psiB","psiC",
"wA","wB","wC",
"muA","muB","muC",
"betaA","betaB","betaC",
"pA","pB","pC",
"pred.psiA","pred.psiB","pred.psiC")
jags_dims = c(
na = 10000, # number of samples in adapting phase
ni = 40000, # number of post-burn-in samples per chain
nb = 40000, # number of burn-in samples
nt = 20, # thinning rate
nc = 3, # number of chains,
parallel = T # run chains in parallel?
); with(as.list(jags_dims), ni/nt * nc)
inits = lapply(1:jags_dims["nc"], jags_inits)
# Run the model in JAGS #####
starttime = Sys.time()
cat("MCMC Started: ", format(starttime), "\n")
post = jagsUI::jags.basic(
data = jags_data,
model.file = jags_file,
inits = inits,
parameters.to.save = jags_params,
n.adapt = jags_dims["na"],
n.iter = sum(jags_dims[c("ni", "nb")]),
n.thin = jags_dims["nt"],
n.burnin = jags_dims["nb"],
n.chains = jags_dims["nc"],
parallel = jags_dims["parallel"],
verbose = F
)
I was expecting the dimensions of the matrix and the betaA values to match up, but it seems they do not.
In the model, betaA is a scalar: the model code uses betaA*x[t] and the prior is betaA ~ dt(0, 1/1.566^2, 7.763), both of which indicate a single value. In the initial values, however, it is a vector of length 2: betaA = runif(2,-1,1). You either need to define betaA as a vector in the model or pass a single value in the inits.
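For example, keeping the rest of your inits function the same and making the scalar parameters scalar would look something like this (untested sketch; note that wA, wC, and betaC appear to have the same scalar/vector mismatch as betaA):
jags_inits <- function(i){list(
muA = runif(1,-1,1),
muB = c(runif(2,-1,1),NA),   # muB[3] is fixed at 0 in the model
muC = runif(1,-1,1),
wA = rbinom(1, 1, 0.5),      # scalar in the model
wB = rbinom(3, 1, 0.5),      # length 3 in the model
wC = rbinom(1, 1, 0.5),      # scalar in the model
betaA = runif(1,-1,1),       # scalar in the model
betaB = c(runif(2,-1,1),NA), # betaB[3] is fixed at 0 in the model
betaC = runif(1,-1,1),       # scalar in the model
phiA = runif(1, 0.5, 1),
phiB = runif(1, 0.5, 1),
phiC = runif(1, 0.5, 1),
pA = runif(1, 0.5, 1),
pB = runif(1, 0.5, 1),
pC = runif(1, 0.5, 1),
z = t(sapply(1:nind, function(i) z_inits(CH[i,], f[i])))
)
}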
I have a training df with 2 columns like
a b
1 1000 20
2 1008 13
...
n ... ...
Now, I need to find a 95% CI for the estimate of 'b' at a specific 'a' value, using a 'k' value of my choice, and then compare the CI to the results for other values of 'k'. My question is: how can I perform a bootstrap for this with 1000 bootstrap replicates? I am required to use a kNN model fitted to the training data with kernel = 'gaussian', and k can only be in the range 1-20.
I have found that the best k for this model is k = 5, and I had a go at the bootstrap, but it doesn't work:
library(kknn)
library(boot)
boot.kn = function(formula, data, indices)
{
# Create a bootstrapped version
d = data[indices,]
# Fit a model for bs
fit.kn = fitted(train.kknn(formula,data, kernel= "gaussian", ks = 5))
# Do I even need this complicated block
target = as.character(fit.kn$terms[[2]])
rv = my.pred.stats(fit.kn, d[,target])
return(rv)
}
bs = boot(data=df, statistic=boot.kn, R=1000, formula=b ~ a)
boot.ci(bs,conf=0.95,type="bca")
Please let me know if you need more information or if I'm not being clear enough. Thank you.
Here is a way to regress b on a with the k-nearest neighbors algorithm.
First, a data set. This is a subset of the iris data set, keeping the first two columns. One row is removed to later be the new data.
i <- which(iris$Sepal.Length == 5.3)
df1 <- iris[-i, 1:2]
newdata <- iris[i, 1:2]
names(df1) <- c("a", "b")
names(newdata) <- c("a", "b")
Now load the packages to be used and determine the optimal value for k with package kknn.
library(caret)
library(kknn)
library(boot)
fit <- kknn::train.kknn(
formula = b ~ a,
data = df1,
kmax = 15,
kernel = "gaussian",
distance = 1
)
k <- fit$best.parameters$k
k
#[1] 9
Now bootstrap the prediction for the new point a = 5.3.
boot.kn <- function(data, indices, formula, newdata, k){
  d <- data[indices, ]                     # bootstrap resample
  fit <- knnreg(formula, data = d, k = k)  # refit kNN on the resample with the chosen k
  predict(fit, newdata = newdata)          # prediction for the new a value
}
set.seed(2021)
R <- 1e4
bs <- boot(df1, boot.kn, R = R, formula = b ~ a, newdata = newdata, k = k)
ci <- boot.ci(bs, level = 0.95, type = "bca")
ci
#BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#Based on 10000 bootstrap replicates
#
#CALL :
#boot.ci(boot.out = bs, type = "bca", level = 0.95)
#
#Intervals :
#Level BCa
#95% ( 3.177, 3.740 )
#Calculations and Intervals on Original Scale
Plot the results.
old_par <- par(mfrow = c(2, 1),
oma = c(5, 4, 0, 0) + 0.1,
mar = c(1, 1, 1, 1) + 0.1)
hist(bs$t, main = "Histogram of bootstrap values")
abline(v = 3.7, col = "red")
abline(v = mean(bs$t), col = "blue")
abline(v = ci$bca[4:5], col = "blue", lty = "dashed")
plot(b ~ a, df1)
points(5.3, 3.7, col = "red", pch = 19)
points(5.3, mean(bs$t), col = "blue", pch = 19)
arrows(x0 = 5.3, y0 = ci$bca[4],
x1 = 5.3, y1 = ci$bca[5],
col = "blue", angle = 90, code = 3)
par(old_par)
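Since the question also asks to compare the interval across different values of k, a possible extension is to repeat the bootstrap for several k values (a sketch, reusing the same boot.kn; the k values chosen here are arbitrary):
# Sketch: bootstrap CIs for a few candidate k values
ks <- c(3, 5, 9, 15)
ci_by_k <- sapply(ks, function(kk) {
  b <- boot(df1, boot.kn, R = 1000, formula = b ~ a, newdata = newdata, k = kk)
  boot.ci(b, level = 0.95, type = "bca")$bca[4:5]  # lower and upper BCa limits
})
dimnames(ci_by_k) <- list(c("lower", "upper"), paste0("k=", ks))
ci_by_k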
I've been trying to estimate VAR models using Monte Carlo Simulation. I have 3 endogenous variables. I need some guidance regarding this.
First of all, I want to add an outlier as a percentage of the sample size.
Second (a second simulation for the same model), I want to use a multivariate contaminated normal error distribution, something like 0.9 N(0, I) + 0.1 N((0,0,0)', diag(100, 100, 100)), instead of the outlier.
Could you tell me how to do these?
Thank you.
# packages for VECM() and for the MonteCarlo()/MakeTable() functions used below
library(tsDyn)
library(MonteCarlo)
RR <- function(n, out){
# n is number of observations
k <- 3 # Number of endogenous variables
p <- 2 # Number of lags
# add outlier
n[1]<- n[1]+out
# Generate coefficient matrices
B1 <- matrix(c(.1, .3, .4, .1, -.2, -.3, .03, .1, .1), k) # Coefficient matrix of lag 1
B2 <- matrix(c(0, .2, .1, .07, -.4, -.1, .5, 0, -.1), k) # Coefficient matrix of lag 2
M <- cbind(B1, B2) # Companion form of the coefficient matrices
# Generate series
DT <- matrix(0, k, n + 2*p) # Raw series with zeros
for (i in (p + 1):(n + 2*p)){ # Generate series with e ~ N(0,1)
DT[, i] <- B1%*%DT[, i-1] + B2%*%DT[, i-2] + rnorm(k, 0, 1)
}
DT <- ts(t(DT[, -(1:p)])) # Convert to time series format
#names <- c("V1", "V2", "V3") # Rename variables
colnames(DT) <- c("Y1", "Y2", "Y3")
#plot.ts(DT) # Plot the series
# estimate VECM
vecm1 <- VECM(DT, lag = 2, r = 2, include = "const", estim ="ML")
vecm2 <- VECM(DT, lag = 2, r = 1, include = "const", estim ="ML")
# mse
mse1 <- mean(vecm1$residuals^2)
mse2 <- mean(vecm2$residuals^2)
#param_list <- unname(param_list)
return(list("mse1" = mse1, "mse2" = mse2, "mse3" = mse3))
}
# defined the parameter grids(define the parameters ranges we want to run our function with)
n_grid = c(50, 80, 200, 400)
out_grid = c(0 ,5, 10)
# collect parameter grids in a list (to enter it into the Monte Carlo function)
prml = list("n" = n_grid, "out" = out_grid)
# run simulation
RRS <- MonteCarlo(func = RR, nrep = 1000, param_list = prml)
summary(RRS)
# make table:
rows = "n"
cols = "out"
MakeTable(output = RRS, rows = rows, cols = cols)
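For the contaminated normal part, a minimal sketch of how the errors might be drawn, assuming the mixture is meant as 0.9 N(0, I) + 0.1 N(0, 100 I) with the 100s read as per-component variances (so sd = 10), could replace rnorm(k, 0, 1) inside the data-generating loop:
# Hypothetical helper (not part of the original code): draw a k-dimensional error
# from the mixture 0.9*N(0, I) + 0.1*N(0, 100*I); sd = 10 assumes 100 is a variance.
r_contam <- function(k, eps = 0.1, sd_contam = 10) {
  if (runif(1) < eps) {
    rnorm(k, mean = 0, sd = sd_contam)  # contaminated component
  } else {
    rnorm(k, mean = 0, sd = 1)          # clean component
  }
}
# e.g. inside the loop:
# DT[, i] <- B1 %*% DT[, i-1] + B2 %*% DT[, i-2] + r_contam(k)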
I tried a Naive Bayes classifier to see if I can predict whether a person, given their age and estimated salary, would purchase a particular vehicle or not. The plot I got in the visualisation section does not look very smooth or clean, with white lines running across it. I'm assuming the graphics/resolution is the problem, but I am not sure.
This is a snippet of what the dataset looks like
Age EstimatedSalary Purchased
19 19000 0
35 20000 0
26 43000 0
27 57000 0
19 76000 0
27 58000 0
Here is the code
# Loading the data set
data <- read.csv(" *A csv sheet on people's age, salaries and whether or not they will purchase a certain vehicle* ")
data <- data[, 3:5]
attach(data)
# Encoding the dependent variable
data$Purchased <- factor(data$Purchased, levels = c(0, 1))
attach(data)
# Splitting the dataset
library(caTools)
set.seed(404)
split <- sample.split(Purchased, SplitRatio = 0.75)
train_set <- subset(data, split == T)
test_set <- subset(data, split == F)
# Feature scaling
train_set[-3] <- scale(train_set[-3])
test_set[-3] <- scale(test_set[-3])
# Training the model
library(e1071)
classifier <- naiveBayes(x = train_set[-3], y = train_set$Purchased)
# Predicting test results
y_pred <- predict(classifier, newdata = test_set[-3])
# Construct the confusion matrix
(cm <- table(test_set[, 3], y_pred))
Below is the code that I used to visualise the results
# Visualising the results
library(ElemStatLearn)
set <- test_set
x1 <- seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
x2 <- seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(x1, x2)
colnames(grid_set) <- c("Age", "EstimatedSalary")
y_grid <- predict(classifier, newdata = grid_set)
plot(set[, -3], main = "Naive Bayes: Test set", xlab = "Age", ylab = "EstimatedSalary", xlim = range(x1), ylim = range(x2))
contour(x1, x2, matrix(as.numeric(y_grid), length(x1), length(x2)), add = T)
points(grid_set, pch = ".", col = ifelse(y_grid == 1, "Springgreen3", "tomato"))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, "green4", "red3"))
Naive Bayes classifier plot on the test set predictions
I would like to know the reason for the white lines running up and down the plot, and why it does not look smooth.
So I figured out what was giving me the weird lines and the low-quality resolution. Adding the cex parameter to the points() call in the plotting code, with cex = 5, solved this.
Revised block of code
set <- test_set
x1 <- seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
x2 <- seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(x1, x2)
colnames(grid_set) <- c("Age", "EstimatedSalary")
y_grid <- predict(classifier, newdata = grid_set)
plot(set[, -3], main = "Naive Bayes: Test set", xlab = "Age", ylab = "EstimatedSalary", xlim = range(x1), ylim = range(x2))
contour(x1, x2, matrix(as.numeric(y_grid), length(x1), length(x2)), add = T)
points(grid_set, pch = ".", col = ifelse(y_grid == 1, "Springgreen3", "tomato"), cex = 5)
points(set, pch = 21, bg = ifelse(set[, 3] == 1, "green4", "red3"))
The revised line of code in the above block
points(grid_set, pch = ".", col = ifelse(y_grid == 1, "Springgreen3", "tomato"), cex = 5)
Be that as it may, I would still like to know the reason this happened, because the R documentation for these functions and their parameters was not that clear to me.
Would appreciate any help given!
I'm using the support vector machine from the e1071 package to classify my data and want to visualize how the machine actually does the classification. However, when using the plot.svm function, I get an error that I can't resolve.
Script:
library("e1071")
data <-read.table("2010223_11042_complete")
names(data) <- c("Class","V1", "V2")
model <- svm(Class~.,data, type = "C-classification", kernel = "linear")
plot(model,data,fill=TRUE, grid=200, svSymbol=4, dataSymbol=1, color.palette=terrain.colors)
Output:
plot(model,data,fill=TRUE, grid=200, svSymbol=4, dataSymbol=1, color.palette=terrain.colors)
Error in rect(0, levels[-length(levels)], 1, levels[-1L], col = col) :
cannot mix zero-length and non-zero-length coordinates
Traceback:
traceback()
4: rect(0, levels[-length(levels)], 1, levels[-1L], col = col)
3: filled.contour(xr, yr, matrix(as.numeric(preds), nr = length(xr),
byrow = TRUE), plot.axes = {
axis(1)
axis(2)
colind <- as.numeric(model.response(model.frame(x, data)))
dat1 <- data[-x$index, ]
dat2 <- data[x$index, ]
coltmp1 <- symbolPalette[colind[-x$index]]
coltmp2 <- symbolPalette[colind[x$index]]
points(formula, data = dat1, pch = dataSymbol, col = coltmp1)
points(formula, data = dat2, pch = svSymbol, col = coltmp2)
}, levels = 1:(length(levels(preds)) + 1), key.axes = axis(4,
1:(length(levels(preds))) + 0.5, labels = levels(preds),
las = 3), plot.title = title(main = "SVM classification plot",
xlab = names(lis)[2], ylab = names(lis)[1]), ...)
2: plot.svm(model, data, fill = TRUE, grid = 200, svSymbol = 4,
dataSymbol = 1, color.palette = terrain.colors)
1: plot(model, data, fill = TRUE, grid = 200, svSymbol = 4,
dataSymbol = 1, color.palette = terrain.colors)
Part of my (4488 lines long) data file:
-1 0 23.532
+1 1 61.1157
+1 1 61.1157
+1 1 61.1157
-1 1 179.03
-1 0 17.0865
-1 0 27.6201
-1 0 17.0865
-1 0 27.6201
-1 1 89.6398
-1 0 42.7418
-1 1 89.6398
Since I'm just starting with R, I have no idea what this means or how I should deal with it, nor did I find anything useful elsewhere.
Without being sure what exactly causes the problem, I would try to transform the Class column to a factor (so defining the type as C-classification will no longer be necessary) using something like this:
data$Class <- as.factor(data$Class)
or in one step:
model <- svm(as.factor(Class)~.,data, kernel = "linear")
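A minimal end-to-end sketch with the factor conversion in place (assuming the same file name and column order as in the question):
library(e1071)
# read the data and name the columns as in the question
data <- read.table("2010223_11042_complete")
names(data) <- c("Class", "V1", "V2")
# make the response a factor so svm() treats this as a classification problem
data$Class <- as.factor(data$Class)
model <- svm(Class ~ ., data, kernel = "linear")
plot(model, data, fill = TRUE, grid = 200, svSymbol = 4, dataSymbol = 1,
     color.palette = terrain.colors)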