How to simulate PCA data in R?

I am trying to simulate PCA data as follows:
library(mvtnorm) # for rmvnorm()
q <- 5 # no. of PCs
p <- 20 # no. of variables
n <- 2000 # no. of individuals
eps <- 0.05 # error variance
# Eigenvalues
Sig <- seq(3, 1, length.out = q)^2
Lambda <- diag(Sig)
# Matrix of principal components
H <- rmvnorm(n = n, mean = rep(0, q), sigma = Lambda)
# Add Gaussian noise
E <- matrix(rnorm(n*p, sd = sqrt(eps)), ncol = p)
# Data matrix (Amat is the p x q loadings matrix, still to be defined)
Y <- H %*% t(Amat) + E
# Perform PCA
summary(m1 <- prcomp(Y, scale. = TRUE)) # and so on...
However, I have no idea how to create the loadings matrix Amat in a meaningful way.
Thanks for any help!
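For what it's worth, one common way to build such a loadings matrix (a sketch on my part, not from the original post) is to make its columns orthonormal, e.g. by taking the Q factor of the QR decomposition of a random Gaussian matrix:
# Sketch: orthonormal p x q loadings via QR of a random matrix
set.seed(1)
Amat <- qr.Q(qr(matrix(rnorm(p * q), nrow = p, ncol = q)))
round(crossprod(Amat), 10) # ~ identity: the columns are orthonormal
Y <- H %*% t(Amat) + E     # the data matrix from the question can now be formed
With orthonormal columns, cov(Y) = Amat %*% Lambda %*% t(Amat) + eps * I, so the first q eigenvalues of the (unscaled) sample covariance of Y should be close to Sig + eps.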

This does not use the same structure as the OP, but it simulates a PCA with 4 different groups (which could be species), each with 3 "traits" (each trait having a different mean and SD, based for example on biological data found in the literature).
set.seed(123) # setting this so the random results will be repeatable
library(MASS)
# Simulating 3 traits for 4 different species
n = 200 # number of "individuals" per group
# Generate the groups
Amat1 = MASS::mvrnorm(n, mu = c(11.2,11.8,9.91), Sigma = diag(c(1.31,1.01,1.02)))
Amat2 = MASS::mvrnorm(n, mu = c(7.16,8.54,6.82), Sigma = diag(c(0.445,0.546,0.350)))
Amat3 = MASS::mvrnorm(n, mu = c(15.6,14.6,13.5), Sigma = diag(c(1.43,0.885,0.990)))
Amat4 = MASS::mvrnorm(n, mu = c(8.65,14.1,8.24), Sigma = diag(c(0.535,0.844,0.426)))
# Combine the data
Amat = rbind(Amat1,Amat2,Amat3,Amat4)
# Make group data
Amat.gr = cbind(Amat, gl(4,k=n,labels = c(1,2,3,4)))
# Calculate the covariance matrix for each group
by(Amat.gr[,1:3],INDICES = Amat.gr[,4],FUN = cov) # calculate covariance matrix for all groups
# Plot the result
summary(m1 <- prcomp(Amat, scale= T))
# biplot(m1, xlabs=rep(".", nrow(Amat)), cex = 2)
plot(vegan::scores(m1), asp = 1, pch = 19, col = gl(4,k=n,labels = c(1,2,3,4)))
plot(Amat[,1],Amat[,2], pch = 19, col = gl(4,k=n,labels = c(1,2,3,4)))
The plot on the left shows the PCA and on the right the raw data.
Here is a toy example with data showing the algorithm used to compute a PCA in R, following Legendre and Legendre (2012).
# Generate vectors (example from Legendre and Legendre 2012)
v1 = c(2,3,5,7,9)
v2 = c(1,4,0,6,2)
# If you want to play with sample size
# n = 100
# v1 = rnorm(n = n, mean = mean(v1), sd = sd(v1))
# v2 = rnorm(n = n, mean = mean(v2), sd = sd(v2))
# Get the y matrix
y = cbind(v1,v2)
# Centered y matrix
yc = apply(y, 2, FUN = function(x) x-mean(x))
# Dispersion (covariance) matrix
s = 1/(nrow(y)-1)*t(yc) %*% yc
# Compute the singular value decomposition to get the eigenvectors
ev = svd(s)$v
# Get the principal components (object scores)
f = yc %*% ev
# The eigenvectors are orthonormal, so this gives the identity matrix
round(t(svd(s)$v) %*% svd(s)$v, 2)
# These are the eigenvalues
svd(s)$d
# The eigenvectors can also be recovered (up to sign) from the SVD of the centered data
-svd(yc)$v # p. 104
plot(f, pch = 19); abline(h = 0, v = 0, lty = 3)
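As a quick sanity check (my addition, not part of the original answer), the manual computation should agree, up to column signs, with prcomp() applied to y with centering only:
# Sketch: compare the manual PCA with prcomp() (centering, no scaling)
m <- prcomp(y, center = TRUE, scale. = FALSE)
round(abs(m$x) - abs(f), 10)   # scores match up to sign
round(m$sdev^2 - svd(s)$d, 10) # variances match the eigenvalues of s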

Related

Why does state space estimation change every time?

I don't know if this is the right place for my question (maybe Stats Exchange?), but I've been stuck on this for some days.
I've been reading a paper by Morley (2002), where he takes a state space approach to the Beveridge-Nelson decomposition.
On page 4, Morley gives an example of the state space representation for an ARIMA(2,1,2) model, and I tried to generalize it to any ARIMA model of order (p,d,q), following the FKF package documentation.
Below is the state space representation I used for an ARMA(3,3) (assuming the series is I(1)).
I've written a function that computes the Kalman filter for any ARMA(p,q), but its output changes with every run, even though I am not simulating anything: I use US GDP from FRED, 1970 to 2022.
library(dplyr) # for select_if() and lag()
library(FKF)   # for fkf()
BN_filter <- function(data, order, p_estimates = FALSE){
  # Order things up ----
  p <- order[1]
  d <- order[2]
  q <- order[3]
  # Set the data (use the data argument, not a global object)
  xt <- select_if(data, is.numeric)[[1]]
  yt <- xt - lag(xt, n = d)
  n <- length(yt)
  # Estimate the parameters (initial guesses) and set par0 ----
  m <- arima(yt, order = c(p, 0, q), include.mean = FALSE, method = "ML")
  coef <- as.matrix(m$coef) # all AR and MA coefficients
  rownames(coef) <- NULL
  sigma_e <- var(m$residuals, na.rm = TRUE) # variance of the error
  # par0 is a vector of dimension (p+q)+1 where the last element is the variance
  par0 <- as.matrix(c(coef, sigma_e))
  # Since the series is stationary and the ARMA model has no drift,
  # the expected value of the state vector alpha is 0
  a0 <- as.numeric(rep(0, times = (p+q)))
  P0 <- matrix(1e16, nrow = (p+q), ncol = (p+q))
  # State-space matrices
  dt <- matrix(0, nrow = (p+q))
  ct <- matrix(0)
  GGt <- matrix(0)
  Zt <- matrix(c(1, rep(0, times = (p+q-1))), nrow = 1)
  # Set up sample ----
  yt_s <- t(yt[(d+1):n])
  # Build the Tt matrix and the HHt matrix
  Kalman <- function(par, yt, a0, P0, ct, Zt, GGt, dt){
    # Build pseudo Tt (this is the Tt matrix from the second row down)
    Ip   <- diag(1, nrow = (p-1))
    Zpv  <- matrix(0, nrow = (p-1))
    Zm1  <- matrix(0, nrow = (p-1), ncol = (q-1))
    Zeps <- matrix(0, nrow = 1, ncol = (p+q))
    Zm2  <- matrix(0, nrow = (q-1), ncol = (p-1))
    Zqv  <- matrix(0, nrow = (q-1))
    Iq   <- diag(1, nrow = (q-1))
    s_row <- cbind(Ip, Zpv, Zm1, Zpv)
    f_row <- cbind(Zm2, Zqv, Iq, Zqv)
    pseudo_Tt <- rbind(s_row, Zeps, f_row)
    # Tt ----
    firstRow <- matrix(par[1:(p+q)], nrow = 1)
    Tt <- rbind(firstRow, pseudo_Tt)
    # HHt ----
    # filled with the innovation variance (the last element of par)
    HHt <- matrix(par[length(par)], ncol = (p+q), nrow = (p+q))
    results <- fkf(a0 = a0, P0 = P0, dt = dt, ct = ct,
                   Tt = Tt, Zt = Zt, HHt = HHt,
                   GGt = GGt, yt = yt)
    return(results)
  }
  # Objective function (negative log-likelihood) ----
  fn <- function(x0, ...){
    -Kalman(par = x0, yt = yt_s, a0 = a0, P0 = P0, ct = ct, Zt = Zt, GGt = GGt, dt = dt)$logLik
  }
  # Run optimizers ----
  ## Global optimizer
  fit.fkf <- optim(par0, fn, method = "SANN")
  ## Local optimizer
  fit.fkf <- optim(fit.fkf$par, fn)
  ## Fitted object
  fit.obj <- Kalman(fit.fkf$par, yt = yt_s, a0 = a0,
                    P0 = P0, ct = ct,
                    Zt = Zt, GGt = GGt, dt = dt)
  # Estimates ----
  kalman_est <- c(phi = fit.fkf$par[1:p],
                  theta = fit.fkf$par[(p+1):(p+q)],
                  sigma_e = tail(fit.fkf$par, 1))
  colnames(par0) <- "ML Estimates"
  if(p_estimates == TRUE){
    print(cbind("Kalman Estimates" = kalman_est, "ML Estimates" = par0))
  }
  return(fit.obj)
}
But I have noticed that for an ARIMA(3,1,3) model the estimated coefficients do not change between runs, while for an ARIMA(4,1,2) model they do:
Here is my output:
> a <- BN_filter(data = us, order = c(3,1,3), p_estimates = TRUE)
Kalman Estimates ML Estimates
phi1 -0.6362326324 -0.5359310692
phi2 0.5288426267 0.6021502455
phi3 1.0348389624 0.9328190861
theta1 0.5993870595 0.5223257304
theta2 -0.4980386193 -0.5754977018
theta3 -0.9717439064 -0.8987930785
sigma_e 0.0001401711 0.0001331292
> a <- BN_filter(data = us, order = c(3,1,3), p_estimates = TRUE)
Kalman Estimates ML Estimates
phi1 -0.6362326324 -0.5359310692
phi2 0.5288426267 0.6021502455
phi3 1.0348389624 0.9328190861
theta1 0.5993870595 0.5223257304
theta2 -0.4980386193 -0.5754977018
theta3 -0.9717439064 -0.8987930785
sigma_e 0.0001401711 0.0001331292
> b <- BN_filter(data = us, order = c(4,1,2), p_estimates = TRUE)
Kalman Estimates ML Estimates
phi1 1.4104289329 -0.0007921442
phi2 -0.3076356643 1.0536717900
phi3 -0.1259039820 0.0330696296
phi4 0.0222118830 -0.0865077506
theta1 -1.4458524512 0.0129934941
theta2 0.4801402654 -0.9869856537
sigma_e 0.0001379343 0.0001306852
> b <- BN_filter(data = us, order = c(4,1,2), p_estimates = TRUE)
Kalman Estimates ML Estimates
phi1 -0.3795696850 -0.0007921442
phi2 0.5461933222 1.0536717900
phi3 0.1572464501 0.0330696296
phi4 0.1603534419 -0.0865077506
theta1 0.3029337586 0.0129934941
theta2 -0.4077502428 -0.9869856537
sigma_e 0.0001557173 0.0001306852
So... can someone explain to me what is happening? Why does it change? What is wrong with this?
Thanks in advance!
Here is the state space model for the ARMA(p,q) case, in case you find it useful:
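The image from the original post is not reproduced here; as a reconstruction on my part, the measurement and transition equations implied by the Zt and Tt matrices built in the code above are, in LaTeX notation:
y_t = \begin{pmatrix} 1 & 0 & \cdots & 0 \end{pmatrix} \alpha_t, \qquad
\alpha_{t+1} = T \alpha_t + \eta_t, \qquad \eta_t \sim N(0,\, HH^\top),
T = \begin{pmatrix}
\phi_1 \,\cdots\, \phi_p & \theta_1 \,\cdots\, \theta_q \\
\bigl( I_{p-1} \;\; 0_{(p-1)\times 1} \bigr) & 0_{(p-1)\times q} \\
0_{1\times p} & 0_{1\times q} \\
0_{(q-1)\times p} & \bigl( I_{q-1} \;\; 0_{(q-1)\times 1} \bigr)
\end{pmatrix},
with HH^\top filled with \sigma_e^2, as in the code.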

How can I perform a bootstrap to find the confidence interval for a k-NN model in R?

I have a training df with 2 columns like
a b
1 1000 20
2 1008 13
...
n ... ...
I am required to find a 95% CI for the estimate of 'b' at a specific 'a' value, with a 'k' of my choice, and to compare the CI to the results for other values of 'k'. My question is: how can I perform a bootstrap for this with 1000 bootstrap replicates, given that I have to use a knn model fitted to the training data with kernel = 'gaussian' and k restricted to the range 1-20?
I have found that the best k for this model is k = 5, and I had a go at the bootstrap, but it doesn't work:
library(kknn)
library(boot)
boot.kn = function(formula, data, indices)
{
  # Create a bootstrapped version
  d = data[indices,]
  # Fit a model for bs
  fit.kn = fitted(train.kknn(formula, data, kernel = "gaussian", ks = 5))
  # Do I even need this complicated block
  # (my.pred.stats is a helper function defined elsewhere)
  target = as.character(fit.kn$terms[[2]])
  rv = my.pred.stats(fit.kn, d[,target])
  return(rv)
}
bs = boot(data=df, statistic=boot.kn, R=1000, formula=b ~ a)
boot.ci(bs,conf=0.95,type="bca")
Please let me know if you need more info or if anything is unclear. Thank you.
Here is a way to regress b on a with the k-nearest neighbors algorithm.
First, a data set. This is a subset of the iris data set, keeping the first two columns. One row is removed to later be the new data.
i <- which(iris$Sepal.Length == 5.3)
df1 <- iris[-i, 1:2]
newdata <- iris[i, 1:2]
names(df1) <- c("a", "b")
names(newdata) <- c("a", "b")
Now load the packages to be used and determine the optimal value for k with package kknn.
library(caret)
library(kknn)
library(boot)
fit <- kknn::train.kknn(
  formula = b ~ a,
  data = df1,
  kmax = 15,
  kernel = "gaussian",
  distance = 1
)
k <- fit$best.parameters$k
k
#[1] 9
Now bootstrap predictions for the new point a = 5.3.
boot.kn <- function(data, indices, formula, newdata, k){
  d <- data[indices, ]
  fit <- knnreg(formula, data = d, k = k) # use the k selected above
  predict(fit, newdata = newdata)
}
set.seed(2021)
R <- 1e4
bs <- boot(df1, boot.kn, R = R, formula = b ~ a, newdata = newdata, k = k)
ci <- boot.ci(bs, level = 0.95, type = "bca")
ci
#BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#Based on 10000 bootstrap replicates
#
#CALL :
#boot.ci(boot.out = bs, type = "bca", level = 0.95)
#
#Intervals :
#Level BCa
#95% ( 3.177, 3.740 )
#Calculations and Intervals on Original Scale
Plot the results.
old_par <- par(mfrow = c(2, 1),
               oma = c(5, 4, 0, 0) + 0.1,
               mar = c(1, 1, 1, 1) + 0.1)
hist(bs$t, main = "Histogram of bootstrap values")
abline(v = 3.7, col = "red")
abline(v = mean(bs$t), col = "blue")
abline(v = ci$bca[4:5], col = "blue", lty = "dashed")
plot(b ~ a, df1)
points(5.3, 3.7, col = "red", pch = 19)
points(5.3, mean(bs$t), col = "blue", pch = 19)
arrows(x0 = 5.3, y0 = ci$bca[4],
       x1 = 5.3, y1 = ci$bca[5],
       col = "blue", angle = 90, code = 3)
par(old_par)
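Since the question asked for a kknn fit with kernel = "gaussian" rather than caret::knnreg, a possible variant of the bootstrap statistic (a sketch on my part, not part of the original answer) is:
# Sketch: bootstrap statistic using kknn with a gaussian kernel, as required in the question
boot.kknn <- function(data, indices, formula, newdata, k){
  d <- data[indices, ]
  fit <- kknn::kknn(formula, train = d, test = newdata,
                    k = k, kernel = "gaussian")
  fit$fitted.values
}
bs2 <- boot(df1, boot.kknn, R = 1000, formula = b ~ a, newdata = newdata, k = k)
boot.ci(bs2, conf = 0.95, type = "bca")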

Monte Carlo simulations for VAR models

I've been trying to estimate VAR models using Monte Carlo Simulation. I have 3 endogenous variables. I need some guidance regarding this.
First of all, I want to add an outlier as a percentage of the sample size.
Second (a second simulation for the same model), I want to use errors from a multivariate contaminated normal distribution, something like 0.9 N(0, I) + 0.1 N((0,0,0)', diag(100, 100, 100)), instead of the outlier.
Could you tell me how to do these?
Thank you.
library(tsDyn)      # for VECM()
library(MonteCarlo) # for MonteCarlo() and MakeTable()
RR <- function(n, out){
  # n is the number of observations
  k <- 3 # number of endogenous variables
  p <- 2 # number of lags
  # add outlier (as written, this just enlarges the sample size by 'out')
  n[1] <- n[1] + out
  # Generate coefficient matrices
  B1 <- matrix(c(.1, .3, .4, .1, -.2, -.3, .03, .1, .1), k) # coefficient matrix of lag 1
  B2 <- matrix(c(0, .2, .1, .07, -.4, -.1, .5, 0, -.1), k)  # coefficient matrix of lag 2
  M <- cbind(B1, B2) # companion form of the coefficient matrices
  # Generate series
  DT <- matrix(0, k, n + 2*p) # raw series with zeros
  for (i in (p + 1):(n + 2*p)){ # generate series with e ~ N(0,1)
    DT[, i] <- B1 %*% DT[, i-1] + B2 %*% DT[, i-2] + rnorm(k, 0, 1)
  }
  DT <- ts(t(DT[, -(1:p)])) # convert to time series format
  colnames(DT) <- c("Y1", "Y2", "Y3") # rename variables
  #plot.ts(DT) # plot the series
  # Estimate VECMs
  vecm1 <- VECM(DT, lag = 2, r = 2, include = "const", estim = "ML")
  vecm2 <- VECM(DT, lag = 2, r = 1, include = "const", estim = "ML")
  # MSE (only two models are estimated, so only two MSEs are returned)
  mse1 <- mean(vecm1$residuals^2)
  mse2 <- mean(vecm2$residuals^2)
  return(list("mse1" = mse1, "mse2" = mse2))
}
# Define the parameter grids (the ranges of parameters we want to run the function over)
n_grid = c(50, 80, 200, 400)
out_grid = c(0, 5, 10)
# Collect the parameter grids in a list (to pass them to the Monte Carlo function)
prml = list("n" = n_grid, "out" = out_grid)
# Run the simulation
RRS <- MonteCarlo(func = RR, nrep = 1000, param_list = prml)
summary(RRS)
# Make a table:
rows = "n"
cols = "out"
MakeTable(output = RRS, rows = rows, cols = cols)
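For the contaminated-normal part described in the question, a sketch (my own, not code from the original post) is to flip a Bernoulli(0.1) coin for each observation and draw the error from either N(0, I) or N(0, diag(100, 100, 100)), e.g. with MASS::mvrnorm; the helper name rcontam below is illustrative.
# Sketch: draw one k-dimensional error from the mixture 0.9 N(0, I_k) + 0.1 N(0, diag(100, k))
library(MASS)
rcontam <- function(k, p_contam = 0.1, scale = 100){
  if (runif(1) < p_contam) {
    MASS::mvrnorm(1, mu = rep(0, k), Sigma = diag(scale, k))
  } else {
    MASS::mvrnorm(1, mu = rep(0, k), Sigma = diag(1, k))
  }
}
# Inside the loop in RR(), the line
#   DT[, i] <- B1 %*% DT[, i-1] + B2 %*% DT[, i-2] + rnorm(k, 0, 1)
# could then become
#   DT[, i] <- B1 %*% DT[, i-1] + B2 %*% DT[, i-2] + rcontam(k)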

My p-values are way lower than I expected and I cannot build a proper power curve

pval.dist.sim = function(n, sigma_x, rho, reps = 2500){
  p = 5; sigma = sqrt(2)
  beta = c(0.5, 0.5, 0, 0.25, 0)
  mu = 10
  # generate vector for pvals
  pval.list = numeric(reps)
  for(r in 1:reps){
    # generate design matrix (gen_X() is a user-defined helper, assumed available)
    X = gen_X(n = n, p = 5, rho = rho, sigma_x = sigma_x, mu = mu)
    # generate the XtXinv portion of the equation
    XtXinv = qr.solve(crossprod(X))
    sqrtXtXinv55 = sqrt(XtXinv[5,5])
    y = X %*% beta + rnorm(n = n)
    beta.hat = XtXinv %*% crossprod(X, y)
    sE = sqrt(sum((y - X %*% beta.hat)^2)/(n-p))
    t.val = beta.hat[3]/(sE * sqrtXtXinv55)
    pval.list[r] = 2 * pt(-abs(t.val), df = n - p)
  }
  return(pval.list)
}
Above is the p-value distribution simulation. I need to run this function to generate the p-values used to build my power curve:
set.seed(3701)
# givens
p = 5; d = 2; mu = 10; sigmasqrd = 2; reps = 2500
n.list = seq(from = 10, to = 150, by = 10)
# create a vector for the estimates of the power
est.power = numeric(length(n.list))
# create a vector for the left endpoints of the 95% CI
LB.list = numeric(length(n.list))
# create a vector for the right endpoints of the 95% CI
UB.list = numeric(length(n.list))
for(j in 1:length(n.list)){
  # perform the test reps times
  pvals = pval.dist.sim(n = n.list[j], sigma_x = 1.5, rho = 0.2, reps = reps)
  # record the simulated estimate of the power
  est.power[j] = mean(pvals < 0.05)
  # compute the 95% confidence interval
  bounds = binom.test(x = sum(pvals < 0.05), n = reps, conf.level = 0.95)$conf.int[1:2]
  LB.list[j] = bounds[1]
  UB.list[j] = bounds[2]
}
## plot the power curve estimate
plot(n.list, est.power, t = "l", xlab = "n", ylab = "Power")
The issue is that my p-values, when plugged in, come out drastically low: I am getting values in the single-digit percentages. What am I doing wrong?
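One diagnostic worth running (my suggestion, not part of the original post) is to check whether the simulated p-values look uniform, which is what you would expect whenever the coefficient being tested is truly zero in beta; in that case the rejection rate at the 5% level stays near 0.05 instead of climbing with n.
# Sketch: inspect the distribution of simulated p-values for one sample size
pvals <- pval.dist.sim(n = 100, sigma_x = 1.5, rho = 0.2, reps = 2500)
hist(pvals, breaks = 20, main = "Simulated p-values", xlab = "p-value")
mean(pvals < 0.05)      # close to 0.05 if the tested coefficient is truly 0
ks.test(pvals, "punif") # formal check of uniformity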

fitCopula yielding checkPar error

I am new to both R and copulas. I'm trying to fit a Tawn Type 1 copula to data, but keep receiving the following error message:
Error in .local(u, copula, log, ...) : unused argument (checkPar = FALSE)
The data I'm using is Swedish Motor Insurance data, located here: http://instruction.bus.wisc.edu/jfrees/jfreesbooks/Regression%20Modeling/BookWebDec2010/data.html
The x is claims, and the y is payments.
Can anyone please help me discern what's wrong?
Thanks in advance!
library(copula)     # pobs(), fitCopula()
library(VineCopula) # BiCopSelect(); tawnT1Copula() is provided by VineCopula (or by VC2copula in newer setups)
# x (Claims), y (Payment) and the matrix mat/mydata are built from the Swedish Motor Insurance data (code not shown)
# Estimate x (Claims) gamma distribution parameters and visually compare simulated vs observed data
x_mean <- mean(x)
x_var <- var(x)
x_rate <- x_mean / x_var
x_shape <- ( (x_mean)^2 ) / x_var
hist(x, breaks = 20, col = "green", density = 20)
hist(rgamma( nrow(mat), rate = x_rate, shape = x_shape), breaks = 20,col = "blue", add = T, density = 20, angle = -45)
# Estimate y (Payment) gamma distribution parameters and visually compare simulated vs observed data
y_mean <- mean(y)
y_var <- var(y)
y_rate <- y_mean / y_var
y_shape <- ( (y_mean)^2 ) / y_var
hist(y, breaks = 20, col = "green", density = 20)
hist(rgamma(nrow(mat), rate = y_rate, shape = y_shape), breaks = 20, col = "blue", add = T, density = 20, angle = -45)
# Looks good, so...
# Measure association using Kendall's Tau
cor(mat, method = "kendall")
##[,1] [,2]
##[1,] 1.0000000 0.8673572
##[2,] 0.8673572 1.0000000
# Now try Spearman's Rho
cor(mat, method = "spearman")
##[,1] [,2]
##[1,] 1.0000000 0.9624433
##[2,] 0.9624433 1.0000000
# Use the VineCopula package to select a copula
var_a <- pobs(mydata)[,1]
var_b <- pobs(mydata)[,2]
selectedCopula <- BiCopSelect(var_a, var_b, familyset = NA)
selectedCopula
## Bivariate copula: Tawn type 1 (par = 12.9, par2 = 0.96, tau = 0.89)
# Family
selectedCopula$family
##[1] 104
# Par
# par -> parameter of the copula
selectedCopula$par
##[1] 12.89932
# Estimate copula parameters
cop_model <- tawnT1Copula(param = c(2,0.5))
m <- pobs(as.matrix(mat))
fit <- fitCopula(cop_model, m, method = 'ml')
coef(fit)
Without a minimal working example, we cannot be sure. But the error message is clear: a copula function is being passed a "checkPar" argument that it does not accept.
The code you provided does not contain the word "checkPar" anywhere, so it probably comes from a dependency. Is your R up to date? Are your packages up to date?
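A quick way to check (my addition, just a sketch) is to compare the installed versions against CRAN and update if they are stale:
# Sketch: check R and package versions, then update if needed
R.version.string
packageVersion("copula")
packageVersion("VineCopula")
update.packages(ask = FALSE) # or update the specific packages from CRAN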
