I'm a beginner in R, so apologies for the basic question.
Why does ATE end up as a NULL vector instead of saving the values of the differences of the means?
fun.cluster <- function(M, N){
  set.seed(02139)
  J <- 1:M # vector J_i of cluster indices
  df <- as.data.frame(matrix(data = 1:N, nrow = N, ncol = 1)) # data frame of all original values
  df$cluster <- cut(df$V1, M, labels = 1:M) # breaking the data frame into clusters
  df$cluster <- as.numeric(df$cluster)
  Y1 <- as.vector(sample(J, 5)) # assigning treatment
  df$treatment <- ifelse(df$cluster %in% Y1, 1, 0)
  # Inducing intracluster correlation:
  mu_0j <- runif(n = 50, min = -1, max = 1)
  df$V1[df$treatment == 0] <- mu_0j
  mu_1j <- runif(n = 50, min = -0.5, max = 1.5)
  df$V1[df$treatment == 1] <- mu_1j
  # drawing values
  y_0i <- rnorm(n = 50, mean = mu_0j, sd = 1)
  y_1i <- rnorm(n = 50, mean = mu_1j, sd = 1)
  D_i <- as.vector(c(y_0i, y_1i))
  # calculating ATE:
  ATE[i] <- mean(y_1i - y_0i)
}
ATE <- c()
for(i in 1:10){
  fun.cluster(M = 10, N = 100)
}
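For what it's worth, the likely culprit is R's scoping: `ATE[i] <- ...` inside fun.cluster() writes to a variable that is local to the function, and the for loop then discards the function's return value, so the global ATE is never filled. Below is a minimal self-contained sketch of a fix; the simplified body is an assumption about the intent, not a faithful copy of the original.

fun.cluster <- function(M, N){
  df <- data.frame(V1 = 1:N)
  df$cluster <- as.numeric(cut(df$V1, M, labels = 1:M))
  treated <- sample(1:M, 5)                             # treated clusters
  df$treatment <- ifelse(df$cluster %in% treated, 1, 0)
  # cluster-level means, then unit-level draws:
  mu_0j <- runif(sum(df$treatment == 0), min = -1,   max = 1)
  mu_1j <- runif(sum(df$treatment == 1), min = -0.5, max = 1.5)
  y_0i <- rnorm(length(mu_0j), mean = mu_0j, sd = 1)
  y_1i <- rnorm(length(mu_1j), mean = mu_1j, sd = 1)
  mean(y_1i) - mean(y_0i)  # last expression is the return value
}
set.seed(02139)            # outside the function, so the 10 runs differ
ATE <- numeric(10)
for(i in 1:10){
  ATE[i] <- fun.cluster(M = 10, N = 100)
}

Note that keeping set.seed() inside the function, as in the original, would make all ten replications identical.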
I tried to speed up the code below, but without any success.
I read about the Rfast package, but I also failed to apply it here.
Is there any way to optimise the following code in R?
library(MASS) # rlm() and psi.huber come from MASS
RI <- function(y, x, a, mu, R = 500, t = 500){
  x <- as.matrix(x)
  n <- nrow(x)
  bias1 <- bias2 <- bias3 <- numeric(t)
  b1 <- b2 <- b3 <- numeric(R)
  ### Outliers in Y ######
  for (j in 1:t) {
    for (i in 1:R) {
      id <- sample(n, a * n)
      z <- y
      z[id] <- rnorm(length(id), mu) # length(id) draws, one per contaminated case
      b1[i] <- var(coef(lm(z ~ ., data = as.data.frame(x))))
      b2[i] <- var(coef(rlm(z ~ ., data = data.frame(x), maxit = 2000, method = "MM")))
      b3[i] <- var(coef(rlm(z ~ ., data = data.frame(x), psi = psi.huber, maxit = 300)))
    }
    bias1[j] <- sum(b1); bias2[j] <- sum(b2); bias3[j] <- sum(b3)
  }
  bias <- cbind("lm" = bias1, "MM-rlm" = bias2, "H-rlm" = bias3)
  colMeans(bias)
}
#######################################
p <- 5
n <- 200
x <- matrix(rnorm(n * p), ncol = p)
y <- rnorm(n)
a <- 0.2
mu <- 10
#######################################
RI(y, x, a, mu)
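One modest speed-up, sketched under the assumption that the two rlm() fits must stay as they are (RI.fast is just an illustrative name): hoist the loop-invariant data frame and design matrix out of the R*t inner iterations, and replace lm() with the bare-bones fitter lm.fit(), which skips formula parsing and model-frame construction on every call.

library(MASS) # rlm(), psi.huber

RI.fast <- function(y, x, a, mu, R = 500, t = 500){
  x <- as.matrix(x)
  n <- nrow(x)
  X <- cbind(Intercept = 1, x)  # design matrix, built once instead of R*t times
  xdf <- as.data.frame(x)       # likewise, reused by every rlm() call
  bias1 <- bias2 <- bias3 <- numeric(t)
  b1 <- b2 <- b3 <- numeric(R)
  for (j in 1:t) {
    for (i in 1:R) {
      id <- sample(n, a * n)
      z <- y
      z[id] <- rnorm(length(id), mu)
      b1[i] <- var(lm.fit(X, z)$coefficients) # no formula overhead
      b2[i] <- var(coef(rlm(z ~ ., data = xdf, maxit = 2000, method = "MM")))
      b3[i] <- var(coef(rlm(z ~ ., data = xdf, psi = psi.huber, maxit = 300)))
    }
    bias1[j] <- sum(b1); bias2[j] <- sum(b2); bias3[j] <- sum(b3)
  }
  colMeans(cbind("lm" = bias1, "MM-rlm" = bias2, "H-rlm" = bias3))
}

The rlm() calls will still dominate the runtime, so the overall gain is limited; running independent replications in parallel (e.g. with the parallel package) is the other obvious lever.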
I am using the R function constrOptim.nl() from the alabama package.
I need to minimize a function with the following constraints:
Alpha < sqrt(2*omega) and omega > 0
In my code these are expressed as:
theta[3] < sqrt(2*theta[1]) and theta[1] > 0
I write these conditions in the hin() function shown at the end of my code.
But when I call the optimizer and run it, I get the following warning:
1: In sqrt(2 * theta[1]) : NaNs produced
Why? Did I set the proper conditions?
This is my whole code.
data <- read.delim(file = file, header = FALSE)
ind <- seq(from = 1, to = NROW(data), by = 1)
data <- data.frame(ind = ind, Ret = data$V1, Ret2 = data$V1^2)
colnames(data)[1] <- "Ind"
colnames(data)[2] <- "Ret"
colnames(data)[3] <- "Ret2"
T <- length(data$Ret)
m <- arima(x = data$Ret2, order = c(3,0,0), include.mean = TRUE, method = c("ML"))
b_not <- m$coef
omega <- 0.1
alpha <- 0.005
beta <- 0.9
theta <- c(omega,beta,alpha) # "some" value of theta
s0 <- theta[1]/(1-theta[2])
theta[3] < sqrt(2*theta[1]) # check whether the Feller condition is verified
N <- 30000
reps <- 1
rho <- -0.8
n <- 100
heston.II <- function(theta){
  set.seed(5)
  u <- rnorm(n = N*reps, mean = 0, sd = 1)
  u1 <- rnorm(n = N*reps, mean = 0, sd = 1)
  u2 <- rho*u + sqrt((1-rho^2))*u1
  sigma <- matrix(0, nrow = N*reps, ncol = 1)
  ret.int <- matrix(0, nrow = N*reps, ncol = 1)
  sigma[1,1] <- s0
  for (i in 2:(N*reps)) {
    sigma[i,1] <- theta[1] + theta[2]*sigma[i-1,1] + theta[3]*sqrt(sigma[i-1,1])*u1[i]
    # if(sigma[i,1] < 0.00000001){ sigma[i,1] = s0}
  }
  for (i in 1:(N*reps)) {
    ret.int[i,1] <- sqrt(sigma[i,1])*u2[i]
  }
  ret <- matrix(0, nrow = N*reps/n, ncol = 1)
  ret[1,1] <- sum(ret.int[1:n], 1)
  for (i in 2:((N*reps)/n)) {
    ret[i,] <- sum(ret.int[(n*i):(n*(i+1))])
    ret[((N*reps)/n),] <- sum(ret.int[(n*(i-1)):(n*i)])
  }
  ret2 <- ret^2
  model <- arima(x = ret2, order = c(3,0,0), include.mean = TRUE)
  beta_hat <- model$coef
  m1 <- beta_hat[1] - b_not[1]
  m2 <- beta_hat[2] - b_not[2]
  m3 <- beta_hat[3] - b_not[3]
  m4 <- beta_hat[4] - b_not[4]
  D <- cbind(m1, m2, m3, m4)
  DD <- D %*% t(D) / 1000
  DD <- as.numeric(DD)
  return(DD)
}
heston.sim <- heston.II(theta)
hin <- function(theta){
  h <- rep(NA, 2)
  h[1] <- theta[1]
  h[2] <- sqrt(2*theta[1]) - theta[3]
  return(h)
}
hin(theta = theta)
.opt <- constrOptim.nl(par = theta, fn = heston.II, hin = hin)
.opt
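A plausible explanation, offered as an assumption about how constrOptim.nl() explores the parameter space rather than a verified diagnosis: during the search the optimizer can evaluate hin() at candidate points where theta[1] < 0, and sqrt(2*theta[1]) is then NaN, which produces exactly this warning. Rewriting the Feller condition without sqrt() sidesteps the problem; the squared form is equivalent provided theta[3] (alpha) is nonnegative, so that constraint is added explicitly:

hin <- function(theta){
  h <- rep(NA, 3)
  h[1] <- theta[1]                  # omega > 0
  h[2] <- 2*theta[1] - theta[3]^2   # alpha < sqrt(2*omega), written without sqrt()
  h[3] <- theta[3]                  # alpha >= 0, needed for the equivalence
  return(h)
}

At the starting value theta = c(0.1, 0.9, 0.005) all three components are strictly positive, as constrOptim.nl() requires.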
I’m trying to write simulation code that generates data and runs t-test selection on it (discarding those predictors whose t-test p-value exceeds 0.05 and retaining the rest). The simulation is largely an adaptation of Applied Econometrics with R by Kleiber and Zeileis (2008, pp. 183–189).
When running the code, it usually fails, yet with certain seeds (e.g. 1534) it produces plausible output. If it does not produce output (e.g. with seed 1911), it fails with "Error in x[, ii] : subscript out of bounds", which traces back to na.omit.data.frame(). So, for some reason, the way I attempt to handle the NAs seems to fail, but I'm unable to figure out how.
coef <- rep(coef[,1], length.out = pdim+1)
err <- as.vector(rnorm(nobs, sd = sd))
uX <- c(rep(1, times = nobs))
pX <- matrix(scale(rnorm(nobs)), byrow = TRUE, ncol = pdim, nrow = nobs)
X <- cbind(uX, pX)
y <- coef %*% t(X) + err
y <- matrix(y)
tTp <- (summary(lm(y ~ pX)))$coefficients[,4]
tTp <- tTp[2:length(tTp)]
TTT <- matrix(c(tTp, rep(.7, ncol(pX)-length(tTp))))
tX <- matrix(NA, ncol = ncol(pX), nrow = nrow(pX))
for(i in 1:ncol(pX)) {ifelse(TTT[i,] < ALPHA, tX[,i] <- pX[,i], NA)}
tX <- matrix(Filter(function(x)!all(is.na(x)), tX), nrow = nobs)
TTR <- lm(y ~ tX)
The first block is unlikely to be the cause of the error. It merely generates the data and works well on its own and with other methods, like PCA, as well. The second block pulls the p-values from the regression output; removes the p-value of the intercept (beta_0); and fills the vector with as many 0.7s as necessary to match the number of predictors, ensuring the right dimensions for the matrix calculations. The value 0.7 is arbitrary; any number larger than 0.05 would do, since it must merely fail the test in the loop. This becomes necessary, I believe, when R drops predictors due to multicollinearity, leaving fewer p-values than columns.
The final block creates an empty matrix of the original dimensions; inserts the original data where the t-test p-value is lower than 0.05, and otherwise retains the NA; the penultimate line then removes all columns containing NAs (exclusively NA or a single NA is the same here; taken from mnel's answer to Remove columns from dataframe where ALL values are NA); lastly, the modified data is again fit with a linear regression.
Does anyone know what causes this behavior or how it would work as intended? I would expect it to either work or not, but not kind of both. Ideally, the former.
A working version of the code is:
set.seed(1534)
Sim_TTS <- function(nobs = c(1000, 15000), pdim = pdims, coef = coef100,
                    model = c("MLC", "MHC"), ...){
  DGP_TTS <- function(nobs = 1000, model = c("MLC", "MHC"), coef = coef100,
                      sd = 1, pdim = pdims, ALPHA = 0.05)
  {
    model <- match.arg(model)
    if(model == "MLC") {
      coef <- rep(coef[,1], length.out = pdim+1)
      err <- as.vector(rnorm(nobs, sd = sd))
      uX <- c(rep(1, times = nobs))
      pX <- matrix(scale(rnorm(nobs)), byrow = TRUE, ncol = pdim, nrow = nobs)
      X <- cbind(uX, pX)
      y <- coef %*% t(X) + err
      y <- matrix(y)
      tTp <- (summary(lm(y ~ pX)))$coefficients[,4]
      tTp <- tTp[2:length(tTp)]
      TTT <- matrix(c(tTp, rep(.7, ncol(pX)-length(tTp))))
      tX <- matrix(NA, ncol = ncol(pX), nrow = nrow(pX))
      for(i in 1:ncol(pX)) {ifelse(TTT[i,] < ALPHA, tX[,i] <- pX[,i], NA)}
      tX <- matrix(Filter(function(x)!all(is.na(x)), tX), nrow = nobs)
      TTR <- lm(y ~ tX)
    } else {
      coef <- rep(coef[,2], length.out = pdim+1)
      err <- as.vector(rnorm(nobs, sd = sd))
      uX <- c(rep(1, times = nobs))
      pX <- matrix(scale(rnorm(nobs)), byrow = TRUE, ncol = pdim, nrow = nobs)
      X <- cbind(uX, pX)
      y <- coef %*% t(X) + err
      y <- matrix(y)
      tTp <- (summary(lm(y ~ pX)))$coefficients[,4]
      tTp <- tTp[2:length(tTp)]
      TTT <- matrix(c(tTp, rep(.7, ncol(pX)-length(tTp))))
      tX <- matrix(NA, ncol = ncol(pX), nrow = nrow(pX))
      for(i in 1:ncol(pX)) {ifelse(TTT[i,] < ALPHA, tX[,i] <- pX[,i], NA)}
      tX <- matrix(Filter(function(x)!all(is.na(x)), tX), nrow = nobs)
      TTR <- lm(y ~ tX)
    }
    return(TTR)
  }
  PG_TTS <- function(nrep = 1, ...)
  {
    rsq <- matrix(rep(NA, nrep), ncol = 1)
    rsqad <- matrix(rep(NA, nrep), ncol = 1)
    pastr <- matrix(rep(NA, nrep), ncol = 1)
    vmat <- cbind(rsq, rsqad, pastr)
    colnames(vmat) <- c("R sq.", "adj. R sq.", "p*")
    for(i in 1:nrep) {
      vmat[i,1] <- summary(DGP_TTS(...))$r.squared
      vmat[i,2] <- summary(DGP_TTS(...))$adj.r.squared
      vmat[i,3] <- length(DGP_TTS(...)$coefficients)-1
    }
    return(c(mean(vmat[,1]), mean(vmat[,2]), round(mean(vmat[,3]))))
  }
  SIM_TTS <- function(...)
  {
    prs <- expand.grid(pdim = pdim, nobs = nobs, model = model)
    nprs <- nrow(prs)
    pow <- matrix(rep(NA, 3 * nprs), ncol = 3)
    for(i in 1:nprs) pow[i,] <- PG_TTS(pdim = prs[i,1],
                                       nobs = prs[i,2], model = as.character(prs[i,3]), ...)
    rval <- rbind(prs, prs, prs)
    rval$stat <- factor(rep(1:3, c(nprs, nprs, nprs)),
                        labels = c("R sq.", "adj. R sq.", "p*"))
    rval$power <- c(pow[,1], pow[,2], pow[,3])
    rval$nobs <- factor(rval$nobs)
    return(rval)
  }
  psim_TTS <- SIM_TTS()
  tab_TTS <- xtabs(power ~ pdim + stat + model + nobs, data = psim_TTS)
  ftable(tab_TTS, row.vars = c("model", "nobs", "stat"), col.vars = "pdim")
}
FO_TTS <- Sim_TTS()
FO_TTS
Preceded by:
pdims <- seq(12, 100, 4)
coefLC12 <- c(0, rep(0.2, 4), rep(0.1, 4), rep(0, 4))/1.3
rtL <- c(0.2, rep(0, 3))/1.3
coefLC100 <- c(coefLC12, rep(rtL, 22))
coefHC12 <- c(0, rep(0.8, 4), rep(0.4, 4), rep(0, 4))/1.1
rtH <- c(0.8, rep(0, 3))/1.1
coefHC100 <- c(coefHC12, rep(rtH, 22))
coef100 <- cbind(coefLC100, coefHC100)
I’m aware that model selection via the significance of individual predictors is not recommended, but that is the whole point – it is meant to be compared to more sophisticated methods.
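For context, one hypothesis consistent with the error trace (not verified against every failing seed): whenever no predictor passes the t-test, every column of tX stays NA, Filter() removes them all, and tX ends up as a zero-column matrix; lm(y ~ tX) then fails inside na.omit.data.frame() with exactly the reported subscript error. A sketch of a guard, replacing the ifelse()/Filter() steps (and inheriting the original pairing of p-values and columns, which can be misaligned after multicollinearity drops):

keep <- as.vector(TTT < ALPHA)      # TRUE where a predictor survives the t-test
if (any(keep)) {
  tX <- pX[, keep, drop = FALSE]    # drop = FALSE preserves the matrix shape
  TTR <- lm(y ~ tX)
} else {
  TTR <- lm(y ~ 1)                  # nothing significant: intercept-only model
}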
Here is the kind of data frame I have to generate to store simulation data.
nbSimul <- 100
nbSampleSizes <- 4
nbCensoredRates <- 4
sampleSize <- c(100, 50, 30, 10)
censoredRate <- c(0.1, 0.3, 0.5, 0.8)
df.sampled <- data.frame(cas = numeric(),
                         distribution = character(),
                         simul = numeric(),
                         sampleSize = numeric(),
                         censoredRate = numeric(),
                         dta = I(list()),
                         quantileLD = I(list()),
                         stringsAsFactors = FALSE)
v <- 0 # Scenario indicator
for(k in 1:nbCensoredRates){
  for(j in 1:nbSampleSizes){
    for(i in 1:nbSimul){
      # Scenario Id + Other info
      v <- v + 1
      df.sampled[v, "cas"] <- v
      df.sampled[v, "distribution"] <- "logNormal"
      df.sampled[v, "simul"] <- i
      df.sampled[v, "sampleSize"] <- sampleSize[j]
      df.sampled[v, "censoredRate"] <- censoredRate[k]
      X <- rlnorm(sampleSize[j], meanlog = 0, sdlog = 1)
      estimatedLD <- numeric(9) # one estimate per quantile type
      for(w in 1:9){
        estimatedLD[w] <- quantile(X, probs = censoredRate[k], type = w)[[1]]
      }
      df.sampled$dta[v] <- list(X)
      df.sampled$quantileLD[v] <- list(estimatedLD[1:9])
    }
  }
}
This is quite difficult to read.
I would like to find a way to avoid the loops and to reference scenarios (v) and their attached variables easily.
Any ideas?
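A sketch of a loop-free construction, assuming the same scenario grid as above: expand.grid() builds all combinations at once (with simul varying fastest, matching the nesting order of the loops), and the raw samples and quantile estimates are stored as list-columns.

df.sampled <- expand.grid(simul = 1:nbSimul,
                          sampleSize = sampleSize,
                          censoredRate = censoredRate)
df.sampled$cas <- seq_len(nrow(df.sampled))      # scenario id, replaces v
df.sampled$distribution <- "logNormal"
# one simulated sample per scenario, as a list-column:
df.sampled$dta <- lapply(df.sampled$sampleSize,
                         function(n) rlnorm(n, meanlog = 0, sdlog = 1))
# the nine quantile estimates per scenario, also as a list-column:
df.sampled$quantileLD <- Map(function(x, p) sapply(1:9, function(w)
                               quantile(x, probs = p, type = w)[[1]]),
                             df.sampled$dta, df.sampled$censoredRate)

Scenarios can then be referenced by ordinary subsetting, e.g. subset(df.sampled, sampleSize == 50 & censoredRate == 0.3).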