Using `cor.test()` on ranked data - r

I would like to do a Spearman correlation test using rank data. How can I do this with cor.test()? I don't want the function to rerank the data.
Additionally, what form does the data need to be in? From the help, it seems to be the raw data as compared to a correlation matrix.
Consider this example
## Hollander & Wolfe (1973), p. 187f.
## Assessment of tuna quality. We compare the Hunter L measure of
## lightness to the averages of consumer panel scores (recoded as
## integer values from 1 to 6 and averaged over 80 such values) in
## 9 lots of canned tuna.
library(tidyverse)
# Tuna example data: nine lots with the Hunter L lightness measure (x) and
# the averaged consumer panel score (y). The rank of each variable within
# the sample is stored next to the raw values.
A <- mutate(
  tibble(
    x = c(44.4, 45.9, 41.9, 53.3, 44.7, 44.1, 50.7, 45.2, 60.1),
    y = c(2.6, 3.1, 2.5, 5.0, 3.6, 4.0, 5.2, 2.8, 3.8)
  ),
  rank_x = rank(x),
  rank_y = rank(y)
)
Spearman's correlation coefficient is defined as Pearson's correlation between ranked variables
cor(A$x, A$y, method = "spearman")
#[1] 0.6
cor(A$rank_x, A$rank_y, method = "pearson")
#[1] 0.6
what about cor.test()? Can I use the rank data as its input?
x1 <- cor.test(A$x, A$y, method = "spearman")
x1
# Spearman's rank correlation rho
#
# data: A$x and A$y
# S = 48, p-value = 0.1
# alternative hypothesis: true rho is not equal to 0
# sample estimates:
# rho
# 0.6
x2 <- cor.test(A$rank_x, A$rank_y, method = "pearson")
x2
# Pearson's product-moment correlation
# data: A$rank_x and A$rank_y
# t = 2, df = 7, p-value = 0.09
# alternative hypothesis: true correlation is not equal to 0
# 95 percent confidence interval:
# -0.11 0.90
# sample estimates:
# cor
# 0.6
x3 <- cor.test(A$rank_x, A$rank_y, method = "spearman")
# Spearman's rank correlation rho
#
# data: A$rank_x and A$rank_y
# S = 48, p-value = 0.1
# alternative hypothesis: true rho is not equal to 0
# sample estimates:
# rho
# 0.6

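Yes, you should use `method = "spearman"` for either ranked or original data. Internally `cor.test()` always ranks its inputs (see `r <- cor(rank(x), rank(y))` in the source below), but ranking data that are already ranks returns the same ranks, so passing pre-ranked data gives exactly the same result as passing the original data.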
As the help file implies, using method=Pearson with rank data conducts a Pearson's correlation test on the ranks, which would follow a t-distribution. However, since the ranks are not continuous variables, this approach is not correct.
getAnywhere(cor.test.default)
A single object matching ‘cor.test.default’ was found
It was found in the following places
registered S3 method for cor.test from namespace stats
namespace:stats
with value
function (x, y, alternative = c("two.sided", "less",
"greater"), method = c("pearson", "kendall",
"spearman"), exact = NULL, conf.level = 0.95, continuity = FALSE,
...)
{
alternative <- match.arg(alternative)
method <- match.arg(method)
DNAME <- paste(deparse1(substitute(x)), "and", deparse1(substitute(y)))
if (!is.numeric(x))
stop("'x' must be a numeric vector")
if (!is.numeric(y))
stop("'y' must be a numeric vector")
if (length(x) != length(y))
stop("'x' and 'y' must have the same length")
OK <- complete.cases(x, y)
x <- x[OK]
y <- y[OK]
n <- length(x)
NVAL <- 0
conf.int <- FALSE
if (method == "pearson") {
if (n < 3L)
stop("not enough finite observations")
method <- "Pearson's product-moment correlation"
names(NVAL) <- "correlation"
r <- cor(x, y)
df <- n - 2L
ESTIMATE <- c(cor = r)
PARAMETER <- c(df = df)
STATISTIC <- c(t = sqrt(df) * r/sqrt(1 - r^2))
if (n > 3) {
if (!missing(conf.level) && (length(conf.level) !=
1 || !is.finite(conf.level) || conf.level < 0 ||
conf.level > 1))
stop("'conf.level' must be a single number between 0 and 1")
conf.int <- TRUE
z <- atanh(r)
sigma <- 1/sqrt(n - 3)
cint <- switch(alternative, less = c(-Inf, z + sigma *
qnorm(conf.level)), greater = c(z - sigma * qnorm(conf.level),
Inf), two.sided = z + c(-1, 1) * sigma * qnorm((1 +
conf.level)/2))
cint <- tanh(cint)
attr(cint, "conf.level") <- conf.level
}
PVAL <- switch(alternative, less = pt(STATISTIC, df),
greater = pt(STATISTIC, df, lower.tail = FALSE),
two.sided = 2 * min(pt(STATISTIC, df), pt(STATISTIC,
df, lower.tail = FALSE)))
}
else {
if (n < 2)
stop("not enough finite observations")
PARAMETER <- NULL
TIES <- (min(length(unique(x)), length(unique(y))) <
n)
if (method == "kendall") {
method <- "Kendall's rank correlation tau"
names(NVAL) <- "tau"
r <- cor(x, y, method = "kendall")
ESTIMATE <- c(tau = r)
if (!is.finite(ESTIMATE)) {
ESTIMATE[] <- NA
STATISTIC <- c(T = NA)
PVAL <- NA
}
else {
if (is.null(exact))
exact <- (n < 50)
if (exact && !TIES) {
q <- round((r + 1) * n * (n - 1)/4)
STATISTIC <- c(T = q)
pkendall <- function(q, n) .Call(C_pKendall,
q, n)
PVAL <- switch(alternative, two.sided = {
if (q > n * (n - 1)/4) p <- 1 - pkendall(q -
1, n) else p <- pkendall(q, n)
min(2 * p, 1)
}, greater = 1 - pkendall(q - 1, n), less = pkendall(q,
n))
}
else {
xties <- table(x[duplicated(x)]) + 1
yties <- table(y[duplicated(y)]) + 1
T0 <- n * (n - 1)/2
T1 <- sum(xties * (xties - 1))/2
T2 <- sum(yties * (yties - 1))/2
S <- r * sqrt((T0 - T1) * (T0 - T2))
v0 <- n * (n - 1) * (2 * n + 5)
vt <- sum(xties * (xties - 1) * (2 * xties +
5))
vu <- sum(yties * (yties - 1) * (2 * yties +
5))
v1 <- sum(xties * (xties - 1)) * sum(yties *
(yties - 1))
v2 <- sum(xties * (xties - 1) * (xties - 2)) *
sum(yties * (yties - 1) * (yties - 2))
var_S <- (v0 - vt - vu)/18 + v1/(2 * n * (n -
1)) + v2/(9 * n * (n - 1) * (n - 2))
if (exact && TIES)
warning("Cannot compute exact p-value with ties")
if (continuity)
S <- sign(S) * (abs(S) - 1)
STATISTIC <- c(z = S/sqrt(var_S))
PVAL <- switch(alternative, less = pnorm(STATISTIC),
greater = pnorm(STATISTIC, lower.tail = FALSE),
two.sided = 2 * min(pnorm(STATISTIC), pnorm(STATISTIC,
lower.tail = FALSE)))
}
}
}
else {
method <- "Spearman's rank correlation rho"
if (is.null(exact))
exact <- TRUE
names(NVAL) <- "rho"
r <- cor(rank(x), rank(y))
ESTIMATE <- c(rho = r)
if (!is.finite(ESTIMATE)) {
ESTIMATE[] <- NA
STATISTIC <- c(S = NA)
PVAL <- NA
}
else {
pspearman <- function(q, n, lower.tail = TRUE) {
if (n <= 1290 && exact)
.Call(C_pRho, round(q) + 2 * lower.tail,
n, lower.tail)
else {
den <- (n * (n^2 - 1))/6
if (continuity)
den <- den + 1
r <- 1 - q/den
pt(r/sqrt((1 - r^2)/(n - 2)), df = n - 2,
lower.tail = !lower.tail)
}
}
q <- (n^3 - n) * (1 - r)/6
STATISTIC <- c(S = q)
if (TIES && exact) {
exact <- FALSE
warning("Cannot compute exact p-value with ties")
}
PVAL <- switch(alternative, two.sided = {
p <- if (q > (n^3 - n)/6) pspearman(q, n, lower.tail = FALSE) else pspearman(q,
n, lower.tail = TRUE)
min(2 * p, 1)
}, greater = pspearman(q, n, lower.tail = TRUE),
less = pspearman(q, n, lower.tail = FALSE))
}
}
}
RVAL <- list(statistic = STATISTIC, parameter = PARAMETER,
p.value = as.numeric(PVAL), estimate = ESTIMATE, null.value = NVAL,
alternative = alternative, method = method, data.name = DNAME)
if (conf.int)
RVAL <- c(RVAL, list(conf.int = cint))
class(RVAL) <- "htest"
RVAL
}
<bytecode: 0x0000018603fa9418>
<environment: namespace:stats>

Related

Binary Logistic Regression with BFGS using package maxLik

I tried to fit a binary logistic regression with BFGS using maxLik. I included the features as shown in the code attached below, but I get output like this:
Maximum Likelihood estimation
BFGS maximization, 0 iterations
*Return code 100: Initial value out of range.
https://docs.google.com/spreadsheets/d/1fVLeJznB9k29FQ_BdvdCF8ztkOwbdFpx/edit?usp=sharing&ouid=109040212946671424093&rtpof=true&sd=true (this is my data)*
library(maxLik)
library(optimx)
data=read_excel("Book2.xlsx")
data$JKLaki = ifelse(data$JK==1,1,0)
data$Daerah_Samarinda<- ifelse(data$Daerah==1,1,0)
data$Prodi2 = ifelse(data$Prodi==2,1,0)
data$Prodi3 = ifelse(data$Prodi==3,1,0)
data$Prodi4 = ifelse(data$Prodi==4,1,0)
str(data)
attach(data)
ll<- function(param){
mu <- param[1]
beta <- param[-1]
y<- as.vector(data$Y)
x<- cbind(1, data$JKLaki, data$IPK, data$Daerah_Samarinda, data$Prodi2, data$Prodi3, data$Prodi4)
xb<- x%*%beta
pi<- exp(xb)
val <- -sum(y * log(pi) + (1 - y) * log(1 - pi),log=TRUE)
return(val)
}
gl<- funtion(param){
mu <- param[1]
beta <- param[-1]
y <- as.vector(data$Y)
x <- cbind(0, data$JKLaki,data$IPK,data$Daerah_Samarinda,data$Prodi2,data$Prodi3,data$Prodi4)
sigma <- x*beta
pi<- exp(sigma)/(1+exp(sigma))
v= y-pi
vx=as.matrix(x)%*%as.vector(v)
gg= colSums(vx)
return(-gg)}
mle<-maxLik(logLik=ll, grad=gl,hess=NULL,
start=c(mu=1, beta1=0, beta2=0, beta3=0, beta4=0, beta5=0, beta6=0,beta7=0), method="BFGS")
summary(mle)
Can I get some help, please? I have tried hard to get a solution.
I have been able to optimize the log-likelihood with the following code :
library(DEoptim)
library(readxl)
data <- read_excel("Book2.xlsx")
data$JKLaki <- ifelse(data$JK == 1, 1, 0)
data$Daerah_Samarinda <- ifelse(data$Daerah == 1, 1, 0)
data$Prodi2 <- ifelse(data$Prodi == 2, 1, 0)
data$Prodi3 <- ifelse(data$Prodi == 3, 1, 0)
data$Prodi4 <- ifelse(data$Prodi == 4, 1, 0)
# Negative log-likelihood of a binary logistic regression, for minimisers
# such as DEoptim() and optim().
#
# param: numeric vector of length 8; param[1] is `mu`, the remaining seven
#        values multiply the columns of the design matrix built below.
# data:  data frame (or list) with the 0/1 response `Y` and the regressors
#        JKLaki, IPK, Daerah_Samarinda, Prodi2, Prodi3 and Prodi4.
#
# Returns the negative log-likelihood, or the large penalty 10^30 when the
# value is not finite, so that the optimiser steers away from that region.
#
# NOTE(review): the design matrix already contains a constant column, so
# `mu` and beta[1] enter the likelihood only through their sum and are not
# separately identified. The parameter layout is kept for compatibility
# with the 8-element bounds used by the callers.
ll <- function(param, data) {
  mu <- param[1]
  beta <- param[-1]
  y <- as.vector(data$Y)
  x <- cbind(1, data$JKLaki, data$IPK, data$Daerah_Samarinda,
             data$Prodi2, data$Prodi3, data$Prodi4)
  eta <- as.vector(mu + x %*% beta)
  # Logistic link: P(Y = 1) = plogis(eta). Working with log.p = TRUE gives
  # log(p) and log(1 - p) directly and avoids log(0) for large |eta|.
  log_p1 <- plogis(eta, log.p = TRUE)
  log_p0 <- plogis(-eta, log.p = TRUE)
  val <- -sum(y * log_p1 + (1 - y) * log_p0)
  # Catch NaN, NA and +/-Inf alike (is.nan() alone misses Inf and NA).
  if (!is.finite(val)) {
    return(10^30)
  }
  val
}
# Stage 1: global search with differential evolution over a wide box,
# [-500, 500] for each of the 8 parameters, for up to 5000 iterations.
lower <- rep(-500, 8)
upper <- rep(500, 8)
obj_DEoptim_Iter1 <- DEoptim(fn = ll, lower = lower, upper = upper,
control = list(itermax = 5000), data = data)
# Stage 2: shrink the box to +/- 25% of the magnitude of each stage-1 best
# value and run differential evolution again inside that narrower region.
lower <- obj_DEoptim_Iter1$optim$bestmem - 0.25 * abs(obj_DEoptim_Iter1$optim$bestmem)
upper <- obj_DEoptim_Iter1$optim$bestmem + 0.25 * abs(obj_DEoptim_Iter1$optim$bestmem)
obj_DEoptim_Iter2 <- DEoptim(fn = ll, lower = lower, upper = upper,
control = list(itermax = 5000), data = data)
# Stage 3: local refinement started from the stage-2 best member. No
# `method` is given, so optim() uses its default algorithm.
obj_Optim <- optim(par = obj_DEoptim_Iter2$optim$bestmem, fn = ll, data = data)
$par
par1 par2 par3 par4 par5 par6 par7
-350.91045436 347.79576145 0.05337466 0.69032735 -0.01089112 0.47465162 0.38284804
par8
0.42125664
$value
[1] 95.08457
$counts
function gradient
501 NA
$convergence
[1] 1
$message
NULL

Obtain Bootstrap Results in Matrix

I have written the following code.
library(quantreg)
# return the g function:
# Evaluate the g function on the grid k = 1, ..., m * N.
#
# m, N:  the grid runs from 1 to m * N.
# gamma: exponent applied to k / (m + k).
#
# Returns sqrt(m) * (1 + k / m) * (k / (m + k))^gamma for every k.
G <- function(m, N, gamma) {
  steps <- seq(1, m * N)
  ratio <- steps / m
  sqrt(m) * (1 + ratio) * (steps / (steps + m))^gamma
}
# Matrix square root via the eigendecomposition of A.
#
# With A = V diag(lambda) V^{-1}, the function forms
# V diag(sqrt(lambda)) V^{-1} and returns the transpose of that product.
#
# A: square numeric matrix. Negative eigenvalues make sqrt() produce NaN
#    (with a warning), so A is expected to have non-negative eigenvalues.
#
# Returns a matrix with the same dimensions as A.
sqroot <- function(A) {
  e <- eigen(A)
  v <- e$vectors
  val <- e$values
  # diag(x) with a length-one x builds an identity matrix of size x instead
  # of a 1 x 1 matrix holding x, so the dimension is stated explicitly to
  # keep 1 x 1 inputs working.
  root_vals <- diag(sqrt(val), nrow = length(val))
  sq <- v %*% root_vals %*% solve(v)
  t(sq)
}
# Boundary values on the grid k = 1, ..., m * N.
#
# m, N: the grid runs from 1 to m * N and is rescaled to t = (m + k) / m.
# a:    constant that enters the boundary as a^2.
#
# Returns sqrt((t - 1) * t * (a^2 + log(t / (t - 1)))) for every grid point.
fa <- function(m, N, a) {
  horizon <- seq(1, m * N)
  scaled <- (horizon + m) / m
  inner <- scaled * (scaled - 1) * (a^2 + log(scaled / (scaled - 1)))
  sqrt(inner)
}
m = 50
N = 2
n= 50*3
x1 = matrix(runif(n, 0, 1), ncol = 1)
x = cbind(1, x1)
beta = c(1, 1)
xb = x %*% beta
pr = 1/(1+exp(-xb))
y = rbinom(n,1,pr)
# calculate statistic:
# Monitoring statistic based on a logistic fit to the first m observations.
#
# y:    0/1 response vector.
# x:    design matrix with two columns whose rows match y.
# m:    number of leading observations used as the training sample.
# N, a: forwarded to fa(), which returns m * N boundary values; the
#       remaining (test) part of the data is therefore expected to have
#       m * N rows.
#
# Returns list(stat = v), where v holds, for each test observation, the
# larger of the two absolute standardised cumulative score components.
stat <- function(y, x, m, N, a) {
  train <- seq_len(m)
  y_train <- y[train]
  x_train <- x[train, , drop = FALSE]
  y_test <- y[-train]
  x_test <- x[-train, , drop = FALSE]
  # Fit on the training rows only. Fitting on the full sample produced n
  # fitted values that were then subtracted from the m training responses,
  # silently recycling y_train.
  fit <- glm(y_train ~ 0 + x_train, family = "binomial")
  beta_hat <- coef(fit)
  p_train <- predict(fit, type = "response")
  sigma <- sqrt(1 / (m - 1) * sum((y_train - p_train)^2))
  Jvalue <- t(x_train) %*% x_train / m * sigma^2
  Jsroot <- sqroot(Jvalue)
  fvalue <- fa(m, N, a)
  # NOTE(review): the residual below uses the linear predictor
  # x_test %*% beta_hat, not the fitted probability -- confirm this is the
  # intended score for a logistic model.
  score1 <- apply(x_test * as.vector(y_test - x_test %*% beta_hat), 2, cumsum)
  # Row i of the standardised score is divided by the i-th boundary value.
  statvalue1 <- t(solve(Jsroot) %*% t(score1)) / fvalue / sqrt(m)
  statmax1 <- pmax(abs(statvalue1[, 1]), abs(statvalue1[, 2]))
  list(stat = statmax1)
}
m =50
N = 2
a = 2.795
value = stat(y, x, m, N, a)
value
I want to perform bootstrap to obtain B = 999 number of statistics. I use the following r code. But it produces an error saying "Error in statistic(data, original, ...) :
argument "m" is missing, with no default"
library(boot)
data1 = data.frame(y = y, x = x1, m = m , N = N, a = a)
head(data1)
boot_value = boot(data1, statistic = stat, R = 999)
Can anyone give me a hint? Also, am I able to get the bootstrap results in a matrix format? Since the stat function gives 100 values.
There are different kinds of bootstrapping. If you want to draw 999 samples from your data with replacement, each the same size as your data, you can simply use replicate; no additional packages are needed.
We put the data to be resampled into a data frame. It looks to me like m, N, a remain constant, so we just provide it as vectors.
data2 <- data.frame(y=y, x=x)
The stat function needs to be adapted to unpack y and the x matrix from the data frame. At the bottom we remove the list call so that just a vector is returned; `unname` strips the names so that we get only the numbers.
# Data-frame version of the monitoring statistic, suitable for resampling.
#
# data: data frame whose first column is the 0/1 response and whose
#       columns 2 and 3 are the two design-matrix columns.
# m:    number of leading rows used as the training sample.
# N, a: forwarded to fa(), which returns m * N boundary values; the
#       remaining (test) rows are therefore expected to number m * N.
#
# Returns an unnamed numeric vector with one value per test row: the larger
# of the two absolute standardised cumulative score components.
stat2 <- function(data, m, N, a) {
  train <- seq_len(m)
  y_train <- data[train, 1]
  x_train <- as.matrix(data[train, 2:3])
  y_test <- data[-train, 1]
  x_test <- as.matrix(data[-train, 2:3])
  # Fit on the training rows only. Fitting on all rows produced one fitted
  # value per row of `data`, which was then subtracted from the m training
  # responses, silently recycling y_train.
  fit <- glm(y_train ~ 0 + x_train, family = "binomial")
  beta_hat <- coef(fit)
  p_train <- predict(fit, type = "response")
  sigma <- sqrt(1 / (m - 1) * sum((y_train - p_train)^2))
  Jvalue <- t(x_train) %*% x_train / m * sigma^2
  Jsroot <- sqroot(Jvalue)
  fvalue <- fa(m, N, a)
  # NOTE(review): the residual below uses the linear predictor
  # x_test %*% beta_hat, not the fitted probability -- confirm this is the
  # intended score for a logistic model.
  score1 <- apply(x_test * as.vector(y_test - x_test %*% beta_hat), 2, cumsum)
  # Row i of the standardised score is divided by the i-th boundary value.
  statvalue1 <- t(solve(Jsroot) %*% t(score1)) / fvalue / sqrt(m)
  statmax1 <- pmax(abs(statvalue1[, 1]), abs(statvalue1[, 2]))
  # Resampled rows carry row names; drop them so only the numbers remain.
  unname(statmax1)
}
replicate is a cousin of sapply, designed for repeated evaluation. In the call we just sample the rows 999 times and already get a matrix back. As in sapply we need to transform our result.
res <- t(replicate(999, stat2(data2[sample(1:nrow(data2), nrow(data2), replace=TRUE), ], m, N, a)))
Result
As result we get 999 bootstrap replications in the rows with 100 attributes in the columns.
str(res)
# num [1:999, 1:100] 0.00205 0.38486 0.10146 0.12726 0.47056 ...
The code also runs quite fast.
user system elapsed
3.46 0.01 3.49
Note, that there are different kinds of bootstrapping. E.g. sometimes just a part of the sample is resampled, weights are used, clustering is applied etc. Since you attempted to use boot the method shown should be the default, though.

Issue with calculating marginal effects for an ordered logit model in R with ocME

I am attempting to estimate an ordered logit model incl. the marginal effects in R through following the code from this tutorial. I am using polr from the MASS package to estimate the model and ocME from the erer package to attempt to calculate the marginal effects.
Estimating the model is no problem.
logitModelSentiment90 <- polr(availability_90_ord ~ mean_sentiment, data = data, Hess = T,
method = "logistic")
However, I run into an issue with ocME which generates the error message below:
ocME(logitModelSentiment90)
Error in eval(predvars, data, env) :
numeric 'envir' arg not of length one
The documentation below for ocME states that the object that should be used needs to come from the polr function which seems to be exactly what I am doing.
ocME(w, rev.dum = TRUE, digits = 3)
w = an ordered probit or logit model object estimated by polr from the MASS library.
So can anybody help me to understand what I am doing wrong? I have published a subset of my data with the two variables for the model here. In R I have the DV set up as a factor variable, the IV is continuous.
Side note:
I can pass the calculation to Stata from R with RStata to calculate the marginal effects without any problems. But I don't want to have to do this on a regular basis so I want to understand what is causing the issue with R and ocME.
stata("ologit availability_90_ord mean_sentiment
mfx", data.in = data)
. ologit availability_90_ord mean_sentiment
Iteration 0: log likelihood = -15379.121
Iteration 1: log likelihood = -15378.742
Iteration 2: log likelihood = -15378.742
Ordered logistic regression Number of obs = 11,901
LR chi2(1) = 0.76
Prob > chi2 = 0.3835
Log likelihood = -15378.742 Pseudo R2 = 0.0000
------------------------------------------------------------------------------
avail~90_ord | Coef. Std. Err. z P>|z| [95% Conf. Interval]
-------------+----------------------------------------------------------------
mean_senti~t | .0044728 .0051353 0.87 0.384 -.0055922 .0145379
-------------+----------------------------------------------------------------
/cut1 | -1.14947 .0441059 -1.235916 -1.063024
/cut2 | -.5286239 .042808 -.6125261 -.4447217
/cut3 | .3127556 .0426782 .2291079 .3964034
------------------------------------------------------------------------------
. mfx
Marginal effects after ologit
y = Pr(availability_90_ord==1) (predict)
= .23446398
------------------------------------------------------------------------------
variable | dy/dx Std. Err. z P>|z| [ 95% C.I. ] X
---------+--------------------------------------------------------------------
mean_s~t | -.0008028 .00092 -0.87 0.384 -.002609 .001004 7.55768
------------------------------------------------------------------------------
Your model has only one explanatory variable (mean_sentiment) and this seems to be a problem for ocME. Try for example to add a second variable to the model:
logitModelSentiment90 <- polr(availability_90_ord ~ mean_sentiment + I(mean_sentiment^2),
data = data, Hess = T, method = "logistic")
ocME(logitModelSentiment90)
# effect.0 effect.1 effect.2 effect.3
# mean_sentiment -0.004 -0.001 0 0.006
# I(mean_sentiment^2) 0.000 0.000 0 0.000
With minor modifications ocME can correctly run also with one independent variable.
Try the following myocME function
# Marginal effects, with standard errors, for an ordered probit or logit
# model fitted by polr(). Effects are evaluated at the column means of the
# regressors.
#
# Arguments:
#   w        model object; must inherit from "polr" with method "probit"
#            or "logistic", otherwise the function stops.
#   rev.dum  if TRUE, for every design-matrix column whose distinct values
#            are exactly 0 and 1, the derivative-based effect is replaced
#            by the change in probability when that column moves 0 -> 1.
#   digits   number of digits used when rounding the reported tables.
#
# Returns an object of class "ocME" built by listn(w, out). `out` holds one
# table per outcome level (effect, error, t.value, p.value) plus the matrix
# of all effects, named "ME.<level>" and "ME.all".
# NOTE(review): listn() is not defined in this function -- presumably it is
# erer::listn, so the erer package must be attached; confirm.
myocME <- function (w, rev.dum = TRUE, digits = 3)
{
if (!inherits(w, "polr")) {
stop("Need an ordered choice model from 'polr()'.\n")
}
if (w$method != "probit" & w$method != "logistic") {
stop("Need a probit or logit model.\n")
}
lev <- w$lev
J <- length(lev)
x.name <- attr(x = w$terms, which = "term.labels")
# drop = FALSE keeps x2 (and x below) two-dimensional even when the model
# has a single term; without it the one-column case collapses to a vector.
x2 <- w$model[, x.name, drop=FALSE]
ww <- paste("~ 1", paste("+", x.name, collapse = " "), collapse = " ")
# Rebuild the design matrix and drop its intercept column.
x <- model.matrix(as.formula(ww), data = x2)[, -1, drop=FALSE]
x.bar <- as.matrix(colMeans(x))
b.est <- as.matrix(coef(w))
K <- nrow(b.est)
# Linear predictor at the regressor means.
xb <- t(x.bar) %*% b.est
# Cut points padded with -10^6 and 10^6 as stand-ins for -Inf and +Inf.
z <- c(-10^6, w$zeta, 10^6)
pfun <- switch(w$method, probit = pnorm, logistic = plogis)
dfun <- switch(w$method, probit = dnorm, logistic = dlogis)
V2 <- vcov(w)
# Pad the covariance matrix with two zero rows/columns for the two padded
# cut points, then reorder rows and columns to: K slopes, lower pad, the
# J - 1 estimated cut points, upper pad. After this, index K + j refers to
# z[j]. (Assumes vcov(w) lists the K slopes before the cut points.)
V3 <- rbind(cbind(V2, 0, 0), 0, 0)
ind <- c(1:K, nrow(V3) - 1, (K + 1):(K + J - 1), nrow(V3))
V4 <- V3[ind, ]
V5 <- V4[, ind]
# Density difference between adjacent cut points for each outcome level;
# multiplied by the slopes it gives the K x J matrix of marginal effects.
f.xb <- dfun(z[1:J] - c(xb)) - dfun(z[2:(J + 1)] - c(xb))
me <- b.est %*% matrix(data = f.xb, nrow = 1)
colnames(me) <- paste("effect", lev, sep = ".")
se <- matrix(0, nrow = K, ncol = J)
# Standard errors per outcome level: dr holds the derivatives of the
# effects with respect to (slopes, z[j], z[j + 1]); the variance is
# dr %*% V %*% t(dr).
for (j in 1:J) {
u1 <- c(z[j] - xb)
u2 <- c(z[j + 1] - xb)
# s1, s2: ratio of the density's derivative to the density, i.e.
# -u for the normal and 1 - 2 * F(u) for the logistic distribution.
if (w$method == "probit") {
s1 <- -u1
s2 <- -u2
}
else {
s1 <- 1 - 2 * pfun(u1)
s2 <- 1 - 2 * pfun(u2)
}
d1 <- dfun(u1) * (diag(1, K, K) - s1 * (b.est %*% t(x.bar)))
d2 <- -1 * dfun(u2) * (diag(1, K, K) - s2 * (b.est %*%
t(x.bar)))
q1 <- dfun(u1) * s1 * b.est
q2 <- -1 * dfun(u2) * s2 * b.est
dr <- cbind(d1 + d2, q1, q2)
V <- V5[c(1:K, K + j, K + j + 1), c(1:K, K + j, K + j +
1)]
cova <- dr %*% V %*% t(dr)
se[, j] <- sqrt(diag(cova))
}
colnames(se) <- paste("SE", lev, sep = ".")
rownames(se) <- colnames(x)
# Optional revision for 0/1 columns: overwrite effect and standard error
# with the discrete change in probability between the column set to 1 and
# set to 0, holding the other columns at their means.
if (rev.dum) {
for (k in 1:K) {
if (identical(sort(unique(x[, k])), c(0, 1))) {
for (j in 1:J) {
x.d1 <- x.bar
x.d1[k, 1] <- 1
x.d0 <- x.bar
x.d0[k, 1] <- 0
ua1 <- z[j] - t(x.d1) %*% b.est
ub1 <- z[j + 1] - t(x.d1) %*% b.est
ua0 <- z[j] - t(x.d0) %*% b.est
ub0 <- z[j + 1] - t(x.d0) %*% b.est
me[k, j] <- pfun(ub1) - pfun(ua1) - (pfun(ub0) -
pfun(ua0))
d1 <- (dfun(ua1) - dfun(ub1)) %*% t(x.d1) -
(dfun(ua0) - dfun(ub0)) %*% t(x.d0)
q1 <- -dfun(ua1) + dfun(ua0)
q2 <- dfun(ub1) - dfun(ub0)
dr <- cbind(d1, q1, q2)
V <- V5[c(1:K, K + j, K + j + 1), c(1:K, K +
j, K + j + 1)]
se[k, j] <- sqrt(c(dr %*% V %*% t(dr)))
}
}
}
}
# Two-sided p-values from a t distribution with the model's residual
# degrees of freedom.
t.value <- me/se
p.value <- 2 * (1 - pt(abs(t.value), w$df.residual))
# One rounded table per outcome level, followed by the rounded matrix of
# all effects.
out <- list()
for (j in 1:J) {
out[[j]] <- round(cbind(effect = me[, j], error = se[,
j], t.value = t.value[, j], p.value = p.value[, j]),
digits)
}
out[[J + 1]] <- round(me, digits)
names(out) <- paste("ME", c(lev, "all"), sep = ".")
result <- listn(w, out)
class(result) <- "ocME"
return(result)
}
and run the following code:
logitModelSentiment90 <- polr(availability_90_ord ~ mean_sentiment,
data = data, Hess = T, method = "logistic")
myocME(logitModelSentiment90)
# effect.0 effect.1 effect.2 effect.3
# mean_sentiment -0.001 0 0 0.001

append values by dataframe row in a loop

I'm running models with various initial values, and I'm trying to append values (3 estimators) by rows to a dataframe in a loop. I assign values to estimators within the loop, but I can't recall them to produce a dataframe.
My code: f is the model for the estimation. Three parameters: alpha, rho, and lambda in the model. I want to output these 3 values.
library("maxLik")
# Log-likelihood of the binary choice model, for use with maxLik().
#
# param: numeric vector c(alpha, rho, lambda).
#
# Reads the columns v_50_1, v_50_2 and gamble from the object `dataset`,
# which must exist in the calling environment. The utility difference is
# 0.5 * v_50_1^alpha - 0.5 * lambda * v_50_2^alpha and is mapped to a
# choice probability by a logistic function with scale rho.
#
# Returns (invisibly, as the value of the final assignment) the summed
# Bernoulli log-likelihood of `gamble`.
f <- function(param) {
  exponent <- param[1]
  scale <- param[2]
  weight <- param[3]
  first_term <- 0.5 * (dataset$v_50_1)^exponent
  second_term <- 0.5 * weight * (dataset$v_50_2)^exponent
  prob <- 1 / (1 + exp(-scale * (first_term - second_term)))
  chosen <- dataset$gamble
  total <- sum(chosen * log(prob) + (1 - chosen) * log(1 - prob))
}
df <- data.frame(alpha = numeric(), rho = numeric(), lambda = numeric())
for (j in 1:20) {
tryCatch({
ml <- maxLik(f, start = c(alpha = runif(1, 0, 2), rho = runif(1, 0, 4), lambda = runif(1,
0, 10)), method = "NM")
alpha[j] <- ml$estimate[1]
rho[j] <- ml$estimate[2]
lambda[j] <- ml$estimate[3]
}, error = function(e) {NA})
}
output <- data.frame(alpha, rho, lambda)
error occurs:
Error in data.frame(alpha, rho, lambda) : object 'alpha' not found
Expected output
alpha rho lambda
0.4 1 2 # estimators append by row.
0.6 1.1 3 # each row has estimators that are estimated
0.7 1.5 4 # by one set of initial values, there are 20
# rows, as the estimation loops for 20 times.
I am running an example, by changing the function f
library("maxLik")
t <- rexp(100, 2)
loglik <- function(theta) log(theta) - theta*t
# Run the estimation from 20 random starting points and collect the three
# estimates of every successful run as one row of `df`.
#
# Each run returns a one-row data frame, or NULL if maxLik() signals an
# error, so a failed start is skipped and does not abort the loop. The
# rows are bound together once at the end rather than growing `df` with
# rbind() on every iteration.
n_starts <- 20
fits <- lapply(seq_len(n_starts), function(j) {
  tryCatch({
    ml <- maxLik(loglik,
                 start = c(alpha = runif(1, 0, 2), rho = runif(1, 0, 4),
                           lambda = runif(1, 0, 10)),
                 method = "NM")
    data.frame(alpha = ml$estimate[1],
               rho = ml$estimate[2],
               lambda = ml$estimate[3])
  }, error = function(e) NULL)
})
fits <- Filter(Negate(is.null), fits)
# Starting from the empty template keeps the three columns (with zero
# rows) even if every start failed.
df <- do.call(
  rbind,
  c(list(data.frame(alpha = numeric(), rho = numeric(), lambda = numeric())),
    fits)
)
> row.names(df) <- NULL
> head(df)
alpha rho lambda
1 2.368739 2.322220 2.007375
2 2.367607 2.322328 2.007093
3 2.368324 2.322105 2.007597
4 2.368515 2.322072 2.007334
5 2.368269 2.322071 2.007142
6 2.367998 2.322438 2.007391

Writing a function for the Cramer Von Mises test

The cvm.test() from dgof package provides a way of doing the one-sample Cramer-von Mises test on discrete distributions, my goal is to develop a function that does the test for continuous distributions as well (like the Kolmogorov-Smirnov ks.test() from the stats package).
Note: this post is concerned only with a null hypothesis that fully specifies the distribution function (df), so please no bootstrapping or Monte Carlo simulation here.
> cvm.test
function (x, y, type = c("W2", "U2", "A2"), simulate.p.value = FALSE,
B = 2000, tol = 1e-08)
{
cvm.pval.disc <- function(STAT, lambda) {
x <- STAT
theta <- function(u) {
VAL <- 0
for (i in 1:length(lambda)) {
VAL <- VAL + 0.5 * atan(lambda[i] * u)
}
return(VAL - 0.5 * x * u)
}
rho <- function(u) {
VAL <- 0
for (i in 1:length(lambda)) {
VAL <- VAL + log(1 + lambda[i]^2 * u^2)
}
VAL <- exp(VAL * 0.25)
return(VAL)
}
fun <- function(u) return(sin(theta(u))/(u * rho(u)))
pval <- 0
try(pval <- 0.5 + integrate(fun, 0, Inf, subdivisions = 1e+06)$value/pi,
silent = TRUE)
if (pval > 0.001)
return(pval)
if (pval <= 0.001) {
df <- sum(lambda != 0)
est1 <- dchisq(STAT/max(lambda), df)
logf <- function(t) {
ans <- -t * STAT
ans <- ans - 0.5 * sum(log(1 - 2 * t * lambda))
return(ans)
}
est2 <- 1
try(est2 <- exp(nlm(logf, 1/(4 * max(lambda)))$minimum),
silent = TRUE)
return(min(est1, est2))
}
}
cvm.stat.disc <- function(x, y, type = c("W2", "U2", "A2")) {
type <- match.arg(type)
I <- knots(y)
N <- length(x)
e <- diff(c(0, N * y(I)))
obs <- rep(0, length(I))
for (j in 1:length(I)) {
obs[j] <- length(which(x == I[j]))
}
S <- cumsum(obs)
T <- cumsum(e)
H <- T/N
p <- e/N
t <- (p + p[c(2:length(p), 1)])/2
Z <- S - T
Zbar <- sum(Z * t)
S0 <- diag(p) - p %*% t(p)
A <- matrix(1, length(p), length(p))
A <- apply(row(A) >= col(A), 2, as.numeric)
E <- diag(t)
One <- rep(1, nrow(E))
K <- diag(0, length(H))
diag(K)[-length(H)] <- 1/(H[-length(H)] * (1 - H[-length(H)]))
Sy <- A %*% S0 %*% t(A)
M <- switch(type, W2 = E, U2 = (diag(1, nrow(E)) - E %*%
One %*% t(One)) %*% E %*% (diag(1, nrow(E)) - One %*%
t(One) %*% E), A2 = E %*% K)
lambda <- eigen(M %*% Sy)$values
STAT <- switch(type, W2 = sum(Z^2 * t)/N, U2 = sum((Z -
Zbar)^2 * t)/N, A2 = sum((Z^2 * t/(H * (1 - H)))[-length(I)])/N)
return(c(STAT, lambda))
}
cvm.pval.disc.sim <- function(STATISTIC, lambda, y, type,
tol, B) {
knots.y <- knots(y)
fknots.y <- y(knots.y)
u <- runif(B * length(x))
u <- sapply(u, function(a) return(knots.y[sum(a > fknots.y) +
1]))
dim(u) <- c(B, length(x))
s <- apply(u, 1, cvm.stat.disc, y, type)
s <- s[1, ]
return(sum(s >= STATISTIC - tol)/B)
}
type <- match.arg(type)
DNAME <- deparse(substitute(x))
if (is.stepfun(y)) {
if (length(setdiff(x, knots(y))) != 0) {
stop("Data are incompatable with null distribution; ",
"Note: This function is meant only for discrete distributions ",
"you may be receiving this error because y is continuous.")
}
tempout <- cvm.stat.disc(x, y, type = type)
STAT <- tempout[1]
lambda <- tempout[2:length(tempout)]
if (!simulate.p.value) {
PVAL <- cvm.pval.disc(STAT, lambda)
}
else {
PVAL <- cvm.pval.disc.sim(STAT, lambda, y, type,
tol, B)
}
METHOD <- paste("Cramer-von Mises -", type)
names(STAT) <- as.character(type)
RVAL <- list(statistic = STAT, p.value = PVAL, alternative = "Two.sided",
method = METHOD, data.name = DNAME)
}
else {
stop("Null distribution must be a discrete.")
}
class(RVAL) <- "htest"
return(RVAL)
}
<environment: namespace:dgof>
Kolmogorov-Smirnov ks.test() from stats package for comparison (note that this function does both the one-sample and two-sample tests):
> ks.test
function (x, y, ..., alternative = c("two.sided", "less", "greater"),
exact = NULL, tol = 1e-08, simulate.p.value = FALSE, B = 2000)
{
pkolmogorov1x <- function(x, n) {
if (x <= 0)
return(0)
if (x >= 1)
return(1)
j <- seq.int(from = 0, to = floor(n * (1 - x)))
1 - x * sum(exp(lchoose(n, j) + (n - j) * log(1 - x -
j/n) + (j - 1) * log(x + j/n)))
}
exact.pval <- function(alternative, STATISTIC, x, n, y, knots.y,
tol) {
ts.pval <- function(S, x, n, y, knots.y, tol) {
f_n <- ecdf(x)
eps <- min(tol, min(diff(knots.y)) * tol)
eps2 <- min(tol, min(diff(y(knots.y))) * tol)
a <- rep(0, n)
b <- a
f_a <- a
for (i in 1:n) {
a[i] <- min(c(knots.y[which(y(knots.y) + S >=
i/n + eps2)[1]], Inf), na.rm = TRUE)
b[i] <- min(c(knots.y[which(y(knots.y) - S >
(i - 1)/n - eps2)[1]], Inf), na.rm = TRUE)
f_a[i] <- ifelse(!(a[i] %in% knots.y), y(a[i]),
y(a[i] - eps))
}
f_b <- y(b)
p <- rep(1, n + 1)
for (i in 1:n) {
tmp <- 0
for (k in 0:(i - 1)) {
tmp <- tmp + choose(i, k) * (-1)^(i - k - 1) *
max(f_b[k + 1] - f_a[i], 0)^(i - k) * p[k +
1]
}
p[i + 1] <- tmp
}
p <- max(0, 1 - p[n + 1])
if (p > 1) {
warning("numerical instability in p-value calculation.")
p <- 1
}
return(p)
}
less.pval <- function(S, n, H, z, tol) {
m <- ceiling(n * (1 - S))
c <- S + (1:m - 1)/n
CDFVAL <- H(sort(z))
for (j in 1:length(c)) {
ifelse((min(abs(c[j] - CDFVAL)) < tol), c[j] <- 1 -
c[j], c[j] <- 1 - CDFVAL[which(order(c(c[j],
CDFVAL)) == 1)])
}
b <- rep(0, m)
b[1] <- 1
for (k in 1:(m - 1)) b[k + 1] <- 1 - sum(choose(k,
1:k - 1) * c[1:k]^(k - 1:k + 1) * b[1:k])
p <- sum(choose(n, 0:(m - 1)) * c^(n - 0:(m - 1)) *
b)
return(p)
}
greater.pval <- function(S, n, H, z, tol) {
m <- ceiling(n * (1 - S))
c <- 1 - (S + (1:m - 1)/n)
CDFVAL <- c(0, H(sort(z)))
for (j in 1:length(c)) {
if (!(min(abs(c[j] - CDFVAL)) < tol))
c[j] <- CDFVAL[which(order(c(c[j], CDFVAL)) ==
1) - 1]
}
b <- rep(0, m)
b[1] <- 1
for (k in 1:(m - 1)) b[k + 1] <- 1 - sum(choose(k,
1:k - 1) * c[1:k]^(k - 1:k + 1) * b[1:k])
p <- sum(choose(n, 0:(m - 1)) * c^(n - 0:(m - 1)) *
b)
return(p)
}
p <- switch(alternative, two.sided = ts.pval(STATISTIC,
x, n, y, knots.y, tol), less = less.pval(STATISTIC,
n, y, knots.y, tol), greater = greater.pval(STATISTIC,
n, y, knots.y, tol))
return(p)
}
sim.pval <- function(alternative, STATISTIC, x, n, y, knots.y,
tol, B) {
fknots.y <- y(knots.y)
u <- runif(B * length(x))
u <- sapply(u, function(a) return(knots.y[sum(a > fknots.y) +
1]))
dim(u) <- c(B, length(x))
getks <- function(a, knots.y, fknots.y) {
dev <- c(0, ecdf(a)(knots.y) - fknots.y)
STATISTIC <- switch(alternative, two.sided = max(abs(dev)),
greater = max(dev), less = max(-dev))
return(STATISTIC)
}
s <- apply(u, 1, getks, knots.y, fknots.y)
return(sum(s >= STATISTIC - tol)/B)
}
alternative <- match.arg(alternative)
DNAME <- deparse(substitute(x))
x <- x[!is.na(x)]
n <- length(x)
if (n < 1L)
stop("not enough 'x' data")
PVAL <- NULL
if (is.numeric(y)) {
DNAME <- paste(DNAME, "and", deparse(substitute(y)))
y <- y[!is.na(y)]
n.x <- as.double(n)
n.y <- length(y)
if (n.y < 1L)
stop("not enough 'y' data")
if (is.null(exact))
exact <- (n.x * n.y < 10000)
METHOD <- "Two-sample Kolmogorov-Smirnov test"
TIES <- FALSE
n <- n.x * n.y/(n.x + n.y)
w <- c(x, y)
z <- cumsum(ifelse(order(w) <= n.x, 1/n.x, -1/n.y))
if (length(unique(w)) < (n.x + n.y)) {
warning("cannot compute correct p-values with ties")
z <- z[c(which(diff(sort(w)) != 0), n.x + n.y)]
TIES <- TRUE
}
STATISTIC <- switch(alternative, two.sided = max(abs(z)),
greater = max(z), less = -min(z))
nm_alternative <- switch(alternative, two.sided = "two-sided",
less = "the CDF of x lies below that of y", greater = "the CDF of x lies above that of y")
if (exact && (alternative == "two.sided") && !TIES)
PVAL <- 1 - .C("psmirnov2x", p = as.double(STATISTIC),
as.integer(n.x), as.integer(n.y), PACKAGE = "dgof")$p
}
else if (is.stepfun(y)) {
z <- knots(y)
if (is.null(exact))
exact <- (n <= 30)
if (exact && n > 30) {
warning("numerical instability may affect p-value")
}
METHOD <- "One-sample Kolmogorov-Smirnov test"
dev <- c(0, ecdf(x)(z) - y(z))
STATISTIC <- switch(alternative, two.sided = max(abs(dev)),
greater = max(dev), less = max(-dev))
if (simulate.p.value) {
PVAL <- sim.pval(alternative, STATISTIC, x, n, y,
z, tol, B)
}
else {
PVAL <- switch(exact, `TRUE` = exact.pval(alternative,
STATISTIC, x, n, y, z, tol), `FALSE` = NULL)
}
nm_alternative <- switch(alternative, two.sided = "two-sided",
less = "the CDF of x lies below the null hypothesis",
greater = "the CDF of x lies above the null hypothesis")
}
else {
if (is.character(y))
y <- get(y, mode = "function")
if (mode(y) != "function")
stop("'y' must be numeric or a string naming a valid function")
if (is.null(exact))
exact <- (n < 100)
METHOD <- "One-sample Kolmogorov-Smirnov test"
TIES <- FALSE
if (length(unique(x)) < n) {
warning(paste("default ks.test() cannot compute correct p-values with ties;\n",
"see help page for one-sample Kolmogorov test for discrete distributions."))
TIES <- TRUE
}
x <- y(sort(x), ...) - (0:(n - 1))/n
STATISTIC <- switch(alternative, two.sided = max(c(x,
1/n - x)), greater = max(1/n - x), less = max(x))
if (exact && !TIES) {
PVAL <- if (alternative == "two.sided")
1 - .C("pkolmogorov2x", p = as.double(STATISTIC),
as.integer(n), PACKAGE = "dgof")$p
else 1 - pkolmogorov1x(STATISTIC, n)
}
nm_alternative <- switch(alternative, two.sided = "two-sided",
less = "the CDF of x lies below the null hypothesis",
greater = "the CDF of x lies above the null hypothesis")
}
names(STATISTIC) <- switch(alternative, two.sided = "D",
greater = "D^+", less = "D^-")
pkstwo <- function(x, tol = 1e-06) {
if (is.numeric(x))
x <- as.vector(x)
else stop("argument 'x' must be numeric")
p <- rep(0, length(x))
p[is.na(x)] <- NA
IND <- which(!is.na(x) & (x > 0))
if (length(IND)) {
p[IND] <- .C("pkstwo", as.integer(length(x[IND])),
p = as.double(x[IND]), as.double(tol), PACKAGE = "dgof")$p
}
return(p)
}
if (is.null(PVAL)) {
PVAL <- ifelse(alternative == "two.sided", 1 - pkstwo(sqrt(n) *
STATISTIC), exp(-2 * n * STATISTIC^2))
}
RVAL <- list(statistic = STATISTIC, p.value = PVAL, alternative = nm_alternative,
method = METHOD, data.name = DNAME)
class(RVAL) <- "htest"
return(RVAL)
}
<environment: namespace:dgof>

Resources