multinomial logit regression by hand in R - r

I am trying to implement multinomial logit regression in R by hand, using my own code and optim, rather than a package such as mlogit or multinom.
rm(list= ls())
data = read.table("~/Desktop/R Code/textfiles/keane.csv", sep = ",",header = T)
data1 = data[,c("educ","exper", "expersq", "black", "status")]
data1 = na.omit(data1)
data2 = as.matrix(data1)
y_1 = rep(0, nrow(data2))
y_2 = rep(0, nrow(data2))
y_3 = rep(0, nrow(data2))
data2 = cbind(data2[,1:5], y_1, y_2, y_3)
data2[,6] = ifelse(data2[,5] == 1, 1, 0)
data2[,7] = ifelse(data2[,5] == 2, 1, 0)
data2[,8] = ifelse(data2[,5] == 3, 1, 0)
int = rep(1, nrow(data2)) #intercept
data2 = cbind(int, data2[,c(1:4,6,7,8)])
X = as.matrix(data2[, c(1:5)])
y_1 = as.matrix(data2[, 6]) #replace y values(status = 1)
y_2 = as.matrix(data2[, 7]) #replace y values(status = 2)
y_3 = as.matrix(data2[, 8]) #replace y values(status = 3)
Y = cbind(y_1, y_2, y_3)
## starting values for beta
beta = solve(t(X) %*% X) %*% t(X) %*% Y #LPM (linear probability model) coefficients
logit.nll = function (beta, X, Y) {
  P = as.matrix(rowSums(exp(X %*% beta)))  # Sum_(h=1)^3 exp(X * Beta_(h))
  Pr_1 = exp(X %*% beta[,2])/(1 + P)       # P(y = 2 | X)
  Pr_2 = exp(X %*% beta[,3])/(1 + P)       # P(y = 3 | X)
  Pr_0 = 1/(1 + P)                         # P(y = 1 | X)
  colSums(Y[,1] * log(Pr_0)) + colSums(Y[,2] * log(Pr_1)) + colSums(Y[,3] * log(Pr_2))  # log-likelihood
}
optim(beta, logit.nll, X = X, Y = Y, method = "BFGS")
When I run this code, it gives me the error "Error in X %*% beta : non-conformable arguments". My approach might be fundamentally wrong, or the implementation of the log-likelihood function might be wrong. Can I get some help fixing this code?

I'm not very familiar with your data or exactly what you are trying to estimate, but the error comes from optim working with a vector: it flattens the beta matrix you pass as starting values, so you need to coerce it back into a matrix inside the function. Let's say your data is like this:
set.seed(111)
data = iris
X = model.matrix(~.,data=data[,1:4])
Y = model.matrix(~0+Species,data=data)
beta = solve(t(X) %*% X) %*% t(X) %*% Y
Now we add the matrix coercion. Note also that by default optim performs minimization (https://stat.ethz.ch/R-manual/R-devel/library/stats/html/optim.html), so you need to return the negative of the log-likelihood:
logit.nll = function (beta, X, Y) {
  beta = matrix(beta, ncol = 3)
  P = as.matrix(rowSums(exp(X %*% beta)))  # Sum_(h=1)^3 exp(X * Beta_(h))
  Pr_1 = exp(X %*% beta[,2])/(1 + P)       # P(y = 2 | X)
  Pr_2 = exp(X %*% beta[,3])/(1 + P)       # P(y = 3 | X)
  Pr_0 = 1/(1 + P)                         # P(y = 1 | X)
  LL = colSums(Y[,1] * log(Pr_0)) + colSums(Y[,2] * log(Pr_1)) + colSums(Y[,3] * log(Pr_2))  # log-likelihood
  print(LL)
  return(-LL)
}
res = optim(beta, logit.nll, X = X, Y = Y, method = "BFGS")
res
$par
Speciessetosa Speciesversicolor Speciesvirginica
(Intercept) -2.085162 15.040679 -27.60634
Sepal.Length -4.649971 -8.971237 -11.43702
Sepal.Width -9.286757 -5.016616 -11.69764
Petal.Length 12.803070 17.125483 26.55641
Petal.Width 6.025760 3.342659 21.63200
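With the fitted coefficients in hand, predicted class probabilities can be recovered with the same formulas used inside logit.nll. This is just a usage sketch, not part of the original answer; it assumes the X, Y and res objects defined above (as the output shows, optim returns par in the same matrix shape it was given).
beta_hat = matrix(res$par, ncol = 3)            # coerce explicitly, as inside logit.nll
P = rowSums(exp(X %*% beta_hat))                # Sum_(h=1)^3 exp(X * Beta_(h))
Pr_0 = 1/(1 + P)                                # probability of the class in Y[,1] (setosa)
Pr_1 = exp(X %*% beta_hat[,2])/(1 + P)          # probability of the class in Y[,2] (versicolor)
Pr_2 = exp(X %*% beta_hat[,3])/(1 + P)          # probability of the class in Y[,3] (virginica)
head(cbind(Pr_0, Pr_1, Pr_2))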

Related

Binary Logistic Regression with BFGS using package maxLik

I tried binary logistic regression with BFGS using maxLik, with the features included as per the syntax attached below, but I get output like this:
Maximum Likelihood estimation
BFGS maximization, 0 iterations
Return code 100: Initial value out of range.
https://docs.google.com/spreadsheets/d/1fVLeJznB9k29FQ_BdvdCF8ztkOwbdFpx/edit?usp=sharing&ouid=109040212946671424093&rtpof=true&sd=true (this is my data)
library(maxLik)
library(optimx)
library(readxl)
data = read_excel("Book2.xlsx")
data$JKLaki = ifelse(data$JK==1,1,0)
data$Daerah_Samarinda<- ifelse(data$Daerah==1,1,0)
data$Prodi2 = ifelse(data$Prodi==2,1,0)
data$Prodi3 = ifelse(data$Prodi==3,1,0)
data$Prodi4 = ifelse(data$Prodi==4,1,0)
str(data)
attach(data)
ll <- function(param){
  mu <- param[1]
  beta <- param[-1]
  y <- as.vector(data$Y)
  x <- cbind(1, data$JKLaki, data$IPK, data$Daerah_Samarinda, data$Prodi2, data$Prodi3, data$Prodi4)
  xb <- x %*% beta
  pi <- exp(xb)
  val <- -sum(y * log(pi) + (1 - y) * log(1 - pi))
  return(val)
}
gl <- function(param){
  mu <- param[1]
  beta <- param[-1]
  y <- as.vector(data$Y)
  x <- cbind(0, data$JKLaki, data$IPK, data$Daerah_Samarinda, data$Prodi2, data$Prodi3, data$Prodi4)
  sigma <- x * beta
  pi <- exp(sigma)/(1 + exp(sigma))
  v = y - pi
  vx = as.matrix(x) %*% as.vector(v)
  gg = colSums(vx)
  return(-gg)
}
mle<-maxLik(logLik=ll, grad=gl,hess=NULL,
start=c(mu=1, beta1=0, beta2=0, beta3=0, beta4=0, beta5=0, beta6=0,beta7=0), method="BFGS")
summary(mle)
Can I get some help? I have tried but could not get a solution.
I have been able to optimize the log-likelihood with the following code:
library(DEoptim)
library(readxl)
data <- read_excel("Book2.xlsx")
data$JKLaki <- ifelse(data$JK == 1, 1, 0)
data$Daerah_Samarinda <- ifelse(data$Daerah == 1, 1, 0)
data$Prodi2 <- ifelse(data$Prodi == 2, 1, 0)
data$Prodi3 <- ifelse(data$Prodi == 3, 1, 0)
data$Prodi4 <- ifelse(data$Prodi == 4, 1, 0)
ll <- function(param, data)
{
  mu <- param[1]
  beta <- param[-1]
  y <- as.vector(data$Y)
  x <- cbind(1, data$JKLaki, data$IPK, data$Daerah_Samarinda, data$Prodi2, data$Prodi3, data$Prodi4)
  xb <- x %*% beta
  pi <- exp(mu + xb)
  val <- -sum(y * log(pi) + (1 - y) * log(1 - pi))
  if(is.nan(val) == TRUE)
  {
    return(10 ^ 30)
  } else
  {
    return(val)
  }
}
lower <- rep(-500, 8)
upper <- rep(500, 8)
obj_DEoptim_Iter1 <- DEoptim(fn = ll, lower = lower, upper = upper,
control = list(itermax = 5000), data = data)
lower <- obj_DEoptim_Iter1$optim$bestmem - 0.25 * abs(obj_DEoptim_Iter1$optim$bestmem)
upper <- obj_DEoptim_Iter1$optim$bestmem + 0.25 * abs(obj_DEoptim_Iter1$optim$bestmem)
obj_DEoptim_Iter2 <- DEoptim(fn = ll, lower = lower, upper = upper,
control = list(itermax = 5000), data = data)
obj_Optim <- optim(par = obj_DEoptim_Iter2$optim$bestmem, fn = ll, data = data)
$par
par1 par2 par3 par4 par5 par6 par7
-350.91045436 347.79576145 0.05337466 0.69032735 -0.01089112 0.47465162 0.38284804
par8
0.42125664
$value
[1] 95.08457
$counts
function gradient
501 NA
$convergence
[1] 1
$message
NULL
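For reference, and not part of the original answer, the standard logistic model on the same covariates can be fit with glm (assuming Y is the 0/1 response used in ll and the dummy columns created above). Note that glm uses the logistic link pi = exp(mu + xb)/(1 + exp(mu + xb)) rather than pi = exp(mu + xb), so its estimates need not coincide with the hand-coded likelihood:
# standard logistic regression as a reference point (sketch, not the original solution)
fit_glm <- glm(Y ~ JKLaki + IPK + Daerah_Samarinda + Prodi2 + Prodi3 + Prodi4,
               family = binomial, data = data)
summary(fit_glm)$coefficients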

Using `cor.test()` on ranked data

I would like to do a Spearman correlation test using rank data. How can I do this with cor.test()? I don't want the function to rerank the data.
Additionally, what form does the data need to be in? From the help, it seems to be the raw data as compared to a correlation matrix.
Consider this example
## Hollander & Wolfe (1973), p. 187f.
## Assessment of tuna quality. We compare the Hunter L measure of
## lightness to the averages of consumer panel scores (recoded as
## integer values from 1 to 6 and averaged over 80 such values) in
## 9 lots of canned tuna.
library(tidyverse)
A <- tibble(
  x = c(44.4, 45.9, 41.9, 53.3, 44.7, 44.1, 50.7, 45.2, 60.1),
  y = c( 2.6,  3.1,  2.5,  5.0,  3.6,  4.0,  5.2,  2.8,  3.8)
) %>%
  mutate(rank_x = rank(x),
         rank_y = rank(y))
Spearman's correlation coefficient is defined as Pearson's correlation between ranked variables
cor(A$x, A$y, method = "spearman")
#[1] 0.6
cor(A$rank_x, A$rank_y, method = "pearson")
#[1] 0.6
What about cor.test()? Can I use the rank data as its input?
x1 <- cor.test(A$x, A$y, method = "spearman")
x1
# Spearman's rank correlation rho
#
# data: A$x and A$y
# S = 48, p-value = 0.1
# alternative hypothesis: true rho is not equal to 0
# sample estimates:
# rho
# 0.6
x2 <- cor.test(A$rank_x, A$rank_y, method = "pearson")
x2
# Pearson's product-moment correlation
# data: A$rank_x and A$rank_y
# t = 2, df = 7, p-value = 0.09
# alternative hypothesis: true correlation is not equal to 0
# 95 percent confidence interval:
# -0.11 0.90
# sample estimates:
# cor
# 0.6
x3 <- cor.test(A$rank_x, A$rank_y, method = "spearman")
# Spearman's rank correlation rho
#
# data: A$rank_x and A$rank_y
# S = 48, p-value = 0.1
# alternative hypothesis: true rho is not equal to 0
# sample estimates:
# rho
# 0.6
Yes, you should use method = "spearman" whether you pass the ranked or the original data. If rank data is supplied, ranking it again inside the function leaves it unchanged, so the result is the same.
As the help file implies, using method = "pearson" with rank data conducts a Pearson correlation test on the ranks, which relies on a t-distribution. However, since ranks are not continuous variables, that approach is not correct.
getAnywhere(cor.test.default)
A single object matching ‘cor.test.default’ was found
It was found in the following places
registered S3 method for cor.test from namespace stats
namespace:stats
with value
function (x, y, alternative = c("two.sided", "less",
"greater"), method = c("pearson", "kendall",
"spearman"), exact = NULL, conf.level = 0.95, continuity = FALSE,
...)
{
alternative <- match.arg(alternative)
method <- match.arg(method)
DNAME <- paste(deparse1(substitute(x)), "and", deparse1(substitute(y)))
if (!is.numeric(x))
stop("'x' must be a numeric vector")
if (!is.numeric(y))
stop("'y' must be a numeric vector")
if (length(x) != length(y))
stop("'x' and 'y' must have the same length")
OK <- complete.cases(x, y)
x <- x[OK]
y <- y[OK]
n <- length(x)
NVAL <- 0
conf.int <- FALSE
if (method == "pearson") {
if (n < 3L)
stop("not enough finite observations")
method <- "Pearson's product-moment correlation"
names(NVAL) <- "correlation"
r <- cor(x, y)
df <- n - 2L
ESTIMATE <- c(cor = r)
PARAMETER <- c(df = df)
STATISTIC <- c(t = sqrt(df) * r/sqrt(1 - r^2))
if (n > 3) {
if (!missing(conf.level) && (length(conf.level) !=
1 || !is.finite(conf.level) || conf.level < 0 ||
conf.level > 1))
stop("'conf.level' must be a single number between 0 and 1")
conf.int <- TRUE
z <- atanh(r)
sigma <- 1/sqrt(n - 3)
cint <- switch(alternative, less = c(-Inf, z + sigma *
qnorm(conf.level)), greater = c(z - sigma * qnorm(conf.level),
Inf), two.sided = z + c(-1, 1) * sigma * qnorm((1 +
conf.level)/2))
cint <- tanh(cint)
attr(cint, "conf.level") <- conf.level
}
PVAL <- switch(alternative, less = pt(STATISTIC, df),
greater = pt(STATISTIC, df, lower.tail = FALSE),
two.sided = 2 * min(pt(STATISTIC, df), pt(STATISTIC,
df, lower.tail = FALSE)))
}
else {
if (n < 2)
stop("not enough finite observations")
PARAMETER <- NULL
TIES <- (min(length(unique(x)), length(unique(y))) <
n)
if (method == "kendall") {
method <- "Kendall's rank correlation tau"
names(NVAL) <- "tau"
r <- cor(x, y, method = "kendall")
ESTIMATE <- c(tau = r)
if (!is.finite(ESTIMATE)) {
ESTIMATE[] <- NA
STATISTIC <- c(T = NA)
PVAL <- NA
}
else {
if (is.null(exact))
exact <- (n < 50)
if (exact && !TIES) {
q <- round((r + 1) * n * (n - 1)/4)
STATISTIC <- c(T = q)
pkendall <- function(q, n) .Call(C_pKendall,
q, n)
PVAL <- switch(alternative, two.sided = {
if (q > n * (n - 1)/4) p <- 1 - pkendall(q -
1, n) else p <- pkendall(q, n)
min(2 * p, 1)
}, greater = 1 - pkendall(q - 1, n), less = pkendall(q,
n))
}
else {
xties <- table(x[duplicated(x)]) + 1
yties <- table(y[duplicated(y)]) + 1
T0 <- n * (n - 1)/2
T1 <- sum(xties * (xties - 1))/2
T2 <- sum(yties * (yties - 1))/2
S <- r * sqrt((T0 - T1) * (T0 - T2))
v0 <- n * (n - 1) * (2 * n + 5)
vt <- sum(xties * (xties - 1) * (2 * xties +
5))
vu <- sum(yties * (yties - 1) * (2 * yties +
5))
v1 <- sum(xties * (xties - 1)) * sum(yties *
(yties - 1))
v2 <- sum(xties * (xties - 1) * (xties - 2)) *
sum(yties * (yties - 1) * (yties - 2))
var_S <- (v0 - vt - vu)/18 + v1/(2 * n * (n -
1)) + v2/(9 * n * (n - 1) * (n - 2))
if (exact && TIES)
warning("Cannot compute exact p-value with ties")
if (continuity)
S <- sign(S) * (abs(S) - 1)
STATISTIC <- c(z = S/sqrt(var_S))
PVAL <- switch(alternative, less = pnorm(STATISTIC),
greater = pnorm(STATISTIC, lower.tail = FALSE),
two.sided = 2 * min(pnorm(STATISTIC), pnorm(STATISTIC,
lower.tail = FALSE)))
}
}
}
else {
method <- "Spearman's rank correlation rho"
if (is.null(exact))
exact <- TRUE
names(NVAL) <- "rho"
r <- cor(rank(x), rank(y))
ESTIMATE <- c(rho = r)
if (!is.finite(ESTIMATE)) {
ESTIMATE[] <- NA
STATISTIC <- c(S = NA)
PVAL <- NA
}
else {
pspearman <- function(q, n, lower.tail = TRUE) {
if (n <= 1290 && exact)
.Call(C_pRho, round(q) + 2 * lower.tail,
n, lower.tail)
else {
den <- (n * (n^2 - 1))/6
if (continuity)
den <- den + 1
r <- 1 - q/den
pt(r/sqrt((1 - r^2)/(n - 2)), df = n - 2,
lower.tail = !lower.tail)
}
}
q <- (n^3 - n) * (1 - r)/6
STATISTIC <- c(S = q)
if (TIES && exact) {
exact <- FALSE
warning("Cannot compute exact p-value with ties")
}
PVAL <- switch(alternative, two.sided = {
p <- if (q > (n^3 - n)/6) pspearman(q, n, lower.tail = FALSE) else pspearman(q,
n, lower.tail = TRUE)
min(2 * p, 1)
}, greater = pspearman(q, n, lower.tail = TRUE),
less = pspearman(q, n, lower.tail = FALSE))
}
}
}
RVAL <- list(statistic = STATISTIC, parameter = PARAMETER,
p.value = as.numeric(PVAL), estimate = ESTIMATE, null.value = NVAL,
alternative = alternative, method = method, data.name = DNAME)
if (conf.int)
RVAL <- c(RVAL, list(conf.int = cint))
class(RVAL) <- "htest"
RVAL
}
<bytecode: 0x0000018603fa9418>
<environment: namespace:stats>
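A small follow-up illustration (not in the original answer): the non-exact branch of pspearman above is exactly the t approximation, so asking cor.test for an asymptotic p-value should essentially reproduce the Pearson-on-ranks result from x2, while the estimate stays rho = 0.6.
# Spearman test with the asymptotic (t-based) p-value instead of the exact one
cor.test(A$x, A$y, method = "spearman", exact = FALSE)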

Obtain Bootstrap Results in Matrix

I have written the following code.
library(quantreg)
# return the g function:
G = function(m, N, gamma) {
  Tm = m * N
  k = 1:Tm
  Gvalue = sqrt(m) * (1 + k/m) * (k/(m + k))^gamma
  return(Gvalue)
}
sqroot <- function(A) {
  e = eigen(A)
  v = e$vectors
  val = e$values
  sq = v %*% diag(sqrt(val)) %*% solve(v)
  return(t(sq))
}
fa = function(m, N, a) {
  Tm = m * N
  k = 1:Tm
  t = (m + k)/m
  f_value = (t - 1) * t * (a^2 + log(t/(t - 1)))
  return(sqrt(f_value))
}
m = 50
N = 2
n= 50*3
x1 = matrix(runif(n, 0, 1), ncol = 1)
x = cbind(1, x1)
beta = c(1, 1)
xb = x %*% beta
pr = 1/(1+exp(-xb))
y = rbinom(n,1,pr)
# calculate statistic:
stat = function(y, x, m, N, a) {
  y_train = y[1:m]
  x_train = x[(1:m),]
  y_test = y[-(1:m)]
  x_test = x[-(1:m),]
  fit = glm(y ~ 0 + x, family = "binomial")
  coef = coef(fit)
  log_predict = predict(fit, type = "response")
  sigma = sqrt(1/(m-1) * sum((y_train - log_predict)^2))
  Jvalue = t(x_train) %*% x_train/m * sigma^2
  Jsroot = sqroot(Jvalue)
  fvalue = fa(m, N, a)
  score1 = apply((x_test * as.vector((y_test - x_test %*% coef))), 2, cumsum)
  statvalue1 = t(solve(Jsroot) %*% t(score1))/fvalue/sqrt(m)
  statmax1 = pmax(abs(statvalue1[, 1]), abs(statvalue1[, 2]))
  result = list(stat = statmax1)
  return(result)
}
m =50
N = 2
a = 2.795
value = stat(y, x, m, N, a)
value
I want to perform a bootstrap to obtain B = 999 statistics. I use the following R code, but it produces the error "Error in statistic(data, original, ...) : argument "m" is missing, with no default".
library(boot)
data1 = data.frame(y = y, x = x1, m = m , N = N, a = a)
head(data1)
boot_value = boot(data1, statistic = stat, R = 999)
Can anyone give me a hint? Also, can I get the bootstrap results in matrix format, given that the stat function returns 100 values?
There are different kinds of bootstrapping. If you want to draw 999 samples from your data, with replacement and of the same size as your data, you can simply use replicate; no package is needed.
We put the data to be resampled into a data frame. It looks to me like m, N and a remain constant, so we just pass them along as they are.
data2 <- data.frame(y=y, x=x)
The stat function needs to be adapted to unpack y and the x matrix from the resampled data. At the bottom we remove the list() call to get just a vector back; unname() gives us plain numbers.
stat2 <- function(data, m, N, a) {
  y_train <- data[1:m, 1]
  x_train <- as.matrix(data[1:m, 2:3])
  y_test <- data[-(1:m), 1]
  x_test <- as.matrix(data[-(1:m), 2:3])
  y <- data[, "y"]
  x <- as.matrix(data[, 2:3])
  fit <- glm(y ~ 0 + x, family = "binomial")
  coef <- coef(fit)
  log_predict <- predict(fit, type = "response")
  sigma <- sqrt(1/(m-1) * sum((y_train - log_predict)^2))
  Jvalue <- t(x_train) %*% x_train/m * sigma^2
  Jsroot <- sqroot(Jvalue)
  fvalue <- fa(m, N, a)
  score1 <- apply((x_test * as.vector((y_test - x_test %*% coef))), 2, cumsum)
  statvalue1 <- t(solve(Jsroot) %*% t(score1))/fvalue/sqrt(m)
  statmax1 <- pmax(abs(statvalue1[, 1]), abs(statvalue1[, 2]))
  result <- unname(statmax1)
  return(result)
}
replicate is a cousin of sapply, designed for repeated evaluation. In the call we just sample the rows 999 times and already get a matrix back; as with sapply, we need to transpose the result.
res <- t(replicate(999, stat2(data2[sample(1:nrow(data2), nrow(data2), replace=TRUE), ], m, N, a)))
Result
As a result we get 999 bootstrap replications in the rows, with 100 statistics in the columns.
str(res)
# num [1:999, 1:100] 0.00205 0.38486 0.10146 0.12726 0.47056 ...
The code also runs quite fast.
user system elapsed
3.46 0.01 3.49
Note that there are different kinds of bootstrapping; e.g. sometimes only part of the sample is resampled, weights are used, clustering is applied, etc. Since you attempted to use boot, though, the method shown should correspond to its default (ordinary) resampling.
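If you do want to stay with the boot package, the error arises because boot calls statistic(data, indices); a minimal sketch (assuming the data2 and stat2 objects defined above) wraps stat2 so that it receives the resampled rows:
library(boot)
# boot passes the data and an index vector; stat2 only needs the resampled rows
boot_value <- boot(data = data2,
                   statistic = function(d, i) stat2(d[i, ], m, N, a),
                   R = 999)
dim(boot_value$t)  # 999 x 100: one row per bootstrap replication
The matrix of bootstrap statistics is then available directly as boot_value$t.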

How to speed up the process of nonlinear optimization in R

Consider the following example of a nonlinear optimization problem. The procedure is too slow to use in simulation studies; in my case it takes 2.5 hours for a single replication. How can the process be sped up so that the computation time becomes manageable?
library(mvtnorm)
library(alabama)
n = 200
X <- matrix(0, nrow = n, ncol = 2)
X[,1:2] <- rmvnorm(n = n, mean = c(0,0), sigma = matrix(c(1,1,1,4),
ncol = 2))
x0 = matrix(c(X[1,1:2]), nrow = 1)
y0 = x0 - 0.5 * log(n) * (colMeans(X) - x0)
X = rbind(X, y0)
x01 = y0[1]
x02 = y0[2]
x1 = X[,1]
x2 = X[,2]
pInit = matrix(rep(0.1, n + 1), nrow = n + 1)
outopt = list(kkt2.check=FALSE, "trace" = FALSE)
f1 <- function(p) sum(sqrt(pmax(0, p)))/sqrt(n+1)
heq1 <- function(p) c(sum(x1 * p) - x01, sum(x2 * p) - x02, sum(p) - 1)
hin1 <- function(p) p - 1e-06
sol <- alabama::auglag(pInit, fn = function(p) -f1(p),
heq = heq1, hin = hin1,
control.outer = outopt)
-1 * sol$value
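One change that often helps in problems like this, offered as a sketch rather than a benchmarked fix: by default auglag differentiates fn, heq and hin numerically in n + 1 = 201 dimensions, which tends to dominate the run time, and alabama's auglag accepts analytic derivatives through its gr, heq.jac and hin.jac arguments. The helper names below are hypothetical, and the small clamp in the gradient is an assumption to avoid dividing by zero where pmax(0, p) is flat.
grad_negf1 <- function(p) -1/(2 * sqrt(pmax(p, 1e-12)) * sqrt(n + 1))  # gradient of -f1 (clamped; assumed)
heq1.jac <- function(p) rbind(x1, x2, rep(1, n + 1))                   # Jacobian of the three equality constraints
hin1.jac <- function(p) diag(n + 1)                                    # Jacobian of p - 1e-06
sol <- alabama::auglag(pInit, fn = function(p) -f1(p), gr = grad_negf1,
                       heq = heq1, heq.jac = heq1.jac,
                       hin = hin1, hin.jac = hin1.jac,
                       control.outer = outopt)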

Why is my logistic regression implementation so slow?

Here is an implementation of the batch gradient descent algorithm in R (theoretical details here):
logreg = function(y, x) {
  x = as.matrix(x)
  x = apply(x, 2, scale)
  x = cbind(1, x)
  m = nrow(x)
  n = ncol(x)
  alpha = 2/m
  # b = matrix(rnorm(n))
  # b = matrix(summary(lm(y~x))$coef[, 1])
  b = matrix(rep(0, n))
  h = 1 / (1 + exp(-x %*% b))
  J = -(t(y) %*% log(h) + t(1-y) %*% log(1 - h))
  derivJ = t(x) %*% (h - y)
  niter = 0
  while(1) {
    niter = niter + 1
    newb = b - alpha * derivJ
    h = 1 / (1 + exp(-x %*% newb))
    newJ = -(t(y) %*% log(h) + t(1-y) %*% log(1 - h))
    while((newJ - J) >= 0) {
      print("inner while...")
      # step adjust
      alpha = alpha / 1.15
      newb = b - alpha * derivJ
      h = 1 / (1 + exp(-x %*% newb))
      newJ = -(t(y) %*% log(h) + t(1-y) %*% log(1 - h))
    }
    if(max(abs(b - newb)) < 0.001) {
      break
    }
    b = newb
    J = newJ
    derivJ = t(x) %*% (h - y)
  }
  b
  v = exp(-x %*% b)
  h = 1 / (1 + v)
  w = h^2 * v
  # hessian matrix of cost function
  hess = t(x) %*% diag(as.vector(w)) %*% x
  seMat = sqrt(diag(solve(hess)))
  zscore = b / seMat
  cbind(b, zscore)
}
nr = 5000
nc = 3
# set.seed(17)
x = matrix(rnorm(nr*nc, 0, 999), nr)
x = apply(x, 2, scale)
# y = matrix(sample(0:1, nr, repl=T), nr)
h = 1/(1 + exp(-x %*% rnorm(nc)))
y = round(h)
y[1:round(nr/2)] = sample(0:1, round(nr/2), repl=T)
testglm = function() {
  for(i in 1:20) {
    res = summary(glm(y ~ x, family = binomial))$coef
  }
  print(res)
}
testlogreg = function() {
  for(i in 1:20) {
    res = logreg(y, x)
  }
  print(res)
}
print(system.time(testlogreg()))
print(system.time(testglm()))
The algorithm gives me correct results, but it's about ten times slower than glm.
print(system.time(testlogreg()))
[,1] [,2]
[1,] -0.0358877 -1.16332
[2,] 0.1904964 6.09873
[3,] -0.1428953 -4.62629
[4,] -0.9151143 -25.33478
user system elapsed
4.013 1.037 5.062
#////////////////////////////////////////////////////
print(system.time(testglm()))
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.0360447 0.0308617 -1.16794 2.42829e-01
x1 0.1912254 0.0312500 6.11922 9.40373e-10
x2 -0.1432585 0.0309001 -4.63618 3.54907e-06
x3 -0.9178177 0.0361598 -25.38226 3.95964e-142
user system elapsed
0.482 0.040 0.522
But if I don't calculate the standard error and z value, then it's a little faster than glm:
#////////////////////////////////////////////////////
print(system.time(testlogreg()))
[,1]
[1,] -0.0396199
[2,] 0.2281502
[3,] -0.3941912
[4,] 0.8456839
user system elapsed
0.404 0.001 0.405
#////////////////////////////////////////////////////
print(system.time(testglm()))
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.0397529 0.0309169 -1.28580 1.98514e-01
x1 0.2289063 0.0312998 7.31336 2.60551e-13
x2 -0.3956140 0.0319847 -12.36884 3.85328e-35
x3 0.8483669 0.0353760 23.98144 4.34358e-127
user system elapsed
0.474 0.000 0.475
So apparently the calculation of se and z-value takes a lot of time, but how does glm do it? How can I improve my implementation?
Finally figured this out: the secret lies in the use of a sparse matrix (see also this blog post).
require(Matrix)
logreg = function(y, x) {
  x = as.matrix(x)
  x = apply(x, 2, scale)
  x = cbind(1, x)
  m = nrow(x)
  n = ncol(x)
  alpha = 2/m
  # b = matrix(rnorm(n))
  # b = matrix(summary(lm(y~x))$coef[, 1])
  b = matrix(rep(0, n))
  v = exp(-x %*% b)
  h = 1 / (1 + v)
  J = -(t(y) %*% log(h) + t(1-y) %*% log(1 - h))
  derivJ = t(x) %*% (h - y)
  derivThresh = 0.0000001
  bThresh = 0.001
  while(1) {
    newb = b - alpha * derivJ
    if(max(abs(b - newb)) < bThresh) {
      break
    }
    v = exp(-x %*% newb)
    h = 1 / (1 + v)
    newderivJ = t(x) %*% (h - y)
    if(max(abs(newderivJ - derivJ)) < derivThresh) {
      break
    }
    newJ = -(t(y) %*% log(h) + t(1-y) %*% log(1 - h))
    if(newJ > J) {
      alpha = alpha/2
    }
    b = newb
    J = newJ
    derivJ = newderivJ
  }
  w = h^2 * v
  # hessian matrix of cost function
  hess = t(x) %*% Diagonal(x = as.vector(w)) %*% x
  seMat = sqrt(diag(solve(hess)))
  zscore = b / seMat
  cbind(b, zscore)
}
nr = 5000
nc = 2
# set.seed(17)
x = matrix(rnorm(nr*nc, 3, 9), nr)
# x = apply(x, 2, scale)
# y = matrix(sample(0:1, nr, repl=T), nr)
h = 1/(1 + exp(-x %*% rnorm(nc)))
y = round(h)
y[1:round(nr/2)] = sample(0:1, round(nr/2), repl=T)
ntests = 13
testglm = function() {
  nr = 5000
  nc = 2
  # set.seed(17)
  x = matrix(rnorm(nr*nc, 3, 9), nr)
  # x = apply(x, 2, scale)
  # y = matrix(sample(0:1, nr, repl=T), nr)
  h = 1/(1 + exp(-x %*% rnorm(nc)))
  y = round(h)
  y[1:round(nr/2)] = sample(0:1, round(nr/2), repl=T)
  for(i in 1:ntests) {
    res = summary(glm(y ~ x, family = binomial))$coef[, c(1, 3)]
  }
  res
}
testlogreg = function() {
  nr = 5000
  nc = 2
  # set.seed(17)
  x = matrix(rnorm(nr*nc, 3, 9), nr)
  # x = apply(x, 2, scale)
  # y = matrix(sample(0:1, nr, repl=T), nr)
  h = 1/(1 + exp(-x %*% rnorm(nc)))
  y = round(h)
  y[1:round(nr/2)] = sample(0:1, round(nr/2), repl=T)
  for(i in 1:ntests) {
    res = logreg(y, x)
  }
  res
}
print(system.time(testlogreg()))
print(system.time(testglm()))
Now my implementation is even a bit faster than the glm in R!
print(system.time(testlogreg()))
[,1] [,2]
[1,] -0.022598 -0.739494
[2,] -0.510799 -15.793676
[3,] -0.130177 -4.257121
[4,] 0.578318 17.712392
[5,] 0.299080 9.587985
[6,] 0.244131 7.888600
user system elapsed
8.954 0.044 9.000
#////////////////////////////////////////////////////
print(system.time(testglm()))
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.0226784 0.0305694 -0.741865 4.58169e-01
x1 -0.5129285 0.0323621 -15.849653 1.41358e-56
x2 -0.1305872 0.0305892 -4.269057 1.96301e-05
x3 0.5806001 0.0326719 17.770648 1.19304e-70
x4 0.3002898 0.0312072 9.622454 6.42789e-22
x5 0.2451543 0.0309601 7.918407 2.40573e-15
user system elapsed
12.218 0.008 12.229
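A note on the design choice, not from the original post: the expensive step in the first version is materialising the full m x m diagonal matrix with diag(as.vector(w)) when building the Hessian. Matrix::Diagonal keeps that matrix sparse, and the diagonal can be avoided altogether by scaling the rows of x before the cross-product:
# equivalent weighted cross-product t(x) %*% W %*% x without ever forming W
hess = crossprod(x, x * as.vector(w))  # each row of x is scaled by its weight w[i]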
