kmeans results as initial parameters in Mclust - r

I am doing Gaussian mixture models. I have done kmeans on the dataset and I want to use the means, variances and the size for the initial parameters for the em algorithm in R. I found that the parameters is a list of 3 and I tried to do the same thing but it gives me the following error :
Error in array(x, c(length(x), 1L), if (!is.null(names(x))) list(names(x), :
'data' must be of a vector type, was 'NULL'
My code
l <- kmeans(iris[,-5],centers=3)
pi <- l$size/length(iris[,1])
my <- t(l$centers)
sig <- vector("list", 3)
new <- as.data.frame(cbind(iris[,-5],l$cluster))
for (i in 1:3) {
subdata<-subset(new[,1:4],new[,5]==i);
sig[[i]]<-cov(subdata)
}
par <- vector("list",3)
par[[1]] <- pi; par[[2]] <- my; par[[3]] <- sig
kk <- em(modelName = msEst$modelName, data = iris[,-5],parameters = par)
Can someone please tell how should I assign the kmeans results as initial parameters?

Following is a quick example of what you seem to be after. The main thing you have to do is the get the parameters argument in the correct form. The tickly bit is with the variance list. There is a bit of help with this if you use the mclustVariance function.
library(mclust)
g <- 3
dat <- iris[, -5]
p <- ncol(dat)
n <- nrow(dat)
k_fit <- kmeans(dat, centers=g)
par <- vector("list", g)
par$pro <- k_fit$size/n
par$mean <- t(k_fit$centers)
sigma <- array(NA, c(p, p, g))
new <- as.data.frame(cbind(dat, k_fit$cluster))
for (i in 1 : g) {
subdata <- subset(new[, 1 : p], new[, (p+1)]==i)
sigma[,, i] <- cov(subdata)
}
variance <- mclustVariance("EEE", d = p, G = g)
par$variance <- variance
par$variance$sigma <- sigma
kk <- em(modelName = "EEE", data = dat, parameters = par)

Related

Cannot make sense of the error while using OptimParallel in R

I'm trying to run the following function mentioned below using OptimParallel in R on a certain data set. The code is as follows:
install.packages("optimParallel")
install.packages('parallel')
library(parallel)
library(optimParallel)
library(doParallel)
library(data.table)
library(Rlab)
library(HDInterval)
library(mvtnorm)
library(matrixStats)
library(dplyr)
library(cold)
## Bolus data:
data("bolus")
d1 <- bolus
d1$group <- ifelse(d1$group == "2mg",1,0)
colnames(d1) <- c("index",'group',"time","y")
d2 <- d1 %>% select(index, y, group, time)
colnames(d2) <- c('index','y','x1','x2') ### Final data
## Modification of the objective function:
## Another approach:
dpd_poi <- function(x,fixed = c(rep(FALSE,5))){
params <- fixed
dpd_1 <- function(p){
params[!fixed] <- p
alpha <- params[1]
beta_0 <- params[2]
beta_1 <- params[3]
beta_2 <- params[4]
rho <- params[5]
add_pi <- function(d){
k <- beta_0+(d[3]*beta_1)+(d[4]*beta_2)
k1 <- exp(k) ## for Poisson regression
d <- cbind(d,k1)
}
dat_split <- split(x , f = x$index)
result <- lapply(dat_split, add_pi)
result <- rbindlist(result)
result <- as.data.frame(result)
colnames(result) <- c('index','y','x1','x2','lamb')
result_split <- split(result, f = result$index)
expression <- function(d){
bin <- as.data.frame(combn(d$y , 2))
pr <- as.data.frame(combn(d$lamb , 2))
## Evaluation of the probabilities:
f_jk <- function(u,v){
dummy_func <- function(x,y){
ppois(x, lambda = y)
}
dummy_func_1 <- function(x,y){
ppois(x-1, lambda = y)
}
k <- mapply(dummy_func,u,v)
k_1 <- mapply(dummy_func_1,u,v)
inv1 <- inverseCDF(as.matrix(k), pnorm)
inv2 <- inverseCDF(as.matrix(k_1), pnorm)
mean <- rep(0,2)
lower <- inv2
upper <- inv1
corr <- diag(2)
corr[lower.tri(corr)] <- rho
corr[upper.tri(corr)] <- rho
prob <- pmvnorm(lower = lower, upper = upper, mean = mean, corr = corr)
prob <- (1+(1/alpha))*(prob^alpha)
## First expression: (changes for Poisson regression)
lam <- as.vector(t(v))
v1 <- rpois(1000, lambda = lam[1])
v2 <- rpois(1000, lambda = lam[2])
all_possib <- as.data.frame(rbind(v1,v2))
new_func <- function(u){
k <- mapply(dummy_func,u,v)
k_1 <- mapply(dummy_func_1,u,v)
inv1_1 <- inverseCDF(as.matrix(k), pnorm)
inv2_1 <- inverseCDF(as.matrix(k_1), pnorm)
mean1 <- rep(0,2)
lower1 <- inv2_1
upper1 <- inv1_1
corr1 <- diag(2)
corr1[lower.tri(corr1)] <- rho
corr1[upper.tri(corr1)] <- rho
prob1 <- pmvnorm(lower = lower1, upper = upper1, mean = mean1, corr = corr1)
prob1 <- prob1^(alpha)
}
val <- apply(all_possib, 2, new_func)
val_s <- mean(val) ## approximation
return(val_s - prob)
}
final_res <- mapply(f_jk, bin, pr)
final_value <- sum(final_res)
}
u <- sapply(result_split,expression)
return(sum(u))
}
}
## run the objective function:
cl <- makeCluster(25)
setDefaultCluster(cl=cl)
clusterExport(cl,c('d2','val'))
clusterEvalQ(cl,c(library(data.table), library(Rlab),library(HDInterval),library(mvtnorm),library(matrixStats),library(dplyr),library(cold)))
val <- dpd_poi(d2, c(0.5,FALSE,FALSE,FALSE,FALSE))
optimParallel(par = c(beta_0 =1, beta_1 =0.1 ,beta_2 = 1,rho=0.2),fn = val ,method = "L-BFGS-B",lower = c(-10,-10,-10,0),upper = c(Inf,Inf,Inf,1))
stopCluster(cl)
After running for some time, it returns the following error:
checkForRemoteErrors(val)
9 nodes produced errors; first error: missing value where TRUE/FALSE needed
However, when I make a minor change in the objective function (pick 2 random numbers from rpois instead of 1000) and run the same code using optim, it converges and gives me a proper result. This is a Monte Carlo simulation and it does not make sense to draw so few Poisson variables. I have to use optimParllel, otherwise, it takes way too long to converge. I could also run this code using simulated data.
I'm unable to figure out where the issue truly lies. I truly appreciate any help in this regard.

Error in nrow(X) : object 'X' not found , but it gets defined

I am trying to implement logistic regression and the function works manually, but for some reason I get the error "Error in nrow(X) : object 'X' not found ", even though X is defined before the nrow command. I use the UCI Data "Adult" to test it.
If i try to run the function manually there is no error. Can anyone explain that?
# Sigmoidfunction
sigmoid <- function(z){
g <- 1/(1+exp(-z))
return(g)
}
# Costfunction
cost <- function(theta){
n <- nrow(X)
g <- sigmoid(X %*% theta)
J <- (1/n)*sum((-Y*log(g)) - ((1-Y)*log(1-g)))
return(J)
}
log_reg <- function(datafr, m){
# Train- und Testdaten Split
sample <- sample(1:nrow(datafr), m)
df_train <- datafr[sample,]
df_test <- datafr[-sample,]
num_features <- ncol(datafr) - 1
num_label <- ncol(datafr)
label_levels <- levels(datafr[, num_label])
datafr[, num_features+1] <- ifelse(datafr[, num_label] == names(table(datafr[,num_label]))[1], 0, 1)
# Predictor variables
X <- as.matrix(df_train[, 1:num_features])
X_test <- as.matrix(df_test[, 1:num_features])
# Add ones to X
X <- cbind(rep(1, nrow(X)), X)
X_test <- cbind(rep(1, nrow(X_test)), X_test)
# Response variable
Y <- as.matrix(df_train[, num_label] )
Y <- ifelse(Y == names(table(Y))[1], 0, 1)
Y_test <- as.matrix(df_test[, num_label] )
Y_test <- ifelse(Y_test == names(table(Y_test))[1], 0, 1)
# Intial theta
initial_theta <- rep(0, ncol(X))
# Derive theta using gradient descent using optim function
theta_optim <- optim(par=initial_theta, fn=cost)
predictions <- ifelse(sigmoid(X_test%*%theta_optim$par)>=0.5, 1, 0)
# Generalization error
error_rate <- sum(predictions!=Y_test)/length(Y_test)
return(error_rate)
}
### Adult Data
data <- read.table('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
sep = ',', fill = F, strip.white = T)
colnames(data) <- c('age', 'workclass', 'fnlwgt', 'education',
'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'sex',
'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income')
# Featureselection
datafr <- data[, c("age", "education_num", "hours_per_week", "income")]
log_reg(datafr = datafr, m = 20)
You are calling cost() in which you refer to X, but X has not been defined in cost(). Either define it within log_reg() after you have defined X, or, better, make X a parameter for cost().
cost <- function(theta, X, Y){
n <- nrow(X)
g <- sigmoid(X %*% theta)
J <- (1/n)*sum((-Y*log(g)) - ((1-Y)*log(1-g)))
return(J)
}
And later
theta_optim <- optim(par=initial_theta, fn=cost, X=X, Y=Y)
In general, try to avoid having variables used in a function which are not defined explicitly as arguments to that function. Otherwise you will always end up with problems like this one.
Also, how did I find it out? I used traceback():
> traceback()
5: nrow(X) at #2
4: fn(par, ...)
3: (function (par)
fn(par, ...))(c(0, 0, 0, 0))
2: optim(par = initial_theta, fn = cost) at #33
1: log_reg(datafr = datafr, m = 20)

Multi-data likelihood function and mle2 function from bbmle package in R

I have written a custom likelihood function that fits a multi-data model that integrates mark-recapture and telemetry data (sensu Royle et al. 2013 Methods in Ecology and Evolution). The likelihood function is designed to be flexible in terms of whether and how many covariates are specified for different linear models in different likelihood components which is determined by values supplied as function arguments (i.e., data matrices "detcovs" and "dencovs" in my code). The likelihood function works when I directly supply it to optimization functions (e.g., optim or nlm), but does not play nice with the mle2 function in the bbmle package. My problem is that I continually run into the following error: "some named arguments in 'start' are not arguments to the specified log-likelihood function". This is my first attempt at writing custom likelihood functions so I'm sure there are general coding conventions of which I'm unaware that make such tasks much more efficient and amendable to the mle2 function. Below is my likelihood function, code creating the staring value objects, and code calling the mle2 function. Any advice how to solve the error problem and general comments on writing cleaner functions is welcome. Many thanks in advance.
Edit: As requested, I have simplified the likelihood function and provided code to simulate reproducible data to which the model can be fit. Included in the simulation code are 2 custom functions and use of the raster function from the raster package. Hopefully, I have sufficiently simplified everything to enable others to troubleshoot. Again, many thanks for your help!
Jared
Likelihood function:
CSCR.RSF.intlik2.EXAMPLE <- function(alpha0,sigma,alphas=NULL,betas=NULL,n0,yscr=NULL,K=NULL,X=X,trapcovs=NULL,Gden=NULL,Gdet=NULL,ytel=NULL,stel=NULL,
dencovs=NULL,detcovs=NULL){
#
# this version of the code handles a covariate on log(Density). This is starting value 5
#
# start = vector of starting values
# yscr = nind x ntraps encounter matrix
# K = number of occasions
# X = trap locations
# Gden = matrix with grid cell coordinates for density raster
# Gdet = matrix with gride cell coordinates for RSF raster
# dencovs = all covariate values for all nGden pixels in density raster
# trapcovs = covariate value at trap locations
# detcovs = all covariate values for all nGrsf pixels in RSF raster
# ytel = nguys x nGdet matrix of telemetry fixes in each nGdet pixels
# stel = home range center of telemetered individuals, IF you wish to estimate it. Not necessary
# alphas = starting values for RSF/detfn coefficients excluding sigma and intercept
# alpha0 = starting values for RSF/detfn intercept
# sigma = starting value for RSF/detfn sigma
# betas = starting values for density function coefficients
# n0 = starting value for number of undetected individuals on log scale
#
n0 = exp(n0)
nGden = nrow(Gden)
D = e2dist(X,Gden)
nGdet <- nrow(Gdet)
alphas = alphas
loglam = alpha0 -(1/(2*sigma*sigma))*D*D + as.vector(trapcovs%*%alphas) # ztrap recycled over nG
psi = exp(as.vector(dencovs%*%betas))
psi = psi/sum(psi)
probcap = 1-exp(-exp(loglam))
#probcap = (exp(theta0)/(1+exp(theta0)))*exp(-theta1*D*D)
Pm = matrix(NA,nrow=nrow(probcap),ncol=ncol(probcap))
ymat = yscr
ymat = rbind(yscr,rep(0,ncol(yscr)))
lik.marg = rep(NA,nrow(ymat))
for(i in 1:nrow(ymat)){
Pm[1:length(Pm)] = (dbinom(rep(ymat[i,],nGden),rep(K,nGden),probcap[1:length(Pm)],log=TRUE))
lik.cond = exp(colSums(Pm))
lik.marg[i] = sum( lik.cond*psi )
}
nv = c(rep(1,length(lik.marg)-1),n0)
part1 = lgamma(nrow(yscr)+n0+1) - lgamma(n0+1)
part2 = sum(nv*log(lik.marg))
out = -1*(part1+ part2)
lam = t(exp(a0 - (1/(2*sigma*sigma))*t(D2)+ as.vector(detcovs%*%alphas)))# recycle zall over all ytel guys
# lam is now nGdet x nG!
denom = rowSums(lam)
probs = lam/denom # each column is the probs for a guy at column [j]
tel.loglik = -1*sum( ytel*log(probs) )
out = out + tel.loglik
out
}
Data simulation code:
library(raster)
library(bbmle)
e2dist <- function (x, y){
i <- sort(rep(1:nrow(y), nrow(x)))
dvec <- sqrt((x[, 1] - y[i, 1])^2 + (x[, 2] - y[i, 2])^2)
matrix(dvec, nrow = nrow(x), ncol = nrow(y), byrow = F)
}
spcov <- function(R) {
v <- sqrt(nrow(R))
D <- as.matrix(dist(R))
V <- exp(-D/2)
cov1 <- t(chol(V)) %*% rnorm(nrow(R))
Rd <- as.data.frame(R)
colnames(Rd) <- c("x", "y")
Rd$C <- as.numeric((cov1 - mean(cov1)) / sd(cov1))
return(Rd)
}
set.seed(1234)
co <- seq(0.3, 0.7, length=5)
X <- cbind(rep(co, each=5),
rep(co, times=5))
B <- 10
co <- seq(0, 1, length=B)
Z <- cbind(rep(co, each=B), rep(co, times=B))
dencovs <- cbind(spcov(Z),spcov(Z)[,3]) # ordered as reading raster image from left to right, bottom to top
dimnames(dencovs)[[2]][3:4] <- c("dencov1","dencov2")
denr.list <- vector("list",2)
for(i in 1:2){
denr.list[[i]] <- raster(
list(x=seq(0,1,length=10),
y=seq(0,1,length=10),
z=t(matrix(dencovs[,i+2],10,10,byrow=TRUE)))
)
}
B <- 20
co <- seq(0, 1, length=B)
Z <- cbind(rep(co, each=B), rep(co, times=B))
detcovs <- cbind(spcov(Z),spcov(Z)[,3]) # ordered as reading raster image from left to right, bottom to top
dimnames(detcovs)[[2]][3:4] <- c("detcov1","detcov2")
detcov.raster.list <- vector("list",2)
trapcovs <- matrix(0,J,2)
for(i in 1:2){
detr.list[[i]] <- raster(
list(x=seq(0,1,length=20),
y=seq(0,1,length=20),
z=t(matrix(detcovs[,i+2],20,20,byrow=TRUE)))
)
trapcovs[,i] <- extract(detr.list[[i]],X)
}
alpha0 <- -3
sigma <- 0.15
alphas <- c(1,-1)
beta0 <- 3
betas <- c(-1,1)
pixelArea <- (dencovs$y[2] - dencovs$y[1])^2
mu <- exp(beta0 + as.matrix(dencovs[,3:4])%*%betas)*pixelArea
EN <- sum(mu)
N <- rpois(1, EN)
pi <- mu/sum(mu)
s <- dencovs[sample(1:nrow(dencovs), size=N, replace=TRUE, prob=pi),1:2]
J <- nrow(X)
K <- 10
yc <- d <- p <- matrix(NA, N, J)
D <- e2dist(s,X)
loglam <- t(alpha0 - t((1/(2*sigma*sigma))*D*D) + as.vector(trapcovs%*%alphas))
p <- 1-exp(-exp(loglam))
for(i in 1:N) {
for(j in 1:J) {
yc[i,j] <- rbinom(1, K, p[i,j])
}
}
detected <- apply(yc>0, 1, any)
yscr <- yc[detected,]
ntel <- 5
nfixes <- 100
poss.tel <- which(s[,1]>0.2 & s[,1]<0.8 & s[,2]>0.2 & s[,2]<0.8)
stel.id <- sample(poss.tel,ntel)
stel <- s[stel.id,]
ytel <- matrix(NA,ntel,nrow(detcovs))
d <- e2dist(stel,detcovs[,1:2])
lam <- t(exp(1 - t((1/(2*sigma*sigma))*d*d) + as.vector(as.matrix(detcovs[,3:4])%*%alphas)))
for(i in 1:ntel){
ytel[i,] <- rmultinom(1,nfixes,lam[i,]/sum(lam[i,]))
}
Specify starting values and call mle2 function:
start1 <- list(alpha0=alpha0,sigma=sigma,alphas=alphas,betas=betas,n0=log(N-nrow(yscr)))
parnames(CSCR.RSF.intlik2.EXAMPLE) <- names(start)
out1 <- mle2(CSCR.RSF.intlik2.EXAMPLE,start=start1,method="SANN",optimizer="optim",
data=list(yscr=yscr,K=K,X=X,trapcovs=trapcovs,Gden=dencovs[,1:2],Gdet=detcovs[,1:2],
ytel=ytel,stel=stel,dencovs=as.matrix(dencovs[,3:4]),detcovs=as.matrix(detcovs[,3:4]))
)

Asymmetric eta square matrix in R

I have a function that calculates the eta square coefficient for the correlation between qualitative and quantitative variable:
eta <- function(x,y){
m <- mean(x,na.rm = TRUE)
sct <- sum((x-m)^2,na.rm = TRUE)
n <- table(y)
mk <- tapply(x,y,mean)
sce <- sum(n*(mk-m)^2)
return(ifelse(sct>0,sce/sct,0))
}
Imagine I have the following variables (just an example), with missing values in it:
a <- factor(c("M","NA","F","F","NA","M","F"))
b <- factor(c("y","y","y","n","n","n","y")
d <- c(2,5,4,8,9,6,4)
e <- c(5,7,8,5,6,9,7)
I need to program a function that returns a matrix with the eta coefficient for each combination of qualitative and quantitative variable x and y.
Something like this maybe (it doesnt work):
matrix<-function(x,y){
col.y<-ncol(y)
row.x<-nrow(x)
M<-matrix(ncol=col.y,nrow=row.x,dimnames = list(names(x), names(y)))
for(i in 1:col.y){
for(j in 1:row.x){
M[i,j]<-rap.corr(y[,i],x[,j])
}
}
return(M)
}
How can I do this? Thank you in advance..
You can do this using mapply:
qual = list(a=a, b=b)
quant = list(e=e,d=d)
m = matrix(mapply(eta, rep(quant, length(qual)), rep(qual, each=length(quant))), ncol=length(qual))
rownames(m) = names(quant)
colnames(m) = names(qual)
#> m
# a b
#e 0.01950355 0.0008865248
#d 0.25333333 0.7363333333

DEoptim parallel options

Hi I am trying to understand how to get DEoptim to work using parallel processing, but am struggling to get the correct parameters to be put into the function to get it to work...below is a reproducible example (it has a financial context) but it is designed for creating a random portfolio of 7 assets to optimise for ES. It was inspired by this http://mpra.ub.uni-muenchen.de/28187/1/RJwrapper.pdf and also http://files.meetup.com/1772780/20120201_Ulrich_Parallel_DEoptim.pdf
the second of which does include a parallel option but want to use the unix forking rather than the SOCK clusters.
require(quantmod)
require(PerformanceAnalytics)
require(DEoptim)
tickers <- c("^GSPC","^IXIC","^TNX", "DIA","USO","GLD","SLV","UNG","^VIX","F","^FTSE","GS","MS","MSFT","MCD","COKE","AAPL","GOOG","T","C","BHP","RIO","CMG")
getSymbols(tickers)
tickers <- gsub("\\^","",tickers)
x <- lapply(tickers, function(x){ClCl(get(x))})
comb <- na.omit(do.call(merge,x))
colnames(comb) <- paste0(tickers,".cc")
obj <- function(w) {
if (sum(w) == 0) { w <- w + 1e-2 }
w <- w / sum(w)
CVaR <- ES(weights = w,
method = "gaussian",
portfolio_method = "component",
mu = mu,
sigma = sigma)
tmp1 <- CVaR$ES
tmp2 <- max(CVaR$pct_contrib_ES - 0.225, 0)
out <- tmp1 + 1e3 * tmp2
}
comb1 <- comb[,sample(1:ncol(comb),7)]
no.of.assets <- ncol(comb1)
mu <- colMeans(comb1)
sigma <- cov(comb1)
## The non-parallel version
output <- DEoptim(fn = obj,lower = rep(0, no.of.assets), upper = rep(1, no.of.assets))
## The parallel version that doesn't seem to work...
output <- DEoptim(fn = obj,lower = rep(0, no.of.assets), upper = rep(1, no.of.assets), DEoptim.control(itermax=10000, trace=250, parallelType="parallel", packages=c("PerformanceAnalytics"), parVar=c("mu","sigma")))
I get the following error message
Error in missing(packages) : 'missing' can only be used for arguments

Resources