There is a code with three for loops running with data containing enough missing values. The major problem is with the unacceptably long run time which seems to take at least more than a month although I try to keep my PC opened during most of the day.
The structure below is 100% correct from what I am trying to achieve when I test with a very few data points. But as the number of columns and rows become 2781 and 280, respectively, I perceive it takes forever although I am 100% sure that this is running correctly even when I see the updated environment window of my R-Studio each time I refresh it.
My data also has lots of missing values, probably 40% or something. I think this is making the computation time extremely longer as well.
The data dimension is 315 * 2781.
However, I am trying to achieve an output in a 280 * 2781 matrix form.
May I please get help minimizing the run time of this following code?
It would be very appreciated if I can!
options(java.parameters = "- Xmx8000m")
memory.limit(size=8e+6)
data=read.table("C:/Data/input.txt",T,sep="\t");
data=data.frame(data)[,-1]
corr<-NULL
corr2<-NULL
corr3<-NULL
for(i in 1:280)
{
corr2<-NULL
for(j in 1:2781)
{
data2<-data[,-j]
corr<-NULL
for(k in 1:2780)
{
ifelse((is.error(grangertest(data[i:(i+35),j] ~ data2[i:(i+35),k], order = 1, na.action = na.omit)$P[2])==TRUE) || (grangertest(data[i:(i+35),j] ~ data2[i:(i+35),k], order = 1, na.action = na.omit)$P[2])>0.05|| (is.na(grangertest(data[i:(i+35),j] ~ data2[i:(i+35),k], order = 1, na.action = na.omit)$P[2])==TRUE),corr<-cbind(corr,0),corr<-cbind(corr,1))
}
corr2<-rbind(corr2,corr)
}
corr3<-rbind(corr3,rowSums(corr2))
}
The snippet of my data is as below:
> dput(data[1:30, 1:10])
structure(c(0.567388170165941, 0.193093325709924, 0.965938209090382,
0.348295788047835, 0.496113050729036, 0.0645384560339153, 0.946750836912543,
0.642093246569857, 0.565092500532046, 0.0952424583956599, 0.444063827162609,
0.709971546428278, 0.756330407923087, 0.601746253203601, 0.341865634545684,
0.953319212188944, 0.0788604547269642, 0.990508111426607, 0.35519331949763,
0.697004508692771, 0.285368352662772, 0.274287624517456, 0.575733694015071,
0.12937490013428, 0.00476219342090189, 0.684308280004188, 0.189448777819052,
0.615732178557664, 0.404873769031838, 0.357331350911409, 0.565436001634225,
0.380773033713922, 0.348490287549794, 0.0473814208526164, 0.389312234241515,
0.562123290728778, 0.30642102798447, 0.911173274740577, 0.566258994862437,
0.837928073247895, 0.107747194357216, 0.253737836843356, 0.651503744535148,
0.187739939894527, 0.951192815322429, 0.740037888288498, 0.0817571650259197,
0.740519099170342, 0.601534485351294, 0.120900869136676, 0.415282893227413,
0.591146623482928, 0.698511375114322, 0.08557975362055, 0.139396222075447,
0.303953414550051, 0.0743798329494894, 0.0293272000271827, 0.335832208395004,
0.665010208031163, 0.0319741254206747, 0.678886031731963, 0.154593498911709,
0.275712370406836, 0.828485634410754, 0.921500099124387, 0.651940459152684,
0.00574865937232971, 0.82236105017364, 0.55089360428974, 0.209424041677266,
0.861786168068647, 0.672873278381303, 0.301034058211371, 0.180336013436317,
0.481560358777642, 0.901354183442891, 0.986482679378241, 0.90117057505995,
0.476308439625427, 0.638073122361675, 0.27481731469743, 0.689271076582372,
0.324349449947476, 0.56620552809909, 0.867861548438668, 0.78374840435572,
0.0668482843320817, 0.276675389613956, 0.990600393852219, 0.990227151894942,
0.417612489778548, 0.391012848122045, 0.348758921027184, 0.0799746725242585,
0.88941288786009, 0.511429069796577, 0.0338982092216611, 0.240115304477513,
0.0268365524243563, 0.67206134647131, 0.816803207853809, 0.344421110814437,
0.864659120794386, 0.84128700569272, 0.116056860191748, 0.303730394458398,
0.48192183743231, 0.341675494797528, 0.0622653553728014, 0.823110743425786,
0.483212807681412, 0.968748248415068, 0.953057422768325, 0.116025703493506,
0.327919023809955, 0.590675016632304, 0.832283023977652, 0.342327545629814,
0.576901035616174, 0.942689201096073, 0.59300709143281, 0.565881528891623,
0.600007816683501, 0.133237989619374, 0.873827134957537, 0.744597729761153,
0.755133397178724, 0.0245723063126206, 0.97799762734212, 0.636845340020955,
0.73828601022251, 0.644093665992841, 0.57204390084371, 0.496023115236312,
0.703613247489557, 0.149237307952717, 0.0871439634356648, 0.0632112647872418,
0.83703236351721, 0.433215840253979, 0.430483993608505, 0.924051651498303,
0.913056606892496, 0.914889572421089, 0.215407102368772, 0.76880722376518,
0.269207723205909, 0.865548757137731, 0.28798541566357, 0.391722843516618,
0.649806497385725, 0.459413924254477, 0.907465039752424, 0.48731207777746,
0.554472463205457, 0.779784266138449, 0.566323830280453, 0.208658932242543,
0.958056638715789, 0.61858483706601, 0.838681482244283, 0.286310768220574,
0.895410191034898, 0.448722236789763, 0.297688684659079, 0.33291415637359,
0.0115265529602766, 0.850776052568108, 0.764857453294098, 0.469730701530352,
0.222089925780892, 0.0496484278701246, 0.32886885642074, 0.356443469878286,
0.612877089297399, 0.727906176587567, 0.0292073413729668, 0.429160050582141,
0.232313714455813, 0.678631312213838, 0.642334033036605, 0.99107678886503,
0.542449960019439, 0.835914565017447, 0.52798323193565, 0.303808332188055,
0.919654499506578, 0.944237019168213, 0.52141259261407, 0.794379767496139,
0.72268659202382, 0.114752230467275, 0.175116094760597, 0.437696389388293,
0.852590200025588, 0.511136321350932, 0.30879021063447, 0.174206420546398,
0.14262041519396, 0.375411552377045, 0.0204910831525922, 0.852757754037157,
0.631567053496838, 0.475924106314778, 0.508682047016919, 0.307679089019075,
0.70284536993131, 0.851252349093556, 0.0868967010173947, 0.586291917832568,
0.0529140203725547, 0.440692059928551, 0.207642213441432, 0.777513341512531,
0.141496006632224, 0.548626560717821, 0.419565241318196, 0.0702310993801802,
0.499403427587822, 0.189343606121838, 0.370725362794474, 0.888076487928629,
0.83070912421681, 0.466137421084568, 0.177098380634561, 0.91202046489343,
0.142300580162555, 0.823691181838512, 0.41561916610226, 0.939948018174618,
0.806491429451853, 0.795849160756916, 0.566376683535054, 0.36814984655939,
0.307756055146456, 0.602875682059675, 0.506007500691339, 0.538658684119582,
0.420845189364627, 0.663071365095675, 0.958144341595471, 0.793743418296799,
0.983086514985189, 0.266262857476249, 0.817585011478513, 0.122843299992383,
0.989197303075343, 0.71584410732612, 0.500571243464947, 0.397394519997761,
0.659465527161956, 0.459530522814021, 0.602246116613969, 0.250076721422374,
0.17533828667365, 0.6599256307818, 0.184704560553655, 0.15679649473168,
0.513444944983348, 0.205572377191857, 0.430164282443002, 0.131548407254741,
0.914019819349051, 0.935795902274549, 0.857401241315529, 0.977940042736009,
0.41389597626403, 0.179183913161978, 0.431347143370658, 0.477178965462372,
0.121315707685426, 0.107695729471743, 0.634954946814105, 0.859707030234858,
0.855825762730092, 0.708672808250412, 0.674073817208409, 0.672288877889514,
0.622144045541063, 0.433355041313916, 0.952878215815872, 0.229569894727319,
0.289388840552419, 0.937473804224283, 0.116283216979355, 0.659604362910613,
0.240837284363806, 0.726138337515295, 0.68390148691833, 0.381577257299796,
0.899390475358814, 0.26472729514353, 0.0383855854161084, 0.855232689995319,
0.655799814499915, 0.335587574867532, 0.163842789363116, 0.0353666560258716,
0.048316186061129), .Dim = c(30L, 10L))
I converted just the inner loop to mapply and did a quick speed test:
library(lmtest)
data <- matrix(runif(315*2781), nrow = 315)
get01 <- function(x, y) {
try(gt <- grangertest(x ~ y, order = 1, na.action = na.omit)$P[2])
if (exists("gt")) {
if (gt > 0.05 || is.na(gt)) {
return(0)
} else {
return(1)
}
} else {
return(0)
}
}
i <- 1; j <- 1
system.time(corr <- mapply(function(k) {get01(data[i:(i+35),j], data[i:(i+35),k])}, (1:2781)[-j]))
#> user system elapsed
#> 21.505 0.014 21.520
It would need to perform that mapply 778680 times, so that puts it at about 200 days. You'll either need a different approach with the Granger test or several cores. Here's the command to replace the full loop:
corr3 <- t(mapply(function(i) colSums(mapply(function(j) mapply(function(k) {get01(data[i:(i+35),j], data[i:(i+35),k])}, (1:2781)[-j]), 1:2781)), 1:280))
Replace that first mapply with simplify2array(parLapply to parallelize:
library(parallel)
cl <- makeCluster(detectCores())
clusterExport(cl, list("data", "get01"))
parLapply(cl, cl, function(x) require(lmtest))
corr3 <- t(simplify2array(parLapply(cl, 1:280, function(i) colSums(mapply(function(j) mapply(function(k) {get01(data[i:(i+35),j], data[i:(i+35),k])}, (1:2781)[-j]), 1:2781)))))
stopCluster(cl)
Here is a version, not parallelized, that speeds up the code in the question by a factor greater than 4.
Some bottlenecks in the question's code are easy to detect:
The matrices corr? are extended inside the loops. The solution is to reserve memory beforehand;
The test grangertest is called 3 times per inner iteration when only one is needed;
To cbind with 0 or 1 is in fact creating a vector, not a matrix.
Here is a comparative test between the question's code and the function below.
library(lmtest)
# avoids loading an extra package
is.error <- function(x){
inherits(x, c("error", "try-error"))
}
Lag <- 5L
nr <- nrow(data)
nc <- ncol(data)
t0 <- system.time({
corr<-NULL
corr2<-NULL
corr3<-NULL
for(i in 1:(nr - Lag))
{
corr2<-NULL
data3 <- data[i:(i + Lag), ]
for(j in 1:nc)
{
data2<-data[,-j]
corr<-NULL
for(k in 1:(nc - 1L))
{
ifelse((is.error(grangertest(data[i:(i+Lag),j] ~ data2[i:(i+Lag),k], order = 1, na.action = na.omit)$P[2])==TRUE) ||
(grangertest(data[i:(i+Lag),j] ~ data2[i:(i+Lag),k], order = 1, na.action = na.omit)$P[2])>0.05 ||
(is.na(grangertest(data[i:(i+Lag),j] ~ data2[i:(i+Lag),k], order = 1, na.action = na.omit)$P[2])==TRUE),
corr<-cbind(corr,0),
corr<-cbind(corr,1)
)
}
corr2 <- rbind(corr2, corr)
}
corr3<-rbind(corr3, rowSums(corr2))
}
corr3
})
I will use a simplified version of lmtest::grangertest.
granger_test <- function (x, y, order = 1, na.action = na.omit, ...) {
xnam <- deparse(substitute(x))
ynam <- deparse(substitute(y))
n <- length(x)
all <- cbind(x = x[-1], y = y[-1], x_1 = x[-n], y_1 = y[-n])
y <- as.vector(all[, 2])
lagX <- as.matrix(all[, (1:order + 2)])
lagY <- as.matrix(all[, (1:order + 2 + order)])
fm <- lm(y ~ lagY + lagX)
rval <- lmtest::waldtest(fm, 2, ...)
attr(rval, "heading") <- c("Granger causality test\n", paste("Model 1: ",
ynam, " ~ ", "Lags(", ynam, ", 1:", order, ") + Lags(",
xnam, ", 1:", order, ")\nModel 2: ", ynam, " ~ ", "Lags(",
ynam, ", 1:", order, ")", sep = ""))
rval
}
And now the function to run the tests.
f_Rui <- function(data, Lag){
nr <- nrow(data)
nc <- ncol(data)
corr3 <- matrix(0, nrow = nr - Lag, ncol = nc)
data3 <- matrix(0, nrow = Lag + 1L, ncol = nc)
data2 <- matrix(0, nrow = Lag + 1L, ncol = nc - 1L)
for(i in 1:(nr - Lag)) {
corr2 <- matrix(0, nrow = nc, ncol = nc - 1L)
data3[] <- data[i:(i + Lag), ]
for(j in 1:nc) {
corr <- integer(nc - 1L)
data2[] <- data3[, -j]
for(k in 1:(nc - 1L)){
res <- tryCatch(
grangertest(x = data2[, k], y = data3[, j], order = 1, na.action = na.omit),
error = function(e) e
)
if(!inherits(res, "error") && !is.na(res[['Pr(>F)']][2]) && res[['Pr(>F)']][2] <= 0.05) {
corr[k] <- 1L
}
}
corr2[j, ] <- corr
}
corr3[i, ] <- rowSums(corr2)
}
corr3
}
The results are identical and the timings much better.
t1 <- system.time({
res <- f_Rui(data, 5L)
})
identical(corr3, res)
#[1] TRUE
times <- rbind(t0, t1)
t(t(times)/t1)
# user.self sys.self elapsed user.child sys.child
#t0 4.682908 1.736111 4.707783 NaN NaN
#t1 1.000000 1.000000 1.000000 NaN NaN
I am trying to implement NLPCA (Nonlinear PCA) on a data set using the homals package in R but I keep on getting the following error message:
Error in dimnames(x) <- dn : length of 'dimnames' [1] not equal to array extent
The data set I use can be found in the UCI ML Repository and it's called dat when imported in R: https://archive.ics.uci.edu/ml/datasets/South+German+Credit+%28UPDATE%29
Here is my code (some code is provided once the data set is downloaded):
nlpcasouthgerman <- homals(dat, rank=1, level=c('nominal','numerical',rep('nominal',2),
'numerical','nominal',
rep('ordinal',2), rep('nominal',2),
'ordinal','nominal','numerical',
rep('nominal',2), 'ordinal',
'nominal','ordinal',rep('nominal',3)),
active=c(FALSE, rep(TRUE, 20)), ndim=3, verbose=1)
I am trying to predict the first attribute, therefore I set it to be active=FALSE.
The output looks like this (skipped all iteration messages):
Iteration: 1 Loss Value: 0.000047
Iteration: 2 Loss Value: 0.000044
...
Iteration: 37 Loss Value: 0.000043
Iteration: 38 Loss Value: 0.000043
Error in dimnames(x) <- dn :
length of 'dimnames' [1] not equal to array extent
I don't understand why this error comes up. I have used the same code on some other data set and it worked fine so I don't see why this error persists. Any suggestions about what might be going wrong and how I could fix this issue?
Thanks!
It seems the error comes from code generating NAs in the homals function, specifically for your data for the number_credits levels, which causes problems with sort(as.numeric((rownames(clist[[i]])))) and the attempt to catch the error, since one of the levels does not give an NA value.
So either you have to modify the homals function to take care of such an edge case, or change problematic factor levels. This might be something to file as a bug report to the package maintainer.
As a work-around in your case you could do something like:
levels(dat$number_credits)[1] <- "_1"
and the function should run without problems.
Edit:
I think one solution would be to change one line of code in the homals function, but no guarantee this does work as intended. Better submit a bug report to the package author/maintainer - see https://cran.r-project.org/web/packages/homals/ for the address.
Using rnames <- as.numeric(rownames(clist[[i]]))[order(as.numeric(rownames(clist[[i]])))] instead of rnames <- sort(as.numeric((rownames(clist[[i]])))) would allow the following code to identify NAs, but I am not sure why the author did not try to preserve factor levels outright.
Anyway, you could run a modified function in your local environment, which would require to explicitly call internal (not exported) homals functions, as shown below. Not necessarily the best approach, but would help you out in a pinch.
homals <- function (data, ndim = 2, rank = ndim, level = "nominal", sets = 0,
active = TRUE, eps = 0.000001, itermax = 1000, verbose = 0) {
dframe <- data
name <- deparse(substitute(dframe))
nobj <- nrow(dframe)
nvar <- ncol(dframe)
vname <- names(dframe)
rname <- rownames(dframe)
for (j in 1:nvar) {
dframe[, j] <- as.factor(dframe[, j])
levfreq <- table(dframe[, j])
if (any(levfreq == 0)) {
newlev <- levels(dframe[, j])[-which(levfreq == 0)]
}
else {
newlev <- levels(dframe[, j])
}
dframe[, j] <- factor(dframe[, j], levels = sort(newlev))
}
varcheck <- apply(dframe, 2, function(tl) length(table(tl)))
if (any(varcheck == 1))
stop("Variable with only 1 value detected! Can't proceed with estimation!")
active <- homals:::checkPars(active, nvar)
rank <- homals:::checkPars(rank, nvar)
level <- homals:::checkPars(level, nvar)
if (length(sets) == 1)
sets <- lapply(1:nvar, "c")
if (!all(sort(unlist(sets)) == (1:nvar))) {
print(cat("sets union", sort(unlist(sets)), "\n"))
stop("inappropriate set structure !")
}
nset <- length(sets)
mis <- rep(0, nobj)
for (l in 1:nset) {
lset <- sets[[l]]
if (all(!active[lset]))
(next)()
jset <- lset[which(active[lset])]
for (i in 1:nobj) {
if (any(is.na(dframe[i, jset])))
dframe[i, jset] <- NA
else mis[i] <- mis[i] + 1
}
}
for (j in 1:nvar) {
k <- length(levels(dframe[, j]))
if (rank[j] > min(ndim, k - 1))
rank[j] <- min(ndim, k - 1)
}
x <- cbind(homals:::orthogonalPolynomials(mis, 1:nobj, ndim))
x <- homals:::normX(homals:::centerX(x, mis), mis)$q
y <- lapply(1:nvar, function(j) homals:::computeY(dframe[, j], x))
sold <- homals:::totalLoss(dframe, x, y, active, rank, level, sets)
iter <- pops <- 0
repeat {
iter <- iter + 1
y <- homals:::updateY(dframe, x, y, active, rank, level, sets,
verbose = verbose)
smid <- homals:::totalLoss(dframe, x, y, active, rank, level,
sets)/(nobj * nvar * ndim)
ssum <- homals:::totalSum(dframe, x, y, active, rank, level, sets)
qv <- homals:::normX(homals:::centerX((1/mis) * ssum, mis), mis)
z <- qv$q
snew <- homals:::totalLoss(dframe, z, y, active, rank, level,
sets)/(nobj * nvar * ndim)
if (verbose > 0)
cat("Iteration:", formatC(iter, digits = 3, width = 3),
"Loss Value: ", formatC(c(smid), digits = 6,
width = 6, format = "f"), "\n")
r <- abs(qv$r)/2
ops <- sum(r)
aps <- sum(La.svd(crossprod(x, mis * z), 0, 0)$d)/ndim
if (iter == itermax) {
stop("maximum number of iterations reached")
}
if (smid > sold) {
warning(cat("Loss function increases in iteration ",
iter, "\n"))
}
if ((ops - pops) < eps)
break
else {
x <- z
pops <- ops
sold <- smid
}
}
ylist <- alist <- clist <- ulist <- NULL
for (j in 1:nvar) {
gg <- dframe[, j]
c <- homals:::computeY(gg, z)
d <- as.vector(table(gg))
lst <- homals:::restrictY(d, c, rank[j], level[j])
y <- lst$y
a <- lst$a
u <- lst$z
ylist <- c(ylist, list(y))
alist <- c(alist, list(a))
clist <- c(clist, list(c))
ulist <- c(ulist, list(u))
}
dimlab <- paste("D", 1:ndim, sep = "")
for (i in 1:nvar) {
if (ndim == 1) {
ylist[[i]] <- cbind(ylist[[i]])
ulist[[i]] <- cbind(ulist[[i]])
clist[[i]] <- cbind(clist[[i]])
}
options(warn = -1)
# Here is the line that I changed in the code:
# rnames <- sort(as.numeric((rownames(clist[[i]]))))
rnames <- as.numeric(rownames(clist[[i]]))[order(as.numeric(rownames(clist[[i]])))]
options(warn = 0)
if ((any(is.na(rnames))) || (length(rnames) == 0))
rnames <- rownames(clist[[i]])
if (!is.matrix(ulist[[i]]))
ulist[[i]] <- as.matrix(ulist[[i]])
rownames(ylist[[i]]) <- rownames(ulist[[i]]) <- rownames(clist[[i]]) <- rnames
rownames(alist[[i]]) <- paste(1:dim(alist[[i]])[1])
colnames(clist[[i]]) <- colnames(ylist[[i]]) <- colnames(alist[[i]]) <- dimlab
colnames(ulist[[i]]) <- paste(1:dim(as.matrix(ulist[[i]]))[2])
}
names(ylist) <- names(ulist) <- names(clist) <- names(alist) <- colnames(dframe)
rownames(z) <- rownames(dframe)
colnames(z) <- dimlab
dummymat <- as.matrix(homals:::expandFrame(dframe, zero = FALSE, clean = FALSE))
dummymat01 <- dummymat
dummymat[dummymat == 2] <- NA
dummymat[dummymat == 0] <- Inf
scoremat <- array(NA, dim = c(dim(dframe), ndim), dimnames = list(rownames(dframe),
colnames(dframe), paste("dim", 1:ndim, sep = "")))
for (i in 1:ndim) {
catscores.d1 <- do.call(rbind, ylist)[, i]
dummy.scores <- t(t(dummymat) * catscores.d1)
freqlist <- apply(dframe, 2, function(dtab) as.list(table(dtab)))
cat.ind <- sequence(sapply(freqlist, length))
scoremat[, , i] <- t(apply(dummy.scores, 1, function(ds) {
ind.infel <- which(ds == Inf)
ind.minfel <- which(ds == -Inf)
ind.nan <- which(is.nan(ds))
ind.nael <- which((is.na(ds) + (cat.ind != 1)) ==
2)
ds[-c(ind.infel, ind.minfel, ind.nael, ind.nan)]
}))
}
disc.mat <- apply(scoremat, 3, function(xx) {
apply(xx, 2, function(cols) {
(sum(cols^2, na.rm = TRUE))/nobj
})
})
result <- list(datname = name, catscores = ylist, scoremat = scoremat,
objscores = z, cat.centroids = clist, ind.mat = dummymat01,
loadings = alist, low.rank = ulist, discrim = disc.mat,
ndim = ndim, niter = iter, level = level, eigenvalues = r,
loss = smid, rank.vec = rank, active = active, dframe = dframe,
call = match.call())
class(result) <- "homals"
result
}