I would like to plot heatmap(s) below quantmod::chart_Series(). How can I add the heatmap below to chart_Series (or xts::plot.xts)?
library(quantmod)
# Get data for symbol from Google Finance
symbol <- "SPY"
src <- "google"
from <- "2017-01-01"
symbolData <- getSymbols(symbol, src=src, from=from, auto.assign=FALSE)
# Calculate simple returns
symbolData.ret <- ROC(Cl(symbolData), type="discrete")
# Calculate lagged autocorrelations (Pearson correlation for each value of lag)
nLags <- 100
averageLength <- 3
symbolData.laggedAutocorr <- matrix(0, nLags, NROW(symbolData.ret))
for (lag in 2:nLags) {
# Set the average length as M
if (averageLength == 0) M <- lag
else M <- averageLength
symbolData.laggedAutocorr[lag, ] <- runCor(symbolData.ret, lag(symbolData.ret, lag), M)
}
symbolData.laggedAutocorr[is.na(symbolData.laggedAutocorr)] <- 0
symbolData.laggedAutocorr.xts <- reclass(t(symbolData.laggedAutocorr), symbolData)
subset <- "2017"
chart_Series(symbolData, name=symbol, subset=subset)
# Use transposed symbolData.laggedAutocorr for plot so you have data aligned to symbolData
# How to add the below heatmap to chart_Series?
heatmap(symbolData.laggedAutocorr.xts, Rowv = NA, Colv = NA, na.rm = TRUE, labCol = "")
add_Heatmap <- function(heatmapdata, ...) {
lenv <- new.env()
lenv$plot_ta <- function(x, heatmapdata, ...) {
# fill in body of low level plot calls here
# use a switch based on type of TA to draw: bands, bars, lines, dots...
xsubset <- x$Env$xsubset
#heatmapdata <- heatmapdata[subset] # TODO: Something is wrong if I have a subset here
heatmap(heatmapdata, Rowv=NA, Colv=NA, na.rm=TRUE, labCol="")
#image(1:NROW(heatmapdata), 1:NCOL(heatmapdata), coredata(heatmapdata), axes=FALSE)
}
mapply(function(name, value) {assign(name,value,envir=lenv)},
names(list(heatmapdata=heatmapdata,...)),
list(heatmapdata=heatmapdata,...))
exp <- parse(text=gsub("list","plot_ta",
as.expression(substitute(list(x=current.chob(),
heatmapdata=heatmapdata,
...)))), srcfile=NULL)
chob <- current.chob()
chob$add_frame(ylim=c(0, 0.3), asp=0.3) # need to have a value set for ylim
chob$next_frame()
chob$replot(exp,env=c(lenv, chob$Env),expr=TRUE)
chob
}
chart_Series(symbolData)
add_Heatmap(symbolData.laggedAutocorr.xts)
The above almost works... The issue is that the heatmap or image is plotted over the main part of chart_Series instead of below it. What should I do to make it plot correctly?
I hope this is useful for other people, since I managed to get this working (to a certain level). There are still issues; please see the comments at the end of the code below and suggest what to do to remove them.
add_Heatmap <- function(heatmapcol, ..., yvalues=1:NCOL(heatmapcol)) {
lenv <- new.env()
lenv$plot_ta <- function(x, heatmapcol, ...) {
xdata <- x$Env$xdata # internal main series
xsubset <- x$Env$xsubset
heatmapcol <- heatmapcol[xsubset]
x.pos <- 1:NROW(heatmapcol)
segments(axTicksByTime(xdata[xsubset], ticks.on=x$Env$ticks.on),
0,
axTicksByTime(xdata[xsubset], ticks.on=x$Env$ticks.on),
NCOL(heatmapcol), col=x$Env$theme$grid)
# TODO: What is faster, polygon or rect? (https://stackoverflow.com/questions/15627674/efficiency-of-drawing-rectangles-on-image-matrix-in-r)
# TODO: What is faster, for or lapply?
# for (i in 1:NCOL(heatmapcol)) {
# rect(x.pos - 1/2, i - 1/2, x.pos + 1/2, i + 1/2 + 1, col=heatmapcol[x.pos, i], border=NA, ...) # base graphics call
# }
lapply(1:NCOL(heatmapcol), function(i) rect(x.pos - 1/2, i - 1/2, x.pos + 1/2, i + 1/2 + 1, col=heatmapcol[x.pos, i], border=NA, ...))
}
mapply(function(name, value) {assign(name,value,envir=lenv)},
names(list(heatmapcol=heatmapcol, ...)),
list(heatmapcol=heatmapcol, ...))
exp <- parse(text=gsub("list", "plot_ta",
as.expression(substitute(list(x=current.chob(),
heatmapcol=heatmapcol,
...)))), srcfile=NULL)
chob <- current.chob()
# chob$add_frame(ylim=c(0, 1),asp=0.15) # add the header frame
# chob$next_frame() # move to header frame
chob$add_frame(ylim=c(1, NCOL(heatmapcol)), asp=1) # need to have a value set for ylim
chob$next_frame()
if (length(yvalues) != NCOL(heatmapcol)) {
# We have a case where min and max are specified
yvalues <- (range(yvalues)[1]):(range(yvalues)[2])
}
# add grid lines
lenv$grid_lines_val <- function(xdata, x) {
ret <- pretty(yvalues)
if (ret[1] != min(yvalues)) {
if (ret[1] <= min(yvalues)) {
ret[1] <- min(yvalues)
} else {
ret <- c(min(yvalues), ret)
}
}
if (ret[length(ret)] != max(yvalues)) {
if (ret[length(ret)] >= max(yvalues)) {
ret[length(ret)] <- max(yvalues)
} else {
ret <- c(ret, max(yvalues))
}
}
return(ret)
}
lenv$grid_lines_pos <- function(xdata, x) {
ret <- lenv$grid_lines_val(xdata, x)
ret <- ret - min(yvalues)
return(ret)
}
exp <- c(exp,
# Add axis labels/boxes
expression(text(1 - 1/3 - max(strwidth(grid_lines_val(xdata, xsubset))), grid_lines_pos(xdata, xsubset),
noquote(format(grid_lines_val(xdata, xsubset), justify="right")),
col=theme$labels, offset=0, pos=4, cex=0.9)),
expression(text(NROW(xdata[xsubset]) + 1/3, grid_lines_pos(xdata, xsubset),
noquote(format(grid_lines_val(xdata, xsubset), justify="right")),
col=theme$labels, offset=0, pos=4, cex=0.9)))
chob$replot(exp, env=c(lenv, chob$Env), expr=TRUE)
chob
}
colorsForHeatmap <- function(heatmapdata) {
heatmapdata <- 0.5*(heatmapdata + 1)
r <- coredata((heatmapdata > 0.5)*round(255*(2 - 2*heatmapdata)) + (heatmapdata <= 0.5)*255)
g <- coredata((heatmapdata > 0.5)*255 + (heatmapdata <= 0.5)*round(255*2*heatmapdata))
b <- coredata(heatmapdata*0.0) # Set to 0 for all
col <- rgb(r, g, b, maxColorValue=255)
dim(col) <- dim(r)
col <- reclass(col, heatmapdata)
return(col)
}
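# Quick illustrative check of this mapping (a sketch; assumes xts is loaded
# via quantmod): values -1, 0, 1 should map to red, yellow, green.
demo <- xts(matrix(c(-1, 0, 1), nrow = 1), Sys.Date())
colorsForHeatmap(demo) # should give "#FF0000" "#FFFF00" "#00FF00"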
library(quantmod)
# Get data for symbol from Google Finance
symbol <- "SPY"
src <- "google"
from <- "1990-01-01"
symbolData <- getSymbols(symbol, src=src, from=from, auto.assign=FALSE)
# Calculate simple returns
symbolData.ret <- ROC(Cl(symbolData), type="discrete")
# Calculate lagged autocorrelations (Pearson correlation for each value of lag)
nLags <- 48
averageLength <- 3
symbolData.laggedAutocorr <- matrix(0, NROW(symbolData.ret), nLags)
for (lag in 2:nLags) {
# Set the average length as M
if (averageLength == 0) M <- lag
else M <- averageLength
symbolData.laggedAutocorr[, lag] <- runCor(symbolData.ret, lag(symbolData.ret, lag), M)
}
symbolData.laggedAutocorr[is.na(symbolData.laggedAutocorr)] <- 0
symbolData.laggedAutocorr.xts <- xts(symbolData.laggedAutocorr, index(symbolData))
heatmapColData <- colorsForHeatmap(symbolData.laggedAutocorr.xts)
symbolData.rsi2 <- RSI(Cl(symbolData), n=2)
subset <- "2011/"
chart_Series(symbolData, name=symbol, subset=subset)
add_Heatmap(heatmapColData, yvalues=2:nLags)
# TODO: There are still issues:
# - add a horizontal line
five <- symbolData[, 1]
five[, 1] <- 5
add_TA(five, col="violet", on=3)
#> add_TA(five, col="violet", on=3)
#Error in ranges[[frame]] : subscript out of bounds
# - add RSI for example and heatmap disappears
add_RSI()
# - or add TA
add_TA(symbolData.rsi2)
# What should I do so it works as intended, i.e. so that I can add lines on top of the heatmap and add other TAs in new panes?
I am trying to implement NLPCA (nonlinear PCA) on a data set using the homals package in R, but I keep getting the following error message:
Error in dimnames(x) <- dn : length of 'dimnames' [1] not equal to array extent
The data set I use can be found in the UCI ML Repository (it's called dat when imported in R): https://archive.ics.uci.edu/ml/datasets/South+German+Credit+%28UPDATE%29
Here is my code (some code is provided once the data set is downloaded):
nlpcasouthgerman <- homals(dat, rank=1, level=c('nominal','numerical',rep('nominal',2),
'numerical','nominal',
rep('ordinal',2), rep('nominal',2),
'ordinal','nominal','numerical',
rep('nominal',2), 'ordinal',
'nominal','ordinal',rep('nominal',3)),
active=c(FALSE, rep(TRUE, 20)), ndim=3, verbose=1)
I am trying to predict the first attribute, therefore I set it to be active=FALSE.
The output looks like this (skipped all iteration messages):
Iteration: 1 Loss Value: 0.000047
Iteration: 2 Loss Value: 0.000044
...
Iteration: 37 Loss Value: 0.000043
Iteration: 38 Loss Value: 0.000043
Error in dimnames(x) <- dn :
length of 'dimnames' [1] not equal to array extent
I don't understand why this error comes up. I have used the same code on another data set and it worked fine, so I don't see why it fails here. Any suggestions about what might be going wrong and how I could fix this issue?
Thanks!
It seems the error comes from code in the homals function that generates NAs, specifically for the number_credits levels in your data. These cause problems with sort(as.numeric((rownames(clist[[i]])))) and with the attempt to catch the error, since one of the levels does not give an NA value.
So either you have to modify the homals function to take care of such an edge case, or change the problematic factor levels. This might be something to file as a bug report with the package maintainer.
As a work-around in your case you could do something like:
levels(dat$number_credits)[1] <- "_1"
and the function should run without problems.
Edit:
I think one solution would be to change one line of code in the homals function, though there is no guarantee it works as intended. Better to submit a bug report to the package author/maintainer - see https://cran.r-project.org/web/packages/homals/ for the address.
Using rnames <- as.numeric(rownames(clist[[i]]))[order(as.numeric(rownames(clist[[i]])))] instead of rnames <- sort(as.numeric((rownames(clist[[i]])))) would allow the following code to identify NAs, but I am not sure why the author did not try to preserve factor levels outright.
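The difference is easy to see on a toy example: sort() silently drops NAs, while indexing by order() keeps them, which is what the subsequent is.na() check relies on.
x <- c("2", "_1", "10") # "_1" does not parse as a number
sort(as.numeric(x)) # 2 10 -- NA silently dropped (with a coercion warning)
as.numeric(x)[order(as.numeric(x))] # 2 10 NA -- NA kept, so it can be detected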
Anyway, you could run a modified function in your local environment, which requires explicitly calling internal (not exported) homals functions, as shown below. Not necessarily the best approach, but it would help you out in a pinch.
homals <- function (data, ndim = 2, rank = ndim, level = "nominal", sets = 0,
active = TRUE, eps = 0.000001, itermax = 1000, verbose = 0) {
dframe <- data
name <- deparse(substitute(dframe))
nobj <- nrow(dframe)
nvar <- ncol(dframe)
vname <- names(dframe)
rname <- rownames(dframe)
for (j in 1:nvar) {
dframe[, j] <- as.factor(dframe[, j])
levfreq <- table(dframe[, j])
if (any(levfreq == 0)) {
newlev <- levels(dframe[, j])[-which(levfreq == 0)]
}
else {
newlev <- levels(dframe[, j])
}
dframe[, j] <- factor(dframe[, j], levels = sort(newlev))
}
varcheck <- apply(dframe, 2, function(tl) length(table(tl)))
if (any(varcheck == 1))
stop("Variable with only 1 value detected! Can't proceed with estimation!")
active <- homals:::checkPars(active, nvar)
rank <- homals:::checkPars(rank, nvar)
level <- homals:::checkPars(level, nvar)
if (length(sets) == 1)
sets <- lapply(1:nvar, "c")
if (!all(sort(unlist(sets)) == (1:nvar))) {
print(cat("sets union", sort(unlist(sets)), "\n"))
stop("inappropriate set structure !")
}
nset <- length(sets)
mis <- rep(0, nobj)
for (l in 1:nset) {
lset <- sets[[l]]
if (all(!active[lset]))
(next)()
jset <- lset[which(active[lset])]
for (i in 1:nobj) {
if (any(is.na(dframe[i, jset])))
dframe[i, jset] <- NA
else mis[i] <- mis[i] + 1
}
}
for (j in 1:nvar) {
k <- length(levels(dframe[, j]))
if (rank[j] > min(ndim, k - 1))
rank[j] <- min(ndim, k - 1)
}
x <- cbind(homals:::orthogonalPolynomials(mis, 1:nobj, ndim))
x <- homals:::normX(homals:::centerX(x, mis), mis)$q
y <- lapply(1:nvar, function(j) homals:::computeY(dframe[, j], x))
sold <- homals:::totalLoss(dframe, x, y, active, rank, level, sets)
iter <- pops <- 0
repeat {
iter <- iter + 1
y <- homals:::updateY(dframe, x, y, active, rank, level, sets,
verbose = verbose)
smid <- homals:::totalLoss(dframe, x, y, active, rank, level,
sets)/(nobj * nvar * ndim)
ssum <- homals:::totalSum(dframe, x, y, active, rank, level, sets)
qv <- homals:::normX(homals:::centerX((1/mis) * ssum, mis), mis)
z <- qv$q
snew <- homals:::totalLoss(dframe, z, y, active, rank, level,
sets)/(nobj * nvar * ndim)
if (verbose > 0)
cat("Iteration:", formatC(iter, digits = 3, width = 3),
"Loss Value: ", formatC(c(smid), digits = 6,
width = 6, format = "f"), "\n")
r <- abs(qv$r)/2
ops <- sum(r)
aps <- sum(La.svd(crossprod(x, mis * z), 0, 0)$d)/ndim
if (iter == itermax) {
stop("maximum number of iterations reached")
}
if (smid > sold) {
warning(cat("Loss function increases in iteration ",
iter, "\n"))
}
if ((ops - pops) < eps)
break
else {
x <- z
pops <- ops
sold <- smid
}
}
ylist <- alist <- clist <- ulist <- NULL
for (j in 1:nvar) {
gg <- dframe[, j]
c <- homals:::computeY(gg, z)
d <- as.vector(table(gg))
lst <- homals:::restrictY(d, c, rank[j], level[j])
y <- lst$y
a <- lst$a
u <- lst$z
ylist <- c(ylist, list(y))
alist <- c(alist, list(a))
clist <- c(clist, list(c))
ulist <- c(ulist, list(u))
}
dimlab <- paste("D", 1:ndim, sep = "")
for (i in 1:nvar) {
if (ndim == 1) {
ylist[[i]] <- cbind(ylist[[i]])
ulist[[i]] <- cbind(ulist[[i]])
clist[[i]] <- cbind(clist[[i]])
}
options(warn = -1)
# Here is the line that I changed in the code:
# rnames <- sort(as.numeric((rownames(clist[[i]]))))
rnames <- as.numeric(rownames(clist[[i]]))[order(as.numeric(rownames(clist[[i]])))]
options(warn = 0)
if ((any(is.na(rnames))) || (length(rnames) == 0))
rnames <- rownames(clist[[i]])
if (!is.matrix(ulist[[i]]))
ulist[[i]] <- as.matrix(ulist[[i]])
rownames(ylist[[i]]) <- rownames(ulist[[i]]) <- rownames(clist[[i]]) <- rnames
rownames(alist[[i]]) <- paste(1:dim(alist[[i]])[1])
colnames(clist[[i]]) <- colnames(ylist[[i]]) <- colnames(alist[[i]]) <- dimlab
colnames(ulist[[i]]) <- paste(1:dim(as.matrix(ulist[[i]]))[2])
}
names(ylist) <- names(ulist) <- names(clist) <- names(alist) <- colnames(dframe)
rownames(z) <- rownames(dframe)
colnames(z) <- dimlab
dummymat <- as.matrix(homals:::expandFrame(dframe, zero = FALSE, clean = FALSE))
dummymat01 <- dummymat
dummymat[dummymat == 2] <- NA
dummymat[dummymat == 0] <- Inf
scoremat <- array(NA, dim = c(dim(dframe), ndim), dimnames = list(rownames(dframe),
colnames(dframe), paste("dim", 1:ndim, sep = "")))
for (i in 1:ndim) {
catscores.d1 <- do.call(rbind, ylist)[, i]
dummy.scores <- t(t(dummymat) * catscores.d1)
freqlist <- apply(dframe, 2, function(dtab) as.list(table(dtab)))
cat.ind <- sequence(sapply(freqlist, length))
scoremat[, , i] <- t(apply(dummy.scores, 1, function(ds) {
ind.infel <- which(ds == Inf)
ind.minfel <- which(ds == -Inf)
ind.nan <- which(is.nan(ds))
ind.nael <- which((is.na(ds) + (cat.ind != 1)) ==
2)
ds[-c(ind.infel, ind.minfel, ind.nael, ind.nan)]
}))
}
disc.mat <- apply(scoremat, 3, function(xx) {
apply(xx, 2, function(cols) {
(sum(cols^2, na.rm = TRUE))/nobj
})
})
result <- list(datname = name, catscores = ylist, scoremat = scoremat,
objscores = z, cat.centroids = clist, ind.mat = dummymat01,
loadings = alist, low.rank = ulist, discrim = disc.mat,
ndim = ndim, niter = iter, level = level, eigenvalues = r,
loss = smid, rank.vec = rank, active = active, dframe = dframe,
call = match.call())
class(result) <- "homals"
result
}
I have a numeric data.frame df with 134946 rows x 1938 columns.
99.82% of the data are NA.
For each pair of (distinct) columns "P1" and "P2", I need to find which rows have non-NA values for both and then do some operations on those rows (linear model).
I wrote a script that does this, but it seems quite slow.
This post seems to discuss a related task, but I can't immediately see if or how it can be adapted to my case.
Borrowing the example from that post:
set.seed(54321)
nr = 1000;
nc = 900;
dat = matrix(runif(nr*nc), nrow=nr)
rownames(dat) = paste(1:nr)
colnames(dat) = paste("time", 1:nc)
dat[sample(nr*nc, nr*nc*0.9)] = NA
df <- as.data.frame(dat)
df_ps <- names(df)
N_ps <- length(df_ps)
My script is:
tic = proc.time()
out <- do.call(rbind,sapply(1:(N_ps-1), function(i) {
if (i/10 == floor(i/10)) {
cat("\ni = ",i,"\n")
toc = proc.time();
show(toc-tic);
}
do.call(rbind,sapply((i+1):N_ps, function(j) {
w <- which(complete.cases(df[,i],df[,j]))
N <- length(w)
if (N >= 5) {
xw <- df[w,i]
yw <- df[w,j]
if ((diff(range(xw)) != 0) & (diff(range(yw)) != 0)) {
s <- summary(lm(yw~xw))
o <- c(i,j,N,s$adj.r.squared,s$coefficients[2],s$coefficients[4],s$coefficients[8],s$coefficients[1],s$coefficients[3],s$coefficients[7])} else {
o <- c(i,j,N,rep(NA,7))
}
} else {o <- NULL}
return(o)
},simplify=F))
}
,simplify=F))
toc = proc.time();
show(toc-tic);
This takes about 10 minutes on my machine.
You can imagine what happens when I need to handle a much larger (although more sparse) data matrix. I never managed to finish the calculation.
Question: do you think this could be done more efficiently?
The thing is, I don't know which operations take the most time: subsetting df (in which case I would remove duplicated subsetting), appending matrix data (in which case I would fill a flat vector and convert it to a matrix at the end), or something else.
Thanks!
EDIT following up from minem's post
As shown by minem, the speed of this calculation strongly depended on the way linear regression parameters were calculated. Therefore changing that part was the single most important thing to do.
My own further trials showed that: 1) it was essential to use sapply in combination with do.call(rbind, ...) rather than any flat vector to store the data (I am still not sure why; I might make a separate post about this); 2) on the original matrix I am working on, which is much sparser and has a much larger nrow/ncol ratio than the one in this example, using the information on the x vector available at the start of each i iteration to reduce the y vector at the start of each j iteration increased the speed by several orders of magnitude, even compared with minem's original script, which was already much better than mine above.
I suppose the advantage comes from filtering out many rows a priori, thus avoiding costly xna & yna operations on very long vectors.
The modified script is the following:
set.seed(54321)
nr = 1000;
nc = 900;
dat = matrix(runif(nr*nc), nrow = nr)
rownames(dat) = paste(1:nr)
colnames(dat) = paste("time", 1:nc)
dat[sample(nr*nc, nr*nc*0.90)] = NA
df <- as.data.frame(dat)
df_ps <- names(df)
N_ps <- length(df_ps)
tic = proc.time()
naIds <- lapply(df, function(x) !is.na(x))
dl <- as.list(df)
rl <- sapply(1:(N_ps - 1), function(i) {
if ((i-1)/10 == floor((i-1)/10)) {
cat("\ni = ",i,"\n")
toc = proc.time();
show(toc-tic);
}
x <- dl[[i]]
xna <- which(naIds[[i]])
rl2 <- sapply((i + 1):N_ps, function(j) {
y <- dl[[j]][xna]
yna <- which(naIds[[j]][xna])
w <- xna[yna]
N <- length(w)
if (N >= 5) {
xw <- x[w]
yw <- y[yna]
if ((min(xw) != max(xw)) && (min(yw) != max(yw))) {
# extracts from lm/lm.fit/summary.lm functions
X <- cbind(1L, xw)
m <- .lm.fit(X, yw)
# calculate adj.r.squared
fitted <- yw - m$residuals
rss <- sum(m$residuals^2)
mss <- sum((fitted - mean(fitted))^2)
n <- length(m$residuals)
rdf <- n - m$rank
# rdf <- df.residual
r.squared <- mss/(mss + rss)
adj.r.squared <- 1 - (1 - r.squared) * ((n - 1L)/rdf)
# calculate se & pvals
p1 <- 1L:m$rank
Qr <- m$qr
R <- chol2inv(Qr[p1, p1, drop = FALSE])
resvar <- rss/rdf
se <- sqrt(diag(R) * resvar)
est <- m$coefficients[m$pivot[p1]]
tval <- est/se
pvals <- 2 * pt(abs(tval), rdf, lower.tail = FALSE)
res <- c(m$coefficients[2], se[2], pvals[2],
m$coefficients[1], se[1], pvals[1])
o <- c(i, j, N, adj.r.squared, res)
} else {
o <- c(i,j,N,rep(NA,7))
}
} else {o <- NULL}
return(o)
}, simplify = F)
do.call(rbind, rl2)
}, simplify = F)
out2 <- do.call(rbind, rl)
toc = proc.time();
show(toc - tic)
E.g. try with nr=100000; nc=100.
I should probably mention that I tried using indices, i.e.:
naIds <- lapply(df, function(x) which(!is.na(x)))
and then obviously generating w by intersection:
w <- intersect(xna,yna)
N <- length(w)
This however is slower than the above.
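For what it's worth, here is a minimal timing sketch of the two approaches (it assumes the microbenchmark package is installed; exact numbers will of course vary):
library(microbenchmark)
set.seed(1)
a <- runif(1e5); a[sample(1e5, 9e4)] <- NA
b <- runif(1e5); b[sample(1e5, 9e4)] <- NA
na.a <- !is.na(a); na.b <- !is.na(b)
microbenchmark(
  logical = which(na.a & na.b),
  index = intersect(which(na.a), which(na.b))
)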
The largest bottleneck is the lm function, because it performs a lot of checks and additional calculations that you do not necessarily need, so I extracted only the needed parts.
I got this to run in +/- 18 seconds.
set.seed(54321)
nr = 1000;
nc = 900;
dat = matrix(runif(nr*nc), nrow = nr)
rownames(dat) = paste(1:nr)
colnames(dat) = paste("time", 1:nc)
dat[sample(nr*nc, nr*nc*0.9)] = NA
df <- as.data.frame(dat)
df_ps <- names(df)
N_ps <- length(df_ps)
tic = proc.time()
naIds <- lapply(df, function(x) !is.na(x)) # outside loop
dl <- as.list(df) # subsetting list elements is faster than subsetting data.frame columns
rl <- sapply(1:(N_ps - 1), function(i) {
x <- dl[[i]]
xna <- naIds[[i]] # logical vector marking non-NA elements
rl2 <- sapply((i + 1):N_ps, function(j) {
y <- dl[[j]]
yna <- naIds[[j]]
w <- xna & yna
N <- sum(w)
if (N >= 5) {
xw <- x[w]
yw <- y[w]
if ((min(xw) != max(xw)) && (min(yw) != max(yw))) { # faster than diff(range())
# extracts from lm/lm.fit/summary.lm functions
X <- cbind(1L, xw)
m <- .lm.fit(X, yw)
# calculate adj.r.squared
fitted <- yw - m$residuals
rss <- sum(m$residuals^2)
mss <- sum((fitted - mean(fitted))^2)
n <- length(m$residuals)
rdf <- n - m$rank
# rdf <- df.residual
r.squared <- mss/(mss + rss)
adj.r.squared <- 1 - (1 - r.squared) * ((n - 1L)/rdf)
# calculate se & pvals
p1 <- 1L:m$rank
Qr <- m$qr
R <- chol2inv(Qr[p1, p1, drop = FALSE])
resvar <- rss/rdf
se <- sqrt(diag(R) * resvar)
est <- m$coefficients[m$pivot[p1]]
tval <- est/se
pvals <- 2 * pt(abs(tval), rdf, lower.tail = FALSE)
res <- c(m$coefficients[2], se[2], pvals[2],
m$coefficients[1], se[1], pvals[1])
o <- c(i, j, N, adj.r.squared, res)
} else {
o <- c(i,j,N,rep(NA,7))
}
} else {o <- NULL}
return(o)
}, simplify = F)
do.call(rbind, rl2)
}, simplify = F)
out2 <- do.call(rbind, rl)
toc = proc.time();
show(toc - tic);
# user system elapsed
# 17.94 0.11 18.44
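As a quick sanity check that the parts extracted from lm/summary.lm agree with the full functions (a sketch on toy data):
set.seed(1)
xw <- runif(30); yw <- 2 * xw + rnorm(30)
m <- .lm.fit(cbind(1L, xw), yw)
all.equal(unname(m$coefficients), unname(coef(lm(yw ~ xw)))) # TRUE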
I have tried to improve my previous code so that I can incorporate conditional probability.
Source Code
states <- c(1, 2)
alpha <- c(1, 1)/2
mat <- matrix(c(0.5, 0.5,
0, 1), nrow = 2, ncol = 2, byrow = TRUE)
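# (Quick sanity check, added for clarity: each row of a transition
# matrix is a probability distribution, so it must sum to 1.)
stopifnot(all(rowSums(mat) == 1))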
# this function calculates the next state, if present state is given.
# X = present states
# pMat = probability matrix
nextX <- function(X, pMat)
{
#set.seed(1)
probVec <- vector() # initialize vector
if(X == states[1]) # if the present state is 1
{
probVec <- pMat[1,] # take the 1st row
}
if(X==states[2]) # if the present state is 2
{
probVec <- pMat[2,] # take the 2nd row
}
return(sample(states, 1, replace=TRUE, prob=probVec)) # calculate the next state
}
# this function simulates 5 steps
steps <- function(alpha1, mat1, n1)
{
vec <- vector(mode="numeric", length = n1+1) # initialize an empty vector
X <- sample(states, 1, replace=TRUE, prob=alpha1) # initial state
vec[1] <- X
for (i in 2:(n1+1))
{
X <- nextX(X, mat1)
vec[i] <- X
}
return (vec)
}
# this function repeats the simulation n1 times.
# steps(alpha1=alpha, mat1=mat, n1=5)
simulate <- function(alpha1, mat1, n1)
{
mattt <- matrix(nrow=n1, ncol=6, byrow=T);
for (i in 1:(n1))
{
temp <- steps(alpha1, mat1, 5)
mattt[i,] <- temp
}
return (mattt)
}
Execution
I created this function so that it can handle any conditional probability:
prob <- function(simMat, fromStep, toStep, fromState, toState)
{
mean(simMat[toStep+1, simMat[fromStep+1, ]==fromState]==toState)
}
sim <- simulate(alpha, mat, 10)
p <- prob(sim, 0,1,1,1) # P(X1=1|X0=1)
p
Output
NaN
Why is this source code giving NaN?
How can I correct it?
I didn't inspect the rest of your code, but it seems that only prob has a mistake; you are mixing up rows with columns. Instead it should be:
prob <- function(simMat, fromStep, toStep, fromState, toState)
mean(simMat[simMat[, fromStep + 1] == fromState, toStep + 1] == toState)
Then NaN still remains a valid possibility for the following reason. We are looking at a conditional probability P(X1=1|X0=1) which, by definition, is well defined only when P(X0=1)>0. The same holds with sample estimates: if there are no cases where X0=1, then the "denominator" in the mean inside of prob is zero. Thus, it cannot and should not be fixed (i.e., returning 0 in those cases would be wrong).
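To see this concretely, here is a toy illustration: with two simulated chains that both start in state 2, the condition X0=1 selects no rows, and mean() is taken over an empty vector.
simMat <- matrix(c(2, 1, 1, 1,
                   2, 2, 1, 2), nrow = 2, byrow = TRUE) # rows = chains, cols = X0..X3
mean(simMat[simMat[, 1] == 1, 2] == 1) # mean(logical(0)) gives NaN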
I'm trying to follow the code and steps described on THIS page.
Which is in two parts:
Part 1
library(foreach)
library(doParallel)
library(data.table)
library(raster)
# Time the code
start <- proc.time()
if (!file.exists("./DataSets")) {
dir.create("./DataSets")
}
# Data Source:
# http://sedac.ciesin.columbia.edu/data/set/gpw-v3-population-count/data-download
# Format: .ascii, 1/2 degree, 2000
population.file <- "./Canada/VoteDensityRaster64Bit.tif"
# Load the raster file
population.raster <- raster(population.file)
# Convert the raster file to a points file
population.points <- rasterToPoints(population.raster)
all.data <- as.data.table(population.points)
setnames(all.data, c("x", "y", "population"))
# If you have your data in a CSV file, use this instead
# file <- "./DataSets/NBBuildingsWGS84.csv"
# all.data <- data.table(fread(file))
# The following are used to manipulate various data sets
# colnames(all.data) <- c("Name", "Mass", "Latitude", "Longitude") # Meteorites
# all.data$X <- as.numeric(all.data$X)
# all.data$Y <- as.numeric(all.data$Y)
# all.data$Mass <- as.numeric(all.data$Mass)
startEnd <- function(lats, lngs) {
# Find the "upper left" (NW) and "bottom right" (SE) coordinates
# of a set of data.
#
# Args:
# lats: A list of latitude coordinates
# lngs: A list of longitude coordinates
#
# Returns:
# A list of values corresponding to the northwest-most and
# southeast-most coordinates
# Convert to real number and remove NA values
lats <- na.omit(as.numeric(lats))
lngs <- na.omit(as.numeric(lngs))
topLat <- max(lats)
topLng <- min(lngs)
botLat <- min(lats)
botLng <- max(lngs)
return(c(topLat, topLng, botLat, botLng))
}
startEndVals <- startEnd(all.data$y, all.data$x)
remove(startEnd)
startLat <- startEndVals[1]
endLat <- startEndVals[3]
startLng <- startEndVals[2]
endLng <- startEndVals[4]
remove(startEndVals)
interval.v.num = 200.0
interval.h.num = 800.0
interval.v <- (startLat - endLat) / interval.v.num
interval.h <- (endLng - startLng) / interval.h.num
remove(num_intervals)
lat.list <- seq(startLat, endLat + interval.v, -1*interval.v)
# testLng <- -66.66152983 # Fredericton
# testLat <- 45.96538183 # Fredericton
# Prepare the data to be sent in
# If you have a value you want to sum, use this
data <- all.data[,list(x, y, population)]
# If you want to perform a count, use this
# data <- all.data[,list(x, y)]
# data[,Value:=1]
sumInsideSquare <- function(pointLat, pointLng, data) {
# Sum all the values that fall within a square on a map given a point,
# an interval of the map, and data that contains lat, lng and the values
# of interest
setnames(data, c("lng", "lat", "value"))
# Get data inside lat/lon boundaries
lng.interval <- c(pointLng, pointLng + interval.h)
lat.interval <- c(pointLat - interval.v, pointLat)
data <- data[lng %between% lng.interval][lat %between% lat.interval]
return(sum(data$value))
}
# Debugging
# squareSumTemp <- sumInsideSquare(testLat, testLng, interval, data)
# Given a start longitude and an end longitude, calculate an array of values
# corresponding to the sums for that latitude
calcSumLat <- function(startLng, endLng, lat, data) {
row <- c()
lng <- startLng
while (lng < endLng) {
row <- c(row, sumInsideSquare(lat, lng, data))
lng <- lng + interval.h
}
return(row)
}
# Debugging
# rowTemp <- calcSumLat(startLng, endLng, testLat, interval, data)
# write.csv(rowTemp, file = "Temp.csv", row.names = FALSE)
# Set up parallel computing with the number of cores you have
cl <- makeCluster(detectCores(), outfile = "./Progress.txt")
registerDoParallel(cl)
all.sums <- foreach(lat=lat.list, .packages=c("data.table")) %dopar% {
lat.data <- calcSumLat(startLng, endLng, lat, data)
# Progress indicator that works on Mac/Windows
print((startLat - lat)/(startLat - endLat)*100) # Prints to Progress.txt
lat.data
}
stopCluster(cl = cl)
# Convert to data frame
all.sums.table <- as.data.table(all.sums)
# Save to disk so I don't have to run it again
if (!file.exists("./GeneratedData")) {
dir.create("./GeneratedData")
}
output.file <- "./GeneratedData/VoteDensityHighRes.csv"
write.csv(all.sums.table, file = output.file, row.names = FALSE)
# End timer
totalTime <- proc.time() - start
print(totalTime)
# remove(cl, endLat, endLng, startLat, startLng, lat.list, start, calcSumLat, sumInsideSquare, interval)
Part 2
library(graphics)
library(tcltk)
library(pracma)
# Load the data generated by 01GenerateData.R
plot.data <- read.csv("GeneratedData/VoteDensityHighRes.csv", header=TRUE, stringsAsFactors=FALSE)
# Add padding above/below where there was data
# On top
top.padding <- 1:23
for (i in top.padding) {
plot.data <- cbind(0, plot.data)
}
# On bottom
bottom.padding <- 1:23
for (i in bottom.padding) {
plot.data <- cbind(plot.data, 0)
}
# On left
zero.row <- vector(mode="integer", length=dim(plot.data)[1])
left.padding <- 1:10
for (i in left.padding) {
plot.data <- rbind(zero.row, plot.data)
}
# On right
right.padding <- 1:10
for (i in right.padding) {
plot.data <- rbind(plot.data, zero.row)
}
max <- max(plot.data) # Max value in the data, used for scaling
plottingHeight <- 1000 # Arbitrary number that provides the graph's height
scaleFactor <- 300 # Discovered through trial and error to keep the graph in the boundaries
gap <- plottingHeight / length(plot.data) # Space between lines
# Output the file to a 36 inch by 24 inch SVG canvas
plot.width = 36
plot.height = 24
svg(filename = "./TestPlots/CanadaSG03.svg", pointsize=12, width=plot.width, height=plot.height)
# Create a blank plot
yVals <- as.vector(plot.data[[1]] / max * scaleFactor)
plot(0, 0, xlim=c(0, length(yVals)), ylim=c(0,1100), type="n", las=1, xlab=NA, ylab=NA, bty="n", axes=FALSE)
plotting.threshold <- 0.1
plot.length = length(plot.data)
# Progress bar
pb = tkProgressBar(title = "Plot Progress", label = "", min = 1, max = plot.length, initial = 1, width = 300)
# Plot each line
for (i in 1:plot.length) {
# Grabs a row of data
yVals <- as.vector(plot.data[[i]] / max * scaleFactor)
xVals <- c(0:(length(yVals) - 1))
yVals.smooth = savgol(yVals, 3, forder=4)
polygon(xVals, yVals.smooth + plottingHeight, border = NA, col = "#ffffff")
lines(xVals, yVals.smooth + plottingHeight, col="#cccccc", lwd=1.5)
# Plot the peaks with a darker line.
j <- 2 # Skip padding
while (j <= (length(yVals.smooth) - 2)) {
if ((yVals.smooth[j]) > plotting.threshold | (yVals.smooth[j+1]) > plotting.threshold) {
segments(xVals[j], yVals.smooth[j] + plottingHeight, xVals[j+1], yVals.smooth[j+1] + plottingHeight, col="#000000", lwd=1.5)
} else { } # Do nothing
j <- j + 1
}
plottingHeight <- plottingHeight - gap
# Update the progress bar
info <- sprintf("%d%% Complete", round(i / plot.length * 100))
setTkProgressBar(pb, i, title="Progress", info)
}
dev.off()
Sys.sleep(1)
close(pb) # Close the progress bar after a couple seconds
Everything runs perfectly until this part of the code from the second part runs:
yVals <- as.vector(plot.data[[1]] / max * scaleFactor)
plot(0, 0, xlim=c(0, length(yVals)), ylim=c(0,1100), type="n", las=1,xlab=NA, ylab=NA, bty="n", axes=FALSE)
And I get the following error message:
Error in plot.new() : cairo error 'error while writing to output stream'
I'm using R 3.3.1 and RStudio on Windows 10; I've also tried running the code with R 2.15.3.
How can I fix this error?
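One thing worth checking (an assumption on my part, since the script creates ./DataSets and ./GeneratedData but never ./TestPlots): the cairo device cannot write to a directory that does not exist, which produces exactly this kind of error. Creating the directory before opening the device may be all that is needed:
if (!file.exists("./TestPlots")) {
  dir.create("./TestPlots")
}
svg(filename = "./TestPlots/CanadaSG03.svg", pointsize = 12, width = plot.width, height = plot.height)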
I would like to do an ACF plot in R for only the negative values of a time series. I cannot do this by just subsetting the data for negative values beforehand, because the autocorrelation would then drop an arbitrary number of positive days between the negative values and be unreasonably high. Instead, I would like to run the autocorrelation on the whole time series and then filter the results to those where the first day is negative.
For example, in theory, I could make a data frame with the original series and all of the lagged series, then filter for the negative values in the original series, and then plot the correlations. However, I would like to automate this using the acf plot.
Here is an example of my time series:
> dput(exampleSeries)
c(0, 0, -0.000687, -0.004489, -0.005688, 0.000801, 0.005601,
0.004546, 0.003451, -0.000836, -0.002796, 0.005581, -0.003247,
-0.002416, 0.00122, 0.005337, -0.000195, -0.004255, -0.003097,
0.000751, -0.002037, 0.00837, -0.003965, -0.001786, 0.008497,
0.000693, 0.000824, 0.005681, 0.002274, 0.000773, 0.001141, 0.000652,
0.001559, -0.006201, 0.000479, -0.002041, 0.002757, -0.000736,
-2.1e-05, 0.000904, -0.000319, -0.000227, -0.006589, 0.000998,
0.00171, 0.000271, -0.004121, -0.002788, -9e-04, 0.001639, 0.004245,
-0.00267, -0.004738, 0.001192, 0.002175, 0.004666, 0.006005,
0.001218, -0.003188, -0.004363, 0.000462, -0.002241, -0.004806,
0.000463, 0.000795, -0.005715, 0.004635, -0.004286, -0.008908,
-0.001044, -0.000842, -0.00445, -0.006094, -0.001846, 0.005013,
-0.006599, 0.001914, 0.00221, 6.2e-05, -0.001391, 0.004369, -0.005739,
-0.003467, -0.002103, -0.000882, 0.001483, 0.003074, 0.00165,
-0.00035, -0.000573, -0.00316, -0.00102, -0.00144, 0.003421,
0.005436, 0.001994, 0.00619, 0.005319, 7.3e-05, 0.004513)
I tried to implement your description.
correl <- function(x, lag.max = 10){
library(dplyr)
m <- matrix(ncol = lag.max, nrow = length(x))
for(i in 1:lag.max){
m[,i] <- lag(x, i)
}
m <- m[x<0,]
res <- apply(m, 2, function(y) cor(y, x[x<0], use = "complete.obs"))
barplot(res)
}
correl(exampleSeries)
Maybe just write your own function? Something like:
negativeACF <- function(x, num.lags = 10)
{
n <- length(x)
acfs <- sapply(0:num.lags, function(i) cor(x[-i:-1], x[(-n-1+i):-n]))
names(acfs) <- 0:num.lags
acfs[acfs < 0]
}
results <- negativeACF(exampleSeries, num.lags=20)
barplot(results)
Yeah, I ended up writing my own functions and just replacing the values in the R acf object with my own correlations. So:
genACF <- function(series, my.acf, lag.max = NULL, neg){
x <- na.fail(as.ts(series))
x.freq <- frequency(x)
x <- as.matrix(x)
if (!is.numeric(x))
stop("'x' must be numeric")
sampleT <- as.integer(nrow(x))
nser <- as.integer(ncol(x))
if (is.null(lag.max))
lag.max <- floor(10 * (log10(sampleT) - log10(nser)))
lag.max <- as.integer(min(lag.max, sampleT - 1L))
if (is.na(lag.max) || lag.max < 0)
stop("'lag.max' must be at least 0")
if(neg){
indices <- which(series < 0)
}else{
indices <- which(series > 0)
}
series <- scale(series, scale = FALSE)
series.zoo <- zoo(series)
for(i in 0:lag.max){
lag.series <- lag(series.zoo, k = -i, na.pad = TRUE)
temp.corr <- cor(series.zoo[indices], lag.series[indices], use = 'complete.obs', method = 'pearson')
my.acf[i+1] <- temp.corr
}
my.acf[1] <- 0
return(my.acf)
}
plotMyACF <- function(series, main, type = 'correlation', neg = TRUE){
series.acf <- acf(series, plot = FALSE)
my.acf <- genACF(series, series.acf$acf, neg = neg)
series.acf$acf <- my.acf
plot(series.acf, xlim = c(1, dim(series.acf$acf)[1] - (type == 'correlation')), xaxt = "n", main = main)
if (dim(series.acf$acf)[1] < 25){
axis(1, at = 1:(dim(series.acf$acf)[1] - 1))
}else{
axis(1)
}
}
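A usage sketch (it assumes the zoo package is loaded, since genACF relies on zoo's lag(), and that exampleSeries is the series from the question):
library(zoo)
plotMyACF(exampleSeries, main = "ACF given first day is negative")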
And I get something like this: