Interval sets algebra in R (union, intersection, differences, inclusion, ...)

Interval sets algebra in R (union, intersection, differences, inclusion, ...) - r

I am wondering whether a proper framework for interval manipulation and comparison does exist in R.
After some search, I was only able to find the following:
- function findInterval in base Package. (but I hardly understand it)
- some answers here and there about union and intersection (notably: http://r.789695.n4.nabble.com/Union-Intersect-two-continuous-sets-td4224545.html)
Would you know of an initiative to implement a comprehensive set of tools to easily handles frequent tasks in interval manipulation, like inclusion/setdiff/union/intersection/etc. (eg see here for a list of functionalities)?
or would you have advice in developing such an approach?
below are some drafts on my side for doing so. it is surely awkward and still has some bugs but it might illustrate what I am looking for.
preliminary aspects about the options taken
- should deal seamlessly with intervals or intervals set
- intervals are represented as 2 columns data.frames (lower boundary, higher boundary), on one row
- intervals sets are represented as 2 columns with several rows
- a third column might be needed for identification of intervals sets
UNION
interval_union <- function(df){ # for data frame
df <- interval_clean(df)
if(is.empty(df)){
return(as.data.frame(NULL))
} else {
if(is.POSIXct(df[,1])) {
dated <- TRUE
df <- colwise(as.numeric)(df)
} else {
dated <- FALSE
}
M <- as.matrix(df)
o <- order(c(M[, 1], M[, 2]))
n <- cumsum( rep(c(1, -1), each=nrow(M))[o])
startPos <- c(TRUE, n[-1]==1 & n[-length(n)]==0)
endPos <- c(FALSE, n[-1]==0 & n[-length(n)]==1)
M <- M[o]
if(dated == TRUE) {
df2 <- colwise(mkDateTime)(as.data.frame(cbind(M[startPos], M[endPos])), from.s = TRUE)
} else {
df2 <- as.data.frame(cbind(M[startPos], M[endPos]))
}
colnames(df2) <- colnames(df)
# print(df2)
return(df2)
}
}
union_1_1 <- function(test, ref){
names(ref) <- names(test)
tmp <- interval_union(as.data.frame(rbind(test, ref)))
return(tmp)
}
union_1_n <- function(test, ref){
return(union_1_1(test, ref))
}
union_n_n <- function(test, ref){
testnn <- adply(.data = test, 1, union_1_n, ref, .expand = FALSE)
return(testnn)
}
ref_interval_union <- function(df, ref){
tmp0 <- adply(df, 1, union_1_1, ref, .expand = FALSE) # set to FALSE to keep ID
return(tmp0)
}
INTERSECTION
interval_intersect <- function(df){
# adapted from : http://r.789695.n4.nabble.com/Union-Intersect-two-continuous-sets-td4224545.html
M <- as.matrix(df)
L <- max(M[, 1])
R <- min(M[, 2])
Inew <- if (L <= R) c(L, R) else c()
if (!is.empty(Inew)){
df2 <- t(as.data.frame(Inew))
colnames(df2) <- colnames(df)
rownames(df2) <- NULL
} else {
df2 <- NULL
}
return(as.data.frame(df2))
}
ref_interval_intersect <- function(df, ref){
tmpfun <- function(a, b){
names(b) <- names(a)
tmp <- interval_intersect(as.data.frame(rbind(a, b)))
return(tmp)
}
tmp0 <- adply(df, 1, tmpfun, ref, .expand = FALSE) # [,3:4]
#if(!is.empty(tmp0)) colnames(tmp0) <- colnames(df)
return(tmp0)
}
int_1_1 <- function(test, ref){
te <- as.vector(test)
re <- as.vector(ref)
names(re) <- names(te)
tmp0 <- c(max(te[1, 1], re[1, 1]), min(te[1, 2], re[1, 2]))
if(tmp0[1]>tmp0[2]) tmp0 <- NULL # inverse of a correct interval --> VOID
if(!is.empty(tmp0)){
tmp1 <- colwise(mkDateTime)(as.data.frame(t(as.data.frame(tmp0))))
colnames(tmp1) <- colnames(test)
} else {
tmp1 <- data.frame(NULL)
}
return(tmp1)
}
int_1_n <- function(test, ref){
test1 <- adply(.data = ref, 1, int_1_1, test = test, .expand = FALSE)
if(is.empty(test1)){
return(data.frame(NULL))
} else {
testn <- interval_union(test1[,2:3])
return(testn)
}
}
int_n_n <- function(test, ref){
testnn <- adply(.data = test, 1, int_1_n, ref, .expand = FALSE)
# return(testnn[,2:3]) # return interval set without index (1st column)
return(testnn) # return interval set with index (1st column) --> usefull to go with merge to keep metadata going alon g with interval description
}
int_intersect <- function(df, ref){
mycols <- colnames(df)
df$X1 <- 1:nrow(df)
test <- df[, 1:2]
tmp <- int_n_n(test, ref)
intersection <- merge(tmp, df, by = "X1", suffixes = c("", "init"))
return(intersection[,mycols])
}
EXCLUSION
excl_1_1 <- function(test, ref){
te <- as.vector(test)
re <- as.vector(ref)
names(re) <- names(te)
if(te[1] < re[1]){ # Lower Bound
if(te[2] > re[1]){ # overlap
x <- unlist(c(te[1], re[1]))
} else { # no overlap
x <- unlist(c(te[1], te[2]))
}
} else { # test > ref on lower bound side
x <- NULL
}
if(te[2] > re[2]){ # Upper Bound
if(te[1] < re[2]){ # overlap
y <- unlist(c(re[2], te[2]))
} else { # no overlap
y <- unlist(c(te[1], te[2]))
}
} else { # test < ref on upper bound side
y <- NULL
}
if(is.empty(x) & is.empty(y)){
tmp0 <- NULL
tmp1 <- tmp0
} else {
tmp0 <- as.data.frame(rbind(x, y))
colnames(tmp0) <- colnames(test)
tmp1 <- interval_union(tmp0)
}
return(tmp1)
}
excl_1_n <- function(test, ref){
testn0 <- adply(.data = ref, 1, excl_1_1, test = test, .expand=FALSE)
# boucle pour intersecter successivement les intervalles sets, pour gérer les intervalles disjoints (identifiés par X1, col1)
tmp <- range(testn0)
names(tmp) <- colnames(testn0)[2:3]
tmp <- as.data.frame(t(tmp))
for(i in unique(testn0[,1])){
tmp <- int_n_n(tmp, testn0[testn0[,1]==i, 2:3])
}
return(tmp)
}
INCLUSION
incl_1_1 <- function(test, ref){
te <- as.vector(test)
re <- as.vector(ref)
if(te[1] >= re[1] & te[2] <= re[2]){ return(TRUE) } else { return(FALSE) }
}
incl_1_n <- function(test, ref){
testn <- adply(.data = ref, 1, incl_1_1, test = test)
return(any(testn[,ncol(testn)]))
}
incl_n_n <- function(test, ref){
testnn <- aaply(.data = test, 1, incl_1_n, ref, .expand = FALSE)
names(testnn) <- NULL
return(testnn)
}
flat_incl_n_n <- function(test, ref){
ref <- interval_union(ref)
return(incl_n_n(test, ref))
}
# testing for a vector, instead of an interval set
incl_x_1 <- function(x, ref){
test <- (x>=ref[1,1] & x<ref[1,2])
return(test)
}
incl_x_n <- function(x, ref){
test <- any(x>=ref[,1] & x<ref[,2])
return(test)
}

I think you might be able to make good use of the many interval-related functions in the sets package.
Here's a small example illustrating the package's support for interval construction, intersection, set difference, union, and complementation, as well as its test for inclusion in an interval. These and many other related functions are documented on the help page for ?interval.
library(sets)
i1 <- interval(1,6)
i2 <- interval(5,10)
i3 <- interval(200,400)
i4 <- interval(202,402)
i5 <- interval_union(interval_intersection(i1,i2),
interval_symdiff(i3,i4))
i5
# [5, 6] U [200, 202) U (400, 402]
interval_complement(i5)
# [-Inf, 5) U (6, 200) U [202, 400] U (402, Inf]
interval_contains_element(i5, 5.5)
# [1] TRUE
interval_contains_element(i5, 201)
# [1] TRUE
If your intervals are currently encoded in a two-column data.frame, you could use something like mapply() to convert them to intervals of the type used by the sets package:
df <- data.frame(lBound = c(1,5,100), uBound = c(10, 6, 200))
Ints <- with(df, mapply("interval", l=lBound, r=uBound, SIMPLIFY=FALSE))
Ints
# [[1]]
# [1, 10]
# [[2]]
# [5, 6]
# [[3]]
# [100, 200]

Related

homals package for Nonlinear PCA in R: Error in dimnames(x) <- dn : length of 'dimnames' [1] not equal to array extent

I am trying to implement NLPCA (Nonlinear PCA) on a data set using the homals package in R but I keep on getting the following error message:
Error in dimnames(x) <- dn : length of 'dimnames' [1] not equal to array extent
The data set I use can be found in the UCI ML Repository and it's called dat when imported in R: https://archive.ics.uci.edu/ml/datasets/South+German+Credit+%28UPDATE%29
Here is my code (some code is provided once the data set is downloaded):
nlpcasouthgerman <- homals(dat, rank=1, level=c('nominal','numerical',rep('nominal',2),
'numerical','nominal',
rep('ordinal',2), rep('nominal',2),
'ordinal','nominal','numerical',
rep('nominal',2), 'ordinal',
'nominal','ordinal',rep('nominal',3)),
active=c(FALSE, rep(TRUE, 20)), ndim=3, verbose=1)
I am trying to predict the first attribute, therefore I set it to be active=FALSE.
The output looks like this (skipped all iteration messages):
Iteration: 1 Loss Value: 0.000047
Iteration: 2 Loss Value: 0.000044
...
Iteration: 37 Loss Value: 0.000043
Iteration: 38 Loss Value: 0.000043
Error in dimnames(x) <- dn :
length of 'dimnames' [1] not equal to array extent
I don't understand why this error comes up. I have used the same code on some other data set and it worked fine so I don't see why this error persists. Any suggestions about what might be going wrong and how I could fix this issue?
Thanks!

It seems the error comes from code generating NAs in the homals function, specifically for your data for the number_credits levels, which causes problems with sort(as.numeric((rownames(clist[[i]])))) and the attempt to catch the error, since one of the levels does not give an NA value.
So either you have to modify the homals function to take care of such an edge case, or change problematic factor levels. This might be something to file as a bug report to the package maintainer.
As a work-around in your case you could do something like:
levels(dat$number_credits)[1] <- "_1"
and the function should run without problems.
Edit:
I think one solution would be to change one line of code in the homals function, but no guarantee this does work as intended. Better submit a bug report to the package author/maintainer - see https://cran.r-project.org/web/packages/homals/ for the address.
Using rnames <- as.numeric(rownames(clist[[i]]))[order(as.numeric(rownames(clist[[i]])))] instead of rnames <- sort(as.numeric((rownames(clist[[i]])))) would allow the following code to identify NAs, but I am not sure why the author did not try to preserve factor levels outright.
Anyway, you could run a modified function in your local environment, which would require to explicitly call internal (not exported) homals functions, as shown below. Not necessarily the best approach, but would help you out in a pinch.
homals <- function (data, ndim = 2, rank = ndim, level = "nominal", sets = 0,
active = TRUE, eps = 0.000001, itermax = 1000, verbose = 0) {
dframe <- data
name <- deparse(substitute(dframe))
nobj <- nrow(dframe)
nvar <- ncol(dframe)
vname <- names(dframe)
rname <- rownames(dframe)
for (j in 1:nvar) {
dframe[, j] <- as.factor(dframe[, j])
levfreq <- table(dframe[, j])
if (any(levfreq == 0)) {
newlev <- levels(dframe[, j])[-which(levfreq == 0)]
}
else {
newlev <- levels(dframe[, j])
}
dframe[, j] <- factor(dframe[, j], levels = sort(newlev))
}
varcheck <- apply(dframe, 2, function(tl) length(table(tl)))
if (any(varcheck == 1))
stop("Variable with only 1 value detected! Can't proceed with estimation!")
active <- homals:::checkPars(active, nvar)
rank <- homals:::checkPars(rank, nvar)
level <- homals:::checkPars(level, nvar)
if (length(sets) == 1)
sets <- lapply(1:nvar, "c")
if (!all(sort(unlist(sets)) == (1:nvar))) {
print(cat("sets union", sort(unlist(sets)), "\n"))
stop("inappropriate set structure !")
}
nset <- length(sets)
mis <- rep(0, nobj)
for (l in 1:nset) {
lset <- sets[[l]]
if (all(!active[lset]))
(next)()
jset <- lset[which(active[lset])]
for (i in 1:nobj) {
if (any(is.na(dframe[i, jset])))
dframe[i, jset] <- NA
else mis[i] <- mis[i] + 1
}
}
for (j in 1:nvar) {
k <- length(levels(dframe[, j]))
if (rank[j] > min(ndim, k - 1))
rank[j] <- min(ndim, k - 1)
}
x <- cbind(homals:::orthogonalPolynomials(mis, 1:nobj, ndim))
x <- homals:::normX(homals:::centerX(x, mis), mis)$q
y <- lapply(1:nvar, function(j) homals:::computeY(dframe[, j], x))
sold <- homals:::totalLoss(dframe, x, y, active, rank, level, sets)
iter <- pops <- 0
repeat {
iter <- iter + 1
y <- homals:::updateY(dframe, x, y, active, rank, level, sets,
verbose = verbose)
smid <- homals:::totalLoss(dframe, x, y, active, rank, level,
sets)/(nobj * nvar * ndim)
ssum <- homals:::totalSum(dframe, x, y, active, rank, level, sets)
qv <- homals:::normX(homals:::centerX((1/mis) * ssum, mis), mis)
z <- qv$q
snew <- homals:::totalLoss(dframe, z, y, active, rank, level,
sets)/(nobj * nvar * ndim)
if (verbose > 0)
cat("Iteration:", formatC(iter, digits = 3, width = 3),
"Loss Value: ", formatC(c(smid), digits = 6,
width = 6, format = "f"), "\n")
r <- abs(qv$r)/2
ops <- sum(r)
aps <- sum(La.svd(crossprod(x, mis * z), 0, 0)$d)/ndim
if (iter == itermax) {
stop("maximum number of iterations reached")
}
if (smid > sold) {
warning(cat("Loss function increases in iteration ",
iter, "\n"))
}
if ((ops - pops) < eps)
break
else {
x <- z
pops <- ops
sold <- smid
}
}
ylist <- alist <- clist <- ulist <- NULL
for (j in 1:nvar) {
gg <- dframe[, j]
c <- homals:::computeY(gg, z)
d <- as.vector(table(gg))
lst <- homals:::restrictY(d, c, rank[j], level[j])
y <- lst$y
a <- lst$a
u <- lst$z
ylist <- c(ylist, list(y))
alist <- c(alist, list(a))
clist <- c(clist, list(c))
ulist <- c(ulist, list(u))
}
dimlab <- paste("D", 1:ndim, sep = "")
for (i in 1:nvar) {
if (ndim == 1) {
ylist[[i]] <- cbind(ylist[[i]])
ulist[[i]] <- cbind(ulist[[i]])
clist[[i]] <- cbind(clist[[i]])
}
options(warn = -1)
# Here is the line that I changed in the code:
# rnames <- sort(as.numeric((rownames(clist[[i]]))))
rnames <- as.numeric(rownames(clist[[i]]))[order(as.numeric(rownames(clist[[i]])))]
options(warn = 0)
if ((any(is.na(rnames))) || (length(rnames) == 0))
rnames <- rownames(clist[[i]])
if (!is.matrix(ulist[[i]]))
ulist[[i]] <- as.matrix(ulist[[i]])
rownames(ylist[[i]]) <- rownames(ulist[[i]]) <- rownames(clist[[i]]) <- rnames
rownames(alist[[i]]) <- paste(1:dim(alist[[i]])[1])
colnames(clist[[i]]) <- colnames(ylist[[i]]) <- colnames(alist[[i]]) <- dimlab
colnames(ulist[[i]]) <- paste(1:dim(as.matrix(ulist[[i]]))[2])
}
names(ylist) <- names(ulist) <- names(clist) <- names(alist) <- colnames(dframe)
rownames(z) <- rownames(dframe)
colnames(z) <- dimlab
dummymat <- as.matrix(homals:::expandFrame(dframe, zero = FALSE, clean = FALSE))
dummymat01 <- dummymat
dummymat[dummymat == 2] <- NA
dummymat[dummymat == 0] <- Inf
scoremat <- array(NA, dim = c(dim(dframe), ndim), dimnames = list(rownames(dframe),
colnames(dframe), paste("dim", 1:ndim, sep = "")))
for (i in 1:ndim) {
catscores.d1 <- do.call(rbind, ylist)[, i]
dummy.scores <- t(t(dummymat) * catscores.d1)
freqlist <- apply(dframe, 2, function(dtab) as.list(table(dtab)))
cat.ind <- sequence(sapply(freqlist, length))
scoremat[, , i] <- t(apply(dummy.scores, 1, function(ds) {
ind.infel <- which(ds == Inf)
ind.minfel <- which(ds == -Inf)
ind.nan <- which(is.nan(ds))
ind.nael <- which((is.na(ds) + (cat.ind != 1)) ==
2)
ds[-c(ind.infel, ind.minfel, ind.nael, ind.nan)]
}))
}
disc.mat <- apply(scoremat, 3, function(xx) {
apply(xx, 2, function(cols) {
(sum(cols^2, na.rm = TRUE))/nobj
})
})
result <- list(datname = name, catscores = ylist, scoremat = scoremat,
objscores = z, cat.centroids = clist, ind.mat = dummymat01,
loadings = alist, low.rank = ulist, discrim = disc.mat,
ndim = ndim, niter = iter, level = level, eigenvalues = r,
loss = smid, rank.vec = rank, active = active, dframe = dframe,
call = match.call())
class(result) <- "homals"
result
}

Speeding up linear model fitting on complete pairwise observations in large sparse matrix in R

I have a numeric data.frame df with 134946 rows x 1938 columns.
99.82% of the data are NA.
For each pair of (distinct) columns "P1" and "P2", I need to find which rows have non-NA values for both and then do some operations on those rows (linear model).
I wrote a script that does this, but it seems quite slow.
This post seems to discuss a related task, but I can't immediately see if or how it can be adapted to my case.
Borrowing the example from that post:
set.seed(54321)
nr = 1000;
nc = 900;
dat = matrix(runif(nr*nc), nrow=nr)
rownames(dat) = paste(1:nr)
colnames(dat) = paste("time", 1:nc)
dat[sample(nr*nc, nr*nc*0.9)] = NA
df <- as.data.frame(dat)
df_ps <- names(df)
N_ps <- length(df_ps)
My script is:
tic = proc.time()
out <- do.call(rbind,sapply(1:(N_ps-1), function(i) {
if (i/10 == floor(i/10)) {
cat("\ni = ",i,"\n")
toc = proc.time();
show(toc-tic);
}
do.call(rbind,sapply((i+1):N_ps, function(j) {
w <- which(complete.cases(df[,i],df[,j]))
N <- length(w)
if (N >= 5) {
xw <- df[w,i]
yw <- df[w,j]
if ((diff(range(xw)) != 0) & (diff(range(yw)) != 0)) {
s <- summary(lm(yw~xw))
o <- c(i,j,N,s$adj.r.squared,s$coefficients[2],s$coefficients[4],s$coefficients[8],s$coefficients[1],s$coefficients[3],s$coefficients[7])} else {
o <- c(i,j,N,rep(NA,7))
}
} else {o <- NULL}
return(o)
},simplify=F))
}
,simplify=F))
toc = proc.time();
show(toc-tic);
This takes about 10 minutes on my machine.
You can imagine what happens when I need to handle a much larger (although more sparse) data matrix. I never managed to finish the calculation.
Question: do you think this could be done more efficiently?
The thing is I don't know which operations take more time (subsetting of df, in which case I would remove duplications of that? appending matrix data, in which case I would create a flat vector and then convert it to matrix at the end? ...).
Thanks!
EDIT following up from minem's post
As shown by minem, the speed of this calculation strongly depended on the way linear regression parameters were calculated. Therefore changing that part was the single most important thing to do.
My own further trials showed that: 1) it was essential to use sapply in combination with do.call(rbind, rather than any flat vector, to store the data (I am still not sure why - I might make a separate post about this); 2) on the original matrix I am working on, much more sparse and with a much larger nrows/ncolumns ratio than the one in this example, using the information on the x vector available at the start of each i iteration to reduce the y vector at the start of each j iteration increased the speed by several orders of magnitude, even compared with minem's original script, which was already much better than mine above.
I suppose the advantage comes from filtering out many rows a priori, thus avoiding costly xna & yna operations on very long vectors.
The modified script is the following:
set.seed(54321)
nr = 1000;
nc = 900;
dat = matrix(runif(nr*nc), nrow = nr)
rownames(dat) = paste(1:nr)
colnames(dat) = paste("time", 1:nc)
dat[sample(nr*nc, nr*nc*0.90)] = NA
df <- as.data.frame(dat)
df_ps <- names(df)
N_ps <- length(df_ps)
tic = proc.time()
naIds <- lapply(df, function(x) !is.na(x))
dl <- as.list(df)
rl <- sapply(1:(N_ps - 1), function(i) {
if ((i-1)/10 == floor((i-1)/10)) {
cat("\ni = ",i,"\n")
toc = proc.time();
show(toc-tic);
}
x <- dl[[i]]
xna <- which(naIds[[i]])
rl2 <- sapply((i + 1):N_ps, function(j) {
y <- dl[[j]][xna]
yna <- which(naIds[[j]][xna])
w <- xna[yna]
N <- length(w)
if (N >= 5) {
xw <- x[w]
yw <- y[yna]
if ((min(xw) != max(xw)) && (min(yw) != max(yw))) {
# extracts from lm/lm.fit/summary.lm functions
X <- cbind(1L, xw)
m <- .lm.fit(X, yw)
# calculate adj.r.squared
fitted <- yw - m$residuals
rss <- sum(m$residuals^2)
mss <- sum((fitted - mean(fitted))^2)
n <- length(m$residuals)
rdf <- n - m$rank
# rdf <- df.residual
r.squared <- mss/(mss + rss)
adj.r.squared <- 1 - (1 - r.squared) * ((n - 1L)/rdf)
# calculate se & pvals
p1 <- 1L:m$rank
Qr <- m$qr
R <- chol2inv(Qr[p1, p1, drop = FALSE])
resvar <- rss/rdf
se <- sqrt(diag(R) * resvar)
est <- m$coefficients[m$pivot[p1]]
tval <- est/se
pvals <- 2 * pt(abs(tval), rdf, lower.tail = FALSE)
res <- c(m$coefficients[2], se[2], pvals[2],
m$coefficients[1], se[1], pvals[1])
o <- c(i, j, N, adj.r.squared, res)
} else {
o <- c(i,j,N,rep(NA,7))
}
} else {o <- NULL}
return(o)
}, simplify = F)
do.call(rbind, rl2)
}, simplify = F)
out2 <- do.call(rbind, rl)
toc = proc.time();
show(toc - tic)
E.g. try with nr=100000; nc=100.
I should probably mention that I tried using indices, i.e.:
naIds <- lapply(df, function(x) which(!is.na(x)))
and then obviously generating w by intersection:
w <- intersect(xna,yna)
N <- length(w)
This however is slower than the above.

Larges bottleneck is lm function, because there are lot of checks & additional calculations, that you do not necessarily need. So I extracted only the needed parts.
I got this to run in +/- 18 seconds.
set.seed(54321)
nr = 1000;
nc = 900;
dat = matrix(runif(nr*nc), nrow = nr)
rownames(dat) = paste(1:nr)
colnames(dat) = paste("time", 1:nc)
dat[sample(nr*nc, nr*nc*0.9)] = NA
df <- as.data.frame(dat)
df_ps <- names(df)
N_ps <- length(df_ps)
tic = proc.time()
naIds <- lapply(df, function(x) !is.na(x)) # outside loop
dl <- as.list(df) # sub-setting list elements is faster that columns
rl <- sapply(1:(N_ps - 1), function(i) {
x <- dl[[i]]
xna <- naIds[[i]] # relevant logical vector if not empty elements
rl2 <- sapply((i + 1):N_ps, function(j) {
y <- dl[[j]]
yna <- naIds[[j]]
w <- xna & yna
N <- sum(w)
if (N >= 5) {
xw <- x[w]
yw <- y[w]
if ((min(xw) != max(xw)) && (min(xw) != max(xw))) { # faster
# extracts from lm/lm.fit/summary.lm functions
X <- cbind(1L, xw)
m <- .lm.fit(X, yw)
# calculate adj.r.squared
fitted <- yw - m$residuals
rss <- sum(m$residuals^2)
mss <- sum((fitted - mean(fitted))^2)
n <- length(m$residuals)
rdf <- n - m$rank
# rdf <- df.residual
r.squared <- mss/(mss + rss)
adj.r.squared <- 1 - (1 - r.squared) * ((n - 1L)/rdf)
# calculate se & pvals
p1 <- 1L:m$rank
Qr <- m$qr
R <- chol2inv(Qr[p1, p1, drop = FALSE])
resvar <- rss/rdf
se <- sqrt(diag(R) * resvar)
est <- m$coefficients[m$pivot[p1]]
tval <- est/se
pvals <- 2 * pt(abs(tval), rdf, lower.tail = FALSE)
res <- c(m$coefficients[2], se[2], pvals[2],
m$coefficients[1], se[1], pvals[1])
o <- c(i, j, N, adj.r.squared, res)
} else {
o <- c(i,j,N,rep(NA,6))
}
} else {o <- NULL}
return(o)
}, simplify = F)
do.call(rbind, rl2)
}, simplify = F)
out2 <- do.call(rbind, rl)
toc = proc.time();
show(toc - tic);
# user system elapsed
# 17.94 0.11 18.44

Why calling rbind on data.frame with 0 columns drops all the rows?

I noticed a discrepancy with rbind behaviour between matrix and data.frame objects.
With matrix objects everything works as expected:
mat1 <- matrix(nrow=2, ncol=0)
mat2 <- matrix(nrow=2, ncol=0)
dim(rbind(mat1, mat2))
[1] 4 0
But if we turn them to data.frame all of a sudden it looses the number of rows:
> dim(rbind(as.data.frame(mat1), as.data.frame(mat2)))
[1] 0 0
What I would like to understand is - is this behaviour intentional? And if so what is the reasoning for dropping the number of rows in this situation?
EDIT: As noted by #PoGibas - this behaviour is documented in ?rbind. No reason is given and it would probably be hard to infer one. So the question becomes:
How to rbind an arbitrary number of data.frames while always preserving their number of rows?

Workaround could be to use cbind and transposition:
m <- matrix(nrow = 2, ncol = 0)
as.data.frame(t(cbind(as.data.frame(t(m)), as.data.frame(t(m)))))
# Returns: data frame with 0 columns and 4 rows
Here cbind creates a data.frame with 0 rows and 4 columns and we transpose it to matrix with 4 rows and 0 columns.
Another solution is just brutal modification of original base::rbind.data.frame (source on github) function.
You have to remove/comment out two parts there:
Removal of arguments if there length is not a positive integer (length(data.frame()) returns 0). Comment out this part:
allargs <- allargs[lengths(allargs) > 0L]
Return of empty data.frame if attribute names is empty (you can't set attribute to an empty data.frame - names(as.data.frame(mat1)) <- "" returns an error). Comment out this part:
if(nvar == 0L) return(structure(list(), class = "data.frame", row.names = integer()))
Result:
m <- matrix(nrow = 2, ncol = 0)
dim(rbind.data.frame2(as.data.frame(m), as.data.frame(m)))
# Returns: [1] 4 0
Code:
rbind.data.frame2 <- function(..., deparse.level = 1, make.row.names = TRUE,
stringsAsFactors = default.stringsAsFactors())
{
match.names <- function(clabs, nmi)
{
if(identical(clabs, nmi)) NULL
else if(length(nmi) == length(clabs) && all(nmi %in% clabs)) {
## we need 1-1 matches here
m <- pmatch(nmi, clabs, 0L)
if(any(m == 0L))
stop("names do not match previous names")
m
} else stop("names do not match previous names")
}
if(make.row.names)
Make.row.names <- function(nmi, ri, ni, nrow)
{
if(nzchar(nmi)) {
if(ni == 0L) character() # PR8506
else if(ni > 1L) paste(nmi, ri, sep = ".")
else nmi
}
else if(nrow > 0L && identical(ri, seq_len(ni)) &&
identical(unlist(rlabs, FALSE, FALSE), seq_len(nrow)))
as.integer(seq.int(from = nrow + 1L, length.out = ni))
else ri
}
allargs <- list(...)
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# allargs <- allargs[lengths(allargs) > 0L]
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
if(length(allargs)) {
## drop any zero-row data frames, as they may not have proper column
## types (e.g. NULL).
nr <- vapply(allargs, function(x)
if(is.data.frame(x)) .row_names_info(x, 2L)
else if(is.list(x)) length(x[[1L]])
# mismatched lists are checked later
else length(x), 1L)
if(any(nr > 0L)) allargs <- allargs[nr > 0L]
else return(allargs[[1L]]) # pretty arbitrary
}
n <- length(allargs)
if(n == 0L)
return(structure(list(),
class = "data.frame",
row.names = integer()))
nms <- names(allargs)
if(is.null(nms))
nms <- character(n)
cl <- NULL
perm <- rows <- vector("list", n)
rlabs <- if(make.row.names) rows # else NULL
nrow <- 0L
value <- clabs <- NULL
all.levs <- list()
for(i in seq_len(n)) {
## check the arguments, develop row and column labels
xi <- allargs[[i]]
nmi <- nms[i]
## coerce matrix to data frame
if(is.matrix(xi)) allargs[[i]] <- xi <-
as.data.frame(xi, stringsAsFactors = stringsAsFactors)
if(inherits(xi, "data.frame")) {
if(is.null(cl))
cl <- oldClass(xi)
ri <- attr(xi, "row.names")
ni <- length(ri)
if(is.null(clabs)) ## first time
clabs <- names(xi)
else {
if(length(xi) != length(clabs))
stop("numbers of columns of arguments do not match")
pi <- match.names(clabs, names(xi))
if( !is.null(pi) ) perm[[i]] <- pi
}
rows[[i]] <- seq.int(from = nrow + 1L, length.out = ni)
if(make.row.names) rlabs[[i]] <- Make.row.names(nmi, ri, ni, nrow)
nrow <- nrow + ni
if(is.null(value)) { ## first time ==> setup once:
value <- unclass(xi)
nvar <- length(value)
all.levs <- vector("list", nvar)
has.dim <- facCol <- ordCol <- logical(nvar)
for(j in seq_len(nvar)) {
xj <- value[[j]]
facCol[j] <-
if(!is.null(levels(xj))) {
all.levs[[j]] <- levels(xj)
TRUE # turn categories into factors
} else
is.factor(xj)
ordCol[j] <- is.ordered(xj)
has.dim[j] <- length(dim(xj)) == 2L
}
}
else for(j in seq_len(nvar)) {
xij <- xi[[j]]
if(is.null(pi) || is.na(jj <- pi[[j]])) jj <- j
if(facCol[jj]) {
if(length(lij <- levels(xij))) {
all.levs[[jj]] <- unique(c(all.levs[[jj]], lij))
ordCol[jj] <- ordCol[jj] & is.ordered(xij)
} else if(is.character(xij))
all.levs[[jj]] <- unique(c(all.levs[[jj]], xij))
}
}
}
else if(is.list(xi)) {
ni <- range(lengths(xi))
if(ni[1L] == ni[2L])
ni <- ni[1L]
else stop("invalid list argument: all variables should have the same length")
rows[[i]] <- ri <-
as.integer(seq.int(from = nrow + 1L, length.out = ni))
nrow <- nrow + ni
if(make.row.names) rlabs[[i]] <- Make.row.names(nmi, ri, ni, nrow)
if(length(nmi <- names(xi)) > 0L) {
if(is.null(clabs))
clabs <- nmi
else {
if(length(xi) != length(clabs))
stop("numbers of columns of arguments do not match")
pi <- match.names(clabs, nmi)
if( !is.null(pi) ) perm[[i]] <- pi
}
}
}
else if(length(xi)) { # 1 new row
rows[[i]] <- nrow <- nrow + 1L
if(make.row.names)
rlabs[[i]] <- if(nzchar(nmi)) nmi else as.integer(nrow)
}
}
nvar <- length(clabs)
if(nvar == 0L)
nvar <- max(lengths(allargs)) # only vector args
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# if(nvar == 0L)
# return(structure(list(), class = "data.frame",
# row.names = integer()))
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
pseq <- seq_len(nvar)
if(is.null(value)) { # this happens if there has been no data frame
value <- list()
value[pseq] <- list(logical(nrow)) # OK for coercion except to raw.
all.levs <- vector("list", nvar)
has.dim <- facCol <- ordCol <- logical(nvar)
}
names(value) <- clabs
for(j in pseq)
if(length(lij <- all.levs[[j]]))
value[[j]] <-
factor(as.vector(value[[j]]), lij, ordered = ordCol[j])
if(any(has.dim)) {
rmax <- max(unlist(rows))
for(i in pseq[has.dim])
if(!inherits(xi <- value[[i]], "data.frame")) {
dn <- dimnames(xi)
rn <- dn[[1L]]
if(length(rn) > 0L) length(rn) <- rmax
pi <- dim(xi)[2L]
length(xi) <- rmax * pi
value[[i]] <- array(xi, c(rmax, pi), list(rn, dn[[2L]]))
}
}
for(i in seq_len(n)) {
xi <- unclass(allargs[[i]])
if(!is.list(xi))
if(length(xi) != nvar)
xi <- rep(xi, length.out = nvar)
ri <- rows[[i]]
pi <- perm[[i]]
if(is.null(pi)) pi <- pseq
for(j in pseq) {
jj <- pi[j]
xij <- xi[[j]]
if(has.dim[jj]) {
value[[jj]][ri, ] <- xij
## copy rownames
rownames(value[[jj]])[ri] <- rownames(xij)
} else {
## coerce factors to vectors, in case lhs is character or
## level set has changed
value[[jj]][ri] <- if(is.factor(xij)) as.vector(xij) else xij
## copy names if any
if(!is.null(nm <- names(xij))) names(value[[jj]])[ri] <- nm
}
}
}
if(make.row.names) {
rlabs <- unlist(rlabs)
if(anyDuplicated(rlabs))
rlabs <- make.unique(as.character(rlabs), sep = "")
}
if(is.null(cl)) {
as.data.frame(value, row.names = rlabs, fix.empty.names = TRUE,
stringsAsFactors = stringsAsFactors)
} else {
structure(value, class = cl,
row.names = if(is.null(rlabs)) .set_row_names(nrow) else rlabs)
}
}

R - lapply on deep nested elements of list

I have a large list that stored measurements (a product of other lapply() runs). I now want to gather these measurements and calculate median/mean/sd etc but I don't know how to access them. The structure of this list is like this:
foo[[i]][[j]][[k]][[1]]
foo[[i]][[j]][[k]][[2]]$bar
I can't figure out a function that would return e.g. mean of $bar (but not of $x) and keep relation the values of the indices i,j,k.
A sample list can be generated with the following R code:
library(purrr)
metrics <- function(y){
tt10r <- median(y)
list(y, flatten(list(bar = tt10r)))
}
example_list <- list()
for (i in 1:10)
{
v <- list()
for (j in 1:10)
{
w <- 1:10
v[j] <- list(w)
}
example_list[[i]] <- v
}
foo <- list()
for (i in 1:length(example_list))
{
u <- list()
values <- list()
for (j in 1:length(example_list[[i]]))
{
u[[j]] <- lapply(example_list[[i]][[j]], function(x) mean(x))
values[[j]] <- lapply(u[[j]], function(x) metrics(x))
}
foo[[i]] <- values
}

The following code works nicely, but I am not sure if it is efficient (loops!). Gives the anticipated result:
final <- matrix(nrow = tail(cumsum(unlist(lapply(foo, function(x) lengths(x) -2))), n=1), ncol = 3)
final <- data.frame(final)
j=1
i=1
all_js <- c(0, cumsum(lengths(foo)))
starts <- c(0, cumsum(unlist(lapply(foo, function(x) lengths(x) -2)))) + 1
ends <- c(0, cumsum(unlist(lapply(foo, function(x) lengths(x) -2))))
for (i in 1:length(foo))
{
a <- foo[[i]]
for (j in 1:length(a))
{
b <- a[[j]]
data <- unlist(lapply(lapply(b[1], '[', 2), '[[', 1))
for (k in 2:c(length(b)-2))
{
data <- rbind(data,unlist(lapply(lapply(b[k], '[', 2), '[[', 1)))
}
row.names(data) <- NULL
colnames(final) <- c("i", "j", colnames(data))
first <- starts[all_js[i] + j]
last <- ends[all_js[i] + j+1]
final[first:last,] <- data.frame(cbind(i = i, j = j, data))
}
}

Collapse runs of consecutive numbers to ranges

Consider the following comma-separated string of numbers:
s <- "1,2,3,4,8,9,14,15,16,19"
s
# [1] "1,2,3,4,8,9,14,15,16,19"
Is it possible to collapse runs of consecutive numbers to its corresponding ranges, e.g. the run 1,2,3,4 above would be collapsed to the range 1-4. The desired result looks like the following string:
s
# [1] "1-4,8,9,14-16,19"

I took some heavy inspiration from the answers in this question.
findIntRuns <- function(run){
rundiff <- c(1, diff(run))
difflist <- split(run, cumsum(rundiff!=1))
unlist(lapply(difflist, function(x){
if(length(x) %in% 1:2) as.character(x) else paste0(x[1], "-", x[length(x)])
}), use.names=FALSE)
}
s <- "1,2,3,4,8,9,14,15,16,19"
s2 <- as.numeric(unlist(strsplit(s, ",")))
paste0(findIntRuns(s2), collapse=",")
[1] "1-4,8,9,14-16,19"
EDIT: Multiple solutions: benchmarking time!
Unit: microseconds
expr min lq median uq max neval
spee() 277.708 295.517 301.5540 311.5150 1612.207 1000
seb() 294.611 313.025 321.1750 332.6450 1709.103 1000
marc() 672.835 707.549 722.0375 744.5255 2154.942 1000
#speendo's solution is the fastest at the moment, but none of these have been optimised yet.

I was too slow... but here's another solution.
It uses less R-specific functions so it could be ported to other languages (on the other hand maybe it's less elegant)
s <- "1,2,3,4,8,9,14,15,16,19"
collapseConsecutive <- function(s){
x <- as.numeric(unlist(strsplit(s, ",")))
x_0 <- x[1]
out <- toString(x[1])
hasDash <- FALSE
for(i in 2:length(x)) {
x_1 <- x[i]
x_2 <- x[i+1]
if((x_0 + 1) == x_1 && !is.na(x_2) && (x_1 + 1) == x_2) {
if(!hasDash) {
out <- c(out, "-")
hasDash <- TRUE
}
} else {
if(hasDash) {
hasDash <- FALSE
} else {
out <- c(out, ",")
}
out <- c(out, x_1)
hasDash <- FALSE
}
x_0 <- x_1
}
outString <- paste(out, collapse="")
outString
}
collapseConsecutive(s)
# [1] "1-4,8,9,14-16,19"

Another fairly compact option
in.seq <- function(x) {
# returns TRUE for elments within ascending sequences
(c(diff(x, 1), NA) == 1 & c(NA, diff(x,2), NA) == 2)
}
contractSeqs <- function(x) {
# returns string formatted with contracted sequences
x[in.seq(x)] <- ""
gsub(",{2,}", "-", paste(x, collapse=","), perl=TRUE)
}
s <- "1,2,3,4,8,9,14,15,16,19"
s1 <- as.numeric(unlist(strsplit(s, ","))) # as earlier answers
# assumes: numeric vector, length > 2, positive integers, ascending sequences
contractSeqs(s1)
# [1] "1-4,8,9,14-16,19"
I also wrote a bells & whistles version that can handle both numeric and string input including named objects, descending sequences and alternative punctuation, as well as performing error checking and reporting. If anyone is interested, I can add this to my answer.

Here's a function that should do what you want:
conseq <- function(s){
s <- as.numeric(unlist(strsplit(s, ",")))
dif <- s[seq(length(s))][-1] - s[seq(length(s)-1)]
new <- !c(0, dif == 1)
cs <- cumsum(new)
res <- vector(mode="list", max(cs))
for(i in seq(res)){
s.i <- s[which(cs == i)]
if(length(s.i) > 2){
res[[i]] <- paste(min(s.i), max(s.i), sep="-")
} else {
res[[i]] <- as.character(s.i)
}
}
paste(unlist(res), collapse=",")
}
Example
> s <- "1,2,3,4,8,9,14,15,16,19"
> conseq(s)
[1] "1-4,8,9,14-16,19"

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Interval sets algebra in R (union, intersection, differences, inclusion, ...) - r

Related

homals package for Nonlinear PCA in R: Error in dimnames(x) <- dn : length of 'dimnames' [1] not equal to array extent

Speeding up linear model fitting on complete pairwise observations in large sparse matrix in R

Why calling rbind on data.frame with 0 columns drops all the rows?

R - lapply on deep nested elements of list

Collapse runs of consecutive numbers to ranges

Categories

Resources