splitting up ranges - r

Say I have some ranges represented by start coordinates start <- c(1,2,3) and end coordinates end <- c(4,5,4); ranges <- data.frame(start, end). How can I split each range up into length-one intervals?
i.e. I want
this
starts ends
1 1 4
2 2 5
3 3 4
to be transformed into this:
starts ends
1 1 2 |
2 3 4 <-end of original first interval
3 2 3 |
4 4 5 <-end of original second interval
5 3 4 <-end of original third interval
Right now I have a for loop iterating through the list and creating a sequence that goes from start to end, but this loop takes a very long time to execute for long lists of ranges.

Here's one way. It's a "glorified for-loop" in the disguise of lapply on a sequence.
# Your sample data
ranges<-data.frame(start=c(1,2,3),end=c(4,5,4))
# Extract the start/end columns
start <- ranges$start
end <- ranges$end
# Calculate result data
res <- lapply(seq_along(start), function(i) start[i]+seq(0, end[i]-start[i]))
# Make it into a data.frame by way of a matrix (which has a byrow argument)
newRanges <- as.data.frame( matrix(unlist(res), ncol=2, byrow=TRUE, dimnames=list(NULL, names(ranges))) )
Which gives the correct result:
> newRanges
start end
1 1 2
2 3 4
3 2 3
4 4 5
5 3 4
And then time it on a bigger problem:
n <- 1e5
start <- sample(10, n, replace=TRUE)
end <- start + sample( 3, n, replace=TRUE)*2-1
system.time( newRanges <- as.data.frame( matrix(unlist(lapply(seq_along(start), function(i) start[i]+seq(0, end[i]-start[i]))), ncol=2, byrow=TRUE) ) )
This takes about 1.6 seconds on my machine. Good enough?
...The trick is to work on the vectors directly instead of on the data.frame. And then build the data.frame at the end.
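A quick illustration of that point (a rough sketch, not part of the timings above; bigRanges is just a hypothetical data.frame wrapper around the n = 1e5 vectors generated earlier):
bigRanges <- data.frame(start, end)
# indexing into the data.frame on every iteration: slow
system.time(for (i in seq_along(start)) bigRanges[i, "start"] + bigRanges[i, "end"])
# indexing the plain vectors: fast
system.time(for (i in seq_along(start)) start[i] + end[i])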
Update: @Ellipsis... commented that lapply is no better than a for-loop. Let's see:
system.time( a <- unlist(lapply(seq_along(start), function(i) start[i]+seq(0, end[i]-start[i]))) ) # 1.6 secs
system.time( b <- {
  res <- vector('list', length(start))
  for (i in seq_along(start)) {
    res[[i]] <- start[i] + seq(0, end[i] - start[i])
  }
  unlist(res)
}) # 1.8 secs
So, not only is the for-loop about 12% slower in this case, it is also much more verbose...
UPDATE AGAIN!
@Martin Morgan suggested using Map, and it is indeed the fastest solution yet - faster than do.call in my other answer. Also, by using seq.int my first solution becomes much faster:
# do.call solution: 0.46 secs
system.time( matrix(do.call('c', lapply(seq_along(start), function(i) call(':', start[i], end[i]))), ncol=2, byrow=TRUE) )
# lapply solution: 0.42 secs
system.time( matrix(unlist(lapply(seq_along(start), function(i) start[[i]]+seq.int(0L, end[[i]]-start[[i]]))), ncol=2, byrow=TRUE) )
# Map solution: 0.26 secs
system.time( matrix(unlist(Map(seq.int, start, end)), ncol=2, byrow=TRUE) )
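If you want the result as a data.frame with the original column names rather than just the matrix, the same finishing step as in the first solution applies (a small sketch):
newRanges <- as.data.frame( matrix(unlist(Map(seq.int, start, end)), ncol=2, byrow=TRUE, dimnames=list(NULL, c("start", "end"))) )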

You could try creating the vectors as text, parse-ing and eval-uating it, and then using a matrix to create the data.frame:
txt <- paste("c(",paste(ranges$start,ranges$end,sep=":",collapse=","),")",sep="")
> txt
[1] "c(1:4,2:5,3:4)"
vec <- eval(parse(text=txt))
> vec
[1] 1 2 3 4 2 3 4 5 3 4
mat <- matrix(vec,ncol=2,byrow=T)
> data.frame(mat)
X1 X2
1 1 2
2 3 4
3 2 3
4 4 5
5 3 4

Here's another answer based on @James's great solution. It avoids paste and parse and is a little bit faster:
vec <- do.call('c', lapply(seq_along(start), function(i) call(':', start[i], end[i])))
mat <- matrix(vec,ncol=2,byrow=T)
Timing it:
set.seed(42)
n <- 1e5
start <- sample(10, n, replace=TRUE)
end <- start + sample( 3, n, replace=TRUE)*2-1
# @James's code: 6.64 secs
system.time({
  for (i in 1:10) {
    txt <- paste("c(", paste(start, end, sep=":", collapse=","), ")", sep="")
    vec <- eval(parse(text=txt))
    mat <- matrix(vec, ncol=2, byrow=TRUE)
  }
})
# My variant: 5.17 secs
system.time({
  for (i in 1:10) {
    vec <- do.call('c', lapply(seq_along(start), function(i) call(':', start[i], end[i])))
    mat <- matrix(vec, ncol=2, byrow=TRUE)
  }
})

Related

How to run a function with multiple arguments of varying length in a loop in R

I need to run this function about 6000 times to cover all of its iterations. The function has 6 arguments in total. The first 3 of them go hand in hand and each has 75 values. The next argument has 9 values, and the last 2 arguments have 3 values each.
#require dplyr
#data is history as list
matchloop <- function(data, data2, x, a, b, c) {
  # history as list
  split <- data
  # history for reference
  fh <- FullHistory
  # start counter
  n <- 1
  # end counter
  m <- a
  tempdf0.3 <- fh
  # set condition for loop
  while (nrow(tempdf0.3) > 1 && m <= (nrow(data2)) * b) {
    # put history into a variable
    tempdf0.0 <- split
    # put fh into a variable
    tempdf0.5 <- fh
    # put test path into variable from row n to m
    tempdf0.1 <- as.data.frame(data2[n:m, ], stringsAsFactors = FALSE)
    # change column name of test path
    colnames(tempdf0.1) <- "directions"
    # put row n to m of history into variable
    tempdf0.2 <- lapply(tempdf0.0, function(df) df[n:m, ])
    # put output into output
    tempdf0.3 <- orderedDistancespos(tempdf0.2, tempdf0.1,
                                     "allPaths", "directions")
    # add to output routeID based on reference from fh - the test path ID
    tempdf0.3 <- mutate(tempdf0.3, routeID = (subset(tempdf0.5, routeID
                                                     != x)$routeID))
    # reduce output based on the matched threshold
    tempdf0.3 <- subset(tempdf0.3, dists >= a * c)
    # create new history based on the IDs remaining in output
    split <- split[as.character(tempdf0.3$routeID)]
    # create new history for reference based on the IDs remaining in output
    fh <- subset(fh, routeID %in% tempdf0.3$routeID)
    # increase loop counter
    n <- n + a
    # increase loop counter
    m <- n + (a - 1)
  }
  # show output
  mylist <- list(tempdf0.3, nrow(tempdf0.3))
  return(mylist)
}
I tried putting the 3 arguments with 75 elements each into their own lists and using mapply. This works, but even at this level I still have to run the code 81 times to cover all the variables, because as far as I understand mapply recycles based on the length of the longest argument.
mapply(matchloop, mylist2,mylist3,mylist4, MoreArgs = list(a=a, b=b, c=c))
data is a list of dataframes
data2 is a dataframe
x, a, b, c are all numerical.
Right now I'm trying to streamline my output so that it's produced in one go: if possible I would like a single csv output with all 6000+ lines.
You can combine the mapply and apply functions to cycle through all possible combinations of the a, b and c variables. To create all possible combinations you can use expand.grid. Finally you can concatenate the list of rows into a data.frame with the help of the do.call and rbind functions as follows:
matchloop_stub <- matchloop <- function(data, data2, x, a, b, c) {
  # stub
  c(d = sum(data), d2 = sum(data2), x = sum(x), a = a, b = b, c = c, r = a + b + c)
}
set.seed(123)
mylist2 <- replicate(75, data.frame(rnorm(1)))
mylist3 <- replicate(75, data.frame(rnorm(2)))
mylist4 <- replicate(75, data.frame(rnorm(3)))
a <- 1:9
b <- 1:3
c <- 1:3
abc <- expand.grid(a, b, c)
names(abc) <- c("a", "b", "c")
xs <- apply(abc, 1, function(x) (mapply(matchloop_stub, mylist2, mylist3, mylist4, x[1], x[2], x[3], SIMPLIFY = FALSE)))
df <- do.call(rbind, do.call(rbind, xs))
write.csv(df, file = "temp.csv")
res <- read.csv("temp.csv")
nrow(res)
# [1] 6075
head(res)
# X d d2 x a b c r
# 1 1 -0.5604756 0.7407984 -1.362065 1 1 1 3
# 2 2 -0.5604756 0.7407984 -1.362065 2 1 1 4
# 3 3 -0.5604756 0.7407984 -1.362065 3 1 1 5
# 4 4 -0.5604756 0.7407984 -1.362065 4 1 1 6
# 5 5 -0.5604756 0.7407984 -1.362065 5 1 1 7
# 6 6 -0.5604756 0.7407984 -1.362065 6 1 1 8

Problems with speeding up loop in R

I have a particularly big dataset which consists of 3.7 million rows and 76 string columns.
I want to compare each row with the row below it and count, for every pair of consecutive rows, how many columns match. I have written this code:
a <- c("a","a","a","a","a","a","a","a","a")
b <- c("b","b","b","b","a","b","b","b","b")
c <- c("c","c","c","c","a","a","a","b","b")
d <- c("d","d","d","d","d","d","d","d","d")
features_split <- data.frame(a,b,c,d); features_split
library(data.table)
ncol <- max(sapply(features_split, length))
safe <- as.data.table(lapply(1:ncol, function(i) sapply(features_split, "[", i)))
nrow(safe)
df <- safe
LIST <-list()
LIST2 <-list()
for (i in 1:(nrow(df)-1)) {
  LIST[[i]] <- df[i+1,] %in% df[i,]
  LIST2[[i]] <- length(LIST[[i]][LIST[[i]] == TRUE])
}
safe2 <- unlist(LIST2)
not_available <- rowSums(!is.na(safe))
It takes forever to run that loop. How can I improve it?
(It takes about 1 hour for 100,000 rows, but I have more than 3.7 million.)
Grateful for anything,
Tobi
Using a data.frame
Proof of concept, using data.frame:
set.seed(4)
nr <- 1000
mydf <- data.frame(a=sample(letters[1:3], nr, repl=TRUE),
                   b=sample(letters[1:3], nr, repl=TRUE),
                   c=sample(letters[1:3], nr, repl=TRUE),
                   d=sample(letters[1:3], nr, repl=TRUE),
                   stringsAsFactors=FALSE)
matches <- vapply(seq.int(nrow(mydf)-1),
                  function(ii) sum(mydf[ii,] == mydf[ii+1,]),
                  integer(1))
head(matches)
## [1] 0 3 4 2 1 0
sum(matches == 4) # total number of perfect row-matches
## 16
In matches, the integer in position i indicates how many strings from row i exactly match the corresponding string from row i+1. A match of 0 means no matches at all, and (in this case) 4 means the row is a perfect match.
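If you also want to know which consecutive row pairs match perfectly, not just how many there are, a small follow-up sketch using the same objects:
which(matches == ncol(mydf))  # row indices i where row i equals row i+1 in every column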
Taking it a bit larger for a demonstration of time:
nr <- 100000
nc <- 76
mydf2 <- as.data.frame(matrix(sample(letters[1:4], nr*nc, repl=TRUE), nc=nc),
                       stringsAsFactors=FALSE)
dim(mydf2)
## [1] 100000 76
system.time(
  matches2 <- vapply(seq.int(nrow(mydf2)-1),
                     function(ii) sum(mydf2[ii,] == mydf2[ii+1,]),
                     integer(1))
)
## user system elapsed
## 370.63 12.14 385.36
Using a matrix instead
If you can afford to do it as a matrix (since you have a homogenous data type of "character") instead of a data.frame, you'll get considerably better performance:
nr <- 100000
nc <- 76
mymtx2 <- matrix(sample(letters[1:4], nr*nc, repl=TRUE), nc=nc)
dim(mymtx2)
## [1] 100000     76
system.time(
  matches2 <- vapply(seq.int(nrow(mymtx2)-1),
                     function(ii) sum(mymtx2[ii,] == mymtx2[ii+1,]),
                     integer(1))
)
## user system elapsed
## 0.81 0.00 0.81
(Compare with 370.63 user from the previous run.) Scaling it up to full-strength:
nr <- 3.7e6
nc <- 76
mymtx3 <- matrix(sample(letters[1:4], nr*nc, repl=TRUE), nc=nc)
dim(mymtx3)
## [1] 3700000 76
system.time(
  matches3 <- vapply(seq.int(nrow(mymtx3)-1),
                     function(ii) sum(mymtx3[ii,] == mymtx3[ii+1,]),
                     integer(1))
)
## user system elapsed
## 35.32 0.05 35.81
length(matches3)
## [1] 3699999
sum(matches3 == nc)
## [1] 0
Unfortunately, still no matches, but I think 36 seconds is considerably better for 3.7M rows than an hour for 100K. (Please correct me if I've made an incorrect assumption.)
(Ref: win7 x64, R-3.0.3-64bit, intel i7-2640M 2.8GHz, 8GB RAM)

Find first greater element with higher index

I have two vectors, A and B. For every element in A I want to find the index of the first element in B that is greater than it and has a higher index. The lengths of A and B are the same.
So for vectors:
A <- c(10, 5, 3, 4, 7)
B <- c(4, 8, 11, 1, 5)
I want a result vector:
R <- c(3, 3, 5, 5, NA)
Of course I can do it with two loops, but it's very slow, and I don't know how to use apply() in this situation, where the indices matter. My data set has vectors of length 20000, so speed is really important in this case.
A few bonus questions:
What if I have a sequence of numbers (like seq = 2:10), and I want to find the first number in B that is higher than a+s, for every a in A and every s in seq?
Like question 1), but I want to know both the first greater and the first lower value, and create a matrix which stores which one came first. So for example, given a from A and 10 from seq, I want to find the first value of B which is higher than a+10 or lower than a-10, and then store its index and value.
sapply(sapply(seq_along(A), function(x) which(B[-seq(x)] > A[x]) + x), "[", 1)
[1] 3 3 5 5 NA
This is a great example of when sapply is less efficient than loops.
Although the sapply does make the code look neater, you are paying for that neatness with time.
Instead you can wrap a while loop inside a for loop inside a nice, neat function.
Here are benchmarks comparing a nested-apply loop against a nested for-while loop (and a mixed sapply-while loop, for good measure). Update: added the vapply/match approach mentioned in the comments. It is faster than sapply, but still much slower than the while loop; a rough sketch of it appears after the benchmark table below.
BENCHMARK:
test elapsed relative
1 for.while 0.069 1.000
2 sapply.while 0.080 1.159
3 vapply.match 0.101 1.464
4 nested.sapply 0.104 1.507
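For reference, here is roughly what that vapply/match variant looks like (my reconstruction of the idea, not the exact benchmarked code):
vapply.match <- function(A, B) {
  n <- length(A)
  vapply(seq_len(n), function(i) {
    if (i == n) return(NA_integer_)
    # position of the first TRUE among the elements of B after index i
    j <- match(TRUE, B[(i + 1):n] > A[i])
    if (is.na(j)) NA_integer_ else i + j
  }, integer(1))
}
vapply.match(c(10, 5, 3, 4, 7), c(4, 8, 11, 1, 5))
# [1]  3  3  5  5 NA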
Notice you save about a third of your time; the savings will likely be larger when you start adding the sequences to A.
For the second part of your question:
If you have this all wrapped up in a nice function, it is easy to add a seq to A:
# Sample data
A <- c(10, 5, 3, 4, 7, 100, 2)
B <- c(4, 8, 11, 1, 5, 18, 20)
# Sample sequence
S <- seq(1, 12, 3)
# matrix with all index values (with names cleaned up)
indexesOfB <- t(sapply(S, function(s) findIndx.gt(A+s, B)))
dimnames(indexesOfB) <- list(S, A)
Lastly, if you want to instead find values of B less than A, just swap the operation in the function.
(You could include an if-clause in the function and use only a single function. I find it more efficient to have two separate functions.)
findIndx.gt(A, B) # [1] 3 3 5 5 6 NA 8 NA NA
findIndx.lt(A, B) # [1] 2 4 4 NA 8 7 NA NA NA
Then you can wrap it all up in one nice package:
rangeFindIndx(A, B, S)
# A S indxB.gt indxB.lt
# 10 1 3 2
# 5 1 3 4
# 3 1 5 4
# 4 1 5 NA
# 7 1 6 NA
# 100 1 NA NA
# 2 1 NA NA
# 10 4 6 4
# 5 4 3 4
# ...
FUNCTIONS
(Notice they depend on reshape2)
rangeFindIndx <- function(A, B, S) {
  # For each s in S, and for each a in A,
  # find the first value of B which is higher than a+s, or lower than a-s
  require(reshape2)
  # Create gt & lt matrices; add dimnames for the melting function
  indexesOfB.gt <- sapply(S, function(s) findIndx.gt(A+s, B))
  indexesOfB.lt <- sapply(S, function(s) findIndx.lt(A-s, B))
  dimnames(indexesOfB.gt) <- dimnames(indexesOfB.lt) <- list(A, S)
  # melt the matrices and combine into one
  gtltMatrix <- cbind(melt(indexesOfB.gt), melt(indexesOfB.lt)$value)
  # clean up their names
  names(gtltMatrix) <- c("A", "S", "indxB.gt", "indxB.lt")
  return(gtltMatrix)
}
findIndx.gt <- function(A, B) {
  lng <- length(A)
  ret <- integer(0)
  b <- NULL
  for (j in seq(lng-1)) {
    i <- j + 1
    while (i <= lng && ((b <- B[[i]]) < A[[j]])) {
      i <- i + 1
    }
    ret <- c(ret, ifelse(i <= lng, i, NA))
  }
  c(ret, NA)
}
findIndx.lt <- function(A, B) {
  lng <- length(A)
  ret <- integer(0)
  b <- NULL
  for (j in seq(lng-1)) {
    i <- j + 1
    while (i <= lng && ((b <- B[[i]]) > A[[j]])) { # this line contains the only difference from findIndx.gt
      i <- i + 1
    }
    ret <- c(ret, ifelse(i <= lng, i, NA))
  }
  c(ret, NA)
}
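As a quick sanity check, the example from the top of the question (a small sketch; note it redefines A and B):
A <- c(10, 5, 3, 4, 7)
B <- c(4, 8, 11, 1, 5)
findIndx.gt(A, B)
# [1]  3  3  5  5 NA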

Efficiently modify list in R

I have a foreach loop that produces a list within each iteration and a .combine function to combine them that looks like this:
mergelists = function(x, xn) {
  padlen = length(x[[1]])
  for (n in names(x)[!names(x) %in% names(xn)]) xn[[n]] = 0
  for (n in names(xn)[!names(xn) %in% names(x)]) xn[[n]] = c(rep(0, padlen), xn[[n]])
  for (idx in names(xn)) { x[[idx]] = c(x[[idx]], xn[[idx]]) }
  x
}
The first two for-loops modify the new list (xn) to make it compatible with the one that gathers the results (x). The last one joins xn onto x.
I believe my code is ridiculously inefficient, because it re-allocates a lot and uses for-loops. But I can't think of a better solution. Any ideas?
Some more explanation:
I don't know the list names in advance (they are patterns from a bootstrap exercise which takes place in the foreach part).
Example:
> x
$foo
[1] 3 2
$bar
[1] 3 2
and
> xn
$foo
[1] 1
$baz
[1] 1
should join to
> x
$foo
[1] 3 2 1
$bar
[1] 3 2 0
$baz
[1] 0 0 1
That's it.
In my benchmarking, this approach takes longer than your approach, but since I already worked it out, I thought I'd post it anyway. Here's to doubling effort. If the names are completely unknown and you are forced to pad with zeros in the .combine function, you could try the following (perhaps try it on a subset of your iterations first to see if it works):
library(reshape2)
mergeList2 <- function(x, xn) {
  xDF <- data.frame(ID = seq_along(x[[1]]), x)
  xnDF <- data.frame(ID = seq_along(xn[[1]]) + nrow(xDF), xn)
  meltedX <- melt(xDF, id = "ID")
  meltedXN <- melt(xnDF, id = "ID")
  res <- as.list(dcast(rbind(meltedX, meltedXN), ID ~ variable,
                       fill = 0))[-1]
  return(res)
}
Your example:
mergeList2(list(foo = c(3, 2), bar = c(3, 2)),
list(foo = 1, baz= 1))
# $foo
# [1] 3 2 1
# $bar
# [1] 3 2 0
# $baz
# [1] 0 0 1
Test it out with a foreach example
library(foreach)
set.seed(1)
foreach(dd = 1:10, .combine = mergeList2) %do% {
  theNames <- sample(c("foo", "bar", "baz"), 2)
  ans <- as.list(rpois(2, 4))
  names(ans) <- theNames
  ans
}
# $foo
# [1] 4 7 2 4 0 2 0 4 5 3
# $baz
# [1] 7 0 0 5 3 5 3 4 0 5
# $bar
# [1] 0 5 2 0 5 0 0 0 6 0
If foo and bar exist in every list and are in order, then mapply works. As @BenBarnes suggested, having a pre-processing step to create the 0's makes this a viable option even if they do not exist everywhere. Sorting is easy. I've changed the 0's to NAs since that seems more appropriate.
# Make data
x <- list(foo=c(3,2),bar=c(6,7))
xn <- list(foo=c(1),bar=c(1),aught=c(5,2))
lol <- list(x=x,xn=xn)
# Pre-process
allnames <- sort(unique(unlist(lapply(lol, names))))
cleanlist <- function(l, allnames) {
  ret <- l[allnames]
  names(ret) <- allnames
  ret[sapply(ret, is.null)] <- NA
  ret
}
lol <- lapply(lol,cleanlist,allnames=allnames)
# Combine
do.call("mapply", c(c,lol) )
Which produces:
aught bar foo
x NA 6 3
xn1 5 7 2
xn2 2 1 1
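If you'd rather have the zero padding from the original question than NAs, a small hedged follow-up on the combined matrix:
combined <- do.call("mapply", c(c, lol))
combined[is.na(combined)] <- 0
combined
#     aught bar foo
# x       0   6   3
# xn1     5   7   2
# xn2     2   1   1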
Benchmarking
That said, if you're hoping for speed gains, the original version is still the fastest, presumably because it does the least. But the loopless approach is pretty elegant and scales to an arbitrary number of x's.
library(microbenchmark)
microbenchmark( mergelists(lol$x,lol$xn), mergeList2(lol$x,lol$xn), do.call("mapply", c(c,lol) ) )
Unit: microseconds
expr min lq median uq max
1 do.call("mapply", c(c, lol)) 155.048 159.5175 192.0635 195.5555 245.841
2 mergeList2(lol$x, lol$xn) 19938.288 20095.9905 20225.4750 20719.6730 27143.674
3 mergelists(lol$x, lol$xn) 63.416 68.1650 78.0825 84.3680 95.265

Aggregate rows in a large matrix by rowname

I would like to aggregate the rows of a matrix by adding the values in rows that have the same rowname. My current approach is as follows:
> M
a b c d
1 1 1 2 0
1 2 3 4 2
2 3 0 1 2
3 4 2 5 2
> index <- as.numeric(rownames(M))
> M <- cbind(M,index)
> Dfmat <- data.frame(M)
> Dfmat <- aggregate(. ~ index, data = Dfmat, sum)
> M <- as.matrix(Dfmat)
> rownames(M) <- M[,"index"]
> M <- subset(M, select= -index)
> M
a b c d
1 3 4 6 2
2 3 0 1 2
3 4 2 5 2
The problem with this approach is that I need to apply it to a number of very large matrices (up to 1,000 rows and 30,000 columns). In these cases the computation time is very high (same problem when using ddply). Is there a more efficient way to arrive at the solution? Does it help that the original input matrices are DocumentTermMatrix objects from the tm package? As far as I know they are stored in a sparse matrix format.
Here's a solution using by and colSums, but requires some fiddling due to the default output of by.
M <- matrix(1:9,3)
rownames(M) <- c(1,1,2)
t(sapply(by(M,rownames(M),colSums),identity))
V1 V2 V3
1 3 9 15
2 3 6 9
There is now an aggregate function in Matrix.utils. This can accomplish what you want with a single line of code and is about 10x faster than the combineByRow solution and 100x faster than the by solution:
N <- 10000
m <- matrix( runif(N*100), nrow=N)
rownames(m) <- sample(1:(N/2),N,replace=T)
> microbenchmark(a<-t(sapply(by(m,rownames(m),colSums),identity)),b<-combineByRow(m),c<-aggregate.Matrix(m,row.names(m)),times = 10)
Unit: milliseconds
expr min lq mean median uq max neval
a <- t(sapply(by(m, rownames(m), colSums), identity)) 6000.26552 6173.70391 6660.19820 6419.07778 7093.25002 7723.61642 10
b <- combineByRow(m) 634.96542 689.54724 759.87833 732.37424 866.22673 923.15491 10
c <- aggregate.Matrix(m, row.names(m)) 42.26674 44.60195 53.62292 48.59943 67.40071 70.40842 10
> identical(as.vector(a),as.vector(c))
[1] TRUE
EDIT: Frank is right, rowsum is somewhat faster than any of these solutions. You would want to consider using another one of these other functions only if you were using a Matrix, especially a sparse one, or if you were performing an aggregation besides sum.
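For completeness, the rowsum call referred to above, on the same m as the benchmark (a minimal sketch):
system.time(d <- rowsum(m, rownames(m)))
dim(d)  # one row per distinct row name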
The answer by James works as expected, but is quite slow for large matrices. Here is a version that avoids creating new objects:
combineByRow <- function(m) {
  m <- m[order(rownames(m)), ]
  ## keep track of previous row name
  prev <- rownames(m)[1]
  i.start <- 1
  i.end <- 1
  ## cache the rownames -- profiling shows that it takes
  ## forever to look at them
  m.rownames <- rownames(m)
  stopifnot(all(!is.na(m.rownames)))
  ## go through the matrix in a loop, as we need to combine some unknown
  ## set of rows
  for (i in 2:(1+nrow(m))) {
    curr <- m.rownames[i]
    ## if we found a new row name (or are at the end of the matrix),
    ## combine all rows and mark invalid rows
    if (prev != curr || is.na(curr)) {
      if (i.start < i.end) {
        m[i.start,] <- apply(m[i.start:i.end,], 2, max)
        m.rownames[(1+i.start):i.end] <- NA
      }
      prev <- curr
      i.start <- i
    } else {
      i.end <- i
    }
  }
  m[which(!is.na(m.rownames)), ]
}
Testing it shows that it is about 10x faster than the answer using by (2 vs. 20 seconds in this example):
N <- 10000
m <- matrix( runif(N*100), nrow=N)
rownames(m) <- sample(1:(N/2),N,replace=T)
start <- proc.time()
m1 <- combineByRow(m)
print(proc.time()-start)
start <- proc.time()
m2 <- t(sapply(by(m,rownames(m),function(x) apply(x, 2, max)),identity))
print(proc.time()-start)
all(m1 == m2)
