Collapse runs of consecutive numbers to ranges - r

Consider the following comma-separated string of numbers:
s <- "1,2,3,4,8,9,14,15,16,19"
s
# [1] "1,2,3,4,8,9,14,15,16,19"
Is it possible to collapse runs of consecutive numbers to its corresponding ranges, e.g. the run 1,2,3,4 above would be collapsed to the range 1-4. The desired result looks like the following string:
s
# [1] "1-4,8,9,14-16,19"

I took some heavy inspiration from the answers in this question.
findIntRuns <- function(run){
rundiff <- c(1, diff(run))
difflist <- split(run, cumsum(rundiff!=1))
unlist(lapply(difflist, function(x){
if(length(x) %in% 1:2) as.character(x) else paste0(x[1], "-", x[length(x)])
}), use.names=FALSE)
}
s <- "1,2,3,4,8,9,14,15,16,19"
s2 <- as.numeric(unlist(strsplit(s, ",")))
paste0(findIntRuns(s2), collapse=",")
[1] "1-4,8,9,14-16,19"
EDIT: Multiple solutions: benchmarking time!
Unit: microseconds
expr min lq median uq max neval
spee() 277.708 295.517 301.5540 311.5150 1612.207 1000
seb() 294.611 313.025 321.1750 332.6450 1709.103 1000
marc() 672.835 707.549 722.0375 744.5255 2154.942 1000
#speendo's solution is the fastest at the moment, but none of these have been optimised yet.

I was too slow... but here's another solution.
It uses less R-specific functions so it could be ported to other languages (on the other hand maybe it's less elegant)
s <- "1,2,3,4,8,9,14,15,16,19"
collapseConsecutive <- function(s){
x <- as.numeric(unlist(strsplit(s, ",")))
x_0 <- x[1]
out <- toString(x[1])
hasDash <- FALSE
for(i in 2:length(x)) {
x_1 <- x[i]
x_2 <- x[i+1]
if((x_0 + 1) == x_1 && !is.na(x_2) && (x_1 + 1) == x_2) {
if(!hasDash) {
out <- c(out, "-")
hasDash <- TRUE
}
} else {
if(hasDash) {
hasDash <- FALSE
} else {
out <- c(out, ",")
}
out <- c(out, x_1)
hasDash <- FALSE
}
x_0 <- x_1
}
outString <- paste(out, collapse="")
outString
}
collapseConsecutive(s)
# [1] "1-4,8,9,14-16,19"

Another fairly compact option
in.seq <- function(x) {
# returns TRUE for elments within ascending sequences
(c(diff(x, 1), NA) == 1 & c(NA, diff(x,2), NA) == 2)
}
contractSeqs <- function(x) {
# returns string formatted with contracted sequences
x[in.seq(x)] <- ""
gsub(",{2,}", "-", paste(x, collapse=","), perl=TRUE)
}
s <- "1,2,3,4,8,9,14,15,16,19"
s1 <- as.numeric(unlist(strsplit(s, ","))) # as earlier answers
# assumes: numeric vector, length > 2, positive integers, ascending sequences
contractSeqs(s1)
# [1] "1-4,8,9,14-16,19"
I also wrote a bells & whistles version that can handle both numeric and string input including named objects, descending sequences and alternative punctuation, as well as performing error checking and reporting. If anyone is interested, I can add this to my answer.

Here's a function that should do what you want:
conseq <- function(s){
s <- as.numeric(unlist(strsplit(s, ",")))
dif <- s[seq(length(s))][-1] - s[seq(length(s)-1)]
new <- !c(0, dif == 1)
cs <- cumsum(new)
res <- vector(mode="list", max(cs))
for(i in seq(res)){
s.i <- s[which(cs == i)]
if(length(s.i) > 2){
res[[i]] <- paste(min(s.i), max(s.i), sep="-")
} else {
res[[i]] <- as.character(s.i)
}
}
paste(unlist(res), collapse=",")
}
Example
> s <- "1,2,3,4,8,9,14,15,16,19"
> conseq(s)
[1] "1-4,8,9,14-16,19"

Related

R median function from scratch

I am a R beginner and I tried to make a median function from scratch.
Here is my code:
mymedian <- function(x) {
len <- length(x)
sorted <- sort(x)
if (len %% 2 == 0) {
med1 <- sorted[len / 2]
med2 <- sorted[(len + 1) %/% 2]
result <- sorted[med1 + med2 / 2]
return(result)
} else {
result <- sorted[(len + 1)/2]
return(result)
}
}
Im getting "NA" output. I couldn't find where the problem is.
Main issue is you're trying to index your sorted vector with a non-integer (e.g., 168.5). Compare your function to this:
mymedian <- function(x){
len <- length(x)
sorted <-sort(x)
if(len%%2==0){
i <- len/2
med1<-sorted[i]
med2 <- sorted[i+1]
result <- sum(med1,med2)/2
return(result)
}else{
result<-sorted[(len+1)/2]
return(result)
}
}

R numeric to char precision loss

I want to convert my many-digit numeric vector to character. I tried the following solutions here which works for one number but not for a vector. This is OK
options(digits=20)
options(scipen=99999)
x<-129483.19999999999709;format(round(x, 12), nsmall = 12)
[1] "129483.199999999997"
But this is not. how to keep numeric precision in characters for numeric vectors?
> y <- c(129483.19999999999709, 1.3546746874,687676846.2546746464)
Specially problematic is 687676846.2546746464 Also tried:
> specify_decimal(y, 12)
[1] "129483.199999999997" "1.354674687400" "687676846.254674673080"
> formatC(y, digits = 12, format = "f")
[1] "129483.199999999997" "1.354674687400" "687676846.254674673080"
> formattable(y, digits = 12, format = "f")
[1] 129483.199999999997 1.354674687400 687676846.254674673080
> sprintf(y, fmt='%#.12g')
[1] "129483.200000" "1.35467468740" "687676846.255"
> sprintf(y, fmt='%#.22g')
[1] "129483.1999999999970896" "1.354674687399999966075" "687676846.2546746730804"
Expected result:
[1] "129483.199999999997" "1.354674687400" "687676846.254674646400"
It seems that precision loss occurs once only, it is not repeated.
> require(dplyr)
> convert <- function(x) as.numeric(as.character(x))
> 687676846.2546746464 %>% convert
[1] 687676846.25467503
> 687676846.2546746464 %>% convert %>% convert %>% convert
[1] 687676846.25467503
Here I only have 5-digit precision, but more problematic I can't know beforehand what precision I am going to get..
At the end I could do what I wanted using these functions. addtrailingzeroes will add a number of zeroes after decimal to x.
nbdec <- function(x) {
x1 <- as.character(x)
xsplit <- strsplit(x1,"\\.")
xlength <- sapply(xsplit, function(d) nchar(d)[2])
xlength <- ifelse(is.na(xlength), 0, xlength)
return(xlength)
}
trailingzeroes <- function(x, dig) {
res <- rep(NA, length(x))
for( i in 1:length(x)) {
if(!is.na(x[i])) res[i] <- { paste0(rep(0,max(0,dig-nbdec(x[i]))), collapse="") }
else { res[i] <- ""}
}
return(res)
}
trailingcommas <- function(x) ifelse(is.na(x), NA, ifelse(nbdec(x)==0, ".",""))
addtrailingzeroes <- function(x, digits) {
return(ifelse(!is.na(x), paste0(x, trailingcommas(x), trailingzeroes(x, digits)),NA))
}
However to suppress inaccuracies and rounding mistakes, x has to be cropped first using roundnumerics.max:
roundnumerics.max <- function(df, startdig=12) {
for(icol in 1:ncol(df)) {
if( is.numeric(df[,icol])) {
dig <- startdig
while(any(!as.numeric(as.character(df[,icol])) %==% df[,icol])) {
dig <- dig-1
df[,icol] <- round(df[,icol], digits=dig)
if(dig==0) {
break
pprint("ERROR: zero numeric accuracy")
}
}
pprint("Numeric accuracy for column ",icol," ", colnames(df)[icol], " is ", dig)
}
}
return(data.frame(df, stringsAsFactors = F))
}
This is slow and far from elegant... I still think it hard to believe that R has such an accuracy limitation to 16 significant digits, and adds inaccurate noise that causes divergences when you try to increase the digits option...Without letting you know...

Optimize performance of a formula spanning three consecutive indices, with wraparound

I want to optimize the implementation of this formula.
Here is the formula:
x is an array of values. i goes from 1 to N where N > 2400000.
For i=0, i-1 is the last element and for i=lastElement, i+1 is the first element. Here is the code which I have written:
x <- 1:2400000
re <- array(data=NA, dim = NROW(x))
lastIndex = NROW(x)
for(i in 1:lastIndex){
if (i==1) {
re[i] = x[i]*x[i] - x[lastIndex]*x[i+1]
} else if(i==lastIndex) {
re[i] = x[i]*x[i] - x[i-1]*x[1]
} else {
re[i] = x[i]*x[i] - x[i-1]*x[i+1]
}
}
Can it be done by apply in R?
We can use direct vectorization for this
# Make fake data
x <- 1:10
n <- length(x)
# create vectors for the plus/minus indices
xminus1 <- c(x[n], x[-n])
xplus1 <- c(x[-1], x[1])
# Use direct vectorization to get re
re <- x^2 - xminus1*xplus1
If really each x[i] is equal to i then you can do a little math:
xi^2 - (xi-1)*(xi+1) = 1
so all elements of the result are 1 (only the first and the last are not 1).
The result is:
c(1-2*N, rep(1, N-2), N*N-(N-1))
In the general case (arbitrary values in x) you can do (as in the answer from Dason):
x*x - c(x[N], x[-N])*c(x[-1], x[1])
Here is a solution with rollapply() from zoo:
library("zoo")
rollapply(c(x[length(x)],x, x[1]), width=3, function(x) x[2]^2 - x[1]*x[3]) # or:
rollapply(c(tail(x,1), x, x[1]), width=3, function(x) x[2]^2 - x[1]*x[3])
Here is the benchmark:
library("microbenchmark")
library("zoo")
N <- 10000
x <- 1:N
microbenchmark(
math=c(1-2*N, rep(1, N-2), N*N-(N-1)), # for the data from the question
vect.i=x*x - c(x[N], x[-N])*c(x[-1], x[1]), # general data
roll.i=rollapply(c(x[length(x)],x, x[1]), width=3, function(x) x[2]^2 - x[1]*x[3]), # or:
roll.tail=rollapply(c(tail(x,1), x, x[1]), width=3, function(x) x[2]^2 - x[1]*x[3])
)
# Unit: microseconds
# expr min lq mean median uq max neval cld
# math 33.613 34.4950 76.18809 36.9130 38.0355 2002.152 100 a
# vect.i 188.928 192.5315 732.50725 197.1955 198.5245 51649.652 100 a
# roll.i 56748.920 62217.2550 67666.66315 68195.5085 71214.9785 109195.049 100 b
# roll.tail 57661.835 63855.7060 68815.91001 67315.5425 71339.6045 119428.718 100 b
An lapply implementation of your formula would look like this:
x <- c(1:2400000)
last <- length(x)
re <- lapply(x, function(i) {
if(i == 1) {
x[i]*x[i] - x[last]*x[i+1]
} else if (i == last) {
x[i]*x[i] - x[i-1]*x[1]
} else {
x[i]*x[i] - x[i-1]*x[i+1]
}
})
re <- unlist(re)
lapply will return a list, so conversion to a vector is done using unlist()
1) You can avoid all the special-casing in the computation by padding the start and end of array x with copies of the last and first rows; something like this:
N <- NROW(x)
x <- rbind(x[N], x, x[1]) # pad start and end to give wraparound
re <- lapply(2:N, function(i) { x[i]*x[i] - x[i-1]*x[i+1] } )
#re <- unlist(re) as andbov wrote
# and remember not to use all of x, just x[2:N], elsewhere
2) Directly vectorize, as #Dason's answer:
# Do the padding trick on x , then
x[2:N]^2 - x[1:N-1]*x[3:N+1]
3) If performance matters, I suspect using data.table or else for-loop on i will be faster, since it references three consecutive rows.
4) For more performance, use byte-compiling
5) If you need even more speed, use Rcpp extension (C++ under the hood) How to use Rcpp to speed up a for loop?
See those questions I cited for good examples of using lineprof and microbenchmarking to figure out where your bottleneck is.

Converting a function to accept input directly in r

I was reading a book and I came across this function in R. This function basically finds out patterns in the input string having a minimum threshold of 3.
vec <- "da0abcab0abcaab0d0"
find_rep_path <- function(vec, reps) {
regexp <- paste0(c("(.+)", rep("\\1", reps - 1L)), collapse = "")
match <- regmatches(vec, regexpr(regexp, vec, perl = TRUE))
substr(match, 1, nchar(match) / reps)
}
vals <- unique(strsplit(vec, "")[[1]])
str <- NULL
for (i in seq.int(nchar(vec))) {
x <- vec
for (v in vals) {
substr(x, i, i) <- v
tmp <- find_rep_path(x, 3)
if (length(tmp) > 0)
str <- c(str, tmp)
}
}
nc <- nchar(str)
unique(str[which(nc == max(nc))])
Now, I wish to convert this function into the form like,
function("da0abcab0abcaab0d0"). This means, that I can easily pass a string to the function directly and not hardcode it in the original function. How can I modify this?
I know this is a beginner question but I am completely at sea right now as far as R is concerned. Please help!
I don't see how it's hardcoded. But you can just wrap your code into a function if that's what you mean?
# Function 1
find_rep_path <- function(vec, reps) {
regexp <- paste0(c("(.+)", rep("\\1", reps - 1L)), collapse = "")
match <- regmatches(vec, regexpr(regexp, vec, perl = TRUE))
substr(match, 1, nchar(match) / reps)
}
# Function 2
foo <- function(vec) {
vals <- unique(strsplit(vec, "")[[1]])
str <- NULL
for (i in seq.int(nchar(vec))) {
x <- vec
for (v in vals) {
substr(x, i, i) <- v
tmp <- find_rep_path(x, 3)
if (length(tmp) > 0)
str <- c(str, tmp)
}
}
nc <- nchar(str)
return(unique(str[which(nc == max(nc))]))
}
vec <- "da0abcab0abcaab0d0"
foo(vec)
#[1] "0ab" "abc"
Edit1
To get the place of the matches you can use gregexr:
a <- foo(vec)
gregexpr(a[1], vec)
#[[1]]
#[1] 3 9
#attr(,"match.length")
#[1] 3 3
#attr(,"useBytes")
#[1] TRUE
This tells you that a[1] ("0ab") was matched in vec at positions 3 and 9. Run ?gregexpr for more informations.
Edit2
To add this information to each match, we can do something like
bar <- function(vec) {
m <- foo(vec)
ans <- sapply(m, gregexpr, vec, fixed = TRUE)
ans <- lapply(ans, function(x) {attributes(x) <- NULL; x})
return(ans)
}
bar(vec)
#$`0ab`
#[1] 3 9
#
#$abc
#[1] 4 10

Interval sets algebra in R (union, intersection, differences, inclusion, ...)

I am wondering whether a proper framework for interval manipulation and comparison does exist in R.
After some search, I was only able to find the following:
- function findInterval in base Package. (but I hardly understand it)
- some answers here and there about union and intersection (notably: http://r.789695.n4.nabble.com/Union-Intersect-two-continuous-sets-td4224545.html)
Would you know of an initiative to implement a comprehensive set of tools to easily handles frequent tasks in interval manipulation, like inclusion/setdiff/union/intersection/etc. (eg see here for a list of functionalities)?
or would you have advice in developing such an approach?
below are some drafts on my side for doing so. it is surely awkward and still has some bugs but it might illustrate what I am looking for.
preliminary aspects about the options taken
- should deal seamlessly with intervals or intervals set
- intervals are represented as 2 columns data.frames (lower boundary, higher boundary), on one row
- intervals sets are represented as 2 columns with several rows
- a third column might be needed for identification of intervals sets
UNION
interval_union <- function(df){ # for data frame
df <- interval_clean(df)
if(is.empty(df)){
return(as.data.frame(NULL))
} else {
if(is.POSIXct(df[,1])) {
dated <- TRUE
df <- colwise(as.numeric)(df)
} else {
dated <- FALSE
}
M <- as.matrix(df)
o <- order(c(M[, 1], M[, 2]))
n <- cumsum( rep(c(1, -1), each=nrow(M))[o])
startPos <- c(TRUE, n[-1]==1 & n[-length(n)]==0)
endPos <- c(FALSE, n[-1]==0 & n[-length(n)]==1)
M <- M[o]
if(dated == TRUE) {
df2 <- colwise(mkDateTime)(as.data.frame(cbind(M[startPos], M[endPos])), from.s = TRUE)
} else {
df2 <- as.data.frame(cbind(M[startPos], M[endPos]))
}
colnames(df2) <- colnames(df)
# print(df2)
return(df2)
}
}
union_1_1 <- function(test, ref){
names(ref) <- names(test)
tmp <- interval_union(as.data.frame(rbind(test, ref)))
return(tmp)
}
union_1_n <- function(test, ref){
return(union_1_1(test, ref))
}
union_n_n <- function(test, ref){
testnn <- adply(.data = test, 1, union_1_n, ref, .expand = FALSE)
return(testnn)
}
ref_interval_union <- function(df, ref){
tmp0 <- adply(df, 1, union_1_1, ref, .expand = FALSE) # set to FALSE to keep ID
return(tmp0)
}
INTERSECTION
interval_intersect <- function(df){
# adapted from : http://r.789695.n4.nabble.com/Union-Intersect-two-continuous-sets-td4224545.html
M <- as.matrix(df)
L <- max(M[, 1])
R <- min(M[, 2])
Inew <- if (L <= R) c(L, R) else c()
if (!is.empty(Inew)){
df2 <- t(as.data.frame(Inew))
colnames(df2) <- colnames(df)
rownames(df2) <- NULL
} else {
df2 <- NULL
}
return(as.data.frame(df2))
}
ref_interval_intersect <- function(df, ref){
tmpfun <- function(a, b){
names(b) <- names(a)
tmp <- interval_intersect(as.data.frame(rbind(a, b)))
return(tmp)
}
tmp0 <- adply(df, 1, tmpfun, ref, .expand = FALSE) # [,3:4]
#if(!is.empty(tmp0)) colnames(tmp0) <- colnames(df)
return(tmp0)
}
int_1_1 <- function(test, ref){
te <- as.vector(test)
re <- as.vector(ref)
names(re) <- names(te)
tmp0 <- c(max(te[1, 1], re[1, 1]), min(te[1, 2], re[1, 2]))
if(tmp0[1]>tmp0[2]) tmp0 <- NULL # inverse of a correct interval --> VOID
if(!is.empty(tmp0)){
tmp1 <- colwise(mkDateTime)(as.data.frame(t(as.data.frame(tmp0))))
colnames(tmp1) <- colnames(test)
} else {
tmp1 <- data.frame(NULL)
}
return(tmp1)
}
int_1_n <- function(test, ref){
test1 <- adply(.data = ref, 1, int_1_1, test = test, .expand = FALSE)
if(is.empty(test1)){
return(data.frame(NULL))
} else {
testn <- interval_union(test1[,2:3])
return(testn)
}
}
int_n_n <- function(test, ref){
testnn <- adply(.data = test, 1, int_1_n, ref, .expand = FALSE)
# return(testnn[,2:3]) # return interval set without index (1st column)
return(testnn) # return interval set with index (1st column) --> usefull to go with merge to keep metadata going alon g with interval description
}
int_intersect <- function(df, ref){
mycols <- colnames(df)
df$X1 <- 1:nrow(df)
test <- df[, 1:2]
tmp <- int_n_n(test, ref)
intersection <- merge(tmp, df, by = "X1", suffixes = c("", "init"))
return(intersection[,mycols])
}
EXCLUSION
excl_1_1 <- function(test, ref){
te <- as.vector(test)
re <- as.vector(ref)
names(re) <- names(te)
if(te[1] < re[1]){ # Lower Bound
if(te[2] > re[1]){ # overlap
x <- unlist(c(te[1], re[1]))
} else { # no overlap
x <- unlist(c(te[1], te[2]))
}
} else { # test > ref on lower bound side
x <- NULL
}
if(te[2] > re[2]){ # Upper Bound
if(te[1] < re[2]){ # overlap
y <- unlist(c(re[2], te[2]))
} else { # no overlap
y <- unlist(c(te[1], te[2]))
}
} else { # test < ref on upper bound side
y <- NULL
}
if(is.empty(x) & is.empty(y)){
tmp0 <- NULL
tmp1 <- tmp0
} else {
tmp0 <- as.data.frame(rbind(x, y))
colnames(tmp0) <- colnames(test)
tmp1 <- interval_union(tmp0)
}
return(tmp1)
}
excl_1_n <- function(test, ref){
testn0 <- adply(.data = ref, 1, excl_1_1, test = test, .expand=FALSE)
# boucle pour intersecter successivement les intervalles sets, pour gérer les intervalles disjoints (identifiés par X1, col1)
tmp <- range(testn0)
names(tmp) <- colnames(testn0)[2:3]
tmp <- as.data.frame(t(tmp))
for(i in unique(testn0[,1])){
tmp <- int_n_n(tmp, testn0[testn0[,1]==i, 2:3])
}
return(tmp)
}
INCLUSION
incl_1_1 <- function(test, ref){
te <- as.vector(test)
re <- as.vector(ref)
if(te[1] >= re[1] & te[2] <= re[2]){ return(TRUE) } else { return(FALSE) }
}
incl_1_n <- function(test, ref){
testn <- adply(.data = ref, 1, incl_1_1, test = test)
return(any(testn[,ncol(testn)]))
}
incl_n_n <- function(test, ref){
testnn <- aaply(.data = test, 1, incl_1_n, ref, .expand = FALSE)
names(testnn) <- NULL
return(testnn)
}
flat_incl_n_n <- function(test, ref){
ref <- interval_union(ref)
return(incl_n_n(test, ref))
}
# testing for a vector, instead of an interval set
incl_x_1 <- function(x, ref){
test <- (x>=ref[1,1] & x<ref[1,2])
return(test)
}
incl_x_n <- function(x, ref){
test <- any(x>=ref[,1] & x<ref[,2])
return(test)
}
I think you might be able to make good use of the many interval-related functions in the sets package.
Here's a small example illustrating the package's support for interval construction, intersection, set difference, union, and complementation, as well as its test for inclusion in an interval. These and many other related functions are documented on the help page for ?interval.
library(sets)
i1 <- interval(1,6)
i2 <- interval(5,10)
i3 <- interval(200,400)
i4 <- interval(202,402)
i5 <- interval_union(interval_intersection(i1,i2),
interval_symdiff(i3,i4))
i5
# [5, 6] U [200, 202) U (400, 402]
interval_complement(i5)
# [-Inf, 5) U (6, 200) U [202, 400] U (402, Inf]
interval_contains_element(i5, 5.5)
# [1] TRUE
interval_contains_element(i5, 201)
# [1] TRUE
If your intervals are currently encoded in a two-column data.frame, you could use something like mapply() to convert them to intervals of the type used by the sets package:
df <- data.frame(lBound = c(1,5,100), uBound = c(10, 6, 200))
Ints <- with(df, mapply("interval", l=lBound, r=uBound, SIMPLIFY=FALSE))
Ints
# [[1]]
# [1, 10]
# [[2]]
# [5, 6]
# [[3]]
# [100, 200]

Resources