Check for consecutive elements in vector

Check for consecutive elements in vector - r

I have a vector of K elements and I need to check if it contains a precise subset of 2 consecutive elements.
Assume K = 6
vec = c(4,4,3,1,2,2)
How can I get an indicator that checks for the subset (4,2) as consecutive elements? Should return FALSE in this example while (3,1) should return TRUE.

One another approach could be:
grepl("42", paste(vec, collapse = ""))
[1] FALSE
grepl("31", paste(vec, collapse = ""))
[1] TRUE

1) Use rollapply with the identical function.
library(zoo)
vec <- c(4,4,3,1,2,2)
x <- c(4, 2)
any(rollapply(vec, length(x), identical, x))
## [1] FALSE
x <- c(3, 1)
any(rollapply(vec, length(x), identical, x))
## [1] TRUE
2) Create strings out of vec and x and use grepl:
x <- c(4, 2)
grepl(sprintf("\\b%s\\b", toString(x)), toString(vec))
## [1] FALSE
x <- c(3, 1)
grepl(sprintf("\\b%s\\b", toString(x)), toString(vec))
## [1] TRUE
3) If x always has two elements then:
x <- c(4, 2)
any(head(vec, -1) == x[1] & tail(vec, -1) == x[2])
## [1] FALSE
x <- c(3, 1)
any(head(vec, -1) == x[1] & tail(vec, -1) == x[2])
## [1] TRUE

Especially if you have more than 2 elements to check, this might be worthwhile:
vec <-= c(4,4,3,1,2,2)
chk <- c(3, 1)
as.logical(length(Reduce(intersect, lapply(seq_along(chk), function(x) which(chk[x] == lead(vec, x - 1))))))
# [1] TRUE
chk <- c(4, 1)
as.logical(length(Reduce(intersect, lapply(seq_along(chk), function(x) which(chk[x] == lead(vec, x - 1))))))
# [1] FALSE

Using base R:
find_subsequence = function (vec, needle) {
sub_idx = seq_along(needle) - 1L
Filter(
function (i) all(needle == vec[sub_idx + i]),
seq_len(length(vec) - length(needle) + 1L)
)
}
find_subsequence(vec, c(3, 1))
# [1] 4
find_subsequence(vec, c(4, 2))
# integer(0)

Related

Use Recursion in R to Split a String Into Chunks

I am trying to wrap my head around the idea of recursion. However, when I apply my recursive R function, it does not return a string split into the number of chunks desired. It only returns two chunks. However, my goal is to split a long string into multiple chunks of smaller strings of size n. I am sure there are other ways to do this, but I am trying find a recursive solution. Any help is appreciated thanks in advance.
# Sample dataset
x <- paste0(rep(letters, 10000), collapse = "")
split_group <- function(x, n = 10) {
if (nchar(x) < n) {
return(x)
} else {
beginning <- substring(x, 1, n)
remaining <- substring(x, (n + 1), (n + 1) + (n - 1))
c(beginning, split_group(remaining, n))
}
}
split_group(x = x, n = 10)
# Returns: "abcdefghij" "klmnopqrst" ""

Use <= instead of < and fix remaining.
split_group <- function(x, n = 10) {
if (nchar(x) <= n) x
else {
beginning <- substring(x, 1, n)
remaining <- substring(x, n + 1)
c(beginning, split_group(remaining, n))
}
}
x <- substring(paste(letters, collapse = ""), 1, 24)
split_group(x, 2)
## [1] "ab" "cd" "ef" "gh" "ij" "kl" "mn" "op" "qr" "st" "uv" "wx"
split_group(x, 5)
## [1] "abcde" "fghij" "klmno" "pqrst" "uvwx"
split_group(x, 6)
## [1] "abcdef" "ghijkl" "mnopqr" "stuvwx"
split_group(x, 10)
## [1] "abcdefghij" "klmnopqrst" "uvwx"
split_group(x, 23)
## [1] "abcdefghijklmnopqrstuvw" "x"
split_group(x, 24)
## [1] "abcdefghijklmnopqrstuvwx"
split_group(x, 25)
## [1] "abcdefghijklmnopqrstuvwx"
2) and some approaches without recursion The first is the shortest but the second is the simplest and only uses base R. The third only uses base R as well.
library(gsubfn)
strapply(x, "(.{1,10})", simplify = c)
## [1] "abcdefghij" "klmnopqrst" "uvwx"
ix <- seq(1, nchar(x), 10)
substring(x, ix, ix + 10 - 1)
## [1] "abcdefghij" "klmnopqrst" "uvwx"
sapply(seq(1, nchar(x), 10), function(i) substring(x, i, i + 10 - 1))
## [1] "abcdefghij" "klmnopqrst" "uvwx"
library(zoo)
s <- strsplit(x, "")[[1]]
rollapply(s, 10, by = 10, paste0, collapse = "", partial = TRUE, align = "left")
## [1] "abcdefghij" "klmnopqrst" "uvwx"

A base R option would be
x1 <- strsplit(x, "(?<=.{10})(?=.)", perl = TRUE)[[1]]
-output
> head(x1, 10)
[1] "abcdefghij" "klmnopqrst" "uvwxyzabcd" "efghijklmn" "opqrstuvwx" "yzabcdefgh" "ijklmnopqr" "stuvwxyzab" "cdefghijkl" "mnopqrstuv"

Shortening a long vector in R

Vectors a and b can be shortened using toString(width = 10) in Base R resulting in a shorter vector that ends in ....
However, I wonder how I can make the shortened vector to end in ..., last vector element?
My desired_output is shown below.
a <- 1:26
b <- LETTERS
toString(a, width = 10)
# [1] "1,2,...."
desired_output1 = "1,2,...,26"
toString(b, width = 10)
# [1] "A,B,...."
desired_output2 = "A,B,...,Z"

You could just add the end on.
paste(toString(a, width = 10), a[length(a)], sep=", ")
[1] "1, 2, ...., 26"
paste(toString(b, width = 10), b[length(b)], sep=", ")
[1] "A, B, ...., Z"

After applting the toString, we may use sub to remove the substring to format
f1 <- function(vec, n = 2) {
gsub("\\s+", "",
sub(sprintf("^(([^,]+, ){%s}).*, ([^,]+)$", n), "\\1...,\\3", toString(vec)))
}
-testing
> f1(a)
[1] "1,2,...,26"
> f1(b)
[1] "A,B,...,Z"
> f1(a, 3)
[1] "1,2,3,...,26"
> f1(b, 3)
[1] "A,B,C,...,Z"
> f1(a, 4)
[1] "1,2,3,4,...,26"
> f1(b, 4)
[1] "A,B,C,D,...,Z"

We could do it this way:
Creating a function that extraxt the first two elements and the last element of the vector and paste them together:
my_func <- function(x) {
a <- paste(x[1:2], collapse=",")
b <- tail(x, n=1)
paste0(a,",...,",b)
}
my_func(a)
[1] "1,2,...,26"
my_func(b)
[1] "A,B,...,Z"

library(stringr)
a <- 1:26
b <- LETTERS
reduce_string <- function(x, n_show) {
str_c(x[1:n_show], collapse = ',') %>%
str_c('....,', x[[length(x)]])
}
reduce_string(a, 2)
#> [1] "1,2....,26"
Created on 2022-01-02 by the reprex package (v2.0.1)

Check if a number is between two others

I am looking for a function that verifies if a number is between two other numbers. I also need to control if I want a strict comparison (a
I know the function between() in dplyr. Yet, I have to know the upper and lower numbers.
MyNumber = 8
First = 2
Second = 10
# This will return TRUE
between(MyNumber, lower = First, upper = Second)
# But this will return FALSE
between(MyNumber, lower = Second, upper = First)
# This will return TRUE. I want it to return FALSE
First = 8
between(MyNumber, lower = First, upper = Second)
I need a function that returns TRUE no matter what is the order.

Something like:
between2 <- function(number,bounds) { number > min(bounds) & number < max(bounds)}
between2(8, c(2,10))
[1] TRUE
between2(8, c(10,2))
[1] TRUE
This function also deals with your added condition
between2(8,c(8,10))
[1] FALSE

You could do it with a simple arithmetics:
between <- function(number, first, second) { (first - number) * (second - number) < 0 }
Here are some example outputs:
> between(8, 2, 10)
[1] TRUE
> between(8, 10, 2)
[1] TRUE
> between(8, 10, 12)
[1] FALSE
> between(8, 1, 2)
[1] FALSE

You could use %in% with the : function, once you now first and last:
first <- 2
last <- 10
number <- 8
number %in% first:last
[1] TRUE
first <- 10
last <- 2
number <- 8
number %in% first:last
[1] TRUE
first <- 10
last <- 12
number <- 8
number %in% first:last
[1] FALSE
first <- 12
last <- 10
number <- 8
number %in% first:last
[1] FALSE
In a function, and strict lets you consider or not strict comparison:
my_between <- function(n, f, l, strict = FALSE) {
if (!strict) {
n %in% f:l # if strict == FALSE (default)
} else {
n %in% (f+1):(l-1) # if strict == TRUE
}
}
my_between(8, 2, 10)

What's wrong with
f_between <- function (num, L, R) num>=min(L,R) & num<=max(L,R)
f_between(8, 2, 10)
#[1] TRUE
f_between(6, 6, 10)
#[1] TRUE
f_between(2, -10, -2)
#[1] FALSE
f_between(3, 5, 7)
#[1] FALSE

Compare two vectors (including multiple items)

I need to find which elements in a new vector (vb) have been added to another vector (va). If there for example is unly one "2" in va, but two "2" in vb, then one "2" has been added.
The comment in the code below shows what is sought.
va <- c(1, 2) # Original vector
vb <- c(1, 2) # NA or NULL
vb <- c(2, 2) # 2
vb <- c(1, 1) # 1
vb <- c(1) # NA or NULL
vb <- c(2) # NA or NULL
vb <- c(3, 3) # c(3, 3)
I've tried match, union, intersect, %in%, etc. but can't get it to work to consider also multiple instances. This feels irritatingly simple...

The following reproduces your expected outcome. Just as an honest heads-up, I'm not really happy with my solution, this seems oddly convoluted:
f <- function(a, b) {
a <- as.data.frame(unclass(rle(a)));
b <- as.data.frame(unclass(rle(b)));
t <- merge(a, b, by = "values", all = TRUE);
t$lengths.x[is.na(t$lengths.x)] <- 0;
t$diff <- t$lengths.y - t$lengths.x;
t <- t[!is.na(t$diff) & t$diff > 0, ];
return(rep(t$values, t$diff));
}
va <- c(1, 2);
vb <- c(1, 2) # NA or NULL
f(va, vb);
#numeric(0)
vb <- c(2, 2) # 2
f(va, vb);
#[1] 2
vb <- c(1, 1) # 1
f(va, vb);
#[1] 1
vb <- c(1) # NA or NULL
f(va, vb);
#numeric(0)
vb <- c(2) # NA or NULL
f(va, vb);
#numeric(0)
vb <- c(3, 3) # c(3, 3)
#[1] 3 3
Explanation: I'm making use of rle to compare the lengths (level of duplicity) of different entries in va and vb; then report only those that are not already in va.
Update
Here is a much cleaner method using a recursive function.
f <- function(a, b) {
if (length(a) == 0 | length(b) == 0) return(NULL);
m <- data.frame(idx.a = 1:length(a), idx.b = match(a, b));
m <- m[complete.cases(m), ];
# Here is the recursive call
if (nrow(m) > 0) f(a[-m$idx.a[1]], b[-m$idx.b[1]]) else b;
}
va <- c(1, 2) # Original vector
f(va, c(1, 2));
#NULL
f(va, c(2, 2));
#[1] 2
f(va, c(1, 1));
#[1] 1
f(va, c(1));
#NULL
f(va, c(2));
#NULL
f(va, c(3, 3));
#[1] 3 3

Not the most elegant, but it works for all your cases:
Diff_frequency <- function(va,vb){
df <- merge(as.data.frame(table(va)), as.data.frame(table(vb)), by.x="va", by.y="vb", all=T)
df$Freq.x[is.na(df$Freq.x)] <- 0
df$Dif <- df$Freq.y - df$Freq.x
df$Dif[is.na(df$Dif) | df$Dif < 0] <- 0
return(rep(as.numeric(as.character(df[,1])), df$Dif))
}
Diff_frequency(va,vb)
Examples of output:
va=c(1,1,1,2,2,2,3)
vb=c(1,1,4,4,2,2,5)
Diff_frequency(va,vb)
[1] 4 4 5
va=c(1,1,1,2,2,2,3)
vb=c(1,1,1,1,2,2,2,3,3,5)
Diff_frequency(va,vb)
1] 1 3 5
va=c(1,1,1,2,2,2,3)
vb=c(1,1,2,3)
Diff_frequency(va,vb)
numeric(0)

How to check if a vector contains n consecutive numbers

Suppose that my vector numbers contains c(1,2,3,5,7,8), and I wish to find if it contains 3 consecutive numbers, which in this case, are 1,2,3.
numbers = c(1,2,3,5,7,8)
difference = diff(numbers) //The difference output would be 1,1,2,2,1
To verify that there are 3 consecutive integers in my numbers vector, I've tried the following with little reward.
rep(1,2)%in%difference
The above code works in this case, but if my difference vector = (1,2,2,2,1), it would still return TRUE even though the "1"s are not consecutive.

Using diff and rle, something like this should work:
result <- rle(diff(numbers))
any(result$lengths>=2 & result$values==1)
# [1] TRUE
In response to the comments below, my previous answer was specifically only testing for runs of length==3 excluding longer lengths. Changing the == to >= fixes this. It also works for runs involving negative numbers:
> numbers4 <- c(-2, -1, 0, 5, 7, 8)
> result <- rle(diff(numbers4))
> any(result$lengths>=2 & result$values==1)
[1] TRUE

Benchmarks!
I am including a couple functions of mine. Feel free to add yours. To qualify, you need to write a general function that tells if a vector x contains n or more consecutive numbers. I provide a unit test function below.
The contenders:
flodel.filter <- function(x, n, incr = 1L) {
if (n > length(x)) return(FALSE)
x <- as.integer(x)
is.cons <- tail(x, -1L) == head(x, -1L) + incr
any(filter(is.cons, rep(1L, n-1L), sides = 1, method = "convolution") == n-1L,
na.rm = TRUE)
}
flodel.which <- function(x, n, incr = 1L) {
is.cons <- tail(x, -1L) == head(x, -1L) + incr
any(diff(c(0L, which(!is.cons), length(x))) >= n)
}
thelatemail.rle <- function(x, n, incr = 1L) {
result <- rle(diff(x))
any(result$lengths >= n-1L & result$values == incr)
}
improved.rle <- function(x, n, incr = 1L) {
result <- rle(diff(as.integer(x)) == incr)
any(result$lengths >= n-1L & result$values)
}
carl.seqle <- function(x, n, incr = 1) {
if(!is.numeric(x)) x <- as.numeric(x)
z <- length(x)
y <- x[-1L] != x[-z] + incr
i <- c(which(y | is.na(y)), z)
any(diff(c(0L, i)) >= n)
}
Unit tests:
check.fun <- function(fun)
stopifnot(
fun(c(1,2,3), 3),
!fun(c(1,2), 3),
!fun(c(1), 3),
!fun(c(1,1,1,1), 3),
!fun(c(1,1,2,2), 3),
fun(c(1,1,2,3), 3)
)
check.fun(flodel.filter)
check.fun(flodel.which)
check.fun(thelatemail.rle)
check.fun(improved.rle)
check.fun(carl.seqle)
Benchmarks:
x <- sample(1:10, 1000000, replace = TRUE)
library(microbenchmark)
microbenchmark(
flodel.filter(x, 6),
flodel.which(x, 6),
thelatemail.rle(x, 6),
improved.rle(x, 6),
carl.seqle(x, 6),
times = 10)
# Unit: milliseconds
# expr min lq median uq max neval
# flodel.filter(x, 6) 96.03966 102.1383 144.9404 160.9698 177.7937 10
# flodel.which(x, 6) 131.69193 137.7081 140.5211 185.3061 189.1644 10
# thelatemail.rle(x, 6) 347.79586 353.1015 361.5744 378.3878 469.5869 10
# improved.rle(x, 6) 199.35402 200.7455 205.2737 246.9670 252.4958 10
# carl.seqle(x, 6) 213.72756 240.6023 245.2652 254.1725 259.2275 10

After diff you can check for any consecutive 1s -
numbers = c(1,2,3,5,7,8)
difference = diff(numbers) == 1
## [1] TRUE TRUE FALSE FALSE TRUE
## find alteast one consecutive TRUE
any(tail(difference, -1) &
head(difference, -1))
## [1] TRUE

It's nice to see home-grown solutions here.
Fellow Stack Overflow user Carl Witthoft posted a function he named seqle() and shared it here.
The function looks like this:
seqle <- function(x,incr=1) {
if(!is.numeric(x)) x <- as.numeric(x)
n <- length(x)
y <- x[-1L] != x[-n] + incr
i <- c(which(y|is.na(y)),n)
list(lengths = diff(c(0L,i)),
values = x[head(c(0L,i)+1L,-1L)])
}
Let's see it in action. First, some data:
numbers1 <- c(1, 2, 3, 5, 7, 8)
numbers2 <- c(-2, 2, 3, 5, 6, 7, 8)
numbers3 <- c(1, 2, 2, 2, 1, 2, 3)
Now, the output:
seqle(numbers1)
# $lengths
# [1] 3 1 2
#
# $values
# [1] 1 5 7
#
seqle(numbers2)
# $lengths
# [1] 1 2 4
#
# $values
# [1] -2 2 5
#
seqle(numbers3)
# $lengths
# [1] 2 1 1 3
#
# $values
# [1] 1 2 2 1
#
Of particular interest to you is the "lengths" in the result.
Another interesting point is the incr argument. Here we can set the increment to, say, "2" and look for sequences where the difference between the numbers are two. So, for the first vector, we would expect the sequence of 3, 5, and 7 to be detected.
Let's try:
> seqle(numbers1, incr = 2)
$lengths
[1] 1 1 3 1
$values
[1] 1 2 3 8
So, we can see that we have a sequence of 1 (1), 1 (2), 3 (3, 5, 7), and 1 (8) if we set incr = 2.
How does it work with ECII's second challenge? Seems OK!
> numbers4 <- c(-2, -1, 0, 5, 7, 8)
> seqle(numbers4)
$lengths
[1] 3 1 2
$values
[1] -2 5 7

Simple but works
numbers = c(-2,2,3,4,5,10,6,7,8)
x1<-c(diff(numbers),0)
x2<-c(0,diff(numbers[-1]),0)
x3<-c(0,diff(numbers[c(-1,-2)]),0,0)
rbind(x1,x2,x3)
colSums(rbind(x1,x2,x3) )==3 #Returns TRUE or FALSE where in the vector the consecutive intervals triplet takes place
[1] FALSE TRUE TRUE FALSE FALSE FALSE TRUE FALSE FALSE
sum(colSums(rbind(x1,x2,x3) )==3) #How many triplets of consecutive intervals occur in the vector
[1] 3
which(colSums(rbind(x1,x2,x3) )==3) #Returns the location of the triplets consecutive integers
[1] 2 3 7
Note that this will not work for consecutive negative intervals c(-2,-1,0) because of how diff() works