Index consecutive duplicates in vector - r

What is the optimal way to get the index of all elements that are repeated # times? I want to identify the elements that are duplicated more than 2 times.
rle() and rleid() both hint to the values I need but neither method directly gives me the indices.
I came up with this code:
t1 <- c(1, 10, 10, 10, 14, 37, 3, 14, 8, 8, 8, 8, 39, 12)
t2 <- lag(t1,1)
t2[is.na(t2)] <- 0
t3 <- ifelse(t1 - t2 == 0, 1, 0)
t4 <- rep(0, length(t3))
for (i in 2:length(t3)) t4[i] <- ifelse(t3[i] > 0, t3[i - 1] + t3[i], 0)
which(t4 > 1)
returns:
[1] 4 11 12
and those are the values I need.
Are there any R-functions that are more appropriate?
Ben

One option with data.table. No real reason to use this instead of lag/shift when n = 2, but for larger n this would save you from creating a large number of new lagged vectors.
library(data.table)
which(rowid(rleid(t1)) > 2)
# [1] 4 11 12
Explanation:
rleid will produce a unique value for each "run" of equal values, and rowid will mark how many elements "into" the run each element is. What you want is elements more than 2 "into" a run.
data.table(
t1,
rleid(t1),
rowid(t1))
# t1 V2 V3
# 1: 1 1 1
# 2: 10 2 1
# 3: 10 2 2
# 4: 10 2 3
# 5: 14 3 1
# 6: 37 4 1
# 7: 3 5 1
# 8: 14 6 2
# 9: 8 7 1
# 10: 8 7 2
# 11: 8 7 3
# 12: 8 7 4
# 13: 39 8 1
# 14: 12 9 1
Edit: If, as in the example posed by this question, no two runs (even length-1 "runs") are of the same value (or if you don't care whether the duplicates are next to eachother), you can just use which(rowid(t1) > 2) instead. (This is noted by Frank in the comments)
Hopefully this example clarifies the differences
a <- c(1, 1, 1, 2, 2, 1)
which(rowid(a) > 2)
# [1] 3 6
which(rowid(rleid(a)) > 2)
# [1] 3

You can use dplyr::lag or data.table::shift (note, default for shift is to lag, so shift(t1, 1) is equal to shift(t1, 1, type = "lag"):
which(t1 == lag(t1, 1) & lag(t1, 1) == lag(t1, 2))
[1] 4 11 12
# Or
which(t1 == shift(t1, 1) & shift(t1, 1) == shift(t1, 2))
[1] 4 11 12
If you need it to scale for several duplicates you can do the following (thanks for the tip #IceCreamToucan):
n <- 2
df1 <- sapply(0:n, function(x) shift(t1, x))
which(rowMeans(df1 == df1[,1]) == 1)
[1] 4 11 12

This is usually a case that rle is useful, i.e.
v1 <- rle(t1)
i1 <- seq_along(t1)[t1 %in% v1$values[v1$lengths > 2]]
i2 <- t1[t1 %in% v1$values[v1$lengths > 2]]
tapply(i1, i2, function(i) tail(i, -2))
#$`8`
#[1] 11 12
#$`10`
#[1] 4
You can unlist and get it as a vector,
unlist(tapply(i1, i2, function(i) tail(i, -2)))
#81 82 10
#11 12 4
There is also a function called rleid in data.table package which we can use,
unlist(lapply(Filter(function(i) length(i) > 2, split(seq_along(t1), data.table::rleid(t1))),
function(i) tail(i, -2)))
#2 71 72
#4 11 12

Another possibility involving rle() could be:
pseudo_rleid <- with(rle(t1), rep(seq_along(values), lengths))
which(ave(t1, pseudo_rleid, FUN = function(x) seq_along(x) > 2) != 0)
[1] 4 11 12

Related

When the before value is not 1 then impute the before value in place of current value until next non 1 value is found in R

The input vector is as below,
data=c(1,1,1,1,11,1,1,1,1,12,1,1,2,1,1,1)
I want the output as 1,1,1,1,11,11,11,11,11,12,12,12,2,2,2,2 where the 1's proceeding the non 1's should be imputed the non 1 value in R.
I tried the following code
data=c(1,1,1,1,11,1,1,1,1,12,1,1,2,1,1,1)
sapply(data, function(x) ifelse (lag(x)!=1,lag(x),x))
but it didn't yield expected output
You can convert every 1 after the first non-1 value to NA then use zoo::na.locf():
library(zoo)
x <- c(1,1,1,1,11,1,1,1,1,12,1,1,2,1,1,1)
data[seq_along(x) > which.max(x!= 1) & x== 1] <- NA
na.locf(x)
[1] 1 1 1 1 11 11 11 11 11 12 12 12 2 2 2 2
Or using replace() to add the NA values:
na.locf(replace(x, seq_along(x) > which.max(x != 1) & x == 1, NA))
In response to your comment about applying it to groups, you can use ave():
df <- data.frame(x = c(x, rev(x)), grp = rep(1:2, each = length(x)))
ave(df$x, df$grp, FUN = function(y)
na.locf(replace(y, seq_along(y) > which.max(y != 1) & y == 1, NA))
)
You can write your custom fill function:
x <- c(1,1,1,1,11,1,1,1,1,12,1,1,2,1,1,1)
myfill <- function(x) {
mem <- x[1]
for (i in seq_along(x)) {
if (x[i] == 1) {
x[i] <- mem
} else {
mem <- x[i]
}
}
x
}
myfill(x)
# 1 1 1 1 11 11 11 11 11 12 12 12 2 2 2 2
You could match unique 1 and non-1 values with the cumsum of non-1 values.
(c(1, x[x != 1]))[match(cumsum(x != 1), 0:3)]
# [1] 1 1 1 1 11 11 11 11 11 12 12 12 2 2 2 2
Data
x <- c(1, 1, 1, 1, 11, 1, 1, 1, 1, 12, 1, 1, 2, 1, 1, 1)
You can use rle from base to overwrite 1 with the value before.
x <- rle(data)
y <- c(FALSE, (x$values == 1)[-1])
x$values[y] <- x$values[which(y)-1]
inverse.rle(x)
# [1] 1 1 1 1 11 11 11 11 11 12 12 12 2 2 2 2

Returning values after last NA in a vector

Returning values after last NA in a vector
I can remove all NA values from a vector
v1 <- c(1,2,3,NA,5,6,NA,7,8,9,10,11,12)
v2 <- na.omit(v1)
v2
but how do I return a vector with values only after the last NA
c( 7,8,9,10,11,12)
Thank you for your help.
You could detect the last NA with which and add 1 to get the index past the last NA and index until the length(v1):
v1[(max(which(is.na(v1)))+1):length(v1)]
[1] 7 8 9 10 11 12
Here’s an alternative solution that does not use indices and only vectorised operations:
after_last_na = as.logical(rev(cumprod(rev(! is.na(v1)))))
v1[after_last_na]
The idea is to use cumprod to fill the non-NA fields from the last to the end. It’s not a terribly useful solution in its own right (I urge you to use the more obvious, index range based solution from other answers) but it shows some interesting techniques.
You could detect the last NA with which
v1[(tail(which(is.na(v1)), 1) + 1):length(v1)]
# [1] 7 8 9 10 11 12
However, the most general - as #MrFlick pointed out - seems to be this:
tail(v1, -tail(which(is.na(v1)), 1))
# [1] 7 8 9 10 11 12
which also handles the following case correctly:
v1[13] <- NA
tail(v1, -tail(which(is.na(v1)), 1))
# numeric(0)
To get the null NA case, too,
v1 <- 1:13
we can do
if (any(is.na(v1))) tail(v1, -tail(which(is.na(v1)), 1)) else v1
# [1] 1 2 3 4 5 6 7 8 9 10 11 12 13
Data
v1 <- c(1, 2, 3, NA, 5, 6, NA, 7, 8, 9, 10, 11, 12)
v1 <- c(1,2,3,NA,5,6,NA,7,8,9,10,11,12)
v1[seq_along(v1) > max(0, tail(which(is.na(v1)), 1))]
#[1] 7 8 9 10 11 12
v1 = 1:5
v1[seq_along(v1) > max(0, tail(which(is.na(v1)), 1))]
#[1] 1 2 3 4 5
v1 = c(1:5, NA)
v1[seq_along(v1) > max(0, tail(which(is.na(v1)), 1))]
#integer(0)
The following will do what you want.
i <- which(is.na(v1))
if(i[length(i)] < length(v1)){
v1[(i[length(i)] + 1):length(v1)]
}else{
NULL
}
#[1] 7 8 9 10 11 12

Extract multiple ranges from a numeric vector

First, I simplify my question. I want to extract certain ranges from a numeric vector. For example, extracting 3 ranges from 1:20 at the same time :
1 < x < 5
8 < x < 12
17 < x < 20
Therefore, the expected output is 2, 3, 4, 9, 10, 11, 18, 19.
I try to use the function findInterval() and control arguments rightmost.closed and left.open to do that, but any arguments sets cannot achieve the goal.
x <- 1:20
v <- c(1, 5, 8, 12, 17, 20)
x[findInterval(x, v) %% 2 == 1]
# [1] 1 2 3 4 8 9 10 11 17 18 19
x[findInterval(x, v, rightmost.closed = T) %% 2 == 1]
# [1] 1 2 3 4 8 9 10 11 17 18 19 20
x[findInterval(x, v, left.open = T) %% 2 == 1]
# [1] 2 3 4 5 9 10 11 12 18 19 20
By the way, the conditions can also be a matrix like that :
[,1] [,2]
[1,] 1 5
[2,] 8 12
[3,] 17 20
I don't want to use for loop if it's not necessary.
I am grateful for any helps.
I'd probably do it using purrr::map2 or Map, passing your lower-bounds and upper-bounds as arguments and filtering your dataset with a custom function
library(purrr)
x <- 1:20
lower_bounds <- c(1, 8, 17)
upper_bounds <- c(5, 12, 20)
map2(
lower_bounds, upper_bounds, function(lower, upper) {
x[x > lower & x < upper]
}
)
You may use data.table::inrange and its incbounds argument. Assuming ranges are in a matrix 'm', as shown in your question:
x[data.table::inrange(x, m[ , 1], m[ , 2], incbounds = FALSE)]
# [1] 2 3 4 9 10 11 18 19
m <- matrix(v, ncol = 2, byrow = TRUE)
You were on the right path, and left.open indeed helps, but rightmost.closed actually concerns only the last interval rather than the right "side" of each interval. Hence, we need to use left.open twice. As you yourself figured out, it looks like an optimal way to do that is
x[findInterval(x, v) %% 2 == 1 & findInterval(x, v, left.open = TRUE) %% 2 == 1]
# [1] 2 3 4 9 10 11 18 19
Clearly there are alternatives. E.g.,
fun <- function(x, v)
if(length(v) > 1) v[1] < x & x < v[2] | fun(x, v[-1:-2]) else FALSE
x[fun(x, v)]
# [1] 2 3 4 9 10 11 18 19
I found an easy way just with sapply() :
x <- 1:20
v <- c(1, 5, 8, 12, 17, 20)
(v.df <- as.data.frame(matrix(v, 3, 2, byrow = T)))
# V1 V2
# 1 1 5
# 2 8 12
# 3 17 20
y <- sapply(x, function(x){
ind <- (x > v.df$V1 & x < v.df$V2)
if(any(ind)) x else NA
})
y[!is.na(y)]
# [1] 2 3 4 9 10 11 18 19

How to sort odd and even numbers of an array in a specific format

I have a vector like this
seq_vector <- c(3,12,5,9,11,8,4,6,7,11,15,3,9,10,12,2)
I want to format them in descending order of odd numbers, followed by ascending order of even numbers. Output of above seq_vector will be
new_seq_vector <- c(15,11,11,9,9,7,5,3,3,2,4,6,8,10,12,12)
Can you please help me with the logic of the same?
Try x[order(x*v)] where v is -1 for odd, +1 for even.
Thanks to #lmo for this:
x[order( x*(-1)^x )]
# [1] 15 11 11 9 9 7 5 3 3 2 4 6 8 10 12 12
So v = (-1)^x here.
Some other ways to build v: #d.b's (-1)^(x %% 2); and mine, 1-2*(x %% 2).
(Thanks #d.b) If x contains negative integers, an additional sorting vector is needed:
# new example
x = c(2, 5, -15, -10, 1, -3, 12)
x[order(v <- (-1)^x, x*v)]
# [1] 5 1 -3 -15 -10 2 12
Take modulus by 2 (%% 2) to determine the odd and even elements and sort accordingly.
c(sort(seq_vector[seq_vector %% 2 == 1], decreasing = TRUE), #For odd
sort(seq_vector[seq_vector %% 2 == 0])) #For even
#[1] 15 11 11 9 9 7 5 3 3 2 4 6 8 10 12 12
Use an auxiliary function.
is.odd <- function(x) (x %% 2) == 1
result <- c(sort(seq_vector[is.odd(seq_vector)], decreasing = TRUE),
sort(seq_vector[!is.odd(seq_vector)]))
result

Find consecutive sub-vectors of length k out of a numeric vector which satisfy a given condition

I have a numeric vector in R, say
v= c(2,3,5,6,7,6,3,2,3,4,5,7,8,9,6,1,1,2,5,6,7,11,2,3,4)
Now, I have to find all the consecutive sub-vector of size 4 out of it with the condition that each element of the sub-vector must be greater than 2 and all sub-vector must be disjoint in the sense that non of the two sub-vector can contain same index element. So my output will be:
(3,5,6,7),(3,4,5,7),(5,6,7,11)
Edited:
Other examples for illustration purpose: for,
v=c(3,3,3,3,1,3,3,3,3,3,3,3,3)
output will be :
(3,3,3,3), (3,3,3,3),(3,3,3,3).
and for,
v= c(2,3,5,5,7,6,3,2,3,4,5,7,8,9,6,1,1,2,5,6,7,11,2,3,4)
output will be
(3,5,5,7),(3,4,5,7),(5,6,7,11)
The second condition on the output simply says that if we found any sub- array say (v[m],v[m+1],v[m+2],v[m+3]) with each element greater than > 2 then it will goes into my output and the next sub-array can only be start from v[m+4](if possible)
This solution uses embed() to create a matrix of lags and then extracts the desired rows from this matrix:
v <- c(2,3,5,6,7,6,3,2,3,4,5,7,8,9,6,1,1,2,5,6,7,11,2,3,4)
e <- embed(v, 4)
ret <- which(
apply(e, 1, function(x)all(x > 2)) &
apply(e, 1, function(x)length(unique(x)) == 4)
)
rows <- ret[c(1, 1 + which(diff(ret) > 4))]
e[rows, 4:1]
[,1] [,2] [,3] [,4]
[1,] 3 5 6 7
[2,] 3 4 5 7
[3,] 5 6 7 11
Try:
fun1 <- function(vec, n, cond1) {
lst1 <- lapply(1:(length(vec) - n+1), function(i) {
x1 <- vec[i:(i + (n-1))]
if (all(diff(x1) >= 0) & all(x1 > cond1))
x1
})
indx <- which(sapply(lst1, length) == n)
indx2 <- unlist(lapply(split(indx, cumsum(c(TRUE, diff(indx) != 1))), function(x) x[seq(1,
length(x), by = n-1)]))
lst1[indx2]
}
v1 <- c(3,3,3,3,1,3,3,3,3,3,3,3,3)
v2 <- c(2,3,5,5,7,6,3,2,3,4,5,7,8,9,6,1,1,2,5,6,7,11,2,3,4)
v3 <- c(2,3,5,6,7,6,3,2,3,4,5,7,8,9,6,1,1,2,5,6,7,11,2,3,4)
fun1(v1,4,2)
#[[1]]
#[1] 3 3 3 3
#[[2]]
#[1] 3 3 3 3
#[[3]]
#[1] 3 3 3 3
fun1(v2,4,2)
#[[1]]
#[1] 3 5 5 7
#[[2]]
#[1] 3 4 5 7
#[[3]]
#[1] 5 6 7 11
fun1(v3,4,2)
#[[1]]
#[1] 3 5 6 7
#[[2]]
#[1] 3 4 5 7
#[[3]]
#[1] 5 6 7 11
Here is another idea based on rle:
ff = function(x, size, thres)
{
valid_subsets = sapply(head(seq_along(x), -(size - 1)),
function(i) all(x[i:(i + (size - 1))] > thres))
r = rle(valid_subsets)
lapply(unlist(mapply(function(a, b) a + (seq_len(b) - 1) * size,
(cumsum(r$lengths) - r$lengths + 1)[which(r$values)],
(r$lengths[which(r$values)] + size - 1) %/% size)),
function(i) x[i:(i + (size - 1))])
}
ff(c(3,3,3,3,1,3,3,3,3,3,3,3,3), 4, 2)
ff(c(2,3,5,6,7,6,3,2,3,4,5,7,8,9,6,1,1,2,5,6,7,11,2,3,4), 4, 2)
Testing on another vector (I assume this is the correct output):
set.seed(4); xx = sample(1:10, 20, T)
xx
# [1] 6 1 3 3 9 3 8 10 10 1 8 3 2 10 5 5 10 6 10 8
ff(xx, 4, 2)
#[[1]]
#[1] 3 3 9 3
#
#[[2]]
#[1] 10 5 5 10
Unless I'm missing something, on "xx" (as well as on other cases) the other posted answers do not seem to work:
fun1(xx, 4, 2)
#[[1]]
#[1] 3 8 10 10
#e[rows, 4:1]
#[1] 9 3 8 10

Resources