Remove duplicate and small vectors from list - r

I have a list of vectors, say:
li <- list( c(1, 2, 3),
c(1, 2, 3, 4),
c(2, 3, 4),
c(5, 6, 7, 8, 9, 10, 11, 12),
numeric(0),
c(5, 6, 7, 8, 9, 10, 11, 12, 13)
)
And I would like to remove all the vectors that are already contained in others (bigger or equal), as well as all the empty vectors
In this case, I would be left with only the list
1 2 3 4
5 6 7 8 9 10 11 12 13
Is there any useful function for achieving this?
Thanks in advance

First you should sort the list by vector length, such that in the excision loop it is guaranteed that each lower-index vector is shorter than each higher-index vector, so a one-way setdiff() is all you need.
l <- list(1:3, 1:4, 2:4, 5:12, double(), 5:13 );
ls <- l[order(sapply(l,length))];
i <- 1; while (i <= length(ls)-1) if (length(ls[[i]]) == 0 || any(sapply((i+1):length(ls),function(i2) length(setdiff(ls[[i]],ls[[i2]]))) == 0)) ls[[i]] <- NULL else i <- i+1;
ls;
## [[1]]
## [1] 1 2 3 4
##
## [[2]]
## [1] 5 6 7 8 9 10 11 12 13
Here's a slight alternative, replacing the any(sapply(...)) with a second while-loop. The advantage is that the while-loop can break prematurely if it finds any superset in the remainder of the list.
l <- list(1:3, 1:4, 2:4, 5:12, double(), 5:13 );
ls <- l[order(sapply(l,length))];
i <- 1; while (i <= length(ls)-1) if (length(ls[[i]]) == 0 || { j <- i+1; res <- F; while (j <= length(ls)) if (length(setdiff(ls[[i]],ls[[j]])) == 0) { res <- T; break; } else j <- j+1; res; }) ls[[i]] <- NULL else i <- i+1;
ls;
## [[1]]
## [1] 1 2 3 4
##
## [[2]]
## [1] 5 6 7 8 9 10 11 12 13

x is contained in y if
length(setdiff(x, y)) == 0
You can apply it to each pair of vectors using functions like expand.grid or combn.

Related

Group numeric vector by predefined maximal group sum

I have a numeric vector like this x <- c(1, 23, 7, 10, 9, 2, 4) and I want to group the elements from left to right with the constrain that each group sum must not exceed 25. Thus, here the first group is c(1, 23), the second is c(7, 10) and the last c(9, 2, 4). the expected output is a dataframe with a second column containing the groups:
data.frame(x= c(1, 23, 7, 10, 9, 2, 4), group= c(1, 1, 2, 2, 3, 3, 3))
I have tried different things with cumsum but am not able to kind of dynamically restart cumsum for the new group once the limit sum of 25 for the last group is reached.
I think cpp function is the fastest way:
library(Rcpp)
cppFunction(
"IntegerVector GroupBySum(const NumericVector& x, const double& max_sum = 25)
{
double sum = 0;
int cnt = 0;
int period = 1;
IntegerVector res(x.size());
for (int i = 0; i < x.size(); ++i)
{
++cnt;
sum += x[i];
if (sum > max_sum)
{
sum = x[i];
if (cnt > 1)
++period;
cnt = 1;
}
res[i] = period;
}
return res;
}"
)
GroupBySum(c(1, 23, 7, 10, 9, 2, 4), 25)
We can try this as a programming practice if you like :)
f1 <- function(x) {
group <- c()
while (length(x)) {
idx <- cumsum(x) <= 25
x <- x[!idx]
group <- c(group, rep(max(group, 0) + 1, sum(idx)))
}
group
}
or
f2 <- function(x) {
group <- c()
g <- 0
while (length(x)) {
cnt <- s <- 0
for (i in seq_along(x)) {
s <- s + x[i]
if (s <= 25) {
cnt <- cnt + 1
} else {
break
}
}
g <- g + 1
group <- c(group, rep(g, cnt))
x <- x[-(1:cnt)]
}
group
}
or
f3 <- function(x) {
s <- cumsum(x)
r <- c()
grp <- 1
while (length(s)) {
idx <- (s <= 25)
r <- c(r, rep(grp, sum(idx)))
grp <- grp + 1
s <- s[!idx] - tail(s[idx], 1)
}
r
}
which gives
[1] 1 1 2 2 3 3 3
and benchmarking among them looks like
set.seed(1)
set.seed(1)
x <- runif(1e3, 0, 25)
bm <- microbenchmark(
f1(x),
f2(x),
f3(x),
check = "equivalent"
)
autoplot(bm)
Recursion version
Another option is using recursion (based on f1())
f <- function(x, res = c()) {
if (!length(x)) {
return(res)
}
idx <- cumsum(x) <= 25
Recall(x[!idx], res = c(res, list(x[idx])))
}
and you will see
> f(x)
[[1]]
[1] 1 23
[[2]]
[1] 7 10
[[3]]
[1] 9 2 4
You can use the cumsumbinning built-in function from the MESS package:
# install.packages("MESS")
MESS::cumsumbinning(x, 25, cutwhenpassed = F)
# [1] 1 1 2 2 3 3 3
Or it can be done with purrr::accumulate:
cumsum(x == accumulate(x, ~ifelse(.x + .y <= 25, .x + .y, .y)))
# [1] 1 1 2 2 3 3 3
output
group <- MESS::cumsumbinning(x, 25, cutwhenpassed = F)
data.frame(x= c(1, 23, 7, 10, 9, 2, 4),
group = group)
x group
1 1 1
2 23 1
3 7 2
4 10 2
5 9 3
6 2 3
7 4 3
Quick benchmark:
x<- c(1, 23, 7, 10, 9, 2, 4)
bm <- microbenchmark(
fThomas(x),
fThomasRec(x),
fJKupzig(x),
fCumsumbinning(x),
fAccumulate(x),
fReduce(x),
fRcpp(x),
times = 100L,
setup = gc(FALSE)
)
autoplot(bm)
Егор Шишунов's Rcpp is the fastest, closely followed by MESS::cumsumbinning and ThomasIsCoding's both functions.
With n = 100, the gap gets bigger but Rcpp and cumsumbinning are still the top choices and the while loop option is no longer efficient (I had to remove ThomasIsCoding's functions because the execution time was too long):
x = runif(100, 1, 50)
In base R you could also use Reduce:
do.call(rbind, Reduce(\(x,y) if((z<-x[1] + y) > 25) c(y, x[2]+1)
else c(z, x[2]), x[-1], init = c(x[1], 1), accumulate = TRUE))
[,1] [,2]
[1,] 1 1
[2,] 24 1
[3,] 7 2
[4,] 17 2
[5,] 9 3
[6,] 11 3
[7,] 15 3
Breaking it down:
f <- function(x, y){
z <- x[1] + y
if(z > 25) c(y, x[2] + 1)
else c(z, x[2])
}
do.call(rbind, Reduce(f, x[-1], init = c(x[1], 1), accumulate = TRUE))
if using accumulate
library(tidyverse)
accumulate(x[-1], f, .init = c(x[1], 1)) %>%
invoke(rbind, .)
[,1] [,2]
[1,] 1 1
[2,] 24 1
[3,] 7 2
[4,] 17 2
[5,] 9 3
[6,] 11 3
[7,] 15 3
Here is a solution using base R and cumsum (and lapply for iteration):
id <- c(seq(1, length(x),1)[!duplicated(cumsum(x) %/% 25)], length(x)+1)
id2 <- 1:length(id)
group <- unlist(lapply(1:(length(id)-1), function(x) rep(id2[x], diff(id)[x])))
data.frame(x=x, group=group)
x group
1 1 1
2 23 1
3 7 2
4 10 2
5 9 3
6 2 3
7 4 3
Edit: New Approach using recursive function
Here is a new more efficient approach that should also cover the special case which #ЕгорШишунов considered and should work efficiently because it's written as a recursive function.
recursiveFunction<- function(x, maxN=25, sumX=0, period=1, period2return=c()){
sumX <- sumX + x[1]
if (sumX >= maxN) { sumX=x[1]; period = period + 1}
period2return <- c(period2return, period)
if (length(x) == 1) { return(period2return)}
return(recursiveFunction(x[-1], 25, sumX, period, period2return))
}
recursiveFunction(x, maxN=25)
Note that you should not change the entries for the last three function parameters (sumX=0, period=1, period2return=c()) because they are only important during the recursive call of the function.

Why there appear three NA's at the end?

I was trying to write an interleave() function for two vectors of arbitrary length.
For equal-length vectors,
I found in the internet:
.interleave <- function(vec1, vec2) {
# cuts away longer
res <- rbind(vec1, vec2)
attributes(res) <- NULL
res
}
# c(rbind(vec1, vec2)) is shorter code, but
# is 3x slower according to the blog in the link
So for arbitrary length, I thought, I measure the lengths first and attach the rest of the longer vector.
interleave <- function(vec1, vec2) {
vec1_len <- length(vec1)
vec2_len <- length(vec2)
min_len <- min(vec1_len, vec2_len)
if (vec1_len == vec2_len) {
.interleave(vec1, vec2)
} else {
c(.interleave(vec1[1:min_len], vec2[1:min_len]),
if (vec1_len > vec2_len) {
vec1[min_len+1:vec1_len]
} else {
vec2[min_len+1:vec2_len]
})
}
} # strangely 3 NA's at end if unequal length
But now comes the strange thing:
interleave(c(1, 2, 3), c(4, 5, 6, 7, 8, 9))
## [1] 1 4 2 5 3 6 7 8 9 NA NA NA
interleave(c(1, 2, 3), c(4, 5, 6))
## [1] 1 4 2 5 3 6
interleave(c(1, 2, 3), c(4, 5))
## [1] 1 4 2 5 3 NA NA
interleave(c(1, 2, 3), c(4, 5, 6, 7, 8, 9, 10, 11))
## [1] 1 4 2 5 3 6 7 8 9 10 11 NA NA NA
interleave(c(1, 2, 3, 4, 5, 6), c( 7, 8, 9, 10, 11))
## [1] 1 7 2 8 3 9 4 10 5 11 6 NA NA NA NA NA
From where do the NAs come from?
Remark: I see the pattern that the number of attached NAs
is the number of elements in the shorter vector ...
How to generate a version without NAs?
Solution
Sorry, I found it out myself.
Problem was the subsetting of the rest-vector.
I forgot some parantheses.
interleave <- function(vec1, vec2) {
vec1_len <- length(vec1)
vec2_len <- length(vec2)
min_len <- min(vec1_len, vec2_len)
if (vec1_len == vec2_len) {
.interleave(vec1, vec2)
} else {
c(.interleave(vec1[1:min_len], vec2[1:min_len]),
if (vec1_len > vec2_len) {
vec1[(min_len+1):vec1_len] # parantheses!
} else {
vec2[(min_len+1):vec2_len] # parantheses!
})
}
} # no NA's any more!
Slightly shorter
interleave <- function(vec1, vec2) {
vec1_len <- length(vec1)
vec2_len <- length(vec2)
min_len <- min(vec1_len, vec2_len)
if (vec1_len == vec2_len) {
.interleave(vec1, vec2)
} else {
c(.interleave(vec1[1:min_len], vec2[1:min_len]),
if (vec1_len > vec2_len) {
vec1[(min_len+1):vec1_len]
} else {
vec2[(min_len+1):vec2_len]
})
}
}
A general function:
interleave <- function(...) {
l <- list(...)
len <- max(lengths(l))
l <- lapply(l, function(x) `length<-`(x, len))
res <- na.omit(c(do.call(rbind, l)))
attributes(res) <- NULL
res
}
interleave(1:9, seq(10,40,10), seq(100,500,100))
#[1] 1 10 100 2 20 200 3 30 300 4 40 400 5 500 6 7 8 9
and also generalizing Interleave lists in R
interleave <- function(...) {
l <- list(...)
idx <- order(unlist(lapply(l, function(x) seq_along(x))))
unlist(l)[idx]
}

Extract multiple ranges from a numeric vector

First, I simplify my question. I want to extract certain ranges from a numeric vector. For example, extracting 3 ranges from 1:20 at the same time :
1 < x < 5
8 < x < 12
17 < x < 20
Therefore, the expected output is 2, 3, 4, 9, 10, 11, 18, 19.
I try to use the function findInterval() and control arguments rightmost.closed and left.open to do that, but any arguments sets cannot achieve the goal.
x <- 1:20
v <- c(1, 5, 8, 12, 17, 20)
x[findInterval(x, v) %% 2 == 1]
# [1] 1 2 3 4 8 9 10 11 17 18 19
x[findInterval(x, v, rightmost.closed = T) %% 2 == 1]
# [1] 1 2 3 4 8 9 10 11 17 18 19 20
x[findInterval(x, v, left.open = T) %% 2 == 1]
# [1] 2 3 4 5 9 10 11 12 18 19 20
By the way, the conditions can also be a matrix like that :
[,1] [,2]
[1,] 1 5
[2,] 8 12
[3,] 17 20
I don't want to use for loop if it's not necessary.
I am grateful for any helps.
I'd probably do it using purrr::map2 or Map, passing your lower-bounds and upper-bounds as arguments and filtering your dataset with a custom function
library(purrr)
x <- 1:20
lower_bounds <- c(1, 8, 17)
upper_bounds <- c(5, 12, 20)
map2(
lower_bounds, upper_bounds, function(lower, upper) {
x[x > lower & x < upper]
}
)
You may use data.table::inrange and its incbounds argument. Assuming ranges are in a matrix 'm', as shown in your question:
x[data.table::inrange(x, m[ , 1], m[ , 2], incbounds = FALSE)]
# [1] 2 3 4 9 10 11 18 19
m <- matrix(v, ncol = 2, byrow = TRUE)
You were on the right path, and left.open indeed helps, but rightmost.closed actually concerns only the last interval rather than the right "side" of each interval. Hence, we need to use left.open twice. As you yourself figured out, it looks like an optimal way to do that is
x[findInterval(x, v) %% 2 == 1 & findInterval(x, v, left.open = TRUE) %% 2 == 1]
# [1] 2 3 4 9 10 11 18 19
Clearly there are alternatives. E.g.,
fun <- function(x, v)
if(length(v) > 1) v[1] < x & x < v[2] | fun(x, v[-1:-2]) else FALSE
x[fun(x, v)]
# [1] 2 3 4 9 10 11 18 19
I found an easy way just with sapply() :
x <- 1:20
v <- c(1, 5, 8, 12, 17, 20)
(v.df <- as.data.frame(matrix(v, 3, 2, byrow = T)))
# V1 V2
# 1 1 5
# 2 8 12
# 3 17 20
y <- sapply(x, function(x){
ind <- (x > v.df$V1 & x < v.df$V2)
if(any(ind)) x else NA
})
y[!is.na(y)]
# [1] 2 3 4 9 10 11 18 19

Exchange two elements of a vector in one call

I have a vector c(9,6,3,4,2,1,5,7,8), and I want to switch the elements at index 2 and at index 5 in the vector. However, I don't want to have to create a temporary variable and would like to make the switch in one call. How would I do that?
How about just x[c(i,j)] <- x[c(j,i)]? Similar to replace(...), but perhaps a bit simpler.
swtch <- function(x,i,j) {x[c(i,j)] <- x[c(j,i)]; x}
swtch(c(9,6,3,4,2,1,5,7,8) , 2,5)
# [1] 9 2 3 4 6 1 5 7 8
You could use replace().
x <- c(9, 6, 3, 4, 2, 1, 5, 7, 8)
replace(x, c(2, 5), x[c(5, 2)])
# [1] 9 2 3 4 6 1 5 7 8
And if you don't even want to assign x, you can use
replace(
c(9, 6, 3, 4, 2, 1, 5, 7, 8),
c(2, 5),
c(9, 6, 3, 4, 2, 1, 5, 7, 8)[c(5, 2)]
)
# [1] 9 2 3 4 6 1 5 7 8
but that's a bit silly. You will probably want x assigned to begin with.
If you actually want to do it without creating a temporary copy of the vector, you would need to write a short C function.
library(inline)
swap <- cfunction(c(i = "integer", j = "integer", vec="integer"),"
int *v = INTEGER(vec);
int ii = INTEGER(i)[0]-1, jj = INTEGER(j)[0]-1;
int tmp = v[ii];
v[ii] = v[jj];
v[jj] = tmp;
return R_NilValue;
")
vec <- as.integer(c(9,6,3,4,2,1,5,7,8))
swap(2L, 5L, vec)
vec
# [1] 9 2 3 4 6 1 5 7 8

Extract a numeric vector from data frame in R

I have a data.frame like following example. I want to write a function to do these two tasks for me in one function in R? first extract the value of data frame which is same for x and y and I want to save it as a numeric vector and also make the rest as a data frame.
d = data.frame(x = c(1,7, 2, 9, 11),y=c(6, 7, 8, 9,10))
v = c(7, 9)
w = data.frame(x=c(1, 2, 11), y=c(6, 8, 10))
My desire result as follows:
> result
$v
[1] 7 9
$w
x y
1 1 6
2 2 8
3 11 10
Maybe with is what you want?
with(d, list(v = x[x==y] ,w=d[x!=y,]))
$v
[1] 7 9
$w
x y
1 1 6
3 2 8
5 11 10
Something along these lines should do this too
splitdf <- function(df) {
if (ncol(df) != 2) stop("df must have 2 columns")
ind <- do.call("==", df)
list(v = df[ind, 1], w = df[!ind, ])
}
d <- data.frame(x = c(1, 7, 2, 9, 11), y = c(6, 7, 8, 9, 10))
splitdf(d)
## $v
## [1] 7 9
## $w
## x y
## 1 1 6
## 3 2 8
## 5 11 10
df <- data.frame(x = c(1, 7, 2, 9, 11), z = c(7, 8, 10, 9, 12))
splitdf(df)
## $v
## [1] 9
## $w
## x z
## 1 1 7
## 2 7 8
## 3 2 10
## 5 11 12

Resources