Merging list of numbers in R - r

I've a list with groups of numbers as follows:
myList <- list(1:5, c(1,3,4,7), 2:6, c(3,6:9), 4:8)
myList
#[[1]]
#[1] 1 2 3 4 5
#
#[[2]]
#[1] 1 3 4 7
#
#[[3]]
#[1] 2 3 4 5 6
#
#[[4]]
#[1] 3 6 7 8 9
#
#[[5]]
#[1] 4 5 6 7 8
So we have 5 groups of numbers in myList. I want to merge two groups if they have at least s% same numbers in them. I compute similarity using the following function call:
get.Overlap <- function(group_A, group_B)
{
common <- length(intersect(group_A, group_B))
minimum_length <- min(length(group_A), length(group_B))
#formula: |A ^ B| / min{|A|, |B|}
overlap_score <- common / minimum_length
overlap_score
}
I have implemented mergeList(myList, threshold_s) using loops as follows:
mergeList <- function(myList, threshold_s)
{
returnList <- list()
i <- 1
while(i <= length(myList))
{
thisList <- myList[[i]]
j <- i + 1
while(j <= length(myList))
{
tempList <- myList[[j]]
if(get.Overlap(thisList, tempList) >= threshold_s)
{
thisList <- union(thisList, tempList)
myList[[j]] <- NULL
}
else
{
j <- j + 1
}
}
returnList <- c(returnList, list(thisList))
i <- i + 1
}
returnList
}
Now if I call merge(myList, threshold_s), where threshold_s is set to 0.80 meaning 80% similarity, the output will be
#[[1]]
#[1] 1 2 3 4 5 6
#
#[[2]]
#[1] 1 3 4 7
#
#[[3]]
#[1] 3 4 6 7 8 9
The complexity is comparatively high. I am looking for a fast implement of merge(myList, threshold_s) for a large list, say length of myList may be around 50,000.
Thanks in advance.

Related

Applying a function over batches of links - splitting a character vector into subsets [duplicate]

I have to split a vector into n chunks of equal size in R. I couldn't find any base function to do that. Also Google didn't get me anywhere. Here is what I came up with so far;
x <- 1:10
n <- 3
chunk <- function(x,n) split(x, factor(sort(rank(x)%%n)))
chunk(x,n)
$`0`
[1] 1 2 3
$`1`
[1] 4 5 6 7
$`2`
[1] 8 9 10
A one-liner splitting d into chunks of size 20:
split(d, ceiling(seq_along(d)/20))
More details: I think all you need is seq_along(), split() and ceiling():
> d <- rpois(73,5)
> d
[1] 3 1 11 4 1 2 3 2 4 10 10 2 7 4 6 6 2 1 1 2 3 8 3 10 7 4
[27] 3 4 4 1 1 7 2 4 6 0 5 7 4 6 8 4 7 12 4 6 8 4 2 7 6 5
[53] 4 5 4 5 5 8 7 7 7 6 2 4 3 3 8 11 6 6 1 8 4
> max <- 20
> x <- seq_along(d)
> d1 <- split(d, ceiling(x/max))
> d1
$`1`
[1] 3 1 11 4 1 2 3 2 4 10 10 2 7 4 6 6 2 1 1 2
$`2`
[1] 3 8 3 10 7 4 3 4 4 1 1 7 2 4 6 0 5 7 4 6
$`3`
[1] 8 4 7 12 4 6 8 4 2 7 6 5 4 5 4 5 5 8 7 7
$`4`
[1] 7 6 2 4 3 3 8 11 6 6 1 8 4
chunk2 <- function(x,n) split(x, cut(seq_along(x), n, labels = FALSE))
A simplified version:
n = 3
split(x, sort(x%%n))
NB: This will only work on numeric vectors.
Using base R's rep_len:
x <- 1:10
n <- 3
split(x, rep_len(1:n, length(x)))
# $`1`
# [1] 1 4 7 10
#
# $`2`
# [1] 2 5 8
#
# $`3`
# [1] 3 6 9
And as already mentioned if you want sorted indices, simply:
split(x, sort(rep_len(1:n, length(x))))
# $`1`
# [1] 1 2 3 4
#
# $`2`
# [1] 5 6 7
#
# $`3`
# [1] 8 9 10
Try the ggplot2 function, cut_number:
library(ggplot2)
x <- 1:10
n <- 3
cut_number(x, n) # labels = FALSE if you just want an integer result
#> [1] [1,4] [1,4] [1,4] [1,4] (4,7] (4,7] (4,7] (7,10] (7,10] (7,10]
#> Levels: [1,4] (4,7] (7,10]
# if you want it split into a list:
split(x, cut_number(x, n))
#> $`[1,4]`
#> [1] 1 2 3 4
#>
#> $`(4,7]`
#> [1] 5 6 7
#>
#> $`(7,10]`
#> [1] 8 9 10
This will split it differently to what you have, but is still quite a nice list structure I think:
chunk.2 <- function(x, n, force.number.of.groups = TRUE, len = length(x), groups = trunc(len/n), overflow = len%%n) {
if(force.number.of.groups) {
f1 <- as.character(sort(rep(1:n, groups)))
f <- as.character(c(f1, rep(n, overflow)))
} else {
f1 <- as.character(sort(rep(1:groups, n)))
f <- as.character(c(f1, rep("overflow", overflow)))
}
g <- split(x, f)
if(force.number.of.groups) {
g.names <- names(g)
g.names.ordered <- as.character(sort(as.numeric(g.names)))
} else {
g.names <- names(g[-length(g)])
g.names.ordered <- as.character(sort(as.numeric(g.names)))
g.names.ordered <- c(g.names.ordered, "overflow")
}
return(g[g.names.ordered])
}
Which will give you the following, depending on how you want it formatted:
> x <- 1:10; n <- 3
> chunk.2(x, n, force.number.of.groups = FALSE)
$`1`
[1] 1 2 3
$`2`
[1] 4 5 6
$`3`
[1] 7 8 9
$overflow
[1] 10
> chunk.2(x, n, force.number.of.groups = TRUE)
$`1`
[1] 1 2 3
$`2`
[1] 4 5 6
$`3`
[1] 7 8 9 10
Running a couple of timings using these settings:
set.seed(42)
x <- rnorm(1:1e7)
n <- 3
Then we have the following results:
> system.time(chunk(x, n)) # your function
user system elapsed
29.500 0.620 30.125
> system.time(chunk.2(x, n, force.number.of.groups = TRUE))
user system elapsed
5.360 0.300 5.663
Note: Changing as.factor() to as.character() made my function twice as fast.
If you don't like split() and you don't like matrix() (with its dangling NAs), there's this:
chunk <- function(x, n) (mapply(function(a, b) (x[a:b]), seq.int(from=1, to=length(x), by=n), pmin(seq.int(from=1, to=length(x), by=n)+(n-1), length(x)), SIMPLIFY=FALSE))
Like split(), it returns a list, but it doesn't waste time or space with labels, so it may be more performant.
A few more variants to the pile...
> x <- 1:10
> n <- 3
Note, that you don't need to use the factor function here, but you still want to sort o/w your first vector would be 1 2 3 10:
> chunk <- function(x, n) split(x, sort(rank(x) %% n))
> chunk(x,n)
$`0`
[1] 1 2 3
$`1`
[1] 4 5 6 7
$`2`
[1] 8 9 10
Or you can assign character indices, vice the numbers in left ticks above:
> my.chunk <- function(x, n) split(x, sort(rep(letters[1:n], each=n, len=length(x))))
> my.chunk(x, n)
$a
[1] 1 2 3 4
$b
[1] 5 6 7
$c
[1] 8 9 10
Or you can use plainword names stored in a vector. Note that using sort to get consecutive values in x alphabetizes the labels:
> my.other.chunk <- function(x, n) split(x, sort(rep(c("tom", "dick", "harry"), each=n, len=length(x))))
> my.other.chunk(x, n)
$dick
[1] 1 2 3
$harry
[1] 4 5 6
$tom
[1] 7 8 9 10
Yet another possibility is the splitIndices function from package parallel:
library(parallel)
splitIndices(20, 3)
Gives:
[[1]]
[1] 1 2 3 4 5 6 7
[[2]]
[1] 8 9 10 11 12 13
[[3]]
[1] 14 15 16 17 18 19 20
NB: this works only with numeric values though. If you want to split a character vector, you would need to do some indexing: lapply(splitIndices(20, 3), \(x) letters[1:20][x])
You could combine the split/cut, as suggested by mdsummer, with quantile to create even groups:
split(x,cut(x,quantile(x,(0:n)/n), include.lowest=TRUE, labels=FALSE))
This gives the same result for your example, but not for skewed variables.
split(x,matrix(1:n,n,length(x))[1:length(x)])
perhaps this is more clear, but the same idea:
split(x,rep(1:n, ceiling(length(x)/n),length.out = length(x)))
if you want it ordered,throw a sort around it
Here's another variant.
NOTE: with this sample you're specifying the CHUNK SIZE in the second parameter
all chunks are uniform, except for the last;
the last will at worst be smaller, never bigger than the chunk size.
chunk <- function(x,n)
{
f <- sort(rep(1:(trunc(length(x)/n)+1),n))[1:length(x)]
return(split(x,f))
}
#Test
n<-c(1,2,3,4,5,6,7,8,9,10,11)
c<-chunk(n,5)
q<-lapply(c, function(r) cat(r,sep=",",collapse="|") )
#output
1,2,3,4,5,|6,7,8,9,10,|11,|
I needed the same function and have read the previous solutions, however i also needed to have the unbalanced chunk to be at the end i.e if i have 10 elements to split them into vectors of 3 each, then my result should have vectors with 3,3,4 elements respectively. So i used the following (i left the code unoptimised for readability, otherwise no need to have many variables):
chunk <- function(x,n){
numOfVectors <- floor(length(x)/n)
elementsPerVector <- c(rep(n,numOfVectors-1),n+length(x) %% n)
elemDistPerVector <- rep(1:numOfVectors,elementsPerVector)
split(x,factor(elemDistPerVector))
}
set.seed(1)
x <- rnorm(10)
n <- 3
chunk(x,n)
$`1`
[1] -0.6264538 0.1836433 -0.8356286
$`2`
[1] 1.5952808 0.3295078 -0.8204684
$`3`
[1] 0.4874291 0.7383247 0.5757814 -0.3053884
Simple function for splitting a vector by simply using indexes - no need to over complicate this
vsplit <- function(v, n) {
l = length(v)
r = l/n
return(lapply(1:n, function(i) {
s = max(1, round(r*(i-1))+1)
e = min(l, round(r*i))
return(v[s:e])
}))
}
Sorry if this answer comes so late, but maybe it can be useful for someone else. Actually there is a very useful solution to this problem, explained at the end of ?split.
> testVector <- c(1:10) #I want to divide it into 5 parts
> VectorList <- split(testVector, 1:5)
> VectorList
$`1`
[1] 1 6
$`2`
[1] 2 7
$`3`
[1] 3 8
$`4`
[1] 4 9
$`5`
[1] 5 10
Credit to #Sebastian for this function
chunk <- function(x,y){
split(x, factor(sort(rank(row.names(x))%%y)))
}
If you don't like split() and you don't mind NAs padding out your short tail:
chunk <- function(x, n) { if((length(x)%%n)==0) {return(matrix(x, nrow=n))} else {return(matrix(append(x, rep(NA, n-(length(x)%%n))), nrow=n))} }
The columns of the returned matrix ([,1:ncol]) are the droids you are looking for.
I need a function that takes the argument of a data.table (in quotes) and another argument that is the upper limit on the number of rows in the subsets of that original data.table. This function produces whatever number of data.tables that upper limit allows for:
library(data.table)
split_dt <- function(x,y)
{
for(i in seq(from=1,to=nrow(get(x)),by=y))
{df_ <<- get(x)[i:(i + y)];
assign(paste0("df_",i),df_,inherits=TRUE)}
rm(df_,inherits=TRUE)
}
This function gives me a series of data.tables named df_[number] with the starting row from the original data.table in the name. The last data.table can be short and filled with NAs so you have to subset that back to whatever data is left. This type of function is useful because certain GIS software have limits on how many address pins you can import, for example. So slicing up data.tables into smaller chunks may not be recommended, but it may not be avoidable.
I have come up with this solution:
require(magrittr)
create.chunks <- function(x, elements.per.chunk){
# plain R version
# split(x, rep(seq_along(x), each = elements.per.chunk)[seq_along(x)])
# magrittr version - because that's what people use now
x %>% seq_along %>% rep(., each = elements.per.chunk) %>% extract(seq_along(x)) %>% split(x, .)
}
create.chunks(letters[1:10], 3)
$`1`
[1] "a" "b" "c"
$`2`
[1] "d" "e" "f"
$`3`
[1] "g" "h" "i"
$`4`
[1] "j"
The key is to use the seq(each = chunk.size) parameter so make it work. Using seq_along acts like rank(x) in my previous solution, but is actually able to produce the correct result with duplicated entries.
Here's yet another one, allowing you to control if you want the result ordered or not:
split_to_chunks <- function(x, n, keep.order=TRUE){
if(keep.order){
return(split(x, sort(rep(1:n, length.out = length(x)))))
}else{
return(split(x, rep(1:n, length.out = length(x))))
}
}
split_to_chunks(x = 1:11, n = 3)
$`1`
[1] 1 2 3 4
$`2`
[1] 5 6 7 8
$`3`
[1] 9 10 11
split_to_chunks(x = 1:11, n = 3, keep.order=FALSE)
$`1`
[1] 1 4 7 10
$`2`
[1] 2 5 8 11
$`3`
[1] 3 6 9
Not sure if this answers OP's question, but I think the %% can be useful here
df # some data.frame
N_CHUNKS <- 10
I_VEC <- 1:nrow(df)
df_split <- split(df, sort(I_VEC %% N_CHUNKS))
This splits into chunks of size ⌊n/k⌋+1 or ⌊n/k⌋ and does not use the O(n log n) sort.
get_chunk_id<-function(n, k){
r <- n %% k
s <- n %/% k
i<-seq_len(n)
1 + ifelse (i <= r * (s+1), (i-1) %/% (s+1), r + ((i - r * (s+1)-1) %/% s))
}
split(1:10, get_chunk_id(10,3))

Partion of a vector to a list [duplicate]

I have to split a vector into n chunks of equal size in R. I couldn't find any base function to do that. Also Google didn't get me anywhere. Here is what I came up with so far;
x <- 1:10
n <- 3
chunk <- function(x,n) split(x, factor(sort(rank(x)%%n)))
chunk(x,n)
$`0`
[1] 1 2 3
$`1`
[1] 4 5 6 7
$`2`
[1] 8 9 10
A one-liner splitting d into chunks of size 20:
split(d, ceiling(seq_along(d)/20))
More details: I think all you need is seq_along(), split() and ceiling():
> d <- rpois(73,5)
> d
[1] 3 1 11 4 1 2 3 2 4 10 10 2 7 4 6 6 2 1 1 2 3 8 3 10 7 4
[27] 3 4 4 1 1 7 2 4 6 0 5 7 4 6 8 4 7 12 4 6 8 4 2 7 6 5
[53] 4 5 4 5 5 8 7 7 7 6 2 4 3 3 8 11 6 6 1 8 4
> max <- 20
> x <- seq_along(d)
> d1 <- split(d, ceiling(x/max))
> d1
$`1`
[1] 3 1 11 4 1 2 3 2 4 10 10 2 7 4 6 6 2 1 1 2
$`2`
[1] 3 8 3 10 7 4 3 4 4 1 1 7 2 4 6 0 5 7 4 6
$`3`
[1] 8 4 7 12 4 6 8 4 2 7 6 5 4 5 4 5 5 8 7 7
$`4`
[1] 7 6 2 4 3 3 8 11 6 6 1 8 4
chunk2 <- function(x,n) split(x, cut(seq_along(x), n, labels = FALSE))
A simplified version:
n = 3
split(x, sort(x%%n))
NB: This will only work on numeric vectors.
Using base R's rep_len:
x <- 1:10
n <- 3
split(x, rep_len(1:n, length(x)))
# $`1`
# [1] 1 4 7 10
#
# $`2`
# [1] 2 5 8
#
# $`3`
# [1] 3 6 9
And as already mentioned if you want sorted indices, simply:
split(x, sort(rep_len(1:n, length(x))))
# $`1`
# [1] 1 2 3 4
#
# $`2`
# [1] 5 6 7
#
# $`3`
# [1] 8 9 10
Try the ggplot2 function, cut_number:
library(ggplot2)
x <- 1:10
n <- 3
cut_number(x, n) # labels = FALSE if you just want an integer result
#> [1] [1,4] [1,4] [1,4] [1,4] (4,7] (4,7] (4,7] (7,10] (7,10] (7,10]
#> Levels: [1,4] (4,7] (7,10]
# if you want it split into a list:
split(x, cut_number(x, n))
#> $`[1,4]`
#> [1] 1 2 3 4
#>
#> $`(4,7]`
#> [1] 5 6 7
#>
#> $`(7,10]`
#> [1] 8 9 10
This will split it differently to what you have, but is still quite a nice list structure I think:
chunk.2 <- function(x, n, force.number.of.groups = TRUE, len = length(x), groups = trunc(len/n), overflow = len%%n) {
if(force.number.of.groups) {
f1 <- as.character(sort(rep(1:n, groups)))
f <- as.character(c(f1, rep(n, overflow)))
} else {
f1 <- as.character(sort(rep(1:groups, n)))
f <- as.character(c(f1, rep("overflow", overflow)))
}
g <- split(x, f)
if(force.number.of.groups) {
g.names <- names(g)
g.names.ordered <- as.character(sort(as.numeric(g.names)))
} else {
g.names <- names(g[-length(g)])
g.names.ordered <- as.character(sort(as.numeric(g.names)))
g.names.ordered <- c(g.names.ordered, "overflow")
}
return(g[g.names.ordered])
}
Which will give you the following, depending on how you want it formatted:
> x <- 1:10; n <- 3
> chunk.2(x, n, force.number.of.groups = FALSE)
$`1`
[1] 1 2 3
$`2`
[1] 4 5 6
$`3`
[1] 7 8 9
$overflow
[1] 10
> chunk.2(x, n, force.number.of.groups = TRUE)
$`1`
[1] 1 2 3
$`2`
[1] 4 5 6
$`3`
[1] 7 8 9 10
Running a couple of timings using these settings:
set.seed(42)
x <- rnorm(1:1e7)
n <- 3
Then we have the following results:
> system.time(chunk(x, n)) # your function
user system elapsed
29.500 0.620 30.125
> system.time(chunk.2(x, n, force.number.of.groups = TRUE))
user system elapsed
5.360 0.300 5.663
Note: Changing as.factor() to as.character() made my function twice as fast.
If you don't like split() and you don't like matrix() (with its dangling NAs), there's this:
chunk <- function(x, n) (mapply(function(a, b) (x[a:b]), seq.int(from=1, to=length(x), by=n), pmin(seq.int(from=1, to=length(x), by=n)+(n-1), length(x)), SIMPLIFY=FALSE))
Like split(), it returns a list, but it doesn't waste time or space with labels, so it may be more performant.
A few more variants to the pile...
> x <- 1:10
> n <- 3
Note, that you don't need to use the factor function here, but you still want to sort o/w your first vector would be 1 2 3 10:
> chunk <- function(x, n) split(x, sort(rank(x) %% n))
> chunk(x,n)
$`0`
[1] 1 2 3
$`1`
[1] 4 5 6 7
$`2`
[1] 8 9 10
Or you can assign character indices, vice the numbers in left ticks above:
> my.chunk <- function(x, n) split(x, sort(rep(letters[1:n], each=n, len=length(x))))
> my.chunk(x, n)
$a
[1] 1 2 3 4
$b
[1] 5 6 7
$c
[1] 8 9 10
Or you can use plainword names stored in a vector. Note that using sort to get consecutive values in x alphabetizes the labels:
> my.other.chunk <- function(x, n) split(x, sort(rep(c("tom", "dick", "harry"), each=n, len=length(x))))
> my.other.chunk(x, n)
$dick
[1] 1 2 3
$harry
[1] 4 5 6
$tom
[1] 7 8 9 10
Yet another possibility is the splitIndices function from package parallel:
library(parallel)
splitIndices(20, 3)
Gives:
[[1]]
[1] 1 2 3 4 5 6 7
[[2]]
[1] 8 9 10 11 12 13
[[3]]
[1] 14 15 16 17 18 19 20
NB: this works only with numeric values though. If you want to split a character vector, you would need to do some indexing: lapply(splitIndices(20, 3), \(x) letters[1:20][x])
You could combine the split/cut, as suggested by mdsummer, with quantile to create even groups:
split(x,cut(x,quantile(x,(0:n)/n), include.lowest=TRUE, labels=FALSE))
This gives the same result for your example, but not for skewed variables.
split(x,matrix(1:n,n,length(x))[1:length(x)])
perhaps this is more clear, but the same idea:
split(x,rep(1:n, ceiling(length(x)/n),length.out = length(x)))
if you want it ordered,throw a sort around it
Here's another variant.
NOTE: with this sample you're specifying the CHUNK SIZE in the second parameter
all chunks are uniform, except for the last;
the last will at worst be smaller, never bigger than the chunk size.
chunk <- function(x,n)
{
f <- sort(rep(1:(trunc(length(x)/n)+1),n))[1:length(x)]
return(split(x,f))
}
#Test
n<-c(1,2,3,4,5,6,7,8,9,10,11)
c<-chunk(n,5)
q<-lapply(c, function(r) cat(r,sep=",",collapse="|") )
#output
1,2,3,4,5,|6,7,8,9,10,|11,|
I needed the same function and have read the previous solutions, however i also needed to have the unbalanced chunk to be at the end i.e if i have 10 elements to split them into vectors of 3 each, then my result should have vectors with 3,3,4 elements respectively. So i used the following (i left the code unoptimised for readability, otherwise no need to have many variables):
chunk <- function(x,n){
numOfVectors <- floor(length(x)/n)
elementsPerVector <- c(rep(n,numOfVectors-1),n+length(x) %% n)
elemDistPerVector <- rep(1:numOfVectors,elementsPerVector)
split(x,factor(elemDistPerVector))
}
set.seed(1)
x <- rnorm(10)
n <- 3
chunk(x,n)
$`1`
[1] -0.6264538 0.1836433 -0.8356286
$`2`
[1] 1.5952808 0.3295078 -0.8204684
$`3`
[1] 0.4874291 0.7383247 0.5757814 -0.3053884
Simple function for splitting a vector by simply using indexes - no need to over complicate this
vsplit <- function(v, n) {
l = length(v)
r = l/n
return(lapply(1:n, function(i) {
s = max(1, round(r*(i-1))+1)
e = min(l, round(r*i))
return(v[s:e])
}))
}
Sorry if this answer comes so late, but maybe it can be useful for someone else. Actually there is a very useful solution to this problem, explained at the end of ?split.
> testVector <- c(1:10) #I want to divide it into 5 parts
> VectorList <- split(testVector, 1:5)
> VectorList
$`1`
[1] 1 6
$`2`
[1] 2 7
$`3`
[1] 3 8
$`4`
[1] 4 9
$`5`
[1] 5 10
Credit to #Sebastian for this function
chunk <- function(x,y){
split(x, factor(sort(rank(row.names(x))%%y)))
}
If you don't like split() and you don't mind NAs padding out your short tail:
chunk <- function(x, n) { if((length(x)%%n)==0) {return(matrix(x, nrow=n))} else {return(matrix(append(x, rep(NA, n-(length(x)%%n))), nrow=n))} }
The columns of the returned matrix ([,1:ncol]) are the droids you are looking for.
I need a function that takes the argument of a data.table (in quotes) and another argument that is the upper limit on the number of rows in the subsets of that original data.table. This function produces whatever number of data.tables that upper limit allows for:
library(data.table)
split_dt <- function(x,y)
{
for(i in seq(from=1,to=nrow(get(x)),by=y))
{df_ <<- get(x)[i:(i + y)];
assign(paste0("df_",i),df_,inherits=TRUE)}
rm(df_,inherits=TRUE)
}
This function gives me a series of data.tables named df_[number] with the starting row from the original data.table in the name. The last data.table can be short and filled with NAs so you have to subset that back to whatever data is left. This type of function is useful because certain GIS software have limits on how many address pins you can import, for example. So slicing up data.tables into smaller chunks may not be recommended, but it may not be avoidable.
I have come up with this solution:
require(magrittr)
create.chunks <- function(x, elements.per.chunk){
# plain R version
# split(x, rep(seq_along(x), each = elements.per.chunk)[seq_along(x)])
# magrittr version - because that's what people use now
x %>% seq_along %>% rep(., each = elements.per.chunk) %>% extract(seq_along(x)) %>% split(x, .)
}
create.chunks(letters[1:10], 3)
$`1`
[1] "a" "b" "c"
$`2`
[1] "d" "e" "f"
$`3`
[1] "g" "h" "i"
$`4`
[1] "j"
The key is to use the seq(each = chunk.size) parameter so make it work. Using seq_along acts like rank(x) in my previous solution, but is actually able to produce the correct result with duplicated entries.
Here's yet another one, allowing you to control if you want the result ordered or not:
split_to_chunks <- function(x, n, keep.order=TRUE){
if(keep.order){
return(split(x, sort(rep(1:n, length.out = length(x)))))
}else{
return(split(x, rep(1:n, length.out = length(x))))
}
}
split_to_chunks(x = 1:11, n = 3)
$`1`
[1] 1 2 3 4
$`2`
[1] 5 6 7 8
$`3`
[1] 9 10 11
split_to_chunks(x = 1:11, n = 3, keep.order=FALSE)
$`1`
[1] 1 4 7 10
$`2`
[1] 2 5 8 11
$`3`
[1] 3 6 9
Not sure if this answers OP's question, but I think the %% can be useful here
df # some data.frame
N_CHUNKS <- 10
I_VEC <- 1:nrow(df)
df_split <- split(df, sort(I_VEC %% N_CHUNKS))
This splits into chunks of size ⌊n/k⌋+1 or ⌊n/k⌋ and does not use the O(n log n) sort.
get_chunk_id<-function(n, k){
r <- n %% k
s <- n %/% k
i<-seq_len(n)
1 + ifelse (i <= r * (s+1), (i-1) %/% (s+1), r + ((i - r * (s+1)-1) %/% s))
}
split(1:10, get_chunk_id(10,3))

R - Collapse into vector same member of a list

I have a list with same structure for every member as the following
config <- NULL
config[["secA"]] <- NULL
config[["secA"]]$VAL <- 0
config[["secA"]]$ARR <- c(1,2,3,4,5)
config[["secA"]]$DF <- data.frame(matrix(c(1,5,3,8),2,2))
config[["secB"]] <- NULL
config[["secB"]]$VAL <- 1
config[["secB"]]$ARR <- c(1,3,2,4,9)
config[["secB"]]$DF <- data.frame(matrix(c(2,6,1,9),2,2))
config[["secC"]] <- NULL
config[["secC"]]$VAL <- 5
config[["secC"]]$ARR <- c(4,2,1,5,8)
config[["secC"]]$DF <- data.frame(matrix(c(4,2,1,7),2,2))
and I need to obtain 3 vectors VAL, ARR and DF, each with the concatenated elements of the corresponding member. such as
# VAL: 0,1,5
# ARR: 1,2,3,4,5,1,3,2,4,9,4,2,1,5,8
# DF: 1,5,3,8,2,6,1,9,4,2,1,7
Looking at similar situations, I have the feeling I need to use a combination of do.call and cbind or lapply but I have no clue. any suggestions?
config <- NULL
config[["secA"]] <- NULL
config[["secA"]]$VAL <- 0
config[["secA"]]$ARR <- c(1,2,3,4,5)
config[["secA"]]$DF <- data.frame(matrix(c(1,5,3,8),2,2))
config[["secB"]] <- NULL
config[["secB"]]$VAL <- 1
config[["secB"]]$ARR <- c(1,3,2,4,9)
config[["secB"]]$DF <- data.frame(matrix(c(2,6,1,9),2,2))
config[["secC"]] <- NULL
config[["secC"]]$VAL <- 5
config[["secC"]]$ARR <- c(4,2,1,5,8)
config[["secC"]]$DF <- data.frame(matrix(c(4,2,1,7),2,2))
sapply(names(config[[1]]), function(x)
unname(unlist(sapply(config, `[`, x))), USE.NAMES = TRUE)
# $VAL
# [1] 0 1 5
#
# $ARR
# [1] 1 2 3 4 5 1 3 2 4 9 4 2 1 5 8
#
# $DF
# [1] 1 5 3 8 2 6 1 9 4 2 1 7
Or you can use this clist function
Unfortunately there were no other answers.
(l <- Reduce(clist, config))
# $VAL
# [1] 0 1 5
#
# $ARR
# [1] 1 2 3 4 5 1 3 2 4 9 4 2 1 5 8
#
# $DF
# X1 X2 X1 X2 X1 X2
# 1 1 3 2 1 4 1
# 2 5 8 6 9 2 7
It merges data frames and matrices, so you need to unlist to get the vector you want
l$DF <- unname(unlist(l$DF))
l
# $VAL
# [1] 0 1 5
#
# $ARR
# [1] 1 2 3 4 5 1 3 2 4 9 4 2 1 5 8
#
# $DF
# [1] 1 5 3 8 2 6 1 9 4 2 1 7
Function
clist <- function (x, y) {
islist <- function(x) inherits(x, 'list')
'%||%' <- function(a, b) if (!is.null(a)) a else b
get_fun <- function(x, y)
switch(class(x %||% y),
matrix = cbind,
data.frame = function(x, y)
do.call('cbind.data.frame', Filter(Negate(is.null), list(x, y))),
factor = function(...) unlist(list(...)), c)
stopifnot(islist(x), islist(y))
nn <- names(rapply(c(x, y), names, how = 'list'))
if (is.null(nn) || any(!nzchar(nn)))
stop('All non-NULL list elements should have unique names', domain = NA)
nn <- unique(c(names(x), names(y)))
z <- setNames(vector('list', length(nn)), nn)
for (ii in nn)
z[[ii]] <- if (islist(x[[ii]]) && islist(y[[ii]]))
Recall(x[[ii]], y[[ii]]) else
(get_fun(x[[ii]], y[[ii]]))(x[[ii]], y[[ii]])
z
}
Another approach, with slightly less code.
un_config <- unlist(config)
un_configNAM <- names(un_config)
vecNAM <- c("VAL", "ARR", "DF")
for(n in vecNAM){
assign(n, un_config[grepl(n, un_configNAM)])
}
This will return 3 vectors as the OP requested. However, generally it is more advantageous to store results in a list as rawr suggests. You of course can adopt the above code so that results are stored within a list.
l <- rep(list(NA), length(vecNAM))
i = 1
for(n in vecNAM){
l[[i]] <- un_config[grepl(n, un_configNAM)]
i = i +1
}

Find consecutive sub-vectors of length k out of a numeric vector which satisfy a given condition

I have a numeric vector in R, say
v= c(2,3,5,6,7,6,3,2,3,4,5,7,8,9,6,1,1,2,5,6,7,11,2,3,4)
Now, I have to find all the consecutive sub-vector of size 4 out of it with the condition that each element of the sub-vector must be greater than 2 and all sub-vector must be disjoint in the sense that non of the two sub-vector can contain same index element. So my output will be:
(3,5,6,7),(3,4,5,7),(5,6,7,11)
Edited:
Other examples for illustration purpose: for,
v=c(3,3,3,3,1,3,3,3,3,3,3,3,3)
output will be :
(3,3,3,3), (3,3,3,3),(3,3,3,3).
and for,
v= c(2,3,5,5,7,6,3,2,3,4,5,7,8,9,6,1,1,2,5,6,7,11,2,3,4)
output will be
(3,5,5,7),(3,4,5,7),(5,6,7,11)
The second condition on the output simply says that if we found any sub- array say (v[m],v[m+1],v[m+2],v[m+3]) with each element greater than > 2 then it will goes into my output and the next sub-array can only be start from v[m+4](if possible)
This solution uses embed() to create a matrix of lags and then extracts the desired rows from this matrix:
v <- c(2,3,5,6,7,6,3,2,3,4,5,7,8,9,6,1,1,2,5,6,7,11,2,3,4)
e <- embed(v, 4)
ret <- which(
apply(e, 1, function(x)all(x > 2)) &
apply(e, 1, function(x)length(unique(x)) == 4)
)
rows <- ret[c(1, 1 + which(diff(ret) > 4))]
e[rows, 4:1]
[,1] [,2] [,3] [,4]
[1,] 3 5 6 7
[2,] 3 4 5 7
[3,] 5 6 7 11
Try:
fun1 <- function(vec, n, cond1) {
lst1 <- lapply(1:(length(vec) - n+1), function(i) {
x1 <- vec[i:(i + (n-1))]
if (all(diff(x1) >= 0) & all(x1 > cond1))
x1
})
indx <- which(sapply(lst1, length) == n)
indx2 <- unlist(lapply(split(indx, cumsum(c(TRUE, diff(indx) != 1))), function(x) x[seq(1,
length(x), by = n-1)]))
lst1[indx2]
}
v1 <- c(3,3,3,3,1,3,3,3,3,3,3,3,3)
v2 <- c(2,3,5,5,7,6,3,2,3,4,5,7,8,9,6,1,1,2,5,6,7,11,2,3,4)
v3 <- c(2,3,5,6,7,6,3,2,3,4,5,7,8,9,6,1,1,2,5,6,7,11,2,3,4)
fun1(v1,4,2)
#[[1]]
#[1] 3 3 3 3
#[[2]]
#[1] 3 3 3 3
#[[3]]
#[1] 3 3 3 3
fun1(v2,4,2)
#[[1]]
#[1] 3 5 5 7
#[[2]]
#[1] 3 4 5 7
#[[3]]
#[1] 5 6 7 11
fun1(v3,4,2)
#[[1]]
#[1] 3 5 6 7
#[[2]]
#[1] 3 4 5 7
#[[3]]
#[1] 5 6 7 11
Here is another idea based on rle:
ff = function(x, size, thres)
{
valid_subsets = sapply(head(seq_along(x), -(size - 1)),
function(i) all(x[i:(i + (size - 1))] > thres))
r = rle(valid_subsets)
lapply(unlist(mapply(function(a, b) a + (seq_len(b) - 1) * size,
(cumsum(r$lengths) - r$lengths + 1)[which(r$values)],
(r$lengths[which(r$values)] + size - 1) %/% size)),
function(i) x[i:(i + (size - 1))])
}
ff(c(3,3,3,3,1,3,3,3,3,3,3,3,3), 4, 2)
ff(c(2,3,5,6,7,6,3,2,3,4,5,7,8,9,6,1,1,2,5,6,7,11,2,3,4), 4, 2)
Testing on another vector (I assume this is the correct output):
set.seed(4); xx = sample(1:10, 20, T)
xx
# [1] 6 1 3 3 9 3 8 10 10 1 8 3 2 10 5 5 10 6 10 8
ff(xx, 4, 2)
#[[1]]
#[1] 3 3 9 3
#
#[[2]]
#[1] 10 5 5 10
Unless I'm missing something, on "xx" (as well as on other cases) the other posted answers do not seem to work:
fun1(xx, 4, 2)
#[[1]]
#[1] 3 8 10 10
#e[rows, 4:1]
#[1] 9 3 8 10

Split a vector into chunks

I have to split a vector into n chunks of equal size in R. I couldn't find any base function to do that. Also Google didn't get me anywhere. Here is what I came up with so far;
x <- 1:10
n <- 3
chunk <- function(x,n) split(x, factor(sort(rank(x)%%n)))
chunk(x,n)
$`0`
[1] 1 2 3
$`1`
[1] 4 5 6 7
$`2`
[1] 8 9 10
A one-liner splitting d into chunks of size 20:
split(d, ceiling(seq_along(d)/20))
More details: I think all you need is seq_along(), split() and ceiling():
> d <- rpois(73,5)
> d
[1] 3 1 11 4 1 2 3 2 4 10 10 2 7 4 6 6 2 1 1 2 3 8 3 10 7 4
[27] 3 4 4 1 1 7 2 4 6 0 5 7 4 6 8 4 7 12 4 6 8 4 2 7 6 5
[53] 4 5 4 5 5 8 7 7 7 6 2 4 3 3 8 11 6 6 1 8 4
> max <- 20
> x <- seq_along(d)
> d1 <- split(d, ceiling(x/max))
> d1
$`1`
[1] 3 1 11 4 1 2 3 2 4 10 10 2 7 4 6 6 2 1 1 2
$`2`
[1] 3 8 3 10 7 4 3 4 4 1 1 7 2 4 6 0 5 7 4 6
$`3`
[1] 8 4 7 12 4 6 8 4 2 7 6 5 4 5 4 5 5 8 7 7
$`4`
[1] 7 6 2 4 3 3 8 11 6 6 1 8 4
chunk2 <- function(x,n) split(x, cut(seq_along(x), n, labels = FALSE))
A simplified version:
n = 3
split(x, sort(x%%n))
NB: This will only work on numeric vectors.
Using base R's rep_len:
x <- 1:10
n <- 3
split(x, rep_len(1:n, length(x)))
# $`1`
# [1] 1 4 7 10
#
# $`2`
# [1] 2 5 8
#
# $`3`
# [1] 3 6 9
And as already mentioned if you want sorted indices, simply:
split(x, sort(rep_len(1:n, length(x))))
# $`1`
# [1] 1 2 3 4
#
# $`2`
# [1] 5 6 7
#
# $`3`
# [1] 8 9 10
Try the ggplot2 function, cut_number:
library(ggplot2)
x <- 1:10
n <- 3
cut_number(x, n) # labels = FALSE if you just want an integer result
#> [1] [1,4] [1,4] [1,4] [1,4] (4,7] (4,7] (4,7] (7,10] (7,10] (7,10]
#> Levels: [1,4] (4,7] (7,10]
# if you want it split into a list:
split(x, cut_number(x, n))
#> $`[1,4]`
#> [1] 1 2 3 4
#>
#> $`(4,7]`
#> [1] 5 6 7
#>
#> $`(7,10]`
#> [1] 8 9 10
This will split it differently to what you have, but is still quite a nice list structure I think:
chunk.2 <- function(x, n, force.number.of.groups = TRUE, len = length(x), groups = trunc(len/n), overflow = len%%n) {
if(force.number.of.groups) {
f1 <- as.character(sort(rep(1:n, groups)))
f <- as.character(c(f1, rep(n, overflow)))
} else {
f1 <- as.character(sort(rep(1:groups, n)))
f <- as.character(c(f1, rep("overflow", overflow)))
}
g <- split(x, f)
if(force.number.of.groups) {
g.names <- names(g)
g.names.ordered <- as.character(sort(as.numeric(g.names)))
} else {
g.names <- names(g[-length(g)])
g.names.ordered <- as.character(sort(as.numeric(g.names)))
g.names.ordered <- c(g.names.ordered, "overflow")
}
return(g[g.names.ordered])
}
Which will give you the following, depending on how you want it formatted:
> x <- 1:10; n <- 3
> chunk.2(x, n, force.number.of.groups = FALSE)
$`1`
[1] 1 2 3
$`2`
[1] 4 5 6
$`3`
[1] 7 8 9
$overflow
[1] 10
> chunk.2(x, n, force.number.of.groups = TRUE)
$`1`
[1] 1 2 3
$`2`
[1] 4 5 6
$`3`
[1] 7 8 9 10
Running a couple of timings using these settings:
set.seed(42)
x <- rnorm(1:1e7)
n <- 3
Then we have the following results:
> system.time(chunk(x, n)) # your function
user system elapsed
29.500 0.620 30.125
> system.time(chunk.2(x, n, force.number.of.groups = TRUE))
user system elapsed
5.360 0.300 5.663
Note: Changing as.factor() to as.character() made my function twice as fast.
If you don't like split() and you don't like matrix() (with its dangling NAs), there's this:
chunk <- function(x, n) (mapply(function(a, b) (x[a:b]), seq.int(from=1, to=length(x), by=n), pmin(seq.int(from=1, to=length(x), by=n)+(n-1), length(x)), SIMPLIFY=FALSE))
Like split(), it returns a list, but it doesn't waste time or space with labels, so it may be more performant.
A few more variants to the pile...
> x <- 1:10
> n <- 3
Note, that you don't need to use the factor function here, but you still want to sort o/w your first vector would be 1 2 3 10:
> chunk <- function(x, n) split(x, sort(rank(x) %% n))
> chunk(x,n)
$`0`
[1] 1 2 3
$`1`
[1] 4 5 6 7
$`2`
[1] 8 9 10
Or you can assign character indices, vice the numbers in left ticks above:
> my.chunk <- function(x, n) split(x, sort(rep(letters[1:n], each=n, len=length(x))))
> my.chunk(x, n)
$a
[1] 1 2 3 4
$b
[1] 5 6 7
$c
[1] 8 9 10
Or you can use plainword names stored in a vector. Note that using sort to get consecutive values in x alphabetizes the labels:
> my.other.chunk <- function(x, n) split(x, sort(rep(c("tom", "dick", "harry"), each=n, len=length(x))))
> my.other.chunk(x, n)
$dick
[1] 1 2 3
$harry
[1] 4 5 6
$tom
[1] 7 8 9 10
Yet another possibility is the splitIndices function from package parallel:
library(parallel)
splitIndices(20, 3)
Gives:
[[1]]
[1] 1 2 3 4 5 6 7
[[2]]
[1] 8 9 10 11 12 13
[[3]]
[1] 14 15 16 17 18 19 20
NB: this works only with numeric values though. If you want to split a character vector, you would need to do some indexing: lapply(splitIndices(20, 3), \(x) letters[1:20][x])
You could combine the split/cut, as suggested by mdsummer, with quantile to create even groups:
split(x,cut(x,quantile(x,(0:n)/n), include.lowest=TRUE, labels=FALSE))
This gives the same result for your example, but not for skewed variables.
split(x,matrix(1:n,n,length(x))[1:length(x)])
perhaps this is more clear, but the same idea:
split(x,rep(1:n, ceiling(length(x)/n),length.out = length(x)))
if you want it ordered,throw a sort around it
Here's another variant.
NOTE: with this sample you're specifying the CHUNK SIZE in the second parameter
all chunks are uniform, except for the last;
the last will at worst be smaller, never bigger than the chunk size.
chunk <- function(x,n)
{
f <- sort(rep(1:(trunc(length(x)/n)+1),n))[1:length(x)]
return(split(x,f))
}
#Test
n<-c(1,2,3,4,5,6,7,8,9,10,11)
c<-chunk(n,5)
q<-lapply(c, function(r) cat(r,sep=",",collapse="|") )
#output
1,2,3,4,5,|6,7,8,9,10,|11,|
I needed the same function and have read the previous solutions, however i also needed to have the unbalanced chunk to be at the end i.e if i have 10 elements to split them into vectors of 3 each, then my result should have vectors with 3,3,4 elements respectively. So i used the following (i left the code unoptimised for readability, otherwise no need to have many variables):
chunk <- function(x,n){
numOfVectors <- floor(length(x)/n)
elementsPerVector <- c(rep(n,numOfVectors-1),n+length(x) %% n)
elemDistPerVector <- rep(1:numOfVectors,elementsPerVector)
split(x,factor(elemDistPerVector))
}
set.seed(1)
x <- rnorm(10)
n <- 3
chunk(x,n)
$`1`
[1] -0.6264538 0.1836433 -0.8356286
$`2`
[1] 1.5952808 0.3295078 -0.8204684
$`3`
[1] 0.4874291 0.7383247 0.5757814 -0.3053884
Simple function for splitting a vector by simply using indexes - no need to over complicate this
vsplit <- function(v, n) {
l = length(v)
r = l/n
return(lapply(1:n, function(i) {
s = max(1, round(r*(i-1))+1)
e = min(l, round(r*i))
return(v[s:e])
}))
}
Sorry if this answer comes so late, but maybe it can be useful for someone else. Actually there is a very useful solution to this problem, explained at the end of ?split.
> testVector <- c(1:10) #I want to divide it into 5 parts
> VectorList <- split(testVector, 1:5)
> VectorList
$`1`
[1] 1 6
$`2`
[1] 2 7
$`3`
[1] 3 8
$`4`
[1] 4 9
$`5`
[1] 5 10
Credit to #Sebastian for this function
chunk <- function(x,y){
split(x, factor(sort(rank(row.names(x))%%y)))
}
If you don't like split() and you don't mind NAs padding out your short tail:
chunk <- function(x, n) { if((length(x)%%n)==0) {return(matrix(x, nrow=n))} else {return(matrix(append(x, rep(NA, n-(length(x)%%n))), nrow=n))} }
The columns of the returned matrix ([,1:ncol]) are the droids you are looking for.
I need a function that takes the argument of a data.table (in quotes) and another argument that is the upper limit on the number of rows in the subsets of that original data.table. This function produces whatever number of data.tables that upper limit allows for:
library(data.table)
split_dt <- function(x,y)
{
for(i in seq(from=1,to=nrow(get(x)),by=y))
{df_ <<- get(x)[i:(i + y)];
assign(paste0("df_",i),df_,inherits=TRUE)}
rm(df_,inherits=TRUE)
}
This function gives me a series of data.tables named df_[number] with the starting row from the original data.table in the name. The last data.table can be short and filled with NAs so you have to subset that back to whatever data is left. This type of function is useful because certain GIS software have limits on how many address pins you can import, for example. So slicing up data.tables into smaller chunks may not be recommended, but it may not be avoidable.
I have come up with this solution:
require(magrittr)
create.chunks <- function(x, elements.per.chunk){
# plain R version
# split(x, rep(seq_along(x), each = elements.per.chunk)[seq_along(x)])
# magrittr version - because that's what people use now
x %>% seq_along %>% rep(., each = elements.per.chunk) %>% extract(seq_along(x)) %>% split(x, .)
}
create.chunks(letters[1:10], 3)
$`1`
[1] "a" "b" "c"
$`2`
[1] "d" "e" "f"
$`3`
[1] "g" "h" "i"
$`4`
[1] "j"
The key is to use the seq(each = chunk.size) parameter so make it work. Using seq_along acts like rank(x) in my previous solution, but is actually able to produce the correct result with duplicated entries.
Here's yet another one, allowing you to control if you want the result ordered or not:
split_to_chunks <- function(x, n, keep.order=TRUE){
if(keep.order){
return(split(x, sort(rep(1:n, length.out = length(x)))))
}else{
return(split(x, rep(1:n, length.out = length(x))))
}
}
split_to_chunks(x = 1:11, n = 3)
$`1`
[1] 1 2 3 4
$`2`
[1] 5 6 7 8
$`3`
[1] 9 10 11
split_to_chunks(x = 1:11, n = 3, keep.order=FALSE)
$`1`
[1] 1 4 7 10
$`2`
[1] 2 5 8 11
$`3`
[1] 3 6 9
Not sure if this answers OP's question, but I think the %% can be useful here
df # some data.frame
N_CHUNKS <- 10
I_VEC <- 1:nrow(df)
df_split <- split(df, sort(I_VEC %% N_CHUNKS))
This splits into chunks of size ⌊n/k⌋+1 or ⌊n/k⌋ and does not use the O(n log n) sort.
get_chunk_id<-function(n, k){
r <- n %% k
s <- n %/% k
i<-seq_len(n)
1 + ifelse (i <= r * (s+1), (i-1) %/% (s+1), r + ((i - r * (s+1)-1) %/% s))
}
split(1:10, get_chunk_id(10,3))

Resources