First and Last value before NA - r

I am trying to obtain the first and last value for different segments before an NA value in a vector. Here is an example:
xx = seq(1, 122, by = 1)
xx[c(2:10, 14, 45:60, 120:121)] = NA
In turn, my results we would 1; 11 and 13; 15 and 44; 61 and 119; 122.

Using a c++ function to do some looping will be fast on a large set.
This function returns a 2-column matrix, the first column gives the 'start' of the sequence of numbers, the second column gives the 'end' of the sequence.
library(Rcpp)
cppFunction('NumericMatrix naSeq(NumericVector myVec) {
int n = myVec.size();
NumericVector starts(n); // pre-allocate
NumericVector ends(n); // pre-allocate
starts.fill(NumericVector::get_na());
ends.fill(NumericVector::get_na());
int startCounter = 0;
int endCounter = 0;
bool firstNumber = !NumericVector::is_na(myVec[0]); // initialise based on first value
// groups are considered sequential numbers without an NA between them
for (int i = 0; i < (n-1); i++) {
if ( !NumericVector::is_na(myVec[i]) && NumericVector::is_na(myVec[i+1]) ) {
if (i == 0 && firstNumber) {
startCounter++;
}
ends[endCounter] = i + 1;
endCounter++;
}
if (NumericVector::is_na(myVec[i]) && !NumericVector::is_na(myVec[i+1]) ) {
if ( i == 0 && !firstNumber){
endCounter++;
}
starts[startCounter] = i + 2;
startCounter++;
}
}
int matSize = startCounter > endCounter ? startCounter : endCounter;
IntegerVector idx = seq(0, matSize);
NumericMatrix m(matSize, 2);
starts = starts[idx];
ends = ends[idx];
m(_, 0) = starts;
m(_, 1) = ends;
return m;
}')
naSeq(xx)
which gives
# [,1] [,2]
# [1,] NA 1
# [2,] 11 13
# [3,] 15 44
# [4,] 61 119
# [5,] 122 NA
Benchmarking
If you do care about speed, here's a quick benchmark of the solutions. Note that I'm taking the functions as-is from each answer, regardless of the format (or even content) of the result of each function.
library(microbenchmark)
set.seed(123)
xx <- seq(1:1e6)
naXX <- sample(xx, size = 1e5)
xx[naXX] <- NA
mb <- microbenchmark(
late = { latemail(xx) },
sym = { naSeq(xx) },
www = { www(xx) },
mkr = { mkr(xx) },
times = 5
)
print(mb, order = "median")
# Unit: milliseconds
# expr min lq mean median uq max neval
# sym 22.66139 23.26898 27.18414 23.48402 27.85917 38.64716 5
# www 45.11008 46.69587 55.73575 56.97421 61.63140 68.26719 5
# mkr 369.69303 384.15262 427.35080 392.26770 469.59242 521.04821 5
# late 2417.21556 2420.25472 2560.41563 2627.19973 2665.19272 2672.21543 5
Using
latemail <- function(xx) {
nas <- is.na(xx)
by(xx[!nas], cumsum(nas)[!nas], function(x) x[unique(c(1,length(x)))] )
}
www <- function(xx) {
RLE <- rle(is.na(xx))
L <- RLE$lengths
Index <- cumsum(L[-length(L)]) + (1:(length(L) - 1) + 1) %% 2
matrix(c(Index[1], NA, Index[2:length(Index)], NA), ncol = 2, byrow = TRUE)
}
library(dplyr)
mkr <- function(xx) {
df <- data.frame(xx = xx)
df %>% mutate(value = ifelse(is.na(xx), ifelse(!is.na(lag(xx)), lag(xx),
ifelse(!is.na(lead(xx)),lead(xx), NA)), NA)) %>%
select(value) %>%
filter(!is.na(value))
}

Make a counter that is constant for groups of non-NA values - cumsum(nas)[!nas], then take the first and last value in each group of non-NA values:
nas <- is.na(xx)
by(xx[!nas], cumsum(nas)[!nas], function(x) x[unique(c(1,length(x)))] )
#cumsum(nas)[!nas]: 0
#[1] 1
#--------------
#cumsum(nas)[!nas]: 9
#[1] 11 13
#--------------
#cumsum(nas)[!nas]: 10
#[1] 15 44
#--------------
#cumsum(nas)[!nas]: 26
#[1] 61 119
#--------------
#cumsum(nas)[!nas]: 28
#[1] 122
If speed is a concern, by can be a fair bit slower than splitting and lapplying:
lapply(split(xx[!nas], cumsum(nas)[!nas]), function(x) x[unique(c(1,length(x)))] )

The easiest solution I can think of is using tidyverse. First create a data.frame using vector from OP. Then add (mutate) a column with desired values.
The use of lead and lag will provide option to get hold of non-NA value from either previous or next row. The rows corresponding to NA will have NA value which can be later filter out.
library(tidyverse)
xx = seq(1, 122, by = 1)
xx[c(2:10, 14, 45:60, 120:121)] = NA
df <- data.frame(xx = xx)
df %>% mutate(value = ifelse(is.na(xx), ifelse(!is.na(lag(xx)), lag(xx),
ifelse(!is.na(lead(xx)),lead(xx), NA)), NA)) %>%
select(value) %>%
filter(!is.na(value))
#Result
# value
#1 1
#2 11
#3 13
#4 44
#5 61
#6 119
#7 122

We can use rle and cumsum.
RLE <- rle(is.na(xx))
L <- RLE$lengths
Index <- c(1, cumsum(L) + (1:length(L) + 1) %% 2)
matrix(Index, ncol = 2, byrow = TRUE)
# [,1] [,2]
# [1,] 1 1
# [2,] 11 13
# [3,] 15 44
# [4,] 61 119
# [5,] 122 122
Explanation
rle(is.na(xx)) creates the run-length-encoding of is.na(xx), which contains the length of each NA and non-NA group.
RLE <- rle(is.na(xx))
RLE
# Run Length Encoding
# lengths: int [1:9] 1 9 3 1 30 16 59 2 1
# values : logi [1:9] FALSE TRUE FALSE TRUE FALSE TRUE ...
L <- RLE$lengths extracts the length of each group.
L <- RLE$lengths
L
# [1] 1 9 3 1 30 16 59 2 1
cumsum(L) calculates the cumulative sum all the length to obtain the index.
cumsum(L)
# [1] 1 10 13 14 44 60 119 121 122
We then need to add one for those even index numbers. So we used (1:length(L) + 1) %% 2 to specify that.
(1:(length(L) - 1) + 1) %% 2
# [1] 0 1 0 1 0 1 0 1 0
By combing the above two vectors, we can get the final result.
Index <- c(1, cumsum(L) + (1:length(L) + 1) %% 2)
Index
# [1] 1 1 11 13 15 44 61 119 122 122
Finally, I used matrix(Index, ncol = 2, byrow = TRUE) just to view the results more clearly. Each row represents one group. The first column indicates the beginning index of each group, while the second column indicates the end of each group.
matrix(Index, ncol = 2, byrow = TRUE)
# [,1] [,2]
# [1,] 1 1
# [2,] 11 13
# [3,] 15 44
# [4,] 61 119
# [5,] 122 122

Related

R how to find a series of common values in a vector (identifying growing season)

I'm looking for a way to identify a growing season which consists of a number of days greater than say 60 between the last frost day of spring and the first frost day in the fall. A general version of this problem is this. If I have a vector of numbers like testVec, I want the item numbers of the beginning and end range of values where the number of items is 5 or greater and all of them are greater than 0.
testVec <- c(1,3,4,0, 1, -5, 6, 0, 1,3,4,6,7,5,9, 0)
In this example, the relevant range is 1,3,4,6,7,5,9 which is testVec[9] to testVec[15]
One option could be:
testVec[with(rle(testVec > 0), rep(lengths * values >= 5, lengths))]
[1] 1 3 4 6 7 5 9
Here, the idea is to, first, create runs of values that are smaller or equal to zero and bigger than zero. Second, it checks whether the runs of values bigger than zero are of length 5 or more. Finally, it subsets the original vector for the runs of values bigger than zero with length 5 or more.
1) rleid This also handles any number of sequences including zero. rleid(ok) is a vector the same length as ok such that the first run of identical elements is replaced with 1, the second run with 2 and so on. The result is a list of vectors where each vector has its positions in the original input as its names.
library(data.table)
getSeq <- function(x) {
names(x) <- seq_along(x)
ok <- x > 0
s <- split(x[ok], rleid(ok)[ok])
unname(s)[lengths(s) >= 5]
}
getSeq(testVec)
## [[1]]
## 9 10 11 12 13 14 15
## 1 3 4 6 7 5 9
getSeq(numeric(16))
## list()
getSeq(c(testVec, 10 * testVec))
## [[1]]
## 9 10 11 12 13 14 15
## 1 3 4 6 7 5 9
##
## [[2]]
## 25 26 27 28 29 30 31
## 10 30 40 60 70 50 90
If a data frame were desired then following gives the values and which sequence the row came from. The row names indicate the positions in the original input.
gs <- getSeq(c(testVec, 10 * testVec))
names(gs) <- seq_along(gs)
if (length(gs)) stack(gs) else gs
## values ind
## 9 1 1
## 10 3 1
## 11 4 1
## 12 6 1
## 13 7 1
## 14 5 1
## 15 9 1
## 25 10 2
## 26 30 2
## 27 40 2
## 28 60 2
## 29 70 2
## 30 50 2
## 31 90 2
2) gregexpr Replace each element that is > 0 with 1 and each other element with 0 pasting the 0's and 1's into a single character string. Then use gregexpr to look for sequences of 1's at least 5 long and for the ith such nonoverlapping sequence return the first positions, g, and lengths, attr(g, "match.length"). Define a function vals which extracts the values at the required positions from testVec of the ith such nonoverlapping sequence returning a list such that the ith component of the list is the ith such sequence. The names in the output vector are its positions in the input.
getSeq2 <- function(x) {
g <- gregexpr("1{5,}", paste(+(x > 0), collapse = ""))[[1]]
vals <- function(i) {
ix <- seq(g[i], length = attr(g, "match.length")[i])
setNames(x[ix], ix)
}
if (length(g) == 1 && g == -1) list() else lapply(seq_along(g), vals)
}
getSeq2(testVec)
## [[1]]
## 9 10 11 12 13 14 15
## 1 3 4 6 7 5 9
The above handles any number of sequences including 0 but if we knew there were exactly one sequence (which is the case for the example in the question) then it could be simplified to the following where the return value is just that vector:
g <- gregexpr("1{5,}", paste(+(testVec > 0), collapse = ""))[[1]]
ix <- seq(g, length = attr(g, "match.length"))
setNames(testVec[ix], ix)
## 9 10 11 12 13 14 15
## 1 3 4 6 7 5 9
You could "fix" #tmfmnk's solution like this:
f1 <- function(x, threshold, n) {
range(which(with(rle(x > threshold), rep(lengths * values >= n, lengths))))
}
x <- c(1, 3, 4, 0, 1, -5, 6, 0, 1,3,4,6,7,5,9, 0)
f1(x, 0, 5)
#[1] 9 15
But that does not work well when there are multiple runs
xx <- c(x, x)
f1(xx, 0, 5)
#[1] 9 31
Here is another, not so concise approach that returns the start and end of the longest run (the first one if there are ties).
f2 <- function(x, threshold, n) {
y <- x > threshold
y[is.na(y)] <- FALSE
a <- ave(y, cumsum(!y), FUN=cumsum)
m <- max(a)
if (m < n) return (c(NA, NA))
i <- which(a == m)[1]
c(i-m+1, i)
}
f2(x, 0, 5)
#[1] 9 15
f2(xx, 0, 5)
#[1] 9 15
or with rle
f3 <- function(x, threshold, n) {
y <- x > threshold
r <- rle(y)
m <- max(r$lengths)
if (m < n) return (c(NA, NA))
i <- sum(r$lengths[1:which.max(r$lengths)[1]])
c(i-max(r$lengths)+1, i)
}
f3(x, 0, 5)
#[1] 9 15
f3(xx, 0, 5)
#[1] 9 15
If you wanted the first run that is at least n, that is you do not want a next run, even if it is longer, you could do
f4 <- function(x, threshold, n) {
y <- with(rle(x > threshold), rep(lengths * values >= n, lengths))
i <- which(y)[1]
j <- i + which(!y[-c(1:i)])[1] - 1
c(i, j)
}

Find pattern and filter the start location

I would like to find the location of a pattern AND filter the location.
I'm looking for a function to return the start location of the pattern "gaaa" between 30 and 34 for each row.
I explain, for the moment here is what I have as a result with the function str_locate_all :
library(stringr)
Sequence <- data.frame(All = c("ggcgaagcagugcucccaguguuuuagagcuagaaauagcaaguuaaaauaaggcuaguccguuaucaacuugaaaaaguggcaccgagucggugcuu",
"aggacaacucgcuccacggccguuuuagagcuagaaauagcaaguuaaaauaaggcuaguccguuaucaacuugaaaaaguggcaccgagucggugcuu",
"cugaaauggcagcagaaacguuuuagagcuagaaauagcaaguuaaaauaaggcuaguccguuaucaacuugaaaaaguggcaccgagucggugcaacaaa",
"ggucaaagaggaggagcucguuuuagagcuagaaauagcaaguuaaaauaaggcuaguccguuaucaacuugaaaaaguggcaccgagucggugcuu"))
str_locate_all(pattern = 'gaaa', Sequence$All)
[[1]]
start end
[1,] 33 36
[2,] 73 76
[[2]]
start end
[1,] 34 37
[2,] 74 77
[[3]]
start end
[1,] 3 6
[2,] 15 18
[3,] 32 35
[4,] 72 75
[[4]]
start end
[1,] 32 35
[2,] 72 75
Here is what I would like to have as a result:
start
1 33
2 34
3 32
4 32
Thanks you!
Here is a way. It uses the output of the str_locate_all instruction in the question and filters it in a lapply loop.
found <- str_locate_all(pattern = 'gaaa', Sequence$All)
found <- lapply(found, function(x){
y <- x[, 'start']
data.frame(start = y[y >= 30 & y <= 34])
})
do.call(rbind, found)
# start
#1 33
#2 34
#3 32
#4 32
Here is another way. It searches only a sub string of the original string.
first <- 30
last <- 34
tmp <- substr(Sequence$All, first, last + nchar('gaaa') - 1)
data.frame(start = str_locate(pattern = 'gaaa', tmp)[, 1] + first - 1)
#Timings
Here are the timings of the 3 answers available so far, r2evans, mine and tmfmnk.
I only post the results with larger input, since that's what should make the timings important.
library(stringr)
library(dplyr)
library(purrr)
r2evans <- function(){
Sequence$start <-
sapply(str_locate_all(pattern = 'gaaa', Sequence$All),
function(z) { ind <- which(30 <= z[,1] & z[,1] <= 34); if (length(ind)) z[ind[1],1] else NA })
Sequence[,2,drop=FALSE]
}
rui <- function(){
first <- 30
last <- 34
tmp <- substr(Sequence$All, first, last + nchar('gaaa') - 1)
data.frame(start = str_locate(pattern = 'gaaa', tmp)[, 1] + first - 1)
}
tmfmnk <- function(){
map_dfr(.x = str_locate_all(pattern = "gaaa", Sequence$All),
~ as.data.frame(.x) %>%
filter(start %in% c(30:34)),
.id = "ID")
}
library(microbenchmark)
for(i in 1:8) Sequence <- rbind(Sequence, Sequence)
dim(Sequence)
#[1] 1024 1
mb <- microbenchmark(
revans = f1(),
rui = f2()
tmfmnk = f3()
)
print(mb, unit = 'relative', order = 'median')
#Unit: relative
# expr min lq mean median uq max neval
# rui 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 100
# r2evans 19.66135 17.52724 16.28008 15.47317 16.20747 5.60779 100
# tmfmnk 1529.51644 1235.86285 1079.56958 1073.49131 1072.39265 317.95638 100
Sequence$start <-
sapply(str_locate_all(pattern = 'gaaa', Sequence$All),
function(z) { ind <- which(30 <= z[,1] & z[,1] <= 34); if (length(ind)) z[ind[1],1] else NA })
Sequence[,2,drop=FALSE]
# start
# 1 33
# 2 34
# 3 32
# 4 32
One dplyr and purrr solution could be:
map_dfr(.x = str_locate_all(pattern = "gaaa", Sequence$All),
~ as.data.frame(.x) %>%
filter(start %in% c(30:34)),
.id = "ID")
ID start end
1 1 33 36
2 2 34 37
3 3 32 35
4 4 32 35

Minimum Cost Distance in Matrix

I have got this matrix below
k
[,1] [,2] [,3] ,4][,5] [,6]
[1,] 1 4 9 16 25 36
[2,] 1 3 7 13 21 31
[3,] 2 2 5 10 17 26
[4,] 4 2 4 8 14 22
[5,] 7 3 3 6 11 18
[6,] 11 5 3 5 9 15
and I want to loop through starting from k[1,1] and ending at k[6,6]. My looping criteria is based on min(k[i,j+1], k[i+1,j], k[i+1, j+1]) and the answer I hope to get is something like 1+1+2+2+3+3+5+9+15 = 41 (travelling through the minimum path)
So pretty much it calculates the minimum starting from k[1,1] and then continues downwards till k[6,6]
warpingDist = function(x, y, z){
mincal = numeric(length(k))
m = nrow(k)
n = ncol(k)
i=1
j=1
mincal = which(k == min(k[i, j+1], k[i+1, j], k[i+1, j+1]), arr.ind = TRUE)
indx = data.frame(mincal)
i= indx$row
j= indx$col
if(i != m || j!=n)
{
warpingDist(k[i, j+1], k[i+1, j], k[i+1, j+1])
}
warpSum = sum(mincal)
return(warpSum)
}
value = apply(k, c(1,2), warpingDist)
value
When I run this code it displays the below:
Error: object 'value' not found
Not sure why this is happening...
As you don't provide a minimal reproducible example, I can only guess:
warpingDist = function(x, y, z, k){
# browser() # This is a good option to activate, if you run your script in RStudio
...
return(warpSum)
}
# your code
k <- whatever it is
result <- warpingDist(x, y, z, k)
I hope that helps.
Am glad, I was finally able to solve the problem...The code runs fast as well
Problem: To find the minimum cost for a matrix. For clarity, let's assume I have the matrix given below:
[1,] 1 4 6 7 8 9 0
[2,] 10 12 1 3 11 2 0
[3,] 11 12 2 8 17 1 0
[4,] 20 1 18 4 28 1 0
[5,] 5 20 80 6 9 3 0
My goal is to add the minimum path distance starting from kata[1,1] first row to the last row K[5,4]. So effectively, I want to have something like 1 + 4 + 1 + 2 + 4 + 6 + 9 + 3.
Below is the R code which I have used to implement this. It implements two functions:
# Function that calculates minimum of three values. Returns the Value.
minFUN <- function(Data, a, b){
d = (min(Data[a, b+1], Data[a+1, b], Data[a+1, b+1]))
return(d)
}
# Function that calculates the index of the minimum value, from which the
# The next iteration begins
NextRC <- function(Data, a, b){
d = min(Data[a, b+1], Data[a+1, b], Data[a+1, b+1])
if(d == Data[a, b+1]){
c = cbind(a, b+1)
}else
if(d == Data[a+1, b]){
c = cbind(a+1, b)
} else
if(d == Data[a+1, b+1]){
c = cbind(a+1, b+1)
}
return(c)
}
Je <- c()
NewRow = 1
NewCol = 1
# Warping Function that uses both functions above to loop through the dataset
WarpDist <- function(Data, a = NewRow, b = NewCol){
for(i in 1:4) {
Je[i] = minFUN(Data, a, b)
# Next Start Point
NSP = NextRC(Data, a,b)
NewRow = as.numeric(NSP[1,1])
NewCol = as.numeric(NSP[1,2])
a = NewRow
b = NewCol
}
return(Je)
}
Value=WarpDist(Data = Data, a = NewRow, b = NewCol)
warpo = Data[1,1] + sum(Value)
w = sqrt(warpo)
The result is the minimum path from the first row to the last row
Value
[1] 4 1 2 4 6
The result omits 9 and 3 because its already on the last row.
Time:
Time difference of 0.08833408 secs

Tracing the location index of the components of a cumulative sum of sequential lists elements

I have the list below and I would like to trace from the final result from the function at the bottom of the writing the location at each steep of each sublist DF3 that sequentially cumsums to 221. In other words tracing the location index of the components of a cumulative sum of each sequential lists elements.
# List
DF3 <- list( c ( 12 ,35 ,90 ,33 ,51 ) , c ( 44 , 3 ,88 ,35 ,51 ) , c(12 ,16 ,6 ,10 ,3 ,12 ,2 ,6 ,9 ,4 ,4 ,51 ,13 ,22 ,51 ))
DF3
[[1]]
12 35 90 33 51
[[2]]
44 3 88 35 51
[[3]]
12 16 6 10 3 12 2 6 9 4 4 51 13 22 51
# Obtain the list of elements to add to the prior list of elements sequentially.
fun <- function (x) tail ( DF3[[x]] , length ( DF3[[x]] ))
S <- lapply ( seq ( length ( DF3 ))[ 1 : ( max (length ( DF3 )))] , fun )[-1]
fun <- function (x) tail (S[[x]] , length( S[[x]])-x)
SS <- c( DF3[[1]][[1]], lapply ( seq ( length ( DF3 )-1), fun ))
SS
[[1]]
12
[[2]]
3 88 35 51
[[3]]
6 10 3 12 2 6 9 4 4 51 13 22 51
# CumSum of elements
D1 <- ( SS[[1]] + SS[[2]] )
a <- seq(nrow(expand.grid(SS[[1]]+SS[[2]],SS[[3]])))
fun <- function (x) sum(expand.grid ( SS[[1]] + SS[[2]] , SS[[3]] )[,1][x], expand.grid ( SS[[1]] + SS[[2]] , SS[[3]] )[,2][x] )
D2 <- unlist(lapply (a , fun))
# get results
D1
D2
# Those cs that add up to 106
which ( D2 == 106 )
# Tracing back the cumsum result from above in DF3
DF3
[[1]]
**12** 35 90 33 51
[[2]]
44 3 **88** 35 51
[[3]]
12 16 **6** 10 3 12 2 **6** 9 4 4 51 13 22 51
I would like to obtain the index of each list as per DF3 which adds up to the value -- which ( D2 == 106 ) --
# Result would be
DF3[[1]][[1]]
DF3[[2]][[3]]
DF3[[3]][[3]]
EDIT 2
The way I am using the fist example is via a list with several sublists, so I do the following. However produces the error at the end of the page as it seems it is not supoorted at present, Would you kinldy let me know how could you find out how would you do it its implementation modus.
# problem repeated values.
fun <- function (x) lapply( values[[x]] , function(a) data.table(a,seq_along( a )))
index_map <- lapply ( seq(length( values )) , fun )
fun <- function(x) lapply ( names ( values[[x]] ), function(b) c(b, sub ( "V" , "index", b )))
nams<- lapply ( seq ( length ( values )) , fun )
fun <- function (x) Map ( setnames , index_map[[x]] , nams[[x]] )
value <- lapply ( seq ( length ( values )) , fun )
# merge.
for ( index in index_map )
fun <- function (x) merge ( values[[x]] , index , by = names(index[[x]]) [1])
value <- lapply ( seq ( length ( values )) , fun )
value
# Error in setkeyv(copy(y), by) :
Column 'V1' is type 'list' which is not supported as a key column type, currently.
I think what you want to do is the following.
You have a list of vectors (Df3) and target value (106). You want to pick a number from each of the vectors in the list so that they add up to the target value. And you also want the location of the numbers, not only the numbers themselves.
If my interpretation is wrong, please clarify.
Assuming I got it right, I suggest two approaches.
Define Inputs
Let me define the input_list and total_value for the clarity of exposition.
input_list is identical to DF3.
input_list <- list(
c(12, 35, 90, 33, 51),
c(44, 3, 88, 35, 51),
c(12, 16, 6, 10, 3, 12, 2, 6, 9, 4, 4, 51, 13, 22, 51))
total_value <- 106
Method1
First approach is to use data.table package to test all combinations. Thanks to the efficient implementation of the package, this works faster than expand.grid.
library(data.table)
# data.table::CJ is a fast expand.grid
# get all value combinations that add up to the target
DT <- do.call(CJ, args = input_list) # get all combination
values <- unique(DT[rowSums(DT) == total_value])
# make a mapping from value to index
# for each element in the original list
# then rename so that merge works
index_map <- lapply(input_list, function(a)
data.table(a, seq_along(a)))
nams <- lapply(names(values), function(b)
c(b, sub("V", "index", b)))
Map(setnames, index_map, nams)
# merge
for (index in index_map)
values <- merge(values, index, by = names(index)[1])
values
# V3 V2 V1 index1 index2 index3
#1: 4 51 51 5 5 10
#2: 4 51 51 5 5 11
#3: 6 88 12 1 3 3
#4: 6 88 12 1 3 8
#5: 13 3 90 3 2 13
#6: 22 51 33 4 5 14
Method2
Alternatively, you can find the numbers and indices by the recursive functions.
First, this function finds all combinations of numbers.
library(magrittr)
find_comb <- function(input, value, comb = numeric(0)) {
if (length(input)==1) {
if (value %in% input[[1]]) {
c(comb, value)
}
} else {
lapply(unique(input[[1]]), function(x)
find_comb(input[-1], value-x, c(comb, x)))
}
}
find_comb(input_list, total_value) %>%
unlist() %>%
matrix(ncol = length(input_list), byrow = TRUE)
# [,1] [,2] [,3]
#[1,] 12 88 6
#[2,] 90 3 13
#[3,] 33 51 22
#[4,] 51 51 4
The function below finds all combinations of indices.
find_index <- function(input, value, index = numeric(0)) {
if (length(input)==1) {
if (value %in% input[[1]]) {
lapply(which(input[[1]] == value),
function(x) c(index, x))
}
} else {
lapply(seq_along(input[[1]]), function(i)
find_index(input[-1], value-input[[1]][i], c(index, i)))
}
}
find_index(input_list, total_value) %>%
unlist() %>%
matrix(ncol = length(input_list), byrow = TRUE)
# [,1] [,2] [,3]
#[1,] 1 3 3
#[2,] 1 3 8
#[3,] 3 2 13
#[4,] 4 5 14
#[5,] 5 5 10
#[6,] 5 5 11

insert elements in a vector in R

I have a vector in R,
a = c(2,3,4,9,10,2,4,19)
let us say I want to efficiently insert the following vectors, b, and c,
b = c(2,1)
d = c(0,1)
right after the 3rd and 7th positions (the "4" entries), resulting in,
e = c(2,3,4,2,1,9,10,2,4,0,1,19)
How would I do this efficiently in R, without recursively using cbind or so.
I found a package R.basic but its not part of CRAN packages so I thought about using a supported version.
Try this:
result <- vector("list",5)
result[c(TRUE,FALSE)] <- split(a, cumsum(seq_along(a) %in% (c(3,7)+1)))
result[c(FALSE,TRUE)] <- list(b,d)
f <- unlist(result)
identical(f, e)
#[1] TRUE
EDIT: generalization to arbitrary number of insertions is straightforward:
insert.at <- function(a, pos, ...){
dots <- list(...)
stopifnot(length(dots)==length(pos))
result <- vector("list",2*length(pos)+1)
result[c(TRUE,FALSE)] <- split(a, cumsum(seq_along(a) %in% (pos+1)))
result[c(FALSE,TRUE)] <- dots
unlist(result)
}
> insert.at(a, c(3,7), b, d)
[1] 2 3 4 2 1 9 10 2 4 0 1 19
> insert.at(1:10, c(4,7,9), 11, 12, 13)
[1] 1 2 3 4 11 5 6 7 12 8 9 13 10
> insert.at(1:10, c(4,7,9), 11, 12)
Error: length(dots) == length(pos) is not TRUE
Note the bonus error checking if the number of positions and insertions do not match.
You can use the following function,
ins(a, list(b, d), pos=c(3, 7))
# [1] 2 3 4 2 1 9 10 2 4 0 1 4 19
where:
ins <- function(a, to.insert=list(), pos=c()) {
c(a[seq(pos[1])],
to.insert[[1]],
a[seq(pos[1]+1, pos[2])],
to.insert[[2]],
a[seq(pos[2], length(a))]
)
}
Here's another function, using Ricardo's syntax, Ferdinand's split and #Arun's interleaving trick from another question:
ins2 <- function(a,bs,pos){
as <- split(a,cumsum(seq(a)%in%(pos+1)))
idx <- order(c(seq_along(as),seq_along(bs)))
unlist(c(as,bs)[idx])
}
The advantage is that this should extend to more insertions. However, it may produce weird output when passed invalid arguments, e.g., with any(pos > length(a)) or length(bs)!=length(pos).
You can change the last line to unname(unlist(... if you don't want a's items named.
The straightforward approach:
b.pos <- 3
d.pos <- 7
c(a[1:b.pos],b,a[(b.pos+1):d.pos],d,a[(d.pos+1):length(a)])
[1] 2 3 4 2 1 9 10 2 4 0 1 19
Note the importance of parenthesis for the boundaries of the : operator.
After using Ferdinand's function, I tried to write my own and surprisingly it is far more efficient.
Here's mine :
insertElems = function(vect, pos, elems) {
l = length(vect)
j = 0
for (i in 1:length(pos)){
if (pos[i]==1)
vect = c(elems[j+1], vect)
else if (pos[i] == length(vect)+1)
vect = c(vect, elems[j+1])
else
vect = c(vect[1:(pos[i]-1+j)], elems[j+1], vect[(pos[i]+j):(l+j)])
j = j+1
}
return(vect)
}
tmp = c(seq(1:5))
insertElems(tmp, c(2,4,5), c(NA,NA,NA))
# [1] 1 NA 2 3 NA 4 NA 5
insert.at(tmp, c(2,4,5), c(NA,NA,NA))
# [1] 1 NA 2 3 NA 4 NA 5
And there's the benchmark result :
> microbenchmark(insertElems(tmp, c(2,4,5), c(NA,NA,NA)), insert.at(tmp, c(2,4,5), c(NA,NA,NA)), times = 10000)
Unit: microseconds
expr min lq mean median uq max neval
insertElems(tmp, c(2, 4, 5), c(NA, NA, NA)) 9.660 11.472 13.44247 12.68 13.585 1630.421 10000
insert.at(tmp, c(2, 4, 5), c(NA, NA, NA)) 58.866 62.791 70.36281 64.30 67.923 2475.366 10000
my code works even better for some cases :
> insert.at(tmp, c(1,4,5), c(NA,NA,NA))
# [1] 1 2 3 NA 4 NA 5 NA 1 2 3
# Warning message:
# In result[c(TRUE, FALSE)] <- split(a, cumsum(seq_along(a) %in% (pos))) :
# number of items to replace is not a multiple of replacement length
> insertElems(tmp, c(1,4,5), c(NA,NA,NA))
# [1] NA 1 2 3 NA 4 NA 5
Here's an alternative that uses append. It's fine for small vectors, but I can't imagine it being efficient for large vectors since a new vector is created upon each iteration of the loop (which is, obviously, bad). The trick is to reverse the vector of things that need to be inserted to get append to insert them in the correct place relative to the original vector.
a = c(2,3,4,9,10,2,4,19)
b = c(2,1)
d = c(0,1)
pos <- c(3, 7)
z <- setNames(list(b, d), pos)
z <- z[order(names(z), decreasing=TRUE)]
for (i in seq_along(z)) {
a <- append(a, z[[i]], after = as.numeric(names(z)[[i]]))
}
a
# [1] 2 3 4 2 1 9 10 2 4 0 1 19

Resources