Problems with control statement if...else in R

I am a beginner in R and here's my code:
for (i in 1:7){
testing<-vector(length=(length(yy)-3))
if(all(yy[i:(i+2)]==0))
testing[i]<-1
else
testing[i]<-NA
}
yy refers to the following vector of length 10:
> yy
[1] 1 0 0 0 0 1 0 1 0 1
testing is like a predict function output, which will predict a 1 if the previous 3 elements in yy are all 0. If not, it will not predict anything, hence NA. Since yy has a total of 10 elements, testing will have a total of 7 elements (therefore length 7). However, instead of giving me an output of 1s and NAs, it is giving this:
> testing
[1] FALSE FALSE FALSE FALSE FALSE FALSE NA
I cannot figure out why; some help would be great. Thank you.

The problem is that testing is re-created at the top of every iteration, which discards the assignments made in all earlier iterations; only the last iteration's assignment survives, and the untouched slots keep the FALSE that vector() uses as the default for a logical vector. You should define testing outside the loop:
testing<-vector(length=(length(yy)-3))
for (i in 1:7){
if(all(yy[i:(i+2)]==0))
testing[i]<-1
else
testing[i]<-NA
}
testing
[1] NA 1 1 NA NA NA NA
For this task you could also use rollapply from zoo (note that rollapply returns one value per complete window, so here the result has length(yy) - 2 = 8 elements rather than the loop's 7):
library(zoo)
rollapply(yy, 3, function(x) ifelse(all(x == 0), 1, NA))
[1] NA 1 1 NA NA NA NA NA

Here are some additional vectorized ways of solving this.
Base R stats::filter
N <- 3
NA^(stats::filter(yy == 0, rep(1, N), sides = 1)[-(1:N-1)] != N)
# [1] NA 1 1 NA NA NA NA NA
data.table::shift
NA^(Reduce(`+`, data.table::shift(yy == 0, 0:(N-1)))[-(1:N-1)] != N)
# [1] NA 1 1 NA NA NA NA NA
RcppRoll::roll_sum
NA^(RcppRoll::roll_sum(yy == 0, N) != N)
# [1] NA 1 1 NA NA NA NA NA
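All three rely on the same arithmetic idiom: for a logical vector, NA^FALSE is NA^0, which R defines as 1, while NA^TRUE is NA^1, which stays NA. So NA^(s != N), where s is any of the rolling sums above, returns 1 exactly where all N values in a window are 0, and NA everywhere else. A quick illustration:
NA^c(TRUE, FALSE, FALSE, TRUE)
# [1] NA  1  1 NA
The [-(1:N-1)] subsetting in the filter and shift versions drops the incomplete leading windows; it relies on : binding tighter than binary -, so 1:N-1 parses as (1:N) - 1 = 0:2, and the zero index is silently ignored.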
Some benchmarks (I've also added a compiled version of the for loop using compiler::cmpfun and two more efficient zoo solutions):
ForLoop <- function(yy, N){
testing<-vector(length=(length(yy)-N))
for (i in 1:length(testing)){
if(all(yy[i:(i+(N-1))]==0))
testing[i]<-1
else
testing[i]<-NA
}
testing
}
ForLoopBin <- compiler::cmpfun(ForLoop)
ZOO <- function(yy, N) zoo::rollapply(yy, N, function(x) ifelse(all(x == 0), 1, NA))
ZOO2 <- function(yy, N) NA^!zoo::rollapply(yy == 0, N, all)
ZOO3 <- function(yy, N) NA^(zoo::rollsum(yy == 0, N) != N)
RCPPROLL <- function(yy, N) NA^(RcppRoll::roll_sum(yy == 0, N) != N)
BaseFilter <- function(yy, N) NA^(stats::filter(yy == 0, rep(1, N), sides = 1)[-(1:N-1)] != N)
DTShift <- function(yy, N) NA^(Reduce(`+`, data.table::shift(yy == 0, 0:(N-1)))[-(1:N-1)] != N)
set.seed(123)
yy <- sample(0:1, 1e4, replace = TRUE)
N <- 3
library(microbenchmark)
microbenchmark(
"for loop" = ForLoop(yy, N),
"Compiled for loop" = ForLoopBin(yy, N),
"zoo1" = ZOO(yy, N),
"zoo2" = ZOO2(yy, N),
"zoo3" = ZOO3(yy, N),
"Rcpproll" = RCPPROLL(yy, N),
"stats::filter" = BaseFilter(yy, N),
"data.table::shift" = DTShift(yy, N)
)
# Unit: microseconds
# expr min lq mean median uq max neval cld
# for loop 25917.837 26858.936 30157.3927 28546.2595 29334.2430 110135.205 100 d
# Compiled for loop 7559.837 8208.142 9709.7256 8882.6875 9428.9155 22683.347 100 c
# zoo1 101699.548 107857.014 112210.5929 110402.3985 113335.7745 171537.068 100 f
# zoo2 72265.949 77788.709 81275.9028 80292.8135 81917.8985 153197.948 100 e
# zoo3 4584.861 4734.778 4939.3528 4785.9770 4853.6560 13228.514 100 b
# Rcpproll 216.636 246.076 290.7211 290.0745 311.3540 663.667 100 a
# stats::filter 425.912 475.350 536.0757 509.5900 544.6295 1497.568 100 a
# data.table::shift 334.394 365.593 443.2138 409.4325 424.6320 1944.279 100 a


Closest subsequent index for a specified value

Consider a vector:
int = c(1, 1, 0, 5, 2, 0, 0, 2)
I'd like to get the closest subsequent index (not the difference) for a specified value. The first parameter of the function should be the vector, while the second should be the value one wants to see the closest subsequent elements.
For instance,
f(int, 0)
# [1] 2 1 0 2 1 0 0 NA
Here, the first element of the vector (1) is two positions away from the first subsequent 0 (3 - 1 = 2), so it should return 2. Then the second element is 1 position away from a 0 (2 - 1 = 1). When there are no subsequent values that match the specified value, return NA (here that is the case for the last element, because no subsequent value is 0).
Other examples:
f(int, 1)
# [1] 0 0 NA NA NA NA NA NA
f(int, 2)
# [1] 4 3 2 1 0 2 1 0
f(int, 3)
# [1] NA NA NA NA NA NA NA NA
This should also work for character vectors:
char = c("A", "B", "C", "A", "A")
f(char, "A")
# [1] 0 2 1 0 0
Find the location of each value (numeric or character)
int = c(1, 1, 0, 5, 2, 0, 0, 2)
value = 0
idx = which(int == value)
## [1] 3 6 7
Expand the index to indicate the nearest value of interest, using an NA after the last value in int.
nearest = rep(NA, length(int))
nearest[1:max(idx)] = rep(idx, diff(c(0, idx)))
## [1] 3 3 3 6 6 6 7 NA
Use simple arithmetic to find the difference between the index of the current value and the index of the nearest value
abs(seq_along(int) - nearest)
## [1] 2 1 0 2 1 0 0 NA
Written as a function
f <- function(x, value) {
idx = which(x == value)
nearest = rep(NA, length(x))
if (length(idx)) # non-NA values only if `value` in `x`
nearest[1:max(idx)] = rep(idx, diff(c(0, idx)))
abs(seq_along(x) - nearest)
}
We have
> f(int, 0)
[1] 2 1 0 2 1 0 0 NA
> f(int, 1)
[1] 0 0 NA NA NA NA NA NA
> f(int, 2)
[1] 4 3 2 1 0 2 1 0
> f(char, "A")
[1] 0 2 1 0 0
> f(char, "B")
[1] 1 0 NA NA NA
> f(char, "C")
[1] 2 1 0 NA NA
The solution doesn't involve recursion or R-level loops, so it should be fast even for long vectors.
Look for a match from the nth position to the end of the vector, then take the first one:
f <- function(v, x){
sapply(seq_along(v), function(i){
which(v[ i:length(v) ] == x)[ 1 ] - 1
})
}
f(int, 0)
# [1] 2 1 0 2 1 0 0 NA
f(int, 1)
# [1] 0 0 NA NA NA NA NA NA
f(int, 2)
# [1] 4 3 2 1 0 2 1 0
f(int, 3)
# [1] NA NA NA NA NA NA NA NA
f(char, "A")
# [1] 0 2 1 0 0
Using sequence:
f <- function(v, x){
d = diff(c(0, which(v == x)))
vec <- sequence(d, d-1, by = -1)
length(vec) <- length(v)
vec
}
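(The from and by arguments of sequence() require R >= 4.0.0.) Each run of length d[i] ends at a match, so counting down from d[i] - 1 to 0 gives the distances within that run, and the length<- assignment pads the positions after the last match with NA. A sketch of the intermediate values for int and value 0:
d <- diff(c(0, which(int == 0)))
d
# [1] 3 3 1
sequence(d, d - 1, by = -1)
# [1] 2 1 0 2 1 0 0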
Output
int = c(1, 1, 0, 5, 2, 0, 0, 2)
char = c("A", "B", "C", "A", "A")
f(int, 0)
# [1] 2 1 0 2 1 0 0 NA
f(int, 1)
# [1] 0 0 NA NA NA NA NA NA
f(int, 2)
# [1] 4 3 2 1 0 2 1 0
f(char, "A")
# [1] 0 2 1 0 0
Benchmark (n = 1000):
set.seed(123)
int = sample(0:100, size = 1000, replace = T)
library(microbenchmark)
bm <- microbenchmark(
fSequence(int, 0),
fzx8754(int, 0),
fRecursive(int, 0),
fMartinMorgan(int, 0),
fMap2dbl(int, 0),
fReduce(int, 0),
fAve(int, 0),
fjblood94(int, 0),
times = 10L,
setup = gc(FALSE)
)
autoplot(bm)
Martin Morgan's solution seems to be the quickest, followed by this answer's sequence solution, sbarbit's recursive solution, and jblood94's for loop solution.
Functions used:
fSequence <- function(v, x){
vec <- sequence(diff(c(0, which(v == x))), diff(c(0, which(v == x))) - 1, by = -1)
length(vec) <- length(v)
vec
}
fzx8754 <- function(v, x){
sapply(seq_along(v), function(i){
which(v[ i:length(v) ] == x)[ 1 ] - 1
})
}
fRecursive <- function(lookup,val ) {
ind <- which(lookup == val)[1] -1
if (length(lookup) > 1) {
c(ind, fRecursive(lookup[-1], val))
} else {
ind
}
}
fMartinMorgan <- function(x, value) {
idx = which(x == value)
nearest = rep(NA, length(x))
nearest[1:max(idx)] = rep(idx, diff(c(0, idx)))
abs(seq_along(x) - nearest)
}
fMap2dbl <- function(int, num)
{
n <- length(int)
map2_dbl(num, 1:n, ~ ifelse(length(which(.x == int[.y:n])) == 0, NA,
min(which(.x == int[.y:n])) - 1))
}
fReduce <- function(vec, value) {
replace(
Reduce(
function(x, y)
x + (y * x) ,
vec != value,
right = TRUE,
accumulate = TRUE
),
max(tail(which(vec == value), 1), 0) < seq_along(vec),
NA
)
}
fAve <- function(init, k) {
ave(
init,
c(0, head(cumsum(init == k), -1)),
FUN = function(x) if (any(x == k)) rev(seq_along(x) - 1) else NA
)
}
fjblood94 <- function(v, val) {
out <- integer(length(v))
if (v[length(v)] != val) out[length(v)] <- NA_integer_
for (i in (length(v) - 1L):1) {
if (v[i] == val) {
out[i] <- 0L
} else {
out[i] <- out[i + 1L] + 1L
}
}
return(out)
}
Here f is defined as a recursive function that calls itself over shorter tails of the lookup vector:
f <- function(lookup,val ) {
ind <- which(lookup == val)[1] -1
if (length(lookup) > 1) {
c(ind, f(lookup[-1], val))
} else {
ind
}
}
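For example, with the vectors from the question (note that each call copies lookup[-1], so the recursion does quadratic work on long vectors):
f(int, 0)
# [1] 2 1 0 2 1 0 0 NA
f(char, "A")
# [1] 0 2 1 0 0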
Here is an approach using Reduce() and then some fiddling to get the NA values.
f <- function(vec, value) {
replace(
Reduce(
function(x, y)
x + (y * x) ,
vec != value,
right = TRUE,
accumulate = TRUE
),
max(tail(which(vec == value), 1), 0) < seq_along(vec),
NA
)
}
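To see what the fiddling fixes, look at the raw result of the right fold on int != value: scanning from the right, each position accumulates the distance to the next 0, but positions after the last 0 pick up spurious counts, which the replace() call then overwrites with NA:
Reduce(function(x, y) x + (y * x), int != 0, right = TRUE, accumulate = TRUE)
# [1] 2 1 0 2 1 0 0 1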
f(int, 0)
[1] 2 1 0 2 1 0 0 NA
f(int, 1)
[1] 0 0 NA NA NA NA NA NA
f(int, 2)
[1] 4 3 2 1 0 2 1 0
f(int, 3)
[1] NA NA NA NA NA NA NA NA
char = c("A", "B", "C", "A", "A")
f(char, "A")
[1] 0 2 1 0 0
Another possible solution, based on purrr::map2_dbl:
library(purrr)
int = c(1, 1, 0, 5, 2, 0, 0, 2)
f <- function(int, num)
{
n <- length(int)
map2_dbl(num, 1:n, ~ ifelse(length(which(.x == int[.y:n])) == 0, NA,
min(which(.x == int[.y:n])) - 1))
}
f(int, 0)
#> [1] 2 1 0 2 1 0 0 NA
f(int, 1)
#> [1] 0 0 NA NA NA NA NA NA
f(int, 2)
#> [1] 4 3 2 1 0 2 1 0
f(int, 3)
#> [1] NA NA NA NA NA NA NA NA
char = c("A", "B", "C", "A", "A")
f(char, "A")
#> [1] 0 2 1 0 0
A single-pass for loop is simple and efficient:
f1 <- function(v, val) {
out <- integer(length(v))
if (v[length(v)] != val) out[length(v)] <- NA_integer_
for (i in (length(v) - 1L):1) {
if (v[i] == val) {
out[i] <- 0L
} else {
out[i] <- out[i + 1L] + 1L
}
}
return(out)
}
int <- c(1, 1, 0, 5, 2, 0, 0, 2)
chr <- c("A", "B", "C", "A", "A")
f1(int, 0)
#> [1] 2 1 0 2 1 0 0 NA
f1(chr, "A")
#> [1] 0 2 1 0 0
Benchmarking against other solutions:
f2 <- function(v, x){
sapply(seq_along(v), function(i){
which(v[ i:length(v) ] == x)[ 1 ] - 1
})
}
f3 <- function(lookup,val ) {
ind <- which(lookup == val)[1] -1
if (length(lookup) > 1) {
c(ind, f3(lookup[-1], val))
} else {
ind
}
}
f4 <- function(x, value) {
idx = which(x == value)
nearest = rep(NA, length(x))
nearest[1:max(idx)] = rep(idx, diff(c(0, idx)))
abs(seq_along(x) - nearest)
}
f5 <- function(vec, value) {
replace(
Reduce(
function(x, y)
x + (y * x) ,
vec != value,
right = TRUE,
accumulate = TRUE
),
max(tail(which(vec == value), 1), 0) < seq_along(vec),
NA
)
}
microbenchmark::microbenchmark(f1 = {f1(int, 0); f1(chr, "A")},
f2 = {f2(int, 0); f2(chr, "A")},
f3 = {f3(int, 0); f3(chr, "A")},
f4 = {f4(int, 0); f4(chr, "A")},
f5 = {f5(int, 0); f5(chr, "A")},
check = "equal")
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> f1 6.0 7.50 8.990 8.40 9.60 18.3 100
#> f2 54.2 61.45 71.752 65.55 79.40 131.8 100
#> f3 25.5 28.60 33.393 30.75 35.90 105.2 100
#> f4 22.3 26.30 30.599 28.00 32.65 82.4 100
#> f5 59.7 64.55 73.474 69.10 75.70 157.0 100
A base R option using ave + cumsum
f <- function(init, k) {
ave(
init,
c(0, head(cumsum(init == k), -1)),
FUN = function(x) if (any(x == k)) rev(seq_along(x) - 1) else NA
)
}
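The grouping vector closes each group on a matching element, so within a group the countdown rev(seq_along(x) - 1) reaches 0 exactly at the match, while a group without a match (here the trailing 2) collapses to NA. For int and k = 0 the groups are:
c(0, head(cumsum(int == 0), -1))
# [1] 0 0 0 1 1 1 2 3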
and you will see
> f(int, 0)
[1] 2 1 0 2 1 0 0 NA
> f(int, 1)
[1] 0 0 NA NA NA NA NA NA
> f(int, 2)
[1] 4 3 2 1 0 2 1 0
> f(int, 3)
[1] NA NA NA NA NA NA NA NA

How to create a vector sequencing along non-NA elements

In R, given a vector with some elements as NA, how can I count along it skipping NAs?
For example:
let <- letters[1:10]
let[c(2,3,7,9)] <- NA
How would I get the following vector?
1,NA,NA,2,3,4,NA,5,NA,6
One way is to match all the indices of let to the non-NA indices:
> match(seq_along(let), which(!is.na(let)))
[1] 1 NA NA 2 3 4 NA 5 NA 6
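To unpack this: which(!is.na(let)) lists the non-NA positions, and match() then asks, for each position 1:10, where it sits in that list, giving NA for positions that are absent:
which(!is.na(let))
# [1]  1  4  5  6  8 10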
Another option using seq_along
let[!is.na(let)] <- seq_along(let[!is.na(let)])
as.numeric(let)
# [1] 1 NA NA 2 3 4 NA 5 NA 6
Benchmark
library(microbenchmark)
n <- 1e7
let_long <- seq_len(n)
set.seed(1)
let_long[sample(seq_len(n), size = 1e6)] <- NA
benchmark <- microbenchmark(
Karolis = Karolis(let_long),
Markus = Markus(let_long),
Snoram = Snoram(let_long),
Alexandra = Alexandra(let_long),
Frank = Frank(let_long) # see comment under Snoram's answer
)
To get a chart of the results, type autoplot(benchmark) (with ggplot2 loaded).
#Unit: milliseconds
# expr min lq mean median uq max neval
# Karolis 1042.0708 1216.6241 1314.9765 1290.3428 1374.7090 1807.4604 100
# Markus 210.3860 259.9957 310.0776 293.8244 363.4317 488.2171 100
# Snoram 714.4514 938.5760 1033.6168 1029.8205 1104.5614 1546.3733 100
# Alexandra 4317.5206 4470.2634 4665.9004 4603.6446 4771.5768 6495.3595 100
# Frank 103.3624 126.2842 166.7555 159.3568 190.5186 290.0422 100
Functions compared so far.
Karolis <- function(x) {
match(seq_along(x), which(!is.na(x)))
}
Markus <- function(x) {
x[!is.na(x)] <- seq_along(x[!is.na(x)])
as.numeric(x)
}
Snoram <- function(x) {
ifelse(is.na(x), NA, cumsum(!is.na(x)))
}
Alexandra <- function(x) {
j = 0
for (i in 1:length(x)) {
if(is.na(x[i]) == FALSE){
j = j + 1
x[i] <- j
}
}
as.numeric(x)
}
Frank <- function(x) {
replace(cumsum(!is.na(x)), is.na(x), NA)
}
Other options include:
Use ifelse() and cumsum()
ifelse(is.na(let), NA, cumsum(!is.na(let)))
#[1] 1 NA NA 2 3 4 NA 5 NA 6
This would get the result you are looking for but isn't efficient
let <- letters[1:10]
let[c(2,3,7,9)] <- NA
j = 0
for (i in 1:length(let)) {
if(is.na(let[i]) == FALSE){
j = j + 1
let[i] <- j
}
}

Replace NA with previous value with limit on number of consecutive NA

I would like to replace up to n consecutive NA values in a vector with the latest non-NA value.
For example, if:
a <- c(1,NA,NA,NA,NA,NA,2,NA,1,NA,NA,NA)
n <- 2
I would like to obtain:
c(1,1,1,NA,NA,NA,2,2,1,1,1,NA)
(n is the maximum number of NA values that can be replaced by a given element).
I know na.locf() function, but I don't know how to set the limit n. Is it possible to do it?
Here's an option using na.locf and rle
library(zoo)
r <- rle(is.na(a))
a <- na.locf(a)
is.na(a) <- sequence(r$lengths) > n & rep(r$values, r$lengths)
a
# [1] 1 1 1 NA NA NA 2 2 1 1 1 NA
So here I first computed the run lengths of the elements in a (including the NA entries), then replaced all NAs using na.locf, and finally turned elements back to NA wherever their position within an original NA run exceeds n.
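The mask can be inspected on its own: within each original NA run (where r$values is TRUE), positions beyond the n-th are flipped back to NA:
sequence(r$lengths) > n & rep(r$values, r$lengths)
#  [1] FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE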
As another idea, we can find, for each position of "a", the index of the latest non-NA element:
is = seq_along(a)
i = cummax((!is.na(a)) * is)
i
# [1] 1 1 1 1 1 1 7 7 9 9 9 9
Replace the last non-NA index with the current index if last non-NA is more than "n" steps away:
wh = (is - i) > n
i[wh] = is[wh]
i
# [1] 1 1 1 4 5 6 7 7 9 9 9 12
And subset "a":
a[i]
# [1] 1 1 1 NA NA NA 2 2 1 1 1 NA
You could do this using split and replace in base R
f <- function(a, n) {
# split the vector based on the position of non-NA values
l <- split(a, cumsum(seq_along(a) %in% which(!is.na(a))))
unlist(lapply(l, function(r) replace(r, 1:(n+1), r[1])[seq_along(r)]), use.names = FALSE)
}
f(a, n = 2)
#[1] 1 1 1 NA NA NA 2 2 1 1 1 NA
f(a, n = 3)
#[1] 1 1 1 1 NA NA 2 2 1 1 1 1
Benchmarking (randomly generated vector of size 7467)
library(microbenchmark)
library(dplyr)
library(zoo)
set.seed(123)
a <- unlist(replicate(1000, c(sample(10, 2), rep(NA, sample.int(10, 1)))))
length(a)
# [1] 7467
n <- 3
f_989 <- function(a, n) {
# split the vector based on the position of non-NA values
l <- split(a, cumsum(seq_along(a) %in% which(!is.na(a))))
unlist(lapply(l, function(r) replace(r, 1:(n+1), r[1])[seq_along(r)]), use.names = FALSE)
}
f_zx8754 <- function(a, n)
data.frame(a) %>% mutate(gr = cumsum(!is.na(a))) %>%
group_by(gr) %>%
mutate(res = if_else(row_number() <= n + 1, na.locf(a), a)) %>%
.$res
f_docendo_discimus <- function(a, n){
r <- rle(is.na(a))
a <- na.locf(a)
is.na(a) <- sequence(r$lengths) > n & rep(r$values, r$lengths)
a
}
f_akrun <- function(a,n)
ave(a, cumsum(!is.na(a)), FUN = function(x) replace(x, pmin(length(x), seq(n+1)), x[1]))
f_alexis_laz=function(a,n){
is = seq_along(a)
i = cummax((!is.na(a)) * is)
wh = (is - i) > n
i[wh] = is[wh]
a[i]
}
r <- f_989(a,n)
identical(r, f_zx8754(a,n))
# [1] TRUE
identical(r, f_docendo_discimus(a,n))
# [1] TRUE
identical(r, f_akrun(a,n))
# [1] TRUE
identical(r, f_alexis_laz(a,n))
# [1] TRUE
res <- microbenchmark("f1"=f_989(a,n), "f2"=f_zx8754(a,n),
"f3"=f_docendo_discimus(a,n), "f4"=f_akrun(a,n), "f5"=f_alexis_laz(a,n))
print(res, order="mean")
# Unit: microseconds
# expr min lq mean median uq max neval
# f5 129.804 137.014 161.106 141.6715 151.7375 1031.511 100
# f3 1249.351 1354.215 1471.478 1392.9750 1482.2140 2553.086 100
# f1 4736.895 5093.852 5630.367 5345.3450 6069.9260 8848.513 100
# f4 22165.601 23936.866 24660.990 24485.6725 24883.6440 29453.177 100
# f2 205854.339 215582.174 221524.448 218643.9540 224211.0435 261512.922 100
We can use a base R approach: create a grouping variable with cumsum and diff, then use that grouping variable in ave to replace the NA values subject to the limit given by 'n'.
ave(a, cumsum(c(TRUE, diff(is.na(a)) < 0)),
FUN = function(x) replace(x, is.na(x) & seq_along(x) <= n + 1, x[1]))
#[1] 1 1 1 NA NA NA 2 2 1 1 1 NA
Or more compact option
ave(a, cumsum(!is.na(a)), FUN = function(x) replace(x, pmin(length(x), seq(n+1)), x[1]))
#[1] 1 1 1 NA NA NA 2 2 1 1 1 NA
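Here cumsum(!is.na(a)) starts a new group at every non-NA value, so each group is a value followed by its run of NAs; replace() copies the leading value into at most the first n + 1 slots, with pmin() guarding groups shorter than that:
cumsum(!is.na(a))
# [1] 1 1 1 1 1 1 2 2 3 3 3 3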
Using dplyr::group_by and zoo::na.locf:
library(dplyr)
library(zoo)
data.frame(a) %>%
mutate(gr = cumsum(!is.na(a))) %>%
group_by(gr) %>%
mutate(res = if_else(row_number() <= n + 1, na.locf(a), a)) %>%
.$res
# [1] 1 1 1 NA NA NA 2 2 1 1 1 NA

How to compare with values adjacent in a sequence in the same group

Let's say I have something like this:
set.seed(0)
the.df <- data.frame( x=rep(letters[1:3], each=4),
n=rep(0:3, 3),
val=round(runif(12)))
the.df
x n val
1 a 0 1
2 a 1 0
3 a 2 0
4 a 3 1
5 b 0 1
6 b 1 0
7 b 2 1
8 b 3 1
9 c 0 1
10 c 1 1
11 c 2 0
12 c 3 0
Within each x, starting from n==2 (going from small to large), I want to set val to 0 if the previous val (in terms of n) is 0; otherwise, leave it as is.
For example, in the subset x=="b", I first ignore the two rows where n < 2. Now, in Row 7, because the previous val is 0 (the.df$val[the.df$x=="b" & the.df$n==1]), I set val to 0 (the.df$val[the.df$x=="b" & the.df$n==2] <- 0). Then on Row 8, now that val for the previous n is 0 (we just set it), I also want to set val here to 0 (the.df$val[the.df$x=="b" & the.df$n==3] <- 0).
Imagine that the data.frame is not sorted. Therefore procedures that depend on the order would require a sort. I also can't assume that adjacent rows exist (e.g., the row the.df[the.df$x=="a" & the.df$n==1, ] might be missing).
The trickiest part seems to be evaluating val in sequence. I can do this using a loop but I imagine that it would be inefficient (I have millions of rows). Is there a way I can do this more efficiently?
EDIT: wanted output
the.df
x n val wanted
1 a 0 1 1
2 a 1 0 0
3 a 2 0 0
4 a 3 1 0
5 b 0 1 1
6 b 1 0 0
7 b 2 1 0
8 b 3 1 0
9 c 0 1 1
10 c 1 1 1
11 c 2 0 0
12 c 3 0 0
Also, I don't mind making new columns (e.g., putting the wanted values there).
Using data.table I would try the following
library(data.table)
setDT(the.df)[order(n),
val := if(length(indx <- which(val[2:.N] == 0L)))
c(val[1:(indx[1L] + 1L)], rep(0L, .N - (indx[1L] + 1L))),
by = x]
the.df
# x n val
# 1: a 0 1
# 2: a 1 0
# 3: a 2 0
# 4: a 3 0
# 5: b 0 1
# 6: b 1 0
# 7: b 2 0
# 8: b 3 0
# 9: c 0 1
# 10: c 1 1
# 11: c 2 0
# 12: c 3 0
This simultaneously orders the data by n (as you said it's not ordered in real life) and recreates val by condition, meaning that if the condition is not satisfied, val is left untouched. For group "b", for example, the ordered val is c(1, 0, 1, 1): the first 0 after position 1 is at indx[1L] = 1, so the group becomes c(val[1:2], rep(0L, 2)) = c(1, 0, 0, 0).
Hopefully in the near future this will be implemented and then the code could potentially be
setDT(the.df)[order(n), val[n > 2] := if(val[2L] == 0) 0L, by = x]
Which could be a great improvement both performance- and syntax-wise.
A base R approach might be
df <- the.df[order(the.df$x, the.df$n),]
df$val <- ave(df$val, df$x, FUN=fun)
As for fun, #DavidArenburg's answer in plain R and written a bit more poetically might be
fun0 <- function(v) {
idx <- which.max(v[2:length(v)] == 0L) + 1L
if (length(idx))
v[idx:length(v)] <- 0L
v
}
It seems like a good idea to formulate the solution as an independent function first, because then it is easy to test. fun0 fails for some edge cases, e.g.,
> fun0(0)
[1] 0 0 0
> fun0(1)
[1] 0 0 0
> fun0(c(1, 1))
[1] 1 0
A better version is
fun1 <- function(v) {
tst <- tail(v, -1) == 0L
if (any(tst)) {
idx <- which.max(tst) + 1L
v[idx:length(v)] <- 0L
}
v
}
And even better, following #Arun
fun <- function(v)
if (length(v) > 2) c(v[1], cummin(v[-1])) else v
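The insight is that, within a group ordered by n, the wanted column is just the running minimum of val from the second element onwards (a 0 propagates forward, and the first element is left untouched). On a single group's values:
fun(c(1, 0, 1, 1))
# [1] 1 0 0 0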
This is competitive (same order of magnitude) with the data.table solution, with ordering and return occurring in less than 1s for the ~10m row data.frame of #m-dz's timings. At a second for millions of rows, it doesn't seem worthwhile to pursue further optimization.
Nonetheless, when there are a very large number of small groups (e.g., 2M each of size 5) an improvement is to avoid the tapply() function call by using group identity to offset the minimum. For instance,
df <- df[order(df$x, df$n),]
grp <- match(df$x, unique(df$x)) # strictly sequential groups
keep <- duplicated(grp) # ignore the first of each group
df$val[keep] <- cummin(df$val[keep] - grp[keep]) + grp[keep]
Hmmm, should be pretty efficient if you switch to data.table...
library(data.table)
# Define the.df as a data.table (or use data.table::setDT() function)
set.seed(0)
the.df <- data.table(
x = rep(letters[1:3], each = 4),
n = rep(0:3, 3),
val = round(runif(12))
)
m_dz <- function() {
setorder(the.df, x, n)
repeat{
# Get IDs of rows to change
# ids <- which(the.df[, (n > 1) & (val == 1) & (shift(val, 1L, type = "lag") == 0)])
ids <- the.df[(n > 1) & (val == 1) & (shift(val, 1L, type = "lag") == 0), , which = TRUE]
# If no IDs break
if(length(ids) == 0){
break
}
# Set val to 0
# for (i in ids) set(the.df, i = i, j = "val", value = 0)
set(the.df, i = ids, j = "val", value = 0)
}
return(the.df)
}
Edit: The function above was slightly modified thanks to #jangorecki's suggestions, i.e. it uses which = TRUE and set(the.df, i = ids, j = "val", value = 0), which made the timings much more stable (no very high max timings).
Edit: timing comparison with #DavidArenburg's answer on a slightly bigger table, m_dz() updated (#FoldedChromatin's answer skipped because of different results).
My function is slightly faster in terms of the median and upper quantile, but there is quite a big spread in the timings (see max...); I cannot figure out why. Hopefully the timing methodology is correct (returning the result to different objects etc.).
Anything bigger will kill my PC :(
set.seed(0)
groups_ids <- replicate(300, paste(sample(LETTERS, 5, replace=TRUE), collapse = ""))
size1 <- length(unique(groups_ids))
size2 <- round(1e7/size1)
the.df1 <- data.table(
x = rep(groups_ids, each = size2), # ~300 groups x ~33,333 rows each, ~1e7 rows total
n = rep(0:(size2-1), size1),
val = round(runif(size1*size2))
)
the.df2 <- copy(the.df1)
# m-dz
m_dz <- function() {
setorder(the.df1, x, n)
repeat{
ids <- the.df1[(n > 1) & (val == 1) & (shift(val, 1L, type = "lag") == 0), , which = TRUE]
if(length(ids) == 0){
break
}
set(the.df1, i = ids, j = "val", value = 0)
}
return(the.df1)
}
# David Arenburg
DavidArenburg <- function() {
setorder(the.df2, x, n)
the.df2[, val := if(length(indx <- which.max(val[2:.N] == 0) + 1L)) c(val[1:indx], rep(0L, .N - indx)), by = x]
return(the.df2)
}
library(microbenchmark)
microbenchmark(
res1 <- m_dz(),
res2 <- DavidArenburg(),
times = 100
)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# res1 <- m_dz() 247.4136 268.5005 363.0117 288.4216 312.7307 7071.0960 100 a
# res2 <- DavidArenburg() 270.6074 281.3935 314.7864 303.5229 328.1210 525.8095 100 a
identical(res1, res2)
# [1] TRUE
Edit: (Old) results for even bigger table:
set.seed(0)
groups_ids <- replicate(300, paste(sample(LETTERS, 5, replace=TRUE), collapse = ""))
size1 <- length(unique(groups_ids))
size2 <- round(1e8/size1)
# Unit: seconds
# expr min lq mean median uq max neval cld
# res1 <- m_dz() 5.599855 5.800264 8.773817 5.923721 6.021132 289.85107 100 a
# res2 <- m_dz2() 5.571911 5.836191 9.047958 5.970952 6.123419 310.65280 100 a
# res3 <- DavidArenburg() 9.183145 9.519756 9.714105 9.723325 9.918377 10.28965 100 a
Why not just use by?
> set.seed(0)
> the.df <- data.frame( x=rep(letters[1:3], each=4),
n=rep(0:3, 3),
val=round(runif(12)))
> the.df
x n val
1 a 0 1
2 a 1 0
3 a 2 0
4 a 3 1
5 b 0 1
6 b 1 0
7 b 2 1
8 b 3 1
9 c 0 1
10 c 1 1
11 c 2 0
12 c 3 0
> Mod.df<-by(the.df,INDICES=the.df$x,function(x){
x$val[x$n==2]=0
Which=which(x$n==2 & x$val==0)+1
x$val[Which]=0
x})
> do.call(rbind,Mod.df)
x n val
a.1 a 0 1
a.2 a 1 0
a.3 a 2 0
a.4 a 3 0
b.5 b 0 1
b.6 b 1 0
b.7 b 2 0
b.8 b 3 0
c.9 c 0 1
c.10 c 1 1
c.11 c 2 0
c.12 c 3 0

Insert elements in a vector in R

I have a vector in R,
a = c(2,3,4,9,10,2,4,19)
let us say I want to efficiently insert the following vectors, b, and c,
b = c(2,1)
d = c(0,1)
right after the 3rd and 7th positions (the "4" entries), resulting in,
e = c(2,3,4,2,1,9,10,2,4,0,1,19)
How would I do this efficiently in R, without repeatedly using cbind or the like?
I found the package R.basic, but it's not part of CRAN, so I thought about using a supported alternative.
Try this:
result <- vector("list",5)
result[c(TRUE,FALSE)] <- split(a, cumsum(seq_along(a) %in% (c(3,7)+1)))
result[c(FALSE,TRUE)] <- list(b,d)
f <- unlist(result)
identical(f, e)
#[1] TRUE
EDIT: generalization to an arbitrary number of insertions is straightforward:
insert.at <- function(a, pos, ...){
dots <- list(...)
stopifnot(length(dots)==length(pos))
result <- vector("list",2*length(pos)+1)
result[c(TRUE,FALSE)] <- split(a, cumsum(seq_along(a) %in% (pos+1)))
result[c(FALSE,TRUE)] <- dots
unlist(result)
}
> insert.at(a, c(3,7), b, d)
[1] 2 3 4 2 1 9 10 2 4 0 1 19
> insert.at(1:10, c(4,7,9), 11, 12, 13)
[1] 1 2 3 4 11 5 6 7 12 8 9 13 10
> insert.at(1:10, c(4,7,9), 11, 12)
Error: length(dots) == length(pos) is not TRUE
Note the bonus error checking if the number of positions and insertions do not match.
You can use the following function,
ins(a, list(b, d), pos=c(3, 7))
# [1] 2 3 4 2 1 9 10 2 4 0 1 19
where:
ins <- function(a, to.insert=list(), pos=c()) {
c(a[seq(pos[1])],
to.insert[[1]],
a[seq(pos[1]+1, pos[2])],
to.insert[[2]],
a[seq(pos[2]+1, length(a))]
)
}
Here's another function, using Ricardo's syntax, Ferdinand's split and #Arun's interleaving trick from another question:
ins2 <- function(a,bs,pos){
as <- split(a,cumsum(seq(a)%in%(pos+1)))
idx <- order(c(seq_along(as),seq_along(bs)))
unlist(c(as,bs)[idx])
}
The advantage is that this should extend to more insertions. However, it may produce weird output when passed invalid arguments, e.g., with any(pos > length(a)) or length(bs)!=length(pos).
You can change the last line to unname(unlist(... if you don't want a's items named.
The straightforward approach:
b.pos <- 3
d.pos <- 7
c(a[1:b.pos],b,a[(b.pos+1):d.pos],d,a[(d.pos+1):length(a)])
[1] 2 3 4 2 1 9 10 2 4 0 1 19
Note the importance of parentheses for the boundaries of the : operator.
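Without them, : binds tighter than +, so the middle chunk would silently grab the wrong indices:
b.pos + 1:d.pos    # parsed as b.pos + (1:d.pos)
# [1]  4  5  6  7  8  9 10
(b.pos + 1):d.pos
# [1] 4 5 6 7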
After using Ferdinand's function, I tried to write my own and surprisingly it is far more efficient.
Here's mine :
insertElems = function(vect, pos, elems) {
l = length(vect)
j = 0
for (i in 1:length(pos)){
if (pos[i]==1)
vect = c(elems[j+1], vect)
else if (pos[i] == length(vect)+1)
vect = c(vect, elems[j+1])
else
vect = c(vect[1:(pos[i]-1+j)], elems[j+1], vect[(pos[i]+j):(l+j)])
j = j+1
}
return(vect)
}
tmp = c(seq(1:5))
insertElems(tmp, c(2,4,5), c(NA,NA,NA))
# [1] 1 NA 2 3 NA 4 NA 5
insert.at(tmp, c(2,4,5), c(NA,NA,NA))
# [1] 1 NA 2 3 NA 4 NA 5
And here is the benchmark result:
> microbenchmark(insertElems(tmp, c(2,4,5), c(NA,NA,NA)), insert.at(tmp, c(2,4,5), c(NA,NA,NA)), times = 10000)
Unit: microseconds
expr min lq mean median uq max neval
insertElems(tmp, c(2, 4, 5), c(NA, NA, NA)) 9.660 11.472 13.44247 12.68 13.585 1630.421 10000
insert.at(tmp, c(2, 4, 5), c(NA, NA, NA)) 58.866 62.791 70.36281 64.30 67.923 2475.366 10000
My code also works better in some cases:
> insert.at(tmp, c(1,4,5), c(NA,NA,NA))
# [1] 1 2 3 NA 4 NA 5 NA 1 2 3
# Warning message:
# In result[c(TRUE, FALSE)] <- split(a, cumsum(seq_along(a) %in% (pos))) :
# number of items to replace is not a multiple of replacement length
> insertElems(tmp, c(1,4,5), c(NA,NA,NA))
# [1] NA 1 2 3 NA 4 NA 5
Here's an alternative that uses append. It's fine for small vectors, but I can't imagine it being efficient for large vectors since a new vector is created upon each iteration of the loop (which is, obviously, bad). The trick is to reverse the vector of things that need to be inserted to get append to insert them in the correct place relative to the original vector.
a = c(2,3,4,9,10,2,4,19)
b = c(2,1)
d = c(0,1)
pos <- c(3, 7)
z <- setNames(list(b, d), pos)
z <- z[order(names(z), decreasing=TRUE)]
for (i in seq_along(z)) {
a <- append(a, z[[i]], after = as.numeric(names(z)[[i]]))
}
a
# [1] 2 3 4 2 1 9 10 2 4 0 1 19
