Treat NAs as zeros when adding vectors in R

Adding two vectors is easy:
> c(1:5) + c(6:10)
[1] 7 9 11 13 15
But since adding any number to NA gives NA, this happens:
> c(1,NA,3:5)+c(6:10)
[1] 7 NA 11 13 15
How can I add two vectors where there may be some NAs, treating them as zeros? I need to get this result:
> c(1,NA,3:5)+c(6:10)
[1] 7 7 11 13 15
Any ideas on how to do this using {base} and not changing the NAs to zeros on the original vectors?

You can also use colSums or rowSums, e.g. (with the question's vectors x <- c(1,NA,3:5) and y <- c(6:10)):
rowSums(cbind(x, y), na.rm = TRUE)
# [1] 7 7 11 13 15
colSums(rbind(x, y), na.rm = TRUE)
# [1] 7 7 11 13 15
Benchmarks: somewhat surprisingly, colSums is the fastest:
microbenchmark::microbenchmark(fn_replace(x, y),
                               fn_rowSums(x, y),
                               fn_colSums(x, y),
                               fn_coalesce(x, y))
# Unit: milliseconds
#               expr      min        lq     mean   median       uq      max neval
#   fn_replace(x, y) 121.4322 130.99067 174.1531 162.2454 183.1781 385.7348   100
#   fn_rowSums(x, y) 143.0654 146.20815 172.5396 149.3953 179.0337 370.1625   100
#   fn_colSums(x, y)  96.8848  99.46521 121.5916 106.8800 140.9279 298.1607   100
#  fn_coalesce(x, y) 259.2923 310.16915 357.0241 326.1245 360.9110 595.9711   100
## Code to generate x, y and functions for benchmark:
fn_replace <- function(x, y) {
  replace(x, is.na(x), 0) + replace(y, is.na(y), 0)
}
fn_rowSums <- function(x, y) {
  rowSums(cbind(x, y), na.rm = TRUE)
}
fn_colSums <- function(x, y) {
  colSums(rbind(x, y), na.rm = TRUE)
}
fn_coalesce <- function(x, y) {
  dplyr::coalesce(x, rep(0, length(x))) +
    dplyr::coalesce(y, rep(0, length(y)))
}
n_rep <- 1e6
x <- as.numeric(rep(c(1, NA, 3:5, NA, NA, 5), n_rep))
y <- as.numeric(rep(c(NA, 6:9, NA, 3, 4), n_rep))
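As a quick sanity check (my addition, not part of the original answer), the functions agree on these vectors:
all.equal(fn_replace(x, y), fn_rowSums(x, y))
# [1] TRUE
all.equal(fn_replace(x, y), fn_colSums(x, y))
# [1] TRUE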

Maybe replace the NAs with 0 and then add the vectors:
x <- c(1,NA,3:5)
y <- c(6:10)
replace(x, is.na(x), 0) + replace(y, is.na(y), 0)
#[1] 7 7 11 13 15
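If there are more than two vectors, the same replace() idea extends with Reduce(); a minimal sketch (add_na0 is a made-up name):
add_na0 <- function(...) {
  # zero out the NAs in each vector, then sum them all elementwise
  Reduce(`+`, lapply(list(...), function(v) replace(v, is.na(v), 0)))
}
add_na0(c(1, NA, 3:5), c(6:10), c(NA, 0, 1, NA, 2))
# [1] 7 7 12 13 17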

We could try using coalesce() from the dplyr package:
require(dplyr)
x <- c(1,NA,3:5)
y <- c(6:10)
coalesce(x, rep(0, 5)) + coalesce(y, rep(0, 5))
#[1] 7 7 11 13 15
coalesce(x, y) works by taking, at each position, the value from x if it is non-NA, and otherwise the value from y, e.g.
x   rep(0, 5)  =>  result
1   0              1
NA  0              0
3   0              3
4   0              4
5   0              5

Instead of base::replace() and dplyr::coalesce() as above, we can also use tidyr::replace_na():
library(tidyr)
replace_na(x, 0) + replace_na(y, 0)
#[1] 7 7 11 13 15

Related

How to compare list of strings and output number of strings that are not matching?

I have a character vector:
s <- c('peel', 'peer', 'pear', 'tggc', 'gcgt')
I would like to compare each string with every other string in the list and I use the following command:
z <- Map(utf8ToInt, s)
dmat <- outer(z, z, FUN=Vectorize(function(x, y) sum(bitwXor(x, y) > 0)))
However, I would like to output the number of matching characters by position instead of the number of differences. For example, my code outputs 3 when "tggc" is compared with "gcgt" (the three differing positions), whereas I want 1 (the single matching position).
Just use a simple negation ! as per the following:
s <- c('peel', 'peer', 'pear', 'tggc', 'gcgt')
z <- Map(utf8ToInt, s)
dmat <- outer(z, z, FUN = Vectorize(function(x, y) sum(!bitwXor(x, y))))
dmat
Or use a straightforward equality comparison given that you've mapped the characters to integers.
dmat <- outer(z, z, FUN = Vectorize(function(x, y) sum(x == y)))
Both give output:
     peel peer pear tggc gcgt
peel    4    3    2    0    0
peer    3    4    3    0    0
pear    2    3    4    0    0
tggc    0    0    0    4    1
gcgt    0    0    0    1    4
Note: If you have fixed string length, you can also use subtraction, but the above saves you from passing this explicitly, which adds a little generality.
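For instance (my own illustration), with these length-4 strings the original difference counts and the match counts above are complements:
dmat_diff <- outer(z, z, FUN = Vectorize(function(x, y) sum(bitwXor(x, y) > 0)))
all(4 - dmat_diff == dmat)
# [1] TRUE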
If performance is a concern:
s <- c('peel', 'peer', 'pear', 'tggc', 'gcgt')
z <- mapply(utf8ToInt, s)
n <- length(s)
n1 <- 1:(n - 1L)
replace(matrix(nrow = n, ncol = n),
        sequence(n1, seq(n + 1L, by = n, length.out = n - 1L)),
        colSums(z[, sequence(n1)] == z[, rep.int(2:n, n1)]))
#>      [,1] [,2] [,3] [,4] [,5]
#> [1,]   NA    3    2    0    0
#> [2,]   NA   NA    3    0    0
#> [3,]   NA   NA   NA    0    0
#> [4,]   NA   NA   NA   NA    1
#> [5,]   NA   NA   NA   NA   NA
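# (My own illustration, assuming R >= 4.0 for sequence(nvec, from = ):
# the sequence() call generates the column-major linear indices of the
# upper triangle, so each pair of columns of z is compared exactly once.)
n_demo <- 3L
sequence(1:(n_demo - 1L), seq(n_demo + 1L, by = n_demo, length.out = n_demo - 1L))
#> [1] 4 7 8
# i.e. cells (1,2), (1,3), (2,3) of a 3 x 3 matrix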
# benchmarking with a larger character vector
s <- mapply(FUN = function(x) paste0(sample(letters[1:4]), collapse = ""), 1:100)
microbenchmark::microbenchmark(
  bitwXor = {z <- Map(utf8ToInt, s)
             outer(z, z, FUN = Vectorize(function(x, y) sum(!bitwXor(x, y))))},
  logical = {z <- Map(utf8ToInt, s)
             outer(z, z, FUN = Vectorize(function(x, y) sum(x == y)))},
  mat = {z <- mapply(utf8ToInt, s)
         n <- length(s)
         n1 <- 1:(n - 1L)
         replace(matrix(nrow = n, ncol = n),
                 sequence(n1, seq(n + 1L, by = n, length.out = n - 1L)),
                 colSums(z[, sequence(n1)] == z[, rep.int(2:n, n1)]))})
#> Unit: microseconds
#>     expr     min       lq      mean   median       uq     max neval
#>  bitwXor 23846.1 24875.6 26207.230 26120.95 27134.35 33842.8   100
#>  logical 16645.5 17514.8 19020.051 18383.35 19875.15 32716.8   100
#>      mat   387.4   455.0   511.322   482.70   544.05  1224.4   100
# confirm that the results are the same
z <- Map(utf8ToInt, s)
mat1 <- outer(z, z, FUN = Vectorize(function(x, y) sum(!bitwXor(x, y))))
mat2 <- outer(z, z, FUN = Vectorize(function(x, y) sum(x == y)))
z <- mapply(utf8ToInt, s)
n <- length(s)
n1 <- 1:(n - 1L)
mat3 <- replace(matrix(nrow = n, ncol = n),
                sequence(n1, seq(n + 1L, by = n, length.out = n - 1L)),
                colSums(z[, sequence(n1)] == z[, rep.int(2:n, n1)]))
all.equal(mat1[upper.tri(mat1)], mat2[upper.tri(mat2)])
#> [1] TRUE
all.equal(mat1[upper.tri(mat1)], mat3[upper.tri(mat3)])
#> [1] TRUE
A possible solution:
library(tidyverse)
sample <- c('peel','peer','pear','tggc','gcgt')
sample %>%
  expand.grid(sample) %>%
  rowwise %>%
  mutate(cmp = mapply(function(x, y) {x != y},
                      x = str_split(Var1, ""),
                      y = str_split(Var2, "")) %>% sum)
#> # A tibble: 25 × 3
#> # Rowwise:
#>    Var1  Var2    cmp
#>    <fct> <fct> <int>
#>  1 peel  peel      0
#>  2 peer  peel      1
#>  3 pear  peel      2
#>  4 tggc  peel      4
#>  5 gcgt  peel      4
#>  6 peel  peer      1
#>  7 peer  peer      0
#>  8 pear  peer      1
#>  9 tggc  peer      4
#> 10 gcgt  peer      4
#> # … with 15 more rows

Closest other Value in the same Vector

I have a vector
set.seed(2)
x <- sample.int(20, 5)
[1] 4 14 11 3 16
Now, for every element I want to find:
1. the element with the minimum distance (min(abs(x[i] - x[-i])) for element i), which here would be
[1] 3 16 14 4 14
2. the (first) index of the element with the minimum distance, which here would be
[1] 4 5 2 1 2
The point is that the element itself is not considered, only all the other elements, which is why the existing question "R - Fastest way to find nearest value in vector" is not the answer.
If the actual answer is out there, sorry - I didn't find it.
If the actual answer is out there, sorry - I didn't find it.
1) Rfast Using dista in Rfast we get the indexes of the closest two. Take the second closest, as the closest will be the element itself.
library(Rfast)
x <- c(4, 14, 11, 3, 16) # input
x[ dista(x, x, k = 2, index = TRUE)[, 2] ]
## [1] 3 16 14 4 14
2) sqldf Using SQL we can left join DF to itself, excluding rows with the same value, and take the row with the minimum distance.
DF <- data.frame(x) # x is from (1)
sqldf("select a.x, b.x nearest, min(abs(a.x - b.x))
from DF a
left join DF b on a.x != b.x
group by a.rowid")[1:2]
giving:
   x nearest
1  4       3
2 14      16
3 11      14
4  3       4
5 16      14
3) zoo Sort the input, take the value corresponding to the least difference on either side of each element, and order it back.
library(zoo)
ix <- order(x)
least <- function(x) if (x[2] - x[1] < x[3] - x[2]) x[1] else x[3]
rollapply(c(-Inf, x[ix], Inf), 3, least)[order(ix)]
## [1] 3 16 14 4 14
4) Base R Using ix and least from (3) we can mimic (3) using only base functions as follows.
apply(embed(c(-Inf, x[ix], Inf), 3)[, 3:1], 1, least)[order(ix)]
## [1] 3 16 14 4 14
4a) This slightly shorter variation would also work:
-apply(embed(-c(-Inf, x[ix], Inf), 3), 1, least)[order(ix)]
## [1] 3 16 14 4 14
4b) Simplifying further we have the following base solution where, again, ix is from (3):
xx <- x[ix]                    # values in sorted order
x1 <- c(-Inf, xx[-length(xx)]) # left neighbour of each sorted value
x2 <- c(xx[-1], Inf)           # right neighbour of each sorted value
ifelse(xx - x1 < x2 - xx, x1, x2)[order(ix)]  # nearer neighbour, back in input order
## [1] 3 16 14 4 14
Duplicates
The example in the question had no duplicates but if there were duplicates there is some question regarding the problem definition. For example if we had c(1, 3, 4, 1) then if we look at the first value, 1, there is another value exactly equal to it so the closest value is 1. Another interpretation is that the closest value not equal to 1 should be returned which in this case is 3. In the codes above the sqldf solution gives the closest value not equal to the current value whereas the others give the closest value among the remaining values.
If we wanted the closest-not-equal interpretation for the approaches other than sqldf, we could use rle after ordering to compress the input down to unique values and then apply inverse.rle afterwards, as shown in this modification of (4b):
x <- c(1, 3, 4, 1)
ix <- order(x)
r <- rle(x[ix])
xx <- r$values
x1 <- c(-Inf, xx[-length(xx)])
x2 <- c(xx[-1], Inf)
r$values <- ifelse(xx - x1 < x2 - xx, x1, x2)
inverse.rle(r)[order(ix)]
## [1] 3 4 3 3
I was very interested in this question and in the approaches suggested in the other responses, so I compared them with regard to their running time (and I added another approach using the package RANN). The code is appended below. TL;DR: The base R version 4b by user G. Grothendieck was most efficient, and by a significant margin.
library(RANN)
library(zoo)
library(data.table)
library(Rfast)
library(sqldf)
# All functions take a vector as argument,
# and return the values of nearest neighbours (not their index)
# Using base R, by ThomasIsCoding
base_nn <- function(x) {
  d <- data.frame(`diag<-`(as.matrix(dist(x)), Inf))
  id <- unlist(Map(which.min, d))
  x[id]
}
# Using Rfast, by G. Grothendieck
rfast_nn <- function(x) {
  x[dista(x, x, k = 2, index = TRUE)[, 2]]
}
# Using sqldf, by G. Grothendieck
sqldf_nn <- function(x) {
  DF <- data.frame(x)
  unname(
    unlist(sqldf("select a.x, b.x nearest, min(abs(a.x - b.x))
                  from DF a
                  left join DF b on a.x != b.x
                  group by a.rowid")[2])
  )
}
# Using zoo, by G. Grothendieck
zoo_nn <- function(x) {
  ix <- order(x)
  least <- function(x) if (x[2] - x[1] < x[3] - x[2]) x[1] else x[3]
  rollapply(c(-Inf, x[ix], Inf), 3, least)[order(ix)]
}
# Using base R (v 4), by G. Grothendieck
base2_nn <- function(x) {
  ix <- order(x)
  least <- function(x) if (x[2] - x[1] < x[3] - x[2]) x[1] else x[3]
  apply(embed(c(-Inf, x[ix], Inf), 3)[, 3:1], 1, least)[order(ix)]
}
# Using base R (v 4a), by G. Grothendieck
base3_nn <- function(x) {
  ix <- order(x)
  least <- function(x) if (x[2] - x[1] < x[3] - x[2]) x[1] else x[3]
  -apply(embed(-c(-Inf, x[ix], Inf), 3), 1, least)[order(ix)]
}
# Using base R (v 4b), by G. Grothendieck
base4_nn <- function(x) {
  ix <- order(x)
  xx <- x[ix]
  x1 <- c(-Inf, xx[-length(xx)])
  x2 <- c(xx[-1], Inf)
  ifelse(xx - x1 < x2 - xx, x1, x2)[order(ix)]
}
# Using data.table, by IceCreamToucan
dt_nn <- function(x) {
  dt <- setkey(data.table(x), x)
  dt[dt, on = .(x > x), mult = 'first', lowx := i.x][, lowx := fcoalesce(lowx + .0, -Inf)]
  dt[dt, on = .(x < x), mult = 'last', highx := i.x][, highx := fcoalesce(highx + .0, Inf)]
  dt[, closex := fifelse(x - lowx < highx - x, lowx, highx)]
  unname(unlist(dt[, .(closex)]))
}
# Using RANN, by me
rann_nn <- function(x) {
  id <- RANN::nn2(as.matrix(x), k = 2)$nn.idx[, 2]
  x[id]
}
### Apply all methods
# Test that all have the same output:
x <- c(4, 14,11,3,16)
rann_nn(x)
# [1] 3 16 14 4 14
base_nn(x)
# [1] 3 16 14 4 14
rfast_nn(x)
# [1] 3 16 14 4 14
sqldf_nn(x)
# [1] 3 16 14 4 14
zoo_nn(x)
# [1] 3 16 14 4 14
base2_nn(x)
# [1] 3 16 14 4 14
base3_nn(x)
# [1] 3 16 14 4 14
base4_nn(x)
# [1] 3 16 14 4 14
dt_nn(x) # ordered differently because setkey() sorts the table by x
# [1] 4 3 14 16 14
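# (Sketch, my own addition: if the original input order matters, the same
# data.table logic can carry an index column and sort back at the end.)
dt_nn_ordered <- function(x) {
  dt <- data.table(x, idx = seq_along(x))
  setkey(dt, x)
  dt[dt, on = .(x > x), mult = 'first', lowx := i.x][, lowx := fcoalesce(lowx + .0, -Inf)]
  dt[dt, on = .(x < x), mult = 'last', highx := i.x][, highx := fcoalesce(highx + .0, Inf)]
  dt[, closex := fifelse(x - lowx < highx - x, lowx, highx)]
  dt[order(idx), closex]  # restore the input order
}
dt_nn_ordered(x)
# [1] 3 16 14 4 14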
# Compare running times
library(microbenchmark)
# Compare for N = 1000 elements
benchmark_data <- rnorm(1000)
microbenchmark(
rann_nn(benchmark_data),
base_nn(benchmark_data),
rfast_nn(benchmark_data),
sqldf_nn(benchmark_data),
zoo_nn(benchmark_data),
base2_nn(benchmark_data),
base3_nn(benchmark_data),
base4_nn(benchmark_data),
dt_nn(benchmark_data)
)
# Unit: microseconds
#                      expr        min          lq        mean      median          uq        max neval
#   rann_nn(benchmark_data)    641.180    684.1975    776.5467    711.6680    775.3635   3822.023   100
#   base_nn(benchmark_data) 166523.177 179240.8130 209471.1333 187633.0515 249740.8425 330864.712   100
#  rfast_nn(benchmark_data)  45160.603  47032.5225  47681.0557  47594.0075  48308.8440  50579.839   100
#  sqldf_nn(benchmark_data) 133916.594 138769.8175 143505.9315 140543.3250 143830.2765 211873.960   100
#    zoo_nn(benchmark_data)   4359.359   4604.0275   5008.4291   4785.1515   5037.9705  14999.802   100
#  base2_nn(benchmark_data)   1292.322   1407.4875   1747.8404   1462.7295   1588.1580  11297.321   100
#  base3_nn(benchmark_data)   1263.644   1396.9210   1615.7495   1472.9940   1571.8575  11828.015   100
#  base4_nn(benchmark_data)    119.543    146.1080    254.5075    178.1065    197.4265   7726.156   100
#     dt_nn(benchmark_data)   5290.337   6580.6965   7111.1816   6892.3800   7351.3795  29469.815   100
# For N = 100000, leaving out the slowest versions (e.g., `base_nn()`
# no longer works because a distance matrix cannot be computed for
# N = 100000)
benchmark_data <- rnorm(100000)
microbenchmark(
rann_nn(benchmark_data),
zoo_nn(benchmark_data),
base2_nn(benchmark_data),
base3_nn(benchmark_data),
base4_nn(benchmark_data),
dt_nn(benchmark_data)
)
# Unit: milliseconds
#                      expr        min        lq      mean    median        uq      max neval
#   rann_nn(benchmark_data) 130.957025 141.02904 149.94052 148.60184 156.14506 271.1882   100
#    zoo_nn(benchmark_data) 606.690004 673.88980 720.12545 717.51658 766.98190 886.4397   100
#  base2_nn(benchmark_data) 142.554407 176.30358 198.58375 193.34812 212.33885 329.5470   100
#  base3_nn(benchmark_data) 142.074126 168.78195 189.65122 184.45025 205.89414 287.0740   100
#  base4_nn(benchmark_data)   9.354764  10.46687  17.22086  12.36354  14.22882 166.4758   100
#     dt_nn(benchmark_data)  96.503882 104.06914 117.95408 108.20284 121.11428 247.2092   100
Here is a base R solution
d <- data.frame(`diag<-`(as.matrix(dist(x)),Inf))
ids <- unlist(Map(which.min,d))
val <- x[ids]
such that
> ids
X1 X2 X3 X4 X5
4 5 2 1 2
> val
[1] 3 16 14 4 14
DATA
x <- c(4, 14,11,3,16)
Option with a data.table non-equi-join
dt <- setkey(data.table(x), x)
dt[dt, on = .(x > x), mult = 'first', lowx := i.x][, lowx := fcoalesce(lowx + .0, -Inf)]
dt[dt, on = .(x < x), mult = 'last', highx := i.x][, highx := fcoalesce(highx + .0, Inf)]
dt[, closex := fifelse(x - lowx < highx - x, lowx, highx)]
dt[, .(x, closex)]
# x closex
# 1: 3 4
# 2: 4 3
# 3: 11 14
# 4: 14 16
# 5: 16 14

In R, find elements of a vector in a list using vectorization

I have a vector v1
v1 = c(1, 200, 4000)
I would like to find the indices of the elements of v1 in a list L1 vectorially, i.e. without a loop, where
> L1
[[1]]
[1] 1 2 3 4
[[2]]
[1] 100 200 300 400
[[3]]
[1] 1000 2000 3000 4000
The output should be c(1, 2, 4).
Is there a way to do this without using a loop or apply (which is computationally the same as using a loop)? I have to do this for very long vectors.
We can do
sapply(L1, function(x) which(x %in% v1))
#[1] 1 2 4
Or with Vectorize
Vectorize(function(x) which(x %in% v1))(L1)
#[1] 1 2 4
If each list element is checked against the corresponding element of v1:
mapply(function(x, y) which(x %in% y), L1, v1)
#[1] 1 2 4
As @nicola mentioned, match could also be used to get the first index. If there are duplicate elements, then which would be useful:
mapply(match, v1, L1)
#[1] 1 2 4
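A tiny demo of the difference (my own example): with a repeated hit inside a list element, match reports only the first position, while which reports all of them:
mapply(match, 200, list(c(100, 200, 300, 200)))
# [1] 2
which(c(100, 200, 300, 200) %in% 200)
# [1] 2 4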
Or using purrr::map2:
purrr::map2_int(L1, v1, ~ .x %in% .y %>% which)
#[1] 1 2 4
We can do this; it seems to be the fastest by far.
v1 <- c(1, 200, 4000)
L1 <- list(1:4, 1:4*100, 1:4*1000)
sequence(lengths(L1))[match(v1, unlist(L1))]
# [1] 1 2 4
sequence(lengths(L1))[which(unlist(L1) %in% v1)]
# [1] 1 2 4
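One caveat (my own observation): match scans the whole unlisted vector, so if a value occurs in more than one list element, only its first occurrence determines the reported position:
L2 <- list(1:4, c(100, 1, 300, 400))
sequence(lengths(L2))[match(1, unlist(L2))]
# [1] 1  (the 1 in L2[[1]], not the one at position 2 of L2[[2]])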
library(microbenchmark)
library(tidyverse)
microbenchmark(
  akrun_sapply = {sapply(L1, function(x) which(x %in% v1))},
  akrun_Vectorize = {Vectorize(function(x) which(x %in% v1))(L1)},
  akrun_mapply = {mapply(function(x, y) which(x %in% y), L1, v1)},
  akrun_mapply_match = {mapply(match, v1, L1)},
  akrun_map2 = {purrr::map2_int(L1, v1, ~ .x %in% .y %>% which)},
  CPak = {setNames(rep(1:length(L1), times=lengths(L1)), unlist(L1))[as.character(v1)]},
  zacdav = {sequence(lengths(L1))[match(v1, unlist(L1))]},
  zacdav_which = {sequence(lengths(L1))[which(unlist(L1) %in% v1)]},
  times = 10000
)
Unit: microseconds
               expr     min       lq      mean   median       uq        max neval
       akrun_sapply  18.187  22.7555  27.17026  24.6140  27.8845   2428.194 10000
    akrun_Vectorize  60.119  76.1510  88.82623  83.4445  89.9680   2717.420 10000
       akrun_mapply  19.006  24.2100  29.78381  26.2120  29.9255   2911.252 10000
 akrun_mapply_match  14.136  18.4380  35.45528  20.0275  23.6560 127960.324 10000
         akrun_map2 217.209 264.7350 303.64609 277.5545 298.0455   9204.243 10000
               CPak  15.741  19.7525  27.31918  24.7150  29.0340    235.245 10000
             zacdav   6.649   9.3210  11.30229  10.4240  11.5540   2399.686 10000
       zacdav_which   7.364  10.2395  12.22632  11.2985  12.4515   2492.789 10000
You can try something like this
v1 = c(1, 200, 4000)
L1 <- list(1:4, 1:4*100, 1:4*1000)
setNames(rep(1:length(L1), times = lengths(L1)), unlist(L1))[as.character(v1)]
#    1  200 4000
#    1    2    3
(Note that this returns the index of the list element containing each value, rather than the position within that element.)
We can also use
unlist(lapply(L1, function(x) which(x %in% v1)))
#[1] 1 2 4
Or use
unlist(Map(function(x, y) which(x %in% y), L1, v1 ))
#[1] 1 2 4

Replace NA with previous value with limit on number of consecutive NA

I would like to replace up to n consecutive NA values in vector with latest non-NA value.
For example, if:
a <- c(1,NA,NA,NA,NA,NA,2,NA,1,NA,NA,NA)
n <- 2
I would like to obtain:
c(1,1,1,NA,NA,NA,2,2,1,1,1,NA)
(n is the maximum number of consecutive NA values that can be replaced by a given element.)
I know na.locf() function, but I don't know how to set the limit n. Is it possible to do it?
Here's an option using na.locf and rle
library(zoo)
r <- rle(is.na(a))
a <- na.locf(a)
is.na(a) <- sequence(r$lengths) > n & rep(r$values, r$lengths)
a
# [1] 1 1 1 NA NA NA 2 2 1 1 1 NA
So here I first computed the run lengths of elements in a (including the NA entries), then replaced all NA's using na.locf and finally turned those elements back to NA's where the run lengths were greater than n and the elements were NA.
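To make the masking step concrete, here are the intermediate values for the example vector with n = 2 (an illustrative trace of the code above):
r <- rle(is.na(c(1,NA,NA,NA,NA,NA,2,NA,1,NA,NA,NA)))
r$lengths
# [1] 1 5 1 1 1 3
r$values
# [1] FALSE  TRUE FALSE  TRUE FALSE  TRUE
sequence(r$lengths)
# [1] 1 1 2 3 4 5 1 1 1 1 2 3
sequence(r$lengths) > 2 & rep(r$values, r$lengths)
# [1] FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE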
As another idea, for each position we can find the index of the last non-NA value in "a":
is = seq_along(a)
i = cummax((!is.na(a)) * is)
i
# [1] 1 1 1 1 1 1 7 7 9 9 9 9
Replace the last non-NA index with the current index if last non-NA is more than "n" steps away:
wh = (is - i) > n
i[wh] = is[wh]
i
# [1] 1 1 1 4 5 6 7 7 9 9 9 12
And subset "a":
a[i]
# [1] 1 1 1 NA NA NA 2 2 1 1 1 NA
You could do this using split and replace in base R
f <- function(a, n) {
  # split the vector based on the position of non-NA values
  l <- split(a, cumsum(seq_along(a) %in% which(!is.na(a))))
  unlist(lapply(l, function(r) replace(r, 1:(n+1), r[1])[seq_along(r)]), use.names = FALSE)
}
f(a, n = 2)
#[1] 1 1 1 NA NA NA 2 2 1 1 1 NA
f(a, n = 3)
#[1] 1 1 1 1 NA NA 2 2 1 1 1 1
Benchmarking (randomly generated vector of length 7467)
library(microbenchmark)
library(dplyr)
library(zoo)
set.seed(123)
a <- unlist(replicate(1000, c(sample(10, 2), rep(NA, sample.int(10, 1)))))
length(a)
# [1] 7467
n <- 3
f_989 <- function(a, n) {
  # split the vector based on the position of non-NA values
  l <- split(a, cumsum(seq_along(a) %in% which(!is.na(a))))
  unlist(lapply(l, function(r) replace(r, 1:(n+1), r[1])[seq_along(r)]), use.names = FALSE)
}
f_zx8754 <- function(a, n)
  data.frame(a) %>%
    mutate(gr = cumsum(!is.na(a))) %>%
    group_by(gr) %>%
    mutate(res = if_else(row_number() <= n + 1, na.locf(a), a)) %>%
    .$res
f_docendo_discimus <- function(a, n) {
  r <- rle(is.na(a))
  a <- na.locf(a)
  is.na(a) <- sequence(r$lengths) > n & rep(r$values, r$lengths)
  a
}
f_akrun <- function(a, n)
  ave(a, cumsum(!is.na(a)), FUN = function(x) replace(x, pmin(length(x), seq(n+1)), x[1]))
f_alexis_laz <- function(a, n) {
  is <- seq_along(a)
  i <- cummax((!is.na(a)) * is)
  wh <- (is - i) > n
  i[wh] <- is[wh]
  a[i]
}
r <- f_989(a,n)
identical(r, f_zx8754(a,n))
# [1] TRUE
identical(r, f_docendo_discimus(a,n))
# [1] TRUE
identical(r, f_akrun(a,n))
# [1] TRUE
identical(r, f_alexis_laz(a,n))
# [1] TRUE
res <- microbenchmark("f1"=f_989(a,n), "f2"=f_zx8754(a,n),
"f3"=f_docendo_discimus(a,n), "f4"=f_akrun(a,n), "f5"=f_alexis_laz(a,n))
print(res, order="mean")
# Unit: microseconds
#  expr        min         lq       mean      median          uq        max neval
#    f5    129.804    137.014    161.106    141.6715    151.7375   1031.511   100
#    f3   1249.351   1354.215   1471.478   1392.9750   1482.2140   2553.086   100
#    f1   4736.895   5093.852   5630.367   5345.3450   6069.9260   8848.513   100
#    f4  22165.601  23936.866  24660.990  24485.6725  24883.6440  29453.177   100
#    f2 205854.339 215582.174 221524.448 218643.9540 224211.0435 261512.922   100
We can use a base R approach by creating a grouping variable with cumsum and diff, then using the grouping variable in ave to replace the NA values based on the condition given by n:
ave(a, cumsum(c(TRUE, diff(is.na(a)) < 0)),
    FUN = function(x) replace(x, is.na(x) & seq_along(x) <= n + 1, x[1]))
#[1] 1 1 1 NA NA NA 2 2 1 1 1 NA
Or more compact option
ave(a, cumsum(!is.na(a)), FUN = function(x) replace(x, pmin(length(x), seq(n+1)), x[1]))
#[1] 1 1 1 NA NA NA 2 2 1 1 1 NA
Using dplyr::group_by and zoo::na.locf:
library(dplyr)
library(zoo)
data.frame(a) %>%
  mutate(gr = cumsum(!is.na(a))) %>%
  group_by(gr) %>%
  mutate(res = if_else(row_number() <= n + 1, na.locf(a), a)) %>%
  .$res
# [1] 1 1 1 NA NA NA 2 2 1 1 1 NA
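Worth knowing (my own note): zoo::na.locf() also has a maxgap argument, but it does not solve this question, because it leaves any gap longer than maxgap entirely unfilled instead of filling its first n values:
na.locf(a, na.rm = FALSE, maxgap = 2)
# [1] 1 NA NA NA NA NA 2 2 1 NA NA NA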

There is pmin and pmax each taking na.rm, why no psum?

It seems that R might be missing an obvious simple function: psum. Does it exist as a different name, or is it in a package somewhere?
x = c(1,3,NA,5)
y = c(2,NA,4,1)
min(x,y,na.rm=TRUE) # ok
[1] 1
max(x,y,na.rm=TRUE) # ok
[1] 5
sum(x,y,na.rm=TRUE) # ok
[1] 16
pmin(x,y,na.rm=TRUE) # ok
[1] 1 3 4 1
pmax(x,y,na.rm=TRUE) # ok
[1] 2 3 4 5
psum(x,y,na.rm=TRUE)
[1] 3 3 4 6 # expected result
Error: could not find function "psum" # actual result
I realise that + is already like psum, but what about NA?
x+y
[1] 3 NA NA 6 # can't supply `na.rm=TRUE` to `+`
Is there a case to add psum? Or have I missed something?
This question is a follow up from this question :
Using := in data.table to sum the values of two columns in R, ignoring NAs
Following @JoshUlrich's comment on the previous question,
psum <- function(..., na.rm = FALSE) {
  rowSums(do.call(cbind, list(...)), na.rm = na.rm)
}
edit: from Sven Hohenstein:
psum2 <- function(..., na.rm = FALSE) {
  dat <- do.call(cbind, list(...))
  res <- rowSums(dat, na.rm = na.rm)
  # rows where every input is NA: rowSums of the non-NA indicators is 0 there
  idx_na <- !rowSums(!is.na(dat))
  res[idx_na] <- NA
  res
}
x = c(1,3,NA,5,NA)
y = c(2,NA,4,1,NA)
z = c(1,2,3,4,NA)
psum(x,y,na.rm=TRUE)
## [1] 3 3 4 6 0
psum2(x,y,na.rm=TRUE)
## [1] 3 3 4 6 NA
n = 1e7
x = sample(c(1:10,NA),n,replace=TRUE)
y = sample(c(1:10,NA),n,replace=TRUE)
z = sample(c(1:10,NA),n,replace=TRUE)
library(rbenchmark)
benchmark(psum(x, y, z, na.rm = TRUE),
          psum2(x, y, z, na.rm = TRUE),
          pmin(x, y, z, na.rm = TRUE),
          pmax(x, y, z, na.rm = TRUE), replications = 20)
##                           test replications elapsed relative
## 4  pmax(x, y, z, na.rm = TRUE)           20  26.114    1.019
## 3  pmin(x, y, z, na.rm = TRUE)           20  25.632    1.000
## 2 psum2(x, y, z, na.rm = TRUE)           20 164.476    6.417
## 1  psum(x, y, z, na.rm = TRUE)           20  63.719    2.486
Sven's version (which arguably is the correct one) is quite a bit slower,
although whether it matters obviously depends on the application.
Anyone want to hack up an inline/Rcpp version?
As for why this doesn't exist: don't know, but good luck getting R-core to make additions like this ... I can't offhand think of a sufficiently widespread *misc package into which this could go ...
Follow up thread by Matthew on r-devel is here (which seems to confirm) :
r-devel: There is pmin and pmax each taking na.rm, how about psum?
After a quick search on CRAN, there are at least three packages that have a psum function: rccmisc, incadata, and kit. kit seems to be the fastest. Below, I reproduce the example from Ben Bolker's answer.
benchmark(
  rccmisc::psum(x, y, z, na.rm = TRUE),
  incadata::psum(x, y, z, na.rm = TRUE),
  kit::psum(x, y, z, na.rm = TRUE),
  psum(x, y, z, na.rm = TRUE),
  psum2(x, y, z, na.rm = TRUE),
  replications = 20
)
#                                    test replications elapsed relative
# 2 incadata::psum(x, y, z, na.rm = TRUE)           20   20.05   14.220
# 3      kit::psum(x, y, z, na.rm = TRUE)           20    1.41    1.000
# 4           psum(x, y, z, na.rm = TRUE)           20    8.04    5.702
# 5          psum2(x, y, z, na.rm = TRUE)           20   20.44   14.496
# 1  rccmisc::psum(x, y, z, na.rm = TRUE)           20   23.24   16.482
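For reference, kit::psum() has the same calling convention as the versions above (a small usage sketch, assuming the kit package is installed):
x <- c(1,3,NA,5)
y <- c(2,NA,4,1)
kit::psum(x, y, na.rm = TRUE)
# [1] 3 3 4 6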
Another approach, whose advantage is that it also works with matrices, just like pmin and pmax:
psum <- function(..., na.rm = FALSE) {
  plus_na_rm <- function(x, y) ifelse(is.na(x), 0, x) + ifelse(is.na(y), 0, y)
  Reduce(if (na.rm) plus_na_rm else `+`, list(...))
}
x = c(1,3,NA,5)
y = c(2,NA,4,1)
psum(x, y)
#> [1] 3 NA NA 6
psum(x, y, na.rm = TRUE)
#> [1] 3 3 4 6
# With matrices
A <- matrix(1:9, nrow = 3)
B <- matrix(c(NA, 2:8, NA), nrow = 3)
psum(A, B)
#> [,1] [,2] [,3]
#> [1,] NA 8 14
#> [2,] 4 10 16
#> [3,] 6 12 NA
psum(A, B, na.rm = TRUE)
#> [,1] [,2] [,3]
#> [1,] 1 8 14
#> [2,] 4 10 16
#> [3,] 6 12 9
Created on 2020-03-09 by the reprex package (v0.3.0)
One caveat: if an element is NA across all the summed objects and na.rm = TRUE, the result will be 0 (and not NA).
For example:
psum(NA, NA, na.rm = TRUE)
#> [1] 0
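If you want the NA-preserving behaviour of psum2 in this Reduce() version, one possible fix (a sketch of my own, not from the original answer) is to mask positions that are NA in every input:
psum3 <- function(..., na.rm = FALSE) {
  vs <- list(...)
  plus_na_rm <- function(x, y) ifelse(is.na(x), 0, x) + ifelse(is.na(y), 0, y)
  res <- Reduce(if (na.rm) plus_na_rm else `+`, vs)
  if (na.rm) {
    # cells that are NA in every input stay NA
    all_na <- Reduce(`&`, lapply(vs, is.na))
    res[all_na] <- NA
  }
  res
}
psum3(NA, NA, na.rm = TRUE)
#> [1] NA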
