I have some vectors in the following format:
v1 <- c(NA,NA,NA,10,10,10,10)
v2 <- c(NA,NA, 3, 3, 3,NA,NA)
v3 <- c( 5, 5, NA,NA,NA,NA,NA)
For each vector I want to calculate how many leading NAs and trailing NAs it has:
For v1, LeadNA = 3, TrailNA = 0
For v2, LeadNA = 2, TrailNA = 2
For v3, LeadNA = 0, TrailNA = 5
1) cumsum - One option is to apply cumsum over the presence of non-NA elements: the cumulative sum stays at zero for the leading NAs, so summing those zeros gives the count (base R; no packages used).
f1 <- function(vec, trail = FALSE) {
  if (trail) {
    vec <- rev(vec)
  }
  sum(!cumsum(!is.na(vec)))
}
f1(v1)
#[1] 3
f1(v1, TRUE)
#[1] 0
sapply(mget(paste0("v", 1:3)), f1)
# v1 v2 v3
# 3 2 0
sapply(mget(paste0("v", 1:3)), f1, TRUE)
# v1 v2 v3
# 0 2 5
2) rle - Another base R option is rle (no packages are used):
with(rle(is.na(v2)), lengths[values & seq_along(values) %in% c(1, length(values))])
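For v2 this returns the two end-run lengths (note that an end is reported only if the vector actually starts or ends with NA, so v1 would yield a single 3):
#[1] 2 2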
This turns out to be similar to #Bulat's solution:
count_nas <- function(x) {
  nas <- is.na(x)
  if (sum(nas) == length(x)) {
    warning('all elements were NA')
    return(c(start_na = NA_integer_, end_na = NA_integer_))
  }
  c(start_na = which.min(nas) - 1,
    end_na = which.min(rev(nas)) - 1)
}
count_nas(v1)
#start_na end_na
# 3 0
sapply(list(v1,v2,v3), count_nas)
# [,1] [,2] [,3]
#start_na 3 2 0
#end_na 0 2 5
As far as performance goes, this is the fastest method, with #akrun's methods in the same ballpark.
v_test3 <- sample(10000)
v_test3[c(1:3, 9998:10000)] <- NA_integer_
Unit: microseconds
expr min lq mean median uq max neval
akrun_cumsum 175.7 182.15 193.580 186.55 200.80 354.7 100
akrun_rle 168.6 199.25 210.635 209.25 221.00 289.3 100
g_grothen_zoo 1848.5 1904.45 2008.994 1941.40 2001.35 4799.6 100
g_grothen_reduce 12467.3 12888.10 14174.157 13445.15 15054.35 28241.6 100
www_rleid 5357.2 5439.40 5741.471 5517.15 5947.15 8470.4 100
bulat_and_cole 63.5 66.45 73.681 71.25 75.75 96.9 100
Code for reproducibility:
library(microbenchmark)
library(zoo)
library(data.table)
v_test3 <- sample(10000)
v_test3[c(1:3, 9998:10000)] <- NA_integer_
count_nas <- function(x) {
  nas <- is.na(x)
  if (sum(nas) == length(x)) {
    warning('all elements were NA')
    return(c(start_na = NA_integer_, end_na = NA_integer_))
  }
  c(start_na = which.min(nas) - 1,
    end_na = which.min(rev(nas)) - 1)
}
countNA <- function(x) {
  len <- function(fromLast = FALSE) length(na.locf(x, fromLast = fromLast))
  if (all(is.na(x))) c(left = NA, right = NA)
  else length(x) - c(left = len(), right = len(TRUE))
}
f1 <- function(vec, trail = FALSE) {
  if (trail) {
    vec <- rev(vec)
  }
  sum(!cumsum(!is.na(vec)))
}
count_fun <- function(x) {
  y <- rleid(x)
  z <- split(x, y)[c(1, length(unique(y)))]
  ans <- sapply(z, function(x) sum(is.na(x)))
  return(unname(ans))
}
countNA2 <- function(x) {
  f <- function(x) sum(Reduce(all, is.na(x), acc = TRUE))
  if (all(is.na(x))) c(left = NA, right = NA)
  else c(left = f(x), right = f(rev(x)))
}
microbenchmark(
  akrun_cumsum = {
    f1(v_test3, TRUE)
    f1(v_test3, FALSE)
  },
  akrun_rle = {
    with(rle(is.na(v_test3)), lengths[values & seq_along(values) %in% c(1, length(values))])
  },
  g_grothen_zoo = {
    countNA(v_test3)
  },
  g_grothen_reduce = {
    countNA2(v_test3)
  },
  www_rleid = {
    count_fun(v_test3)
  },
  bulat_and_cole = {
    count_nas(v_test3)
  }
)
Wrapper over which.max:
leading.nas <- function(x) {
  if (length(x) == 0) {
    0L
  } else {
    # index of the first non-NA element, minus one
    # (note: returns 0 for an all-NA vector)
    which.max(!is.na(x)) - 1
  }
}
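For example (trailing NAs can be counted by applying it to rev(x)):
leading.nas(v1)
#[1] 3
sapply(list(v1, v2, v3), leading.nas)
#[1] 3 2 0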
Here is a function that returns two numbers: the first is the count of leading NAs, and the second is the count of trailing NAs. It requires the rleid function from the data.table package.
library(data.table)
count_fun <- function(x) {
  y <- rleid(x)
  z <- split(x, y)[c(1, length(unique(y)))]
  ans <- sapply(z, function(x) sum(is.na(x)))
  return(unname(ans))
}
count_fun(v1)
# [1] 3 0
count_fun(v2)
# [1] 2 2
count_fun(v3)
# [1] 0 5
1) na.locf Remove the leading NAs using na.locf and determine the difference in length between the original and the reduced vector. Do the same for the trailing NAs. It is not clear what should be returned if the input vector is empty or all NAs, so we return NA for both left and right.
library(zoo)
countNA <- function(x) {
  len <- function(fromLast = FALSE) length(na.locf(x, fromLast = fromLast))
  if (all(is.na(x))) c(left = NA, right = NA)
  else length(x) - c(left = len(), right = len(TRUE))
}
countNA(v1)
## left right
## 3 0
countNA(v2)
## left right
## 2 2
countNA(v3)
## left right
## 0 5
It would also be possible to use na.fill to perform this calculation.
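For example, here is a minimal sketch of that idea (countNA3 is a name made up for this illustration; it assumes na.fill accepts the three-part left/interior/right fill specification, as used later in this thread): mark the leading NAs with -Inf and the trailing NAs with Inf, then count the markers.
library(zoo)
countNA3 <- function(x) {
  if (all(is.na(x))) return(c(left = NA, right = NA))
  ## leading NAs -> -Inf, trailing NAs -> Inf, interior NAs left as NA
  y <- na.fill(x, fill = c(-Inf, NA, Inf))
  c(left = sum(y == -Inf, na.rm = TRUE), right = sum(y == Inf, na.rm = TRUE))
}
countNA3(v2)
## left right
##    2     2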
2) Reduce A second approach is to use Reduce. It gives the same answer. No packages are used.
countNA2 <- function(x) {
  f <- function(x) sum(Reduce(all, is.na(x), acc = TRUE))
  if (all(is.na(x))) c(left = NA, right = NA)
  else c(left = f(x), right = f(rev(x)))
}
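For example:
countNA2(v2)
## left right
##    2     2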
I have a matrix X:
one two three four
[1,] 1 3 2 4
[2,] 2 0 1 5
[3,] 3 2 1 4
[4,] 4 9 11 19
[5,] 4 3 2 1
I want to get a new matrix Y which only contains rows that are permutations of "1", "2", "3", "4". That is:
one two three four
[1,] 1 3 2 4
[3,] 3 2 1 4
[5,] 4 3 2 1
What function or command should I use?
mat <- rbind(
c(1, 3, 2, 4),
c(2, 0, 1, 5),
c(3, 2, 1, 4)
)
ok <- apply(mat, 1L, function(x) setequal(x, c(1, 2, 3, 4)))
mat[ok, ]
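This keeps the first and third rows:
#      [,1] [,2] [,3] [,4]
# [1,]    1    3    2    4
# [2,]    3    2    1    4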
Your example matrix and target vector:
X <- structure(c(1, 2, 3, 4, 4, 3, 0, 2, 9, 3, 2, 1, 1, 11, 2, 4, 5, 4, 19, 1),
dim = 5:4)
v <- 1:4
But let's construct a more challenging one (thanks to user harre):
X <- rbind(X, 1, c(1, 2, 1, 2))
A fully vectorized approach (using package matrixStats)
rk <- matrix(match(X, v, nomatch = 0L), nrow(X), ncol(X))
ct <- matrixStats::rowTabulates(rk, values = 1:length(v))
zo <- matrixStats::rowCounts(ct, value = 0L)
## all rows that are permutations of 'v'
X[zo == 0L, ]
# [,1] [,2] [,3] [,4]
#[1,] 1 3 2 4
#[2,] 3 2 1 4
#[3,] 4 3 2 1
## remove rows that are permutations of 'v'
X[zo > 0L, ]
Another fully vectorized method (base R)
This is a mathematical solution. For a nonlinear and asymmetric weight function w(x), the row-wise weighted sum
1*w(1) + 2*w(2) + 3*w(3) + 4*w(4)
is a unique score or identifier that is invariant to permutations. So, for example, the following gives the same value:
2*w(2) + 1*w(1) + 3*w(3) + 4*w(4)
But anything else will give a different value, like:
1*w(1) + 3*w(3) + 3*w(3) + 4*w(4)
0*w(0) + 3*w(3) + 0*w(0) + 4*w(4)
Here is an implementation using cosine weights. It works even if X and v are floating point numbers or characters.
## method = "tab" for tabulation method
## method = "cos" for cosine weights method
FindPerm <- function (X, v, method) {
  ## number of elements
  n <- length(v)
  if (ncol(X) != n) stop("inconformable 'X' and 'v'!")
  if (anyDuplicated(v)) stop("'v' should not contain duplicated values!")
  ## convert everything to integers 0, 1, 2, ..., n
  Xi <- matrix(match(X, v, nomatch = 0L), nrow(X), ncol(X))
  vi <- 1:n
  ## branches
  if (method == "tab") {
    ## row-wise tabulating
    rtab <- matrixStats::rowTabulates(Xi, values = vi)
    ## the i-th value is TRUE if X[i, ] is a permutation of v
    matrixStats::rowCounts(rtab, value = 0L) == 0L
  } else if (method == "cos") {
    ## evaluate cosine weights for Xi and vi
    w <- pi / (n + 1)
    cos.Xi <- cos(w * Xi)
    cos.vi <- cos(w * vi)
    ## weighted sum for Xi
    wtsum.Xi <- rowSums(Xi * cos.Xi)
    ## weighted sum for vi
    wtsum.vi <- sum(vi * cos.vi)
    ## the i-th value is TRUE if X[i, ] is a permutation of v
    wtsum.Xi == wtsum.vi
  } else {
    stop("unknown method!")
  }
}
X[FindPerm(X, v, "cos"), ]
# [,1] [,2] [,3] [,4]
#[1,] 1 3 2 4
#[2,] 3 2 1 4
#[3,] 4 3 2 1
Benchmark
Performance depends on the number of values in v. The tabulation method will slow down as v becomes long.
## a benchmark function, relying on package "microbenchmark"
## nr: number of matrix rows
## nc: number of elements in 'v'
bm <- function (nr, nc) {
  X <- matrix(sample.int(nc + 1L, nr * nc, replace = TRUE), nr)
  v <- 1:nc
  microbenchmark::microbenchmark("tab" = FindPerm(X, v, "tab"),
                                 "cos" = FindPerm(X, v, "cos"),
                                 check = "identical")
}
bm(2e+4, 4)
#Unit: milliseconds
# expr min lq mean median uq max
# tab 4.302674 4.324236 4.536260 4.336955 4.359814 7.039699
# cos 4.846893 4.872361 5.163209 4.882942 4.901288 7.837580
bm(2e+4, 20)
#Unit: milliseconds
# expr min lq mean median uq max
# tab 30.63438 30.70217 32.73508 30.77588 33.08046 135.64322
# cos 21.16669 21.26161 22.28298 21.37563 23.60574 26.31775
Update: since there's so much interest in this question, here's a method that uses indexing to give a speed boost on Zheyuan Li's excellent generalization of my original answer.
The idea is to index into a length(v)-dimensional array for small v, or to index into v*sin(w*v) using the results of match instead of calculating X*sin(w*X) when v is large:
library(RcppAlgos)
# simplified version of Zheyuan Li's function
f1 <- function(X, v) {
  n <- length(v)
  Xi <- matrix(match(X, v, nomatch = 0L), nrow(X), ncol(X))
  vi <- 1:n
  w <- pi/(n + 1)
  rowSums(Xi*sin(Xi*w)) == sum(vi*sin(vi*w))
}

f2 <- function(X, v) {
  n <- length(v)
  if (n < 6) {
    # index an n-dimensional array
    m <- array(FALSE, rep(n + 1L, n))
    m[permuteGeneral(n)] <- TRUE
    X[] <- match(X, v, nomatch = length(v) + 1L)
    m[X]
  } else {
    nn <- 1:n
    u <- c(nn*sin(pi*nn/(n + 1L)), 0)
    X[] <- u[match(X, v, nomatch = n + 1L)]
    rowSums(X) == sum(u)
  }
}
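To see the array-indexing idea on a tiny, hypothetical case (n = 2, v = 1:2; the names m and rows are made up for this illustration, which is not part of the benchmark): the lookup array holds TRUE exactly at the coordinates that are permutations of v, and indexing it with a matrix of matched codes flags all rows in one shot.
# a 3x3 lookup array with TRUE at the permutation coordinates (1,2) and (2,1)
m <- array(FALSE, c(3, 3))
m[rbind(c(1, 2), c(2, 1))] <- TRUE
# three candidate rows, already matched to integer codes
rows <- rbind(c(1, 2), c(2, 2), c(2, 1))
m[rows]
#> [1]  TRUE FALSE  TRUE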
set.seed(123)
# using Zheyuan Li's test dataset
nr <- 2000; nc <- 4
X <- matrix(sample.int(nc + 1L, nr * nc, replace = TRUE), nr)
v <- 1:nc
microbenchmark::microbenchmark(f1 = f1(X, v),
f2 = f2(X, v),
check = "identical")
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> f1 344.4 367.25 438.932 374.05 386.75 5960.6 100
#> f2 81.9 85.00 163.332 88.90 98.50 6924.4 100
# Zheyuan Li's larger test dataset
set.seed(123)
nr <- 2000; nc <- 20
X <- matrix(sample.int(nc + 1L, nr * nc, replace = TRUE), nr)
v <- 1:nc
microbenchmark::microbenchmark(f1 = f1(X, v),
f2 = f2(X, v),
check = "identical")
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> f1 1569.2 1575.45 1653.510 1601.30 1683.6 3972.6 100
#> f2 355.2 359.90 431.705 366.85 408.6 2253.8 100
Original answer edited to use X + exp(1/X) (see comments).
This should work with positive integers:
Y <- X[rowSums(X + exp(1/X)) == sum(1:4 + exp(1/(1:4))),]
Benchmarking against the apply solution:
f1 <- function(x) x[apply(x, 1L, function(x) setequal(x, 1:4)),]
f2 <- function(x) x[rowSums(x + exp(1/x)) == sum(1:4 + exp(1/(1:4))),]
X <- matrix(sample(10, 4e5, TRUE), 1e5)
microbenchmark::microbenchmark(f1 = f1(X),
f2 = f2(X),
times = 10,
check = "equal")
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> f1 448.2680 450.8778 468.55179 461.62620 472.0022 542.0455 10
#> f2 28.5362 28.6889 31.50941 29.44845 30.2693 50.4402 10
This question is HOT, and I am taking it as a good opportunity to learn. It is really difficult for me to come up with new solutions, but I found two things missing here:
there is no serious validation of these answers;
there is no benchmark for all of them.
I would like to convert each answer to a function that returns a TRUE/FALSE vector for flagging rows. I also want this function to work with any matrix and any vector.
Stéphane Laurent's answer, Zheyuan Li's answer and ThomasIsCoding's answer need minimal adaptation.
Mohamed Desouky's answer is also easy to adapt, by taking out the function applied in Filter() and applying it over matrix rows.
jblood94's answer is challenging. It was commented that for other matrices and vectors, conversion is needed using match. I don't know what the appropriate way is, but I saw match in Zheyuan Li's answer, so I borrowed that part.
TarJae's answer is awful (sorry; don't take this as an insult): none of its variants seems to work. I don't see any comparison between matrix rows and the vector in the base R solution, and for the other tidyverse code I don't know what df_matrix is. I have requested TarJae to please revise the answer.
harre's answer uses tidyverse and does not return TRUE/FALSE, so I have to exclude it from the benchmark (sorry).
Here are the functions for the benchmark.
S.Laurent <- function (X, v) apply(X, 1L, function(x) setequal(x, v))
Z.Li <- function (X, v, method) {
  ## number of elements
  n <- length(v)
  if (ncol(X) != n) stop("inconformable 'X' and 'v'!")
  if (anyDuplicated(v)) stop("'v' should not contain duplicated values!")
  ## convert everything to integers 0, 1, 2, ..., n
  Xi <- matrix(match(X, v, nomatch = 0L), nrow(X), ncol(X))
  vi <- 1:n
  ## branches
  if (method == "tab") {
    ## row-wise tabulating
    rtab <- matrixStats::rowTabulates(Xi, values = vi)
    ## the i-th value is TRUE if X[i, ] is a permutation of v
    matrixStats::rowCounts(rtab, value = 0L) == 0L
  } else if (method == "cos") {
    ## evaluate cosine weights for Xi and vi
    w <- pi / (n + 1)
    cos.Xi <- cos(w * Xi)
    cos.vi <- cos(w * vi)
    ## weighted sum for Xi
    wtsum.Xi <- rowSums(Xi * cos.Xi)
    ## weighted sum for vi
    wtsum.vi <- sum(vi * cos.vi)
    ## the i-th value is TRUE if X[i, ] is a permutation of v
    wtsum.Xi == wtsum.vi
  } else {
    stop("unknown method!")
  }
}
Thomas <- function (X, v) colSums(mapply(`%in%`, list(v), asplit(X, 1))) == ncol(X)
M.Desouky <- function (X, v) apply(X, 1, function (x) all((x %in% v) & length(unique(x)) == length(v)))
jblood94 <- function (X, v) {
  Xi <- matrix(match(X, v, nomatch = 0L), nrow(X), ncol(X))
  vi <- 1:length(v)
  rowSums(Xi + exp(1/Xi)) == sum(vi + exp(1/vi))
}
For the benchmark, I followed the setup in Zheyuan Li's answer.
library(matrixStats)
library(microbenchmark); library(ggplot2)
nr <- 2000; nc <- 4
X <- matrix(sample.int(nc + 1L, nr * nc, replace = TRUE), nr)
v <- 1:nc
microbenchmark("S.Laurent" = S.Laurent(X, v),
"Z.Li.tab" = Z.Li(X, v, "tab"),
"Z.Li.cos" = Z.Li(X, v, "cos"),
"Thomas" = Thomas(X, v),
"M.Desouky" = M.Desouky(X, v),
"jblood94" = jblood94(X, v),
check = "identical") -> bm1
autoplot(bm1)
nr <- 2000; nc <- 20
X <- matrix(sample.int(nc + 1L, nr * nc, replace = TRUE), nr)
v <- 1:nc
microbenchmark("S.Laurent" = S.Laurent(X, v),
"Z.Li.tab" = Z.Li(X, v, "tab"),
"Z.Li.cos" = Z.Li(X, v, "cos"),
"Thomas" = Thomas(X, v),
"M.Desouky" = M.Desouky(X, v),
"jblood94" = jblood94(X, v),
check = "identical") -> bm2
autoplot(bm2)
I don't know exactly how time is transformed for plotting (presumably a log scale), but clearly it is not the usual linear scale: the methods to the left are far faster than the plot suggests!
Conclusion: Zheyuan Li's "cos" method is the winner.
We can try this:
> mat[colSums(mapply(`%in%`, list(1:4), asplit(mat, 1))) == ncol(mat), ]
[,1] [,2] [,3] [,4]
[1,] 1 3 2 4
[2,] 3 2 1 4
[3,] 4 3 2 1
Another option is using the Filter function:
t(Filter(\(x) all((x %in% 1:4) & length(unique(x)) == 4),
         data.frame(t(X))))
Just for fun:
Who can give me 4 distinct digits whose sum equals 6? There is just {0,1,2,3}, so we can use the remainder modulo 4 via %%:
X[apply(X, 1, \(x) sum(unique(x %% 4)) == 6 & length(unique(x)) == 4), ]
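This selects the same three permutation rows:
#      [,1] [,2] [,3] [,4]
# [1,]    1    3    2    4
# [2,]    3    2    1    4
# [3,]    4    3    2    1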
Or, using a pure for loop:
ans <- data.frame(matrix(NA, ncol = ncol(X)))
r <- 1
for (i in 1:nrow(X)) {
  if (all((X[i, ] %in% 1:4) & length(unique(X[i, ])) == 4)) {
    ans[r, ] <- X[i, ]
    r <- r + 1
  }
}
ans <- as.matrix(ans)
output
X1 X2 X3 X4
1 1 3 2 4
2 3 2 1 4
3 4 3 2 1
data
X <- matrix(c(1, 2, 3, 4, 4, 3, 0, 2, 9, 3, 2, 1, 1, 11, 2, 4,
5, 4, 19, 1) , ncol = 4)
For the fun of tidyverse solutions, even if I think we'd rather work on the matrices directly: we could use rowwise() and c_across().
With set-operations (inspired by #Stéphane Laurent):
library(dplyr)
mat |>
  as_tibble() |>
  rowwise() |>
  filter(setequal(c_across(), c(1, 2, 3, 4))) |>
  ungroup() |>
  as.matrix()
Or without set-operations:
library(dplyr)
mat |>
  as_tibble() |>
  rowwise() |>
  filter(1 %in% c_across(everything()) &
         2 %in% c_across(everything()) &
         3 %in% c_across(everything()) &
         4 %in% c_across(everything())) |>
  ungroup() |>
  as.matrix()
Or inspired by #Mohamed Desouky:
mat |>
  as_tibble() |>
  rowwise() |>
  filter(all(c_across() %in% 1:4) & n_distinct(c_across()) == 4) |>
  ungroup() |>
  as.matrix()
And so on..
The C++ <algorithm> library offers a function called std::is_permutation that does just the trick.
The workhorse function below uses Rcpp and is fairly straightforward.
#include <Rcpp.h>
#include <algorithm>  // for std::is_permutation

// [[Rcpp::export]]
SEXP perm_idx_cpp(Rcpp::IntegerMatrix mat, const std::vector<int> &v) {
  const int nRows = mat.nrow();
  const int nCols = mat.ncol();
  std::vector<int> test(nCols);
  Rcpp::LogicalVector res(nRows);

  for (int i = 0; i < nRows; ++i) {
    // copy row i into a scratch vector
    for (int j = 0; j < nCols; ++j) {
      test[j] = mat(i, j);
    }
    res[i] = std::is_permutation(
      test.begin(), test.end(), v.begin()
    );
  }

  return res;
}
And calling it in R we have (N.B. We use match to get integer indices as in #Zheyuan Li's answer, which is absolutely brilliant btw!):
get_perm_idx <- function(X, v) {
  Xi <- matrix(match(X, v, nomatch = 0L), ncol = ncol(X))
  perm_idx_cpp(Xi, seq_along(v))
}
It is very efficient as well. Here is a simple benchmark:
nr <- 2e4
nc <- 20
X <- matrix(sample.int(nc + 1L, nr * nc, replace = TRUE), nr)
v <- 1:nc
microbenchmark("tab" = FindPerm(X, v, "tab"),
"cos" = FindPerm(X, v, "cos"),
"is_perm_cpp" = get_perm_idx(X, v),
check = "identical")
Unit: milliseconds
expr min lq mean median uq max neval
tab 33.641345 36.479660 39.00994 37.402306 39.560015 54.88057 100
cos 9.496309 12.887493 15.30122 13.306302 14.053643 132.24079 100
is_perm_cpp 3.232093 4.819553 6.08687 4.993367 5.248818 19.56919 100
You could probably squeeze out some extra efficiency, but it is tough to beat the simplicity here.
I'm an R newbie. I've got a vector
vec <- c(105,29,41,70,77,0,56,49,63,0,105)
and I would like to sum the values until a 0 occurs, and then collect those sums into a vector, such as:
vec2 <- c(322,168,105)
But I really don't know where to start! Any suggestions?
Starting with this vector...
> vec
[1] 105 29 41 70 77 0 56 49 63 0 105
We can compute a logical TRUE/FALSE vector of where the zeroes are:
> vec == 0
[1] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE
In arithmetic, FALSE counts as zero and TRUE as one, so as we add that vector up, the value increases every time we reach a TRUE. Using cumsum for the cumulative sum, we get:
> cumsum(vec==0)
[1] 0 0 0 0 0 1 1 1 1 2 2
Now that result defines the groups that we want to add up within, so let's split vec by that result:
> split(vec, cumsum(vec==0))
$`0`
[1] 105 29 41 70 77
$`1`
[1] 0 56 49 63
$`2`
[1] 0 105
So apart from the zeroes at the start of the second and subsequent parts of the list, those are the numbers we want to add up. Because we are summing, the zeroes make no difference (but if you wanted the mean you would have to drop them). Now we use sapply to iterate over the list elements and compute each sum:
> sapply(split(vec, cumsum(vec==0)),sum)
0 1 2
322 168 105
Job done. Ignore the 0 1 2 labels.
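If the labels bother you, unname() drops them:
unname(sapply(split(vec, cumsum(vec == 0)), sum))
# [1] 322 168 105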
The aggregate function is useful for this kind of thing. You create a grouping variable with cumsum (similar to what #Spacedman explained) and use sum as the aggregating operation. The [[2]] at the end just extracts what you want from what aggregate returns:
aggregate(vec, by = list(cumsum(vec == 0)), FUN = sum)[[2]]
[1] 322 168 105
Another option is by
as.numeric(by(vec, cumsum(vec == 0), sum))
#[1] 322 168 105
Benchmark
A benchmark comparison of the methods for a larger vector, based on microbenchmark:
# Create sample vector with N entries
set.seed(2018)
N <- 10000
vec <- sample(100, N, replace = T)
vec[sample(length(vec), 100)] <- 0
library(microbenchmark)
res <- microbenchmark(
  vapply = {
    I <- which(vec == 0)
    vapply(1:(length(I) + 1),
           function(k) sum(vec[max(I[k-1], 1):min(I[k], length(vec), na.rm = TRUE)]),
           numeric(1))
  },
  by = {
    as.numeric(by(vec, cumsum(vec == 0), sum))
  },
  aggregate = {
    aggregate(vec, by = list(cumsum(vec == 0)), FUN = sum)[[2]]
  },
  split = {
    sapply(split(vec, cumsum(vec == 0)), sum)
  },
  Reduce = {
    ans <- numeric(0)
    s <- n <- 0
    Reduce(f = function(y, x) {
      if (x == 0) {
        ans <<- c(ans, s)
        s <<- 0
      }
      n <<- n + 1
      s <<- x + s
      if (n == length(vec))
        ans <<- c(ans, s)
      s
    }, vec, init = 0, accumulate = TRUE)
    ans
  },
  for_loop = {
    I <- which(vec == 0)
    n <- length(vec)
    N <- length(I) + 1
    res <- numeric(N)
    for (k in seq_along(res)) {
      if (k == 1) {
        res[k] <- sum(vec[1:I[1]])
        next
      }
      if (k == N) {
        res[k] <- sum(vec[I[N-1]:n])
        next
      }
      res[k] <- sum(vec[I[k-1]:I[k]])
    }
    res
  }
)
res
# Unit: microseconds
# expr min lq mean median uq max
# vapply 435.658 487.4230 621.6155 511.3625 607.2005 6175.039
# by 3897.401 4187.2825 4721.3168 4436.5850 4936.2900 12365.351
# aggregate 4817.032 5392.0620 6002.2579 5831.2905 6310.3665 9782.524
# split 611.175 758.4485 895.2201 838.7665 957.0085 1516.556
# Reduce 21372.054 22169.9110 25363.8684 23022.6920 25503.6145 49255.714
# for_loop 15172.255 15846.5735 17252.6895 16445.7900 17572.7535 34401.827
library(ggplot2)
autoplot(res)
With vapply
Here is an option with vapply
I <- which(vec == 0)
vapply(1:(length(I) + 1),
       function(k) sum(vec[max(I[k-1], 1):min(I[k], length(vec), na.rm = TRUE)]),
       numeric(1))
# [1] 322 168 105
With Reduce
Here is a solution using Reduce
ans <- numeric(0)
s <- n <- 0
Reduce(f = function(y, x) {
  if (x == 0) {
    ans <<- c(ans, s)
    s <<- 0
  }
  n <<- n + 1
  s <<- x + s
  if (n == length(vec))
    ans <<- c(ans, s)
  s
}, vec, init = 0, accumulate = TRUE)
ans
# [1] 322 168 105
With A Loop
Or maybe an old-fashioned loop:
I <- which(vec == 0)
n <- length(vec)
N <- length(I) + 1
res <- numeric(N)
for (k in seq_along(res)) {
  if (k == 1) {
    res[k] <- sum(vec[1:I[1]])
    next
  }
  if (k == N) {
    res[k] <- sum(vec[I[N-1]:n])
    next
  }
  res[k] <- sum(vec[I[k-1]:I[k]])
}
res
# [1] 322 168 105
Benchmarking
Data
Here is the data used for benchmarking
# c.f. #MauritsEvers
# Create sample vector with N entries
set.seed(2018)
N <- 10000
vec <- sample(100, N, replace = T)
vec[sample(length(vec), 100)] <- 0
Functions
Here are the functions used for the second set of benchmarking figures:
reduce <- function(vec) {
  ans <- numeric(0)
  s <- n <- 0
  Reduce(f = function(y, x) {
    if (x == 0) {
      ans <<- c(ans, s)
      s <<- 0
    }
    n <<- n + 1
    s <<- x + s
    if (n == length(vec))
      ans <<- c(ans, s)
    s
  }, vec, init = 0, accumulate = TRUE)
  ans
}
Vapply <- function(vec) {
  I <- which(vec == 0)
  vapply(1:(length(I) + 1),
         function(k) sum(vec[max(I[k-1], 1):min(I[k], length(vec), na.rm = TRUE)]),
         numeric(1))
}
By <- function(vec) as.numeric(by(vec, cumsum(vec == 0), sum))
Split <- function(vec) sapply(split(vec, cumsum(vec == 0)), sum)
Aggregate <- function(vec) aggregate(vec, by = list(cumsum(vec == 0)), FUN = sum)[[2]]
for_loop <- function(vec) {
  I <- which(vec == 0)
  n <- length(vec)
  N <- length(I) + 1
  res <- numeric(N)
  for (k in seq_along(res)) {
    if (k == 1) {
      res[k] <- sum(vec[1:I[1]])
      next
    }
    if (k == N) {
      res[k] <- sum(vec[I[N-1]:n])
      next
    }
    res[k] <- sum(vec[I[k-1]:I[k]])
  }
  res
}
Rowsum <- function(vec) rowsum(vec, cumsum(vec == 0))
Benchmarking
Here are the two benchmarking processes combined:
# c.f. #MauritsEvers
resBoth <- microbenchmark::microbenchmark(
  Vapply = {
    I <- which(vec == 0)
    vapply(1:(length(I) + 1),
           function(k) sum(vec[max(I[k-1], 1):min(I[k], length(vec), na.rm = TRUE)]),
           numeric(1))
  },
  Vapply(vec),
  By = {
    as.numeric(by(vec, cumsum(vec == 0), sum))
  },
  By(vec),
  Aggregate = {
    aggregate(vec, by = list(cumsum(vec == 0)), FUN = sum)[[2]]
  },
  Aggregate(vec),
  Split = {
    sapply(split(vec, cumsum(vec == 0)), sum)
  },
  Split(vec),
  reduce = {
    ans <- numeric(0)
    s <- n <- 0
    Reduce(f = function(y, x) {
      if (x == 0) {
        ans <<- c(ans, s)
        s <<- 0
      }
      n <<- n + 1
      s <<- x + s
      if (n == length(vec))
        ans <<- c(ans, s)
      s
    }, vec, init = 0, accumulate = TRUE)
    ans
  },
  reduce(vec),
  for_loop = {
    I <- which(vec == 0)
    n <- length(vec)
    N <- length(I) + 1
    res <- numeric(N)
    for (k in seq_along(res)) {
      if (k == 1) {
        res[k] <- sum(vec[1:I[1]])
        next
      }
      if (k == N) {
        res[k] <- sum(vec[I[N-1]:n])
        next
      }
      res[k] <- sum(vec[I[k-1]:I[k]])
    }
    res
  },
  for_loop(vec),
  Rowsum = {rowsum(vec, cumsum(vec == 0))},
  Rowsum(vec),
  times = 10^3
)
Results
Here are the benchmarking results
resBoth
# Unit: microseconds
# expr min lq mean median uq max neval cld
# Vapply 234.121 281.5280 358.0708 311.7955 343.5215 4775.018 1000 ab
# Vapply(vec) 234.850 278.6100 376.3956 306.3260 334.4050 14564.278 1000 ab
# By 1866.029 2108.7175 2468.1208 2209.0025 2370.5520 23316.045 1000 c
# By(vec) 1870.769 2120.5695 2473.1643 2217.3900 2390.6090 21039.762 1000 c
# Aggregate 2738.324 3015.6570 3298.0863 3117.9480 3313.2295 13328.404 1000 d
# Aggregate(vec) 2733.583 2998.1530 3295.6874 3109.1955 3349.1500 8277.694 1000 d
# Split 359.202 412.0800 478.0553 444.1710 492.3080 4622.220 1000 b
# Split(vec) 366.131 410.4395 475.2633 444.1715 490.3025 4601.799 1000 b
# reduce 10862.491 13062.3755 15353.2826 14465.0870 16559.3990 76305.463 1000 g
# reduce(vec) 10403.004 12448.9965 14658.4035 13825.9995 15893.3255 67337.080 1000 f
# for_loop 6687.724 7429.4670 8518.0470 7818.0250 9023.9955 27541.136 1000 e
# for_loop(vec) 123.624 145.8690 187.2201 157.5390 177.4140 9928.200 1000 a
# Rowsum 235.579 264.3880 305.7516 282.2570 322.7360 792.068 1000 ab
# Rowsum(vec) 239.590 264.9350 307.2508 284.8100 322.0060 1778.143 1000 ab
rowsum() is known to be quite fast. We can use cumsum(vec == 0) for the grouping.
c(rowsum(vec, cumsum(vec == 0)))
# [1] 322 168 105
I'm trying to create a variant of pmax / pmin that works with an additional filter_value parameter across an arbitrary set of columns that would be defined using .SD / .SDcols. The first version of the function below hard-codes the filter value, but works with .SD:
testFuncV1 <- function(...) {
  cols <- list(...)
  num_cols <- length(cols)
  num_records <- length(cols[[1]])
  max_records <- c()
  for (record_num in 1:num_records) {
    v <- c()
    for (l in cols) {
      v <- c(v, l[[record_num]])
    }
    filt_v <- Filter(function(x) { x <= 1 }, v)
    if (length(filt_v) == 0) {
      max_records <- c(max_records, NA)
    } else {
      max_records <- c(max_records, max(filt_v))
    }
  }
  max_records
}
test_dt_v1 <- data.table(a = c(1,3,5), b = c(2,3,-1), c = c(-3, 5, 2))
test_dt_v1[, max_with_filter := do.call(testFuncV1, .SD), .SDcols = c('a', 'b', 'c')]
returns:
a b c max_with_filter
1: 1 2 -3 1
2: 3 3 5 NA
3: 5 -1 2 -1
The second version of the function below takes a second filter parameter, but I was not able to get it to work with .SD; instead, I had to pass the individual column vectors in as a list:
testFuncV2 <- function(cols, filter) {
  num_cols <- length(cols)
  num_records <- length(cols[[1]])
  max_records <- c()
  for (record_num in 1:num_records) {
    v <- c()
    for (l in cols) {
      v <- c(v, l[[record_num]])
    }
    filt_v <- Filter(function(x) { x <= filter }, v)
    if (length(filt_v) == 0) {
      max_records <- c(max_records, NA)
    } else {
      max_records <- c(max_records, max(filt_v))
    }
  }
  max_records
}
test_dt_v2 <- data.table(a = c(1,3,5), b = c(2,3,-1), c = c(-3, 5, 2))
test_dt_v2[, max_with_filter := do.call(testFuncV2, list(list(test_dt_v2$a, test_dt_v2$b, test_dt_v2$c), 1))]
also returns:
a b c max_with_filter
1: 1 2 -3 1
2: 3 3 5 NA
3: 5 -1 2 -1
Ideally, I'd be able to either figure out an approach that works with .SD using do.call, or substitute in something that works with lapply (which I also experimented with, to no avail). Thanks in advance!
Here is an option using apply(MARGIN=1, ...)
func <- function(x, threshold) {
  if (any(x <= threshold)) return(max(x[x <= threshold]))
  NA
}
test_dt_v1[, max_with_filter := apply(.SD, 1, func, threshold = 1),
           .SDcols = c("a", "b", "c")]
Another option uses do.call and pmax, after converting values above 1 to NA (the idea came from "rowwise maximum for R"):
test_dt_v1[, max_with_filter := do.call(pmax, c(`is.na<-`(.SD, .SD>1), na.rm=T))]
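The `is.na<-` replacement form may look unusual; here is a minimal illustration on a hypothetical toy vector (not from the question):
x <- c(1, 2, -3)
is.na(x) <- x > 1  # equivalent to x[x > 1] <- NA
x
# [1]  1 NA -3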
I have a data.frame that has several variables with zero values. I need to construct an extra variable that would return the combination of variables that are not zero for each observation. E.g.
df <- data.frame(firm = c("firm1", "firm2", "firm3", "firm4", "firm5"),
A = c(0, 0, 0, 1, 2),
B = c(0, 1, 0, 42, 0),
C = c(1, 1, 0, 0, 0))
Now I would like to generate the new variable:
df$varCombination <- c("C", "B-C", NA, "A-B", "A")
I thought up something like this, which obviously did not work:
for (i in 1:nrow(df)){
df$varCombination[i] <- paste(names(df[i,2:ncol(df) & > 0]), collapse = "-")
}
This could probably be solved easily using apply(df, 1, fun), but here is an attempt to solve it column-wise instead of row-wise for performance's sake (I once saw something similar done by #alexis_laz but can't find it right now).
## Create a logical matrix
tmp <- df[-1] != 0
## or tmp <- sapply(df[-1], `!=`, 0)
## Preallocate the result
res <- rep(NA, nrow(tmp))
## Run per column instead of per row
for (j in colnames(tmp)) {
  res[tmp[, j]] <- paste(res[tmp[, j]], j, sep = "-")
}
## Remove the pre-allocated `NA` values from non-NA entries
gsub("NA-", "", res, fixed = TRUE)
# [1] "C" "B-C" NA "A-B" "A"
Some benchmarks on a bigger data set
set.seed(123)
BigDF <- as.data.frame(matrix(sample(0:1, 1e4, replace = TRUE), ncol = 10))
library(microbenchmark)
MM <- function(df) {
  var_names <- names(df)[-1]
  res <- character(nrow(df))
  for (i in 1:nrow(df)) {
    non_zero_names <- var_names[df[i, -1] > 0]
    res[i] <- paste(non_zero_names, collapse = '-')
  }
  res
}
ZX <- function(df) {
  res <-
    apply(df[, 2:ncol(df)] > 0, 1,
          function(i) paste(colnames(df[, 2:ncol(df)])[i], collapse = "-"))
  res[res == ""] <- NA
  res
}
DA <- function(df) {
  tmp <- df[-1] != 0
  res <- rep(NA, nrow(tmp))
  for (j in colnames(tmp)) {
    res[tmp[, j]] <- paste(res[tmp[, j]], j, sep = "-")
  }
  gsub("NA-", "", res, fixed = TRUE)
}
microbenchmark(MM(BigDF), ZX(BigDF), DA(BigDF))
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# MM(BigDF) 239.36704 248.737408 253.159460 252.177439 255.144048 289.340528 100 c
# ZX(BigDF) 35.83482 37.617473 38.295425 38.022897 38.357285 76.619853 100 b
# DA(BigDF) 1.62682 1.662979 1.734723 1.735296 1.761695 2.725659 100 a
Using apply:
# paste column names
df$varCombination <-
  apply(df[, 2:ncol(df)] > 0, 1,
        function(i) paste(colnames(df[, 2:ncol(df)])[i], collapse = "-"))
# convert blank to NA
df$varCombination[df$varCombination == ""] <- NA
# result
df
# firm A B C varCombination
# 1 firm1 0 0 1 C
# 2 firm2 0 1 1 B-C
# 3 firm3 0 0 0 <NA>
# 4 firm4 1 42 0 A-B
# 5 firm5 2 0 0 A
You had the right idea, but the logical comparison in your loop wasn't correct.
I've attempted to keep the code fairly similar to what you had before; this should work:
var_names <- names(df)[-1]
df$varCombination <- character(nrow(df))
for (i in 1:nrow(df)) {
  non_zero_names <- var_names[df[i, -1] > 0]
  df$varCombination[i] <- paste(non_zero_names, collapse = '-')
}
> df
firm A B C varCombination
1 firm1 0 0 1 C
2 firm2 0 1 1 B-C
3 firm3 0 0 0
4 firm4 1 42 0 A-B
5 firm5 2 0 0 A
What would be a good way to populate NA values with the previous value times (1 + growth)?
df <- data.frame(
  year = 0:6,
  price1 = c(1.1, 2.1, 3.2, 4.8, NA, NA, NA),
  price2 = c(1.1, 2.1, 3.2, NA, NA, NA, NA)
)
growth <- .02
In this case, I would want the missing values in price1 to be filled with 4.8*1.02, 4.8*1.02^2, and 4.8*1.02^3. Similarly, I would want the missing values in price2 to be filled with 3.2*1.02, 3.2*1.02^2, 3.2*1.02^3, and 3.2*1.02^4.
I've tried this, but I think it needs to be set to repeat somehow (apply?):
library(dplyr)
df %>%
  mutate(price1 = ifelse(is.na(price1),
                         lag(price1) * (1 + growth),
                         price1))
I'm not using dplyr for anything else (yet), so something from base R or plyr or similar would be appreciated.
Assuming only trailing NAs:
NAgrow <- function(x, growth = 0.02) {
  isna <- is.na(x)
  lastval <- tail(x[!isna], 1)
  x[isna] <- lastval * (1 + growth)^seq(sum(isna))
  return(x)
}
If there are interior NA values as well this would get a little trickier.
Apply to all columns except the first:
df[-1] <- lapply(df[-1],NAgrow)
## year price1 price2
## 1 0 1.100000 1.100000
## 2 1 2.100000 2.100000
## 3 2 3.200000 3.200000
## 4 3 4.800000 3.264000
## 5 4 4.896000 3.329280
## 6 5 4.993920 3.395866
## 7 6 5.093798 3.463783
A compact base R solution can be obtained using Reduce:
growthfun <- function(x, y) if (is.na(y)) (1+growth)*x else y
replace(df, TRUE, lapply(df, Reduce, f = growthfun, acc = TRUE))
giving:
year price1 price2
1 0 1.100000 1.100000
2 1 2.100000 2.100000
3 2 3.200000 3.200000
4 3 4.800000 3.264000
5 4 4.896000 3.329280
6 5 4.993920 3.395866
7 6 5.093798 3.463783
Note: The data in the question has no non-trailing NA values but if there were some then we could use na.fill from zoo to first replace the trailing NAs with a special value, such as NaN, and look for it instead of NA:
library(zoo)
DF <- as.data.frame(na.fill(df, c(NA, NA, NaN)))
growthfun <- function(x, y) if (is.nan(y)) (1+growth)*x else y
replace(DF, TRUE, lapply(DF, Reduce, f = growthfun, acc = TRUE))
The following solution based on rle works with NA in any position and does not rely on looping to fill in the missing values:
NAgrow.rle <- function(x) {
  if (is.na(x[1])) stop("Can't have NA at beginning")
  r <- rle(is.na(x))
  na.loc <- which(r$values)
  b <- rep(cumsum(r$lengths)[na.loc - 1], r$lengths[na.loc])
  x[is.na(x)] <- ave(x[b], b, FUN = function(y) y[1]*(1 + growth)^seq_along(y))
  x
}
df[,-1] <- lapply(df[,-1], NAgrow.rle)
# year price1 price2
# 1 0 1.100000 1.100000
# 2 1 2.100000 2.100000
# 3 2 3.200000 3.200000
# 4 3 4.800000 3.264000
# 5 4 4.896000 3.329280
# 6 5 4.993920 3.395866
# 7 6 5.093798 3.463783
I'll drop in two additional solutions using for loops, one in base R and one in Rcpp:
NAgrow.for <- function(x) {
  for (i in which(is.na(x))) {
    x[i] <- x[i-1] * (1 + growth)
  }
  x
}
library(Rcpp)
cppFunction(
  "NumericVector NAgrowRcpp(NumericVector x, double growth) {
     const int n = x.size();
     NumericVector y(x);
     for (int i = 1; i < n; ++i) {
       if (R_IsNA(x[i])) {
         y[i] = (1.0 + growth) * y[i-1];
       }
     }
     return y;
   }")
The solutions based on rle (crimson and josilber.rle) take about twice as long as the simple solution based on a for loop (josilber.for), and as expected the Rcpp solution is the fastest, running in about 0.002 seconds.
set.seed(144)
big.df <- data.frame(ID=1:100000,
price1=sample(c(1:10, NA), 100000, replace=TRUE),
price2=sample(c(1:10, NA), 100000, replace=TRUE))
crimson <- function(df) apply(df[, -1], 2, function(x) {
  if (sum(is.na(x)) == 0) return(x)
  ## updated with optimized portion from #josilber
  r <- rle(is.na(x))
  na.loc <- which(r$values)
  b <- rep(cumsum(r$lengths)[na.loc - 1], r$lengths[na.loc])
  lastValIs <- 1:length(x)
  lastValIs[is.na(x)] <- b
  x[is.na(x)] <-
    sapply(which(is.na(x)), function(i) {
      return(x[lastValIs[i]]*(1 + growth)^(i - lastValIs[i]))
    })
  return(x)
})
ggrothendieck <- function(df) {
  growthfun <- function(x, y) if (is.na(y)) (1+growth)*x else y
  lapply(df[, -1], Reduce, f = growthfun, acc = TRUE)
}
josilber.rle <- function(df) lapply(df[, -1], NAgrow.rle)
josilber.for <- function(df) lapply(df[, -1], NAgrow.for)
josilber.rcpp <- function(df) lapply(df[, -1], NAgrowRcpp, growth = growth)
library(microbenchmark)
microbenchmark(crimson(big.df), ggrothendieck(big.df), josilber.rle(big.df), josilber.for(big.df), josilber.rcpp(big.df))
# Unit: milliseconds
# expr min lq mean median uq max neval
# crimson(big.df) 98.447546 131.063713 161.494366 152.477661 183.175840 379.643222 100
# ggrothendieck(big.df) 437.015693 667.760401 822.530745 817.864707 925.974019 1607.352929 100
# josilber.rle(big.df) 59.678527 115.220519 132.874030 127.476340 151.665657 262.003756 100
# josilber.for(big.df) 21.076516 57.479169 73.860913 72.959536 84.846912 178.412591 100
# josilber.rcpp(big.df) 1.248793 1.894723 2.373469 2.190545 2.697246 5.646878 100
It looks like dplyr can't handle accessing newly assigned lag values. Here is a solution that should work even if the NAs are in the middle of a column.
df <- apply(
  df, 2, function(x) {
    if (sum(is.na(x)) == 0) return(x)
    ## updated with optimized portion from #josilber
    r <- rle(is.na(x))
    na.loc <- which(r$values)
    b <- rep(cumsum(r$lengths)[na.loc - 1], r$lengths[na.loc])
    lastValIs <- 1:length(x)
    lastValIs[is.na(x)] <- b
    x[is.na(x)] <-
      sapply(which(is.na(x)), function(i) {
        return(x[lastValIs[i]]*(1 + growth)^(i - lastValIs[i]))
      })
    return(x)
  })
You can try a recursive function like this:
test <- function(x, n) {
  if (!is.na(df[x, n])) return(df[x, n])
  else return(test(x - 1, n) * (1 + growth))
}
a=1:nrow(df)
lapply(a, FUN=function(i) test(i,2))
unlist(lapply(a, FUN=function(i) test(i,2)))
[1] 1.100000 2.100000 3.200000 4.800000 4.896000 4.993920 5.093798