Given a 0/1 vector and a k value
x <- c(0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1)
k <- 3
I need to define a function f so that
f(x, k)
[1] 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0
That is: if any value of x = 1 set the next k = 3 values of x to zero
I have solved the problem with this terrible solution
f <- function(x, k){
.f <- function(x, i, k){
s <- seq(i+1, i+k)
if (x[[i]] == 1) x[s] <- 0
x
}
n <- length(x)
for ( i in seq(1, n-1)) {
#for ( i in seq(1, n-k-1)) {
x <- .f(x, i , k)
}
length(x) <- n
x
}
Can anyone provide a more elegant (functional) solution ?
Thanks for any help
This takes some steps out:
foo <- function(x, k) {
n <- length(x)
for (i in seq_along(x)) {
if (x[i] == 1) x[seq(i+1L, min(i+k, n))] <- 0
}
x
}
foo(x, k)
# [1] 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0
This takes bigger steps when it encounters a 1, should be more efficient at least for big k.
foo2 <- function(x, k) {
n <- length(x)
i <- 1L
while (i < n) {
if (x[i] == 1) {
x[seq(i+1L, min(i+k, n))] <- 0
i <- i+k
} else {
i <- i+1L
}
}
x
}
A simple counter will also do the job :
fnew <- function(x, k){
counter <- 0L
for (i in 1:length(x)){
if (x[i] & !counter) {
counter <- k
next
}
if (!!counter) {
x[i] <- 0
counter <- counter - 1L
}
}
x
}
fnew(x,3)
[1] 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0
Advantage of a simple loop is that it can easily be converted to C++:
Rcpp::cppFunction('
NumericVector fcpp(NumericVector x, int k) {
int n = x.size();
int i;
int counter = 0;
NumericVector xout(n);
xout = x;
for (i = 0;i<n;i++){
if ((x[i]==1) & (counter==0)) {
counter = k;
} else
if (counter > 0) {
xout[i] = 0;
counter += -1;
}
}
return xout;
}')
I found interesting to compare performance of the different options proposed until now, which shows that a simple loop still has its word to say :
x <- rnorm(1e3)>0
microbenchmark::microbenchmark(f(x,3),gen_vec(x,3),foo(x,3),onePeriodic(x,3),fnew(x,3),fA5C1(x,3),fcpp(x,3))
Unit: microseconds
expr min lq mean median uq max neval cld
f(x, 3) 7783.375 8205.9390 9340.76417 8559.7845 8868.092 33439.139 100 e
gen_vec(x, 3) 2821.330 3086.5605 3683.71671 3176.2015 3490.867 25794.430 100 d
foo(x, 3) 1396.922 1495.1775 1689.60223 1532.5110 1640.818 5901.121 100 c
onePeriodic(x, 3) 879.178 994.0510 1090.42763 1049.4345 1103.793 2109.536 100 bc
fnew(x, 3) 413.538 452.7175 492.22530 473.6405 494.564 1142.563 100 ab
fA5C1(x, 3) 160.000 178.4620 274.42453 188.7180 213.949 7361.222 100 a
fcpp(x, 3) 6.154 16.4100 21.46069 20.5130 24.206 94.359 100 a
Waldi's fcpp function is great, but if you want to stick with base R, you can try this for loop.
f <- function(x, y) {
l <- length(x)
x <- as.integer(x)
ind <- which(as.logical(x))
for (i in seq_along(ind)) {
if (x[ind[i]] == 1L) {
x[ind[i] + seq.int(y)] <- 0L
}
}
x[1:l]
}
f(x, 3)
# [1] 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0
Here is another base R option:
onePeriodic <- function(x, k) {
w <- which(x==1L)
idx <- unlist(lapply(split(w, c(0L, cumsum(diff(w) > k+1L))), function(v) {
seq(v[1L], v[length(v)], by=k+1L)
}))
replace(integer(length(x)), idx, 1L)
}
k <- 3
(x <- c(0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1))
(o <- onePeriodic(x, k))
output:
x: [1] 0 1 1 0 0 1 1 0 1 1 1 1 1 1 1 0 0 0 0 0 1 0 0 1 1
o: [1] 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1
I don't know why you are specifically looking for a functional programming solution, but, without claiming anything regarding the quality of the solution, this does the job:
x <- c(0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1)
gen_vec <- function(x,k){
x2 <- x
purrr::walk(1:length(x2), function(n,k){
ifelse(x2[n]==1, x2[(n+1):min(n+k, length(x2))] <<- 0, x2[n])
}, k=k)
x2
}
Output
> gen_vec(x, 3)
[1] 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0
Original x is unmodified:
> x
[1] 0 1 1 0 0 1 1 0 1 1 1 1 1 1 1
Related
trying these two methods to solve a problem... but they doesn't work.
Column 3 should continue to be "1" according to the final condition after column 1 changes to 0 from 1.
Method 1:
a <- as.data.frame(c(0,0,0,1,1,1,1,1,0,0,0,0))
b <- as.data.frame(c(0,0,0,0,0,0,0,0,0,0,0,0))
df <- cbind(a,b)
df[1,3] <- 0
df[-1,3] <- ifelse(df[-1,1] == 1 & df[-1,2] == 0, 1, ifelse(df[-1,1] == 1 &
df[-1,2] == 1, 0, df[sum(!is.na(df[,3])),3]))
Method 2:
a <- as.data.frame(c(0,0,0,1,1,1,1,1,0,0,0,0))
b <- as.data.frame(c(0,0,0,0,0,0,0,0,0,0,0,0))
df <- cbind(a,b)
df[1,3] <- 0
ndates <- as.numeric(length(df[,1]))
x <- 1
while (ndates > x - 1){
df[-1,3] <- ifelse(df[-1,1] == 1 & df[-1,2] == 0, 1, ifelse(df[-1,1] == 1
& df[-1,2] == 1, 0, df[sum(!is.na(df[,3])),3]))
x <- x + 1
}
Any help would be appreciated... seems like I'm missing something that is probably quite basic.
Updated answer:
Okay with a better understanding of what you're trying to accomplish here's a for loop version that should be right. Let me know if I'm still missing what your intention is.
df[-1,3] <- ifelse(df[-1,1] == 1 & df[-1,2] == 0, 1, ifelse(df[-1,1] == 1 &
df[-1,2] == 1, 0, df[sum(!is.na(df[,3])),3]))
a <- as.data.frame(c(0,0,0,1,1,1,1,1,0,0,0,0))
b <- as.data.frame(c(0,0,0,0,0,0,0,0,0,0,0,0))
df <- cbind(a,b)
df[1,3] <- 0
for (i in 2:nrow(df)) {
df[i,3] <- ifelse(df[i,1] == 1 & df[i,2] == 0, 1,
ifelse(df[i,1] == 1 & df[i,2] == 1, 0, df[i-1,3]))
}
This code loops once through each row and updates based on the rules you gave ([1,0] = 1, [1,1] = 0, otherwise previous row). And this is the resulting output:
> df
c(0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0) c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) V3
1 0 0 0
2 0 0 0
3 0 0 0
4 1 0 1
5 1 0 1
6 1 0 1
7 1 0 1
8 1 0 1
9 0 0 1
10 0 0 1
11 0 0 1
12 0 0 1
>
Initial answer:
It might be helpful if you could clarify what you're trying to accomplish. I worked through your first method and it seems to give the expected results. This is my understanding in pseudocode:
if C1,C2 == [1,0]:
set C3 to 1
else:
if C1,C2 == [1,1]:
set C3 to 0
else:
set C3 to the val in col=3,row=number of non NA vals in col3
Since you have 1 non-NA then the final statement evaluates to cell [1,3] which is 0 so it sets all the [1,0] cases to 1 and 0 otherwise.
What exactly are you trying to accomplish with df[sum(!is.na(df[,3])),3]))? This might be a case of a logic error, but it's hard to tell without understanding what you're hoping the outcome will be.
This question already has answers here:
Create counter within consecutive runs of certain values
(6 answers)
Closed 3 years ago.
I have a logical vector like
as.logical(c(0,0,1,1,1,0,1,1,0,0,0,1,1,1,1))
but much longer. How can i transform it to:
c(0,0,1,2,3,0,1,2,0,0,0,1,2,3,4)
by counting the length of ones?
Another rle option:
r <- rle(x)
x[x] <- sequence(r$l[r$v])
#[1] 0 0 1 2 3 0 1 2 0 0 0 1 2 3 4
Or without saving r:
x[x] <- sequence(with(rle(x), lengths[values]))
with C++ through Rcpp
library(Rcpp)
cppFunction('NumericVector seqOfLogical(LogicalVector lv) {
size_t n = lv.size();
NumericVector res(n);
int foundCounter = 0;
for (size_t i = 0; i < n; i++) {
if (lv[i] == 1) {
foundCounter++;
} else {
foundCounter = 0;
}
res[i] = foundCounter;
}
return res;
}')
seqOfLogical(x)
# [1] 0 0 1 2 3 0 1 2 0 0 0 1 2 3 4
Benchmarks
library(microbenchmark)
set.seed(1)
x <- sample(c(T,F), size = 1e6, replace = T)
microbenchmark(
symbolix = { symbolix(x) },
thelatemail1 = { thelatemail1(x) },
thelatemail2 = { thelatemail2(x) },
wen = { wen(x) },
maurits = { maurits(x) },
#mhammer = { mhammer(x) }, ## this errors
times = 5
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# symbolix 2.760152 4.579596 34.60909 4.833333 22.31126 138.5611 5
# thelatemail1 154.050925 189.784368 235.16431 235.982093 262.33704 333.6671 5
# thelatemail2 138.876834 146.197278 158.66718 148.547708 179.80223 179.9119 5
# wen 780.432786 898.505231 1091.39099 1093.702177 1279.33318 1404.9816 5
# maurits 1002.267323 1043.590621 1136.35624 1086.967756 1271.38803 1277.5675 5
functions
symbolix <- function(x) {
seqOfLogical(x)
}
thelatemail1 <- function(x) {
r <- rle(x)
x[x] <- sequence(r$l[r$v])
return(x)
}
thelatemail2 <- function(x) {
x[x] <- sequence(with(rle(x), lengths[values]))
return(x)
}
maurits <- function(x) {
unlist(Map(function(l, v) if (!isTRUE(v)) rep(0, l) else 1:l, rle(x)$lengths, rle(x)$values))
}
wen <- function(A) {
B=data.table::rleid(A)
B=ave(B,B,FUN = seq_along)
B[!A]=0
B
}
mhammer <- function(x) {
x_counts <- x
for(i in seq_along(x)) {
if(x[i] == 1) { x_counts[i] <- x_counts[i] + x_counts[i-1] }
}
return(x_counts)
}
You can using rleid in data.table
A=as.logical(c(0,0,1,1,1,0,1,1,0,0,0,1,1,1,1))
B=data.table::rleid(A)
B=ave(B,B,FUN = seq_along)
B[!A]=0
B
[1] 0 0 1 2 3 0 1 2 0 0 0 1 2 3 4
x <- c(0,0,1,1,1,0,1,1,0,0,0,1,1,1,1)
x_counts <- x
for(i in seq_along(x)) {
if(x[i] == 1) { x_counts[i] <- x_counts[i] + x_counts[i-1] }
}
x_counts
Here is a solution using base R's rle with Map
x <- as.logical(c(0,0,1,1,1,0,1,1,0,0,0,1,1,1,1))
unlist(Map(function(l, v) if (!isTRUE(v)) rep(0, l) else 1:l, rle(x)$lengths, rle(x)$values))
# [1] 0 0 1 2 3 0 1 2 0 0 0 1 2 3 4
or using purrr::pmap
library(purrr);
unlist(pmap(unclass(rle(x)),
function(lengths, values) if (!isTRUE(values)) rep(0, lengths) else 1:lengths))
#[1] 0 0 1 2 3 0 1 2 0 0 0 1 2 3 4
slightly different from Wen's, I came up with:
library(data.table)
ave(v,rleid(v),FUN=function(x) x *seq_along(x))
# [1] 0 0 1 2 3 0 1 2 0 0 0 1 2 3 4
I recommend runner package and function streak_run which calculates consecutive occurences. Possible also calculating on sliding windows (eg. last 5 observations), more in github documentation
x <- as.logical(c(0,0,1,1,1,0,1,1,0,0,0,1,1,1,1))
streak <- streak_run(x)
streak[x == 0] <- 0
print(streak)
# [1] 0 0 1 2 3 0 1 2 0 0 0 1 2 3 4
I have a vector of numbers:
x <- c(0, 0, 0, 30, 60, 0, 0, 0, 0, 0, 10, 0, 0, 15, 45, 0, 0)
For each element i in x, I would like to do the following
If x[i] > 0, return 0
If all 4 elements before x[i] are 0, return NA
If the 4 elements before x[i] are not 0, count how many elements are between the last not-0 element and x[i]
I expect this output:
#> x
#[1] 0 0 0 30 60 0 0 0 0 0 10 0 0 15 45 0 0
#> x_out
#[1] NA NA NA 0 0 1 2 3 4 NA 0 1 2 0 0 1 2
Notice that the solution should also work when there are less than 4 elements available at the beginning of the vector (i.e. condition 2 and 3 should use as many elements as are available). Does anybody have a solution for this? A vectorised approach is preferred because the vectors are long and the dataset is fairly big.
Here is a simple Rcpp solution. Create a new C++ file in RStudio and paste the code into it and source the file. Obviously, you'll need to have installed Rtools if you use Windows.
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
IntegerVector funRcpp(const IntegerVector x) {
const double n = x.length();
int counter = 4;
IntegerVector y(n);
for (double i = 0; i < n; ++i) {
if (x(i) > 0) {
y(i) = 0;
counter = 0;
}
else {
if (counter > 3) {
y(i) = NA_INTEGER;
} else {
counter++;
y(i) = counter;
}
}
}
return y;
}
/*** R
x <- c(0, 0, 0, 30, 60, 0, 0, 0, 0, 0, 10, 0, 0, 15, 45, 0, 0)
funRcpp(x)
*/
This returns the desired result:
> funRcpp(x)
[1] NA NA NA 0 0 1 2 3 4 NA 0 1 2 0 0 1 2
This is my current approach:
library(dplyr)
last_x_months <- 4
my_list <- vector("list", 1 + last_x_months)
my_list[[1]] <- x
# create lagged variants of vector
for (j in seq_along(1:last_x_months)) {
my_list[[1 + j]] <- lag(my_list[[1]], n = j, default = NA)
}
# row bind it to a data.frame
i_dat <- do.call(rbind, my_list) %>%
as.data.frame()
# apply function to each column in dataframe
sapply(i_dat, function(x) {
if (sum(x, na.rm = TRUE) == 0) {
NA
} else if (x[1] > 0) {
0
} else {
rle(x)$lengths[1]
}
})
This is the output I get:
#> output
#[1] NA NA NA 0 0 1 2 3 4 NA 0 1 2 0 0 1 2
Is this good practice or could I improve performance with a shortcut? I am pretty inexperienced when it comes to performance optimisation, that's why I posed the question.
I have two vectors
x <- c(2, 3, 4)
y <- rep(0, 5)
I want to get the following output:
> z
2, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0
How can I create z? I have tried to use paste and c but nothing seems to work. The only thing I can think of is using a for() and it is terribly slow. I have googled this and I am sure the solution is out there and I am just not hitting the right keywords.
UPDATE:
For benchmarking purposes:
Using Nicola's solution:
> system.time(
+ precipitation <- `[<-`(numeric(length(x)*(length(y)+1)),seq(1,by=length(y)+1,length.out=length(x)),x)
+ )
user system elapsed
0.419 0.407 0.827
This is ridiculously fast! I must say! Can someone please explain this to me? My for() which I know is always wrong in R would have taken at least a day if it even finished.
The other suggestions:
> length(prate)
[1] 4914594
> length(empty)
[1] 207
> system.time(
+ precipitation <- unlist(sapply(prate, FUN = function(prate) c(prate,empty), simplify=FALSE))
+ )
user system elapsed
16.470 3.859 28.904
I had to kill
len <- length(prate)
precip2 <- c(rbind(prate, matrix(rep(empty, len), ncol = len)))
After 15 minutes.
you can try this
unlist(sapply(x, FUN = function(x) c(x,y), simplify=FALSE))
[1] 2 0 0 0 0 0 3 0 0 0 0 0 4 0 0 0 0 0
or simpler from #docendodiscimus
unlist(lapply(x, FUN = function(x) c(x,y)))
This seems faster for some reason:
unlist(t(matrix(c(as.list(x),rep(list(y),length(x))),ncol=2)))
The above solution is general, in the sense that both x and y can have any value. In the OP case, where y is made just of 0, this is fast as it can be:
`[<-`(numeric(length(x)*(length(y)+1)),seq(1,by=length(y)+1,length.out=length(x)),x)
#[1] 2 0 0 0 0 0 3 0 0 0 0 0 4 0 0 0 0 0
Edit
I realise I've been very cryptic and the code I produced is not easy to follow, despite being just one line. I'm gonna explain in detail what the second solution does.
First of all, you notice that the resulting vector will have the values containd in x plus the zeroes in y repeated length(x) times. So in total, it will be length(x) + length(x)*length(y) or length(x)*(length(y)+1) long. So we create a vector with just zeroes as long as needed:
res<-numeric(length(x)*(length(y)+1))
Now we have to place the x values in res. We notice that the first value of x occupies the first value in res; the second will be after length(y)+1 from the first and so on, until all the length(x) values are filled. We can create a vector of indices in which to put the x values:
indices<-seq.int(1,by=length(y)+1,length.out=length(x))
And then we make the replacement:
res[indices]<-x
My line was just a shortcut for the three lines above. Hope this clarifies a little.
You could also try to vectorize as follows
len <- length(x)
c(rbind(x, matrix(rep(y, len), ncol = len)))
## [1] 2 0 0 0 0 0 3 0 0 0 0 0 4 0 0 0 0 0
A more compact, but potentially slower option (contributed by #akrun) would be
c(rbind(x, replicate(len, y)))
## [1] 2 0 0 0 0 0 3 0 0 0 0 0 4 0 0 0 0 0
You can try:
c(sapply(x, 'c', y))
#[1] 2 0 0 0 0 0 3 0 0 0 0 0 4 0 0 0 0
Or a crazy solution with gusb and paste..
library(functional)
p = Curry(paste0, collapse='')
as.numeric(strsplit(p(gsub('(.*)$', paste0('\\1',p(y)),x)),'')[[1]])
#[1] 2 0 0 0 0 0 3 0 0 0 0 0 4 0 0 0 0 0
Here's another way:
options(scipen=100)
as.numeric(unlist(strsplit(as.character(x * 10^5), "")))
And some benchmarks:
microbenchmark({as.numeric(unlist(strsplit(as.character(x*10^5), "")))}, {unlist(t(matrix(c(as.list(x),rep(list(y),length(x))),ncol=2)))}, {unlist(sapply(x, FUN = function(x) c(x,y), simplify=FALSE))}, times=100000)
Unit: microseconds
expr
{ as.numeric(unlist(strsplit(as.character(x * 10^5), ""))) }
{ unlist(t(matrix(c(as.list(x), rep(list(y), length(x))), ncol = 2))) }
{ unlist(sapply(x, FUN = function(x) c(x, y), simplify = FALSE)) }
min lq mean median uq max neval
9.286 10.644 12.15242 11.678 12.286 1650.133 100000
9.485 11.164 13.25424 12.288 13.067 1887.761 100000
5.607 7.429 9.21015 8.147 8.784 30457.994 100000
And here's another idea (but it seems slow):
r = rle(1)
r$lengths = rep(c(1,5), length(x))
r$values = as.vector(rbind(x, 0))
inverse.rle(r)
I have a sequence of 0s and 1s in this manner:
xx <- c(0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0,
0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1)
And I want to select the 0s and the first 1s.
The results should be:
ans <- c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1)
What's the fastest way? in R
Use rle() to extract the run lengths and values, do some minor surgery, and then put the run-length encoded vector "back together" using inverse.rle().
rr <- rle(xx)
rr$lengths[rr$values==1] <- 1
inverse.rle(rr)
# [1] 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1
Here's one way:
idx <- which(xx == 1)
pos <- which(diff(c(xx[1], idx)) == 1)
xx[-idx[pos]] # following Frank's suggestion
# [1] 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1
Without rle:
xx[head(c(TRUE, (xx != 1)), -1) | (xx != 1)]
#[1] 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1
Since OP mentioned speed, here's a benchmark:
josh = function(xx) {
rr <- rle(xx)
rr$lengths[rr$values==1] <- 1
inverse.rle(rr)
}
arun = function(xx) {
idx <- which(xx == 1)
pos <- which(diff(c(xx[1], idx)) == 1)
xx[setdiff(seq_along(xx), idx[pos])]
}
eddi = function(xx) {
xx[head(c(TRUE, (xx != 1)), -1) | (xx != 1)]
}
simon = function(xx) {
# The body of the function is supplied in #SimonO101's answer
first1(xx)
}
set.seed(1)
N = 1e6
xx = sample(c(0,1), N, T)
library(microbenchmark)
bm <- microbenchmark(josh(xx), arun(xx), eddi(xx), simon(xx) , times = 25)
print( bm , digits = 2 , order = "median" )
#Unit: milliseconds
# expr min lq median uq max neval
# simon(xx) 20 21 23 26 72 25
# eddi(xx) 97 102 104 118 149 25
# arun(xx) 205 245 253 258 332 25
# josh(xx) 228 268 275 287 365 25
Here's a quick Rcpp solution. Should be fastish (but I've no idea how it will stack up against the others here)...
Rcpp::cppFunction( 'std::vector<int> first1( IntegerVector x ){
std::vector<int> out;
for( IntegerVector::iterator it = x.begin(); it != x.end(); ++it ){
if( *it == 1 && *(it-1) != 1 || *it == 0 )
out.push_back(*it);
}
return out;
}')
first1(xx)
# [1] 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1
Even tho' I'm a staunch supporter of rle , since it's Friday here's an alternative method. I did it for fun, so YMMV.
yy<-paste(xx,collapse='')
zz<-gsub('[1]{1,}','1',yy) #I probably screwed up the regex here
aa<- as.numeric(strsplit(zz,'')[[1]])