I have a logical vector like
as.logical(c(0,0,1,1,1,0,1,1,0,0,0,1,1,1,1))
but much longer. How can I transform it to:
c(0,0,1,2,3,0,1,2,0,0,0,1,2,3,4)
by counting the lengths of the runs of ones?
Another rle option:
r <- rle(x)
x[x] <- sequence(r$l[r$v])
#[1] 0 0 1 2 3 0 1 2 0 0 0 1 2 3 4
Or without saving r:
x[x] <- sequence(with(rle(x), lengths[values]))
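Both snippets assume the question's vector has already been assigned to x, e.g.:
x <- as.logical(c(0,0,1,1,1,0,1,1,0,0,0,1,1,1,1))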
with C++ through Rcpp
library(Rcpp)
cppFunction('NumericVector seqOfLogical(LogicalVector lv) {
size_t n = lv.size();
NumericVector res(n);
int foundCounter = 0;
for (size_t i = 0; i < n; i++) {
if (lv[i] == 1) {
foundCounter++;
} else {
foundCounter = 0;
}
res[i] = foundCounter;
}
return res;
}')
seqOfLogical(x)
# [1] 0 0 1 2 3 0 1 2 0 0 0 1 2 3 4
Benchmarks
library(microbenchmark)
set.seed(1)
x <- sample(c(T,F), size = 1e6, replace = T)
microbenchmark(
symbolix = { symbolix(x) },
thelatemail1 = { thelatemail1(x) },
thelatemail2 = { thelatemail2(x) },
wen = { wen(x) },
maurits = { maurits(x) },
#mhammer = { mhammer(x) }, ## this errors
times = 5
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# symbolix 2.760152 4.579596 34.60909 4.833333 22.31126 138.5611 5
# thelatemail1 154.050925 189.784368 235.16431 235.982093 262.33704 333.6671 5
# thelatemail2 138.876834 146.197278 158.66718 148.547708 179.80223 179.9119 5
# wen 780.432786 898.505231 1091.39099 1093.702177 1279.33318 1404.9816 5
# maurits 1002.267323 1043.590621 1136.35624 1086.967756 1271.38803 1277.5675 5
Functions
symbolix <- function(x) {
seqOfLogical(x)
}
thelatemail1 <- function(x) {
r <- rle(x)
x[x] <- sequence(r$l[r$v])
return(x)
}
thelatemail2 <- function(x) {
x[x] <- sequence(with(rle(x), lengths[values]))
return(x)
}
maurits <- function(x) {
unlist(Map(function(l, v) if (!isTRUE(v)) rep(0, l) else 1:l, rle(x)$lengths, rle(x)$values))
}
wen <- function(A) {
B=data.table::rleid(A)
B=ave(B,B,FUN = seq_along)
B[!A]=0
B
}
mhammer <- function(x) {
x_counts <- x
for(i in seq_along(x)) {
if(x[i] == 1) { x_counts[i] <- x_counts[i] + x_counts[i-1] }
}
return(x_counts)
}
You can use rleid from data.table
A=as.logical(c(0,0,1,1,1,0,1,1,0,0,0,1,1,1,1))
B=data.table::rleid(A)
B=ave(B,B,FUN = seq_along)
B[!A]=0
B
[1] 0 0 1 2 3 0 1 2 0 0 0 1 2 3 4
x <- c(0,0,1,1,1,0,1,1,0,0,0,1,1,1,1)
x_counts <- x
for(i in seq_along(x)) {
if(x[i] == 1) { x_counts[i] <- x_counts[i] + x_counts[i-1] }
}
x_counts
Here is a solution using base R's rle with Map
x <- as.logical(c(0,0,1,1,1,0,1,1,0,0,0,1,1,1,1))
unlist(Map(function(l, v) if (!isTRUE(v)) rep(0, l) else 1:l, rle(x)$lengths, rle(x)$values))
# [1] 0 0 1 2 3 0 1 2 0 0 0 1 2 3 4
or using purrr::pmap
library(purrr);
unlist(pmap(unclass(rle(x)),
function(lengths, values) if (!isTRUE(values)) rep(0, lengths) else 1:lengths))
#[1] 0 0 1 2 3 0 1 2 0 0 0 1 2 3 4
Slightly different from Wen's, I came up with the following (where v is the question's logical vector):
library(data.table)
ave(v,rleid(v),FUN=function(x) x *seq_along(x))
# [1] 0 0 1 2 3 0 1 2 0 0 0 1 2 3 4
I recommend the runner package and its function streak_run, which calculates consecutive occurrences. It can also calculate over sliding windows (e.g. the last 5 observations); see the GitHub documentation for more.
library(runner)
x <- as.logical(c(0,0,1,1,1,0,1,1,0,0,0,1,1,1,1))
streak <- streak_run(x)
streak[x == 0] <- 0
print(streak)
# [1] 0 0 1 2 3 0 1 2 0 0 0 1 2 3 4
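For reference, streak_run on its own counts streaks of both values, so the raw output before zeroing looks roughly like this (a sketch, assuming streak_run behaves as its documentation describes):
streak_run(x)
# [1] 1 2 1 2 3 1 1 2 1 2 3 1 2 3 4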
Related
I have data about machines in the following form
Number of rows - 900k
Data
A B C D E F G H I J K L M N
---- -- --- ---- --- --- --- --- --- --- --- --- --- ---
1 1 1 1 1 1 1 1 1 1 0 1 1 0 0
2 0 0 0 0 1 1 1 0 1 1 0 0 1 0
3 0 0 0 0 0 0 0 1 1 1 1 1 0 0
1 indicates that the machine was active and 0 indicates that it was inactive.
I want my output to look like
A B C D E F G H I J K L M N
---- -- --- ---- --- --- --- --- --- --- --- --- --- ---
1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
2 0 0 0 0 1 1 1 1 1 1 0 0 1 0
3 0 0 0 0 0 0 0 1 1 1 1 1 0 0
Basically, all I am trying to do is look for zeros in a particular row and, if that zero has ones on both sides, replace the 0 with 1.
For example, row 1 has a zero in column J, but columns I and K both contain 1, so that 0 is replaced by 1 because it is surrounded by 1s.
The code I am using is this:
for(i in 1:nrow(data)) {
  for(j in 2:13) {
    if(data[i,j]==0 && data[i,j-1]==1 && data[i,j+1]==1){
      data[i,j] = 1
    }
  }
}
Is there a way to reduce the computation time for this? It takes almost 30 minutes to run in R. Any help would be appreciated.
This is faster because it does not require iterating over the rows.
for(j in 2:13) {
data[,j] = ifelse(data[,j-1] * data[,j+1]==1,1,data[,j])
}
Or, a little more optimized, without using ifelse:
for(j in 2:(ncol(data) - 1)) {
data[data[, j - 1] * data[, j + 1] == 1, j] <- 1
}
You could also use gsub to replace any instances of 101 with 111 using the following code (where df1 is the input data frame and numLetters its number of columns):
collapsed <- gsub('101', '111', apply(df1, 1, paste, collapse = ''))
data <- as_tibble(t(matrix(unlist(sapply(collapsed, strsplit, split = '')), nrow = numLetters)))
names(data) <- LETTERS[1:numLetters]
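For instance, on the question's first row the collapse-and-replace step works like this (a minimal sketch; the string is row 1 of the question's data pasted together):
gsub('101', '111', '11111111101100')
# [1] "11111111111100"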
Here's a comparison of everyone's solutions:
library(data.table)
library(rbenchmark)
library(tidyverse)
set.seed(1)
numLetters <- 13
df <- as_tibble(matrix(round(runif(numLetters * 100)), ncol = numLetters))
names(df) <- LETTERS[1:numLetters]
benchmark(
'gsub' = {
data <- df
collapsed <- gsub('101', '111', apply(data, 1, paste, collapse = ''))
data <- as_tibble(t(matrix(unlist(sapply(collapsed, strsplit, split = '')), nrow = numLetters)))
names(data) <- LETTERS[1:numLetters]
},
'for_orig' = {
data <- df
for(i in 1:nrow(data)) {
for(j in 2:(ncol(data) - 1)) {
if(data[i, j] == 0 && data[i, j - 1] == 1 && data[i, j + 1] == 1) {
data[i, j] = 1
}
}
}
},
'for_norows' = {
data <- df
for(j in 2:(ncol(data) - 1)) {
data[, j] = ifelse(data[, j - 1] * data[, j + 1] == 1, 1, data[, j])
}
},
'vectorize' = {
data <- df
for(i in seq(ncol(data) - 2) + 1) {
condition <- data[, i - 1] == data[, i + 1] & data[, i - 1] == 1 & data[, i] == 0
data[which(condition), i] <- 1
}
},
'index' = {
data <- df
idx <- apply(data, 1, function(x) c(0, diff(x)))
data[which(idx == -1 & lead(idx == 1), arr.ind = TRUE)[, 2:1]] <- 1
},
replications = 100
)
The indexing solution (which has since been deleted) wins hands-down in terms of computational time for a 13-by-100 data frame.
        test replications elapsed relative user.self sys.self user.child sys.child
3 for_norows          100    1.19    7.438      1.19        0         NA        NA
2   for_orig          100    9.29   58.063      9.27        0         NA        NA
1       gsub          100    0.28    1.750      0.28        0         NA        NA
5      index          100    0.16    1.000      0.16        0         NA        NA
4  vectorize          100    0.87    5.438      0.87        0         NA        NA
Cut the time by using vectorized operations. Since you are doing the same thing for every row, this can be done with vectorized conditional statements.
for(i in seq(ncol(data) - 2) + 1){ #<== all but last and first column
#Find all neighbouring columns that are equal, where the center column is equal to 0
condition <- data[, i - 1] == data[, i + 1] & data[, i - 1] == 1 & data[, i] == 0
#Overwrite only the values that holds the condition
data[which(condition), i] <- 1
}
You can avoid loops altogether and use indexing to replace all the values at once:
nc <- ncol(df)
df[, 2:(nc - 1)][df[, 1:(nc - 2)] * df[, 3:nc] == 1] <- 1
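As a quick check on the question's first two rows, using a plain matrix transcribed from the question (a sketch):
m <- rbind(c(1,1,1,1,1,1,1,1,1,0,1,1,0,0),
           c(0,0,0,0,1,1,1,0,1,1,0,0,1,0))
nc <- ncol(m)
m[, 2:(nc - 1)][m[, 1:(nc - 2)] * m[, 3:nc] == 1] <- 1
m[1, ]
# [1] 1 1 1 1 1 1 1 1 1 1 1 1 0 0
m[2, ]
# [1] 0 0 0 0 1 1 1 1 1 1 0 0 1 0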
I don't know what it is called, nested arithmetic progression maybe?
If n is an integer, say n = 50, what I would like is
(1,2,3...n,2,3...n,3,4..n...n-1,n)
it is like concatenation of
1:n, 2:n, ...,n-1:n
Is there an easy way of doing this?
Thanks!
The subject says the last subsequence is n but the body of the question says it is (n-1):n. I have assumed (n-1):n but to get the other just change each n-1 in the code to n and each 2 in the code to 1.
1) lapply Assuming we want 1:n, 2:n, ..., (n-1):n, iterate over the starting value of each subsequence like this:
n <- 4
unlist(lapply(seq_len(n-1), seq, n))
## [1] 1 2 3 4 2 3 4 3 4
2) sequence Another approach is to transform sequence(seq(n, 2)) like this:
s <- sequence(seq(n, 2))
s + cumsum(s == 1) - 1
## [1] 1 2 3 4 2 3 4 3 4
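The intermediate steps for n = 4 show why this works: s restarts at 1 for each subsequence, and cumsum(s == 1) - 1 supplies the offset to add back.
sequence(seq(4, 2))
## [1] 1 2 3 4 1 2 3 1 2
cumsum(sequence(seq(4, 2)) == 1) - 1
## [1] 0 0 0 0 1 1 1 2 2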
3) outer
m <- outer(seq_len(n), seq_len(n-1), ">=") * seq(n)
m[m > 0]
## [1] 1 2 3 4 2 3 4 3 4
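To see why this works: outer(seq_len(n), seq_len(n-1), ">=") keeps entry i of column j only when i >= j, and multiplying by seq(n) (recycled down each column) fills in the value i; m[m > 0] then reads the surviving entries out column by column. For n = 4 the intermediate matrix is:
m
##      [,1] [,2] [,3]
## [1,]    1    0    0
## [2,]    2    2    0
## [3,]    3    3    3
## [4,]    4    4    4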
3a) This variation of (3) also works:
m <- outer(seq_len(n), seq_len(n-1), "+") - 1
m[m <= n]
## [1] 1 2 3 4 2 3 4 3 4
4) Reduce
f <- function(x, y) c(x, seq(y, n))
Reduce(f, 1:(n-1), c())
## [1] 1 2 3 4 2 3 4 3 4
5) Recursion
Recurse <- function(v) {
if (length(v) > 2) c(v, Recall(tail(v, -1))) else v
}
Recurse(1:n)
## [1] 1 2 3 4 2 3 4 3 4
Using Rcpp
library(Rcpp)
cppFunction('Rcpp::NumericVector mySeq( int n ) {
Rcpp::IntegerVector vec = seq(0, n);
int total_n = sum( vec );
Rcpp::NumericVector out(total_n);
size_t i, j;
int idx = 0;
int x = 1;
for( i = 0; i < n; i++ ) {
x = i + 1;
for( j = i; j < n; j++) {
out[idx] = x;
x++;
idx++;
}
}
return out;
}')
mySeq(5)
# [1] 1 2 3 4 5 2 3 4 5 3 4 5 4 5 5
mySeq(10)
# [1] 1 2 3 4 5 6 7 8 9 10 2 3 4 5 6 7 8 9 10 3 4 5 6 7 8 9 10 4 5 6 7 8 9 10 5 6 7 8
# [39] 9 10 6 7 8 9 10 7 8 9 10 8 9 10 9 10 10
And as ever with these multi-option answers, here's a benchmark
library(microbenchmark)
n <- 10000
microbenchmark(
rcpp = { mySeq(n) },
lapply = { lapn(n) },
sequence = { seqn(n) },
outer = { outn(n) },
outer2 = { outn2(n) },
# reduce = { reducen(n) }, ## takes too long
# recurse = { recursen(n) }, ## takes too long
times = 10
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# rcpp 213.9762 220.3786 245.6753 230.6847 262.8544 326.5764 10
# lapply 250.5695 260.5681 288.2523 278.9582 302.9768 367.5507 10
# sequence 1356.2691 1430.5877 1472.6946 1455.7467 1485.3578 1753.4076 10
# outer 2381.8864 2459.8159 2497.1630 2478.9865 2526.9577 2662.0489 10
# outer2 2509.8079 2531.1497 2651.6906 2636.3873 2785.3693 2820.2356 10
Functions
lapn <- function(n) { unlist(lapply(seq_len(n-1), seq, n)) }
seqn <- function(n) {
  s <- sequence(seq(n, 2))
  s <- s + cumsum(s == 1) - 1
  return(s)
}
outn <- function(n) {
m <- outer(seq_len(n), seq_len(n-1), ">=") * seq(n)
m[m > 0]
}
outn2 <- function(n) {
m <- outer(seq_len(n), seq_len(n-1), "+") - 1
m[m <= n]
}
reducen <- function(n) {
f <- function(x, y) c(x, seq(y, n))
Reduce(f, 1:(n-1), c())
}
recursen <- function(n) {
Recurse <- function(v) {
if (length(v) > 2) c(v, Recall(tail(v, -1))) else v
}
Recurse(1:n)
}
Another option is to build the sequence from a matrix using lower.tri. For example,
> n <- 4
> X <- matrix(1:n, nrow = n, ncol = n)
> X
[,1] [,2] [,3] [,4]
[1,] 1 1 1 1
[2,] 2 2 2 2
[3,] 3 3 3 3
[4,] 4 4 4 4
> lower.tri(X, diag = TRUE)
[,1] [,2] [,3] [,4]
[1,] TRUE FALSE FALSE FALSE
[2,] TRUE TRUE FALSE FALSE
[3,] TRUE TRUE TRUE FALSE
[4,] TRUE TRUE TRUE TRUE
> x <- X[lower.tri(X, diag = TRUE)]
> x
[1] 1 2 3 4 2 3 4 3 4 4
> x[-length(x)]
[1] 1 2 3 4 2 3 4 3 4
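Wrapped up as a small function so it can be reused for arbitrary n (a sketch; the helper name lowtrin is made up for illustration):
lowtrin <- function(n) {
  X <- matrix(seq_len(n), nrow = n, ncol = n)
  x <- X[lower.tri(X, diag = TRUE)]
  x[-length(x)]  # drop the trailing n so the last subsequence is (n-1):n
}
lowtrin(4)
# [1] 1 2 3 4 2 3 4 3 4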
I am trying to do the below calculation using R. My function is recursive and it uses a double for loop to calculate the values of the "result" matrix. Is there a method to replace the for loops or to evaluate the if condition faster?
x<-rnorm(2400,0, 3)
y<-rnorm(400,0,3)
no_row<-length(x)
no_col<-length(y)
input<-matrix(data=1,nrow = no_row, ncol = no_col)
result<-matrix(nrow = no_row, ncol = no_col)
calculation<-function(x,y)
{
for(i in 1:no_row)
{
for(j in 1:no_col)
{
z<-exp(x[i]-y[j])
result[i,j]<-(z/1+z)
}
}
new_x<-x-1
new_y<-y-1
residual<-input-result
sq_sum_residulas<-sum((rowSums(residual, na.rm = T))^2)
if(sq_sum_residulas>=1){calculation(new_x,new_y)}
else(return(residual))
}
output<-calculation(x,y)
To complete Benjamin's answer: you shouldn't use a recursive function. You should instead use a while loop with a max_iter parameter.
Reusing Benjamin's function:
calculation2 <- function(x, y){
result <- outer(x, y, function(x, y) { z <- exp(x - y); z / 1 + z})
result
}
calculation <- function(x, y, max_iter = 10){
input <- matrix(data=1,nrow = length(x), ncol = length(y))
sq_sum_residulas <- 1 # Initialize it to enter while loop
new_x <- x # Computation x: it will be updated at each loop
new_y <- y # Computation y
n_iter <- 1 # Counter of iteration
while(sq_sum_residulas >= 1 & n_iter < max_iter){
result <- calculation2(new_x, new_y)
new_x <- x - 1
new_y <- y - 1
residual <- input - result
sq_sum_residulas <- sum((rowSums(residual, na.rm = T))^2)
n_iter <- n_iter + 1
}
if (n_iter == max_iter){
stop("Didn't converge")
}
return(residual)
}
If you try to run this code, you will see that it doesn't converge. I guess there is a mistake in your computation, especially in z/1 + z (did you mean z/(1 + z)?).
The outer function is the tool you are looking for.
Compare these two functions that only generate the result matrix
x<-rnorm(100,0, 3)
y<-rnorm(100,0,3)
calculation<-function(x,y)
{
result <- matrix(nrow = length(x), ncol = length(y))
for(i in seq_along(x))
{
for(j in seq_along(y))
{
z<-exp(x[i]-y[j])
result[i,j]<-(z/1+z)
}
}
result
}
calculation2 <- function(x, y){
result <- outer(x, y, function(x, y) { z <- exp(x - y); z / 1 + z})
result
}
library(microbenchmark)
microbenchmark(
calculation(x, y),
calculation2(x, y)
)
Unit: microseconds
expr min lq mean median uq max neval
calculation(x, y) 1862.40 1868.119 1941.5523 1871.490 1876.1825 8375.666 100
calculation2(x, y) 466.26 469.192 515.3696 471.392 480.9225 4481.371 100
That discrepancy in time seems to grow as the length of the vectors increases.
Note, this will fix the speed of your double for loop, but there seem to be other issues in your function. It isn't clear to me what you are trying to do, or why you are calling calculation from within itself. As you have it written, there are no changes to x and y before it calls itself again, so it would be stuck in a loop forever, if it worked at all (it doesn't on my machine).
#Benjamin #Emmanuel-Lin Thanks for the solutions :) I was able to solve the issue with your inputs. Please find the sample data set and code below. The solution converges when sq_sum_residual becomes less than 0.01. This is more than 12x faster than my code with double for loops. Sorry for the confusion created by the sample data and the new_x, new_y calculation provided in the question.
Input is a dichotomous 9x10 matrix
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
1 NA 1 1 1 1 1 1 1 0 1
2 1 1 1 1 1 1 1 0 1 0
3 1 1 1 1 1 1 0 1 0 0
4 1 1 1 1 1 1 0 1 0 0
5 1 1 1 1 1 1 0 1 0 0
6 1 1 1 1 1 0 1 0 0 0
7 1 1 1 1 0 1 0 0 0 0
8 1 0 1 0 1 0 0 0 0 0
9 0 1 0 1 0 0 0 0 0 0
x<-c( 2.0794415,1.3862944,0.8472979, 0.8472979, 0.8472979,0.4054651,0.0000000, -0.8472979, -1.3862944)
y<-c(-1.4404130, -1.5739444, -1.5739444, -1.5739444, -0.7472659, -0.1876501, 1.1986443 , 0.7286407,2.5849387,2.5849387 )
result<-matrix(nrow = length(x), ncol = length(y))
calculation<-function(x,y)
{
result<-outer(x,y,function(x,y){ z<-exp(x-y);z/(1+z)})
result[!is.finite(result)]<-NA
variance_result<-result*(1-result)
row_var<- (-1)*rowSums(variance_result,na.rm=T)
col_var<- (-1)*colSums(variance_result,na.rm=T)
residual<-input-result
row_residual<-rowSums(residual,na.rm=T)#(not to be multiplied by -1)
col_residual<-(-1)*colSums(residual,na.rm=T)
new_x<-x-(row_residual/row_var)
new_x[!is.finite(new_x)]<-NA
new_x<-as.array(new_x)
new_y<-y-(col_residual/col_var)
new_y[!is.finite(new_y)]<-NA
avg_new_y<-mean(new_y, na.rm = T)
new_y<-new_y-avg_new_y
new_y<-as.array(new_y)
sq_sum_residual<-round(sum(row_residual^2),5)
if(sq_sum_residual>=.01)
{calculation(new_x,new_y)}
else(return(residual))
}
calculation(x,y)
Let's say I have something like this:
set.seed(0)
the.df <- data.frame( x=rep(letters[1:3], each=4),
n=rep(0:3, 3),
val=round(runif(12)))
the.df
x n val
1 a 0 1
2 a 1 0
3 a 2 0
4 a 3 1
5 b 0 1
6 b 1 0
7 b 2 1
8 b 3 1
9 c 0 1
10 c 1 1
11 c 2 0
12 c 3 0
Within each x, starting from n==2 (going from small to large), I want to set val to 0 if the previous val (in terms of n) is 0; otherwise, leave it as is.
For example, in the subset x=="b", I first ignore the two rows where n < 2. Now, in Row 7, because the previous val is 0 (the.df$val[the.df$x=="b" & the.df$n==1]), I set val to 0 (the.df$val[the.df$x=="b" & the.df$n==2] <- 0). Then on Row 8, now that val for the previous n is 0 (we just set it), I also want to set val here to 0 (the.df$val[the.df$x=="b" & the.df$n==3] <- 0).
Imagine that the data.frame is not sorted. Therefore procedures that depend on the order would require a sort. I also can't assume that adjacent rows exist (e.g., the row the.df[the.df$x=="a" & the.df$n==1, ] might be missing).
The trickiest part seems to be evaluating val in sequence. I can do this using a loop but I imagine that it would be inefficient (I have millions of rows). Is there a way I can do this more efficiently?
EDIT: wanted output
the.df
x n val wanted
1 a 0 1 1
2 a 1 0 0
3 a 2 0 0
4 a 3 1 0
5 b 0 1 1
6 b 1 0 0
7 b 2 1 0
8 b 3 1 0
9 c 0 1 1
10 c 1 1 1
11 c 2 0 0
12 c 3 0 0
Also, I don't mind making new columns (e.g., putting the wanted values there).
Using data.table I would try the following
library(data.table)
setDT(the.df)[order(n),
val := if(length(indx <- which(val[2:.N] == 0L)))
c(val[1:(indx[1L] + 1L)], rep(0L, .N - (indx[1L] + 1L))),
by = x]
the.df
# x n val
# 1: a 0 1
# 2: a 1 0
# 3: a 2 0
# 4: a 3 0
# 5: b 0 1
# 6: b 1 0
# 7: b 2 0
# 8: b 3 0
# 9: c 0 1
# 10: c 1 1
# 11: c 2 0
# 12: c 3 0
This will simultaneously order the data by n (as you said it's not ordered in real life) and recreate val by condition (meaning that if the condition is not satisfied, val will be untouched).
Hopefully in the near future this will be implemented and then the code could potentially be
setDT(the.df)[order(n), val[n > 2] := if(val[2L] == 0) 0L, by = x]
Which could be a great improvement both performance- and syntax-wise.
A base R approach might be
df <- the.df[order(the.df$x, the.df$n),]
df$val <- ave(df$val, df$x, FUN=fun)
As for fun, #DavidArenburg's answer in plain R and written a bit more poetically might be
fun0 <- function(v) {
idx <- which.max(v[2:length(v)] == 0L) + 1L
if (length(idx))
v[idx:length(v)] <- 0L
v
}
It seems like a good idea to formulate the solution as an independent function first, because then it is easy to test. fun0 fails for some edge cases, e.g.,
> fun0(0)
[1] 0 0 0
> fun0(1)
[1] 0 0 0
> fun0(c(1, 1))
[1] 1 0
A better version is
fun1 <- function(v) {
tst <- tail(v, -1) == 0L
if (any(tst)) {
idx <- which.max(tst) + 1L
v[idx:length(v)] <- 0L
}
v
}
And even better, following #Arun
fun <- function(v)
if (length(v) > 2) c(v[1], cummin(v[-1])) else v
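For example, applied to the val vector of group b (ordered by n) it reproduces the wanted values:
> fun(c(1, 0, 1, 1))
[1] 1 0 0 0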
This is competitive (same order of magnitude) with the data.table solution, with ordering and return occurring in less than 1s for the ~10m row data.frame of #m-dz's timings. At a second for millions of rows, it doesn't seem worthwhile to pursue further optimization.
Nonetheless, when there are a very large number of small groups (e.g., 2M each of size 5) an improvement is to avoid the tapply() function call by using group identity to offset the minimum. For instance,
df <- df[order(df$x, df$n),]
grp <- match(df$x, unique(df$x)) # strictly sequential groups
keep <- duplicated(grp) # ignore the first of each group
df$val[keep] <- cummin(df$val[keep] - grp[keep]) + grp[keep]
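As a quick check against the question's example data (assuming the.df as defined there):
df <- the.df[order(the.df$x, the.df$n), ]
grp <- match(df$x, unique(df$x))
keep <- duplicated(grp)
df$val[keep] <- cummin(df$val[keep] - grp[keep]) + grp[keep]
df$val
# [1] 1 0 0 0 1 0 0 0 1 1 0 0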
Hmmm, should be pretty efficient if you switch to data.table...
library(data.table)
# Define the.df as a data.table (or use data.table::setDT() function)
set.seed(0)
the.df <- data.table(
x = rep(letters[1:3], each = 4),
n = rep(0:3, 3),
val = round(runif(12))
)
m_dz <- function() {
setorder(the.df, x, n)
repeat{
# Get IDs of rows to change
# ids <- which(the.df[, (n > 1) & (val == 1) & (shift(val, 1L, type = "lag") == 0)])
ids <- the.df[(n > 1) & (val == 1) & (shift(val, 1L, type = "lag") == 0), , which = TRUE]
# If no IDs break
if(length(ids) == 0){
break
}
# Set val to 0
# for (i in ids) set(the.df, i = i, j = "val", value = 0)
set(the.df, i = ids, j = "val", value = 0)
}
return(the.df)
}
Edit: The function above is slightly modified thanks to #jangorecki, i.e. it uses which = TRUE and set(the.df, i = ids, j = "val", value = 0), which made the timings much more stable (no very high max timings).
Edit: timing comparison with #David Arenburg's answer on a slightly bigger table, with m_dz() updated (#FoldedChromatin's answer skipped because of different results).
My function is slightly faster in terms of the median and upper quantile, but there is quite a big spread in the timings (see max...) and I cannot figure out why. Hopefully the timing methodology is correct (returning the result to a different object etc.).
Anything bigger will kill my PC :(
set.seed(0)
groups_ids <- replicate(300, paste(sample(LETTERS, 5, replace=TRUE), collapse = ""))
size1 <- length(unique(groups_ids))
size2 <- round(1e7/size1)
df1 <- data.table(
x = rep(groups_ids, each = size2), # size1 groups of size2 rows each, ~1e7 rows in total
n = rep(0:(size2-1), size1),
val = round(runif(size1*size2))
)
df2 <- copy(df1)
# m-dz
m_dz <- function() {
setorder(df1, x, n)
repeat{
ids <- df1[(n > 1) & (val == 1) & (shift(val, 1L, type = "lag") == 0), , which = TRUE]
if(length(ids) == 0){
break
}
set(df1, i = ids, j = "val", value = 0)
}
return(df1)
}
# David Arenburg
DavidArenburg <- function() {
setorder(df2, x, n)
df2[, val := if(length(indx <- which.max(val[2:.N] == 0) + 1L)) c(val[1:indx], rep(0L, .N - indx)), by = x]
return(df2)
}
library(microbenchmark)
microbenchmark(
res1 <- m_dz(),
res2 <- DavidArenburg(),
times = 100
)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# res1 <- m_dz() 247.4136 268.5005 363.0117 288.4216 312.7307 7071.0960 100 a
# res2 <- DavidArenburg() 270.6074 281.3935 314.7864 303.5229 328.1210 525.8095 100 a
identical(res1, res2)
# [1] TRUE
Edit: (Old) results for even bigger table:
set.seed(0)
groups_ids <- replicate(300, paste(sample(LETTERS, 5, replace=TRUE), collapse = ""))
size1 <- length(unique(groups_ids))
size2 <- round(1e8/size1)
# Unit: seconds
# expr min lq mean median uq max neval cld
# res1 <- m_dz() 5.599855 5.800264 8.773817 5.923721 6.021132 289.85107 100 a
# res2 <- m_dz2() 5.571911 5.836191 9.047958 5.970952 6.123419 310.65280 100 a
# res3 <- DavidArenburg() 9.183145 9.519756 9.714105 9.723325 9.918377 10.28965 100 a
Why not just use by?
> set.seed(0)
> the.df <- data.frame( x=rep(letters[1:3], each=4),
n=rep(0:3, 3),
val=round(runif(12)))
> the.df
x n val
1 a 0 1
2 a 1 0
3 a 2 0
4 a 3 1
5 b 0 1
6 b 1 0
7 b 2 1
8 b 3 1
9 c 0 1
10 c 1 1
11 c 2 0
12 c 3 0
> Mod.df<-by(the.df,INDICES=the.df$x,function(x){
x$val[x$n==2]=0
Which=which(x$n==2 & x$val==0)+1
x$val[Which]=0
x})
> do.call(rbind,Mod.df)
x n val
a.1 a 0 1
a.2 a 1 0
a.3 a 2 0
a.4 a 3 0
b.5 b 0 1
b.6 b 1 0
b.7 b 2 0
b.8 b 3 0
c.9 c 0 1
c.10 c 1 1
c.11 c 2 0
c.12 c 3 0
I have a pair of binary variables (1s and 0s), and my professor wants me to create a new binary variable that takes the value 1 if both of the original variables have the value 1 (i.e., x = y = 1) and takes the value 0 otherwise.
How would I do this in R?
Thanks!
JMC
Here's one example with some sample data to play with:
set.seed(1)
A <- sample(0:1, 10, replace = TRUE)
B <- sample(0:1, 10, replace = TRUE)
A
# [1] 0 0 1 1 0 1 1 1 1 0
B
# [1] 0 0 1 0 1 0 1 1 0 1
as.numeric(A + B == 2)
# [1] 0 0 1 0 0 0 1 1 0 0
as.numeric(rowSums(cbind(A, B)) == 2)
# [1] 0 0 1 0 0 0 1 1 0 0
as.numeric(A == 1 & B == 1)
# [1] 0 0 1 0 0 0 1 1 0 0
Update (to introduce some more alternatives and share a link and a benchmark)
set.seed(1)
A <- sample(0:1, 1e7, replace = TRUE)
B <- sample(0:1, 1e7, replace = TRUE)
fun1 <- function() ifelse(A == 1 & B == 1, 1, 0)
fun2 <- function() as.numeric(A + B == 2)
fun3 <- function() as.numeric(A & B)
fun4 <- function() as.numeric(A == 1 & B == 1)
fun5 <- function() as.numeric(rowSums(cbind(A, B)) == 2)
library(microbenchmark)
microbenchmark(fun1(), fun2(), fun3(), fun4(), fun5(), times = 5)
# Unit: milliseconds
# expr min lq median uq max neval
# fun1() 4842.8559 4871.7072 5022.3525 5093.5932 10424.6589 5
# fun2() 220.8336 220.9867 226.1167 229.1225 472.4408 5
# fun3() 440.7427 445.9342 461.0114 462.6184 488.6627 5
# fun4() 604.1791 613.9284 630.4838 645.2146 682.4689 5
# fun5() 373.8088 373.8532 373.9460 435.0385 1084.6227 5
As can be seen, ifelse is indeed much slower than the other approaches mentioned here. See this SO question and answer for some more details about the efficiency of ifelse.