I have this dataset in which students take an exam multiple times over a period of years - a "fail" is a 0 and a "pass" is a 1. The data looks something like this:
# Load the data.table package
library(data.table)
# Generate some sample data
id = sample.int(10000, 100000, replace = TRUE)
res = c(1,0)
results = sample(res, 100000, replace = TRUE)
date_exam_taken = sample(seq(as.Date('1999/01/01'), as.Date('2020/01/01'), by="day"), 100000, replace = TRUE)
# Create a data table from the sample data
my_data = data.table(id, results, date_exam_taken)
my_data <- my_data[order(id, date_exam_taken)]
# Generate some additional columns for each record
my_data$general_id = 1:nrow(my_data)
my_data$exam_number = ave(my_data$general_id, my_data$id, FUN = seq_along)
my_data$general_id = NULL
id results date_exam_taken exam_number
1: 1 0 2002-10-06 1
2: 1 1 2003-07-21 2
3: 1 1 2003-10-15 3
4: 1 0 2005-07-21 4
5: 1 1 2014-08-22 5
6: 1 1 2015-09-11 6
I want to track the number of times each student failed an exam, given that they failed the two previous exams (and all such combinations). I tried to do this with the data.table library in R:
# Create new columns that contain the previous exam results
my_data$prev_exam = shift(my_data$results, 1)
my_data$prev_2_exam = shift(my_data$results, 2)
my_data$prev_3_exam = shift(my_data$results, 3)
# Count the number of exam results for each record
out <- my_data[!is.na(prev_exam), .(tally = .N), by = .(id, results, prev_exam, prev_2_exam, prev_3_exam)]
out = na.omit(out)
> head(out)
id results prev_exam prev_2_exam prev_3_exam tally
1: 1 1 1 1 1 1
2: 1 0 1 1 1 3
3: 1 0 0 1 1 2
4: 1 1 0 0 1 1
5: 1 0 1 0 0 1
6: 1 1 0 1 0 1
Can someone please tell me if I have done this correctly? Have I correctly used the "shift()" function in data.table?
shift() is being used correctly, but it's hard to tell what is going on once we get to my_data$counts = as.integer(rnorm(8,10,5)) in the rest of your code building my_grid.
One thing, though. The table should be filtered on !is.na(prev_3_exam) instead of !is.na(prev_exam).
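For reference, a minimal sketch of that fix, reusing the question's my_data and the prev_* columns already created with shift() (the rows whose lags cross student boundaries are handled below in f1 via id == shift(id, lag)):
# keep only rows that actually have three previous exams
out <- my_data[!is.na(prev_3_exam),
               .(tally = .N),
               by = .(id, results, prev_exam, prev_2_exam, prev_3_exam)]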
Here is a function that uses a similar approach to return out and my_grid in a list for the lag specified as a parameter. It uses data.table grouping rather than a for loop.
f1 <- function(dt, lag = 1L) {
  if (!identical(key(dt), c("id", "date_exam_taken"))) setkey(dt, id, date_exam_taken)
  nms <- paste0("exam", 1:lag)
  
  list(
    out = dt2 <- copy(dt)[
      , (nms) := shift(results, 1:lag) # see Frank's comment
    ][
      id == shift(id, lag)
    ][
      , .(tally = .N), by = c("id", "results", nms)
    ],
    my_grid = setorderv(
      dt2[
        , {
          counts <- sum(tally)
          .(
            counts = counts,
            probability = sum(results*tally)/counts
          )
        }, nms
      ], rev(nms)
    )
  )
}
Output:
f1(dt, 3L)
#> $out
#> id results exam1 exam2 exam3 tally
#> 1: 1 0 0 1 1 1
#> 2: 1 1 0 0 1 1
#> 3: 1 0 1 0 0 1
#> 4: 1 1 0 1 0 2
#> 5: 1 1 1 0 1 2
#> ---
#> 57437: 10000 1 0 0 0 1
#> 57438: 10000 0 1 0 0 1
#> 57439: 10000 1 0 1 0 1
#> 57440: 10000 0 1 0 1 1
#> 57441: 10000 0 0 1 0 1
#>
#> $my_grid
#> exam1 exam2 exam3 counts probability
#> 1: 0 0 0 8836 0.4980761
#> 2: 1 0 0 8832 0.5005661
#> 3: 0 1 0 8684 0.4947029
#> 4: 1 1 0 8770 0.4976055
#> 5: 0 0 1 8792 0.5013649
#> 6: 1 0 1 8631 0.5070096
#> 7: 0 1 1 8806 0.5021576
#> 8: 1 1 1 8682 0.4997696
If only my_grid is needed, here is a function wrapping an Rcpp function that uses bit shifting to perform the aggregation in a single-pass for loop without creating the helper columns with shift. It will be very fast, and its speed will be only marginally affected by the value of lag.
Rcpp::cppFunction("
IntegerVector exam_contingency(const IntegerVector& id, const IntegerVector& result, const int& lag) {
const int n = id.size();
const int lag1 = lag + 1;
int comb = result(0);
int mask = ~(1 << lag1);
IntegerVector out(pow(2, lag1));
for (int i = 1; i < lag1; i++) comb = (comb << 1) + result(i);
out(comb) = id(lag) == id(0);
for (int i = lag1; i < n; i++) {
comb = ((comb << 1) + result(i)) & mask;
out(comb) += id(i - lag) == id(i);
}
return(out);
}
")
f2 <- function(dt, lag = 1L) {
  if (!identical(key(dt), c("id", "date_exam_taken"))) setkey(dt, id, date_exam_taken)
  m <- matrix(
    exam_contingency(dt$id, dt$results, as.integer(lag)),
    2^lag, 2, byrow = TRUE
  )
  rs <- rowSums(m)
  
  cbind(
    if (lag == 1L) {
      data.frame(exam1 = 0:1)
    } else {
      setNames(
        expand.grid(rep(list(0:1), lag)),
        paste0("exam", 1:lag)
      )
    },
    data.frame(counts = rs, probability = m[,2]/rs)
  )
}
It gives the same output as f1's my_grid:
all.equal(f1(dt, 3L)$my_grid, setDT(f2(dt, 3L)))
#> [1] TRUE
Benchmarking:
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> f1(dt, 3) 8802.2 9706.70 11478.458 10394.00 12134.40 69630.0 100
#> f2(dt, 3) 971.2 1016.40 1179.404 1047.20 1108.65 7733.8 100
#> f2(dt, 10) 1181.3 1208.05 1256.333 1237.65 1302.40 1406.6 100
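The benchmark code itself is not shown above; a minimal sketch of how such timings could be produced, assuming the microbenchmark package and the dt defined under Data below:
library(microbenchmark)
microbenchmark(
  f1(dt, 3L),
  f2(dt, 3L),
  f2(dt, 10L),
  times = 100
)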
Data:
library(data.table)
(seed <- sample(.Machine$integer.max, 1))
#> [1] 1784920766
set.seed(seed)
dt <- data.table(
id = sample.int(10000, 100000, replace = TRUE),
results = sample(0:1, 100000, replace = TRUE),
date_exam_taken = sample(seq(as.Date('1999/01/01'), as.Date('2020/01/01'), by="day"), 100000, replace = TRUE)
)
setkey(dt, id, date_exam_taken)
Related
I have this dataset (students wrote an exam many times over a period of years and either passed or failed; I am interested in studying the effect of the previous test on the next test):
id = sample.int(10000, 100000, replace = TRUE)
res = c(1,0)
results = sample(res, 100000, replace = TRUE)
date_exam_taken = sample(seq(as.Date('1999/01/01'), as.Date('2020/01/01'), by="day"), 100000, replace = TRUE)
my_data = data.frame(id, results, date_exam_taken)
my_data <- my_data[order(my_data$id, my_data$date_exam_taken),]
my_data$general_id = 1:nrow(my_data)
my_data$exam_number = ave(my_data$general_id, my_data$id, FUN = seq_along)
my_data$general_id = NULL
id results date_exam_taken exam_number
7992 1 1 2004-04-23 1
24837 1 0 2004-12-10 2
12331 1 1 2007-01-19 3
34396 1 0 2007-02-21 4
85250 1 0 2007-09-26 5
11254 1 1 2009-12-20 6
I wrote this standard FOR LOOP and everything seems to work fine:
my_list = list()
for (i in 1:length(unique(my_data$id))) {
  tryCatch({
    start_i = my_data[my_data$id == i, ]
    pairs_i = data.frame(first = head(start_i$results, -1), second = tail(start_i$results, -1))
    frame_i = as.data.frame(table(pairs_i))
    frame_i$id = i
    print(frame_i)
    my_list[[i]] = frame_i
  }, error = function(e){})
}
final_a = do.call(rbind.data.frame, my_list)
Now, I am trying to "optimize" this loop by using "doParallel" libraries in R.
Using this post (How to transform a "for loop" in a "foreach" loop in R?) as a tutorial, I tried to convert my loop as follows:
# does this mean I should set makeCluster() to makeCluster(8)???
> detectCores()
[1] 8
my_list = list()
max = length(unique(my_data$id))
library(doParallel)
registerDoParallel(cl <- makeCluster(3))
# note: for some reason, this loop isn't printing?
test = foreach(i = 1:max, .combine = "rbind") %dopar% {
  tryCatch({
    start_i = my_data[my_data$id == i, ]
    pairs_i = data.frame(first = head(start_i$results, -1), second = tail(start_i$results, -1))
    frame_i = as.data.frame(table(pairs_i))
    frame_i$id = i
    print(frame_i)
    my_list[[i]] = frame_i
  }, error = function(e){})
}
final_b = do.call(rbind.data.frame, test)
Based on this - I have the following questions:
Have I correctly used the "doParallel" functionalities as they are intended to be used?
Is there yet a better way to do this?
Note: I am looking to run this code on a dataset with around 10 million unique ID's
Here is a way with the parallel code written as a function.
I split the data by id beforehand, instead of comparing each id with the current index i. This saves some time. It also saves time to extract the results vector only once.
I don't know why: I haven't found any errors in my parallel code, but the final data.frame is not equal to the sequential output final_a; it has more rows.
This is system dependent, but as you can see in the timings, the 6-core run is the fastest.
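As a small illustration of the split-beforehand point (not part of the timed code below), the per-id subsetting inside the loop rescans the whole data.frame for every id, while split() partitions it once up front:
split_data <- split(my_data, my_data$id)   # one pass over the data, done before the loop
start_i <- split_data[["1"]]               # cheap lookup, instead of my_data[my_data$id == i, ]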
library(parallel)
library(doParallel)
#> Loading required package: foreach
#> Loading required package: iterators
parFun <- function(my_data, ncores) {
  split_data <- split(my_data, my_data$id)
  
  registerDoParallel(cl <- makeCluster(ncores))
  on.exit(stopCluster(cl))
  
  test <- foreach(i = seq_along(split_data)) %dopar% {
    start_i_results <- split_data[[i]]$results
    n <- length(start_i_results)
    if (n > 1L) {
      tryCatch({
        pairs_i <- data.frame(first = start_i_results[-n],
                              second = start_i_results[-1L])
        frame_i <- as.data.frame(table(pairs_i))
        frame_i$id <- i
        frame_i
      }, error = function(e) {e})
    } else NULL
  }
  
  final_b <- do.call(rbind.data.frame, test)
  final_b
}
set.seed(2022)
id <- sample.int(10000, 100000, replace = TRUE)
res <- c(1,0)
results <- sample(res, 100000, replace = TRUE)
date_exam_taken <- sample(seq(as.Date('1999/01/01'), as.Date('2020/01/01'), by="day"), 100000, replace = TRUE)
my_data <- data.frame(id, results, date_exam_taken)
my_data <- my_data[order(my_data$id, my_data$date_exam_taken),]
my_data$general_id = 1:nrow(my_data)
my_data$exam_number = ave(my_data$general_id, my_data$id, FUN = seq_along)
my_data$general_id = NULL
t0 <- system.time({
  my_list = list()
  for (i in 1:length(unique(my_data$id))) {
    tryCatch({
      start_i = my_data[my_data$id == i, ]
      pairs_i = data.frame(first = head(start_i$results, -1), second = tail(start_i$results, -1))
      frame_i = as.data.frame(table(pairs_i))
      frame_i$id = i
      # print(frame_i)
      my_list[[i]] = frame_i
    }, error = function(e){})
  }
  final_a = do.call(rbind.data.frame, my_list)
})
ncores <- detectCores()
# run with 3 cores
t3 <- system.time(parFun(my_data, 3L))
# run with 6 cores and save the result in `res6`
t6 <- system.time(res6 <- parFun(my_data, ncores - 2L))
rbind(t0, t3, t6)[,1:3]
#> user.self sys.self elapsed
#> t0 12.86 1.00 15.37
#> t3 3.50 0.22 8.37
#> t6 3.61 0.46 7.65
head(final_a, 10)
#> first second Freq id
#> 1 0 0 2 1
#> 2 1 0 3 1
#> 3 0 1 4 1
#> 4 1 1 0 1
#> 5 0 0 5 2
#> 6 1 0 3 2
#> 7 0 1 2 2
#> 8 1 1 0 2
#> 9 0 0 0 3
#> 10 1 0 1 3
head(res6, 10)
#> first second Freq id
#> 1 0 0 2 1
#> 2 1 0 3 1
#> 3 0 1 4 1
#> 4 1 1 0 1
#> 5 0 0 5 2
#> 6 1 0 3 2
#> 7 0 1 2 2
#> 8 1 1 0 2
#> 9 0 0 0 3
#> 10 1 0 1 3
str(final_a)
#> 'data.frame': 38945 obs. of 4 variables:
#> $ first : Factor w/ 2 levels "0","1": 1 2 1 2 1 2 1 2 1 2 ...
#> $ second: Factor w/ 2 levels "0","1": 1 1 2 2 1 1 2 2 1 1 ...
#> $ Freq : int 2 3 4 0 5 3 2 0 0 1 ...
#> $ id : int 1 1 1 1 2 2 2 2 3 3 ...
str(res6)
#> 'data.frame': 38949 obs. of 4 variables:
#> $ first : Factor w/ 2 levels "0","1": 1 2 1 2 1 2 1 2 1 2 ...
#> $ second: Factor w/ 2 levels "0","1": 1 1 2 2 1 1 2 2 1 1 ...
#> $ Freq : int 2 3 4 0 5 3 2 0 0 1 ...
#> $ id : int 1 1 1 1 2 2 2 2 3 3 ...
Created on 2022-12-11 with reprex v2.0.2
Edit
The following version seems much faster.
parFun2 <- function(my_data, ncores) {
  registerDoParallel(cl <- makeCluster(ncores))
  on.exit(stopCluster(cl))
  
  results_list <- split(my_data$results, my_data$id)
  
  test <- foreach(i = seq_along(results_list)) %dopar% {
    start_i_results <- results_list[[i]]
    n <- length(start_i_results)
    if (n > 1L) {
      tbl <- table(first = start_i_results[-n],
                   second = start_i_results[-1L])
      frame_i <- as.data.frame(tbl)
      frame_i$id <- i
      frame_i
    } else NULL
  }
  
  data.table::rbindlist(test)
}
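An illustrative call (not part of the original answer), timed the same way as above; note that parFun2() returns a data.table via rbindlist() rather than a data.frame:
t6b <- system.time(res6b <- parFun2(my_data, ncores - 2L))
t6b
nrow(res6b)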
I can't nest an apply function to repeat it many times with other combinations of columns
I need to get the percentage of sp1==1 & s1==1 and of sp2==1 & s1==1 with respect to s1, and to do the same with respect to s2, s3, ... s1000. Here is a short example:
x <- data.frame("sp1"=rep(0:1, times=5),
"sp2"=rep(0:1, each=5),
"s1" = rep(0:1, times=10),
"s2" = rep(0:1, each=2),
"s3" = rep(1:0, each=2))
> x
sp1 sp2 s1 s2 s3
1 0 0 0 0 1
2 1 0 1 0 1
3 0 0 0 1 0
4 1 0 1 1 0
5 0 0 0 0 1
6 1 1 1 0 1
7 0 1 0 1 0
8 1 1 1 1 0
9 0 1 0 0 1
10 1 1 1 0 1
11 0 0 0 1 0
12 1 0 1 1 0
13 0 0 0 0 1
14 1 0 1 0 1
15 0 0 0 1 0
16 1 1 1 1 0
17 0 1 0 0 1
18 1 1 1 0 1
19 0 1 0 1 0
20 1 1 1 1 0
Now I have written a function to calculate the percentage with respect to s1:
r <- as.data.frame(sapply(x[,1:2],
function(i) sum(i ==1 & x$s1 == 1)/sum(i ==1)))
> r
sapply(x[, 1:2], function(i) sum(i == 1 & x$s1 == 1)/sum(i == 1))
sp1 1.0
sp2 0.6
I want to build a data frame with all percentages of sp1, sp2, sp3, ... sp200 with respect to s1, s2, s3, ... s1000:
> r
s1 s2 s3 ... s1000
sp1 1.0 0.5 0.5
sp2 0.6 0.5 0.5
...
sp200
I've tried to write a function with both groups of variables, one for the sp's and another for the s's:
intento <- as.data.frame(sapply(i=x[,1:2],
j=x[,3:5],
function(i,j)sum(i ==1 & j == 1)/sum(i ==1)))
But of course that's not the way:
Error in match.fun(FUN) : argument "FUN" is missing, with no default
We can separate the columns based on their names and use sapply on them:
sp_cols <- grep("^sp", names(x))
s_cols <- grep("^s\\d+", names(x))
sapply(x[sp_cols], function(i) sapply(x[s_cols],
function(j) sum(i == 1 & j == 1)/sum(i == 1)))
If you have only 1's and 0's as values in the columns, this can be reduced to:
sapply(x[s_cols], function(i) sapply(x[sp_cols], function(j) sum(i & j)/sum(j)))
# s1 s2 s3
#sp1 1.0 0.5 0.5
#sp2 0.6 0.5 0.5
You're looking for outer. Your function just needs to be Vectorized.
FUN <- Vectorize(function(i,j) sum(x[i] == 1 & x[j] == 1)/sum(x[i] == 1))
outer(1:2, 3:5, FUN)
# [,1] [,2] [,3]
# [1,] 1.0 0.5 0.5
# [2,] 0.6 0.5 0.5
You could refine this using grep to find the columns automatically
outer(grep("sp", names(x)), grep("s\\d+", names(x)), FUN)
A similar approach is to use lapply(x, function(x) which(x == 1)) and then use that result down the road. The thought process is that we might as well store the information instead of repeatedly checking it.
#as suggested by #Ronak
sp_cols <- grep("^sp", names(x))
s_cols <- grep("^s\\d+", names(x))
x_l_zero <- lapply(x, function(x) which(x == 1))
sapply(x_l_zero[s_cols]
, function(x) sapply(x_l_zero[sp_cols]
, function(y) length(intersect(x,y))/length(y)))
s1 s2 s3
sp1 1.0 0.5 0.5
sp2 0.6 0.5 0.5
@Ronak has the fastest solution, and it is more or less the OP's code with the issue addressed.
Unit: microseconds
expr min lq mean median uq max neval
jay.sf_outer_FUN 1190.8 1240.85 1360.103 1284.50 1337.30 2627.0 100
cole_which_apply 268.4 289.00 454.609 306.05 322.00 7610.7 100
ronak_1_unsimple 181.3 193.95 321.863 209.95 233.40 6227.4 100
ronak_2_simple 228.5 241.25 342.354 250.65 276.05 7478.4 100
akrun_dplyr 5218.7 5506.05 6108.997 5721.80 6081.65 25147.3 100
Code for performance:
library(microbenchmark)
library(tidyverse)
##data set
x <- data.frame("sp1"=rep(0:1, times=5),
"sp2"=rep(0:1, each=5),
"s1" = rep(0:1, times=10),
"s2" = rep(0:1, each=2),
"s3" = rep(1:0, each=2))
#for jay.sf
FUN <- Vectorize(function(i,j) sum(x[i] == 1 & x[j] == 1)/sum(x[i] == 1))
#names of columns
sp_cols <- grep("^sp", names(x))
s_cols <- grep("^s\\d+", names(x))
sp_cols_nam <- grep("^sp", names(x), value = T)
s_cols_nam <- grep("^s\\d+", names(x), value = T)
#benchmark
microbenchmark(
jay.sf_outer_FUN = {
outer(sp_cols, s_cols, FUN)
}
, cole_which_apply = {
x_l_zero <- lapply(x, function(x) which(x == 1))
sapply(x_l_zero[s_cols], function(x) sapply(x_l_zero[sp_cols] , function(y) length(intersect(x,y))/length(y)))
}
, ronak_1_unsimple = {
sapply(x[sp_cols], function(i) sapply(x[s_cols],
function(j) sum(i == 1 & j == 1)/sum(i == 1)))
}
, ronak_2_simple = {
sapply(x[s_cols], function(i) sapply(x[sp_cols], function(j) sum(i & j)/sum(j)))
}
, akrun_dplyr = {
crossing(nm1 = sp_cols_nam,
nm2 = s_cols_nam) %>%
mutate(val = pmap_dbl(., ~ sum(x[..1] ==1 & x[..2] == 1)/sum(x[..1]))) %>%
spread(nm2, val)
}
)
Here is an option with tidyverse
library(tidyverse)
crossing(nm1 = names(x)[startsWith(names(x), "sp")],
nm2 = grep("^s\\d+", names(x), value = TRUE)) %>%
mutate(val = pmap_dbl(., ~ sum(x[..1] ==1 & x[..2] == 1)/sum(x[..1]))) %>%
spread(nm2, val)
# A tibble: 2 x 4
# nm1 s1 s2 s3
# <chr> <dbl> <dbl> <dbl>
#1 sp1 1 0.5 0.5
#2 sp2 0.6 0.5 0.5
I have a table in the long format:
require(data.table)
sampleDT <- data.table(Old = c("A","B","A","B","A","B","A","B")
, New = c("A","A","B","B","A","A","B","B")
, Time = c(1,1,1,1,2,2,2,2)
, value1 = c(1,1,1,1,1,1,1,1)
, value2 = c(0,0,0,0,0,0,0,0))
print(sampleDT)
Old New Time value1 value2
1: A A 1 1 0
2: B A 1 1 0
3: A B 1 1 0
4: B B 1 1 0
5: A A 2 1 0
6: B A 2 1 0
7: A B 2 1 0
8: B B 2 1 0
I would like to convert it to an array of 3 dimensions.
Basically, we would have columns "New, Old, Time" as our three dimensions.
And the value for each cell is an output of some sort of functions whose input are "value1, value2".
In this case, when Time = 1, the result is:
matrix(data = c(1, 1+0, 0, -0), nrow = 2, ncol = 2, byrow = FALSE)
[,1] [,2]
[1,] 1 0
[2,] 1 0
How can I achieve this?
Memory usage and computing time are important considerations, as we're working on relatively big datasets.
Try xtabs():
sampleDT <- data.frame(Old = c("A","B","A","B","A","B","A","B"),
New = c("A","A","B","B","A","A","B","B"),
Time = c(1,1,1,1,2,2,2,2),
value1 = c(1,1,1,1,1,1,1,1),
value2 = c(0,0,0,0,0,0,0,0))
Value1 <- xtabs(value1 ~ Old + New + Time, sampleDT, drop = FALSE)
Value2 <- xtabs(value2 ~ Old + New + Time, sampleDT, drop = FALSE)
is.array(Value1)
is.array(Value2)
Value1[, 2,] <- 0 # Sets all second columns to zero for Value1
Value2[1,,] <- 0 # Idem with first row for Value2
Value2[2,2,] <- Value2[2,2,] * (-1)
Result <- Value1 + Value2
Result
, , Time = 1
New
Old A B
A 1 0
B 1 0
, , Time = 2
New
Old A B
A 1 0
B 1 0
Hope it helps.
Let's say I have something like this:
set.seed(0)
the.df <- data.frame( x=rep(letters[1:3], each=4),
n=rep(0:3, 3),
val=round(runif(12)))
the.df
x n val
1 a 0 1
2 a 1 0
3 a 2 0
4 a 3 1
5 b 0 1
6 b 1 0
7 b 2 1
8 b 3 1
9 c 0 1
10 c 1 1
11 c 2 0
12 c 3 0
Within each x, starting from n==2 (going from small to large), I want to set val to 0 if the previous val (in terms of n) is 0; otherwise, leave it as is.
For example, in the subset x=="b", I first ignore the two rows where n < 2. Now, in Row 7, because the previous val is 0 (the.df$val[the.df$x=="b" & the.df$n==1]), I set val to 0 (the.df$val[the.df$x=="b" & the.df$n==2] <- 0). Then on Row 8, now that val for the previous n is 0 (we just set it), I also want to set val here to 0 (the.df$val[the.df$x=="b" & the.df$n==3] <- 0).
Imagine that the data.frame is not sorted. Therefore procedures that depend on the order would require a sort. I also can't assume that adjacent rows exist (e.g., the row the.df[the.df$x=="a" & the.df$n==1, ] might be missing).
The trickiest part seems to be evaluating val in sequence. I can do this using a loop but I imagine that it would be inefficient (I have millions of rows). Is there a way I can do this more efficiently?
EDIT: wanted output
the.df
x n val wanted
1 a 0 1 1
2 a 1 0 0
3 a 2 0 0
4 a 3 1 0
5 b 0 1 1
6 b 1 0 0
7 b 2 1 0
8 b 3 1 0
9 c 0 1 1
10 c 1 1 1
11 c 2 0 0
12 c 3 0 0
Also, I don't mind making new columns (e.g., putting the wanted values there).
Using data.table I would try the following
library(data.table)
setDT(the.df)[order(n),
              val := if (length(indx <- which(val[2:.N] == 0L)))
                c(val[1:(indx[1L] + 1L)], rep(0L, .N - (indx[1L] + 1L))),
              by = x]
the.df
# x n val
# 1: a 0 1
# 2: a 1 0
# 3: a 2 0
# 4: a 3 0
# 5: b 0 1
# 6: b 1 0
# 7: b 2 0
# 8: b 3 0
# 9: c 0 1
# 10: c 1 1
# 11: c 2 0
# 12: c 3 0
This will simultaneously order the data by n (as you said it's not ordered in real life) and recreate val by condition (meaning that if condition not satisfied, val will be untouched).
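To see what the j expression does for a single group (an illustrative walk-through, not part of the original answer), take the x == "b" values ordered by n:
val <- c(1, 0, 1, 1)            # val for x == "b", ordered by n
N <- length(val)                # plays the role of .N inside the group
indx <- which(val[2:N] == 0L)   # positions (relative to the 2nd element) of zeros
c(val[1:(indx[1L] + 1L)], rep(0L, N - (indx[1L] + 1L)))
# [1] 1 0 0 0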
Hopefully in the near future this will be implemented and then the code could potentially be
setDT(the.df)[order(n), val[n > 2] := if(val[2L] == 0) 0L, by = x]
That would be a great improvement, both performance- and syntax-wise.
A base R approach might be
df <- the.df[order(the.df$x, the.df$n),]
df$val <- ave(df$val, df$x, FUN=fun)
As for fun, @DavidArenburg's answer in plain R, written a bit more poetically, might be
fun0 <- function(v) {
  idx <- which.max(v[2:length(v)] == 0L) + 1L
  if (length(idx))
    v[idx:length(v)] <- 0L
  v
}
It seems like a good idea to formulate the solution as an independent function first, because then it is easy to test. fun0 fails for some edge cases, e.g.,
> fun0(0)
[1] 0 0 0
> fun0(1)
[1] 0 0 0
> fun0(c(1, 1))
[1] 1 0
A better version is
fun1 <- function(v) {
  tst <- tail(v, -1) == 0L
  if (any(tst)) {
    idx <- which.max(tst) + 1L
    v[idx:length(v)] <- 0L
  }
  v
}
And even better, following @Arun:
fun <- function(v)
  if (length(v) > 2) c(v[1], cummin(v[-1])) else v
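A quick sanity check of fun() on the x == "b" values from the example (added for illustration; it matches the wanted column above):
fun(c(1, 0, 1, 1))
# [1] 1 0 0 0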
This is competitive (same order of magnitude) with the data.table solution, with ordering and return occurring in less than 1s for the ~10m row data.frame of @m-dz's timings. At about a second for millions of rows, it doesn't seem worthwhile to pursue further optimization.
Nonetheless, when there are a very large number of small groups (e.g., 2M each of size 5), an improvement is to avoid the per-group function calls by using group identity to offset the minimum. For instance,
df <- df[order(df$x, df$n),]
grp <- match(df$x, unique(df$x)) # strictly sequential groups
keep <- duplicated(grp) # ignore the first of each group
df$val[keep] <- cummin(df$val[keep] - grp[keep]) + grp[keep]
Hmmm, should be pretty efficient if you switch to data.table...
library(data.table)
# Define the.df as a data.table (or use data.table::setDT() function)
set.seed(0)
the.df <- data.table(
x = rep(letters[1:3], each = 4),
n = rep(0:3, 3),
val = round(runif(12))
)
m_dz <- function() {
  setorder(the.df, x, n)
  repeat {
    # Get IDs of rows to change
    # ids <- which(the.df[, (n > 1) & (val == 1) & (shift(val, 1L, type = "lag") == 0)])
    ids <- the.df[(n > 1) & (val == 1) & (shift(val, 1L, type = "lag") == 0), , which = TRUE]
    # If no IDs break
    if (length(ids) == 0) {
      break
    }
    # Set val to 0
    # for (i in ids) set(the.df, i = i, j = "val", value = 0)
    set(the.df, i = ids, j = "val", value = 0)
  }
  return(the.df)
}
Edit: The above function was slightly modified thanks to @jangorecki's suggestions, i.e. it uses which = TRUE and set(the.df, i = ids, j = "val", value = 0), which made the timings much more stable (no very high max timings).
Edit: timing comparison with @DavidArenburg's answer on a slightly bigger table, with m_dz() updated (@FoldedChromatin's answer skipped because of different results).
My function is slightly faster in terms of median and upper quantile, but there is quite a big spread in timings (see max...); I cannot figure out why. Hopefully the timing methodology is correct (returning the results to different objects, etc.).
Anything bigger will kill my PC :(
set.seed(0)
groups_ids <- replicate(300, paste(sample(LETTERS, 5, replace=TRUE), collapse = ""))
size1 <- length(unique(groups_ids))
size2 <- round(1e7/size1)
df1 <- data.table(
  x = rep(groups_ids, each = size2),
  n = rep(0:(size2-1), size1),
  val = round(runif(size1*size2))
)
df2 <- copy(df1)
# m-dz
m_dz <- function() {
  setorder(df1, x, n)
  repeat {
    ids <- df1[(n > 1) & (val == 1) & (shift(val, 1L, type = "lag") == 0), , which = TRUE]
    if (length(ids) == 0) {
      break
    }
    set(df1, i = ids, j = "val", value = 0)
  }
  return(df1)
}
# David Arenburg
DavidArenburg <- function() {
  setorder(df2, x, n)
  df2[, val := if (length(indx <- which.max(val[2:.N] == 0) + 1L)) c(val[1:indx], rep(0L, .N - indx)), by = x]
  return(df2)
}
library(microbenchmark)
microbenchmark(
res1 <- m_dz(),
res2 <- DavidArenburg(),
times = 100
)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# res1 <- m_dz() 247.4136 268.5005 363.0117 288.4216 312.7307 7071.0960 100 a
# res2 <- DavidArenburg() 270.6074 281.3935 314.7864 303.5229 328.1210 525.8095 100 a
identical(res1, res2)
# [1] TRUE
Edit: (Old) results for even bigger table:
set.seed(0)
groups_ids <- replicate(300, paste(sample(LETTERS, 5, replace=TRUE), collapse = ""))
size1 <- length(unique(groups_ids))
size2 <- round(1e8/size1)
# Unit: seconds
# expr min lq mean median uq max neval cld
# res1 <- m_dz() 5.599855 5.800264 8.773817 5.923721 6.021132 289.85107 100 a
# res2 <- m_dz2() 5.571911 5.836191 9.047958 5.970952 6.123419 310.65280 100 a
# res3 <- DavidArenburg() 9.183145 9.519756 9.714105 9.723325 9.918377 10.28965 100 a
Why not just use by?
> set.seed(0)
> the.df <- data.frame( x=rep(letters[1:3], each=4),
n=rep(0:3, 3),
val=round(runif(12)))
> the.df
x n val
1 a 0 1
2 a 1 0
3 a 2 0
4 a 3 1
5 b 0 1
6 b 1 0
7 b 2 1
8 b 3 1
9 c 0 1
10 c 1 1
11 c 2 0
12 c 3 0
> Mod.df <- by(the.df, INDICES = the.df$x, function(x){
    x$val[x$n == 2] = 0
    Which = which(x$n == 2 & x$val == 0) + 1
    x$val[Which] = 0
    x})
> do.call(rbind,Mod.df)
x n val
a.1 a 0 1
a.2 a 1 0
a.3 a 2 0
a.4 a 3 0
b.5 b 0 1
b.6 b 1 0
b.7 b 2 0
b.8 b 3 0
c.9 c 0 1
c.10 c 1 1
c.11 c 2 0
c.12 c 3 0
I have a pair of binary variables (1's and 0's), and my professor wants me to create a new binary variable that takes the value 1 if both of the previous variables have the value 1 (i.e., x,y=1) and takes the value zero otherwise.
How would I do this in R?
Thanks!
JMC
Here's one example with some sample data to play with:
set.seed(1)
A <- sample(0:1, 10, replace = TRUE)
B <- sample(0:1, 10, replace = TRUE)
A
# [1] 0 0 1 1 0 1 1 1 1 0
B
# [1] 0 0 1 0 1 0 1 1 0 1
as.numeric(A + B == 2)
# [1] 0 0 1 0 0 0 1 1 0 0
as.numeric(rowSums(cbind(A, B)) == 2)
# [1] 0 0 1 0 0 0 1 1 0 0
as.numeric(A == 1 & B == 1)
# [1] 0 0 1 0 0 0 1 1 0 0
Update (to introduce some more alternatives and share a link and a benchmark)
set.seed(1)
A <- sample(0:1, 1e7, replace = TRUE)
B <- sample(0:1, 1e7, replace = TRUE)
fun1 <- function() ifelse(A == 1 & B == 1, 1, 0)
fun2 <- function() as.numeric(A + B == 2)
fun3 <- function() as.numeric(A & B)
fun4 <- function() as.numeric(A == 1 & B == 1)
fun5 <- function() as.numeric(rowSums(cbind(A, B)) == 2)
library(microbenchmark)
microbenchmark(fun1(), fun2(), fun3(), fun4(), fun5(), times = 5)
# Unit: milliseconds
# expr min lq median uq max neval
# fun1() 4842.8559 4871.7072 5022.3525 5093.5932 10424.6589 5
# fun2() 220.8336 220.9867 226.1167 229.1225 472.4408 5
# fun3() 440.7427 445.9342 461.0114 462.6184 488.6627 5
# fun4() 604.1791 613.9284 630.4838 645.2146 682.4689 5
# fun5() 373.8088 373.8532 373.9460 435.0385 1084.6227 5
As can be seen, ifelse is indeed much slower than the other approaches mentioned here. See this SO question and answer for some more details about the efficiency of ifelse.
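For completeness, an illustrative variant (not from the original answer) that keeps the result as an integer vector instead of coercing to double:
fun6 <- function() as.integer(A & B)
all(fun6() == fun3())
# [1] TRUE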