I have this dataset (students wrote an exam many times over a period of years and either passed or failed; I am interested in studying the effect of the previous test on the next test):
id = sample.int(10000, 100000, replace = TRUE)
res = c(1,0)
results = sample(res, 100000, replace = TRUE)
date_exam_taken = sample(seq(as.Date('1999/01/01'), as.Date('2020/01/01'), by="day"), 100000, replace = TRUE)
my_data = data.frame(id, results, date_exam_taken)
my_data <- my_data[order(my_data$id, my_data$date_exam_taken),]
my_data$general_id = 1:nrow(my_data)
my_data$exam_number = ave(my_data$general_id, my_data$id, FUN = seq_along)
my_data$general_id = NULL
id results date_exam_taken exam_number
7992 1 1 2004-04-23 1
24837 1 0 2004-12-10 2
12331 1 1 2007-01-19 3
34396 1 0 2007-02-21 4
85250 1 0 2007-09-26 5
11254 1 1 2009-12-20 6
I wrote this standard for loop and everything seems to work fine:
my_list = list()
for (i in 1:length(unique(my_data$id))) {
  tryCatch({
    start_i = my_data[my_data$id == i, ]
    pairs_i = data.frame(first = head(start_i$results, -1), second = tail(start_i$results, -1))
    frame_i = as.data.frame(table(pairs_i))
    frame_i$id = i
    print(frame_i)
    my_list[[i]] = frame_i
  }, error = function(e) {})
}
final_a = do.call(rbind.data.frame, my_list)
Now, I am trying to "optimize" this loop by using the "doParallel" library in R.
Using this post (How to transform a "for loop" in a "foreach" loop in R?) as a tutorial, I tried to convert my loop as follows:
# does this mean I should set makeCluster() to makeCluster(8)???
> detectCores()
[1] 8
my_list = list()
max = length(unique(my_data$id))
library(doParallel)
registerDoParallel(cl <- makeCluster(3))
# note: for some reason, this loop isn't printing?
test = foreach(i = 1:max, .combine = "rbind") %dopar% {
  tryCatch({
    start_i = my_data[my_data$id == i, ]
    pairs_i = data.frame(first = head(start_i$results, -1), second = tail(start_i$results, -1))
    frame_i = as.data.frame(table(pairs_i))
    frame_i$id = i
    print(frame_i)
    my_list[[i]] = frame_i
  }, error = function(e) {})
}
final_b = do.call(rbind.data.frame, test)
Based on this, I have the following questions:
Have I correctly used the "doParallel" functionalities as they are intended to be used?
Is there a better way to do this?
Note: I am looking to run this code on a dataset with around 10 million unique IDs.
Here is a way with the parallel code written as a function.
I split the data by id beforehand, instead of comparing each id with the current index i. This saves some time. It also saves time to extract the results vector only once.
I don't know why; I haven't found any errors in my parallel code, but the final data.frame is not equal to the sequential output final_a: it has more rows.
This is system dependent, but as you can see in the timings, the 6-core run is the fastest.
library(parallel)
library(doParallel)
#> Loading required package: foreach
#> Loading required package: iterators
parFun <- function(my_data, ncores) {
split_data <- split(my_data, my_data$id)
registerDoParallel(cl <- makeCluster(ncores))
on.exit(stopCluster(cl))
test <- foreach(i = seq_along(split_data)) %dopar% {
start_i_results <- split_data[[i]]$results
n <- length(start_i_results)
if(n > 1L) {
tryCatch({
pairs_i <- data.frame(first = start_i_results[-n],
second = start_i_results[-1L])
frame_i <- as.data.frame(table(pairs_i))
frame_i$id <- i
frame_i
}, error = function(e) {e})
} else NULL
}
final_b <- do.call(rbind.data.frame, test)
final_b
}
set.seed(2022)
id <- sample.int(10000, 100000, replace = TRUE)
res <- c(1,0)
results <- sample(res, 100000, replace = TRUE)
date_exam_taken <- sample(seq(as.Date('1999/01/01'), as.Date('2020/01/01'), by="day"), 100000, replace = TRUE)
my_data <- data.frame(id, results, date_exam_taken)
my_data <- my_data[order(my_data$id, my_data$date_exam_taken),]
my_data$general_id = 1:nrow(my_data)
my_data$exam_number = ave(my_data$general_id, my_data$id, FUN = seq_along)
my_data$general_id = NULL
t0 <- system.time({
  my_list = list()
  for (i in 1:length(unique(my_data$id))) {
    tryCatch({
      start_i = my_data[my_data$id == i, ]
      pairs_i = data.frame(first = head(start_i$results, -1), second = tail(start_i$results, -1))
      frame_i = as.data.frame(table(pairs_i))
      frame_i$id = i
      # print(frame_i)
      my_list[[i]] = frame_i
    }, error = function(e) {})
  }
  final_a = do.call(rbind.data.frame, my_list)
})
ncores <- detectCores()
# run with 3 cores
t3 <- system.time(parFun(my_data, 3L))
# run with 6 cores and save the result in `res6`
t6 <- system.time(res6 <- parFun(my_data, ncores - 2L))
rbind(t0, t3, t6)[,1:3]
#> user.self sys.self elapsed
#> t0 12.86 1.00 15.37
#> t3 3.50 0.22 8.37
#> t6 3.61 0.46 7.65
head(final_a, 10)
#> first second Freq id
#> 1 0 0 2 1
#> 2 1 0 3 1
#> 3 0 1 4 1
#> 4 1 1 0 1
#> 5 0 0 5 2
#> 6 1 0 3 2
#> 7 0 1 2 2
#> 8 1 1 0 2
#> 9 0 0 0 3
#> 10 1 0 1 3
head(res6, 10)
#> first second Freq id
#> 1 0 0 2 1
#> 2 1 0 3 1
#> 3 0 1 4 1
#> 4 1 1 0 1
#> 5 0 0 5 2
#> 6 1 0 3 2
#> 7 0 1 2 2
#> 8 1 1 0 2
#> 9 0 0 0 3
#> 10 1 0 1 3
str(final_a)
#> 'data.frame': 38945 obs. of 4 variables:
#> $ first : Factor w/ 2 levels "0","1": 1 2 1 2 1 2 1 2 1 2 ...
#> $ second: Factor w/ 2 levels "0","1": 1 1 2 2 1 1 2 2 1 1 ...
#> $ Freq : int 2 3 4 0 5 3 2 0 0 1 ...
#> $ id : int 1 1 1 1 2 2 2 2 3 3 ...
str(res6)
#> 'data.frame': 38949 obs. of 4 variables:
#> $ first : Factor w/ 2 levels "0","1": 1 2 1 2 1 2 1 2 1 2 ...
#> $ second: Factor w/ 2 levels "0","1": 1 1 2 2 1 1 2 2 1 1 ...
#> $ Freq : int 2 3 4 0 5 3 2 0 0 1 ...
#> $ id : int 1 1 1 1 2 2 2 2 3 3 ...
Created on 2022-12-11 with reprex v2.0.2
Edit
The following version seems much faster.
parFun2 <- function(my_data, ncores) {
registerDoParallel(cl <- makeCluster(ncores))
on.exit(stopCluster(cl))
results_list <- split(my_data$results, my_data$id)
test <- foreach(i = seq_along(results_list)) %dopar% {
start_i_results <- results_list[[i]]
n <- length(start_i_results)
if(n > 1L) {
tbl <- table(first = start_i_results[-n],
second = start_i_results[-1L])
frame_i <- as.data.frame(tbl)
frame_i$id <- i
frame_i
} else NULL
}
data.table::rbindlist(test)
}
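For reference, a call mirroring the earlier timing runs would look something like this (6 workers is an illustrative choice; data.table must be installed for rbindlist):
# assumes parFun2() and my_data from above
t6b <- system.time(res6b <- parFun2(my_data, 6L))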
Related
I have this dataset in which students take an exam multiple times over a period of years - a "fail" is a 0 and a "pass" is a 1. The data looks something like this:
# Load the data.table package
library(data.table)
# Generate some sample data
id = sample.int(10000, 100000, replace = TRUE)
res = c(1,0)
results = sample(res, 100000, replace = TRUE)
date_exam_taken = sample(seq(as.Date('1999/01/01'), as.Date('2020/01/01'), by="day"), 100000, replace = TRUE)
# Create a data table from the sample data
my_data = data.table(id, results, date_exam_taken)
my_data <- my_data[order(id, date_exam_taken)]
# Generate some additional columns for each record
my_data$general_id = 1:nrow(my_data)
my_data$exam_number = ave(my_data$general_id, my_data$id, FUN = seq_along)
my_data$general_id = NULL
id results date_exam_taken exam_number
1: 1 0 2002-10-06 1
2: 1 1 2003-07-21 2
3: 1 1 2003-10-15 3
4: 1 0 2005-07-21 4
5: 1 1 2014-08-22 5
6: 1 1 2015-09-11 6
I want to track the number of times each student failed an exam, given that they failed the two previous exams (and all such combinations). I tried to do this with the data.table library in R:
# Create new columns that contain the previous exam results
my_data$prev_exam = shift(my_data$results, 1)
my_data$prev_2_exam = shift(my_data$results, 2)
my_data$prev_3_exam = shift(my_data$results, 3)
# Count the number of exam results for each record
out <- my_data[!is.na(prev_exam), .(tally = .N), by = .(id, results, prev_exam, prev_2_exam, prev_3_exam)]
out = na.omit(out)
> head(out)
id results prev_exam prev_2_exam prev_3_exam tally
1: 1 1 1 1 1 1
2: 1 0 1 1 1 3
3: 1 0 0 1 1 2
4: 1 1 0 0 1 1
5: 1 0 1 0 0 1
6: 1 1 0 1 0 1
Can someone please tell me if I have done this correctly? Have I correctly used the "shift()" function in data.table?
shift is being used correctly, but it's hard to tell what is going on once we get to my_grid$counts = as.integer(rnorm(8,10,5)).
One thing, though. The table should be filtered on !is.na(prev_3_exam) instead of !is.na(prev_exam).
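A minimal sketch of that correction, reusing the aggregation from the question:
out <- my_data[!is.na(prev_3_exam), .(tally = .N), by = .(id, results, prev_exam, prev_2_exam, prev_3_exam)]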
Here is a function that uses a similar approach to return out and my_grid in a list for the lag specified as a parameter. It uses data.table grouping rather than a for loop.
f1 <- function(dt, lag = 1L) {
if (!identical(key(dt), c("id", "date_exam_taken"))) setkey(dt, id, date_exam_taken)
nms <- paste0("exam", 1:lag)
list(
out = dt2 <- copy(dt)[
,(nms) := shift(results, 1:lag) # see Frank's comment
][
id == shift(id, lag)
][
, .(tally = .N), by = c("id", "results", nms)
],
my_grid = setorderv(
dt2[
, {
counts <- sum(tally)
.(
counts = counts,
probability = sum(results*tally)/counts
)
}, nms
], rev(nms)
)
)
}
Output:
f1(dt, 3L)
#> $out
#> id results exam1 exam2 exam3 tally
#> 1: 1 0 0 1 1 1
#> 2: 1 1 0 0 1 1
#> 3: 1 0 1 0 0 1
#> 4: 1 1 0 1 0 2
#> 5: 1 1 1 0 1 2
#> ---
#> 57437: 10000 1 0 0 0 1
#> 57438: 10000 0 1 0 0 1
#> 57439: 10000 1 0 1 0 1
#> 57440: 10000 0 1 0 1 1
#> 57441: 10000 0 0 1 0 1
#>
#> $my_grid
#> exam1 exam2 exam3 counts probability
#> 1: 0 0 0 8836 0.4980761
#> 2: 1 0 0 8832 0.5005661
#> 3: 0 1 0 8684 0.4947029
#> 4: 1 1 0 8770 0.4976055
#> 5: 0 0 1 8792 0.5013649
#> 6: 1 0 1 8631 0.5070096
#> 7: 0 1 1 8806 0.5021576
#> 8: 1 1 1 8682 0.4997696
If only my_grid is needed, here is a function wrapping an Rcpp function that uses bit shifting to perform the aggregation in a single-pass for loop without creating the helper columns with shift. It will be very fast, and its speed will be only marginally affected by the value of lag.
Rcpp::cppFunction("
IntegerVector exam_contingency(const IntegerVector& id, const IntegerVector& result, const int& lag) {
const int n = id.size();
const int lag1 = lag + 1;
int comb = result(0);
int mask = ~(1 << lag1);
IntegerVector out(pow(2, lag1));
for (int i = 1; i < lag1; i++) comb = (comb << 1) + result(i);
out(comb) = id(lag) == id(0);
for (int i = lag1; i < n; i++) {
comb = ((comb << 1) + result(i)) & mask;
out(comb) += id(i - lag) == id(i);
}
return(out);
}
")
f2 <- function(dt, lag = 1L) {
if (!identical(key(dt), c("id", "date_exam_taken"))) setkey(dt, id, date_exam_taken)
m <- matrix(
exam_contingency(dt$id, dt$results, as.integer(lag)),
2^lag, 2, 1
)
rs <- rowSums(m)
cbind(
if (lag == 1L) {
data.frame(exam1 = 0:1)
} else {
setNames(
expand.grid(rep(list(0:1), lag)),
paste0("exam", 1:lag)
)
},
data.frame(counts = rs, probability = m[,2]/rs)
)
}
It gives the same output as f1's my_grid:
all.equal(f1(dt, 3L)$my_grid, setDT(f2(dt, 3L)))
#> [1] TRUE
Benchmarking:
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> f1(dt, 3) 8802.2 9706.70 11478.458 10394.00 12134.40 69630.0 100
#> f2(dt, 3) 971.2 1016.40 1179.404 1047.20 1108.65 7733.8 100
#> f2(dt, 10) 1181.3 1208.05 1256.333 1237.65 1302.40 1406.6 100
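The benchmark call itself isn't shown; it was presumably something along these lines (microbenchmark package assumed):
library(microbenchmark)
microbenchmark(f1(dt, 3L), f2(dt, 3L), f2(dt, 10L))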
Data:
library(data.table)
(seed <- sample(.Machine$integer.max, 1))
#> [1] 1784920766
set.seed(seed)
dt <- data.table(
id = sample.int(10000, 100000, replace = TRUE),
results = sample(0:1, 100000, replace = TRUE),
date_exam_taken = sample(seq(as.Date('1999/01/01'), as.Date('2020/01/01'), by="day"), 100000, replace = TRUE)
)
setkey(dt, id, date_exam_taken)
I want to find all possible combinations (without replacement) of a big sparse matrix, where every combination can choose at most one entry from each row and each column. My goal is to find the combination that maximizes the sum of the chosen entries.
Say I have the following matrix:
6 8 . .
. 5 7 .
. 6 . 9
There are 4 possible combinations (in terms of i and j): [(1,1),(2,2),(3,4)], [(1,1),(2,3),(3,2)], [(1,1),(2,3),(3,4)], [(1,2),(2,3),(3,4)]
My outcome should be the sum of entries for each possible combination, where my final goal is to find the combination that maximizes this outcome ([(1,2),(2,3),(3,4)] = 8 + 7 + 9 = 24 in this example).
Edit: here is the full code that generates the sparse matrix of which I want to find the optimal combination:
library(data.table)
library(ggplot2)
library(haven)
library(Matrix)
library(evd)
set.seed(12345)
N1 <- 100
M <- 100
I1 <- 10
I2 <- 2
I <- I1 * I2
N <- N1 * I2
J <- 5
p_c_A = 0.02
p_c_B = 0.01
p_0 = 0.05
p_1 = 0.2
dt_workers<- data.table(worker_id = 1:N,
firm_id = sample.int(M, N, replace = TRUE),
worker_type = sample.int(I1, N, replace = TRUE))
dt_workers[, worker_ethnicity := 1 * (worker_id > N1)]
dt_firms <- data.table(firm_id = 1:M,
firm_type = sample(J) )
sys_util <- matrix(NA, nrow=I1, ncol=J)
for (i in 1:dim(sys_util)[1]) {
  for (j in 1:dim(sys_util)[2]) {
    sys_util[i, j] <- i * j
  }
}
# joint surplus
con_A <- matrix(data = runif(N1 * M), nrow = N1, ncol = M)
con_B <- matrix(data = runif(N1 * M), nrow = N1, ncol = M)
con_A <- 1 * (con_A < p_c_A)
con_B <- 1 * (con_B < p_c_B)
p_meet_A <- con_A * p_1 + (1 - con_A) * p_0
p_meet_B <- con_B * p_1 + (1 - con_B) * p_0
meet_A <- matrix(data = runif(N1 * M), nrow = N1, ncol = M)
meet_B <- matrix(data = runif(N1 * M), nrow = N1, ncol = M)
meet_A <- 1* ( meet_A < p_meet_A )
meet_B <- 1* ( meet_B < p_meet_B )
meet <- rbind(meet_A,meet_B)
meet_sparse <- Matrix(meet, sparse = TRUE)
util <- which (meet_sparse>0, arr.ind=T)
n_draws <- dim(util)[1]
mu = 0
sigma = 10
idio = rgumbel(n=n_draws, loc=mu, scale=sigma)
util <- cbind(util,idio)
sys <- vector()
for(k in 1:dim(util)[1]){
g <- util[k,1]
f <- util[k,2]
i <- dt_workers[g, worker_type]
j <- dt_firms[f, firm_type]
sys[k] = sys_util[i,j]
}
util <- cbind(util,sys)
total_util = util[,3] + util[,4]
M <- sparseMatrix(
i = util[,1],
j = util[,2],
x = total_util
)
dat <- as.data.frame(summary(M))
dat <-dat[order(dat$i, dat$j),]
rownames(dat) <- NULL
library(Matrix)
M <- sparseMatrix(
i = c(1, 1, 2, 2, 3, 3),
j = c(1, 2, 2, 3, 2, 4),
x = c(6, 8, 5, 7, 6, 9)
)
#> 3 x 4 sparse Matrix of class "dgCMatrix"
#>
#> [1,] 6 8 . .
#> [2,] . 5 7 .
#> [3,] . 6 . 9
dat <- as.data.frame(summary(M))
#> i j x
#> 1 1 1 6
#> 2 1 2 8
#> 3 2 2 5
#> 4 3 2 6
#> 5 2 3 7
#> 6 3 4 9
row_indices <- unique(dat$i)
col_indices <- split(dat$j, dat$i)
#> $`1`
#> [1] 1 2
#>
#> $`2`
#> [1] 2 3
#>
#> $`3`
#> [1] 2 4
all_combinations_with_atmost_one_per_row <- do.call(expand.grid, col_indices)
#> 1 2 3
#> 1 1 2 2
#> 2 2 2 2
#> 3 1 3 2
#> 4 2 3 2
#> 5 1 2 4
#> 6 2 2 4
#> 7 1 3 4
#> 8 2 3 4
more_than_one_per_col <- apply(all_combinations_with_atmost_one_per_row, MARGIN = 1, anyDuplicated)
#> [1] 3 2 0 3 0 2 0 0
combinations <- all_combinations_with_atmost_one_per_row[!more_than_one_per_col, , drop = FALSE]
#> 1 2 3
#> 3 1 3 2
#> 5 1 2 4
#> 7 1 3 4
#> 8 2 3 4
(res <- lapply(
split(combinations, 1:nrow(combinations)),
function(cols) {
elements <- data.frame(i = row_indices, j = unlist(cols))
elements$value <- M[as.matrix(elements)]
list(elements = elements, sum = sum(elements$value))
}
))
#> $`1`
#> $`1`$elements
#> i j value
#> 1 1 1 6
#> 2 2 3 7
#> 3 3 2 6
#>
#> $`1`$sum
#> [1] 19
#>
#>
#> $`2`
#> $`2`$elements
#> i j value
#> 1 1 1 6
#> 2 2 2 5
#> 3 3 4 9
#>
#> $`2`$sum
#> [1] 20
#>
#>
#> $`3`
#> $`3`$elements
#> i j value
#> 1 1 1 6
#> 2 2 3 7
#> 3 3 4 9
#>
#> $`3`$sum
#> [1] 22
#>
#>
#> $`4`
#> $`4`$elements
#> i j value
#> 1 1 2 8
#> 2 2 3 7
#> 3 3 4 9
#>
#> $`4`$sum
#> [1] 24
Created on 2019-04-10 by the reprex package (v0.2.1)
And the best combination is found with res[[which.max(sapply(res, `[[`, "sum"))]]:
$elements
i j value
1 1 2 8
2 2 3 7
3 3 4 9
$sum
[1] 24
I found a solution using linear programming with the help of Aurèle:
library(lpSolve)
# one constraint per worker: each row of the matrix is used at most once
f.con <- matrix(, nrow = dim(dat)[1], ncol = 0)
for (k in 1:N) {
  vec <- 1 * (dat[, 1] == k)
  f.con <- cbind(f.con, vec)
}
# one constraint per firm: each column of the matrix is used at most once
for (k in 1:M) {
  vec <- 1 * (dat[, 2] == k)
  f.con <- cbind(f.con, vec)
}
f.con <- t(f.con)
f.obj <- dat[, 3]
f.dir <- rep("<=", dim(f.con)[1])
f.rhs <- rep(1, dim(f.con)[1])
res <- lp(direction = "max", f.obj, f.con, f.dir, f.rhs, all.int = TRUE)$solution
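As a small follow-up sketch (not part of the original answer): with all.int = TRUE, lp() returns a 0/1 vector over the rows of dat, so the chosen entries and the maximized total can be recovered with:
chosen <- dat[res == 1, ]  # the selected (i, j, x) triples
sum(chosen$x)              # the maximized sum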
There might be a simple solution to this but I'm struggling.
I have a code as follows:
for (i in 1:nrow(df)) {
  x[i] <- df[i, ]$X4
  if (length(unique(df[1:i, ]$X4)) == length(unique(df$X4))) {
    break
  }
  collect <- data.frame(df[1, ]$X1, df[1, ]$X2, df[i + 1, ]$X3)
}
The loop breaks once the condition length(unique(df[1:i,]$X4)) == length(unique(df$X4)) is met. However, I want to restart the same loop from the (i+1)'th iteration and keep checking until the same condition is met again, repeating until the end of my data frame.
My sample data is as follows:
1 930000 1000000 E2-A
1 1890000 2110000 E2-A
1 2120000 2330000 D
1 2340000 3350000 E2-B
1 3365000 3405000 B
1 5695000 5810000 E2-A
1 6305000 6405000 E2-B
1 6425000 6465000 E1-A
1 6780000 6960000 E2-B
1 7100000 7270000 D
1 7730000 7810000 D
1 8030000 8380000 E2-A
1 8970000 9170000 E1-A
1 9345000 9555000 E1-B
1 9845000 9930000 E1-A
1 10000000 10100000 E1-B
1 10430000 10560000 E3
1 11720000 11780000 B
1 11900000 11960000 C
1 12185000 12270000 E1-A
1 12450000 12680000 A #break point of loop because if(length(unique(df[1:i,]$X4)) == length(unique(df$X4)))
1 13990000 14290000 B
1 15250000 15355000 E2-B
1 15475000 15600000 D
1 15655000 15755000 A
1 15920000 16080000 E2-A
1 16120000 16280000 C
1 16400000 16570000 E1-B
1 17280000 17380000 E1-B
1 17450000 17735000 A
1 17760000 17820000 E1-B
1 17825000 17935000 A
1 18925000 19150000 E1-A
1 19220000 19410000 C
1 19680000 19980000 C
1 20230000 20820000 E3 #the if condition is met again after the break, but using break exits the loop
1 20845000 20970000 E2-A
1 21580000 21695000 D
1 21700000 21920000 E2-A
1 22430000 22750000 B
1 22740000 22980000 A
1 23300000 23515000 C
1 23870000 23965000 A
1 24525000 24720000 E2-B
1 25010000 25160000 D
1 25170000 25430000 B
1 25930000 26130000 A
1 26220000 26330000 E2-B
1 26435000 26485000 C
My expected output is:
1 930000 12680000
1 13990000 20820000
But what I get so far is:
1 930000 12680000
How do I do so?
# I saved the data you provided into a file and read them back into R session
df <- read.table("df.txt",quote="#")
# It looks like you are using X1, X2, and so on in your code example
# so I renamed the column names
names(df) <- c("X1","X2","X3","X4")
# check the structure of the data frame
str(df)
# 'data.frame': 49 obs. of 4 variables:
# $ X1: int 1 1 1 1 1 1 1 1 1 1 ...
# $ X2: int 930000 1890000 2120000 2340000 3365000 5695000 6305000 6425000 6780000 7100000 ...
# $ X3: int 1000000 2110000 2330000 3350000 3405000 5810000 6405000 6465000 6960000 7270000 ...
# $ X4: Factor w/ 9 levels "A","B","C","D",..: 7 7 4 8 2 7 8 5 8 4 ...
result <- list()
i.new = 1
j = 0
# number of unique values in the 4th column
n.unique <- length(unique(df$X4))
for ( i in seq(nrow(df) )) {
if(length(unique(df[i.new : i,"X4" ])) == n.unique ){
j = j+1
result[[j]] <- c( df[i.new, 2], df[i, 3])
i.new = i + 1
}
}
result
# [[1]]
# [1] 930000 12680000
#
# [[2]]
# [1] 13990000 20820000
# If Dataframe is needed:
do.call(rbind.data.frame, result)
#c.930000L..13990000L. c.12680000L..20820000L.
#1 930000 12680000
#2 13990000 20820000
#If matrix is OK
matrix(unlist(result, use.names=F), ncol = 2, byrow = TRUE)
# [,1] [,2]
#[1,] 930000 12680000
#[2,] 13990000 20820000
ulist <- unique(df$X4)
uniq <- length(unique(df$X4))
brk <- 0
rn <- length(df$X1)
coor <- NULL
while (brk < rn) {
  found <- rep(0, uniq)
  coor.s <- df$X2[brk + 1]   # window starts right after the last break point
  coor.e <- 0
  for (i in (brk + 1):rn) {
    # mark each level of X4 seen so far in this window
    for (j in 1:uniq) {
      if (df$X4[i] == ulist[j]) {found[j] <- 1}
    }
    # all levels seen: record the window end and resume after it
    if (sum(found) == uniq) {coor.e <- df$X3[i]; brk <- i; break}
  }
  if (sum(found) < uniq) {
    break
  } else {
    # note: the original overwrote collect.df on each pass instead of
    # accumulating; rbind-ing onto coor keeps every completed window
    coor <- rbind(coor, c(coor.s, coor.e))
  }
}
collect.df <- as.data.frame(coor)
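With the question's data frame, collect.df should then hold both windows, matching the expected output above:
collect.df
#         V1       V2
# 1   930000 12680000
# 2 13990000 20820000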
I have a data frame with two kinds of values, and I'd like to slice it into groups. Each group must satisfy two conditions:
Condition 1: max cumulative value of w <= 75
Condition 2: max cumulative value of n <= 15
If either criterion reaches its max cumulative value, the cumulative sums should reset
and start over again for both.
id<- sample(1:33)
w <- c(2,1,32,5,1,1,12,1,2,32,32,32,1,3,2,12,1,1,1,1,1,1,5,3,5,1,1,1,2,7,2,32,1)
n <- c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)
df <- data.frame(id, w, n)
The expected result (made manually):
w cumsum_w n cumsum_n group
2 2 1 1 1
1 3 1 2 1
32 35 1 3 1
5 40 1 4 1
1 41 1 5 1
1 42 1 6 1
12 54 1 7 1
1 55 1 8 1
2 57 1 9 1
32 32 1 2 2
32 64 1 3 2
32 32 1 1 3
1 33 1 2 3
3 36 1 3 3
2 38 1 4 3
12 50 1 5 3
1 51 1 6 3
1 52 1 7 3
1 53 1 8 3
1 54 1 9 3
1 55 1 10 3
1 56 1 11 3
5 61 1 12 3
3 64 1 13 3
5 69 1 14 3
1 70 1 15 3
1 1 1 1 4
1 2 1 2 4
2 4 1 3 4
7 11 1 4 4
2 13 1 5 4
32 45 1 6 4
1 46 1 7 4
I tried a couple of methods:
Method 1
library(BBmisc)
chunk(df, chunk.size = 75, n.chunks = 15)
Error in chunk(df, chunk.size = 75, n.chunks = 15) :
You must provide exactly one of 'chunk.size', 'n.chunks' or 'props'
Method 2
cumsum_with_reset_group <- function(w, n, threshold_w, threshold_n) {
cumsum_w <- 0
cumsum_n <- 0
group <- 1
result <- numeric()
for (i in 1:length(w)) {
cumsum_w <- cumsum_w + w[i]
cumsum_n <- cumsum_n + n[i]
if (cumsum_w > threshold_w | cumsum_n > threshold_n) {
group <- group + 1
cumsum_w <- cumsum_w + w[i]
cumsum_n <- cumsum_n + n[i]
}
result = c(result, group)
}
return (result)
}
# cumsum with reset
cumsum_w_with_reset <- function(w, threshold_w) {
cumsum_w <- 0
group <- 1
result <- numeric()
for (i in 1:length(w)) {
cumsum_w <- cumsum_w + w[i]
if (cumsum_w > threshold_w) {
group <- group + 1
cumsum_w <- w[i]
}
result = c(result, cumsum_w)
}
return (result)
}
# cumsum with reset
cumsum_n_with_reset <- function(n, threshold_n) {
cumsum_n <- 0
group <- 1
result <- numeric()
for (i in 1:length(n)) {
cumsum_n <- cumsum_n + n[i]
if (cumsum_n > threshold_n | cumsum_w > threshold_w) {
group <- group + 1
cumsum_n <- n[i]
}
result = c(result, cumsum_n)
}
return (result)
}
# use functions above as window functions inside mutate statement
y<-df %>% group_by() %>%
mutate(
cumsum_w = cumsum_w_with_reset(w, 75),
cumsum_n =cumsum_n_with_reset(n, 15),
group = cumsum_with_reset_group(w, n, 75, 15)
) %>%
ungroup()
Error in mutate_impl(.data, dots) :
Evaluation error: object 'cumsum_w' not found
Thanks!
Here is a hack, done by repeated subsetting and binding; as such, it will be very slow with large data sets. It takes the whole data set as input.
library(dplyr)
cumsumdf <- function(df) {
  # integer division flags rows past the thresholds (75 for w, 15 for n);
  # modulo gives the running sums within the current chunk
  cumsum_75 <- function(x) {cumsum(x) %/% 76}
  cumsum_15 <- function(x) {cumsum(x) %/% 16}
  cumsum_w75 <- function(x) {cumsum(x) %% 76}
  cumsum_n15 <- function(x) {cumsum(x) %% 16}
  m <- nrow(df)
  df$grp <- 0
  df <- df %>%
    group_by(grp) %>%
    mutate(cumsum_w = numeric(m), cumsum_n = numeric(m))
  n <- 0
  df2 <- df[0, ]
  # peel off the leading rows that fit within both thresholds,
  # label them as the next group, and repeat on the remainder
  while (nrow(df) > 0) {
    df$cumsum_w <- cumsum_75(df$w)
    df$cumsum_n <- cumsum_15(df$n)
    n <- n + 1
    df1 <- df[df$cumsum_n == 0 & df$cumsum_w == 0, ]
    df <- df[df$cumsum_n != 0 | df$cumsum_w != 0, ]
    df1$grp <- n
    df1 <- df1 %>% group_by(grp) %>%
      mutate(cumsum_w = cumsum_w75(w), cumsum_n = cumsum_n15(n))
    df2 <- rbind(df2, df1)
  }
  return(df2)
}
cumsumdf(df)
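For contrast, here is a minimal single-pass sketch of the reset logic itself (my own sketch, not the hack above), with the thresholds 75 and 15 from the question:
cumsum_reset <- function(w, n, max_w = 75, max_n = 15) {
  cw <- 0; cn <- 0; g <- 1
  group <- cumsum_w <- cumsum_n <- numeric(length(w))
  for (i in seq_along(w)) {
    # start a new group whenever either running total would exceed its cap
    if (cw + w[i] > max_w || cn + n[i] > max_n) {
      g <- g + 1; cw <- 0; cn <- 0
    }
    cw <- cw + w[i]; cn <- cn + n[i]
    group[i] <- g; cumsum_w[i] <- cw; cumsum_n[i] <- cn
  }
  data.frame(w = w, cumsum_w = cumsum_w, n = n, cumsum_n = cumsum_n, group = group)
}
cumsum_reset(df$w, df$n)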
I have a list in which every member has the same structure, as follows:
config <- NULL
config[["secA"]] <- NULL
config[["secA"]]$VAL <- 0
config[["secA"]]$ARR <- c(1,2,3,4,5)
config[["secA"]]$DF <- data.frame(matrix(c(1,5,3,8),2,2))
config[["secB"]] <- NULL
config[["secB"]]$VAL <- 1
config[["secB"]]$ARR <- c(1,3,2,4,9)
config[["secB"]]$DF <- data.frame(matrix(c(2,6,1,9),2,2))
config[["secC"]] <- NULL
config[["secC"]]$VAL <- 5
config[["secC"]]$ARR <- c(4,2,1,5,8)
config[["secC"]]$DF <- data.frame(matrix(c(4,2,1,7),2,2))
and I need to obtain 3 vectors, VAL, ARR and DF, each containing the concatenated elements of the corresponding member, such as:
# VAL: 0,1,5
# ARR: 1,2,3,4,5,1,3,2,4,9,4,2,1,5,8
# DF: 1,5,3,8,2,6,1,9,4,2,1,7
Looking at similar situations, I have the feeling I need to use a combination of do.call and cbind or lapply, but I have no clue. Any suggestions?
sapply(names(config[[1]]), function(x)
unname(unlist(sapply(config, `[`, x))), USE.NAMES = TRUE)
# $VAL
# [1] 0 1 5
#
# $ARR
# [1] 1 2 3 4 5 1 3 2 4 9 4 2 1 5 8
#
# $DF
# [1] 1 5 3 8 2 6 1 9 4 2 1 7
Or you can use this clist function (defined at the end of this answer).
(l <- Reduce(clist, config))
# $VAL
# [1] 0 1 5
#
# $ARR
# [1] 1 2 3 4 5 1 3 2 4 9 4 2 1 5 8
#
# $DF
# X1 X2 X1 X2 X1 X2
# 1 1 3 2 1 4 1
# 2 5 8 6 9 2 7
It merges data frames and matrices, so you need to unlist to get the vector you want
l$DF <- unname(unlist(l$DF))
l
# $VAL
# [1] 0 1 5
#
# $ARR
# [1] 1 2 3 4 5 1 3 2 4 9 4 2 1 5 8
#
# $DF
# [1] 1 5 3 8 2 6 1 9 4 2 1 7
Function
clist <- function (x, y) {
islist <- function(x) inherits(x, 'list')
'%||%' <- function(a, b) if (!is.null(a)) a else b
get_fun <- function(x, y)
switch(class(x %||% y),
matrix = cbind,
data.frame = function(x, y)
do.call('cbind.data.frame', Filter(Negate(is.null), list(x, y))),
factor = function(...) unlist(list(...)), c)
stopifnot(islist(x), islist(y))
nn <- names(rapply(c(x, y), names, how = 'list'))
if (is.null(nn) || any(!nzchar(nn)))
stop('All non-NULL list elements should have unique names', domain = NA)
nn <- unique(c(names(x), names(y)))
z <- setNames(vector('list', length(nn)), nn)
for (ii in nn)
z[[ii]] <- if (islist(x[[ii]]) && islist(y[[ii]]))
Recall(x[[ii]], y[[ii]]) else
(get_fun(x[[ii]], y[[ii]]))(x[[ii]], y[[ii]])
z
}
Another approach, with slightly less code.
un_config <- unlist(config)
un_configNAM <- names(un_config)
vecNAM <- c("VAL", "ARR", "DF")
for(n in vecNAM){
assign(n, un_config[grepl(n, un_configNAM)])
}
This will return 3 vectors as the OP requested. However, it is generally more advantageous to store results in a list, as rawr suggests. You can of course adapt the above code so that the results are stored in a list.
l <- rep(list(NA), length(vecNAM))
i <- 1
for (n in vecNAM) {
  l[[i]] <- un_config[grepl(n, un_configNAM)]
  i <- i + 1
}
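Equivalently, the counter can be dropped by building the list in one lapply call (a minor variant of the same idea):
l <- setNames(lapply(vecNAM, function(n) un_config[grepl(n, un_configNAM)]), vecNAM)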