Group-wise subsetting where feasible

I would like to subset rows of my data
library(data.table); set.seed(333); n <- 100
dat <- data.table(id=1:n, group=rep(1:2,each=n/2), x=runif(n,100,120), y=runif(n,200,220), z=runif(n,300,320))
> head(dat)
id group x y z
1: 1 1 109.3400 208.6732 308.7595
2: 2 1 101.6920 201.0989 310.1080
3: 3 1 119.4697 217.8550 313.9384
4: 4 1 111.4261 205.2945 317.3651
5: 5 1 100.4024 212.2826 305.1375
6: 6 1 114.4711 203.6988 319.4913
in several stages within each group. I need to automate this and it might happen that the subset is empty. For example, focusing only on group 1,
dat1 <- dat[1:50]
> s <- subset(dat1, x > 119)
> s
id group x y z
1: 3 1 119.4697 217.8550 313.9384
2: 50 1 119.2519 214.2517 318.8567
the second step subset(s, y>219) would come up empty, but I would still want to apply the third step subset(s, z>315). If I were to set the thresholds manually, Frank has provided an excellent solution here that outputs
> f(dat1, x>119, y>219, z>315)
cond skip
1: x > 119 FALSE
2: y > 219 TRUE
3: z > 315 FALSE
id group x y z
1: 50 1 119.2519 214.2517 318.8567
and reports which parts were skipped.
My problem is that I need to apply this to different groups simultaneously, where the thresholds for each group are given in a separate data.table. The goal is to have at least one id per group. For example, if my thresholds were
c <- data.table(group=1:2, x=c(119,119), y=c(219,219), z=c(315,319))
> c
group x y z
1: 1 119 219 315
2: 2 119 219 319
I would like to end up with
> res
id group x y z
1: 50 1 119.2519 214.2517 318.8567
2: 55 2 119.2634 219.0044 315.6556
I could apply Frank's function repeatedly within a for-loop but I am sure there are cleverer ways that save time. I wonder, for instance, whether the function can be applied to each group within data.table. Or perhaps there is a way within the tidyverse, which I am not really familiar with yet.
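For reference, the repeated application I have in mind would be something like this (just a sketch, assuming f() is defined exactly as in Frank's linked answer, with each group's thresholds typed in by hand):
# call f() once per group with that group's cut-offs, then stack the results
res <- rbind(f(dat[group == 1], x > 119, y > 219, z > 315),
             f(dat[group == 2], x > 119, y > 219, z > 319))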

Another possible approach using standard evaluation:
#convert conditions into long format, storing the comparison operator in the data.table as well
cond <- data.table(group=1:2, bop=c(`>`, `>`), x=c(119,119), y=c(219,219), z=c(315,319))
thres <- melt(cond, id.vars=c("group","bop"))

#convert data into long format and look up the operator and threshold for each variable
mdat <- melt(dat, id.vars=c("id", "group"))[
    thres, on=.(group, variable), c("bop","thres") := mget(c("bop","i.value"))]

#apply filtering
ss <- mdat[mapply(function(f, x, y) f(x, y), bop, value, thres)]

#apply sequential subsetting: within each group, walk over the variables in order
#and only narrow the set of ids when the intersection is non-empty
dat[id %in% ss[, {
        idx <- id
        ans <- .SD[, {
            x <- intersect(idx, id)
            if (length(x) > 0) {
                idx <- x
            }
            idx
        }, .(variable)]
        ans[variable == last(variable), V1]
    }, .(group)]$V1
]
output:
id group x y z
1: 50 1 119.2519 214.2517 318.8567
2: 55 2 119.2634 219.0044 315.6556
3: 58 2 119.2211 214.0305 319.3097
4: 72 2 114.0802 217.7402 313.3655
5: 90 2 116.8115 215.1576 317.0261
6: 99 2 119.2964 212.9973 308.9360
data:
library(data.table)
set.seed(333)
n <- 100
dat <- data.table(id=1:n, group=rep(1:2,each=n/2),
x=runif(n,100,120), y=runif(n,200,220), z=runif(n,300,320))

Related

Group-wise conditional subsetting where feasible

I would like to subset rows of my data
library(data.table); set.seed(333); n <- 100
dat <- data.table(id=1:n, group=rep(1:2,each=n/2), x=runif(n,100,120), y=runif(n,200,220), z=runif(n,300,320))
> head(dat)
id group x y z
1: 1 1 109.3400 208.6732 308.7595
2: 2 1 101.6920 201.0989 310.1080
3: 3 1 119.4697 217.8550 313.9384
4: 4 1 111.4261 205.2945 317.3651
5: 5 1 100.4024 212.2826 305.1375
6: 6 1 114.4711 203.6988 319.4913
in several stages, unless it results in an empty subset. In this case, I would like to skip that specific subsetting. In an earlier question, Frank has found a great solution for this:
f = function(x, ..., verbose=FALSE){
  L = substitute(list(...))[-1]
  mon = data.table(cond = as.character(L))[, skip := FALSE]
  for (i in seq_along(L)){
    d = eval( substitute(x[cond, verbose=v], list(cond = L[[i]], v = verbose)) )
    if (nrow(d)){
      x = d
    } else {
      mon[i, skip := TRUE]
    }
  }
  print(mon)
  return(x)
}
where I can enter the data, and the cut-offs for each variable manually.
> f(dat, x > 119, y > 219, y > 1e6)
cond skip
1: x > 119 FALSE
2: y > 219 FALSE
3: y > 1e+06 TRUE
id group x y z
1: 55 2 119.2634 219.0044 315.6556
I now wonder how this (or something even better!) could be applied to a case where the cut-offs are in a second data.table
c <- data.table(group=1:2, x=c(110,119), y=c(210,219), z=c(310,319))
> c
group x y z
1: 1 110 210 310
2: 2 119 219 319
and specified for each group separately.
If I were to use f(.), I thought of joining c into dat, but I can't figure it out. But perhaps there is a smarter way entirely.
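Roughly, the join I had in mind looks like this (a sketch only; even if it runs, f() would then skip conditions globally rather than per group, which is the part I can't solve):
cuts <- setnames(copy(c), c("x","y","z"), c("x_cut","y_cut","z_cut"))
dat2 <- merge(dat, cuts, by = "group")   # bring each group's cut-offs in as columns
f(dat2, x > x_cut, y > y_cut, z > z_cut)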
First, I would change how c is constructed. You currently have it set up with one column per filter, but a long format allows multiple filters on the same column, similar to your initial example (i.e. two filters on y):
c <- data.table(group=c(1,2,1,2,1,2,1),variable = c("x","x","y","y","z","z","y"), c_val = c(110,119,210,219,310,319,1e6))
c[, c_id := 1:.N]
c
group variable c_val c_id
1: 1 x 110 1
2: 2 x 119 2
3: 1 y 210 3
4: 2 y 219 4
5: 1 z 310 5
6: 2 z 319 6
7: 1 y 1000000 7
You can then merge your filters onto your data:
dat_cut <- melt(dat, id.vars = c("id", "group"), value.name = "dat_val")
output <- merge(dat_cut, c, by = c("group","variable"), allow.cartesian = TRUE)
The next line then tests the filters. You can extend it if you want richer filter logic (greater than, less than, equal, etc.), and you can encode that logic back into c:
output <- output[dat_val > c_val]
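For example, a minimal sketch of that idea (the op column and the get() dispatch are assumptions added here, not something the question requires):
c[, op := ">"]                                   # e.g. ">" or "<" per filter row
output <- merge(dat_cut, c, by = c("group", "variable"), allow.cartesian = TRUE)
output <- output[mapply(function(o, a, b) get(o)(a, b), op, dat_val, c_val)]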
You then want to keep any id whose number of matched filters equals the total number of distinct filters that matched anything within that group:
output[,req_match := uniqueN(c_id), by = .(group)] # number of filters where a condition was met.
selection <- output[,.N,by = .(id, group, req_match)][N == req_match, id]
If a filter did not match any rows, it will be excluded here.
Then you can filter your initial dataset for the solution:
dat[id %in% selection]
id group x y z
1: 3 1 119.4697 217.8550 313.9384
2: 18 1 117.2930 216.5670 310.4617
3: 35 1 110.4283 218.6130 312.0904
4: 50 1 119.2519 214.2517 318.8567

Find average values of a column in terms of date range of another column in R

I have two data frames that look like this:
> head(y,n=4)
Source: local data frame [6 x 3]
Start Date End Date Length
1 2006-06-08 2006-06-10 3
2 2006-06-12 2006-06-14 3
3 2006-06-18 2006-06-21 4
4 2006-06-24 2006-06-25 2
and
> head(x,n=19)
Date Group.Size
413 2006-06-07 6
414 2006-06-08 3
415 2006-06-09 1
416 2006-06-10 3
417 2006-06-11 15
418 2006-06-12 12
419 2006-06-13 NA
420 2006-06-14 4
421 2006-06-15 8
422 2006-06-16 3
423 2006-06-17 1
424 2006-06-18 3
425 2006-06-19 10
426 2006-06-20 2
427 2006-06-21 7
428 2006-06-22 6
429 2006-06-23 2
430 2006-06-24 1
431 2006-06-25 0
I'm looking for a way to add a new column in data frame y that will show the average Group.Size of data frame x (rounded to nearest integer), depending on the given Start Date and End Dates provided in y.
For example, in the first row of y, I have 6/8/06 to 6/10/06. This is a length of 3 days, so I would want the new column to have the number 2, because the corresponding Group.Size values are 3, 1, and 3 for the respective days in data frame x (mean=2.33, rounded to nearest integer is 2).
If there is an NA in my dataframe x, I'd like to consider it a 0.
There are multiple steps involved in this task, and there is probably a straightforward approach... I am relatively new to R, and am having a hard time breaking it down. Please let me know if I should clarify my example.
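In case it helps, here is a minimal reconstruction of the two data frames from the printouts above, with the dates stored as class Date (I use Start_Date/End_Date as column names here; the answers below vary slightly in naming):
y <- data.frame(
  Start_Date = as.Date(c("2006-06-08", "2006-06-12", "2006-06-18", "2006-06-24")),
  End_Date   = as.Date(c("2006-06-10", "2006-06-14", "2006-06-21", "2006-06-25")),
  Length     = c(3L, 3L, 4L, 2L))
x <- data.frame(
  Date       = seq(as.Date("2006-06-07"), as.Date("2006-06-25"), by = "day"),
  Group.Size = c(6, 3, 1, 3, 15, 12, NA, 4, 8, 3, 1, 3, 10, 2, 7, 6, 2, 1, 0))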
Assuming that x$Date, y$StartDate, and y$EndDate are of class Date (or character), the following apply approach should do the trick:
y$AvGroupSize <- apply(y, 1, function(z) {
  round(mean(x$Group.Size[which(x$Date >= z[1] & x$Date <= z[2])], na.rm=T), 0)
})
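Note that na.rm=T drops the NA days rather than counting them as 0, as the question asks (for the second row this gives round(mean(c(12, 4))) = 8 instead of round(mean(c(12, 0, 4))) = 5). A small variant of the same apply approach that zero-fills first (same assumptions about the date columns):
y$AvGroupSize <- apply(y, 1, function(z) {
  gs <- x$Group.Size[x$Date >= z[1] & x$Date <= z[2]]
  gs[is.na(gs)] <- 0               # treat missing Group.Size as 0
  round(mean(gs), 0)
})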
#Replace missing values in x with 0
x[is.na(x)] <- 0
#Create new 'Group' variable and loop through x to create groups
x$Group <- 1
j <- 1
for(i in 1:nrow(x)){
  if(x[i, "Date"] == y[j, "StartDate"]){
    x[i, "Group"] <- j + 1
    if(j < nrow(y)){
      j <- j + 1
    } else {
      j <- j
    }
  } else if(i > 1){
    x[i, "Group"] <- x[i-1, "Group"]
  } else {
    x[i, "Group"] <- 1
  }
}
#Use tapply function to get the rounded mean of each Group
tapply(x$Group.Size, x$Group, function(z) round(mean(z)))
Here is a different dplyr solution
library(dplyr)
na2zero <- function(x) ifelse(is.na(x),0,x) # Convert NA to zero
ydf %>%
  group_by(Start_Date, End_Date) %>%
  mutate(avg = round(mean(na2zero(xdf$Group.Size[between(xdf$Date, Start_Date, End_Date)])), 0)) %>%
  ungroup
## Start_Date End_Date Length avg
## (time) (time) (int) (dbl)
## 1 2006-06-08 2006-06-10 3 2
## 2 2006-06-12 2006-06-14 3 5
## 3 2006-06-18 2006-06-21 4 6
## 4 2006-06-24 2006-06-25 2 0
This is a solution that applies over the rows of the data frame y:
library(dplyr)
get_mean_size <- function(start, end, length) {
  s <- sum(filter(x, Date >= start, Date <= end)$Group.Size, na.rm = TRUE)
  round(s / length)
}
y$Mean.Size = Map(get_mean_size, y$Start_Date, y$End_Date, y$Length)
y
## Start_Date End_Date Length Mean.Size
## 1 2006-06-08 2006-06-10 3 2
## 2 2006-06-12 2006-06-14 3 5
## 3 2006-06-18 2006-06-21 4 6
## 4 2006-06-24 2006-06-25 2 0
It uses the function filter() from the dplyr package, together with base R's Map().
First I define the function get_mean_size, which is supplied with three values from a row of y: Start_Date, End_Date and Length. It first selects the relevant rows from x using filter() and sums up the column Group.Size. Using na.rm = TRUE tells sum() to ignore NA values, which is the same as setting them to zero. Then the average is calculated by dividing by Length and rounding. Note that round() rounds half to even, so 0.5 is rounded to 0, while 1.5 is rounded to 2.
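A quick check of that rounding behaviour:
round(c(0.5, 1.5, 2.5, 3.5))
# [1] 0 2 2 4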
This function is then applied to all rows of y using Map() and added as a new column to y.
A final note regarding the dates in x and y. This solution assumes that the dates are stored as Date objects. You can check this using, e.g.,
is(x$Date, "Date")
If they do not have class Date, you can convert them using
x$Date <- as.Date(x$Date)
(and similarly for y$Start_Date and y$End_Date).
There are many ways, but here is one. We can first create a list of date positions with lapply (side note: be sure that the dates are in chronological order). Then we map the function round(mean(Group.Size)) over each pair of start/end positions:
lst <- lapply(y[1:2], function(.x) match(.x, x[,"Date"]))
y$avg <- mapply(function(i,j) round(mean(x$Group.Size[i:j], na.rm=TRUE)), lst[[1]],lst[[2]])
y
# StartDate EndDate Length avg
# 1 2006-06-08 2006-06-10 3 2
# 2 2006-06-12 2006-06-14 3 8
# 3 2006-06-18 2006-06-21 4 6
# 4 2006-06-24 2006-06-25 2 0

Counting existing pairs of data in large data table

This question is based on the response given by @Arun here. In that response, @Arun proposes a clever way to avoid creating sparse matrices by only looking at pairs that actually occur, hence avoiding the storage of many zeros and the doubling of pairs A-B and B-A.
The following is copy-pasted from his answer:
Step 1: Construct sample data of your dimensions approximately:
require(data.table) ## 1.9.4+
set.seed(1L) ## For reproducibility
N = 2724098L
motif = sample(paste("motif", 1:1716, sep="_"), N, TRUE)
id = sample(83509, N, TRUE)
DT = data.table(id, motif)
Step 2: Pre-processing:
DT = unique(DT) ## IMPORTANT: not to have duplicate motifs within same id
setorder(DT) ## IMPORTANT: motifs are ordered within id as well
setkey(DT, id) ## reset key to 'id'. Motifs ordered within id from previous step
DT[, runlen := .I]
Step 3: Solution:
ans = DT[DT, {
    tmp = runlen < i.runlen;
    list(motif[tmp], i.motif[any(tmp)])
  },
  by=.EACHI][, .N, by="V1,V2"]
Running this works fine provided you have enough memory on your computer. I also humbly admit I have no idea what exactly the code is doing to create the desired result, so I'm just looking at input and output, agnostic of the process.
When applying the exact same code to my data, what seems to happen is that pairs appear that are not in the original data.
I'm running the following code, which is a slightly adapted version of what @Arun had provided. The adaptation is needed because I run the code for 17 different blocks, i.e. I'm looking for the pairs that occur within a specific block.
cooc <- data.frame()
for(j in 1:17){
  DT <- dt[block == j, c("pid", "cid"), with=F]
  DT$pid <- as.factor(DT$pid)
  setorder(DT)
  setkey(DT, pid)
  DT[, runlen := .I]
  ans <- DT[DT, {
      tmp = runlen < i.runlen;
      list(cid[tmp], i.cid[any(tmp)])
    },
    by=.EACHI][, .N, by="V1,V2"]
  ans$block <- j
  cooc <- data.table(rbind(cooc, ans))
  rm(ans)
}
As far as I understand the code, it is all identical, just wrapped in a for loop to do the same thing for the 17 blocks. Both pid and cid are just integers that identify a variable of interest.
For j = 1, the following holds:
DT[cid == 39] # cid is my equivalent of motif above and pid is my equivalent of id above
pid cid runlen
20319 39 3614
This suggests there is only one pid for which cid equals 39.
Now when I look into the resulting ans data.table I get the following:
ans[V1 == 39]
V1 V2 N block
1: 39 41 1 1
2: 39 42 1 1
3: 39 44 1 1
4: 39 47 1 1
5: 39 7027 1 1
6: 39 7043 1 1
7: 39 7174 1 1
8: 39 9434 1 1
9: 39 11493 1 1
10: 39 18815 1 1
11: 39 18875 1 1
12: 39 18896 1 1
13: 39 18909 1 1
14: 39 18924 1 1
15: 39 18928 1 1
16: 39 18929 1 1
17: 39 18931 1 1
18: 39 18932 1 1
19: 39 19265 1 1
20: 39 19410 1 1
Suddenly there are 20 rows with V1 == 39 (if I understand the code correctly, V1 is the equivalent of what was cid). Yet in DT there is only one pid assigned to cid == 39.
I have no idea how to reproduce this finding so I tried to show what seems to be inconsistent. I don't think the original code has this problem so I'm hoping someone can explain where the additional occurrences of cid == 39 come from, based on the info I have given here.
Two things:
First, I don't understand what's wrong with the result you get. Starting from
require(data.table)
set.seed(1L)
N = 2724098L
motif = sample(paste("motif", 1:1716, sep="_"), N, TRUE)
id = sample(83509, N, TRUE)
DT = data.table(id, motif)
let me recreate the data that helps answer your Q.
# keep only one of 'motif_456'
DT2 = rbind(DT[1L], DT[motif != "motif_456"])
DT2[1L]
# id motif
# 1: 49338 motif_456
DT2[ , .N, by=motif]
# motif N
# 1: motif_456 1
# 2: motif_639 1637
# 3: motif_984 1649
# 4: motif_1559 1531
# 5: motif_347 1603
# ---
# 1712: motif_46 1623
# 1713: motif_521 1613
# 1714: motif_803 1568
# 1715: motif_603 1573
# 1716: motif_461 1591
Let's check all motifs corresponding to id = 49338:
DT2[id == 49338, motif]
# [1] "motif_456" "motif_553" "motif_1048" "motif_1680" "motif_171" "motif_1706"
# [7] "motif_707" "motif_163" "motif_489" "motif_107" "motif_1419" "motif_595"
# [13] "motif_1223" "motif_1274" "motif_1164" "motif_427" "motif_505" "motif_1041"
# [19] "motif_1321" "motif_1231" "motif_1498" "motif_837" "motif_298" "motif_649"
# [25] "motif_631"
So obviously, for each of these motifs' combinations with motif_456, the count should be 1. And that's what the data.table solution provides. Here's the relevant result after running the data.table solution:
# data.table solution takes 11.2 secs
ans[V1 == "motif_456", .N] + ans[V2 == "motif_456", .N]
# [1] 24
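As an aside, a tiny toy example (my own made-up data, not drawn from your set) may help to see what the self-join actually computes: within each id, every pair of its motifs is listed once (earlier motif first), and .N then counts over ids:
require(data.table)
toy <- data.table(id = c(1L, 1L, 1L, 2L, 2L),
                  motif = c("A", "B", "C", "A", "B"))
toy <- unique(toy); setorder(toy); setkey(toy, id)
toy[, runlen := .I]
toy[toy, {
    tmp = runlen < i.runlen
    list(motif[tmp], i.motif[any(tmp)])
  }, by=.EACHI][, .N, by="V1,V2"]
#    V1 V2 N
# 1:  A  B 2
# 2:  A  C 1
# 3:  B  C 1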
Second, while the data.table answer does well, we can do this more efficiently with the solution shown by @nograpes. Let's try it on DT2:
require(Matrix)
DT2[, names(DT2) := lapply(.SD, as.factor)]
s <- sparseMatrix(
  as.integer(DT2$id),
  as.integer(DT2$motif),
  dimnames = list(levels(DT2$id), levels(DT2$motif)),
  x = TRUE)
co.oc <- t(s) %*% s        # Find co-occurrences.
tab <- summary(co.oc)      # Create triplet representation.
tab <- tab[tab$i < tab$j,] # Extract upper triangle of matrix
ans = setDT(list(motif1 = levels(DT2$motif)[tab$i],
                 motif2 = levels(DT2$motif)[tab$j],
                 number = tab$x))
# Matrix solution takes 2.4 secs
ans[motif1 == "motif_456", .N] + ans[motif2 == "motif_456", .N]
# [1] 24

R data.table: calculating grouped frequencies

I'm trying to add columns to my data.table that essentially append a cumulative frequency table for each group that is aggregated. Unfortunately, my current solution is about ten times slower than I had hoped.
Here is what I'm using (apologies for the ugly one-liner):
DT[, c("bin1","bin2","bin3","bin4") := as.list(cumsum(hist(colx,c(lbound,bound1,bound2, bound3,ubound),plot=FALSE)$counts)), by=category]
If the bin boundaries are set at 0,25,50,75,100, I would like my table to look like:
id category colx bin1 bin2 bin3 bin4
1 a 5 1 2 2 3
2 a 30 1 2 2 3
3 b 21 1 2 3 4
4 c 62 0 1 3 3
5 b 36 1 2 3 4
6 a 92 1 2 2 3
7 c 60 0 1 3 3
8 b 79 1 2 3 4
9 b 54 1 2 3 4
10 c 27 0 1 3 3
In the actual dataset I'm grouping using 4 different columns and there are millions of rows and unique groups. When I try a simpler function, such as sum, it takes an acceptable amount of time to do the calculation. Is there any way to significantly speed up the counting process?
Okay, here's one way (here I use data.table v1.9.3). Remove the by=.EACHI if you're using versions <= 1.9.2.
dt[, ival := findInterval(colx, seq(0, 100, by=25), rightmost.closed=TRUE)]
setkey(dt, category, ival)
ans <- dt[CJ(unique(category), unique(ival)), .N, allow.cartesian=TRUE, by=.EACHI]
ans[, N := cumsum(N), by="category"][, bin := "bin"]
ans <- dcast.data.table(ans, category ~ bin+ival, value.var="N")
ans <- dt[ans][, ival := NULL]
id category colx bin_1 bin_2 bin_3 bin_4
1: 1 a 5 1 2 2 3
2: 2 a 30 1 2 2 3
3: 6 a 92 1 2 2 3
4: 3 b 21 1 2 3 4
5: 5 b 36 1 2 3 4
6: 9 b 54 1 2 3 4
7: 8 b 79 1 2 3 4
8: 10 c 27 0 1 3 3
9: 4 c 62 0 1 3 3
10: 7 c 60 0 1 3 3
Benchmark on simulated large data:
I generate here a data.table with 20 million rows and a total of 1-million groups with 2 grouping columns (instead of 4 as you state in your question).
K = 1e3L
N = 20e6L
sim_data <- function(K, N) {
  set.seed(1L)
  ff <- function(K, N) sample(paste0("V", 1:K), N, TRUE)
  data.table(x=ff(K,N), y=ff(K,N), val=sample(1:100, N, TRUE))
}
dt <- sim_data(K, N)
method1 <- function(x) {
  dt[, ival := findInterval(val, seq(0, 100, by=25), rightmost.closed=TRUE)]
  setkey(dt, x, y, ival)
  ans <- dt[CJ(unique(x), unique(y), unique(ival)), .N, allow.cartesian=TRUE, by=.EACHI]
  ans[, N := cumsum(N), by="x,y"][, bin := "bin"]
  ans <- dcast.data.table(ans, x+y ~ bin+ival, value.var="N")
  ans <- dt[ans][, ival := NULL]
}
system.time(ans1 <- method1(dt))
# user system elapsed
# 13.148 2.778 16.209
I hope this is faster than your original solution and scales well for your real data dimensions.
Update: Here's another version using data.table's rolling joins instead of findInterval from base. We have to modify the intervals slightly so that the rolling join finds the right match.
dt <- sim_data(K, N)
method2 <- function(x) {
  ivals = seq(24L, 100L, by=25L)
  ivals[length(ivals)] = 100L
  setkey(dt, x, y, val)
  dt[, ival := seq_len(.N), by="x,y"]
  ans <- dt[CJ(unique(x), unique(y), ivals), roll=TRUE, mult="last"][is.na(ival), ival := 0L][, bin := "bin"]
  ans <- dcast.data.table(ans, x+y ~ bin+val, value.var="ival")
  dt[, ival := NULL]
  ans2 <- dt[ans]
}
system.time(ans2 <- method2(dt))
# user system elapsed
# 12.538 2.649 16.079
## check if both methods give identical results:
setkey(ans1, x,y,val)
setnames(ans2, copy(names(ans1)))
setkey(ans2, x,y,val)
identical(ans1, ans2) # [1] TRUE
Edit: Some explanation of why the OP's approach is very time consuming:
A huge reason, I suspect, for the difference in runtime between these solutions and hist is that both answers here are vectorised (written entirely in C and working on the whole data set directly), whereas hist is an S3 generic (which takes time to dispatch to the .default method) and, on top of that, is written in R. So you are essentially executing hist, an R function, about a million times, whereas the two vectorised solutions call into C once (there is no need to call them for every group).
And since that's the most complex part of your question, it obviously slows things down.
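A quick way to convince yourself of this (a rough sketch with made-up sizes and a separate table name, so it doesn't clobber dt above; exact timings will differ on your machine) is to time one vectorised findInterval() call against hist() executed once per group:
require(data.table)
set.seed(1L)
dtx <- data.table(g = sample(1e4L, 1e6L, TRUE), colx = runif(1e6, 0, 100))
breaks <- seq(0, 100, by = 25)
system.time(dtx[, ival := findInterval(colx, breaks, rightmost.closed = TRUE)])  # one C call
system.time(dtx[, as.list(cumsum(hist(colx, breaks, plot = FALSE)$counts)), by = g])  # hist() per group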

aggregate data frame by equal buckets

I would like to aggregate an R data.frame by equal amounts of the cumulative sum of one of the variables in the data.frame. I googled quite a lot, but probably I don't know the correct terminology to find anything useful.
Suppose I have this data.frame:
> x <- data.frame(cbind(p=rnorm(100, 10, 0.1), v=round(runif(100, 1, 10))))
> head(x)
p v
1 10.002904 4
2 10.132200 2
3 10.026105 6
4 10.001146 2
5 9.990267 2
6 10.115907 6
7 10.199895 9
8 9.949996 8
9 10.165848 8
10 9.953283 6
11 10.072947 10
12 10.020379 2
13 10.084002 3
14 9.949108 8
15 10.065247 6
16 9.801699 3
17 10.014612 8
18 9.954638 5
19 9.958256 9
20 10.031041 7
I would like to reduce x to a smaller data.frame where each line contains the weighted average of p, weighted by v, corresponding to an amount of n units of v. Something of this sort:
> n <- 100
> cum.v <- cumsum(x$v)
> f <- cum.v %/% n
> x.agg <- aggregate(cbind(v*p, v) ~ f, data=x, FUN=sum)
> x.agg$'v * p' <- x.agg$'v * p' / x.agg$v
> x.agg
f v * p v
1 0 10.039369 98
2 1 9.952049 94
3 2 10.015058 104
4 3 9.938271 103
5 4 9.967244 100
6 5 9.995071 69
First question, I was wondering if there is a better (more efficient) approach to the code above. The second, more important, question is how to correct the code above in order to obtain more precise bucketing. Namely, each row in x.agg should contain exactly 100 units of v, not just approximately, as is the case above. For example, the first row contains the aggregate of the first 17 rows of x, which correspond to 98 units of v. The next row (row 18) contains 5 units of v and is fully included in the next bucket. What I would like to achieve instead is to attribute 2 units of row 18 to the first bucket and the remaining 3 units to the following one.
Thanks in advance for any help provided.
Here's another method that does this without repeating each p value v times. The way I understand it, the place where the cumulative sum crosses 100 (see below)
18 9.954638 5 98
19 9.958256 9 107
should be changed to:
18 9.954638 5 98
19.1 9.958256 2 100 # ---> 2 units will be considered with previous group
19.2 9.958256 7 107 # ----> remaining 7 units will be split for next group
The code:
n <- 100
# get cumulative sum, an id column (for retrace) and current group id
x <- transform(x, cv = cumsum(x$v), id = seq_len(nrow(x)), grp = cumsum(x$v) %/% n)
# Paste these two lines in R to install IRanges
source("http://bioconductor.org/biocLite.R")
biocLite("IRanges")
require(IRanges)
ir1 <- successiveIRanges(x$v)
ir2 <- IRanges(seq(n, max(x$cv), by=n), width=1)
o <- findOverlaps(ir1, ir2)
# gets position where multiple of n(=100) occurs
# (where we'll have to do something about it)
pos <- queryHits(o)
# how much do the values differ from multiple of 100?
val <- start(ir2)[subjectHits(o)] - start(ir1)[queryHits(o)] + 1
# we need "pos" new rows of "pos" indices
x1 <- x[pos, ]
x1$v <- val # corresponding values
# reduce the group by 1, so that multiples of 100 will
# belong to the previous row
x1$grp <- x1$grp - 1
# subtract val in the original data x
x$v[pos] <- x$v[pos] - val
# bind and order them
x <- rbind(x1,x)
x <- x[with(x, order(id)), ]
# remove unnecessary entries
x <- x[!(duplicated(x$id) & x$v == 0), ]
x$cv <- cumsum(x$v) # updated cumsum
x$id <- NULL
require(data.table)
x.dt <- data.table(x, key="grp")
x.dt[, list(res = sum(p*v)/sum(v), cv = tail(cv, 1)), by=grp]
Running on your data:
# grp res cv
# 1: 0 10.037747 100
# 2: 1 9.994648 114
Running on @geektrader's data:
# grp res cv
# 1: 0 9.999680 100
# 2: 1 10.040139 200
# 3: 2 9.976425 300
# 4: 3 10.026622 400
# 5: 4 10.068623 500
# 6: 5 9.982733 562
Here's a benchmark on a relatively big data:
set.seed(12345)
x <- data.frame(cbind(p=rnorm(1e5, 10, 0.1), v=round(runif(1e5, 1, 10))))
require(rbenchmark)
benchmark(out <- FN1(x), replications=10)
# test replications elapsed relative user.self
# 1 out <- FN1(x) 10 13.817 1 12.586
It takes about 1.4 seconds on 1e5 rows.
If you are looking for precise bucketing, I am assuming the value of p is the same for the two "split" units of v, i.e. in your example, the value of p for the 2 units of row 18 that go into the first bucket is 9.954638.
With the above assumption, you can do the following for datasets that are not too large:
> set.seed(12345)
> x <- data.frame(cbind(p=rnorm(100, 10, 0.1), v=round(runif(100, 1, 10))))
> z <- unlist(mapply(function(x,y) rep(x,y), x$p, x$v, SIMPLIFY=T))
This creates a vector with each value of p repeated v times for each row; the result is combined into a single vector using unlist.
After this, aggregation is trivial using the aggregate function:
> aggregate(z, by=list((1:length(z)-0.5)%/%100), FUN=mean)
Group.1 x
1 0 9.999680
2 1 10.040139
3 2 9.976425
4 3 10.026622
5 4 10.068623
6 5 9.982733
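The same expand-and-average idea translates directly to data.table if you prefer to stay there (a sketch, reusing the z vector built above and assuming buckets of n = 100 units):
require(data.table)
zdt <- data.table(p = z, bucket = (seq_along(z) - 1L) %/% 100L)  # 0-based bucket per unit of v
zdt[, .(res = mean(p)), by = bucket]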
