data:
structure(list(id = c(1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 4, 4, 5),
ax = c("a", "a", "b", "b", "b", "b", "b", "b", "c", "c",
"d", "d", "e"), time = c(1, 3, 0, 2, 4, 5, 6, 8, 7, 9, 10,
11, 12)), .Names = c("id", "ax", "time"), class = c("data.table",
"data.frame"), row.names = c(NA, -13L))
looks like:
id ax time
1: 1 a 1
2: 1 a 3
3: 2 b 0
4: 2 b 2
5: 2 b 4
6: 2 b 5
7: 2 b 6
8: 2 b 8
9: 3 c 7
10: 3 c 9
11: 4 d 10
12: 4 d 11
13: 5 e 12
I want to have the max of the previous group next to the actual group:
desired output:
id ax time newCol
1: 1 a 1 NA
2: 1 a 3 NA
3: 2 b 0 3
4: 2 b 2 3
5: 2 b 4 3
6: 2 b 5 3
7: 2 b 6 3
8: 2 b 8 3
9: 3 c 7 8
10: 3 c 9 8
11: 4 d 10 9
12: 4 d 11 9
13: 5 e 12 11
Is it also possible to have the value of the "previous-previous" grp?
Interessted in baseR, data.table and tidyverse solutions
note:
Can be grouped by EITHER id or ax. The example is a little redundant here.
A data.table solution:
dtt.max <- dtt[, .(max = max(time)), by = ax]
dtt.max[, max.prev := shift(max)]
dtt[dtt.max, newCol := i.max.prev, on = 'ax']
# > dtt
# id ax time newCol
# 1: 1 a 1 NA
# 2: 1 a 3 NA
# 3: 2 b 0 3
# 4: 2 b 2 3
# 5: 2 b 4 3
# 6: 2 b 5 3
# 7: 2 b 6 3
# 8: 2 b 8 3
# 9: 3 c 7 8
# 10: 3 c 9 8
# 11: 4 d 10 9
# 12: 4 d 11 9
# 13: 5 e 12 11
data.table solution using id + 1
library(data.table)
merge(d, setDT(d)[, max(time), id + 1], all.x = TRUE)
Here is a dplyr approach. The key here is to group and ungroup when necessary:
df %>%
group_by(ax) %>%
mutate(new = time[n()]) %>%
ungroup() %>%
mutate(new = lag(new)) %>%
group_by(ax) %>%
mutate(new = new[1])
# A tibble: 13 x 4
# Groups: ax [5]
id ax time new
<dbl> <chr> <dbl> <dbl>
1 1. a 1. NA
2 1. a 3. NA
3 2. b 0. 3.
4 2. b 2. 3.
5 2. b 4. 3.
6 2. b 5. 3.
7 2. b 6. 3.
8 2. b 8. 3.
9 3. c 7. 8.
10 3. c 9. 8.
11 4. d 10. 9.
12 4. d 11. 9.
13 5. e 12. 11.
Assuming id is the same as group:
dfr <- dfr %>% group_by(id) %>% mutate(groupmax = max(time))
dfr$old_group_max <- dfr$groupmax[match(dfr$id - 1, dfr$id)]
The antepenultimate group is left as an exercise :-)
1) This uses no packages. It computes the maximum for each group giving Ag and and then lags it giving LagMax. Finally it left joins using merge that back into the original data frame DF:
Ag <- aggregate(time ~ id, DF, max)
LagMax <- transform(Ag, lagmax = c(NA, head(time, -1)), time = NULL)
merge(DF, LagMax, by = "id", all.x = TRUE)
giving:
id ax time lagmax
1 1 a 1 NA
2 1 a 3 NA
3 2 b 0 3
4 2 b 2 3
5 2 b 4 3
6 2 b 5 3
7 2 b 6 3
8 2 b 8 3
9 3 c 7 8
10 3 c 9 8
11 4 d 10 9
12 4 d 11 9
13 5 e 12 11
2) This sorts time within id so that we know that the maximum is the last value in each id group.
o <- order(factor(DF$id, levels = unique(DF$id)), DF$time)
Time <- DF$time[o]
lagmax <- function(r) if (r[1] == 1) NA else Time[r[1] - 1]
transform(DF, lagmax = ave(seq_along(id), id, FUN = lagmax))
In the question the time values are already sorted within id and if that is known to be the case the above could be shortened to:
lagmax <- function(r) if (r[1] == 1) NA else DF$time[r[1] - 1]
transform(DF, lagmax = ave(seq_along(id), id, FUN = lagmax))
3) This one-liner is a data.table translation of (2):
library(data.table)
DT <- copy(DF) # don't overwrite DF
setDT(DT)[, g:=rleid(id)][, lagmax := DT$time[.I[1]-1], keyby = c("g", "id")]
In the sample data in the question time is sorted within id and if that were known to be the case we could use the following shorter code in place of the last line above
setDT(DT)[, lagmax := DT$time[.I[1]-1], by = id]
Related
I have a very large data frame that includes integer columns state and state_cyclen. Every row is a gameframe, while state describes the state a game is in at that frame and state_cyclen is coded to indicate n occurrence of that state (it is basically data.table::rleid(state)). Conditioning on state and cycling by state_cyclen I need to import several columns from other definitions data frames. Definition data frames store properties about state and their row ordering informs on the way these properties are cycled throughout the game (players encounter each game state many times).
A minimal example of the long data that should be left joined:
data <- data.frame(
state = c(1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3, 3, 4, 4, 3, 3),
state_cyclen = c(1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 1, 1, 4, 4)
)
data
#> state state_cyclen
#> 1 1 1
#> 2 1 1
#> 3 2 1
#> 4 2 1
#> 5 3 1
#> 6 3 1
#> 7 1 2
#> 8 1 2
#> 9 2 2
#> 10 2 2
#> 11 3 2
#> 12 3 2
#> 13 2 3
#> 14 2 3
#> 15 3 3
#> 16 3 3
#> 17 3 3
#> 18 4 1
#> 19 4 1
#> 20 3 4
#> 21 3 4
Minimal example for definition data frames storing the ordering:
def_one <- data.frame(
prop = letters[1:3],
others = LETTERS[1:3]
)
def_two <- data.frame(
prop = letters[4:10],
others = LETTERS[4:10]
)
def_three <- data.frame(
prop = letters[11:12],
others = LETTERS[11:12]
)
I have a solution written in base R that gives the desired output, but it's neither very readable, nor probably very efficient.
# Add empty columns
data$prop <- NA
data$others <- NA
# Function that recycles numeric vector bounded by a upper limit
bounded_vec_recyc <- function(vec, n) if(n == 1) vec else (vec - 1) %% n + 1
# My solution
vec_pos_one <- data[data[, "state"] == 1, ]$state_cyclen
vec_pos_one <- bounded_vec_recyc(vec_pos_one, n = nrow(def_one))
data[data[, "state"] == 1, ][, c("prop", "others")] <- def_one[vec_pos_one,]
vec_pos_two <- data[data[, "state"] == 2, ]$state_cyclen
vec_pos_two <- bounded_vec_recyc(vec_pos_two, n = nrow(def_two))
data[data[, "state"] == 2, ][, c("prop", "others")] <- def_two[vec_pos_two,]
vec_pos_three <- data[data[, "state"] == 3, ]$state_cyclen
vec_pos_three <- bounded_vec_recyc(vec_pos_three, n = nrow(def_three))
data[data[, "state"] == 3, ][, c("prop", "others")] <- def_three[vec_pos_three,]
data
#> state state_cyclen prop others
#> 1 1 1 a A
#> 2 1 1 a A
#> 3 2 1 d D
#> 4 2 1 d D
#> 5 3 1 k K
#> 6 3 1 k K
#> 7 1 2 b B
#> 8 1 2 b B
#> 9 2 2 e E
#> 10 2 2 e E
#> 11 3 2 l L
#> 12 3 2 l L
#> 13 2 3 f F
#> 14 2 3 f F
#> 15 3 3 k K
#> 16 3 3 k K
#> 17 3 3 k K
#> 18 4 1 <NA> <NA>
#> 19 4 1 <NA> <NA>
#> 20 3 4 l L
#> 21 3 4 l L
Created on 2022-08-30 with reprex v2.0.2
TLDR: As you can see, I am basically trying to merge one by one these definition data frames to the main data frame on corresponding state by recycling the rows of the definition data frame while retaining their order, using the state_cyclen column to keep track of occurrences of each state throughout the game.
Is there a way to do this within the tidyverse or data.table that is faster or at least easier to read? I need this to be quite fast as I have many such gameframe files (in the hundreds) and they are lengthy (hundreds of thousands of rows).
P.S. Not sure if title is adequate for the operations I am doing, as I can imagine multiple ways of implementation. Edits on it are welcome.
Here, I make a lookup table combining the three sources. Then I join the data with the number of rows for each state, modify the state_cyclen in data using modulo with that number to be within the lookup range, then join.
library(tidyverse)
def <- bind_rows(def_one, def_two, def_three, .id = "state") %>%
mutate(state = as.numeric(state)) %>%
group_by(state) %>%
mutate(state_cyclen_adj = row_number()) %>%
ungroup()
data %>%
left_join(def %>% count(state)) %>%
# eg for row 15 we change 3 to 1 since the lookup table only has 2 rows
mutate(state_cyclen_adj = (state_cyclen - 1) %% n + 1) %>%
left_join(def)
Joining, by = "state"
Joining, by = c("state", "state_cyclen_adj")
state state_cyclen n state_cyclen_adj prop others
1 1 1 3 1 a A
2 1 1 3 1 a A
3 2 1 7 1 d D
4 2 1 7 1 d D
5 3 1 2 1 k K
6 3 1 2 1 k K
7 1 2 3 2 b B
8 1 2 3 2 b B
9 2 2 7 2 e E
10 2 2 7 2 e E
11 3 2 2 2 l L
12 3 2 2 2 l L
13 2 3 7 3 f F
14 2 3 7 3 f F
15 3 3 2 1 k K
16 3 3 2 1 k K
17 3 3 2 1 k K
18 4 1 NA NA <NA> <NA>
19 4 1 NA NA <NA> <NA>
20 3 4 2 2 l L
21 3 4 2 2 l L
Here is a data.table solution. Not sure it is easier to read, but pretty sure it is more efficient:
library(data.table)
dt <- rbind(setDT(def_one)[,state := 1],
setDT(def_two)[,state := 2],
setDT(def_three)[,state := 3])
dt[,state_cyclen := 1:.N,by = state]
data <- setDT(data)
data[dt[,.N,by = state],
state_cyclen := bounded_vec_recyc(state_cyclen,i.N),
on = "state",
by = .EACHI]
dt[data,on = c("state","state_cyclen")]
prop others state state_cyclen
1: a A 1 1
2: a A 1 1
3: d D 2 1
4: d D 2 1
5: k K 3 1
6: k K 3 1
7: b B 1 2
8: b B 1 2
9: e E 2 2
10: e E 2 2
11: l L 3 2
12: l L 3 2
13: f F 2 3
14: f F 2 3
15: k K 3 1
16: k K 3 1
17: k K 3 1
18: <NA> <NA> 4 1
19: <NA> <NA> 4 1
20: l L 3 2
21: l L 3 2
prop others state state_cyclen
By step:
I bind the def_one, def_two and def_three dataframes to create a data.table with the variable you need to merge
dt <- rbind(setDT(def_one)[,state := 1],
setDT(def_two)[,state := 2],
setDT(def_three)[,state := 3])
dt[,state_cyclen := 1:.N,by = state]
In case you want to merge a lot of dataframes, you can use rbindlist and a list of data.tables.
I then modify your state_cyclen in data to do the same recycling than you:
dt[,.N,by = state]
state N
1: 1 3
2: 2 7
3: 3 2
gives the lengths you use to define your recycling.
data[dt[,.N,by = state],
state_cyclen := bounded_vec_recyc(state_cyclen,i.N),
on = "state",
by = .EACHI]
I use the by = .EACHI to modify the variable for each group during the merge, using the N variable from dt[,.N,by = state]
Then I just have to do the left join:
dt[data,on = c("state","state_cyclen")]
An option with nest/unnest
library(dplyr)
library(tidyr)
data %>%
nest_by(state) %>%
left_join(tibble(state = 1:3, dat = list(def_one, def_two, def_three))) %>%
mutate(data = list(bind_cols(data, if(!is.null(dat))
dat[data %>%
pull(state_cyclen) %>%
bounded_vec_recyc(., nrow(dat)),] else NULL)), dat = NULL) %>%
ungroup %>%
unnest(data)
-output
# A tibble: 21 × 4
state state_cyclen prop others
<dbl> <dbl> <chr> <chr>
1 1 1 a A
2 1 1 a A
3 1 2 b B
4 1 2 b B
5 2 1 d D
6 2 1 d D
7 2 2 e E
8 2 2 e E
9 2 3 f F
10 2 3 f F
# … with 11 more rows
I am new to R. I would like to calculate the mean for each row of a dataframe, but using different subset of columns for each row. I have two extra-columns providing me the names of the column that represent the "start" and the "end" that I should use to calculate each mean, respectively.
Let's take this example
dframe <- data.frame(a=c("2","3","4", "2"), b=c("1","3","6", "2"), c=c("4","5","6", "3"), d=c("4","2","8", "5"), e=c("a", "c", "a", "b"), f=c("c", "d", "d", "c"))
dframe
Which provides the following dataframe:
a b c d e f
1 2 1 4 4 a c
2 3 3 5 2 c d
3 4 6 6 8 a d
4 2 2 3 5 b c
The columns e and f represent the first and last column I use to calculate the mean for each row.
For example, on line 1, the mean would be calculated including column a, b, c ((2+1+4)/3 -> 2.3)
So I would like to obtain the following output:
a b c d e f mean
1 2 1 4 4 a c 2.3
2 3 3 5 2 c d 3.5
3 4 6 6 8 a d 6
4 2 2 3 5 b c 2.5
I learnt how to create the indices, and I want then to use RowMeans, but I cannot find the correct arguments.
dframe %>%
mutate(e_indice = match(e, colnames(dframe)))%>%
mutate(f_indice = match(f, colnames(dframe)))%>%
mutate(mean = RowMeans(????, na.rm = TRUE))
Thanks a lot for your help
One dplyr option could be:
dframe %>%
rowwise() %>%
mutate(mean = rowMeans(cur_data()[match(e, names(.)):match(f, names(.))]))
a b c d e f mean
<dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl>
1 2 1 4 4 a c 2.33
2 3 3 5 2 c d 3.5
3 4 6 6 8 a d 6
4 2 2 3 5 b c 2.5
I would define a helper function that lets you slice the indices you want
from a matrix.
rowSlice <- function(x, start, stop) {
replace(x, col(x) < start | col(x) > stop, NA)
}
rowSlice(matrix(1, 4, 4), c(1, 3, 1, 2), c(3, 4, 4, 3))
#> [,1] [,2] [,3] [,4]
#> [1,] 1 1 1 NA
#> [2,] NA NA 1 1
#> [3,] 1 1 1 1
#> [4,] NA 1 1 NA
Then use across() to select the relvant columns, slice them,
and take the rowMeans().
library(dplyr)
dframe <- data.frame(
a = c(2, 3, 4, 2),
b = c(1, 3, 6, 2),
c = c(4, 5, 6, 3),
d = c(4, 2, 8, 5),
e = c("a", "c", "a", "b"),
f = c("c", "d", "d", "c")
)
dframe %>%
mutate(ei = match(e, colnames(dframe))) %>%
mutate(fi = match(f, colnames(dframe))) %>%
mutate(
mean = across(a:d) %>%
rowSlice(ei, fi) %>%
rowMeans(na.rm = TRUE)
)
#> a b c d e f ei fi mean
#> 1 2 1 4 4 a c 1 3 2.333333
#> 2 3 3 5 2 c d 3 4 3.500000
#> 3 4 6 6 8 a d 1 4 6.000000
#> 4 2 2 3 5 b c 2 3 2.500000
A base R solution. First, set columns to numeric. Then create a list of the columns on which to apply the mean. Then apply mean on selected columns.
s <- mapply(seq, match(dframe$e, colnames(dframe)), match(dframe$f, colnames(dframe)))
dframe$mean <- lapply(seq(nrow(dframe)), function(x) rowMeans(dframe[x, s[[x]]]))
a b c d e f mean
1 2 1 4 4 a c 2.333333
2 3 3 5 2 c d 3.5
3 4 6 6 8 a d 6
4 2 2 3 5 b c 2.5
A base R approach using apply
dframe$mean <- apply(dframe, 1, function(x)
mean(as.numeric(x[which(names(x) == x["e"]) : which(names(x) == x["f"])])))
dframe
a b c d e f mean
1 2 1 4 4 a c 2.333333
2 3 3 5 2 c d 3.500000
3 4 6 6 8 a d 6.000000
4 2 2 3 5 b c 2.500000
I have a problem that I don't manage to solve properly in data.table. I have the following data:
plouf <- data.table( ID = rep(LETTERS[1:10],each = 10) )
plouf[,c(paste0("X",1:10)) := lapply(1:10,function(x){sample(10,100,replace = T)})]
There are two things that block me time to time:
col <- "X1"
plouf[get(col) > 5, .(col = get(col)[1]) ,by = ID]
ID col
1: A 7
2: B 7
3: C 9
4: D 6
5: E 8
6: F 7
7: G 6
8: H 7
9: I 6
10: J 7
The column is named "col" instead of "X1". I tried with eval, get, didn't get it.
And same kind :
col <- 1
plouf[get(paste0("X",col)) > 5, .(paste0("X",col) = get(paste0("X",col))[1]) ,by = ID]
Error: unexpected '=' in "plouf[get(paste0("X",col)) > 5, .(paste0("X",col) ="
I tried this from Using paste when naming a list :
plouf[get(paste0("X",col)) > 5,setNames( get(paste0("X",col))[1],paste0("X",col)) ,by = ID]
ID V1
1: A 7
2: B 7
3: C 9
4: D 6
5: E 8
6: F 7
7: G 6
8: H 7
9: I 6
10: J 7
but it is not the desired result. Could someone explain me how it works ?
We can use setNames
plouf[get(col) > 5, setNames(list(get(col)[1]), col) ,by = ID]
or another option is setnames after getting the result
setnames(plouf[get(col) > 5, .(get(col)[1]) ,by = ID], 'V1', col)[]
# ID X1
#1: A 8
#2: B 7
#3: C 6
#4: D 10
#5: F 9
#6: G 8
#7: H 10
#8: I 6
#9: J 8
If we are using dplyr, then the option would be
library(dplyr)
plouf %>%
filter_at(col, any_vars(.>5)) %>%
group_by(ID) %>%
summarise_at(col, first)
# A tibble: 9 x 2
# ID X1
# <chr> <int>
#1 A 8
#2 B 7
#3 C 6
#4 D 10
#5 F 9
#6 G 8
#7 H 10
#8 I 6
#9 J 8
Or with := and sym from rlang
plouf %>%
filter(!! rlang::sym(col) > 5) %>%
group_by(ID) %>%
summarise(!! col := first(!!rlang::sym(col)))
I have data table
Name Score
A 5
A 6
B 9
B 1
B 0
...
I want to calculate and add a column 'FScore'=max score to this table
My expected result
Name Score Fscore
A 5 6
A 6 6
B 9 9
B 1 9
B 0 9
Thank.
We can use the base R option ave
df$Fscore <- ave(df$Score, df$Name, FUN = max)
df
# Name Score Fscore
#1 A 5 6
#2 A 6 6
#3 B 9 9
#4 B 1 9
#5 B 0 9
If you are trying to find the maximum score for each Name value, you can use data.table as below.
# example data
d <- data.table(Name = c("A", "A", "B", "B", "B"),
Score = c(5, 6, 9, 1, 0))
# find max for each Name and save the value in a new column, Fscore
d[ , Fscore := max(Score), by=Name]
Result:
> print(d)
Name Score Fscore
1: A 5 6
2: A 6 6
3: B 9 9
4: B 1 9
5: B 0 9
Another option using dplyr could be:
df = data.frame(Name = c('a', 'a', 'b','b','b'), Score = c(5,6,9,1,0))
df %>% group_by(Name) %>% mutate(Fscore = max(Score))
Source: local data frame [5 x 3]
Groups: Name [2]
Name Score FScore
<fctr> <dbl> <dbl>
1 a 5 6
2 a 6 6
3 b 9 9
4 b 1 9
5 b 0 9
I have a data set with repeating rows. I want to remove consecutive repeated and count them but only if they're consecutive. I'm looking for an efficient way to do this. Can't think of how in dplyr or data.table.
MWE
dat <- data.frame(
x = c(6, 2, 3, 3, 3, 1, 1, 6, 5, 5, 6, 6, 5, 4),
y = c(7, 5, 7, 7, 7, 5, 5, 7, 1, 2, 7, 7, 1, 7),
z = c(rep(LETTERS[1:2], each=7))
)
## x y z
## 1 6 7 A
## 2 2 5 A
## 3 3 7 A
## 4 3 7 A
## 5 3 7 A
## 6 1 5 A
## 7 1 5 A
## 8 6 7 B
## 9 5 1 B
## 10 5 2 B
## 11 6 7 B
## 12 6 7 B
## 13 5 1 B
## 14 4 7 B
Desired output
x y z n
1 6 7 A 1
2 2 5 A 1
3 3 7 A 3
4 1 5 A 2
5 6 7 B 1
6 5 1 B 1
7 5 2 B 1
8 6 7 B 2
9 5 1 B 1
10 4 7 B 1
With data.table:
library(data.table)
setDT(dat)
dat[, c(.SD[1L], .N), by=.(g = rleidv(dat))][, g := NULL]
x y z N
1: 6 7 A 1
2: 2 5 A 1
3: 3 7 A 3
4: 1 5 A 2
5: 6 7 B 1
6: 5 1 B 1
7: 5 2 B 1
8: 6 7 B 2
9: 5 1 B 1
10: 4 7 B 1
Similar to Ricky's answer, here's another base solution:
with(rle(do.call(paste, dat)), cbind(dat[ cumsum(lengths), ], lengths))
In case paste doesn't cut it for the column classes you have, you can do
ud = unique(dat)
ud$r = seq_len(nrow(ud))
dat$r0 = seq_len(nrow(dat))
newdat = merge(dat, ud)
with(rle(newdat[order(newdat$r0), ]$r), cbind(dat[cumsum(lengths), ], lengths))
... though I'm guessing there's some better way.
With dplyr, you can borrow data.table::rleid to make a run ID column, then use n to count rows and unique to chop out repeats:
dat %>% group_by(run = data.table::rleid(x, y, z)) %>% mutate(n = n()) %>%
distinct() %>% ungroup() %>% select(-run)
You can replace rleid with just base R, if you like, but it's not as pretty:
dat %>% group_by(run = rep(seq_along(rle(paste(x, y, z))$len),
times = rle(paste(x, y, z))$len)) %>%
mutate(n = n()) %>% distinct() %>% ungroup() %>% select(-run)
Either way, you get:
Source: local data frame [10 x 4]
x y z n
(dbl) (dbl) (fctr) (int)
1 6 7 A 1
2 2 5 A 1
3 3 7 A 3
4 1 5 A 2
5 6 7 B 1
6 5 1 B 1
7 5 2 B 1
8 6 7 B 2
9 5 1 B 1
10 4 7 B 1
Edit
Per #Frank's comment, you can also use summarise to insert n and collapse instead of mutate and unique if you group_by all the variables you want to keep before run, as summarise collapses the last group. One advantage to this approach is that you don't have to ungroup to get rid of run, as summarise does for you:
dat %>% group_by(x, y, z, run = data.table::rleid(x, y, z)) %>%
summarise(n = n()) %>% select(-run)
A base solution below
idx <- rle(with(dat, paste(x, y, z)))
d <- cbind(do.call(rbind, strsplit(idx$values, " ")), idx$lengths)
as.data.frame(d)
V1 V2 V3 V4
1 6 7 A 1
2 2 5 A 1
3 3 7 A 3
4 1 5 A 2
5 6 7 B 1
6 5 1 B 1
7 5 2 B 1
8 6 7 B 2
9 5 1 B 1
10 4 7 B 1
If you have a large dataset, you could use a similar idea to Frank's data.table solution, but avoid using .SD like this:
dat[, g := rleidv(dat)][, N := .N, keyby = g
][J(unique(g)), mult = "first"
][, g := NULL
][]
It's less readable, and it turns out it's slower, too. Frank's solution is faster and more readable.
# benchmark on 14 million rows
dat <- data.frame(
x = rep(c(6, 2, 3, 3, 3, 1, 1, 6, 5, 5, 6, 6, 5, 4), 1e6),
y = rep(c(7, 5, 7, 7, 7, 5, 5, 7, 1, 2, 7, 7, 1, 7), 1e6),
z = rep(c(rep(LETTERS[1:2], each=7)), 1e6)
)
setDT(dat)
d1 <- copy(dat)
d2 <- copy(dat)
With R 3.2.4 and data.table 1.9.7 (on Frank's computer):
system.time(d1[, c(.SD[1L], .N), by=.(g = rleidv(d1))][, g := NULL])
# user system elapsed
# 0.42 0.10 0.52
system.time(d2[, g := rleidv(d2)][, N := .N, keyby = g][J(unique(g)), mult = "first"][, g := NULL][])
# user system elapsed
# 2.48 0.25 2.74
Not much different than the other answers, but (1) having ordered data and (2) looking for consecutive runs seems a good candidate for, just, ORing x[-1L] != x[-length(x)] accross columns instead of pasteing or other complex operations. I guess this is, somehow, equivalent to data.table::rleid.
ans = logical(nrow(dat) - 1L)
for(j in seq_along(dat)) ans[dat[[j]][-1L] != dat[[j]][-nrow(dat)]] = TRUE
ans = c(TRUE, ans)
#or, the two-pass, `c(TRUE, Reduce("|", lapply(dat, function(x) x[-1L] != x[-length(x)])))`
cbind(dat[ans, ], n = tabulate(cumsum(ans)))
# x y z n
#1 6 7 A 1
#2 2 5 A 1
#3 3 7 A 3
#6 1 5 A 2
#8 6 7 B 1
#9 5 1 B 1
#10 5 2 B 1
#11 6 7 B 2
#13 5 1 B 1
#14 4 7 B 1
Another base attempt using ave, just because:
dat$grp <- ave(
seq_len(nrow(dat)),
dat[c("x","y","z")],
FUN=function(x) cumsum(c(1,diff(x))!=1)
)
dat$count <- ave(dat$grp, dat, FUN=length)
dat[!duplicated(dat[1:4]),]
# x y z grp count
#1 6 7 A 0 1
#2 2 5 A 0 1
#3 3 7 A 0 3
#6 1 5 A 0 2
#8 6 7 B 0 1
#9 5 1 B 0 1
#10 5 2 B 0 1
#11 6 7 B 1 2
#13 5 1 B 1 1
#14 4 7 B 0 1
And a data.table conversion attempt:
d1[, .(sq=.I, grp=cumsum(c(1, diff(.I)) != 1)), by=list(x,y,z)][(sq), .N, by=list(x,y,z,grp)]