dplyr::slice in data.table [duplicate]

This question already has answers here:
How to extract the first n rows per group?
(4 answers)
Subset rows corresponding to max value by group using data.table
(1 answer)
Closed 4 years ago.
What is the idiomatic way to do the action below in data.table?
library(dplyr)
df %>%
group_by(b) %>%
slice(1:10)
I can do
library(data.table)
df[, .SD[1:10]
, by = b]
but that appears much slower. Is there a better way?
set.seed(0)
df <- rep(1:500, sample(500:1000, 500, T)) %>%
data.table(a = runif(length(.))
,b = .)
f1 <- function(df){
df %>%
group_by(b) %>%
slice(1:10)
}
f2 <- function(df){
df[, .SD[1:10]
, by = b]
}
library(microbenchmark)
microbenchmark(f1(df), f2(df))
#Unit: milliseconds
# expr min lq mean median uq max neval
# f1(df) 17.67435 19.50381 22.06026 20.50166 21.42668 78.3318 100
# f2(df) 69.69554 79.43387 119.67845 88.25585 106.38661 581.3067 100
========== Benchmarks with suggested methods ==========
set.seed(0)
df <- rep(1:500, sample(500:1000, 500, T)) %>%
data.table(a = runif(length(.))
,b = .)
use.slice <- function(df){
df %>%
group_by(b) %>%
slice(1:10)
}
IndexSD <- function(df){
df[, .SD[1:10]
, by = b]
}
Index.I <- function(df) {
df[df[, .I[seq_len(10)], by = b]$V1]
}
use.head <- function(df){
df[, head(.SD, 10)
, by = b]
}
library(microbenchmark)
microbenchmark(use.slice(df)
, IndexSD(df)
, Index.I(df)
, use.head(df)
, unit = "relative"
, times = 100L)
#Unit: relative
# expr min lq mean median uq max neval
# use.slice(df) 9.804549 10.269234 9.167413 8.900060 8.782862 6.520270 100
# IndexSD(df) 38.881793 42.548555 39.044095 38.636523 39.942621 18.981748 100
# Index.I(df) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 100
# use.head(df) 3.666898 4.033038 3.728299 3.408249 3.545258 3.951565 100

We can use .I to extract the row index, which should be faster:
out <- df[df[, .I[seq_len(10)], by = b]$V1]
dim(out)
#[1] 5000 2
Checking if there are NAs (as the OP commented)
any(out[, Reduce(`|`, lapply(.SD, is.na))])
#[1] FALSE
dim(df)
#[1] 374337 2
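(Had some groups contained fewer than 10 rows, .I[seq_len(10)] would index past the end of the group and return NA row numbers, which is where NAs could come from; a guarded variant, as a sketch:)
out <- df[df[, .I[seq_len(min(.N, 10L))], by = b]$V1]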
Benchmarks
f3 <- function(df) {
df[df[, .I[seq_len(10)], by = b]$V1]
}
microbenchmark(f1(df), f2(df), f3(df), unit = "relative", times = 10L)
#Unit: relative
# expr min lq mean median uq max neval cld
# f1(df) 5.727822 5.480741 4.945486 5.672206 4.317531 5.10003 10 b
# f2(df) 24.572633 23.774534 17.842622 23.070634 16.099822 11.58287 10 c
# f3(df) 1.000000 1.000000 1.000000 1.000000 1.000000 1.00000 10 a

Related

Faster "Resample with replacement by cluster"

I have the same question as Resample with replacement by cluster, i.e. I want to do cluster bootstrapping. The best answer's approach to that question using rbindlist(lapply(resampled_ids, function(resampled_id) df[df$id == resampled_id,])) works, but because I have a big dataset, this resampling step is rather slow. My question is, is it possible to speed this up?
Use sequence to index. Demonstrated with a larger data.frame:
df <- data.frame(id = rep.int(1:1e2, sample(100:200, 1e2, replace = TRUE))[1:1e4], X = rnorm(1e4))
resampled_ids <- sample(unique(df$id), replace = TRUE)
idx <- sequence(tabulate(df$id)[resampled_ids], match(unique(df$id), df$id)[resampled_ids])
s <- data.frame(id = df$id[idx], X = df$X[idx])
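(This relies on the two-argument form sequence(lengths, from = starts), available in base R since 4.0.0: it concatenates integer runs of the given lengths starting at the given positions, here one run of row indices per resampled id. A tiny illustration:)
sequence(c(2, 3), from = c(1, 5))
#> [1] 1 2 5 6 7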
Benchmarking against the rbindlist solution:
library(data.table)
library(microbenchmark)
microbenchmark(rbindlist = rbindlist(lapply(resampled_ids, function(x) df[df$id %in% x,])),
sequence = {idx <- sequence(tabulate(df$id)[resampled_ids], match(unique(df$id), df$id)[resampled_ids])
data.frame(id = df$id[idx], X = df$X[idx])})
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> rbindlist 9480.4 9921.95 11470.567 10431.05 12555.35 31178.2 100
#> sequence 406.7 444.55 564.873 498.10 545.70 2818.4 100
Note that creating a new data.frame from indexed vectors is much faster than row-indexing the original data.frame. The difference is much less pronounced if a data.table is used, but, surprisingly, the rbindlist solution becomes even slower:
microbenchmark(rbindlist = rbindlist(lapply(resampled_ids, function(x) df[df$id %in% x,])),
sequence1 = df[sequence(tabulate(df$id)[resampled_ids], match(unique(df$id), df$id)[resampled_ids]),],
sequence2 = {idx <- sequence(tabulate(df$id)[resampled_ids], match(unique(df$id), df$id)[resampled_ids])
data.frame(id = df$id[idx], X = df$X[idx])})
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> rbindlist 9431.9 9957.7 11101.545 10508.15 12395.25 15363.3 100
#> sequence1 4284.5 4550.3 4866.891 4674.80 5009.90 8350.1 100
#> sequence2 414.1 455.6 541.590 508.40 551.40 2881.1 100
setDT(df)
microbenchmark(rbindlist = rbindlist(lapply(resampled_ids, function(x) df[df$id %in% x,])),
sequence1 = df[sequence(tabulate(df$id)[resampled_ids], match(unique(df$id), df$id)[resampled_ids]),],
sequence2 = {idx <- sequence(tabulate(df$id)[resampled_ids], match(unique(df$id), df$id)[resampled_ids])
data.table(id = df$id[idx], X = df$X[idx])})
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> rbindlist 14877.4 15878.30 17181.572 16348.50 18527.6 22520.9 100
#> sequence1 795.0 1016.80 1187.266 1101.95 1326.7 2566.5 100
#> sequence2 386.4 441.75 556.226 473.70 500.9 3373.6 100
Update
To address the comment from jay.sf:
lens <- tabulate(df$id)[resampled_ids]
idx <- sequence(lens, match(unique(df$id), df$id)[resampled_ids])
s <- data.frame(cluster = rep.int(seq_along(resampled_ids), lens), id = df$id[idx], X = df$X[idx])
cluster corresponds to the index of resampled_ids.
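A quick sanity check on the bootstrap sample (a sketch; note that, like the indexing trick itself, it relies on each id's rows being stored contiguously, as in the example data):
all(tabulate(s$cluster) == lens)             # each draw i contributes lens[i] rows
all(s$id == rep.int(resampled_ids, lens))    # ...all carrying the i-th resampled id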
Another option, using split in base R. With a small example data frame:
f <- data.frame(id = c(1, 1, 2, 2, 2, 3, 3), X = rnorm(7))
Try this:
ind_id <- split(seq_along(f$id), f$id)
samp_id <- sample(names(ind_id), replace = TRUE)
f[unlist(ind_id[samp_id]), ]

How to change a variable at the nth occurrence of a value in another variable?

There is a data.table
library(data.table)
car <- data.table(no = 1:100, turn = sample(1:5,100,replace = TRUE),
dis = sample(1:10,100,replace = TRUE))
I want to change "dis" to -1, at the nth occurrence of turn == 3, say the third time that "turn" is 3.
I can select the third row of turn == 3:
car[turn == 3, .SD[3]]
However, I don't manage to update "dis" at this row:
car[turn == 3, .SD[3]][, dis := -1]
A related Q&A: Conditionally replacing column values with data.table.
Some alternatives. Use rowid or cumsum to create a counter of rows within groups. Add the counter to your condition in i.
I use a slightly smaller toy data set, just to make it easier to track the changes:
d <- data.table(x = 1:3, y = 1:12)
d[rowid(x) == 3 & x == 3, y := -1]
# @mt1022
d[cumsum(x == 3) == 3 & (x == 3), y := -1]
# @docendo discimus
d[(ix <- x == 3) & cumsum(ix) == 3, y := -1]
Although OP didn't mention speed as an issue, I was still curious to time the different approaches on a larger vector. Unsurprisingly, @Frank's method is the fastest, especially so when the number of unique values to search among increases:
frank << docendo < henrik < mt1022
microbenchmark(henrik = d[rowid(x) == 3 & x == 3, y := -1],
mt1022 = d[cumsum(x == 3) == 3 & (x == 3), y := -1],
docendo = d[(ix <- x == 3) & cumsum(ix) == 3, y := -1],
frank = d[d[x == 3, which = TRUE][3], y := -1], unit = "relative")
d <- data.table(x = sample(1:3, 1e6, replace = TRUE), y = 1:1e6)
# Unit: relative
# expr min lq mean median uq max neval cld
# henrik 4.417303 4.369407 4.133514 4.319839 4.329658 1.260394 100 b
# mt1022 5.461961 5.285562 5.174559 5.186404 5.239738 1.608712 100 c
# docendo 3.572646 3.624369 3.788678 3.589705 3.576637 1.733272 100 b
# frank 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 100 a
d <- data.table(x = sample(1:30, 1e6, replace = TRUE), y = 1:1e6)
# Unit: relative
# expr min lq mean median uq max neval cld
# henrik 22.64881 19.54375 18.81963 18.91335 19.78559 5.507692 100 bc
# mt1022 24.58258 21.17535 19.84417 20.96256 22.76020 3.625263 100 c
# docendo 19.40044 16.75912 16.23321 16.47953 18.06264 4.234100 100 b
# frank 1.00000 1.00000 1.00000 1.00000 1.00000 1.000000 100 a
d <- data.table(x = sample(1:300, 1e6, replace = TRUE), y = 1:1e6)
# Unit: relative
# expr min lq mean median uq max neval cld
# henrik 31.81237 32.51122 28.79490 30.35766 28.63560 8.236282 100 b
# mt1022 34.71984 35.45341 33.20405 33.57394 31.50914 21.556367 100 c
# docendo 27.99046 28.15855 26.56954 26.60644 25.20044 7.847163 100 b
# frank 1.00000 1.00000 1.00000 1.00000 1.00000 1.000000 100 a
For reference, absolute timings from the largest case:
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# henrik 60.655582 76.455531 83.061266 77.632036 78.57818 203.224042 100 c
# mt1022 66.701182 84.133034 87.967300 84.937201 85.72464 201.167914 100 c
# docendo 52.938545 67.214360 71.558130 68.003891 68.51897 184.178346 100 b
# frank 1.977821 2.494039 2.629852 2.663577 2.76089 3.613905 100 a
Here's another way:
car[car[turn == 3, which = TRUE][3], dis := -1 ]
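Here, which = TRUE makes the inner call return the row numbers of the matching rows rather than the rows themselves, so picking the nth occurrence is just taking the nth element. A minimal illustration:
car[turn == 3, which = TRUE]       # integer vector of row numbers where turn == 3
car[turn == 3, which = TRUE][3]    # row number of the third occurrence (NA if there is none)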
Comment. For an operation like this, you might want to turn verbose on, so you can see if any change was made. For example, if we look for the 111th 3....
car[car[turn == 3, which = TRUE][111], dis := -1, verbose = TRUE ]
# Detected that j uses these columns: dis
# Assigning to 0 row subset of 100 rows
It shows that 0 rows were edited.
See also Using .I to return row numbers with data.table package on the use of which = TRUE.

R: scan vectors once instead of 4 times?

Suppose I have two equal length logical vectors.
Computing the confusion matrix the easy way:
c(sum(actual == 1 & predicted == 1),
sum(actual == 0 & predicted == 1),
sum(actual == 1 & predicted == 0),
sum(actual == 0 & predicted == 0))
requires scanning the vectors 4 times.
Is it possible to do that in a single pass?
PS. I tried table(2*actual+predicted) and table(actual,predicted) but both are obviously much slower.
PPS. Speed is not my main consideration here, I am more interested in understanding the language.
You could try using data.table
library(data.table)
DT <- data.table(actual, predicted)
setkey(DT, actual, predicted)[,.N, .(actual, predicted)]$N
data
set.seed(24)
actual <- sample(0:1, 10 , replace=TRUE)
predicted <- sample(0:1, 10, replace=TRUE)
Benchmarks
Using data.table_1.9.5 and dplyr_0.4.0
library(microbenchmark)
set.seed(245)
actual <- sample(0:1, 1e6 , replace=TRUE)
predicted <- sample(0:1, 1e6, replace=TRUE)
f1 <- function(){
DT <- data.table(actual, predicted)
setkey(DT, actual, predicted)[,.N, .(actual, predicted)]$N}
f2 <- function(){table(actual, predicted)}
f3 <- function() {data_frame(actual, predicted) %>%
group_by(actual, predicted) %>%
summarise(n())}
microbenchmark(f1(), f2(), f3(), unit='relative', times=20L)
#Unit: relative
# expr min lq mean median uq max neval cld
#f1() 1.000000 1.000000 1.000000 1.00000 1.000000 1.000000 20 a
#f2() 20.818410 22.378995 22.321816 22.56931 22.140855 22.984667 20 b
#f3() 1.262047 1.248396 1.436559 1.21237 1.220109 2.504662 20 a
Including count from dplyr and tabulate in the benchmarks as well, on a slightly bigger dataset:
set.seed(498)
actual <- sample(0:1, 1e7 , replace=TRUE)
predicted <- sample(0:1, 1e7, replace=TRUE)
f4 <- function() {data_frame(actual, predicted) %>%
count(actual, predicted)}
f5 <- function(){tabulate(4-actual-2*predicted, 4)}
Update
Including another data.table solution (provided by @Arun) in the benchmarks as well:
f6 <- function() {setDT(list(actual, predicted))[,.N, keyby=.(V1,V2)]$N}
microbenchmark(f1(), f3(), f4(), f5(), f6(), unit='relative', times=20L)
#Unit: relative
#expr min lq mean median uq max neval cld
#f1() 2.003088 1.974501 2.020091 2.015193 2.080961 1.924808 20 c
#f3() 2.488526 2.486019 2.450749 2.464082 2.481432 2.141309 20 d
#f4() 2.388386 2.423604 2.430581 2.459973 2.531792 2.191576 20 d
#f5() 1.034442 1.125585 1.192534 1.217337 1.239453 1.294920 20 b
#f6() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 20 a
Like this:
tabulate(4 - actual - 2*predicted, 4)
(tabulate here is much faster than table because it knows the output will be a vector of length 4).
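The arithmetic just encodes the four (actual, predicted) combinations as bins 1 to 4, in the same order as the four sums in the question; a quick check:
cbind(actual    = c(1, 0, 1, 0),
      predicted = c(1, 1, 0, 0),
      bin       = 4 - c(1, 0, 1, 0) - 2 * c(1, 1, 0, 0))
#      actual predicted bin
# [1,]      1         1   1
# [2,]      0         1   2
# [3,]      1         0   3
# [4,]      0         0   4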
There is table which computes a cross tabulation and should give similar results if actual and predicted contain only zeros and ones:
table(actual, predicted)
Internally, this works by paste-ing the vectors together -- horribly inefficient. The coercion to character seems to happen even when tabulating only a single vector, which is likely also the reason for the poor performance of table(actual*2 + predicted).

aggregate a matrix (or data.frame) by column name groups in R

I have a large matrix with about 3000 columns x 3000 rows. I'd like to aggregate (calculate the mean) by groups of column names, for every row. The columns are named like this (and are in random order):
Tree Tree House House Tree Car Car House
I need the result (the row-wise means per name group) to have the following columns:
Tree House Car
The tricky part (at least for me) is that I do not know all the column names in advance, and they are in random order!
You could try
res1 <- vapply(unique(colnames(m1)), function(x)
rowMeans(m1[,colnames(m1)== x,drop=FALSE], na.rm=TRUE),
numeric(nrow(m1)) )
Or
res2 <- sapply(unique(colnames(m1)), function(x)
rowMeans(m1[,colnames(m1)== x,drop=FALSE], na.rm=TRUE) )
identical(res1,res2)
#[1] TRUE
Another option might be to reshape into long form and then do the aggregation
library(data.table)
res3 <-dcast.data.table(setDT(melt(m1)), Var1~Var2, fun=mean)[,Var1:= NULL]
identical(res1, as.matrix(res3))
[1] TRUE
Benchmarks
The first two methods are considerably faster (roughly 4.5x) than the reshape approach for a 3000*3000 matrix:
set.seed(24)
m1 <- matrix(sample(0:40, 3000*3000, replace=TRUE),
ncol=3000, dimnames=list(NULL, sample(c('Tree', 'House', 'Car'),
3000,replace=TRUE)))
library(microbenchmark)
f1 <-function() {vapply(unique(colnames(m1)), function(x)
rowMeans(m1[,colnames(m1)== x,drop=FALSE], na.rm=TRUE),
numeric(nrow(m1)) )}
f2 <- function() {sapply(unique(colnames(m1)), function(x)
rowMeans(m1[,colnames(m1)== x,drop=FALSE], na.rm=TRUE) )}
f3 <- function() {dcast.data.table(setDT(melt(m1)), Var1~Var2, fun=mean)[,
Var1:= NULL]}
microbenchmark(f1(), f2(), f3(), unit="relative", times=10L)
# Unit: relative
# expr min lq mean median uq max neval
# f1() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10
# f2() 1.026208 1.027723 1.037593 1.034516 1.028847 1.079004 10
# f3() 4.529037 4.567816 4.834498 4.855776 4.930984 5.529531 10
data
set.seed(24)
m1 <- matrix(sample(0:40, 10*40, replace=TRUE), ncol=10,
dimnames=list(NULL, sample(c("Tree", "House", "Car"), 10, replace=TRUE)))
I came up with my own solution. I first just transpose the matrix (calling the result test_mean) so the columns become rows, then aggregate by row name:
# transpose (assuming the original matrix is m1 from the example data above)
test_mean <- t(m1)
# remove any numbers appended to the row names
rownames(test_mean) <- gsub("[0-9.]", "", rownames(test_mean))
# aggregate by row names
test_mean <- aggregate(test_mean, by = list(rownames(test_mean)), FUN = mean)
matrixStats::rowMeans2, with some coercive help from data.table, for the win!
Adding it to the benchmarking from @akrun, we get:
library(matrixStats)
f4 <- function() {
  ucn <- unique(colnames(m1))
  as.matrix(setnames(setDF(lapply(ucn, function(n) rowMeans2(m1, cols = colnames(m1) == n))),
                     ucn))
}
all.equal(f4(), f1())
#[1] TRUE
microbenchmark(f1(), f2(), f3(), f4(), unit = "relative", times = 10L)
#Unit: relative
# expr min lq mean median uq max neval cld
# f1() 1.837496 1.841282 1.823375 1.834471 1.818822 1.749826 10 b
# f2() 1.760133 1.825352 1.817355 1.826257 1.838439 1.793824 10 b
# f3() 15.451106 15.606912 15.847117 15.586192 16.626629 16.104648 10 c
# f4() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10 a

replace loop with an *pply alternative

I am trying to speed up my code by replacing some lookup loops with tapply (How to do vlookup and fill down (like in Excel) in R?) and I stumbled upon this code piece:
DF<-data.frame(id=c(rep("A", 5),rep("B", 7),rep("C", 9)), series=NA, chi=c(letters[1:5], LETTERS[6:12], letters[13:21]))
for (i in unique(DF$id)){
DF$series[ DF$id==i ]<-1:length(DF$id[ DF$id==i ])
}
DF
Is it possible to replace this with an *apply family function? Or any other way to speed it up?
You may try ave:
DF$series <- ave(DF$id, DF$id, FUN = seq_along)
For larger data sets, dplyr is faster though.
library(dplyr)
fun_ave <- function(df) transform(df, series = ave(id, id, FUN = seq_along))
fun_dp <- function(df) df %.%
group_by(id) %.%
mutate(
series = seq_along(id))
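(Note that %.% is the chaining operator from early dplyr releases; with current dplyr the equivalent pipeline would use %>%, roughly:)
# same as fun_dp above, written with the modern pipe; fun_dp2 is just an illustrative name
fun_dp2 <- function(df) df %>%
  group_by(id) %>%
  mutate(series = seq_along(id))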
df <- data.frame(id= sample(letters[1:3], 100000, replace = TRUE))
microbenchmark(fun_ave(df))
# Unit: milliseconds
# expr min lq median uq max neval
# fun_ave(df) 38.59112 39.40802 50.77921 51.2844 128.6791 100
microbenchmark(fun_dp(df))
# Unit: milliseconds
# expr min lq median uq max neval
# fun_dp(df) 4.977035 5.034244 5.060663 5.265173 17.16018 100
Could also use data.table
library(data.table)
DT <- data.table(DF)
DT[, series_new := 1:.N, by = id]
and using tapply
DF$series_new <- unlist(tapply(DF$id, DF$id, function(x) 1:length(x)))
Extending @Henrik's comparison above, both data.table and dplyr are quite a bit faster for large data sets.
library(data.table)
library(dplyr)
df <- data.frame(id= sample(letters[1:3], 100000, replace = TRUE), stringsAsFactors = F)
dt <- data.table(df)
fun_orig <- function(df){
for (i in unique(df$id)){
df$series[df$id==i]<-1:length(df$id[df$id==i])
}}
fun_tapply <- function(df){
df$series <- unlist(tapply(df$id, df$id, function(x) 1:length(x)))
}
fun_ave <- function(df){
transform(df, series = ave(df$id, df$id, FUN = seq_along))
}
fun_dp <- function(df){
df %.%
group_by(id) %.%
mutate(
series = seq_along(id))
}
fun_dt <- function(dt) dt[, 1:.N, by = id]
microbenchmark(fun_dt(dt), times = 1000)
#Unit: milliseconds
# expr min lq median uq max neval
# fun_dt(dt) 2.473253 2.597031 2.771771 3.76307 40.59909 1000
microbenchmark(fun_dp(df), times = 1000)
#Unit: milliseconds
# expr min lq median uq max neval
# fun_dp(df) 2.71375 2.786829 2.914569 3.081609 40.48445 1000
microbenchmark(fun_orig(df), times = 1000)
#Unit: milliseconds
# expr min lq median uq max neval
# fun_orig(df) 30.65534 31.93449 32.72991 33.88885 75.13967 1000
microbenchmark(fun_tapply(df), times = 1000)
#Unit: milliseconds
# expr min lq median uq max neval
# fun_tapply(df) 56.67636 61.72207 66.37193 102.4189 124.6661 1000
microbenchmark(fun_ave(df), times = 1000)
#Unit: milliseconds
# expr min lq median uq max neval
# fun_ave(df) 97.36992 103.161 107.5007 139.1362 157.9464 1000
