Row-wise Mode in data.table R

I am trying to find an efficient way to get row-wise modes on a subset of columns in data.table.
#Sample data
a <- data.frame(
id=letters[],
dattyp1 = sample( 1:2, 26, replace=T) ,
dattyp2 = sample( 1:2, 26, replace=T) ,
dattyp3 = sample( 1:2, 26, replace=T) ,
dattyp4 = sample( 1:2, 26, replace=T) ,
dattyp5 = sample( 1:2, 26, replace=T) ,
dattyp6 = sample( 1:2, 26, replace=T)
)
library(modeest)
library(data.table)
I know from the question "To find row wise Mode of a given data in R" that I can do this:
Mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
apply(a[ ,paste0("dattyp",1:6)], 1, Mode)
But this is really slow (over my millions of records). I am thinking there must be a way to do it with .SDcols, but the following gives column-wise modes, not row-wise ones:
a<- data.table( a )
a[ , lapply(.SD , mfv ), .SDcols=c(paste0("dattyp",1:6) ) ]

I think the fastest way via data.table is still to convert into a relational (i.e. long) format, aggregate, and then find the max, as in the reldtMtd function below. I wonder whether using Rcpp would be faster; a rough sketch follows the timings.
data:
library(data.table)
M <- 1e6
popn <- 2
set.seed(0L)
a <- data.frame(
id=1:M,
dattyp1 = sample(popn, M, replace=TRUE),
dattyp2 = sample(popn, M, replace=TRUE),
dattyp3 = sample(popn, M, replace=TRUE),
dattyp4 = sample(popn, M, replace=TRUE),
dattyp5 = sample(popn, M, replace=TRUE),
dattyp6 = sample(popn, M, replace=TRUE)
)
setDT(a)
methods:
reldtMtd <- function() {
melt(a, id.vars="id")[,
.N, by=.(id, value)][,
value[which.max(N)], by=.(id)]
}
#from https://stackoverflow.com/a/8189441/1989480
Mode <- compiler::cmpfun(function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
})
Mode2 <- compiler::cmpfun(function(x) names(which.max(table(x))))
matA <- as.matrix(a[, -1L])
baseMtd1 <- function() apply(matA, 1, Mode)
baseMtd2 <- function() apply(matA, 1, Mode2)
library(microbenchmark)
microbenchmark(reldtMtd(), baseMtd1(), baseMtd2(), times=3L)
timings:
Unit: seconds
expr min lq mean median uq max neval
reldtMtd() 1.882783 1.947515 2.031767 2.012248 2.106259 2.20027 3
baseMtd1() 15.618716 15.675314 15.809277 15.731913 15.904557 16.07720 3
baseMtd2() 160.837513 161.692634 162.455048 162.547755 163.263816 163.97988 3
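As for the Rcpp idea, here is a rough sketch of a row-wise mode (rowMode()/rcppMtd() are just illustrative names, and it assumes the dattyp columns are integer, as here); I have not benchmarked it against the methods above:
library(Rcpp)
cppFunction('
IntegerVector rowMode(IntegerMatrix m) {
  int n = m.nrow(), k = m.ncol();
  IntegerVector out(n);
  for (int i = 0; i < n; ++i) {
    int best = m(i, 0), bestN = 0;
    for (int j = 0; j < k; ++j) {
      // count how often the value in column j occurs in row i
      int cnt = 0;
      for (int l = 0; l < k; ++l) if (m(i, l) == m(i, j)) ++cnt;
      // keep the first value reaching the highest count (same tie-break as Mode)
      if (cnt > bestN) { bestN = cnt; best = m(i, j); }
    }
    out[i] = best;
  }
  return out;
}')
rcppMtd <- function() rowMode(matA)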

You can try this, though I am not sure how much faster it will be. Note that I am grabbing the first value returned by mfv.
library(modeest)
library(data.table)
a <- data.frame(
id=letters[],
dattyp1 = sample( 1:2, 26, replace=T) ,
dattyp2 = sample( 1:2, 26, replace=T) ,
dattyp3 = sample( 1:2, 26, replace=T) ,
dattyp4 = sample( 1:2, 26, replace=T) ,
dattyp5 = sample( 1:2, 26, replace=T) ,
dattyp6 = sample( 1:2, 26, replace=T)
)
a<- data.table( a )
a[ , Mode:=mfv(c(dattyp1,dattyp2,dattyp3,dattyp4,dattyp5,dattyp6))[1],by=id ]
data.table could be faster.
Apply:
microbenchmark(apply={
apply(a[ ,paste0("dattyp",1:6)], 1, Mode)
})
Unit: microseconds
expr min lq mean median uq max neval
apply 574.025 591.803 1056.807 624.988 704.396 39236.79 100
data.table by:
microbenchmark({
a[ , Mode:=mfv(c(dattyp1,dattyp2,dattyp3,dattyp4,dattyp5,dattyp6))[1],by=id ]
})
Unit: milliseconds
expr: { a[, `:=`(Mode, mfv(c(dattyp1, dattyp2, dattyp3, dattyp4, dattyp5, dattyp6))[1]), by = id] }
     min       lq     mean   median       uq      max neval
 2.44109 2.748053 3.049809 2.898769 3.139559 6.398032   100

data.table runs slower than expected in grouping

Here's a reprex.
library(data.table)
# create a dummy data.table
dt <- data.table(
ts = seq.POSIXt(Sys.time(),by = 60,length.out = 1000),
regn = rpois(1000,100),
VR = rnorm(1000,50,sd = 5),
rv = rnorm(1000,500,sd = 20),
topic = sample(c("A","B","C","D","E"),size = 1000,replace = T),
hrblock = xts::align.time(seq.POSIXt(Sys.time(),by = 60,length.out = 1000),n = 60)
)
# run some groupings
# The groupings are exactly what I need in my production code except
# that here they are working on random data. But still the code
# demonstrates the same reduction in speed as I get in the actual
# code.
microbenchmark::microbenchmark(
dt[,.(.N,t1=first(ts),t2= last(ts),
r1 = fifelse(regn %in% c(100,101,102), first(VR),NA_real_),
r2 = fifelse(regn %in% c(100,101,102), last(VR),NA_real_),
watts = fifelse(regn==101,mean(VR),NA_real_),
l1 = first(rv),l2=last(rv)),
.(hrblock,topic,regn)]
)
#> Unit: milliseconds
#> expr
#> dt[, .(.N, t1 = first(ts), t2 = last(ts), r1 = fifelse(regn %in% c(100, 101, 102), first(VR), NA_real_), r2 = fifelse(regn %in% c(100, 101, 102), last(VR), NA_real_), watts = fifelse(regn == 101, mean(VR), NA_real_), l1 = first(rv), l2 = last(rv)), .(hrblock, topic, regn)]
#> min lq mean median uq max neval
#> 51.30181 54.83056 57.41794 56.55636 57.99337 90.92381 100
Created on 2022-12-19 by the reprex package (v2.0.1)
So a 1000-row data.table is taking close to 56 milliseconds, which looks quite slow. In real life, I run summaries on hundreds of thousands or a million rows, and the user interface becomes very sluggish.
Am I making any fundamental mistake in the grouping?
I tried to use setkey before execution, and it did not speed up the code.
I am expecting a 5- to 10-fold improvement in the response time. Any help will be highly appreciated.
In case you have a large share of groups that contain only 1 row, I strongly suggest splitting the task: a grouping operation for the groups with N > 1 and a simple vectorized operation for the groups with N == 1. This way you can make use of vectorization and avoid unnecessary function calls:
microbenchmark::microbenchmark(
at_once = dt[,.(.N,t1=first(ts),t2= last(ts),
r1 = fifelse(regn %in% c(100,101,102), first(VR),NA_real_),
r2 = fifelse(regn %in% c(100,101,102), last(VR),NA_real_),
watts = fifelse(regn==101,mean(VR),NA_real_),
l1 = first(rv),l2=last(rv)),
.(hrblock,topic,regn)],
splitted = {
dt[, N := .N, by = .(hrblock,topic,regn)]
dt1 <- dt[N > 1,
.(N,
t1=first(ts),t2= last(ts),
r1 = fifelse(regn %in% c(100,101,102), first(VR),NA_real_),
r2 = fifelse(regn %in% c(100,101,102), last(VR),NA_real_),
watts = fifelse(regn==101,mean(VR),NA_real_),
l1 = first(rv),l2=last(rv)),
.(hrblock,topic,regn)]
dt2 <- dt[N == 1,
.(hrblock,topic,regn,
N,
t1 = ts,
t2 = ts,
r1 = fifelse(regn %in% c(100,101,102), VR, NA_real_),
r2 = fifelse(regn %in% c(100,101,102), VR, NA_real_),
watts = fifelse(regn == 101, VR ,NA_real_),
l1 = rv,
l2 = rv)]
rbind(dt1, dt2)
}
)
#>
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> at_once 33.666042 34.058334 37.66860 34.898542 39.61844 136.997209 100
#> splitted 2.610042 2.667168 3.02075 2.972376 3.05921 8.958875 100
all.equal(splitted, at_once)
#> [1] TRUE
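Whether the split pays off depends on how large the share of single-row groups is. A quick way to check that for the dummy data (here almost every hrblock/topic/regn combination is its own group):
dt[, .N, by = .(hrblock, topic, regn)][, table(N)]
# tabulates group sizes; a large count at N = 1 is what makes the split worthwhile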

Sample from a 0:Vector[i]

In R:
I have a vector:
y = sample(0:200, 1e4, replace = TRUE)
I want to create a variable ‘x’ such that:
x = sample(0:y[i], 1e4, replace = TRUE)
where the y[i] are the values y[1], y[2], …, y[10000] created by the sample() call above. For example, if y[1] = 50 then I would like the first entry of x to be a single draw from 0:50, etc. However, I am not sure how to do this. I have tried for loops but have gotten nowhere.
Any help is much appreciated!
How about
x <- as.integer(runif(1e4)*sample(201, 1e4, TRUE))
Since each y[i] is itself a uniform draw from 0:200, sample(201, 1e4, TRUE) plays the role of y + 1, and truncating runif() times that value gives an integer that is uniform on 0:y[i].
Benchmarking:
f1 <- function() sapply(sample(0:200, 1e4, replace = TRUE), function(i) sample(0:i, size = 1))
f2 <- function() as.integer(runif(1e4)*sample(201, 1e4, TRUE))
microbenchmark::microbenchmark(f1 = f1(),
f2 = f2())
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> f1 38877.3 47070.50 49294.770 48625.00 50175.35 97045.0 100
#> f2 508.2 522.05 555.602 531.45 549.45 2080.8 100
This should work:
y = sample(0:200, 1e4, replace = TRUE)
x = sapply(y, \(i) sample(0:i, size = 1))
Or the equivalent using a for loop:
x = numeric(length(y))
for(i in seq_along(y)) {
x[i] = sample(0:y[i], size = 1)
}
If efficiency matters, this might be a bit faster on very long input:
x = floor(runif(length(y), min = 0, max = y + 1))
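A quick sanity check that the vectorized version respects the per-element bounds (runif() recycles the vector passed as max):
all(x >= 0 & x <= y)
# TRUE: floor of a uniform draw on [0, y[i] + 1) is an integer between 0 and y[i]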

How can I create a new vector command within a loop in R?

for (i in 1:100) {
e <- rnorm(n = 20, mean = 100, sd = 10)
}
e
So I want to know how I can run a command (something) on each of the randomly generated vectors of length 20. E.g. how can I tell it to spit out a new result for each new random vector?
for (i in 1:100) {
e <- rnorm(n = 20, mean = 100, sd = 10)
new_vector <- mean(e) - median(e)
}
e
I have tried this but that's definitely not it.
With the OP's code, we need to initialize e and then concatenate to it in each iteration so that the new values are appended:
e <- numeric(0)
for (i in 1:100) {
e <- c(e, rnorm(n = 20, mean = 100, sd = 10))
}
If we want to create a list
e <- vector('list', 100)
for(i in 1:100) {
e[[i]] <- rnorm(n = 20, mean = 100, sd = 10)
}
Or, if the interest is in getting a vector of the differences between mean and median: initialize new_vector with length 100, loop over the sequence 1:100, generate the random numbers in 'e', and assign the difference of mean and median to each position of 'new_vector', using the sequence value as the index.
new_vector <- numeric(100)
for(i in 1:100){
e <- rnorm(n = 20, mean = 100, sd = 10)
new_vector[i] <- mean(e) - median(e)
}
Or using lapply/sapply/replicate
lapply(replicate(100, rnorm(n = 20, mean = 100, sd = 10),
simplify = FALSE), function(x) mean(x) - median(x))
Or with vectorized functions - colMedians (from matrixStats) and colMeans
library(matrixStats)
m1 <- replicate(100, rnorm(n = 20, mean = 100, sd = 10))
colMedians(m1)- colMeans(m1)

Fastest way to map multiple character columns to numerical values

I have an algorithm that at each iteration calculates means for certain groups (the groups do not change, only their values).
The table of the values:
d1 <- data.frame(x = sample(LETTERS, N, replace = TRUE),
y1=rnorm(N))
head(d1)
# x y1
# 1 H -0.7852538
# 2 G -0.6739159
# 3 V -1.7783771
# 4 L -0.2849846
# 5 I -0.1760284
# 6 V -0.2785826
I can calculate the means (in several ways: dplyr, data.table and tapply). I have another data.frame consisting of two columns with the group names.
d2 <- data.frame('group.high' = sample(LETTERS, N * 2, replace = TRUE),
'group.low' = sample(LETTERS, N * 2, replace = TRUE))
head(d2)
# group.high group.low
# 1 U L
# 2 K J
# 3 C Q
# 4 Q A
# 5 Q U
# 6 K W
I want to add two columns, mean.high and mean.low, containing the mean value of each group based on d1.
So far I have tried two options, one with dplyr and one with data.table; in both I had to join twice, and they are similar in speed.
microbenchmark(
dplyr = {
means <- tapply(d1$y1, INDEX = d1$x, FUN = mean)
### Solution 1
dplyr.d2 <- left_join(d2,data.frame('group.high' = names(means),
'mean.high' = means, stringsAsFactors = FALSE) ) %>%
left_join(., data.frame('group.low' = names(means),
'mean.low' = means, stringsAsFactors = FALSE))},
data.table = {
### Solution 2
d1 <- as.data.table(d1)
d2 <- as.data.table(d2)
means <- d1[ ,.(means = mean(y1)), by = x]
new.d2 <- data.table::merge.data.table(x = d2, y = means, by.x = 'group.high', by.y = 'x')
data.table.d2 <- data.table::merge.data.table(x = new.d2, y = means, by.x = 'group.low', by.y = 'x')
}
)
Unit: milliseconds
expr min lq mean median uq max neval cld
dplyr 34.0837 36.88650 53.22239 42.9227 47.50660 231.5066 100 a
data.table 40.2071 47.70735 87.46804 51.2517 59.05385 258.4999 100 b
Is there a better way? How can I speed the calculation?
As mentioned in the comments, there is an iterative process of updating the values. Here is an example.
N <- 10000
iterFuncDplyr <- function(d1, d2) {
means <- tapply(d1$y1, INDEX = d1$x, FUN = mean) # group means, as in Solution 1 above
dplyr.d2 <- left_join(d2,data.frame('group.high' = names(means),
'mean.high' = means, stringsAsFactors = FALSE) ) %>%
left_join(., data.frame('group.low' = names(means),
'mean.low' = means, stringsAsFactors = FALSE))
return(var(d1$y1))
}
iterFuncData <- function(d1, d2) {
means <- d1[ ,.(means = mean(y1)), by = x]
new.d2 <- data.table:::merge.data.table(x = d2, y = means, by.x = 'group.high', by.y = 'x')
data.table.d2 <- data.table:::merge.data.table(x = new.d2, y = means, by.x = 'group.low', by.y = 'x')
return(var(d1$y1))
}
d1 <- data.frame(x = sample(LETTERS, N, replace = TRUE),
y1=rnorm(N))
d2 <- data.frame('group.high' = sample(LETTERS, N * 2, replace = TRUE),
'group.low' = sample(LETTERS, N * 2, replace = TRUE))
library(data.table)
library(dplyr)
microbenchmark::microbenchmark(dplyr = {
temp.val <- 0
for (i in 1:10) {
d1$y1 <- temp.val + rnorm(N)
temp.val <- iterFuncDplyr(d1, d2)
}},
data.table = {
d1 <- as.data.table(d1)
d2 <- as.data.table(d2)
temp.val <- 0
for (i in 1:10) {
d1$y1 <- temp.val + rnorm(N)
temp.val <- iterFuncData(d1, d2)
}
}
)
Unit: milliseconds
expr min lq mean median uq max neval
dplyr 46.22904 50.67959 52.78275 51.96358 53.34825 108.2874 100
data.table 63.81111 67.13257 70.85537 69.85712 72.72446 127.4228 100
You could subset the named vector means to create new columns and match your output:
means <- tapply(d1$y1, INDEX = d1$x, FUN = mean)
d2$mean.high <- means[d2$group.high]
d2$mean.low <- means[d2$group.low]
identical(as.matrix(d2), as.matrix(d3)) #factor vs character, used d3 w/ benchmark
[1] TRUE
Unit: microseconds
expr min lq mean median uq max neval
dplyr 4868.2 5316.25 5787.123 5524.15 5892.70 12187.3 100
data.table 8254.4 9606.60 10438.424 10118.35 10771.75 20966.5 100
subset 481.2 529.40 651.194 550.35 582.55 7849.9 100
Benchmark code:
d3 <- d2
microbenchmark::microbenchmark( # N = 10000
dplyr = {
means <- tapply(d1$y1, INDEX = d1$x, FUN = mean)
### Solution 1
dplyr.d2 <- left_join(d2,data.frame('group.high' = names(means),
'mean.high' = means, stringsAsFactors = FALSE) ) %>%
left_join(., data.frame('group.low' = names(means),
'mean.low' = means, stringsAsFactors = FALSE))},
data.table = {
### Solution 2
d1 <- as.data.table(d1)
d2 <- as.data.table(d2)
means <- d1[ ,.(means = mean(y1)), by = x]
new.d2 <- data.table::merge.data.table(x = d2, y = means, by.x = 'group.high', by.y = 'x')
data.table.d2 <- data.table::merge.data.table(x = new.d2, y = means, by.x = 'group.low', by.y = 'x')
},
subset = {
means <- tapply(d1$y1, INDEX = d1$x, FUN = mean)
d3$mean.high <- means[d2$group.high]
d3$mean.low <- means[d2$group.low]
}
)
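For the iterative use case in the question, the same named-vector lookup can be wrapped in a function analogous to iterFuncDplyr()/iterFuncData(); iterFuncSubset() below is only a sketch of that idea (as.character() guards against factor columns):
iterFuncSubset <- function(d1, d2) {
  # group means as a named vector, then a plain lookup by group name
  means <- tapply(d1$y1, INDEX = d1$x, FUN = mean)
  d2$mean.high <- means[as.character(d2$group.high)]
  d2$mean.low  <- means[as.character(d2$group.low)]
  return(var(d1$y1))
}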
Here is an answer very similar to Andrew's but relying on data.table instead of tapply() (which seems faster for very big N).
library(data.table)
# Create a named vector "means"
means <- setDT(d1)[, mean(y1), by = x][, setNames(V1, x)]
setDT(d2)[, c("mean.high.means", "mean.low.means") :=
.(means[as.character(group.high)], means[as.character(group.low)])]
Output:
group.high group.low mean.high.means mean.low.means
1: Z W 0.017032792 0.0091625547
2: A A 0.013796137 0.0137961371
3: V S -0.011570159 0.0004560325
4: D X 0.005475629 0.0200984250
5: U H -0.008249901 0.0054537833
---
199996: H K 0.005453783 0.0079905631
199997: A T 0.013796137 -0.0068537963
199998: W U 0.009162555 -0.0082499015
199999: T V -0.006853796 -0.0115701585
200000: G J 0.014829259 0.0206598470
Reproducible data:
N = 1e5
set.seed(1)
d1 <- data.frame(
x = sample(LETTERS, N, replace = TRUE),
y1 = rnorm(N)
)
d2 <- data.frame(
group.high = sample(LETTERS, N * 2, replace = TRUE),
group.low = sample(LETTERS, N * 2, replace = TRUE)
)

create data frame containing rows that add up to 100

This is my first stab at this:
library(dplyr)
step_size <- 5
grid <- expand.grid(
x1 = seq(0, 100, step_size)
, x2 = seq(0, 100, step_size)
, x3 = seq(0, 100, step_size)
)
grid$sum = grid$x1 + grid$x2 + grid$x3
grid$x1 <- (grid$x1 / grid$sum) * 100
grid$x2 <- (grid$x2 / grid$sum) * 100
grid$x3 <- (grid$x3 / grid$sum) * 100
grid$sum <- grid$x1 + grid$x2 + grid$x3
nrow(grid)
result <- distinct(grid) %>% filter(!is.na(sum))
head(result, 20)
nrow(result)
Basically, I want to create a data frame that contains as many rows as possible that add up to 100 and are uniformly distributed.
Is there an easier or better approach in R? Thanks!
Using data.table...
library(data.table)
grid <- expand.grid(
x1 = seq(0, 100)
, x2 = seq(0, 100)
, x3 = seq(0, 100)
)
setDT(grid)
res <- grid[grid[, rowSums(.SD) == 100], ]
res[, summation := rowSums(.SD)]
Result:
> res[, unique(summation)]
[1] 100
This can also be done in base but data.table is faster:
library(data.table)
grid <- expand.grid(
x1 = seq(0, 100)
, x2 = seq(0, 100)
, x3 = seq(0, 100)
)
grid2 <- expand.grid(
x1 = seq(0, 100)
, x2 = seq(0, 100)
, x3 = seq(0, 100)
)
setDT(grid)
microbenchmark::microbenchmark(
data.table = {
res <- grid[grid[, rowSums(.SD) == 100], ]
},
base = {
res2 <- grid2[rowSums(grid2) == 100, ]
}
)
Unit: milliseconds
expr min lq mean median uq max neval cld
data.table 59.41157 89.6700 109.0462 107.7415 124.2675 183.9730 100 a
base 65.70521 109.6471 154.1312 125.4238 156.9168 611.0169 100 b
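Since x3 is fully determined by x1 and x2, another option is to enumerate only x1 and x2 and compute x3 directly instead of filtering the full 101^3 grid; this is just a sketch of that construction (grid3 is an illustrative name), and it yields the same choose(102, 2) = 5151 triples:
grid3 <- expand.grid(x1 = 0:100, x2 = 0:100)
grid3 <- grid3[grid3$x1 + grid3$x2 <= 100, ]  # keep pairs where x3 would be non-negative
grid3$x3 <- 100 - grid3$x1 - grid3$x2
nrow(grid3)  # 5151 rows, each summing to exactly 100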
Here's a simple function. You can specify how many rows/columns you want, and what each row sums to.
func <- function(cols = 3, rows = 10, rowTotal = 100) {
dt1 <- replicate(n = cols, runif(n = rows))
dt1 <- data.frame(apply(X = dt1, MARGIN = 2, FUN = function(x) x / rowSums(dt1) * rowTotal))
return(dt1)
}
rowSums(func()) # default values (3 cols, 10 rows, each row sums to 100)
rowSums(func(cols = 5, rows = 10, rowTotal = 50)) # 5 cols, 10 rows, each row sums to 50
