Fastest way to map multiple character columns to numerical values - r

I have an algorithm that at each iteration calculates means for certain groups (the groups do not change only their values).
The table of the values -
d1 <- data.frame(x = sample(LETTERS, N, replace = TRUE),
y1=rnorm(N))
head(d1)
# x y1
# 1 H -0.7852538
# 2 G -0.6739159
# 3 V -1.7783771
# 4 L -0.2849846
# 5 I -0.1760284
# 6 V -0.2785826
I can calculate the means (in several ways: dplyr, data.table and tapply). I have another data.frame consisting of two columns with the group names.
d2 <- data.frame('group.high' = sample(LETTERS, N * 2, replace = TRUE),
'group.low' = sample(LETTERS, N * 2, replace = TRUE))
head(d2)
# group.high group.low
# 1 U L
# 2 K J
# 3 C Q
# 4 Q A
# 5 Q U
# 6 K W
I want to add to columns, mean.high and mean.better, of the mean values of each group based on d1.
So far I have tried two options from dplyr and data.table. I had to use left_join twice in either of them. They are both similar in speed.
microbenchmark(
dplyr = {
means <- tapply(d1$y1, INDEX = d1$x, FUN = mean)
### Solution 1
dplyr.d2 <- left_join(d2,data.frame('group.high' = names(means),
'mean.high' = means, stringsAsFactors = FALSE) ) %>%
left_join(., data.frame('group.low' = names(means),
'mean.low' = means, stringsAsFactors = FALSE))},
data.table = {
### Solution 2
d1 <- as.data.table(d1)
d2 <- as.data.table(d2)
means <- d1[ ,.(means = mean(y1)), by = x]
new.d2 <- data.table::merge.data.table(x = d2, y = means, by.x = 'group.high', by.y = 'x')
data.table.d2 <- data.table::merge.data.table(x = new.d2, y = means, by.x = 'group.low', by.y = 'x')
}
)
Unit: milliseconds
expr min lq mean median uq max neval cld
dplyr 34.0837 36.88650 53.22239 42.9227 47.50660 231.5066 100 a
data.table 40.2071 47.70735 87.46804 51.2517 59.05385 258.4999 100 b
Is there a better way? How can I speed the calculation?
As mentioned in the comments, there is an iterative process of updating the values. Here is an example.
N <- 10000
iterFuncDplyr <- function(d1, d2) {
dplyr.d2 <- left_join(d2,data.frame('group.high' = names(means),
'mean.high' = means, stringsAsFactors = FALSE) ) %>%
left_join(., data.frame('group.low' = names(means),
'mean.low' = means, stringsAsFactors = FALSE))
return(var(d1$y1))
}
iterFuncData <- function(d1, d2) {
means <- d1[ ,.(means = mean(y1)), by = x]
new.d2 <- data.table:::merge.data.table(x = d2, y = means, by.x = 'group.high', by.y = 'x')
data.table.d2 <- data.table:::merge.data.table(x = new.d2, y = means, by.x = 'group.low', by.y = 'x')
return(var(d1$y1))
}
d1 <- data.frame(x = sample(LETTERS, N, replace = TRUE),
y1=rnorm(N))
d2 <- data.frame('group.high' = sample(LETTERS, N * 2, replace = TRUE),
'group.low' = sample(LETTERS, N * 2, replace = TRUE))
library(data.table)
library(dplyr)
microbenchmark::microbenchmark(dplyr = {
temp.val <- 0
for (i in 1:10) {
d1$y1 <- temp.val + rnorm(N)
temp.val <- iterFuncDplyr(d1, d2)
}},
data.table = {
d1 <- as.data.table(d1)
d2 <- as.data.table(d2)
temp.val <- 0
for (i in 1:10) {
d1$y1 <- temp.val + rnorm(N)
temp.val <- iterFuncData(d1, d2)
}
}
)
Unit: milliseconds
expr min lq mean median uq max neval
dplyr 46.22904 50.67959 52.78275 51.96358 53.34825 108.2874 100
data.table 63.81111 67.13257 70.85537 69.85712 72.72446 127.4228 100

You could subset the named vector means to create new columns and match your output:
means <- tapply(d1$y1, INDEX = d1$x, FUN = mean)
d2$mean.high <- means[d2$group.high]
d2$mean.low <- means[d2$group.low]
identical(as.matrix(d2), as.matrix(d3)) #factor vs character, used d3 w/ benchmark
[1] TRUE
Unit: microseconds
expr min lq mean median uq max neval
dplyr 4868.2 5316.25 5787.123 5524.15 5892.70 12187.3 100
data.table 8254.4 9606.60 10438.424 10118.35 10771.75 20966.5 100
subset 481.2 529.40 651.194 550.35 582.55 7849.9 100
Benchmark code:
d3 <- d2
microbenchmark::microbenchmark( # N = 10000
dplyr = {
means <- tapply(d1$y1, INDEX = d1$x, FUN = mean)
### Solution 1
dplyr.d2 <- left_join(d2,data.frame('group.high' = names(means),
'mean.high' = means, stringsAsFactors = FALSE) ) %>%
left_join(., data.frame('group.low' = names(means),
'mean.low' = means, stringsAsFactors = FALSE))},
data.table = {
### Solution 2
d1 <- as.data.table(d1)
d2 <- as.data.table(d2)
means <- d1[ ,.(means = mean(y1)), by = x]
new.d2 <- data.table::merge.data.table(x = d2, y = means, by.x = 'group.high', by.y = 'x')
data.table.d2 <- data.table::merge.data.table(x = new.d2, y = means, by.x = 'group.low', by.y = 'x')
},
subset = {
means <- tapply(d1$y1, INDEX = d1$x, FUN = mean)
d3$mean.high <- means[d2$group.high]
d3$mean.low <- means[d2$group.low]
}
)

Here is an answer very similar to Andrew's but relying on data.table instead of tapply() (which seems faster for very big N).
library(data.table)
# Create a named vector "means"
means <- setDT(d1)[, mean(y1), by = x][, setNames(V1, x)]
setDT(d2)[, c("mean.high.means", "mean.low.means") :=
.(means[as.character(group.high)], means[as.character(group.low)])]
Output:
group.high group.low mean.high.means mean.low.means
1: Z W 0.017032792 0.0091625547
2: A A 0.013796137 0.0137961371
3: V S -0.011570159 0.0004560325
4: D X 0.005475629 0.0200984250
5: U H -0.008249901 0.0054537833
---
199996: H K 0.005453783 0.0079905631
199997: A T 0.013796137 -0.0068537963
199998: W U 0.009162555 -0.0082499015
199999: T V -0.006853796 -0.0115701585
200000: G J 0.014829259 0.0206598470
Reproducible data:
N = 1e5
set.seed(1)
d1 <- data.frame(
x = sample(LETTERS, N, replace = TRUE),
y1 = rnorm(N)
)
d2 <- data.frame(
group.high = sample(LETTERS, N * 2, replace = TRUE),
group.low = sample(LETTERS, N * 2, replace = TRUE)
)

Related

Sample from a 0:Vector[i]

In R:
I have a vector **
y = sample(0:200, 1e4, replace = TRUE)
I want to create a variable ‘x’ such that:
x = sample(0:y[i], 1e4, replace = TRUE)
Where y[i] are the values of y1, y2, …, y1e4 created from the sample function before. For example if y1 = 50 then I would like the first entry of x = sample(0:50) etc. However I am not sure how to do this. I have tried for loops but have gotten no where.
Any help is much appreciated!
How about
x <- as.integer(runif(1e4)*sample(201, 1e4, TRUE))
Benchmarking:
f1 <- function() sapply(sample(0:200, 1e4, replace = TRUE), function(i) sample(0:i, size = 1))
f2 <- function() as.integer(runif(1e4)*sample(201, 1e4, TRUE))
microbenchmark::microbenchmark(f1 = f1(),
f2 = f2())
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> f1 38877.3 47070.50 49294.770 48625.00 50175.35 97045.0 100
#> f2 508.2 522.05 555.602 531.45 549.45 2080.8 100
This should work:
y = sample(0:200, 1e4, replace = TRUE)
x = sapply(y, \(i) sample(0:i, size = 1))
Or the equivalent using a for loop:
x = numeric(length(y))
for(i in seq_along(y)) {
x[i] = sample(0:y[i], size = 1)
}
If efficiency matters, this might be a bit faster on very long input:
x = floor(runif(length(y), min = 0, max = y + 1))

How execute pairwise.t.test into a list with `for` loop?

My list (lt):
df_1 <- data.frame(
x = replicate(
n = 2,
expr = runif(n = 30, min = 20, max = 100)
),
y = sample(
x = 1:3, size = 30, replace = TRUE
)
)
lt <- split(
x = df_1,
f = df_1[['y']]
)
vars <- names(df_1)[1:2]
I try:
for (i in vars) {
for (i in i) {
print(pairwise.t.test(x = lt[, i], g = lt[['y']], p.adj = 'bonferroni'))
}
}
But, the error message is:
Error in lista[, i] : incorrect number of dimensions
What's problem?
We don't need to split
pairwise.t.test(unlist(df_1[1:2]), g = rep(df_1$y, 2), p.adj = 'bonferroni')
#Pairwise comparisons using t tests with pooled SD
#data: unlist(df_1[1:2]) and rep(df_1$y, 2)
# 1 2
#2 1.00 -
#3 0.91 1.00

create data frame containing rows that add up to 100

This is my first stab at this:
library(dplyr)
step_size <- 5
grid <- expand.grid(
x1 = seq(0, 100, step_size)
, x2 = seq(0, 100, step_size)
, x3 = seq(0, 100, step_size)
)
grid$sum = grid$x1 + grid$x2 + grid$x3
grid$x1 <- (grid$x1 / grid$sum) * 100
grid$x2 <- (grid$x2 / grid$sum) * 100
grid$x3 <- (grid$x3 / grid$sum) * 100
grid$sum <- grid$x1 + grid$x2 + grid$x3
nrow(grid)
result <- distinct(grid) %>% filter(!is.na(sum))
head(result, 20)
nrow(result)
Basically, I want to create a data frame that contains as many rows as possible that add up to 100 and are uniformly distributed.
is there an easier better approach in R? thanks!
Using data.table...
library(data.table)
grid <- expand.grid(
x1 = seq(0, 100)
, x2 = seq(0, 100)
, x3 = seq(0, 100)
)
setDT(grid)
res <- grid[grid[, rowSums(.SD) == 100], ]
res[, summation := rowSums(.SD)]
Result:
> res[, unique(summation)]
[1] 100
This can also be done in base but data.table is faster:
library(data.table)
grid <- expand.grid(
x1 = seq(0, 100)
, x2 = seq(0, 100)
, x3 = seq(0, 100)
)
grid2 <- expand.grid(
x1 = seq(0, 100)
, x2 = seq(0, 100)
, x3 = seq(0, 100)
)
setDT(grid)
microbenchmark::microbenchmark(
data.table = {
res <- grid[grid[, rowSums(.SD) == 100], ]
},
base = {
res2 <- grid2[rowSums(grid2) == 100, ]
}
)
Unit: milliseconds
expr min lq mean median uq max neval cld
data.table 59.41157 89.6700 109.0462 107.7415 124.2675 183.9730 100 a
base 65.70521 109.6471 154.1312 125.4238 156.9168 611.0169 100 b
Here's a simple function. You can specify how many rows/columns you want, and what each row sums to.
func <- function(cols = 3, rows = 10, rowTotal = 100) {
dt1 <- replicate(n = cols, runif(n = rows))
dt1 <- data.frame(apply(X = dt1, MARGIN = 2, FUN = function(x) x / rowSums(dt1) * rowTotal))
return(dt1)
}
rowSums(func()) # default values (3 cols, 10 rows, each row sums to 100)
rowSums(func(cols = 5, rows = 10, rowTotal = 50)) # 5 cols, 10 rows, row sums to 50)

Data.table - subsetting within groups during group by is slow

I'm trying to produce several aggregate statistics, and some of them need to be produced on a subset of each group. The data.table is quite large, 10 million rows, but using by without column subsetting is blazing fast (less than a second). Adding just one additional column which needs to be calculated on a subset of each group increases the running time by factor of 12.
Is the a faster way to do this? Below is my full code.
library(data.table)
library(microbenchmark)
N = 10^7
DT = data.table(id1 = sample(1:400, size = N, replace = TRUE),
id2 = sample(1:100, size = N, replace = TRUE),
id3 = sample(1:50, size = N, replace = TRUE),
filter_var = sample(1:10, size = N, replace = TRUE),
x1 = sample(1:1000, size = N, replace = TRUE),
x2 = sample(1:1000, size = N, replace = TRUE),
x3 = sample(1:1000, size = N, replace = TRUE),
x4 = sample(1:1000, size = N, replace = TRUE),
x5 = sample(1:1000, size = N, replace = TRUE) )
setkey(DT, id1,id2,id3)
microbenchmark(
DT[, .(
sum_x1 = sum(x1),
sum_x2 = sum(x2),
sum_x3 = sum(x3),
sum_x4 = sum(x4),
sum_x5 = sum(x5),
avg_x1 = mean(x1),
avg_x2 = mean(x2),
avg_x3 = mean(x3),
avg_x4 = mean(x4),
avg_x5 = mean(x5)
) , by = c('id1','id2','id3')] , unit = 's', times = 10L)
min lq mean median uq max neval
0.942013 0.9566891 1.004134 0.9884895 1.031334 1.165144 10
microbenchmark( DT[, .(
sum_x1 = sum(x1),
sum_x2 = sum(x2),
sum_x3 = sum(x3),
sum_x4 = sum(x4),
sum_x5 = sum(x5),
avg_x1 = mean(x1),
avg_x2 = mean(x2),
avg_x3 = mean(x3),
avg_x4 = mean(x4),
avg_x5 = mean(x5),
sum_x1_F1 = sum(x1[filter_var < 5]) #this line slows everything down
) , by = c('id1','id2','id3')] , unit = 's', times = 10L)
min lq mean median uq max neval
12.24046 12.4123 12.83447 12.72026 13.49059 13.61248 10
GForce makes grouped operations run faster and will work on expressions like list(x = funx(X), y = funy(Y)), ...) where X and Y are column names and funx and funy belong to the set of optimized functions.
For a full description of what works, see ?GForce.
To test if an expression works, read the messages from DT[, expr, by=, verbose=TRUE].
In the OP's case, we have sum_x1_F1 = sum(x1[filter_var < 5]) which is not covered by GForce even though sum(v) is. In this special case, we can make a var v = x1*condition and sum that:
DT[, v := x1*(filter_var < 5)]
system.time( DT[, .(
sum_x1 = sum(x1),
sum_x2 = sum(x2),
sum_x3 = sum(x3),
sum_x4 = sum(x4),
sum_x5 = sum(x5),
avg_x1 = mean(x1),
avg_x2 = mean(x2),
avg_x3 = mean(x3),
avg_x4 = mean(x4),
avg_x5 = mean(x5),
sum_x1_F1 = sum(v)
) , by = c('id1','id2','id3')])
# user system elapsed
# 0.63 0.19 0.81
For comparison, timing the OP's code on my computer:
system.time( DT[, .(
sum_x1 = sum(x1),
sum_x2 = sum(x2),
sum_x3 = sum(x3),
sum_x4 = sum(x4),
sum_x5 = sum(x5),
avg_x1 = mean(x1),
avg_x2 = mean(x2),
avg_x3 = mean(x3),
avg_x4 = mean(x4),
avg_x5 = mean(x5),
sum_x1_F1 = sum(x1[filter_var < 5]) #this line slows everything down
) , by = c('id1','id2','id3')])
# user system elapsed
# 9.00 0.02 9.06

R data.table Performing a by aggregation on a joined column

I want to do an aggregation on the result of a join (x and i are keyed data.tables) without materializing the whole result. It is similar to what .EACHI tries to resolve but for columns that are added by the join.
D1 <- data.table(x = rnorm(1:100), i = sample(1:100, 100, replace = TRUE), k = sample(1:100, 100, replace = TRUE))
D2 <- data.table(x = rnorm(1:100), j = sample(1:100, 100, replace = TRUE), k = sample(1:100, 100, replace = TRUE))
setkey(D1, k)
setkey(D2, k)
I would like to sum x on the result of the join of D1 and D2 for the pairs i and j without materializing the result of D1[D2]
D1[D2, list(x = sum(x * i.x)), by = list(i, j), allow.cartesian = TRUE] ## Fails
D1[D2, allow.cartesian = TRUE][, list(x = sum(x * i.x)), by = list(i, j)] ## Is the result I want but uses up more memory
Is there a way to do this? If not will it be implemented in a future version?

Resources