I am trying to define a function that takes a data frame or data table as input, with a given set of ID columns (e.g., 2 or 3 of them); the remaining columns NAME1, NAME2, ..., NAMEK are numeric. The output should be a data table with the same ID columns as before plus one additional ID column that identifies each unique pairwise combination of the numeric column names (NAME1, NAME2, ...). In addition, the actual values of the numeric columns must be gathered into two new value columns according to that new ID column. Here is an example with two ID columns and three numeric columns:
library(data.table)
ID1 <- c("A","A","A","B","B","B")
ID2 <- c(1,2,3,1,2,3)
NAME1 <- c(10,11,9,22,25,22)
NAME2 <- c(7,9,8,20,22,21)
NAME3 <- c(10,12,11,15,19,30)
DT <- data.table(ID1,ID2,NAME1,NAME2,NAME3)
I want the output of the function with DT as input to be
ID.new <- c("NAME1 - NAME2","NAME1 - NAME2","NAME1 - NAME2", "NAME1 - NAME2",
"NAME1 - NAME2","NAME1 - NAME2", "NAME1 - NAME3", "NAME1 - NAME3",
"NAME1 - NAME3","NAME1 - NAME3","NAME1 - NAME3","NAME1 - NAME3",
"NAME2 - NAME3","NAME2 - NAME3","NAME2 - NAME3","NAME2 - NAME3",
"NAME2 - NAME3", "NAME2 - NAME3")
ID1 <- c("A","A","A","B","B","B","A","A","A","B","B","B","A","A","A","B","B","B")
ID2 <- c(1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3)
value.left <- c(10,11,9,22,25,22,10,11,9,22,25,22,7,9,8,20,22,21)
value.right <- c(7,9,8,20,22,21,10,12,11,15,19,30,10,12,11,15,19,30)
DT.output <- data.table(ID.new,ID1,ID2,value.left,value.right)
I have found that fun() (see below) does the job, but is too slow for my liking:
fun <- function(data, ID.cols){
data <- data.table(data)
# Which of the columns are ID columns
ids <- which(colnames(data) %in% ID.cols)
# Obtain all pairwise combinations of numeric columns into a list
numeric.combs <- combn(x = data.table(data)[,!ids, with = FALSE], m = 2, simplify = FALSE)
id.cols <- data[,ids, with = FALSE]
# bind the ID columns to each pairwise combination of numeric columns inside the list
bind.columns.each.numeric.comb <- lapply(X = numeric.combs, FUN = function(x) cbind(id.cols,x))
# Create generic names for the numeric columns so that rbindlist() may be applied. In addition we make a new column that groups based on which columns we are considering
generalize <- suppressWarnings(lapply(X = bind.columns.each.numeric.comb, FUN = function(x)
setattr(x = x[,ID.NEW:=paste(colnames(x[,!ids,with=FALSE]),collapse=" - ")], name =
'names', value = c(ID.cols,"value.left","value.right","ID.NEW"))))
return(rbindlist(l=generalize))
}
# Performance
library(microbenchmark)
print(microbenchmark(fun(DT, ID.cols = c("ID1", "ID2")), times = 1000))
Is there a faster and more elegant way to do this?
A melted, self-join option:
library(data.table)
DTlong <- melt(DT, id.vars = c("ID1", "ID2"), variable.factor = FALSE)
out <- DTlong[DTlong, on = .(ID1, ID2), allow.cartesian = TRUE
][variable < i.variable,
][, .(ID.new = paste(variable, i.variable, sep = " - "),
ID1, ID2, value.left = value, value.right = i.value)]
out
# ID.new ID1 ID2 value.left value.right
# <char> <char> <num> <num> <num>
# 1: NAME1 - NAME2 A 1 10 7
# 2: NAME1 - NAME2 A 2 11 9
# 3: NAME1 - NAME2 A 3 9 8
# 4: NAME1 - NAME2 B 1 22 20
# 5: NAME1 - NAME2 B 2 25 22
# 6: NAME1 - NAME2 B 3 22 21
# 7: NAME1 - NAME3 A 1 10 10
# 8: NAME2 - NAME3 A 1 7 10
# 9: NAME1 - NAME3 A 2 11 12
# 10: NAME2 - NAME3 A 2 9 12
# 11: NAME1 - NAME3 A 3 9 11
# 12: NAME2 - NAME3 A 3 8 11
# 13: NAME1 - NAME3 B 1 22 15
# 14: NAME2 - NAME3 B 1 20 15
# 15: NAME1 - NAME3 B 2 25 19
# 16: NAME2 - NAME3 B 2 22 19
# 17: NAME1 - NAME3 B 3 22 30
# 18: NAME2 - NAME3 B 3 21 30
### validation
setorder(out, ID.new, ID1, ID2)
identical(DT.output, out)
# [1] TRUE
The combn methodology is sound, certainly; its main inefficiency is that it iterates once per combination. That is, the function passed to combn(..., FUN=) is called once for each pair of columns, so with much wider data it will be called many more times. With the merge/join here, by contrast, everything is done in as vectorized a way as we can manage: the merge is done efficiently, the filtering is a single logical vector, and the paste(..) works on one large vector.
The merge concept does have its own inefficiency, to be fair: it first produces 54 rows because of the cartesian join, which can cause memory-exhaustion problems with much larger data. If you run into this, it may help to use a non-equi join (e.g., the fuzzyjoin package, or data.table's on= syntax) that includes variable < variable (LHS vs RHS) in the join condition itself, which should reduce (if not completely remove) the problem.
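As a rough sketch of that last suggestion (the same idea is developed more fully in the non-equi self-join answer further down this thread), the inequality can be pushed into data.table's on= clause so the 54-row cartesian intermediate is never materialised:
mdt <- melt(DT, id.vars = c("ID1", "ID2"))   # 'variable' stays a factor here
out2 <- mdt[mdt, on = .(ID1, ID2, variable < variable), nomatch = NULL,
            .(ID.new = paste(x.variable, i.variable, sep = " - "),
              ID1, ID2, value.left = x.value, value.right = i.value)]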
This last recommendation can be done in sqldf as well:
sqldf::sqldf("
select t1.variable || ' - ' || t2.variable as [ID.new], t1.ID1, t1.ID2,
t1.value as [value.left], t2.value as [value.right]
from DTlong t1
join DTlong t2 on t1.ID1=t2.ID1 and t1.ID2=t2.ID2
and t1.variable < t2.variable")
# ID.new ID1 ID2 value.left value.right
# 1 NAME1 - NAME2 A 1 10 7
# 2 NAME1 - NAME3 A 1 10 10
# 3 NAME1 - NAME2 A 2 11 9
# 4 NAME1 - NAME3 A 2 11 12
# 5 NAME1 - NAME2 A 3 9 8
# 6 NAME1 - NAME3 A 3 9 11
# 7 NAME1 - NAME2 B 1 22 20
# 8 NAME1 - NAME3 B 1 22 15
# 9 NAME1 - NAME2 B 2 25 22
# 10 NAME1 - NAME3 B 2 25 19
# 11 NAME1 - NAME2 B 3 22 21
# 12 NAME1 - NAME3 B 3 22 30
# 13 NAME2 - NAME3 A 1 7 10
# 14 NAME2 - NAME3 A 2 9 12
# 15 NAME2 - NAME3 A 3 8 11
# 16 NAME2 - NAME3 B 1 20 15
# 17 NAME2 - NAME3 B 2 22 19
# 18 NAME2 - NAME3 B 3 21 30
Benchmarking:
bench::mark(
pernkf = fun(DT, c("ID1", "ID2")),
tjebo = fun2(DT, c("ID1", "ID2")),
r2evans = fun3(DT, c("ID1", "ID2")), # native data.table
r2evans2 = fun4(), # sqldf
check = FALSE)
# # A tibble: 4 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 pernkf 5.38ms 6.06ms 161. 287KB 13.2 61 5 379ms <NULL> <Rprofmem[,3~ <bch:tm~ <tibble [~
# 2 tjebo 5.08ms 5.63ms 172. 230KB 8.83 78 4 453ms <NULL> <Rprofmem[,3~ <bch:tm~ <tibble [~
# 3 r2evans 2.97ms 3.48ms 280. 170KB 11.0 127 5 454ms <NULL> <Rprofmem[,3~ <bch:tm~ <tibble [~
# 4 r2evans2 17.19ms 18.91ms 52.0 145KB 13.0 20 5 384ms <NULL> <Rprofmem[,3~ <bch:tm~ <tibble [~
(sqldf does take a performance hit in this example; I'd welcome improved queries :-)
If you can use a data frame, the approach below is currently the most speed- and memory-efficient one (see the benchmark wiki).
The approach using combn() seems reasonable to me, and I don't think the per-combination iteration is as costly as has been suggested. Moreover, I personally find this easier to read than the data.table melt version, but that is probably because I'm not used to data.table syntax.
Note: using this on a data.table is apparently not efficient. If you really need a data.table, r2evans' solution is better.
fun2 <- function(data, ID.cols){
ids <- which(colnames(data) %in% ID.cols)
## you can loop over the combinations directly
new_dat <- combn(data[-ids], 2, function(x) {
new_x <- setNames(x, paste("value", c("left", "right"), sep = "."))
## use paste with collapse for the ID.new
new_x$ID.new <- paste(names(x), collapse = " - ")
new_x
}, simplify = FALSE)
## bind it with the old ID columns, outside the loop (bit faster)
cbind(do.call(rbind, new_dat), data[ids])
}
fun2(DT,ID.cols = c("ID1", "ID2"))
#> value.left value.right ID.new ID1 ID2
#> 1 10 7 NAME1 - NAME2 A 1
#> 2 11 9 NAME1 - NAME2 A 2
#> 3 9 8 NAME1 - NAME2 A 3
#> 4 22 20 NAME1 - NAME2 B 1
#> 5 25 22 NAME1 - NAME2 B 2
#> 6 22 21 NAME1 - NAME2 B 3
#> 7 10 10 NAME1 - NAME3 A 1
#> 8 11 12 NAME1 - NAME3 A 2
#> 9 9 11 NAME1 - NAME3 A 3
#> 10 22 15 NAME1 - NAME3 B 1
#> 11 25 19 NAME1 - NAME3 B 2
#> 12 22 30 NAME1 - NAME3 B 3
#> 13 7 10 NAME2 - NAME3 A 1
#> 14 9 12 NAME2 - NAME3 A 2
#> 15 8 11 NAME2 - NAME3 A 3
#> 16 20 15 NAME2 - NAME3 B 1
#> 17 22 19 NAME2 - NAME3 B 2
#> 18 21 30 NAME2 - NAME3 B 3
For a benchmark, please see the community wiki.
Benchmarking (reprex). If you don't really need a data.table, base R seems to do the trick here.
Note that this compares r2evans' and pernkf's functions on a data.table with tjebo's and TarJae's functions on a data frame.
The methods suggested by Peace Wang are currently not included, as they are either not scalable to k columns or produce an incorrect result.
bench::mark(
pernkf = fun(DT, c("ID1", "ID2")),
tjebo = fun2(DF, c("ID1", "ID2")),
r2evans = fun3(DT, c("ID1", "ID2")),
tarjae = fun4(DF, c("ID1", "ID2")),
check = FALSE)
#> # A tibble: 4 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 pernkf 2.95ms 3.2ms 302. 2.29MB 6.33
#> 2 tjebo 359.33µs 373.85µs 2423. 18.65KB 10.5
#> 3 r2evans 1.65ms 1.79ms 535. 756.16KB 6.30
#> 4 tarjae 26.49ms 27.74ms 34.3 4.75MB 7.35
m <- microbenchmark::microbenchmark(
pernkf = fun(DT, ID.cols = c("ID1", "ID2")),
r2evans = fun3(DT, ID.cols = c("ID1", "ID2")),
tjebo = fun2(DF, ID.cols = c("ID1", "ID2")),
tarjae = fun4(DF, c("ID1", "ID2")),
times = 1000
)
m
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> pernkf 2885.714 3055.1450 3439.1257 3150.457 3298.404 95391.80 1000
#> r2evans 1629.028 1739.5715 1949.8389 1829.696 1922.227 10843.33 1000
#> tjebo 354.714 410.0975 469.1457 427.948 443.237 4344.00 1000
#> tarjae 25854.416 26564.8420 29103.6948 27142.758 30982.328 118592.10 1000
ggplot2::autoplot(m)
#> Coordinate system already present. Adding new coordinate system, which will replace the existing one.
Data and functions
library(tidyverse)
library(data.table)
ID1 <- c("A","A","A","B","B","B")
ID2 <- c(1,2,3,1,2,3)
NAME1 <- c(10,11,9,22,25,22)
NAME2 <- c(7,9,8,20,22,21)
NAME3 <- c(10,12,11,15,19,30)
DF <- data.frame(ID1,ID2,NAME1,NAME2,NAME3)
DT <- data.table(DF)
fun <- function(data, ID.cols){
data <- data.table(data)
ids <- which(colnames(data) %in% ID.cols)
numeric.combs <- combn(x = data.table(data)[,!ids, with = FALSE], m = 2, simplify = FALSE)
id.cols <- data[,ids, with = FALSE]
bind.columns.each.numeric.comb <- lapply(X = numeric.combs, FUN = function(x) cbind(id.cols,x))
generalize <- suppressWarnings(lapply(X = bind.columns.each.numeric.comb, FUN = function(x)
setattr(x = x[,ID.NEW:=paste(colnames(x[,!ids,with=FALSE]),collapse=" - ")], name =
'names', value = c(ID.cols,"value.left","value.right","ID.NEW"))))
return(rbindlist(l=generalize))
}
fun2 <- function(data, ID.cols){
ids <- which(colnames(data) %in% ID.cols)
new_dat <- combn(data[-ids], 2, function(x) {
new_x <- setNames(x, paste("value", c("left", "right"), sep = "."))
new_x$ID.new <- paste(names(x), collapse = " - ")
new_x
}, simplify = FALSE)
cbind(do.call(rbind, new_dat), data[ids])
}
fun3 <- function(data, ID.cols) {
DTlong <- melt(data, id.vars = ID.cols, variable.factor = FALSE)
out <- DTlong[DTlong, on = .(ID1, ID2), allow.cartesian = TRUE
][variable < i.variable,
][, .(ID.new = paste(variable, i.variable, sep = " - "),
ID1, ID2, value.left = value, value.right = i.value)]
out
}
fun4 <- function(x, id.cols){
DT1 <- x %>%
pivot_longer(
-all_of(id.cols)
) %>%
mutate(name1 = lead(name, default=last(name)),
value1 = lead(value, default=last(value)))%>%
arrange(name, name1) %>%
group_by(name) %>%
mutate(n = n()) %>%
mutate(name_nr = parse_number(name)) %>%
ungroup()
DT1 %>%
mutate(name1 = lead(name, unique(n)*(max(name_nr)-min(name_nr)))) %>%
mutate(value1 = lead(value, unique(n)*(max(name_nr)-min(name_nr)))) %>%
slice(seq_len(first(n))) %>%
bind_rows(DT1 %>%
slice(1:(n() - unique(n))), .
) %>%
mutate(ID.new = paste(name, name1, sep = " - "), .before=1) %>%
select(ID.new, ID1, ID2, value.left=value, value.right = value1) %>%
arrange(ID.new)
}
Check if the solutions are the same:
## convert all to data frame
## column names and order need to be the same
## rows need to be sorted in the same way (caveat row names!)
preparetocompare <- function(x){
x <- data.frame(x)
names(x) <- tolower(names(x))
x <- x[c("id1", "id2", "value.left", "value.right", "id.new")]
x <- x[with(x, order(id.new, id1, id2)),]
rownames(x) <- NULL
x
}
compare_df <- function(...){
# credit to https://stackoverflow.com/a/17244041/7941188
ls_df <- c(as.list(environment()), list(...))
ls_compare <- lapply(ls_df, preparetocompare)
# inspired by https://stackoverflow.com/a/18814864/7941188
all.identical <- function(l) mapply(all.equal, head(l, 1), tail(l, -1))
all.identical(ls_compare)
}
compare_df(fun(DT, c("ID1", "ID2")),
fun2(DF, c("ID1", "ID2")),
fun3(DT, c("ID1", "ID2")),
fun4(DF, c("ID1", "ID2"))
)
#> [1] TRUE TRUE TRUE
This post consists of 3 parts:
The original answer (non-equi self-join with two ID columns)
The 1st edit (non-equi self-join with a variable number of ID columns)
The 2nd edit (benchmark of 6 different approaches with varying problem sizes)
Original answer: Non-equi self-join with two ID columns
For the sake of completeness, here is a solution which uses a non-equi self-join of the molten data (reshaped to long format):
library(data.table)
mdt <- melt(DT, id.vars = c("ID1", "ID2"))
res <- mdt[mdt, on = .(ID1, ID2, variable < variable), nomatch = NULL,
.(ID.new = paste(x.variable, i.variable, sep = " - "),
ID1, ID2, value.left = x.value, value.right = i.value)]
all.equal(res, DT.output, ignore.row.order = TRUE)
[1] TRUE
This approach is similar to r2evans' answer but avoids the cartesian join. I have refrained from showing benchmark results, as benchmarking a sample data set of 6 rows and 5 columns has only limited relevance.
Edit 1: Non-equi self-join with a variable number of ID columns
The OP has requested that the number of ID columns may vary (as a matter of fact, the names of the ID columns are passed as a parameter to the OP's own function).
The non-equi self-join can be enhanced to handle an arbitrary number of ID columns:
library(data.table)
id_cols <- c("ID1", "ID2")
mdt <- melt(DT, id.vars = id_cols)
res <- mdt[mdt, on = c(id_cols, "variable < variable"), nomatch = NULL,
c(.(ID.new = paste(x.variable, i.variable, sep = " - "),
value.left = x.value, value.right = i.value), .SD),
.SDcols = id_cols]
all.equal(res, DT.output, ignore.col.order = TRUE, ignore.row.order = TRUE)
[1] TRUE
Note that it is safe to use .SD here, as .SDcols picks only those columns which are already used to join on (specified by id_cols).
Edit 2: Benchmark with varying problem sizes
The benchmarks presented so far by r2evans and tjebo only use the original dataset with 2 id columns, 3 numeric columns, and 6 rows. Due to the small problem size, these benchmarks mostly compare overhead and are not representative of the performance on larger problems.
There are 3 different parameters which describe the problem size:
The number of rows nr of the sample dataset DT,
the number of numeric columns nc from which pair-wise rows are created, and
the number of id columns ni.
The final result consists of nc * (nc - 1) / 2 * nr rows and ni + 3 columns (for the original example, 3 * 2 / 2 * 6 = 18 rows and 5 columns).
By using the press() function from the bench package we can easily perform a set of benchmark runs with varying problem sizes.
6 different approaches are being included in the benchmark runs:
pernkf(): the function as shown in OP's question using combn(),
r2evans(): r2evans' melted self-join but modified to work with an arbitrary number of id columns,
tjebo(): tjebo's base R approach using combn() with a data.frame,
nej(): a non-equi self-join of molten data, similar to r2evans' approach but avoiding the cartesian join,
dtc(): a data.table version of tjebos's combn() approach,
mvl(): an implementation of هنروقتان's approach to call melt() with a made-up measure.vars list.
All approaches are implemented as functions which are called with two parameters: the dataset (DT or DF, respectively) and a character vector with the names of an arbitrary set of id columns.
pernkf <- function(data, ID.cols){
data <- data.table(data)
# Which of the columns are ID columns
ids <- which(colnames(data) %in% ID.cols)
# Obtain all pairwise combinations of numeric columns into a list
numeric.combs <- combn(x = data.table(data)[,!ids, with = FALSE], m = 2, simplify = FALSE)
id.cols <- data[,ids, with = FALSE]
# bind the ID columns to each pairwise combination of numeric columns inside the list
bind.columns.each.numeric.comb <- lapply(X = numeric.combs, FUN = function(x) cbind(id.cols,x))
# Create generic names for the numeric columns so that rbindlist() may be applied. In addition we make a new column that groups based on which columns we are considering
generalize <- suppressWarnings(lapply(X = bind.columns.each.numeric.comb, FUN = function(x)
setattr(x = x[,ID.new:=paste(colnames(x[,!ids,with=FALSE]),collapse=" - ")], name =
'names', value = c(ID.cols,"value.left","value.right","ID.new"))))
return(rbindlist(l=generalize))
}
r2evans = \(DT, id_cols) {
DTlong <- melt(DT, id.vars = id_cols, variable.factor = FALSE)
DTlong[DTlong, on = c(id_cols), allow.cartesian = TRUE
][variable < i.variable,
][, .(ID.new = paste(variable, i.variable, sep = " - "), setnames(.SD, id_cols),
value.left = value, value.right = i.value), .SDcols = id_cols
]
}
tjebo <- \(data, ID.cols) {
ids <- which(colnames(data) %in% ID.cols)
## you can loop over the combinations directly
new_dat <- combn(data[-ids], 2, function(x) {
new_x <- setNames(x, paste("value", c("left", "right"), sep = "."))
## use paste with collapse for the ID.new
new_x$ID.new <- paste(names(x), collapse = " - ")
new_x
}, simplify = FALSE)
## bind it with the old ID columns, outside the loop (bit faster)
cbind(do.call(rbind, new_dat), data[ids])
}
nej <- \(DT, id_cols) {
mdt <- melt(DT, id.vars = id_cols)
mdt[mdt, on = c(id_cols, "variable < variable"), nomatch = NULL,
.(setnames(.SD, id_cols), ID.new = paste(x.variable, i.variable, sep = " - "),
value.left = x.value, value.right = i.value),
.SDcols = id_cols]
}
dtc <- \(DT, id_cols) {
combn(setdiff(colnames(DT), id_cols), 2,
\(x) DT[, ..x][, ID.new := paste(x, collapse = " - ")],
simplify = FALSE) |>
rbindlist(use.names = FALSE) |>
setnames(1:2, c("value.left", "value.right")) |>
cbind(DT[, ..id_cols])
}
mvl <- \(DT, id_cols) {
  num_cols <- setdiff(colnames(DT), id_cols)
  # each column of 'combos' is one pair of numeric columns
  combos <- combn(num_cols, 2L, simplify = TRUE)
  id_new_levels <- apply(combos, 2, paste, collapse = " - ")
  # a single melt() with a list of measure.vars: the first vector fills value.left,
  # the second fills value.right; the generated factor is then relabelled with the pairs
  melt(DT, measure.vars = list(combos[1L, ], combos[2L, ]),
       value.name = c("value.left", "value.right"), variable.name = "ID.new")[
    , ID.new := as.character(`levels<-`(ID.new, id_new_levels))]
}
The two approaches by Peace Wang and TarJae have been omitted as I was not able to turn these into scalable functions.
In the call to press() the number of rows nr is varied from 10 to 100'000 and the number of numeric columns nc from 3 to 10. Correspondingly, the number of rows of the resulting datasets vary from 30 to 4.5 million rows. All runs use 3 id columns in order to verify that ni is scalable (and not limited to 2).
The check function is set to ignore a different order of rows and/or columns, as these may vary between the different approaches.
library(bench)
bm <- press(
nr = c(10L, 1000L, 100000L),
nc = c(3L, 5L, 10L),
{
ni <- 3L
DT <- data.table()
id_cols <- sprintf("ID%01i", seq(ni))
# append id cols
for (id in id_cols) set(DT, , id, seq(nr))
# append data cols
for (j in seq(nc)) {
col_name <- sprintf("NAME%04i", j)
set(DT, , col_name, seq(nr) + (j / 1000))
}
DF <- as.data.frame(DT)
mark(
pernkf(DT, id_cols),
r2evans(DT, id_cols),
tjebo(DF, id_cols),
nej(DT, id_cols),
dtc(DT, id_cols),
mvl(DT, id_cols),
check = \(x,y) all.equal(x, setDT(y), ignore.row.order = TRUE, ignore.col.order = TRUE),
min_iterations = 3L
)
}
)
The benchmark timings are visualised by
ggplot2::autoplot(bm)
(note the logarithmic time scale).
Almost always, mvl() is the fastest approach. Only for the smallest problem sizes, with 3 numeric columns and up to 1000 rows, is tjebo() slightly faster. For large problems with 100'000 rows, dtc() and pernkf() come second and third, respectively.
The next chart shows how performance varies with the number of numeric columns nc:
library(ggplot2)
ggplot(bm) +
aes(nc, median, colour = attr(expression, "description")) +
geom_point() +
geom_line() +
scale_x_log10() +
labs(colour = "expression") +
facet_wrap(~nr, scales = "free_y") +
ggtitle("Median run time")
(note the log-log scales and the independent time scales of the facets)
tjebo()'s run times increase more steeply with nc than those of any other approach. For some use cases, mvl() is about an order of magnitude faster than any other approach.
An often neglected aspect is memory consumption. The next chart shows how memory allocation varies with problem size:
ggplot(bm) +
aes(nc, mem_alloc, colour = attr(expression, "description")) +
geom_point() +
geom_line() +
scale_x_log10() +
labs(colour = "expression") +
facet_wrap(~nr, scales = "free_y") +
ggtitle("Memory allocation")
(note the log-log scales and the independent scales on the y-axes)
There is a remarkably large discrepancy in memory allocation between the best and worst approaches for each use case, about a factor of 7 to 8. Again, tjebo() shows the steepest increase in memory allocation with nc. For large problem sizes, mvl() allocates less memory than any other approach, followed by dtc() and pernkf().
UPDATE II (removed wrong solution)
Now, after really hard work and great support from the community (thanks to @akrun and @tjebo), I think I have a correct and scalable tidyverse solution (hurray!):
library(tidyverse)
DT1 <- DT %>%
pivot_longer(
-c(ID1, ID2)
) %>%
mutate(name1 = lead(name, default=last(name)),
value1 = lead(value, default=last(value)))%>%
arrange(name, name1) %>%
group_by(name) %>%
mutate(n = n()) %>%
mutate(name_nr = parse_number(name)) %>%
ungroup()
DT1 %>%
mutate(name1 = lead(name, unique(n)*(max(name_nr)-min(name_nr)))) %>%
mutate(value1 = lead(value, unique(n)*(max(name_nr)-min(name_nr)))) %>%
slice(seq_len(first(n))) %>%
bind_rows(DT1 %>%
slice(1:(n() - unique(n))), .
) %>%
mutate(ID.new = paste(name, name1, sep = " - "), .before=1) %>%
select(ID.new, ID1, ID2, value.left=value, value.right = value1) %>%
arrange(ID.new)
ID.new ID1 ID2 value.left value.right
<chr> <chr> <dbl> <dbl> <dbl>
1 NAME1 - NAME2 A 1 10 7
2 NAME1 - NAME2 A 2 11 9
3 NAME1 - NAME2 A 3 9 8
4 NAME1 - NAME2 B 1 22 20
5 NAME1 - NAME2 B 2 25 22
6 NAME1 - NAME2 B 3 22 21
7 NAME1 - NAME3 A 1 10 10
8 NAME1 - NAME3 A 2 11 12
9 NAME1 - NAME3 A 3 9 11
10 NAME1 - NAME3 B 1 22 15
11 NAME1 - NAME3 B 2 25 19
12 NAME1 - NAME3 B 3 22 30
13 NAME2 - NAME3 A 1 7 10
14 NAME2 - NAME3 A 2 9 12
15 NAME2 - NAME3 A 3 8 11
16 NAME2 - NAME3 B 1 20 15
17 NAME2 - NAME3 B 2 22 19
18 NAME2 - NAME3 B 3 21 30
Attention:
Here is an inspiring idea which does not fully satisfy the OP's requirements (e.g., the ID.new labels and the row order), but I think it is worth recording here.
First, turn DT into long format with melt.
Then shift value cyclically by -nrow(DT) to pair each column with the next one, i.e. NAME1 - NAME2, NAME2 - NAME3, NAME3 - NAME1.
ds = melt(DT,
measure.vars = patterns("^NAME"),
variable.name = c("ID.new"),
value.name = c("value.left"))
group_len = nrow(DT)
ds[, ID.new := paste(ID.new,shift(ID.new, n = -group_len, type = c("cyclic")),sep = " - ")]
ds[, value.right := shift(value.left, n = -group_len, type = c("cyclic"))]
Output:
ID1 ID2 ID.new value.left value.right
<char> <num> <char> <num> <num>
1: A 1 NAME1 - NAME2 10 7
2: A 2 NAME1 - NAME2 11 9
3: A 3 NAME1 - NAME2 9 8
4: B 1 NAME1 - NAME2 22 20
5: B 2 NAME1 - NAME2 25 22
6: B 3 NAME1 - NAME2 22 21
7: A 1 NAME2 - NAME3 7 10
8: A 2 NAME2 - NAME3 9 12
9: A 3 NAME2 - NAME3 8 11
10: B 1 NAME2 - NAME3 20 15
11: B 2 NAME2 - NAME3 22 19
12: B 3 NAME2 - NAME3 21 30
13: A 1 NAME3 - NAME1 10 10
14: A 2 NAME3 - NAME1 12 11
15: A 3 NAME3 - NAME1 11 9
16: B 1 NAME3 - NAME1 15 22
17: B 2 NAME3 - NAME1 19 25
18: B 3 NAME3 - NAME1 30 22
Since the data are well structured, one may also use the following code (this can be made scalable, but for simplicity I provide the hard-coded variant):
melt(DT, measure.vars=list(c(3,3,4), c(4,5,5)))
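A hedged sketch of the scalable variant, assuming the id columns are ID1 and ID2: it builds the measure.vars list with combn() and relabels ID.new afterwards, which is essentially what the mvl() function in the benchmark wiki above does:
num_cols <- setdiff(names(DT), c("ID1", "ID2"))
combos <- combn(num_cols, 2)                 # each column of 'combos' is one pair
melt(DT, measure.vars = list(combos[1, ], combos[2, ]),
     value.name = c("value.left", "value.right"), variable.name = "ID.new")[
  , ID.new := as.character(`levels<-`(ID.new, apply(combos, 2, paste, collapse = " - ")))]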