R reshape() extremely slow

I need to perform a simple reshape of data from long to wide, and this needs to work in base R. For this use case, reshape() seems to be extraordinarily slow, despite assertions that it is very fast (https://stackoverflow.com/a/12073077/3017280). This example is a reasonable approximation of my data. I know that in this example I do not need both Index columns, but I do in the real data. On my laptop 10,000 rows takes 3 seconds, and 40,000 rows takes over 200 seconds. The real data has over one million rows, so reshape() is obviously a non-starter. Can anyone shed any light on why it takes so long in this case? I worked around the problem using split / lapply / Reduce + merge, which is clumsy but very much quicker.
n <- 5000
dfLong <- data.frame(Index1 = rep(sample(1E6:2E6, n), 4),
                     Index2 = rep(sample(3E6:4E6, n), 4),
                     Key = rep(1:4, each = n),
                     Date = sample(seq.Date(as.Date("2020-01-01"),
                                            as.Date("2021-12-31"),
                                            by = "1 day"),
                                   size = n * 4, replace = TRUE),
                     Score = sample(0:48, n * 4, replace = TRUE))
system.time(dfWide <- reshape(data = dfLong,
                              v.names = c("Date", "Score"),
                              timevar = "Key",
                              idvar = c("Index1", "Index2"),
                              sep = "_Q",
                              direction = "wide"))
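For reference, here is a minimal sketch of the split / lapply / Reduce + merge workaround mentioned above (wide_by_merge is just an illustrative name, and row/column order will differ from reshape()'s output):

## base-R sketch: split by Key, rename the value columns per key,
## drop Key, then merge the pieces on the id columns
wide_by_merge <- function(df) {
  pieces <- lapply(split(df, df$Key), function(p) {
    k <- unique(p$Key)
    names(p)[names(p) == "Date"]  <- paste0("Date_Q", k)
    names(p)[names(p) == "Score"] <- paste0("Score_Q", k)
    p[setdiff(names(p), "Key")]
  })
  Reduce(function(a, b) merge(a, b, by = c("Index1", "Index2")), pieces)
}
system.time(dfWideMerge <- wide_by_merge(dfLong))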

If you look at what functions reshape calls with the profvis package, you can see that almost all of the total time is spent on this one line in the function. The interaction function is used only to combine your two id columns into a single column.
data[, tempidname] <- interaction(data[, idvar], drop = TRUE)
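For example, the profile can be reproduced with something like the following (assuming the profvis package is installed):

library(profvis)
profvis(
  reshape(data = dfLong,
          v.names = c("Date", "Score"),
          timevar = "Key",
          idvar = c("Index1", "Index2"),
          sep = "_Q",
          direction = "wide")
)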
Rather than interaction, you could use do.call(paste0, data[, idvar]). You can wrap reshape in a function that creates an environment where interaction is bound to this faster replacement:
new_reshape <- function(...){
  interaction <- function(x, drop) do.call(paste0, x)
  environment(reshape) <- environment()
  reshape(...)
}
Now it's much faster
system.time(dfWide <- reshape(data = dfLong,
                              v.names = c("Date", "Score"),
                              timevar = "Key",
                              idvar = c("Index1", "Index2"),
                              sep = "_Q",
                              direction = "wide"))
#    user  system elapsed
#  35.292   0.538  36.236
system.time(new_dfWide <- new_reshape(data = dfLong,
                                      v.names = c("Date", "Score"),
                                      timevar = "Key",
                                      idvar = c("Index1", "Index2"),
                                      sep = "_Q",
                                      direction = "wide"))
#    user  system elapsed
#   0.071   0.009   0.081
all.equal(new_dfWide, dfWide)
# [1] TRUE
You can be even faster than that by using plyr:::ninteraction. Its only non-base dependencies are plyr:::id_var and the tiny %||% helper, neither of which has further dependencies, meaning that if you can't install packages you can just copy-paste these function definitions pretty easily (adding a comment giving credit).
new_reshape <- function(...){
  # `%||%` is plyr's small null-default helper used by ninteraction; it is only
  # reached when idvar is empty, but defining it keeps the copied code self-contained
  `%||%` <- function(a, b) if (is.null(a)) b else a
  # interaction = plyr:::ninteraction
  # id_var = plyr:::id_var
  interaction <- function(.variables, drop = FALSE) {
    lengths <- vapply(.variables, length, integer(1))
    .variables <- .variables[lengths != 0]
    if (length(.variables) == 0) {
      n <- nrow(.variables) %||% 0L
      return(structure(seq_len(n), n = n))
    }
    if (length(.variables) == 1) {
      return(id_var(.variables[[1]], drop = drop))
    }
    ids <- rev(lapply(.variables, id_var, drop = drop))
    p <- length(ids)
    ndistinct <- vapply(ids, attr, "n", FUN.VALUE = numeric(1),
                        USE.NAMES = FALSE)
    n <- prod(ndistinct)
    if (n > 2^31) {
      char_id <- do.call("paste", c(ids, sep = "\r"))
      res <- match(char_id, unique(char_id))
    }
    else {
      combs <- c(1, cumprod(ndistinct[-p]))
      mat <- do.call("cbind", ids)
      res <- c((mat - 1L) %*% combs + 1L)
    }
    attr(res, "n") <- n
    if (drop) {
      id_var(res, drop = TRUE)
    }
    else {
      structure(as.integer(res), n = attr(res, "n"))
    }
  }
  id_var <- function(x, drop = FALSE) {
    if (length(x) == 0)
      return(structure(integer(), n = 0L))
    if (!is.null(attr(x, "n")) && !drop)
      return(x)
    if (is.factor(x) && !drop) {
      x <- addNA(x, ifany = TRUE)
      id <- as.integer(x)
      n <- length(levels(x))
    }
    else {
      levels <- sort(unique(x), na.last = TRUE)
      id <- match(x, levels)
      n <- max(id)
    }
    structure(id, n = n)
  }
  environment(reshape) <- environment()
  reshape(...)
}
system.time(new_dfWide <- new_reshape(data = dfLong,
                                      v.names = c("Date", "Score"),
                                      timevar = "Key",
                                      idvar = c("Index1", "Index2"),
                                      sep = "_Q",
                                      direction = "wide"))
#   user  system elapsed
#  0.015   0.000   0.015

I don't know that I've ever made the claim that stats::reshape is the fastest.
For comparison, stats::reshape is not as fast on my i9/64GB-RAM system:
system.time(
  dfWide <- reshape(data = dfLong,
                    v.names = c("Date", "Score"),
                    timevar = "Key",
                    idvar = c("Index1", "Index2"),
                    sep = "_Q",
                    direction = "wide")
)
#   user  system elapsed
#  19.63    0.03   19.73
But other reshaping functions do much better:
library(tidyr)
system.time(
  tidyrWide <- pivot_wider(
    dfLong, c("Index1", "Index2"),
    names_prefix = "Q", names_from = "Key",
    values_from = c("Date", "Score"))
)
#   user  system elapsed
#   0.01    0.00    0.02
nms <- names(dfWide)
tidyrWide <- subset(tidyrWide, select = nms) # column order
dfOrder <- do.call(order, dfWide)
tidyrOrder <- do.call(order, tidyrWide)
all.equal(dfWide[dfOrder,], as.data.frame(tidyrWide)[tidyrOrder,], check.attributes = FALSE)
# [1] TRUE
Similarly, data.table::dcast is equally fast:
library(data.table)
dtLong <- as.data.table(dfLong)
system.time(
  dtWide <- data.table::dcast(
    Index1 + Index2 ~ paste0("Q", Key),
    data = dtLong, value.var = c("Date", "Score"))
)
#   user  system elapsed
#   0.00    0.01    0.02
dtWide <- subset(dtWide, select = nms) # column order
dtOrder <- do.call(order, dtWide)
all.equal(dfWide[dfOrder,nms], as.data.frame(dtWide)[dtOrder,nms], check.attributes = FALSE)
# [1] TRUE

Consider an advanced modified version of @Moody_Mudskipper's matrix_spread, using base R. Since matrix() will simplify complex types like Date, some ad hoc changes are required:
Function
matrix_spread <- function(df1, id, key, value, sep){
  unique_ids <- unique(df1[[key]])
  mats <- lapply(df1[value], function(x)
    matrix(x, ncol = length(unique_ids), byrow = FALSE)
  )
  df2 <- do.call(
    data.frame, list(unique(df1[id]), mats)
  )
  # RENAME COLS
  names(df2)[(length(id)+1):ncol(df2)] <- as.vector(
    sapply(value, function(x, y) paste0(x, sep, y), unique_ids)
  )
  # REORDER COLS
  df2 <- df2[c(id, as.vector(
    outer(c(value), unique_ids, function(x, y) paste0(x, sep, y))
  ))]
  return(df2)
}
Application
system.time(
  dfWide2 <- matrix_spread(
    df1 = dfLong,
    id = c("Index1", "Index2"),
    key = "Key",
    value = c("Date", "Score"),
    sep = "_Q"
  )
)
#   user  system elapsed
#  0.022   0.000   0.023
# CONVERT INTEGERS TO DATES
dfWide2[grep("Date", names(dfWide2))] <- lapply(
  dfWide2[grep("Date", names(dfWide2))],
  as.Date,
  origin = "1970-01-01"
)
# REPLICATES OP'S reshape
identical(data.frame(dfWide), dfWide2)
# [1] TRUE

Related

How to deal with data.table with list column and NULL values

I have a fairly large data.table that comes from an SQL table.
All columns containing missing values in SQL are replaced by NULLs in the data.table so that these columns are actually lists containing values and also missing values.
I would like an efficient way to replace the NULLs with NAs and then convert each of these list columns into a regular (atomic) data.table column.
Here is an example to reproduce my case:
library(data.table)
n = 10^6
l1 = as.list(rnorm(n, 10, 25))
l2 = as.list(rnorm(n, 0, 200))
l3 = as.list(rnorm(n))
df = data.table(a = runif(n),
                b = l1,
                c = l2,
                d = rnorm(n, 88, 0.5),
                e = l3)
# create an index vector to set to NULL
id1 = sample(1:n, 0.26*n)
id2 = sample(1:n, 0.60*n)
id3 = sample(1:n, 0.09*n)
# set to NULL
df$b[id1] = list(NULL)
df$c[id2] = list(NULL)
df$e[id3] = list(NULL)
This is what I have done, but it takes a bit too long:
type = data.frame(type = sapply(df, class))
col = names(df)[which(type$type == "list")]
# ----------- FIRST WAY -----------------------------------------------------------------------
system.time(
  df[, (col) := lapply(.SD, function(i) unlist(lapply(i, function(x) ifelse(is.null(x), NA, x)))), .SDcols = col]
)
# ----------- SECOND WAY (a little bit faster) ---------------------------
system.time(
  for (i in col) {
    df[, eval(i) := unlist(lapply(get(i), function(x) ifelse(is.null(x), NA, x)))]
  }
)
Why is the first solution slower than the second? Does anybody have a better way?
We may use set here
library(data.table)
df1 <- copy(df)
system.time({
  for (nm in col) {
    i1 <- which(lengths(df1[[nm]]) == 0)
    set(df1, i = i1, j = nm, value = list(NA))
    df1[[nm]] <- unlist(df1[[nm]])
  }
})
#   user  system elapsed
#  0.158   0.004   0.161
Compared with OP's second method
system.time(
  for (i in col) {
    df[, eval(i) := unlist(lapply(get(i), function(x) ifelse(is.null(x), NA, x)))]
  }
)
#   user  system elapsed
#  5.618   0.157   5.756
Checking the output:
> all.equal(df, df1)
[1] TRUE
One solution based on the lengths() function:
cols = which(sapply(df, is.list))
df[, (cols) := lapply(.SD, \(x) {x[lengths(x)==0L] = NA; as.numeric(x)}), .SDcols=cols]

Efficiently fill 2D matrices by rows in a list in R

I have a list of 2D matrices. Each matrix is filled using the function fillMatrices. This function adds a number of individuals to each day 0 in a matrix and updates the columns a_M, b_M and c_M. The numbers of individuals come from an initial matrix ind. The code works but it is slow when the number of matrices within the list is large. For example with n = 10000:
user system elapsed
3.73 0.83 4.55
If possible, I would like to reduce the elapsed time to <= 1 second and increase n to 720,000 matrices. So I am looking for a way to optimize only section 3. Here is the code:
###############################################
###############################################
## Section 3
## Run the function "fillMatrices"
indexTime <- 1
dt_t_1 <- do.call(rbind, lapply(list_matrices, function(x) x[1,]))
dt_t <- fillMatrices(dt_t_1 = dt_t_1, species = c("a_M", "b_M", "c_M"), maxDuration = 5, matrixColumns = col_mat)
## Fill the matrices within the list
system.time(for(i in 1:n){
  list_matrices[[i]][indexTime + 1,] <- dt_t[,i]
})
## test <- list_matrices[[1]]
The code in section 1 is used to initialize the matrices, and the function fillMatrices can be found in section 2. In my example, the function is used to fill matrices for one species. In reality, the function is used for 3 species (i.e., it is applied three times) by changing the argument species = c("a_M", "b_M", "c_M"). How can I speed up my code? Any advice would be much appreciated.
Here are the codes of sections 1 and 2:
rm(list=ls(all=TRUE))
library(ff)
library(dplyr)
set.seed(12345)
## Define the number of individuals
n <- 10000
###############################################
###############################################
## Section 1
## Build the list of 2D matrices
v_date <- as.vector(outer(c(paste(seq(0, 1, by = 1), "day", sep="_"), paste(seq(2, 5, by = 1), "days", sep="_")), c("a_M", "b_M", "c_M"), paste, sep="|"))
col_mat <- c("year", "day", "time", "ID", "died", v_date)
list_matrices <- list()
for(i in 1:n){
  print(i)
  list_matrices[[i]] <- ff(-999, dim=c(3650, length(col_mat)), dimnames=list(NULL, col_mat), vmode="double", overwrite = TRUE)
}
## test <- list_matrices[[1]]
## dim(list_matrices[[1]])
## Fill the first row of each matrix
for(i in 1:n){
  print(i)
  list_matrices[[i]][1,] <- c(1, 1, 1, i-1, 0, rep(0, length(v_date)))
}
## test <- list_matrices[[2]]
## Build the matrix "individual"
ind <- as.matrix(data.frame(year = rep(1, n), day = rep(1, n), time = rep(1, n), died = rep(0, n), ID = (seq(1, n, 1))- 1, a_M = sample(1:10, n, replace = T), b_M = sample(1:10, n, replace = T), c_M = sample(1:10, n, replace = T)))
## print(ind)
###############################################
###############################################
## Section 2
## Function to convert a data frame into a matrix
convertDFToMat <- function(x){
  mat <- as.matrix(x[,-1])
  ifelse(is(x[,1], "data.frame"), rownames(mat) <- pull(x[,1]), rownames(mat) <- x[,1])
  ## Convert character matrix into numeric matrix
  mat <- apply(mat, 2, as.numeric)
  return(mat)
}
## Define the function that is used to fill the matrices within the list
fillMatrices <- function(dt_t_1, species, maxDuration, matrixColumns){
  ## Format data
  dt <- as.data.frame(dt_t_1) %>%
    reshape::melt(id = c("ID")) %>%
    arrange(ID) %>%
    dplyr::mutate_all(as.character)
  ## summary(dt)
  ## Break out the variable "variable" into different columns, with one row for each individual-day
  dt_reshape_filter_1 <- dt %>%
    dplyr::filter(!variable %in% c("year", "day", "time", "ID", "died")) %>%
    dplyr::mutate(day = variable %>% gsub(pattern = "\\_.*", replacement = "", x = .), col = variable %>% gsub(pattern = ".*\\|", replacement = "", x = .)) %>%
    dplyr::select(-variable) %>%
    tidyr::spread(col, value) %>%
    dplyr::mutate_all(as.numeric) %>%
    dplyr::arrange(ID, day)
  ## summary(dt_reshape_filter_1)
  ## Apply requested transformations and build the data frame
  dt_transform <- dt_reshape_filter_1 %>%
    dplyr::rename_at(vars(species), ~ c("a", "b", "c")) %>%
    dplyr::mutate(day = day + 1) %>%
    dplyr::filter(day < maxDuration + 1) %>%
    dplyr::bind_rows(tibble(ID = ind[,c("ID")], day = 0, a = ind[,c("a_M")], b = ind[,c("b_M")])) %>%
    dplyr::mutate(c = a + b) %>%
    dplyr::rename_at(vars("a", "b", "c"), ~ species) %>%
    dplyr::arrange(ID, day)
  ## summary(dt_transform)
  ## Take different columns of the data frame and gather them into a single column
  dt_gather <- dt_transform %>%
    tidyr::gather(variable, value, species) %>%
    dplyr::mutate(day = if_else(day > 1, paste0(day, "_days"), paste0(day, "_day"))) %>%
    tidyr::unite(variable, c("day", "variable"), sep = "|") %>%
    dplyr::rename(var2 = ID) %>%
    dplyr::mutate_all(as.character)
  ## summary(dt_gather)
  ## Add the other columns in the data frame and convert the resulting data frame into a matrix
  dt_reshape_filter_2 <- dt %>%
    dplyr::rename(var2 = ID) %>%
    dplyr::filter(variable %in% c("year", "day", "time", "ID", "died")) %>%
    tidyr::spread(variable, value) %>%
    dplyr::arrange(as.numeric(var2)) %>%
    dplyr::mutate(year = ind[,c("year")],
                  day = ind[,c("day")],
                  time = ind[,c("time")],
                  ID = ind[,c("ID")],
                  died = ind[,c("died")]) %>%
    tidyr::gather(variable, value, c(year, day, time, ID, died)) %>%
    dplyr::arrange(as.numeric(var2)) %>%
    dplyr::mutate_all(as.character)
  ## summary(dt_reshape_filter_2)
  ## Build the output matrix
  dt_bind <- bind_rows(dt_reshape_filter_2, dt_gather) %>%
    tidyr::spread(var2, value) %>%
    dplyr::arrange(match(variable, matrixColumns)) %>%
    dplyr::select("variable", as.character(ind[,c("ID")]))
  ## summary(dt_bind)
  dt_mat <- convertDFToMat(dt_bind)
  ## summary(dt_mat)
  return(dt_mat)
}
Making a 3D array instead of a list of 2D matrices gives you more options:
library(ff)
library(dplyr)
set.seed(12345)
## Define the number of individuals
n <- 10000L
n_row <- 3650L
#array way:
v_date <- as.vector(outer(c(paste(seq(0, 1, by = 1), "day", sep="_"), paste(seq(2, 5, by = 1), "days", sep="_")), c("a_M", "b_M", "c_M"), paste, sep="|"))
col_mat <- c("year", "day", "time", "ID", "died", v_date)
arr1 <- ff(-999L, dim = c(n_row, length(col_mat), n), dimnames = list(NULL, col_mat, NULL))
## Fill the first row of each matrix slice
arr1[1, , ] <- c(1L, 1L, 1L, NA, 0L, rep(0L, length(v_date)))
arr1[1, 4, ] <- seq_len(n)-1L
## Build the matrix "individual"
ind <- as.matrix(data.frame(year = rep(1L, n), day = rep(1L, n), time = rep(1L, n), died = rep(0L, n), ID = (seq(1L, n, 1L))- 1L, a_M = sample(1L:10L, n, replace = T), b_M = sample(1L:10L, n, replace = T), c_M = sample(1L:10L, n, replace = T)))
##fill the matrix
indexTime <- 1L
dt_t <- fillMatrices(dt_t_1 = t(arr1[1, ,]), species = c("a_M", "b_M", "c_M"), maxDuration = 5, matrixColumns = col_mat)
## reassign
system.time(
  arr1[indexTime + 1, , ] <- dt_t
)
#  user  system elapsed
#  0.05    0.70     0.7
# for comparison
#> system.time(for(i in 1:n){
#+   list_matrices[[i]][indexTime + 1,] <- dt_t[,i]
#+ })
#  user  system elapsed
#  4.75    1.08    5.90
As far as I can tell, it's giving me the same results as your original approach but does so a lot faster.
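A quick spot check of that claim (a sketch; it assumes both list_matrices from the original approach and arr1 are still in memory, filled from the same seed as above):

## compare the newly filled row for the first individual in both structures;
## the dimension order in arr1 is [row, column, individual]
all.equal(as.vector(arr1[indexTime + 1, , 1]),
          as.vector(list_matrices[[1]][indexTime + 1, ]))
## should return TRUE if the two approaches agree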

Efficient way to paste multiple column pairs in R data.table

I'm looking for an efficient way to paste/combine multiple pairs of adjacent columns at once using data.table. My feeble attempt is slow and not so elegant:
library(data.table)
dt <- data.table(ids = 1:3,
                 x1 = c("A","B","C"),
                 x2 = 1:3,
                 y1 = c("D", "E", "F"),
                 y2 = 4:6,
                 z1 = c("G", "H", "I"),
                 z3 = 7:9)
paste.pairs <- function(x, sep = "-"){
  xx <- unlist(x)
  x.len <- length(x)
  r <- rep(NA, x.len/2)
  s <- seq(1, x.len, by = 2)
  for(i in 1:(x.len/2)) {
    r[i] <- paste(xx[s[i]], xx[s[i] + 1], sep = sep)
  }
  return(as.list(r))
}
dt[, paste.pairs(.SD), by = "ids"]
Is there a better way?
An option is to use Map after creating the column indexes with seq:
i1 <- seq(1, length(dt)-1, 2)
i2 <- seq(2, length(dt)-1, 2)
dt[, Map(paste,
         .SD[, i1, with = FALSE], .SD[, i2, with = FALSE],
         MoreArgs = list(sep="-")),
   by = "ids"]
Another option would be to split by the names of the dataset and then paste:
data.frame(lapply(split.default(dt[, -1, with = FALSE],
                                sub("\\d+$", "", names(dt)[-1])),
                  function(x) do.call(paste, c(x, sep="-"))))
# x y z
#1 A-1 D-4 G-7
#2 B-2 E-5 H-8
#3 C-3 F-6 I-9
Or another option is with melt/dcast
dcast(melt(dt, id.var = 'ids')[, paste(value, collapse = "-"),
                               .(grp = sub("\\d+", "", variable), ids)],
      ids ~ grp, value.var = 'V1')
A solution using matrices:
#create matrices
#use the columns you want to paste together...
m1 <- as.matrix( dt[,c(2,4,6)] )
m2 <- as.matrix( dt[, c(3,5,7)] )
#paste the matrices element-by-element, and convert result back to data.table
as.data.table( matrix( paste( m1, m2, sep="-"), nrow=nrow(m1), dimnames=dimnames(m1) ) )
Should run pretty fast, and is very readable and easy to adapt.
output
# x1 y1 z1
# 1: A-1 D-4 G-7
# 2: B-2 E-5 H-8
# 3: C-3 F-6 I-9
benchmarks
microbenchmark::microbenchmark(
  wimpel = {
    # create matrices
    m1 <- as.matrix( dt[,c(2,4,6)] )
    m2 <- as.matrix( dt[, c(3,5,7)] )
    # paste the matrices element-by-element, and convert to data.table
    as.data.table( matrix( paste( m1, m2, sep="-"), nrow=nrow(m1), dimnames=dimnames(m1) ) )
  },
  akrun_df = {
    data.frame(lapply(split.default(dt[, -1, with = FALSE],
                                    sub("\\d+$", "", names(dt)[-1])), function(x) do.call(paste, c(x, sep="-"))))
  },
  akrun_map = {
    i1 <- seq(2, length(dt), 2)
    i2 <- seq(3, length(dt), 2)
    dt[, Map(paste, .SD[, i1, with = FALSE], .SD[, i2, with = FALSE], MoreArgs = list(sep="-"))]
  },
  akrun_dcast = {
    dcast(melt(dt, id.var = 'ids')[, paste(value, collapse = "-"), .(grp = sub("\\d+", "", variable), ids)], ids ~ grp, value.var = 'V1')
  },
  times = 10)
# Unit: microseconds
# expr min lq mean median uq max neval
# wimpel 303.072 315.122 341.2417 319.1895 327.775 531.429 10
# akrun_df 1022.790 1028.515 1251.7812 1069.1850 1172.519 2779.460 10
# akrun_map 742.013 751.051 785.6059 778.1650 799.855 884.812 10
# akrun_dcast 4104.719 4175.215 4414.6596 4348.7430 4650.911 4939.221 10

How can I extend tidyr::spread() while maintaining the order of column names?

How can I keep the columns in numeric order when using the spread function?
library(tidyverse)
data.frame(time = paste0("t_", 1:100)) %>%
  rowwise() %>%
  mutate(rnd = sample(1:100, size=1)) %>%
  spread(time, rnd)
The column names in the result of the code shown above come out as t_1, t_11, t_100, and so on. I want the column names in numeric order (t_1, t_2, t_3, ...).
You can try two things:
(1) Make "time" a factor with levels matching the order you want:
data.frame(time = factor(paste0("t_", 1:100), levels = paste0("t_", 1:100))) %>%
  rowwise() %>%
  mutate(rnd = sample(1:100, size=1)) %>%
  spread(time, rnd)
(2) Force the order using a select statement:
data.frame(time = paste0("t_", 1:100)) %>%
  rowwise() %>%
  mutate(rnd = sample(1:100, size=1)) %>%
  spread(time, rnd) %>%
  select(paste0("t_", 1:100))
Here is a new function that retains column order. Only one small change is needed (see annotation):
my_spread <- function(data, key, value, fill = NA, convert = FALSE, drop = TRUE,
                      sep = NULL) {
  key_col <- tidyr:::col_name(substitute(key))
  value_col <- tidyr:::col_name(substitute(value))
  tbl_df(my_spread_(data, key_col, value_col, fill = fill, convert = convert,
                    drop = drop, sep = sep))
}
my_spread_ <- function(data, key_col, value_col, fill = NA, convert = FALSE,
                       drop = TRUE, sep = NULL) {
  col <- data[key_col]
  # col_id <- tidyr:::id(col, drop = drop)  # Old line
  col_id <- seq_len(nrow(data))             # New line 1
  attr(col_id, 'n') <- nrow(data)           # New line 2
  col_labels <- tidyr:::split_labels(col, col_id, drop = drop)
  rows <- data[setdiff(names(data), c(key_col, value_col))]
  if (length(rows) == 0) {
    row_id <- structure(1L, n = 1L)
    row_labels <- as.data.frame(matrix(nrow = 1, ncol = 0))
  }
  else {
    row_id <- tidyr:::id(rows, drop = drop)
    row_labels <- tidyr:::split_labels(rows, row_id, drop = drop)
    rownames(row_labels) <- NULL
  }
  overall <- tidyr:::id(list(col_id, row_id), drop = FALSE)
  n <- attr(overall, "n")
  if (anyDuplicated(overall)) {
    groups <- split(seq_along(overall), overall)
    groups <- groups[vapply(groups, length, integer(1)) > 1]
    str <- vapply(
      groups,
      function(x) paste0("(", paste0(x, collapse = ", "), ")"), character(1)
    )
    stop("Duplicate identifiers for rows ", paste(str, collapse = ", "),
         call. = FALSE)
  }
  if (length(overall) < n) {
    overall <- match(seq_len(n), overall, nomatch = NA)
  }
  else {
    overall <- order(overall)
  }
  value <- data[[value_col]]
  ordered <- value[overall]
  if (!is.na(fill)) {
    ordered[is.na(ordered)] <- fill
  }
  if (convert && !is.character(ordered)) {
    ordered <- as.character(ordered)
  }
  dim(ordered) <- c(attr(row_id, "n"), attr(col_id, "n"))
  colnames(ordered) <- enc2utf8(tidyr:::col_names(col_labels, sep = sep))
  ordered <- tidyr:::as_data_frame_matrix(ordered)
  if (convert) {
    ordered[] <- lapply(ordered, type.convert, as.is = TRUE)
  }
  tidyr:::append_df(row_labels, ordered)
}
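For completeness, a hypothetical usage mirroring the question's pipeline (it relies on the same older tidyr internals as my_spread above, so it assumes a tidyr version where tidyr:::col_name, tidyr:::id, etc. still exist):

library(tidyverse)
wide <- data.frame(time = paste0("t_", 1:100)) %>%
  rowwise() %>%
  mutate(rnd = sample(1:100, size = 1)) %>%
  my_spread(time, rnd)
names(wide)[1:5]
## expected: "t_1" "t_2" "t_3" "t_4" "t_5" (the original column order is kept)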

Subset data.table columns independently

I'm starting with the table dt below and trying to subset its columns by the list keys:
library(data.table)
set.seed(123)
randomchar <- function(n, w){
  chararray <- replicate(w, sample(c(letters, LETTERS), n, replace = TRUE))
  apply(chararray, 1, paste0, collapse = "")
}
dt <- data.table(x = randomchar(1000, 3),
                 y = randomchar(1000, 3),
                 z = randomchar(1000, 3),
                 key = c("x", "y", "z"))
keys <- with(dt, list(x = sample(x, 501),
                      y = sample(y, 500),
                      z = sample(z, 721)))
I can get the result I want by using a loop:
desired <- copy(dt)
for(i in seq_along(keys)){
  keyname <- names(keys)[i]
  desired <- desired[get(keyname) %in% keys[[i]]]
}
desired
The question is: is there a more data.table-idiomatic way to do this subset?
I tried using CJ: dt[CJ(keys)], but it takes a very long time.
What about building a mask and filtering dt on it:
dt[Reduce(`&`, Map(function(key, col) col %in% key, keys, dt)),]
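A quick consistency check against the loop-based result from the question (a sketch, assuming desired was built as above; both approaches should keep the same rows in the same order):

masked <- dt[Reduce(`&`, Map(function(key, col) col %in% key, keys, dt)), ]
all.equal(desired, masked, check.attributes = FALSE)
## should return TRUE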
