I'm looking for an efficient way to paste/combine multiple pairs of adjacent columns at once using data.table. My feeble attempt is slow and not so elegant:
library(data.table)
dt <- data.table(ids = 1:3,
                 x1 = c("A", "B", "C"),
                 x2 = 1:3,
                 y1 = c("D", "E", "F"),
                 y2 = 4:6,
                 z1 = c("G", "H", "I"),
                 z3 = 7:9)
paste.pairs <- function(x, sep = "-"){
  xx <- unlist(x)
  x.len <- length(x)
  r <- rep(NA, x.len/2)
  s <- seq(1, x.len, by = 2)
  for(i in 1:(x.len/2)) {
    r[i] <- paste(xx[s[i]], xx[s[i] + 1], sep = sep)
  }
  return(as.list(r))
}
dt[, paste.pairs(.SD), by = "ids"]
Is there a better way?
An option with Map, creating the column indices with seq:
i1 <- seq(1, length(dt)-1, 2)
i2 <- seq(2, length(dt)-1, 2)
dt[, Map(paste,
.SD[, i1, with = FALSE], .SD[, i2, with = FALSE],
MoreArgs = list(sep="-")),
by = "ids"]
Another option would be to split by the names of the dataset and then paste
data.frame(lapply(split.default(dt[, -1, with = FALSE],
sub("\\d+$", "", names(dt)[-1])), function(x) do.call(paste, c(x, sep="-"))))
# x y z
#1 A-1 D-4 G-7
#2 B-2 E-5 H-8
#3 C-3 F-6 I-9
Or another option is with melt/dcast
dcast(melt(dt, id.var = 'ids')[, paste(value, collapse = "-"),
.(grp = sub("\\d+", "", variable), ids)], ids ~ grp, value.var = 'V1')
A solution using matrices:
#create matrices
#use the columns you want to paste together...
m1 <- as.matrix( dt[,c(2,4,6)] )
m2 <- as.matrix( dt[, c(3,5,7)] )
#paste the matrices element-by-element, and convert result back to data.table
as.data.table( matrix( paste( m1, m2, sep="-"), nrow=nrow(m1), dimnames=dimnames(m1) ) )
Should run pretty fast, and is very readable and easy to adapt.
output
# x1 y1 z1
# 1: A-1 D-4 G-7
# 2: B-2 E-5 H-8
# 3: C-3 F-6 I-9
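If the pairs are not in fixed positions, the same idea can be driven by column names rather than hard-coded indices; a sketch, assuming the first column of each pair is the one whose name ends in 1:
# select the column pairs by name instead of by position
i1 <- grep("1$", names(dt), value = TRUE)   # x1, y1, z1
i2 <- setdiff(names(dt)[-1], i1)            # x2, y2, z3
m1 <- as.matrix(dt[, ..i1])
m2 <- as.matrix(dt[, ..i2])
as.data.table(matrix(paste(m1, m2, sep = "-"), nrow = nrow(m1), dimnames = dimnames(m1)))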
benchmarks
microbenchmark::microbenchmark(
wimpel = {
#create matrices
m1 <- as.matrix( dt[,c(2,4,6)] )
m2 <- as.matrix( dt[, c(3,5,7)] )
#paste the matrices element-by-element, and convert to data.table
as.data.table( matrix( paste( m1, m2, sep="-"), nrow=nrow(m1), dimnames=dimnames(m1) ) )
},
akrun_df = {
data.frame(lapply(split.default(dt[, -1, with = FALSE],
sub("\\d+$", "", names(dt)[-1])), function(x) do.call(paste, c(x, sep="-"))))
},
akrun_map = {
i1 <- seq(2, length(dt), 2)
i2 <- seq(3, length(dt), 2)
dt[, Map(paste, .SD[, i1, with = FALSE], .SD[, i2, with = FALSE], MoreArgs = list(sep="-"))]
},
akrun_dcast = {
dcast(melt(dt, id.var = 'ids')[, paste(value, collapse = "-"),.(grp = sub("\\d+", "", variable), ids)], ids ~ grp, value.var = 'V1')
},
times = 10 )
# Unit: microseconds
# expr min lq mean median uq max neval
# wimpel 303.072 315.122 341.2417 319.1895 327.775 531.429 10
# akrun_df 1022.790 1028.515 1251.7812 1069.1850 1172.519 2779.460 10
# akrun_map 742.013 751.051 785.6059 778.1650 799.855 884.812 10
# akrun_dcast 4104.719 4175.215 4414.6596 4348.7430 4650.911 4939.221 10
I have a fairly large data.table that comes from an SQL table.
All columns containing missing values in SQL end up with NULLs in the data.table, so these columns are actually lists holding both values and NULLs.
I would like an efficient way to replace the NULLs with NAs and then convert each such column (a list) into a regular data.table column.
This is an example to reproduce my case:
library(data.table)
n = 10^6
l1 = as.list(rnorm(n, 10, 25))
l2 = as.list(rnorm(n, 0, 200))
l3 = as.list(rnorm(n))
df = data.table(a = runif(n),
b = l1,
c = l2,
d = rnorm(n, 88, 0.5),
e = l3
)
# create an index vector to set to NULL
id1 = sample(1:n, 0.26*n)
id2 = sample(1:n, 0.60*n)
id3 = sample(1:n, 0.09*n)
# set to NULL
df$b[id1] = list(NULL)
df$c[id2] = list(NULL)
df$e[id3] = list(NULL)
This is what I have done, but it takes a bit too long:
type = data.frame(type = sapply(df, class))
col = names(df)[which(type$type == "list")]
# ----------- FIRST WAY -----------------------------------------------------------------------
system.time(
df[, (col) := lapply(.SD, function(i) unlist(lapply(i, function(x) ifelse(is.null(x), NA, x)))), .SDcols = col]
)
# ----------- SECOND WAY (a little bit faster) ---------------------------
system.time(
for (i in col) {
df[, eval(i) := unlist(lapply(get(i), function(x) ifelse(is.null(x), NA, x)))]
}
)
Why is the first solution slower than the second? Does anybody have a better way?
We may use set here
library(data.table)
df1 <- copy(df)
system.time({
for(nm in col) {
i1 <- which(lengths(df1[[nm]]) == 0)
set(df1, i = i1, j = nm, value = list(NA))
df1[[nm]] <- unlist(df1[[nm]])
}
})
# user system elapsed
# 0.158 0.004 0.161
Compared with OP's second method
system.time(
for (i in col) {
df[, eval(i) := unlist(lapply(get(i), function(x) ifelse(is.null(x), NA, x)))]
}
)
# user system elapsed
# 5.618 0.157 5.756
Checking the output:
> all.equal(df, df1)
[1] TRUE
One solution based on the lengths function:
cols = which(sapply(df, is.list))
df[, (cols) := lapply(.SD, \(x) {x[lengths(x)==0L] = NA; as.numeric(x)}), .SDcols=cols]
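Note that the \(x) lambda shorthand requires R >= 4.1; on older versions the same one-liner can be written with function(x), for example:
cols = which(sapply(df, is.list))
df[, (cols) := lapply(.SD, function(x) {x[lengths(x) == 0L] = NA; as.numeric(x)}), .SDcols = cols]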
I would like to write a for loop in R style (lapply?) to avoid the following repetitive code.
df1$fusion <- apply(df1[, cols], 1, paste, collapse = "-" )
df2$fusion <- apply(df2[, cols], 1, paste, collapse = "-" )
df3$fusion <- apply(df3[, cols], 1, paste, collapse = "-" )
df4$fusion <- apply(df4[, cols], 1, paste, collapse = "-" )
df5$fusion <- apply(df5[, cols], 1, paste, collapse = "-" )
df6$fusion <- apply(df6[, cols], 1, paste, collapse = "-" )
df7$fusion <- apply(df7[, cols], 1, paste, collapse = "-" )
df8$fusion <- apply(df8[, cols], 1, paste, collapse = "-" )
df9$fusion <- apply(df9[, cols], 1, paste, collapse = "-" )
df10$fusion <- apply(df10[, cols], 1, paste, collapse = "-" )
df11$fusion <- apply(df11[, cols], 1, paste, collapse = "-" )
df12$fusion <- apply(df12[, cols], 1, paste, collapse = "-" )
How do I do it?
Something like this, in shell-script style?
df_ls=("df1 df2 df3 df4 df5 df6 df7 df8 df9 df10 df11 df12")
for i in $df_ls
do
${i}$fusion <- apply(${i}[, cols], 1, paste, collapse = "-" )
done
With lists. For example, in your case:
list_df_general <- list(df1, df2, ..., df12) # load your data frames
list_new <- list() # empty list
for(i in 1:12){
  list_new[[i]] <- apply(list_df_general[[i]][, cols], 1, paste, collapse = "-" )
}
To browse the results, use list_new[[1]], list_new[[2]], and so on up to list_new[[12]].
You can use something like
my_Fun <- function(df, cols)
{
  df$fusion <- apply(df[, cols], 1, paste, collapse = "-" )
  df
}
for(i in 1 : 12)
{
  variable_Name <- paste0("df", i)
  assign(x = variable_Name, value = my_Fun(df = get(x = variable_Name), cols = cols))
}
You can use a combination of lapply and apply on the list of dataframes. This will return a list of dataframes.
df_ls <- mget(paste0("df", 1:12))
lapply(df_ls, function(x) {x$fusion <- apply(x[, cols], 1, paste, collapse = "-" ); x})
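If the modified data frames are needed back in the global environment under their original names, list2env can write the list out again; a sketch continuing from the code above:
res <- lapply(df_ls, function(x) {x$fusion <- apply(x[, cols], 1, paste, collapse = "-" ); x})
list2env(res, envir = .GlobalEnv)  # df1 ... df12 now each carry a fusion column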
Multiple data frames are always handier in a list; find them in the .GlobalEnv and put them into one using mget. Now we may use lapply to apply any function to this list; we define an anonymous function that uses Reduce, which conveniently loops over the columns and pastes them together row-wise for us, and is fast.
df_lst <- mget(paste0('df', 1:n)) ## put DFs in list
cols <- paste0('X', 2:5) ## define columns, say 2 to 5
res <- lapply(df_lst, function(df) {df$fusion <- Reduce(\(x, y) paste0(x, '-', y), df[cols]); df})
res
# $df1
# X1 X2 X3 X4 X5 fusion
# 1 A A T G C A-T-G-C
# 2 A T T C G T-T-C-G
# 3 A G A G A G-A-G-A
#
# $df2
# X1 X2 X3 X4 X5 fusion
# 1 A T C C C T-C-C-C
# 2 T T A G A T-A-G-A
# 3 G C A A A C-A-A-A
#
# $df3
# X1 X2 X3 X4 X5 fusion
# 1 G G A T T G-A-T-T
# 2 T C T C T C-T-C-T
# 3 G T G C G T-G-C-G
#
# $df4
# X1 X2 X3 X4 X5 fusion
# 1 G T A G G T-A-G-G
# 2 G A G T C A-G-T-C
# 3 T G T C A G-T-C-A
If for some reason you need the data frames in the .GlobalEnv, you may use list2env(res, .GlobalEnv) afterwards (similar to what I did with the example data below).
Data:
set.seed(42)
n <- 4
replicate(n, data.frame(matrix(sample(LETTERS[c(1, 20, 3, 7)], 15, replace=TRUE), 3, 5)), simplify=FALSE) |>
setNames(paste0('df', seq_len(n))) |> list2env(.GlobalEnv)
How can I efficiently calculate distances between (almost) consecutive rows of a large-ish (~4m rows) data.table? I've outlined my current approach, but it is very slow. My actual data has up to a few hundred columns. I need to calculate lags and leads for future use, so I create these and use them to calculate distances.
library(data.table)
library(proxy)
set_shift_col <- function(df, shift_dir, shift_num, data_cols, byvars = NULL){
df[, (paste0(data_cols, "_", shift_dir, shift_num)) := shift(.SD, shift_num, fill = NA, type = shift_dir), byvars, .SDcols = data_cols]
}
set_shift_dist <- function(dt, shift_dir, shift_num, data_cols){
stopifnot(shift_dir %in% c("lag", "lead"))
shift_str <- paste0(shift_dir, shift_num)
dt[, (paste0("dist", "_", shift_str)) := as.numeric(
proxy::dist(
rbindlist(list(
.SD[,data_cols, with=FALSE],
.SD[, paste0(data_cols, "_" , shift_str), with=FALSE]
), use.names = FALSE),
method = "cosine")
), 1:nrow(dt)]
}
n <- 10000
test_data <- data.table(a = rnorm(n), b = rnorm(n), c = rnorm(n), d = rnorm(n))
cols <- c("a", "b", "c", "d")
set_shift_col(test_data, "lag", 1, cols)
set_shift_col(test_data, "lag", 2, cols)
set_shift_col(test_data, "lead", 1, cols)
set_shift_col(test_data, "lead", 2, cols)
set_shift_dist(test_data, "lag", 1, cols)
I'm sure this is a very inefficient approach, any suggestions would be appreciated!
You aren't using the vectorisation efficiencies in the proxy::dist function: rather than calling it once for each row, you can get all the distances you need from a single call.
Try this replacement function and compare the speed:
set_shift_dist2 <- function(dt, shift_dir, shift_num, data_cols){
stopifnot(shift_dir %in% c("lag", "lead"))
shift_str <- paste0(shift_dir, shift_num)
dt[, (paste0("dist2", "_", shift_str)) := proxy::dist(
.SD[,data_cols, with=FALSE],
.SD[, paste0(data_cols, "_" , shift_str), with=FALSE],
method = "cosine",
pairwise = TRUE
)]
}
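A usage sketch, reusing test_data and cols from the question; the new dist2_lag1 column should agree with the original dist_lag1 up to floating-point error:
set_shift_dist2(test_data, "lag", 1, cols)
all.equal(test_data$dist_lag1, test_data$dist2_lag1)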
You could also do it in one go without storing copies of the data in the table
test_data[, dist_lag1 := proxy::dist(
.SD,
as.data.table(shift(.SD, 1)),
pairwise = TRUE,
method = 'cosine'
), .SDcols = c('a', 'b', 'c', 'd')]
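The same pattern extends to the other shifts, e.g. a lead of 2 (a sketch):
test_data[, dist_lead2 := proxy::dist(
  .SD,
  as.data.table(shift(.SD, 2, type = "lead")),
  pairwise = TRUE,
  method = 'cosine'
), .SDcols = c('a', 'b', 'c', 'd')]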
I have a large data.frame of this structure:
min.reps <- 1
max.reps <- 3
set.seed(1)
df <- do.call(rbind,lapply(1:100, function(i) {
reps <- seq(1,as.integer(runif(1,min.reps, max.reps)), 1)
vals <- runif(length(reps), 0, 100)
return(data.frame(id=rep(i,length(reps)),rep=reps,val=vals,stringsAsFactors=F))
}))
head(df)
id rep val
1 1 1 37.212390
2 2 1 90.820779
3 2 2 20.168193
4 3 1 94.467527
5 3 2 66.079779
6 4 1 6.178627
Each df$id has between min.reps and max.reps observations (df$val). In reality instead of 100 ids I have ~5,000,000 ids.
For each df$id I'd like to add one more value, sampled from a normal distribution with mean and sd as the median and mad over its existing values, respectively.
This is trivial to do this way:
add.reps <- 1
all.ids <- unique(df$id)
require(dplyr)
new.df <- do.call(rbind, lapply(all.ids, function(i) {
id.df <- dplyr::filter(df, id == i)
add.df <- rbind(id.df, data.frame(id = rep(i,add.reps), rep = max(id.df$rep) + add.reps, val = rnorm(add.reps, median(id.df$val), mad(id.df$val)), stringsAsFactors = F))
}))
But I'm wondering if there's a much faster way to achieve this given the dimensions of my real data.frame.
This should be much faster:
add.reps <- 1
do.call(rbind, lapply(split(df, df$id), function(x) rbind(x,
data.frame(id = rep(unique(x$id), add.reps), rep = max(x$rep) + add.reps,
val = rnorm(add.reps, median(x$val), mad(x$val)), stringsAsFactors = F))))
OK, so far:
require(microbenchmark)
microbenchmark(
new.df <- do.call(rbind, lapply(all.ids, function(i) {
id.df <- dplyr::filter(df, id == i)
add.df <- rbind(id.df, data.frame(id = rep(i,add.reps), rep = max(id.df$rep) + add.reps, val = rnorm(add.reps, median(id.df$val), mad(id.df$val)), stringsAsFactors = F))
}))
)
#      min       lq     mean   median       uq      max neval
# 212.9906 225.1345 371.9314 260.9686 332.5619 1621.586   100
vs.
microbenchmark(
new.df <- do.call(rbind, lapply(split(df, df$id), function(x) rbind(x,
data.frame(id = rep(unique(x$id), add.reps), rep = max(x$rep) + add.reps,
val = rnorm(add.reps, median(x$val), mad(x$val)), stringsAsFactors = F))))
)
#      min       lq     mean   median       uq     max neval
# 133.8357 135.1846 202.9654 137.2722 160.5121 1401.03   100
I wonder if this can still be further improved.
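With millions of ids, a grouped data.table version may be worth benchmarking as well; a sketch for add.reps = 1 (untested against the timings above):
library(data.table)
dt <- as.data.table(df)
new.dt <- rbind(
  dt,
  dt[, .(rep = max(rep) + 1, val = rnorm(1, median(val), mad(val))), by = id]
)[order(id, rep)]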
If we want to get all combinations of two vectors, we can use rep/recycling rules:
x <- 1:4
y <- 1:2
cbind(rep(x, each = length(y)), rep(y, length(x)))
# [,1] [,2]
# [1,] 1 1
# [2,] 1 2
# [3,] 2 1
# [4,] 2 2
# [5,] 3 1
# [6,] 3 2
# [7,] 4 1
# [8,] 4 2
But expand.grid is much nicer -- it handles all the repetition for us.
expand.grid(x, y)
# Var1 Var2
# 1 1 1
# 2 2 1
# 3 3 1
# 4 4 1
# 5 1 2
# 6 2 2
# 7 3 2
# 8 4 2
Is there a simple version of this for concatenating strings? Like paste.grid? I have a named object where many of the elements have names like x_y_z, where x, y, and z vary as x and y do above.
For example, suppose x can be "avg" or "median", y can be "male" or "female", and z can be "height" or "weight". How can we concisely get all 8 combinations of the three?
Using rep is a pain:
x <- c("avg", "median")
y <- c("male", "female")
z <- c("height", "weight")
paste(rep(x, each = length(y) * length(z)),
rep(rep(y, each = length(z)), length(x)),
rep(z, length(x) * length(y)), sep = "_")
And repurposing expand.grid is a bit clunky (and probably inefficient):
apply(expand.grid(x, y, z), 1, paste, collapse = "_")
Am I missing something? Is there a better way to do this?
Yes, this is what interaction does:
levels(interaction(x,y,z,sep='_'))
The implementation is pretty much the same as your rep code.
Outputs:
[1] "avg_female_height" "median_female_height" "avg_male_height" "median_male_height" "avg_female_weight"
[6] "median_female_weight" "avg_male_weight" "median_male_weight"
Using data.table's CJ cross-joining function:
library(data.table)
CJ(x,y,z)[, paste(V1,V2,V3, sep = "_")]
#[1] "avg_female_height" "avg_female_weight" "avg_male_height" "avg_male_weight"
#[5] "median_female_height" "median_female_weight" "median_male_height" "median_male_weight"
Or a variation of your apply approach would be:
do.call(paste, c(expand.grid(x, y, z), sep = "_"))
#[1] "avg_male_height" "median_male_height" "avg_female_height" "median_female_height"
#[5] "avg_male_weight" "median_male_weight" "avg_female_weight" "median_female_weight"
Rudimentary (microbenchmark::microbenchmark) benchmarking shows a pretty significant speed-up by using:
library(tidyr)
library(magrittr)
df <- data.frame(x, y, z)
df %>%
complete(x, y, z) %>%
unite("combo", x, y, z, sep = "_")
A bit slower, but perhaps a more straightforward and vectorized variant of the apply technique:
df <- expand.grid(x, y, z)
df$combo <- paste(df$Var1, df$Var2, df$Var3, sep = "_")
Someone should chime in with a data.table approach...
Benchmarking: Small Grid (256 elements)
set.seed(21034)
x <- sample(letters, 4, TRUE)
y <- sample(letters, 4, TRUE)
z <- sample(letters, 4, TRUE)
a <- sample(letters, 4, TRUE)
library(data.table)
library(microbenchmark)
library(magrittr)
library(tidyr)
microbenchmark(times = 1000L,
DT1 = CJ(x, y, z, a)[ , paste(V1, V2, V3, V4, sep = "_")],
DT2 = CJ(x, y, z, a)[ , do.call(paste, c(.SD, sep = "_"))],
app1 = do.call(paste, c(expand.grid(x, y, z, a), sep = "_")),
app2 = paste((df <- expand.grid(x, y, z, a))$Var1,
df$Var2, df$Var3, sep = "_"),
magg_outer = outer(x, y, paste, sep = "_") %>%
outer(z, paste, sep = "_") %>%
outer(a, paste, sep = "_") %>% as.vector,
magg_tidy = data.frame(x, y, z, a) %>%
complete(x, y, z, a) %>%
unite("combo", x, y, z, a, sep = "_"),
interaction = levels(interaction(x, y, z, a, sep = "_")),
original = apply(expand.grid(x, y, z, a), 1, paste, collapse = "_"),
rep = paste(rep(x, each = (ny <- length(y)) * (nz <- length(z)) *
(na <- length(a))),
rep(rep(y, each = nz * na), (nx <- length(x))),
rep(rep(z, each = na), nx * ny), sep = "_"),
Reduce = Reduce(function(x, y) paste(rep(x, each = length(y)),
rep(y, length(x)), sep = "_"),
list(x, y, z, a)))
# Unit: microseconds
# expr min lq mean median uq max neval cld
# DT1 529.578 576.6400 624.00002 589.8270 604.9845 5449.287 1000 d
# DT2 561.028 606.4220 639.94659 620.4335 636.2735 5484.514 1000 d
# app1 201.043 225.4475 240.36960 233.4795 243.7090 4244.687 1000 b
# app2 196.692 225.6130 244.33543 234.0455 243.7925 4110.605 1000 b
# magg_outer 164.352 194.1395 205.30300 204.4220 211.1990 456.122 1000 b
# magg_tidy 1872.228 2038.1560 2150.98234 2067.8770 2126.1025 21891.884 1000 f
# interaction 254.885 295.1935 313.54392 306.6680 316.8095 4196.465 1000 c
# original 852.018 935.4960 976.24388 954.5115 972.5550 4973.724 1000 e
# rep 50.737 54.1515 60.22671 55.3660 56.9220 3823.655 1000 a
# Reduce 58.395 65.3860 68.46049 66.8920 68.5640 158.184 1000 a
Benchmarking: Large Grid (1,000,000 elements)
set.seed(21034)
x <- sprintf("%03d", sample(100))
y <- sprintf("%03d", sample(100))
z <- sprintf("%02d", sample(10))
a <- sprintf("%02d", sample(10))
library(data.table)
library(microbenchmark)
library(magrittr)
library(tidyr)
microbenchmark(times = 25L,
DT1 = CJ(x, y, z, a)[ , paste(V1, V2, V3, V4, sep = "_")],
DT2 = CJ(x, y, z, a)[ , do.call(paste, c(.SD, sep = "_"))],
app1 = do.call(paste, c(expand.grid(x, y, z, a), sep = "_")),
app2 = paste((df <- expand.grid(x, y, z, a))$Var1,
df$Var2, df$Var3, sep = "_"),
magg_outer = outer(x, y, paste, sep = "_") %>%
outer(z, paste, sep = "_") %>%
outer(a, paste, sep = "_") %>% as.vector,
magg_tidy = data.frame(x, y, z, a) %>%
complete(x, y, z, a) %>%
unite("combo", x, y, z, a, sep = "_"),
interaction = levels(interaction(x, y, z, a, sep = "_")),
original = apply(expand.grid(x, y, z, a), 1, paste, collapse = "_"),
rep = paste(rep(x, each = (ny <- length(y)) * (nz <- length(z)) *
(na <- length(a))),
rep(rep(y, each = nz * na), (nx <- length(x))),
rep(rep(z, each = na), nx * ny), sep = "_"),
Reduce = Reduce(function(x, y) paste(rep(x, each = length(y)),
rep(y, length(x)), sep = "_"),
list(x, y, z, a)))
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# DT1 360.6528 467.8408 517.4579 520.1484 549.1756 861.1567 25 ab
# DT2 355.0438 504.9642 572.0732 551.9106 615.6621 927.3210 25 b
# app1 727.4513 766.3053 926.1888 910.3998 957.7610 1690.1540 25 c
# app2 472.5724 567.1121 633.5304 600.3779 634.3158 1135.7535 25 b
# magg_outer 384.0112 475.5070 600.6317 525.8936 676.7134 927.6736 25 b
# magg_tidy 520.6428 602.5028 695.5500 680.8821 748.8746 1180.1107 25 bc
# interaction 353.7317 481.4732 531.0035 518.7084 585.0872 693.5171 25 ab
# original 4965.1156 5358.8704 5914.3560 5780.6609 6074.7470 9024.6476 25 d
# rep 206.0964 236.5811 273.1093 252.8179 285.0910 455.1776 25 a
# Reduce 322.0695 390.2595 446.3948 424.9185 508.5235 621.1878 25 ab
What about using outer()? Your two examples become
x <- 1:4
y <- 1:2
as.vector(outer(x, y, paste, sep = "_"))
## [1] "1_1" "2_1" "3_1" "4_1" "1_2" "2_2" "3_2" "4_2"
library(magrittr)
x <- c("avg", "median")
y <- c("male", "female")
z <- c("height", "weight")
outer(x, y, paste, sep = "_") %>% outer(z, paste, sep = "_") %>% as.vector
## [1] "avg_male_height" "median_male_height" "avg_female_height" "median_female_height" "avg_male_weight"
## [6] "median_male_weight" "avg_female_weight" "median_female_weight"
The second example can be simplified a little with Reduce():
Reduce(function(a, b) outer(a, b, paste, sep = "_"), list(x, y, z)) %>% as.vector
It's not efficient, however. Using microbenchmark, I find that your solution using rep() is about 10 times faster.
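For reference, a sketch of that comparison, using the x, y, z defined above:
microbenchmark::microbenchmark(
  rep = paste(rep(x, each = length(y) * length(z)),
              rep(rep(y, each = length(z)), length(x)),
              rep(z, length(x) * length(y)), sep = "_"),
  outer_reduce = as.vector(Reduce(function(a, b) outer(a, b, paste, sep = "_"),
                                  list(x, y, z)))
)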