dt <- data.table(v1 = c("", "17-Sep-2019"),
v2 = c("", "17-Sep-2019"))
cols <- paste0("v",1:2)
dt[, do.call(paste, Map(function(x, y) paste(x, y, sep = ':'),
lubridate::dmy(.SD),
toupper(gsub(".*(\\(.*\\)).*","\\1", names(.SD))))),
.SDcols = cols]
I want the code to return c(NA, "2019-09-17:V1 2019-09-17:V2")
Maybe something like:
dt <- data.table(v1 = c("", "17-Sep-2019"), v2 = c("", "17-Sep-2019"))
cols <- paste0("v",1:2)
dt[, fifelse(v1!="",
do.call(paste, Map(function(x, v) paste0(as.Date(x, "%d-%b-%Y"), ":", toupper(v)), .SD, cols)),
NA_character_), .SDcols=cols]
Related
I have example data as follows:
library(data.table)
set.seed(1)
DT <- data.table(panelID = sample(50,50), # Creates a panel ID
Country = c(rep("Albania",30),rep("Belarus",50), rep("Chilipepper",20)),
some_NA = sample(0:5, 6),
some_NA_factor = sample(0:5, 6),
Group = c(rep(1,20),rep(2,20),rep(3,20),rep(4,20),rep(5,20)),
Time = rep(seq(as.Date("2010-01-03"), length=20, by="1 month") - 1,5),
wt = 15*round(runif(100)/10,2),
Income = round(rnorm(10,-5,5),2),
Happiness = sample(10,10),
Sex = round(rnorm(10,0.75,0.3),2),
Age = sample(100,100),
Educ = round(rnorm(10,0.75,0.3),2))
DT [, uniqueID := .I] # Creates a unique ID # https://stackoverflow.com/questions/11036989/replace-all-0-values-to-na
DT$some_NA_factor <- factor(DT$some_NA_factor)
DT$Group <- as.character(DT$Group)
DT2 <- copy(DT)
This is what I want to do, to convert a column (in this case colum 5 Group) to numeric if that is possible.
dfs <- c("DT", "DT2")
conv_to_num_check <- function(z) is.character(z) && (mean(grepl("^ *-?[\\d.]+(?:e-?\\d+)?$", z, perl = TRUE), na.rm=TRUE)>0.9)
for (i in length(dfs)) {
cols <- which(sapply(get(dfs[i]), conv_to_num_check))
setDT(get(dfs[i]))[, (cols) := lapply(.SD, as.numeric), .SDcols = cols]
}
But when I check the class:
class(DT$Group) # Is character
When I do:
cols <- which(sapply(DT, conv_to_num_check))
setDT(DT)[, (cols) := lapply(.SD, as.numeric), .SDcols = cols]
class(DT$Group) # Is numeric
It works.. What am I doing wrong?
Just a tiny error in the line for (i in length(dfs)), as length(dfs) just returns 2:
for (i in length(dfs)) {
print(i)
}
# [1] 2
It will work if you change it to:
for (i in seq_along(dfs)) {
cols <- which(sapply(get(dfs[i]), conv_to_num_check))
setDT(get(dfs[i]))[, (cols) := lapply(.SD, as.numeric), .SDcols = cols]
}
How can I efficiently calculate distances between (almost) consecutive rows of a large-ish (~4m rows) of a data.table? I've outlined my current approach, but it is very slow. My actual data has up to a few hundred columns. I need to calculate lags and leads for future use, so I create these and use them to calculate distances.
library(data.table)
library(proxy)
set_shift_col <- function(df, shift_dir, shift_num, data_cols, byvars = NULL){
df[, (paste0(data_cols, "_", shift_dir, shift_num)) := shift(.SD, shift_num, fill = NA, type = shift_dir), byvars, .SDcols = data_cols]
}
set_shift_dist <- function(dt, shift_dir, shift_num, data_cols){
stopifnot(shift_dir %in% c("lag", "lead"))
shift_str <- paste0(shift_dir, shift_num)
dt[, (paste0("dist", "_", shift_str)) := as.numeric(
proxy::dist(
rbindlist(list(
.SD[,data_cols, with=FALSE],
.SD[, paste0(data_cols, "_" , shift_str), with=FALSE]
), use.names = FALSE),
method = "cosine")
), 1:nrow(dt)]
}
n <- 10000
test_data <- data.table(a = rnorm(n), b = rnorm(n), c = rnorm(n), d = rnorm(n))
cols <- c("a", "b", "c", "d")
set_shift_col(test_data, "lag", 1, cols)
set_shift_col(test_data, "lag", 2, cols)
set_shift_col(test_data, "lead", 1, cols)
set_shift_col(test_data, "lead", 2, cols)
set_shift_dist(test_data, "lag", 1, cols)
I'm sure this is a very inefficient approach, any suggestions would be appreciated!
You aren't using the vectorisation efficiencies in the proxy::dist function - rather than call it once for each row you can get all the distances you need from a single call.
Try this replacement function and compare the speed:
set_shift_dist2 <- function(dt, shift_dir, shift_num, data_cols){
stopifnot(shift_dir %in% c("lag", "lead"))
shift_str <- paste0(shift_dir, shift_num)
dt[, (paste0("dist2", "_", shift_str)) := proxy::dist(
.SD[,data_cols, with=FALSE],
.SD[, paste0(data_cols, "_" , shift_str), with=FALSE],
method = "cosine",
pairwise = TRUE
)]
}
You could also do it in one go without storing copies of the data in the table
test_data[, dist_lag1 := proxy::dist(
.SD,
as.data.table(shift(.SD, 1)),
pairwise = TRUE,
method = 'cosine'
), .SDcols = c('a', 'b', 'c', 'd')]
I want to apply function to portion of a table.
With data.frame, no problem:
df <- data.frame(name = paste("a", 1:10, sep = "-"),
x = 1:10,
y = rep(1:5),
z = rep(1:2, each = 5))
df[2:5, -1] <- scale(df[2:5, -1], center = c(1,2,3), scale = c(4,5,6))
But data.table complains:
dt <- data.table(name = paste("a", 1:10, sep = "-"),
x = 1:10,
y = rep(1:5),
z = rep(1:2, each = 5))
dt[2:5, -1] <- scale(dt[2:5, -1], center = c(1,2,3), scale = c(4,5,6))
Error in [<-.data.table(*tmp*, 2:5, -1, value = c(0.25, 0.5, 0.75, :
Item 1 of column numbers in j is -1 which is outside range [1,ncol=4]. Use column names
instead in j to add new columns.
What is the correct way in data.table? Thanks!
data.table needs more work to apply scale :
library(data.table)
cols <- names(dt)[-1]
dt[, (cols) := lapply(.SD, as.numeric), .SDcols = cols]
dt[2:5, (cols) := Map(scale, .SD, c(1,2,3), c(4,5,6)), .SDcols = cols]
I'm looking for an efficient way to paste/combine multiple pairs of adjacent columns at once using data.table. My feeble attempt is slow and not so elegant:
library(data.table)
dt <- data.table(ids = 1:3,
x1 = c("A","B","C"),
x2 = 1:3,
y1 = c("D", "E", "F"),
y2 = 4:6,
z1 = c("G", "H", "I"),
z3 = 7:9)
paste.pairs <- function(x, sep = "-"){
xx <- unlist(x)
x.len <- length(x)
r <- rep(NA, x.len/2)
s <- seq(1, x.len, by = 2)
for(i in 1:(x.len/2)) {
r[i] <- paste(xx[i], xx[i+1], sep = sep)
}
return(as.list(r))
}
dt[, paste.pairs(.SD), by = "ids"]
Is there a better way?
An option with Map by creating column index with seq
i1 <- seq(1, length(dt)-1, 2)
i2 <- seq(2, length(dt)-1, 2)
dt[, Map(paste,
.SD[, i1, with = FALSE], .SD[, i2, with = FALSE],
MoreArgs = list(sep="-")),
by = "ids"]
Another option would be to split by the names of the dataset and then paste
data.frame(lapply(split.default(dt[, -1, with = FALSE],
sub("\\d+$", "", names(dt)[-1])), function(x) do.call(paste, c(x, sep="-"))))
# x y z
#1 A-1 D-4 G-7
#2 B-2 E-5 H-8
#3 C-3 F-6 I-9
Or another option is with melt/dcast
dcast(melt(dt, id.var = 'ids')[, paste(value, collapse = "-"),
.(grp = sub("\\d+", "", variable), ids)], ids ~ grp, value.var = 'V1')
a solution using matrices
#create matrices
#use the columns you want to paste together...
m1 <- as.matrix( dt[,c(2,4,6)] )
m2 <- as.matrix( dt[, c(3,5,7)] )
#paste the matrices element-by-element, and convert result back to data.table
as.data.table( matrix( paste( m1, m2, sep="-"), nrow=nrow(m1), dimnames=dimnames(m1) ) )
Should run pretty fast, and is very readable and easy to adapt.
output
# x1 y1 z1
# 1: A-1 D-4 G-7
# 2: B-2 E-5 H-8
# 3: C-3 F-6 I-9
benchmarks
microbenchmark::microbenchmark(
wimpel = {
#create matrices
m1 <- as.matrix( dt[,c(2,4,6)] )
m2 <- as.matrix( dt[, c(3,5,7)] )
#paste the matrices element-by-element, and comvert to data.table
as.data.table( matrix( paste( m1, m2, sep="-"), nrow=nrow(m1), dimnames=dimnames(m1) ) )
},
akrun_df = {
data.frame(lapply(split.default(dt[, -1, with = FALSE],
sub("\\d+$", "", names(dt)[-1])), function(x) do.call(paste, c(x, sep="-"))))
},
akrun_map = {
i1 <- seq(2, length(dt), 2)
i2 <- seq(3, length(dt), 2)
dt[, Map(paste, .SD[, i1, with = FALSE], .SD[, i2, with = FALSE], MoreArgs = list(sep="-"))]
},
akrun_dcast = {
dcast(melt(dt, id.var = 'ids')[, paste(value, collapse = "-"),.(grp = sub("\\d+", "", variable), ids)], ids ~ grp, value.var = 'V1')
},
times = 10 )
# Unit: microseconds
# expr min lq mean median uq max neval
# wimpel 303.072 315.122 341.2417 319.1895 327.775 531.429 10
# akrun_df 1022.790 1028.515 1251.7812 1069.1850 1172.519 2779.460 10
# akrun_map 742.013 751.051 785.6059 778.1650 799.855 884.812 10
# akrun_dcast 4104.719 4175.215 4414.6596 4348.7430 4650.911 4939.221 10
I have a set of data and I need to add multiple extra columns to rank the existing data. I am doing this by adding one extra column at a time but I was hoping for a more efficient way by passing in the columns as a character vector? Here is a simple example:
require(data.table)
dt <- data.table(x = rnorm(10),
y = rnorm(10))
dt[, ":=" (rank_x = rank(x, ties.method = "min"),
rank_y = rank(y, ties.method = "min"))]
The ranking method is the same in all cases so I was hoping to use something like
cols <- c("x", "y")
dt[, cols := lapply(.SD, function(x) rank(x, ties.method = "min")), .SDcols = cols]
We can do this with paste to create new variables
dt[, paste0("rank_", cols) := lapply(.SD, rank, ties.method = "min"), .SDcols = cols]