I'm having a hard time solving this issue: for a given data.table, can I filter all rows that pass a criterion on all columns?
example:
dt <-data.table(col_a = c(1,1,0,0,1),
col_b = c(50,0,0,1,0),
col_c = c(0,0,0,0,0),
col_d = c(0,0,0,0,0),
col_e = c(1,0,0,0,10))
I want to return the rows that match filter <- c(T,F,F,F,T) - so row number 5.
I've tried dt[, filter] - it tells me that 'filter' is not found.
I also tried dt[, c(T,F,F,F,T)], but this just returns the logical vector itself: [1] TRUE FALSE FALSE FALSE TRUE
Can I solve this by only using data.table?
It is unclear from the description of the post. Based on the comments, the OP wants to select the rows that match the values in filter. To do that, first convert the columns to logical, replicate filter so the dimensions match before doing the == comparison, get the rowSums, and check whether it equals the ncol of the original dataset to subset the rows:
dt[rowSums(dt[, lapply(.SD, as.logical)] == filter[col(dt)])== ncol(dt)]
# col_a col_b col_c col_d col_e
#1: 1 0 0 0 10
Or another option is to paste each row into a single string and then compare:
dt[dt[, do.call(paste0, lapply(.SD, function(x) +(as.logical(x))))]
== paste(+(filter), collapse = "")]
Or another approach is to loop through the columns, store the boolean comparison output as a list of vectors, and Reduce:
lst1 <- vector('list', ncol(dt))
for(j in seq_along(dt)) lst1[[j]] <- as.logical(dt[[j]]) == filter[j]
dt[Reduce(`&`, lst1)]
Or a similar approach with Map/Reduce
dt[dt[, Reduce(`&`, Map(`==`, lapply(.SD, as.logical), filter))]]
Considering the size of your actual dataset, you might be better off converting it into a long format and then performing the filtering:
ans <- melt(DT[, rn := .I], id.vars="rn")[,
value := as.logical(value)][,
if (all(value==filter)) rn, rn]$V1
one timing:
library(data.table)
set.seed(0L)
nc <- 392
nr <- 2e6
filter <- sample(c(1,0), nc, TRUE)
loc <- which(filter>0L)
M <- matrix(sample(c(1,0), nc*nr, TRUE), nrow=nr)
DT <- as.data.table(M)
system.time({
ans <- melt(DT[, rn := .I], id.vars="rn")[,
value := as.logical(value)][,
if (all(value==filter)) rn, rn]$V1
})
# user system elapsed
# 2.20 0.84 1.72
Some other options, though not as fast as converting into a long format:
library(Matrix)
library(data.table)
library(microbenchmark)
set.seed(0L)
nc <- 392
nr <- 1e5
filter <- sample(c(1,0), nc, TRUE)
loc <- which(filter>0L)
M <- matrix(sample(c(1,0), nc*nr, TRUE), nrow=nr)
DT <- as.data.table(M)
# filter <- c(T,F,F,F,T)
# DT <- data.table(c(1,1,0,0,1), c(50,0,0,1,0), c(0,0,0,0,0), c(0,0,0,0,0), c(1,0,0,0,10))
# M <- as.matrix(DT)
loc <- which(filter>0L)
sumF <- sum(filter)
DTo_f <- copy(DT)
DTj_f <- copy(DT)
# Sparse matrix
sm_f <- function() {
sM <- as(M, "dgTMatrix")
ixDT <- data.table(R=sM@i+1L, C=sM@j+1L, I=1L)
univ <- data.table(R=rep(1:nr, each=length(loc)), C=rep(loc, nr), U=1L)
mgDT <- merge(univ, ixDT, by=c("R", "C"), all=TRUE)
mgDT[, if(!(anyNA(U) | anyNA(I))) R, R]$V1
}
#melt
m_f <- function() {
melt(DT[, rn := .I], id.vars="rn")[,
value := as.logical(value)][,
if (all(value==filter)) rn, rn]$V1
}
#order
o_f <- function() {
non0 <- DTo_f[, {
m <- as.matrix(.SD)
ri <- replace(col(.SD), .SD==0L, NA_integer_)
as.data.table(matrix(ri[order(row(.SD), ri, na.last=TRUE)], nrow=.N, byrow=TRUE))
}]
non0[setNames(as.list(c(loc, rep(NA_integer_, nc - length(loc)))), names(DTo_f)),
on=.NATURAL, which=TRUE]
}
#join
j_f <- function() {
setindexv(DTj_f, names(DTj_f))
DTj_f[, names(DTj_f) := lapply(DTj_f, as.logical)]
DTj_f[as.list(as.logical(filter)), on=names(DTj_f), which=TRUE]
}
microbenchmark(sm_f(), m_f(), o_f(), j_f(), times=1L)
timings:
Unit: seconds
expr min lq mean median uq max neval
sm_f() 9.134432 9.134432 9.134432 9.134432 9.134432 9.134432 1
m_f() 2.020081 2.020081 2.020081 2.020081 2.020081 2.020081 1
o_f() 3.413685 3.413685 3.413685 3.413685 3.413685 3.413685 1
j_f() 7.149763 7.149763 7.149763 7.149763 7.149763 7.149763 1
You can use which(colSums((df>0)==filter)==nrow(df)) to get the index (where df is the dt from the question)
> which(colSums((df>0)==filter)==nrow(df))
col_e
5
such that
> df[which(colSums((df>0)==filter)==nrow(df))]
col_a col_b col_c col_d col_e
1: 1 0 0 0 10
If I understand the question correctly, this should answer it.
Reproduce your data:
library(data.table)
dt <-data.table(col_a = c(1,1,0,0,1),
col_b = c(50,0,0,1,0),
col_c = c(0,0,0,0,0),
col_d = c(0,0,0,0,0),
col_e = c(1,0,0,0,10))
filter<-c(T,F,F,F,T)
Now create a variable that checks for non-zero values in each row, and subset accordingly:
to_subset = apply(dt, 1, function(x) {
all((x > 0) == filter)
})
# the output you are looking for
dt[to_subset]
# col_a col_b col_c col_d col_e
# 1: 1 0 0 0 10
The code can be collapsed to be more concise.
dt[apply(dt, 1, function(x) all((x > 0) == filter))]
# col_a col_b col_c col_d col_e
# 1: 1 0 0 0 10
I'm trying to build an efficient for loop for this function proposed by minem here: (Data.table: how to get the blazingly fast subsets it promises and apply to a second data.table)
My data are:
library(dplyr)
library(tidyr)
library(lubridate)
library(data.table)
adherence <- cbind.data.frame(c("1", "2", "3", "1", "2", "3"), c("2013-01-01", "2013-01-01", "2013-01-01", "2013-02-01", "2013-02-01", "2013-02-01"))
names(adherence)[1] <- "ID"
names(adherence)[2] <- "year"
adherence$year <- ymd(adherence$year)
lsr <- cbind.data.frame(
c("1", "1", "1", "2", "2", "2", "3", "3"), #ID
c("2012-03-01", "2012-08-02", "2013-01-06","2012-08-25", "2013-03-22", "2013-09-15", "2011-01-01", "2013-01-05"), #eksd
c("60", "90", "90", "60", "120", "60", "30", "90") # DDD
)
names(lsr)[1] <- "ID"
names(lsr)[2] <- "eksd"
names(lsr)[3] <- "DDD"
lsr$eksd <- as.Date((lsr$eksd))
lsr$DDD <- as.numeric(as.character(lsr$DDD))
lsr$ENDDATE <- lsr$eksd + lsr$DDD
lsr <- as.data.table(lsr)
adherence <- as.data.table(adherence)
The function proposed by minem is:
by_minem2 <- function(dt = lsr2) {
d <- as.numeric(as.Date("2013-02-01"))
dt[, ENDDATE2 := as.numeric(ENDDATE)]
x <- dt[eksd <= d & ENDDATE > d, sum(ENDDATE2 - d), keyby = ID]
uid <- unique(dt$ID)
id2 <- setdiff(uid, x$ID)
id2 <- uid[!(uid %in% x$ID)]
x2 <- data.table(ID = id2, V1 = 0)
x <- rbind(x, x2)
setkey(x, ID)
x
}
This returns:
> by_minem2(lsr)
ID V1
1: 1 64
2: 2 0
3: 3 63
For the loop I need to include information about which time I evaluated at, so the ideal repeated output looks like this:
cbind(as.Date("2013-02-01"),by_minem2(lsr))
I then want to repeat this for different dates a few hundred times, putting everything into the same data.table:
time.months <- as.Date("2013-02-01")+(365.25/12)*(0:192) #dates to evaluate at
I'm trying to do this with a for loop like this:
for (d in min(time.months):max(time.months))
{
by_minem <- function(dt = lsr2) {
d <- as.numeric(d)
dt[, ENDDATE2 := as.numeric(ENDDATE)]
x <- dt[eksd <= d & ENDDATE > d, sum(ENDDATE2 - d), keyby = ID]
uid <- unique(dt$ID)
id2 <- setdiff(uid, x$ID)
id2 <- uid[!(uid %in% x$ID)]
x2 <- data.table(ID = id2, V1 = 0)
x <- rbind(x, x2)
setkey(x, ID)
xtot <- append(xtot,x)
xtot <- cbind(d, xtot) # i need to know time of evaluation
xtot
}
}
As indicated in the answer to the related question Data.table: how to get the blazingly fast subsets it promises and apply to a second data.table, this can be solved by updating in a non-equi join, which is possible with data.table.
The difference to the linked question is that here we need to create the cross join CJ() of all unique IDs with the vector of dates on our own before joining with lsr.
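For reference, a toy illustration of CJ() with made-up values (not the OP's data), just to show the shape of the cross join:
library(data.table)
# CJ() returns every combination of its arguments as a sorted, keyed data.table
CJ(ID = c(1L, 2L), date = as.Date(c("2013-02-01", "2013-03-01")))
#    ID       date
# 1:  1 2013-02-01
# 2:  1 2013-03-01
# 3:  2 2013-02-01
# 4:  2 2013-03-01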
The OP has provided a series of dates time.months whose definition
time.months <- as.Date("2013-02-01")+(365.25/12)*(0:192) #dates to evaluate at
leads to "crooked" dates, which only becomes visible when they are coerced to numeric or POSIXct:
head(lubridate::as_datetime(time.months))
[1] "2013-02-01 00:00:00 UTC" "2013-03-03 10:30:00 UTC" "2013-04-02 21:00:00 UTC"
[4] "2013-05-03 07:30:00 UTC" "2013-06-02 18:00:00 UTC" "2013-07-03 04:30:00 UTC"
The issue is that these "dates" are not aligned with midnight but start somewhere during the day. To avoid these ambiguities, the seq() function can be used
dates <- seq(as.Date("2013-02-01"), length.out = 193, by = "month")
which creates a series of dates starting on the first day of each month.
In addition, data.table's IDate class is used, which stores dates as integers (4 bytes) instead of doubles (8 bytes). This saves memory as well as processing time because the usually faster integer arithmetic can be used.
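A quick way to see that storage difference (an illustrative check, not from the original answer):
typeof(as.Date("2013-02-01"))   # "double"  -- base Date is backed by a double
typeof(as.IDate("2013-02-01"))  # "integer" -- IDate is backed by an integer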
# coerce Date to IDate
idates <- as.IDate(dates)
setDT(lsr)[, eksd := as.IDate(eksd)][, ENDDATE := as.IDate(ENDDATE)]
# cross join unique IDs with dates
CJ(ID = lsr$ID, date = idates, unique = TRUE)[
# initialize result column
, AH := 0L][
# non-equi join and ...
lsr, on = .(ID, date >= eksd, date < ENDDATE),
# ... update only matching rows
AH := as.integer(ENDDATE - x.date)][
# reshape from long to wide format
, dcast(.SD, ID ~ date)]
ID 2013-02-01 2013-03-01 2013-04-01 2013-05-01 2013-06-01 2013-07-01 2013-08-01 [...]
1: 1 64 36 5 0 0 0 0
2: 2 0 0 110 80 49 19 0
3: 3 63 35 4 0 0 0 0
Caveat
Note that the above code assumes that the intervals [eksd, ENDDATE) for each ID do not overlap. This can be verified by
lsr[order(eksd), all(eksd - shift(ENDDATE, fill = 0) > 0), keyby = ID]
ID V1
1: 1 TRUE
2: 2 TRUE
3: 3 TRUE
In case there are overlaps, the above code can be modified to aggregate within the non-equi join using by = .EACHI.
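A possible sketch of that modification (hypothetical, assuming the remaining days of all overlapping intervals should simply be summed per ID and date):
# join lsr onto the cross join and aggregate per row of the cross join (by = .EACHI);
# unmatched ID/date combinations sum to 0 because of na.rm = TRUE
cj  <- CJ(ID = lsr$ID, date = idates, unique = TRUE)
ans <- lsr[cj, on = .(ID, eksd <= date, ENDDATE > date),
           .(AH = sum(as.integer(x.ENDDATE - i.date), na.rm = TRUE)),
           by = .EACHI]
# in the join result, the column named eksd carries cj's date values
dcast(ans, ID ~ eksd, value.var = "AH")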
Benchmark
In another related question data.table by = xx How do i keep the groups of length 0 when i returns no match, the OP has pointed out that performance is crucial due to the size of his production data.
According to the OP's comment, lsr has 20 million rows and 12 columns, and the adherence dataset (which the OP is trying not to use) has 1.5 million rows and 2 columns. In another question, the OP mentions that lsr is a few hundred million rows.
minem has responded to this by providing a benchmark in his answer. We can use this benchmark data to compare the different answers.
# create benchmark data
lsr <- data.frame(
ID = c("1", "1", "1", "2", "2", "2", "3", "3"),
eksd = as.Date(c("2012-03-01", "2012-08-02", "2013-01-06","2012-08-25", "2013-03-22", "2013-09-15", "2011-01-01", "2013-01-05")),
DDD = as.integer(c("60", "90", "90", "60", "120", "60", "30", "90")),
stringsAsFactors = FALSE)
lsr$ENDDATE <- lsr$eksd + lsr$DDD
n <- 5e4
lsr2 <- lapply(1:n, function(x) lsr)
lsr2 <- rbindlist(lsr2, use.names = T, fill = T, idcol = T)
lsr2[, ID := as.integer(paste0(.id, ID))]
Thus, the benchmark dataset consists of 400 k rows and 150 k unique IDs:
lsr2[, .(.N, uniqueN(ID))]
N V2
1: 400000 150000
# pull data preparation out of the benchmark
lsr2i <- copy(lsr2)[, eksd := as.IDate(eksd)][, ENDDATE := as.IDate(ENDDATE)]
lsr2[, ENDDATE2 := as.numeric(ENDDATE)]
# define date series
dates <- seq(as.Date("2013-02-01"), length.out = 193, by = "month")
idates <- seq(as.IDate("2013-02-01"), length.out = 193, by = "month")
# run benchmark
library(microbenchmark)
bm <- microbenchmark(
minem = {
dt <- copy(lsr2)
xtot <- lapply(dates, function(d) {
d <- as.numeric(d)
x <- dt[eksd <= d & ENDDATE > d, sum(ENDDATE2 - d), keyby = ID]
uid <- unique(dt$ID)
id2 <- setdiff(uid, x$ID)
id2 <- uid[!(uid %in% x$ID)]
if (length(id2) > 0) {
x2 <- data.table(ID = id2, V1 = 0)
x <- rbind(x, x2)
}
setkey(x, ID)
x
})
for (x in seq_along(xtot)) {
setnames(xtot[[x]], c("ID", paste0("V", x)))
}
xtot <- Reduce(function(...) merge(..., all = TRUE, by = "ID"), xtot)
xtot
},
uwe = {
dt <- copy(lsr2i)
CJ(ID = dt$ID, date = idates, unique = TRUE)[, AH := 0L][
dt, on = .(ID, date >= eksd, date < ENDDATE),
AH := as.integer(ENDDATE - x.date)][, dcast(.SD, ID ~ date)]
},
times = 1L
)
print(bm)
The result for one run shows that the non-equi join is more than 4 times faster than the lapply() approach.
Unit: seconds
expr min lq mean median uq max neval
minem 27.654703 27.654703 27.654703 27.654703 27.654703 27.654703 1
uwe 5.958907 5.958907 5.958907 5.958907 5.958907 5.958907 1
Something like this:
dt <- lsr
dt[, ENDDATE2 := as.numeric(ENDDATE)]
s <- time.months
xtot <- lapply(s, function(d) {
d <- as.numeric(d)
x <- dt[eksd <= d & ENDDATE > d, sum(ENDDATE2 - d), keyby = ID]
uid <- unique(dt$ID)
id2 <- setdiff(uid, x$ID)
id2 <- uid[!(uid %in% x$ID)]
if (length(id2) > 0) {
x2 <- data.table(ID = id2, V1 = 0)
x <- rbind(x, x2)
}
setkey(x, ID)
x
})
for (x in seq_along(xtot)) {
setnames(xtot[[x]], c("ID", paste0("V", x)))
}
xtot <- Reduce(function(...) merge(..., all = TRUE, by = "ID"), xtot)
xtot
My data.frame
df <- data.frame(ID=rep(1:3, 3), Obs_1=rnorm(9), Obs_2=rnorm(9), Obs_3=rnorm(9))
I want to calculate the mean of each column by ID. I have tried two approaches:
1)
ag <- aggregate(. ~ ID, df, function(x) c(mean = mean(x)))
returns
ID Obs_1 Obs_2 Obs_3
1 1 0.41220831 -0.9999704 -0.7234958
2 2 0.03564336 0.5014259 0.4847635
3 3 0.05647885 0.2067311 -0.0542368
Why does it not indicate "mean" in the column names, and how can I make it report this? Something like this:
Obs_1.mean Obs_2.mean Obs_3.mean
2) the second approach
df[, c(mean = lapply(.SD, mean), sd = lapply(.SD, sd)), by = ID]
it gives
unused argument (by = ID)
Thank you
Here is one approach:
df <- data.frame(ID=rep(1:3, 3),
Obs_1=rnorm(9),
Obs_2=rnorm(9),
Obs_3=rnorm(9))
ag <- aggregate(. ~ ID, df, FUN = mean)
colnames(ag) <- sapply(colnames(ag),
FUN = function(x) { paste0(x, ifelse(x=="ID", "", ".mean")) })
It yields:
ID Obs_1.mean Obs_2.mean Obs_3.mean
1 1 0.2675131 -0.3494175 -0.3128264
2 2 1.0988356 -0.6645116 -0.2192402
3 3 -1.1569562 -0.4040192 -0.1265475
About your question why the trick
FUN = function(x) { c(mean = mean(x)) })
to stick the name "mean" to the result of FUN does not work:
I suppose that is due to the grouping that aggregate() performs. Each group might in theory return a different name. Which of these potentially conflicting names should aggregate() then use to name the complete column of the aggregated data frame?
If no grouping happens, like here,
sapply(df, FUN = function(x) { c(mean = mean(x)) })
we get the expected names:
ID.mean Obs_1.mean Obs_2.mean Obs_3.mean
2.0000000 0.4265256 -0.2046452 -0.4496054
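As a side note on the second attempt in the question: the [, ..., by = ID] form is data.table syntax, so calling it on a plain data.frame gives the "unused argument (by = ID)" error. A minimal sketch of how it could look once the data is converted (the mean./sd. name prefixes come from c() acting on named lists):
library(data.table)
dtbl <- as.data.table(df)   # the by = ID syntax requires a data.table
dtbl[, c(mean = lapply(.SD, mean),
         sd   = lapply(.SD, sd)), by = ID]
# columns come out named mean.Obs_1, ..., sd.Obs_3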
I have two data frames.
set.seed(1234)
df <- data.frame(
id = factor(rep(1:24, each = 10)),
price = runif(20)*100,
quantity = sample(1:100,240, replace = T)
)
df2 <- data.frame(
id = factor(seq(1:24)),
eq.quantity = sample(1:100, 24, replace = T)
)
I would like to use df2$eq.quantity to find the value of df$quantity that is closest in absolute difference, by the factor variable id. I would like to do that for each id in df2 and bind the results into a new data frame called results.
I can do it like this for each individual ID:
d.1 <- df2[df2$id == 1, 2]
df.1 <- subset(df, id == 1)
id.1 <- df.1[which.min(abs(df.1$quantity-d.1)),]
Which would give the solution:
id price quantity
1 66.60838 84
But I would really like a smarter solution, and also to gather the results into a data frame; done manually, it would look something like this:
results <- cbind(id.1, id.2, etc..., id.24)
I had some trouble giving this question a good title.
data.tables are smart!
Adding this to your current example...
library(data.table)
dt = data.table(df)
dt2 = data.table(df2)
setkey(dt, id)
setkey(dt2, id)
dt[dt2, dif:=abs(quantity - eq.quantity)]
dt[,list(price=price[which.min(dif)], quantity=quantity[which.min(dif)]), by=id]
result:
dt[,list(price=price[which.min(dif)], quantity=quantity[which.min(dif)]), by=id]
id price quantity
1: 1 66.6083758 84
2: 2 29.2315840 19
3: 3 62.3379442 63
4: 4 54.4974836 31
5: 5 66.6083758 6
6: 6 69.3591292 13
...
Merge the two datasets and use lapply to perform the function on each id.
df3 <- merge(df,df2,all.x=TRUE,by="id")
diffvar <- function(df){
df4 <- subset(df3, id == df)
df4[which.min(abs(df4$quantity-df4$eq.quantity)),]
}
resultslist <- lapply(levels(df3$id),function(df) diffvar(df))
Combine the resulting list elements in a dataframe:
resultsdf <- data.frame(matrix(unlist(resultslist), ncol=4, byrow=T))
Or, more easily:
library(plyr)
resultsdf <- ddply(df3, .(id), function(x)x[which.min(abs(x$quantity-x$eq.quantity)),])
For different values of id I have start and end dates with an associated quantity, var.
For each record (within the same id), the start date is the same as the previous record's end date (this is where roll comes in...).
These periods span multiple months and possibly years. My need is to split the quantity in var into parts proportional to the actual days falling in each month, e.g.
start end var
30/01/2006 20/02/2006 104
Above I have 21 days; the lower limit belongs to the previous period and the upper to the current one, so 1/21 of 104 is assigned to Jan 2006 and the rest to Feb 2006.
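A quick check of that arithmetic (assuming days are counted exclusive of the start date and inclusive of the end date, as the two methods below do):
days <- as.integer(as.Date("2006-02-20") - as.Date("2006-01-30"))  # 21
104 * 1 / days           # January share (only 31 Jan falls in Jan): ~4.95
104 * (days - 1) / days  # February share (the remaining 20 days):  ~99.05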
I currently have two methods, listed below with dummy data, but they are pretty slow and I was wondering if someone could help me speed them up.
library(data.table)
# data
set.seed(1)
nsample <- 200L # To increase the data size just change nsample
dt <- data.table(id= 1L:nsample)
dt <- dt[, list(date=sample(seq(as.Date("2006-01-01"), as.Date("2012-01-01"), "day"), 51, F)), by=id]
setkey(dt)
dt <- dt[, {tmp <- embed(as.vector(date), 2);list(start = structure(tmp[,2], class="Date"),
end = structure(tmp[,1], class="Date"),
var = rnorm(50, 100, 5))}, by=id]
setkey(dt, id, end)
> dt[1:4]
id start end var
1: 1 2006-01-30 2006-02-20 104.41542
2: 1 2006-02-20 2006-05-15 106.89356
3: 1 2006-05-15 2006-08-21 106.71162
4: 1 2006-08-21 2006-09-30 96.21729
# Method 1
dt1 <- copy(dt)
system.time({
dt1[, id2 := 1:.N]
tmp <- dt1[, list(id = id,
date = seq(start+1, end, "day"),
var = var), by=id2]
tmp[, var := var/(.N), by=id2]
res1 <- tmp[, list(var = sum(var)), by=list(id, period = paste(year(date), month(date), sep="-"))]
})
#user system elapsed
#1.92 0.00 1.92
# Method 2
dt2 <- copy(dt)
system.time({
dt2[, Ndays := as.integer(end)-as.integer(start)]
tmp <- dt2[, list(date = seq(min(start)+1, max(end), "day")), by=id]
setkey(tmp)
res2 <- dt2[ tmp, roll=-Inf][ end >= start,list(var = sum(var/Ndays)), by=list(id, period = paste(year(end), month(end), sep="-")) ]
})
#user system elapsed
# 0.7 0.0 0.7
> sum(dt$var) == sum(res1$var)
[1] TRUE
> sum(dt$var) == sum(res2$var)
[1] TRUE
> all.equal(res1, res2)
[1] TRUE
> res2[1:4]
id period var
1: 1 2006-1 4.972163
2: 1 2006-2 109.623593
3: 1 2006-3 39.448815
4: 1 2006-4 38.176273
This will be a bit faster (it's 3x faster for me than your second version). I optimized several things in your second version, which you can see below:
# let's just divide here instead of later
dt2[, var := var/(as.integer(end)-as.integer(start))]
tmp <- dt2[, list(date = seq(min(start)+1, max(end), "day")), by=id]
# data is sorted, so no need to sort again, just set key without sort
setattr(tmp, "sorted", c("id", "date"))
res2 <- dt2[tmp, roll=-Inf][,
list(var = sum(var)),
# doing the paste in by slows it down quite a bit, so let's postpone it
by=list(id, year(end), month(end))][,
`:=`(period = paste(year, month, sep = '-'), year = NULL, month = NULL)]
Re the comment about large sizes - you could do all of the above inside dt2. It'll be slower, but it won't create a large tmp:
dt2[, var := var/(as.integer(end)-as.integer(start))][,
{tmp = data.table(date = seq(min(start)+1, max(end), "day"));
setattr(tmp, 'sorted', 'date');
setattr(.SD, 'sorted', 'end');
.SD[tmp, roll = -Inf][,
list(var = sum(var)), by = list(year(end), month(end))][,
`:=`(period = paste(year, month, sep = '-'), year = NULL, month = NULL)]
}, by = id]