Find nearest value by group - r

I am looking for a practical way to retrieve the nearest value to 0 for each group using (preferably) data.table.
Assume the following DT:
set.seed(1)
library(data.table)
DT <- data.table(val = rnorm(1000), group = rep(1:10, each = 10)) # 10 groups
I have tried to combine both by = group and roll = "nearest", but it only returns the nearest value overall, not by group:
DT[val == 0, val, by = group, roll = "nearest"]
# group value
#1: 8 0.001105352
I could of course repeat the process for each group, but it would be impractical as the number of groups increases. E.g.:
res <- rbind(DT[val == 0 & group == 1, val, by = group, roll = "nearest"],
             DT[val == 0 & group == 2, val, by = group, roll = "nearest"],
             DT[val == 0 & group == 3, val, by = group, roll = "nearest"],
             ...)
Maybe I am missing some data.table feature?

You don't necessarily need a join for that.
A possible solution using a combination of min and abs:
DT[, .(closest.val.to.zero = val[abs(val) == min(abs(val))]), by = group]
which gives:
group closest.val.to.zero
1: 1 0.011292688
2: 2 -0.016190263
3: 3 0.002131860
4: 4 0.004398704
5: 5 0.017395620
6: 6 0.002415809
7: 7 0.004884450
8: 8 0.001105352
9: 9 -0.040150452
10: 10 -0.010925691
A more generalised version of the option posted by @chinsoon12 in the comments:
DT[CJ(group = group, val = 0, unique = TRUE)
, on = .(group, val)
, .(group, closest.val.to.zero = x.val)
, roll = "nearest"]
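
If a single row per group is enough (the first match in case of ties), which.min on the absolute values is a compact alternative; a minimal sketch along the same lines:
# which.min() returns the index of the first minimum, so ties yield only one row
DT[, .(closest.val.to.zero = val[which.min(abs(val))]), by = group]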

Related

How to adapt the following code so that any arbitrary self-defined function can work inside?

The data and the first piece of code look like this. I am applying a function over pairs of the columns (a, b, c), etc.
library(data.table)
d = data.table(time = c(1,1,2,2), a = c(1,2,3,4), b =c(4,3,2,1), c = c(1,1,1,1))
pairs = d[, data.table(t(combn(names(.SD), 2))), by = time]
pairs$i = 1:nrow(pairs) ## index column for reshaping in terms of pairs...
pairs = melt(pairs, id.vars = c('time', 'i'), value.name = 'firm')
d = melt(d, id.vars = 'time', variable.name = 'firm')
d = merge(pairs, d)
The piece I would like to adjust is the following (the sum function is applied here). Basically, this piece applies a function to each pair of columns, i.e. (a, b), (a, c) and (b, c), within each group (time 1 and 2).
result = dcast(d, time + i ~ .,
list(pair = \(x) paste(unique(x), collapse = '_'), sum),
value.var = list('firm', 'value'))
Let's say I have an arbitrary function
fun1 <- function(x, y, na.rm = FALSE) 1 - 0.5 * sum(abs(x - y))
I would like to apply fun1 instead of sum in the piece above.
By making fun.aggregate = list we can preserve all the data, and then calculate with it later.
This still might not be what you want, but I think it's progress.
result = dcast(d, time + i ~ .,
list(pair = \(x) paste(unique(x), collapse = '_'), list),
value.var = list('firm', 'value'))
fun1 <- function(x,y, na.rm = FALSE) 1 - 0.5*sum(abs(x-y))
result[, new := sapply(value_list, \(x) fun1(x[1], x[2]) + fun1(x[3], x[4]))]
result
time i firm_pair value_list new
1: 1 1 a_b 1,2,4,3 1.0
2: 1 2 a_c 1,2,1,1 1.5
3: 1 3 b_c 4,3,1,1 1.5
4: 2 4 a_b 3,4,2,1 1.0
5: 2 5 a_c 3,4,1,1 1.5
6: 2 6 b_c 2,1,1,1 1.5
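
If what you actually want is fun1 applied across each pair (one firm's values against the other's), and assuming each value_list entry keeps the two firms' values contiguous as in the output above, a hedged sketch (new2 is just an illustrative column name) would be:
# pass the first firm's values and the second firm's values to fun1 as vectors
result[, new2 := sapply(value_list, \(x) fun1(x[1:2], x[3:4]))]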

Summary statistics from aggregated groups using data.table

I have a dataset with this structure:
library(data.table)
dt <- data.table(
record=c(1:20),
area=rep(LETTERS[1:4], c(4, 6, 3, 7)),
score=c(1,1:3,2:3,1,1,1,2,2,1,2,1,1,1,1,1:3),
cluster=c("X", "Y", "Z")[c(1,1:3,3,2,1,1:3,1,1:3,3,3,3,1:3)]
)
I would like to aggregate the data so I can identify the most common cluster in each area for a given score (for example 1). I would also like some basic frequencies and percentages to be calculated with an output looking something like this:
dt_summary_for_1_score <- data.table(
area=c("A","B","C","D"),
cluster_mode=c("X","X","X","Z"),
cluster_pct = c(100,66.6,100,80),
cluster_freq = c(2,2,1,4),
record_freq = c(2,3,1,5)
)
Ideally I would like a solution that uses data.table. Thanks.
I would leverage frank, though a solution with sort(table(cluster)) is possible as well.
dt_summary =
dt[ , .N, keyby = .(area, score, cluster)
][ , {
idx = frank(-N, ties.method = 'min') == 1
NN = sum(N)
.(
cluster_mode = cluster[idx],
cluster_pct = 100*N[idx]/NN,
cluster_freq = N[idx],
record_freq = NN
)
}, by = .(area, score)]
To get the example with score == 1 we can subset this:
dt_summary[score == 1]
# area score cluster_mode cluster_pct cluster_freq record_freq
# 1: A 1 X 100.00000 2 2
# 2: B 1 X 66.66667 2 3
# 3: C 1 X 100.00000 1 1
# 4: D 1 Z 80.00000 4 5
This returns multiple rows per group in the case of ties. You might try something like cluster_mode = paste(cluster[idx], collapse = '|') or cluster_mode = list(cluster[idx]) instead for alternatives.
Breaking down the logic:
# Count how many times each cluster shows up with each area/score
dt[ , .N, keyby = .(area, score, cluster)
][ , {
# Rank each cluster's count within each area/score & take the top;
# ties.method = 'min' guarantees that if there's
# a tie for "winner", _both_ will get rank 1
# (by default, ties.method = 'average')
# Note that it is slightly inefficient to negate N
# in order to sort in descending order, especially
# if there are a large number of groups. We could
# either vectorize negation by using -.N in the
# previous step or by using frankv (a lower-level
# version of frank) which has an 'order' argument
idx = frank(-N, ties.method = 'min') == 1
# calculate here since it's used twice
NN = sum(N)
.(
# use [idx] to subset and make sure there are
# only as many rows on output as there are
# top-ranked clusters for this area/score
cluster_mode = cluster[idx],
cluster_pct = 100*N[idx]/NN,
cluster_freq = N[idx],
record_freq = NN
)
}, by = .(area, score)]
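
For completeness, here is a minimal sketch of the sort(table(cluster)) route mentioned at the top, run directly on the score == 1 subset; note that it keeps only a single winner in case of ties:
dt[score == 1, {
  # count clusters within this area and sort descending by frequency
  tab = sort(table(cluster), decreasing = TRUE)
  .(cluster_mode = names(tab)[1L],
    cluster_pct  = 100 * tab[[1L]] / .N,
    cluster_freq = tab[[1L]],
    record_freq  = .N)
}, by = area]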

Lapply to all columns in a data.frame except one and replace the data in R

In the data frame "days", I want to apply the function called 'round.numbers' to all columns except the column named 'id'.
According to the comment,
days[,-'id'][, lapply(X = .SD, FUN = round.numbers)]
This works successfully. However, it creates a new table instead of replacing the original data.
days[,-'id'] <- days[,-'id'][, lapply(X = .SD, FUN = round.numbers)]
Failed.
I assume you use data.table. Then you can use setdiff as in the following example:
> days = data.table(a = 1:2, b = 3:4, id = c(1, 1))
>
> days <- days[, lapply(X = .SD, FUN = identity),
+ .SDcols = setdiff(colnames(days), "id")]
> days
a b
1: 1 3
2: 2 4
or just drop id to start with
> days = data.table(a = 1:2, b = 3:4, id = c(1, 1))
> days <- days[, id := NULL][, lapply(X = .SD, FUN = identity)]
> days
a b
1: 1 3
2: 2 4
If you want to keep the id column, then this should do (I added this after seeing your comment):
> set.seed(23812349)
> days = data.table(a = rnorm(2), b = rnorm(2), id = c(1, 1))
> days
a b id
1: -1.461587 0.2130853 1
2: 1.062314 0.8523587 1
>
> .cols <- setdiff(colnames(days), "id")
> days[, (.cols) := lapply(.SD, round, digits = 1), .SDcols = .cols]
> days
a b id
1: -1.5 0.2 1
2: 1.1 0.9 1
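
Applied to the original question, the same in-place idiom would look like the sketch below (round.numbers is the OP's own function and is not defined here):
.cols <- setdiff(names(days), "id")
days[, (.cols) := lapply(.SD, round.numbers), .SDcols = .cols]  # update by reference, id untouched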

R data.table join with roll

dd = data.table(a = c(1,1), b = c(1,2), v = c(1, NA))
dd
# a b v
# 1: 1 1 1
# 2: 1 2 NA
setkey(dd, a,b)
dd[.(1,2), roll = TRUE, rollends = c(TRUE, TRUE)]
# a b v
# 1: 1 2 NA
What have I missed here? Why isn't v carried forward?
A rolling join doesn't need to roll here, because you are matching an exact existing row, (1, 2). Rolling matching happens only when there is no match on the actual values; in your case there is an exact match. See the example below, in which I modified dd so that there is no match for .(1, 2).
library(data.table)
dd = data.table(a = c(1,1), b = c(1,3), v = c(1, NA))
dd[.(1,2), roll = TRUE, rollends = c(TRUE, TRUE)]
# a b v
#1: 1 2 1
See the roll argument in ?data.table (emphasis mine):
When i is a data.table and its row matches to all but the last x join column, and its value in the last i join column falls in a gap (including after the last observation in x for that group), then:
+Inf (or TRUE) rolls the prevailing value in x forward. It is also known as last observation carried forward (LOCF)
...
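
If the actual goal is to carry v forward over the NA entry (rather than over a missing key value), one hedged option is to drop the NA rows from x before the rolling join:
dd = data.table(a = c(1, 1), b = c(1, 2), v = c(1, NA))
# join the non-NA rows back onto the full set of (a, b) keys, rolling where v was dropped
dd[!is.na(v)][dd[, .(a, b)], on = .(a, b), roll = TRUE]
#    a b v
# 1: 1 1 1
# 2: 1 2 1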

Filter out rows of group that don't fit criteria [duplicate]

This question already has answers here:
Filter dataframe by maximum values in each group [duplicate]
(2 answers)
Closed 7 years ago.
Here is the code to use for this question:
set.seed(1337)
myDT <- data.table(Key1 = sample(letters, 500, replace = TRUE),
Key2 = sample(LETTERS[1:5], 500, TRUE),
Data = sample(1:26, 500, replace = TRUE))
setkey(myDT, Key1, Key2)
# showing what myDT looks like
> myDT
Key1 Key2 Data
1: a A 6
2: a A 3
3: a B 2
4: a B 20
5: a B 13
---
496: z D 23
497: z E 3
498: z E 18
499: z E 11
500: z E 2
I would like to pare down myDT to take only the largest Data value for each (Key1, Key2) pair. E.g. (using (Key1, Key2) to denote a pair), for (a, A) I would like to get rid of the row where Data is 3 and keep the row where Data is 6. For (z, E) I would like to keep only the row where Data is 18.
While typing out this question, a solution came to me (which I'll post below) but please help me know how you would approach this problem.
My answer
myDT[order(-Data), head(.SD, 1), by = .(Key1, Key2)]
# if you are on 1.9.6 or lower use this one
myDT[order(-Data), .SD[1], by = .(Key1, Key2)]
Or from comments
unique(myDT[order(-Data)], by = c("Key1", "Key2"))
Benchmark on 50M rows.
library(dplyr)
library(data.table)
library(microbenchmark)
set.seed(1337)
n = 5e7
myDT <- data.table(Key1 = sample(letters, n, replace = TRUE),
Key2 = sample(LETTERS[1:5], n, TRUE),
Data = sample(1:26, n, replace = TRUE))
setkey(myDT, Key1, Key2)
microbenchmark(times = 10L,
CathG = myDT[, .SD[which.max(Data)], by = .(Key1, Key2)],
jangorecki = myDT[order(-Data), head(.SD, 1), by = .(Key1, Key2)],
jangorecki.keeporder = myDT[order(-Data), head(.SD, 1), keyby = .(Key1, Key2)],
nist = myDT %>% group_by(Key1,Key2) %>% summarise(Data = max(Data)),
David = unique(myDT[order(-Data)], by = c("Key1", "Key2")))
#Unit: milliseconds
# expr min lq mean median uq max neval
# CathG 659.6150 689.3035 733.9177 739.795 780.0075 811.1456 10
# jangorecki 2844.7565 3026.3385 3089.6764 3097.332 3219.1951 3343.9919 10
# jangorecki.keeporder 2935.3733 3194.1606 3232.9297 3214.581 3308.0735 3411.4319 10
# nist 803.1921 844.5002 1011.7878 1007.755 1188.6127 1228.3869 10
# David 3410.4853 3501.5918 3590.2382 3590.190 3652.8091 3803.9038 10
A previously posted benchmark on small data showed quite different results, so I would say it depends heavily on the data: not just the volume but also the cardinality (count of unique values), perhaps even more so in some cases.
Another way to do it, based on this Q, is:
myDT[, .SD[which.max(Data)], by = .(Key1, Key2)]
# Key1 Key2 Data
# 1: a A 6
# 2: a B 20
# 3: a C 25
# 4: a E 7
# 5: b A 25
#---
#119: z A 23
#120: z B 26
#121: z C 24
#122: z D 25
#123: z E 18
A faster and nicer way to solve it, using dplyr:
myDT %>% group_by(Key1,Key2) %>% summarise(Data = max(Data))
To keep all existing columns in the data, you can use slice instead of summarise:
myDT %>% group_by(Key1,Key2) %>% slice(which.max(Data))
Note that this will return exactly one row per group; in the case of ties, it will be the first row attaining the maximum of Data.
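
A comparable data.table idiom that also keeps all columns, and (unlike which.max) retains ties, could look like this sketch:
# .I with by gives the row numbers of the per-group maxima in the original table
myDT[myDT[, .I[Data == max(Data)], by = .(Key1, Key2)]$V1]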
