How to quickly count number of unique entries after merging tables - r

library(data.table)
table1 <- data.table(id1 = c(1324, 2324, 29, 29, 1010, 1010),
type = c(1, 1, 2, 1, 1, 1),
class = c("A", "A", "B", "D", "D", "A"),
number = c(1, 98, 100, 100, 70, 70))
table2 <- data.table(id2 = c(1998, 1998, 2000, 2000, 2000, 2010, 2012, 2012),
type = c(1, 1, 3, 1, 1, 5, 1, 1),
class = c("D", "A", "D", "D", "A", "B", "A", "A"),
min_number = c(34, 0, 20, 45, 5, 23, 1, 1),
max_number = c(50, 100, 100, 100, 100, 9, 10, 100))
> table1
id1 type class number
1: 1324 1 A 1
2: 2324 1 A 98
3: 29 2 B 100
4: 29 1 D 100
5: 1010 1 D 70
6: 1010 1 A 70
> table2
id2 type class min_number max_number
1: 1998 1 D 34 50
2: 1998 1 A 0 100
3: 2000 3 D 20 100
4: 2000 1 D 45 100
5: 2000 1 A 5 100
6: 2010 5 B 23 9
7: 2012 1 A 1 10
8: 2012 1 A 1 100
I have two tables, and I would like to merge them based on type, class, and whether number lies between min_number and max_number. Then I would like to create a new variable nMatch that stores the number of unique id2s that match with each id1.
setindexv(table2, c("type", "class"))
for (t1_row in seq_len(nrow(table1))) {
print(t1_row)
set(
table1, t1_row, "matches",
table2[table1[t1_row], on = c("type", "class", "max_number >= number", "min_number <= number"), .(list(id2))]
)
}
> table1[, .(nMatch = uniqueN(unlist(matches), na.rm = TRUE)), by = .(id1)]
id1 nMatch
1: 1324 2
2: 2324 3
3: 29 1
4: 1010 3
The approach above is row-by-row as suggested here, but my real dataset has millions of rows. What's another way of doing this that's faster?

You can try data.table with on = .(...) to merge two data tables
table1[
table2,
.(id1, id2),
on = .(type, class, number >= min_number, number <= max_number),
nomatch = NULL
][
,
.(nMatch = uniqueN(id2)),
id1
]
and will get
id1 nMatch
1: 1324 2
2: 1010 3
3: 2324 3
4: 29 1

An option with tidyverse
library(dplyr)
library(tidyr)
left_join(table1, table2, by =
join_by(type, class, number >= min_number, number <= max_number)) %>%
distinct(id1, id2) %>%
drop_na %>%
count(id1, name = "nMatch")
-output
id1 nMatch
<num> <int>
1: 29 1
2: 1010 3
3: 1324 2
4: 2324 3

Related

Aggregating / rolling up specific records "into" subsequent records

I am trying to aggregate records with a specific type into subsequent records.
I have a dataset similar to the following:
df_initial <- data.frame("Id" = c(1, 2, 3, 4, 5),
"Qty" = c(105, 110, 100, 115, 120),
"Type" = c("A", "B", "B", "A", "A"),
"Difference" = c(30, 34, 32, 30, 34))
After sorting on the Id field, I'd like to aggregate records of Type = "B" into the next record of type = "A".
In other words, I'm looking to create df_new, which adds the Qty and Difference values for Ids 2 and 3 into the Qty and Difference values for Id 4, and flags Id 4 as being adjusted (in the field AdjustedFlag).
df_new <- data.frame("Id" = c(1, 4, 5),
"Qty" = c(105, 325, 120),
"Type" = c("A", "A", "A"),
"Difference" = c(30, 96, 34),
"AdjustedFlag" = c(0, 1, 0))
I'd greatly appreciate any advice or ideas about how to do this in R, preferably using data.table.
A data.table solution:
df_initial[, .(
Id = Id[.N], Qty = sum(Qty),
Difference = sum(Difference),
AdjustedFlag = +(.N > 1)
), by = .(grp = rev(cumsum(rev(Type == "A"))))
][, grp := NULL][]
# Id Qty Difference AdjustedFlag
# <num> <num> <num> <int>
# 1: 1 105 30 0
# 2: 4 325 96 1
# 3: 5 120 34 0
This can be solved by creating a new grouping variable, that groups the rows into the groups you describe, with the idea being to utilize that grouping variable for the desired aggregation.
Instead of having
A B B A A
that new grouping variable should look something like this:
1 2 2 2 3
This is not a data.table solution, but the same logic could be applied there:
library(tidyverse)
df_initial |>
mutate(
type2 = ifelse(Type == "A", as.numeric(factor(Type)), 0),
type2 = cumsum(type2),
type2 = ifelse(Type == "B", NA, type2)
) |>
fill(type2, .direction = "up") |>
group_by(type2) |>
summarise(
id = max(Id),
Qty = sum(Qty),
Difference = sum(Difference),
AdjustedFlag = as.numeric(n() > 1)
)
#> # A tibble: 3 × 5
#> type2 id Qty Difference AdjustedFlag
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 105 30 0
#> 2 2 4 325 96 1
#> 3 3 5 120 34 0
Using tidyverse
df_initial %>%
mutate(gn = if_else(lag(Type, default = 'A') == 'B' | Type == 'B', 'B', Type),
gr = cumsum(lag(gn, default = 'A') != gn),
adjusted = if_else(lag(Type, default = 'A') == 'B' | Type == 'B', 1, 0)) %>%
group_by(gr) %>%
summarise(Id = last(Id),
Qty = sum(Qty),
Type = 'A',
Difference = sum(Difference),
Adjusted_flg = max(adjusted)) %>% ungroup()
Here we create an interim dataset that looks like:
Id Qty Type Difference gn gr Adjusted
1 1 105 A 30 A 0 0
2 2 110 B 34 B 1 0
3 3 100 B 32 B 1 0
4 4 115 A 30 B 1 1
5 5 120 A 34 A 2 0
And use this to create our final table within the summarise. The gr is a column for indicating a group of values, which is why we group_by it.

How to subtract value of one group from other groups in R

I am trying to subtract the value of one group from another. I am hoping to use tidyverse
structure(list(A = c(1, 1, 1, 2, 2, 2, 3, 3, 3), group = c("a",
"b", "c", "a", "b", "c", "a", "b", "c"), value = c(10, 11, 12,
11, 40, 23, 71, 72, 91)), class = "data.frame", row.names = c(NA,
-9L))
That is my data, and I want to subtract all values of group A from B and C, and store the difference in one variable.
baseR solution
df$new <- df$value - ave(df$value, df$A, FUN = function(x) mean(x[df$group == 'a'], na.rm = T) )
> df
A group value new
1 1 a 10 0
2 1 b 11 1
3 1 c 12 2
4 2 a 11 0
5 2 b 40 29
6 2 c 23 12
7 3 a 71 0
8 3 b 72 1
9 3 c 91 20
dplyr method (assumption there is not more than one a value per group, else R will confuse which value to substract and result in error)
df %>% group_by(A) %>% mutate(new = ifelse(group != 'a', value - value[group == 'a'], value) )
# A tibble: 9 x 4
# Groups: A [3]
A group value new
<dbl> <chr> <dbl> <dbl>
1 1 a 10 10
2 1 b 11 1
3 1 c 12 2
4 2 a 11 11
5 2 b 40 29
6 2 c 23 12
7 3 a 71 71
8 3 b 72 1
9 3 c 91 20
or if you want to change all values
df %>% group_by(A) %>% mutate(new = value - value[group == 'a'] )
# A tibble: 9 x 4
# Groups: A [3]
A group value new
<dbl> <chr> <dbl> <dbl>
1 1 a 10 0
2 1 b 11 1
3 1 c 12 2
4 2 a 11 0
5 2 b 40 29
6 2 c 23 12
7 3 a 71 0
8 3 b 72 1
9 3 c 91 20
I only used data.table rather than data.frame because I'm more familiar.
library(data.table)
data <- setDT(structure(list(A = c(1, 1, 1, 2, 2, 2, 3, 3, 3), group = c("a",
"b", "c", "a", "b", "c", "a", "b", "c"), value = c(10, 11, 12,
11, 40, 23, 71, 72, 91)), class = "data.frame", row.names = c(NA,-9L)))
for (i in 1:length(unique(data$A))){
data[A == i, substraction := data[A == i, 'value'] - data[A == i & group == 'a', value]]
}

Multiply columns from different data frames if dates match

I have the following two data frames:
df1 <- data.frame(Category = c("A", "A", "A", "B", "B", "B", "C", "C", "C"),
Date = c(2001, 2002, 2003, 2001, 2002, 2003, 2001, 2002, 2003),
Beta1 = c(1, 3, 4, 4, 5, 3, 5, 3, 1),
Beta2 = c(2, 4, 6, 1, 1, 2, 5, 4, 2))
df2 <- data.frame(Date = c(2001, 2002, 2003),
Column1 = c(10, 20, 30),
Column2 = c(40, 50, 60))
Say I assign category A to Column1 and and category C to Column2. I want to multiply the row value from Column1 with the row betas from category A, if the dates match. Similarly, I want to multiply the row value from Column2 with the row betas from category C, if the dates match.
The match between a category and a column is of my own choosing. Assigning this myself won’t be a problem I think because I have relatively few columns.
Preferably, I want the output to look like this:
results <- data.frame(Date = c(2001, 2002, 2003),
Column1_categoryA_beta1 = c(10, 60, 120),
Column1_categoryA_beta2 = c(20, 80, 180),
Column2_categoryC_beta1 = c(200, 150, 60),
Column2_categoryC_beta2 = c(200, 200, 120))
Any help in how I best can approach this problem is very much appreciated!
With some data wrangling using tidyr and dplyr this can be achieved like so:
df1 <- data.frame(Category = c("A", "A", "A", "B", "B", "B", "C", "C", "C"),
Date = c(2001, 2002, 2003, 2001, 2002, 2003, 2001, 2002, 2003),
Beta1 = c(1, 3, 4, 4, 5, 3, 5, 3, 1),
Beta2 = c(2, 4, 6, 1, 1, 2, 5, 4, 2))
df2 <- data.frame(Date = c(2001, 2002, 2003),
Column1 = c(10, 20, 30),
Column2 = c(40, 50, 60))
library(dplyr)
library(tidyr)
df2_long <- df2 %>%
pivot_longer(-Date, names_to = "Column", values_to = "Value") %>%
mutate(Category = ifelse(Column == "Column1", "A", "C"))
df2_long %>%
left_join(df1) %>%
mutate(Beta1 = Value * Beta1,
Beta2 = Value * Beta2) %>%
select(Date, Category, Column, Beta1, Beta2) %>%
pivot_wider(id_cols = Date, names_from = c("Column", "Category"), values_from = c("Beta1", "Beta2"))
#> Joining, by = c("Date", "Category")
#> Warning: Column `Category` joining character vector and factor, coercing into
#> character vector
#> # A tibble: 3 x 5
#> Date Beta1_Column1_A Beta1_Column2_C Beta2_Column1_A Beta2_Column2_C
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 2001 10 200 20 200
#> 2 2002 60 150 80 200
#> 3 2003 120 60 180 120
Created on 2020-04-14 by the reprex package (v0.3.0)
One way to get there while keeping the Category variable in the final data frame is the following:
df3 <- left_join(df1, df2, by="Date")
df4 <- df3 %>%
group_by(Date, Category) %>%
mutate(Col1Bet1 = Column1 * Beta1, Col1Bet2 = Column1 * Beta2, Col2Bet1 = Column2 * Beta1, Col2Bet2 = Column2 * Beta2)
which gives the following:
# A tibble: 9 x 10
# Groups: Date, Category [9]
Category Date Beta1 Beta2 Column1 Column2 Col1Bet1 Col1Bet2 Col2Bet1 Col2Bet2
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 2001 1 2 10 40 10 20 40 80
2 A 2002 3 4 20 50 60 80 150 200
3 A 2003 4 6 30 60 120 180 240 360
4 B 2001 4 1 10 40 40 10 160 40
5 B 2002 5 1 20 50 100 20 250 50
6 B 2003 3 2 30 60 90 60 180 120
7 C 2001 5 5 10 40 50 50 200 200
8 C 2002 3 4 20 50 60 80 150 200
9 C 2003 1 2 30 60 30 60 60 120
This could be a start. The result data.table has all information you want just in another format.
df3 <- merge(df1, df2)
df3$b1 <- ifelse(df3$Category=="A", df3$Beta1*df3$Column1, ifelse(df3$Category=="C", df3$Beta1*df3$Column2, NA))
df3$b2 <- ifelse(df3$Category=="A", df3$Beta2*df3$Column1, ifelse(df3$Category=="C", df3$Beta2*df3$Column2, NA))
# Date Category Beta1 Beta2 Column1 Column2 b1 b2
# 1 2001 A 1 2 10 40 10 20
# 2 2001 C 5 5 10 40 200 200
# 3 2001 B 4 1 10 40 NA NA
# 4 2002 A 3 4 20 50 60 80
# 5 2002 B 5 1 20 50 NA NA
# 6 2002 C 3 4 20 50 150 200
# 7 2003 B 3 2 30 60 NA NA
# 8 2003 A 4 6 30 60 120 180
# 9 2003 C 1 2 30 60 60 120

How to reduce aligned sequences to start and end coordinates?

I have a data table of labelled coordinates that are aligned between two groups (A and B). For example:
dt_long <- data.table(LABEL_A = c(rep("A", 20), rep("A", 15), rep ("A", 10), rep ("A", 15), rep ("A", 10)),
SEQ_A = c(11:30, 61:75, 76:85, 86:100, 110:119),
LABEL_B= c(rep("C", 20), rep("D", 15), rep("F", 10), rep("G",15), rep("D", 10)),
SEQ_B = c(1:20, 25:11, 16:25, 15:1, 1:5, 8:12))
How can I reduce this information into a short format, where the start and end coordinates for each aligned sequence are given. For example:
dt_short <- data.table(LABEL_A = c("A", "A", "A", "A", "A", "A"),
Start_A = c(11, 61, 76, 86, 110, 115),
End_A = c(30, 75, 85, 100, 114, 119),
LABEL_B= c("C", "D", "F", "G", "D", "D"),
Start_B = c(1, 25, 16, 15, 1, 8),
End_B = c(20, 11, 25, 1, 5, 12))
The length of each aligned sequence should be identical. For example:
identical(abs(dt_short$End_A - dt_short$Start_A), abs(dt_short$End_B - dt_short$Start_B))
You can make use of rleid and incorporating Frank's comment to remove grouping column
dt_long[, .(
LABEL_A=LABEL_A[1L], Start_A=SEQ_A[1L], End_A=SEQ_A[.N],
LABEL_B=LABEL_B[1L], Start_B=SEQ_B[1L], End_B=SEQ_B[.N]),
by=rleid(LABEL_A, LABEL_B,
c(0L, cumsum(diff(SEQ_A) > 1L)),
c(0L, cumsum(diff(SEQ_B) > 1L)))][, (1) := NULL]
output:
LABEL_A Start_A End_A LABEL_B Start_B End_B
1: A 11 30 C 1 20
2: A 61 75 D 25 11
3: A 76 85 F 16 25
4: A 86 100 G 15 1
5: A 110 114 D 1 5
6: A 115 119 D 8 12
A straight forward way is to group by the two labels and get the first and last of each group, i.e.
library(data.table)
dt_long[, .(Start_A = first(SEQ_A), End_A = last(SEQ_A), Start_B = first(SEQ_B), End_B = last(SEQ_B)), by = .(LABEL_A, LABEL_B)][]
# LABEL_A LABEL_B Start_A End_A Start_B End_B
#1: 1 3 11 30 1 20
#2: 1 4 61 75 25 11
#3: 1 6 76 85 16 25
#4: 1 7 86 100 15 1
We can just subset and dcast. Would also work seamlessly when there are many different groups of columns
dcast(dt_long[, .SD[c(1, .N)], .(LABEL_A, LABEL_B)],
LABEL_A + LABEL_B ~ c("Start", "End")[rowid(LABEL_A, LABEL_B)],
value.var = c("SEQ_A", "SEQ_B"))
# LABEL_A LABEL_B SEQ_A_End SEQ_A_Start SEQ_B_End SEQ_B_Start
#1: 1 3 30 11 20 1
#2: 1 4 75 61 11 25
#3: 1 6 85 76 25 16
#4: 1 7 100 86 1 15

R data.table conditional sum by row

> tempDT <- data.table(colA = c("E","E","A","C","E","C","E","C","E"), colB = c(20,30,40,30,30,40,30,20,10), group = c(1,1,1,1,2,2,2,2,2), want = c(NA, 30, 40, 70,NA,40,70,20,30))
> tempDT
colA colB group want
1: E 20 1 NA
2: E 30 1 30
3: A 40 1 40
4: C 30 1 70
5: E 30 2 NA
6: C 40 2 40
7: E 30 2 70
8: C 20 2 20
9: E 10 2 30
I have columns 'colA' 'colB' 'group': within each 'group', I would like to sum up 'colB' from bottom up until 'colA' is 'E'.
Based on the expected 'want', we create a run-length-id column 'grp' by checking if the value is 'E' in 'colA', then create 'want1' as the cumulative sum of 'colB' after grouping by 'grp' and 'group', get the row index ('i1') of elements that are duplicated in 'colA' and also is 'E' and assign the 'colB' values to 'want1'
tempDT[, grp:= rleid(colA=="E") * (colA != "E")
][grp!= 0, want1 := cumsum(colB), .(grp, group)]
i1 <- tempDT[, .I[colA=="E" & duplicated(colA)], group]$V1
tempDT[i1, want1 := colB][, grp := NULL][]
# colA colB group want want1
#1: E 20 1 NA NA
#2: E 30 1 30 30
#3: A 40 1 40 40
#4: C 30 1 70 70
#5: E 30 2 NA NA
#6: C 30 2 30 30
There's one approach: row reference + sums
# input data
tempDT <- data.table(colA = c("E","E","A","C","E","C","E","C","E"), colB = c(20,30,40,30,30,40,30,20,10), group = c(1,1,1,1,2,2,2,2,2), want = c(NA, 30, 40, 70,NA,40,70,20,30))
tempDT
# find row reference previous row where colA is "E"
lastEpos <- function(i) tail(which(tempDT$colA[1:(i-1)] == "E"), 1)
tempDT[, rowRef := sapply(.I, lastEpos), by = "group"]
# sum up
sumEpos <- function(i) {
valTEMP <- tempDT$rowRef[i]
outputTEMP <- sum(tempDT$colB[(valTEMP+1):i]) # sum
return(outputTEMP)
}
tempDT[, want1 := sapply(.I, sumEpos), by = "group"]
# deal with first row in every group
tempDT[, want1 := c(NA, want1[-1]), by = "group"]
# clean output
tempDT[, rowRef := NULL]
tempDT
library(dplyr)
df %>%
group_by(group) %>%
mutate(row_num = n():1) %>%
group_by(group) %>%
mutate(sum_colB = sum(colB[row_num < row_num[which(colA=='E')]]),
flag = ifelse(row_num >= row_num[which(colA=='E')], 0, 1),) %>%
mutate(sum_colB = ifelse(flag==1 & row_num==1, sum_colB, ifelse(flag==0, NA, colB))) %>%
select(-flag, -row_num) %>%
data.frame()
Output is:
colA colB group want sum_colB
1 E 20 1 NA NA
2 E 30 1 30 NA
3 A 40 1 40 40
4 C 30 1 70 70
5 E 30 2 NA NA
6 C 30 2 30 30
Sample data:
df <- structure(list(colA = structure(c(3L, 3L, 1L, 2L, 3L, 2L), .Label = c("A",
"C", "E"), class = "factor"), colB = c(20, 30, 40, 30, 30, 30
), group = c(1, 1, 1, 1, 2, 2), want = c(NA, 30, 40, 70, NA,
30)), .Names = c("colA", "colB", "group", "want"), row.names = c(NA,
-6L), class = "data.frame")

Resources