This is an update / follow-up to this question. The answer outlined there doesn't meet the new requirements.
I am looking for an efficient way (data.table?) to construct two new measures for each ID.
Measure 1 and Measure 2 need to meet the following conditions:
Condition 1:
Find a sequence of three rows for which:
the first count > 0,
the second count > 1, and
the third count == 1.
Condition 2 for Measure 1:
takes the elements in product of the third row of the sequence that are:
in the product of the second row of the sequence and
NOT in the stock of the first row of the sequence.
Condition 2 for Measure 2:
takes the elements in product of the third row of the sequence that are:
NOT in the product of the second row of the sequence and
NOT in the stock of the first row of the sequence.
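In set terms, for a qualifying window of rows (r1, r2, r3), measure 1 is setdiff(intersect(product3, product2), stock1) and measure 2 is setdiff(setdiff(product3, product2), stock1). A minimal sketch of that logic, using the values from ID 1's qualifying window (seqs 2-4) in the data below:
p2 <- c("C")            # product of the second row
p3 <- c("A", "C", "E")  # product of the third row
s1 <- c("A", "B")       # stock of the first row
setdiff(intersect(p3, p2), s1)  # measure 1: "C"
setdiff(setdiff(p3, p2), s1)    # measure 2: "E"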
Data:
df2 <- data.frame(ID = c(1,1,1,1,1,1,1,2,2,2,3,3,3,3),
seqs = c(1,2,3,4,5,6,7,1,2,3,1,2,3,4),
count = c(2,1,3,1,1,2,3,1,2,1,3,1,4,1),
product = c("A", "B", "C", "A,C,E", "A,B", "A,B,C", "D", "A", "B", "A", "A", "A,B,C", "D", "D"),
stock = c("A", "A,B", "A,B,C", "A,B,C,E", "A,B,C,E", "A,B,C,E", "A,B,C,D,E", "A", "A,B", "A,B", "A", "A,B,C", "A,B,C,D", "A,B,C,D"))
> df2
ID seqs count product stock
1 1 1 2 A A
2 1 2 1 B A,B
3 1 3 3 C A,B,C
4 1 4 1 A,C,E A,B,C,E
5 1 5 1 A,B A,B,C,E
6 1 6 2 A,B,C A,B,C,E
7 1 7 3 D A,B,C,D,E
8 2 1 1 A A
9 2 2 2 B A,B
10 2 3 1 A A,B
11 3 1 3 A A
12 3 2 1 A,B,C A,B,C
13 3 3 4 D A,B,C,D
14 3 4 1 D A,B,C,D
The desired output looks like this:
ID seq1 seq2 seq3 measure1 measure2
1: 1 2 3 4 C E
2: 2 1 2 3
3: 3 2 3 4 D
How would you code this?
A few things you need to know to be able to do this:
the shift function, to compare values within your groups
the separate_rows function, to split your strings into a normalised data view.
library(data.table)
dt <- data.table(ID = c(1,1,1,1,1,1,1,2,2,2,3,3,3,3),
seqs = c(1,2,3,4,5,6,7,1,2,3,1,2,3,4),
count = c(2,1,3,1,1,2,3,1,2,1,3,1,4,1),
product = c("A", "B", "C", "A,C,E", "A,B", "A,B,C", "D", "A", "B", "A", "A", "A,B,C", "D", "D"),
stock = c("A", "A,B", "A,B,C", "A,B,C,E", "A,B,C,E", "A,B,C,E", "A,B,C,D,E", "A", "A,B", "A,B", "A", "A,B,C", "A,B,C,D", "A,B,C,D"))
# lead the count and product columns within each ID (without by, windows could cross IDs)
dt[, count.2 := shift(count, type = "lead"), by = ID]
dt[, count.3 := shift(count, n = 2, type = "lead"), by = ID]
dt[, product.2 := shift(product, type = "lead"), by = ID]
dt[, product.3 := shift(product, n = 2, type = "lead"), by = ID]
# keep rows that start a qualifying three-row sequence (condition 1)
dt <- dt[count > 0 & count.2 > 1 & count.3 == 1]
# keep only the first qualifying sequence per ID
dt <- unique(dt, by = "ID")
library(tidyr)
# explode the comma-separated columns so each element gets its own row
dt.measure <- separate_rows(dt, product.3, sep = ",")
dt.measure <- separate_rows(dt.measure, stock, sep = ",")
dt.measure <- separate_rows(dt.measure, product, sep = ",")
# condition 2: each element of product.3 (not) in product.2 and not in stock
dt.measure[, measure.1 := (product.3 == product.2 & product.3 != stock)]
dt.measure[, measure.2 := (product.3 != product.2 & product.3 != stock)]
# collapse to one element per ID (max() keeps the alphabetically last match; NA when none)
res <- dt.measure[,
  .(
    measure.1 = max(ifelse(measure.1, product.3, NA_character_), na.rm = TRUE),
    measure.2 = max(ifelse(measure.2, product.3, NA_character_), na.rm = TRUE)
  ),
  ID
]
dt <- merge(dt, res, by = "ID")
dt[, .(ID, measure.1, measure.2)]
# ID measure.1 measure.2
# 1: 1 C E
# 2: 2 <NA> <NA>
# 3: 3 D <NA>
I'm not sure what the criterion for "efficient" is, but here's an approach using embed and tidyverse style. It filters the data down so you are working with less and less.
Loading up the data and packages (note that later on, setdiff and intersect are from dplyr):
library(purrr)
library(dplyr)
df1 <- data.frame(ID = c(1,1,1,1,1,1,1,2,2,2,3,3,3,3),
seqs = c(1,2,3,4,5,6,7,1,2,3,1,2,3,4),
count = c(2,1,3,1,1,2,3,1,2,1,3,1,4,1),
product = c("A", "B", "C", "A,C,E", "A,B",
"A,B,C", "D", "A", "B", "A", "A",
"A,B,C", "D", "D"),
stock = c("A", "A,B", "A,B,C", "A,B,C,E", "A,B,C,E",
"A,B,C,E", "A,B,C,D,E", "A", "A,B", "A,B", "A",
"A,B,C", "A,B,C,D", "A,B,C,D"),
stringsAsFactors = FALSE)
Define a helper function to evaluate condition 1:
meetsCond1 <- function(rseg) {
seg <- rev(rseg)
all(seg[1] > 0, seg[2] > 1, seg[3] == 1)
}
The embed function wraps a time series into a matrix where essentially each row is a window of the length of interest. Using apply, you filter down to which rows start relevant sequences.
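For reference, embed lists each window with the most recent value first, which is why meetsCond1 above reverses its argument back into chronological order:
embed(1:5, 3)
#      [,1] [,2] [,3]
# [1,]    3    2    1
# [2,]    4    3    2
# [3,]    5    4    3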
cond1Match<- embed(df1$count, 3) %>%
apply(1, meetsCond1) %>%
which()
By adding offsets, you can translate those indices back to the final products, the previous products, and the stock rows of interest for determining the measures. Split them into lists of individual components.
finalProds <- df1$product[cond1Match + 2] %>%
strsplit(",")
prevProds <- df1$product[cond1Match + 1] %>%
strsplit(",")
initialStock <- df1$stock[cond1Match] %>%
strsplit(",")
For both measures, no element can appear in the first row's stock.
notStock <- map2(finalProds, initialStock, ~.x[!(.x %in% .y)])
Then generate your data.frame by retrieving the seqs and ID values of the window. The measures are then just the intersect and setdiff of the final products with those in the previous rows.
data.frame(ID = df1$ID[cond1Match],
seq1 = df1$seqs[cond1Match],
seq2 = df1$seqs[cond1Match + 1],
seq3 = df1$seqs[cond1Match + 2],
measure1 = imap_chr(notStock,
           ~intersect(.x, prevProds[[.y]]) %>%
             {if(length(.) == 0) "" else paste(., collapse = ",")}
),
measure2 = imap_chr(notStock,
           ~setdiff(.x, prevProds[[.y]]) %>%
             {if(length(.) == 0) "" else paste(., collapse = ",")}
),
stringsAsFactors = FALSE
) %>%
slice(match(unique(ID), ID))
which yields the desired output; the slice call limits the result to at most one line per ID. In the original post, you specify you want all matches reported, so removing the slice call would instead yield (note that embed runs over the whole count column, so windows can cross ID boundaries, as in the seqs 6, 7, 1 row):
#> ID seq1 seq2 seq3 measure1 measure2
#> 1 1 2 3 4 C E
#> 2 1 6 7 1
#> 3 2 1 2 3
#> 4 2 3 1 2 C
#> 5 3 2 3 4 D
If you're looking to really squeeze out efficiency, you might gain some by inlining the definitions of finalProds, prevProds, and initialStock instead of assigning them to variables first. I would imagine that unless your set of matches is really large, the difference would be negligible.
A rolling window approach using data.table with base R code in j:
library(data.table)
cols <- c("product", "stock")
setDT(df2)[, (cols) := lapply(.SD, function(x) strsplit(as.character(x), split=",")), .SDcols=cols]
ans <- df2[,
  # slide a three-row window down each ID; max() guards IDs with fewer than three rows
  transpose(lapply(seq_len(max(.N - 2L, 0L)), function(k) {
    if(count[k] > 0 && count[k+1L] > 1 && count[k+2L] == 1) {
      m1 <- setdiff(intersect(product[[k+2L]], product[[k+1L]]), stock[[k]])
      m2 <- setdiff(setdiff(product[[k+2L]], product[[k+1L]]), stock[[k]])
      c(seq1=seqs[k], seq2=seqs[k+1L], seq3=seqs[k+2L],
        measure1=if(length(m1) > 0) paste(m1, collapse=",") else "",
        measure2=if(length(m2) > 0) paste(m2, collapse=",") else "")
    }
  }), ignore.empty=TRUE),
  ID]
setnames(ans, names(ans)[-1L], c(paste0("seq", 1:3), paste0("measure", 1:2)))
ans
output:
ID seq1 seq2 seq3 measure1 measure2
1: 1 2 3 4 C E
2: 2 1 2 3
3: 3 2 3 4 D
Related
I have a time-series panel dataset structured in the following way: there are 2 funds, each of which holds different stocks in each time period.
df <- data.frame(
fund_id = c(1,1,1,1,1,1,1,1, 1, 2,2,2,2),
time_Q = c(1,1,1,2,2,2,2,3, 3, 1,1,2,2),
stock_id = c("A", "B", "C", "A", "C", "D", "E", "D", "E", "A", "B", "B", "C")
)
> df
fund_id time_Q stock_id
1 1 1 A
2 1 1 B
3 1 1 C
4 1 2 A
5 1 2 C
6 1 2 D
7 1 2 E
8 1 3 D
9 1 3 E
10 2 1 A
11 2 1 B
12 2 2 B
13 2 2 C
For each fund, I would like to calculate the percentage of stocks held in the current time_Q that were also held at any point over the previous one to two quarters. So basically, for every fund and every time_Q, I would like two columns (past 1 time_Q; past 1-2 time_Qs) showing what percentage of stocks held at that time were also present in any of those past time_Qs.
Here is what the result should look like:
result <- data.frame(
  fund_id = c(1,1,1,2,2),
  time_Q = c(1,2,3,1,2),
  past_1Q = c(NA,0.5,1,NA,0.5),
  past_1_2Q = c(NA,0.5,1,NA,0.5)
)
> result
fund_id time_Q past_1Q past_1_2Q
1 1 1 NA NA
2 1 2 0.5 0.5
3 1 3 1 1
4 2 1 NA NA
5 2 2 0.5 0.5
I already asked a similar question here, but now I'm looking for common elements across any of the past lagged periods. I'm looking for a dplyr or data.table scalable solution where I can have around 12 past quarters and deal with multiple funds and stocks and time periods.
Thanks in advance!
My solution:
# dummy data
df <- data.table(fund_id = c(1,1,1,1,1,1,1,1, 1, 2,2,2,2)
, time_Q = c(1,1,1,2,2,2,2,3, 3, 1,1,2,2)
, stock_id = c("A", "B", "C", "A", "C", "D", "E", "D", "E", "A", "B", "B", "C")
); df
# lower case col names
names(df) <- tolower(names(df))
# unique grouping
x <- df[, .(dummy =.N), .(fund_id, time_q)][, dummy := NULL]
# initialise empty table
y <- NULL
# loop
for(i in 1:nrow(x))
{
# current quarter & before
z <- df[fund_id == x[i, fund_id]
& time_q %between% c( x[i, time_q] - 12, x[i, time_q])
]
# current quarter
a <- z[fund_id == x[i, fund_id]
& time_q == x[i, time_q]
, unique(stock_id)
]
# minus 1 to minus 12 quarter (lapply)
b <- lapply(1:12, \(j) z[fund_id == x[i, fund_id]
& time_q %between% c( x[i, time_q] - j, x[i, time_q] - 1)
, unique(stock_id)
]
)
# results
c <- data.table(fund_id = x[i, fund_id]
, current_q = x[i, time_q]
)
# no. of stocks in current quarter
d <- length(a)
# calculate % for the 12 periods
c[, paste0('past_1_to_', 1:12, '_q') := lapply(1:12, \(j) length(intersect(a,b[[j]])) / d) ]
# collect results
y <- rbind(y, c)
}
Benchmark:
x <- 1e3
df <- data.table(fund_id = rep(1:x, each = x/10)
, time_Q = rep(1:4, each = x/4)
, stock_id = sample(letters[1:26], size=20, replace=T)
)
This took about 20 seconds on the above df, with 100k rows and 1,200 (fund_id, time_q) groups.
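For comparison, a non-loop sketch using a list column of per-quarter holdings, starting from the question's original df. It assumes time_Q is numeric and that "past k quarters" means time_Q values in [t-k, t-1]; the names agg and horizons are mine, and it is not benchmarked at scale:
library(data.table)
setDT(df)
horizons <- 1:2   # set to 1:12 for twelve quarters

# one row per fund/quarter, holdings stored as a list column
agg <- df[, .(stocks = list(unique(stock_id))), keyby = .(fund_id, time_Q)]

for (k in horizons) {
  agg[, (paste0("past_", k, "Q")) := sapply(seq_len(.N), function(i) {
    prev <- which(time_Q >= time_Q[i] - k & time_Q < time_Q[i])
    if (length(prev) == 0) return(NA_real_)  # no history yet, as in the expected output
    mean(stocks[[i]] %in% unlist(stocks[prev]))
  }), by = fund_id]
}
agg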
I have a large dataframe whose primary organization is a single row per observation, with groups that are all of identical length (3 in the toy example).
df <- data.frame(groups = c("gr1","gr1","gr1","gr2","gr2","gr2","gr3","gr3","gr3"),
no = c(1, 2, 3, 1, 2, 3, 1, 2, 3),
colA = c("a", "b", "c", "a", "b", "c", "a", "b", "c"),
colB = c("a", "b", "c", "X_", "b", "c", "a", "b", "c"),
colC = c("a", "b", "c", "X_", "b", "c", "c", "b", "a"))
df
> df
> groups no colA colB colC
> 1 gr1 1 a a a
> 2 gr1 2 b b b
> 3 gr1 3 c c c
> 4 gr2 1 a X_ X_
> 5 gr2 2 b b b
> 6 gr2 3 c c c
> 7 gr3 1 a a c
> 8 gr3 2 b b b
> 9 gr3 3 c c a
I want to identify for each column which group is the first example of a unique arrangement of values. So for colA it should return (T, F, F), since all three groups are identical and only group one is the first unique one. For colB it should return (T, T, F), since there are two distinct groups and only the 3rd is identical to the 1st. And for colC it should be (T, T, T), since the order of items matters.
So the final output could be a matrix like this
colA colB colC
> gr1 T T T
> gr2 F T T
> gr3 F F T
I think I could figure this out by breaking down the data frame into pairs of group and colA/B/B, identify which ones are identical, storing the results in a vector, and then reassembling the whole deal. But I am seeing a ton of for-loops and have a hard time thinking about how to vectorize this. I have been using dplyr a bit, but I don't yet see how it can help.
Maybe there's a decent way to unstack each of the columns based on the groups and then run a comparison across the relevant subsets of new (and shorter) columns?
Edited to add:
Maybe group_by %>% summarize is a way to get at this. If the summary can essentially concatenate all values in a group per column into a really long string I could then see which of those is distinct per group?
Second edit:
I got as far as:
d1 <- df %>% group_by(groups) %>% summarise(colB = paste(unique(colB), collapse = ', ')) %>% distinct(colB)
which puts out
> # A tibble: 2 x 1
> colB
> <chr>
> 1 a, b, c
> 2 X_, b, c
It identifies the distinct groups, but I now have to figure out how to compare it against the full column to get T/F for each group.
Here's a base R approach:
cols <- grep('col', names(df))
cbind(unique(df[1]), sapply(df[cols], function(x)
!duplicated(by(x, df$groups, paste0, collapse = '-'))))
# groups colA colB colC
#1 gr1 TRUE TRUE TRUE
#4 gr2 FALSE TRUE TRUE
#7 gr3 FALSE FALSE TRUE
Your summarize idea is spot on:
df %>%
group_by(groups) %>%
summarize(across(starts_with("col"), paste, collapse = ""), .groups = "drop") %>%
mutate(across(starts_with("col"), ~!duplicated(.)))
# # A tibble: 3 x 4
# groups colA colB colC
# <chr> <lgl> <lgl> <lgl>
# 1 gr1 TRUE TRUE TRUE
# 2 gr2 FALSE TRUE TRUE
# 3 gr3 FALSE FALSE TRUE
With "data.table" you can try:
library(data.table)
cols <- c("colA", "colB", "colC")
fun <- function(x) !duplicated(x)
as.data.table(df)[, lapply(.SD, toString), groups, .SDcols = cols][
, (cols) := lapply(.SD, fun), .SDcols = cols][]
# groups colA colB colC
# 1: gr1 TRUE TRUE TRUE
# 2: gr2 FALSE TRUE TRUE
# 3: gr3 FALSE FALSE TRUE
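The asker's "unstack and compare" intuition also works in base R, because duplicated compares list elements; a sketch (cols as in the base R answer above):
cols <- grep('col', names(df))
# split each column by group, then flag the first occurrence of each arrangement;
# rows follow the sorted group order gr1, gr2, gr3
sapply(df[cols], function(x) !duplicated(split(x, df$groups)))
#       colA  colB colC
# [1,]  TRUE  TRUE TRUE
# [2,] FALSE  TRUE TRUE
# [3,] FALSE FALSE TRUE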
I am trying to convert something like this df format:
df <- data.frame(first = c("a", "a", "b", "b", "b", "c"),
words =c("about", "among", "blue", "but", "both", "cat"))
df
first words
1 a about
2 a among
3 b blue
4 b but
5 b both
6 c cat
into the following format:
df1
first words
1 a about, among
2 b blue, but, both
3 c cat
>
I have tried
aggregate(words ~ first, data = df, FUN = list)
first words
1 a 1, 2
2 b 3, 5, 4
3 c 6
and tidyverse:
df %>%
group_by(first) %>%
group_rows()
Any suggestions would be appreciated!
A data.table solution:
library(data.table)
df <- data.frame(first = c("a", "a", "b", "b", "b", "c"),
words =c("about", "among", "blue", "but", "both", "cat"))
df <- setDT(df)[, lapply(.SD, toString), by = first]
df
# first words
# 1: a about, among
# 2: b blue, but, both
# 3: c cat
# convert back to a data.frame if you want
setDF(df)
Using tidyverse, after the group_by use summarise to either paste the words together:
library(dplyr)
df %>%
group_by(first) %>%
summarise(words = toString(words))
# A tibble: 3 x 2
# first words
# <fct> <chr>
#1 a about, among
#2 b blue, but, both
#3 c cat
or keep it as a list column:
df %>%
group_by(first) %>%
summarise(words = list(words))
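For completeness, the asker's aggregate attempt printed row indices because words is a factor, so the list column displays the underlying integer codes; collapsing each group to a string avoids this (base R, same idea as toString above):
aggregate(words ~ first, data = df, FUN = toString)
#   first            words
# 1     a     about, among
# 2     b blue, but, both
# 3     c              cat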
I have two data.frames dfA and dfB. Both of them have a column called key.
Now I'd like to know how many duplicates for A$key there are in B$key.
A <- data.frame(key=c("A", "B", "C", "D"))
B <- data.frame(key=c("A", "A", "B", "B", "B", "D"))
It should be A=2, B=3, C=0 and D=1. What's the easiest way to do this?
Use table:
table(factor(B$key, levels = sort(unique(A$key))))
#A B C D
#2 3 0 1
factor is needed here so that we also 'count' entries that do not appear in B$key, namely C.
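Without the factor levels, C would simply be absent from the result:
table(B$key)
# A B D
# 2 3 1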
A <- data.frame(key=c("A", "B", "C", "D"))
B <- data.frame(key=c("A", "A", "B", "B", "B", "D"))
library(dplyr)
library(tidyr)
B %>%
filter(key %in% A$key) %>% # keep values that appear in A
count(key) %>% # count values
complete(key = A$key, fill = list(n = 0)) # add any values from A that don't appear
# # A tibble: 4 x 2
# key n
# <chr> <dbl>
# 1 A 2
# 2 B 3
# 3 C 0
# 4 D 1
Using tidyverse you can do:
A %>%
left_join(B %>% #Merging df A with df B for which the count in "key" was calculated
group_by(key) %>%
tally(), by = c("key" = "key")) %>%
mutate(n = ifelse(is.na(n), 0, n)) #Replacing NA with 0
key n
1 A 2
2 B 3
3 C 0
4 D 1
Actually, you mean how many occurrences of each value of A$key there are in B$key?
You can obtain this by coding B$key as factor with the unique values of A$key as levels.
o <- table(factor(B$key, levels=unique(A$key)))
Yielding:
> o
A B C D
2 3 0 1
If you really want to count duplicates, do
dupes <- pmax(o - 1, 0)
Yielding:
> dupes
A B C D
1 2 0 0
Suppose I have a dataset like this:
id <- c(1,1,1,2,2,3,3,4,4)
visit <- c("A", "B", "C", "A", "B", "A", "C", "A", "B")
test1 <- c(12,16, NA, 11, 15,NA, 0,12, 5)
test2 <- c(1,NA, 2, 2, 2,2, NA,NA, NA)
df <- data.frame(id,visit,test1,test2)
I want to know the number of data points per visit PER test so that the final output looks something like this:
visit test1 test2
A 3 3
B 3 1
C 1 1
I know I can use the aggregate function like this for one variable, as mentioned in this older post:
aggregate(x = df$id[!is.na(df$test1)], by = list(df$visit[!is.na(df$test1)]), FUN = length)
but how would I go about doing this for multiple tests?
You can also use data.table which could be useful for a flexible number of columns:
cols <- names(df)[grepl("test", names(df))]
setDT(df)[, lapply(.SD, function(x) sum(!is.na(x))), by = visit, .SDcols = cols]
# visit test1 test2
#1: A 3 3
#2: B 3 1
#3: C 1 1
Using table and rowSums in base R:
cols <- 3:4
sapply(cols, function(i) rowSums(table(df$visit, df[,i]), na.rm = TRUE))
# [,1] [,2]
#A 3 3
#B 3 1
#C 1 1
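And a dplyr sketch of the same per-column count, using across (available in dplyr >= 1.0); output shown approximately:
library(dplyr)
# count non-NA entries per visit for every test column
df %>%
  group_by(visit) %>%
  summarise(across(starts_with("test"), ~ sum(!is.na(.x))))
#   visit test1 test2
# 1 A         3     3
# 2 B         3     1
# 3 C         1     1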