How to remove rows including sequential condition between two columns - r

I'm trying to remove the factors from my dataframe, but only those after a specific date. Here I made a toy example:
I have a test dataframe, and an inspection dataframe inspec. I would like to remove the letters that are in var1 that appear in inspec, but only the rows after the date in inspec. For example, consider
> test = data.frame(var1 = c("A", "B", "A", "B", "C","B", "A"), measure = c(6,7,8,6,10,1,0), date = as.Date(c("2021-01-02", "2021-01-03", "2021-01-04", "2021-01-05", "2021-01-06", "2021-01-07", "2021-01-12")))
> test
var1 measure date
1 A 6 2021-01-02
2 B 7 2021-01-03
3 A 8 2021-01-04
4 B 6 2021-01-05
5 C 10 2021-01-06
6 B 1 2021-01-07
7 A 0 2021-01-12
>
> inspec = data.frame(var1 = c("A", "C", "D", "A"), date = as.Date(c("2021-01-03", "2021-01-06", "2021-01-10", "2021-01-12")))
> inspec
var1 date
1 A 2021-01-03
2 C 2021-01-06
3 D 2021-01-10
4 A 2021-01-12
Then, as result, I'd like to obtain:
> test
var1 measure date
1 A 6 2021-01-02
2 B 7 2021-01-03
3 B 6 2021-01-05
4 B 1 2021-01-07
Note that only the A in var1 that were inspected after the date indicated in the inspec dataframe were excluded. If I didnt' want to maintain the var1 before a inspec date, I could just use test= test[!(test$var1 %in% inspec$var1),]
Any hint on how can I do that?

base R
## reduce `inspec` to the earliest date
inspec$date <- as.Date(inspec$date)
tmpinspec <- inspec[ave(as.integer(inspec$date), inspec$var1, FUN = function(z) z == min(z)) > 0,]
tmpinspec
# var1 date
# 1 A 2021-01-03
# 2 C 2021-01-06
# 3 D 2021-01-10
tmp <- merge(test, tmpinspec, by = "var1", all.x = TRUE, suffixes = c("", ".y"))
tmp
# var1 measure date date.y
# 1 A 6 2021-01-02 2021-01-03
# 2 A 8 2021-01-04 2021-01-03
# 3 A 0 2021-01-12 2021-01-03
# 4 B 7 2021-01-03 <NA>
# 5 B 6 2021-01-05 <NA>
# 6 B 1 2021-01-07 <NA>
# 7 C 10 2021-01-06 2021-01-06
tmp <- tmp[with(tmp, is.na(date.y) | date < date.y),]
# tmp$date.y <- NULL
# tmp
var1 measure date
# 1 A 6 2021-01-02
# 4 B 7 2021-01-03
# 5 B 6 2021-01-05
# 6 B 1 2021-01-07
dplyr
library(dplyr)
group_by(inspec, var1) %>%
slice_min(date) %>%
left_join(test, ., by = "var1", suffix = c("", ".y")) %>%
filter(is.na(date.y) | date < date.y) %>%
select(-date.y)
# var1 measure date
# 1 A 6 2021-01-02
# 2 B 7 2021-01-03
# 3 B 6 2021-01-05
# 4 B 1 2021-01-07

We can join on var1 and filter based on dates while the data is grouped by var1 and only keep the first match. See below;
library(dplyr)
test %>%
left_join(inspec, by = "var1", suffix = c("", ".y")) %>%
group_by(var1) %>%
filter(is.na(date.y) | date < first(date.y)) %>%
select(-date.y) %>%
group_by_all() %>%
slice(1)
#> # A tibble: 4 x 3
#> # Groups: var1, measure, date [4]
#> var1 measure date
#> <fct> <dbl> <date>
#> 1 A 6 2021-01-02
#> 2 B 1 2021-01-07
#> 3 B 6 2021-01-05
#> 4 B 7 2021-01-03
This is a variation of r2evans's answer.

Using data.table, we can also use the merge and then filter:
library(data.table)
test <- setDT(test); inspec <- setDT(inspec)
test <- merge(test, inspec, by = "var1", all.x = TRUE, suffixes = c("", ".y"))
test <- test[date < date.y | !is.na(date.y), .(var1, measure, date)]

Related

'Stretch' a grouped data frame using dplyr

I have a grouped dataframe with multiple IDs with a date and value column.
id <- c("a", "a", "a", "b", "b", "b", "c")
date <- c("2020-01-01", "2020-01-02", "2020-01-03",
"2020-01-01", "2020-01-02", "2020-01-03",
"2020-01-01")
value <- rnorm(n = length(id))
df <- cbind.data.frame(id, date, value)
However, some IDs have less than 3 dates. I want to "stretch" those IDs and add an NA for the value column for the new dates. In this dataframe, the "c" ID would have two new dates added ("2020-01-02" and "2020-01-03").
Perhaps this approach would suit?
library(tidyverse)
id <- c("a", "a", "a", "b", "b", "b", "c")
date <- c("2020-01-01", "2020-01-02", "2020-01-03",
"2020-01-01", "2020-01-02", "2020-01-03",
"2020-01-01")
value <- rnorm(n = length(id))
df <- cbind.data.frame(id, date, value)
df %>%
right_join(df %>% expand(id, date))
#> Joining, by = c("id", "date")
#> id date value
#> 1 a 2020-01-01 -1.5371474
#> 2 a 2020-01-02 0.9001098
#> 3 a 2020-01-03 0.1523491
#> 4 b 2020-01-01 0.8194577
#> 5 b 2020-01-02 1.2005270
#> 6 b 2020-01-03 0.1158812
#> 7 c 2020-01-01 -0.8676445
#> 8 c 2020-01-02 NA
#> 9 c 2020-01-03 NA
Created on 2022-09-05 by the reprex package (v2.0.1)
In base R, by id you may merge with a data frame created out of sequences of the date range. First of all you want to use proper date format by doing df$date <- as.Date(df$date).
by(df, df$id, \(x)
merge(x,
data.frame(id=el(x$id),
date=do.call(seq.Date, c(as.list(range(df$date)), 'day'))),
all=TRUE)) |>
do.call(what=rbind)
# id date value
# a.1 a 2020-01-01 1.3709584
# a.2 a 2020-01-02 -0.5646982
# a.3 a 2020-01-03 0.3631284
# b.1 b 2020-01-01 0.6328626
# b.2 b 2020-01-02 0.4042683
# b.3 b 2020-01-03 -0.1061245
# c.1 c 2020-01-01 1.5115220
# c.2 c 2020-01-02 NA
# c.3 c 2020-01-03 NA
You could use complete() from tidyr.
library(tidyr)
df %>%
complete(id, date)
# # A tibble: 9 × 3
# id date value
# <chr> <chr> <dbl>
# 1 a 2020-01-01 1.12
# 2 a 2020-01-02 1.58
# 3 a 2020-01-03 1.26
# 4 b 2020-01-01 -2.30
# 5 b 2020-01-02 -1.45
# 6 b 2020-01-03 -0.212
# 7 c 2020-01-01 0.344
# 8 c 2020-01-02 NA
# 9 c 2020-01-03 NA

How to unlist a list and keep the names of the top level as a new variable in R

is there a simple way to unlist a list and keep the names of one level as a new variable?
Working example:
# Input list
my_list <- list(Ticker1 = list(date = seq.Date(as.Date('2021-01-01'), as.Date('2021-01-10'), by = 'day'), Value = 1:10),
Ticker2 = list(date = seq.Date(as.Date('2021-01-01'), as.Date('2021-01-10'), by = 'day'), Value = 11:20),
Ticker3 = list(date = seq.Date(as.Date('2021-01-01'), as.Date('2021-01-10'), by = 'day'), Value = 21:30))
# Desired data frame
my_df_goal <- as.data.frame(list(Ticker = c(rep("Ticker1", 10), rep("Ticker2", 10), rep("Ticker3", 10)),
date = rep(seq.Date(as.Date('2021-01-01'), as.Date('2021-01-10'), by = 'day'), 3),
Value = 1:30))
There are similar questions regarding unlisting on stackoverflow, but I wasn't able to solve my problem.
One way would probably be
my_df <- as.data.frame(rlist::list.flatten(my_list))
# Same result:
my_df2 <- as.data.frame(unlist(my_list,recursive = F))
#Edit: just noticed that you do not need to unlist first to get the same result:
my_df3 <- as.data.frame(my_list)
and then try to extract the Ticker from the new variable name. But I hope there is an easier solution.
I also tried my_df4 <- reshape2::melt(my_list), but this is not helpful. I have a feeling that the purrr package may have this functionality, but I am not familiar with the package.
I am happy about any help!
Thanks a lot
dplyrs bind_rows should do the trick:
library(dplyr)
bind_rows(my_list, .id = "Ticker")
This returns
# A tibble: 30 x 3
Ticker date Value
<chr> <date> <int>
1 Ticker1 2021-01-01 1
2 Ticker1 2021-01-02 2
3 Ticker1 2021-01-03 3
4 Ticker1 2021-01-04 4
5 Ticker1 2021-01-05 5
6 Ticker1 2021-01-06 6
7 Ticker1 2021-01-07 7
8 Ticker1 2021-01-08 8
9 Ticker1 2021-01-09 9
10 Ticker1 2021-01-10 10
# ... with 20 more rows
Use data.table::rbindlist:
data:table::rbindlist(my_list, idcol = "Ticker")
Or, in the tidyverse:
library(tidyverse)
my_list %>%
enframe() %>%
unnest_wider(value) %>%
unnest(cols = c(date, Value))
# A tibble: 30 x 3
name date Value
<chr> <date> <int>
1 Ticker1 2021-01-01 1
2 Ticker1 2021-01-02 2
3 Ticker1 2021-01-03 3
4 Ticker1 2021-01-04 4
5 Ticker1 2021-01-05 5
6 Ticker1 2021-01-06 6
7 Ticker1 2021-01-07 7
8 Ticker1 2021-01-08 8
9 Ticker1 2021-01-09 9
10 Ticker1 2021-01-10 10
# ... with 20 more rows
Once we realise that sub lists for each ticker is just 2 lists with equal lengths, then we can think of it as a list of 3 dataframes with 2 columns each, and we could simply call rowbind:
do.call(rbind, lapply(my_list, data.frame))
# date Value
# Ticker1.1 2021-01-01 1
# Ticker1.2 2021-01-02 2
# Ticker1.3 2021-01-03 3
# Ticker1.4 2021-01-04 4
# Ticker1.5 2021-01-05 5
# ...
But we want list names as a new column, we need extra step:
do.call(rbind,
lapply(names(my_list), function(i){
data.frame(Ticker = i, my_list[[ i ]])
}))
# Ticker date Value
# 1 Ticker1 2021-01-01 1
# 2 Ticker1 2021-01-02 2
# 3 Ticker1 2021-01-03 3
# 4 Ticker1 2021-01-04 4
# 5 Ticker1 2021-01-05 5
# ...
See other options to get ID as a new column after rowbinding multiple dataframes:
Combine (rbind) data frames and create column with name of original data frames

Find rolling min and max occurrences from two columns

I would like to track min and max occurrences in two columns. This should be done in rolling fashion from beginning of the data, so we can track how many times overall IDs are present at each date. Also it doesn't matter in which column ID is present.
Result should be as follows. Row 1, nor B or C has occurred, so min_appearance is 0 but max_appearance is 1 as A and D was present. Row 5 A and D have been present 3 times at this point but B and C only 2. I'm not concerned which ID is present, but only on counts what is min and max. Also real data is more complicated, so pairs are not static, but A could face C and so on.
# A tibble: 8 x 5
date id1 id2 min_appearances max_appearances
<date> <chr> <chr> <dbl> <dbl>
1 2020-01-01 A D 0 1
2 2020-01-02 B C 1 1
3 2020-01-03 C B 1 2
4 2020-01-04 D A 2 2
5 2020-01-05 A D 2 3
6 2020-01-06 B C 3 3
7 2020-01-07 C B 3 4
8 2020-01-08 D A 4 4
DATA:
library(dplyr)
date <- seq(as.Date("2020/1/1"), by = "day", length.out = 8)
id1 <- rep(c("A", "B", "C", "D"), 2)
id2 <- rep(c("D", "C", "B", "A"), 2)
dt <- tibble(date = date,
id1 = id1,
id2 = id2)
Here's a way to do it using functions from the tidyverse. First, pivot_longer to handle more easily the data. Then compute the cumulative count of value for every unique ids. Compute the min and max for each row over the "count" columns. Finally, take the last min and max values for each pairs, and pivot back to wide.
library(tidyverse)
dt %>%
pivot_longer(cols = -date, values_to = "id") %>%
mutate(map_dfc(unique(id), ~ tibble("count_{.x}" := cumsum(id == .x)))) %>%
mutate(min_appearances = do.call(pmin, select(., starts_with("count"))),
max_appearances = do.call(pmax, select(., starts_with("count")))) %>%
group_by(date) %>%
mutate(across(min_appearances:max_appearances, last),
n = row_number()) %>%
pivot_wider(c(date, min_appearances, max_appearances), names_from = n, values_from = id, names_prefix = "id") %>%
relocate(order(colnames(.)))
date id1 id2 max_appearances min_appearances
<date> <chr> <chr> <int> <int>
1 2020-01-01 A D 1 0
2 2020-01-02 B C 1 1
3 2020-01-03 C B 2 1
4 2020-01-04 D A 2 2
5 2020-01-05 A D 3 2
6 2020-01-06 B C 3 3
7 2020-01-07 C B 4 3
8 2020-01-08 D A 4 4

How to compare within a group the first value to each subsequent value until a condition is met

So I have a data frame in the general structure below:
dataframe:
rownum
group
date
1
a
2021-05-01
2
a
2021-05-02
3
a
2021-05-03
4
b
2021-05-15
5
b
2021-05-17
6
b
2021-05-30
7
b
2021-05-31
8
b
2021-05-31
9
c
2021-05-01
10
c
2021-05-05
What I would like to do is, WITHIN GROUP, compare the first row to the next row, until the difference between the dates meets some threshold, say 10 days. Then, once that row meets the threshold, I'd like to then test the next row against the subsequent row. It would look like this:
Result, using threshold of 10:
|rownum|group |date |date diff|
|------|------|-----------|---|
|1 | a |2021-05-01 |NA|
|2 | a |2021-05-02 |1|
|3 | a |2021-05-03 |2|
|4 | b |2021-05-15 |NA|
|5 | b |2021-05-17 |2|
|6 | b |2021-05-30 |15 (meets criteria, start from row 7 now)|
|7 | b |2021-05-31 | NA|
|8 | b |2021-05-31 | 0|
|9 | c |2021-05-01 | NA|
|10 | c |2021-05-05 | 4|
So to reiterate, its comparing the FIRST row of a group to subsequent rows until some threshold is met. Then the count starts over at the first rep after that within the group to subsequent rows within the group. The difference is recorded as datediff.
I've tried this but I dont know if sapply is the way to go:
dataframe %>%
group_by(group) %>%
mutate(
datediff = sapply(date, function(x) {
all(difftime(dataframe$date,dplyr::lag(dataframe, n = 1, default = NA)))
}
)
)
Also tried this, which I think is closer to what I want:
for (m in 1:length(dataframe)) {
dataframe <- dataframe %>%
group_by(group) %>%
rowwise() %>%
mutate(datediff = difftime(dataframe$date,dplyr::lag(date, n = m, default = NA), units="days"))
}
So far I havent been able to get the right rowwise comparison to even implement the thresholding bit.
Another tidyverse solution. We can use accumulate to achieve this task. dat is from r2evans' example.
library(tidyverse)
dat2 <- dat %>%
group_by(group) %>%
mutate(diff_lag = as.integer(date - lag(date))) %>%
mutate(diff = accumulate(diff_lag, function(x, y){
if (is.na(x)){
res <- y
} else if (x > 10){
res <- NA
} else {
res <- x + y
}
return(res)
})) %>%
select(-diff_lag) %>%
ungroup()
dat2
# # A tibble: 10 x 4
# rownum group date diff
# < int> <chr> <date> <int>
# 1 1 a 2021-05-01 NA
# 2 2 a 2021-05-02 1
# 3 3 a 2021-05-03 2
# 4 4 b 2021-05-15 NA
# 5 5 b 2021-05-17 2
# 6 6 b 2021-05-30 15
# 7 7 b 2021-05-31 NA
# 8 8 b 2021-05-31 0
# 9 9 c 2021-05-01 NA
# 10 10 c 2021-05-05 4
base R
func <- function(x, threshold = 10) {
r <- rle(c(0, diff(x)) > threshold)
if ((len <- length(r$values)) > 1) {
r$lengths[len] <- r$lengths[len] - 1L
r$lengths[1] <- r$lengths[1] + 1L
}
cumsum(inverse.rle(r))
}
dat$group2 <- ave(as.numeric(dat$date), dat$group, FUN = func)
dat$datediff <- ave(as.numeric(dat$date), dat[,c("group", "group2")], FUN = function(x) c(NA, (x - x[1])[-1]))
dat$group2 <- NULL
dat
# rownum group date datediff
# 1 1 a 2021-05-01 NA
# 2 2 a 2021-05-02 1
# 3 3 a 2021-05-03 2
# 4 4 b 2021-05-15 NA
# 5 5 b 2021-05-17 2
# 6 6 b 2021-05-30 15
# 7 7 b 2021-05-31 NA
# 8 8 b 2021-05-31 0
# 9 9 c 2021-05-01 NA
# 10 10 c 2021-05-05 4
dplyr
library(dplyr)
dat %>%
group_by(group) %>%
mutate(group2 = func(date)) %>%
group_by(group, group2) %>%
mutate(datediff = c(NA, (date - date[1])[-1])) %>%
ungroup() %>%
select(-group2)
# # A tibble: 10 x 4
# rownum group date datediff
# <int> <chr> <date> <dbl>
# 1 1 a 2021-05-01 NA
# 2 2 a 2021-05-02 1
# 3 3 a 2021-05-03 2
# 4 4 b 2021-05-15 NA
# 5 5 b 2021-05-17 2
# 6 6 b 2021-05-30 15
# 7 7 b 2021-05-31 NA
# 8 8 b 2021-05-31 0
# 9 9 c 2021-05-01 NA
# 10 10 c 2021-05-05 4
Data
dat <- structure(list(rownum = 1:10, group = c("a", "a", "a", "b", "b", "b", "b", "b", "c", "c"), date = structure(c(18748, 18749, 18750, 18762, 18764, 18777, 18778, 18778, 18748, 18752), class = "Date")), row.names = c(NA, -10L), class = "data.frame")
(I already converted dat$date to Date-class.)
Here's a roundabout way of getting what you're looking for, where some of your NA are set to 0 using this solution:
library(tidyverse)
df %>%
group_by(group) %>%
mutate(date = as.Date(date),
date_diff = date - first(date),
flag = date_diff > 10) %>%
group_by(group, flag) %>%
mutate(temp_group = cur_group_id()) %>%
group_by(temp_group) %>%
mutate(date_diff = case_when(date_diff == first(date_diff) ~ date_diff,
date_diff != first(date_diff) & date_diff < 10 ~ date - first(date),
date_diff != first(date_diff) & date_diff > 10 ~ date - nth(date, 2))) %>%
ungroup() %>%
select(group, date, date_diff)
# A tibble: 10 x 3
group date date_diff
<chr> <date> <drtn>
1 a 2021-05-01 0 days
2 a 2021-05-02 1 days
3 a 2021-05-03 2 days
4 b 2021-05-15 0 days
5 b 2021-05-17 2 days
6 b 2021-05-30 15 days
7 b 2021-05-31 0 days
8 b 2021-05-31 0 days
9 c 2021-05-01 0 days
10 c 2021-05-05 4 days

Rank function in R after group by

How to use R to create a rank column? Below is an example
This is what I have:
Date group
12/5/2020 A
12/5/2020 A
11/7/2020 A
11/7/2020 A
11/9/2020 B
11/9/2020 B
10/8/2020 B
This is what I want:
Date group rank
12/5/2020 A 2
12/5/2020 A 2
11/7/2020 A 1
11/7/2020 A 1
11/9/2020 B 2
11/9/2020 B 2
10/8/2020 B 1
tidyverse
(I'm using dplyr here since I think it is easy to see the steps being done.)
A first approach might be to capitalize on R's factor function, which assigns an integer to each distinct value, so that operations on this factor is faster (when compared with strings). That is, it takes a (possibly looooong) vector of strings and converts it into a just-as-long vector of integers (much smaller and faster) and a very short vector of strings, where the integers are indices into the small vector of strings. This small vector is called the factor's "levels".
library(dplyr)
group_by(dat, group) %>%
mutate(rank = as.integer(factor(Date))) %>%
ungroup()
# # A tibble: 7 x 3
# Date group rank
# <chr> <chr> <int>
# 1 12/5/2020 A 2
# 2 12/5/2020 A 2
# 3 11/7/2020 A 1
# 4 11/7/2020 A 1
# 5 11/9/2020 B 2
# 6 11/9/2020 B 2
# 7 10/8/2020 B 1
This "sorta" works, but there are two problems:
This is reliant on the lexicographic sorting of the Date column, for which this data sample is acceptable, but this will fail. A better way is to convert to something more appropriately sortable, such as a Date object.
Failing sorts:
sort(c("12/9/2020", "11/9/2020", "2/9/2020"))
# [1] "11/9/2020" "12/9/2020" "2/9/2020"
dat %>%
mutate(Date = as.Date(Date, format = "%m/%d/%Y")) %>%
group_by(group) %>%
mutate(rank = as.integer(factor(Date))) %>%
ungroup()
# # A tibble: 7 x 3
# Date group rank
# <date> <chr> <int>
# 1 2020-12-05 A 2
# 2 2020-12-05 A 2
# 3 2020-11-07 A 1
# 4 2020-11-07 A 1
# 5 2020-11-09 B 2
# 6 2020-11-09 B 2
# 7 2020-10-08 B 1
and
There really are better functions for ranking, such as dplyr::dense_rank (which #akrun put in an answer first ... I was building to it, honestly):
dat %>%
mutate(Date = as.Date(Date, format = "%m/%d/%Y")) %>%
group_by(group) %>%
mutate(rank = dense_rank(Date)) %>%
ungroup()
# # A tibble: 7 x 3
# Date group rank
# <date> <chr> <int>
# 1 2020-12-05 A 2
# 2 2020-12-05 A 2
# 3 2020-11-07 A 1
# 4 2020-11-07 A 1
# 5 2020-11-09 B 2
# 6 2020-11-09 B 2
# 7 2020-10-08 B 1
We can use dense_rank after converting the 'Date' to Date class
library(dplyr)
library(lubridate)
df1 %>%
group_by(group) %>%
mutate(rank = dense_rank(mdy(Date)))
# A tibble: 7 x 3
# Groups: group [2]
# Date group rank
# <chr> <chr> <int>
#1 12/5/2020 A 2
#2 12/5/2020 A 2
#3 11/7/2020 A 1
#4 11/7/2020 A 1
#5 11/9/2020 B 2
#6 11/9/2020 B 2
#7 10/8/2020 B 1
data
df1 <- structure(list(Date = c("12/5/2020", "12/5/2020", "11/7/2020",
"11/7/2020", "11/9/2020", "11/9/2020", "10/8/2020"), group = c("A",
"A", "A", "A", "B", "B", "B")), class = "data.frame", row.names = c(NA,
-7L))
Convert the Date column to the actual date object, arrange the data by Date and use match with unique to get rank column.
library(dplyr)
df %>%
mutate(Date = lubridate::mdy(Date)) %>%
arrange(group, Date) %>%
group_by(group) %>%
mutate(rank = match(Date, unique(Date)))
# Date group rank
# <date> <chr> <int>
#1 2020-11-07 A 1
#2 2020-11-07 A 1
#3 2020-12-05 A 2
#4 2020-12-05 A 2
#5 2020-10-08 B 1
#6 2020-11-09 B 2
#7 2020-11-09 B 2
data
df <- structure(list(Date = c("12/5/2020", "12/5/2020", "11/7/2020",
"11/7/2020", "11/9/2020", "11/9/2020", "10/8/2020"), group = c("A",
"A", "A", "A", "B", "B", "B")), class = "data.frame", row.names = c(NA, -7L))

Resources