Expand dataset by count column in Dplyr - r

I have a dataset as follows:
library(tidyverse)
df <- data.frame(
report_date = c("2020-03-14", "2020-03-14", "2020-03-19", "2020-03-20"),
start_date = c("2020-03-06", "2020-03-10", "2020-03-11", "2020-03-11"),
count = c(1, 2, 1, 3)
)
Looking like:
report_date start_date count
1 2020-03-14 2020-03-06 1
2 2020-03-14 2020-03-10 2
3 2020-03-19 2020-03-11 1
4 2020-03-20 2020-03-11 3
I want to perform a transformation using the value in count — i.e., repeating each row n times, where n is the count value for that row.
I think it's clear if I show the desired result as follows:
df_final <- data.frame(
report_date = c("2020-03-14", "2020-03-14", "2020-03-14", "2020-03-19",
"2020-03-20", "2020-03-20", "2020-03-20"),
start_date = c("2020-03-06", "2020-03-10", "2020-03-10", "2020-03-11",
"2020-03-11", "2020-03-11", "2020-03-11"),
count = c(1, 1, 1, 1, 1, 1, 1)
)
report_date start_date count
1 2020-03-14 2020-03-06 1
2 2020-03-14 2020-03-10 1
3 2020-03-14 2020-03-10 1
4 2020-03-19 2020-03-11 1
5 2020-03-20 2020-03-11 1
6 2020-03-20 2020-03-11 1
7 2020-03-20 2020-03-11 1
Thanks!

We may use uncount to replicate and then create the 'count'
library(dplyr)
library(tidyr)
df %>%
uncount(count) %>%
mutate(count = 1)
-output
report_date start_date count
1 2020-03-14 2020-03-06 1
2 2020-03-14 2020-03-10 1
3 2020-03-14 2020-03-10 1
4 2020-03-19 2020-03-11 1
5 2020-03-20 2020-03-11 1
6 2020-03-20 2020-03-11 1
7 2020-03-20 2020-03-11 1

Related

Joining two data frames on the closest date in R

I have two datasets that I would like to join based on date. One is a survey dataset, and the other is a list of prices at various dates. The dates don't match exactly, so I would like to join on the nearest date in the survey dataset (the price data is weekly).
Here's a brief snippet of what the survey dataset looks like (there are many other variables, but here's the two most relevant):
ID
actual.date
20120377
2012-09-26
2020455822
2020-11-23
20126758
2012-10-26
20124241
2012-10-25
2020426572
2020-11-28
And here's the price dataset (also much larger, but you get the idea):
date
price.var1
price.var2
2017-10-30
2.74733926399869
2.73994826674735
2015-03-16
2.77028200438506
2.74079930272231
2010-10-18
3.4265947805337
3.41591263539176
2012-10-29
4.10095806545397
4.14717556976502
2012-01-09
3.87888859352037
3.93074237884497
What I would like to do is join the price dataset to the survey dataset, joining on the nearest date.
I've tried a number of different things, none of which have worked to my satisfaction.
#reading in sample data
library(data.table)
library(dplyr)
survey <- fread(" ID actual.date
1: 20120377 2012-09-26
2: 2020455822 2020-11-23
3: 20126758 2012-10-26
4: 20124241 2012-10-25
5: 2020426572 2020-11-28
> ") %>% select(-V1)
price <- fread("date price.var1 price.var2
1: 2017-10-30 2.747339 2.739948
2: 2015-03-16 2.770282 2.740799
3: 2010-10-18 3.426595 3.415913
4: 2012-10-29 4.100958 4.147176
5: 2012-01-09 3.878889 3.930742") %>% select(-V1)
#using data.table
setDT(survey)[,DT_DATE := actual.date]
setDT(price)[,DT_DATE := date]
survey_price <- survey[price,on=.(DT_DATE),roll="nearest"]
#This works, and they join, but it drops a ton of observations, which won't work
#using dplyr
library(dplyr)
survey_price <- left_join(survey,price,by=c("actual.date"="date"))
#this joins them without dropping observations, but all of the price variables become NAs
You were almost there.
In the DT[i,on] syntax, i should be survey to join on all its rows
setDT(survey)
setDT(price)
survey_price <- price[survey,on=.(date=actual.date),roll="nearest"]
survey_price
date price.var1 price.var2 ID
<IDat> <num> <num> <int>
1: 2012-09-26 4.100958 4.147176 20120377
2: 2020-11-23 2.747339 2.739948 2020455822
3: 2012-10-26 4.100958 4.147176 20126758
4: 2012-10-25 4.100958 4.147176 20124241
5: 2020-11-28 2.747339 2.739948 2020426572
Convert the dates to numeric and find the closest date from the survey for price with Closest() from DescTools, and take that value.
Example datasets
survey <- tibble(
ID = sample(20000:40000, 9, replace = TRUE),
actual.date = seq(today() %m+% days(5), today() %m+% days(5) %m+% months(2),
"week")
)
price <- tibble(
date = seq(today(), today() %m+% months(2), by = "week"),
price_1 = sample(2:6, 9, replace = TRUE),
price_2 = sample(2:6, 9, replace = TRUE)
)
survey
# A tibble: 9 x 2
ID actual.date
<int> <date>
1 34592 2022-05-07
2 37846 2022-05-14
3 22715 2022-05-21
4 22510 2022-05-28
5 30143 2022-06-04
6 34348 2022-06-11
7 21538 2022-06-18
8 39802 2022-06-25
9 36493 2022-07-02
price
# A tibble: 9 x 3
date price_1 price_2
<date> <int> <int>
1 2022-05-02 6 6
2 2022-05-09 3 2
3 2022-05-16 6 4
4 2022-05-23 6 2
5 2022-05-30 2 6
6 2022-06-06 2 4
7 2022-06-13 2 2
8 2022-06-20 3 5
9 2022-06-27 5 6
library(tidyverse)
library(lubridate)
library(DescTools)
price <- price %>%
mutate(date = Closest(survey$actual.date %>%
as.numeric, date %>%
as.numeric) %>%
as_date())
# A tibble: 9 x 3
date price_1 price_2
<date> <int> <int>
1 2022-05-07 6 6
2 2022-05-14 3 2
3 2022-05-21 6 4
4 2022-05-28 6 2
5 2022-06-04 2 6
6 2022-06-11 2 4
7 2022-06-18 2 2
8 2022-06-25 3 5
9 2022-07-02 5 6
merge(survey, price, by.x = "actual.date", by.y = "date")
actual.date ID price_1 price_2
1 2022-05-07 34592 6 6
2 2022-05-14 37846 3 2
3 2022-05-21 22715 6 4
4 2022-05-28 22510 6 2
5 2022-06-04 30143 2 6
6 2022-06-11 34348 2 4
7 2022-06-18 21538 2 2
8 2022-06-25 39802 3 5
9 2022-07-02 36493 5 6

Retrieve only selected columns data in r with date criteria

I have two tables (orders, prices) and I would like to retrieve the Monthly_code and daily_code from the orders table into the prices table based on date criteria. Neither table has a unique primary key.
**Orders table data**
# Build the orders table: one row per order with its codes and the
# date window [Time_in, Time_out] (dates stored as m/d/Y strings).
# NOTE: the original snippet was missing the closing parenthesis of the
# data.table() call, making it a syntax error — fixed here.
orders <- data.table(
  ID = c(1, 1, 1, 2, 2, 3),
  Monthly_code = c('xx', 'xx', 'vv', 'uu', 'mm', 'gg'),
  daily_code = c('xx-1', 'xx-1', 'vv-1', 'uu-1', 'mm-1', 'gg-1'),
  Time_in = c('12/1/2020', '12/16/2020', '12/28/2020', '6/1/2020', '4/5/2020', '6/9/2020'),
  Time_out = c('12/6/2020', '12/27/2020', '12/31/2020', '6/13/2020', '4/12/2020', '6/23/2020')
)
**Prices table data**
prices <- data.table(ID = c(1,1,1,1,2,2,2,3), record_date = c('12/2/2020','12/3/2020','12/4/2020',
'12/5/2020', '6/6/2020', '6/7/2020', '6/8/2020' , '6/20/2020'), Price = c(20,22,21,22,13,15,22,30))
**Expected results data**
price_2 <- data.table(ID = c(1,1,1,1,2,2,2,3), record_date = c('12/2/2020','12/3/2020','12/4/2020',
'12/5/2020', '6/6/2020', '6/7/2020', '6/8/2020' , '6/20/2020'),
Price = c(20,22,21,22,13,15,22,30), Monthly_code = c('xx','xx','xx','xx', 'uu','uu', 'uu','gg'),
daily_code = c('xx-1', 'xx-1', 'xx-1','xx-1', 'uu-1', 'uu-1','uu-1','gg-1'))
You can use fuzzyjoin to join two dataframes based on a range condition.
library(dplyr)
library(lubridate)
library(fuzzyjoin)
orders %>%
mutate(across(starts_with('Time'), mdy)) %>%
fuzzy_right_join(prices %>% mutate(record_date = mdy(record_date)),
by = c('ID', 'Time_in' = 'record_date', 'Time_out' = 'record_date'),
match_fun = c(`==`, `<=`, `>=`)) -> result
result
# ID.x Monthly_code daily_code Time_in Time_out ID.y record_date Price
#1 1 xx xx-1 2020-12-01 2020-12-06 1 2020-12-02 20
#2 1 xx xx-1 2020-12-01 2020-12-06 1 2020-12-03 22
#3 1 xx xx-1 2020-12-01 2020-12-06 1 2020-12-04 21
#4 1 xx xx-1 2020-12-01 2020-12-06 1 2020-12-05 22
#5 2 uu uu-1 2020-06-01 2020-06-13 2 2020-06-06 13
#6 2 uu uu-1 2020-06-01 2020-06-13 2 2020-06-07 15
#7 2 uu uu-1 2020-06-01 2020-06-13 2 2020-06-08 22
#8 3 gg gg-1 2020-06-09 2020-06-23 3 2020-06-20 30

How many days from the list were in given period [R]

I’d like to count using R, how many days of given list:
2020-10-01
2020-10-03
2020-10-07
2020-10-08
2020-10-09
2020-10-10
2020-10-14
2020-10-17
2020-10-21
2020-10-22
2020-10-27
2020-10-29
2020-10-30
Were in given period from start to end:
id start end
1 2020-10-05 2020-10-30
2 2020-10-06 2020-10-29
3 2020-10-10 2020-10-12
And the result should be for example:
id number of days
1 5
2 18
3 12
Here you can find a tidyverse approach with lubridate and dplyr.
library(lubridate)
library(dplyr)
df %>%
count(id, start, end,
wt = days %within% interval(start, end),
name = "number_of_days")
#> id start end number_of_days
#> 1 1 2020-10-05 2020-10-30 11
#> 2 2 2020-10-06 2020-10-29 10
#> 3 3 2020-10-10 2020-10-12 1
For each row, count the number of days within the interval of start and end (extremes included).
(If you don't want to see start and end just remove them from the first line of count)
Where:
days <- c("2020-10-01",
"2020-10-03",
"2020-10-07",
"2020-10-08",
"2020-10-09",
"2020-10-10",
"2020-10-14",
"2020-10-17",
"2020-10-21",
"2020-10-22",
"2020-10-27",
"2020-10-29",
"2020-10-30")
df <- read.table(text = " id start end
1 2020-10-05 2020-10-30
2 2020-10-06 2020-10-29
3 2020-10-10 2020-10-12", header = TRUE)
days <- as.Date(days)
df$start <- as.Date(df$start)
df$end <- as.Date(df$end)
Assuming all the dates are of date class you can use mapply :
df2$num_days <- mapply(function(x, y) sum(df1$dates >= x & df1$dates <= y), df2$start, df2$end)

full_join but with condition on matching

I am going to try to make this as simple as possible, I would like a dplyr solution if possible:
Let's say I have a DataFrame of 2 columns called f1. The 2 columns are the reference number of an event and the date_begin is the begin date of the event:
f1
reference date_begin
1 01100144609598 2020-08-15
2 01100144692499 2020-08-12
3 01100144609598 2020-08-09
4 01100434045112 2020-08-26
5 01100434067379 2020-08-24
6 01100723546188 2020-08-16
I also have another DataFrame called f2 with 2 columns. The 2 columns are the reference number of an event and the date_end is the ending date of the event:
reference date_end
1 01100144609598 2020-09-06
2 01100144692499 2020-08-10
3 01100434121179 2020-08-25
4 01100578756185 2020-08-17
5 01100578757962 2020-08-31
6 01100578846401 2020-08-16
I want to use a full_join by reference. That being said:
If there is an end date before there is a begin date I want an NA in stead of the begin date
The end date has to be > than the begin date
If there are 2 end dates for the same reference that are bigger than a begin date take the smallest end date
If there is a begin date with no end date the end date should have an NA
So in this reproducible example I should have an f3 that looks something like the following :
reference date_begin date_end
1 01100144609598 2020-08-15 2020-09-06
2 01100144692499 NA 2020-08-10
3 01100144692499 2020-08-12 NA
4 01100434121179 NA 2020-08-25
5 01100578756185 NA 2020-08-17
6 01100578757962 NA 2020-08-31
7 01100578846401 NA 2020-08-16
8 01100144609598 2020-08-09 NA
9 01100434045112 2020-08-26 NA
10 01100434067379 2020-08-24 NA
11 01100723546188 2020-08-16 NA
As Chuck P mentions, the conditions make this a bit complicated. Rather than use full_join, I've first combined f1 and f2 and then transformed to "long" format. We can then group by reference and sort by date to set up to use case_when to apply the conditions stated in the post or other conditions as needed. The result is then transformed back to "wide" format to present as shown in the post. The code is
library(tidyverse)
#
# combine f1 and f2 and pivot to long format
#
all <- bind_rows(f1,f2) %>%
pivot_longer(cols = c(date_begin, date_end),
names_to = "type", values_to = "date",
values_drop_na = TRUE)
#
# group by reference, sort by date, and then use
# case_when function to pair begin and end dates
#
all <- all %>% group_by(reference) %>%
arrange(date) %>%
mutate(index = 1:n(),
index = case_when(
type == "date_end" & lag(type, n = 1) == "date_begin" ~ lag(index),
TRUE ~ index))
#
# pivot back to wide format to format results as shown in post
#
result <- all %>% pivot_wider(names_from =type, values_from = date) %>% mutate(index = NULL)
The result is
> result
# A tibble: 11 x 3
# Groups: reference [9]
reference date_begin date_end
<chr> <date> <date>
1 01100144609598 2020-08-09 NA
2 01100144692499 NA 2020-08-10
3 01100144692499 2020-08-12 NA
4 01100144609598 2020-08-15 2020-09-06
5 01100723546188 2020-08-16 NA
6 01100578846401 NA 2020-08-16
7 01100578756185 NA 2020-08-17
8 01100434067379 2020-08-24 NA
9 01100434121179 NA 2020-08-25
10 01100434045112 2020-08-26 NA
11 01100578757962 NA 2020-08-31
where the results are sorted by date.
This is more complex than it first appears because of the conditional logic. I broke it down into three steps that occur after we do the initial full_join to make f3
library(dplyr)
library(tidyr)
library(purrr)
f3 <- full_join(f1, f2)
#> Joining, by = "reference"
f3
#> reference date_begin date_end
#> 1 01100144609598 2020-08-15 2020-09-06
#> 2 01100144692499 2020-08-12 2020-08-10
#> 3 01100144609598 2020-08-09 2020-09-06
#> 4 01100434045112 2020-08-26 <NA>
#> 5 01100434067379 2020-08-24 <NA>
#> 6 01100723546188 2020-08-16 <NA>
#> 7 01100434121179 <NA> 2020-08-25
#> 8 01100578756185 <NA> 2020-08-17
#> 9 01100578757962 <NA> 2020-08-31
#> 10 01100578846401 <NA> 2020-08-16
Step 1: set aside the rows where we don't have to do anything, because either the begin date or the end date is NA.
nothing_to_do <-
f3 %>% filter(is.na(date_begin) | is.na(date_end))
Step 2: identify rows where we have a begin date after an end date, like "01100144692499"; for these we actually have to add a row and then adjust the rows.
end_before_beginning <-
f3 %>% filter(date_begin > date_end) %>%
group_by(reference) %>%
do (
add_row(.,
reference = .$reference,
date_begin = .$date_begin,
.after = 1)
) %>%
ungroup() %>%
mutate(date_begin =
case_when(
!is.na(date_end) ~ as.Date(NA_character_),
TRUE ~ date_begin
))
Step 3: identify rows with multiple begin dates for the same end date, where we have to select the one with the shortest gap, like "01100144609598".
multiple_beginnings <-
f3 %>%
group_by(reference, date_end) %>%
mutate(instances = n(),
date_diff = date_end - date_begin) %>%
filter(instances > 1) %>%
mutate(date_end =
case_when(
date_diff != min(date_diff) ~ as.Date(NA_character_),
TRUE ~ date_end
)) %>%
select(-instances, -date_diff)
Glue them all to together
final_answer <-
list(nothing_to_do, end_before_beginning, multiple_beginnings) %>%
reduce(full_join)
#> Joining, by = c("reference", "date_begin", "date_end")
#> Joining, by = c("reference", "date_begin", "date_end")
final_answer
#> reference date_begin date_end
#> 1 01100434045112 2020-08-26 <NA>
#> 2 01100434067379 2020-08-24 <NA>
#> 3 01100723546188 2020-08-16 <NA>
#> 4 01100434121179 <NA> 2020-08-25
#> 5 01100578756185 <NA> 2020-08-17
#> 6 01100578757962 <NA> 2020-08-31
#> 7 01100578846401 <NA> 2020-08-16
#> 8 01100144692499 <NA> 2020-08-10
#> 9 01100144692499 2020-08-12 <NA>
#> 10 01100144609598 2020-08-15 2020-09-06
#> 11 01100144609598 2020-08-09 <NA>
Your data...
f1 <- structure(list(reference = c("01100144609598", "01100144692499",
"01100144609598", "01100434045112", "01100434067379", "01100723546188"),
date_begin = structure(c(18489, 18486, 18483, 18500, 18498,
18490), class = "Date")), row.names = c(NA, -6L), class = "data.frame")
f2 <- structure(list(reference = c("01100144609598", "01100144692499",
"01100434121179", "01100578756185", "01100578757962", "01100578846401"),
date_end = structure(c(18511, 18484, 18499, 18491, 18505,
18490), class = "Date")), row.names = c(NA, -6L), class = "data.frame")

Is there a way to sum data grouping by date with a time period?

I have data which associates a time period (actually a start date and end date) and a continuous value.
I'd like to find a way to sum the values of my third variable for each day during the time period.
For example with this table :
START END NUMBER
1 2020-03-16 2020-05-31 5
2 2020-03-16 2020-06-30 7
3 2020-03-17 2020-08-31 1
Have a new table with :
DAY SUM
2020-03-16 12
2020-03-17 13
2020-03-18 13
...
2020-05-31 13
2020-06-01 8
...
And so on. Is there a way to do that? Maybe with the help of lubridate?
Thanks!
Try:
library(data.table)
setDT(df)[, c('START', 'END') := lapply(.SD, function(x) as.Date(as.character(x))), .SDcols = 1:2][
, .(DAY = seq(START, END, by = 'day'), NUMBER = NUMBER), by = 1:nrow(df)][
, .(SUM = sum(NUMBER)), by = DAY]
Output:
DAY SUM
1: 2020-03-16 12
2: 2020-03-17 13
3: 2020-03-18 13
4: 2020-03-19 13
5: 2020-03-20 13
---
165: 2020-08-27 1
166: 2020-08-28 1
167: 2020-08-29 1
168: 2020-08-30 1
169: 2020-08-31 1
Another data.table option using non-equi join:
ans <- DT[.(DATE=seq(min(START), max(END), by="1 day")), on=.(START<=DATE, END>=DATE),
by=.EACHI, .(SUM=sum(NUMBER))][, (1L) := NULL][]
setnames(ans, "END", "DAY")[]
output:
DAY SUM
1: 2020-03-16 12
2: 2020-03-17 13
3: 2020-03-18 13
4: 2020-03-19 13
5: 2020-03-20 13
---
165: 2020-08-27 1
166: 2020-08-28 1
167: 2020-08-29 1
168: 2020-08-30 1
169: 2020-08-31 1
data:
library(data.table)
DT <- fread("START END NUMBER
2020-03-16 2020-05-31 5
2020-03-16 2020-06-30 7
2020-03-17 2020-08-31 1")
cols <- c("START", "END")
DT[, (cols) := lapply(.SD, as.IDate, format="%Y-%m-%d"), .SDcols=cols]
1) Base R Using the data shown reproducibly in the Note at the end, lapply over each row, expanding the date range into a sequence of dates using seq. This gives a list with one component per input row, and we rbind those together giving long. Then aggregate long by Date. No packages are used.
# Expand row `i` of `data` into one row per day of its [START, END] range,
# carrying NUMBER along unchanged. Returns a data.frame with columns
# Date and NUMBER.
expand <- function(i, data) {
  row <- data[i, ]
  data.frame(Date = seq(row$START, row$END, "day"), NUMBER = row$NUMBER)
}
long <- do.call("rbind", lapply(1:nrow(DF), expand, data = DF))
result <- aggregate(NUMBER ~ Date, long, sum)
head(result)
giving:
Date NUMBER
1 2020-03-16 12
2 2020-03-17 13
3 2020-03-18 13
4 2020-03-19 13
5 2020-03-20 13
6 2020-03-21 13
2) dplyr Expand each row in the rowwise code and then sum NUMBER over Date in the group_by code.
library(dplyr)
DF %>%
rowwise %>%
do(data.frame(Date = seq(.$START, .$END, "day"), NUMBER = .$NUMBER)) %>%
ungroup %>%
group_by(Date) %>%
summarize(NUMBER = sum(NUMBER)) %>%
ungroup
Note
Lines <- " START END NUMBER
1 2020-03-16 2020-05-31 5
2 2020-03-16 2020-06-30 7
3 2020-03-17 2020-08-31 1"
DF <- read.table(text = Lines)
DF[1:2] <- lapply(DF[1:2], as.Date)

Resources