Calculate the days differences with mixed date format in R - r

I need to count differences in days between two mixed-structured dates. Here is an example dataset:
# Example input: both date columns mix "YYYY/MM/DD h:mm:ss AM/PM TZ"
# timestamps with plain "YYYY-MM-DD" dates, all stored as text.
testdata <- data.frame(
  id = c(1, 2, 3),
  date1 = c(
    "2022/11/13 9:19:03 AM PST",
    "2022-11-01",
    "2022-10-28"
  ),
  date2 = c(
    "2022/12/12 1:52:29 PM PST",
    "2022-10-21",
    "2022/12/01 8:15:25 AM PST"
  )
)
> testdata
id date1 date2
1 1 2022/11/13 9:19:03 AM PST 2022/12/12 1:52:29 PM PST
2 2 2022-11-01 2022-10-21
3 3 2022-10-28 2022/12/01 8:15:25 AM PST
First I need to grab dates, exclude the hours, and calculate the number of days differences. So the expected dataset would be:
> df
id date1 date2 days.diff
1 1 2022/11/13 2022/12/12 29
2 2 2022-11-01 2022-10-21 11
3 3 2022-10-28 2022/12/01 34

You could use the anytime package, which guesses the format of each date string, and then subtract the parsed dates like this:
library(dplyr)
library(anytime)

# anydate() parses each mixed-format string straight to a Date, so the
# time-of-day part is discarded and the difference is a whole number of days.
# (Subtracting anytime() results, which are date-times, can pick up a
# fraction of a day, e.g. when a daylight-saving change falls between the two
# dates.) anydate() is vectorised, so rowwise() is not needed.
# abs() makes the result independent of which column holds the later date.
testdata %>%
  as_tibble() %>%
  mutate(days.diff = abs(as.integer(anydate(date2) - anydate(date1))))
#> # A tibble: 3 × 4
#>      id date1                     date2                     days.diff
#>   <dbl> <chr>                     <chr>                         <int>
#> 1     1 2022/11/13 9:19:03 AM PST 2022/12/12 1:52:29 PM PST        29
#> 2     2 2022-11-01                2022-10-21                       11
#> 3     3 2022-10-28                2022/12/01 8:15:25 AM PST        34
Created on 2023-01-20 with reprex v2.0.2

Using as.Date with tryFormats
library(dplyr)
# Parse row by row: as.Date() settles on one format from `tryFormats` per
# call, so each call must see a single string. Both date columns are
# converted in place, then subtracted.
testdata %>%
  rowwise() %>%
  mutate(
    across(
      starts_with("date"),
      function(txt) {
        as.Date(txt, tryFormats = c("%Y/%m/%d %H:%M:%S", "%Y-%m-%d"))
      }
    ),
    days.diff = date2 - date1
  ) %>%
  ungroup()
# A tibble: 3 × 4
id date1 date2 days.diff
<dbl> <date> <date> <drtn>
1 1 2022-11-13 2022-12-12 29 days
2 2 2022-11-01 2022-10-21 -11 days
3 3 2022-10-28 2022-12-01 34 days

Related

R: Get unique values based on criteria from 2 other columns

Hi, I would like to keep only one row for each Code. The row to keep is the one whose Refresh Date is the nearest date that is >= Effective Date. If a Code has no Refresh Date that is >= Effective Date, then keep the row with the nearest Refresh Date < Effective Date instead.
Below is my sample dataframe.
# Sample data: three codes, each with one effective date and four candidate
# refresh dates.
Code <- rep(c("A", "B", "C"), each = 4)
Effective_Date <- as.Date(
  rep(c("2020-08-25", "2021-12-18", "2021-10-15"), each = 4)
)
Refresh_Date <- as.Date(c(
  "2020-09-25", "2021-09-17", "2022-11-25", "2020-02-20", # Code A
  "2021-12-12", "2021-12-18", "2022-01-15", "2021-08-19", # Code B
  "2021-08-20", "2020-08-25", "2021-09-30", "2020-08-25"  # Code C
))
DF <- data.frame(Code, Effective_Date, Refresh_Date)
> DF
Code Effective_Date Refresh_Date
1 A 2020-08-25 2021-09-17
2 A 2020-08-25 2020-09-25
3 A 2020-08-25 2022-11-25
4 A 2020-08-25 2020-02-20
5 B 2021-12-18 2021-12-14
6 B 2021-12-18 2021-12-18
7 B 2021-12-18 2022-01-15
8 B 2021-12-18 2021-08-19
9 C 2021-10-15 2021-08-20
10 C 2021-10-15 2020-08-25
11 C 2021-10-15 2021-09-30
12 C 2021-10-15 2020-08-25
It's just like aggregating to Code and Effective Date. But get the row that has the nearest Refresh Date >= Effective Date. And if there is no Refresh Date that is >= Effective Date then just get the nearest Refresh Date < Effective Date.
Below is my desired output:
> DF_DesiredOutput
Code Effective_Date Refresh_Date
1 A 2020-08-25 2020-09-25
2 B 2021-12-18 2021-12-18
3 C 2021-10-15 2021-09-30
We can use slice on the difference of 'Refresh_Date' and 'Effective_Date', get the index of the min value, after grouping by 'Code'
library(dplyr)
# For each Code keep one row: the Refresh_Date closest to Effective_Date
# among those on or after it. Only when a Code has no such date does the
# closest earlier Refresh_Date win. (A plain minimum of the absolute gap would
# pick an earlier date whenever it happens to be nearer than a later one.)
DF %>%
  group_by(Code) %>%
  slice(
    # order() sorts FALSE before TRUE, so on-or-after dates come first;
    # within each side the smallest absolute gap comes first.
    order(
      Refresh_Date < Effective_Date,
      abs(as.numeric(Refresh_Date - Effective_Date))
    )[1]
  ) %>%
  ungroup()
-output
# A tibble: 3 × 3
Code Effective_Date Refresh_Date
<chr> <date> <date>
1 A 2020-08-25 2020-09-25
2 B 2021-12-18 2021-12-18
3 C 2021-10-15 2021-09-30
Here is an alternative approach using arrange by the absolute difference and then slice:
library(dplyr)
# Within each Code, sort so that refresh dates on or after the effective date
# come first (FALSE sorts before TRUE), nearest first; earlier refresh dates
# follow, also nearest first. The first row is then the requested one even
# when an earlier date is closer than any later date.
DF %>%
  group_by(Code) %>%
  arrange(
    Refresh_Date < Effective_Date,
    abs(Refresh_Date - Effective_Date),
    .by_group = TRUE
  ) %>%
  slice(1) %>%
  # Drop the grouping so later steps do not silently run per Code.
  ungroup()
Code Effective_Date Refresh_Date
<chr> <date> <date>
1 A 2020-08-25 2020-09-25
2 B 2021-12-18 2021-12-18
3 C 2021-10-15 2021-09-30

R: Convert monthly data into daily data for panel data

I have the following data:
5 Products with a monthly rating from 2018-08 to 2018-12
Now, with the help of R, I would like to convert the monthly data into daily data in panel (long) format. The monthly rating for each product will also be the rating for each day in the respective month.
So, that the new data will look like:
(with the first column being the product, the second column the date and the third column the rating)
A 2018-08-01 1
A 2018-08-02 1
A 2018-08-03 1
A 2018-08-04 1
... so on
A 2018-09-01 1
A 2018-09-02 1
...so on
A 2018-12-31 1
B 2018-08-01 3
B 2018-08-02 3
... so on
E 2018-12-31 3
library(tidyverse)
library(lubridate)
#>
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#>
#> date, intersect, setdiff, union
# Example data: one row per product, one column per rated month.
data <- tribble(
  ~Product, ~`Product Rating 2018-08`, ~`Product Rating 2018-10`,
  "A",      1,                         1,
  "B",      3,                         3
)

# Long format: the "YYYY-MM" suffix of each column name becomes `name`, the
# first day of the rated month.
data2 <- mutate(
  pivot_longer(data, -Product),
  name = as.Date(paste0(str_extract(name, "[0-9-]+$"), "-01"))
)

# Pair every calendar day with every monthly rating, then keep only the pairs
# that fall in the same year and month (a join on year and month).
tibble(
  date = seq(as.Date("2018-08-01"), as.Date("2018-12-31"), by = "days")
) %>%
  expand_grid(data2) %>%
  filter(year(date) == year(name), month(date) == month(name)) %>%
  select(Product, date, value)
#> # A tibble: 124 × 3
#> Product date value
#> <chr> <date> <dbl>
#> 1 A 2018-08-01 1
#> 2 B 2018-08-01 3
#> 3 A 2018-08-02 1
#> 4 B 2018-08-02 3
#> 5 A 2018-08-03 1
#> 6 B 2018-08-03 3
#> 7 A 2018-08-04 1
#> 8 B 2018-08-04 3
#> 9 A 2018-08-05 1
#> 10 B 2018-08-05 3
#> # … with 114 more rows
Created on 2022-03-09 by the reprex package (v2.0.0)

Tally if observations fall in date windows

I have a data frame that represents policies with start and end dates. I'm trying to tally the count of policies that are active each month.
library(tidyverse)
# Reporting window requested by the question: it ends in month `amonth` of
# year `ayear` and reaches back `months` months.
ayear <- 2021
amonth <- 10
months <- 12
# One row per policy; begin (bdate) and end (edate) dates are stored as
# "YYYY-MM-DD" character strings, not as Date objects.
df <- tibble(
pol = c(1, 2, 3, 4)
, bdate = c('2021-02-23', '2019-12-03', '2020-08-11', '2020-12-14')
, edate = c('2022-02-23', '2020-12-03', '2021-08-11', '2021-06-14')
)
These four policies have a begin date (bdate) and end date (edate). Beginning in October (amonth) 2021 (ayear) and going back 12 months (months) I'm trying to generate a count of how many of the 4 policies were active at some point in the month to generate a data frame that looks something like this.
Data frame I'm trying to generate would have three columns: month, year, and active_pol_count with 12 rows. Like this.
library(tidyverse)
library(lubridate)
#>
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#>
#> date, intersect, setdiff, union
df <- tibble(
  pol = c(1, 2, 3, 4),
  bdate = c("2021-02-23", "2019-12-03", "2020-08-11", "2020-12-14"),
  edate = c("2022-02-23", "2020-12-03", "2021-08-11", "2021-06-14")
)

# Transform start and end date to an interval.
df <- mutate(df, interval = interval(bdate, edate))

# For each of the 12 months from 2020-10 to 2021-09, count the policies whose
# interval overlaps that calendar month, i.e. that were active at some point
# in the month. Testing only whether the 1st of the month lies inside the
# interval would miss policies that start later in the month (policy 4 in
# 2020-12, policy 1 in 2021-02).
seq(as.Date("2020-10-01"), as.Date("2021-09-01"), by = "months") %>%
  tibble(date = .) %>%
  mutate(
    year = year(date),
    month = month(date),
    active_pol_count = map_dbl(date, function(first_day) {
      # Last day of the month = first day of the next month minus one day.
      last_day <- seq(first_day, by = "month", length.out = 2)[2] - 1
      sum(int_overlaps(interval(first_day, last_day), df$interval))
    })
  )
#> # A tibble: 12 x 4
#>    date        year month active_pol_count
#>    <date>     <dbl> <dbl>            <dbl>
#>  1 2020-10-01  2020    10                2
#>  2 2020-11-01  2020    11                2
#>  3 2020-12-01  2020    12                3
#>  4 2021-01-01  2021     1                2
#>  5 2021-02-01  2021     2                3
#>  6 2021-03-01  2021     3                3
#>  7 2021-04-01  2021     4                3
#>  8 2021-05-01  2021     5                3
#>  9 2021-06-01  2021     6                3
#> 10 2021-07-01  2021     7                2
#> 11 2021-08-01  2021     8                2
#> 12 2021-09-01  2021     9                1
Created on 2021-12-13 by the reprex package (v2.0.1)

Create a column for days sampled e.g. 0,10,30 days,starting with 0 days for every study area?

I would like to create some sampling effort curves for species data. There are several study areas, each with a number of sampling plots that were resampled over a certain time period. My data set looks similar to this one:
# Example data: one row per sampling record, giving the plot, the species
# found, the sampling date and the study area the plot belongs to.
df1 <- data.frame(PlotID = c("A","A","A","A","A","B","B","B","B","B","C","C","C","C","C","D","D","D","D","D","E","E","E"),
species = c("x","x","x1","x","x1","x2","x1","x3","x4","x4","x5","x5","x","x3","x","x3","x3","x4","x5","x","x1","x2","x3"),
# Dates are written day-month-year; the second argument of as.Date() is the
# parsing format.
date = as.Date(c("27-04-1995", "26-05-1995", "02-08-1995", "02-05-1995", "28-09-1995", "02-08-1994", "31-05-1995", "27-07-1995", "06-12-1995", "03-05-1996", "27-04-1995", "31-05-1995", "29-06-1994", "30-08-1995", "26-05-1994", "30-05-1995", "30-06-1995", "30-06-1995", "30-06-1995", "30-08-1995", "31-08-1995", "01-09-1995","02-09-1995"),'%d-%m-%Y'),
area= c("A","A","A","A","A","A","A","A","A","A","B","B","B","B","B","B","B","B","B","B","C","C","C"))
I really would like an output that gives me an extra column of time of sampling e.g. 0, 10 days, 30days for the whole dataframe, but times should start with 0 for each area. I tried this:
# Asker's attempt: rows are ordered by PlotID, then date, then species, and
# only afterwards grouped by area, so lag(date, 1) is the date of the previous
# row of the same area in that PlotID-first order (not in date order).
effort<-df1%>% arrange(PlotID, date,species) %>% group_by(area) %>%
mutate(diffDate = difftime(date, lag(date,1))) %>% ungroup()
But somehow my code produces nonsense?
Could please somebody enlighten me?
In the end I would like to achieve something like the example below: a list of matrices, one for every research area, with species as rows and, instead of sampling plots, time as columns (in days, showing the increasing sampling effort). The example shows a data set from the package iNEXT. But I'm stuck with getting the days of sampling calculated for every area between the sampling dates. For now I just want this extra column showing the days between the sampling events in each area and the species found. I hope it's a bit clearer now.
Edit: This is how the date in my real data set looks like:
output from dput(head(my.data))
date= structure(c(801878400, 798940800, 780710400, 769910400, 775785600, 798940800), class = c("POSIXct", "POSIXt"), tzone = "UTC")
a possible tidyverse solution would be
library(dplyr)
# Sort chronologically inside each area, then measure every sampling date
# against the area's earliest date and against the preceding row's date.
# The result stays grouped by area.
df1 %>%
  arrange(area, date) %>%
  group_by(area) %>%
  mutate(
    diff_date_from_start = date - min(date),
    diff_date_from_prev = date - lag(date)
  )
#> # A tibble: 23 x 6
#> # Groups: area [3]
#> PlotID species date area diff_date_from_start diff_date_from_prev
#> <chr> <chr> <date> <chr> <drtn> <drtn>
#> 1 B x2 1994-08-02 A 0 days NA days
#> 2 A x 1995-04-27 A 268 days 268 days
#> 3 A x 1995-05-02 A 273 days 5 days
#> 4 A x 1995-05-26 A 297 days 24 days
#> 5 B x1 1995-05-31 A 302 days 5 days
#> 6 B x3 1995-07-27 A 359 days 57 days
#> 7 A x1 1995-08-02 A 365 days 6 days
#> 8 A x1 1995-09-28 A 422 days 57 days
#> 9 B x4 1995-12-06 A 491 days 69 days
#> 10 B x4 1996-05-03 A 640 days 149 days
#> # … with 13 more rows
The diff_date_from_prev variable might make more sense if you group by other variables as well, such as species and PlotID.
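The diff_date_from_start variable is the difference in days between the current sample and the first sample in each area, while diff_date_from_prev is the difference in days between the current sample and the previous sample in the same area.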
Edit to answer comment:
Your date is stored as POSIXct and not as Date class. If time zones are not relevant, I find it easier to work with Date, so one option is converting to Date with as.Date() and then applying the manipulations as stated previously. Alternatively, you can use the difftime() function, as suggested by @Rui Barradas in the comments, and specify the units accordingly.
df1 <- data.frame(
  PlotID = c("A","A","A","A","A","B","B","B","B","B","C","C","C","C","C","D","D","D","D","D","E","E","E"),
  species = c("x","x","x1","x","x1","x2","x1","x3","x4","x4","x5","x5","x","x3","x","x3","x3","x4","x5","x","x1","x2","x3"),
  # Date stored as POSIXct, not as Date; they are different classes.
  # `format` must be passed by name: the second positional argument of
  # as.POSIXct() is `tz`, so an unnamed "%d-%m-%Y" is taken as a time zone
  # and the strings are then parsed year-first ("27-04-1995" becomes day 19
  # of April in year 27). tz = "UTC" keeps every difference a whole number of
  # days (no daylight-saving shifts).
  date = as.POSIXct(
    c("27-04-1995", "26-05-1995", "02-08-1995", "02-05-1995", "28-09-1995",
      "02-08-1994", "31-05-1995", "27-07-1995", "06-12-1995", "03-05-1996",
      "27-04-1995", "31-05-1995", "29-06-1994", "30-08-1995", "26-05-1994",
      "30-05-1995", "30-06-1995", "30-06-1995", "30-06-1995", "30-08-1995",
      "31-08-1995", "01-09-1995", "02-09-1995"),
    format = "%d-%m-%Y",
    tz = "UTC"
  ),
  area = c("A","A","A","A","A","A","A","A","A","A","B","B","B","B","B","B","B","B","B","B","C","C","C")
)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union

# Option 1: convert to Date first, then subtract the area's earliest date.
df1 %>%
  arrange(area, date) %>%
  group_by(area) %>%
  mutate(
    date = as.Date(date),
    diff_date_from_start = date - min(date)
  )
#> # A tibble: 23 x 5
#> # Groups:   area [3]
#>    PlotID species date       area  diff_date_from_start
#>    <chr>  <chr>   <date>     <chr> <drtn>
#>  1 B      x2      1994-08-02 A       0 days
#>  2 A      x       1995-04-27 A     268 days
#>  3 A      x       1995-05-02 A     273 days
#>  4 A      x       1995-05-26 A     297 days
#>  5 B      x1      1995-05-31 A     302 days
#>  6 B      x3      1995-07-27 A     359 days
#>  7 A      x1      1995-08-02 A     365 days
#>  8 A      x1      1995-09-28 A     422 days
#>  9 B      x4      1995-12-06 A     491 days
#> 10 B      x4      1996-05-03 A     640 days
#> # … with 13 more rows

# Option 2, as suggested by Rui Barradas: keep the date as POSIXct and use
# difftime() with explicit units.
df1 %>%
  arrange(area, date) %>%
  group_by(area) %>%
  mutate(
    diff_date_time = difftime(date, min(date), units = "days")
  )
#> # A tibble: 23 x 5
#> # Groups:   area [3]
#>    PlotID species date                area  diff_date_time
#>    <chr>  <chr>   <dttm>              <chr> <drtn>
#>  1 B      x2      1994-08-02 00:00:00 A       0 days
#>  2 A      x       1995-04-27 00:00:00 A     268 days
#>  3 A      x       1995-05-02 00:00:00 A     273 days
#>  4 A      x       1995-05-26 00:00:00 A     297 days
#>  5 B      x1      1995-05-31 00:00:00 A     302 days
#>  6 B      x3      1995-07-27 00:00:00 A     359 days
#>  7 A      x1      1995-08-02 00:00:00 A     365 days
#>  8 A      x1      1995-09-28 00:00:00 A     422 days
#>  9 B      x4      1995-12-06 00:00:00 A     491 days
#> 10 B      x4      1996-05-03 00:00:00 A     640 days
#> # … with 13 more rows
Created on 2021-06-13 by the reprex package (v2.0.0)
I solved it with a for loop
# Days elapsed since the first sampling date of each area, stored as plain
# numbers in df1$diffdate.
areas <- unique(df1$area)
df1$diffdate <- 0
# seq_along() also behaves when `areas` is empty (1:length(areas) would
# iterate over 1 and 0).
for (i in seq_along(areas)) {
  in_area <- df1$area == areas[i]
  # Ask for days explicitly: plain subtraction of POSIXct values lets R choose
  # the unit automatically, and that unit is dropped when the result is
  # assigned into a numeric column. difftime() accepts Date and POSIXct alike.
  df1$diffdate[in_area] <- as.numeric(
    difftime(df1$date[in_area], min(df1$date[in_area]), units = "days")
  )
}
Do you want a sequence of dates by 10 days for each group of area?
library(dplyr)
library(tidyr)
df1 %>%
  arrange(PlotID, date, species) %>%
  group_by(area) %>%
  # One row per calendar day between each area's first and last sampling date.
  complete(date = full_seq(date, 1)) %>%
  # Carry the last observed species / PlotID forward into the filled-in days
  # and bin the days since the area's first date into steps of 10.
  mutate(species = zoo::na.locf(species),
         PlotID = zoo::na.locf(PlotID),
         diffDate = 10 * (as.integer(date - first(date)) %/% 10)) %>%
  # Keep the first row of every 10-day step *within each area*. Grouping by
  # diffDate alone would keep a step only for the first area that reaches it
  # and drop the matching rows of all other areas.
  group_by(area, diffDate) %>%
  filter(row_number() == 1) %>%
  ungroup()
## A tibble: 113 x 5
#   area  date       PlotID species diffDate
#   <chr> <date>     <chr>  <chr>      <dbl>
# 1 A     1994-08-02 B      x2             0
# 2 A     1994-08-12 B      x2            10
# 3 A     1994-08-22 B      x2            20
# 4 A     1994-09-01 B      x2            30
# 5 A     1994-09-11 B      x2            40
# 6 A     1994-09-21 B      x2            50
# 7 A     1994-10-01 B      x2            60
# 8 A     1994-10-11 B      x2            70
# 9 A     1994-10-21 B      x2            80
#10 A     1994-10-31 B      x2            90
## … with 103 more rows

Spread a data.frame with repetitive column

I have a large data.frame that I am trying to spread. A toy example looks like this.
data = data.frame(date = rep(c("2019", "2020"), 2), ticker = c("SPY", "SPY", "MSFT", "MSFT"), value = c(1, 2, 3, 4))
head(data)
date ticker value
1 2019 SPY 1
2 2020 SPY 2
3 2019 MSFT 3
4 2020 MSFT 4
I would like to spread it so the data.frame looks like this.
spread(data, key = ticker, value = value)
date MSFT SPY
1 2019 3 1
2 2020 4 2
However, when I do this on my actual data.frame, I get an error.
Error: Each row of output must be identified by a unique combination of keys.
Keys are shared for 18204 rows:
* 30341, 166871
* 30342, 166872
* 30343, 166873
* 30344, 166874
* 30345, 166875
* 30346, 166876
* 30347, 166877
* 30348, 166878
* 30349, 166879
* 30350, 166880
* 30351, 166881
* 30352, 166882
Below is a head and tail of my data.frame
head(df)
ref.date ticker weeklyReturn
<date> <chr> <dbl>
1 2008-02-01 SPY NA
2 2008-02-04 SPY NA
3 2008-02-05 SPY NA
4 2008-02-06 SPY NA
5 2008-02-07 SPY NA
6 2008-02-08 SPY -0.0478
tail(df)
ref.date ticker weeklyReturn
<date> <chr> <dbl>
1 2020-02-12 MDYV 0.00293
2 2020-02-13 MDYV 0.00917
3 2020-02-14 MDYV 0.0179
4 2020-02-18 MDYV 0.0107
5 2020-02-19 MDYV 0.00422
6 2020-02-20 MDYV 0.00347
You can use dplyr and tidyr packages. To get rid of that error, you would have to firstly sum the values for each group.
# Add up the values that share a date/ticker pair, then give each ticker its
# own column. After the sum every date/ticker pair occurs once, which is what
# the reshaping step requires.
pivot_wider(
  summarise(
    group_by(data, date, ticker),
    value = sum(value)
  ),
  names_from = ticker,
  values_from = value
)
# date MSFT SPY
# <fct> <dbl> <dbl>
# 1 2019 3 1
# 2 2020 4 2
As said in the comments, you have multiple values for same combination of date-ticker. You need to define what to do with it.
Here with a reprex:
library(tidyr)
library(dplyr)
# your data is more like:
# "2019"/"SPY" appears twice, which is what triggers the duplicate-key error.
# The dates are all text: mixing a number into c() with strings would coerce
# it to "2019" anyway.
data <- data.frame(
  date = c("2019", "2019", "2020", "2019", "2020"),
  ticker = rep(c("SPY", "MSFT"), times = c(3, 2)),
  value = c(8, 1, 2, 3, 4)
)
# With two values for same date-ticker combination
data
#> date ticker value
#> 1 2019 SPY 8
#> 2 2019 SPY 1
#> 3 2020 SPY 2
#> 4 2019 MSFT 3
#> 5 2020 MSFT 4
# Results in error
data %>%
spread(ticker, value)
#> Error: Each row of output must be identified by a unique combination of keys.
#> Keys are shared for 2 rows:
#> * 1, 2
# New pivot_wider() Creates list-columns for duplicates
data %>%
pivot_wider(names_from = ticker, values_from = value,)
#> Warning: Values in `value` are not uniquely identified; output will contain list-cols.
#> * Use `values_fn = list(value = list)` to suppress this warning.
#> * Use `values_fn = list(value = length)` to identify where the duplicates arise
#> * Use `values_fn = list(value = summary_fun)` to summarise duplicates
#> # A tibble: 2 x 3
#> date SPY MSFT
#> <fct> <list> <list>
#> 1 2019 <dbl [2]> <dbl [1]>
#> 2 2020 <dbl [1]> <dbl [1]>
# Otherwise, decide yourself how to summarise duplicates with mean() for instance
# Average the values that share a date/ticker pair (ignoring NAs), then
# spread the tickers into columns.
spread(
  summarise(
    group_by(data, date, ticker),
    value = mean(value, na.rm = TRUE)
  ),
  ticker,
  value
)
#> # A tibble: 2 x 3
#> # Groups: date [2]
#> date MSFT SPY
#> <fct> <dbl> <dbl>
#> 1 2019 3 4.5
#> 2 2020 4 2
Created on 2020-02-22 by the reprex package (v0.3.0)

Resources