Finding Month on Month Turnover - r

I seemed to be stuck at a very basic problem, I know its easy but I am not able to figure out.
So My data has HireDate and TermDate. TermDate is the last day of any employee.
I want to do as follow:
Leavers = Current Month Count taken from TermDate
Turnover for particular Month = Current Month Leavers / AVG (Row Count for Last Month and Current Month)
Reproduce Data
structure(list(HireDate = structure(c(17702, 13242, 16895, 17167,
12335, 13879, 12303, 13745, 14789, 16785, 15390, 17167, 12886,
13472, 15569, 13796, 16811, 11484, 13062, 17592, 16113, 13437,
15614, 17167, 17167, 16251, 17623, 13312, 14165, 17167, 17167,
10695, 15764, 13749, 16801, 17167, 13594, 13874, 17167, 17167,
13157, 17167, 12501, 13243, 12192, 12287, 12965, 13328, 17167,
13343, 17167, 17167, 11839, 17167, 13262, 13326, 14124, 16161,
17167, 17226, 12786, 13823, 13822, 13255, 17704, 17653, 12258,
12769, 13727, 10712, 17400, 13952, 14048, 14333, 17233, 17690,
13108, 13383, 13517, 13829, 17213, 13696, 16741, 17167, 17241,
12198, 14018, 12902, 16801, 17167, 17591, 12843, 13627, 14553,
15593, 16097, 16801, 13075, 13529, 17167), class = "Date"), TermDate = structure(c(NA,
13439, 17712, NA, 12880, 15408, 12877, 16493, 17135, 16944, 17135,
NA, 14054, 15670, 17531, 14327, NA, 13889, NA, NA, 16741, 17135,
17620, 17620, 17354, 17316, NA, 13312, 17166, NA, NA, 15705,
NA, 15112, NA, NA, 15705, 13970, 17655, NA, 13612, NA, 15418,
15917, 15705, NA, 14274, 13449, NA, 13559, 17417, NA, 14400,
NA, NA, 14334, 14813, 16343, 17703, NA, 12824, 15711, 15411,
14484, NA, NA, NA, 15309, 16493, 17197, NA, 14911, 16957, 15882,
NA, NA, 14435, 13768, 13517, 14907, NA, 17284, NA, NA, NA, 12772,
17166, NA, 16881, 17439, NA, 14944, NA, 15028, 16581, 16778,
NA, 13788, 14064, 17620), class = "Date")), row.names = 14296:14395, class = "data.frame")

A bit lengthy but it would work:
library(data.table)
df_leavers <- setDT(df)[, `:=` (TermDate = as.Date(as.character(TermDate)),
HireDate = as.Date(as.character(HireDate)))]
df_presences <- copy(df_leavers)
df_leavers <- df_leavers[, TermDate := format(TermDate, "%Y-%m")][!is.na(TermDate), (Leavers = .N), , by = TermDate]
df_presences <- df_presences[, maxTerm := max(TermDate, na.rm = T)][
is.na(TermDate), TermDate := maxTerm][
, .(YearMonth = format(seq(HireDate, TermDate, by = "month"), "%Y-%m")), by = 1:nrow(df)][
, (Presences = .N), by = YearMonth]
df_final <- df_leavers[df_presences, on = .(TermDate = YearMonth)]
setnames(df_final, c("YearMonth", "Leavers", "Presences"))
df_final <- df_final[is.na(Leavers), Leavers := 0][order(YearMonth),][, previousMonth := shift(Presences)][
is.na(previousMonth), previousMonth := 0][, AvgPresences := (Presences + previousMonth) / 2][
, Turnover := round(Leavers / AvgPresences, 2)][, "previousMonth" := NULL]
Output (beginning and end of dataset):
YearMonth Leavers Presences AvgPresences Turnover
1: 1999-04 0 1 0.5 0.00
2: 1999-05 0 2 1.5 0.00
3: 1999-06 0 2 2.0 0.00
4: 1999-07 0 2 2.0 0.00
5: 1999-08 0 2 2.0 0.00
---
227: 2018-02 0 32 32.5 0.00
228: 2018-03 3 36 34.0 0.09
229: 2018-04 0 33 34.5 0.00
230: 2018-05 1 34 33.5 0.03
231: 2018-06 2 36 35.0 0.06

library(dplyr)
df %>%
mutate(leavemonth=strftime(TermDate,format="%m-%Y")) %>%
group_by(leavemonth) %>%
summarize(n=n())
# A tibble: 51 x 2
leavemonth n
<chr> <int>
1 01-2007 1
2 01-2008 1
3 01-2009 1
4 01-2013 1
5 01-2017 1
6 02-2005 1
7 02-2007 1
8 02-2011 1
9 02-2015 2
10 03-2009 2
# ... with 41 more rows
I create a column with a unique identifier for the month-year of the termination date of each row, then count them using summarize.
If you'd like to just add n to the existing table, we can replace the summarize with add_count:
df %>%
mutate(leavemonth=strftime(TermDate,format="%m-%Y")) %>%
add_count(leavemonth)
# A tibble: 100 x 4
HireDate TermDate leavemonth n
<date> <date> <chr> <int>
1 2018-06-20 NA NA 34
2 2006-04-04 2006-10-18 10-2006 2
3 2016-04-04 2018-06-30 06-2018 2
4 2017-01-01 NA NA 34
5 2003-10-10 2005-04-07 04-2005 2
6 2008-01-01 2012-03-09 03-2012 3
7 2003-09-08 2005-04-04 04-2005 2
8 2007-08-20 2015-02-27 02-2015 2
9 2010-06-29 2016-11-30 11-2016 3
10 2015-12-16 2016-05-23 05-2016 1
# ... with 90 more rows

Related

Return NA if date is lower than for specific rows in R

My data looks like this:
dput(head(VI_v2_KRR05,28))
structure(list(ID = c("AUR", "AUR", "AUR", "AUR", "AUR", "AUR",
"AUR", "AUR", "AUR", "AUR", "AUR", "LAM", "LAM", "LAM", "LAM",
"LAM", "LAM", "LAM", "LAM", "LAM", "LAM", "P0", "P0", "P0", "P0",
"P0", "P01", "P01"), EVI_SOS = structure(c(16440, 16805, 17124,
17421, 17599, 17851, 18216, 18403, NA, NA, NA, 16272, 16406,
16637, 16771, 17148, 17516, 17725, 18022, 18210, NA, 16692, 16845,
17058, 17212, NA, 16717, 17077), class = "Date"), NDVI_SOS = structure(c(16436,
16801, 17110, 17420, 17607, 17841, 18196, 18402, NA, NA, NA,
16270, 16380, 16635, 16745, 17139, 17274, 17522, 17731, 18027,
18198, 16480, 16683, 17055, NA, NA, 16712, 17076), class = "Date"),
NIRv_SOS = structure(c(16424, 16557, 16789, 16922, 17221,
17379, 17607, 17821, 17931, 18214, 18400, 16274, 16404, 16639,
16769, 17145, 17519, 17727, 18028, 18208, NA, 16695, 16848,
17061, 17218, NA, 16720, 17084), class = "Date"), kNDVI_SOS = structure(c(16542,
16799, 17212, 17431, 17607, 17854, 18003, 18219, 18401, NA,
NA, 16282, 16647, 17139, 17516, 17733, 18026, 18205, NA,
NA, NA, 16474, 16706, 16847, 17073, 17209, 16712, 17075), class = "Date"),
EVI_EOS = structure(c(16766, 17084, 17356, 17577, 17812,
18098, 18385, 18585, NA, NA, NA, 16375, 16618, 16740, 17099,
17452, 17707, 17841, 18183, 18549, NA, 16825, 17012, 17193,
17396, NA, 17022, 17398), class = "Date"), NDVI_EOS = structure(c(16768,
17075, 17375, 17590, 17812, 18095, 18385, 18568, NA, NA,
NA, 16359, 16616, 16724, 16998, 17248, 17490, 17714, 17961,
18177, 18460, 16648, 17020, 17393, NA, NA, 17028, 17392), class = "Date"),
NIRv_EOS = structure(c(16533, 16768, 16899, 17191, 17358,
17590, 17798, 17920, 18104, 18379, 18547, 16373, 16619, 16738,
17086, 17441, 17709, 17839, 18182, 18471, NA, 16828, 17008,
17198, 17387, NA, 17028, 17398), class = "Date"), kNDVI_EOS = structure(c(16764,
17184, 17356, 17592, 17805, 17993, 18088, 18381, 18562, NA,
NA, 16620, 17007, 17439, 17715, 17845, 18180, 18457, NA,
NA, NA, 16625, 16833, 16991, 17192, 17366, 17026, 17391), class = "Date")), row.names = c(NA,
-28L), class = c("tbl_df", "tbl", "data.frame"))
I want to return NA the dates lower than 2015-09-04 for the ID AUR, 2015-09-04 for the ID LAM, 2016-01-02 for the ID P0 and 2016-01-09 for the ID P01.
Any help will be much appreciatted.
We can use case_when if there are only few IDs i.e. create the condition for each ID separately to return only the values that are greater than the threshold date or else, the default case in case_when returns NA.
library(dplyr)
library(collapse)
out <- df1 %>%
mutate(across(where(is_date),
~ case_when(.x > as.Date("2015-09-04") & ID == 'AUR' ~ .x,
ID == "LAM" & .x > as.Date("2015-09-04 ") ~ .x,
ID == "P0" & .x > as.Date("2016-01-02 ") ~ .x,
ID == "P01" & .x > as.Date("2016-01-09") ~ .x)))
-checking
> df1 %>% filter(ID == "LAM")
# A tibble: 10 × 9
ID EVI_SOS NDVI_SOS NIRv_SOS kNDVI_SOS EVI_EOS NDVI_EOS NIRv_EOS kNDVI_EOS
<chr> <date> <date> <date> <date> <date> <date> <date> <date>
1 LAM 2014-07-21 2014-07-19 2014-07-23 2014-07-31 2014-11-01 2014-10-16 2014-10-30 2015-07-04
2 LAM 2014-12-02 2014-11-06 2014-11-30 2015-07-31 2015-07-02 2015-06-30 2015-07-03 2016-07-25
3 LAM 2015-07-21 2015-07-19 2015-07-23 2016-12-04 2015-11-01 2015-10-16 2015-10-30 2017-09-30
4 LAM 2015-12-02 2015-11-06 2015-11-30 2017-12-16 2016-10-25 2016-07-16 2016-10-12 2018-07-03
5 LAM 2016-12-13 2016-12-04 2016-12-10 2018-07-21 2017-10-13 2017-03-23 2017-10-02 2018-11-10
6 LAM 2017-12-16 2017-04-18 2017-12-19 2019-05-10 2018-06-25 2017-11-20 2018-06-27 2019-10-11
7 LAM 2018-07-13 2017-12-22 2018-07-15 2019-11-05 2018-11-06 2018-07-02 2018-11-04 2020-07-14
8 LAM 2019-05-06 2018-07-19 2019-05-12 NA 2019-10-14 2019-03-06 2019-10-13 NA
9 LAM 2019-11-10 2019-05-11 2019-11-08 NA 2020-10-14 2019-10-08 2020-07-28 NA
10 LAM NA 2019-10-29 NA NA NA 2020-07-17 NA NA
> out %>% filter(ID == "LAM")
# A tibble: 10 × 9
ID EVI_SOS NDVI_SOS NIRv_SOS kNDVI_SOS EVI_EOS NDVI_EOS NIRv_EOS kNDVI_EOS
<chr> <date> <date> <date> <date> <date> <date> <date> <date>
1 LAM NA NA NA NA NA NA NA NA
2 LAM NA NA NA NA NA NA NA 2016-07-25
3 LAM NA NA NA 2016-12-04 2015-11-01 2015-10-16 2015-10-30 2017-09-30
4 LAM 2015-12-02 2015-11-06 2015-11-30 2017-12-16 2016-10-25 2016-07-16 2016-10-12 2018-07-03
5 LAM 2016-12-13 2016-12-04 2016-12-10 2018-07-21 2017-10-13 2017-03-23 2017-10-02 2018-11-10
6 LAM 2017-12-16 2017-04-18 2017-12-19 2019-05-10 2018-06-25 2017-11-20 2018-06-27 2019-10-11
7 LAM 2018-07-13 2017-12-22 2018-07-15 2019-11-05 2018-11-06 2018-07-02 2018-11-04 2020-07-14
8 LAM 2019-05-06 2018-07-19 2019-05-12 NA 2019-10-14 2019-03-06 2019-10-13 NA
9 LAM 2019-11-10 2019-05-11 2019-11-08 NA 2020-10-14 2019-10-08 2020-07-28 NA
10 LAM NA 2019-10-29 NA NA NA 2020-07-17 NA NA
A general approach would be to a key/value data with 'key' being the unique 'ID' and then do a join with the original data and loop across the columns to convert to NA by comparing with the 'dates' column
library(data.table)
df2 <- data.table(ID = c("AUR", "LAM", "PO", "P01"),
dates = as.Date(c("2015-09-04", "2015-09-04", "2016-01-02", "2016-01-09")))
out1 <- copy(df1)
nm1 <- grep("_SOS$", names(out1), value = TRUE)
setDT(out1)[df2, (nm1) := lapply(.SD, \(x)
fcase(x > dates, x)), on = .(ID), .SDcols = nm1]
Or the same method in dplyr
df1 %>%
left_join(df2) %>%
mutate(across(c(where(is_date), -dates),
~ case_when(.x > dates ~ .x)), dates = NULL)
# A tibble: 28 × 9
ID EVI_SOS NDVI_SOS NIRv_SOS kNDVI_SOS EVI_EOS NDVI_EOS NIRv_EOS kNDVI_EOS
<chr> <date> <date> <date> <date> <date> <date> <date> <date>
1 AUR NA NA NA NA 2015-11-27 2015-11-29 NA 2015-11-25
2 AUR 2016-01-05 2016-01-01 NA 2015-12-30 2016-10-10 2016-10-01 2015-11-29 2017-01-18
3 AUR 2016-11-19 2016-11-05 2015-12-20 2017-02-15 2017-07-09 2017-07-28 2016-04-08 2017-07-09
4 AUR 2017-09-12 2017-09-11 2016-05-01 2017-09-22 2018-02-15 2018-02-28 2017-01-25 2018-03-02
5 AUR 2018-03-09 2018-03-17 2017-02-24 2018-03-17 2018-10-08 2018-10-08 2017-07-11 2018-10-01
6 AUR 2018-11-16 2018-11-06 2017-08-01 2018-11-19 2019-07-21 2019-07-18 2018-02-28 2019-04-07
7 AUR 2019-11-16 2019-10-27 2018-03-17 2019-04-17 2020-05-03 2020-05-03 2018-09-24 2019-07-11
8 AUR 2020-05-21 2020-05-20 2018-10-17 2019-11-19 2020-11-19 2020-11-02 2019-01-24 2020-04-29
9 AUR NA NA 2019-02-04 2020-05-19 NA NA 2019-07-27 2020-10-27
10 AUR NA NA 2019-11-14 NA NA NA 2020-04-27 NA

How to bind more than 2 dataframes with different column number in R

I want to bind 4 dataframes. One of them, the "B8A_EVI_EOS_KRR05" has 11 rows while the others have 19. I've used cbind but I get a repetition of "B8A_EVI_EOS_KRR" first rows after the 11th row.
I want to be able to bind the 4 dataframes by ID (choose the ID of B8A_NDVI_EOS_KRR, B8A_NIRv_EOS_KRR or B8A_kNDVI_EOS_KRR) and fill the empty cells with NA.
My 4 dataframes look like this:
dput(B8A_EVI_EOS_KRR05)
structure(list(ID = c("AUR", "AUR", "AUR", "AUR", "AUR", "AUR",
"P1", "P14", "P15", "P17", "P2"), D_EOS = structure(c(17067,
17353, 17712, 18082, 18360, 18516, 17714, 17007, 16987, 16988,
17715), class = "Date"), EVI_EOS = structure(c(17042, 17344,
17813, 18107, 18385, 18548, 17705, 17144, 17027, 17003, 17827
), class = "Date")), row.names = c(NA, -11L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x0000000002745260>)
dput(B8A_NDVI_EOS_KRR05)
structure(list(ID = c("AUR", "AUR", "AUR", "AUR", "AUR", "AUR",
"LAM", "LAM", "LAM", "LAM", "LAM", "LAM", "P0", "P1", "P14",
"P15", "P17", "P2", "P3"), D_EOS = structure(c(17067, 17353,
17712, 18082, 18360, 18516, 17002, 17123, 17414, 17722, 18148,
18446, 17359, 17714, 17007, 16987, 16988, 17715, 17716), class = "Date"),
NDVI_EOS = structure(c(17071, 17379, 17814, 18095, 18384,
18577, 16996, 17248, 17501, 17715, 18176, 18461, 17393, 17705,
17076, 16994, 17050, 17829, 17755), class = "Date")), row.names = c(NA,
-19L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x0000000002745260>)
dput(B8A_NIRv_EOS_KRR05)
structure(list(ID = c("AUR", "AUR", "AUR", "AUR", "AUR", "AUR",
"LAM", "LAM", "LAM", "LAM", "LAM", "LAM", "P0", "P1", "P14",
"P15", "P17", "P2", "P3"), D_EOS = structure(c(17067, 17353,
17712, 18082, 18360, 18516, 17002, 17123, 17414, 17722, 18148,
18446, 17359, 17714, 17007, 16987, 16988, 17715, 17716), class = "Date"),
NIRv_EOS = structure(c(17077, 17385, 17810, 18096, 18385,
18574, 17085, 17085, 17494, 17709, 18179, 18534, 17387, 17705,
17062, 16997, 17027, 17822, 17749), class = "Date")), row.names = c(NA,
-19L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x0000000002745260>)
dput(B8A_kNDVI_EOS_KRR05)
structure(list(ID = c("AUR", "AUR", "AUR", "AUR", "AUR", "AUR",
"LAM", "LAM", "LAM", "LAM", "LAM", "LAM", "P0", "P1", "P14",
"P15", "P17", "P2", "P3"), D_EOS = structure(c(17067, 17353,
17712, 18082, 18360, 18516, 17002, 17123, 17414, 17722, 18148,
18446, 17359, 17714, 17007, 16987, 16988, 17715, 17716), class = "Date"),
kNDVI_EOS = structure(c(17074, 17380, 17812, 18093, 18385,
18569, 16997, 17247, 17487, 17715, 18177, 18454, 17369, 17775,
17078, 16991, 17028, 17770, 17742), class = "Date")), row.names = c(NA,
-19L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x0000000002745260>)`
On the left is an example of how the dataframes looks like when I use cbind and on the right it is shown the desired output.
We may nest the datasets in a list, do a join and then unnest
library(purrr)
library(dplyr)
library(tidyr)
mget(ls(pattern = "^B8A_.*_EOS_KRR05$")) %>%
imap(~ .x %>%
nest(data = -ID) %>%
rename(!! .y := data)) %>%
reduce(full_join, by = "ID") %>%
unnest(where(is.list), names_sep = "_")
-output
# A tibble: 19 × 9
ID B8A_EVI_EOS_KRR05_D… B8A_EVI_EOS_KRR… B8A_kNDVI_EOS_K… B8A_kNDVI_EOS_K… B8A_NDVI_EOS_KR… B8A_NDVI_EOS_KR… B8A_NIRv_EOS_KR… B8A_NIRv_EOS_KR…
<chr> <date> <date> <date> <date> <date> <date> <date> <date>
1 AUR 2016-09-23 2016-08-29 2016-09-23 2016-09-30 2016-09-23 2016-09-27 2016-09-23 2016-10-03
2 AUR 2017-07-06 2017-06-27 2017-07-06 2017-08-02 2017-07-06 2017-08-01 2017-07-06 2017-08-07
3 AUR 2018-06-30 2018-10-09 2018-06-30 2018-10-08 2018-06-30 2018-10-10 2018-06-30 2018-10-06
4 AUR 2019-07-05 2019-07-30 2019-07-05 2019-07-16 2019-07-05 2019-07-18 2019-07-05 2019-07-19
5 AUR 2020-04-08 2020-05-03 2020-04-08 2020-05-03 2020-04-08 2020-05-02 2020-04-08 2020-05-03
6 AUR 2020-09-11 2020-10-13 2020-09-11 2020-11-03 2020-09-11 2020-11-11 2020-09-11 2020-11-08
7 P1 2018-07-02 2018-06-23 2018-07-02 2018-09-01 2018-07-02 2018-06-23 2018-07-02 2018-06-23
8 P14 2016-07-25 2016-12-09 2016-07-25 2016-10-04 2016-07-25 2016-10-02 2016-07-25 2016-09-18
9 P15 2016-07-05 2016-08-14 2016-07-05 2016-07-09 2016-07-05 2016-07-12 2016-07-05 2016-07-15
10 P17 2016-07-06 2016-07-21 2016-07-06 2016-08-15 2016-07-06 2016-09-06 2016-07-06 2016-08-14
11 P2 2018-07-03 2018-10-23 2018-07-03 2018-08-27 2018-07-03 2018-10-25 2018-07-03 2018-10-18
12 LAM NA NA 2016-07-20 2016-07-15 2016-07-20 2016-07-14 2016-07-20 2016-10-11
13 LAM NA NA 2016-11-18 2017-03-22 2016-11-18 2017-03-23 2016-11-18 2016-10-11
14 LAM NA NA 2017-09-05 2017-11-17 2017-09-05 2017-12-01 2017-09-05 2017-11-24
15 LAM NA NA 2018-07-10 2018-07-03 2018-07-10 2018-07-03 2018-07-10 2018-06-27
16 LAM NA NA 2019-09-09 2019-10-08 2019-09-09 2019-10-07 2019-09-09 2019-10-10
17 LAM NA NA 2020-07-03 2020-07-11 2020-07-03 2020-07-18 2020-07-03 2020-09-29
18 P0 NA NA 2017-07-12 2017-07-22 2017-07-12 2017-08-15 2017-07-12 2017-08-09
19 P3 NA NA 2018-07-04 2018-07-30 2018-07-04 2018-08-12 2018-07-04 2018-08-06

How to recode a new date variable and select the lowest date out of four date columns in R

Sample data
stack_dat <- structure(list(bio_drug_stop_date = structure(c(15376, NA, 15602, NA, 15550, NA, 15350, 15363, 15418, 16157), class = "Date"),
follow_up_2_years = structure(c(16047, 14318, 16038, 14352, 16044, 16582, 16054, 16048, 16054, 16054), class = "Date"),
date_of_last_visit = structure(c(17836, 16405, 17591, 16801, 17866, 15826, 17866, 17257, 18109, 16587), class = "Date"),
end_of_follow_up_date = structure(c(NA, 17928, NA, 17928, 17900, 16980, 16890, 17100, NA, NA), class = "Date"), data_cut_date = structure(c(18201,
18201, 18201, 18201, 18201, 18201, 18201, 18201, 18201, 18201), class = "Date")), row.names = c(NA, 10L), class = "data.frame")
Structure
'data.frame': 10 obs. of 5 variables:
$ bio_drug_stop_date : Date, format: "2012-02-06" NA "2012-09-19" NA ...
$ follow_up_2_years : Date, format: "2013-12-08" "2009-03-15" "2013-11-29" "2009-04-18" ...
$ date_of_last_visit : Date, format: "2018-11-01" "2014-12-01" "2018-03-01" "2016-01-01" ...
$ end_of_follow_up_date: Date, format: NA "2019-02-01" NA "2019-02-01" ...
$ data_cut_date : Date, format: "2019-11-01" "2019-11-01" "2019-11-01" "2019-11-01" ...
Aim
The goal is to recode a new variable named treatment_end calculated by taking the date of bio_drug_stop_date; if it is not present, then the lowest date in any of the four other columns: follow_up_2_years, date_of_last_visit, end_of_follow_up_date, data_cut_date
We may use pmin with coalesce - coalesce the 'bio_drug_stop_date' with the min (using pmin) dates from other column for each row
library(dplyr)
library(purrr)
stack_dat %>%
mutate(treatment_end = coalesce(bio_drug_stop_date,
invoke(pmin, across(-1), na.rm = TRUE)))
-output
bio_drug_stop_date follow_up_2_years date_of_last_visit end_of_follow_up_date data_cut_date treatment_end
1 2012-02-06 2013-12-08 2018-11-01 <NA> 2019-11-01 2012-02-06
2 <NA> 2009-03-15 2014-12-01 2019-02-01 2019-11-01 2009-03-15
3 2012-09-19 2013-11-29 2018-03-01 <NA> 2019-11-01 2012-09-19
4 <NA> 2009-04-18 2016-01-01 2019-02-01 2019-11-01 2009-04-18
5 2012-07-29 2013-12-05 2018-12-01 2019-01-04 2019-11-01 2012-07-29
6 <NA> 2015-05-27 2013-05-01 2016-06-28 2019-11-01 2013-05-01
7 2012-01-11 2013-12-15 2018-12-01 2016-03-30 2019-11-01 2012-01-11
8 2012-01-24 2013-12-09 2017-04-01 2016-10-26 2019-11-01 2012-01-24
9 2012-03-19 2013-12-15 2019-08-01 <NA> 2019-11-01 2012-03-19
10 2014-03-28 2013-12-15 2015-06-01 <NA> 2019-11-01 2014-03-28

Convert point forecasts in fable from exponential format to normal numbers [duplicate]

This question already has answers here:
How can I disable scientific notation?
(4 answers)
Closed 2 years ago.
I have created the following tsibble (named afsi)
structure(list(Date = structure(c(12509, 12600, 12692, 12784,
12874, 12965, 13057, 13149, 13239, 13330, 13422, 13514, 13604,
13695, 13787, 13879, 13970, 14061, 14153, 14245, 14335, 14426,
14518, 14610, 14700, 14791, 14883, 14975, 15065, 15156, 15248,
15340, 15431, 15522, 15614, 15706, 15796, 15887, 15979, 16071,
16161, 16252, 16344, 16436, 16526, 16617, 16709, 16801, 16892,
16983, 17075, 17167, 17257, 17348, 17440, 17532, 17622, 17713,
17805, 17897), fiscal_start = 1, class = c("yearquarter", "vctrs_vctr"
)), Index = c(4.6049904235401, 4.60711076016453, 4.60980084146652,
4.61025389170935, 4.60544515681515, 4.60889021700954, 4.60983993107244,
4.61091608826696, 4.61138799159174, 4.61294431148318, 4.61167545843765,
4.61208284263432, 4.61421991328081, 4.61530485425155, 4.61471465043043,
4.6155992084451, 4.61195799200607, 4.61178486640435, 4.61037927954796,
4.60744590947049, 4.59979957741728, 4.59948551500254, 4.60078678080182,
4.60556092645471, 4.60934962087565, 4.60981147563749, 4.61060477704678,
4.61158365084251, 4.60963435263623, 4.61018215733317, 4.61209710959768,
4.61231368335184, 4.61071363571141, 4.61019496497916, 4.60948652606191,
4.61068813487859, 4.6084092003352, 4.60972706132393, 4.60866915174087,
4.61192565195909, 4.60878767339377, 4.61341471281265, 4.61015272152397,
4.6093479714315, 4.60750965935653, 4.60768790690338, 4.60676463096309,
4.60746490411374, 4.60885670935448, 4.60686846708382, 4.60688947889575,
4.60867708110485, 4.60448791268212, 4.60387348166032, 4.60569806689426,
4.6069320880709, 4.6087143894128, 4.61059688801283, 4.61065399116698,
4.61071421014339)), row.names = c(NA, -60L), key = structure(list(
.rows = structure(list(1:60), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame")), index = structure("Date", ordered = TRUE), index2 = "Date", interval = structure(list(
year = 0, quarter = 1, month = 0, week = 0, day = 0, hour = 0,
minute = 0, second = 0, millisecond = 0, microsecond = 0,
nanosecond = 0, unit = 0), .regular = TRUE, class = c("interval",
"vctrs_rcrd", "vctrs_vctr")), class = c("tbl_ts", "tbl_df", "tbl",
"data.frame"))
Using this data, I've created an ARIMA model and simulated forecasts from it for a year ahead using the fable package. I noticed that my table produces the forecasts in exponential notation which aren't too great to read, is there a way in which I could convert the point forecasts (under the Index column in my table) to appear as a normal number (and perhaps round to 5 digits)?
library(fable)
fit <- afsi %>%
model(arima = ARIMA(Index))
table <- fit1 %>% forecast(h='1 year')
the results from table look as follows
# A fable: 4 x 4 [1Q]
# Key: .model [1]
.model Date Index .mean
<chr> <qtr> <dist> <dbl>
1 arima 2019 Q2 N(4.6, 4.6e-06) 4.61
2 arima 2019 Q3 N(4.6, 7.4e-06) 4.61
3 arima 2019 Q4 N(4.6, 9e-06) 4.61
4 arima 2020 Q1 N(4.6, 9.9e-06) 4.61
We can set the options
options(scipen = 999)
-before
table
# A fable: 4 x 4 [1Q]
# Key: .model [1]
# .model Date Index .mean
# <chr> <qtr> <dist> <dbl>
#1 arima 2019 Q2 N(4.6, 4.6e-06) 4.61
#2 arima 2019 Q3 N(4.6, 7.4e-06) 4.61
#3 arima 2019 Q4 N(4.6, 9e-06) 4.61
#4 arima 2020 Q1 N(4.6, 9.9e-06) 4.61
options(scipen = 999)
-after
table <- fit %>%
forecast(h='1 year')
table
# A fable: 4 x 4 [1Q]
# Key: .model [1]
# .model Date Index .mean
# <chr> <qtr> <dist> <dbl>
#1 arima 2019 Q2 N(4.6, 0.0000046) 4.61
#2 arima 2019 Q3 N(4.6, 0.0000074) 4.61
#3 arima 2019 Q4 N(4.6, 0.000009) 4.61
#4 arima 2020 Q1 N(4.6, 0.0000099) 4.61

How can I fill in missing rows for monthly time series data?

Here's the dput of my data:
structure(list(date = structure(c(8596, 8631, 8659, 8687, 8733,
8743, 8796, 8806, 8853, 8880, 8908, 8932, 8971, 8999, 9027, 9069,
9097, 9111, 9160, 9188, 9212, 9230, 9279, 9309, 9328, 9363, 9391,
9434, 9449, 9482, 9519, 9541, 9580, 9610, 9643, 9672, 9708, 9736,
9764, 9799, 9827, 9850, 9890, 9920, 9947, 9975, 10007, 10038,
10072, 10100, 10122, 10163, 10191, 10213, 10254, 10282, 10310,
10345, 10354, 10385, 10418, 10469, 10497, 10528, 10556, 10570,
10612, 10641, 10668, 10710, 10742, 10759, 10802, 10830, 10858,
10893, 10914, 10947, 10984, 11010, 11038, 11066, 11096, 11135,
11164, 11193, 11229, 11257, 11285, 11313, 11346, 11374, 11411,
11435, 11467, 11502, 11514, 11565, 11592, 11621, 11649, 11677,
11718, 11746, 11776, 11797, 11838, 11867, 11894, 11923, 11951,
11979, 12021, 12035, 12077, 12105, 12133, 12160, 12189, 12231,
12259, 12273, 12315, 12356, 12385, 12399, 12441, 12472, 12497,
12538, 12553, 12591, 12630, 12658, 12686, 12714, 12742, 12770,
12804, 12832, 12860, 12903, 12917, 12938, 12986, 13015, 13056,
13085, 13116, 13139, 13169, 13204, 13232, 13260, 13288, 13301,
13357, 13385, 13414, 13442, 13470, 13498, 13533, 13561, 13603,
13631, 13658, 13694, 13722, 13750, 13778, 13805, 13846, 13862,
13896, 13925, 13967, 13995, 14009, 14050, 14078, 14121, 14149,
14177, 14205, 14233, 14268, 14296, 14323, 14352, 14380, 14449,
14474, 14506, 14548, 14575, 14590, 14618, 14661, 14688, 14729,
14758, 14761, 14821, 14849, 14877, 14905, 14933, 14961, 14995,
15024, 15038, 15093, 15121, 15135, 15185, 15212, 15241, 15269,
15297, 15325, 15360, 15387, 15430, 15458, 15485, 15513, 15542,
15583, 15611, 15639, 15667, 15696, 15731, 15745, 15786, 15815,
15842, 15917, 15945, 15966, 16001, 16030, 16076, 16129, 16143,
16184, 16276, 16303, 16343, 16374, 16400, 16417, 16455, 16482,
16525, 16553, 16585, 16612, 16646, 16678, 16706, 16729, 16752,
16777, 16819, 16860, 16891, 16916, 16925, 16976, 17002, 17042,
17072, 17100, 17120, 17141, 17178, 17224, 17245, 17261, 17304,
17330, 17373, 17401, 17459, 17488, 17512, 17548, 17581, 17598,
17631), tzone = "UTC", tclass = "Date", class = "Date"), AverageTemp = c(16.5027083333333,
17.325, 17.1888888888889, 15.8277777777778, 16.6583333333333,
17.3333333333333, 16.64375, 17.1133333333333, 17.895119047619,
18.5694444444444, 18.8222222222222, 17.4305555555556, 17.6555555555556,
17.025, 17.3222222222222, 17.2770833333333, 17.4805555555556,
16.9708333333333, 17.9666666666667, 17.1222222222222, 18.0166666666667,
17.25, 18.1875, 17.6577777777778, 16.6541666666667, 17.1083333333333,
16.4666666666667, 17.5972756410256, 17.2, 17.4444444444444, 16.95,
17.7, 17.9222222222222, 18.4875, 17.8229166666667, 16.9166666666667,
16.7083333333333, 17.1666666666667, 17.3111111111111, 18.2333333333333,
16.6277777777778, 17.5875, 17.3833333333333, 17.4638888888889,
17.725, 18.1388888888889, 17.7001111111111, 17.7222222222222,
17.2041666666667, 17.8255952380952, 17.1833333333333, 17.8103070175439,
17.8194444444444, 17.952, 18.158412414966, 18.4910714285714,
18.3488562091503, 19.1341830065359, 18.45, 18.9107142857143,
17.2275, 19.0828761904762, 18.1599701591512, 18.965739220457,
18.6720606060606, 18.8786057692308, 18.602656449553, 18.6327347883598,
19.2925198412698, 20.1952463624339, 18.8900384227765, 18.0934444444444,
18.0554871794872, 17.8405270655271, 17.5540598290598, 17.454122110648,
17.5764155982906, 16.9989942528736, 16.4252032967033, 16.5388571428571,
17.0108695652174, 17.7725308641975, 18.4252564102564, 17.2278899240856,
17.3102091315453, 17.3627204585538, 17.280641025641, 17.3746616809117,
17.3014601139601, 17.2238271604938, 16.379012345679, 16.6044444444444,
17.624415954416, 18.4023148148148, 18.0341435185185, 17.3016666666667,
17.8204861111111, 17.827264957265, 17.2772467320261, 17.8786954365079,
17.84375, 17.1732638888889, 16.9219907407407, 17.3826388888889,
17.7413333333333, 18.4948412698413, 18.2363425925926, 17.3282057823129,
17.5083333333333, 17.414898989899, 16.9453125, 17.4988095238095,
17.6704012345679, 18.1333333333333, 18.11875, 17.4805555555556,
17.4271367521368, 17.9006944444444, 17.9818181818182, 17.3125,
16.73625, 17.2666666666667, 17.4279340277778, 17.8584444444444,
17.2966666666667, 17.1, 18.3420833333333, 18.5814285714286, 17.6430555555556,
18.2307122507123, 18.0830687830688, 16.7563492063492, 16.9055555555556,
17.0090277777778, 17.3863095238095, 16.9139880952381, 16.7479166666667,
17.0888888888889, 17.7648148148148, 18.2277777777778, 19.3694444444444,
17.7064021164021, 18.7371527777778, 17.94375, 17.9416666666667,
17.8736111111111, 18.5354166666667, 18.1919444444444, 18.2555555555556,
17.7704365079365, 17.3509259259259, 17.3931216931217, 18.3355923202614,
17.9180555555556, 18.2104166666667, 18.0171121593291, 17.6840277777778,
17.5509259259259, 16.9631313131313, 17.4478070175439, 17.6916666666667,
17.6143376068376, 18.7415656565657, 19.0048611111111, 18.285462962963,
18.3816964285714, 18.2041310541311, 17.2343518518519, 17.2149382716049,
17.3684027777778, 17.5229861111111, 16.8517857142857, 19.0929141414141,
19.300404040404, 18.735, 17.9280277777778, 18.4470274170274,
19.0686597406425, 18.325, 18.5, 18.4388888888889, 18.7291666666667,
18.3708333333333, 18.0234918630752, 19.4925980392157, 19.2101488095238,
19.3890625, 18.5150793650794, 19.1944444444444, 19.0815277777778,
19.5192658730159, 17.2212418300654, 17.8081168831169, 18.2517361111111,
17.7775555555556, 18.012962962963, 17.0347222222222, 16.5888888888889,
18.8123101604278, 18.9187091503268, 19.0161111111111, 19.2625,
20.875, 18.8092592592593, 18.6526515151515, 18.9083333333333,
18.9835227272727, 18.1829292929293, 17.9060606060606, 17.7835227272727,
17.8237719298246, 19.7386363636364, 18.4961051693405, 18.5332727272727,
18.3787878787879, 18.5134199134199, 17.8098930481283, 18.4179292929293,
17.230303030303, 18.9035064935065, 17.8935897435897, 17.6211966604824,
17.9238095238095, 18.8382886904762, 19.42625, 18.6395833333333,
18.0652777777778, 19.3354166666667, 18.75359375, 17.951123043623,
17.6063068181818, 17.828022875817, 17.5528846153846, 18.5647727272727,
19.0318181818182, 19.1659090909091, 18.8997564935065, 19.1301136363636,
18.1705882352941, 17.1361570247934, 18.6090909090909, 18.1429951690821,
17.8829545454545, 18.3387983091787, 18.41875, 19.7, 20.2508333333333,
17.6387426900585, 18.1770897832817, 17.5400297619048, 17.7547246376812,
17.246412037037, 17.0846153846154, 17.7060185185185, 18.325,
18.5408333333333, 19.4251587301587, 18.3706018518519, 17.917,
17.91, 18.6451388888889, 18.29375, 17.2316666666667, 18.7189393939394,
18.1669193548387, 18.367297979798, 17.7043055555556, 18.1879520697168,
19.12, 20.425, 18.6663888888889, 17.5108796296296, 18.1883333333333,
18.3060049019608, 18.32625, 18.2861111111111, 18.0375, 17.3445175438596,
18.6451058201058, 18.97875, 19.4583333333333, 18.2597222222222,
19.9197222222222, 18.2342307692308, 18.7666666666667, 19.8277777777778,
17.6464285714286, 18.690873015873, 18.4520833333333, 19.8696428571429,
19.9833333333333, 18.2416666666667)), class = "data.frame", row.names = c(NA,
-292L))
My data is in YYYY-MM-DD format and is monthly data. Right now, there's missing data for a few months (e.g. 2017-09, 2014-05, 2014-06, 2013-12), but they are not specified in the data frame. How do I create a new row for possible missing months across my entire dataset? Since my dataset has two columns, the other column besides the date column should have an NA value specified for the new missing month row. I'm looking for a tidyverse, lubridate, or data.table solution.
You can use tidyr::complete for this, but you have the additional wrinkle that you have dates on different days in each month. First then you need to make a column to count months on, which we can do with the day(x) <- setter from lubridate.
Here's an example using the provided data truncated to 2014 for conciseness. Note that you should use seq.Date to specify the full range of dates that you want to be included in the month column, and you also will have NAs in the date column. (you can replace with the first of the month if you want)
library(tidyverse)
library(lubridate)
tbl <- structure(list(date = structure(c(16076, 16129, 16143, 16184, 16276, 16303, 16343, 16374, 16400, 16417), tzone = "UTC", tclass = "Date", class = "Date"), AverageTemp = c(18.3387983091787, 18.41875, 19.7, 20.2508333333333, 17.6387426900585, 18.1770897832817, 17.5400297619048, 17.7547246376812, 17.246412037037, 17.0846153846154)), row.names = c(NA, -10L), class = "data.frame")
tbl %>%
mutate(month = date %>% `day<-`(1)) %>%
complete(month = seq.Date(min(month), max(month), by = "month"))
#> # A tibble: 12 x 3
#> month date AverageTemp
#> <date> <date> <dbl>
#> 1 2014-01-01 2014-01-06 18.3
#> 2 2014-02-01 2014-02-28 18.4
#> 3 2014-03-01 2014-03-14 19.7
#> 4 2014-04-01 2014-04-24 20.3
#> 5 2014-05-01 NA NA
#> 6 2014-06-01 NA NA
#> 7 2014-07-01 2014-07-25 17.6
#> 8 2014-08-01 2014-08-21 18.2
#> 9 2014-09-01 2014-09-30 17.5
#> 10 2014-10-01 2014-10-31 17.8
#> 11 2014-11-01 2014-11-26 17.2
#> 12 2014-12-01 2014-12-13 17.1
As an alternative, you can instead just get the year and month components and use complete on the combination of the two:
tbl %>%
mutate(year = year(date), month = month(date)) %>%
complete(year = min(year):max(year), month = 1:12)
#> # A tibble: 12 x 4
#> year month date AverageTemp
#> <dbl> <dbl> <date> <dbl>
#> 1 2014 1 2014-01-06 18.3
#> 2 2014 2 2014-02-28 18.4
#> 3 2014 3 2014-03-14 19.7
#> 4 2014 4 2014-04-24 20.3
#> 5 2014 5 NA NA
#> 6 2014 6 NA NA
#> 7 2014 7 2014-07-25 17.6
#> 8 2014 8 2014-08-21 18.2
#> 9 2014 9 2014-09-30 17.5
#> 10 2014 10 2014-10-31 17.8
#> 11 2014 11 2014-11-26 17.2
#> 12 2014 12 2014-12-13 17.1
Created on 2019-03-20 by the reprex package (v0.2.1)

Resources