I have a data frame which looks like this:
Subscription MonthlyPayment FirstPaymentDate NumberofPayments
<chr> <dbl> <date> <int>
1 Netflix 12.99 2021-05-24 21
2 Spotify 9.99 2021-08-17 7
3 PureGym 19.99 2022-07-04 9
4 DisneyPlus 7.99 2020-10-26 11
5 AmazonPrime 34.99 2020-08-11 73
6 Youtube 12.99 2020-09-27 35
I want to find out future payment dates for each subscription service. For example Netflix has 21 monthly payments, so I want to list out all the monthly payment days from the first payment date. How would I do this for each subscription service, using dplyr?
You can use dplyr and tidyr: create a list-column of sequential payment dates (rowwise) and then unnest that list.
library(dplyr)
library(tidyr)
df %>%
rowwise() %>%
mutate(Payments = list(seq(FirstPaymentDate, by="month", length.out=NumberofPayments))) %>%
unnest(Payments)
Output:
# A tibble: 156 × 5
Subscription MonthlyPayment FirstPaymentDate NumberofPayments Payments
<chr> <dbl> <date> <int> <date>
1 Netflix 13.0 2021-05-24 21 2021-05-24
2 Netflix 13.0 2021-05-24 21 2021-06-24
3 Netflix 13.0 2021-05-24 21 2021-07-24
4 Netflix 13.0 2021-05-24 21 2021-08-24
5 Netflix 13.0 2021-05-24 21 2021-09-24
6 Netflix 13.0 2021-05-24 21 2021-10-24
7 Netflix 13.0 2021-05-24 21 2021-11-24
8 Netflix 13.0 2021-05-24 21 2021-12-24
9 Netflix 13.0 2021-05-24 21 2022-01-24
10 Netflix 13.0 2021-05-24 21 2022-02-24
# … with 146 more rows
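One caveat with seq.Date: if a FirstPaymentDate fell on the 29th, 30th, or 31st, by = "month" can roll past short months (e.g. seq from 2021-01-31 gives 2021-03-03 as the next date). None of the dates above are affected, but as a precaution here is a sketch of the same pipeline using lubridate's %m+%, which clamps to the last day of shorter months instead:
library(dplyr)
library(tidyr)
library(lubridate)
# Same idea as above; 2021-01-31 %m+% months(1) is 2021-02-28 rather than March
df %>%
rowwise() %>%
mutate(Payments = list(FirstPaymentDate %m+% months(seq_len(NumberofPayments) - 1))) %>%
unnest(Payments)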
You can add the months, based on NumberofPayments, directly to FirstPaymentDate. This approach does not require dplyr. Note that seq(df$n_pay[x]) runs from 1 to n_pay, so each sequence starts one month after the first payment date; the first payment itself is not listed (as the output below shows).
library(lubridate)
library(purrr)
df <- data.frame(sub = c("Netflix", "Spotify", "PureGym", "DisneyPlus", "AmazonPrime", "Youtube"),
mo_pay = c(12.99, 9.99, 19.99, 7.99, 34.99, 12.99),
dt_fpay = as.Date(c("2021-05-24", "2021-08-17", "2022-07-04", "2020-10-26", "2020-08-11", "2020-09-27")),
n_pay = c(21, 7, 9, 11, 73, 35))
pay_dt <- map(seq(nrow(df)),
function(x) df$dt_fpay[x] %m+% months(seq(df$n_pay[x])))
names(pay_dt) <- df$sub
pay_dt
Output:
> pay_dt
$Netflix
[1] "2021-06-24" "2021-07-24" "2021-08-24" "2021-09-24" "2021-10-24" "2021-11-24"
[7] "2021-12-24" "2022-01-24" "2022-02-24" "2022-03-24" "2022-04-24" "2022-05-24"
[13] "2022-06-24" "2022-07-24" "2022-08-24" "2022-09-24" "2022-10-24" "2022-11-24"
[19] "2022-12-24" "2023-01-24" "2023-02-24"
$Spotify
[1] "2021-09-17" "2021-10-17" "2021-11-17" "2021-12-17" "2022-01-17" "2022-02-17"
[7] "2022-03-17"
$PureGym
[1] "2022-08-04" "2022-09-04" "2022-10-04" "2022-11-04" "2022-12-04" "2023-01-04"
[7] "2023-02-04" "2023-03-04" "2023-04-04"
$DisneyPlus
[1] "2020-11-26" "2020-12-26" "2021-01-26" "2021-02-26" "2021-03-26" "2021-04-26"
[7] "2021-05-26" "2021-06-26" "2021-07-26" "2021-08-26" "2021-09-26"
$AmazonPrime
[1] "2020-09-11" "2020-10-11" "2020-11-11" "2020-12-11" "2021-01-11" "2021-02-11"
[7] "2021-03-11" "2021-04-11" "2021-05-11" "2021-06-11" "2021-07-11" "2021-08-11"
[13] "2021-09-11" "2021-10-11" "2021-11-11" "2021-12-11" "2022-01-11" "2022-02-11"
[19] "2022-03-11" "2022-04-11" "2022-05-11" "2022-06-11" "2022-07-11" "2022-08-11"
[25] "2022-09-11" "2022-10-11" "2022-11-11" "2022-12-11" "2023-01-11" "2023-02-11"
[31] "2023-03-11" "2023-04-11" "2023-05-11" "2023-06-11" "2023-07-11" "2023-08-11"
[37] "2023-09-11" "2023-10-11" "2023-11-11" "2023-12-11" "2024-01-11" "2024-02-11"
[43] "2024-03-11" "2024-04-11" "2024-05-11" "2024-06-11" "2024-07-11" "2024-08-11"
[49] "2024-09-11" "2024-10-11" "2024-11-11" "2024-12-11" "2025-01-11" "2025-02-11"
[55] "2025-03-11" "2025-04-11" "2025-05-11" "2025-06-11" "2025-07-11" "2025-08-11"
[61] "2025-09-11" "2025-10-11" "2025-11-11" "2025-12-11" "2026-01-11" "2026-02-11"
[67] "2026-03-11" "2026-04-11" "2026-05-11" "2026-06-11" "2026-07-11" "2026-08-11"
[73] "2026-09-11"
$Youtube
[1] "2020-10-27" "2020-11-27" "2020-12-27" "2021-01-27" "2021-02-27" "2021-03-27"
[7] "2021-04-27" "2021-05-27" "2021-06-27" "2021-07-27" "2021-08-27" "2021-09-27"
[13] "2021-10-27" "2021-11-27" "2021-12-27" "2022-01-27" "2022-02-27" "2022-03-27"
[19] "2022-04-27" "2022-05-27" "2022-06-27" "2022-07-27" "2022-08-27" "2022-09-27"
[25] "2022-10-27" "2022-11-27" "2022-12-27" "2023-01-27" "2023-02-27" "2023-03-27"
[31] "2023-04-27" "2023-05-27" "2023-06-27" "2023-07-27" "2023-08-27"
I would like an object that gives me a date range for every month (or quarter) from 1990-01-01 to 2021-12-31, separated by a colon. For example, in the monthly case the first object would be 1990-01-01:1990-01-31, the second would be 1990-02-01:1990-02-28, and so on.
The issue I am having trouble with is making sure that the date range is exclusive, i.e., that no date gets repeated.
start_date1 <- as.Date("1990-01-01", "%Y-%m-%d")
end_date1 <- as.Date("2021-12-01", "%Y-%m-%d")
first_date <- format(seq(start_date1,end_date1,by="month"),"%Y-%m-%d")
start_date2 <- as.Date("1990-02-01", "%Y-%m-%d")
end_date2 <- as.Date("2022-01-01", "%Y-%m-%d")
second_date <- format(seq(start_date2,end_date2,by="month"),"%Y-%m-%d")
date<-paste0(first_date, ":")
finaldate<-paste0(date, second_date)
This code works, except that the first date of each month gets repeated ("1990-01-01:1990-02-01", "1990-02-01:1990-03-01"), and that the last range is "2021-12-01:2022-01-01" (including Jan 1, 2022 rather than stopping at Dec 31, 2021).
If I go by 30 days instead, it doesn't work as well because not every month has 30 days.
What's the best way to get an exclusive date range?
You could do:
dates <- seq(as.Date("1990-01-01"), as.Date("2022-01-01"), by = "month")
dates <- paste(head(dates, -1), tail(dates - 1, -1), sep = ":")
Here head(dates, -1) keeps each first-of-month except the final sentinel (2022-01-01), while tail(dates - 1, -1) shifts the whole sequence back one day and drops its first element, yielding each month's last day. The result:
dates
#> [1] "1990-01-01:1990-01-31" "1990-02-01:1990-02-28" "1990-03-01:1990-03-31"
#> [4] "1990-04-01:1990-04-30" "1990-05-01:1990-05-31" "1990-06-01:1990-06-30"
#> [7] "1990-07-01:1990-07-31" "1990-08-01:1990-08-31" "1990-09-01:1990-09-30"
#> [10] "1990-10-01:1990-10-31" "1990-11-01:1990-11-30" "1990-12-01:1990-12-31"
#> [13] "1991-01-01:1991-01-31" "1991-02-01:1991-02-28" "1991-03-01:1991-03-31"
#> [16] "1991-04-01:1991-04-30" "1991-05-01:1991-05-31" "1991-06-01:1991-06-30"
#> [19] "1991-07-01:1991-07-31" "1991-08-01:1991-08-31" "1991-09-01:1991-09-30"
#> [22] "1991-10-01:1991-10-31" "1991-11-01:1991-11-30" "1991-12-01:1991-12-31"
#> [25] "1992-01-01:1992-01-31" "1992-02-01:1992-02-29" "1992-03-01:1992-03-31"
#> [28] "1992-04-01:1992-04-30" "1992-05-01:1992-05-31" "1992-06-01:1992-06-30"
#> [31] "1992-07-01:1992-07-31" "1992-08-01:1992-08-31" "1992-09-01:1992-09-30"
#> [34] "1992-10-01:1992-10-31" "1992-11-01:1992-11-30" "1992-12-01:1992-12-31"
#> [37] "1993-01-01:1993-01-31" "1993-02-01:1993-02-28" "1993-03-01:1993-03-31"
#> [40] "1993-04-01:1993-04-30" "1993-05-01:1993-05-31" "1993-06-01:1993-06-30"
#> [43] "1993-07-01:1993-07-31" "1993-08-01:1993-08-31" "1993-09-01:1993-09-30"
#> [46] "1993-10-01:1993-10-31" "1993-11-01:1993-11-30" "1993-12-01:1993-12-31"
#> [49] "1994-01-01:1994-01-31" "1994-02-01:1994-02-28" "1994-03-01:1994-03-31"
#> [52] "1994-04-01:1994-04-30" "1994-05-01:1994-05-31" "1994-06-01:1994-06-30"
#> [55] "1994-07-01:1994-07-31" "1994-08-01:1994-08-31" "1994-09-01:1994-09-30"
#> [58] "1994-10-01:1994-10-31" "1994-11-01:1994-11-30" "1994-12-01:1994-12-31"
#> [61] "1995-01-01:1995-01-31" "1995-02-01:1995-02-28" "1995-03-01:1995-03-31"
#> [64] "1995-04-01:1995-04-30" "1995-05-01:1995-05-31" "1995-06-01:1995-06-30"
#> [67] "1995-07-01:1995-07-31" "1995-08-01:1995-08-31" "1995-09-01:1995-09-30"
#> [70] "1995-10-01:1995-10-31" "1995-11-01:1995-11-30" "1995-12-01:1995-12-31"
#> [73] "1996-01-01:1996-01-31" "1996-02-01:1996-02-29" "1996-03-01:1996-03-31"
#> [76] "1996-04-01:1996-04-30" "1996-05-01:1996-05-31" "1996-06-01:1996-06-30"
#> [79] "1996-07-01:1996-07-31" "1996-08-01:1996-08-31" "1996-09-01:1996-09-30"
#> [82] "1996-10-01:1996-10-31" "1996-11-01:1996-11-30" "1996-12-01:1996-12-31"
#> [85] "1997-01-01:1997-01-31" "1997-02-01:1997-02-28" "1997-03-01:1997-03-31"
#> [88] "1997-04-01:1997-04-30" "1997-05-01:1997-05-31" "1997-06-01:1997-06-30"
#> [91] "1997-07-01:1997-07-31" "1997-08-01:1997-08-31" "1997-09-01:1997-09-30"
#> [94] "1997-10-01:1997-10-31" "1997-11-01:1997-11-30" "1997-12-01:1997-12-31"
#> [97] "1998-01-01:1998-01-31" "1998-02-01:1998-02-28" "1998-03-01:1998-03-31"
#> [100] "1998-04-01:1998-04-30" "1998-05-01:1998-05-31" "1998-06-01:1998-06-30"
#> [103] "1998-07-01:1998-07-31" "1998-08-01:1998-08-31" "1998-09-01:1998-09-30"
#> [106] "1998-10-01:1998-10-31" "1998-11-01:1998-11-30" "1998-12-01:1998-12-31"
#> [109] "1999-01-01:1999-01-31" "1999-02-01:1999-02-28" "1999-03-01:1999-03-31"
#> [112] "1999-04-01:1999-04-30" "1999-05-01:1999-05-31" "1999-06-01:1999-06-30"
#> [115] "1999-07-01:1999-07-31" "1999-08-01:1999-08-31" "1999-09-01:1999-09-30"
#> [118] "1999-10-01:1999-10-31" "1999-11-01:1999-11-30" "1999-12-01:1999-12-31"
#> [121] "2000-01-01:2000-01-31" "2000-02-01:2000-02-29" "2000-03-01:2000-03-31"
#> [124] "2000-04-01:2000-04-30" "2000-05-01:2000-05-31" "2000-06-01:2000-06-30"
#> [127] "2000-07-01:2000-07-31" "2000-08-01:2000-08-31" "2000-09-01:2000-09-30"
#> [130] "2000-10-01:2000-10-31" "2000-11-01:2000-11-30" "2000-12-01:2000-12-31"
#> [133] "2001-01-01:2001-01-31" "2001-02-01:2001-02-28" "2001-03-01:2001-03-31"
#> [136] "2001-04-01:2001-04-30" "2001-05-01:2001-05-31" "2001-06-01:2001-06-30"
#> [139] "2001-07-01:2001-07-31" "2001-08-01:2001-08-31" "2001-09-01:2001-09-30"
#> [142] "2001-10-01:2001-10-31" "2001-11-01:2001-11-30" "2001-12-01:2001-12-31"
#> [145] "2002-01-01:2002-01-31" "2002-02-01:2002-02-28" "2002-03-01:2002-03-31"
#> [148] "2002-04-01:2002-04-30" "2002-05-01:2002-05-31" "2002-06-01:2002-06-30"
#> [151] "2002-07-01:2002-07-31" "2002-08-01:2002-08-31" "2002-09-01:2002-09-30"
#> [154] "2002-10-01:2002-10-31" "2002-11-01:2002-11-30" "2002-12-01:2002-12-31"
#> [157] "2003-01-01:2003-01-31" "2003-02-01:2003-02-28" "2003-03-01:2003-03-31"
#> [160] "2003-04-01:2003-04-30" "2003-05-01:2003-05-31" "2003-06-01:2003-06-30"
#> [163] "2003-07-01:2003-07-31" "2003-08-01:2003-08-31" "2003-09-01:2003-09-30"
#> [166] "2003-10-01:2003-10-31" "2003-11-01:2003-11-30" "2003-12-01:2003-12-31"
#> [169] "2004-01-01:2004-01-31" "2004-02-01:2004-02-29" "2004-03-01:2004-03-31"
#> [172] "2004-04-01:2004-04-30" "2004-05-01:2004-05-31" "2004-06-01:2004-06-30"
#> [175] "2004-07-01:2004-07-31" "2004-08-01:2004-08-31" "2004-09-01:2004-09-30"
#> [178] "2004-10-01:2004-10-31" "2004-11-01:2004-11-30" "2004-12-01:2004-12-31"
#> [181] "2005-01-01:2005-01-31" "2005-02-01:2005-02-28" "2005-03-01:2005-03-31"
#> [184] "2005-04-01:2005-04-30" "2005-05-01:2005-05-31" "2005-06-01:2005-06-30"
#> [187] "2005-07-01:2005-07-31" "2005-08-01:2005-08-31" "2005-09-01:2005-09-30"
#> [190] "2005-10-01:2005-10-31" "2005-11-01:2005-11-30" "2005-12-01:2005-12-31"
#> [193] "2006-01-01:2006-01-31" "2006-02-01:2006-02-28" "2006-03-01:2006-03-31"
#> [196] "2006-04-01:2006-04-30" "2006-05-01:2006-05-31" "2006-06-01:2006-06-30"
#> [199] "2006-07-01:2006-07-31" "2006-08-01:2006-08-31" "2006-09-01:2006-09-30"
#> [202] "2006-10-01:2006-10-31" "2006-11-01:2006-11-30" "2006-12-01:2006-12-31"
#> [205] "2007-01-01:2007-01-31" "2007-02-01:2007-02-28" "2007-03-01:2007-03-31"
#> [208] "2007-04-01:2007-04-30" "2007-05-01:2007-05-31" "2007-06-01:2007-06-30"
#> [211] "2007-07-01:2007-07-31" "2007-08-01:2007-08-31" "2007-09-01:2007-09-30"
#> [214] "2007-10-01:2007-10-31" "2007-11-01:2007-11-30" "2007-12-01:2007-12-31"
#> [217] "2008-01-01:2008-01-31" "2008-02-01:2008-02-29" "2008-03-01:2008-03-31"
#> [220] "2008-04-01:2008-04-30" "2008-05-01:2008-05-31" "2008-06-01:2008-06-30"
#> [223] "2008-07-01:2008-07-31" "2008-08-01:2008-08-31" "2008-09-01:2008-09-30"
#> [226] "2008-10-01:2008-10-31" "2008-11-01:2008-11-30" "2008-12-01:2008-12-31"
#> [229] "2009-01-01:2009-01-31" "2009-02-01:2009-02-28" "2009-03-01:2009-03-31"
#> [232] "2009-04-01:2009-04-30" "2009-05-01:2009-05-31" "2009-06-01:2009-06-30"
#> [235] "2009-07-01:2009-07-31" "2009-08-01:2009-08-31" "2009-09-01:2009-09-30"
#> [238] "2009-10-01:2009-10-31" "2009-11-01:2009-11-30" "2009-12-01:2009-12-31"
#> [241] "2010-01-01:2010-01-31" "2010-02-01:2010-02-28" "2010-03-01:2010-03-31"
#> [244] "2010-04-01:2010-04-30" "2010-05-01:2010-05-31" "2010-06-01:2010-06-30"
#> [247] "2010-07-01:2010-07-31" "2010-08-01:2010-08-31" "2010-09-01:2010-09-30"
#> [250] "2010-10-01:2010-10-31" "2010-11-01:2010-11-30" "2010-12-01:2010-12-31"
#> [253] "2011-01-01:2011-01-31" "2011-02-01:2011-02-28" "2011-03-01:2011-03-31"
#> [256] "2011-04-01:2011-04-30" "2011-05-01:2011-05-31" "2011-06-01:2011-06-30"
#> [259] "2011-07-01:2011-07-31" "2011-08-01:2011-08-31" "2011-09-01:2011-09-30"
#> [262] "2011-10-01:2011-10-31" "2011-11-01:2011-11-30" "2011-12-01:2011-12-31"
#> [265] "2012-01-01:2012-01-31" "2012-02-01:2012-02-29" "2012-03-01:2012-03-31"
#> [268] "2012-04-01:2012-04-30" "2012-05-01:2012-05-31" "2012-06-01:2012-06-30"
#> [271] "2012-07-01:2012-07-31" "2012-08-01:2012-08-31" "2012-09-01:2012-09-30"
#> [274] "2012-10-01:2012-10-31" "2012-11-01:2012-11-30" "2012-12-01:2012-12-31"
#> [277] "2013-01-01:2013-01-31" "2013-02-01:2013-02-28" "2013-03-01:2013-03-31"
#> [280] "2013-04-01:2013-04-30" "2013-05-01:2013-05-31" "2013-06-01:2013-06-30"
#> [283] "2013-07-01:2013-07-31" "2013-08-01:2013-08-31" "2013-09-01:2013-09-30"
#> [286] "2013-10-01:2013-10-31" "2013-11-01:2013-11-30" "2013-12-01:2013-12-31"
#> [289] "2014-01-01:2014-01-31" "2014-02-01:2014-02-28" "2014-03-01:2014-03-31"
#> [292] "2014-04-01:2014-04-30" "2014-05-01:2014-05-31" "2014-06-01:2014-06-30"
#> [295] "2014-07-01:2014-07-31" "2014-08-01:2014-08-31" "2014-09-01:2014-09-30"
#> [298] "2014-10-01:2014-10-31" "2014-11-01:2014-11-30" "2014-12-01:2014-12-31"
#> [301] "2015-01-01:2015-01-31" "2015-02-01:2015-02-28" "2015-03-01:2015-03-31"
#> [304] "2015-04-01:2015-04-30" "2015-05-01:2015-05-31" "2015-06-01:2015-06-30"
#> [307] "2015-07-01:2015-07-31" "2015-08-01:2015-08-31" "2015-09-01:2015-09-30"
#> [310] "2015-10-01:2015-10-31" "2015-11-01:2015-11-30" "2015-12-01:2015-12-31"
#> [313] "2016-01-01:2016-01-31" "2016-02-01:2016-02-29" "2016-03-01:2016-03-31"
#> [316] "2016-04-01:2016-04-30" "2016-05-01:2016-05-31" "2016-06-01:2016-06-30"
#> [319] "2016-07-01:2016-07-31" "2016-08-01:2016-08-31" "2016-09-01:2016-09-30"
#> [322] "2016-10-01:2016-10-31" "2016-11-01:2016-11-30" "2016-12-01:2016-12-31"
#> [325] "2017-01-01:2017-01-31" "2017-02-01:2017-02-28" "2017-03-01:2017-03-31"
#> [328] "2017-04-01:2017-04-30" "2017-05-01:2017-05-31" "2017-06-01:2017-06-30"
#> [331] "2017-07-01:2017-07-31" "2017-08-01:2017-08-31" "2017-09-01:2017-09-30"
#> [334] "2017-10-01:2017-10-31" "2017-11-01:2017-11-30" "2017-12-01:2017-12-31"
#> [337] "2018-01-01:2018-01-31" "2018-02-01:2018-02-28" "2018-03-01:2018-03-31"
#> [340] "2018-04-01:2018-04-30" "2018-05-01:2018-05-31" "2018-06-01:2018-06-30"
#> [343] "2018-07-01:2018-07-31" "2018-08-01:2018-08-31" "2018-09-01:2018-09-30"
#> [346] "2018-10-01:2018-10-31" "2018-11-01:2018-11-30" "2018-12-01:2018-12-31"
#> [349] "2019-01-01:2019-01-31" "2019-02-01:2019-02-28" "2019-03-01:2019-03-31"
#> [352] "2019-04-01:2019-04-30" "2019-05-01:2019-05-31" "2019-06-01:2019-06-30"
#> [355] "2019-07-01:2019-07-31" "2019-08-01:2019-08-31" "2019-09-01:2019-09-30"
#> [358] "2019-10-01:2019-10-31" "2019-11-01:2019-11-30" "2019-12-01:2019-12-31"
#> [361] "2020-01-01:2020-01-31" "2020-02-01:2020-02-29" "2020-03-01:2020-03-31"
#> [364] "2020-04-01:2020-04-30" "2020-05-01:2020-05-31" "2020-06-01:2020-06-30"
#> [367] "2020-07-01:2020-07-31" "2020-08-01:2020-08-31" "2020-09-01:2020-09-30"
#> [370] "2020-10-01:2020-10-31" "2020-11-01:2020-11-30" "2020-12-01:2020-12-31"
#> [373] "2021-01-01:2021-01-31" "2021-02-01:2021-02-28" "2021-03-01:2021-03-31"
#> [376] "2021-04-01:2021-04-30" "2021-05-01:2021-05-31" "2021-06-01:2021-06-30"
#> [379] "2021-07-01:2021-07-31" "2021-08-01:2021-08-31" "2021-09-01:2021-09-30"
#> [382] "2021-10-01:2021-10-31" "2021-11-01:2021-11-30" "2021-12-01:2021-12-31"
Created on 2022-03-19 by the reprex package (v2.0.1)
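The question also asks about quarters; the same pairing trick works with by = "quarter" (a sketch):
# quarterly ranges with the same head()/tail() pairing
dates_q <- seq(as.Date("1990-01-01"), as.Date("2022-01-01"), by = "quarter")
dates_q <- paste(head(dates_q, -1), tail(dates_q - 1, -1), sep = ":")
# first elements: "1990-01-01:1990-03-31" "1990-04-01:1990-06-30" ...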
I used lubridate for the simplicity of its ymd() function.
require(lubridate)
Start by creating a vector of the first day of each month:
start <- seq(ymd("1990-01-01"), ymd("2021-12-01"), by = "month")
Then create another vector, subtracting 1 day, to obtain the last day of each month:
b <- start - 1
Remove the first element of that vector:
end <- b[-1]
Finally, join them all:
paste0(start, ":", end)
There is one easily fixable issue: start now has one more element than end, so the very last interval is incorrect (paste0() recycles end).
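A minimal fix, assuming the start and end vectors above: append the last day of the final month so the two vectors have equal length.
end <- c(end, ymd("2021-12-31"))
paste0(start, ":", end)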
1) yearmon/yearqtr Create a monthly sequence using the yearmon class and then convert that to the start and end dates; similarly for quarters and the yearqtr class. Internally both represent dates as the year plus a fraction of the year, so use 1/12 and 1/4 in by=. Also note that as.Date gives the date at the start of the month or quarter, while the same call with the frac = 1 argument gives the end.
library(zoo)
# input
st <- as.Date("1990-01-01")
en <- as.Date("2021-12-01")
# by month
mon <- seq(as.yearmon(st), as.yearmon(en), 1/12)
paste(as.Date(mon), as.Date(mon, frac = 1), sep = ":")
# by quarter
qtr <- seq(as.yearqtr(st), as.yearqtr(en), 1/4)
paste(as.Date(qtr), as.Date(qtr, frac = 1), sep = ":")
There is some question of what the end date should be. The above gives an end date of 2021-12-31 on the last interval; if instead no interval should extend past en (2021-12-01), replace the two paste lines with these, respectively:
paste(as.Date(mon), pmin(as.Date(mon, frac = 1), en), sep = ":")
paste(as.Date(qtr), pmin(as.Date(qtr, frac = 1), en), sep = ":")
2) Base R A base R alternative is to use the expressions involving cut shown below to get the end of each period: adding 31 days (93 for quarters) is guaranteed to land in the following month (quarter), cut(..., "month") floors that to the first of the month, and subtracting one day gives the last day of the original period. (1) seems less tricky, but this might be useful if using only base R is desired. A similar approach with pmin as in (1) could be used if we want to ensure that no range extends beyond en.
This and the remaining solutions, but not (1), assume that st is the first of the month; however, that could readily be handled if needed.
mon <- seq(st, en, by = "month")
paste(mon, as.Date(cut(mon + 31, "month")) - 1, sep = ":")
qtr <- seq(st, en, by = "quarter")
paste(as.Date(qtr), as.Date(cut(qtr + 93, "month")) - 1, sep = ":")
3) lubridate Using various functions from this package we can write the following. A similar approach using pmin as in (1) could be used if the ranges may not extend beyond en.
library(lubridate)
mon <- seq(st, en, by = "month")
paste(mon, mon + months(1) - 1, sep = ":")
qtr <- seq(st, en, by = "quarter")
paste(qtr, qtr + months(3) - 1, sep = ":")
4) IDate We can use the IDate class from data.table, in which case cut.IDate returns another IDate object rather than a factor (as base R's cut does), so we can subtract 1 from its result directly.
library(data.table)
st <- as.IDate("1990-01-01")
en <- as.IDate("2021-12-01")
mon <- seq(st, en, by = "month")
paste(mon, cut(mon + 31, "month") - 1, sep = ":")
qtr <- seq(st, en, by = "quarter")
paste(qtr, cut(qtr + 93, "month") - 1, sep = ":")
I have a huge (~10,000,000 rows) data frame with a column that consists of dates, e.g.:
df <- data.frame(StartDate = as.character(c("2014-08-20 11:59:38",
"2014-08-21 16:17:44",
"2014-08-22 19:02:10",
"9/1/2014 08:05:13",
"9/2/2014 15:13:28",
"9/3/2014 00:22:01")))
The problem is that date formats are mixed - I would like to standardise them so as to get:
StartDate
1 2014-08-20
2 2014-08-21
3 2014-08-22
4 2014-09-01
5 2014-09-02
6 2014-09-03
1. as.Date() approach
as.Date("2014-08-31 23:59:38", "%m/%d/%Y")
as.Date("9/1/2014 00:00:28", "%m/%d/%Y")
gives
[1] NA
[1] "2014-09-01"
2. lubridate approach
dmy("9/1/2014 00:00:28")
mdy("9/1/2014 00:00:28")
dmy("2014-08-31 23:59:38")
mdy("2014-08-31 23:59:38")
in each case returns
[1] NA
Warning message:
All formats failed to parse. No formats found.
Is there any neat solution to that?
It may be easier to use parse_date from the parsedate package:
library(parsedate)
df$StartDate <- as.Date(parse_date(df$StartDate))
Output:
> df$StartDate
[1] "2014-08-20" "2014-08-21" "2014-08-22" "2014-09-01" "2014-09-02" "2014-09-03"
I have just found out that anytime::anydate extracts the dates directly and straightforwardly:
library(anytime)
library(tidyverse)
df %>%
mutate(Date = anydate(StartDate))
#> StartDate Date
#> 1 2014-08-20 11:59:38 2014-08-20
#> 2 2014-08-21 16:17:44 2014-08-21
#> 3 2014-08-22 19:02:10 2014-08-22
#> 4 9/1/2014 08:05:13 2014-09-01
#> 5 9/2/2014 15:13:28 2014-09-02
#> 6 9/3/2014 00:22:01 2014-09-03
Another solution, based on lubridate:
library(tidyverse)
library(lubridate)
df %>%
mutate(Date = if_else(!str_detect(StartDate, "/"),
date(ymd_hms(StartDate, quiet = TRUE)),
date(mdy_hms(StartDate, quiet = TRUE))))
#> StartDate Date
#> 1 2014-08-20 11:59:38 2014-08-20
#> 2 2014-08-21 16:17:44 2014-08-21
#> 3 2014-08-22 19:02:10 2014-08-22
#> 4 9/1/2014 08:05:13 2014-09-01
#> 5 9/2/2014 15:13:28 2014-09-02
#> 6 9/3/2014 00:22:01 2014-09-03
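Another option along the same lines (a sketch, not from the answers above): lubridate::parse_date_time() accepts several candidate formats in its orders argument and uses whichever one parses, which also handles the mixed formats in a single vectorized call.
library(dplyr)
library(lubridate)
# "Ymd HMS" matches the ISO-style rows, "mdY HMS" the m/d/Y rows
df %>%
mutate(Date = as.Date(parse_date_time(StartDate, orders = c("Ymd HMS", "mdY HMS"))))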
I have the following dataset, which contains a person's ID, the district and sub-district they live in, and the last date/time on which they uploaded data to the server. The "last_upload_" variables contain that last date/time and are named after the date on which I downloaded the data from the server. For example, "last_upload_2020-06-12" means I downloaded the data from the server on 12th June.
For the dataset below, I would like to split the date and time in each of these variables (all at once) so that the newly created variables are named "last_date_(my download date)" and "last_time_(my download date)".
district block id last_upload_2020-06-12 last_upload_2020-06-13 last_upload_2020-06-14 last_upload_2020-06-15
A X 11 2020-02-06 11:53:19.0 2020-02-06 11:53:19.0 2020-02-06 11:53:19.0 2020-02-06 11:53:19.0
A X 12 2020-06-11 12:40:26.0 2020-06-11 12:40:26.0 2020-06-14 11:40:26.0 2020-06-15 18:50:26.0
A X 2020-06-14 11:08:12.0 2020-06-14 11:08:12.0
A X 14 2020-06-12 11:31:07.0 2020-06-13 11:31:07.0 2020-06-14 17:37:07.0 2020-06-14 17:37:07.0
A Y 15 2020-06-10 12:45:48.0 2020-06-10 12:45:48.0 2020-06-10 12:45:48.0 2020-06-10 12:45:48.0
A Y 16 2020-04-04 02:26:57.0 2020-04-04 02:26:57.0 2020-04-04 02:26:57.0 2020-04-04 02:26:57.0
A Y 17 2020-03-31 08:10:03.0 2020-03-31 08:10:03.0 2020-03-31 08:10:03.0 2020-03-31 08:10:03.0
A Y 18 2020-05-30 12:08:15.0 2020-05-30 12:08:15.0 2020-05-30 12:08:15.0 2020-05-30 12:08:15.0
A Z 19 2020-04-09 15:21:52.0 2020-04-09 15:21:52.0 2020-04-09 15:21:52.0 2020-04-09 15:21:52.0
A Z 20 2020-05-30 17:42:33.0 2020-05-30 17:42:33.0 2020-05-30 17:42:33.0 2020-05-30 17:42:33.0
A Z 21 2020-04-12 14:23:29.0 2020-04-12 14:23:29.0 2020-04-12 14:23:29.0 2020-04-12 14:23:29.0
A Z 22 2020-05-13 23:18:19.0 2020-05-13 23:18:19.0 2020-05-13 23:18:19.0 2020-05-13 23:18:19.0
A X 23 2020-04-30 09:53:31.0 2020-04-30 09:53:31.0 2020-04-30 09:53:31.0 2020-04-30 09:53:31.0
A X 24 2020-06-10 10:28:59.0 2020-06-10 10:28:59.0 2020-06-10 10:28:59.0 2020-06-15 11:31:33.0
A Y 25
A Y 26 2020-05-30 12:14:09.0 2020-05-30 12:14:09.0 2020-05-30 12:14:09.0 2020-05-30 12:14:09.0
B E 31
B C 32 2020-06-12 16:43:23.0 2020-06-12 16:43:23.0 2020-06-12 16:43:23.0 2020-06-12 16:43:23.0
B C 33 2019-10-24 22:30:35.0 2019-10-24 22:30:35.0 2019-10-24 22:30:35.0 2019-10-24 22:30:35.0
B C 34 2020-06-09 15:38:18.0 2020-06-09 15:38:18.0 2020-06-09 15:38:18.0 2020-06-15 14:35:41.0
B C 35 2020-06-11 14:39:51.0 2020-06-11 14:39:51.0 2020-06-11 14:39:51.0 2020-06-11 14:39:51.0
B D 36 2020-06-12 11:53:15.0 2020-06-12 11:53:15.0 2020-06-12 11:53:15.0 2020-06-15 13:02:39.0
B D 37 2020-04-21 15:43:43.0 2020-04-21 15:43:43.0 2020-04-21 15:43:43.0 2020-04-21 15:43:43.0
B D 38 2020-05-13 04:07:17.0 2020-05-13 04:07:17.0 2020-05-13 04:07:17.0 2020-05-13 04:07:17.0
B E 39 2020-04-30 13:51:20.0 2020-04-30 13:51:20.0 2020-04-30 13:51:20.0 2020-04-30 13:51:20.0
B E 40 2020-05-12 16:51:01.0 2020-05-12 16:51:01.0 2020-05-12 16:51:01.0 2020-05-12 16:51:01.0
B E 41 2020-04-16 12:14:24.0 2020-04-16 12:14:24.0 2020-04-16 12:14:24.0 2020-04-16 12:14:24.0
B C 42 2018-06-07 15:12:18.0 2018-06-07 15:12:18.0 2018-06-07 15:12:18.0 2018-06-07 15:12:18.0
B D 43 2019-09-28 10:08:51.0 2019-09-28 10:08:51.0 2019-09-28 10:08:51.0 2019-09-28 10:08:51.0
N.B: my date/time variables are numeric.
Once I get the data in shape, I would also like to do the following:
1. Get the year and month of all observations under "last_upload_2020-06-12" in a separate column.
2. Similarly for the last date in my dataset, "last_upload_2020-06-15". Can I have R pick the last date automatically, something like Sys.Date() - 1? I will always have data for one day less than the current date.
3. Calculate the average upload time per ID, i.e., around what time does a person generally upload data to the server? The average should be based on unique time values.
It would be extremely helpful if someone could help solve this!
Thanks,
Rachita
You can try the following code on your original data set. It addresses the introductory part of the question, then parts 1, 3, and finally 2.
library(lubridate)
library(tidyverse)
district <- c("A","A","B","B","C","C")
block <- c("X","Y","Z","X","Y","Z")
id <- c(11,11,12,12,13,13)
upload_dt <- ymd_hms(c("2020-06-13 11:31:07",
"2020-04-12 14:23:29",
"2020-04-30 13:51:20",
"2020-06-12 11:53:15",
"2019-09-28 02:08:51",
"2020-04-12 16:23:29"))
df <- data.frame(district, block, id, upload_dt)
df <- df %>%
separate(upload_dt, into = c("date","time"),
sep = " ", remove = F)
df$upload_date <- paste("last_upload_date_is", df$date)
df$upload_time <- paste("last_upload_time_is", df$time)
df <- df %>%
mutate(date_added = ymd(date),
year_upload = year(date),
month_upload = month(date))
df
The output for the introductory part and part 1 of the question is as follows:
district block id upload_dt date time upload_date
1 A X 11 2020-06-13 11:31:07 2020-06-13 11:31:07 last_upload_date_is 2020-06-13
2 A Y 11 2020-04-12 14:23:29 2020-04-12 14:23:29 last_upload_date_is 2020-04-12
3 B Z 12 2020-04-30 13:51:20 2020-04-30 13:51:20 last_upload_date_is 2020-04-30
4 B X 12 2020-06-12 11:53:15 2020-06-12 11:53:15 last_upload_date_is 2020-06-12
5 C Y 13 2019-09-28 02:08:51 2019-09-28 02:08:51 last_upload_date_is 2019-09-28
6 C Z 13 2020-04-12 16:23:29 2020-04-12 16:23:29 last_upload_date_is 2020-04-12
upload_time date_added year_upload month_upload
1 last_upload_time_is 11:31:07 2020-06-13 2020 6
2 last_upload_time_is 14:23:29 2020-04-12 2020 4
3 last_upload_time_is 13:51:20 2020-04-30 2020 4
4 last_upload_time_is 11:53:15 2020-06-12 2020 6
5 last_upload_time_is 02:08:51 2019-09-28 2019 9
6 last_upload_time_is 16:23:29 2020-04-12 2020 4
The code and output for part 3 of the question are as follows:
df %>% group_by(id) %>%
summarise(avg_time_per_id = format(mean(strptime(time, "%H:%M:%S")), "%H:%M:%S")) %>%
ungroup()
# A tibble: 3 x 2
id avg_time_per_id
<dbl> <chr>
1 11 12:57:18
2 12 12:52:17
3 13 09:16:10
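The OP asked for the average to be based on unique time values; if duplicate times can occur for an id, a distinct() step before grouping would handle that (a sketch on the same data):
# keep one row per id/time combination before averaging
df %>%
distinct(id, time, .keep_all = TRUE) %>%
group_by(id) %>%
summarise(avg_time_per_id = format(mean(strptime(time, "%H:%M:%S")), "%H:%M:%S")) %>%
ungroup()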
The code and output for part 2 of the question are as follows (note that I have created a new data frame for this; you can apply the same solution to the existing data set):
df <- data.frame(
id = c(1:5),
district = c("X","Y","X","Y","X"),
block = c("A","A","B","B","C"),
upload_date_a = paste0(rep("2020-06-13"), " ", rep("11:31:07")),
upload_date_b = paste0(rep("2010-08-15"), " ", rep("02:45:27")),
upload_date_c = paste0(rep("2000-10-30"), " ", rep("16:45:51")),
stringsAsFactors = F
)
col_ind <- grep(x = names(df), pattern = "upload_date", value = T, ignore.case = T)
cols_list <- lapply(seq_along(col_ind), function(x){
q1 <- do.call(rbind, strsplit(df[[col_ind[[x]]]], split = " "))
q2 <- data.frame(q1, stringsAsFactors = F)
i <- ncol(q2)
colnames(q2) <- paste0(col_ind[[x]], c(1:i))
return(q2)
}
)
df_new <- cbind(df[1:3], do.call(cbind, cols_list))
df_new
id district block upload_date_a1 upload_date_a2 upload_date_b1
1 1 X A 2020-06-13 11:31:07 2010-08-15
2 2 Y A 2020-06-13 11:31:07 2010-08-15
3 3 X B 2020-06-13 11:31:07 2010-08-15
4 4 Y B 2020-06-13 11:31:07 2010-08-15
5 5 X C 2020-06-13 11:31:07 2010-08-15
upload_date_b2 upload_date_c1 upload_date_c2
1 02:45:27 2000-10-30 16:45:51
2 02:45:27 2000-10-30 16:45:51
3 02:45:27 2000-10-30 16:45:51
4 02:45:27 2000-10-30 16:45:51
5 02:45:27 2000-10-30 16:45:51
The data frame looked so complicated that I thought it might be better to replicate it.
I then used a loop to take every column you wanted and separate it into the desired last_date and last_time columns. Inside the loop, each temporary result is cbind-ed to a data frame built outside the loop from the columns that are not treated in it. The result of this loop is the data frame as wanted (the column names get a little long).
The key to the average-time task was to convert last_time to hours, then group and summarise.
I hope this is what you wanted. With this as a basis, I think you can deal with no. 2 as well; see the sketch after the reprex. There were some warnings, which had to do with NAs.
More explanation in the reprex below.
library(tidyverse)
df <- read.table(text = '
district block id last_upload_2020_06_12 last_upload_2020_06_13 last_upload_2020_06_14 last_upload_2020_06_15
"A" "X" 11 "2020-02-06 11:53:19.0" "2020-02-06 11:53:19.0" "2020-02-06 11:53:19.0" "2020-02-06 11:53:19.0"
"A" "X" 12 "2020-06-11 12:40:26.0" "2020-06-11 12:40:26.0" "2020-06-14 11:40:26.0" "2020-06-15 18:50:26.0"
"A" "X" NA "NA" "NA" "2020-06-14 11:0812.0" "2020-06-14 11:0812.0"
"A" "X" 14 "2020-06-12 11:31:07.0" "2020-06-13 11:31:07.0" "2020-06-14 17:37:07.0" "2020-06-14 17:37:07.0"
"A" "Y" 15 "2020-06-10 12:45:48.0" "2020-06-10 12:45:48.0" "2020-06-10 12:45:48.0" "2020-06-10 12:45:48.0"
"A" "Y" 16 "2020-04-04 02:26:57.0" "2020-04-04 02:26:57.0" "2020-04-04 02:26:57.0" "2020-04-04 02:26:57.0"
"A" "Y" 17 "2020-03-31 08:10:03.0" "2020-03-31 08:10:03.0" "2020-03-31 08:10:03.0" "2020-03-31 08:10:03.0"
"A" "Y" 18 "2020-05-30 12:08:15.0" "2020-05-30 12:08:15.0" "2020-05-30 12:08:15.0" "2020-05-30 12:08:15.0"
"A" "Z" 19 "2020-04-09 15:21:52.0" "2020-04-09 15:21:52.0" "2020-04-09 15:21:52.0" "2020-04-09 15:21:52.0"
"A" "Z" 20 "2020-05-30 17:42:33.0" "2020-05-30 17:42:33.0" "2020-05-30 17:42:33.0" "2020-05-30 17:42:33.0"
"A" "Z" 21 "2020-04-12 14:23:29.0" "2020-04-12 14:23:29.0" "2020-04-12 14:23:29.0" "2020-04-12 14:23:29.0"
"A" "Z" 22 "2020-05-13 23:18:19.0" "2020-05-13 23:18:19.0" "2020-05-13 23:18:19.0" "2020-05-13 23:18:19.0"
"A" "X" 23 "2020-04-30 09:53:31.0" "2020-04-30 09:53:31.0" "2020-04-30 09:53:31.0" "2020-04-30 09:53:31.0"
"A" "X" 24 "2020-06-10 10:28:59.0" "2020-06-10 10:28:59.0" "2020-06-10 10:28:59.0" "2020-06-15 11:31:33.0"
"A" "Y" 25 " " "" "" ""
"A" "Y" 26 "2020-05-3012:14:09.0" "2020-05-30 12:14:09.0" "2020-05-30 12:14:09.0" "2020-05-30 12:14:09.0"
"B" "E" 31 "" "" "" "" ""
"B" "C" 32 "2020-06-1 16:43:23.0" "2020-06-12 16:43:23.0" "2020-06-12 16:43:23.0" "2020-06-12 16:43:23.0"
"B" "C" 33 "2019-10-24 22:30:35.0" "2019-10-24 22:30:35.0" "2019-10-24 22:30:35.0" "2019-10-24 22:30:35.0"
"B" "C" 34 "2020-06-09 15:38:18.0" "2020-06-09 15:38:18.0" "2020-06-09 15:38:18.0" "2020-06-15 14:35:41.0"
"B" "C" 35 "2020-06-11 14:39:51.0" "2020-06-11 14:39:51.0" "2020-06-11 14:39:51.0" "2020-06-11 14:39:51.0"
"B" "D" 36 "2020-06-12 11:53:15.0" "2020-06-12 11:53:15.0" "2020-06-12 11:53:15.0" "2020-06-15 13:02:39.0"
"B" "D" 37 "2020-04-21 15:43:43.0" "2020-04-21 15:43:43.0" "2020-04-21 15:43:43.0" "2020-04-21 15:43:43.0"
"B" "D" 38 "2020-05-13 04:07:17.0" "2020-05-13 04:07:17.0" "2020-05-13 04:07:17.0" "2020-05-13 04:07:17.0"
"B" "E" 39 "2020-04-30 13:51:20.0" "2020-04-30 13:51:20.0" "2020-04-30 13:51:20.0" "2020-04-30 13:51:20.0"
"B" "E" 40 "2020-05-12 16:51:01.0" "2020-05-12 16:51:01.0" "2020-05-12 16:51:01.0" "2020-05-12 16:51:01.0"
"B" "E" 41 "2020-04-16 12:14:24.0" "2020-04-16 12:14:24.0" "2020-04-16 12:14:24.0" "2020-04-16 12:14:24.0"
"B" "C" 42 "2018-06-07 15:12:18.0" "2018-06-07 15:12:18.0" "2018-06-07 15:12:18.0" "2018-06-07 15:12:18.0"
"B" "D" 43 "2019-09-28 10:08:51.0" "2019-09-28 10:08:51.0" "2019-09-28 10:08:51.0" "2019-09-28 10:08:51.0"
', header =T)
# TASK: create for each column which contains 'last_upload' new columns
# with date and time
# get the colnames of the cols to be split or separated
ccl <- colnames(df %>% select(last_upload_2020_06_12:last_upload_2020_06_15))
# create new DF with first 3 columns, to which other columns are bound in
# the following function
dff <- df %>% select(district:id)
# function to separate each col in ccl to _date and _time
for (cl in ccl) {
tmp <- separate(df,
col = cl, sep = " ",
into = c(paste0(cl, "_date"), paste0(cl, "_time"))
) %>%
select(contains("_date") | contains("_time"))
dff <- cbind(dff, tmp)
}
dff %>% head()
#> district block id last_upload_2020_06_12_date last_upload_2020_06_12_time
#> 1 A X 11 2020-02-06 11:53:19.0
#> 2 A X 12 2020-06-11 12:40:26.0
#> 3 A X NA <NA> <NA>
#> 4 A X 14 2020-06-12 11:31:07.0
#> 5 A Y 15 2020-06-10 12:45:48.0
#> 6 A Y 16 2020-04-04 02:26:57.0
#> last_upload_2020_06_13_date last_upload_2020_06_13_time
#> 1 2020-02-06 11:53:19.0
#> 2 2020-06-11 12:40:26.0
#> 3 <NA> <NA>
#> 4 2020-06-13 11:31:07.0
#> 5 2020-06-10 12:45:48.0
#> 6 2020-04-04 02:26:57.0
#> last_upload_2020_06_14_date last_upload_2020_06_14_time
#> 1 2020-02-06 11:53:19.0
#> 2 2020-06-14 11:40:26.0
#> 3 2020-06-14 11:0812.0
#> 4 2020-06-14 17:37:07.0
#> 5 2020-06-10 12:45:48.0
#> 6 2020-04-04 02:26:57.0
#> last_upload_2020_06_15_date last_upload_2020_06_15_time
#> 1 2020-02-06 11:53:19.0
#> 2 2020-06-15 18:50:26.0
#> 3 2020-06-14 11:0812.0
#> 4 2020-06-14 17:37:07.0
#> 5 2020-06-10 12:45:48.0
#> 6 2020-04-04 02:26:57.0
# TASK: Calculate the average time of day at which each id uploads
# new DF from original brought into long format
# split the date/time into last_date and last_time
ddf <- df %>%
pivot_longer(cols = last_upload_2020_06_12:last_upload_2020_06_15) %>%
separate(col = value, sep = ' ', into = c('last_date', 'last_time')) %>%
mutate(last_date = lubridate::ymd(last_date), last_time= lubridate::hms(last_time))
# calculate the mean hour of the day at which each id uploads, by
# converting last_time (already a period) to hours and then building
# the mean per group
ddf %>%
mutate(hours = lubridate::time_length(last_time, unit = 'hour')) %>%
group_by(id) %>% summarise(meanHourOfTheDay = mean(hours, na.rm = T))
#> # A tibble: 29 x 2
#> id meanHourOfTheDay
#> <int> <dbl>
#> 1 11 11.9
#> 2 12 14.0
#> 3 14 14.6
#> 4 15 12.8
#> 5 16 2.45
#> 6 17 8.17
#> 7 18 12.1
#> 8 19 15.4
#> 9 20 17.7
#> 10 21 14.4
#> # … with 19 more rows
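As for no. 2 (having R pick the last date automatically), a sketch under the naming scheme used in this reprex: build yesterday's column name with Sys.Date() - 1 and select it. The column name here is hypothetical; it assumes a column for yesterday exists, e.g. last_upload_2020_06_14 when run on 2020-06-15.
# build the name of yesterday's column, then split and extract year/month
last_col <- paste0("last_upload_", format(Sys.Date() - 1, "%Y_%m_%d"))
df %>%
select(district, block, id, last = all_of(last_col)) %>%
separate(last, into = c("last_date", "last_time"), sep = " ") %>%
mutate(last_date = lubridate::ymd(last_date),
year_upload = lubridate::year(last_date),
month_upload = lubridate::month(last_date))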
Here is a sample of my tibble
protein patient value
<chr> <chr> <dbl>
1 BOD1L2 RF0064_Case-9-d- 10.4
2 PPFIA2 RF0064_Case-20-d- 7.83
3 STAT4 RF0064_Case-11-d- 11.0
4 TOM1L2 RF0064_Case-29-d- 13.0
5 SH2D2A RF0064_Case-2-d- 8.28
6 TIGD4 RF0064_Case-49-d- 9.71
In the "patient" column the "d" as in "Case-x-d" represents the a number of days. What I would like to do is create a new column stating whether the strings in the "patient" column contain values less than 14d.
I have managed to do this using the following command:
under14 <- "-1d|-2d|-3d|-4d|-4d|-5d|-6d|-7d|-8d|-9d|-11d|-12d|-13d|-14d"
data <- data %>%
mutate(case=ifelse(grepl(under14,data$patient),'under14days','over14days'))
However, this seems extremely clunky and actually took way too long to type. I will have to change my search term many times, so I would like a quicker way to do this. Perhaps some kind of regex is the best option, but I don't really know where to start with this.
R version 3.5.0 (2018-04-23)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS High Sierra 10.13.5
Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
locale:
[1] en_NZ.UTF-8/en_NZ.UTF-8/en_NZ.UTF-8/C/en_NZ.UTF-8/en_NZ.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] readxl_1.1.0 Rmisc_1.5 plyr_1.8.4 lattice_0.20-35 forcats_0.3.0 stringr_1.3.1 dplyr_0.7.5 purrr_0.2.5
[9] readr_1.1.1 tidyr_0.8.1 tibble_1.4.2 ggplot2_2.2.1 tidyverse_1.2.1
loaded via a namespace (and not attached):
[1] Rcpp_0.12.17 cellranger_1.1.0 pillar_1.2.3 compiler_3.5.0 bindr_0.1.1 tools_3.5.0 lubridate_1.7.4
[8] jsonlite_1.5 nlme_3.1-137 gtable_0.2.0 pkgconfig_2.0.1 rlang_0.2.1 psych_1.8.4 cli_1.0.0
[15] rstudioapi_0.7 yaml_2.1.19 parallel_3.5.0 haven_1.1.1 bindrcpp_0.2.2 xml2_1.2.0 httr_1.3.1
[22] hms_0.4.2 grid_3.5.0 tidyselect_0.2.4 glue_1.2.0 R6_2.2.2 foreign_0.8-70 modelr_0.1.2
[29] reshape2_1.4.3 magrittr_1.5 scales_0.5.0 rvest_0.3.2 assertthat_0.2.0 mnormt_1.5-5 colorspace_1.3-2
[36] utf8_1.1.4 stringi_1.2.3 lazyeval_0.2.1 munsell_0.5.0 broom_0.4.4 crayon_1.3.4
One possibility is to use tidyr::separate
library(tidyverse)
df %>%
separate(patient, into = c("ID1", "Days", "ID2"), sep = "-", extra = "merge", remove = F) %>%
mutate(case = ifelse(as.numeric(Days) <= 14, "under14days", "over14days")) %>%
select(-ID1, -ID2)
# protein patient Days value case
#1 BOD1L2 RF0064_Case-9-d- 9 10.40 under14days
#2 PPFIA2 RF0064_Case-20-d- 20 7.83 over14days
#3 STAT4 RF0064_Case-11-d- 11 11.00 under14days
#4 TOM1L2 RF0064_Case-29-d- 29 13.00 over14days
#5 SH2D2A RF0064_Case-2-d- 2 8.28 under14days
#6 TIGD4 RF0064_Case-49-d- 49 9.71 over14days
Sample data
df <-read.table(text =
" protein patient value
1 BOD1L2 RF0064_Case-9-d- 10.4
2 PPFIA2 RF0064_Case-20-d- 7.83
3 STAT4 RF0064_Case-11-d- 11.0
4 TOM1L2 RF0064_Case-29-d- 13.0
5 SH2D2A RF0064_Case-2-d- 8.28
6 TIGD4 RF0064_Case-49-d- 9.71 ", header = T, row.names = 1)
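A small variant of the same approach: separate() has a convert argument that type-converts the new columns, so Days becomes numeric without an explicit as.numeric() (a sketch):
df %>%
separate(patient, into = c("ID1", "Days", "ID2"), sep = "-",
extra = "merge", remove = FALSE, convert = TRUE) %>%
mutate(case = ifelse(Days <= 14, "under14days", "over14days")) %>%
select(-ID1, -ID2)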
Since the format of patient is clearly defined, a possible base-R solution using gsub is to extract the days and check them against the threshold:
df$case <- ifelse(as.integer(gsub("RF0064_Case-(\\d+)-d-","\\1", df$patient)) <= 14,
"under14days", "over14days")
In exactly the same way, the OP can modify the code used in mutate:
library(dplyr)
df <- df %>%
mutate(case = ifelse(as.integer(gsub("RF0064_Case-(\\d+)-d-","\\1", patient)) <= 14,
"under14days", "over14days"))
df
# protein patient value case
# 1 BOD1L2 RF0064_Case-9-d- 10.40 under14days
# 2 PPFIA2 RF0064_Case-20-d- 7.83 over14days
# 3 STAT4 RF0064_Case-11-d- 11.00 under14days
# 4 TOM1L2 RF0064_Case-29-d- 13.00 over14days
# 5 SH2D2A RF0064_Case-2-d- 8.28 under14days
# 6 TIGD4 RF0064_Case-49-d- 9.71 over14days
Data:
df <- read.table(text =
"protein patient value
1 BOD1L2 RF0064_Case-9-d- 10.4
2 PPFIA2 RF0064_Case-20-d- 7.83
3 STAT4 RF0064_Case-11-d- 11.0
4 TOM1L2 RF0064_Case-29-d- 13.0
5 SH2D2A RF0064_Case-2-d- 8.28
6 TIGD4 RF0064_Case-49-d- 9.71",
header = TRUE, stringsAsFactors = FALSE)
We can also extract the number directly with a regex. (?<=-) is a lookbehind, which matches at a position preceded by "-":
library(tidyverse)
dat2 <- dat %>%
mutate(Day = as.numeric(str_extract(patient, pattern = "(?<=-)[0-9]*"))) %>%
mutate(case = ifelse(Day <= 14,'under14days','over14days'))
dat2
# protein patient value Day case
# 1 BOD1L2 RF0064_Case-9-d- 10.40 9 under14days
# 2 PPFIA2 RF0064_Case-20-d- 7.83 20 over14days
# 3 STAT4 RF0064_Case-11-d- 11.00 11 under14days
# 4 TOM1L2 RF0064_Case-29-d- 13.00 29 over14days
# 5 SH2D2A RF0064_Case-2-d- 8.28 2 under14days
# 6 TIGD4 RF0064_Case-49-d- 9.71 49 over14days
DATA
dat <- read.table(text = " protein patient value
1 BOD1L2 'RF0064_Case-9-d-' 10.4
2 PPFIA2 'RF0064_Case-20-d-' 7.83
3 STAT4 'RF0064_Case-11-d-' 11.0
4 TOM1L2 'RF0064_Case-29-d-' 13.0
5 SH2D2A 'RF0064_Case-2-d-' 8.28
6 TIGD4 'RF0064_Case-49-d-' 9.71",
header = TRUE, stringsAsFactors = FALSE)
I have just started learning the sparklyr package, using the sparklyr reference.
I did what was written in the document. When I use the following code:
delay <- flights_tbl %>%
group_by(tailnum) %>%
summarise(count = n(), dist = mean(distance), delay = mean(arr_delay)) %>%
filter(count > 20, dist < 2000, !is.na(delay)) %>%
collect
Warning messages:
1: Missing values are always removed in SQL.
Use `AVG(x, na.rm = TRUE)` to silence this warning
2: Missing values are always removed in SQL.
Use `AVG(x, na.rm = TRUE)` to silence this warning
> delay
# A tibble: 2,961 x 4
tailnum count dist delay
<chr> <dbl> <dbl> <dbl>
1 N14228 111 1547 3.71
2 N24211 130 1330 7.70
3 N668DN 49.0 1028 2.62
4 N39463 107 1588 2.16
5 N516JB 288 1249 12.0
6 N829AS 230 228 17.2
7 N3ALAA 63.0 1078 3.59
8 N793JB 283 1529 4.72
9 N657JB 285 1286 5.03
10 N53441 102 1661 0.941
# ... with 2,951 more rows
In a similar way, I want to apply the same operations to the nycflights13::flights dataset using the dplyr package:
nycflights13::flights %>%
group_by(tailnum) %>%
summarise(count = n(), dist = mean(distance), delay = mean(arr_delay)) %>%
filter(count > 20, dist < 2000, !is.na(delay))
# A tibble: 1,319 x 4
tailnum count dist delay
<chr> <int> <dbl> <dbl>
1 N102UW 48 536 2.94
2 N103US 46 535 - 6.93
3 N105UW 45 525 - 0.267
4 N107US 41 529 - 5.73
5 N108UW 60 534 - 1.25
6 N109UW 48 536 - 2.52
7 N110UW 40 535 2.80
8 N111US 30 536 - 0.467
9 N11206 111 1414 12.7
10 N112US 38 535 - 0.947
# ... with 1,309 more rows
My problem is: why am I getting different results?
As mentioned in the documentation, sparklyr provides a complete dplyr backend.
> sessionInfo()
R version 3.4.0 (2017-04-21)
Platform: i386-w64-mingw32/i386 (32-bit)
Running under: Windows 7 (build 7601) Service Pack 1
Matrix products: default
locale:
[1] LC_COLLATE=English_United States.1252 [2] LC_CTYPE=English_United States.1252
[3] LC_MONETARY=English_United States.1252 [4] LC_NUMERIC=C
[5] LC_TIME=English_United States.1252
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] bindrcpp_0.2 dplyr_0.7.4 sparklyr_0.7.0
loaded via a namespace (and not attached):
[1] DBI_0.7 readr_1.1.1 withr_2.1.1
[4] nycflights13_0.2.2 rprojroot_1.3-2 lattice_0.20-35
[7] foreign_0.8-69 pkgconfig_2.0.1 config_0.2
[10] utf8_1.1.3 compiler_3.4.0 stringr_1.3.0
[13] parallel_3.4.0 xtable_1.8-2 Rcpp_0.12.15
[16] cli_1.0.0 shiny_1.0.5 plyr_1.8.4
[19] httr_1.3.1 tools_3.4.0 openssl_1.0
[22] nlme_3.1-131.1 broom_0.4.3 R6_2.2.2
[25] dbplyr_1.2.1 bindr_0.1 purrr_0.2.4
[28] assertthat_0.2.0 curl_3.1 digest_0.6.15
[31] mime_0.5 stringi_1.1.6 rstudioapi_0.7
[34] reshape2_1.4.3 hms_0.4.1 backports_1.1.2
[37] htmltools_0.3.6 grid_3.4.0 glue_1.2.0
[40] httpuv_1.3.5 rlang_0.2.0 psych_1.7.8
[43] magrittr_1.5 rappdirs_0.3.1 lazyeval_0.2.1
[46] yaml_2.1.16 crayon_1.3.4 tidyr_0.8.0
[49] pillar_1.1.0 base64enc_0.1-3 mnormt_1.5-5
[52] jsonlite_1.5 tibble_1.4.2 Lahman_6.0-0
The key difference is that the non-sparklyr version does not use na.rm = TRUE in mean(), so any 'tailnum' group containing an NA in 'distance' or 'arr_delay' gets an NA mean and is then dropped by the filter. In sparklyr, SQL aggregates always remove missing values, so the argument is not needed.
We can check the NA elements in 'distance' and 'arr_delay':
nycflights13::flights %>%
summarise_at(vars(distance, arr_delay), funs(sum(is.na(.))))
# A tibble: 1 x 2
# distance arr_delay
# <int> <int>
#1 0 9430 #### number of NAs
So, if we correct for that, then the output will be the same
res <- nycflights13::flights %>%
group_by(tailnum) %>%
summarise(count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)) %>%
filter(count > 20, dist < 2000, !is.na(delay)) %>%
arrange(tailnum)
res
# A tibble: 2,961 x 4
# tailnum count dist delay
# <chr> <int> <dbl> <dbl>
# 1 N0EGMQ 371 676 9.98
# 2 N10156 153 758 12.7
# 3 N102UW 48 536 2.94
# 4 N103US 46 535 - 6.93
# 5 N104UW 47 535 1.80
# 6 N10575 289 520 20.7
# 7 N105UW 45 525 - 0.267
# 8 N107US 41 529 - 5.73
# 9 N108UW 60 534 - 1.25
#10 N109UW 48 536 - 2.52
# ... with 2,951 more rows
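As an aside, funs() as used in the NA check above is deprecated in recent dplyr; the same check can be written with across() (a sketch):
nycflights13::flights %>%
summarise(across(c(distance, arr_delay), ~ sum(is.na(.x))))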
Using sparklyr
library(sparklyr)
library(dplyr)
library(nycflights13)
sc <- spark_connect(master = "local")
flights_tbl <- copy_to(sc, nycflights13::flights, "flights")
delay <- flights_tbl %>%
group_by(tailnum) %>%
summarise(count = n(), dist = mean(distance), delay = mean(arr_delay)) %>%
filter(count > 20, dist < 2000, !is.na(delay)) %>%
arrange(tailnum) %>%
collect
delay
# A tibble: 2,961 x 4
# tailnum count dist delay
# <chr> <dbl> <dbl> <dbl>
# 1 N0EGMQ 371 676 9.98
# 2 N10156 153 758 12.7
# 3 N102UW 48.0 536 2.94
# 4 N103US 46.0 535 - 6.93
# 5 N104UW 47.0 535 1.80
# 6 N10575 289 520 20.7
# 7 N105UW 45.0 525 - 0.267
# 8 N107US 41.0 529 - 5.73
# 9 N108UW 60.0 534 - 1.25
#10 N109UW 48.0 536 - 2.52
# ... with 2,951 more rows