How to sum columns and rows in a wide R dataframe? - r

I'd like to mutate by dataframe by summing both columns and rows.
mydata <-structure(list(description.y = c("X1", "X2"), `2011` = c(13185.66,
82444.01), `2012` = c(14987.61, 103399.4), `2013` = c(26288.98,
86098.22), `2014` = c(15238.21, 88540.04), `2015` = c(15987.11,
113145.1), `2016` = c(16324.57, 113196.2), `2017` = c(16594.87,
122167.57), `2018` = c(20236.02, 120058.21), `2019` = c(20626.69,
130699.68), `2020` = c(19553.83, 136464.31), `2021` = c(10426.32,
56392.28)), class = c("grouped_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -2L), groups = structure(list(description.y = c("X1",
"X2"), .rows = structure(list(1L, 2L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -2L), .drop = TRUE))
I can sum rows like this
mydata1 <- mydata %>%
mutate(Total = rowSums(across(where(is.numeric))))
Which provides an extra column with totals for the rows
But I'm not sure how to add Columns to the dataframe while also retaining all existing values
I've tried this but it doesn't work. Any thoughts?
mydata1 <- mydata %>%
mutate(Total = rowSums(across(where(is.numeric)))) %>%
mutate(Total = colSums(across(where(is.numeric))))

Update: See comment #Mwavu -> many thanks!
direct solution with adorn_total():
mydata %>% adorn_totals(where = c("row", "col"))
First answer:
We could use adorn_totals()
library(dplyr)
library(janitor)
mydata %>%
mutate(Total = rowSums(across(where(is.numeric)))) %>%
adorn_totals()
description.y 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 Total
X1 13185.66 14987.61 26288.98 15238.21 15987.11 16324.57 16594.87 20236.02 20626.69 19553.83 10426.32 189449.9
X2 82444.01 103399.40 86098.22 88540.04 113145.10 113196.20 122167.57 120058.21 130699.68 136464.31 56392.28 1152605.0
Total 95629.67 118387.01 112387.20 103778.25 129132.21 129520.77 138762.44 140294.23 151326.37 156018.14 66818.60 1342054.9

Another way is to first summarize and then bind_rows:
library(dplyr)
mydata %>%
ungroup() %>%
mutate(Total = rowSums(across(where(is.numeric)))) %>%
bind_rows(summarize(., description.y = "Total", across(where(is.numeric), sum)))
Output
# A tibble: 3 x 13
description.y `2011` `2012` `2013` `2014` `2015` `2016` `2017` `2018` `2019` `2020` `2021` Total
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 X1 13186. 14988. 26289. 15238. 15987. 16325. 16595. 20236. 20627. 19554. 10426. 189450.
2 X2 82444. 103399. 86098. 88540. 113145. 113196. 122168. 120058. 130700. 136464. 56392. 1152605.
3 Total 95630. 118387. 112387. 103778. 129132. 129521. 138762. 140294. 151326. 156018. 66819. 1342055.

Related

R pivot wider error. listcols output error

I have a dataset like below,
df1<-structure(list(Uniprot_IDs = c("P0A799|PGK", "P0A853|TNAA", "P0CE47|EFTU1",
"P0A6F3|GLPK", "P0A6F5|CH60", "P0A9B2|G3P1", "P0A853|TNAA", "P0A6P1|EFTS",
"P0A6P1|EFTS", "P0A799|PGK"), `1_3ng` = c(12305960196.5721, 24169710612.0476,
NA, 8553811608.70032, 13176265141.6301, 92994780469.5607, 11373139178.993,
NA, 8062061247.94512, 3484150815.20598), `2_3ng` = c(11629654800,
25162283400, 31864546300, 8157173240, 12812379370, 90007498700,
10191440110, NA, 7911370530, 3406054010), `3_3ng` = c(12503938417.8663,
25733015601.0117, 34727094361.2997, 8857104380.18179, NA, 93988723611.341,
11653192532.4546, NA, 7933102839.01341, NA), `4_7-5ng` = c(NA,
79582218995.1549, 77615759060.3497, 21749287984.8341, 33342436650.5148,
101254055758.836, 30624750667.6451, 39438567251.7351, 10726988796.4798,
7850501475.22747), `5_7-5ng` = c(NA, 78743355495.2545, 81948536416.9992,
NA, 34617564902.3219, 99485017820.8478, NA, 40420212151.9563,
14804870783.7792, 8280398872.03417), `6_7-5ng` = c(NA, 80272416055.8845,
77019098847.8474, 23045479130.9574, 32885483296.8046, 90789109337.1181,
30678346321.0037, 37073444001.0421, 13710097518.7425, 7916821420.64152
), `7_10ng` = c(22617928037.5148, 97473230025.8853, 91579176089.4265,
28086665669.9634, 38033883000.8102, NA, 37181868033.5073, 44274304023.6936,
NA, 9288965106.5049), `8_10ng` = c(22091136513.3736, NA, 90754802145.7813,
26405368418.6503, 36442770423.3661, NA, 36789459227.7515, 42793252584.0984,
15307787846.1716, 8834742124.86943), `9_10ng` = c(24125219176.3177,
98420360686.1339, 99355131865.2305, 28271975548.9608, 39837381317.8216,
NA, 39481996086.9157, 47261977623.5276, 16463020175.2068, 9931809132.696
), `10_15ng` = c(30252776887.1842, 141726904178.35, 130889671408.26,
38206477283.6549, 56021084469.4745, 100336249543.662, 53295491175.4506,
62883519160.5278, NA, 13994955303.4972), `11_15ng` = c(32859283128.8916,
161633827056.573, NA, 45497410866.4248, 61586094337.2513, NA,
60508117975.6097, 73276218943.4545, NA, 15400735421.5), `12_15ng` = c(34372085877.8071,
165557046117.222, 153975644961.53, 46279635074.4959, 61867667358.3367,
106133922907.254, 63526552497.161, 76374667334.5682, NA, 15329671283.3959
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-10L))
And a group data,
groups <- structure(list(samples = c("1_3ng", "2_3ng", "3_3ng", "4_7-5ng",
"5_7-5ng", "6_7-5ng", "7_10ng", "8_10ng", "9_10ng", "10_15ng",
"11_15ng", "12_15ng"), groups = c("GrA", "GrA", "GrA", "GrB",
"GrB", "GrB", "GrC", "GrC", "GrC", "GrD", "GrD", "GrD")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -12L))
And I have used the following codes for removing the groupwise missing rows from the dataset.
new_colnames <- c("name", "group")
colnames(groups)<-new_colnames
x <- colnames(df1[,1])
df2 <- setNames(df1,replace(names(df1),names(df1)==x,"rowid"))
com_data <- df2 %>%
pivot_longer(!rowid, values_to="mass") %>%
inner_join(groups, by="name") %>%
group_by(name) %>%
filter(!all(is.na(mass))) %>%
ungroup() %>%
add_count(rowid) %>%
filter(n == max(n)) %>%
select(!c(group, n)) %>%
pivot_wider(names_from=name, values_from=mass)
But I am not getting the desired output, it was like below,
structure(list(rowid = c("P0A799|PGK", "P0A853|TNAA", "P0A6P1|EFTS"
), `1_3ng` = list(c(12305960196.5721, 3484150815.20598), c(24169710612.0476,
11373139178.993), c(NA, 8062061247.94512)), `2_3ng` = list(c(11629654800,
3406054010), c(25162283400, 10191440110), c(NA, 7911370530)),
`3_3ng` = list(c(12503938417.8663, NA), c(25733015601.0117,
11653192532.4546), c(NA, 7933102839.01341)), `4_7-5ng` = list(
c(NA, 7850501475.22747), c(79582218995.1549, 30624750667.6451
), c(39438567251.7351, 10726988796.4798)), `5_7-5ng` = list(
c(NA, 8280398872.03417), c(78743355495.2545, NA), c(40420212151.9563,
14804870783.7792)), `6_7-5ng` = list(c(NA, 7916821420.64152
), c(80272416055.8845, 30678346321.0037), c(37073444001.0421,
13710097518.7425)), `7_10ng` = list(c(22617928037.5148, 9288965106.5049
), c(97473230025.8853, 37181868033.5073), c(44274304023.6936,
NA)), `8_10ng` = list(c(22091136513.3736, 8834742124.86943
), c(NA, 36789459227.7515), c(42793252584.0984, 15307787846.1716
)), `9_10ng` = list(c(24125219176.3177, 9931809132.696),
c(98420360686.1339, 39481996086.9157), c(47261977623.5276,
16463020175.2068)), `10_15ng` = list(c(30252776887.1842,
13994955303.4972), c(141726904178.35, 53295491175.4506),
c(62883519160.5278, NA)), `11_15ng` = list(c(32859283128.8916,
15400735421.5), c(161633827056.573, 60508117975.6097), c(73276218943.4545,
NA)), `12_15ng` = list(c(34372085877.8071, 15329671283.3959
), c(165557046117.222, 63526552497.161), c(76374667334.5682,
NA))), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-3L))
And the warning message was,
Warning message:
Values from `mass` are not uniquely identified; output will contain list-cols.
* Use `values_fn = list` to suppress this warning.
* Use `values_fn = {summary_fun}` to summarise duplicates.
* Use the following dplyr code to identify duplicates.
{data} %>%
dplyr::group_by(rowid, name) %>%
dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
dplyr::filter(n > 1L)
How to get the dataframe output which contain single values in each column instead of list-cols.
My desired output is,
structure(list(Uniprot_IDs = c("P0A853|TNAA", "P0CE47|EFTU1",
"P0A6F3|GLPK", "P0A6F5|CH60", "P0A853|TNAA", "P0A799|PGK"), `1_3ng` = c(24169710612.0476,
NA, 8553811608.70032, 13176265141.6301, 11373139178.993, 3484150815.20598
), `2_3ng` = c(25162283400, 31864546300, 8157173240, 12812379370,
10191440110, 3406054010), `3_3ng` = c(25733015601.0117, 34727094361.2997,
8857104380.18179, NA, 11653192532.4546, NA), `4_7-5ng` = c(79582218995.1549,
77615759060.3497, 21749287984.8341, 33342436650.5148, 30624750667.6451,
7850501475.22747), `5_7-5ng` = c(78743355495.2545, 81948536416.9992,
NA, 34617564902.3219, NA, 8280398872.03417), `6_7-5ng` = c(80272416055.8845,
77019098847.8474, 23045479130.9574, 32885483296.8046, 30678346321.0037,
7916821420.64152), `7_10ng` = c(97473230025.8853, 91579176089.4265,
28086665669.9634, 38033883000.8102, 37181868033.5073, 9288965106.5049
), `8_10ng` = c(NA, 90754802145.7813, 26405368418.6503, 36442770423.3661,
36789459227.7515, 8834742124.86943), `9_10ng` = c(98420360686.1339,
99355131865.2305, 28271975548.9608, 39837381317.8216, 39481996086.9157,
9931809132.696), `10_15ng` = c(141726904178.35, 130889671408.26,
38206477283.6549, 56021084469.4745, 53295491175.4506, 13994955303.4972
), `11_15ng` = c(161633827056.573, NA, 45497410866.4248, 61586094337.2513,
60508117975.6097, 15400735421.5), `12_15ng` = c(165557046117.222,
153975644961.53, 46279635074.4959, 61867667358.3367, 63526552497.161,
15329671283.3959)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-6L))
The issue is that your rowid does not uniquely identify observations as there are multiple obs for some rowids. To account for that you could add an additional identifier before reshaping to long:
library(tidyr)
library(dplyr)
com_data <- df2 %>%
group_by(rowid) %>%
mutate(id = row_number()) %>%
ungroup() %>%
pivot_longer(-c(rowid, id), values_to="mass") %>%
inner_join(groups, by="name") %>%
group_by(name) %>%
filter(!all(is.na(mass))) %>%
ungroup() %>%
add_count(rowid) %>%
filter(n == max(n)) %>%
select(!c(group, n)) %>%
arrange(rowid, id) %>%
pivot_wider(names_from=name, values_from=mass) %>%
select(-id)
com_data
#> # A tibble: 6 × 13
#> rowid `1_3ng` `2_3ng` `3_3ng` `4_7-5ng` `5_7-5ng` `6_7-5ng` `7_10ng`
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 P0A6P1|EFTS NA NA NA 3.94e10 4.04e10 3.71e10 4.43e10
#> 2 P0A6P1|EFTS 8.06e 9 7.91e 9 7.93e 9 1.07e10 1.48e10 1.37e10 NA
#> 3 P0A799|PGK 1.23e10 1.16e10 1.25e10 NA NA NA 2.26e10
#> 4 P0A799|PGK 3.48e 9 3.41e 9 NA 7.85e 9 8.28e 9 7.92e 9 9.29e 9
#> 5 P0A853|TNAA 2.42e10 2.52e10 2.57e10 7.96e10 7.87e10 8.03e10 9.75e10
#> 6 P0A853|TNAA 1.14e10 1.02e10 1.17e10 3.06e10 NA 3.07e10 3.72e10
#> # … with 5 more variables: `8_10ng` <dbl>, `9_10ng` <dbl>, `10_15ng` <dbl>,
#> # `11_15ng` <dbl>, `12_15ng` <dbl>
EDIT To get your desired result is a different issue but could be achieved like so:
com_data <- df2 %>%
group_by(rowid) %>%
mutate(id = row_number()) %>%
ungroup() %>%
pivot_longer(-c(rowid, id), values_to = "mass") %>%
inner_join(groups, by = "name") %>%
add_count(rowid, id, group, wt = !is.na(mass)) %>%
group_by(rowid, id) %>%
filter(!any(n == 0)) %>%
ungroup() %>%
select(!c(group, n)) %>%
pivot_wider(names_from = name, values_from = mass) %>%
select(-id)
com_data
#> # A tibble: 6 × 13
#> rowid `1_3ng` `2_3ng` `3_3ng` `4_7-5ng` `5_7-5ng` `6_7-5ng` `7_10ng`
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 P0A853|TNAA 2.42e10 2.52e10 2.57e10 7.96e10 7.87e10 8.03e10 9.75e10
#> 2 P0CE47|EFTU1 NA 3.19e10 3.47e10 7.76e10 8.19e10 7.70e10 9.16e10
#> 3 P0A6F3|GLPK 8.55e 9 8.16e 9 8.86e 9 2.17e10 NA 2.30e10 2.81e10
#> 4 P0A6F5|CH60 1.32e10 1.28e10 NA 3.33e10 3.46e10 3.29e10 3.80e10
#> 5 P0A853|TNAA 1.14e10 1.02e10 1.17e10 3.06e10 NA 3.07e10 3.72e10
#> 6 P0A799|PGK 3.48e 9 3.41e 9 NA 7.85e 9 8.28e 9 7.92e 9 9.29e 9
#> # … with 5 more variables: `8_10ng` <dbl>, `9_10ng` <dbl>, `10_15ng` <dbl>,
#> # `11_15ng` <dbl>, `12_15ng` <dbl>

How to unnest a data frame containing list of list with varied length?

I was trying to unnest the the following data frame.
df.org <- structure(list(Gene = "ARIH1", Description = "E3 ubiquitin-protein ligase ARIH1",
condition2_cellline = list(c("MCF7", "Jurkat")), condition2_activity = list(
c(40.8284023668639, 13.26973)), condition2_concentration = list(
c("100uM", "100uM")), condition3_cellline = list("Jurkat"),
condition3_activity = list(-4.60251), condition3_concentration = list(
"100uM")), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame"))
This is my code:
df.output <- df.ori %>%
unnest(where(is.list), keep_empty = T)
This is what I got:
structure(list(Gene = c("ARIH1", "ARIH1"), Description = c("E3 ubiquitin-protein ligase ARIH1",
"E3 ubiquitin-protein ligase ARIH1"), condition2_cellline = c("MCF7",
"Jurkat"), condition2_activity = c(40.8284023668639, 13.26973
), condition2_concentration = c("100uM", "100uM"), condition3_cellline = c("Jurkat",
"Jurkat"), condition3_activity = c(-4.60251, -4.60251), condition3_concentration = c("100uM",
"100uM")), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-2L))
Is there a way to avoid duplicating those variables with a shorter length? The following output is what I want to get.
df.desired <- structure(list(Gene = c("ARIH1", "ARIH1"), Description = c("E3 ubiquitin-protein ligase ARIH1",
"E3 ubiquitin-protein ligase ARIH1"), condition2_cellline = c("MCF7",
"Jurkat"), condition2_activity = c(40.8284023668639, 13.26973
), condition2_concentration = c("100uM", "100uM"), condition3_cellline = c(NA,
"Jurkat"), condition3_activity = c(NA, -4.60251), condition3_concentration = c(NA,
"100uM")), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-2L))
Thanks so much for any help!
We could also do without reshaping i.e. get the max of the list column lengths in a column, then loop across those list columns, modify the length with the max value and use unnest
library(dplyr)
library(purrr)
library(tidyr)
df.org %>%
mutate(l1 = max(across(where(is.list), lengths)),
across(where(is.list), ~ map(.x, `length<-`, l1)), l1 = NULL) %>%
unnest(where(is.list), keep_empty = TRUE)
-output
# A tibble: 2 × 8
Gene Description condition2_cellline condition2_activity condition2_concentration condition3_cellline condition3_activity condition3_concentration
<chr> <chr> <chr> <dbl> <chr> <chr> <dbl> <chr>
1 ARIH1 E3 ubiquitin-protein ligase ARIH1 MCF7 40.8 100uM Jurkat -4.60 100uM
2 ARIH1 E3 ubiquitin-protein ligase ARIH1 Jurkat 13.3 100uM <NA> NA <NA>
Here is suggestion how it could work.
We pivot_longer all listed columns.
apply the the function to create lists of same length
pivot back and unnest.
library(dplyr)
library(tidyr)
df.org %>%
pivot_longer(cols = starts_with("condition")) %>%
mutate(value = lapply(value, `length<-`, max(lengths(value)))) %>%
pivot_wider(names_from = name, values_from = value) %>%
unnest(cols = c(condition2_cellline, condition2_activity, condition2_concentration,
condition3_cellline, condition3_activity, condition3_concentration))
Gene Description condition2_cell~ condition2_acti~ condition2_conc~ condition3_cell~ condition3_acti~ condition3_conc~
<chr> <chr> <chr> <dbl> <chr> <chr> <dbl> <chr>
1 ARIH1 E3 ubiquitin-prot~ MCF7 40.8 100uM Jurkat -4.60 100uM
2 ARIH1 E3 ubiquitin-prot~ Jurkat 13.3 100uM NA NA NA
>

how to filter groups matching more than one values?

id drug_name med_start med_end
<dbl> <chr> <date> <date>
1 pembrolizumab 2018-02-07 2018-02-07
1 pembrolizumab 2018-02-28 2018-02-28
2 pembrolizumab 2018-01-05 2018-01-05
2 nivolumab 2018-09-20 2018-09-20
2 nivolumab 2018-10-03 2018-10-03
2 nivolumab 2018-11-01 2018-11-01
I am trying to get ids who have both pembrolizumab and nivolumab in drug_name. Can I do a group_by over id? And then filter with both conditions?
For above table, id 2 has both drug_names. I might have situation where I will be filtering more than 2 drug_names.
I am also trying to find to see if the gap between two med_start is greater than x days. Let's say 30 days. Basically filter ids who have gap of 30 days between med_start.
Here is the code for above data
data <- structure(list(id = structure(c(1, 1, 2, 2, 2, 2), class = "int"),
drug_name = c("pembrolizumab", "pembrolizumab", "pembrolizumab",
"nivolumab", "nivolumab", "nivolumab"), med_start = structure(c(17569,
17590, 17536, 17794, 17807, 17836), class = "Date"), med_end = structure(c(17569,
17590, 17536, 17794, 17807, 17836), class = "Date")), row.names = c(NA,
-6L), groups = structure(list(patient_id = structure(c(1.49283861796358e-314,
1.6423825257779e-313), class = "integer64"), .rows = structure(list(
1:2, 3:6), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -2L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
We group by 'id', and filter where all the drugs of interest are %in% the 'drug_name' column, and extract the unique 'id'
library(dplyr)
data %>%
group_by(id) %>%
filter(all(c("pembrolizumab", "nivolumab") %in% drug_name)) %>%
ungroup %>%
pull(id)%>%
unique
-output
[1] 2
Here are some base R options
for the first question
> unique(
+ subset(
+ data,
+ ave(match(drug_name, c("pembrolizumab", "nivolumab")), id, FUN = var) > 0,
+ select = id
+ )
+ )
# A tibble: 1 x 1
id
<int>
1 2
for the second question
> subset(
+ data,
+ ave(as.integer(med_start), id, FUN = function(x) max(diff(x))) <= 30
+ )
# A tibble: 2 x 4
id drug_name med_start med_end
<int> <chr> <date> <date>
1 1 pembrolizumab 2018-02-07 2018-02-07
2 1 pembrolizumab 2018-02-28 2018-02-28

Does any column match requirement?

I have a count of stems by tree species for different plots.
structure(list(Plot = c(1, 2), Pine = c(0, 430), Spruce = c(708,
1241), Birch = c(119, 48), Aspen = c(0, 0), Salix = c(0, 0),
Rowan = c(0, 0), Alnus = c(0, 0), stem_sum = c(827, 1719)), row.names = c(NA,
-2L), groups = structure(list(.rows = structure(list(1L, 2L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -2L), class = c("tbl_df",
"tbl", "data.frame")), class = c("rowwise_df", "tbl_df", "tbl",
"data.frame"))
What I want to do is use dplyr 1.0 convention to mutate a new column, "Main species", if any of the tree species columns exceed 80% of the stem_sum of that plot.
My thought process:
df %>% rowwise() %>% mutate(`Main species`= c_across(Pine:Alnus, if(.. / stem/sum >= 0.8, paste(...))
How can I modify this code such that if there are more than one column which fulfils the requirement, the output will be "Mixed"?
You can use :
library(dplyr)
df %>%
rowwise() %>%
mutate(Main_Species = if(any(c_across(Pine:Alnus) >= 0.8 * stem_sum))
'Mixed' else 'Not Mixed')
# Plot Pine Spruce Birch Aspen Salix Rowan Alnus stem_sum Main_Species
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
#1 1 0 708 119 0 0 0 0 827 Mixed
#2 2 430 1241 48 0 0 0 0 1719 Not Mixed
Or in base R :
df$Main_species <- ifelse(rowSums(df[2:8] >= df$stem_sum * 0.8) > 0,
'Mixed', 'Not Mixed')
library(tidyverse)
df %>%
pivot_longer(-c(Plot, stem_sum)) %>%
arrange(Plot, desc(value)) %>%
group_by(Plot) %>%
mutate(pct = value/stem_sum,
main_species = case_when(
pct > 0.8 & pct == max(pct) ~ name,
pct == max(pct) ~ "mixed"
)
) %>% ungroup() %>%
fill(main_species, .direction = "down") %>%
select(-pct) %>%
pivot_wider()

How to use the function slice correctly

I was looking to use slice, and this example here seemed to be most applicable. However, when I tried this for my data in didn't work but I'm not sure what I am missing.
Data
library(tidyverse)
library(lubridate)
# Data
my_df <- structure(list(Date = structure(c(17896, 17986, 18077, 18160), class = "Date"),
t_period = c("Quarter", "Quarter", "Quarter", "Quarter"),
A = c(-10.2, 9.4, 3.2, 0.7), B = c(-13.6, 13.4, 6.2, 2.8)), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -4L), groups = structure(list(
Date = structure(c(17896, 17986, 18077, 18160), class = "Date"),
.rows = list(1L, 2L, 3L, 4L)), row.names = c(NA, -4L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE))
# my_df
# A tibble: 4 x 4
# Groups: Date [4]
# Date t_period A B
# <date> <chr> <dbl> <dbl>
#1 2018-12-31 Quarter -10.2 -13.6
#2 2019-03-31 Quarter 9.4 13.4
#3 2019-06-30 Quarter 3.2 6.2
#4 2019-09-21 Quarter 0.7 2.8
What I want to do is filter out the last quarter date if it is before the quarter end.
# Get end of existing quarter
end_of_qtr <- ceiling_date(as.Date("2019-09-28"), "quarter") - 1
# Filter out last row if Date < quarter end date
filtered_df <- my_df %>%
group_by(t_period) %>%
arrange(Date) %>%
slice(ifelse( n() < end_of_qtr, 1:(n()-1), n() ))
However, this does not return the desired result, which would be the first 3 rows of my_data since the last date ("2019-09-21" is before the end_of_qtr date "2019-09-30")
Could someone tell me the correct way to use slice to do this please?

Resources