I have a dataset like below,
df1<-structure(list(Uniprot_IDs = c("P0A799|PGK", "P0A853|TNAA", "P0CE47|EFTU1",
"P0A6F3|GLPK", "P0A6F5|CH60", "P0A9B2|G3P1", "P0A853|TNAA", "P0A6P1|EFTS",
"P0A6P1|EFTS", "P0A799|PGK"), `1_3ng` = c(12305960196.5721, 24169710612.0476,
NA, 8553811608.70032, 13176265141.6301, 92994780469.5607, 11373139178.993,
NA, 8062061247.94512, 3484150815.20598), `2_3ng` = c(11629654800,
25162283400, 31864546300, 8157173240, 12812379370, 90007498700,
10191440110, NA, 7911370530, 3406054010), `3_3ng` = c(12503938417.8663,
25733015601.0117, 34727094361.2997, 8857104380.18179, NA, 93988723611.341,
11653192532.4546, NA, 7933102839.01341, NA), `4_7-5ng` = c(NA,
79582218995.1549, 77615759060.3497, 21749287984.8341, 33342436650.5148,
101254055758.836, 30624750667.6451, 39438567251.7351, 10726988796.4798,
7850501475.22747), `5_7-5ng` = c(NA, 78743355495.2545, 81948536416.9992,
NA, 34617564902.3219, 99485017820.8478, NA, 40420212151.9563,
14804870783.7792, 8280398872.03417), `6_7-5ng` = c(NA, 80272416055.8845,
77019098847.8474, 23045479130.9574, 32885483296.8046, 90789109337.1181,
30678346321.0037, 37073444001.0421, 13710097518.7425, 7916821420.64152
), `7_10ng` = c(22617928037.5148, 97473230025.8853, 91579176089.4265,
28086665669.9634, 38033883000.8102, NA, 37181868033.5073, 44274304023.6936,
NA, 9288965106.5049), `8_10ng` = c(22091136513.3736, NA, 90754802145.7813,
26405368418.6503, 36442770423.3661, NA, 36789459227.7515, 42793252584.0984,
15307787846.1716, 8834742124.86943), `9_10ng` = c(24125219176.3177,
98420360686.1339, 99355131865.2305, 28271975548.9608, 39837381317.8216,
NA, 39481996086.9157, 47261977623.5276, 16463020175.2068, 9931809132.696
), `10_15ng` = c(30252776887.1842, 141726904178.35, 130889671408.26,
38206477283.6549, 56021084469.4745, 100336249543.662, 53295491175.4506,
62883519160.5278, NA, 13994955303.4972), `11_15ng` = c(32859283128.8916,
161633827056.573, NA, 45497410866.4248, 61586094337.2513, NA,
60508117975.6097, 73276218943.4545, NA, 15400735421.5), `12_15ng` = c(34372085877.8071,
165557046117.222, 153975644961.53, 46279635074.4959, 61867667358.3367,
106133922907.254, 63526552497.161, 76374667334.5682, NA, 15329671283.3959
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-10L))
And a group data,
groups <- structure(list(samples = c("1_3ng", "2_3ng", "3_3ng", "4_7-5ng",
"5_7-5ng", "6_7-5ng", "7_10ng", "8_10ng", "9_10ng", "10_15ng",
"11_15ng", "12_15ng"), groups = c("GrA", "GrA", "GrA", "GrB",
"GrB", "GrB", "GrC", "GrC", "GrC", "GrD", "GrD", "GrD")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -12L))
And I have used the following codes for removing the groupwise missing rows from the dataset.
new_colnames <- c("name", "group")
colnames(groups)<-new_colnames
x <- colnames(df1[,1])
df2 <- setNames(df1,replace(names(df1),names(df1)==x,"rowid"))
com_data <- df2 %>%
pivot_longer(!rowid, values_to="mass") %>%
inner_join(groups, by="name") %>%
group_by(name) %>%
filter(!all(is.na(mass))) %>%
ungroup() %>%
add_count(rowid) %>%
filter(n == max(n)) %>%
select(!c(group, n)) %>%
pivot_wider(names_from=name, values_from=mass)
But I am not getting the desired output, it was like below,
structure(list(rowid = c("P0A799|PGK", "P0A853|TNAA", "P0A6P1|EFTS"
), `1_3ng` = list(c(12305960196.5721, 3484150815.20598), c(24169710612.0476,
11373139178.993), c(NA, 8062061247.94512)), `2_3ng` = list(c(11629654800,
3406054010), c(25162283400, 10191440110), c(NA, 7911370530)),
`3_3ng` = list(c(12503938417.8663, NA), c(25733015601.0117,
11653192532.4546), c(NA, 7933102839.01341)), `4_7-5ng` = list(
c(NA, 7850501475.22747), c(79582218995.1549, 30624750667.6451
), c(39438567251.7351, 10726988796.4798)), `5_7-5ng` = list(
c(NA, 8280398872.03417), c(78743355495.2545, NA), c(40420212151.9563,
14804870783.7792)), `6_7-5ng` = list(c(NA, 7916821420.64152
), c(80272416055.8845, 30678346321.0037), c(37073444001.0421,
13710097518.7425)), `7_10ng` = list(c(22617928037.5148, 9288965106.5049
), c(97473230025.8853, 37181868033.5073), c(44274304023.6936,
NA)), `8_10ng` = list(c(22091136513.3736, 8834742124.86943
), c(NA, 36789459227.7515), c(42793252584.0984, 15307787846.1716
)), `9_10ng` = list(c(24125219176.3177, 9931809132.696),
c(98420360686.1339, 39481996086.9157), c(47261977623.5276,
16463020175.2068)), `10_15ng` = list(c(30252776887.1842,
13994955303.4972), c(141726904178.35, 53295491175.4506),
c(62883519160.5278, NA)), `11_15ng` = list(c(32859283128.8916,
15400735421.5), c(161633827056.573, 60508117975.6097), c(73276218943.4545,
NA)), `12_15ng` = list(c(34372085877.8071, 15329671283.3959
), c(165557046117.222, 63526552497.161), c(76374667334.5682,
NA))), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-3L))
And the warning message was,
Warning message:
Values from `mass` are not uniquely identified; output will contain list-cols.
* Use `values_fn = list` to suppress this warning.
* Use `values_fn = {summary_fun}` to summarise duplicates.
* Use the following dplyr code to identify duplicates.
{data} %>%
dplyr::group_by(rowid, name) %>%
dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
dplyr::filter(n > 1L)
How to get the dataframe output which contain single values in each column instead of list-cols.
My desired output is,
structure(list(Uniprot_IDs = c("P0A853|TNAA", "P0CE47|EFTU1",
"P0A6F3|GLPK", "P0A6F5|CH60", "P0A853|TNAA", "P0A799|PGK"), `1_3ng` = c(24169710612.0476,
NA, 8553811608.70032, 13176265141.6301, 11373139178.993, 3484150815.20598
), `2_3ng` = c(25162283400, 31864546300, 8157173240, 12812379370,
10191440110, 3406054010), `3_3ng` = c(25733015601.0117, 34727094361.2997,
8857104380.18179, NA, 11653192532.4546, NA), `4_7-5ng` = c(79582218995.1549,
77615759060.3497, 21749287984.8341, 33342436650.5148, 30624750667.6451,
7850501475.22747), `5_7-5ng` = c(78743355495.2545, 81948536416.9992,
NA, 34617564902.3219, NA, 8280398872.03417), `6_7-5ng` = c(80272416055.8845,
77019098847.8474, 23045479130.9574, 32885483296.8046, 30678346321.0037,
7916821420.64152), `7_10ng` = c(97473230025.8853, 91579176089.4265,
28086665669.9634, 38033883000.8102, 37181868033.5073, 9288965106.5049
), `8_10ng` = c(NA, 90754802145.7813, 26405368418.6503, 36442770423.3661,
36789459227.7515, 8834742124.86943), `9_10ng` = c(98420360686.1339,
99355131865.2305, 28271975548.9608, 39837381317.8216, 39481996086.9157,
9931809132.696), `10_15ng` = c(141726904178.35, 130889671408.26,
38206477283.6549, 56021084469.4745, 53295491175.4506, 13994955303.4972
), `11_15ng` = c(161633827056.573, NA, 45497410866.4248, 61586094337.2513,
60508117975.6097, 15400735421.5), `12_15ng` = c(165557046117.222,
153975644961.53, 46279635074.4959, 61867667358.3367, 63526552497.161,
15329671283.3959)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-6L))
The issue is that your rowid does not uniquely identify observations as there are multiple obs for some rowids. To account for that you could add an additional identifier before reshaping to long:
library(tidyr)
library(dplyr)
com_data <- df2 %>%
group_by(rowid) %>%
mutate(id = row_number()) %>%
ungroup() %>%
pivot_longer(-c(rowid, id), values_to="mass") %>%
inner_join(groups, by="name") %>%
group_by(name) %>%
filter(!all(is.na(mass))) %>%
ungroup() %>%
add_count(rowid) %>%
filter(n == max(n)) %>%
select(!c(group, n)) %>%
arrange(rowid, id) %>%
pivot_wider(names_from=name, values_from=mass) %>%
select(-id)
com_data
#> # A tibble: 6 × 13
#> rowid `1_3ng` `2_3ng` `3_3ng` `4_7-5ng` `5_7-5ng` `6_7-5ng` `7_10ng`
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 P0A6P1|EFTS NA NA NA 3.94e10 4.04e10 3.71e10 4.43e10
#> 2 P0A6P1|EFTS 8.06e 9 7.91e 9 7.93e 9 1.07e10 1.48e10 1.37e10 NA
#> 3 P0A799|PGK 1.23e10 1.16e10 1.25e10 NA NA NA 2.26e10
#> 4 P0A799|PGK 3.48e 9 3.41e 9 NA 7.85e 9 8.28e 9 7.92e 9 9.29e 9
#> 5 P0A853|TNAA 2.42e10 2.52e10 2.57e10 7.96e10 7.87e10 8.03e10 9.75e10
#> 6 P0A853|TNAA 1.14e10 1.02e10 1.17e10 3.06e10 NA 3.07e10 3.72e10
#> # … with 5 more variables: `8_10ng` <dbl>, `9_10ng` <dbl>, `10_15ng` <dbl>,
#> # `11_15ng` <dbl>, `12_15ng` <dbl>
EDIT To get your desired result is a different issue but could be achieved like so:
com_data <- df2 %>%
group_by(rowid) %>%
mutate(id = row_number()) %>%
ungroup() %>%
pivot_longer(-c(rowid, id), values_to = "mass") %>%
inner_join(groups, by = "name") %>%
add_count(rowid, id, group, wt = !is.na(mass)) %>%
group_by(rowid, id) %>%
filter(!any(n == 0)) %>%
ungroup() %>%
select(!c(group, n)) %>%
pivot_wider(names_from = name, values_from = mass) %>%
select(-id)
com_data
#> # A tibble: 6 × 13
#> rowid `1_3ng` `2_3ng` `3_3ng` `4_7-5ng` `5_7-5ng` `6_7-5ng` `7_10ng`
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 P0A853|TNAA 2.42e10 2.52e10 2.57e10 7.96e10 7.87e10 8.03e10 9.75e10
#> 2 P0CE47|EFTU1 NA 3.19e10 3.47e10 7.76e10 8.19e10 7.70e10 9.16e10
#> 3 P0A6F3|GLPK 8.55e 9 8.16e 9 8.86e 9 2.17e10 NA 2.30e10 2.81e10
#> 4 P0A6F5|CH60 1.32e10 1.28e10 NA 3.33e10 3.46e10 3.29e10 3.80e10
#> 5 P0A853|TNAA 1.14e10 1.02e10 1.17e10 3.06e10 NA 3.07e10 3.72e10
#> 6 P0A799|PGK 3.48e 9 3.41e 9 NA 7.85e 9 8.28e 9 7.92e 9 9.29e 9
#> # … with 5 more variables: `8_10ng` <dbl>, `9_10ng` <dbl>, `10_15ng` <dbl>,
#> # `11_15ng` <dbl>, `12_15ng` <dbl>
I was trying to unnest the the following data frame.
df.org <- structure(list(Gene = "ARIH1", Description = "E3 ubiquitin-protein ligase ARIH1",
condition2_cellline = list(c("MCF7", "Jurkat")), condition2_activity = list(
c(40.8284023668639, 13.26973)), condition2_concentration = list(
c("100uM", "100uM")), condition3_cellline = list("Jurkat"),
condition3_activity = list(-4.60251), condition3_concentration = list(
"100uM")), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame"))
This is my code:
df.output <- df.ori %>%
unnest(where(is.list), keep_empty = T)
This is what I got:
structure(list(Gene = c("ARIH1", "ARIH1"), Description = c("E3 ubiquitin-protein ligase ARIH1",
"E3 ubiquitin-protein ligase ARIH1"), condition2_cellline = c("MCF7",
"Jurkat"), condition2_activity = c(40.8284023668639, 13.26973
), condition2_concentration = c("100uM", "100uM"), condition3_cellline = c("Jurkat",
"Jurkat"), condition3_activity = c(-4.60251, -4.60251), condition3_concentration = c("100uM",
"100uM")), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-2L))
Is there a way to avoid duplicating those variables with a shorter length? The following output is what I want to get.
df.desired <- structure(list(Gene = c("ARIH1", "ARIH1"), Description = c("E3 ubiquitin-protein ligase ARIH1",
"E3 ubiquitin-protein ligase ARIH1"), condition2_cellline = c("MCF7",
"Jurkat"), condition2_activity = c(40.8284023668639, 13.26973
), condition2_concentration = c("100uM", "100uM"), condition3_cellline = c(NA,
"Jurkat"), condition3_activity = c(NA, -4.60251), condition3_concentration = c(NA,
"100uM")), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-2L))
Thanks so much for any help!
We could also do without reshaping i.e. get the max of the list column lengths in a column, then loop across those list columns, modify the length with the max value and use unnest
library(dplyr)
library(purrr)
library(tidyr)
df.org %>%
mutate(l1 = max(across(where(is.list), lengths)),
across(where(is.list), ~ map(.x, `length<-`, l1)), l1 = NULL) %>%
unnest(where(is.list), keep_empty = TRUE)
-output
# A tibble: 2 × 8
Gene Description condition2_cellline condition2_activity condition2_concentration condition3_cellline condition3_activity condition3_concentration
<chr> <chr> <chr> <dbl> <chr> <chr> <dbl> <chr>
1 ARIH1 E3 ubiquitin-protein ligase ARIH1 MCF7 40.8 100uM Jurkat -4.60 100uM
2 ARIH1 E3 ubiquitin-protein ligase ARIH1 Jurkat 13.3 100uM <NA> NA <NA>
Here is suggestion how it could work.
We pivot_longer all listed columns.
apply the the function to create lists of same length
pivot back and unnest.
library(dplyr)
library(tidyr)
df.org %>%
pivot_longer(cols = starts_with("condition")) %>%
mutate(value = lapply(value, `length<-`, max(lengths(value)))) %>%
pivot_wider(names_from = name, values_from = value) %>%
unnest(cols = c(condition2_cellline, condition2_activity, condition2_concentration,
condition3_cellline, condition3_activity, condition3_concentration))
Gene Description condition2_cell~ condition2_acti~ condition2_conc~ condition3_cell~ condition3_acti~ condition3_conc~
<chr> <chr> <chr> <dbl> <chr> <chr> <dbl> <chr>
1 ARIH1 E3 ubiquitin-prot~ MCF7 40.8 100uM Jurkat -4.60 100uM
2 ARIH1 E3 ubiquitin-prot~ Jurkat 13.3 100uM NA NA NA
>
I'd like to mutate by dataframe by summing both columns and rows.
mydata <-structure(list(description.y = c("X1", "X2"), `2011` = c(13185.66,
82444.01), `2012` = c(14987.61, 103399.4), `2013` = c(26288.98,
86098.22), `2014` = c(15238.21, 88540.04), `2015` = c(15987.11,
113145.1), `2016` = c(16324.57, 113196.2), `2017` = c(16594.87,
122167.57), `2018` = c(20236.02, 120058.21), `2019` = c(20626.69,
130699.68), `2020` = c(19553.83, 136464.31), `2021` = c(10426.32,
56392.28)), class = c("grouped_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -2L), groups = structure(list(description.y = c("X1",
"X2"), .rows = structure(list(1L, 2L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -2L), .drop = TRUE))
I can sum rows like this
mydata1 <- mydata %>%
mutate(Total = rowSums(across(where(is.numeric))))
Which provides an extra column with totals for the rows
But I'm not sure how to add Columns to the dataframe while also retaining all existing values
I've tried this but it doesn't work. Any thoughts?
mydata1 <- mydata %>%
mutate(Total = rowSums(across(where(is.numeric)))) %>%
mutate(Total = colSums(across(where(is.numeric))))
Update: See comment #Mwavu -> many thanks!
direct solution with adorn_total():
mydata %>% adorn_totals(where = c("row", "col"))
First answer:
We could use adorn_totals()
library(dplyr)
library(janitor)
mydata %>%
mutate(Total = rowSums(across(where(is.numeric)))) %>%
adorn_totals()
description.y 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 Total
X1 13185.66 14987.61 26288.98 15238.21 15987.11 16324.57 16594.87 20236.02 20626.69 19553.83 10426.32 189449.9
X2 82444.01 103399.40 86098.22 88540.04 113145.10 113196.20 122167.57 120058.21 130699.68 136464.31 56392.28 1152605.0
Total 95629.67 118387.01 112387.20 103778.25 129132.21 129520.77 138762.44 140294.23 151326.37 156018.14 66818.60 1342054.9
Another way is to first summarize and then bind_rows:
library(dplyr)
mydata %>%
ungroup() %>%
mutate(Total = rowSums(across(where(is.numeric)))) %>%
bind_rows(summarize(., description.y = "Total", across(where(is.numeric), sum)))
Output
# A tibble: 3 x 13
description.y `2011` `2012` `2013` `2014` `2015` `2016` `2017` `2018` `2019` `2020` `2021` Total
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 X1 13186. 14988. 26289. 15238. 15987. 16325. 16595. 20236. 20627. 19554. 10426. 189450.
2 X2 82444. 103399. 86098. 88540. 113145. 113196. 122168. 120058. 130700. 136464. 56392. 1152605.
3 Total 95630. 118387. 112387. 103778. 129132. 129521. 138762. 140294. 151326. 156018. 66819. 1342055.
I am trying to replace some text in my dataframe (a few rows given below)
> dput(Henry.longer[1:4,])
structure(list(N_l = c(4, 4, 4, 4), UG = c("100", "100", "100",
"100"), S = c(12, 12, 12, 12), Sample = c(NA, NA, NA, NA), EQ = c("Henry",
"Henry", "Henry", "Henry"), DF = c(0.798545454545455, 0.798545454545455,
0.798545454545455, 0.798545454545455), meow = c("Henry.Exterior.single",
"Multi", "Henry.Exterior.multi", "Henry.Interior.single"), Girder = c("Henry.Exterior.single",
"Henry.Interior.multi", "Henry.Exterior.multi", "Interior")), row.names = c(NA,
-4L), groups = structure(list(UG = "100", S = 12, .rows = list(
1:4)), row.names = c(NA, -1L), class = c("tbl_df", "tbl",
"data.frame"), .drop = FALSE), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"))
I try to mutate the dataframe as:
Henry.longer <- Henry.longer %>%
mutate(Loading = str_replace(meow, "Henry.Exterior.single", "Single")) %>%
mutate(Loading = str_replace(meow, "Henry.Exterior.multi", "Multi")) %>%
mutate(Loading = str_replace(meow, "Henry.Interior.single", "Single")) %>%
mutate(Loading = str_replace(meow, "Henry.Interior.multi", "Multi")) %>%
mutate(Girder = str_replace(meow, "Henry.Exterior.multi", "Exterior")) %>%
mutate(Girder = str_replace(meow, "Henry.Exterior.single", "Exterior")) %>%
mutate(Girder = str_replace(meow, "Henry.Interior.multi", "Interior")) %>%
mutate(Girder = str_replace(meow, "Henry.Interior.single", "Interior")) %>%
select(-meow)
But for some reason the results does not get applied to all the rows and only:
N_l UG S Sample EQ DF Loading Girder
1 4 100 12 NA Henry 0.799 Henry.Exterior.single Henry.Exterior.single
2 4 100 12 NA Henry 0.799 Multi Henry.Interior.multi
3 4 100 12 NA Henry 0.799 Henry.Exterior.multi Henry.Exterior.multi
4 4 100 12 NA Henry 0.799 Henry.Interior.single Interior
I think we can use lookup vectors for this, if it's easy or safer to use static string lookups:
tr_vec <- c(Henry.Exterior.single = "Single", Henry.Exterior.multi = "Multi", Henry.Interior.single = "Single", Henry.Interior.multi = "Multi")
tr_vec2 <- c(Henry.Exterior.multi = "Exterior", Henry.Exterior.single = "Exterior", Henry.Interior.multi = "Interior", Henry.Interior.single = "Interior")
Henry.longer %>%
mutate(
Loading = coalesce(tr_vec[Loading], Loading),
Girder = coalesce(tr_vec2[Girder], Girder)
)
# # A tibble: 4 x 8
# # Groups: UG, S [1]
# N_l UG S Sample EQ DF Loading Girder
# <dbl> <chr> <dbl> <lgl> <chr> <dbl> <chr> <chr>
# 1 4 100 12 NA Henry 0.799 Single Exterior
# 2 4 100 12 NA Henry 0.799 Multi Interior
# 3 4 100 12 NA Henry 0.799 Multi Exterior
# 4 4 100 12 NA Henry 0.799 Single Interior
The advantage of RonakShah's regex solution is that it can very easily handle many of the types of substrings you appear to need. Regexes do carry a little risk, though, in that they may (unlikely in that answer, but) miss match.
Instead of using str_replace I guess it would be easier to extract what you want using regex.
library(dplyr)
Henry.longer %>%
mutate(Loading = sub('.*\\.', '', meow),
Girder = sub('.*\\.(\\w+)\\..*', '\\1', meow))
where
Loading - removes everything until last dot
Girder - extracts a word between two dots.
Oh boy, looks like you've got some answers here already but here's a super-simple one that uses stringr::str_extract:
Henry.longer <- Henry.longer %>%
mutate(Loading = str_extract(meow, "single|multi")) %>%
mutate(Girder = str_extract(meow, "Interior|Exterior"))
It's worth noting that the demo data has a weird entry for meow in one column, so it didn't run perfectly on my machine:
Assume a company that we have info about Total sales and the amount of sales in three counties CA , TX and WI.
How can i calculate : the average sales contribution of the three states of total company sales
I need furthermore to find : the same average percentages for each year, month of the year and day of the week.
EDITED !!!
structure(list(CA = c(11047, 9925, 11322, 12251, 16610, 14696
), TX = c(7381, 5912, 9006, 6226, 9440, 9376), WI = c(6984, 3309,
8883, 9533, 11882, 8664), Total = c(25412, 19146, 29211, 28010,
37932, 32736), date = structure(c(1296518400, 1296604800, 1296691200,
1296777600, 1296864000, 1296950400), tzone = "UTC", class = c("POSIXct",
"POSIXt")), event_type = c("NA", "NA", "NA", "NA", "NA", "Sporting"
), snap_CA = c(1, 1, 1, 1, 1, 1), snap_TX = c(1, 0, 1, 0, 1,
1), snap_WI = c(0, 1, 1, 0, 1, 1)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
if I understood your problem correctly a possible solution would be this:
library(dplyr)
library(lubridate)
df1 <- df %>%
dplyr::mutate(YEAR = lubridate::year(date),
MONTH = lubridate::month(date),
WEEKDAY = lubridate::wday(date),
P_CA = CA / Total,
P_TX = TX / Total,
P_WI = WI / Total)
# Average per Year
df1 %>%
dplyr::group_by(YEAR) %>%
dplyr::summarise(AV_CA = mean(P_CA, na.rm = TRUE),
AV_TX = mean(P_TX, na.rm = TRUE),
AV_WI = mean(P_WI, na.rm = TRUE))
YEAR AV_CA AV_TX AV_WI
<dbl> <dbl> <dbl> <dbl>
1 2011 0.444 0.278 0.278
# Average per Month
df1 %>%
dplyr::group_by(MONTH) %>%
dplyr::summarise(AV_CA = mean(P_CA, na.rm = TRUE),
AV_TX = mean(P_TX, na.rm = TRUE),
AV_WI = mean(P_WI, na.rm = TRUE))
MONTH AV_CA AV_TX AV_WI
<dbl> <dbl> <dbl> <dbl>
1 2 0.444 0.278 0.278
# Average per Weekday
df1 %>%
dplyr::group_by(WEEKDAY) %>%
dplyr::summarise(AV_CA = mean(P_CA, na.rm = TRUE),
AV_TX = mean(P_TX, na.rm = TRUE),
AV_WI = mean(P_WI, na.rm = TRUE))
WEEKDAY AV_CA AV_TX AV_WI
<dbl> <dbl> <dbl> <dbl>
1 1 0.449 0.286 0.265
2 3 0.435 0.290 0.275
3 4 0.518 0.309 0.173
4 5 0.388 0.308 0.304
5 6 0.437 0.222 0.340
6 7 0.438 0.249 0.313
For this dummy data all will up to 100% but when using a larger dataset this might not be true