R pivot_wider error: list-cols in output - r

I have a dataset like below,
df1<-structure(list(Uniprot_IDs = c("P0A799|PGK", "P0A853|TNAA", "P0CE47|EFTU1",
"P0A6F3|GLPK", "P0A6F5|CH60", "P0A9B2|G3P1", "P0A853|TNAA", "P0A6P1|EFTS",
"P0A6P1|EFTS", "P0A799|PGK"), `1_3ng` = c(12305960196.5721, 24169710612.0476,
NA, 8553811608.70032, 13176265141.6301, 92994780469.5607, 11373139178.993,
NA, 8062061247.94512, 3484150815.20598), `2_3ng` = c(11629654800,
25162283400, 31864546300, 8157173240, 12812379370, 90007498700,
10191440110, NA, 7911370530, 3406054010), `3_3ng` = c(12503938417.8663,
25733015601.0117, 34727094361.2997, 8857104380.18179, NA, 93988723611.341,
11653192532.4546, NA, 7933102839.01341, NA), `4_7-5ng` = c(NA,
79582218995.1549, 77615759060.3497, 21749287984.8341, 33342436650.5148,
101254055758.836, 30624750667.6451, 39438567251.7351, 10726988796.4798,
7850501475.22747), `5_7-5ng` = c(NA, 78743355495.2545, 81948536416.9992,
NA, 34617564902.3219, 99485017820.8478, NA, 40420212151.9563,
14804870783.7792, 8280398872.03417), `6_7-5ng` = c(NA, 80272416055.8845,
77019098847.8474, 23045479130.9574, 32885483296.8046, 90789109337.1181,
30678346321.0037, 37073444001.0421, 13710097518.7425, 7916821420.64152
), `7_10ng` = c(22617928037.5148, 97473230025.8853, 91579176089.4265,
28086665669.9634, 38033883000.8102, NA, 37181868033.5073, 44274304023.6936,
NA, 9288965106.5049), `8_10ng` = c(22091136513.3736, NA, 90754802145.7813,
26405368418.6503, 36442770423.3661, NA, 36789459227.7515, 42793252584.0984,
15307787846.1716, 8834742124.86943), `9_10ng` = c(24125219176.3177,
98420360686.1339, 99355131865.2305, 28271975548.9608, 39837381317.8216,
NA, 39481996086.9157, 47261977623.5276, 16463020175.2068, 9931809132.696
), `10_15ng` = c(30252776887.1842, 141726904178.35, 130889671408.26,
38206477283.6549, 56021084469.4745, 100336249543.662, 53295491175.4506,
62883519160.5278, NA, 13994955303.4972), `11_15ng` = c(32859283128.8916,
161633827056.573, NA, 45497410866.4248, 61586094337.2513, NA,
60508117975.6097, 73276218943.4545, NA, 15400735421.5), `12_15ng` = c(34372085877.8071,
165557046117.222, 153975644961.53, 46279635074.4959, 61867667358.3367,
106133922907.254, 63526552497.161, 76374667334.5682, NA, 15329671283.3959
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-10L))
And a group data,
groups <- structure(list(samples = c("1_3ng", "2_3ng", "3_3ng", "4_7-5ng",
"5_7-5ng", "6_7-5ng", "7_10ng", "8_10ng", "9_10ng", "10_15ng",
"11_15ng", "12_15ng"), groups = c("GrA", "GrA", "GrA", "GrB",
"GrB", "GrB", "GrC", "GrC", "GrC", "GrD", "GrD", "GrD")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -12L))
And I have used the following codes for removing the groupwise missing rows from the dataset.
# Rename the lookup table's columns so they match the long data produced below.
new_colnames <- c("name", "group")
colnames(groups)<-new_colnames
# Rename the first column of df1 (whatever it is called) to "rowid".
x <- colnames(df1[,1])
df2 <- setNames(df1,replace(names(df1),names(df1)==x,"rowid"))
# NOTE(review): this is the failing attempt. `rowid` is NOT unique (several
# proteins occur twice in df1), so pivot_wider() cannot place one value per
# cell and falls back to list-columns, emitting the warning shown below.
com_data <- df2 %>%
pivot_longer(!rowid, values_to="mass") %>%
inner_join(groups, by="name") %>%
group_by(name) %>%
filter(!all(is.na(mass))) %>%   # drop any sample whose masses are all NA
ungroup() %>%
add_count(rowid) %>%            # n = number of long-format rows per rowid
filter(n == max(n)) %>%         # keep only rowids with the maximum count
select(!c(group, n)) %>%
pivot_wider(names_from=name, values_from=mass)
But I am not getting the desired output, it was like below,
structure(list(rowid = c("P0A799|PGK", "P0A853|TNAA", "P0A6P1|EFTS"
), `1_3ng` = list(c(12305960196.5721, 3484150815.20598), c(24169710612.0476,
11373139178.993), c(NA, 8062061247.94512)), `2_3ng` = list(c(11629654800,
3406054010), c(25162283400, 10191440110), c(NA, 7911370530)),
`3_3ng` = list(c(12503938417.8663, NA), c(25733015601.0117,
11653192532.4546), c(NA, 7933102839.01341)), `4_7-5ng` = list(
c(NA, 7850501475.22747), c(79582218995.1549, 30624750667.6451
), c(39438567251.7351, 10726988796.4798)), `5_7-5ng` = list(
c(NA, 8280398872.03417), c(78743355495.2545, NA), c(40420212151.9563,
14804870783.7792)), `6_7-5ng` = list(c(NA, 7916821420.64152
), c(80272416055.8845, 30678346321.0037), c(37073444001.0421,
13710097518.7425)), `7_10ng` = list(c(22617928037.5148, 9288965106.5049
), c(97473230025.8853, 37181868033.5073), c(44274304023.6936,
NA)), `8_10ng` = list(c(22091136513.3736, 8834742124.86943
), c(NA, 36789459227.7515), c(42793252584.0984, 15307787846.1716
)), `9_10ng` = list(c(24125219176.3177, 9931809132.696),
c(98420360686.1339, 39481996086.9157), c(47261977623.5276,
16463020175.2068)), `10_15ng` = list(c(30252776887.1842,
13994955303.4972), c(141726904178.35, 53295491175.4506),
c(62883519160.5278, NA)), `11_15ng` = list(c(32859283128.8916,
15400735421.5), c(161633827056.573, 60508117975.6097), c(73276218943.4545,
NA)), `12_15ng` = list(c(34372085877.8071, 15329671283.3959
), c(165557046117.222, 63526552497.161), c(76374667334.5682,
NA))), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-3L))
And the warning message was,
Warning message:
Values from `mass` are not uniquely identified; output will contain list-cols.
* Use `values_fn = list` to suppress this warning.
* Use `values_fn = {summary_fun}` to summarise duplicates.
* Use the following dplyr code to identify duplicates.
{data} %>%
dplyr::group_by(rowid, name) %>%
dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
dplyr::filter(n > 1L)
How to get the dataframe output which contain single values in each column instead of list-cols.
My desired output is,
structure(list(Uniprot_IDs = c("P0A853|TNAA", "P0CE47|EFTU1",
"P0A6F3|GLPK", "P0A6F5|CH60", "P0A853|TNAA", "P0A799|PGK"), `1_3ng` = c(24169710612.0476,
NA, 8553811608.70032, 13176265141.6301, 11373139178.993, 3484150815.20598
), `2_3ng` = c(25162283400, 31864546300, 8157173240, 12812379370,
10191440110, 3406054010), `3_3ng` = c(25733015601.0117, 34727094361.2997,
8857104380.18179, NA, 11653192532.4546, NA), `4_7-5ng` = c(79582218995.1549,
77615759060.3497, 21749287984.8341, 33342436650.5148, 30624750667.6451,
7850501475.22747), `5_7-5ng` = c(78743355495.2545, 81948536416.9992,
NA, 34617564902.3219, NA, 8280398872.03417), `6_7-5ng` = c(80272416055.8845,
77019098847.8474, 23045479130.9574, 32885483296.8046, 30678346321.0037,
7916821420.64152), `7_10ng` = c(97473230025.8853, 91579176089.4265,
28086665669.9634, 38033883000.8102, 37181868033.5073, 9288965106.5049
), `8_10ng` = c(NA, 90754802145.7813, 26405368418.6503, 36442770423.3661,
36789459227.7515, 8834742124.86943), `9_10ng` = c(98420360686.1339,
99355131865.2305, 28271975548.9608, 39837381317.8216, 39481996086.9157,
9931809132.696), `10_15ng` = c(141726904178.35, 130889671408.26,
38206477283.6549, 56021084469.4745, 53295491175.4506, 13994955303.4972
), `11_15ng` = c(161633827056.573, NA, 45497410866.4248, 61586094337.2513,
60508117975.6097, 15400735421.5), `12_15ng` = c(165557046117.222,
153975644961.53, 46279635074.4959, 61867667358.3367, 63526552497.161,
15329671283.3959)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-6L))

The issue is that your rowid does not uniquely identify observations as there are multiple obs for some rowids. To account for that you could add an additional identifier before reshaping to long:
library(tidyr)
library(dplyr)
# Fix for the list-col warning: add a per-duplicate row number so that
# (rowid, id) uniquely identifies every observation before pivot_wider().
com_data <- df2 %>%
group_by(rowid) %>%
mutate(id = row_number()) %>%   # id = 1, 2, ... within each duplicated rowid
ungroup() %>%
pivot_longer(-c(rowid, id), values_to="mass") %>%   # keep both key columns
inner_join(groups, by="name") %>%
group_by(name) %>%
filter(!all(is.na(mass))) %>%   # drop any sample whose masses are all NA
ungroup() %>%
add_count(rowid) %>%            # n = number of long-format rows per rowid
filter(n == max(n)) %>%
select(!c(group, n)) %>%
arrange(rowid, id) %>%
pivot_wider(names_from=name, values_from=mass) %>%  # keys now unique: no list-cols
select(-id)                     # drop the helper id again
com_data
#> # A tibble: 6 × 13
#> rowid `1_3ng` `2_3ng` `3_3ng` `4_7-5ng` `5_7-5ng` `6_7-5ng` `7_10ng`
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 P0A6P1|EFTS NA NA NA 3.94e10 4.04e10 3.71e10 4.43e10
#> 2 P0A6P1|EFTS 8.06e 9 7.91e 9 7.93e 9 1.07e10 1.48e10 1.37e10 NA
#> 3 P0A799|PGK 1.23e10 1.16e10 1.25e10 NA NA NA 2.26e10
#> 4 P0A799|PGK 3.48e 9 3.41e 9 NA 7.85e 9 8.28e 9 7.92e 9 9.29e 9
#> 5 P0A853|TNAA 2.42e10 2.52e10 2.57e10 7.96e10 7.87e10 8.03e10 9.75e10
#> 6 P0A853|TNAA 1.14e10 1.02e10 1.17e10 3.06e10 NA 3.07e10 3.72e10
#> # … with 5 more variables: `8_10ng` <dbl>, `9_10ng` <dbl>, `10_15ng` <dbl>,
#> # `11_15ng` <dbl>, `12_15ng` <dbl>
EDIT To get your desired result is a different issue but could be achieved like so:
# Variant reproducing the OP's desired output: keep an observation only if
# it has at least one non-NA mass in EVERY group (GrA..GrD).
com_data <- df2 %>%
group_by(rowid) %>%
mutate(id = row_number()) %>%   # disambiguate duplicated rowids
ungroup() %>%
pivot_longer(-c(rowid, id), values_to = "mass") %>%
inner_join(groups, by = "name") %>%
add_count(rowid, id, group, wt = !is.na(mass)) %>%   # n = non-NA count per row & group
group_by(rowid, id) %>%
filter(!any(n == 0)) %>%        # drop observations with an all-NA group
ungroup() %>%
select(!c(group, n)) %>%
pivot_wider(names_from = name, values_from = mass) %>%
select(-id)
com_data
#> # A tibble: 6 × 13
#> rowid `1_3ng` `2_3ng` `3_3ng` `4_7-5ng` `5_7-5ng` `6_7-5ng` `7_10ng`
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 P0A853|TNAA 2.42e10 2.52e10 2.57e10 7.96e10 7.87e10 8.03e10 9.75e10
#> 2 P0CE47|EFTU1 NA 3.19e10 3.47e10 7.76e10 8.19e10 7.70e10 9.16e10
#> 3 P0A6F3|GLPK 8.55e 9 8.16e 9 8.86e 9 2.17e10 NA 2.30e10 2.81e10
#> 4 P0A6F5|CH60 1.32e10 1.28e10 NA 3.33e10 3.46e10 3.29e10 3.80e10
#> 5 P0A853|TNAA 1.14e10 1.02e10 1.17e10 3.06e10 NA 3.07e10 3.72e10
#> 6 P0A799|PGK 3.48e 9 3.41e 9 NA 7.85e 9 8.28e 9 7.92e 9 9.29e 9
#> # … with 5 more variables: `8_10ng` <dbl>, `9_10ng` <dbl>, `10_15ng` <dbl>,
#> # `11_15ng` <dbl>, `12_15ng` <dbl>

Related

How to extract the minimum value in all columns in a dataframe in R?

I am working on mass spectrometry proteomics expression data. For statistical analysis of the data, I have to find the top three minimum value of each column in the dataframe like below,
structure(list(Type = c("knn_vsn", "knn_loess", "knn_rlr", "lls_vsn",
"lls_loess", "lls_rlr", "svd_vsn", "svd_loess", "svd_rlr"), Group1 = c(0.00318368971435714,
0.00317086486813191, 0.00317086486813191, 0.00312821095645019,
0.00311632537571597, 0.00313568333628438, 0.00394831935666465,
0.00393605637633005, 0.00395599132474446), Group2 = c(0.0056588221783197,
0.00560933517836751, 0.00560933517836751, 0.00550114679857588,
0.00548316209864631, 0.00550230673346083, 0.00737865310351839,
0.0073411154394253, 0.00735748595511963), Group3 = c(0.00418838138878096,
0.00417201215938804, 0.00417201215938804, 0.00398819978362592,
0.00397093259462351, 0.00398827962107259, 0.00424157479553304,
0.00422638750183658, 0.00424175886713471), Group4 = c(0.0039811913527127,
0.00394649435912413, 0.00394649435912413, 0.00397059873107098,
0.00393840233766712, 0.00396385071387178, 0.0041077267588457,
0.00407577176849463, 0.00410191492380459)), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -9L), groups = structure(list(
Type = c("knn_loess", "knn_rlr", "knn_vsn", "lls_loess",
"lls_rlr", "lls_vsn", "svd_loess", "svd_rlr", "svd_vsn"),
.rows = structure(list(2L, 3L, 1L, 5L, 6L, 4L, 8L, 9L, 7L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -9L), .drop = TRUE))
And I need the output like below,
structure(list(`Type ` = c("lls_loess", "lls_rlr", "lls_vsn"),
Group1 = c(0.00311632537571597, 0.00313568333628438, 0.00312821095645019
), ` Type` = c("lls_loess", "lls_rlr", "lls_vsn"), Group2 = c(0.00548316209864631,
0.00550230673346083, 0.00550114679857588), ` Type` = c("lls_loess",
"lls_rlr", "lls_vsn"), Group3 = c(0.00397093259462351, 0.00398827962107259,
0.00398819978362592), `Type ` = c("lls_loess", "lls_rlr",
"lls_vsn"), Group4 = c(0.00393840233766712, 0.00396385071387178,
0.00397059873107098)), class = "data.frame", row.names = c(NA,
-3L))
Please suggest some useful R code for this issue.
Thank you in advance.
library(tidyverse)
# Long form -> 3 smallest values per Group column -> wide again.
# names_vary = "slowest" (tidyr >= 1.2) interleaves the output columns as
# Type_Group1, value_Group1, Type_Group2, ... instead of all Types first.
df %>%
pivot_longer(-Type) %>%
group_by(name) %>%
slice_min(value, n = 3) %>% # You might stop here, already tidy
mutate(row = row_number()) %>%   # rank 1..3 within each Group column
ungroup() %>%
pivot_wider(names_from = name, values_from = c(Type, value),
names_vary = "slowest")
Result
# A tibble: 3 × 9
row Type_Group1 value_Group1 Type_Group2 value_Group2 Type_Group3 value_Group3 Type_Group4 value_Group4
<int> <chr> <dbl> <chr> <dbl> <chr> <dbl> <chr> <dbl>
1 1 lls_loess 0.00312 lls_loess 0.00548 lls_loess 0.00397 lls_loess 0.00394
2 2 lls_vsn 0.00313 lls_vsn 0.00550 lls_vsn 0.00399 knn_loess 0.00395
3 3 lls_rlr 0.00314 lls_rlr 0.00550 lls_rlr 0.00399 knn_rlr 0.00395
Take a look at this
data <- data.frame(group = rep(letters[1:3], each = 5),
value = 1:3)
data
Another possible solution, based on purrr::imap_dfc:
library(tidyverse)
# For each value column (indices 2..ncol), take Type plus that column, keep
# the 3 rows with the smallest values, rename the Type column with a running
# suffix, and column-bind all pieces with imap_dfc().
imap_dfc(2:ncol(df), ~ df %>% ungroup %>% .[c(1,.x)] %>%
slice_min(df[[.x]], n = 3) %>% set_names(c(paste0("Type",.y), names(df)[.x])))
#> # A tibble: 3 × 8
#> Type1 Group1 Type2 Group2 Type3 Group3 Type4 Group4
#> <chr> <dbl> <chr> <dbl> <chr> <dbl> <chr> <dbl>
#> 1 lls_loess 0.00312 lls_loess 0.00548 lls_loess 0.00397 lls_loess 0.00394
#> 2 lls_vsn 0.00313 lls_vsn 0.00550 lls_vsn 0.00399 knn_loess 0.00395
#> 3 lls_rlr 0.00314 lls_rlr 0.00550 lls_rlr 0.00399 knn_rlr 0.00395
Note: Your original data is grouped: that is why I use ungroup in my solution.

How to unnest a data frame containing list of list with varied length?

I was trying to unnest the the following data frame.
df.org <- structure(list(Gene = "ARIH1", Description = "E3 ubiquitin-protein ligase ARIH1",
condition2_cellline = list(c("MCF7", "Jurkat")), condition2_activity = list(
c(40.8284023668639, 13.26973)), condition2_concentration = list(
c("100uM", "100uM")), condition3_cellline = list("Jurkat"),
condition3_activity = list(-4.60251), condition3_concentration = list(
"100uM")), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame"))
This is my code:
df.output <- df.org %>%
unnest(where(is.list), keep_empty = T)
This is what I got:
structure(list(Gene = c("ARIH1", "ARIH1"), Description = c("E3 ubiquitin-protein ligase ARIH1",
"E3 ubiquitin-protein ligase ARIH1"), condition2_cellline = c("MCF7",
"Jurkat"), condition2_activity = c(40.8284023668639, 13.26973
), condition2_concentration = c("100uM", "100uM"), condition3_cellline = c("Jurkat",
"Jurkat"), condition3_activity = c(-4.60251, -4.60251), condition3_concentration = c("100uM",
"100uM")), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-2L))
Is there a way to avoid duplicating those variables with a shorter length? The following output is what I want to get.
df.desired <- structure(list(Gene = c("ARIH1", "ARIH1"), Description = c("E3 ubiquitin-protein ligase ARIH1",
"E3 ubiquitin-protein ligase ARIH1"), condition2_cellline = c("MCF7",
"Jurkat"), condition2_activity = c(40.8284023668639, 13.26973
), condition2_concentration = c("100uM", "100uM"), condition3_cellline = c(NA,
"Jurkat"), condition3_activity = c(NA, -4.60251), condition3_concentration = c(NA,
"100uM")), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-2L))
Thanks so much for any help!
We could also do without reshaping i.e. get the max of the list column lengths in a column, then loop across those list columns, modify the length with the max value and use unnest
library(dplyr)
library(purrr)
library(tidyr)
# Pad every entry of every list-column with NA up to the longest entry (l1)
# using `length<-`, then unnest: shorter conditions become NA rows instead of
# being recycled.
df.org %>%
mutate(l1 = max(across(where(is.list), lengths)),   # longest list entry
across(where(is.list), ~ map(.x, `length<-`, l1)), l1 = NULL) %>%   # pad, drop helper
unnest(where(is.list), keep_empty = TRUE)
-output
# A tibble: 2 × 8
Gene Description condition2_cellline condition2_activity condition2_concentration condition3_cellline condition3_activity condition3_concentration
<chr> <chr> <chr> <dbl> <chr> <chr> <dbl> <chr>
1 ARIH1 E3 ubiquitin-protein ligase ARIH1 MCF7 40.8 100uM Jurkat -4.60 100uM
2 ARIH1 E3 ubiquitin-protein ligase ARIH1 Jurkat 13.3 100uM <NA> NA <NA>
Here is suggestion how it could work.
We pivot_longer all listed columns.
apply the function to create lists of the same length
pivot back and unnest.
library(dplyr)
library(tidyr)
# Reshape all condition* list-columns to long form, pad each list to the
# common maximum length (extra slots become NA), then widen back and unnest.
df.org %>%
pivot_longer(cols = starts_with("condition")) %>%
mutate(value = lapply(value, `length<-`, max(lengths(value)))) %>%   # pad with NA
pivot_wider(names_from = name, values_from = value) %>%
unnest(cols = c(condition2_cellline, condition2_activity, condition2_concentration,
condition3_cellline, condition3_activity, condition3_concentration))
Gene Description condition2_cell~ condition2_acti~ condition2_conc~ condition3_cell~ condition3_acti~ condition3_conc~
<chr> <chr> <chr> <dbl> <chr> <chr> <dbl> <chr>
1 ARIH1 E3 ubiquitin-prot~ MCF7 40.8 100uM Jurkat -4.60 100uM
2 ARIH1 E3 ubiquitin-prot~ Jurkat 13.3 100uM NA NA NA
>

How to sum columns and rows in a wide R dataframe?

I'd like to mutate by dataframe by summing both columns and rows.
mydata <-structure(list(description.y = c("X1", "X2"), `2011` = c(13185.66,
82444.01), `2012` = c(14987.61, 103399.4), `2013` = c(26288.98,
86098.22), `2014` = c(15238.21, 88540.04), `2015` = c(15987.11,
113145.1), `2016` = c(16324.57, 113196.2), `2017` = c(16594.87,
122167.57), `2018` = c(20236.02, 120058.21), `2019` = c(20626.69,
130699.68), `2020` = c(19553.83, 136464.31), `2021` = c(10426.32,
56392.28)), class = c("grouped_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -2L), groups = structure(list(description.y = c("X1",
"X2"), .rows = structure(list(1L, 2L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -2L), .drop = TRUE))
I can sum rows like this
mydata1 <- mydata %>%
mutate(Total = rowSums(across(where(is.numeric))))
Which provides an extra column with totals for the rows
But I'm not sure how to add Columns to the dataframe while also retaining all existing values
I've tried this but it doesn't work. Any thoughts?
mydata1 <- mydata %>%
mutate(Total = rowSums(across(where(is.numeric)))) %>%
mutate(Total = colSums(across(where(is.numeric))))
Update: See comment #Mwavu -> many thanks!
direct solution with adorn_total():
mydata %>% adorn_totals(where = c("row", "col"))
First answer:
We could use adorn_totals()
library(dplyr)
library(janitor)
mydata %>%
mutate(Total = rowSums(across(where(is.numeric)))) %>%
adorn_totals()
description.y 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 Total
X1 13185.66 14987.61 26288.98 15238.21 15987.11 16324.57 16594.87 20236.02 20626.69 19553.83 10426.32 189449.9
X2 82444.01 103399.40 86098.22 88540.04 113145.10 113196.20 122167.57 120058.21 130699.68 136464.31 56392.28 1152605.0
Total 95629.67 118387.01 112387.20 103778.25 129132.21 129520.77 138762.44 140294.23 151326.37 156018.14 66818.60 1342054.9
Another way is to first summarize and then bind_rows:
library(dplyr)
# Row totals first, then append a "Total" row: summarize(.) collapses the
# already-totalled data to column sums and bind_rows() stacks it underneath.
mydata %>%
ungroup() %>%   # mydata arrives grouped; ungroup so summarize() yields one row
mutate(Total = rowSums(across(where(is.numeric)))) %>%
bind_rows(summarize(., description.y = "Total", across(where(is.numeric), sum)))
Output
# A tibble: 3 x 13
description.y `2011` `2012` `2013` `2014` `2015` `2016` `2017` `2018` `2019` `2020` `2021` Total
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 X1 13186. 14988. 26289. 15238. 15987. 16325. 16595. 20236. 20627. 19554. 10426. 189450.
2 X2 82444. 103399. 86098. 88540. 113145. 113196. 122168. 120058. 130700. 136464. 56392. 1152605.
3 Total 95630. 118387. 112387. 103778. 129132. 129521. 138762. 140294. 151326. 156018. 66819. 1342055.

How can we check if any 2 intervals of a unique ID overlaps?

I have data of patient prescription of oral DM drugs, i.e. DPP4 and SU, and would like to find out if patients had taken the drugs concurrently (i.e. whether there are overlapping intervals for DPP4 and SU within the same patient ID).
Sample data:
ID DRUG START END
1 1 DPP4 2020-01-01 2020-01-20
2 1 DPP4 2020-03-01 2020-04-01
3 1 SU 2020-03-15 2020-04-30
4 2 SU 2020-10-01 2020-10-31
5 2 DPP4 2020-12-01 2020-12-31
In the sample data above,
ID == 1, patient had DPP4 and SU concurrently from 2020-03-15 to 2020-04-01.
ID == 2, patient had consumed both medications at separate intervals.
I thought of splitting the data into 2, one for DPP4 and another for SU. Then, do a full join, and compare each DPP4 interval with each SU interval. This may be okay for small data, but if a patient has like 5 rows for DPP4 and another 5 for SU, we will have 25 comparisons, which may not be efficient. Add that with 10000+ patients.
I am not sure how to do it.
New data:
Hope to have a new df that looks like this. Or anything that is tidy.
ID DRUG START END
1 1 DPP4-SU 2020-03-15 2020-04-01
2 2 <NA> <NA> <NA>
Data Code:
df <- structure(list(ID = c(1L, 1L, 1L, 2L, 2L), DRUG = c("DPP4", "DPP4",
"SU", "SU", "DPP4"), START = structure(c(18262, 18322, 18336,
18536, 18597), class = "Date"), END = structure(c(18281, 18353,
18382, 18566, 18627), class = "Date")), class = "data.frame", row.names = c(NA,
-5L))
df_new <- structure(list(ID = 1:2, DRUG = c("DPP4-SU", NA), START = structure(c(18336,
NA), class = "Date"), END = structure(c(18353, NA), class = "Date")), class = "data.frame", row.names = c(NA,
-2L))
Edit:
I think from the sample data I gave, it may seem that there can only be 1 intersecting interval. But there may be more. So, I think this would be better data to illustrate.
structure(list(ID = c(3, 3, 3, 3, 3, 3, 3), DRUG = c("DPP4",
"DPP4", "SU", "SU", "DPP4", "DPP4", "DPP4"), START = structure(c(17004,
17383, 17383, 17418, 17437, 17649, 17676), class = c("IDate",
"Date")), END = structure(c(17039, 17405, 17405, 17521, 17625,
17669, 17711), class = c("IDate", "Date")), duration = c(35L,
22L, 22L, 103L, 188L, 20L, 35L), INDEX = c(1L, 0L, 0L, 0L, 0L,
0L, 0L)), row.names = c(NA, -7L), class = c("tbl_df", "tbl",
"data.frame"))
It's way more complicated than dear #AnoushiravanR's but as an alternative you could try
library(dplyr)
library(tidyr)
library(lubridate)
# Pair every prescription with every other for the same patient via a
# self-join, then keep pairs of different drugs whose date intervals
# intersect (intersect() of two lubridate intervals is NA when disjoint).
df %>%
full_join(x = ., y = ., by = "ID") %>%   # all prescription pairs per ID
# filter(DRUG.x != DRUG.y | START.x != START.y | END.x != END.y) %>%
filter(DRUG.x != DRUG.y) %>%             # only cross-drug pairs
group_by(ID, intersection = intersect(interval(START.x, END.x), interval(START.y, END.y))) %>%
drop_na(intersection) %>%                # NA intersection <=> no overlap
filter(START.x == first(START.x)) %>%    # de-duplicate the mirrored pair
summarise(DRUG = paste(DRUG.x, DRUG.y, sep = "-"),
START = as_date(int_start(intersection)),   # overlap start date
END = as_date(int_end(intersection)),       # overlap end date
.groups = "drop") %>%
select(-intersection)
returning
# A tibble: 1 x 4
ID DRUG START END
<int> <chr> <date> <date>
1 1 DPP4-SU 2020-03-15 2020-04-01
Edit: Changed the filter condition. The former one was flawed.
Updated Solution
I have made considerable modifications based on the newly provided data set. This time I first created interval for each START and END pair and extract the intersecting period between them. As dear Martin nicely made use of them we could use lubridate::int_start and lubridate::int_end to extract the START and END date of each interval:
library(dplyr)
library(lubridate)
library(purrr)
library(tidyr)
# Sort each patient's prescriptions, then compare every interval with the
# NEXT one (map2 over the list shifted by one); is_over holds the
# intersection interval, or NA when consecutive intervals do not overlap.
df %>%
group_by(ID) %>%
arrange(START, END) %>%
mutate(int = interval(START, END),
is_over = c(NA, map2(int[-n()], int[-1],
~ intersect(.x, .y)))) %>%
unnest(cols = c(is_over)) %>%
select(-int) %>%
filter(!is.na(is_over) | !is.na(lead(is_over))) %>%   # keep both rows of each overlap
select(!c(START, END)) %>%
mutate(grp = cumsum(is.na(is_over))) %>%   # grp increments at each overlap's first row
group_by(grp) %>%
summarise(ID = first(ID),
DRUG = paste0(DRUG, collapse = "-"),    # e.g. "DPP4-SU"
is_over = na.omit(is_over)) %>%
mutate(START = int_start(is_over),   # overlap boundaries as POSIXct
END = int_end(is_over)) %>%
select(!is_over)
# A tibble: 1 x 5
grp ID DRUG START END
<int> <int> <chr> <dttm> <dttm>
1 1 1 DPP4-SU 2020-03-15 00:00:00 2020-04-01 00:00:00
Second data set:
# A tibble: 2 x 5
grp ID DRUG START END
<int> <dbl> <chr> <dttm> <dttm>
1 1 3 DPP4-SU 2017-08-05 00:00:00 2017-08-27 00:00:00
2 2 3 SU-DPP4 2017-09-28 00:00:00 2017-12-21 00:00:00
Update
As per updated df
df <- structure(list(ID = c(3, 3, 3, 3, 3, 3, 3), DRUG = c(
"DPP4",
"DPP4", "SU", "SU", "DPP4", "DPP4", "DPP4"
), START = structure(c(
17004,
17383, 17383, 17418, 17437, 17649, 17676
), class = c(
"IDate",
"Date"
)), END = structure(c(
17039, 17405, 17405, 17521, 17625,
17669, 17711
), class = c("IDate", "Date")), duration = c(
35L,
22L, 22L, 103L, 188L, 20L, 35L
), INDEX = c(
1L, 0L, 0L, 0L, 0L,
0L, 0L
)), row.names = c(NA, -7L), class = c(
"tbl_df", "tbl",
"data.frame"
))
we obtain
> dfnew
ID DRUG start end
3.3 3 DPP4-SU 2017-08-05 2017-08-27
3.7 3 SU-DPP4 2017-09-28 2017-12-21
A base R option (not as fancy as the answers by #Anoushiravan R or #Martin Gal)
# For one patient's prescriptions `d` (columns ID, DRUG, START, END), return
# one row per overlapping pair of prescriptions, or a single all-NA row when
# no two intervals overlap.
f <- function(d) {
# Sort by START then END so a larger row index means a later (or equal) start.
d <- d[with(d, order(START, END)), ]
# u[i, j] is TRUE when START[i] < END[j]; a pair satisfying both u and t(u)
# has mutually overlapping intervals. row > col keeps each pair only once.
idx <- subset(
data.frame(which((u <- with(d, outer(START, END, `<`))) & t(u), arr.ind = TRUE)),
row > col
)
if (nrow(idx) == 0) {
# No overlap for this ID: still return the patient, with NA drug/interval.
return(data.frame(ID = unique(d$ID), DRUG = NA, start = NA, end = NA))
}
# Build one result row per overlapping pair: the overlap runs from the later
# START (row index) to the earlier END (col index) of the pair.
with(
d,
do.call(rbind,
apply(
idx,
1,
FUN = function(v) {
data.frame(
ID = ID[v["row"]],
DRUG = paste0(DRUG[sort(unlist(v))], collapse = "-"),
start = START[v["row"]],
end = END[v["col"]]
)
}
))
)
}
dfnew <- do.call(rbind, Map(f, split(df, ~ID)))
gives
> dfnew
ID DRUG start end
1 1 DPP4-SU 2020-03-15 2020-04-01
2 2 <NA> <NA> <NA>
You may use a slightly different approach from the above answers, but this will give you results in format different than required. Obviously, these can be joined to get expected results. You may try this
df <- structure(list(ID = c(3, 3, 3, 3, 3, 3, 3), DRUG = c("DPP4", "DPP4", "SU", "SU", "DPP4", "DPP4", "DPP4"), START = structure(c(17004, 17383, 17383, 17418, 17437, 17649, 17676), class = c("IDate", "Date")), END = structure(c(17039, 17405, 17405, 17521, 17625, 17669, 17711), class = c("IDate", "Date"))), row.names = c(NA, -7L), class = c("tbl_df", "tbl", "data.frame"))
df
#> # A tibble: 7 x 4
#> ID DRUG START END
#> <dbl> <chr> <date> <date>
#> 1 3 DPP4 2016-07-22 2016-08-26
#> 2 3 DPP4 2017-08-05 2017-08-27
#> 3 3 SU 2017-08-05 2017-08-27
#> 4 3 SU 2017-09-09 2017-12-21
#> 5 3 DPP4 2017-09-28 2018-04-04
#> 6 3 DPP4 2018-04-28 2018-05-18
#> 7 3 DPP4 2018-05-25 2018-06-29
library(tidyverse)
# Sweep-line over prescription events: +1 at each START, -1 at each END.
# The ordered factor makes END sort before START on date ties. A running
# count (overlap) above 1 at a START, or above 0 at an END, marks the
# boundaries of a concurrent-medication period.
df %>%
mutate(treatment_id = row_number()) %>%   # remember the source prescription
pivot_longer(c(START, END), names_to = 'event', values_to = 'dates') %>%
mutate(event = factor(event, levels = c('END', 'START'), ordered = TRUE)) %>%
group_by(ID) %>%
arrange(dates, event, .by_group = TRUE) %>%
mutate(overlap = cumsum(ifelse(event == 'START', 1, -1))) %>%   # active count
filter((overlap > 1 & event == 'START') | (overlap > 0 & event == 'END'))
#> # A tibble: 4 x 6
#> # Groups: ID [1]
#> ID DRUG treatment_id event dates overlap
#> <dbl> <chr> <int> <ord> <date> <dbl>
#> 1 3 SU 3 START 2017-08-05 2
#> 2 3 DPP4 2 END 2017-08-27 1
#> 3 3 DPP4 5 START 2017-09-28 2
#> 4 3 SU 4 END 2017-12-21 1
on originally provided data
# A tibble: 2 x 6
# Groups: ID [1]
ID DRUG treatment_id event dates overlap
<int> <chr> <int> <ord> <date> <dbl>
1 1 SU 3 START 2020-03-15 2
2 1 DPP4 2 END 2020-04-01 1
For transforming/getting results in original shape, you may filter overlapping rows
library(tidyverse)
df_new <- structure(list(ID = c(3, 3, 3, 3, 3, 3, 3), DRUG = c("DPP4", "DPP4", "SU", "SU", "DPP4", "DPP4", "DPP4"), START = structure(c(17004, 17383, 17383, 17418, 17437, 17649, 17676), class = c("IDate", "Date")), END = structure(c(17039, 17405, 17405, 17521, 17625, 17669, 17711), class = c("IDate", "Date"))), row.names = c(NA, -7L), class = c("tbl_df", "tbl", "data.frame"))
df_new %>%
mutate(treatment_id = row_number()) %>%
pivot_longer(c(START, END), names_to = 'event', values_to = 'dates') %>%
mutate(event = factor(event, levels = c('END', 'START'), ordered = TRUE)) %>%
group_by(ID) %>%
arrange(dates, event, .by_group = TRUE) %>%
mutate(overlap = cumsum(ifelse(event == 'START', 1, -1))) %>%
filter((overlap > 1 & event == 'START') | (overlap > 0 & event == 'END')) %>%
left_join(df_new %>% mutate(treatment_id = row_number()), by = c('ID', 'DRUG', 'treatment_id'))
#> # A tibble: 4 x 8
#> # Groups: ID [1]
#> ID DRUG treatment_id event dates overlap START END
#> <dbl> <chr> <int> <ord> <date> <dbl> <date> <date>
#> 1 3 SU 3 START 2017-08-05 2 2017-08-05 2017-08-27
#> 2 3 DPP4 2 END 2017-08-27 1 2017-08-05 2017-08-27
#> 3 3 DPP4 5 START 2017-09-28 2 2017-09-28 2018-04-04
#> 4 3 SU 4 END 2017-12-21 1 2017-09-09 2017-12-21
Created on 2021-08-10 by the reprex package (v2.0.0)

Does any column match requirement?

I have a count of stems by tree species for different plots.
structure(list(Plot = c(1, 2), Pine = c(0, 430), Spruce = c(708,
1241), Birch = c(119, 48), Aspen = c(0, 0), Salix = c(0, 0),
Rowan = c(0, 0), Alnus = c(0, 0), stem_sum = c(827, 1719)), row.names = c(NA,
-2L), groups = structure(list(.rows = structure(list(1L, 2L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -2L), class = c("tbl_df",
"tbl", "data.frame")), class = c("rowwise_df", "tbl_df", "tbl",
"data.frame"))
What I want to do is use dplyr 1.0 convention to mutate a new column, "Main species", if any of the tree species columns exceed 80% of the stem_sum of that plot.
My thought process:
df %>% rowwise() %>% mutate(`Main species`= c_across(Pine:Alnus, if(.. / stem/sum >= 0.8, paste(...))
How can I modify this code such that if there are more than one column which fulfils the requirement, the output will be "Mixed"?
You can use :
library(dplyr)
# rowwise + c_across: per plot, test whether any single species column
# reaches 80% of stem_sum. NOTE(review): the labels appear inverted relative
# to the question's intent (a single dominant species is reported as
# 'Mixed') -- verify against the desired semantics.
df %>%
rowwise() %>%
mutate(Main_Species = if(any(c_across(Pine:Alnus) >= 0.8 * stem_sum))
'Mixed' else 'Not Mixed')
# Plot Pine Spruce Birch Aspen Salix Rowan Alnus stem_sum Main_Species
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
#1 1 0 708 119 0 0 0 0 827 Mixed
#2 2 430 1241 48 0 0 0 0 1719 Not Mixed
Or in base R :
# Base R equivalent: rowSums() counts, per row, how many species columns
# (2:8) reach 80% of that row's stem_sum; > 0 means at least one does.
df$Main_species <- ifelse(rowSums(df[2:8] >= df$stem_sum * 0.8) > 0,
'Mixed', 'Not Mixed')
library(tidyverse)
# Long form per species: pct = share of stems. case_when() tags the largest
# share with the species name when it exceeds 0.8, otherwise with "mixed";
# fill() copies that tag down to the plot's remaining rows, and pivot_wider()
# restores the original wide layout with the new main_species column.
df %>%
pivot_longer(-c(Plot, stem_sum)) %>%
arrange(Plot, desc(value)) %>%   # largest species first within each plot
group_by(Plot) %>%
mutate(pct = value/stem_sum,
main_species = case_when(
pct > 0.8 & pct == max(pct) ~ name,   # dominant species name
pct == max(pct) ~ "mixed"             # no species above 80%
)
) %>% ungroup() %>%
fill(main_species, .direction = "down") %>%   # propagate tag to the NA rows
select(-pct) %>%
pivot_wider()

Resources