I have the following data frame:
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
df <- nycflights13::flights %>%
  select(distance) %>%
  group_by(distance) %>%
  summarise(n = n()) %>%
  arrange(distance) %>%
  ungroup()
df
#> # A tibble: 214 x 2
#> distance n
#> <dbl> <int>
#> 1 17 1
#> 2 80 49
#> 3 94 976
#> 4 96 607
#> 5 116 443
#> 6 143 439
#> 7 160 376
#> 8 169 545
#> 9 173 221
#> 10 184 5504
#> # … with 204 more rows
What I want to do is to bin the distance column into bins of size 100 and sum the n column accordingly.
How can I do that?
So you get something like:
bin_distance sum_n
1-100 1633 #(1 + 49 + 976 + 607)
101-200 21344 # (443 + ... + 5327)
#etc
The simplest approach would be to use cut, creating groups with seq for every 100 values, and summing the values for each group.
library(dplyr)
df %>%
  group_by(group = cut(distance, breaks = seq(0, max(distance), 100))) %>%
  summarise(n = sum(n))
# group n
# <fct> <int>
# 1 (0,100] 1633
# 2 (100,200] 21344
# 3 (200,300] 28310
# 4 (300,400] 7748
# 5 (400,500] 21292
# 6 (500,600] 26815
# 7 (600,700] 7846
# 8 (700,800] 48904
# 9 (800,900] 7574
#10 (900,1e+03] 18205
# ... with 17 more rows
which can be translated to base R using aggregate like
aggregate(n ~ distance,
          transform(df, distance = cut(distance, breaks = seq(0, max(distance), 100))),
          sum)
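If you also want labels like "1-100" as in the desired output, cut() takes a labels argument; a sketch (the label format and the rounded-up upper break are my additions):
breaks <- seq(0, ceiling(max(df$distance) / 100) * 100, 100)
labels <- paste(head(breaks, -1) + 1, tail(breaks, -1), sep = "-")
df %>%
  group_by(bin_distance = cut(distance, breaks = breaks, labels = labels)) %>%
  summarise(sum_n = sum(n))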
A different tidyverse solution. It closely follows the logic of @Ronak Shah's code, but instead of cut() it uses cut_width() from ggplot2:
nycflights13::flights %>%
  select(distance) %>%
  group_by(ints = cut_width(distance, width = 100, boundary = 0)) %>%
  summarise(n = n())
ints n
<fct> <int>
1 [0,100] 1633
2 (100,200] 21344
3 (200,300] 28310
4 (300,400] 7748
5 (400,500] 21292
6 (500,600] 26815
7 (600,700] 7846
8 (700,800] 48904
9 (800,900] 7574
10 (900,1e+03] 18205
I would like to know if my approach to calculating the average relative abundance of any taxon is correct.
To calculate the relative abundance (percent) of each family (or any taxon) in a phyloseq object (GlobalPatterns), would the following be correct?
data("GlobalPatterns")
T <- GlobalPatterns %>%
  tax_glom("Family") %>%
  transform_sample_counts(function(x) 100 * x / sum(x)) %>%
  psmelt() %>%
  arrange(OTU) %>%
  rename(OTUsID = OTU) %>%
  select(OTUsID, Family, Sample, Abundance) %>%
  spread(Sample, Abundance)
T$Mean <- rowMeans(T[, 3:ncol(T)])
FAM <- T[, c("Family", "Mean")]
# order data frame
FAM <- FAM[order(dplyr::desc(FAM$Mean)), ]
rownames(FAM) <- NULL
head(FAM)
Family Mean
1 Bacteroidaceae 7.490944
2 Ruminococcaceae 6.038956
3 Lachnospiraceae 5.758200
4 Flavobacteriaceae 5.016402
5 Desulfobulbaceae 3.341026
6 ACK-M1 3.242808
In this case Bacteroidaceae is the most abundant family across all the samples of GlobalPatterns (26 samples and 19216 OTUs): on average it makes up 7.49% of a sample.
Is T$Mean <- rowMeans(T[, 3:ncol(T)]) the correct way to calculate the average for any given taxon?
Bacteroidaceae has the highest abundance if all samples are pooled together.
However, it has the highest abundance in only 2 samples.
Nevertheless, no other taxon has a higher abundance in an average sample.
Let's use dplyr verbs for all the steps to get more descriptive and consistent code:
library(tidyverse)
library(phyloseq)
#> Creating a generic function for 'nrow' from package 'base' in package 'biomformat'
#> Creating a generic function for 'ncol' from package 'base' in package 'biomformat'
#> Creating a generic function for 'rownames' from package 'base' in package 'biomformat'
#> Creating a generic function for 'colnames' from package 'base' in package 'biomformat'
data(GlobalPatterns)
data <-
  GlobalPatterns %>%
  tax_glom("Family") %>%
  transform_sample_counts(function(x) 100 * x / sum(x)) %>%
  psmelt() %>%
  as_tibble()
# highest abundance: all samples pooled together
data %>%
  group_by(Family) %>%
  summarise(Abundance = mean(Abundance)) %>%
  arrange(-Abundance)
#> # A tibble: 334 × 2
#> Family Abundance
#> <chr> <dbl>
#> 1 Bacteroidaceae 7.49
#> 2 Ruminococcaceae 6.04
#> 3 Lachnospiraceae 5.76
#> 4 Flavobacteriaceae 5.02
#> 5 Desulfobulbaceae 3.34
#> 6 ACK-M1 3.24
#> 7 Streptococcaceae 2.77
#> 8 Nostocaceae 2.62
#> 9 Enterobacteriaceae 2.55
#> 10 Spartobacteriaceae 2.45
#> # … with 324 more rows
# sanity check: is total abundance of each sample 100%?
data %>%
  group_by(Sample) %>%
  summarise(Abundance = sum(Abundance)) %>%
  pull(Abundance) %>%
  `==`(100) %>%
  all()
#> [1] TRUE
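One caveat (my addition, not part of the original check): sums of doubles are not guaranteed to compare exactly equal to 100, so dplyr::near() is the safer test:
data %>%
  group_by(Sample) %>%
  summarise(Abundance = sum(Abundance)) %>%
  summarise(all_100 = all(near(Abundance, 100)))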
# get most abundant family for each sample individually
data %>%
  group_by(Sample) %>%
  arrange(-Abundance) %>%
  slice(1) %>%
  select(Family) %>%
  ungroup() %>%
  count(Family, name = "n_samples") %>%
  arrange(-n_samples)
#> Adding missing grouping variables: `Sample`
#> # A tibble: 18 × 2
#> Family n_samples
#> <chr> <int>
#> 1 Desulfobulbaceae 3
#> 2 Bacteroidaceae 2
#> 3 Crenotrichaceae 2
#> 4 Flavobacteriaceae 2
#> 5 Lachnospiraceae 2
#> 6 Ruminococcaceae 2
#> 7 Streptococcaceae 2
#> 8 ACK-M1 1
#> 9 Enterobacteriaceae 1
#> 10 Moraxellaceae 1
#> 11 Neisseriaceae 1
#> 12 Nostocaceae 1
#> 13 Solibacteraceae 1
#> 14 Spartobacteriaceae 1
#> 15 Sphingomonadaceae 1
#> 16 Synechococcaceae 1
#> 17 Veillonellaceae 1
#> 18 Verrucomicrobiaceae 1
Created on 2022-06-10 by the reprex package (v2.0.0)
I have two large datasets that I want to match with each other.
library(tidyverse)
df1 <- tibble(position=c(10,11,200,250,300))
df1
#> # A tibble: 5 × 1
#> position
#> <dbl>
#> 1 10
#> 2 11
#> 3 200
#> 4 250
#> 5 300
df2 <- tibble(start = c(1, 10, 200, 251),
              end = c(20, 100, 250, 350),
              name = c("geneA", "geneB", "geneC", "geneD"))
df2
#> # A tibble: 4 × 3
#> start end name
#> <dbl> <dbl> <chr>
#> 1 1 20 geneA
#> 2 10 100 geneB
#> 3 200 250 geneC
#> 4 251 350 geneD
Created on 2022-03-03 by the reprex package (v2.0.1)
I have the positions of the genes in df1, and based on the ranges (start-end) in df2 I want to find which genes can be found at each position.
I want my data to look like this
position start end name
<dbl> <dbl> <dbl> <chr>
1 10 1 20 geneA
2 10 10 100 geneB
3 11 1 20 geneA
4 11 10 100 geneB
5 200 200 250 geneC
6 250 200 250 geneC
7 300 251 350 geneD
One way to solve this could be through crossing and filtering
df1 %>%
  crossing(df2) %>%
  filter(position >= start & position <= end)
However, my dataset is far too large and I cannot afford crossing or expanding. Any other ideas?
1) SQL engines can perform such operations without crossing. (It may be possible to speed it up even more if you add indexes.)
library(sqldf)
sqldf("select *
from df1 a
join df2 b on a.position between b.start and b.end")
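For example, a sketch of the indexed variant (the index name is illustrative; this relies on sqldf accepting a vector of statements that run against the same database, with main.df2 referring to the already-imported table):
sqldf(c("create index idx_df2_range on df2(start, end)",
        "select *
         from df1 a
         join main.df2 b on a.position between b.start and b.end"))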
2) data.table can also do some SQL-like operations. (Be careful: the first variable in each comparison must be from the first data table and the second from the second. They can't be reordered, so, for example, the first comparison could not be written as position >= start even though it is mathematically the same.) Again, adding indexes may improve the speed.
library(data.table)
dt1 <- as.data.table(df1)
# keep copies of start/end: in a non-equi join the on= columns of the result
# take the values of the matching position, so start2/end2 preserve the originals
dt2 <- as.data.table(df2)[, c("start2", "end2") := .(start, end)]
dt2[dt1, on = .(start <= position, end >= position)]
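If you also want the result shaped like the desired output (position plus the original start/end), you can build it in j; a sketch, where the i. prefix refers to columns of dt1:
dt2[dt1, on = .(start <= position, end >= position),
    .(position = i.position, start = start2, end = end2, name)]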
crossing is a wrapper around expand_grid that does additional work, e.g. de-duplicating and sorting its inputs. You can use expand_grid directly:
library(tidyverse)
df1 <- tibble(position = c(10, 11, 200, 250, 300))
df1
#> # A tibble: 5 × 1
#> position
#> <dbl>
#> 1 10
#> 2 11
#> 3 200
#> 4 250
#> 5 300
df2 <- tibble(
  start = c(1, 10, 200, 251),
  end = c(20, 100, 250, 350),
  name = c("geneA", "geneB", "geneC", "geneD")
)
expand_grid(df1, df2) %>%
  filter(position >= start & position <= end)
#> # A tibble: 7 × 4
#> position start end name
#> <dbl> <dbl> <dbl> <chr>
#> 1 10 1 20 geneA
#> 2 10 10 100 geneB
#> 3 11 1 20 geneA
#> 4 11 10 100 geneB
#> 5 200 200 250 geneC
#> 6 250 200 250 geneC
#> 7 300 251 350 geneD
Created on 2022-03-03 by the reprex package (v2.0.0)
Here is a dplyr way (sort of).
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
df1 <- tibble(position = c(10, 11, 200, 250, 300))
df2 <- tibble(
  start = c(1, 10, 200, 251),
  end = c(20, 100, 250, 350),
  name = c("geneA", "geneB", "geneC", "geneD")
)
# for each value of `col` in `data`, return the rows of `data2` whose
# [start, end] range contains it
vbetween <- function(data, col, data2, start, end) {
  f <- function(x, l, r) l <= x & x <= r
  col <- enquo(col)
  start <- enquo(start)
  end <- enquo(end)
  x <- data %>% pull(!!col)
  l <- data2 %>% pull(!!start)
  r <- data2 %>% pull(!!end)
  yes <- lapply(x, f, l = l, r = r)  # one logical index into data2 per position
  lapply(yes, \(i) data2[i, ])
}
df1 %>% vbetween(position, df2, start, end) %>% bind_rows()
#> # A tibble: 7 x 3
#> start end name
#> <dbl> <dbl> <chr>
#> 1 1 20 geneA
#> 2 10 100 geneB
#> 3 1 20 geneA
#> 4 10 100 geneB
#> 5 200 250 geneC
#> 6 200 250 geneC
#> 7 251 350 geneD
Created on 2022-03-03 by the reprex package (v2.0.1)
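The desired output also keeps the position column, which bind_rows() alone drops here. One way to add it back (a sketch reusing vbetween() from above; map2() pairs each result with its position):
library(purrr)
df1 %>%
  vbetween(position, df2, start, end) %>%
  map2(df1$position, \(d, p) mutate(d, position = p, .before = 1)) %>%
  bind_rows()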
The following function behaves as desired: several variables can be passed to group_by without the need to put them into a list() or dplyr::vars():
mean_by_grp <- function(df, meanvar, grp) {
  grouping <- enexpr(grp) %>%
    expr_deparse() %>%
    str_split(",", simplify = TRUE) %>% `[`(1, ) %>%
    map(str_remove, "c\\(") %>% map(str_remove, "\\)") %>% map(str_trim) %>%
    unlist() %>% syms()
  df %>%
    group_by(!!!grouping) %>%
    summarise("average_{{meanvar}}" := mean({{meanvar}}, na.rm = TRUE),
              .groups = 'drop')
}
starwars %>% mean_by_grp(height, species)
starwars %>% mean_by_grp(height, c(species, homeworld))
However, it is complicated: I need to turn c(var1,....varn) into a string, split it, and turn it into a list of symbols with syms() so I can splice them into group_by().
Isn't there a much easier way to do this?
Of course, I could use ellipses instead of grp, but then only one argument could pass multiple symbols on to another function.
One option would be dplyr::across:
mean_by_grp <- function(df, meanvar, grp) {
  df %>%
    group_by(across({{ grp }})) %>%
    summarise("average_{{meanvar}}" := mean({{meanvar}}, na.rm = TRUE),
              .groups = 'drop')
}
library(dplyr)
starwars %>% mean_by_grp(height, species)
#> # A tibble: 38 × 2
#> species average_height
#> <chr> <dbl>
#> 1 Aleena 79
#> 2 Besalisk 198
#> 3 Cerean 198
#> 4 Chagrian 196
#> 5 Clawdite 168
#> 6 Droid 131.
#> 7 Dug 112
#> 8 Ewok 88
#> 9 Geonosian 183
#> 10 Gungan 209.
#> # … with 28 more rows
starwars %>% mean_by_grp(height, c(species, homeworld))
#> # A tibble: 58 × 3
#> species homeworld average_height
#> <chr> <chr> <dbl>
#> 1 Aleena Aleen Minor 79
#> 2 Besalisk Ojom 198
#> 3 Cerean Cerea 198
#> 4 Chagrian Champala 196
#> 5 Clawdite Zolan 168
#> 6 Droid Naboo 96
#> 7 Droid Tatooine 132
#> 8 Droid <NA> 148
#> 9 Dug Malastare 112
#> 10 Ewok Endor 88
#> # … with 48 more rows
The sample data frame:
no <- rep(1:5, each=2)
type <- rep(LETTERS[1:2], times=5)
set.seed(4)
value <- round(runif(10, 10, 30))
df <- data.frame(no, type, value)
df
no type value
1 1 A 22
2 1 B 10
3 2 A 16
4 2 B 16
5 3 A 26
6 3 B 15
7 4 A 24
8 4 B 28
9 5 A 29
10 5 B 11
Now what I want is to calculate the percentage of value for each type (A or B) and create separate columns. The desired output is something like this:
no pct_A pct_B total_value
1 1 68.75000 31.25000 32
2 2 50.00000 50.00000 32
3 3 63.41463 36.58537 41
4 4 46.15385 53.84615 52
5 5 72.50000 27.50000 40
What I have tried so far (this gives the right output, but the process seems very sub-optimal):
df %>%
  group_by(no) %>%
  mutate(total_value = sum(value)) -> df
df %>%
  mutate(pct_A = ifelse(type == 'A', (value / total_value) * 100, 0),
         pct_B = ifelse(type == 'B', (value / total_value) * 100, 0)) %>%
  group_by(no) %>%
  summarise(pct_A = sum(pct_A),
            pct_B = sum(pct_B)) %>%
  ungroup() %>%
  merge(df) %>%
  distinct(no, .keep_all = TRUE) %>%
  select(-type, -value)
Is there any better way to do that, especially using dplyr?
I looked for other answers too, but none helped. This one came closest:
R Create new column of values based on the factor levels of another column
You could do it in base using aggregate.
do.call(data.frame,
        aggregate(value ~ no, df, \(x) c(proportions(x), sum(x)))) |>
  setNames(c('no', 'pct_A', 'pct_B', 'total_value'))
# no pct_A pct_B total_value
# 1 1 0.6875000 0.3125000 32
# 2 2 0.5000000 0.5000000 32
# 3 3 0.6341463 0.3658537 41
# 4 4 0.4615385 0.5384615 52
# 5 5 0.7250000 0.2750000 40
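Note that this returns proportions rather than the percentages in the desired output; scaling by 100 inside the function fixes that (a small tweak of the code above):
do.call(data.frame,
        aggregate(value ~ no, df, \(x) c(proportions(x) * 100, sum(x)))) |>
  setNames(c('no', 'pct_A', 'pct_B', 'total_value'))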
For each no we can calculate the total and the percentage, then reshape the data to wide format.
library(dplyr)
library(tidyr)
df %>%
  group_by(no) %>%
  mutate(total_value = sum(value),
         value = prop.table(value) * 100) %>%
  ungroup() %>%
  pivot_wider(names_from = type, values_from = value, names_prefix = 'pct_')
# no total_value pct_A pct_B
# <int> <dbl> <dbl> <dbl>
#1 1 32 68.8 31.2
#2 2 32 50 50
#3 3 41 63.4 36.6
#4 4 52 46.2 53.8
#5 5 40 72.5 27.5
Here are two more ways to do this.
We could use purrr::map_dfc. However, setting up the correct column names is kind of cumbersome:
library(dplyr)
library(purrr)
df %>%
  group_by(no) %>%
  summarise(total_value = sum(value),
            map_dfc(unique(type) %>% set_names(., paste0("pct_", .)),
                    ~ sum((type == .x) * value) / total_value * 100))
#> # A tibble: 5 x 4
#> no total_value pct_A pct_B
#> <int> <dbl> <dbl> <dbl>
#> 1 1 32 68.8 31.2
#> 2 2 32 50 50
#> 3 3 41 63.4 36.6
#> 4 4 52 46.2 53.8
#> 5 5 40 72.5 27.5
Alternatively, we can use dplyover::over (disclaimer: I'm the maintainer), which allows us to create names on the fly in an across-like way:
library(dplyover) # https://github.com/TimTeaFan/dplyover
df %>%
  group_by(no) %>%
  summarise(total_value = sum(value),
            over(dist_values(type), # alternatively `unique(type)`
                 ~ sum((type == .x) * value) / total_value * 100,
                 .names = "pct_{x}"))
#> # A tibble: 5 x 4
#> no total_value pct_A pct_B
#> <int> <dbl> <dbl> <dbl>
#> 1 1 32 68.8 31.2
#> 2 2 32 50 50
#> 3 3 41 63.4 36.6
#> 4 4 52 46.2 53.8
#> 5 5 40 72.5 27.5
Created on 2021-09-17 by the reprex package (v2.0.1)
Performance-wise, both approaches should be faster than data-rectangling approaches such as pivot_wider (but I haven't tested this specific scenario).
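If you want to check that on your own data, here is a minimal sketch using the bench package (my choice of benchmarking tool, with the packages from above loaded; check = FALSE because the two results differ in column order):
library(bench)
bench::mark(
  check = FALSE,
  pivot_wider = df %>%
    group_by(no) %>%
    mutate(total_value = sum(value),
           value = prop.table(value) * 100) %>%
    ungroup() %>%
    pivot_wider(names_from = type, values_from = value, names_prefix = "pct_"),
  over = df %>%
    group_by(no) %>%
    summarise(total_value = sum(value),
              over(dist_values(type),
                   ~ sum((type == .x) * value) / total_value * 100,
                   .names = "pct_{x}"))
)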
This question is similar to other problems with very large data in R, but I can't find an example of how to merge/join and then perform calculations on two data frames (as opposed to reading in lots of data frames and using mclapply to do the calculations). Here the problem is not loading the data (it takes ~20 min, but they do load), but rather the merging and summarising.
I've tried all the data.table approaches I could find, different types of joins, and ff, and I still run into the vecseq limit of 2^31 rows. Now I'm trying to use multidplyr to do it in parallel, but can't figure out where the error is coming from.
Dataframes:
species_data # df with ~ 65 million rows with cols <- c("id","species_id")
lookup # df with ~ 17 million rows with cols <- c("id","cell_id","rgn_id")
Not all ids in the lookup appear in the species_data
## make sample dataframes:
lookup <- data.frame(id = seq(2001, 2500, by = 1),
                     cell_id = seq(1, 500, by = 1),
                     rgn_id = seq(801, 1300, by = 1))
library(stringi)
species_id <- sprintf("%s%s%s",
                      stri_rand_strings(n = 1000, length = 3, pattern = "[A-Z]"),
                      pattern = "-",
                      stri_rand_strings(1000, length = 5, '[1-9]'))
id <- sprintf("%s%s%s",
              stri_rand_strings(n = 1000, length = 1, pattern = "[2]"),
              stri_rand_strings(n = 1000, length = 1, pattern = "[0-4]"),
              stri_rand_strings(n = 1000, length = 1, pattern = "[0-9]"))
species_data <- data.frame(species_id, id)
Merge and join dfs with multidplyr:
library(tidyverse)
install.packages("devtools")
library(devtools)
devtools::install_github("hadley/multidplyr")
library(multidplyr)
library(parallel)
species_summary <- species_data %>%
  # partition the species data by species id
  partition(species_id, cluster = cluster) %>%
  left_join(species_data, lookup, by = "id") %>%
  dplyr::select(-id) %>%
  group_by(species_id) %>%
  ## total number of cells each species occurs in
  mutate(tot_count_cells = n_distinct(cell_id)) %>%
  ungroup() %>%
  dplyr::select(c(cell_id, species_id, rgn_id, tot_count_cells)) %>%
  group_by(rgn_id, species_id) %>%
  ## number of cells each species occurs in each region
  summarise(count_cells_eez = n_distinct(cell_id)) %>%
  collect() %>%
  as_tibble()
## Error in partition(., species_id, cluster = cluster) : unused argument (species_id)
## If I change it to:
species_summary <- species_data %>%
  group_by(species_id) %>%
  partition(cluster = cluster) %>% ...
## I get: "Error in worker_id(data, cluster) : object 'cluster' not found"
This is my first attempt at parallel processing and big data, and I'm struggling to diagnose the errors.
Thanks!
First I load dplyr and multidplyr
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(multidplyr)
my_clusters <- new_cluster(3) # I have 4 cores
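If you are unsure how many cores your machine has, parallel::detectCores() reports it; leaving one core free is a common convention (my suggestion, not a multidplyr requirement):
library(parallel)
my_clusters <- new_cluster(detectCores() - 1)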
Then I load the same data that you propose:
library(stringi)
lookup <- tibble(
  id = as.character(seq(2001, 2500, by = 1)),
  cell_id = seq(1, 500, by = 1),
  rgn_id = sprintf("%s", stri_rand_strings(n = 500, length = 3, pattern = "[0-9]"))
)
species_id <- sprintf(
  "%s%s%s",
  stri_rand_strings(n = 1000, length = 3, pattern = "[A-Z]"),
  pattern = "-",
  stri_rand_strings(n = 1000, length = 5, "[1-9]")
)
id <- sprintf(
  "%s%s%s",
  stri_rand_strings(n = 1000, length = 1, pattern = "[2]"),
  stri_rand_strings(n = 1000, length = 1, pattern = "[0-4]"),
  stri_rand_strings(n = 1000, length = 1, pattern = "[0-9]")
)
species_data <- tibble(species_id, id)
Check the result
species_data
#> # A tibble: 1,000 x 2
#> species_id id
#> <chr> <chr>
#> 1 CUZ-98293 246
#> 2 XDG-61673 234
#> 3 WFZ-94338 230
#> 4 UIH-97549 226
#> 5 AGE-35257 229
#> 6 BMD-75361 249
#> 7 MJB-78799 226
#> 8 STS-15141 225
#> 9 RXD-18645 245
#> 10 SKZ-58666 243
#> # ... with 990 more rows
lookup
#> # A tibble: 500 x 3
#> id cell_id rgn_id
#> <chr> <dbl> <chr>
#> 1 2001 1 649
#> 2 2002 2 451
#> 3 2003 3 532
#> 4 2004 4 339
#> 5 2005 5 062
#> 6 2006 6 329
#> 7 2007 7 953
#> 8 2008 8 075
#> 9 2009 9 008
#> 10 2010 10 465
#> # ... with 490 more rows
Now I can run the code using a multidplyr approach. I divide the dplyr code into two steps according to the two group_by() calls.
first_step <- species_data %>%
  left_join(lookup, by = "id") %>%
  select(-id) %>%
  group_by(species_id) %>%
  partition(my_clusters) %>%
  mutate(tot_count_cells = n_distinct(cell_id)) %>%
  collect() %>%
  ungroup()
first_step
#> # A tibble: 1,000 x 4
#> species_id cell_id rgn_id tot_count_cells
#> <chr> <dbl> <chr> <int>
#> 1 UIH-97549 NA <NA> 1
#> 2 BMD-75361 NA <NA> 1
#> 3 STS-15141 NA <NA> 1
#> 4 RXD-18645 NA <NA> 1
#> 5 HFI-78676 NA <NA> 1
#> 6 KVP-45194 NA <NA> 1
#> 7 SGW-29988 NA <NA> 1
#> 8 WBI-79521 NA <NA> 1
#> 9 MFY-86277 NA <NA> 1
#> 10 BHO-37621 NA <NA> 1
#> # ... with 990 more rows
and
second_step <- first_step %>%
  group_by(rgn_id, species_id) %>%
  partition(my_clusters) %>%
  summarise(count_cells_eez = n_distinct(cell_id)) %>%
  collect() %>%
  ungroup()
second_step
#> # A tibble: 1,000 x 3
#> rgn_id species_id count_cells_eez
#> <chr> <chr> <int>
#> 1 <NA> ABB-24645 1
#> 2 <NA> ABY-98559 1
#> 3 <NA> AEQ-42462 1
#> 4 <NA> AFO-58569 1
#> 5 <NA> AKQ-44439 1
#> 6 <NA> AMF-23978 1
#> 7 <NA> ANF-49159 1
#> 8 <NA> APD-85367 1
#> 9 <NA> AQH-64126 1
#> 10 <NA> AST-77513 1
#> # ... with 990 more rows
Created on 2020-03-21 by the reprex package (v0.3.0)