Sampling the same number of imbalanced classes from two data frames - r

I am trying to sample the same number of classes from two data frames of differing sizes. I can do this manually, but the number of classes in some of my data frames is quite large.
I have been able to use the dplyr::count function to get the classes of interest and their counts from the smaller data frame, and I extract these classes and counts as vectors. I then attempted to create a function that takes these vectors and to call it with mapply, so I can create filtered slices for each class and re-join the list of results using do.call, but I get errors when I run mapply.
Below are example datasets. df is the smaller data frame, with 6 rows where ControlVarA == "Group_1" and 10 rows where ControlVarA == "Group_2"; I want to extract the same number of rows per class from the larger data frame df2 (which has 6 rows where ControlVarA == "Group_1" and 20 rows where ControlVarA == "Group_2").
df <- data.frame("ID" = 1:16)
df$VarA <- c(1,1,1,1,1,1,1,1,1,1,1,14,NA_real_,NA_real_,NA_real_,16)
df$VarB <- c(10,0,0,0,12,12,12,12,0,14,NA_real_,14,16,16,16,16)
df$VarC <- c(10,12,14,16,10,12,14,16,10,12,14,16,10,12,14,16)
df$VarD <- c(10,12,14,16,10,12,14,16,10,12,14,16,10,12,14,16)
df$ControlVarA <- factor(c("Group_1","Group_1","Group_1","Group_1","Group_1","Group_1",
                           "Group_2","Group_2","Group_2","Group_2","Group_2","Group_2",
                           "Group_2","Group_2","Group_2","Group_2"))
df
df2 <- data.frame("ID" = 1:26)
df2$VarA <- c(1,1,1,1,1,1,1,1,1,1,1,14,NA_real_,NA_real_,NA_real_,16,16,16,16,16,16,16,16,16,16,16)
df2$VarB <- c(10,0,0,0,12,12,12,12,0,14,NA_real_,14,16,16,16,16,16,16,16,16,16,16,16,16,16,16)
df2$VarC <- c(10,12,14,16,10,12,14,16,10,12,14,16,10,12,14,16,16,16,16,16,16,16,16,16,16,16)
df2$VarD <- c(10,12,14,16,10,12,14,16,10,12,14,16,10,12,14,16,16,16,16,16,16,16,16,16,16,16)
df2$ControlVarA <- factor(c("Group_1","Group_1","Group_1","Group_1","Group_1","Group_1",
                            "Group_2","Group_2","Group_2","Group_2","Group_2","Group_2",
                            "Group_2","Group_2","Group_2","Group_2","Group_2","Group_2",
                            "Group_2","Group_2","Group_2","Group_2","Group_2","Group_2","Group_2","Group_2"))
df2
To extract the class names and class counts I use the code below.
slice_vars <- df %>%
  count(ControlVarA) %>%
  filter(!is.na(.)) %>%
  t() %>%
  janitor::row_to_names(1) %>%
  colnames()
slice_nums <- df %>%
  count(ControlVarA) %>%
  filter(!is.na(.)) %>%
  t() %>%
  janitor::row_to_names(2) %>%
  as.data.frame() %>%
  rename_with(~ gsub(" ", "", .x)) %>%
  colnames() %>%
  as.numeric()
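Incidentally, the same two vectors can be pulled straight out of the count() result without the transpose/janitor steps; a minimal sketch:
counts <- df %>%
  count(ControlVarA) %>%
  filter(!is.na(ControlVarA))
slice_vars <- as.character(counts$ControlVarA)  # class names
slice_nums <- counts$n                          # class counts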
The function I created and the mapply call are below.
func_group <- function(dataset, x, y) {
  dataset %>%
    group_by(ControlVarA) %>%
    slice_sample(n = all_of(x)) %>%
    ungroup() %>%
    filter(ControlVarA == data[[y]])
}
combine_lists <- mapply(func_group, slice_nums, slice_vars, MoreArgs = list(dataset = df2))
do.call(rbind, combine_lists)
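For reference, here is a sketch of how the mapply() approach could be repaired. This is my reading of the intent (filter df2 to each class, then sample that class's count), so treat it as a hypothetical fix rather than the canonical one; note that mapply() needs SIMPLIFY = FALSE so do.call(rbind, ...) receives a list of data frames.
# Hypothetical fix (assumed intent): filter to the class first, then sample;
# assumes dplyr is loaded.
func_group_fixed <- function(x, y, dataset) {
  dataset %>%
    filter(ControlVarA == y) %>%  # keep only the current class
    slice_sample(n = x)           # sample as many rows as df has for it
}
combine_lists <- mapply(func_group_fixed, slice_nums, slice_vars,
                        MoreArgs = list(dataset = df2), SIMPLIFY = FALSE)
do.call(rbind, combine_lists)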

Use count to get the number of rows for each value of ControlVarA, join the result with df2, and select n random rows from each group using sample_n. (Unfortunately, slice_sample(n = first(n)) returns an error; a workaround is sketched after the output below.)
library(dplyr)
df %>%
  count(ControlVarA) %>%
  left_join(df2, by = 'ControlVarA') %>%
  group_by(ControlVarA) %>%
  sample_n(first(n)) %>%
  ungroup %>%
  select(-n)
# ControlVarA ID VarA VarB VarC VarD
# <fct> <int> <dbl> <dbl> <dbl> <dbl>
# 1 Group_1 1 1 10 10 10
# 2 Group_1 4 1 0 16 16
# 3 Group_1 3 1 0 14 14
# 4 Group_1 2 1 0 12 12
# 5 Group_1 5 1 12 10 10
# 6 Group_1 6 1 12 12 12
# 7 Group_2 12 14 14 16 16
# 8 Group_2 25 16 16 16 16
# 9 Group_2 15 NA 16 14 14
#10 Group_2 22 16 16 16 16
#11 Group_2 9 1 0 10 10
#12 Group_2 8 1 12 16 16
#13 Group_2 24 16 16 16 16
#14 Group_2 21 16 16 16 16
#15 Group_2 7 1 12 14 14
#16 Group_2 14 NA 16 12 12
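If you would rather stay with slice_sample(), one workaround (a sketch, assuming dplyr >= 1.0 for group_modify()) is to sample inside group_modify(), where n is a within-group constant:
set.seed(2021)  # for reproducible sampling
df %>%
  count(ControlVarA) %>%
  left_join(df2, by = 'ControlVarA') %>%
  group_by(ControlVarA) %>%
  group_modify(~ slice_sample(.x, n = .x$n[1])) %>%  # n is constant per group
  ungroup() %>%
  select(-n)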

library(tidyverse)
df <- data.frame("ID" = 1:16)
df$VarA <- c(1,1,1,1,1,1,1,1,1,1,1,14,NA_real_,NA_real_,NA_real_,16)
df$VarB <- c(10,0,0,0,12,12,12,12,0,14,NA_real_,14,16,16,16,16)
df$VarC <- c(10,12,14,16,10,12,14,16,10,12,14,16,10,12,14,16)
df$VarD <- c(10,12,14,16,10,12,14,16,10,12,14,16,10,12,14,16)
df$ControlVarA <- factor(c("Group_1","Group_1","Group_1","Group_1","Group_1","Group_1",
                           "Group_2","Group_2","Group_2","Group_2","Group_2","Group_2",
                           "Group_2","Group_2","Group_2","Group_2"))
df2 <- data.frame("ID" = 1:26)
df2$VarA <- c(1,1,1,1,1,1,1,1,1,1,1,14,NA_real_,NA_real_,NA_real_,16,16,16,16,16,16,16,16,16,16,16)
df2$VarB <- c(10,0,0,0,12,12,12,12,0,14,NA_real_,14,16,16,16,16,16,16,16,16,16,16,16,16,16,16)
df2$VarC <- c(10,12,14,16,10,12,14,16,10,12,14,16,10,12,14,16,16,16,16,16,16,16,16,16,16,16)
df2$VarD <- c(10,12,14,16,10,12,14,16,10,12,14,16,10,12,14,16,16,16,16,16,16,16,16,16,16,16)
df2$ControlVarA <- factor(c("Group_1","Group_1","Group_1","Group_1","Group_1","Group_1",
                            "Group_2","Group_2","Group_2","Group_2","Group_2","Group_2",
                            "Group_2","Group_2","Group_2","Group_2","Group_2","Group_2",
                            "Group_2","Group_2","Group_2","Group_2","Group_2","Group_2","Group_2","Group_2"))
df <- as_tibble(df) %>%
  mutate(table = "df")
df2 <- as_tibble(df2) %>%
  mutate(table = "df2")
final_df <- df %>%
  bind_rows(df2)
set.seed(2021)
final_df %>%
  filter(!if_any(.cols = VarA:VarD, is.na)) %>%
  group_by(table, ControlVarA) %>%
  slice_sample(n = 5)
#> # A tibble: 20 x 7
#> # Groups: table, ControlVarA [4]
#> ID VarA VarB VarC VarD ControlVarA table
#> <int> <dbl> <dbl> <dbl> <dbl> <fct> <chr>
#> 1 6 1 12 12 12 Group_1 df
#> 2 2 1 0 12 12 Group_1 df
#> 3 3 1 0 14 14 Group_1 df
#> 4 5 1 12 10 10 Group_1 df
#> 5 4 1 0 16 16 Group_1 df
#> 6 16 16 16 16 16 Group_2 df
#> 7 9 1 0 10 10 Group_2 df
#> 8 8 1 12 16 16 Group_2 df
#> 9 10 1 14 12 12 Group_2 df
#> 10 7 1 12 14 14 Group_2 df
#> 11 1 1 10 10 10 Group_1 df2
#> 12 4 1 0 16 16 Group_1 df2
#> 13 3 1 0 14 14 Group_1 df2
#> 14 2 1 0 12 12 Group_1 df2
#> 15 6 1 12 12 12 Group_1 df2
#> 16 22 16 16 16 16 Group_2 df2
#> 17 23 16 16 16 16 Group_2 df2
#> 18 9 1 0 10 10 Group_2 df2
#> 19 18 16 16 16 16 Group_2 df2
#> 20 20 16 16 16 16 Group_2 df2
Created on 2021-07-13 by the reprex package (v2.0.0)
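If you don't want to hard-code n = 5, you can compute the size of the smallest table-by-group cell first (a sketch, assuming the common minimum is the sample size you want):
n_min <- final_df %>%
  filter(!if_any(.cols = VarA:VarD, is.na)) %>%
  count(table, ControlVarA) %>%  # cell sizes after dropping NA rows
  pull(n) %>%
  min()
final_df %>%
  filter(!if_any(.cols = VarA:VarD, is.na)) %>%
  group_by(table, ControlVarA) %>%
  slice_sample(n = n_min)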

Related

How to reorder column values in ascending order that are separated by "," and only keep the first value in R

I have a column in a df that consists of values like so:
ID
2
NA
1
3
4
5,7
9,6,10
12
15
16
17
NA
19
22,23
I would like to reorder the values in every row in ascending order. Note: this column is a character field and some rows are already in the correct order.
From there, I only want to keep the first value and remove the others.
Desired output:
ID
2
NA
1
3
4
5
6
12
15
16
17
NA
19
22
You can split the data on the comma, sort the values, and extract the first one.
df$ID <- sapply(strsplit(df$ID, ','), function(x) sort(as.numeric(x))[1])
# ID
#1 2
#2 NA
#3 1
#4 3
#5 4
#6 5
#7 6
#8 12
#9 15
#10 16
#11 17
#12 NA
#13 19
#14 22
A couple of tidyverse alternatives.
library(tidyverse)
#1.
#Same as base R but in tidyverse
df %>% mutate(ID = map_dbl(str_split(ID, ','), ~sort(as.numeric(.x))[1]))
#2.
df %>%
  mutate(row = row_number()) %>%
  separate_rows(ID, sep = ',', convert = TRUE) %>%
  group_by(row) %>%
  summarise(ID = min(ID)) %>%
  select(-row)
Here is another tidyverse solution, making use of dplyr, purrr, stringr, and readr:
library(tidyverse)
df %>%
  mutate(ID = map_chr(str_split(ID, ","), ~ toString(sort(as.numeric(.x)))),
         ID = parse_number(ID))
output:
ID
1 2
2 NA
3 1
4 3
5 4
6 5
7 6
8 12
9 15
10 16
11 17
12 NA
13 19
14 22
We may use the minimum instead of sorting / extracting:
DF <- transform(DF, ID=sapply(strsplit(ID, ','), \(x) min(as.double(x))))
DF
# ID
# 1 2
# 2 NA
# 3 1
# 4 3
# 5 4
# 6 5
# 7 6
# 8 12
# 9 15
# 10 16
# 11 17
# 12 NA
# 13 19
# 14 22
We could use str_extract to grab the first number in each string. Note that this keeps the first listed value (e.g. 9 for "9,6,10") rather than the sorted minimum, which is why row 7 differs from the desired output:
library(stringr)
library(dplyr)
df1 %>%
  mutate(ID = as.numeric(str_extract(ID, '\\d+')))
Output:
ID
1 2
2 NA
3 1
4 3
5 4
6 5
7 9
8 12
9 15
10 16
11 17
12 NA
13 19
14 22
data
df1 <- structure(list(ID = c("2", NA, "1", "3", "4", "5,7", "9,6,10",
"12", "15", "16", "17", NA, "19", "22,23")), class = "data.frame", row.names = c(NA,
-14L))

In search of a more efficient solution converting Wide data to long data

I want to convert the data from wide to long. I have solved the problem with reshape, but I had to manually define which columns belonged to which "gather" group; if there are hundreds of columns (which is the case in my data), that would be time-consuming with a high risk of typing errors.
Does anyone know a more efficient way to reach this result?
id <- 1001:1003
qA2 <- c(10,5,1)
qB2 <- c(11,6,3)
qC2 <- c(10,7,5)
qA3 <- c(15,12,8)
qB3 <- c(18,15,7)
qC3 <- c(19,11,10)
df <- data.frame(id,qA2,qB2,qC2, qA3, qB3, qC3)
df
id qA2 qB2 qC2 qA3 qB3 qC3
1 1001 10 11 10 15 18 19
2 1002 5 6 7 12 15 11
3 1003 1 3 5 8 7 10
Solution with the reshape() function (note: reshape() comes from base R's stats package; the reshape2 library is not needed for it):
df_test <- reshape(df, idvar = "id", direction = "long",
                   varying = list(c(2, 5), c(3, 6), c(4, 7)),
                   v.names = c("qA", "qB", "qC"), times = 2:3)
df_test
df_test <- df_test[order(df_test$id, df_test$time),]
id time qA qB qC
1001.2 1001 2 10 11 10
1001.3 1001 3 15 18 19
1002.2 1002 2 5 6 7
1002.3 1002 3 12 15 11
1003.2 1003 2 1 3 5
1003.3 1003 3 8 7 10
Using dplyr and tidyr, here is one way (not sure about the efficiency, though):
library(dplyr)
library(tidyr)
df %>%
  gather(key, value, -id) %>%
  mutate(key = sub("\\d+", "", key)) %>%
  group_by(key) %>%
  mutate(row = row_number()) %>%
  spread(key, value) %>%
  select(-row)
# A tibble: 6 x 4
# id qA qB qC
# <int> <dbl> <dbl> <dbl>
#1 1001 10 11 10
#2 1001 15 18 19
#3 1002 5 6 7
#4 1002 12 15 11
#5 1003 1 3 5
#6 1003 8 7 10
With the new version of tidyr (1.0.0) (already on CRAN, just update it):
library(dplyr)
library(tidyr)
df %>%
  pivot_longer(cols = starts_with("q"),
               names_to = c(".value", "time"),
               names_pattern = "(q[A-Z])(\\d+)")
Here is a base R one-liner:
df1 <- cbind(id = df$id, do.call(cbind, lapply(split.default(df[-1],
               gsub('\\d+', '', names(df)[-1])), stack))[c(TRUE, FALSE)])
df1[with(df1, order(id)),]
# id qA.values qB.values qC.values
#1 1001 10 11 10
#4 1001 15 18 19
#2 1002 5 6 7
#5 1002 12 15 11
#3 1003 1 3 5
#6 1003 8 7 10
We can use names_pattern with pivot_longer
library(tidyr)
pivot_longer(df, -id, names_to = c(".value", "time"), names_pattern = "(\\D+)(\\d+)")
# A tibble: 6 x 5
# id time qA qB qC
# <int> <chr> <dbl> <dbl> <dbl>
#1 1001 2 10 11 10
#2 1001 3 15 18 19
#3 1002 2 5 6 7
#4 1002 3 12 15 11
#5 1003 2 1 3 5
#6 1003 3 8 7 10
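For completeness, the same reshape is available in data.table via melt() with its measure() helper (a sketch, assuming data.table >= 1.14.2, where measure() was added):
library(data.table)
# value.name marks the regex group that supplies the output value-column
# names (qA, qB, qC); the second group becomes the time column.
melt(as.data.table(df), id.vars = "id",
     measure.vars = measure(value.name, time, pattern = "(q[A-Z])(\\d+)"))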

R How to lag a dataframe by groups

I have the following data set:
Name Year VarA VarB Data.1 Data.2
A 2016 L H 100 101
A 2017 L H 105 99
A 2018 L H 103 105
A 2016 L A 90 95
A 2017 L A 99 92
A 2018 L A 102 101
I want to add a lagged variable by the grouping: Name, VarA, VarB so that my data would look like:
Name Year VarA VarB Data.1 Data.2 Lg1.Data.1 Lg2.Data.1
A 2016 L H 100 101 NA NA
A 2017 L H 105 99 100 NA
A 2018 L H 103 105 105 100
A 2016 L A 90 95 NA NA
A 2017 L A 99 92 90 NA
A 2018 L A 102 101 99 90
I found the following link, which is helpful: debugging: function to create multiple lags for multiple columns (dplyr)
And am using the following code:
df <- df %>%
  group_by(Name) %>%
  arrange(Name, VarA, VarB, Year) %>%
  do(data.frame(., setNames(shift(.[, c(5:6)], 1:2), c(seq(1:8)))))
However, the lag is offsetting all data associated with Name instead of the grouping I want, so only the 2018 rows are accurately lagged.
Name Year VarA VarB Data.1 Data.2 Lg1.Data.1 Lg2.Data.1
A 2016 L H 100 101 NA NA
A 2017 L H 105 99 100 NA
A 2018 L H 103 105 105 100
A 2016 L A 90 95 103 105
A 2017 L A 99 92 90 103
A 2018 L A 102 101 99 90
How do I get the lag to reset for each new grouping combination (e.g. Name / VarA / VarB)?
dplyr::lag lets you set the distance you want to lag by. You can group by whatever variables you want—in this case, Name, VarA, and VarB—before making your lagged variables.
library(dplyr)
df %>%
  group_by(Name, VarA, VarB) %>%
  mutate(Lg1.Data.1 = lag(Data.1, n = 1), Lg2.Data.1 = lag(Data.1, n = 2))
#> # A tibble: 6 x 8
#> # Groups: Name, VarA, VarB [2]
#> Name Year VarA VarB Data.1 Data.2 Lg1.Data.1 Lg2.Data.1
#> <chr> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 A 2016 L H 100 101 NA NA
#> 2 A 2017 L H 105 99 100 NA
#> 3 A 2018 L H 103 105 105 100
#> 4 A 2016 L A 90 95 NA NA
#> 5 A 2017 L A 99 92 90 NA
#> 6 A 2018 L A 102 101 99 90
If you want a version that scales to more lags, you can use some non-standard evaluation to create new lagged columns dynamically. I'll do this with purrr::map to iterate over a set of n values to lag by, make a list of data frames with the new columns added, then join all the data frames together. There are probably better NSE ways to do this, so hopefully someone can improve upon it.
I'm making up some new data, just to have a wider range of years to illustrate. Inside mutate, you can create column names with quo_name.
library(dplyr)
library(purrr)
set.seed(127)
df <- tibble(
  Name = "A", Year = rep(2016:2020, 2), VarA = "L", VarB = rep(c("H", "A"), each = 5),
  Data.1 = sample(1:10, 10, replace = T), Data.2 = sample(1:10, 10, replace = T)
)
df_list <- purrr::map(1:4, function(i) {
  df %>%
    group_by(Name, VarA, VarB) %>%
    mutate(!!quo_name(paste0("Lag", i)) := dplyr::lag(Data.1, n = i))
})
You don't need to save this list—I'm just doing it to show an example of one of the data frames. You could instead go straight into reduce.
df_list[[3]]
#> # A tibble: 10 x 7
#> # Groups: Name, VarA, VarB [2]
#> Name Year VarA VarB Data.1 Data.2 Lag3
#> <chr> <int> <chr> <chr> <int> <int> <int>
#> 1 A 2016 L H 3 9 NA
#> 2 A 2017 L H 1 4 NA
#> 3 A 2018 L H 3 8 NA
#> 4 A 2019 L H 2 2 3
#> 5 A 2020 L H 4 5 1
#> 6 A 2016 L A 8 4 NA
#> 7 A 2017 L A 6 8 NA
#> 8 A 2018 L A 3 2 NA
#> 9 A 2019 L A 8 6 8
#> 10 A 2020 L A 9 1 6
Then use purrr::reduce to join all the data frames in the list. Since there are columns that are the same in each of the data frames, and those are the ones you want to join by, you can get away with not specifying join-by columns in inner_join.
reduce(df_list, inner_join)
#> Joining, by = c("Name", "Year", "VarA", "VarB", "Data.1", "Data.2")
#> Joining, by = c("Name", "Year", "VarA", "VarB", "Data.1", "Data.2")
#> Joining, by = c("Name", "Year", "VarA", "VarB", "Data.1", "Data.2")
#> # A tibble: 10 x 10
#> # Groups: Name, VarA, VarB [?]
#> Name Year VarA VarB Data.1 Data.2 Lag1 Lag2 Lag3 Lag4
#> <chr> <int> <chr> <chr> <int> <int> <int> <int> <int> <int>
#> 1 A 2016 L H 3 9 NA NA NA NA
#> 2 A 2017 L H 1 4 3 NA NA NA
#> 3 A 2018 L H 3 8 1 3 NA NA
#> 4 A 2019 L H 2 2 3 1 3 NA
#> 5 A 2020 L H 4 5 2 3 1 3
#> 6 A 2016 L A 8 4 NA NA NA NA
#> 7 A 2017 L A 6 8 8 NA NA NA
#> 8 A 2018 L A 3 2 6 8 NA NA
#> 9 A 2019 L A 8 6 3 6 8 NA
#> 10 A 2020 L A 9 1 8 3 6 8
Created on 2018-12-07 by the reprex package (v0.2.1)
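Since this answer was written, dplyr 1.0 added across(), which removes the need for quo_name()/!! and the join step. A sketch of the same four lags with a named list of lag functions, built with lapply so each function captures its own lag distance:
library(dplyr)
# One lag function per distance; lapply gives each closure its own `i`.
lag_funs <- setNames(lapply(1:4, function(i) function(x) dplyr::lag(x, i)),
                     paste0("Lag", 1:4))
df %>%
  group_by(Name, VarA, VarB) %>%
  mutate(across(Data.1, lag_funs, .names = "{.fn}")) %>%
  ungroup()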

Repeated measures in messy format, need help to tidy

I have a very large data set containing weekly weights that have been coded with week of study and the weight at that visit. There are some missing visits and the data is not currently aligned.
df <- data.frame(ID = 1:3, Week_A = c(6, 6, 7), Weight_A = c(23, 24, 23),
                 Week_B = c(7, 7, 8), Weight_B = c(25, 26, 27),
                 Week_C = c(8, 9, 9), Weight_C = c(27, 26, 28))
df
ID Week_A Weight_A Week_B Weight_B Week_C Weight_C
1 1 6 23 7 25 8 27
2 2 6 24 7 26 9 26
3 3 7 23 8 27 9 28
I would like to align the data by week number (ideal output below).
df_ideal <- data.frame(ID = 1:3, Week_6 = c(23, 24, NA), Week_7 = c(25, 26, 23),
                       Week_8 = c(27, NA, 27), Week_9 = c(NA, 26, 28))
df_ideal
ID Week_6 Week_7 Week_8 Week_9
1 1 23 25 27 NA
2 2 24 26 NA 26
3 3 NA 23 27 28
I would appreciate some help with this, even just a starting point for manipulating this data into an easier-to-manage format.
A tidyverse solution:
df <- data.frame(ID = 1:3,
                 Week_A = c(6, 6, 7),
                 Weight_A = c(23, 24, 23),
                 Week_B = c(7, 7, 8),
                 Weight_B = c(25, 26, 27),
                 Week_C = c(8, 9, 9),
                 Weight_C = c(27, 26, 28))
library(tidyverse)
df_long <- df %>%
  gather(key = "v", value = "value", -ID) %>%
  separate(v, into = c("v1", "v2")) %>%
  spread(v1, value) %>%
  complete(ID, Week) %>%
  arrange(Week, ID)
df_long
# A tibble: 12 x 4
# ID Week v2 Weight
# <int> <dbl> <chr> <dbl>
# 1 1 6 A 23
# 2 2 6 A 24
# 3 3 6 <NA> NA
# 4 1 7 B 25
# 5 2 7 B 26
# 6 3 7 A 23
# 7 1 8 C 27
# 8 2 8 <NA> NA
# 9 3 8 B 27
#10 1 9 <NA> NA
#11 2 9 C 26
#12 3 9 C 28
df_wide <- df_long %>%
  select(-v2) %>%
  spread(Week, Weight, sep = "_")
df_wide
# A tibble: 3 x 5
# ID Week_6 Week_7 Week_8 Week_9
# <int> <dbl> <dbl> <dbl> <dbl>
#1 1 23 25 27 NA
#2 2 24 26 NA 26
#3 3 NA 23 27 28
Personally, I'd keep using df_long instead of df_wide, as it is a tidy data frame, while df_wide is not.
Here is a possible approach using the data.table package
library(data.table)
#convert into a data.table
setDT(df)
#convert into a long format
mdat <- melt(df, id.vars="ID", measure.vars=patterns("^Week", "^Weight", cols=names(df)))
#pivot into desired output
ans <- dcast(mdat, ID ~ value1, value.var="value2")
ans
ID 6 7 8 9
1: 1 23 25 27 NA
2: 2 24 26 NA 26
3: 3 NA 23 27 28
And if you really need the "Week_" prefix in your column names, you can use
setnames(ans, names(ans)[-1L], paste0("Week_", names(ans)[-1L]))
Another tidyverse solution, using a double gather() with a final spread():
df %>%
  gather(k, v, -ID, -starts_with("Weight")) %>%
  separate(k, into = c("k1", "k2")) %>%
  unite(k1, k1, v) %>%
  gather(k, v, starts_with("Weight")) %>%
  separate(k, into = c("k3", "k4")) %>%
  filter(k2 == k4) %>%
  select(-k2, -k3, -k4) %>%
  spread(k1, v)
# ID Week_6 Week_7 Week_8 Week_9
#1 1 23 25 27 NA
#2 2 24 26 NA 26
#3 3 NA 23 27 28
In base R, it's a double reshape: first to long, then back to wide on a different variable.
tmp <- reshape(df, idvar = "ID",
               varying = lapply(c("Week_", "Weight_"), grep, names(df)),
               v.names = c("time", "Week"), direction = "long")
reshape(tmp, idvar = "ID", direction = "wide", sep = "_")
# ID Week_6 Week_7 Week_8 Week_9
#1.1 1 23 25 27 NA
#2.1 2 24 26 NA 26
#3.1 3 NA 23 27 28
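For reference, with current tidyr (>= 1.0) the gather()/spread() pairs above can be collapsed into pivot_longer() plus pivot_wider(); a sketch:
library(dplyr)
library(tidyr)
df %>%
  pivot_longer(-ID,
               names_to = c(".value", "visit"),  # splits Week_A into Week + visit "A"
               names_sep = "_") %>%
  select(-visit) %>%
  pivot_wider(names_from = Week, values_from = Weight,
              names_prefix = "Week_")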

data frame selecting top by grouping

I have a data frame such as:
set.seed(1)
df <- data.frame(
  sample = 1:50,
  value = runif(50),
  group = c(rep(NA, 20), gl(3, 10)))
I want to select the top 10 samples based on value. However, if there is a group corresponding to the sample, I only want to include one sample from that group. If group == NA, I want to include all of them. Arranging df by value looks like:
df_top <- df %>%
  arrange(-value) %>%
  top_n(10, value)
sample value group
1 46 0.7973088 3
2 49 0.8108702 3
3 22 0.8394404 1
4 2 0.8612095 NA
5 27 0.8643395 1
6 20 0.8753213 NA
7 44 0.8762692 3
8 26 0.8921983 1
9 11 0.9128759 NA
10 30 0.9606180 1
I would want to include samples 36, 22, 2, 20, 11, and the next five highest values in my data frame that continue to fit the pattern. How do I accomplish this?
I think I figured this out. Would this be the best way?
df_top <- df %>%
  arrange(-value) %>%
  group_by(group) %>%
  filter(ifelse(!is.na(group), value == max(value), value == value)) %>%
  ungroup() %>%
  top_n(10, value)
# A tibble: 10 x 3
sample value group
<int> <dbl> <int>
1 18 0.992 NA
2 7 0.945 NA
3 21 0.935 1
4 4 0.908 NA
5 6 0.898 NA
6 35 0.827 2
7 41 0.821 3
8 20 0.777 NA
9 15 0.770 NA
10 17 0.718 NA
Similar method that uses slice instead of filter:
library(dplyr)
df_top <- df %>%
  arrange(-value) %>%
  group_by(group) %>%
  slice(if (any(!is.na(group))) 1 else 1:n()) %>%
  ungroup() %>%
  top_n(10, value)
Result:
# A tibble: 10 x 3
sample value group
<int> <dbl> <int>
1 21 0.9347052 1
2 35 0.8273733 2
3 41 0.8209463 3
4 18 0.9919061 NA
5 7 0.9446753 NA
6 4 0.9082078 NA
7 6 0.8983897 NA
8 20 0.7774452 NA
9 15 0.7698414 NA
10 17 0.7176185 NA
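In newer dplyr (>= 1.0), top_n() is superseded by slice_max(); a sketch of the same logic, keeping every NA-group row and only the best row per real group:
library(dplyr)
df %>%
  group_by(group) %>%
  filter(is.na(group) | value == max(value)) %>%  # all NA rows; max per group
  ungroup() %>%
  slice_max(value, n = 10, with_ties = FALSE)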
