Is there a way to select the first row within a group with different conditions in dplyr? - r

I want to select a row for each group created by variable a. It should be the row with the highest value for variable c, but if variable b is TRUE, then the row with b = TRUE and maximum c within that group should be selected.
I have the following code:
set.seed(42)
a <- rep(1:3, each = 3)
b <- sample(c(0,1), size = 9, replace = T)
c <- sample(1:9, size = 9, replace = F)
df <- data.frame(a = a,
                 b = b,
                 c = c)
df %>% group_by(a) %>% filter(b == 1) %>%
  arrange(desc(c), .by_group = T) %>%
  summarise_all(function(x) x[1]) -> df1
df %>% group_by(a) %>% filter(all(b != 1)) %>%
  arrange(desc(c), .by_group = T) %>%
  summarise_all(function(x) x[1]) -> df2
df3 <- rbind(df1, df2)
This works, but I wonder if there is a simpler way to achieve the same.

You could first filter the rows you want to keep within each group and then do your summarise.
df %>%
  group_by(a) %>%
  filter(all(b == 0) | b == 1) %>%
  summarize(b = first(b), c = max(c))
# a b c
# <int> <dbl> <int>
# 1 1 0 8
# 2 2 1 5
# 3 3 1 9
So per group we only keep the rows where b == 1, or all rows when the whole group has b == 0; summarize() then takes first(b) and max(c) from what is left.
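For illustration, this is what the intermediate filter step keeps with the seed from the question (group 1 has no b == 1, so all of its rows survive; groups 2 and 3 keep only their b == 1 rows):
df %>%
  group_by(a) %>%
  filter(all(b == 0) | b == 1)
# # A tibble: 7 x 3
# # Groups:   a [3]
#       a     b     c
#   <int> <dbl> <int>
# 1     1     0     8
# 2     1     0     7
# 3     1     0     4
# 4     2     1     5
# 5     2     1     2
# 6     3     1     9
# 7     3     1     3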

We can do it with ifelse inside summarise and without the need to filter b values.
set.seed(42)
a <- rep(1:3, each = 3)
b <- sample(c(0,1), size = 9, replace = T)
cc <- sample(1:9, size = 9, replace = F)
df <- data.frame(a = a,
                 b = b,
                 cc = cc)
library(dplyr)
df |>
  group_by(a) |>
  # compute teste before b is overwritten by the summarised scalar,
  # otherwise cc[b == 1] would no longer refer to the original column
  summarise(teste = ifelse(any(b == 1), max(cc[b == 1]), max(cc)),
            b = max(b))
Also, never name an object c in R; it masks the built-in c() function and makes code like this confusing.

library(data.table)
setDT(df)
# select the maximum c value, grouped by a and b
# then order by b descending (so rows with b == 1 come first),
# and select the first row of each a-group
df[df[, .I[c == max(c)], by = .(a,b)]$V1][order(a,-b), .SD[1], by = a]
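A shorter data.table variant of the same logic (a sketch, assuming df has already been converted with setDT() as above): order by b and c descending, then take the first row of each a-group.
df[order(-b, -c), .SD[1], by = a]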

library(dplyr)
df %>% group_by(a) %>%
  arrange(desc(b), desc(c), .by_group = T) %>%
  slice_head(n = 1) %>%
  ungroup()
#> # A tibble: 3 × 3
#> a b c
#> <int> <dbl> <int>
#> 1 1 0 8
#> 2 2 1 5
#> 3 3 1 9
Input data:
set.seed(42)
a <- rep(1:3, each = 3)
b <- sample(c(0,1), size = 9, replace = T)
c <- sample(1:9, size = 9, replace = F)
df <- data.frame(a = a,
                 b = b,
                 c = c)
df
#> a b c
#> 1 1 0 8
#> 2 1 0 7
#> 3 1 0 4
#> 4 2 0 1
#> 5 2 1 5
#> 6 2 1 2
#> 7 3 1 9
#> 8 3 1 3
#> 9 3 0 6
Created on 2023-01-30 with reprex v2.0.2

Related

How to count multiple columns in R

I'm trying to count the number of occurrences of 0, 1 and 2 in multiple columns of my data. Here is a sample. Thanks, everyone.
df <- data_frame(
  a = sample(0:2, 10, replace = T),
  b = sample(0:2, 10, replace = T),
  c = sample(0:2, 10, replace = T),
  d = sample(0:2, 10, replace = T)
)
I want the output to look like this:
score  a  b  c  d
    0  2  4  3  4
    1  2  5  2  1
    2  6  1  5  5
Code
library(tidyr)
library(dplyr)
df %>%
  pivot_longer(cols = everything(), values_to = "score") %>%
  count(name, score) %>%
  pivot_wider(names_from = name, values_from = n, values_fill = 0)
Iterate with purrr::map(), applying dplyr::count() to each column, then use purrr::reduce() with dplyr::left_join() to join all the resulting data frames together.
library(dplyr)
library(purrr)
set.seed(13)
counts <- map(
  names(df),
  ~ count(df, score = .data[[.x]], name = .x)
)
reduce(counts, left_join)
score a b c d
1 0 4 5 3 2
2 1 4 1 2 5
3 2 2 4 5 3
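For comparison, a base R sketch of the same count (not from the original answers): tabulating each column against the fixed levels 0:2 keeps zero counts for levels that never occur, and returns a matrix with one row per score and one column per variable.
sapply(df, function(x) table(factor(x, levels = 0:2)))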

sample values by group with conditions

I have grouped data and I want to create a new variable value that will take the value 0 or 1.
Every group needs at least one observation where value==1.
But groups cannot have more than 2 observations where value==1.
Ideally I can set it so no more than 25% of groups only have one observation where value==1.
library(tidyverse)
set.seed(1)
# sample can break the rules
tibble(group = c(rep("A", 3),
rep("B", 6),
rep("C", 4),
rep("D", 5))) %>%
group_by(group) %>%
mutate(value = sample(c(0, 1), n(), replace = TRUE, prob = c(0.8, 0.2)))
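For reference, a quick way to see which groups break the rules (a sketch, assuming the sampled tibble above is assigned to x): sum value per group and flag groups with fewer than one or more than two 1s.
x %>%
  count(group, wt = value, name = "ones") %>%
  filter(ones < 1 | ones > 2)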
One solution would be to create a listing of your unique group labels and shuffle those (here I get the unique group labels via nest). Then depending on whether the group is in the first 25% of rows of the data frame, you can assign either a) a random number between 1 and 2, or b) always 2. Finally, you can use the assigned number to define how 0s and 1s should be sampled for each group, and then unnest the result.
set.seed(0)
# df is the tibble from the question, assigned to a variable beforehand
result <- df %>%
  nest(data = -group) %>%
  .[sample(1:nrow(.), nrow(.)), ] %>% # shuffle the group order
  mutate(
    value_count = ifelse(row_number() / n() <= 0.25, sample(1:2, n(), replace = T), 2)
  ) %>%
  rowwise() %>%
  mutate(
    count = nrow(data),
    value = list(sample(c(rep(1, value_count), rep(0, count - value_count)), count))
  ) %>%
  unnest(value) %>%
  select(-data, -value_count, -count)
group value
<chr> <dbl>
1 B 0
2 B 0
3 B 0
4 B 0
5 B 1
6 B 0
7 A 1
8 A 1
9 A 0
10 D 1
11 D 0
12 D 1
13 D 0
14 D 0
15 C 1
16 C 0
17 C 0
18 C 1
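A quick check of the constraints (a sketch, not part of the original answer): every group should end up with one or two rows where value == 1.
result %>%
  group_by(group) %>%
  summarise(ones = sum(value))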
Looks like I was beaten to the punch, but here's another way to do it:
library(tidyverse)
set.seed(1)
# sample can break the rules
x <- tibble(group = c(rep("A", 3),
rep("B", 6),
rep("C", 4),
rep("D", 5)))
# Make all 'var' =1, then set all but first of each group to 0.
xx <- x %>% group_by(group) %>%
mutate(var = row_number()) %>%
mutate(var = ifelse(var == 1, 1, 0))
pct_with_two <- .75 # percentage of groups with two 1's
samp_size <- floor(length(unique(xx$group)) * pct_with_two) #round down to whole number
addl_one <- sample(unique(xx$group), size = samp_size, replace = F)
xx %>%
  mutate(var2 = case_when(
    group %in% addl_one & row_number() == 2 ~ 1,
    TRUE ~ 0)) %>%
  mutate(var = var + var2) %>%
  select(-var2)
#> # A tibble: 18 x 2
#> # Groups: group [4]
#> group var
#> <chr> <dbl>
#> 1 A 1
#> 2 A 1
#> 3 A 0
#> 4 B 1
#> 5 B 0
#> 6 B 0
#> 7 B 0
#> 8 B 0
#> 9 B 0
#> 10 C 1
#> 11 C 1
#> 12 C 0
#> 13 C 0
#> 14 D 1
#> 15 D 1
#> 16 D 0
#> 17 D 0
#> 18 D 0
Created on 2022-03-11 by the reprex package (v0.3.0)

How can I dynamically group_by a data frame's variables in a function?

I want a function where I can enter different numbers of column names and have them grouped. The first piece of code here works:
df <- data.frame(col_a = sample(1:10, 100, replace = T),
                 col_b = sample(letters, 100, replace = T),
                 col_c = sample(LETTERS, 100, replace = T))
my_fun = function(df, ...) {
  df %>% group_by_(...) %>% summarise(n = n())
}
my_fun(df , 'col_a')
my_fun(df , 'col_a', 'col_b')
my_fun(df , 'col_a', 'col_b', 'col_c')
What I now want is to apply tidyr's complete() function, so all possible values of each grouping variable are present. I've manually typed col_a and col_b into the complete() call below. I'd want to pass the possible values as a function argument though, as I'm not always going to be grouping by col_a & col_b.
my_fun = function(df, ...) {
  df %>% group_by_(...) %>% summarise(count = n()) %>%
    ungroup() %>%
    complete(col_a = 1:10, col_b = letters, fill = list(count = 0))
}
my_fun(df , 'col_a', 'col_b')
You can capture the arguments as a named list. group_by() + summarise(n()) can be replaced with count().
library(tidyverse)
my_fun = function(df, ...) {
  args <- list(...)
  df %>%
    count(across(all_of(names(args))), name = 'count') %>%
    complete(!!!args, fill = list(count = 0))
}
This can be run as -
my_fun(df , 'col_a' = 1:12)
# col_a count
# <int> <dbl>
# 1 1 9
# 2 2 15
# 3 3 4
# 4 4 11
# 5 5 7
# 6 6 12
# 7 7 12
# 8 8 10
# 9 9 5
#10 10 15
#11 11 0
#12 12 0
my_fun(df , 'col_a' = 1:10, 'col_b' = letters)
# col_a col_b count
# <int> <chr> <dbl>
# 1 1 a 1
# 2 1 b 0
# 3 1 c 0
# 4 1 d 0
# 5 1 e 0
# 6 1 f 1
# 7 1 g 0
# 8 1 h 0
# 9 1 i 0
#10 1 j 0
# … with 250 more rows
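As a side note, the group_by_() used in the question is deprecated; with current dplyr the same dynamic grouping can be written with across(), along the lines of the count() call above. A sketch (my_fun2 is just an illustrative name):
my_fun2 <- function(df, ...) {
  args <- list(...)
  df %>%
    group_by(across(all_of(names(args)))) %>%
    summarise(count = n(), .groups = "drop") %>%
    complete(!!!args, fill = list(count = 0))
}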

How to average every two rows of dataframe in R

I have the following data frame (with 1000's of columns):
df <- structure(c(1, 2, 2, 1, 2, 2, 2, 1, 3, 3, 2, 2),
                .Dim = 4:3, .Dimnames = list(c("a", "b", "c", "d"),
                                             c("t1", "t2", "t3")))
What would be an efficient way to get the average of every two rows?
Result I want:
t1 t2 t3
a 1 2 3
b 2 2 3
a_b 1.5 2 3
c 2 2 2
d 1 1 2
c_d 1.5 1.5 2
Split into chunks of two rows, get the mean of each column, rbind the means onto the chunk, then rbind all the chunks back together.
do.call(rbind,
        lapply(seq(1, nrow(df), 2), function(i){
          x <- df[i:(i + 1), , drop = FALSE]
          # colSums(x) / 2 is the column mean of the 2-row chunk
          res <- rbind(x, colSums(x) / 2)
          rownames(res)[nrow(res)] <- paste(rownames(x), collapse = "_")
          res
        }))
# t1 t2 t3
# a 1.0 2.0 3
# b 2.0 2.0 3
# a_b 1.5 2.0 3
# c 2.0 2.0 2
# d 1.0 1.0 2
# c_d 1.5 1.5 2
One dplyr possibility could be:
df %>%
  data.frame() %>%
  rownames_to_column() %>%
  mutate_if(is.factor, as.numeric) %>%
  group_by(group = gl(n()/2, 2)) %>%
  # written for an older dplyr where group_map() returned a data frame;
  # in current dplyr the equivalent step is group_modify()
  group_map(~ bind_rows(.x, tibble(rowname = paste(.x$rowname, collapse = "_"),
                                   t1 = mean(.x$t1),
                                   t2 = mean(.x$t2),
                                   t3 = mean(.x$t3)))) %>%
  ungroup() %>%
  select(-group)
rowname t1 t2 t3
<chr> <dbl> <dbl> <dbl>
1 a 1 2 2
2 b 2 2 2
3 a_b 1.5 2 2
4 c 2 2 1
5 d 1 1 1
6 c_d 1.5 1.5 1
The first three lines could be omitted if you create the data beforehand as a data.frame, with the row names as a column and the factors converted to numeric. Then, what it does is, first, create a grouping variable using gl(). Second, it calculates the means, builds the name as a combination of the two row names in the group, and binds that row to the original data. Finally, it ungroups and removes the redundant grouping variable.
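For reference, gl(n, k) generates a factor with n levels, each repeated k times, which is what pairs up the rows here:
gl(2, 2)
# [1] 1 1 2 2
# Levels: 1 2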
A base R solution that works with any number of columns:
M <- matrix(unlist(c(df)), ncol = 2, byrow = TRUE)
M <- cbind(M, rowMeans(M))
M <- matrix(c(t(M)),ncol = ncol(df), byrow = FALSE)
# add row names and column names
row.names <- matrix(rownames(df), ncol = 2 ,byrow = TRUE)
rownames(M) <- c(t(cbind(row.names, apply(row.names,1, paste, collapse = "_"))))
colnames(M) <- colnames(df)
# t1 t2 t3
# a 1.0 2.0 3
# b 2.0 2.0 3
# a_b 1.5 2.0 3
# c 2.0 2.0 2
# d 1.0 1.0 2
# c_d 1.5 1.5 2
Another dplyr approach.
Update: if you really need the row names (a, b, a_b, etc.), see the scalable but more convoluted version after my original solution.
Original
df <- df %>% as_tibble()
n <- nrow(df)/2
orig <- df %>% mutate(grp = sort(rep(1:2, n)))
means <- orig %>% group_by(grp) %>% summarise_all(mean)
bind_rows(orig, means) %>% arrange(grp) %>% select(-grp)
Output:
# A tibble: 6 x 3
t1 t2 t3
<dbl> <dbl> <dbl>
1 1 2 3
2 2 2 3
3 1.5 2 3
4 2 2 2
5 1 1 2
6 1.5 1.5 2
Updated with row names
rnames <- row.names(df)
df <- df %>% as_tibble()
n <- (nrow(df)/2)
orig <- df %>%
  mutate(grp = sort(rep(1:n, n)), rn = rnames)
means <- orig %>%
  group_by(grp) %>%
  mutate(rn = paste0(rn, collapse = "_")) %>%
  ungroup() %>%
  group_by(rn) %>%
  summarise_if(is.numeric, mean)
bind_rows(orig, means) %>% arrange(grp) %>% select(-grp)
Output:
t1 t2 t3 rn
<dbl> <dbl> <dbl> <chr>
1 1 2 3 a
2 2 2 3 b
3 1.5 2 3 a_b
4 2 2 2 c
5 1 1 2 d
6 1.5 1.5 2 c_d
One possibility is to use the dplyr package.
Note that the data I use is slightly different from the data you are using: in your data the numbers are actually character values.
df <- structure(c(1, 2, 2, 1, 2, 2, 2, 1, 3, 3, 2, 2),
                .Dim = 4:3, .Dimnames = list(c("a", "b", "c", "d"),
                                             c("t1", "t2", "t3")))
First I create the summary tibble (which contains the means).
library(dplyr)
df_summary <- df %>% as_tibble(rownames = "names") %>%
group_by(ceiling(1:n() / 2)) %>%
summarise(names = paste(names, collapse = "_"),
t1 = mean(t1),
t2 = mean(t2),
t3 = mean(t3)) %>%
select(-1)
# A tibble: 2 x 4
names t1 t2 t3
<chr> <dbl> <dbl> <dbl>
1 a_b 1.5 2 3
2 c_d 1.5 1.5 2
Then I combine the summary data with the original data:
df_summary %>% bind_rows(df %>% as_tibble(rownames = "names")) %>%
slice(3, 4, 1, 5, 6, 2)
# A tibble: 6 x 4
names t1 t2 t3
<chr> <dbl> <dbl> <dbl>
1 a 1 2 3
2 b 2 2 3
3 a_b 1.5 2 3
4 c 2 2 2
5 d 1 1 2
6 c_d 1.5 1.5 2
This function averages based on a column named "group"
and should be in the dataset.
x is a data frame or a matrix.
rowm = function(x){
  x = as.data.frame(x)
  u = unique(x$group)
  r = rep(NA, ncol(x) * length(u))
  tempDF = matrix(r, ncol = ncol(x))
  counter = 0
  for(i in u){
    counter = counter + 1
    tempDF[counter, ] = colMeans(x[x$group == i, ])
  }
  colnames(tempDF) = colnames(x)
  return(tempDF)
}
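A usage sketch (the pairing column below is made up for illustration): add a group column that pairs the rows, then call rowm().
m <- cbind(df, group = rep(1:2, each = 2)) # pair rows a/b and c/d
rowm(m)
#       t1  t2 t3 group
# [1,] 1.5 2.0  3     1
# [2,] 1.5 1.5  2     2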

Operations between groups with dplyr

I have a data frame as follows, where I would like to group the data by grp and index and use group a as a reference to perform some simple calculations. I would like to subtract the values of group a from the values of the other groups.
df <- data.frame(grp = rep(letters[1:3], each = 2),
                 index = rep(1:2, times = 3),
                 value = seq(10, 60, length.out = 6))
df
## grp index value
## 1 a 1 10
## 2 a 2 20
## 3 b 1 30
## 4 b 2 40
## 5 c 1 50
## 6 c 2 60
The desired output would be:
## grp index value
## 1 b 1 20
## 2 b 2 20
## 3 c 1 40
## 4 c 2 40
My guess is it will be something close to:
group_by(df, grp, index) %>%
mutate(diff = value - value[grp == "a"])
Ideally I would like to do it using dplyr.
Regards, Philippe
We can filter for 'grp' that are not 'a' and then do the difference within mutate.
df %>%
  filter(grp != "a") %>%
  mutate(value = value - df$value[df$grp == "a"])
Or another option would be a join:
df %>%
  filter(grp != "a") %>%
  left_join(., subset(df, grp == "a", select = -1), by = "index") %>%
  mutate(value = value.x - value.y) %>%
  select(1, 2, 5)
# grp index value
#1 b 1 20
#2 b 2 20
#3 c 1 40
#4 c 2 40
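A variant closer to the guess in the question (a sketch, not from the original answer): group by index only, so that value[grp == "a"] picks the reference value within each index, then drop the a rows. This reproduces the desired output above.
df %>%
  group_by(index) %>%
  mutate(value = value - value[grp == "a"]) %>%
  ungroup() %>%
  filter(grp != "a")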
