Operations between groups with dplyr - r

I have a data frame as follows, where I would like to group the data by grp and index and use group a as a reference to perform some simple calculations. I would like to subtract the values of group a from the values of the other groups.
df <- data.frame(grp = rep(letters[1:3], each = 2),
index = rep(1:2, times = 3),
value = seq(10, 60, length.out = 6))
df
## grp index value
## 1 a 1 10
## 2 a 2 20
## 3 b 1 30
## 4 b 2 40
## 5 c 1 50
## 6 c 2 60
The desired output would be like:
## grp index value
## 1 b 1 20
## 2 b 2 20
## 3 c 1 40
## 4 c 2 40
My guess is it will be something close to:
group_by(df, grp, index) %>%
mutate(diff = value - value[grp == "a"])
Ideally I would like to do it using dplyr.
Regards, Philippe

We can filter for 'grp' that are not 'a' and then do the difference within mutate.
# Reference rows (group "a"), looked up by index below so the subtraction
# does not rely on vector recycling or on every group listing its indices
# in the same order. An index with no "a" row yields NA.
ref_a <- df %>%
  filter(grp == "a")
df %>%
  filter(grp != "a") %>%
  mutate(value = value - ref_a$value[match(index, ref_a$index)])
Or another option would be join
# Pair every non-reference row with the group "a" row that shares its index
# (the join suffixes the two value columns with .x and .y), then keep only
# grp, index and the difference of the two values.
df %>%
  filter(grp != "a") %>%
  left_join(df %>% filter(grp == "a") %>% select(-grp), by = "index") %>%
  transmute(grp, index, value = value.x - value.y)
# grp index value
#1 b 1 20
#2 b 2 20
#3 c 1 40
#4 c 2 40

Related

Add a new column with the same value within each group based on value from specific row

I have a data frame with a grouping variable ID, a factor F and a value V that looks something like this:
df <- data.frame(ID = c(rep(1, 3), rep(2, 3)),
F = factor(c("A","B","X","C","D","X")),
V = c(30, 32, 25, 31, 37, 24)
)
> df
ID F V
1 1 A 30
2 1 B 32
3 1 X 25
4 2 C 31
5 2 D 37
6 2 X 24
Now, I would like to add a new column New, which has the same value within each group (by ID) based on the value for V in the row where F==X using the tidyverse environment. Ideally, those rows would be removed afterwards so that the new data frame looks like this:
> df
ID F V New
1 1 A 30 25
2 1 B 32 25
3 2 C 31 24
4 2 D 37 24
I know that I have to use the group_by() function and probably also mutate(), but I couldn't quite manage to get my desired result.
# Within each ID, copy the V value of the row where F is 'X' into New
# (recycled across the group), then drop the 'X' rows themselves.
# NOTE(review): presumably each ID has exactly one F == 'X' row - verify.
df %>%
group_by(ID) %>%
mutate(New = V[F =='X']) %>%
filter(F != 'X')
# A tibble: 4 × 4
# Groups: ID [2]
ID F V New
<dbl> <fct> <dbl> <dbl>
1 1 A 30 25
2 1 B 32 25
3 2 C 31 24
4 2 D 37 24
library(dplyr)
df %>%
  # One group per ID.
  group_by(ID) %>%
  # Keep V only on the "X" row; every other row becomes NA.
  mutate(New = ifelse(F == "X", V, NA)) %>%
  # Collapse each ID to a single row holding its non-missing New value.
  summarise(New = max(New, na.rm = TRUE)) %>%
  # SQL-like join back onto the rows that are not "X".
  right_join(df %>% filter(F != "X"), by = "ID") %>%
  # Put the columns in the desired order.
  select(ID, F, V, New)
And you get this output:
# A tibble: 4 × 4
ID F V New
<dbl> <fct> <dbl> <dbl>
1 1 A 30 25
2 1 B 32 25
3 2 C 31 24
4 2 D 37 24
Or even simpler:
df %>%
  # Keep only the rows with "X" in the F column.
  filter(F == "X") %>%
  # Join them to the same dataset without the "X" rows; shared column names
  # get the suffixes .x (the "X" rows) and .y (the remaining rows).
  right_join(df %>% filter(F != "X"), by = "ID") %>%
  # Reorder and rename the columns.
  select(ID, F = F.y, V = V.y, New = V.x)

sample values by group with conditions

I have grouped data and I want to create a new variable value that will take the value 0 or 1.
Every group needs at least one observation where value==1.
But groups cannot have more than 2 observations where value==1.
Ideally I can set it so no more than 25% of groups only have one observation where value==1.
library(tidyverse)
set.seed(1)
# sample can break the rules
tibble(group = c(rep("A", 3),
rep("B", 6),
rep("C", 4),
rep("D", 5))) %>%
group_by(group) %>%
mutate(value = sample(c(0, 1), n(), replace = TRUE, prob = c(0.8, 0.2)))
One solution would be to create a listing of your unique group labels and shuffle those (here I get the unique group labels via nest). Then depending on whether the group is in the first 25% of rows of the data frame, you can assign either a) a random number between 1 and 2, or b) always 2. Finally, you can use the assigned number to define how 0s and 1s should be sampled for each group, and then unnest the result.
set.seed(0)
# NOTE(review): `df` is not defined in this snippet; it is assumed to be the
# question's ungrouped tibble with a single `group` column - confirm.
result <- df %>%
  # One row per group, with that group's rows nested in `data`.
  nest(data = -group) %>%
  .[sample(seq_len(nrow(.))), ] %>% # shuffle the group order
  mutate(
    # Groups in the first 25% of the shuffled rows get one or two 1s at
    # random; all remaining groups always get two.
    value_count = ifelse(
      row_number() / n() <= 0.25,
      sample(1:2, n(), replace = TRUE),
      2
    )
  ) %>%
  rowwise() %>%
  mutate(
    count = nrow(data),
    # Never ask for more 1s than the group has rows; otherwise
    # rep(0, count - ones) would receive a negative count and error.
    ones = min(value_count, count),
    value = list(sample(c(rep(1, ones), rep(0, count - ones)), count))
  ) %>%
  unnest(value) %>%
  select(-data, -value_count, -count, -ones)
group value
<chr> <dbl>
1 B 0
2 B 0
3 B 0
4 B 0
5 B 1
6 B 0
7 A 1
8 A 1
9 A 0
10 D 1
11 D 0
12 D 1
13 D 0
14 D 0
15 C 1
16 C 0
17 C 0
18 C 1
Looks like I was beat to the punch, but here's another way to do it:
library(tidyverse)
set.seed(1)
x <- tibble(group = c(rep("A", 3),
                      rep("B", 6),
                      rep("C", 4),
                      rep("D", 5)))
# Set 'var' to 1 on the first row of each group and to 0 on every other row,
# so every group has at least one 1.
xx <- x %>%
  group_by(group) %>%
  mutate(var = row_number()) %>%
  mutate(var = ifelse(var == 1, 1, 0))
pct_with_two <- 0.75 # share of groups that get two 1's
# Round down to a whole number of groups.
samp_size <- floor(length(unique(xx$group)) * pct_with_two)
# Randomly pick the groups that receive a second 1.
addl_one <- sample(unique(xx$group), size = samp_size, replace = FALSE)
xx %>%
  # The second 1, when a group gets one, goes on the group's second row.
  mutate(var2 = case_when(
    group %in% addl_one & row_number() == 2 ~ 1,
    TRUE ~ 0
  )) %>%
  mutate(var = var + var2) %>%
  select(-var2)
#> # A tibble: 18 x 2
#> # Groups: group [4]
#> group var
#> <chr> <dbl>
#> 1 A 1
#> 2 A 1
#> 3 A 0
#> 4 B 1
#> 5 B 0
#> 6 B 0
#> 7 B 0
#> 8 B 0
#> 9 B 0
#> 10 C 1
#> 11 C 1
#> 12 C 0
#> 13 C 0
#> 14 D 1
#> 15 D 1
#> 16 D 0
#> 17 D 0
#> 18 D 0
Created on 2022-03-11 by the reprex package (v0.3.0)

Computing minimum distance between observations within groups

In the dataset below, how could I create a new column min.diff that reports, for a given observation x, the minimum distance between x and any other observation y within its group (identified by the group column)? I would like to measure the distance between x and y by abs(x-y).
set.seed(1)
df <- data.frame(
group = c('A', 'A', 'A', 'B', 'B', 'C', 'C', 'C'),
value = sample(1:10, 8, replace = T)
)
Expected output:
group value min.diff
1 A 9 2
2 A 4 3
3 A 7 2
4 B 1 1
5 B 2 1
6 C 7 4
7 C 2 1
8 C 3 1
I prefer a solution using dplyr.
The only way that I have in my mind is to extend the dataframe by adding more rows to get each possible pair within groups, calculating distances and then filtering out the smallest value in each group. Is there a more compact way?
We can use map_dbl to subtract current value with all other values and select the minimum from it for each group.
library(dplyr)
library(purrr)
df %>%
group_by(group) %>%
mutate(min.diff = map_dbl(row_number(), ~min(abs(value[-.x] - value[.x]))))
# group value min.diff
# <chr> <int> <dbl>
#1 A 9 2
#2 A 4 3
#3 A 7 2
#4 B 1 1
#5 B 2 1
#6 C 7 4
#7 C 2 1
#8 C 3 1
We can use combn to do the pairwise difference between 'value', get the min of the absolute values
library(dplyr)
df1 <- df %>%
mutate(new = min(abs(combn(value, 2, FUN = function(x) x[1] - x[2]))))
If we want to get the minimum between a given element i.e. first from the rest
df1 <- df %>%
mutate(new = min(abs(value[-1] - first(value))))
If the order doesn't matter...
library(dplyr)
df %>%
  arrange(group, value) %>% # order ascending by value within each group
  group_by(group) %>%
  # Once sorted, the closest value is always an adjacent row. pmin() compares
  # the distance to the previous and to the next row element by element
  # (min() would collapse the whole group to a single number), and
  # na.rm = TRUE falls back to the only neighbour on the first and last row
  # of a group. A group with a single row gets NA.
  mutate(
    min.diff = pmin(
      abs(value - lag(value)),
      abs(value - lead(value)),
      na.rm = TRUE
    )
  ) %>%
  ungroup()
# group value min.diff
# <chr> <int> <int>
# 1 A 4 3
# 2 A 7 2
# 3 A 9 2
# 4 B 1 1
# 5 B 2 1
# 6 C 2 1
# 7 C 3 1
# 8 C 7 4
If the order is important, you could add in an index and rearrange it after, like so:
library(dplyr)
df %>%
  group_by(group) %>%
  mutate(index = row_number()) %>% # remember the original within-group order
  arrange(group, value) %>%
  # Once sorted, the closest value is always an adjacent row. pmin() compares
  # the distance to the previous and to the next row element by element
  # (min() would collapse the whole group to a single number), and
  # na.rm = TRUE falls back to the only neighbour on the first and last row
  # of a group. A group with a single row gets NA.
  mutate(
    min.diff = pmin(
      abs(value - lag(value)),
      abs(value - lead(value)),
      na.rm = TRUE
    )
  ) %>%
  ungroup() %>%
  arrange(group, index) %>% # restore the original order
  select(-index) # remove the helper index
# group value min.diff
# <chr> <int> <int>
# 1 A 9 2
# 2 A 4 3
# 3 A 7 2
# 4 B 1 1
# 5 B 2 1
# 6 C 7 4
# 7 C 2 1
# 8 C 3 1

dplyr collapse 'tail' rows into larger groups

library(tidyverse)
df <- tibble(a = as.factor(1:20), b = c(50, 20, 13, rep(2, 10), rep(1, 7)))
How do I make dplyr look at this data frame df and collapse all these occurrences of 2 into a single summed group, and collapse all the occurrences of 1 into a single summed group? And also keep the rest of the data frame.
Turn this:
# A tibble: 20 x 2
a b
<fct> <dbl>
1 1 50
2 2 20
3 3 13
4 4 2
5 5 2
6 6 2
7 7 2
8 8 2
9 9 2
10 10 2
11 11 2
12 12 2
13 13 2
14 14 1
15 15 1
16 16 1
17 17 1
18 18 1
19 19 1
20 20 1
into this:
# A tibble: 5 x 2
a b
<fct> <dbl>
1 1 50
2 2 20
3 3 13
4 grp2 20
5 grp1 7
[Edit] - I fixed the example data. Sorry about that.
We group by a manufactured sortkey to maintain sort order. We used the fact that b is in descending order in the input but if that is not the case in your actual data then replace sortkey = -b with the more general sortkey = data.table::rleid(b) or the longer sortkey = cumsum(coalesce(b != lag(b), FALSE)) .
We also convert b to the group names giving a new a. It wasn't clear which groups are to be converted to grp... form. Hard-coded 1 and 2? Any group with more than one row? Groups at the end with more than one row? At any rate it would be easy enough to change the condition in the if_else once that were clarified.
Finally perform the summation and then remove the sortkey.
df %>%
group_by(sortkey = -b, a = paste0(if_else(b %in% 1:2, "grp", ""), b)) %>%
summarize(b = sum(b)) %>%
ungroup %>%
select(-sortkey)
giving:
# A tibble: 5 x 2
a b
<chr> <int>
1 50 50
2 20 20
3 13 13
4 grp2 20
5 grp1 7
Here's a way. I have converted a from factor to character to make things easier. You can convert it back to factor if you want. Also your test data was a bit wrong.
df <- tibble(a = as.character(1:20), b = c(50, 20, 13, rep(2, 10), rep(1, 7)))
df %>%
mutate(
a = case_when(
b == 1 ~ "grp1",
b == 2 ~ "grp2",
TRUE ~ a
)
) %>%
group_by(a) %>%
summarise(b = sum(b))
# A tibble: 5 x 2
a b
<chr> <dbl>
1 1 50
2 2 20
3 3 13
4 grp1 7
5 grp2 20
This is an approach which gives you the desired names for groups & where you don't need to think in advance how many cases like that you would need (e.g. it would create grp3, grp4, ... depending on the number in b).
library(dplyr)
df %>%
  mutate(
    # Start a new run id whenever b differs from the previous row. The first
    # row has no predecessor (NA comparison) and is treated as "no change".
    # Referring to b rather than df$b keeps this tied to the piped data.
    grp = cumsum(coalesce(lag(b) != b, FALSE))
  ) %>%
  group_by(grp) %>%
  mutate(
    # Runs longer than one row are renamed "grp<b>"; single rows keep their a.
    a = if (n() > 1) paste0("grp", b) else a,
    b = sum(b)
  ) %>%
  ungroup() %>%
  distinct(a, b)
Output:
a b
<chr> <dbl>
1 1 50
2 2 20
3 3 13
4 grp2 20
5 grp1 7
Note that the code could be also condensed but that leads to a certain lack of readability in my opinion:
df %>%
  # Run id: increases by one each time b differs from the previous row (the
  # first row's NA comparison counts as "no change"). Uses b from the piped
  # data rather than df$b.
  group_by(grp = cumsum(coalesce(lag(b) != b, FALSE))) %>%
  mutate(
    # Runs longer than one row are renamed "grp<b>"; single rows keep their a.
    a = if (n() > 1) paste0("grp", b) else a,
    b = sum(b)
  ) %>%
  ungroup() %>%
  distinct(a, b)

How to divide between groups of rows using dplyr?

I have this dataframe:
x <- data.frame(
name = rep(letters[1:4], each = 2),
condition = rep(c("A", "B"), times = 4),
value = c(2,10,4,20,8,40,20,100)
)
# name condition value
# 1 a A 2
# 2 a B 10
# 3 b A 4
# 4 b B 20
# 5 c A 8
# 6 c B 40
# 7 d A 20
# 8 d B 100
I want to group by name and divide the value of rows with condition == "B" with those with condition == "A", to get this:
data.frame(
name = letters[1:4],
value = c(5,5,5,5)
)
# name value
# 1 a 5
# 2 b 5
# 3 c 5
# 4 d 5
I know something like this can get me pretty close:
x$value[which(x$condition == "B")]/x$value[which(x$condition == "A")]
but I was wondering if there was an easy way to do this with dplyr (My dataframe is a toy example and I got to it by chaining multiple group_by and summarise calls).
Try:
x %>%
group_by(name) %>%
summarise(value = value[condition == "B"] / value[condition == "A"])
Which gives:
#Source: local data frame [4 x 2]
#
# name value
# (fctr) (dbl)
#1 a 5
#2 b 5
#3 c 5
#4 d 5
I'd use spread from tidyr.
library(dplyr)
library(tidyr)
x %>%
spread(condition, value) %>%
mutate(value = B/A)
name A B value
1 a 2 10 5
2 b 4 20 5
3 c 8 40 5
4 d 20 100 5
You could then do select(-A, -B) to drop the extra columns.
Using data.table, convert the 'data.frame' to a 'data.table' (setDT(x)); then, grouped by 'name', we divide the 'value' that corresponds to condition 'B' by the one that corresponds to condition 'A'.
library(data.table)
setDT(x)[,.(value = value[condition=="B"]/value[condition=="A"]) , name]
# name value
#1: a 5
#2: b 5
#3: c 5
#4: d 5
Or reshape from 'long' to 'wide' and divide the 'B' column by 'A'.
dcast(setDT(x), name~condition, value.var='value')[, .(name, value = B/A)]

Resources