tidyr mutate new column based on group by with calculation - r

Using tidyr, how can I create a new column through a group-by and calculation?
For example, if I have this dataframe:
# Example data: two names ("a" and "b") with four rows each.
name <- rep(c("a", "b"), each = 4)
x1 <- rep(c(0, 1), each = 4)
x2 <- rep(15, times = 8)
y <- rep(c(1, 2), times = 4)
z <- c(50, 100, 40, 90, 65, 95, 40, 95)
df <- data.frame(name = name, x1 = x1, x2 = x2, y = y, z = z)
Let's say I want to (1) group by x1 and x2; (2) find the max z value in each group; and (3) create a new column z2 that normalizes z by that maximum.
So in this case, the expected output for z2 is c(0.5, 1, 0.4, 0.9, 0.684, 1, 0.421, 1).

We could simply group by 'x1', 'x2' and create the column with mutate
library(dplyr)
# Divide every z by the largest z found in its (x1, x2) group, then drop the
# grouping so later steps operate on the whole table again.
df <- df %>%
  group_by(x1, x2) %>%
  mutate(z2 = z / max(z, na.rm = TRUE)) %>%
  ungroup()
-output
df
# A tibble: 8 × 6
name x1 x2 y z z2
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 a 0 15 1 50 0.5
2 a 0 15 2 100 1
3 a 0 15 1 40 0.4
4 a 0 15 2 90 0.9
5 b 1 15 1 65 0.684
6 b 1 15 2 95 1
7 b 1 15 1 40 0.421
8 b 1 15 2 95 1

Related

Several column sums by grouping variables in R

I have a data frame of term frequencies and some other random demographic variables. I want to utilize two grouping variables, drop the ones I do not need, and sum the frequencies based on the grouping variables.
Here is similar to what I have
# Example data: one row per user, two grouping columns and three term counts.
df <- data.frame(
  user = 1:9,
  Group1 = rep(c("a", "b", "c"), each = 3),
  Group2 = c("d", "e", "d", "e", "d", "e", "e", "e", "e"),
  term1 = c(0, 1, 1, 0, 1, 1, 0, 0, 0),
  term2 = c(1, 0, 1, 1, 0, 1, 0, 1, 1),
  term3 = c(0, 1, 0, 0, 0, 0, 1, 1, 0)
)
and here is what I am trying to get.
desired <- data.frame(Group1 = c("a", "a", "b", "b", "c", "c"),
Group2 = c("d", "e", "d", "e", "d", "e"),
term1 = c(1, 1, 1, 1, 0, 0),
term2 = c(2, 0, 0, 2, 0, 2),
term3 = c(0, 1, 0, 0, 0, 2))
My real frame has about 4000 term columns, so naming each one individually in a dplyr function does not seem feasible.
Thank you!
You can try aggregate + expand.grid + merge
# Sum all term columns per (Group1, Group2) and keep every combination of the
# two grouping columns, including pairs that never occur in df.
merge(
# every Group1 x Group2 combination built from the values present in df
with(df, expand.grid(Group1 = unique(Group1), Group2 = unique(Group2))),
# per-group sums of all remaining columns; df[-1] drops the first column (user)
aggregate(. ~ Group1 + Group2, df[-1], sum),
# all = TRUE keeps combinations without any rows in df; their sums are NA
all = TRUE
)
which gives
Group1 Group2 term1 term2 term3
1 a d 1 2 0
2 a e 1 0 1
3 b d 1 0 0
4 b e 1 2 0
5 c d NA NA NA
6 c e 0 2 2
If you want to have NAs as 0, you can try
> res <- merge(
with(df, expand.grid(Group1 = unique(Group1), Group2 = unique(Group2))),
aggregate(. ~ Group1 + Group2, df[-1], sum),
all = TRUE
)
> replace(res, is.na(res), 0)
Group1 Group2 term1 term2 term3
1 a d 1 2 0
2 a e 1 0 1
3 b d 1 0 0
4 b e 1 2 0
5 c d 0 0 0
6 c e 0 2 2
We can group by 'Group1', 'Group2', get the sum of the 'term' columns in summarise, and expand the data with complete for the missing combinations
library(dplyr)
library(tidyr)
# Sum every term* column per (Group1, Group2), add the combinations that never
# occur in df, and set their sums to 0.
# The term columns are picked by prefix in both steps, so nothing has to be
# listed by name and the code scales to thousands of term columns.
df %>%
  group_by(Group1, Group2) %>%
  summarise(across(starts_with("term"), sum), .groups = "drop") %>%
  complete(Group1, Group2) %>%
  mutate(across(starts_with("term"), ~ replace_na(.x, 0)))
-output
# A tibble: 6 x 5
Group1 Group2 term1 term2 term3
<chr> <chr> <dbl> <dbl> <dbl>
1 a d 1 2 0
2 a e 1 0 1
3 b d 1 0 0
4 b e 1 2 0
5 c d 0 0 0
6 c e 0 2 2
If you don't need to complete all the combinations, setDT(df)[,lapply(.SD[,-1], sum),.(Group1,Group2)] is enough. Otherwise, you can use complete from the tidyr package (as used in the first answer) to fill
in the missing combinations.
library(data.table)
library(tidyr)
# Collect the term columns by name pattern so none of them has to be typed out
# (the real data has thousands of them).
term_cols <- grep("^term", names(df), value = TRUE)
# Sum the term columns per (Group1, Group2); .SDcols restricts .SD to those
# columns instead of dropping the user column by position. Note that setDT()
# converts df to a data.table by reference.
setDT(df)[, lapply(.SD, sum), by = .(Group1, Group2), .SDcols = term_cols] %>%
  # Add the missing Group1 x Group2 combinations with 0 in every term column.
  complete(Group1, Group2, fill = setNames(rep(list(0), length(term_cols)), term_cols))
#> # A tibble: 6 x 5
#> Group1 Group2 term1 term2 term3
#> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 a d 1 2 0
#> 2 a e 1 0 1
#> 3 b d 1 0 0
#> 4 b e 1 2 0
#> 5 c d 0 0 0
#> 6 c e 0 2 2

How to subtract value of one group from other groups in R

I am trying to subtract the value of one group from another. I am hoping to use tidyverse
structure(list(A = c(1, 1, 1, 2, 2, 2, 3, 3, 3), group = c("a",
"b", "c", "a", "b", "c", "a", "b", "c"), value = c(10, 11, 12,
11, 40, 23, 71, 72, 91)), class = "data.frame", row.names = c(NA,
-9L))
That is my data, and I want to subtract all values of group A from B and C, and store the difference in one variable.
baseR solution
# Subtract, within each A, the (mean) value of the group "a" row(s) from every row.
# ave() is given row indices rather than df$value so that the group == "a" test
# is evaluated on the rows of the current A only. Indexing the per-A subset with
# the full-length mask df$group == "a" only works when "a" happens to be the
# first row of every A.
df$new <- df$value - ave(
  seq_len(nrow(df)), df$A,
  FUN = function(i) mean(df$value[i][df$group[i] == "a"], na.rm = TRUE)
)
> df
A group value new
1 1 a 10 0
2 1 b 11 1
3 1 c 12 2
4 2 a 11 0
5 2 b 40 29
6 2 c 23 12
7 3 a 71 0
8 3 b 72 1
9 3 c 91 20
dplyr method (this assumes there is no more than one 'a' value per group; otherwise R cannot tell which value to subtract and the result will be an error)
# Within each A, subtract the group "a" value from the other rows; the "a" row
# itself keeps its original value.
df %>%
  group_by(A) %>%
  mutate(new = ifelse(group == "a", value, value - value[group == "a"]))
# A tibble: 9 x 4
# Groups: A [3]
A group value new
<dbl> <chr> <dbl> <dbl>
1 1 a 10 10
2 1 b 11 1
3 1 c 12 2
4 2 a 11 11
5 2 b 40 29
6 2 c 23 12
7 3 a 71 71
8 3 b 72 1
9 3 c 91 20
or if you want to change all values
# Within each A, subtract the group "a" value from every row, so the "a" row
# itself becomes 0.
df %>%
  group_by(A) %>%
  mutate(new = value - value[group == "a"])
# A tibble: 9 x 4
# Groups: A [3]
A group value new
<dbl> <chr> <dbl> <dbl>
1 1 a 10 0
2 1 b 11 1
3 1 c 12 2
4 2 a 11 0
5 2 b 40 29
6 2 c 23 12
7 3 a 71 0
8 3 b 72 1
9 3 c 91 20
I only used data.table rather than data.frame because I'm more familiar.
library(data.table)
data <- setDT(structure(list(A = c(1, 1, 1, 2, 2, 2, 3, 3, 3), group = c("a",
"b", "c", "a", "b", "c", "a", "b", "c"), value = c(10, 11, 12,
11, 40, 23, 71, 72, 91)), class = "data.frame", row.names = c(NA,-9L)))
# Subtract the group "a" value from every row that shares its A.
# Grouping with by = A replaces the explicit loop and no longer assumes that A
# takes exactly the values 1..k. [1L] takes the first "a" value of each A and
# yields NA when an A has no "a" row.
data[, substraction := value - value[group == "a"][1L], by = A]

How to add new column and calculate recursive cum using dplyr and shift

I have a dataset: (actually I have more than 100 groups)
and I want to use dplyr to create a variable-y for each group, and fill first value of y to be 1,
Second y = 1* first x + 2*first y
The result would be:
I tried to create a column- y, all=1, then use
df%>% group_by(group)%>% mutate(var=shift(x)+2*shift(y))%>% ungroup()
but then the formula for y always uses the initial y value (1) instead of the previously computed y:
Second y = 1* first x + 2*1
Could someone give me some ideas about this? Thank you!
The dput of my result data is:
structure(list(group = c("a", "a", "a", "a", "a", "b", "b", "b" ), x =
c(1, 2, 3, 4, 5, 6, 7, 8), y = c(1, 3, 8, 19, 42, 1, 8, 23)),
row.names = c(NA, -8L), class = c("tbl_df", "tbl", "data.frame" ))
To perform such calculation we can use accumulate from purrr or Reduce in base R.
Since you are already using dplyr we can use accumulate :
library(dplyr)
# Per group: y starts at 1 and each following y is 2 * previous y + previous x.
# The last x of a group never feeds into a y, hence x[-n()].
df %>%
  group_by(group) %>%
  mutate(
    y1 = purrr::accumulate(
      x[-n()],
      function(prev_y, prev_x) prev_y * 2 + prev_x,
      .init = 1
    )
  )
# group x y y1
# <chr> <dbl> <dbl> <dbl>
#1 a 1 1 1
#2 a 2 3 3
#3 a 3 8 8
#4 a 4 19 19
#5 a 5 42 42
#6 b 6 1 1
#7 b 7 8 8
#8 b 8 23 23

How to loop data in R?

Here is a piece of my data:
data_x <- tribble(
~price, ~bokey, ~id, ~cost, ~revenue,
1, "a", 10, 0.20, 30,
2, "b", 20, 0.30, 60,
3, "c", 20, 0.30, 40,
4, "d", 10, 0.20, 100,
5, "e", 30, 0.10, 40,
6, "f", 10, 0.20, 10,
1, "g", 20, 0.30, 80,
2 , "h", 10, 0.20, 20,
3, "h", 30, 0.10, 20,
3, "i", 20, 0.30, 40,
)
As you can see, there are three different types of IDs: 10, 20, 30. But in the real data, there are almost 100 ids. I want to aggregate the data based on these ids. Because I don't know how to do it in a loop, I basically created some subsets:
data_10 <- data_x %>% filter(id == 10)
data_20 <- data_x %>% filter(id == 20)
data_30 <- data_x %>% filter(id == 30)
Here is the aggregated data:
# Aggregate the id == 10 subset by price segment.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
data_agg <- data_10 %>%
  group_by(priceseg = cut(as.numeric(price), c(0, 1, 3, 5, 6))) %>%
  summarise(price_n = n_distinct(bokey),
            Cost = sum(cost, na.rm = TRUE),
            Revenue = sum(revenue, na.rm = TRUE),
            clicks = n_distinct(bokey)) %>%
  # The parentheses around the last expression make it an unnamed argument, so
  # the resulting column is literally called `(zet = Cost/Revenue)`.
  mutate(price_n2 = round(100 * prop.table(price_n), 2),
         (zet = Cost/Revenue))
But I want to have one more column that shows the id. Here is the desired data:
data_desired <- tribble(
~id, ~priceseg, ~price_n, ~Cost, ~Revenue, ~clicks, ~price_n2, ~`(zet = Cost/Revenue)`
10, (0,1] 1 0.2 30 1 25 0.00667
10, (1,3] 1 0.2 20 1 25 0.01
10, (3,5] 1 0.2 100 1 25 0.002
10, (5,6] 1 0.2 10 1 25 0.02
20,
20,
.
.
) 30,
How can I get it?
Since you are already using dplyr, just add id as one of the grouping variables (no need to previously separate your data):
# Aggregate by id and price segment in one pass; no per-id subsets are needed.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
data_agg <- data_x %>%
  group_by(id, priceseg = cut(as.numeric(price), c(0, 1, 3, 5, 6))) %>%
  summarise(price_n = n_distinct(bokey),
            Cost = sum(cost, na.rm = TRUE),
            Revenue = sum(revenue, na.rm = TRUE),
            clicks = n_distinct(bokey)) %>%
  # The summarised result is still grouped by id (see "# Groups: id [3]" in the
  # output), so prop.table() gives the share within each id.
  # The parentheses around the last expression make it an unnamed argument, so
  # the column is literally called `(zet = Cost/Revenue)`, as in the output.
  mutate(price_n2 = round(100 * prop.table(price_n), 2),
         (zet = Cost/Revenue))
# A tibble: 8 x 8
# Groups: id [3]
# id priceseg price_n Cost Revenue clicks price_n2 `(zet = Cost/Revenue)`
# <dbl> <fct> <int> <dbl> <dbl> <int> <dbl> <dbl>
# 1 10 (0,1] 1 0.2 30 1 25 0.00667
# 2 10 (1,3] 1 0.2 20 1 25 0.01
# 3 10 (3,5] 1 0.2 100 1 25 0.002
# 4 10 (5,6] 1 0.2 10 1 25 0.02
# 5 20 (0,1] 1 0.3 80 1 25 0.00375
# 6 20 (1,3] 3 0.900 140 3 75 0.00643
# 7 30 (1,3] 1 0.1 20 1 50 0.005
# 8 30 (3,5] 1 0.1 40 1 50 0.0025
An option is to split and loop over with map while specifying the .id
library(dplyr)
library(purrr)
# Split the data by id, aggregate each piece by price segment, and row-bind the
# results; .id = "id" stores the name of each piece (the id) in an id column.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
data_x %>%
  split(.$id) %>%
  map_dfr(~
    .x %>%
      group_by(priceseg = cut(as.numeric(price), c(0, 1, 3, 5, 6))) %>%
      summarise(price_n = n_distinct(bokey),
                Cost = sum(cost, na.rm = TRUE),
                Revenue = sum(revenue, na.rm = TRUE),
                clicks = n_distinct(bokey)) %>%
      # The parenthesised expression is unnamed, so the column is literally
      # called `(zet = Cost/Revenue)`, as in the output.
      mutate(price_n2 = round(100 * prop.table(price_n), 2),
             (zet = Cost/Revenue)), .id = "id")
# A tibble: 8 x 8
# id priceseg price_n Cost Revenue clicks price_n2 `(zet = Cost/Revenue)`
# <chr> <fct> <int> <dbl> <dbl> <int> <dbl> <dbl>
#1 10 (0,1] 1 0.2 30 1 25 0.00667
#2 10 (1,3] 1 0.2 20 1 25 0.01
#3 10 (3,5] 1 0.2 100 1 25 0.002
#4 10 (5,6] 1 0.2 10 1 25 0.02
#5 20 (0,1] 1 0.3 80 1 25 0.00375
#6 20 (1,3] 3 0.900 140 3 75 0.00643
#7 30 (1,3] 1 0.1 20 1 50 0.005
#8 30 (3,5] 1 0.1 40 1 50 0.0025
The cut step can also be changed with findInterval
NOTE: The idea of split/map is based on the OP's title about looping and getting the output

How do you use dot referencing (for the data frame) and groups in dplyr?

Consider the following example data:
tmp_df_dplyr <- data.frame(groups = rep(c("C", "B", "A"), each = 3),
a = c(-2, 0, -1, -1, 0, 1, 0, 1, 2),
b = rep(c(-1, 0, 1), each = 3))
I wish to do the following, except using colSums:
tmp_df_dplyr %>%
group_by(groups) %>%
summarise(min_group = min(c(sum(a), sum(b))))
# produces:
# A tibble: 3 × 2
groups min_group
<fctr> <dbl>
1 A 3
2 B 0
3 C -3
Using dot referencing, I get an unexpected result:
# Attempt to compute the per-group minimum of the column sums via `.`.
tmp_df_dplyr %>%
group_by(groups) %>%
# NOTE: `.` here is the magrittr placeholder for the object piped into
# summarise(), i.e. the whole data frame rather than the rows of the current
# group, so colSums() sees all rows and every group gets the same value.
summarise(min_group = min(colSums(.[, c('a', 'b')])))
# produces
# A tibble: 3 × 2
groups min_group
<fctr> <dbl>
1 A 0
2 B 0
3 C 0
that is, it looks like the groups are not being applied.

Resources