Summarize values by group, but keep original data - r

I am trying to figure out how to sum values belonging to category a and b by factor file, but also keep the original data.
library(dplyr)
df <- data.frame(ID = 1:20, values = runif(20), category = rep(letters[1:5], 4), file = as.factor(sort(rep(1:5, 4))))
ID values category file
1 1 0.65699229 a 1
2 2 0.70506478 b 1
3 3 0.45774178 c 1
4 4 0.71911225 d 1
5 5 0.93467225 e 1
6 6 0.25542882 a 2
7 7 0.46229282 b 2
8 8 0.94001452 c 2
9 9 0.97822643 d 2
10 10 0.11748736 e 2
11 11 0.47499708 a 3
12 12 0.56033275 b 3
13 13 0.90403139 c 3
14 14 0.13871017 d 3
15 15 0.98889173 e 3
16 16 0.94666823 a 4
17 17 0.08243756 b 4
18 18 0.51421178 c 4
19 19 0.39020347 d 4
20 20 0.90573813 e 4
so that
df[1,2] will be added to df[2,2] to category 'ab' for file 1
df[6,2] will be added to df[7,2] to category 'ab' for file 2
etc.
So far I have this:
df %>%
filter(category %in% c('a' , 'b')) %>%
group_by(file) %>%
summarise(values = sum(values))
Problem
I would like to change the category of the summed values to "ab" and append it to the original data frame in the same pipeline.
Desired output:
ID values category file
1 1 0.65699229 a 1
2 2 0.70506478 b 1
3 3 0.45774178 c 1
4 4 0.71911225 d 1
5 5 0.93467225 e 1
6 6 0.25542882 a 2
7 7 0.46229282 b 2
8 8 0.94001452 c 2
9 9 0.97822643 d 2
10 10 0.11748736 e 2
11 11 0.47499708 a 3
12 12 0.56033275 b 3
13 13 0.90403139 c 3
14 14 0.13871017 d 3
15 15 0.98889173 e 3
16 16 0.94666823 a 4
17 17 0.08243756 b 4
18 18 0.51421178 c 4
19 19 0.39020347 d 4
20 20 0.90573813 e 4
21 21 1.25486225 ab 1
22 22 1.87216325 ab 2
23 23 1.36548126 ab 3

This will get you the result
df %>% bind_rows(
df %>%
filter(category %in% c('a' , 'b')) %>%
group_by(file) %>%
mutate(values = sum(values), category = paste0(category,collapse='')) %>%
filter(row_number() == 1 & n() > 1)
) %>% mutate(ID = row_number())
BTW the code pro produce the dataframe in the example is this one:
df <- data.frame(ID = 1:20, values = runif(20), category = rep(letters[1:5], 4), file = as.factor(sort(rep(1:4, 5))))
now lets say you want to sum multiple columns, you need to provide the list in a vector:
cols = c("values") # columns to be sum
df %>% bind_rows(
df %>%
filter(category %in% c('a' , 'b')) %>%
group_by(file) %>%
mutate_at(vars(cols), sum) %>%
mutate(category = paste0(category,collapse='')) %>%
filter(row_number() == 1 & n() > 1)
) %>% mutate(ID = row_number())

library(dplyr)
df1 %>%
filter(category %in% c('a' , 'b')) %>%
group_by(file) %>%
filter(n_distinct(category) > 1) %>%
summarise(values = sum(values)) %>%
mutate(category="ab",
ID=max(df1$ID)+1:n()) %>%
bind_rows(df1, .)
#> Warning in bind_rows_(x, .id): binding factor and character vector,
#> coercing into character vector
#> Warning in bind_rows_(x, .id): binding character and factor vector,
#> coercing into character vector
#> ID values category file
#> 1 1 0.62585921 a 1
#> 2 2 0.61865851 b 1
#> 3 3 0.05274456 c 1
#> 4 4 0.68156961 d 1
.
.
.
#> 19 19 0.43239411 d 5
#> 20 20 0.85886314 e 5
#> 21 21 1.24451773 ab 1
#> 22 22 0.99001810 ab 2
#> 23 23 1.25331943 ab 3

This data.table approach uses a self-join to get all of the possible two-character combinations.
library(data.table)
setDT(df)
df_self_join <- df[df, on = .(file), allow.cartesian = T
][category != i.category,
.(category = paste0(i.category, category), values = values + i.values, file)
][order(category), .(ID = .I + nrow(df), values, category, file)]
rbindlist(list(df, df_self_join))
ID values category file
1: 1 0.76984382 a 1
2: 2 0.54311583 b 1
3: 3 0.23462016 c 1
4: 4 0.60179043 d 1
...
20: 20 0.03534223 e 5
21: 21 1.31295965 ab 1
22: 22 0.51666175 ab 2
23: 23 1.02305754 ab 3
24: 24 1.00446399 ac 1
25: 25 0.96910373 ac 2
26: 26 0.87795389 ac 4
#total of 80 rows
Here is pretty close dplyr translation:
library(dplyr)
tib <- as_tibble(df)
inner_join(tib, tib, by = 'file')%>%
filter(ID.x != ID.y)%>%
transmute(category = paste0(category.x, category.y)
, values = values.x + values.y
, file)%>%
arrange(category)%>%
bind_rows(tib, .)%>%
mutate(ID = row_number())%>%
filter(category == 'ab') #filter added to show the "ab" files
# A tibble: 3 x 4
ID values category file
<int> <dbl> <chr> <fct>
1 21 1.31 ab 1
2 22 0.517 ab 2
3 23 1.02 ab 3

Related

How to create unit record data from summary table

I have a summary table I need to expand into unit-level observations to run test statistics on.
The summary table looks like this
tbl_summmary <-
tibble(
outcome = c("A (%)", "B (%)", "C (%)", "D (%)", "Total (n)"),
group_1 = c(.25, .25, .125, .325, 10),
group_2 = c(.50, 0.0, .325, .125, 20),
group_3 = c(.10, .40, .125, .325, 40))
tbl_summmary
The result needs to be a long format dataframe of the data described in the summary table.
df_simulation <-
bind_rows(
tibble(
group = 1,
outcome = c(
rep("A", .25*10),
rep("B", .25*10),
rep("C", .125*10),
rep("D", .325*10))),
tibble(
group = 2,
outcome = c(
rep("A", .50*20),
rep("B", .00*20),
rep("C", .325*20),
rep("D", .125*20))),
tibble(
group = 3,
outcome = c(
rep("A", .10*40),
rep("B", .40*40),
rep("C", .125*40),
rep("D", .325*40))))
df_simulation
However, the code needs to iterate over multiple variables and multiple groups. Typing out each group and variable manually like in the above won't be scalable. A way of doing this programmatically would be much appreciated!
One option would be to use tidyr::uncount after some additional data wrangling steps:
library(tidyr)
library(dplyr)
# Get df of totals
totals <- tbl_summmary[nrow(tbl_summmary),] |>
pivot_longer(-outcome, names_to = "group", names_prefix = "group_") |>
select(group, total = value)
# Get df of outcomes
outcomes <- tbl_summmary[-nrow(tbl_summmary),] |>
mutate(outcome = gsub("^(\\w+).*$", "\\1", outcome)) |>
pivot_longer(-outcome, names_to = "group", names_prefix = "group_") |>
left_join(totals, by = "group") |>
# We need integers, so use round
mutate(n = round(value * total)) |>
select(group, outcome, n) |>
uncount(n) |>
mutate(group = as.numeric(group)) |>
arrange(group, outcome)
count(outcomes, outcome, group)
#> # A tibble: 11 × 3
#> outcome group n
#> <chr> <dbl> <int>
#> 1 A 1 2
#> 2 A 2 10
#> 3 A 3 4
#> 4 B 1 2
#> 5 B 3 16
#> 6 C 1 1
#> 7 C 2 6
#> 8 C 3 5
#> 9 D 1 3
#> 10 D 2 2
#> 11 D 3 13
identical(df_simulation, outcomes)
#> [1] TRUE
I do like the approach by #stefan.
If you want the format you requested, it can be done by listing the outcome values by the total amount then unnesting the values, then putting it into long format.
library(dplyr)
library(tidyr)
library(stringr)
df <- tbl_summmary |>
mutate(across(starts_with("group"), ~ .x *.x[5])) |>
rowwise() |>
mutate(across(starts_with("group"), ~
ifelse(.x != 0,
list(rep(str_remove(outcome, " \\(%\\)"), .x)),
list(NA_character_))))
df[-5, -1] |>
unnest_wider(col = everything(), names_sep = "_") |>
stack() |>
mutate(ind = str_extract(str_remove(ind, "_\\d+$"), "\\d"), .before = 1) |>
rename(group = ind, outcome = values) |>
drop_na()
outcome group
1 A 1
2 B 1
3 C 1
4 D 1
5 A 1
6 B 1
7 D 1
8 D 1
9 A 2
10 C 2
11 D 2
12 A 2
13 C 2
14 D 2
15 A 2
16 C 2
17 A 2
18 C 2
19 A 2
20 C 2
21 A 2
22 C 2
23 A 2
24 A 2
25 A 2
26 A 2
27 A 3
28 B 3
29 C 3
30 D 3
31 A 3
32 B 3
33 C 3
34 D 3
35 A 3
36 B 3
37 C 3
38 D 3
39 A 3
40 B 3
41 C 3
42 D 3
43 B 3
44 C 3
45 D 3
46 B 3
47 D 3
48 B 3
49 D 3
50 B 3
51 D 3
52 B 3
53 D 3
54 B 3
55 D 3
56 B 3
57 D 3
58 B 3
59 D 3
60 B 3
61 D 3
62 B 3
63 B 3
64 B 3

R iterating by group and mapping values based on column value

I have the following data frame in R:
df <- data.frame(name = c('p1_start','p1_end','p2_start','p2_end','p1_start','p1_end','p2_start','p2_end','p1_start','p1_end','p2_start','p2_end','p1_start','p1_end','p2_start','p2_end'),
time = c(1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31),
target = c(1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2),
comb = c(0,0,0,0,1,1,1,1,0,0,0,0,1,1,1,1))
And another data frame:
data <- data.frame(time = c(2,5,8,14,14,20,21,26,28,28),
name = c('a','b','c','d','e','f','g','h','i','j'))
So, if we take a look at df we could sort the data by target and combination and we will notice that there are basically "groups". For example for target=1 and comb=0 there are four entries p1_start,p1_end,p2_start,p2_end and it is the same for all other target/comb combinations.
On the other side data contains entries with time being a timestamp.
Goal: I want to map the values from both data frames based on time.
Example: The first entry of data has time=2 meaning it happened between p1_start,p1_end so it should get the values target=1 and comb=0 mapped to the data data frame.
Example 2: The entries of data with time=14 happened between p2_start,p2_end so they should get the values target=1 and comb=1 mapped to the data data frame.
Idea: I thought I iterate over df by target and comb and for each combination of them check if there are rows in data whose time is between. The second could be done with the following command:
data[which(data$time > p1_start & data$time < p2_end),]
once I get the rows it is easy to append the values.
Problem: how could I do the iteration? I tried with the following:
df %>%
group_by(target, comb) %>%
print(data[which(data$time > df$p1_start & data$time < df$p2_end),])
But I am getting an error that time has not been initialized
Your problem is best known as performing non-equi join. We need to find a range in some given dataframe that corresponds to each value in one or more given vectors. This is better handled by the data.table package.
We would first transform your df into a format suitable for performing the join and then join data with df by time <= end while time >= start. Here is the code
library(data.table)
setDT(df)[, c("type", "name") := tstrsplit(name, "_", fixed = TRUE)]
df <- dcast(df, ... ~ name, value.var = "time")
cols <- c("target", "comb", "type")
setDT(data)[df, (cols) := mget(paste0("i.", cols)), on = .(time<=end, time>=start)]
After dcast, df looks like this
target comb type end start
1: 1 0 p1 3 1
2: 1 0 p2 7 5
3: 1 1 p1 11 9
4: 1 1 p2 15 13
5: 2 0 p1 19 17
6: 2 0 p2 23 21
7: 2 1 p1 27 25
8: 2 1 p2 31 29
And the output is
> data
time name target comb type
1: 2 a 1 0 p1
2: 5 b 1 0 p2
3: 8 c NA NA <NA>
4: 14 d 1 1 p2
5: 14 e 1 1 p2
6: 20 f NA NA <NA>
7: 21 g 2 0 p2
8: 26 h 2 1 p1
9: 28 i NA NA <NA>
10: 28 j NA NA <NA>
Here is a tidyverse solution:
library(tidyr)
library(dplyr)
df %>%
rename(name_df=name) %>%
mutate(x = time +1) %>%
pivot_longer(
cols = c(time, x),
names_to = "helper",
values_to = "time"
) %>%
right_join(data, by="time") %>%
select(time, name, target, comb)
time name target comb
<dbl> <chr> <dbl> <dbl>
1 2 a 1 0
2 5 b 1 0
3 8 c 1 0
4 14 d 1 1
5 14 e 1 1
6 20 f 2 0
7 21 g 2 0
8 26 h 2 1
9 28 i 2 1
10 28 j 2 1
df <- data.frame(name = c('p1_start','p1_end','p2_start','p2_end','p1_start','p1_end','p2_start','p2_end','p1_start','p1_end','p2_start','p2_end','p1_start','p1_end','p2_start','p2_end'),
time = c(1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31),
target = c(1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2),
comb = c(0,0,0,0,1,1,1,1,0,0,0,0,1,1,1,1))
data <- data.frame(time = c(2,5,8,14,14,20,21,26,28,28),
name = c('a','b','c','d','e','f','g','h','i','j'))
library(fuzzyjoin)
library(tidyverse)
tmp <- df %>%
separate(name,
into = c("p", "period"),
sep = "_",
remove = TRUE) %>%
pivot_wider(
id_cols = c(p, target, comb),
names_from = period,
values_from = time
) %>%
select(-p)
fuzzy_left_join(
x = data,
y = tmp,
by = c("time" = "start",
"time" = "end"),
match_fun = list(`>=`, `<=`))
#> time name target comb start end
#> 1 2 a 1 0 1 3
#> 2 5 b 1 0 5 7
#> 3 8 c NA NA NA NA
#> 4 14 d 1 1 13 15
#> 5 14 e 1 1 13 15
#> 6 20 f NA NA NA NA
#> 7 21 g 2 0 21 23
#> 8 26 h 2 1 25 27
#> 9 28 i NA NA NA NA
#> 10 28 j NA NA NA NA
Created on 2022-01-11 by the reprex package (v2.0.1)

Mutate new column with unique values for each list

I have a list here, and I wish to mutate a new column with unique values for each list relative to the mutation. For example, I want to mutate a column named ID as n >= 1.
Naturally, on a dataframe I would do this:
dat %>% mutate(id = row_number())
For a list, I would do this:
dat%>% map(~ mutate(., ID = row_number()))
And I would get an output likeso:
dat <- list(data.frame(x=c("a", "b" ,"c", "d", "e" ,"f" ,"g") ), data.frame(y=c("p", "lk", "n", "m", "g", "f", "t")))
[[1]]
x id
1 a 1
2 b 2
3 c 3
4 d 4
5 e 5
6 f 6
7 g 7
[[2]]
y id
1 p 1
2 lk 2
3 n 3
4 m 4
5 g 5
6 f 6
7 t 7
Though, how would I mutate a new column ID such that the row number continues from the first list.
Expected output:
[[1]]
x id
1 a 1
2 b 2
3 c 3
4 d 4
5 e 5
6 f 6
7 g 7
[[2]]
y id
1 p 8
2 lk 9
3 n 10
4 m 11
5 g 12
6 f 13
7 t 14
An option is to bind them into a single dataset, create the 'id' with row_number(), split by 'grp', loop over the list and remove any columns that have all NA values
library(dplyr)
library(purrr)
dat %>%
bind_rows(.id = 'grp') %>%
mutate(id = row_number()) %>%
group_split(grp) %>%
map(~ .x %>%
select(where(~ any(!is.na(.))), -grp))
-output
#[[1]]
# A tibble: 7 x 2
# x id
# <chr> <int>
#1 a 1
#2 b 2
#3 c 3
#4 d 4
#5 e 5
#6 f 6
#7 g 7
#[[2]]
# A tibble: 7 x 2
# y id
# <chr> <int>
#1 p 8
#2 lk 9
#3 n 10
#4 m 11
#5 g 12
#6 f 13
#7 t 14
Or an easier approach is to unlist (assuming single column), get the sequence, add a new column with map2
map2(dat, relist(seq_along(unlist(dat)), skeleton = dat),
~ .x %>% mutate(id = .y))
Or using a for loop
dat[[1]]$id <- seq_len(nrow(dat[[1]]))
for(i in seq_along(dat)[-1]) dat[[i]]$id <-
seq(tail(dat[[i-1]]$id, 1) + 1, length.out = nrow(dat[[i]]), by = 1)

dplyr collapse 'tail' rows into larger groups

library(tidyverse)
df <- tibble(a = as.factor(1:20), b = c(50, 20, 13, rep(2, 10), rep(1, 7)))
How do I make dplyr look at this data frame df and collapse all these occurences of 2 into a single summed group, and collapse all the occurrences of 1 into a single summed group? And also keep the rest of the data frame.
Turn this:
# A tibble: 20 x 2
a b
<fct> <dbl>
1 1 50
2 2 20
3 3 13
4 4 2
5 5 2
6 6 2
7 7 2
8 8 2
9 9 2
10 10 2
11 11 2
12 12 2
13 13 2
14 14 1
15 15 1
16 16 1
17 17 1
18 18 1
19 19 1
20 20 1
into this:
# A tibble: 5 x 2
a b
<fct> <dbl>
1 1 50
2 2 20
3 3 13
4 grp2 20
5 grp1 7
[Edit] - I fixed the example data. Sorry about that.
We group by a manufactured sortkey to maintain sort order. We used the fact that b is in descending order in the input but if that is not the case in your actual data then replace sortkey = -b with the more general sortkey = data.table::rleid(b) or the longer sortkey = cumsum(coalesce(b != lag(b), FALSE)) .
We also convert b to the group names giving a new a. It wasn't clear which groups are to be converted to grp... form. Hard-coded 1 and 2? Any group with more than one row? Groups at the end with more than one row? At any rate it would be easy enough to change the condition in the if_else once that were clarified.
Finally perform the summation and then remove the sortkey.
df %>%
group_by(sortkey = -b, a = paste0(if_else(b %in% 1:2, "grp", ""), b)) %>%
summarize(b = sum(b)) %>%
ungroup %>%
select(-sortkey)
giving:
# A tibble: 5 x 2
a b
<chr> <int>
1 50 50
2 20 20
3 13 13
4 grp2 20
5 grp1 7
Here's a way. I have converted a from factor to character to make things easier. You can convert it back to factor if you want. Also your test data was a bit wrong.
df <- tibble(a = as.character(1:20), b = c(50, 20, 13, rep(2, 10), rep(1, 7)))
df %>%
mutate(
a = case_when(
b == 1 ~ "grp1",
b == 2 ~ "grp2",
TRUE ~ a
)
) %>%
group_by(a) %>%
summarise(b = sum(b))
# A tibble: 5 x 2
a b
<chr> <dbl>
1 1 50
2 2 20
3 3 13
4 grp1 7
5 grp2 20
This is an approach which gives you the desired names for groups & where you don't need to think in advance how many cases like that you would need (e.g. it would create grp3, grp4, ... depending on the number in b).
library(dplyr)
df %>%
mutate(
grp = as.numeric(lag(df$b) != df$b),
grp = cumsum(ifelse(is.na(grp), 0, grp))
) %>% group_by(grp) %>%
mutate(
a = ifelse(n() > 1, paste0("grp", b), a),
b = sum(b)
) %>% ungroup() %>% distinct(a, b)
Output:
a b
<chr> <dbl>
1 1 50
2 2 20
3 3 13
4 grp2 20
5 grp1 7
Note that the code could be also condensed but that leads to a certain lack of readability in my opinion:
df %>%
group_by(grp = cumsum(ifelse(is.na(as.numeric(lag(df$b) != df$b)), 0, as.numeric(lag(df$b) != df$b)))) %>%
mutate(
a = ifelse(n() > 1, paste0("grp", b), a),
b = sum(b)
) %>% ungroup() %>% distinct(a, b)

Add column across rows condtionally

with df like below
df <- data.frame(
name = rep(c("A", "B", "C"),2),
type = c("10", "10", "10","20", "20", "20"),
val = c(1,2,3,4,5,6)
)
> df
name type val
1 A 10 1
2 B 10 2
3 C 10 3
4 A 20 4
5 B 20 5
6 C 20 6
>
the expected output is
I need to add val of all records with name C to val of records with name A for the corresponding type with a new name AC. Need an output keeping name C and without it.
output1
name type val
1 A 10 1
2 B 10 2
3 C 10 3
4 AC 10 4
5 A 20 4
6 B 20 5
7 C 20 6
8 AC 20 10
output2
name type val
1 AC 10 4
2 B 10 2
4 AC 20 10
5 B 20 5
>
prefer dplyr based solution
Here is one way,
library(dplyr)
df %>%
mutate(new = as.integer(name %in% c('A', 'C'))) %>%
group_by(type, new) %>%
summarise(name = paste0(name, collapse = ''), val = sum(val)) %>%
ungroup() %>%
select(-new)
# A tibble: 4 × 3
# type name val
# <fctr> <chr> <dbl>
#1 10 B 2
#2 10 AC 4
#3 20 B 5
#4 20 AC 10
To get the other output then,
df %>%
mutate(new = as.integer(name %in% c('A', 'C'))) %>%
group_by(type, new) %>%
summarise(name = paste0(name, collapse = ''), val = sum(val)) %>%
ungroup() %>%
select(-new) %>%
filter(nchar(name) > 1) %>%
bind_rows( df) %>%
arrange(val)
# A tibble: 8 × 3
# type name val
# <fctr> <chr> <dbl>
#1 10 A 1
#2 10 B 2
#3 10 C 3
#4 10 AC 4
#5 20 A 4
#6 20 B 5
#7 20 C 6
#8 20 AC 10
Here is another (requires tidyr as well as dplyr)
df1 <- df %>% group_by(type) %>%
summarise(AC=sum(val[name %in% c("A","C")]),B=val[name=="B"]) %>%
gather(key=name,value=val,-type) %>%
arrange(type)
Here is one option using data.table
library(data.table)
rbindlist(list(df, setDT(df)[, .(name = "AC", val = sum(val[as.character(name) %chin%
c("A", "C")])) , .(type)][, names(df), with = FALSE]))[order(type, name)]
# name type val
#1: A 10 1
#2: B 10 2
#3: C 10 3
#4: AC 10 4
#5: A 20 4
#6: B 20 5
#7: C 20 6
#8: AC 20 10
Or with dplyr
library(dplyr)
df %>%
filter(name %in% c("A", "C")) %>%
group_by(type) %>%
summarise(name = 'AC', val = sum(val)) %>%
full_join(df, ., on = 'type') %>%
arrange(type, val)
# name type val
#1 A 10 1
#2 B 10 2
#3 C 10 3
#4 AC 10 4
#5 A 20 4
#6 B 20 5
#7 C 20 6
#8 AC 20 10

Resources