How to create unit record data from summary table - r

I have a summary table I need to expand into unit-level observations to run test statistics on.
The summary table looks like this
tbl_summmary <-
tibble(
outcome = c("A (%)", "B (%)", "C (%)", "D (%)", "Total (n)"),
group_1 = c(.25, .25, .125, .325, 10),
group_2 = c(.50, 0.0, .325, .125, 20),
group_3 = c(.10, .40, .125, .325, 40))
tbl_summmary
The result needs to be a long format dataframe of the data described in the summary table.
df_simulation <-
bind_rows(
tibble(
group = 1,
outcome = c(
rep("A", .25*10),
rep("B", .25*10),
rep("C", .125*10),
rep("D", .325*10))),
tibble(
group = 2,
outcome = c(
rep("A", .50*20),
rep("B", .00*20),
rep("C", .325*20),
rep("D", .125*20))),
tibble(
group = 3,
outcome = c(
rep("A", .10*40),
rep("B", .40*40),
rep("C", .125*40),
rep("D", .325*40))))
df_simulation
However, the code needs to iterate over multiple variables and multiple groups. Typing out each group and variable manually like in the above won't be scalable. A way of doing this programmatically would be much appreciated!

One option would be to use tidyr::uncount after some additional data wrangling steps:
library(tidyr)
library(dplyr)
# Get df of totals
totals <- tbl_summmary[nrow(tbl_summmary),] |>
pivot_longer(-outcome, names_to = "group", names_prefix = "group_") |>
select(group, total = value)
# Get df of outcomes
outcomes <- tbl_summmary[-nrow(tbl_summmary),] |>
mutate(outcome = gsub("^(\\w+).*$", "\\1", outcome)) |>
pivot_longer(-outcome, names_to = "group", names_prefix = "group_") |>
left_join(totals, by = "group") |>
# We need integers, so use round
mutate(n = round(value * total)) |>
select(group, outcome, n) |>
uncount(n) |>
mutate(group = as.numeric(group)) |>
arrange(group, outcome)
count(outcomes, outcome, group)
#> # A tibble: 11 × 3
#> outcome group n
#> <chr> <dbl> <int>
#> 1 A 1 2
#> 2 A 2 10
#> 3 A 3 4
#> 4 B 1 2
#> 5 B 3 16
#> 6 C 1 1
#> 7 C 2 6
#> 8 C 3 5
#> 9 D 1 3
#> 10 D 2 2
#> 11 D 3 13
identical(df_simulation, outcomes)
#> [1] TRUE

I do like the approach by #stefan.
If you want the format you requested, it can be done by listing the outcome values by the total amount then unnesting the values, then putting it into long format.
library(dplyr)
library(tidyr)
library(stringr)
df <- tbl_summmary |>
mutate(across(starts_with("group"), ~ .x *.x[5])) |>
rowwise() |>
mutate(across(starts_with("group"), ~
ifelse(.x != 0,
list(rep(str_remove(outcome, " \\(%\\)"), .x)),
list(NA_character_))))
df[-5, -1] |>
unnest_wider(col = everything(), names_sep = "_") |>
stack() |>
mutate(ind = str_extract(str_remove(ind, "_\\d+$"), "\\d"), .before = 1) |>
rename(group = ind, outcome = values) |>
drop_na()
outcome group
1 A 1
2 B 1
3 C 1
4 D 1
5 A 1
6 B 1
7 D 1
8 D 1
9 A 2
10 C 2
11 D 2
12 A 2
13 C 2
14 D 2
15 A 2
16 C 2
17 A 2
18 C 2
19 A 2
20 C 2
21 A 2
22 C 2
23 A 2
24 A 2
25 A 2
26 A 2
27 A 3
28 B 3
29 C 3
30 D 3
31 A 3
32 B 3
33 C 3
34 D 3
35 A 3
36 B 3
37 C 3
38 D 3
39 A 3
40 B 3
41 C 3
42 D 3
43 B 3
44 C 3
45 D 3
46 B 3
47 D 3
48 B 3
49 D 3
50 B 3
51 D 3
52 B 3
53 D 3
54 B 3
55 D 3
56 B 3
57 D 3
58 B 3
59 D 3
60 B 3
61 D 3
62 B 3
63 B 3
64 B 3

Related

R: expand grid of all possible combinations within groups and apply functions across all the pairs

data <- tibble(time = c(1,1,2,2), a = c(1,2,3,4), b =c(4,3,2,1), c = c(1,1,1,1))
The result will look like this
result <- tibble(
t = c(1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2),
firm1 = c("a","a","a","b","b","b","c","c","c","a","a","a","b","b","b","c","c","c"),
firm2 = c("a","b","c","a","b","c","a","b","c","a","b","c","a","b","c","a","b","c"),
value = c(6,10,5,10,14,9,5,9,4,14,10,9,10,6,5,9,5,4))
result
The function could be
function(x, y){sum(x, y)}
Basically I am looking for a tidy solution to expand.grid data at each point of time and apply functions across columns. Can anyone help?
I tried this, but I could not have time in front of the pairs.
expected_result<-expand.grid(names(data[-1]), names(data[-1])) %>%
mutate(value = map2(Var1, Var2, ~ fun1(data[.x], data[.y])))
expected_result
Use exand.grid you get all possible combination of columns, split the data by time and apply fun for each row of tmp.
library(dplyr)
library(purrr)
tmp <- expand.grid(firm1 = names(data[-1]), firm2 = names(data[-1]))
fun <- function(x, y) sum(x, y)
result <- data %>%
group_split(time) %>%
map_df(~cbind(time = .x$time[1], tmp,
value = apply(tmp, 1, function(x) fun(.x[[x[1]]], .x[[x[2]]]))))
result
# time firm1 firm2 value
#1 1 a a 6
#2 1 b a 10
#3 1 c a 5
#4 1 a b 10
#5 1 b b 14
#6 1 c b 9
#7 1 a c 5
#8 1 b c 9
#9 1 c c 4
#10 2 a a 14
#11 2 b a 10
#12 2 c a 9
#13 2 a b 10
#14 2 b b 6
#15 2 c b 5
#16 2 a c 9
#17 2 b c 5
#18 2 c c 4
You may also do this in base R -
result <- do.call(rbind, by(data, data$time, function(x) {
cbind(time = x$time[1], tmp,
value = apply(tmp, 1, function(y) fun(x[[y[1]]], x[[y[2]]])))
}))
We may use
library(dplyr)
library(tidyr)
library(purrr)
data1 <- data %>%
group_by(time) %>%
summarise(across(everything(), sum, na.rm = TRUE), .groups = 'drop') %>%
pivot_longer(cols = -time) %>%
group_split(time)
map_dfr(data1, ~ {dat <- .x
crossing(firm1 = dat$name, firm2 = dat$name) %>%
mutate(value = c(outer(dat$value, dat$value, FUN = `+`))) %>%
mutate(time = first(dat$time), .before = 1)})
-output
# A tibble: 18 × 4
time firm1 firm2 value
<dbl> <chr> <chr> <dbl>
1 1 a a 6
2 1 a b 10
3 1 a c 5
4 1 b a 10
5 1 b b 14
6 1 b c 9
7 1 c a 5
8 1 c b 9
9 1 c c 4
10 2 a a 14
11 2 a b 10
12 2 a c 9
13 2 b a 10
14 2 b b 6
15 2 b c 5
16 2 c a 9
17 2 c b 5
18 2 c c 4

Using pivot_longer with existing names_to column

Take an example dataframe like so (the real dataframe has more columns):
df <- data.frame(A = seq(1, 3, 1),
B = seq(4, 6, 1))
I can use pivot_longer to collect my columns of interest (A and B) like so:
library(dplyr)
library(tidyr)
df <- df %>%
pivot_longer(cols = c("A", "B"), names_to = "Letter", values_to = "Number")
df
Letter Number
<chr> <dbl>
1 A 1
2 B 4
3 A 2
4 B 5
5 A 3
6 B 6
Now let's say I have another column C in my dataframe, making it no longer tidy
C <- seq(7, 12, 1)
df_2 <- data.frame(df, C)
df_2
Letter Number C
1 A 1 7
2 B 4 8
3 A 2 9
4 B 5 10
5 A 3 11
6 B 6 12
I want to use pivot_longer again to make df_2 tidy and get this output:
data.frame(Letter = c(rep("A", 3), rep("B", 3), rep("C", 3)),
Number = seq(1, 12, 1))
Letter Number
1 A 1
2 A 2
3 A 3
4 B 4
5 B 5
6 B 6
7 C 7
8 C 8
9 C 9
10 C 10
11 C 11
12 C 12
Using the same strategy creates an error though:
df_2 %>%
pivot_longer(cols = "C", names_to = "Letter", values_to = "Number")
Error: Failed to create output due to bad names.
* Choose another strategy with `names_repair`
Setting names_repair to minimal runs but doesn't produce the output I want.
Follow it like this
library(tidyverse)
df <- data.frame(A = seq(1, 3, 1),
B = seq(4, 6, 1))
df <- df %>%
pivot_longer(cols = c("A", "B"), names_to = "Letter", values_to = "Number")
C <- seq(7, 12, 1)
df_2 <- data.frame(C)
df_2 <- df_2 %>% pivot_longer(cols = C, names_to = "Letter", values_to = "Number")
df_result <- rbind(df, df_2)
Output
> df_result
# A tibble: 12 x 2
Letter Number
<chr> <dbl>
1 A 1
2 B 4
3 A 2
4 B 5
5 A 3
6 B 6
7 C 7
8 C 8
9 C 9
10 C 10
11 C 11
12 C 12
Maybe try this if it is helpful:
library(tidyverse)
#Code
df_2 %>% pivot_longer(everything()) %>%
arrange(name) %>% group_by(name) %>%
filter(!duplicated(value))
Output:
# A tibble: 12 x 2
# Groups: name [3]
name value
<chr> <dbl>
1 A 1
2 A 2
3 A 3
4 B 4
5 B 5
6 B 6
7 C 7
8 C 8
9 C 9
10 C 10
11 C 11
12 C 12
We could do this easily with stack
library(dplyr)
stack(df_2)[2:1] %>%
distinct %>%
set_names(c("Letter", "Number"))
-output
# Letter Number
#1 A 1
#2 A 2
#3 A 3
#4 B 4
#5 B 5
#6 B 6
#7 C 7
#8 C 8
#9 C 9
#10 C 10
#11 C 11
#12 C 12
Or an option with unnest/enframe
library(tidyr)
library(tibble)
unclass(df_2) %>%
enframe(name = "Letter", value = "Number") %>%
unnest(c(Number)) %>%
distinct
Or using melt
library(reshape2)
melt(df_2) %>%
distinct()
Or in a single line in base R
unique(stack(df_2)[2:1])

Summarize values by group, but keep original data

I am trying to figure out how to sum values belonging to category a and b by factor file, but also keep the original data.
library(dplyr)
df <- data.frame(ID = 1:20, values = runif(20), category = rep(letters[1:5], 4), file = as.factor(sort(rep(1:5, 4))))
ID values category file
1 1 0.65699229 a 1
2 2 0.70506478 b 1
3 3 0.45774178 c 1
4 4 0.71911225 d 1
5 5 0.93467225 e 1
6 6 0.25542882 a 2
7 7 0.46229282 b 2
8 8 0.94001452 c 2
9 9 0.97822643 d 2
10 10 0.11748736 e 2
11 11 0.47499708 a 3
12 12 0.56033275 b 3
13 13 0.90403139 c 3
14 14 0.13871017 d 3
15 15 0.98889173 e 3
16 16 0.94666823 a 4
17 17 0.08243756 b 4
18 18 0.51421178 c 4
19 19 0.39020347 d 4
20 20 0.90573813 e 4
so that
df[1,2] will be added to df[2,2] to category 'ab' for file 1
df[6,2] will be added to df[7,2] to category 'ab' for file 2
etc.
So far I have this:
df %>%
filter(category %in% c('a' , 'b')) %>%
group_by(file) %>%
summarise(values = sum(values))
Problem
I would like to change the category of the summed values to "ab" and append it to the original data frame in the same pipeline.
Desired output:
ID values category file
1 1 0.65699229 a 1
2 2 0.70506478 b 1
3 3 0.45774178 c 1
4 4 0.71911225 d 1
5 5 0.93467225 e 1
6 6 0.25542882 a 2
7 7 0.46229282 b 2
8 8 0.94001452 c 2
9 9 0.97822643 d 2
10 10 0.11748736 e 2
11 11 0.47499708 a 3
12 12 0.56033275 b 3
13 13 0.90403139 c 3
14 14 0.13871017 d 3
15 15 0.98889173 e 3
16 16 0.94666823 a 4
17 17 0.08243756 b 4
18 18 0.51421178 c 4
19 19 0.39020347 d 4
20 20 0.90573813 e 4
21 21 1.25486225 ab 1
22 22 1.87216325 ab 2
23 23 1.36548126 ab 3
This will get you the result
df %>% bind_rows(
df %>%
filter(category %in% c('a' , 'b')) %>%
group_by(file) %>%
mutate(values = sum(values), category = paste0(category,collapse='')) %>%
filter(row_number() == 1 & n() > 1)
) %>% mutate(ID = row_number())
BTW the code pro produce the dataframe in the example is this one:
df <- data.frame(ID = 1:20, values = runif(20), category = rep(letters[1:5], 4), file = as.factor(sort(rep(1:4, 5))))
now lets say you want to sum multiple columns, you need to provide the list in a vector:
cols = c("values") # columns to be sum
df %>% bind_rows(
df %>%
filter(category %in% c('a' , 'b')) %>%
group_by(file) %>%
mutate_at(vars(cols), sum) %>%
mutate(category = paste0(category,collapse='')) %>%
filter(row_number() == 1 & n() > 1)
) %>% mutate(ID = row_number())
library(dplyr)
df1 %>%
filter(category %in% c('a' , 'b')) %>%
group_by(file) %>%
filter(n_distinct(category) > 1) %>%
summarise(values = sum(values)) %>%
mutate(category="ab",
ID=max(df1$ID)+1:n()) %>%
bind_rows(df1, .)
#> Warning in bind_rows_(x, .id): binding factor and character vector,
#> coercing into character vector
#> Warning in bind_rows_(x, .id): binding character and factor vector,
#> coercing into character vector
#> ID values category file
#> 1 1 0.62585921 a 1
#> 2 2 0.61865851 b 1
#> 3 3 0.05274456 c 1
#> 4 4 0.68156961 d 1
.
.
.
#> 19 19 0.43239411 d 5
#> 20 20 0.85886314 e 5
#> 21 21 1.24451773 ab 1
#> 22 22 0.99001810 ab 2
#> 23 23 1.25331943 ab 3
This data.table approach uses a self-join to get all of the possible two-character combinations.
library(data.table)
setDT(df)
df_self_join <- df[df, on = .(file), allow.cartesian = T
][category != i.category,
.(category = paste0(i.category, category), values = values + i.values, file)
][order(category), .(ID = .I + nrow(df), values, category, file)]
rbindlist(list(df, df_self_join))
ID values category file
1: 1 0.76984382 a 1
2: 2 0.54311583 b 1
3: 3 0.23462016 c 1
4: 4 0.60179043 d 1
...
20: 20 0.03534223 e 5
21: 21 1.31295965 ab 1
22: 22 0.51666175 ab 2
23: 23 1.02305754 ab 3
24: 24 1.00446399 ac 1
25: 25 0.96910373 ac 2
26: 26 0.87795389 ac 4
#total of 80 rows
Here is pretty close dplyr translation:
library(dplyr)
tib <- as_tibble(df)
inner_join(tib, tib, by = 'file')%>%
filter(ID.x != ID.y)%>%
transmute(category = paste0(category.x, category.y)
, values = values.x + values.y
, file)%>%
arrange(category)%>%
bind_rows(tib, .)%>%
mutate(ID = row_number())%>%
filter(category == 'ab') #filter added to show the "ab" files
# A tibble: 3 x 4
ID values category file
<int> <dbl> <chr> <fct>
1 21 1.31 ab 1
2 22 0.517 ab 2
3 23 1.02 ab 3

dplyr collapse 'tail' rows into larger groups

library(tidyverse)
df <- tibble(a = as.factor(1:20), b = c(50, 20, 13, rep(2, 10), rep(1, 7)))
How do I make dplyr look at this data frame df and collapse all these occurences of 2 into a single summed group, and collapse all the occurrences of 1 into a single summed group? And also keep the rest of the data frame.
Turn this:
# A tibble: 20 x 2
a b
<fct> <dbl>
1 1 50
2 2 20
3 3 13
4 4 2
5 5 2
6 6 2
7 7 2
8 8 2
9 9 2
10 10 2
11 11 2
12 12 2
13 13 2
14 14 1
15 15 1
16 16 1
17 17 1
18 18 1
19 19 1
20 20 1
into this:
# A tibble: 5 x 2
a b
<fct> <dbl>
1 1 50
2 2 20
3 3 13
4 grp2 20
5 grp1 7
[Edit] - I fixed the example data. Sorry about that.
We group by a manufactured sortkey to maintain sort order. We used the fact that b is in descending order in the input but if that is not the case in your actual data then replace sortkey = -b with the more general sortkey = data.table::rleid(b) or the longer sortkey = cumsum(coalesce(b != lag(b), FALSE)) .
We also convert b to the group names giving a new a. It wasn't clear which groups are to be converted to grp... form. Hard-coded 1 and 2? Any group with more than one row? Groups at the end with more than one row? At any rate it would be easy enough to change the condition in the if_else once that were clarified.
Finally perform the summation and then remove the sortkey.
df %>%
group_by(sortkey = -b, a = paste0(if_else(b %in% 1:2, "grp", ""), b)) %>%
summarize(b = sum(b)) %>%
ungroup %>%
select(-sortkey)
giving:
# A tibble: 5 x 2
a b
<chr> <int>
1 50 50
2 20 20
3 13 13
4 grp2 20
5 grp1 7
Here's a way. I have converted a from factor to character to make things easier. You can convert it back to factor if you want. Also your test data was a bit wrong.
df <- tibble(a = as.character(1:20), b = c(50, 20, 13, rep(2, 10), rep(1, 7)))
df %>%
mutate(
a = case_when(
b == 1 ~ "grp1",
b == 2 ~ "grp2",
TRUE ~ a
)
) %>%
group_by(a) %>%
summarise(b = sum(b))
# A tibble: 5 x 2
a b
<chr> <dbl>
1 1 50
2 2 20
3 3 13
4 grp1 7
5 grp2 20
This is an approach which gives you the desired names for groups & where you don't need to think in advance how many cases like that you would need (e.g. it would create grp3, grp4, ... depending on the number in b).
library(dplyr)
df %>%
mutate(
grp = as.numeric(lag(df$b) != df$b),
grp = cumsum(ifelse(is.na(grp), 0, grp))
) %>% group_by(grp) %>%
mutate(
a = ifelse(n() > 1, paste0("grp", b), a),
b = sum(b)
) %>% ungroup() %>% distinct(a, b)
Output:
a b
<chr> <dbl>
1 1 50
2 2 20
3 3 13
4 grp2 20
5 grp1 7
Note that the code could be also condensed but that leads to a certain lack of readability in my opinion:
df %>%
group_by(grp = cumsum(ifelse(is.na(as.numeric(lag(df$b) != df$b)), 0, as.numeric(lag(df$b) != df$b)))) %>%
mutate(
a = ifelse(n() > 1, paste0("grp", b), a),
b = sum(b)
) %>% ungroup() %>% distinct(a, b)

Dplyr mutate new column at a specified location

An example:
a = c(10,20,30)
b = c(1,2,3)
c = c(4,5,6)
d = c(7,8,9)
df=data.frame(a,b,c,d)
library(dplyr)
df_1 = df %>% mutate(a1=sum(a+1))
How do I add "a1" after "a" (or any other defined position) and NOT at the end?
Thank you.
An update that might be useful for others who find this question - this can now be achieved directly within mutate (I'm using dplyr v1.0.2).
Just specify which existing column the new column should be positioned after or before, e.g.:
df_after <- df %>%
mutate(a1=sum(a+1), .after = a)
df_before <- df %>%
mutate(a1=sum(a+1), .before = b)
Another option is add_column from tibble
library(tibble)
add_column(df, a1 = sum(a + 1), .after = "a")
# a a1 b c d
#1 10 63 1 4 7
#2 20 63 2 5 8
#3 30 63 3 6 9
Extending on www's answer, we can use dplyr's select_helper functions to reorder newly created columns as we see fit:
library(dplyr)
## add a1 after a
df %>%
mutate(a1 = sum(a + 1)) %>%
select(a, a1, everything())
#> a a1 b c d
#> 1 10 63 1 4 7
#> 2 20 63 2 5 8
#> 3 30 63 3 6 9
## add a1 after c
df %>%
mutate(a1 = sum(a + 1)) %>%
select(1:c, a1, everything())
#> a b c a1 d
#> 1 10 1 4 63 7
#> 2 20 2 5 63 8
#> 3 30 3 6 63 9
dplyr >= 1.0.0
relocate was added as a new verb to change the order of one or more columns. If you pipe the output of your mutate the syntax for relocate also uses .before and .after arguments:
df_1 %>%
relocate(a1, .after = a)
a a1 b c d
1 10 63 1 4 7
2 20 63 2 5 8
3 30 63 3 6 9
An additional benefit is you can also move multiple columns using any tidyselect syntax:
df_1 %>%
relocate(c:a1, .before = b)
a c d a1 b
1 10 4 7 63 1
2 20 5 8 63 2
3 30 6 9 63 3
The mutate function will always add the newly created column at the end. However, we can sort the column alphabetically after the mutate function using select.
library(dplyr)
df_1 <- df %>%
mutate(a1 = sum(a + 1)) %>%
select(sort(names(.)))
df_1
# a a1 b c d
# 1 10 63 1 4 7
# 2 20 63 2 5 8
# 3 30 63 3 6 9

Resources