how to get the output of proc tabulate (SAS) in R - r

Is there any R function which could give me directly the same output of proc tabulate ??
var1<-c(rep("A",4),rep("B",4))
var2<-c(rep("C",4),rep("D",4))
var3<-c(rep("E",2),rep("F",4),rep("G",2))
dataset<-data.frame(var1,var2,var3)
proc tabulate data=dataset;
class var1 var2 var3;
table var1*var2 ,var3 all (n rowpctn);
run;
The output that I want is like this:

Here is a way with R -
Create a column of 1s - n
Expand the data to fill the missing combinations - complete
Reshape to 'wide' format - pivot_wider
Create the 'Total' column by getting the row wise sum - rowSums
Add the percentage by looping across the 'var3' columns
library(dplyr)
library(tidyr)
library(stringr)
dataset %>%
mutate(n = 1, var3 = str_c('var3_', var3)) %>%
complete(var1, var2, var3, fill = list(n = 0)) %>%
pivot_wider(names_from = var3, values_from = n, values_fn = sum) %>%
mutate(Total = rowSums(across(where(is.numeric)))) %>%
group_by(var1) %>%
mutate(across(starts_with('var3'),
~ case_when(. == 0 ~ '0(0%)',
TRUE ~ sprintf('%d(%d%%)', ., 100 * mean(. != 0))))) %>%
ungroup
-output
# A tibble: 4 × 6
var1 var2 var3_E var3_F var3_G Total
<chr> <chr> <chr> <chr> <chr> <dbl>
1 A C 2(50%) 2(50%) 0(0%) 4
2 A D 0(0%) 0(0%) 0(0%) 0
3 B C 0(0%) 0(0%) 0(0%) 0
4 B D 0(0%) 2(50%) 2(50%) 4
Update
Based on the comments by #IceCreamToucan, there was a bug, which is corrected in the below code
dataset %>%
mutate(n = 1, var3 = str_c('var3_', var3)) %>%
complete(var1, var2, var3, fill = list(n = 0)) %>%
pivot_wider(names_from = var3, values_from = n, values_fn = sum) %>%
mutate(Total = rowSums(across(where(is.numeric))),
100 * across(starts_with('var3'), ~ . != 0,
.names = "{.col}_perc")/rowSums(across(starts_with('var3'), ~ .!= 0)),
across(matches('var3_[A-Z]$'), ~ case_when(. == 0 ~ '0(0%)',
TRUE ~ sprintf('%d(%.f%%)', ., get(str_c(cur_column(), '_perc')))))) %>%
select(-ends_with('perc'))

Here's a more generic version, where I define a function.
var1<-c(rep("A",4),rep("B",4))
var2<-c(rep("C",4),rep("D",4))
var3<-c(rep("E",2),rep("F",4),rep("G",2))
df<-data.frame(var1,var2,var3)
df_tabulate(df, id_cols = c(var1, var2), names_from = var3)
#> # A tibble: 4 × 6
#> var1 var2 var3_E var3_F var3_G Total
#> <chr> <chr> <chr> <chr> <chr> <dbl>
#> 1 A C 2(50.0%) 2(50.0%) 0(0.0%) 4
#> 2 A D 0(0%) 0(0%) 0(0%) 0
#> 3 B C 0(0%) 0(0%) 0(0%) 0
#> 4 B D 0(0.0%) 2(50.0%) 2(50.0%) 4
You can define the function using janitor
library(janitor, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)
library(rlang)
library(tidyr)
df_tabulate <- function(df, id_cols, names_from){
id_cols <- enquo(id_cols)
if (quo_is_call(id_cols, 'c'))
id_cols <- call_args(id_cols)
else
id_cols <- ensym(id_cols)
names_from_chr <- as_label(enquo(names_from))
counts <- df %>%
mutate(g = eval(call2(paste, !!!id_cols, sep = ',')),
col = paste0(names_from_chr, '_', {{ names_from }})) %>%
tabyl(g, col) %>%
adorn_totals('col')
percs <- adorn_percentages(counts) %>%
adorn_pct_formatting()
rbind(counts, percs) %>%
group_by(g) %>%
summarise(across(-Total, ~ paste0(first(.), '(', last(.), ')')),
Total = as.numeric(first(Total))) %>%
separate(g, into = as.character(id_cols)) %>%
complete(!!!id_cols) %>%
mutate(across(starts_with(names_from_chr), ~ coalesce(., '0(0%)')),
across(Total, ~ coalesce(., 0)))
}

Here it is as a single pipeline with discrete simple steps. Long, to be sure, but if you wanted many tables like this you could store it as a function.
library(tidyverse)
library(janitor)
dataset %>%
mutate(across(var1:var2, as.factor)) %>%
count(var1, var2, var3, .drop = FALSE) %>%
unite(vars, var1, var2) %>%
pivot_wider(names_from = var3, values_from = n) %>%
select(-`NA`) %>%
replace(is.na(.), 0) %>%
adorn_totals("col") %>%
adorn_percentages(,,,,-c(vars, Total)) %>%
adorn_pct_formatting(digits = 0,,,,-c(vars, Total)) %>%
adorn_ns(position = "front",,,-c(vars, Total)) %>%
separate(vars, into = c("var1", "var2"))
#> # A tibble: 4 x 6
#> var1 var2 E F G Total
#> <chr> <chr> <chr> <chr> <chr> <dbl>
#> 1 A C 2 (50%) 2 (50%) 0 (0%) 4
#> 2 A D 0 (-) 0 (-) 0 (-) 0
#> 3 B C 0 (-) 0 (-) 0 (-) 0
#> 4 B D 0 (0%) 2 (50%) 2 (50%) 4
This replaces the questionable 0/0 = 0% with simply - for a cleaner(IMO) result.

Related

dplyr::filter multiple criteria from other data.frame

Refering on this topic:
dplyr::filter criteria from other data.frame
I adjusted the df to fit better to my complex df.
df1<-data.frame(A=c(1:10),
B=c(3:12))
df2<-data.frame(A_min=c(2,3),
A_max=c(3,3),
B_min=c(1,2),
B_max=c(2,2))
I was not able to put multiple criteria in the code with dplyr::map2
df2 |>
mutate(Score = map2(A_min, A_max, ~ nrow(filter(df1, between(A, .x, .y)))))
I needed to adjust the code so it works in my case (don´t know exactly why between is not working).
mutate(Score=map2_dbl(A_min,A_max,~nrow(filter(df1,A>=.x & A<=.y))))
How can I add a second criteria e.g. B_min and B_max to filter at the same time to get a combined score with both criteria? (Edited the question to make it clear what the goal is).
We may use pmap
library(dplyr)
library(tidyr)
library(purrr)
df2 |>
mutate(Score = pmap(across(everything()),
~ tibble(A= nrow(filter(df1, between(A, ..1, ..2))),
B= nrow(filter(df1, between(B, ..3, ..4)))))) %>%
unnest_wider(Score, names_sep = "")
-output
# A tibble: 2 × 6
A_min A_max B_min B_max ScoreA ScoreB
<dbl> <dbl> <dbl> <dbl> <int> <int>
1 2 3 1 2 2 0
2 3 3 2 2 1 0
Or if there are multiple sets of columns, reshape to 'long' format before doing the processing and then reshape it back to 'wide' format
df2 %>%
mutate(rn = row_number()) %>%
pivot_longer(cols = -rn, names_to = c("name", ".value"),
names_sep = "_") %>%
rowwise %>%
mutate(name, Score = df1 %>%
filter(between(cur_data()[[name]], min, max)) %>%
nrow, .keep = 'unused') %>%
ungroup %>%
pivot_wider(names_from = name, values_from = Score,
names_glue = "Score{name}") %>%
select(-rn) %>%
bind_cols(df2, .)
A_min A_max B_min B_max ScoreA ScoreB
1 2 3 1 2 2 0
2 3 3 2 2 1 0

In `dplyr`, when use `pivot_wide` ,I want to replace 'NA' at the same time

In dplyr, when use pivot_wide ,I want to replace 'NA' at the same time .
Here is the code as below ,they are not work. Anyone can help? Thanks!
library(tidyverse)
test_data <- data.frame(category=c('A','B','A','B','A','B','A','B'),
sub_category=c('a1','b1','a2','b2','a1','b1','a2','b2'),
sales=c(1,2,4,5,8,1,4,6))
#method1: Error: Can't convert <double> to <list>.
test_data %>% pivot_wider(names_from = 'category',
values_from = 'sales',
values_fill = 0) %>% unnest()
#method2: code can't run
test_data %>% pivot_wider(names_from = 'category',
values_from = 'sales') %>% unnest() %>%
as.data.frame() %>%
mutate(across(where(is.numeric),function(x) stringr::str_replace('NA',0)))
test_data %>%
pivot_wider(names_from = 'category', values_from = 'sales',
values_fn = list, values_fill = list(0)) %>%
unnest(c(A, B))
# A tibble: 8 x 3
sub_category A B
<chr> <dbl> <dbl>
1 a1 1 0
2 a1 8 0
3 b1 0 2
4 b1 0 1
5 a2 4 0
6 a2 4 0
7 b2 0 5
8 b2 0 6
Is this what you want?
test_data %>%
pivot_wider(names_from = 'category',
values_from = 'sales') %>%
unnest(cols = c(A, B)) %>%
mutate(across(where(is.numeric), replace_na, 0))

How to summarise with sum dependent on another column - using dplyr

I am looking to perform a group by on id, code1 and then summarise. I want the summarise to do several conditional sums i.e. sum of the count column when code2 == "B". I know how to do this by creating an intermediary binary column but I was wondering if there is quicker method where this can all be performed in the summarise statement.
Here is some test data:
id <- c(1,1,1)
code1 <- c("M", "M", "M")
code2 <- c("B", "B", "U")
code3 <- c("H", "N", "N")
count <- c(15, 2, 1)
x <- data.frame(id, code1, code2, code3, count)
Desired output:
id | code1 | Total | B_count | U_count | H_count | N_count
1 M 18 17 1 15 3
We can use the conditions inside the summarise call:
library(dplyr)
x %>%
group_by(id, code1) %>%
summarise(total = sum(count),
B_count = sum(count[code2 == "B"]),
U_count = sum(count[code2 == "U"]),
H_count = sum(count[code3 == "H"]),
N_count = sum(count[code3 == "N"]))
`summarise()` regrouping output by 'id' (override with `.groups` argument)
# A tibble: 1 x 7
# Groups: id [1]
id code1 total B_count U_count H_count N_count
<dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 M 18 17 1 15 3
This solution is very complicated but it gets the job done.
library(dplyr)
library(tidyr)
x %>%
pivot_longer(
cols = matches('code[2-9]'),
names_to = 'vars',
values_to = 'code'
) %>%
dplyr::select(-vars) %>%
group_by(id, code1, code) %>%
summarise(count = sum(count), .groups = "rowwise") %>%
pivot_wider(
id_cols = c(id, code1),
names_from = code,
values_from = count
) %>%
left_join(
x %>%
group_by(id, code1) %>%
summarise(Total = sum(count), .groups = "rowwise"),
by = c("id", "code1")
) %>%
select(id, code1, Total, everything())
## A tibble: 1 x 7
# id code1 Total B H N U
# <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1 M 18 17 15 3 1

turn pivot_wider() into spread()

I love the new tidyr pivot_wider function but since it hasn't been officially added to the CRAN package I was wondering how to convert the following code into the older spread() function (I do not have access to the server to DL tidyr from github)
test <- data.frame(x = c(1,1,2,2,2,2,3,3,3,4),
y = c(rep("a", 5), rep("b", 5)))
test %>%
count(x, y) %>%
group_by(x) %>%
mutate(prop = prop.table(n)) %>%
mutate(v1 = paste0(n, ' (', round(prop, 2), ')')) %>%
pivot_wider(id_cols = x, names_from = y, values_from = v1)
Desired Output:
# A tibble: 4 x 3
# Groups: x [4]
x a b
<dbl> <chr> <chr>
1 1 2 (1) NA
2 2 3 (0.75) 1 (0.25)
3 3 NA 3 (1)
4 4 NA 1 (1)
I tried (but is not quite right):
test %>%
count(x, y) %>%
group_by(x) %>%
mutate(prop = prop.table(n)) %>%
mutate(v1 = paste0(n, ' (', round(prop, 2), ')')) %>%
spread(y, v1) %>%
select(-n, -prop)
Any help appreciated!
One option is to remove the columns 'n', 'prop' before the spread statement as including them would create unique rows with that column values as well
library(dplyr)
library(tidyr)
test %>%
count(x, y) %>%
group_by(x) %>%
mutate(prop = prop.table(n)) %>%
mutate(v1 = paste0(n, ' (', round(prop, 2), ')')) %>%
select(-n, -prop) %>%
spread(y, v1)
# A tibble: 4 x 3
# Groups: x [4]
# x a b
# <dbl> <chr> <chr>
#1 1 2 (1) <NA>
#2 2 3 (0.75) 1 (0.25)
#3 3 <NA> 3 (1)
#4 4 <NA> 1 (1)
Or using base R
tbl <- table(test)
tbl[] <- paste0(tbl, "(", prop.table(tbl, 1), ")")
You can use data.table package:
> library(data.table)
> setDT(test)[,.(n=.N),by=.(x,y)][,.(y=y,n=n,final=gsub('\\(1\\)','',paste0(n,'(',round(prop.table(n),2), ')'))),by=x]
x y n final
1: 1 a 2 2
2: 2 a 3 3(0.75)
3: 2 b 1 1(0.25)
4: 3 b 3 3
5: 4 b 1 1

dplyr number of rows across groups after filtering

I want the count and proportion (of all of elements) of each group in a data frame (after filtering). This code produces the desired output:
library(dplyr)
df <- data_frame(id = sample(letters[1:3], 100, replace = TRUE),
value = rnorm(100))
summary <- filter(df, value > 0) %>%
group_by(id) %>%
summarize(count = n()) %>%
ungroup() %>%
mutate(proportion = count / sum(count))
> summary
# A tibble: 3 x 3
id count proportion
<chr> <int> <dbl>
1 a 17 0.3695652
2 b 13 0.2826087
3 c 16 0.3478261
Is there an elegant solution to avoid the ungroup() and second summarize() steps. Something like:
summary <- filter(df, value > 0) %>%
group_by(id) %>%
summarize(count = n(),
proportion = n() / [?TOTAL_ROWS()?])
I couldn't find such a function in the documentation, but I must be missing something obvious. Thanks!
You can use nrow on . which refers to the entire data frame piped in:
df %>%
filter(value > 0) %>%
group_by(id) %>%
summarise(count = n(), proportion = count / nrow(.))
# A tibble: 3 x 3
# id count proportion
# <chr> <int> <dbl>
#1 a 14 0.2592593
#2 b 22 0.4074074
#3 c 18 0.3333333

Resources