Add frequency counts to 2x2 prop.table - r

How do I add frequency counts to a 2x2 prop.table? So here 'dataset' contains two categorical variables.
dataset %>% prop.table(margin = 2) %>% '*' (100) %>% round(2)
I would like the counts in addition to percentages of each category.
Sorry for the dopey example, but it should look like this, except the sum doesn't need to be reported in every cell.

A reproducible example and solution:
tab <-iris %>% mutate(size = factor(1+(Sepal.Length>median(iris$Sepal.Length)),levels = 1:2, labels = c('S','L'))) %>%
select(Species, size) %>%
table()
prop <- prop.table(tab,margin = 2) %>% '*' (100) %>% round(2)
matrix(paste(tab,prop),nrow = nrow(tab),dimnames = dimnames(tab))
gives
size
Species S L
setosa "50 62.5" "0 0"
versicolor "24 30" "26 37.14"
virginica "6 7.5" "44 62.86"
or another solution:
iris %>% mutate(size = factor(1+(Sepal.Length>median(iris$Sepal.Length)),levels = 1:2, labels = c('S','L'))) %>%
group_by(Species, size) %>%
summarise(n = n()) %>%
group_by(size) %>%
mutate(p = paste(n,round(n/sum(n)*100,2))) %>%
select(-n) %>%
spread(size,p,fill = paste(0,0))
gives
# A tibble: 3 x 3
Species S L
<fct> <chr> <chr>
1 setosa 50 62.5 0 0
2 versicolor 24 30 26 37.14
3 virginica 6 7.5 44 62.86

addmargins applied to your table might do what you want.
set.seed(34)
n <- 20
tab <- table(sample(1:3, n, replace = TRUE), sample(c("A", "B"), n, replace = TRUE))
addmargins(tab)

Related

Summarize information by group in data table in R

I'm trying to get multiple summary statistics in R grouped by Team. I used code like below, but output is not what I want.
please point me in a better direction. Thanks!
set.seed(77)
data <- data.frame(Team =sample(c("A","B"),30, replace=TRUE),
gender=sample(c("female","male"),30, replace=TRUE),
Age =sample(c(0:100),30, replace=T))
dat <- data %>%
group_by(Team, gender) %>%
dplyr::summarize_all(list(my_mean = mean,
my_sum = sum,
my_sd = sd)) %>%
as.data.frame()
df <- data %>%
group_by(Team) %>%
summarize(total = n(gender),
mean = mean(Age),
Max_Age = max(Age),
Min_Age = min(Age),
sd = sd(Age),
)
I want to get like this pic.
You may need to create the dataframe for the summary statistics of age per Team (age_summary in the example below) and that for the count of Team members per gender and Team (gender_summary in the example below), and then merge them into one dataframe (say summary_df).
library(tidyverse)
set.seed(77)
data <- data.frame(
Team = sample(c("A", "B"), 30, replace = TRUE),
gender = sample(c("female", "male"), 30, replace = TRUE),
Age = sample(c(0:100), 30, replace = T)
)
age_summary <- data %>%
group_by(Team) %>%
summarize(
mean = mean(Age),
Max = max(Age),
Min = min(Age),
sd = sd(Age)
) %>%
column_to_rownames("Team") %>%
t() %>%
as_tibble(
rownames = "age_summary"
)
gender_summary <- data %>%
group_by(Team) %>%
count(gender) %>%
ungroup() %>%
pivot_wider(names_from = Team, values_from = n)
summary_df <- full_join(
age_summary,
gender_summary
) %>%
mutate(
"item" = if_else(
is.na(gender),
"Age",
"Sex"
)
) %>%
unite("summary", c(age_summary, gender), na.rm = TRUE, remove = FALSE) %>%
relocate(item, .before = 1) %>%
select(-c(age_summary, gender))
# # A tibble: 6 × 4
# item summary A B
# <chr> <chr> <dbl> <dbl>
# 1 Age mean 45.6 57.8
# 2 Age Max 92 82
# 3 Age Min 5 14
# 4 Age sd 30.1 22.1
# 5 Sex female 8 9
# 6 Sex male 7 6

Plot histograms per row using gt tables - R

I want to create a gt table where I see some metrics like number of observations, mean and median, and I want a column with its histogram. For this question I will use the iris dataset.
I have recently learned how to put a plot in a tibble using this code:
library(dplyr)
library(tidyr)
library(purrr)
library(gt)
my_tibble <- iris %>%
pivot_longer(-Species,
names_to = "Vars",
values_to = "Values") %>%
group_by(Vars) %>%
summarise(obs = n(),
mean = round(mean(Values),2),
median = round(median(Values),2),
plots = list(ggplot(cur_data(), aes(Values)) + geom_histogram()))
Now I want to use the plots column for plotting an histogram per variable, so I have tried this:
my_tibble %>%
mutate(ggplot = NA) %>%
gt() %>%
text_transform(
locations = cells_body(vars(ggplot)),
fn = function(x) {
map(.$plots,ggplot_image)
}
)
But it returns me an error:
Error in body[[col]][stub_df$rownum_i %in% loc$rows] <- fn(body[[col]][stub_df$rownum_i %in% :
replacement has length zero
The gt table should be like this:
Any help will be greatly appreciated.
After reviewing the excellent ideas from #akrun and #TarJae, I have this solution that gives the required gt table:
plots <- iris %>%
pivot_longer(-Species,
names_to = "Vars",
values_to = "Values") %>%
group_by(Vars) %>%
nest() %>%
mutate(plot = map(data,
function(df) df %>%
ggplot(aes(Values)) +
geom_histogram())) %>%
select(-data)
iris %>%
pivot_longer(-Species,
names_to = "Vars",
values_to = "Values") %>%
group_by(Vars) %>%
summarise(obs = n(),
mean = round(mean(Values),2),
median = round(median(Values),2)) %>%
mutate(ggplot = NA) %>%
gt() %>%
text_transform(
locations = cells_body(vars(ggplot)),
fn = function(x) {
map(plots$plot, ggplot_image, height = px(100))
}
)
And this is the table:
I had to create the plot outside the output table, so I could call it in the gt table.
We need to loop over the plots
library(dplyr)
library(tidyr)
library(purrr)
library(gt)
library(ggplot2)
iris %>%
pivot_longer(-Species,
names_to = "Vars",
values_to = "Values") %>%
nest_by(Vars) %>%
mutate(n = nrow(data),
mean = round(mean(data$Values), 2),
median = round(median(data$Values), 2),
plots = list(ggplot(data, aes(Values)) + geom_histogram()), .keep = "unused") %>%
ungroup %>%
mutate(ggplot = NA) %>%
{dat <- .
dat %>%
select(-plots) %>%
gt() %>%
text_transform(locations = cells_body(c(ggplot)),
fn = function(x) {
map(dat$plots, ggplot_image, height = px(100))
}
)
}
-check for the output
Update: See comments:
For your purposes in accordance with a shiny app you may use summarytools see here: https://cran.r-project.org/web/packages/summarytools/vignettes/introduction.html
it is compatible with r shiny!
Here is a small example:
library(summarytools)
dfSummary(iris,
plain.ascii = FALSE,
style = "grid",
graph.magnif = 0.75,
valid.col = FALSE,
tmp.img.dir = "/tmp")
view(dfSummary(iris))
Try this:
library(skimr)
skim(iris)
skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
* <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 Sepal.Length 0 1 5.84 0.828 4.3 5.1 5.8 6.4 7.9 ▆▇▇▅▂
2 Sepal.Width 0 1 3.06 0.436 2 2.8 3 3.3 4.4 ▁▆▇▂▁
3 Petal.Length 0 1 3.76 1.77 1 1.6 4.35 5.1 6.9 ▇▁▆▇▂
4 Petal.Width 0 1 1.20 0.762 0.1 0.3 1.3 1.8 2.5 ▇▁▇▅▃

How to estimate the mean of the 10% upper and lower values over multiple categories with dplyr?

Suppose you have this data.frame in R
set.seed(15)
df <- data.frame(cat = rep(c("a", "b"), each = 50),
x = c(runif(50, 0, 1), runif(50, 1, 2)))
I want to estimate the mean of the 10% upper and lower values in each category.
I can do it using base functions like this
dfa <- df[df$cat=="a",]
dfb <- df[df$cat=="b",]
mean(dfa[dfa$x >= quantile(dfa$x, 0.9),"x"])
# [1] 0.9537632
mean(dfa[dfa$x <= quantile(dfa$x, 0.1),"x"])
# [1] 0.07959845
mean(dfb[dfb$x >= quantile(dfb$x, 0.9),"x"])
# [1] 1.963775
mean(dfb[dfb$x <= quantile(dfb$x, 0.1),"x"])
# [1] 1.092218
However, I can't figure it out how to implement this using dplyr or purrr.
Thanks for the help.
We could do this in a group by approach using cut and quantile as breaks
library(dplyr)
df %>%
group_by(cat) %>%
mutate(grp = cut(x, breaks = c(-Inf, quantile(x,
probs = c(0.1, 0.9)), Inf))) %>%
group_by(grp, .add = TRUE) %>%
summarise(x = mean(x, na.rm = TRUE), .groups = 'drop_last') %>%
slice(-2)
-ouptut
# A tibble: 4 x 3
# Groups: cat [2]
cat grp x
<chr> <fct> <dbl>
1 a (-Inf,0.0813] 0.0183
2 a (0.853, Inf] 0.955
3 b (-Inf,1.21] 1.07
4 b (1.93, Inf] 1.95
Here's a way you can use cut() to help partitaion your data into groups and then take the mean
df %>%
group_by(cat) %>%
mutate(part=cut(x, c(-Inf, quantile(x, c(.1, .9)), Inf), labels=c("low","center","high"))) %>%
filter(part!="center") %>%
group_by(cat, part) %>%
summarize(mean(x))
which returns everything in a nice tibble
cat part `mean(x)`
<chr> <fct> <dbl>
1 a low 0.0796
2 a high 0.954
3 b low 1.09
4 b high 1.96
To make it a bit cleaner, you can factor out the splitting to a helper function
split_quantile <- function(x , p=c(.1, .9)) {
cut(x, c(-Inf, quantile(x, c(.1, .9)), Inf), labels=c("low","center","high"))
}
df %>%
group_by(cat) %>%
mutate(part = split_quantile(x)) %>%
filter(part != "center") %>%
group_by(cat, part) %>%
summarize(mean(x))
A variant of #MrFlick's answer - you can use cut_number and slice:
df %>%
group_by(cat) %>%
mutate(part = cut_number(x, n = 10)) %>%
group_by(cat, part) %>%
summarise(mean(x)) %>%
slice(1, n())

More efficient way to perform calculations on multiple (combined) columns by group

What is a more efficient way to perform calculations on multiple combined columns by group?
I have a dataset with Manager Effectiveness & Team Effectiveness components. How can I quickly calculate the number of 5s for each component by gender?
The desired outcome is like so:
Number of 5s for 'Manager effectiveness' = 2
Number of 5s for 'Team effectiveness' = 0
So far, I've tried the dplyr method:
Data %>%
group_by(gender) %>%
summarise(sum(c(Manager EQ, Manager IQ)) == 5)
Data %>%
group_by(gender) %>%
summarise(sum(c(Team collaboration, Team friendliness)) == 5)
Though it works, typing each column name quickly becomes tedious and error-prone as more columns are involved.
We can use summarise_at
library(dplyr)
Data %>%
group_by(gender) %>%
summarise_at(vars(starts_with('Manager')), ~ sum(. == 5))
Or if we are checking the sum of all numeric columns, use summarise_if
Data %>%
group_by(gender) %>%
summarise_if(is.numeric, ~ sum(. == 5))
Can we wrapped in a function
f1 <- function(dat, colPrefix, grp, val) {
dat %>%
group_by_at(grp) %>%
summarise_at(vars(starts_with(colPrefix)), ~ sum(. == val))
}
f1(Data, "Manager", "gender", 5)
Mostly expanding on #akrun's answer:
## made up data 100 observations
set.seed(133)
dat <- 1:5
gen <- c("M", "F")
z <- tibble(me = sample(dat, 100, TRUE),
mi = sample(dat, 100, TRUE),
tc = sample(dat, 100, TRUE),
tf = sample(dat, 100, TRUE),
gender = sample(gen, 100, TRUE))
# Grouping by gender, counting 5's, and reshaping data
z %>%
group_by(gender) %>%
summarise_at(vars(everything()), ~ sum(. == 5)) %>%
pivot_longer(me:tf) %>%
mutate(name = paste0("# 5's for ", name)) %>%
pivot_wider(gender)
Output:
# A tibble: 2 x 5
gender `# 5's for me` `# 5's for mi` `# 5's for tc` `# 5's for tf`
<chr> <int> <int> <int> <int>
1 F 6 6 8 5
2 M 10 14 20 5
This is starting to get a little hack-ey, but in response to Amanda's comment & my misunderstanding of the question:
z %>%
group_by(gender) %>%
summarise_at(vars(everything()), ~ sum(. == 5)) %>%
pivot_longer(me:tf) %>%
mutate(name = paste0("# 5's for ", name)) %>%
mutate(grp = ifelse(str_detect(name, 'm'), 'manager', 'team')) %>%
group_by(gender, grp) %>%
summarise(total_5s = sum(value))
Gives results:
# A tibble: 4 x 3
# Groups: gender [2]
gender grp total_5s
<chr> <chr> <int>
1 F manager 12
2 F team 13
3 M manager 24
4 M team 25
Unfortunately this relies heavily on making a distinction and group based on the column names of the original data.

How to add totals as well as group_by statistics in R

When computing any statistic using summarise and group_by we only get the summary statistic per-category, and not the value for all the population (Total). How to get both?
I am looking for something clean and short. Until now I can only think of:
bind_rows(
iris %>% group_by(Species) %>% summarise(
"Mean" = mean(Sepal.Width),
"Median" = median(Sepal.Width),
"sd" = sd(Sepal.Width),
"p10" = quantile(Sepal.Width, probs = 0.1))
,
iris %>% summarise(
"Mean" = mean(Sepal.Width),
"Median" = median(Sepal.Width),
"sd" = sd(Sepal.Width),
"p10" = quantile(Sepal.Width, probs = 0.1)) %>%
mutate(Species = "Total")
)
But I would like something more compact. In particular, I don't want to type the code (for summarize) twice, once for each group and once for the total.
You can simplify it if you untangle what you're trying to do: you have iris data that has several species, and you want that summarized along with data for all species. You don't need to calculate those summary stats before you can bind. Instead, bind iris with a version of iris that's been set to Species = "Total", then group and summarize.
library(tidyverse)
bind_rows(
iris,
iris %>% mutate(Species = "Total")
) %>%
group_by(Species) %>%
summarise(Mean = mean(Sepal.Width),
Median = median(Sepal.Width),
sd = sd(Sepal.Width),
p10 = quantile(Sepal.Width, probs = 0.1))
#> # A tibble: 4 x 5
#> Species Mean Median sd p10
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 setosa 3.43 3.4 0.379 3
#> 2 Total 3.06 3 0.436 2.5
#> 3 versicolor 2.77 2.8 0.314 2.3
#> 4 virginica 2.97 3 0.322 2.59
I like the caution in the comments above, though I have to do this sort of calculation for work enough that I have a similar shorthand function in a personal package. It perhaps makes less sense for things like standard deviations, but it's something I need to do a lot for adding up totals of demographic groups, etc. (If it's useful, that function is here).
bit shorter, though quite similar to bind_rows
q10 <- function(x){quantile(x , probs=0.1)}
iris %>%
select(Species,Sepal.Width)%>%
group_by(Species) %>%
summarise_all(c("mean", "sd", "q10")) %>%
t() %>%
cbind(c("total", iris %>% select(Sepal.Width) %>% summarise_all(c("mean", "sd", "q10")))) %>%
t()
more clean probably:
bind_rows(
iris %>%
group_by(Species) %>%
select(Sepal.Width)%>%
summarise_all(c("mean", "sd", "q10"))
,
iris %>%
select(Sepal.Width)%>%
summarise_all(c("mean", "sd", "q10")) %>%
mutate(Species = "Total")
)

Resources