apply count() to every factor variable in a dataframe - r

I can use purrr::map() to get the mean of every column in a dataframe. Can I use any of the map functions in combination with count() to get counts for each categorical variable in a dataframe?
library(dplyr)
library(purrr)
mtcars %>% map(mean)
mtcars %>% mutate(am = factor(am, labels = c("auto", "manual")),
vs = factor(vs, labels = c("V", "S"))) %>% select_if(is.factor) %>%
map(count)

Almost there! Just need to specify the data in count:
mtcars %>%
mutate(
am = factor(am, labels = c("auto", "manual")),
vs = factor(vs, labels = c("V", "S"))
) %>%
select_if(is.factor) %>%
map(~count(data.frame(x = .x), x))

You can use the 'table' function instead of count:
mtcars %>%
mutate(
am = factor(am, labels = c("auto", "manual")),
vs = factor(vs, labels = c("V", "S"))
) %>%
select_if(is.factor) %>%
map(table)
#$`vs`
#V S
#18 14
#$am
#auto manual
#19 13

Related

Stacking two different stratified tables

I am trying to combine to different stratified tables made with tbl_strata()and tbl_summary from the gtsummary-package. I want to stratify by the same variable in both tables, but use different variables in the "by" argument in tbl_summary(). When I combine the tables using tbl_stack(), the column headers from the second table are lost. Is there any way to combine the tables and keep the column headers from both tables?
See reproducible example below
library(gtsummary)
library(tidyverse)
data("diamonds")
table1 <- diamonds %>%
filter(cut %in% c("Ideal", "Premium") & color %in% c("E", "I")) %>%
mutate(color = factor(color)) %>%
tbl_strata(strata = cut,
.tbl_fun =
~.x %>%
tbl_summary(by = color, include = price))
table2 <- diamonds %>%
filter(cut %in% c("Ideal", "Premium") & clarity %in% c("SI1", "SI2")) %>%
mutate(clarity = factor(clarity)) %>%
tbl_strata(strata = cut,
.tbl_fun =
~.x %>%
tbl_summary(by = clarity, include = price))
tbl_stack(list(table1, table2), group_header = c("Table 1", "Table 2"))
UPDATE:
Below is an (rough) example of the output I would like to be able to get from tbl_stack():
library(gtsummary)
library(tidyverse)
library(gt)
data("diamonds")
library(flextable)
table1_tibble <- diamonds %>%
filter(cut %in% c("Ideal", "Premium") & color %in% c("E", "I")) %>%
mutate(color = factor(color)) %>%
tbl_strata(strata = cut,
.tbl_fun =
~.x %>%
tbl_summary(by = color, include = price)) %>%
as_tibble()
table1_tibble <- rbind(colnames(table1_tibble), table1_tibble) %>%
rename(label = colnames(table1_tibble)[1],
premium_1 = colnames(table1_tibble)[2],
premium_2 = colnames(table1_tibble)[3],
ideal_1 = colnames(table1_tibble)[4],
ideal_2 = colnames(table1_tibble)[5]) %>%
add_row(label = "Table 1", .before = 1)
table2_tibble <- diamonds %>%
filter(cut %in% c("Ideal", "Premium") & clarity %in% c("SI1", "SI2")) %>%
mutate(clarity = factor(clarity)) %>%
tbl_strata(strata = cut,
.tbl_fun =
~.x %>%
tbl_summary(by = clarity, include = price)) %>%
as_tibble()
table2_tibble <- rbind(colnames(table2_tibble), table2_tibble) %>%
rename(label = colnames(table2_tibble)[1],
premium_1 = colnames(table2_tibble)[2],
premium_2 = colnames(table2_tibble)[3],
ideal_1 = colnames(table2_tibble)[4],
ideal_2 = colnames(table2_tibble)[5]) %>%
add_row(label = "Table 2", .before = 1)
bind_rows(table1_tibble, table2_tibble) %>%
flextable() %>%
merge_at(i = 1, j = 2:3, part = "header") %>%
merge_at(i = 1, j = 4:5, part = "header") %>%
set_header_labels(label = "", premium_1 = "Premium", ideal_1 = "Ideal")
Hope this clarifies matters.
Best regards,
Martin

Customize the order of columns in tidyHtmlTable function

I don't know how to order columns in tidyHtmlTable function from htmlTable package.
The documentation says:
Columns are sorted by arrange(cgroup,header) where cgroup will be expanded to the columns
of the cgroup argument, e.g. cgroup = c(a, b), header = c will become arrange(a,b,c). If you
want to sort in non-alphabetic order you can provide a factor variable and that information will be retained.
How can I provide such factor variable? For example, in the doc example
library(magrittr)
library(tidyr)
library(dplyr)
library(htmlTable)
library(tibble)
td <- mtcars %>%
as_tibble(rownames = "rnames") %>%
pivot_longer(names_to = "per_metric",
cols = c(hp, mpg, qsec))
tidy_summary <- td %>%
group_by(cyl, gear, per_metric) %>%
summarise(Mean = round(mean(value), 1),
SD = round(sd(value), 1),
Min = round(min(value), 1),
Max = round(max(value), 1),
.groups = 'drop') %>%
pivot_longer(names_to = "summary_stat",
cols = c(Mean, SD, Min, Max)) %>%
ungroup() %>%
mutate(gear = paste(gear, "Gears"),
cyl = paste(cyl, "Cylinders"))
tidy_summary %>%
arrange(per_metric, summary_stat) %>%
addHtmlTableStyle(align = "r") %>%
tidyHtmlTable(header = gear,
cgroup = cyl,
rnames = summary_stat,
rgroup = per_metric)
if I want to maintain the order of rgroup and rnames but, for example, I want the columns in non-alphabetical order, like this:
4 Gears |3 Gears |5 Gears
How can I do that? Thank you.
What we can do is to make the changes before tidyHtmlTable
How it works:
mutate gear to factor class
Use fct_relevel from forcats package (it is in tidyverse)
library(forcats)
tidy_summary %>%
arrange(per_metric, summary_stat) %>%
addHtmlTableStyle(align = "r") %>%
mutate(gear = as.factor(gear),
gear = fct_relevel(gear, "4 Gears", "3 Gears", "5 Gears")) %>%
tidyHtmlTable(header = gear,
cgroup = cyl,
rnames = summary_stat,
rgroup = per_metric)

Skip "zero" level of dichotomous variables in expss tables

I want to create a summary table for some dichotomous variables using the expss package. Since the variables are dichotomous, one of the two levels would the sufficient to "show the picture".
I tried to use the function tab_net_cell, but was not able to get the right results. Here is some example code with BrCa (Breast cancer) with 1 or 0. I only want to show the number of patients with but not without breast cancer.
df <- data.frame(BrCa = c(1,1,1,0,0,0,NA,NA,0,0))
df$group <- c(1,2,1,2,1,2,1,2,1,2)
df %>%
expss::tab_cols(group) %>%
expss::tab_cells(BrCa) %>%
expss::tab_stat_cpct(total_row_position = "none",label = "%") %>%
expss::tab_stat_cases(total_row_position = "none",label = "N") %>%
expss::tab_pivot(stat_position = "inside_rows")
df %>%
expss::tab_cols(group) %>%
expss::tab_cells(BrCa) %>%
expss::tab_net_cells("BrCa" = eq(1)) %>%
expss::tab_stat_cpct(total_row_position = "none",label = "%") %>%
expss::tab_stat_cases(total_row_position = "none",label = "N") %>%
expss::tab_pivot(stat_position = "inside_rows")
The simplest way is to filter resulted table:
df <- data.frame(BrCa = c(1,1,1,0,0,0,NA,NA,0,0))
df$group <- c(1,2,1,2,1,2,1,2,1,2)
df %>%
expss::tab_cols(group) %>%
expss::tab_cells(BrCa) %>%
expss::tab_stat_cpct(total_row_position = "none",label = "%") %>%
expss::tab_stat_cases(total_row_position = "none",label = "N") %>%
expss::tab_pivot(stat_position = "inside_rows") %>%
expss::where(grepl(1, row_labels))
Another way is to use mean and sum instead of cpct and cases:
df %>%
expss::tab_cols(group) %>%
expss::tab_cells(BrCa*100) %>%
expss::tab_stat_mean(label = "%") %>%
expss::tab_stat_sum(label = "N") %>%
expss::tab_pivot(stat_position = "inside_rows")

How to obtain all row numbers that contain certain x/y value combinations in dplyr

lets say a plot select function gives me a data frame of x and y coordinates (values) which we can artificially generate with:
newData <- mtcars[ c(1,3,5,9:11) ,c('mpg', 'cyl')]
rownames(newData) <- NULL
colnames(newData) <- c('x', 'y')
x y
1 21.0 6
2 22.8 4
3 18.7 8
4 22.8 4
5 19.2 6
6 17.8 6
How do I find out which rows (i.e. row numbers) in the original mtcars contain these x/y value combinations?
I know I can join the two,
inner_join(mtcars, newData[ ,c('x', 'y')], by = c("mpg" = "x", "cyl" = 'y'))
which gives me a full data frame, but I only need the row numbers
which I need to use inside this:
mtcars$selected <- T
mtcars %>%
(selected = if_else(row_number() %in% '#ANSWER', !selected, selected))
in order to switch all selected points from F <-> T
UPDATE
I'm running this now:
mtcars %>%
mutate(Selected = if_else(row_number() %in% {mtcars %>%
mutate(rn = row_number()) %>%
inner_join(distinct(newData), by = c('mpg' = "x", "cyl" = 'y')) %>%
pull(rn)}, !Selected, Selected))
In order to pass the column names as variables (such as when coming from a dropdown menu in R Shiny) we can use setNames(c('x', 'y'), c(xvar, yvar)))
xpar <- 'mpg' #(input$xpar_selector in shiny App)
ypar <- 'cyl' #(input$ypar_selector in shiny App)
mtcars %>%
mutate(Selected = if_else(row_number() %in% {
mtcars %>%
mutate(rn = row_number()) %>%
inner_join(distinct(newData), by = setNames(c('x', 'y'), c(xvar, yvar))) %>%
pull(rn)
},
!Selected, Selected))
How to pass column names for inner join by 2 column sets as variables with dplyr
Expected result is:
Here's one way -
mtcars %>%
mutate(rn = row_number()) %>%
inner_join(distinct(newData), by = c("mpg" = "x", "cyl" = 'y')) %>%
pull(rn)
[1] 1 2 3 5 9 10 11
Here's how you could insert this in your code -
mtcars$selected <- TRUE
mtcars %>%
mutate(selected = if_else(row_number() %in% {mtcars %>%
mutate(rn = row_number()) %>%
inner_join(distinct(newData), by = c("mpg" = "x", "cyl" = 'y')) %>%
pull(rn)}, !selected, selected))
Here's a simpler way of achieving your result -
mtcars %>%
left_join(
newData %>%
distinct() %>%
mutate(selected = FALSE),
by = c("mpg" = "x", "cyl" = 'y')
) %>%
mutate(
selected = if_else(is.na(selected), TRUE, selected)
)
However, I am confused as to what you are trying to achieve here.
Maybe you are simply looking for an anti_join i.e. get rows from mtcars that are not there in newData -
mtcars %>%
anti_join(newData, by = c("mpg" = "x", "cyl" = 'y'))
Here is an option :
library(dplyr,warn.conflicts = FALSE)
newData %>%
distinct() %>%
mutate(match = TRUE) %>%
left_join(mtcars, by= c(mpg ="x", cyl="y"),.) %>%
pull(match) %>%
which()
#> [1] 1 2 3 5 9 10 11

Using purrr::pmap() in a rowwise manner outside of mutate()

I am trying to use purrr::pmap() to apply a custom function in a rowwise fashion along some dataframe rows. I can achieve my desired end result with a for-loop and with apply(), but when I try to use pmap() I can only get the result I want in combination with mutate(), which in my real-life applied case will be insufficient.
Is there a way to use pmap() to apply my custom function and just have the output print rather than be stored in a new column?
library(dplyr)
library(purrr)
library(tibble)
Create demo data & custom function
set.seed(57)
ds_mt <-
mtcars %>%
rownames_to_column("model") %>%
mutate(
am = factor(am, labels = c("auto", "manual")),
vs = factor(vs, labels = c("V", "S"))
) %>%
select(model, mpg, wt, cyl, am, vs) %>%
sample_n(3)
foo <- function(model, am, mpg){
print(
paste("The", model, "has a", am, "transmission and gets", mpg, "mpgs.")
)
}
Successful example of rowwise for-loop:
for (row in 1:nrow(ds_mt)) {
foo(
model = ds_mt[row, "model"],
am = ds_mt[row, "am"],
mpg = ds_mt[row, "mpg"]
)
}
Successful example using apply():
row.names(ds_mt) <- NULL # to avoid named vector as output
apply(
ds_mt,
MARGIN = 1,
FUN = function(ds)
foo(
model = ds["model"],
am = ds["am"],
mpg = ds["mpg"]
)
)
Example using pmap() within mutate() that is almost what I need.
ds_mt %>%
mutate(new_var =
pmap(
.l =
list(
model = model,
am = am,
mpg = mpg
),
.f = foo
))
FAILING CODE: Why doesn't this work?
ds_mt %>%
pmap(
.l =
list(
model = model,
am = am,
mpg = mpg
),
.f = foo
)
So after some more reading it seems this is a case for pwalk() rather than pmap(), because I am trying to get output to print (i.e., a side effect) rather than to be stored in a dataframe.
library(dplyr)
library(purrr)
library(tibble)
set.seed(57)
ds_mt <-
mtcars %>%
rownames_to_column("model") %>%
mutate(
am = factor(am, labels = c("auto", "manual")),
vs = factor(vs, labels = c("V", "S"))
) %>%
select(model, mpg, wt, cyl, am, vs) %>%
sample_n(3)
foo <- function(model, am, mpg){
print(
paste("The", model, "has a", am, "transmission and gets", mpg, "mpgs.")
)
}
ds_mt %>%
select(model, am, mpg) %>%
pwalk(
.l = .,
.f = foo
)

Resources