fct_reorder by function for only one group - r

I have a df of public and private schools within counties, and each has an assigned value. I want to use forcats::fct_reorder to rearrange the counties by the median value, but only for the private schools. Using default forcats::fct_reorder arranges by total median, which is less useful for what I'm doing.
Reprex here:
# make df
set.seed(1)
df <-
data.frame(
county = rep(c("Bexar","Travis","Tarrant","Aransas"), each=20),
type = rep(c("public","private"), each=10)
) %>%
mutate(value = case_when(type == "public" ~ runif(80,0,1),
type == "private" ~ runif(80, 0, 10)))
# private values are way higher than public
# relevel by median value
df %>%
mutate(county = forcats::fct_reorder(county, value, .fun=median)) %>%
# this rearranges counties by total median, but I only want to arrange by median of the private schools
# plot
ggplot(aes(x=county, y = value, color = type)) +
geom_point(position = position_dodge(
width=.75
)) +
geom_boxplot(alpha=.5)
Desired output would order them by increasing median of private schools only: Aransas, Travis, Tarrant, Bexar.
thanks!

library(tidyverse)
set.seed(1)
df <-
data.frame(
county = rep(c("Bexar","Travis","Tarrant","Aransas"), each=20),
type = rep(c("public","private"), each=10)
) %>%
mutate(value = case_when(type == "public" ~ runif(80,0,1),
type == "private" ~ runif(80, 0, 10)))
private_medians <-
df %>%
filter(type == "private") %>%
group_by(county) %>%
summarise(median = median(value)) %>%
arrange(median)
private_medians
#> # A tibble: 4 x 2
#> county median
#> <chr> <dbl>
#> 1 Aransas 3.91
#> 2 Travis 4.39
#> 3 Tarrant 5.68
#> 4 Bexar 6.24
# add other counties at the end in case they do not appear in the private subset
levels <- private_medians$county %>% union(df$county %>% unique())
df %>%
mutate(county = county %>% factor(levels = levels)) %>%
ggplot(aes(x=county, y = value, color = type)) +
geom_point(position = position_dodge(
width=.75
)) +
geom_boxplot(alpha=.5)
Created on 2021-10-18 by the reprex package (v2.0.1)

Related

Summarize information by group in data table in R

I'm trying to get multiple summary statistics in R grouped by Team. I used code like below, but output is not what I want.
please point me in a better direction. Thanks!
set.seed(77)
data <- data.frame(Team =sample(c("A","B"),30, replace=TRUE),
gender=sample(c("female","male"),30, replace=TRUE),
Age =sample(c(0:100),30, replace=T))
dat <- data %>%
group_by(Team, gender) %>%
dplyr::summarize_all(list(my_mean = mean,
my_sum = sum,
my_sd = sd)) %>%
as.data.frame()
df <- data %>%
group_by(Team) %>%
summarize(total = n(gender),
mean = mean(Age),
Max_Age = max(Age),
Min_Age = min(Age),
sd = sd(Age),
)
I want to get like this pic.
You may need to create the dataframe for the summary statistics of age per Team (age_summary in the example below) and that for the count of Team members per gender and Team (gender_summary in the example below), and then merge them into one dataframe (say summary_df).
library(tidyverse)
set.seed(77)
data <- data.frame(
Team = sample(c("A", "B"), 30, replace = TRUE),
gender = sample(c("female", "male"), 30, replace = TRUE),
Age = sample(c(0:100), 30, replace = T)
)
age_summary <- data %>%
group_by(Team) %>%
summarize(
mean = mean(Age),
Max = max(Age),
Min = min(Age),
sd = sd(Age)
) %>%
column_to_rownames("Team") %>%
t() %>%
as_tibble(
rownames = "age_summary"
)
gender_summary <- data %>%
group_by(Team) %>%
count(gender) %>%
ungroup() %>%
pivot_wider(names_from = Team, values_from = n)
summary_df <- full_join(
age_summary,
gender_summary
) %>%
mutate(
"item" = if_else(
is.na(gender),
"Age",
"Sex"
)
) %>%
unite("summary", c(age_summary, gender), na.rm = TRUE, remove = FALSE) %>%
relocate(item, .before = 1) %>%
select(-c(age_summary, gender))
# # A tibble: 6 × 4
# item summary A B
# <chr> <chr> <dbl> <dbl>
# 1 Age mean 45.6 57.8
# 2 Age Max 92 82
# 3 Age Min 5 14
# 4 Age sd 30.1 22.1
# 5 Sex female 8 9
# 6 Sex male 7 6

Making a Sankey Diagram in R

I'm trying to create a Sankey Diagram. I am using R with either {plotly} or {networkD3} packages. Both ask for the same type of data: source, target, value. I'm not really sure what source, target, and value is supposed to be and how to aggregate my data to this format. I have the following:
data.frame(
UniqID = rep(c(1:10), times=4),
Year = c(rep("2005", times=10), rep("2010", times=10), rep("2015", times=10), rep("2020", times=10)),
Response_Variable = round(runif(n = 40, min = 0, max = 2), digits = 0)
)
The response variable is a categorical variable of 0, 1, or 2. I would like to show the flow of the classes of this variable from one year to the next. The final product should look something like this:
In my case, "Wave" would be Year and "Outcome" would be the classes (0, 1, 2) of the response variable.
You don't really have enough information in your data to make a chart exactly like that because with the data you provided it's not clear which things changed from one category to the next across years. Maybe you were trying to achieve that with the UniqID column, but the way the data is, it doesn't make sense...
df <- data.frame(UniqID=rep(c(1:10), times=4),
Year=rep(c("2005", "2010", "2015", "2020"), times=10),
Response_Variable=round(runif(n=40, min = 0, max = 2), digits=0))
library(dplyr)
df %>% arrange(UniqID, Year) %>% filter(UniqID == 1)
#> UniqID Year Response_Variable
#> 1 1 2005 2
#> 2 1 2005 1
#> 3 1 2015 1
#> 4 1 2015 0
Ignoring that, the data format you're asking about is a list of "links" each one defining a movement from one "node", the "source" node, to another "node", the "target" "node". So in your case, each year-category combination is a "node", and you need a list of each "link" between those nodes, and potentially a "value" for each of your links, which in your case the number of occurrences of the source node makes the most sense. You could reshape your data to that format like this...
df %>%
group_by(Year, Response_Variable) %>%
summarise(value = n(), .groups = "drop") %>%
mutate(source = paste(Year, Response_Variable, sep = "_")) %>%
group_by(Response_Variable) %>%
mutate(target = lead(source, order_by = Year)) %>%
filter(!is.na(target))
#> # A tibble: 9 × 5
#> # Groups: Response_Variable [3]
#> Year Response_Variable value source target
#> <chr> <dbl> <int> <chr> <chr>
#> 1 2005 0 4 2005_0 2010_0
#> 2 2005 1 3 2005_1 2010_1
#> 3 2005 2 3 2005_2 2010_2
#> 4 2010 0 2 2010_0 2015_0
#> 5 2010 1 6 2010_1 2015_1
#> 6 2010 2 2 2010_2 2015_2
#> 7 2015 0 3 2015_0 2020_0
#> 8 2015 1 3 2015_1 2020_1
#> 9 2015 2 4 2015_2 2020_2
To get to the more specific format that {networkD3} requires, you need one data.frame for links and one that lists each node. The links data.frame needs to refer to each node in the nodes data.frame by its 0-based index. You can set that up like this...
library(dplyr)
library(networkD3)
df <-
data.frame(
UniqID=rep(c(1:10), times=4),
Year=rep(c("2005", "2010", "2015", "2020"), times=10),
Response_Variable=round(runif(n=40, min = 0, max = 2), digits=0)
)
links <-
df %>%
group_by(Year, Response_Variable) %>%
summarise(value = n(), .groups = "drop") %>%
mutate(source = paste(Year, Response_Variable, sep = "_")) %>%
group_by(Response_Variable) %>%
mutate(target = lead(source, order_by = Year)) %>%
filter(!is.na(target)) %>%
ungroup() %>%
select(source, target, value)
nodes <- data.frame(node_id = unique(c(links$source, links$target)))
links$source <- match(links$source, nodes$node_id) - 1
links$target <- match(links$target, nodes$node_id) - 1
sankeyNetwork(
Links = links,
Nodes = nodes,
Source = "source",
Target = "target",
Value = "value",
NodeID = "node_id"
)
#> Links is a tbl_df. Converting to a plain data frame.
given the modification to your example data, it would look like this...
library(dplyr)
library(networkD3)
df <-
data.frame(
UniqID=rep(c(1:10), times=4),
Year=c(rep("2005", times=10), rep("2010", times=10), rep("2015", times=10), rep("2020", times=10)),
Response_Variable=round(runif(n=40, min = 0, max = 2), digits=0)
)
links <-
df %>%
arrange(UniqID, Year) %>%
mutate(source = paste(Year, Response_Variable, sep = "_")) %>%
group_by(UniqID) %>%
mutate(target = lead(source, order_by = Year)) %>%
filter(!is.na(target)) %>%
ungroup() %>%
select(UniqID, source, target) %>%
group_by(source, target) %>%
summarise(value = n(), .groups = "drop")
nodes <- data.frame(node_id = unique(c(links$source, links$target)))
nodes$node_label <- sub("(.*)_([0-9]+)$", "\\1 (response \\2)", nodes$node_id)
nodes$node_group <- sub("^.*_", "", nodes$node_id)
links$source <- match(links$source, nodes$node_id) - 1
links$target <- match(links$target, nodes$node_id) - 1
sankeyNetwork(
Links = links,
Nodes = nodes,
Source = "source",
Target = "target",
Value = "value",
NodeID = "node_label",
NodeGroup = "node_group"
)
The answer is to use ggsankey and not plotly nor networkD3!

randomly add NA values to dataframe with the proportion set by group

I would like to randomly add NA values to my dataframe with the proportion set by group.
library(tidyverse)
set.seed(1)
dat <- tibble(group = c(rep("A", 100),
rep("B", 100)),
value = rnorm(200))
pA <- 0.5
pB <- 0.2
# does not work
# was trying to create another column that i could use with
# case_when to set value to NA if missing==1
dat %>%
group_by(group) %>%
mutate(missing = rbinom(n(), 1, c(pA, pB))) %>%
summarise(mean = mean(missing))
I'd create a small tibble to keep track of the expected missingness rates, and join it to the first data frame. Then go through row by row to decide whether to set a value to missing or not.
This is easy to generalize to more than two groups as well.
library("tidyverse")
set.seed(1)
dat <- tibble(
group = c(
rep("A", 100),
rep("B", 100)
),
value = rnorm(200)
)
expected_nans <- tibble(
group = c("A", "B"),
p = c(0.5, 0.2)
)
dat_with_nans <- dat %>%
inner_join(
expected_nans,
by = "group"
) %>%
mutate(
r = runif(n()),
value = if_else(r < p, NA_real_, value)
) %>%
select(
-p, -r
)
dat_with_nans %>%
group_by(
group
) %>%
summarise(
mean(is.na(value))
)
#> # A tibble: 2 × 2
#> group `mean(is.na(value))`
#> <chr> <dbl>
#> 1 A 0.53
#> 2 B 0.17
Created on 2022-03-11 by the reprex package (v2.0.1)
Nesting and unnesting works
library(tidyverse)
dat <- tibble(group = c(rep("A", 1000),
rep("B", 1000)),
value = rnorm(2000))
pA <- .1
pB <- 0.5
set.seed(1)
dat %>%
group_by(group) %>%
nest() %>%
mutate(p = case_when(
group=="A" ~ pA,
TRUE ~ pB
)) %>%
mutate(data = purrr::map(data, ~ mutate(.x, missing = rbinom(n(), 1, p)))) %>%
unnest() %>%
summarise(mean = mean(missing))
# A tibble: 2 × 2
group mean
<chr> <dbl>
1 A 0.11
2 B 0.481
set.seed(1)
dat %>%
group_by(group) %>%
nest() %>%
mutate(p = case_when(
group=="A" ~ pA,
TRUE ~ pB
)) %>%
mutate(data = purrr::map(data, ~ mutate(.x, missing = rbinom(n(), 1, p)))) %>%
unnest() %>%
ungroup() %>%
mutate(value = case_when(
missing == 1 ~ NA_real_,
TRUE ~ value
)) %>%
select(-p, -missing)

How to estimate the mean of the 10% upper and lower values over multiple categories with dplyr?

Suppose you have this data.frame in R
set.seed(15)
df <- data.frame(cat = rep(c("a", "b"), each = 50),
x = c(runif(50, 0, 1), runif(50, 1, 2)))
I want to estimate the mean of the 10% upper and lower values in each category.
I can do it using base functions like this
dfa <- df[df$cat=="a",]
dfb <- df[df$cat=="b",]
mean(dfa[dfa$x >= quantile(dfa$x, 0.9),"x"])
# [1] 0.9537632
mean(dfa[dfa$x <= quantile(dfa$x, 0.1),"x"])
# [1] 0.07959845
mean(dfb[dfb$x >= quantile(dfb$x, 0.9),"x"])
# [1] 1.963775
mean(dfb[dfb$x <= quantile(dfb$x, 0.1),"x"])
# [1] 1.092218
However, I can't figure it out how to implement this using dplyr or purrr.
Thanks for the help.
We could do this in a group by approach using cut and quantile as breaks
library(dplyr)
df %>%
group_by(cat) %>%
mutate(grp = cut(x, breaks = c(-Inf, quantile(x,
probs = c(0.1, 0.9)), Inf))) %>%
group_by(grp, .add = TRUE) %>%
summarise(x = mean(x, na.rm = TRUE), .groups = 'drop_last') %>%
slice(-2)
-ouptut
# A tibble: 4 x 3
# Groups: cat [2]
cat grp x
<chr> <fct> <dbl>
1 a (-Inf,0.0813] 0.0183
2 a (0.853, Inf] 0.955
3 b (-Inf,1.21] 1.07
4 b (1.93, Inf] 1.95
Here's a way you can use cut() to help partitaion your data into groups and then take the mean
df %>%
group_by(cat) %>%
mutate(part=cut(x, c(-Inf, quantile(x, c(.1, .9)), Inf), labels=c("low","center","high"))) %>%
filter(part!="center") %>%
group_by(cat, part) %>%
summarize(mean(x))
which returns everything in a nice tibble
cat part `mean(x)`
<chr> <fct> <dbl>
1 a low 0.0796
2 a high 0.954
3 b low 1.09
4 b high 1.96
To make it a bit cleaner, you can factor out the splitting to a helper function
split_quantile <- function(x , p=c(.1, .9)) {
cut(x, c(-Inf, quantile(x, c(.1, .9)), Inf), labels=c("low","center","high"))
}
df %>%
group_by(cat) %>%
mutate(part = split_quantile(x)) %>%
filter(part != "center") %>%
group_by(cat, part) %>%
summarize(mean(x))
A variant of #MrFlick's answer - you can use cut_number and slice:
df %>%
group_by(cat) %>%
mutate(part = cut_number(x, n = 10)) %>%
group_by(cat, part) %>%
summarise(mean(x)) %>%
slice(1, n())

How to data wrangle and barplot the proportion without undesired stripes

Please find the input data and expected output as screenshot below:
However, the current plot with the below code:
I feel, I made it too complicated. But I shared input data and expected data along with struggled code along the way. Could you please help us
Mainly there are 2 issues.
1. If mutate is used, undesired stripes appear on the plot
Summarize used, then it is not adding to 100%
2. How can we extract the top contributors
Both have been tried by us but stuck somewhere
# Input data
df <- tibble(
country = c(rep(c("India","USA","Germany","Africa"), each = 8)),
type = c("sms","Other","whatsapp","web","online","shiny","whatsapp","whatsapp",
"sms","sms","sms","web","web","Other","online","whatsapp",
"sms","Other","whatsapp","shiny","online","shiny","whatsapp","whatsapp",
"sms","sms","sms","shiny","online","Other","online","Other"
),
cust = rep(c("google","Apple","wallmart","pg"),8),
quantity = c(10,20,30,40,50,60,70,80,
90,100,15,25,35,45,55,65,
75,85,95,105,10,15,20,25,
30,35,40,45,50,55,60,65)
)
# Without Customer
df %>%
group_by(country,type) %>%
summarise(kpi_wo_cust = sum(quantity)) %>%
ungroup() -> df_wo_cust
# With Customer
df %>%
group_by(country,type,cust) %>%
summarise(kpi_cust = sum(quantity)) %>%
ungroup() -> df_cust
df_combo <- left_join(df_cust, df_wo_cust, by = c("country","type"))
df_combo %>% glimpse()
# Aggregated data for certain KPIs for final plot
df_aggr <- df_combo %>%
group_by(country,type) %>%
mutate(kpi_cust_total = sum(kpi_cust),
per_kpi_cust = 100 * (kpi_cust/kpi_cust_total)) %>%
group_by(country) %>%
# In order to except from repeated counting, selecting unique()
mutate(kpi_cust_uniq_total = sum(kpi_cust) %>% unique(),
per_unq_kpi_cust = 100 * (kpi_cust/kpi_cust_uniq_total) %>% round(4))
#
plt = df_aggr %>% ungroup() %>%#glimpse()
# In order to obtain theTop 2 customers (Major contributor) within country and type
# However, if this code is used, there is an error
# group_by(country, type) %>%
# nest() %>%
# mutate(top_cust = purrr::map_chr(data, function(x){
# x %>% arrange(desc(per_kpi_cust)) %>%
# top_n(2,per_kpi_cust) %>%
# summarise(Cust = paste(cust,round(per_kpi_cust,2), collapse = "<br>")) %>%
# pull(cust)
# })#,data = NULL
# ) %>%
# unnest(cols = data) %>%
group_by(country, type) %>%
# If mutate is used, undesired stripes appear on the plot
# Summarize used, then it is not adding to 100%
mutate(avg_kpi_cust = per_unq_kpi_cust %>% mean()) %>%
#summarise(avg_kpi_cust = per_unq_kpi_cust %>% mean()) %>%
ggplot(aes(x = country,
y = avg_kpi_cust,
fill = type,
text = paste('<br>proportion: ', round(avg_kpi_cust,2), "%",
"<br>country:",country
))) +
geom_bar(stat = "identity"#, position=position_dodge()
) +
coord_flip() +
theme_bw()
ggplotly(plt)
The key was to use distinct() after mutate() instead of summarise()
Also, mean() was the wrong function used earlier instead of sum() which had resulted in incomplete barplot.
library(tidyverse)
library(plotly)
# Input data
df <- tibble(
country = c(rep(c("India","USA","Germany","Africa"), each = 8)),
type = c("sms","Other","whatsapp","web","online","shiny","whatsapp","whatsapp",
"sms","sms","sms","web","web","Other","online","whatsapp",
"sms","Other","whatsapp","shiny","online","shiny","whatsapp","whatsapp",
"sms","sms","sms","shiny","online","Other","online","Other"
),
cust = rep(c("google","Apple","wallmart","pg"),8),
quantity = c(10,20,30,40,50,60,70,80,
90,100,15,25,35,45,55,65,
75,85,95,105,10,15,20,25,
30,35,40,45,50,55,60,65)
)
# Without Customer
df %>%
group_by(country,type) %>%
summarise(kpi_wo_cust = sum(quantity)) %>%
ungroup() -> df_wo_cust
# With Customer
df %>%
group_by(country,type,cust) %>%
summarise(kpi_cust = sum(quantity)) %>%
ungroup() -> df_cust
df_combo <- left_join(df_cust, df_wo_cust, by = c("country","type"))
df_combo %>% glimpse()
# Aggregated data for certain KPIs for final plot
df_aggr <- df_combo %>%
group_by(country,type) %>%
mutate(kpi_cust_total = sum(kpi_cust),
per_kpi_cust = 100 * (kpi_cust/kpi_cust_total)) %>%
group_by(country) %>%
# In order to except from repeated counting, selecting unique()
mutate(kpi_cust_uniq_total = sum(kpi_cust) %>% unique(),
per_unq_kpi_cust = 100 * (kpi_cust/kpi_cust_uniq_total) %>% round(4))
plt = df_aggr %>% ungroup() %>%
# In order to diplay Top 2 customers (Major contributor) within country and type
group_by(country, type) %>%
nest() %>%
mutate(top_cust = purrr::map_chr(data, function(x){
x %>% arrange(desc(per_kpi_cust)) %>%
top_n(2,per_kpi_cust) %>%
summarise(Cust = paste(cust,round(per_kpi_cust,2), collapse = "<br>")) %>%
pull(Cust)
})) %>%
unnest(cols = data) %>%
group_by(country, type) %>%
# If mutate is used, undesired stripes appear on the plot
# Summarize used, then it is not adding to 100%.
# So distinct was used
mutate(avg_kpi_cust = per_unq_kpi_cust %>% sum()) %>%
ungroup() %>%
distinct(country, type, .keep_all = T) %>%
ggplot(aes(x = country,
y = avg_kpi_cust,
fill = type,
text = top_cust
)) +
geom_bar(stat = "identity") +
coord_flip() +
theme_bw()
ggplotly(plt, tooltip = "text")

Resources