I am trying to figure out a way to aggregate levels of a group creating a new level based on a threshold value of what you are aggregating.
Create some data:
library(tidyr)
library(dplyr)
demo_data <- as_tibble(VADeaths) %>%
mutate(age_bucket = row.names(VADeaths)) %>%
pivot_longer(-age_bucket) %>%
arrange(name)
Here are a bunch of values below our threshold (say 15 here)
demo_data %>%
filter(value < 15)
#> # A tibble: 5 x 3
#> age_bucket name value
#> <chr> <chr> <dbl>
#> 1 50-54 Rural Female 8.7
#> 2 55-59 Rural Female 11.7
#> 3 50-54 Rural Male 11.7
#> 4 50-54 Urban Female 8.4
#> 5 55-59 Urban Female 13.6
Now I can use some logic to do this with case_when but this seems fragile because it is so specific. This does, however, illustrate what I am after:
demo_data %>%
mutate(age_bucket_agg = case_when(
age_bucket %in% c("50-54", "55-59") & name == "Rural Female" ~ "50-59",
age_bucket %in% c("50-54", "55-59") & name == "Urban Female" ~ "50-59",
age_bucket %in% c("50-54", "55-59") & name == "Rural Male" ~ "50-59",
TRUE ~ age_bucket
)
) %>%
group_by(age_bucket_agg, name) %>%
summarise(value = sum(value))
#> `summarise()` regrouping output by 'age_bucket_agg' (override with `.groups` argument)
#> # A tibble: 17 x 3
#> # Groups: age_bucket_agg [6]
#> age_bucket_agg name value
#> <chr> <chr> <dbl>
#> 1 50-54 Urban Male 15.4
#> 2 50-59 Rural Female 20.4
#> 3 50-59 Rural Male 29.8
#> 4 50-59 Urban Female 22
#> 5 55-59 Urban Male 24.3
#> 6 60-64 Rural Female 20.3
#> 7 60-64 Rural Male 26.9
#> 8 60-64 Urban Female 19.3
#> 9 60-64 Urban Male 37
#> 10 65-69 Rural Female 30.9
#> 11 65-69 Rural Male 41
#> 12 65-69 Urban Female 35.1
#> 13 65-69 Urban Male 54.6
#> 14 70-74 Rural Female 54.3
#> 15 70-74 Rural Male 66
#> 16 70-74 Urban Female 50
#> 17 70-74 Urban Male 71.1
My question is can anyone think of an automated way of doing this? How can I tell dplyr (or R in general) to take all values below as threshold and add them to the next age_bucket and then recode that grouping level to take the lowest value and the biggest value and create a new range.
I think your example is a bit too minimal for this really challenging question. I added some challenges to your data which I think the approaches of the other answers can't tackle yet. My approach is quite verbose. Essentially, it checks every logical combination / direction in which age buckets could be merged and then recursively merges the age buckets until the threshold is met or until there are no other age buckets left to merge together. With a bit more work we could turn this into a more general function.
library(tidyverse)
demo_data <- as_tibble(VADeaths) %>%
mutate(age_bucket = row.names(VADeaths)) %>%
pivot_longer(-age_bucket) %>%
arrange(name) %>%
# lets add more challenges to the data
mutate(value = case_when(
age_bucket == "55-59" & name == "Rural Female" ~ 2,
age_bucket == "70-74" & name == "Rural Male" ~ 13,
age_bucket == "65-69" & name == "Urban Female" ~ 8,
age_bucket == "70-74" & name == "Urban Male" ~ 3,
T ~ value))
# function that implements merging age buckets
merge_impl <- function(x) {
if(any(x$first)) {
e <- filter(x, first == 1)
if (e$id & !is.na(e$age_max_lead)) {
out <- mutate(x,
age_max = if_else(first,
age_max_lead,
age_max),
value = if_else(first,
value + value_lead,
value))
out <- filter(out, !lag(first, default = FALSE))
} else if (e$id & is.na(e$age_max_lead & !is.na(e$age_min_lag))) {
out <- mutate(x,
age_min = if_else(first,
age_min_lag,
age_min),
value = if_else(first,
value + value_lag,
value))
out <- filter(out, !lead(first, default = FALSE))
} else if (e$id & is.na(e$age_max_lead & is.na(e$age_min_lag))) {
out <- x
} else if (!e$id & !is.na(e$age_min_lag)) {
out <- mutate(x,
age_min = if_else(first,
age_min_lag,
age_min),
value = if_else(first,
value + value_lag,
value))
out <- filter(out, !lead(first, default = FALSE))
} else if (!e$id & is.na(e$age_min_lag) & !is.na(e$age_max_lead)) {
out <- mutate(x,
age_max = if_else(first,
age_max_lead,
age_max),
value = if_else(first,
value + value_lead,
value)) %>%
out <- filter(out, !lag(first, default = FALSE))
} else if (!e$id & is.na(e$age_min_lag) & is.na(e$age_max_lead)) {
out <- x
}
} else {
out <- x
}
select(out,
-contains("lead"), -contains("lag"),
-first, -id)
}
merge_age_buckets <- function(x, threshold) {
# initialize
data_ls <-
x %>%
separate(age_bucket,
c("age_min", "age_max"),
convert = TRUE) %>%
group_by(name) %>%
mutate(across(c(age_min, age_max, value),
list(lead = ~ lead(.x),
lag = ~ lag(.x))
)
) %>%
mutate(id = age_min %% 10 == 0,
first = value < threshold & cumsum(value < threshold) == 1) %>%
group_split
# check & proceed
if(any(map_lgl(data_ls, ~ any(.x$first & nrow(.x) > 1)))) {
res <- map_dfr(data_ls, merge_impl) %>%
mutate(age_bucket = paste0(age_min, "-", age_max)) %>%
select(- c(age_min, age_max))
# if result still needs adjustment repeat
if(any(res$value < threshold)) {
merge_age_buckets(res, threshold = threshold)
} else {
return(res)
}
} else {
out <- reduce(data_ls, bind_rows) %>%
mutate(age_buckets = paste0(age_min, "-", age_max)) %>%
select(- c(age_min, age_max))
return(out)
}
}
merge_age_buckets(demo_data, 15)
#> # A tibble: 13 x 3
#> name value age_bucket
#> <chr> <dbl> <chr>
#> 1 Rural Female 31 50-64
#> 2 Rural Female 30.9 65-69
#> 3 Rural Female 54.3 70-74
#> 4 Rural Male 29.8 50-59
#> 5 Rural Male 26.9 60-64
#> 6 Rural Male 54 65-74
#> 7 Urban Female 22 50-59
#> 8 Urban Female 27.3 60-69
#> 9 Urban Female 50 70-74
#> 10 Urban Male 15.4 50-54
#> 11 Urban Male 24.3 55-59
#> 12 Urban Male 37 60-64
#> 13 Urban Male 57.6 65-74
Created on 2020-06-23 by the reprex package (v0.3.0)
Here is a unneccessary complicated way using dplyr and stringr:
demo_data %>%
group_by(name) %>%
mutate(csum = cumsum(value),
min_split = ifelse(value<15, as.numeric(str_split(age_bucket[value<15], "-", simplify = TRUE))[1], NA),
max_split = ifelse(value<15, as.numeric(str_split(age_bucket[min(which(csum>15))], "-", simplify = TRUE))[2], NA),
age_bucket = ifelse(value<15, str_c(min_split, "-", max_split), age_bucket),
value = ifelse(value<15, csum[min(which(csum>15))], value)) %>%
select(-min_split, -max_split, -csum) %>%
distinct() %>%
arrange(age_bucket)
which yields
# A tibble: 18 x 3
# Groups: name [4]
age_bucket name value
<chr> <chr> <dbl>
1 50-54 Urban Male 15.4
2 50-59 Rural Female 20.4
3 50-59 Rural Male 29.8
4 50-59 Urban Female 22
5 55-59 Rural Male 18.1
6 55-59 Urban Male 24.3
7 60-64 Rural Female 20.3
8 60-64 Rural Male 26.9
9 60-64 Urban Female 19.3
10 60-64 Urban Male 37
11 65-69 Rural Female 30.9
12 65-69 Rural Male 41
13 65-69 Urban Female 35.1
14 65-69 Urban Male 54.6
15 70-74 Rural Female 54.3
16 70-74 Rural Male 66
17 70-74 Urban Female 50
18 70-74 Urban Male 71.1
Not sure if I understand the requirements correctly after TimTeaFan’s comments, here is approach in data.table:
library(data.table)
DT <- setDT(reshape2::melt(VADeaths, id.vars=NULL))
DT[, c("low", "high") := lapply(tstrsplit(Var1, "-"), as.integer)]
DT[value < 15, c("low","high") := .(min(low), max(high)), Var2]
DT[, sum(value), .(low, high, Var2)]
With a mix of cumsum and rle, (here using data.table::rleid, but you can also use base::rle)
library(tidyr)
library(dplyr)
demo_data <- as_tibble(VADeaths) %>%
mutate(age_bucket = as.factor(row.names(VADeaths))) %>% #factorise to get the levels right
pivot_longer(-age_bucket) %>%
arrange(name, age_bucket) #added this to sort
thresh <- 15
demo_data %>%
group_by(name) %>%
mutate(rle_val = data.table::rleid(value < thresh),
min_nonconsec = which.min(c(1, diff(rle_val) != 1)),
newbuck = cumsum(row_number() > min_nonconsec)) %>%
group_by(name, newbuck) %>%
summarise(newname = paste(age_bucket, collapse = "-"),
newbucket = paste(unlist(strsplit(newname, "-"))[1], tail(unlist(strsplit(newname, "-")),1), sep = "-"),
newval = sum(value)
) %>%
select(-newname)
#> `summarise()` regrouping output by 'name' (override with `.groups` argument)
#> # A tibble: 15 x 4
#> # Groups: name [4]
#> name newbuck newbucket newval
#> <chr> <int> <chr> <dbl>
#> 1 Rural Female 0 50-64 40.7
#> 2 Rural Female 1 65-69 30.9
#> 3 Rural Female 2 70-74 54.3
#> 4 Rural Male 0 50-59 29.8
#> 5 Rural Male 1 60-64 26.9
#> 6 Rural Male 2 65-69 41
#> 7 Rural Male 3 70-74 66
#> 8 Urban Female 0 50-64 41.3
#> 9 Urban Female 1 65-69 35.1
#> 10 Urban Female 2 70-74 50
#> 11 Urban Male 0 50-54 15.4
#> 12 Urban Male 1 55-59 24.3
#> 13 Urban Male 2 60-64 37
#> 14 Urban Male 3 65-69 54.6
#> 15 Urban Male 4 70-74 71.1
Created on 2020-06-20 by the reprex package (v0.3.0)
Related
I am trying to obtain the largest 10 investors in a country but obtain confusing result using arrange in dplyr versus order in base R.
head(fdi_partner)
give the following results
# A tibble: 6 x 3
`Main counterparts` `Number of projects` `Total registered capital (Mill. USD)(*)`
<chr> <chr> <chr>
1 TOTAL 1818 38854.3
2 Singapore 231 11358.66
3 Korea Rep.of 377 7679.9
4 Japan 204 4325.79
5 Netherlands 24 4209.64
6 China, PR 216 3001.79
and
fdi_partner %>%
rename("Registered capital" = "Total registered capital (Mill. USD)(*)") %>%
mutate_at(c("Number of projects", "Registered capital"), as.numeric) %>%
arrange("Number of projects") %>%
head()
give almost the same result
# A tibble: 6 x 3
`Main counterparts` `Number of projects` `Registered capital`
<chr> <dbl> <dbl>
1 TOTAL 1818 38854.
2 Singapore 231 11359.
3 Korea Rep.of 377 7680.
4 Japan 204 4326.
5 Netherlands 24 4210.
6 China, PR 216 3002.
while the following code is working fine with base R
head(fdi_partner)
fdi_numeric <- fdi_partner %>%
rename("Registered capital" = "Total registered capital (Mill. USD)(*)") %>%
mutate_at(c("Number of projects", "Registered capital"), as.numeric)
head(fdi_numeric[order(fdi_numeric$"Number of projects", decreasing = TRUE), ], n=11)
which gives
# A tibble: 11 x 3
`Main counterparts` `Number of projects` `Registered capital`
<chr> <dbl> <dbl>
1 TOTAL 1818 38854.
2 Korea Rep.of 377 7680.
3 Singapore 231 11359.
4 China, PR 216 3002.
5 Japan 204 4326.
6 Hong Kong SAR (China) 132 2365.
7 United States 83 783.
8 Taiwan 66 1464.
9 United Kingdom 50 331.
10 F.R Germany 37 131.
11 Thailand 36 370.
Can anybody help explain what's wrong with me?
dplyr (and more generally tidyverse packages) accept only unquoted variable names. If your variable name has a space in it, you must wrap it in backticks:
library(dplyr)
test <- data.frame(`My variable` = c(3, 1, 2), var2 = c(1, 1, 1), check.names = FALSE)
test
#> My variable var2
#> 1 3 1
#> 2 1 1
#> 3 2 1
# Your code (doesn't work)
test %>%
arrange("My variable")
#> My variable var2
#> 1 3 1
#> 2 1 1
#> 3 2 1
# Solution
test %>%
arrange(`My variable`)
#> My variable var2
#> 1 1 1
#> 2 2 1
#> 3 3 1
Created on 2023-01-05 with reprex v2.0.2
I am working with the R programming language.
I have the following dataset:
set.seed(123)
gender <- c("Male","Female")
gender <- sample(gender, 5000, replace=TRUE, prob=c(0.45, 0.55))
gender <- as.factor(gender)
status <- c("Immigrant","Citizen")
status <- sample(status, 5000, replace=TRUE, prob=c(0.3, 0.7))
status <- as.factor(status )
country <- c("A", "B", "C", "D")
country <- sample(country, 5000, replace=TRUE, prob=c(0.25, 0.25, 0.25, 0.25))
country <- as.factor(country)
################
disease <- c("Yes","No")
disease <- sample(disease, 5000, replace=TRUE, prob=c(0.4, 0.6))
disease <- as.factor(disease)
###################
my_data = data.frame(gender, status, disease, country)
I want to find out the relative percentage of each unique group of factors that have the disease vs do not have the disease.
As an example:
What percentage of Male Immigrants from Country A have the disease vs don't have the disease
What percentage of Male Citizens from Country A have the disease vs don't have the disease (both these percentages should add to 1)
etc.
I tried to do this with the following code:
# https://stackoverflow.com/questions/24576515/relative-frequencies-proportions-with-dplyr
library(dplyr)
step_1 = my_data %>% group_by (gender, status, country, disease) %>%
summarise (n=n()) %>%
mutate(rel.freq = paste0(round(100 * n/sum(n), 0), "%"))
`summarise()` has grouped output by 'gender', 'status', 'country'. You can override using the
`.groups` argument.
# A tibble: 32 x 6
# Groups: gender, status, country [16]
gender status country disease n rel.freq
<fct> <fct> <fct> <fct> <int> <chr>
1 Female Citizen A No 285 60%
2 Female Citizen A Yes 193 40%
Now (assuming this is correct), I am trying to make some modifications to this result - this should reduce the number of rows by half (i.e. two rows in step_1 for yes and no are now combined into a single row):
step_2 = step_1 %>%
group_by(gender, status, country) %>%
summarize(disease = first(disease),
# number of people in this row that do not have the disease
n_no = sum(disease == "No"),
# number of people in this row that do have the disease
n_yes = sum(disease == "Yes"),
# relative percent of people in this row that do not have the disease
n_no_rel_freq = paste(round(sum(disease == "No") / sum(n) * 100), "%"),
# relative percent of people in this row that do have the disease
n_yes_rel_freq = paste(round(sum(disease == "Yes") / sum(n) * 100), "%"),
# overall percent of all people in this row relative to entire population
overall_percent = sum(n) / sum(step_1$n))
The code seems to run - but many of the percentages are now 0:
# A tibble: 16 x 9
# Groups: gender, status [4]
gender status country disease n_no n_yes n_no_rel_freq n_yes_rel_freq overall_percent
<fct> <fct> <fct> <fct> <int> <int> <chr> <chr> <dbl>
1 Female Citizen A No 1 0 0 % 0 % 0.102
2 Female Citizen B No 1 0 0 % 0 % 0.092
Can someone please show me how to fix this?
Thanks!
Note: The final result should look something like this
# desired result (sample)
gender status country n_no n_yes n_no_rel_freq n_yes_rel_freq total overall_percent
1 female citizen A 285 193 0.6 0.4 478 0.0956
Maybe you can make use of pivot-wider.
library(tidyverse)
my_data %>% group_by (gender, status, country, disease) %>%
summarise (n=n()) %>%
mutate(rel.freq = paste0(round(100 * n/sum(n), 0), "%")) -> step_1
#> `summarise()` has grouped output by 'gender', 'status', 'country'. You can
#> override using the `.groups` argument.
step_1 |>group_by(country) |>
pivot_wider(names_from = disease,
values_from = c(n:rel.freq),
names_prefix = "disease_") |>
mutate(overallPerc = (n_disease_No + n_disease_Yes)/sum(step_1$n))
#> # A tibble: 16 × 8
#> # Groups: country [4]
#> gender status country n_disease_No n_disease_Yes rel.fre…¹ rel.f…² overa…³
#> <fct> <fct> <fct> <int> <int> <chr> <chr> <dbl>
#> 1 Female Citizen A 308 200 61% 39% 0.102
#> 2 Female Citizen B 291 169 63% 37% 0.092
#> 3 Female Citizen C 301 228 57% 43% 0.106
#> 4 Female Citizen D 245 189 56% 44% 0.0868
#> 5 Female Immigrant A 107 95 53% 47% 0.0404
#> 6 Female Immigrant B 126 76 62% 38% 0.0404
#> 7 Female Immigrant C 137 70 66% 34% 0.0414
#> 8 Female Immigrant D 129 74 64% 36% 0.0406
#> 9 Male Citizen A 237 167 59% 41% 0.0808
#> 10 Male Citizen B 247 163 60% 40% 0.082
#> 11 Male Citizen C 250 171 59% 41% 0.0842
#> 12 Male Citizen D 230 139 62% 38% 0.0738
#> 13 Male Immigrant A 103 68 60% 40% 0.0342
#> 14 Male Immigrant B 117 63 65% 35% 0.036
#> 15 Male Immigrant C 93 53 64% 36% 0.0292
#> 16 Male Immigrant D 102 52 66% 34% 0.0308
#> # … with abbreviated variable names ¹rel.freq_disease_No,
#> # ²rel.freq_disease_Yes, ³overallPerc
I would solve this with the help of data.table:
install(data.table)
setDT(my_data)
my_data[, .N, by = .(gender, status, country, disease)][
, dcast(.SD, gender+status+country~disease, value.var = "N")][
, rel.freq := Yes/(No+Yes)][]
What is in there:
You install data.table
You convert my_data to a data.table (setDT(my_data))
With my_data[, .N, by = .(gender, status, country, disease)] you count cases (.N) grouped by all the variables after by=.
With [, dcast(.SD, gender+status+country~disease, value.var = "N")] you counvert your long table into a wide one, leaving the levels of disease as new column headers and summing on N, which is the number of cases.
With [, rel.freq := Yes/(No+Yes)] you create a new variable rel.freq that is the result of dividing the positive cases in the total cases.
With the [] you display the result to screen (you don't need this step, if you want to assign the result to a new object).
This is the result I obtained:
gender status country No Yes rel.freq
1: Female Citizen A 308 200 0.3937008
2: Female Citizen B 291 169 0.3673913
3: Female Citizen C 301 228 0.4310019
4: Female Citizen D 245 189 0.4354839
5: Female Immigrant A 107 95 0.4702970
6: Female Immigrant B 126 76 0.3762376
7: Female Immigrant C 137 70 0.3381643
8: Female Immigrant D 129 74 0.3645320
9: Male Citizen A 237 167 0.4133663
10: Male Citizen B 247 163 0.3975610
11: Male Citizen C 250 171 0.4061758
12: Male Citizen D 230 139 0.3766938
13: Male Immigrant A 103 68 0.3976608
14: Male Immigrant B 117 63 0.3500000
15: Male Immigrant C 93 53 0.3630137
16: Male Immigrant D 102 52 0.3376623
Example Dataframe
structure(list(sex = c("Male", "Female", "Female", "Female",
"Male", "Female", "Female", "Male", "Female"), cigarettes_smoking_status = c("Non-smoker",
"Non-smoker", "Non-smoker", "Non-smoker", "Non-smoker", "Non-smoker",
"Non-smoker", "Regular Smoker", "Non-smoker")), row.names = 2:10, class = "data.frame")
Code
smoking_status_by_per <- smoking_dataset %>%
group_by(cigarettes_smoking_status, sex) %>%
dplyr::summarise(count1=n()) %>%
mutate(percentage=(count1/sum(count1))*100) %>%
pivot_wider(names_from = sex, values_from = percentage) %>%
group_by(cigarettes_smoking_status)
The problem
I am having difficulty producing a percentage table in R that is condensed to 4 rows (Occasional smokers, Non-smokers, regular smokers and Prefer not to say) that clearly shows the percentage in each category by sex. Ideally, I am looking to produce a table in R that looks like this
How I want the table to look:
I have been attempting to use janitor::tabyl and pivot_wider to condense the rows, so there are just 4 rows. One row for Regular smokers. One row for occasional smokers etc. This is what my current output looks like.
Current dodgy output:
smoking_status_by_per %>%
# generate counts
janitor::tabyl(cigarettes_smoking_status, sex) %>%
# add total row/column
janitor::adorn_totals(where = c('row', 'col')) %>%
# convert counts to percentages
janitor::adorn_percentages() %>%
janitor::adorn_pct_formatting()
cigarettes_smoking_status Female Male Total
Non-smoker 75.0% 25.0% 100.0%
Regular Smoker 0.0% 100.0% 100.0%
Total 66.7% 33.3% 100.0%
This does convert the totals to percentages. You can use janitor::adorn_ns to add back counts to the percentages as well. Or save the totals after calculating the totals and add them back to the table afterwards (rbind the last row and cbind the Totals column with the counts).
We can use proportions and some binding to get what you have in the example.
Starting with enough data to fill out the matrix,
set.seed(42)
quux <- data.frame(response = sample(c("Non-smoker", "Occasional smoker", "Prefer not to say", "Regular smoker"), size=5000, replace=TRUE), gender = sample(c("Male", "Female", "Prefer not to say", "Unknown"), size=5000, replace=TRUE))
head(quux)
# response gender
# 1 Non-smoker Unknown
# 2 Non-smoker Prefer not to say
# 3 Non-smoker Female
# 4 Non-smoker Unknown
# 5 Occasional smoker Female
# 6 Regular smoker Unknown
base R
We can look at a simple table with:
table(quux)
# gender
# response Female Male Prefer not to say Unknown
# Non-smoker 330 294 323 312
# Occasional smoker 308 344 287 325
# Prefer not to say 292 337 310 304
# Regular smoker 309 308 311 306
For future verification, the sum of the first column (Female) is 1239, and the expected column-wise percentages for that are
c(330, 308, 292, 309) / 1239
# [1] 0.2663438 0.2485876 0.2356739 0.2493947
We can get the percentages with
round(100 * proportions(table(quux), margin = 2), 2)
# gender
# response Female Male Prefer not to say Unknown
# Non-smoker 26.63 22.92 26.24 25.02
# Occasional smoker 24.86 26.81 23.31 26.06
# Prefer not to say 23.57 26.27 25.18 24.38
# Regular smoker 24.94 24.01 25.26 24.54
Do get the right-most (Total) and bottom summary, we'll need to bind things.
tbl1 <- do.call(table, quux)
tbl2 <- 100 * proportions(tbl1, margin = 2)
tbl3 <- rbind(tbl2, `Number of Respondents` = colSums(tbl1))
tbl3
# Female Male Prefer not to say Unknown
# Non-smoker 26.63438 22.91504 26.23883 25.02005
# Occasional smoker 24.85876 26.81216 23.31438 26.06255
# Prefer not to say 23.56739 26.26656 25.18278 24.37851
# Regular smoker 24.93947 24.00624 25.26401 24.53889
# Number of Respondents 1239.00000 1283.00000 1231.00000 1247.00000
tbl4 <- cbind(tbl3, `Total %` = c(100 * proportions(rowSums(tbl1)), sum(tbl1)))
tbl4
# Female Male Prefer not to say Unknown Total %
# Non-smoker 26.63438 22.91504 26.23883 25.02005 25.18
# Occasional smoker 24.85876 26.81216 23.31438 26.06255 25.28
# Prefer not to say 23.56739 26.26656 25.18278 24.37851 24.86
# Regular smoker 24.93947 24.00624 25.26401 24.53889 24.68
# Number of Respondents 1239.00000 1283.00000 1231.00000 1247.00000 5000.00
And we can round the numbers:
round(tbl4, 1)
# Female Male Prefer not to say Unknown Total %
# Non-smoker 26.6 22.9 26.2 25.0 25.2
# Occasional smoker 24.9 26.8 23.3 26.1 25.3
# Prefer not to say 23.6 26.3 25.2 24.4 24.9
# Regular smoker 24.9 24.0 25.3 24.5 24.7
# Number of Respondents 1239.0 1283.0 1231.0 1247.0 5000.0
dplyr
library(dplyr)
library(tidyr) # pivot_wider
tbl1 <- tibble(quux) %>%
count(response, gender) %>%
pivot_wider(response, names_from = gender, values_from = n)
tbl1
# # A tibble: 4 × 5
# response Female Male `Prefer not to say` Unknown
# <chr> <int> <int> <int> <int>
# 1 Non-smoker 330 294 323 312
# 2 Occasional smoker 308 344 287 325
# 3 Prefer not to say 292 337 310 304
# 4 Regular smoker 309 308 311 306
tbl2 <- tbl1 %>%
summarize(
response = "Number of Respondents",
across(-response, ~ sum(.)),
`Total %` = sum(tbl1[,-1])
)
tbl2
# # A tibble: 1 × 6
# response Female Male `Prefer not to say` Unknown `Total %`
# <chr> <int> <int> <int> <int> <int>
# 1 Number of Respondents 1239 1283 1231 1247 5000
tbl1 %>%
mutate(
across(Female:Unknown, ~ 100 * . / sum(.)),
`Total %` = rowSums(tbl1[,-1]),
`Total %` = 100 * `Total %` / sum(`Total %`)
) %>%
bind_rows(tbl2)
# # A tibble: 5 × 6
# response Female Male `Prefer not to say` Unknown `Total %`
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 Non-smoker 26.6 22.9 26.2 25.0 25.2
# 2 Occasional smoker 24.9 26.8 23.3 26.1 25.3
# 3 Prefer not to say 23.6 26.3 25.2 24.4 24.9
# 4 Regular smoker 24.9 24.0 25.3 24.5 24.7
# 5 Number of Respondents 1239 1283 1231 1247 5000
All,
Thanks in advance. I have this school dataset. Each category (in Category column) has a range number of students (e.g., from 30 to 60 students), so I need to calculate:
the total number of classrooms that fall in each category (from category 1 to category 4), and
the percentage of classrooms that fall in the category.
For example, how many classrooms (NumOfClassrooms column) fall in Category_4, and what's the percentage of those classrooms to the total classrooms? Here is an illustrative example for my question:
ID = 1:1050
District = rep(c("AR", "CO", "AL", "KS", "IN", "ME", "KY", "ME", "MN", "NJ"), times = c(80, 120, 100, 110, 120, 100, 100, 120, 100, 100))
schoolName = randomNames::randomNames(1050, ethnicity = 5 ,which.names = "last")
Grade = rep(c("First", "Second", "Third", "Fourth"), times = c(400, 300, 200, 150))
NumOfClassrooms = sample(1:6)
StudentNumber = sample(1:90, 5)
AverageNumOfStudents = StudentNumber/NumOfClassrooms
Category = ifelse(AverageNumOfStudents > 0 & AverageNumOfStudents < 10, "category_1",
ifelse(AverageNumOfStudents >=10 & AverageNumOfStudents < 30, "category_2",
ifelse(AverageNumOfStudents >=30 & AverageNumOfStudents <= 60, "category_3",
ifelse(AverageNumOfStudents > 60 , "category_4", "NA"))))
dat = data.frame(ID, schoolName, Grade, NumOfClassrooms, StudentNumber, AverageNumOfStudents, Category)
Finally, I need to divide the results based on the "District" column into separate excel files using the following code (it should work fine once I get the above two steps).
Final_Divide = Final_df %>%
dplyr::group_by(District) %>%
dplyr::ungroup()
list_data <- split(Final_Divide,
Final_Divide$District)
options(digits=3)
Map(openxlsx::write.xlsx, list_data, paste0(names(list_data), '.xlsx'))
Thank you very much in advance.
Setting a random seed before your code for reproducibility:
set.seed(42)
# Your code creating dat
Table1 <- xtabs(NumOfClassrooms~Category, dat)
Table1
# Category
# category_1 category_2 category_4
# 1925 1575 175
Table2 <- prop.table(Table1)
round(Table2, 4) # Proportions
# Category
# category_1 category_2 category_4
# 0.5238 0.4286 0.0476
round(Table2 * 100, 2) # Percent
# Category
# category_1 category_2 category_4
# 52.38 42.86 4.76
If we include District in dat:
dat <- data.frame(ID, District, schoolName, Grade, NumOfClassrooms, StudentNumber, AverageNumOfStudents, Category)
Table3 <- xtabs(NumOfClassrooms~District+Category, dat)
addmargins(Table3)
# Category
# District category_1 category_2 category_4 Sum
# AL 187 149 16 352
# AR 143 121 14 278
# CO 220 180 20 420
# IN 220 180 20 420
# KS 198 166 19 383
# KY 187 148 17 352
# ME 407 329 36 772
# MN 176 153 17 346
# NJ 187 149 16 352
# Sum 1925 1575 175 3675
For row percentages by District:
round(prop.table(Table3, 1) * 100, 2)
# Category
# District category_1 category_2 category_4
# AL 53.12 42.33 4.55
# AR 51.44 43.53 5.04
# CO 52.38 42.86 4.76
# IN 52.38 42.86 4.76
# KS 51.70 43.34 4.96
# KY 53.12 42.05 4.83
# ME 52.72 42.62 4.66
# MN 50.87 44.22 4.91
# NJ 53.12 42.33 4.55
Here's a possible solution using the tidyverse
dat %>%
mutate("Total Classrooms" = n()) %>%
group_by(Category) %>%
mutate("Number of Classrooms in Category" = n(),
"Category Percentage" = `Number of Classrooms in Category`/`Total Classrooms` * 100)
This will give us:
# Groups: Category [3]
ID District schoolName Grade NumOfClassrooms StudentNumber AverageNumOfStude~ Category `Total Classroom~ `Number of Classrooms in~ `Category Percent~
<int> <chr> <chr> <chr> <int> <int> <dbl> <chr> <int> <int> <dbl>
1 1 AR Svyatetskiy First 5 87 17.4 category~ 1050 525 50
2 2 AR Booco First 1 79 79 category~ 1050 175 16.7
3 3 AR Jones First 6 49 8.17 category~ 1050 350 33.3
4 4 AR Sapkin First 3 5 1.67 category~ 1050 350 33.3
5 5 AR Fosse First 2 35 17.5 category~ 1050 525 50
6 6 AR Vanwagenen First 4 87 21.8 category~ 1050 525 50
7 7 AR Orth First 5 79 17.4 category~ 1050 525 50
8 8 AR Moline First 1 49 79 category~ 1050 175 16.7
9 9 AR Bradford First 6 5 8.17 category~ 1050 350 33.3
10 10 AR Wollman First 3 35 1.67 category~ 1050 350 33.3
# ... with 1,040 more rows
If you need a separate table of just the category/# classrooms/percentage data:
dat %>%
mutate("Total Classrooms" = n()) %>%
group_by(Category) %>%
mutate("Number of Classrooms in Category" = n(),
"Category Percentage" = `Number of Classrooms in Category`/`Total Classrooms` * 100) %>%
select(Category, "Number of Classrooms in Category", "Category Percentage") %>%
unique()
This gives us:
# A tibble: 3 x 3
# Groups: Category [3]
Category `Number of Classrooms in Category` `Category Percentage`
<chr> <int> <dbl>
1 category_2 525 50
2 category_4 175 16.7
3 category_1 350 33.3
Note that in your post, this code is a bit redundant:
Final_Divide = Final_df %>%
dplyr::group_by(District) %>%
dplyr::ungroup()
If you group and then immediately ungroup, you're actually just doing this:
Final_Divide <- Final_df
You could also consider adding split(.$District) to transform your data into a list all in one chunk of code:
dat %>%
mutate("Total Classrooms" = n()) %>%
group_by(Category) %>%
mutate("Number of Classrooms in Category" = n(),
"Category Percentage" = `Number of Classrooms in Category`/`Total Classrooms` * 100) %>%
split(.$District)
Here is some sample data:
movie_df <- data.frame("ID" = c(1,2,3,4,5,6,7,8,9,10),
"movie_type" = c("Action", "Horror", "Comedy", "Thriller", "Comedy",
"Action","Thriller", "Horror", "Action", "Comedy"),
"snack_type" = c("Chocolate", "Popcorn", "Candy", "Popcorn", "Popcorn",
"Candy","Chocolate", "Candy", "Popcorn", "Chocolate"),
"event_type" = c("Solo", "Family", "Date", "Friends", "Solo",
"Family","Date", "Date", "Friends", "Friends"),
"total_cost" = c(50, 35, 20, 50, 30,
60, 25, 35, 20, 50))
What I want to do is go through each column and compare each group to the rest of the groups on total_cost. For example, I want to see how movie_type == 'Action' compares to movie_type != 'Action' for total_cost. I want to do that for every type in movie_type then every type in snack_type and event_type.
What I ultimately want to get to is this where sd = Standard Deviation. Ideally this will be done by a tidyverse method in R (e.g. dplyr or tidyr):
> results_df
# A tibble: 11 x 11
Group Grp_1 Grp_2 Grp_1_mean Grp_2_mean Grp_1_sd Grp_2_sd Grp_1_n Grp_2_n Mean_Diff `t-test`
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 movie_type Action Rest of group 43.3 35 20.8 11.5 3 7 8.33 2.84
2 movie_type Horror Rest of group 35 38.1 0 16.0 2 8 -3.12 -2.21
3 movie_type Thriller Rest of group 37.5 37.5 17.7 14.6 2 8 0 0
4 movie_type Comedy Rest of group 33.3 39.3 15.3 14.6 3 7 -5.95 -2.22
5 snack_type Chocolate Rest of group 41.7 35.7 14.4 14.8 3 7 5.95 2.26
6 snack_type Candy Rest of group 38.3 37.1 20.2 12.9 3 7 1.19 0.407
7 snack_type Popcorn Rest of group 33.8 40 12.5 15.8 4 6 -6.25 -2.60
8 event_type Date Rest of group 26.7 42.1 7.64 14.1 3 7 -15.5 -7.25
9 event_type Family Rest of group 47.5 35 17.7 13.4 2 8 12.5 3.86
10 event_type Friends Rest of group 40 36.4 17.3 14.1 3 7 3.57 1.28
11 event_type Solo Rest of group 40 36.9 14.1 15.1 2 8 3.12 1.04
It's same logic as Daniel did using purrr::map and purrr::map2.
library(dplyr)
library(tibble)
library(purrr)
library(stringr)
needed_cols <- c("movie_type", "snack_type", "event_type")
new_names <- 1:2 %>%
map(~str_c(c("group", "mean", "sd", "n"), "_", .x)) %>%
unlist()
my_data <- needed_cols %>%
map(function(df_c)
map(unique(movie_df[[df_c]]),
function(v){
df <- movie_df %>%
mutate(group = ifelse(get(df_c) == v, v, "rest_of_group")) %>%
group_by(group) %>%
summarize(mean = mean(total_cost), sd = sd(total_cost), n = n()) %>%
.[match(.$group, c(v, "rest_of_group")),]
df <- bind_cols(df[1, ], df[2,])
names(df) <- new_names
df
}
)
) %>%
map2(needed_cols, ~bind_rows(.x) %>% mutate(group = .y)) %>%
bind_rows() %>%
select(
str_subset(names(.), "group") %>% sort(),
str_subset(names(.), "mean"),
str_subset(names(.), "sd"),
str_subset(names(.), "n")
) %>%
mutate(mean_diff = mean_1 - mean_2)
Sorry its not in pipes, but in Base R we can:
results_df <- do.call(rbind,unlist(
apply(movie_df[,2:4],2,function(u)
lapply(unique(u), function(x)
data.frame(
group1 = as.character(x),
group2 = "rest",
grp1_mean = mean(movie_df$total_cost[u == x]),
grp2_mean = mean(movie_df$total_cost[u != x]),
grp1_sd = sd(movie_df$total_cost[u == x]),
grp2_sd = sd(movie_df$total_cost[u != x])
)
)
),recursive=F)
)
#add mean differences
results_df$meandiff <- with(results_df, grp1_mean - grp2_mean)
> results_df
group1 group2 grp1_mean grp2_mean grp1_sd grp2_sd meandiff
movie_type1 Action rest 43.33333 35.00000 20.816660 11.54701 8.333333
movie_type2 Horror rest 35.00000 38.12500 0.000000 16.02175 -3.125000
movie_type3 Comedy rest 33.33333 39.28571 15.275252 14.55695 -5.952381
movie_type4 Thriller rest 37.50000 37.50000 17.677670 14.63850 0.000000
snack_type1 Chocolate rest 41.66667 35.71429 14.433757 14.84042 5.952381
snack_type2 Popcorn rest 33.75000 40.00000 12.500000 15.81139 -6.250000
snack_type3 Candy rest 38.33333 37.14286 20.207259 12.86375 1.190476
event_type1 Solo rest 40.00000 36.87500 14.142136 15.10381 3.125000
event_type2 Family rest 47.50000 35.00000 17.677670 13.36306 12.500000
event_type3 Date rest 26.66667 42.14286 7.637626 14.09998 -15.476190
event_type4 Friends rest 40.00000 36.42857 17.320508 14.05770 3.571429