R: Summarizing Data At Multiple Levels - r

I am working with the R programming language.
I have the following dataset about people with their weights and asthma (1 = yes, 0 = no):
library(dplyr)
library(purrr)
library(ggplot2)
# Reproducible simulation. For each gender, 500 draws from N(100, 100) with a
# 30% asthma probability are stacked on 500 draws from N(200, 50) with a 70%
# asthma probability.
set.seed(123)

# Male rows.
my_data1 <- data.frame(
  Weight = rnorm(500, mean = 100, sd = 100),
  asthma = sample(c(0, 1), size = 500, replace = TRUE, prob = c(0.7, 0.3))
)
my_data2 <- data.frame(
  Weight = rnorm(500, mean = 200, sd = 50),
  asthma = sample(c(0, 1), size = 500, replace = TRUE, prob = c(0.3, 0.7))
)
my_data_a <- rbind(my_data1, my_data2)
my_data_a$gender <- "male"

# Female rows: same recipe, fresh random draws.
my_data1 <- data.frame(
  Weight = rnorm(500, mean = 100, sd = 100),
  asthma = sample(c(0, 1), size = 500, replace = TRUE, prob = c(0.7, 0.3))
)
my_data2 <- data.frame(
  Weight = rnorm(500, mean = 200, sd = 50),
  asthma = sample(c(0, 1), size = 500, replace = TRUE, prob = c(0.3, 0.7))
)
my_data_b <- rbind(my_data1, my_data2)
my_data_b$gender <- "female"

# Combine both genders and number the rows 1..2000.
my_data <- rbind(my_data_a, my_data_b)
my_data$id <- seq_len(nrow(my_data))
My Question: For both genders, I would like to "bin" people in this dataset into "n" bins (e.g. n = 30) in ascending order based on the available weight ranges (e.g. min_weight_men : min_weight_men+ 30 = bin_1_men, min_weight_women : min_weight_women+ 30 = bin_1_women, min_weight_men+ 30 : min_weight_men+ 60 = bin_2_men, etc.) - and then find out how many people in each bin, as well as the min weight and max weight for each bin.
My Attempt: I tried to do this with the following code:
# Step 1: within each gender, cut Weight into intervals and label every row
# with its bin. pretty() picks rounded break values, so the interval width is
# not necessarily 30 (the printed result below shows 50-wide bins).
Part_1 <- my_data %>%
  group_by(gender) %>%
  mutate(
    bins = cut(
      Weight,
      breaks = pretty(Weight, n = (max(Weight) - min(Weight)) / 30),
      include.lowest = TRUE
    )
  ) %>%
  mutate(rank = dense_rank(bins)) %>%
  mutate(new_bins = paste0(rank, "_", gender))

# Step 2: one row per gender/bin with the observed weight range and head count.
Part_2 <- Part_1 %>%
  group_by(gender, bins) %>%
  summarize(
    min_weight = min(Weight),
    max_weight = max(Weight),
    count = n()
  )

# Step 3: attach the per-bin summary back onto every individual row.
Part_3 <- merge(Part_1, Part_2, by = c("gender", "bins"))
While the results are in the format that I want, I am not sure if I have performed the calculations correctly:
> head(Part_3)
gender bins Weight asthma id rank new_bins min_weight max_weight count
1 female (-100,-50] -75.13021 0 1192 4 4_female -99.91774 -51.53241 23
2 female (-100,-50] -55.78222 0 1382 4 4_female -99.91774 -51.53241 23
3 female (-100,-50] -51.53241 0 1232 4 4_female -99.91774 -51.53241 23
4 female (-100,-50] -71.44877 1 1484 4 4_female -99.91774 -51.53241 23
5 female (-100,-50] -93.99402 1 1160 4 4_female -99.91774 -51.53241 23
6 female (-100,-50] -96.49823 0 1378 4 4_female -99.91774 -51.53241 23
Can someone please help me understand if I have done this correctly?
Thanks!
Note: Just to clarify - suppose weights for men are from 70kg to 150kg. I want bins such as bin_1_men = 70-100kg, bin_2_men = 100-130kg, etc. I am aware that this could result in some bins having significantly different counts.

Instead of doing this in 3 steps, it could be done in a single pipe, using mutate after grouping:
library(dplyr)
# Bin weights into fixed-width intervals that start at each gender's own
# minimum weight, then attach per-bin statistics to every row.
#
# pretty() is deliberately not used for the breaks: it rounds to "nice" values,
# so the bins would be neither 30 wide nor anchored at the group minimum.
bin_width <- 30

my_data %>%
  group_by(gender) %>%
  mutate(
    bins = cut(
      Weight,
      # The extra bin_width guarantees the last break is >= max(Weight).
      breaks = seq(min(Weight), max(Weight) + bin_width, by = bin_width),
      # Needed so the group minimum (equal to the first break) is kept.
      include.lowest = TRUE
    ),
    # Position of the interval among all breaks, so bin k always covers
    # (min + (k - 1) * bin_width, min + k * bin_width], even when an
    # intermediate bin is empty.
    rank = as.integer(bins),
    new_bins = paste0(rank, "_", gender)
  ) %>%
  group_by(gender, bins) %>%
  mutate(
    min_weight = min(Weight),
    max_weight = max(Weight),
    count = n()
  ) %>%
  ungroup()

Related

R: Calculating Quantiles with (group_by .add = TRUE)

I am working with the R programming language.
I have the following dataset:
set.seed(123)
library(dplyr)
# Simulated patient table with 5000 rows.
Patient_ID <- seq_len(5000)

# Categorical attributes, drawn with unequal probabilities.
gender <- sample(
  c("Male", "Female"),
  size = 5000, replace = TRUE, prob = c(0.45, 0.55)
)
Gender <- as.factor(gender)

status <- sample(
  c("Immigrant", "Citizen"),
  size = 5000, replace = TRUE, prob = c(0.3, 0.7)
)
Status <- as.factor(status)

# Numeric attributes.
Height <- rnorm(5000, mean = 150, sd = 10)
Weight <- rnorm(5000, mean = 90, sd = 10)
Hospital_Visits <- sample.int(20, size = 5000, replace = TRUE)

# Outcome: disease status.
disease <- sample(
  c("Yes", "No"),
  size = 5000, replace = TRUE, prob = c(0.4, 0.6)
)
Disease <- as.factor(disease)

# Assemble everything into one data frame.
my_data <- data.frame(
  Patient_ID, Gender, Status, Height, Weight, Hospital_Visits, Disease
)
Patient_ID Gender Status Height Weight Hospital_Visits Disease
1 1 Female Citizen 145.0583 113.70725 1 No
2 2 Male Immigrant 161.2759 88.33188 18 No
3 3 Female Immigrant 138.5305 99.26961 6 Yes
4 4 Male Citizen 164.8102 84.31848 12 No
5 5 Male Citizen 159.1619 92.25090 12 Yes
6 6 Female Citizen 153.3513 101.31986 11 Yes
In a previous question (R: Calculating Proportions Based on Nested Groups), I learned how to calculate "nested proportions" based on ntiles (e.g. calculate 3 ntiles for one variable, group by these 3 ntiles, then calculate 3 ntiles for the second variable within those groups, etc.):
# e.g. using 3 ntiles
# Nested tertiles: Height is split into 3 ntiles within Gender x Status,
# Weight into 3 ntiles within each Height ntile, and Hospital_Visits into 3
# ntiles within each Weight ntile. The final summary gives the share of
# Disease == "Yes" and the row count per innermost group.
#
# Each *_range is computed before its *_ntile joins the grouping, so it is the
# min-max of the enclosing group rather than of the individual ntile.
my_data %>%
  group_by(Gender, Status) %>%
  mutate(
    Height_ntile = ntile(Height, 3),
    Height_range = paste(min(Height), max(Height), sep = "-")
  ) %>%
  group_by(Height_ntile, Height_range, .add = TRUE) %>%
  mutate(
    Weight_ntile = ntile(Weight, 3),
    Weight_range = paste(min(Weight), max(Weight), sep = "-")
  ) %>%
  group_by(Weight_ntile, Weight_range, .add = TRUE) %>%
  mutate(
    Hospital_Visits_ntile = ntile(Hospital_Visits, 3),
    Hospital_range = paste(min(Hospital_Visits), max(Hospital_Visits), sep = "-")
  ) %>%
  group_by(Hospital_Visits_ntile, Hospital_range, .add = TRUE) %>%
  summarise(
    percent_disease = mean(Disease == "Yes"),
    count = n(),
    .groups = "drop"
  )
Now, I am trying to repeat this exact same function but using "quantiles" instead:
I tried to modify the above code - here is my attempt:
# Same nested grouping as the ntile version, but each variable is cut at its
# 33% and 67% sample quantiles. The -Inf / Inf outer breaks leave the first and
# last intervals open-ended, which is why labels such as "(-Inf, ...]" appear
# in the result.
my_data %>%
  group_by(Gender, Status) %>%
  mutate(
    Height_group = cut(
      Height,
      breaks = c(-Inf, quantile(Height, probs = c(0.33, 0.67)), Inf)
    ),
    Height_range = paste(min(Height), max(Height), sep = "-")
  ) %>%
  group_by(Height_group, Height_range, .add = TRUE) %>%
  mutate(
    Weight_group = cut(
      Weight,
      breaks = c(-Inf, quantile(Weight, probs = c(0.33, 0.67)), Inf)
    ),
    Weight_range = paste(min(Weight), max(Weight), sep = "-")
  ) %>%
  group_by(Weight_group, Weight_range, .add = TRUE) %>%
  mutate(
    Hospital_Visits_group = cut(
      Hospital_Visits,
      breaks = c(-Inf, quantile(Hospital_Visits, probs = c(0.33, 0.67)), Inf)
    ),
    Hospital_range = paste(min(Hospital_Visits), max(Hospital_Visits), sep = "-")
  ) %>%
  group_by(Hospital_Visits_group, Hospital_range, .add = TRUE) %>%
  summarise(
    percent_disease = mean(Disease == "Yes"),
    count = n(),
    .groups = "drop"
  )
This code runs, but I am not sure if I have done this correctly (e.g. the "infinite" values appearing):
A tibble: 108 x 10
Gender Status Height_~1 Heigh~2 Weigh~3 Weigh~4 Hospi~5 Hospi~6 perce~7
<fct> <fct> <fct> <chr> <fct> <chr> <fct> <chr> <dbl>
1 Female Citizen (-Inf,14~ 115.86~ (-Inf,~ 58.991~ (-Inf,~ 1-20 0.314
2 Female Citizen (-Inf,14~ 115.86~ (-Inf,~ 58.991~ (7,14] 1-20 0.458
Can someone please show me if I have done this correctly?
Thanks!
Answer based on insights provided by #akrun:
# Quantile-based nested grouping. as.integer() replaces each interval factor
# with its level index, so the group columns hold plain integers instead of
# "(-Inf, ...]"-style labels.
my_data %>%
  group_by(Gender, Status) %>%
  mutate(
    Height_group = as.integer(cut(
      Height,
      breaks = c(-Inf, quantile(Height, probs = c(0.33, 0.67)), Inf)
    )),
    Height_range = paste(min(Height), max(Height), sep = "-")
  ) %>%
  group_by(Height_group, Height_range, .add = TRUE) %>%
  mutate(
    Weight_group = as.integer(cut(
      Weight,
      breaks = c(-Inf, quantile(Weight, probs = c(0.33, 0.67)), Inf)
    )),
    Weight_range = paste(min(Weight), max(Weight), sep = "-")
  ) %>%
  group_by(Weight_group, Weight_range, .add = TRUE) %>%
  mutate(
    Hospital_Visits_group = as.integer(cut(
      Hospital_Visits,
      breaks = c(-Inf, quantile(Hospital_Visits, probs = c(0.33, 0.67)), Inf)
    )),
    Hospital_range = paste(min(Hospital_Visits), max(Hospital_Visits), sep = "-")
  ) %>%
  group_by(Hospital_Visits_group, Hospital_range, .add = TRUE) %>%
  summarise(
    percent_disease = mean(Disease == "Yes"),
    count = n(),
    .groups = "drop"
  )
Have I understood this correctly?

Summarize information by group in data table in R

I'm trying to get multiple summary statistics in R grouped by Team. I used code like below, but output is not what I want.
please point me in a better direction. Thanks!
# Example data: 30 people with a team, a gender and an age.
set.seed(77)
data <- data.frame(
  Team = sample(c("A", "B"), 30, replace = TRUE),
  gender = sample(c("female", "male"), 30, replace = TRUE),
  Age = sample(0:100, 30, replace = TRUE)
)

# Mean, sum and standard deviation of Age for every Team x gender combination.
dat <- data %>%
  group_by(Team, gender) %>%
  dplyr::summarize_all(list(
    my_mean = mean,
    my_sum = sum,
    my_sd = sd
  )) %>%
  as.data.frame()

# Per-team head count and Age statistics.
# n() takes no arguments; it counts the rows of the current group.
df <- data %>%
  group_by(Team) %>%
  summarize(
    total = n(),
    mean = mean(Age),
    Max_Age = max(Age),
    Min_Age = min(Age),
    sd = sd(Age)
  )
I want to get like this pic.
You may need to create the dataframe for the summary statistics of age per Team (age_summary in the example below) and that for the count of Team members per gender and Team (gender_summary in the example below), and then merge them into one dataframe (say summary_df).
library(tidyverse)
set.seed(77)
data <- data.frame(
  Team = sample(c("A", "B"), 30, replace = TRUE),
  gender = sample(c("female", "male"), 30, replace = TRUE),
  Age = sample(c(0:100), 30, replace = TRUE)
)

# Age statistics per team, transposed so that each statistic is a row and each
# team is a column.
age_summary <- data %>%
  group_by(Team) %>%
  summarize(
    mean = mean(Age),
    Max = max(Age),
    Min = min(Age),
    sd = sd(Age)
  ) %>%
  column_to_rownames("Team") %>%
  t() %>%
  as_tibble(
    rownames = "age_summary"
  )

# Head count per gender, one column per team.
gender_summary <- data %>%
  group_by(Team) %>%
  count(gender) %>%
  ungroup() %>%
  pivot_wider(names_from = Team, values_from = n)

# Stack the two tables. A join is deliberately avoided: the only shared columns
# are the team value columns, so joining would match rows whenever an age
# statistic happened to equal a gender count.
summary_df <- bind_rows(
  age_summary,
  gender_summary
) %>%
  mutate(
    # Rows that came from age_summary have no gender value.
    "item" = if_else(
      is.na(gender),
      "Age",
      "Sex"
    )
  ) %>%
  # Merge the two label columns into one; na.rm drops whichever one is NA.
  unite("summary", c(age_summary, gender), na.rm = TRUE, remove = FALSE) %>%
  relocate(item, .before = 1) %>%
  select(-c(age_summary, gender))
# # A tibble: 6 × 4
# item summary A B
# <chr> <chr> <dbl> <dbl>
# 1 Age mean 45.6 57.8
# 2 Age Max 92 82
# 3 Age Min 5 14
# 4 Age sd 30.1 22.1
# 5 Sex female 8 9
# 6 Sex male 7 6

How to easily generate/simulate example data with different groups for modelling

How to easily generate/simulate meaningful example data for modelling: e.g. telling that give me n rows of data, for 2 groups, their sex distributions and mean age should differ by X and Y units, respectively? Is there a simple way for doing it automatically? Any packages?
For example, what would be the simplest way for generating such data?
groups: two groups: A, B
sex: different sex distributions: A 30%, B 70%
age: different mean ages: A 50, B 70
PS! Tidyverse solutions are especially welcome.
My best try so far is still quite a lot of code:
# Total sample size; each subgroup receives a fixed share of it.
n <- 100

# One tibble per group x sex cell, each with its own age distribution,
# stacked into a single table.
d <- bind_rows(
  # group A females
  tibble(group = "A", sex = "Female", age = rnorm(n * 0.4, 50, 4)),
  # group B females
  tibble(group = "B", sex = "Female", age = rnorm(n * 0.3, 45, 4)),
  # group A males
  tibble(group = "A", sex = "Male", age = rnorm(n * 0.20, 60, 6)),
  # group B males
  tibble(group = "B", sex = "Male", age = rnorm(n * 0.10, 55, 4))
)

# Check the simulated cell sizes and mean ages.
d %>%
  group_by(group, sex) %>%
  summarise(
    n = n(),
    mean_age = mean(age)
  )
There are lots of ways to sample from vectors and to draw from random distributions in R. For example, the data set you requested could be created like this:
set.seed(69) # Makes samples reproducible

# 100 rows per group; sex is drawn with group-specific probabilities and age
# from group-specific uniform ranges.
df <- data.frame(
  groups = rep(c("A", "B"), each = 100),
  sex = c(
    sample(c("M", "F"), size = 100, replace = TRUE, prob = c(0.3, 0.7)),
    sample(c("M", "F"), size = 100, replace = TRUE, prob = c(0.5, 0.5))
  ),
  age = c(
    runif(100, min = 25, max = 75),
    runif(100, min = 50, max = 90)
  )
)
And we can use the tidyverse to show it does what was expected:
library(dplyr)
df %>%
  group_by(groups) %>%
  summarise(
    age = mean(age),
    # Share of males in percent. mean() of a logical is the proportion of
    # TRUE values, so this stays correct for any group size (a raw count only
    # equals the percentage when a group has exactly 100 rows).
    percent_male = 100 * mean(sex == "M")
  )
#> # A tibble: 2 x 3
#>   groups   age percent_male
#>   <chr>  <dbl>        <dbl>
#> 1 A       49.4           29
#> 2 B       71.0           50

Distilling summary statistics by numerical categories with dplyr

I have a large (rows > 200000) data frame with dozens of columns of data. I want to distill this data frame down and summarize the number of data that have variables that fall within given ranges.
For instance, if I have a data.frame that is similar to this:
# Reproducible example: 1000 rows with a uniform age and two normally
# distributed measurements.
set.seed(10)
df <- data.frame(
  age = runif(1000, min = 0, max = 4000),
  size = rnorm(1000, mean = 10, sd = 1),
  shape = rnorm(1000, mean = 1000, sd = 1000)
)
and I would like to group the data into a series of age ranges and get, for each of those age brackets, the number of samples, the mean size and shape, and the median size and shape.
Something like
summary.df <- data.frame( age.group = seq( 0, 3900, by = 100 ),
number = (number of samples in age bin),
mean = ( mean of data in age bin ) )
etc.
Right now I am doing this very bluntly by creating a new data.frame for each age group.
# One data frame per 100-wide age bracket (left-open, right-closed).
# which() drops rows where the condition is NA, as subset() does.
data.1 <- df[which(df$age > 0 & df$age <= 100), , drop = FALSE]
data.2 <- df[which(df$age > 100 & df$age <= 200), , drop = FALSE]
data.3 <- df[which(df$age > 200 & df$age <= 300), , drop = FALSE]
etc.
and then adding a categorical variable
# Tag every bracket with its upper bound (as a label) and its own row count.
data.1 <- cbind(data.1, age.group = "100", count.row = nrow(data.1))
data.2 <- cbind(data.2, age.group = "200", count.row = nrow(data.2))
data.3 <- cbind(data.3, age.group = "300", count.row = nrow(data.3))
adding them together
# Stack the per-bracket frames back into a single table.
data.big <- do.call(rbind, list(data.1, data.2, data.3))
and then generating summary stats via dplyr
# Per-bracket summary. count.row is constant within a bracket, so its mean
# simply recovers the stored row count.
data.summary <- data.big %>%
  group_by(age.group) %>%
  summarise(
    count.row = mean(count.row),
    mean = mean(size, na.rm = TRUE),
    median = median(size, na.rm = TRUE)
  )
How would I go about doing this more efficiently with just dplyr? I think there must be a way but I can't wrap my head around it.
Thanks for any help you can give!
You can make use of cut to divide the data in intervals of 100 and calculate summary statistics for each group.
library(dplyr)
# Replace age with its 100-wide bracket, then summarise size per bracket.
df %>%
  mutate(age = cut(age, breaks = seq(0, 4000, by = 100))) %>%
  group_by(age) %>%
  summarise(
    mean = mean(size, na.rm = TRUE),
    median = median(size, na.rm = TRUE)
  )
# age mean median
# <fct> <dbl> <dbl>
# 1 (0,100] 10.0 9.92
# 2 (100,200] 9.88 10.2
# 3 (200,300] 10.1 10.1
# 4 (300,400] 9.83 9.80
# 5 (400,500] 9.95 9.72
# 6 (500,600] 9.68 9.78
# 7 (600,700] 10.2 10.5
# 8 (700,800] 10.2 10.4
# 9 (800,900] 9.68 9.47
#10 (900,1e+03] 9.80 9.81
# … with 30 more rows

Finding maximum number from column for each day of whole year and creating a plot up to this number in R

There is a database of whole year:
Month Day Time X Y
...
3 1 0 2 4
3 1 1 4 2
3 1 2 7 3
3 1 3 8 8
3 1 4 4 6
3 1 5 1 4
3 1 6 6 6
3 1 7 7 9
...
3 2 0 5 7
3 2 1 7 2
3 2 2 9 3
...
4 1 0 2 8
...
I want to find maximum value of X for each day and create a plot for each day starting from beginning of the day (Time 0) up to this found maximum value. I tried to use dataframe but I got a bit lost and database is quite big so I'm not sure if this is the best idea.
Any ideas how to do it?
If I understood you correctly, this should work:
Sample dataset:
# Reproducible toy data: 30 random Month/Day/Time stamps with two measurements.
set.seed(123)
df <- data.frame(
  Month = sample(1:12, size = 30, replace = TRUE),
  Day = sample(1:31, size = 30, replace = TRUE),
  Time = sample(1:24, size = 30, replace = TRUE),
  x = rnorm(30, mean = 10, sd = 5),
  y = rnorm(30, mean = 10, sd = 5)
)
Using tidyverse (ggplot and dplyr):
# library() fails loudly if the package is missing; require() would only
# return FALSE and let the pipeline fail later.
library(tidyverse)

df %>%
  # Grouping by month and day
  group_by(Month, Day) %>%
  # Per-day maxima for x and y, plus copies of x and y with anything above
  # the variable's own maximum blanked out. Each variable is compared with
  # its own maximum (x with maxX, y with maxY).
  mutate(
    maxX = max(x, na.rm = TRUE),
    maxY = max(y, na.rm = TRUE),
    plotX = ifelse(x > maxX, NA, x),
    plotY = ifelse(y > maxY, NA, y)
  ) %>%
  ungroup() %>%
  # Select and reshape only the variables needed for the plot:
  # one row per (Time, series) pair.
  select(Time, plotX, plotY) %>%
  pivot_longer(-Time, names_to = "plot", values_to = "value") %>%
  # Plot
  ggplot(aes(Time, value, color = plot)) +
  geom_point()
output:
You can try a tidyverse approach. Duplicated Times per Day and Month are removed without any ranking.
library(tidyverse)
# Smaller reproducible example: only 2 months x 2 days, so every facet of the
# plot receives several observations.
set.seed(123)
df <- data.frame(
  Month = sample(1:2, size = 30, replace = TRUE),
  Day = sample(1:2, size = 30, replace = TRUE),
  Time = sample(1:10, size = 30, replace = TRUE),
  x = rnorm(30, mean = 10, sd = 5),
  y = rnorm(30, mean = 10, sd = 5)
)
df %>%
  group_by(Month, Day) %>%
  # Remove duplicated Times: keep the first row per Time within a Month/Day.
  filter(!duplicated(Time)) %>%
  # Keep the observations up to and including the time of the group's maximum
  # x. which.max() always returns a single index, so a tied maximum cannot
  # produce a multi-element right-hand side that would be silently recycled.
  filter(Time <= Time[which.max(x)]) %>%
  ggplot(aes(Time, x)) +
  geom_line() +
  # Mark the maximum point(s).
  geom_point(data = . %>% filter(x == max(x))) +
  facet_grid(Month ~ Day, labeller = label_both)
Or try to put all in one plot using different colors
df %>%
  group_by(Month, Day) %>%
  # Keep the first row per Time within a Month/Day.
  filter(!duplicated(Time)) %>%
  # Keep the observations up to and including the time of the group's maximum
  # x. which.max() returns exactly one index even when the maximum is tied.
  filter(Time <= Time[which.max(x)]) %>%
  # One colour per Month/Day combination instead of one facet each.
  ggplot(aes(Time, x, color = interaction(Month, Day))) +
  geom_line() +
  # Mark the maximum point(s).
  geom_point(data = . %>% filter(x == max(x)))

Resources