I am working with the R programming language.
I have the following dataset:
set.seed(123)
library(dplyr)
Patient_ID = 1:5000
gender <- c("Male","Female")
gender <- sample(gender, 5000, replace=TRUE, prob=c(0.45, 0.55))
Gender <- as.factor(gender)
status <- c("Immigrant","Citizen")
status <- sample(status, 5000, replace=TRUE, prob=c(0.3, 0.7))
Status <- as.factor(status )
Height = rnorm(5000, 150, 10)
Weight = rnorm(5000, 90, 10)
Hospital_Visits = sample.int(20, 5000, replace = TRUE)
################
disease <- c("Yes","No")
disease <- sample(disease, 5000, replace=TRUE, prob=c(0.4, 0.6))
Disease <- as.factor(disease)
###################
my_data = data.frame(Patient_ID, Gender, Status, Height, Weight, Hospital_Visits, Disease)
Patient_ID Gender Status Height Weight Hospital_Visits Disease
1 1 Female Citizen 145.0583 113.70725 1 No
2 2 Male Immigrant 161.2759 88.33188 18 No
3 3 Female Immigrant 138.5305 99.26961 6 Yes
4 4 Male Citizen 164.8102 84.31848 12 No
5 5 Male Citizen 159.1619 92.25090 12 Yes
6 6 Female Citizen 153.3513 101.31986 11 Yes
In a previous question (R: Calculating Proportions Based on Nested Groups), I learned how to calculate "nested proportions" based on ntiles (e.g. calculate 3 ntiles for one variable, group by these 3 ntiles and then claculate 3 ntiles for the second variable based on these previous ntiles,etc.):
# e.g. using 3 ntiles
my_data %>%
group_by(Gender, Status) %>%
mutate(Height_ntile = ntile(Height, 3),
Height_range = paste(min(Height), max(Height), sep = "-")) %>%
group_by(Height_ntile, Height_range, .add = TRUE) %>%
mutate(Weight_ntile = ntile(Weight, 3),
Weight_range = paste(min(Weight), max(Weight), sep = "-")) %>%
group_by(Weight_ntile, Weight_range, .add = TRUE) %>%
mutate(Hospital_Visits_ntile = ntile(Hospital_Visits, 3),
Hospital_range = paste(min(Hospital_Visits), max(Hospital_Visits), sep = "-")) %>%
group_by(Hospital_Visits_ntile, Hospital_range, .add = TRUE) %>%
summarize(percent_disease = mean(Disease == "Yes"),
count = n(),
.groups = "drop")
Now, I am trying to repeat this exact same function but using "quantiles" instead:
I tried to modify the above code - here is my attempt:
my_data %>%
group_by(Gender, Status) %>%
mutate(Height_group = cut(Height, breaks = c(-Inf,
quantile(Height, c(0.33, 0.67)),
Inf)),
Height_range = paste(min(Height), max(Height), sep = "-")) %>%
group_by(Height_group, Height_range, .add = TRUE) %>%
mutate(Weight_group = cut(Weight, breaks = c(-Inf,
quantile(Weight, c(0.33, 0.67)),
Inf)),
Weight_range = paste(min(Weight), max(Weight), sep = "-")) %>%
group_by(Weight_group, Weight_range, .add = TRUE) %>%
mutate(Hospital_Visits_group = cut(Hospital_Visits, breaks = c(-Inf,
quantile(Hospital_Visits, c(0.33, 0.67)),
Inf)),
Hospital_range = paste(min(Hospital_Visits), max(Hospital_Visits), sep = "-")) %>%
group_by(Hospital_Visits_group, Hospital_range, .add = TRUE) %>%
summarize(percent_disease = mean(Disease == "Yes"),
count = n(),
.groups = "drop")
This code runs, but I am not sure if I have done this correctly (e.g. the "infinite" values appearing):
A tibble: 108 x 10
Gender Status Height_~1 Heigh~2 Weigh~3 Weigh~4 Hospi~5 Hospi~6 perce~7
<fct> <fct> <fct> <chr> <fct> <chr> <fct> <chr> <dbl>
1 Female Citizen (-Inf,14~ 115.86~ (-Inf,~ 58.991~ (-Inf,~ 1-20 0.314
2 Female Citizen (-Inf,14~ 115.86~ (-Inf,~ 58.991~ (7,14] 1-20 0.458
Can someone please show me if I have done this correctly?
Thanks!
Answer based on insights provided by #akrun:
my_data %>%
group_by(Gender, Status) %>%
mutate(Height_group = as.integer(cut(Height, breaks = c(-Inf,
quantile(Height, c(0.33, 0.67)),
Inf))),
Height_range = paste(min(Height), max(Height), sep = "-")) %>%
group_by(Height_group, Height_range, .add = TRUE) %>%
mutate(Weight_group = as.integer(cut(Weight, breaks = c(-Inf,
quantile(Weight, c(0.33, 0.67)),
Inf))),
Weight_range = paste(min(Weight), max(Weight), sep = "-")) %>%
group_by(Weight_group, Weight_range, .add = TRUE) %>%
mutate(Hospital_Visits_group = as.integer(cut(Hospital_Visits, breaks = c(-Inf,
quantile(Hospital_Visits, c(0.33, 0.67)),
Inf))),
Hospital_range = paste(min(Hospital_Visits), max(Hospital_Visits), sep = "-")) %>%
group_by(Hospital_Visits_group, Hospital_range, .add = TRUE) %>%
summarize(percent_disease = mean(Disease == "Yes"),
count = n(),
.groups = "drop")
Have I understood this correctly?
Related
I am working with the R programming language.
I have the following dataset about people with their weights and asthma (1 = yes, 0 = no):
library(dplyr)
library(purrr)
library(ggplot2)
set.seed(123)
my_data1 = data.frame(Weight = rnorm(500,100,100), asthma = sample(c(0,1), prob = c(0.7,0.3), replace=TRUE, size= 500))
my_data2 = data.frame(Weight = rnorm(500, 200, 50), asthma = sample(c(0,1), prob = c(0.3,0.7), replace=TRUE, size= 500))
my_data_a = rbind(my_data1, my_data2)
my_data_a$gender = "male"
my_data1 = data.frame(Weight = rnorm(500,100,100), asthma = sample(c(0,1), prob = c(0.7,0.3), replace=TRUE, size= 500))
my_data2 = data.frame(Weight = rnorm(500, 200, 50), asthma = sample(c(0,1), prob = c(0.3,0.7), replace=TRUE, size= 500))
my_data_b = rbind(my_data1, my_data2)
my_data_b$gender = "female"
my_data = rbind(my_data_a, my_data_b)
my_data$id = 1:2000
My Question: For both genders, I would like to "bin" people in this dataset into "n" bins (e.g. n = 30) in ascending order based on the available weight ranges (e.g. min_weight_men : min_weight_men+ 30 = bin_1_men, min_weight_women : min_weight_women+ 30 = bin_1_women, min_weight_men+ 30 : min_weight_men+ 60 = bin_2_men, etc.) - and then find out how many people in each bin, as well as the min weight and max weight for each bin.
My Attempt: I tried to do this with the following code:
Part_1 = my_data %>% group_by(gender) %>%
mutate(bins = cut(Weight , breaks = pretty(Weight , n = (max(Weight)-min(Weight))/30), include.lowest = TRUE)) %>%
mutate(rank = dense_rank(bins)) %>%
mutate(new_bins = paste(rank,"_", gender, sep=""))
Part_2 = Part_1 %>% group_by(gender, bins) %>%
summarize(min_weight = min(Weight), max_weight = max(Weight), count = n())
Part_3 = merge(x=Part_1,y=Part_2, by.x=c("gender","bins"), by.y=c("gender","bins"))
While the result are in the format that I want - I am not sure if I have performed the calculations correctly:
> head(Part_3)
gender bins Weight asthma id rank new_bins min_weight max_weight count
1 female (-100,-50] -75.13021 0 1192 4 4_female -99.91774 -51.53241 23
2 female (-100,-50] -55.78222 0 1382 4 4_female -99.91774 -51.53241 23
3 female (-100,-50] -51.53241 0 1232 4 4_female -99.91774 -51.53241 23
4 female (-100,-50] -71.44877 1 1484 4 4_female -99.91774 -51.53241 23
5 female (-100,-50] -93.99402 1 1160 4 4_female -99.91774 -51.53241 23
6 female (-100,-50] -96.49823 0 1378 4 4_female -99.91774 -51.53241 23
Can someone please help me understand if I have done this correctly?
Thanks!
Note: Just to clarify - suppose weights for men are from 70kg to 150kg. I want bins such as bin_1_men = 70-100kg, bin_2_men = 100-130kg, etc. I am aware that this could result in some bins having significantly different counts.
Instead of doing this in 3 steps, could be done in a single pipe with mutate after grouping
library(dplyr)
my_data %>%
group_by(gender) %>%
mutate(bins = cut(Weight , breaks = pretty(Weight ,
n = (max(Weight)-min(Weight))/30), include.lowest = TRUE),
rank = dense_rank(bins),
new_bins = paste(rank,"_", gender, sep="")) %>%
group_by(gender, bins) %>%
mutate(min_weight = min(Weight), max_weight = max(Weight),
count = n()) %>%
ungroup
I'm trying to get multiple summary statistics in R grouped by Team. I used code like below, but output is not what I want.
please point me in a better direction. Thanks!
set.seed(77)
data <- data.frame(Team =sample(c("A","B"),30, replace=TRUE),
gender=sample(c("female","male"),30, replace=TRUE),
Age =sample(c(0:100),30, replace=T))
dat <- data %>%
group_by(Team, gender) %>%
dplyr::summarize_all(list(my_mean = mean,
my_sum = sum,
my_sd = sd)) %>%
as.data.frame()
df <- data %>%
group_by(Team) %>%
summarize(total = n(gender),
mean = mean(Age),
Max_Age = max(Age),
Min_Age = min(Age),
sd = sd(Age),
)
I want to get like this pic.
You may need to create the dataframe for the summary statistics of age per Team (age_summary in the example below) and that for the count of Team members per gender and Team (gender_summary in the example below), and then merge them into one dataframe (say summary_df).
library(tidyverse)
set.seed(77)
data <- data.frame(
Team = sample(c("A", "B"), 30, replace = TRUE),
gender = sample(c("female", "male"), 30, replace = TRUE),
Age = sample(c(0:100), 30, replace = T)
)
age_summary <- data %>%
group_by(Team) %>%
summarize(
mean = mean(Age),
Max = max(Age),
Min = min(Age),
sd = sd(Age)
) %>%
column_to_rownames("Team") %>%
t() %>%
as_tibble(
rownames = "age_summary"
)
gender_summary <- data %>%
group_by(Team) %>%
count(gender) %>%
ungroup() %>%
pivot_wider(names_from = Team, values_from = n)
summary_df <- full_join(
age_summary,
gender_summary
) %>%
mutate(
"item" = if_else(
is.na(gender),
"Age",
"Sex"
)
) %>%
unite("summary", c(age_summary, gender), na.rm = TRUE, remove = FALSE) %>%
relocate(item, .before = 1) %>%
select(-c(age_summary, gender))
# # A tibble: 6 × 4
# item summary A B
# <chr> <chr> <dbl> <dbl>
# 1 Age mean 45.6 57.8
# 2 Age Max 92 82
# 3 Age Min 5 14
# 4 Age sd 30.1 22.1
# 5 Sex female 8 9
# 6 Sex male 7 6
df <- read.csv ('https://raw.githubusercontent.com/ulklc/covid19-
timeseries/master/countryReport/raw/rawReport.csv',
stringsAsFactors = FALSE)
df8 <- read.csv ('https://raw.githubusercontent.com/hirenvadher954/Worldometers-
Scraping/master/countries.csv',
stringsAsFactors = FALSE)
library(tidyverse)
df %>%
left_join(df8, by = c("countryName" = "country_name")) %>%
mutate(population = as.numeric(str_remove_all(population, ","))) %>%
group_by(countryName) %>%
group_by(countryName) %>%
unique() %>%
summarize(population = sum(population, na.rm = TRUE),
confirmed = sum(confirmed, na.rm = TRUE),
recovered = sum(recovered, na.rm = TRUE),
death = sum(death, na.rm = TRUE),
death_prop = paste0(as.character(death), "/", as.character(population)),
confirmed_prop = paste0(as.character(confirmed), "/", as.character(population)),
recovered_prop = paste0(as.character(recovered), "/", as.character(population)),
)
population / death ratio is calculated in this code.
what I want to do is
Finding 10 countries with the highest population / death ratio.
as output;
counrtyName death population rate
İtaly 19000 50000000000 19/50000000
spain 17000 60000000000 17/60000000
....
.....
....
....
....
The examples I have given are not real data.
examples do not reflect reality.
I'm in the process of creating summaries tables based on subgroups and would love to add an overall summary in a tidyer/more efficient manner.
What I have so far is this. I've created summaries via levels within my factor variables.
library(tidyverse)
df <- data.frame(var1 = 10:18,
var2 = c("A","B","A","B","A","B","A","B","A"))
group_summary <- df %>% group_by(var2) %>%
filter(var2 != "NA") %>%
summarise("Max" = max(var1, na.rm = TRUE),
"Median" = median(var1, na.rm = TRUE),
"Min" = min(var1, na.rm = TRUE),
"IQR" = IQR(var1, na.rm = TRUE),
"Count" = n())
Next I created an overall summary.
Summary <- df %>%
filter(var2 != "NA") %>%
summarise("Max" = max(var1, na.rm = TRUE),
"Median" = median(var1, na.rm = TRUE),
"Min" = min(var1, na.rm = TRUE),
"IQR" = IQR(var1, na.rm = TRUE),
"Count" = n())
Finally, I bound the two objects with dplyr::bind_rows
complete_summary <- bind_rows(Summary, group_summary)
What I've done works but it is very, very verbose and can't be the most efficient way. I tried to use ungroup
group_summary <- df %>% group_by(var2) %>%
filter(var2 != "NA") %>%
summarise("Max" = max(var1, na.rm = TRUE),
"Median" = median(var1, na.rm = TRUE),
"Min" = min(var1, na.rm = TRUE),
"IQR" = IQR(var1, na.rm = TRUE),
"Count" = n()) %>% ungroup %>%
summarise("Max" = max(var1, na.rm = TRUE),
"Median" = median(var1, na.rm = TRUE),
"Min" = min(var1, na.rm = TRUE),
"IQR" = IQR(var1, na.rm = TRUE),
"Count" = n())
but it threw an error:
Evaluation error: object var1 not found.
Thanks in advance for your assistance.
Ideally, if you want to do it in one-chain, this is how you can do by using bind_rows to combine both the results, just like you've done - but removing the temporary objects you created.
library(tidyverse)
#> Warning: package 'tibble' was built under R version 3.5.2
df <- data.frame(var1 = 10:18,
var2 = c("A","B","A","B","A","B","A","B","A"))
df %>% group_by(var2) %>%
filter(var2 != "NA") %>%
summarise("Max" = max(var1, na.rm = TRUE),
"Median" = median(var1, na.rm = TRUE),
"Min" = min(var1, na.rm = TRUE),
"IQR" = IQR(var1, na.rm = TRUE),
"Count" = n()) %>% #ungroup() %>%
bind_rows( df %>% summarise("Max" = max(var1, na.rm = TRUE),
"Median" = median(var1, na.rm = TRUE),
"Min" = min(var1, na.rm = TRUE),
"IQR" = IQR(var1, na.rm = TRUE),
"Count" = n()))
#> # A tibble: 3 x 6
#> var2 Max Median Min IQR Count
#> <fct> <dbl> <dbl> <dbl> <dbl> <int>
#> 1 A 18 14 10 4 5
#> 2 B 17 14 11 3 4
#> 3 <NA> 18 14 10 4 9
Created on 2019-01-29 by the reprex package (v0.2.1)
Not the most elegant solution either, but simple:
c <- mtcars %>%
mutate(total_mean = mean(wt),
total_median = median(wt)) %>%
group_by(cyl) %>%
summarise(meanweight = mean(wt),
medianweight = median(wt),
total_mean = first(total_mean),
total_median = first(total_median))
I'm trying to reproduce the framework from this blogpost http://www.luishusier.com/2017/09/28/balance/ with the following code but it looks like I get inconsistent results
library(tidyverse)
library(magrittr)
ids <- c("1617", "1516", "1415", "1314", "1213", "1112", "1011", "0910", "0809", "0708", "0607", "0506")
data <- ids %>%
map(function(i) {read_csv(paste0("http://www.football-data.co.uk/mmz4281/", i ,"/F1.csv")) %>%
select(Date:AST) %>%
mutate(season = i)})
data <- bind_rows(data)
data <- data[complete.cases(data[ , 1:3]), ]
tmp1 <- data %>%
select(season, HomeTeam, FTHG:FTR,HS:AST) %>%
rename(BP = FTHG,
BC = FTAG,
TP = HS,
TC = AS,
TCP = HST,
TCC = AST,
team = HomeTeam)%>%
mutate(Pts = ifelse(FTR == "H", 3, ifelse(FTR == "A", 0, 1)),
Terrain = "Domicile")
tmp2 <- data %>%
select(season, AwayTeam, FTHG:FTR, HS:AST) %>%
rename(BP = FTAG,
BC = FTHG,
TP = AS,
TC = HS,
TCP = AST,
TCC = HST,
team = AwayTeam)%>%
mutate(Pts = ifelse(FTR == "A", 3 ,ifelse(FTR == "H", 0 , 1)),
Terrain = "Extérieur")
tmp3 <- bind_rows(tmp1, tmp2)
l1_0517 <- tmp3 %>%
group_by(season, team)%>%
summarise(j = n(),
pts = sum(Pts),
diff_but = (sum(BP) - sum(BC)),
diff_t_ca = (sum(TCP, na.rm = T) - sum(TCC, na.rm = T)),
diff_t = (sum(TP, na.rm = T) - sum(TC, na.rm = T)),
but_p = sum(BP),
but_c = sum(BC),
tir_ca_p = sum(TCP, na.rm = T),
tir_ca_c = sum(TCC, na.rm = T),
tir_p = sum(TP, na.rm = T),
tir_c = sum(TC, na.rm = T)) %>%
arrange((season), desc(pts), desc(diff_but))
Then I apply the framework mentioned above:
l1_0517 <- l1_0517 %>%
mutate(
# First, see how many goals the team scores relative to the average
norm_attack = but_p %>% divide_by(mean(but_p)) %>%
# Then, transform it into an unconstrained scale
log(),
# First, see how many goals the team concedes relative to the average
norm_defense = but_c %>% divide_by(mean(but_c)) %>%
# Invert it, so a higher defense is better
raise_to_power(-1) %>%
# Then, transform it into an unconstrained scale
log(),
# Now that we have normalized attack and defense ratings, we can compute
# measures of quality and attacking balance
quality = norm_attack + norm_defense,
balance = norm_attack - norm_defense
) %>%
arrange(desc(norm_attack))
When I look at the column norm_attack, I expect to find the same value for equivalent but_p values, which is not the case here:
head(l1_0517, 10)
for instance when but_p has value 83, row 5 and row 7, I get norm_attack at 0.5612738 and 0.5128357 respectively.
Is it normal? I would expect mean(l1_0517$but_p) to be fixed and therefore obtaining the same result when a value of l1_0517$but_p is log normalised?
UPDATE
I have tried to work on a simpler example but I can't reproduce this issue:
df <- tibble(a = as.integer(runif(200, 15, 100)))
df <- df %>%
mutate(norm_a = a %>% divide_by(mean(a)) %>%
log())
I found the solution after looking at the type of l1_0517
It is a grouped_df hence the different results.
The correct code is:
l1_0517 <- tmp3 %>%
group_by(season, team)%>%
summarise(j = n(),
pts = sum(Pts),
diff_but = (sum(BP) - sum(BC)),
diff_t_ca = (sum(TCP, na.rm = T) - sum(TCC, na.rm = T)),
diff_t = (sum(TP, na.rm = T) - sum(TC, na.rm = T)),
but_p = sum(BP),
but_c = sum(BC),
tir_ca_p = sum(TCP, na.rm = T),
tir_ca_c = sum(TCC, na.rm = T),
tir_p = sum(TP, na.rm = T),
tir_c = sum(TC, na.rm = T)) %>%
ungroup() %>%
arrange((season), desc(pts), desc(diff_but))