Related
I am working with the R programming language.
I have the following dataset:
set.seed(123)
library(dplyr)
Patient_ID = 1:5000
gender <- c("Male","Female")
gender <- sample(gender, 5000, replace=TRUE, prob=c(0.45, 0.55))
Gender <- as.factor(gender)
status <- c("Immigrant","Citizen")
status <- sample(status, 5000, replace=TRUE, prob=c(0.3, 0.7))
Status <- as.factor(status )
Height = rnorm(5000, 150, 10)
Weight = rnorm(5000, 90, 10)
Hospital_Visits = sample.int(20, 5000, replace = TRUE)
################
disease <- c("Yes","No")
disease <- sample(disease, 5000, replace=TRUE, prob=c(0.4, 0.6))
Disease <- as.factor(disease)
###################
my_data = data.frame(Patient_ID, Gender, Status, Height, Weight, Hospital_Visits, Disease)
Patient_ID Gender Status Height Weight Hospital_Visits Disease
1 1 Female Citizen 145.0583 113.70725 1 No
2 2 Male Immigrant 161.2759 88.33188 18 No
3 3 Female Immigrant 138.5305 99.26961 6 Yes
4 4 Male Citizen 164.8102 84.31848 12 No
5 5 Male Citizen 159.1619 92.25090 12 Yes
6 6 Female Citizen 153.3513 101.31986 11 Yes
In a previous question (R: Calculating Proportions Based on Nested Groups), I learned how to calculate "nested proportions" based on ntiles (e.g. calculate 3 ntiles for one variable, group by these 3 ntiles and then claculate 3 ntiles for the second variable based on these previous ntiles,etc.):
# e.g. using 3 ntiles
my_data %>%
group_by(Gender, Status) %>%
mutate(Height_ntile = ntile(Height, 3),
Height_range = paste(min(Height), max(Height), sep = "-")) %>%
group_by(Height_ntile, Height_range, .add = TRUE) %>%
mutate(Weight_ntile = ntile(Weight, 3),
Weight_range = paste(min(Weight), max(Weight), sep = "-")) %>%
group_by(Weight_ntile, Weight_range, .add = TRUE) %>%
mutate(Hospital_Visits_ntile = ntile(Hospital_Visits, 3),
Hospital_range = paste(min(Hospital_Visits), max(Hospital_Visits), sep = "-")) %>%
group_by(Hospital_Visits_ntile, Hospital_range, .add = TRUE) %>%
summarize(percent_disease = mean(Disease == "Yes"),
count = n(),
.groups = "drop")
Now, I am trying to repeat this exact same function but using "quantiles" instead:
I tried to modify the above code - here is my attempt:
my_data %>%
group_by(Gender, Status) %>%
mutate(Height_group = cut(Height, breaks = c(-Inf,
quantile(Height, c(0.33, 0.67)),
Inf)),
Height_range = paste(min(Height), max(Height), sep = "-")) %>%
group_by(Height_group, Height_range, .add = TRUE) %>%
mutate(Weight_group = cut(Weight, breaks = c(-Inf,
quantile(Weight, c(0.33, 0.67)),
Inf)),
Weight_range = paste(min(Weight), max(Weight), sep = "-")) %>%
group_by(Weight_group, Weight_range, .add = TRUE) %>%
mutate(Hospital_Visits_group = cut(Hospital_Visits, breaks = c(-Inf,
quantile(Hospital_Visits, c(0.33, 0.67)),
Inf)),
Hospital_range = paste(min(Hospital_Visits), max(Hospital_Visits), sep = "-")) %>%
group_by(Hospital_Visits_group, Hospital_range, .add = TRUE) %>%
summarize(percent_disease = mean(Disease == "Yes"),
count = n(),
.groups = "drop")
This code runs, but I am not sure if I have done this correctly (e.g. the "infinite" values appearing):
A tibble: 108 x 10
Gender Status Height_~1 Heigh~2 Weigh~3 Weigh~4 Hospi~5 Hospi~6 perce~7
<fct> <fct> <fct> <chr> <fct> <chr> <fct> <chr> <dbl>
1 Female Citizen (-Inf,14~ 115.86~ (-Inf,~ 58.991~ (-Inf,~ 1-20 0.314
2 Female Citizen (-Inf,14~ 115.86~ (-Inf,~ 58.991~ (7,14] 1-20 0.458
Can someone please show me if I have done this correctly?
Thanks!
Answer based on insights provided by #akrun:
my_data %>%
group_by(Gender, Status) %>%
mutate(Height_group = as.integer(cut(Height, breaks = c(-Inf,
quantile(Height, c(0.33, 0.67)),
Inf))),
Height_range = paste(min(Height), max(Height), sep = "-")) %>%
group_by(Height_group, Height_range, .add = TRUE) %>%
mutate(Weight_group = as.integer(cut(Weight, breaks = c(-Inf,
quantile(Weight, c(0.33, 0.67)),
Inf))),
Weight_range = paste(min(Weight), max(Weight), sep = "-")) %>%
group_by(Weight_group, Weight_range, .add = TRUE) %>%
mutate(Hospital_Visits_group = as.integer(cut(Hospital_Visits, breaks = c(-Inf,
quantile(Hospital_Visits, c(0.33, 0.67)),
Inf))),
Hospital_range = paste(min(Hospital_Visits), max(Hospital_Visits), sep = "-")) %>%
group_by(Hospital_Visits_group, Hospital_range, .add = TRUE) %>%
summarize(percent_disease = mean(Disease == "Yes"),
count = n(),
.groups = "drop")
Have I understood this correctly?
If I want to get the mean and sum of all the numeric columns using the mtcars data set, I would use following codes:
group_by(gear) %>%
summarise(across(where(is.numeric), list(mean = mean, sum = sum)))
But if I have missing values in some of the columns, how do I take that into account? Here is a reproducible example:
test.df1 <- data.frame("Year" = sample(2018:2020, 20, replace = TRUE),
"Firm" = head(LETTERS, 5),
"Exporter"= sample(c("Yes", "No"), 20, replace = TRUE),
"Revenue" = sample(100:200, 20, replace = TRUE),
stringsAsFactors = FALSE)
test.df1 <- rbind(test.df1,
data.frame("Year" = c(2018, 2018),
"Firm" = c("Y", "Z"),
"Exporter" = c("Yes", "No"),
"Revenue" = c(NA, NA)))
test.df1 <- test.df1 %>% mutate(Profit = Revenue - sample(20:30, 22, replace = TRUE ))
test.df_summarized <- test.df1 %>% group_by(Firm) %>% summarize(across(where(is.numeric)), list(mean = mean, sum = sum)))
If I would just summarize each variable separately, I could use the following:
test.df1 %>% group_by(Firm) %>% summarize(Revenue_mean = mean(Revenue, na.rm = TRUE,
Profit_mean = mean(Profit, na.rm = TRUE)
But I am trying to figure out how can I tweak the code I wrote above for mtcars to the example data set I have provided here.
Because your functions all have a na.rm argument, you can pass it along with the ...
test.df1 %>% summarize(across(where(is.numeric), list(mean = mean, sum = sum), na.rm = TRUE))
# Year_mean Year_sum Revenue_mean Revenue_sum Profit_mean Profit_sum
# 1 2019.045 44419 162.35 3247 138.25 2765
(I left out the group_by because it's not specified properly in your code and the example is still well-illustrated without it. Also make sure that your functions are inside across().)
Just for the record, you could also do it like this (and this works when the different functions have different arguments)
test.df1 %>%
summarise(across(where(is.numeric),
list(
mean = ~ mean(.x, na.rm = T),
sum = ~ sum(.x, na.rm = T))
)
)
# Year_mean Year_sum Revenue_mean Revenue_sum Profit_mean Profit_sum
# 1 2019.045 44419 144.05 2881 119.3 2386
I'm in the process of creating summaries tables based on subgroups and would love to add an overall summary in a tidyer/more efficient manner.
What I have so far is this. I've created summaries via levels within my factor variables.
library(tidyverse)
df <- data.frame(var1 = 10:18,
var2 = c("A","B","A","B","A","B","A","B","A"))
group_summary <- df %>% group_by(var2) %>%
filter(var2 != "NA") %>%
summarise("Max" = max(var1, na.rm = TRUE),
"Median" = median(var1, na.rm = TRUE),
"Min" = min(var1, na.rm = TRUE),
"IQR" = IQR(var1, na.rm = TRUE),
"Count" = n())
Next I created an overall summary.
Summary <- df %>%
filter(var2 != "NA") %>%
summarise("Max" = max(var1, na.rm = TRUE),
"Median" = median(var1, na.rm = TRUE),
"Min" = min(var1, na.rm = TRUE),
"IQR" = IQR(var1, na.rm = TRUE),
"Count" = n())
Finally, I bound the two objects with dplyr::bind_rows
complete_summary <- bind_rows(Summary, group_summary)
What I've done works but it is very, very verbose and can't be the most efficient way. I tried to use ungroup
group_summary <- df %>% group_by(var2) %>%
filter(var2 != "NA") %>%
summarise("Max" = max(var1, na.rm = TRUE),
"Median" = median(var1, na.rm = TRUE),
"Min" = min(var1, na.rm = TRUE),
"IQR" = IQR(var1, na.rm = TRUE),
"Count" = n()) %>% ungroup %>%
summarise("Max" = max(var1, na.rm = TRUE),
"Median" = median(var1, na.rm = TRUE),
"Min" = min(var1, na.rm = TRUE),
"IQR" = IQR(var1, na.rm = TRUE),
"Count" = n())
but it threw an error:
Evaluation error: object var1 not found.
Thanks in advance for your assistance.
Ideally, if you want to do it in one-chain, this is how you can do by using bind_rows to combine both the results, just like you've done - but removing the temporary objects you created.
library(tidyverse)
#> Warning: package 'tibble' was built under R version 3.5.2
df <- data.frame(var1 = 10:18,
var2 = c("A","B","A","B","A","B","A","B","A"))
df %>% group_by(var2) %>%
filter(var2 != "NA") %>%
summarise("Max" = max(var1, na.rm = TRUE),
"Median" = median(var1, na.rm = TRUE),
"Min" = min(var1, na.rm = TRUE),
"IQR" = IQR(var1, na.rm = TRUE),
"Count" = n()) %>% #ungroup() %>%
bind_rows( df %>% summarise("Max" = max(var1, na.rm = TRUE),
"Median" = median(var1, na.rm = TRUE),
"Min" = min(var1, na.rm = TRUE),
"IQR" = IQR(var1, na.rm = TRUE),
"Count" = n()))
#> # A tibble: 3 x 6
#> var2 Max Median Min IQR Count
#> <fct> <dbl> <dbl> <dbl> <dbl> <int>
#> 1 A 18 14 10 4 5
#> 2 B 17 14 11 3 4
#> 3 <NA> 18 14 10 4 9
Created on 2019-01-29 by the reprex package (v0.2.1)
Not the most elegant solution either, but simple:
c <- mtcars %>%
mutate(total_mean = mean(wt),
total_median = median(wt)) %>%
group_by(cyl) %>%
summarise(meanweight = mean(wt),
medianweight = median(wt),
total_mean = first(total_mean),
total_median = first(total_median))
I am using Expss package to create Tables in R.
I am have 5 Statements each statement Have 5 brands.
5 Statements are in 5 consecutive variables like a1,a2,a3,a4,a5
Can i have table in grid format like the following?
enter image description here
There are two solutions: one is verbose but not scalable, second is scalable but is not very simple. Both solutions are based on the idea that we reposition labels from variables to banner location.
library(expss)
# create sample of data
set.seed(123)
N = 150
df = data.frame(
st1 = sample(paste0("brand", 1:5), N, replace = TRUE),
st2 = sample(paste0("brand", 1:5), N, replace = TRUE),
st3 = sample(paste0("brand", 1:5), N, replace = TRUE),
st4 = sample(paste0("brand", 1:5), N, replace = TRUE),
st5 = sample(paste0("brand", 1:5), N, replace = TRUE)
) %>% apply_labels(
st1 = 'Statement 1',
st2 = 'Statement 2',
st3 = 'Statement 3',
st4 = 'Statement 4',
st5 = 'Statement 5'
)
# verbose solution with Tab_*. It is not scalable for large number of variables
# manipulation with variable labels is needed to repostion variable labels from rows to column
df %>%
tab_total_row_position("above") %>%
tab_cells("|" = drop_var_labs(st1)) %>%
tab_stat_cpct(label = var_lab(st1)) %>%
tab_cells("|" = drop_var_labs(st2)) %>%
tab_stat_cpct(label = var_lab(st2)) %>%
tab_cells("|" = drop_var_labs(st3)) %>%
tab_stat_cpct(label = var_lab(st3)) %>%
tab_cells("|" = drop_var_labs(st4)) %>%
tab_stat_cpct(label = var_lab(st4)) %>%
tab_cells("|" = drop_var_labs(st5)) %>%
tab_stat_cpct(label = var_lab(st5)) %>%
tab_pivot(stat_position = "inside_columns") %>%
tab_transpose()
# solution wich will work for arbirary number of variables
df %>%
calculate(
lapply(st1 %to% st5, function(item)
# manipulation with variable labels is needed to repostion variable labels from rows to column
cro(list(drop_var_labs(item)), list(var_lab(item)), total_row_position = "above")
)
) %>%
Reduce("%merge%", .) %>%
tab_transpose()
I'm trying to reproduce the framework from this blogpost http://www.luishusier.com/2017/09/28/balance/ with the following code but it looks like I get inconsistent results
library(tidyverse)
library(magrittr)
ids <- c("1617", "1516", "1415", "1314", "1213", "1112", "1011", "0910", "0809", "0708", "0607", "0506")
data <- ids %>%
map(function(i) {read_csv(paste0("http://www.football-data.co.uk/mmz4281/", i ,"/F1.csv")) %>%
select(Date:AST) %>%
mutate(season = i)})
data <- bind_rows(data)
data <- data[complete.cases(data[ , 1:3]), ]
tmp1 <- data %>%
select(season, HomeTeam, FTHG:FTR,HS:AST) %>%
rename(BP = FTHG,
BC = FTAG,
TP = HS,
TC = AS,
TCP = HST,
TCC = AST,
team = HomeTeam)%>%
mutate(Pts = ifelse(FTR == "H", 3, ifelse(FTR == "A", 0, 1)),
Terrain = "Domicile")
tmp2 <- data %>%
select(season, AwayTeam, FTHG:FTR, HS:AST) %>%
rename(BP = FTAG,
BC = FTHG,
TP = AS,
TC = HS,
TCP = AST,
TCC = HST,
team = AwayTeam)%>%
mutate(Pts = ifelse(FTR == "A", 3 ,ifelse(FTR == "H", 0 , 1)),
Terrain = "Extérieur")
tmp3 <- bind_rows(tmp1, tmp2)
l1_0517 <- tmp3 %>%
group_by(season, team)%>%
summarise(j = n(),
pts = sum(Pts),
diff_but = (sum(BP) - sum(BC)),
diff_t_ca = (sum(TCP, na.rm = T) - sum(TCC, na.rm = T)),
diff_t = (sum(TP, na.rm = T) - sum(TC, na.rm = T)),
but_p = sum(BP),
but_c = sum(BC),
tir_ca_p = sum(TCP, na.rm = T),
tir_ca_c = sum(TCC, na.rm = T),
tir_p = sum(TP, na.rm = T),
tir_c = sum(TC, na.rm = T)) %>%
arrange((season), desc(pts), desc(diff_but))
Then I apply the framework mentioned above:
l1_0517 <- l1_0517 %>%
mutate(
# First, see how many goals the team scores relative to the average
norm_attack = but_p %>% divide_by(mean(but_p)) %>%
# Then, transform it into an unconstrained scale
log(),
# First, see how many goals the team concedes relative to the average
norm_defense = but_c %>% divide_by(mean(but_c)) %>%
# Invert it, so a higher defense is better
raise_to_power(-1) %>%
# Then, transform it into an unconstrained scale
log(),
# Now that we have normalized attack and defense ratings, we can compute
# measures of quality and attacking balance
quality = norm_attack + norm_defense,
balance = norm_attack - norm_defense
) %>%
arrange(desc(norm_attack))
When I look at the column norm_attack, I expect to find the same value for equivalent but_p values, which is not the case here:
head(l1_0517, 10)
for instance when but_p has value 83, row 5 and row 7, I get norm_attack at 0.5612738 and 0.5128357 respectively.
Is it normal? I would expect mean(l1_0517$but_p) to be fixed and therefore obtaining the same result when a value of l1_0517$but_p is log normalised?
UPDATE
I have tried to work on a simpler example but I can't reproduce this issue:
df <- tibble(a = as.integer(runif(200, 15, 100)))
df <- df %>%
mutate(norm_a = a %>% divide_by(mean(a)) %>%
log())
I found the solution after looking at the type of l1_0517
It is a grouped_df hence the different results.
The correct code is:
l1_0517 <- tmp3 %>%
group_by(season, team)%>%
summarise(j = n(),
pts = sum(Pts),
diff_but = (sum(BP) - sum(BC)),
diff_t_ca = (sum(TCP, na.rm = T) - sum(TCC, na.rm = T)),
diff_t = (sum(TP, na.rm = T) - sum(TC, na.rm = T)),
but_p = sum(BP),
but_c = sum(BC),
tir_ca_p = sum(TCP, na.rm = T),
tir_ca_c = sum(TCC, na.rm = T),
tir_p = sum(TP, na.rm = T),
tir_c = sum(TC, na.rm = T)) %>%
ungroup() %>%
arrange((season), desc(pts), desc(diff_but))