labelling column names in summary dynamically - r

i want to label the names of summary table dynamically so that it can take names of defined already.
so here i am making q25 name dynamically so as if i require to display q25 differently then i can display accordingly.
df <- data.frame(Name = c("asdf","kjhgf","cvbnm","rtyui","cvbnm","jhfd","cvbnm","sdfghj","cvbnm","dfghj","cvbnm"),
sale=c(27,28,27,16,14,25,14,14,19,18,28),
city=c("CA","TX","MN","NY","TX","MT","HU","KL","TX","SA","TX"),
Dept = c("HH","MM","NN","MM","AA","VV","MM","HU","JJ","MM","ZZ"))
percentile25 <- "25th Percentilen"
t1<-function(dataset,var,name,p25=getOption("percentile25", default = "25th percentile")){
var <- rlang::parse_expr(var)
tabl1<- dataset %>% filter(!is.na(!!var)) %>% summarise(
q25 = quantile(!! var, type=6, probs = seq(0, 1, 0.25), na.rm=TRUE)[2],
N = sum(!is.na(!!var)))
summ_tab<-tabl1 %>%
mutate(" "= !!name,
q25 = q25)
summ_tab <- summ_tab %>% dplyr::rename(
q25=!!p25)
summ_tab <- summ_tab %>% select(" ",everything(),N)
summ_tab
}
t1(data = df,var = "sale",name = "listd")

The issue is that in dplyr::rename the new name should be on the LHS, the old name on the RHS. Also, you have to make use of the special assignment operator :=, i.e. do dplyr::rename(!!p25 := q25):
library(dplyr)
t1 <- function(dataset, var, name, p25 = getOption("percentile25", default = "25th percentile")) {
var <- rlang::parse_expr(var)
tabl1 <- dataset %>%
filter(!is.na(!!var)) %>%
summarise(
q25 = quantile(!!var, type = 6, probs = seq(0, 1, 0.25), na.rm = TRUE)[2],
N = sum(!is.na(!!var))
)
summ_tab <- tabl1 %>%
mutate(
" " = !!name,
q25 = q25
)
summ_tab <- summ_tab %>% dplyr::rename(
!!p25 := q25
)
summ_tab <- summ_tab %>% select(" ", everything(), N)
summ_tab
}
t1(data = df, var = "sale", name = "listd")
#> 25th percentile N
#> 1 listd 14 11

Related

change the name of label in summary table dynamically

I am trying to make my function dynamic like i want to show q25 dynamically.
like i want to declare percentile25 dynamically i rmarkdown if i want to otherwise by default wanted to keep.
i have tried set option, get option but getting error everytime.
df <- data.frame(Name = c("asdf","kjhgf","cvbnm","rtyui","cvbnm","jhfd","cvbnm","sdfghj","cvbnm","dfghj","cvbnm"),
sale=c(27,28,27,16,14,25,14,14,19,18,28),
city=c("CA","TX","MN","NY","TX","MT","HU","KL","TX","SA","TX"),
Dept = c("HH","MM","NN","MM","AA","VV","MM","HU","JJ","MM","ZZ"))
percentile25 <- "25th Perc"
t1<- function(dataset,var,name,options(percentile25 = "25th percentile"),..)
{
var <- rlang::parse_expr(var)
tabl1 <- dataset %>% dplyr::filter(!is.na(!!var)) %>% summarise(
q25 = quantile(!! var, type=6, probs = seq(0, 1, 0.25), na.rm=TRUE)[2],
N = sum(!is.na(!!var)))
summ_tab<-tabl1 %>%
mutate(" "= !!name,
q25 = q25)
summ_tab <- summ_tab %>% dplyr::rename(
!!p25 := q25)
summ_tab <- summ_tab %>% select(" ",everything(),N)
summ_tab
}
t1(data = df,var = "sale",name = "listd")
for Mediana
t1<- function(dataset,var,name,p25 = getOption("percentile25", default = "25th percentile")
med=getOption(Med_n ,default ="Median")){
var <- rlang::parse_expr(var)
tabl1 <- dataset %>% dplyr::filter(!is.na(!!var)) %>% summarise(
q25 = quantile(!! var, type=6, probs = seq(0, 1, 0.25), na.rm=TRUE)[2],
med = quantile(!! var, type=6, probs = seq(0, 1, 0.25), na.rm=TRUE)[3],
N = sum(!is.na(!!var)))
summ_tab<-tabl1 %>%
mutate(" "= !!name,
q25 = q25,
Median = Median)
summ_tab <- summ_tab %>% dplyr::rename(
!!p25 := q25,
!!med:=Median)
summ_tab <- summ_tab %>% select(" ",everything(),N)
summ_tab
}
You have misunderstood my comment on your former question:
Stick with the definition of the function in your former post:
library(dplyr)
t1 <- function(dataset, var, name, p25 = getOption("percentile25", default = "25th percentile")) {
var <- rlang::parse_expr(var)
tabl1 <- dataset %>%
filter(!is.na(!!var)) %>%
summarise(
q25 = quantile(!!var, type = 6, probs = seq(0, 1, 0.25), na.rm = TRUE)[2],
N = sum(!is.na(!!var))
)
summ_tab <- tabl1 %>%
mutate(
" " = !!name,
q25 = q25
)
summ_tab <- summ_tab %>% dplyr::rename(
!!p25 := q25
)
summ_tab <- summ_tab %>% select(" ", everything(), N)
summ_tab
}
Then option one to pass your desired label to the function would be to pass the name directly to the p25 argument:
t1(data = df, var = "sale", name = "listd", p25 = "25th Percentilen")
#> 25th Percentilen N
#> 1 listd 14 11
Second option would be to set your desired label via options outside of your function. In that case getOption will automatically pick the label:
options(percentile25 = "25th Percentilen")
t1(data = df, var = "sale", name = "listd")
#> 25th Percentilen N
#> 1 listd 14 11
UPDATE And here is the updated function which now includes the median. Additionally I made some slight adjustments, e.g. if you want to compute just one quantile you could do so by using e.g. probs = .25. Also I collapsed the rename and select into one step:
t1 <- function(dataset, var, name,
p25 = getOption("percentile25", default = "25th percentile"),
med = getOption("Med_n", default = "Median")) {
var <- rlang::parse_expr(var)
tabl1 <- dataset %>%
filter(!is.na(!!var)) %>%
summarise(
q25 = quantile(!!var, type = 6, probs = .25, na.rm = TRUE),
med = quantile(!!var, type = 6, probs = .5, na.rm = TRUE),
N = sum(!is.na(!!var))
)
summ_tab <- tabl1 %>%
mutate(
" " = !!name
) %>%
select(" ", !!p25 := q25, !!med := med, N)
summ_tab
}
t1(
data = df, var = "sale", name = "listd",
p25 = "25th Percentilen", med = "Mediana"
)
#> 25th Percentilen Mediana N
#> 1 listd 14 19 11
options(
percentile25 = "25th Percentilen",
Med_n = "Mediana"
)
t1(data = df, var = "sale", name = "listd")
#> 25th Percentilen Mediana N
#> 1 listd 14 19 11

error while checking two conditions in if statement

I want to check if the names in the summary table matches with the name in stats then run the first condition otherwise run the second condition. there is one more thing like there cane be only one column of summary(25th percentile) or can be two column (25th percentile,75th percentile) or three column summary or four column summary like below
but its showing the error
df <- data.frame(Name = c("asdf","kjhgf","cvbnm","rtyui","cvbnm","jhfd","cvbnm","sdfghj","cvbnm","dfghj","cvbnm"),
sale=c(27,28,27,16,14,25,14,14,19,18,28),
sale2=c(32,25,29,36,44,24,17,15,11,13,22),
city=c("CA","TX","MN","NY","TX","MT","HU","KL","TX","SA","TX"),
Dept = c("HH","MM","NN","MM","AA","VV","MM","HU","JJ","MM","ZZ"))
options(p25 = "25 P")
options(p25 = "75 P")
options(meann = "MEAN")
options(med = "Meadiana")
stats <- c('25th percentile','Median','Mean','75th percentile')
p25 = getOption("p25",default = "Perc 25")
p75 = getOption("p75",default = "Perc 75")
med = getOption("med",default = "Median")
meann = getOption("meann",default = "Mean")
summ_tab1<- df %>% filter(!is.na(sale)) %>% summarise(
q25 = round(quantile(sale, type=6, probs = seq(0, 1, 0.25), na.rm=TRUE)[2],digits = 1),
Median =round(quantile(sale, type=6, probs = seq(0, 1, 0.25), na.rm=TRUE)[3],digits = 1),
Average = round( mean(sale, na.rm=TRUE),digits = 1),
q75 = round(quantile(sale, type=6, probs = seq(0, 1, 0.25), na.rm=TRUE)[4],digits = 1) ,
N = sum(!is.na(sale)))
summ_tab_suff <- summ_tab1 %>% mutate(" "= "ttt",
q25 = q25,
Median = Median,
Average = Average,
q75 = q75)
summ_tab_suff <- summ_tab_suff %>% dplyr::rename(
!!p25 := q25,
!!med := Median,
!!meann := Average ,
!!p75 := q75)
if (names(summ_tab_suff[1:4]) %in% stats & ncol(summ_tab_suff) ==
6){
summ_tab_suff <- summ_tab_suff %>% select(" ",N,stats)}
else{summ_tab_suff <- summ_tab_suff %>% select(" ",N,everything())
}

Putting dataframe Group BY code in a function in R

I have a code in R where I work with multiple dataframes.
Example of a dataframe format :
ClientID Group CountC
X1 A 3
R3 B 2
D4 A 1
T5 A 7
H0 B 5
The other dataframes have the same 2 columns, but CountC differs.
For each of the dataframes, I have a common code that calculates quantile by group / and then pivot the form of the dataframe :
quantileByGroup <-
df %>%
group_by(Group) %>%
summarize(Q25 = quantile(CountC, probs = .25),
Q50 = quantile(CountC, probs = .5),
Q75 = quantile(CountC, probs = .75),
Q100 = quantile(CountC, probs = 1))
quantileByGroupFinal <- pivot_longer(quantileByGroup,
cols = c(2,3,4,5),
names_to = "name",
values_to = "value")
To avoid repeating the same code everytime, I want to put this code in a function.
However when I try, it is complicated especially for this part :
quantileByGroup <-
df %>%
group_by(Group) %>%
summarize(Q25 = quantile(CountC, probs = .25),
Q50 = quantile(CountC, probs = .5),
Q75 = quantile(CountC, probs = .75),
Q100 = quantile(CountC, probs = 1))
Since it is impossible to pass the column names Group and CountC as parameters in the function.
Is there any way to do this?
Thank you
f <- function(.data, .group, .summarize)
{
.data %>%
dplyr::group_by({{.group}}) %>%
dplyr::summarise( "{{.summarize}}_Q25" := quantile({{.summarize}}, probs = .25),
"{{.summarize}}_Q50" := quantile({{.summarize}}, probs = .5),
"{{.summarize}}_Q75" := quantile({{.summarize}}, probs = .75),
"{{.summarize}}_Q100" := quantile({{.summarize}}, probs = 1)) %>%
dplyr::ungroup() %>%
tidyr::pivot_longer(-{{.group}}) %>%
return()
}
and call like:
df %>%
f(.group = Group, .summarize = CountC)

R dplyr across: Dynamically specifying arguments to functions t.test and varTest

Am writing some dplyr across statements. Want to create some p-values using the functions t.test and varTest. The x= columns for calculations are in df_vars and the mu= and sigma.squared= parameter values are in df_mu_sigma.
A hard-coded version of the data I need are in df_sumry. If the variable names were always the same when code is run, something like this would suffice. That's not the case, however.
The beginnings of a non-hard-coded version of what I need are in df_sumry2. That doesn't yield a correct result yet though, because values of mu= and sigma.squared= are not dynamically specified. Only the first two p-values are correct in df_sumry2. They are always wrong after that because the code always uses values for the mpg variable.
How can I consistently get the right values inserted for mu and sigma.squared?
library(dplyr)
library(magrittr)
library(EnvStats)
df_vars <- mtcars %>%
select(mpg, cyl, disp, hp)
set.seed(9302)
df_mu_sigma <- mtcars %>%
select(mpg, cyl, disp, hp) %>%
slice_sample(n = 12) %>%
summarize(
across(
everything(),
list(mean = mean,
std = sd
))
)
df_sumry <- df_vars %>%
summarize(
mpg_mean = mean(mpg),
mpg_mean_prob = t.test(mpg, mu = df_mu_sigma$mpg_mean)$p.value,
mpg_std = sd(mpg),
mpg_std_prob = varTest(mpg, sigma.squared = df_mu_sigma$mpg_std^2)$p.value,
cyl_mean = mean(cyl),
cyl_mean_prob = t.test(cyl, mu = df_mu_sigma$cyl_mean)$p.value,
cyl_std = sd(cyl),
cyl_std_prob = varTest(cyl, sigma.squared = df_mu_sigma$cyl_std^2)$p.value,
disp_mean = mean(disp),
disp_mean_prob = t.test(disp, mu = df_mu_sigma$disp_mean)$p.value,
disp_std = sd(disp),
disp_std_prob = varTest(disp, sigma.squared = df_mu_sigma$disp_std^2)$p.value,
hp_mean = mean(hp),
hp_mean_prob = t.test(hp, mu = df_mu_sigma$hp_mean)$p.value,
hp_std = sd(hp),
hp_std_prob = varTest(hp, sigma.squared = df_mu_sigma$hp_std^2)$p.value
)
vars_num <- names(df_vars)
df_sumry2 <- df_vars %>%
summarize(
across(
all_of(vars_num),
list(mean = mean,
mean_prob = function(x) t.test(x, mu = df_mu_sigma$mpg_mean)$p.value,
std = sd,
std_prob = function(x) varTest(x, sigma.squared = df_mu_sigma$mpg_std^2)$p.value)
)
)
I appear to have come up with a solution to my own problem. I'd be happy to see alternative solutions though as they may be better than mine.
library(dplyr)
library(magrittr)
library(EnvStats)
df_vars <- mtcars %>%
select(mpg, cyl, disp, hp)
df_mu_sigma <- mtcars %>%
select(mpg, cyl, disp, hp) %>%
slice_sample(n = 12) %>%
summarize(
across(
everything(),
list(mean = mean,
std = sd
))
)
df_sumry <- df_vars %>%
summarize(
mpg_mean = mean(mpg),
mpg_mean_prob = t.test(mpg, mu = df_mu_sigma$mpg_mean)$p.value,
mpg_std = sd(mpg),
mpg_std_prob = varTest(mpg, sigma.squared = df_mu_sigma$mpg_std^2)$p.value,
cyl_mean = mean(cyl),
cyl_mean_prob = t.test(cyl, mu = df_mu_sigma$cyl_mean)$p.value,
cyl_std = sd(cyl),
cyl_std_prob = varTest(cyl, sigma.squared = df_mu_sigma$cyl_std^2)$p.value,
disp_mean = mean(disp),
disp_mean_prob = t.test(disp, mu = df_mu_sigma$disp_mean)$p.value,
disp_std = sd(disp),
disp_std_prob = varTest(disp, sigma.squared = df_mu_sigma$disp_std^2)$p.value,
hp_mean = mean(hp),
hp_mean_prob = t.test(hp, mu = df_mu_sigma$hp_mean)$p.value,
hp_std = sd(hp),
hp_std_prob = varTest(hp, sigma.squared = df_mu_sigma$hp_std^2)$p.value
)
vars_num <- names(df_vars)
library(glue)
df_sumry2 <- df_vars %>%
summarize(
across(
all_of(vars_num),
list(mean = mean,
mean_prob = function(x) {
mu_name <- glue("{ensym(x)}_mean")
t.test(x, mu = df_mu_sigma[[mu_name]])$p.value
},
std = sd,
std_prob = function(x) {
sigma_name <- glue("{ensym(x)}_std")
varTest(x, sigma.squared = df_mu_sigma[[sigma_name]]^2)$p.value
}
)
)
)
all.equal(df_sumry, df_sumry2)
This is not much better than your solution, but I would use cur_column() instead of ensym() to avoid quosures handling.
Also, putting the query in a separate function makes things a bit tidier.
Finally, I would use lambda functions instead of anonymous functions for clarity.
get_mu = function(suffix){
df_mu_sigma[[paste0(cur_column(), suffix)]] #you could use glue() as well here
}
df_vars %>%
summarize(
across(
all_of(vars_num),
list(
mean = mean,
mean_prob = ~t.test(.x, mu = get_mu("_mean"))$p.value,
std = sd,
std_prob = ~varTest(.x, sigma.squared = get_mu("_std")^2)$p.value
)
)
) %>% t() #just to format the output
# [,1]
# mpg_mean 20.09062500
# mpg_mean_prob 0.01808550
# mpg_std 6.02694805
# mpg_std_prob 0.96094601
# cyl_mean 6.18750000
# cyl_mean_prob 0.10909740
# cyl_std 1.78592165
# cyl_std_prob 0.77092484
# disp_mean 230.72187500
# disp_mean_prob 0.17613878
# disp_std 123.93869383
# disp_std_prob 0.96381507
# hp_mean 146.68750000
# hp_mean_prob 0.03914858
# hp_std 68.56286849
# hp_std_prob 0.03459963

Updating a custom function to get combined summary statistics by groups

I have created a function, for getting summary of average, percentile. but not I want that summary for particular subsets. so I have created subsets accordingly.
but my function is not working properly.
so actually I am trying to update my function so that I can get a summary for list of variables as variable name and summary can be rbind for multiple list of variables.
I have no Idea how can i put "ALL", "MM" as name of variable in my function.
so that the summary for both can be rbind itself
df <- data.frame(Name = c("asdf","kjhgf","cvbnm","rtyui","cvbnm","jhfd","cvbnm","sdfghj","cvbnm","dfghj","cvbnm"),
sale=c(27,28,27,16,14,25,14,14,19,18,28),
city=c("CA","TX","MN","NY","TX","MT","HU","KL","TX","SA","TX"),
Dept = c("HH","MM","NN","MM","AA","VV","MM","HU","JJ","MM","ZZ"))
df1<- df
df$cc1<-1
df2<- subset(df, Dept == 'MM')
df$cc2<-ifelse(df$Dept == 'MM',1,NA)
lst<-list(df$cc1, df$cc2)
listd<-list("ALL" = df1, "MM" =df2)
#I want to run my function for listd so that i can get a combined summary for all variables in listd
tt2<-function(data,var,footer,Name_of_variable,decimal){
for (d in 1:length(data)) {
cat('\n\n#### ', names(data)[d], '\n\n')
md<-data[[d]]
table_list<-list()
for (i in 1:length(d))
table_list[[i]]<-t1(md,var,footer,decimal,Name_of_variable)
tt<- do.call(rbind,table_list)
}
cat(knit_print(tt))
cat('\n\n')
}
t1<-function(dataset,var,Suff,decimal,Name_of_variable){
numdig <- if (decimal == TRUE) {1} else {0}
var <- rlang::parse_expr(var)
summ_tab1<- dataset %>% filter(!is.na(!!var)) %>% summarise(
q25 = format(round(quantile(!! var, type=6, probs = seq(0, 1, 0.25), na.rm=TRUE)[2],digits = numdig),nsmall = numdig),
Median = format(round(quantile(!! var, type=6, probs = seq(0, 1, 0.25), na.rm=TRUE)[3],digits = numdig),nsmall = numdig),
Average = format(round( mean(!! var, na.rm=TRUE),digits = numdig),nsmall = numdig),
q75 = format(round(quantile(!! var, type=6, probs = seq(0, 1, 0.25), na.rm=TRUE)[4],digits = numdig) ,nsmall = numdig),
N = sum(!is.na(!!var)))
summ_tab<-summ_tab1 %>%
mutate(" "=!!Name_of_variable,
q25 = q25,
Median =Median,
Average =Average,
q75 = q75)%>%
dplyr::rename(
`25th percentile` = q25,
`75th percentile` = q75)%>%select(" ",N,everything())
summ_tab1
}
tt2(data = listd,var = "sale",Name_of_variable = "listd",decimal = TRUE)
Previously I was getting summary like below
but now the output summary should be like , name of variable should be in rows.
I've slightly rewritten your t1 function and make use of the fact that it returns a dataframe. This can be used together with purrr::map_dfr:
library(dplyr)
df <- data.frame(Name = c("asdf","kjhgf","cvbnm","rtyui","cvbnm","jhfd","cvbnm","sdfghj","cvbnm","dfghj","cvbnm"),
sale=c(27,28,27,16,14,25,14,14,19,18,28),
city=c("CA","TX","MN","NY","TX","MT","HU","KL","TX","SA","TX"),
Dept = c("HH","MM","NN","MM","AA","VV","MM","HU","JJ","MM","ZZ"))
df1<- df
df$cc1<-1
df2<- subset(df, Dept == 'MM')
df$cc2<-ifelse(df$Dept == 'MM',1,NA)
lst<-list(df$cc1, df$cc2)
listd<-list("ALL" = df1, "MM" =df2)
t1 <- function(dataset, var, decimal){
numdig <- if (decimal == TRUE) {
1
} else {
0
}
var <- rlang::parse_expr(var)
dataset %>%
filter(!is.na(!!var)) %>%
summarise(
q25 = format(round(quantile(!!var,
type = 6,
probs = seq(0, 1, 0.25),
na.rm=TRUE)[2],
digits = numdig),
nsmall = numdig),
Median = format(round(quantile(!!var,
type = 6,
probs = seq(0, 1, 0.25), na.rm=TRUE)[3],
digits = numdig),
nsmall = numdig),
Average = format(round(mean(!!var,
na.rm = TRUE),
digits = numdig),
nsmall = numdig),
q75 = format(round(quantile(!!var,
type = 6,
probs = seq(0, 1, 0.25),
na.rm = TRUE)[4],
digits = numdig),
nsmall = numdig),
N = sum(!is.na(!!var))) %>%
rename(
`25th percentile` = q25,
`75th percentile` = q75)
}
listd %>%
purrr::map_dfr(~t1(dataset = .x, var = "sale", decimal = TRUE), .id = " ")
#> 25th percentile Median Average 75th percentile N
#> 1 ALL 14.0 19.0 20.9 27.0 11
#> 2 MM 14.5 17.0 19.0 25.5 4
Created on 2020-09-23 by the reprex package (v0.3.0)

Resources