Handling NaN values in custom function summary - r

I have custom function and I want to replace NaN and Na values with "0.0"
what can be a simplest solution.
df <- data.frame(Name = c("asdf","kjhgf","cvbnm","rtyui","cvbnm","jhfd","cvbnm","sdfghj","cvbnm","dfghj","cvbnm"),
sale=c(27,28,NA,16,NA,25,NA,14,NA,18,NA),
city=c("CA","TX","MN","NY","TX","MT","HU","KL","TX","SA","TX"),
Dept = c("HH","MM","NN","MM","AA","VV","MM","HU","JJ","MM","ZZ"))
df$sale = as.numeric(df$sale)
data=df
filter_var= "Name"
filter_val="cvbnm"
Text_var="sale"
Name_of_variable="EXP"
decimal =TRUE
footer=""
Suff="%"
filter_val = ensym(filter_val)
filter_var = rlang::parse_expr(filter_var)
Text_var= ensym(Text_var)
numdig <- if (decimal == TRUE) {1} else {0}
dat1<-data %>% select(!!filter_var,!!Text_var) %>% filter(!is.na(!!Text_var))
dat2<- dat1 %>% filter((!!filter_var == filter_val))
summ_tab1<- dat2 %>%
select(!!Text_var) %>% summarise(
q25 = format(round(quantile(!! Text_var, type=6, probs = seq(0, 1, 0.25), na.rm=TRUE)[2],digits = numdig),nsmall = numdig),
Median = format(round(quantile(!! Text_var, type=6, probs = seq(0, 1, 0.25), na.rm=TRUE)[3],digits = numdig),nsmall = numdig),
Average = format(round( mean(!! Text_var, na.rm=TRUE),digits = numdig),nsmall = numdig),
q75 = format(round(quantile(!! Text_var, type=6, probs = seq(0, 1, 0.25), na.rm=TRUE)[4],digits = numdig) ,nsmall = numdig),
N = sum(!is.na(!!Text_var)))
summ_tab1
summ_tab1[2,](is.nan(summ_tab1[2,0])) <- "0.0"
I have tried this but doesn't work

Is this what you want?
summ_tab1[1, 1:4] <- "0.0"

Related

change the name of label in summary table dynamically

I am trying to make my function dynamic like i want to show q25 dynamically.
like i want to declare percentile25 dynamically i rmarkdown if i want to otherwise by default wanted to keep.
i have tried set option, get option but getting error everytime.
df <- data.frame(Name = c("asdf","kjhgf","cvbnm","rtyui","cvbnm","jhfd","cvbnm","sdfghj","cvbnm","dfghj","cvbnm"),
sale=c(27,28,27,16,14,25,14,14,19,18,28),
city=c("CA","TX","MN","NY","TX","MT","HU","KL","TX","SA","TX"),
Dept = c("HH","MM","NN","MM","AA","VV","MM","HU","JJ","MM","ZZ"))
percentile25 <- "25th Perc"
t1<- function(dataset,var,name,options(percentile25 = "25th percentile"),..)
{
var <- rlang::parse_expr(var)
tabl1 <- dataset %>% dplyr::filter(!is.na(!!var)) %>% summarise(
q25 = quantile(!! var, type=6, probs = seq(0, 1, 0.25), na.rm=TRUE)[2],
N = sum(!is.na(!!var)))
summ_tab<-tabl1 %>%
mutate(" "= !!name,
q25 = q25)
summ_tab <- summ_tab %>% dplyr::rename(
!!p25 := q25)
summ_tab <- summ_tab %>% select(" ",everything(),N)
summ_tab
}
t1(data = df,var = "sale",name = "listd")
for Mediana
t1<- function(dataset,var,name,p25 = getOption("percentile25", default = "25th percentile")
med=getOption(Med_n ,default ="Median")){
var <- rlang::parse_expr(var)
tabl1 <- dataset %>% dplyr::filter(!is.na(!!var)) %>% summarise(
q25 = quantile(!! var, type=6, probs = seq(0, 1, 0.25), na.rm=TRUE)[2],
med = quantile(!! var, type=6, probs = seq(0, 1, 0.25), na.rm=TRUE)[3],
N = sum(!is.na(!!var)))
summ_tab<-tabl1 %>%
mutate(" "= !!name,
q25 = q25,
Median = Median)
summ_tab <- summ_tab %>% dplyr::rename(
!!p25 := q25,
!!med:=Median)
summ_tab <- summ_tab %>% select(" ",everything(),N)
summ_tab
}
You have misunderstood my comment on your former question:
Stick with the definition of the function in your former post:
library(dplyr)
t1 <- function(dataset, var, name, p25 = getOption("percentile25", default = "25th percentile")) {
var <- rlang::parse_expr(var)
tabl1 <- dataset %>%
filter(!is.na(!!var)) %>%
summarise(
q25 = quantile(!!var, type = 6, probs = seq(0, 1, 0.25), na.rm = TRUE)[2],
N = sum(!is.na(!!var))
)
summ_tab <- tabl1 %>%
mutate(
" " = !!name,
q25 = q25
)
summ_tab <- summ_tab %>% dplyr::rename(
!!p25 := q25
)
summ_tab <- summ_tab %>% select(" ", everything(), N)
summ_tab
}
Then option one to pass your desired label to the function would be to pass the name directly to the p25 argument:
t1(data = df, var = "sale", name = "listd", p25 = "25th Percentilen")
#> 25th Percentilen N
#> 1 listd 14 11
Second option would be to set your desired label via options outside of your function. In that case getOption will automatically pick the label:
options(percentile25 = "25th Percentilen")
t1(data = df, var = "sale", name = "listd")
#> 25th Percentilen N
#> 1 listd 14 11
UPDATE And here is the updated function which now includes the median. Additionally I made some slight adjustments, e.g. if you want to compute just one quantile you could do so by using e.g. probs = .25. Also I collapsed the rename and select into one step:
t1 <- function(dataset, var, name,
p25 = getOption("percentile25", default = "25th percentile"),
med = getOption("Med_n", default = "Median")) {
var <- rlang::parse_expr(var)
tabl1 <- dataset %>%
filter(!is.na(!!var)) %>%
summarise(
q25 = quantile(!!var, type = 6, probs = .25, na.rm = TRUE),
med = quantile(!!var, type = 6, probs = .5, na.rm = TRUE),
N = sum(!is.na(!!var))
)
summ_tab <- tabl1 %>%
mutate(
" " = !!name
) %>%
select(" ", !!p25 := q25, !!med := med, N)
summ_tab
}
t1(
data = df, var = "sale", name = "listd",
p25 = "25th Percentilen", med = "Mediana"
)
#> 25th Percentilen Mediana N
#> 1 listd 14 19 11
options(
percentile25 = "25th Percentilen",
Med_n = "Mediana"
)
t1(data = df, var = "sale", name = "listd")
#> 25th Percentilen Mediana N
#> 1 listd 14 19 11

error while checking two conditions in if statement

I want to check if the names in the summary table matches with the name in stats then run the first condition otherwise run the second condition. there is one more thing like there cane be only one column of summary(25th percentile) or can be two column (25th percentile,75th percentile) or three column summary or four column summary like below
but its showing the error
df <- data.frame(Name = c("asdf","kjhgf","cvbnm","rtyui","cvbnm","jhfd","cvbnm","sdfghj","cvbnm","dfghj","cvbnm"),
sale=c(27,28,27,16,14,25,14,14,19,18,28),
sale2=c(32,25,29,36,44,24,17,15,11,13,22),
city=c("CA","TX","MN","NY","TX","MT","HU","KL","TX","SA","TX"),
Dept = c("HH","MM","NN","MM","AA","VV","MM","HU","JJ","MM","ZZ"))
options(p25 = "25 P")
options(p25 = "75 P")
options(meann = "MEAN")
options(med = "Meadiana")
stats <- c('25th percentile','Median','Mean','75th percentile')
p25 = getOption("p25",default = "Perc 25")
p75 = getOption("p75",default = "Perc 75")
med = getOption("med",default = "Median")
meann = getOption("meann",default = "Mean")
summ_tab1<- df %>% filter(!is.na(sale)) %>% summarise(
q25 = round(quantile(sale, type=6, probs = seq(0, 1, 0.25), na.rm=TRUE)[2],digits = 1),
Median =round(quantile(sale, type=6, probs = seq(0, 1, 0.25), na.rm=TRUE)[3],digits = 1),
Average = round( mean(sale, na.rm=TRUE),digits = 1),
q75 = round(quantile(sale, type=6, probs = seq(0, 1, 0.25), na.rm=TRUE)[4],digits = 1) ,
N = sum(!is.na(sale)))
summ_tab_suff <- summ_tab1 %>% mutate(" "= "ttt",
q25 = q25,
Median = Median,
Average = Average,
q75 = q75)
summ_tab_suff <- summ_tab_suff %>% dplyr::rename(
!!p25 := q25,
!!med := Median,
!!meann := Average ,
!!p75 := q75)
if (names(summ_tab_suff[1:4]) %in% stats & ncol(summ_tab_suff) ==
6){
summ_tab_suff <- summ_tab_suff %>% select(" ",N,stats)}
else{summ_tab_suff <- summ_tab_suff %>% select(" ",N,everything())
}

labelling column names in summary dynamically

i want to label the names of summary table dynamically so that it can take names of defined already.
so here i am making q25 name dynamically so as if i require to display q25 differently then i can display accordingly.
df <- data.frame(Name = c("asdf","kjhgf","cvbnm","rtyui","cvbnm","jhfd","cvbnm","sdfghj","cvbnm","dfghj","cvbnm"),
sale=c(27,28,27,16,14,25,14,14,19,18,28),
city=c("CA","TX","MN","NY","TX","MT","HU","KL","TX","SA","TX"),
Dept = c("HH","MM","NN","MM","AA","VV","MM","HU","JJ","MM","ZZ"))
percentile25 <- "25th Percentilen"
t1<-function(dataset,var,name,p25=getOption("percentile25", default = "25th percentile")){
var <- rlang::parse_expr(var)
tabl1<- dataset %>% filter(!is.na(!!var)) %>% summarise(
q25 = quantile(!! var, type=6, probs = seq(0, 1, 0.25), na.rm=TRUE)[2],
N = sum(!is.na(!!var)))
summ_tab<-tabl1 %>%
mutate(" "= !!name,
q25 = q25)
summ_tab <- summ_tab %>% dplyr::rename(
q25=!!p25)
summ_tab <- summ_tab %>% select(" ",everything(),N)
summ_tab
}
t1(data = df,var = "sale",name = "listd")
The issue is that in dplyr::rename the new name should be on the LHS, the old name on the RHS. Also, you have to make use of the special assignment operator :=, i.e. do dplyr::rename(!!p25 := q25):
library(dplyr)
t1 <- function(dataset, var, name, p25 = getOption("percentile25", default = "25th percentile")) {
var <- rlang::parse_expr(var)
tabl1 <- dataset %>%
filter(!is.na(!!var)) %>%
summarise(
q25 = quantile(!!var, type = 6, probs = seq(0, 1, 0.25), na.rm = TRUE)[2],
N = sum(!is.na(!!var))
)
summ_tab <- tabl1 %>%
mutate(
" " = !!name,
q25 = q25
)
summ_tab <- summ_tab %>% dplyr::rename(
!!p25 := q25
)
summ_tab <- summ_tab %>% select(" ", everything(), N)
summ_tab
}
t1(data = df, var = "sale", name = "listd")
#> 25th percentile N
#> 1 listd 14 11

Putting dataframe Group BY code in a function in R

I have a code in R where I work with multiple dataframes.
Example of a dataframe format :
ClientID Group CountC
X1 A 3
R3 B 2
D4 A 1
T5 A 7
H0 B 5
The other dataframes have the same 2 columns, but CountC differs.
For each of the dataframes, I have a common code that calculates quantile by group / and then pivot the form of the dataframe :
quantileByGroup <-
df %>%
group_by(Group) %>%
summarize(Q25 = quantile(CountC, probs = .25),
Q50 = quantile(CountC, probs = .5),
Q75 = quantile(CountC, probs = .75),
Q100 = quantile(CountC, probs = 1))
quantileByGroupFinal <- pivot_longer(quantileByGroup,
cols = c(2,3,4,5),
names_to = "name",
values_to = "value")
To avoid repeating the same code everytime, I want to put this code in a function.
However when I try, it is complicated especially for this part :
quantileByGroup <-
df %>%
group_by(Group) %>%
summarize(Q25 = quantile(CountC, probs = .25),
Q50 = quantile(CountC, probs = .5),
Q75 = quantile(CountC, probs = .75),
Q100 = quantile(CountC, probs = 1))
Since it is impossible to pass the column names Group and CountC as parameters in the function.
Is there any way to do this?
Thank you
f <- function(.data, .group, .summarize)
{
.data %>%
dplyr::group_by({{.group}}) %>%
dplyr::summarise( "{{.summarize}}_Q25" := quantile({{.summarize}}, probs = .25),
"{{.summarize}}_Q50" := quantile({{.summarize}}, probs = .5),
"{{.summarize}}_Q75" := quantile({{.summarize}}, probs = .75),
"{{.summarize}}_Q100" := quantile({{.summarize}}, probs = 1)) %>%
dplyr::ungroup() %>%
tidyr::pivot_longer(-{{.group}}) %>%
return()
}
and call like:
df %>%
f(.group = Group, .summarize = CountC)

Updating a custom function to get combined summary statistics by groups

I have created a function, for getting summary of average, percentile. but not I want that summary for particular subsets. so I have created subsets accordingly.
but my function is not working properly.
so actually I am trying to update my function so that I can get a summary for list of variables as variable name and summary can be rbind for multiple list of variables.
I have no Idea how can i put "ALL", "MM" as name of variable in my function.
so that the summary for both can be rbind itself
df <- data.frame(Name = c("asdf","kjhgf","cvbnm","rtyui","cvbnm","jhfd","cvbnm","sdfghj","cvbnm","dfghj","cvbnm"),
sale=c(27,28,27,16,14,25,14,14,19,18,28),
city=c("CA","TX","MN","NY","TX","MT","HU","KL","TX","SA","TX"),
Dept = c("HH","MM","NN","MM","AA","VV","MM","HU","JJ","MM","ZZ"))
df1<- df
df$cc1<-1
df2<- subset(df, Dept == 'MM')
df$cc2<-ifelse(df$Dept == 'MM',1,NA)
lst<-list(df$cc1, df$cc2)
listd<-list("ALL" = df1, "MM" =df2)
#I want to run my function for listd so that i can get a combined summary for all variables in listd
tt2<-function(data,var,footer,Name_of_variable,decimal){
for (d in 1:length(data)) {
cat('\n\n#### ', names(data)[d], '\n\n')
md<-data[[d]]
table_list<-list()
for (i in 1:length(d))
table_list[[i]]<-t1(md,var,footer,decimal,Name_of_variable)
tt<- do.call(rbind,table_list)
}
cat(knit_print(tt))
cat('\n\n')
}
t1<-function(dataset,var,Suff,decimal,Name_of_variable){
numdig <- if (decimal == TRUE) {1} else {0}
var <- rlang::parse_expr(var)
summ_tab1<- dataset %>% filter(!is.na(!!var)) %>% summarise(
q25 = format(round(quantile(!! var, type=6, probs = seq(0, 1, 0.25), na.rm=TRUE)[2],digits = numdig),nsmall = numdig),
Median = format(round(quantile(!! var, type=6, probs = seq(0, 1, 0.25), na.rm=TRUE)[3],digits = numdig),nsmall = numdig),
Average = format(round( mean(!! var, na.rm=TRUE),digits = numdig),nsmall = numdig),
q75 = format(round(quantile(!! var, type=6, probs = seq(0, 1, 0.25), na.rm=TRUE)[4],digits = numdig) ,nsmall = numdig),
N = sum(!is.na(!!var)))
summ_tab<-summ_tab1 %>%
mutate(" "=!!Name_of_variable,
q25 = q25,
Median =Median,
Average =Average,
q75 = q75)%>%
dplyr::rename(
`25th percentile` = q25,
`75th percentile` = q75)%>%select(" ",N,everything())
summ_tab1
}
tt2(data = listd,var = "sale",Name_of_variable = "listd",decimal = TRUE)
Previously I was getting summary like below
but now the output summary should be like , name of variable should be in rows.
I've slightly rewritten your t1 function and make use of the fact that it returns a dataframe. This can be used together with purrr::map_dfr:
library(dplyr)
df <- data.frame(Name = c("asdf","kjhgf","cvbnm","rtyui","cvbnm","jhfd","cvbnm","sdfghj","cvbnm","dfghj","cvbnm"),
sale=c(27,28,27,16,14,25,14,14,19,18,28),
city=c("CA","TX","MN","NY","TX","MT","HU","KL","TX","SA","TX"),
Dept = c("HH","MM","NN","MM","AA","VV","MM","HU","JJ","MM","ZZ"))
df1<- df
df$cc1<-1
df2<- subset(df, Dept == 'MM')
df$cc2<-ifelse(df$Dept == 'MM',1,NA)
lst<-list(df$cc1, df$cc2)
listd<-list("ALL" = df1, "MM" =df2)
t1 <- function(dataset, var, decimal){
numdig <- if (decimal == TRUE) {
1
} else {
0
}
var <- rlang::parse_expr(var)
dataset %>%
filter(!is.na(!!var)) %>%
summarise(
q25 = format(round(quantile(!!var,
type = 6,
probs = seq(0, 1, 0.25),
na.rm=TRUE)[2],
digits = numdig),
nsmall = numdig),
Median = format(round(quantile(!!var,
type = 6,
probs = seq(0, 1, 0.25), na.rm=TRUE)[3],
digits = numdig),
nsmall = numdig),
Average = format(round(mean(!!var,
na.rm = TRUE),
digits = numdig),
nsmall = numdig),
q75 = format(round(quantile(!!var,
type = 6,
probs = seq(0, 1, 0.25),
na.rm = TRUE)[4],
digits = numdig),
nsmall = numdig),
N = sum(!is.na(!!var))) %>%
rename(
`25th percentile` = q25,
`75th percentile` = q75)
}
listd %>%
purrr::map_dfr(~t1(dataset = .x, var = "sale", decimal = TRUE), .id = " ")
#> 25th percentile Median Average 75th percentile N
#> 1 ALL 14.0 19.0 20.9 27.0 11
#> 2 MM 14.5 17.0 19.0 25.5 4
Created on 2020-09-23 by the reprex package (v0.3.0)

Resources