Creating a summary of a filtered subset of a dataframe in R

I am trying to filter the data on the value 1, but the dataframe's columns have already been converted to labelled factors. The objective is to create a summary of the filtered dataset.
df <- data.frame(NY = c(1,2,1,1,2,1,1,1,2,1,1,1,2,1),
                 DE = c(2,1,1,1,1,2,2,1,1,1,2,2,2,1))
df$NY <- factor(df$NY, levels = c(1,2), labels = c("unavailable","available"))
df$DE <- factor(df$DE, levels = c(1,2), labels = c("rejected","recieved"))
The desired output is the frequency of "available" in both columns. In the revised scenario, it is the available/total frequency for NY and the "recieved"/total frequency for DE.

Would output in this format be useful?
library(janitor)
library(tidyverse)

df %>%
  pivot_longer(everything()) %>%
  tabyl(name, value) %>%
  adorn_percentages() %>%
  adorn_pct_formatting(digits = 2)
#>  name available unavailable
#>    DE    50.00%      50.00%
#>    NY    71.43%      28.57%
For the revised scenario:
df %>%
  pivot_longer(everything()) %>%
  tabyl(value, name) %>%
  adorn_percentages('col') %>%
  filter(value %in% c('available', 'recieved')) %>%
  adorn_totals('row') %>%
  adorn_pct_formatting(digits = 2) %>%
  tail(1)
 value     DE     NY
 Total 50.00% 71.43%

Here's a tidyverse approach to your problem which outputs the percentages as decimals:
library(tidyverse)
df %>%
  summarise(across(everything(), ~ sum(. == "available") / n()))
Output:
         NY  DE
1 0.7142857 0.5
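If formatted percentages are preferred over the decimals, scales::percent() can wrap the same summary. This is just a sketch of that formatting step (scales is installed with the tidyverse; accuracy = 0.01 is an arbitrary choice here):
library(dplyr)
library(scales)  # for percent()

df %>%
  summarise(across(everything(),
                   ~ percent(mean(. == "available"), accuracy = 0.01)))
#       NY     DE
# 1 71.43% 50.00%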

You can try map_df() to run the calculation through each column.
df %>%
  map_df(
    ~ (mean(. == "available") * 100) %>%
      round() %>%
      paste("%")
  )
# # A tibble: 1 x 2
#   NY    DE
#   <chr> <chr>
# 1 71 %  50 %
For different desired values, one approach is to create a named vector as below and pass it into a customised function. Note that the output is a character vector but you can change it as necessary.
values <- c(NY = "available",
            DE = "recieved")  # spelled "recieved" to match the factor label defined above
get_percent <- function(.data, .values) {
  vars <- names(.values)
  pct <- sapply(
    seq_along(.values),
    function(.) round(mean(.data[[vars[.]]] == .values[vars[.]]) * 100)
  )
  pct <- paste0(pct, "%")
  names(pct) <- vars
  pct
}
res <- get_percent(df, values)
res
# NY DE
# "29%" "43%"

Related

Ontime percentage calculations

I need to calculate the overall ontime percentage of each airline with this sample dataset.
library(tidyverse)
library(dplyr)
df_chi <- tribble(
  ~airline,   ~ontime, ~qty, ~dest,
  'delta',    TRUE,     527, 'CHI',
  'delta',    FALSE,     92, 'CHI',
  'american', TRUE,    4229, 'CHI',
  'american', FALSE,    825, 'CHI'
)
df_nyc <- tribble(
  ~airline,   ~ontime, ~qty, ~dest,
  'delta',    TRUE,    1817, 'NYC',
  'delta',    FALSE,    567, 'NYC',
  'american', TRUE,    1651, 'NYC',
  'american', FALSE,    625, 'NYC'
)
I have a solution, although it is verbose, and I want to avoid the numbered index, i.e. [2,2]. Is there a more elegant way using more of the tidyverse?
df_all <- bind_rows(df_chi, df_nyc)

delta_ot <- df_all %>%
  filter(airline == "delta") %>%
  group_by(ontime) %>%
  summarize(total = sum(qty))
delta_ot <- delta_ot[2,2] / sum(delta_ot$total)

american_ot <- df_all %>%
  filter(airline == "american") %>%
  group_by(ontime) %>%
  summarize(total = sum(qty))
american_ot <- american_ot[2,2] / sum(american_ot$total)
As the ontime column is a logical column, use it to subset instead of [2, 2]. Also, instead of filtering per airline, do this once by adding 'airline' as a grouping column:
library(dplyr)
bind_rows(df_chi, df_nyc) %>%
  group_by(airline, ontime) %>%
  summarise(total = sum(qty), .groups = 'drop_last') %>%
  summarise(total = total[ontime] / sum(total))
-output
# A tibble: 2 × 2
  airline  total
  <chr>    <dbl>
1 american 0.802
2 delta    0.781
Subsetting by a logical vector returns the values at the positions where the elements are TRUE:
> c(1, 3, 5)[c(FALSE, TRUE, FALSE)]
[1] 3
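Another way to write the same calculation (a sketch): since ontime is already logical, the on-time share is just a qty-weighted mean of that column, which avoids the second summarise(). The column name ontime_pct is arbitrary here.
library(dplyr)

bind_rows(df_chi, df_nyc) %>%
  group_by(airline) %>%
  summarise(ontime_pct = weighted.mean(ontime, qty))
# A tibble: 2 × 2
#   airline  ontime_pct
#   <chr>         <dbl>
# 1 american      0.802
# 2 delta         0.781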

How to summarise grouped value increases

I have this type of data:
df <- data.frame(
  Utt = c(rep("oh", 10), rep("ah", 10)),
  name = rep(LETTERS[1:2], 10),
  value = c(0.5, 2, 2, 2, 2, 1, 0, 1, 3.5, 1,
            2.2, 2.3, 1.9, 0.1, 0.3, 1.8, 3, 4, 3.5, 2)
)
I need to know whether, within each group of Utt and name, there are continuous value increases and how large these increases are.
EDIT: I've cobbled together this code, which produces the right result but seems convoluted:
library(tidyverse)
library(data.table)  # for rleid()

df %>%
  # order by name:
  arrange(name) %>%
  group_by(name, Utt) %>%
  # mutate:
  mutate(
    # is there an increase from one value to the next?
    is_increase = ifelse(lag(value) < value, value, NA),
    # what's the difference between these values?
    diff = is_increase - lag(value)) %>%
  group_by(name, Utt, grp = rleid(!is.na(diff))) %>%
  # sum the contiguous values:
  summarise(increase_size = sum(diff, na.rm = TRUE)) %>%
  # remove 0 values:
  filter(!increase_size == 0) %>%
  # put same-group increase_sizes in the same row:
  summarise(
    increase_size = str_c(increase_size, collapse = ', '))
# A tibble: 3 x 3
# Groups:   name [2]
  name  Utt   increase_size
  <chr> <chr> <chr>
1 A     ah    3.2
2 A     oh    1.5, 3.5
3 B     ah    3.9
NOTE: Ideally, the expected outcome would be:
1 A ah 3.2
2 A oh 1.5, 3.5
3 B ah 3.9
4 B oh NA
Is there a better (i.e., more concise, more clever) dplyr solution?
Use this function to find what you want.
f <- function(x) {
  # positions where the value increases (lag()/lead() here are dplyr's)
  ind <- which(x > lag(x))
  if (length(ind) == 0) {
    return(NA)
  }
  # last position of each consecutive run of increases
  ind2 <- ind[which(lead(ind, default = max(ind) + 2) - ind > 1)]
  # position just before the start of each run
  ind1 <- ind[which(ind - lag(ind, default = min(ind) - 2) > 1)] - 1
  return(paste0(x[ind2] - x[ind1], collapse = ", "))
}
And use the function in summarise:
df %>% group_by(name, Utt) %>% summarise(increase = f(value))
Using the tidyverse, my solution was similar to yours. One possible modification is to subset the positive differences before summing instead of filtering rows afterwards. This keeps all combinations of name and Utt and allows increase_size to end up as NA. Since the column increase_size is character type, you can convert an empty string to NA.
library(data.table)
library(tidyverse)

df %>%
  arrange(name) %>%
  group_by(name, Utt) %>%
  mutate(diff = c(0, diff(value))) %>%
  group_by(grp = rleid(diff < 0), .add = TRUE) %>%
  summarise(increase_size = sum(diff[diff > 0], na.rm = TRUE)) %>%
  group_by(name, Utt) %>%
  summarise(increase_size = toString(increase_size[increase_size > 0])) %>%
  mutate(increase_size = na_if(increase_size, ""))
Output
  name  Utt   increase_size
  <chr> <chr> <chr>
1 A     ah    3.2
2 A     oh    1.5, 3.5
3 B     ah    3.9
4 B     oh    NA

Is it possible to conditionally format rows while using as_grouped_data with flextable?

I'm trying to conditionally format rows after calling as_grouped_data, with the conditions based on the grouping column:
library(tidyverse)
library(flextable)
df <- tibble(vStat = c(rep("Average Degree", 3), rep("Average Weight", 3)),
             val = c(1.22222, 1.33333, 1.44444, 1.55555, 1.66666, 1.77777))

flextable(df %>%
            as_grouped_data(groups = "vStat")) %>%
  colformat_double(i = ~ vStat == "Average Degree", digits = 1) %>% # not working
  colformat_double(i = ~ vStat == "Average Weight", digits = 3) %>% # not working
  autofit()
I understand that the above doesn't work because the condition in colformat_double only applies to rows where val is now NA:
df %>%
  as_grouped_data(groups = "vStat")
>            vStat     val
> 1 Average Degree      NA
> 3           <NA> 1.22222
> 4           <NA> 1.33333
> 5           <NA> 1.44444
> 2 Average Weight      NA
> 6           <NA> 1.55555
> 7           <NA> 1.66666
> 8           <NA> 1.77777
It doesn't seem to work like grouped data normally would when calling first:
flextable(df %>%
            as_grouped_data(groups = "vStat")) %>%
  colformat_double(i = ~ first(vStat == "Average Degree"), digits = 1) %>%
  colformat_double(i = ~ first(vStat == "Average Weight"), digits = 3) %>%
  autofit()
> Error in get_rows_id(x[[part]], i) : invalid row selection: length(i) [1] != nrow(dataset) [8]
Rounding the values before grouping doesn't get me what I want either: the displayed digits still extend to the larger precision, and the rounded values get padded with zeros:
flextable(df %>%
            mutate(val = case_when(vStat == "Average Degree" ~ round(val, 1),
                                   vStat == "Average Weight" ~ round(val, 3))) %>%
            as_grouped_data(groups = "vStat")) %>%
  autofit()
I'd really like to not have to specify individual row numbers in colformat_double in a table with 50 rows when my data change every day.
We could create an index column, or duplicate the 'vStat' column under another name, base the condition on that helper column, and remove it from the display afterwards (a sketch of the second option follows the code below):
library(dplyr)
library(flextable)

flextable(df %>%
            mutate(ind = match(vStat, unique(vStat))) %>%
            as_grouped_data(groups = "vStat")) %>%
  colformat_double(i = ~ ind == 1, digits = 1) %>%
  colformat_double(i = ~ ind == 2, digits = 3) %>%
  void(j = ~ ind) %>%
  compose(j = 3, value = as_paragraph(""), part = "header") %>%
  autofit()
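The second option mentioned above (duplicating the column instead of building a numeric index) would look roughly like this. This is only a sketch: the helper name lbl is arbitrary, and the !is.na() guard keeps the group-header rows (where the helper is NA) out of the row selection.
library(dplyr)
library(flextable)

flextable(df %>%
            mutate(lbl = vStat) %>%                # duplicate the grouping column
            as_grouped_data(groups = "vStat")) %>%
  colformat_double(i = ~ !is.na(lbl) & lbl == "Average Degree", digits = 1) %>%
  colformat_double(i = ~ !is.na(lbl) & lbl == "Average Weight", digits = 3) %>%
  void(j = ~ lbl) %>%                              # blank the helper column's body
  flextable::compose(j = "lbl", value = as_paragraph(""), part = "header") %>%
  autofit()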

Implicit NA's making a table in the Tidyverse

I am running into an error message when trying to create a table using the tidyverse. The error message reads:
"Factor Com.Race contains implicit NA, consider using forcats::fct_explicit_na".
I am a noob when it comes to the tidyverse, so I haven't been able to try much.
Major_A <- rep("Major A", times=150)
set.seed(1984)
gender <- sample(c("Female","Male"), prob=c(.95,.05),size=150, replace=T)
race.asian <- sample(c("Y","N"),prob= c(.01,.99),size=150, replace=T)
race.black <- sample(c("Y","N"),prob= c(.1,.9),size=150, replace=T)
race.AmInd <- sample(c("Y","N"),prob= c(.01,.99),size=150, replace=T)
race.hawa <- sample(c("Y","N"),prob= c(.01,.99),size=150, replace=T)
race.hisp <- sample(c("Y","N"),prob= c(.02,.98),size=150, replace=T)
race.white <- sample(c("Y","N"),prob=c(.8,.2),size=150,replace=T)
race.NotR <- sample(c("Y","N"),prob=c(.01,.98),size=150,replace=T)
degree <- sample(c("BA","MAT"),prob=c(.9,.1),size=150,replace=T)
enroll <- data.frame(Major_A, gender, race.asian, race.black, race.AmInd,
                     race.hawa, race.hisp, race.white, race.NotR, degree)
multi.race_fun <- function(dat, startr, endr){
  dat$multi <- rowSums(dat[, startr:endr] == "Y")
  return(dat)
}
enroll.multiR <- multi.race_fun(enroll, 3, 9)
# load comrace function
com_race.fun <- function(dat){
  dat$Com.Race <- ifelse(dat$race.hisp == "Y", "Hispanic",
                  ifelse(dat$race.black == "Y" & dat$multi == 1, "African Am",
                  ifelse(dat$race.AmInd == "Y" & dat$multi == 1, "Native Am",
                  ifelse(dat$race.asian == "Y" & dat$multi == 1, "Asian",
                  ifelse(dat$race.hawa == "Y" & dat$multi == 1, "Hawaiian",
                  ifelse(dat$race.white == "Y" & dat$multi == 1, "Caucasian",
                  ifelse(dat$multi >= 2, "Two or More Races", "Not Reported")))))))
  return(dat)
}
# run comrace function
enroll.comR <- com_race.fun(enroll.multiR)
enroll.comR$gender <- factor(enroll.comR$gender, levels= c("Female", "Male"))
enroll.comR$Com.Race <- factor(enroll.comR$Com.Race, levels=c("African Am","Asian","Caucasian","Hawaiian","Hispancic","Two or More Races", "Not Reported"))
library(tidyverse)
gen_race.tbl <- enroll.comR %>%
  group_by(Com.Race, gender, .drop = FALSE) %>%
  summarise(count = n()) %>%
  ungroup() %>%
  mutate(perc = (count / sum(count) * 100)) %>%
  gather(key, value, -gender, -Com.Race) %>%
  unite(Com.Race, Com.Race, key) %>%
  spread(Com.Race, value)
I would like the code to produce a table with counts and percents for all levels of the gender and Com.Race variables.
I would suggest using gather() from tidyr to restructure your wide-format data right at the start; then you can summarise the counts/percentages for each level of the gender and ethnicity variables. Using reshape2::dcast() at the end will give your desired output, but spread() can also be used.
# toy data set
df <- data.frame(gender = sample(c('M','F'), 100, T, prob = c(0.9, 0.1)),
                 ethn.a = sample(c('Y','N'), 100, T, prob = c(0.8, 0.2)),
                 ethn.b = sample(c('Y','N'), 100, T, prob = c(0.7, 0.3)),
                 ethn.c = sample(c('Y','N'), 100, T, prob = c(0.25, 0.75)),
                 ethn.d = sample(c('Y','N'), 100, T, prob = c(0.95, 0.05)))
# gather wide data, group by gender/ethnicity, summarise, reshape to wide format
df %>%
  gather(k, v, -gender) %>%
  group_by(gender, k, v) %>%
  summarise(n = n()) %>%
  mutate(perc = round((n / sum(n)) * 100, 2)) %>%
  mutate(cell = paste0(n, ' (', sprintf("%.1f", perc), '%)')) %>%
  select(-n, -perc) %>%
  filter(v == 'Y') %>%
  reshape2::dcast(v ~ k + gender, value.var = 'cell')
v ethn.a_F ethn.a_M ethn.b_F ethn.b_M ethn.c_F ethn.c_M ethn.d_F ethn.d_M
1 Y 11 (84.6%) 69 (79.3%) 10 (76.9%) 66 (75.9%) 3 (23.1%) 28 (32.2%) 12 (92.3%) 87 (100.0%)
# using spread()
df %>%
  gather(k, v, -gender) %>%
  group_by(gender, k, v) %>%
  summarise(n = n()) %>%
  mutate(perc = round((n / sum(n)) * 100, 2)) %>%
  mutate(cell = paste0(n, ' (', sprintf("%.1f", perc), '%)')) %>%
  select(-n, -perc) %>%
  filter(v == 'Y') %>%
  spread(k, cell, fill = 0)
# A tibble: 2 x 6
# Groups:   gender [2]
  gender v     ethn.a     ethn.b     ethn.c     ethn.d
  <fct>  <chr> <chr>      <chr>      <chr>      <chr>
1 F      Y     11 (84.6%) 10 (76.9%) 3 (23.1%)  12 (92.3%)
2 M      Y     69 (79.3%) 66 (75.9%) 28 (32.2%) 87 (100.0%)
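The same summary can also be written with the newer tidyr verbs, since gather() and spread() are superseded by pivot_longer() and pivot_wider(). A sketch using the toy df above (the constant v column is dropped at the end):
library(tidyverse)

df %>%
  pivot_longer(-gender, names_to = "k", values_to = "v") %>%
  count(gender, k, v) %>%
  group_by(gender, k) %>%
  mutate(perc = round(n / sum(n) * 100, 2)) %>%
  ungroup() %>%
  filter(v == "Y") %>%
  mutate(cell = paste0(n, " (", sprintf("%.1f", perc), "%)")) %>%
  select(gender, k, cell) %>%
  pivot_wider(names_from = k, values_from = cell)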

How to pass variable to filter function within a R function

I am fairly new to R. I wrote the below function which tries to summarise a dataframe, based on a feature variable (passed to the function as 'variable') and a target variable (passed to the function as target_var). I also pass it a value (target_val) on which to filter.
The function below falls over on the filter line (filter(target_var == target_val)). I think it has something to do with quo, quosures, etc., but I can't figure out how to fix it. The following code should be ready to run: if you exclude the filter line it works; if you include the filter line it falls over.
library(dplyr)

target <- c('good', 'good', 'bad', 'good', 'good', 'bad')
var_1 <- c('debit_order', 'other', 'other', 'debit_order', 'debit_order', 'debit_order')
dset <- data.frame(target, var_1)

odds_by_var <- function(dataframe, variable, target_var, target_val){
  df_name <- paste('odds', deparse(substitute(variable)), sep = "_")
  variable_string <- deparse(substitute(variable))
  target_string <- deparse(substitute(target_var))
  temp_df1 <- dataframe %>%
    group_by_(variable_string, target_string) %>%
    summarise(cnt = n()) %>%
    group_by_(variable_string) %>%
    mutate(total = sum(cnt)) %>%
    mutate(rate = cnt / total) %>%
    filter(target_var == target_val)
  assign(df_name, temp_df1, envir = .GlobalEnv)
}

odds_by_var(dset, var_1, target, 'bad')
So I assume you want to filter by the target being good or bad.
In my understanding, always filter() before you group_by(), as you may otherwise drop your filter variables. I restructured your function a little:
dset <- data.frame(target, var_1)

odds_by_var <- function(dataframe, variable, target_var, target_val){
  df_name <- paste('odds', deparse(substitute(variable)), sep = "_")
  variable_string <- deparse(substitute(variable))
  target_string <- deparse(substitute(target_var))
  temp_df1 <- dataframe %>%
    group_by_(variable_string, target_string) %>%
    summarise(cnt = n()) %>%
    mutate(total = sum(cnt),
           rate = cnt / total)
  names(temp_df1) <- c(variable_string, "target", "cnt", "total", "rate")
  temp_df1 <- temp_df1[temp_df1$target == target_val, ]
  assign(df_name, temp_df1, envir = .GlobalEnv)
}

odds_by_var(dset, var_1, target, "bad")
odds_by_var(dset, var_1, target, "bad")
result:
> odds_var_1
# A tibble: 2 x 5
# Groups:   var_1 [2]
  var_1       target   cnt total  rate
  <chr>       <chr>  <int> <int> <dbl>
1 debit_order bad        1     4  0.25
2 other       bad        1     2  0.5
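As a side note, group_by_() is deprecated in current dplyr. A sketch of the same function using tidy evaluation's embrace operator ({{ }}), returning the result directly instead of assigning into the global environment (the name odds_by_var2 is just for illustration):
library(dplyr)

odds_by_var2 <- function(dataframe, variable, target_var, target_val) {
  dataframe %>%
    group_by({{ variable }}, {{ target_var }}) %>%
    summarise(cnt = n(), .groups = "drop_last") %>%
    mutate(total = sum(cnt),
           rate = cnt / total) %>%
    filter({{ target_var }} == target_val) %>%
    ungroup()
}

odds_by_var2(dset, var_1, target, "bad")
This returns the same tibble as shown above.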
