summarise() doesn't recognize a variable - r

I'm wondering why I get "Error: Problem with summarise() input wt_avg" when I run the code below.
library(tidyverse)
CA_vacc <- read_csv('https://raw.githubusercontent.com/rnorouzian/e/master/2017-2018%20CA%20Vaccination%20Data.csv',
na = c(".","--*"))
CA_vacc %>% summarise(
wt_avg = sum(HEPB_percent * ENROLLMENT, na.rm = TRUE) / sum(ENROLLMENT, na.rm = TRUE)
)
# Error: Problem with `summarise()` input `wt_avg`.

Does this work:
library(dplyr)
library(readr)
# parse_number() pulls the numeric value out of each HEPB_percent entry so the
# column can be multiplied by ENROLLMENT; the result is the
# ENROLLMENT-weighted average of HEPB_percent.
CA_vacc %>%
  summarise(
    wt_avg = sum(parse_number(HEPB_percent) * ENROLLMENT, na.rm = TRUE) /
      sum(ENROLLMENT, na.rm = TRUE)
  )
# A tibble: 1 x 1
wt_avg
<dbl>
1 96.8

library(tidyverse)
# Strip the "?" and "%" characters from HEPB_percent and convert it to numeric
# before computing the ENROLLMENT-weighted average.
CA_vacc %>%
  # Refer to the column by its bare name: inside mutate() the data frame's
  # columns are already in scope, whereas CA_vacc$HEPB_percent would always
  # read the original object regardless of earlier pipeline steps or grouping.
  mutate(HEPB_percent = as.numeric(str_remove_all(HEPB_percent, "\\?|%"))) %>%
  summarise(
    wt_avg = sum(HEPB_percent * ENROLLMENT, na.rm = TRUE) /
      sum(ENROLLMENT, na.rm = TRUE)
  )

Using base R
# Base R equivalent: remove "?" and "%" from HEPB_percent, convert it to
# numeric, then divide the ENROLLMENT-weighted sum by the sum of ENROLLMENT
# (missing values are dropped from both sums).
with(CA_vacc, sum(as.numeric(gsub("[?%]", "", HEPB_percent)) *
ENROLLMENT, na.rm = TRUE)/sum(ENROLLMENT, na.rm = TRUE))
#[1] 96.76707

Related

R dplyr across: Dynamically specifying arguments to functions t.test and varTest

I am writing some dplyr across() statements and want to compute p-values using the functions t.test and varTest. The x= columns for the calculations are in df_vars, and the mu= and sigma.squared= parameter values are in df_mu_sigma.
A hard-coded version of the data I need is in df_sumry. If the variable names were always the same whenever the code is run, something like this would suffice. That's not the case, however.
The beginnings of a non-hard-coded version of what I need are in df_sumry2. That doesn't yield a correct result yet though, because values of mu= and sigma.squared= are not dynamically specified. Only the first two p-values are correct in df_sumry2. They are always wrong after that because the code always uses values for the mpg variable.
How can I consistently get the right values inserted for mu and sigma.squared?
library(dplyr)
library(magrittr)
library(EnvStats)
df_vars <- mtcars %>%
select(mpg, cyl, disp, hp)
set.seed(9302)
df_mu_sigma <- mtcars %>%
select(mpg, cyl, disp, hp) %>%
slice_sample(n = 12) %>%
summarize(
across(
everything(),
list(mean = mean,
std = sd
))
)
df_sumry <- df_vars %>%
summarize(
mpg_mean = mean(mpg),
mpg_mean_prob = t.test(mpg, mu = df_mu_sigma$mpg_mean)$p.value,
mpg_std = sd(mpg),
mpg_std_prob = varTest(mpg, sigma.squared = df_mu_sigma$mpg_std^2)$p.value,
cyl_mean = mean(cyl),
cyl_mean_prob = t.test(cyl, mu = df_mu_sigma$cyl_mean)$p.value,
cyl_std = sd(cyl),
cyl_std_prob = varTest(cyl, sigma.squared = df_mu_sigma$cyl_std^2)$p.value,
disp_mean = mean(disp),
disp_mean_prob = t.test(disp, mu = df_mu_sigma$disp_mean)$p.value,
disp_std = sd(disp),
disp_std_prob = varTest(disp, sigma.squared = df_mu_sigma$disp_std^2)$p.value,
hp_mean = mean(hp),
hp_mean_prob = t.test(hp, mu = df_mu_sigma$hp_mean)$p.value,
hp_std = sd(hp),
hp_std_prob = varTest(hp, sigma.squared = df_mu_sigma$hp_std^2)$p.value
)
vars_num <- names(df_vars)
df_sumry2 <- df_vars %>%
summarize(
across(
all_of(vars_num),
list(mean = mean,
mean_prob = function(x) t.test(x, mu = df_mu_sigma$mpg_mean)$p.value,
std = sd,
std_prob = function(x) varTest(x, sigma.squared = df_mu_sigma$mpg_std^2)$p.value)
)
)
I appear to have come up with a solution to my own problem. I'd be happy to see alternative solutions though as they may be better than mine.
library(dplyr)
library(magrittr)
library(EnvStats)
df_vars <- mtcars %>%
select(mpg, cyl, disp, hp)
df_mu_sigma <- mtcars %>%
select(mpg, cyl, disp, hp) %>%
slice_sample(n = 12) %>%
summarize(
across(
everything(),
list(mean = mean,
std = sd
))
)
df_sumry <- df_vars %>%
summarize(
mpg_mean = mean(mpg),
mpg_mean_prob = t.test(mpg, mu = df_mu_sigma$mpg_mean)$p.value,
mpg_std = sd(mpg),
mpg_std_prob = varTest(mpg, sigma.squared = df_mu_sigma$mpg_std^2)$p.value,
cyl_mean = mean(cyl),
cyl_mean_prob = t.test(cyl, mu = df_mu_sigma$cyl_mean)$p.value,
cyl_std = sd(cyl),
cyl_std_prob = varTest(cyl, sigma.squared = df_mu_sigma$cyl_std^2)$p.value,
disp_mean = mean(disp),
disp_mean_prob = t.test(disp, mu = df_mu_sigma$disp_mean)$p.value,
disp_std = sd(disp),
disp_std_prob = varTest(disp, sigma.squared = df_mu_sigma$disp_std^2)$p.value,
hp_mean = mean(hp),
hp_mean_prob = t.test(hp, mu = df_mu_sigma$hp_mean)$p.value,
hp_std = sd(hp),
hp_std_prob = varTest(hp, sigma.squared = df_mu_sigma$hp_std^2)$p.value
)
vars_num <- names(df_vars)
library(glue)
df_sumry2 <- df_vars %>%
summarize(
across(
all_of(vars_num),
list(mean = mean,
mean_prob = function(x) {
mu_name <- glue("{ensym(x)}_mean")
t.test(x, mu = df_mu_sigma[[mu_name]])$p.value
},
std = sd,
std_prob = function(x) {
sigma_name <- glue("{ensym(x)}_std")
varTest(x, sigma.squared = df_mu_sigma[[sigma_name]]^2)$p.value
}
)
)
)
all.equal(df_sumry, df_sumry2)
This is not much better than your solution, but I would use cur_column() instead of ensym() to avoid having to handle quosures.
Also, putting the query in a separate function makes things a bit tidier.
Finally, I would use lambda functions instead of anonymous functions for clarity.
# Look up the reference value for the column that across() is currently
# processing.
#
# suffix: string appended to the current column name (e.g. "_mean" or "_std")
#         to build the name of the entry to read.
# params: list or data frame holding the reference values; defaults to the
#         df_mu_sigma object visible where get_mu() is defined, so existing
#         calls such as get_mu("_mean") keep working unchanged.
#
# Must be called from inside across(), because it relies on cur_column().
get_mu <- function(suffix, params = df_mu_sigma) {
  params[[paste0(cur_column(), suffix)]] # you could use glue() as well here
}
df_vars %>%
summarize(
across(
all_of(vars_num),
list(
mean = mean,
mean_prob = ~t.test(.x, mu = get_mu("_mean"))$p.value,
std = sd,
std_prob = ~varTest(.x, sigma.squared = get_mu("_std")^2)$p.value
)
)
) %>% t() #just to format the output
# [,1]
# mpg_mean 20.09062500
# mpg_mean_prob 0.01808550
# mpg_std 6.02694805
# mpg_std_prob 0.96094601
# cyl_mean 6.18750000
# cyl_mean_prob 0.10909740
# cyl_std 1.78592165
# cyl_std_prob 0.77092484
# disp_mean 230.72187500
# disp_mean_prob 0.17613878
# disp_std 123.93869383
# disp_std_prob 0.96381507
# hp_mean 146.68750000
# hp_mean_prob 0.03914858
# hp_std 68.56286849
# hp_std_prob 0.03459963

How can I loop different variables to the same command

I am trying to loop different variables into the same command:
Following is the list of variables and values I want to loop
behavior_list <- c("knocked1", "questions1", ...)
answer_list <- c(0, 1)
answer_label_list <- c("Yes", "No")
Following is the command:
data_aliki %>%
group_by(indicator) %>%
summarise(
total_indicator = n(),
yes_knocked1 = sum(knocked1==1, na.rm = TRUE)
)
I am trying to loop
yes_knocked1 = sum(knocked1==1, na.rm = TRUE)
no_knocked1 = sum(knocked1==0, na.rm = TRUE)
yes_questions1 = sum(questions1==1, na.rm = TRUE)
no_questions1 = sum(questions1==0, na.rm = TRUE)
Is there an easier way to do this instead of copy and paste?
You did not provide a reproducible example, so I will illustrate how to achieve what you want in dplyr for the mtcars data set:
mtcars %>% group_by(cyl) %>%
summarize_at(c("mpg","hp"), list("lt15" = ~sum(. < 15, na.rm = TRUE),
"lt18" = ~sum(. < 18, na.rm = TRUE)))
Output
cyl mpg_lt15 hp_lt15 mpg_lt18 hp_lt18
<dbl> <int> <int> <int> <int>
1 4 0 0 0 0
2 6 0 0 1 0
3 8 5 0 12 0
This should work in your case:
# For each `indicator` group, count how many values equal 1 ("yes") and how
# many equal 0 ("no") in every listed behaviour column. The outputs are named
# <column>_yes and <column>_no.
data_aliki %>%
  group_by(indicator) %>%
  summarize_at(
    c("knocked1", "questions1"),
    list(
      "yes" = ~sum(. == 1, na.rm = TRUE),
      "no" = ~sum(. == 0, na.rm = TRUE)
    )
  )

How to Add Column Totals to Grouped Summaries in R

I'm in the process of creating summary tables based on subgroups and would love to add an overall summary in a tidier/more efficient manner.
What I have so far is this. I've created summaries via levels within my factor variables.
library(tidyverse)
df <- data.frame(var1 = 10:18,
var2 = c("A","B","A","B","A","B","A","B","A"))
group_summary <- df %>% group_by(var2) %>%
filter(var2 != "NA") %>%
summarise("Max" = max(var1, na.rm = TRUE),
"Median" = median(var1, na.rm = TRUE),
"Min" = min(var1, na.rm = TRUE),
"IQR" = IQR(var1, na.rm = TRUE),
"Count" = n())
Next I created an overall summary.
Summary <- df %>%
filter(var2 != "NA") %>%
summarise("Max" = max(var1, na.rm = TRUE),
"Median" = median(var1, na.rm = TRUE),
"Min" = min(var1, na.rm = TRUE),
"IQR" = IQR(var1, na.rm = TRUE),
"Count" = n())
Finally, I bound the two objects with dplyr::bind_rows
complete_summary <- bind_rows(Summary, group_summary)
What I've done works but it is very, very verbose and can't be the most efficient way. I tried to use ungroup
group_summary <- df %>% group_by(var2) %>%
filter(var2 != "NA") %>%
summarise("Max" = max(var1, na.rm = TRUE),
"Median" = median(var1, na.rm = TRUE),
"Min" = min(var1, na.rm = TRUE),
"IQR" = IQR(var1, na.rm = TRUE),
"Count" = n()) %>% ungroup %>%
summarise("Max" = max(var1, na.rm = TRUE),
"Median" = median(var1, na.rm = TRUE),
"Min" = min(var1, na.rm = TRUE),
"IQR" = IQR(var1, na.rm = TRUE),
"Count" = n())
but it threw an error:
Evaluation error: object var1 not found.
Thanks in advance for your assistance.
Ideally, if you want to do it in one chain, you can use bind_rows to combine both results, just as you've done, but without the temporary objects you created.
library(tidyverse)
#> Warning: package 'tibble' was built under R version 3.5.2
df <- data.frame(var1 = 10:18,
var2 = c("A","B","A","B","A","B","A","B","A"))
df %>% group_by(var2) %>%
filter(var2 != "NA") %>%
summarise("Max" = max(var1, na.rm = TRUE),
"Median" = median(var1, na.rm = TRUE),
"Min" = min(var1, na.rm = TRUE),
"IQR" = IQR(var1, na.rm = TRUE),
"Count" = n()) %>% #ungroup() %>%
bind_rows( df %>% summarise("Max" = max(var1, na.rm = TRUE),
"Median" = median(var1, na.rm = TRUE),
"Min" = min(var1, na.rm = TRUE),
"IQR" = IQR(var1, na.rm = TRUE),
"Count" = n()))
#> # A tibble: 3 x 6
#> var2 Max Median Min IQR Count
#> <fct> <dbl> <dbl> <dbl> <dbl> <int>
#> 1 A 18 14 10 4 5
#> 2 B 17 14 11 3 4
#> 3 <NA> 18 14 10 4 9
Created on 2019-01-29 by the reprex package (v0.2.1)
Not the most elegant solution either, but simple:
c <- mtcars %>%
mutate(total_mean = mean(wt),
total_median = median(wt)) %>%
group_by(cyl) %>%
summarise(meanweight = mean(wt),
medianweight = median(wt),
total_mean = first(total_mean),
total_median = first(total_median))

Normalising data with dplyr mutate() brings inconsistencies

I'm trying to reproduce the framework from this blogpost http://www.luishusier.com/2017/09/28/balance/ with the following code but it looks like I get inconsistent results
library(tidyverse)
library(magrittr)
ids <- c("1617", "1516", "1415", "1314", "1213", "1112", "1011", "0910", "0809", "0708", "0607", "0506")
data <- ids %>%
map(function(i) {read_csv(paste0("http://www.football-data.co.uk/mmz4281/", i ,"/F1.csv")) %>%
select(Date:AST) %>%
mutate(season = i)})
data <- bind_rows(data)
data <- data[complete.cases(data[ , 1:3]), ]
tmp1 <- data %>%
select(season, HomeTeam, FTHG:FTR,HS:AST) %>%
rename(BP = FTHG,
BC = FTAG,
TP = HS,
TC = AS,
TCP = HST,
TCC = AST,
team = HomeTeam)%>%
mutate(Pts = ifelse(FTR == "H", 3, ifelse(FTR == "A", 0, 1)),
Terrain = "Domicile")
tmp2 <- data %>%
select(season, AwayTeam, FTHG:FTR, HS:AST) %>%
rename(BP = FTAG,
BC = FTHG,
TP = AS,
TC = HS,
TCP = AST,
TCC = HST,
team = AwayTeam)%>%
mutate(Pts = ifelse(FTR == "A", 3 ,ifelse(FTR == "H", 0 , 1)),
Terrain = "Extérieur")
tmp3 <- bind_rows(tmp1, tmp2)
l1_0517 <- tmp3 %>%
group_by(season, team)%>%
summarise(j = n(),
pts = sum(Pts),
diff_but = (sum(BP) - sum(BC)),
diff_t_ca = (sum(TCP, na.rm = T) - sum(TCC, na.rm = T)),
diff_t = (sum(TP, na.rm = T) - sum(TC, na.rm = T)),
but_p = sum(BP),
but_c = sum(BC),
tir_ca_p = sum(TCP, na.rm = T),
tir_ca_c = sum(TCC, na.rm = T),
tir_p = sum(TP, na.rm = T),
tir_c = sum(TC, na.rm = T)) %>%
arrange((season), desc(pts), desc(diff_but))
Then I apply the framework mentioned above:
l1_0517 <- l1_0517 %>%
mutate(
# First, see how many goals the team scores relative to the average
norm_attack = but_p %>% divide_by(mean(but_p)) %>%
# Then, transform it into an unconstrained scale
log(),
# First, see how many goals the team concedes relative to the average
norm_defense = but_c %>% divide_by(mean(but_c)) %>%
# Invert it, so a higher defense is better
raise_to_power(-1) %>%
# Then, transform it into an unconstrained scale
log(),
# Now that we have normalized attack and defense ratings, we can compute
# measures of quality and attacking balance
quality = norm_attack + norm_defense,
balance = norm_attack - norm_defense
) %>%
arrange(desc(norm_attack))
When I look at the column norm_attack, I expect to find the same value for equivalent but_p values, which is not the case here:
head(l1_0517, 10)
for instance when but_p has value 83, row 5 and row 7, I get norm_attack at 0.5612738 and 0.5128357 respectively.
Is it normal? I would expect mean(l1_0517$but_p) to be fixed and therefore obtaining the same result when a value of l1_0517$but_p is log normalised?
UPDATE
I have tried to work on a simpler example but I can't reproduce this issue:
df <- tibble(a = as.integer(runif(200, 15, 100)))
df <- df %>%
mutate(norm_a = a %>% divide_by(mean(a)) %>%
log())
I found the solution after looking at the type of l1_0517
It is a grouped_df hence the different results.
The correct code is:
# Aggregate the match rows per season and team, then drop the grouping that
# summarise() leaves behind so that later mutate() calls (e.g. mean(but_p))
# are computed over the whole table instead of within the remaining groups.
l1_0517 <- tmp3 %>%
  group_by(season, team) %>%
  summarise(
    j = n(),
    pts = sum(Pts),
    diff_but = sum(BP) - sum(BC),
    # Spell out TRUE: the shorthand T is an ordinary variable that can be
    # reassigned.
    diff_t_ca = sum(TCP, na.rm = TRUE) - sum(TCC, na.rm = TRUE),
    diff_t = sum(TP, na.rm = TRUE) - sum(TC, na.rm = TRUE),
    but_p = sum(BP),
    but_c = sum(BC),
    tir_ca_p = sum(TCP, na.rm = TRUE),
    tir_ca_c = sum(TCC, na.rm = TRUE),
    tir_p = sum(TP, na.rm = TRUE),
    tir_c = sum(TC, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  arrange(season, desc(pts), desc(diff_but))

dplyr pipe multiple datasets to summarize()

I am making a table using dplyr. I want to perform the same "summarize" command on multiple datasets. I know in ggplot2, you can just change out the dataset and rerun the plot, which is cool.
here's what I want to avoid:
table_1 <-
group_by(df_1, boro) %>%
summarize(n_units = n(),
mean_rent = mean(rent_numeric, na.rm = TRUE),
sd_rend = sd(rent_numeric,na.rm = TRUE),
median_rent = median(rent_numeric, na.rm = TRUE),
mean_bedrooms = mean(bedrooms_numeric, na.rm = TRUE),
sd_bedrooms = sd(bedrooms_numeric, na.rm = TRUE),
mean_sqft = mean(sqft, na.rm = TRUE),
sd_sqft = sd(sqft, na.rm = TRUE),
n_broker = sum(ob=="broker"),
pr_broker = n_broker/n_units)
table_2 <-
group_by(df_2, boro) %>%
summarize(n_units = n(),
mean_rent = mean(rent_numeric, na.rm = TRUE),
sd_rend = sd(rent_numeric,na.rm = TRUE),
median_rent = median(rent_numeric, na.rm = TRUE),
mean_bedrooms = mean(bedrooms_numeric, na.rm = TRUE),
sd_bedrooms = sd(bedrooms_numeric, na.rm = TRUE),
mean_sqft = mean(sqft, na.rm = TRUE),
sd_sqft = sd(sqft, na.rm = TRUE),
n_broker = sum(ob=="broker"),
pr_broker = n_broker/n_units)
Basically, is there a way to set up the summarize command as a function or something maybe so I can just pour in df_1 and df_2?
If you know all the variable names in advance and if they are the same in all the data sets you want to look at, you can just do something like:
# Summarise a data frame by `cyl`, returning the number of rows (`n`) and the
# mean of `hp` (`mean_hp`) for each group. `df` must contain the columns
# `cyl` and `hp`.
myfunc <- function(df) {
  by_cyl <- group_by(df, cyl)
  summarize(by_cyl, n = n(), mean_hp = mean(hp))
}
myfunc(mtcars)
#Source: local data frame [3 x 3]
#
# cyl n mean_hp
#1 4 11 82.63636
#2 6 7 122.28571
#3 8 14 209.21429
And then use it with a different data set (that would have the same structure and variable names). If you need flexibility, i.e. you don't know all the variables in advance and want to be able to specify them as input to the function, look at the dplyr non-standard evaluation vignette.
Here's just a tiny example of how you could implement "standard evaluation" into your function to allow for more flexibility. Consider if you wanted to allow the user of the function to specify by which column the data should be grouped, you could do:
# Summarise `df` by the column whose name is passed as a string in `grp`,
# returning the number of rows (`n`) and the mean of `hp` (`mean_hp`) per
# group. `df` must contain `hp` and the column named by `grp`.
myfunc <- function(df, grp) {
  df %>%
    # group_by_() is deprecated; across(all_of(grp)) selects the grouping
    # column from the character value in `grp` and keeps its original name.
    group_by(across(all_of(grp))) %>%
    summarize(n = n(),
              mean_hp = mean(hp))
}
and then use it:
myfunc(mtcars, "gear")
#Source: local data frame [3 x 3]
#
# gear n mean_hp
#1 3 15 176.1333
#2 4 12 89.5000
#3 5 5 195.6000
myfunc(mtcars, "cyl")
#Source: local data frame [3 x 3]
#
# cyl n mean_hp
#1 4 11 82.63636
#2 6 7 122.28571
#3 8 14 209.21429
The %>% operator just passes on a tbl object as the first parameter to the next function. And summarize just expects a tbl. So you can define
# Apply one fixed set of rental summary statistics to `.data`. Grouping is
# left to the caller, so the same summary can be reused on any table that has
# the columns rent_numeric, bedrooms_numeric, sqft and ob.
mysummary <- function(.data) {
  .data %>%
    summarize(
      # Row count.
      n_units = n(),
      # Rent: mean, standard deviation and median.
      mean_rent = mean(rent_numeric, na.rm = TRUE),
      sd_rend = sd(rent_numeric, na.rm = TRUE),
      median_rent = median(rent_numeric, na.rm = TRUE),
      # Bedrooms: mean and standard deviation.
      mean_bedrooms = mean(bedrooms_numeric, na.rm = TRUE),
      sd_bedrooms = sd(bedrooms_numeric, na.rm = TRUE),
      # Square footage: mean and standard deviation.
      mean_sqft = mean(sqft, na.rm = TRUE),
      sd_sqft = sd(sqft, na.rm = TRUE),
      # Number of rows where ob is "broker", and that number over the row count.
      n_broker = sum(ob == "broker"),
      pr_broker = n_broker / n_units
    )
}
And then call
table_1 <- group_by(df_1, boro) %>% mysummary
table_2 <- group_by(df_2, boro) %>% mysummary
With an actual working example
# Working example of a reusable summary: mean of mpg and mean of hp for
# `.data`. Grouping is left to the caller.
mysummary <- function(.data) {
  .data %>%
    summarize(ave.mpg = mean(mpg), ave.hp = mean(hp))
}
mtcars %>% group_by(cyl) %>% mysummary
mtcars %>% group_by(gear) %>% mysummary

Resources