Simplifying tidyeval with multiple symbols in R

The following function behaves as desired: several variables can be passed to group_by without having to wrap them in alist() or dplyr::vars():
library(tidyverse)
library(rlang)

mean_by_grp <- function(df, meanvar, grp) {
  grouping <- enexpr(grp) %>%
    expr_deparse() %>%
    str_split(",", simplify = TRUE) %>% `[`(1, ) %>%
    map(str_remove, "c\\(") %>% map(str_remove, "\\)") %>% map(str_trim) %>%
    unlist() %>% syms()
  df %>%
    group_by(!!!grouping) %>%
    summarise("average_{{meanvar}}" := mean({{meanvar}}, na.rm = TRUE),
              .groups = 'drop')
}
starwars %>% mean_by_grp(height, species)
starwars %>% mean_by_grp(height, c(species, homeworld))
However, it is complicated: I need to turn c(var1, ..., varn) into a string, split it, and turn the pieces back into a list of symbols so I can use them with syms().
Isn't there a much easier way to do this?
Of course, I could use the ellipsis (...) instead of grp, but then only one argument could pass multiple symbols on to another function.
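(For reference, a sketch of how the string round trip could be avoided while keeping a single grp argument: inspect the captured expression and, if it is a c(...) call, take its arguments directly. mean_by_grp2 is a hypothetical name and this is only an illustration, not the approach used in the answer below.)
library(dplyr)
library(rlang)

mean_by_grp2 <- function(df, meanvar, grp) {
  grp <- enexpr(grp)
  # if grp is a call like c(species, homeworld), its arguments are already symbols
  grouping <- if (is_call(grp, "c")) as.list(grp)[-1] else list(grp)
  df %>%
    group_by(!!!grouping) %>%
    summarise("average_{{meanvar}}" := mean({{meanvar}}, na.rm = TRUE),
              .groups = "drop")
}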

One option would be dplyr::across:
library(dplyr)

mean_by_grp <- function(df, meanvar, grp) {
  df %>%
    group_by(across({{ grp }})) %>%
    summarise("average_{{meanvar}}" := mean({{meanvar}}, na.rm = TRUE),
              .groups = 'drop')
}
starwars %>% mean_by_grp(height, species)
#> # A tibble: 38 × 2
#> species average_height
#> <chr> <dbl>
#> 1 Aleena 79
#> 2 Besalisk 198
#> 3 Cerean 198
#> 4 Chagrian 196
#> 5 Clawdite 168
#> 6 Droid 131.
#> 7 Dug 112
#> 8 Ewok 88
#> 9 Geonosian 183
#> 10 Gungan 209.
#> # … with 28 more rows
starwars %>% mean_by_grp(height, c(species, homeworld))
#> # A tibble: 58 × 3
#> species homeworld average_height
#> <chr> <chr> <dbl>
#> 1 Aleena Aleen Minor 79
#> 2 Besalisk Ojom 198
#> 3 Cerean Cerea 198
#> 4 Chagrian Champala 196
#> 5 Clawdite Zolan 168
#> 6 Droid Naboo 96
#> 7 Droid Tatooine 132
#> 8 Droid <NA> 148
#> 9 Dug Malastare 112
#> 10 Ewok Endor 88
#> # … with 48 more rows
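Since across() takes tidyselect input, the same function should also accept a character vector of column names via all_of(), which can be handy when the grouping columns are computed elsewhere (a small usage sketch, not part of the original answer; grp_vars is a hypothetical variable):
grp_vars <- c("species", "homeworld")
starwars %>% mean_by_grp(height, all_of(grp_vars))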

Related

Most frequent phrases from text data in R

Does anyone here have experience in identifying the most common phrases (3 to 7 consecutive words)? I understand that most frequency analyses focus on the most frequent/common words (along with plotting a word cloud) rather than phrases.
# Assume a particular column in a data frame (df) with n rows that is all text data;
# I can't provide sample data here, as using dput() on a large text file isn't feasible.
library(tm)

Text <- df$Text_Column
docs <- Corpus(VectorSource(Text))
...
Thanks in advance!
You have several options to do this in R. Let's grab some data first: I use the books by Jane Austen from the janeaustenr package and do some cleaning so that each paragraph ends up in a separate row:
library(janeaustenr)
library(tidyverse)
books <- austen_books() %>%
  mutate(paragraph = cumsum(text == "" & lag(text) != "")) %>%
  group_by(paragraph) %>%
  summarise(book = head(book, 1),
            text = trimws(paste(text, collapse = " ")),
            .groups = "drop")
With tidytext:
library(tidytext)
map_df(3L:7L, ~ unnest_tokens(books, ngram, text, token = "ngrams", n = .x)) %>% # using multiple values for n is not directly implemented in tidytext
  count(ngram) %>%
  filter(!is.na(ngram)) %>%
  slice_max(n, n = 10)
#> # A tibble: 10 × 2
#> ngram n
#> <chr> <int>
#> 1 i am sure 415
#> 2 i do not 412
#> 3 she could not 328
#> 4 it would be 258
#> 5 in the world 247
#> 6 as soon as 236
#> 7 a great deal 214
#> 8 would have been 211
#> 9 she had been 203
#> 10 it was a 202
With quanteda:
library(quanteda)
books %>%
  corpus(docid_field = "paragraph",
         text_field = "text") %>%
  tokens(remove_punct = TRUE,
         remove_symbols = TRUE) %>%
  tokens_ngrams(n = 3L:7L) %>%
  dfm() %>%
  topfeatures(n = 10) %>%
  enframe()
#> # A tibble: 10 × 2
#> name value
#> <chr> <dbl>
#> 1 i_am_sure 415
#> 2 i_do_not 412
#> 3 she_could_not 328
#> 4 it_would_be 258
#> 5 in_the_world 247
#> 6 as_soon_as 236
#> 7 a_great_deal 214
#> 8 would_have_been 211
#> 9 she_had_been 203
#> 10 it_was_a 202
With text2vec:
library(text2vec)
itoken(books$text, tolower, word_tokenizer) %>%
  create_vocabulary(ngram = c(3L, 7L), sep_ngram = " ") %>%
  filter(str_detect(term, "[[:alpha:]]")) %>% # keep terms with at least one alphabetic character
  slice_max(term_count, n = 10)
#> Number of docs: 10293
#> 0 stopwords: ...
#> ngram_min = 3; ngram_max = 7
#> Vocabulary:
#> term term_count doc_count
#> 1: i am sure 415 384
#> 2: i do not 412 363
#> 3: she could not 328 288
#> 4: it would be 258 233
#> 5: in the world 247 234
#> 6: as soon as 236 233
#> 7: a great deal 214 209
#> 8: would have been 211 192
#> 9: she had been 203 179
#> 10: it was a 202 194
Created on 2022-08-03 by the reprex package (v2.0.1)

group_by and summaries with variable number of variables

Using the {{var}} notation, the following code works.
The variables to be used for grouping and for summarizing can be given as parameters to my_summary.
I would like to modify my_summary so that I can give a varying number of variables for both grouping and summarizing. Can this be done?
suppressPackageStartupMessages({
library(tidyverse)
})
set.seed(4321)
demo_df <-
  tibble(age = as.integer(rep(c(10, 20), each = 10)),
         gender = rep(c("f", "m"), 10),
         weight = rnorm(20, 70, 7),
         size = rnorm(20, 160, 15))

my_summary <- function(df_in, group_var, summary_var){
  df_in |>
    group_by({{group_var}}) |>
    summarise_at(vars({{summary_var}}), mean)
}
my_summary(demo_df,gender,weight)
Another possible solution, allowing for multiple grouping variables:
library(tidyverse)
my_summary <- function(df_in, group_var, summary_var){
  df_in %>%
    group_by(!!!group_var) %>%
    summarise(across({{summary_var}}, mean), .groups = "drop")
}
my_summary(demo_df, vars(age,gender), c(weight,size))
#> # A tibble: 4 × 4
#> age gender weight size
#> <int> <chr> <dbl> <dbl>
#> 1 10 f 71.5 159.
#> 2 10 m 72.4 158.
#> 3 20 f 64.3 167.
#> 4 20 m 71.6 164.
Alternatively, without vars() (which may be superseded):
library(tidyverse)
my_summary <- function(df_in, summary_var, ...){
  summary_var <- enquos(summary_var)
  group_var <- enquos(...)
  df_in %>%
    group_by(!!!group_var) %>%
    summarise(across(!!!summary_var, mean), .groups = "drop")
}
my_summary(demo_df, c(weight, size), age, gender)
#> # A tibble: 4 × 4
#> age gender weight size
#> <int> <chr> <dbl> <dbl>
#> 1 10 f 71.5 159.
#> 2 10 m 72.4 158.
#> 3 20 f 64.3 167.
#> 4 20 m 71.6 164.
Use summarise(across(.)).
suppressPackageStartupMessages({
library(tidyverse)
})
set.seed(4321)
demo_df <-
  tibble(age = as.integer(rep(c(10, 20), each = 10)),
         gender = rep(c("f", "m"), 10),
         weight = rnorm(20, 70, 7),
         size = rnorm(20, 160, 15))

my_summary <- function(df_in, group_var, summary_var){
  df_in |>
    group_by({{group_var}}) |>
    summarise(across({{summary_var}}, mean))
}
my_summary(demo_df, gender, weight:size)
#> # A tibble: 2 × 3
#> gender weight size
#> <chr> <dbl> <dbl>
#> 1 f 67.9 163.
#> 2 m 72.0 161.
Created on 2022-06-09 by the reprex package (v2.0.1)
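If the variable sets are more naturally available as character vectors (for example, built programmatically), a variant that wraps them in all_of() inside across() should work as well. A sketch under that assumption (my_summary_chr is a hypothetical name):
my_summary_chr <- function(df_in, group_var, summary_var){
  df_in |>
    group_by(across(all_of(group_var))) |>
    summarise(across(all_of(summary_var), mean), .groups = "drop")
}
my_summary_chr(demo_df, c("age", "gender"), c("weight", "size"))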

Iterating over values and variable names in dplyr::summarise

I'm using the following script to make a table in R:
library(dplyr)
library(tidyr)
get_probability <- function(parameter_array, threshold) {
  return(round(100 * sum(parameter_array >= threshold) /
                 length(parameter_array)))
}

thresholds <- c(75, 100, 125)

mtcars %>%
  group_by(gear) %>%
  dplyr::summarise(
    low = get_probability(disp, thresholds[[1]]),
    medium = get_probability(disp, thresholds[[2]]),
    high = get_probability(disp, thresholds[[3]])
  )
The table that comes out is the following:
# A tibble: 3 x 4
gear low medium high
<dbl> <dbl> <dbl> <dbl>
1 3 100 100 93
2 4 92 67 50
3 5 100 80 60
My question is: how can I condense what I have passed to summarise into a single line? That is, is there a way to iterate over the thresholds vector while also passing custom variable names?
In recent versions of dplyr, summarise will auto-splice data.frames created within it into new columns. So, you just need a way to iterate over thresholds to create a data.frame.
One option is purrr::map_dfc().
library(dplyr, warn.conflicts = FALSE)

get_probability <- function(parameter_array, threshold) {
  return(round(100 * sum(parameter_array >= threshold) /
                 length(parameter_array)))
}

thresholds <- c(75, 100, 125)
thresholds <- setNames(thresholds, c('low', 'medium', 'high'))

mtcars %>%
  group_by(gear) %>%
  summarise(purrr::map_dfc(thresholds, ~ get_probability(disp, .x)))
#> # A tibble: 3 × 4
#> gear low medium high
#> <dbl> <dbl> <dbl> <dbl>
#> 1 3 100 100 93
#> 2 4 92 67 50
#> 3 5 100 80 60
If you prefer not to use an extra package, though, you could just use lapply and then convert the output to a data.frame. (Replace \(x) with function(x) in older versions of R.)
mtcars %>%
  group_by(gear) %>%
  summarise(as.data.frame(lapply(thresholds, \(x) get_probability(disp, x))))
#> # A tibble: 3 × 4
#> gear low medium high
#> <dbl> <dbl> <dbl> <dbl>
#> 1 3 100 100 93
#> 2 4 92 67 50
#> 3 5 100 80 60
Created on 2021-08-17 by the reprex package (v2.0.1)
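Another pattern that can work, if you are on dplyr >= 1.1.0, is to compute the probabilities in long format with reframe() and then widen the result; this keeps the thresholds and their names in a single named vector (a sketch, not from the original answer):
library(dplyr)
library(tidyr)
library(purrr)

thresholds <- c(low = 75, medium = 100, high = 125)

mtcars %>%
  group_by(gear) %>%
  reframe(level = names(thresholds),
          prob  = map_dbl(thresholds, ~ get_probability(disp, .x))) %>%
  pivot_wider(names_from = level, values_from = prob)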

How can I remove whitespace from a column in pipe?

I am trying to as.integer() the 2nd column. This doesn't work because of the whitespace. Unfortunately, all attempts to remove that whitespace have failed, too. How is this done?
library(tidyverse)
library(rvest)

url <- "https://www.destatis.de/DE/Themen/Staat/Oeffentliche-Finanzen/Schulden-Finanzvermoegen/Tabellen/03-2021-vorlaufiger-schuldenstand-laender.html"
content <- rvest::read_html(url)

content %>%
  html_table(fill = TRUE) %>%
  pluck(1) %>%
  `[`(4:19, 1:2) %>%
  rename(Schulden = 2)
You could remove the whitespace via e.g. stringr::str_remove() before converting to numeric:
library(tidyverse)
library(rvest)

url <- "https://www.destatis.de/DE/Themen/Staat/Oeffentliche-Finanzen/Schulden-Finanzvermoegen/Tabellen/03-2021-vorlaufiger-schuldenstand-laender.html"
content <- rvest::read_html(url)

content %>%
  html_table(fill = TRUE) %>%
  pluck(1) %>%
  `[`(4:19, 1:2) %>%
  rename(Schulden = 2) %>%
  mutate(Schulden = as.numeric(str_remove(Schulden, "\\s")))
#> # A tibble: 16 x 2
#> Länder Schulden
#> <chr> <dbl>
#> 1 Baden-Württemberg 47529
#> 2 Bayern 17361
#> 3 Brandenburg 18836
#> 4 Hessen 45269
#> 5 Mecklenburg-Vorpommern 8492
#> 6 Niedersachsen 66540
#> 7 Nordrhein-Westfalen 184481
#> 8 Rheinland-Pfalz 31428
#> 9 Saarland 14489
#> 10 Sachsen 4992
#> 11 Sachsen-Anhalt 22038
#> 12 Schleswig-Holstein 30908
#> 13 Thüringen 16056
#> 14 Berlin 62025
#> 15 Bremen 34280
#> 16 Hamburg 35216
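One caveat (not from the original answer): str_remove() only strips the first whitespace match, and tables scraped from HTML sometimes use a non-breaking space (U+00A0) as the thousands separator. A more defensive variant would use str_remove_all() with a character class covering both, e.g.:
library(stringr)
x <- c("47 529", "184\u00a0481")  # embedded regular and non-breaking spaces
as.numeric(str_remove_all(x, "[\\s\u00a0]"))
#> [1]  47529 184481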

How to bin the summarised frequency table with dplyr

I have the following data frame:
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
df <- nycflights13::flights %>%
  select(distance) %>%
  group_by(distance) %>%
  summarise(n = n()) %>%
  arrange(distance) %>%
  ungroup()
df
#> # A tibble: 214 x 2
#> distance n
#> <dbl> <int>
#> 1 17 1
#> 2 80 49
#> 3 94 976
#> 4 96 607
#> 5 116 443
#> 6 143 439
#> 7 160 376
#> 8 169 545
#> 9 173 221
#> 10 184 5504
#> # … with 204 more rows
What I want to do is to bin the distance column into bins of size 100 and sum the n column accordingly.
How can I do that?
So that I get something like:
bin_distance sum_n
1-100 1633 #(1 + 49 + 976 + 607)
101-200 21344 # (443 + ... + 5327)
#etc
The simplest approach would be to use cut, creating groups with seq for every 100 values and summing the n values for each group.
library(dplyr)
df %>%
  group_by(group = cut(distance, breaks = seq(0, max(distance), 100))) %>%
  summarise(n = sum(n))
# group n
# <fct> <int>
# 1 (0,100] 1633
# 2 (100,200] 21344
# 3 (200,300] 28310
# 4 (300,400] 7748
# 5 (400,500] 21292
# 6 (500,600] 26815
# 7 (600,700] 7846
# 8 (700,800] 48904
# 9 (800,900] 7574
#10 (900,1e+03] 18205
# ... with 17 more rows
which can be translated to base R using aggregate like
aggregate(n ~ distance,
transform(df, distance = cut(distance, breaks = seq(0, max(distance), 100))), sum)
A different tidyverse solution: it closely follows the logic of @Ronak Shah's code, but instead of cut() it uses cut_width() from ggplot2:
nycflights13::flights %>%
  select(distance) %>%
  group_by(ints = cut_width(distance, width = 100, boundary = 0)) %>%
  summarise(n = n())
ints n
<fct> <int>
1 [0,100] 1633
2 (100,200] 21344
3 (200,300] 28310
4 (300,400] 7748
5 (400,500] 21292
6 (500,600] 26815
7 (600,700] 7846
8 (700,800] 48904
9 (800,900] 7574
10 (900,1e+03] 18205
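If you specifically want labels like 1-100 and a sum_n column, as in the desired output shown in the question, you can build the labels from the break points yourself; a sketch along those lines (using the df from the question):
library(dplyr)

breaks <- seq(0, ceiling(max(df$distance) / 100) * 100, by = 100)
labels <- paste(head(breaks, -1) + 1, breaks[-1], sep = "-")  # "1-100", "101-200", ...

df %>%
  group_by(bin_distance = cut(distance, breaks = breaks, labels = labels)) %>%
  summarise(sum_n = sum(n))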

Resources