Dynamically change the column name created using summarise() and complete() - r

I'm trying to dynamically create an extra column. The first piece of code works as i want it to:
library(dplyr)
library(tidyr)
set.seed(1)
df <- data.frame(animals = sample(c('dog', 'cat', 'rat'), 100, replace = T))
my_fun <- function(data, column_name){
data %>% group_by(animals) %>%
summarise(!!column_name := n())
}
my_fun(df, 'frequency')
Here i also use the complete function and it doesn't work:
library(dplyr)
set.seed(1)
df <- data.frame(animals = sample(c('dog', 'cat', 'rat'), 100, replace = T))
my_fun <- function(data, column_name){
data %>% group_by(animals) %>%
summarise(!!column_name := n())%>%
ungroup() %>%
complete(animals = c('dog', 'cat', 'rat', 'bat'),
fill = list(!!column_name := 0))
}
my_fun(df, 'frequency')
The list function doesn't seem to like !!column_name :=
Is there something i can do to make this work? Basically i want the second piece of code to output:
animals frequency
bat 0
cat 38
dog 27
rat 35

You could keep the fill argument of complete() as the default (which will give you the missing values as NA) and subsequently replace them with 0:
my_fun <- function(data, column_name){
data %>%
group_by(animals) %>%
summarise(!!column_name := n())%>%
ungroup() %>%
complete(animals = c('dog', 'cat', 'rat', 'bat')) %>%
mutate_all(~replace(., is.na(.), 0))
}

Related

How to combine lapply with dplyr in a function

Below is a sample data frame that I have created along with the expected output.
df = data.frame(color = c("Yellow", "Blue", "Green", "Red", "Magenta"),
values = c(24, 24, 34, 45, 49),
Quarter = c("Period1","Period2" , "Period3", "Period3", "Period1"),
Market = c("Camden", "StreetA", "DansFireplace", "StreetA", "DansFireplace"))
dfXQuarter = df %>% group_by(Quarter) %>% summarise(values = sum(values)) %>%
mutate(cut = "Quarter") %>% data.frame()
colnames(dfXQuarter)[1] = "Grouping"
dfXMarket = df %>% group_by(Market) %>% summarise(values = sum(values)) %>%
mutate(cut = "Market")%>% data.frame()
colnames(dfXMarket)[1] = "Grouping"
df_all = rbind(dfXQuarter, dfXMarket)
Now I for the sake brevity I want to compile this into a function and using lapply.
Below is my attempt at the same-
list = c("Market", "Quarter")
df_all <- do.call(rbind, lapply(list, function(x){
df_l= df %>% group_by(x) %>%
summarise(values = sum(values)) %>%
mutate(cut= x) %>%
data.frame()
colnames(df_l)[df_l$x] = "Grouping"
df_l
}))
This block of code is giving me error.
I need the output to be the exact replica of the 'df_all' output for further operations.
How I do write this function correctly?
We can use purrr::map_dfr
library(dplyr)
library(purrr)
#Don't use the R build-in type e.g. list in variables name
lst <- c("Market", "Quarter")
#Use map if you need the output as a list
map_dfr(lst, ~df %>% group_by("Grouping"=!!sym(.x)) %>%
summarise(values = sum(values)) %>%
mutate(cut = .x) %>%
#To avoid the warning massage from bind_rows
mutate_if(is.factor, as.character))
# A tibble: 6 x 3
Grouping values cut
<chr> <dbl> <chr>
1 Camden 24 Market
2 DansFireplace 83 Market
3 StreetA 69 Market
4 Period1 73 Quarter
5 Period2 24 Quarter
6 Period3 79 Quarter
We can fix the first solution by
change group_by(x) to group_by_at(x), since x is a string here.
Use colnames(df_l)[colnames(df_l)==x] <- "Grouping" in naming the grouping variable.
Not pretty but works and doesn't require tidy functions:
groupwise_summation <- function(df, grouping_vecs){
# Split, apply, combine:
tmpdf <- do.call(rbind, lapply(split(df, df[,grouping_vecs]), function(x){sum(x$values)}))
# Clean up the df:
data.frame(cbind(cut = row.names(tmpdf), value = as.numeric(tmpdf)), row.names = NULL)
}
# Apply and combine:
df_all <- rbind(groupwise_summation(df, c("Quarter")), groupwise_summation(df, c("Market")))
# Note inside the c(), you can use multiple grouping variables.

Use dplyr to get index of first column with certain value per group or row

I have the following script. Option 1 uses a long format and group_by to identify the first step of many where the status equals 0.
Another option (2) is to use apply to calculate this value for each row, and then transform the data to a long format.
The firs option does not scale well. The second does, but I was unable to get it into a dplyr pipe. I tried to solve this with purrr but did not succeeed.
Questions:
Why does the first option not scale well?
How can I transform the second option in a dplyr pipe?
require(dplyr)
require(tidyr)
require(ggplot2)
set.seed(314)
# example data
dat <- as.data.frame(matrix(sample(c(0,1),
size = 9000000,
replace = TRUE,
prob = c(5,95)),
ncol = 9))
names(dat) <- paste("step",1:9, sep="_")
steps <- dat %>% select(starts_with("step_")) %>% names()
# option 1 is slow
dat.cum <- dat %>%
mutate(id = row_number()) %>%
gather(step, status,-id) %>%
group_by(id) %>%
mutate(drop = min(if_else(status==0,match(step, steps),99L))) %>%
mutate(status = if_else(match(step, steps)>=drop,0,1))
ggplot(dat.cum, aes(x = step, fill = factor(status))) +
geom_bar()
# option 2 is faster
dat$drop <- apply(dat,1,function(x) min(which(x==0),99))
dat.cum <- dat %>%
gather(step,status,-drop) %>%
mutate(status = if_else(match(step,steps)>=drop,0,1))
ggplot(dat.cum, aes(x = step, fill = factor(status))) +
geom_bar()
If you would like to map along rows you could do:
dat %>%
mutate(drop2 = map_int(seq_len(nrow(dat)), ~ min(which(dat[.x, ] == 0L), 99L)))
It could be that "gathering and grouping" is faster than Looping:
dat %>%
as_tibble() %>%
select(starts_with("step_")) %>%
mutate(row_nr = row_number()) %>%
gather(key = "col", value = "value", -row_nr) %>%
arrange(row_nr, col) %>%
group_by(row_nr) %>%
mutate(col_index = row_number()) %>%
filter(value == 0) %>%
summarise(drop3 = min(col_index)) %>%
ungroup() %>%
right_join(dat %>%
mutate(row_nr = row_number()),
by = "row_nr") %>%
mutate(drop3 = if_else(is.na(drop3), 99, drop3))

Trying to understand how eval(expr, envir = df) works

I have built a function which seems to work, but I don't understand why.
My initial problem was to take a data.frame which contains counts of a population and expand it to re-create the original population. This is easy enough if you know the column names in advance.
library(tidyverse)
set.seed(121)
test_counts <- tibble(Population = letters[1:4], Length = c(1,1,2,1),
Number = sample(1:100, 4))
expand_counts_v0 <- function(Length, Population, Number) {
tibble(Population = Population,
Length = rep(Length, times = Number))
}
test_counts %>% pmap_dfr(expand_counts_v0) %>% # apply it
group_by(Population, Length) %>% # test it
summarise(Number = n()) %>%
ungroup %>%
{ all.equal(., test_counts)}
# [1] TRUE
However, I wanted to generalise it to a function which didn't need to know at the column names of the data.frame, and I'm interested in NSE, so I wrote:
test_counts1 <- tibble(Population = letters[1:4],
Length = c(1,1,2,1),
Number = sample(1:100, 4),
Height = c(100, 50, 45, 90),
Width = c(700, 50, 60, 90)
)
expand_counts_v1 <- function(df, count = NULL) {
countq <- enexpr(count)
names <- df %>% select(-!!countq) %>% names
namesq <- names %>% map(as.name)
cols <- map(namesq, ~ expr(rep(!!., times = !!countq))
) %>% set_names(namesq)
make_tbl <- function(...) {
expr(tibble(!!!cols)) %>% eval(envir = df)
}
df %>% pmap_dfr(make_tbl)
}
But, when I test this function it seems to duplicate rows 4 times:
test_counts %>% expand_counts_v1(count = Number) %>%
group_by(Population, Length) %>%
summarise(Number = n()) %>%
ungroup %>%
{ sum(.$Number)/sum(test_counts$Number)}
# [1] 4
This lead me to guess a solution, which was
expand_counts_v2 <- function(df, count = NULL) {
countq <- enexpr(count)
names <- df %>% select(-!!countq) %>% names
namesq <- names %>% map(as.name)
cols <- map(namesq, ~ expr(rep(!!., times = !!countq))
) %>% set_names(namesq)
make_tbl <- function(...) {
expr(tibble(!!!cols)) %>% eval(envir = df)
}
df %>% make_tbl
}
This seems to work:
test_counts %>% expand_counts_v2(count = Number) %>%
group_by(Population, Length) %>%
summarise(Number = n()) %>%
ungroup %>%
{ all.equal(., test_counts)}
# [1] TRUE
test_counts1 %>% expand_counts_v2(count = Number) %>%
group_by(Population, Length, Height, Width) %>%
summarise(Number = n()) %>%
ungroup %>%
{ all.equal(., test_counts1)}
# [1] TRUE
But I don't understand why. How is it evaluating for each row, even though I'm not using pmap anymore? The function needs to be applied to each row in order to work, so it must be somehow, but I can't see how it's doing that.
EDIT
After Artem's correct explanation of what was going on, I realised I could do this
expand_counts_v2 <- function(df, count = NULL) {
countq <- enexpr(count)
names <- df %>% select(-!!countq) %>% names
namesq <- names %>% map(as.name)
cols <- map(namesq, ~ expr(rep(!!., times = !!countq))
) %>% set_names(namesq)
expr(tibble(!!!cols)) %>% eval_tidy(data = df)
}
Which gets rid of the unnecessary mk_tbl function. However, as Artem said, that is only really working because rep is vectorised. So, it's working, but not by re-writing the _v0 function and pmapping it, which is the process I was trying to replicate. Eventually, I discovered, rlang::new_function and wrote:
expand_counts_v3 <- function(df, count = NULL) {
countq <- enexpr(count)
names <- df %>% select(-!!countq) %>% names
namesq <- names %>% map(as.name)
cols <- map(namesq, ~ expr(rep(!!., times = !!countq))
) %>% set_names(namesq)
all_names <- df %>% names %>% map(as.name)
args <- rep(0, times = length(all_names)) %>% as.list %>% set_names(all_names)
correct_function <- new_function(args, # this makes the function as in _v0
expr(tibble(!!!cols)) )
pmap_dfr(df, correct_function) # applies it as in _v0
}
which is longer, and probably uglier, but works the way I originally wanted.
The issue is in eval( envir = df ), which exposes the entire data frame to make_tbl(). Notice that you never use ... argument inside make_tbl(). Instead, the function effectively computes the equivalent of
with( df, tibble(Population = rep(Population, times = Number),
Length = rep(Length, times=Number)) )
regardless of what arguments you provide to it. When you call the function via pmap_dfr(), it essentially computes the above four times (once for each row) and concatenates the results by-row, resulting in the duplication of entries you've observed. When you remove pmap_dfr(), the function is called once, but since rep is itself vectorized (try doing rep( test_counts$Population, test_counts$Number ) to see what I mean), make_tbl() computes the entire result in one go.

removing groups with a certain NA number

Sorry to bother with a relatively simple question perhaps.
I have this type of dataframe:
A long list of names in the column "NAME" c(a, b, c, d, e ...) , two potential classes in the column "SURNAME" c(A, B) and a third column containing values.
I want to remove all NAMES for which at least in one of the SURNAME classes I have more than 2 "NA" in the VALUE column.
I wanted to post an example dataset but I am struggling to format it properly
I was trying to use
df <- df %>%
group_by(NAME) %>%
group_by(SURNAME) %>%
filter(!is.na(VALUE)) %>%
filter(length(VALUE)>=3)
it does not throw an error but I have the impression that something is wrong. Any suggestion? Many thanks
Let's create a dataset to work with:
set.seed(1234)
df <- data.frame(
name = sample(x=letters, size=1e3, replace=TRUE),
surname = sample(x=c("A", "B"), size=1e3, replace=TRUE),
value = sample(x=c(1:10*10,NA), size=1e3, replace=TRUE),
stringsAsFactors = FALSE
)
Here's how to do it with Base R:
# count NAs by name-surname combos (na.action arg is important!)
agg <- aggregate(value ~ name + surname, data=df, FUN=function(x) sum(is.na(x)), na.action=NULL)
# rename is count of NAs column
names(agg)[3] <- "number_of_na"
#add count of NAs back to original data
df <- merge(df, agg, by=c("name", "surname"))
# subset the original data
result <- df[df$number_of_na < 3, ]
Here's how to do it with data.table:
library(data.table)
dt <- as.data.table(df)
dt[ , number_of_na := sum(is.na(value)), by=.(name, surname)]
result <- dt[number_of_na < 3]
Here's how to do it with dplr/tidyverse:
library(dplyr) # or library(tidyverse)
result <- df %>%
group_by(name, surname) %>%
summarize(number_of_na = sum(is.na(value))) %>%
right_join(df, by=c("name", "surname")) %>%
filter(number_of_na < 3)
After grouping by 'NAME', 'SURNAME', create a column with the number of NA elements in that group and then filter out any 'NAME' that have an 'ind' greater than or equal to 3
df %>%
group_by(NAME, SURNAME) %>%
mutate(ind = sum(is.na(VALUE))) %>%
group_by(NAME) %>%
filter(!any(ind >=3)) %>%
select(-ind)
Or do an anti_join after doing the filtering by 'NAME', 'SURNAME' based on the condition
df %>%
group_by(NAME, SURNAME) %>%
filter(sum(is.na(VALUE))>=3) %>%
ungroup %>%
distinct(NAME) %>%
anti_join(df, .)
data
set.seed(24)
df <- data.frame(NAME = rep(letters[1:5], each = 20),
SURNAME = sample(LETTERS[1:4], 5 * 20, replace = TRUE),
VALUE = sample(c(NA, 1:3), 5 *20, replace = TRUE),
stringsAsFactors = FALSE)

Use variable names in function in dplyr for sum and cumsum

dplyr programming question here. Trying to write a dplyr function which takes column names as inputs and also filters on a component outlined in the function. What I am trying to recreate is as follow called test:
#test df
x<- sample(1:100, 10)
y<- sample(c(TRUE, FALSE), 10, replace = TRUE)
date<- seq(as.Date("2018-01-01"), as.Date("2018-01-10"), by =1)
my_df<- data.frame(x = x, y =y, date =date)
test<- my_df %>% group_by(date) %>%
summarise(total = n(), total_2 = sum(y ==TRUE, na.rm=TRUE)) %>%
mutate(cumulative_a = cumsum(total), cumulative_b = cumsum(total_2)) %>%
ungroup() %>% filter(date >= "2018-01-03")
The function I am testing is as follows:
cumsum_df<- function(data, date_field, cumulative_y, minimum_date = "2017-04-21") {
date_field <- enquo(date_field)
cumulative_y <- enquo(cumulative_y)
data %>% group_by(!!date_field) %>%
summarise(total = n(), total_2 = sum(!!cumulative_y ==TRUE, na.rm=TRUE)) %>%
mutate(cumulative_a = cumsum(total), cumulative_b = cumsum(total_2)) %>%
ungroup() %>% filter((!!date_field) >= minimum_date)
}
test2<- cumsum_df(data = my_df, date_field = date, cumulative_y = y, minimum_date = "2018-01-03")
I have looked looked at some examples of using enquo and this thread gets me half way there:
Use variable names in functions of dplyr
But the issue is I get two different data frame outputs for test 1 and test 2. The one from the function outputs does not have data from the logical y referenced column.
I also tried this instead
cumsum_df<- function(data, date_field, cumulative_y, minimum_date = "2017-04-21") {
date_field <- enquo(date_field)
cumulative_y <- deparse(substitute(cumulative_y))
data %>% group_by(!!date_field) %>%
summarise(total = n(), total_2 = sum(data[[cumulative_y]] ==TRUE, na.rm=TRUE)) %>%
mutate(cumulative_a = cumsum(total), cumulative_b = cumsum(total_2)) %>%
ungroup() %>% filter((!!date_field) >= minimum_date)
}
test2<- cumsum_df(data= my_df, date_field = date, cumulative_y = y, minimum_date = "2018-01-04")
Based on this thread: Pass a data.frame column name to a function
But the output from my test 2 column is also wildly different and it seems to do some kind or recursive accumulation. Which again is different to my test date frame.
If anyone can help that would be much appreciated.

Resources