Is there a more tidyverse-idiomatic way to combine several columns into a list column than using mapply?
For example given the following
tibble(.rows = 9) %>%
mutate(foo = runif(n()),
a_1 = runif(n()),
a_2 = runif(n()),
a_3 = runif(n())) ->
Z
(where Z might contain other columns, and might also contain more than 3 as) one can do
Z %>% mutate(A = mapply(c, a_1, a_2, a_3, SIMPLIFY = FALSE))
which works fine, although it would be nice to be able to say starts_with('a_') instead of a_1, a_2, a_3.
Another possibility is
Z %>%
rowid_to_column() %>%
pivot_longer(cols = starts_with('a_')) %>%
group_by(rowid) %>%
summarise(foo = unique(foo),
A = list(value)) %>%
select(-rowid)
which technically works, but introduces other problems (e.g., it uses an ugly foo = unique(foo); furthermore if instead of just one foo there were many foos it would become a bit more involved).
Based on a previous answer (now deleted) and the comments, I made a comparison of different solutions:
FUN_mapply <- function() { Z %>% mutate(A = mapply(c, a_1, a_2, a_3, SIMPLIFY = FALSE)) }
FUN_asplit <- function() { Z %>% mutate(A = asplit(.[,grepl("^a", colnames(.))], 1)) }
FUN_pmap <- function() { Z %>% mutate(A = pmap(.[,grepl("^a", colnames(.))], c)) }
FUN_transpose <- function() { Z %>% mutate(A = transpose(.[,grepl("^a", colnames(.))])) }
FUN_asplit_tidy <- function() { Z %>% mutate(A = asplit(select(., starts_with("a")), 1)) }
FUN_pmap_tidy <- function() { Z %>% mutate(A = pmap(select(., starts_with("a")), c)) }
FUN_transpose_tidy <- function() { Z %>% mutate(A = transpose(select(., starts_with("a")))) }
all(unlist(pmap(list(FUN_mapply()$A, FUN_asplit()$A, FUN_pmap()$A, FUN_transpose()$A), ~all(mapply(all.equal, .x, .y, MoreArgs = list(attributes = F)))))) # All A columns are equal?
mb <- microbenchmark::microbenchmark(
FUN_mapply(),
FUN_asplit(),
FUN_pmap(),
FUN_transpose(),
FUN_asplit_tidy(),
FUN_pmap_tidy(),
FUN_transpose_tidy(),
times = 1000L
)
ggplot2::autoplot(mb)
Edit: Replace select(., starts_with("a")) with Z[,grepl("^a", colnames(Z))]
Related
i have a function in R that generates a table graph picking data from a dataframe and every time i want to pass a different variable (column name from dataframe) i have to repeat the code. So sometimes it can be the variable and sometimes the variableb, other times the variablec... etc.
generates_table_variablea <- function(data) { ## how to pass the column = variablea here like this
####### function(data, column = variablea) .. ???
big_data <- data %>%
group_by(a, b, c, d) %>%
mutate(total_categoria_abs = sum(abs(f))) %>%
mutate(volume_negativo = if_else(variablea < 0, f, 0)) %>%
mutate(volume_positivo = if_else(variablea > 0, f, 0)) %>%
mutate(total = sum(volume_positivo) - sum(volume_negativo)) %>%
mutate(e = if_else(variablea < 0, sum(variablea), 0)) %>%
ungroup() %>%
filter (variablea < 0) %>%
group_by(a, b, c, d) %>%
summarise(e = mean(e), vendas = sum(f*-1), frac_vendas = vendas*-1/mean(total_categoria_abs)) %>%
arrange(e) %>%
ungroup()
big_data$frac_vendas <- round(big_data$frac_vendas, digits = 2)
big_data$e <- round(big_data$e, digits = 0)
}
If I want to change this variable, I have to do the follow:
generates_table_variableb <- function(data) { ## HERE IT WILL BE function(data, column = variableb)...
big_data <- data %>%
group_by(a, b, c, d) %>%
mutate(total_categoria_abs = sum(abs(f))) %>%
mutate(volume_negativo = if_else(variableb < 0, f, 0)) %>% #### HERE I NEED TO CHANGE ALWAYS TO VARIABLEA, VARIABLEB, VARIABLEC...
mutate(volume_positivo = if_else(variableb > 0, f, 0)) %>%
mutate(total = sum(volume_positivo) - sum(volume_negativo)) %>%
mutate(e = if_else(variablea < 0, sum(variableb), 0)) %>%
ungroup() %>%
filter (variableb < 0) %>%
group_by(a, b, c, d) %>%
summarise(e = mean(e), vendas = sum(f*-1), frac_vendas = vendas*-1/mean(total_categoria_abs)) %>%
arrange(e) %>%
ungroup()
big_data$frac_vendas <- round(big_data$frac_vendas, digits = 2)
big_data$e <- round(big_data$e, digits = 0)
}
Having multiple functions doing the same thing is slowing down my code...
How could this be better? All that I want is to pass this column dynamically.
This is one of the way
library(dplyr)
x <- data.frame(v1=1:3, v2=4:6)
f <- function(data, var1){
x %>% select(!!var1)
}
f(x, quo(v1))
You can see more explanation in https://adv-r.hadley.nz/quasiquotation.html
I found a other away that works too:
generates_table_variablea <- function(dataframe, variable) { ## Here pass variable
big_data <- dataframe %>%
group_by(a, b, c, d) %>%
mutate(total_categoria_abs = sum(abs(f))) %>%
mutate(volume_negativo = if_else(.data[[variable]] < 0, f, 0)) %>%
mutate(volume_positivo = if_else(.data[[variable]] > 0, f, 0)) %>%
mutate(total = sum(volume_positivo) - sum(volume_negativo)) %>%
mutate(e = if_else(.data[[variable]] < 0, sum(variablea), 0)) %>%
ungroup() %>%
filter (.data[[variable]] < 0) %>%
group_by(a, b, c, d) %>%
summarise(e = mean(e), vendas = sum(f*-1), frac_vendas = vendas*-1/mean(total_categoria_abs)) %>%
arrange(e) %>%
ungroup()
big_data$frac_vendas <- round(big_data$frac_vendas, digits = 2)
big_data$e <- round(big_data$e, digits = 0)
}
Only replace the variable by .data[[variable]] and you can pass any column inside the function.
I have built a function which seems to work, but I don't understand why.
My initial problem was to take a data.frame which contains counts of a population and expand it to re-create the original population. This is easy enough if you know the column names in advance.
library(tidyverse)
set.seed(121)
test_counts <- tibble(Population = letters[1:4], Length = c(1,1,2,1),
Number = sample(1:100, 4))
expand_counts_v0 <- function(Length, Population, Number) {
tibble(Population = Population,
Length = rep(Length, times = Number))
}
test_counts %>% pmap_dfr(expand_counts_v0) %>% # apply it
group_by(Population, Length) %>% # test it
summarise(Number = n()) %>%
ungroup %>%
{ all.equal(., test_counts)}
# [1] TRUE
However, I wanted to generalise it to a function which didn't need to know at the column names of the data.frame, and I'm interested in NSE, so I wrote:
test_counts1 <- tibble(Population = letters[1:4],
Length = c(1,1,2,1),
Number = sample(1:100, 4),
Height = c(100, 50, 45, 90),
Width = c(700, 50, 60, 90)
)
expand_counts_v1 <- function(df, count = NULL) {
countq <- enexpr(count)
names <- df %>% select(-!!countq) %>% names
namesq <- names %>% map(as.name)
cols <- map(namesq, ~ expr(rep(!!., times = !!countq))
) %>% set_names(namesq)
make_tbl <- function(...) {
expr(tibble(!!!cols)) %>% eval(envir = df)
}
df %>% pmap_dfr(make_tbl)
}
But, when I test this function it seems to duplicate rows 4 times:
test_counts %>% expand_counts_v1(count = Number) %>%
group_by(Population, Length) %>%
summarise(Number = n()) %>%
ungroup %>%
{ sum(.$Number)/sum(test_counts$Number)}
# [1] 4
This lead me to guess a solution, which was
expand_counts_v2 <- function(df, count = NULL) {
countq <- enexpr(count)
names <- df %>% select(-!!countq) %>% names
namesq <- names %>% map(as.name)
cols <- map(namesq, ~ expr(rep(!!., times = !!countq))
) %>% set_names(namesq)
make_tbl <- function(...) {
expr(tibble(!!!cols)) %>% eval(envir = df)
}
df %>% make_tbl
}
This seems to work:
test_counts %>% expand_counts_v2(count = Number) %>%
group_by(Population, Length) %>%
summarise(Number = n()) %>%
ungroup %>%
{ all.equal(., test_counts)}
# [1] TRUE
test_counts1 %>% expand_counts_v2(count = Number) %>%
group_by(Population, Length, Height, Width) %>%
summarise(Number = n()) %>%
ungroup %>%
{ all.equal(., test_counts1)}
# [1] TRUE
But I don't understand why. How is it evaluating for each row, even though I'm not using pmap anymore? The function needs to be applied to each row in order to work, so it must be somehow, but I can't see how it's doing that.
EDIT
After Artem's correct explanation of what was going on, I realised I could do this
expand_counts_v2 <- function(df, count = NULL) {
countq <- enexpr(count)
names <- df %>% select(-!!countq) %>% names
namesq <- names %>% map(as.name)
cols <- map(namesq, ~ expr(rep(!!., times = !!countq))
) %>% set_names(namesq)
expr(tibble(!!!cols)) %>% eval_tidy(data = df)
}
Which gets rid of the unnecessary mk_tbl function. However, as Artem said, that is only really working because rep is vectorised. So, it's working, but not by re-writing the _v0 function and pmapping it, which is the process I was trying to replicate. Eventually, I discovered, rlang::new_function and wrote:
expand_counts_v3 <- function(df, count = NULL) {
countq <- enexpr(count)
names <- df %>% select(-!!countq) %>% names
namesq <- names %>% map(as.name)
cols <- map(namesq, ~ expr(rep(!!., times = !!countq))
) %>% set_names(namesq)
all_names <- df %>% names %>% map(as.name)
args <- rep(0, times = length(all_names)) %>% as.list %>% set_names(all_names)
correct_function <- new_function(args, # this makes the function as in _v0
expr(tibble(!!!cols)) )
pmap_dfr(df, correct_function) # applies it as in _v0
}
which is longer, and probably uglier, but works the way I originally wanted.
The issue is in eval( envir = df ), which exposes the entire data frame to make_tbl(). Notice that you never use ... argument inside make_tbl(). Instead, the function effectively computes the equivalent of
with( df, tibble(Population = rep(Population, times = Number),
Length = rep(Length, times=Number)) )
regardless of what arguments you provide to it. When you call the function via pmap_dfr(), it essentially computes the above four times (once for each row) and concatenates the results by-row, resulting in the duplication of entries you've observed. When you remove pmap_dfr(), the function is called once, but since rep is itself vectorized (try doing rep( test_counts$Population, test_counts$Number ) to see what I mean), make_tbl() computes the entire result in one go.
I have a dataframe:
genes_1 = c("a","b","c","d","e")
genes_2 = c("f","g","c","e","j")
genes_3 = c("a","b","m","n","o")
df = data.frame(genes_1, genes_2, genes_3)
My desired output:
genes_1 = c("","","","d","")
genes_2 = c("f","g","","","j")
genes_3 = c("","","m","n","o")
df = data.frame(genes_1, genes_2, genes_3)
How can I achieve this?
Thanks
0-dependency base R solution:
data.frame(
genes_1 = c("a","b","c","d","e"),
genes_2 = c("f","g","c","e","j"),
genes_3 = c("a","b","m","n","o"),
stringsAsFactors = FALSE
) -> xdf
dups <- names(which(table(unlist(xdf, use.names = FALSE)) > 1))
xdf[] <- lapply(xdf, function(x) { x[x %in% dups] <- "" ; x })
xdf
unlist() recursively unwinds all the columns into a single character vector.
table() counts all occurrences of each element.
which() narrows down to only the ones which are TRUE
names() grabs the character select vector elements.
We then work by column to replace all occurrences in the vector that match with ""
library(microbenchmark)
library(data.table)
microbenchmark(
base = {
ydf <- xdf
dups <- names(which(table(unlist(ydf, use.names = FALSE)) > 1))
ydf[] <- lapply(ydf, function(x) { x[x %in% dups] <- "" ; x })
},
base.2 = {
ydf <- xdf
tmp <- unlist(ydf)
ydf[arrayInd(which(duplicated(tmp) | duplicated(tmp, fromLast = TRUE)), dim(ydf))] <- ""
},
tidyverse = {
ydf <- xdf
ydf %>%
gather(genes, value) %>%
add_count(value) %>%
mutate(value = ifelse(n > 1, "", value)) %>%
select(-n) %>%
group_by(genes) %>%
mutate(ID = 1:n()) %>%
spread(genes, value) %>%
select(-ID) -> ydf
},
data.table = {
ydt <- data.table(xdf)
ydt[,lapply(.SD, function(x) { x[x %in% dups] <- "" ; x })]
}
) %>%
{ print(.) ; . } %>%
autoplot()
Another base solution:
tmp <- unlist(df)
df[arrayInd(which(duplicated(tmp) | duplicated(tmp,fromLast=TRUE)), dim(df))] <- NA
# genes_1 genes_2 genes_3
#1 <NA> f <NA>
#2 <NA> g <NA>
#3 <NA> <NA> m
#4 d <NA> n
#5 <NA> j o
unlist just creates a long vector for all the values in df
arrayInd then creates a two-column row/column index for subsetting df for the duplicated values.
Here is a tidyverse solution. df2 is the final output.
library(tidyverse)
df2 <- df %>%
gather(genes, value) %>%
add_count(value) %>%
mutate(value = ifelse(n > 1, "", value)) %>%
select(-n) %>%
group_by(genes) %>%
mutate(ID = 1:n()) %>%
spread(genes, value) %>%
select(-ID)
dplyr programming question here. Trying to write a dplyr function which takes column names as inputs and also filters on a component outlined in the function. What I am trying to recreate is as follow called test:
#test df
x<- sample(1:100, 10)
y<- sample(c(TRUE, FALSE), 10, replace = TRUE)
date<- seq(as.Date("2018-01-01"), as.Date("2018-01-10"), by =1)
my_df<- data.frame(x = x, y =y, date =date)
test<- my_df %>% group_by(date) %>%
summarise(total = n(), total_2 = sum(y ==TRUE, na.rm=TRUE)) %>%
mutate(cumulative_a = cumsum(total), cumulative_b = cumsum(total_2)) %>%
ungroup() %>% filter(date >= "2018-01-03")
The function I am testing is as follows:
cumsum_df<- function(data, date_field, cumulative_y, minimum_date = "2017-04-21") {
date_field <- enquo(date_field)
cumulative_y <- enquo(cumulative_y)
data %>% group_by(!!date_field) %>%
summarise(total = n(), total_2 = sum(!!cumulative_y ==TRUE, na.rm=TRUE)) %>%
mutate(cumulative_a = cumsum(total), cumulative_b = cumsum(total_2)) %>%
ungroup() %>% filter((!!date_field) >= minimum_date)
}
test2<- cumsum_df(data = my_df, date_field = date, cumulative_y = y, minimum_date = "2018-01-03")
I have looked looked at some examples of using enquo and this thread gets me half way there:
Use variable names in functions of dplyr
But the issue is I get two different data frame outputs for test 1 and test 2. The one from the function outputs does not have data from the logical y referenced column.
I also tried this instead
cumsum_df<- function(data, date_field, cumulative_y, minimum_date = "2017-04-21") {
date_field <- enquo(date_field)
cumulative_y <- deparse(substitute(cumulative_y))
data %>% group_by(!!date_field) %>%
summarise(total = n(), total_2 = sum(data[[cumulative_y]] ==TRUE, na.rm=TRUE)) %>%
mutate(cumulative_a = cumsum(total), cumulative_b = cumsum(total_2)) %>%
ungroup() %>% filter((!!date_field) >= minimum_date)
}
test2<- cumsum_df(data= my_df, date_field = date, cumulative_y = y, minimum_date = "2018-01-04")
Based on this thread: Pass a data.frame column name to a function
But the output from my test 2 column is also wildly different and it seems to do some kind or recursive accumulation. Which again is different to my test date frame.
If anyone can help that would be much appreciated.
#Inputs:
n1 = c(5,6,7)
n2 = c(1,2,3)
list1 = data.frame(n1,n2)
list2 = data.frame(n1,n2)
listx = list(list1,list2)
n1 = c(5,6,7,8)
n2 = c(6,7,8,9)
list3 = data.frame(n1,n2)
list4 = data.frame(n1,n2)
list5 = data.frame(n1,n2)
listy = list(list3,list4,list5)
list6 = list(listx,listy)
#Code:
z <- list()
for(i in 1:length(list6)){
w <- data.frame(x=c(rep(0, nrow(list6[[i]][[1]])))) #init 0,0,0,0...
for(j in 1:length(list6[[i]])){
w[,1] <- w[,1] + list6[[i]][[j]]$n1
z[[i]] <- w
}
}
z
I believe there's a more efficient coding method instead of using double for-loop, would like lapply/sapply type equivalent (or any?). Many thanks
lapply(list6,function(x) Reduce("+",x)[,1,drop=FALSE])
This should do the job given list6.
With tidyverse, if there are no missing elements i.e NA, we can use the reduce approach
library(dplyr)
library(purrr)
list6 %>%
map(~ .x %>%
reduce(`+`) %>%
select(1))
Or in general, it can be done with group_by sum
list6 %>%
map(~ bind_rows(.x, .id = 'grp') %>%
group_by(grp) %>%
group_by(grp1 = row_number()) %>%
summarise_at(2, sum, na.rm = TRUE) %>%
select(-grp1) )