Use dynamically generated column names in dplyr

I have a data frame with multiple columns. The user provides a vector of column names, and I want to count, per row, the maximum number of times any one element appears across those columns.
library(dplyr)

set.seed(42)
df <- tibble(
  var1 = sample(1:3, 10, replace = TRUE),
  var2 = sample(1:3, 10, replace = TRUE),
  var3 = sample(1:3, 10, replace = TRUE)
)
select_vars <- c("var1", "var3")
df %>%
  rowwise() %>%
  mutate(consensus = max(table(unlist(c(var1, var3)))))
# A tibble: 10 x 4
# Rowwise:
var1 var2 var3 consensus
<int> <int> <int> <int>
1 1 1 1 2
2 1 1 3 1
3 1 2 1 2
4 1 2 1 2
5 2 2 2 2
6 2 3 3 1
7 2 3 2 2
8 1 1 1 2
9 3 1 2 1
10 3 3 2 1
This does exactly what I want, but when I try to use the vector of variable names I can't get it to work:
df %>%
  rowwise() %>%
  mutate(consensus = max(unlist(table(select_vars))))
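The reason this fails: inside mutate(), table(select_vars) only ever sees the character vector itself, so it tabulates the strings "var1" and "var3" rather than the values in those columns. A quick sketch outside mutate() (with select_vars as defined above) shows why the result is useless:
table(select_vars)
#> select_vars
#> var1 var3
#>    1    1
max(table(select_vars))
#> [1] 1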

You can wrap it in c(!!!syms()) to get it working, and you don't need the unlist, apparently. Honestly, though, I'm not sure what you are trying to do or why table is needed here. Do you just want to check whether var2 and var3 have the same value, returning 2 if they match and 1 if they don't?
library(dplyr)

df <- tibble(
  var1 = sample(1:3, 10, replace = TRUE),
  var2 = sample(1:3, 10, replace = TRUE),
  var3 = sample(1:3, 10, replace = TRUE)
)
select_vars <- c("var2", "var3")

df %>%
  rowwise() %>%
  mutate(consensus = max(table(c(!!!syms(select_vars)))))
#> # A tibble: 10 x 4
#> # Rowwise:
#> var1 var2 var3 consensus
#> <int> <int> <int> <int>
#> 1 2 3 2 1
#> 2 3 1 3 1
#> 3 3 1 1 2
#> 4 3 3 3 2
#> 5 1 1 2 1
#> 6 2 1 3 1
#> 7 3 2 3 1
#> 8 1 2 3 1
#> 9 2 1 2 1
#> 10 2 1 1 2
Created on 2021-07-22 by the reprex package (v0.3.0)
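To see what the splice is doing: rlang::syms() turns the strings into a list of symbols, and !!! injects them into c() as separate arguments, as if you had typed c(var2, var3). A quick sketch:
library(rlang)
syms(select_vars)
#> [[1]]
#> var2
#>
#> [[2]]
#> var3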

In the OP's code, we need select:
library(dplyr)
df %>%
  rowwise() %>%
  mutate(consensus = max(table(unlist(select(cur_data(), select_vars)))))
Output:
# A tibble: 10 x 4
# Rowwise:
var1 var2 var3 consensus
<int> <int> <int> <int>
1 1 1 1 2
2 1 1 3 1
3 1 2 1 2
4 1 2 1 2
5 2 2 2 2
6 2 3 3 1
7 2 3 2 2
8 1 1 1 2
9 3 1 2 1
10 3 3 2 1
Or just subset cur_data(), which returns only the data while keeping the group attributes:
df %>%
  rowwise() %>%
  mutate(consensus = max(table(unlist(cur_data()[select_vars]))))
# A tibble: 10 x 4
# Rowwise:
var1 var2 var3 consensus
<int> <int> <int> <int>
1 1 1 1 2
2 1 1 3 1
3 1 2 1 2
4 1 2 1 2
5 2 2 2 2
6 2 3 3 1
7 2 3 2 2
8 1 1 1 2
9 3 1 2 1
10 3 3 2 1
Or using pmap:
library(purrr)
df %>%
  mutate(consensus = pmap_dbl(cur_data()[select_vars], ~ max(table(c(...)))))
# A tibble: 10 x 4
var1 var2 var3 consensus
<int> <int> <int> <dbl>
1 1 1 1 2
2 1 1 3 1
3 1 2 1 2
4 1 2 1 2
5 2 2 2 2
6 2 3 3 1
7 2 3 2 2
8 1 1 1 2
9 3 1 2 1
10 3 3 2 1
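A side note: in dplyr >= 1.1.0, cur_data() is superseded by pick(), so the pmap variant can be written as (a sketch, same output):
df %>%
  mutate(consensus = pmap_dbl(pick(all_of(select_vars)), ~ max(table(c(...)))))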
As these are rowwise operations, we can get some efficiency if we use collapse functions:
library(collapse)
tfm(df, consensus = dapply(slt(df, select_vars), MARGIN = 1,
  FUN = function(x) fmax(tabulate(x))))
# A tibble: 10 x 4
var1 var2 var3 consensus
* <int> <int> <int> <int>
1 1 1 1 2
2 1 1 3 1
3 1 2 1 2
4 1 2 1 2
5 2 2 2 2
6 2 3 3 1
7 2 3 2 2
8 1 1 1 2
9 3 1 2 1
10 3 3 2 1
Benchmarks
As noted above, collapse is faster (run on a slightly bigger dataset)
df1 <- df[rep(seq_len(nrow(df)), 1e5), ]
system.time({
  tfm(df1, consensus = dapply(slt(df1, select_vars), MARGIN = 1,
    FUN = function(x) fmax(tabulate(x))))
})
#   user  system elapsed
#  5.257   0.123   5.323
system.time({
  df1 %>%
    mutate(consensus = pmap_dbl(cur_data()[select_vars], ~ max(table(c(...)))))
})
#   user  system elapsed
# 54.813   0.517  55.246
The rowwise operation takes too much time, so we stopped the execution:
system.time({
  df1 %>%
    rowwise() %>%
    mutate(consensus = max(table(unlist(select(cur_data(), select_vars)))))
})
# Timing stopped at: 575.5 3.342 581.3
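For reference, a base R equivalent of the collapse idea, with no extra packages; it assumes (as tabulate() requires) that the selected columns hold small positive integers:
system.time({
  m <- as.matrix(df1[select_vars])            # row-wise apply over a plain matrix
  df1$consensus <- apply(m, 1, function(x) max(tabulate(x)))
})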

You can use the selection helper all_of(), combined with c_across() so that the names are resolved to column values within each row:
df %>%
  rowwise() %>%
  mutate(consensus = max(table(c_across(all_of(select_vars)))))
# A tibble: 10 x 4
# Rowwise:
   var1  var2  var3 consensus
  <int> <int> <int>     <int>
 1     2     3     3         1
 2     2     2     2         2
 3     1     2     2         1
 4     2     3     3         1
 5     1     2     1         2
 6     2     1     2         2
 7     2     2     2         2
 8     3     1     2         1
 9     2     1     3         1
10     3     2     1         1

Related

R update values within a grouped df with information from updated previous value

I would like to conditionally mutate variables (var1, var2) within groups (id) at different timepoints (timepoint), using previously updated/mutated values, according to this function:
change_function <- function(value, pastvalue, timepoint) {
  if (timepoint == 1) {valuenew = value}
  else if (value == 0) {valuenew = pastvalue - 1}
  else if (value == 1) {valuenew = pastvalue}
  else if (value == 2) {valuenew = pastvalue + 1}
  return(valuenew)
}
pastvalue is the MUTATED/UPDATED value at timepoint - 1, for timepoints 2:4.
Here are example data and the desired output:
``` r
#example data
df <- data.frame(id=c(1,1,1,1,2,2,2,2),timepoint=c(1,2,3,4,1,2,3,4),var1=c(1,0,1,2,2,2,1,0),var2=c(2,0,1,2,3,2,1,0))
df
#> id timepoint var1 var2
#> 1 1 1 1 2
#> 2 1 2 0 0
#> 3 1 3 1 1
#> 4 1 4 2 2
#> 5 2 1 2 3
#> 6 2 2 2 2
#> 7 2 3 1 1
#> 8 2 4 0 0
#desired output
output <- data.frame(id=c(1,1,1,1,2,2,2,2),timepoint=c(1,2,3,4,1,2,3,4),var1=c(1,0,0,1,2,3,3,2),var2=c(2,1,1,2,3,4,4,3))
output
#> id timepoint var1 var2
#> 1 1 1 1 2
#> 2 1 2 0 1
#> 3 1 3 0 1
#> 4 1 4 1 2
#> 5 2 1 2 3
#> 6 2 2 3 4
#> 7 2 3 3 4
#> 8 2 4 2 3
```
Created on 2020-11-23 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0)
My approach: apply my function with dplyr::mutate_at:
library(dplyr)
df %>%
  group_by(id) %>%
  mutate_at(.vars = vars(var1, var2),
            .funs = funs(. = change_function(., dplyr::lag(.), timepoint)))
However, this does not work because if()/else is not vectorized.
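You can see the problem directly: if() inspects only a length-one condition, so feeding it a whole column either warns (older R) or errors (R >= 4.2.0):
if (c(TRUE, FALSE)) 1 else 2
#> Error in if (c(TRUE, FALSE)) 1 else 2 : the condition has length > 1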
Update 1:
Using a nested ifelse function does not give the desired output, because it does not use the updated pastvalues:
change_function <- function(value, pastvalue, timepoint) {
  ifelse(timepoint == 1, value,
    ifelse(value == 0, pastvalue - 1,
      ifelse(value == 1, pastvalue,
        ifelse(value == 2, pastvalue + 1, NA))))
}
library(dplyr)
df %>%
  group_by(id) %>%
  mutate_at(.vars = vars(var1, var2),
            .funs = funs(. = change_function(., dplyr::lag(.), timepoint)))
     id timepoint  var1  var2 var1_. var2_.
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 1 2 1 2
2 1 2 0 0 0 1
3 1 3 1 1 0 0
4 1 4 2 2 2 2
5 2 1 2 3 2 3
6 2 2 2 2 3 4
7 2 3 1 1 2 2
8 2 4 0 0 0 0
Update 2:
According to the comments, purrr::accumulate could be used.
Thanks to akrun, I could get the correct function:
# write a vectorized function
change_function <- function(prev, new) {
  change = if_else(new == 0, -1,
           if_else(new == 1, 0, 1))
  if_else(is.na(new), new, prev + change)
}
# use purrr::accumulate
df %>%
  group_by(id) %>%
  mutate_at(.vars = vars(var1, var2),
            .funs = funs(accumulate(., change_function)))
# A tibble: 8 x 4
# Groups: id [2]
id timepoint var1 var2
<dbl> <dbl> <dbl> <dbl>
1 1 1 1 2
2 1 2 0 1
3 1 3 0 1
4 1 4 1 2
5 2 1 2 3
6 2 2 3 4
7 2 3 3 4
8 2 4 2 3
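For reference, because the per-step change is just new - 1 for values in 0:2, the same recursion can be written in base R with Reduce(..., accumulate = TRUE), applied per id via ave(). A sketch; it assumes rows are already ordered by timepoint within id:
step <- function(prev, new) prev + (new - 1)  # 0 -> -1, 1 -> 0, 2 -> +1
df$var1 <- ave(df$var1, df$id, FUN = function(v) Reduce(step, v[-1], accumulate = TRUE, init = v[1]))
df$var2 <- ave(df$var2, df$id, FUN = function(v) Reduce(step, v[-1], accumulate = TRUE, init = v[1]))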

Remove redundant rows with a large number of variables

I have data with 33 attributes: 30 of them are variables, and the other 3 columns are cluster number, degree, and sum of degree. I want to remove duplicate rows that have the same values in variables 1 through 30. Within each set of duplicates, I want to keep the row with the highest sum of degree. This code runs in R; my question is how to simplify zz.
df_order=dfOrder(rule2,c(33),ascending=FALSE)
df_order2=as_tibble(df_order)
zz=df_order2 %>% distinct(X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,X13,X14,X15,X16,X17,X18,X19,X20,X21,X22,X23,X24,X25,X26,X27,X28,X29,X30,.keep_all = TRUE)
Sample data:
set.seed(42)
dat <- tibble(a = rep(1:2, each = 10), b = rep(1:4, each = 5),
              x1 = sample(3, size = 20, replace = TRUE),
              x2 = sample(3, size = 20, replace = TRUE),
              x3 = sample(3, size = 20, replace = TRUE))
dat
# # A tibble: 20 x 5
# a b x1 x2 x3
# <int> <int> <int> <int> <int>
# 1 1 1 1 1 3
# 2 1 1 1 3 3
# 3 1 1 1 1 1
# 4 1 1 1 1 1
# 5 1 1 2 2 2
# 6 1 2 2 3 2
# ...truncated...
Brute-force to show what distinct gives you:
distinct(dat, x1, x2, x3, .keep_all = TRUE)
# # A tibble: 14 x 5
# a b x1 x2 x3
# <int> <int> <int> <int> <int>
# 1 1 1 1 1 3
# 2 1 1 1 3 3
# 3 1 1 1 1 1
# 4 1 1 2 2 2
# 5 1 2 2 3 2
# 6 1 2 1 1 2
# 7 1 2 3 2 2
# 8 1 2 3 2 3
# 9 2 3 1 3 2
# 10 2 3 1 3 1
# 11 2 3 2 2 3
# 12 2 4 3 1 2
# 13 2 4 1 2 1
# 14 2 4 3 2 1
A programmatic way, without specifying each of x1 through x3 (depending on your preference towards "just use these" or "don't use those"): the first two work in base R and the tidyverse equally well, the third uses dplyr::select.
dat[!duplicated(subset(dat, select = -(a:b))),]
dat[!duplicated(subset(dat, select = x1:x3)),]
dat[!duplicated(select(dat, x1:x3)),] # or -(a:b), same
Or perhaps a pipe-looking method:
select(dat, x1:x3) %>%
  Negate(duplicated)(.) %>%
  which(.) %>%
  slice(dat, .)
Using the data from @r2evans' post, an option is to use splice (!!!) after converting the column names to symbols:
library(dplyr)
dat %>%
  distinct(!!!rlang::syms(names(select(., starts_with('x')))), .keep_all = TRUE)
# A tibble: 14 x 5
# a b x1 x2 x3
# <int> <int> <int> <int> <int>
# 1 1 1 1 1 3
# 2 1 1 1 3 3
# 3 1 1 1 1 1
# 4 1 1 2 2 2
# 5 1 2 2 3 2
# 6 1 2 1 1 2
# 7 1 2 3 2 2
# 8 1 2 3 2 3
# 9 2 3 1 3 2
#10 2 3 1 3 1
#11 2 3 2 2 3
#12 2 4 3 1 2
#13 2 4 1 2 1
#14 2 4 3 2 1
From dplyr >= 1.0.0, we can also use distinct with across:
dat %>%
  distinct(across(starts_with('x')), .keep_all = TRUE)
# A tibble: 14 x 5
# a b x1 x2 x3
# <int> <int> <int> <int> <int>
# 1 1 1 1 1 3
# 2 1 1 1 3 3
# 3 1 1 1 1 1
# 4 1 1 2 2 2
# 5 1 2 2 3 2
# 6 1 2 1 1 2
# 7 1 2 3 2 2
# 8 1 2 3 2 3
# 9 2 3 1 3 2
#10 2 3 1 3 1
#11 2 3 2 2 3
#12 2 4 3 1 2
#13 2 4 1 2 1
#14 2 4 3 2 1
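To also satisfy the "keep the row with the highest sum of degree" requirement from the question, sort descending before deduplicating, since distinct() keeps the first row of each duplicate set. A sketch, where sum_degree is a hypothetical name for column 33 of the original data:
library(dplyr)
rule2 %>%
  arrange(desc(sum_degree)) %>%  # sum_degree: hypothetical name for column 33
  distinct(across(num_range("X", 1:30)), .keep_all = TRUE)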

Is there a way in R to fill in missing groups that have no observations?

Say I have something like:
df <- data.frame(group  = c(1,1,1,2,2,2,3,3,3,4,4,1,1,1),
                 group2 = c(1,2,3,1,2,3,1,2,3,1,3,1,2,3))
group group2
1 1 1
2 1 2
3 1 3
4 2 1
5 2 2
6 2 3
7 3 1
8 3 2
9 3 3
10 4 1
11 4 3
12 1 1
13 1 2
14 1 3
My goal is to count the number of unique instances for each combination of group and group2, like so:
df1 <- df %>%
  group_by(group, group2) %>%
  mutate(want = n()) %>%
  distinct(group, group2, .keep_all = TRUE)
group group2 want
<dbl> <dbl> <int>
1 1 1 2
2 1 2 2
3 1 3 2
4 2 1 1
5 2 2 1
6 2 3 1
7 3 1 1
8 3 2 1
9 3 3 1
10 4 1 1
11 4 3 1
However, notice that group = 4, group2 = 2 was not in my dataset to begin with. Is there some sort of autofill function that fills these non-observations with a zero, to easily get the below?
group group2 want
<dbl> <dbl> <int>
1 1 1 2
2 1 2 2
3 1 3 2
4 2 1 1
5 2 2 1
6 2 3 1
7 3 1 1
8 3 2 1
9 3 3 1
10 4 1 1
11 4 2 0
12 4 3 1
After getting the count, we can expand with complete to fill the missing combinations with 0:
library(dplyr)
library(tidyr)
df %>%
  count(group, group2) %>%
  complete(group, group2, fill = list(n = 0))
# A tibble: 12 x 3
# group group2 n
# <dbl> <dbl> <dbl>
# 1 1 1 2
# 2 1 2 2
# 3 1 3 2
# 4 2 1 1
# 5 2 2 1
# 6 2 3 1
# 7 3 1 1
# 8 3 2 1
# 9 3 3 1
#10 4 1 1
#11 4 2 0
#12 4 3 1
Or, if we do the group_by, instead of mutate followed by distinct, directly use summarise:
df %>%
  group_by(group, group2) %>%
  summarise(n = n()) %>%
  ungroup %>%
  complete(group, group2, fill = list(n = 0))
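A base R route also fills the absent combination automatically, since table() crosses all observed levels of both variables:
# base R: table() produces the full cross of observed levels, zeros included
as.data.frame(table(group = df$group, group2 = df$group2), responseName = "want")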
Here is a data.table solution to this problem:
library(data.table)
setDT(df)[CJ(group, group2, unique = TRUE),
          c(.SD, .(want = .N)), .EACHI,
          on = c("group", "group2")]
# group group2 want
# 1 1 2
# 1 2 2
# 1 3 2
# 2 1 1
# 2 2 1
# 2 3 1
# 3 1 1
# 3 2 1
# 3 3 1
# 4 1 1
# 4 2 0
# 4 3 1

How to arrange/sort by unique sequences?

A) Here is my data frame arranged by plate:
df <- read.table(header=TRUE, stringsAsFactors=FALSE, text="
plate phase score
A 1 1
A 1 1
A 1 1
A 2 1
A 2 1
A 2 1
A 3 2
A 3 2
A 3 2
B 1 1
B 1 1
B 1 2
B 2 1
B 2 1
B 2 3")
B) Goal: I want to order it by plate first and then by phase, but sequentially (see below how the rows are ordered alphabetically by plate but sequentially by phase).
plate phase score
<chr> <int> <int>
1 A 1 1
2 A 2 1
3 A 3 2
4 A 1 1
5 A 2 1
6 A 3 2
7 A 1 1
8 A 2 1
9 A 3 2
10 B 1 1
11 B 2 1
12 B 1 1
13 B 2 1
14 B 1 2
15 B 2 3
One option is to create a sequence variable grouped by 'plate' and 'phase', and arrange on it along with 'plate' and 'score':
library(dplyr)
df %>%
  group_by(plate, phase) %>%
  mutate(rn = row_number()) %>%
  ungroup %>%
  arrange(plate, rn, score) %>%
  select(-rn)
# A tibble: 15 x 3
# plate phase score
# <chr> <int> <int>
# 1 A 1 1
# 2 A 2 1
# 3 A 3 2
# 4 A 1 1
# 5 A 2 1
# 6 A 3 2
# 7 A 1 1
# 8 A 2 1
# 9 A 3 2
#10 B 1 1
#11 B 2 1
#12 B 1 1
#13 B 2 1
#14 B 1 2
#15 B 2 3
Or using data.table:
library(data.table)
setDT(df)[order(plate, rowid(phase), score)]
A base R option:
df[with(df, order(plate, ave(phase, phase, FUN = seq_along), phase)), ]
#> plate phase score
#> 1 A 1 1
#> 4 A 2 1
#> 7 A 3 2
#> 2 A 1 1
#> 5 A 2 1
#> 8 A 3 2
#> 3 A 1 1
#> 6 A 2 1
#> 9 A 3 2
#> 10 B 1 1
#> 13 B 2 1
#> 11 B 1 1
#> 14 B 2 1
#> 12 B 1 2
#> 15 B 2 3

The dplyr way to get grouped differences

I am trying to figure out the dplyr way to do grouped differences.
Here is some fake data:
crossing(year = seq(1, 4), week = seq(1, 3)) %>%
  mutate(value = c(rep(4,3), rep(3,3), rep(2,3), rep(1,3)))
year week value
<int> <int> <dbl>
1 1 1 4
2 1 2 4
3 1 3 4
4 2 1 3
5 2 2 3
6 2 3 3
7 3 1 2
8 3 2 2
9 3 3 2
10 4 1 1
11 4 2 1
12 4 3 1
What I would like is year1 - year2, year2 - year3, and year3 - year4. The result would look like the following.
year week diffs
<int> <int> <dbl>
1 1 1 1
2 1 2 1
3 1 3 1
4 2 1 1
5 2 2 1
6 2 3 1
7 3 1 1
8 3 2 1
9 3 3 1
Edit:
I apologize; I was trying to make a simple reprex, but I messed up a lot. Please let me know what the proper etiquette is; I don't want to ruffle any feathers.
I did not know that diff() was a function. What I am actually looking for is the percent difference, ((new - old)/old) * 100, and I am not able to find a straightforward way to use diff to get that value.
I am starting from the largest year. Adding an arrange(desc(year)) to the above code is what I have; I would be trimming the smallest year, not the largest.
If this edit is worth a separate question, let me know.
If you don't have missing years for each week:
df %>%
  arrange(year) %>%
  group_by(week) %>%
  mutate(diffs = value - lead(value)) %>%
  na.omit() %>%
  select(-value)
# A tibble: 9 x 3
# Groups: week [3]
# year week diffs
# <int> <int> <dbl>
#1 1 1 1
#2 1 2 1
#3 1 3 1
#4 2 1 1
#5 2 2 1
#6 2 3 1
#7 3 1 1
#8 3 2 1
#9 3 3 1
You can use diff, but it needs adjusting, as it subtracts the other way and returns a vector that's one shorter than what it's passed:
library(tidyverse)
diffed <- crossing(year = seq(1, 4),
                   week = seq(1, 3)) %>%
  mutate(value = rep(4:1, each = 3)) %>%
  group_by(week) %>%
  mutate(value = c(-diff(value), NA)) %>%
  drop_na(value)
diffed
#> # A tibble: 9 x 3
#> # Groups: week [3]
#> year week value
#> <int> <int> <int>
#> 1 1 1 1
#> 2 1 2 1
#> 3 1 3 1
#> 4 2 1 1
#> 5 2 2 1
#> 6 2 3 1
#> 7 3 1 1
#> 8 3 2 1
#> 9 3 3 1
Using dplyr and do:
library(dplyr)
df %>%
  group_by(week) %>%
  do(cbind(.[-nrow(.), 1:2], diffs = -diff(.$value)))
# # A tibble: 9 x 3
# # Groups: week [3]
# year week diffs
# <int> <int> <dbl>
# 1 1 1 1
# 2 2 1 1
# 3 3 1 1
# 4 1 2 1
# 5 2 2 1
# 6 3 2 1
# 7 1 3 1
# 8 2 3 1
# 9 3 3 1
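For the percent-difference follow-up in the edit ((new - old)/old * 100), the lead() pattern from the first answer adapts directly; a sketch that starts from the largest year and trims the smallest, as described:
df %>%
  arrange(desc(year)) %>%
  group_by(week) %>%
  mutate(pct_diff = (value - lead(value)) / lead(value) * 100) %>%  # lead() = previous year's value
  na.omit() %>%
  select(-value)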
