Grouping Over All Possible Combinations of Several Variables With dplyr - r

Given a situation such as the following
library(dplyr)
myData <- tbl_df(data.frame( var1 = rnorm(100),
var2 = letters[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var3 = LETTERS[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var4 = month.abb[1:3] %>%
sample(100, replace = TRUE) %>%
factor()))
I would like to group `myData' to eventually find summary data grouping by all possible combinations of var2, var3, and var4.
I can create a list with all possible combinations of variables as character values with
groupNames <- names(myData)[2:4]
myGroups <- Map(combn,
list(groupNames),
seq_along(groupNames),
simplify = FALSE) %>%
unlist(recursive = FALSE)
My plan was to make separate data sets for each variable combination with a for() loop, something like
### This Does Not Work
for (i in 1:length(myGroups)){
assign( myGroups[i]%>%
unlist() %>%
paste0(collapse = "")%>%
paste0("Data"),
myData %>%
group_by_(lapply(myGroups[[i]], as.symbol)) %>%
summarise( n = length(var1),
avgVar2 = var2 %>%
mean()))
}
Admittedly I am not very good with lists, and looking up this issue was a bit challenging since dpyr updates have altered how grouping works a bit.
If there is a better way to do this than separate data sets I would love to know.
I've gotten a loop similar to above working when I am only grouping by a single variable.
Any and all help is greatly appreciated! Thank you!

This seems convulated, and there's probably a way to simplify or fancy it up with a do, but it works. Using your myData and myGroups,
results = lapply(myGroups, FUN = function(x) {
do.call(what = group_by_, args = c(list(myData), x)) %>%
summarise( n = length(var1),
avgVar1 = mean(var1))
}
)
> results[[1]]
Source: local data frame [3 x 3]
var2 n avgVar1
1 a 31 0.38929738
2 b 31 -0.07451717
3 c 38 -0.22522129
> results[[4]]
Source: local data frame [9 x 4]
Groups: var2
var2 var3 n avgVar1
1 a A 11 -0.1159160
2 a B 11 0.5663312
3 a C 9 0.7904056
4 b A 7 0.0856384
5 b B 13 0.1309756
6 b C 11 -0.4192895
7 c A 15 -0.2783099
8 c B 10 -0.1110877
9 c C 13 -0.2517602
> results[[7]]
# I won't paste them here, but it has all 27 rows, grouped by var2, var3 and var4.
I changed your summarise call to average var1 since var2 isn't numeric.

I have created a function based on the answer of #Gregor and the comments that followed:
library(magrittr)
myData <- tbl_df(data.frame( var1 = rnorm(100),
var2 = letters[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var3 = LETTERS[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var4 = month.abb[1:3] %>%
sample(100, replace = TRUE) %>%
factor()))
Function combSummarise
combSummarise <- function(data, variables=..., summarise=...){
# Get all different combinations of selected variables (credit to #Michael)
myGroups <- lapply(seq_along(variables), function(x) {
combn(c(variables), x, simplify = FALSE)}) %>%
unlist(recursive = FALSE)
# Group by selected variables (credit to #konvas)
df <- eval(parse(text=paste("lapply(myGroups, function(x){
dplyr::group_by_(data, .dots=x) %>%
dplyr::summarize_( \"", paste(summarise, collapse="\",\""),"\")})"))) %>%
do.call(plyr::rbind.fill,.)
groupNames <- c(myGroups[[length(myGroups)]])
newNames <- names(df)[!(names(df) %in% groupNames)]
df <- cbind(df[, groupNames], df[, newNames])
names(df) <- c(groupNames, newNames)
df
}
Call of combSummarise
combSummarise (myData, var=c("var2", "var3", "var4"),
summarise=c("length(var1)", "mean(var1)", "max(var1)"))
or
combSummarise (myData, var=c("var2", "var4"),
summarise=c("length(var1)", "mean(var1)", "max(var1)"))
or
combSummarise (myData, var=c("var2", "var4"),
summarise=c("length(var1)"))
etc

Inspired by the answers by Gregor and dimitris_ps, I wrote a dplyr style function that runs summarise for all combinations of group variables.
summarise_combo <- function(data, ...) {
groupVars <- group_vars(data) %>% map(as.name)
groupCombos <- map( 0:length(groupVars), ~combn(groupVars, ., simplify=FALSE) ) %>%
unlist(recursive = FALSE)
results <- groupCombos %>%
map(function(x) {data %>% group_by(!!! x) %>% summarise(...)} ) %>%
bind_rows()
results %>% select(!!! groupVars, everything())
}
Example
library(tidyverse)
mtcars %>% group_by(cyl, vs) %>% summarise_combo(cyl_n = n(), mean(mpg))

Using unite to create a new column is the simplest way
library(tidyverse)
df = tibble(
a = c(1,1,2,2,1,1,2,2),
b = c(3,4,3,4,3,4,3,4),
val = c(1,2,3,4,5,6,7,8)
)
print(df)#output1
df_2 = unite(df, 'combined_header', a, b, sep='_', remove=FALSE) #remove=F doesn't remove existing columns
print(df_2)#output2
df_2 %>% group_by(combined_header) %>%
summarize(avg_val=mean(val)) %>% print()#output3
#avg 1_3 = mean(1,5)=3 avg 1_4 = mean(2, 6) = 4
RESULTS
Output:
output1
a b val
<dbl> <dbl> <dbl>
1 1 3 1
2 1 4 2
3 2 3 3
4 2 4 4
5 1 3 5
6 1 4 6
7 2 3 7
8 2 4 8
output2
combined_header a b val
<chr> <dbl> <dbl> <dbl>
1 1_3 1 3 1
2 1_4 1 4 2
3 2_3 2 3 3
4 2_4 2 4 4
5 1_3 1 3 5
6 1_4 1 4 6
7 2_3 2 3 7
8 2_4 2 4 8
output3
combined_header avg_val
<chr> <dbl>
1 1_3 3
2 1_4 4
3 2_3 5
4 2_4 6

Related

How to do a for loop with case_when

I'm a beginner with R and I'm trying to do a for-loop to recode many variables: when "test" modality is missing, then have "test.v1" modality. It looked very easy to do, but I can't get it:
VEC_1 <- c("test1","test2","test3","test4","test5","test6","test7","test8","test9")
VEC_2 <- c("test1.v1","test2.v1","test3.v1","test4.v1","test5.v1","test6.v1","test7.v1","test8.v1","test9.v1")
for (i in 1:(min(length(VEC_1), length(VEC_2)))){
df2 <- df1 %>%
mutate(
VEC_1[i] = case_when(
is.na(VEC_1[i]) & !is.na(VEC_2[i]) ~ VEC_2[i],
TRUE ~ VEC_1[i])
)
}
I have this error
Unexpected error : '=' in:
" mutate(
VEC_1[i] ="
Does anyone have an idea ?
EDIT: df1 is like :
test1 <- c("A","B","A","A",NA,"B","A",NA,"A")
test1.v1 <- c("B",NA,"B","B","A","B","B",NA,"A")
test2 <- c("B","B","B","B",NA,"C","C","C","C")
test2.v1 <- c("C",NA,"A","A","B","B","C",NA,"C")
test3 <- c("A","B","B","B",NA,"C","C",NA,"C")
test3.v1 <- c("B","A","B",NA,"A","A","A","A",NA)
test4 <- c(NA,"B","B","A",NA,"B","A",NA,"A")
test4.v1 <- c("B","B","B","A","A","B","B","B","B")
df1 <- data.frame(test1,test1.v1,test2,test2.v1,test3,test3.v1,test4,test4.v1)
Based on the example data.frame df1, I'm wondering if you might try putting your data into long form, then grouping by row number and test, then substituting missing values.
library(tidyverse)
df1 %>%
mutate(rn = row_number()) %>%
pivot_longer(cols = -rn, names_to = c("test", "mode"), names_pattern = "test(\\d+)([.v1]*)") %>%
group_by(rn, test) %>%
mutate(value = ifelse(mode == "" & is.na(value), value[mode == ".v1"], value)) %>%
pivot_wider(id_cols = rn, names_from = c(test, mode), values_from = value, names_prefix = "test", names_sep = "")
Output
rn test1 test1.v1 test2 test2.v1 test3 test3.v1 test4 test4.v1
<int> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 1 A B B C A B B B
2 2 B NA B NA B A B B
3 3 A B B A B B B B
4 4 A B B A B NA A A
5 5 A A B B A A A A
6 6 B B C B C A B B
7 7 A B C C C A A B
8 8 NA NA C NA A A B B
9 9 A A C C C NA A B
I'm assuming VEC_1 and VEC_2 are the same length here. you don't need to use a for-loop, mutate affects the whole column of the dataframe, it kinda behaves like a for-loop. I have rewritten your code like this and changed the data for testing purposes:
library(dplyr)
VEC_1 <- c("test1","test2","test3","test4",NA,"test6","test7",NA,"test9")
VEC_2 <- c("test1.v1",NA,"test3.v1","test4.v1","test5.v1","test6.v1","test7.v1",NA,"test9.v1")
df <- data.frame(VEC_1,VEC_2)
df %>% mutate(VEC_1 = if_else(is.na(VEC_1) & !is.na(VEC_2),VEC_2,VEC_1))
which is equal to
df <- data.frame(VEC_1,VEC_2)
for (i in 1:nrow(df)){
if(is.na(df$VEC_1[i]) & !is.na(df$VEC_2[i])){
df$VEC_1[i] = df$VEC_2[i]
}
}
Output:
> df
VEC_1 VEC_2
1 test1 test1.v1
2 test2 <NA>
3 test3 test3.v1
4 test4 test4.v1
5 test5.v1 test5.v1
6 test6 test6.v1
7 test7 test7.v1
8 <NA> <NA>
9 test9 test9.v1
i also changed from case_when to if_else, because you only check one condition.
?if_else
if_else(condition, true, false, missing = NULL)
ifelse/if_else will be a good solution, unless you have multiple conditions, then case_when is your friend. A loop isn't necessary:
library(dplyr)
VEC_1 <- c("test1","test2",NA,NA)
VEC_2 <- c("test1.v1",NA,NA,"test4.v1")
df <- tibble(VEC_1, VEC_2)
df %>%
mutate(VEC_1 = case_when(
is.na(VEC_2) ~ VEC_1,
is.na(VEC_1) ~ VEC_2,
TRUE ~ VEC_1)
)
# A tibble: 4 × 2
VEC_1 VEC_2
<chr> <chr>
1 test1 test1.v1
2 test2 NA
3 NA NA
4 NA test4.v1
# A tibble: 4 × 2
VEC_1 VEC_2
<chr> <chr>
1 test1 test1.v1
2 test2 NA
3 NA NA
4 test4.v1 test4.v1

product of multiple selected columns in a data frame in R

I have a data frame with a subset of variables that starts with 'AA_' (e.g., AA_1, AA_2, ... AA_100) along with other variables X, Y, Z.
If I would like to get the produce of all 'AA_' variables, what would be the most efficient way in R to achieve this?
I am thinking something like
mydata = mydata %>%
mutate(AA_product = reduce(starts_with('AA_'), `*`))
but it does not quite work
Here, we need to select the data
library(dplyr)
library(purrr)
mydata %>%
mutate(AA_product = reduce(select(., starts_with( 'AA_')), `*`))
-output
# X Y Z AA_1 AA_2 AA_3 AA_product
#1 1 2 3 1 2 3 6
#2 2 3 4 2 3 4 24
#3 3 4 5 3 4 5 60
Another less efficient approach is rowwise with c_across
mydata %>%
rowwise() %>%
mutate(AA_prod = prod(c_across(starts_with('AA')))) %>%
ungroup
data
mydata <- data.frame(X = 1:3, Y = 2:4, Z = 3:5,
AA_1 = 1:3, AA_2 = 2:4, AA_3 = 3:5)
If you want row-wise product for "AA_" columns, you can do this in base R with Reduce :
cols <- grep('AA_', names(mydata))
mydata$AA_product <- Reduce(`*`, mydata[cols])
and apply :
mydata$AA_product <- apply(mydata[cols], 1, prod)

Create NA's based on another dataframe without long data

I have a tibble with the explicit "id" and colnames I need to convert to NA's. Is there anyway I can create the NA's without making my df a long dataset? I considered using the new rows_update function, but I'm not sure if this is correct because I only want certain columns to be NA.
library(dplyr)
to_na <- tribble(~x, ~col,
1, "z",
3, "y"
)
df <- tibble(x = c(1,2,3),
y = c(1,1,1),
z = c(2,2,2))
# desired output:
#> # A tibble: 3 x 3
#> x y z
#> <dbl> <dbl> <dbl>
#> 1 1 1 NA
#> 2 2 1 2
#> 3 3 NA 2
Created on 2020-07-03 by the reprex package (v0.3.0)
This definitely isn't the most elegant solution, but it gets the output you want.
library(dplyr)
library(purrr)
to_na <- tribble(~x, ~col,
1, "z",
3, "y"
)
df <- tibble(x = c(1,2,3),
y = c(1,1,1),
z = c(2,2,2))
map2(to_na$x, to_na$col, #Pass through these two objects in parallel
function(xval_to_missing, col) df %>% #Two objects above matched by position here.
mutate_at(col, #mutate_at the specified cols
~if_else(x == xval_to_missing, NA_real_, .) #if x == xval_to_missing, make NA, else keep as is.
) %>%
select(x, col) #keep x and the modified column.
) %>% #end of map2
reduce(left_join, by = "x") %>% #merge within the above list, by x.
relocate(x, y, z) #Keep your ordering
Output:
# A tibble: 3 x 3
x y z
<dbl> <dbl> <dbl>
1 1 1 NA
2 2 1 2
3 3 NA 2
We can use row/column indexing to assign the values to NA in base R
df <- as.data.frame(df)
df[cbind(to_na$x, match(to_na$col, names(df)))] <- NA
df
# x y z
#1 1 1 NA
#2 2 1 2
#3 3 NA 2
If we want to use rows_update
library(dplyr)
library(tidyr)
library(purrr)
lst1 <- to_na %>%
mutate(new = NA_real_) %>%
split(seq_len(nrow(.))) %>%
map(~ .x %>%
pivot_wider(names_from = col, values_from = new))
for(i in seq_along(lst1)) df <- rows_update(df, lst1[[i]])
df
# A tibble: 3 x 3
# x y z
# <dbl> <dbl> <dbl>
#1 1 1 NA
#2 2 1 2
#3 3 NA 2

dplyr::mutate_at() relying on multiple columns with a given prefix/suffix

dplyr::mutate_at() can be used to apply the same function to multiple columns. It also allows you to set the results in new columns using a named list.
However, what if I have many columns in pairs (say, data1_a, data1_b, data2_a, data2_b, ...) and I want to multiply those pairs together? Is that possible?
By hand, that would look like
suppressPackageStartupMessages({
library(dplyr)
})
data.frame(data1_a = 1:3, data1_b = 2:4,
data2_a = 3:5, data2_b = 4:6) %>%
mutate(
data1 = data1_a * data1_b,
data2 = data2_a * data2_b
)
#> data1_a data1_b data2_a data2_b data1 data2
#> 1 1 2 3 4 2 12
#> 2 2 3 4 5 6 20
#> 3 3 4 5 6 12 30
My current solution is to write a function which takes the unsuffixed variable name (i.e. "data1"), creates the suffixed names and then performs a simple mutate() on that variable using get(). I then call that function for each output:
foo <- function(df, name) {
a <- paste0(name, "_a")
b <- paste0(name, "_b")
return(
mutate(
df,
!!name := get(a) * get(b)
)
)
}
data.frame(data1_a = 1:3, data1_b = 2:4,
data2_a = 3:5, data2_b = 4:6) %>%
foo("data1") %>%
foo("data2")
#> data1_a data1_b data2_a data2_b data1 data2
#> 1 1 2 3 4 2 12
#> 2 2 3 4 5 6 20
#> 3 3 4 5 6 12 30
(or write a loop over all the variable names if there were more of them)
But if it's possible to use mutate_at or something of the sort, that'd be much cleaner.
We can use pivot_longer/pivot_wider
library(dplyr)
library(tidyr)
df1 %>%
mutate(rn = row_number()) %>%
pivot_longer(cols = -rn, names_to = c('grp', '.value'),
names_sep = "_") %>%
group_by(grp) %>%
transmute(rn, new = a * b) %>%
pivot_wider(names_from = grp, values_from = new) %>%
select(-rn) %>%
bind_cols(df1, .)
# A tibble: 3 x 6
# data1_a data1_b data2_a data2_b data1 data2
# <int> <int> <int> <int> <int> <int>
#1 1 2 3 4 2 12
#2 2 3 4 5 6 20
#3 3 4 5 6 12 30
Or another option is to split into a list based on the column names and then do the *
library(purrr)
library(stringr)
df1 %>%
split.default(str_remove(names(.), "_.*")) %>%
map_dfr(reduce, `*`) %>%
bind_cols(df1, .)
# A tibble: 3 x 6
# data1_a data1_b data2_a data2_b data1 data2
# <int> <int> <int> <int> <int> <int>
#1 1 2 3 4 2 12
#2 2 3 4 5 6 20
#3 3 4 5 6 12 30
With mutate, it is possible, but it would be more manual
df1 %>%
mutate(data1 = select(., starts_with('data1')) %>%
reduce(`*`),
data2 = select(., starts_with('data2')) %>%
reduce(`*`))
data
df1 <- data.frame(data1_a = 1:3, data1_b = 2:4,
data2_a = 3:5, data2_b = 4:6)
After adopting #akrun's elegant solution, I noticed it was unfortunately very inefficient (since it has to recreate two dataframes), taking almost a second on a dataset with 20,000 rows and 11 "groups".
So a while ago I developed the following function (with a bit of help from #user12728748... sorry for not posting here sooner), which takes the names of the groups ("data1", "data2", etc) and a formula using the prefixes, allowing for bquote-style quoting for constant names:
suppressPackageStartupMessages(library(dplyr))
mutateSet <- function(df, colNames, formula,
isPrefix = TRUE,
separator = "_") {
vars <- all.vars(formula)
# extracts names wrapped in `.()`
escapedNames <- function (expr)
{
unquote <- function(e) {
if (is.pairlist(e) || length(e) <= 1L) NULL
else if (e[[1L]] == as.name(".")) deparse(e[[2L]])
else unlist(sapply(e, unquote))
}
unquote(substitute(expr))
}
escapedVars <- eval(rlang::expr(escapedNames(!!formula)))
# remove escaped names from mapping variables
vars <- setdiff(vars, escapedVars)
# get output prefix/suffix as string
lhs <- rlang::f_lhs(formula) %>%
all.vars()
# get operation as string
# deparse() can have line breaks; paste0() brings it back to one line
rhs <- rlang::f_rhs(formula) %>%
deparse() %>%
paste0(collapse = "")
# dummy function to cover for bquote escaping
. <- function(x) x
for (i in colNames) {
if (isPrefix) {
aliases <- paste0(vars, separator, i)
newCol <- paste0(lhs, separator, i)
} else {
aliases <- paste0(i, separator, vars)
newCol <- paste0(i, separator, lhs)
}
if (length(lhs) == 0) newCol <- i
mapping <- rlang::list2(!!!aliases)
names(mapping) <- vars
mapping <- do.call(wrapr::qc, mapping)
df <- rlang::expr(wrapr::let(
mapping,
df %>% dplyr::mutate(!!newCol := ...RHS...)
)) %>%
deparse() %>%
gsub(
pattern = "...RHS...",
replacement = rhs
) %>%
{eval(parse(text = .))}
}
return(df)
}
df <- data.frame(a_data1 = 1:3, b_data1 = 2:4,
a_data2 = 3:5, b_data2 = 4:6,
static = 5:7)
mutateSet(df, "data1", ~ a + b)
#> a_data1 b_data1 a_data2 b_data2 static data1
#> 1 1 2 3 4 5 3
#> 2 2 3 4 5 6 5
#> 3 3 4 5 6 7 7
mutateSet(df, c("data1", "data2"), x ~ sqrt(a) + b)
#> a_data1 b_data1 a_data2 b_data2 static x_data1 x_data2
#> 1 1 2 3 4 5 3.000000 5.732051
#> 2 2 3 4 5 6 4.414214 7.000000
#> 3 3 4 5 6 7 5.732051 8.236068
mutateSet(df, c("data1", "data2"), ~ a + b + .(static))
#> a_data1 b_data1 a_data2 b_data2 static data1 data2
#> 1 1 2 3 4 5 8 12
#> 2 2 3 4 5 6 11 15
#> 3 3 4 5 6 7 14 18
Created on 2020-04-28 by the reprex package (v0.3.0)
This can probably be cleaned up (especially that heinous for-loop), but it works for now.
Repeating #user12728748's performance test, we see this is ~100x faster:
suppressPackageStartupMessages({
invisible(lapply(c("dplyr", "tidyr", "rlang", "wrapr", "microbenchmark"),
require, character.only = TRUE))
})
polymutate <- function(df, formula) {
form <- rlang::f_rhs(formula)
df %>%
mutate(rn = row_number()) %>%
pivot_longer(cols = -rn, names_to = c('.value', 'grp'),
names_sep = "_") %>%
group_by(grp) %>%
transmute(rn, new = eval(form)) %>%
pivot_wider(names_from = grp, values_from = new) %>%
select(-rn) %>%
bind_cols(df, .)
}
set.seed(1)
df <- setNames(data.frame(matrix(sample(1:12, 6E6, replace=TRUE), ncol=6)),
c("a_data1", "b_data1", "a_data2", "b_data2", "a_data3", "b_data3"))
pd <- polymutate(df, ~ a + b)
pd2 <- mutateSet(df, c("data1", "data2", "data3"), ~ a + b)
all.equal(pd, pd2)
#> [1] TRUE
microbenchmark(polymutate(df, ~ a + b),
mutateSet(df, c("data1", "data2", "data3"), ~ a + b),
times=10L)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> polymutate 1612.306 1628.9776 1690.78586 1670.15600 1741.3490 1806.1412 10
#> mutateSet 8.757 9.6302 13.27135 10.45965 19.2976 20.4657 10
This is now possible using the cur_column() function within across().
library(tidyverse)
dat <- data.frame(
data1_a = 1:3,
data1_b = 2:4,
data2_a = 3:5,
data2_b = 4:6
)
mutate(
dat,
across(ends_with("a"), ~ . * dat[[str_replace(cur_column(), "a$", "b")]],
.names = "updated_{col}")
)
Returns:
data1_a data1_b data2_a data2_b updated_data1_a updated_data2_a
1 1 2 3 4 2 12
2 2 3 4 5 6 20
3 3 4 5 6 12 30
Where updated_data1_a and updated_data2_a contain the desired output variables.

Group data by factor level, then transform to data frame with colname being levels?

There is my problem that I can't solve it:
Data:
df <- data.frame(f1=c("a", "a", "b", "b", "c", "c", "c"),
v1=c(10, 11, 4, 5, 0, 1, 2))
data.frame:f1 is factor
f1 v1
a 10
a 11
b 4
b 5
c 0
c 1
c 2
# What I want is:(for example, fetch data with the number of element of some level == 2, then to data.frame)
a b
10 4
11 5
Thanks in advance!
I might be missing something simple here , but the below approach using dplyr works.
library(dplyr)
nlevels = 2
df1 <- df %>%
add_count(f1) %>%
filter(n == nlevels) %>%
select(-n) %>%
mutate(rn = row_number()) %>%
spread(f1, v1) %>%
select(-rn)
This gives
# a b
# <int> <int>
#1 10 NA
#2 11 NA
#3 NA 4
#4 NA 5
Now, if you want to remove NA's we can do
do.call("cbind.data.frame", lapply(df1, function(x) x[!is.na(x)]))
# a b
#1 10 4
#2 11 5
As we have filtered the dataframe which has only nlevels observations, we would have same number of rows for each column in the final dataframe.
split might be useful here to split df$v1 into parts corresponding to df$f1. Since you are always extracting equal length chunks, it can then simply be combined back to a data.frame:
spl <- split(df$v1, df$f1)
data.frame(spl[lengths(spl)==2])
# a b
#1 10 4
#2 11 5
Or do it all in one call by combining this with Filter:
data.frame(Filter(function(x) length(x)==2, split(df$v1, df$f1)))
# a b
#1 10 4
#2 11 5
Here is a solution using unstack :
unstack(
droplevels(df[ave(df$v1, df$f1, FUN = function(x) length(x) == 2)==1,]),
v1 ~ f1)
# a b
# 1 10 4
# 2 11 5
A variant, similar to #thelatemail's solution :
data.frame(Filter(function(x) length(x) == 2, unstack(df,v1 ~ f1)))
My tidyverse solution would be:
library(tidyverse)
df %>%
group_by(f1) %>%
filter(n() == 2) %>%
mutate(i = row_number()) %>%
spread(f1, v1) %>%
select(-i)
# # A tibble: 2 x 2
# a b
# * <dbl> <dbl>
# 1 10 4
# 2 11 5
or mixing approaches :
as_tibble(keep(unstack(df,v1 ~ f1), ~length(.x) == 2))
Using all base functions (but you should use tidyverse)
# Add count of instances
x$len <- ave(x$v1, x$f1, FUN = length)
# Filter, drop the count
x <- x[x$len==2, c('f1','v1')]
# Hacky pivot
result <- data.frame(
lapply(unique(x$f1), FUN = function(y) x$v1[x$f1==y])
)
colnames(result) <- unique(x$f1)
> result
a b
1 10 4
2 11 5
I'd like code this, may it helps for you
library(reshape2)
library(dplyr)
aa = data.frame(v1=c('a','a','b','b','c','c','c'),f1=c(10,11,4,5,0,1,2))
cc = aa %>% group_by(v1) %>% summarise(id = length((v1)))
dd= merge(aa,cc) #get the level
ee = dd[dd$aa==2,] #select number of level equal to 2
ee$id = rep(c(1,2),nrow(ee)/2) # reset index like (1,2,1,2)
dcast(ee, id~v1,value.var = 'f1')
all done!

Resources