Find the union and intersection of grouped variables in R

I have two vectors that are binned.
Basically, I want a function that finds the union and intersection of these two vectors (the output below).
It seems there is no function that supports this. Any idea how I can produce the desired output vector?
example1 <- c("18--25", "26--30", "31--50", "51+")
example2 <- c("18--23", "24--30", "31--65", "66+")
output <- c("18--23", "24--25", "26--30", "31--50", "51--65", "66+")

We can remove duplicates and recombine the sorted boundary values two at a time, like this (R 4.1 or later for the native pipe |> and the \(.) lambda shorthand):
f <- function(x, y, sep, max){
  m <- paste0("\\", max)  # escape the open-ended marker (e.g. "+") for the regex
  gsub(m, "", c(x, y)) |>
    strsplit(sep, fixed = TRUE) |>
    unlist(use.names = FALSE) |>
    as.numeric() |>  # convert before sorting so the boundaries sort numerically
    sort() |>
    unique() |>
    # pair up consecutive boundaries and glue each pair back together
    (\(.) tapply(., gl(length(.), 2, length(.)), paste, collapse = sep, simplify = TRUE))() |>
    (\(.) .[!is.na(.)])() |>
    as.character() |>
    # re-attach the open-ended marker to the last bin
    (\(.) {.[length(.)] <- paste0(.[length(.)], max); .})()
}
# for older R versions
f <- function(x, y, sep, max){
  x <- gsub(paste0("\\", max), "", c(x, y))
  x <- unique(sort(as.numeric(unlist(strsplit(x, sep, fixed = TRUE), use.names = FALSE))))
  x <- tapply(x, gl(length(x), 2L, length(x)), paste, collapse = sep, simplify = TRUE)
  x <- as.character(x[!is.na(x)])
  x[length(x)] <- paste0(x[length(x)], max)
  x
}
f(example1, example2, "--", "+")
[1] "18--23" "24--25" "26--30" "31--50" "51--65" "66+"

Related

Extract each data frame name from a list of data frames in lapply function

I have a list of many data frames and I am trying to perform manipulations on each data frame in the list. I created this lapply function, and the list is then merged together. However, when trying to rename certain columns so that they include the respective data frame's name with
paste(deparse(substitute(x)), "_start")
the data frame names are being extracted like this:
x[[i]]_start_1
Here is the full code:
df_list <- lapply(df_list, function(x){
  lookup <- c(start = paste(deparse(substitute(x)), "_start"),
              end = paste(deparse(substitute(x)), "_end"),
              top = paste(deparse(substitute(x)), "_top"),
              left = paste(deparse(substitute(x)), "_left"),
              height = paste(deparse(substitute(x)), "_height"),
              width = paste(deparse(substitute(x)), "_width"),
              type = paste(deparse(substitute(x)), "_type"),
              value = paste(deparse(substitute(x)), "_value"))
  x <- x %>% rename_with(.fn = ~lookup[.x], .cols = intersect(names(.), names(lookup)))
  x <- arrange(x, creativeId)
  x <- x[, -1]
  x <- x %>% distinct()
  x$counter <- with(x, ave(creativeId, with(rle(creativeId), rep(seq_along(values), lengths)), FUN = seq_along))
  x <- x %>% relocate(counter)
  x <- x %>% pivot_wider(names_from = counter, values_from = -names(.)[1:2])
})
new_df <- Reduce(function(x, y) merge(x, y, all = TRUE), df_list)
Please let me know if there is a workaround so that the data frame names are printed correctly. Thank you!
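The root cause: lapply calls your function as FUN(X[[i]], ...), so inside the function substitute(x) can only ever recover the expression X[[i]], never the list element's name. A quick illustration:
sapply(list(a = 1, b = 2), function(x) deparse(substitute(x)))
#        a        b
# "X[[i]]" "X[[i]]"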
We may use Map, which loops over the list and its names in parallel:
df_list2 <- Map(function(x, nm) {
  lookup <- c(start = paste0(nm, "_start"),
              end = paste0(nm, "_end"),
              top = paste0(nm, "_top"),
              left = paste0(nm, "_left"),
              height = paste0(nm, "_height"),
              width = paste0(nm, "_width"),
              type = paste0(nm, "_type"),
              value = paste0(nm, "_value"))
  x <- x %>%
    rename_with(.fn = ~lookup[.x], .cols = intersect(names(.), names(lookup)))
  x <- arrange(x, creativeId)
  x <- x[, -1]
  x <- x %>% distinct()
  x$counter <- with(x, ave(creativeId,
                           with(rle(creativeId), rep(seq_along(values), lengths)), FUN = seq_along))
  x <- x %>% relocate(counter)
  x <- x %>% pivot_wider(names_from = counter, values_from = -names(.)[1:2])
}, df_list, names(df_list))
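If purrr is available, imap is an equivalent iterator that supplies each element together with its name; a minimal sketch (the renaming shown is illustrative only -- the full body from the Map version above would go there):
library(purrr)
df_list2 <- imap(df_list, function(x, nm) {
  # nm is the list element's name
  names(x) <- paste0(nm, "_", names(x))
  x
})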

Dynamic formula not working with startsWith and colnames

I'm working on making a function to create tables, and I need some conditional rules for formatting. One rule is based on a column name; however, when I send the condition down using as.formula it seems to be overdoing it. I've made an example here:
library(tidyverse)
library(rlang)
a <- as_tibble(x = cbind(Year = c(2018, 2019, 2020), a = 1:3,
                         b.1 = c("a", "b", "c"),
                         b.2 = c("d", "e", "f"),
                         fac = c("This", "This", "That")))
foo <- function(x, y, z, ...){
  y_var <- enquo(y)
  x %>%
    filter(Year %in% c(2018, 2019),
           ...) %>%
    mutate(!!quo_name(y_var) := factor(!!y_var,
                                       levels = z,
                                       ordered = TRUE)) %>%
    arrange(!!y_var)
}
to.table <- function(x, y, z, ...){
  y_var <- enquo(y)
  df.in <- foo(x = x,
               y = !!y_var,
               z = z)
  cond <- paste("~!is.na(", quo_name(y_var), ")")
  cond.2 <- paste("~startsWith(colnames(", df.in, "),\"b\")")
  flextable(df.in) %>%
    bold(i = as.formula(cond),
         part = "body") %>%
    bg(i = as.formula(cond.2),
       bg = "Red3",
       j = as.formula(cond.2))
}
to.table(x=a,
y=Year,
z= c(2020,2018,2019),
fac == "This")
Error in startsWith(colnames(2:3), "b") : non-character object(s)
From the error I've been receiving, it looks like the expression was evaluated before it got put through as.formula, as those two columns are the correct answer.
Proof:
df.in <- foo(x=a,
y=Year,
z= c(2020,2018,2019),
fac == "This")
startsWith(colnames(df.in), prefix = "b")
[1] FALSE FALSE TRUE TRUE FALSE
What am I missing here? If anyone has a solution, or a suggestion on how to do things differently, potentially using quosures or other tidyverse-friendly methods, I would much appreciate it.
Extension:
To make this a bit clearer, I should elaborate on my intended use of this example. I'm trying to figure out how to take names generated dynamically in a function (represented by foo) that start with a specified value (generally 3 columns), and then check those columns for a specified value that I can highlight in a specific color.
Additionally, in the answer cond is used in both of the i = designations; the two separate conditions will likely never overlap.
We could specify j with the column names of the created data: startsWith returns a logical vector computed from the column names (TRUE for names that start with 'b'), and we use that logical vector to extract the column names with [ (nm1).
to.table <- function(x, y, z, ...){
  y_var <- enquo(y)
  df.in <- foo(x = x,
               y = !!y_var,
               z = z)
  cond <- as.formula(glue::glue('~ !is.na({quo_name(y_var)})'))
  nm1 <- names(df.in)[startsWith(names(df.in), prefix = "b")]
  flextable(df.in) %>%
    bold(i = cond,
         part = "body") %>%
    bg(i = cond,
       bg = "Red3",
       j = nm1)
}
-testing
to.table(x=a,
y=Year,
z= c(2020,2018,2019),
fac == "This")
-output
In the OP's post, the formula created for 'cond' is fine (it is just a bit more flexible with glue), whereas the second one, i.e. 'cond.2', returns
paste("~startsWith(colnames(", df.in, "),\"b\")")
[1] "~startsWith(colnames( 2:3 ),\"b\")" "~startsWith(colnames( c(\"1\", \"2\") ),\"b\")"
[3] "~startsWith(colnames( c(\"a\", \"b\") ),\"b\")" "~startsWith(colnames( c(\"d\", \"e\") ),\"b\")"
[5] "~startsWith(colnames( c(\"This\", \"This\") ),\"b\")"
This is because df.in is a data.frame, and pasting the "startsWith(colnames(" string onto it recycles over its columns; each of the lines returned contains one column's values.
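A tiny example reproduces the effect: paste() coerces each column of the data.frame to a string and recycles the fixed pieces around it.
paste("startsWith(colnames(", data.frame(a = 1:2, b = c("x", "y")), "), \"b\")")
# [1] "startsWith(colnames( 1:2 ), \"b\")"
# [2] "startsWith(colnames( c(\"x\", \"y\") ), \"b\")"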
If we want to color the columns whose names are prefixed with either 'a' or 'b', change startsWith to grep, which can take a regex as pattern:
to.table <- function(x, y, z, ...){
  y_var <- enquo(y)
  df.in <- foo(x = x,
               y = !!y_var,
               z = z)
  cond <- as.formula(glue::glue('~ !is.na({quo_name(y_var)})'))
  nm1 <- grep("^(a|b)", names(df.in), value = TRUE)
  flextable(df.in) %>%
    bold(i = cond,
         part = "body") %>%
    bg(i = cond,
       bg = "Red3",
       j = nm1)
}
to.table(x=a,
y=Year,
z= c(2020,2018,2019),
fac == "This")
-output
If we want to color based on the value of 'a':
to.table <- function(x, y, z, ...){
  y_var <- enquo(y)
  df.in <- foo(x = x,
               y = !!y_var,
               z = z)
  cond <- as.formula(glue::glue('~ !is.na({quo_name(y_var)})'))
  nm1 <- names(df.in)[startsWith(names(df.in), prefix = "b")]
  flextable(df.in) %>%
    bold(i = cond,
         part = "body") %>%
    bg(i = ~ a == 2,
       bg = "Red3",
       j = nm1)
}
to.table(x=a,
y=Year,
z= c(2020,2018,2019),
fac == "This")
-output

Variable Names based on Number in Character Position in R

I'm currently trying to change variable names based on the number in each position of the string.
variables <- c("X0.0.1", "X0.1.0", "X1.0.0", "X0.0.2", "X0.1.1", "X0.2.0", "X1.0.1",
"X1.1.0", "X2.0.0", "X0.0.3", "X0.1.2", "X0.2.1", "X0.3.0", "X1.0.2", "X1.1.1", "X1.2.0",
"X2.0.1","X2.1.0","X3.0.0")
Ideally, I'd have something similar to "X0.0.1" = "x", "X0.1.0" = "y", "X1.0.0" = "z", "X0.0.2" = "xx". Is there a way to quickly duplicate the letter when there is a 2 in that position of the number? Or even "X3.0.0" = "zzz"?
I believe the following code does what the question asks for. It uses rep to get the repetitions and then pastes them together.
s <- strsplit(substring(variables, 2), "\\.")
sapply(s, function(x){
  vec <- c("x", "y", "z")[seq_along(x)]
  x <- as.integer(x)
  y <- rep(vec, rev(x))
  paste(y, collapse = "")
})
# [1] "x" "y" "z" "xx" "xy" "yy" "xz" "yz" "zz" "xxx"
#[11] "xxy" "xyy" "yyy" "xxz" "xyz" "yyz" "xzz" "yzz" "zzz"
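Note the rev(): in a name like "X0.0.2" it is the last digit that counts the "x"s, so the counts are reversed before being passed to rep(). For example:
rep(c("x", "y", "z"), rev(c(0, 0, 2)))
# [1] "x" "x"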
Edit.
The following function tries to address the discussion in the comments. It returns a data.frame with the strings and their degrees. Then it's a matter of sorting by degree/chr.
changeVariable <- function(x, chr = c("x", "y", "z")){
  s <- strsplit(substring(x, 2), "\\.")
  y <- lapply(s, function(.x){
    vec <- chr[seq_along(.x)]
    .x <- as.integer(.x)
    .y <- rep(vec, rev(.x))
    list(chr = paste(.y, collapse = ""),
         degree = sum(.x))
  })
  res <- do.call(rbind.data.frame, y)
  row.names(res) <- NULL
  res
}
res <- changeVariable(variables)
res[order(res$degree, res$chr), ]
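For reference, the chr column then comes out sorted as:
res[order(res$degree, res$chr), "chr"]
# [1] "x"   "y"   "z"   "xx"  "xy"  "xz"  "yy"  "yz"  "zz"  "xxx"
#[11] "xxy" "xxz" "xyy" "xyz" "xzz" "yyy" "yyz" "yzz" "zzz"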
Edit 2.
With results pasted with superscripts:
changeVariable2 <- function(x){
  s <- strsplit(substring(x, 2), "\\.")
  y <- lapply(s, function(.x){
    vec <- c("x", "y", "z")[seq_along(.x)]
    .x <- rev(as.integer(.x))
    .y <- vec[.x != 0]
    .x <- .x[.x != 0]
    list(chr = paste0(.y, "^", .x, collapse = " "),
         degree = sum(.x))
  })
  res <- do.call(rbind.data.frame, y)
  row.names(res) <- NULL
  res
}
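For instance, the first few superscript labels from this version would be:
head(changeVariable2(variables)$chr, 5)
# [1] "x^1"     "y^1"     "z^1"     "x^2"     "x^1 y^1"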

Processing a data frame in R by subgroup: is it possible to get rid of the 'for' loop?

I frequently work with data frames and have to run some sophisticated data wrangling / manipulations by subgroup that is defined in one of the columns. I am aware of dplyr and group_by and know that many things could be solved using group_by. However, often I have to do some pretty intricate calculations and end up just using the 'for' loop.
I was wondering about the existence of some other general approach or paradigm that is faster/more elegant. Maybe map (that I am not very familiar with)?
Below is an example. Notice - it is fake and meaningless. So let's ignore why I need to do those things or the fact that there could be 2 consecutive NAs in a column, etc. That's not the focus of my question. The point is that often I have to operate "within the constraints of a subgroup" and then - inside that subgroup - I have to do operations columnwise, rowwise and sometimes even cellwise.
I also realize that I could probably put most of that code inside a function, split my data frame into a list based on 'group', apply this function to each element of that list and then do.call(rbind, ...) at the end. But is this the only way?
Thanks a lot for any hints!
library(dplyr)
library(forcats)
set.seed(123)
x <- tibble(group = c(rep('a', 10), rep('b', 10), rep('c', 10)),
            attrib = c(sample(c("one", "two", "three", "four"), 10, replace = TRUE),
                       sample(c("one", "two", "three"), 10, replace = TRUE),
                       sample(c("one", "three", "four"), 10, replace = TRUE)),
            v1 = sample(c(1:5, NA), 30, replace = TRUE),
            v2 = sample(c(1:5, NA), 30, replace = TRUE),
            v3 = sample(c(1:5, NA), 30, replace = TRUE),
            n1 = abs(rnorm(30)), n2 = abs(rnorm(30)), n3 = abs(rnorm(30)))
v_vars = paste0("v", 1:3)
n_vars = paste0("n", 1:3)
results <- NULL # Placeholder for final results
for(i in seq(length(unique(x$group)))) { # loop through groups
  mygroup <- unique(x$group)[i]
  mysubtable <- x %>% filter(group == mygroup)
  # IMPUTE NAs in v columns
  # Replace every NA with a mean of values above and below it; and if it's the first or
  # the last value, with the mean of 2 values below or above it.
  for (v in v_vars){ # loop through v columns
    which_nas <- which(is.na(mysubtable[[v]])) # create index of NAs for column v
    if (length(which_nas) == 0) next else {
      for (na in which_nas) { # loop through indexes of column values that are NAs
        if (na == 1) {
          mysubtable[[v]][na] <- mean(c(mysubtable[[v]][na + 1],
                                        mysubtable[[v]][na + 2]), na.rm = TRUE)
        } else if (na == nrow(mysubtable)) {
          mysubtable[[v]][na] <- mean(c(mysubtable[[v]][na - 2],
                                        mysubtable[[v]][na - 1]), na.rm = TRUE)
        } else {
          mysubtable[[v]][na] <- mean(c(mysubtable[[v]][na - 1],
                                        mysubtable[[v]][na + 1]), na.rm = TRUE)
        }
      } # end of loop through NA indexes
    } # end of else
  } # end of loop through v vars
  # Aggregate v columns (mean) for each value of column 'attrib'
  result1 <- mysubtable %>% group_by(attrib) %>%
    summarize_at(v_vars, mean)
  # Aggregate n columns (sum) for each value of column 'attrib'
  result2 <- mysubtable %>% group_by(attrib) %>%
    summarize_at(n_vars, sum)
  # final result should contain the name of the group
  results[[i]] <- cbind(mygroup, result1, result2[-1])
}
results <- do.call(rbind, results)
Maybe this example is too simple, but in this case, the only thing you need to pull out is the imputation.
my_impute <- function(x) {
  which_nas <- which(is.na(x))
  for (na in which_nas) {
    if (na == 1) {
      x[na] <- mean(c(x[na + 1], x[na + 2]), na.rm = TRUE)
    } else if (na == length(x)) {
      x[na] <- mean(c(x[na - 2], x[na - 1]), na.rm = TRUE)
    } else {
      x[na] <- mean(c(x[na - 1], x[na + 1]), na.rm = TRUE)
    }
  }
  x
}
Then you just need to group appropriately and impute and summarize.
x2 <- x %>% group_by(group) %>% mutate_at(v_vars, my_impute) %>%
  group_by(group, attrib)
full_join(x2 %>% summarize_at(v_vars, mean),
          x2 %>% summarize_at(n_vars, sum))
My usual method for things like this, where similar calculations need to be run on a bunch of columns, is to put the data in long format. Here it feels a little like the long way round, but perhaps this would be useful to see.
x %>% mutate(row = 1:n()) %>% gather("variable", "value", c(v_vars, n_vars)) %>%
  separate(variable, c("var", "x"), sep = 1) %>% spread(var, value) %>%
  arrange(group, x, row) %>% group_by(group, x) %>%
  mutate(v = my_impute(v)) %>% group_by(group, attrib, x) %>%
  summarize(v = mean(v), n = sum(n)) %>%
  gather("var", "value", v, n) %>% mutate(X = paste0(var, x)) %>%
  select(-x, -var) %>% spread(X, value)
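(gather() and spread() are superseded in current tidyr; the same reshaping would now be written with pivot_longer() and pivot_wider(), but the logic is identical.)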
More generally, split-apply-combine is probably the way to go, as you suggest in your question; here's a way using the tidyverse.
doX <- function(x) {
  x2 <- x %>% mutate_at(v_vars, my_impute) %>% group_by(attrib)
  full_join(x2 %>% summarize_at(v_vars, mean),
            x2 %>% summarize_at(n_vars, sum))
}
x %>% group_by(group) %>% nest() %>%
  mutate(result = map(data, doX)) %>% select(-data) %>% unnest()
The more traditional method is with do.call, split, and rbind; here I don't make the effort to keep the group information.
do.call(rbind, lapply(split(x, x$group), doX))
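If the group label is needed, one option (not in the original answer) is dplyr::bind_rows(), whose .id argument turns the list names into a column:
library(dplyr)
bind_rows(lapply(split(x, x$group), doX), .id = "group")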
The first thing to do is to change your data imputing into a function. I made some simple modifications to have it accept a vector and simplified the call to mean.
fx_na_rm <- function(z) {
  which_nas <- which(is.na(z))
  if (length(which_nas) > 0) {
    for (na in which_nas) { # loop through indexes of column values that are NAs
      if (na == 1) {
        z[na] <- mean(z[na + (1:2)], na.rm = TRUE)
      } else if (na == length(z)) {
        z[na] <- mean(z[na - (1:2)], na.rm = TRUE)
      } else {
        z[na] <- mean(z[c(na - 1, na + 1)], na.rm = TRUE)
      }
    } # end of loop through NA indexes
  }
  return(z)
}
I like data.table, so here's a solution that uses it. Since you use different functions for the n and v variable groups, most purrr or other solutions will also be a little funny.
library(data.table)
dt <- copy(as.data.table(x))
v_vars = paste0("v", 1:3)
n_vars = paste0("n", 1:3)
dt[, (v_vars) := lapply(.SD, as.numeric), .SDcols = v_vars]
dt[, (v_vars) := lapply(.SD, fx_na_rm), by = group, .SDcols = v_vars]
# see https://stackoverflow.com/questions/50626316/r-data-table-apply-function-a-to-some-columns-and-function-b-to-some-others
scols <- list(v_vars, n_vars)
funs <- rep(c(mean, sum), lengths(scols))
dt[, setNames(Map(function(f, x) f(x), funs, .SD), unlist(scols))
, by = .(group,attrib)
, .SDcols = unlist(scols)]
The for loop itself is difficult to vectorize because the results can depend on earlier iterations. Here is my attempt, which does not produce output identical to yours:
# not identical
fx_na_rm2 <- function(z) {
  which_nas <- which(is.na(z))
  if (length(which_nas) > 0) {
    ind <- c(rbind(which_nas - 1 + 2 * (which_nas == 1) + -1 * (which_nas == length(z)),
                   which_nas + 1 + 1 * (which_nas == 1) + -2 * (which_nas == length(z))))
    z[which_nas] <- colMeans(matrix(z[ind], nrow = 2), na.rm = TRUE)
  }
  return(z)
}
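A quick illustration of where the two versions differ on consecutive NAs -- the loop sees values it has already imputed, while the vectorized version only ever sees the original vector:
z <- c(NA, 2, NA, 4, NA)
fx_na_rm(z)  # 2.0 2.0 3.0 4.0 3.5 (the last NA averages the imputed 3 with 4)
fx_na_rm2(z) # 2.0 2.0 3.0 4.0 4.0 (the last NA sees only 4 and the original NA)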

paste grid -- expand.grid for string concatenation

If we want to get all combinations of two vectors, we can use rep/recycling rules:
x <- 1:4
y <- 1:2
cbind(rep(x, each = length(y)), rep(y, length(x)))
# [,1] [,2]
# [1,] 1 1
# [2,] 1 2
# [3,] 2 1
# [4,] 2 2
# [5,] 3 1
# [6,] 3 2
# [7,] 4 1
# [8,] 4 2
But expand.grid is much nicer -- it handles all the repetition for us.
expand.grid(x, y)
# Var1 Var2
# 1 1 1
# 2 2 1
# 3 3 1
# 4 4 1
# 5 1 2
# 6 2 2
# 7 3 2
# 8 4 2
Is there a simple version of this for concatenating strings? Like paste.grid? I have a named object where a lot of the objects have names like x_y_z where x, y, and z vary like x and y above.
For example, suppose x can be "avg" or "median", y can be "male" or "female", and z can be "height" or "weight". How can we concisely get all 8 combinations of the three?
Using rep is a pain:
x <- c("avg", "median")
y <- c("male", "female")
z <- c("height", "weight")
paste(rep(x, each = length(y) * length(z)),
      rep(rep(y, each = length(z)), length(x)),
      rep(z, length(x) * length(y)), sep = "_")
And repurposing expand.grid is a bit clunky (and probably inefficient):
apply(expand.grid(x, y, z), 1, paste, collapse = "_")
Am I missing something? Is there a better way to do this?
Yes, this is what interaction does:
levels(interaction(x,y,z,sep='_'))
The implementation is pretty much the same as your rep code.
Outputs:
[1] "avg_female_height" "median_female_height" "avg_male_height" "median_male_height" "avg_female_weight"
[6] "median_female_weight" "avg_male_weight" "median_male_weight"
Using data.table's CJ cross-joining function:
library(data.table)
CJ(x,y,z)[, paste(V1,V2,V3, sep = "_")]
#[1] "avg_female_height" "avg_female_weight" "avg_male_height" "avg_male_weight"
#[5] "median_female_height" "median_female_weight" "median_male_height" "median_male_weight"
Or a variation of your apply approach would be:
do.call(paste, c(expand.grid(x, y, z), sep = "_"))
#[1] "avg_male_height" "median_male_height" "avg_female_height" "median_female_height"
#[5] "avg_male_weight" "median_male_weight" "avg_female_weight" "median_female_weight"
Rudimentary (microbenchmark::microbenchmark) benchmarking shows a pretty significant speed-up by using:
library(tidyr)
library(magrittr)
df <- data.frame(x, y, z)
df %>%
  complete(x, y, z) %>%
  unite("combo", x, y, z, sep = "_")
A bit slower, but perhaps a more straightforward and vectorized variant of the apply technique:
df <- expand.grid(x, y, z)
df$combo <- paste(df$Var1, df$Var2, df$Var3, sep = "_")
Someone should chime in with a data.table approach...
Benchmarking: Small Grid (256 elements)
set.seed(21034)
x <- sample(letters, 4, TRUE)
y <- sample(letters, 4, TRUE)
z <- sample(letters, 4, TRUE)
a <- sample(letters, 4, TRUE)
library(data.table)
library(microbenchmark)
library(magrittr)
library(tidyr)
microbenchmark(times = 1000L,
               DT1 = CJ(x, y, z, a)[ , paste(V1, V2, V3, V4, sep = "_")],
               DT2 = CJ(x, y, z, a)[ , do.call(paste, c(.SD, sep = "_"))],
               app1 = do.call(paste, c(expand.grid(x, y, z, a), sep = "_")),
               app2 = paste((df <- expand.grid(x, y, z, a))$Var1,
                            df$Var2, df$Var3, df$Var4, sep = "_"),
               magg_outer = outer(x, y, paste, sep = "_") %>%
                 outer(z, paste, sep = "_") %>%
                 outer(a, paste, sep = "_") %>% as.vector,
               magg_tidy = data.frame(x, y, z, a) %>%
                 complete(x, y, z, a) %>%
                 unite("combo", x, y, z, a, sep = "_"),
               interaction = levels(interaction(x, y, z, a, sep = "_")),
               original = apply(expand.grid(x, y, z, a), 1, paste, collapse = "_"),
               rep = paste(rep(x, each = (ny <- length(y)) * (nz <- length(z)) *
                                 (na <- length(a))),
                           rep(rep(y, each = nz * na), (nx <- length(x))),
                           rep(rep(z, each = na), nx * ny),
                           rep(a, nx * ny * nz), sep = "_"),
               Reduce = Reduce(function(x, y) paste(rep(x, each = length(y)),
                                                    rep(y, length(x)), sep = "_"),
                               list(x, y, z, a)))
# Unit: microseconds
# expr min lq mean median uq max neval cld
# DT1 529.578 576.6400 624.00002 589.8270 604.9845 5449.287 1000 d
# DT2 561.028 606.4220 639.94659 620.4335 636.2735 5484.514 1000 d
# app1 201.043 225.4475 240.36960 233.4795 243.7090 4244.687 1000 b
# app2 196.692 225.6130 244.33543 234.0455 243.7925 4110.605 1000 b
# magg_outer 164.352 194.1395 205.30300 204.4220 211.1990 456.122 1000 b
# magg_tidy 1872.228 2038.1560 2150.98234 2067.8770 2126.1025 21891.884 1000 f
# interaction 254.885 295.1935 313.54392 306.6680 316.8095 4196.465 1000 c
# original 852.018 935.4960 976.24388 954.5115 972.5550 4973.724 1000 e
# rep 50.737 54.1515 60.22671 55.3660 56.9220 3823.655 1000 a
# Reduce 58.395 65.3860 68.46049 66.8920 68.5640 158.184 1000 a
Benchmarking: Large Grid (1,000,000 elements)
set.seed(21034)
x <- sprintf("%03d", sample(100))
y <- sprintf("%03d", sample(100))
z <- sprintf("%02d", sample(10))
a <- sprintf("%02d", sample(10))
library(data.table)
library(microbenchmark)
library(magrittr)
library(tidyr)
microbenchmark(times = 25L,
               DT1 = CJ(x, y, z, a)[ , paste(V1, V2, V3, V4, sep = "_")],
               DT2 = CJ(x, y, z, a)[ , do.call(paste, c(.SD, sep = "_"))],
               app1 = do.call(paste, c(expand.grid(x, y, z, a), sep = "_")),
               app2 = paste((df <- expand.grid(x, y, z, a))$Var1,
                            df$Var2, df$Var3, df$Var4, sep = "_"),
               magg_outer = outer(x, y, paste, sep = "_") %>%
                 outer(z, paste, sep = "_") %>%
                 outer(a, paste, sep = "_") %>% as.vector,
               magg_tidy = data.frame(x, y, z, a) %>%
                 complete(x, y, z, a) %>%
                 unite("combo", x, y, z, a, sep = "_"),
               interaction = levels(interaction(x, y, z, a, sep = "_")),
               original = apply(expand.grid(x, y, z, a), 1, paste, collapse = "_"),
               rep = paste(rep(x, each = (ny <- length(y)) * (nz <- length(z)) *
                                 (na <- length(a))),
                           rep(rep(y, each = nz * na), (nx <- length(x))),
                           rep(rep(z, each = na), nx * ny),
                           rep(a, nx * ny * nz), sep = "_"),
               Reduce = Reduce(function(x, y) paste(rep(x, each = length(y)),
                                                    rep(y, length(x)), sep = "_"),
                               list(x, y, z, a)))
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# DT1 360.6528 467.8408 517.4579 520.1484 549.1756 861.1567 25 ab
# DT2 355.0438 504.9642 572.0732 551.9106 615.6621 927.3210 25 b
# app1 727.4513 766.3053 926.1888 910.3998 957.7610 1690.1540 25 c
# app2 472.5724 567.1121 633.5304 600.3779 634.3158 1135.7535 25 b
# magg_outer 384.0112 475.5070 600.6317 525.8936 676.7134 927.6736 25 b
# magg_tidy 520.6428 602.5028 695.5500 680.8821 748.8746 1180.1107 25 bc
# interaction 353.7317 481.4732 531.0035 518.7084 585.0872 693.5171 25 ab
# original 4965.1156 5358.8704 5914.3560 5780.6609 6074.7470 9024.6476 25 d
# rep 206.0964 236.5811 273.1093 252.8179 285.0910 455.1776 25 a
# Reduce 322.0695 390.2595 446.3948 424.9185 508.5235 621.1878 25 ab
What about using outer()? Your two examples become
x <- 1:4
y <- 1:2
as.vector(outer(x, y, paste, sep = "_"))
## [1] "1_1" "2_1" "3_1" "4_1" "1_2" "2_2" "3_2" "4_2"
library(magrittr)
x <- c("avg", "median")
y <- c("male", "female")
z <- c("height", "weight")
outer(x, y, paste, sep = "_") %>% outer(z, paste, sep = "_") %>% as.vector
## [1] "avg_male_height" "median_male_height" "avg_female_height" "median_female_height" "avg_male_weight"
## [6] "median_male_weight" "avg_female_weight" "median_female_weight"
The second example can be simplified a little with Reduce():
Reduce(function(a, b) outer(a, b, paste, sep = "_"), list(x, y, z)) %>% as.vector
It's not efficient, however. Using microbenchmark, I find that your solution using rep() is about 10 times faster.
