Apply a function over all combinations of arguments in R

I would like to be able to apply a function to all combinations of a set of input arguments. I have a working solution (below), but I would be surprised if there isn't a better / more generic way to do this using, e.g., plyr; so far I have not found anything. Is there a better solution?
# Apply function FUN to all combinations of arguments and append results to
# data frame of arguments
cmapply <- function(FUN, ..., MoreArgs = NULL, SIMPLIFY = TRUE,
                    USE.NAMES = TRUE)
{
  l <- expand.grid(..., stringsAsFactors = FALSE)
  r <- do.call(mapply, c(
    list(FUN = FUN, MoreArgs = MoreArgs, SIMPLIFY = SIMPLIFY,
         USE.NAMES = USE.NAMES),
    l
  ))
  if (is.matrix(r)) r <- t(r)
  cbind(l, r)
}
Examples:
# calculate sum of combinations of 1:3, 1:3 and 1:2
cmapply(arg1=1:3, arg2=1:3, 1:2, FUN=sum)
# paste input arguments
cmapply(arg1=1:3, arg2=c("a", "b"), c("x", "y", "z"), FUN=paste)
# function returns a vector
cmapply(a=1:3, b=2, FUN=function(a, b) c("x"=b-a, "y"=a+b))

This function isn't necessarily any better, just slightly different:
library(data.table)

rcapply <- function(FUN, ...) {
  ## Cross-join all vectors
  DT <- CJ(...)
  ## Get the original names
  nl <- names(list(...))
  ## Make names, if all are missing
  if (length(nl) == 0L) nl <- make.names(1:length(list(...)))
  ## Fill in any missing names
  nl[!nzchar(nl)] <- paste0("arg", 1:length(nl))[!nzchar(nl)]
  setnames(DT, nl)
  ## Call the function using all columns of every row
  DT2 <- DT[,
    as.data.table(as.list(do.call(FUN, .SD))),  ## Use all columns...
    by = .(rn = 1:nrow(DT))][                   ## ...by every row
      , rn := NULL]                             ## Remove the temp row number
  ## Add "res" to the names of unnamed result columns
  setnames(DT2, gsub("(V)([0-9]+)", "res\\2", names(DT2)))
  return(data.table(DT, DT2))
}
head(rcapply(arg1=1:3, arg2=1:3, 1:2, FUN=sum))
##    arg1 arg2 arg3 res1
## 1:    1    1    1    3
## 2:    1    1    2    4
## 3:    1    2    1    4
## 4:    1    2    2    5
## 5:    1    3    1    5
## 6:    1    3    2    6
head(rcapply(arg1=1:3, arg2=c("a", "b"), c("x", "y", "z"), FUN=paste))
##    arg1 arg2 arg3  res1
## 1:    1    a    x 1 a x
## 2:    1    a    y 1 a y
## 3:    1    a    z 1 a z
## 4:    1    b    x 1 b x
## 5:    1    b    y 1 b y
## 6:    1    b    z 1 b z
head(rcapply(a=1:3, b=2, FUN=function(a, b) c("x"=b-a, "y"=a+b)))
##    a b  x y
## 1: 1 2  1 3
## 2: 2 2  0 4
## 3: 3 2 -1 5

A slight simplification of your original code:
cmapply <- function(FUN, ..., MoreArgs = NULL)
{
  l <- expand.grid(..., stringsAsFactors = FALSE)
  r <- .mapply(FUN = FUN, dots = l, MoreArgs = MoreArgs)
  r <- simplify2array(r, higher = FALSE)
  if (is.matrix(r)) r <- t(r)
  return(cbind(l, r))
}
This does not require a do.call.
It does miss the SIMPLIFY and USE.NAMES arguments, but the way you are using them makes them unusable anyway: if SIMPLIFY = FALSE, the cbind() will fail, and USE.NAMES = TRUE does not do anything because the names get lost in the cbind() anyway.
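As a quick check (not an exhaustive test), the simplified version reproduces the vector-returning example from above:
cmapply(a = 1:3, b = 2, FUN = function(a, b) c("x" = b - a, "y" = a + b))
#   a b  x y
# 1 1 2  1 3
# 2 2 2  0 4
# 3 3 2 -1 5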

Related

variables not recognized in for-loop nested within lapply

I have the following data
library(data.table)
set.seed(42)
dat <- list(data.table(id = 1:10, group = rep(1:2, each = 5), x = rnorm(10)),
            data.table(id = 1:10, group = rep(1:2, each = 5), x = rnorm(10)))
to which I would like to apply this function element by element and group by group.
subs = function(x, ..., verbose = FALSE) {
  L = substitute(list(...))[-1]
  mon = data.table(cond = as.character(L))[, skip := FALSE]
  for (i in seq_along(L)) {
    d = eval( substitute(x[cond, verbose = v], list(cond = L[[i]], v = verbose)) )
    if (nrow(d)) {
      x = d
    } else {
      mon[i, skip := TRUE]
    }
  }
  #print(mon)
  return(x)
}
However, when I run this code
# doesn't work
out <- lapply(1:2, function(h) {
  res <- list()
  d <- dat[[h]]
  for (k in 1:2) {
    g <- d[group == k]
    cutoff <- 1
    print(cutoff)
    res[[k]] <- subs(g, x > cutoff)
  }
  res
})
I receive an error that object 'cutoff' cannot be found, although it is printed correctly. However, when I run the same for-loop outside of the lapply(), it appears to work.
d1 <- dat[[1]]
s <- list()
for (k in 1:2) {
  g <- d1[group == k]
  cutoff <- 1
  s[[k]] <- subs(g, x > cutoff)
}
> s
[[1]]
   id group        x
1:  1     1 1.370958

[[2]]
   id group        x
1:  7     2 1.511522
2:  9     2 2.018424
This leads me to suspect that it's the inclusion in the lapply() that causes the error, but I find it hard to see what the error is, let alone how to fix it.
Edit
Data with two variables:
set.seed(42)
dat <- list(data.table(id = 1:10, group = rep(1:2, each = 5), x = rnorm(10), y = 11:20),
            data.table(id = 1:10, group = rep(1:2, each = 5), x = rnorm(10), y = 11:20))
with expected result
[[1]]
   id group          x  y
1:  9     2  2.0184237 19
2:  1     1  1.3709584 11
3:  2     1 -0.5646982 12
4:  3     1  0.3631284 13
5:  4     1  0.6328626 14
6:  5     1  0.4042683 15

[[2]]
   id group         x  y
1:  2     1 2.2866454 12
2: 10     2 1.3201133 20
If you use non-standard evaluation you always pay a price. Here it is a scoping issue.
It works like this:
subs = function(x, ..., verbose = FALSE) {
  L = substitute(list(...))[-1]
  mon = data.table(cond = as.character(L))[, skip := FALSE]
  for (i in seq_along(L)) {
    d = eval( substitute(x[cond, , # needed to add this comma, don't know why
                           verbose = v], list(cond = L[[i]], v = verbose)) )
    if (nrow(d)) {
      x = d
    } else {
      mon[i, skip := TRUE]
    }
  }
  #print(mon)
  return(x)
}
out <- lapply(1:2, function(h) {
  res <- list()
  d <- dat[[h]]
  for (k in 1:2) {
    g <- d[group == k]
    cutoff <- 1
    res[[k]] <- eval(substitute(subs(g, x > cutoff), list(cutoff = cutoff)))
  }
  res
})
# works
Is there a particular reason for not using data.table's by parameter?
Edit:
Background: The point of subs() is to apply multiple conditions (if
multiple are passed to it) unless one would result in an empty subset.
I would use a different approach then:
subs = function(x, ..., verbose = FALSE) {
  L = substitute(list(...))[-1]
  for (i in seq_along(L)) {
    d = eval( substitute(x[cond, , verbose = v], list(cond = L[[i]], v = verbose)) )
    x <- rbind(d, x[!d, on = "group"])
  }
  return(x)
}
out <- lapply(dat, function(d) {
  cutoff <- 2 # to get empty groups
  eval(substitute(subs(d, x > cutoff), list(cutoff = cutoff)))
})
#[[1]]
#   id group          x
#1:  9     2  2.0184237
#2:  1     1  1.3709584
#3:  2     1 -0.5646982
#4:  3     1  0.3631284
#5:  4     1  0.6328626
#6:  5     1  0.4042683
#
#[[2]]
#   id group          x
#1:  2     1  2.2866454
#2:  6     2  0.6359504
#3:  7     2 -0.2842529
#4:  8     2 -2.6564554
#5:  9     2 -2.4404669
#6: 10     2  1.3201133
Beware that this does not retain the ordering.
Another option that retains the ordering:
subs = function(x, ..., verbose = FALSE) {
  L = substitute(list(...))[-1]
  for (i in seq_along(L)) {
    x = eval( substitute(x[, {
      res <- .SD[cond]
      if (nrow(res) > 0) res else .SD
    }, by = "group", verbose = v], list(cond = L[[i]], v = verbose)) )
  }
  return(x)
}
The by variable could be passed as a function parameter and then substituted in together with the condition.
I haven't done benchmarks comparing the efficiency of these two.
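A minimal, untested sketch of that idea (subs_by is a hypothetical name, not from the question):
subs_by = function(x, ..., by, verbose = FALSE) {
  L = substitute(list(...))[-1]
  for (i in seq_along(L)) {
    x = eval( substitute(x[, {
      res <- .SD[cond]
      if (nrow(res) > 0) res else .SD
    }, by = b, verbose = v], list(cond = L[[i]], b = by, v = verbose)) )
  }
  return(x)
}
# The caller still has to substitute in any free variables, as before:
# eval(substitute(subs_by(dat[[1]], x > cutoff, by = "group"),
#                 list(cutoff = 1)))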

How do I find the row and column number for a particular string in a data.table?

Let's say I have the following data.table:
library(data.table)
DT <- setDT(data.frame(id = 1:10, LETTERS = LETTERS[1:10],
                       letters = letters[1:10]))
## > DT
##     id LETTERS letters
##  1:  1       A       a
##  2:  2       B       b
##  3:  3       C       c
##  4:  4       D       d
##  5:  5       E       e
##  6:  6       F       f
##  7:  7       G       g
##  8:  8       H       h
##  9:  9       I       i
## 10: 10       J       j
and I want to find the row and column numbers of the letter 'h' (which are 8 and 3). How would I do that?
DT[, which(.SD == "h", arr.ind = TRUE)]
#      row col
# [1,]   8   3
EDIT:
Trying to take into account Michael's points (restrict the search to string columns, and map the column index back to the full table):
str_idx = which(sapply(DT, function(x) is.character(x) || is.factor(x)))
idx <- DT[, which(as.matrix(.SD) == "h", arr.ind = TRUE), .SDcols = str_idx]
idx[, "col"] <- chmatch(names(str_idx)[idx[, "col"]], names(DT))
idx
#      row col
# [1,]   8   3
Depends on the exact format of your desired output.
# applying to non-string columns is inefficient
str_idx = which(sapply(DT, is.character))
# returns a list as long as str_idx with two elements appropriately named
lapply(str_idx, function(jj) list(row = which(DT[[jj]] == 'h'), col = jj))
It should also be possible to melt the string columns of your table to avoid looping.
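For example, a sketch of the melt idea (assuming data.table's melt and the str_idx computed above; untested beyond this example):
m <- melt(DT[, ..str_idx][, row := .I], id.vars = "row",
          variable.name = "column")
m[value == "h", .(row, col = chmatch(as.character(column), names(DT)))]
#    row col
# 1:   8   3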

data.table avoid recycling

I'm constructing a data.table from two (or more) input vectors with different lengths:
x <- c(1,2,3,4)
y <- c(8,9)
dt <- data.table(x = x, y = y)
And need the shorter vector(s) to be filled with NA rather than recycling their values, resulting in a data.table like this:
   x  y
1: 1  8
2: 2  9
3: 3 NA
4: 4 NA
Is there a way to achieve this without explicitly filling the shorter vector(s) with NA before passing them to the data.table() constructor?
Thanks!
One can use out of range indices:
library("data.table")
x <- c(1,2,3,4)
y <- c(8,9)
n <- max(length(x), length(y))
dt <- data.table(x = x[1:n], y = y[1:n])
# > dt
#    x  y
# 1: 1  8
# 2: 2  9
# 3: 3 NA
# 4: 4 NA
Or you can extend y by doing (as @Roland recommended in the comment):
length(y) <- length(x) <- max(length(x), length(y))
dt <- data.table(x, y)
An option is cbind.fill from rowr
library(rowr)
setNames(cbind.fill(x, y, fill = NA), c("x", "y"))
Or place the vectors in a list and then pad NA at the end based on the maximum length of the list elements
library(data.table)
lst <- list(x = x, y = y)
as.data.table(lapply(lst, `length<-`, max(lengths(lst))))
#    x  y
# 1: 1  8
# 2: 2  9
# 3: 3 NA
# 4: 4 NA
The "out of range indices" answer provided by jogo can be extended cleanly to in-place assignment using .N:
x <- c(1,2,3,4)
y <- c(8,9)
n <- max(length(x), length(y))
dt <- data.table(x = x[1:n], y = y[1:n])
z <- c(6,7)
dt[, z := z[1:.N]]
#    x  y  z
# 1: 1  8  6
# 2: 2  9  7
# 3: 3 NA NA
# 4: 4 NA NA
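The same out-of-range trick generalizes to any number of input vectors; a sketch (na_pad_dt is a hypothetical helper, not part of data.table):
library(data.table)

na_pad_dt <- function(...) {
  vecs <- list(...)
  n <- max(lengths(vecs))
  # indexing past the end of a vector yields NA, padding each one to length n
  as.data.table(lapply(vecs, `[`, seq_len(n)))
}

na_pad_dt(x = c(1, 2, 3, 4), y = c(8, 9), z = c(6, 7, 5))
#    x  y  z
# 1: 1  8  6
# 2: 2  9  7
# 3: 3 NA  5
# 4: 4 NA NA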

cbind: is there a way to have missing values set to NA?

Please forgive me if I missed an answer to such a simple question.
I want to use cbind() to bind two columns. One of them is a single entry shorter in length.
Can I have R supply an NA for the missing value?
The documentation discusses a deparse.level argument but this doesn't seem to be my solution.
Further, if I may be so bold, would there also be a quick way to prepend the shorter column with NA's?
Try this:
x <- c(1:5)
y <- c(4:1)
length(y) = length(x)
cbind(x,y)
     x  y
[1,] 1  4
[2,] 2  3
[3,] 3  2
[4,] 4  1
[5,] 5 NA
or this:
x <- c(4:1)
y <- c(1:5)
length(x) = length(y)
cbind(x,y)
      x y
[1,]  4 1
[2,]  3 2
[3,]  2 3
[4,]  1 4
[5,] NA 5
I think this will do something similar to what DWin suggested and work regardless of which vector is shorter:
x <- c(4:1)
y <- c(1:5)
lengths <- max(c(length(x), length(y)))
length(x) <- lengths
length(y) <- lengths
cbind(x,y)
The code above can also be condensed to:
x <- c(4:1)
y <- c(1:5)
length(x) <- length(y) <- max(c(length(x), length(y)))
cbind(x,y)
EDIT
Here is what I came up with to address the question:
"Further, if I may be so bold, would there also be a quick way to prepend the shorter column with NA's?"
inserted into the original post by Matt O'Brien.
x <- c(4:1)
y <- c(1:5)
first <- 1 # 1 means add NA to top of shorter vector
           # 0 means add NA to bottom of shorter vector
if (length(x) < length(y)) {
  if (first == 1) x = c(rep(NA, length(y) - length(x)), x)
  if (first == 0) x = c(x, rep(NA, length(y) - length(x)))
}
if (length(y) < length(x)) {
  if (first == 1) y = c(rep(NA, length(x) - length(y)), y)
  if (first == 0) y = c(y, rep(NA, length(x) - length(y)))
}
cbind(x, y)
#       x y
# [1,] NA 1
# [2,]  4 2
# [3,]  3 3
# [4,]  2 4
# [5,]  1 5
Here is a function:
x <- c(4:1)
y <- c(1:5)
first <- 1 # 1 means add NA to top of shorter vector
           # 0 means add NA to bottom of shorter vector
my.cbind <- function(x, y, first) {
  if (length(x) < length(y)) {
    if (first == 1) x = c(rep(NA, length(y) - length(x)), x)
    if (first == 0) x = c(x, rep(NA, length(y) - length(x)))
  }
  if (length(y) < length(x)) {
    if (first == 1) y = c(rep(NA, length(x) - length(y)), y)
    if (first == 0) y = c(y, rep(NA, length(x) - length(y)))
  }
  return(cbind(x, y))
}
my.cbind(x, y, first)
my.cbind(c(1:5),c(4:1),1)
my.cbind(c(1:5),c(4:1),0)
my.cbind(c(1:4),c(5:1),1)
my.cbind(c(1:4),c(5:1),0)
my.cbind(c(1:5),c(5:1),1)
my.cbind(c(1:5),c(5:1),0)
This version allows you to cbind two vectors of different mode:
x <- c(4:1)
y <- letters[1:5]
first <- 1 # 1 means add NA to top of shorter vector
           # 0 means add NA to bottom of shorter vector
my.cbind <- function(x, y, first) {
  if (length(x) < length(y)) {
    if (first == 1) x = c(rep(NA, length(y) - length(x)), x)
    if (first == 0) x = c(x, rep(NA, length(y) - length(x)))
  }
  if (length(y) < length(x)) {
    if (first == 1) y = c(rep(NA, length(x) - length(y)), y)
    if (first == 0) y = c(y, rep(NA, length(x) - length(y)))
  }
  x <- as.data.frame(x)
  y <- as.data.frame(y)
  return(data.frame(x, y))
}
my.cbind(x, y, first)
#    x y
# 1 NA a
# 2  4 b
# 3  3 c
# 4  2 d
# 5  1 e
my.cbind(c(1:5),letters[1:4],1)
my.cbind(c(1:5),letters[1:4],0)
my.cbind(c(1:4),letters[1:5],1)
my.cbind(c(1:4),letters[1:5],0)
my.cbind(c(1:5),letters[1:5],1)
my.cbind(c(1:5),letters[1:5],0)
A while back I had put together a function called Cbind that was meant to do this sort of thing. In its current form, it should be able to handle vectors, data.frames, and matrices as the input.
For now, the function is here: https://gist.github.com/mrdwab/6789277
Here is how one would use the function:
x <- 1:5
y <- letters[1:4]
z <- matrix(1:4, ncol = 2, dimnames = list(NULL, c("a", "b")))
Cbind(x, y, z)
#   x    y z_a z_b
# 1 1    a   1   3
# 2 2    b   2   4
# 3 3    c  NA  NA
# 4 4    d  NA  NA
# 5 5 <NA>  NA  NA
Cbind(x, y, z, first = FALSE)
#   x    y z_a z_b
# 1 1 <NA>  NA  NA
# 2 2    a  NA  NA
# 3 3    b  NA  NA
# 4 4    c   1   3
# 5 5    d   2   4
The three functions required are padNA, dotnames, and Cbind, which are defined as follows:
padNA <- function(mydata, rowsneeded, first = TRUE) {
  ## Pads vectors, data.frames, or matrices with NA
  temp1 = colnames(mydata)
  rowsneeded = rowsneeded - nrow(mydata)
  temp2 = setNames(
    data.frame(matrix(rep(NA, length(temp1) * rowsneeded),
                      ncol = length(temp1))), temp1)
  if (isTRUE(first)) rbind(mydata, temp2)
  else rbind(temp2, mydata)
}

dotnames <- function(...) {
  ## Gets the names of the objects passed through ...
  vnames <- as.list(substitute(list(...)))[-1L]
  vnames <- unlist(lapply(vnames, deparse), FALSE, FALSE)
  vnames
}

Cbind <- function(..., first = TRUE) {
  ## cbinds vectors, data.frames, and matrices together
  Names <- dotnames(...)
  datalist <- setNames(list(...), Names)
  nrows <- max(sapply(datalist, function(x)
    ifelse(is.null(dim(x)), length(x), nrow(x))))
  datalist <- lapply(seq_along(datalist), function(x) {
    z <- datalist[[x]]
    if (is.null(dim(z))) {
      z <- setNames(data.frame(z), Names[x])
    } else {
      if (is.null(colnames(z))) {
        colnames(z) <- paste(Names[x], sequence(ncol(z)), sep = "_")
      } else {
        colnames(z) <- paste(Names[x], colnames(z), sep = "_")
      }
    }
    padNA(z, rowsneeded = nrows, first = first)
  })
  do.call(cbind, datalist)
}
Part of the reason I stopped working on the function was that the gdata package already has a function called cbindX that handles cbinding data.frames and matrices with different numbers of rows. It will not work directly on vectors, so you need to convert them to data.frames first.
library(gdata)
cbindX(data.frame(x), data.frame(y), z)
#   x    y  a  b
# 1 1    a  1  3
# 2 2    b  2  4
# 3 3    c NA NA
# 4 4    d NA NA
# 5 5 <NA> NA NA

Label portions of a dataframe based on boolean value, including previous rows?

For a given dataframe, I'd like to split it based on some boolean value, and then apply a label to that row and the previous rows up until that point.
Assuming the following dataframe:
test <- data.frame(x = 1:10, y = c(F, F, F, T, F, F, T, F, F, F))
I'd ultimately like to create a new column that would contain a label for that specific portion of the dataframe. Ideally, something like the following:
 x y z
 1 F 1
 2 F 1
 3 F 1
 4 T 1
 5 F 2
 6 F 2
 7 T 2
 8 F 3
 9 F 3
10 F 3
My current thought is that I need to loop through the dataframe with a function similar to the following (but not exactly):
label.portion <- function(test) {
  for (i in 1:nrow(test)) {
    z <- 1
    if (test$y[i]) { z <- z + 1 }
    return(z)
  }
}
What is the best/easiest way of doing this? Any help is much appreciated.
Your z column can be built as
z <- with(test, sum(y)-rev(cumsum(rev(y)))+1)
in order to make every new z value start at a FALSE y after a TRUE y, as per your example.
Then you can do cbind(test, z) to get what you want.
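To see why this works: rev(cumsum(rev(y))) counts how many TRUE values remain from each row to the end, so subtracting it from sum(y) labels the portions from the top. Traced on the example data:
with(test, rev(cumsum(rev(y))))
# [1] 2 2 2 2 1 1 1 0 0 0
with(test, sum(y) - rev(cumsum(rev(y))) + 1)
# [1] 1 1 1 1 2 2 2 3 3 3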
A one-liner solution using transform:
transform(test, z = cumsum(c(0, diff(y)) == -1) + 1)
    x     y z
1   1 FALSE 1
2   2 FALSE 1
3   3 FALSE 1
4   4  TRUE 1
5   5 FALSE 2
6   6 FALSE 2
7   7  TRUE 2
8   8 FALSE 3
9   9 FALSE 3
10 10 FALSE 3
Another one-liner, which will be slightly faster than the other solutions (except data.table):
test <- data.frame(x = 1:10, y = c(F, F, F, T, F, F, T, F, F, F))
test$z <- c(1, head(cumsum(test$y), -1) + 1)
test
##     x     y z
## 1   1 FALSE 1
## 2   2 FALSE 1
## 3   3 FALSE 1
## 4   4  TRUE 1
## 5   5 FALSE 2
## 6   6 FALSE 2
## 7   7  TRUE 2
## 8   8 FALSE 3
## 9   9 FALSE 3
## 10 10 FALSE 3
Benchmarks against the other solutions provided (excluding data.table):
library(microbenchmark)
test <- data.frame(x = 1:1e+05, y = sample(c(T, F), size = 1e+05, replace = TRUE))
microbenchmark(
  c(1, head(cumsum(test$y), -1) + 1),
  cumsum(c(0, diff(test$y)) == -1) + 1,
  with(test, sum(y) - rev(cumsum(rev(y))) + 1),
  times = 100)
## Unit: milliseconds
##                                          expr      min       lq   median       uq       max neval
##            c(1, head(cumsum(test$y), -1) + 1) 1.685473 1.758474 1.865409 4.647218  5.091512   100
##          cumsum(c(0, diff(test$y)) == -1) + 1 4.064867 4.379714 6.936561 7.338810  7.657961   100
##  with(test, sum(y) - rev(cumsum(rev(y))) + 1) 2.568766 2.720395 5.396096 5.701176 30.642436   100
Here is an approach using na.locf from xts and data.table for coding elegance (and efficiency)
library(data.table)
library(xts) # for na.locf
test <- data.table(test)
test[(y), grp := seq_along(y)][, grp := na.locf(grp, fromLast = TRUE)]
test[is.na(grp), grp := max(test[, grp], na.rm =TRUE) + 1L]
And a far clearer and faster approach
test[, grp := {xx <- diff(c(0,.I[y], length(.I))); rep.int(seq_along(xx),xx)}]
Note that diff uses a for loop implemented in R, so an Rcpp sugar implementation would be faster (I'm sure that a cpp function would blow most of these out of the water).
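To unpack that grp expression, a hand trace on the example data:
# With y = c(F, F, F, T, F, F, T, F, F, F):
# .I[y]                      -> c(4, 7)          row numbers where y is TRUE
# c(0, .I[y], length(.I))    -> c(0, 4, 7, 10)   boundaries of the portions
# xx <- diff(...)            -> c(4, 3, 3)       length of each portion
# rep.int(seq_along(xx), xx) -> 1 1 1 1 2 2 2 3 3 3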
