Splitting data.frame inside and outside an R function - r

I have 3 data.frames (A, B1 and B2). I split each by variable study.name and get my desired output shown as out1, out2, out3:
J <- split(A, A$study.name); out1 <- do.call(rbind, c(J, make.row.names = F))
M <- split(B1, B1$study.name); out2 <- do.call(rbind, c(M, make.row.names = F))
N <- split(B2, B2$study.name); out3 <- do.call(rbind, c(N, make.row.names = F))
But I'm wondering why I can't achieve the same output from my function foo? (see below)
A <- read.csv("https://raw.githubusercontent.com/izeh/m/master/irr.csv", h = T) ## data A
B1 <- read.csv('https://raw.githubusercontent.com/izeh/m/master/irr2.csv', h = T) ## data B1
B2 <- read.csv("https://raw.githubusercontent.com/izeh/m/master/irr4.csv", h = T) ## data B2
foo <- function(...){ ## The unsuccessful function `foo`
r <- list(...)
## r <- Can we HERE delete rows and columns that are ALL `NA` or EMPTY in `r`?
J <- unlist(lapply(seq_along(r), function(i) split(r[[i]], r[[i]]$study.name)), recursive = FALSE)
lapply(seq_along(J), function(i)do.call(rbind, c(J[[i]], make.row.names = FALSE)) )
}
foo(B1, B2) # Example without success

We can do the cleaning of rows/columns before doing the split
foo <- function(...){
r <- list(...)
lapply(r, function(dat) {
m1 <- is.na(dat)|dat == ""
i1 <- rowSums(m1) < ncol(m1)
j1 <- colSums(m1) < nrow(m1)
dat1 <- dat[i1, j1]
facColumns <- sapply(dat1, is.factor)
dat1[facColumns] <- lapply(dat1[facColumns], as.character)
dat1$study.name <- factor(dat1$study.name, levels = unique(dat1$study.name))
l1 <- split(dat1, dat1$study.name)
do.call(rbind, c(l1, make.row.names = FALSE))
}
)
}
lapply(foo(B1, B2), head, 2)
#[[1]]
# study.name group.name outcome ESL prof scope type
#1 Shin.Ellis ME.short 1 1 2 1 1
#2 Shin.Ellis ME.long 1 1 2 1 1
#[[2]]
# study.name group.name outcome ESL prof scope type
#1 Shin.Ellis ME.short 1 1 2 1 1
#2 Shin.Ellis ME.long 1 1 2 1 1
or using a single object as argument
lapply(foo(A), head, 2)
#[[1]]
# study.name group.name outcome ESL prof scope type ESL.1 prof.1 scope.1 type.1
#1 Shin.Ellis ME.short 1 1 2 1 1 1 2 1 1
#2 Shin.Ellis ME.long 1 1 2 1 1 1 2 1 1

Related

Counting split rules in decision trees in R

I'm trying to count each unique split rule from a data frame of decision trees in R. For example, if I have a data frame containing 4 trees like the one shown below:
df <- data.frame(
var = c('x10', NA, NA,
'x10', NA, 'x7', NA, NA,
'x5', 'x2', NA, NA, 'x9', NA, NA,
'x5', NA, NA),
num = c(1,1,1,
2,2,2,2,2,
1,1,1,1,1,1,1,
2,2,2),
iter = c(rep(1, 8), rep(2, 10))
)
> df
var num iter
1 x10 1 1
2 <NA> 1 1
3 <NA> 1 1
4 x10 2 1
5 <NA> 2 1
6 x7 2 1
7 <NA> 2 1
8 <NA> 2 1
9 x5 1 2
10 x2 1 2
11 <NA> 1 2
12 <NA> 1 2
13 x9 1 2
14 <NA> 1 2
15 <NA> 1 2
16 x5 2 2
17 <NA> 2 2
18 <NA> 2 2
The var column contains the variable name used in the splitting rule and is ordered by depth first. So, for example, the 4 trees created from that data would look like this:
I'm trying to find a way to return the count of each pair of variables used in a split rule, but grouped by iter. For example, if we look at the 2nd tree (i.e.,num == 2, iter == 1) we can see that x7 splits on x10. so, the pair x10:x7 appears 1 time when iter == 1.
My desired output would look something like this:
allSplits count iter
1 x10:x7 1 1
2 x5:x2 1 2
3 x5:x9 1 2
Any suggestions as to how I could do this?
There is probably a package that knows how to operate on this kind of data frame, but maybe these two hand-crafted recursive functions can get you started.
mkTree <- function(x, pos = 1L) {
var <- x[pos]
if (is.na(var)) {
list(NA_character_, NULL, NULL, 1L)
} else {
node <- vector("list", 4L)
node[[1L]] <- var
node[[2L]] <- l <- Recall(x, pos + 1L)
node[[3L]] <- r <- Recall(x, pos + 1L + l[[4L]])
node[[4L]] <- 1L + l[[4L]] + r[[4L]]
node
}
}
tabTree <- function(tree, sep = ":") {
x <- rep.int(NA_character_, tree[[4L]])
pos <- 1L
recurse <- function(subtree) {
var1 <- subtree[[1L]]
if (!is.na(var1)) {
for (i in 2:3) {
var2 <- subtree[[c(i, 1L)]]
if (!is.na(var2)) {
x[pos] <<- paste0(var1, sep, var2)
pos <<- pos + 1L
Recall(subtree[[i]])
}
}
}
}
recurse(tree)
x <- x[!is.na(x)]
if (length(x)) {
x <- factor(x)
setNames(tabulate(x), levels(x))
} else {
integer(0L)
}
}
mkTree transforms into recursive lists the segments of var in your data frame that specify a tree. Nodes in these recursive structures have the form:
list(variable_name, left_node, right_node, subtree_size)
tabTree takes the mkTree result and returns a named integer vector tabulating the splits. So you could do
f <- function(x) tabTree(mkTree(x))
L <- tapply(df[["var"]], df[c("num", "iter")], f, simplify = FALSE)
to get a list matrix storing the tabulated splits for each [num, iter] pair (i.e., for each tree).
L
## iter
## num 1 2
## 1 integer,0 integer,2
## 2 1 integer,0
L[2L, 1L]
## [[1]]
## x10:x7
## 1
L[1L, 2L]
## [[1]]
## x5:x2 x5:x9
## 1 1
And you could sum over num to get tabulated splits for each level of iter.
g <- function(l) {
x <- unlist(unname(l))
tapply(x, names(x), sum)
}
apply(L, 2L, g)
## $`1`
## x10:x7
## 1
## $`2`
## x5:x2 x5:x9
## 1 1

variables not recognized in for-loop nested within lapply

I have the following data
set.seed(42)
dat <- list(data.table(id=1:10, group=rep(1:2, each=5), x=rnorm(10)),
data.table(id=1:10, group=rep(1:2, each=5), x=rnorm(10)))
to which I would like to apply this function element by element and group by group.
subs = function(x, ..., verbose=FALSE){
L = substitute(list(...))[-1]
mon = data.table(cond = as.character(L))[, skip := FALSE]
for (i in seq_along(L)){
d = eval( substitute(x[cond, verbose=v], list(cond = L[[i]], v = verbose)) )
if (nrow(d)){
x = d
} else {
mon[i, skip := TRUE]
}
}
#print(mon)
return(x)
}
However, when I run this code
# works
out <- lapply(1:2, function(h){
res <- list()
d <- dat[[h]]
for(k in 1:2){
g <- d[group==k]
cutoff <- 1
print(cutoff)
res[[k]] <- subs(g, x>cutoff)
}
res
})
I receive the error that object cutoff cannot be found, although it is printed correctly. However, when I apply the same for-loop outside of the lapply(), it appears to work.
d1 <- dat[[1]]
s <- list()
for(k in 1:2){
g <- d1[group==k]
cutoff <- 1
s[[k]] <- subs(g, x>cutoff)
}
> s
[[1]]
id group x
1: 1 1 1.370958
[[2]]
id group x
1: 7 2 1.511522
2: 9 2 2.018424
This leads me to suspect that it's the inclusion in the lapply() that causes the error but I find it hard to see what the error is, let along how to fix it.
Edit
Data with two variables:
set.seed(42)
dat <- list(data.table(id=1:10, group=rep(1:2, each=5), x=rnorm(10), y=11:20),
data.table(id=1:10, group=rep(1:2, each=5), x=rnorm(10), y=11:20))
with expected result
[[1]]
id group x y
1: 9 2 2.0184237 19
2: 1 1 1.3709584 11
3: 2 1 -0.5646982 12
4: 3 1 0.3631284 13
5: 4 1 0.6328626 14
6: 5 1 0.4042683 15
[[2]]
id group x y
1: 2 1 2.2866454 12
2: 10 2 1.3201133 20
If you use non-standard evaluation you always pay a price. Here it is a scoping issue.
It works like this:
subs = function(x, ..., verbose=FALSE){
L = substitute(list(...))[-1]
mon = data.table(cond = as.character(L))[, skip := FALSE]
for (i in seq_along(L)){
d = eval( substitute(x[cond,, #needed to add this comma, don't know why
verbose=v], list(cond = L[[i]], v = verbose)))
if (nrow(d)){
x = d
} else {
mon[i, skip := TRUE]
}
}
#print(mon)
return(x)
}
out <- lapply(1:2, function(h){
res <- list()
d <- dat[[h]]
for(k in 1:2){
g <- d[group==k]
cutoff <- 1
res[[k]] <- eval(substitute(subs(g, x>cutoff), list(cutoff = cutoff)))
}
res
})
#works
Is there a particular reason for not using data.table's by parameter?
Edit:
Background: The point of subs() is to apply multiple conditions (if
multiple are passed to it) unless one would result in an empty subset.
I would use a different approach then:
subs = function(x, ..., verbose=FALSE){
L = substitute(list(...))[-1]
for (i in seq_along(L)){
d = eval( substitute(x[cond, , verbose=v], list(cond = L[[i]], v = verbose)))
x <- rbind(d, x[!d, on = "group"])
}
return(x)
}
out <- lapply(dat, function(d){
cutoff <- 2 #to get empty groups
eval(substitute(subs(d, x>cutoff), list(cutoff = cutoff)))
})
#[[1]]
# id group x
#1: 9 2 2.0184237
#2: 1 1 1.3709584
#3: 2 1 -0.5646982
#4: 3 1 0.3631284
#5: 4 1 0.6328626
#6: 5 1 0.4042683
#
#[[2]]
# id group x
#1: 2 1 2.2866454
#2: 6 2 0.6359504
#3: 7 2 -0.2842529
#4: 8 2 -2.6564554
#5: 9 2 -2.4404669
#6: 10 2 1.3201133
Beware that this does not retain the ordering.
Another option that retains the ordering:
subs = function(x, ..., verbose=FALSE){
L = substitute(list(...))[-1]
for (i in seq_along(L)){
x = eval( substitute(x[, {
res <- .SD[cond];
if (nrow(res) > 0) res else .SD
}, by = "group", verbose=v], list(cond = L[[i]], v = verbose)))
}
return(x)
}
The by variable could be passed as a function parameter and then substituted in together with the condition.
I haven't done benchmarks comparing the efficiency of these two.

R: cbind function in for loop

c <- readline(prompt="Enter an integer: ")
b <- readline(prompt="Enter an integer: ")
for(i in 1:c){
assign(paste("a", i, sep = ""), i)
}
This gives a1, a2 ... ac variables containing 1,2 ... c
How can I use cbind based on the value of b? For example, take the following:
# assume b = 3 and c = 12:
t1 <- cbind(a1,a2,a3)
t2 <- cbind(a4,a5,a6)
t3 <- cbind(a7,a8,a9)
t4 <- cbind(a10,a11,a12)
# assume b = 4 and c = 12:
t1 <- cbind(a1,a2,a3,a4)
t2 <- cbind(a5,a6,a7,a8)
t3 <- cbind(a9,a10,a11,a12)
Another example to clarify: assume b = 3, c=6
a1 <- c(3,5,2)
a2 <- c(4,7,3)
a3 <- c(3,5,2)
a4 <- c(4,5,3)
a5 <- c(5,5,5)
a6 <- c(4,3,1)
t1 <- cbind(a1,a2,a3)
t2 <- cbind(a4,a5,a6)
Expected value of t1:
3 4 3
5 7 5
2 3 2
I am making some assumptions about your data. I am assuming you have values assigned with the columns you are trying to cbind.
a <- 12
b <- 3
test <- NULL
index <- NULL
for(i in 1:a){
test[i] <- paste0("n_", i)
index[i] <- paste(i)
}
start <- seq(1,a-b+1, by=b)
end <- seq(b,a, by=b)
s = list()
k=1
for(k in 1:length(start)){
cbind_list <- start[k]:end[k]
s[[k]] <- rbind(test[seq(cbind_list[1],cbind_list[length(cbind_list)],by=1)])
}
list_cols <- do.call(rbind, s)
n_1 <- rep(1,4)
n_2 <- rep(2,4)
n_3 <- rep(3,4)
n_4 <- rep(4,4)
n_5 <- rep(5,4)
n_6 <- rep(6,4)
n_7 <- rep(7,4)
n_8 <- rep(8,4)
n_9 <- rep(9,4)
n_10 <- rep(10,4)
n_11 <- rep(11,4)
n_12 <- rep(12,4)
df <- data.frame(n_1,n_2,n_3,n_4,n_5,n_6,n_7,n_8,n_9,n_10,n_11,n_12)
t=list()
for(p in 1:nrow(list_cols)){
nam <- paste0("t",p)
assign(nam,cbind(df[,match(list_cols[p,], colnames(df))]))
}
OUTPUT:
> t1
n_1 n_2 n_3
1 1 2 3
2 1 2 3
3 1 2 3
4 1 2 3
UPDATED:
a <- 6
b <- 3
test <- NULL
index <- NULL
for(i in 1:a){
test[i] <- paste0("n_", i)
index[i] <- paste(i)
}
start <- seq(1,a-b+1, by=b)
end <- seq(b,a, by=b)
s = list()
k=1
for(k in 1:length(start)){
cbind_list <- start[k]:end[k]
s[[k]] <- rbind(test[seq(cbind_list[1],cbind_list[length(cbind_list)],by=1)])
}
list_cols <- do.call(rbind, s)
n_1 <- c(3,5,2)
n_2 <- c(4,7,3)
n_3 <- c(3,5,2)
n_4 <- c(4,5,3)
n_5 <- c(5,5,5)
n_6 <- c(4,3,1)
df <- data.frame(n_1,n_2,n_3,n_4,n_5,n_6)
t=list()
p=1
for(p in 1:nrow(list_cols)){
nam <- paste0("t",p)
assign(nam,cbind(df[,match(list_cols[p,], colnames(df))]))
}
OUTPUT:
> t1
n_1 n_2 n_3
1 3 4 3
2 5 7 5
3 2 3 2

R - Collapse into vector same member of a list

I have a list with same structure for every member as the following
config <- NULL
config[["secA"]] <- NULL
config[["secA"]]$VAL <- 0
config[["secA"]]$ARR <- c(1,2,3,4,5)
config[["secA"]]$DF <- data.frame(matrix(c(1,5,3,8),2,2))
config[["secB"]] <- NULL
config[["secB"]]$VAL <- 1
config[["secB"]]$ARR <- c(1,3,2,4,9)
config[["secB"]]$DF <- data.frame(matrix(c(2,6,1,9),2,2))
config[["secC"]] <- NULL
config[["secC"]]$VAL <- 5
config[["secC"]]$ARR <- c(4,2,1,5,8)
config[["secC"]]$DF <- data.frame(matrix(c(4,2,1,7),2,2))
and I need to obtain 3 vectors VAL, ARR and DF, each with the concatenated elements of the corresponding member. such as
# VAL: 0,1,5
# ARR: 1,2,3,4,5,1,3,2,4,9,4,2,1,5,8
# DF: 1,5,3,8,2,6,1,9,4,2,1,7
Looking at similar situations, I have the feeling I need to use a combination of do.call and cbind or lapply but I have no clue. any suggestions?
config <- NULL
config[["secA"]] <- NULL
config[["secA"]]$VAL <- 0
config[["secA"]]$ARR <- c(1,2,3,4,5)
config[["secA"]]$DF <- data.frame(matrix(c(1,5,3,8),2,2))
config[["secB"]] <- NULL
config[["secB"]]$VAL <- 1
config[["secB"]]$ARR <- c(1,3,2,4,9)
config[["secB"]]$DF <- data.frame(matrix(c(2,6,1,9),2,2))
config[["secC"]] <- NULL
config[["secC"]]$VAL <- 5
config[["secC"]]$ARR <- c(4,2,1,5,8)
config[["secC"]]$DF <- data.frame(matrix(c(4,2,1,7),2,2))
sapply(names(config[[1]]), function(x)
unname(unlist(sapply(config, `[`, x))), USE.NAMES = TRUE)
# $VAL
# [1] 0 1 5
#
# $ARR
# [1] 1 2 3 4 5 1 3 2 4 9 4 2 1 5 8
#
# $DF
# [1] 1 5 3 8 2 6 1 9 4 2 1 7
Or you can use this clist function
Unfortunately there were no other answers.
(l <- Reduce(clist, config))
# $VAL
# [1] 0 1 5
#
# $ARR
# [1] 1 2 3 4 5 1 3 2 4 9 4 2 1 5 8
#
# $DF
# X1 X2 X1 X2 X1 X2
# 1 1 3 2 1 4 1
# 2 5 8 6 9 2 7
It merges data frames and matrices, so you need to unlist to get the vector you want
l$DF <- unname(unlist(l$DF))
l
# $VAL
# [1] 0 1 5
#
# $ARR
# [1] 1 2 3 4 5 1 3 2 4 9 4 2 1 5 8
#
# $DF
# [1] 1 5 3 8 2 6 1 9 4 2 1 7
Function
clist <- function (x, y) {
islist <- function(x) inherits(x, 'list')
'%||%' <- function(a, b) if (!is.null(a)) a else b
get_fun <- function(x, y)
switch(class(x %||% y),
matrix = cbind,
data.frame = function(x, y)
do.call('cbind.data.frame', Filter(Negate(is.null), list(x, y))),
factor = function(...) unlist(list(...)), c)
stopifnot(islist(x), islist(y))
nn <- names(rapply(c(x, y), names, how = 'list'))
if (is.null(nn) || any(!nzchar(nn)))
stop('All non-NULL list elements should have unique names', domain = NA)
nn <- unique(c(names(x), names(y)))
z <- setNames(vector('list', length(nn)), nn)
for (ii in nn)
z[[ii]] <- if (islist(x[[ii]]) && islist(y[[ii]]))
Recall(x[[ii]], y[[ii]]) else
(get_fun(x[[ii]], y[[ii]]))(x[[ii]], y[[ii]])
z
}
Another approach, with slightly less code.
un_config <- unlist(config)
un_configNAM <- names(un_config)
vecNAM <- c("VAL", "ARR", "DF")
for(n in vecNAM){
assign(n, un_config[grepl(n, un_configNAM)])
}
This will return 3 vectors as the OP requested. However, generally it is more advantageous to store results in a list as rawr suggests. You of course can adopt the above code so that results are stored within a list.
l <- rep(list(NA), length(vecNAM))
i = 1
for(n in vecNAM){
l[[i]] <- un_config[grepl(n, un_configNAM)]
i = i +1
}

Intersect all possible combinations of list elements

I have a list of vectors:
> l <- list(A=c("one", "two", "three", "four"), B=c("one", "two"), C=c("two", "four", "five", "six"), D=c("six", "seven"))
> l
$A
[1] "one" "two" "three" "four"
$B
[1] "one" "two"
$C
[1] "two" "four" "five" "six"
$D
[1] "six" "seven"
I would like to calculate the length of the overlap between all possible pairwise combinations of the list elements, i.e. (the format of the result doesn't matter):
AintB 2
AintC 2
AintD 0
BintC 1
BintD 0
CintD 1
I know combn(x, 2) can be used to get a matrix of all possible pairwise combinations in a vector and that length(intersect(a, b)) would give me the length of the overlap of two vectors, but I can't think of a way to put the two things together.
Any help is much appreciated! Thanks.
If I understand correctly, you can look at crossprod and stack:
crossprod(table(stack(l)))
# ind
# ind A B C D
# A 4 2 2 0
# B 2 2 1 0
# C 2 1 4 1
# D 0 0 1 2
You can extend the idea if you want a data.frame of just the relevant values as follows:
Write a spiffy function
listIntersect <- function(inList) {
X <- crossprod(table(stack(inList)))
X[lower.tri(X)] <- NA
diag(X) <- NA
out <- na.omit(data.frame(as.table(X)))
out[order(out$ind), ]
}
Apply it
listIntersect(l)
# ind ind.1 Freq
# 5 A B 2
# 9 A C 2
# 13 A D 0
# 10 B C 1
# 14 B D 0
# 15 C D 1
Performance seems pretty decent.
Expand the list:
L <- unlist(replicate(100, l, FALSE), recursive=FALSE)
names(L) <- make.unique(names(L))
Set up some functions to test:
fun1 <- function(l) listIntersect(l)
fun2 <- function(l) apply( combn( l , 2 ) , 2 , function(x) length( intersect( unlist( x[1]) , unlist(x[2]) ) ) )
fun3 <- function(l) {
m1 <- combn(names(l),2)
val <- sapply(split(m1, col(m1)),function(x) {x1 <- l[[x[1]]]; x2 <- l[[x[2]]]; length(intersect(x1, x2))})
Ind <- apply(m1,2,paste,collapse="int")
data.frame(Ind, val, stringsAsFactors=F)
}
Check out the timings:
system.time(F1 <- fun1(L))
# user system elapsed
# 0.33 0.00 0.33
system.time(F2 <- fun2(L))
# user system elapsed
# 4.32 0.00 4.31
system.time(F3 <- fun3(L))
# user system elapsed
# 6.33 0.00 6.33
Everyone seems to be sorting the result differently, but the numbers match:
table(F1$Freq)
#
# 0 1 2 4
# 20000 20000 29900 9900
table(F2)
# F2
# 0 1 2 4
# 20000 20000 29900 9900
table(F3$val)
#
# 0 1 2 4
# 20000 20000 29900 9900
combn works with list structures as well, you just need a little unlist'ing of the result to use intersect...
# Get the combinations of names of list elements
nms <- combn( names(l) , 2 , FUN = paste0 , collapse = "" , simplify = FALSE )
# Make the combinations of list elements
ll <- combn( l , 2 , simplify = FALSE )
# Intersect the list elements
out <- lapply( ll , function(x) length( intersect( x[[1]] , x[[2]] ) ) )
# Output with names
setNames( out , nms )
#$AB
#[1] 2
#$AC
#[1] 2
#$AD
#[1] 0
#$BC
#[1] 1
#$BD
#[1] 0
#$CD
#[1] 1
Try:
m1 <- combn(names(l),2)
val <- sapply(split(m1, col(m1)),function(x) {x1 <- l[[x[1]]]; x2 <- l[[x[2]]]; length(intersect(x1, x2))})
Ind <- apply(m1,2,paste,collapse="int")
data.frame(Ind, val, stringsAsFactors=F)
# Ind val
# 1 AntB 2
# 2 AntC 2
# 3 AntD 0
# 4 BntC 1
# 5 BntD 0
# 6 CntD 1

Resources