R - Collapse into vector same member of a list - r

I have a list with same structure for every member as the following
config <- NULL
config[["secA"]] <- NULL
config[["secA"]]$VAL <- 0
config[["secA"]]$ARR <- c(1,2,3,4,5)
config[["secA"]]$DF <- data.frame(matrix(c(1,5,3,8),2,2))
config[["secB"]] <- NULL
config[["secB"]]$VAL <- 1
config[["secB"]]$ARR <- c(1,3,2,4,9)
config[["secB"]]$DF <- data.frame(matrix(c(2,6,1,9),2,2))
config[["secC"]] <- NULL
config[["secC"]]$VAL <- 5
config[["secC"]]$ARR <- c(4,2,1,5,8)
config[["secC"]]$DF <- data.frame(matrix(c(4,2,1,7),2,2))
and I need to obtain 3 vectors VAL, ARR and DF, each with the concatenated elements of the corresponding member. such as
# VAL: 0,1,5
# ARR: 1,2,3,4,5,1,3,2,4,9,4,2,1,5,8
# DF: 1,5,3,8,2,6,1,9,4,2,1,7
Looking at similar situations, I have the feeling I need to use a combination of do.call and cbind or lapply but I have no clue. any suggestions?

config <- NULL
config[["secA"]] <- NULL
config[["secA"]]$VAL <- 0
config[["secA"]]$ARR <- c(1,2,3,4,5)
config[["secA"]]$DF <- data.frame(matrix(c(1,5,3,8),2,2))
config[["secB"]] <- NULL
config[["secB"]]$VAL <- 1
config[["secB"]]$ARR <- c(1,3,2,4,9)
config[["secB"]]$DF <- data.frame(matrix(c(2,6,1,9),2,2))
config[["secC"]] <- NULL
config[["secC"]]$VAL <- 5
config[["secC"]]$ARR <- c(4,2,1,5,8)
config[["secC"]]$DF <- data.frame(matrix(c(4,2,1,7),2,2))
sapply(names(config[[1]]), function(x)
unname(unlist(sapply(config, `[`, x))), USE.NAMES = TRUE)
# $VAL
# [1] 0 1 5
#
# $ARR
# [1] 1 2 3 4 5 1 3 2 4 9 4 2 1 5 8
#
# $DF
# [1] 1 5 3 8 2 6 1 9 4 2 1 7
Or you can use this clist function
Unfortunately there were no other answers.
(l <- Reduce(clist, config))
# $VAL
# [1] 0 1 5
#
# $ARR
# [1] 1 2 3 4 5 1 3 2 4 9 4 2 1 5 8
#
# $DF
# X1 X2 X1 X2 X1 X2
# 1 1 3 2 1 4 1
# 2 5 8 6 9 2 7
It merges data frames and matrices, so you need to unlist to get the vector you want
l$DF <- unname(unlist(l$DF))
l
# $VAL
# [1] 0 1 5
#
# $ARR
# [1] 1 2 3 4 5 1 3 2 4 9 4 2 1 5 8
#
# $DF
# [1] 1 5 3 8 2 6 1 9 4 2 1 7
Function
clist <- function (x, y) {
islist <- function(x) inherits(x, 'list')
'%||%' <- function(a, b) if (!is.null(a)) a else b
get_fun <- function(x, y)
switch(class(x %||% y),
matrix = cbind,
data.frame = function(x, y)
do.call('cbind.data.frame', Filter(Negate(is.null), list(x, y))),
factor = function(...) unlist(list(...)), c)
stopifnot(islist(x), islist(y))
nn <- names(rapply(c(x, y), names, how = 'list'))
if (is.null(nn) || any(!nzchar(nn)))
stop('All non-NULL list elements should have unique names', domain = NA)
nn <- unique(c(names(x), names(y)))
z <- setNames(vector('list', length(nn)), nn)
for (ii in nn)
z[[ii]] <- if (islist(x[[ii]]) && islist(y[[ii]]))
Recall(x[[ii]], y[[ii]]) else
(get_fun(x[[ii]], y[[ii]]))(x[[ii]], y[[ii]])
z
}

Another approach, with slightly less code.
un_config <- unlist(config)
un_configNAM <- names(un_config)
vecNAM <- c("VAL", "ARR", "DF")
for(n in vecNAM){
assign(n, un_config[grepl(n, un_configNAM)])
}
This will return 3 vectors as the OP requested. However, generally it is more advantageous to store results in a list as rawr suggests. You of course can adopt the above code so that results are stored within a list.
l <- rep(list(NA), length(vecNAM))
i = 1
for(n in vecNAM){
l[[i]] <- un_config[grepl(n, un_configNAM)]
i = i +1
}

Related

apply similar variable to multiple dataset in r

I have 6 data named from dat1 to dat6, I want to add variable region and label them in a similar way, like this:
dat1$region <- paste("NE-1")
dat2$region <- paste("NE-2")
dat3$region <- paste("NE-3")
dat4$region <- paste("NE-4")
dat5$region <- paste("NE-5")
How can I write this code in a more concise way? using apply or for-loop?
Thanks!!
One option is to use get and assign functions in a for-loop.
Sample data:
dat1 <- data.frame(id=1:4, region = letters[1:4])
dat2 <- data.frame(id=5:8, region = letters[5:8])
dat3 <- data.frame(id=9:12, region = letters[9:12])
dat4 <- data.frame(id=13:16, region = letters[13:16])
dat5 <- data.frame(id=17:20, region = letters[17:20])
dat1
# id region
# 1 1 a
# 2 2 b
# 3 3 c
# 4 4 d
Apply for-loop:
for(i in 1:5){
name = paste("dat",i,sep="")
temp <- get(name)
temp$region = paste("NE",i,sep = "-")
assign(name, temp)
}
Verify results:
dat1
# id region
# 1 1 NE-1
# 2 2 NE-1
# 3 3 NE-1
# 4 4 NE-1
dat5
# id region
# 1 17 NE-5
# 2 18 NE-5
# 3 19 NE-5
# 4 20 NE-5
Keep all dataframes in a list then use lapply:
# example dataframes
dat1 <- cars[1:2, ]
dat2 <- cars[3:4, ]
dat3 <- cars[5:6, ]
myList <- list(dat1, dat2, dat3)
# myList
# [[1]]
# speed dist
# 1 4 2
# 2 4 10
#
# [[2]]
# speed dist
# 3 7 4
# 4 7 22
#
# [[3]]
# speed dist
# 5 8 16
# 6 9 10
Then it is easier to do repetitive operations. Loop through the list, add region column:
res <- lapply(seq_along(myList), function(i){
x <- myList[[ i ]]
x$region <- paste0("NE-", i)
x
})
res
# [[1]]
# speed dist region
# 1 4 2 NE-1
# 2 4 10 NE-1
#
# [[2]]
# speed dist region
# 3 7 4 NE-2
# 4 7 22 NE-2
#
# [[3]]
# speed dist region
# 5 8 16 NE-3
# 6 9 10 NE-3
How about this (assuming all your items start with dat and end with a unique identifier string):
dat_names <- ls()[grepl("^dat", ls())]
dat_ID <- sapply(dat_names, function(d) gsub("dat", "", d))
for(d in 1:length(dat_names)) {
dat_names[[d]]$region <- paste("NE-", dat_ID[d], sep="")
}

Merging list of numbers in R

I've a list with groups of numbers as follows:
myList <- list(1:5, c(1,3,4,7), 2:6, c(3,6:9), 4:8)
myList
#[[1]]
#[1] 1 2 3 4 5
#
#[[2]]
#[1] 1 3 4 7
#
#[[3]]
#[1] 2 3 4 5 6
#
#[[4]]
#[1] 3 6 7 8 9
#
#[[5]]
#[1] 4 5 6 7 8
So we have 5 groups of numbers in myList. I want to merge two groups if they have at least s% same numbers in them. I compute similarity using the following function call:
get.Overlap <- function(group_A, group_B)
{
common <- length(intersect(group_A, group_B))
minimum_length <- min(length(group_A), length(group_B))
#formula: |A ^ B| / min{|A|, |B|}
overlap_score <- common / minimum_length
overlap_score
}
I have implemented mergeList(myList, threshold_s) using loops as follows:
mergeList <- function(myList, threshold_s)
{
returnList <- list()
i <- 1
while(i <= length(myList))
{
thisList <- myList[[i]]
j <- i + 1
while(j <= length(myList))
{
tempList <- myList[[j]]
if(get.Overlap(thisList, tempList) >= threshold_s)
{
thisList <- union(thisList, tempList)
myList[[j]] <- NULL
}
else
{
j <- j + 1
}
}
returnList <- c(returnList, list(thisList))
i <- i + 1
}
returnList
}
Now if I call merge(myList, threshold_s), where threshold_s is set to 0.80 meaning 80% similarity, the output will be
#[[1]]
#[1] 1 2 3 4 5 6
#
#[[2]]
#[1] 1 3 4 7
#
#[[3]]
#[1] 3 4 6 7 8 9
The complexity is comparatively high. I am looking for a fast implement of merge(myList, threshold_s) for a large list, say length of myList may be around 50,000.
Thanks in advance.

Intersect all possible combinations of list elements

I have a list of vectors:
> l <- list(A=c("one", "two", "three", "four"), B=c("one", "two"), C=c("two", "four", "five", "six"), D=c("six", "seven"))
> l
$A
[1] "one" "two" "three" "four"
$B
[1] "one" "two"
$C
[1] "two" "four" "five" "six"
$D
[1] "six" "seven"
I would like to calculate the length of the overlap between all possible pairwise combinations of the list elements, i.e. (the format of the result doesn't matter):
AintB 2
AintC 2
AintD 0
BintC 1
BintD 0
CintD 1
I know combn(x, 2) can be used to get a matrix of all possible pairwise combinations in a vector and that length(intersect(a, b)) would give me the length of the overlap of two vectors, but I can't think of a way to put the two things together.
Any help is much appreciated! Thanks.
If I understand correctly, you can look at crossprod and stack:
crossprod(table(stack(l)))
# ind
# ind A B C D
# A 4 2 2 0
# B 2 2 1 0
# C 2 1 4 1
# D 0 0 1 2
You can extend the idea if you want a data.frame of just the relevant values as follows:
Write a spiffy function
listIntersect <- function(inList) {
X <- crossprod(table(stack(inList)))
X[lower.tri(X)] <- NA
diag(X) <- NA
out <- na.omit(data.frame(as.table(X)))
out[order(out$ind), ]
}
Apply it
listIntersect(l)
# ind ind.1 Freq
# 5 A B 2
# 9 A C 2
# 13 A D 0
# 10 B C 1
# 14 B D 0
# 15 C D 1
Performance seems pretty decent.
Expand the list:
L <- unlist(replicate(100, l, FALSE), recursive=FALSE)
names(L) <- make.unique(names(L))
Set up some functions to test:
fun1 <- function(l) listIntersect(l)
fun2 <- function(l) apply( combn( l , 2 ) , 2 , function(x) length( intersect( unlist( x[1]) , unlist(x[2]) ) ) )
fun3 <- function(l) {
m1 <- combn(names(l),2)
val <- sapply(split(m1, col(m1)),function(x) {x1 <- l[[x[1]]]; x2 <- l[[x[2]]]; length(intersect(x1, x2))})
Ind <- apply(m1,2,paste,collapse="int")
data.frame(Ind, val, stringsAsFactors=F)
}
Check out the timings:
system.time(F1 <- fun1(L))
# user system elapsed
# 0.33 0.00 0.33
system.time(F2 <- fun2(L))
# user system elapsed
# 4.32 0.00 4.31
system.time(F3 <- fun3(L))
# user system elapsed
# 6.33 0.00 6.33
Everyone seems to be sorting the result differently, but the numbers match:
table(F1$Freq)
#
# 0 1 2 4
# 20000 20000 29900 9900
table(F2)
# F2
# 0 1 2 4
# 20000 20000 29900 9900
table(F3$val)
#
# 0 1 2 4
# 20000 20000 29900 9900
combn works with list structures as well, you just need a little unlist'ing of the result to use intersect...
# Get the combinations of names of list elements
nms <- combn( names(l) , 2 , FUN = paste0 , collapse = "" , simplify = FALSE )
# Make the combinations of list elements
ll <- combn( l , 2 , simplify = FALSE )
# Intersect the list elements
out <- lapply( ll , function(x) length( intersect( x[[1]] , x[[2]] ) ) )
# Output with names
setNames( out , nms )
#$AB
#[1] 2
#$AC
#[1] 2
#$AD
#[1] 0
#$BC
#[1] 1
#$BD
#[1] 0
#$CD
#[1] 1
Try:
m1 <- combn(names(l),2)
val <- sapply(split(m1, col(m1)),function(x) {x1 <- l[[x[1]]]; x2 <- l[[x[2]]]; length(intersect(x1, x2))})
Ind <- apply(m1,2,paste,collapse="int")
data.frame(Ind, val, stringsAsFactors=F)
# Ind val
# 1 AntB 2
# 2 AntC 2
# 3 AntD 0
# 4 BntC 1
# 5 BntD 0
# 6 CntD 1

cbind: is there a way to have missing values set to NA?

Please forgive me if I missed an answer to such a simple question.
I want to use cbind() to bind two columns. One of them is a single entry shorter in length.
Can I have R supply an NA for the missing value?
The documentation discusses a deparse.level argument but this doesn't seem to be my solution.
Further, if I may be so bold, would there also be a quick way to prepend the shorter column with NA's?
Try this:
x <- c(1:5)
y <- c(4:1)
length(y) = length(x)
cbind(x,y)
x y
[1,] 1 4
[2,] 2 3
[3,] 3 2
[4,] 4 1
[5,] 5 NA
or this:
x <- c(4:1)
y <- c(1:5)
length(x) = length(y)
cbind(x,y)
x y
[1,] 4 1
[2,] 3 2
[3,] 2 3
[4,] 1 4
[5,] NA 5
I think this will do something similar to what DWin suggested and work regardless of which vector is shorter:
x <- c(4:1)
y <- c(1:5)
lengths <- max(c(length(x), length(y)))
length(x) <- lengths
length(y) <- lengths
cbind(x,y)
The code above can also be condensed to:
x <- c(4:1)
y <- c(1:5)
length(x) <- length(y) <- max(c(length(x), length(y)))
cbind(x,y)
EDIT
Here is what I came up with to address the question:
"Further, if I may be so bold, would there also be a quick way to prepend the shorter column with NA's?"
inserted into the original post by Matt O'Brien.
x <- c(4:1)
y <- c(1:5)
first <- 1 # 1 means add NA to top of shorter vector
# 0 means add NA to bottom of shorter vector
if(length(x)<length(y)) {
if(first==1) x = c(rep(NA, length(y)-length(x)),x);y=y
if(first==0) x = c(x,rep(NA, length(y)-length(x)));y=y
}
if(length(y)<length(x)) {
if(first==1) y = c(rep(NA, length(x)-length(y)),y);x=x
if(first==0) y = c(y,rep(NA, length(x)-length(y)));x=x
}
cbind(x,y)
# x y
# [1,] NA 1
# [2,] 4 2
# [3,] 3 3
# [4,] 2 4
# [5,] 1 5
Here is a function:
x <- c(4:1)
y <- c(1:5)
first <- 1 # 1 means add NA to top of shorter vector
# 0 means add NA to bottom of shorter vector
my.cbind <- function(x,y,first) {
if(length(x)<length(y)) {
if(first==1) x = c(rep(NA, length(y)-length(x)),x);y=y
if(first==0) x = c(x,rep(NA, length(y)-length(x)));y=y
}
if(length(y)<length(x)) {
if(first==1) y = c(rep(NA, length(x)-length(y)),y);x=x
if(first==0) y = c(y,rep(NA, length(x)-length(y)));x=x
}
return(cbind(x,y))
}
my.cbind(x,y,first)
my.cbind(c(1:5),c(4:1),1)
my.cbind(c(1:5),c(4:1),0)
my.cbind(c(1:4),c(5:1),1)
my.cbind(c(1:4),c(5:1),0)
my.cbind(c(1:5),c(5:1),1)
my.cbind(c(1:5),c(5:1),0)
This version allows you to cbind two vectors of different mode:
x <- c(4:1)
y <- letters[1:5]
first <- 1 # 1 means add NA to top of shorter vector
# 0 means add NA to bottom of shorter vector
my.cbind <- function(x,y,first) {
if(length(x)<length(y)) {
if(first==1) x = c(rep(NA, length(y)-length(x)),x);y=y
if(first==0) x = c(x,rep(NA, length(y)-length(x)));y=y
}
if(length(y)<length(x)) {
if(first==1) y = c(rep(NA, length(x)-length(y)),y);x=x
if(first==0) y = c(y,rep(NA, length(x)-length(y)));x=x
}
x <- as.data.frame(x)
y <- as.data.frame(y)
return(data.frame(x,y))
}
my.cbind(x,y,first)
# x y
# 1 NA a
# 2 4 b
# 3 3 c
# 4 2 d
# 5 1 e
my.cbind(c(1:5),letters[1:4],1)
my.cbind(c(1:5),letters[1:4],0)
my.cbind(c(1:4),letters[1:5],1)
my.cbind(c(1:4),letters[1:5],0)
my.cbind(c(1:5),letters[1:5],1)
my.cbind(c(1:5),letters[1:5],0)
A while back I had put together a function called Cbind that was meant to do this sort of thing. In its current form, it should be able to handle vectors, data.frames, and matrices as the input.
For now, the function is here: https://gist.github.com/mrdwab/6789277
Here is how one would use the function:
x <- 1:5
y <- letters[1:4]
z <- matrix(1:4, ncol = 2, dimnames = list(NULL, c("a", "b")))
Cbind(x, y, z)
# x y z_a z_b
# 1 1 a 1 3
# 2 2 b 2 4
# 3 3 c NA NA
# 4 4 d NA NA
# 5 5 <NA> NA NA
Cbind(x, y, z, first = FALSE)
# x y z_a z_b
# 1 1 <NA> NA NA
# 2 2 a NA NA
# 3 3 b NA NA
# 4 4 c 1 3
# 5 5 d 2 4
The two three functions required are padNA, dotnames, and Cbind, which are defined as follows:
padNA <- function (mydata, rowsneeded, first = TRUE) {
## Pads vectors, data.frames, or matrices with NA
temp1 = colnames(mydata)
rowsneeded = rowsneeded - nrow(mydata)
temp2 = setNames(
data.frame(matrix(rep(NA, length(temp1) * rowsneeded),
ncol = length(temp1))), temp1)
if (isTRUE(first)) rbind(mydata, temp2)
else rbind(temp2, mydata)
}
dotnames <- function(...) {
## Gets the names of the objects passed through ...
vnames <- as.list(substitute(list(...)))[-1L]
vnames <- unlist(lapply(vnames,deparse), FALSE, FALSE)
vnames
}
Cbind <- function(..., first = TRUE) {
## cbinds vectors, data.frames, and matrices together
Names <- dotnames(...)
datalist <- setNames(list(...), Names)
nrows <- max(sapply(datalist, function(x)
ifelse(is.null(dim(x)), length(x), nrow(x))))
datalist <- lapply(seq_along(datalist), function(x) {
z <- datalist[[x]]
if (is.null(dim(z))) {
z <- setNames(data.frame(z), Names[x])
} else {
if (is.null(colnames(z))) {
colnames(z) <- paste(Names[x], sequence(ncol(z)), sep = "_")
} else {
colnames(z) <- paste(Names[x], colnames(z), sep = "_")
}
}
padNA(z, rowsneeded = nrows, first = first)
})
do.call(cbind, datalist)
}
Part of the reason I stopped working on the function was that the gdata package already has a function called cbindX that handles cbinding data.frames and matrices with different numbers of rows. It will not work directly on vectors, so you need to convert them to data.frames first.
library(gdata)
cbindX(data.frame(x), data.frame(y), z)
# x y a b
# 1 1 a 1 3
# 2 2 b 2 4
# 3 3 c NA NA
# 4 4 d NA NA
# 5 5 <NA> NA NA

Alternative to class() that does not distinguish between "numeric" and "integer"

Given a data.frame, I would like to test if all the columns are of the same "class". if they are I'd like to leave the data.frame as is. If they aren't I'd like to keep all columns that match the first variables class and drop any that are not of that class. The exception being that, for my purposes, integer and numeric are equal.
For example:
dat <- data.frame(numeric,numeric,integer,factor)
Would be:
data.frame(numeric,numeric,integer)
Additionally
dat <- data.frame(character,character,integer)
Would be:
data.frame(character,character)
And finally:
dat <- data.frame(numeric,numeric,numeric,factor)
Would be:
data.frame(numeric,numeric,numeric)
I would do this:
dat <- data.frame(
a=as.integer(1:26), b=as.integer(26:1), c=as.numeric(1:26), d=as.factor(1:26)
)
Create two helper functions:
is.numint <- function(x)is.numeric(x) || is.integer(x)
is.charfact <- function(x)is.character(x) || is.factor(x)
Return only numeric columns:
head(dat[, sapply(dat, is.numint)])
a b c
1 1 26 1
2 2 25 2
3 3 24 3
4 4 23 4
5 5 22 5
Return only factor columns:
head(dat[, sapply(dat, is.charfact), drop=FALSE])
d
1 1
2 2
3 3
4 4
5 5
6 6
Combining this approach, and rewriting your function:
dropext <- function(x){
is.numint <- function(x)is.numeric(x) || is.integer(x)
is.charfact <- function(x)is.character(x) || is.factor(x)
cl <- rep(NA, length(x))
cl[sapply(x, is.numint)] <- "num"
cl[sapply(x, is.charfact)] <- "char"
x[, cl == unique(cl)[1], drop=FALSE]
}
dropext(dat)
a b c
1 1 26 1
2 2 25 2
3 3 24 3
4 4 23 4
5 5 22 5
How about:
if(length(unique(cl <- sapply(dat, class))) > 1 &&
any(!sapply(dat, is.numeric))) {
dat <- dat[ , which(cl == cl[1]), drop = FALSE]
}
This assumes that in the following example:
dat2 <- data.frame(A = factor(sample(LETTERS, 26, replace = TRUE)),
B = factor(sample(LETTERS, 26, replace = TRUE)),
C = sample(LETTERS, 26, replace = TRUE),
dat, stringsAsFactors = FALSE)
> sapply(dat2, class)
A B C
"factor" "factor" "character"
as.integer.1.26. as.integer.26.1. as.numeric.1.26.
"integer" "integer" "numeric"
you want only the factor variables, i.e. you want to distinguish between character and factor variables - which is what your code appears to do.
For this example I used
if(length(unique(cl <- sapply(dat2, class))) > 1 &&
any(!sapply(dat2, is.numeric))) {
dat2 <- dat2[ ,which(cl == cl[1]), drop = FALSE]
}
which results in
> head(dat2)
A B
1 D G
2 P D
3 C T
4 X F
5 N R
6 A E
> sapply(dat2, class)
A B
"factor" "factor"
On dat, the above if() statement would not change dat:
> if(length(unique(cl <- sapply(dat, class))) > 1 &&
+ any(!sapply(dat, is.numeric))) {
+ dat <- dat[ , which(cl == cl[1]), drop = FALSE]
+ }
> head(dat)
as.integer.1.26. as.integer.26.1. as.numeric.1.26.
1 1 26 1
2 2 25 2
3 3 24 3
4 4 23 4
5 5 22 5
6 6 21 6
Appreciate the commentary and your answers, in the end all I needed was a class() function that does not distinguish between integer and numeric. Which can be accomplished with a simple wrapper.
class.wrap <- function(x) {
test <- class(x)
if(test == "integer") test <- "numeric"
return(test)
}

Resources