Find combinations of objects in R - r

I have the following example code:
library(caTools)
sample1 = rnorm(20)
sample2 = rnorm(30)
sample3 = rnorm(40)
# could be more samples
args = list(sample1, sample2, sample3) # could be more
> combs(c(args), k=2)
[,1] [,2]
[1,] Numeric,20 Numeric,30
[2,] Numeric,20 Numeric,40
[3,] Numeric,30 Numeric,40
However, this is not what is desired. I would like to feed combs input that should give the same as:
> combs(c("sample1","sample2", "sample3"),k=2)
[,1] [,2]
[1,] "sample1" "sample2"
[2,] "sample1" "sample3"
[3,] "sample2" "sample3"
and from there I would want to use get to extract the vectors for each sampleX object by row.
How can I do this without hardcoding "sample1", "sample2", etc., so that it works with as many samples as are fed to it?

From library(gtools):
combinations(3,2,c("sample1","sample2", "sample3"))
Result:
[,1] [,2]
[1,] "sample1" "sample2"
[2,] "sample1" "sample3"
[3,] "sample2" "sample3"
The same result can be obtained if those objects are named elements of a list:
tmp <- list(sample1=1:3,sample2=4:6,sample3=7:9)
combinations(3,2,names(tmp))
Or, if those objects are all in an environment:
tmp <- new.env()
tmp$sample1 <- 1:3
tmp$sample2 <- 4:6
tmp$sample3 <- 7:9
combinations(3,2,objects(tmp))

How about this? I use simplified data as an illustrative example.
Edit
Thanks to @GSee for recommending two improvements in this approach [see comment].
This is not something I'd be keen to do, but we use ls with its pattern argument to return the names of all objects in your global environment that fit the pattern, i.e. all objects whose names start with "sample" followed by digits - so be careful - and then stick them in a list using mget.
We then get the combinations of list elements using combn and use an anonymous function to combine all elements of list pairs using expand.grid. If you want this as a two column data.frame you can use do.call and rbind the returned list together:
# Toy samples standing in for the real data.
sample1 <- 1:2
sample2 <- 3:4
sample3 <- 5:6
# Collect every object whose name is "sample" followed by digits into a named
# list. `envir` is spelled out in full instead of relying on partial argument
# matching of `env`.
args <- mget(ls(pattern = "^sample\\d+"), envir = .GlobalEnv)
# For each pair of list elements, build every cross-pairing of their values.
res <- combn(
  length(args), 2,
  FUN = function(x) expand.grid(args[[x[1]]], args[[x[2]]]),
  simplify = FALSE
)
# Stack the per-pair data frames into a single two-column data frame.
do.call(rbind, res)
Var1 Var2
1 1 3
2 2 3
3 1 4
4 2 4
5 1 5
6 2 5
7 1 6
8 2 6
9 3 5
10 4 5
11 3 6
12 4 6

Here is an approach
# put samples in separate structure, for instance a list
samples <- list(s1=rnorm(20), s2=rnorm(30), s3=rnorm(40))
cmb <- t(combn(names(samples),m=2))
apply(cmb,1,FUN=function(x) list(samples[[x[[1]]]], samples[[x[[2]]]]))

Related

Extract and organize data from subsetted lists on R

I spent the last days trying to solve it by myself using several different sources of information, including other questions here on Stackoverflow, but failed. I'm a complete beginner, so that’s probably why I’m struggling so much with this.
I created the dummy data below to illustrate what my original data look like.
list1<-list(path = ".../folder1/folder2/Country_State_Species_Individual1.png",
matrix1 = cbind(1:3, 1:9),
matrix2 = cbind(1:3, 1:9),
matrix3 = cbind(1:3, 1:9))
list2<-list(path = ".../folder1/folder2/Country_State_Species_Individual2.png",
matrix1 = cbind(1:3, 1:9),
matrix2 = cbind(1:3, 1:9),
matrix3 = cbind(1:3, 1:9))
list3<-list(path = ".../folder1/folder2/Country_State_Species_Individual3.png",
matrix1 = cbind(1:3, 1:9),
matrix2 = cbind(1:3, 1:9),
matrix3 = cbind(1:3, 1:9))
general_list <- list(list1, list2, list3)
As you can see, it is a big list (general_list) composed of small lists (list1, list2, list3) that are identical in structure.
My initial goal can be described in two steps:
1 – Sample 6 random rows from each matrix2 and save each of these outputs in a new object.
2 – Rename these objects using the information contained in the original file name stored in the path
I want to rename the extracted matrices this way because I need to be able to sort the matrices by the variables expressed in the file names (Country, State and especially Individuals). But maybe there is a more efficient/practical way to do this.
Would the most advisable way to store these new objects be in a new list?
I would also be happy to receive any suggestions on how to achieve my initial goal and how to proceed in order to optimize the storage of these new objects (having in mind that they will be used in some analysis after everything is done).
Best regards!
We loop over the 'general_list', extract the matrix2, then sample 6 rows from the dataset, create a new list ('out') and rename the list with the basename of the 'path' element
# For each sub-list, draw 6 rows of its matrix2 at random (without replacement).
out <- lapply(general_list, function(x) {
  x1 <- x$matrix2
  x1[sample(nrow(x1), 6, replace = FALSE), ]
})
# Name each result after its source file: the path minus directory and
# extension. vapply() enforces exactly one string per element, unlike sapply(),
# whose return type depends on its input.
names(out) <- vapply(
  general_list,
  function(x) tools::file_path_sans_ext(basename(x$path)),
  character(1)
)
out
#$Country_State_Species_Individual1
# [,1] [,2]
#[1,] 3 9
#[2,] 2 2
#[3,] 1 7
#[4,] 1 4
#[5,] 3 6
#[6,] 2 8
#$Country_State_Species_Individual2
# [,1] [,2]
#[1,] 3 3
#[2,] 1 7
#[3,] 3 9
#[4,] 2 2
#[5,] 3 6
#[6,] 1 1
#$Country_State_Species_Individual3
# [,1] [,2]
#[1,] 3 3
#[2,] 2 2
#[3,] 1 4
#[4,] 2 5
#[5,] 1 7
#[6,] 3 6
Or using tidyverse
library(dplyr)
library(purrr)
out <- map(general_list, ~ .x %>%
pluck('matrix2') %>%
as.data.frame %>%
sample_n(6) %>%
as.matrix)
names(out) <- map_chr(general_list, ~
tools::file_path_sans_ext(basename(.x$path)))

List of lists to matrix

I have a list of lists and I want to convert it into a matrix such that each column = one sublist.
Mock example
list1 <- list(1, 2)
list2 <- list(1, 2, 3)
list3 <- list(1, 2, 3, 4)
list_lists <- list (list1, list2, list3)
I'm first equalizing the lengths of all the sublists (padding with NULLs if needed) so that all sublists have the length of the longest one. That is to avoid having R recycle data to fill in the rows in the final matrix (feel free to tell me if I can skip this step somehow).
max_length <- max(unlist(lapply (list_lists, FUN = length)))
list_lists <- lapply (list_lists, function (x) {length (x) <- max_length; return (x)})
My best attempt so far
mat <- lapply (list_lists, cbind)
mat does look superficially like what I want but it is actually not. It is not a matrix (and attempts to convert it into one using as.matrix are unsuccessful) and I cannot refer to columns/rows like I would do with a matrix.
I am expecting
[,1] [,2] [,3]
[1,] 1 1 1
[2,] 2 2 2
[3,] NULL 3 3
[4,] NULL NULL 4
What is weird to me is that
mat <- cbind (list_lists[[1]], list_lists[[2]], list_lists[[3]])
seems to work. I would bet these two lines are the same, how can they be different?
They are different: lapply returns a list. See the excerpt from the documentation below.
Use do.call instead of mat <- lapply (list_lists, cbind), as follows:
mat <- do.call("cbind",list_lists)
Here do.call is the same as calling cbind (list_lists[[1]], list_lists[[2]], list_lists[[3]]): it passes each sublist to cbind as a separate argument, so each sublist becomes one column of the result.
> do.call("cbind",list_lists)
[,1] [,2] [,3]
[1,] 1 1 1
[2,] 2 2 2
[3,] NULL 3 3
[4,] NULL NULL 4
>
Understanding do.call:
From documentation:
do.call constructs and executes a function call from a name or a
function and a list of arguments to be passed to it.
lapply returns a list of the same length as X, each element of which
is the result of applying FUN to the corresponding element of X.
Search on r console for ?do.call and ?lapply
You can also read: do.call and lapply
Use sapply instead of lapply like this:
list_lists <- sapply (list_lists, function (x) {length (x) <- max_length; return (x)})
This should give you the matrix that you wanted. It seems that sapply will recursively unlist each list in list_lists, then apply the function that you specified and wrap all the outputs into a matrix, effectively bypassing the other line that you specified above.
The stri_list2matrix function should be able to handle this:
library(stringi)
stri_list2matrix(list_lists)
## [,1] [,2] [,3]
## [1,] "1" "1" "1"
## [2,] "2" "2" "2"
## [3,] NA "3" "3"
## [4,] NA NA "4"
Another option is to use your "max_length" to create the matrix:
ml <- max(lengths(list_lists))
do.call(cbind, lapply(list_lists, function(x) `length<-`(unlist(x), ml)))
## [,1] [,2] [,3]
## [1,] 1 1 1
## [2,] 2 2 2
## [3,] NA 3 3
## [4,] NA NA 4
A third option is to use melt from "reshape2":
library(reshape2)
dcast(melt(list_lists), L2 ~ L1)
## L2 1 2 3
## 1 1 1 1 1
## 2 2 2 2 2
## 3 3 NA 3 3
## 4 4 NA NA 4

Split data frames into K fold after randomizing in R

I have a data frame with 6 rows. I want to split it into 5 folds, so ultimately there would be 4 data frames with 1 element each and the last data frame should have 2 elements. I have tried the following code, but it does not help. I am new to R. Any help is appreciated.
a = matrix(1:12,6,2)
d <- split(a,rep(1:6,each=4))
Warning message:
In split.default(a, rep(1:6, each = 4)) :
data length is not a multiple of split variable
split expects a vector of groups as its second argument. In your case:
ngroups <- 5
floor(seq(1, ngroups, length.out = nrow(a)))
Also, split doesn't work that well with matrices, so first convert to a data.frame:
split(as.data.frame(a), floor(seq(1, ngroups, length.out = nrow(a))))
Edit: Following a suggestion from @IShouldBuyABoat, the following also works for matrix objects:
# split.data.frame() subsets by row, so it can be applied to a matrix directly.
split.data.frame(a, floor(seq(1, ngroups, length.out = nrow(a))))
Not sure about the "1 element each" aspect which seems to be problematic with R's version of matrix objects, but here is a way to split into the elements of a 12 element matrix that satisfy the requirements:
split( matrix(1:12,ncol=2), findInterval(1:6, c(sort(sample(1:6,5)),Inf)))
$`1`
[1] 1 7
$`2`
[1] 2 3 8 9
$`3`
[1] 4 10
$`4`
[1] 5 11
$`5`
[1] 6 12
If you wanted to form them back into two-column matrices:
lapply( split( matrix(1:12,ncol=2), findInterval(1:6, c(sort(sample(1:6,5)),Inf))) ,
matrix, ncol=2)
$`1`
[,1] [,2]
[1,] 1 7
$`2`
[,1] [,2]
[1,] 2 8
$`3`
[,1] [,2]
[1,] 3 9
$`4`
[,1] [,2]
[1,] 4 10
[2,] 5 11
$`5`
[,1] [,2]
[1,] 6 12
I solved a similar problem using the modulo operator on the 1:6 sequence. For your example, try this:
a = matrix(1:12, 6, 2)
d = split(as.data.frame(a), 1:6%%5)
Simple, and it gets the job done.
For splitting into K folds, you might find using the following useful:
# Assign each row of `a` to one of `nfolds` folds, then hold one fold out.
nfolds <- 5
a <- matrix(1:12, 6, 2)
# Fold labels run from 0 to nfolds - 1 and cycle over the rows; use
# sample(seq_len(nrow(a)) %% nfolds) instead if you want to randomize.
folds <- seq_len(nrow(a)) %% nfolds
fold <- 1 # whichever fold you want to test with
# drop = FALSE keeps both pieces as matrices even when a fold holds one row.
train <- a[folds != fold, , drop = FALSE]
test <- a[folds == fold, , drop = FALSE]

Create a list of matrices with a loop and merging them in R

I have several matrices; let's keep it simple and say I have 3 matrices. I want to create a list of them and then use rbind to put one over the other.
If I do it by hand, using the following code, it works:
list<-list(matrix1,matrix2,matrix3)
test<-do.call("rbind",list)
and I get a matrix of 97947 rows by 4 columns which is what I want.
but if I do a loop, it does not work:
list2<-list()
for (i in 1:3)
{
y<-paste0("matrix",x)
list2[[x]] <- y
}
test2<-do.call("rbind",list2)
And I get a 3x1 character matrix ???
Can someone please point me to the error?
Any comments would be greatly appreciated.
Thank you!!!!
Consider using a function like mget to get all of your matrix objects from the global environment (the default environment when called at top level) and put them in a list. You can then use your do.call method and avoid the loop. Here is a toy example:
# Some data
m1 <- matrix(1:4, 2, byrow = TRUE)
m2 <- matrix(1:4, 2, byrow = TRUE)
m3 <- matrix(1:4, 2, byrow = TRUE)
# Use mget to put them in a named list. By default mget looks the names in its
# first argument up in the environment it is called from (the global
# environment when run at top level). The result is called `matrices` rather
# than `list` so that it does not mask base::list().
matrices <- mget(paste0("m", 1:3))
matrices
#$m1
# [,1] [,2]
#[1,] 1 2
#[2,] 3 4
#$m2
# [,1] [,2]
#[1,] 1 2
#[2,] 3 4
#$m3
# [,1] [,2]
#[1,] 1 2
#[2,] 3 4
# rbind them
do.call(rbind, matrices)
# [,1] [,2]
#[1,] 1 2
#[2,] 3 4
#[3,] 1 2
#[4,] 3 4
#[5,] 1 2
#[6,] 3 4

Named rows and cols for matrices in R

Is it possible to have named rows and columns in Matrices?
for example:
[,a] [,b]
[a,] 1 , 2
[b,] 3 , 4
Is it even reasonable to have such a thing for exploring the data?
Sure. Use dimnames:
> a <- matrix(1:4, nrow = 2)
> a
[,1] [,2]
[1,] 1 3
[2,] 2 4
> dimnames(a) <- list(c("A", "B"), c("AA", "BB"))
> a
AA BB
A 1 3
B 2 4
With dimnames, you can provide a list of (first) rownames and (second) colnames for your matrix. Alternatively, you can specify rownames(x) <- whatever and colnames(x) <- whatever.

Resources