I would like to transpose a nested list. Assume the following nested list x is given:
a <- list(c("a","b","c","d"))
b <- list(c("d","c","b","a"))
c <- list(c("4","3","2","1"))
d <- list(c("1","2","3","4"))
x <- list(a,b,c,d)
The outcome should be a nested list in which the first "column" of the original list x becomes the first nested list element, i.e. "a","d","4","1"; the second column becomes the second element, i.e. "b","c","3","2"; and so on. In the end the structure is essentially a transpose of the original one. How can this be done in R?
We could also do this without lapply (used in the answer further down), using matrix:
relist(matrix(unlist(x), ncol = 4, byrow = T), skeleton = x)
Benchmarking
library(microbenchmark)
a <- list(c("a","b","c","d"))
b <- list(c("d","c","b","a"))
c <- list(c("4","3","2","1"))
d <- list(c("1","2","3","4"))
x <- list(a,b,c,d)
f_akrun <- function(x) {
  m1 <- do.call(rbind, lapply(x, function(y) do.call(rbind, y)))
  relist(m1, skeleton = x)
}
f_m0h3n <- function(x) {
  relist(matrix(unlist(x), ncol = length(x[[1]][[1]]), byrow = TRUE), skeleton = x)
}
setequal(f_akrun(x), f_m0h3n(x))
# [1] TRUE
microbenchmark(f_akrun(x), f_m0h3n(x))
# Unit: microseconds
# expr min lq mean median uq max neval
# f_akrun(x) 135.591 137.301 144.3545 138.585 148.422 334.484 100
# f_m0h3n(x) 110.782 111.638 116.5477 112.493 117.412 212.153 100
We can try
m1 <- do.call(rbind, lapply(x, function(y) do.call(rbind, y)))
relist(m1, skeleton = x)
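A quick sanity check on the example input: the first element of the result should now hold the first "column" of x:
relist(m1, skeleton = x)[[1]][[1]]
# [1] "a" "d" "4" "1"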
Related
Say you have some participants and controls in a given experiment that are evaluated on three characteristics, something like this:
part_A <- c(3, 5, 4)
part_B <- c(12, 15, 18)
part_C <- c(50, 40, 45)
ctrl_1 <- c(4, 5, 5)
ctrl_2 <- c(1, 0, 4)
ctrl_3 <- c(13, 16, 17)
ctrl_4 <- c(28, 30, 35)
ctrl_5 <- c(51, 43, 44)
I want to find for each participant which control case is the closest match.
If I used the dist() function I could get it, but it would also spend a lot of time calculating the distances between controls, which are useless to me (and in the real data there are 1000 times more control cases than participant cases).
Is there a way to ask only for the distances between each of these elements and each of those elements? And something that works for very large data sets?
In the example above, the result I want is:
Participant Closest_Ctrl
1 part_A ctrl_1
2 part_B ctrl_3
3 part_C ctrl_5
Here is a solution that should be sufficiently fast for a not-too-big number of participants:
ctrl <- do.call(cbind, mget(ls(pattern = "ctrl_\\d+")))
dat <- mget(ls(pattern = "part_[[:upper:]]"))
res <- vapply(dat, function(x) colnames(ctrl)[which.min(sqrt(colSums((x - ctrl)^2)))],
              FUN.VALUE = character(1))
stack(res)
# values ind
#1 ctrl_1 part_A
#2 ctrl_3 part_B
#3 ctrl_5 part_C
If this is too slow I would quickly code it in Rcpp.
Convert input to data frames
parts <- do.call(data.frame, mget(ls(pattern = "part_[A-C]")))
ctrl <- do.call(data.frame, mget(ls(pattern = "ctrl_[1-5]")))
Generate output
# calculate distances
dists <- outer(parts, ctrl, Vectorize(function(x, y) sqrt(sum((x - y)^2))))
# generate output by calculating column with min value (max negative value)
data.frame(Participant = names(parts),
Closest_Ctrl = names(ctrl)[max.col(-dists)])
# Participant Closest_Ctrl
# 1 part_A ctrl_1
# 2 part_B ctrl_3
# 3 part_C ctrl_5
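To see why max.col(-dists) picks the closest control: max.col() returns, for each row, the index of the column holding the largest value, so negating the distances turns the row-wise minimum into a maximum. A tiny made-up example:
d <- rbind(c(2, 1, 5), c(4, 3, 0.5))
max.col(-d)
# [1] 2 3  (column 2 is nearest for row 1, column 3 for row 2)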
Benchmark
parts <- do.call(data.frame, mget(ls(pattern = "part_[A-C]")))
ctrl <- do.call(data.frame, mget(ls(pattern = "ctrl_[1-5]")))
parts <- do.call(cbind, replicate(100, parts, simplify = F))
ctrl <- do.call(cbind, replicate(100, ctrl, simplify = F))
f1 <- function(x){
  dists <- outer(parts, ctrl, Vectorize(function(x, y) sqrt(sum((x - y)^2))))
  # generate output by calculating column with min value (max negative value)
  data.frame(Participant = names(parts),
             Closest_Ctrl = names(ctrl)[max.col(-dists)])
}
f2 <- function(x){
  res <- vapply(parts, function(x) colnames(ctrl)[which.min(sqrt(colSums((x - ctrl)^2)))],
                FUN.VALUE = character(1))
  stack(res)
}
library(magrittr)  # for %>%
r1 <- f1()
r2 <- f2()
all.equal(r1 %>% lapply(as.factor) %>% setNames(1:2),
          r2[2:1] %>% lapply(as.factor) %>% setNames(1:2))
# [1] TRUE
microbenchmark::microbenchmark(f1(), f2(), times = 5)
# Unit: milliseconds
# expr min lq mean median uq max neval
# f1() 305.7324 314.8356 435.3961 324.6116 461.4788 770.3221 5
# f2() 12359.6995 12831.7995 13567.8296 13616.5216 14244.0836 14787.0438 5
Benchmark 2
parts <- do.call(data.frame, mget(ls(pattern = "part_[A-C]")))
ctrl <- do.call(data.frame, mget(ls(pattern = "ctrl_[1-5]")))
parts <- do.call(cbind, replicate(10, parts, simplify = F))
ctrl <- do.call(cbind, replicate(10*1000, ctrl, simplify = F))
f1 <- function(x){
  dists <- outer(parts, ctrl, Vectorize(function(x, y) sqrt(sum((x - y)^2))))
  # generate output by calculating column with min value (max negative value)
  data.frame(Participant = names(parts),
             Closest_Ctrl = names(ctrl)[max.col(-dists)])
}
f2 <- function(x){
  res <- vapply(parts, function(x) colnames(ctrl)[which.min(sqrt(colSums((x - ctrl)^2)))],
                FUN.VALUE = character(1))
  stack(res)
}
r1 <- f1()
r2 <- f2()
all.equal(r1 %>% lapply(as.factor) %>% setNames(1:2),
          r2[2:1] %>% lapply(as.factor) %>% setNames(1:2))
# [1] TRUE
microbenchmark::microbenchmark(f1(), f2(), times = 5)
# Unit: seconds
# expr min lq mean median uq max neval
# f1() 3.450176 4.211997 4.493805 4.339818 5.154191 5.312844 5
# f2() 119.120484 124.280423 132.637003 130.858727 131.148630 157.776749 5
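For really large control sets it may pay to avoid outer()'s per-pair function calls altogether and use the identity ||p - c||^2 = ||p||^2 + ||c||^2 - 2*p.c. This is only a sketch (f3 is a made-up name; it assumes parts and ctrl are numeric with one case per column):
f3 <- function() {
  P <- as.matrix(parts); C <- as.matrix(ctrl)
  # squared distance between every participant column and every control column
  D2 <- outer(colSums(P^2), colSums(C^2), "+") - 2 * crossprod(P, C)
  data.frame(Participant = colnames(P),
             Closest_Ctrl = colnames(C)[max.col(-D2)])
}
# f3() should agree with f1() up to ties and floating-point fuzz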
I have a symmetric matrix Siginv and a list Z containing N matrices of size TxK.
I have three approaches to compute what I need (see below). The first one literally takes the formula, and is slow, as expected. Are there faster ways of doing what I do?
library(microbenchmark)
K <- 2
N <- 50
Siginv <- matrix(rnorm(N^2), ncol=N)
Siginv[lower.tri(Siginv)] <- t(Siginv)[lower.tri(Siginv)] # just some symmetric matrix
Tdim <- 400
Z <- replicate(N, matrix(rnorm(Tdim*K), ncol=K), simplify = F)
microbenchmark({
I <- diag(Tdim)
Z.m <- do.call(rbind, Z)
meat.mat.GLS.kp <- t(Z.m)%*%(Siginv%x%I)%*%Z.m
}, {
combs <- expand.grid(1:N, 1:N)
cprods.GLS <- mapply(function(i,j) Siginv[j,i]*t(Z[[i]])%*%Z[[j]], combs[,1], combs[,2], SIMPLIFY = F)
meat.mat.GLS <- Reduce("+", cprods.GLS)
}, {
combs <- expand.grid(1:N, 1:N)
cprods.GLS2 <- mapply(function(i,j) Siginv[j,i]*crossprod(Z[[i]],Z[[j]]), combs[,1], combs[,2], SIMPLIFY = F)
meat.mat.GLS2 <- Reduce("+", cprods.GLS2)
}, times=5)
all.equal(meat.mat.GLS.kp, meat.mat.GLS)  # TRUE
all.equal(meat.mat.GLS, meat.mat.GLS2)    # TRUE
Currently, the approaches compare as follows (rows in the order of the three expressions above):
min lq mean median uq max neval cld
4499.66564 4911.42674 4874.35170 4958.81553 4977.55873 5024.29187 5 b
23.03861 23.09293 23.82407 23.29574 24.04696 25.64611 5 a
12.92261 13.08275 13.54088 13.15898 13.80212 14.73794 5 a
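One further direction worth trying (a sketch, not benchmarked here): since Siginv is symmetric, crossprod(Z[[j]], Z[[i]]) is just the transpose of crossprod(Z[[i]], Z[[j]]), so only the pairs with i <= j need to be computed explicitly:
combs <- expand.grid(1:N, 1:N)
combs <- combs[combs[, 1] <= combs[, 2], ]   # keep i <= j only
cprods.half <- mapply(function(i, j) {
  m <- Siginv[j, i] * crossprod(Z[[i]], Z[[j]])
  if (i == j) m else m + t(m)                # add the mirrored (j, i) term
}, combs[, 1], combs[, 2], SIMPLIFY = FALSE)
meat.mat.GLS3 <- Reduce("+", cprods.half)
all.equal(meat.mat.GLS2, meat.mat.GLS3)      # should be TRUE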
If I have:
mylist <- lapply(1:10, function(x) matrix(NA, nrow=2, ncol=2))
And I want to replace, for example, the first, second and fifth elements in the list with this matrix:
mymatrix=cbind(c(1,1),c(1,1))
What can I do? I tried with:
mylist[c(1,2,5)]=mymatrix
But that doesn't substitute the new matrix, because the selection is another list, and with [[ ]] I can only access one element at a time.
I think I have to use the lapply function, but I can't figure out how.
Would this work for you?
mylist[c(1, 2, 5)] <- lapply(mylist[c(1, 2, 5)], function(x) x <- mymatrix)
Similar to #jaSf but faster and "cleaner":
idx <- c(1, 2, 5)
mylist[idx] <- list(mymatrix)
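Wrapping mymatrix in list() is what makes this work: a length-one list on the right-hand side gets recycled across every selected position. A quick check (using mymatrix from the question):
all(sapply(mylist[idx], identical, mymatrix))
# [1] TRUE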
microbenchmark:
Unit: nanoseconds
expr min lq mean median uq max neval cld
this 687 828 1135.152 959 1127 2787458 1e+05 a
jaSf 2982 3575 4482.867 4034 4535 2979424 1e+05 b
Otherwise, I would recommend using modifyList() to update named lists, like:
foo <- list(a = 1, b = list(c = "a", d = FALSE))
bar <- modifyList(foo, list(e = 2, b = list(d = TRUE)))
str(foo)
str(bar)
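For the example above, str(bar) should show (roughly) that a and b$c are kept, b$d is flipped to TRUE, and e = 2 is appended:
# List of 3
#  $ a: num 1
#  $ b:List of 2
#   ..$ c: chr "a"
#   ..$ d: logi TRUE
#  $ e: num 2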
Another option could be using just a for loop:
for(i in c(1,2,5)){
mylist[[i]] <- mymatrix
}
I have two needs, both connected to a dataset similar to the reproducible one below. I have a list of 18 entities, each composed of a list of 17-19 data.frames. A reproducible dataset follows (it contains matrices instead of data.frames, but I do not suppose that makes a difference):
test <- list(list(matrix(10:(50-1), ncol = 10), matrix(60:(100-1), ncol = 10), matrix(110:(150-1), ncol = 10)),
list(matrix(200:(500-1), ncol = 10), matrix(600:(1000-1), ncol = 10), matrix(1100:(1500-1), ncol = 10)))
First, I need to split each data.frame/matrix into two parts (by a given number of rows) and save them to a new list of lists.
Secondly, I need to extract and save a given column(s) out of every data.frame in a list of lists.
I have no idea how to go about doing this other than with for(), but I am sure it should be possible with the apply() family of functions.
Thank you for reading
EDIT:
My expected output would look as follows:
extractedColumns <- list(list(matrix(10:(50-1), ncol = 10)[, 2], matrix(60:(100-1), ncol = 10)[, 2], matrix(110:(150-1), ncol = 10)[, 2]),
list(matrix(200:(500-1), ncol = 10)[, 2], matrix(600:(1000-1), ncol = 10)[, 2], matrix(1100:(1500-1), ncol = 10)[, 2]))
numToSubset <- 3
subsetFrames <- list(list(list(matrix(10:(50-1), ncol = 10)["first length - numToSubset rows", ], matrix(10:(50-1), ncol = 10)["last numToSubset rows", ]),
list(matrix(60:(100-1), ncol = 10)["first length - numToSubset rows", ], matrix(60:(100-1), ncol = 10)["last numToSubset rows", ]),
list(matrix(110:(150-1), ncol = 10)["first length - numToSubset rows", ], matrix(110:(150-1), ncol = 10)["last numToSubset rows", ])),
etc...)
It gets to look very messy, hope you can follow what I want.
You can use two nested lapplys:
lapply(test, function(x) lapply(x, '[', c(2, 3)))
Output:
[[1]]
[[1]][[1]]
[1] 11 12
[[1]][[2]]
[1] 61 62
[[1]][[3]]
[1] 111 112
[[2]]
[[2]][[1]]
[1] 201 202
[[2]][[2]]
[1] 601 602
[[2]][[3]]
[1] 1101 1102
Explanation
The first lapply is applied over the two lists in test. Each of those two lists contains another 3 matrices. The second lapply iterates over those 3 matrices and subsets them (that's the '[' function in the second lapply) with c(2, 3).
Note: In the case of a matrix [ will subset elements 2 and 3 but the same function will subset columns when used on a data.frame.
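A two-line illustration of that difference, using a small made-up matrix m:
m <- matrix(1:6, nrow = 2)
m[c(2, 3)]                 # elements 2 and 3 (column-major order): 2 3
as.data.frame(m)[c(2, 3)]  # columns V2 and V3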
Subsetting rows and columns
lapply is very flexible with the use of anonymous functions. By changing the code into:
#change rows and columns into what you need
lapply(test, function(x) lapply(x, function(y) y[rows, columns]))
You can specify any combination of rows or columns you want.
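The same pattern also covers the first need, splitting each matrix into two parts by rows. A sketch, assuming (as in the expected output above) that the second part keeps the last numToSubset rows:
numToSubset <- 3
subsetFrames <- lapply(test, function(x)
  lapply(x, function(y) {
    n <- nrow(y)
    list(y[seq_len(n - numToSubset), , drop = FALSE],  # all but the last numToSubset rows
         y[(n - numToSubset + 1):n, , drop = FALSE])   # the last numToSubset rows
  }))
str(subsetFrames[[1]][[1]])  # for the first matrix: a 1 x 10 and a 3 x 10 piece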
To piggyback on #LyzandeR's answer, consider the often-ignored sibling of the apply family, rapply, which can recursively run functions on nested lists of vectors/matrices and return the same nested structure. It is often comparable to nested lapply or its variants vapply/sapply:
newtest1 <- lapply(test, function(x) lapply(x, '[', c(2, 3)))
newtest2 <- rapply(test, function(x) `[`(x, c(2, 3)), classes="matrix", how="list")
all.equal(newtest1, newtest2)
# [1] TRUE
Interestingly, to my amazement, rapply runs slower in this use case compared to nested lapply! Hmmmm, back to the lab I go...
library(microbenchmark)
microbenchmark(newtest1 <- lapply(test, function(x) lapply(x, '[', c(2, 3))))
# Unit: microseconds
# mean median uq max neval
# 31.92804 31.278 32.241 74.587 100
microbenchmark(newtest2 <- rapply(test, function(x) `[`(x, c(2, 3)),
classes="matrix", how="list"))
# Unit: microseconds
# min lq mean median uq max neval
# 69.293 72.18 79.53353 73.143 74.5865 219.91 100
Even more interesting: after swapping the [ operator for the equivalent matrix bracketing, nested lapply runs even better and rapply even worse!
microbenchmark(newtest3 <- lapply(test, function(x)
lapply(x, function(y) y[c(2, 3), 1])))
# Unit: microseconds
# min lq mean median uq max neval
# 26.947 28.391 32.00987 29.354 30.798 100.09 100
all.equal(newtest1, newtest3)
# [1] TRUE
microbenchmark(newtest4 <- rapply(test, function(x) x[c(2,3), 1],
classes="matrix", how="list"))
# Unit: microseconds
# min lq mean median uq max neval
# 74.105 76.752 80.37076 77.955 78.918 203.549 100
all.equal(newtest2, newtest4)
# [1] TRUE
I'm trying to paste all possible characters that are arranged in any diagonal within an N * N matrix.
For example, consider the following 3 X 3 matrix:
#Create matrix, convert to character dataframe
matrix <- matrix(data=c('s','t','y','a','e','l','f','n','e'),nrow=3,ncol=3)
matrix <- as.data.frame(matrix)
for(i in 1:length(colnames(matrix))){
matrix[,i] <- as.character(matrix[,i])
}
In the matrix above I need to paste the diagonals: "see","fey", "ees", and "yef". I can find these in the dataframe with the following code:
diag <- paste(matrix[1,1],matrix[2,2],matrix[3,3],sep='')
diag1 <- paste(matrix[1,3],matrix[2,2],matrix[3,1],sep='')
diag2 <- paste(matrix[3,1],matrix[2,2],matrix[1,3],sep='')
diag3 <- paste(matrix[3,3],matrix[2,2],matrix[1,1],sep='')
The problem is that I want to automate this so that it will work on any N x N matrix. (I'm writing a function to find the diagonals in any N X N matrix). Is there an efficient way to do this?
Oh, that's easy if you use matrix instead of data.frame :)
We can choose matrix elements just like we can take vector elements:
matrix[1:3] # First three elements == first column
n <- ncol(matrix)
(1:n-1)*n+1:n
## [1] 1 5 9
(1:n-1)*n+n:1
## [1] 3 5 7
So now we can use this:
matrix[(1:n-1)*n+1:n]
[1] "s" "e" "e"
paste0(matrix[(1:n-1)*n+1:n],collapse="")
[1] "see"
And if you want it backwards, just reverse the vector of indexes using rev function:
paste0(matrix[rev((1:n-1)*n+1:n)],collapse="")
[1] "ees"
Some benchmarks:
rotate <- function(x) t(apply(x, 2, rev))
revMat <- function(mat, dir=0){
x <- if(bitwAnd(dir,1)) rev(seq(nrow(mat))) else seq(nrow(mat))
y <- if(bitwAnd(dir,2)) rev(seq(ncol(mat))) else seq(ncol(mat))
mat[x,y]
}
bartek <- function(matrix){
n <- ncol(matrix)
c(paste0(matrix[(1:n-1)*n+1:n],collapse=""), paste0(matrix[rev((1:n-1)*n+1:n)],collapse=""),
paste0(matrix[(1:n-1)*n+n:1],collapse=""), paste0(matrix[rev((1:n-1)*n+n:1)],collapse=""))
}
Joe <- function(matrix){
diag0 <- diag(matrix)
diag1 <- diag(rotate(matrix))
diag2 <- rev(diag0)
diag3 <- rev(diag1)
c(paste(diag0, collapse = ""),paste(diag1, collapse = ""),
paste(diag2, collapse = ""),paste(diag3, collapse = ""))
}
James <- function(mat){
sapply(0:3,function(x) paste(diag(revMat(mat,x)),collapse=""))
}
matrix <- matrix(c('s','t','y','a','e','l','f','n','e'), ncol = 3)
microbenchmark(bartek(matrix), Joe(matrix), James(matrix))
Unit: microseconds
expr min lq mean median uq max neval
bartek(matrix) 50.273 55.2595 60.78952 59.4390 62.438 134.880 100
Joe(matrix) 167.431 176.6170 188.46908 182.8260 192.646 337.717 100
James(matrix) 321.313 334.3350 346.15230 339.7235 348.565 447.115 100
matrix <- matrix(1:10000, ncol=100)
microbenchmark(bartek(matrix), Joe(matrix), James(matrix))
Unit: microseconds
expr min lq mean median uq max neval
bartek(matrix) 314.385 326.752 336.1194 331.936 337.9805 423.323 100
Joe(matrix) 2168.141 2221.477 2460.1002 2257.439 2298.4400 8856.482 100
James(matrix) 1200.572 1250.354 1407.5943 1276.307 1323.8845 7419.931 100
For a matrix, this can be accomplished by taking the diag of the four possible rotations. If you set up a rotate function as follows, this becomes straightforward:
> rotate <- function(x) t(apply(x, 2, rev))
> diag0 <- paste(diag(matrix), collapse = "")
> diag1 <- paste(diag(rotate(matrix)), collapse = "")
> diag2 <- paste(diag(rotate(rotate(matrix))), collapse = "")
> diag3 <- paste(diag(rotate(rotate(rotate(matrix)))), collapse = "")
> diag0
[1] "see"
> diag1
[1] "yef"
> diag2
[1] "ees"
> diag3
[1] "fey"
As pointed out by Frank in comments, this could become slow for sufficiently large matrices (on my machine, rotate starts to take longer than about a second for matrices larger than 1000 x 1000). You can save some time by using rev prior to pasting, e.g.:
> diag0 <- diag(matrix)
> diag1 <- diag(rotate(matrix))
> diag2 <- rev(diag0)
> diag3 <- rev(diag1)
> paste(diag2, collapse = "")
[1] "ees"
> paste(diag3, collapse = "")
[1] "fey"
One way is to use diag on the matrix, called mat here to avoid clashing with the function name, and to reverse the row and/or column orders to get each diagonal and direction.
You can do it with a supplementary function to make the reversals systematic so you can use sapply to loop.
revMat <- function(mat, dir=0)
{
x <- if(bitwAnd(dir,1)) rev(seq(nrow(mat))) else seq(nrow(mat))
y <- if(bitwAnd(dir,2)) rev(seq(ncol(mat))) else seq(ncol(mat))
mat[x,y]
}
sapply(0:3,function(x) paste(diag(revMat(mat,x)),collapse=""))
[1] "see" "yef" "fey" "ees"
Convert matrix to an actual matrix m (as opposed to a data frame). Then the four diagonals are:
m <- as.matrix(matrix)
ix <- ncol(m):1
paste(diag(m), collapse = "")
paste(diag(m[ix,]), collapse = "")
paste(diag(m[,ix]), collapse = "")
paste(diag(m[ix, ix]), collapse = "")
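For the question's 3 x 3 example these should give "see", "yef", "fey" and "ees", respectively.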