Merge two vectors based on index in R

Merge two vectors based on index in R - r

I'm trying to merge two vectors of same length where NAs in vector "a" align with the numbers in vector "b" and vice versa:
a <- c(1, NA, 3, NA)
b <- c(NA, 2, NA, 4)
The output should be:
1, 2 ,3, 4
Thanks for the help!
edit: the solution I used was
a[is.na(a)] <- b[is.na(a)]

The values of a that correspond to is.na(a) should be replaced with the values of b that correspond to the negation of is.na(b). Here I define a new vector d so as to not over-write the original vectors a or b.
d <- a
d[is.na(d)] <- b[!is.na(b)]
d
# [1] 1 2 3 4
If you know the NA values begin in the second position, you could also alternate the assignment.
d <- a
d[c(FALSE, TRUE)] <- b[c(FALSE, TRUE)]
d
# [1] 1 2 3 4

Here are some more solutions that might have more "literal" resonance. They have equivalent outputs:
m <- mapply(c, na.omit(a), na.omit(b), SIMPLIFY= FALSE) ## or,
m <- Map(c, na.omit(a), na.omit(b))
output <- unlist(m) ## or,
output <- Reduce(c, m)
What this does it first concatenates pairs across na.omit(a) and na.omit(b), and then concatenates all those pairs together.
As far as performance goes, here is a quick benchmark:
library(microbenchmark)
gc()
a <- (1:1e4)[c(TRUE, NA)]
b <- (1:1e4)[c(NA, TRUE)]
microbenchmark(
unlist(mapply(c, na.omit(a), na.omit(b), SIMPLIFY= FALSE)),
unlist(Map(c, na.omit(a), na.omit(b))),
Reduce(c, mapply(c, na.omit(a), na.omit(b), SIMPLIFY= FALSE)),
Reduce(c, Map(c, na.omit(a), na.omit(b))),
times = 100
)
# Unit: milliseconds
# expr min lq
# unlist(mapply(c, na.omit(a), na.omit(b), SIMPLIFY = FALSE)) 4.476689 5.103025
# unlist(Map(c, na.omit(a), na.omit(b))) 4.475753 4.902474
# Reduce(c, mapply(c, na.omit(a), na.omit(b), SIMPLIFY = FALSE)) 75.974627 82.953051
# Reduce(c, Map(c, na.omit(a), na.omit(b))) 75.919419 82.626217
# median uq max neval
# 5.488113 5.723023 10.59291 100
# 5.422528 5.784764 13.04502 100
# 86.082578 89.652660 114.94584 100
# 85.761412 89.550317 158.90629 100
Unsurprisingly, Reduce is much slower than unlist. Map is only slightly slower than mapply. However Reduce is much, much more generally applicable, whereas unlist can really only handle this special case.

Related

Count number of 1's from right to left, stopping at the first 0

I want to count the number of 1's that occur from RIGHT to LEFT across multiple columns, which stops when encountering the first 0.
Example DF:
df<-data.frame(replicate(7,sample(0:1,30,rep=T)))
colnames(df)<-seq(1950,2010,10)
I've manually entered the desired result here under a new column "condition" as an example:
Thanks in advance for your help,
Cai

Here's a fully vectorized attempt
indx <- rowSums(df) == ncol(df) # Per Jaaps comment
df$condition <- ncol(df) - max.col(-df, ties = "last")
df$condition[indx] <- ncol(df) - 1
This is basically finds the first zero from the right and counts how many columns were before that (which are basically the 1s in a binary data)
EDIT
Had to add handling for the special case when all the rows are ones

df$condition <- apply(df, 1, function(x) {
y <- rev(x)
sum(cumprod(y))
})

[Edit: now works]
Try this
df$condition <- apply(df,1,function(x){x<- rev(x);m <- match(0,x)[1]; if (is.na(m)) sum(x) else sum(x[1:m])})
we're matching the first 0, then summing up until this element.
If there's no zero we sum the full row
Here's a benchmark of all solutions :
library(stringr)
microbenchmark(
Moody_Mudskipper = apply(df,1,function(x){x<- rev(x);m <- match(0,x)[1]; if (is.na(m)) sum(x) else sum(x[1:m])}),
akrun = apply(df, 1, function(x) {x1 <- rle(x)
x2 <- tail(x1$lengths, 1)[tail(x1$values, 1)==1]
if(length(x2)==0) 0 else x2}),
akrun2 = str_count(do.call(paste0, df), "[1]+$"),
roland = apply(df, 1, function(x) {y <- rev(x);sum(y * cumprod(y != 0L))}),
David_Arenburg = ncol(df) - max.col(-df, ties = "last"),
times = 10)
# Unit: microseconds
# expr min lq mean median uq max neval
# Moody_Mudskipper 1437.948 1480.417 1677.1929 1536.159 1597.209 3009.320 10
# akrun 6985.174 7121.078 7718.2696 7691.053 7856.862 9289.146 10
# akrun2 1101.731 1188.793 1290.8971 1226.486 1343.099 1790.091 10
# akrun3 693.315 791.703 830.3507 820.371 884.782 1030.240 10
# roland 1197.995 1270.901 1708.5143 1332.305 1727.802 4568.660 10
# David_Arenburg 2845.459 3060.638 3406.3747 3167.519 3495.950 5408.494 10
# David_Arenburg_corrected 3243.964 3341.644 3757.6330 3384.645 4195.635 4943.099 10
For a bigger example David's solution is indeed the fastest, as said in the chosen solution's comments:
df<-data.frame(replicate(7,sample(0:1,1000,rep=T)))
# Unit: milliseconds
# expr min lq mean median uq max neval
# Moody_Mudskipper 31.324456 32.155089 34.168533 32.827345 33.848560 44.952570 10
# akrun 225.592061 229.055097 238.307506 234.761584 241.266853 271.000470 10
# akrun2 28.779824 29.261499 33.316700 30.118144 38.026145 46.711869 10
# akrun3 14.184466 14.334879 15.528201 14.633227 17.237317 18.763742 10
# roland 27.946005 28.341680 29.328530 28.497224 29.760516 33.692485 10
# David_Arenburg 3.149823 3.282187 3.630118 3.455427 3.727762 5.240031 10
# David_Arenburg_corrected 3.464098 3.534527 4.103335 3.833937 4.187141 6.165159 10

We can loop through the rows, use rle
df$condition <- apply(df, 1, function(x) {x1 <- rle(x)
x2 <- tail(x1$lengths, 1)[tail(x1$values, 1)==1]
if(length(x2)==0) 0 else x2})
Or another option is str_extract
library(stringr)
v1 <- str_extract(do.call(paste0, df), "1+$")
d$condition <- ifelse(is.na(v1), 0, nchar(v1))
Or with a slightly more efficient stringi
library(stringi)
v1 <- stri_count(stri_extract(do.call(paste0, df), regex = "1+$"), regex = ".")
v1[is.na(v1)] <- 0
df$condition <- v1
Or with a more compact option
stri_count(do.call(paste0, df), regex = '(?=1+$)')

Apply a function by row on a dataframe independently of the number of columns

I'd like to apply a function by rows on a data.frame to concatenate column titles depending on the value in the row.
df
A B
1 TRUE TRUE
2 FALSE TRUE
3 FALSE FALSE
A B Result
1 TRUE TRUE A / B
2 FALSE TRUE B
3 FALSE FALSE NA
I read about dplyr using mutate() and rowwise(), but I don't know how to apply them since the columns aren't constants.
for a row "i" I would do something like:
paste(names(df)[as.logical(df[i,])], collapse = ' / ')
Any help would be welcome.
Thank you.

I would recommend against using apply on data.frames (due to matrix conversions) and especially with a margin of 1 (row operation are slow in R). Instead, you could pretty easily vectorize this over columns without matrix conversions too, here's an example
res <- rep(NA_character_, nrow(df))
for(j in names(df)) res[df[[j]]] <- paste(res[df[[j]]], j, sep = " / ")
sub("NA / ", "", res, fixed = TRUE)
# [1] "A / B" "B" NA
Below is a benchmark that shows about ~X16 improvement
set.seed(123)
N <- 1e5
df <- as.data.frame(matrix(sample(c(TRUE, FALSE), N*2, replace = TRUE), ncol = 2))
Rowwise <- function(df) apply(df, 1, FUN = function(x) paste(names(x)[x], collapse=" / "))
Colwise <- function(df) {
res <- rep(NA_character_, nrow(df));
for(j in names(df)) res[df[[j]]] <- paste(res[df[[j]]], j, sep = " / ");
sub("NA / ", "", res, fixed = TRUE)
}
microbenchmark::microbenchmark(Rowwise(df), Colwise(df))
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# Rowwise(df) 458.54526 502.43496 545.47028 548.42042 584.18000 669.6161 100 b
# Colwise(df) 27.11235 27.83873 34.65596 29.05341 32.83664 137.7905 100 a

If the dataset is not really big (i.e. in millions/billions of rows) we can use apply with MARGIN=1 to loop over the rows, subset the names of the vector using the logical vector as index and paste them together. It is easier to code in a single line.
df$Result <- apply(df, 1, FUN = function(x) paste(names(x)[x], collapse=" / "))
However, if we have a big dataset, another option is to create a key/value pair and replace the values by matching and it is faster than the above solution.
v1 <- do.call(paste, df)
unname(setNames(c("A / B", "B", "A", NA), do.call(paste,
expand.grid(rep(list(c(TRUE, FALSE)), 2))))[v1])
#[1] "A / B" "B" NA
Or we can use arithmetic operation to do this
c(NA, "A", "B", "A / B")[1 + df[,1] + 2 * df[,2]]
#[1] "A / B" "B" NA
Benchmarks
Using #DavidArenburg's dataset and including the two solutions posted here (changed the column names of 'df' to 'A' and 'B')
newPaste <- function(df) {
v1 <- do.call(paste, df)
unname(setNames(c("A / B", "B", "A", NA), do.call(paste,
expand.grid(rep(list(c(TRUE, FALSE)), 2))))[v1])
}
arith <- function(df){
c(NA, "A", "B", "A / B")[1 + df[,1] + 2 * df[,2]]
}
microbenchmark::microbenchmark(Rowwise(df), Colwise(df), newPaste(df),arith(df))
#Unit: milliseconds
# expr min lq mean median uq max neval
# Rowwise(df) 398.024791 453.68129 488.07312 481.051431 523.466771 688.36084 100
# Colwise(df) 25.361609 28.10300 34.20972 30.952365 35.885061 95.92575 100
# newPaste(df) 65.777304 69.07432 82.08602 71.606890 82.232980 176.66516 100
# arith(df) 1.790622 1.88339 4.74913 2.027674 4.753279 58.50942 100

Transpose a nested list

I would like to transpose a nested list. Assume the following nested list x is given:
a <- list(c("a","b","c","d"))
b <- list(c("d","c","b","a"))
c <- list(c("4","3","2","1"))
d <- list(c("1","2","3","4"))
x <- list(a,b,c,d)
The outcome should be a nested list where the first column of the original list x is the first nested list element, that is "a","d","4","1", the second column is the second nested list element, i.e. "b","c","3","2" and so on. In the end the structure is kind of a transpose of the original structure. How can this be done in R?

We could also do without lapply (using matrix):
relist(matrix(unlist(x), ncol = 4, byrow = T), skeleton = x)
Benchmarking
library(microbenchmark)
a <- list(c("a","b","c","d"))
b <- list(c("d","c","b","a"))
c <- list(c("4","3","2","1"))
d <- list(c("1","2","3","4"))
x <- list(a,b,c,d)
f_akrun <- function(x) {m1 <- do.call(rbind, lapply(x, function(y) do.call(rbind, y)));relist(m1, skeleton = x);}
f_m0h3n <- function(x) {relist(matrix(unlist(x), ncol = length(x[[1]][[1]]), byrow = T), skeleton = x)}
setequal(f_akrun(x), f_m0h3n(x))
# [1] TRUE
microbenchmark(f_akrun(x), f_m0h3n(x))
# Unit: microseconds
# expr min lq mean median uq max neval
# f_akrun(x) 135.591 137.301 144.3545 138.585 148.422 334.484 100
# f_m0h3n(x) 110.782 111.638 116.5477 112.493 117.412 212.153 100

We can try
m1 <- do.call(rbind, lapply(x, function(y) do.call(rbind, y)))
relist(m1, skeleton = x)

Paste all possible diagonals of an n*n matrix or dataframe

I'm trying to paste all possible characters that are arranged in any diagonal within an N * N matrix.
For example, consider the following 3 X 3 matrix:
#Create matrix, convert to character dataframe
matrix <- matrix(data=c('s','t','y','a','e','l','f','n','e'),nrow=3,ncol=3)
matrix <- as.data.frame(matrix)
for(i in 1:length(colnames(matrix))){
matrix[,i] <- as.character(matrix[,i])
}
In the matrix above I need to paste the diagonals: "see","fey", "ees", and "yef". I can find these in the dataframe with the following code:
diag <- paste(matrix[1,1],matrix[2,2],matrix[3,3],sep='')
diag1 <- paste(matrix[1,3],matrix[2,2],matrix[3,1],sep='')
diag2 <- paste(matrix[3,1],matrix[2,2],matrix[1,3],sep='')
diag3 <- paste(matrix[3,3],matrix[2,2],matrix[1,1],sep='')
The problem is that I want to automate this so that it will work on any N x N matrix. (I'm writing a function to find the diagonals in any N X N matrix). Is there an efficient way to do this?

Oh, that's easy if you use matrix instead of data.frame :)
We can choose matrix elements just like we can take vector elements:
matrix[1:3] # First three elements == first column
n <- ncol(matrix)
(1:n-1)*n+1:n
## [1] 1 5 9
(1:n-1)*n+n:1
## [1] 3 5 7
So now we can use this:
matrix[(1:n-1)*n+1:n]
[1] "s" "e" "e"
paste0(matrix[(1:n-1)*n+1:n],collapse="")
[1] "see"
And if you want it backwards, just reverse the vector of indexes using rev function:
paste0(matrix[rev((1:n-1)*n+1:n)],collapse="")
[1] "ees"
Some benchmarks:
rotate <- function(x) t(apply(x, 2, rev))
revMat <- function(mat, dir=0){
x <- if(bitwAnd(dir,1)) rev(seq(nrow(mat))) else seq(nrow(mat))
y <- if(bitwAnd(dir,2)) rev(seq(ncol(mat))) else seq(nrow(mat))
mat[x,y]
}
bartek <- function(matrix){
n <- ncol(matrix)
c(paste0(matrix[(1:n-1)*n+1:n],collapse=""), paste0(matrix[rev((1:n-1)*n+1:n)],collapse=""),
paste0(matrix[(1:n-1)*n+n:1],collapse=""), paste0(matrix[rev((1:n-1)*n+n:1)],collapse=""))
}
Joe <- function(matrix){
diag0 <- diag(matrix)
diag1 <- diag(rotate(matrix))
diag2 <- rev(diag0)
diag3 <- rev(diag1)
c(paste(diag0, collapse = ""),paste(diag1, collapse = ""),
paste(diag2, collapse = ""),paste(diag3, collapse = ""))
}
James <- function(mat){
sapply(0:3,function(x) paste(diag(revMat(mat,x)),collapse=""))
}
matrix <- matrix(c('s','t','y','a','e','l','f','n','e'), ncol = 3)
microbenchmark(bartek(matrix), Joe(matrix), James(matrix))
Unit: microseconds
expr min lq mean median uq max neval
bartek(matrix) 50.273 55.2595 60.78952 59.4390 62.438 134.880 100
Joe(matrix) 167.431 176.6170 188.46908 182.8260 192.646 337.717 100
James(matrix) 321.313 334.3350 346.15230 339.7235 348.565 447.115 100
matrix <- matrix(1:10000, ncol=100)
microbenchmark(bartek(matrix), Joe(matrix), James(matrix))
Unit: microseconds
expr min lq mean median uq max neval
bartek(matrix) 314.385 326.752 336.1194 331.936 337.9805 423.323 100
Joe(matrix) 2168.141 2221.477 2460.1002 2257.439 2298.4400 8856.482 100
James(matrix) 1200.572 1250.354 1407.5943 1276.307 1323.8845 7419.931 100

For a matrix, this can be accomplished by taking the diag of the four possible rotations. If you set up a rotate function as follows (credit), this becomes straightforward:
> rotate <- function(x) t(apply(x, 2, rev))
> diag0 <- paste(diag(matrix), collapse = "")
> diag1 <- paste(diag(rotate(matrix)), collapse = "")
> diag2 <- paste(diag(rotate(rotate(matrix))), collapse = "")
> diag3 <- paste(diag(rotate(rotate(rotate(matrix)))), collapse = "")
> diag0
[1] "see"
> diag1
[1] "yef"
> diag2
[1] "ees"
> diag3
[1] "fey"
As pointed out by Frank in comments, this could become slow for sufficiently large matrices (on my machine, rotate starts to take longer than about a second for matrices larger than 1000 X 1000). You can save some time by using rev prior to pasting, eg:
> diag0 <- diag(matrix)
> diag1 <- diag(rotate(matrix))
> diag2 <- rev(diag0)
> diag3 <- rev(diag1)
> paste(diag2, collapse = "")
[1] "ees"
> paste(diag3, collapse = "")
[1] "fey"

One way is to use diag on the matrix, called mat here to avoid clashing with the function name, and reversing the row and/or column orders for to get each diagonal and direction.
You can do it with a supplementary function to make the reversals systematic so you can use sapply to loop.
revMat <- function(mat, dir=0)
{
x <- if(bitwAnd(dir,1)) rev(seq(nrow(mat))) else seq(nrow(mat))
y <- if(bitwAnd(dir,2)) rev(seq(ncol(mat))) else seq(nrow(mat))
mat[x,y]
}
sapply(0:3,function(x) paste(diag(revMat(mat,x)),collapse=""))
[1] "see" "yef" "fey" "ees"

Convert matrix to an actual matrix m (as opposed to a data frame). Then the four diagonals are:
m <- as.matrix(matrix)
ix <- ncol(m):1
paste(diag(m), collapse = "")
paste(diag(m[ix,]), collapse = "")
paste(diag(m[,ix]), collapse = "")
paste(diag(m[ix, ix]), collapse = "")

Revert list structure

GOAL
Given a list of lists my goal is to reverse its structure (R language).
So, I want to bring the elements of the nested lists to be elements of the tier one list.
Probably an example better specifies my purpose. Given:
z <- list(z1 = list(a = 1, b = 2, c = 3), z2 = list(b = 4, a = 1, c = 0))
I want an output equivalent to the subsequent R object:
o <- list(a = list(z1 = 1, z2 = 1), b = list(z1 = 2, z2 = 4), c = list(z1 = 3, z2 = 0))
SOLUTIONS
MY SOLUTION
I created my own solution, which I am attaching below, but let me know if there is some better.
revert_list_str_1 <- function(ls) {
res <- lapply(names(ls[[1]]), function(n, env) {
name <- paste(n, 'elements', sep = '_')
assign(name, vector('list', 0))
inner <- sapply(ls, function(x) {
assign(name, c(get(name), x[which(names(x) == n)]))
})
names(inner) <- names(ls)
inner
})
names(res) <- names(ls[[1]])
res
}
Executing str(revert_list_str_1(z)) I obtain the subsequent output, corresponding to what I wanted.
List of 3
$ a:List of 2
..$ z1: num 1
..$ z2: num 1
$ b:List of 2
..$ z1: num 2
..$ z2: num 4
$ c:List of 2
..$ z1: num 3
..$ z2: num 0
But as I said I'd like to investigate (and learn) the existence of a more elegant and dynamic solution.
In fact my solution works fully only if all the nested lists have the same names (also in different order). This because of names(ls[[1]]). I would also point out that it acts only on lists of 2 levels, like the one reported.
So, do you know other solutions that are more dynamic? Can rapply and/or Filter functions be useful for this task?
end edit 1.
ANALYSIS OF PROPOSED SOLUTIONS
I've done a little analysis of the proposes solutions, thans you all !.
The analysis consists of verifying the following points for all functions:
accepted classes (nested list elements)
type preserved also if there are elements with different types (if they are atomic)
object contained in the elements preserved (e.g. a matrix)
columns considered (for columns I mean the names of the nested lists)
not common columns ignored (the classification 'not' is understood positively in this case)
not common columns preserved
it works also when columns do not match (based only on the names of the first nested list)
In all this cases the classification 'yes' is understood positively execept for point 2.1.
This are all the functions I've considered (the comments relate to the analysis items mentioned above):
# yes 1.1
# yes 1.2
# yes 2.1, not 2.2, not 2.3
revert_list_str_1 <- function(ls) { # #leodido
# see above
}
# not 1.1
# not 1.2
# not 2.1, not 2.2, not 2.3
revert_list_str_2 <- function(ls) { # #mnel
# convert each component of list to a data.frame
# so rbind.data.frame so named elements are matched
x <- data.frame((do.call(rbind, lapply(ls, data.frame))))
# convert each column into an appropriately named list
o <- lapply(as.list(x), function(i, nam) as.list(`names<-`(i, nam)), nam = rownames(x))
o
}
# yes 1.1
# yes 1.2
# yes 2.1, not 2.2, yes 2.3
revert_list_str_3 <- function(ls) { # #mnel
# unique names
nn <- Reduce(unique, lapply(ls, names))
# convert from matrix to list `[` used to ensure correct ordering
as.list(data.frame(do.call(rbind,lapply(ls, `[`, nn))))
}
# yes 1.1
# yes 1.2
# yes 2.1, not 2.2, yes 2.3
revert_list_str_4 <- function(ls) { # #Josh O'Brien
# get sub-elements in same order
x <- lapply(ls, `[`, names(ls[[1]]))
# stack and reslice
apply(do.call(rbind, x), 2, as.list)
}
# not 1.1
# not 1.2
# not 2.1, not 2.2, not 2.3
revert_list_str_5 <- function(ls) { # #mnel
apply(data.frame((do.call(rbind, lapply(ls, data.frame)))), 2, as.list)
}
# not 1.1
# not 1.2
# not 2.1, yes 2.2, yes 2.3
revert_list_str_6 <- function(ls) { # #baptiste + #Josh O'Brien
b <- recast(z, L2 ~ L1)
apply(b, 1, as.list)
}
# yes 1.1
# yes 1.2
# not 2.1, yes 2.2, yes 2.3
revert_list_str_7 <- function(ll) { # #Josh O'Brien
nms <- unique(unlist(lapply(ll, function(X) names(X))))
ll <- lapply(ll, function(X) setNames(X[nms], nms))
ll <- apply(do.call(rbind, ll), 2, as.list)
lapply(ll, function(X) X[!sapply(X, is.null)])
}
CONSIDERATIONS
From this analysis emerges that:
functions revert_list_str_7 and revert_list_str_6 are the most flexible regarding the names of the nested list
functions revert_list_str_4, revert_list_str_3 followed by my own function are complete enough, good trade-offs.
the most complete in absolute function is revert_list_str_7.
BENCHMARKS
To complete the work I've done some little benchmarks (with microbenchmark R package) on this 4 functions (times = 1000 for each benchmark).
BENCHMARK 1
Input:
list(z1 = list(a = 1, b = 2, c = 3), z2 = list(a = 0, b = 3, d = 22, f = 9)).
Results:
Unit: microseconds
expr min lq median uq max
1 func_1 250.069 467.5645 503.6420 527.5615 2028.780
2 func_3 204.386 393.7340 414.5485 429.6010 3517.438
3 func_4 89.922 173.7030 189.0545 194.8590 1669.178
4 func_6 11295.463 20985.7525 21433.8680 21934.5105 72476.316
5 func_7 348.585 387.0265 656.7270 691.2060 2393.988
Winner: revert_list_str_4.
BENCHMARK 2
Input:
list(z1 = list(a = 1, b = 2, c = 'ciao'), z2 = list(a = 0, b = 3, c = 5)).
revert_list_str_6 excluded because it does not support different type of nested child elements.
Results:
Unit: microseconds
expr min lq median uq max
1 func_1 249.558 483.2120 502.0915 550.7215 2096.978
2 func_3 210.899 387.6835 400.7055 447.3785 1980.912
3 func_4 92.420 170.9970 182.0335 192.8645 1857.582
4 func_7 257.772 469.9280 477.8795 487.3705 2035.101
Winner: revert_list_str_4.
BENCHMARK 3
Input:
list(z1 = list(a = 1, b = m, c = 'ciao'), z2 = list(a = 0, b = 3, c = m)).
m is a matrix 3x3 of integers and revert_list_str_6 has been excluded again.
Results:
Unit: microseconds
expr min lq median uq max
1 func_1 261.173 484.6345 503.4085 551.6600 2300.750
2 func_3 209.322 393.7235 406.6895 449.7870 2118.252
3 func_4 91.556 174.2685 184.5595 196.2155 1602.983
4 func_7 252.883 474.1735 482.0985 491.9485 2058.306
Winner: revert_list_str_4. Again!
end edit 2.
CONCLUSION
First of all: thanks to all, wonderful solutions.
In my opinion if you know in advance that you list will have nested list with the same names reverse_str_4 is the winner as best compromise between performances and support for different types.
The most complete solution is revert_list_str_7 although the full flexibility induces an average of about 2.5 times a worsening of performances compared to reverse_str_4 (useful if your nested list have different names).

Edit:
Here's a more flexible version that will work on lists whose elements don't necessarily contain the same set of sub-elements.
fun <- function(ll) {
nms <- unique(unlist(lapply(ll, function(X) names(X))))
ll <- lapply(ll, function(X) setNames(X[nms], nms))
ll <- apply(do.call(rbind, ll), 2, as.list)
lapply(ll, function(X) X[!sapply(X, is.null)])
}
## An example of an 'unbalanced' list
z <- list(z1 = list(a = 1, b = 2),
z2 = list(b = 4, a = 1, c = 0))
## Try it out
fun(z)
Original answer
z <- list(z1 = list(a = 1, b = 2, c = 3), z2 = list(b = 4, a = 1, c = 0))
zz <- lapply(z, `[`, names(z[[1]])) ## Get sub-elements in same order
apply(do.call(rbind, zz), 2, as.list) ## Stack and reslice

EDIT -- working from #Josh O'Briens suggestion and my own improvemes
The problem was that do.call rbind was not calling rbind.data.frame which does some matching of names. rbind.data.frame should work, because data.frames are lists and each sublist is a list, so we could just call it directly.
apply(do.call(rbind.data.frame, z), 1, as.list)
However, while this may be succicint, it is slow because do.call(rbind.data.frame, ...) is inherently slow.
Something like (in two steps)
# convert each component of z to a data.frame
# so rbind.data.frame so named elements are matched
x <- data.frame((do.call(rbind, lapply(z, data.frame))))
# convert each column into an appropriately named list
o <- lapply(as.list(x), function(i,nam) as.list(`names<-`(i, nam)), nam = rownames(x))
o
$a
$a$z1
[1] 1
$a$z2
[1] 1
$b
$b$z1
[1] 2
$b$z2
[1] 4
$c
$c$z1
[1] 3
$c$z2
[1] 0
And an alternative
# unique names
nn <- Reduce(unique,lapply(z, names))
# convert from matrix to list `[` used to ensure correct ordering
as.list(data.frame(do.call(rbind,lapply(z, `[`, nn))))

reshape can get you close,
library(reshape)
b = recast(z, L2~L1)
split(b[,-1], b$L2)

The recently released purrr contains a function, transpose, whose's purpose is to 'revert' a list structure. There is a major caveat to the transpose function, it matches on position and not name, https://cran.r-project.org/web/packages/purrr/purrr.pdf. These means that it is not the correct tool for the benchmark 1 above. I therefore only consider benchmark 2 and 3 below.
Benchmark 2
B2 <- list(z1 = list(a = 1, b = 2, c = 'ciao'), z2 = list(a = 0, b = 3, c = 5))
revert_list_str_8 <- function(ll) { # #z109620
transpose(ll)
}
microbenchmark(revert_list_str_1(B2), revert_list_str_3(B2), revert_list_str_4(B2), revert_list_str_7(B2), revert_list_str_8(B2), times = 1e3)
Unit: microseconds
expr min lq mean median uq max neval
revert_list_str_1(B2) 228.752 254.1695 297.066646 268.8325 293.5165 4501.231 1000
revert_list_str_3(B2) 211.645 232.9070 277.149579 250.9925 278.6090 2512.361 1000
revert_list_str_4(B2) 79.673 92.3810 112.889130 100.2020 111.4430 2522.625 1000
revert_list_str_7(B2) 237.062 252.7030 293.978956 264.9230 289.1175 4838.982 1000
revert_list_str_8(B2) 2.445 6.8440 9.503552 9.2880 12.2200 148.591 1000
Clearly function transpose is the winner! It also utilizes much less code.
Benchmark 3
B3 <- list(z1 = list(a = 1, b = m, c = 'ciao'), z2 = list(a = 0, b = 3, c = m))
microbenchmark(revert_list_str_1(B3), revert_list_str_3(B3), revert_list_str_4(B3), revert_list_str_7(B3), revert_list_str_8(B3), times = 1e3)
Unit: microseconds
expr min lq mean median uq max neval
revert_list_str_1(B3) 229.242 253.4360 280.081313 266.877 281.052 2818.341 1000
revert_list_str_3(B3) 213.600 232.9070 271.793957 248.304 272.743 2739.646 1000
revert_list_str_4(B3) 80.161 91.8925 109.713969 98.980 108.022 2403.362 1000
revert_list_str_7(B3) 236.084 254.6580 287.274293 264.922 280.319 2718.628 1000
revert_list_str_8(B3) 2.933 7.3320 9.140367 9.287 11.243 55.233 1000
Again, transpose outperforms all others.
The problem with these above benchmarks test is that they focus on very small lists. For this reason, the numerous loops nested within functions 1-7 do not pose too much of a problem. As the size of the list and therefore the iteration increase, the speed gains of transpose will likely increase.
The purrr package is awesome! It does a lot more than revert lists. In combination with the dplyr package, the purrr package makes it possible to do most of your coding using the poweriful and beautiful functional programming paradigm. Thank the lord for Hadley!

How about this simple solution, which is completely general, and almost as fast as Josh O'Brien's original answer that assumed common internal names (#4).
zv <- unlist(unname(z), recursive=FALSE)
ans <- split(setNames(zv, rep(names(z), lengths(z))), names(zv))
And here is a general version that is robust to not having names:
invertList <- function(z) {
zv <- unlist(unname(z), recursive=FALSE)
zind <- if (is.null(names(zv))) sequence(lengths(z)) else names(zv)
if (!is.null(names(z)))
zv <- setNames(zv, rep(names(z), lengths(z)))
ans <- split(zv, zind)
if (is.null(names(zv)))
ans <- unname(ans)
ans
}

I'd like to add a further solution to this valuable collection (to which I have turned many times):
revert_list_str_9 <- function(x) do.call(Map, c(c, x))
If this were code golf, we'd have a clear winner! Of course, this requires the individual list entries to be in the same order. This can be extended, using various ideas from above, such as
revert_list_str_10 <- function(x) {
nme <- names(x[[1]]) # from revert_list_str_4
do.call(Map, c(c, lapply(x, `[`, nme)))
}
revert_list_str_11 <- function(x) {
nme <- Reduce(unique, lapply(x, names)) # from revert_list_str_3
do.call(Map, c(c, lapply(x, `[`, nme)))
}
Performance-wise it's also not too shabby. If stuff is properly sorted, we have a new base R solution to beat. If not, timings still are very competitive.
z <- list(z1 = list(a = 1, b = 2, c = 3), z2 = list(b = 4, a = 1, c = 0))
microbenchmark::microbenchmark(
revert_list_str_1(z), revert_list_str_2(z), revert_list_str_3(z),
revert_list_str_4(z), revert_list_str_5(z), revert_list_str_7(z),
revert_list_str_9(z), revert_list_str_10(z), revert_list_str_11(z),
times = 1e3
)
#> Unit: microseconds
#> expr min lq mean median uq max
#> revert_list_str_1(z) 51.946 60.9845 67.72623 67.2540 69.8215 1293.660
#> revert_list_str_2(z) 461.287 482.8720 513.21260 490.5495 498.8110 1961.542
#> revert_list_str_3(z) 80.180 89.4905 99.37570 92.5800 95.3185 1424.012
#> revert_list_str_4(z) 19.383 24.2765 29.50865 26.9845 29.5385 1262.080
#> revert_list_str_5(z) 499.433 525.8305 583.67299 533.1135 543.4220 25025.568
#> revert_list_str_7(z) 56.647 66.1485 74.53956 70.8535 74.2445 1309.346
#> revert_list_str_9(z) 6.128 7.9100 11.50801 10.2960 11.5240 1591.422
#> revert_list_str_10(z) 8.455 10.9500 16.06621 13.2945 14.8430 1745.134
#> revert_list_str_11(z) 14.953 19.8655 26.79825 22.1805 24.2885 2084.615
Unfortunately, this is not my creation, but exists courtesy of #thelatemail.

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Merge two vectors based on index in R - r

I'm trying to merge two vectors of same length where NAs in vector "a" align with the numbers in vector "b" and vice versa: a <- c(1, NA, 3, NA) b <- c(NA, 2, NA, 4) The output should be: 1, 2 ,3, 4 Thanks for the help! edit: the solution I used was a[is.na(a)] <- b[is.na(a)]

Related

Count number of 1's from right to left, stopping at the first 0

Apply a function by row on a dataframe independently of the number of columns

Transpose a nested list

Paste all possible diagonals of an n*n matrix or dataframe

Revert list structure

Categories

Resources