Replace multiple values in a list in R

If I have:
mylist <- lapply(1:10, function(x) matrix(NA, nrow=2, ncol=2))
And I want to replace, for example, the first, second and fifth elements in the list with:
mymatrix <- cbind(c(1, 1), c(1, 1))
What can I do? I tried with:
mylist[c(1,2,5)]=mymatrix
But that doesn't substitute the new matrix correctly, and with [[ ]] I can only access one element at a time.
I think I have to use lapply, but I can't figure out how.

Would this work for you?
mylist[c(1, 2, 5)] <- lapply(mylist[c(1, 2, 5)], function(x) x <- mymatrix)
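As a quick check (a sketch using the objects defined above), the targeted elements should now all equal mymatrix:
# verify that elements 1, 2 and 5 were replaced
all(vapply(mylist[c(1, 2, 5)], identical, logical(1), mymatrix))
# [1] TRUE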

Similar to @jaSf's answer but faster and "cleaner":
idx <- c(1, 2, 3)
mylist[idx] <- list(mymatrix)
microbenchmark:
Unit: nanoseconds
 expr  min   lq     mean median   uq     max neval cld
 this  687  828 1135.152    959 1127 2787458 1e+05   a
 jaSf 2982 3575 4482.867   4034 4535 2979424 1e+05   b
Otherwise, I would recommend using modifyList() to update named lists, e.g.:
foo <- list(a = 1, b = list(c = "a", d = FALSE))
bar <- modifyList(foo, list(e = 2, b = list(d = TRUE)))
str(foo)
str(bar)
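A quick look at the result of the merge (a sketch based on the example above):
# modifyList() merges recursively: the nested b$d is updated, e is added,
# and untouched entries are carried over unchanged
bar$b$d   # TRUE
bar$e     # 2
bar$b$c   # "a"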

Another option could be to just use a for-loop:
for (i in c(1, 2, 5)) {
  mylist[[i]] <- mymatrix
}

Related

do.call doesn't work with "+" as "what" and a list of 3+ elements

I can use do.call to sum two vectors elementwise:
do.call(what="+", args =list(c(0,0,1), c(1,2,3))
>[1] 1 2 4
However, if I'd like to call the same operator with a list of three vectors, it fails:
do.call(what = "+", args = list(c(0,0,1), c(1,2,3), c(9,1,2)))
>Error in `+`(c(0, 0, 1), c(1, 2, 3), c(9, 1, 2)): operator needs one or two arguments
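For context, `+` is a binary operator, so the sum has to be built up pairwise; a minimal illustration of the chained calls (not part of the original attempt):
# chaining two binary calls works; this nesting is what Reduce(`+`, ...) performs
do.call("+", list(do.call("+", list(c(0, 0, 1), c(1, 2, 3))), c(9, 1, 2)))
>[1] 10 3 6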
I could use Reduce
Reduce(f = "+", x = list(c(0,0,1), c(1,2,3), c(9,1,2)))
>[1] 10 3 6
but I am aware of the overhead of the Reduce operation compared to do.call, and in my real application it isn't tolerable, as I need to sum not a 3-element list but rather a 10^5-element list of 10^4-element-long vectors.
UPD: Reduce turned out to be the fastest method, after all...
lst <- list(1:10000, 10001:20000, 20001:30000)
lst2 <- lst[rep(seq.int(length(lst)), 1000)]
microbenchmark::microbenchmark(colSums(do.call(rbind, lst2)),
                               vapply(transpose(lst2), sum, 0),
                               Reduce(f = "+", x = lst2))
Unit: milliseconds
                            expr      min       lq     mean   median       uq       max neval cld
   colSums(do.call(rbind, lst2)) 153.5086 194.9139 222.7954 198.1952 201.8152  915.6354   100   b
 vapply(transpose(lst2), sum, 0) 398.9424 537.3834 732.4747 781.7255 813.7376 1538.4301   100   c
       Reduce(f = "+", x = lst2) 101.5618 105.5864 139.8651 108.1204 112.7861 2567.1793   100   a
As your list gets larger, you might find that this starts to become fast:
# careful: if you use the tidyverse, make sure purrr::transpose does not mask data.table::transpose
library(data.table)
lst <- list(c(0,0,1), c(1,2,3), c(9, 1, 2))
vapply(transpose(lst), sum, 0)
# [1] 10 3 6
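The trick is that transpose() regroups the vectors element-wise, so summing each group gives the element-wise total; a quick look at the intermediate structure (a sketch using lst above):
str(transpose(lst))
# List of 3
#  $ : num [1:3] 0 1 9
#  $ : num [1:3] 0 2 1
#  $ : num [1:3] 1 3 2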
I have taken a few answers to compare speed, which seems to be what you want.
# make the list a bit bigger...
lst2 <- lst[rep(seq.int(length(lst)), 1000)]
microbenchmark::microbenchmark(Reduce(`+`, lst2),
                               colSums(do.call(rbind, lst2)),
                               vapply(transpose(lst2), sum, 0),
                               eval(str2lang(paste0(lst2, collapse = "+"))))
Unit: microseconds
                                         expr     min       lq      mean   median       uq     max neval
                            Reduce(`+`, lst2)   954.9  1088.10  1341.271  1191.05  1389.00  6923.2   100
                colSums(do.call(rbind, lst2))   402.2   474.80   761.473   538.85   843.75  7079.7   100
              vapply(transpose(lst2), sum, 0)    81.9    91.85   110.455   103.90   119.00   330.4   100
 eval(str2lang(paste0(lst2, collapse = "+"))) 17489.2 18466.65 20767.888 19572.25 20809.80 57770.4   100
Here it is, though, with longer vectors, as in your use case. This benchmark will take a minute or two to run. Notice that the unit is now milliseconds. I think the result will depend on how long the list is.
lst <- list(1:10000, 10001:20000, 20001:30000)
lst2 <- lst[rep(seq.int(length(lst)), 1000)]
microbenchmark::microbenchmark(colSums(do.call(rbind, lst2)),
                               vapply(transpose(lst2), sum, 0))
Unit: milliseconds
                            expr      min       lq     mean   median       uq      max neval
   colSums(do.call(rbind, lst2)) 141.7147 146.6305 188.5108 163.4915 228.7852 270.5679   100
 vapply(transpose(lst2), sum, 0) 261.8630 335.6093 348.6241 341.6958 348.6404 495.0994   100
You could use:
colSums(do.call(rbind, lst))
#[1] 10 3 6
Or similarly:
rowSums(do.call(cbind, lst))
where lst is:
lst <- list(c(0,0,1), c(1,2,3), c(9, 1, 2))
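Both orientations give the same totals (a quick check using lst above):
# rbind stacks the vectors as rows, cbind as columns, so colSums of the former
# equals rowSums of the latter
identical(colSums(do.call(rbind, lst)), rowSums(do.call(cbind, lst)))
#[1] TRUE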
Another base R workaround:
rowSums(as.data.frame(lst))
or
eval(str2lang(paste0(lst,collapse = "+")))
which gives
[1] 10 3 6
Data
lst <- list(c(0,0,1), c(1,2,3), c(9, 1, 2))
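For illustration, the expression text that paste0() builds before str2lang() parses it (exact spacing of the deparsed vectors may differ):
paste0(lst, collapse = "+")
# [1] "c(0, 0, 1)+c(1, 2, 3)+c(9, 1, 2)"
# str2lang() turns this string into an unevaluated call, and eval() computes it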

Efficiently find set differences and generate random sample

I have a very large data set with categorical labels a and a vector b that contains all possible labels in the data set:
a <- c(1,1,3,2) # artificial data
b <- c(1,2,3,4) # fixed categories
Now I want to find for each observation in a the set of all remaining categories (that is, the elements of b excluding the given observation in a). From these remaining categories, I want to sample one at random.
My approach using a loop is
goal <- numeric() # container for results
for (i in 1:4) {
  d <- setdiff(b, a[i])   # the categories except the one observed in the data
  goal[i] <- sample(d, 1) # sample one of the remaining categories randomly
}
goal
[1] 4 4 1 1
However, this has to be done a large number of times and applied to very large data sets. Does anyone have a more efficient version that leads to the desired result?
EDIT:
The function by akrun is unfortunately slower than the original loop. If anyone has a creative idea with a competitive result, I'm happy to hear it!
We can use vapply
vapply(a, function(x) sample(setdiff(b, x), 1), numeric(1))
set.seed(24)
a <- sample(c(1:4), 10000, replace=TRUE)
b <- 1:4
system.time(vapply(a, function(x) sample(setdiff(b, x), 1), numeric(1)))
# user system elapsed
# 0.208 0.007 0.215
It turns out that resampling only the labels that collide with the data is an even faster approach:
test <- sample(b, length(a), replace = TRUE)
resample <- (a == test)
while (sum(resample) > 0) {
  test[resample] <- sample(b, sum(resample), replace = TRUE)
  resample <- (a == test)
}
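A quick sanity check (a sketch using the objects above): the loop only exits once every drawn label differs from the observed one, so this must be FALSE:
any(test == a)
# [1] FALSE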
Updated benchmarks for N = 10,000:
Unit: microseconds
                               expr       min        lq       mean    median         uq       max neval
                               loop 14337.492 14954.595 16172.2165 15227.010 15585.5960 24071.727   100
                              akrun 14899.000 15507.978 16271.2095 15736.985 16050.6690 24085.839   100
                           resample    87.242   102.423   113.4057   112.473   122.0955   174.056   100
        shree(data = a, labels = b)  5195.128  5369.610  5472.4480  5454.499  5574.0285  5796.836   100
 shree_mapply(data = a, labels = b)  1500.207  1622.516  1913.1614  1682.814  1754.0190 10449.271   100
Update: here's a fast version with mapply. This method avoids calling sample() for every observation, so it is a bit faster:
mapply(function(x, y) b[!b == x][y], a, sample(length(b) - 1, length(a), replace = T))
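As a quick sanity check (a sketch; res is only an illustrative name), none of the drawn labels should match the observed ones:
res <- mapply(function(x, y) b[!b == x][y], a,
              sample(length(b) - 1, length(a), replace = TRUE))
any(res == a)
# [1] FALSE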
Here's a version without setdiff (setdiff can be a bit slow), although I think even more optimization is possible:
vapply(a, function(x) sample(b[!b == x], 1), numeric(1))
Benchmarks -
set.seed(24)
a <- sample(c(1:4), 1000, replace=TRUE)
b <- 1:4
microbenchmark::microbenchmark(
  akrun = vapply(a, function(x) sample(setdiff(b, x), 1), numeric(1)),
  shree = vapply(a, function(x) sample(b[!b == x], 1), numeric(1)),
  shree_mapply = mapply(function(x, y) b[!b == x][y], a, sample(length(b) - 1, length(a), replace = TRUE))
)
Unit: milliseconds
         expr     min       lq      mean   median      uq      max neval
        akrun 28.7347 30.66955 38.319655 32.57875 37.45455 237.1690   100
        shree  5.6271  6.05740  7.531964  6.47270  6.87375  45.9081   100
 shree_mapply  1.8286  2.01215  2.628989  2.14900  2.54525   7.7700   100

R - Extracting information from list of lists of data.frames

I have two needs, both connected to a dataset similar to the reproducible one below. I have a list of 18 entities, each composed of a list of 17-19 data.frames. Reproducible dataset follows (there are matrices instead of data.frames, but I do not suppose that makes a difference):
test <- list(list(matrix(10:(50-1), ncol = 10), matrix(60:(100-1), ncol = 10), matrix(110:(150-1), ncol = 10)),
list(matrix(200:(500-1), ncol = 10), matrix(600:(1000-1), ncol = 10), matrix(1100:(1500-1), ncol = 10)))
I need to subset each data.frame/matrix into two parts (by a given number of rows) and save the results to a new list of lists.
Secondly, I need to extract and save given column(s) from every data.frame in the list of lists.
I have no idea how to go about doing it apart from for(), but I am sure it should be possible with the apply() family of functions.
Thank you for reading
EDIT:
My expected output would look as follows:
extractedColumns <- list(list(matrix(10:(50-1), ncol = 10)[, 2], matrix(60:(100-1), ncol = 10)[, 2], matrix(110:(150-1), ncol = 10)[, 2]),
list(matrix(200:(500-1), ncol = 10)[, 2], matrix(600:(1000-1), ncol = 10)[, 2], matrix(1100:(1500-1), ncol = 10)[, 2]))
numToSubset <- 3
subsetFrames <- list(list(list(matrix(10:(50-1), ncol = 10)["first length - numToSubset rows", ], matrix(10:(50-1), ncol = 10)["last numToSubset rows", ]),
list(matrix(60:(100-1), ncol = 10)["first length - numToSubset rows", ], matrix(60:(100-1), ncol = 10)["last numToSubset rows", ]),
list(matrix(110:(150-1), ncol = 10)["first length - numToSubset rows", ], matrix(110:(150-1), ncol = 10)["last numToSubset rows", ])),
etc...)
It gets to look very messy, hope you can follow what I want.
You can use two nested lapplys:
lapply(test, function(x) lapply(x, '[', c(2, 3)))
Output:
[[1]]
[[1]][[1]]
[1] 11 12
[[1]][[2]]
[1] 61 62
[[1]][[3]]
[1] 111 112
[[2]]
[[2]][[1]]
[1] 201 202
[[2]][[2]]
[1] 601 602
[[2]][[3]]
[1] 1101 1102
Explanation
The first lapply is applied over the two lists in test. Each of those two lists contains another 3 elements. The second lapply iterates over those 3 elements and subsets them (that's the '[' function in the second lapply) with c(2, 3).
Note: in the case of a matrix, [ will subset elements 2 and 3, but the same call will subset columns 2 and 3 when used on a data.frame.
Subsetting rows and columns
lapply is very flexible with the use of anonymous functions. By changing the code into:
#change rows and columns into what you need
lapply(test, function(x) lapply(x, function(y) y[rows, columns]))
You can specify any combination of rows or columns you want.
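For the first need from the question (splitting each matrix by a given number of rows), a minimal sketch along the same lines; numToSubset and the first/last names are only illustrative:
# split every matrix into "all but the last numToSubset rows" and
# "the last numToSubset rows", keeping the nested-list shape
numToSubset <- 3
subsetFrames <- lapply(test, function(x)
  lapply(x, function(y) {
    n <- nrow(y)
    list(first = y[seq_len(n - numToSubset), , drop = FALSE],
         last  = y[(n - numToSubset + 1):n, , drop = FALSE])
  }))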
To piggyback on @LyzandeR's answer, consider the often ignored sibling of the apply family, rapply, which can recursively run functions on nested lists of vectors/matrices, returning such nested structures. It can often compare with nested lapply or its variants vapply/sapply:
newtest1 <- lapply(test, function(x) lapply(x, '[', c(2, 3)))
newtest2 <- rapply(test, function(x) `[`(x, c(2, 3)), classes="matrix", how="list")
all.equal(newtest1, newtest2)
# [1] TRUE
Interestingly, to my amazement, rapply runs slower in this use case compared to nested lapply! Hmmmm, back to the lab I go...
library(microbenchmark)
microbenchmark(newtest1 <- lapply(test, function(x) lapply(x, '[', c(2, 3))))
# Unit: microseconds
# mean median uq max neval
# 31.92804 31.278 32.241 74.587 100
microbenchmark(newtest2 <- rapply(test, function(x) `[`(x, c(2, 3)),
classes="matrix", how="list"))
# Unit: microseconds
# min lq mean median uq max neval
# 69.293 72.18 79.53353 73.143 74.5865 219.91 100
Even more interesting: when the [ operator is replaced by the equivalent matrix bracketing, nested lapply runs even better and rapply even worse!
microbenchmark(newtest3 <- lapply(test, function(x)
lapply(x, function(y) y[c(2, 3), 1])))
# Unit: microseconds
# min lq mean median uq max neval
# 26.947 28.391 32.00987 29.354 30.798 100.09 100
all.equal(newtest1, newtest3)
# [1] TRUE
microbenchmark(newtest4 <- rapply(test, function(x) x[c(2,3), 1],
classes="matrix", how="list"))
# Unit: microseconds
# min lq mean median uq max neval
# 74.105 76.752 80.37076 77.955 78.918 203.549 100
all.equal(newtest2, newtest4)
# [1] TRUE

Transpose a nested list

I would like to transpose a nested list. Assume the following nested list x is given:
a <- list(c("a","b","c","d"))
b <- list(c("d","c","b","a"))
c <- list(c("4","3","2","1"))
d <- list(c("1","2","3","4"))
x <- list(a,b,c,d)
The outcome should be a nested list in which the first "column" of the original list x becomes the first nested list element, that is "a","d","4","1"; the second column becomes the second nested list element, i.e. "b","c","3","2"; and so on. In the end the structure is a kind of transpose of the original. How can this be done in R?
We could also do without lapply (using matrix):
relist(matrix(unlist(x), ncol = 4, byrow = T), skeleton = x)
Benchmarking
library(microbenchmark)
a <- list(c("a","b","c","d"))
b <- list(c("d","c","b","a"))
c <- list(c("4","3","2","1"))
d <- list(c("1","2","3","4"))
x <- list(a,b,c,d)
f_akrun <- function(x) {m1 <- do.call(rbind, lapply(x, function(y) do.call(rbind, y)));relist(m1, skeleton = x);}
f_m0h3n <- function(x) {relist(matrix(unlist(x), ncol = length(x[[1]][[1]]), byrow = T), skeleton = x)}
setequal(f_akrun(x), f_m0h3n(x))
# [1] TRUE
microbenchmark(f_akrun(x), f_m0h3n(x))
# Unit: microseconds
# expr min lq mean median uq max neval
# f_akrun(x) 135.591 137.301 144.3545 138.585 148.422 334.484 100
# f_m0h3n(x) 110.782 111.638 116.5477 112.493 117.412 212.153 100
We can try
m1 <- do.call(rbind, lapply(x, function(y) do.call(rbind, y)))
relist(m1, skeleton = x)
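A base R alternative sketch (assumes every inner vector has the same length; vecs and transposed are illustrative names, not from either answer):
# zip the inner vectors element-wise with Map(), then restore the one-element
# inner lists so the result mirrors x's nesting; unname() drops the names
# Map() attaches from the first vector
vecs <- lapply(x, `[[`, 1)   # pull the single character vector out of each inner list
transposed <- unname(lapply(do.call(Map, c(list(c), vecs)), list))
str(transposed[[1]])
# List of 1
#  $ : chr [1:4] "a" "d" "4" "1"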

Programmatically sum columns of a data frame?

Suppose I have the following data frame:
dd <- data.frame(x1 = c(1, 2, 3), x2 = c(3, 4, 5), x3 = c(5, 6, 7))
I want to create a new column, like so:
dd$x.sum <- dd$x1 + dd$x2 + dd$x3
But instead of writing out the addition (suppose I actually have 100 columns x1, x2, ..., x100 that I want to sum together), I want to write it programmatically. Something like:
dd$x.sum <- sum(sapply(1:3, function(i) {
return(dd[paste0("x", i)])
}))
Except that doesn't work.
How do I do this?
You could also use Reduce with +, although this won't have an na.rm argument
dd <- data.frame(matrix(runif(1e6),ncol=1000))
dd$sum <- Reduce('+', dd[paste0('X',seq_len(1000))])
# some benchmarking
xx <- paste0("X", seq_len(1000))
library(microbenchmark)
microbenchmark(Reduce('+', dd[xx]), rowSums(dd[xx]))
## Unit: milliseconds
##                 expr      min       lq   median       uq      max neval
##  Reduce("+", dd[xx]) 14.93642 15.19713 15.56077 15.78606 31.92162   100
##      rowSums(dd[xx]) 30.79629 31.44574 31.67192 32.14342 58.07938   100
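If NA handling matters, rowSums(dd[xx], na.rm = TRUE) is the simplest route; a Reduce-based workaround is sketched below (it treats NA as 0, which may or may not be what you want; sum_na0 is only an illustrative name):
# zero out NAs column-by-column before reducing, since Reduce() itself has no na.rm
dd$sum_na0 <- Reduce(`+`, lapply(dd[xx], function(v) replace(v, is.na(v), 0)))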
There is no need for iteration:
dd$sum <- rowSums(dd[paste0("x", 1:3)])
However, if you want to iterate, you can do so over the column names themselves:
dd$sum <- rowSums(sapply(paste0("x", 1:3), function(ind) dd[[ind]]))
The reason the sum(sapply(...)) statement is not working as written is that sapply returns a list here (dd[paste0("x", i)] with single brackets is a one-column data frame, so the result cannot be simplified), and sum cannot take a list as an argument.
