Increase performance by moving away from a for loop in R

The gist of the argument is the following:
A function that I wrote takes one argument, an alphanumeric string, and should output a string in which each character of the input is replaced according to some 'mapping'. MRE as follows:
library(data.table)
# mapping between the original characters and their switched values
map = data.table(mapped = c(0:35), original = c(0:9, LETTERS))
# the function that I'm using:
as_numbers <- function(string) {
  # split the string into single characters
  vector_unlisted <- unlist(strsplit(string, ""))
  # look up each character in the map
  for (i in 1:length(vector_unlisted)) {
    vector_unlisted[i] <- subset(map, map$original == vector_unlisted[i])[[1]][1]
  }
  vector_unlisted <- paste0(vector_unlisted, collapse = "")
  return(vector_unlisted)
}
I am trying to replace the for loop with something faster: the function works, but it is quite slow for the number of elements I am supplying in this form:
unlist(lapply(dat$alphanum, function(x) as_numbers(x)))
An example of the input strings would be: 549300JV8KEETQJYUG13. This should result in a string like 5493001931820141429261934301613.
Supplying just one string in this case:
> as_numbers("549300JV8KEETQJYUG13")
[1] "5493001931820141429261934301613"

We can use base conversion: the characters 0-9 and A-Z are exactly the digits of base 36 (values 0 to 35), so strtoi(..., base = 36) performs the mapping directly:
#input and expected output
x <- "549300JV8KEETQJYUG13"
# "5493001931820141429261934301613"
#output
res <- paste0(strtoi(unlist(strsplit(x, "")), base = 36), collapse = "")
#test output
as_numbers(x) == res
# [1] TRUE
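A quick check of the base-36 digit values this relies on (my addition, purely illustrative):
strtoi(c("0", "9", "A", "Z"), base = 36)
# [1]  0  9 10 35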
Performance
Since this post is about performance, here is a benchmark* of 3 solutions:
# input set-up
map = data.table(mapped = c(0:35), original = c(0:9, LETTERS))
x <- rep(c("549300JV8KEETQJYUG13", "5493V8KE300J"), 1000)
# define functions
base_f <- function(string) {
  sapply(string, function(x) {
    paste0(strtoi(unlist(strsplit(x, "")), base = 36), collapse = "")
  })
}
match_f <- function(string) {
  mapped <- map$mapped
  original <- map$original
  sapply(strsplit(string, ""), function(y) {
    paste0(mapped[match(y, original)], collapse = "")
  })
}
reduce_f <- function(string) {
  Reduce(function(string, r)
    gsub(map$original[r], map$mapped[r], string, fixed = TRUE),
    seq_len(nrow(map)), string)
}
#test if all return same output
all(base_f(x) == match_f(x))
# [1] TRUE
all(base_f(x) == reduce_f(x))
# [1] TRUE
library(rbenchmark)
benchmark(replications = 1000,
          base_f(x),
          match_f(x),
          reduce_f(x))
#          test replications elapsed relative user.self sys.self user.child sys.child
# 1   base_f(x)         1000   22.15    4.683     22.12        0         NA        NA
# 2  match_f(x)         1000   19.18    4.055     19.11        0         NA        NA
# 3 reduce_f(x)         1000    4.73    1.000      4.72        0         NA        NA
*Note: microbenchmark() kept throwing warnings, hence rbenchmark() was used instead. Feel free to test with other libraries and update this post.

Using Reduce and gsub, you could define the following function:
replacer <- function(x) {
  Reduce(function(x, r) gsub(map$original[r], map$mapped[r], x, fixed = TRUE),
         seq_len(nrow(map)), x)
}
# Let's test it
replacer("549300JV8KEETQJYUG13")
#[1] "5493001931820141429261934301613"

Seems like a merge:
map[as.data.table(unlist(strsplit(string, ""))),
.(mapped), on = c(original = "V1")][ , paste0(mapped, collapse = "")]
Note that both "D1" and "1V" will be mapped to "131"...
On your example, the output is: "5493001931820141429261934301613"
You can use a separator (e.g. collapse = ".") if you actually want this to be a reversible mapping...
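For illustration (my addition, using the as_numbers function and map from the question):
as_numbers("D1")  # "131"  (D -> 13, 1 -> 1)
as_numbers("1V")  # "131"  (1 -> 1, V -> 31)
# with collapse = "." the two stay distinguishable: "13.1" vs "1.31"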

I would use match:
as_numbers <- function(string) {
  lapply(strsplit(string, ""), function(y) {
    paste0(map$mapped[match(y, map$original)], collapse = "")
  })
}
as_numbers(c("549300JV8KEETQJYUG13", "5493V8KE300J"))
#[[1]]
#[1] "5493001931820141429261934301613"
#
#[[2]]
#[1] "5493318201430019"
Added an lapply call to handle length > 1 input correctly.
If you need a further speed-up, you can store map$mapped and map$original in separate vectors and use them in the match call instead of map$..., so you don't subset the data.frame/data.table repeatedly (which is quite costly).
Since the Q was about performance, here's a benchmark of two of the solutions:
map = data.table(mapped = c(0:35), original = c(0:9,LETTERS))
x <- rep(c("549300JV8KEETQJYUG13", "5493V8KE300J"), 1000)
ascii_func <- function(string) {
  lapply(string, function(x) {
    x_ascii <- strtoi(charToRaw(x), 16)
    paste(ifelse(x_ascii >= 65 & x_ascii <= 90,
                 x_ascii - 55, x_ascii - 48),
          collapse = "")
  })
}
match_func <- function(string) {
  mapped <- map$mapped
  original <- map$original
  lapply(strsplit(string, ""), function(y) {
    paste0(mapped[match(y, original)], collapse = "")
  })
}
library(microbenchmark)
microbenchmark(ascii_func(x), match_func(x), times = 25L)
#Unit: milliseconds
#          expr   min    lq  mean median     uq    max neval
# ascii_func(x) 83.47 92.55 96.91  96.82 103.06 112.07    25
# match_func(x) 24.30 24.74 26.86  26.11  28.67  31.55    25
identical(ascii_func(x), match_func(x))
#[1] TRUE

Related

How to Find the Second Highest digit in a string R

Question:
Write a function that accepts a string and returns the second highest numerical digit in the input as an integer.
The following rules should apply:
Inputs with no numerical digits should return -1
Inputs with only one numerical digit should return -1
Non-numeric characters should be ignored
Each digit occurrence should be treated individually, meaning that in the event of a joint highest digit, the second-highest digit will also be the highest digit
For example:
"abc:1231234" returns 3
"123123" returns 3
[execution time limit] 5 seconds (R)
[input] string input: the input string
[output] integer: the second-highest digit
I can convert the string into a numeric vector with strsplit and as.numeric and get rid of the NAs (letters), but I am not sure where to go from there.
Clarification: ideally a base R solution.
I've got this code so far which, while messy, deals with all but the case where there are joint highest numbers:
solution <- function(input) {
  d <- as.integer(strsplit(input, "")[[1]])
  if (any(is.na(d))) {
    d <- d[-which(is.na(d))]
  }
  if (all(is.na(d))) {
    return(-1)
  }
  if (length(is.na(d)) == length(d) - 1) {
    return(-1)
  }
  sort(d, TRUE)[2]
}
A stringr::str_count solution:
library(stringr)
secondHighest1 <- function(str) {
  # count occurrences of each digit from 9 down to 0; the first digit whose
  # cumulative count exceeds 1 is the second-highest (or the highest, if repeated)
  ans <- 10L - match(TRUE, cumsum(str_count(str, paste0(9:0))) > 1L)
  if (is.na(ans)) -1L else ans
}
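For reference, a quick check against the examples from the question (my addition; outputs as expected from the rules, not re-run here):
secondHighest1("abc:1231234")  # expected 3
secondHighest1("abcdefg4")     # expected -1 (only one digit)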
A base R solution:
secondHighest2 <- function(str) {
  suppressWarnings(ans <- 10L - match(TRUE, cumsum(tabulate(10L - as.integer(strsplit(str, "")[[1]]))) > 1L))
  if (is.na(ans)) -1L else ans
}
UPDATE: Borrowing Adam's idea of using utf8ToInt instead of strsplit gives a big speed boost:
secondHighest3 <- function(str) {
  nums <- utf8ToInt(str)
  nums <- nums[nums < 58L]
  if (length(nums) > 1L) max(-1L, max(nums[-which.max(nums)]) - 48L) else -1L
}
set.seed(94)
chrs <- c(paste0(9:0), letters, LETTERS)
str <- paste0(sample(chrs, 1e5, TRUE, (1:62)^4), collapse = "")
secondHighest1(str)
#> [1] 3
secondHighest2(str)
#> [1] 3
secondHighest3(str)
#> [1] 3
microbenchmark::microbenchmark(secondHighest1(str),
                               secondHighest2(str),
                               secondHighest3(str))
#> Unit: microseconds
#>                 expr     min       lq      mean   median      uq     max neval
#>  secondHighest1(str)  1193.8  1279.55  1524.712  1338.80  1525.2  5013.7   100
#>  secondHighest2(str) 16825.3 18049.65 21297.962 19339.75 24126.4 36652.6   100
#>  secondHighest3(str)   706.0   774.80  1371.196   867.40  1045.0 17780.4   100
As a one-liner
string <- "abc:1231234"
sort(unique(suppressWarnings(as.integer(strsplit(string, "", fixed = TRUE)[[1]]))), decreasing = TRUE)[2]
#> [1] 3
Or using the magrittr pipe:
library(magrittr)
suppressWarnings(
  strsplit(string, "", fixed = TRUE)[[1]] %>%
    as.integer() %>%
    unique() %>%
    sort(decreasing = TRUE) %>%
    .[2]
)
#> [1] 3
Created on 2022-03-25 by the reprex package (v2.0.1)
You can also do something like this to convert to ASCII integers.
solution <- function(input) {
  if (nchar(input) < 2L) return(-1L)
  # ASCII codes for 0:9 are 48:57
  int_input <- utf8ToInt(input) - 48L
  sort(replace(int_input, !(int_input %in% 0L:9L), -1L), decreasing = TRUE)[2]
}
Testing a few strings...
str1 <- "abc:1231234"
str2 <- "987654321"
str3 <- "abcdefg4"
str4 <- "abc$<>$#%fgdgLJJ"
str5 <- "123123"
solution(str1)
# [1] 3
solution(str2)
# [1] 8
solution(str3)
# [1] -1
solution(str4)
# [1] -1
solution(str5)
# [1] 3

How to convert playtime to seconds in R

I have a vector as follows:
playtimes <- c("1H18M20S", "1H27M5S", "18M27S", "56M38S", "21S")
and I want to convert these to playtimes in seconds. For example, the resulting vector would be something like this:
playtimeInSeconds <- c(4700, 5225, 1107, 3398, 21)
I'm having trouble separating the strings correctly based on the H, M and S. I wrote the following, which works for the playtimes under 1 hour:
library(magrittr)  # for %>%
minutes <- gsub("M.*", "", playtimes)
seconds <- gsub(".*M", "", playtimes) %>%
  gsub("S", "", .)
totalPlaytime <- as.numeric(minutes) * 60 + as.numeric(seconds)
But I'm not sure how to tackle the H portion of some strings.
You could strsplit on the unit letters and pad each result on the left to length 3 (missing hours/minutes become NA), which lets you use sapply to get a matrix to which you apply the matrix product %*%:
m <- sapply(strsplit(playtimes, 'H|M|S'), \(x) as.double(rev(`length<-`(rev(x), 3))))
res <- as.vector(t(replace(m, is.na(m), 0)) %*% rbind(3600, 60, 1))
res
# [1] 4700 5225 1107 3398 21
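For illustration (my addition), the intermediate matrix m for the example vector looks like this, with NA where a unit is absent:
m
#      [,1] [,2] [,3] [,4] [,5]
# [1,]    1    1   NA   NA   NA
# [2,]   18   27   18   56   NA
# [3,]   20    5   27   38   21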
Interesting problem. Here is a solution that could potentially be more efficient, but it does the job:
# function from https://www.statworx.com/de/blog/strsplit-but-keeping-the-delimiter/
strsplit <- function(x,
                     split,
                     type = "remove",
                     perl = FALSE,
                     ...) {
  if (type == "remove") {
    # use base::strsplit
    out <- base::strsplit(x = x, split = split, perl = perl, ...)
  } else if (type == "before") {
    # split before the delimiter and keep it
    out <- base::strsplit(x = x,
                          split = paste0("(?<=.)(?=", split, ")"),
                          perl = TRUE,
                          ...)
  } else if (type == "after") {
    # split after the delimiter and keep it
    out <- base::strsplit(x = x,
                          split = paste0("(?<=", split, ")"),
                          perl = TRUE,
                          ...)
  } else {
    # wrong type input
    stop("type must be remove, after or before!")
  }
  return(out)
}
# convert to seconds
to_seconds <- c(H = 60 * 60,
                M = 60,
                S = 1)
get_seconds <- function(value, unit) {
  value * to_seconds[unit]
}
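For instance (illustrative only):
get_seconds(c(1, 18, 20), c("H", "M", "S"))
#    H    M    S
# 3600 1080   20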
# example vector
playtimes <- c("1H18M20S", "1H27M5S", "18M27S", "56M38S", "21S")
# extract time parts
times <- strsplit(playtimes,
                  split = "[A-Z]",
                  type = "after")
times
#> [[1]]
#> [1] "1H" "18M" "20S"
#>
#> [[2]]
#> [1] "1H" "27M" "5S"
#>
#> [[3]]
#> [1] "18M" "27S"
#>
#> [[4]]
#> [1] "56M" "38S"
#>
#> [[5]]
#> [1] "21S"
# calculate each time in seconds
sapply(times,
       function(t) {
         # split numeric and unit part
         t_split <- strsplit(x = t,
                             split = "[A-Z]",
                             type = "before")
         # calculate seconds for each unit part
         times_in_seconds <- get_seconds(value = as.numeric(sapply(t_split, `[`, 1)),
                                         unit = sapply(t_split, `[`, 2))
         # sum of all parts
         sum(times_in_seconds)
       })
#> [1] 4700 5225 1107 3398 21
I followed the example given in the 3rd answer here and made the following (str_count is from stringr; hms and time_length are from lubridate):
library(stringr)
library(lubridate)
playtime <- sapply(playtimes, function(x) {
  paste(paste(rep(0, 3 - str_count(x, '[0-9]+')), collapse = ' '), x)
})
totalPlaytime <- time_length(hms(playtime))
Short, sweet, and it handles the cases where the playtime is less than 1 hr or less than 1 min.

Fastest way to convert a list of character vectors to numeric in R

In R, what is the fastest way to convert a list containing suites of character numbers (as character vectors) into numeric?
With the following dummy data:
set.seed(2)
N = 1e7
ncol = 10
myT = formatC(matrix(runif(N), ncol = ncol)) # A matrix converted to characters
# Each row is collapsed into a single suite of characters:
myT = apply(myT, 1, function(x) paste(x, collapse=' ') )
head(myT)
Producing:
[1] "0.1849 0.855 0.8272 0.5403 0.3891 0.5184 0.7776 0.5533 0.1566 0.01591"
[2] "0.7024 0.1008 0.9442 0.8582 0.3184 0.9289 0.9957 0.1311 0.2131 0.07355"
[3] "0.5733 0.5493 0.3915 0.4423 0.8522 0.6042 0.9265 0.006878 0.7052 0.71"
[... etc ...]
I could do
library(stringi)
# In the actual dataset, the number of spaces between numbers may vary, hence "\\s+"
system.time(newT <- lapply(stri_split_regex(myT, "\\s+", omit_empty=T), as.numeric))
newT <- unlist(newT) # Final goal is to have a single vector of numbers
On my Intel Core i7 2.10GHz with 64-bit and 16GB system (under ubuntu):
user system elapsed
3.748 0.008 3.757
With the real dataset (ncol=150 and N~1e9), this is way too long.
Any better option?
This is twice as fast on my system:
x <- paste(myT, collapse = "\n")
library(data.table)
DT <- fread(x)
newT2 <- c(t(DT))
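A note on the last step (my addition, not from the original answer): c() reads a matrix column by column, so transposing first returns the values in row order, i.e. the original order:
c(t(matrix(1:6, nrow = 2, byrow = TRUE)))
# [1] 1 2 3 4 5 6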
I would suggest the "iotools" package, specifically the mstrsplit function. With that you would just do:
library(iotools)
newT <- as.vector(t(mstrsplit(myT, sep = " ", ncol = 10, type = "numeric")))
Get the "iotools" package on GitHub.
Timing comparisons:
OPFun <- function(myT) {
  newT <- lapply(stri_split_regex(myT, "\\s+", omit_empty = T), as.numeric)
  unlist(newT)
}
RolandFun <- function(myT) {
  x <- paste(myT, collapse = "\n")
  DT <- fread(x)
  newT2 <- c(t(DT))
  newT2
}
AMFun <- function(myT) {
  as.vector(t(mstrsplit(myT, sep = " ", ncol = 10, type = "numeric")))
}
system.time(OP <- OPFun(myT))
# user system elapsed
# 3.920 0.004 3.917
system.time(Roland <- RolandFun(myT))
# user system elapsed
# 3.156 0.020 3.175
system.time(AM <- AMFun(myT))
# user system elapsed
# 0.664 0.016 0.676
all.equal(OP, Roland)
# [1] TRUE
all.equal(Roland, AM)
# [1] TRUE
mstrsplit(myT, sep = " ", type = "numeric")[, 1] is marginally faster. Note that the order of operations influences performance: unlist(lapply(x, as.numeric)) is slower than as.numeric(unlist(x)).
set.seed(2)
N = 1e4
ncol = 10
myT = formatC(matrix(runif(N), ncol = ncol)) # A matrix converted to characters
myT = apply(myT, 1, function(x) paste(x, collapse=' ') )
head(myT)
library(microbenchmark)
library(stringi)
library(data.table)
library(iotools)
microbenchmark(
  original = {
    newT <- lapply(stri_split_regex(myT, "\\s+", omit_empty = T), as.numeric)
    unlist(newT)
  },
  data.table = {
    x <- paste(myT, collapse = "\n")
    DT <- fread(x)
    c(t(DT))
  },
  iotools = {
    as.vector(t(mstrsplit(myT, sep = " ", ncol = 10, type = "numeric")))
  },
  strsplit = {
    as.numeric(unlist(strsplit(myT, " ")))
  },
  original2 = {
    as.numeric(unlist(stri_split_regex(myT, "\\s+", omit_empty = TRUE)))
  },
  iotools2 = {
    mstrsplit(myT, sep = " ", type = "numeric")[, 1]
  }
)
Unit: milliseconds
       expr      min       lq     mean   median       uq       max neval cld
   original 52.03538 53.56949 56.02025 54.27165 55.40487  94.45513   100   c
 data.table 93.10810 94.63730 98.04845 95.41537 96.51202 212.66666   100   e
    iotools 18.73776 19.44485 21.00974 19.75573 20.05614  42.47620   100  a
   strsplit 67.04637 69.24053 70.58916 69.86529 70.95980  84.86132   100   d
  original2 48.25558 49.47346 51.49833 50.14377 50.96139  84.22928   100  b
   iotools2 18.53165 19.19126 19.72922 19.52567 19.71340  32.48726   100  a

"raise" inner list to level of outer list in R [duplicate]

I am trying to achieve the functionality similar to unlist, with the exception that types are not coerced to a vector, but the list with preserved types is returned instead. For instance:
flatten(list(NA, list("TRUE", list(FALSE), 0L)))
should return
list(NA, "TRUE", FALSE, 0L)
instead of
c(NA, "TRUE", "FALSE", "0")
which would be returned by unlist(list(NA, list("TRUE", list(FALSE), 0L))).
As it is seen from the example above, the flattening should be recursive. Is there a function in standard R library which achieves this, or at least some other function which can be used to easily and efficiently implement this?
UPDATE: I don't know if it is clear from the above, but non-lists should not be flattened, i.e. flatten(list(1:3, list(4, 5))) should return list(c(1, 2, 3), 4, 5).
Interesting non-trivial problem!
MAJOR UPDATE With all that's happened, I've rewrote the answer and removed some dead ends. I also timed the various solutions on different cases.
Here's the first, rather simple but slow, solution:
flatten1 <- function(x) {
  y <- list()
  rapply(x, function(x) y <<- c(y, x))
  y
}
rapply lets you traverse a list and apply a function to each leaf element. Unfortunately, it behaves exactly like unlist with respect to the returned values, so I ignore the result of rapply and instead append values to the variable y using <<-.
Growing y in this manner is not very efficient (it's quadratic in time). So if there are many thousands of elements this will be very slow.
A more efficient approach is the following, with simplifications from #JoshuaUlrich:
flatten2 <- function(x) {
  len <- sum(rapply(x, function(x) 1L))
  y <- vector('list', len)
  i <- 0L
  rapply(x, function(x) { i <<- i + 1L; y[[i]] <<- x })
  y
}
Here I first find out the result length and pre-allocate the vector. Then I fill in the values.
As you will see, this solution is much faster.
Here's a version of #JoshO'Brien's great solution based on Reduce, but extended so that it handles arbitrary depth:
flatten3 <- function(x) {
  repeat {
    if (!any(vapply(x, is.list, logical(1)))) return(x)
    x <- Reduce(c, x)
  }
}
Now let the battle begin!
# Check correctness on original problem
x <- list(NA, list("TRUE", list(FALSE), 0L))
dput( flatten1(x) )
#list(NA, "TRUE", FALSE, 0L)
dput( flatten2(x) )
#list(NA, "TRUE", FALSE, 0L)
dput( flatten3(x) )
#list(NA_character_, "TRUE", FALSE, 0L)
# Time on a huge flat list
x <- as.list(1:1e5)
#system.time( flatten1(x) ) # Long time
system.time( flatten2(x) ) # 0.39 secs
system.time( flatten3(x) ) # 0.04 secs
# Time on a huge deep list
x <-'leaf'; for(i in 1:11) { x <- list(left=x, right=x, value=i) }
#system.time( flatten1(x) ) # Long time
system.time( flatten2(x) ) # 0.05 secs
system.time( flatten3(x) ) # 1.28 secs
...So what we observe is that the Reduce solution is faster when the depth is low, and the rapply solution is faster when the depth is large!
As correctness goes, here are some tests:
> dput(flatten1( list(1:3, list(1:3, 'foo')) ))
list(1L, 2L, 3L, 1L, 2L, 3L, "foo")
> dput(flatten2( list(1:3, list(1:3, 'foo')) ))
list(1:3, 1:3, "foo")
> dput(flatten3( list(1:3, list(1:3, 'foo')) ))
list(1L, 2L, 3L, 1:3, "foo")
Unclear what result is desired, but I lean towards the result from flatten2...
For lists that are only a few nestings deep, you could use Reduce() and c() to do something like the following. Each application of c() removes one level of nesting. (For a fully general solution, see the EDITs below.)
L <- (list(NA, list("TRUE", list(FALSE), 0L)))
Reduce(c, Reduce(c, L))
[[1]]
[1] NA
[[2]]
[1] "TRUE"
[[3]]
[1] FALSE
[[4]]
[1] 0
# TIMING TEST
x <- as.list(1:4e3)
system.time(flatten(x)) # Using the improved version
# user system elapsed
# 0.14 0.00 0.13
system.time(Reduce(c, x))
# user system elapsed
# 0.04 0.00 0.03
EDIT Just for fun, here's a version of #Tommy's version of #JoshO'Brien's solution that does work for already flat lists. FURTHER EDIT Now #Tommy's solved that problem as well, but in a cleaner way. I'll leave this version in place.
flatten <- function(x) {
  x <- list(x)
  repeat {
    x <- Reduce(c, x)
    if (!any(vapply(x, is.list, logical(1)))) return(x)
  }
}
flatten(list(3, TRUE, 'foo'))
# [[1]]
# [1] 3
#
# [[2]]
# [1] TRUE
#
# [[3]]
# [1] "foo"
How about this? It builds off Josh O'Brien's solution but does the recursion with a while loop instead, using unlist with recursive=FALSE.
flatten4 <- function(x) {
  while (any(vapply(x, is.list, logical(1)))) {
    # this next line gives behavior like Tommy's answer;
    # removing it gives behavior like Josh's
    x <- lapply(x, function(x) if (is.list(x)) x else list(x))
    x <- unlist(x, recursive = FALSE)
  }
  x
}
Keeping the commented line in gives results like this (which Tommy prefers, and so do I, for that matter).
> x <- list(1:3, list(1:3, 'foo'))
> dput(flatten4(x))
list(1:3, 1:3, "foo")
Output from my system, using Tommy's tests:
dput(flatten4(foo))
#list(NA, "TRUE", FALSE, 0L)
# Time on a long
x <- as.list(1:1e5)
system.time( x2 <- flatten2(x) ) # 0.48 secs
system.time( x3 <- flatten3(x) ) # 0.07 secs
system.time( x4 <- flatten4(x) ) # 0.07 secs
identical(x2, x4) # TRUE
identical(x3, x4) # TRUE
# Time on a huge deep list
x <-'leaf'; for(i in 1:11) { x <- list(left=x, right=x, value=i) }
system.time( x2 <- flatten2(x) ) # 0.05 secs
system.time( x3 <- flatten3(x) ) # 1.45 secs
system.time( x4 <- flatten4(x) ) # 0.03 secs
identical(x2, unname(x4)) # TRUE
identical(unname(x3), unname(x4)) # TRUE
EDIT: As for getting the depth of a list, maybe something like this would work; it gets the index for each element recursively.
depth <- function(x) {
  foo <- function(x, i = NULL) {
    if (is.list(x)) {
      lapply(seq_along(x), function(xi) foo(x[[xi]], c(i, xi)))
    } else {
      i
    }
  }
  flatten4(foo(x))
}
It's not super fast but it seems to work fine.
x <- as.list(1:1e5)
system.time(d <- depth(x)) # 0.327 s
x <-'leaf'; for(i in 1:11) { x <- list(left=x, right=x, value=i) }
system.time(d <- depth(x)) # 0.041s
I'd imagined it being used this way:
> x[[ d[[5]] ]]
[1] "leaf"
> x[[ d[[6]] ]]
[1] 1
But you could also get a count of how many nodes are at each depth too.
> table(sapply(d, length))
1 2 3 4 5 6 7 8 9 10 11
1 2 4 8 16 32 64 128 256 512 3072
Edited to address a flaw pointed out in the comments. Sadly, it just makes it even less efficient. Ah well.
Another approach, although I'm not sure it will be more efficient than anything #Tommy has suggested: flatten everything with rapply (which coerces to a common type), record each leaf's class and length, and then coerce each element back to its original class:
l <- list(NA, list("TRUE", list(FALSE), 0L))
flatten <- function(x) {
  obj <- rapply(x, identity, how = "unlist")
  cl <- rapply(x, class, how = "unlist")
  len <- rapply(x, length, how = "unlist")
  cl <- rep(cl, times = len)
  mapply(function(obj, cl) { rs <- as(obj, cl); rs }, obj, cl,
         SIMPLIFY = FALSE, USE.NAMES = FALSE)
}
> flatten(l)
[[1]]
[1] NA
[[2]]
[1] "TRUE"
[[3]]
[1] FALSE
[[4]]
[1] 0
purrr::flatten achieves that. Though it is not recursive (by design).
So applying it twice should work:
library(purrr)
l <- list(NA, list("TRUE", list(FALSE), 0L))
flatten(flatten(l))
Here is an attempt at a recursive version:
flatten_recursive <- function(x) {
  stopifnot(is.list(x))
  if (any(vapply(x, is.list, logical(1)))) Recall(purrr::flatten(x)) else x
}
flatten_recursive(l)
hack_list <- function(.list) {
  .list[['_hack']] <- function() NULL
  .list <- unlist(.list)
  .list$`_hack` <- NULL
  .list
}
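The trick here (my note, not part of the original answer): a function cannot be stored in an atomic vector, so unlist cannot coerce the list to a vector; it only flattens the nesting, preserving each element's type, and the dummy '_hack' entry is then removed. For example:
hack_list(l)
# a flat list containing NA, "TRUE", FALSE and 0L, with types preserved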
You can also use rrapply in the rrapply-package (extended version of base-rapply) by setting how = "flatten":
library(rrapply)
rrapply(list(NA, list("TRUE", list(FALSE), 0L)), how = "flatten")
#> [[1]]
#> [1] NA
#>
#> [[2]]
#> [1] "TRUE"
#>
#> [[3]]
#> [1] FALSE
#>
#> [[4]]
#> [1] 0
Computation times
Below are some benchmark timings against the flatten2 and flatten3 functions in Tommy's response for two large nested lists:
flatten2 <- function(x) {
  len <- sum(rapply(x, function(x) 1L))
  y <- vector('list', len)
  i <- 0L
  rapply(x, function(x) { i <<- i + 1L; y[[i]] <<- x })
  y
}
flatten3 <- function(x) {
  repeat {
    if (!any(vapply(x, is.list, logical(1)))) return(x)
    x <- Reduce(c, x)
  }
}
## large deeply nested list (1E6 elements, 6 layers)
deep_list <- rrapply(replicate(10, 1, simplify = F), classes = c("list", "numeric"),
                     condition = function(x, .xpos) length(.xpos) < 6,
                     f = function(x) replicate(10, 1, simplify = F), how = "recurse")
system.time(flatten2(deep_list))
#> user system elapsed
#> 1.715 0.012 1.727
## system.time(flatten3(deep_list)), not run takes more than 10 minutes
system.time(rrapply(deep_list, how = "flatten"))
#> user system elapsed
#> 0.105 0.016 0.121
## large shallow nested list (1E6 elements, 2 layers)
shallow_list <- lapply(replicate(1000, 1, simplify = F), function(x) replicate(1000, 1, simplify = F))
system.time(flatten2(shallow_list))
#> user system elapsed
#> 1.308 0.040 1.348
system.time(flatten3(shallow_list))
#> user system elapsed
#> 5.246 0.012 5.259
system.time(rrapply(shallow_list, how = "flatten"))
#> user system elapsed
#> 0.09 0.00 0.09

R filtering out a subset

I have a data.frame A and a data.frame B which contains a subset of A.
How can I create a data.frame C which is data.frame A with data.frame B excluded?
Thanks for your help.
Get the rows in A that aren't in B:
C = A[! data.frame(t(A)) %in% data.frame(t(B)), ]
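For intuition, a small made-up example of mine (not part of the original answer): data.frame(t(A)) turns each row of A into a list element, so %in% flags the rows of A that also occur in B. Note that t() coerces mixed-type columns to character, which is the caveat raised in a later answer.
A <- data.frame(x = 1:4, y = letters[1:4])
B <- A[c(2, 4), ]
A[! data.frame(t(A)) %in% data.frame(t(B)), ]
#   x y
# 1 1 a
# 3 3 c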
If this B data set is truly a subset of the first data set, there has to be indexing that created it to begin with. IMHO we shouldn't be discussing the differences between the data sets but negating the original indexing that created the B data set in the first place. Here's an example of what I mean:
A <- mtcars
B <- mtcars[mtcars$cyl==6, ]
C <- mtcars[mtcars$cyl!=6, ]
A <- data.frame(x = 1:10, y = 1:10)
#Random subset of A in B
B <- A[sample(nrow(A),3),]
#get A that is not in B
C <- A[-as.integer(rownames(B)),]
Performance test vis-a-vis mplourde's answer:
library(rbenchmark)
f1 <- function() A[- as.integer(rownames(B)),]
f2 <- function() A[! data.frame(t(A)) %in% data.frame(t(B)), ]
benchmark(f1(), f2(), replications = 10000,
columns = c("test", "elapsed", "relative"),
order = "elapsed"
)
  test elapsed relative
1 f1()   1.531   1.0000
2 f2()   8.846   5.7779
Looking at the rownames is approximately 6x faster. Two calls to transpose can get expensive computationally.
If B is truly a subset of A, which you can check with:
if(!identical(A[rownames(B), , drop = FALSE], B)) stop("B is not a subset of A!")
then you can filter by rownames:
C <- A[!rownames(A) %in% rownames(B), , drop = FALSE]
or
C <- A[setdiff(rownames(A), rownames(B)), , drop = FALSE]
Here are two data.table solutions that will be memory and time efficient
library(data.table)
# some biggish data
set.seed(1234)
ADT <- data.table(x = seq.int(1e+07), y = seq.int(1e+07))
.rows <- sample(nrow(ADT), 30000)
# Random subset of A in B
BDT <- ADT[.rows, ]
# set keys for fast merge
setkey(ADT, x)
setkey(BDT, x)
## data.table approach: CDT <- ADT[-ADT[BDT, which = T]]
## also copy the data as data.frames for the fastest base alternative
A <- copy(ADT)
setattr(A, "class", "data.frame")
B <- copy(BDT)
setattr(B, "class", "data.frame")
f2 <- function() noBDT <- ADT[-ADT[BDT, which = T]]
f3 <- function() noBDT2 <- ADT[-BDT[, x]]
f1 <- function() noB <- A[-as.integer(rownames(B)), ]
library(rbenchmark)
benchmark(base = f1(),DT = f2(), DT2 = f3(), replications = 3)
##    test replications elapsed relative user.self sys.self
## 2    DT            3    0.92    1.108      0.77     0.15
## 1  base            3    3.72    4.482      3.19     0.52
## 3   DT2            3    0.83    1.000      0.72     0.11
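To unpack the key step (my addition): with the keys set, ADT[BDT, which = TRUE] returns the positions of BDT's rows inside ADT, so negating those indices drops exactly B's rows:
idx <- ADT[BDT, which = TRUE]  # row numbers of ADT matched by BDT's rows
CDT <- ADT[-idx]               # A without B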
This is not the fastest and is likely to be very slow, but it is an alternative to mplourde's answer that takes into account the row data and should work on mixed data, which flodel critiqued. It relies on the paste2 function from the qdap package, which doesn't exist yet as I plan to release it within the next month or 2:
Paste 2 function:
paste2 <- function(multi.columns, sep = ".", handle.na = TRUE, trim = TRUE) {
  if (trim) multi.columns <- lapply(multi.columns, function(x) {
    gsub("^\\s+|\\s+$", "", x)
  })
  if (!is.data.frame(multi.columns) & is.list(multi.columns)) {
    multi.columns <- do.call('cbind', multi.columns)
  }
  m <- if (handle.na) {
    apply(multi.columns, 1, function(x) {
      if (any(is.na(x))) {
        NA
      } else {
        paste(x, collapse = sep)
      }
    })
  } else {
    apply(multi.columns, 1, paste, collapse = sep)
  }
  names(m) <- NULL
  return(m)
}
# Flodel's mixed data set:
A <- data.frame(x = 1:4, y = as.character(1:4)); B <- A[1:2, ]
# My approach:
A[!paste2(A)%in%paste2(B), ]
