Convert named vector to list in R

Suppose I have the following named numeric vector:
a <- 1:8
names(a) <- rep(c('I', 'II'), each = 4)
How can I convert this vector to a list of length 2 (shown below)?
a.list
# $I
# [1] 1 2 3 4
# $II
# [1] 5 6 7 8
Note that as.list(a) is not what I'm looking for.
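For reference, as.list(a) instead gives one single-element entry per value, with the names simply repeated rather than grouped:
length(as.list(a))        # 8 single-element entries
unique(names(as.list(a))) # "I" "II" - the names repeat per element instead of grouping the values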
My very unsatisfying (and slow for large vectors) solution is:
names.uniq <- unique(names(a))
a.list <- setNames(vector('list', length(names.uniq)), names.uniq)
for(i in 1:length(names.uniq)) {
  names.i <- names.uniq[i]
  a.i <- a[names(a) == names.i]
  a.list[[names.i]] <- unname(a.i)
}
Thank you in advance for your help,
Devin

Like I said in the comment, you can use split to create a list.
a.list <- split(a, names(a))
a.list <- lapply(a.list, unname)
A one-liner would be
a.list <- lapply(split(a, names(a)), unname)
#$I
#[1] 1 2 3 4
#
#$II
#[1] 5 6 7 8
EDIT.
Then, thelatemail posted a simplification of this in his comment. I've timed it against Devin King's approach and it's not only simpler, it's also about 25% faster.
a.list <- split(unname(a),names(a))

I'd suggest looking at packages that excel at working with aggregating large amounts of data, like the data.table package. With data.table, you could do:
a <- 1:5e7
names(a) <- c(rep('I',1e7), rep('II',1e7), rep('III',1e7),
              rep('IV',1e7), rep('V',1e7))
library(data.table)
# group the values of a by their names and collect each group into a list column
temp <- data.table(names(a), a)[, list(V2 = list(a)), V1]
# pull the list column out and name its elements by group
a.list <- setNames(temp[["V2"]], temp[["V1"]])
Here are some functions to test the various options out with:
myFun <- function(invec) {
  x <- data.table(names(invec), invec)[, list(V2 = list(invec)), V1]
  setNames(x[["V2"]], x[["V1"]])
}
rui1 <- function(invec) {
  a.list <- split(invec, names(invec))
  lapply(a.list, unname)
}
rui2 <- function(invec) {
  split(unname(invec), names(invec))
}
op <- function(invec) {
  names.uniq <- unique(names(invec))
  a.list <- setNames(vector('list', length(names.uniq)), names.uniq)
  for(i in 1:length(names.uniq)) {
    names.i <- names.uniq[i]
    a.i <- invec[names(invec) == names.i]
    a.list[[names.i]] <- unname(a.i)
  }
  a.list
}
And the results of microbenchmark on 10 replications:
library(microbenchmark)
microbenchmark(myFun(a), rui1(a), rui2(a), op(a), times = 10)
# Unit: milliseconds
# expr min lq mean median uq max neval
# myFun(a) 698.1553 768.6802 932.6525 934.6666 1056.558 1168.889 10
# rui1(a) 2967.4927 3097.6168 3199.9378 3185.1826 3319.453 3413.185 10
# rui2(a) 2152.0307 2285.4515 2372.9896 2362.7783 2426.821 2643.033 10
# op(a) 2672.4703 2872.5585 2896.7779 2901.7979 2971.782 3039.663 10
Also, note that in testing the different solutions, you might want to consider other scenarios, for instance, cases where you expect to have lots of different names. In that case, your for loop slows down significantly. Try, for example, the above functions with the following data:
set.seed(1)
b <- sample(100, 5e7, TRUE)
names(b) <- sample(c(letters, LETTERS, 1:100), 5e7, TRUE)
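For instance, the same benchmark can be rerun on b with the functions defined above (a sketch; per the point above, expect the for loop in op() to fall behind here):
microbenchmark(myFun(b), rui1(b), rui2(b), op(b), times = 10)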

Testing Rui Barradas' solution vs my original solution on a larger vector
a <- 1:5e7
names(a) <- c(rep('I',1e7), rep('II',1e7), rep('III',1e7), rep('IV',1e7), rep('V',1e7))
Rui's
st1 <- Sys.time()
a.list <- split(a, names(a))
a.list <- lapply(a.list, unname)
Sys.time() - st1
Time difference of 2.560906 secs
Mine
st1 <- Sys.time()
names.uniq <- unique(names(a))
a.list <- setNames(vector('list', length(names.uniq)), names.uniq)
for(i in 1:length(names.uniq)) {
  names.i <- names.uniq[i]
  a.i <- a[names(a) == names.i]
  a.list[[names.i]] <- unname(a.i)
}
Sys.time() - st1
Time difference of 2.712066 secs
thelatemail's
st1 <- Sys.time()
a.list <- split(unname(a),names(a))
Sys.time() - st1
Time difference of 1.62851 secs

To also handle unnamed vectors, you can use:
vec_to_list <- function(vec) {
  if (is.null(names(vec))) names(vec) <- 1:length(vec)
  split(unname(vec), names(vec))
}
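A brief usage sketch (an unnamed vector simply gets positional names before splitting):
vec_to_list(a)        # same two-element list as before
vec_to_list(c(3, 7))  # unnamed input: one element each under the generated names "1" and "2"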

Related

Fastest Way To Find Last Names From String in R

I am trying to identify likely last name from parts of name strings in various formats in R. What is the fastest way to identify the longest string match from the dataset of last names to a given name string (I'm using the wru surnames2010 dataset)?
I need the longest possibility rather than any possibility. I.e. in the example below the first string "scottcampbell" contains possible surnames "scott" and "campbell". I want to only return the longest of the possible matches, in this case only "campbell".
Reproduce example data:
library(wru)
data("surnames2010")
#filter out names under 4 characters
lnames <- surnames2010[nchar(as.character(surnames2010$surname))>3,]
testvec <- c("scottcampbell","mattbaker","tsmith","watkins","burnsmary","terri","frankrodriguez","neal")
Desired imagined function+result:
foo_longest_matches(testvec)
#Desired imagined result:
[1] "campbell" "baker" "smith" "watkins" "burns" "terri" "rodriguez" "neal")
You could use adist. Note that you are doing more than 1 million comparisons to obtain the longest match, so I would prefer a different method. The best I have in mind so far is:
a <- adist(toupper(testvec), surnames2010$surname, counts = TRUE)
b <- attr(a, "trafos")  # transformation sequences (M = match, I = insert, D = delete, S = substitute)
# penalise candidates needing substitutions/insertions or fragmented matches, then add the edit distance
d <- array(grepl("S|I", b) + nchar(gsub("(.)\\1++", "1", b, perl = TRUE)), dim(a)) * 10 + a
as.character(surnames2010$surname[max.col(-d)])  # pick the surname with the lowest score per input
[1] "CAMPBELL" "BAKER" "SMITH" "WATKINS" "BURNS" "TERRI" "RODRIGUEZ" "NEAL"
benchmark:
longest <- function(testvec, namevec){
  a <- adist(testvec, namevec, counts = TRUE)
  b <- attr(a, "trafos")
  d <- array(grepl("S|I", b) + nchar(gsub("(.)\\1++", "1", b, perl = TRUE)), dim(a)) * 10 + a
  as.character(namevec[max.col(-d)])
}
EDIT: I was able to obtain a faster method (not necessarily the fastest):
longest2 <- function(testvec, namevec){
  # for each surname, find the test strings that contain it, then keep the longest match per test string
  a <- stack(sapply(namevec, grep, testvec, value = TRUE, simplify = FALSE))
  tapply(as.character(a[, 2]), a[, 1], function(x) x[which.max(nchar(x))])[testvec]
}
microbenchmark::microbenchmark(longest(testvec,lnames$surname),longest2(testvec,lnames$surname),foo_longest_matches(testvec),times = 5)
Unit: seconds
expr min lq mean median uq max neval
longest(testvec, lnames$surname) 3.316550 3.984128 5.308339 6.265192 6.396348 6.579477 5
longest2(testvec, lnames$surname) 1.817059 1.917883 2.835354 3.350068 3.538278 3.553481 5
foo_longest_matches(testvec) 10.093179 10.325489 11.610619 10.756714 10.889326 15.988384 5
Not sure about fastest but here is a method to test:
library(wru)
data("surnames2010")
lnames <- surnames2010[nchar(as.character(surnames2010$surname))>3,]
testvec <- c("scottcampbell","mattbaker","tsmith","watkins","burnsmary","terri","frankrodriguez","neal")
lnames$surname <- tolower(lnames$surname)
testvec <- tolower(testvec)
foo_longest_matches <- function(string_vector) {
  outdf <- c()
  for (name in string_vector) {
    print(name)
    ting <- lnames[sapply(lnames$surname, function(x) grepl(x, name)), ]
    # you only care about the longest, remove the next line to get all matches
    ting <- ting[which.max(nchar(ting$surname)), ]
    outdf <- rbind(outdf, ting)
  }
  return(outdf)
}
get_matches <- foo_longest_matches(testvec)
get_matches
# surname p_whi p_bla p_his p_asi p_oth
# 47 campbell 0.7366 0.2047 0.02490000 0.00530000 0.02840000
# 44 baker 0.7983 0.1444 0.02280000 0.00560000 0.02890000
# 1 smith 0.7090 0.2311 0.02400000 0.00500000 0.03080000
# 240 watkins 0.6203 0.3227 0.02090000 0.00420000 0.03200000
# 155 burns 0.8026 0.1406 0.02480000 0.00590000 0.02610000
# 110133 terri 0.7453 0.1801 0.01243333 0.01243333 0.04973333
# 9 rodriguez 0.0475 0.0054 0.93770000 0.00570000 0.00360000
# 337 neal 0.6210 0.3184 0.02160000 0.00600000 0.03290000

R Merge two data frames where one data frame is a subset of the columns of the second data frame [duplicate]

I have a list of data frames with different sets of columns. I would like to combine them by rows into one data frame. I use plyr::rbind.fill to do that. I am looking for something that would do this more efficiently, but is similar to the answer given here
require(plyr)
set.seed(45)
sample.fun <- function() {
  nam <- sample(LETTERS, sample(5:15))
  val <- data.frame(matrix(sample(letters, length(nam)*10, replace=TRUE), nrow=10))
  setNames(val, nam)
}
ll <- replicate(1e4, sample.fun())
rbind.fill(ll)
UPDATE: See this updated answer instead.
UPDATE (eddi): This has now been implemented in version 1.8.11 as a fill argument to rbind. For example:
DT1 = data.table(a = 1:2, b = 1:2)
DT2 = data.table(a = 3:4, c = 1:2)
rbind(DT1, DT2, fill = TRUE)
# a b c
#1: 1 1 NA
#2: 2 2 NA
#3: 3 NA 1
#4: 4 NA 2
FR #4790 added now - rbind.fill (from plyr)-like functionality to merge a list of data.frames/data.tables
Note 1:
This solution uses data.table's rbindlist function to "rbind" a list of data.tables; for this, be sure to use version 1.8.9 because of this bug in versions < 1.8.9.
Note 2:
rbindlist, when binding lists of data.frames/data.tables, as of now, will retain the data type of the first column. That is, if a column in the first data.frame is character and the same column in the second data.frame is factor, rbindlist will result in this column being character. So, if your data.frame consists of all character columns, this solution will be identical to the plyr method. If not, the values will still be the same, but some columns will be character instead of factor. You'll have to convert them to factor yourself afterwards. Hopefully this behaviour will change in the future.
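A minimal sketch of the behaviour described in Note 2 (assuming the data.table version discussed here; newer releases handle the coercion differently):
library(data.table)
df_chr <- data.frame(x = c("a", "b"), stringsAsFactors = FALSE)
df_fct <- data.frame(x = c("c", "d"), stringsAsFactors = TRUE)
combined <- rbindlist(list(df_chr, df_fct))
class(combined$x)  # "character" - the type of the first data.frame's column wins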
And now here's using data.table (and benchmarking comparison with rbind.fill from plyr):
require(data.table)
rbind.fill.DT <- function(ll) {
  # changed sapply to lapply to return a list always
  all.names <- lapply(ll, names)
  unq.names <- unique(unlist(all.names))
  ll.m <- rbindlist(lapply(seq_along(ll), function(x) {
    tt <- ll[[x]]
    setattr(tt, 'class', c('data.table', 'data.frame'))
    data.table:::settruelength(tt, 0L)
    invisible(alloc.col(tt))
    tt[, c(unq.names[!unq.names %chin% all.names[[x]]]) := NA_character_]
    setcolorder(tt, unq.names)
  }))
}
rbind.fill.PLYR <- function(ll) {
  rbind.fill(ll)
}
require(microbenchmark)
microbenchmark(t1 <- rbind.fill.DT(ll), t2 <- rbind.fill.PLYR(ll), times=10)
# Unit: seconds
# expr min lq median uq max neval
# t1 <- rbind.fill.DT(ll) 10.8943 11.02312 11.26374 11.34757 11.51488 10
# t2 <- rbind.fill.PLYR(ll) 121.9868 134.52107 136.41375 184.18071 347.74724 10
# for comparison change t2 to data.table
setattr(t2, 'class', c('data.table', 'data.frame'))
data.table:::settruelength(t2, 0L)
invisible(alloc.col(t2))
setcolorder(t2, unique(unlist(sapply(ll, names))))
identical(t1, t2) # [1] TRUE
It should be noted that plyr's rbind.fill edges past this particular data.table solution until list size of about 500.
Benchmarking plot:
Here's the plot on runs with list length of data.frames with seq(1000, 10000, by=1000). I've used microbenchmark with 10 reps on each of these different list lengths.
Benchmarking gist:
Here's the gist for benchmarking, in case anyone wants to replicate the results.
Now that rbindlist (and rbind) for data.table has improved functionality and speed with the recent changes/commits in v1.9.3 (development version), and dplyr has a faster version of plyr's rbind.fill, named rbind_all, this answer of mine seems a bit too outdated.
Here's the relevant NEWS entry for rbindlist:
o 'rbindlist' gains 'use.names' and 'fill' arguments and is now implemented entirely in C. Closes #5249
-> use.names by default is FALSE for backwards compatibility (doesn't bind by
names by default)
-> rbind(...) now just calls rbindlist() internally, except that 'use.names'
is TRUE by default, for compatibility with base (and backwards compatibility).
-> fill by default is FALSE. If fill is TRUE, use.names has to be TRUE.
-> At least one item of the input list has to have non-null column names.
-> Duplicate columns are bound in the order of occurrence, like base.
-> Attributes that might exist in individual items would be lost in the bound result.
-> Columns are coerced to the highest SEXPTYPE, if they are different, if/when possible.
-> And incredibly fast ;).
-> Documentation updated in much detail. Closes DR #5158.
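As a small illustration of the new use.names and fill arguments (a sketch assuming data.table >= 1.9.3; output shown approximately):
library(data.table)
DT1 = data.table(a = 1:2, b = 1:2)
DT2 = data.table(c = 1:2, a = 3:4)  # overlapping column in a different position, plus a new one
rbindlist(list(DT1, DT2), use.names = TRUE, fill = TRUE)
#    a  b  c
# 1: 1  1 NA
# 2: 2  2 NA
# 3: 3 NA  1
# 4: 4 NA  2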
So, I've benchmarked the newer (and faster versions) on relatively bigger data below.
New Benchmark:
We'll create a total of 10,000 data.tables with columns ranging from 200-300 with the total number of columns after binding to be 500.
Functions to create data:
require(data.table) ## 1.9.3 commit 1267
require(dplyr) ## commit 1504 devel
set.seed(1L)
names = paste0("V", 1:500)
foo <- function() {
  cols = sample(200:300, 1)
  data = setDT(lapply(1:cols, function(x) sample(10)))
  setnames(data, sample(names)[1:cols])
}
n = 10e3L
ll = vector("list", n)
for (i in 1:n) {
  .Call("Csetlistelt", ll, i, foo())
}
And here are the timings:
## Updated timings on data.table v1.9.5 - three consecutive runs:
system.time(ans1 <- rbindlist(ll, fill=TRUE))
# user system elapsed
# 1.993 0.106 2.107
system.time(ans1 <- rbindlist(ll, fill=TRUE))
# user system elapsed
# 1.644 0.092 1.744
system.time(ans1 <- rbindlist(ll, fill=TRUE))
# user system elapsed
# 1.297 0.088 1.389
## dplyr's rbind_all - Timings for three consecutive runs
system.time(ans2 <- rbind_all(ll))
# user system elapsed
# 9.525 0.121 9.761
# user system elapsed
# 9.194 0.112 9.370
# user system elapsed
# 8.665 0.081 8.780
identical(ans1, setDT(ans2)) # [1] TRUE
There is still something to be gained if you parallelize both rbind.fill and rbindlist.
The results were obtained with data.table version 1.8.8, as version 1.8.9 broke when I tried it with the parallelized function. So the results aren't identical between data.table and plyr, but they are identical within the data.table or plyr solutions: parallel plyr matches unparallelized plyr, and likewise for data.table.
Here's the benchmark/scripts. The parallel.rbind.fill.DT looks horrible, but that's the fastest one I could pull.
require(plyr)
require(data.table)
require(ggplot2)
require(rbenchmark)
require(parallel)
# data.table::rbindlist solutions
rbind.fill.DT <- function(ll) {
  all.names <- lapply(ll, names)
  unq.names <- unique(unlist(all.names))
  rbindlist(lapply(seq_along(ll), function(x) {
    tt <- ll[[x]]
    setattr(tt, 'class', c('data.table', 'data.frame'))
    data.table:::settruelength(tt, 0L)
    invisible(alloc.col(tt))
    tt[, c(unq.names[!unq.names %chin% all.names[[x]]]) := NA_character_]
    setcolorder(tt, unq.names)
  }))
}
parallel.rbind.fill.DT <- function(ll, cluster=NULL){
  all.names <- lapply(ll, names)
  unq.names <- unique(unlist(all.names))
  if(is.null(cluster)){
    ll.m <- rbindlist(lapply(seq_along(ll), function(x) {
      tt <- ll[[x]]
      setattr(tt, 'class', c('data.table', 'data.frame'))
      data.table:::settruelength(tt, 0L)
      invisible(alloc.col(tt))
      tt[, c(unq.names[!unq.names %chin% all.names[[x]]]) := NA_character_]
      setcolorder(tt, unq.names)
    }))
  }else{
    cores <- length(cluster)
    sequ <- as.integer(seq(1, length(ll), length.out = cores+1))
    Call <- paste(paste("list", seq(cores), sep=""), " = ll[", c(1, sequ[2:cores]+1), ":", sequ[2:(cores+1)], "]", sep="", collapse=", ")
    ll <- eval(parse(text=paste("list(", Call, ")")))
    rbindlist(clusterApply(cluster, ll, function(ll, unq.names){
      rbindlist(lapply(seq_along(ll), function(x, ll, unq.names) {
        tt <- ll[[x]]
        setattr(tt, 'class', c('data.table', 'data.frame'))
        data.table:::settruelength(tt, 0L)
        invisible(alloc.col(tt))
        tt[, c(unq.names[!unq.names %chin% colnames(tt)]) := NA_character_]
        setcolorder(tt, unq.names)
      }, ll=ll, unq.names=unq.names))
    }, unq.names=unq.names))
  }
}
# plyr::rbind.fill solutions
rbind.fill.PLYR <- function(ll) {
  rbind.fill(ll)
}
parallel.rbind.fill.PLYR <- function(ll, cluster=NULL, magicConst=400){
  if(is.null(cluster) | ceiling(length(ll)/magicConst) < length(cluster)){
    rbind.fill(ll)
  }else{
    cores <- length(cluster)
    sequ <- as.integer(seq(1, length(ll), length.out = ceiling(length(ll)/magicConst)))
    Call <- paste(paste("list", seq(cores), sep=""), " = ll[", c(1, sequ[2:(length(sequ)-1)]+1), ":", sequ[2:length(sequ)], "]", sep="", collapse=", ")
    ll <- eval(parse(text=paste("list(", Call, ")")))
    rbind.fill(parLapply(cluster, ll, rbind.fill))
  }
}
# Function to generate sample data of varying list length
set.seed(45)
sample.fun <- function() {
nam <- sample(LETTERS, sample(5:15))
val <- data.frame(matrix(sample(letters, length(nam)*10,replace=TRUE),nrow=10))
setNames(val, nam)
}
ll <- replicate(10000, sample.fun())
cl <- makeCluster(4, type="SOCK")
clusterEvalQ(cl, library(data.table))
clusterEvalQ(cl, library(plyr))
benchmark(t1 <- rbind.fill.PLYR(ll),
          t2 <- rbind.fill.DT(ll),
          t3 <- parallel.rbind.fill.PLYR(ll, cluster=cl, 400),
          t4 <- parallel.rbind.fill.DT(ll, cluster=cl),
          replications=5)
stopCluster(cl)
# Results for rbinding 10000 dataframes
# done with 4 cores, i5 3570k and 16gb memory
# test reps elapsed relative
# rbind.fill.PLYR 5 321.80 16.682
# rbind.fill.DT 5 26.10 1.353
# parallel.rbind.fill.PLYR 5 28.00 1.452
# parallel.rbind.fill.DT 5 19.29 1.000
# checking are results equal
t1 <- as.matrix(t1)
t2 <- as.matrix(t2)
t3 <- as.matrix(t3)
t4 <- as.matrix(t4)
t1 <- t1[order(t1[, 1], t1[, 2]), ]
t2 <- t2[order(t2[, 1], t2[, 2]), ]
t3 <- t3[order(t3[, 1], t3[, 2]), ]
t4 <- t4[order(t4[, 1], t4[, 2]), ]
identical(t2, t4) # TRUE
identical(t1, t3) # TRUE
identical(t1, t2) # FALSE, mismatch between plyr and data.table
As you can see, parallelizing rbind.fill made it comparable to data.table, and you can get a marginal increase in speed by parallelizing data.table even with this low a data frame count.
Simply using dplyr::bind_rows will do the job:
library(dplyr)
merged_list <- bind_rows(ll)
#check it
> nrow(merged_list)
[1] 100000
> ncol(merged_list)
[1] 26
Time taken
> system.time(merged_list <- bind_rows(ll))
user system elapsed
0.29 0.00 0.28

Why is the [[]] operator subsetting faster than the [,] operator in R?

I benchmarked a few solutions for replacing missing values per column.
set.seed(11)
df <- data.frame(replicate(3, sample(c(1:5, -99), 6, rep = TRUE)))
names(df) <- letters[1:3]
fix_na <- function(x) {
  x[x == -99] <- NA
  x  # return the modified vector (otherwise the function returns the assigned value, NA)
}
microbenchmark(
  for(i in seq_along(df)) df[, i] <- fix_na(df[, i]),
  for(i in seq_along(df)) df[[i]] <- fix_na(df[[i]]),
  df[] <- lapply(df, fix_na)
)
Unit: microseconds
expr min lq mean median uq max neval
for (i in seq_along(df)) df[, i] <- fix_na(df[, i]) 179.167 191.9060 206.1650 204.2335 211.630 364.497 100
for (i in seq_along(df)) df[[i]] <- fix_na(df[[i]]) 83.420 92.8715 104.5787 98.0080 109.309 204.645 100
df[] <- lapply(df, fix_na) 105.199 113.4175 128.0265 117.9385 126.979 305.734 100
Why is the [[]] operator subsetting the dataframe 2x faster than the [,] operator?
EDIT
I included the two recommended calls from docendo discimus and increased the amount of data.
set.seed(11)
df1 <- data.frame(replicate(2000, sample(c(1:5, -99), 500, rep = TRUE)))
df2 <- df1
df3 <- df1
df4 <- df1
df5 <- df1
The results change, yes, but my question still stands: [[ performs faster than [,]
Unit: milliseconds
expr min lq mean median uq
for (i in seq_along(df1)) df1[, i] <- fix_na(df1[, i]) 301.06608 356.48011 377.31592 372.05625 392.73450 472.3330
for (i in seq_along(df2)) df2[[i]] <- fix_na(df2[[i]]) 238.72005 287.55364 301.35651 298.05950 314.04369 386.4288
df3[] <- lapply(df3, fix_na) 170.53264 189.83858 198.32358 193.43300 202.43855 284.1164
df4[df4 == -99] <- NA 75.05571 77.64787 85.59757 80.72697 85.16831 363.2223
is.na(df5) <- df5 == -99 74.44877 77.81799 84.22055 80.06496 83.01401 347.5798
A faster approach would be using set from data.table
library(data.table)
setDT(df)
for(j in seq_along(df)){
  set(df, i = which(df[[j]] == -99), j = j, value = NA)
}
Regarding the OP's question about the benchmarking with [ and [[: [[ extracts the column without the overhead of the [.data.frame method. But I would benchmark on a bigger dataset to see any difference. Also, as we assign NA to the same data, nothing changes when the operation is run again.
Benchmarks
set.seed(11)
df1 <- data.frame(replicate(2000, sample(c(1:5, -99), 500, rep = TRUE)))
df2 <- copy(df1)
df3 <- copy(df1)
df4 <- copy(df1)
df5 <- copy(df1)
df6 <- copy(df1)
f1 <- function() for (i in seq_along(df1)) df1[, i] <- fix_na(df1[, i])
f2 <- function() for (i in seq_along(df2)) df2[[i]] <- fix_na(df2[[i]])
f3 <- function() df3[] <- lapply(df3, fix_na)
f4 <- function() df4[df4 == -99] <- NA
f5 <- function() is.na(df5) <- df5 == -99
f6 <- function() {
  setDT(df6)
  for(j in seq_along(df6)){
    set(df6, i = which(df6[[j]] == -99), j = j, value = NA)
  }
}
t(sapply(paste0("f", 1:6), function(f) system.time(get(f)())))[,1:3]
# user.self sys.self elapsed
#f1 0.29 0 0.30
#f2 0.22 0 0.22
#f3 0.11 0 0.11
#f4 0.31 0 0.31
#f5 0.31 0 0.32
#f6 0.00 0 0.00
Here, I am using system.time because the functions in the OP's post already replace the values with NA in the first run, so there is no point in running them again and again.
I got an answer for a very similar problem on the site suggested by Arun: adv-r.had.co.nz/Performance.html
In the section "Extracting a single value from a data frame" it says:
"The following microbenchmark shows seven ways to access a single value (the number in the bottom-right corner) from the built-in mtcars dataset. The variation in performance is startling: the slowest method takes 30x longer than the fastest. There's no reason that there has to be such a huge difference in performance. It's simply that no one has had the time to fix it."
Among the different selection methods, the two operators [[ and [ are also compared, with the same result I observed: [[ outperforms [.
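For a concrete feel for this, here is a small sketch along the lines of that Advanced R section (not from the original answers), comparing a few ways of pulling a single value out of mtcars; .subset2 is the internal extractor that skips S3 dispatch:
library(microbenchmark)
microbenchmark(
  mtcars[32, 11],           # [.data.frame: most general, most overhead
  mtcars$carb[32],          # $ column extraction, then vector subsetting
  mtcars[[11]][32],         # [[ column extraction, then vector subsetting
  .subset2(mtcars, 11)[32]  # internal extraction, no S3 dispatch
)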

Speed problems when using apply in a function to translate multiple strings

I wrote a little function dictTranslator to translate multiple strings into abbreviations or group them into categories. I use a list as a dictionary, have a data frame column with abbreviations, and want a category column with the related category name.
dictTranslator <- function(x, dict) {
  sapply(x, function(a) {
    result <- names(which(sapply(dict, function(b) {a %in% b})))
    if(identical(result, character(0))) {
      warning(sprintf('NAs are introduced, "%s" not found!', a), call. = FALSE)
      NA
    } else {
      result
    }
  })
}
my_dictionary <-
  list(embryo=c('00h','01h','02h','e02','03h','04h','05h','06h','e06',
                '08h','10h','12h','e12','14h','16h','18h','20h','e20'),
       larvae=c('L1','L2','L3e','L3l'),
       pupae=c('p1','p2','p3','p4','p5'),
       adult=c('vm','m','vf','f'))
sample data:
df <- data.frame(abbreviation=rep(unlist(my_dictionary), 30000))
nrow(df)
# [1] 930000
system.time(df$category <- dictTranslator(df$abbreviation, my_dictionary))
The function works as expected but the performance is pretty slow (about a minute). Does anyone have an idea how to speed this up, or is there a better solution?
The result looks like:
> head(df,40)
abbreviation category
1 00h embryo
2 01h embryo
...
19 L1 larvae
20 L2 larvae
21 L3e larvae
22 L3l larvae
23 p1 pupae
24 p2 pupae
25 p3 pupae
26 p4 pupae
27 p5 pupae
28 vm adult
29 m adult
30 vf adult
31 f adult
32 00h embryo
33 01h embryo
34 02h embryo
35 e02 embryo
36 03h embryo
I would use match on a named vector, plus removal of the numbers generated by unlisting my_dictionary (for which I used sub). This way you can avoid costly loops.
x <- unlist(my_dictionary)
df$category <- sub('\\d+$', '', names(x)[match(df$abbreviation, x)])
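To see why the trailing digits need stripping: unlist() builds each name by pasting the list element's name with an index, so for the dictionary above:
head(names(x), 3)
# [1] "embryo1" "embryo2" "embryo3"
tail(names(x), 2)
# [1] "adult3" "adult4"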
Run time is less than a second:
df <- data.frame(abbreviation=rep(unlist(my_dictionary), 30000),
                 stringsAsFactors = FALSE)
system.time({df$category <- sub('\\d+$', '', names(x)[match(df$abbreviation, x)])})
#  user  system elapsed
# 0.634   0.003   0.639
I found this approach finished fastest on my machine. Note that the input data is character which results in speed gains when compared with factor input.
## non-factor sample data
my_dictionary <- unlist(my_dictionary)
df <- data.frame(abbreviation = rep(my_dictionary, 30000),
                 stringsAsFactors = FALSE)
system.time({
  ## names
  result <- sapply(df$abbreviation, function(i) {
    names(which(i == my_dictionary))
  })
  ## discard numbers
  df$category <- gsub("\\d", "", result)
})
# user system elapsed
# 3.993 0.000 3.991
As regards the code you provided (i.e., using multiple sapply loops in combination with %in%), keep in mind that %in% (or match) perform rather poorly when you are searching for a single entry (see e.g. this question).
I suggest another data format for the dictionary, using the data.table library:
library(data.table)
md <- list()
for(i in 1:length(my_dictionary)) {
  md[[i]] <- data.table(abbreviation = my_dictionary[[i]], category = names(my_dictionary[i]))
}
md <- rbindlist(md)
Then you simply join:
df <- data.table(df, key = 'abbreviation')
df <- df[md]
Comparison. I ran a comparison of three approaches and here are the results:
Data preparation:
#docendo-discimus and #fdetsch use same data format
md1 <- unlist(my_dictionary)
df1 <- df
#danas.zuokas uses
library(data.table)
md2 <- list()
for(i in 1:length(my_dictionary)) {
md2[[i]] <- data.table(abbreviation = my_dictionary[[i]], category = names(my_dictionary[i]))
}
md2 <- rbindlist(md2)
df2 <- data.table(df, key = 'abbreviation')
Here are three functions to compare
f_dd <- function(x, y) { x$category <- sub('\\d+$', '', names(y)[match(x$abbreviation, y)]); x } #docendo-discimus
f_dz <- function(x, y) { x <- x[y]; x } #danas.zuokas
f_fd <- function(x, y) { x$category <- gsub('\\d', '', sapply(x$abbreviation, function(i) names(which(i == y)))); x } #fdetsch
And here are the results
library(microbenchmark)
microbenchmark(f_dd(df1, md1), f_fd(df1, md1), f_dz(df2, md2), times = 10)
Unit: milliseconds
expr min lq mean median uq max neval
f_dd(df1, md1) 1041.9195 1142.8361 1236.2033 1224.498 1266.9600 1469.7119 10
f_fd(df1, md1) 7106.6641 7417.5538 7924.3541 7868.716 8304.7760 8961.2615 10
f_dz(df2, md2) 35.6389 41.7524 77.2347 63.478 70.1699 183.9867 10
I will use a named vector for this:
a2c <- c ("larvae", "larvae", "larvae", "pupae", "pupae", ...)
names (a2e) <- c ('L1','L2','L3e','p1', 'p2', ...)
In your case you can easily build the vector doing:
a2c <- rep (names (my_dictionary), times = sapply (my_dictionary, length))
names (a2c) <- unlist (my_dictionary)
Make sure they match:
cbind (names (a2c), a2c)
And then:
df[,"category"] <- a2c[df$abbreviation]
In your example df$abbreviation is a factor, so you may need to turn it into character:
df[,"category"] <- a2c[as.character (df$abbreviation)]

YAVL: Yet another vectorized loop in R

I'm only just getting a handle on vectorizing code with R (links to useful examples would help), and I'm trying to find a faster way of handling this loop. a,b,c all have a bunch of numbers in them and I'm trying to find any particular number that occurs in all 3 columns. The loop works, but is super slow:
for(i in 1:length(a)){
  if(any(a[i] == b))
    if(any(a[i] == c))
      print(a[i])
}
Is there an apply function that would work really well here?
Maybe this?
x <- 1:5
y <- 4:10
z <- 4:8
> Reduce(intersect,list(x,y,z))
[1] 4 5
I see you have accepted @joran's solution, but it is really a hidden loop. This is a "vectorized" solution:
> x <- 1:5
> y <- 4:10
> z <- 4:8
> x[ (x %in% y) & (x %in% z) ]
[1] 4 5
You could also count the total number of times each appeared (assuming there are no duplicates in each; if so, run unique on them first). This code also returns the desired numbers as characters; they can be converted back as needed.
x <- 1:5; y <- 4:10; z <- 4:8
foo <- table(c(x,y,z))
names(foo)[foo==3]
## [1] "4" "5"
You can also improve your for loop by using intersect within a loop (basically that is what is done within Reduce):
intersect.list <- function(list) { ## code from stabperf package
  if (is.null(list)) return(NA)
  # Handle empty list
  if (length(list) < 1) return(NA)
  # Start with first element of list
  int <- list[[1]]
  for (v in list[-1]) { int <- intersect(int, v) }
  return(int)
}
intersect.list(list(x,y,z))
4 5
benchmarking :
library(microbenchmark)
set.seed(1)
N <- 1e6
x <- sample(1:100,N,rep=T)
y <- sample(1:100,N,rep=T)
z <- sample(1:100,N,rep=T)
vectorized <- function()x[ (x %in% y) & (x %in% z) ]
microbenchmark(intersect.list(list(x,y,z)),
               vectorized(),
               Reduce(intersect, list(x,y,z)), times=10)
Unit: milliseconds
expr min lq median uq max neval
intersect.list(list(x, y, z)) 73.2862 75.14838 76.77792 85.54216 121.8442 10
vectorized() 131.9560 132.40266 134.47248 139.93902 172.7829 10
Reduce(intersect, list(x, y, z)) 88.4308 90.06320 92.72929 128.05930 133.2982 10
As you can see, the for loop is slightly faster than Reduce and the vectorized solution.
