Compare one set of values with multiple sets in R

I have a vector of values (x).
I would like to determine the length of its overlap with each of the sets sitting in a list (y) - but without running a loop or lapply. Is it possible?
I am really interested in accelerating the execution.
Thank you very much!
Below is an example with an implementation using a loop:
x <- c(1:5)
y <- list(1:5, 2:6, 3:7, 4:8, 5:9, 6:10)
overlaps <- rep(0, length(y))
for (i in seq(length(y))) {
  # overlaps[i] <- length(intersect(x, y[[i]]))  # slower than %in%
  overlaps[i] <- sum(x %in% y[[i]])
}
overlaps
And below is a comparison of some of the methods suggested in the answers. As you can see, the loop is still the fastest, but I'd love to find something faster:
# Function with the loop:
myloop <- function(x, y) {
  overlaps <- rep(0, length(y))
  for (i in seq(length(y))) overlaps[i] <- sum(x %in% y[[i]])
  overlaps
}
# Function with sapply:
mysapply <- function(x, y) sapply(y, function(e) sum(e %in% x))
# Function with map_dbl:
library(purrr)
mymap <- function(x, y) {
  map_dbl(y, ~sum(. %in% x))
}
library(microbenchmark)
microbenchmark(myloop(x, y), mysapply(x, y), mymap(x, y), times = 30000)
# Unit: microseconds
# expr min lq mean median uq max neval
# myloop(x, y) 17.2 19.4 26.64801 21.2 22.6 9348.6 30000
# mysapply(x, y) 27.1 29.5 39.19692 31.0 32.9 20176.2 30000
# mymap(x, y) 59.8 64.1 88.40618 66.0 70.5 114776.7 30000

Use sapply for code compactness.
Even if sapply doesn't bring much of a performance benefit compared to a for loop, at least the code is far more compact. This is the sapply equivalent of your code:
x <- c(1:5)
y <- list(1:5, 2:6, 3:7, 4:8, 5:9, 6:10)
res <- sapply(y, function(e) length(intersect(e, x)))
> res
[1] 5 4 3 2 1 0
Performance gains
As correctly stated by @StupidWolf, it's not sapply that is slowing down the execution, but rather length and intersect. Here is my test with 100,000 executions:
B <- 100000
system.time(replicate(B, sapply(y, function(e) length(intersect(e, x)))))
user system elapsed
9.79 0.01 9.79
system.time(replicate(B, sapply(y, function(e) sum(e %in% x))))
user system elapsed
2 0 2
# Using microbenchmark for more precise results:
library(microbenchmark)
microbenchmark(expr1 = sapply(y, function(e) length(intersect(e, x))), times = B)
expr min lq mean median uq max neval
expr1 81.4 84.9 91.87689 86.5 88.2 7368.7 1e+05
microbenchmark(expr2 = sapply(y, function(e) sum(e %in% x)), times = B)
expr min lq mean median uq max neval
expr2 15.4 16.1 17.68144 16.4 17 7567.9 1e+05
As we can see, the second approach is by far the performance winner.
Hope this helps.

You can use map from purrr: it goes through every element of the list y and applies a function to it. Below I use map_dbl, which returns a numeric vector:
library(purrr)
map_dbl(y, ~sum(. %in% x))
[1] 5 4 3 2 1 0
To see the time:
f1 = function(){
  x <- c(1:5)
  y <- lapply(1:5, function(i) sample(1:10, 5, replace = TRUE))
  map_dbl(y, ~sum(. %in% x))
}
f2 = function(){
  x <- c(1:5)
  y <- lapply(1:5, function(i) sample(1:10, 5, replace = TRUE))
  overlaps <- rep(0, length(y))
  for (i in seq(length(y))) {
    overlaps[i] <- length(intersect(x, y[[i]]))
  }
  overlaps
}
f3 = function(){
  x <- c(1:5)
  y <- lapply(1:5, function(i) sample(1:10, 5, replace = TRUE))
  sapply(y, function(i) sum(i %in% x))
}
Let's put it to test:
system.time(replicate(10000,f1()))
user system elapsed
1.27 0.02 1.35
system.time(replicate(10000,f2()))
user system elapsed
1.72 0.00 1.72
system.time(replicate(10000,f3()))
user system elapsed
0.97 0.00 0.97
So if you want speed, do something like sapply + %in%; if you want something easily readable, use purrr. A vapply variant is sketched below as a further small tweak.
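This is only a sketch (not part of the timings above), and the name myvapply is made up: vapply is the type-stable cousin of sapply, and declaring the result shape up front skips sapply's simplification step.
# Sketch: one integer per list element is declared up front via integer(1L)
myvapply <- function(x, y) vapply(y, function(e) sum(e %in% x), integer(1L))
myvapply(c(1:5), list(1:5, 2:6, 3:7, 4:8, 5:9, 6:10))
# [1] 5 4 3 2 1 0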

Here is an option using data.table which should be fast if you have a long list of vectors in y.
library(data.table)
DT <- data.table(ID=rep(seq_along(y), lengths(y)), Y=unlist(y))
DT[.(Y=x), on=.(Y)][, .N, ID]
In addition, if you need to run this for multiple x vectors, I would suggest creating a data.table that combines all of the x before running the code (a sketch follows the output below).
output:
ID N
1: 1 5
2: 2 4
3: 3 3
4: 4 2
5: 5 1
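For the multiple-x case suggested above, one possible sketch (the list xs, the XID column, and the nomatch handling are illustrative assumptions, not from the original answer) is to stack all the x vectors into a second data.table and do a single join:
# Hypothetical: several query vectors stacked into one table, then one join
xs <- list(x1 = 1:5, x2 = 3:8)
XT <- data.table(XID = rep(names(xs), lengths(xs)), Y = unlist(xs))
# count matching rows per (query vector, list element) pair
DT[XT, on = .(Y), nomatch = 0L][, .N, by = .(XID, ID)]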

Related

extract elements from a list based on a vector of indices

I want to extract elements from a list based on indices stored in a separate vector.
This is my attempt at it:
list_positions<-c(2,3,4)
my_list<-list(c(1,3,4),c(2,3,4,5,6),c(1,2,3,4,6))
my_fun <- function(x, y){
  x[y]
}
mapply(my_fun,x=my_list,y=list_positions)
Maybe somebody can suggest a faster solution. My list has around 14 million elements. I tried parallel solutions, where instead of mapply I used clusterMap, but I would still like better performance.
We may unlist the list, create an index based on the lengths of 'my_list', and extract the elements from the resulting vector:
v1 <- unlist(my_list)
p1 <- list_positions
v1[cumsum(lengths(my_list))- (lengths(my_list)-p1)]
#[1] 3 4 4
Benchmarks
set.seed(24)
lst <- lapply(1:1e6, function(i) sample(1:10, sample(2:5), replace=FALSE))
p2 <- sapply(lst, function(x) sample(length(x), 1))
system.time({
  r1 <- mapply(`[`, lst, p2)
})
#user system elapsed
# 1.84 0.02 1.86
system.time( r4 <- mapply(my_fun, lst, p2) )
# user system elapsed
# 1.88 0.01 1.89
system.time({ r4 <- mapply(my_fun, lst, p2) }) #placing inside the {}
# user system elapsed
# 2.31 0.00 2.31
system.time({ ## cccmir's function
  r3 <- mapply(my_func1, lst, p2)
})
# user system elapsed
# 12.10 0.03 12.13
system.time({
  v2 <- unlist(lst)
  r2 <- v2[cumsum(lengths(lst)) - (lengths(lst) - p2)]
})
# user system elapsed
# 0.14 0.00 0.14
identical(r1, r2)
#[1] TRUE
You should use a for loop in this case, for example:
library(microbenchmark)
list_positions<-c(2,3,4)
my_list<-list(c(1,3,4),c(2,3,4,5,6),c(1,2,3,4,6))
my_fun <- function(x, y){
  x[y]
}
mapply(my_fun, x = my_list, y = list_positions)
my_func1 <- function(aList, positions){
  res <- numeric(length(aList))
  for(i in seq_along(aList)) {
    res[i] <- aList[[i]][positions[i]]
  }
  return(res)
}
my_func2 <- function(aList, positions) {
  v1 <- unlist(aList)
  p1 <- positions
  v1[cumsum(lengths(aList)) - (lengths(aList) - p1)]
}
microbenchmark(mapply(my_fun,x=my_list,y=list_positions), my_func1(my_list, list_positions), my_func2(my_list, list_positions), times = 1000)
#Unit: microseconds
# expr min lq mean median uq max neval
#mapply(my_fun, x = my_list, y = list_positions) 12.764 13.858 17.453172 14.588 16.775 119.613 1000
# my_func1(my_list, list_positions) 5.106 5.835 7.328412 6.200 6.929 38.292 1000
# my_func2(my_list, list_positions) 2.553 3.282 4.337367 3.283 3.648 52.514 1000
# @akrun's solution is the fastest

R-like way to add a % column to a data frame

I want to add a column that shows the percentage each row contributes to the sum of the column (sorry for my poor mathematical English here).
> trees['Heigth_%'] <- round((100 / sum(trees$Height) * trees$Height), digits=2)
> head(trees)
Girth Height Volume Heigth_%
1 8.3 70 10.3 2.97
2 8.6 65 10.3 2.76
3 8.8 63 10.2 2.67
4 10.5 72 16.4 3.06
5 10.7 81 18.8 3.44
6 10.8 83 19.7 3.52
This works.
But the question is whether this is a good and R-like way.
e.g. Is sum() called for each row? Or is R intelligent enough here?
To answer your question whether sum is called for every row or R is intelligent enough, you can use trace:
df = data.frame(a = 1:10, b = 21:30)
df['b_%'] = round((100 / sum(df$b) * df$b), digits=2)
trace('sum')
round((100 / sum(df$b) * df$b), digits=2)
untrace('sum')
This shows only one call to the sum function. Afterwards, R recognizes that the lengths of trees$Height and sum(trees$Height) differ and recycles the shorter one until it has the same length as the longer one.
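A minimal illustration of that recycling, with made-up numbers: 100 / sum(v) is a length-one value that gets recycled across the longer vector v.
v <- c(10, 20, 30)
100 / sum(v) * v  # sum(v) is evaluated once; the scalar result is recycled
# [1] 16.66667 33.33333 50.00000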
Converting first to data.table and then using prop.table is a bit faster than f1-f3 (if you neglect the conversion to a data.table, as this is usually done only once for all subsequent commands).
# Get data
data(trees)
# Load package & convert to data.table
library(data.table)
trees <- as.data.table(trees)
# data.table way to create the new variable
f4 <- function(trees) {
  trees[, Heigth_percentage := round(prop.table(Height)*100, 2)]
}
Here are the benchmark results:
# > microbenchmark(r1 <- f1(trees),
# +                r2 <- f2(trees),
# +                r3 <- f3(trees),
# +                r4 <- f4(trees),
# +                times = 10000)
# Unit: microseconds
# expr min lq mean median uq max neval cld
# r1 <- f1(trees) 616.616 666.290 730.8883 683.122 708.164 8390.818 10000 b
# r2 <- f2(trees) 617.437 666.701 730.3211 683.533 709.191 8100.574 10000 b
# r3 <- f3(trees) 596.500 655.616 721.1057 672.243 697.080 55048.757 10000 b
# r4 <- f4(trees) 551.342 612.922 680.7581 633.037 665.059 54672.712 10000 a
To begin with, Vandenman's answer is much more adequate and precise. What follows is not really worth an answer but, as usual, is not readable as a comment.
I have added prop.table() and data.table() (see majom's answer) approaches to the timings. With 40k rows data.table() is a bit closer to the rest, but still slower (~3 ms to ~3.7 ms), with 400k rows it starts to be comparable, and with 4M rows it is finally faster than the rest:
library(microbenchmark)
trees <- data.frame(Height = runif(4000000, 9, 11),
                    Heigth_PCT = numeric(4000000))
trees_dt <- as.data.table(trees)
f1 <- function(trees) {
  trees$Heigth_PCT <- round((100 / sum(trees$Height) * trees$Height), digits = 2)
  return(trees)
}
f2 <- function(trees) {
  sum_trees <- sum(trees$Height)
  trees$Heigth_PCT <- round((100 / sum_trees * trees$Height), digits = 2)
  return(trees)
}
f3 <- function(trees) {
  trees$Heigth_PCT <- round(prop.table(trees$Height)*100, digits = 2)
  return(trees)
}
f4 <- function(trees_dt) {
  trees_dt[, Heigth_PCT := round(prop.table(Height)*100, 2)]
}
# Time all four functions
microbenchmark(r1 <- f1(trees),
               r2 <- f2(trees),
               r3 <- f3(trees),
               r4 <- f4(trees_dt),
               times = 100)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# r1 <- f1(trees) 296.4452 309.3853 373.5945 318.7987 400.0373 639.8556 100 a
# r2 <- f2(trees) 296.3453 310.6638 381.4048 323.0655 474.9295 682.2172 100 a
# r3 <- f3(trees) 304.3185 317.0654 383.9600 328.5494 395.6238 783.2435 100 a
# r4 <- f4(trees_dt) 304.3327 315.4685 361.9526 325.8711 366.1153 722.7629 100 a
sapply(list(r2, r3, as.data.frame(r4)), identical, r1)
# [1] TRUE TRUE TRUE
Edit: prop.table() added.
Edit 2: data.table() added.

If else statement for a list with matrices

I am learning how to properly set up loops but still struggle with the correct indexing and syntax.
Below I have a list of two matrices, 2 columns x 3 rows each.
I want to look at the second column in each matrix and ideally create a new column with value 1 if x > 0.50, else 0. Help with both ways to do it would be much appreciated, as it should help me learn. Also, if you know a good reference on structuring loops over higher-dimensional lists, that would be much appreciated too. Thanks so much.
a <- c(0.1,0.2,0.3)
b <- c(0.8,0.2,0.5)
c <- c(0.4,0.9,1.0)
d <- c(0.7,0.9,0.2)
ab <- cbind(a,b)
cd <- cbind(c,d)
abcd <- list(ab,cd)
presabs <- vector("list",ncol(y))
# Trying lapply but indexing something wrong
res <- lapply(abcd, function(x) if (x[,2]>0.5) {1} else {0})
# Other method also not working:
for (i in 1:length(abcd)) {
  for (j in 1:length(a)) {
    if (abc[[i]][j] > 0.50) {
      presabs[j] <- 1
    } else {
      presabs[j] <- 0
    }
  }
}
You can use ifelse, but in this case it is not even needed. The OP's question is about creating a binary variable as the third column. This can be done using many variations:
lapply(abcd, function(x) cbind(x,new= +(x[,2]>0.5)))
or
lapply(abcd, function(x) cbind(x,new= (x[,2]>0.5)+0L))
Or
lapply(abcd, function(x) cbind(x,new= (x[,2]>0.5)*1))
Or
lapply(abcd, function(x) cbind(x,new= as.integer(x[,2]>0.5)))
If the values to be changed are different, for example
a <- 3
b <- 2
lapply(abcd, function(x) cbind(x, new= c(a, b)[(x[,2] > 0.5)+1L]))
Benchmarks
set.seed(25)
abcd1 <- lapply(1:60, function(i) matrix(rnorm(1e5*2), ncol=2))
viaChris <- function() lapply(abcd1, function(x) f(x, a=1, b=0, thresh =.5))
akrun <- function() lapply(abcd1, function(x) cbind(x, lab= +(x[,2] >0.5)))
system.time(viaChris())
# user system elapsed
#1.683 0.000 1.444
system.time(akrun())
# user system elapsed
# 0.481 0.000 0.322
library(microbenchmark)
microbenchmark(akrun(), viaChris(), unit='relative', times=20L)
#Unit: relative
# expr min lq mean median uq max neval cld
# akrun() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 20 a
#viaChris() 3.726728 3.459581 3.475673 3.488114 3.400262 3.445557 20 b
The above answer works because logical values are coerced into 1 (for TRUE) and 0 (for FALSE). A more general solution might look something like:
lapply(abcd, function(x) cbind(x, ifelse(x[, 2] > .5, a, b)))
where a and b are numeric values you can specify. We can even be more general. For example:
## Define a general function that adds a new column of values
## based on whether or not the values in the i'th column of the
## matrix exceeds a threshold.
f = function(x, a, b, thresh, i = 2)
  cbind(x, lab = ifelse(x[, i] > thresh, a, b))
## Apply the function above to each matrix in the list 'abcd', with
## a = 1, b = 0, and thresh = .5.
lapply(abcd, function(x) f(x, a = 1, b = 0, thresh = .5))

Is it possible to speed up my function for creating a correlation matrix?

I have written the following function to estimate the pairwise correlations of multinomial variables using so-called Cramér's V. I use the vcd package for this purpose, but to my knowledge there is no existing function that would create a symmetrical correlation matrix of V from a matrix or data.frame similar to cor.
The function is:
require(vcd)
get.V <- function(y){
  col.y <- ncol(y)
  V <- matrix(ncol = col.y, nrow = col.y)
  for(i in 1:col.y){
    for(j in 1:col.y){
      V[i,j] <- assocstats(table(y[,i], y[,j]))$cramer
    }
  }
  return(V)
}
However, for large numbers of variables it gets relatively slow.
no.var<-5
y<-matrix(ncol=no.var,sample(1:5,100*no.var,TRUE))
get.V(y)
As you increase no.var, computing time may explode. Since I need to apply this to a data.frame of length 100 and higher, my question is whether it is possible to 'speed up' my function with more elegant programming. Thank you.
As well as reducing the number of tests performed, or otherwise optimising the running of the whole function, we might be able to make assocstats faster. We'll start by establishing a test case to make sure we don't accidentally make a faster function that's incorrect.
x <- vcd::Arthritis$Improved
y <- vcd::Arthritis$Treatment
correct <- vcd::assocstats(table(x, y))$cramer
correct
## [1] 0.3942
is_ok <- function(x) stopifnot(all.equal(x, correct))
We'll start by making a version of assocstats that's very close to the
original.
cramer1 <- function (x, y) {
  mat <- table(x, y)
  tab <- summary(MASS::loglm(~1 + 2, mat))$tests
  phi <- sqrt(tab[2, 1] / sum(mat))
  cont <- sqrt(phi ^ 2 / (1 + phi ^ 2))
  sqrt(phi ^ 2 / min(dim(mat) - 1))
}
is_ok(cramer1(x, y))
The slowest operation here is going to be loglm, so before we try
making that faster, it's worth looking for an alternative approach. A
little googling finds a useful blog
post.
Let's also try that:
cramer2 <- function(x, y) {
  chi <- chisq.test(x, y, correct = FALSE)$statistic[[1]]
  ulength_x <- length(unique(x))
  ulength_y <- length(unique(y))
  sqrt(chi / (length(x) * (min(ulength_x, ulength_y) - 1)))
}
is_ok(cramer2(x, y))
How does the performance stack up:
library(microbenchmark)
microbenchmark(
cramer1(x, y),
cramer2(x, y)
)
## Unit: microseconds
## expr min lq median uq max neval
## cramer1(x, y) 1080.0 1149.3 1182.0 1222.1 2598 100
## cramer2(x, y) 800.7 850.6 881.9 934.6 1866 100
cramer2() is faster. chisq.test() is likely to be the bottleneck, so
lets see if we can make that function faster by doing less:
chisq.test() does a lot more than compute the test-statistic, so it's
likely that we can make it faster. A few minutes careful work reduces
the function to:
chisq_test <- function (x, y) {
  O <- table(x, y)
  n <- sum(O)
  E <- outer(rowSums(O), colSums(O), "*") / n
  sum((abs(O - E))^2 / E)
}
We can then create a new cramer3() that uses chisq_test().
cramer3 <- function(x, y) {
  chi <- chisq_test(x, y)
  ulength_x <- length(unique(x))
  ulength_y <- length(unique(y))
  sqrt(chi / (length(x) * (min(ulength_x, ulength_y) - 1)))
}
is_ok(cramer3(x, y))
microbenchmark(
  cramer1(x, y),
  cramer2(x, y),
  cramer3(x, y)
)
## Unit: microseconds
## expr min lq median uq max neval
## cramer1(x, y) 1088.6 1138.9 1169.6 1221.5 2534 100
## cramer2(x, y) 796.1 840.6 865.0 906.6 1893 100
## cramer3(x, y) 334.6 358.7 373.5 390.4 1409 100
And now that we have our own simple version of chisq.test() we could eke out a little more speed by using the results of table() to figure out the number of unique elements in x and y:
cramer4 <- function(x, y) {
  O <- table(x, y)
  n <- length(x)
  E <- outer(rowSums(O), colSums(O), "*") / n
  chi <- sum((abs(O - E))^2 / E)
  sqrt(chi / (length(x) * (min(dim(O)) - 1)))
}
is_ok(cramer4(x, y))
microbenchmark(
  cramer1(x, y),
  cramer2(x, y),
  cramer3(x, y),
  cramer4(x, y)
)
## Unit: microseconds
## expr min lq median uq max neval
## cramer1(x, y) 1097.6 1145.8 1183.3 1233.3 2318 100
## cramer2(x, y) 800.7 840.5 860.7 895.5 2079 100
## cramer3(x, y) 334.4 353.1 365.7 384.1 1654 100
## cramer4(x, y) 248.0 263.3 273.2 283.5 1342 100
Not bad - we've made it 4 times faster just using R code. From here, you could try to get even more speed by:
Using tcrossprod() instead of outer() (a sketch follows this list)
Making a faster version of table() for this special (2d) case
Using Rcpp to compute the test-statistic from the tabular data
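Here is a sketch of the first suggestion (not benchmarked here): for two vectors, tcrossprod(a, b) returns the same outer-product matrix as outer(a, b, "*") but goes through BLAS, which may be marginally faster for larger tables. The name cramer5 is just a label for this sketch.
cramer5 <- function(x, y) {
  O <- table(x, y)
  n <- length(x)
  E <- tcrossprod(rowSums(O), colSums(O)) / n  # same values as outer(rowSums(O), colSums(O), "*") / n
  chi <- sum((O - E)^2 / E)
  sqrt(chi / (n * (min(dim(O)) - 1)))
}
is_ok(cramer5(x, y))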
You are best off using the vectorized version of outer like Tyler suggested. You can still get a performance boost by writing a function that calculates just Cramer's V. The assocstats function uses summary on the table, and that calculates a lot of statistics you don't want. Replacing the call to assocstats with a user-defined function along the lines of
cv <- function(x, y) {
  t <- table(x, y)
  chi <- chisq.test(t)$statistic
  cramer <- sqrt(chi / (NROW(x) * (min(dim(t)) - 1)))
  cramer
}
This new function, by calculating only Cramer's V, runs in about 40% of the time required for assocstats. You could potentially speed it up again by reducing the chisq.test to something that only calculates the chi-square test statistic.
Even if you just adjust your loop index values to exploit the fact that you have a symmetric matrix with 1 on the diagonal, and use this cv function instead of assocstats, you are looking at easily a 5-fold increase in performance.
Edit: As requested, the full code I've been using to get a 4x speed up is
cv <- function(x, y) {
  t <- table(x, y)
  chi <- suppressWarnings(chisq.test(t))$statistic
  cramer <- sqrt(chi / (NROW(x) * (min(dim(t)) - 1)))
  cramer
}
get.V3 <- function(y, fill = TRUE){
  col.y <- ncol(y)
  V <- matrix(ncol = col.y, nrow = col.y)
  for(i in 1:(col.y - 1)){
    for(j in (i + 1):col.y){
      V[i,j] <- cv(y[,i], y[,j])
    }
  }
  diag(V) <- 1
  if (fill) {
    for (i in 1:ncol(V)) {
      V[, i] <- V[i, ]
    }
  }
  V
}
It looks to be very similar to what Hadley suggests above, although his version of the function to get Cramer's V uses correct = FALSE in chisq.test. If all the tables are larger than 2x2, the setting of correct doesn't matter. For 2x2 tables, the results will vary depending on the argument. It is probably best to follow his example and set it to correct = FALSE, so that everything is calculated the same regardless of table size.
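A quick illustration of that point, with invented table values: chisq.test applies Yates' continuity correction only to 2x2 tables, so correct changes the statistic there and is ignored for anything larger.
t22 <- matrix(c(10, 20, 30, 40), nrow = 2)
chisq.test(t22, correct = TRUE)$statistic   # Yates-corrected
chisq.test(t22, correct = FALSE)$statistic  # uncorrected; a different value
t33 <- matrix(c(10, 20, 30, 40, 50, 60, 25, 35, 45), nrow = 3)
chisq.test(t33, correct = TRUE)$statistic   # identical to the next line:
chisq.test(t33, correct = FALSE)$statistic  # correct has no effect beyond 2x2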
You could reduce the calculation time by calculating only one half of your matrix:
get.V2 <- function(y){
  cb <- combn(1:ncol(y), 2, function(i) assocstats(table(y[, i[1]], y[, i[2]]))$cramer)
  m <- matrix(0, ncol(y), ncol(y))
  m[lower.tri(m)] <- cb
  diag(m) <- 1
  ## copy the lower.tri to the upper.tri, suggested by @iacobus
  for (i in 1:nrow(m)) {
    m[i, ] <- m[, i]
  }
  return(m)
}
EDIT: added @iacobus's suggestion to populate the upper.tri of the matrix and added a little benchmark:
library("vcd")
library("qdapTools")
library("rbenchmark")
## suggested by @TylerRinker
get.V3 <- function(y)v_outer(y, function(i, j)assocstats(table(i, j))$cramer)
set.seed(1)
no.var<-10
y<-matrix(ncol=no.var,sample(1:5,100*no.var,TRUE))
benchmark(get.V(y), get.V2(y), get.V3(y), replications=10, order="relative")
# test replications elapsed relative user.self sys.self user.child sys.child
#2 get.V2(y) 10 0.992 1.000 0.988 0.000 0 0
#1 get.V(y) 10 2.239 2.257 2.232 0.004 0 0
#3 get.V3(y) 10 2.495 2.515 2.484 0.004 0 0
This uses a vectorized version of outer:
library(qdapTools)
y <- matrix(ncol=no.var,sample(1:5,100*no.var,TRUE))
get.V2 <- function(x, y){
  assocstats(table(x, y))$cramer
}
v_outer(y, get.V2)
## > v_outer(y, get.V2)
## V1 V2 V3 V4 V5
## V1 1.000 0.224 0.158 0.195 0.217
## V2 0.224 1.000 0.175 0.163 0.240
## V3 0.158 0.175 1.000 0.208 0.145
## V4 0.195 0.163 0.208 1.000 0.189
## V5 0.217 0.240 0.145 0.189 1.000
Edit
On 1000 variables these are the system times:
Tyler: Time difference of 38.79437 mins
sgibb: Time difference of 19.54342 mins
Clearly sgibb's approach is superior.

convert a string into value in R

Take for example:
s <- c("1y1","2y1","1.5y2","1.8y2")
y1 and y2 mean multiplying by 2 and by 1.8, respectively.
1y1 = 1*2=2
1.5y2 = 1.5*1.8 = 2.7
So, the result would be c(2,4,2.7,3.24).
How can I simply get it?
I can do it by c(1,2,1.5,1.8)*c(2,2,1.8,1.8), but that is not elegant if the string is very long.
Using gsub to replace y1 and y2 with their respective numeric multipliers, one can then use the classic eval(parse(..)) to evaluate the resulting numeric expressions.
s <- c("1y1","2y1","1.5y2","1.8y2")
s <- gsub('y1', '*2', s)
s <- gsub('y2', '*1.8', s)
sapply(s, function(x) eval(parse(text = x)))
1*2 2*2 1.5*1.8 1.8*1.8
2.00 4.00 2.70 3.24
EDIT: a more elegant option using gsubfn and its flexible function-substitution feature.
> library(gsubfn)
> sub.f <- function(x, y) as.numeric(x) * ifelse(y == 1, 2, 1.8)
> as.numeric(gsubfn('(.*)y(.*)', sub.f, s))
[1] 2.00 4.00 2.70 3.24
We split each expression into two components, x and y, and use them (once coerced to numeric) to compute the result manually.
Alternatively, as a one-liner:
> strapply(s, '(.*)y(.*)', ~ as.numeric(x) * ifelse(y == 1, 2, 1.8), simplify = TRUE)
[1] 2.00 4.00 2.70 3.24
You can try this:
s = sub(pattern="y", replacement="*y", x=s)
sub will replace the first occurrence of y in each string with *y, so the expression is syntactically correct to R. However, it's not really an expression yet but a character vector. Note that sub replaces only the first occurrence; if there is more than one "y" per string, you have to use gsub instead. For more information see ?sub.
y1 = 2
y2 = 1.8
s = parse(text=s) # parse will convert s to a R expression
s.num = sapply(s, eval) # and this will evaluate the expression to give the results
[1] 2.00 4.00 2.70 3.24
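To make the sub versus gsub distinction above concrete, here is a tiny made-up example with two "y"s in one string:
sub("y", "*y", "1y1y2")   # "1*y1y2"  - only the first "y" is replaced
gsub("y", "*y", "1y1y2")  # "1*y1*y2" - every "y" is replaced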
Here is another idea:
s <- c("1y1", "2y1", "1.5y2", "1.8y2")
s2 <- as.numeric(sub("y[0-9]", "", s))
mult <- rep(NA, len = length(s))
mult[grep("y1", s)] <- 2
mult[grep("y2", s)] <- 1.8
s2 * mult
#[1] 2.00 4.00 2.70 3.24
On a larger scale:
S = sample(s, 1e4, T)
y1 = 2; y2 = 1.8
f1 = function() {
  S <- gsub('y1', '*2', S)
  S <- gsub('y2', '*1.8', S)
  sapply(S, function(x) eval(parse(text = x)))
}
f2 = function() {
  S = sub(pattern = "y", replacement = "*y", x = S)
  S = parse(text = S)
  S.num = sapply(S, eval)
  S.num
}
f3 = function() {
  s2 <- as.numeric(sub("y[0-9]", "", S))
  mult <- rep(NA, len = length(S))
  mult[grep("y1", S)] <- 2
  mult[grep("y2", S)] <- 1.8
  s2 * mult
}
library(microbenchmark)
microbenchmark(f1(), f2(), f3(), times = 10)
#Unit: milliseconds
# expr min lq median uq max neval
# f1() 1940.63876 2037.03500 2064.4605 2072.98875 2101.73358 10
# f2() 93.43298 98.69724 115.8716 125.07774 153.43385 10
# f3() 12.91724 13.01781 13.1805 13.30283 17.94973 10
sum(f1() != f2())
#[1] 0
sum(f2() != f3())
#[1] 0
