How to calculate "1+1" without using eval(parse(...)?

I have been looking for quite a while, but the answer always seems to be to use eval(parse(text = "1+1")).
I have a column in my data frame containing strings such as "1+1*6" and "1*4/3", and I want to compute these into a new column without using eval(parse(...)), as I need to do it over 8 million rows.
It is basically an attempt to answer the question: given the numbers 1:9, find all the solutions where (A_B_C) / (D_E_F) = GHI, where A:I are the numbers 1:9 (without repeating) and the underscores are the four operators *, /, +, - (also without repeating).
I created a data frame with all the permutations of 1:9, and for each of these I calculated the permutations of the four operators.
require(gtools)
x <- permutations(n = 9, r = 9, v = 1:9)
y <- permutations(n = 4, r = 4, v = c("*", "/", "+", "-"))
for (i in 1:nrow(x)) {
  for (j in 1:nrow(y)) {
    math <- paste("(", x[i, 1], y[j, 1], x[i, 2], y[j, 2], x[i, 3], ")", "/",
                  "(", x[i, 4], y[j, 3], x[i, 5], y[j, 4], x[i, 6], ")")
    equals <- eval(parse(text = math))
    sum <- as.numeric(paste0(x[i, 7], x[i, 8], x[i, 9]))
    if (sum == equals) {
      print(c(i, j))
    }
  }
}
However, this takes far too long, hence I am trying to remove the time-consuming eval(parse(...)) step.
Any help would be really appreciated. Thanks!
Freddie

Vectorisation is key. Build all the expressions up front, collapse each operator permutation's expressions into a single c(...) string, and call parse() once per permutation:
math <- apply(
  y,
  1,
  function(j) {
    paste("(", x[, 1], j[1], x[, 2], j[2], x[, 3], ")/(",
          x[, 4], j[3], x[, 5], j[4], x[, 6], ")")
  }
)
math <- apply(math, 2, paste, collapse = ",")
math <- paste("c(", math, ")")
equals <- sapply(parse(text = math), eval)
sum <- matrix(x[, 7] * 100 + x[, 8] * 10 + x[, 9], nrow = nrow(x), ncol = nrow(y))
abs(sum - equals) < 1e-8
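To make the batching concrete, here is a minimal sketch (using a hypothetical two-row x and a single operator permutation, not part of the original answer). All row expressions are pasted into one c(...) string, so parse() and eval() run once per operator permutation rather than once per row:
x_small <- rbind(c(1, 2, 3, 4, 5, 6, 7, 8, 9),
                 c(9, 8, 7, 6, 5, 4, 3, 2, 1))
j <- c("*", "/", "+", "-")
math_small <- paste("(", x_small[, 1], j[1], x_small[, 2], j[2], x_small[, 3],
                    ")/(", x_small[, 4], j[3], x_small[, 5], j[4], x_small[, 6], ")")
math_small <- paste("c(", paste(math_small, collapse = ","), ")")
math_small
#> [1] "c( ( 1 * 2 / 3 )/( 4 + 5 - 6 ),( 9 * 8 / 7 )/( 6 + 5 - 4 ) )"
eval(parse(text = math_small))  # one parse/eval call returns both results
#> [1] 0.2222222 1.4693878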
Let's see what the difference in speed is
require(gtools)
x <- permutations(n = 9, r = 9, v = 1:9)
y <- permutations(n = 4, r = 4, v = c("*", "/", "+", "-"))
x <- x[sample(nrow(x), 40), ]
y <- y[sample(nrow(y), 20), ]
library(microbenchmark)
microbenchmark(
  loop = for (i in 1:nrow(x)) {
    for (j in 1:nrow(y)) {
      math <- paste("(", x[i, 1], y[j, 1], x[i, 2], y[j, 2], x[i, 3], ")", "/",
                    "(", x[i, 4], y[j, 3], x[i, 5], y[j, 4], x[i, 6], ")")
      equals <- eval(parse(text = math))
      sum <- as.numeric(paste0(x[i, 7], x[i, 8], x[i, 9]))
      if (sum == equals) {
        print(c(i, j))
      }
    }
  },
  vectorised = {
    math <- apply(
      y,
      1,
      function(j) {
        paste("(", x[, 1], j[1], x[, 2], j[2], x[, 3], ")/(",
              x[, 4], j[3], x[, 5], j[4], x[, 6], ")")
      }
    )
    math <- apply(math, 2, paste, collapse = ",")
    math <- paste("c(", math, ")")
    equals <- sapply(parse(text = math), eval)
    sum <- matrix(x[, 7] * 100 + x[, 8] * 10 + x[, 9], nrow = nrow(x), ncol = nrow(y))
    abs(sum - equals) < 1e-8
  }
)
The results:
Unit: milliseconds
       expr        min         lq       mean     median         uq       max neval cld
       loop 158.666383 162.084918 167.477490 165.880665 170.258076 240.43746   100   b
 vectorised   8.540623   8.966214   9.613615   9.142515   9.413117  17.88282   100  a

Related

Format a number in Indian currency format

Indian currency format uses a comma separator after every 2 digits, except for the last section, which has 3 digits. Can anyone suggest a function in R that can achieve that?
Examples:
input: 12345678.23, output: 1,23,45,678.23
input: 4356, output: 4,356
input: 435, output: 435
I don't know of any native way to do this, but the following function will achieve it for you:
nums <- function(n) {
  dec <- round(n %% 1, 2)
  dec <- ifelse(dec < 0.01, "", substr(dec, 2, 4))  # keep ".xx", or drop it entirely
  int <- n %/% 1
  ints <- vapply(int, function(x) {
    x <- as.character(x)
    len <- nchar(x)
    if (len <= 3) return(x)
    # reverse the string, mark off 3 digits, then pairs, and reverse back
    rev_x <- paste(rev(unlist(strsplit(x, ""))), collapse = "")
    str <- paste0(substr(rev_x, 1, 3), ",")
    str2 <- substr(rev_x, 4, 100)
    str2 <- gsub("(\\d{2})", "\\1,", str2)
    rev_x <- paste0(str, str2)
    return(paste(rev(unlist(strsplit(rev_x, ""))), collapse = ""))
  }, character(1))
  return(sub("^,", "", paste0(ints, dec)))
}
You can use it like this:
nums(c(1234.12, 342, 35123251.12))
#> [1] "1,234.12" "342" "3,51,23,251.12"
Here might be one option
f <- Vectorize(function(x, digits = 2) {
  r <- ""
  if (grepl(".", x, fixed = TRUE)) {
    r <- c(sub(".*(?=\\.)", "", x, perl = TRUE))  # keep the decimal part, e.g. ".23"
    x <- sub("\\..*", "", x)
  }
  n <- nchar(x)
  r <- paste0(substr(x, n - 2, n), r)  # last group of 3 digits
  x <- substr(x, 1, n - 3)
  while (nchar(x)) {                   # then groups of 2
    n <- nchar(x)
    r <- c(substr(x, n - 1, n), r)
    x <- substr(x, 1, n - 2)
  }
  paste0(r, collapse = ",")
})
and you will see
> f(c(12345678.23, 4356, 435, 900425, 1230010.45))
[1] "1,23,45,678.23" "4,356" "435" "9,00,425"
[5] "12,30,010.45"

Row mean and standard deviation using data.table

x <- matrix(rnorm(500 * 10), nrow = 500, ncol = 10)
x[, 1] <- 1:500
x <- data.frame(x)
names(x) <- c('ID', 2000:2008)
library(data.table)
setDT(x)
I want to calculate the mean, sd, and number of data points per row, but I am getting an error:
x[, c("meanY", "sdY", "nY") := .(rowMeans(.SD, na.rm = TRUE),
                                 sd(.SD, na.rm = TRUE),
                                 rowSums(!is.na(.SD))),
  .SDcols = c(2:10)]
The issue lies in sd(), which doesn't work row-wise.
x[,
  c("meanY", "sdY", "nY") :=
    .(rowMeans(.SD, na.rm = TRUE),
      apply(.SD, 1, sd, na.rm = TRUE),
      rowSums(!is.na(.SD))),
  .SDcols = 2:10]
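If this is a speed-sensitive step, a row-wise sd is also available without apply() via the matrixStats package (a sketch, assuming that package is an acceptable dependency):
library(matrixStats)
x[, c("meanY", "sdY", "nY") := .(rowMeans(.SD, na.rm = TRUE),
                                 rowSds(as.matrix(.SD), na.rm = TRUE),  # C-level row-wise sd
                                 rowSums(!is.na(.SD))),
  .SDcols = 2:10]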
Assuming you want the output as a list, you can try the following:
op <- c("mean", "sd", "length")
r <- lapply(op, function(v) apply(x[, -1], 1, eval(parse(text = v))))  # drop the ID column
names(r) <- op
where it should work with your data.frame example:
x <- matrix(rnorm(500 * 10), nrow = 500, ncol = 10)
x[, 1] <- 1:500
x <- data.frame(x)
names(x) <- c('ID', 2000:2008)
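Another route (a sketch, not from the original answers, reusing the setDT(x) data from the question) is to melt to long format and aggregate by ID, which avoids row-wise apply() entirely:
library(data.table)
setDT(x)
long <- melt(x, id.vars = "ID", variable.name = "year")  # one row per (ID, year)
stats <- long[, .(meanY = mean(value, na.rm = TRUE),
                  sdY   = sd(value, na.rm = TRUE),
                  nY    = sum(!is.na(value))),
              by = ID]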

Parallelization with data.table

I have the following problem. I have a piecewise linear function described by (xPoints, yPoints) and want to compute quickly, over and over again, the implied y-value for a long list of x's, where x could fall outside the range of xPoints. I have coded a function f_pwl that computes the implied y-value, but it is slow, so I was trying to parallelize its call. However, that is actually slower than using data.table := syntax. I would appreciate suggestions to speed things up, either by improving my f_pwl function or by implementing an efficient parallelization, as I have access to 20 cores.
Here is some sample code.
# libraries
require(data.table)   # for fread, work with large data
require(abind)        # for abind()
require(foreach)      # for parallel processing, used with doParallel
require(doParallel)   # for parallel processing, used with foreach

f_pwl <- function(x) {
  temp <- as.vector(rep(NA, length = length(x)), mode = "double")
  for (i in seq(from = 1, to = length(x), by = 1)) {
    if (x[i] > max(xPoints) | x[i] < min(xPoints)) {
      # nothing to do, temp[i] <- NA
    } else if (x[i] == max(xPoints)) {
      # value equals max(yPoints)
      temp[i] <- max(yPoints)
    } else {
      # value is f_pwl(x)
      xIndexVector <- as.logical(x[i] >= xPoints &
                                   abind(xPoints[2:length(xPoints)], max(xPoints)) > x[i])
      xIndexVector_plus1 <- shift(xIndexVector, n = 1, fill = FALSE, type = "lag")
      alpha_j <- (xPoints[xIndexVector_plus1] - x[i]) /
        (xPoints[xIndexVector_plus1] - xPoints[xIndexVector])
      temp[i] <- alpha_j %*% yPoints[xIndexVector] + (1 - alpha_j) %*% yPoints[xIndexVector_plus1]
    }
  } # end for i
  as.vector(temp, mode = "double")
}
## Main program
xPoints <- c(4, 9, 12, 15, 18, 21)
yPoints <- c(1, 2, 3, 4, 5, 6)
x <- rnorm(1e4, mean = 12, sd = 5)
dt <- as.data.table(x)
dt[, c("y1", "y2", "y3") := as.vector(mode = "double", NA)]

# data.table := command
system.time({
  dt[, y2 := f_pwl(x)]
})

# mapply
system.time({
  dt[, y1 := mapply(f_pwl, x), by = .I]
})

# parallel
system.time({
  # set up parallel backend to use many processors
  cores <- detectCores()
  cl <- makeCluster(cores[1] - 1, type = "FORK")  # leave one core free
  registerDoParallel(cl)
  dt$y3 <- foreach(i = 1:nrow(dt), .combine = cbind) %dopar% {
    tempY <- f_pwl(dt$x[i])
    tempY
  }
  # stop cluster
  stopCluster(cl)
})

summary(dt[, .(y1 - y2, y1 - y3, y2 - y3)])
First, calculate and store the alpha_j's.
Then sort DT by x and cut it into the relevant intervals before performing the linear interpolation:
alpha <- c(NA, diff(yPoints) / diff(xPoints))
DT[order(x),
   y := alpha[.GRP] * (x - xPoints[.GRP - 1L]) + yPoints[.GRP - 1L],
   by = cut(x, xPoints)]
Please let me know how it performs.
data:
library(data.table)
## Main program
set.seed(27L)
xPoints <- c(4, 9, 12, 15, 18, 21)
yPoints <- c(1, 2, 3, 4, 5, 6)
DT <- data.table(x=rnorm(1e4, mean=12, sd=5))
check:
# f_pwl() as defined in the question above
system.time({
  DT[, yOP := f_pwl(x)]
})
DT[abs(y - yOP) > 1e-6]
#Empty data.table (0 rows) of 3 cols: x,y,yOP
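As a side note (not part of the original answer): base R's approxfun() builds a compiled linear interpolator over (xPoints, yPoints), and with rule = 1 it returns NA outside the range of xPoints, matching f_pwl(); it may be worth benchmarking as well:
f_interp <- approxfun(xPoints, yPoints, rule = 1)  # rule = 1: NA outside [min(xPoints), max(xPoints)]
system.time({
  DT[, yA := f_interp(x)]
})
DT[abs(yA - yOP) > 1e-6]  # should again be empty, up to floating point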

Subtracting unique pair-wise objects from a for loop in R

I'm trying to subtract each unique pair of ps produced by the for loop in my function below. To do so, I first find the unique pairs using combn(p, 2) and then use outer to subtract each unique pair.
Both steps give an error. Is there a fix for this?
prop <- function(n, yes, a, b = a) {
  p <- list()
  for (i in 1:length(n)) {
    p[[i]] <- rbeta(2, a[i] + yes[i], b[i] + (n[i] - yes[i]))
  }
  outer(combn(p, 2), FUN = "-")  # gives an error
}
prop(n = c(10, 20, 30), yes = rep(5, 3), a = rep(1, 3))
By default, simplify = TRUE in combn. So, even though each element is a list, the output is simplified to have a dim attribute, with the list elements becoming the cells of a matrix. As m is 2, there are 2 list elements per comparison; extract them with [[ and subtract:
combn(p, 2, FUN = function(x) x[[1]]- x[[2]])
The full function:
prop <- function(n, yes, a, b = a) {
  p <- list()
  for (i in 1:length(n)) {
    p[[i]] <- rbeta(2, a[i] + yes[i], b[i] + (n[i] - yes[i]))
  }
  combn(p, 2, FUN = function(x) x[[1]] - x[[2]])
}
prop(n = c(10, 20, 30), yes = rep(5, 3), a = rep(1, 3))
If we want to control the direction of the subtraction, we can include another argument, how:
prop <- function(n, yes, a, b = a, how= "one.two"){
delta <- switch(how,
one.two = function(x) x[[1]] - x[[2]],
two.one = function(x) x[[2]] - x[[1]])
p <- list()
for(i in 1:length(n)){
p[[i]] <- rbeta(2, a[i] + yes[i], b[i] + (n[i] - yes[i]))
}
out <- combn(p, 2, FUN = delta)
nm1 <- paste0("p", combn(seq_along(p), 2, FUN = paste, collapse="-"))
colnames(out) <- nm1
out
}
prop(n = c(10, 20, 30), yes = rep(5, 3), a = rep(1, 3), how = "one.two")
prop(n = c(10, 20, 30), yes = rep(5, 3), a = rep(1, 3), how = "two.one")

Split a vector into chunks such that sum of each chunk is approximately constant

I have a large data frame with more than 100,000 records where the values are sorted.
For example, consider the following dummy data set:
df <- data.frame(values = c(1, 1, 2, 2, 3, 4, 5, 6, 6, 7))
I want to create 3 groups of the above values (in sequence only) such that the sum of each group is more or less the same.
So for the above, if I divide the sorted df into 3 groups as follows, their sums will be:
1. 1 + 1 + 2 + 2 + 3 + 4 = 13
2. 5 + 6 = 11
3. 6 + 7 = 13
How can I create this optimization in R? Any logic?
So, let's use pruning. I think the other solutions give a good split, but not the best one.
First, we want to minimize

    D(p, q) = (S_p - S*)^2 + (S_q - S_p - S*)^2 + (S_n - S_q - S*)^2,  with S* = S_n / 3,

where S_n is the cumulative sum of the first n elements.
computeD <- function(p, q, S) {
  n <- length(S)
  S.star <- S[n] / 3
  if (all(p < q)) {
    (S[p] - S.star)^2 + (S[q] - S[p] - S.star)^2 + (S[n] - S[q] - S.star)^2
  } else {
    stop("You shouldn't be here!")
  }
}
I think the other solutions optimize over p and q independently, which won't give the global minimum in some cases.
optiCut <- function(v) {
  S <- cumsum(v)
  n <- length(v)
  S_star <- S[n] / 3
  # good starting values
  p_star <- which.min((S - S_star)^2)
  q_star <- which.min((S - 2 * S_star)^2)
  print(min <- computeD(p_star, q_star, S))

  count <- 0
  for (q in 2:(n - 1)) {
    S3 <- S[n] - S[q] - S_star
    if (S3 * S3 < min) {  # pruning condition
      count <- count + 1
      D <- computeD(seq_len(q - 1), q, S)
      ind <- which.min(D)
      if (D[ind] < min) {
        # update optimal values
        p_star <- ind
        q_star <- q
        min <- D[ind]
      }
    }
  }
  c(p_star, q_star, computeD(p_star, q_star, S), count)
}
This is as fast as the other solutions because it prunes many of the iterations via the condition S3 * S3 < min. But it gives the optimal solution; see the worked example below with optiCut(c(1, 2, 3, 3, 5, 10)).
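Take v = c(1, 2, 3, 3, 5, 10) (the numbers below are worked by hand for illustration). The cumulative sums are S = 1, 3, 6, 9, 14, 24 and S* = 8. Optimizing p and q independently picks S[p] = 9 and S[q] = 14, i.e. D = 1 + 9 + 4 = 14 with group sums 9, 5, 10; the joint optimum is S[p] = 6, S[q] = 14, i.e. D = 4 + 0 + 4 = 8 with the more balanced sums 6, 8, 10:
v <- c(1, 2, 3, 3, 5, 10)
optiCut(v)
#> [1] 14        # printed: D at the independent starting values
#> [1] 3 5 8 1   # p_star = 3, q_star = 5, D = 8, one candidate q survived pruning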
For the solution with K >= 3, I basically reimplemented trees with nested tibbles; that was fun!
optiCut_K <- function(v, K) {
  S <- cumsum(v)
  n <- length(v)
  S_star <- S[n] / K
  # good starting values
  p_vec_first <- sapply(seq_len(K - 1), function(i) which.min((S - i * S_star)^2))
  min_first <- sum((diff(c(0, S[c(p_vec_first, n)])) - S_star)^2)

  compute_children <- function(level, ind, val) {
    # leaf
    if (level == 1) {
      val <- val + (S[ind] - S_star)^2
      if (val > min_first) {
        return(NULL)
      } else {
        return(val)
      }
    }

    P_all <- val + (S[ind] - S[seq_len(ind - 1)] - S_star)^2
    inds <- which(P_all < min_first)
    if (length(inds) == 0) return(NULL)

    node <- tibble::tibble(
      level = level - 1,
      ind = inds,
      val = P_all[inds]
    )
    node$children <- purrr::pmap(node, compute_children)

    node <- dplyr::filter(node, !purrr::map_lgl(children, is.null))
    `if`(nrow(node) == 0, NULL, node)
  }

  compute_children(K, n, 0)
}
This gives you all the solutions that are at least as good as the greedy one:
v <- sort(sample(1:1000, 1e5, replace = TRUE))
test <- optiCut_K(v, 9)
You need to unnest this:
full_unnest <- function(tbl) {
  tmp <- try(tidyr::unnest(tbl), silent = TRUE)
  `if`(identical(class(tmp), "try-error"), tbl, full_unnest(tmp))
}
print(test <- full_unnest(test))
And finally, to get the best solution:
test[which.min(test$children), ]
Here is one approach:
splitter <- function(values, N) {
  inds <- c(0, sapply(1:N, function(i)
    which.min(abs(cumsum(as.numeric(values)) - sum(as.numeric(values)) / N * i))))
  dif <- diff(inds)
  re <- rep(1:length(dif), times = dif)
  return(split(values, re))
}
How good is it?
# I calculate the mean and sd of the maximal difference of the sums in the
# splits over 100 runs:

# split into 15 parts
set.seed(5)
z1 <- as.data.frame(matrix(1:15, nrow = 1))
repeat {
  values <- sort(sample(1:1000, 1000000, replace = TRUE))
  z <- splitter(values, 15)
  z <- lapply(z, sum)
  z <- unlist(z)
  z1 <- rbind(z1, z)
  if (nrow(z1) > 101) {
    break
  }
}
z1 <- z1[-1, ]
mean(apply(z1, 1, function(x) max(x) - min(x)))
[1] 1004.158
sd(apply(z1, 1, function(x) max(x) - min(x)))
[1] 210.6653

# with fewer splits (4)
set.seed(5)
z1 <- as.data.frame(matrix(1:4, nrow = 1))
repeat {
  values <- sort(sample(1:1000, 1000000, replace = TRUE))
  z <- splitter(values, 4)
  z <- lapply(z, sum)
  z <- unlist(z)
  z1 <- rbind(z1, z)
  if (nrow(z1) > 101) {
    break
  }
}
z1 <- z1[-1, ]
mean(apply(z1, 1, function(x) max(x) - min(x)))
#632.7723
sd(apply(z1, 1, function(x) max(x) - min(x)))
#260.9864
library(microbenchmark)
Timing on a 1M-element vector:
values <- sort(sample(1:1000, 1000000, replace = TRUE))
microbenchmark(
  sp_27 = splitter(values, 27),
  sp_3 = splitter(values, 3)
)
Unit: milliseconds
  expr      min       lq      mean    median        uq       max neval cld
 sp_27 897.7346 934.2360 1052.0972 1078.6713 1118.6203 1329.3044   100   b
  sp_3 108.3283 116.2223  209.4777  173.0522  291.8669  409.7050   100  a
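Part of the sp_27 cost is that splitter() recomputes cumsum(values) and sum(values) inside the sapply() for every split point. Hoisting them out (a sketch, same splitting logic otherwise) should remove most of that overhead:
splitter2 <- function(values, N) {
  cs <- cumsum(as.numeric(values))  # computed once instead of once per split point
  inds <- c(0, sapply(1:N, function(i) which.min(abs(cs - cs[length(cs)] / N * i))))
  split(values, rep(seq_len(N), times = diff(inds)))
}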
By the way, F. Privé is correct: this function does not give the globally optimal split. It is greedy, which is not a good characteristic for such a problem. It will give splits with sums closer to global sum / n in the initial part of the vector, but doing so compromises the splits in the later part of the vector.
Here is a test comparison of the three functions posted so far:
db <- function(values, N) {
  temp <- floor(sum(values) / N)
  inds <- c(0, which(c(0, diff(cumsum(values) %% temp)) < 0)[1:(N - 1)], length(values))
  dif <- diff(inds)
  re <- rep(1:length(dif), times = dif)
  return(split(values, re))
} # had to change it a bit since the posted one would not work - the core
  # which calculates the splitting positions is the same

missuse <- function(values, N) {
  inds <- c(0, sapply(1:N, function(i)
    which.min(abs(cumsum(as.numeric(values)) - sum(as.numeric(values)) / N * i))))
  dif <- diff(inds)
  re <- rep(1:length(dif), times = dif)
  return(split(values, re))
}

prive <- function(v, N) { # added dummy N argument because of the tester function
  dummy <- N
  computeD <- function(p, q, S) {
    n <- length(S)
    S.star <- S[n] / 3
    if (all(p < q)) {
      (S[p] - S.star)^2 + (S[q] - S[p] - S.star)^2 + (S[n] - S[q] - S.star)^2
    } else {
      stop("You shouldn't be here!")
    }
  }
  optiCut <- function(v, N) {
    S <- cumsum(v)
    n <- length(v)
    S_star <- S[n] / 3
    # good starting values
    p_star <- which.min((S - S_star)^2)
    q_star <- which.min((S - 2 * S_star)^2)
    print(min <- computeD(p_star, q_star, S))
    count <- 0
    for (q in 2:(n - 1)) {
      S3 <- S[n] - S[q] - S_star
      if (S3 * S3 < min) {
        count <- count + 1
        D <- computeD(seq_len(q - 1), q, S)
        ind <- which.min(D)
        if (D[ind] < min) {
          # update optimal values
          p_star <- ind
          q_star <- q
          min <- D[ind]
        }
      }
    }
    c(p_star, q_star, computeD(p_star, q_star, S), count)
  }
  z3 <- optiCut(v)
  inds <- c(0, z3[1:2], length(v))
  dif <- diff(inds)
  re <- rep(1:length(dif), times = dif)
  return(split(v, re))
} # added output to be more in line with the other two
Function for testing:
tester <- function(split, seed) {
  set.seed(seed)
  z1 <- as.data.frame(matrix(1:3, nrow = 1))
  repeat {
    values <- sort(sample(1:1000, 1000000, replace = TRUE))
    z <- split(values, 3)
    z <- lapply(z, sum)
    z <- unlist(z)
    z1 <- rbind(z1, z)
    if (nrow(z1) > 101) {
      break
    }
  }
  m <- mean(apply(z1, 1, function(x) max(x) - min(x)))
  s <- sd(apply(z1, 1, function(x) max(x) - min(x)))
  return(c("mean" = m, "sd" = s))
} # tests 100 random 1M-length vectors with elements drawn from 1:1000
tester(db, 5)
#mean sd
#779.5686 349.5717
tester(missuse, 5)
#mean sd
#481.4804 216.9158
tester(prive, 5)
#mean sd
#451.6765 174.6303
prive is the clear winner; however, it takes quite a bit longer than the other two, and it can only split into 3 groups.
microbenchmark(
  missuse(values, 3),
  prive(values, 3),
  db(values, 3)
)
Unit: milliseconds
               expr        min        lq      mean    median        uq       max neval cld
 missuse(values, 3)  100.85978  111.1552  185.8199  120.1707  304.0303  393.4031   100  a
   prive(values, 3) 1932.58682 1980.0515 2096.7516 2043.7133 2211.6294 2671.9357   100   b
      db(values, 3)   96.86879  104.5141  194.0085  117.6270  306.7143  500.6455   100  a
N <- 3
temp <- floor(sum(df$values) / N)
inds <- c(0, which(c(0, diff(cumsum(df$values) %% temp)) < 0)[1:(N - 1)], NROW(df))
# if/else rather than ifelse(), which would return only the first split length
split(df$values, rep(1:N, if (N == 1) NROW(df) else diff(inds)))
#$`1`
#[1] 1 1 2 2 3 4
#$`2`
#[1] 5 6
#$`3`
#[1] 6 7
