Why does my R function only work when its arguments each have length >= 3?

I have written a function intended to work when each of its arguments has length >= 2.
But the function only works when its arguments each have length >= 3!
Am I missing something? (Is there a fix so the function also works when its arguments each have length 2?)
[Note: I always expect the output of the function (i.e., CI) to be a matrix with 2 columns and length(n) rows, except when length(n) == 2, in which case I expect the output to have 1 row and 2 columns.]
abc <- function(n, yes, a, b = a){
  p <- list()
  for(i in 1:length(n)){
    p[[i]] <- rbeta(1e3, a[i] + yes[i], b[i] + (n[i] - yes[i]))
  }
  ps <- combn(p, 2, FUN = function(x) x[[1]] - x[[2]])
  CI <- matrix(NA, length(n), 2)
  for(i in 1:length(n)){
    CI[i, ] <- quantile(ps[, i], c(.025, .975))
  }
  CI
}
For example:
abc(n = c(10, 20, 30), yes = rep(5, 3), a = rep(1, 3)) # Works well :-)
abc(n = c(10, 20), yes = rep(5, 2), a = rep(1, 2)) # Doesn't work! :-(
# Error in ps[, i] : subscript out of bounds

There is an easy fix to the problem: replace length(n) with ncol(ps), both when creating the result matrix and in the for loop that copies values into CI. This makes more sense because the number of pairwise combinations generated by combn is choose(length(n), 2), which generally differs from length(n).
abc <- function(n, yes, a, b = a){
  p <- list()
  for(i in 1:length(n)){
    p[[i]] <- rbeta(1e3, a[i] + yes[i], b[i] + (n[i] - yes[i]))
  }
  ps <- combn(p, 2, FUN = function(x) x[[1]] - x[[2]])
  CI <- matrix(NA, ncol(ps), 2)
  for(i in 1:ncol(ps)){
    CI[i, ] <- quantile(ps[, i], c(.025, .975), na.rm = TRUE)
  }
  CI
}
# Results
#> abc(n = c(10, 20, 30), yes = rep(5, 3), a = rep(1, 3))
#            [,1]      [,2]
# [1,] -0.10141014 0.5774627
# [2,]  0.02638096 0.6159326
# [3,] -0.12473451 0.3069135
#> abc(n = c(10, 20), yes = rep(5, 2), a = rep(1, 2))
#           [,1]      [,2]
# [1,] -0.1228497 0.5304606
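Why does length 3 appear to work? Because the number of pairwise comparisons is choose(k, 2), which equals k only when k = 3. A quick check (my own illustration, not from the original answer):
# number of pairwise combinations versus length(n)
sapply(2:5, function(k) c(length_n = k, pairs = choose(k, 2)))
#>          [,1] [,2] [,3] [,4]
#> length_n    2    3    4    5
#> pairs       1    3    6   10
So with length 2 the loop asks for a second column of ps that doesn't exist (hence the subscript error), and with length 4 or more the original loop would silently drop some comparisons; ncol(ps) handles both cases.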


How to fix Object(...) not found while it is declared in R

I am trying to take a derivative of a double sum function. I am running into this error:
Error in deriv.f.1(X = X.data, y = y.vec, alpha = alpha.vector[1, ]) :
object 'L_D_grad' not found
I have tried moving the {} brackets around and double-checking whether I missed a closing/opening bracket or have an extra one. However, the error persists.
# Generate Sample Data
gen.sample <- function(n){
  x <- rnorm(n, 5, 10)
  y <- ifelse(x < 2.843, 1, -1)
  return(data.frame(x, y))
}
##
deriv.f.1 <- function(X, y, alpha){
  N <- length(X)
  L_D_grad < numeric(N)
  xy.alpha.sum <- numeric(N)
  for(k in 1:N){
    for(l in 1:N){
      if(l == k){
        xy.alpha.sum[l] = 0
      } else {
        xy.alpha.sum[l] <- alpha[l]*y[k]*y[l]*X[k]*X[l]
      }
    }
    L_D_grad[k] <- 1 - sum(xy.alpha.sum) - alpha[k]*(y[k])^2*(X[k])^2
  }
  return(L_D_grad)
}
## Illustration
set.seed(4997)
options(digits = 4, scipen = -4)
sample.data <- gen.sample(n = N)
X.data <- sample.data$x
y.vec <- sample.data$y
alpha.vector <- matrix(rep(seq(from = -5, to = 5, length.out = N), N*N),
                       ncol = N, nrow = N, byrow = TRUE)
alpha_vec <- alpha.vector[1, ]
deriv.f.1(X = X.data, y = y.vec, alpha = alpha_vec)
Thanks in advance!
Here is my code:
# Generate Sample Data
gen.sample <- function(n){
  x <- rnorm(n, 5, 10)
  y <- ifelse(x < 2.843, 1, -1)
  return(data.frame(x, y))
}
##
deriv.f.1 <- function(X, y, alpha){
  N <- length(X)
  L_D_grad <- numeric(N)
  xy.alpha.sum <- numeric(N)
  for(k in 1:N){
    for(l in 1:N){
      if(l == k){
        xy.alpha.sum[l] = 0
      } else {
        xy.alpha.sum[l] <- alpha[l]*y[k]*y[l]*X[k]*X[l]
      }
    }
    L_D_grad[k] <- 1 - sum(xy.alpha.sum) - alpha[k]*(y[k])^2*(X[k])^2
  }
  return(L_D_grad)
}
## Illustration
set.seed(4997)
options(digits = 4, scipen = -4)
N = 10
sample.data <- gen.sample(n = N)
X.data <- sample.data$x
y.vec <- sample.data$y
alpha.vector <- matrix(rep(seq(from = -5, to = 5, length.out = N), N*N),
                       ncol = N, nrow = N, byrow = TRUE)
alpha_vec <- alpha.vector[1, ]
deriv.f.1(X = X.data, y = y.vec, alpha = alpha_vec)
Where:
# sample.data
#             x  y
# 1  -5.303e+00  1
# 2   1.493e+01 -1
# 3   9.797e+00 -1
# 4   1.991e+01 -1
# 5  -1.454e+01  1
# 6   1.423e+01 -1
# 7   1.025e+01 -1
# 8   5.455e+00 -1
# 9   3.719e+00 -1
# 10  2.021e+01 -1
And deriv.f.1(X = X.data, y = y.vec, alpha = alpha_vec) gives:
# -1.271e+01 -3.759e+01 -2.432e+01 -5.046e+01 -3.659e+01 -3.577e+01 -2.548e+01 -1.310e+01
# -8.612e+00 -5.123e+01
I made two changes:
Assign N a value: N = 10
Correct the assignment for L_D_grad: L_D_grad <- numeric(N) (the question had L_D_grad < numeric(N))
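That second change also explains the error message: < is the comparison operator, so R tries to evaluate L_D_grad as an existing object before it has ever been created. A minimal reproduction (my own, in a fresh session):
x < numeric(5)   # comparison: R must first evaluate x
#> Error: object 'x' not found
x <- numeric(5)  # assignment: creates x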

Parallelization with data.table

I have the following problem. I have a piecewise-linear function described by (xPoints, yPoints), and I need to compute the implied y-value for a long list of x's, quickly and over and over again; x can also fall outside the range of xPoints. I have coded a function f_pwl that computes the implied y-value, but it is slow, so I tried to parallelize its call. However, the parallel version is actually slower than the data.table := syntax. I would appreciate suggestions to speed things up, either by improving my f_pwl function or by implementing an efficient parallelization, as I have access to 20 cores.
Here is sample code.
# libraries
require(data.table)  # for fread, work with large data
require(abind)       # for abind()
require(foreach)     # for parallel processing, used with doParallel
require(doParallel)  # for parallel processing, used with foreach

f_pwl <- function(x) {
  temp <- as.vector(rep(NA, length = length(x)), mode = "double")
  for (i in seq(from = 1, to = length(x), by = 1)) {
    if (x[i] > max(xPoints) | x[i] < min(xPoints)) {
      # nothing to do, temp[i] <- NA
    } else if (x[i] == max(xPoints)) {
      # value equals max(yPoints)
      temp[i] <- max(yPoints)
    } else {
      # value is f_pwl(x)
      xIndexVector = as.logical(x[i] >= xPoints &
                                abind(xPoints[2:length(xPoints)], max(xPoints)) > x[i])
      xIndexVector_plus1 = shift(xIndexVector, n = 1, fill = FALSE, type = "lag")
      alpha_j = (xPoints[xIndexVector_plus1] - x[i]) /
                (xPoints[xIndexVector_plus1] - xPoints[xIndexVector])
      temp[i] <- alpha_j %*% yPoints[xIndexVector] + (1 - alpha_j) %*% yPoints[xIndexVector_plus1]
    }
  } # end for i
  as.vector(temp, mode = "double")
}
## Main program
xPoints <- c(4, 9, 12, 15, 18, 21)
yPoints <- c(1, 2, 3, 4, 5, 6)
x <- rnorm(1e4, mean = 12, sd = 5)
dt <- as.data.table(x)
dt[, c("y1", "y2", "y3") := as.vector(mode = "double", NA)]

# data.table := command
system.time({
  dt[, y2 := f_pwl(x)]
})

# mapply
system.time({
  dt[, y1 := mapply(f_pwl, x), by = .I]
})

# parallel
system.time({
  # set up parallel backend to use many processors
  cores = detectCores()
  cl <- makeCluster(cores[1] - 1, type = "FORK")  # not to overload your computer
  registerDoParallel(cl)
  dt$y3 <- foreach(i = 1:nrow(dt), .combine = cbind) %dopar% {
    tempY <- f_pwl(dt$x[i])
    tempY
  }
  # stop cluster
  stopCluster(cl)
})
summary(dt[, .(y1 - y2, y1 - y3, y2 - y3)])
First, calculate and store the alpha_j's. Then sort DT by x and cut it into the relevant intervals before performing the linear interpolation:
alpha <- c(NA, diff(yPoints) / diff(xPoints))
DT[order(x),
   y := alpha[.GRP] * (x - xPoints[.GRP - 1L]) + yPoints[.GRP - 1L],
   by = cut(x, xPoints)]
Please let me know how it performs.
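For intuition about the grouping (a toy illustration of my own, not from the original answer): cut() bins each x into an interval between consecutive xPoints (x outside the range becomes NA), and since the data are ordered by x the groups appear in increasing order, so .GRP can index the matching slope in alpha:
cut(c(5, 10, 13), breaks = c(4, 9, 12, 15, 18, 21))
#> [1] (4,9]   (9,12]  (12,15]
#> Levels: (4,9] (9,12] (12,15] (15,18] (18,21]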
data:
library(data.table)
## Main program
set.seed(27L)
xPoints <- c(4, 9, 12, 15, 18, 21)
yPoints <- c(1, 2, 3, 4, 5, 6)
DT <- data.table(x=rnorm(1e4, mean=12, sd=5))
check (reusing f_pwl exactly as defined in the question above):
system.time({
  DT[, yOP := f_pwl(x)]
})
DT[abs(y - yOP) > 1e-6]
#Empty data.table (0 rows) of 3 cols: x, y, yOP
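As an aside (my own suggestion, not part of the original answers): base R's approx() performs the same piecewise-linear interpolation in vectorized C code and, with the default rule = 1, returns NA outside the range of xPoints, matching f_pwl's behavior, so it may be worth benchmarking as well:
system.time({
  DT[, yApprox := approx(xPoints, yPoints, xout = x)$y]
})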

subtracting unique pair-wise objects from for loop in R

I'm trying to subtract each unique pair of the ps generated by the for loop in my function below. To do so, I first find the unique pairs using combn(p, 2) and then use outer to subtract the elements of each pair from each other.
With this, I get an error. Is there a fix for it?
prop <- function(n, yes, a, b = a){
  p <- list()
  for(i in 1:length(n)){
    p[[i]] <- rbeta(2, a[i] + yes[i], b[i] + (n[i] - yes[i]))
  }
  outer(combn(p, 2), FUN = "-")  # Gives Error
}
prop(n = c(10, 20, 30), yes = rep(5, 3), a = rep(1, 3))
By default, simplify = TRUE in combn. So, even though the elements are lists, the output is simplified to have a dim attribute, with each list element becoming a cell of a matrix. As m is 2, there are 2 list elements for each comparison; extract those elements using [[ and subtract:
combn(p, 2, FUN = function(x) x[[1]]- x[[2]])
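A toy illustration of the simplification (made-up values, not the rbeta draws from the question):
p <- list(1:2, 3:4, 5:6)
combn(p, 2)  # a 2 x 3 matrix of list elements; each column is one pair
combn(p, 2, FUN = function(x) x[[1]] - x[[2]])
#>      [,1] [,2] [,3]
#> [1,]   -2   -4   -2
#> [2,]   -2   -4   -2
# columns are p1 - p2, p1 - p3, p2 - p3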
The full function:
prop <- function(n, yes, a, b = a){
  p <- list()
  for(i in 1:length(n)){
    p[[i]] <- rbeta(2, a[i] + yes[i], b[i] + (n[i] - yes[i]))
  }
  combn(p, 2, FUN = function(x) x[[1]] - x[[2]])
}
prop(n = c(10, 20, 30), yes = rep(5, 3), a = rep(1, 3))
If we wanted to include another argument how that controls the direction of the subtraction:
prop <- function(n, yes, a, b = a, how = "one.two"){
  delta <- switch(how,
                  one.two = function(x) x[[1]] - x[[2]],
                  two.one = function(x) x[[2]] - x[[1]])
  p <- list()
  for(i in 1:length(n)){
    p[[i]] <- rbeta(2, a[i] + yes[i], b[i] + (n[i] - yes[i]))
  }
  out <- combn(p, 2, FUN = delta)
  nm1 <- paste0("p", combn(seq_along(p), 2, FUN = paste, collapse = "-"))
  colnames(out) <- nm1
  out
}
prop(n = c(10, 20, 30), yes = rep(5, 3), a = rep(1, 3), how = "one.two")
prop(n = c(10, 20, 30), yes = rep(5, 3), a = rep(1, 3), how = "two.one")

Split a vector into chunks such that sum of each chunk is approximately constant

I have a large data frame with more than 100,000 records, in which the values are sorted.
For example, consider the following dummy data set:
df <- data.frame(values = c(1, 1, 2, 2, 3, 4, 5, 6, 6, 7))
I want to create 3 groups of the above values (in sequence only) such that the sum of each group is more or less the same.
So for the above data, if I divide the sorted df into 3 groups as follows, their sums will be:
1. 1 + 1 + 2 + 2 + 3 + 4 = 13
2. 5 + 6 = 11
3. 6 + 7 = 13
How can I achieve this optimization in R? Is there any logic for it?
So, let's use pruning. I think the other solutions give a good solution, but not the best one.
First, we want to minimize
D(p, q) = (S_p - S*)^2 + (S_q - S_p - S*)^2 + (S_n - S_q - S*)^2, with S* = S_n / 3,
where S_k is the cumulative sum of the first k elements and p < q are the two cut points.
computeD <- function(p, q, S) {
  n <- length(S)
  S.star <- S[n] / 3
  if (all(p < q)) {
    (S[p] - S.star)^2 + (S[q] - S[p] - S.star)^2 + (S[n] - S[q] - S.star)^2
  } else {
    stop("You shouldn't be here!")
  }
}
I think the other solutions optimize over p and q independently, which won't give a global minimum (as expected for some particular cases).
optiCut <- function(v) {
  S <- cumsum(v)
  n <- length(v)
  S_star <- S[n] / 3
  # good starting values
  p_star <- which.min((S - S_star)^2)
  q_star <- which.min((S - 2 * S_star)^2)
  print(min <- computeD(p_star, q_star, S))

  count <- 0
  for (q in 2:(n - 1)) {
    S3 <- S[n] - S[q] - S_star
    if (S3 * S3 < min) {
      count <- count + 1
      D <- computeD(seq_len(q - 1), q, S)
      ind <- which.min(D)
      if (D[ind] < min) {
        # Update optimal values
        p_star <- ind
        q_star <- q
        min <- D[ind]
      }
    }
  }
  c(p_star, q_star, computeD(p_star, q_star, S), count)
}
This is as fast as the other solutions because it prunes a lot of the iterations based on the condition S3*S3 < min. But, unlike them, it gives the optimal solution; see optiCut(c(1, 2, 3, 3, 5, 10)).
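To make that concrete, here is the example worked by hand (my own arithmetic, not from the original answer):
v <- c(1, 2, 3, 3, 5, 10)
S <- cumsum(v)  # 1 3 6 9 14 24, so S* = 24 / 3 = 8
# greedy starting values: p = 4, q = 5
#   D = (9 - 8)^2 + (14 - 9 - 8)^2 + (24 - 14 - 8)^2 = 1 + 9 + 4 = 14
# the pruned search finds p = 3, q = 5 instead:
#   D = (6 - 8)^2 + (14 - 6 - 8)^2 + (24 - 14 - 8)^2 = 4 + 0 + 4 = 8
optiCut(v)  # p_star = 3, q_star = 5, D = 8, i.e. chunks (1,2,3), (3,5), (10)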
For the solution with K >= 3, I basically reimplemented trees with nested tibbles; that was fun!
optiCut_K <- function(v, K) {
  S <- cumsum(v)
  n <- length(v)
  S_star <- S[n] / K
  # good starting values
  p_vec_first <- sapply(seq_len(K - 1), function(i) which.min((S - i * S_star)^2))
  min_first <- sum((diff(c(0, S[c(p_vec_first, n)])) - S_star)^2)

  compute_children <- function(level, ind, val) {
    # leaf
    if (level == 1) {
      val <- val + (S[ind] - S_star)^2
      if (val > min_first) {
        return(NULL)
      } else {
        return(val)
      }
    }
    P_all <- val + (S[ind] - S[seq_len(ind - 1)] - S_star)^2
    inds <- which(P_all < min_first)
    if (length(inds) == 0) return(NULL)

    node <- tibble::tibble(
      level = level - 1,
      ind = inds,
      val = P_all[inds]
    )
    node$children <- purrr::pmap(node, compute_children)

    node <- dplyr::filter(node, !purrr::map_lgl(children, is.null))
    `if`(nrow(node) == 0, NULL, node)
  }

  compute_children(K, n, 0)
}
This gives you all the solutions that are at least better than the greedy one:
v <- sort(sample(1:1000, 1e5, replace = TRUE))
test <- optiCut_K(v, 9)
You need to unnest this:
full_unnest <- function(tbl) {
  tmp <- try(tidyr::unnest(tbl), silent = TRUE)
  `if`(identical(class(tmp), "try-error"), tbl, full_unnest(tmp))
}
print(test <- full_unnest(test))
And finally, to get the best solution:
test[which.min(test$children), ]
Here is one approach:
splitter <- function(values, N){
  inds = c(0, sapply(1:N, function(i) which.min(abs(cumsum(as.numeric(values)) - sum(as.numeric(values)) / N * i))))
  dif = diff(inds)
  re = rep(1:length(dif), times = dif)
  return(split(values, re))
}
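Applied to the dummy df from the question, it reproduces the desired split (my own run):
splitter(df$values, 3)
#> $`1`
#> [1] 1 1 2 2 3 4
#> $`2`
#> [1] 5 6
#> $`3`
#> [1] 6 7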
How good is it? I calculate the mean and sd of the maximal difference of the group sums over 100 runs.
# split into 15 parts
set.seed(5)
z1 = as.data.frame(matrix(1:15, nrow = 1))
repeat{
  values = sort(sample(1:1000, 1000000, replace = T))
  z = splitter(values, 15)
  z = lapply(z, sum)
  z = unlist(z)
  z1 = rbind(z1, z)
  if (nrow(z1) > 101){
    break
  }
}
z1 = z1[-1, ]
mean(apply(z1, 1, function(x) max(x) - min(x)))
#[1] 1004.158
sd(apply(z1, 1, function(x) max(x) - min(x)))
#[1] 210.6653
# with fewer splits (4)
set.seed(5)
z1 = as.data.frame(matrix(1:4, nrow = 1))
repeat{
  values = sort(sample(1:1000, 1000000, replace = T))
  z = splitter(values, 4)
  z = lapply(z, sum)
  z = unlist(z)
  z1 = rbind(z1, z)
  if (nrow(z1) > 101){
    break
  }
}
z1 = z1[-1, ]
mean(apply(z1, 1, function(x) max(x) - min(x)))
#632.7723
sd(apply(z1, 1, function(x) max(x) - min(x)))
#260.9864
library(microbenchmark)
Timing on a 1M-length vector:
values = sort(sample(1:1000, 1000000, replace = T))
microbenchmark(
  sp_27 = splitter(values, 27),
  sp_3 = splitter(values, 3)
)
Unit: milliseconds
  expr      min       lq      mean    median        uq       max neval cld
 sp_27 897.7346 934.2360 1052.0972 1078.6713 1118.6203 1329.3044   100   b
  sp_3 108.3283 116.2223  209.4777  173.0522  291.8669  409.7050   100  a
By the way, F. Privé is correct: this function does not give the globally optimal split. It is greedy, which is not a good characteristic for such a problem. It will give splits with sums closer to global_sum / N in the initial part of the vector, but in doing so it compromises the splits in the later part of the vector.
Here is a test comparison of the three functions posted so far:
db = function(values, N){
  temp = floor(sum(values) / N)
  inds = c(0, which(c(0, diff(cumsum(values) %% temp)) < 0)[1:(N-1)], length(values))
  dif = diff(inds)
  re = rep(1:length(dif), times = dif)
  return(split(values, re))
} # had to change it a bit since the posted one would not work - the core
  # which calculates the splitting positions is the same

missuse <- function(values, N){
  inds = c(0, sapply(1:N, function(i) which.min(abs(cumsum(as.numeric(values)) - sum(as.numeric(values)) / N * i))))
  dif = diff(inds)
  re = rep(1:length(dif), times = dif)
  return(split(values, re))
}

prive = function(v, N){ # added dummy N argument because of the tester function
  dummy = N
  computeD <- function(p, q, S) {
    n <- length(S)
    S.star <- S[n] / 3
    if (all(p < q)) {
      (S[p] - S.star)^2 + (S[q] - S[p] - S.star)^2 + (S[n] - S[q] - S.star)^2
    } else {
      stop("You shouldn't be here!")
    }
  }
  optiCut <- function(v, N) {
    S <- cumsum(v)
    n <- length(v)
    S_star <- S[n] / 3
    # good starting values
    p_star <- which.min((S - S_star)^2)
    q_star <- which.min((S - 2 * S_star)^2)
    print(min <- computeD(p_star, q_star, S))
    count <- 0
    for (q in 2:(n - 1)) {
      S3 <- S[n] - S[q] - S_star
      if (S3 * S3 < min) {
        count <- count + 1
        D <- computeD(seq_len(q - 1), q, S)
        ind <- which.min(D)
        if (D[ind] < min) {
          # Update optimal values
          p_star <- ind
          q_star <- q
          min <- D[ind]
        }
      }
    }
    c(p_star, q_star, computeD(p_star, q_star, S), count)
  }
  z3 = optiCut(v)
  inds = c(0, z3[1:2], length(v))
  dif = diff(inds)
  re = rep(1:length(dif), times = dif)
  return(split(v, re))
} # added output to be more in line with the other two
Function for testing:
tester = function(split, seed){
  set.seed(seed)
  z1 = as.data.frame(matrix(1:3, nrow = 1))
  repeat{
    values = sort(sample(1:1000, 1000000, replace = T))
    z = split(values, 3)
    z = lapply(z, sum)
    z = unlist(z)
    z1 = rbind(z1, z)
    if (nrow(z1) > 101){
      break
    }
  }
  m = mean(apply(z1, 1, function(x) max(x) - min(x)))
  s = sd(apply(z1, 1, function(x) max(x) - min(x)))
  return(c("mean" = m, "sd" = s))
} # tests 100 random 1M-length vectors with elements drawn from 1:1000
tester(db, 5)
#     mean       sd
# 779.5686 349.5717
tester(missuse, 5)
#     mean       sd
# 481.4804 216.9158
tester(prive, 5)
#     mean       sd
# 451.6765 174.6303
prive is the clear winner; however, it takes quite a bit longer than the other two, and it can only handle splitting into 3 parts.
microbenchmark(
  missuse(values, 3),
  prive(values, 3),
  db(values, 3)
)
Unit: milliseconds
               expr        min        lq      mean    median        uq       max neval cld
 missuse(values, 3)  100.85978  111.1552  185.8199  120.1707  304.0303  393.4031   100  a
   prive(values, 3) 1932.58682 1980.0515 2096.7516 2043.7133 2211.6294 2671.9357   100   b
      db(values, 3)   96.86879  104.5141  194.0085  117.6270  306.7143  500.6455   100  a
Another approach, using the running total modulo the target chunk sum:
N = 3
temp = floor(sum(df$values) / N)
inds = c(0, which(c(0, diff(cumsum(df$values) %% temp)) < 0)[1:(N-1)], NROW(df))
split(df$values, rep(1:N, if (N == 1) NROW(df) else diff(inds)))
#$`1`
#[1] 1 1 2 2 3 4
#$`2`
#[1] 5 6
#$`3`
#[1] 6 7
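For intuition about this trick (my own sketch, not from the original answer): cumsum(df$values) %% temp wraps around each time the running total passes a multiple of temp, so a negative first difference marks the end of a chunk:
temp <- floor(sum(df$values) / 3)  # 12
cumsum(df$values) %% temp
#> [1] 1 2 4 6 9 1 6 0 6 1
# the wraps at positions 6 (9 -> 1) and 8 (6 -> 0) close chunks 1 and 2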

How to make a 0 stay as a 0 in a matrix

I use the following code to generate a matrix
randomdiv <- function(nchrom, ndivs, size) {
  sz <- matrix(nrow = nchrom, ncol = ndivs)
  for (j in 1:nchrom) {
    n <- size
    for (i in 1:ndivs) {
      old_subs <- rbinom(1, n, 0.5)
      num_chrom <- rep(1 / nchrom, nchrom)
      new_subs <- rmultinom(1, size * nchrom / 2, prob = c(num_chrom))
      m <- old_subs + new_subs
      sz[j, i] <- m[1, 1]
      n <- m
    }
  }
  return(sz)
}
> randomdiv(3, 3, 10)
     [,1] [,2] [,3]
[1,]   11   13   12
[2,]    6    8    5
[3,]   12   11    9
The only adjustment I need to make: when rbinom generates a 0 for an entry, that entry needs to stay 0 for the remainder of the matrix row, but anything > 0 needs to go through the rest of the loop and have new_subs added to it.
I have tried:
randomdiv <- function(nchrom, ndivs, size) {
  sz <- matrix(nrow = nchrom, ncol = ndivs)
  for (j in 1:nchrom) {
    n <- size
    for (i in 1:ndivs) {
      old_subs <- rbinom(1, n, 0.5)
      num_chrom <- rep(1 / nchrom, nchrom)
      new_subs <- rmultinom(1, size * nchrom / 2, prob = c(num_chrom))
      m <- ifelse(old_subs > 0, old_subs + new_subs, old_subs + 0)
      sz[j, i] <- m[1, 1]
      n <- m
    }
  }
  return(replicate(ncell, sz, simplify = FALSE))
}
> randomdiv(3, 3, 10)
#Error in m[1, 1] : incorrect number of dimensions
I've tried a few different tactics with the ifelse function, but I think it only treats the columns as a whole, so if there is a 0 at all, nothing happens for the whole column, whereas I need each value in the columns to be treated individually.
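As an aside (not from the original thread), the error itself comes from how ifelse() shapes its result: it is shaped like the test, not like the chosen value, so with a length-1 test m ends up a plain number and m[1, 1] fails:
ifelse(TRUE, matrix(1:4, 2), 0)
#> [1] 1
# a length-1 vector, so m[1, 1] errors with "incorrect number of dimensions"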
You just need to use if() with an else and skip several lines of code if there's a 0:
randomdiv <- function(nchrom, ndivs, size) {
  sz <- matrix(nrow = nchrom, ncol = ndivs)
  for (j in 1:nchrom) {
    n <- size
    for (i in 1:ndivs) {
      old_subs <- rbinom(1, n, 0.5)
      if (old_subs > 0) {
        num_chrom <- rep(1 / nchrom, nchrom)
        new_subs <- rmultinom(1, size * nchrom / 2, prob = c(num_chrom))
        m <- old_subs + new_subs
        sz[j, i] <- m[1, 1]
        n <- m
      } else {
        # once an entry hits 0, record it and force all later draws
        # in this row to 0 (rbinom with size 0 always returns 0)
        sz[j, i] <- 0
        n <- 0
      }
    }
  }
  return(sz)
}
randomdiv(3, 3, 2)
#      [,1] [,2] [,3]
# [1,]    2    2    0
# [2,]    1    2    4
# [3,]    1    1    0
