Vector of non-sums of self items in R

I am trying to build a function that creates a vector in which no item is the sum of any combination of the other items in the list (without duplication).
This function does the job but is quite slow... any bright thoughts on how to improve it?
sum_fun <- function(k)
{
  out_list <- c(2, 3, 4)
  new_num <- 4
  while (length(out_list) < k)
  {
    new_num <- new_num + 1
    # Check if new_num can be written as a sum of the terms in out_list
    new_valid <- TRUE
    for (i in 2:(length(out_list) - 1)) {
      if (new_num %in% apply(combn(out_list, i), 2, sum))
      {
        new_valid <- FALSE
        break
      }
    }
    if (new_valid)
    {
      out_list <- c(out_list, new_num)
    }
  }
  return(out_list)
}

This was a good question. I made some changes to your original function and got mine to run a bit quicker than yours. On a side note, how many terms are you trying to find?
The main idea is that we shouldn't calculate more things, more often, than we absolutely have to. The for loop was probably slowing things down, and many of the column sums were repeated. If we "de-dup" the list of sums and only recompute it when the list actually changes, we can search through it more quickly [reduce, reuse, recycle :) ].
sum_fun2 <- function(k)
{
  out_list <- c(2, 3, 4)  # dummy list
  new_num <- 4            # dummy number
  calc_big_sum <- TRUE    # calculate big sum on the first go
  while (length(out_list) < k)
  {
    new_num <- new_num + 1  # dummy number to add
    # calculate big sum, and then find unique values
    if (calc_big_sum) {
      big_sum <- unique(unlist(lapply(lapply(2:(length(out_list) - 1),
                                             FUN = function(x) combn(out_list, m = x)),
                                      FUN = function(y) apply(y, 2, sum))))
    }
    if (new_num %in% big_sum) {
      calc_big_sum <- FALSE  # don't make it calculate the sum again
    } else {
      out_list <- c(out_list, new_num)  # add number to list
      calc_big_sum <- TRUE              # make it calculate a new sum
    }
  }
  return(out_list)
}
> system.time(sum_fun2(10))
user system elapsed
0.03 0.00 0.03
> system.time(sum_fun(10))
user system elapsed
1.30 0.00 1.27
> system.time(sum_fun2(14))
user system elapsed
3.35 0.07 3.47
> system.time(sum_fun(14))
## I ended it
Timing stopped at: 39.86 0 40.02
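Beyond the answer above, one could avoid recomputing combn() entirely by maintaining the set of reachable subset sums incrementally: when a number is accepted, the only new sums are that number plus each existing element and that number plus each existing sum. The sketch below (sum_fun3 is a name introduced here, not from the original posts) follows that idea; note it also treats the sum of the whole list as forbidden, which the original loop's 2:(length - 1) range does not, so treat it as a sketch to adapt rather than a drop-in replacement:
sum_fun3 <- function(k) {
  out_list <- c(2, 3, 4)
  sums <- c(5, 6, 7, 9)  # all subset sums (size >= 2) of the seed list
  new_num <- 4
  while (length(out_list) < k) {
    new_num <- new_num + 1
    if (!(new_num %in% sums)) {
      # adding new_num creates: new_num + each element, new_num + each existing sum
      sums <- unique(c(sums, new_num + out_list, new_num + sums))
      out_list <- c(out_list, new_num)
    }
  }
  out_list
}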

Efficiently working with sets in R

Background:
I am dealing with a combinatorial problem in R. For a given list of sets I need to generate all pairs per set without producing duplicates.
Example:
initial_list_of_sets <- list()
initial_list_of_sets[[1]] <- c(1,2,3)
initial_list_of_sets[[2]] <- c(2,3,4)
initial_list_of_sets[[3]] <- c(3,2)
initial_list_of_sets[[4]] <- c(5,6,7)
get_pairs(initial_list_of_sets)
# should return (1 2),(1 3),(2 3),(2 4),(3 4),(5 6),(5 7),(6 7)
Please note that (3 2) is not included in the results, as it is mathematically equal to (2 3).
My (working but inefficient) approach so far:
# checks if sets contain a_set
contains <- function(sets, a_set) {
  for (existing in sets) {
    if (setequal(existing, a_set)) {
      return(TRUE)
    }
  }
  return(FALSE)
}

get_pairs <- function(from_sets) {
  all_pairs <- list()
  for (a_set in from_sets) {
    # generate all pairs for current set
    pairs <- combn(x = a_set, m = 2, simplify = FALSE)
    for (pair in pairs) {
      # only add new pairs if they are not yet included in all_pairs
      if (!contains(all_pairs, pair)) {
        all_pairs <- c(all_pairs, list(pair))
      }
    }
  }
  return(all_pairs)
}
My question:
As I am dealing with mathematical sets, I can't use the %in% operator instead of my contains function, because then (2 3) and (3 2) would be treated as different pairs. However, it seems very inefficient to iterate over all existing sets in contains. Is there a better way to implement this function?
Perhaps you can rewrite your get_pairs function as something like the following:
myFun <- function(inlist) {
  unique(do.call(rbind, lapply(inlist, function(x) t(combn(sort(x), 2)))))
}
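Applied to the example sets from the question, this returns a matrix with one row per unique pair:
myFun(initial_list_of_sets)
# one row per pair: (1 2), (1 3), (2 3), (2 4), (3 4), (5 6), (5 7), (6 7)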
Here's a quick time comparison.
n <- 100
set.seed(1)
x <- sample(2:8, n, TRUE)
initial_list_of_sets <- lapply(x, function(y) sample(100, y))
system.time(get_pairs(initial_list_of_sets))
# user system elapsed
# 1.964 0.000 1.959
system.time(myFun(initial_list_of_sets))
# user system elapsed
# 0.012 0.000 0.014
If needed, you can split the matrix by rows to get your list.
Eg:
myFun <- function(inlist) {
  temp <- unique(do.call(rbind, lapply(inlist, function(x) t(combn(sort(x), 2)))))
  lapply(1:nrow(temp), function(x) temp[x, ])
}
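As a side note (not part of the original answer): on R >= 3.6 the row splitting can also be done with base R's asplit:
# assumes R >= 3.6; asplit(temp, 1) yields one list element per row of temp
lapply(asplit(temp, 1), as.vector)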

Fast calculations of the Pareto front in R

So I am trying to calculate the Pareto front (http://en.wikipedia.org/wiki/Pareto_efficiency) in R and am able to do it, but not efficiently. In particular, the computations slow down considerably as the number of points increases.
In general, what I want to do is check for all non-dominated (or dominated) points. The way I have been doing this is to flag every point (xi, yi) for which some other point has both a smaller x and a smaller y. This part works very fast and is easy to implement. However, multiple points may share the same x value while having different y values, in which case I want to keep only the one with the lowest y value (and vice versa for points that have identical y values but different x values).
To illustrate the point, here is a picture from Wikipedia (the red line marks the Pareto front):
Basically, I want to identify all the points that lie on the red line.
Here is my code that does work but is very inefficient for large datasets:
# Example data that actually runs quickly
x <- runif(10000)
y <- runif(10000)
pareto <- 1:length(x)
for (i in 1:length(x)) {
  cond1 <- y[i] != min(y[which(x == x[i])])
  cond2 <- x[i] != min(x[which(y == y[i])])
  for (n in 1:length(x)) {
    if ((x[i] > x[n] & y[i] > y[n]) | (x[i] == x[n] & cond1) | (y[i] == y[n] & cond2)) {
      pareto[i] <- NA
      break
    }
  }
}
# All points not on the red line should be marked as NA in the pareto variable
The slowdown definitely comes from evaluating (x[i]==x[n] & cond1) | (y[i]==y[n] & cond2), but I cannot find a way around it or a better Boolean expression to capture everything I want. Any suggestions are greatly appreciated!
Following @BrodieG's answer: after sorting by x (with y as a tie-break), a point is on the front exactly when its y value is a new running minimum, which is what !duplicated(cummin(D$y)) picks out.
system.time( {
  d <- data.frame(x, y)
  D <- d[order(d$x, d$y, decreasing = FALSE), ]
  front <- D[which(!duplicated(cummin(D$y))), ]
} )
user system elapsed
0.02 0.00 0.02
which is 0.86/0.02 = 43 times faster!
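To visualize the result in the spirit of the Wikipedia figure (an illustrative sketch, not part of the original answer):
plot(d$x, d$y, pch = 20, col = "grey")           # all points
points(front$x, front$y, col = "red", pch = 20)  # the Pareto front (the "red line")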
EDIT: new version:
system.time( {
  pareto.2 <- logical(length(x))
  x.sort <- sort(x)
  y.sort <- y[order(x)]
  y.min <- max(y)
  for (i in 1:length(x.sort)) {
    if (pareto.2[i] <- y.sort[i] <= y.min) y.min <- y.sort[i]
  }
} )
# user system elapsed
# 0.036 0.000 0.035
OLD VERSION:
This is about 6x faster on my system. You can probably do better with a better algorithm, as well as with Rcpp, but this was straightforward. The trick here is to sort by x: a point is then on the frontier exactly when every earlier point in the sorted order has a y value at least as large, so the check reduces to looking at the prior y values.
system.time( {
  pareto.2 <- logical(length(x))
  x.sort <- sort(x)
  y.sort <- y[order(x)]
  for (i in 1:length(x.sort)) {
    pareto.2[i] <- all(y.sort[1:i] >= y.sort[i])
  }
} )
# user system elapsed
# 0.86 0.00 0.88
The original:
pareto <- 1:length(x)
system.time(
  for (i in 1:length(x)) {
    cond1 <- y[i] != min(y[which(x == x[i])])
    cond2 <- x[i] != min(x[which(y == y[i])])
    for (n in 1:length(x)) {
      if ((x[i] > x[n] & y[i] > y[n]) | (x[i] == x[n] & cond1) | (y[i] == y[n] & cond2)) {
        pareto[i] <- NA
        break
      }
    }
  }
)
# user system elapsed
# 5.32 0.00 5.33
And showing the two methods produce the same result (a bit tricky because I need to re-order pareto.2 to the original order of x):
all.equal(pareto.2[match(1:length(x), order(x))], !is.na(pareto))
# [1] TRUE
I wanted to share my solution as a function. It's been tested and works pretty well for N Pareto fronts. Set fronts = Inf to calculate all fronts.
pareto_front <- function(x, y, fronts = 1, sort = TRUE) {
  stopifnot(length(x) == length(y))
  d <- data.frame(x, y)
  Dtemp <- D <- d[order(d$x, d$y, decreasing = FALSE), ]
  df <- data.frame()
  i <- 1
  while (nrow(Dtemp) >= 1 & i <= max(fronts)) {
    these <- Dtemp[which(!duplicated(cummin(Dtemp$y))), ]
    these$pareto_front <- i
    df <- rbind(df, these)
    Dtemp <- Dtemp[!row.names(Dtemp) %in% row.names(these), ]
    i <- i + 1
  }
  ret <- merge(x = d, y = df, by = c("x", "y"), all.x = TRUE, sort = sort)
  return(ret)
}
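For example, a small usage sketch on the x and y vectors from the question (points not on any of the requested fronts get NA in the pareto_front column):
pf <- pareto_front(x, y, fronts = 1)
head(pf[!is.na(pf$pareto_front), ])  # the points on the first (red-line) front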

Collecting an unknown number of results in a loop

What is the idiomatic way to collect results in a loop in R if the number of final results is not known beforehand? Here's a toy example:
results <- vector('integer')
i <- 1L
while (i < bigBigBIGNumber) {
  if (someCondition(i)) results <- c(results, i)
  i <- i + 1
}
results
The problem with this example is that (I assume) it will have quadratic complexity as the vector needs to be re-allocated at every append. (Is this correct?) I'm looking for a solution that avoids this.
I found Filter, but it requires pre-generating 1:bigBigBIGNumber which I want to avoid to conserve memory. (Question: does for (i in 1:N) also pre-generate 1:N and keep it in memory?)
I could make something like a linked list like this:
results <- list()
i <- 1L
while (i < bigBigBIGNumber) {
  if (someCondition(i)) results <- list(results, i)
  i <- i + 1
}
unlist(results)
(Note that this is not concatenation. It's building a structure like list(list(list(1),2),3), then flattening with unlist.)
Is there a better way than this? What is the idiomatic way that's typically used? (I am very new to R.) I'm looking for suggestions on how to tackle this type of problem. Suggestions for both compact (easy-to-write) and fast code are most welcome, but I'd like to focus on fast and memory-efficient code.
Here is an algorithm that doubles the size of the output list as it fills up, achieving roughly linear computation times, as the benchmark tests below show:
test <- function(bigBigBIGNumber = 1000) {
  n <- 10L
  results <- vector("list", n)
  m <- 0L
  i <- 1L
  while (i < bigBigBIGNumber) {
    if (runif(1) > 0.5) {
      m <- m + 1L
      results[[m]] <- i
      if (m == n) {
        results <- c(results, vector("list", n))
        n <- n * 2L
      }
    }
    i <- i + 1L
  }
  unlist(results)
}
system.time(test(1000))
# user system elapsed
# 0.008 0.000 0.008
system.time(test(10000))
# user system elapsed
# 0.090 0.002 0.093
system.time(test(100000))
# user system elapsed
# 0.885 0.051 0.936
system.time(test(1000000))
# user system elapsed
# 9.428 0.339 9.776
Presumably there's a maximum size you're willing to tolerate; pre-allocate and fill up to that level, then trim if necessary. This avoids the risk of not being able to satisfy a request to double in size even when only a small additional amount of memory is required; it fails early, and involves only one rather than log(n) re-allocations. Here's a function that takes a maximum size, a generating function, and a token that the generating function returns when there is nothing left to generate. We get up to n results before returning.
filln <-
    function(n, FUN, ..., RESULT_TYPE = "numeric", DONE_TOKEN = NA_real_)
{
  results <- vector(RESULT_TYPE, n)
  i <- 0L
  while (i < n) {
    ans <- FUN(..., DONE_TOKEN = DONE_TOKEN)
    if (identical(ans, DONE_TOKEN))
      break
    i <- i + 1L
    results[[i]] <- ans
  }
  if (i == n)
    warning("intolerably large result")
  else length(results) <- i
  results
}
Here's a generator
fun <- function(thresh, DONE_TOKEN) {
  x <- rnorm(1)
  if (x > thresh) DONE_TOKEN else x
}
and in action
> set.seed(123L); length(filln(10000, fun, 3))
[1] 163
> set.seed(123L); length(filln(10000, fun, 4))
[1] 10000
Warning message:
In filln(10000, fun, 4) : intolerably large result
> set.seed(123L); length(filln(100000, fun, 4))
[1] 23101
We can benchmark the overhead, approximately, by comparing to something that knows in advance how much space is required
f1 <- function(n, FUN, ...) {
  i <- 0L
  result <- numeric(n)
  while (i < n) {
    i <- i + 1L
    result[i] <- FUN(...)
  }
  result
}
Here we check the timing and value of a single result
> set.seed(123L); system.time(res0 <- filln(100000, fun, 4))
user system elapsed
0.944 0.000 0.948
> set.seed(123L); system.time(res1 <- f1(23101, fun, 4))
user system elapsed
0.688 0.000 0.689
> identical(res0, res1)
[1] TRUE
which for this example is of course eclipsed by the simple vector solution(s)
set.seed(123L); system.time(res2 <- rnorm(23101))
identical(res0, res2)
If you can't compute 1:bigBigBIGNumber, count the entries, create the vector, then populate it.
num <- 0L
i <- 0L
while (i < bigBigBIGNumber) {
  if (someCondition(i)) num <- num + 1L
  i <- i + 1L
}
result <- integer(num)
num <- 0L
i <- 0L  # reset the index before the second pass
while (i < bigBigBIGNumber) {
  if (someCondition(i)) {
    num <- num + 1L
    result[num] <- i
  }
  i <- i + 1L
}
(This code is not tested.)
If you can compute 1:bigBigBIGNumber, this will also work:
I assume that you want to call a function, and not simply tack on the indices themselves (this also assumes someCondition is vectorized). Something like this may be closer to what you want:
values <- seq(bigBigBIGNumber)
sapply(values[someCondition(values)], my_function)
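For instance, with hypothetical stand-ins for someCondition and my_function (neither is defined in the question), the pattern looks like this:
values <- seq(100)
someCondition <- function(v) v %% 7 == 0  # hypothetical vectorized predicate
my_function <- function(v) v^2            # hypothetical per-element function
sapply(values[someCondition(values)], my_function)  # squares of the multiples of 7 up to 100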
Closer to the second approach you listed:
results <- list()
for (i in ...) {
  ...
  results[[i]] <- ...
}
Note that i does not need to be an integer; it could be a character, etc.
Also, you can use results[[length(results) + 1]] <- ... to append if needed, but if you already have an iterator you probably won't need to.

Efficient calculation of matrix cumulative standard deviation in r

I recently posted this question on the r-help mailing list but got no answers, so I thought I would post it here as well and see if there were any suggestions.
I am trying to calculate the cumulative standard deviation of a matrix. I want a function that accepts a matrix and returns a matrix of the same size where output cell (i,j) is set to the standard deviation of input column j between rows 1 and i. NAs should be ignored, unless cell (i,j) of the input matrix itself is NA, in which case cell (i,j) of the output matrix should also be NA.
I could not find a built-in function, so I implemented the following code. Unfortunately, this uses a loop that ends up being somewhat slow for large matrices. Is there a faster built-in function or can someone suggest a better approach?
cumsd <- function(mat)
{
  retval <- mat * NA
  # per-column sd of rows 1..i (apply keeps the calculation column-wise)
  for (i in 2:nrow(mat)) retval[i, ] <- apply(mat[1:i, , drop = FALSE], 2, sd, na.rm = TRUE)
  retval[is.na(mat)] <- NA
  retval
}
Thanks.
You can use cumsum to compute the necessary running sums and plug them into the direct formula for the variance/sd, turning everything into vectorized operations on the matrix. With nn non-NA values in rows 1..i, the running variance is (cumsum(x^2) - cumsum(x)^2 / nn) / (nn - 1), which is exactly what cum_var computes:
cumsd_mod <- function(mat) {
  cum_var <- function(x) {
    ind_na <- !is.na(x)
    nn <- cumsum(ind_na)
    x[!ind_na] <- 0
    cumsum(x^2) / (nn - 1) - (cumsum(x))^2 / (nn - 1) / nn
  }
  v <- sqrt(apply(mat, 2, cum_var))
  v[is.na(mat) | is.infinite(v)] <- NA
  v
}
just for comparison:
set.seed(2765374)
X <- matrix(rnorm(1000),100,10)
X[cbind(1:10,1:10)] <- NA # to have some NA's
all.equal(cumsd(X),cumsd_mod(X))
# [1] TRUE
And about timing:
X <- matrix(rnorm(100000),1000,100)
system.time(cumsd(X))
# user system elapsed
# 7.94 0.00 7.97
system.time(cumsd_mod(X))
# user system elapsed
# 0.03 0.00 0.03
Another try (Marek's is faster)
cumsd2 <- function(y) {
  n <- nrow(y)
  apply(y, 2, function(i) {
    Xmeans <- lapply(1:n, function(z) rep(sum(i[1:z]) / z, z))
    Xs <- sapply(1:n, function(z) i[1:z])
    sapply(2:n, function(z) sqrt(sum((Xs[[z]] - Xmeans[[z]])^2, na.rm = TRUE) / (z - 1)))
  })
}
