Correlation with sliding window - r

I want to compute a Pearson correlation between two vectors (four elements each) using a sliding window that moves one position at a time, and keep the best result:
list1 <- read.table(text= "20
34
89
35")
list2 <- read.table(text= "22
99
313
13
71
200")
The comparison will be a loop on:
cor(x=c(20,34,89,35),y=c(22,99,313,13), method = "pearson")
cor(x=c(20,34,89,35),y=c(99,313,13,71), method = "pearson")
cor(x=c(20,34,89,35),y=c(313,13,71,200), method = "pearson")
The result should contain the vectors that give the highest correlation together with the score itself. In this case it will be x=c(20,34,89,35), y=c(22,99,313,13), and 0.9588095.

Using rollapply, compute the correlations, find the index of the largest one, and derive y and its correlation with x from that.
library(zoo)
x <- list1$V1
w <- length(x)   # window width = length of x
## correlate x with every length-w window of list2 and take the index of the best one
ix <- which.max(rollapply(list2$V1, w, cor, x))
y <- list2$V1[seq(ix, length = w)]
y
## [1] 22 99 313 13
cor(x, y)
## [1] 0.9588095
A variation of the above is to return the correlation and the y vector from rollapply:
## each row of r holds the correlation followed by the window itself
r <- rollapply(list2$V1, length(x), function(y) c(cor(x, y), y))
ix <- which.max(r[, 1])
r[ix, 1]
## [1] 0.9588095
r[ix, -1]
## [1] 22 99 313 13

A base R solution
out <- list(NULL)
j <- 1
ind <- 0
while (ind[length(ind)] < length(list2$V1)) {
  ind <- j:(j + 3)
  out[[j]] <- list(Vector1 = list1$V1,
                   Vector2 = list2$V1[ind],
                   Cor = cor(list1$V1, list2$V1[ind]))
  j <- j + 1
}
out[[which.max(unlist(sapply(out, "[", "Cor")))]]
which produces:
$Vector1
[1] 20 34 89 35
$Vector2
[1] 22 99 313 13
$Cor
[1] 0.9588095
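The extraction in the last line works because sapply(out, "[", "Cor") pulls the one-element "Cor" sub-list out of every window, and unlist() flattens those into a numeric vector for which.max(). A more explicit equivalent (a sketch, not from the original answer):
## extract each window's correlation explicitly, then pick the best window
cors <- vapply(out, function(el) el$Cor, numeric(1))
out[[which.max(cors)]]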

Related

Alternative to for loop for fast calculations when equations depend on each other

I am using a for-loop to do step-by-step calculations where several equations depend on each other. Because of this dependence, I cannot find a solution where I do the calculations inside a dataframe. My main motivation is to speed up the calculations when the Time vector in the reprex below is very large.
Could you please suggest alternatives to the following for-loop based calculations, preferably inside a dataframe in R? The only thing I can think of is using a for-loop in Rcpp.
Reproducible Example
last_time <- 10
STEP = 1
Time <- seq(from = 0, to = last_time, by = STEP)
## empty vectors
eq1 <- vector(mode = "double", length = length(Time))
eq2 <- vector(mode = "double", length = length(Time))
eq <- vector(mode = "double", length = length(Time))
eq3 <- vector(mode = "double", length = length(Time))
eq4 <- vector(mode = "double", length = length(Time))
## adding the first values
eq1[1] <- 25
eq2[1] <- 25
eq[1] <- 25
eq3[1] <- 100
eq4[1] <- 2
for (t in 2:length(Time)) {
  ## eq1
  eq1[t] <- eq[t-1] + (2.5 * STEP * (1 - (eq[t-1])/25))
  ## eq2
  eq2[t] <- (-2 * STEP) + ((-2^2) * (STEP^2)) - (2 * eq3[t-1]) - (eq[t-1] * STEP)
  ## min.
  eq[t] <- min(eq1[t], eq2[t])
  ## eq3
  eq3[t] <- (eq[t] - eq[t-1])/(STEP)
  ## eq4
  eq4[t] <- eq4[t-1] + (eq[t-1] * STEP) + (0.5 * eq3[t-1] * (STEP)^2)
}
Output:
my_data <- data.frame(Time, eq1, eq2, eq, eq3, eq4)
my_data
#> Time eq1 eq2 eq eq3 eq4
#> 1 0 25.00000 25.00000 25.00000 -256.00000 2.0000
#> 2 1 25.00000 -231.00000 -231.00000 25.60000 -101.0000
#> 3 2 -205.40000 225.00000 -205.40000 23.04000 -319.2000
#> 4 3 -182.36000 199.40000 -182.36000 20.73600 -513.0800
#> 5 4 -161.62400 176.36000 -161.62400 18.66240 -685.0720
#> 6 5 -142.96160 155.62400 -142.96160 16.79616 -837.3648
#> 7 6 -126.16544 136.96160 -126.16544 15.11654 -971.9283
#> 8 7 -111.04890 120.16544 -111.04890 13.60489 -1090.5355
#> 9 8 -97.44401 105.04890 -97.44401 12.24440 -1194.7819
#> 10 9 -85.19961 91.44401 -85.19961 11.01996 -1286.1037
#> 11 10 -74.17965 79.19961 -74.17965 0.00000 -1365.7934
Created on 2021-02-28 by the reprex package (v1.0.0)
You could define a recursive function. A loop is faster than recursion though.
g <- function(m, STEP, time, x=2) {
  if (time == 0) m
  else {
    ## eq1
    m[x, 2] <- m[x - 1, 1] + 2.5*STEP*(1 - (m[x - 1, 1])/25)
    ## eq2
    m[x, 3] <- -2*STEP + -2^2*STEP^2 - 2*m[x - 1, 4] - m[x - 1, 1]*STEP
    ## min.
    m[x, 1] <- min(m[x, 2], m[x, 3])
    ## eq3
    m[x - 1, 4] <- (m[x, 1] - m[x - 1, 1])/STEP
    ## eq4
    m[x, 5] <- m[x - 1, 5] + m[x - 1, 1]*STEP + 0.5*m[x - 1, 4]*STEP^2
    g(m, STEP, time - 1, x + 1)
  }
}
Usage
last_time <- 10; STEP <- 1
First <- c(eq0=25, eq1=25, eq2=25, eq3=100, eq4=2)
m <- matrix(0, last_time + 1, length(First), dimnames=list(NULL, names(First)))
m[1, ] <- First
g(m, STEP, last_time)
# eq0 eq1 eq2 eq3 eq4
# [1,] 25.00000 25.00000 25.00000 -256.00000 2.0000
# [2,] -231.00000 25.00000 -231.00000 25.60000 -101.0000
# [3,] -205.40000 -205.40000 225.00000 23.04000 -319.2000
# [4,] -182.36000 -182.36000 199.40000 20.73600 -513.0800
# [5,] -161.62400 -161.62400 176.36000 18.66240 -685.0720
# [6,] -142.96160 -142.96160 155.62400 16.79616 -837.3648
# [7,] -126.16544 -126.16544 136.96160 15.11654 -971.9283
# [8,] -111.04890 -111.04890 120.16544 13.60489 -1090.5355
# [9,] -97.44401 -97.44401 105.04890 12.24440 -1194.7819
# [10,] -85.19961 -85.19961 91.44401 11.01996 -1286.1037
# [11,] -74.17965 -74.17965 79.19961 0.00000 -1365.7934
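To back up the claim that a loop is faster than recursion, one could write the same update as a plain for-loop and time both; the following is only a sketch of mine (not part of the answer), reusing m, STEP and last_time as defined above.
## same update as g(), but iterative
g_loop <- function(m, STEP, time) {
  for (x in 2:(time + 1)) {
    m[x, 2] <- m[x - 1, 1] + 2.5*STEP*(1 - m[x - 1, 1]/25)
    m[x, 3] <- -2*STEP + -2^2*STEP^2 - 2*m[x - 1, 4] - m[x - 1, 1]*STEP
    m[x, 1] <- min(m[x, 2], m[x, 3])
    m[x - 1, 4] <- (m[x, 1] - m[x - 1, 1])/STEP
    m[x, 5] <- m[x - 1, 5] + m[x - 1, 1]*STEP + 0.5*m[x - 1, 4]*STEP^2
  }
  m
}
## e.g. microbenchmark::microbenchmark(g(m, STEP, last_time), g_loop(m, STEP, last_time))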
As you asked how it works: the recursive filter function of stats::filter can be used with mapply as follows:
dataframe <- mapply(stats::filter,
                    dataframe,
                    filter = vector,
                    method = "recursive")
where vector is, e.g., c(25), which could be your first value eq1[1] <- 25.
The recursive filter works like a recursive loop but is a bit more elegant. For a column whose first values are 20, 30, 40 and a filter vector of c(25), the recursive filter would compute:
row or timepoint 1: 20
row or timepoint 2: 30 + (20 * 25)
row or timepoint 3: 40 + (((20 * 25) + 30) * 25)
It calculates the value in the first row and carries it into the next, where it is multiplied by the filter vector. If you play around with stats::filter and the recursive method you may well get the same result. It is a row-based calculation over time, similar to Rcpp but more flexible.
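A minimal, self-contained illustration of the recursive filter (my example, not the answer's; it reuses the 20, 30, 40 values from the table above but with a filter of 0.5 to keep the numbers small): each output value is the current input plus 0.5 times the previous output.
## y[i] = x[i] + 0.5 * y[i-1], starting from y[0] = 0
x <- c(20, 30, 40)
as.numeric(stats::filter(x, filter = 0.5, method = "recursive"))
## [1] 20 40 60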

R: Find set of columns which contain most 1s in matrix of 0 and 1

I have a matrix of 1s and 0s where the rows are individuals and the columns are events. A 1 indicates that an event happened to an individual and a 0 that it did not.
I want to find the set of (in the example) 5 columns/events that covers the most rows/individuals.
Test Data
#Make test data
set.seed(123)
d <- sapply(1:300, function(x) sample(c(0,1), 30, T, c(0.9,0.1)))
colnames(d) <- 1:300
rownames(d) <- 1:30
My attempt
My initial attempt was just based on combining the set of 5 columns with the highest colMeans:
#Get top 5 columns with highest row coverage
col_set <- head(sort(colMeans(d), decreasing = T), 5)
#Have a look the set
col_set
>
197 199 59 80 76
0.2666667 0.2666667 0.2333333 0.2333333 0.2000000
#Check row coverage of the column set
sum(apply(d[,colnames(d) %in% names(col_set)], 1, sum) > 0) / 30 #top 5
>
[1] 0.7
However, this set does not cover the most rows. I tested this by pseudo-randomly sampling 10,000 different sets of 5 columns and then finding the set with the highest coverage:
#Get 5 random columns using colMeans as prob in sample
##Random sample 10,000 times
set.seed(123)
result <- lapply(1:10000, function(x){
  col_set2 <- sample(colMeans(d), 5, F, colMeans(d))
  cover <- sum(apply(d[,colnames(d) %in% names(col_set2)], 1, sum) > 0) / 30 #random 5
  list(set = col_set2, cover = cover)
})
##Have a look at the best set
result[which.max(sapply(result, function(x) x[["cover"]]))]
>
[[1]]
[[1]]$set
59 169 262 68 197
0.23333333 0.10000000 0.06666667 0.16666667 0.26666667
[[1]]$cover
[1] 0.7666667
The reason for supplying the colMeans to sample is that the columns with the highest coverages are the ones I am most interested in.
So, using pseudo-random sampling I can collect a set of columns with higher coverage than when just using the top 5 columns. However, since my actual data sets are larger than the example, I am looking for a more efficient and rational way of finding the set of columns with the highest coverage.
EDIT
For the interested, I decided to microbenchmark the 3 solutions provided:
#Defining G. Grothendieck's coverage function outside his solutions
coverage <- function(ix) sum(rowSums(d[, ix]) > 0) / 30

#G. Grothendieck top solution
solution1 <- function(d){
  cols <- tail(as.numeric(names(sort(colSums(d)))), 20)
  co <- combn(cols, 5)
  itop <- which.max(apply(co, 2, coverage))
  co[, itop]
}

#G. Grothendieck "Older solution"
solution2 <- function(d){
  require(lpSolve)
  ones <- rep(1, 300)
  res <- lp("max", colSums(d), t(ones), "<=", 5, all.bin = TRUE, num.bin.solns = 10)
  m <- matrix(res$solution[1:3000] == 1, 300)
  cols <- which(rowSums(m) > 0)
  co <- combn(cols, 5)
  itop <- which.max(apply(co, 2, coverage))
  co[, itop]
}

#user2554330 solution
bestCols <- function(d, n = 5) {
  result <- numeric(n)
  for (i in seq_len(n)) {
    result[i] <- which.max(colMeans(d))
    d <- d[d[,result[i]] != 1,, drop = FALSE]
  }
  result
}
#Benchmarking...
microbenchmark::microbenchmark(solution1 = solution1(d),
                               solution2 = solution2(d),
                               solution3 = bestCols(d), times = 10)
>
Unit: microseconds
      expr        min         lq       mean      median         uq       max neval
 solution1 390811.850 497155.887 549314.385 578686.3475 607291.286 651093.16    10
 solution2  55252.890  71492.781  84613.301  84811.7210  93916.544 117451.35    10
 solution3    425.922    517.843   3087.758    589.3145    641.551  25742.11    10
This looks like a relatively hard optimization problem, because of the ways columns interact. An approximate strategy would be to pick the column with the highest mean; then delete the rows with ones in that column, and repeat. You won't necessarily find the best solution this way, but you should get a fairly good one.
For example,
set.seed(123)
d <- sapply(1:300, function(x) sample(c(0,1), 30, T, c(0.9,0.1)))
colnames(d) <- 1:300
rownames(d) <- 1:30
bestCols <- function(d, n = 5) {
  result <- numeric(n)
  for (i in seq_len(n)) {
    result[i] <- which.max(colMeans(d))
    d <- d[d[,result[i]] != 1,, drop = FALSE]
  }
  cat("final dim is ", dim(d))
  result
}
col_set <- bestCols(d)
sum(apply(d[,colnames(d) %in% col_set], 1, sum) > 0) / 30 #top 5
This gives 90% coverage.
The following provides a heuristic to find an approximate solution. Take the N = 20 columns, say, with the most ones (cols below), and then use brute force to evaluate every subset of 5 columns out of those 20. The subset having the highest coverage is shown below; its coverage is 93.3%.
coverage <- function(ix) sum(rowSums(d[, ix]) > 0) / 30
N <- 20
cols <- tail(as.numeric(names(sort(colSums(d)))), N)
co <- combn(cols, 5)
itop <- which.max(apply(co, 2, coverage))
co[, itop]
## [1] 90 123 197 199 286
coverage(co[, itop])
## [1] 0.9333333
Repeating this for N = 5, 10, 15 and 20 gives coverages of 83.3%, 86.7%, 90% and 93.3%. The larger N is, the better the coverage, but the smaller N is, the shorter the run time.
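One way to reproduce that sweep over N is to wrap the heuristic in sapply; this is only a sketch, assuming d and coverage() as defined above, and the values quoted are the ones reported in the answer (ties in colSums may shift them slightly).
## coverage of the best 5-column subset for several values of N
sapply(c(5, 10, 15, 20), function(N) {
  cols <- tail(as.numeric(names(sort(colSums(d)))), N)
  co <- combn(cols, 5)
  max(apply(co, 2, coverage))
})
## approximately 0.833 0.867 0.900 0.933 (per the answer)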
Older solution
We can approximate the problem with a knapsack problem that chooses the 5 columns with the largest numbers of ones, using integer linear programming.
We get the 10 best solutions to this approximate problem and collect all columns that appear in at least one of them. There are 14 such columns, and we then use brute force to find which subset of 5 of those 14 has the highest coverage.
library(lpSolve)
ones <- rep(1, 300)
res <- lp("max", colSums(d), t(ones), "<=", 5, all.bin = TRUE, num.bin.solns = 10)
coverage <- function(ix) sum(rowSums(d[, ix]) > 0) / 30
# each column of m is logical 300-vector defining possible soln
m <- matrix(res$solution[1:3000] == 1, 300)
# cols is the set of columns which are in any of the 10 solutions
cols <- which(rowSums(m) > 0)
length(cols)
## [1] 14
# use brute force to find the 5 best columns among cols
co <- combn(cols, 5)
itop <- which.max(apply(co, 2, coverage))
co[, itop]
## [1] 90 123 197 199 286
coverage(co[, itop])
## [1] 0.9333333
You can test whether there is a better column and exchange it with one currently in the selection.
n <- 5 #Number of columns / events
i <- rep(1, n)
for(k in 1:10) { #How many times to iterate
  tt <- i
  for(j in seq_along(i)) {
    x <- +(rowSums(d[,i[-j]]) > 0)
    i[j] <- which.max(colSums(x == 0 & d == 1))
  }
  if(identical(tt, i)) break
}
sort(i)
#[1] 90 123 197 199 286
mean(rowSums(d[,i]) > 0)
#[1] 0.9333333
Taking into account that the initial condition influences the result, you can use random starts.
n <- 5 #Number of columns / events
x <- apply(d, 2, function(x) colSums(x == 0 & d == 1))
diag(x) <- -1
idx <- which(!apply(x==0, 1, any))
x <- apply(d, 2, function(x) colSums(x != d))
diag(x) <- -1
x[upper.tri(x)] <- -1
idx <- unname(c(idx, which(apply(x==0, 1, any))))
res <- sample(idx, n)
for(l in 1:100) {
  i <- sample(idx, n)
  for(k in 1:10) { #How many times to iterate
    tt <- i
    for(j in seq_along(i)) {
      x <- +(rowSums(d[,i[-j]]) > 0)
      i[j] <- which.max(colSums(x == 0 & d == 1))
    }
    if(identical(tt, i)) break
  }
  if(sum(rowSums(d[,i]) > 0) > sum(rowSums(d[,res]) > 0)) res <- i
}
sort(res)
#[1] 90 123 197 199 286
mean(rowSums(d[,res]) > 0)
#[1] 0.9333333

How to code elementary symmetric polynomials in R

I want to program a function in R that computes the elementary symmetric polynomials. For i = 0, 1, ..., p, the i-th elementary symmetric polynomial is given by
$$e_i(x_1,\dots,x_p) = \sum_{1 \le j_1 < j_2 < \cdots < j_i \le p} x_{j_1} x_{j_2} \cdots x_{j_i}, \qquad e_0 = 1.$$
How can I code this function in R? I've tried
x<-c(1,2,3,4)
crossprod(x)
# or
for (i in 1:length(x)) print(crossprod((combn(x,i))))
but I don't get the desired result, which is supposed to give
e0= 1
e1= 10
e2= 35
e3= 50
e4= 24
Take the product of each combination using combn(x, k, prod) and then sum that:
sympoly <- function(k, x) sum(combn(x, k, prod))
sapply(0:4, sympoly, 1:4)
## [1] 1 10 35 50 24
The solution is not crossprod, it's combn/prod followed by sum.
elSymPoly <- function(x){
  sapply(c(0, seq_along(x)), function(n){
    sum(apply(combn(x, n), 2, prod))
  })
}
x <- c(1, 2, 3, 4)
elSymPoly(x)
#[1] 1 10 35 50 24
Note that the function also works with an empty vector (but not with NULL).
y <- integer(0)
elSymPoly(y)
#[1] 1
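A quick sanity check (my addition, not part of the answers): the generating function identity prod_i(1 + t*x_i) = sum_k e_k * t^k, evaluated at t = 1, implies that the e_k must add up to prod(1 + x).
## the e_k of x = 1:4 should sum to prod(1 + x) = 2*3*4*5 = 120
sum(elSymPoly(c(1, 2, 3, 4))) == prod(1 + c(1, 2, 3, 4))
## [1] TRUE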

Take the first unique value from a function

This is my function:
g <- function(x,y){
  x <- (x-y):x
  y <- 1:30 # ------> (y is always fixed 1:30)
  z <- outer(x,y,fv) # ----> (fv is a previous function)
  s <- colSums(z)
  which(s==max(s),arr.ind=T)
}
It tells me the position of the max value in s. I basically have a problem in choosing y because given a small y, the max(s) appears more than once in s. For example:
#given x=53
> g(53,1)
[1] 13 16 20 22 25 26 27
> g(53,2)
[1] 20 25 26
> g(53,3)
[1] 20 25 26
> g(53,4)
[1] 20 25 26
> g(53,5)
[1] 20 25
> g(53,6)
[1] 25 -----> This is the only result I would like from my function (right y=6)
Another example:
# given x=71
> g(71,1)
[1] 7 9 14
> g(71,2)
[1] 7 14
> g(71,3)
[1] 14 -----> my desired result (right y=3)
Therefore, I would like a function that returns the first unique solution for the smallest possible y (e.g. g(53)=25, g(71)=14, ...). Any help? Thanks
This is a simplified example. I hope it makes the question clearer:
#The idea is the same:
n <- 1:9
e <- rep(n, 500)
p <- sample(e) # ---> (need to sample so the matrix is mixed and maxima appear more often)
mat <- matrix(p, 90)
g <- function(x,y){
  x <- (x-y):x
  k <- rowSums(mat[,x])
  which(k==max(k), arr.ind=T)
}
#In my sample matrix :
k <- rowSums(mat[,44:45])
which(k==max(k), arr.ind=T)
[1] 44 71 90
#In fact
g(45,1)
[1] 44 71 90 # ---> more than one solution
g(45,2)
[1] 90 # ----> I would like to pick this value, which is the first unique solution given x=45
Therefore, I would like a function that returns the first unique solution for the smallest possible y given x (in this new example: g(45)=90, ...).
I got it. It is a bit long but I think it is right.
Considering the second simplified example:
g <- function(x,y){
  x <- (x-y):x
  k <- rowSums(mat[,x])
  q <- which(k==max(k), arr.ind=T)
  length(q)
}
gv <- Vectorize(g)
l <- function(x){
  y <- 1:30 # <- (up to 30 to be sure)
  z <- outer(x,y,gv)
  y <- which.min(z) # <- (min is surely length 1 and which.min takes the first)
  x <- (x-y):x
  k <- rowSums(mat[,x])
  q <- which(k==max(k), arr.ind=T)
  q
}
l(45)
[1] 90
It seems like you could just do this with a recursive function. Consider the following:
set.seed(42)
n = 1:9
e = rep(n, 500)
p = sample(e)
mat = matrix(p, 90)
g <- function(x, y=1) {
  xv <- (x-y):x
  k <- rowSums(mat[, xv])
  i <- which(k == max(k), arr.ind=T)
  n <- length(i)
  if (n == 1) {
    return(y) # want to know the min y that solves the problem, right?
  } else {
    y <- y + 1 # increase y by 1
    g(x,y) # run our function again with a new value of y
  }
}
You should now be able to run g(45) and get 1 as the result, since that is the value of y that solves the problem, and g(33) to get 2.
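Note that this version returns the minimal y rather than the winning row itself. If the unique row index is wanted as well, the question's own logic can be reused with the y found above (a sketch, assuming mat from the example):
## recover the unique winning row once the minimal y is known
y <- g(45)
k <- rowSums(mat[, (45 - y):45])
which(k == max(k))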

vectorize head(which(t > x), n=1) for many values of x

I have a situation similar to the following in R:
t <- (1:100) * 15
x <- c(134, 552, 864, 5000)
For each value in x, I want to find the first index in t where t > x. The following works using a loop:
y <- numeric(length(x))
for (i in 1:length(x))
  y[i] <- which(t > x[i])[1]
# y
# [1] 9 37 58 NA
I was taught that loops in R are 'bad and slow', and while the time this takes to run for a reasonably large x is not a deal-breaker, I would like to know whether there is a better way?
If the objects are not too big (so that RAM is not limiting), you don't need *apply functions, which are just hidden loops.
temp <- outer(x, t, '<')              # temp[i, j] is TRUE when x[i] < t[j]
y <- length(t) - (rowSums(temp) - 1)  # since t is increasing, this is the first index with t > x
y[y > length(t)] <- NA                # no element of t exceeds x -> NA
y
#[1] 9 37 58 NA
fun <- function(x){
  which(t > x)[1]
}
sapply(x, fun)
## [1] 9 37 58 NA
Almost the same:
require(functional)
apply(matrix(t > rep(x, each=length(t)), length(t)), 2, Compose(which, Curry(append, Inf), min))
## [1] 9 37 58 Inf
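The same idea without the functional package (my sketch, not the answer's code): append Inf so that min() still has a value to return when no element of t exceeds x.
apply(outer(t, x, ">"), 2, function(col) min(which(col), Inf))
## [1] 9 37 58 Inf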
