What is the fastest way to do this summation in R?
This is what I have so far
ans = 0
for (i in 1:dimx[1]){
  for (j in 1:dimx[2]){
    ans = ans + ((x[i,j] - parameters$mu)^2)/(parameters$omega_2[i]*parameters$sigma_2[j])
  }
}
where omega_2 and sigma_2 are omega^2 and sigma^2, respectively.
Nothing fancy:
# sample data
m <- matrix(1:20, 4)
sigma <- 1:ncol(m)
omega <- 1:nrow(m)
mu <- 2
sum(((m - mu) / outer(omega, sigma))^2)
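If you want to check that this matches the original double loop, here is a quick sketch (the parameters list below just mimics the structure used in the question):

parameters <- list(mu = mu, omega_2 = omega^2, sigma_2 = sigma^2)
ans <- 0
for (i in 1:nrow(m)) {
  for (j in 1:ncol(m)) {
    # same term as in the question's loop
    ans <- ans + ((m[i, j] - parameters$mu)^2)/(parameters$omega_2[i]*parameters$sigma_2[j])
  }
}
all.equal(ans, sum(((m - mu) / outer(omega, sigma))^2))
# [1] TRUE

This works because (a^2)/(omega^2 * sigma^2) is the same as (a/(omega*sigma))^2.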
Usually it is quite easy to vectorize this kind of operation. It is a bit trickier here because n is not equal to m, and because of the double summation. But here is how we can proceed:
# n = 3, m = 2
xs <- cbind(1:3, 4:6)
omegas <- 1:3
sigmas <- 1:2
mu <- 3
sum((t((xs - mu) / omegas) / sigmas)^2)
# [1] 5
Here we use recycling three times, with t() in between so that the division by sigmas hits the appropriate elements.
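If the recycling feels too implicit, an equivalent spelling uses sweep() (a sketch with the same xs, omegas, sigmas and mu as above):

# divide each row by its omega, then each column by its sigma
sum(sweep(sweep(xs - mu, 1, omegas, "/"), 2, sigmas, "/")^2)
# [1] 5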
I am having problems saving the results of a for loop.
I am computing a variance (this is not relevant, I think) and my code is:
library(dirmult)
n <- 50
p <- 20
size <- 5*p
prob_true <- rep(1/p, p)
multinom <- as.matrix(rmultinom(n, size, prob = prob_true))
zeros <- round(0.5*p*n)
a <- c(as.matrix(multinom))
a[sample(1:(p*n), zeros)] <- 0
data_zeros <- matrix(a, p, n)
dirmult <- dirmult(t(data_zeros))
alpha <- dirmult$gamma
sum_alpha <- (1-dirmult$theta)/dirmult$theta
for (j in ncol(data_zeros)){
  A <- alpha/sum_alpha
  B <- 1 - A
  N <- colSums(data_zeros)
  C <- 1 + sum_alpha
  var_s_dirm <- list()
  var_s_dirm[[j]] <- N[j]*A*B*((N[j]+sum_alpha)/C)
}
In particular, alpha is a vector with 20 values, sum_alpha is a scalar, data_zeros is my dataset with 20 rows and 50 columns, and N is the sum of each column of the dataset, so it is a vector with 50 values.
What I want to do seems very simple:
I want to get a list of 50 vectors, where each one differs from the others only in that it is multiplied by a different value of N.
I really hope that somebody can help me find the error.
The problem is (probably) twofold: you are re-creating the constants every time j increases, and in each step you clear the list with the line var_s_dirm <- list(). Note also that for (j in ncol(data_zeros)) loops over the single value 50 rather than over 1:50, so the loop body only runs once.
See if this works for you
library(dirmult)
n <- 50
p <- 20
size <- 5*p
prob_true <- rep(1/p, p)
multinom <- as.matrix(rmultinom(n, size, prob = prob_true))
zeros <- round(0.5*p*n)
a <- c(as.matrix(multinom))
a[sample(1:(p*n), zeros)] <- 0
data_zeros <- matrix(a, p, n)
dirmult <- dirmult(t(data_zeros))
alpha <- dirmult$gamma
sum_alpha <- (1-dirmult$theta)/dirmult$theta
A <- alpha/sum_alpha
B <- 1 - A
N <- colSums(data_zeros)
C <- 1 + sum_alpha
var_s_dirm <- list()
for (j in 1:ncol(data_zeros)){
  var_s_dirm[[j]] <- N[j]*A*B*((N[j]+sum_alpha)/C)
}
Output:
var_s_dirm
[[1]]
[1] 2.614833 2.327105 2.500483 3.047700 2.233528 2.130223 2.700103 2.869699 2.930213 2.575903 2.198459 2.846096
[13] 2.425448 3.517559 3.136266 2.565345 2.578267 2.763113 2.709707 3.420792
[[2]]
[1] 2.568959 2.286279 2.456615 2.994231 2.194343 2.092850 2.652732 2.819353 2.878806 2.530712 2.159889 2.796165
[13] 2.382897 3.455848 3.081244 2.520339 2.533034 2.714637 2.662168 3.360778
[[3]]
[1] 3.211199 2.857849 3.070769 3.742790 2.742930 2.616064 3.315916 3.524193 3.598509 3.163391 2.699862 3.495207
[13] 2.978622 4.319811 3.851556 3.150424 3.166294 3.393297 3.327711 4.200974
....
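As a side note, once the constants are hoisted out of the loop, the loop itself can be replaced by lapply (a sketch reusing A, B, N, C and sum_alpha from above):

# build the same list of 50 vectors without an explicit index
var_s_dirm <- lapply(N, function(Nj) Nj*A*B*((Nj + sum_alpha)/C))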
I need help with a code to generate random numbers according to constraints.
Specifically, I am trying to simulate random numbers ALFA and BETA from, respectively, a Normal and a Gamma distribution such that ALFA - BETA < 1.
Here is what I have written but it does not work at all.
set.seed(42)
n <- 0
repeat {
  n <- n + 1
  a <- rnorm(1, 10, 2)
  b <- rgamma(1, 8, 1)
  d <- a - b
  if (d < 1)
    alfa[n] <- a
  beta[n] <- b
  l = length(alfa)
  if (l == 10000) break
}
Due to vectorization, it will be faster to generate the numbers "all at once" rather than in a loop:
set.seed(42)
N = 1e5
a = rnorm(N, 10, 2)
b = rgamma(N, 8, 1)
d = a - b
alfa = a[d < 1]
beta = b[d < 1]
length(alfa)
# [1] 36436
This generated 100,000 candidates, 36,436 of which met your criterion. If you want to generate n samples, try setting N = 4 * n; you'll probably generate more than enough, and you can keep the first n.
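A sketch of that oversampling approach (with the acceptance rate around 0.36 here, 4 * n candidates are almost certainly enough):

n <- 10000
N <- 4 * n
a <- rnorm(N, 10, 2)
b <- rgamma(N, 8, 1)
keep <- which(a - b < 1)  # indices of successes, in order
alfa <- a[keep[1:n]]
beta <- b[keep[1:n]]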
Your loop has 2 problems: (a) you need curly braces to enclose multiple lines after an if statement, and (b) you are using n as an attempt counter when it should be a success counter. As written, your loop will only stop if the 10000th attempt happens to be a success. Move n <- n + 1 inside the if statement to fix this:
set.seed(42)
n <- 0
alfa = numeric(0)
beta = numeric(0)
repeat {
  a <- rnorm(1, 10, 2)
  b <- rgamma(1, 8, 1)
  d <- a - b
  if (d < 1) {
    n <- n + 1
    alfa[n] <- a
    beta[n] <- b
    l = length(alfa)
    if (l == 500) break
  }
}
But the first way is better... because this method "grows" alfa and beta inside the loop and generates the numbers one at a time, it takes longer to produce 500 numbers than the code above takes to produce 30,000.
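If you do want a loop, preallocating the result vectors removes most of that "growing" cost (a sketch; generating one draw at a time remains the slow part):

set.seed(42)
n <- 0
alfa <- numeric(500)  # preallocate instead of growing element by element
beta <- numeric(500)
repeat {
  a <- rnorm(1, 10, 2)
  b <- rgamma(1, 8, 1)
  if (a - b < 1) {
    n <- n + 1
    alfa[n] <- a
    beta[n] <- b
    if (n == 500) break
  }
}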
As commented by @Gregor Thomas, your attempt fails because of the missing curly braces around the body of the if statement. If you would like to skip {} for if control, you can try the code below:
set.seed(42)
r <- list()
repeat {
  a <- rnorm(1, 10, 2)
  b <- rgamma(1, 8, 1)
  d <- a - b
  if (d < 1) r[[length(r) + 1]] <- cbind(alfa = a, beta = b)
  if (length(r) == 100000) break
}
r <- do.call(rbind, r)
such that
> head(r)
alfa beta
[1,] 9.787751 12.210648
[2,] 9.810682 14.046190
[3,] 9.874572 11.499204
[4,] 6.473674 8.812951
[5,] 8.720010 8.799160
[6,] 11.409675 10.602608
I would like to estimate the parameter of an exponential distribution using the Maximum Product of Spacings (MPS) method. I will have to minimize
−(1/(n+1)) ∑_{i=1}^{n+1} log D[i],
where D[i] = F(x[i]) − F(x[i−1]).
And the following is my R code:
n <- 10
mydata <- rexp(n, rate=2)
x <- sort(mydata)
fnn <- function(lambda, x){
  for (i in 2:n){
    c <- 1-exp(-lambda*x[i])
    d <- 1-exp(-lambda*x[i-1])
  }
  s <- (1/(n-1))*sum(log(c-d))
  return(-s)
}
optim(0.8, fnn, x=x)
Can someone please verify if I am doing the right thing here?
The output I obtained is far from the true value of lambda = 2.
$`par`
[1] 0.92375
$value
[1] 0.1847188
$counts
function gradient
18 NA
$convergence
[1] 0
$message
NULL
What modifications should I include?
The problem with your code is that it overwrites c and d on each pass through the for loop, so only the values from the last iteration survive. It also has a bug in the multiplicative constant: it uses 1/(n - 1) instead of 1/(n + 1).
Here is a corrected version. The key is to reserve memory before the loop with numeric(n - 1).
I also include a simpler version, taking advantage of R's built-in pexp.
fnn <- function(lambda, x){
  n <- length(x)
  c <- numeric(n - 1)
  d <- numeric(n - 1)
  for (i in 2:n){
    c[i - 1] <- 1 - exp(-lambda*x[i])
    d[i - 1] <- 1 - exp(-lambda*x[i-1])
  }
  s <- (1/(n + 1))*sum(log(c - d))
  return(-s)
}

fnn2 <- function(lambda, x){
  n <- length(x)
  D <- log(pexp(x[-1], rate = lambda) - pexp(x[-n], rate = lambda))
  s <- sum(D)/(n + 1)
  -s
}
set.seed(1234)
n <- 10
mydata <- rexp(n, rate = 2)
x <- sort(mydata)
opt <- optim(0.8, fnn, x = x)
opt2 <- optim(0.8, fnn2, x = x)
opt$par
#[1] 2.9225
opt2$par
#[1] 2.9225
identical(opt$par, opt2$par)
#[1] TRUE
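One more detail: optim() issues a warning that Nelder-Mead is unreliable for one-dimensional optimization. Since lambda is a scalar here, optimize() is a natural alternative (a sketch reusing fnn2 and x from above; the search interval is my assumption):

optimize(fnn2, interval = c(0.001, 10), x = x)$minimum
# close to the 2.9225 found by optim() above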
My problem is in R.
I start from a dataframe with 2 variables, z and p (p are the weights).
I need this sum:
∑_i ∑_j (z_i − z_j)·p_i·p_j·I_z
where I_z is an indicator: it equals −1 if z_i < z_j, and 1 otherwise.
Please consider that the data are big; the dataframe could have 10,000 rows.
I tried with a matrix but ran into memory problems.
I think I am forced to use for loops...
Any suggestion?
Thank you,
Elena
Your "indicator" is just a fancy way of defining the abs function.
You can use outer is you have sufficient RAM:
set.seed(2)
n <- 2
DF <- data.frame(z=sample(1:2, n, TRUE),
                 p=sample(1:2, n, TRUE))
#  z p
#1 1 2
#2 2 1
sum(outer(seq_len(nrow(DF)), seq_len(nrow(DF)), function(i, j) {
  abs(DF$z[i] - DF$z[j]) * DF$p[i] * DF$p[j]
}))
#[1] 4

n <- 1e4
DF <- data.frame(z=sample(1:2, n, TRUE),
                 p=sample(1:2, n, TRUE))
sum(outer(seq_len(nrow(DF)), seq_len(nrow(DF)), function(i, j) {
  abs(DF$z[i] - DF$z[j]) * DF$p[i] * DF$p[j]
}))
#[1] 112224330
If you don't, you need a loop. Using combn is one possibility, but it is slow since it is basically a loop. (Note the factor of 2: combn visits each unordered pair once, and the diagonal terms vanish because |z_i − z_i| = 0.)
2 * sum(combn(seq_len(nrow(DF)), 2, function(ind) {
  abs(DF$z[ind[1]] - DF$z[ind[2]]) * DF$p[ind[1]] * DF$p[ind[2]]
}))
#[1] 112224330
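If outer() does not fit in memory and combn() is too slow, a middle ground is to loop over rows while vectorizing the inner sum, so only length-n vectors are ever allocated (a sketch with the same DF as above):

total <- 0
for (i in seq_len(nrow(DF))) {
  # inner sum over j, done in one vectorized step
  total <- total + sum(abs(DF$z[i] - DF$z) * DF$p[i] * DF$p)
}
total
#[1] 112224330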
I have got a triple summation expression like this
∑_{l=1}^{n} ∑_{i=1}^{m} ∑_{t=1}^{m} phil_z1_1[i]·phil_z1_1[t]·I(x[l] < min(y[i], y[t]))
I have done:
set.seed(1234567)
x <- rnorm(2900)
n <- length(x)
y <- rnorm(3000)*0.25
m <- length(y)
z1 <- runif(m, min=0, max=1)
z2 <- runif(m, min=0, max=1)
phil_z1_1 <- sqrt(12*(z1/z2))
For min(y[i], y[t]) I have done something like:
y_m <- matrix(rep(y, length(y)), ncol=length(y))
y_m_t <- t(y_m)
y_min <- pmin(y_m_t, y_m)
After expanding the two inner summations (for example with m = 2, n = 3), I can put the original expression into the matrix form x·A·x', where
x = [phil_z1_1[1] phil_z1_1[2]]
and A is the 2×2 matrix
A = [∑_{l=1}^{n} I(x[l] ≤ min(y[1],y[1])), ∑_{l=1}^{n} I(x[l] ≤ min(y[1],y[2])); ∑_{l=1}^{n} I(x[l] ≤ min(y[2],y[1])), ∑_{l=1}^{n} I(x[l] ≤ min(y[2],y[2]))]
Basically I want to create an m×m matrix A in which each element equals its corresponding inner sum; for example, ∑_{l=1}^{n} I(x[l] ≤ min(y[1],y[1])) would be element a11 of A.
I have tried to use
args <- expand.grid(l=1:n, i=1:m, t=1:m)
args <- subset(args, x[l] <= pmin(y[i],y[t])-z1[i]*z2[t])
args <- transform(args, result=phil_z1_1[i]*phil_z1_1[t])
sum(args[,"result"])
But R cannot run the above code, as the sample size of the data set is too big, around 3,000.
Can someone tell me how to solve this problem?
Thanks in advance!
Here is a matrix approach for your triple sum:
set.seed(1234567)
n <- 10
x <- rnorm(n)
m <- 3000
y <- rnorm(m)/4
y_m <- pmin(matrix(rep(y, m), ncol=m, byrow=TRUE), y)  # y_m[i,t] = min(y[i], y[t])
z1 <- runif(m, min=0, max=1)
z2 <- runif(m, min=0, max=1)
phi <- sqrt(12*(z1/z2))
phi_m <- phi %o% phi  # phi_m[i,t] = phi[i]*phi[t]
f1 <- function(l) sum(phi_m * (x[l] < y_m))  # inner double sum for a fixed l
sum(sapply(1:n, f1))
[1] 242034847337
It is not lightning fast, but it is much faster than the data.frame approach:
f2 <- function(lrng) {
  args <- expand.grid(l=lrng, i=1:m, t=1:m)
  args <- subset(args, x[l] <= pmin(y[i], y[t]))
  args <- transform(args, result=phi[i]*phi[t])
  sum(args[,"result"])
}
sum(sapply(1:n, f2)) # 90 times slower
[1] 242034847337
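If even the loop over l is a bottleneck, note that for each (i, t) the sum over l just counts how many x fall below y_m[i, t], and findInterval() on the sorted x gives all those counts at once (a sketch reusing x, y_m and phi_m from above; ties between x and y have probability zero with continuous data):

counts <- findInterval(y_m, sort(x), left.open = TRUE)  # number of x[l] < y_m[i,t], per cell
sum(phi_m * counts)
# same value as sum(sapply(1:n, f1)) above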