Starting with some sample data:
sample_data <- data.frame(id = 1:3,
                          x = c(128, 113, 126),
                          n = c(347, 344, 347),
                          m = c(335, 334, 347),
                          index = c(11, 9, -1))
theta <- matrix(c(0.5, 0.5, 2, 2), nrow = 2, ncol = 2)
lhs <- function(a, b, g, d, dat){
  beta(a + dat$x, b + dat$n - dat$x) / beta(a, b) * beta(g, d + dat$n) / beta(g, d)
}
The function lhs returns a vector with one element per row of the argument dat.
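For example, with the values stored in theta (a = theta[1,1] = 0.5, b = theta[2,1] = 0.5, g = theta[1,2] = 2, d = theta[2,2] = 2):
lhs(a = 0.5, b = 0.5, g = 2, d = 2, dat = sample_data)  # numeric vector of length 3, one value per row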
rhs <- function(dat, ...){
  # a, b, g, and d are picked up from the calling environment here;
  # they could instead be passed explicitly through ...
  n <- dat$n
  m <- dat$m
  x <- dat$x
  index <- dat$index
  temp <- data.frame(i = 0:index,
                     n = rep(n, index + 1),
                     m = rep(m, index + 1),
                     x = rep(x, index + 1))
  sum(beta(a + temp$x, b + temp$m - temp$x + temp$i) / beta(a, b) *
        beta(g + 1, d + temp$m + temp$i) / beta(g, d))
}
The function rhs works only on a single row because each observation has a different value for index (the index for the sum inside rhs). The intent is to return one value per row in dat. I've tried to do this with apply in the function LL (below).
LL <- function(theta, dat){
  a <- theta[1,1]
  b <- theta[2,1]
  g <- theta[1,2]
  d <- theta[2,2]
  .lhs <- lhs(a, b, g, d, dat)
  .rhs <- ifelse(dat$index > -1, apply(dat, 1, rhs), 0)  # this is the part that fails
  sum(log(.lhs + .rhs))
}
It seems like I need to be able to pass the values of index, n, m, and x for a given row to rhs. That is, not a vector of length(dat$n), but the scalar value of n at that row, passed through apply in the function LL.
Is this the correct approach? How can I do such a thing?
Thanks.
Edit
I've cleaned things up a bit and made a slight modification to the sample data. The correct return value (I think!) can be arrived at by explicitly passing a, b, g, and d:
sum(lhs(a = theta[1,1],
        b = theta[2,1],
        g = theta[1,2],
        d = theta[2,2],
        sample_data)) +
  rhs(sample_data[1, ]) +
  rhs(sample_data[2, ]) +
  rhs(sample_data[3, ])
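A row-by-row generalisation of that expression (a sketch; it assumes a, b, g, and d are visible to rhs) loops over the rows with sapply, keeping each row as a one-row data frame so the dat$... indexing inside rhs still works, and mirrors the ifelse() guard from LL for rows where index is -1:
.rhs <- sapply(seq_len(nrow(sample_data)),
               function(i) if (sample_data$index[i] > -1) rhs(sample_data[i, ]) else 0)
sum(lhs(a = theta[1,1], b = theta[2,1],
        g = theta[1,2], d = theta[2,2],
        sample_data)) + sum(.rhs)
Note that apply(dat, 1, rhs) cannot work here: apply coerces each row to an atomic vector, which breaks the dat$n-style access inside rhs.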
The for loop below iterates over nodes in an igraph graph. There are 2048 of these, so it is very slow. I've tried to code as efficiently as possible (for example, by not growing vectors). How can I make the loop run faster?
Edit: I've also thought about writing this in C++ via Rcpp. I just don't know how I would use igraph in that case.
Edit 2: compatible_models actually depends on child_node. What I gave here is an example of what it could be for a particular value of child_node.
library(igraph)
library(Metrics)
set.seed(1234)
N <- 10000
A <- rnorm(N, 10, 2)
B <- rnorm(N, 9, 2)
C <- rnorm(N, 12, 1)
D <- rnorm(N, 7, 3)
Y <- A + B + A*B + D + A^2 + rnorm(N)
data <- data.frame(Y = Y, A = A, B = B, C = C, D = D)
partition <- sort(sample(N, 0.7*N))
data_train <- data[partition, ]
data_test <- data[-partition, ]
g <- make_empty_graph()
g <- g + vertices(1:2049)
generate_edges <- function(start_vertex, end_vertices) {
  edges <- c()
  for (i in 1:length(end_vertices)) {
    edges <- c(edges, start_vertex, end_vertices[i])
  }
  return(edges)
}
outward_edges <- generate_edges(V(g)[1], V(g)[2:vcount(g)])
g <- g + edges(outward_edges, attr1 = rep(0, length(outward_edges) / 2), attr2 = rep(0, length(outward_edges) / 2))
successors <- matrix(nrow = length(g[[1, ]][[1]]), ncol = 9) # nrow = number of successors of node 1
i <- 1
for (child_node in 2:2049) {
  # compatible_models <- lapply(...)  # suppose this is a list of "formula" objects, like:
  compatible_models <- list(Y ~ A + B + C, Y ~ I(A^2) + B + C + D, Y ~ B + D)
  compatible_models <- lapply(compatible_models, lm, data = data_train)
  predictions <- sapply(compatible_models, predict, newdata = data_test)
  successors[i, 1:3] <- c(edge_attr(g, name = "attr1", g[[1, V(g)[child_node], edges = TRUE]]),
                          edge_attr(g, name = "attr2", g[[1, V(g)[child_node], edges = TRUE]]),
                          sum(apply(predictions, 2, rmse, actual = data_test$Y))/length(compatible_models))
  i <- i + 1
}
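One small note on the setup code: generate_edges grows the edges vector inside its loop. A vectorised sketch (generate_edges2 is a hypothetical name) builds the same interleaved start, end, start, end, ... sequence in one step:
generate_edges2 <- function(start_vertex, end_vertices) {
  # rbind recycles the single start vertex across all end vertices;
  # as.vector() then reads the 2-row matrix column by column
  as.vector(rbind(as.numeric(start_vertex), as.numeric(end_vertices)))
}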
Correct me if I am wrong, but I think you could evaluate the first three lines of the loop body (or any lines that build model objects but do not depend on child_node) outside of the loop, which roughly triples the performance of the code on my machine:
successors <- matrix(nrow = length(g[[1, ]][[1]]), ncol = 9)  # nrow = number of successors of node 1
i <- 1
start_time <- Sys.time()
for (child_node in 2:2049) {
  # build models inside loop:
  compatible_models <- list(Y ~ A + B + C, Y ~ I(A^2) + B + C + D, Y ~ B + D)
  compatible_models <- lapply(compatible_models, lm, data = data_train)
  predictions <- sapply(compatible_models, predict, newdata = data_test)
  successors[i, 1:3] <- c(edge_attr(g, name = "attr1", g[[1, V(g)[child_node], edges = TRUE]]),
                          edge_attr(g, name = "attr2", g[[1, V(g)[child_node], edges = TRUE]]),
                          sum(apply(predictions, 2, rmse, actual = data_test$Y))/length(compatible_models))
  i <- i + 1
}
Sys.time() - start_time
# Time difference of 26.69914 secs
Optimized code with model creation outside of loop:
## model building:
compatible_models <- list(Y ~ A + B + C, Y ~ I(A^2) + B + C + D, Y ~ B + D)
compatible_models <- lapply(compatible_models, lm, data = data_train)
predictions <- sapply(compatible_models, predict, newdata = data_test)
## initialisation:
successors2 <- matrix(nrow = length(g[[1, ]][[1]]), ncol = 9)
i <- 1
start_time <- Sys.time()
for (child_node in 2:2049) {
  successors2[i, 1:3] <- c(edge_attr(g, name = "attr1", g[[1, V(g)[child_node], edges = TRUE]]),
                           edge_attr(g, name = "attr2", g[[1, V(g)[child_node], edges = TRUE]]),
                           sum(apply(predictions, 2, rmse, actual = data_test$Y))/length(compatible_models))
  i <- i + 1
}
Sys.time() - start_time
# Time difference of 8.885826 secs
all.equal(successors, successors2)
# [1] TRUE
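Going one step further: in this reduced example only the two edge-attribute lookups change per iteration, so the remaining loop can be collapsed as well. A sketch (successors3 is a hypothetical name; it relies on the edges out of node 1 having been added in child-node order, which is the case here):
eids <- incident(g, 1, mode = "out")  # all edges leaving node 1
successors3 <- matrix(nrow = length(eids), ncol = 9)
successors3[, 1] <- edge_attr(g, "attr1", eids)
successors3[, 2] <- edge_attr(g, "attr2", eids)
successors3[, 3] <- sum(apply(predictions, 2, rmse, actual = data_test$Y))/length(compatible_models)
As Edit 2 in the question notes, though, compatible_models really depends on child_node, so this shortcut only applies to the simplified example.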
I have written the following code.
library(quantreg)
# return the g function:
G = function(m, N, gamma) {
  Tm = m * N
  k = 1:Tm
  Gvalue = sqrt(m) * (1 + k/m) * (k/(m + k))^gamma
  return(Gvalue)
}
# matrix square root via eigendecomposition:
sqroot <- function(A) {
  e = eigen(A)
  v = e$vectors
  val = e$values
  sq = v %*% diag(sqrt(val)) %*% solve(v)
  return(t(sq))
}
# the scaling function f_a (used to normalise the statistic below):
fa = function(m, N, a) {
  Tm = m * N
  k = 1:Tm
  t = (m + k)/m
  f_value = (t - 1) * t * (a^2 + log(t/(t - 1)))
  return(sqrt(f_value))
}
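## (Aside: a quick sanity check of sqroot() on a throwaway symmetric matrix A0;
## for symmetric input, sqroot(A0) %*% sqroot(A0) should recover A0.)
A0 <- crossprod(matrix(rnorm(9), 3))
all.equal(sqroot(A0) %*% sqroot(A0), A0)  # should be TRUE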
m = 50
N = 2
n = 50*3
x1 = matrix(runif(n, 0, 1), ncol = 1)
x = cbind(1, x1)
beta = c(1, 1)
xb = x %*% beta
pr = 1/(1 + exp(-xb))
y = rbinom(n, 1, pr)
# calculate statistic:
stat = function(y, x, m, N, a) {
  y_train = y[1:m]
  x_train = x[(1:m), ]
  y_test = y[-(1:m)]
  x_test = x[-(1:m), ]
  fit = glm(y ~ 0 + x, family = "binomial")
  coef = coef(fit)
  log_predict = predict(fit, type = "response")
  sigma = sqrt(1/(m-1) * sum((y_train - log_predict)^2))
  Jvalue = t(x_train) %*% x_train/m * sigma^2
  Jsroot = sqroot(Jvalue)
  fvalue = fa(m, N, a)
  score1 = apply((x_test * as.vector((y_test - x_test %*% coef))), 2, cumsum)
  statvalue1 = t(solve(Jsroot) %*% t(score1))/fvalue/sqrt(m)
  statmax1 = pmax(abs(statvalue1[, 1]), abs(statvalue1[, 2]))
  result = list(stat = statmax1)
  return(result)
}
m = 50
N = 2
a = 2.795
value = stat(y, x, m, N, a)
value
I want to perform a bootstrap to obtain B = 999 statistics. I use the following R code, but it produces an error: "Error in statistic(data, original, ...) :
argument "m" is missing, with no default".
library(boot)
data1 = data.frame(y = y, x = x1, m = m, N = N, a = a)
head(data1)
boot_value = boot(data1, statistic = stat, R = 999)
Can anyone give me a hint? Also, can I get the bootstrap results in matrix format, since the stat function returns 100 values?
There are different kinds of bootstrapping. If you want to draw 999 samples from your data with replacement, each of the same size as your data, you can just use replicate; no package is needed.
We put the data to be resampled into a data frame. It looks to me like m, N, and a remain constant, so we just pass them along as they are.
data2 <- data.frame(y=y, x=x)
The stat function needs to be adapted to unpack y and the x matrix from the data. At the bottom we remove the list call to get just a vector back; unname gives us the bare numbers.
stat2 <- function(data, m, N, a) {
  y_train <- data[1:m, 1]
  x_train <- as.matrix(data[1:m, 2:3])
  y_test <- data[-(1:m), 1]
  x_test <- as.matrix(data[-(1:m), 2:3])
  y <- data[, "y"]
  x <- as.matrix(data[, 2:3])
  fit <- glm(y ~ 0 + x, family = "binomial")
  coef <- coef(fit)
  log_predict <- predict(fit, type = "response")
  sigma <- sqrt(1/(m-1) * sum((y_train - log_predict)^2))
  Jvalue <- t(x_train) %*% x_train/m * sigma^2
  Jsroot <- sqroot(Jvalue)
  fvalue <- fa(m, N, a)
  score1 <- apply((x_test * as.vector((y_test - x_test %*% coef))), 2, cumsum)
  statvalue1 <- t(solve(Jsroot) %*% t(score1))/fvalue/sqrt(m)
  statmax1 <- pmax(abs(statvalue1[, 1]), abs(statvalue1[, 2]))
  result <- unname(statmax1)
  return(result)
}
replicate is a cousin of sapply, designed for repeated evaluation. In the call we just sample the rows 999 times and already get a matrix back. As with sapply, we need to transpose the result.
res <- t(replicate(999, stat2(data2[sample(1:nrow(data2), nrow(data2), replace=TRUE), ], m, N, a)))
Result
As a result we get the 999 bootstrap replications in the rows, with the 100 statistics in the columns.
str(res)
# num [1:999, 1:100] 0.00205 0.38486 0.10146 0.12726 0.47056 ...
The code also runs quite fast.
#   user  system elapsed
#   3.46    0.01    3.49
Note that there are different kinds of bootstrapping; e.g., sometimes only part of the sample is resampled, weights are used, clustering is applied, etc. Since you attempted to use boot, though, the method shown here should match the default.
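For completeness, the original boot call failed because boot expects statistic to be a function of the data and a vector of resampled row indices, with any extra constants passed through the ... argument. A minimal sketch of that route (reusing stat2 from above):
library(boot)
# boot calls statistic(data, indices, ...): subset the rows, then compute
boot_stat <- function(data, indices, m, N, a) stat2(data[indices, ], m, N, a)
boot_value <- boot(data2, statistic = boot_stat, R = 999, m = m, N = N, a = a)
# boot_value$t is then a 999 x 100 matrix, comparable to `res` above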
I'm wondering why, in the following, uniroot() works perfectly fine when I directly provide the value of q (see f2), BUT fails when I instead provide q as a function of other input values (see f1)?
In the code, everything with a ...1 suffix (e.g., f1) relates to providing q indirectly, and everything with a ...2 suffix (e.g., f2) relates to providing q directly.
My goal is to solve for df2 such that y = .15 (the correct answer is ~336.3956). (Please just run the entire code below.)
alpha = c(.025, .975); df1 = 3; q = 48.05649; peta = .3  # input values
f1 <- function(alpha, q, df1, df2, ncp){  # objective function (`q` indirectly)
  alpha - suppressWarnings(pf(q = (peta / df1) / ((1 - peta)/df2), df1, df2,
                              ncp, lower.tail = FALSE))
}
f2 <- function(alpha, q, df1, df2, ncp){  # objective function (`q` directly)
  alpha - suppressWarnings(pf(q = q, df1, df2, ncp, lower.tail = FALSE))
}
ncp1 <- function(df2){  # root finding
  b <- sapply(c(alpha[1], alpha[2]),
              function(x) uniroot(f1, c(0, 1e7), alpha = x, q = peta, df1 = df1, df2 = df2)[[1]])
  b / (b + (df2 + 4))
}
ncp2 <- function(df2){  # root finding
  b <- sapply(c(alpha[1], alpha[2]),
              function(x) uniroot(f2, c(0, 1e7), alpha = x, q = q, df1 = df1, df2 = df2)[[1]])
  b / (b + (df2 + 4))
}
m1 <- function(df2, y){  # a utility function
  abs(abs(diff(ncp1(df2))) - y)
}
m2 <- function(df2, y){  # a utility function
  abs(abs(diff(ncp2(df2))) - y)
}
optimize(m1, c(1, 1e7), y = .15)[[1]] # Incorrect answer: 1e+07
optimize(m2, c(1, 1e7), y = .15)[[1]] # Correct answer: 336.3956
In ncp1
you have q = peta, which gets passed on to f1 but then never actually used, since pf there takes its q as (peta / df1) / ((1 - peta)/df2).
In ncp2
you have q = q (the global q = 48.05649), which gets passed on to f2 and in turn to pf.
So the bottom line is that you use different values for q in pf. If you re-activate the warnings, you will see that f1, as called from ncp1, fails to reach convergence.
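A quick way to see the first point: since pf inside f1 never reads the q argument, any value passed for q returns the identical result (this uses the global peta = .3):
f1(alpha = .025, q = peta, df1 = 3, df2 = 100, ncp = 1)
f1(alpha = .025, q = 999, df1 = 3, df2 = 100, ncp = 1)  # identical; `q` is ignored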
I'm trying to subtract each unique pair of ps produced by the for loop in my function below. To do so, I first find the unique pairs using combn(p, 2) and then use outer to subtract the elements of each pair from each other.
In both steps I get an error. Is there a fix for it?
prop <- function(n, yes, a, b = a){
  p <- list()
  for(i in 1:length(n)){
    p[[i]] <- rbeta(2, a[i] + yes[i], b[i] + (n[i] - yes[i]))
  }
  outer(combn(p, 2), FUN = "-")  # gives an error
}
prop(n = c(10, 20, 30), yes = rep(5, 3), a = rep(1, 3))
By default, simplify = TRUE in combn, so even though p is a list, the output is simplified to carry a dim attribute, with the list elements arranged as the cells of a matrix. Since m is 2, there are 2 list elements for each comparison; extract those elements using [[ and subtract:
combn(p, 2, FUN = function(x) x[[1]] - x[[2]])
The full function:
prop <- function(n, yes, a, b = a){
  p <- list()
  for(i in 1:length(n)){
    p[[i]] <- rbeta(2, a[i] + yes[i], b[i] + (n[i] - yes[i]))
  }
  combn(p, 2, FUN = function(x) x[[1]] - x[[2]])
}
prop(n = c(10, 20, 30), yes = rep(5, 3), a = rep(1, 3))
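The result is a 2 x 3 matrix: rbeta(2, ...) draws two values per group, and with three groups there are choose(3, 2) = 3 unique pairs, one pair per column:
out <- prop(n = c(10, 20, 30), yes = rep(5, 3), a = rep(1, 3))
dim(out)
# [1] 2 3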
If we wanted to include another argument, how, that controls the direction of the subtraction:
prop <- function(n, yes, a, b = a, how = "one.two"){
  delta <- switch(how,
                  one.two = function(x) x[[1]] - x[[2]],
                  two.one = function(x) x[[2]] - x[[1]])
  p <- list()
  for(i in 1:length(n)){
    p[[i]] <- rbeta(2, a[i] + yes[i], b[i] + (n[i] - yes[i]))
  }
  out <- combn(p, 2, FUN = delta)
  nm1 <- paste0("p", combn(seq_along(p), 2, FUN = paste, collapse = "-"))
  colnames(out) <- nm1
  out
}
prop(n = c(10, 20, 30), yes = rep(5, 3), a = rep(1, 3), how = "one.two")
prop(n = c(10, 20, 30), yes = rep(5, 3), a = rep(1, 3), how = "two.one")
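The column names then record which pair each column subtracts:
colnames(prop(n = c(10, 20, 30), yes = rep(5, 3), a = rep(1, 3)))
# [1] "p1-2" "p1-3" "p2-3"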
I am wondering how I could make my function Bpp accept a vector for its first argument t.
Bpp = function(t, n1, n2 = NULL){
  N = ifelse(is.null(n2), n1, n1*n2/(n1+n2))
  df = ifelse(is.null(n2), n1 - 1, n1 + n2 - 2)
  H1 = integrate(function(delta) dcauchy(delta, 0, sqrt(2)/2)*dt(t, df, delta*sqrt(N)),
                 -Inf, Inf)[[1]]
  H0 = dt(t, df)
  BF10 = H1/H0
  p.value = 2*(1 - pt(abs(t), df))
  list(BF10 = BF10, p.value = p.value)
}
Bpp(t = -6:6, 20, 20)  ## This will give an error because `t` is now a vector
Looks like I could give a quick answer without testing. Use the following in your Bpp:
# joint density
joint <- function(delta, t) dcauchy(delta, 0, sqrt(2)/2) * dt(t, df, delta*sqrt(N))
# marginal density of `t`
marginal.t <- function (t) integrate(joint, lower = -Inf, upper = Inf, t = t)[[1]]
H1 <- sapply(t, marginal.t)
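Assembled, that looks like the following (a sketch; the two helpers sit inside Bpp so that df and N are in scope):
Bpp <- function(t, n1, n2 = NULL){
  N <- ifelse(is.null(n2), n1, n1*n2/(n1+n2))
  df <- ifelse(is.null(n2), n1 - 1, n1 + n2 - 2)
  # joint density of (delta, t)
  joint <- function(delta, t) dcauchy(delta, 0, sqrt(2)/2) * dt(t, df, delta*sqrt(N))
  # marginal density of `t`; integrate() handles one t at a time, hence sapply below
  marginal.t <- function(t) integrate(joint, lower = -Inf, upper = Inf, t = t)[[1]]
  H1 <- sapply(t, marginal.t)
  H0 <- dt(t, df)
  list(BF10 = H1/H0, p.value = 2*(1 - pt(abs(t), df)))
}
Bpp(t = -6:6, 20, 20)  # now works on a vector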
So here we could also use Vectorize; how would that look?
Use your original Bpp:
Bpp <- Vectorize(Bpp, vectorize.args = "t")
Bpp(-6:6, 20, 20)
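Either way, the gain is convenience rather than speed: Vectorize is a thin wrapper around mapply, so it still loops over t at the R level, just as the sapply version does.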