Script "tennis common opponent" in R - r

I'm following this article about common opponents in tennis, and my goal is to script it as efficiently as possible. Below is my code, but it is very slow: computing the result of a single match takes my laptop roughly 120 seconds, and I have a dataset of 150k rows to process.
the article: https://core.ac.uk/download/pdf/82518495.pdf
I need your help to clean up and improve my code. Any suggestion is appreciated.
tableA: https://1drv.ms/u/s!At-ZKKnf0H4jafxCX96NLxu00nc
tableB: https://1drv.ms/u/s!At-ZKKnf0H4javHgoPjzfCMXhg4
data_tennis_co: https://1drv.ms/u/s!At-ZKKnf0H4jaJyNkYrr8muff8k
data_tennis_co = read.table("test_co.csv", header=FALSE, sep=",", fill = TRUE)
A = read.table("tableA.csv", header=FALSE, sep=",", fill = TRUE)
B = read.table("tableB.csv", header=FALSE, sep=",", fill = TRUE)
#BASIC FUNCTIONS
library(doParallel); registerDoParallel()  # needed for foreach %dopar%
G <- function(p) {res <- p^4*(15-4*p-((10*p^2)/(1-2*p*(1-p))))}
d <- function(p,q) {res <- p*q*(1-(p*(1-q)+(1-p)*q))^-1}
TB <- function(p,q) {res <- foreach(i = seq_along(1:28), .combine = sum) %dopar% {
  tb <- A[i,1]*p^A[i,2]*(1-p)^A[i,3]*q^A[i,4]*(1-q)^A[i,5]*d(p,q)^A[i,6]}}
S <- function(p,q) {res <- foreach(i = seq_along(1:21), .combine = rbind) %dopar% {
  s <- B[i,1]*G(p)^B[i,2]*(1-G(p))^B[i,3]*G(q)^B[i,4]*(1-G(q))^B[i,5]*
    (G(p)*G(q)+(G(p)*(1-G(q))+(1-G(p))*G(q))*TB(p,q))^B[i,6]}
  sum(res)}
M3 <- function(p,q) {res <- S(p,q)^2*(1+2*(1-S(p,q)))}
DELTA_AB <- function(spwAC,rpwAC,spwBC,rpwBC) {res <- (spwAC-(1-rpwAC))-(spwBC-(1-rpwBC))}
PR <- function(spwAC,rpwAC,spwBC,rpwBC) {
  res <- (M3(0.6+DELTA_AB(spwAC,rpwAC,spwBC,rpwBC),(1-0.6)) +
          M3(0.6,(1-(0.6-DELTA_AB(spwAC,rpwAC,spwBC,rpwBC)))))/2}
#COMMON OPPONENTS
MAL <- function(id1, id2){
  prova <- subset(data_tennis_co, V3 == id1 & V4 == id2)
  previous <- subset(data_tennis_co, V2 < prova$V2)
  s1 <- subset(previous, V3 == id1 | V4 == id1)
  s2 <- subset(previous, V3 == id2 | V4 == id2)
  s1$opp <- ifelse(s1$V3 == id1, s1$V4, s1$V3)
  s2$opp <- ifelse(s2$V3 == id2, s2$V4, s2$V3)
  inn <- intersect(s1$opp, s2$opp)
  common1 <- s1[s1$opp %in% inn, ]
  common2 <- s2[s2$opp %in% inn, ]
  # take the average if the opponent id is not unique
  COM <- merge(common1, common2, by = c("opp"))
  COM$OMALLEY <- unlist(mapply(PR, COM$V5.x, COM$V6.x, COM$V7.y, COM$V8.y))
  COM$OMALLEY[is.nan(COM$OMALLEY)] <- 0.5
  return(tryCatch(sum(COM$OMALLEY)/nrow(COM), error=function(e) NaN))
}
library(tictoc)
tic()
RESA<-MAL(1,2)
toc()

The main bottleneck in the code is the use of parallel loops in TB and S for operations that can be done much faster with vectorized R functions:
G <- function(p) p^4*(15-4*p-((10*p^2)/(1-2*p*(1-p))))
d <- function(p, q) p*q*(1-(p*(1-q)+(1-p)*q))^-1
TB <- function(p, q) sum(A[,1] * p^A[,2] * (1-p)^A[,3] *
                         q^A[,4] * (1-q)^A[,5] * d(p,q)^A[,6])
S <- function(p, q) {
  Gp <- G(p)
  Gq <- G(q)
  sum(B[,1] * Gp^B[,2] * (1-Gp)^B[,3] * Gq^B[,4] * (1-Gq)^B[,5] *
      (Gp*Gq + (Gp*(1-Gq) + (1-Gp)*Gq)*TB(p,q))^B[,6])
}
M3 <- function(p, q) {
  s <- S(p,q)
  s^2*(1+2*(1-s))
}
DELTA_AB <- function(spwAC,rpwAC,spwBC,rpwBC) (spwAC-(1-rpwAC)) -
  (spwBC-(1-rpwBC))
PR <- function(spwAC,rpwAC,spwBC,rpwBC) {
  D <- DELTA_AB(spwAC, rpwAC, spwBC, rpwBC)
  (M3(p = 0.6 + D, q = (1 - 0.6)) +
     M3(p = 0.6, q = 1 - (0.6 - D))) / 2
}
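As a quick spot-check of these vectorized functions (assuming the A and B tables have been read in from tableA.csv and tableB.csv as above; the serve/return values below are invented for illustration, not taken from the dataset):
PR(spwAC = 0.65, rpwAC = 0.35, spwBC = 0.62, rpwBC = 0.33)  # match probability for one pair of estimates
Since TB(p, q) is now a single scalar computed once per call, much of the remaining per-match cost in MAL() is the row-wise mapply over COM and the subsetting of data_tennis_co.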
Solution here:
https://codereview.stackexchange.com/questions/194301/tenis-common-opponents-r

Related

Cannot make sense of the error while using OptimParallel in R

I'm trying to run the function below using optimParallel in R on a certain data set. The code is as follows:
install.packages("optimParallel")
install.packages('parallel')
library(parallel)
library(optimParallel)
library(doParallel)
library(data.table)
library(Rlab)
library(HDInterval)
library(mvtnorm)
library(matrixStats)
library(dplyr)
library(cold)
## Bolus data:
data("bolus")
d1 <- bolus
d1$group <- ifelse(d1$group == "2mg",1,0)
colnames(d1) <- c("index",'group',"time","y")
d2 <- d1 %>% select(index, y, group, time)
colnames(d2) <- c('index','y','x1','x2') ### Final data
## Modification of the objective function:
## Another approach:
dpd_poi <- function(x, fixed = c(rep(FALSE,5))){
  params <- fixed
  dpd_1 <- function(p){
    params[!fixed] <- p
    alpha <- params[1]
    beta_0 <- params[2]
    beta_1 <- params[3]
    beta_2 <- params[4]
    rho <- params[5]
    add_pi <- function(d){
      k <- beta_0 + (d[3]*beta_1) + (d[4]*beta_2)
      k1 <- exp(k) ## for Poisson regression
      d <- cbind(d, k1)
    }
    dat_split <- split(x, f = x$index)
    result <- lapply(dat_split, add_pi)
    result <- rbindlist(result)
    result <- as.data.frame(result)
    colnames(result) <- c('index','y','x1','x2','lamb')
    result_split <- split(result, f = result$index)
    expression <- function(d){
      bin <- as.data.frame(combn(d$y, 2))
      pr <- as.data.frame(combn(d$lamb, 2))
      ## Evaluation of the probabilities:
      f_jk <- function(u, v){
        dummy_func <- function(x, y){
          ppois(x, lambda = y)
        }
        dummy_func_1 <- function(x, y){
          ppois(x - 1, lambda = y)
        }
        k <- mapply(dummy_func, u, v)
        k_1 <- mapply(dummy_func_1, u, v)
        inv1 <- inverseCDF(as.matrix(k), pnorm)
        inv2 <- inverseCDF(as.matrix(k_1), pnorm)
        mean <- rep(0, 2)
        lower <- inv2
        upper <- inv1
        corr <- diag(2)
        corr[lower.tri(corr)] <- rho
        corr[upper.tri(corr)] <- rho
        prob <- pmvnorm(lower = lower, upper = upper, mean = mean, corr = corr)
        prob <- (1 + (1/alpha))*(prob^alpha)
        ## First expression: (changes for Poisson regression)
        lam <- as.vector(t(v))
        v1 <- rpois(1000, lambda = lam[1])
        v2 <- rpois(1000, lambda = lam[2])
        all_possib <- as.data.frame(rbind(v1, v2))
        new_func <- function(u){
          k <- mapply(dummy_func, u, v)
          k_1 <- mapply(dummy_func_1, u, v)
          inv1_1 <- inverseCDF(as.matrix(k), pnorm)
          inv2_1 <- inverseCDF(as.matrix(k_1), pnorm)
          mean1 <- rep(0, 2)
          lower1 <- inv2_1
          upper1 <- inv1_1
          corr1 <- diag(2)
          corr1[lower.tri(corr1)] <- rho
          corr1[upper.tri(corr1)] <- rho
          prob1 <- pmvnorm(lower = lower1, upper = upper1, mean = mean1, corr = corr1)
          prob1 <- prob1^(alpha)
        }
        val <- apply(all_possib, 2, new_func)
        val_s <- mean(val) ## approximation
        return(val_s - prob)
      }
      final_res <- mapply(f_jk, bin, pr)
      final_value <- sum(final_res)
    }
    u <- sapply(result_split, expression)
    return(sum(u))
  }
}
## run the objective function:
cl <- makeCluster(25)
setDefaultCluster(cl=cl)
clusterExport(cl,c('d2','val'))
clusterEvalQ(cl,c(library(data.table), library(Rlab),library(HDInterval),library(mvtnorm),library(matrixStats),library(dplyr),library(cold)))
val <- dpd_poi(d2, c(0.5,FALSE,FALSE,FALSE,FALSE))
optimParallel(par = c(beta_0 =1, beta_1 =0.1 ,beta_2 = 1,rho=0.2),fn = val ,method = "L-BFGS-B",lower = c(-10,-10,-10,0),upper = c(Inf,Inf,Inf,1))
stopCluster(cl)
After running for some time, it returns the following error:
checkForRemoteErrors(val)
9 nodes produced errors; first error: missing value where TRUE/FALSE needed
However, when I make a minor change in the objective function (pick 2 random numbers from rpois instead of 1000) and run the same code using optim, it converges and gives me a proper result. This is a Monte Carlo simulation and it does not make sense to draw so few Poisson variables. I have to use optimParallel, otherwise it takes far too long to converge. I could also run this code using simulated data.
I'm unable to figure out where the issue truly lies. I truly appreciate any help in this regard.

Speed up loop operation

I need to run a coverage probability test on different sample sizes and censoring proportions, replicating 1000 bootstrap samples with the boot function in R. I have let the code run for 3-8 hours at a time and still have no idea how long it will take.
set.seed(20)
lambda <- 0.02
beta <- 0.5
alpha <- 0.05
n <- 140
N <- 1000
lambda_hat <- NULL
beta_hat <- NULL
cp <- NULL
bp_lambda <- matrix(NA, nrow=N, ncol=2)
bp_beta <- matrix(NA, nrow=N, ncol=2)
for (i in 1:N) {
  u <- runif(n)
  c_i <- rexp(n, 0.0001)
  t_i <- (log(1 - (1/lambda)*log(1 - u)))^(1/beta)
  s_i <- 1*(t_i < c_i)
  t <- pmin(t_i, c_i)
  data <- data.frame(u, t_i, c_i, s_i, t)
  estimates.boot <- function(data, j){
    data <- data[j, ]
    data0 <- data[which(data$s_i == 0), ] # uncensored data
    data1 <- data[which(data$s_i == 1), ] # right censored data
    data
    library(maxLik)
    LLF <- function(para) {
      t1 <- data$t_i
      lambda <- para[1]
      beta <- para[2]
      e <- s_i*log(lambda*t1^(beta - 1)*beta*exp(t1^beta)*exp(lambda*(1 - exp(t1^beta))))
      r <- (1 - s_i)*log(exp(lambda*(1 - exp(t1^beta))))
      f <- sum(e + r)
      return(f)
    }
    mle <- maxLik(LLF, start=c(para=c(0.02, 0.5)))
    lambda_hat[i] <- mle$estimate[1]
    beta_hat[i] <- mle$estimate[2]
    return(c(lambda_hat[i], beta_hat[i]))
  }
  library(boot)
  bootstrap <- boot(data, estimates.boot, 1000)
  bootlambda <- bootstrap$t[, 1]
  klambda <- bootlambda[order(bootlambda)]
  bp_lambda[i, ] <- c(klambda[25], klambda[975])
  bootbeta <- bootstrap$t[, 2]
  kbeta <- bootbeta[order(bootbeta)]
  bp_beta[i, ] <- c(kbeta[25], kbeta[975])
}
left_lambda <- sum(bp_lambda[, 1]>lambda)/N
right_lambda <- sum(bp_lambda[, 2]<lambda)/N
total_lambda <- left_lambda + right_lambda
left_beta <- sum(bp_beta[, 1] > beta)/N
right_beta <- sum(bp_beta[, 2]<beta)/N
total_beta <- left_beta + right_beta
sealphahat <- sqrt(alpha*(1 - alpha)/N)
antilambda <- total_lambda>(alpha + 2.58*sealphahat)
conlambda <- total_lambda<(alpha - 2.58*sealphahat)
asymlambda <- (max(left_lambda, right_lambda)/min(left_lambda, right_lambda)) > 1.5
antibeta <- total_beta > (alpha + 2.58*sealphahat)
conbeta <- total_beta < (alpha - 2.58*sealphahat)
asymbeta <- (max(left_beta, right_beta)/min(left_beta, right_beta)) > 1.5
anti <- antilambda + antibeta
con <- conlambda + conbeta
asym <- asymlambda + asymbeta
cbind(anti, con, asym)
Does anyone have an idea how to speed up this operation?
Basically, you want to apply a random sampling to an estimation function (inner bootstrap) and randomly sample the entire process again (outer bootstrap).
Consequently, we could write an estimation function estimate() using replicate() (to avoid boot::boot) and a function for the inner bootstrap, innerBoot(). In the latter we could use matrixStats::rowQuantiles for fast computation of the quantiles you want.
I essentially used your code, just fixed a few issues that prevented the code from running.
estimate <- function() {
  u <- runif(n)
  c_i <- rexp(n, 0.0001)
  t_i <- (log(1 - (1/lambda)*log(1 - u)))^(1/beta)
  s_i <- 1*(t_i < c_i)
  t <- pmin(t_i, c_i)
  LLF <- function(para) {
    lambda <- para[1]
    beta <- para[2]
    e <- s_i*log(lambda*t_i^(beta - 1)*beta*exp(t_i^beta)*exp(lambda*(1 - exp(t_i^beta))))
    r <- (1 - s_i)*log(exp(lambda*(1 - exp(t_i^beta))))
    return(sum(e + r))
  }
  mle <- maxLik::maxLik(LLF, start=c(para=c(0.02, 0.5)))
  return(setNames(mle$estimate, c('lambda_hat', 'beta_hat')))
}
innerBoot <- function() {
  boot <- replicate(N, estimate())
  return(matrixStats::rowQuantiles(boot, p=c(.025, .975)))
}
We also perform the outer bootstrap with replicate(). I wrap it here in system.time() to get a time measurement.
lambda <- 0.02
beta <- 0.5
alpha <- 0.05
n <- 140
# N <- 1000
N <- 10 ## for testing
seed <- 42
set.seed(seed)
tm <- system.time(
  BA <- replicate(N, innerBoot())
)
I got these measurements,
tm
# user system elapsed ## N = 10
# 1.055 0.000 1.057
# user system elapsed ## N = 100
# 102.012 0.227 102.489
which, since N enters both the inner and the outer bootstrap (so the runtime grows roughly quadratically in N), indicates that for N <- 1000 about 167 minutes are to be expected.
The result is an array of dim 2x2xN.
> dim(BA)
[1] 2 2 100
To calculate the summaries we may easily refer to the respective cells.
boot_sum <- function(BA) {
  left_lambda <- sum(BA[1, 1, ] > lambda)/N
  right_lambda <- sum(BA[1, 2, ] < lambda)/N
  left_beta <- sum(BA[2, 1, ] > beta)/N
  right_beta <- sum(BA[2, 2, ] < beta)/N
  total_lambda <- left_lambda + right_lambda
  total_beta <- left_beta + right_beta
  sealphahat <- sqrt(alpha*(1 - alpha)/N)
  antilambda <- total_lambda > (alpha + 2.58*sealphahat)
  conlambda <- total_lambda < (alpha - 2.58*sealphahat)
  asymlambda <- (max(left_lambda, right_lambda)/min(left_lambda, right_lambda)) > 1.5
  antibeta <- total_beta > (alpha + 2.58*sealphahat)
  conbeta <- total_beta < (alpha - 2.58*sealphahat)
  asymbeta <- (max(left_beta, right_beta)/min(left_beta, right_beta)) > 1.5
  anti <- antilambda + antibeta
  con <- conlambda + conbeta
  asym <- asymlambda + asymbeta
  return(cbind(anti, con, asym))
}
boot_sum(BA)
# anti con asym
# [1,] 2 0 2
Note: You should definitely check the code in the body of estimate() (i.e. run it manually several times without bootstrapping), as it throws warnings every now and then; there is probably a mistake in how you define LLF().
Warning messages:
1: In log(lambda * t_i^(beta - 1) * beta * exp(t_i^beta) * exp(lambda * :
NaNs produced
2: In log(lambda * t_i^(beta - 1) * beta * exp(t_i^beta) * exp(lambda * :
NaNs produced
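For that check, a standalone evaluation of the log-likelihood could look like the sketch below (it re-creates one simulated sample outside the bootstrap, using the same parameter values as in the question, so LLF() can be inspected directly; this is an illustration, not part of the original code):
set.seed(1)
n <- 140; lambda <- 0.02; beta <- 0.5   # same values as above
u   <- runif(n)
c_i <- rexp(n, 0.0001)
t_i <- (log(1 - (1/lambda)*log(1 - u)))^(1/beta)
s_i <- 1*(t_i < c_i)
LLF <- function(para) {
  lambda <- para[1]; beta <- para[2]
  e <- s_i*log(lambda*t_i^(beta - 1)*beta*exp(t_i^beta)*exp(lambda*(1 - exp(t_i^beta))))
  r <- (1 - s_i)*log(exp(lambda*(1 - exp(t_i^beta))))
  sum(e + r)
}
LLF(c(0.02, 0.5))  # a finite value is expected; NaN here points at the log()/exp() terms overflowing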
Also, I'm not sure the summary calculation currently makes much sense.
My advice is to check your likelihood function and the summary by (1) running the lines manually and (2) starting with a very small N, say 10, to see whether the calculations make sense.
Once you've checked that, it's worth waiting the ~167 minutes for the result.
Or parallelize innerBoot(), which is about 80% faster overall (using 7 cores), as follows:
innerBootParallel <- function() {
  boot <- parSapply(cl, 1:N, function(i) estimate())
  return(matrixStats::rowQuantiles(boot, p=c(.025, .975)))
}
library(parallel)
cl <- makeCluster(detectCores() - 1)
clusterExport(cl, c('estimate', 'n', 'lambda', 'N', 'beta'))
clusterSetRNGStream(cl, seed)
BA <- replicate(N, innerBootParallel())
stopCluster(cl)
boot_sum(BA)

R microbenching loop vs vector vs parallel is yielding inverted results

I'm having a weird issue and am not able to identify the root cause. I wrote simple for-loop, vectorized, and parallel versions and am running them in a Jupyter environment.
library(parallel)  # needed for mclapply()
fn_loop <- function(x1, nrep = 1000) {
  n1 <- length(x1)
  B <- nrep
  Tboot <- rep(NA, B)
  xx1 <- rep(NA, n1)
  for (i in 1:B) {
    xx1 <- sample(x1, n1, replace = TRUE) # sample of size n1 with replacement from x1
    Tboot[i] <- mean(xx1)
  }
  return(Tboot)
}
fn_vec <- function(x1, nrep = 1000) {
  n1 <- length(x1)
  B <- nrep
  Tboot <- lapply(1:nrep, function(i){ mean(sample(x1, n1, replace = TRUE)) })
  return(Tboot)
}
fn_par <- function(x1, nrep = 1000) {
  n1 <- length(x1)
  B <- nrep
  Tboot <- mclapply(1:nrep, function(i){ mean(sample(x1, n1, replace = TRUE)) }, mc.cores = 8)
  return(Tboot)
}
# Benchmark the for-loop, vectorized, and parallel versions
library(microbenchmark)
m <- microbenchmark(times = 1000, # default is 100
                    "loop" = fn_loop(x1, 1000),
                    "vector" = fn_vec(x1, 1000),
                    "parallel" = fn_par(x1, 1000))
ggplot2::autoplot(m)
The results in the ggplot2 output are inverted... the loop takes the least amount of time compared to the parallel version. Can someone please point out my mistake? Thanks.

Speeding up linear model fitting on complete pairwise observations in large sparse matrix in R

I have a numeric data.frame df with 134946 rows x 1938 columns.
99.82% of the data are NA.
For each pair of (distinct) columns "P1" and "P2", I need to find which rows have non-NA values for both and then do some operations on those rows (linear model).
I wrote a script that does this, but it seems quite slow.
This post seems to discuss a related task, but I can't immediately see if or how it can be adapted to my case.
Borrowing the example from that post:
set.seed(54321)
nr = 1000;
nc = 900;
dat = matrix(runif(nr*nc), nrow=nr)
rownames(dat) = paste(1:nr)
colnames(dat) = paste("time", 1:nc)
dat[sample(nr*nc, nr*nc*0.9)] = NA
df <- as.data.frame(dat)
df_ps <- names(df)
N_ps <- length(df_ps)
My script is:
tic = proc.time()
out <- do.call(rbind, sapply(1:(N_ps-1), function(i) {
  if (i/10 == floor(i/10)) {
    cat("\ni = ", i, "\n")
    toc = proc.time();
    show(toc - tic);
  }
  do.call(rbind, sapply((i+1):N_ps, function(j) {
    w <- which(complete.cases(df[,i], df[,j]))
    N <- length(w)
    if (N >= 5) {
      xw <- df[w,i]
      yw <- df[w,j]
      if ((diff(range(xw)) != 0) & (diff(range(yw)) != 0)) {
        s <- summary(lm(yw~xw))
        o <- c(i, j, N, s$adj.r.squared, s$coefficients[2], s$coefficients[4], s$coefficients[8],
               s$coefficients[1], s$coefficients[3], s$coefficients[7])
      } else {
        o <- c(i, j, N, rep(NA,7))
      }
    } else {o <- NULL}
    return(o)
  }, simplify = F))
}, simplify = F))
toc = proc.time();
show(toc - tic);
This takes about 10 minutes on my machine.
You can imagine what happens when I need to handle a much larger (although more sparse) data matrix. I never managed to finish the calculation.
Question: do you think this could be done more efficiently?
The thing is, I don't know which operations take the most time (subsetting of df, in which case I would remove the duplicated subsetting? appending the matrix data, in which case I would build a flat vector and convert it to a matrix at the end? ...).
Thanks!
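To see which operations actually take the most time, one option (a sketch using base R's Rprof(), not from the original post) is to profile a small slice of the pairwise loop and inspect the self-time of each function:
Rprof("pairwise_lm.out")
invisible(sapply(1:10, function(i) {           # profile only the first 10 columns to keep it quick
  sapply((i + 1):N_ps, function(j) {
    w <- which(complete.cases(df[, i], df[, j]))
    if (length(w) >= 5) summary(lm(df[w, j] ~ df[w, i]))$adj.r.squared else NA
  })
}))
Rprof(NULL)
head(summaryRprof("pairwise_lm.out")$by.self)  # shows whether lm()/summary() or the subsetting dominates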
EDIT following up from minem's post
As shown by minem, the speed of this calculation strongly depended on the way linear regression parameters were calculated. Therefore changing that part was the single most important thing to do.
My own further trials showed that:
1) it was essential to use sapply in combination with do.call(rbind, ...) rather than any flat vector to store the data (I am still not sure why; I might make a separate post about this);
2) on the original matrix I am working on, which is much more sparse and has a much larger nrow/ncol ratio than the one in this example, using the information on the x vector available at the start of each i iteration to reduce the y vector at the start of each j iteration increased the speed by several orders of magnitude, even compared with minem's original script, which was already much better than mine above.
I suppose the advantage comes from filtering out many rows a priori, thus avoiding costly xna & yna operations on very long vectors.
The modified script is the following:
set.seed(54321)
nr = 1000;
nc = 900;
dat = matrix(runif(nr*nc), nrow = nr)
rownames(dat) = paste(1:nr)
colnames(dat) = paste("time", 1:nc)
dat[sample(nr*nc, nr*nc*0.90)] = NA
df <- as.data.frame(dat)
df_ps <- names(df)
N_ps <- length(df_ps)
tic = proc.time()
naIds <- lapply(df, function(x) !is.na(x))
dl <- as.list(df)
rl <- sapply(1:(N_ps - 1), function(i) {
  if ((i-1)/10 == floor((i-1)/10)) {
    cat("\ni = ", i, "\n")
    toc = proc.time();
    show(toc - tic);
  }
  x <- dl[[i]]
  xna <- which(naIds[[i]])
  rl2 <- sapply((i + 1):N_ps, function(j) {
    y <- dl[[j]][xna]
    yna <- which(naIds[[j]][xna])
    w <- xna[yna]
    N <- length(w)
    if (N >= 5) {
      xw <- x[w]
      yw <- y[yna]
      if ((min(xw) != max(xw)) && (min(yw) != max(yw))) {
        # extracts from lm/lm.fit/summary.lm functions
        X <- cbind(1L, xw)
        m <- .lm.fit(X, yw)
        # calculate adj.r.squared
        fitted <- yw - m$residuals
        rss <- sum(m$residuals^2)
        mss <- sum((fitted - mean(fitted))^2)
        n <- length(m$residuals)
        rdf <- n - m$rank
        # rdf <- df.residual
        r.squared <- mss/(mss + rss)
        adj.r.squared <- 1 - (1 - r.squared) * ((n - 1L)/rdf)
        # calculate se & pvals
        p1 <- 1L:m$rank
        Qr <- m$qr
        R <- chol2inv(Qr[p1, p1, drop = FALSE])
        resvar <- rss/rdf
        se <- sqrt(diag(R) * resvar)
        est <- m$coefficients[m$pivot[p1]]
        tval <- est/se
        pvals <- 2 * pt(abs(tval), rdf, lower.tail = FALSE)
        res <- c(m$coefficients[2], se[2], pvals[2],
                 m$coefficients[1], se[1], pvals[1])
        o <- c(i, j, N, adj.r.squared, res)
      } else {
        o <- c(i, j, N, rep(NA,7))
      }
    } else {o <- NULL}
    return(o)
  }, simplify = F)
  do.call(rbind, rl2)
}, simplify = F)
out2 <- do.call(rbind, rl)
toc = proc.time();
show(toc - tic)
E.g. try with nr=100000; nc=100.
I should probably mention that I tried using indices, i.e.:
naIds <- lapply(df, function(x) which(!is.na(x)))
and then obviously generating w by intersection:
w <- intersect(xna,yna)
N <- length(w)
This however is slower than the above.
The largest bottleneck is the lm function, because it does a lot of checks and additional calculations that you do not necessarily need, so I extracted only the needed parts.
I got this to run in +/- 18 seconds.
set.seed(54321)
nr = 1000;
nc = 900;
dat = matrix(runif(nr*nc), nrow = nr)
rownames(dat) = paste(1:nr)
colnames(dat) = paste("time", 1:nc)
dat[sample(nr*nc, nr*nc*0.9)] = NA
df <- as.data.frame(dat)
df_ps <- names(df)
N_ps <- length(df_ps)
tic = proc.time()
naIds <- lapply(df, function(x) !is.na(x)) # outside loop
dl <- as.list(df) # sub-setting list elements is faster than columns
rl <- sapply(1:(N_ps - 1), function(i) {
  x <- dl[[i]]
  xna <- naIds[[i]] # relevant logical vector if not empty elements
  rl2 <- sapply((i + 1):N_ps, function(j) {
    y <- dl[[j]]
    yna <- naIds[[j]]
    w <- xna & yna
    N <- sum(w)
    if (N >= 5) {
      xw <- x[w]
      yw <- y[w]
      if ((min(xw) != max(xw)) && (min(yw) != max(yw))) { # faster
        # extracts from lm/lm.fit/summary.lm functions
        X <- cbind(1L, xw)
        m <- .lm.fit(X, yw)
        # calculate adj.r.squared
        fitted <- yw - m$residuals
        rss <- sum(m$residuals^2)
        mss <- sum((fitted - mean(fitted))^2)
        n <- length(m$residuals)
        rdf <- n - m$rank
        # rdf <- df.residual
        r.squared <- mss/(mss + rss)
        adj.r.squared <- 1 - (1 - r.squared) * ((n - 1L)/rdf)
        # calculate se & pvals
        p1 <- 1L:m$rank
        Qr <- m$qr
        R <- chol2inv(Qr[p1, p1, drop = FALSE])
        resvar <- rss/rdf
        se <- sqrt(diag(R) * resvar)
        est <- m$coefficients[m$pivot[p1]]
        tval <- est/se
        pvals <- 2 * pt(abs(tval), rdf, lower.tail = FALSE)
        res <- c(m$coefficients[2], se[2], pvals[2],
                 m$coefficients[1], se[1], pvals[1])
        o <- c(i, j, N, adj.r.squared, res)
      } else {
        o <- c(i, j, N, rep(NA,7))
      }
    } else {o <- NULL}
    return(o)
  }, simplify = F)
  do.call(rbind, rl2)
}, simplify = F)
out2 <- do.call(rbind, rl)
toc = proc.time();
show(toc - tic);
# user system elapsed
# 17.94 0.11 18.44

How to convert UK grid reference to latitude and longitude in R

I have a vector of UK British National Grid references:
x <- c("SK393744", "SK442746", "SK376747", "SK108191", "SP169914", "SP206935", "SK173105", "SJ993230", "SK448299", "SK112396")
I need to convert this vector into WGS84 coordinates (latitude and longitude).
How can I do this in R?
Give these a go. If they work, I'll make a package with a few more of the other functions in that JavaScript library (which also has sister PHP & Java libraries, so it's fitting that R should have one).
# takes numeric east/north generated from the os.grid.parse() function
# I should have made it take the vector that os.grid.parse() returns, but
# we'll save that for a proper package version
os.grid.to.lat.lon <- function(E, N) {
  a <- 6377563.396
  b <- 6356256.909
  F0 <- 0.9996012717
  lat0 <- 49*pi/180
  lon0 <- -2*pi/180
  N0 <- -100000
  E0 <- 400000
  e2 <- 1 - (b^2)/(a^2)
  n <- (a-b)/(a+b)
  n2 <- n^2
  n3 <- n^3
  lat <- lat0
  M <- 0
  repeat {
    lat <- (N-N0-M)/(a*F0) + lat
    Ma <- (1 + n + (5/4)*n2 + (5/4)*n3) * (lat-lat0)
    Mb <- (3*n + 3*n*n + (21/8)*n3) * sin(lat-lat0) * cos(lat+lat0)
    Mc <- ((15/8)*n2 + (15/8)*n3) * sin(2*(lat-lat0)) * cos(2*(lat+lat0))
    Md <- (35/24)*n3 * sin(3*(lat-lat0)) * cos(3*(lat+lat0))
    M <- b * F0 * (Ma - Mb + Mc - Md)
    if (N-N0-M < 0.00001) { break }
  }
  cosLat <- cos(lat)
  sinLat <- sin(lat)
  nu <- a*F0/sqrt(1-e2*sinLat*sinLat)
  rho <- a*F0*(1-e2)/((1-e2*sinLat*sinLat)^1.5)
  eta2 <- nu/rho-1
  tanLat <- tan(lat)
  tan2lat <- tanLat*tanLat
  tan4lat <- tan2lat*tan2lat
  tan6lat <- tan4lat*tan2lat
  secLat <- 1/cosLat
  nu3 <- nu*nu*nu
  nu5 <- nu3*nu*nu
  nu7 <- nu5*nu*nu
  VII <- tanLat/(2*rho*nu)
  VIII <- tanLat/(24*rho*nu3)*(5+3*tan2lat+eta2-9*tan2lat*eta2)
  IX <- tanLat/(720*rho*nu5)*(61+90*tan2lat+45*tan4lat)
  X <- secLat/nu
  XI <- secLat/(6*nu3)*(nu/rho+2*tan2lat)
  XII <- secLat/(120*nu5)*(5+28*tan2lat+24*tan4lat)
  XIIA <- secLat/(5040*nu7)*(61+662*tan2lat+1320*tan4lat+720*tan6lat)
  dE <- (E-E0)
  dE2 <- dE*dE
  dE3 <- dE2*dE
  dE4 <- dE2*dE2
  dE5 <- dE3*dE2
  dE6 <- dE4*dE2
  dE7 <- dE5*dE2
  lon <- lon0 + X*dE - XI*dE3 + XII*dE5 - XIIA*dE7
  lat <- lat - VII*dE2 + VIII*dE4 - IX*dE6
  lat <- lat * 180/pi
  lon <- lon * 180/pi
  return(c(lat, lon))
}
# takes a string OS reference and returns an E/N vector
os.grid.parse <- function(grid.ref) {
  grid.ref <- toupper(grid.ref)
  # get numeric values of letter references, mapping A->0, B->1, C->2, etc:
  l1 <- as.numeric(charToRaw(substr(grid.ref, 1, 1))) - 65
  l2 <- as.numeric(charToRaw(substr(grid.ref, 2, 2))) - 65
  # shuffle down letters after 'I' since 'I' is not used in grid:
  if (l1 > 7) l1 <- l1 - 1
  if (l2 > 7) l2 <- l2 - 1
  # convert grid letters into 100km-square indexes from false origin - grid square SV
  e <- ((l1-2) %% 5) * 5 + (l2 %% 5)
  n <- (19 - floor(l1/5) * 5) - floor(l2/5)
  if (e < 0 || e > 6 || n < 0 || n > 12) { return(c(NA, NA)) }
  # skip grid letters to get numeric part of ref, stripping any spaces:
  ref.num <- gsub(" ", "", substr(grid.ref, 3, nchar(grid.ref)))
  ref.mid <- floor(nchar(ref.num) / 2)
  ref.len <- nchar(ref.num)
  if (ref.len >= 10) { return(c(NA, NA)) }
  e <- paste(e, substr(ref.num, 0, ref.mid), sep="", collapse="")
  n <- paste(n, substr(ref.num, ref.mid+1, ref.len), sep="", collapse="")
  nrep <- 5 - match(ref.len, c(0,2,4,6,8))
  e <- as.numeric(paste(e, "5", rep("0", nrep), sep="", collapse=""))
  n <- as.numeric(paste(n, "5", rep("0", nrep), sep="", collapse=""))
  return(c(e, n))
}
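For example, to run both functions over the question's vector x (a quick usage sketch, not part of the original answer; note that os.grid.to.lat.lon() as written works on the Airy 1830 / OSGB36 ellipsoid and applies no datum shift, so the output is only approximately WGS84):
x <- c("SK393744", "SK442746", "SK376747", "SK108191", "SP169914",
       "SP206935", "SK173105", "SJ993230", "SK448299", "SK112396")
coords <- t(sapply(x, function(ref) {
  en <- os.grid.parse(ref)            # grid reference -> easting/northing
  os.grid.to.lat.lon(en[1], en[2])    # easting/northing -> latitude/longitude
}))
colnames(coords) <- c("lat", "lon")
coords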
