How to convert UK grid references to latitude and longitude in R

I have a vector of UK British National Grid references:
x <- c("SK393744", "SK442746", "SK376747", "SK108191", "SP169914", "SP206935", "SK173105", "SJ993230", "SK448299", "SK112396")
I need to convert this vector to WGS84 coordinates (latitude and longitude).
How can I do this in R?

Give these a go. If they work, I'll make a package with a few more of the functions in that JavaScript library (which also has sister PHP & Java libraries, so it's fitting that R should have one).
# Takes the numeric easting/northing generated by the os.grid.parse() function.
# I should have made it take the vector os.grid.parse() returns, but
# we'll save that for a proper package version.
os.grid.to.lat.lon <- function(E, N) {
  # Airy 1830 ellipsoid and OS National Grid projection constants
  a <- 6377563.396           # semi-major axis (m)
  b <- 6356256.909           # semi-minor axis (m)
  F0 <- 0.9996012717         # scale factor on the central meridian
  lat0 <- 49 * pi/180        # latitude of true origin (49 N)
  lon0 <- -2 * pi/180        # longitude of true origin (2 W)
  N0 <- -100000              # northing of true origin (m)
  E0 <- 400000               # easting of true origin (m)
  e2 <- 1 - (b^2)/(a^2)      # eccentricity squared
  n <- (a - b)/(a + b)
  n2 <- n^2
  n3 <- n^3
  # iterate until the meridional arc M matches the given northing
  lat <- lat0
  M <- 0
  repeat {
    lat <- (N - N0 - M)/(a * F0) + lat
    Ma <- (1 + n + (5/4)*n2 + (5/4)*n3) * (lat - lat0)
    Mb <- (3*n + 3*n*n + (21/8)*n3) * sin(lat - lat0) * cos(lat + lat0)
    Mc <- ((15/8)*n2 + (15/8)*n3) * sin(2*(lat - lat0)) * cos(2*(lat + lat0))
    Md <- (35/24)*n3 * sin(3*(lat - lat0)) * cos(3*(lat + lat0))
    M <- b * F0 * (Ma - Mb + Mc - Md)
    if (N - N0 - M < 0.00001) break   # converged to within 0.01 mm
  }
  cosLat <- cos(lat)
  sinLat <- sin(lat)
  nu <- a*F0/sqrt(1 - e2*sinLat*sinLat)               # transverse radius of curvature
  rho <- a*F0*(1 - e2)/((1 - e2*sinLat*sinLat)^1.5)   # meridional radius of curvature
  eta2 <- nu/rho - 1
  tanLat <- tan(lat)
  tan2lat <- tanLat*tanLat
  tan4lat <- tan2lat*tan2lat
  tan6lat <- tan4lat*tan2lat
  secLat <- 1/cosLat
  nu3 <- nu*nu*nu
  nu5 <- nu3*nu*nu
  nu7 <- nu5*nu*nu
  VII <- tanLat/(2*rho*nu)
  VIII <- tanLat/(24*rho*nu3)*(5 + 3*tan2lat + eta2 - 9*tan2lat*eta2)
  IX <- tanLat/(720*rho*nu5)*(61 + 90*tan2lat + 45*tan4lat)
  X <- secLat/nu
  XI <- secLat/(6*nu3)*(nu/rho + 2*tan2lat)
  XII <- secLat/(120*nu5)*(5 + 28*tan2lat + 24*tan4lat)
  XIIA <- secLat/(5040*nu7)*(61 + 662*tan2lat + 1320*tan4lat + 720*tan6lat)
  dE <- (E - E0)
  dE2 <- dE*dE
  dE3 <- dE2*dE
  dE4 <- dE2*dE2
  dE5 <- dE3*dE2
  dE6 <- dE4*dE2
  dE7 <- dE5*dE2
  lon <- lon0 + X*dE - XI*dE3 + XII*dE5 - XIIA*dE7
  lat <- lat - VII*dE2 + VIII*dE4 - IX*dE6
  # radians to degrees
  lat <- lat * 180/pi
  lon <- lon * 180/pi
  return(c(lat, lon))
}
# Takes a string OS grid reference and returns an easting/northing vector.
os.grid.parse <- function(grid.ref) {
  grid.ref <- toupper(grid.ref)
  # get numeric values of the letter references, mapping A->0, B->1, C->2, etc.
  l1 <- as.numeric(charToRaw(substr(grid.ref, 1, 1))) - 65
  l2 <- as.numeric(charToRaw(substr(grid.ref, 2, 2))) - 65
  # shuffle down letters after 'I', since 'I' is not used in the grid
  if (l1 > 7) l1 <- l1 - 1
  if (l2 > 7) l2 <- l2 - 1
  # convert grid letters into 100km-square indexes from the false origin (grid square SV)
  e <- ((l1 - 2) %% 5) * 5 + (l2 %% 5)
  n <- (19 - floor(l1/5) * 5) - floor(l2/5)
  if (e < 0 || e > 6 || n < 0 || n > 12) return(c(NA, NA))
  # skip the grid letters to get the numeric part of the ref, stripping any spaces
  ref.num <- gsub(" ", "", substr(grid.ref, 3, nchar(grid.ref)))
  ref.mid <- floor(nchar(ref.num)/2)
  ref.len <- nchar(ref.num)
  if (ref.len >= 10) return(c(NA, NA))
  # prepend the 100km-square indexes to each half of the numeric part
  e <- paste(e, substr(ref.num, 1, ref.mid), sep = "", collapse = "")
  n <- paste(n, substr(ref.num, ref.mid + 1, ref.len), sep = "", collapse = "")
  # pad to metres, placing the point at the centre of the grid square
  nrep <- 5 - match(ref.len, c(0, 2, 4, 6, 8))
  e <- as.numeric(paste(e, "5", rep("0", nrep), sep = "", collapse = ""))
  n <- as.numeric(paste(n, "5", rep("0", nrep), sep = "", collapse = ""))
  return(c(e, n))
}
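For example, applying these to the vector in the question (a rough check; note the result is strictly an OSGB36 latitude/longitude, since no datum shift to WGS84 is applied, which can matter by up to roughly 100 m):
# parse each reference to easting/northing, then project to lat/lon
coords <- t(sapply(x, function(ref) {
  en <- os.grid.parse(ref)
  if (any(is.na(en))) c(NA, NA) else os.grid.to.lat.lon(en[1], en[2])
}))
colnames(coords) <- c("lat", "lon")
coords   # e.g. "SK393744" should come out near 53.27 N, 1.41 W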

Related

I am simulating a CTMC using the Gillespie algorithm for the dynamics of leprosy with 10 compartments, and I encounter an error when calling simdat.

abcdefghij.onestep <- function (x, params) {
  # unpack the state vector (x[1] is time)
  Susceptible <- x[2]
  Exposed <- x[3]
  Infected_Multibacillary <- x[4]
  Infected_Paucibacillary <- x[5]
  Exposed_Detected_Diagnosis <- x[6]
  Treated <- x[7]
  Disability <- x[8]
  Recovered <- x[9]
  Relapse_Multibacillary <- x[10]
  Relapse_Paucibacillary <- x[11]
  N <- Susceptible + Exposed + Infected_Multibacillary + Infected_Paucibacillary +
    Exposed_Detected_Diagnosis + Treated + Disability + Recovered +
    Relapse_Multibacillary + Relapse_Paucibacillary
  m12 <- params["m12"]
  m25 <- params["m25"]
  m23 <- params["m23"]
  m24 <- params["m24"]
  m35 <- params["m35"]
  m45 <- params["m45"]
  m37 <- params["m37"]
  m56 <- params["m56"]
  m67 <- params["m67"]
  m68 <- params["m68"]
  m89 <- params["m89"]
  m810 <- params["m810"]
  m96 <- params["m96"]
  m97 <- params["m97"]
  m106 <- params["m106"]
  mu <- params["mu"]
  rates <- c(
    birth = mu*N,
    susceptible_exposed = m12*Susceptible*Infected_Multibacillary -
      m25*Infected_Multibacillary*Exposed_Detected_Diagnosis -
      m23*Exposed*Infected_Multibacillary - m24*Exposed*Infected_Paucibacillary,
    exposed_infected_multibacillary = m23*Exposed*Infected_Paucibacillary -
      m35*Infected_Multibacillary*Exposed_Detected_Diagnosis -
      m45*Exposed_Detected_Diagnosis*Infected_Paucibacillary -
      m37*Disability*Infected_Multibacillary,
    exposed_infected_paucibacillary = m24*Exposed*Infected_Paucibacillary -
      m45*Infected_Paucibacillary*Exposed_Detected_Diagnosis,
    infected_multibacillary_exposed_detected = m35*Infected_Multibacillary*Exposed_Detected_Diagnosis +
      m45*Infected_Paucibacillary*Exposed_Detected_Diagnosis -
      m56*Exposed_Detected_Diagnosis*Treated,
    exposed_detected_treatment = m56*Exposed_Detected_Diagnosis*Treated -
      m67*Treated*Disability - m68*Treated*Recovered,
    infected_multibacillary_disability = m37*Disability*Infected_Multibacillary +
      m67*Treated*Disability,
    treated_recovered = m68*Treated*Recovered -
      m89*Recovered*Infected_Multibacillary - m810*Recovered*Infected_Paucibacillary,
    relapse_multibacillary_treatment = -m96*Relapse_Multibacillary*Treated -
      m97*Relapse_Multibacillary*Disability,
    relapse_paucibacillary_treatment = -m106*Relapse_Paucibacillary*Treated,
    susceptible_death = mu*Susceptible,
    exposed_death = mu*Exposed,
    infected_multibacillary_death = mu*Infected_Multibacillary,
    infected_paucibacillary_death = mu*Infected_Paucibacillary,
    exposed_detected_death = mu*Exposed_Detected_Diagnosis,
    treatment_death = mu*Treated,
    disability_death = mu*Disability,
    recovered_death = mu*Recovered,
    relapse_multibacillary_death = mu*Relapse_Multibacillary,
    relapse_paucibacillary_death = mu*Relapse_Paucibacillary
  )
  # each transition is a change vector over the 10 compartments
  transitions <- list(
    birth = c(1,0,0,0,0,0,0,0,0,0),
    susceptible_exposed = c(-1,1,0,0,0,0,0,0,0,0),
    exposed_infected_multibacillary = c(0,-1,1,0,0,0,0,0,0,0),
    exposed_infected_paucibacillary = c(0,-1,0,1,0,0,0,0,0,0),
    infected_multibacillary_exposed_detected = c(0,0,-1,0,1,0,0,0,0,0),
    exposed_detected_treatment = c(0,0,0,0,-1,1,0,0,0,0),
    infected_multibacillary_disability = c(0,0,-1,0,0,0,1,0,0,0),
    treated_recovered = c(0,0,0,0,0,-1,0,1,0,0),
    relapse_multibacillary_treatment = c(0,0,0,0,0,1,0,0,-1,0),
    relapse_paucibacillary_treatment = c(0,0,0,0,0,1,0,0,0,-1),
    susceptible_death = c(-1,0,0,0,0,0,0,0,0,0),
    exposed_death = c(0,-1,0,0,0,0,0,0,0,0),
    infected_multibacillary_death = c(0,0,-1,0,0,0,0,0,0,0),
    infected_paucibacillary_death = c(0,0,0,-1,0,0,0,0,0,0),
    exposed_detected_death = c(0,0,0,0,-1,0,0,0,0,0),
    treatment_death = c(0,0,0,0,0,-1,0,0,0,0),
    disability_death = c(0,0,0,0,0,0,-1,0,0,0),
    recovered_death = c(0,0,0,0,0,0,0,-1,0,0),
    relapse_multibacillary_death = c(0,0,0,0,0,0,0,0,-1,0),
    relapse_paucibacillary_death = c(0,0,0,0,0,0,0,0,0,-1)
  )
  total.rate <- sum(rates)
  if (total.rate == 0)
    tau <- Inf
  else
    tau <- rexp(n = 1, rate = total.rate)
  event <- sample.int(n = length(rates), size = 1, prob = rates/total.rate)
  x + c(tau, transitions[[event]])
}
abcdefghij.simul <- function (x, params, maxstep = 10000) {
  output <- array(dim = c(maxstep + 1, 4))
  colnames(output) <- names(x)
  output[1, ] <- x
  k <- 1
  while ((k <= maxstep) && (x["Exposed"] > 0)) {
    k <- k + 1
    output[k, ] <- x <- abcdefghij.onestep(x, params)
  }
  as.data.frame(output[1:k, ])
}
And in R this happens:
> set.seed(56856583)
> nsims <- 1
> xstart <- c(time=1,Susceptible=100000,Exposed=1,Infected_Multibacillary=1,Infected_Paucibacillary=1,Exposed_Detected_Diagnosis=1,Treated=1,Disability=1,Recovered=1,Relapse_Multibacillary=1,Relapse_Paucibacillary=1)
> library(plyr)
> simdat <- rdply(nsims, abcdefghij.simul(xstart,params))
Error in dimnames(x) <- dn :
length of 'dimnames' [2] not equal to array extent
Called from: `colnames<-`(`*tmp*`, value = names(x))
Browse[1]>
Change this line:
output <- array(dim=c(maxstep+1,4))
To this instead:
output <- array(dim=c(maxstep+1,11))
Your xstart variable has 11 elements. If you want them all in one row, you have to create something that is 11 columns wide, not 4. Perhaps you had just 4 state variables in the beginning.
Furthermore, you don't seem to define params anywhere. The code won't run until you do.
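For example, a minimal params definition just to make the code run (a sketch; these rate values are placeholders, not estimates from any fitted model):
params <- c(m12 = 2e-6, m25 = 1e-6, m23 = 1e-6, m24 = 1e-6,
            m35 = 1e-6, m45 = 1e-6, m37 = 1e-6, m56 = 1e-6,
            m67 = 1e-6, m68 = 1e-6, m89 = 1e-6, m810 = 1e-6,
            m96 = 1e-6, m97 = 1e-6, m106 = 1e-6, mu = 1/(70*365))
# every rate that abcdefghij.onestep() looks up must be present by name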

append list for loop

I cannot figure out what I'm doing wrong when appending the results of a loop (as a tibble) to a list.
The code is below. I believe it has to do with the d in distance and the values not being positive?
Error in datalist[[d]] <- dat :
attempt to select less than one element in integerOneIndex
library(tidyverse)
x <- c(10, 5)
y <- c(1, 3)
distance <- c(1, 2, 3)   # distance away from the road
old <- data.frame(x, y)
datalist = list()
datalist2 = list()
for (d in 1:length(distance)) {
  # Given a vector (defined by 2 points) and the distance,
  # calculate a new vector that is distance away from the original
  segment.shift <- function(x, y, d) {
    # calculate vector
    v <- c(x[2] - x[1], y[2] - y[1])
    # normalize vector
    v <- v/sqrt((v[1]**2 + v[2]**2))
    # perpendicular unit vector
    vnp <- c(-v[2], v[1])
    return(list(x = c(x[1] + d*vnp[1], x[2] + d*vnp[1]),
                y = c(y[1] + d*vnp[2], y[2] + d*vnp[2])))
  }
  # allocate memory for the bike path (left side)
  xn <- numeric((length(x) - 1) * 2)
  yn <- numeric((length(y) - 1) * 2)
  for (i in 1:(length(x) - 1)) {
    xs <- c(x[i], x[i+1])
    ys <- c(y[i], y[i+1])
    new.s <- segment.shift(xs, ys, d)
    xn[(i-1)*2+1] <- new.s$x[1]; xn[(i-1)*2+2] <- new.s$x[2]
    yn[(i-1)*2+1] <- new.s$y[1]; yn[(i-1)*2+2] <- new.s$y[2]
  }
  dat1 <- as.data.frame(xn) %>% mutate(id = row_number()) %>%
    mutate(Dist_Col = paste(d, "left", sep = "_"))
  #datalist[[d]] <- dat1 # add it to your list
  dat2 <- as.data.frame(yn) %>% mutate(id = row_number()) %>%
    mutate(Dist_Col = paste(d, "left", sep = "_"))
  #datalist2[[d]] <- dat2 # add it to your list
  ### now do the right side
  # allocate memory for the bike path (right side)
  xn <- numeric((length(x) - 1) * 2)
  yn <- numeric((length(y) - 1) * 2)
  for (i in 1:(length(x) - 1)) {
    xs <- c(x[i], x[i+1])
    ys <- c(y[i], y[i+1])
    new.s <- segment.shift(xs, ys, -d)
    xn[(i-1)*2+1] <- new.s$x[1]; xn[(i-1)*2+2] <- new.s$x[2]
    yn[(i-1)*2+1] <- new.s$y[1]; yn[(i-1)*2+2] <- new.s$y[2]
  }
  dat3 <- as.data.frame(xn) %>% mutate(id = row_number()) %>%
    mutate(Dist_Col = paste(d, "right", sep = "_"))
  datcomb <- full_join(dat1, dat3)
  datalist[[d]] <- datcomb # add it to your list
  dat4 <- as.data.frame(yn) %>% mutate(id = row_number()) %>%
    mutate(Dist_Col = paste(d, "right", sep = "_"))
  dat2comb <- full_join(dat2, dat4)
  datalist2[[d]] <- dat2comb # add it to your list
}
big_data = do.call(rbind, datalist)
big_data2 = do.call(rbind, datalist2)
comb_data <- full_join(big_data, big_data2)
ggplot() + geom_line(data = old, aes(x, y), color = 'black') +
  geom_line(data = comb_data, aes(xn, yn, group = Dist_Col), color = 'red')
See the updated code above, which plots parallel lines on both sides of the original line.
I finally resolved the issue with:
for (d in 1:length(distance))
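For anyone hitting the same error: [[d]] needs a positive integer index, so loop over positions rather than over the distance values themselves. A minimal sketch of the pattern (with a hypothetical non-integer distance vector that would reproduce the error):
distance <- c(0.5, 1.5, 2.5)       # datalist[[0.5]] would fail: it truncates to [[0]]
datalist <- list()
for (k in seq_along(distance)) {   # k = 1, 2, 3: always a valid list index
  d <- distance[k]                 # the actual distance value for this pass
  datalist[[k]] <- data.frame(dist = d)
}
big_data <- do.call(rbind, datalist)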

Dynamic Programming in R - Algorithm for Distance Between Warehouse Locations

I am trying to calculate the distance between warehouse locations in R (RStudio), utilizing an algorithm from an academic paper. The formula accounts for the dimensions of location width, depth, and side of the aisle. The added complexity comes in when calculating the shortest distance with multiple cross-aisles. This is all based on this paper.
This is from a bird's eye view:
I have static values for α = 48, β = 96, γ = 108, Ω = 75, S = 22.
I then have a data frame with i as the key for location number, X for aisle number, Y for section number, Z for side number, and Cross Aisle as a boolean (0 = not a cross-aisle, 1 = a cross-aisle). Here is a sample:
i X Y Z Cross Aisle
1 1 1 1 0
2 1 2 1 0
....
357 12 20 2 0
These are the formulas between locations i and j if the warehouse had no cross-aisles and was one rectangular grid:
Items in the same aisle (xi = xj):
dij = |yi - yj| * β + |zi - zj| * γ
If items are in different aisles, there are three different scenarios:
dij = |xi - xj| * (2α + γ) + v, if zi = zj
dij = (xj - xi) * (2α + γ) + γ + v, if zi = 1, zj = 2
dij = (xj - xi) * (2α + γ) - γ + v, if zi = 2, zj = 1
where v is the "vertical" distance (bird's eye, up-down the aisle):
v = min(β * (2S - yi - yj), β * (yi + yj)) + 2Ω
(*Note: the academic paper has a typo in the v formula. It states 2 - yi - yj in the first term, but I found another, original source that had it correctly as 2S - yi - yj.)
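As a quick numeric check of v with the static values above (β = 96, Ω = 75, S = 22) and illustrative section numbers yi = 3, yj = 5:
# v = min(beta*(2*S - yi - yj), beta*(yi + yj)) + 2*Omega
min(96 * (2 * 22 - 3 - 5), 96 * (3 + 5)) + 2 * 75
# [1] 918   (the beta*(yi + yj) route is the shorter one here)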
This piece of the formula is not the tricky part. It is a fairly simple if/then function in R to compute. However, this next bit with the cross-aisle gets me.
This is from the academic paper:
The authors state, essentially: there are two locations p1 and p2. There are two neighboring cross-aisles, a1 and a2: one above p1 and the other below p1. Likewise, cross-aisles b1 and b2 are found, which neighbor p2 and lead left. The distance between p1 and p2 is as follows:
d(p1,p2) = min{ d(p1,ai) + d(ai,bj) + d(bj,p2) : i,j ∈ {1,2} }
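In code terms, the minimisation just enumerates the four combinations of ai and bj; a sketch, where loc_dist() is a hypothetical helper returning the within-aisle distance between two location ids:
# hypothetical: p1, p2 are location ids; a1, a2 neighbor p1; b1, b2 neighbor p2
cands <- expand.grid(a = c(a1, a2), b = c(b1, b2))   # the four (ai, bj) pairings
d.p1.p2 <- min(mapply(function(a, b) loc_dist(p1, a) + loc_dist(a, b) + loc_dist(b, p2),
                      cands$a, cands$b))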
I am unsure how to apply this algorithm to my data set and construct the necessary loops and matrix to find the distances between my warehouse locations. I could really use some help making sense of it.
Here is my actual data set.
Here is an image of my warehouse to give you a sense of the layout. The "X" locations are cross-aisles.
I was able to get a workable loop without the cross-aisles:
require(openxlsx)   # for read.xlsx()
require(dplyr)      # for %>%
require(gtools)     # for combinations()
whse.data <- read.xlsx("data set.xlsx", sheet = 1) %>%
  as.data.frame()
### CREATE COMBINATION OF LOCATIONS
cmbn.loc <- combinations(n = max(whse.data$i), r = 2, v = whse.data$i,
                         repeats.allowed = FALSE) %>%
  as.data.frame()
### CALCULATE DISTANCE BETWEEN LOCATIONS
LocDst <- function(dc, wc, wa, tr, s, df, comb) {
  # Creates a distance between various locations
  #
  # Args:
  #   dc: depth of cell (alpha)
  #   wc: width of cell (beta)
  #   wa: width of aisle (gamma)
  #   tr: turning radius (Omega)
  #   s: number of sections (S)
  #   df: data frame with location i, x, y, z, Cross.Aisle data
  #   comb: combination of locations to compare
  #
  # Returns:
  #   Data frame with distances between each location combination
  dist.df_total <- data.frame()
  for (n in 1:nrow(comb)) {
    i <- comb[n, 1]
    j <- comb[n, 2]
    xi <- df[df$i == i, 2]
    yi <- df[df$i == i, 3]
    zi <- df[df$i == i, 4]
    xj <- df[df$i == j, 2]
    yj <- df[df$i == j, 3]
    zj <- df[df$i == j, 4]
    v <- min(wc * (2 * s - yi - yj), wc * (yi + yj)) + 2 * tr
    if (xi == xj) {
      dij <- abs(yi - yj) * wc + abs(zi - zj) * wa
    } else if (zi == zj) {
      dij <- (abs(xi - xj) * (2 * dc + wa) + v)
    } else if (zi == 1 & zj == 2) {
      dij <- ((xj - xi) * (2 * dc + wa) + wa + v)
    } else {
      dij <- ((xj - xi) * (2 * dc + wa) - wa + v)
    }
    dist.df <- data.frame(i = i, j = j, dist = dij)
    dist.df_total <- rbind.data.frame(dist.df_total, dist.df)
  }
  return(dist.df_total)
}
dist <- LocDst(48, 96, 108, 75, 18, whse.data, cmbn.loc)
I need a workable for loop or something similar to run Algorithm 1 above, please.
I was able to get something to work. If anyone has anything more straightforward, I am all ears. Maybe this will be helpful to someone out there!
I had to use Excel to calculate the distance between the cross-aisles. There's probably a way to code it, but it wasn't value-add for me at this time. Here's a sample of that data:
V1 V2 Dist
7 18 672
7 19 780
7 33 204
....
341 342 108
where V1 represents the first location number and V2 the second for all cross-aisle combinations.
Everything else should be computed within the code (beyond what I put above):
require(dplyr)
require(openxlsx)
require(gtools)   # for combinations()
whse.data <- read.xlsx("data set.xlsx", sheet = 1) %>%
  as.data.frame()
### CREATE COMBINATION OF LOCATIONS
cmbn.loc <- combinations(n = max(whse.data$i), r = 2, v = whse.data$i,
                         repeats.allowed = FALSE) %>%
  as.data.frame()
# CROSS-AISLES IN EACH SHELF
# (cross.aisles is presumably the subset of whse.data with Cross.Aisle == 1;
# its construction is not shown in the post)
ca.shelf <- cross.aisles %>%
  group_by(Shelf) %>%
  summarise(No.Cross.Aisles = sum(Cross.Aisle)) %>%
  as.data.frame()
# DISTANCE BETWEEN CROSS AISLES
cmbn.cross.aisle <- combinations(n = nrow(cross.aisles),
                                 r = 2,
                                 v = cross.aisles$i,
                                 repeats.allowed = FALSE) %>%
  as.data.frame()
dist.cross.aisle <- read.xlsx("Combination of Cross-Aisles v3.xlsx", sheet = 1) %>%
  as.data.frame()
# CROSS AISLE FUNCTION
CrsAisDst <- function(dc, wc, wa, tr, s, no.sh, df, comb, ca.m, d.m) {
  # Creates a distance between various locations
  #
  # Args:
  #   dc: depth of cell (alpha)
  #   wc: width of cell (beta)
  #   wa: width of aisle (gamma)
  #   tr: turning radius (Omega)
  #   s: number of sections (S)
  #   no.sh: number of shelves
  #   df: data frame with location i, x, y, z, Cross.Aisle data
  #   comb: combination of locations to compare
  #   ca.m: cross-aisles matrix
  #   d.m: distances between cross-aisles
  #
  # Returns:
  #   Data frame with distances between each location combination
  dist.df_total <- data.frame()
  for (n in 1:nrow(comb)) {
    i <- comb[n, 1]
    j <- comb[n, 2]
    xi <- df[df$i == i, 2]
    yi <- df[df$i == i, 3]
    zi <- df[df$i == i, 4]
    xj <- df[df$i == j, 2]
    yj <- df[df$i == j, 3]
    zj <- df[df$i == j, 4]
    v <- min(wc * (2 * s - yi - yj), wc * (yi + yj)) + 2 * tr
    if (xi == xj) {
      min.dij <- abs(yi - yj) * wc + abs(zi - zj) * wa
    } else {
      shi <- df[df$i == i, 6]
      shj <- df[df$i == j, 6]
      ### CROSS-AISLES NEIGHBORING EACH LOCATION
      ca.i <- data.frame(i = ca.m[ca.m$Shelf == shi, 1])
      ca.j <- data.frame(j = ca.m[ca.m$Shelf == shj, 1])
      ## DISTANCES FROM EACH LOCATION TO ITS NEIGHBORING CROSS-AISLES
      dist.df_total.i <- data.frame()
      dist.df_total.j <- data.frame()
      for (m in 1:nrow(ca.i)) {
        i.i <- i
        j.i <- ca.i[m, ]
        xi.i <- df[df$i == i.i, 2]
        yi.i <- df[df$i == i.i, 3]
        zi.i <- df[df$i == i.i, 4]
        xj.i <- df[df$i == j.i, 2]
        yj.i <- df[df$i == j.i, 3]
        zj.i <- df[df$i == j.i, 4]
        dij.i <- abs(yi.i - yj.i) * wc + abs(zi.i - zj.i) * wa
        dist.df.i <- data.frame(i = i.i, j = j.i, dist = dij.i)
        dist.df_total.i <- rbind.data.frame(dist.df_total.i, dist.df.i)
      }
      for (l in 1:nrow(ca.j)) {
        i.j <- j
        j.j <- ca.j[l, ]
        xi.j <- df[df$i == i.j, 2]
        yi.j <- df[df$i == i.j, 3]
        zi.j <- df[df$i == i.j, 4]
        xj.j <- df[df$i == j.j, 2]
        yj.j <- df[df$i == j.j, 3]
        zj.j <- df[df$i == j.j, 4]
        dij.j <- abs(yi.j - yj.j) * wc + abs(zi.j - zj.j) * wa
        dist.df.j <- data.frame(i = i.j, j = j.j, dist = dij.j)
        dist.df_total.j <- rbind.data.frame(dist.df_total.j, dist.df.j)
      }
      min.i <- dist.df_total.i %>% slice(which.min(dist))
      min.j <- dist.df_total.j %>% slice(which.min(dist))
      aisle <- data.frame(V1 = min.i$j, V2 = min.j$j)
      dist.aisle <- semi_join(d.m, aisle, by = c("V1", "V2"))
      # DISTANCE WITHOUT CROSS-AISLES, FOR COMPARISON
      if (zi == zj) {
        dij <- (abs(xi - xj) * (2 * dc + wa) + v)
      } else if (zi == 1 & zj == 2) {
        dij <- ((xj - xi) * (2 * dc + wa) + wa + v)
      } else {
        dij <- ((xj - xi) * (2 * dc + wa) - wa + v)
      }
      # take whichever route is shorter
      min.dij <- min(dij, (min.i$dist + min.j$dist + dist.aisle$Dist))
    }
    dist.df <- data.frame(i = i, j = j, dist = min.dij)
    dist.df_total <- rbind.data.frame(dist.df_total, dist.df)
  }
  return(dist.df_total)
}
aisle.dist <- CrsAisDst(48, 96, 108, 75, 18, 23, whse.data, cmbn.loc, cross.aisles,
dist.cross.aisle)
Output looks like this:
i j dist
7 18 672
7 19 780
7 33 204
....
341 342 108
(Note: this last sample I ran was just among cross-aisles, which is why the numbers look the same. I have tested it, though, and it will use the regular formula when that gives a shorter distance.)

Speeding up linear model fitting on complete pairwise observations in large sparse matrix in R

I have a numeric data.frame df with 134946 rows x 1938 columns.
99.82% of the data are NA.
For each pair of (distinct) columns "P1" and "P2", I need to find which rows have non-NA values for both and then do some operations on those rows (linear model).
I wrote a script that does this, but it seems quite slow.
This post seems to discuss a related task, but I can't immediately see if or how it can be adapted to my case.
Borrowing the example from that post:
set.seed(54321)
nr = 1000;
nc = 900;
dat = matrix(runif(nr*nc), nrow=nr)
rownames(dat) = paste(1:nr)
colnames(dat) = paste("time", 1:nc)
dat[sample(nr*nc, nr*nc*0.9)] = NA
df <- as.data.frame(dat)
df_ps <- names(df)
N_ps <- length(df_ps)
My script is:
tic = proc.time()
out <- do.call(rbind, sapply(1:(N_ps - 1), function(i) {
  if (i/10 == floor(i/10)) {
    # progress report every 10 columns
    cat("\ni = ", i, "\n")
    toc = proc.time()
    show(toc - tic)
  }
  do.call(rbind, sapply((i + 1):N_ps, function(j) {
    w <- which(complete.cases(df[, i], df[, j]))
    N <- length(w)
    if (N >= 5) {
      xw <- df[w, i]
      yw <- df[w, j]
      if ((diff(range(xw)) != 0) & (diff(range(yw)) != 0)) {
        s <- summary(lm(yw ~ xw))
        o <- c(i, j, N, s$adj.r.squared,
               s$coefficients[2], s$coefficients[4], s$coefficients[8],
               s$coefficients[1], s$coefficients[3], s$coefficients[7])
      } else {
        o <- c(i, j, N, rep(NA, 7))
      }
    } else {o <- NULL}
    return(o)
  }, simplify = F))
}, simplify = F))
toc = proc.time()
show(toc - tic)
This takes about 10 minutes on my machine.
You can imagine what happens when I need to handle a much larger (although more sparse) data matrix. I never managed to finish the calculation.
Question: do you think this could be done more efficiently?
The thing is, I don't know which operations take the most time (subsetting of df, in which case I would remove duplication of that? appending matrix data, in which case I would create a flat vector and then convert it to a matrix at the end? ...).
Thanks!
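For reference, one way to see where the time goes is R's built-in profiler; a minimal sketch on a reduced run of the loop above:
Rprof("pairwise_prof.out")
invisible(sapply(1:50, function(i)                 # a small subset of columns is enough
  sapply((i + 1):N_ps, function(j)
    length(which(complete.cases(df[, i], df[, j]))))))
Rprof(NULL)
head(summaryRprof("pairwise_prof.out")$by.self)    # functions ranked by self-time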
EDIT following up from minem's post
As shown by minem, the speed of this calculation depends strongly on how the linear regression parameters are calculated, so changing that part was the single most important thing to do.
My own further trials showed that: 1) it was essential to use sapply in combination with do.call(rbind, ...), rather than any flat vector, to store the data (I am still not sure why; I might make a separate post about this); 2) on the original matrix I am working on, which is much sparser and has a much larger nrow/ncol ratio than the one in this example, using the information about the x vector available at the start of each i iteration to reduce the y vector at the start of each j iteration increased the speed by several orders of magnitude, even compared with minem's original script, which was already much better than mine above.
I suppose the advantage comes from filtering out many rows a priori, thus avoiding costly xna & yna operations on very long vectors.
The modified script is the following:
set.seed(54321)
nr = 1000;
nc = 900;
dat = matrix(runif(nr*nc), nrow = nr)
rownames(dat) = paste(1:nr)
colnames(dat) = paste("time", 1:nc)
dat[sample(nr*nc, nr*nc*0.90)] = NA
df <- as.data.frame(dat)
df_ps <- names(df)
N_ps <- length(df_ps)
tic = proc.time()
naIds <- lapply(df, function(x) !is.na(x))
dl <- as.list(df)
rl <- sapply(1:(N_ps - 1), function(i) {
  if ((i - 1)/10 == floor((i - 1)/10)) {
    cat("\ni = ", i, "\n")
    toc = proc.time()
    show(toc - tic)
  }
  x <- dl[[i]]
  xna <- which(naIds[[i]])
  rl2 <- sapply((i + 1):N_ps, function(j) {
    # reduce y to the rows where x is non-NA before testing y
    y <- dl[[j]][xna]
    yna <- which(naIds[[j]][xna])
    w <- xna[yna]
    N <- length(w)
    if (N >= 5) {
      xw <- x[w]
      yw <- y[yna]
      if ((min(xw) != max(xw)) && (min(yw) != max(yw))) {
        # extracts from the lm/lm.fit/summary.lm functions
        X <- cbind(1L, xw)
        m <- .lm.fit(X, yw)
        # calculate adj.r.squared
        fitted <- yw - m$residuals
        rss <- sum(m$residuals^2)
        mss <- sum((fitted - mean(fitted))^2)
        n <- length(m$residuals)
        rdf <- n - m$rank
        r.squared <- mss/(mss + rss)
        adj.r.squared <- 1 - (1 - r.squared) * ((n - 1L)/rdf)
        # calculate se & pvals
        p1 <- 1L:m$rank
        Qr <- m$qr
        R <- chol2inv(Qr[p1, p1, drop = FALSE])
        resvar <- rss/rdf
        se <- sqrt(diag(R) * resvar)
        est <- m$coefficients[m$pivot[p1]]
        tval <- est/se
        pvals <- 2 * pt(abs(tval), rdf, lower.tail = FALSE)
        res <- c(m$coefficients[2], se[2], pvals[2],
                 m$coefficients[1], se[1], pvals[1])
        o <- c(i, j, N, adj.r.squared, res)
      } else {
        o <- c(i, j, N, rep(NA, 7))
      }
    } else {o <- NULL}
    return(o)
  }, simplify = F)
  do.call(rbind, rl2)
}, simplify = F)
out2 <- do.call(rbind, rl)
toc = proc.time()
show(toc - tic)
E.g. try with nr=100000; nc=100.
I should probably mention that I tried using indices, i.e.:
naIds <- lapply(df, function(x) which(!is.na(x)))
and then obviously generating w by intersection:
w <- intersect(xna,yna)
N <- length(w)
This however is slower than the above.
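(A quick timing sketch, with made-up masks of realistic length, shows why: intersect() goes through unique() and match(), while & is a single vectorized pass:)
xl <- runif(1e5) > 0.5; yl <- runif(1e5) > 0.5        # logical non-NA masks
xi <- which(xl); yi <- which(yl)                      # equivalent index vectors
system.time(for (k in 1:500) w1 <- which(xl & yl))    # vectorized logical AND
system.time(for (k in 1:500) w2 <- intersect(xi, yi)) # match()/unique() based, slower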
The largest bottleneck is the lm function, because it performs a lot of checks and additional calculations that you do not necessarily need. So I extracted only the needed parts.
I got this to run in +/- 18 seconds.
set.seed(54321)
nr = 1000;
nc = 900;
dat = matrix(runif(nr*nc), nrow = nr)
rownames(dat) = paste(1:nr)
colnames(dat) = paste("time", 1:nc)
dat[sample(nr*nc, nr*nc*0.9)] = NA
df <- as.data.frame(dat)
df_ps <- names(df)
N_ps <- length(df_ps)
tic = proc.time()
naIds <- lapply(df, function(x) !is.na(x))   # computed once, outside the loops
dl <- as.list(df)   # subsetting list elements is faster than data.frame columns
rl <- sapply(1:(N_ps - 1), function(i) {
  x <- dl[[i]]
  xna <- naIds[[i]]   # logical vector of non-NA elements
  rl2 <- sapply((i + 1):N_ps, function(j) {
    y <- dl[[j]]
    yna <- naIds[[j]]
    w <- xna & yna
    N <- sum(w)
    if (N >= 5) {
      xw <- x[w]
      yw <- y[w]
      if ((min(xw) != max(xw)) && (min(yw) != max(yw))) {   # faster than diff(range())
        # extracts from the lm/lm.fit/summary.lm functions
        X <- cbind(1L, xw)
        m <- .lm.fit(X, yw)
        # calculate adj.r.squared
        fitted <- yw - m$residuals
        rss <- sum(m$residuals^2)
        mss <- sum((fitted - mean(fitted))^2)
        n <- length(m$residuals)
        rdf <- n - m$rank
        r.squared <- mss/(mss + rss)
        adj.r.squared <- 1 - (1 - r.squared) * ((n - 1L)/rdf)
        # calculate se & pvals
        p1 <- 1L:m$rank
        Qr <- m$qr
        R <- chol2inv(Qr[p1, p1, drop = FALSE])
        resvar <- rss/rdf
        se <- sqrt(diag(R) * resvar)
        est <- m$coefficients[m$pivot[p1]]
        tval <- est/se
        pvals <- 2 * pt(abs(tval), rdf, lower.tail = FALSE)
        res <- c(m$coefficients[2], se[2], pvals[2],
                 m$coefficients[1], se[1], pvals[1])
        o <- c(i, j, N, adj.r.squared, res)
      } else {
        o <- c(i, j, N, rep(NA, 7))
      }
    } else {o <- NULL}
    return(o)
  }, simplify = F)
  do.call(rbind, rl2)
}, simplify = F)
out2 <- do.call(rbind, rl)
toc = proc.time()
show(toc - tic)
# user system elapsed
# 17.94 0.11 18.44

Script "tennis common opponent" in R

I'm following this article about the "common opponent" model in tennis; my goal is to script it in the most efficient way. Below you can find my code, but it is very slow. To calculate the result of one match my laptop takes about 120 seconds, and I have a dataset of 150k rows to process.
the article: https://core.ac.uk/download/pdf/82518495.pdf
I need your help to clean and improve my code. Any suggestion is appreciated.
tableA: https://1drv.ms/u/s!At-ZKKnf0H4jafxCX96NLxu00nc
tableB: https://1drv.ms/u/s!At-ZKKnf0H4javHgoPjzfCMXhg4
data_tennis_co: https://1drv.ms/u/s!At-ZKKnf0H4jaJyNkYrr8muff8k
data_tennis_co = read.table("test_co.csv", header=FALSE, sep=",", fill = TRUE)
A = read.table("tableA.csv", header=FALSE, sep=",", fill = TRUE)
B = read.table("tableB.csv", header=FALSE, sep=",", fill = TRUE)
# BASIC FUNCTIONS
library(foreach)     # provides foreach() / %dopar%
library(doParallel)  # parallel backend for %dopar%
registerDoParallel()
library(tictoc)      # tic()/toc() timers
G <- function(p) p^4*(15 - 4*p - ((10*p^2)/(1 - 2*p*(1 - p))))
d <- function(p, q) p*q*(1 - (p*(1 - q) + (1 - p)*q))^-1
TB <- function(p, q) {
  res <- foreach(i = 1:28, .combine = sum) %dopar% {
    A[i,1]*p^A[i,2]*(1-p)^A[i,3]*q^A[i,4]*(1-q)^A[i,5]*d(p,q)^A[i,6]
  }
  res
}
S <- function(p, q) {
  res <- foreach(i = 1:21, .combine = rbind) %dopar% {
    B[i,1]*G(p)^B[i,2]*(1-G(p))^B[i,3]*G(q)^B[i,4]*(1-G(q))^B[i,5]*
      (G(p)*G(q) + (G(p)*(1-G(q)) + (1-G(p))*G(q))*TB(p,q))^B[i,6]
  }
  sum(res)
}
M3 <- function(p, q) S(p,q)^2*(1 + 2*(1 - S(p,q)))
DELTA_AB <- function(spwAC, rpwAC, spwBC, rpwBC) (spwAC - (1-rpwAC)) - (spwBC - (1-rpwBC))
PR <- function(spwAC, rpwAC, spwBC, rpwBC) {
  (M3(0.6 + DELTA_AB(spwAC,rpwAC,spwBC,rpwBC), 1 - 0.6) +
   M3(0.6, 1 - (0.6 - DELTA_AB(spwAC,rpwAC,spwBC,rpwBC))))/2
}
# COMMON OPPONENTS
MAL <- function(id1, id2) {
  prova <- subset(data_tennis_co, V3 == id1 & V4 == id2)
  previous <- subset(data_tennis_co, V2 < prova$V2)
  s1 <- subset(previous, V3 == id1 | V4 == id1)
  s2 <- subset(previous, V3 == id2 | V4 == id2)
  s1$opp <- ifelse(s1$V3 == id1, s1$V4, s1$V3)
  s2$opp <- ifelse(s2$V3 == id2, s2$V4, s2$V3)
  inn <- intersect(s1$opp, s2$opp)
  common1 <- s1[s1$opp %in% inn, ]
  common2 <- s2[s2$opp %in% inn, ]
  # average if an id is not unique
  COM <- merge(common1, common2, by = c("opp"))
  COM$OMALLEY <- unlist(mapply(PR, COM$V5.x, COM$V6.x, COM$V7.y, COM$V8.y))
  COM$OMALLEY[is.nan(COM$OMALLEY)] <- 0.5
  return(tryCatch(sum(COM$OMALLEY)/nrow(COM), error = function(e) NaN))
}
tic()
RESA <- MAL(1, 2)
toc()
The main bottleneck in the code is the use of parallel loops in TB and S for operations that can be done faster using vectorized R functions.
G <- function(p) p^4*(15-4*p-((10*p^2)/(1-2*p*(1-p))))
d <- function(p, q) p*q*(1-(p*(1-q)+(1-p)*q))^-1
TB <- function(p, q) sum(A[,1] * p^A[,2] * (1-p)^A[,3] *
q^A[,4] * (1-q)^A[,5] * d(p,q)^A[,6])
S <- function(p, q) {
Gp <- G(p)
Gq <- G(q)
sum(B[,1] * Gp^B[,2] * (1-Gp)^B[,3] * Gq^B[,4] * (1-Gq)^B[,5] *
(Gp*Gq+(Gp*(1-Gq)+(1-Gp)*Gq)*TB(p,q))^B[,6])
}
M3 <- function(p, q) {
s <- S(p,q)
s^2*(1+2*(1-s))
}
DELTA_AB <- function(spwAC,rpwAC,spwBC,rpwBC) (spwAC-(1-rpwAC)) -
(spwBC-(1-rpwBC))
PR <- function(spwAC,rpwAC,spwBC,rpwBC) {
D <- DELTA_AB(spwAC, rpwAC, spwBC, rpwBC)
(M3(p = 0.6 + D, q = (1 - 0.6)) +
M3(p = 0.6, q = 1 - (0.6 - D))) / 2
}
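For a quick check of the vectorized versions (assuming tableA.csv / tableB.csv from the question are loaded into A and B; the probabilities below are purely illustrative):
A <- read.table("tableA.csv", header = FALSE, sep = ",", fill = TRUE)
B <- read.table("tableB.csv", header = FALSE, sep = ",", fill = TRUE)
PR(0.65, 0.35, 0.62, 0.38)              # P(player A beats B) from common-opponent stats
system.time(PR(0.65, 0.35, 0.62, 0.38)) # should be near-instant without %dopar%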
Solution here:
https://codereview.stackexchange.com/questions/194301/tenis-common-opponents-r
