abcdefghij.onestep <- function (x, params) {
Susceptible <- x[2]
Exposed <- x[3]
Infected_Multibacillary <- x[4]
Infected_Paucibacillary <- x[5]
Exposed_Detected_Diagnosis <- x[6]
Treated <- x[7]
Disability <- x[8]
Recovered <- x[9]
Relapse_Multibacillary <- x[10]
Relapse_Paucibacillary <- x[11]
N <- Susceptible + Exposed + Infected_Multibacillary + Infected_Paucibacillary + Exposed_Detected_Diagnosis + Treated + Disability + Recovered + Relapse_Multibacillary + Relapse_Paucibacillary
m12 <- params["m12"]
m25 <- params["m25"]
m23 <- params["m23"]
m24 <- params["m24"]
m35 <- params["m35"]
m45 <- params["m45"]
m37 <- params["m37"]
m56 <- params["m56"]
m67 <- params["m67"]
m68 <- params["m68"]
m89 <- params["m89"]
m810 <- params["m810"]
m96 <- params["m96"]
m97 <- params["m97"]
m106 <- params["m106"]
mu <- params["mu"]
rates <- c(
birth = mu*N,
susceptible_exposed = m12*Susceptible*Infected_Multibacillary - m25*Infected_Multibacillary*Exposed_Detected_Diagnosis - m23*Exposed*Infected_Multibacillary - m24*Exposed*Infected_Paucibacillary,
exposed_infected_multibacillary = m23*Exposed*Infected_Paucibacillary - m35*Infected_Multibacillary*Exposed_Detected_Diagnosis - m45*Exposed_Detected_Diagnosis*Infected_Paucibacillary - m37*Disability*Infected_Multibacillary,
exposed_infected_paucibacillary = m24*Exposed*Infected_Paucibacillary - m45*Infected_Paucibacillary*Exposed_Detected_Diagnosis,
infected_multibacillary_exposed_detected = m35*Infected_Multibacillary*Exposed_Detected_Diagnosis + m45*Infected_Paucibacillary*Exposed_Detected_Diagnosis - m56*Exposed_Detected_Diagnosis*Treated,
exposed_detected_treatment = m56*Exposed_Detected_Diagnosis*Treated - m67*Treated*Disability - m68*Treated*Recovered,
infected_multibacillary_disability = m37*Disability*Infected_Multibacillary + m67*Treated*Disability,
treated_recovered = m68*Treated*Recovered - m89*Recovered*Infected_Multibacillary - m810*Recovered*Infected_Paucibacillary,
relapse_multibacillary_treatment = -m96*Relapse_Multibacillary*Treated - m97*Relapse_Multibacillary*Disability,
relapse_paucibacillary_treatment = -m106*Relapse_Paucibacillary*Treated,
susceptible_death = mu*Susceptible,
exposed_death = mu*Exposed,
infected_multibacillary_death = mu*Infected_Multibacillary,
infected_paucibacillary_death = mu*Infected_Paucibacillary,
exposed_detected_death = mu*Exposed_Detected_Diagnosis,
treatment_death = mu*Treated,
disability_death = mu*Disability,
recovered_death = mu*Recovered,
relapse_multibacillary_death = mu*Relapse_Multibacillary,
relapse_paucibacillary_death = mu*Relapse_Paucibacillary
)
transitions <- list(
birth=c(1,0,0,0,0,0,0,0,0,0),
susceptible_exposed=c(-1,1,0,0,0,0,0,0,0,0),
exposed_infected_multibacillary=c(0,-1,1,0,0,0,0,0,0,0),
exposed_infected_paucibacillary=c(0,-1,0,1,0,0,0,0,0,0),
infected_multibacillary_exposed_detected=c(0,0,-1,0,1,0,0,0,0,0),
exposed_detected_treatment=c(0,0,0,0,-1,1,0,0,0,0),
infected_multibacillary_disability=c(0,0,-1,0,0,0,1,0,0,0),
treated_recovered=c(0,0,0,0,0,-1,0,1,0,0),
relapse_multibacillary_treatment=c(0,0,0,0,0,1,0,0,-1,0),
relapse_paucibacillary_treatment=c(0,0,0,0,0,1,0,0,0,-1),
susceptible_death=c(-1,0,0,0,0,0,0,0,0,0),
exposed_death= c(0,-1,0,0,0,0,0,0,0,0),
infected_multibacillary_death= c(0,0,-1,0,0,0,0,0,0,0),
infected_paucibacillary_death= c(0,0,0,-1,0,0,0,0,0,0),
exposed_detected_death= c(0,0,0,0,-1,0,0,0,0,0),
treatment_death= c(0,0,0,0,0,-1,0,0,0,0),
disability_death= c(0,0,0,0,0,0,-1,0,0,0),
recovered_death= c(0,0,0,0,0,0,0,-1,0,0),
relapse_multibacillary_death= c(0,0,0,0,0,0,0,0,-1,0),
relapse_paucibacillary_death= c(0,0,0,0,0,0,0,0,0,-1)
)
total.rate <- sum(rates)
if (total.rate==0)
tau <- Inf
else
tau <- rexp(n=1,rate=total.rate)
event <- sample.int(n=length(rates),size=1,prob=rates/total.rate) # n must cover all 20 rates, not 6
x+c(tau,transitions[[event]])
}
abcdefghij.simul <- function (x, params, maxstep = 10000) {
output <- array(dim=c(maxstep+1,4))
colnames(output) <- names(x)
output[1,] <-x
k <- 1
while ((k <= maxstep) && (x["Exposed"] > 0)) {
k <- k+1
output[k,] <- x <- abcdefghij.onestep(x,params)
}
as.data.frame(output[1:k,])
}
And in R this happens:
> set.seed(56856583)
> nsims <- 1
> xstart <- c(time=1,Susceptible=100000,Exposed=1,Infected_Multibacillary=1,Infected_Paucibacillary=1,Exposed_Detected_Diagnosis=1,Treated=1,Disability=1,Recovered=1,Relapse_Multibacillary=1,Relapse_Paucibacillary=1)
> library(plyr)
> simdat <- rdply(nsims, abcdefghij.simul(xstart,params))
Error in dimnames(x) <- dn :
length of 'dimnames' [2] not equal to array extent
Called from: `colnames<-`(`*tmp*`, value = names(x))
Browse[1]>
Change this line:
output <- array(dim=c(maxstep+1,4))
To this instead:
output <- array(dim=c(maxstep+1,11))
Your xstart variable has 11 elements. If you want them all in one row, you have to create something that is 11 columns wide, not 4. Perhaps you started with just 4 values.
Furthermore, you don't seem to define params anywhere; the code won't run until you do.
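For example, a minimal params stub naming every rate the function reads would let the code run (the numeric values below are placeholders, not calibrated estimates):
params <- c(m12=1e-6, m25=1e-6, m23=1e-6, m24=1e-6, m35=1e-6, m45=1e-6,
            m37=1e-6, m56=1e-6, m67=1e-6, m68=1e-6, m89=1e-6, m810=1e-6,
            m96=1e-6, m97=1e-6, m106=1e-6, mu=1/70) # placeholder values only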
I cannot figure out what I'm doing wrong when appending the results of a loop (as a tibble) to a list.
Below is my code. I believe it has to do with the d in distance and the values not being positive?
Error in datalist[[d]] <- dat :
attempt to select less than one element in integerOneIndex
library(tidyverse)
x <- c(10,5)
y <- c(1,3)
distance <- c(1,2,3) # distance away from the road
old<-data.frame(x,y)
datalist = list()
datalist2 = list()
for (d in 1:length(distance)) {
# Given a vector (defined by 2 points) and the distance,
# calculate a new vector that is distance away from the original
segment.shift <- function(x, y, d){
# calculate vector
v <- c(x[2] - x[1],y[2] - y[1])
# normalize vector
v <- v/sqrt((v[1]**2 + v[2]**2))
# perpendicular unit vector
vnp <- c( -v[2], v[1] )
return(list(x = c( x[1] + d*vnp[1], x[2] + d*vnp[1]),
y = c( y[1] + d*vnp[2], y[2] + d*vnp[2])))
}
# allocate memory for the bike path
xn <- numeric( (length(x) - 1) * 2 )
yn <- numeric( (length(y) - 1) * 2 )
for ( i in 1:(length(x) - 1) ) {
xs <- c(x[i], x[i+1])
ys <- c(y[i], y[i+1])
new.s <- segment.shift( xs, ys, d )
xn[(i-1)*2+1] <- new.s$x[1] ; xn[(i-1)*2+2] <- new.s$x[2]
yn[(i-1)*2+1] <- new.s$y[1] ; yn[(i-1)*2+2] <- new.s$y[2]
}
dat1 <- as.data.frame(xn) %>% mutate(id=row_number()) %>% mutate(Dist_Col=paste(d,"left",sep="_"))
#datalist[[d]] <- dat1 # add it to your list
dat2 <- as.data.frame(yn) %>% mutate(id=row_number()) %>% mutate(Dist_Col=paste(d,"left",sep="_"))
#datalist2[[d]] <- dat2 # add it to your list
###Now do right side
# allocate memory for the bike path
xn <- numeric( (length(x) - 1) * 2 )
yn <- numeric( (length(y) - 1) * 2 )
for ( i in 1:(length(x) - 1) ) {
xs <- c(x[i], x[i+1])
ys <- c(y[i], y[i+1])
new.s <- segment.shift( xs, ys, -d )
xn[(i-1)*2+1] <- new.s$x[1] ; xn[(i-1)*2+2] <- new.s$x[2]
yn[(i-1)*2+1] <- new.s$y[1] ; yn[(i-1)*2+2] <- new.s$y[2]
}
dat3 <- as.data.frame(xn) %>% mutate(id=row_number()) %>% mutate(Dist_Col=paste(d,"right",sep="_"))
datcomb <- full_join(dat1,dat3)
datalist[[d]] <- datcomb # add it to your list
dat4 <- as.data.frame(yn) %>% mutate(id=row_number()) %>% mutate(Dist_Col=paste(d,"right",sep="_"))
dat2comb <- full_join(dat2,dat4)
datalist2[[d]] <- dat2comb # add it to your list
}
big_data = do.call(rbind, datalist)
big_data2 = do.call(rbind, datalist2)
comb_data<- full_join(big_data,big_data2)
ggplot()+geom_line(data=old,aes(x,y),color='black')+geom_line(data=comb_data,aes(xn,yn,group=Dist_Col),color='red')
See the updated code above, which plots parallel lines on both sides of the original line.
I finally resolved the issue with
for (d in 1:length(distance))
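Why that fix works, in a minimal sketch (the non-integer distances here are hypothetical): a list can only be indexed by positive whole numbers, so looping over the values of distance breaks as soon as they are not 1, 2, 3, ..., while 1:length(distance) (or the more idiomatic seq_along(distance)) always yields valid positions:
distance <- c(0.5, 1.5) # hypothetical non-integer distances
datalist <- list()
for (d in distance) datalist[[d]] <- d # error: attempt to select less than one element
for (d in seq_along(distance)) datalist[[d]] <- distance[d] # works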
I am trying to calculate the distance between warehouse locations in RStudio, using an algorithm from an academic paper. The formula accounts for the dimensions of location width, depth, and side of the aisle. The added complexity comes in when calculating the shortest distance with multiple cross-aisles. This is all based on this paper.
This is from a bird's eye view:
I have static values of α = 48, β = 96, γ = 108, Ω = 75, S = 22.
I then have a data frame with i as the key for location number, X for aisle number, Y for section number, Z for side number, and Cross Aisle as a boolean (0 = not a cross-aisle, 1 = is a cross-aisle). Here is a sample:
i X Y Z Cross Aisle
1 1 1 1 0
2 1 2 1 0
....
357 12 20 2 0
These are the formulas between locations i and j if the warehouse had no cross-aisles and was one rectangular grid:
Items in the same aisle (xi = xj):
dij = |yi - yj| * β + |zi - zj| * γ
If items are in different aisles, there are three different scenarios:
dij = |xi - xj| * (2α + γ) + v if zi = zj
dij = (xj - xi) * (2α + γ) + γ + v if zi = 1, zj = 2
dij = (xj - xi) * (2α + γ) - γ + v if zi = 2, zj = 1
where v is the "vertical" distance (bird's eye, up-down the aisle):
v = min(β * (2S - yi - yj), β * (yi + yj)) + 2Ω
(*Note: the academic paper has a typo in the v formula. It states 2 - yi - yj in the first term, but I found another, original source that had it correctly as 2S - yi - yj.)
This piece of the formula is not the tricky part; it is a fairly simple if/then function to compute in R (see the sketch below). However, this next bit with the cross-aisles stumps me.
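For reference, a direct transcription of the grid-only formulas above into one function (a sketch; the function and argument names are mine, with the static values as defaults):
grid_dist <- function(xi, yi, zi, xj, yj, zj,
                      alpha = 48, beta = 96, gamma = 108, omega = 75, S = 22) {
  v <- min(beta * (2 * S - yi - yj), beta * (yi + yj)) + 2 * omega # "vertical" distance
  if (xi == xj) {
    abs(yi - yj) * beta + abs(zi - zj) * gamma # same aisle
  } else if (zi == zj) {
    abs(xi - xj) * (2 * alpha + gamma) + v
  } else if (zi == 1) {
    (xj - xi) * (2 * alpha + gamma) + gamma + v # zi = 1, zj = 2
  } else {
    (xj - xi) * (2 * alpha + gamma) - gamma + v # zi = 2, zj = 1
  }
}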
This is from the academic paper:
The authors state essentially: There are two locations p1 and p2. There are two neighboring cross-aisles, a1 and a2. One is above p1 and the other is below p1. Also, cross-aisles b1 and b2 are found, which are neighboring p2 and lead left. The distance between p1 and p2 are as follows:
d(p1,p2) = min{ d(p1,ai) + d(ai,bj) + d(bj,p2) : i,j ∈ {1,2} }
I am unsure how to apply this algorithm to my data set and construct the necessary loops and matrices to find the distances between my warehouse locations. I could really use some help making sense of it.
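One way to read that min is as a search over the four pairings of p1's two cross-aisles with p2's two cross-aisles; a minimal sketch (function and argument names are mine, not the paper's):
# d_p1_a: distances from p1 to its neighboring cross-aisles a1, a2
# d_ab:   2x2 matrix with d_ab[i,j] = d(ai, bj)
# d_b_p2: distances from cross-aisles b1, b2 to p2
cross_aisle_dist <- function(d_p1_a, d_ab, d_b_p2) {
  min(outer(d_p1_a, d_b_p2, `+`) + d_ab) # min over all four (i, j) pairings
}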
Here is my actual data set.
Here is an image of my warehouse to give you a sense of the layout. The "X" locations are cross-aisles.
I was able to get a workable loop without the cross-aisles:
require(dplyr)
require(openxlsx)
whse.data <- read.xlsx("data set.xlsx", sheet = 1) %>%
as.data.frame()
### CREATE COMBINATION OF LOCATIONS
require(gtools) # combinations() comes from gtools, not tools
cmbn.loc <- combinations(n = max(whse.data$i), r = 2, v = whse.data$i,
repeats.allowed = FALSE) %>%
as.data.frame()
### CALCULATE DISTANCE BETWEEN LOCATIONS
LocDst <- function(dc, wc, wa, tr, s, df, comb){
# Creates a distance between various locations
#
# Args:
# dc: Depth of cell (alpha)
# wc: Width of cell (beta)
# wa: Width of aisle (y)
# tr: turning radius (omega)
# s: number of sections (S)
# df: Data Frame with location i, x, y, z, Cross.Aisle data
# comb: Combination of locations to compare
#
# Returns:
# Data frame with distances between each location combination
#
dist.df_total <- data.frame()
for (n in 1:nrow(comb)){
i <- comb[n,1]
j <- comb[n,2]
xi <- df[df$i == i,2]
yi <- df[df$i == i,3]
zi <- df[df$i == i,4]
xj <- df[df$i == j,2]
yj <- df[df$i == j,3]
zj <- df[df$i == j,4]
v <- min(wc * (2 * s - yi - yj), wc * (yi + yj)) + 2 * tr
if(xi == xj){
dij <- abs(yi - yj) * wc + abs(zi - zj) * wa
} else if (zi == zj){
dij <- (abs(xi - xj) * (2 * dc + wa) + v)
} else if (zi == 1 & zj == 2){
dij <- ((xj - xi) * (2 * dc + wa) + wa + v)
} else {
dij <- ((xj - xi) * (2 * dc + wa) - wa + v)
}
dist.df <- data.frame(`i` = i, `j` = j, dist = dij)
dist.df_total <- rbind.data.frame(dist.df_total, dist.df)
}
return(dist.df_total)
}
dist <- LocDst(48, 96, 108, 75, 18, whse.data, cmbn.loc)
I need a workable for loop or something to run Algorithm 1 above, please.
I was able to get something to work. If anyone has anything more straightforward, I am all ears. Maybe this will be helpful to someone out there!
I had to use Excel to calculate the distance between the cross-aisles. There's probably a way to code it, but it wasn't value-added for me at this time. Here's a sample of that data:
V1 V2 Dist
7 18 672
7 19 780
7 33 204
....
341 342 108
where V1 represents the first location number and V2 the second for all cross-aisle combinations.
Everything else should be computed within the code (beyond what I put above):
require(dplyr)
require(openxlsx)
require(gtools) # combinations() comes from gtools, not tools
whse.data <- read.xlsx("data set.xlsx", sheet = 1) %>%
as.data.frame()
### CREATE COMBINATION OF LOCATIONS
cmbn.loc <- combinations(n = max(whse.data$i), r = 2, v = whse.data$i,
repeats.allowed = FALSE) %>%
as.data.frame()
# CROSS-AISLES IN EACH SHELF
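# (assumption: cross.aisles is built elsewhere, e.g. the rows of whse.data with Cross.Aisle == 1, including their Shelf column)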
ca.shelf <- cross.aisles %>%
group_by(Shelf) %>%
summarise(No.Cross.Aisles = sum(Cross.Aisle)) %>%
as.data.frame()
# DISTANCE BETWEEN CROSS AISLES
cmbn.cross.aisle <- combinations(n = nrow(cross.aisles),
r = 2,
v = cross.aisles$i,
repeats.allowed = FALSE) %>%
as.data.frame()
dist.cross.aisle <- read.xlsx("Combination of Cross-Aisles v3.xlsx", sheet = 1) %>%
as.data.frame()
# CROSS AISLE FUNCTION
CrsAisDst <- function(dc, wc, wa, tr, s, no.sh, df, comb, ca.m, d.m){
# Creates a distance between various locations
#
# Args:
# dc: Depth of cell (alpha)
# wc: Width of cell (beta)
# wa: Width of aisle (y)
# tr: turning radius (omega)
# s: number of sections (S)
# no.sh: number of shelves
# df: Data Frame with location i, x, y, z, Cross.Aisle data
# comb: Combination of locations to compare
# ca.m: Cross-aisles matrix
# d.m: Distances between cross-aisles
#
# Returns:
# Data frame with distances between each location combination
#
dist.df_total <- data.frame()
for (n in 1:nrow(comb)){
i <- comb[n,1]
j <- comb[n,2]
xi <- df[df$i == i,2]
yi <- df[df$i == i,3]
zi <- df[df$i == i,4]
xj <- df[df$i == j,2]
yj <- df[df$i == j,3]
zj <- df[df$i == j,4]
v <- min(wc * (2 * s - yi - yj), wc * (yi + yj)) + 2 * tr
if(xi == xj){
min.dij <- abs(yi - yj) * wc + abs(zi - zj) * wa
} else {
shi <- df[df$i == i,6]
shj <- df[df$i == j,6]
### CROSS-AISLES
ca.i <- data.frame(`i` = ca.m[ca.m$Shelf == shi,1])
ca.j <- data.frame(`j` = ca.m[ca.m$Shelf == shj,1])
## JOIN DISTANCES
dist.df_total.i <- data.frame()
dist.df_total.j <- data.frame()
#
for (m in 1:nrow(ca.i)){
i.i <- i
j.i <- ca.i[m,]
xi.i <- df[df$i == i.i,2]
yi.i <- df[df$i == i.i,3]
zi.i <- df[df$i == i.i,4]
xj.i <- df[df$i == j.i,2]
yj.i <- df[df$i == j.i,3]
zj.i <- df[df$i == j.i,4]
dij.i <- abs(yi.i - yj.i) * wc + abs(zi.i - zj.i) * wa
dist.df.i <- data.frame(`i` = i.i, `j` = j.i, dist = dij.i)
dist.df_total.i <- rbind.data.frame(dist.df_total.i, dist.df.i)
}
for (l in 1:nrow(ca.j)){
i.j <- j
j.j <- ca.j[l,]
xi.j <- df[df$i == i.j,2]
yi.j <- df[df$i == i.j,3]
zi.j <- df[df$i == i.j,4]
xj.j <- df[df$i == j.j,2]
yj.j <- df[df$i == j.j,3]
zj.j <- df[df$i == j.j,4]
dij.j <- abs(yi.j - yj.j) * wc + abs(zi.j - zj.j) * wa
dist.df.j <- data.frame(`i` = i.j, `j` = j.j, dist = dij.j)
dist.df_total.j <- rbind.data.frame(dist.df_total.j, dist.df.j)
}
min.i <- dist.df_total.i %>% slice(which.min(dist))
min.j <- dist.df_total.j %>% slice(which.min(dist))
aisle <- data.frame(V1=min.i$j,V2=min.j$j)
dist.aisle <- semi_join(d.m, aisle, by = c("V1", "V2"))
# CALCULATING DISTANCE WITHOUT CROSS-AISLES
if (zi == zj){
dij <- (abs(xi - xj) * (2 * dc + wa) + v)
} else if (zi == 1 & zj == 2){
dij <- ((xj - xi) * (2 * dc + wa) + wa + v)
} else {
dij <- ((xj - xi) * (2 * dc + wa) - wa + v)
}
min.dij <- min(dij, (min.i$dist + min.j$dist + dist.aisle$Dist))
}
dist.df <- data.frame(`i` = i, `j` = j, dist = min.dij)
dist.df_total <- rbind.data.frame(dist.df_total, dist.df)
}
return(dist.df_total)
}
aisle.dist <- CrsAisDst(48, 96, 108, 75, 18, 23, whse.data, cmbn.loc, cross.aisles,
dist.cross.aisle)
Output looks like this:
i j dist
7 18 672
7 19 780
7 33 204
....
341 342 108
(Note: this last sample I ran was just among cross-aisles, which is why the numbers look the same. I have tested it, though, and it will use the regular formula when that gives a shorter distance.)
I have a numeric data.frame df with 134946 rows x 1938 columns.
99.82% of the data are NA.
For each pair of (distinct) columns "P1" and "P2", I need to find which rows have non-NA values for both and then do some operations on those rows (linear model).
I wrote a script that does this, but it seems quite slow.
This post seems to discuss a related task, but I can't immediately see if or how it can be adapted to my case.
Borrowing the example from that post:
set.seed(54321)
nr = 1000;
nc = 900;
dat = matrix(runif(nr*nc), nrow=nr)
rownames(dat) = paste(1:nr)
colnames(dat) = paste("time", 1:nc)
dat[sample(nr*nc, nr*nc*0.9)] = NA
df <- as.data.frame(dat)
df_ps <- names(df)
N_ps <- length(df_ps)
My script is:
tic = proc.time()
out <- do.call(rbind,sapply(1:(N_ps-1), function(i) {
if (i/10 == floor(i/10)) {
cat("\ni = ",i,"\n")
toc = proc.time();
show(toc-tic);
}
do.call(rbind,sapply((i+1):N_ps, function(j) {
w <- which(complete.cases(df[,i],df[,j]))
N <- length(w)
if (N >= 5) {
xw <- df[w,i]
yw <- df[w,j]
if ((diff(range(xw)) != 0) & (diff(range(yw)) != 0)) {
s <- summary(lm(yw~xw))
o <- c(i,j,N,s$adj.r.squared,s$coefficients[2],s$coefficients[4],s$coefficients[8],s$coefficients[1],s$coefficients[3],s$coefficients[7])} else {
o <- c(i,j,N,rep(NA,7))
}
} else {o <- NULL}
return(o)
},simplify=F))
}
,simplify=F))
toc = proc.time();
show(toc-tic);
This takes about 10 minutes on my machine.
You can imagine what happens when I need to handle a much larger (although more sparse) data matrix. I never managed to finish the calculation.
Question: do you think this could be done more efficiently?
The thing is, I don't know which operations take more time (subsetting df, in which case I would remove the duplicated subsetting? appending matrix data, in which case I would fill a flat vector and convert it to a matrix at the end? ...).
Thanks!
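One way to find out is base R's sampling profiler; a minimal sketch on the example data above (the repeat count is arbitrary, just enough for the profiler to register):
Rprof("pairs.prof")
for (k in 1:200) {
  w <- which(complete.cases(df[, 1], df[, 2]))   # candidate cost 1: row matching
  xw <- df[w, 1]; yw <- df[w, 2]                 # candidate cost 2: subsetting
  if (length(w) >= 5) s <- summary(lm(yw ~ xw))  # candidate cost 3: model fitting
}
Rprof(NULL)
head(summaryRprof("pairs.prof")$by.self)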
EDIT following up from minem's post
As shown by minem, the speed of this calculation strongly depended on the way linear regression parameters were calculated. Therefore changing that part was the single most important thing to do.
My own further trials showed that: 1) it was essential to use sapply in combination with do.call(rbind, ...), rather than any flat vector, to store the data (I am still not sure why; I might make a separate post about this); 2) on the original matrix I am working on, which is much sparser and has a much larger nrows/ncolumns ratio than the one in this example, using the information on the x vector available at the start of each i iteration to reduce the y vector at the start of each j iteration increased the speed by several orders of magnitude, even compared with minem's original script, which was already much better than mine above.
I suppose the advantage comes from filtering out many rows a priori, thus avoiding costly xna & yna operations on very long vectors.
The modified script is the following:
set.seed(54321)
nr = 1000;
nc = 900;
dat = matrix(runif(nr*nc), nrow = nr)
rownames(dat) = paste(1:nr)
colnames(dat) = paste("time", 1:nc)
dat[sample(nr*nc, nr*nc*0.90)] = NA
df <- as.data.frame(dat)
df_ps <- names(df)
N_ps <- length(df_ps)
tic = proc.time()
naIds <- lapply(df, function(x) !is.na(x))
dl <- as.list(df)
rl <- sapply(1:(N_ps - 1), function(i) {
if ((i-1)/10 == floor((i-1)/10)) {
cat("\ni = ",i,"\n")
toc = proc.time();
show(toc-tic);
}
x <- dl[[i]]
xna <- which(naIds[[i]])
rl2 <- sapply((i + 1):N_ps, function(j) {
y <- dl[[j]][xna]
yna <- which(naIds[[j]][xna])
w <- xna[yna]
N <- length(w)
if (N >= 5) {
xw <- x[w]
yw <- y[yna]
if ((min(xw) != max(xw)) && (min(yw) != max(yw))) {
# extracts from lm/lm.fit/summary.lm functions
X <- cbind(1L, xw)
m <- .lm.fit(X, yw)
# calculate adj.r.squared
fitted <- yw - m$residuals
rss <- sum(m$residuals^2)
mss <- sum((fitted - mean(fitted))^2)
n <- length(m$residuals)
rdf <- n - m$rank
# rdf <- df.residual
r.squared <- mss/(mss + rss)
adj.r.squared <- 1 - (1 - r.squared) * ((n - 1L)/rdf)
# calculate se & pvals
p1 <- 1L:m$rank
Qr <- m$qr
R <- chol2inv(Qr[p1, p1, drop = FALSE])
resvar <- rss/rdf
se <- sqrt(diag(R) * resvar)
est <- m$coefficients[m$pivot[p1]]
tval <- est/se
pvals <- 2 * pt(abs(tval), rdf, lower.tail = FALSE)
res <- c(m$coefficients[2], se[2], pvals[2],
m$coefficients[1], se[1], pvals[1])
o <- c(i, j, N, adj.r.squared, res)
} else {
o <- c(i,j,N,rep(NA,7))
}
} else {o <- NULL}
return(o)
}, simplify = F)
do.call(rbind, rl2)
}, simplify = F)
out2 <- do.call(rbind, rl)
toc = proc.time();
show(toc - tic)
E.g. try with nr=100000; nc=100.
I should probably mention that I tried using indices, i.e.:
naIds <- lapply(df, function(x) which(!is.na(x)))
and then obviously generating w by intersection:
w <- intersect(xna,yna)
N <- length(w)
This, however, is slower than the above.
The largest bottleneck is the lm function, because there are a lot of checks and additional calculations in it that you do not necessarily need. So I extracted only the needed parts.
I got this to run in +/- 18 seconds.
set.seed(54321)
nr = 1000;
nc = 900;
dat = matrix(runif(nr*nc), nrow = nr)
rownames(dat) = paste(1:nr)
colnames(dat) = paste("time", 1:nc)
dat[sample(nr*nc, nr*nc*0.9)] = NA
df <- as.data.frame(dat)
df_ps <- names(df)
N_ps <- length(df_ps)
tic = proc.time()
naIds <- lapply(df, function(x) !is.na(x)) # outside loop
dl <- as.list(df) # sub-setting list elements is faster that columns
rl <- sapply(1:(N_ps - 1), function(i) {
x <- dl[[i]]
xna <- naIds[[i]] # logical vector marking the non-NA elements of column i
rl2 <- sapply((i + 1):N_ps, function(j) {
y <- dl[[j]]
yna <- naIds[[j]]
w <- xna & yna
N <- sum(w)
if (N >= 5) {
xw <- x[w]
yw <- y[w]
if ((min(xw) != max(xw)) && (min(yw) != max(yw))) { # faster
# extracts from lm/lm.fit/summary.lm functions
X <- cbind(1L, xw)
m <- .lm.fit(X, yw)
# calculate adj.r.squared
fitted <- yw - m$residuals
rss <- sum(m$residuals^2)
mss <- sum((fitted - mean(fitted))^2)
n <- length(m$residuals)
rdf <- n - m$rank
# rdf <- df.residual
r.squared <- mss/(mss + rss)
adj.r.squared <- 1 - (1 - r.squared) * ((n - 1L)/rdf)
# calculate se & pvals
p1 <- 1L:m$rank
Qr <- m$qr
R <- chol2inv(Qr[p1, p1, drop = FALSE])
resvar <- rss/rdf
se <- sqrt(diag(R) * resvar)
est <- m$coefficients[m$pivot[p1]]
tval <- est/se
pvals <- 2 * pt(abs(tval), rdf, lower.tail = FALSE)
res <- c(m$coefficients[2], se[2], pvals[2],
m$coefficients[1], se[1], pvals[1])
o <- c(i, j, N, adj.r.squared, res)
} else {
o <- c(i,j,N,rep(NA,7)) # pad to match the length of the success branch
}
} else {o <- NULL}
return(o)
}, simplify = F)
do.call(rbind, rl2)
}, simplify = F)
out2 <- do.call(rbind, rl)
toc = proc.time();
show(toc - tic);
# user system elapsed
# 17.94 0.11 18.44
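To check the extracted computations against stock lm(), a quick sketch (assumes df and out2 from the run above; picks the first stored pair):
i <- out2[1, 1]; j <- out2[1, 2]              # first stored column pair
w <- complete.cases(df[, i], df[, j])
s <- summary(lm(df[w, j] ~ df[w, i]))
all.equal(unname(out2[1, 4]), s$adj.r.squared) # TRUE if the adjusted R^2 matches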
I'm following this article about the "common opponent" model in tennis; my goal is to script it in the most efficient way. Below you can find my code, but it is very slow: calculating the result of one match takes my laptop about 120 seconds, and I have a dataset of 150k rows to process.
the article: https://core.ac.uk/download/pdf/82518495.pdf
I need your help to clean and improve my code. Any suggestion is appreciated.
tableA: https://1drv.ms/u/s!At-ZKKnf0H4jafxCX96NLxu00nc
tableB: https://1drv.ms/u/s!At-ZKKnf0H4javHgoPjzfCMXhg4
data_tennis_co: https://1drv.ms/u/s!At-ZKKnf0H4jaJyNkYrr8muff8k
library(foreach)
library(doParallel) # %dopar% needs a registered backend, e.g. registerDoParallel(2)
library(tictoc)     # tic()/toc()
data_tennis_co = read.table("test_co.csv", header=FALSE, sep=",", fill = TRUE)
A = read.table("tableA.csv", header=FALSE, sep=",", fill = TRUE)
B = read.table("tableB.csv", header=FALSE, sep=",", fill = TRUE)
#BASIC FUNCTIONS
G<-function(p){res<- p^4*(15-4*p-((10*p^2)/(1-2*p*(1-p))))}
d<- function(p,q) {res<- p*q*(1-(p*(1-q)+(1-p)*q))^-1}
TB <- function(p,q) {res <- foreach(i = seq_along(1:28), .combine = sum) %dopar% {tb<-A[i,1]*p^A[i,2]*(1-p)^A[i,3]*q^A[i,4]*(1-q)^A[i,5]*d(p,q)^A[i,6]}}
S <- function(p,q) {res <- foreach(i = seq_along(1:21), .combine = rbind) %dopar% {s<-B[i,1]*G(p)^B[i,2]*(1-G(p))^B[i,3]*G(q)^B[i,4]*(1-G(q))^B[i,5]*(G(p)*G(q)+(G(p)*(1-G(q))+(1-G(p))*G(q))*TB(p,q))^B[i,6]}; sum(res)}
M3 <- function(p,q) {res <- S(p,q)^2*(1+2*(1-S(p,q)))}
DELTA_AB <- function(spwAC,rpwAC,spwBC,rpwBC) {res <- (spwAC-(1-rpwAC))-(spwBC-(1-rpwBC))}
PR<- function(spwAC,rpwAC,spwBC,rpwBC) {res <- (M3(0.6+DELTA_AB(spwAC,rpwAC,spwBC,rpwBC),(1-0.6))+M3(0.6,(1-(0.6-DELTA_AB(spwAC,rpwAC,spwBC,rpwBC)))))/2}
#COMMON OPPONENTS
MAL<-function(id1,id2){
prova <- subset(data_tennis_co, V3 == id1 & V4 == id2)
previous <- subset(data_tennis_co, V2 < prova$V2)
s1 <- subset(previous, V3 == id1 | V4 == id1)
s2 <- subset(previous, V3 == id2 | V4 == id2)
s1$opp <- ifelse(s1$V3==id1, s1$V4, s1$V3)
s2$opp <- ifelse(s2$V3==id2, s2$V4, s2$V3)
inn<- intersect(s1$opp,s2$opp)
common1<-s1[s1$opp %in% inn,]
common2<-s2[s2$opp %in% inn,]
# take the average if the opponent id is not unique
COM <- merge(common1, common2,by=c("opp"))
COM$OMALLEY <- unlist(mapply(PR, COM$V5.x, COM$V6.x, COM$V7.y, COM$V8.y))
COM$OMALLEY[is.nan(COM$OMALLEY)] <- 0.5
return(tryCatch(sum(COM$OMALLEY)/nrow(COM), error=function(e) NaN))
}
tic()
RESA<-MAL(1,2)
toc()
The main bottleneck in the code is the use of parallel loops in TB and S for operations that can be done faster with vectorized R functions.
G <- function(p) p^4*(15-4*p-((10*p^2)/(1-2*p*(1-p))))
d <- function(p, q) p*q*(1-(p*(1-q)+(1-p)*q))^-1
TB <- function(p, q) sum(A[,1] * p^A[,2] * (1-p)^A[,3] *
q^A[,4] * (1-q)^A[,5] * d(p,q)^A[,6])
S <- function(p, q) {
Gp <- G(p)
Gq <- G(q)
sum(B[,1] * Gp^B[,2] * (1-Gp)^B[,3] * Gq^B[,4] * (1-Gq)^B[,5] *
(Gp*Gq+(Gp*(1-Gq)+(1-Gp)*Gq)*TB(p,q))^B[,6])
}
M3 <- function(p, q) {
s <- S(p,q)
s^2*(1+2*(1-s))
}
DELTA_AB <- function(spwAC,rpwAC,spwBC,rpwBC) (spwAC-(1-rpwAC)) -
(spwBC-(1-rpwBC))
PR <- function(spwAC,rpwAC,spwBC,rpwBC) {
D <- DELTA_AB(spwAC, rpwAC, spwBC, rpwBC)
(M3(p = 0.6 + D, q = (1 - 0.6)) +
M3(p = 0.6, q = 1 - (0.6 - D))) / 2
}
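As a rough timing check of the vectorized pipeline, a sketch (assumes A and B are loaded; the probabilities are arbitrary):
system.time(replicate(100, PR(0.70, 0.40, 0.65, 0.45)))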
Solution here:
https://codereview.stackexchange.com/questions/194301/tenis-common-opponents-r