Speed problems with ode solver in R

I have a differential equation model in R that uses the ode solver from the deSolve package. At the moment, however, the model is running very slowly. I think this might have something to do with the function that I feed to the solver being poorly written, but I can't figure out what exactly is slowing it down or how I might speed it up. Does anyone have any ideas?
I've made an example that works in a similar way to mine:
library(data.table)
library(deSolve)
matrix_1 <- matrix(runif(100),10,10)
matrix_1[which(matrix_1 > 0.5)] <- 1
matrix_1[which(matrix_1 < 0.5)] <- 0
matrix_2 <- matrix(runif(100),10,10)
matrix_2[which(matrix_2 > 0.5)] <- 1
matrix_2[which(matrix_2 < 0.5)] <- 0
group_ID <- rep(c(1,2), 5)
N <- runif(10, 0, 100000)
Nchange <- function(t, N, parameters) {
  with(as.list(c(N, parameters)), {
    N_per_1 <- matrix_1 * N_per_connection
    N_per_1[is.na(N_per_1)] <- 0
    total_N_2 <- as.vector(N_per_1)
    if (nrow(as.matrix(N_per_1)) > 1) {
      total_N_2 <- colSums(N_per_1[drop = FALSE])
    }
    N_per_1_cost <- N_per_1
    for (i in possible_competition) {
      column <- as.vector(N_per_1[, i])
      if (sum(column) > 0) {
        active_groups <- unique(group_ID[column > 0])
        if (length(active_groups) > 1) {
          group_ID_dets <- data.table("group_ID" = group_ID, "column" = column, "n_IDS" = 1:length(group_ID))
          group_ID_dets$portions <- ave(group_ID_dets$column, group_ID_dets$group_ID, FUN = function(x) x / sum(x))
          group_ID_dets[is.na(group_ID_dets)] <- 0
          totals <- as.vector(unlist(tapply(group_ID_dets$column, group_ID_dets$group_ID, function(x) sum(x))))
          totals[is.na(totals)] <- 0
          totals <- totals * 2 - sum(totals)
          totals[totals < 0] <- 0
          group_ID_totals <- data.table("group_ID" = unique(group_ID), "totals" = as.vector(totals))
          group_ID_dets$totals <- group_ID_totals$totals[match(group_ID_dets$group_ID, group_ID_totals$group_ID)]
          N_per_1[, i] <- group_ID_dets$totals * group_ID_dets$portions
        }
      }
    }
    res_per_1 <- N_per_1 * 0.1
    N_per_2 <- matrix_2 * N_per_connection
    N_per_2[is.na(N_per_2)] <- 0
    res_per_2 <- N_per_2 * 0.1
    dN <- rowSums(res_per_1) - rowSums(N_per_1_cost * 0.00003) + rowSums(res_per_2) -
      rowSums(N_per_2 * 0.00003) - N * 0.03
    list(c(dN))
  })
} # function describing the differential equations
N_per_connection <- N/(rowSums(matrix_1) + rowSums(matrix_2))
possible_competition <- which(colSums(matrix_1 != 0)>1)
times <- seq(0, 100, by = 1)
out <- ode(y = N, times = times, func = Nchange, parms = NULL)

A good way to identify the bottleneck is with a profiler, and the profvis package provides a good way of drilling down into the results. Wrapping your code in p <- profvis({YourCodeInHere}) and then viewing the results with print(p) gives the following insights:
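For reference, a minimal sketch of that wrapper applied to the model above (assuming the objects from the question have already been created, so that the ode() call is the part being profiled):

library(profvis)
p <- profvis({
  out <- ode(y = N, times = times, func = Nchange, parms = NULL)
})
print(p)  # opens the interactive profile viewer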
The lines that are taking the most time are (in descending order of time taken):
group_ID_totals <- data.table("group_ID" = unique(group_ID), "totals" = as.vector(totals))
group_ID_dets$portions <- ave(group_ID_dets$column, group_ID_dets$group_ID, FUN = function(x) x / sum(x))
group_ID_dets <- data.table("group_ID" = group_ID, "column"= column, "n_IDS" = 1:length(group_ID))
totals <- as.vector(unlist(tapply(group_ID_dets$column, group_ID_dets$group_ID, function(x) sum(x))))
group_ID_dets$totals <- group_ID_totals$totals[match(group_ID_dets$group_ID, group_ID_totals$group_ID)]
I'm not familiar with the details of your ODE, but you should focus on optimising these tasks. I think the larger issue is that you're running these commands in a loop. Often, you'll hear that loops are slow in R, but a more nuanced discussion of this issue is found in the answers here. Some tips there might help you restructure your code/loop. Good luck!
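For what it's worth, here is a hedged sketch of one possible restructuring: because group_ID never changes between calls, the per-group bookkeeping inside the i loop can be done with plain vector operations instead of building a data.table on every pass. The helper name competition_share is mine and the sketch only mirrors the arithmetic of the original loop body, so treat it as a starting point rather than a drop-in replacement.

# Sketch only: mirrors the arithmetic of the loop body above without data.table.
# competition_share() is a hypothetical helper, not part of the original code.
competition_share <- function(column, group_ID) {
  group_sums <- tapply(column, group_ID, sum)                 # total N per group
  portions   <- column / group_sums[as.character(group_ID)]   # within-group share
  portions[is.na(portions)] <- 0
  new_totals <- pmax(group_sums * 2 - sum(group_sums), 0)     # same rule as the loop above
  as.vector(new_totals[as.character(group_ID)] * portions)
}
# Inside the loop, N_per_1[, i] <- competition_share(N_per_1[, i], group_ID)
# would then replace the data.table / ave / tapply block.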

Related

How to minimize the unacceptably long run time of the created R code

I have code with three nested for loops running on data that contains a lot of missing values. The major problem is the unacceptably long run time, which looks like it will take more than a month even though I keep my PC on for most of the day.
The structure below produces exactly what I am trying to achieve when I test it on a handful of data points. But once the number of columns and rows become 2781 and 280, respectively, it seems to take forever, even though I can see from the environment window in RStudio that it is still running correctly each time I refresh it.
My data also has lots of missing values, roughly 40%, which I suspect is making the computation time much longer as well.
The data dimension is 315 * 2781.
However, I am trying to achieve an output in a 280 * 2781 matrix form.
Could I please get some help minimizing the run time of the following code? It would be much appreciated!
options(java.parameters = "-Xmx8000m")
memory.limit(size=8e+6)
data=read.table("C:/Data/input.txt",T,sep="\t");
data=data.frame(data)[,-1]
corr<-NULL
corr2<-NULL
corr3<-NULL
for (i in 1:280) {
  corr2 <- NULL
  for (j in 1:2781) {
    data2 <- data[, -j]
    corr <- NULL
    for (k in 1:2780) {
      ifelse((is.error(grangertest(data[i:(i+35), j] ~ data2[i:(i+35), k], order = 1, na.action = na.omit)$P[2]) == TRUE) ||
               (grangertest(data[i:(i+35), j] ~ data2[i:(i+35), k], order = 1, na.action = na.omit)$P[2]) > 0.05 ||
               (is.na(grangertest(data[i:(i+35), j] ~ data2[i:(i+35), k], order = 1, na.action = na.omit)$P[2]) == TRUE),
             corr <- cbind(corr, 0),
             corr <- cbind(corr, 1))
    }
    corr2 <- rbind(corr2, corr)
  }
  corr3 <- rbind(corr3, rowSums(corr2))
}
The snippet of my data is as below:
> dput(data[1:30, 1:10])
structure(c(0.567388170165941, 0.193093325709924, 0.965938209090382,
0.348295788047835, 0.496113050729036, 0.0645384560339153, 0.946750836912543,
0.642093246569857, 0.565092500532046, 0.0952424583956599, 0.444063827162609,
0.709971546428278, 0.756330407923087, 0.601746253203601, 0.341865634545684,
0.953319212188944, 0.0788604547269642, 0.990508111426607, 0.35519331949763,
0.697004508692771, 0.285368352662772, 0.274287624517456, 0.575733694015071,
0.12937490013428, 0.00476219342090189, 0.684308280004188, 0.189448777819052,
0.615732178557664, 0.404873769031838, 0.357331350911409, 0.565436001634225,
0.380773033713922, 0.348490287549794, 0.0473814208526164, 0.389312234241515,
0.562123290728778, 0.30642102798447, 0.911173274740577, 0.566258994862437,
0.837928073247895, 0.107747194357216, 0.253737836843356, 0.651503744535148,
0.187739939894527, 0.951192815322429, 0.740037888288498, 0.0817571650259197,
0.740519099170342, 0.601534485351294, 0.120900869136676, 0.415282893227413,
0.591146623482928, 0.698511375114322, 0.08557975362055, 0.139396222075447,
0.303953414550051, 0.0743798329494894, 0.0293272000271827, 0.335832208395004,
0.665010208031163, 0.0319741254206747, 0.678886031731963, 0.154593498911709,
0.275712370406836, 0.828485634410754, 0.921500099124387, 0.651940459152684,
0.00574865937232971, 0.82236105017364, 0.55089360428974, 0.209424041677266,
0.861786168068647, 0.672873278381303, 0.301034058211371, 0.180336013436317,
0.481560358777642, 0.901354183442891, 0.986482679378241, 0.90117057505995,
0.476308439625427, 0.638073122361675, 0.27481731469743, 0.689271076582372,
0.324349449947476, 0.56620552809909, 0.867861548438668, 0.78374840435572,
0.0668482843320817, 0.276675389613956, 0.990600393852219, 0.990227151894942,
0.417612489778548, 0.391012848122045, 0.348758921027184, 0.0799746725242585,
0.88941288786009, 0.511429069796577, 0.0338982092216611, 0.240115304477513,
0.0268365524243563, 0.67206134647131, 0.816803207853809, 0.344421110814437,
0.864659120794386, 0.84128700569272, 0.116056860191748, 0.303730394458398,
0.48192183743231, 0.341675494797528, 0.0622653553728014, 0.823110743425786,
0.483212807681412, 0.968748248415068, 0.953057422768325, 0.116025703493506,
0.327919023809955, 0.590675016632304, 0.832283023977652, 0.342327545629814,
0.576901035616174, 0.942689201096073, 0.59300709143281, 0.565881528891623,
0.600007816683501, 0.133237989619374, 0.873827134957537, 0.744597729761153,
0.755133397178724, 0.0245723063126206, 0.97799762734212, 0.636845340020955,
0.73828601022251, 0.644093665992841, 0.57204390084371, 0.496023115236312,
0.703613247489557, 0.149237307952717, 0.0871439634356648, 0.0632112647872418,
0.83703236351721, 0.433215840253979, 0.430483993608505, 0.924051651498303,
0.913056606892496, 0.914889572421089, 0.215407102368772, 0.76880722376518,
0.269207723205909, 0.865548757137731, 0.28798541566357, 0.391722843516618,
0.649806497385725, 0.459413924254477, 0.907465039752424, 0.48731207777746,
0.554472463205457, 0.779784266138449, 0.566323830280453, 0.208658932242543,
0.958056638715789, 0.61858483706601, 0.838681482244283, 0.286310768220574,
0.895410191034898, 0.448722236789763, 0.297688684659079, 0.33291415637359,
0.0115265529602766, 0.850776052568108, 0.764857453294098, 0.469730701530352,
0.222089925780892, 0.0496484278701246, 0.32886885642074, 0.356443469878286,
0.612877089297399, 0.727906176587567, 0.0292073413729668, 0.429160050582141,
0.232313714455813, 0.678631312213838, 0.642334033036605, 0.99107678886503,
0.542449960019439, 0.835914565017447, 0.52798323193565, 0.303808332188055,
0.919654499506578, 0.944237019168213, 0.52141259261407, 0.794379767496139,
0.72268659202382, 0.114752230467275, 0.175116094760597, 0.437696389388293,
0.852590200025588, 0.511136321350932, 0.30879021063447, 0.174206420546398,
0.14262041519396, 0.375411552377045, 0.0204910831525922, 0.852757754037157,
0.631567053496838, 0.475924106314778, 0.508682047016919, 0.307679089019075,
0.70284536993131, 0.851252349093556, 0.0868967010173947, 0.586291917832568,
0.0529140203725547, 0.440692059928551, 0.207642213441432, 0.777513341512531,
0.141496006632224, 0.548626560717821, 0.419565241318196, 0.0702310993801802,
0.499403427587822, 0.189343606121838, 0.370725362794474, 0.888076487928629,
0.83070912421681, 0.466137421084568, 0.177098380634561, 0.91202046489343,
0.142300580162555, 0.823691181838512, 0.41561916610226, 0.939948018174618,
0.806491429451853, 0.795849160756916, 0.566376683535054, 0.36814984655939,
0.307756055146456, 0.602875682059675, 0.506007500691339, 0.538658684119582,
0.420845189364627, 0.663071365095675, 0.958144341595471, 0.793743418296799,
0.983086514985189, 0.266262857476249, 0.817585011478513, 0.122843299992383,
0.989197303075343, 0.71584410732612, 0.500571243464947, 0.397394519997761,
0.659465527161956, 0.459530522814021, 0.602246116613969, 0.250076721422374,
0.17533828667365, 0.6599256307818, 0.184704560553655, 0.15679649473168,
0.513444944983348, 0.205572377191857, 0.430164282443002, 0.131548407254741,
0.914019819349051, 0.935795902274549, 0.857401241315529, 0.977940042736009,
0.41389597626403, 0.179183913161978, 0.431347143370658, 0.477178965462372,
0.121315707685426, 0.107695729471743, 0.634954946814105, 0.859707030234858,
0.855825762730092, 0.708672808250412, 0.674073817208409, 0.672288877889514,
0.622144045541063, 0.433355041313916, 0.952878215815872, 0.229569894727319,
0.289388840552419, 0.937473804224283, 0.116283216979355, 0.659604362910613,
0.240837284363806, 0.726138337515295, 0.68390148691833, 0.381577257299796,
0.899390475358814, 0.26472729514353, 0.0383855854161084, 0.855232689995319,
0.655799814499915, 0.335587574867532, 0.163842789363116, 0.0353666560258716,
0.048316186061129), .Dim = c(30L, 10L))
I converted just the inner loop to mapply and did a quick speed test:
library(lmtest)
data <- matrix(runif(315*2781), nrow = 315)
get01 <- function(x, y) {
try(gt <- grangertest(x ~ y, order = 1, na.action = na.omit)$P[2])
if (exists("gt")) {
if (gt > 0.05 || is.na(gt)) {
return(0)
} else {
return(1)
}
} else {
return(0)
}
}
i <- 1; j <- 1
system.time(corr <- mapply(function(k) {get01(data[i:(i+35),j], data[i:(i+35),k])}, (1:2781)[-j]))
#> user system elapsed
#> 21.505 0.014 21.520
The inner mapply would need to run 280 × 2781 = 778,680 times, and at roughly 21.5 seconds per run that puts it at about 200 days. You'll either need a different approach to the Granger test or several cores. Here's the command to replace the full loop:
corr3 <- t(mapply(function(i) colSums(mapply(function(j) mapply(function(k) {get01(data[i:(i+35),j], data[i:(i+35),k])}, (1:2781)[-j]), 1:2781)), 1:280))
To parallelize, replace the outer mapply with simplify2array(parLapply(...)):
library(parallel)
cl <- makeCluster(detectCores())
clusterExport(cl, list("data", "get01"))
clusterEvalQ(cl, library(lmtest))  # load lmtest on every worker
corr3 <- t(simplify2array(parLapply(cl, 1:280, function(i) colSums(mapply(function(j) mapply(function(k) {get01(data[i:(i+35),j], data[i:(i+35),k])}, (1:2781)[-j]), 1:2781)))))
stopCluster(cl)
Here is a version, not parallelized, that speeds up the code in the question by a factor greater than 4.
Some bottlenecks in the question's code are easy to detect:
The matrices corr, corr2 and corr3 are extended inside the loops; the solution is to reserve the memory beforehand (see the small illustration after this list);
grangertest is called three times per inner iteration when only one call is needed;
Growing corr with cbind(corr, 0) or cbind(corr, 1) adds one element at a time, which in effect builds a vector, not a matrix.
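A tiny illustration of the first point (not part of the benchmark below): growing an object call by call is dramatically slower than filling a preallocated one.

# Illustration only: growing with cbind vs. filling a preallocated vector.
n <- 2e4
system.time({ v <- NULL;       for (k in 1:n) v <- cbind(v, k) })  # grows at every step
system.time({ v <- integer(n); for (k in 1:n) v[k] <- k })         # preallocated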
Here is a comparative test between the question's code and the function below.
library(lmtest)
# avoids loading an extra package
is.error <- function(x){
inherits(x, c("error", "try-error"))
}
Lag <- 5L
nr <- nrow(data)
nc <- ncol(data)
t0 <- system.time({
corr<-NULL
corr2<-NULL
corr3<-NULL
for(i in 1:(nr - Lag))
{
corr2<-NULL
data3 <- data[i:(i + Lag), ]
for(j in 1:nc)
{
data2<-data[,-j]
corr<-NULL
for(k in 1:(nc - 1L))
{
ifelse((is.error(grangertest(data[i:(i+Lag),j] ~ data2[i:(i+Lag),k], order = 1, na.action = na.omit)$P[2])==TRUE) ||
(grangertest(data[i:(i+Lag),j] ~ data2[i:(i+Lag),k], order = 1, na.action = na.omit)$P[2])>0.05 ||
(is.na(grangertest(data[i:(i+Lag),j] ~ data2[i:(i+Lag),k], order = 1, na.action = na.omit)$P[2])==TRUE),
corr<-cbind(corr,0),
corr<-cbind(corr,1)
)
}
corr2 <- rbind(corr2, corr)
}
corr3<-rbind(corr3, rowSums(corr2))
}
corr3
})
I will use a simplified version of lmtest::grangertest.
granger_test <- function (x, y, order = 1, na.action = na.omit, ...) {
xnam <- deparse(substitute(x))
ynam <- deparse(substitute(y))
n <- length(x)
all <- cbind(x = x[-1], y = y[-1], x_1 = x[-n], y_1 = y[-n])
y <- as.vector(all[, 2])
lagX <- as.matrix(all[, (1:order + 2)])
lagY <- as.matrix(all[, (1:order + 2 + order)])
fm <- lm(y ~ lagY + lagX)
rval <- lmtest::waldtest(fm, 2, ...)
attr(rval, "heading") <- c("Granger causality test\n", paste("Model 1: ",
ynam, " ~ ", "Lags(", ynam, ", 1:", order, ") + Lags(",
xnam, ", 1:", order, ")\nModel 2: ", ynam, " ~ ", "Lags(",
ynam, ", 1:", order, ")", sep = ""))
rval
}
And now the function to run the tests.
f_Rui <- function(data, Lag){
nr <- nrow(data)
nc <- ncol(data)
corr3 <- matrix(0, nrow = nr - Lag, ncol = nc)
data3 <- matrix(0, nrow = Lag + 1L, ncol = nc)
data2 <- matrix(0, nrow = Lag + 1L, ncol = nc - 1L)
for(i in 1:(nr - Lag)) {
corr2 <- matrix(0, nrow = nc, ncol = nc - 1L)
data3[] <- data[i:(i + Lag), ]
for(j in 1:nc) {
corr <- integer(nc - 1L)
data2[] <- data3[, -j]
for(k in 1:(nc - 1L)){
res <- tryCatch(
grangertest(x = data2[, k], y = data3[, j], order = 1, na.action = na.omit),
error = function(e) e
)
if(!inherits(res, "error") && !is.na(res[['Pr(>F)']][2]) && res[['Pr(>F)']][2] <= 0.05) {
corr[k] <- 1L
}
}
corr2[j, ] <- corr
}
corr3[i, ] <- rowSums(corr2)
}
corr3
}
The results are identical and the timings much better.
t1 <- system.time({
res <- f_Rui(data, 5L)
})
identical(corr3, res)
#[1] TRUE
times <- rbind(t0, t1)
t(t(times)/t1)
# user.self sys.self elapsed user.child sys.child
#t0 4.682908 1.736111 4.707783 NaN NaN
#t1 1.000000 1.000000 1.000000 NaN NaN

R script (including multiple functions and loop) is very slow on a specific part

I have the code below, which takes ages to run. I have already tried several things, such as limiting the number of loops, using ifelse statements, and declaring as much as possible outside the loop. However, it still takes a very long time.
What would be a good way to improve this part of my code and speed it up?
Is there something I'm not seeing?
z <- 625
numDays <- 365
k <- numDays * 96
#To estimate the size of the list
df <- rmarkovchain(n=365, object = mcList, t0= "home", include.t0 = TRUE)
allTheCars <- rep(list(df), z)
#an example of df below:
Locations <- c("Home", "Bakery", "Grocery", "Home-Bakery", "Home-Grocery", "Bakery-Home", "Bakery-Grocery", "Grocery-Home", "Grocery-Bakery")
Iteration <- rep(seq(1:96), 365)
df <- data.frame(Iteration, sample(Locations, k, replace = TRUE))
#The loop takes a huge amount of time
for (y in 1:z) {
  df <- rmarkovchain(n = 365, object = mcList, t0 = "Home", include.t0 = TRUE)
  df$Begin <- 0
  df[1, 3] <- b
  df$Still <- ifelse(df$values == "Home", 1, 0)
  df$KM <- vlookup(df$values, averageDistance, lookup_column = 1, result_column = 2)
  df$Load <- ifelse(df$Still == 1, cp, 0)
  df$costDistance <- df$KM * 0.21
  df$End <- 0
  df[is.na(df)] <- 0
  df$reduce <- rep(seq(1:97), numDays)
  df <- df %>% filter(reduce != 97)
  df$Load <- ifelse(df$reduce <= 69 | df$reduce >= 87, df$Load, 0)
  for (i in 1:k) {
    df[i, 3] <- ifelse(df[i, 3] < b, pmin(df[i, 3] + df[i, 6], b), df[i, 3])
    df[i, 8] <- df[i, 3] - df[i, 7]
    j <- i + 1
    df[j, 3] <- df[i, 8]
  }
  allDf[[y]] <- df
}
EDIT:
After Minem's suggestion to look at profvis, I found that the second for loop takes by far the most time; it now looks like this:
for (i in 1:k) {
  mainVector <- df[i, 3]
  extra <- df[i, 6]
  subtractingVector <- df[i, 7]
  mainVector <- ifelse(mainVector < b, pmin(mainVector + extra, b), mainVector)
  newMain <- mainVector - subtractingVector
  j <- i + 1
  df[j, 3] <- newMain
}
Now pulling the first three values out of the data frame takes some time, and the last line, which writes the calculated value back into the data frame, costs the most. Is there any way to improve this?
Edit 2:
Reproducible example for all
library(dplyr)
library(markovchain)
library(expss)
matrixExample <- matrix(sample(runif(81, min = 0 , max =1), replace = FALSE ), nrow = 9, ncol = 9)
mcListLoop <- rep(list(matrixExample), 96)
mcList <- new("markovchainList", markovchains = mcListLoop)
distance <- runif(9, min = 5, max =10)
Locations <- c("Home", "Bakery", "Grocery", "Home-Bakery", "Home-Grocery", "Bakery-Home", "Bakery-Grocery", "Grocery-Home", "Grocery-Bakery")
averageDistance <- data.frame(cbind(distance, Locations))
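A hedged sketch of one pattern that often removes this kind of bottleneck: run the recurrence from the first edit on plain numeric vectors and write the results back into the data frame once at the end. Column positions (3 = Begin, 6 = Load, 7 = costDistance, 8 = End) are assumed from the question's code, so treat this as a starting point rather than a tested replacement.

# Sketch only: same recurrence as the inner loop above, on plain vectors.
begin    <- df[[3]]; load_amt <- df[[6]]; cost <- df[[7]]
end      <- numeric(k)
for (i in 1:k) {
  if (begin[i] < b) begin[i] <- min(begin[i] + load_amt[i], b)
  end[i] <- begin[i] - cost[i]
  if (i < k) begin[i + 1] <- end[i]   # the original loop writes one row past the end at i = k
}
df[[3]] <- begin
df[[8]] <- end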

Speeding up linear model fitting on complete pairwise observations in large sparse matrix in R

I have a numeric data.frame df with 134946 rows x 1938 columns.
99.82% of the data are NA.
For each pair of (distinct) columns "P1" and "P2", I need to find which rows have non-NA values for both and then do some operations on those rows (linear model).
I wrote a script that does this, but it seems quite slow.
This post seems to discuss a related task, but I can't immediately see if or how it can be adapted to my case.
Borrowing the example from that post:
set.seed(54321)
nr = 1000;
nc = 900;
dat = matrix(runif(nr*nc), nrow=nr)
rownames(dat) = paste(1:nr)
colnames(dat) = paste("time", 1:nc)
dat[sample(nr*nc, nr*nc*0.9)] = NA
df <- as.data.frame(dat)
df_ps <- names(df)
N_ps <- length(df_ps)
My script is:
tic = proc.time()
out <- do.call(rbind,sapply(1:(N_ps-1), function(i) {
if (i/10 == floor(i/10)) {
cat("\ni = ",i,"\n")
toc = proc.time();
show(toc-tic);
}
do.call(rbind,sapply((i+1):N_ps, function(j) {
w <- which(complete.cases(df[,i],df[,j]))
N <- length(w)
if (N >= 5) {
xw <- df[w,i]
yw <- df[w,j]
if ((diff(range(xw)) != 0) & (diff(range(yw)) != 0)) {
s <- summary(lm(yw~xw))
o <- c(i,j,N,s$adj.r.squared,s$coefficients[2],s$coefficients[4],s$coefficients[8],s$coefficients[1],s$coefficients[3],s$coefficients[7])} else {
o <- c(i,j,N,rep(NA,7))
}
} else {o <- NULL}
return(o)
},simplify=F))
}
,simplify=F))
toc = proc.time();
show(toc-tic);
This takes about 10 minutes on my machine.
You can imagine what happens when I need to handle a much larger (although more sparse) data matrix. I never managed to finish the calculation.
Question: do you think this could be done more efficiently?
The thing is, I don't know which operations take the most time: the subsetting of df (in which case I would remove the duplicated subsetting), appending the matrix data (in which case I would build a flat vector and convert it to a matrix at the end), or something else.
Thanks!
EDIT following up from minem's post
As shown by minem, the speed of this calculation strongly depended on the way linear regression parameters were calculated. Therefore changing that part was the single most important thing to do.
My own further trials showed that: 1) it was essential to use sapply in combination with do.call(rbind, ...) rather than any flat vector to store the data (I am still not sure why; I might make a separate post about this); 2) on the original matrix I am working on, which is much more sparse and has a much larger nrow/ncol ratio than the one in this example, using the information about the x vector available at the start of each i iteration to reduce the y vector at the start of each j iteration increased the speed by several orders of magnitude, even compared with minem's original script, which was already much better than mine above.
I suppose the advantage comes from filtering out many rows a priori, thus avoiding costly xna & yna operations on very long vectors.
The modified script is the following:
set.seed(54321)
nr = 1000;
nc = 900;
dat = matrix(runif(nr*nc), nrow = nr)
rownames(dat) = paste(1:nr)
colnames(dat) = paste("time", 1:nc)
dat[sample(nr*nc, nr*nc*0.90)] = NA
df <- as.data.frame(dat)
df_ps <- names(df)
N_ps <- length(df_ps)
tic = proc.time()
naIds <- lapply(df, function(x) !is.na(x))
dl <- as.list(df)
rl <- sapply(1:(N_ps - 1), function(i) {
if ((i-1)/10 == floor((i-1)/10)) {
cat("\ni = ",i,"\n")
toc = proc.time();
show(toc-tic);
}
x <- dl[[i]]
xna <- which(naIds[[i]])
rl2 <- sapply((i + 1):N_ps, function(j) {
y <- dl[[j]][xna]
yna <- which(naIds[[j]][xna])
w <- xna[yna]
N <- length(w)
if (N >= 5) {
xw <- x[w]
yw <- y[yna]
if ((min(xw) != max(xw)) && (min(yw) != max(yw))) {
# extracts from lm/lm.fit/summary.lm functions
X <- cbind(1L, xw)
m <- .lm.fit(X, yw)
# calculate adj.r.squared
fitted <- yw - m$residuals
rss <- sum(m$residuals^2)
mss <- sum((fitted - mean(fitted))^2)
n <- length(m$residuals)
rdf <- n - m$rank
# rdf <- df.residual
r.squared <- mss/(mss + rss)
adj.r.squared <- 1 - (1 - r.squared) * ((n - 1L)/rdf)
# calculate se & pvals
p1 <- 1L:m$rank
Qr <- m$qr
R <- chol2inv(Qr[p1, p1, drop = FALSE])
resvar <- rss/rdf
se <- sqrt(diag(R) * resvar)
est <- m$coefficients[m$pivot[p1]]
tval <- est/se
pvals <- 2 * pt(abs(tval), rdf, lower.tail = FALSE)
res <- c(m$coefficients[2], se[2], pvals[2],
m$coefficients[1], se[1], pvals[1])
o <- c(i, j, N, adj.r.squared, res)
} else {
o <- c(i,j,N,rep(NA,7))
}
} else {o <- NULL}
return(o)
}, simplify = F)
do.call(rbind, rl2)
}, simplify = F)
out2 <- do.call(rbind, rl)
toc = proc.time();
show(toc - tic)
E.g. try with nr=100000; nc=100.
I should probably mention that I tried using indices, i.e.:
naIds <- lapply(df, function(x) which(!is.na(x)))
and then obviously generating w by intersection:
w <- intersect(xna,yna)
N <- length(w)
This however is slower than the above.
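As a small illustration of why the logical-vector version tends to win over index intersection (an illustrative micro-benchmark; exact numbers will vary by machine):

# Logical AND vs. intersecting index vectors on sparse data of similar density.
x <- runif(1e5); x[sample(1e5, 9e4)] <- NA
y <- runif(1e5); y[sample(1e5, 9e4)] <- NA
xl <- !is.na(x); yl <- !is.na(y)
system.time(for (r in 1:1000) w1 <- which(xl & yl))                   # logical AND
system.time(for (r in 1:1000) w2 <- intersect(which(xl), which(yl)))  # index intersection
identical(w1, w2)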
The largest bottleneck is the lm function, because it performs a lot of checks and additional calculations that you do not necessarily need. So I extracted only the parts that are needed.
I got this to run in +/- 18 seconds.
set.seed(54321)
nr = 1000;
nc = 900;
dat = matrix(runif(nr*nc), nrow = nr)
rownames(dat) = paste(1:nr)
colnames(dat) = paste("time", 1:nc)
dat[sample(nr*nc, nr*nc*0.9)] = NA
df <- as.data.frame(dat)
df_ps <- names(df)
N_ps <- length(df_ps)
tic = proc.time()
naIds <- lapply(df, function(x) !is.na(x)) # outside loop
dl <- as.list(df) # sub-setting list elements is faster that columns
rl <- sapply(1:(N_ps - 1), function(i) {
x <- dl[[i]]
xna <- naIds[[i]] # relevant logical vector if not empty elements
rl2 <- sapply((i + 1):N_ps, function(j) {
y <- dl[[j]]
yna <- naIds[[j]]
w <- xna & yna
N <- sum(w)
if (N >= 5) {
xw <- x[w]
yw <- y[w]
if ((min(xw) != max(xw)) && (min(yw) != max(yw))) { # faster than diff(range())
# extracts from lm/lm.fit/summary.lm functions
X <- cbind(1L, xw)
m <- .lm.fit(X, yw)
# calculate adj.r.squared
fitted <- yw - m$residuals
rss <- sum(m$residuals^2)
mss <- sum((fitted - mean(fitted))^2)
n <- length(m$residuals)
rdf <- n - m$rank
# rdf <- df.residual
r.squared <- mss/(mss + rss)
adj.r.squared <- 1 - (1 - r.squared) * ((n - 1L)/rdf)
# calculate se & pvals
p1 <- 1L:m$rank
Qr <- m$qr
R <- chol2inv(Qr[p1, p1, drop = FALSE])
resvar <- rss/rdf
se <- sqrt(diag(R) * resvar)
est <- m$coefficients[m$pivot[p1]]
tval <- est/se
pvals <- 2 * pt(abs(tval), rdf, lower.tail = FALSE)
res <- c(m$coefficients[2], se[2], pvals[2],
m$coefficients[1], se[1], pvals[1])
o <- c(i, j, N, adj.r.squared, res)
} else {
o <- c(i, j, N, rep(NA, 7)) # 7 NAs so the row length matches the successful branch
}
} else {o <- NULL}
return(o)
}, simplify = F)
do.call(rbind, rl2)
}, simplify = F)
out2 <- do.call(rbind, rl)
toc = proc.time();
show(toc - tic);
# user system elapsed
# 17.94 0.11 18.44
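As an optional sanity check (assuming the pair below survived the N >= 5 filter), one row of out2 can be compared against summary(lm()) for the same pair:

# Optional check: compare the fast path against summary(lm()) for one column pair.
i <- 1; j <- 2
w <- !is.na(df[[i]]) & !is.na(df[[j]])
s <- summary(lm(df[[j]][w] ~ df[[i]][w]))
fast <- out2[out2[, 1] == i & out2[, 2] == j, ]
all.equal(unname(fast[4]), s$adj.r.squared)               # adjusted R^2
all.equal(unname(fast[5]), unname(s$coefficients[2, 1]))  # slope estimate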

User defined function input to loop every row of data frame

I'm trying to create my very first project in R but have hit a roadblock.
I have a data frame like the one below, where every row represents the data for a financial option.
type <- c("C", "C")
marketV <- c(1.1166, 1.911)
S <- c(20, 60)
K <- c(20, 56)
T <- c(0.333, 0.5)
df <- data.frame(type, marketV, S, K, T)
I made a user-defined function that takes this data frame as input, and it works great when the data frame is one row long. However, I'm not sure how to have my function iterate through all the rows of the data frame and produce a result for each of them.
I'm new to R, so I'm unsure whether I should wrap it in a for loop, play around with lapply, or whether there's a simple syntax answer. I simply want the function to take the df as input, repeat its calculation for each of the n rows, and produce n results. Thank you in advance for the help.
My current function code for a one-row df is below for reference:
This is a corrected version of your program:
df <- data.frame(type=c("C", "C"), marketV=c(1.1166, 1.911), S=c(20, 60), K=c(20, 56), T=c(0.333, 0.5))
IV <- function(df) {
  # check if df has more than 1 row:
  if (nrow(df) > 1) { message("!! nrow(df)>1 !!"); return(NA) }
  # Initializing of variables
  r <- 0
  sigma <- 0.3
  sigma_down <- 0.001
  sigma_up <- 1
  count <- 0
  type <- df$type; marketV <- df$marketV; S <- df$S; K <- df$K; T <- df$T
  d1 <- (log(S/K) + (sigma^2/2)*T)/(sigma*sqrt(T))
  d2 <- (log(S/K) - (sigma^2/2)*T)/(sigma*sqrt(T))
  if (type == "C") {
    V <- exp(-r*T)*(S*pnorm(d1) - K*pnorm(d2))
  } else {
    V <- exp(-r*T)*(K*pnorm(-d2) - S*pnorm(-d1))
  }
  difference <- V - marketV
  # Root finding of sigma by the bisection method
  while (abs(difference) > 0.001 && count < 1000) {
    if (difference < 0) {
      sigma_down <- sigma
      sigma <- (sigma_up + sigma)/2
    } else {
      sigma_up <- sigma
      sigma <- (sigma_down + sigma)/2
    }
    d1 <- (log(S/K) + (sigma^2/2)*T)/(sigma*sqrt(T))
    d2 <- d1 - sigma*sqrt(T)
    if (type == "C") {
      V <- exp(-r*T)*(S*pnorm(d1) - K*pnorm(d2))
    } else {
      V <- exp(-r*T)*(K*pnorm(-d2) - S*pnorm(-d1))
    }
    difference <- V - marketV
    count <- count + 1
  }
  if (count == 1000) {
    return(NA) # If no sigma satisfying the Black76 price can be found
  } else {
    return(sigma)
  }
}
sapply(split(df, seq(nrow(df))), IV)
The main thing is to run row by row through the dataframe. This is done by
sapply(split(df, seq(nrow(df))), IV)
Your original function had several errors. The biggest was the way S, K and the other inputs were accessed: you might have thought you were taking the values from the data frame df, but in fact you were taking them from the workspace! I corrected this by redefining:
type <- df$type; marketV <- df$marketV; S <- df$S; K <- df$K; T <- df$T
I inserted a test for the number of rows in df, so you will get:
> IV(df)
!! nrow(df)>1 !!
[1] NA
Here is a cleaned up version of your program:
df <- data.frame(type=c("C", "C"), marketV=c(1.1166, 1.911), S=c(20, 60), K=c(20, 56), T=c(0.333, 0.5))
IV2 <- function(type, marketV, S, K, T) {
  r <- 0; sigma <- 0.3
  sigma_down <- 0.001; sigma_up <- 1
  count <- 0
  if (type == "C") {
    f.sig <- function(sigma) {
      d1 <- (log(S/K) + (sigma^2/2)*T)/(sigma*sqrt(T))
      d2 <- d1 - sigma*sqrt(T)
      exp(-r*T)*(S*pnorm(d1) - K*pnorm(d2)) - marketV
    }
  } else {
    f.sig <- function(sigma) {
      d1 <- (log(S/K) + (sigma^2/2)*T)/(sigma*sqrt(T))
      d2 <- d1 - sigma*sqrt(T)
      exp(-r*T)*(K*pnorm(-d2) - S*pnorm(-d1)) - marketV
    }
  }
  ifelse(f.sig(sigma_down)*f.sig(sigma_up) < 0, uniroot(f.sig, c(sigma_down, sigma_up))$root, NA) # sigma
}
sapply(split(df, seq(nrow(df))), do.call, what="IV2")
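For completeness, the same row-wise evaluation can also be written with mapply over the columns of df, assuming IV2 as defined above:

# Alternative: pass each column of df as the matching argument of IV2.
mapply(IV2, type = as.character(df$type), marketV = df$marketV,
       S = df$S, K = df$K, T = df$T)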

Replacing nested for loops with an apply function

I am building a movie recommendation engine, and the code below computes the similarity matrix.
data <- read.csv('movie_test.csv')
similarity <- matrix(NA, nrow(data), nrow(data))
for (i in 1:nrow(data)) {
  for (j in 1:nrow(data)) {
    if (i != j) {
      similarity[i, j] <- sum((data[i, ] * data[j, ]), na.rm = TRUE) /
        (sqrt(sum(((data[i, ] - data[j, ] + data[j, ]) * data[i, ]), na.rm = TRUE)) *
           sqrt(sum(((data[j, ] - data[i, ] + data[i, ]) * data[j, ]), na.rm = TRUE)))
    }
  }
}
For a small dataset this works perfectly, but for 900 users and 1000 movies it does not scale. I have heard that the apply family of functions is faster, but I doubt even that will scale. Is there any other way I can achieve this without using a for loop?
Thank you so much for your suggestions!!!
This should be fast:
m <- as.matrix(data)
m[is.na(m)] <- 0
z <- m %*% t(m)
d <- sqrt(diag(z))
similarity <- t(t(z) / d) / d
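The last line divides each entry z[i, j] by d[i] * d[j]; an equivalent and perhaps more explicit way to write the same normalisation is:

# Equivalent form of the normalisation step:
similarity <- z / outer(d, d)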
The diagonal will contain 1, which seems more appropriate than NA, but if you prefer you can always do:
diag(similarity) <- NA
