I am working on calculating a new raster (output ras) based on two input rasters (input ras) and a 'stratum' raster. The stratum raster values (1 to 4) refer to the rows in the bias and weight data frames. Stratum value '4' was used to fill any NA in the stratum raster; otherwise the function would crash. The following input is required.
# load library
library(raster)
# reproducing the bias and weight data.frames
bias <- data.frame(
  ras_1 = c(56, -7, -30, 0),
  ras_2 = c(29, 18, -52, 0),
  ras_3 = c(44,  4, -15, 0)
)
rownames(bias) <- c("Strat 1","Strat 2","Strat 3","Strat 4")
weight <- data.frame(
  ras_1 = c(0.56, 0.66, 0.23, 0.33),
  ras_2 = c(0.03, 0.18, 0.50, 0.33),
  ras_3 = c(0.41, 0.16, 0.22, 0.34)
)
rownames(weight) <- c("Strat 1","Strat 2","Strat 3","Strat 4")
The following function (fusion) allows me to add a 'bias' value to the input rasters. After the bias has been added, the two corrected input raster cell values are multiplied by a weight value, depending on which stratum they belong to.
The two weighted raster values are then summed and returned using 'calc'.
## Create raster data for input
# create 2 rasters
r1 <- raster(ncol=10,nrow=10)
r2 <- raster(ncol=10,nrow=10)
r1[] <- sample(seq(from = 1, to = 500, by = 1), size = 100, replace = TRUE)
r2[] <- sample(seq(from = 1, to = 500, by = 1), size = 100, replace = TRUE)
r2[1:2] <- NA # include NA in input maps for example purpose
# Create stratum raster (4 strata)
r3 <- raster(ncol=10,nrow=10)
r3[] <- sample(seq(from = 1, to = 4, by = 1), size = 100, replace = TRUE)
Strata.n <- 4 # number of strata values in this example
fusion <- function(x) {
  result <- matrix(NA, dim(x)[1], 1)
  for (n in 1:Strata.n) {
    ok <- !is.na(x[,3]) & x[,3] == n
    a <- x[ok,1] + bias[n,1] # add bias to first input raster value
    b <- x[ok,2] + bias[n,2] # add bias to second input raster value
    result[ok] <- a * weight[n,1] + b * weight[n,2] # multiply values by weights
  }
  return(result)
}
s <- stack(r1,r2,r3)
Fused.map <- calc(s, fun = fusion, progress = 'text')
The problems with the above function are that:
- It is only suited for two rasters.
- If one raster has NA, then the result will be NA for that cell.
is.na(Fused.map@data@values) # check for NA in the fused map
What I would like to have is:
- A function that takes any number of input rasters.
- A function that can work with NA values (i.e. ignores NA values in the rasters).
- A function that re-adjusts the 'weight' if a raster has an NA value, so that the remaining weight values add up to 1.
EDIT
The following function does what I need, but is significantly slower than the function above on large rasters: fusion does it in 10 seconds, while the fusion2 function below needs 8 hours on large rasters...
fusion2 <- function(x) {
  m <- matrix(x, nrow = 1, ncol = 3) # create matrix per stack of cells
  n <- m[,3] # get the stratum
  g <- m[1:(Strata.n-1)] + as.matrix(bias[n,]) # add bias to raster values
  g[g < 0] <- 0 # set values below 0 to 0
  w <- weight[n,1:(Strata.n-1)] # get correct strata weight values
  w[is.na(g)] <- NA # set weight to NA if (g) raster values are NA
  p <- sum(w, na.rm = T) # calculate sum of weight values
  pp <- w/p # divide weight values by the sum so the proportions sum to 1
  pp <- as.numeric(pp)
  result <- as.integer(round(sum(pp*g, na.rm = T))) # return raster value
  return(result)
}
Fused.map <- calc(s, fun = fusion2, progress = 'text')
Is there any way to optimize the fusion2 function so that it performs like the original fusion function?
> sessionInfo()
R version 3.3.2 (2016-10-31)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows >= 8 x64 (build 9200)
Thank you for your time!
There seem to be a lot of unnecessary format conversions going on, and using the simplest data structures available is fastest. The argument that calc passes to your function is a numeric vector, so you can use numeric vectors everywhere. Also, rounding and then casting to an integer is redundant.
fusion3 <- function(x) {
  n <- x[3] # get the stratum
  g <- x[1:(Strata.n-1)] + as.numeric(bias[n,]) # add bias to raster values
  g[g < 0] <- 0 # set values below 0 to 0
  w <- as.numeric(weight[n,1:(Strata.n-1)]) # get correct strata weight values
  w[is.na(g)] <- NA # set weight to NA if (g) raster values are NA
  p <- sum(w, na.rm = T) # calculate sum of weight values
  pp <- w/p # divide weight values by the sum so the proportions sum to 1
  result <- as.integer(sum(pp*g, na.rm = T)+0.5) # return raster value
  return(result)
}
On a 100x100 raster, your original functions take:
system.time(Fused.map <- calc(s, fun = fusion, progress = 'text'))
user system elapsed
0.015 0.000 0.015
system.time(Fused.map <- calc(s, fun = fusion2, progress = 'text'))
user system elapsed
8.270 0.078 8.312
The modified function is already about four times faster:
system.time(Fused.map3 <- calc(s, fun = fusion3, progress = 'text'))
user system elapsed
1.970 0.026 1.987
Next, precompute matrices from the data frames so you don't need to do that for each pixel:
bias_matrix = as.matrix(bias)
weight_matrix = as.matrix(weight)
fusion3 <- function(x) {
  n <- x[3] # get the stratum
  g <- x[1:(Strata.n-1)] + bias_matrix[n,] # add bias to raster values
  g[g < 0] <- 0 # set values below 0 to 0
  w <- weight_matrix[n,1:(Strata.n-1)] # get correct strata weight values
  w[is.na(g)] <- NA # set weight to NA if (g) raster values are NA
  p <- sum(w, na.rm = T) # calculate sum of weight values
  pp <- w/p # divide weight values by the sum so the proportions sum to 1
  result <- as.integer(sum(pp*g, na.rm = T)+0.5) # return raster value
  return(result)
}
We get:
system.time(Fused.map3 <- calc(s, fun = fusion3, progress = 'text'))
user system elapsed
0.312 0.008 0.318
And finally, also precompute 1:(Strata.n-1):
bias_matrix = as.matrix(bias)
weight_matrix = as.matrix(weight)
Strata.minus1 = 1:(Strata.n-1)
fusion3 <- function(x) {
  n <- x[3] # get the stratum
  g <- x[Strata.minus1] + bias_matrix[n,] # add bias to raster values
  g[g < 0] <- 0 # set values below 0 to 0
  w <- weight_matrix[n,Strata.minus1] # get correct strata weight values
  w[is.na(g)] <- NA # set weight to NA if (g) raster values are NA
  p <- sum(w, na.rm = T) # calculate sum of weight values
  pp <- w/p # divide weight values by the sum so the proportions sum to 1
  result <- as.integer(sum(pp*g, na.rm = T)+0.5) # return raster value
  return(result)
}
We get:
system.time(Fused.map3 <- calc(s, fun = fusion3, progress = 'text'))
user system elapsed
0.252 0.011 0.262
That's not quite 0.015 yet, but you also have to take into consideration that your original function does not output integers, nor does it set values below 0 to 0, nor does it make the proportions sum to 1, nor, as you mentioned, does it deal with NAs.
Mind you, this function still only works with two rasters, because you hardcode the stratum as layer 3. You should instead use raster::overlay with two kinds of arguments, the stratum raster and the input layers themselves (or use calc with the stratum raster as layer 1, but that's not what calc is designed for).
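A minimal sketch of that overlay idea (untested, written for the two input rasters above, assuming the precomputed bias_matrix and weight_matrix from earlier, and omitting the integer rounding for brevity) could look like this:
fuse_vec <- function(strat, v1, v2) {
  # strat, v1 and v2 arrive as whole vectors of cell values, so this is vectorized
  g <- cbind(v1, v2) + bias_matrix[strat, 1:2] # per-stratum bias, indexed by row
  g[g < 0] <- 0
  w <- weight_matrix[strat, 1:2]
  w[is.na(g)] <- NA # drop the weights of NA inputs
  rowSums(w * g, na.rm = TRUE) / rowSums(w, na.rm = TRUE) # renormalize weights
}
Fused.map <- overlay(r3, r1, r2, fun = fuse_vec)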
So, I've been trying to do a simulation study of an SIR model.
I have the following code (I tried to clean it up):
# Initial parameters
N <- 1E6 # Total population
I <- 1 # Number of Infectious at time 0
S <- N-1 # Number of Susceptibles at time 0
R <- 0 # Number of Recovered at time 0
# Data frame to store observations in
Df1 <- data.frame("final_size" = as.numeric(), "peak_size" = as.numeric())
# Setting a seed for reproducibility
set.seed(1996)
n_sim <- 100
# Setting different values for R0, the basic reproduction number
R0 <- seq(0.5,2.5, 1)
for (values in R0) {
  if (values == 0.5) {
    # Transmission parameters
    R0 <- values # Basic reproduction number
    nu <- 1/6 # Recovery rate (in days)
    b <- nu*R0/N # Infection rate (in days)
    for (sim in n_sim) {
      temp <- NULL
      # Binomial model
      #----------------
      # Initial states
      Sold = S # Number of Susceptibles at time t=0
      Iold = I # Number of Infectious at time t=0
      Rold = R # Number of Recovered at time t=0
      # Output vectors
      Svec = Sold; Ivec = Iold; Rvec = Rold
      stop = FALSE
      # Loop - continue until stop=TRUE
      while (!stop) {
        Ih = rbinom(1, Sold, (1-exp(-b*Iold)))
        Rh = rbinom(1, Iold, (1-exp(-nu)))
        Rh = nu*Iold
        Sold = Sold-Ih
        Iold = Iold+Ih-Rh
        Rold = Rold+Rh
        Svec = c(Svec, Sold)
        Ivec = c(Ivec, Iold)
        Rvec = c(Rvec, Rold)
        if (Iold <= 2e-5) {stop = TRUE}
      }
      peak_size_df <- max(Ivec)
      final_size_df <- Rvec[length(Rvec)]/N
      temp <- rbind(temp, c(final_size_df, peak_size_df))
      colnames(temp) <- c("final_size", "peak_size")
      Df1 <- rbind(Df1, temp)
    }
  }
}
I'm looking to store the results in Df1. However, at the end of the loop only one iteration has been stored, I assume the last one, which I don't properly understand. I've recoded it a few times, and in those cases I ended up with 99 NA rows for final size and peak size. In this version I only end up with one row (with values, however). I plan on expanding the loop to the other values of R0 in the R0 sequence, but since it isn't working for the first value, I haven't expanded it yet.
Any suggestions? Improvements?
After comments from Gregory and Cath, I made the following adjustments:
# Setting a seed for reproducibility
#set.seed(1996)
n_sim <- 100
# Initial parameters
N <- 1E6 # Total population
I <- 1 # Number of Infectious at time 0
S <- N-1 # Number of Susceptibles at time 0
R <- 0 # Number of Recovered at time 0
# Data frame to store observations in
Df1 <- data.frame("final_size" = rep(NA, n_sim), "peak_size" = rep(NA, n_sim))
Df2 <- NULL
Df3 <- NULL
# Setting different values for R0, the basic reproduction number
R0 <- seq(0.5,2.5, 1)
#plot(Svec, type = "l", ylim = c(0, 1000000), col = "red")
#lines(Rvec, type = "l", col = "blue")
#lines(Ivec, type = "l")
#max(Ivec)
for (values in R0) {
  if (values == 0.5) {
    # Transmission parameters
    R0_value <- values # Basic reproduction number
    nu <- 1/6 # Recovery rate (in days)
    b <- nu*R0_value/N # Infection rate (in days)
    for (sim in n_sim) {
      # Binomial model
      #----------------
      # Initial states
      Sold = S # Number of Susceptibles at time t=0
      Iold = I # Number of Infectious at time t=0
      Rold = R # Number of Recovered at time t=0
      # Output vectors
      Svec = Sold; Ivec = Iold; Rvec = Rold
      stop = FALSE
      # Loop - continue until stop=TRUE
      while (!stop) {
        Ih = rbinom(1, Sold, (1-exp(-b*Iold)))
        Rh = rbinom(1, Iold, (1-exp(-nu)))
        Rh = nu*Iold
        Sold = Sold-Ih
        Iold = Iold+Ih-Rh
        Rold = Rold+Rh
        Svec = c(Svec, Sold)
        Ivec = c(Ivec, Iold)
        Rvec = c(Rvec, Rold)
        if (Iold <= 2e-5) {stop = TRUE}
      }
      peak_size_df <- max(Ivec)
      final_size_df <- Rvec[length(Rvec)]/N
      Df1[sim, "final_size"] <- final_size_df
      Df1[sim, "peak_size"] <- peak_size_df
    }
  }
}
This gives me 99 rows of NA and, in row 100, the values of the last simulation.
Any idea what is causing the NAs?
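One thing worth checking (an observation about the code as posted, not a confirmed answer): for(sim in n_sim) iterates over the single value 100 rather than over 1:n_sim, so the loop body runs once with sim = 100 and only Df1[100, ] is ever written. A quick illustration:
n_sim <- 100
for (sim in n_sim) print(sim) # one iteration: prints 100
for (sim in seq_len(n_sim)) {} # 100 iterations: sim = 1, 2, ..., 100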
I'm trying to use genetic algorithms via the "GA" package but I'm facing a problem in building the fitness function. I'm using the GA to simulate my data and get the best-fitting values for the constants in my model.
My data consists of observations of a car's speed and other parameters. Let's say I have a car that made two trips, and I want to build a model for it.
Each trip has multiple columns (speed, delta velocity with the opposite car, and range between the two cars). I have to take the first row of each trip and pass it to the equations in the fitness function; the equations then generate new values for speed, delta velocity, and range. I then have to use the new values to generate the next ones, compare the simulated distance with the old range in my data (the observed one), and have the GA minimize the difference.
First: here's my data.
https://drive.google.com/open?id=1923Jl6pDnQa_tGAluANUfIWCcyf85YVq
Second: here's my fitness function and the GA
library(dplyr) # for %>% and filter()
library(GA) # for ga()
Fitness_Function <- function(data, M_Acc, D_Speed, Beta, Com_Acc, Gap_J, D_Time){
  Trips_IDs <- sort(unique(data$FileName))
  # Trip=1;ROW=1
  Calibrated_DF <- data.frame()
  for (Trip in 1:2) {
    Trip_Data <- data %>% filter(FileName==Trips_IDs[Trip])
    attach(Trip_Data, warn.conflicts=F)
    for (ROW in 1:(nrow(Trip_Data)-1)) {
      if (ROW==1) {
        speed <- Filling_Speed[1]
        Delta_V <- Filling_DeltaVelocity[1]
        Dist <- Filling_Range[1]
        # M_Acc = 0.8418 ;D_Speed =29.2 ;Beta = 3.52
        # Com_Acc = 0.8150 ;Gap_J = 1.554 ;D_Time = 0.878
        Distance <- speed*D_Time - (speed*Delta_V)/(2*sqrt(M_Acc*Com_Acc))
        if (Distance < 0) {
          Distance <- 0
        }
        D_Gap <- Gap_J + Distance
        Acceleration <- M_Acc*(1-(speed/D_Speed)^Beta-(D_Gap/Dist)^2)
      } else {
        speed <- speed_C
        Delta_V <- Delta_V_C
        Dist <- Dist_c
        Distance <- speed*D_Time - (speed*Delta_V)/(2*sqrt(M_Acc*Com_Acc))
        if (is.na(Distance) | Distance < 0) { # guard against NA before comparing
          Distance <- 0
        }
        D_Gap <- Gap_J + Distance
        Acceleration <- M_Acc*(1-(speed/D_Speed)^Beta-(D_Gap/Dist)^2)
      }
      Lead_Veh_Speed_F <- Filling_Speed[ROW+1]+Filling_DeltaVelocity[ROW+1]
      speed_C <- speed + Acceleration*0.1
      Delta_V_C <- Lead_Veh_Speed_F-speed_C
      Dist_c <- Dist+(Delta_V_C+Delta_V)/2*0.1
      Calibrated_DF <- rbind(Calibrated_DF, c(Dist_c, ROW+1, Trips_IDs[Trip], Trip_Data$Filling_Range[ROW+1]))
    }
    detach(Trip_Data)
  }
  colnames(Calibrated_DF) <- c("C_Distance","row","Trip","Actual_Distance")
  Calibrated_DF$Dif <- (Calibrated_DF$C_Distance-Calibrated_DF$Actual_Distance)^2
  RMSPE <- sqrt(sum(Calibrated_DF$Dif)/sum(Calibrated_DF$Actual_Distance^2))
  return(RMSPE)
  # return(Calibrated_DF)
}
GA_Test <- ga(type='real-valued', lower=c( 0.1 , 1 , 1 , 0.1 , 0.1 , 0.1 ),
upper=c( 5 , 40 , 40 , 5 , 10 , 5 ), popSize=300, maxiter=300,run = 100,
keepBest=T, names = c("M_Acc", "D_Speed", "Beta", "Com_Acc", "Gap_J", "D_Time"),
fitness = function(b) -Fitness_Function(data, b[1],b[2], b[3],b[4],b[5],b[6]))
My problem is that the code is very long and very slow; even one iteration takes ages. I tried to use dplyr instead of for loops, but it seems impossible here, because I have to calculate the distance, then the acceleration, then the speed, and then calculate them all again for the following rows, and I couldn't find a way to do that with dplyr.
I'll post my draft dplyr code below, but it's not complete because I couldn't finish it.
So help please.
data <- data%>%group_by(Driver,FileName)%>%
mutate(Distance_Term = ifelse(row_number()==1,Speed_C*D_Time - (Speed_C*Delta_V_C)/(2*sqrt(M_Acc*Com_Acc)),0))
data <- data%>%mutate(Distance_Term = ifelse(Distance_Term < 0 , 0, Distance_Term))%>%
mutate(D_Gap = Gap_J + Distance_Term,Acceleration_C = M_Acc*(1-(Speed_C/D_Speed)^Beta-(D_Gap/Distance)^2))
Note: the FileName column is the trip ID. Also, my PC has good specifications, so the problem isn't my PC.
I've replaced the for loop with the accumulate2 function from purrr, so it's faster and more efficient. I got this answer from this question: Calculate variables using equations then use the generated values to generate new one
library(purrr) # for accumulate2() and map_df()
Objective_Function <- function(data, M_Acc, D_Speed, Beta, Com_Acc, Gap_J, D_Time){
  myfun <- function(list, lcs, lcs2){
    ds <- lcs - list[[1]] # delta velocity with the lead vehicle
    Distance <- list[[1]]*D_Time - (list[[1]] * ds) / (2*sqrt(M_Acc*Com_Acc))
    if (Distance < 0 | is.na(Distance)) {Distance <- 0}
    gap <- Gap_J + Distance # desired gap
    acc <- M_Acc * (1 - (list[[1]] / D_Speed)^Beta - (gap / list[[2]])^2)
    fcs_new <- list[[1]] + acc * 0.1 # new following-car speed
    ds_new <- lcs2 - fcs_new # new delta velocity
    di_new <- list[[2]] + (ds_new + ds)/2*0.1 # new distance
    return(list(Speed = fcs_new, Distance = di_new))
  }
  Generated_Data <- data %>% group_by(Driver, FileName) %>%
    mutate(Speed_Distance_Calibrated = accumulate2(
      .init = list(Filling_Speed[1], Filling_Range[1]),
      .x = Lead_Veh_Speed_F, .y = Lead_Veh_Speed_F2, myfun)[-1]) %>% ungroup()
  Generated_Data <- Generated_Data %>% group_by(Driver, FileName) %>%
    mutate(Speed_Distance_Calibrated = append(
      list(list(Speed = Filling_Speed[1], Distance = Filling_Range[1])),
      Speed_Distance_Calibrated[-length(Speed_Distance_Calibrated)])) %>% ungroup()
  Dif <- map_df(Generated_Data$Speed_Distance_Calibrated, `[`, 2)
  Generated_Data <- Generated_Data %>% mutate(Dif_sq = (Dif$Distance - Generated_Data$Filling_Range)^2)
  RMSPE <- sqrt(sum(Generated_Data$Dif_sq)/sum(Generated_Data$Filling_Range^2))
  return(RMSPE)
}
GA_Test <- ga(type='real-valued', lower=c( 0.1 , 1 , 1 , 0.1 , 0.1 , 0.1 ),
upper=c( 5 , 40 , 40 , 5 , 10 , 5 ), popSize=300, maxiter=300,run = 100,
keepBest=T, names = c("M_Acc", "D_Speed", "Beta", "Com_Acc", "Gap_J", "D_Time"),
fitness = function(b) -Objective_Function(data, b[1],b[2], b[3],b[4],b[5],b[6]),parallel = TRUE)
Summary <- summary(GA_Test)
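Before launching the GA, it can help to sanity-check the objective with a single direct evaluation (a suggestion; the constants below are taken from the commented-out lines in the original fitness function):
# one direct evaluation of the objective with plausible constants
Objective_Function(data, M_Acc = 0.8418, D_Speed = 29.2, Beta = 3.52,
                   Com_Acc = 0.8150, Gap_J = 1.554, D_Time = 0.878)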
I have encountered a problem using the mice function for multiple imputation. I want to do imputation on only part of the missing data, which, looking at the help, seems possible and straightforward. But I can't get it to work.
Here is the example:
I have some missing data on x and y:
library(mice)
plouf <- data.frame(ID = rep(LETTERS[1:10],each = 10), x = sample(10,100,replace = T), y = sample(10,100,replace = T))
plouf[sample(100,10),c("x","y")] <- NA
I want to impute only the missing data on y:
where <- data.frame(ID = rep(FALSE,100),x = rep(FALSE,100),y = is.na(plouf$y))
I do the imputation
plouf.imp <- mice(plouf, m = 1,method="pmm",maxit=5,where = where)
I look at the imputed values:
test <- complete(plouf.imp)
Here I still have NAs on y:
> sum(is.na(test$y))
[1] 10
If I use where to impute all missing values, it works:
where <- data.frame(ID = rep(FALSE,100),x = is.na(plouf$x),y = is.na(plouf$y))
plouf.imp <- mice(plouf, m = 1,method="pmm",maxit=5,where = where)
test <- complete(plouf.imp)
> sum(is.na(test$y))
[1] 0
but it does the imputation on x too, which I don't want in this specific case (for speed reasons, in a statistical simulation study).
Does anyone have any idea?
This is happening because of the code below:
plouf[sample(100,10),c("x","y")] <- NA
Let's consider your first case, wherein you want to impute y only. Check its PredictorMatrix:
plouf.imp <- mice(plouf, m = 1, method="pmm", maxit=5, where = where)
plouf.imp
#PredictorMatrix:
# ID x y
#ID 0 0 0
#x 0 0 0
#y 1 1 0
It says that y's missing values will be predicted from ID and x, since their values are 1 in row y.
Now check your sample data, where you populate NA in the x and y columns. Notice that wherever y is NA, x has an NA in the same row.
So when mice consults the PredictorMatrix to impute the y column, it encounters NA in x and ignores those rows, since all independent variables (i.e. ID and x) are expected to be non-missing in order to predict the outcome, i.e. the missing values in y.
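Since your sample code assigns NA to both columns at the same sampled rows, this is easy to verify on the original data:
# x and y are missing in exactly the same rows of the original sample data
all(is.na(plouf$x) == is.na(plouf$y))
#[1] TRUE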
Try this:
library(mice)
#sample data
set.seed(123)
plouf <- data.frame(ID = rep(LETTERS[1:10],each = 10), x = sample(10,100,replace = T), y = sample(10,100,replace = T))
plouf[sample(100,10), "x"] <- NA
set.seed(999)
plouf[sample(100,10), "y"] <- NA
#missing value imputation
whr <- data.frame(ID = rep(FALSE,100), x = rep(FALSE,100), y = is.na(plouf$y))
plouf.imp <- mice(plouf, m = 1, method="pmm", maxit=5, where = whr)
test <- complete(plouf.imp)
sum(is.na(test$y))
#[1] 1
Here only one value of y is left to be imputed, and in this case both x and y have an NA in the same row, i.e. row number 39 (similar to your first case).
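If that last value should be imputed as well, one option (a sketch, not part of the original answer) is to drop x as a predictor of y, so rows with a missing x can still be used:
pred <- plouf.imp$predictorMatrix
pred["y", "x"] <- 0 # y will be predicted from ID only (hypothetical tweak)
plouf.imp2 <- mice(plouf, m = 1, method = "pmm", maxit = 5,
                   where = whr, predictorMatrix = pred)
sum(is.na(complete(plouf.imp2)$y)) # expected to be 0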
I have a dataset where a subset of measurements for each entry are randomly missing:
dat <- matrix(runif(100), nrow=10)
rownames(dat) <- letters[1:10]
colnames(dat) <- paste("time", 1:10)
dat[sample(100, 25)] <- NA
I am interested in calculating correlations between each row in this dataset (i.e., a-a, a-b, a-c, a-d, ...). However, I would like to exclude correlations where there are fewer than 5 pairwise non-NA observations by setting their value to NA in the resulting correlation matrix.
Currently I am doing this as follows:
cor_mat <- cor(t(dat), use = 'pairwise.complete.obs') # avoid masking base::cor
names <- rownames(dat)
filter <- sapply(names, function(x1) sapply(names, function(x2)
  sum(!is.na(dat[x1,]) & !is.na(dat[x2,])) < 5))
cor_mat[filter] <- NA
However, this operation is very slow, as the actual dataset contains >1,000 entries.
Is there a way to filter cells based on the number of non-NA pairwise observations in a vectorized manner, instead of within nested loops?
You can count the number of non-NA pairwise observations using a matrix approach.
Let's use this data generation code. I made the data larger and added more NAs.
nr = 1000;
nc = 900;
dat = matrix(runif(nr*nc), nrow=nr)
rownames(dat) = paste(1:nr)
colnames(dat) = paste("time", 1:nc)
dat[sample(nr*nc, nr*nc*0.9)] = NA
Then your filter code takes 85 seconds:
tic = proc.time()
names = rownames(dat)
filter = sapply(names, function(x1) sapply(names, function(x2)
sum(!is.na(dat[x1,]) & !is.na(dat[x2,])) < 5));
toc = proc.time();
show(toc-tic);
# 85.50 seconds
My version creates a matrix with the value 1 for every non-NA in the original data. Then, using matrix multiplication, I calculate the number of pairwise non-NAs. It runs in a fraction of a second.
tic = proc.time()
NAmat = matrix(0, nrow = nr, ncol = nc)
NAmat[ !is.na(dat) ] = 1;
filter2 = (tcrossprod(NAmat) < 5)
toc = proc.time();
show(toc-tic);
# 0.09 seconds
Simple check shows the results are the same:
all(filter == filter2)
# TRUE
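The mask is then applied to the correlation matrix exactly as in the original approach:
cor_mat <- cor(t(dat), use = 'pairwise.complete.obs')
cor_mat[filter2] <- NA # drop correlations with fewer than 5 shared observations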
I need some help vectorizing the following code because I believe it will become more efficient. However, I do not know how to begin. I created a loop that goes through z; z has 3 columns and 112847 rows, which might be one reason it takes a long time. The 3 columns contain numbers that are used in the MACD() function.
library(quantmod)
library(TTR)
# get stock data
getSymbols('LUNA')
#Choose the Adjusted Close of a Symbol
stock <- Ad(LUNA)
#Create matrix for returns only
y <- stock
#Create a "MATRIX" by choosing the Adjusted Close
Nudata3 <- stock
#Sharpe Ratio Matrix
SR1<- matrix(NA, nrow=1)
# I want to create a table with all possible combinations from the ranges below
i = c(2:50)
k = c(4:50)
j = c(2:50)
# stores possible combinations into z
z <- expand.grid(i,k,j)
colnames(z)<- c("one","two","three")
n = 1
stretches <- length(z[,1])
while (n < stretches) {
  # go through all the parameter combinations in "z"
  Nuw <- MACD((stock), nFast=z[n,1], nSlow=z[n,2], nSig=z[n,3], maType="EMA")
  colnames(Nuw) <- c("MACD","Signal") # change the col names to create signals
  x <- na.omit(merge((stock), Nuw))
  x$sig <- NA
  # Create trading signals
  sig1 <- Lag(ifelse((x$MACD <= x$Signal), -1, 0)) # short when MACD < Signal
  sig2 <- Lag(ifelse((x$MACD >= x$Signal), 1, 0)) # long when MACD > Signal
  x$sig <- sig1 + sig2
  # Calculate returns
  ret <- na.omit(ROC(Ad(x))*x$sig)
  colnames(ret) <- c(paste(z[n,1],z[n,2],z[n,3],sep=","))
  x <- merge(ret,x)
  y <- merge(y,ret) # this creates a MATRIX with RETURNs ONLY
  Nudata3 <- merge(Nudata3, x)
  ANNUAL <- (mean(ret)/sd(ret)) * sqrt(252) # creates an annualized Sharpe ratio
  Shrat <- ANNUAL # stores the ratio in Shrat
  SR1 <- cbind(SR1, Shrat) # binds all ratios as it loops
  n <- n + 1
}
I would like to know how to vectorize the MACD() function to speed up the process, since the length of stretches is approximately 112847. It takes my computer quite some time to get through the loop.
First and foremost, a case-specific optimization: remove the cases where nFast > nSlow, as they don't make sense technically.
Secondly, you are creating objects and copying them over and over again. This is very expensive.
Thirdly, you can code this better by creating a matrix of signals in one loop and doing the rest of the operations in a vectorized manner.
I would code what you are doing something like this.
Please read the help pages of mapply, do.call, merge and sapply if you don't understand them.
require(quantmod)
getSymbols("LUNA")
#Choose the Adjusted Close of a Symbol
stock <- Ad(LUNA)
# I want to create a table with all possible combinations from the ranges below
i = c(2:50)
k = c(4:50)
j = c(2:50)
# stores possible combinations into z
z <- expand.grid(i,k,j)
IMO, this is where your first optimization should be: remove the cases where i >= k.
z <- z[z[,1]<z[,2], ]
This reduces the number of cases from 112847 to 57575.
#Calculate only once. No need to calculate this in every iteration.
stockret <- ROC(stock)
getStratRet <- function(nFast, nSlow, nSig, stock, stockret) {
  x <- MACD((stock), nFast=nFast, nSlow=nSlow, nSig=nSig, maType="EMA")
  x <- na.omit(x)
  sig <- Lag(ifelse((x$macd <= x$signal), -1, 0)) + Lag(ifelse((x$macd >= x$signal), 1, 0))
  return(na.omit(stockret * sig))
}
RETURNSLIST <- do.call(merge, mapply(FUN = getStratRet, nFast = z[,1], nSlow = z[,2], nSig = z[,3], MoreArgs = list(stock = stock, stockret = stockret), SIMPLIFY = TRUE))
getAnnualSharpe <- function(ret) {
  ret <- na.omit(ret)
  return((mean(ret)/sd(ret)) * sqrt(252))
}
SHARPELIST <- sapply(RETURNSLIST, FUN = getAnnualSharpe)
Results will be as below. Which column belongs to which combo of i, j, k is trivial.
head(RETURNSLIST[, 1:3])
## LUNA.Adjusted LUNA.Adjusted.1 LUNA.Adjusted.2
## 2007-01-10 0.012739026 -0.012739026 0
## 2007-01-11 -0.051959739 0.051959739 0
## 2007-01-12 -0.007968170 -0.007968170 0
## 2007-01-16 -0.007905180 -0.007905180 0
## 2007-01-17 -0.005235614 -0.005235614 0
## 2007-01-18 0.028315920 -0.028315920 0
SHARPELIST
## LUNA.Adjusted LUNA.Adjusted.1 LUNA.Adjusted.2 LUNA.Adjusted.3 LUNA.Adjusted.4 LUNA.Adjusted.5 LUNA.Adjusted.6
## 0.04939150 -0.07428392 NaN 0.02626382 -0.06789803 -0.22584987 -0.07305477
## LUNA.Adjusted.7 LUNA.Adjusted.8 LUNA.Adjusted.9
## -0.05831643 -0.08864845 -0.08221986
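If you want the columns labelled by their parameter combination, as the original loop did with paste, something like this should work:
# label each strategy column with its nFast,nSlow,nSig combination
labels <- paste(z[,1], z[,2], z[,3], sep = ",")
colnames(RETURNSLIST) <- labels[seq_len(ncol(RETURNSLIST))]
names(SHARPELIST) <- colnames(RETURNSLIST)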
system.time({
  RETURNSLIST <- do.call(merge, mapply(FUN = getStratRet, nFast = z[1:100,1], nSlow = z[1:100,2], nSig = z[1:100,3], MoreArgs = list(stock = stock, stockret = stockret), SIMPLIFY = TRUE))
  SHARPELIST <- sapply(RETURNSLIST, FUN = getAnnualSharpe)
})
user  system elapsed
2.28    0.00    2.29