I'm writing an agent-based model for phytoplankton. Right now it is fairly basic and includes functions for growth, uptake of a resource ("S" in the code), division, mortality, and a function to keep track of the total extracellular resource concentration outside of the cells (q represents the amount of resource in an individual; I sum the q of all the individuals and subtract it from the total S). It has become quite a time hog: all the for loops make it take more than 24 hours to run, when it should be much faster. Are there ways I can optimize my code to help it run faster?
library(dplyr)  # for filter() below

uptake <- function(x){
  vmax <- x[1]
  km <- x[2]
  S <- x[3]
  result <- vmax*(S/(km+S))
  return(result)
}

growth <- function(x){
  mumax <- x[1]
  qnaught <- x[2]
  q <- x[3]
  result <- mumax*(1-(qnaught/q))
  return(result)
}

division <- function(x){
  mu <- x[1]
  m <- x[2]
  result <- mu*m
  return(result)
}

extracellular <- function(x){
  S <- x[1]
  flow_rate <- x[2]
  nutrient_inflow <- x[3]
  quota_per_time <- x[4]
  result <- ((flow_rate*nutrient_inflow)-(flow_rate*S)-quota_per_time)
  return(result)
}
## Initial agents and time step parameters
num_agents <- 200
num_time_steps <- 400
## Set up data frame for agents
agents <- data.frame("ID" = 1:num_agents, "vmax" = 0.0000014, "km" = 17,
                     "mumax" = 1.1, "qnaught" = 0.00000036, "cell_size" = 0.00075,
                     "alive" = 1)
## Set up data frame for state variables
state <- data.frame("S" = 1.2)
agents <- cbind(agents, state)
state$flow_rate <- 0.86
state$nutrient_inflow <- 1.4
## Set up other parameter values
## q represents the amount of resource in an individual
q <- rep(0, num_agents)
size <- rep(0, num_agents)
m0 <- 0.00075
output <- data.frame()
agents_time <- data.frame()
## For loop for model
for (i in 1:num_time_steps) {
  ## mortality of agents in each time step
  for (k in agents$ID){
    random <- runif(1, min = 1, max = 100)
    if (random < 10.0){
      agents[k, 7] = 0  # column 7 is "alive"
    }
  }
  agents_alive <- filter(agents, alive == 1)
  ## update S from state data frame
  agents_alive$S <- state$S
  q <- q[1:nrow(agents_alive)]
  ## sum uptake per time step to use for extracellular concentration
  quota_per_agent <- apply(agents_alive[, c(2, 3, 8)], MARGIN = 1, FUN = function(x) uptake(x))
  quota_per_time <- sum(quota_per_agent)
  q <- q + quota_per_agent
  tmp <- cbind(agents_alive, q)
  mu <- apply(tmp[, c(4, 5, 9)], MARGIN = 1, FUN = function(x) growth(x))
  tmp <- cbind(tmp, mu)
  size <- size[1:nrow(agents_alive)]
  size <- size + apply(tmp[, c(6, 10)], MARGIN = 1, FUN = function(x) division(x))
  ## if size calculated by the division function is below the minimum cell size (0.00075),
  ## replace with the minimum cell size
  size <- ifelse(size < 0.00075, 0.00075, size)
  index = 1
  ## division by cell size
  for (j in size){
    if (j > (2*tmp[index, 6])){
      size[index] = m0
      q[index] = (q[index]/2)
      last <- tail(agents$ID, n = 1)
      new <- c(last + 1, 0.0000014, 17,
               1.1, 0.00000036, m0, 1, state$S)
      agents <- rbind(agents, new)
      agents_alive <- rbind(agents_alive, new)
      q <- append(q, q[index])
      mu <- append(mu, mu[index])
      size <- append(size, size[index])
    }
    index = index + 1
  }
  ## update extracellular nutrient concentration state variable
  state$quota_per_time <- quota_per_time
  new_S <- state$S + apply(state, MARGIN = 1, FUN = function(x) extracellular(x))
  state <- replace(state, 1, new_S)
  if (state$S < 0){
    state$S <- 0
  }
  ## data frame of outputs for each time step
  out_per_time <- data.frame("ID" = agents_alive$ID,
                             "vmax" = agents_alive$vmax,
                             "km" = agents_alive$km,
                             "S" = agents_alive$S,
                             "mumax" = agents_alive$mumax,
                             "qnaught" = agents_alive$qnaught,
                             "time" = rep(i, nrow(agents_alive)),
                             "q" = q,
                             "mu" = mu,
                             "cell_size" = size,
                             "alive" = agents_alive$alive)
  ## row-append to output data frame that stores outputs for all agents and all time steps
  output <- rbind(output, out_per_time)
}
Using data.table to implement the model will speed things up massively. A lot of the time here is spent copying data within the data.frames, which also uses a lot of memory. data.tables modify in place (by reference) and are much, much faster. I've done my best to understand the logic and replicate it; where I may be off, hopefully it's an easy adjustment. Further, a number of for loops have been avoided by using data.table := ("set") operations and by using mapply instead of the apply functions.
library(data.table)
uptake2 <- function(vmax, km, S){
  vmax*(S/(km+S))
}
growth2 <- function(mumax, qnaught, q){
  mumax*(1-(qnaught/q))
}
division2 <- function(mu, cell_size, min_cell_size){
  max(mu*cell_size, min_cell_size)
}
extracellular2 <- function(S, flow_rate, nutrient_inflow, quota_per_time){
  max(((flow_rate*nutrient_inflow)-(flow_rate*S)-quota_per_time), 0)
}
## Initial agents and time step parameters
num_agents <- 10
num_time_steps <- 400
min_cell_size <- 0.00075
state <- data.table(S = 1.2)
## Set up data frame for agents
agents <- data.table(ID = 1:num_agents,
                     vmax = 0.0000014,
                     km = 17,
                     mumax = 1.1,
                     qnaught = 0.00000036,
                     min_cell_size = min_cell_size,
                     alive = 1,
                     S = state$S)
state[, flow_rate := 0.86]
state[, nutrient_inflow := 1.4]
## Set up empty output table
output <- data.table(ID = numeric(0),
                     vmax = numeric(0),
                     km = numeric(0),
                     S = numeric(0),
                     mumax = numeric(0),
                     qnaught = numeric(0),
                     time = numeric(0),
                     q = numeric(0),
                     mu = numeric(0),
                     cell_size = numeric(0),
                     alive = numeric(0))
## For loop for model
for (i in 1:num_time_steps) {
  ## mortality of agents in each time step
  for (k in agents$ID){
    random <- runif(1, min = 1, max = 100)
    if (random < 10.0){
      agents[k, alive := 0]
    }
  }
  agents_alive <- agents[alive == 1]
  if (nrow(agents_alive) == 0) next
  ## sum uptake per time step to use for extracellular concentration
  agents_alive[, q := mapply(uptake2,
                             vmax = vmax,
                             km = km,
                             S = state$S)]
  quota_per_time <- sum(agents_alive$q)
  agents_alive[, mu := mapply(growth2,
                              mumax = mumax,
                              qnaught = qnaught,
                              q = q)]
  agents_alive[, size := mapply(division2,
                                mu = mu,
                                cell_size = min_cell_size,
                                min_cell_size = min_cell_size)]
  ## division by cell size
  create_new_agents <- nrow(agents_alive[size > 2*min_cell_size])
  if (create_new_agents > 0) {
    new_agents <- data.table(ID = max(agents$ID) + (1:create_new_agents),
                             vmax = 0.0000014,
                             km = 17,
                             mumax = 1.1,
                             qnaught = 0.00000036,
                             min_cell_size = min_cell_size,
                             alive = 1,
                             S = state$S)
    agents_alive[size > 2*min_cell_size,
                 `:=`(size = min_cell_size,
                      q = q/2)]
    agents <- rbindlist(list(agents, new_agents))
    ## give the new agents the per-step columns before appending to agents_alive
    ## (q and mu start at 0 here; an assumption so that the columns line up)
    new_agents[, `:=`(q = 0, mu = 0, size = min_cell_size)]
    agents_alive <- rbindlist(list(agents_alive, new_agents))
  }
  ## update extracellular nutrient concentration state variable
  state$quota_per_time <- quota_per_time
  state[, S := extracellular2(S = S,
                              flow_rate = flow_rate,
                              nutrient_inflow = nutrient_inflow,
                              quota_per_time = quota_per_time)]
  ## row-append to output data.table that stores outputs for all agents and all time steps
  output <- rbindlist(list(output,
                           agents_alive[, .(ID,
                                            vmax,
                                            km,
                                            S,
                                            mumax,
                                            qnaught,
                                            time = i,
                                            q,
                                            mu,
                                            cell_size = size,
                                            alive)]))
  print(i)
}
You can likely avoid the internal for loop for mortality by drawing all the uniform random variates at once. You could try (perhaps with some adjustment) something like:
agents$random <- runif(length(agents$ID), min = 1, max = 100)
agents$alive[agents$random < 10] <- 0
head(agents)
ID vmax km mumax qnaught cell_size alive S random
1 1 1.4e-06 17 1.1 3.6e-07 0.00075 0 1.2 5.313743
2 2 1.4e-06 17 1.1 3.6e-07 0.00075 1 1.2 78.682158
3 3 1.4e-06 17 1.1 3.6e-07 0.00075 1 1.2 49.320952
4 4 1.4e-06 17 1.1 3.6e-07 0.00075 1 1.2 56.237847
5 5 1.4e-06 17 1.1 3.6e-07 0.00075 0 1.2 5.142588
6 6 1.4e-06 17 1.1 3.6e-07 0.00075 1 1.2 99.543501
If you still need the outer for-loop to iterate at each time step, you can essentially erase the "random" column afterwards with:
agents$random <- NULL
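If you do not need to keep the draws around at all, the same idea fits in one line (assuming the ~10% per-step mortality from the original code):

agents$alive[runif(nrow(agents), min = 1, max = 100) < 10] <- 0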
I have to choose 10 elements of a vector to maximize a function. Since the vector is pretty long, there are too many possibilities (~1000 choose 10) to compute them all, so I started to look into the GA package to use a genetic algorithm.
I came up with this MWE:
library(GA)

values <- 1:1000
# Fitness function which I want to maximise
f <- function(x){
  # Choose values
  y <- values[x]
  # From the first 10, sum up the odd values.
  y <- ifelse(y %% 2 != 0, y, 0)
  y <- y[1:10]
  return(sum(y))
}
# Maximum value of f for this example
y <- ifelse(values %% 2 != 0, values, 0)
sum(sort(y, decreasing = TRUE)[1:10])
# [1] 9900
# Genetic algorithm
GA <- ga(type = "permutation", fitness = f, lower = rep(1, 10), upper = rep(1000, 10), maxiter = 100)
summary(GA)
The results are a bit underwhelming. From summary(GA), I get the feeling that the algorithm always permutes all 1000 values (the solution goes from x1 to x1000), which leads to an inefficient optimization. How can I tell the algorithm that it should only use 10 values (so the solution is x1 .. x10)?
You should read https://www.jstatsoft.org/article/view/v053i04. You don't have a permutation problem but a selection one, hence you should use the binary type of genetic algorithm. Because you want to select exactly 10 values (10 ones and 990 zeroes), you should probably write your own genetic operators, because that is a constraint that will hardly ever be satisfied by the default operators (even with a -Inf penalty in the fitness function for chromosomes that do not have exactly 10 ones). One approach:
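For completeness, the -Inf penalty just mentioned could look like the sketch below (a hypothetical f_penalized, equivalent to the original fitness when exactly 10 ones are present); the custom operators that follow are usually far more efficient, because with the default operators almost every chromosome would be penalized:

f_penalized <- function(x, values){
  if (sum(x) != 10) return(-Inf)  # invalid chromosome: not exactly 10 ones
  y <- values[which(x == 1)]
  sum(y[y %% 2 != 0])             # sum only the odd selected values
}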
Population (k tells how many ones you want):
myInit <- function(k){
  function(GA){
    m <- matrix(0, ncol = GA@nBits, nrow = GA@popSize)
    for(i in seq_len(GA@popSize))
      m[i, sample(GA@nBits, k)] <- 1
    m
  }
}
Crossover:
library(magrittr)  # provides %>%

myCrossover <- function(GA, parents){
  parents <- GA@population[parents,] %>%
    apply(1, function(x) which(x == 1)) %>%
    t()
  parents_diff <- list("vector", 2)
  parents_diff[[1]] <- setdiff(parents[2,], parents[1,])
  parents_diff[[2]] <- setdiff(parents[1,], parents[2,])
  children_ind <- list("vector", 2)
  for(i in 1:2){
    k <- length(parents_diff[[i]])
    # guard: identical parents have no differing genes, so nothing to swap
    change_k <- if(k > 0) sample(k, sample(ceiling(k/2), 1)) else integer(0)
    children_ind[[i]] <- if(length(change_k) > 0){
      c(parents[i, -change_k], parents_diff[[i]][change_k])
    } else {
      parents[i,]
    }
  }
  children <- matrix(0, nrow = 2, ncol = GA@nBits)
  for(i in 1:2)
    children[i, children_ind[[i]]] <- 1
  list(children = children, fitness = c(NA, NA))
}
Mutation:
myMutation <- function(GA, parent){
  ind <- which(GA@population[parent,] == 1)
  n_change <- sample(3, 1)
  ind[sample(length(ind), n_change)] <- sample(setdiff(seq_len(GA@nBits), ind), n_change)
  parent <- integer(GA@nBits)
  parent[ind] <- 1
  parent
}
Fitness (your function adapted for binary GA):
f <- function(x, values){
  ind <- which(x == 1)
  y <- values[ind]
  y <- ifelse(y %% 2 != 0, y, 0)
  y <- y[1:10]
  return(sum(y))
}
GA:
GA <- ga(
  type = "binary",
  fitness = f,
  values = values,
  nBits = length(values),
  population = myInit(10),
  crossover = myCrossover,
  mutation = myMutation,
  run = 300,
  pmutation = 0.3,
  maxiter = 10000,
  popSize = 100
)
Chosen values:
values[which(GA@solution[1,] == 1)]
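If the run converged to the optimum, these should be the ten largest odd numbers, and their sum should match the maximum computed in the question:

sum(values[which(GA@solution[1,] == 1)])
# 9900 if the optimum was reached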
I have been struggling with this for so long. I have a logistic growth function where the growth parameter
r is a matrix. The model is constructed in a way that I have two N as output, N1 and N2.
I would like to be able to change the r parameter over time. When time < 50 I would like
r = r1, where
r1 = matrix(c(2, 3), nrow = 1, ncol = 2)
When time >= 50 I would like r = r2, where
r2 = matrix(c(1, 2), nrow = 1, ncol = 2)
Here is my function. Any help is highly appreciated.
rm(list = ls())
library(deSolve)
model <- function(time, y, params) {
  with(as.list(c(y, params)),{
    N = y[paste("N", 1:2, sep = "")]
    dN <- r*N*(1-N/K)
    return(list(c(dN)))
  })
}
r = matrix(c(4, 5), nrow = 1, ncol = 2)
K=100
params <- list(r,K)
y<- c(N1=0.1, N2=0.2)
times <- seq(0,100,1)
out <- ode(y, times, model, params)
plot(out)
Ideally I would like something like this, but it does not work:
model <- function(time, y, params) {
  with(as.list(c(y, params)),{
    N = y[paste("N", 1:2, sep = "")]
    r = ifelse(times < 10, matrix(c(1,3), nrow = 1, ncol = 2),
               ifelse(times > 10, matrix(c(1,4), nrow = 1, ncol = 2), matrix(c(1,2), nrow = 1, ncol = 2)))
    print(r)
    dN <- r*N*(1-N/K)
    return(list(c(dN)))
  })
}
Thank you for your time.
Here is a generic approach that uses an extended version of the approx function. Note also some further simplifications of the model function and the additional plot of the parameter values.
Edit: changed according to the suggestion of Lewis Carter to make the parameter change at t = 3, so that the effect can be seen.
library(simecol) # contains approxTime, a vector version of approx
model <- function(time, N, params) {
  r <- approxTime(params$signal, time, rule = 2, f = 0, method = "constant")[-1]
  K <- params$K
  dN <- r*N*(1-N/K)
  return(list(c(dN), r))
}
signal <- matrix(
# time, r[1, 2],
c( 0, 2, 3,
3, 1, 2,
100, 1, 2), ncol=3, byrow=TRUE
)
## test of the interpolation
approxTime(signal, c(1, 2.9, 3, 100), rule = 2, f=0, method="constant")
params <- list(signal = signal, K = 100)
y <- c(N1=0.1, N2=0.2)
times <- seq(0, 10, 0.1)
out <- ode(y, times, model, params)
plot(out)
For a small number of state variables, as in the example, separate signals built with approxfun from package stats will look less generic but may be slightly faster; see the sketch below.
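For illustration, a sketch of that approxfun variant for the same two signals (constant interpolation with the change at t = 3; the names r1_fun, r2_fun and model_af are chosen here):

r1_fun <- approxfun(c(0, 3), c(2, 1), method = "constant", rule = 2)
r2_fun <- approxfun(c(0, 3), c(3, 2), method = "constant", rule = 2)

model_af <- function(time, N, params) {
  r <- c(r1_fun(time), r2_fun(time)) # piecewise-constant parameters
  dN <- r*N*(1-N/params$K)
  return(list(c(dN), r))
}

out_af <- ode(y, times, model_af, list(K = 100))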
As a further improvement, one may consider replacing the "hard" transitions with smoother ones. Such a transition can then be formulated directly as a function, without the need for approx, approxfun, or approxTime. For example:
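A sketch of such a smooth formulation, using a logistic ramp between the two parameter sets (the midpoint t0 = 3 and the steepness k = 10 are arbitrary choices here):

r_smooth <- function(time, r_from = c(2, 3), r_to = c(1, 2), t0 = 3, k = 10) {
  w <- 1/(1 + exp(-k*(time - t0))) # ~0 before t0, ~1 after
  (1 - w)*r_from + w*r_to
}

model_smooth <- function(time, N, params) {
  r <- r_smooth(time)
  dN <- r*N*(1-N/params$K)
  return(list(c(dN), r))
}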
Edit 2:
Package simecol imports deSolve, and we need only a small function from it. So instead of loading simecol it is also possible to include the approxTime function explicitly in the code. The conversion from data frame to matrix improves performance, but a matrix is preferred anyway in such cases.
approxTime <- function(x, xout, ...) {
  if (is.data.frame(x)) {x <- as.matrix(x); wasdf <- TRUE} else wasdf <- FALSE
  if (!is.matrix(x)) stop("x must be a matrix or data frame")
  m <- ncol(x)
  y <- matrix(0, nrow = length(xout), ncol = m)
  y[,1] <- xout
  for (i in 2:m) {
    y[,i] <- as.vector(approx(x[,1], x[,i], xout, ...)$y)
  }
  if (wasdf) y <- as.data.frame(y)
  names(y) <- dimnames(x)[[2]]
  y
}
If you want to pass a matrix parameter, you should pass a list of parameters; you can then modify r inside the model function when your time limit is exceeded (in the example below you don't even have to pass the r matrix to the model function):
library(deSolve)
model <- function(time, y, params) {
  with(as.list(c(y, params)),{
    if(time < 3) r = matrix(c(2,3), nrow = 1, ncol = 2)
    else r = matrix(c(1,3), nrow = 1, ncol = 2)
    N = y[paste("N", 1:2, sep = "")]
    dN <- r*N*(1-N/K)
    return(list(c(dN)))
  })
}
y <- c(N1=0.1, N2=0.2)
params <- list(r = matrix(c(0,0), nrow = 1, ncol = 2), K=100)
times <- seq(0,10,0.1)
out <- ode(y, times, model, params)
plot(out)
You can see examples of this approach, for instance, with delay differential equations (?dede).
I have tried to improve my previous code so that I can incorporate conditional probability.
Source Code
states <- c(1, 2)
alpha <- c(1, 1)/2
mat <- matrix(c(0.5, 0.5,
                0,   1), nrow = 2, ncol = 2, byrow = TRUE)
# this function calculates the next state, if present state is given.
# X = present states
# pMat = probability matrix
nextX <- function(X, pMat)
{
  #set.seed(1)
  probVec <- vector() # initialize vector
  if(X == states[1]) # if the present state is 1
  {
    probVec <- pMat[1,] # take the 1st row
  }
  if(X == states[2]) # if the present state is 2
  {
    probVec <- pMat[2,] # take the 2nd row
  }
  return(sample(states, 1, replace = TRUE, prob = probVec)) # calculate the next state
}
# this function simulates n1 steps (5 in this example)
steps <- function(alpha1, mat1, n1)
{
  vec <- vector(mode = "numeric", length = n1+1) # initialize an empty vector
  X <- sample(states, 1, replace = TRUE, prob = alpha1) # initial state
  vec[1] <- X
  for (i in 2:(n1+1))
  {
    X <- nextX(X, mat1)
    vec[i] <- X
  }
  return (vec)
}
# this function repeats the simulation n1 times.
# steps(alpha1=alpha, mat1=mat, n1=5)
simulate <- function(alpha1, mat1, n1)
{
  mattt <- matrix(nrow = n1, ncol = 6, byrow = T)
  for (i in 1:(n1))
  {
    temp <- steps(alpha1, mat1, 5)
    mattt[i,] <- temp
  }
  return (mattt)
}
Execution
I created this function so that it can handle any conditional probability:
prob <- function(simMat, fromStep, toStep, fromState, toState)
{
  mean(simMat[toStep+1, simMat[fromStep+1, ]==fromState]==toState)
}
sim <- simulate(alpha, mat, 10)
p <- prob(sim, 0,1,1,1) # P(X1=1|X0=1)
p
Output
NaN
Why is this source code giving NaN?
How can I correct it?
I didn't inspect the rest of your code, but it seems that only prob has a mistake: you are mixing up rows with columns. Instead it should be
prob <- function(simMat, fromStep, toStep, fromState, toState)
mean(simMat[simMat[, fromStep + 1] == fromState, toStep + 1] == toState)
Then NaN still remains a valid possibility for the following reason. We are looking at a conditional probability P(X1=1|X0=1) which, by definition, is well defined only when P(X0=1)>0. The same holds with sample estimates: if there are no cases where X0=1, then the "denominator" in the mean inside of prob is zero. Thus, it cannot and should not be fixed (i.e., returning 0 in those cases would be wrong).
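If you prefer an explicit NA over NaN when the conditioning event never occurs in the simulated sample, a small guard makes that case visible (a sketch; as noted, returning 0 instead would be wrong):

prob <- function(simMat, fromStep, toStep, fromState, toState)
{
  idx <- simMat[, fromStep + 1] == fromState
  if (!any(idx)) return(NA_real_) # conditioning event never observed
  mean(simMat[idx, toStep + 1] == toState)
}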
I would like to iterate through vectors of values and calculate something for every value while being within a function environment in R. For example:
# I have costs for 3 companies
c <- c(10, 20, 30)
# I have the same revenue across all 3
r <- 100
# I want to obtain the profits for all 3 within one variable
result <- list()
# I could do this in a for loop
for(i in 1:3){
  result[i] <- r - c[i]
}
Now lets assume I have a model that is very long and I define everything as a function which is to be solved with various random draws for the costs.
# Random draws
n <- 1000
r <- rnorm(n, mean = 100, sd = 10)
c1 <- rnorm(n, mean = 10, sd = 1)
c2 <- rnorm(n, mean = 20, sd = 2)
c3 <- rnorm(n, mean = 30, sd = 3)
X <- data.frame(r, c1, c2, c3)
fun <- function(x){
  r <- x[1]
  c <- c(x[2], x[3], x[4])
  for(i in 1:3){
    result[i] <- r - c[i]
  }
  return(result)
}
I could then evaluate the result for all draws by iterating through the rows of randomly sampled input data.
for(j in 1:n){
  x <- X[j,]
  y <- fun(x)
}
In this example, the output variable y would contain the nested result variable, which comprises the results for all 3 companies. However, my approach results in an error, and I think it has to do with the fact that I try to return a nested variable. Hence my question: how would you approach something like this?
I would suggest rethinking your coding approach. This is a very un-R-like way of doing things.
For example, the first for loop can be written much more succinctly as
x <- c(10, 20, 30)
r <- 100
result <- lapply(-x, `+`, r)
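For the toy numbers above, this produces a list holding the three profits:

unlist(result)
#> [1] 90 80 70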
Then fun becomes something like
fun <- function(x) lapply(-x[-1], `+`, x[1])
To then operate over the rows of a data.frame (which is what you seem to do in the last step), you can use something like
apply(X, 1, fun)
where the MARGIN = 1 argument in apply ensures that you are applying a function per row (as opposed to per column).
Here's an approach using your function and a for loop:
# Random draws
n <- 1000
r <- rnorm(n, mean = 100, sd = 10)
c1 <- rnorm(n, mean = 10, sd = 1)
c2 <- rnorm(n, mean = 20, sd = 2)
c3 <- rnorm(n, mean = 30, sd = 3)
X <- data.frame(r, c1, c2, c3)
result <- list()
fun <- function(x){
  r <- x[[1]]
  c <- c(x[[2]], x[[3]], x[[4]])
  for(i in 1:3){
    result[i] <- r - c[i]
  }
  return(result)
}
# Create a list to store results
profits <- rep(list(1:3), nrow(X))
# Loop through each row of the dataframe and store in profits.
for(i in 1:nrow(X)){
  profits_temp <- fun(list(X[i,"r"], X[i,"c1"], X[i,"c2"], X[i,"c3"]))
  for(j in 1:3)
    profits[[i]][[j]] <- profits_temp[[j]]
}
# Eye results
profits[[1]]
#> [1] 93.23594 81.25731 70.27699
profits[[2]]
#> [1] 80.50516 69.27517 63.36439