I would like to create a map showing the bi-variate spatial correlation between two variables. This could be done either by doing a LISA map of bivariate Moran's I spatial correlation or using the L index proposed by Lee (2001).
The bivariate Moran's I is not implemented in the spdep library, but the L index is, so here is what I've tried, without success, using the L index. An answer showing a solution based on Moran's I would also be very welcome!
As you can see from the reproducible example below, I've managed so far to calculate the local L indices. What I would like to do is to estimate the pseudo p-values and create a map of the results, like the maps we use for LISA spatial clusters with high-high, high-low, ..., low-low categories.
In this example, the goal is to create a map of the bivariate LISA association between the black and white populations. The map should be created in ggplot2, showing the clusters:
High-presence of black and High-presence of white people
High-presence of black and Low-presence of white people
Low-presence of black and High-presence of white people
Low-presence of black and Low-presence of white people
Reproducible example
library(UScensus2000tract)
library(ggplot2)
library(spdep)
library(sf)
library(dplyr)   # needed below for %>% and if_else()
# load data
data("oregon.tract")
# plot Census Tract map
plot(oregon.tract)
# Variables to use in the correlation: white and black population in each census tract
x <- scale(oregon.tract$white)
y <- scale(oregon.tract$black)
# create Queen contiguity matrix and Spatial weights matrix
nb <- poly2nb(oregon.tract)
lw <- nb2listw(nb)
# Lee index
Lxy <- lee(x, y, lw, length(x), zero.policy=TRUE)
# Lee’s L statistic (Global)
Lxy[1]
#> -0.1865688811
# 10k permutations to estimate pseudo p-values
LMCxy <- lee.mc(x, y, nsim=10000, lw, zero.policy=TRUE, alternative="less")
# quick plot of local L
Lxy[[2]] %>% density() %>% plot() # Lee’s local L statistic (Local)
LMCxy[[7]] %>% density() %>% lines(col="red") # plot values simulated 10k times
# get confidence interval of 95% ( mean +- 2 standard deviations)
two_sd_above <- mean(LMCxy[[7]]) + 2 * sd(LMCxy[[7]])
two_sd_below <- mean(LMCxy[[7]]) - 2 * sd(LMCxy[[7]])
# convert spatial object to sf class for easier/faster use
oregon_sf <- st_as_sf(oregon.tract)
# add L index values to map object
oregon_sf$Lindex <- Lxy[[2]]
# identify significant local results
oregon_sf$sig <- if_else( oregon_sf$Lindex < two_sd_below, 1, if_else( oregon_sf$Lindex > two_sd_above, 1, 0))
# Map of Local L index but only the significant results
ggplot() + geom_sf(data=oregon_sf, aes(fill=ifelse( sig==T, Lindex, NA)), color=NA)
What about this?
I'm using the regular Moran's I instead of the Lee index you suggest, but I think the underlying reasoning is pretty much the same.
As you can see below, the results produced this way look very much like those coming from GeoDa.
library(dplyr)
library(ggplot2)
library(sf)
library(spdep)
library(rgdal)
library(stringr)
library(UScensus2000tract)
#======================================================
# load data
data("oregon.tract")
# Variables to use in the correlation: white and black population in each census tract
x <- oregon.tract$white
y <- oregon.tract$black
#======================================================
# Programming some functions
# Bivariate Moran's I
moran_I <- function(x, y = NULL, W){
  if(is.null(y)) y = x
  xp <- (x - mean(x, na.rm=T))/sd(x, na.rm=T)
  yp <- (y - mean(y, na.rm=T))/sd(y, na.rm=T)
  W[which(is.na(W))] <- 0
  n <- nrow(W)
  global <- (xp%*%W%*%yp)/(n - 1)
  local <- (xp*W%*%yp)
  list(global = global, local = as.numeric(local))
}
# Permutations for the Bivariate Moran's I
simula_moran <- function(x, y = NULL, W, nsims = 1000){
  if(is.null(y)) y = x
  n = nrow(W)
  IDs = 1:n
  xp <- (x - mean(x, na.rm=T))/sd(x, na.rm=T)
  W[which(is.na(W))] <- 0
  global_sims = NULL
  local_sims = matrix(NA, nrow = n, ncol = nsims)
  ID_sample = sample(IDs, size = n*nsims, replace = T)
  y_s = y[ID_sample]
  y_s = matrix(y_s, nrow = n, ncol = nsims)
  y_s <- (y_s - apply(y_s, 1, mean))/apply(y_s, 1, sd)
  global_sims <- as.numeric( (xp%*%W%*%y_s)/(n - 1) )
  local_sims <- (xp*W%*%y_s)
  list(global_sims = global_sims,
       local_sims = local_sims)
}
#======================================================
# Adjacency Matrix (Queen)
nb <- poly2nb(oregon.tract)
lw <- nb2listw(nb, style = "B", zero.policy = T)
W <- as(lw, "symmetricMatrix")
W <- as.matrix(W/rowSums(W))
W[which(is.na(W))] <- 0
#======================================================
# Calculating the index and its simulated distribution
# for global and local values
m <- moran_I(x, y, W)
m[[1]] # global value
m_i <- m[[2]] # local values
local_sims <- simula_moran(x, y, W)$local_sims
# Identifying the significant values
alpha <- .05 # for a 95% confidence interval
probs <- c(alpha/2, 1-alpha/2)
intervals <- t( apply(local_sims, 1, function(x) quantile(x, probs=probs)))
sig <- ( m_i < intervals[,1] ) | ( m_i > intervals[,2] )
#======================================================
# Preparing for plotting
oregon.tract <- st_as_sf(oregon.tract)
oregon.tract$sig <- sig
# Identifying the LISA patterns
xp <- (x-mean(x))/sd(x)
yp <- (y-mean(y))/sd(y)
patterns <- as.character( interaction(xp > 0, W%*%yp > 0) )
patterns <- patterns %>%
str_replace_all("TRUE","High") %>%
str_replace_all("FALSE","Low")
patterns[oregon.tract$sig==0] <- "Not significant"
oregon.tract$patterns <- patterns
# Plotting
ggplot() + geom_sf(data=oregon.tract, aes(fill=patterns), color="NA") +
scale_fill_manual(values = c("red", "pink", "light blue", "dark blue", "grey95")) +
theme_minimal()
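One optional refinement, sketched under the assumption that interaction() above produces the labels High.High, High.Low, Low.High and Low.Low: scale_fill_manual() matches colours to labels by alphabetical order, so fixing the factor levels before plotting makes the colour-to-cluster mapping explicit.
# make the colour-to-cluster mapping explicit instead of relying on
# alphabetical ordering of the pattern labels (assumed labels from interaction())
oregon.tract$patterns <- factor(oregon.tract$patterns,
                                levels = c("High.High", "High.Low",
                                           "Low.High", "Low.Low",
                                           "Not significant"))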
You can get results closer (but not identical) to those of GeoDa by changing the confidence interval (e.g. using 90% instead of 95%).
I suppose the remaining discrepancies come from a slightly different method of calculating the Moran's I. My version gives the same values as the moran function available in the spdep package, but GeoDa probably uses another approach.
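With the code above, that interval change is a small tweak:
alpha <- 0.10   # 90% confidence interval instead of 95%
probs <- c(alpha/2, 1 - alpha/2)
intervals <- t(apply(local_sims, 1, function(x) quantile(x, probs = probs)))
sig <- (m_i < intervals[, 1]) | (m_i > intervals[, 2])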
I suppose this is quite late to add to the thread; however, Lee's L is quite different from what you have done here, which is Wartenberg's (1985) innovation. This has some potential drawbacks. Mainly, it tests the relationship between x and the lag of y, as @RogerioJB clarified by explaining that the spatially lagged y is calculated by multiplying the simulated y by the adjacency matrix. Lee's (2001) innovation is quite different: it integrates Pearson's r and a spatial smoothing scalar (SSS) and instead compares the process between x and y, as opposed to the lag of y. The approach @RogerioJB adopted can be replicated by generating the distribution of possible local L values from the lee.mc function. In turn, the results can be plotted in a style similar to the GeoDa-like high-high ... low-low significance cluster map.
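For reference, a minimal sketch of what the stock spdep functions return, as far as I understand them: lee() gives both the global L and the local L values, while lee.mc() only simulates the global statistic, which is why saving the simulated local values needs a modified function like the one in the answer below.
# sketch of the stock spdep output (x, y, lw as in the question above)
Lxy <- lee(x, y, lw, n = length(x), zero.policy = TRUE)
Lxy[[1]]     # global Lee's L
Lxy$localL   # vector of local Lee's L values
LMC <- lee.mc(x, y, lw, nsim = 999, zero.policy = TRUE)
LMC$res      # simulated values of the *global* statistic only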
Building upon the suggestion by @justin-k, I modified the bivariate LISA code by @RogerioJB to calculate Lee's L statistic. This approach creates a modified lee.mc() function from the spdep package to simulate the local L values. I provide another example in a GitHub gist with point-level data.
library(boot)
library(dplyr)
library(ggplot2)
library(sf)
library(spdep)
library(rgdal)
library(stringr)
library(UScensus2000tract)
#======================================================
# load data
data("oregon.tract")
# Variables to use in the correlation: white and black population in each census tract
x <- oregon.tract$white
y <- oregon.tract$black
# ----------------------------------------------------- #
# Program a function
## Permutations for Lee's L statistic
## Modification of the lee.mc() function within the {spdep} package
## Saves 'localL' output instead of 'L' output
simula_lee <- function(x, y, listw, nsim = nsim, zero.policy = NULL, na.action = na.fail) {
  if (deparse(substitute(na.action)) == "na.pass")
    stop("na.pass not permitted")
  na.act <- attr(na.action(cbind(x, y)), "na.action")
  x[na.act] <- NA
  y[na.act] <- NA
  x <- na.action(x)
  y <- na.action(y)
  if (!is.null(na.act)) {
    subset <- !(1:length(listw$neighbours) %in% na.act)
    listw <- subset(listw, subset, zero.policy = zero.policy)
  }
  n <- length(listw$neighbours)
  if ((n != length(x)) | (n != length(y)))
    stop("objects of different length")
  gamres <- suppressWarnings(nsim > gamma(n + 1))
  if (gamres)
    stop("nsim too large for this number of observations")
  if (nsim < 1)
    stop("nsim too small")
  xy <- data.frame(x, y)
  S2 <- sum((unlist(lapply(listw$weights, sum)))^2)
  lee_boot <- function(var, i, ...) {
    return(lee(x = var[i, 1], y = var[i, 2], ...)$localL)
  }
  res <- boot(xy, statistic = lee_boot, R = nsim, sim = "permutation",
              listw = listw, n = n, S2 = S2, zero.policy = zero.policy)
}
# ----------------------------------------------------- #
# Adjacency Matrix
nb <- poly2nb(oregon.tract)
lw <- nb2listw(nb, style = "B", zero.policy = T)
W <- as(lw, "symmetricMatrix")
W <- as.matrix(W / rowSums(W))
W[which(is.na(W))] <- 0
# ----------------------------------------------------- #
# Calculate the index and its simulated distribution
# for global and local values
# Global Lee's L
lee.test(x = x, y = y, listw = lw, zero.policy = TRUE,
alternative = "two.sided", na.action = na.omit)
# Local Lee's L values
m <- lee(x = x, y = y, listw = lw, n = length(x),
zero.policy = TRUE, NAOK = TRUE)
# Local Lee's L simulations
local_sims <- simula_lee(x = x, y = y, listw = lw, nsim = 10000,
zero.policy = TRUE, na.action = na.omit)
m_i <- m[[2]] # local values
# Identify the significant values
alpha <- 0.05 # for a 95% confidence interval
probs <- c(alpha/2, 1-alpha/2)
intervals <- t(apply(t(local_sims[[2]]), 1, function(x) quantile(x, probs = probs)))
sig <- (m_i < intervals[ , 1] ) | ( m_i > intervals[ , 2])
#======================================================
# Preparing for plotting
oregon.tract <- st_as_sf(oregon.tract)
oregon.tract$sig <- sig
# Identifying the Lee's L patterns
xp <- scale(x)
yp <- scale(y)
patterns <- as.character(interaction(xp > 0, W%*%yp > 0))
patterns <- patterns %>%
str_replace_all("TRUE","High") %>%
str_replace_all("FALSE","Low")
patterns[oregon.tract$sig == 0] <- "Not significant"
oregon.tract$patterns <- patterns
# Plotting
ggplot() +
geom_sf(data = oregon.tract, aes(fill = patterns), color = "NA") +
scale_fill_manual(values = c("red", "pink", "light blue", "dark blue", "grey95")) +
guides(fill = guide_legend(title = "Lee's L clusters")) +
theme_minimal()
Lee's L clusters for oregon.tract data
Related
A similar question in How to write a double for loop in r with choosing maximal element in one loop?.
The same setup:
If I want to sample theta[j] first for j = 1, 2, ..., 71, then draw replicates (say 1000 times) yrep[k] from Bin(n[j], theta[j]), where n[j] is known.
For theta[1], we have yrep[1,1], yrep[1,2], ..., yrep[1,1000]. Then, for all theta[j], we will have a matrix of yrep[i,j], i = 1, ..., 71, j = 1, ..., 1000. Then compute the mean, max, or min of each column (yrep[1,k], yrep[2,k], ..., yrep[71,k]); we will get 1000 means, maxes, or mins.
How to write this for loop?
I first tried to write a loop to sample theta[j] and yrep. I do not know how to add code to compute the max, mean, and min inside this loop. I am not sure if this code is right:
theta<-NULL
yrep<-NULL
test<-NULL
k=1
for(i in 1:1000){
  for(j in 1:71){
    theta[j] <- rbeta(1, samp_A+y[j], samp_B+n[j]-y[j])
    yrep[k] <- rbinom(1, n[j], theta[j])
    k <- k+1
  }
  t <- c(test, max(yrep))
}
Data is given in How to write a double for loop in r with choosing maximal element in one loop?:
#Data
y <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,
2,1,5,2,5,3,2,7,7,3,3,2,9,10,4,4,4,4,4,4,4,10,4,4,4,5,11,12,
5,5,6,5,6,6,6,6,16,15,15,9,4)
n <-
c(20,20,20,20,20,20,20,19,19,19,19,18,18,17,20,20,20,20,19,19,18,18,25,24,
23,20,20,20,20,20,20,10,49,19,46,27,17,49,47,20,20,13,48,50,20,20,20,20,
20,20,20,48,19,19,19,22,46,49,20,20,23,19,22,20,20,20,52,46,47,24,14)
#Evaluate densities in grid
x <- seq(0.0001, 0.9999, length.out = 1000)
#Compute the marginal posterior of alpha and beta in hierarchical model Use grid
A <- seq(0.5, 15, length.out = 100)
B <- seq(0.3, 45, length.out = 100)
#Make vectors that contain all pairwise combinations of A and B
cA <- rep(A, each = length(B))
cB <- rep(B, length(A))
#Use logarithms for numerical accuracy!
lpfun <- function(a, b, y, n) {
  log(a+b)*(-5/2) +
    sum(lgamma(a+b)-lgamma(a)-lgamma(b)+lgamma(a+y)+lgamma(b+n-y)-lgamma(a+b+n))
}
lp <- mapply(lpfun, cA, cB, MoreArgs = list(y, n))
#Subtract maximum value to avoid over/underflow in exponentiation
df_marg <- data.frame(x = cA, y = cB, p = exp(lp - max(lp)))
#Sample from the grid (with replacement)
nsamp <- 100
samp_indices <- sample(length(df_marg$p), size = nsamp,
replace = T, prob = df_marg$p/sum(df_marg$p))
samp_A <- cA[samp_indices[1:nsamp]]
samp_B <- cB[samp_indices[1:nsamp]]
# the piping and gather() below need dplyr and tidyr
library(dplyr)
library(tidyr)
df_psamp <- mapply(function(a, b, x) dbeta(x, a, b),
                   samp_A, samp_B, MoreArgs = list(x = x)) %>%
  as.data.frame() %>% cbind(x) %>% gather(ind, p, -x)
This is not very well tested.
There is no need for loops to sample from the distributions included in base R; those functions are vectorized over their arguments. Code along the lines below should be able to do what the question asks for.
Ni <- 1000
Nj <- 17
theta <- rbeta(Ni*Nj, rep(samp_A + y, each = Ni), rep(samp_B + n - y, each = Ni))
yrep <- rbinom(Ni*Nj, n, theta)
test1 <- matrix(yrep, nrow = Ni)
mins1 <- matrixStats::colMins(test1)
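The other summaries the question asks for follow the same pattern (a small, untested sketch; the row-wise forms apply if one summary per replicate is wanted instead):
maxs1  <- matrixStats::colMaxs(test1)   # per-column max, as with colMins above
means1 <- colMeans(test1)               # per-column mean
# per-replicate summaries, if that is the intended direction:
# matrixStats::rowMaxs(test1); rowMeans(test1)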
I'm trying to understand how to use Dynamic Linear Modeling for forecasting. I found an example of the DLM functionality of the MARSS package in R being used for forecasting. Below is all the code in the example, starting with loading the data and ending with creating the in-sample forecasts.
What I don't understand is how I would make an out-of-sample forecast? The code below generates "in-sample" forecasts, where it uses already-known information to generate predictions about already-existing data.
Say I want to forecast the Salmon Survival tomorrow rather than throughout the last several weeks. How would I do that?
Any help would be appreciated.
# load the data
data(SalmonSurvCUI, package = "MARSS")
# get time indices
years <- SalmonSurvCUI[, 1]
# number of years of data
TT <- length(years)
# get response variable: logit(survival)
dat <- matrix(SalmonSurvCUI[, 2], nrow = 1)
# get predictor variable
CUI <- SalmonSurvCUI[, 3]
## z-score the CUI
CUI.z <- matrix((CUI - mean(CUI))/sqrt(var(CUI)), nrow = 1)
# number of regr params (slope + intercept)
m <- dim(CUI.z)[1] + 1
# for process eqn
B <- diag(m) ## 2x2; Identity
U <- matrix(0, nrow = m, ncol = 1) ## 2x1; both elements = 0
Q <- matrix(list(0), m, m) ## 2x2; all 0 for now
diag(Q) <- c("q.alpha", "q.beta") ## 2x2; diag = (q1,q2)
# for observation eqn
Z <- array(NA, c(1, m, TT)) ## NxMxT; empty for now
Z[1, 1, ] <- rep(1, TT) ## Nx1; 1's for intercept
Z[1, 2, ] <- CUI.z ## Nx1; predictor variable
A <- matrix(0) ## 1x1; scalar = 0
R <- matrix("r") ## 1x1; scalar = r
# only need starting values for regr parameters
inits.list <- list(x0 = matrix(c(0, 0), nrow = m))
# list of model matrices & vectors
mod.list <- list(B = B, U = U, Q = Q, Z = Z, A = A, R = R)
# fit univariate DLM
dlm1 <- MARSS(dat, inits = inits.list, model = mod.list)
# get list of Kalman filter output
kf.out <- MARSSkfss(dlm1)
## forecasts of regr parameters; 2xT matrix
eta <- kf.out$xtt1
## ts of E(forecasts)
fore.mean <- vector()
for (t in 1:TT) {
  fore.mean[t] <- Z[, , t] %*% eta[, t, drop = FALSE]
}
# variance of regr parameters; 1x2xT array
Phi <- kf.out$Vtt1
## obs variance; 1x1 matrix
R.est <- coef(dlm1, type = "matrix")$R
## ts of Var(forecasts)
fore.var <- vector()
for (t in 1:TT) {
  tZ <- matrix(Z[, , t], m, 1) ## transpose of Z
  fore.var[t] <- Z[, , t] %*% Phi[, , t] %*% tZ + R.est
}
The model for beta and alpha is a random walk without drift, so the predictions of beta(TT+k) and alpha(TT+k) will just be beta(TT) and alpha(TT), where TT is the last time step in the data (in this case, the last time step of CUI.z).
So your prediction is
logit.survival(TT+k) = alpha(TT) + beta(TT)*CUI.z(TT+k)
alpha(TT) and beta(TT) would be output via kf.out$xtT[,TT], i.e. last state estimate. You will need to provide a CUI.z at t=TT+k.
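Here is a minimal sketch of a one-step-ahead, out-of-sample forecast along those lines, assuming a hypothetical future predictor value CUI.z.new and using the kf.out, m, dlm1, and R.est objects from the question:
CUI.z.new <- 0.5                           # hypothetical z-scored CUI at t = TT+1
Z.new <- matrix(c(1, CUI.z.new), 1, m)     # new observation design row
eta.T <- kf.out$xtT[, TT, drop = FALSE]    # alpha(TT), beta(TT): last state estimates
fore.new <- Z.new %*% eta.T                # point forecast of logit(survival)
# approximate forecast variance: state uncertainty at TT, plus one step of
# process noise (Q), plus observation error (R)
Q.est <- coef(dlm1, type = "matrix")$Q
fore.new.var <- Z.new %*% (kf.out$VtT[, , TT] + Q.est) %*% t(Z.new) + R.est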
MARSS version 3.11.0 will have predict function and will output these predictions along with the prediction intervals. But release date is sometime late summer 2020. The functionality is in the GitHub development site (under the resids_update branch) but final testing is still being done.
I have a working implementation of multivariable linear regression using gradient descent in R. I'd like to see if I can use what I have to run a stochastic gradient descent. I'm not sure if this is really inefficient or not. For example, for each value of α I want to perform 500 SGD iterations and be able to specify the number of randomly picked samples in each iteration. It would be nice to do this so I could see how the number of samples influences the results. I'm having trouble, though, with the mini-batching, and I want to be able to easily plot the results.
This is what I have so far:
# Read and process the datasets
# download the files from GitHub
download.file("https://raw.githubusercontent.com/dbouquin/IS_605/master/sgd_ex_data/ex3x.dat", "ex3x.dat", method="curl")
x <- read.table('ex3x.dat')
# we can standardize the x values using scale()
x <- scale(x)
download.file("https://raw.githubusercontent.com/dbouquin/IS_605/master/sgd_ex_data/ex3y.dat", "ex3y.dat", method="curl")
y <- read.table('ex3y.dat')
# combine the datasets
data3 <- cbind(x,y)
colnames(data3) <- c("area_sqft", "bedrooms","price")
str(data3)
head(data3)
################ Regular Gradient Descent
# http://www.r-bloggers.com/linear-regression-by-gradient-descent/
# vector populated with 1s for the intercept coefficient
x1 <- rep(1, length(data3$area_sqft))
# appends to dfs
# create x-matrix of independent variables
x <- as.matrix(cbind(x1,x))
# create y-matrix of dependent variables
y <- as.matrix(y)
L <- length(y)
# cost gradient function: independent variables and values of thetas
cost <- function(x, y, theta){
  gradient <- (1/L) * (t(x) %*% ((x %*% t(theta)) - y))
  return(t(gradient))
}
# GD simultaneous update algorithm
# https://www.coursera.org/learn/machine-learning/lecture/8SpIM/gradient-descent
GD <- function(x, alpha){
  theta <- matrix(c(0,0,0), nrow=1)
  for (i in 1:500) {
    theta <- theta - alpha*cost(x, y, theta)
    theta_r <- rbind(theta_r, theta)
  }
  return(theta_r)
}
# gradient descent α = (0.001, 0.01, 0.1, 1.0) - defined for 500 iterations
alphas <- c(0.001,0.01,0.1,1.0)
# Plot price, area in square feet, and the number of bedrooms
# create empty vector theta_r
theta_r<-c()
for(i in 1:length(alphas)) {
  result <- GD(x, alphas[i])
  # red = price
  # blue = sq ft
  # green = bedrooms
  plot(result[,1], ylim=c(min(result),max(result)), col="#CC6666", ylab="Value", lwd=0.35,
       xlab=paste("alpha=", alphas[i]), xaxt="n") # suppress auto x-axis title
  lines(result[,2], type="b", col="#0072B2", lwd=0.35)
  lines(result[,3], type="b", col="#66CC99", lwd=0.35)
}
Is it more practical to find a way to use sgd()? I can't seem to figure out how to get the level of control I'm looking for with the sgd package.
Sticking with what you have now
## all of this is the same
download.file("https://raw.githubusercontent.com/dbouquin/IS_605/master/sgd_ex_data/ex3x.dat", "ex3x.dat", method="curl")
x <- read.table('ex3x.dat')
x <- scale(x)
download.file("https://raw.githubusercontent.com/dbouquin/IS_605/master/sgd_ex_data/ex3y.dat", "ex3y.dat", method="curl")
y <- read.table('ex3y.dat')
data3 <- cbind(x,y)
colnames(data3) <- c("area_sqft", "bedrooms","price")
x1 <- rep(1, length(data3$area_sqft))
x <- as.matrix(cbind(x1,x))
y <- as.matrix(y)
L <- length(y)
cost <- function(x, y, theta){
  gradient <- (1/L) * (t(x) %*% ((x %*% t(theta)) - y))
  return(t(gradient))
}
I added y to your GD function and created a wrapper function, myGoD, that calls yours but first subsets the data.
GD <- function(x, y, alpha){
  theta <- matrix(c(0,0,0), nrow=1)
  theta_r <- NULL
  for (i in 1:500) {
    theta <- theta - alpha*cost(x, y, theta)
    theta_r <- rbind(theta_r, theta)
  }
  return(theta_r)
}
myGoD <- function(x, y, alpha, n = nrow(x)) {
  idx <- sample(nrow(x), n)
  y <- y[idx, , drop = FALSE]
  x <- x[idx, , drop = FALSE]
  GD(x, y, alpha)
}
Check to make sure it works and try with different Ns
all.equal(GD(x, y, 0.001), myGoD(x, y, 0.001))
# [1] TRUE
set.seed(1)
head(myGoD(x, y, 0.001, n = 20), 2)
# x1 V1 V2
# V1 147.5978 82.54083 29.26000
# V1 295.1282 165.00924 58.48424
set.seed(1)
head(myGoD(x, y, 0.001, n = 40), 2)
# x1 V1 V2
# V1 290.6041 95.30257 59.66994
# V1 580.9537 190.49142 119.23446
Here is how you can use it
alphas <- c(0.001,0.01,0.1,1.0)
ns <- c(47, 40, 30, 20, 10)
par(mfrow = n2mfrow(length(alphas)))
for(i in 1:length(alphas)) {
  # result <- myGoD(x, y, alphas[i]) ## original
  result <- myGoD(x, y, alphas[i], ns[i])
  # red = price
  # blue = sq ft
  # green = bedrooms
  plot(result[,1], ylim=c(min(result),max(result)), col="#CC6666", ylab="Value", lwd=0.35,
       xlab=paste("alpha=", alphas[i]), xaxt="n") # suppress auto x-axis title
  lines(result[,2], type="b", col="#0072B2", lwd=0.35)
  lines(result[,3], type="b", col="#66CC99", lwd=0.35)
}
You don't need the wrapper function--you can just change your GD slightly. It is always good practice to explicitly pass arguments to your functions rather than relying on scoping. Before you were assuming that y would be pulled from your global environment; here y must be given or you will get an error. This will avoid many headaches and mistakes down the road.
GD <- function(x, y, alpha, n = nrow(x)){
  idx <- sample(nrow(x), n)
  y <- y[idx, , drop = FALSE]
  x <- x[idx, , drop = FALSE]
  theta <- matrix(c(0,0,0), nrow=1)
  theta_r <- NULL
  for (i in 1:500) {
    theta <- theta - alpha*cost(x, y, theta)
    theta_r <- rbind(theta_r, theta)
  }
  return(theta_r)
}
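Usage is the same as before, for example:
set.seed(1)
head(GD(x, y, alpha = 0.001, n = 20), 2)   # mini-batch of 20 rows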
I have written a custom likelihood function that fits a multi-data model integrating mark-recapture and telemetry data (sensu Royle et al. 2013, Methods in Ecology and Evolution). The likelihood function is designed to be flexible in terms of whether, and how many, covariates are specified for the different linear models in the different likelihood components, which is determined by the values supplied as function arguments (i.e., the data matrices "detcovs" and "dencovs" in my code). The likelihood function works when I supply it directly to optimization functions (e.g., optim or nlm), but it does not play nice with the mle2 function in the bbmle package. My problem is that I continually run into the following error: "some named arguments in 'start' are not arguments to the specified log-likelihood function". This is my first attempt at writing custom likelihood functions, so I'm sure there are general coding conventions of which I'm unaware that would make such tasks much more efficient and amenable to the mle2 function. Below are my likelihood function, the code creating the starting-value objects, and the code calling the mle2 function. Any advice on how to solve the error and general comments on writing cleaner functions are welcome. Many thanks in advance.
Edit: As requested, I have simplified the likelihood function and provided code to simulate reproducible data to which the model can be fit. Included in the simulation code are 2 custom functions and use of the raster function from the raster package. Hopefully, I have sufficiently simplified everything to enable others to troubleshoot. Again, many thanks for your help!
Jared
Likelihood function:
CSCR.RSF.intlik2.EXAMPLE <- function(alpha0, sigma, alphas=NULL, betas=NULL, n0,
                                     yscr=NULL, K=NULL, X=X, trapcovs=NULL,
                                     Gden=NULL, Gdet=NULL, ytel=NULL, stel=NULL,
                                     dencovs=NULL, detcovs=NULL){
  #
  # this version of the code handles a covariate on log(Density). This is starting value 5
  #
  # start    = vector of starting values
  # yscr     = nind x ntraps encounter matrix
  # K        = number of occasions
  # X        = trap locations
  # Gden     = matrix with grid cell coordinates for density raster
  # Gdet     = matrix with grid cell coordinates for RSF raster
  # dencovs  = all covariate values for all nGden pixels in density raster
  # trapcovs = covariate value at trap locations
  # detcovs  = all covariate values for all nGrsf pixels in RSF raster
  # ytel     = nguys x nGdet matrix of telemetry fixes in each nGdet pixels
  # stel     = home range center of telemetered individuals, IF you wish to estimate it. Not necessary
  # alphas   = starting values for RSF/detfn coefficients excluding sigma and intercept
  # alpha0   = starting values for RSF/detfn intercept
  # sigma    = starting value for RSF/detfn sigma
  # betas    = starting values for density function coefficients
  # n0       = starting value for number of undetected individuals on log scale
  #
  n0 = exp(n0)
  nGden = nrow(Gden)
  D = e2dist(X, Gden)
  nGdet <- nrow(Gdet)
  alphas = alphas
  loglam = alpha0 - (1/(2*sigma*sigma))*D*D + as.vector(trapcovs%*%alphas) # ztrap recycled over nG
  psi = exp(as.vector(dencovs%*%betas))
  psi = psi/sum(psi)
  probcap = 1-exp(-exp(loglam))
  #probcap = (exp(theta0)/(1+exp(theta0)))*exp(-theta1*D*D)
  Pm = matrix(NA, nrow=nrow(probcap), ncol=ncol(probcap))
  ymat = yscr
  ymat = rbind(yscr, rep(0, ncol(yscr)))
  lik.marg = rep(NA, nrow(ymat))
  for(i in 1:nrow(ymat)){
    Pm[1:length(Pm)] = (dbinom(rep(ymat[i,], nGden), rep(K, nGden), probcap[1:length(Pm)], log=TRUE))
    lik.cond = exp(colSums(Pm))
    lik.marg[i] = sum( lik.cond*psi )
  }
  nv = c(rep(1, length(lik.marg)-1), n0)
  part1 = lgamma(nrow(yscr)+n0+1) - lgamma(n0+1)
  part2 = sum(nv*log(lik.marg))
  out = -1*(part1 + part2)
  lam = t(exp(a0 - (1/(2*sigma*sigma))*t(D2) + as.vector(detcovs%*%alphas))) # recycle zall over all ytel guys
  # lam is now nGdet x nG!
  denom = rowSums(lam)
  probs = lam/denom # each column is the probs for a guy at column [j]
  tel.loglik = -1*sum( ytel*log(probs) )
  out = out + tel.loglik
  out
}
Data simulation code:
library(raster)
library(bbmle)
e2dist <- function (x, y){
  i <- sort(rep(1:nrow(y), nrow(x)))
  dvec <- sqrt((x[, 1] - y[i, 1])^2 + (x[, 2] - y[i, 2])^2)
  matrix(dvec, nrow = nrow(x), ncol = nrow(y), byrow = F)
}
spcov <- function(R) {
  v <- sqrt(nrow(R))
  D <- as.matrix(dist(R))
  V <- exp(-D/2)
  cov1 <- t(chol(V)) %*% rnorm(nrow(R))
  Rd <- as.data.frame(R)
  colnames(Rd) <- c("x", "y")
  Rd$C <- as.numeric((cov1 - mean(cov1)) / sd(cov1))
  return(Rd)
}
set.seed(1234)
co <- seq(0.3, 0.7, length=5)
X <- cbind(rep(co, each=5),
rep(co, times=5))
B <- 10
co <- seq(0, 1, length=B)
Z <- cbind(rep(co, each=B), rep(co, times=B))
dencovs <- cbind(spcov(Z),spcov(Z)[,3]) # ordered as reading raster image from left to right, bottom to top
dimnames(dencovs)[[2]][3:4] <- c("dencov1","dencov2")
denr.list <- vector("list",2)
for(i in 1:2){
  denr.list[[i]] <- raster(
    list(x=seq(0,1,length=10),
         y=seq(0,1,length=10),
         z=t(matrix(dencovs[,i+2],10,10,byrow=TRUE)))
  )
}
B <- 20
co <- seq(0, 1, length=B)
Z <- cbind(rep(co, each=B), rep(co, times=B))
detcovs <- cbind(spcov(Z),spcov(Z)[,3]) # ordered as reading raster image from left to right, bottom to top
dimnames(detcovs)[[2]][3:4] <- c("detcov1","detcov2")
detr.list <- vector("list",2)
J <- nrow(X)           # number of traps; needed before filling trapcovs
trapcovs <- matrix(0,J,2)
for(i in 1:2){
  detr.list[[i]] <- raster(
    list(x=seq(0,1,length=20),
         y=seq(0,1,length=20),
         z=t(matrix(detcovs[,i+2],20,20,byrow=TRUE)))
  )
  trapcovs[,i] <- extract(detr.list[[i]],X)
}
alpha0 <- -3
sigma <- 0.15
alphas <- c(1,-1)
beta0 <- 3
betas <- c(-1,1)
pixelArea <- (dencovs$y[2] - dencovs$y[1])^2
mu <- exp(beta0 + as.matrix(dencovs[,3:4])%*%betas)*pixelArea
EN <- sum(mu)
N <- rpois(1, EN)
pi <- mu/sum(mu)
s <- dencovs[sample(1:nrow(dencovs), size=N, replace=TRUE, prob=pi),1:2]
J <- nrow(X)
K <- 10
yc <- d <- p <- matrix(NA, N, J)
D <- e2dist(s,X)
loglam <- t(alpha0 - t((1/(2*sigma*sigma))*D*D) + as.vector(trapcovs%*%alphas))
p <- 1-exp(-exp(loglam))
for(i in 1:N) {
  for(j in 1:J) {
    yc[i,j] <- rbinom(1, K, p[i,j])
  }
}
detected <- apply(yc>0, 1, any)
yscr <- yc[detected,]
ntel <- 5
nfixes <- 100
poss.tel <- which(s[,1]>0.2 & s[,1]<0.8 & s[,2]>0.2 & s[,2]<0.8)
stel.id <- sample(poss.tel,ntel)
stel <- s[stel.id,]
ytel <- matrix(NA,ntel,nrow(detcovs))
d <- e2dist(stel,detcovs[,1:2])
lam <- t(exp(1 - t((1/(2*sigma*sigma))*d*d) + as.vector(as.matrix(detcovs[,3:4])%*%alphas)))
for(i in 1:ntel){
  ytel[i,] <- rmultinom(1, nfixes, lam[i,]/sum(lam[i,]))
}
Specify starting values and call mle2 function:
start1 <- list(alpha0=alpha0,sigma=sigma,alphas=alphas,betas=betas,n0=log(N-nrow(yscr)))
parnames(CSCR.RSF.intlik2.EXAMPLE) <- names(start1)
out1 <- mle2(CSCR.RSF.intlik2.EXAMPLE,start=start1,method="SANN",optimizer="optim",
data=list(yscr=yscr,K=K,X=X,trapcovs=trapcovs,Gden=dencovs[,1:2],Gdet=detcovs[,1:2],
ytel=ytel,stel=stel,dencovs=as.matrix(dencovs[,3:4]),detcovs=as.matrix(detcovs[,3:4]))
)
I would like to compute the convolution of two probability distributions in R and I need some help. For the sake of simplicity, let's say I have a variable x that is normally distributed with mean = 1.0 and stdev = 0.5, and y that is log-normally distributed with mean = 1.5 and stdev = 0.75. I want to determine z = x + y. I understand that the distribution of z is not known a priori.
As an aside, the real-world example I am working with requires adding two random variables that are distributed according to a number of different distributions.
Does anyone know how to add two random variables by convolving the probability density functions of x and y?
I have tried generating n normally distributed random values (with the above parameters) and adding them to n log-normally distributed random values. However, I wish to know if I can use the convolution method instead. Any help would be greatly appreciated.
EDIT
Thank you for these answers. I defined a pdf and tried to do the convolution integral, but R complains at the integration step. My pdfs are Log Pearson 3 and are as follows:
dlp3 <- function(x, a, b, g) {
  p1 <- 1/(x*abs(b) * gamma(a))
  p2 <- ((log(x)-g)/b)^(a-1)
  p3 <- exp(-1* (log(x)-g) / b)
  d <- p1 * p2 * p3
  return(d)
}
f.m <- function(x) dlp3(x,3.2594,-0.18218,0.53441)
f.s <- function(x) dlp3(x,9.5645,-0.07676,1.184)
f.t <- function(z) integrate(function(x,z) f.s(z-x)*f.m(x),-Inf,Inf,z)$value
f.t <- Vectorize(f.t)
integrate(f.t, lower = 0, upper = 3.6)
R complains at the last step since the f.t function is bounded and my integration limits are probably not correct. Any ideas on how to solve this?
Here is one way.
f.X <- function(x) dnorm(x,1,0.5)       # normal (mu=1, sigma=0.5)
f.Y <- function(y) dlnorm(y,1.5, 0.75) # log-normal (mu=1.5, sigma=0.75)
# convolution integral
f.Z <- function(z) integrate(function(x,z) f.Y(z-x)*f.X(x),-Inf,Inf,z)$value
f.Z <- Vectorize(f.Z) # need to vectorize the resulting fn.
set.seed(1) # for reproducible example
X <- rnorm(1000,1,0.5)
Y <- rlnorm(1000,1.5,0.75)
Z <- X + Y
# compare the methods
hist(Z,freq=F,breaks=50, xlim=c(0,30))
z <- seq(0,50,0.01)
lines(z,f.Z(z),lty=2,col="red")
Same thing using package distr.
library(distr)
N <- Norm(mean=1, sd=0.5) # N is signature for normal dist
L <- Lnorm(meanlog=1.5,sdlog=0.75) # same for log-normal
conv <- convpow(L+N,1) # object of class AbscontDistribution
f.Z <- d(conv) # distribution function
hist(Z,freq=F,breaks=50, xlim=c(0,30))
z <- seq(0,50,0.01)
lines(z,f.Z(z),lty=2,col="red")
I was having trouble getting integrate() to work for different density parameters, so I came up with an alternative to @jlhoward's answer using Riemann approximation:
set.seed(1)
#densities to be convolved. could also put these in the function below
d1 <- function(x) dnorm(x,1,0.5) #
d2 <- function(y) dlnorm(y,1.5, 0.75)
#Riemann approximation of convolution
conv <- function(t, a, b, d) { # a to b needs to cover the range of densities above; d needs to be small for accurate approx.
  z <- NA
  x <- seq(a, b, d)
  for (i in 1:length(t)){
    print(i)
    z[i] <- sum(d1(x)*d2(t[i]-x)*d)
  }
  return(z)
}
#check against sampled convolution
X <- rnorm(1000, 1, 0.5)
Y <- rlnorm(1000, 1.5, 0.75)
Z <- X + Y
t <- seq(0, 50, 0.05) #range to evaluate t, smaller increment -> smoother curve
hist(Z, breaks = 50, freq = F, xlim = c(0,30))
lines(t, conv(t, -100, 100, 0.1), type = "s", col = "red")
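In principle the same Riemann sketch can be pointed at the Log Pearson 3 densities from the edit above. This is a hedged, untested sketch: it overwrites d1 and d2 and zeroes out non-finite density values outside the support before convolving.
# guard the Log Pearson 3 densities so values outside the support contribute 0
d1 <- function(x) { d <- suppressWarnings(dlp3(x, 3.2594, -0.18218, 0.53441)); d[!is.finite(d)] <- 0; d }
d2 <- function(y) { d <- suppressWarnings(dlp3(y, 9.5645, -0.07676, 1.184)); d[!is.finite(d)] <- 0; d }
t2 <- seq(0.01, 3.6, 0.01)
plot(t2, conv(t2, 0.01, 5, 0.01), type = "l")   # a..b chosen to cover the positive support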