I have a matrix whose values I want to plot. Using the image function it looks like this.
How can I plot a line over the image?
(In my case, I want to plot a line over the maximum values along the x axis.)
Edit
The image and the line I want to plot over it are the output of Bayesian Online Changepoint Detection. Since it is quite short, I'll share the whole code. The plotting part is at the end:
# Bayesian Online Changepoint Detection
# Adams, MacKay 2007
# http://hips.seas.harvard.edu/content/bayesian-online-changepoint-detection
#######################################
# Other python and matlab implementations
# https://github.com/JackKelly/bayesianchangepoint
# http://hips.seas.harvard.edu/content/bayesian-online-changepoint-detection
# http://www.inference.phy.cam.ac.uk/rpa23/cp/gaussdemo.m
# http://www.inference.phy.cam.ac.uk/rpa23/cp/studentpdf.m
# http://www.inference.phy.cam.ac.uk/rpa23/cp/constant_hazard.m
# Even more commented, but different paper:
# https://github.com/davyfeng/ipdata/blob/master/csv/bocpd/core/bocpd.m
# Generate data
x1 <- rnorm(100, 10.5, 0.1)
x2 <- rnorm(100, 1, 0.1)
x3 <- rnorm(100, -10, 0.1)
x4 <- rnorm(100, -1, 0.1)
x5 <- rnorm(100, 5, 0.1)
x6 <- rnorm(30, 1, 0.1)
x7 <- rnorm(100, 8, 0.1)
x <- c(x1,x2,x3,x4,x5, x6,x7)
##############
# Algorithm
##############
# Prepare the scaled and shifted student-t
dt.scaled.shifted <- function(x, m, s, df) stats::dt((x-m)/s, df)/s
# Prepare the Hazard function
hazard <-function(x, lambda=200){rep(1/lambda, length(x))}
L <- length(x)
R <- matrix(rep(0,(L+1)*(L+1)), L+1, L+1)
R[1,1] <- 1 # for t=1 we are sure that p(r=1)=1
mu0 <- 0; kappa0 <- 1; alpha0 <-1; beta0 <- 1;
muT <- mu0
kappaT <- kappa0
alphaT <- alpha0
betaT <- beta0
maxes <- rep(0, L)
# Process data as they come in
for(t in 1:L){
# Evaluate predictive probability
predprobs <- dt.scaled.shifted(x[t], muT, sqrt(betaT*(kappaT+1)/(alphaT*kappaT)), 2*alphaT) # scale = sqrt(beta*(kappa+1)/(alpha*kappa)); the original had a stray minus sign and passed the variance where a scale is expected
H <- hazard(x[1:t])
# Calculate growth probabilities
R[2:(t+1), t+1] <- R[1:t,t]*predprobs*(1-H)
# Calculate changepoint (reset) probabilities
R[1,t+1] <- sum(R[1:t,t]*predprobs*H)
# Renormalize
R[,t+1] <- R[,t+1] / sum(R[,t+1])
# Update parameters for each possible run length
# keep the past ones since they will be used iteratively
muT0 <- c(mu0, (kappaT*muT + x[t])/(kappaT+1))
kappaT0 <- c(kappa0,kappaT+1)
alphaT0 <- c(alpha0, alphaT + 0.5)
betaT0 <- c(beta0, betaT + (kappaT * (x[t]-muT)^2)/(2*(kappaT+1))) # beta update starts from betaT; the original mistakenly used kappaT
muT <- muT0
kappaT <- kappaT0
alphaT <- alphaT0
betaT <- betaT0
# Store the maximum, to plot later
maxes[t] <- which.max(R[,t])
}
# Plot results
par(mfrow=c(2,1))
plot(x, type='l')
image((-t(log(R))), col = grey(seq(0,1,length=256)), axes=T)
par(new=T)
plot(1:(dim(R)[1]-1), maxes,type='l', col="red")
On the top there is the original data. On the bottom, the probability of a current run to have length y. The red line on the bottom should fit the dark shades.
(to be deleted. It does not work. I leave it temporarily to save the comments.)
I got it, I thought I had already tried par(new=T) but obviously I hadn't:
m <- matrix(rnorm(2500,1,1),50,50) # 50*50 = 2500 values; rnorm(100) would be silently recycled
image(m, col = grey(seq(0,1,length=256)))
par(new=T)
plot(seq(0,1, length=50), type='l', col="red", lwd=5)
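Note that par(new=T) just opens a second coordinate system on top of the first, so the two plots only line up if their axes happen to match. A hedged alternative that avoids the mismatch: image() maps a matrix onto the unit square by default, so rescale the line into [0,1] and draw it with lines(). A minimal sketch, assuming as above that the values to trace are the column index of each row's maximum:
# image(m) puts row i at x = (i-1)/(nrow(m)-1) and column j at y = (j-1)/(ncol(m)-1)
m <- matrix(rnorm(2500, 1, 1), 50, 50)
maxes <- apply(m, 1, which.max)
image(m, col = grey(seq(0, 1, length = 256)))
lines(seq(0, 1, length = nrow(m)), (maxes - 1)/(ncol(m) - 1), col = "red", lwd = 2)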
Quick example simulating the whole process:
data <- vector()
for(i in 1:50){
data <- rbind(data, dpois(1:50, i^1.2))
}
maxes <- apply(data, 1, which.max)
image(-data, col = grey(seq(0,1,length=256)))
par(new=T)
plot(1:dim(data)[1], c(maxes),type='l')
Related
I ran simulations in R and plotted the results as histograms. There is no problem adding an empirical density to the histograms with the density() function, but for some reason I get very strange results when I overlay the theoretical densities for comparison. Here are two example codes and two pictures. The blue theoretical pdfs are way off and I do not know why. Could someone with R skills point out my mistakes?
# generating 10000 samples thrice from U(0,1) distribution
# and sorting them for the statistics:
n <- 10000
samples1 <- data.frame('0'=c(rep(NA,4)))
samples2 <- data.frame('0'=c(rep(NA,10)))
samples3 <- data.frame('0'=c(rep(NA,10)))
for (i in 1:n) {
new <- runif(4)
samples1[ , ncol(samples1) + 1] <- sort(new)
colnames(samples1)[ncol(samples1)] <- i
new <- runif(10)
samples2[ , ncol(samples2)+1] <- sort(new)
colnames(samples2)[ncol(samples2)] <- i
new <- runif(10)
samples3[ , ncol(samples3)+1] <- sort(new)
colnames(samples3)[ncol(samples3)] <- i
}
# dropping the first (useless) columns:
samples1 <- samples1[-c(1)]
samples2 <- samples2[-c(1)]
samples3 <- samples3[-c(1)]
# selecting the statistics from the samples:
# X_2:4
stat24 <- rep(NA,n)
for (i in 1:n) {
stat24[i] <- samples1[2,i]
}
# X_2:10
stat210 <- rep(NA,n)
for (i in 1:n) {
stat210[i] <- samples2[2,i] # note: samples2; the original indexed samples1
}
# X_10:10
stat1010 <- rep(NA,n)
for (i in 1:n) {
stat1010[i] <- samples3[10,i] # note: samples3; samples1 has only 4 rows
}
# plotting the histograms and Beta pdfs:
hist(stat24, freq = FALSE)
lines(dbeta(stat24, 2, 5), col='blue')
lines(density(stat24), col='red')
The kth order statistic of n uniform samples follows a Beta(k, n-k+1) distribution; its theoretical pdf appears as the odd blue stroke on the right.
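For reference, the same x-versus-y mismatch diagnosed in the answer further down affects this example too. A hedged sketch of the corrected overlay, using the Beta parameters for X_(2:4) and curve(), which evaluates the density on its own ordered grid:
hist(stat24, freq = FALSE)
curve(dbeta(x, 2, 3), add = TRUE, col = 'blue') # X_(2:4) ~ Beta(2, 4-2+1) = Beta(2, 3)
lines(density(stat24), col = 'red')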
n <- 10000
random_variable_F <- rep(NA,n)
# generating 10000 samples of sizes 10 and 5 and computing F:
for (i in 1:n) {
x <- rnorm(10, mean = 10, sd = sqrt(5))
y <- rnorm(5, mean = 20, sd = sqrt(10))
random_variable_F[i] <- (var(x)/5)/(var(y)/10) # (S_x^2/sigma_x^2)/(S_y^2/sigma_y^2); the original multiplied by the variances instead of dividing
}
#head(random_variable_F)
# plotting the histogram:
hist(random_variable_F, freq = F)
lines(density(random_variable_F), col='red')
lines(df(random_variable_F, 9, 4), col='blue')
Random variable F follows the F-distribution. The parameters are the sample sizes minus one, in this case 10-1=9 and 5-1=4. The theoretical curve is quite wild:
If you pass a single vector to lines, it assumes that this is a vector of y values you want to plot. It plots the first y value at x = 1, the second y value at x = 2, etc, all the way up to x = length(y). In your case, random_variable_F is an unordered random variable, and you are just plotting its sequential values at 1:10000 along the x axis.
Clearly, you want the function y = df(x) to be plotted, so you need to pass random_variable_F as the x values and df(random_variable_F) as the y values. You will also need to sort random_variable_F first to ensure the line is plotted from left to right:
hist(random_variable_F, freq = F)
lines(density(random_variable_F), col='red')
lines(sort(random_variable_F), df(sort(random_variable_F), 9, 4), col='blue')
Note that this doesn't happen when you plot lines(density(random_variable_F)), because density produces a list containing ordered x and y values rather than a single vector.
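Equivalently, assuming you only need the theoretical curve for visual comparison, curve() builds its own ordered x grid, so no sorting is needed:
hist(random_variable_F, freq = FALSE)
lines(density(random_variable_F), col = 'red')
curve(df(x, 9, 4), add = TRUE, col = 'blue')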
I have a Uniform distribution with parameters (0, $\theta$), and I have calculated its MME and two other estimators.
I also calculated their individual biases in my code, but I am having trouble plotting the three biases in one plot.
Could you suggest how I should modify the "matplot" part? Maybe the problem lies there. Thank you very much!
n <- 1000
theta <- 9
x <- runif(n, 0, theta)
theta.hat.mme <- 2*mean(x)
theta.hat.G <- max(x)
theta.hat.H <- ((n+1)/n)*theta.hat.G
# theta.hat.G is the max of (x_1, x_2,...,x_n) as an estimator of theta, and theta.hat.H is the modified version of theta.hat.G that makes it unbiased. (The original computed theta.hat.G from theta itself, which is not an estimator.)
# Empirical Exploration of properties
M <- 1000 # Number of sample of size n
L <- 9 # Different values of n
# create storage variables
mme <- numeric(M)
vn <- numeric(L)
bias <- variance <- matrix(0, nrow=L, ncol=3)
for(l in 1:L)
{
n <- 20 + (l-1)*10
G <- H <- numeric(M) # per-sample values of theta.hat.G and theta.hat.H
for(m in 1:M)
{
x <- runif(n, 0, theta)
mme[m] <- 2*mean(x)
G[m] <- max(x)
H[m] <- ((n+1)/n)*G[m]
}
bias[l,1] <- mean(mme)-theta
bias[l,2] <- mean(G)-theta
bias[l,3] <- mean(H)-theta
variance[l,1] <- var(mme)
variance[l,2] <- var(G)
variance[l,3] <- var(H)
vn[l] <- n
}
#windows(5,5)
matplot(vn, bias, type='l', col=1:3, lty=1:3, xlab='n')
abline(h=0, lty=3)
legend('topright', col=1:3,
c('Method of Moments Estimator', 'Theta G', 'Theta H'),
inset=0.01, lty=1:3)
I have written a custom likelihood function that fits a multi-data model integrating mark-recapture and telemetry data (sensu Royle et al. 2013, Methods in Ecology and Evolution). The likelihood function is designed to be flexible in terms of whether and how many covariates are specified for the different linear models in the likelihood components, which is determined by the values supplied as function arguments (i.e., the data matrices "detcovs" and "dencovs" in my code). The likelihood function works when I supply it directly to optimization functions (e.g., optim or nlm), but it does not play nice with the mle2 function in the bbmle package. I continually run into the following error: "some named arguments in 'start' are not arguments to the specified log-likelihood function". This is my first attempt at writing custom likelihood functions, so I'm sure there are general coding conventions I'm unaware of that would make such tasks more efficient and amenable to mle2. Below are my likelihood function, the code creating the starting-value objects, and the code calling the mle2 function. Any advice on how to solve the error, and general comments on writing cleaner functions, are welcome. Many thanks in advance.
Edit: As requested, I have simplified the likelihood function and provided code to simulate reproducible data to which the model can be fit. Included in the simulation code are 2 custom functions and use of the raster function from the raster package. Hopefully, I have sufficiently simplified everything to enable others to troubleshoot. Again, many thanks for your help!
Jared
Likelihood function:
CSCR.RSF.intlik2.EXAMPLE <- function(alpha0,sigma,alphas=NULL,betas=NULL,n0,yscr=NULL,K=NULL,X=X,trapcovs=NULL,Gden=NULL,Gdet=NULL,ytel=NULL,stel=NULL,
dencovs=NULL,detcovs=NULL){
#
# this version of the code handles a covariate on log(Density). This is starting value 5
#
# start = vector of starting values
# yscr = nind x ntraps encounter matrix
# K = number of occasions
# X = trap locations
# Gden = matrix with grid cell coordinates for density raster
# Gdet = matrix with gride cell coordinates for RSF raster
# dencovs = all covariate values for all nGden pixels in density raster
# trapcovs = covariate value at trap locations
# detcovs = all covariate values for all nGrsf pixels in RSF raster
# ytel = nguys x nGdet matrix of telemetry fixes in each nGdet pixels
# stel = home range center of telemetered individuals, IF you wish to estimate it. Not necessary
# alphas = starting values for RSF/detfn coefficients excluding sigma and intercept
# alpha0 = starting values for RSF/detfn intercept
# sigma = starting value for RSF/detfn sigma
# betas = starting values for density function coefficients
# n0 = starting value for number of undetected individuals on log scale
#
n0 = exp(n0)
nGden = nrow(Gden)
D = e2dist(X,Gden)
nGdet <- nrow(Gdet)
loglam = alpha0 -(1/(2*sigma*sigma))*D*D + as.vector(trapcovs%*%alphas) # ztrap recycled over nG
psi = exp(as.vector(dencovs%*%betas))
psi = psi/sum(psi)
probcap = 1-exp(-exp(loglam))
#probcap = (exp(theta0)/(1+exp(theta0)))*exp(-theta1*D*D)
Pm = matrix(NA,nrow=nrow(probcap),ncol=ncol(probcap))
ymat = rbind(yscr,rep(0,ncol(yscr))) # observed encounter histories plus one all-zero row for undetected individuals
lik.marg = rep(NA,nrow(ymat))
for(i in 1:nrow(ymat)){
Pm[1:length(Pm)] = (dbinom(rep(ymat[i,],nGden),rep(K,nGden),probcap[1:length(Pm)],log=TRUE))
lik.cond = exp(colSums(Pm))
lik.marg[i] = sum( lik.cond*psi )
}
nv = c(rep(1,length(lik.marg)-1),n0)
part1 = lgamma(nrow(yscr)+n0+1) - lgamma(n0+1)
part2 = sum(nv*log(lik.marg))
out = -1*(part1+ part2)
D2 <- e2dist(stel, Gdet) # assumption: distances from telemetered home-range centers to detection pixels; D2 (and a0) were undefined in the original
lam = t(exp(alpha0 - (1/(2*sigma*sigma))*t(D2) + as.vector(detcovs%*%alphas))) # detcovs effect recycled over all ytel guys
# lam is now ntel x nGdet
denom = rowSums(lam)
probs = lam/denom # each row holds the pixel-use probabilities for one telemetered guy
tel.loglik = -1*sum( ytel*log(probs) )
out = out + tel.loglik
out
}
Data simulation code:
library(raster)
library(bbmle)
e2dist <- function (x, y){
i <- sort(rep(1:nrow(y), nrow(x)))
dvec <- sqrt((x[, 1] - y[i, 1])^2 + (x[, 2] - y[i, 2])^2)
matrix(dvec, nrow = nrow(x), ncol = nrow(y), byrow = F)
}
spcov <- function(R) {
v <- sqrt(nrow(R))
D <- as.matrix(dist(R))
V <- exp(-D/2)
cov1 <- t(chol(V)) %*% rnorm(nrow(R))
Rd <- as.data.frame(R)
colnames(Rd) <- c("x", "y")
Rd$C <- as.numeric((cov1 - mean(cov1)) / sd(cov1))
return(Rd)
}
set.seed(1234)
co <- seq(0.3, 0.7, length=5)
X <- cbind(rep(co, each=5),
rep(co, times=5))
B <- 10
co <- seq(0, 1, length=B)
Z <- cbind(rep(co, each=B), rep(co, times=B))
dencovs <- cbind(spcov(Z),spcov(Z)[,3]) # ordered as reading raster image from left to right, bottom to top
dimnames(dencovs)[[2]][3:4] <- c("dencov1","dencov2")
denr.list <- vector("list",2)
for(i in 1:2){
denr.list[[i]] <- raster(
list(x=seq(0,1,length=10),
y=seq(0,1,length=10),
z=t(matrix(dencovs[,i+2],10,10,byrow=TRUE)))
)
}
B <- 20
co <- seq(0, 1, length=B)
Z <- cbind(rep(co, each=B), rep(co, times=B))
detcovs <- cbind(spcov(Z),spcov(Z)[,3]) # ordered as reading raster image from left to right, bottom to top
dimnames(detcovs)[[2]][3:4] <- c("detcov1","detcov2")
detr.list <- vector("list",2) # filled by the loop below; the original declared detcov.raster.list but assigned detr.list
trapcovs <- matrix(0,nrow(X),2) # J = nrow(X) is not defined until later, so use nrow(X) directly here
for(i in 1:2){
detr.list[[i]] <- raster(
list(x=seq(0,1,length=20),
y=seq(0,1,length=20),
z=t(matrix(detcovs[,i+2],20,20,byrow=TRUE)))
)
trapcovs[,i] <- extract(detr.list[[i]],X)
}
alpha0 <- -3
sigma <- 0.15
alphas <- c(1,-1)
beta0 <- 3
betas <- c(-1,1)
pixelArea <- (dencovs$y[2] - dencovs$y[1])^2
mu <- exp(beta0 + as.matrix(dencovs[,3:4])%*%betas)*pixelArea
EN <- sum(mu)
N <- rpois(1, EN)
pi <- mu/sum(mu)
s <- dencovs[sample(1:nrow(dencovs), size=N, replace=TRUE, prob=pi),1:2]
J <- nrow(X)
K <- 10
yc <- d <- p <- matrix(NA, N, J)
D <- e2dist(s,X)
loglam <- t(alpha0 - t((1/(2*sigma*sigma))*D*D) + as.vector(trapcovs%*%alphas))
p <- 1-exp(-exp(loglam))
for(i in 1:N) {
for(j in 1:J) {
yc[i,j] <- rbinom(1, K, p[i,j])
}
}
detected <- apply(yc>0, 1, any)
yscr <- yc[detected,]
ntel <- 5
nfixes <- 100
poss.tel <- which(s[,1]>0.2 & s[,1]<0.8 & s[,2]>0.2 & s[,2]<0.8)
stel.id <- sample(poss.tel,ntel)
stel <- s[stel.id,]
ytel <- matrix(NA,ntel,nrow(detcovs))
d <- e2dist(stel,detcovs[,1:2])
lam <- t(exp(1 - t((1/(2*sigma*sigma))*d*d) + as.vector(as.matrix(detcovs[,3:4])%*%alphas)))
for(i in 1:ntel){
ytel[i,] <- rmultinom(1,nfixes,lam[i,]/sum(lam[i,]))
}
Specify starting values and call mle2 function:
start1 <- list(alpha0=alpha0,sigma=sigma,alphas=alphas,betas=betas,n0=log(N-nrow(yscr)))
parnames(CSCR.RSF.intlik2.EXAMPLE) <- names(start1) # the original referenced a nonexistent object named start
out1 <- mle2(CSCR.RSF.intlik2.EXAMPLE,start=start1,method="SANN",optimizer="optim",
data=list(yscr=yscr,K=K,X=X,trapcovs=trapcovs,Gden=dencovs[,1:2],Gdet=detcovs[,1:2],
ytel=ytel,stel=stel,dencovs=as.matrix(dencovs[,3:4]),detcovs=as.matrix(detcovs[,3:4]))
)
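For what it's worth, a common bbmle workaround for vector-valued parameters is to flatten them into scalars, name every element, and call mle2 with vecpar=TRUE. This is a sketch, not a drop-in fix: the wrapper name nll.flat and the flattened parameter names are my own, and the data objects are captured from the global environment rather than passed via data=.
# Wrap the likelihood so it takes one flat, fully named parameter vector
nll.flat <- function(pars) {
  CSCR.RSF.intlik2.EXAMPLE(alpha0 = pars["alpha0"], sigma = pars["sigma"],
                           alphas = pars[c("alphas1","alphas2")],
                           betas  = pars[c("betas1","betas2")],
                           n0 = pars["n0"],
                           yscr = yscr, K = K, X = X, trapcovs = trapcovs,
                           Gden = as.matrix(dencovs[,1:2]), Gdet = as.matrix(detcovs[,1:2]),
                           ytel = ytel, stel = stel,
                           dencovs = as.matrix(dencovs[,3:4]), detcovs = as.matrix(detcovs[,3:4]))
}
start.vec <- c(alpha0 = alpha0, sigma = sigma,
               alphas1 = alphas[1], alphas2 = alphas[2],
               betas1 = betas[1], betas2 = betas[2],
               n0 = log(N - nrow(yscr)))
parnames(nll.flat) <- names(start.vec)
out1 <- mle2(nll.flat, start = start.vec, vecpar = TRUE,
             method = "SANN", optimizer = "optim")
This sidesteps the "some named arguments in 'start' are not arguments to the specified log-likelihood function" error, which arises because mle2 matches the names in start against the function's formal arguments one scalar at a time.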
Labeling data points in a plot can get unwieldy:
Randomly sampling a few labels may disappoint:
What would be a nice way to pick a small set of nicely-spaced data labels? That is, to randomly pick representatives whose labels do not overlap.
# demo data
set.seed(123)
N <- 50
x <- runif(N)
y <- x + rnorm(N, 0, x)
data <- data.frame(x, y, labels=state.name)
# plot with labels
plot(x,y)
text(x,y,labels)
# plot a few labels
frame()
few_labels <- data[sample(N, 10), ]
plot(x,y)
with(few_labels, text(x,y,labels))
One way to do it is through clustering. Here is a solution with stats::hclust. We agglomerate the data points into clusters and then pick one random observation from each cluster.
few_labels <- function(df, coord=1:ncol(df),grp=5){
require(dplyr)
df$cl <- cutree(hclust(dist(df[,coord])),grp)
few_labels <- df %>% group_by(cl) %>%
do(sample_n(.,1))
return(few_labels)
}
# demo data
set.seed(123)
N <- 50
x <- runif(N)
y <- x + rnorm(N, 0, x)
data <- data.frame(x, y, labels=state.name)
# plot a few labels
frame()
few_labels <- few_labels(data,coord=1:2,grp=12)
plot(x,y)
with(few_labels, text(x,y,labels))
For all labels:
xlims=c(-1,2)
plot(x,y,xlim=xlims)
#text(x,y,data$labels,pos = 2,cex=0.7)
library(plotrix)
spread.labels(x,y,data$labels,cex=0.7,ony=NA)
Another way is to pick a point at random, throw away all points too close to it, and repeat until no point is left:
radius <- .1 # of a ball containing the largest label
d <- as.matrix(dist(data[, c("x","y")], upper=TRUE, diag=TRUE))
remaining <- 1:N
spaced <- numeric()
while(length(remaining)>0) {
p <- ifelse(length(remaining)>1, sample(remaining, 1), remaining)
spaced <- c(spaced, p) # keep this representative
remaining <- setdiff(remaining, which(d[p, ] < 2*radius)) # drop everything within 2*radius of it
}
frame()
plot(x,y)
spaced_labels <- data[spaced, ]
with(spaced_labels, text(x,y,labels))
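One hedged refinement: radius is in user coordinates, so instead of hand-tuning it you can derive it from the widest label with strwidth(), which must be called with a plot already open:
plot(x, y) # open the device first so strwidth() measures in user coordinates
radius <- max(strwidth(data$labels)) / 2 # half the width of the largest label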
I would like to compute the convolution of two probability distributions in R and I need some help. For the sake of simplicity, let's say I have a variable x that is normally distributed with mean = 1.0 and stdev = 0.5, and y that is log-normally distributed with mean = 1.5 and stdev = 0.75. I want to determine z = x + y. I understand that the distribution of z is not known a priori.
As an aside, the real-world example I am working with requires adding two random variables that follow a number of different distributions.
Does anyone know how to add two random variables by convolving their probability density functions?
I have tried generating n normally distributed random values (with the above parameters) and adding them to n log-normally distributed random values. However, I wish to know whether I can use the convolution method instead. Any help would be greatly appreciated.
EDIT
Thank you for these answers. I defined a pdf and tried to do the convolution integral, but R complains at the integration step. My pdfs are log-Pearson type III, as follows:
dlp3 <- function(x, a, b, g) {
p1 <- 1/(x*abs(b) * gamma(a))
p2 <- ((log(x)-g)/b)^(a-1)
p3 <- exp(-1* (log(x)-g) / b)
d <- p1 * p2 * p3
return(d)
}
f.m <- function(x) dlp3(x,3.2594,-0.18218,0.53441)
f.s <- function(x) dlp3(x,9.5645,-0.07676,1.184)
f.t <- function(z) integrate(function(x,z) f.s(z-x)*f.m(x),-Inf,Inf,z)$value
f.t <- Vectorize(f.t)
integrate(f.t, lower = 0, upper = 3.6)
R complains at the last step since the f.t function is bounded and my integration limits are probably not correct. Any ideas on how to solve this?
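One likely culprit, assuming the dlp3 parameterization above: with b < 0, the base (log(x)-g)/b of the (a-1) power is only non-negative for 0 < x <= exp(g), so dlp3 returns NaN over most of (-Inf, Inf) and integrate() gives up. A sketch that restricts both integrals to the joint support:
# With b < 0, f.m lives on (0, exp(0.53441)] and f.s on (0, exp(1.184)]
g.m <- 0.53441; g.s <- 1.184
f.t <- function(z) {
  lo <- max(.Machine$double.eps, z - exp(g.s)) # f.s(z-x) needs 0 < z-x <= exp(g.s)
  hi <- min(exp(g.m), z)                       # f.m(x) needs 0 < x <= exp(g.m)
  if (lo >= hi) return(0)
  integrate(function(x) f.s(z - x) * f.m(x), lo, hi)$value
}
f.t <- Vectorize(f.t)
integrate(f.t, lower = 0, upper = exp(g.m) + exp(g.s))$value # should be close to 1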
Here is one way.
f.X <- function(x) dnorm(x,1,0.5) # normal (mu=1, sigma=0.5)
f.Y <- function(y) dlnorm(y,1.5, 0.75) # log-normal (mu=1.5, sigma=0.75)
# convolution integral
f.Z <- function(z) integrate(function(x,z) f.Y(z-x)*f.X(x),-Inf,Inf,z)$value
f.Z <- Vectorize(f.Z) # need to vectorize the resulting fn.
set.seed(1) # for reproducible example
X <- rnorm(1000,1,0.5)
Y <- rlnorm(1000,1.5,0.75)
Z <- X + Y
# compare the methods
hist(Z,freq=F,breaks=50, xlim=c(0,30))
z <- seq(0,50,0.01)
lines(z,f.Z(z),lty=2,col="red")
Same thing using package distr.
library(distr)
N <- Norm(mean=1, sd=0.5) # N is signature for normal dist
L <- Lnorm(meanlog=1.5,sdlog=0.75) # same for log-normal
conv <- convpow(L+N,1) # object of class AbscontDistribution
f.Z <- d(conv) # density function
hist(Z,freq=F,breaks=50, xlim=c(0,30))
z <- seq(0,50,0.01)
lines(z,f.Z(z),lty=2,col="red")
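Assuming the standard distr accessors, the same object also exposes the matching cdf, quantile function, and random sampler:
F.Z <- p(conv) # cdf: F.Z(5) gives P(Z <= 5)
Q.Z <- q(conv) # quantile function
r.Z <- r(conv) # random sampler, e.g. r.Z(1000)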
I was having trouble getting integrate() to work for different density parameters, so I came up with an alternative to @jlhoward's answer using Riemann approximation:
set.seed(1)
#densities to be convolved. could also put these in the function below
d1 <- function(x) dnorm(x,1,0.5) #
d2 <- function(y) dlnorm(y,1.5, 0.75)
#Riemann approximation of convolution
conv <- function(t, a, b, d) { #a to b needs to cover the range of densities above. d needs to be small for accurate approx.
z <- numeric(length(t))
x <- seq(a, b, d)
for (i in 1:length(t)){
z[i] <- sum(d1(x)*d2(t[i]-x)*d) # Riemann sum approximating the convolution at t[i]
}
return(z)
}
#check against sampled convolution
X <- rnorm(1000, 1, 0.5)
Y <- rlnorm(1000, 1.5, 0.75)
Z <- X + Y
t <- seq(0, 50, 0.05) #range to evaluate t, smaller increment -> smoother curve
hist(Z, breaks = 50, freq = F, xlim = c(0,30))
lines(t, conv(t, -100, 100, 0.1), type = "s", col = "red")