ggplot equivalent of gelman.plot MCMC diagnostic in r - r

I am creating a series of MCMC diagnostic plots in r using ggplot. I realize there is already a package available in gg for MCMC plotting, but much of this is for my own education as well as practical use. One thing I can't seem to figure out is how to generate the gelman.plot in a ggplot framework.
The gelman.diag function only returns a simple data point and I would like to recreate the complete running chart as shown in gelman.plot.
Is anyone familiar with the algorithmic structure of the gelman potential scale reduction factor and/or a means to port its output to ggplot?
Thank you!

You haven't provided a reproducible example, so I've used the example here. We need the object called combinedchains from that example. In order to avoid cluttering the answer, I've put the code for that at the end of this post.
Now we can run gelman.plot on combined.chains. This is the plot we want to duplicate:
library(coda)
gelman.plot(combined.chains)
To create a ggplot version, we need to get the data for the plot. I haven't done MCMC before, so I'm going to let gelman.plot generate the data for me. For your actual use case, you can probably just generate the appropriate data directly.
Let's look at what gelman.plot is doing: We can see the code for that function by typing the bare function name in the console. A portion of the function code is below. The ... show where I've removed sections of the original code for brevity. Note the call to gelman.preplot, with the output of that function stored in y. Note also that y is returned invisibly at the end. y is a list containing the data we need to create a gelman.plot in ggplot.
gelman.plot = function (x, bin.width = 10, max.bins = 50, confidence = 0.95,
transform = FALSE, autoburnin = TRUE, auto.layout = TRUE,
ask, col = 1:2, lty = 1:2, xlab = "last iteration in chain",
ylab = "shrink factor", type = "l", ...)
{
...
y <- gelman.preplot(x, bin.width = bin.width, max.bins = max.bins,
confidence = confidence, transform = transform, autoburnin = autoburnin)
...
return(invisible(y))
}
So, let's get the data that gelman.plot returns invisibly and store it in an object:
gp.dat = gelman.plot(combinedchains)
Now for the ggplot version. First, gp.dat is a list and we need to convert the various parts of that list into a single data frame that ggplot can use.
library(ggplot2)
library(dplyr)
library(reshape2)
df = data.frame(bind_rows(as.data.frame(gp.dat[["shrink"]][,,1]),
as.data.frame(gp.dat[["shrink"]][,,2])),
q=rep(dimnames(gp.dat[["shrink"]])[[3]], each=nrow(gp.dat[["shrink"]][,,1])),
last.iter=rep(gp.dat[["last.iter"]], length(gp.dat)))
For the plot, we'll melt df into long format, so that we can have each chain in a separate facet.
ggplot(melt(df, c("q","last.iter"), value.name="shrink_factor"),
aes(last.iter, shrink_factor, colour=q, linetype=q)) +
geom_hline(yintercept=1, colour="grey30", lwd=0.2) +
geom_line() +
facet_wrap(~variable, labeller= labeller(.cols=function(x) gsub("V", "Chain ", x))) +
labs(x="Last Iteration in Chain", y="Shrink Factor",
colour="Quantile", linetype="Quantile") +
scale_linetype_manual(values=c(2,1))
MCMC example code to create the combinedchains object (code copied from here):
trueA = 5
trueB = 0
trueSd = 10
sampleSize = 31
x = (-(sampleSize-1)/2):((sampleSize-1)/2)
y = trueA * x + trueB + rnorm(n=sampleSize,mean=0,sd=trueSd)
likelihood = function(param){
a = param[1]
b = param[2]
sd = param[3]
pred = a*x + b
singlelikelihoods = dnorm(y, mean = pred, sd = sd, log = T)
sumll = sum(singlelikelihoods)
return(sumll)
}
prior = function(param){
a = param[1]
b = param[2]
sd = param[3]
aprior = dunif(a, min=0, max=10, log = T)
bprior = dnorm(b, sd = 5, log = T)
sdprior = dunif(sd, min=0, max=30, log = T)
return(aprior+bprior+sdprior)
}
proposalfunction = function(param){
return(rnorm(3,mean = param, sd= c(0.1,0.5,0.3)))
}
run_metropolis_MCMC = function(startvalue, iterations) {
chain = array(dim = c(iterations+1,3))
chain[1,] = startvalue
for (i in 1:iterations) {
proposal = proposalfunction(chain[i,])
probab = exp(likelihood(proposal) + prior(proposal) - likelihood(chain[i,]) - prior(chain[i,]))
if (runif(1) < probab){
chain[i+1,] = proposal
}else{
chain[i+1,] = chain[i,]
}
}
return(mcmc(chain))
}
startvalue = c(4,2,8)
chain = run_metropolis_MCMC(startvalue, 10000)
chain2 = run_metropolis_MCMC(startvalue, 10000)
combinedchains = mcmc.list(chain, chain2)
UPDATE: gelman.preplot is an internal coda function that's not directly visible to users. To get the function code, in the console type getAnywhere(gelman.preplot). Then you can see what the function is doing and, if you wish, construct your own function to return the appropriate diagnostic data in a form more suitable for ggplot.

Related

Getting a list of points (in x, y form) in a Julia graph

If I have a Julia plot with any number of points, would it be possible for me to get a list of all of the data points within the graph (using the Plots library)?
EDIT: I am working with GeoStats.jl to create temporal variograms, and I just wanted to calculate the error (using RMSE and MAE) of the model's fit. To do this, I thought I had to compare the points within model's curve with the original semivariogram. The current code I have running is:
using GeoStats, Plots, DataFrames, CSV, Dates, MLJ
data_frame = CSV.read("C:/Users/VSCode/MINTS-Variograms/data/MINTS_001e06373996_IPS7100_2022_01_02.csv", DataFrame)
ms = [parse(Float64,x[20:26]) for x in data_frame[!,:dateTime]]
ms = string.(round.(ms,digits = 3)*1000)
ms = chop.(ms,tail= 2)
data_frame.dateTime = chop.(data_frame.dateTime,tail= 6)
data_frame.dateTime = data_frame.dateTime.* ms
data_frame.dateTime = DateTime.(data_frame.dateTime,"yyyy-mm-dd HH:MM:SS.sss")
ls_index = findall(x-> Millisecond(500)<x<Millisecond(1500), diff(data_frame.dateTime))
df = data_frame[ls_index, :]
#include calculation for average lag
#initialize georef data
𝒟 = georef((Z=df.pm2_5, ))
#empirical variogram - same thing as semivariogram
g = EmpiricalVariogram(𝒟, :Z, maxlag=300.)
plot(g, label = "")
γ = fit(Variogram, g)
plot!(γ, label = "")
hline!([γ.nugget], label = "")
hline!([γ.sill], label = "")
println("nugget: " * string(γ.nugget))
println("sill: " * string(γ.sill))

How to bootstrap the correlation via boot::boot() for multiple pairs of variables at the same time?

I have to calculate a lot of bootstrapped correlations (Pearson r). My knowledge of R (not to speak of writing my own functions) is limited. So far, I have only managed to calculate each bootstrapped correlation individually via boot::boot(), which is quite time intensive due to the high number of correlations.
How do I calculate several bootstrapped correlations at the same time?
Here is the code I've been working with successfully, i.e. calculating each correlation individually. This means I would have to repeat this code around 300 times, exchanging small parts of the code each time.
bootPearsonSZ <- function(data,i){
cor(BdSZ$ndh[i],BdSZ$nkr_erst[i], use = "complete.obs", method = "pearson") # BdSZ = name of the data tibble I'm working with
}
set.seed(1)
boot_PearsonSZ <- boot(BdSZ, bootPearsonSZ, 10000)
boot_PearsonSZ
mean(boot_PearsonSZ$t) #Shows me the bootstrapped value for Pearson r
boot.ci(boot.out = boot_PearsonSZ, type = "all", conf = 0.99) #Shows me the 99% conf. intervall
Here is the code I unsuccessfully used to calculate at least some of the correlations at once. The code is not working correctly: The output of boot() only shows me the correlation for the last line in my function, i.e. cor(BdSZ$ndh[i],BdSZ$azr[i], use = "complete.obs", method = "pearson")
bootPearsonSZ <- function(data,i){
cor(BdSZ$ndh[i],BdSZ$nkr_erst[i], use = "complete.obs", method = "pearson") # BdSZ = name of the data tibble I'm working with
cor(BdSZ$ndh[i],BdSZ$nkr_ge[i], use = "complete.obs", method = "pearson")
cor(BdSZ$ndh[i],BdSZ$nkr_an[i], use = "complete.obs", method = "pearson")
cor(BdSZ$ndh[i],BdSZ$azr[i], use = "complete.obs", method = "pearson") #Apparently only the last line of code will be used by boot()
}
set.seed(1)
boot_PearsonSZ <- boot(BdSZ, bootPearsonSZ, 10000)
boot_PearsonSZ
mean(boot_PearsonSZ$t)
boot.ci(boot.out = boot_PearsonSZ, type = "all", conf = 0.99)
Additional info, that might be relevant to answering my question:
I have both cross-sectional and longitudinal date. I want to calculate the correlations for 4x7 = 28 pairs of variables.
For the cross-sectional part of my study I have to calculate them for 3 city districts + all districts together, which leads me to perform 28x4 = 112 correlations.
For the longitudinal data I have one district but 7 years (+ all years together), which leads me to perform 28x(7+1) = 224 correlations.
Before calculating the correlations, I currently create a subset of my tibble each time, which only contains the district or year for which I want to calculate the bootstrapped correlation. Maybe there is a possibility to work around this by using the subsetting within the function I have written (and thus making it simpler)?
I'm very gratefull for any sort of help!
EDIT: Added reproducible example as asked for by #stephan-kolassa:
library(boot)
library(tidyr)
library(faux)
IndependentVariables <- rnorm_multi(n = 30,
mu = c(100, 100, 100, 100, 100, 100, 100),
sd = c(10, 10, 10, 10, 10, 10, 10),
r = 0.25,
varnames = c("IV1", "IV2", "IV3", "IV4", "IV5", "IV6", "IV7"),
empirical = FALSE)
DependentVariable <- rnorm_multi(n = 30,
mu = c(100, 100, 100, 100),
sd = c(10, 10, 10, 10),
r = 0.6,
varnames = c("DV1", "DV2", "DV3", "DV4"),
empirical = FALSE)
ID <- c(1:30)
mydata <- cbind(ID, IndependentVariables, DependentVariable)
bootPearson <- function(data,i){
cor(mydata$DV1[i],mydata$IV1[i], use = "complete.obs", method = "pearson")
cor(mydata$DV1[i],mydata$IV2[i], use = "complete.obs", method = "pearson")
cor(mydata$DV1[i],mydata$IV3[i], use = "complete.obs", method = "pearson")
cor(mydata$DV1[i],mydata$IV4[i], use = "complete.obs", method = "pearson")
cor(mydata$DV1[i],mydata$IV5[i], use = "complete.obs", method = "pearson")
cor(mydata$DV1[i],mydata$IV6[i], use = "complete.obs", method = "pearson")
cor(mydata$DV1[i],mydata$IV7[i], use = "complete.obs", method = "pearson")
}
set.seed(1)
boot_Pearson <- boot(mydata, bootPearson, 2000)
boot_Pearson
mean(boot_Pearson$t) #Shows me the bootstrapped value for Pearson r
boot.ci(boot.out = boot_Pearson, type = "all", conf = 0.99) #Shows me the 99% conf. intervall
Your bootPearson() function does not do what you presumably want it to do. Right now, it calculates seven different correlations but only returns the last one - everything else gets calculated and discarded. In R, functions only return the last result created in the function body. You may want to read up on how R functions work.
The solution is simple: just change bootPearson() to create and return a single object - namely, a vector of length 7 that contains the seven correlations you calculate. Concatenate them into one vector using the c() command:
bootPearson <- function(data,i){
c(cor(mydata$DV1[i],mydata$IV1[i], use = "complete.obs", method = "pearson"),
cor(mydata$DV1[i],mydata$IV2[i], use = "complete.obs", method = "pearson"),
cor(mydata$DV1[i],mydata$IV3[i], use = "complete.obs", method = "pearson"),
cor(mydata$DV1[i],mydata$IV4[i], use = "complete.obs", method = "pearson"),
cor(mydata$DV1[i],mydata$IV5[i], use = "complete.obs", method = "pearson"),
cor(mydata$DV1[i],mydata$IV6[i], use = "complete.obs", method = "pearson"),
cor(mydata$DV1[i],mydata$IV7[i], use = "complete.obs", method = "pearson"))
}
Of course, you can now also loop over your DVs and IVs within this function and fill the results vector (using a counter to point at the correct entry) - no need to copy 28 almost identical lines.
bootPearson <- function(data,i){
result <- rep(NA,28)
pointer <- 1
for ( iv in 1:7 ) {
for ( dv in 1:4 ) {
result[pointer] <- cor(mydata[i,iv+1],mydata[i,dv+8], use = "complete.obs", method = "pearson")
pointer <- pointer+1
}
}
result
}
Note how the final result makes the function return the entire vector.

Generate missing value in dataset using `ampute` function from `mice` library in R

I originally posted this on CrossValidated but now realize this website is more appropriate for my question.
Following this link, I am trying to generate missing values to simulate a real-world situation
First I generated explanatory variables using the following code:
n = 50
x1 = rnorm(n,mean = 0,sd = 1)
x2 = rnorm(n,mean = 0,sd = 1)
x3 = rnorm(n,mean = 0,sd = 1)
x4 = rnorm(n,mean = 0,sd = 1)
Then I generate the responsive variable by the following code.
z = -1 + .5*x1 + .5*x2 + .5*x3 + .5*x4 + rnorm(1,0,0.1)
pr = 1/(1+exp(z)) # pass through an inv-logit function
y = rbinom(n,1,pr) # bernoulli response variable
data_mat <- as.data.frame(cbind(x1,x2,x3,x4,y))
I am trying to use the ampute function from the mice library to generate missing data based on the binary response variable.
The missing not at random case I would like to generate is as follows: when Y = 0, the independent variables are four times more likely to have missing data than the independent variables when Y = 0.
Can this be done using ampute function or is there an alternative way? Thank you.

Calculating and indexing mcmc chains in coda

There are two things I need to do. Firstly I would like to be able to create new variables in a coda mcmc object that have been calculated from existing variables so that I can run chain diagnostics on the new variable. Secondly I would like to be able to index single variables in some of the coda plot functions while still viewing all chains.
Toy data. Bayesian t-test on the sleep data using JAGS and rjags.
data(sleep)
# read in data
y <- sleep$extra
x <- as.numeric(as.factor(sleep$group))
nTotal <- length(y)
nGroup <- length(unique(x))
mY <- mean(y)
sdY <- sd(y)
# make dataList
dataList <- list(y = y, x = x, nTotal = nTotal, nGroup = nGroup, mY = mY, sdY = sdY)
# model string
modelString <- "
model{
for (oIdx in 1:nTotal) {
y[oIdx] ~ dnorm(mu[x[oIdx]], 1/sigma[x[oIdx]]^2)
}
for (gIdx in 1:nGroup) {
mu[gIdx] ~ dnorm(mY, 1/sdY)
sigma[gIdx] ~ dunif(sdY/10, sdY*10)
}
}
"
writeLines(modelString, con = "tempModel.txt")
# chains
# 1. adapt
jagsModel <- jags.model(file = "tempModel.txt",
data = dataList,
n.chains = 3,
n.adapt = 1000)
# 2. burn-in
update(jagsModel, n.iter = 1000)
# 3. generate
codaSamples <- coda.samples(model = jagsModel,
variable.names = c("mu", "sigma"),
thin = 15,
n.iter = 10000*15/3)
Problem one
If I convert the coda object to a dataframe I can calculate the difference between the estimates for the two groups and plot this new variable, like so...
df <- as.data.frame(as.matrix(codaSamples))
names(df) <- gsub("\\[|\\]", "", names(df), perl = T) # remove brackets
df$diff <- df$mu1 - df$mu2
ggplot(df, aes(x = diff)) +
geom_histogram(bins = 100, fill = "skyblue") +
geom_vline(xintercept = mean(df$diff), colour = "red", size = 1, linetype = "dashed")
...but how do I get a traceplot? I can get one for existing variables within the coda object like so...
traceplot(codaSamples[[1]][,1])
...but I would like to be able to get them for the the new diff variable.
Problem Two
Which brings me to the second problem. I would like to be able to get a traceplot (among other things) for individual variables. As I have shown above I can get them for a single variable if I only want to see one chain but I'd like to see all chains. I can see all chains for all variables in the model with the simple
plot(codaSamples)
...but what if I don't want or need to see all variables? What if I just want to see the trace and/or desnity plots for one, or even two, variables (but not all variables) but with all chains in the plot?

R - Splitting Data, regression and applying equation to new split data set

I have a large data set that has older and newer data. I created two data frames, EarlyYears with the older data and LaterYears with the new data, so they have the same columns.
What I want to do is regress the data from Early years to determine an equation and apply it to the Later Years to test the equation's strength - A and B are constants, Input is what I am testing - I change it for different runs of the code - and Dummy is 1 is there is no data for the input. However, I want to split both the EarlyYears and LaterYears data by quintiles of one of the variables, and apply the equation found in quintile 1 of EarlyYears to data from LaterYears that is in quintile 1. I am fairly new at R, and so far have:
Model<-data.frame(Date = rep(c("3/31/09","3/31/11"),each = 20),
InputRating = rep(c(1:5), 8), Dummy = rep(c(rep(0,9),1),4),
Y = rep(1,3,5,7,11,13,17,19), A = 1:40,B = 1:40*3+7)
newer<-as.numeric(grep("/11",Model$Date))
later<-as.numeric(grep("/11",Model$Date,invert = TRUE))
LaterYears<-Model[newer,]
EarlyYears<-Model[later,]
newModel<-EarlyYears
DataSet.Input<-data.frame(Date = newModel$Date, InputRating = newModel$InputRating,
Dummy = newModel$Dummy, Y = newModel$Y, A = newModel$A,B = newModel$B)
quintiles<-quantile(DataSet.Input$A,probs=c(0.2,0.4,0.6, 0.8, 1.0))
VarQuint<-findInterval(DataSet.Input$A,quintiles,rightmost.closed=TRUE)+1L
regressionData<-do.call(rbind,lapply(split(DataSet.Input,VarQuint),
FUN = function(SplitData) {
SplitRegression<-lm(Y ~ A + B + InputRating + Dummy, data = SplitData, na.action = na.omit)
c(coef.Intercept = coef(summary(SplitRegression))[1],
coef.A = coef(summary(SplitRegression))[2],
coef.B = coef(summary(SplitRegression))[3],
coef.Input = coef(summary(SplitRegression))[4],
coef.Dummy= coef(summary(SplitRegression))[5])
}))
i = 0
quintiles.LY<-quantile(LaterYears$A,probs=c(0.2,0.4,0.6, 0.8, 1.0))
Quint.LY<-findInterval(LaterYears$A,quintiles,rightmost.closed=TRUE)+1L
LaterYears$ExpectedValue <-apply(split(LaterYears,Quint.LY),1,
FUN = function(SplitData) {
i=i+1
regressionData[i,1]+regressionData[i,2]*SplitData$A +
regressionData[i,3]*SplitData$B + regressionData[i,4]*SplitData$Input +
regressionData[i,5]*SplitData$Dummy
})
The first part works great to get the data in regressionData. I want this results of applying the equation to be held in a column within the LaterYears dataset, but I get an error -
Error in apply(split(LaterYears, Quint.LY), 1, FUN = function(SplitData) { :
dim(X) must have a positive length
when running this with apply, and blank when running with lapply which is what I originally tried.
Any help with how to fix this would be greatly appreciated!
Thanks!
Perhaps something like this, using predict would be better. It doesn't work very well for your example data but it may work on the real data.
# by, splits a dataset by a factor
regressionData <- by(DataSet.Input,VarQuint,
function(d) {
lm1 <- lm(Y ~ A + B + InputRating + Dummy, d)
})
quintiles.LY<-quantile(LaterYears$A,probs=seq(0,1,0.2))
Quint.LY<-findInterval(LaterYears$A,quintiles,rightmost.closed=TRUE)+1L
LaterYearsPredict <- split(LaterYears,Quint.LY)
# lapply's arguments can be anything that is a sequence
LaterYears$ExpectedValue <- unlist(lapply(1:length(LaterYearsPredict),
function(x)
predict(regressionData[[x]],LaterYearsPredict[[x]])
))

Resources