################################################################
# Code to fit a basic MDP model
#
# Be warned that this is bad code. I've tried to keep the code
# understandable, rather than efficient.
################################################################
################################################################
#
# Part 1. A description of the model
#
# The model is a conjugate, normal-normal model
# Variances are treated as known throughout
#
# The model is the basic model discussed in class
# mu ~ N(mu_0, rho^2)
# F | mu ~ DP(M F_0); F_0 = N(mu, tau^2)
# theta_i | F ~ F
# X_i | theta_i ~ N(theta_i, sigma^2)
#
################################################################
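################################################################
#
# (Aside) A quick forward simulation from the model can help fix
# ideas and gives fake data for testing the code below. This is
# only a sketch; the constants (n = 50, mu_0 = 180, rho = 20,
# M = 20, tau = 15, sigma = 10) are illustrative, not the values
# used in the analyses later in the file.
#
################################################################
sim.n <- 50
sim.mu <- rnorm(1,mean=180,sd=20)                  # mu ~ N(mu_0, rho^2)
sim.theta <- rep(0,sim.n)
sim.theta[1] <- rnorm(1,mean=sim.mu,sd=15)         # first theta drawn from F_0
for (i in 2:sim.n)                                 # Polya urn draws from F ~ DP(M F_0)
{
  if (runif(1) < 20/(20 + i - 1))
    sim.theta[i] <- rnorm(1,mean=sim.mu,sd=15)     # new value from F_0
  else
    sim.theta[i] <- sim.theta[sample(i-1,size=1)]  # copy a previous theta
}
sim.x <- rnorm(sim.n,mean=sim.theta,sd=10)         # X_i | theta_i ~ N(theta_i, sigma^2)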
################################################################
#
# Part 2. Some useful objects
#
# First, we'll need information on the clusters
#
# clust$s            cluster membership (s[i] = j means obs. i is in cluster j)
# clust$k            current number of clusters
# clust$n.i          cluster sizes (a vector of length k)
# clust$theta.star   the distinct cluster values (a vector of length k)
#
# It's also good to have the data stored as an object--overkill
# in this case, as the data consist of little more than the vector, x
#
# data$x   the vector of observations
# data$n   the number of observations
#
# The parameters governing likelihood and prior are also essential
#
# prior$mu.0     prior mean for mu
# prior$rho.sq   prior variance for mu
# prior$M        mass parameter of the Dirichlet process
# prior$tau.sq   variance of the base measure F_0
# prior$sig.sq   sampling variance of X_i given theta_i
#
# Also, we'll track the value of mu as the Gibbs sampler progresses
#
# mu
################################################################
################################################################
#
# The real bit of the code -- functions for a variety of small tasks
# first, then a function for a complete iterate of the Gibbs sampler,
# and finally, a function for a small simulation
################################################################
################################################################
#
# Function 1. A function to remove theta_i from the cluster
# structure
#
# Note the silly "dump" of clust into variables of the same
# names at the beginning of the function and the collection
# of the variables at the end. Compare the readability of
# this to later functions without the dump and collect.
#
################################################################
fn.remove.theta.i <- function(i,clust)
{
  s <- clust$s
  k <- clust$k
  n.i <- clust$n.i
  theta.star <- clust$theta.star
  tmp <- s[i]
  if (n.i[tmp] > 1)
    n.i[tmp] <- n.i[tmp] - 1
  else
  {
    # cluster tmp becomes empty; relabel cluster k as tmp and drop the last slot
    n.i[tmp] <- n.i[k]
    n.i <- n.i[-k]
    s[s==k] <- tmp
    theta.star[tmp] <- theta.star[k]
    theta.star <- theta.star[-k]
    k <- k - 1
  }
  s[i] <- 0
  clust$s <- s
  clust$k <- k
  clust$n.i <- n.i
  clust$theta.star <- theta.star
  return(clust)
}
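################################################################
#
# A quick sanity check of Function 1 on a tiny, made-up cluster
# structure (not part of the analyses below). After removing
# observation i, we should have s[i] == 0, sum(n.i) == n - 1,
# and length(theta.star) == k.
#
################################################################
tst.clust <- NULL
tst.clust$s <- c(1,2,2,3)
tst.clust$k <- 3
tst.clust$n.i <- c(1,2,1)
tst.clust$theta.star <- c(-1,0,1)
tst.clust <- fn.remove.theta.i(1,tst.clust)   # removes a singleton cluster
c(tst.clust$s[1]==0, sum(tst.clust$n.i)==3,
  length(tst.clust$theta.star)==tst.clust$k, tst.clust$k==2)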
################################################################
#
# Function 2. A function to generate theta_i for the cluster
# structure
#
# Tasks: Compute probabilities of joining cluster, beginning
# new cluster
# Generate cluster membership
# Update s and n_i
# Update k and theta_star if needed
#
################################################################
fn.gen.theta.i <- function(i,clust,prior,mu,data)
{
  # weights: existing cluster sizes, and M for a new cluster
  prb <- c(clust$n.i,prior$M)
  for (j in 1:clust$k)
  {
    # joining cluster j: weight times the likelihood at theta.star[j]
    tmp.m <- clust$theta.star[j]
    tmp.v <- prior$sig.sq
    prb[j] <- prb[j] * dnorm(data$x[i],mean=tmp.m,sd=sqrt(tmp.v))
  }
  # starting a new cluster: weight times the marginal likelihood under F_0
  prb[clust$k + 1] <- prb[clust$k + 1] *
    dnorm(data$x[i], mean=mu, sd=sqrt(prior$tau.sq + prior$sig.sq))
  tmp <- sample(1:(clust$k+1),size=1,prob=prb)
  if (tmp > clust$k)
  {
    # new cluster: update s, k, n.i, and draw its theta.star from the
    # conditional posterior given x_i alone
    clust$s[i] <- tmp
    clust$k <- clust$k + 1
    clust$n.i <- c(clust$n.i,1)
    tmp.m <- ((1/prior$sig.sq)*data$x[i] + (1/prior$tau.sq)*mu) /
      ((1/prior$sig.sq) + (1/prior$tau.sq))
    tmp.v <- 1/((1/prior$sig.sq) + (1/prior$tau.sq))
    tmp <- rnorm(n=1,mean=tmp.m,sd=sqrt(tmp.v))
    clust$theta.star <- c(clust$theta.star,tmp)
  }
  else
  {
    # existing cluster: update s and n.i
    clust$s[i] <- tmp
    clust$n.i[tmp] <- clust$n.i[tmp] + 1
  }
  return(clust)
}
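################################################################
#
# A quick check of Function 2, again on a small made-up example
# (only a sketch): remove observation 1, regenerate its cluster
# membership many times, and compare the empirical frequencies
# of s[1] to the probabilities the function uses.
#
################################################################
chk.prior <- NULL
chk.prior$M <- 1
chk.prior$tau.sq <- 1
chk.prior$sig.sq <- 1
chk.data <- NULL
chk.data$x <- c(0.1,-0.2,0.3,2.0)
chk.data$n <- 4
chk.s1 <- rep(0,1000)
for (r in 1:1000)
{
  chk.clust <- NULL
  chk.clust$s <- c(1,1,1,2)
  chk.clust$k <- 2
  chk.clust$n.i <- c(3,1)
  chk.clust$theta.star <- c(0,2)
  chk.clust <- fn.remove.theta.i(1,chk.clust)
  chk.clust <- fn.gen.theta.i(1,chk.clust,chk.prior,0,chk.data)
  chk.s1[r] <- chk.clust$s[1]
}
table(chk.s1)/1000
# Compare to the normalized probabilities for clusters 1, 2, and a new one
chk.prb <- c(2*dnorm(0.1,0,1), dnorm(0.1,2,1), dnorm(0.1,0,sqrt(2)))
chk.prb/sum(chk.prb)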
################################################################
#
# Function 3. A function to generate theta_star
#
# Tasks: Loop through i = 1, ..., k
# Find cond'l posterior distribution for theta_star[i]
# Generate theta_star[i]
#
################################################################
fn.gen.theta.star <- function(clust,prior,mu,data)
{
  for (i in 1:clust$k)
  {
    tmp.m <- ((clust$n.i[i]/prior$sig.sq)*mean(data$x[clust$s==i]) +
                (1/prior$tau.sq)*mu) /
      ((clust$n.i[i]/prior$sig.sq) + (1/prior$tau.sq))
    tmp.v <- 1/((clust$n.i[i]/prior$sig.sq) + (1/prior$tau.sq))
    clust$theta.star[i] <- rnorm(n=1,mean=tmp.m,sd=sqrt(tmp.v))
  }
  return(clust)
}
################################################################
#
# Function 4. A function to generate mu
#
# Tasks: Find cond'l posterior distribution for mu
# Generate mu
#
################################################################
fn.gen.mu <- function(clust,prior)
{
  tmp.m <- ((clust$k/prior$tau.sq)*mean(clust$theta.star) +
              (1 / prior$rho.sq) * prior$mu.0) /
    ((clust$k/prior$tau.sq) + (1/prior$rho.sq))
  tmp.v <- 1 / ((clust$k/prior$tau.sq) + (1/prior$rho.sq))
  mu <- rnorm(n=1,mean=tmp.m,sd=sqrt(tmp.v))
  return(mu)
}
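################################################################
#
# A quick check of Function 4, in the spirit of the diagnostics
# described later in the file: with a fixed cluster structure,
# repeated draws of mu should match the known conditional
# posterior. A sketch with illustrative values:
#
################################################################
chk2.clust <- NULL
chk2.clust$k <- 3
chk2.clust$theta.star <- c(170,185,200)
chk2.prior <- NULL
chk2.prior$mu.0 <- 180
chk2.prior$rho.sq <- 400
chk2.prior$tau.sq <- 225
chk2.mu <- rep(0,5000)
for (r in 1:5000) chk2.mu[r] <- fn.gen.mu(chk2.clust,chk2.prior)
# Compare simulated moments to the closed-form mean and variance
c(mean(chk2.mu), ((3/225)*185 + (1/400)*180)/((3/225) + (1/400)))
c(var(chk2.mu), 1/((3/225) + (1/400)))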
################################################################
#
# Function 5. One iterate of the Gibbs sampler
#
# Tasks: Generate each theta_i in turn
# Generate theta_star
# Generate mu
#
################################################################
fn.one.iterate <- function(clust,prior,mu,data)
{
  for (i in 1:data$n)
  {
    clust <- fn.remove.theta.i(i,clust)
    clust <- fn.gen.theta.i(i,clust,prior,mu,data)
  }
  clust <- fn.gen.theta.star(clust,prior,mu,data)
  mu <- fn.gen.mu(clust,prior)
  ret.obj <- NULL
  ret.obj$clust <- clust
  ret.obj$prior <- prior
  ret.obj$mu <- mu
  return(ret.obj)
}
################################################################
#
# Function 6. A brief Gibbs sampler
#
# Tasks: Set up object (here, matrix) to store results
# Run one iterate of Gibbs sampler
# Tally results
#
# Improvements for you to make:
# Burn-in -- allow explicit description of burn-in to be
# discarded
# Subsampling -- Not to be done unless storage issues are
# important. But, allow subsampling of the
# output
# Initialization -- an automated initialization routine.
# Best to allow a couple of options for
# the initialization.
#
################################################################
fn.gibbs.sampler <- function(n.reps,prior,data,clust,mu)
{
  # Insert initialization routine if desired
  # Insert burn-in period if desired
  res.mat <- matrix(rep(0,n.reps*(data$n+1)),nrow=n.reps)
  for (i in 1:n.reps)
  {
    tmp <- fn.one.iterate(clust,prior,mu,data)
    clust <- tmp$clust
    mu <- tmp$mu
    res.mat[i,] <- c(mu,clust$theta.star[clust$s])
    # print(clust$k)
  }
  return(res.mat)
}
# A run of the code is as simple as this--once the called
# objects are set up.
n.reps <- 1000
res.mat <- fn.gibbs.sampler(n.reps,prior,data,clust,mu)
################################################################
#
# As with any code, diagnostics are essential. For one
# example, one could jot down the code below. Set up
# a cluster structure in "bb", along with prior, mu,
# and data. Run the routine fn.gen.theta.star a bunch
# of times. For this normal-normal model (for each theta_star[i]),
# you know the form of the posterior. Check mean and
# variance of each column against known values.
#
# Change mu and repeat
# Change sig.sq and repeat
# Change tau.sq and repeat
#
################################################################
tmp.mat <- matrix(rep(0,4000),ncol=4)
for (i in 1:1000)
{tmp.mat[i,] <- fn.gen.theta.star(bb,prior,mu,data)$theta.star}
apply(tmp.mat,2,mean)
apply(tmp.mat,2,var)
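# The "known values" for the check above can be computed directly
# (assuming bb, prior, mu, and data are the objects set up as
# described): conditionally, theta_star[i] is normal with the mean
# and variance below.
post.m <- rep(0,bb$k)
post.v <- rep(0,bb$k)
for (i in 1:bb$k)
{
  post.m[i] <- ((bb$n.i[i]/prior$sig.sq)*mean(data$x[bb$s==i]) +
                  (1/prior$tau.sq)*mu) /
    ((bb$n.i[i]/prior$sig.sq) + (1/prior$tau.sq))
  post.v[i] <- 1/((bb$n.i[i]/prior$sig.sq) + (1/prior$tau.sq))
}
post.m
post.v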
################################################################
#
# Two data sets. For both data sets, the observations are weights
# (in pounds). The first data set is available at statlib (on the
# web at http://lib.stat.cmu.edu/datasets/), where you can find it
# under bodyfat. The second is from the 2008-09 Buckeye football
# roster.
#
# We'll look at a basic compound decision problem for both data
# sets. Eliciting a prior distribution for weights, I went with
# the following values:
#   mu.0 = 180, rho.sq = 400    (center is 180, give or take 20)
#   M = 20
#   tau.sq = 225, sig.sq = 100  (together, tau.sq + sig.sq give an sd of approx 18)
#
# Initially, each observation in its own cluster
# k = data.n, s = 1:k, n.i=rep(1,k), theta.star = data.x
#
################################################################
# create appropriate data object
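# For example (only a sketch -- it assumes the 252 bodyfat weights
# have already been read into a numeric vector called wt1):
# data1 <- NULL
# data1$x <- wt1
# data1$n <- length(data1$x)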
prior1 <- NULL
prior1$mu.0 <- 180
prior1$rho.sq <- 400
prior1$M <- 20
prior1$tau.sq <- 225
prior1$sig.sq <- 100
clust1 <- NULL
clust1$k <- data1$n
clust1$s <- 1:data1$n
clust1$n.i <- rep(1,data1$n)
clust1$theta.star <- data1$x
mu1 <- prior1$mu.0
# for data set 2, listed in object data2
prior2 <- prior1
clust2 <- NULL
clust2$k <- data2$n
clust2$s <- 1:data2$n
clust2$n.i <- rep(1,data2$n)
clust2$theta.star <- data2$x
mu2 <- prior2$mu.0
# run the code
n.reps <- 10000
date()
res.mat1 <- fn.gibbs.sampler(n.reps,prior1,data1,clust1,mu1)
date()
# 39 minutes on my laptop, n=252
n.reps <- 10000
date()
res.mat2 <- fn.gibbs.sampler(n.reps,prior2,data2,clust2,mu2)
date()
# 12 minutes on my laptop, n=112
################################################################
#
# A second analysis removes the distribution over base measures.
# To make the new model more comparable to the old one, we might
# match the prior predictive distributions. To do this, we set
# mu = mu.0 and sweep the variation from rho.sq into tau.sq.
# We also use a routine that does not update mu.
#
# The prior distribution
#   mu = 180                    (mu is fixed at the center, 180)
#   M = 20, tau.sq = 625
#   sig.sq = 100
#
# Initially, each observation in its own cluster
# k = data.n, s = 1:k, n.i=rep(1,k), theta.star = data.x
#
################################################################
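################################################################
#
# A quick check of the prior predictive match described above:
# under the hierarchical prior, a single new X has mean mu.0 = 180
# and variance rho.sq + tau.sq + sig.sq = 400 + 225 + 100 = 725;
# with mu fixed at 180, the mean is again 180 and the variance is
# tau.sq + sig.sq = 625 + 100 = 725. The marginal prior predictive
# for one observation is therefore matched (though the joint
# predictive for several observations is not identical).
#
################################################################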
################################################################
#
# Function 5'. One iterate of the Gibbs sampler, no hierarchy
#
# Tasks: Generate each theta_i in turn
# Generate theta_star
#
################################################################
fn.one.iterate.nohier <- function(clust,prior,mu,data)
{
  for (i in 1:data$n)
  {
    clust <- fn.remove.theta.i(i,clust)
    clust <- fn.gen.theta.i(i,clust,prior,mu,data)
  }
  clust <- fn.gen.theta.star(clust,prior,mu,data)
  ret.obj <- NULL
  ret.obj$clust <- clust
  ret.obj$prior <- prior
  return(ret.obj)
}
################################################################
#
# Function 6'. A brief Gibbs sampler, no hierarchy
#
# Tasks: Set up object (here, matrix) to store results
# Run one iterate of Gibbs sampler
# Tally results
#
# Improvements for you to make:
# Burn-in -- allow explicit description of burn-in to be
# discarded
# Subsampling -- Not to be done unless storage issues are
# important. But, allow subsampling of the
# output
# Initialization -- an automated initialization routine.
# Best to allow a couple of options for
# the initialization.
#
################################################################
fn.gibbs.sampler.nohier <- function(n.reps,prior,data,clust,mu)
{
  # Insert initialization routine if desired
  # Insert burn-in period if desired
  res.mat <- matrix(rep(0,n.reps*(data$n+1)),nrow=n.reps)
  for (i in 1:n.reps)
  {
    tmp <- fn.one.iterate.nohier(clust,prior,mu,data)
    clust <- tmp$clust
    res.mat[i,] <- c(mu,clust$theta.star[clust$s])
    # print(clust$k)
  }
  return(res.mat)
}
# Sets up the objects for the analysis ...3 is for the
# bodyfat data set, ...4 is for the Buckeye data set.
prior3 <- prior1
prior3$tau.sq <- 625
clust3 <- clust1
mu3 <- prior3$mu.0
data3 <- data1
prior4 <- prior2
prior4$tau.sq <- 625
clust4 <- clust2
mu4 <- prior4$mu.0
data4 <- data2
# run the code
n.reps <- 10000
date()
res.mat3 <- fn.gibbs.sampler.nohier(n.reps,prior3,data3,clust3,mu3)
date()
# 42 minutes on my laptop, n=252
n.reps <- 10000
date()
res.mat4 <- fn.gibbs.sampler.nohier(n.reps,prior4,data4,clust4,mu4)
date()
# 10 minutes on my laptop, n=112
################################################################
#
# Diagnostics are an essential part of MCMC. The primary questions
# are whether the sampler has converged (i.e., after some burn-in
# period, are the draws from a very close approximation to the
# posterior distribution) and how well the sampler mixes (i.e., is
# the Markov chain moving around the parameter space quickly enough).
#
# As in other areas of Statistics, diagnostics are largely an art.
# A few solid plots and summaries are these:
# (i) time series plots of parameters--use points, not lines
# (ii) plots of one parameter against another
# (iii) a sequence of density estimates for a parameter
# (iv) autocorrelations for the parameters
#
################################################################
# As a brief example, the following addresses convergence/mixing
# for the first example. The plots are for mu, followed by
# theta_1 ... theta_15. A more complete look would examine the
# rest of the thetas as well. Look at these plots with a "full screen"
# graphics device.
# tmp indexes the iterates to plot; here we simply use all of them
tmp <- 1:n.reps
par(mfrow=c(4,4))
for (i in 1:16) plot(tmp,res.mat1[tmp,i])
# Note the strange behavior for theta_1. Convergence and mixing
# appear to hold, but there is a "gap" in the small range of values
# for theta_1. Pursue this for an interesting look at the distribution.
# What accounts for the phenomenon?
par(mfrow=c(1,1))
plot(density(res.mat1[,2]))
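# Autocorrelations (diagnostic (iv) above) for mu and theta_1;
# low autocorrelation suggests the chain mixes reasonably well
acf(res.mat1[,1])
acf(res.mat1[,2])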
# One might also track the number of clusters amongst the theta_i
tmp.vec <- rep(0,10000)
for (i in 1:10000)
{tmp.vec[i] <- length(unique(res.mat1[i,]))-1}
plot(tmp,tmp.vec)
# Note: This is a bad way to look at the number of clusters.
# Part of good diagnostics is planning. Since we know we'll want to
# look at this, what is a better way to set up and perform this
# diagnostic?
# Further diagnostics should be done. Think hard about whether
# or not you're getting the kind of MCMC output that should be
# used for estimation. If not, it's back to the drawing board.
################################################################
#
# Exploration of the posterior distribution and estimation are the
# two main purposes of MCMC when used to fit a Bayesian model.
# Setting exploration aside, one might use basic estimators. For
# the compound decision problem, our primary interest is in each
# of the theta_i. We may also be interested in a new theta for
# which we have no data, or in approximating the decision rule for
# theta_hat given x.
#
# A nonparametric approach to the compound decision problem opens
# up more possibilities. As with theta_1 above, we may find that
# the posterior distribution for a parameter is distinctly non-normal
# (while for other parameters, normality provides a good approximation).
# Is it worthwhile to collect more information on a particular theta_i?
# With greater variety in the posterior distributions, we expect
# greater variety in our decisions about how valuable further data
# is, and hence about which further data to collect.
#
################################################################
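# One simple way to see the "greater variety" mentioned above is to
# compare posterior spreads across the theta_i -- e.g., central 95%
# intervals from the MCMC output (a sketch; columns 2:253 of
# res.mat1 hold the theta_i for the first data set):
theta.int1 <- apply(res.mat1[,2:253],2,quantile,probs=c(0.025,0.975))
plot(data1$x, theta.int1[2,]-theta.int1[1,])   # interval width against x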
par(mfrow=c(2,2))
theta.hat1 <- apply(res.mat1,2,mean)
plot(data1$x,theta.hat1[2:253])
lines(c(150,300),c(150,300))
theta.hat2 <- apply(res.mat2,2,mean)
plot(data2$x,theta.hat2[2:113])
lines(c(150,300),c(150,300))
theta.hat3 <- apply(res.mat3,2,mean)
plot(data3$x,theta.hat3[2:253])
lines(c(150,300),c(150,300))
theta.hat4 <- apply(res.mat4,2,mean)
plot(data4$x,theta.hat4[2:113])
lines(c(150,300),c(150,300))
theta.hat.new1 <- (prior1$M * theta.hat1[1] + sum(theta.hat1[2:253])) /
(prior1$M + data1$n)
theta.hat.new2 <- (prior2$M * theta.hat2[1] + sum(theta.hat2[2:113])) /
(prior2$M + data2$n)
theta.hat.new3 <- (prior3$M * theta.hat3[1] + sum(theta.hat3[2:253])) /
(prior3$M + data3$n)
theta.hat.new4 <- (prior4$M * theta.hat4[1] + sum(theta.hat4[2:113])) /
(prior4$M + data4$n)
theta.hat.new1
theta.hat.new2
theta.hat.new3
theta.hat.new4
# Values from one run:
#   theta.hat.new1   179.8613
#   theta.hat.new2   228.7993
#   theta.hat.new3   178.8999
#   theta.hat.new4   221.7696
