################################################################
# Code to fit a basic MDP model
#
# Be warned that this is bad code.  I've tried to keep the code
# understandable, rather than efficient.
################################################################

################################################################
#
# Part 1.  A description of the model
#
# The model is a conjugate, normal-normal model.
# Variances are treated as known throughout.
#
# The model is the basic model discussed in class:
#
#   mu ~ N(mu_0, rho^2)
#   F | mu ~ DP(M F_0);  F_0 = N(mu, tau^2)
#   theta_i | F ~ F
#   X_i | theta_i ~ N(theta_i, sigma^2)
#
################################################################

################################################################
#
# Part 2.  Some useful objects
#
# First, we'll need information on the clusters
#
#   clust$s           cluster membership for each observation
#   clust$k           current number of clusters
#   clust$n.i         cluster sizes
#   clust$theta.star  the distinct theta value for each cluster
#
# It's also good to have the data stored as an object--overkill
# in this case, as the data consist of little more than the vector x
#
#   data$x
#   data$n
#
# The parameters governing likelihood and prior are also essential
#
#   prior$mu.0
#   prior$rho.sq
#   prior$M
#   prior$tau.sq
#   prior$sig.sq
#
# Also, we'll track the value of mu as the Gibbs sampler progresses
#
#   mu
#
################################################################

################################################################
#
# The real bit of the code -- functions for a variety of small tasks
# first, then a function for a complete iterate of the Gibbs sampler,
# and finally, a function for a small simulation
#
################################################################

################################################################
#
# Function 1.  A function to remove theta_i from the cluster
#              structure
#
# Note the silly "dump" of clust into variables of the same
# names at the beginning of the function and the collection
# of the variables at the end.  Compare the readability of
# this to later functions without the dump and collect.
#
################################################################

fn.remove.theta.i <- function(i, clust) {
  s <- clust$s
  k <- clust$k
  n.i <- clust$n.i
  theta.star <- clust$theta.star

  tmp <- s[i]
  if (n.i[tmp] > 1) {
    n.i[tmp] <- n.i[tmp] - 1
  } else {
    # observation i was a singleton; move the last cluster into its slot
    n.i[tmp] <- n.i[k]
    n.i <- n.i[-k]
    s[s == k] <- tmp
    theta.star[tmp] <- theta.star[k]
    theta.star <- theta.star[-k]
    k <- k - 1
  }
  s[i] <- 0

  clust$s <- s
  clust$k <- k
  clust$n.i <- n.i
  clust$theta.star <- theta.star
  return(clust)
}
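
# A quick check of fn.remove.theta.i can be jotted down along the same
# lines as the diagnostics further below.  The tiny cluster structure
# here is made up purely for the check:  observations 1 and 3 share
# cluster 1, and observation 2 sits alone in cluster 2, so removing
# observation 2 should delete its singleton cluster.

chk <- NULL
chk$s <- c(1, 2, 1)
chk$k <- 2
chk$n.i <- c(2, 1)
chk$theta.star <- c(170, 210)

fn.remove.theta.i(2, chk)
# expect:  s = (1, 0, 1), k = 1, n.i = 2, theta.star = 170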
################################################################
#
# Function 2.  A function to generate theta_i for the cluster
#              structure
#
# Tasks:  Compute probabilities of joining a cluster, beginning
#           a new cluster
#         Generate cluster membership
#         Update s and n_i
#         Update k and theta_star if needed
#
################################################################

fn.gen.theta.i <- function(i, clust, prior, mu, data) {
  prb <- c(clust$n.i, prior$M)
  for (j in 1:clust$k) {
    tmp.m <- clust$theta.star[j]
    tmp.v <- prior$sig.sq
    prb[j] <- prb[j] * dnorm(data$x[i], mean = tmp.m, sd = sqrt(tmp.v))
  }
  prb[clust$k + 1] <- prb[clust$k + 1] *
    dnorm(data$x[i], mean = mu, sd = sqrt(prior$tau.sq + prior$sig.sq))

  tmp <- sample(1:(clust$k + 1), size = 1, prob = prb)
  if (tmp > clust$k) {
    clust$s[i] <- tmp
    clust$k <- clust$k + 1
    clust$n.i <- c(clust$n.i, 1)
    tmp.m <- ((1/prior$sig.sq)*data$x[i] + (1/prior$tau.sq)*mu) /
      ((1/prior$sig.sq) + (1/prior$tau.sq))
    tmp.v <- 1 / ((1/prior$sig.sq) + (1/prior$tau.sq))
    tmp <- rnorm(n = 1, mean = tmp.m, sd = sqrt(tmp.v))
    clust$theta.star <- c(clust$theta.star, tmp)
  } else {
    clust$s[i] <- tmp
    clust$n.i[tmp] <- clust$n.i[tmp] + 1
  }
  return(clust)
}

################################################################
#
# Function 3.  A function to generate theta_star
#
# Tasks:  Loop through i = 1, ..., k
#         Find cond'l posterior distribution for theta_star[i]
#         Generate theta_star[i]
#
################################################################

fn.gen.theta.star <- function(clust, prior, mu, data) {
  for (i in 1:clust$k) {
    tmp.m <- ((clust$n.i[i]/prior$sig.sq)*mean(data$x[clust$s == i]) +
                (1/prior$tau.sq)*mu) /
      ((clust$n.i[i]/prior$sig.sq) + (1/prior$tau.sq))
    tmp.v <- 1 / ((clust$n.i[i]/prior$sig.sq) + (1/prior$tau.sq))
    clust$theta.star[i] <- rnorm(n = 1, mean = tmp.m, sd = sqrt(tmp.v))
  }
  return(clust)
}

################################################################
#
# Function 4.  A function to generate mu
#
# Tasks:  Find cond'l posterior distribution for mu
#         Generate mu
#
################################################################

fn.gen.mu <- function(clust, prior) {
  tmp.m <- ((clust$k/prior$tau.sq)*mean(clust$theta.star) +
              (1/prior$rho.sq)*prior$mu.0) /
    ((clust$k/prior$tau.sq) + (1/prior$rho.sq))
  tmp.v <- 1 / ((clust$k/prior$tau.sq) + (1/prior$rho.sq))
  mu <- rnorm(n = 1, mean = tmp.m, sd = sqrt(tmp.v))
  return(mu)
}

################################################################
#
# Function 5.  One iterate of the Gibbs sampler
#
# Tasks:  Generate each theta_i in turn
#         Generate theta_star
#         Generate mu
#
################################################################

fn.one.iterate <- function(clust, prior, mu, data) {
  for (i in 1:data$n) {
    clust <- fn.remove.theta.i(i, clust)
    clust <- fn.gen.theta.i(i, clust, prior, mu, data)
  }
  clust <- fn.gen.theta.star(clust, prior, mu, data)
  mu <- fn.gen.mu(clust, prior)

  ret.obj <- NULL
  ret.obj$clust <- clust
  ret.obj$prior <- prior
  ret.obj$mu <- mu
  return(ret.obj)
}
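
# The small routines above are easy to test in isolation.  As one sketch,
# fn.gen.mu can be checked against the known normal-normal conditional
# posterior, with
#   mean = ((k/tau.sq)*mean(theta.star) + (1/rho.sq)*mu.0) /
#          ((k/tau.sq) + (1/rho.sq))
#   var  = 1 / ((k/tau.sq) + (1/rho.sq))
# The cluster structure and prior below are made-up values for the check.

cc <- NULL
cc$k <- 3
cc$theta.star <- c(160, 185, 200)

chk.prior <- NULL
chk.prior$mu.0 <- 180
chk.prior$rho.sq <- 400
chk.prior$tau.sq <- 225

tmp.mu <- rep(0, 1000)
for (i in 1:1000) tmp.mu[i] <- fn.gen.mu(cc, chk.prior)
mean(tmp.mu)   # compare to the conditional posterior mean above
var(tmp.mu)    # compare to the conditional posterior variance above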
################################################################
#
# Function 6.  A brief Gibbs sampler
#
# Tasks:  Set up object (here, a matrix) to store results
#         Run one iterate of the Gibbs sampler
#         Tally results
#
# Improvements for you to make:
#   Burn-in        -- allow an explicit description of the burn-in
#                     to be discarded
#   Subsampling    -- not to be done unless storage issues are
#                     important, but allow subsampling of the
#                     output
#   Initialization -- an automated initialization routine.
#                     Best to allow a couple of options for
#                     the initialization.
#
################################################################

fn.gibbs.sampler <- function(n.reps, prior, data, clust, mu) {
  # Insert initialization routine if desired
  # Insert burn-in period if desired
  res.mat <- matrix(rep(0, n.reps*(data$n + 1)), nrow = n.reps)
  for (i in 1:n.reps) {
    tmp <- fn.one.iterate(clust, prior, mu, data)
    clust <- tmp$clust
    mu <- tmp$mu
    # store mu, then theta_i = theta_star[s_i] for each observation
    res.mat[i, ] <- c(mu, clust$theta.star[clust$s])
    # print(clust$k)
  }
  return(res.mat)
}

# A run of the code is as simple as this--once the called
# objects are set up.

n.reps <- 1000
res.mat <- fn.gibbs.sampler(n.reps, prior, data, clust, mu)

################################################################
#
# As with any code, diagnostics are essential.  For one
# example, one could jot down the code below.  Set up
# a cluster structure in "bb", along with prior, mu,
# and data.  Run the routine fn.gen.theta.star a bunch
# of times.  For this normal-normal model (for each theta_star[i]),
# you know the form of the posterior.  Check the mean and
# variance of each column against the known values.
#
# Change mu and repeat
# Change sig.sq and repeat
# Change tau.sq and repeat
#
################################################################

tmp.mat <- matrix(rep(0, 4000), ncol = 4)
for (i in 1:1000) {
  tmp.mat[i, ] <- fn.gen.theta.star(bb, prior, mu, data)$theta.star
}
apply(tmp.mat, 2, mean)
apply(tmp.mat, 2, var)

################################################################
#
# Two data sets.  For both data sets, the observations are weight
# (in pounds).  The first data set is available at StatLib (on the
# web at http://lib.stat.cmu.edu/datasets/), where you can find it
# under bodyfat.  The second is from the 2008-09 Buckeye football
# roster.
#
# We'll look at a basic compound decision problem for both data
# sets.  Eliciting a prior distribution for weights, I went with
# the following values:
#
#   mu.0 = 180, rho.sq = 400    center is 180, give or take 20
#   M = 20, tau.sq = 225        together, tau.sq + sig.sq give a
#   sig.sq = 100                sd of approx 18
#
# Initially, each observation is in its own cluster:
#   k = data$n, s = 1:k, n.i = rep(1, k), theta.star = data$x
#
################################################################

# create appropriate data objects (data1 for bodyfat, data2 for the
# Buckeye roster); a sketch of their construction follows this setup block

prior1 <- NULL
prior1$mu.0 <- 180
prior1$rho.sq <- 400
prior1$M <- 20
prior1$tau.sq <- 225
prior1$sig.sq <- 100

clust1 <- NULL
clust1$k <- data1$n
clust1$s <- 1:data1$n
clust1$n.i <- rep(1, data1$n)
clust1$theta.star <- data1$x

mu1 <- prior1$mu.0

# for data set 2, listed in object data2

prior2 <- prior1

clust2 <- NULL
clust2$k <- data2$n
clust2$s <- 1:data2$n
clust2$n.i <- rep(1, data2$n)
clust2$theta.star <- data2$x

mu2 <- prior2$mu.0
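
# The data objects are assumed to have been created above.  A minimal
# sketch of that step, assuming the weights for each data set have been
# read into numeric vectors (wt1 and wt2 are hypothetical names):
#
#   data1 <- NULL
#   data1$x <- wt1
#   data1$n <- length(wt1)
#
#   data2 <- NULL
#   data2$x <- wt2
#   data2$n <- length(wt2)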
# run the code

n.reps <- 10000
date()
res.mat1 <- fn.gibbs.sampler(n.reps, prior1, data1, clust1, mu1)
date()
# 39 minutes on my laptop, n=252

n.reps <- 10000
date()
res.mat2 <- fn.gibbs.sampler(n.reps, prior2, data2, clust2, mu2)
date()
# 12 minutes on my laptop, n=112

################################################################
#
# A second analysis removes the distribution over base measures.
# To make the new model more comparable to the old one, we might
# match the prior predictive distributions.  To do this, we set
# mu = mu.0 and sweep the variation from rho.sq into tau.sq
# (so tau.sq becomes 400 + 225 = 625).  We also use a routine
# that does not update mu.
#
# The prior distribution
#   mu = 180                    center is 180
#   M = 20, tau.sq = 625
#   sig.sq = 100
#
# Initially, each observation is in its own cluster:
#   k = data$n, s = 1:k, n.i = rep(1, k), theta.star = data$x
#
################################################################

################################################################
#
# Function 5'.  One iterate of the Gibbs sampler, no hierarchy
#
# Tasks:  Generate each theta_i in turn
#         Generate theta_star
#
################################################################

fn.one.iterate.nohier <- function(clust, prior, mu, data) {
  for (i in 1:data$n) {
    clust <- fn.remove.theta.i(i, clust)
    clust <- fn.gen.theta.i(i, clust, prior, mu, data)
  }
  clust <- fn.gen.theta.star(clust, prior, mu, data)

  ret.obj <- NULL
  ret.obj$clust <- clust
  ret.obj$prior <- prior
  return(ret.obj)
}

################################################################
#
# Function 6'.  A brief Gibbs sampler, no hierarchy
#
# Tasks:  Set up object (here, a matrix) to store results
#         Run one iterate of the Gibbs sampler
#         Tally results
#
# Improvements for you to make:
#   Burn-in        -- allow an explicit description of the burn-in
#                     to be discarded
#   Subsampling    -- not to be done unless storage issues are
#                     important, but allow subsampling of the
#                     output
#   Initialization -- an automated initialization routine.
#                     Best to allow a couple of options for
#                     the initialization.
#
################################################################

fn.gibbs.sampler.nohier <- function(n.reps, prior, data, clust, mu) {
  # Insert initialization routine if desired
  # Insert burn-in period if desired
  res.mat <- matrix(rep(0, n.reps*(data$n + 1)), nrow = n.reps)
  for (i in 1:n.reps) {
    tmp <- fn.one.iterate.nohier(clust, prior, mu, data)
    clust <- tmp$clust
    res.mat[i, ] <- c(mu, clust$theta.star[clust$s])
    # print(clust$k)
  }
  return(res.mat)
}

# Set up the objects for the analysis:  ...3 is for the
# bodyfat data set, ...4 is for the Buckeye data set.

prior3 <- prior1
prior3$tau.sq <- 625
clust3 <- clust1
mu3 <- prior3$mu.0
data3 <- data1

prior4 <- prior2
prior4$tau.sq <- 625
clust4 <- clust2
mu4 <- prior4$mu.0
data4 <- data2

# run the code

n.reps <- 10000
date()
res.mat3 <- fn.gibbs.sampler.nohier(n.reps, prior3, data3, clust3, mu3)
date()
# 42 minutes on my laptop, n=252

n.reps <- 10000
date()
res.mat4 <- fn.gibbs.sampler.nohier(n.reps, prior4, data4, clust4, mu4)
date()
# 10 minutes on my laptop, n=112

################################################################
#
# Diagnostics are an essential part of MCMC.  The primary questions
# are whether the sampler has converged (i.e., after some burn-in
# period, are the draws from a very close approximation to the
# posterior distribution?) and how well the sampler mixes (i.e., is
# the Markov chain moving around the parameter space quickly enough?).
#
# As in other areas of statistics, diagnostics are largely an art.
# A few solid plots and summaries are these:
#   (i)   time series plots of parameters--use points, not lines
#   (ii)  plots of one parameter against another
#   (iii) a sequence of density estimates for a parameter
#   (iv)  autocorrelations for the parameters
#
################################################################

# As a brief example, the following addresses convergence/mixing
# for the first example.  The plots are for mu, followed by
# theta_1 ... theta_15.  A more complete look would examine the
# rest of the thetas as well.  Look at these plots with a "full screen"
# graphics device.

tmp <- 1:n.reps   # iteration indices for the time series plots
par(mfrow = c(4, 4))
for (i in 1:16) plot(tmp, res.mat1[tmp, i])

# Note the strange behavior for theta_1.  Convergence and mixing
# appear to hold, but there is a "gap" in the small range of values
# for theta_1.  Pursue this for an interesting look at the distribution.
# What accounts for the phenomenon?

par(mfrow = c(1, 1))
plot(density(res.mat1[, 2]))
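
# Items (ii) and (iv) from the list of diagnostics above are just as
# quick to produce.  As a sketch for the first fit:  mu plotted against
# a few of the theta_i, then autocorrelation functions for the same
# sixteen columns examined in the time series plots.

par(mfrow = c(2, 2))
for (i in 2:5) plot(res.mat1[, 1], res.mat1[, i])   # mu vs theta_1, ..., theta_4
par(mfrow = c(4, 4))
for (i in 1:16) acf(res.mat1[, i])
par(mfrow = c(1, 1))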
# One might also track the number of clusters amongst the theta_i

tmp.vec <- rep(0, 10000)
for (i in 1:10000) {
  tmp.vec[i] <- length(unique(res.mat1[i, ])) - 1
}
plot(tmp, tmp.vec)

# Note:  This is a bad way to look at the number of clusters.
# Part of good diagnostics is planning.  What is a better way to
# set up and perform this diagnostic, since we know we'll want to
# look at it?

# Further diagnostics should be done.  Think hard about whether
# or not you're getting the kind of MCMC output that should be
# used for estimation.  If not, it's back to the drawing board.

################################################################
#
# Exploration of the posterior distribution and estimation are the
# two main purposes of MCMC when used to fit a Bayesian model.
# Setting exploration aside, one might use basic estimators.  For
# the compound decision problem, our primary interest is in each
# of the theta_i.  We may also be interested in a new theta for
# which we have no data, or in approximating the decision rule for
# theta_hat given x.
#
# A nonparametric approach to the compound decision problem opens
# up more possibilities.  As with theta_1 above, we may find that
# the posterior distribution for a parameter is distinctly non-normal
# (while for other parameters, normality provides a good approximation).
# Is it worthwhile to collect more information on a particular theta_i?
# With greater variety in the posterior distributions, we expect
# greater variety in our decisions about how valuable further data
# are, and hence about which further data to collect.
#
################################################################

# Posterior means of the theta_i, plotted against the data.  Column 1
# of each result matrix holds mu; columns 2 onward hold the theta_i.

par(mfrow = c(2, 2))

theta.hat1 <- apply(res.mat1, 2, mean)
plot(data1$x, theta.hat1[2:253])
lines(c(150, 300), c(150, 300))

theta.hat2 <- apply(res.mat2, 2, mean)
plot(data2$x, theta.hat2[2:113])
lines(c(150, 300), c(150, 300))

theta.hat3 <- apply(res.mat3, 2, mean)
plot(data3$x, theta.hat3[2:253])
lines(c(150, 300), c(150, 300))

theta.hat4 <- apply(res.mat4, 2, mean)
plot(data4$x, theta.hat4[2:113])
lines(c(150, 300), c(150, 300))

# Estimate for a new theta with no data:  under the Polya urn, a new
# theta matches one of the current theta_i with weight 1 each, or is
# drawn from the base measure (centered at mu) with weight M.

theta.hat.new1 <- (prior1$M * theta.hat1[1] + sum(theta.hat1[2:253])) /
  (prior1$M + data1$n)
theta.hat.new2 <- (prior2$M * theta.hat2[1] + sum(theta.hat2[2:113])) /
  (prior2$M + data2$n)
theta.hat.new3 <- (prior3$M * theta.hat3[1] + sum(theta.hat3[2:253])) /
  (prior3$M + data3$n)
theta.hat.new4 <- (prior4$M * theta.hat4[1] + sum(theta.hat4[2:113])) /
  (prior4$M + data4$n)

theta.hat.new1   # [1] 179.8613
theta.hat.new2   # [1] 228.7993
theta.hat.new3   # [1] 178.8999
theta.hat.new4   # [1] 221.7696
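
# Beyond the point estimates, interval summaries show the variety in the
# posterior distributions noted above.  As a sketch, central 95% intervals
# for the first four theta_i from the first fit (columns 2:5 of res.mat1):

apply(res.mat1[, 2:5], 2, quantile, probs = c(0.025, 0.975))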