# Supplement to MDP code for regression modelling.
#
# Code relies on the MASS library.
#
# The model to be fit with the code below is a traditional linear
# regression model, with one exception.  The errors are presumed to
# follow an "arbitrary" continuous distribution.  The prior over this
# distribution is an MDP model.  Note that the parameter sigma^2 is
# not the usual residual variance.  Also, one must be careful in the
# interpretation of "beta_0".  The value of the mean of the
# distribution F must be swept into the intercept.

library(MASS)

################################################################
#
# Part 1.  A description of the model
#
# The model is a conjugate, normal theory regression model.
# Variances are treated as known throughout.
#
# The model is the basic model discussed in class:
#
#   beta ~ N(mu_beta, Sig_beta)
#   F ~ DP(M F_0);  F_0 = N(0, tau^2)
#   theta_i | F ~ F
#   y_i | theta_i ~ N(x_i^t beta + theta_i, sigma^2)
#
################################################################

################################################################
#
# Part 2.  Some useful objects -- the initial ones are just those
# from the simpler normal means problem.
#
# First, we'll need information on the clusters
#
#   clust$s           cluster membership for each case
#   clust$k           number of distinct clusters
#   clust$n.i         cluster sizes
#   clust$theta.star  the distinct values of theta, one per cluster
#
# It's also good to have the data stored as an object
#
#   data$y       response vector
#   data$X       design matrix
#   data$n       number of cases
#   data$p       number of regression coefficients
#   data$fits    fitted values
#   data$resids  residuals
#
# The parameters governing likelihood and prior are also essential
#
#   prior$beta$mean
#   prior$beta$var
#   prior$beta$prec
#   prior$sig.sq$a
#   prior$sig.sq$b
#   prior$M
#   prior$tau.sq
#
# Also, we'll track the values of the parameters as the Gibbs
# sampler progresses
#
#   param$beta
#   param$sig.sq
#   param$tau.sq
#
################################################################

################################################################
#
# Function 1.  Generation of the beta vector
#
# The calculations for the conditional generation match those
# of the typical parametric model exactly.
#
# Makes use of mvrnorm from the MASS library.
#
################################################################

fn.gen.beta <- function(prior, param, clust, data)
{
  s <- clust$s
  theta.star <- clust$theta.star
  y <- data$y - theta.star[s]    # subtract each case's theta from the response
  X <- data$X

  prior.mean <- prior$mean
  prior.var <- prior$var
  prior.prec <- prior$prec
  sig.sq <- param$sig.sq

  post.prec <- prior.prec + ((t(X) %*% X) / sig.sq)
  post.var <- solve(post.prec)
  tmp <- (prior.prec %*% prior.mean) + ((t(X) %*% y) / sig.sq)
  post.mean <- post.var %*% tmp

  beta <- mvrnorm(n=1, mu=post.mean, Sigma=post.var)
  return(beta)
}

################################################################
#
# Function 2.  Initialization of the beta vector and sigma^2
#
# This routine initializes the beta vector at the posterior mean
# for the regression coefficients -- given an estimated value
# of sigma^2 based on least squares.
#
################################################################

fn.init.beta.bls <- function(prior, data)
{
  X <- data$X
  y <- data$y
  n <- data$n
  p <- data$p

  prior.mean <- prior$mean
  prior.var <- prior$var
  prior.prec <- prior$prec

  beta.ls <- solve(t(X) %*% X) %*% t(X) %*% y
  fit <- X %*% beta.ls
  sig.sq <- sum((y - fit)^2) / (n - p)    # least squares estimate of sigma^2

  post.prec <- prior.prec + ((t(X) %*% X) / sig.sq)
  post.var <- solve(post.prec)
  tmp <- (prior.prec %*% prior.mean) + ((t(X) %*% y) / sig.sq)
  beta <- post.var %*% tmp

  tmp <- NULL
  tmp$beta <- beta
  tmp$sig.sq <- sig.sq
  return(tmp)
}
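################################################################
#
# A minimal sketch (added for illustration) checking Functions 1 and 2
# on simulated data.  The toy.* objects are placeholders, not part of
# the analysis below.  With every case in its own cluster and
# theta.star fixed at zero, fn.gen.beta reduces to a single draw from
# the usual conjugate normal posterior, so the draw should sit near
# the generating value c(2,3).
#
################################################################

set.seed(101)
toy.n <- 50
toy.data <- NULL
toy.data$X <- cbind(rep(1,toy.n), rnorm(toy.n))
toy.data$y <- as.vector(toy.data$X %*% c(2,3) + rnorm(toy.n, sd=5))
toy.data$n <- toy.n
toy.data$p <- 2

toy.prior <- NULL
toy.prior$mean <- c(0,0)
toy.prior$var <- diag(c(100,100))
toy.prior$prec <- solve(toy.prior$var)

toy.init <- fn.init.beta.bls(toy.prior, toy.data)
toy.param <- NULL
toy.param$sig.sq <- toy.init$sig.sq

toy.clust <- NULL
toy.clust$s <- 1:toy.n
toy.clust$theta.star <- rep(0,toy.n)

fn.gen.beta(toy.prior, toy.param, toy.clust, toy.data)   # one posterior draw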
################################################################
#
# Function 3.  One iterate of the Gibbs sampler
#
# Tasks:  Generate each theta_i in turn
#         Generate theta_star
#
# The routines fn.remove.theta.i, fn.gen.theta.i.2, and
# fn.gen.theta.star.3 are those from the normal means code.
#
################################################################

fn.one.iterate.4 <- function(clust, prior, mu=0, data)
{
  n <- length(data$x)    # the normal means code stores the working data in data$x
  for (i in 1:n)
  {
    clust <- fn.remove.theta.i(i, clust)
    clust <- fn.gen.theta.i.2(i, clust, prior, mu, data)
  }
  clust <- fn.gen.theta.star.3(clust, prior, mu, data)

  ret.obj <- NULL
  ret.obj$clust <- clust
  ret.obj$prior <- prior
  return(ret.obj)
}

################################################################
#
# Function 4.  The nonparametric bits
#
# These conditional generations match those for the model for
# normal means.  One need only adjust the data fed into the
# routine.  Work with the residuals.
#
################################################################

fn.gibbs.sampler.reg.3 <- function(n.reps, prior, data, clust, mu, n.burn=10)
{
  n <- data$n
  p <- data$p

  # Insert initialization routine if desired.  sigma^2 is treated as
  # known, so only beta is re-initialized here; param (in particular
  # param$sig.sq) is picked up from the global environment.
  tmp <- fn.init.beta.bls(prior$beta, data)
  param$beta <- as.vector(tmp$beta)
  # param$sig.sq <- tmp$sig.sq
  data$fits <- data$X %*% param$beta
  data$resids <- data$y - data$fits
  clust$theta.star <- data$resids    # start each theta_i at its residual
  print("init done")

  # Insert burn-in period if desired
  for (i in 1:n.burn)
  {
    tmp.data <- NULL
    tmp.data$x <- data$resids
    tmp.prior <- NULL
    tmp.prior$tau.sq <- prior$tau.sq
    tmp.prior$sig.sq <- param$sig.sq
    tmp.prior$M <- prior$M

    tmp <- fn.one.iterate.4(clust=clust, prior=tmp.prior, mu=mu, data=tmp.data)
    clust <- tmp$clust
    param$beta <- fn.gen.beta(prior$beta, param, clust, data)
    # refresh fits and residuals so the next theta update conditions
    # on the newly drawn beta
    data$fits <- data$X %*% param$beta
    data$resids <- data$y - data$fits
  }

  # Set up output object and run chain for estimation period
  res.mat <- matrix(rep(0, n.reps*(p+1+n)), nrow=n.reps)
  for (i in 1:n.reps)
  {
    tmp.data <- NULL
    tmp.data$x <- data$resids
    tmp.prior <- NULL
    tmp.prior$tau.sq <- prior$tau.sq
    tmp.prior$sig.sq <- param$sig.sq
    tmp.prior$M <- prior$M

    tmp <- fn.one.iterate.4(clust=clust, prior=tmp.prior, mu=mu, data=tmp.data)
    clust <- tmp$clust
    param$beta <- fn.gen.beta(prior$beta, param, clust, data)
    data$fits <- data$X %*% param$beta
    data$resids <- data$y - data$fits

    # store beta, sigma^2, and the n case-level thetas
    res.mat[i,] <- c(param$beta, param$sig.sq, clust$theta.star[clust$s])
  }
  return(res.mat)
}

################################################################
#
# Function 5.  A density estimate of the residual distribution
#
# This one relies on slow and inefficient coding.  If k << n,
# substantial gains in speed are possible by working with the
# theta.star instead of the theta.
#
# Input:  theta is a matrix, each row of which contains the
#         values of theta associated with the n cases
#
# Usage:  tmp.prior <- NULL; tmp.prior$sig.sq <- prior$sig.sq;
#         tmp.prior$tau.sq <- prior$tau.sq; tmp.prior$M <- M;
#         plot(fn.dens.est(tmp.prior,theta))
#
################################################################

fn.dens.est <- function(prior, theta, num.grid=1000)
{
  sig.sq <- prior$sig.sq
  tau.sq <- prior$tau.sq
  M <- prior$M

  n <- dim(theta)[2]
  n.reps <- dim(theta)[1]
  sigma <- sqrt(sig.sq)

  min.pt <- min(theta) - 2.5 * sigma
  max.pt <- max(theta) + 2.5 * sigma
  grid <- (0:num.grid) / (num.grid + 1)
  grid <- min.pt + grid * (max.pt - min.pt)

  theta.vec <- as.vector(theta)
  y <- rep(0, num.grid+1)
  for (i in 1:length(theta.vec))
  {
    y <- y + dnorm(x=grid, mean=theta.vec[i], sd=sigma)
  }
  # mass M on a fresh draw from the base measure:  N(0, sig.sq + tau.sq)
  y <- y + M * n.reps * dnorm(x=grid, mean=0, sd=sqrt(sig.sq + tau.sq))

  density <- NULL
  density$x <- grid
  density$y <- y / ((M+n) * n.reps)
  return(density)
}
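################################################################
#
# A minimal sketch (added for illustration) of fn.dens.est on a
# fabricated theta matrix:  two "iterations", with thetas clustered
# near -3 and 3.  The fake.* names are placeholders.  The estimate
# should come out bimodal, plus a broad, low component centered at
# zero coming from the prior mass M.
#
################################################################

set.seed(102)
fake.theta <- rbind(rnorm(20, mean=-3, sd=0.2), rnorm(20, mean=3, sd=0.2))
fake.prior <- NULL
fake.prior$sig.sq <- 0.25
fake.prior$tau.sq <- 9
fake.prior$M <- 1
plot(fn.dens.est(fake.prior, fake.theta), type="l")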
################################################################
#
# Making use of the code.
#
# The bit below makes use of the regression model with a nonparametric
# error distribution.  It fits a simple linear regression model for
# weight as a function of height to the bodyfat data set.
#
################################################################

# sets up the objects to run the code

data <- NULL
data$y <- bodyfat.mat[,4]
data$X <- cbind(rep(1,length(data$y)), bodyfat.mat[,5])
data$n <- length(data$y)
data$p <- dim(data$X)[2]

# OLS fits and residuals
data$fits <- data$X %*% solve(t(data$X) %*% data$X) %*% t(data$X) %*% data$y
data$resids <- data$y - data$fits

prior <- NULL
prior$beta$mean <- c(0,0)
prior$beta$var <- diag(c(10000,10000))
prior$beta$prec <- solve(prior$beta$var)
# prior$sig.sq$a    # These lines would be used if sigma^2
# prior$sig.sq$b    # were treated as unknown
prior$M <- 1
prior$tau.sq <- 400

clust <- NULL
clust$k <- data$n
clust$s <- 1:data$n
clust$n.i <- rep(1,data$n)
clust$theta.star <- rep(0,data$n)

param <- NULL
param$beta <- c(0,0)
param$sig.sq <- 100

# runs the code.  100 burn-in, 1000 reps for estimation -- too small,
# but relatively quick.

set.seed(544)
res.mat.a <- fn.gibbs.sampler.reg.3(n.reps=1000, prior=prior, data=data,
                                    clust=clust, mu=0, n.burn=100)

# takes a look at the output.

# plots of residual density estimates -- kde from least squares and the
# Bayes model.  columns 4:255 of the output hold the n = 252 values of
# theta_i (columns 1:2 hold beta, column 3 holds sigma^2).
tmp.prior <- prior
tmp.prior$sig.sq <- param$sig.sq
plot(fn.dens.est(tmp.prior, res.mat.a[,4:255]))
lines(density(data$resids))

# least squares:  residuals vs. fits
plot(data$fits, data$resids)

# a plot of residuals from this model vs. those from a least squares fit
plot(data$resids, apply(res.mat.a[,4:255], 2, mean))

## How stable is the output from this sampler?  We can investigate this
## through a variety of diagnostics.  We'll pursue the approach of
## looking at independent runs of the code.  All runs are based on the
## same initialization.

res.mat.b <- fn.gibbs.sampler.reg.3(n.reps=1000, prior=prior, data=data, clust=clust, mu=0, n.burn=100)
res.mat.c <- fn.gibbs.sampler.reg.3(n.reps=1000, prior=prior, data=data, clust=clust, mu=0, n.burn=100)
res.mat.d <- fn.gibbs.sampler.reg.3(n.reps=1000, prior=prior, data=data, clust=clust, mu=0, n.burn=100)
res.mat.e <- fn.gibbs.sampler.reg.3(n.reps=1000, prior=prior, data=data, clust=clust, mu=0, n.burn=100)

big.res.mat <- rbind(res.mat.a, res.mat.b, res.mat.c, res.mat.d, res.mat.e)

# standard trace-plot diagnostics for the runs
plot(big.res.mat[,1], type='n'); points(big.res.mat[,1], pch='.')
plot(big.res.mat[,2], type='n'); points(big.res.mat[,2], pch='.')
plot(big.res.mat[,103], type='n'); points(big.res.mat[,103], pch='.')

# a plot of beta_1 vs beta_0
plot(big.res.mat[,1], big.res.mat[,2], type='n')
points(big.res.mat[,1], big.res.mat[,2], pch='.')

# density estimates of the residual distribution, one per run
dens.est.a <- fn.dens.est(tmp.prior, res.mat.a[,4:255])
dens.est.b <- fn.dens.est(tmp.prior, res.mat.b[,4:255])
dens.est.c <- fn.dens.est(tmp.prior, res.mat.c[,4:255])
dens.est.d <- fn.dens.est(tmp.prior, res.mat.d[,4:255])
dens.est.e <- fn.dens.est(tmp.prior, res.mat.e[,4:255])

plot(dens.est.a, type="n")
lines(dens.est.a)
lines(dens.est.b)
lines(dens.est.c)
lines(dens.est.d)
lines(dens.est.e)

# Monte Carlo means and standard errors for beta across the five runs
mean.mat <- matrix(rep(0,10), ncol=2)
mean.mat[1,] <- apply(res.mat.a[,1:2], 2, mean)
mean.mat[2,] <- apply(res.mat.b[,1:2], 2, mean)
mean.mat[3,] <- apply(res.mat.c[,1:2], 2, mean)
mean.mat[4,] <- apply(res.mat.d[,1:2], 2, mean)
mean.mat[5,] <- apply(res.mat.e[,1:2], 2, mean)
apply(mean.mat, 2, mean)
sqrt(apply(mean.mat, 2, var) / 5)

quantile(x=big.res.mat[,2], probs=c(0.025, 0.5, 0.975))

# posterior standard deviations for beta, run by run
sqrt(apply(res.mat.a[,1:2], 2, var))
sqrt(apply(res.mat.b[,1:2], 2, var))
sqrt(apply(res.mat.c[,1:2], 2, var))
sqrt(apply(res.mat.d[,1:2], 2, var))
sqrt(apply(res.mat.e[,1:2], 2, var))
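################################################################
#
# A rough numerical companion to the trace plots (a sketch, added for
# illustration):  a Gelman-Rubin-style potential scale reduction
# factor for beta_1 across the five runs.  Values near 1 suggest the
# runs agree; because all runs share one initialization, the
# between-run spread may understate the true variability.
#
################################################################

chain.mat <- cbind(res.mat.a[,2], res.mat.b[,2], res.mat.c[,2],
                   res.mat.d[,2], res.mat.e[,2])
n.it <- nrow(chain.mat)
W <- mean(apply(chain.mat, 2, var))          # average within-run variance
B <- n.it * var(apply(chain.mat, 2, mean))   # between-run variance
sqrt((((n.it - 1) / n.it) * W + B / n.it) / W)   # approximate R-hat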
################# A modified data set, removing case 42, the 29.5 inch tall man

data.f <- data      # these for backup
clust.f <- clust
prior.f <- prior
param.f <- param

# adjust data and clust.  prior and param are unchanged.
data$y <- data$y[-42]
data$X <- data$X[-42,]
data$n <- length(data$y)
data$p <- dim(data$X)[2]

# OLS fits and residuals
data$fits <- data$X %*% solve(t(data$X) %*% data$X) %*% t(data$X) %*% data$y
data$resids <- data$y - data$fits

clust$k <- data$n
clust$s <- 1:data$n
clust$n.i <- rep(1,data$n)
clust$theta.star <- rep(0,data$n)

data.s <- data
clust.s <- clust
prior.s <- prior
param.s <- param

################ plots for handout

res.mat <- fn.gibbs.sampler.reg.3(n.reps=1000, prior=prior, data=data,
                                  clust=clust, mu=0, n.burn=100)

# with case 42 removed, n = 251, so the theta_i now occupy columns 4:254
plot(data$resids, apply(res.mat[,4:254], 2, mean))
plot(data$fits, data$resids)
plot(density(data$resids))
plot(density(res.mat[,4:254]))

#ThetaHatvsResids.ps
tmp.prior <- prior
tmp.prior$sig.sq <- param$sig.sq
plot(fn.dens.est(tmp.prior, res.mat[,4:254]))

#PredictiveResidualDensities
plot(dens.est.a, type="n")
lines(dens.est.a)
lines(dens.est.b)
lines(dens.est.c)
lines(dens.est.d)
lines(dens.est.e)

#Beta0TimePlot
plot(big.res.mat[,1], type='n'); points(big.res.mat[,1], pch='.')
#Beta1TimePlot
plot(big.res.mat[,2], type='n'); points(big.res.mat[,2], pch='.')
#Theta100TimePlot
plot(big.res.mat[,103], type='n'); points(big.res.mat[,103], pch='.')
#Beta1vsBeta0
plot(big.res.mat[,1], big.res.mat[,2], type='n')
points(big.res.mat[,1], big.res.mat[,2], pch='.')
#Beta1Density
plot(density(big.res.mat[,2]))
#LeastSquaresResidsVsFits
plot(data$fits, data$resids)

################################################################
#
# Function.  A parametric Bayes fit of the model -- known variance
#
################################################################

fn.fit.beta.bls <- function(prior, sig.sq, data)
{
  X <- data$X
  y <- data$y
  n <- data$n
  p <- data$p

  prior.mean <- prior$mean
  prior.var <- prior$var
  prior.prec <- prior$prec

  beta.ls <- solve(t(X) %*% X) %*% t(X) %*% y
  fit <- X %*% beta.ls
  sig.sq.ls <- sum((y - fit)^2) / (n - p)

  post.prec <- prior.prec + ((t(X) %*% X) / sig.sq)
  post.var <- solve(post.prec)
  tmp <- (prior.prec %*% prior.mean) + ((t(X) %*% y) / sig.sq)
  beta <- post.var %*% tmp

  tmp <- NULL
  tmp$beta <- beta
  tmp$Sigma <- post.var
  tmp$fits <- X %*% beta
  tmp$resids <- y - tmp$fits
  tmp$sig.sq <- sum(tmp$resids^2)    # residual sum of squares, not a variance
  tmp$beta.ls <- beta.ls
  tmp$sig.sq.ls <- sig.sq.ls
  return(tmp)
}

# The small data set (case 42 removed)

prior$beta$mean <- c(0,0)
prior$beta$var <- diag(c(10000,10000))
prior$beta$prec <- solve(prior$beta$var)
param$sig.sq <- 500

bayes.regression <- fn.fit.beta.bls(prior=prior$beta, sig.sq=param$sig.sq, data=data.s)
cbind(bayes.regression$beta, bayes.regression$beta.ls)
bayes.regression$Sigma
plot(density(bayes.regression$resids))

# Adjusting the prior for beta_1

prior$beta$mean <- c(0,5)
prior$beta$var <- diag(c(10000,2.25))
prior$beta$prec <- solve(prior$beta$var)

bayes.regression <- fn.fit.beta.bls(prior=prior$beta, sig.sq=param$sig.sq, data=data.s)
cbind(bayes.regression$beta, bayes.regression$beta.ls)
bayes.regression$Sigma
plot(density(bayes.regression$resids))
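################################################################
#
# A minimal check (added for illustration) of the prior transformation
# used in the recentering below.  If E(weight) = gamma_0 + gamma_1 *
# (height - 72), then beta_0 = gamma_0 - 72 * gamma_1 and beta_1 =
# gamma_1; in matrix form, beta = T gamma and Var(beta) = T Var(gamma)
# T^t.  The T.chk name is a placeholder.
#
################################################################

T.chk <- matrix(c(1,-72,0,1), ncol=2, byrow=T)
T.chk %*% c(180,5)                          # implied prior mean for beta: (-180, 5)
T.chk %*% diag(c(50,2.25)) %*% t(T.chk)     # implied prior covariance for beta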
# Working with recentered data -- 72 inches is the recentering point

prior$gamma$mean <- c(180,5)
prior$gamma$var <- diag(c(50,2.25))
prior$gamma$prec <- solve(prior$gamma$var)

trans.mat <- matrix(c(1,-72,0,1), ncol=2, byrow=T)
prior$beta$mean <- trans.mat %*% prior$gamma$mean
prior$beta$var <- trans.mat %*% prior$gamma$var %*% t(trans.mat)
prior$beta$prec <- solve(prior$beta$var)

# inspect the transformation and the implied prior for beta
trans.mat
prior$gamma$mean
prior$beta$mean
prior$gamma$var
prior$beta$var

bayes.regression <- fn.fit.beta.bls(prior=prior$beta, sig.sq=param$sig.sq, data=data.s)
cbind(bayes.regression$beta, bayes.regression$beta.ls)
bayes.regression$Sigma
plot(density(bayes.regression$resids))

# Recentered data, full data set

bayes.regression <- fn.fit.beta.bls(prior=prior$beta, sig.sq=param$sig.sq, data=data.f)
cbind(bayes.regression$beta, bayes.regression$beta.ls)
bayes.regression$Sigma
plot(density(bayes.regression$resids))

# overlay the assumed N(0, 500) error density for comparison
tmp.grid <- (-60:200)
tmp.dens <- dnorm(x=tmp.grid, mean=0, sd=sqrt(500))
lines(tmp.grid, tmp.dens)

################################################################
#
# Elicitations from class
#
# Each row is one prior.  I've adjusted a couple -- used Mike's
# 100000000000 to replace infinity in one case, used 0.000001 to
# replace 0 in another case.  All have been translated into pounds
# and inches.  Distributional forms for the analysis have all been
# presumed to be normal, and hierarchies have been trimmed.  Also,
# I've resolved ambiguities as best I could.
#
# The first two components are the mean vector for beta; the next four
# components are the covariance matrix.  Mine is the last one.
#
################################################################

prior.mat <- matrix(rep(0,15*6), ncol=6)
prior.mat[1,]  <- c(0,0,100000000000,0,0,100000000000)
prior.mat[2,]  <- c(0,0,10,0,0,10)
prior.mat[3,]  <- c(0,1.862667,249806,0,0,249806)
prior.mat[4,]  <- c(20,2,5,0.2,0.2,0.8)
prior.mat[5,]  <- c(16.764,2.2352,3122.574,0,0,0.25)
prior.mat[6,]  <- c(-53,2.5,100,0,0,1)
prior.mat[7,]  <- c(0,3,100,0,0,2.25)
prior.mat[8,]  <- c(-180,5,11714,-162,-162,2.25)
prior.mat[9,]  <- c(-170,5,2500,0,0,9)
prior.mat[10,] <- c(0,5,10000,0,0,9)
prior.mat[11,] <- c(0,5,100,0,0,100)
prior.mat[12,] <- c(0,5,1,-0.5,-0.5,1)
prior.mat[13,] <- c(0,5,0.000001,0,0,6.25)
prior.mat[14,] <- c(0,5.588,100,10,10,100)
prior.mat[15,] <- c(0,10,100000000000,0,0,100000000000)

param$sig.sq <- 660    # approximately sigma^2 from a least squares analysis

posterior.mat <- prior.mat
for (i in 1:15)
{
  tmp <- prior.mat[i,]
  prior$beta$mean <- tmp[1:2]
  prior$beta$var <- matrix(tmp[3:6], ncol=2, byrow=T)
  prior$beta$prec <- solve(prior$beta$var)
  tmp <- fn.fit.beta.bls(prior=prior$beta, sig.sq=param$sig.sq, data=data.s)
  posterior.mat[i,] <- c(as.vector(tmp$beta), as.vector(tmp$Sigma))
}

# posterior summaries:  mean and sd for beta_0, then mean and sd for beta_1
matrix(round(c(posterior.mat[,1], sqrt(posterior.mat[,3]),
               posterior.mat[,2], sqrt(posterior.mat[,6])), 2), ncol=4)

# A plot of the posterior mean for beta_1 vs. its prior mean.
# Priors where the intercept has small variance are plotted as 'x';
# those where the intercept has large variance as 'o'.
plot(prior.mat[,2], posterior.mat[,2], type='n', xlab='Prior Mean', ylab='Posterior Mean')
tmp <- (1:15)[prior.mat[,3] < 5000]
points(prior.mat[tmp,2], posterior.mat[tmp,2], pch='x')
points(prior.mat[-tmp,2], posterior.mat[-tmp,2], pch='o')
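################################################################
#
# Two optional follow-ups to the plot above (sketches, added for
# illustration).  First, label each point with its row in prior.mat to
# see which elicitations move furthest.  Second, list each prior mean
# and posterior mean for beta_1 next to the common least squares
# slope; tightly concentrated priors keep their posterior means away
# from it.  The beta.ls.s name is a placeholder.
#
################################################################

text(prior.mat[,2], posterior.mat[,2], labels=1:15, pos=3, cex=0.7)

beta.ls.s <- solve(t(data.s$X) %*% data.s$X) %*% t(data.s$X) %*% data.s$y
round(cbind(prior.mat[,2], posterior.mat[,2], rep(beta.ls.s[2],15)), 2)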
# Cross-validation and the success of prior elicitation.  I've relied
# on the prior that I specified.

tmp <- prior.mat[8,]
prior$beta$mean <- tmp[1:2]
prior$beta$var <- matrix(tmp[3:6], ncol=2, byrow=T)
prior$beta$prec <- solve(prior$beta$var)

coll.res <- matrix(rep(0,20*3), ncol=3)
for (j in 1:20)
{
  n.train <- 10 * j
  set.seed(544)    # same seed each j, so the splits are comparable across training sizes
  res.mat <- matrix(rep(0,20000), ncol=2)
  for (i in 1:10000)
  {
    test <- NULL
    perm <- sample(x=1:251)
    train.ind <- perm[1:n.train]
    test.ind <- perm[(n.train+1):251]

    data.train <- data.s
    data.train$y <- data.train$y[train.ind]
    data.train$X <- data.train$X[train.ind,]
    data.train$n <- n.train
    data.train$fits <- rep(0,n.train)
    data.train$resids <- rep(0,n.train)

    tmp <- fn.fit.beta.bls(prior=prior$beta, sig.sq=param$sig.sq, data=data.train)

    test$X <- data.s$X[test.ind,]
    test$y <- data.s$y[test.ind]
    ls.fit <- test$X %*% tmp$beta.ls
    bayes.fit <- test$X %*% tmp$beta
    ls.mse <- mean((test$y - ls.fit)^2)
    bayes.mse <- mean((test$y - bayes.fit)^2)
    res.mat[i,] <- c(ls.mse, bayes.mse)
  }
  coll.res[j,] <- c(n.train, apply(res.mat, 2, mean))
}

# A plot for the handout on prior elicitation
plot(rep(coll.res[,1],2), c(coll.res[,2],coll.res[,3]),
     xlab='training sample size', ylab='predictive mse')
lines(coll.res[,1], coll.res[,2])
lines(coll.res[,1], coll.res[,3])
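################################################################
#
# A variant of the plot above (a sketch, added for illustration):  the
# same two mse curves drawn with distinct line types and a legend,
# since they are otherwise hard to tell apart.
#
################################################################

plot(rep(coll.res[,1],2), c(coll.res[,2],coll.res[,3]), type='n',
     xlab='training sample size', ylab='predictive mse')
lines(coll.res[,1], coll.res[,2], lty=1)    # least squares
lines(coll.res[,1], coll.res[,3], lty=2)    # Bayes, elicited prior
legend('topright', legend=c('least squares','Bayes'), lty=c(1,2))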