# Supplement to MDP code for regression modelling.
#
# Code relies on the MASS library.
#
# The model to be fit with the code below is a traditional linear
# regression model, with one exception.  The errors are presumed to
# follow an "arbitrary" continuous distribution.  The prior over this
# distribution is an MDP model.  Note that the parameter sigma^2 is
# not the usual residual variance.  Also, one must be careful in the
# interpretation of "beta_0".  The value of the mean of the
# distribution F must be swept into the intercept.

library(MASS)

################################################################
#
# Part 1.  A description of the model
#
# The model is a conjugate, normal theory regression model.
# Variances are treated as known throughout.
#
# The model is the basic model discussed in class:
#
#   beta ~ N(mu_beta, Sig_beta)
#   F ~ DP(M F_0);  F_0 = N(0, tau^2)
#   theta_i | F ~ F
#   y_i | theta_i ~ N(x_i^t beta + theta_i, sigma^2)
#
################################################################

################################################################
#
# Part 2.  Some useful objects -- the initial ones are just those
# from the simpler normal means problem.
#
# First, we'll need information on the clusters
#
#   clust$s           cluster membership for each case
#   clust$k           number of distinct clusters
#   clust$n.i         cluster sizes
#   clust$theta.star  the distinct values of theta, one per cluster
#
# It's also good to have the data stored as an object
#
#   data$y       response vector
#   data$X       design matrix
#   data$n       number of cases
#   data$p       number of regression coefficients
#   data$fits    fitted values
#   data$resids  residuals
#
# The parameters governing likelihood and prior are also essential
#
#   prior$beta$mean
#   prior$beta$var
#   prior$beta$prec
#   prior$sig.sq$a
#   prior$sig.sq$b
#   prior$M
#   prior$tau.sq
#
# Also, we'll track the values of the parameters as the Gibbs
# sampler progresses
#
#   param$beta
#   param$sig.sq
#   param$tau.sq
#
################################################################

################################################################
#
# Function 1.  Generation of the beta vector
#
# The calculations for the conditional generation match those
# of the typical parametric model exactly.
#
# Makes use of mvrnorm from the MASS library.
#
################################################################

fn.gen.beta <- function(prior, param, clust, data)
{
  s <- clust$s
  theta.star <- clust$theta.star
  y <- data$y - theta.star[s]    # subtract each case's theta from the response
  X <- data$X

  prior.mean <- prior$mean
  prior.var <- prior$var
  prior.prec <- prior$prec
  sig.sq <- param$sig.sq

  post.prec <- prior.prec + ((t(X) %*% X) / sig.sq)
  post.var <- solve(post.prec)
  tmp <- (prior.prec %*% prior.mean) + ((t(X) %*% y) / sig.sq)
  post.mean <- post.var %*% tmp

  beta <- mvrnorm(n=1, mu=post.mean, Sigma=post.var)
  return(beta)
}

################################################################
#
# Function 2.  Initialization of the beta vector and sigma^2
#
# This routine initializes the beta vector at the posterior mean
# for the regression coefficients -- given an estimated value
# of sigma^2 based on least squares.
#
################################################################

fn.init.beta.bls <- function(prior, data)
{
  X <- data$X
  y <- data$y
  n <- data$n
  p <- data$p

  prior.mean <- prior$mean
  prior.var <- prior$var
  prior.prec <- prior$prec

  beta.ls <- solve(t(X) %*% X) %*% t(X) %*% y
  fit <- X %*% beta.ls
  sig.sq <- sum((y - fit)^2) / (n - p)    # least squares estimate of sigma^2

  post.prec <- prior.prec + ((t(X) %*% X) / sig.sq)
  post.var <- solve(post.prec)
  tmp <- (prior.prec %*% prior.mean) + ((t(X) %*% y) / sig.sq)
  beta <- post.var %*% tmp

  tmp <- NULL
  tmp$beta <- beta
  tmp$sig.sq <- sig.sq
  return(tmp)
}
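################################################################
#
# A minimal sketch (added for illustration) checking Functions 1 and 2
# on simulated data.  The toy.* objects are placeholders, not part of
# the analysis below.  With every case in its own cluster and
# theta.star fixed at zero, fn.gen.beta reduces to a single draw from
# the usual conjugate normal posterior, so the draw should sit near
# the generating value c(2,3).
#
################################################################

set.seed(101)
toy.n <- 50
toy.data <- NULL
toy.data$X <- cbind(rep(1,toy.n), rnorm(toy.n))
toy.data$y <- as.vector(toy.data$X %*% c(2,3) + rnorm(toy.n, sd=5))
toy.data$n <- toy.n
toy.data$p <- 2

toy.prior <- NULL
toy.prior$mean <- c(0,0)
toy.prior$var <- diag(c(100,100))
toy.prior$prec <- solve(toy.prior$var)

toy.init <- fn.init.beta.bls(toy.prior, toy.data)
toy.param <- NULL
toy.param$sig.sq <- toy.init$sig.sq

toy.clust <- NULL
toy.clust$s <- 1:toy.n
toy.clust$theta.star <- rep(0,toy.n)

fn.gen.beta(toy.prior, toy.param, toy.clust, toy.data)   # one posterior draw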
################################################################
#
# Function 3.  One iterate of the Gibbs sampler
#
# Tasks:  Generate each theta_i in turn
#         Generate theta_star
#
# The routines fn.remove.theta.i, fn.gen.theta.i.2, and
# fn.gen.theta.star.3 are those from the normal means code.
#
################################################################

fn.one.iterate.4 <- function(clust, prior, mu=0, data)
{
  n <- length(data$x)    # the normal means code stores the working data in data$x
  for (i in 1:n)
  {
    clust <- fn.remove.theta.i(i, clust)
    clust <- fn.gen.theta.i.2(i, clust, prior, mu, data)
  }
  clust <- fn.gen.theta.star.3(clust, prior, mu, data)

  ret.obj <- NULL
  ret.obj$clust <- clust
  ret.obj$prior <- prior
  return(ret.obj)
}

################################################################
#
# Function 4.  The nonparametric bits
#
# These conditional generations match those for the model for
# normal means.  One need only adjust the data fed into the
# routine.  Work with the residuals.
#
################################################################

fn.gibbs.sampler.reg.3 <- function(n.reps, prior, data, clust, mu, n.burn=10)
{
  n <- data$n
  p <- data$p

  # Insert initialization routine if desired.  sigma^2 is treated as
  # known, so only beta is re-initialized here; param (in particular
  # param$sig.sq) is picked up from the global environment.
  tmp <- fn.init.beta.bls(prior$beta, data)
  param$beta <- as.vector(tmp$beta)
  # param$sig.sq <- tmp$sig.sq
  data$fits <- data$X %*% param$beta
  data$resids <- data$y - data$fits
  clust$theta.star <- data$resids    # start each theta_i at its residual
  print("init done")

  # Insert burn-in period if desired
  for (i in 1:n.burn)
  {
    tmp.data <- NULL
    tmp.data$x <- data$resids
    tmp.prior <- NULL
    tmp.prior$tau.sq <- prior$tau.sq
    tmp.prior$sig.sq <- param$sig.sq
    tmp.prior$M <- prior$M

    tmp <- fn.one.iterate.4(clust=clust, prior=tmp.prior, mu=mu, data=tmp.data)
    clust <- tmp$clust
    param$beta <- fn.gen.beta(prior$beta, param, clust, data)
    # refresh fits and residuals so the next theta update conditions
    # on the newly drawn beta
    data$fits <- data$X %*% param$beta
    data$resids <- data$y - data$fits
  }

  # Set up output object and run chain for estimation period
  res.mat <- matrix(rep(0, n.reps*(p+1+n)), nrow=n.reps)
  for (i in 1:n.reps)
  {
    tmp.data <- NULL
    tmp.data$x <- data$resids
    tmp.prior <- NULL
    tmp.prior$tau.sq <- prior$tau.sq
    tmp.prior$sig.sq <- param$sig.sq
    tmp.prior$M <- prior$M

    tmp <- fn.one.iterate.4(clust=clust, prior=tmp.prior, mu=mu, data=tmp.data)
    clust <- tmp$clust
    param$beta <- fn.gen.beta(prior$beta, param, clust, data)
    data$fits <- data$X %*% param$beta
    data$resids <- data$y - data$fits

    # store beta, sigma^2, and the n case-level thetas
    res.mat[i,] <- c(param$beta, param$sig.sq, clust$theta.star[clust$s])
  }
  return(res.mat)
}

################################################################
#
# Function 5.  A density estimate of the residual distribution
#
# This one relies on slow and inefficient coding.  If k << n,
# substantial gains in speed are possible by working with the
# theta.star instead of the theta.
#
# Input:  theta is a matrix, each row of which contains the
#         values of theta associated with the n cases
#
# Usage:  tmp.prior <- NULL; tmp.prior$sig.sq <- prior$sig.sq;
#         tmp.prior$tau.sq <- prior$tau.sq; tmp.prior$M <- M;
#         plot(fn.dens.est(tmp.prior,theta))
#
################################################################

fn.dens.est <- function(prior, theta, num.grid=1000)
{
  sig.sq <- prior$sig.sq
  tau.sq <- prior$tau.sq
  M <- prior$M

  n <- dim(theta)[2]
  n.reps <- dim(theta)[1]
  sigma <- sqrt(sig.sq)

  min.pt <- min(theta) - 2.5 * sigma
  max.pt <- max(theta) + 2.5 * sigma
  grid <- (0:num.grid) / (num.grid + 1)
  grid <- min.pt + grid * (max.pt - min.pt)

  theta.vec <- as.vector(theta)
  y <- rep(0, num.grid+1)
  for (i in 1:length(theta.vec))
  {
    y <- y + dnorm(x=grid, mean=theta.vec[i], sd=sigma)
  }
  # mass M on a fresh draw from the base measure:  N(0, sig.sq + tau.sq)
  y <- y + M * n.reps * dnorm(x=grid, mean=0, sd=sqrt(sig.sq + tau.sq))

  density <- NULL
  density$x <- grid
  density$y <- y / ((M+n) * n.reps)
  return(density)
}
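################################################################
#
# A minimal sketch (added for illustration) of fn.dens.est on a
# fabricated theta matrix:  two "iterations", with thetas clustered
# near -3 and 3.  The fake.* names are placeholders.  The estimate
# should come out bimodal, plus a broad, low component centered at
# zero coming from the prior mass M.
#
################################################################

set.seed(102)
fake.theta <- rbind(rnorm(20, mean=-3, sd=0.2), rnorm(20, mean=3, sd=0.2))
fake.prior <- NULL
fake.prior$sig.sq <- 0.25
fake.prior$tau.sq <- 9
fake.prior$M <- 1
plot(fn.dens.est(fake.prior, fake.theta), type="l")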
################################################################
#
# Making use of the code.
#
# The bit below makes use of the regression model with a nonparametric
# error distribution.  It fits a simple linear regression model for
# weight as a function of height to the bodyfat data set.
#
################################################################

# sets up the objects to run the code

data <- NULL
data$y <- bodyfat.mat[,4]
data$X <- cbind(rep(1,length(data$y)), bodyfat.mat[,5])
data$n <- length(data$y)
data$p <- dim(data$X)[2]

# OLS fits and residuals
data$fits <- data$X %*% solve(t(data$X) %*% data$X) %*% t(data$X) %*% data$y
data$resids <- data$y - data$fits

prior <- NULL
prior$beta$mean <- c(0,0)
prior$beta$var <- diag(c(10000,10000))
prior$beta$prec <- solve(prior$beta$var)
# prior$sig.sq$a    # These lines would be used if sigma^2
# prior$sig.sq$b    # were treated as unknown
prior$M <- 1
prior$tau.sq <- 400

clust <- NULL
clust$k <- data$n
clust$s <- 1:data$n
clust$n.i <- rep(1,data$n)
clust$theta.star <- rep(0,data$n)

param <- NULL
param$beta <- c(0,0)
param$sig.sq <- 100

# runs the code.  100 burn-in, 1000 reps for estimation -- too small,
# but relatively quick.

set.seed(544)
res.mat.a <- fn.gibbs.sampler.reg.3(n.reps=1000, prior=prior, data=data,
                                    clust=clust, mu=0, n.burn=100)

# takes a look at the output.

# plots of residual density estimates -- kde from least squares and the
# Bayes model.  columns 4:255 of the output hold the n = 252 values of
# theta_i (columns 1:2 hold beta, column 3 holds sigma^2).
tmp.prior <- prior
tmp.prior$sig.sq <- param$sig.sq
plot(fn.dens.est(tmp.prior, res.mat.a[,4:255]))
lines(density(data$resids))

# least squares:  residuals vs. fits
plot(data$fits, data$resids)

# a plot of residuals from this model vs. those from a least squares fit
plot(data$resids, apply(res.mat.a[,4:255], 2, mean))

## How stable is the output from this sampler?  We can investigate this
## through a variety of diagnostics.  We'll pursue the approach of
## looking at independent runs of the code.  All runs are based on the
## same initialization.

res.mat.b <- fn.gibbs.sampler.reg.3(n.reps=1000, prior=prior, data=data, clust=clust, mu=0, n.burn=100)
res.mat.c <- fn.gibbs.sampler.reg.3(n.reps=1000, prior=prior, data=data, clust=clust, mu=0, n.burn=100)
res.mat.d <- fn.gibbs.sampler.reg.3(n.reps=1000, prior=prior, data=data, clust=clust, mu=0, n.burn=100)
res.mat.e <- fn.gibbs.sampler.reg.3(n.reps=1000, prior=prior, data=data, clust=clust, mu=0, n.burn=100)

big.res.mat <- rbind(res.mat.a, res.mat.b, res.mat.c, res.mat.d, res.mat.e)

# standard trace-plot diagnostics for the runs
plot(big.res.mat[,1], type='n'); points(big.res.mat[,1], pch='.')
plot(big.res.mat[,2], type='n'); points(big.res.mat[,2], pch='.')
plot(big.res.mat[,103], type='n'); points(big.res.mat[,103], pch='.')

# a plot of beta_1 vs beta_0
plot(big.res.mat[,1], big.res.mat[,2], type='n')
points(big.res.mat[,1], big.res.mat[,2], pch='.')

# density estimates of the residual distribution, one per run
dens.est.a <- fn.dens.est(tmp.prior, res.mat.a[,4:255])
dens.est.b <- fn.dens.est(tmp.prior, res.mat.b[,4:255])
dens.est.c <- fn.dens.est(tmp.prior, res.mat.c[,4:255])
dens.est.d <- fn.dens.est(tmp.prior, res.mat.d[,4:255])
dens.est.e <- fn.dens.est(tmp.prior, res.mat.e[,4:255])

plot(dens.est.a, type="n")
lines(dens.est.a)
lines(dens.est.b)
lines(dens.est.c)
lines(dens.est.d)
lines(dens.est.e)

# Monte Carlo means and standard errors for beta across the five runs
mean.mat <- matrix(rep(0,10), ncol=2)
mean.mat[1,] <- apply(res.mat.a[,1:2], 2, mean)
mean.mat[2,] <- apply(res.mat.b[,1:2], 2, mean)
mean.mat[3,] <- apply(res.mat.c[,1:2], 2, mean)
mean.mat[4,] <- apply(res.mat.d[,1:2], 2, mean)
mean.mat[5,] <- apply(res.mat.e[,1:2], 2, mean)
apply(mean.mat, 2, mean)
sqrt(apply(mean.mat, 2, var) / 5)

quantile(x=big.res.mat[,2], probs=c(0.025, 0.5, 0.975))

# posterior standard deviations for beta, run by run
sqrt(apply(res.mat.a[,1:2], 2, var))
sqrt(apply(res.mat.b[,1:2], 2, var))
sqrt(apply(res.mat.c[,1:2], 2, var))
sqrt(apply(res.mat.d[,1:2], 2, var))
sqrt(apply(res.mat.e[,1:2], 2, var))
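################################################################
#
# A rough numerical companion to the trace plots (a sketch, added for
# illustration):  a Gelman-Rubin-style potential scale reduction
# factor for beta_1 across the five runs.  Values near 1 suggest the
# runs agree; because all runs share one initialization, the
# between-run spread may understate the true variability.
#
################################################################

chain.mat <- cbind(res.mat.a[,2], res.mat.b[,2], res.mat.c[,2],
                   res.mat.d[,2], res.mat.e[,2])
n.it <- nrow(chain.mat)
W <- mean(apply(chain.mat, 2, var))          # average within-run variance
B <- n.it * var(apply(chain.mat, 2, mean))   # between-run variance
sqrt((((n.it - 1) / n.it) * W + B / n.it) / W)   # approximate R-hat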
################# A modified data set, removing case 42, the 29.5 inch tall man

data.f <- data      # these for backup
clust.f <- clust
prior.f <- prior
param.f <- param

# adjust data and clust.  prior and param are unchanged.
data$y <- data$y[-42]
data$X <- data$X[-42,]
data$n <- length(data$y)
data$p <- dim(data$X)[2]

# OLS fits and residuals
data$fits <- data$X %*% solve(t(data$X) %*% data$X) %*% t(data$X) %*% data$y
data$resids <- data$y - data$fits

clust$k <- data$n
clust$s <- 1:data$n
clust$n.i <- rep(1,data$n)
clust$theta.star <- rep(0,data$n)

data.s <- data
clust.s <- clust
prior.s <- prior
param.s <- param

################ plots for handout

res.mat <- fn.gibbs.sampler.reg.3(n.reps=1000, prior=prior, data=data,
                                  clust=clust, mu=0, n.burn=100)

# with case 42 removed, n = 251, so the theta_i now occupy columns 4:254
plot(data$resids, apply(res.mat[,4:254], 2, mean))
plot(data$fits, data$resids)
plot(density(data$resids))
plot(density(res.mat[,4:254]))

#ThetaHatvsResids.ps
tmp.prior <- prior
tmp.prior$sig.sq <- param$sig.sq
plot(fn.dens.est(tmp.prior, res.mat[,4:254]))

#PredictiveResidualDensities
plot(dens.est.a, type="n")
lines(dens.est.a)
lines(dens.est.b)
lines(dens.est.c)
lines(dens.est.d)
lines(dens.est.e)

#Beta0TimePlot
plot(big.res.mat[,1], type='n'); points(big.res.mat[,1], pch='.')
#Beta1TimePlot
plot(big.res.mat[,2], type='n'); points(big.res.mat[,2], pch='.')
#Theta100TimePlot
plot(big.res.mat[,103], type='n'); points(big.res.mat[,103], pch='.')
#Beta1vsBeta0
plot(big.res.mat[,1], big.res.mat[,2], type='n')
points(big.res.mat[,1], big.res.mat[,2], pch='.')
#Beta1Density
plot(density(big.res.mat[,2]))
#LeastSquaresResidsVsFits
plot(data$fits, data$resids)

################################################################
#
# Function.  A parametric Bayes fit of the model -- known variance
#
################################################################

fn.fit.beta.bls <- function(prior, sig.sq, data)
{
  X <- data$X
  y <- data$y
  n <- data$n
  p <- data$p

  prior.mean <- prior$mean
  prior.var <- prior$var
  prior.prec <- prior$prec

  beta.ls <- solve(t(X) %*% X) %*% t(X) %*% y
  fit <- X %*% beta.ls
  sig.sq.ls <- sum((y - fit)^2) / (n - p)

  post.prec <- prior.prec + ((t(X) %*% X) / sig.sq)
  post.var <- solve(post.prec)
  tmp <- (prior.prec %*% prior.mean) + ((t(X) %*% y) / sig.sq)
  beta <- post.var %*% tmp

  tmp <- NULL
  tmp$beta <- beta
  tmp$Sigma <- post.var
  tmp$fits <- X %*% beta
  tmp$resids <- y - tmp$fits
  tmp$sig.sq <- sum(tmp$resids^2)    # residual sum of squares, not a variance
  tmp$beta.ls <- beta.ls
  tmp$sig.sq.ls <- sig.sq.ls
  return(tmp)
}

# The small data set (case 42 removed)

prior$beta$mean <- c(0,0)
prior$beta$var <- diag(c(10000,10000))
prior$beta$prec <- solve(prior$beta$var)
param$sig.sq <- 500

bayes.regression <- fn.fit.beta.bls(prior=prior$beta, sig.sq=param$sig.sq, data=data.s)
cbind(bayes.regression$beta, bayes.regression$beta.ls)
bayes.regression$Sigma
plot(density(bayes.regression$resids))

# Adjusting the prior for beta_1

prior$beta$mean <- c(0,5)
prior$beta$var <- diag(c(10000,2.25))
prior$beta$prec <- solve(prior$beta$var)

bayes.regression <- fn.fit.beta.bls(prior=prior$beta, sig.sq=param$sig.sq, data=data.s)
cbind(bayes.regression$beta, bayes.regression$beta.ls)
bayes.regression$Sigma
plot(density(bayes.regression$resids))
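################################################################
#
# A minimal check (added for illustration) of the prior transformation
# used in the recentering below.  If E(weight) = gamma_0 + gamma_1 *
# (height - 72), then beta_0 = gamma_0 - 72 * gamma_1 and beta_1 =
# gamma_1; in matrix form, beta = T gamma and Var(beta) = T Var(gamma)
# T^t.  The T.chk name is a placeholder.
#
################################################################

T.chk <- matrix(c(1,-72,0,1), ncol=2, byrow=T)
T.chk %*% c(180,5)                          # implied prior mean for beta: (-180, 5)
T.chk %*% diag(c(50,2.25)) %*% t(T.chk)     # implied prior covariance for beta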
# Working with recentered data -- 72 inches is the recentering point

prior$gamma$mean <- c(180,5)
prior$gamma$var <- diag(c(50,2.25))
prior$gamma$prec <- solve(prior$gamma$var)

trans.mat <- matrix(c(1,-72,0,1), ncol=2, byrow=T)
prior$beta$mean <- trans.mat %*% prior$gamma$mean
prior$beta$var <- trans.mat %*% prior$gamma$var %*% t(trans.mat)
prior$beta$prec <- solve(prior$beta$var)

# inspect the transformation and the implied prior for beta
trans.mat
prior$gamma$mean
prior$beta$mean
prior$gamma$var
prior$beta$var

bayes.regression <- fn.fit.beta.bls(prior=prior$beta, sig.sq=param$sig.sq, data=data.s)
cbind(bayes.regression$beta, bayes.regression$beta.ls)
bayes.regression$Sigma
plot(density(bayes.regression$resids))

# Recentered data, full data set

bayes.regression <- fn.fit.beta.bls(prior=prior$beta, sig.sq=param$sig.sq, data=data.f)
cbind(bayes.regression$beta, bayes.regression$beta.ls)
bayes.regression$Sigma
plot(density(bayes.regression$resids))

# overlay the assumed N(0, 500) error density for comparison
tmp.grid <- (-60:200)
tmp.dens <- dnorm(x=tmp.grid, mean=0, sd=sqrt(500))
lines(tmp.grid, tmp.dens)

################################################################
#
# Elicitations from class
#
# Each row is one prior.  I've adjusted a couple -- used Mike's
# 100000000000 to replace infinity in one case, used 0.000001 to
# replace 0 in another case.  All have been translated into pounds
# and inches.  Distributional forms for the analysis have all been
# presumed to be normal, and hierarchies have been trimmed.  Also,
# I've resolved ambiguities as best I could.
#
# The first two components are the mean vector for beta; the next four
# components are the covariance matrix.  Mine is the last one.
#
################################################################

prior.mat <- matrix(rep(0,15*6), ncol=6)
prior.mat[1,]  <- c(0,0,100000000000,0,0,100000000000)
prior.mat[2,]  <- c(0,0,10,0,0,10)
prior.mat[3,]  <- c(0,1.862667,249806,0,0,249806)
prior.mat[4,]  <- c(20,2,5,0.2,0.2,0.8)
prior.mat[5,]  <- c(16.764,2.2352,3122.574,0,0,0.25)
prior.mat[6,]  <- c(-53,2.5,100,0,0,1)
prior.mat[7,]  <- c(0,3,100,0,0,2.25)
prior.mat[8,]  <- c(-180,5,11714,-162,-162,2.25)
prior.mat[9,]  <- c(-170,5,2500,0,0,9)
prior.mat[10,] <- c(0,5,10000,0,0,9)
prior.mat[11,] <- c(0,5,100,0,0,100)
prior.mat[12,] <- c(0,5,1,-0.5,-0.5,1)
prior.mat[13,] <- c(0,5,0.000001,0,0,6.25)
prior.mat[14,] <- c(0,5.588,100,10,10,100)
prior.mat[15,] <- c(0,10,100000000000,0,0,100000000000)

param$sig.sq <- 660    # approximately sigma^2 from a least squares analysis

posterior.mat <- prior.mat
for (i in 1:15)
{
  tmp <- prior.mat[i,]
  prior$beta$mean <- tmp[1:2]
  prior$beta$var <- matrix(tmp[3:6], ncol=2, byrow=T)
  prior$beta$prec <- solve(prior$beta$var)
  tmp <- fn.fit.beta.bls(prior=prior$beta, sig.sq=param$sig.sq, data=data.s)
  posterior.mat[i,] <- c(as.vector(tmp$beta), as.vector(tmp$Sigma))
}

# posterior summaries:  mean and sd for beta_0, then mean and sd for beta_1
matrix(round(c(posterior.mat[,1], sqrt(posterior.mat[,3]),
               posterior.mat[,2], sqrt(posterior.mat[,6])), 2), ncol=4)

# A plot of the posterior mean for beta_1 vs. its prior mean.
# Priors where the intercept has small variance are plotted as 'x';
# those where the intercept has large variance as 'o'.
plot(prior.mat[,2], posterior.mat[,2], type='n', xlab='Prior Mean', ylab='Posterior Mean')
tmp <- (1:15)[prior.mat[,3] < 5000]
points(prior.mat[tmp,2], posterior.mat[tmp,2], pch='x')
points(prior.mat[-tmp,2], posterior.mat[-tmp,2], pch='o')
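################################################################
#
# Two optional follow-ups to the plot above (sketches, added for
# illustration).  First, label each point with its row in prior.mat to
# see which elicitations move furthest.  Second, list each prior mean
# and posterior mean for beta_1 next to the common least squares
# slope; tightly concentrated priors keep their posterior means away
# from it.  The beta.ls.s name is a placeholder.
#
################################################################

text(prior.mat[,2], posterior.mat[,2], labels=1:15, pos=3, cex=0.7)

beta.ls.s <- solve(t(data.s$X) %*% data.s$X) %*% t(data.s$X) %*% data.s$y
round(cbind(prior.mat[,2], posterior.mat[,2], rep(beta.ls.s[2],15)), 2)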
# Cross-validation and the success of prior elicitation.  I've relied
# on the prior that I specified.

tmp <- prior.mat[8,]
prior$beta$mean <- tmp[1:2]
prior$beta$var <- matrix(tmp[3:6], ncol=2, byrow=T)
prior$beta$prec <- solve(prior$beta$var)

coll.res <- matrix(rep(0,20*3), ncol=3)
for (j in 1:20)
{
  n.train <- 10 * j
  set.seed(544)    # same seed each j, so the splits are comparable across training sizes
  res.mat <- matrix(rep(0,20000), ncol=2)
  for (i in 1:10000)
  {
    test <- NULL
    perm <- sample(x=1:251)
    train.ind <- perm[1:n.train]
    test.ind <- perm[(n.train+1):251]

    data.train <- data.s
    data.train$y <- data.train$y[train.ind]
    data.train$X <- data.train$X[train.ind,]
    data.train$n <- n.train
    data.train$fits <- rep(0,n.train)
    data.train$resids <- rep(0,n.train)

    tmp <- fn.fit.beta.bls(prior=prior$beta, sig.sq=param$sig.sq, data=data.train)

    test$X <- data.s$X[test.ind,]
    test$y <- data.s$y[test.ind]
    ls.fit <- test$X %*% tmp$beta.ls
    bayes.fit <- test$X %*% tmp$beta
    ls.mse <- mean((test$y - ls.fit)^2)
    bayes.mse <- mean((test$y - bayes.fit)^2)
    res.mat[i,] <- c(ls.mse, bayes.mse)
  }
  coll.res[j,] <- c(n.train, apply(res.mat, 2, mean))
}

# A plot for the handout on prior elicitation
plot(rep(coll.res[,1],2), c(coll.res[,2],coll.res[,3]),
     xlab='training sample size', ylab='predictive mse')
lines(coll.res[,1], coll.res[,2])
lines(coll.res[,1], coll.res[,3])
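################################################################
#
# A variant of the plot above (a sketch, added for illustration):  the
# same two mse curves drawn with distinct line types and a legend,
# since they are otherwise hard to tell apart.
#
################################################################

plot(rep(coll.res[,1],2), c(coll.res[,2],coll.res[,3]), type='n',
     xlab='training sample size', ylab='predictive mse')
lines(coll.res[,1], coll.res[,2], lty=1)    # least squares
lines(coll.res[,1], coll.res[,3], lty=2)    # Bayes, elicited prior
legend('topright', legend=c('least squares','Bayes'), lty=c(1,2))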