### Site Tools

# codes:mdp_regression_r_code
# Supplement to MDP code for regression modelling.

# Code relies on the MASS library

# The model to be fit with the code below is a
#  traditional linear regression model, with one
#  exception.  The errors are presumed to follow
#  an "arbitrary" continuous distribution.  The
#  prior over this distribution is an MDP model.
#  Note that the parameter sigma^2 is not the
#  usual residual variance.  Also, one must be
#  careful in interpretation of "beta_0".  The
#  value of the mean of the distribution F must
#  be swept into the intercept.

################################################################
#
# Part 1.  A description of the model
#
# The model is a conjugate, normal theory regression model
# Variances are treated as known throughout
#
#  The model is the basic model discussed in class
#  beta          ~ N(mu_beta, Sig_beta)
#  F             ~ DP(M F_0);            F_0 = N(0, tau^2)
#  theta_i | F   ~ F
#  y_i | theta_i ~ N(x_i^t beta + theta_i, sigma^2)
#
################################################################

################################################################
#
# Part 2.  Some useful objects - the initial ones are just those
#          from the simpler normal means problem.
#
# First, we'll need information on the clusters
#
# clust$s # clust$k
# clust$n.i # clust$theta.star
#
# It's also good to have the data stored as an object
#
# data$y # data$X
# data$n # data$p
# data$fits # data$resids
#
# The parameters governing likelihood and prior are also essential
#
# prior$beta$mean
# prior$beta$var
# prior$beta$prec
# prior$sig.sq$a
# prior$sig.sq$b
# prior$M # prior$tau.sq
#
# Also, we'll track the value of mu as the gibbs sampler progresses
#
# param$beta # param$sig.sq
# param$tau.sq
#
################################################################

################################################################
#
# Function 1.  Generation of the beta vector
#
# The calculations for the conditional generation match those
#  of the typical parametric model exactly.
#
# Makes use of mvrnorm from the MASS library.
################################################################

fn.gen.beta <- function(prior,param,clust,data)
{
  # Draw beta from its full conditional: a conjugate normal update
  #  given the current cluster offsets theta.star[s].
  #
  # prior: list with $mean and $prec (prior precision) for beta
  # param: list with $sig.sq, the known likelihood variance
  # clust: list with $s (cluster labels) and $theta.star
  # data:  list with $y and $X
  # Returns a single draw of the beta vector.
  s <- clust$s
  theta.star <- clust$theta.star
  y <- data$y - theta.star[s]   # subtract current offsets from y
  X <- data$X
  prior.mean <- prior$mean
  prior.prec <- prior$prec      # (prior$var is not needed here)
  sig.sq <- param$sig.sq

  post.prec <- prior.prec + ((t(X) %*% X)/sig.sq)
  post.var <- solve(post.prec)
  tmp <- (prior.prec %*% prior.mean) + ((t(X) %*% y)/sig.sq)
  post.mean <- post.var %*% tmp

  beta <- mvrnorm(n=1, mu=post.mean, Sigma=post.var)
  return(beta)
}

################################################################
#
# Function 2.  Initialization of the beta vector and sigma^2
#
# This routine initializes the beta vector at the posterior mean
#  for the regression coefficients -- given an estimated value
#  of sigma^2 based on least squares.
#
################################################################

fn.init.beta.bls <- function(prior,data)
{
  # Least-squares-based initialization.
  #
  # prior: list with $mean and $prec for beta
  # data:  list with $X, $y, $n, $p
  # Returns a list with $beta (posterior mean of beta given the LS
  #  sigma^2 estimate) and $sig.sq (unbiased LS estimate, RSS/(n-p)).
  X <- data$X
  y <- data$y
  n <- data$n
  p <- data$p
  prior.mean <- prior$mean
  prior.prec <- prior$prec

  # Ordinary least squares fit and residual variance estimate.
  beta.ls <- solve(t(X) %*% X) %*% t(X) %*% y
  fit <- X %*% beta.ls
  sig.sq <- sum((y - fit)^2) / (n - p)

  # Conjugate normal posterior mean for beta, treating sig.sq as known.
  post.prec <- prior.prec + ((t(X) %*% X)/sig.sq)
  post.var <- solve(post.prec)
  tmp <- (prior.prec %*% prior.mean) + ((t(X) %*% y)/sig.sq)
  beta <- post.var %*% tmp

  return(list(beta = beta, sig.sq = sig.sq))
}

################################################################
#
# Function 3.  One iterate of the Gibbs sampler
#
# Tasks:  Generate each theta_i in turn
#         Generate theta_star
#
################################################################

fn.one.iterate.4 <- function(clust,prior,mu=0,data)
{
  # One full Gibbs sweep over the nonparametric part: remove and
  #  regenerate each theta_i in turn, then regenerate theta.star.
  # Relies on fn.remove.theta.i, fn.gen.theta.i.2 and
  #  fn.gen.theta.star.3 defined elsewhere in this file set.
  # Returns a list with the updated $clust and the (unchanged) $prior.
  n <- length(data$x)
  for (i in seq_len(n))
  {
    clust <- fn.remove.theta.i(i,clust)
    clust <- fn.gen.theta.i.2(i,clust,prior,mu,data)
  }
  clust <- fn.gen.theta.star.3(clust,prior,mu,data)

  ret.obj <- NULL
  ret.obj$clust <- clust
  ret.obj$prior <- prior
  return(ret.obj)
}

################################################################
#
# Function 4.  The nonparametric bits
#
# These conditional generations match those for the model for
#  normal means.  One need only adjust the data fed into the
#  routine.  Work with the residuals.
#
################################################################

fn.gibbs.sampler.reg.3 <- function(n.reps,prior,data,clust,mu,n.burn=10)
{
  # Gibbs sampler for the regression model with MDP error distribution.
  # Returns res.mat (n.reps x (p+1+n)): cols 1:p hold beta, col p+1
  #  holds sig.sq, and cols (p+2):(p+1+n) hold theta_i = theta.star[s].
  #
  # NOTE(review): 'param' is never created locally -- the first
  #  'param$beta <- ...' below copies the *global* 'param', so a global
  #  param with param$sig.sq set must exist before calling; confirm.
  # NOTE(review): data$resids is computed once at initialization and is
  #  never refreshed after beta changes, so the DP step always sees the
  #  initial residuals -- verify this is intended.
  # NOTE(review): the 'mu' argument is accepted but unused here
  #  (fn.one.iterate.4 is called with its default mu = 0).
  n <- data$n
  p <- data$p

  # Initialization: posterior-mean beta given an LS sigma^2 estimate.
  tmp <- fn.init.beta.bls(prior$beta,data)
  param$beta <- as.vector(tmp$beta)
  # param$sig.sq <- tmp$sig.sq
  data$fits <- data$X %*% param$beta
  data$resids <- data$y - data$fits
  clust$theta.star <- data$resids

  print("init done")

  # One sweep: DP update on the residuals, then a fresh beta draw.
  # (Shared by the burn-in and estimation loops; order of operations
  #  matches the original code exactly.)
  one.sweep <- function(clust, param)
  {
    tmp.data <- NULL
    tmp.data$x <- data$resids
    tmp.prior <- NULL
    tmp.prior$tau.sq <- prior$tau.sq
    tmp.prior$sig.sq <- param$sig.sq
    tmp.prior$M <- prior$M
    tmp <- fn.one.iterate.4(clust=clust,prior=tmp.prior,data=tmp.data)
    clust <- tmp$clust
    param$beta <- fn.gen.beta(prior$beta,param,clust,data)
    list(clust = clust, param = param)
  }

  # Burn-in period (seq_len handles n.burn = 0 correctly, unlike 1:n.burn).
  for (i in seq_len(n.burn))
  {
    swept <- one.sweep(clust, param)
    clust <- swept$clust
    param <- swept$param
  }

  # Estimation period: record beta, sig.sq and the theta_i each sweep.
  res.mat <- matrix(rep(0,n.reps*(p+1+n)),nrow=n.reps)
  for (i in seq_len(n.reps))
  {
    swept <- one.sweep(clust, param)
    clust <- swept$clust
    param <- swept$param
    res.mat[i,] <- c(param$beta,param$sig.sq,clust$theta.star[clust$s])
  }

  return(res.mat)
}

################################################################
#
# Function 5.  A density estimate of the residual distribution
#
#  This one relies on slow and inefficient coding.  If k << n,
#   substantial gains in speed are possible by working with the
#   theta.star instead of the theta.
#
#  Input:  theta is a matrix, each row of which contains the
#          values of theta associated with the n cases
#
#  Usage tmp.prior <- NULL; tmp.prior$sig.sq <- prior$sig.sq;
#        tmp.prior$tau.sq <- prior$tau.sq; tmp.prior$M <- M;
#        plot(fn.dens.est(tmp.prior,theta))
#
################################################################

fn.dens.est <- function(prior,theta,num.grid=1000)
{
  # Posterior predictive density estimate of the residual (theta)
  #  distribution under the MDP model: a mixture of normal kernels at
  #  every sampled theta plus the base measure F_0 = N(0, sig.sq+tau.sq)
  #  with weight M per iterate.
  #
  # prior:    list with $sig.sq, $tau.sq and $M
  # theta:    n.reps x n matrix; each row is one iterate's thetas
  # num.grid: the grid has num.grid + 1 evaluation points
  # Returns a list with $x (grid) and $y (density), plottable like
  #  the output of stats::density.
  sig.sq <- prior$sig.sq
  tau.sq <- prior$tau.sq
  M <- prior$M
  n <- dim(theta)[2]        # cases per iterate
  n.reps <- dim(theta)[1]   # number of MCMC iterates
  sigma <- sqrt(sig.sq)

  # Evaluation grid spanning the thetas plus 2.5 sd on each side.
  # Fix: the original grid (0:num.grid)/(num.grid+1) stopped short of
  #  max.pt; seq() reaches both endpoints with evenly spaced points.
  min.pt <- min(theta) - 2.5 * sigma
  max.pt <- max(theta) + 2.5 * sigma
  grid <- seq(from = min.pt, to = max.pt, length.out = num.grid + 1)

  # Slow O(n * n.reps) accumulation of kernels; see the header comment
  #  about working with theta.star instead when k << n.
  theta.vec <- as.vector(theta)
  y <- rep(0, num.grid + 1)
  for (i in seq_along(theta.vec))
  {
    y <- y + dnorm(x=grid, mean=theta.vec[i], sd=sigma)
  }
  # Base-measure contribution, weight M for each of the n.reps iterates.
  y <- y + M * n.reps * dnorm(x=grid, mean=0, sd=sqrt(sig.sq + tau.sq))

  # Normalize; total kernel weight is (M + n) per iterate.
  # ('dens' rather than 'density' -- avoids shadowing stats::density.)
  dens <- NULL
  dens$x <- grid
  dens$y <- y / ((M + n) * n.reps)

  return(dens)
}

################################################################
#
# Making use of the code.
#
# The bit below makes use of the regression model with a nonparametric
#  error distribution.  It fits a simple linear regression model to
#  weight as a function of height to the bodyfat data set.
#
################################################################

# sets up the objects to run the code

data <- NULL
data$y <- bodyfat.mat[,4]
data$X <- cbind(rep(1,length(data$y)),bodyfat.mat[,5])
data$n <- length(data$y)
data$p <- dim(data$X)[2]
# OLS fits and residuals
data$fits <- data$X %*% solve(t(data$X) %*% data$X) %*% t(data$X) %*% data$y
data$resids <- data$y - data$fits

prior <- NULL
prior$beta$mean <- c(0,0)
prior$beta$var <- diag(c(10000,10000))
prior$beta$prec <- solve(prior$beta$var)
# prior$sig.sq$a  # These lines would be used if sigma^2
# prior$sig.sq$b  # were treated as unknown
prior$M <- 1
prior$tau.sq <- 400

# every case starts in its own singleton cluster
clust <- NULL
clust$k <- data$n
clust$s <- 1:data$n
clust$n.i <- rep(1,data$n)
clust$theta.star <- rep(0,data$n)

# fn.gibbs.sampler.reg.3 reads the global param (param$sig.sq) -- it
#  must be set before the sampler is called
param <- NULL
param$beta <- c(0,0)
param$sig.sq <- 100

# runs the code.  100 burn-in, 1000 reps for estimation --too small, but relatively quick.
set.seed(544)
res.mat.a <- fn.gibbs.sampler.reg.3(n.reps=1000,prior=prior,data=data,clust=clust,mu=0,n.burn=100)

# takes a look at the output.
#  plots of residual density estimates - kde from least squares and the Bayes model
#  (columns 4:255 are the n = 252 theta_i draws)
tmp.prior <- prior
tmp.prior$sig.sq <- param$sig.sq
plot(fn.dens.est(tmp.prior,res.mat.a[,4:255]))
lines(density(data$resids))
# least squares: residuals vs. fits
plot(data$fits,data$resids)
# a plot of residuals from this model vs. those from a least squares fit
plot(data$resids,apply(res.mat.a[,4:255],2,mean))

## How stable is the output from this sampler?  We can investigate this through
#   a variety of diagnostics.  We'll pursue the approach of looking at independent
#   runs of the code.  All runs are based on the same initialization.
# (No new set.seed here, so runs b-e continue the RNG stream from run a.)
res.mat.b <- fn.gibbs.sampler.reg.3(n.reps=1000,prior=prior,data=data,clust=clust,mu=0,n.burn=100)
res.mat.c <- fn.gibbs.sampler.reg.3(n.reps=1000,prior=prior,data=data,clust=clust,mu=0,n.burn=100)
res.mat.d <- fn.gibbs.sampler.reg.3(n.reps=1000,prior=prior,data=data,clust=clust,mu=0,n.burn=100)
res.mat.e <- fn.gibbs.sampler.reg.3(n.reps=1000,prior=prior,data=data,clust=clust,mu=0,n.burn=100)
# Stack the five runs: columns 1-2 are beta, 3 is sig.sq, 4:(3+n) are theta_i.
big.res.mat <- rbind(res.mat.a,res.mat.b,res.mat.c,res.mat.d,res.mat.e)

# standard trace-plot diagnostics for the runs
plot(big.res.mat[,1],type='n'); points(big.res.mat[,1],pch='.')
plot(big.res.mat[,2],type='n'); points(big.res.mat[,2],pch='.')
# column 103 is theta_100 (the theta columns begin at column 4)
plot(big.res.mat[,103],type='n'); points(big.res.mat[,103],pch='.')

# a plot of beta_1 vs beta_0
plot(big.res.mat[,1],big.res.mat[,2],type='n')
points(big.res.mat[,1],big.res.mat[,2],pch='.')

# density estimates of the residual distribution
# NOTE(review): this lines() call adds to whichever plot is current.
lines(fn.dens.est(tmp.prior,res.mat.b[,4:255]))
dens.est.a <- fn.dens.est(tmp.prior,res.mat.a[,4:255])
dens.est.b <- fn.dens.est(tmp.prior,res.mat.b[,4:255])
dens.est.c <- fn.dens.est(tmp.prior,res.mat.c[,4:255])
dens.est.d <- fn.dens.est(tmp.prior,res.mat.d[,4:255])
dens.est.e <- fn.dens.est(tmp.prior,res.mat.e[,4:255])
plot(dens.est.a,type="n")
lines(dens.est.a)
lines(dens.est.b)
lines(dens.est.c)
lines(dens.est.d)
lines(dens.est.e)

# Within-run posterior means of beta (cols 1-2), one row per run.
mean.mat <- matrix(rep(0,10),ncol=2)
mean.mat[1,] <- apply(res.mat.a[,1:2],2,mean)
mean.mat[2,] <- apply(res.mat.b[,1:2],2,mean)
mean.mat[3,] <- apply(res.mat.c[,1:2],2,mean)
mean.mat[4,] <- apply(res.mat.d[,1:2],2,mean)
mean.mat[5,] <- apply(res.mat.e[,1:2],2,mean)
# Overall mean and between-run Monte Carlo standard error (5 runs).
apply(mean.mat,2,mean)
sqrt(apply(mean.mat,2,var)/5)

# Posterior quantiles for beta_1 from the pooled runs.
quantile(x=big.res.mat[,2],probs=c(0.025,0.5,0.975))

# Within-run posterior standard deviations of beta, run by run.
sqrt(apply(res.mat.a[,1:2],2,var))
sqrt(apply(res.mat.b[,1:2],2,var))
sqrt(apply(res.mat.c[,1:2],2,var))
sqrt(apply(res.mat.d[,1:2],2,var))
sqrt(apply(res.mat.e[,1:2],2,var))
################# A modified data set, removing case 42, the 29.5 inch tall man

data.f <- data          # these for backup
clust.f <- clust
prior.f <- prior
param.f <- param

# adjust data and clust.  prior and param are unchanged.
data$y <- data$y[-42]
data$X <- data$X[-42,]
data$n <- length(data$y)
data$p <- dim(data$X)[2]
# OLS fits and residuals
data$fits <- data$X %*% solve(t(data$X) %*% data$X) %*% t(data$X) %*% data$y
data$resids <- data$y - data$fits
clust$k <- data$n
clust$s <- 1:data$n
clust$n.i <- rep(1,data$n)
clust$theta.star <- rep(0,data$n)
data.s <- data
clust.s <- clust
prior.s <- prior
param.s <- param

################ plots for handout

res.mat <- fn.gibbs.sampler.reg.3(n.reps=1000,prior=prior,data=data,clust=clust,mu=0,n.burn=100)
# With case 42 removed n = 251, so res.mat has p+1+n = 254 columns and the
#  theta columns are 4:(3+data$n); the original index 4:255 ran past the
#  last column.
plot(data$resids,apply(res.mat[,4:(3+data$n)],2,mean))
plot(data$fits,data$resids)
plot(density(data$resids))
# With case 42 removed, res.mat has p+1+n = 254 columns, so the theta
#  columns are 4:(3+data$n); the original index 4:255 over-indexes.
plot(density(res.mat[,4:(3+data$n)]))

#ThetaHatvsResids.ps
tmp.prior <- prior
tmp.prior$sig.sq <- param$sig.sq
plot(fn.dens.est(tmp.prior,res.mat[,4:(3+data$n)]))

#PredictiveResidualDensities
plot(dens.est.a,type="n")
lines(dens.est.a)
lines(dens.est.b)
lines(dens.est.c)
lines(dens.est.d)
lines(dens.est.e)

#Beta0TimePlot
plot(big.res.mat[,1],type='n'); points(big.res.mat[,1],pch='.')
#Beta1TimePlot
plot(big.res.mat[,2],type='n'); points(big.res.mat[,2],pch='.')
#Theta100TimePlot
plot(big.res.mat[,103],type='n'); points(big.res.mat[,103],pch='.')
#Beta1vsBeta0
plot(big.res.mat[,1],big.res.mat[,2],type='n')
points(big.res.mat[,1],big.res.mat[,2],pch='.')
#Beta1Density
plot(density(big.res.mat[,2]))
#LeastSquaresResidsVsFits
plot(data$fits,data$resids)

################################################################
#
# Function.  A parametric Bayes fit of the model -- known variance
#
################################################################

fn.fit.beta.bls <- function(prior,sig.sq,data)
{
  # Conjugate normal-theory regression fit, sigma^2 treated as known.
  #
  # prior:  list with $mean and $prec for beta
  # sig.sq: the known error variance used in the posterior update
  # data:   list with $X, $y, $n, $p
  #
  # Returns a list with:
  #   $beta      posterior mean of beta
  #   $Sigma     posterior covariance of beta
  #   $fits      fitted values at the posterior mean
  #   $resids    residuals from those fits
  #   $sig.sq    sum of squared residuals at the posterior mean
  #              (NOT divided by degrees of freedom -- name is misleading)
  #   $beta.ls   ordinary least squares estimate
  #   $sig.sq.ls unbiased LS variance estimate, RSS/(n-p)
  X <- data$X
  y <- data$y
  n <- data$n
  p <- data$p
  prior.mean <- prior$mean
  prior.prec <- prior$prec

  # Least squares reference fit.
  beta.ls <- solve(t(X) %*% X) %*% t(X) %*% y
  fit <- X %*% beta.ls
  sig.sq.ls <- sum((y - fit)^2) / (n - p)

  # Conjugate posterior for beta given the supplied sig.sq.
  post.prec <- prior.prec + ((t(X) %*% X)/sig.sq)
  post.var <- solve(post.prec)
  tmp <- (prior.prec %*% prior.mean) + ((t(X) %*% y)/sig.sq)
  beta <- post.var %*% tmp

  # Fits and residuals at the posterior mean (the original computed
  #  these twice; once is enough).
  fits <- X %*% beta
  resids <- y - fits

  out <- NULL
  out$beta <- beta
  out$Sigma <- post.var
  out$fits <- fits
  out$resids <- resids
  out$sig.sq <- sum(resids^2)
  out$beta.ls <- beta.ls
  out$sig.sq.ls <- sig.sq.ls
  return(out)
}

# The small data set (case 42 removed)
prior$beta$mean <- c(0,0)
prior$beta$var <- diag(c(10000,10000))
prior$beta$prec <- solve(prior$beta$var)
param$sig.sq <- 500
bayes.regression <- fn.fit.beta.bls(prior=prior$beta,
                                    sig.sq=param$sig.sq, data=data.s)
cbind(bayes.regression$beta,bayes.regression$beta.ls)
bayes.regression$Sigma
plot(density(bayes.regression$resids))

# Adjusting the prior for beta_1
prior$beta$mean <- c(0,5)
prior$beta$var <- diag(c(10000,2.25))
prior$beta$prec <- solve(prior$beta$var)
bayes.regression <- fn.fit.beta.bls(prior=prior$beta,
                                    sig.sq=param$sig.sq, data=data.s)
cbind(bayes.regression$beta,bayes.regression$beta.ls)
bayes.regression$Sigma
plot(density(bayes.regression$resids))

# Working with recentered data - 72 inches is recentering point
prior$gamma$mean <- c(180,5)
prior$gamma$var <- diag(c(50,2.25))
prior$gamma$prec <- solve(prior$gamma$var)
# Linear map taking the (gamma_0, gamma_1) prior at 72 inches back to
#  the (beta_0, beta_1) scale: beta = trans.mat %*% gamma.
trans.mat <- matrix(c(1,-72,0,1),ncol=2,byrow=TRUE)
prior$beta$mean <- trans.mat %*% prior$gamma$mean
prior$beta$var <- trans.mat %*% prior$gamma$var %*% t(trans.mat)
prior$beta$prec <- solve(prior$beta$var)
trans.mat
prior$gamma$mean
prior$beta$mean
prior$gamma$var
prior$beta$var
bayes.regression <- fn.fit.beta.bls(prior=prior$beta,
                                    sig.sq=param$sig.sq, data=data.s)
cbind(bayes.regression$beta,bayes.regression$beta.ls)
bayes.regression$Sigma
plot(density(bayes.regression$resids))

# Recentered data, full data set
bayes.regression <- fn.fit.beta.bls(prior=prior$beta,
                                    sig.sq=param$sig.sq, data=data.f)
cbind(bayes.regression$beta,bayes.regression$beta.ls)
bayes.regression$Sigma
plot(density(bayes.regression$resids))
# Overlay the N(0, 500) density implied by the known-variance model.
tmp.grid <- (-60:200)
tmp.dens <- dnorm(x=tmp.grid,mean=0,sd=sqrt(500))
lines(tmp.grid,tmp.dens)

################################################################
#
# Elicitations from class
#
# Each row is one prior.  I've adjusted a couple -- used Mike's
#  100000000000 to replace infinity in one case, used 0.000001 to
#  replace 0 in another case.  All have been translated into pounds
#  and inches.  Distributional forms for the analysis have all been
#  presumed to be normal, and hierarchies have been trimmed.  Also,
#  I've resolved ambiguities as best I could.
#
# First two components are mean vector for beta, next four components
#  are the covariance matrix.  Mine is the last one.
#
################################################################

prior.mat <- matrix(rep(0,15*6),ncol=6)
prior.mat[1,] <- c(0,0,100000000000,0,0,100000000000)
prior.mat[2,] <- c(0,0,10,0,0,10)
prior.mat[3,] <- c(0,1.862667,249806,0,0,249806)
prior.mat[4,] <- c(20,2,5,0.2,0.2,0.8)
prior.mat[5,] <- c(16.764,2.2352,3122.574,0,0,0.25)
prior.mat[6,] <- c(-53,2.5,100,0,0,1)
prior.mat[7,] <- c(0,3,100,0,0,2.25)
prior.mat[8,] <- c(-180,5,11714,-162,-162,2.25)
prior.mat[9,] <- c(-170,5,2500,0,0,9)
prior.mat[10,] <- c(0,5,10000,0,0,9)
prior.mat[11,] <- c(0,5,100,0,0,100)
prior.mat[12,] <- c(0,5,1,-0.5,-0.5,1)
prior.mat[13,] <- c(0,5,0.000001,0,0,6.25)
prior.mat[14,] <- c(0,5.588,100,10,10,100)
prior.mat[15,] <- c(0,10,100000000000,0,0,100000000000)

param$sig.sq <- 660    # approximately sigma^2 from a least squares analysis

# Run the known-variance Bayes fit under each elicited prior and keep
#  the posterior mean and covariance in the same layout as prior.mat
#  (cols 1-2: mean vector; cols 3-6: covariance matrix, row-major).
posterior.mat <- prior.mat
for (i in seq_len(nrow(prior.mat)))
{
  spec <- prior.mat[i,]
  prior$beta$mean <- spec[1:2]
  prior$beta$var <- matrix(spec[3:6],ncol=2,byrow=TRUE)
  prior$beta$prec <- solve(prior$beta$var)
  fit <- fn.fit.beta.bls(prior=prior$beta, sig.sq=param$sig.sq,
                         data=data.s)
  posterior.mat[i,] <- c(as.vector(fit$beta),as.vector(fit$Sigma))
}
# Summary table: one row per prior; posterior mean and sd for beta_0
#  (cols 1-2), then posterior mean and sd for beta_1 (cols 3-4).
matrix(round(c(posterior.mat[,1],sqrt(posterior.mat[,3]),
               posterior.mat[,2],sqrt(posterior.mat[,6])),2),ncol=4)

# A plot of posterior mean for beta_1 vs. its prior mean
#  Priors where the intercept has small variance are plotted
#  as 'x'; where the intercept has large variance as 'o'.
plot(prior.mat[,2],posterior.mat[,2],type='n',xlab='Prior Mean',ylab='Posterior Mean')
tmp <- which(prior.mat[,3] < 5000)
points(prior.mat[tmp,2],posterior.mat[tmp,2],pch='x')
points(prior.mat[-tmp,2],posterior.mat[-tmp,2],pch='o')

# Cross-validation and the success of prior elicitation.  I've relied
#  on the prior that I specified.
tmp <- prior.mat[8,]
prior$beta$mean <- tmp[1:2]
prior$beta$var <- matrix(tmp[3:6],ncol=2,byrow=TRUE)
prior$beta$prec <- solve(prior$beta$var)

# For each training-set size (10, 20, ..., 200) repeatedly split the
#  reduced data set into train/test, fit by least squares and by Bayes
#  on the training part, and record predictive MSE on the held-out part.
coll.res <- matrix(rep(0,20*3),ncol=3)
for (j in 1:20)
{
  n.train <- 10 * j
  set.seed(544)   # reset per size so every size sees the same permutations
  res.mat <- matrix(rep(0,20000),ncol=2)

  for (i in 1:10000)
  {
    test <- NULL
    perm <- sample(x=1:251)   # 251 = data.s$n (case 42 removed)
    train.ind <- perm[1:n.train]
    test.ind <- perm[(n.train+1):251]

    data.train <- data.s
    data.train$y <- data.train$y[train.ind]
    data.train$X <- data.train$X[train.ind,]
    data.train$n <- n.train
    data.train$fits <- rep(0,n.train)
    data.train$resids <- rep(0,n.train)

    tmp <- fn.fit.beta.bls(prior=prior$beta,
                           sig.sq=param$sig.sq, data=data.train)

    test$X <- data.s$X[test.ind,]
    test$y <- data.s$y[test.ind]
    ls.fit <- test$X %*% tmp$beta.ls
    bayes.fit <- test$X %*% tmp$beta
    ls.mse <- mean((test$y - ls.fit)^2)
    bayes.mse <- mean((test$y - bayes.fit)^2)   # fixed stray '\' in test$y
    res.mat[i,] <- c(ls.mse,bayes.mse)
  }
  coll.res[j,] <- c(n.train,apply(res.mat,2,mean))
}

# A plot for the handout on prior elicitation
#  col 1 of coll.res = training sample size; col 2 = least squares
#  predictive mse; col 3 = Bayes predictive mse (from the CV loop above)
plot(rep(coll.res[,1],2),c(coll.res[,2],coll.res[,3]),
xlab='training sample size',ylab='predictive mse')
lines(coll.res[,1],coll.res[,2])
lines(coll.res[,1],coll.res[,3])