# Supplement to MDP code for regression modelling.
# The code relies on the MASS library (for mvrnorm).
library(MASS)
# The model to be fit with the code below is a
# traditional linear regression model, with one
# exception: the errors are presumed to follow
# an "arbitrary" continuous distribution, and the
# prior over this distribution is an MDP model.
# Note that the parameter sigma^2 is not the
# usual residual variance. Also, one must be
# careful in the interpretation of "beta_0": the
# mean of the distribution F is swept into the
# intercept.
################################################################
#
# Part 1. A description of the model
#
# The model is a conjugate, normal theory regression model
# Variances are treated as known throughout
#
# The model is the basic model discussed in class
# beta ~ N(mu_beta, Sig_beta)
# F ~ DP(M F_0); F_0 = N(0, tau^2)
# theta_i | F ~ F
# y_i | theta_i ~ N(x_i^t beta + theta_i, sigma^2)
#
################################################################
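#
# To make the data-generating process concrete, the snippet below
# simulates one data set from the model via the Polya urn. This is
# an added illustration, not part of the original supplement; the
# sample size and parameter values are arbitrary assumptions.
#
sim.n <- 100
sim.M <- 1
sim.tau.sq <- 4
sim.sig.sq <- 1
sim.beta <- c(2,0.5)
sim.X <- cbind(1,rnorm(sim.n))
sim.theta <- rep(0,sim.n)
sim.theta[1] <- rnorm(1,mean=0,sd=sqrt(sim.tau.sq)) # first draw comes from F_0
for (i in 2:sim.n)
{
if (runif(1) < sim.M/(sim.M + i - 1)) {
sim.theta[i] <- rnorm(1,mean=0,sd=sqrt(sim.tau.sq)) # new value from F_0
} else {
sim.theta[i] <- sim.theta[sample.int(i - 1,1)] # copy an earlier theta
}
}
sim.y <- as.vector(sim.X %*% sim.beta) + sim.theta + rnorm(sim.n,mean=0,sd=sqrt(sim.sig.sq))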
################################################################
#
# Part 2. Some useful objects - the initial ones are just those
# from the simpler normal means problem.
#
# First, we'll need information on the clusters
#
# clust$s
# clust$k
# clust$n.i
# clust$theta.star
#
# It's also good to have the data stored as an object
#
# data$y
# data$X
# data$n
# data$p
# data$fits
# data$resids
#
# The parameters governing likelihood and prior are also essential
#
# prior$beta$mean
# prior$beta$var
# prior$beta$prec
# prior$sig.sq$a
# prior$sig.sq$b
# prior$M
# prior$tau.sq
#
# Also, we'll track the parameter values as the Gibbs sampler progresses
#
# param$beta
# param$sig.sq
# param$tau.sq
#
################################################################
################################################################
#
# Function 1. Generation of the beta vector
#
# The calculations for the conditional generation match those
# of the typical parametric model exactly.
#
# Makes use of mvrnorm from the MASS library.
################################################################
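#
# With y.adj = y - theta.star[s], the full conditional for beta is
#   beta | ... ~ N(m, V),
#   V = (Sig_beta^{-1} + X'X/sigma^2)^{-1}
#   m = V (Sig_beta^{-1} mu_beta + X'y.adj/sigma^2)
#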
fn.gen.beta <- function(prior,param,clust,data)
{
s <- clust$s
theta.star <- clust$theta.star
y <- data$y - theta.star[s]
X <- data$X
prior.mean <- prior$mean
prior.var <- prior$var
prior.prec <- prior$prec
sig.sq <- param$sig.sq
post.prec <- prior.prec + ((t(X) %*% X)/sig.sq)
post.var <- solve(post.prec)
tmp <- (prior.prec %*% prior.mean) + ((t(X) %*% y)/sig.sq)
post.mean <- post.var %*% tmp
beta <- mvrnorm(n=1,mu=post.mean,Sigma=post.var)
return(beta)
}
################################################################
#
# Function 2. Initialization of the beta vector and sigma^2
#
# This routine initializes the beta vector at the posterior mean
# for the regression coefficients -- given an estimated value
# of sigma^2 based on least squares.
#
################################################################
fn.init.beta.bls <- function(prior,data)
{
X <- data$X
y <- data$y
n <- data$n
p <- data$p
prior.mean <- prior$mean
prior.var <- prior$var
prior.prec <- prior$prec
beta.ls <- solve(t(X) %*% X) %*% t(X) %*% y
fit <- X %*% beta.ls
sig.sq <- sum((y - fit)^2) / (n - p)
post.prec <- prior.prec + ((t(X) %*% X)/sig.sq)
post.var <- solve(post.prec)
tmp <- (prior.prec %*% prior.mean) + ((t(X) %*% y)/sig.sq)
beta <- post.var %*% tmp
tmp <- NULL
tmp$beta <- beta
tmp$sig.sq <- sig.sq
return(tmp)
}
################################################################
#
# Function 3. One iterate of the Gibbs sampler
#
# Tasks: Generate each theta_i in turn
# Generate theta_star
#
################################################################
fn.one.iterate.4 <- function(clust,prior,mu=0,data)
{
n <- length(data$x)
for (i in 1:n)
{
clust <- fn.remove.theta.i(i,clust)
clust <- fn.gen.theta.i.2(i,clust,prior,mu,data)
}
clust <- fn.gen.theta.star.3(clust,prior,mu,data)
ret.obj <- NULL
ret.obj$clust <- clust
ret.obj$prior <- prior
return(ret.obj)
}
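################################################################
#
# The three cluster-update routines called above (fn.remove.theta.i,
# fn.gen.theta.i.2, fn.gen.theta.star.3) are defined in the normal
# means code supplement and are not reproduced here. For readers
# without that file, a minimal sketch of fn.remove.theta.i under the
# clust conventions of Part 2 might look like the commented code
# below (a reconstruction, not necessarily the original):
#
# fn.remove.theta.i <- function(i,clust)
# {
#   ci <- clust$s[i]
#   clust$n.i[ci] <- clust$n.i[ci] - 1
#   if (clust$n.i[ci] == 0) # the cluster is now empty; drop it and relabel
#   {
#     clust$theta.star <- clust$theta.star[-ci]
#     clust$n.i <- clust$n.i[-ci]
#     clust$s[clust$s > ci] <- clust$s[clust$s > ci] - 1
#     clust$k <- clust$k - 1
#   }
#   clust$s[i] <- NA # case i is unassigned until fn.gen.theta.i.2 runs
#   return(clust)
# }
################################################################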
################################################################
#
# Function 4. The full Gibbs sampler for the regression model
#
# The nonparametric conditional generations match those for the
# model for normal means; one need only feed the residuals into
# the routine as the data. The beta update uses Function 1 above.
#
################################################################
fn.gibbs.sampler.reg.3 <- function(n.reps,prior,data,clust,mu,n.burn=10)
{
n <- data$n
p <- data$p
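# Note: param is not an argument to this function; param$sig.sq must
# exist in the calling workspace (sigma^2 is treated as known here)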
# Initialization: set beta to its posterior mean, given a least squares estimate of sigma^2
tmp <- fn.init.beta.bls(prior$beta,data)
param$beta <- as.vector(tmp$beta)
# param$sig.sq <- tmp$sig.sq
data$fits <- data$X %*% param$beta
data$resids <- data$y - data$fits
clust$theta.star <- data$resids
print("init done")
# Burn-in period
for (i in 1:n.burn)
{
tmp.data <- NULL
tmp.data$x <- data$resids
tmp.prior <- NULL
tmp.prior$tau.sq <- prior$tau.sq
tmp.prior$sig.sq <- param$sig.sq
tmp.prior$M <- prior$M
tmp <- fn.one.iterate.4(clust=clust,prior=tmp.prior,data=tmp.data)
clust <- tmp$clust
param$beta <- fn.gen.beta(prior$beta,param,clust,data)
}
# Set up output object and run chain for estimation period
res.mat <- matrix(rep(0,n.reps*(p+1+n)),nrow=n.reps)
for (i in 1:n.reps)
{
tmp.data <- NULL
tmp.data$x <- data$resids
tmp.prior <- NULL
tmp.prior$tau.sq <- prior$tau.sq
tmp.prior$sig.sq <- param$sig.sq
tmp.prior$M <- prior$M
tmp <- fn.one.iterate.4(clust=clust,prior=tmp.prior,data=tmp.data)
clust <- tmp$clust
param$beta <- fn.gen.beta(prior$beta,param,clust,data)
res.mat[i,] <- c(param$beta,param$sig.sq,clust$theta.star[clust$s])
}
return(res.mat)
}
################################################################
#
# Function 5. A density estimate of the residual distribution
#
# This one relies on slow and inefficient coding. If k << n,
# substantial gains in speed are possible by working with the
# theta.star instead of the theta (a sketch of this speed-up
# appears after the function).
#
# Input: theta is a matrix, each row of which contains the
# values of theta associated with the n cases
#
# Usage tmp.prior <- NULL; tmp.prior$sig.sq <- prior$sig.sq;
# tmp.prior$tau.sq <- prior$tau.sq; tmp.prior$M <- M;
# plot(fn.dens.est(tmp.prior,theta))
#
################################################################
fn.dens.est <- function(prior,theta,num.grid=1000)
{
sig.sq <- prior$sig.sq
tau.sq <- prior$tau.sq
M <- prior$M
n <- dim(theta)[2]
n.reps <- dim(theta)[1]
sigma <- sqrt(sig.sq)
min.pt <- min(theta) - 2.5 * sigma
max.pt <- max(theta) + 2.5 * sigma
grid <- (0:num.grid)/num.grid # num.grid+1 equally spaced points spanning [min.pt, max.pt]
grid <- min.pt + grid * (max.pt - min.pt)
theta.vec <- as.vector(theta)
y <- rep(0,num.grid+1)
for (i in 1:length(theta.vec))
{
y <- y + dnorm(x=grid,mean=theta.vec[i],sd=sigma)
}
y <- y + M * n.reps * dnorm(x=grid,mean=0,sd=sqrt(sig.sq + tau.sq))
density <- NULL
density$x <- grid
density$y <- y / ((M+n)*n.reps)
return(density)
}
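################################################################
#
# As noted above, when k << n one can work with the unique values of
# theta rather than all n of them. The sketch below (an addition, not
# part of the original supplement) tabulates the unique values --
# cases in the same cluster share a theta.star exactly -- and weights
# each normal kernel by its count:
#
fn.dens.est.fast <- function(prior,theta,num.grid=1000)
{
sig.sq <- prior$sig.sq
tau.sq <- prior$tau.sq
M <- prior$M
n <- dim(theta)[2]
n.reps <- dim(theta)[1]
sigma <- sqrt(sig.sq)
min.pt <- min(theta) - 2.5 * sigma
max.pt <- max(theta) + 2.5 * sigma
grid <- min.pt + ((0:num.grid)/num.grid) * (max.pt - min.pt)
tab <- table(as.vector(theta)) # counts for each unique theta value
vals <- as.numeric(names(tab))
wts <- as.numeric(tab)
y <- rep(0,num.grid+1)
for (j in seq_along(vals))
{
y <- y + wts[j] * dnorm(x=grid,mean=vals[j],sd=sigma)
}
y <- y + M * n.reps * dnorm(x=grid,mean=0,sd=sqrt(sig.sq + tau.sq))
density <- NULL
density$x <- grid
density$y <- y / ((M+n)*n.reps)
return(density)
}
################################################################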
################################################################
#
# Making use of the code.
#
# The bit below makes use of the regression model with a nonparametric
# error distribution. It fits a simple linear regression model to
# weight as a function of height to the bodyfat data set.
#
################################################################
# sets up the objects to run the code
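# (assumes bodyfat.mat is already loaded in the workspace, with
# weight in column 4 and height in column 5)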
data <- NULL
data$y <- bodyfat.mat[,4]
data$X <- cbind(rep(1,length(data$y)),bodyfat.mat[,5])
data$n <- length(data$y)
data$p <- dim(data$X)[2]
# OLS fits and residuals
data$fits <- data$X %*% solve(t(data$X) %*% data$X) %*% t(data$X) %*% data$y
data$resids <- data$y - data$fits
prior <- NULL
prior$beta$mean <- c(0,0)
prior$beta$var <- diag(c(10000,10000))
prior$beta$prec <- solve(prior$beta$var)
# prior$sig.sq$a # These lines would be used if sigma^2
# prior$sig.sq$b # were treated as unknown
prior$M <- 1
prior$tau.sq <- 400
clust <- NULL
clust$k <- data$n
clust$s <- 1:data$n
clust$n.i <- rep(1,data$n)
clust$theta.star <- rep(0,data$n)
param <- NULL
param$beta <- c(0,0)
param$sig.sq <- 100
# runs the code: 100 burn-in iterations, 1000 reps for estimation -- too small, but relatively quick
set.seed(544)
res.mat.a <- fn.gibbs.sampler.reg.3(n.reps=1000,prior=prior,data=data,clust=clust,mu=0,n.burn=100)
# takes a look at the output.
# plots of residual density estimates - kde from least squares and the Bayes model
tmp.prior <- prior
tmp.prior$sig.sq <- param$sig.sq
plot(fn.dens.est(tmp.prior,res.mat.a[,4:255]))
lines(density(data$resids))
# least squares: residuals vs. fits
plot(data$fits,data$resids)
# a plot of residuals from this model vs. those from a least squares fit
plot(data$resids,apply(res.mat.a[,4:255],2,mean))
# How stable is the output from this sampler? We can investigate this through
# a variety of diagnostics. We'll pursue the approach of looking at independent
# runs of the code. All runs are based on the same initialization.
res.mat.b <- fn.gibbs.sampler.reg.3(n.reps=1000,prior=prior,data=data,clust=clust,mu=0,n.burn=100)
res.mat.c <- fn.gibbs.sampler.reg.3(n.reps=1000,prior=prior,data=data,clust=clust,mu=0,n.burn=100)
res.mat.d <- fn.gibbs.sampler.reg.3(n.reps=1000,prior=prior,data=data,clust=clust,mu=0,n.burn=100)
res.mat.e <- fn.gibbs.sampler.reg.3(n.reps=1000,prior=prior,data=data,clust=clust,mu=0,n.burn=100)
big.res.mat <- rbind(res.mat.a,res.mat.b,res.mat.c,res.mat.d,res.mat.e)
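# As one quantitative complement to the trace plots below, here is a
# simple Gelman-Rubin style statistic for beta_1 across the five runs
# (an added check, assuming equal-length runs):
run.list <- list(res.mat.a[,2],res.mat.b[,2],res.mat.c[,2],res.mat.d[,2],res.mat.e[,2])
n.it <- length(run.list[[1]])
run.means <- sapply(run.list,mean)
W <- mean(sapply(run.list,var)) # within-run variance
B <- n.it * var(run.means) # between-run variance
sqrt((((n.it-1)/n.it) * W + B/n.it) / W) # R.hat; values near 1 are good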
# standard trace-plot diagnostics for the runs
plot(big.res.mat[,1],type='n'); points(big.res.mat[,1],pch='.')
plot(big.res.mat[,2],type='n'); points(big.res.mat[,2],pch='.')
plot(big.res.mat[,103],type='n'); points(big.res.mat[,103],pch='.')
# a plot of beta_1 vs beta_0
plot(big.res.mat[,1],big.res.mat[,2],type='n')
points(big.res.mat[,1],big.res.mat[,2],pch='.')
# density estimates of the residual distribution
dens.est.a <- fn.dens.est(tmp.prior,res.mat.a[,4:255])
dens.est.b <- fn.dens.est(tmp.prior,res.mat.b[,4:255])
dens.est.c <- fn.dens.est(tmp.prior,res.mat.c[,4:255])
dens.est.d <- fn.dens.est(tmp.prior,res.mat.d[,4:255])
dens.est.e <- fn.dens.est(tmp.prior,res.mat.e[,4:255])
plot(dens.est.a,type="n")
lines(dens.est.a)
lines(dens.est.b)
lines(dens.est.c)
lines(dens.est.d)
lines(dens.est.e)
mean.mat <- matrix(rep(0,10),ncol=2)
mean.mat[1,] <- apply(res.mat.a[,1:2],2,mean)
mean.mat[2,] <- apply(res.mat.b[,1:2],2,mean)
mean.mat[3,] <- apply(res.mat.c[,1:2],2,mean)
mean.mat[4,] <- apply(res.mat.d[,1:2],2,mean)
mean.mat[5,] <- apply(res.mat.e[,1:2],2,mean)
apply(mean.mat,2,mean)
sqrt(apply(mean.mat,2,var)/5)
quantile(x=big.res.mat[,2],probs=c(0.025,0.5,0.975))
sqrt(apply(res.mat.a[,1:2],2,var))
sqrt(apply(res.mat.b[,1:2],2,var))
sqrt(apply(res.mat.c[,1:2],2,var))
sqrt(apply(res.mat.d[,1:2],2,var))
sqrt(apply(res.mat.e[,1:2],2,var))
################# A modified data set, removing case 42, the 29.5 inch tall man
data.f <- data # back up the full-data objects ('.f' for full)
clust.f <- clust
prior.f <- prior
param.f <- param
# adjust data and clust. prior and param are unchanged.
data$y <- data$y[-42]
data$X <- data$X[-42,]
data$n <- length(data$y)
data$p <- dim(data$X)[2]
# OLS fits and residuals
data$fits <- data$X %*% solve(t(data$X) %*% data$X) %*% t(data$X) %*% data$y
data$resids <- data$y - data$fits
clust$k <- data$n
clust$s <- 1:data$n
clust$n.i <- rep(1,data$n)
clust$theta.star <- rep(0,data$n)
data.s <- data # save the reduced objects ('.s' for small: case 42 removed)
clust.s <- clust
prior.s <- prior
param.s <- param
################ plots for handout
res.mat <- fn.gibbs.sampler.reg.3(n.reps=1000,prior=prior,data=data,clust=clust,mu=0,n.burn=100)
plot(data$resids,apply(res.mat[,4:254],2,mean)) # n is now 251, so theta occupies columns 4:254
plot(data$fits,data$resids)
plot(density(data$resids))
plot(density(res.mat[,4:254]))
#ThetaHatvsResids.ps
tmp.prior <- prior
tmp.prior$sig.sq <- param$sig.sq
plot(fn.dens.est(tmp.prior,res.mat[,4:254]))
#PredictiveResidualDensities
plot(dens.est.a,type="n")
lines(dens.est.a)
lines(dens.est.b)
lines(dens.est.c)
lines(dens.est.d)
lines(dens.est.e)
#Beta0TimePlot
plot(big.res.mat[,1],type='n'); points(big.res.mat[,1],pch='.')
#Beta1TimePlot
plot(big.res.mat[,2],type='n'); points(big.res.mat[,2],pch='.')
#Theta100TimePlot
plot(big.res.mat[,103],type='n'); points(big.res.mat[,103],pch='.')
#Beta1vsBeta0
plot(big.res.mat[,1],big.res.mat[,2],type='n')
points(big.res.mat[,1],big.res.mat[,2],pch='.')
#Beta1Density
plot(density(big.res.mat[,2]))
#LeastSquaresResidsVsFits
plot(data$fits,data$resids)
################################################################
#
# Function. A parametric Bayes fit of the model -- known variance
#
################################################################
fn.fit.beta.bls <- function(prior,sig.sq,data)
{
X <- data$X
y <- data$y
n <- data$n
p <- data$p
prior.mean <- prior$mean
prior.var <- prior$var
prior.prec <- prior$prec
beta.ls <- solve(t(X) %*% X) %*% t(X) %*% y
fit <- X %*% beta.ls
sig.sq.ls <- sum((y - fit)^2) / (n - p)
post.prec <- prior.prec + ((t(X) %*% X)/sig.sq)
post.var <- solve(post.prec)
tmp <- (prior.prec %*% prior.mean) + ((t(X) %*% y)/sig.sq)
beta <- post.var %*% tmp
fits <- X %*% beta
resids <- y - fits
sse <- sum(resids^2) # residual sum of squares at the posterior mean (don't overwrite the known sig.sq)
tmp <- NULL
tmp$beta <- beta
tmp$Sigma <- post.var
tmp$fits <- fits
tmp$resids <- resids
tmp$sse <- sse
tmp$beta.ls <- beta.ls
tmp$sig.sq.ls <- sig.sq.ls
return(tmp)
}
# The small data set (case 42 removed)
prior$beta$mean <- c(0,0)
prior$beta$var <- diag(c(10000,10000))
prior$beta$prec <- solve(prior$beta$var)
param$sig.sq <- 500
bayes.regression <- fn.fit.beta.bls(prior=prior$beta,
sig.sq=param$sig.sq,
data=data.s)
cbind(bayes.regression$beta,bayes.regression$beta.ls)
bayes.regression$Sigma
plot(density(bayes.regression$resids))
# Adjusting the prior for beta_1
prior$beta$mean <- c(0,5)
prior$beta$var <- diag(c(10000,2.25))
prior$beta$prec <- solve(prior$beta$var)
bayes.regression <- fn.fit.beta.bls(prior=prior$beta,
sig.sq=param$sig.sq,
data=data.s)
cbind(bayes.regression$beta,bayes.regression$beta.ls)
bayes.regression$Sigma
plot(density(bayes.regression$resids))
# Working with a prior elicited on recentered data - 72 inches is the recentering point
prior$gamma$mean <- c(180,5)
prior$gamma$var <- diag(c(50,2.25))
prior$gamma$prec <- solve(prior$gamma$var)
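# if E(y) = gamma_0 + gamma_1*(x - 72) = (gamma_0 - 72*gamma_1) + gamma_1*x,
# then beta = trans.mat %*% gamma, so the prior moments transform as below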
trans.mat <- matrix(c(1,-72,0,1),ncol=2,byrow=T)
prior$beta$mean <- trans.mat %*% prior$gamma$mean
prior$beta$var <- trans.mat %*% prior$gamma$var %*% t(trans.mat)
prior$beta$prec <- solve(prior$beta$var)
trans.mat
prior$gamma$mean
prior$beta$mean
prior$gamma$var
prior$beta$var
bayes.regression <- fn.fit.beta.bls(prior=prior$beta,
sig.sq=param$sig.sq,
data=data.s)
cbind(bayes.regression$beta,bayes.regression$beta.ls)
bayes.regression$Sigma
plot(density(bayes.regression$resids))
# Recentered data, full data set
bayes.regression <- fn.fit.beta.bls(prior=prior$beta,
sig.sq=param$sig.sq,
data=data.f)
cbind(bayes.regression$beta,bayes.regression$beta.ls)
bayes.regression$Sigma
plot(density(bayes.regression$resids))
tmp.grid <- (-60:200)
tmp.dens <- dnorm(x=tmp.grid,mean=0,sd=sqrt(500))
lines(tmp.grid,tmp.dens)
################################################################
#
# Elicitations from class
#
# Each row is one prior. I've adjusted a couple: Mike's
# 100000000000 replaces infinity in one case, and 0.000001
# replaces 0 in another. All have been translated into pounds
# and inches. Distributional forms for the analysis have all been
# presumed to be normal, and hierarchies have been trimmed. Also,
# I've resolved ambiguities as best I could.
#
# First two components are mean vector for beta, next four components
# are the covariance matrix. Mine is the last one.
#
################################################################
prior.mat <- matrix(rep(0,15*6),ncol=6)
prior.mat[1,] <- c(0,0,100000000000,0,0,100000000000)
prior.mat[2,] <- c(0,0,10,0,0,10)
prior.mat[3,] <- c(0,1.862667,249806,0,0,249806)
prior.mat[4,] <- c(20,2,5,0.2,0.2,0.8)
prior.mat[5,] <- c(16.764,2.2352,3122.574,0,0,0.25)
prior.mat[6,] <- c(-53,2.5,100,0,0,1)
prior.mat[7,] <- c(0,3,100,0,0,2.25)
prior.mat[8,] <- c(-180,5,11714,-162,-162,2.25)
prior.mat[9,] <- c(-170,5,2500,0,0,9)
prior.mat[10,] <- c(0,5,10000,0,0,9)
prior.mat[11,] <- c(0,5,100,0,0,100)
prior.mat[12,] <- c(0,5,1,-0.5,-0.5,1)
prior.mat[13,] <- c(0,5,0.000001,0,0,6.25)
prior.mat[14,] <- c(0,5.588,100,10,10,100)
prior.mat[15,] <- c(0,10,100000000000,0,0,100000000000)
param$sig.sq <- 660 # approximately sigma^2 from a least squares analysis
posterior.mat <- prior.mat
for (i in 1:15)
{
tmp <- prior.mat[i,]
prior$beta$mean <- tmp[1:2]
prior$beta$var <- matrix(tmp[3:6],ncol=2,byrow=T)
prior$beta$prec <- solve(prior$beta$var)
tmp <- fn.fit.beta.bls(prior=prior$beta,
sig.sq=param$sig.sq,
data=data.s)
posterior.mat[i,] <- c(as.vector(tmp$beta),as.vector(tmp$Sigma))
}
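# summary table: columns are post. mean and post. sd of beta_0,
# then post. mean and post. sd of beta_1 (one row per prior)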
matrix(round(c(posterior.mat[,1],sqrt(posterior.mat[,3]),
posterior.mat[,2],sqrt(posterior.mat[,6])),2),ncol=4)
# A plot of posterior mean for beta_1 vs. its prior mean
# Priors where the intercept has small variance are plotted
# as 'x'; where the intercept has large variance as 'o'.
plot(prior.mat[,2],posterior.mat[,2],type='n',xlab='Prior Mean',ylab='Posterior Mean')
tmp <- (1:15)[prior.mat[,3] < 5000]
points(prior.mat[tmp,2],posterior.mat[tmp,2],pch='x')
points(prior.mat[-tmp,2],posterior.mat[-tmp,2],pch='o')
# Cross-validation and the success of prior elicitation. I've relied
# on the prior that I specified.
tmp <- prior.mat[8,]
prior$beta$mean <- tmp[1:2]
prior$beta$var <- matrix(tmp[3:6],ncol=2,byrow=T)
prior$beta$prec <- solve(prior$beta$var)
coll.res <- matrix(rep(0,20*3),ncol=3)
for (j in 1:20)
{
n.train <- 10 * j
set.seed(544)
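# resetting the seed for each training size reuses the same sequence of
# random permutations, giving common random numbers across sizes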
res.mat <- matrix(rep(0,20000),ncol=2)
for (i in 1:10000)
{
test <- NULL
perm <- sample(x=1:251)
train.ind <- perm[1:n.train]
test.ind <- perm[(n.train+1):251]
data.train <- data.s
data.train$y <- data.train$y[train.ind]
data.train$X <- data.train$X[train.ind,]
data.train$n <- n.train
data.train$fits <- rep(0,n.train)
data.train$resids <- rep(0,n.train)
tmp <- fn.fit.beta.bls(prior=prior$beta,
sig.sq=param$sig.sq,
data=data.train)
test$X <- data.s$X[test.ind,]
test$y <- data.s$y[test.ind]
ls.fit <- test$X %*% tmp$beta.ls
bayes.fit <- test$X %*% tmp$beta
ls.mse <- mean((test$y - ls.fit)^2)
bayes.mse <- mean((test$y - bayes.fit)^2)
res.mat[i,] <- c(ls.mse,bayes.mse)
}
coll.res[j,] <- c(n.train,apply(res.mat,2,mean))
}
# A plot for the handout on prior elicitation
plot(rep(coll.res[,1],2),c(coll.res[,2],coll.res[,3]),
xlab='training sample size',ylab='predictive mse')
lines(coll.res[,1],coll.res[,2])
lines(coll.res[,1],coll.res[,3])
