##### R code (1) to explore distributional shapes in the
##### Beta family of distributions on ( 0, 1 ),
##### (2) to look at the elicited prior distribution in the
##### AMI mortality case study (Lecture Notes, Day 1, Part 2),
##### (3) to compare prior, likelihood and posterior densities,
##### (4) to compare the Beta posterior with its Gaussian
##### approximation, and (5) to do sensitivity analysis
##### of the effect of the prior on the posterior

########## (1) various shapes in the Beta family

theta.for.beta.grid <- seq( 0.01, 0.99, length = 500 )

plot( theta.for.beta.grid, dbeta( theta.for.beta.grid, 0.5, 0.5 ),
  type = 'l', lty = 1, xlab = 'theta', ylab = 'Density',
  ylim = c( 0, 6 ) )

text( 0.1, 3, '( 0.5, 0.5 )' )

lines( theta.for.beta.grid, dbeta( theta.for.beta.grid, 1, 1 ),
  lty = 2 )

text( 0.8, 1.5, '( 1, 1 )' )

lines( theta.for.beta.grid, dbeta( theta.for.beta.grid, 2, 3 ),
  lty = 3 )

text( 0.3, 2.25, '( 2, 3 )' )

lines( theta.for.beta.grid, dbeta( theta.for.beta.grid, 30, 20 ),
  lty = 4 )

text( 0.75, 5, '( 30, 20 )' )

########## (2) the elicited Beta prior

theta.prior.grid <- seq( 0, 0.4, length = 500 )

plot( theta.prior.grid, dbeta( theta.prior.grid, 4.5, 25.5 ),
  type = 'l', lty = 1, xlab = 'theta', ylab = 'Density' )

########## (3) prior, likelihood and posterior plot

prior.likelihood.posterior.grid <- seq( 0, 0.4, length = 500 )

plot( prior.likelihood.posterior.grid,
  dbeta( prior.likelihood.posterior.grid, 4.5, 25.5 ),
  type = 'l', lty = 1, xlab = 'theta', ylab = 'Density',
  ylim = c( 0, 22.5 ) )

text( 0.1, 8, 'prior' )

lines( prior.likelihood.posterior.grid,
  dbeta( prior.likelihood.posterior.grid, 73, 329 ), lty = 2 )

text( 0.225, 17.5, 'likelihood' )

lines( prior.likelihood.posterior.grid,
  dbeta( prior.likelihood.posterior.grid, 76.5, 353.5 ), lty = 3 )

text( 0.125, 20, 'posterior' )

########## (4) Beta posterior and Gaussian approximation

theta.approximation.grid <- seq( 0.12, 0.24, length = 500 )

plot( theta.approximation.grid,
  dbeta( theta.approximation.grid, 76.5, 353.5 ),
  type = 'l', lty = 1, xlab = 'theta', ylab = 'Density',
  ylim = c( 0, 22.5 ) )

text( 0.155, 20, 'posterior' )

alpha <- 4.5
beta <- 25.5
n <- 400
s <- 72

print( alpha.star <- alpha + s )
# 76.5

print( beta.star <- beta + n - s )
# 353.5

print( posterior.mean <- alpha.star / ( alpha.star + beta.star ) )
# 0.1779070

print( posterior.sd <- sqrt( ( alpha.star / ( alpha.star + beta.star ) ) *
  ( beta.star / ( alpha.star + beta.star ) ) /
  ( alpha.star + beta.star + 1 ) ) )
# 0.01842122

##### alpha.star / ( alpha.star + beta.star ) is like p.hat
##### beta.star / ( alpha.star + beta.star ) is like ( 1 - p.hat )
##### ( alpha.star + beta.star + 1 ) is like the sample size n

##### so this posterior sd expression is like
##### sqrt( p.hat * ( 1 - p.hat ) / n ), the frequentist
##### standard error for a proportion estimate, except that
##### p.hat is the posterior mean rather than the MLE,
##### and ( alpha.star + beta.star + 1 ) is essentially
##### the posterior sample size (prior + likelihood) instead
##### of the data sample size

lines( theta.approximation.grid, dnorm( theta.approximation.grid,
  posterior.mean, posterior.sd ), lty = 2 )

text( 0.22, 17.5, 'Gaussian approximation' )

##### exact 95% posterior interval for theta with this prior:

c( qbeta( 0.025, alpha.star, beta.star ),
  qbeta( 0.975, alpha.star, beta.star ) )
# 0.1432590 0.2153918

##### Gaussian approximate 95% posterior interval:

c( posterior.mean - qnorm( 0.975 ) * posterior.sd,
  posterior.mean + qnorm( 0.975 ) * posterior.sd )
# 0.1418021 0.2140119
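##### a quick Monte Carlo check of the exact interval (a sketch
##### added here, not part of the original case-study code; 'n.sims'
##### and the seed are illustrative choices): draw directly from the
##### Beta( 76.5, 353.5 ) posterior and take empirical quantiles

set.seed( 1 )

n.sims <- 100000

theta.draws <- rbeta( n.sims, alpha.star, beta.star )

quantile( theta.draws, probs = c( 0.025, 0.975 ) )
# should closely reproduce 0.1432590 and 0.2153918 above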
########## (5) sensitivity analysis

theta.sensitivity.analysis.grid <- seq( 0.12, 0.24, length = 500 )

n <- 400
s <- 72

##### prior 1 is the elicited prior:

alpha.1 <- 4.5
beta.1 <- 25.5

print( alpha.star.1 <- alpha.1 + s )
# 76.5

print( beta.star.1 <- beta.1 + n - s )
# 353.5

plot( theta.sensitivity.analysis.grid,
  dbeta( theta.sensitivity.analysis.grid, alpha.star.1, beta.star.1 ),
  type = 'l', lty = 1, xlab = 'theta', ylab = 'Density',
  ylim = c( 0, 22.5 ), main = 'Posterior With Varying Priors' )

text( 0.155, 20, 'prior 1' )

##### prior 2 is a diffuse (uniform) prior:

alpha.2 <- 1
beta.2 <- 1

print( alpha.star.2 <- alpha.2 + s )
# 73

print( beta.star.2 <- beta.2 + n - s )
# 329

##### note that this posterior is the same as the likelihood
##### distribution

lines( theta.sensitivity.analysis.grid,
  dbeta( theta.sensitivity.analysis.grid, alpha.star.2, beta.star.2 ),
  lty = 2 )

text( 0.205, 19, 'prior 2' )

##### prior 3 is a bit more diffuse still:

alpha.3 <- 0.5
beta.3 <- 0.5

print( alpha.star.3 <- alpha.3 + s )
# 72.5

print( beta.star.3 <- beta.3 + n - s )
# 328.5

lines( theta.sensitivity.analysis.grid,
  dbeta( theta.sensitivity.analysis.grid, alpha.star.3, beta.star.3 ),
  lty = 3 )

text( 0.2075, 17, 'prior 3' )

##### now let's look at the posterior mean, SD and 95% interval
##### across the 3 priors

##### prior 1

print( alpha.star.1 <- alpha.1 + s )
# 76.5

print( beta.star.1 <- beta.1 + n - s )
# 353.5

print( posterior.mean.1 <- alpha.star.1 / ( alpha.star.1 + beta.star.1 ) )
# 0.1779070

print( posterior.sd.1 <- sqrt( ( alpha.star.1 / ( alpha.star.1 + beta.star.1 ) ) *
  ( beta.star.1 / ( alpha.star.1 + beta.star.1 ) ) /
  ( alpha.star.1 + beta.star.1 + 1 ) ) )
# 0.01842122

c( qbeta( 0.025, alpha.star.1, beta.star.1 ),
  qbeta( 0.975, alpha.star.1, beta.star.1 ) )
# 0.1432590 0.2153918

##### prior 2

print( alpha.star.2 <- alpha.2 + s )
# 73

print( beta.star.2 <- beta.2 + n - s )
# 329

print( posterior.mean.2 <- alpha.star.2 / ( alpha.star.2 + beta.star.2 ) )
# 0.1815920

print( posterior.sd.2 <- sqrt( ( alpha.star.2 / ( alpha.star.2 + beta.star.2 ) ) *
  ( beta.star.2 / ( alpha.star.2 + beta.star.2 ) ) /
  ( alpha.star.2 + beta.star.2 + 1 ) ) )
# 0.01920352

c( qbeta( 0.025, alpha.star.2, beta.star.2 ),
  qbeta( 0.975, alpha.star.2, beta.star.2 ) )
# 0.1454958 0.2206880

##### prior 3

print( alpha.star.3 <- alpha.3 + s )
# 72.5

print( beta.star.3 <- beta.3 + n - s )
# 328.5

print( posterior.mean.3 <- alpha.star.3 / ( alpha.star.3 + beta.star.3 ) )
# 0.180798

print( posterior.sd.3 <- sqrt( ( alpha.star.3 / ( alpha.star.3 + beta.star.3 ) ) *
  ( beta.star.3 / ( alpha.star.3 + beta.star.3 ) ) /
  ( alpha.star.3 + beta.star.3 + 1 ) ) )
# 0.01919461

c( qbeta( 0.025, alpha.star.3, beta.star.3 ),
  qbeta( 0.975, alpha.star.3, beta.star.3 ) )
# 0.1447270 0.2198837

##### ML analysis

print( theta.hat <- s / n )
# 0.18

print( theta.hat.se <- sqrt( theta.hat * ( 1 - theta.hat ) / n ) )
# 0.01920937

c( theta.hat - qnorm( 0.975 ) * theta.hat.se,
  theta.hat + qnorm( 0.975 ) * theta.hat.se )
# 0.1423503 0.2176497

##### ML analysis in this problem corresponds to Bayesian analysis
##### with an improper Beta( 0, 0 ) prior

##### comparison of results

#####                                          95% interval
#####   method           estimate    SD/SE    lower    upper
#####
#####   Bayes, prior 1    0.1779   0.01842   0.1433   0.2154
#####   Bayes, prior 2    0.1816   0.01920   0.1455   0.2207
#####   Bayes, prior 3    0.1808   0.01919   0.1447   0.2199
#####   ML                0.1800   0.01921   0.1424   0.2176

##### all methods essentially agree to two significant figures:
##### theta is about 0.18, give or take about 0.019, and a 95%
##### interval for theta runs from 0.14 to 0.22
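##### the table above could also be built programmatically instead
##### of repeating the same arithmetic three times; this is a sketch
##### (the helper name 'beta.posterior.summary' is made up here,
##### not part of the original case-study code)

beta.posterior.summary <- function( alpha, beta, n, s ) {

  # conjugate updating: Beta( alpha, beta ) prior plus s successes
  # in n trials gives a Beta( alpha + s, beta + n - s ) posterior

  alpha.star <- alpha + s
  beta.star <- beta + n - s

  posterior.mean <- alpha.star / ( alpha.star + beta.star )

  posterior.sd <- sqrt( posterior.mean * ( 1 - posterior.mean ) /
    ( alpha.star + beta.star + 1 ) )

  c( estimate = posterior.mean, SD = posterior.sd,
    lower = qbeta( 0.025, alpha.star, beta.star ),
    upper = qbeta( 0.975, alpha.star, beta.star ) )

}

rbind( prior.1 = beta.posterior.summary( 4.5, 25.5, 400, 72 ),
  prior.2 = beta.posterior.summary( 1, 1, 400, 72 ),
  prior.3 = beta.posterior.summary( 0.5, 0.5, 400, 72 ) )
# reproduces the Bayes rows of the comparison table above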
##### this agreement occurs for two reasons: (1) all the priors
##### are relatively diffuse, and (2) n is large relative to
##### the prior sample size (this is the Bernstein-von Mises
##### theorem in action)

##### later we'll see examples in which the Bayesian approach
##### produces better inferences than ML even with diffuse priors,
##### where 'better' means interval estimates whose actual
##### frequentist coverage is (far) closer to nominal
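##### a small numerical illustration of the Bernstein-von Mises
##### point (hypothetical data, not from the case study: the same
##### observed proportion with ten times the sample size)

n.big <- 4000
s.big <- 720

c( prior.1 = ( 4.5 + s.big ) / ( 4.5 + 25.5 + n.big ),
  prior.2 = ( 1 + s.big ) / ( 1 + 1 + n.big ),
  prior.3 = ( 0.5 + s.big ) / ( 0.5 + 0.5 + n.big ) )
# all three posterior means now agree with 0.18 to about three
# decimal places, so the choice of prior matters even less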