# this code samples from a DP prior with c = cc,
# centered at the standard lognormal distribution F,
# truncating the infinite sum in the stick-breaking
# algorithm to m terms

# note: what's called c here is often instead called
# alpha in the notation DP( alpha F ) or DP( alpha, F )

set.seed( 5446663333 )

rdir.ln <- function( m, cc ) {

  theta <- rlnorm( m, 0.0, 1.0 )

  v <- rep( 0, m )

  w <- rbeta( m, 1.0, cc )

  v[1] <- w[1]

  for ( j in 2:m ) {

    v[j] <- w[j] * v[j-1] * ( 1.0 - w[j - 1] ) / w[j - 1]

  }
    
  print( sum( v ) )

  temp <- cbind( v, theta )

  return( temp[order(temp[, 2], temp[, 1]), 1:2] )

}

# this function samples from a DP posterior arrived at
# by updating a data vector y, starting with a DP prior 
# with c = cc, centered at the standard lognormal 
# distribution

rdir.Fstar <- function( m, cc, y ) {

  n <- length( y )

  theta <- rep( 0, m )

  for ( i in 1:m ) {

    U <- runif( 1 ) 

    S <- U < ( cc / ( cc + n ) )

    if ( S ) theta[i] <- rlnorm( 1, 0.0, 1.0 )

    else theta[i] <- sample( y, 1 )

  }

  v <- rep( 0, m )

  w <- rbeta( m, 1.0, cc )

  v[1] <- w[1]

  for ( j in 2:m ) {

    v[j] <- w[j] * v[j-1] * ( 1.0 - w[j - 1] ) / w[j - 1]

  }

  print( sum( v ) )

  temp <- cbind( v, theta )

  return( temp[order(temp[, 2], temp[, 1]), 1:2] )

}

# this code plots the output of rdir.ln with 100
# simulation replications and c = 1, on the log scale
# to compare with the standard normal density,
# and with m = 100 terms in the stick-breaking sums

grid <- log(seq(0,25,length=1000))

plot(grid,dnorm(grid),type='l',lwd=2,ylim=c(0,1.5),
  xlim=c(-4,4),xlab='log(y)',ylab='Density')

for ( i in 1:50) {

  temp <- rdir.ln(100,1)

  data <- rep( temp[,2], round(10000*temp[,1]) )

  temp <- density(log(data),width=(max(data)-min(data))/8)

  lines(temp,lty=2)

}

# the numerical output is the stick-breaking sums with
# the value of m used; if they're all close to 1, that
# value of m is big enough (if not, m needs to be
# increased)

# you can see that, with such a small c value, the
# prior is not being guided much on any single draw
# by the centering distribution (but, believe it or not,
# if you averaged all of these densities they would
# average out to N( 0, 1 ) on the log scale)

# this code repeats with c = 5

plot(grid,dnorm(grid),type='l',lwd=2,ylim=c(0,1.5),
  xlim=c(-4,4),xlab='log(y)',ylab='Density')

for ( i in 1:50) {

  temp <- rdir.ln(100,5)

  data <- rep( temp[,2], round(10000*temp[,1]) )

  temp <- density(log(data),width=(max(data)-min(data))/8)

  lines(temp,lty=2)

}

# the draws from the prior are now being gently
# pulled toward the centering distribution

# this code repeats with c = 20

plot(grid,dnorm(grid),type='l',lwd=2,ylim=c(0,1.5),
  xlim=c(-4,4),xlab='log(y)',ylab='Density')

for ( i in 1:50) {

  temp <- rdir.ln(100,20)

  data <- rep( temp[,2], round(10000*temp[,1]) )

  temp <- density(log(data),width=(max(data)-min(data))/8)

  lines(temp,lty=2)

}

# now the influence of the centering distribution
# is strong

# this code repeats with c = 100

plot(grid,dnorm(grid),type='l',lwd=2,ylim=c(0,1.5),
  xlim=c(-4,4),xlab='log(y)',ylab='Density')

for ( i in 1:50) {

  temp <- rdir.ln(100,100)

  data <- rep( temp[,2], round(10000*temp[,1]) )

  temp <- density(log(data),width=(max(data)-min(data))/8)

  lines(temp,lty=2)

}

# you can now see that i need a bigger value of m
# to get accurate results, because the cumulative sums
# are only on the order of 0.6-0.7; i'll try m = 500

plot(grid,dnorm(grid),type='l',lwd=2,ylim=c(0,1.5),
  xlim=c(-4,4),xlab='log(y)',ylab='Density')

for ( i in 1:50) {

  temp <- rdir.ln(500,100)

  data <- rep( temp[,2], round(10000*temp[,1]) )

  temp <- density(log(data),width=(max(data)-min(data))/8)

  lines(temp,lty=2)

}

# this is almost (but not quite) like putting point
# mass on the centering distribution (i.e., fitting a
# Bayesian parametric model in which the data were
# regarded as IID from the LN( 0, 1 ) distribution, 
# which is what would happen in the limit as c -> infinity

# this code generates a data set with n = 100 and a long
# right-hand tail, even on the log scale

y = rlnorm( 100 ) + 1

grid <- log(seq(0,25,length=1000))

hist( log( y ), breaks = 10, probability = T,
  xlim = c( -4, 4 ), col = 'blue', lwd = 3, main = '' )

# this code samples from the DP posterior obtained
# by (a) starting with a DP prior centered at the standard
# lognormal distribution and with c = 1 and (b) updating
# that prior in light of the data set y, with m = 150
# terms in the stick-breaking sums

lines(grid,dnorm(grid),lty=1,lwd=2,ylim=c(0,1.5),
  xlim=c(-4,4),xlab='log(y)',ylab='Density')

for ( i in 1:50) {

  temp <- rdir.Fstar(150,1,y)

  data <- rep( temp[,2], round(10000*temp[,1]) )

  temp <- density(log(data),width=(max(data)-min(data))/8)

  lines(temp,lty=2)

}

# with c = 1 almost all of the posterior draws are being
# strongly guided by the data but have a lot of
# variability (because n is only 100)

# this code repeats with the same data set and c = 50

hist( log( y ), breaks = 10, probability = T,
  xlim = c( -4, 4 ), col = 'blue', lwd = 3, main = '' )

lines(grid,dnorm(grid),lty=1,lwd=2,ylim=c(0,1.5),
  xlim=c(-4,4),xlab='log(y)',ylab='Density')

for ( i in 1:50) {

  temp <- rdir.Fstar(150,50,y)

  data <- rep( temp[,2], round(10000*temp[,1]) )

  temp <- density(log(data),width=(max(data)-min(data))/8)

  lines(temp,lty=2)

}

# here i again need a bigger value of m; i'll try 250

hist( log( y ), breaks = 10, probability = T,
  xlim = c( -4, 4 ), col = 'blue', lwd = 3, main = '' )

lines(grid,dnorm(grid),lty=1,lwd=2,ylim=c(0,1.5),
  xlim=c(-4,4),xlab='log(y)',ylab='Density')

for ( i in 1:50) {

  temp <- rdir.Fstar(250,50,y)

  data <- rep( temp[,2], round(10000*temp[,1]) )

  temp <- density(log(data),width=(max(data)-min(data))/8)

  lines(temp,lty=2)

}

# now the posterior draws are starting to be a compromise
# between the prior centering distribution and the data

# this code repeats with c = 250

hist( log( y ), breaks = 10, probability = T,
  xlim = c( -4, 4 ), col = 'blue', lwd = 3, main = '' )

lines(grid,dnorm(grid),lty=1,lwd=2,ylim=c(0,1.5),
  xlim=c(-4,4),xlab='log(y)',ylab='Density')

for ( i in 1:50) {

  temp <- rdir.Fstar(250,250,y)

  data <- rep( temp[,2], round(10000*temp[,1]) )

  temp <- density(log(data),width=(max(data)-min(data))/8)

  lines(temp,lty=2)

}

# again i need a bigger value of m; i'll try 500

hist( log( y ), breaks = 10, probability = T,
  xlim = c( -4, 4 ), col = 'blue', lwd = 3, main = '' )

lines(grid,dnorm(grid),lty=1,lwd=2,ylim=c(0,1.5),
  xlim=c(-4,4),xlab='log(y)',ylab='Density')

for ( i in 1:50) {

  temp <- rdir.Fstar(500,250,y)

  data <- rep( temp[,2], round(10000*temp[,1]) )

  temp <- density(log(data),width=(max(data)-min(data))/8)

  lines(temp,lty=2)

}

# still not big enough; i'll try m = 5000

hist( log( y ), breaks = 10, probability = T,
  xlim = c( -4, 4 ), col = 'blue', lwd = 3, main = '' )

lines(grid,dnorm(grid),lty=1,lwd=2,ylim=c(0,1.5),
  xlim=c(-4,4),xlab='log(y)',ylab='Density')

for ( i in 1:50) {

  temp <- rdir.Fstar(5000,250,y)

  data <- rep( temp[,2], round(10000*temp[,1]) )

  temp <- density(log(data),width=(max(data)-min(data))/8)

  lines(temp,lty=2)

}

# now the posterior draws are largely ignoring the data and
# coming mostly from the prior centering distribution

# in the limit as c -> infinity the posterior will
# completely ignore the data and draw directly from
# the prior centering distribution