# this code samples from a DP prior with c = cc, # centered at the standard lognormal distribution F, # truncating the infinite sum in the stick-breaking # algorithm to m terms # note: what's called c here is often instead called # alpha in the notation DP( alpha F ) or DP( alpha, F ) set.seed( 5446663333 ) rdir.ln <- function( m, cc ) { theta <- rlnorm( m, 0.0, 1.0 ) v <- rep( 0, m ) w <- rbeta( m, 1.0, cc ) v[1] <- w[1] for ( j in 2:m ) { v[j] <- w[j] * v[j-1] * ( 1.0 - w[j - 1] ) / w[j - 1] } print( sum( v ) ) temp <- cbind( v, theta ) return( temp[order(temp[, 2], temp[, 1]), 1:2] ) } # this function samples from a DP posterior arrived at # by updating a data vector y, starting with a DP prior # with c = cc, centered at the standard lognormal # distribution rdir.Fstar <- function( m, cc, y ) { n <- length( y ) theta <- rep( 0, m ) for ( i in 1:m ) { U <- runif( 1 ) S <- U < ( cc / ( cc + n ) ) if ( S ) theta[i] <- rlnorm( 1, 0.0, 1.0 ) else theta[i] <- sample( y, 1 ) } v <- rep( 0, m ) w <- rbeta( m, 1.0, cc ) v[1] <- w[1] for ( j in 2:m ) { v[j] <- w[j] * v[j-1] * ( 1.0 - w[j - 1] ) / w[j - 1] } print( sum( v ) ) temp <- cbind( v, theta ) return( temp[order(temp[, 2], temp[, 1]), 1:2] ) } # this code plots the output of rdir.ln with 100 # simulation replications and c = 1, on the log scale # to compare with the standard normal density, # and with m = 100 terms in the stick-breaking sums grid <- log(seq(0,25,length=1000)) plot(grid,dnorm(grid),type='l',lwd=2,ylim=c(0,1.5), xlim=c(-4,4),xlab='log(y)',ylab='Density') for ( i in 1:50) { temp <- rdir.ln(100,1) data <- rep( temp[,2], round(10000*temp[,1]) ) temp <- density(log(data),width=(max(data)-min(data))/8) lines(temp,lty=2) } # the numerical output is the stick-breaking sums with # the value of m used; if they're all close to 1, that # value of m is big enough (if not, m needs to be # increased) # you can see that, with such a small c value, the # prior is not being guided much on any single draw # by the centering distribution (but, believe it or not, # if you averaged all of these densities they would # average out to N( 0, 1 ) on the log scale) # this code repeats with c = 5 plot(grid,dnorm(grid),type='l',lwd=2,ylim=c(0,1.5), xlim=c(-4,4),xlab='log(y)',ylab='Density') for ( i in 1:50) { temp <- rdir.ln(100,5) data <- rep( temp[,2], round(10000*temp[,1]) ) temp <- density(log(data),width=(max(data)-min(data))/8) lines(temp,lty=2) } # the draws from the prior are now being gently # pulled toward the centering distribution # this code repeats with c = 20 plot(grid,dnorm(grid),type='l',lwd=2,ylim=c(0,1.5), xlim=c(-4,4),xlab='log(y)',ylab='Density') for ( i in 1:50) { temp <- rdir.ln(100,20) data <- rep( temp[,2], round(10000*temp[,1]) ) temp <- density(log(data),width=(max(data)-min(data))/8) lines(temp,lty=2) } # now the influence of the centering distribution # is strong # this code repeats with c = 100 plot(grid,dnorm(grid),type='l',lwd=2,ylim=c(0,1.5), xlim=c(-4,4),xlab='log(y)',ylab='Density') for ( i in 1:50) { temp <- rdir.ln(100,100) data <- rep( temp[,2], round(10000*temp[,1]) ) temp <- density(log(data),width=(max(data)-min(data))/8) lines(temp,lty=2) } # you can now see that i need a bigger value of m # to get accurate results, because the cumulative sums # are only on the order of 0.6-0.7; i'll try m = 500 plot(grid,dnorm(grid),type='l',lwd=2,ylim=c(0,1.5), xlim=c(-4,4),xlab='log(y)',ylab='Density') for ( i in 1:50) { temp <- rdir.ln(500,100) data <- rep( temp[,2], round(10000*temp[,1]) ) temp <- density(log(data),width=(max(data)-min(data))/8) lines(temp,lty=2) } # this is almost (but not quite) like putting point # mass on the centering distribution (i.e., fitting a # Bayesian parametric model in which the data were # regarded as IID from the LN( 0, 1 ) distribution, # which is what would happen in the limit as c -> infinity # this code generates a data set with n = 100 and a long # right-hand tail, even on the log scale y = rlnorm( 100 ) + 1 grid <- log(seq(0,25,length=1000)) hist( log( y ), breaks = 10, probability = T, xlim = c( -4, 4 ), col = 'blue', lwd = 3, main = '' ) # this code samples from the DP posterior obtained # by (a) starting with a DP prior centered at the standard # lognormal distribution and with c = 1 and (b) updating # that prior in light of the data set y, with m = 150 # terms in the stick-breaking sums lines(grid,dnorm(grid),lty=1,lwd=2,ylim=c(0,1.5), xlim=c(-4,4),xlab='log(y)',ylab='Density') for ( i in 1:50) { temp <- rdir.Fstar(150,1,y) data <- rep( temp[,2], round(10000*temp[,1]) ) temp <- density(log(data),width=(max(data)-min(data))/8) lines(temp,lty=2) } # with c = 1 almost all of the posterior draws are being # strongly guided by the data but have a lot of # variability (because n is only 100) # this code repeats with the same data set and c = 50 hist( log( y ), breaks = 10, probability = T, xlim = c( -4, 4 ), col = 'blue', lwd = 3, main = '' ) lines(grid,dnorm(grid),lty=1,lwd=2,ylim=c(0,1.5), xlim=c(-4,4),xlab='log(y)',ylab='Density') for ( i in 1:50) { temp <- rdir.Fstar(150,50,y) data <- rep( temp[,2], round(10000*temp[,1]) ) temp <- density(log(data),width=(max(data)-min(data))/8) lines(temp,lty=2) } # here i again need a bigger value of m; i'll try 250 hist( log( y ), breaks = 10, probability = T, xlim = c( -4, 4 ), col = 'blue', lwd = 3, main = '' ) lines(grid,dnorm(grid),lty=1,lwd=2,ylim=c(0,1.5), xlim=c(-4,4),xlab='log(y)',ylab='Density') for ( i in 1:50) { temp <- rdir.Fstar(250,50,y) data <- rep( temp[,2], round(10000*temp[,1]) ) temp <- density(log(data),width=(max(data)-min(data))/8) lines(temp,lty=2) } # now the posterior draws are starting to be a compromise # between the prior centering distribution and the data # this code repeats with c = 250 hist( log( y ), breaks = 10, probability = T, xlim = c( -4, 4 ), col = 'blue', lwd = 3, main = '' ) lines(grid,dnorm(grid),lty=1,lwd=2,ylim=c(0,1.5), xlim=c(-4,4),xlab='log(y)',ylab='Density') for ( i in 1:50) { temp <- rdir.Fstar(250,250,y) data <- rep( temp[,2], round(10000*temp[,1]) ) temp <- density(log(data),width=(max(data)-min(data))/8) lines(temp,lty=2) } # again i need a bigger value of m; i'll try 500 hist( log( y ), breaks = 10, probability = T, xlim = c( -4, 4 ), col = 'blue', lwd = 3, main = '' ) lines(grid,dnorm(grid),lty=1,lwd=2,ylim=c(0,1.5), xlim=c(-4,4),xlab='log(y)',ylab='Density') for ( i in 1:50) { temp <- rdir.Fstar(500,250,y) data <- rep( temp[,2], round(10000*temp[,1]) ) temp <- density(log(data),width=(max(data)-min(data))/8) lines(temp,lty=2) } # still not big enough; i'll try m = 5000 hist( log( y ), breaks = 10, probability = T, xlim = c( -4, 4 ), col = 'blue', lwd = 3, main = '' ) lines(grid,dnorm(grid),lty=1,lwd=2,ylim=c(0,1.5), xlim=c(-4,4),xlab='log(y)',ylab='Density') for ( i in 1:50) { temp <- rdir.Fstar(5000,250,y) data <- rep( temp[,2], round(10000*temp[,1]) ) temp <- density(log(data),width=(max(data)-min(data))/8) lines(temp,lty=2) } # now the posterior draws are largely ignoring the data and # coming mostly from the prior centering distribution # in the limit as c -> infinity the posterior will # completely ignore the data and draw directly from # the prior centering distribution