# change directory to where the following files are:

#   large-data-example-treatment.txt
#   large-data-example-control.txt

# unbuffer the output

# the data set here was generated by randomizing
# about 24 million subjects into two groups (treatment
# and control) of about 12 million people each

# the outcome variable y has lots of zeros, and in this
# version of the data the nonzero values have been
# scaled to lie between 0 and 1 for confidentiality reasons

# big on this outcome scale is better than small

# the population quantity of interest is

#           ( treatment population mean - control population mean )
#   theta = -------------------------------------------------------
#                         control population mean

#      Group        n zeros    n nonzeros     n total

#   Treatment      11100587       1133706    12234293
#    Control       11096065       1135435    12231500

n.treatment.zeros <- 11100587
n.treatment.nonzeros <- 1133706

print( n.treatment.total <- n.treatment.zeros + n.treatment.nonzeros )

# 12234293

print( treatment.zero.proportion <- n.treatment.zeros / n.treatment.total )

# 0.9073338

nonzero.treatment.values <- scan( "large-data-example-treatment.txt" )

# Read 1133706 items

n.control.zeros <- 11096065
n.control.nonzeros <- 1135435

print( n.control.total <- n.control.zeros + n.control.nonzeros )

# 12231500

print( control.zero.proportion <- n.control.zeros / n.control.total )

# 0.9071712

print( relative.difference.in.zero.abundance <-
  ( treatment.zero.proportion - control.zero.proportion ) /
    control.zero.proportion )

# 0.0001791461

# the control group had a tiny edge (only about 0.02%)
# in generating fewer zeros

nonzero.control.values <- scan( "large-data-example-control.txt" )

# Read 1135435 items

print( length( full.treatment.sample <- c( rep( 0, n.treatment.zeros ),
  nonzero.treatment.values ) ) )

# 12234293

print( length( full.control.sample <- c( rep( 0, n.control.zeros ),
  nonzero.control.values ) ) )

# 12231500

print( y.bar.treatment <- mean( full.treatment.sample ) )

# 9.98479e-05

print( s.treatment <- sd( full.treatment.sample ) )

# 0.001419089

print( se.treatment.mean <- s.treatment / sqrt( n.treatment.total ) )

# 4.057142e-07

print( y.bar.control <- mean( full.control.sample ) )

# 5.695974e-05

print( s.control <- sd( full.control.sample ) )

# 0.0009145258

print( se.control.mean <- s.control / sqrt( n.control.total ) )

# 2.614906e-07

print( se.for.difference.of.means <-
  sqrt( se.treatment.mean^2 + se.control.mean^2 ) )

# 4.826814e-07

print( theta.hat.mean.based <- ( y.bar.treatment - y.bar.control ) / y.bar.control )

# 0.7529557

# the overall treatment mean (zeros included) is about 75%
# bigger than the overall control mean (a huge difference)

print( se.theta.hat.mean.based <-
  sqrt( ( - y.bar.treatment / y.bar.control^2 )^2 * se.control.mean^2 +
        ( 1 / y.bar.control )^2 * se.treatment.mean^2 ) )

# this is the Delta-method repeated-sampling standard error
# of the sample-mean-based lift estimate

# 0.01074692

theta.hat.mean.based / se.theta.hat.mean.based

# 70.06247 (z-score on the relative-comparison (percentage lift) scale)

print( theta.hat.interval <-
  c( theta.hat.mean.based - qnorm( 1 - 0.05 / 2 ) * se.theta.hat.mean.based,
     theta.hat.mean.based + qnorm( 1 - 0.05 / 2 ) * se.theta.hat.mean.based ) )

# 0.7318921 0.7740193 (95% central-limit-theorem (CLT)-based interval);
# this is the result when the mean is internally calibrated

# this is a nonparametric analysis (no assumptions about the
# control or treatment CDFs); it has assumed that the CLT operates on
# a distribution with about 90% zeros and (as will be seen below)
# an extremely skewed histogram for the nonzero values
# (pro: very big sample sizes; con: violently non-normal)
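# optional cross-check (not part of the original analysis): the
# Delta-method standard error hand-coded above can also be obtained with
# the deltamethod( ) function in the msm package, which propagates the
# two standard errors through the lift transformation
# g( m.t, m.c ) = ( m.t - m.c ) / m.c; this is a minimal sketch, assuming
# msm has been installed (an extra dependency not used elsewhere here)

library( msm )

print( se.theta.hat.delta.check <- deltamethod( ~ ( x1 - x2 ) / x2,
  mean = c( y.bar.treatment, y.bar.control ),
  cov = diag( c( se.treatment.mean^2, se.control.mean^2 ) ) ) )

# this should agree with se.theta.hat.mean.based (about 0.0107)
# up to rounding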
# here's a detailed look at the nonzero treatment values

quantile( nonzero.treatment.values )

#           0%          25%          50%          75%         100% 
# 1.093889e-07 1.257972e-04 3.276197e-04 8.664692e-04 1.000000e+00 

hist( nonzero.treatment.values, breaks = 1000, probability = T,
  main = '', xlab = 'Nonzero Treatment Values' )

# extraordinarily long right-hand tail

log.nonzero.treatment.values <- log( nonzero.treatment.values )

hist( log.nonzero.treatment.values, breaks = 1000, probability = T,
  main = '', xlab = 'Log Nonzero Treatment Values' )

# lots of spikes representing frequently occurring values of the
# outcome variable

# let's take a closer look between -12 and -11

table( log.nonzero.treatment.values[ ( log.nonzero.treatment.values > -12 ) &
  ( log.nonzero.treatment.values < -11 ) ] )

# -11.9853053552893 -11.9679137508367 -11.9508192961058 -11.9340121571302
#                11                13                33                52
# -11.9174828351971 -11.9012222949881 -11.8852219349182 -11.8694735598118
#                17                17                 8                15
# -11.8539694963371  -11.838702005028 -11.8236641090575 -11.8088490052137
#                37                19                10                14

# (some rows omitted)

# -11.2325663903357 -11.2243358045011 -11.2161724085735 -11.2080751144281
#                23                18                26                31
# -11.2000428601607  -11.192074609252  -11.184169349765 -11.1763260935738
#              1125                96                56                46
#  -11.168544584281 -11.1608224564187 -11.1531595031532  -11.145554824468
#               567               183                52                50
# -11.1380075407252 -11.1305167920543  -11.123081737765 -11.1157015557803
#                91               158               272               117
# -11.1083761093688 -11.1011032726742 -11.0938829484484 -11.0867143838068
#               107               118               412                86
# -11.0795968419411 -11.0725296016647 -11.0655119569746 -11.0585432166275
#                35                38                20                60
# -11.0516227037316 -11.0447503814958 -11.0379243440142 -11.0311445855962
#               164                36                44               116
#  -11.024410482941  -11.017721425256 -11.0110768139242 -11.0044760621824
#               686               806                72                76

# this suggests removing the spikes and treating them
# multinomially (their locations are known; all that's unknown
# is their relative frequencies)

# the file "large-data-example-treatment-with-spikes-removed.txt" contains
# the nonzero treatment values with the 500 highest-frequency spikes removed

nonzero.treatment.values.with.spikes.removed <-
  scan( "large-data-example-treatment-with-spikes-removed.txt" )

# Read 611655 items

log.nonzero.treatment.values.with.spikes.removed <-
  log( nonzero.treatment.values.with.spikes.removed )

par( mfrow = c( 2, 1 ) )

hist( log.nonzero.treatment.values.with.spikes.removed, breaks = 1000,
  main = '', xlab = 'Log( Treatment Values [Spikes Removed] )',
  probability = T, ylim = c( 0, 1 ), xlim = c( -15.5, 0 ) )

hist( log.nonzero.treatment.values, breaks = 1000, main = '',
  xlab = 'Log( All Treatment Values )', probability = T,
  xlim = c( -15.5, 0 ), ylim = c( 0, 1 ) )

# the distribution of the log treatment values
# with the spikes removed suggests fitting
# either a single normal distribution or a mixture
# of G >= 2 normal distributions

# the models for this can be set up readily in WinBUGS/rjags,
# but with 611,655 observations the calculations would need
# to be done with parallel processing to achieve realistic
# clock times

# here i'll illustrate a maximum-likelihood approximation
# to the full-bayes calculation, using the R function Mclust

# here you have to (install and) load the mclust package
# before going on

library( mclust )

# the following command estimates the mixture model with G = 2
# components ( modelNames = "V" lets the component variances differ )

log.nonzero.treatment.values.with.spikes.removed.Mclust <-
  Mclust( log.nonzero.treatment.values.with.spikes.removed, G = 2,
    modelNames = "V" )
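# optional check (not part of the original analysis): before settling on
# G = 2, the BIC values for several choices of G can be compared with
# mclustBIC( ) from the same package -- a minimal sketch; the range
# G = 1:4 is an illustrative choice, and mclust reports BIC on a scale
# where larger is better

log.nonzero.treatment.values.with.spikes.removed.BIC <-
  mclustBIC( log.nonzero.treatment.values.with.spikes.removed, G = 1:4,
    modelNames = "V" )

print( log.nonzero.treatment.values.with.spikes.removed.BIC )

# printing the mclustBIC object lists the BIC value for each G and
# the top models according to this criterion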
# here are the parameters of the fitted G = 2 model

log.nonzero.treatment.values.with.spikes.removed.Mclust$parameters

# $Vinv
# NULL

# $pro
# [1] 0.4751107 0.5248893

# $mean
#         1         2 
# -7.748309 -7.279574 

# $variance
# $variance$modelName
# [1] "V"

# $variance$d
# [1] 1

# $variance$G
# [1] 2

# $variance$sigmasq
# [1] 2.875039 1.596238

# $variance$scale
# [1] 2.875039 1.596238

# this specifies a mixture of two normal distributions with
# weights 0.475 and 0.525 on the two mixture components,
# with component 1 having mean -7.748 and variance 2.875
# and component 2 having mean -7.280 and variance 1.596

par( mfrow = c( 1, 1 ) )

hist( log.nonzero.treatment.values.with.spikes.removed, breaks = 100,
  main = '', xlab = 'Log( Treatment Values [Spikes Removed] )',
  probability = T, xlim = c( -15.5, 0 ) )

log.values.grid <- seq( -15.5, 0, length = 500 )

# density of a mixture of G normal distributions with weights p,
# means mu, and standard deviations sigma

mixture.gaussian.density <- function( y, p, mu, sigma ) {

  G <- length( p )

  result <- 0

  for ( i in 1:G ) {

    result <- result + p[ i ] * dnorm( y, mu[ i ], sigma[ i ] )

  }

  return( result )

}

p <- log.nonzero.treatment.values.with.spikes.removed.Mclust$parameters$pro
mu <- log.nonzero.treatment.values.with.spikes.removed.Mclust$parameters$mean
sigma <- sqrt(
  log.nonzero.treatment.values.with.spikes.removed.Mclust$parameters$variance$sigmasq )

lines( log.values.grid,
  mixture.gaussian.density( log.values.grid, p, mu, sigma ),
  lty = 1, col = 'red' )

# the fit with 2 mixture components is terrific

# i conclude that a good parametric model for the treatment values
# in this experiment is a mixture model with 3 components:

#   (1) a large spike at 0 (with probability about 0.907)

#   (2) 500 smaller spikes at known locations with unknown probabilities
#       (this part is modeled multinomially)

#   (3) a mixture of 2 normal distributions, on the log scale, for the
#       data with the spikes removed

# an analogous analysis on the control data yields the
# same basic parametric model in that group

# this analysis can be used to draw parametric inferences about
# theta, either of a maximum-likelihood or bayesian nature
# (the latter would involve significant clock time even with
# a decent parallel cluster of machines)

# another nonparametric alternative: the bootstrap

M <- 1000

mu.star.treatment <- rep( NA, M )
mu.star.control <- rep( NA, M )

for ( i in 1:M ) {

  mu.star.treatment[ i ] <- mean( sample( full.treatment.sample,
    n.treatment.total, replace = T ) )

  mu.star.control[ i ] <- mean( sample( full.control.sample,
    n.control.total, replace = T ) )

  if ( ( i %% 10 ) == 0 ) print( i )

}

theta.star <- ( mu.star.treatment - mu.star.control ) / mu.star.control

hist( theta.star, breaks = 100, probability = T, main = '',
  xlab = 'Bootstrapped Lift Values', ylab = 'Density' )

print( bootstrap.interval <- quantile( theta.star, probs = c( 0.025, 0.975 ) ) )

#      2.5%     97.5% 
# 0.7323679 0.7734107 

# the CLT result was ( 0.7318921, 0.7740193 ); quite close
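# the bootstrap loop above is slow, because each of the M iterations
# resamples roughly 12 million values twice; as an optional refinement
# (not part of the original analysis), the work can be spread across
# cores with the parallel package that ships with R -- a minimal sketch,
# assuming a unix-like machine (mclapply forks, so mc.cores must be 1 on
# Windows); M.parallel = 1000 and mc.cores = 4 are illustrative choices

library( parallel )

M.parallel <- 1000

theta.star.parallel <- unlist( mclapply( 1:M.parallel, function( i ) {

  mu.t <- mean( sample( full.treatment.sample, n.treatment.total,
    replace = T ) )
  mu.c <- mean( sample( full.control.sample, n.control.total,
    replace = T ) )

  ( mu.t - mu.c ) / mu.c

}, mc.cores = 4 ) )

print( quantile( theta.star.parallel, probs = c( 0.025, 0.975 ) ) )

# this should give an interval close to the single-core bootstrap and
# CLT intervals above (bootstrap results vary a bit from run to run)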