# change directory to where the following files are:

#   large-data-example-treatment.txt
#   large-data-example-control.txt

# unbuffer the output

# the data set here was generated by randomizing
# about 24 million subjects into two groups (treatment
# and control) of about 12 million people each

# the outcome variable y has lots of zeros, and in this
# version of the data the nonzero values have been
# scaled to lie between 0 and 1 for confidentiality reasons

# big on this outcome scale is better than small

# the population quantity of interest is

#           ( treatment population mean - control population mean )
#   theta = -------------------------------------------------------
#                         control population mean

#      Group        n zeros    n nonzeros     n total

#   Treatment      11100587       1133706    12234293
#    Control       11096065       1135435    12231500

n.treatment.zeros <- 11100587
n.treatment.nonzeros <- 1133706

print( n.treatment.total <- n.treatment.zeros + n.treatment.nonzeros )

# 12234293

print( treatment.zero.proportion <- n.treatment.zeros / n.treatment.total )

# 0.9073338

nonzero.treatment.values <- scan( "large-data-example-treatment.txt" )

# Read 1133706 items

n.control.zeros <- 11096065
n.control.nonzeros <- 1135435

print( n.control.total <- n.control.zeros + n.control.nonzeros )

# 12231500

print( control.zero.proportion <- n.control.zeros / n.control.total )

# 0.9071712

print( relative.difference.in.zero.abundance <-
  ( treatment.zero.proportion - control.zero.proportion ) /
    control.zero.proportion )

# 0.0001791461

# the control group had a tiny edge (only about 0.02%)
# in generating fewer zeros

nonzero.control.values <- scan( "large-data-example-control.txt" )

# Read 1135435 items

print( length( full.treatment.sample <- c( rep( 0, n.treatment.zeros ),
  nonzero.treatment.values ) ) )

# 12234293

print( length( full.control.sample <- c( rep( 0, n.control.zeros ),
  nonzero.control.values ) ) )

# 12231500

print( y.bar.treatment <- mean( full.treatment.sample ) )

# 9.98479e-05

print( s.treatment <- sd( full.treatment.sample ) )

# 0.001419089

print( se.treatment.mean <- s.treatment / sqrt( n.treatment.total ) )

# 4.057142e-07

print( y.bar.control <- mean( full.control.sample ) )

# 5.695974e-05

print( s.control <- sd( full.control.sample ) )

# 0.0009145258

print( se.control.mean <- s.control / sqrt( n.control.total ) )

# 2.614906e-07

print( se.for.difference.of.means <-
  sqrt( se.treatment.mean^2 + se.control.mean^2 ) )

# 4.826814e-07

print( theta.hat.mean.based <- ( y.bar.treatment - y.bar.control ) / y.bar.control )

# 0.7529557

# the overall treatment mean (zeros included) is about 75%
# bigger than the overall control mean (a huge difference)

print( se.theta.hat.mean.based <-
  sqrt( ( - y.bar.treatment / y.bar.control^2 )^2 * se.control.mean^2 +
        ( 1 / y.bar.control )^2 * se.treatment.mean^2 ) )

# this is the Delta-method repeated-sampling standard error
# of the sample-mean-based lift estimate

# 0.01074692

theta.hat.mean.based / se.theta.hat.mean.based

# 70.06247 (z-score on the relative-comparison (percentage lift) scale)

print( theta.hat.interval <-
  c( theta.hat.mean.based - qnorm( 1 - 0.05 / 2 ) * se.theta.hat.mean.based,
     theta.hat.mean.based + qnorm( 1 - 0.05 / 2 ) * se.theta.hat.mean.based ) )

# 0.7318921 0.7740193 (95% central-limit-theorem (CLT)-based interval);
# this is the result when the mean is internally calibrated

# this is a nonparametric analysis (no assumptions about the
# control or treatment CDFs); it has assumed that the CLT operates on
# a distribution with about 90% zeros and (as will be seen below)
# an extremely skewed histogram for the nonzero values
# (pro: very big sample sizes; con: violently non-normal)
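# optional cross-check (not part of the original analysis): the
# Delta-method standard error hand-coded above can also be obtained with
# the deltamethod( ) function in the msm package, which propagates the
# two standard errors through the lift transformation
# g( m.t, m.c ) = ( m.t - m.c ) / m.c; this is a minimal sketch, assuming
# msm has been installed (an extra dependency not used elsewhere here)

library( msm )

print( se.theta.hat.delta.check <- deltamethod( ~ ( x1 - x2 ) / x2,
  mean = c( y.bar.treatment, y.bar.control ),
  cov = diag( c( se.treatment.mean^2, se.control.mean^2 ) ) ) )

# this should agree with se.theta.hat.mean.based (about 0.0107)
# up to rounding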
# here's a detailed look at the nonzero treatment values

quantile( nonzero.treatment.values )

#           0%          25%          50%          75%         100% 
# 1.093889e-07 1.257972e-04 3.276197e-04 8.664692e-04 1.000000e+00 

hist( nonzero.treatment.values, breaks = 1000, probability = T,
  main = '', xlab = 'Nonzero Treatment Values' )

# extraordinarily long right-hand tail

log.nonzero.treatment.values <- log( nonzero.treatment.values )

hist( log.nonzero.treatment.values, breaks = 1000, probability = T,
  main = '', xlab = 'Log Nonzero Treatment Values' )

# lots of spikes representing frequently occurring values of the
# outcome variable

# let's take a closer look between -12 and -11

table( log.nonzero.treatment.values[ ( log.nonzero.treatment.values > -12 ) &
  ( log.nonzero.treatment.values < -11 ) ] )

# -11.9853053552893 -11.9679137508367 -11.9508192961058 -11.9340121571302
#                11                13                33                52
# -11.9174828351971 -11.9012222949881 -11.8852219349182 -11.8694735598118
#                17                17                 8                15
# -11.8539694963371  -11.838702005028 -11.8236641090575 -11.8088490052137
#                37                19                10                14

# (some rows omitted)

# -11.2325663903357 -11.2243358045011 -11.2161724085735 -11.2080751144281
#                23                18                26                31
# -11.2000428601607  -11.192074609252  -11.184169349765 -11.1763260935738
#              1125                96                56                46
#  -11.168544584281 -11.1608224564187 -11.1531595031532  -11.145554824468
#               567               183                52                50
# -11.1380075407252 -11.1305167920543  -11.123081737765 -11.1157015557803
#                91               158               272               117
# -11.1083761093688 -11.1011032726742 -11.0938829484484 -11.0867143838068
#               107               118               412                86
# -11.0795968419411 -11.0725296016647 -11.0655119569746 -11.0585432166275
#                35                38                20                60
# -11.0516227037316 -11.0447503814958 -11.0379243440142 -11.0311445855962
#               164                36                44               116
#  -11.024410482941  -11.017721425256 -11.0110768139242 -11.0044760621824
#               686               806                72                76

# this suggests removing the spikes and treating them
# multinomially (their locations are known; all that's unknown
# is their relative frequencies)

# the file "large-data-example-treatment-with-spikes-removed.txt" contains
# the nonzero treatment values with the 500 highest-frequency spikes removed

nonzero.treatment.values.with.spikes.removed <-
  scan( "large-data-example-treatment-with-spikes-removed.txt" )

# Read 611655 items

log.nonzero.treatment.values.with.spikes.removed <-
  log( nonzero.treatment.values.with.spikes.removed )

par( mfrow = c( 2, 1 ) )

hist( log.nonzero.treatment.values.with.spikes.removed, breaks = 1000,
  main = '', xlab = 'Log( Treatment Values [Spikes Removed] )',
  probability = T, ylim = c( 0, 1 ), xlim = c( -15.5, 0 ) )

hist( log.nonzero.treatment.values, breaks = 1000, main = '',
  xlab = 'Log( All Treatment Values )', probability = T,
  xlim = c( -15.5, 0 ), ylim = c( 0, 1 ) )

# the distribution of the log treatment values
# with the spikes removed suggests fitting
# either a single normal distribution or a mixture
# of G >= 2 normal distributions

# the models for this can be set up readily in WinBUGS/rjags,
# but with 611,655 observations the calculations would need
# to be done with parallel processing to achieve realistic
# clock times

# here i'll illustrate a maximum-likelihood approximation
# to the full-bayes calculation, using the R function Mclust

# here you have to (install and) load the mclust package
# before going on

library( mclust )

# the following command estimates the mixture model with G = 2
# components ( modelNames = "V" lets the component variances differ )

log.nonzero.treatment.values.with.spikes.removed.Mclust <-
  Mclust( log.nonzero.treatment.values.with.spikes.removed, G = 2,
    modelNames = "V" )
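# optional check (not part of the original analysis): before settling on
# G = 2, the BIC values for several choices of G can be compared with
# mclustBIC( ) from the same package -- a minimal sketch; the range
# G = 1:4 is an illustrative choice, and mclust reports BIC on a scale
# where larger is better

log.nonzero.treatment.values.with.spikes.removed.BIC <-
  mclustBIC( log.nonzero.treatment.values.with.spikes.removed, G = 1:4,
    modelNames = "V" )

print( log.nonzero.treatment.values.with.spikes.removed.BIC )

# printing the mclustBIC object lists the BIC value for each G and
# the top models according to this criterion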
# here are the parameters of the fitted G = 2 model

log.nonzero.treatment.values.with.spikes.removed.Mclust$parameters

# $Vinv
# NULL

# $pro
# [1] 0.4751107 0.5248893

# $mean
#         1         2 
# -7.748309 -7.279574 

# $variance
# $variance$modelName
# [1] "V"

# $variance$d
# [1] 1

# $variance$G
# [1] 2

# $variance$sigmasq
# [1] 2.875039 1.596238

# $variance$scale
# [1] 2.875039 1.596238

# this specifies a mixture of two normal distributions with
# weights 0.475 and 0.525 on the two mixture components,
# with component 1 having mean -7.748 and variance 2.875
# and component 2 having mean -7.280 and variance 1.596

par( mfrow = c( 1, 1 ) )

hist( log.nonzero.treatment.values.with.spikes.removed, breaks = 100,
  main = '', xlab = 'Log( Treatment Values [Spikes Removed] )',
  probability = T, xlim = c( -15.5, 0 ) )

log.values.grid <- seq( -15.5, 0, length = 500 )

# density of a mixture of G normal distributions with weights p,
# means mu, and standard deviations sigma

mixture.gaussian.density <- function( y, p, mu, sigma ) {

  G <- length( p )

  result <- 0

  for ( i in 1:G ) {

    result <- result + p[ i ] * dnorm( y, mu[ i ], sigma[ i ] )

  }

  return( result )

}

p <- log.nonzero.treatment.values.with.spikes.removed.Mclust$parameters$pro
mu <- log.nonzero.treatment.values.with.spikes.removed.Mclust$parameters$mean
sigma <- sqrt(
  log.nonzero.treatment.values.with.spikes.removed.Mclust$parameters$variance$sigmasq )

lines( log.values.grid,
  mixture.gaussian.density( log.values.grid, p, mu, sigma ),
  lty = 1, col = 'red' )

# the fit with 2 mixture components is terrific

# i conclude that a good parametric model for the treatment values
# in this experiment is a mixture model with 3 components:

#   (1) a large spike at 0 (with probability about 0.907)

#   (2) 500 smaller spikes at known locations with unknown probabilities
#       (this part is modeled multinomially)

#   (3) a mixture of 2 normal distributions, on the log scale, for the
#       data with the spikes removed

# an analogous analysis on the control data yields the
# same basic parametric model in that group

# this analysis can be used to draw parametric inferences about
# theta, either of a maximum-likelihood or bayesian nature
# (the latter would involve significant clock time even with
# a decent parallel cluster of machines)

# another nonparametric alternative: the bootstrap

M <- 1000

mu.star.treatment <- rep( NA, M )
mu.star.control <- rep( NA, M )

for ( i in 1:M ) {

  mu.star.treatment[ i ] <- mean( sample( full.treatment.sample,
    n.treatment.total, replace = T ) )

  mu.star.control[ i ] <- mean( sample( full.control.sample,
    n.control.total, replace = T ) )

  if ( ( i %% 10 ) == 0 ) print( i )

}

theta.star <- ( mu.star.treatment - mu.star.control ) / mu.star.control

hist( theta.star, breaks = 100, probability = T, main = '',
  xlab = 'Bootstrapped Lift Values', ylab = 'Density' )

print( bootstrap.interval <- quantile( theta.star, probs = c( 0.025, 0.975 ) ) )

#      2.5%     97.5% 
# 0.7323679 0.7734107 

# the CLT result was ( 0.7318921, 0.7740193 ); quite close
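# the bootstrap loop above is slow, because each of the M iterations
# resamples roughly 12 million values twice; as an optional refinement
# (not part of the original analysis), the work can be spread across
# cores with the parallel package that ships with R -- a minimal sketch,
# assuming a unix-like machine (mclapply forks, so mc.cores must be 1 on
# Windows); M.parallel = 1000 and mc.cores = 4 are illustrative choices

library( parallel )

M.parallel <- 1000

theta.star.parallel <- unlist( mclapply( 1:M.parallel, function( i ) {

  mu.t <- mean( sample( full.treatment.sample, n.treatment.total,
    replace = T ) )
  mu.c <- mean( sample( full.control.sample, n.control.total,
    replace = T ) )

  ( mu.t - mu.c ) / mu.c

}, mc.cores = 4 ) )

print( quantile( theta.star.parallel, probs = c( 0.025, 0.975 ) ) )

# this should give an interval close to the single-core bootstrap and
# CLT intervals above (bootstrap results vary a bit from run to run)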