# unbuffer the output rm( list = ls( ) ) library( moments ) seed <- 1 set.seed( seed ) # set the working directory to the appropriate place setwd( '' ) nonzero.T.values <- scan( 'pgmb-raw-5108-treatment-v1.txt' ) # Read 1133706 items n.T.zero <- 11100587 print( n.T.nonzero <- length( nonzero.T.values ) ) # 1133706 print( n.T <- n.T.zero + n.T.nonzero ) # 12234293 nonzero.C.values <- scan( 'pgmb-raw-5108-control-v1.txt' ) # Read 1135435 items n.C.zero <- 11096065 print( n.C.nonzero <- length( nonzero.C.values ) ) # 1135435 print( n.C <- n.C.zero + n.C.nonzero ) # 12231500 print( p.hat.0.T <- n.T.zero / n.T ) # 0.9073338 print( n.0.T.star <- rbinom( 1, n.T, p.hat.0.T ) ) # 11100633 print( mu.T.star <- sum( sample( nonzero.T.values, n.T - n.0.T.star, replace = T ) ) / n.T ) # 9.108956 library( doParallel ) n.processors <- makeCluster( 8 ) registerDoParallel( n.processors ) parallel.mean.bootstrap <- function( x, M, n.T, p.hat.0.T ) { foreach( i = 1:M, .inorder = F, .multicombine = T, .combine = 'c' ) %dopar% { sum( sample( x, n.T - rbinom( 1, n.T, p.hat.0.T ), replace = T ) ) / n.T } } M.b <- 10000 system.time( mu.T.star.1 <- parallel.mean.bootstrap( nonzero.T.values, M.b, n.T, p.hat.0.T ) ) # on laptop with 4 cores and 8 threads, and resource monitors on # user system elapsed 6.98 0.72 119.28 # on home desktop with 4 cores and 8 threads # user system elapsed # 5.85 0.47 104.82 # on ebay workstation with 12 cores and 24 threads # user system elapsed # 10.03 4.93 65.67 # after initialization, all 8 threads ran at 100% cpu capacity, # using 3.98 GB memory with 32 GB available) c( mean( mu.T.star.1 ), sd( mu.T.star.1 ), skewness( mu.T.star.1 ), kurtosis( mu.T.star.1 ) - 3 ) # 9.12793089 0.03670651 0.07031938 -0.09545653 # on ebay workstation with 12 cores and 24 threads # 9.12759890 0.03713947 0.05379655 0.01791277 qqnorm( mu.T.star.1 ) abline( mean( mu.T.star.1 ), sd( mu.T.star.1 ) ) M.b <- 10000 set.seed( 2 ) system.time( mu.T.star.2 <- parallel.mean.bootstrap( nonzero.T.values, M.b, n.T, p.hat.0.T ) ) # on laptop with 4 cores and 8 threads, and resource monitors on # user system elapsed # 6.68 0.64 118.41 # on home desktop with 4 cores and 8 threads # user system elapsed # 65.95 4.45 1049.81 # second time # user system elapsed # 92.82 4.49 1077.34 # on ebay workstation with 12 cores and 24 threads # user system elapsed # 140.00 40.34 694.97 c( mean( mu.T.star.2 ), sd( mu.T.star.2 ), skewness( mu.T.star.2 ), kurtosis( mu.T.star.2 ) - 3 ) # 9.1277984478 0.0370740579 0.0413938322 0.0009448229 # on ebay workstation with 12 cores and 24 threads # 9.12760604 0.03708640 0.04856156 0.00870698 M.b <- 10000 set.seed( 3 ) print( p.hat.0.C <- n.C.zero / n.C ) system.time( mu.C.star.1 <- parallel.mean.bootstrap( nonzero.C.values, M.b, n.C, p.hat.0.C ) ) # user system elapsed # 8.74 0.41 114.64 c( mean( mu.C.star.1 ), sd( mu.C.star.1 ), skewness( mu.C.star.1 ), kurtosis( mu.C.star.1 ) - 3 ) # 9.20306533 0.04240193 0.04627489 0.01990905 M.b <- 10000 set.seed( 4 ) system.time( mu.C.star.2 <- parallel.mean.bootstrap( nonzero.C.values, M.b, n.C, p.hat.0.C ) ) # user system elapsed # 93.91 4.60 1076.14 c( mean( mu.C.star.2 ), sd( mu.C.star.2 ), skewness( mu.C.star.2 ), kurtosis( mu.C.star.2 ) - 3 ) # 9.20307313 0.04235194 0.08615757 0.05813485 qqnorm( mu.C.star.2, main = 'mu.C.star' ) abline( mean( mu.C.star.2 ), sd( mu.C.star.2 ), lwd = 2, col = 'red' ) hist( mu.C.star.2, breaks = 1000, main = '', prob = T, xlab = 'mu.C.star' ) mu.C.star.grid <- seq( min( mu.C.star.2 ), max( mu.C.star.2 ), length = 500 ) lines( mu.C.star.grid, dnorm( mu.C.star.grid, mean( mu.C.star.2 ), sd( mu.C.star.2 ) ), lty = 1, lwd = 2, col = 'red' ) theta.star.2 <- ( mu.T.star.2 - mu.C.star.2 ) / mu.C.star.2 qqnorm( theta.star.2, main = 'theta' ) abline( mean( theta.star.2 ), sd( theta.star.2 ), lwd = 2, col = 'red' ) hist( theta.star.2, breaks = 1000, main = '', prob = T, xlab = 'theta' ) theta.star.grid <- seq( min( theta.star.2 ), max( theta.star.2 ), length = 500 ) lines( theta.star.grid, dnorm( theta.star.grid, mean( theta.star.2 ), sd( theta.star.2 ) ), lty = 1, lwd = 2, col = 'red' ) sum( theta.star.2 > 0 ) / length( theta.star.2 ) # 0.09011 1 - sum( theta.star.2 > 0 ) / length( theta.star.2 ) # 0.90989 1 - pnorm( ( 0 - mean( theta.star.2 ) ) / sd( theta.star.2 ) ) # 0.09039402 pnorm( ( 0 - mean( theta.star.2 ) ) / sd( theta.star.2 ) ) # 0.909606 # pdf( 'figure-3.pdf' ) par( mfrow = c( 2, 2 ) ) qqnorm( mu.C.star.2, main = 'mu.C.star' ) abline( mean( mu.C.star.2 ), sd( mu.C.star.2 ), lwd = 2, col = 'red' ) hist( mu.C.star.2, breaks = 1000, main = '', prob = T, xlab = 'mu.C.star' ) mu.C.star.grid <- seq( min( mu.C.star.2 ), max( mu.C.star.2 ), length = 500 ) lines( mu.C.star.grid, dnorm( mu.C.star.grid, mean( mu.C.star.2 ), sd( mu.C.star.2 ) ), lty = 1, lwd = 2, col = 'red' ) qqnorm( theta.star.2, main = 'theta' ) abline( mean( theta.star.2 ), sd( theta.star.2 ), lwd = 2, col = 'red' ) hist( theta.star.2, breaks = 1000, main = '', prob = T, xlab = 'theta' ) theta.star.grid <- seq( min( theta.star.2 ), max( theta.star.2 ), length = 500 ) lines( theta.star.grid, dnorm( theta.star.grid, mean( theta.star.2 ), sd( theta.star.2 ) ), lty = 1, lwd = 2, col = 'red' ) segments( 0, 0, 0, 45, lwd = 2, col = 'blue' ) # dev.off( ) y.T <- c( rep( 0, n.T.zero ), nonzero.T.values ) y.bar.T <- mean( y.T ) y.C <- c( rep( 0, n.C.zero ), nonzero.C.values ) y.bar.C <- mean( y.C ) print( theta.hat <- ( y.bar.T - y.bar.C ) / y.bar.C ) # -0.00818323 s.C <- sd( y.C ) s.T <- sd( y.T ) print( se.theta.hat <- sqrt( ( y.bar.T^2 * s.C^2 ) / ( y.bar.C^4 * n.C ) + ( s.T^2 ) / ( y.bar.C^2 * n.T ) ) ) # 0.006080573 print( se.theta.hat.2 <- sqrt( ( 1 / y.bar.C^2 ) * ( s.C^2 / n.C + s.T^2 / n.T ) ) ) # 0.006108755 pnorm( theta.hat / se.theta.hat ) # 0.08918365 1 - pnorm( theta.hat / se.theta.hat ) # 0.9108164 pnorm( theta.hat / se.theta.hat.2 ) # 0.09018926 1 - pnorm( theta.hat / se.theta.hat.2 ) # 0.9098107