rm( list = ls( ) ) # set working directory to the appropriate place setwd( '' ) y.T.nonzero <- scan( 'pgmb-raw-5108-treatment-v1.txt' ) # Read 1133706 items y.C.nonzero <- scan( 'pgmb-raw-5108-control-v1.txt' ) # Read 1135435 items n.zero.T.values <- 11100587 n.zero.C.values <- 11096065 n.nonzero.T.values <- length( y.T.nonzero ) n.nonzero.C.values <- length( y.C.nonzero ) print( n.T <- n.zero.T.values + n.nonzero.T.values ) # 12234293 print( n.C <- n.zero.C.values + n.nonzero.C.values ) # 12231500 print( p.hat.T <- n.zero.T.values / n.T ) # 0.9073338 print( p.hat.C <- n.zero.C.values / n.C ) # 0.9071712 c( mean( y.T.nonzero ), sd( y.T.nonzero ) ) # 98.50182 415.70605 c( mean( y.C.nonzero ), sd( y.C.nonzero ) ) # 99.14066 475.69558 mean( c( y.T.nonzero, y.C.nonzero ) ) # 98.82148 sd( c( y.T.nonzero, y.C.nonzero ) ) # 446.7318 length( y.T <- c( rep( 0, n.zero.T.values ), y.T.nonzero ) ) # 12234293 c( mean( y.T ), sd( y.T ) ) # 9.127794 129.728831 length( y.C <- c( rep( 0, n.zero.C.values ), y.C.nonzero ) ) # 12231500 c( mean( y.C ), sd( y.C ) ) # 9.203105 147.761848 length( y <- c( y.T, y.C ) ) # 24465793 c( mean( y ), sd( y ) ) # 9.165445 139.036979 y.bar.T <- mean( y.T ) y.bar.C <- mean( y.C ) print( theta.hat <- ( y.bar.T - y.bar.C ) / y.bar.C ) # -0.00818323 quantile( y.T.nonzero ) # 0% 25% 50% 75% 100% # 0.01 11.50 29.95 79.21 91416.98 quantile( y.C.nonzero ) # 0% 25% 50% 75% 100% # 0.01 11.54 29.94 79.00 161572.10 # pdf( 'figure-1.pdf' ) par( mfrow = c( 2, 1 ) ) hist( y.T.nonzero, breaks = 100000, prob = T, xlab = 'Non-Zero GMVB', main = 'Treatment', xlim = c( 0, 162000 ), ylim = c( 0, 0.035 ) ) hist( y.C.nonzero, breaks = 100000, prob = T, xlab = 'Non-Zero GMVB', main = 'Control', xlim = c( 0, 162000 ), ylim = c( 0, 0.035 ) ) # dev.off( ) log.y.T.nonzero <- log( y.T.nonzero ) log.y.C.nonzero <- log( y.C.nonzero ) quantile( log.y.T.nonzero ) # 0% 25% 50% 75% 100% # -4.605170 2.442347 3.399529 4.372103 11.423187 quantile( log.y.C.nonzero ) # 0% 25% 50% 75% 100% # -4.605170 2.445819 3.399195 4.369448 11.992707 # pdf( 'figure-2.pdf' ) par( mfrow = c( 2, 1 ) ) hist( log.y.T.nonzero, breaks = 100000, prob = T, xlab = 'Log( Non-Zero GMVB )', main = 'Treatment', xlim = c( -4.61, 12.0 ), ylim = c( 0, 70 ) ) hist( log.y.C.nonzero, breaks = 100000, prob = T, xlab = 'Log( Non-Zero GMVB )', main = 'Control', xlim = c( -4.61, 12.0 ), ylim = c( 0, 70 ) ) dev.off( ) table( y.T.nonzero[ ( y.T.nonzero >= 9 ) & ( y.T.nonzero <= 11 ) ] ) # shows that the enormous spike is at $9.99 library( moments ) kurtosis( rnorm( 1000 ) ) # 2.976983 c( skewness( y.T.nonzero ), kurtosis( y.T.nonzero ) - 3 ) # 51.04047 6016.96940 c( skewness( y.C.nonzero ), kurtosis( y.C.nonzero ) - 3 ) # 105.6247 26658.1007 c( skewness( y.T ), kurtosis( y.T ) - 3 ) # 157.6237 59247.1719 c( skewness( y.C ), kurtosis( y.C ) - 3 ) # 328.9281 266639.8269 c( skewness( y ), kurtosis( y ) - 3 ) # 261.413 192503.853 c( skewness( c( y.T.nonzero, y.C.nonzero ) ), kurtosis( c( y.T.nonzero, y.C.nonzero ) ) - 3 ) # 84.3619 19404.0676 sd( y.T ) / mean( y.T ) # 14.21251 sd( y.T.nonzero ) / mean( y.T.nonzero ) # 4.220288 sd( y.C ) / mean( y.C ) # 16.05565 sd( y.C.nonzero ) / mean( y.C.nonzero ) # 4.798188 sd( y ) / mean( y ) # 15.16969 sd( c( y.T.nonzero, y.C.nonzero ) ) / mean( c( y.T.nonzero, y.C.nonzero ) ) # 4.520594 s.C <- sd( y.C ) s.T <- sd( y.T ) print( se.theta.hat <- sqrt( y.bar.T^2 * s.C^2 / ( y.bar.C^4 * n.C ) + s.T^2 / ( y.bar.C^2 * n.T ) ) ) # 0.006080573 print( se.theta.hat.2 <- sqrt( ( 1 / y.bar.C^2 ) * ( s.C^2 / n.C + s.T^2 / n.T ) ) ) # 0.006108755 se.theta.hat / se.theta.hat.2 # 0.9953866 print( se.theta.hat.another.way <- sqrt( ( 1 / y.bar.C^2 ) * ( ( theta.hat + 1 )^2 * s.C^2 / n.C + s.T^2 / n.T ) ) ) # 0.006080573