Copy-paste Python code to calculate statistical significance for your A/B tests. Includes p-value calculation, confidence intervals, sample size estimation, and a complete analysis function. Uses scipy and numpy—no external A/B testing libraries needed.
Who this is for
- Data analysts who need to validate A/B test results
- Engineers building internal experimentation tools
- Anyone who wants to understand the math behind A/B testing
Prerequisites
Install required packages:
pip install scipy numpy

Complete A/B Test Analysis Script
Copy this entire script. It includes all functions you need for A/B test analysis.
"""
A/B Test Statistical Significance Calculator
Copy-paste ready Python script for analyzing A/B test results.
Author: ExperimentHQ (experimenthq.io)
License: MIT
"""
from dataclasses import dataclass
from typing import Any, Dict, Optional, Tuple

import numpy as np
from scipy import stats
@dataclass
class ABTestResult:
    """Container for A/B test analysis results."""
    control_rate: float    # observed control conversion rate (conversions / visitors)
    variant_rate: float    # observed variant conversion rate (conversions / visitors)
    relative_lift: float   # (variant_rate - control_rate) / control_rate; 0 when control_rate is 0
    absolute_lift: float   # variant_rate - control_rate
    p_value: float         # two-tailed p-value from the two-proportion z-test
    confidence_level: float  # requested confidence level, e.g. 0.95 for 95%
    is_significant: bool   # True when p_value < 1 - confidence_level
    confidence_interval: Tuple[float, float]  # CI bounds for the absolute lift
    sample_size_control: int  # visitors in the control group
    sample_size_variant: int  # visitors in the variant group
    power: float           # post-hoc statistical power at the observed effect
def calculate_significance(
    control_visitors: int,
    control_conversions: int,
    variant_visitors: int,
    variant_conversions: int,
    confidence_level: float = 0.95
) -> ABTestResult:
    """
    Calculate statistical significance for an A/B test.

    Uses a pooled two-proportion z-test for the p-value, an unpooled
    normal-approximation confidence interval for the rate difference,
    and a post-hoc power estimate at the observed effect size.

    Args:
        control_visitors: Number of visitors in control group (must be > 0)
        control_conversions: Number of conversions in control group
        variant_visitors: Number of visitors in variant group (must be > 0)
        variant_conversions: Number of conversions in variant group
        confidence_level: Desired confidence level (default 0.95 for 95%)

    Returns:
        ABTestResult with all calculated metrics

    Raises:
        ValueError: If visitor counts are not positive, a conversion count
            falls outside [0, visitors], or confidence_level is not
            strictly between 0 and 1.

    Example:
        >>> result = calculate_significance(
        ...     control_visitors=1000,
        ...     control_conversions=50,
        ...     variant_visitors=1000,
        ...     variant_conversions=65
        ... )
        >>> print(f"P-value: {result.p_value:.4f}")
        >>> print(f"Significant: {result.is_significant}")
    """
    # Validate up front so bad input fails loudly instead of surfacing
    # later as ZeroDivisionError or nonsense rates.
    if control_visitors <= 0 or variant_visitors <= 0:
        raise ValueError("Visitor counts must be positive")
    if not 0 <= control_conversions <= control_visitors:
        raise ValueError("control_conversions must be in [0, control_visitors]")
    if not 0 <= variant_conversions <= variant_visitors:
        raise ValueError("variant_conversions must be in [0, variant_visitors]")
    if not 0 < confidence_level < 1:
        raise ValueError("confidence_level must be strictly between 0 and 1")

    # Observed conversion rates
    control_rate = control_conversions / control_visitors
    variant_rate = variant_conversions / variant_visitors

    # Lift (relative lift is undefined for a zero control rate; report 0.0)
    absolute_lift = variant_rate - control_rate
    relative_lift = absolute_lift / control_rate if control_rate > 0 else 0.0

    # Pooled probability: the single rate assumed under H0 (rates equal)
    pooled_prob = (control_conversions + variant_conversions) / (control_visitors + variant_visitors)

    # Standard error under the pooled null hypothesis
    se = np.sqrt(pooled_prob * (1 - pooled_prob) * (1 / control_visitors + 1 / variant_visitors))

    # se == 0 only in the degenerate all-or-nothing cases (pooled rate 0 or 1)
    z_score = absolute_lift / se if se > 0 else 0.0

    # Two-tailed p-value. norm.sf(x) == 1 - norm.cdf(x) but keeps precision
    # in the far tail, where 1 - cdf rounds to exactly 0.
    p_value = float(2 * stats.norm.sf(abs(z_score)))

    alpha = 1 - confidence_level
    z_critical = stats.norm.ppf(1 - alpha / 2)

    # The CI for the difference uses the unpooled standard error
    # (no equal-rates assumption, unlike the test statistic above).
    se_ci = np.sqrt(
        (control_rate * (1 - control_rate) / control_visitors) +
        (variant_rate * (1 - variant_rate) / variant_visitors)
    )
    ci_lower = float(absolute_lift - z_critical * se_ci)
    ci_upper = float(absolute_lift + z_critical * se_ci)

    # Post-hoc power: probability of rejecting H0 given the observed effect.
    # Guarded so the degenerate se == 0 case cannot divide by zero.
    if se > 0:
        effect_size = absolute_lift / np.sqrt(pooled_prob * (1 - pooled_prob))
        ncp = effect_size * np.sqrt(control_visitors * variant_visitors / (control_visitors + variant_visitors))
        power = float(1 - stats.norm.cdf(z_critical - ncp) + stats.norm.cdf(-z_critical - ncp))
    else:
        power = 0.0

    return ABTestResult(
        control_rate=control_rate,
        variant_rate=variant_rate,
        relative_lift=relative_lift,
        absolute_lift=absolute_lift,
        p_value=p_value,
        confidence_level=confidence_level,
        # bool(...) so the field is a plain Python bool, not numpy bool_
        is_significant=bool(p_value < alpha),
        confidence_interval=(ci_lower, ci_upper),
        sample_size_control=control_visitors,
        sample_size_variant=variant_visitors,
        power=power
    )
def calculate_sample_size(
    baseline_rate: float,
    minimum_detectable_effect: float,
    power: float = 0.8,
    confidence_level: float = 0.95
) -> int:
    """
    Calculate required sample size per variant.

    Uses the standard normal-approximation formula for a two-sided
    two-proportion test with pooled variance.

    Args:
        baseline_rate: Current conversion rate (e.g., 0.05 for 5%);
            must be strictly between 0 and 1
        minimum_detectable_effect: Relative effect to detect (e.g., 0.1
            for a 10% lift); must be non-zero, may be negative to size
            for detecting a decrease
        power: Statistical power (default 0.8 for 80%)
        confidence_level: Confidence level (default 0.95 for 95%)

    Returns:
        Required sample size per variant

    Raises:
        ValueError: If baseline_rate, power, or confidence_level lies
            outside (0, 1), minimum_detectable_effect is zero, or the
            implied variant rate falls outside (0, 1).

    Example:
        >>> n = calculate_sample_size(
        ...     baseline_rate=0.05,
        ...     minimum_detectable_effect=0.10
        ... )
        >>> print(f"Need {n:,} visitors per variant")
    """
    # Validate first: mde == 0 would otherwise divide by zero below, and
    # out-of-range rates give meaningless (or negative) variances.
    if not 0 < baseline_rate < 1:
        raise ValueError("baseline_rate must be strictly between 0 and 1")
    if minimum_detectable_effect == 0:
        raise ValueError("minimum_detectable_effect must be non-zero")
    if not 0 < power < 1:
        raise ValueError("power must be strictly between 0 and 1")
    if not 0 < confidence_level < 1:
        raise ValueError("confidence_level must be strictly between 0 and 1")

    # Expected variant rate implied by the relative effect
    variant_rate = baseline_rate * (1 + minimum_detectable_effect)
    if not 0 < variant_rate < 1:
        raise ValueError(
            "implied variant rate baseline_rate * (1 + minimum_detectable_effect) "
            "must be strictly between 0 and 1"
        )

    alpha = 1 - confidence_level
    # Critical values: two-tailed for alpha, one-tailed for power
    z_alpha = stats.norm.ppf(1 - alpha / 2)
    z_beta = stats.norm.ppf(power)

    # Pooled probability used for the variance term
    p_bar = (baseline_rate + variant_rate) / 2

    # n = 2 * p_bar * (1 - p_bar) * (z_alpha + z_beta)^2 / delta^2
    n = (
        2 * p_bar * (1 - p_bar) * (z_alpha + z_beta) ** 2
    ) / (baseline_rate - variant_rate) ** 2

    # Round up: sample sizes must be whole visitors and meet the power target
    return int(np.ceil(n))
def detect_srm(
    observed_control: int,
    observed_variant: int,
    expected_ratio: float = 0.5
) -> Dict[str, Any]:
    """
    Detect Sample Ratio Mismatch (SRM) via a chi-squared goodness-of-fit test.

    Args:
        observed_control: Number of users in control (must be >= 0)
        observed_variant: Number of users in variant (must be >= 0)
        expected_ratio: Expected ratio for control (default 0.5 for 50/50)

    Returns:
        Dictionary with chi_squared, p_value, srm_detected, observed_ratio,
        and expected_ratio

    Raises:
        ValueError: If a count is negative, both counts are zero, or
            expected_ratio is not strictly between 0 and 1.

    Example:
        >>> result = detect_srm(5500, 4500)
        >>> if result['srm_detected']:
        ...     print("WARNING: Sample Ratio Mismatch detected!")
    """
    if observed_control < 0 or observed_variant < 0:
        raise ValueError("Observed counts must be non-negative")
    if not 0 < expected_ratio < 1:
        raise ValueError("expected_ratio must be strictly between 0 and 1")

    total = observed_control + observed_variant
    if total == 0:
        raise ValueError("At least one observation is required")

    expected_control = total * expected_ratio
    expected_variant = total * (1 - expected_ratio)

    chi_sq, p_value = stats.chisquare(
        f_obs=[observed_control, observed_variant],
        f_exp=[expected_control, expected_variant]
    )

    # Cast numpy scalars to plain Python types so the result is
    # JSON-serializable and identity checks (`is True`) behave normally.
    return {
        'chi_squared': float(chi_sq),
        'p_value': float(p_value),
        # 0.001 is a deliberately strict alarm threshold: an SRM alarm
        # should almost always mean a real assignment/logging bug.
        'srm_detected': bool(p_value < 0.001),
        'observed_ratio': observed_control / total,
        'expected_ratio': expected_ratio
    }
def print_results(result: ABTestResult) -> None:
    """Pretty print A/B test results."""
    bar = "=" * 50
    lines = [
        "\n" + bar,
        "A/B TEST RESULTS",
        bar,
        "\nSample Sizes:",
        f" Control: {result.sample_size_control:,}",
        f" Variant: {result.sample_size_variant:,}",
        "\nConversion Rates:",
        f" Control: {result.control_rate:.2%}",
        f" Variant: {result.variant_rate:.2%}",
        "\nLift:",
        f" Relative: {result.relative_lift:+.2%}",
        f" Absolute: {result.absolute_lift:+.4f}",
        "\nStatistical Analysis:",
        f" P-value: {result.p_value:.4f}",
        f" Confidence Level: {result.confidence_level:.0%}",
        f" Confidence Interval: [{result.confidence_interval[0]:.4f}, {result.confidence_interval[1]:.4f}]",
        f" Statistical Power: {result.power:.2%}",
        "\nConclusion:",
    ]
    # Verdict line depends on significance first, then lift direction.
    if not result.is_significant:
        lines.append(" ○ INCONCLUSIVE: No significant difference detected")
    elif result.relative_lift > 0:
        lines.append(f" ✓ WINNER: Variant is significantly better ({result.relative_lift:+.2%})")
    else:
        lines.append(f" ✗ LOSER: Variant is significantly worse ({result.relative_lift:+.2%})")
    lines.append(bar + "\n")
    print("\n".join(lines))
# ============================================================
# EXAMPLE USAGE
# ============================================================
if __name__ == "__main__":
    # Example 1: Analyze A/B test results
    print("Example 1: A/B Test Analysis")
    result = calculate_significance(
        control_visitors=10000,
        control_conversions=500,  # 5% conversion rate
        variant_visitors=10000,
        variant_conversions=550  # 5.5% conversion rate
    )
    print_results(result)

    # Example 2: Calculate required sample size
    print("\nExample 2: Sample Size Calculation")
    sample_size = calculate_sample_size(
        baseline_rate=0.05,  # 5% current conversion rate
        minimum_detectable_effect=0.10  # Want to detect 10% relative lift
    )
    print(f"Required sample size: {sample_size:,} visitors per variant")
    print(f"Total visitors needed: {sample_size * 2:,}")

    # Example 3: Check for Sample Ratio Mismatch
    print("\nExample 3: SRM Detection")
    srm_result = detect_srm(
        observed_control=5500,
        observed_variant=4500
    )
    print(f"SRM Detected: {srm_result['srm_detected']}")
    print(f"P-value: {srm_result['p_value']:.6f}")
print(f"Observed ratio: {srm_result['observed_ratio']:.2%}")

Quick Reference
Calculate Significance
result = calculate_significance(
control_visitors=1000,
control_conversions=50,
variant_visitors=1000,
variant_conversions=65
)
print(f"P-value: {result.p_value:.4f}")
print(f"Significant: {result.is_significant}")

Calculate Sample Size
n = calculate_sample_size(
baseline_rate=0.05, # 5% conversion rate
minimum_detectable_effect=0.10 # 10% lift
)
print(f"Need {n:,} visitors per variant")

Detect SRM
srm = detect_srm(
observed_control=5500,
observed_variant=4500
)
if srm['srm_detected']:
print("WARNING: SRM detected!")

Sample Size Reference Table
Quick reference for the approximate total sample size across both variants combined (95% confidence, 80% power). Divide by two for the per-variant count, or run calculate_sample_size above for exact per-variant numbers:
| Baseline Rate | 10% Lift | 20% Lift | 30% Lift |
|---|---|---|---|
| 1% | ~310,000 | ~78,000 | ~35,000 |
| 2% | ~153,000 | ~38,000 | ~17,000 |
| 5% | ~60,000 | ~15,000 | ~6,700 |
| 10% | ~28,000 | ~7,200 | ~3,200 |
Common Mistakes
❌ Peeking and stopping early
Don't check significance repeatedly and stop when p < 0.05. This inflates false positive rate to 30%+. Decide sample size upfront.
❌ Using one-tailed tests
Always use two-tailed tests unless you have a strong reason not to. One-tailed tests are easier to "game."
❌ Ignoring SRM
Always check for Sample Ratio Mismatch before analyzing results. SRM invalidates your entire experiment.