data-jupyter-notebooks/ab_test_guest_journey_monitoring.ipynb


{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# A/B test monitoring\n",
"\n",
"## Initial setup\n",
"This first section just ensures that the connection to DWH works correctly."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pathlib\n",
"import yaml\n",
"import pandas as pd\n",
"import numpy as np\n",
"from sqlalchemy import create_engine\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"from statsmodels.stats.proportion import proportions_ztest\n",
"from scipy import stats\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/home/uri/.superhog-dwh/credentials.yml\n"
]
}
],
"source": [
"CREDS_FILEPATH = pathlib.Path.home() / \".superhog-dwh\" / \"credentials.yml\"\n",
"print(CREDS_FILEPATH)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Prepare connection to DWH\n",
"# Function to read credentials from the YAML file\n",
"def read_credentials(yaml_path: str, env: str = \"prd\"):\n",
" with open(yaml_path, \"r\") as file:\n",
" credentials = yaml.safe_load(file)\n",
" return credentials[\"envs\"][env]\n",
"# Function to create a PostgreSQL connection string\n",
"def create_postgres_engine(creds: dict):\n",
" user = creds[\"user\"]\n",
" password = creds[\"password\"]\n",
" host = creds[\"host\"]\n",
" port = creds[\"port\"]\n",
" database = creds[\"database\"]\n",
" # Create the connection string for SQLAlchemy\n",
" connection_string = f\"postgresql://{user}:{password}@{host}:{port}/{database}\"\n",
" engine = create_engine(connection_string)\n",
" return engine\n",
"# Function to execute a query and return the result as a pandas DataFrame\n",
"def query_to_dataframe(engine, query: str):\n",
" with engine.connect() as connection:\n",
" df = pd.read_sql(query, connection)\n",
" return df\n",
"dwh_creds = read_credentials(yaml_path=CREDS_FILEPATH, env=\"prd\")\n",
"dwh_pg_engine = create_postgres_engine(creds=dwh_creds)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ?column?\n",
"0 1\n"
]
}
],
"source": [
"# Silly query to test things out\n",
"test_df = query_to_dataframe(engine=dwh_pg_engine, query=\"SELECT 1;\")\n",
"print(test_df.head())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data Extraction\n",
"In this section we extract the data from the Guest Journey monitoring within DWH by configuring which A/B test we want to measure. Here we already handle the basic aggregations that will be needed in the future, directly in SQL."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ab_test_name variation last_update guest_journeys_count \\\n",
"0 AAVariantTest A 2024-12-04 470 \n",
"1 AAVariantTest B 2024-12-04 478 \n",
"\n",
" guest_journey_started_count guest_journey_completed_count \\\n",
"0 470 284 \n",
"1 478 270 \n",
"\n",
" guest_journey_with_responses_count guest_journey_with_payment_count \\\n",
"0 75 146 \n",
"1 72 143 \n",
"\n",
" guest_revenue_count deposit_count ... \\\n",
"0 146 29 ... \n",
"1 143 24 ... \n",
"\n",
" guest_revenue_avg_per_guest_journey guest_revenue_sdv_per_guest_journey \\\n",
"0 7.836050 13.857327 \n",
"1 7.631339 13.202289 \n",
"\n",
" deposit_avg_per_guest_journey deposit_sdv_per_guest_journey \\\n",
"0 0.496000 2.104774 \n",
"1 0.367761 1.715542 \n",
"\n",
" waiver_avg_per_guest_journey waiver_sdv_per_guest_journey \\\n",
"0 7.180958 13.653492 \n",
"1 7.090684 12.993245 \n",
"\n",
" check_in_cover_avg_per_guest_journey check_in_cover_sdv_per_guest_journey \\\n",
"0 0.159091 1.210487 \n",
"1 0.172894 1.253337 \n",
"\n",
" csat_avg_per_guest_journey_with_response \\\n",
"0 3.853333 \n",
"1 3.972222 \n",
"\n",
" csat_sdv_per_guest_journey_with_response \n",
"0 1.204646 \n",
"1 1.020525 \n",
"\n",
"[2 rows x 26 columns]\n"
]
}
],
"source": [
"# A/B test name to measure\n",
"ab_test_name = \"AAVariantTest\"\n",
"\n",
"# Query to extract data\n",
"data_extraction_query = \"\"\"\n",
"select \n",
"\tab_test_name,\n",
"\tvariation,\n",
"\tmax(first_appearance_date_utc) as last_update,\n",
" \n",
" -- SIMPLE COUNTS --\n",
"\tcount(id_verification_request) as guest_journeys_count,\n",
"\tcount(verification_started_date_utc) as guest_journey_started_count,\n",
"\tcount(verification_completed_date_utc) as guest_journey_completed_count,\n",
"\tcount(experience_rating) as guest_journey_with_responses_count,\n",
"\tcount(last_payment_paid_date_utc) as guest_journey_with_payment_count,\n",
"\tcount(guest_revenue_without_taxes_in_gbp) as guest_revenue_count,\n",
"\tcount(deposit_fees_without_taxes_in_gbp) as deposit_count,\n",
"\tcount(waiver_fees_without_taxes_in_gbp) as waiver_count,\n",
"\tcount(check_in_cover_fees_without_taxes_in_gbp) as check_in_cover_count,\n",
" \n",
" -- SIMPLE SUMS --\n",
"\tsum(guest_revenue_without_taxes_in_gbp) as guest_revenue_sum,\n",
"\tsum(deposit_fees_without_taxes_in_gbp) as deposit_sum,\n",
"\tsum(waiver_fees_without_taxes_in_gbp) as waiver_sum,\n",
"\tsum(check_in_cover_fees_without_taxes_in_gbp) as check_in_cover_sum,\n",
" \n",
" -- AVGs/SDVs PER GUEST JOURNEY (ANY GJ APPEARING IN THE A/B TEST) --\n",
" -- NOTE THE COALESCE HERE. THIS IS IMPORTANT FOR THE T-TEST COMPUTATION --\n",
" avg(coalesce(guest_revenue_without_taxes_in_gbp,0)) as guest_revenue_avg_per_guest_journey,\n",
" stddev(coalesce(guest_revenue_without_taxes_in_gbp,0)) as guest_revenue_sdv_per_guest_journey,\n",
" avg(coalesce(deposit_fees_without_taxes_in_gbp,0)) as deposit_avg_per_guest_journey,\n",
" stddev(coalesce(deposit_fees_without_taxes_in_gbp,0)) as deposit_sdv_per_guest_journey,\n",
" avg(coalesce(waiver_fees_without_taxes_in_gbp,0)) as waiver_avg_per_guest_journey,\n",
" stddev(coalesce(waiver_fees_without_taxes_in_gbp,0)) as waiver_sdv_per_guest_journey,\n",
" avg(coalesce(check_in_cover_fees_without_taxes_in_gbp,0)) as check_in_cover_avg_per_guest_journey,\n",
" stddev(coalesce(check_in_cover_fees_without_taxes_in_gbp,0)) as check_in_cover_sdv_per_guest_journey,\n",
" \n",
" -- AVGs/SDVs PER GUEST JOURNEY WITH CSAT RESPONSE --\n",
" -- NOTE THAT THERE'S NO COALESCE HERE. THIS IS IMPORTANT FOR THE T-TEST COMPUTATION --\n",
" avg(experience_rating) as csat_avg_per_guest_journey_with_response,\n",
" stddev(experience_rating) as csat_sdv_per_guest_journey_with_response\n",
" \n",
"from\n",
"\tintermediate.int_core__ab_test_monitoring_guest_journey\n",
"where\n",
"\tab_test_name = '{}'\n",
"group by\n",
"\t1,2\n",
"\"\"\".format(ab_test_name)\n",
"\n",
"# Retrieve Data from Query\n",
"df = query_to_dataframe(engine=dwh_pg_engine, query=data_extraction_query)\n",
"print(df.head())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Check A/B test Allocation to Variation"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAf0AAAIYCAYAAABnrTUkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABv7ElEQVR4nO3dd3hUZdoG8Htaeu+VNJJAEkLvvSMiClhYBVRUPlfR3dVdUVfRta8KrouFFRRFELBQLKD0HgiBQEIgCZBCCOkhjSRTz/fHmJFhkpBAkjPl/l1XLuBMOc9khrnPW857JIIgCCAiIiKrJxW7ACIiIuoaDH0iIiIbwdAnIiKyEQx9IiIiG8HQJyIishEMfSIiIhvB0CciIrIRDH0iIiIbwdAnIiKyEQz9djp69CheeuklTJ06FQMHDkR8fDwGDx6Mu+++G6+99hoOHz4MW1vkcOPGjYiNjcW4cePELoVa8MADDyA2Nhbvv/9+m+7/xhtvIDY2Fo899lin1vX8888jNjYWGzdu7NT9XCs2NhaxsbFdtr+OMH36dMTGxiIhIQFXrlxp9b4t/U6b/p8+//zznVmqKMaNG4fY2FhcunRJ7FLMHkO/jSorK/HII49g3rx5+O6771BXV4d+/fphypQp6NOnDyorK7F27Vo8/PDDmDlzptjltsvRo0cRGxuLuXPnil0KdZK7774bALB582ZotdpW76tSqfDTTz8ZPc5SzJ07F7GxsTh69KjYpXSYtLQ0ZGVlAQDUajV+/PFHkSvqWmIcGFozudgFWIKamhrcf//9yM3NRWRkJF555RUMGTLE5H7Z2dn48ssvsXXrVhGqJGrZlClT8MYbb6CsrAz79+/H2LFjW7zvrl27UFVVBS8vr07vvXnmmWfw2GOPwc/Pr1P3cy1L+//5/fffAwD8/f1RUlKC77//Hg8++KDIVZmXL7/8Emq1Gv7+/mKXYvbY0m+D119/Hbm5uQgNDcX69eubDXwAiImJwVtvvYXVq1d3cYVErXN0dMTtt98OADdsMTXdPn36dCgUik6ty8/PD1FRUXB1de3U/VwrKioKUVFRXba/W9HQ0IBffvkFAPDuu+/CyckJ2dnZSEtLE7ky89KtWzdERUV1+ufVGjD0b+DixYv4+eefAQAvvPAC3N3db/iYxMREk203GnO6URdWUlISFi5ciBEjRiAhIQFDhw7Fk08+idTU1Gbvn5eXhxdeeAHjxo1DQkIC+vbti7Fjx2LBggX44YcfDPebO3cu5s2bBwBITk42jHd25Bh9cXExXn/9dUyaNAm9evVC//79MXv2bKxfv77ZruZly5YhNjYWy5Yta/b5WhqOuHZ7Q0MDPvzwQ9x2223o3bu34bVcO65ZX1+PJUuWYOLEiUhISMDw4cOxaNEilJSUtPhaSkpK8Pbbbxuet2/fvpg1axbWrFkDjUZjdN85c+YgNjbW8PlpzooVKxAbG4u//OUvLd6nozR11e/ZsweVlZXN3qekpASHDh0yun9lZSVWr16Nxx57DOPGjUNiYiL69euHmTNn4rPPPoNSqWz2ua4dO//hhx9w3333oX///kb/D1r63NfV1eHbb7/FwoULMWnSJPTp0wd9+vTBHXfcgQ8++AA1NTVG929675OTkwEA8+bNM/osX/v8rY3pV1VVYenSpbj99tsN7+/MmTOxYsUKNDY2mtz/2s+cWq3GZ599httvvx2JiYkYPHgwFi5ciAsXLjS7r7b49ddfUVdXh5iYGAwZMgRTp04F8EfrvyOlpaXhL3/5i9F3zOOPP274PLQkKSkJTz/9NEaNGoWEhAQMGTIEs2bNwn//+1+j+QdqtRpbtmzBs88+iylTpqBfv35ITEzE5MmT8cYbb5j8v7t06RJiY2OxadMmAPrv32vf02u/H1r7fm1oaMBnn32GGTNmoG/fvujduzduv/12fPDBB6iurja5f9N+x40bB0EQsGHDBsycORN9+vRB//79MX/+/Ba/dy0Bu/dvYM+ePdDpdHB3d2+1S7Qz/fvf/8YXX3wBqVSKhIQE9O/fH0VFRdi1axf27NmD119/HbNmzTLcPzs7G3/6059QV1eHiIgIjB07FlKpFCUlJTh27BhKSkoM9x85ciTs7Oxw8OBB+Pj4YOTIkYbn8fT0vOXa09LS8Nhjj6GqqgpBQUGYMGECamtrkZycjNTUVOzYsQOffvop7OzsbnlfTZRKJebOnYsLFy5gwIAB6NGjB6qqqozuU1tbi9mzZ6OoqAj9+/dHdHQ0Tp48ic2bN+PYsWPYsmWLSevz2LFjePLJJ1FdXY3g4GAMGzYMKpUK6enpeP3117Fnzx4sX77c0NqYN28ejh07hjVr1mDatGkmdep0Oqxbtw6A/gChsyUmJiImJgbZ2dn48ccf8dBDD5ncZ9OmTdBqtejduzeio6MBAAcOHMCbb74Jf39/hIWFGeawnDp1CkuWLMHu3buxevXqFt/D119/Hd988w369u2LMWPGoKCgABKJpNVaMzMz8fLLL8PLywsRERGIj49HTU0NTp8+jeXLl2Pbtm3YsGGD4TPq4+ODGTNm4MCBAygvL8eIESPg6+treL5u3brd8PdTUFCABx98EIWFhfDy8sLo0aOhVqtx9OhRvP/++9i2bRtWrVrV7IG/Wq3GggULkJqaigEDBiAqKgppaWnYsWMHjh49ik2bNiEkJOSGNVyvKdyb/r/OmjUL33//PbZu3YoXX3wRDg4O7X7O5nz77bd45ZVXoNPpEBcXh8GDB6OwsBB79uzBnj178NRTT2HhwoUmj3vjjTfw9ddfAwB69uyJAQMGoLa2Frm5ufj4448xePBgDB48GABQUVGB5557Dq6uroiKikJsbCwaGhpw9uxZfP311/jll1+wfv16hIWFAQCcnJwwY8YMHD9+HBcvXkS/fv0MtzXt70aqqqrw0EMP4ezZs3BxccGQIUOgUCiQnJyM5cuX4+eff8ZXX33V4nvzwgsv4Oeff0b//v0xZswYnD17FocOHTL8v+7du3e7f9eiE6hV//jHP4SYmBjhwQcfvKXnGTt2rBATEyMUFBQ0e/uiRYuEmJgY4YcffjDavmHDBiEmJkaYOHGicPbsWaPbkpOThb59+wrx8fFCbm6uYfvzzz8vxMTECJ988onJfhoaGoTk5GSjbUeOHBFiYmKEOXPm3NRr++GHH4SYmBhh7NixRtuVSqXhdS9evFhQqVSG2y5evGi4benSpUaP++9//yvExMQI//3vf5vdX0v1Nm2PiYkR7rjjDqG0tLTFWmNiYoT58+cLtbW1htuqqqqEO++8U4iJiRGWL19u9LjS0lJh0KBBQmxsrLB27VpBq9UabqusrBTmzZsnxMTECMuWLTNs12g0hteYkZFhUsvu3bsNtXaVL7/8UoiJiRGmTZvW7O2TJk0SYmJihA0bNhi2nT9/XkhNTTW5b1VVlTB//nwhJiZGWLFihcntTb/nfv36Nft4QWj5c19UVCQcPnzY6PcsCIJQX18vPPfcc0JMTIzw6quvmjzfnDlzhJiYGOHIkSPN7u/au
q53zz33CDExMcLjjz8uXL161bC9oqJCmDFjhhATEyM888wzRo+59jN31113GX3mGhsbDb+fl19+ucV6WpKTkyPExMQI8fHxQkVFhWH7lClThJiYGGHTpk3NPq6l32nTZ3/RokVG2zMzM4W4uDghNjbW5Dn37t0rxMfHCzExMcLBgweNblu9erUQExMjDBo0SEhKSjKp49SpU8Lly5cN/66trRV27twpKJVKo/upVCphyZIlQkxMjPDYY4+1+fVcq6Xv17/+9a9CTEyMcM899wiVlZWG7XV1dcKjjz4qxMTECPfdd5/RYwoKCgzv6dixY4WcnBzDbRqNRnjhhRcM3x+WiN37N9DUPeXl5dXs7ZmZmXj++edNflJSUm553zqdztCFtXTpUvTo0cPo9oEDB+KJJ56AWq3Ghg0bDNsrKioAAKNHjzZ5TgcHBwwcOPCWa2uLbdu2obCwEH5+fvjnP/9pNN4WGhqKRYsWAQC+/vrrFruIb9bixYuNWnrXc3Jywttvvw0XFxfDNnd3dyxYsAA
"text/plain": [
"<Figure size 800x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Ensure Seaborn styling\n",
"sns.set_theme(style=\"whitegrid\")\n",
"\n",
"# Calculate the total guest_journeys_count per variation\n",
"grouped_data = df.groupby('variation')['guest_journeys_count'].sum()\n",
"\n",
"# Find the total count and other metadata\n",
"total_count = grouped_data.sum()\n",
"ab_test_name = df['ab_test_name'].iloc[0] # Assuming all rows are for the same A/B test\n",
"last_update = df['last_update'].max()\n",
"\n",
"# Create a pie chart using Seaborn styling\n",
"plt.figure(figsize=(8, 6))\n",
"colors = sns.color_palette(\"pastel\") # Seaborn pastel colors\n",
"\n",
"# Pie chart with labels inside each sector\n",
"plt.pie(\n",
" grouped_data, \n",
" labels=[f\"{var}\\n{count} ({count/total_count:.1%})\" for var, count in grouped_data.items()],\n",
" autopct=None, \n",
" colors=colors, \n",
" startangle=90,\n",
" wedgeprops={'edgecolor': 'none'}, # Remove edges around sectors\n",
" pctdistance=0.70, # Places the labels closer to the center (inside)\n",
" labeldistance=0.2 # Ensure labels are positioned inside the sectors\n",
")\n",
"\n",
"# Add title\n",
"plt.title(\"Guest Journey - Variation Allocation\", fontsize=16)\n",
"\n",
"# Add total count to the bottom-left\n",
"plt.text(-1.4, -1.3, f\"Total Count: {total_count}\", fontsize=10, ha='left', color='black')\n",
"\n",
"# Add A/B test name and last update to the bottom-right\n",
"plt.text(1.2, -1.3, f\"A/B Test: {ab_test_name}\", fontsize=8, ha='right', color='gray')\n",
"plt.text(1.2, -1.4, f\"Last Update: {last_update}\", fontsize=8, ha='right', color='gray')\n",
"\n",
"plt.show()"
]
},
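{
"cell_type": "markdown",
"metadata": {},
"source": [
"The pie chart alone does not tell us whether a small imbalance matters. The sketch below adds a sample-ratio-mismatch (SRM) check, testing the observed split against an assumed 50/50 target allocation with the `proportions_ztest` already imported above. If the test was configured with a different target split, adjust `value` accordingly."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sample-ratio-mismatch (SRM) check -- a sketch assuming a two-variation\n",
"# test with an intended 50/50 allocation.\n",
"counts = grouped_data.values  # guest_journeys_count per variation\n",
"srm_stat, srm_p = proportions_ztest(count=counts[0], nobs=counts.sum(), value=0.5)\n",
"if srm_p < 0.05:\n",
"    print(f\"WARNING: possible sample-ratio mismatch (p={srm_p:.4f})\")\n",
"else:\n",
"    print(f\"Observed split is consistent with 50/50 (p={srm_p:.4f})\")"
]
},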
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Statistical Analysis\n",
"In this section we compute the metrics needed for monitoring as well as check if there's any statistical difference between the different variations."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Z-test for Proportion Metrics (Rates)\n",
"This section defines the functions used to compute Z-test Proportion analysis"
]
},
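{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, with $x_A, x_B$ successes out of $n_A, n_B$ journeys, `proportions_ztest` computes the pooled two-proportion Z-statistic\n",
"\n",
"$$z = \\frac{\\hat{p}_A - \\hat{p}_B}{\\sqrt{\\hat{p}\\,(1-\\hat{p})\\left(\\frac{1}{n_A} + \\frac{1}{n_B}\\right)}}, \\qquad \\hat{p} = \\frac{x_A + x_B}{n_A + n_B},$$\n",
"\n",
"and the two-sided p-value from the standard normal distribution (the sign of $z$ does not affect the two-sided p-value)."
]
},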
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Generalized function to calculate Z-test for any metric\n",
"def calculate_z_test(df, metric_name, variation_a, variation_b, success_counts, total_counts):\n",
"\n",
" # Aggregate the success counts (numerator) and total counts (denominator) for each variation\n",
" success_a = df[df['variation'] == variation_a][success_counts].sum()\n",
" success_b = df[df['variation'] == variation_b][success_counts].sum()\n",
"\n",
" total_a = df[df['variation'] == variation_a][total_counts].sum()\n",
" total_b = df[df['variation'] == variation_b][total_counts].sum()\n",
"\n",
" # Calculate conversion rates for each variation\n",
" value_A = success_a / total_a if total_a != 0 else 0\n",
" value_B = success_b / total_b if total_b != 0 else 0\n",
"\n",
" # Absolute difference (B - A)\n",
" abs_diff = value_B - value_A\n",
"\n",
" # Relative difference (B - A) / A\n",
" rel_diff = (value_B - value_A) / value_A if value_A != 0 else 0\n",
"\n",
" # Perform the z-test for proportions\n",
" count = [success_a, success_b] # Success counts for A and B\n",
" nobs = [total_a, total_b] # Total counts for A and B\n",
" \n",
" # Calculate z-stat and p-value\n",
" z_stat, p_value = proportions_ztest(count, nobs)\n",
" \n",
" # Flag for significance at 95% level (p-value < 0.05)\n",
" is_significant = p_value < 0.05\n",
"\n",
" # Return the result as a dictionary\n",
" return {\n",
" 'metric': metric_name,\n",
" 'variation_A_name': variation_a,\n",
" 'variation_B_name': variation_b,\n",
" 'variation_A_value': value_A,\n",
" 'variation_B_value': value_B,\n",
" 'absolute_difference': abs_diff,\n",
" 'relative_difference': rel_diff,\n",
" 'statistic': z_stat,\n",
" 'p_value': p_value,\n",
" 'is_significant_95': is_significant\n",
" }\n",
"\n",
"# Function to run Z-tests for multiple metrics and aggregate results into a DataFrame\n",
"def run_z_tests(df, z_stat_metric_definition, variations):\n",
" results = []\n",
" \n",
" # Loop over all metrics in z_stat_metric_definition\n",
" for metric_name, metric_definition in z_stat_metric_definition.items():\n",
" success_counts = metric_definition['success_counts']\n",
" total_counts = metric_definition['total_counts']\n",
" \n",
" # Run the Z-test for each metric\n",
" result = calculate_z_test(df, metric_name, variation_a=variations[0], variation_b=variations[1], \n",
" success_counts=success_counts, total_counts=total_counts)\n",
" results.append(result)\n",
" \n",
" # Create a DataFrame from the results\n",
" results_df = pd.DataFrame(results)\n",
" \n",
" return results_df"
]
},
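{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check of `calculate_z_test` on synthetic counts (the numbers are made up so that the gap should be detectable): 120/400 vs. 90/400 successes is a 7.5pp difference, which the Z-test should flag at the 95% level."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity check on synthetic counts: 30.0% vs. 22.5% should be significant.\n",
"toy_df = pd.DataFrame({\n",
"    'variation': ['A', 'B'],\n",
"    'successes': [120, 90],\n",
"    'totals': [400, 400],\n",
"})\n",
"toy_result = calculate_z_test(toy_df, 'toy_rate', variation_a='A', variation_b='B',\n",
"                              success_counts='successes', total_counts='totals')\n",
"print(round(toy_result['p_value'], 4), toy_result['is_significant_95'])"
]
},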
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### T-test for non-proportion metrics\n",
"This section defines the functions used to compute T-tests for metrics outside of the proportion scope, mostly Revenue-related metrics."
]
},
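{
"cell_type": "markdown",
"metadata": {},
"source": [
"The functions below work from the per-variation means and standard deviations already computed in SQL, using the unpooled two-sample statistic\n",
"\n",
"$$t = \\frac{\\bar{x}_A - \\bar{x}_B}{\\sqrt{\\frac{s_A^2}{n_A} + \\frac{s_B^2}{n_B}}},$$\n",
"\n",
"with a deliberately conservative degrees-of-freedom choice of $\\min(n_A, n_B) - 1$ instead of the Welch-Satterthwaite approximation."
]
},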
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Generalized function to calculate T-test for revenue-related metrics\n",
"def calculate_t_test(df, metric_name, variation_a, variation_b, metric_avg_column, metric_sdv_column, total_counts):\n",
" # Aggregate the avgs and standard deviations for each variation\n",
" mean_a = df[df['variation'] == variation_a][metric_avg_column].mean() # Assuming the avg is calculated for each group\n",
" mean_b = df[df['variation'] == variation_b][metric_avg_column].mean() # Assuming the avg is calculated for each group\n",
" \n",
" sdv_a = df[df['variation'] == variation_a][metric_sdv_column].mean() # Assuming the stddev is calculated for each group\n",
" sdv_b = df[df['variation'] == variation_b][metric_sdv_column].mean() # Assuming the stddev is calculated for each group\n",
" \n",
" total_a = df[df['variation'] == variation_a][total_counts].sum()\n",
" total_b = df[df['variation'] == variation_b][total_counts].sum()\n",
"\n",
" # Absolute difference (B - A)\n",
" abs_diff = mean_b - mean_a\n",
"\n",
" # Relative difference (B - A) / A\n",
" rel_diff = (mean_b - mean_a) / mean_a if mean_a != 0 else 0\n",
"\n",
" # Calculate the T-statistic and p-value using the formula for two-sample T-test\n",
" se_a = sdv_a / (total_a ** 0.5) if total_a != 0 else 0\n",
" se_b = sdv_b / (total_b ** 0.5) if total_b != 0 else 0\n",
"\n",
" # Standard error of the difference between the means\n",
" se_diff = (se_a ** 2 + se_b ** 2) ** 0.5\n",
" \n",
" # T-statistic formula\n",
" if se_diff != 0:\n",
" t_stat = (mean_a - mean_b) / se_diff\n",
" else:\n",
" t_stat = 0\n",
" \n",
" # Degrees of freedom (for independent samples)\n",
" df_degrees = min(total_a - 1, total_b - 1) # Using the smaller of the two sample sizes minus 1\n",
" \n",
" # P-value from the T-distribution\n",
" p_value = stats.t.sf(abs(t_stat), df_degrees) * 2 # Two-tailed test\n",
" \n",
" # Flag for significance at 95% level (p-value < 0.05)\n",
" is_significant = p_value < 0.05\n",
"\n",
" # Return the result as a dictionary\n",
" return {\n",
" 'metric': metric_name,\n",
" 'variation_A_name': variation_a,\n",
" 'variation_B_name': variation_b,\n",
" 'variation_A_value': mean_a,\n",
" 'variation_B_value': mean_b,\n",
" 'absolute_difference': abs_diff,\n",
" 'relative_difference': rel_diff,\n",
" 'statistic': t_stat,\n",
" 'p_value': p_value,\n",
" 'is_significant_95': is_significant\n",
" }\n",
"\n",
"# Function to run T-tests for multiple revenue metrics and aggregate results into a DataFrame\n",
"def run_t_tests(df, t_stat_metric_definition, variations):\n",
" results = []\n",
" \n",
" # Loop over all metrics in t_stat_metric_definition\n",
" for metric_name, metric_definition in t_stat_metric_definition.items():\n",
" metric_avg_column = metric_definition['metric_avg_column']\n",
" metric_sdv_column = metric_definition['metric_sdv_column']\n",
" total_counts = metric_definition['total_counts']\n",
" \n",
" # Run the T-test for each metric\n",
" result = calculate_t_test(df, metric_name, variation_a=variations[0], variation_b=variations[1], \n",
" metric_avg_column=metric_avg_column, metric_sdv_column=metric_sdv_column, \n",
" total_counts=total_counts)\n",
" results.append(result)\n",
" \n",
" # Create a DataFrame from the results\n",
" results_df = pd.DataFrame(results)\n",
" \n",
" return results_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Specify the metric definition for Z-stat and T-stat tests"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"# Define the variations in which we want to run the tests\n",
"var_A = 'A'\n",
"var_B = 'B'\n",
"variations = [var_A, var_B]\n",
"\n",
"# Define the Z-test metric definitions (with both success_counts and total_counts)\n",
"z_stat_metric_definition = {\n",
" 'conversion_rate': {\n",
" 'success_counts': 'guest_journey_completed_count',\n",
" 'total_counts': 'guest_journeys_count'\n",
" },\n",
" 'payment_rate': {\n",
" 'success_counts': 'guest_journey_with_payment_count',\n",
" 'total_counts': 'guest_journeys_count'\n",
" },\n",
" 'waiver_payment_rate': {\n",
" 'success_counts': 'waiver_count',\n",
" 'total_counts': 'guest_journeys_count'\n",
" },\n",
" 'deposit_payment_rate': {\n",
" 'success_counts': 'deposit_count',\n",
" 'total_counts': 'guest_journeys_count'\n",
" },\n",
" 'CIH_payment_rate': {\n",
" 'success_counts': 'check_in_cover_count',\n",
" 'total_counts': 'guest_journeys_count'\n",
" }\n",
"}\n",
"\n",
"# Define the T-test metric definitions (with both metric_avg_column and metric_sdv_column)\n",
"t_stat_metric_definition = {\n",
" 'avg_guest_revenue_per_gj': {\n",
" 'metric_avg_column': 'guest_revenue_avg_per_guest_journey',\n",
" 'metric_sdv_column': 'guest_revenue_sdv_per_guest_journey',\n",
" 'total_counts': 'guest_journeys_count'\n",
" },\n",
" 'avg_waiver_revenue_per_gj': {\n",
" 'metric_avg_column': 'waiver_avg_per_guest_journey',\n",
" 'metric_sdv_column': 'waiver_sdv_per_guest_journey',\n",
" 'total_counts': 'guest_journeys_count'\n",
" },\n",
" 'avg_deposit_revenue_per_gj': {\n",
" 'metric_avg_column': 'deposit_avg_per_guest_journey',\n",
" 'metric_sdv_column': 'deposit_sdv_per_guest_journey',\n",
" 'total_counts': 'guest_journeys_count'\n",
" },\n",
" 'avg_CIH_revenue_per_gj': {\n",
" 'metric_avg_column': 'check_in_cover_avg_per_guest_journey',\n",
" 'metric_sdv_column': 'check_in_cover_sdv_per_guest_journey',\n",
" 'total_counts': 'guest_journeys_count'\n",
" },\n",
" 'avg_csat_per_gj_with_response': {\n",
" 'metric_avg_column': 'csat_avg_per_guest_journey_with_response',\n",
" 'metric_sdv_column': 'csat_sdv_per_guest_journey_with_response',\n",
" 'total_counts': 'guest_journey_with_responses_count'\n",
" }\n",
"\n",
"}\n",
"\n",
"# Define the metrics that will be the main ones for this A/B test:\n",
"main_metrics = ['avg_guest_revenue_per_gj', 'conversion_rate', 'payment_rate']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Run the computation of the metrics and statistical significance"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" metric relative_difference is_significant_95\n",
"0 conversion_rate -0.065207 False\n",
"1 payment_rate -0.036940 False\n",
"2 waiver_payment_rate 0.000072 False\n",
"3 deposit_payment_rate -0.186265 False\n",
"4 CIH_payment_rate 0.106172 False\n",
"5 avg_guest_revenue_per_gj -0.026124 False\n",
"6 avg_waiver_revenue_per_gj -0.012571 False\n",
"7 avg_deposit_revenue_per_gj -0.258547 False\n",
"8 avg_CIH_revenue_per_gj 0.086757 False\n",
"9 avg_csat_per_gj_with_response 0.030854 False\n"
]
}
],
"source": [
"# Call the function to calculate the Z-test for each metric and aggregate the results\n",
"z_test_results_df = run_z_tests(df, z_stat_metric_definition=z_stat_metric_definition, variations=variations)\n",
"\n",
"# Call the function to calculate the T-test for each metric and aggregate the results\n",
"t_test_results_df = run_t_tests(df, t_stat_metric_definition=t_stat_metric_definition, variations=variations)\n",
"\n",
"# Add a new column to identify whether it's from Z-test or T-test\n",
"z_test_results_df['test_type'] = 'Z-test'\n",
"t_test_results_df['test_type'] = 'T-test'\n",
"\n",
"# Combine the dataframes after adding the 'test_type' column\n",
"combined_results_df = pd.concat([z_test_results_df, t_test_results_df], ignore_index=True)\n",
"\n",
"# Print the main aggregated DataFrame\n",
"print(combined_results_df[['metric','relative_difference','is_significant_95']])"
]
},
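{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a cross-check of the hand-rolled T-test, the sketch below recomputes one metric with SciPy's Welch test from the same summary statistics. Because `stats.ttest_ind_from_stats` uses the Welch-Satterthwaite degrees of freedom rather than the conservative $\\min(n_A, n_B) - 1$, its p-value may differ slightly from the one in `combined_results_df`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Welch T-test cross-check for avg_guest_revenue_per_gj (sketch).\n",
"row_a = df[df['variation'] == var_A].iloc[0]\n",
"row_b = df[df['variation'] == var_B].iloc[0]\n",
"t_check, p_check = stats.ttest_ind_from_stats(\n",
"    mean1=row_a['guest_revenue_avg_per_guest_journey'],\n",
"    std1=row_a['guest_revenue_sdv_per_guest_journey'],\n",
"    nobs1=row_a['guest_journeys_count'],\n",
"    mean2=row_b['guest_revenue_avg_per_guest_journey'],\n",
"    std2=row_b['guest_revenue_sdv_per_guest_journey'],\n",
"    nobs2=row_b['guest_journeys_count'],\n",
"    equal_var=False,  # Welch's test\n",
")\n",
"print(f\"Welch cross-check: t={t_check:.3f}, p={p_check:.3f}\")"
]
},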
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Results\n"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"AAVariantTest results (last updated at 2024-12-04)\n",
"\n",
"Total Guest Journeys affected by this A/B test: 948 - Total Guest Revenue: 7330 GBP.\n",
" Variation A: Guest Journeys 470 (49.6%) - Guest Revenue: 3682 GBP (50.2%).\n",
" Variation B: Guest Journeys 478 (50.4%) - Guest Revenue: 3647 GBP (49.8%).\n",
"\n",
"Main Metrics - Comparing B vs. A.\n",
"\n",
"CONVERSION RATE (not significant): 56.5% vs. 60.4% (-3.9% | -6.5%).\n",
"PAYMENT RATE (not significant): 29.9% vs. 31.1% (-1.1% | -3.7%).\n",
"AVG GUEST REVENUE PER GJ (not significant): 7.63 vs. 7.84 (-0.2 | -2.6%).\n",
"\n",
"Other Metrics\n",
"\n",
"WAIVER PAYMENT RATE (not significant): 24.9% vs. 24.9% (0.0% | 0.0%).\n",
"DEPOSIT PAYMENT RATE (not significant): 5.0% vs. 6.2% (-1.1% | -18.6%).\n",
"CIH PAYMENT RATE (not significant): 1.9% vs. 1.7% (0.2% | 10.6%).\n",
"AVG WAIVER REVENUE PER GJ (not significant): 7.09 vs. 7.18 (-0.09 | -1.3%).\n",
"AVG DEPOSIT REVENUE PER GJ (not significant): 0.37 vs. 0.5 (-0.13 | -25.9%).\n",
"AVG CIH REVENUE PER GJ (not significant): 0.17 vs. 0.16 (0.01 | 8.7%).\n",
"AVG CSAT PER GJ WITH RESPONSE (not significant): 3.97 vs. 3.85 (0.12 | 3.1%).\n"
]
}
],
"source": [
"print('\\n{} results (last updated at {})\\n'.format(ab_test_name, last_update))\n",
"\n",
"# Get main volume indicators per variation\n",
"grouped_data = df.groupby('variation')[[\"guest_journeys_count\",\"guest_revenue_sum\"]].sum()\n",
"\n",
"# Find the totals over any variation\n",
"total_count = grouped_data.sum()\n",
"\n",
"# Print overall indicators for volumes\n",
"print('Total Guest Journeys affected by this A/B test: {} - Total Guest Revenue: {} GBP.'.format(int(total_count.loc[\"guest_journeys_count\"]), \n",
" int(total_count.loc[\"guest_revenue_sum\"])))\n",
"for var in variations:\n",
" print(' Variation {}: Guest Journeys {} ({}%) - Guest Revenue: {} GBP ({}%).'.format(\n",
" var, \n",
" int(grouped_data.loc[var,'guest_journeys_count']), \n",
" round(100*(grouped_data.loc[var,'guest_journeys_count']/total_count.loc[\"guest_journeys_count\"]),1),\n",
" int(grouped_data.loc[var,'guest_revenue_sum']),\n",
" round(100*(grouped_data.loc[var,'guest_revenue_sum']/total_count.loc[\"guest_revenue_sum\"]),1)\n",
" ))\n",
"\n",
"# Split results whether the metrics are main metrics or not\n",
"main_metrics_rows = combined_results_df[combined_results_df['metric'].isin(main_metrics)]\n",
"other_metrics_rows = combined_results_df[~combined_results_df['metric'].isin(main_metrics)]\n",
"\n",
"def print_metrics(df, header=None):\n",
" if header:\n",
" print(f'\\n{header}\\n')\n",
"\n",
" for row in df.iterrows():\n",
" metric = row[1]['metric'].upper().replace('_', ' ')\n",
" if row[1]['test_type'] == 'Z-test':\n",
" value_a = str(round(100 * row[1]['variation_A_value'], 1)) + '%'\n",
" value_b = str(round(100 * row[1]['variation_B_value'], 1)) + '%'\n",
" abs_diff = str(round(100 * row[1]['absolute_difference'], 1)) + '%'\n",
" else:\n",
" value_a = str(round(row[1]['variation_A_value'], 2))\n",
" value_b = str(round(row[1]['variation_B_value'], 2))\n",
" abs_diff = str(round(row[1]['absolute_difference'], 2))\n",
" rel_diff = str(round(100 * row[1]['relative_difference'], 1)) + '%'\n",
" stat_sign = row[1]['is_significant_95']\n",
"\n",
" if stat_sign:\n",
" print(f\"{metric} - SIGNIFICANT RESULT: {value_b} vs. {value_a} ({abs_diff} | {rel_diff}).\")\n",
" else:\n",
" print(f\"{metric} (not significant): {value_b} vs. {value_a} ({abs_diff} | {rel_diff}).\")\n",
"\n",
"# Print main metrics\n",
"print_metrics(main_metrics_rows, header=\"Main Metrics - Comparing {} vs. {}.\".format(var_B, var_A))\n",
"\n",
"# Print other metrics\n",
"print_metrics(other_metrics_rows, header=\"Other Metrics\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}