data-jupyter-notebooks/ab_test_guest_journey_monitoring.ipynb


{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# A/B test monitoring\n",
"\n",
"## Initial setup\n",
"This first section just ensures that the connection to DWH works correctly."
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"import pathlib\n",
"import yaml\n",
"import pandas as pd\n",
"import numpy as np\n",
"from sqlalchemy import create_engine\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"from statsmodels.stats.proportion import proportions_ztest\n",
"from scipy import stats\n"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/home/uri/.superhog-dwh/credentials.yml\n"
]
}
],
"source": [
"CREDS_FILEPATH = pathlib.Path.home() / \".superhog-dwh\" / \"credentials.yml\"\n",
"print(CREDS_FILEPATH)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"# Prepare connection to DWH\n",
"# Function to read credentials from the YAML file\n",
"def read_credentials(yaml_path: str, env: str = \"prd\"):\n",
" with open(yaml_path, \"r\") as file:\n",
" credentials = yaml.safe_load(file)\n",
" return credentials[\"envs\"][env]\n",
"# Function to create a PostgreSQL connection string\n",
"def create_postgres_engine(creds: dict):\n",
" user = creds[\"user\"]\n",
" password = creds[\"password\"]\n",
" host = creds[\"host\"]\n",
" port = creds[\"port\"]\n",
" database = creds[\"database\"]\n",
" # Create the connection string for SQLAlchemy\n",
" connection_string = f\"postgresql://{user}:{password}@{host}:{port}/{database}\"\n",
" engine = create_engine(connection_string)\n",
" return engine\n",
"# Function to execute a query and return the result as a pandas DataFrame\n",
"def query_to_dataframe(engine, query: str):\n",
" with engine.connect() as connection:\n",
" df = pd.read_sql(query, connection)\n",
" return df\n",
"dwh_creds = read_credentials(yaml_path=CREDS_FILEPATH, env=\"prd\")\n",
"dwh_pg_engine = create_postgres_engine(creds=dwh_creds)"
]
},
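{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, `read_credentials` expects `credentials.yml` to follow the layout sketched below. Only the key structure (`envs` -> environment name -> connection fields) is implied by the code above; all values here are illustrative placeholders, not real credentials.\n",
"\n",
"```yaml\n",
"envs:\n",
"  prd:\n",
"    user: dwh_user\n",
"    password: \"********\"\n",
"    host: dwh.example.com\n",
"    port: 5432\n",
"    database: analytics\n",
"```"
]
},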
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ?column?\n",
"0 1\n"
]
}
],
"source": [
"# Silly query to test things out\n",
"test_df = query_to_dataframe(engine=dwh_pg_engine, query=\"SELECT 1;\")\n",
"print(test_df.head())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data Extraction\n",
"In this section we extract the data from the Guest Journey monitoring within DWH by configuring which A/B test we want to measure. Here we already handle the basic aggregations that will be needed in the future, directly in SQL."
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ab_test_name variation last_update \\\n",
"0 VerificationProductSelectionButtonPosition Fixed 2024-12-11 \n",
"1 VerificationProductSelectionButtonPosition Relative 2024-12-11 \n",
"\n",
" guest_journeys_count guest_journey_started_count \\\n",
"0 18 18 \n",
"1 24 24 \n",
"\n",
" guest_journey_completed_count guest_journey_with_responses_count \\\n",
"0 3 2 \n",
"1 10 3 \n",
"\n",
" guest_journey_with_payment_count guest_revenue_count deposit_count ... \\\n",
"0 4 4 1 ... \n",
"1 5 5 1 ... \n",
"\n",
" guest_revenue_avg_per_guest_journey guest_revenue_sdv_per_guest_journey \\\n",
"0 5.800667 13.402593 \n",
"1 5.902642 13.025999 \n",
"\n",
" deposit_avg_per_guest_journey deposit_sdv_per_guest_journey \\\n",
"0 0.367500 1.559170 \n",
"1 0.261454 1.280859 \n",
"\n",
" waiver_avg_per_guest_journey waiver_sdv_per_guest_journey \\\n",
"0 4.910261 11.769242 \n",
"1 5.641188 13.081060 \n",
"\n",
" check_in_cover_avg_per_guest_journey check_in_cover_sdv_per_guest_journey \\\n",
"0 0.522906 2.2185 \n",
"1 0.000000 0.0000 \n",
"\n",
" csat_avg_per_guest_journey_with_response \\\n",
"0 4.0 \n",
"1 3.0 \n",
"\n",
" csat_sdv_per_guest_journey_with_response \n",
"0 1.414214 \n",
"1 1.732051 \n",
"\n",
"[2 rows x 26 columns]\n"
]
}
],
"source": [
"# A/B test name to measure\n",
"#ab_test_name = \"AAVariantTest\"\n",
"ab_test_name = \"VerificationProductSelectionButtonPosition\"\n",
"\n",
"# Query to extract data\n",
"data_extraction_query = \"\"\"\n",
"select \n",
"\tab_test_name,\n",
"\tvariation,\n",
"\tmax(first_appearance_date_utc) as last_update,\n",
" \n",
" -- SIMPLE COUNTS --\n",
"\tcount(id_verification_request) as guest_journeys_count,\n",
"\tcount(verification_started_date_utc) as guest_journey_started_count,\n",
"\tcount(verification_completed_date_utc) as guest_journey_completed_count,\n",
"\tcount(experience_rating) as guest_journey_with_responses_count,\n",
"\tcount(last_payment_paid_date_utc) as guest_journey_with_payment_count,\n",
"\tcount(guest_revenue_without_taxes_in_gbp) as guest_revenue_count,\n",
"\tcount(deposit_fees_without_taxes_in_gbp) as deposit_count,\n",
"\tcount(waiver_fees_without_taxes_in_gbp) as waiver_count,\n",
"\tcount(check_in_cover_fees_without_taxes_in_gbp) as check_in_cover_count,\n",
" \n",
" -- SIMPLE SUMS --\n",
"\tsum(guest_revenue_without_taxes_in_gbp) as guest_revenue_sum,\n",
"\tsum(deposit_fees_without_taxes_in_gbp) as deposit_sum,\n",
"\tsum(waiver_fees_without_taxes_in_gbp) as waiver_sum,\n",
"\tsum(check_in_cover_fees_without_taxes_in_gbp) as check_in_cover_sum,\n",
" \n",
" -- AVGs/SDVs PER GUEST JOURNEY (ANY GJ APPEARING IN THE A/B TEST) --\n",
" -- NOTE THE COALESCE HERE. THIS IS IMPORTANT FOR THE T-TEST COMPUTATION --\n",
" avg(coalesce(guest_revenue_without_taxes_in_gbp,0)) as guest_revenue_avg_per_guest_journey,\n",
" stddev(coalesce(guest_revenue_without_taxes_in_gbp,0)) as guest_revenue_sdv_per_guest_journey,\n",
" avg(coalesce(deposit_fees_without_taxes_in_gbp,0)) as deposit_avg_per_guest_journey,\n",
" stddev(coalesce(deposit_fees_without_taxes_in_gbp,0)) as deposit_sdv_per_guest_journey,\n",
" avg(coalesce(waiver_fees_without_taxes_in_gbp,0)) as waiver_avg_per_guest_journey,\n",
" stddev(coalesce(waiver_fees_without_taxes_in_gbp,0)) as waiver_sdv_per_guest_journey,\n",
" avg(coalesce(check_in_cover_fees_without_taxes_in_gbp,0)) as check_in_cover_avg_per_guest_journey,\n",
" stddev(coalesce(check_in_cover_fees_without_taxes_in_gbp,0)) as check_in_cover_sdv_per_guest_journey,\n",
" \n",
" -- AVGs/SDVs PER GUEST JOURNEY WITH CSAT RESPONSE --\n",
" -- NOTE THAT THERE'S NO COALESCE HERE. THIS IS IMPORTANT FOR THE T-TEST COMPUTATION --\n",
" avg(experience_rating) as csat_avg_per_guest_journey_with_response,\n",
" stddev(experience_rating) as csat_sdv_per_guest_journey_with_response\n",
" \n",
"from\n",
"\tintermediate.int_core__ab_test_monitoring_guest_journey\n",
"where\n",
"\tab_test_name = '{}'\n",
"group by\n",
"\t1,2\n",
"\"\"\".format(ab_test_name)\n",
"\n",
"# Retrieve Data from Query\n",
"df = query_to_dataframe(engine=dwh_pg_engine, query=data_extraction_query)\n",
"print(df.head())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Check A/B test Allocation to Variation"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAf0AAAIYCAYAAABnrTUkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB52UlEQVR4nO3dd3gUdf4H8PeW9N5DeiEJkBAg9F6kg4gURaUoKocKnKf3s52Hnnqn3tk9lcMCIqKgNEVAivSeEAiEBAJJSO+9bZ3fH2tWlnRSZjf7fj1PHsju7MxnS/Y93zIzEkEQBBAREVG3JxW7ACIiIuoaDH0iIiIzwdAnIiIyEwx9IiIiM8HQJyIiMhMMfSIiIjPB0CciIjITDH0iIiIzwdAnIiIyEwz9Njpz5gxefvllTJ8+HYMHD0ZkZCSGDh2KefPm4bXXXsPJkydhbic53LZtGyIiIjBhwgSxS6EmPPTQQ4iIiMA777zTquXfeOMNRERE4PHHH+/Uul544QVERERg27ZtnbqdW0VERCAiIqLLttcRZs2ahYiICERFRaG0tLTZZZt6Tev/Tl944YXOLFUUEyZMQEREBLKyssQuxegx9FuppKQEjz76KBYvXowffvgBVVVViImJwdSpU9G/f3+UlJTg22+/xSOPPII5c+aIXW6bnDlzBhEREVi0aJHYpVAnmTdvHgBgx44d0Gg0zS6rVCrx888/GzzOVCxatAgRERE4c+aM2KV0mISEBFy9ehUAoFKp8NNPP4lcUdcSY8ewO5OLXYApqKiowIMPPoi0tDSEhITglVdewbBhwxosd+3aNaxfvx67d+8WoUqipk2dOhVvvPEGCgsLcfToUYwfP77JZQ8ePIiysjK4urp2eu/NM888g8cffxyenp6dup1bmdrf548//ggA8PLyQn5+Pn788UcsWbJE5KqMy/r166FSqeDl5SV2KUaPLf1WeP3115GWlgZ/f398//33jQY+AISHh+Nf//oXNmzY0MUVEjXPxsYGM2bMAIAWW0z198+aNQsWFhadWpenpydCQ0Ph4ODQqdu5VWhoKEJDQ7tse+1RW1uLX375BQDw73//G7a2trh27RoSEhJErsy4BAQEIDQ0tNM/r90BQ78FGRkZ2LVrFwDgxRdfhJOTU4uPiY6ObnBbS2NOLXVhnTp1CitWrMCoUaMQFRWF4cOH46mnnkJ8fHyjy6enp+PFF1/EhAkTEBUVhQEDBmD8+PFYtmwZtm7dql9u0aJFWLx4MQDg7Nmz+vHOjhyjz8vLw+uvv47Jkyejb9++GDhwIBYsWIDvv/++0a7mjz/+GBEREfj4448bXV9TwxG33l5bW4sPP/wQ06ZNQ79+/fTP5dZxzZqaGrz77ruYNGkSoqKiMHLkSDz//PPIz89v8rnk5+fjzTff1K93wIABmDt3LjZu3Ai1Wm2w7MKFCxEREaH//DTm888/R0REBP785z83uUxHqe+qP3ToEEpKShpdJj8/HydOnDBYvqSkBBs2bMDjjz+OCRMmIDo6GjExMZgzZw7Wrl0LhULR6LpuHTvfunUr7r//fgwcONDg76Cpz31VVRW2bNmCFStWYPLkyejfvz/69++Pu+++G++//z4qKioMlq9/78+ePQsAWLx4scFn+db1NzemX1ZWhvfeew8zZszQv79z5szB559/jrq6ugbL3/qZU6lUWLt2LWbMmIHo6GgMHToUK1aswI0bNxrdVmvs3bsXVVVVCA8Px7BhwzB9+nQAf7T+O1JCQgL+/Oc/G3zHLF++XP95aMqpU6ewatUqjBkzBlFRURg2bBjmzp2Ljz76yGD+gUqlws6dO/Hss89i6tSpiImJQXR0NKZMmYI33nijwd9dVlYWIiIisH37dgC6799b39Nbvx+a+36tra3F2rVrce+992LAgAHo168fZsyYgffffx/l5eUNlq/f7oQJEyAIAjZv3ow5c+agf//+GDhwIJYuXdrk964pYPd+Cw4dOgStVgsnJ6dmu0Q709tvv42vvvoKUqkUUVFRGDhwIHJzc3Hw4EEcOnQIr7/+OubOnatf/tq1a3jggQdQVVWF4OBgjB8/HlKpFPn5+Th37hzy8/P1y48ePRqWlpY4fvw43N3dMXr0aP16XFxc2l17QkICHn/8cZSVlcHHxwcTJ05EZWUlzp49i/j4eOzfvx+fffYZLC0t272tegqFAosWLcKNGzcwaNAg9OrVC2VlZQbLVFZWYsGCBcjNzcXAgQMRFhaGCxcuYMeOHTh37hx27tzZoPV57tw5PPXUUygvL4evry9GjBgBpVKJS5cu4fXXX8ehQ4ewZs0afWtj8eLFOHfuHDZu3IiZM2c2qFOr1eK7774DoNtB6GzR0dEIDw/HtWvX8NNPP+Hhhx9usMz27duh0WjQr18/hIWFAQCOHTuGf/7zn/Dy8kJgYKB+DsvFixfx7rvv4rfffsOGDRuafA9ff/11bNq0CQMGDMC4ceOQmZkJiUTSbK3Jycn4+9//DldXVwQHByMyMhIVFRW4fPky1qxZgz179mDz5s36z6i7uzvuvfdeHDt2DEVFRRg1ahQ8PDz06wsICGjx9cnMzMSSJUuQnZ0NV1dXjB07FiqVCmfOnME777yDPXv2YN26dY3u+KtUKixbtgzx8fEYNGgQQkNDkZCQgP379+PMmTPYvn07/Pz8WqzhdvXhXv/3OnfuXPz444/YvXs3XnrpJVhbW7d5nY3ZsmULXnnlFWi1WvTp0wdDhw5FdnY2Dh06hEOHDmHlypVYsWJFg8e98cYb+OabbwAAvXv3xqBBg1BZWYm0tDR88sknGDp0KIYOHQoAKC4uxnPPPQcHBweEhoYiIiICtbW1SEpKwjfffINffvkF33//PQIDAwEAtra2uPfeexEXF4eMjAzExMTo76vfXkvKysrw8MMPIykpCfb29hg2bBgsLCxw9uxZrFmzBrt27cLXX3/d5Hvz4osvYteuXRg4cCDGjRuHpKQknDhxQv933a9fvza/1qITqFn/93//J4SHhwtLlixp13rGjx8vhIeHC5mZmY3e//zzzwvh4eHC1q1bDW7fvHmzEB4eLkyaNElISkoyuO/s2bPCgAEDhMjISCEtLU1/+wsvvCCEh4cLn376aYPt1NbWCmfPnjW47fTp00J4eLiwcOHCO3puW7duFcLDw4Xx48cb3K5QKPTPe/Xq1YJSqdTfl5GRob/vvffeM3jcRx99JISHhwsfffRRo9trqt7628PDw4W7775bKCgoaLLW8PBwYenSpUJlZaX+vrKyMuGee+4RwsPDhTVr1hg8rqCgQBgyZIgQEREhfPvtt4JGo9HfV1JSIixevFgIDw8XPv74Y/3tarVa/xwTExMb1PLbb7/pa+0q69evF8LDw4WZM2c2ev/kyZOF8PBwYfPmzfrbrl+/LsTHxzdYtqysTFi6dKkQHh4ufP755w3ur3+dY2JiGn28IDT9uc/NzRVOnjxp8DoLgiDU1NQIzz33nBAeHi68+uqrDda3cOFCITw8XDh9+nSj27u1rtvNnz9fCA8PF
5YvXy5UV1frby8uLhbuvfdeITw8XHjmmWcMHnPrZ2727NkGn7m6ujr96/P3v/+9yXqakpqaKoSHhwuRkZFCcXGx/vapU6cK4eHhwvbt2xt9XFOvaf1n//nnnze4PTk5WejTp48QERHRYJ2HDx8WIiMjhfDwcOH48eMG923YsEEIDw8XhgwZIpw6dapBHRcvXhRycnL0v1dWVgoHDhwQFAqFwXJKpVJ49913hfDwcOHxxx9v9fO5VVPfr08//bQQHh4uzJ8/XygpKdHfXlVVJTz22GNCeHi4cP/99xs8JjMzU/+ejh8/XkhNTdXfp1arhRdffFH//WGK2L3fgvruKVdX10bvT05OxgsvvNDgJzY2tt3b1mq1+i6s9957D7169TK4f/DgwXjyySehUqmwefNm/e3FxcUAgLFjxzZYp7W1NQYPHtzu2lpjz549yM7OhqenJ/72t78ZjLf5+/vj+eefBwB88803TXYR36nVq1cbtPRuZ2trizfffBP29vb625ycnLBs2TIAwMmTJw2W//r
"text/plain": [
"<Figure size 800x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Ensure Seaborn styling\n",
"sns.set_theme(style=\"whitegrid\")\n",
"\n",
"# Calculate the total guest_journeys_count per variation\n",
"grouped_data = df.groupby('variation')['guest_journeys_count'].sum()\n",
"\n",
"# Find the total count and other metadata\n",
"total_count = grouped_data.sum()\n",
"ab_test_name = df['ab_test_name'].iloc[0] # Assuming all rows are for the same A/B test\n",
"last_update = df['last_update'].max()\n",
"\n",
"# Create a pie chart using Seaborn styling\n",
"plt.figure(figsize=(8, 6))\n",
"colors = sns.color_palette(\"pastel\") # Seaborn pastel colors\n",
"\n",
"# Pie chart with labels inside each sector\n",
"plt.pie(\n",
" grouped_data, \n",
" labels=[f\"{var}\\n{count} ({count/total_count:.1%})\" for var, count in grouped_data.items()],\n",
" autopct=None, \n",
" colors=colors, \n",
" startangle=90,\n",
" wedgeprops={'edgecolor': 'none'}, # Remove edges around sectors\n",
" pctdistance=0.70, # Places the labels closer to the center (inside)\n",
" labeldistance=0.2 # Ensure labels are positioned inside the sectors\n",
")\n",
"\n",
"# Add title\n",
"plt.title(\"Guest Journey - Variation Allocation\", fontsize=16)\n",
"\n",
"# Add total count to the bottom-left\n",
"plt.text(-1.4, -1.3, f\"Total Count: {total_count}\", fontsize=10, ha='left', color='black')\n",
"\n",
"# Add A/B test name and last update to the bottom-right\n",
"plt.text(1.2, -1.3, f\"A/B Test: {ab_test_name}\", fontsize=8, ha='right', color='gray')\n",
"plt.text(1.2, -1.4, f\"Last Update: {last_update}\", fontsize=8, ha='right', color='gray')\n",
"\n",
"plt.show()"
]
},
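{
"cell_type": "markdown",
"metadata": {},
"source": [
"Beyond eyeballing the pie chart, a quick sample-ratio-mismatch (SRM) check can flag a broken allocation. The cell below is an optional sketch, assuming the intended split is uniform across variations (adjust the expected counts if the test uses different weights); it reuses `grouped_data` from above and the chi-square goodness-of-fit test from `scipy.stats`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional SRM check: compare observed allocation against a uniform expected split.\n",
"# ASSUMPTION: the test was configured for an even split across variations.\n",
"observed = grouped_data.values.astype(float)\n",
"expected = np.full_like(observed, observed.sum() / len(observed))\n",
"\n",
"chi2_stat, srm_p_value = stats.chisquare(f_obs=observed, f_exp=expected)\n",
"print(f\"SRM check: chi2={chi2_stat:.3f}, p-value={srm_p_value:.4f}\")\n",
"if srm_p_value < 0.01:\n",
"    print(\"WARNING: possible sample ratio mismatch - check the allocation mechanism.\")"
]
},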
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Statistical Analysis\n",
"In this section we compute the metrics needed for monitoring as well as check if there's any statistical difference between the different variations."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Z-test for Proportion Metrics (Rates)\n",
"This section defines the functions used to compute Z-test Proportion analysis"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"# Generalized function to calculate Z-test for any metric\n",
"def calculate_z_test(df, metric_name, variation_a, variation_b, success_counts, total_counts):\n",
"\n",
" # Aggregate the success counts (numerator) and total counts (denominator) for each variation\n",
" success_a = df[df['variation'] == variation_a][success_counts].sum()\n",
" success_b = df[df['variation'] == variation_b][success_counts].sum()\n",
"\n",
" total_a = df[df['variation'] == variation_a][total_counts].sum()\n",
" total_b = df[df['variation'] == variation_b][total_counts].sum()\n",
"\n",
" # Calculate conversion rates for each variation\n",
" value_A = success_a / total_a if total_a != 0 else 0\n",
" value_B = success_b / total_b if total_b != 0 else 0\n",
"\n",
" # Absolute difference (B - A)\n",
" abs_diff = value_B - value_A\n",
"\n",
" # Relative difference (B - A) / A\n",
" rel_diff = (value_B - value_A) / value_A if value_A != 0 else 0\n",
"\n",
" # Perform the z-test for proportions\n",
" count = [success_a, success_b] # Success counts for A and B\n",
" nobs = [total_a, total_b] # Total counts for A and B\n",
" \n",
" # Calculate z-stat and p-value\n",
" z_stat, p_value = proportions_ztest(count, nobs)\n",
" \n",
" # Flag for significance at 95% level (p-value < 0.05)\n",
" is_significant = p_value < 0.05\n",
"\n",
" # Return the result as a dictionary\n",
" return {\n",
" 'metric': metric_name,\n",
" 'variation_A_name': variation_a,\n",
" 'variation_B_name': variation_b,\n",
" 'variation_A_value': value_A,\n",
" 'variation_B_value': value_B,\n",
" 'absolute_difference': abs_diff,\n",
" 'relative_difference': rel_diff,\n",
" 'statistic': z_stat,\n",
" 'p_value': p_value,\n",
" 'is_significant_95': is_significant\n",
" }\n",
"\n",
"# Function to run Z-tests for multiple metrics and aggregate results into a DataFrame\n",
"def run_z_tests(df, z_stat_metric_definition, variations):\n",
" results = []\n",
" \n",
" # Loop over all metrics in z_stat_metric_definition\n",
" for metric_name, metric_definition in z_stat_metric_definition.items():\n",
" success_counts = metric_definition['success_counts']\n",
" total_counts = metric_definition['total_counts']\n",
" \n",
" # Run the Z-test for each metric\n",
" result = calculate_z_test(df, metric_name, variation_a=variations[0], variation_b=variations[1], \n",
" success_counts=success_counts, total_counts=total_counts)\n",
" results.append(result)\n",
" \n",
" # Create a DataFrame from the results\n",
" results_df = pd.DataFrame(results)\n",
" \n",
" return results_df"
]
},
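{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check of the helper above, the toy example below calls `proportions_ztest` directly on made-up counts (hypothetical numbers, not DWH data): a four-point conversion gap at 1,000 journeys per arm should come out significant."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Toy sanity check for the two-proportion Z-test (made-up counts, not DWH data)\n",
"toy_successes = [120, 160]  # e.g. completed journeys in A and B\n",
"toy_totals = [1000, 1000]   # journeys allocated to A and B\n",
"\n",
"toy_z, toy_p = proportions_ztest(toy_successes, toy_totals)\n",
"print(f\"rates: A={toy_successes[0]/toy_totals[0]:.1%} vs. B={toy_successes[1]/toy_totals[1]:.1%}\")\n",
"print(f\"z={toy_z:.3f}, p={toy_p:.4f}, significant at 95%: {toy_p < 0.05}\")"
]
},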
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### T-test for non-proportion metrics\n",
"This section defines the functions used to compute T-tests for metrics outside of the proportion scope, mostly Revenue-related metrics."
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Generalized function to calculate T-test for revenue-related metrics\n",
"def calculate_t_test(df, metric_name, variation_a, variation_b, metric_avg_column, metric_sdv_column, total_counts):\n",
" # Aggregate the avgs and standard deviations for each variation\n",
" mean_a = df[df['variation'] == variation_a][metric_avg_column].mean() # Assuming the avg is calculated for each group\n",
" mean_b = df[df['variation'] == variation_b][metric_avg_column].mean() # Assuming the avg is calculated for each group\n",
" \n",
" sdv_a = df[df['variation'] == variation_a][metric_sdv_column].mean() # Assuming the stddev is calculated for each group\n",
" sdv_b = df[df['variation'] == variation_b][metric_sdv_column].mean() # Assuming the stddev is calculated for each group\n",
" \n",
" total_a = df[df['variation'] == variation_a][total_counts].sum()\n",
" total_b = df[df['variation'] == variation_b][total_counts].sum()\n",
"\n",
" # Absolute difference (B - A)\n",
" abs_diff = mean_b - mean_a\n",
"\n",
" # Relative difference (B - A) / A\n",
" rel_diff = (mean_b - mean_a) / mean_a if mean_a != 0 else 0\n",
"\n",
" # Calculate the T-statistic and p-value using the formula for two-sample T-test\n",
" se_a = sdv_a / (total_a ** 0.5) if total_a != 0 else 0\n",
" se_b = sdv_b / (total_b ** 0.5) if total_b != 0 else 0\n",
"\n",
" # Standard error of the difference between the means\n",
" se_diff = (se_a ** 2 + se_b ** 2) ** 0.5\n",
" \n",
" # T-statistic formula\n",
" if se_diff != 0:\n",
" t_stat = (mean_a - mean_b) / se_diff\n",
" else:\n",
" t_stat = 0\n",
" \n",
" # Degrees of freedom (for independent samples)\n",
" df_degrees = min(total_a - 1, total_b - 1) # Using the smaller of the two sample sizes minus 1\n",
" \n",
" # P-value from the T-distribution\n",
" p_value = stats.t.sf(abs(t_stat), df_degrees) * 2 # Two-tailed test\n",
" \n",
" # Flag for significance at 95% level (p-value < 0.05)\n",
" is_significant = p_value < 0.05\n",
"\n",
" # Return the result as a dictionary\n",
" return {\n",
" 'metric': metric_name,\n",
" 'variation_A_name': variation_a,\n",
" 'variation_B_name': variation_b,\n",
" 'variation_A_value': mean_a,\n",
" 'variation_B_value': mean_b,\n",
" 'absolute_difference': abs_diff,\n",
" 'relative_difference': rel_diff,\n",
" 'statistic': t_stat,\n",
" 'p_value': p_value,\n",
" 'is_significant_95': is_significant\n",
" }\n",
"\n",
"# Function to run T-tests for multiple revenue metrics and aggregate results into a DataFrame\n",
"def run_t_tests(df, t_stat_metric_definition, variations):\n",
" results = []\n",
" \n",
" # Loop over all metrics in t_stat_metric_definition\n",
" for metric_name, metric_definition in t_stat_metric_definition.items():\n",
" metric_avg_column = metric_definition['metric_avg_column']\n",
" metric_sdv_column = metric_definition['metric_sdv_column']\n",
" total_counts = metric_definition['total_counts']\n",
" \n",
" # Run the T-test for each metric\n",
" result = calculate_t_test(df, metric_name, variation_a=variations[0], variation_b=variations[1], \n",
" metric_avg_column=metric_avg_column, metric_sdv_column=metric_sdv_column, \n",
" total_counts=total_counts)\n",
" results.append(result)\n",
" \n",
" # Create a DataFrame from the results\n",
" results_df = pd.DataFrame(results)\n",
" \n",
" return results_df"
]
},
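{
"cell_type": "markdown",
"metadata": {},
"source": [
"The helper above assembles the T-statistic by hand from the pre-aggregated means and standard deviations. As a cross-check, `scipy.stats.ttest_ind_from_stats` computes a two-sample test from the same kind of summary statistics; the numbers below are illustrative (taken from the sample output earlier in this notebook). With `equal_var=False` (Welch's test) the statistic should match the hand-rolled one, while the p-value differs slightly because our helper uses a more conservative degrees-of-freedom choice."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Cross-check the hand-rolled T-test against scipy (illustrative summary stats)\n",
"toy_mean_a, toy_sdv_a, toy_n_a = 5.8, 13.4, 18\n",
"toy_mean_b, toy_sdv_b, toy_n_b = 5.9, 13.0, 24\n",
"\n",
"toy_t, toy_p = stats.ttest_ind_from_stats(\n",
"    mean1=toy_mean_b, std1=toy_sdv_b, nobs1=toy_n_b,\n",
"    mean2=toy_mean_a, std2=toy_sdv_a, nobs2=toy_n_a,\n",
"    equal_var=False  # Welch's T-test on unequal variances\n",
")\n",
"print(f\"scipy Welch T-test: t={toy_t:.4f}, p={toy_p:.4f}\")"
]
},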
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Specify the metric definition for Z-stat and T-stat tests"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"# Define the variations in which we want to run the tests\n",
"var_A = 'Fixed'\n",
"var_B = 'Relative'\n",
"variations = [var_A, var_B]\n",
"\n",
"# Define the Z-test metric definitions (with both success_counts and total_counts)\n",
"z_stat_metric_definition = {\n",
" 'conversion_rate': {\n",
" 'success_counts': 'guest_journey_completed_count',\n",
" 'total_counts': 'guest_journeys_count'\n",
" },\n",
" 'payment_rate': {\n",
" 'success_counts': 'guest_journey_with_payment_count',\n",
" 'total_counts': 'guest_journeys_count'\n",
" },\n",
" 'waiver_payment_rate': {\n",
" 'success_counts': 'waiver_count',\n",
" 'total_counts': 'guest_journeys_count'\n",
" },\n",
" 'deposit_payment_rate': {\n",
" 'success_counts': 'deposit_count',\n",
" 'total_counts': 'guest_journeys_count'\n",
" },\n",
" 'CIH_payment_rate': {\n",
" 'success_counts': 'check_in_cover_count',\n",
" 'total_counts': 'guest_journeys_count'\n",
" }\n",
"}\n",
"\n",
"# Define the T-test metric definitions (with both metric_avg_column and metric_sdv_column)\n",
"t_stat_metric_definition = {\n",
" 'avg_guest_revenue_per_gj': {\n",
" 'metric_avg_column': 'guest_revenue_avg_per_guest_journey',\n",
" 'metric_sdv_column': 'guest_revenue_sdv_per_guest_journey',\n",
" 'total_counts': 'guest_journeys_count'\n",
" },\n",
" 'avg_waiver_revenue_per_gj': {\n",
" 'metric_avg_column': 'waiver_avg_per_guest_journey',\n",
" 'metric_sdv_column': 'waiver_sdv_per_guest_journey',\n",
" 'total_counts': 'guest_journeys_count'\n",
" },\n",
" 'avg_deposit_revenue_per_gj': {\n",
" 'metric_avg_column': 'deposit_avg_per_guest_journey',\n",
" 'metric_sdv_column': 'deposit_sdv_per_guest_journey',\n",
" 'total_counts': 'guest_journeys_count'\n",
" },\n",
" 'avg_CIH_revenue_per_gj': {\n",
" 'metric_avg_column': 'check_in_cover_avg_per_guest_journey',\n",
" 'metric_sdv_column': 'check_in_cover_sdv_per_guest_journey',\n",
" 'total_counts': 'guest_journeys_count'\n",
" },\n",
" 'avg_csat_per_gj_with_response': {\n",
" 'metric_avg_column': 'csat_avg_per_guest_journey_with_response',\n",
" 'metric_sdv_column': 'csat_sdv_per_guest_journey_with_response',\n",
" 'total_counts': 'guest_journey_with_responses_count'\n",
" }\n",
"\n",
"}\n",
"\n",
"# Define the metrics that will be the main ones for this A/B test:\n",
"main_metrics = ['avg_guest_revenue_per_gj', 'conversion_rate', 'payment_rate']"
]
},
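{
"cell_type": "markdown",
"metadata": {},
"source": [
"The definitions above double as a template: adding a metric is just a new entry pointing at columns already selected in the extraction query. For example, a hypothetical started-rate metric could be registered like this (not run here):\n",
"\n",
"```python\n",
"# Hypothetical extra Z-test metric, reusing columns already present in df\n",
"z_stat_metric_definition['started_rate'] = {\n",
"    'success_counts': 'guest_journey_started_count',\n",
"    'total_counts': 'guest_journeys_count'\n",
"}\n",
"```"
]
},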
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Run the computation of the metrics and statistical significance"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" metric relative_difference p_value\n",
"0 conversion_rate 1.500000 0.082857\n",
"1 payment_rate -0.062500 0.913554\n",
"2 waiver_payment_rate 0.000000 1.000000\n",
"3 deposit_payment_rate -0.250000 0.834319\n",
"4 CIH_payment_rate -1.000000 0.242526\n",
"5 avg_guest_revenue_per_gj 0.017580 0.980584\n",
"6 avg_waiver_revenue_per_gj 0.148857 0.851687\n",
"7 avg_deposit_revenue_per_gj -0.288560 0.816919\n",
"8 avg_CIH_revenue_per_gj -1.000000 0.331333\n",
"9 avg_csat_per_gj_with_response -0.250000 0.608173\n"
]
}
],
"source": [
"# Call the function to calculate the Z-test for each metric and aggregate the results\n",
"z_test_results_df = run_z_tests(df, z_stat_metric_definition=z_stat_metric_definition, variations=variations)\n",
"\n",
"# Call the function to calculate the T-test for each metric and aggregate the results\n",
"t_test_results_df = run_t_tests(df, t_stat_metric_definition=t_stat_metric_definition, variations=variations)\n",
"\n",
"# Add a new column to identify whether it's from Z-test or T-test\n",
"z_test_results_df['test_type'] = 'Z-test'\n",
"t_test_results_df['test_type'] = 'T-test'\n",
"\n",
"# Combine the dataframes after adding the 'test_type' column\n",
"combined_results_df = pd.concat([z_test_results_df, t_test_results_df], ignore_index=True)\n",
"\n",
"# Print the main aggregated DataFrame\n",
"print(combined_results_df[['metric','relative_difference','p_value']])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Results\n"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"VerificationProductSelectionButtonPosition results (last updated at 2024-12-11)\n",
"\n",
"Total Guest Journeys affected by this A/B test: 42 - Total Guest Revenue: 246 GBP.\n",
" Variation Fixed: Guest Journeys 18 (42.9%) - Guest Revenue: 104 GBP (42.4%).\n",
" Variation Relative: Guest Journeys 24 (57.1%) - Guest Revenue: 141 GBP (57.6%).\n",
"\n",
"Main Metrics - Comparing Relative vs. Fixed.\n",
"\n",
"CONVERSION RATE (not significant): 41.7% vs. 16.7% (25.0% | 150.0%).\n",
"PAYMENT RATE (not significant): 20.8% vs. 22.2% (-1.4% | -6.2%).\n",
"AVG GUEST REVENUE PER GJ (not significant): 5.9 vs. 5.8 (0.1 | 1.8%).\n",
"\n",
"Other Metrics\n",
"\n",
"WAIVER PAYMENT RATE (not significant): 16.7% vs. 16.7% (0.0% | 0.0%).\n",
"DEPOSIT PAYMENT RATE (not significant): 4.2% vs. 5.6% (-1.4% | -25.0%).\n",
"CIH PAYMENT RATE (not significant): 0.0% vs. 5.6% (-5.6% | -100.0%).\n",
"AVG WAIVER REVENUE PER GJ (not significant): 5.64 vs. 4.91 (0.73 | 14.9%).\n",
"AVG DEPOSIT REVENUE PER GJ (not significant): 0.26 vs. 0.37 (-0.11 | -28.9%).\n",
"AVG CIH REVENUE PER GJ (not significant): 0.0 vs. 0.52 (-0.52 | -100.0%).\n",
"AVG CSAT PER GJ WITH RESPONSE (not significant): 3.0 vs. 4.0 (-1.0 | -25.0%).\n"
]
}
],
"source": [
"print('\\n{} results (last updated at {})\\n'.format(ab_test_name, last_update))\n",
"\n",
"# Get main volume indicators per variation\n",
"grouped_data = df.groupby('variation')[[\"guest_journeys_count\",\"guest_revenue_sum\"]].sum()\n",
"\n",
"# Find the totals over any variation\n",
"total_count = grouped_data.sum()\n",
"\n",
"# Print overall indicators for volumes\n",
"print('Total Guest Journeys affected by this A/B test: {} - Total Guest Revenue: {} GBP.'.format(int(total_count.loc[\"guest_journeys_count\"]), \n",
" int(total_count.loc[\"guest_revenue_sum\"])))\n",
"for var in variations:\n",
" print(' Variation {}: Guest Journeys {} ({}%) - Guest Revenue: {} GBP ({}%).'.format(\n",
" var, \n",
" int(grouped_data.loc[var,'guest_journeys_count']), \n",
" round(100*(grouped_data.loc[var,'guest_journeys_count']/total_count.loc[\"guest_journeys_count\"]),1),\n",
" int(grouped_data.loc[var,'guest_revenue_sum']),\n",
" round(100*(grouped_data.loc[var,'guest_revenue_sum']/total_count.loc[\"guest_revenue_sum\"]),1)\n",
" ))\n",
"\n",
"# Split results whether the metrics are main metrics or not\n",
"main_metrics_rows = combined_results_df[combined_results_df['metric'].isin(main_metrics)]\n",
"other_metrics_rows = combined_results_df[~combined_results_df['metric'].isin(main_metrics)]\n",
"\n",
"def print_metrics(df, header=None):\n",
" if header:\n",
" print(f'\\n{header}\\n')\n",
"\n",
" for row in df.iterrows():\n",
" metric = row[1]['metric'].upper().replace('_', ' ')\n",
" if row[1]['test_type'] == 'Z-test':\n",
" value_a = str(round(100 * row[1]['variation_A_value'], 1)) + '%'\n",
" value_b = str(round(100 * row[1]['variation_B_value'], 1)) + '%'\n",
" abs_diff = str(round(100 * row[1]['absolute_difference'], 1)) + '%'\n",
" else:\n",
" value_a = str(round(row[1]['variation_A_value'], 2))\n",
" value_b = str(round(row[1]['variation_B_value'], 2))\n",
" abs_diff = str(round(row[1]['absolute_difference'], 2))\n",
" rel_diff = str(round(100 * row[1]['relative_difference'], 1)) + '%'\n",
" stat_sign = row[1]['is_significant_95']\n",
"\n",
" if stat_sign:\n",
" print(f\"{metric} - SIGNIFICANT RESULT: {value_b} vs. {value_a} ({abs_diff} | {rel_diff}).\")\n",
" else:\n",
" print(f\"{metric} (not significant): {value_b} vs. {value_a} ({abs_diff} | {rel_diff}).\")\n",
"\n",
"# Print main metrics\n",
"print_metrics(main_metrics_rows, header=\"Main Metrics - Comparing {} vs. {}.\".format(var_B, var_A))\n",
"\n",
"# Print other metrics\n",
"print_metrics(other_metrics_rows, header=\"Other Metrics\")\n"
]
},
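{
"cell_type": "markdown",
"metadata": {},
"source": [
"None of the metrics is significant yet, which raises the usual monitoring question: how many guest journeys would be needed to detect a difference of a given size? The optional sketch below estimates the required sample size per variation for a proportion metric using statsmodels' power utilities; the baseline and target rates are illustrative assumptions, not figures from the data above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Rough sample-size estimate for a proportion metric (illustrative rates)\n",
"from statsmodels.stats.power import NormalIndPower\n",
"from statsmodels.stats.proportion import proportion_effectsize\n",
"\n",
"baseline_rate = 0.17  # e.g. a conversion rate close to the Fixed variation's\n",
"target_rate = 0.25    # smallest lift worth detecting\n",
"\n",
"effect_size = proportion_effectsize(target_rate, baseline_rate)\n",
"n_per_variation = NormalIndPower().solve_power(\n",
"    effect_size=effect_size, alpha=0.05, power=0.8, ratio=1.0, alternative='two-sided'\n",
")\n",
"print(f\"~{n_per_variation:.0f} guest journeys per variation needed to detect \"\n",
"      f\"{baseline_rate:.0%} -> {target_rate:.0%} at 80% power\")"
]
},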
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}