722 lines
73 KiB
Text
722 lines
73 KiB
Text
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# A/B test monitoring\n",
|
|
"\n",
|
|
"## Initial setup\n",
|
|
"This first section just ensures that the connection to DWH works correctly."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 37,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pathlib\n",
|
|
"import yaml\n",
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"from sqlalchemy import create_engine\n",
|
|
"import seaborn as sns\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"from statsmodels.stats.proportion import proportions_ztest\n",
|
|
"from scipy import stats\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 38,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/home/joaquin/.superhog-dwh/credentials.yml\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
# Location of the DWH credentials file, resolved against the current user's home directory
CREDS_FILEPATH = pathlib.Path.home().joinpath(".superhog-dwh", "credentials.yml")
print(CREDS_FILEPATH)
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 39,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
# Prepare connection to DWH
# Credentials loader: one YAML file holds the settings of every environment
def read_credentials(yaml_path: str, env: str = "prd"):
    """Return the credentials stored for one environment in a YAML file.

    The file is parsed with ``yaml.safe_load`` and the entry found under
    ``envs -> <env>`` is returned as parsed.

    Parameters:
        yaml_path: location of the credentials file (anything ``open`` accepts).
        env: key to look up under the top-level ``envs`` entry; defaults to "prd".

    Returns:
        Whatever is stored under ``envs[env]`` in the parsed document.
    """
    with open(yaml_path, "r") as handle:
        parsed = yaml.safe_load(handle)
    per_environment = parsed["envs"]
    return per_environment[env]
|
# Function to create a SQLAlchemy engine for a PostgreSQL database
def create_postgres_engine(creds: dict):
    """Build a SQLAlchemy engine from a credentials mapping.

    Parameters:
        creds: mapping with the keys "user", "password", "host", "port" and
            "database". User and password must be stored raw (not already
            percent-encoded), because they are encoded here.

    Returns:
        The engine produced by ``create_engine`` for the assembled
        ``postgresql://`` URL.

    Raises:
        KeyError: if any of the expected keys is missing from ``creds``.
    """
    # Local stdlib import so the function does not depend on the notebook's import cell
    from urllib.parse import quote

    # Characters such as "@", ":", "/" or "%" inside the user name or password would
    # otherwise be read as URL delimiters/escapes and corrupt the connection string.
    # quote(..., safe="") is used rather than quote_plus so that spaces become "%20"
    # and a literal "+" becomes "%2B".
    user = quote(str(creds["user"]), safe="")
    password = quote(str(creds["password"]), safe="")
    host = creds["host"]
    port = creds["port"]
    database = creds["database"]

    # Create the connection string for SQLAlchemy
    connection_string = f"postgresql://{user}:{password}@{host}:{port}/{database}"
    return create_engine(connection_string)
|
# Function to execute a query and return the result as a pandas DataFrame
def query_to_dataframe(engine, query: str, params=None):
    """Run ``query`` on a connection obtained from ``engine`` and return a DataFrame.

    Parameters:
        engine: object whose ``connect()`` returns a context manager yielding a
            connection that ``pandas.read_sql`` can use.
        query: SQL text to execute.
        params: optional bind parameters forwarded unchanged to
            ``pandas.read_sql``. The placeholder syntax depends on the database
            driver in use. Defaults to None (no parameters), which matches the
            previous two-argument behaviour.

    Returns:
        The DataFrame produced by ``pandas.read_sql``.
    """
    # The with-block scopes the connection to this single read
    with engine.connect() as connection:
        return pd.read_sql(query, connection, params=params)
|
# Load the "prd" credentials and build the engine that the queries below run against
dwh_creds = read_credentials(CREDS_FILEPATH, "prd")
dwh_pg_engine = create_postgres_engine(dwh_creds)
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 40,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
" ?column?\n",
|
|
"0 1\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
# Connectivity smoke test: run a constant SELECT and show what comes back
test_df = query_to_dataframe(dwh_pg_engine, "SELECT 1;")
print(test_df.head())
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## A/B test configuration\n",
|
|
"In this section we configure the parameters for the A/B test. You most likely do NOT need to change anything other than this section, unless you want to add new metrics or otherwise extend the analysis.\n",
|
|
"\n",
|
|
"The parameters to be specified are:\n",
|
|
"* **ab_test_name**: this should be the name of the feature flag corresponding to the A/B test. If you don't know the name, ask Guest Squad\n",
|
|
"* **var_A** and **var_B**: these correspond to the names of the two variants. At the moment the notebook can only compare exactly two variants (though updating the code to handle more than two should not be extremely difficult). In general, choose var_A to be the Control group."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
# Name of the A/B test to measure (for example "AAVariantTest" was used previously)
ab_test_name = "WelcomePageDestinationContext"

# Variations to compare: ideally A is the control group and B is the study group
var_A, var_B = 'GenericImageAndCopy', 'ContextSpecificImageAndCopy'

# Ordered pair handed to the test runners (index 0 is used as A, index 1 as B)
variations = [var_A, var_B]
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Data Extraction\n",
|
|
"In this section we extract the data from the Guest Journey monitoring within DWH by configuring which A/B test we want to measure. Here we already handle the basic aggregations that will be needed in the future, directly in SQL."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
" ab_test_name variation last_update guest_journeys_count \\\n",
|
|
"0 ShowNewIllustrations NewIllustrations 2025-04-16 20916 \n",
|
|
"1 ShowNewIllustrations OldIllustrations 2025-04-16 21018 \n",
|
|
"\n",
|
|
" guest_journey_started_count guest_journey_completed_count \\\n",
|
|
"0 20916 16155 \n",
|
|
"1 21018 16187 \n",
|
|
"\n",
|
|
" guest_journey_with_responses_count guest_journey_with_payment_count \\\n",
|
|
"0 5820 8677 \n",
|
|
"1 6084 8580 \n",
|
|
"\n",
|
|
" guest_revenue_count deposit_count ... \\\n",
|
|
"0 8677 1439 ... \n",
|
|
"1 8580 1393 ... \n",
|
|
"\n",
|
|
" guest_revenue_avg_per_guest_journey guest_revenue_sdv_per_guest_journey \\\n",
|
|
"0 10.703579 16.931697 \n",
|
|
"1 10.488324 16.978108 \n",
|
|
"\n",
|
|
" deposit_avg_per_guest_journey deposit_sdv_per_guest_journey \\\n",
|
|
"0 0.488697 2.369654 \n",
|
|
"1 0.462561 1.947974 \n",
|
|
"\n",
|
|
" waiver_avg_per_guest_journey waiver_sdv_per_guest_journey \\\n",
|
|
"0 10.042173 16.872259 \n",
|
|
"1 9.869315 16.968309 \n",
|
|
"\n",
|
|
" check_in_cover_avg_per_guest_journey check_in_cover_sdv_per_guest_journey \\\n",
|
|
"0 0.172710 1.222131 \n",
|
|
"1 0.156449 1.162772 \n",
|
|
"\n",
|
|
" csat_avg_per_guest_journey_with_response \\\n",
|
|
"0 3.778179 \n",
|
|
"1 3.797337 \n",
|
|
"\n",
|
|
" csat_sdv_per_guest_journey_with_response \n",
|
|
"0 1.018528 \n",
|
|
"1 1.025494 \n",
|
|
"\n",
|
|
"[2 rows x 26 columns]\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
# Lower bound applied to first_appearance_at_utc: rows first seen before this
# timestamp are excluded from every aggregate below.
# NOTE(review): presumably the go-live time of the configured A/B test - confirm,
# and update it whenever ab_test_name changes.
first_appearance_from_utc = '2025-04-23 12:50:00'

# Query to extract data: one row per (ab_test_name, variation), aggregated in SQL
data_extraction_query = """
select
    ab_test_name,
    variation,
    max(first_appearance_date_utc) as last_update,

    -- SIMPLE COUNTS --
    count(id_verification_request) as guest_journeys_count,
    count(verification_started_date_utc) as guest_journey_started_count,
    count(verification_completed_date_utc) as guest_journey_completed_count,
    count(experience_rating) as guest_journey_with_responses_count,
    count(last_payment_paid_date_utc) as guest_journey_with_payment_count,
    count(guest_revenue_without_taxes_in_gbp) as guest_revenue_count,
    count(deposit_fees_without_taxes_in_gbp) as deposit_count,
    count(waiver_fees_without_taxes_in_gbp) as waiver_count,
    count(check_in_cover_fees_without_taxes_in_gbp) as check_in_cover_count,

    -- SIMPLE SUMS --
    sum(guest_revenue_without_taxes_in_gbp) as guest_revenue_sum,
    sum(deposit_fees_without_taxes_in_gbp) as deposit_sum,
    sum(waiver_fees_without_taxes_in_gbp) as waiver_sum,
    sum(check_in_cover_fees_without_taxes_in_gbp) as check_in_cover_sum,

    -- AVGs/SDVs PER GUEST JOURNEY (ANY GJ APPEARING IN THE A/B TEST) --
    -- NOTE THE COALESCE HERE. THIS IS IMPORTANT FOR THE T-TEST COMPUTATION --
    avg(coalesce(guest_revenue_without_taxes_in_gbp,0)) as guest_revenue_avg_per_guest_journey,
    stddev(coalesce(guest_revenue_without_taxes_in_gbp,0)) as guest_revenue_sdv_per_guest_journey,
    avg(coalesce(deposit_fees_without_taxes_in_gbp,0)) as deposit_avg_per_guest_journey,
    stddev(coalesce(deposit_fees_without_taxes_in_gbp,0)) as deposit_sdv_per_guest_journey,
    avg(coalesce(waiver_fees_without_taxes_in_gbp,0)) as waiver_avg_per_guest_journey,
    stddev(coalesce(waiver_fees_without_taxes_in_gbp,0)) as waiver_sdv_per_guest_journey,
    avg(coalesce(check_in_cover_fees_without_taxes_in_gbp,0)) as check_in_cover_avg_per_guest_journey,
    stddev(coalesce(check_in_cover_fees_without_taxes_in_gbp,0)) as check_in_cover_sdv_per_guest_journey,

    -- AVGs/SDVs PER GUEST JOURNEY WITH CSAT RESPONSE --
    -- NOTE THAT THERE'S NO COALESCE HERE. THIS IS IMPORTANT FOR THE T-TEST COMPUTATION --
    avg(experience_rating) as csat_avg_per_guest_journey_with_response,
    stddev(experience_rating) as csat_sdv_per_guest_journey_with_response

from
    intermediate.int_core__ab_test_monitoring_guest_journey
where
    ab_test_name = '{ab_test_name}'
    and first_appearance_at_utc >= '{first_appearance_from_utc}'
group by
    1,2
""".format(ab_test_name=ab_test_name, first_appearance_from_utc=first_appearance_from_utc)

# Retrieve Data from Query
df = query_to_dataframe(engine=dwh_pg_engine, query=data_extraction_query)
print(df.head())
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Check A/B test Allocation to Variation"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 43,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"image/png": "",
|
|
"text/plain": [
|
|
"<Figure size 800x600 with 1 Axes>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
# Ensure Seaborn styling
sns.set_theme(style="whitegrid")

# Guest journeys per variation, their grand total, and the metadata shown in the footer
grouped_data = df.groupby('variation')['guest_journeys_count'].sum()
total_count = grouped_data.sum()
ab_test_name = df['ab_test_name'].iloc[0]  # Assuming all rows are for the same A/B test
last_update = df['last_update'].max()

# Draw on an explicit figure/axes pair, using the Seaborn pastel palette
fig, ax = plt.subplots(figsize=(8, 6))
colors = sns.color_palette("pastel")

# One sector per variation; each label carries the name, the count and the share
ax.pie(
    grouped_data,
    labels=[f"{var}\n{count} ({count/total_count:.1%})" for var, count in grouped_data.items()],
    autopct=None,
    colors=colors,
    startangle=90,
    wedgeprops={'edgecolor': 'none'},  # no outline around the sectors
    pctdistance=0.70,
    labeldistance=0.2,  # small radius so the labels sit inside the sectors
)

ax.set_title("Guest Journey - Variation Allocation", fontsize=16)

# Footer: total count on the bottom-left, test name and last update on the bottom-right
ax.text(-1.4, -1.3, f"Total Count: {total_count}", fontsize=10, ha='left', color='black')
ax.text(1.2, -1.3, f"A/B Test: {ab_test_name}", fontsize=8, ha='right', color='gray')
ax.text(1.2, -1.4, f"Last Update: {last_update}", fontsize=8, ha='right', color='gray')

plt.show()
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Statistical Analysis\n",
|
|
"In this section we compute the metrics needed for monitoring and check whether there is any statistically significant difference between the variations."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Z-test for Proportion Metrics (Rates)\n",
|
|
"This section defines the functions used to run the two-sample Z-test for proportions."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 44,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
# Generalized function to calculate Z-test for any metric
def calculate_z_test(df, metric_name, variation_a, variation_b, success_counts, total_counts):
    """Compare the rate of two variations with a two-sample Z-test for proportions.

    Parameters:
        df: DataFrame with a 'variation' column plus the two count columns.
        metric_name: label copied into the result under 'metric'.
        variation_a, variation_b: values of df['variation'] identifying each group.
        success_counts: name of the numerator column (summed per variation).
        total_counts: name of the denominator column (summed per variation).

    Returns:
        dict with the keys 'metric', 'variation_A_name', 'variation_B_name',
        'variation_A_value', 'variation_B_value', 'absolute_difference' (B - A),
        'relative_difference' ((B - A) / A, or 0 when A's rate is 0),
        'statistic', 'p_value' and 'is_significant_95' (p_value < 0.05).
        When either variation has a zero denominator (for example because it
        has no rows in df), 'statistic' and 'p_value' are NaN and
        'is_significant_95' is False.
    """
    # Filter each variation once instead of re-scanning the frame for every column
    rows_a = df[df['variation'] == variation_a]
    rows_b = df[df['variation'] == variation_b]

    # Aggregate the success counts (numerator) and total counts (denominator)
    success_a = rows_a[success_counts].sum()
    success_b = rows_b[success_counts].sum()
    total_a = rows_a[total_counts].sum()
    total_b = rows_b[total_counts].sum()

    # Rates per variation; a variation without observations is reported as 0
    value_A = success_a / total_a if total_a != 0 else 0
    value_B = success_b / total_b if total_b != 0 else 0

    # Absolute difference (B - A) and relative difference (B - A) / A
    abs_diff = value_B - value_A
    rel_diff = (value_B - value_A) / value_A if value_A != 0 else 0

    if total_a == 0 or total_b == 0:
        # No observations in at least one group: the test is undefined, so report
        # NaN explicitly rather than calling proportions_ztest with nobs=0
        z_stat, p_value = float('nan'), float('nan')
    else:
        # A is passed first, B second (same argument order as before)
        z_stat, p_value = proportions_ztest([success_a, success_b], [total_a, total_b])

    # Flag for significance at 95% level (p-value < 0.05); NaN compares as not significant
    is_significant = bool(p_value < 0.05)

    return {
        'metric': metric_name,
        'variation_A_name': variation_a,
        'variation_B_name': variation_b,
        'variation_A_value': value_A,
        'variation_B_value': value_B,
        'absolute_difference': abs_diff,
        'relative_difference': rel_diff,
        'statistic': z_stat,
        'p_value': p_value,
        'is_significant_95': is_significant
    }
|
|
"\n",
|
# Function to run Z-tests for multiple metrics and aggregate results into a DataFrame
def run_z_tests(df, z_stat_metric_definition, variations):
    """Run calculate_z_test for every configured metric and collect the results.

    Parameters:
        df: aggregated data, passed through to calculate_z_test.
        z_stat_metric_definition: mapping of metric name to a dict holding the
            'success_counts' and 'total_counts' column names.
        variations: sequence whose first element is used as variation A and
            whose second element is used as variation B.

    Returns:
        DataFrame built from the list of per-metric result dicts, in the
        iteration order of the definition mapping.
    """
    def _test_one(metric_name, spec):
        # Read both column names before running the test for this metric
        numerator_column = spec['success_counts']
        denominator_column = spec['total_counts']
        return calculate_z_test(
            df,
            metric_name,
            variation_a=variations[0],
            variation_b=variations[1],
            success_counts=numerator_column,
            total_counts=denominator_column,
        )

    collected = [_test_one(name, spec) for name, spec in z_stat_metric_definition.items()]
    return pd.DataFrame(collected)
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### T-test for non-proportion metrics\n",
|
|
"This section defines the functions used to compute T-tests for metrics outside of the proportion scope, mostly Revenue-related metrics."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 45,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"\n",
|
# Generalized function to calculate T-test for revenue-related metrics
def calculate_t_test(df, metric_name, variation_a, variation_b, metric_avg_column, metric_sdv_column, total_counts):
    """Compare the mean of two variations with a two-sample T-test on summary statistics.

    The test works from pre-aggregated columns (average, standard deviation
    and count per variation) rather than from raw observations.

    Parameters:
        df: DataFrame with a 'variation' column plus the three metric columns.
        metric_name: label copied into the result under 'metric'.
        variation_a, variation_b: values of df['variation'] identifying each group.
        metric_avg_column: column holding the per-variation average.
        metric_sdv_column: column holding the per-variation standard deviation.
        total_counts: column holding the per-variation sample size.

    Returns:
        dict with the keys 'metric', 'variation_A_name', 'variation_B_name',
        'variation_A_value', 'variation_B_value', 'absolute_difference' (B - A),
        'relative_difference' ((B - A) / A, or 0 when A's mean is 0),
        'statistic', 'p_value' (two-tailed) and 'is_significant_95'.
        Note that 'statistic' is computed as (mean A - mean B) / SE, so its
        sign is the opposite of 'absolute_difference'.
    """
    def _values(variation, column):
        # Values of `column` restricted to the rows of one variation
        return df[df['variation'] == variation][column]

    # Averages and standard deviations are averaged across the rows of a variation
    # (assuming each group already carries its own aggregate); counts are summed
    mean_a = _values(variation_a, metric_avg_column).mean()
    mean_b = _values(variation_b, metric_avg_column).mean()
    sdv_a = _values(variation_a, metric_sdv_column).mean()
    sdv_b = _values(variation_b, metric_sdv_column).mean()
    total_a = _values(variation_a, total_counts).sum()
    total_b = _values(variation_b, total_counts).sum()

    # Absolute difference (B - A)
    abs_diff = mean_b - mean_a

    # Relative difference (B - A) / A, reported as 0 when A's mean is 0
    if mean_a != 0:
        rel_diff = (mean_b - mean_a) / mean_a
    else:
        rel_diff = 0

    # Standard error of each mean; a group without observations contributes 0
    se_a = 0 if total_a == 0 else sdv_a / (total_a ** 0.5)
    se_b = 0 if total_b == 0 else sdv_b / (total_b ** 0.5)

    # Standard error of the difference between the means
    se_diff = (se_a ** 2 + se_b ** 2) ** 0.5

    # T-statistic (A - B); defined as 0 when there is no variability at all
    t_stat = (mean_a - mean_b) / se_diff if se_diff != 0 else 0

    # Degrees of freedom: the smaller of the two sample sizes, minus 1
    df_degrees = min(total_a - 1, total_b - 1)

    # Two-tailed p-value from the T-distribution
    p_value = stats.t.sf(abs(t_stat), df_degrees) * 2

    # Flag for significance at 95% level (p-value < 0.05)
    is_significant = p_value < 0.05

    return {
        'metric': metric_name,
        'variation_A_name': variation_a,
        'variation_B_name': variation_b,
        'variation_A_value': mean_a,
        'variation_B_value': mean_b,
        'absolute_difference': abs_diff,
        'relative_difference': rel_diff,
        'statistic': t_stat,
        'p_value': p_value,
        'is_significant_95': is_significant
    }
|
|
"\n",
|
# Function to run T-tests for multiple revenue metrics and aggregate results into a DataFrame
def run_t_tests(df, t_stat_metric_definition, variations):
    """Run calculate_t_test for every configured metric and collect the results.

    Parameters:
        df: aggregated data, passed through to calculate_t_test.
        t_stat_metric_definition: mapping of metric name to a dict holding the
            'metric_avg_column', 'metric_sdv_column' and 'total_counts' column names.
        variations: sequence whose first element is used as variation A and
            whose second element is used as variation B.

    Returns:
        DataFrame built from the list of per-metric result dicts, in the
        iteration order of the definition mapping.
    """
    def _test_one(metric_name, spec):
        # Read the three column names before running the test for this metric
        avg_column = spec['metric_avg_column']
        sdv_column = spec['metric_sdv_column']
        count_column = spec['total_counts']
        return calculate_t_test(
            df,
            metric_name,
            variation_a=variations[0],
            variation_b=variations[1],
            metric_avg_column=avg_column,
            metric_sdv_column=sdv_column,
            total_counts=count_column,
        )

    collected = [_test_one(name, spec) for name, spec in t_stat_metric_definition.items()]
    return pd.DataFrame(collected)
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Specify the metric definition for Z-stat and T-stat tests"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 46,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
# Z-test metrics: each rate is a success-count column over a total-count column
z_stat_metric_definition = {
    rate_name: {'success_counts': numerator_column, 'total_counts': denominator_column}
    for rate_name, numerator_column, denominator_column in (
        ('conversion_rate', 'guest_journey_completed_count', 'guest_journeys_count'),
        ('payment_rate', 'guest_journey_with_payment_count', 'guest_journeys_count'),
        ('waiver_payment_rate', 'waiver_count', 'guest_journeys_count'),
        ('deposit_payment_rate', 'deposit_count', 'guest_journeys_count'),
        ('CIH_payment_rate', 'check_in_cover_count', 'guest_journeys_count'),
    )
}

# T-test metrics: each one names its average column, its standard-deviation column
# and the column holding the sample size those aggregates were computed over
t_stat_metric_definition = {
    metric: {
        'metric_avg_column': avg_column,
        'metric_sdv_column': sdv_column,
        'total_counts': count_column,
    }
    for metric, avg_column, sdv_column, count_column in (
        ('avg_guest_revenue_per_gj',
         'guest_revenue_avg_per_guest_journey',
         'guest_revenue_sdv_per_guest_journey',
         'guest_journeys_count'),
        ('avg_waiver_revenue_per_gj',
         'waiver_avg_per_guest_journey',
         'waiver_sdv_per_guest_journey',
         'guest_journeys_count'),
        ('avg_deposit_revenue_per_gj',
         'deposit_avg_per_guest_journey',
         'deposit_sdv_per_guest_journey',
         'guest_journeys_count'),
        ('avg_CIH_revenue_per_gj',
         'check_in_cover_avg_per_guest_journey',
         'check_in_cover_sdv_per_guest_journey',
         'guest_journeys_count'),
        ('avg_csat_per_gj_with_response',
         'csat_avg_per_guest_journey_with_response',
         'csat_sdv_per_guest_journey_with_response',
         'guest_journey_with_responses_count'),
    )
}

# Metrics reported first, as the main ones for this A/B test
main_metrics = ['avg_guest_revenue_per_gj', 'conversion_rate', 'payment_rate']
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Run the computation of the metrics and statistical significance"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 47,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
" metric relative_difference p_value\n",
|
|
"0 conversion_rate 0.002890 0.587414\n",
|
|
"1 payment_rate 0.016237 0.167865\n",
|
|
"2 waiver_payment_rate 0.011461 0.398951\n",
|
|
"3 deposit_payment_rate 0.038060 0.303386\n",
|
|
"4 CIH_payment_rate 0.100579 0.173649\n",
|
|
"5 avg_guest_revenue_per_gj 0.020523 0.193648\n",
|
|
"6 avg_waiver_revenue_per_gj 0.017515 0.295568\n",
|
|
"7 avg_deposit_revenue_per_gj 0.056502 0.217435\n",
|
|
"8 avg_CIH_revenue_per_gj 0.103937 0.162819\n",
|
|
"9 avg_csat_per_gj_with_response -0.005045 0.306604\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
# Z-tests for the rate metrics, with every row tagged by the test that produced it
z_test_results_df = run_z_tests(
    df, z_stat_metric_definition=z_stat_metric_definition, variations=variations
).assign(test_type='Z-test')

# T-tests for the average-based metrics, tagged the same way
t_test_results_df = run_t_tests(
    df, t_stat_metric_definition=t_stat_metric_definition, variations=variations
).assign(test_type='T-test')

# Stack both result sets (Z-test rows first) under a fresh 0..n-1 index
combined_results_df = pd.concat([z_test_results_df, t_test_results_df], ignore_index=True)

# Print the headline columns of the aggregated DataFrame
headline_columns = ['metric', 'relative_difference', 'p_value']
print(combined_results_df[headline_columns])
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Results\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 48,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\n",
|
|
"ShowNewIllustrations results (last updated at 2025-04-16)\n",
|
|
"\n",
|
|
"Total Guest Journeys affected by this A/B test: 41934 - Total Guest Revenue: 444319 GBP.\n",
|
|
" Variation OldIllustrations: Guest Journeys 21018 (50.1%) - Guest Revenue: 220443 GBP (49.6%).\n",
|
|
" Variation NewIllustrations: Guest Journeys 20916 (49.9%) - Guest Revenue: 223876 GBP (50.4%).\n",
|
|
"\n",
|
|
"Main Metrics - Comparing NewIllustrations vs. OldIllustrations.\n",
|
|
"\n",
|
|
"CONVERSION RATE (not significant): 77.2% vs. 77.0% (0.2% ppts.| 0.3%).\n",
|
|
"PAYMENT RATE (not significant): 41.5% vs. 40.8% (0.7% ppts.| 1.6%).\n",
|
|
"AVG GUEST REVENUE PER GJ (not significant): 10.7 vs. 10.49 (0.22 ppts.| 2.1%).\n",
|
|
"\n",
|
|
"Other Metrics\n",
|
|
"\n",
|
|
"WAIVER PAYMENT RATE (not significant): 34.5% vs. 34.1% (0.4% ppts.| 1.1%).\n",
|
|
"DEPOSIT PAYMENT RATE (not significant): 6.9% vs. 6.6% (0.3% ppts.| 3.8%).\n",
|
|
"CIH PAYMENT RATE (not significant): 2.0% vs. 1.8% (0.2% ppts.| 10.1%).\n",
|
|
"AVG WAIVER REVENUE PER GJ (not significant): 10.04 vs. 9.87 (0.17 ppts.| 1.8%).\n",
|
|
"AVG DEPOSIT REVENUE PER GJ (not significant): 0.49 vs. 0.46 (0.03 ppts.| 5.7%).\n",
|
|
"AVG CIH REVENUE PER GJ (not significant): 0.17 vs. 0.16 (0.02 ppts.| 10.4%).\n",
|
|
"AVG CSAT PER GJ WITH RESPONSE (not significant): 3.78 vs. 3.8 (-0.02 ppts.| -0.5%).\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
print('\n{} results (last updated at {})\n'.format(ab_test_name, last_update))

# Main volume indicators per variation, and their totals across variations
volume_columns = ["guest_journeys_count", "guest_revenue_sum"]
grouped_data = df.groupby('variation')[volume_columns].sum()
total_count = grouped_data.sum()
journeys_total = total_count.loc["guest_journeys_count"]
revenue_total = total_count.loc["guest_revenue_sum"]

# Overall volumes first, then one line per configured variation with its share of each total
print('Total Guest Journeys affected by this A/B test: {} - Total Guest Revenue: {} GBP.'.format(
    int(journeys_total),
    int(revenue_total)))

for var in variations:
    var_journeys = grouped_data.loc[var, 'guest_journeys_count']
    var_revenue = grouped_data.loc[var, 'guest_revenue_sum']
    print(' Variation {}: Guest Journeys {} ({}%) - Guest Revenue: {} GBP ({}%).'.format(
        var,
        int(var_journeys),
        round(100 * (var_journeys / journeys_total), 1),
        int(var_revenue),
        round(100 * (var_revenue / revenue_total), 1)
    ))

# Split the results according to whether the metric is listed in main_metrics
is_main_metric = combined_results_df['metric'].isin(main_metrics)
main_metrics_rows = combined_results_df[is_main_metric]
other_metrics_rows = combined_results_df[~is_main_metric]
|
|
"\n",
|
def print_metrics(df, header=None):
    """Print one human-readable line per result row, B compared against A.

    Parameters:
        df: results frame with the columns 'metric', 'test_type',
            'variation_A_value', 'variation_B_value', 'absolute_difference',
            'relative_difference' and 'is_significant_95'.
        header: optional title printed (surrounded by blank lines) before the rows.

    Output format per row:
        Z-test rows are rates: values are shown as percentages with one
        decimal and the absolute difference in percentage points ("ppts.").
        All other rows are shown in the metric's own unit with two decimals,
        and the absolute difference carries no "ppts." label because it is
        not a difference of percentages.
        The relative difference is always shown as a percentage.
    """
    if header:
        print(f'\n{header}\n')

    for _, row in df.iterrows():
        metric = row['metric'].upper().replace('_', ' ')

        if row['test_type'] == 'Z-test':
            value_a = f"{100 * row['variation_A_value']:.1f}%"
            value_b = f"{100 * row['variation_B_value']:.1f}%"
            abs_diff = f"{100 * row['absolute_difference']:.1f} ppts."
        else:
            # Fixed two decimals so both values line up (10.70 vs. 10.49)
            value_a = f"{row['variation_A_value']:.2f}"
            value_b = f"{row['variation_B_value']:.2f}"
            abs_diff = f"{row['absolute_difference']:.2f}"
        rel_diff = f"{100 * row['relative_difference']:.1f}%"

        if row['is_significant_95']:
            print(f"{metric} - SIGNIFICANT RESULT: {value_b} vs. {value_a} ({abs_diff}| {rel_diff}).")
        else:
            print(f"{metric} (not significant): {value_b} vs. {value_a} ({abs_diff}| {rel_diff}).")
|
|
"\n",
|
# Main metrics first, under a header stating the comparison direction (B vs. A)
main_header = "Main Metrics - Comparing {} vs. {}.".format(var_B, var_A)
print_metrics(main_metrics_rows, header=main_header)

# Then every remaining metric
print_metrics(other_metrics_rows, header="Other Metrics")
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": ".venv",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.12.3"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|