{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "84dcd475",
   "metadata": {},
   "source": [
    "# DDRA - Random Predictor\n",
    "\n",
    "## Initial setup\n",
    "This first section just ensures that the connection to the DWH works correctly."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "12368ce1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# This script connects to a Data Warehouse (DWH) using PostgreSQL.\n",
    "# This should be common to all Notebooks, but you might need to adjust the path to the `dwh_utils` module.\n",
    "\n",
    "import sys\n",
    "import os\n",
    "sys.path.append(os.path.abspath(\"../../utils\"))  # Adjust path if needed\n",
    "\n",
    "from dwh_utils import read_credentials, create_postgres_engine, query_to_dataframe, test_connection\n",
    "\n",
    "# --- Connect to DWH ---\n",
    "creds = read_credentials()\n",
    "dwh_pg_engine = create_postgres_engine(creds)\n",
    "\n",
    "# --- Test Query ---\n",
    "test_connection()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c86f94f1",
   "metadata": {},
   "source": [
    "## Data Extraction\n",
    "In this section we extract the data for our random predictor."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3e3ed391",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialise all imports needed for the Notebook\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from datetime import date\n",
    "from sklearn.metrics import (\n",
    "    precision_score,\n",
    "    recall_score,\n",
    "    fbeta_score,\n",
    "    confusion_matrix\n",
    ")\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "db5e3098",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Query to extract data\n",
    "data_extraction_query = \"\"\"\n",
    "select\n",
    "    id_booking,\n",
    "    booking_created_date_utc,\n",
    "    has_resolution_incident\n",
    "from intermediate.int_booking_summary\n",
    "where\n",
    "    -- 1. Bookings from New Dash users with Id Deal\n",
    "    is_user_in_new_dash = True and\n",
    "    is_missing_id_deal = False and\n",
    "    -- 2. Protected Bookings with a Protection or a Deposit Management service\n",
    "    (has_protection_service_business_type or\n",
    "     has_deposit_management_service_business_type) and\n",
    "    -- 3. Bookings with flagging categorisation (this excludes Cancelled/Incomplete/Rejected bookings)\n",
    "    is_booking_flagged_as_risk is not null and\n",
    "    -- 4. Booking is completed\n",
    "    is_booking_past_completion_date = True\n",
    "\"\"\"\n",
    "\n",
    "# Retrieve Data from Query\n",
    "df = query_to_dataframe(engine=dwh_pg_engine, query=data_extraction_query)\n",
    "print(df.head())\n",
    "print(f\"Total Bookings: {len(df):,}\")\n"
   ]
  },
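  {
   "cell_type": "markdown",
   "id": "add-extraction-check-md",
   "metadata": {},
   "source": [
    "Before splitting, a quick optional profile of the extracted data can help sanity-check the query. This small check is an addition to the original flow; it only uses columns already selected above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "add-extraction-check",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional sanity check (addition): date range covered and overall incident rate\n",
    "# of the extracted bookings, using only columns returned by the query above.\n",
    "print(f\"Earliest booking created: {df['booking_created_date_utc'].min()}\")\n",
    "print(f\"Latest booking created  : {df['booking_created_date_utc'].max()}\")\n",
    "print(f\"Overall incident rate   : {df['has_resolution_incident'].mean():.2%}\")"
   ]
  },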
  {
   "cell_type": "markdown",
   "id": "d36c9276",
   "metadata": {},
   "source": [
    "## Processing\n",
    "\n",
    "We implement a very simple processing pipeline:\n",
    "1. We split the dataset into Train and Test sets. The strategy is simply to apply a cutoff date on booking creation.\n",
    "2. We retrieve from the Train set the actual share of Bookings that raise a Resolution Incident (or Claim).\n",
    "3. We randomly flag bookings in the Test set according to the distribution observed in the Train set.\n",
    "4. This random flagging is stored in a new column called \"is_flagged_as_risk\"."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "943ef7d6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ensure booking_created_date_utc is datetime\n",
    "df[\"booking_created_date_utc\"] = pd.to_datetime(df[\"booking_created_date_utc\"])\n",
    "\n",
    "# Split data into train and test\n",
    "cutoff_date = pd.Timestamp(\"2025-04-01\")\n",
    "train_df = df[df[\"booking_created_date_utc\"] < cutoff_date]\n",
    "test_df = df[df[\"booking_created_date_utc\"] >= cutoff_date].copy()  # Copy for modification\n",
    "print(f\"Train set size: {len(train_df):,} bookings\")\n",
    "print(f\"Test set size : {len(test_df):,} bookings\")\n",
    "\n",
    "# Get the distribution from train set\n",
    "positive_rate = train_df[\"has_resolution_incident\"].mean()\n",
    "print(f\"Positive rate (has_resolution_incident = True) in train set: {positive_rate:.2%}\")\n",
    "\n",
    "# Apply random prediction to test set using that distribution\n",
    "np.random.seed(123)  # For reproducibility\n",
    "test_df[\"is_flagged_as_risk\"] = np.random.rand(len(test_df)) < positive_rate\n"
   ]
  },
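  {
   "cell_type": "markdown",
   "id": "add-flag-rate-check-md",
   "metadata": {},
   "source": [
    "As a quick optional sanity check (an addition to the original flow), the realised flag rate in the Test set should land close to the positive rate measured on the Train set."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "add-flag-rate-check",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional sanity check (addition): compare the realised random flag rate in\n",
    "# the test set with the positive rate measured on the train set.\n",
    "realised_flag_rate = test_df[\"is_flagged_as_risk\"].mean()\n",
    "print(f\"Train positive rate    : {positive_rate:.2%}\")\n",
    "print(f\"Realised test flag rate: {realised_flag_rate:.2%}\")"
   ]
  },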
  {
   "cell_type": "markdown",
   "id": "fc2fcc89",
   "metadata": {},
   "source": [
    "## Evaluation\n",
    "This section evaluates the performance of our random predictor on the Test set against the actual Resolution Incidents.\n",
    "We start by computing the standard classification scores. These are then stored in a dataframe and fed into an adapted version of the confusion matrix, heavily inspired by the flagging_performance_monitoring notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f7f3b80e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Actual and predicted\n",
    "y_true = test_df[\"has_resolution_incident\"]\n",
    "y_pred = test_df[\"is_flagged_as_risk\"]\n",
    "\n",
    "# Compute confusion matrix: [ [TN, FP], [FN, TP] ]\n",
    "tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()\n",
    "\n",
    "# Total predictions\n",
    "total = tp + tn + fp + fn\n",
    "\n",
    "# Compute all requested metrics\n",
    "recall = recall_score(y_true, y_pred)\n",
    "precision = precision_score(y_true, y_pred)\n",
    "f1 = fbeta_score(y_true, y_pred, beta=1)\n",
    "f2 = fbeta_score(y_true, y_pred, beta=2)\n",
    "fpr = fp / (fp + tn) if (fp + tn) != 0 else 0\n",
    "\n",
    "# Scores relative to total\n",
    "tp_score = tp / total\n",
    "tn_score = tn / total\n",
    "fp_score = fp / total\n",
    "fn_score = fn / total\n",
    "\n",
    "# Create DataFrame\n",
    "summary_df = pd.DataFrame([{\n",
    "    \"flagging_analysis_type\": \"RISK_VS_CLAIM\",\n",
    "    \"count_total\": total,\n",
    "    \"count_true_positive\": tp,\n",
    "    \"count_true_negative\": tn,\n",
    "    \"count_false_positive\": fp,\n",
    "    \"count_false_negative\": fn,\n",
    "    \"true_positive_score\": tp_score,\n",
    "    \"true_negative_score\": tn_score,\n",
    "    \"false_positive_score\": fp_score,\n",
    "    \"false_negative_score\": fn_score,\n",
    "    \"recall_score\": recall,\n",
    "    \"precision_score\": precision,\n",
    "    \"false_positive_rate_score\": fpr,\n",
    "    \"f1_score\": f1,\n",
    "    \"f2_score\": f2\n",
    "}])"
   ]
  },
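  {
   "cell_type": "markdown",
   "id": "add-baseline-reference-md",
   "metadata": {},
   "source": [
    "For a purely random predictor, precision is expected to sit close to the Test-set incident base rate and recall close to the flag rate. The optional cell below (an addition to the original flow) prints the base rate next to the computed scores for reference."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "add-baseline-reference",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional reference (addition): incident base rate in the test set, shown\n",
    "# next to the computed scores of the random predictor for context.\n",
    "test_base_rate = y_true.mean()\n",
    "print(f\"Test-set incident base rate: {test_base_rate:.2%}\")\n",
    "print(summary_df[[\"precision_score\", \"recall_score\", \"f1_score\", \"f2_score\"]].round(4))"
   ]
  },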
  {
   "cell_type": "markdown",
   "id": "db3bf8e2",
   "metadata": {},
   "source": [
    "Function to plot the confusion matrix from the summary dataframe:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8885dc39",
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_confusion_matrix_from_df(df, flagging_analysis_type):\n",
    "\n",
    "    # Subset - just retrieve one row depending on the flagging_analysis_type\n",
    "    row = df[df['flagging_analysis_type'] == flagging_analysis_type].iloc[0]\n",
    "\n",
    "    # Define custom x-axis labels and wording\n",
    "    if flagging_analysis_type == 'RISK_VS_CLAIM':\n",
    "        x_labels = ['With Submitted Claim', 'Without Submitted Claim']\n",
    "        outcome_label = \"submitted claim\"\n",
    "    elif flagging_analysis_type == 'RISK_VS_SUBMITTED_PAYOUT':\n",
    "        x_labels = ['With Submitted Payout', 'Without Submitted Payout']\n",
    "        outcome_label = \"submitted payout\"\n",
    "    else:\n",
    "        x_labels = ['Actual Positive', 'Actual Negative']\n",
    "        outcome_label = \"outcome\"\n",
    "\n",
    "    # Confusion matrix structure\n",
    "    cm = np.array([\n",
    "        [row['count_true_positive'], row['count_false_positive']],\n",
    "        [row['count_false_negative'], row['count_true_negative']]\n",
    "    ])\n",
    "\n",
    "    # Create annotations for the confusion matrix\n",
    "    labels = [['True Positives', 'False Positives'], ['False Negatives', 'True Negatives']]\n",
    "    counts = [[f\"{v:,}\" for v in [row['count_true_positive'], row['count_false_positive']]],\n",
    "              [f\"{v:,}\" for v in [row['count_false_negative'], row['count_true_negative']]]]\n",
    "    percentages = [[f\"{round(100*v,2):,}\" for v in [row['true_positive_score'], row['false_positive_score']]],\n",
    "                   [f\"{round(100*v,2):,}\" for v in [row['false_negative_score'], row['true_negative_score']]]]\n",
    "    annot = [[f\"{labels[i][j]}\\n{counts[i][j]} ({percentages[i][j]}%)\" for j in range(2)] for i in range(2)]\n",
    "\n",
    "    # Scores formatted as percentages\n",
    "    recall = row['recall_score'] * 100\n",
    "    precision = row['precision_score'] * 100\n",
    "    f1 = row['f1_score'] * 100\n",
    "    f2 = row['f2_score'] * 100\n",
    "\n",
    "    # Set up figure and axes manually for precise control\n",
    "    fig = plt.figure(figsize=(9, 8))\n",
    "    grid = fig.add_gridspec(nrows=4, height_ratios=[2, 2, 15, 2])\n",
    "\n",
    "    # Main title\n",
    "    ax_main_title = fig.add_subplot(grid[0])\n",
    "    ax_main_title.axis('off')\n",
    "    ax_main_title.set_title(f\"Random Predictor - Flagged as Risk vs. {outcome_label.title()}\", fontsize=14, weight='bold')\n",
    "\n",
    "    # Business explanation text\n",
    "    ax_text = fig.add_subplot(grid[1])\n",
    "    ax_text.axis('off')\n",
    "    business_text = (\n",
    "        f\"Random Predictor:\\n\\n\"\n",
    "        f\"- We split bookings between train and test, considering as train those created before {cutoff_date.strftime('%Y-%m-%d')}.\\n\"\n",
    "        f\"- We retrieve the actual distribution of incidents from the train set of {len(train_df):,} bookings, which is {positive_rate:.2%}.\\n\"\n",
    "        f\"- We randomly flag as risk {positive_rate:.2%} of bookings in the test set, which has a total of {len(test_df):,} bookings.\\n\"\n",
    "    )\n",
    "    ax_text.text(0.0, 0.0, business_text, fontsize=10.5, ha='left', va='bottom', wrap=False, linespacing=1.5)\n",
    "\n",
    "    # Heatmap\n",
    "    ax_heatmap = fig.add_subplot(grid[2])\n",
    "    ax_heatmap.set_title(f\"Confusion Matrix – Risk vs. {outcome_label.title()}\", fontsize=12, weight='bold', ha='center', va='center', wrap=False)\n",
    "\n",
    "    cmap = sns.light_palette(\"#315584\", as_cmap=True)\n",
    "\n",
    "    sns.heatmap(cm, annot=annot, fmt='', cmap=cmap, cbar=False,\n",
    "                xticklabels=x_labels,\n",
    "                yticklabels=['Flagged as Risk', 'Flagged as No Risk'],\n",
    "                ax=ax_heatmap,\n",
    "                linewidths=1.0,\n",
    "                annot_kws={'fontsize': 10, 'linespacing': 1.2})\n",
    "    ax_heatmap.set_xlabel(\"Resolution Outcome (Actual)\", fontsize=11, labelpad=10)\n",
    "    ax_heatmap.set_ylabel(\"Flagging (Prediction)\", fontsize=11, labelpad=10)\n",
    "\n",
    "    # Make borders visible\n",
    "    for _, spine in ax_heatmap.spines.items():\n",
    "        spine.set_visible(True)\n",
    "\n",
    "    # Footer with metrics and date\n",
    "    ax_footer = fig.add_subplot(grid[3])\n",
    "    ax_footer.axis('off')\n",
    "    metrics_text = f\"Total Booking Count: {row['count_total']} | Recall: {recall:.2f}% | Precision: {precision:.2f}% | F1 Score: {f1:.2f}% | F2 Score: {f2:.2f}%\"\n",
    "    date_text = f\"Generated on {date.today().strftime('%B %d, %Y')}\"\n",
    "    ax_footer.text(0.5, 0.7, metrics_text, ha='center', fontsize=9)\n",
    "    ax_footer.text(0.5, 0.1, date_text, ha='center', fontsize=8, color='gray')\n",
    "\n",
    "    plt.tight_layout()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9cd5e165",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot confusion matrix for claim scenario\n",
    "plot_confusion_matrix_from_df(summary_df, 'RISK_VS_CLAIM')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}