{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Flagging Performance Monitoring\n",
    "\n",
    "## Initial setup\n",
    "This first section ensures that the connection to the DWH works correctly."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# This script connects to a Data Warehouse (DWH) using PostgreSQL.\n",
    "# This should be common to all notebooks, but you might need to adjust the path to the `dwh_utils` module.\n",
    "\n",
    "import sys\n",
    "import os\n",
    "sys.path.append(os.path.abspath(\"../utils\"))  # Adjust path if needed\n",
    "\n",
    "from dwh_utils import read_credentials, create_postgres_engine, query_to_dataframe, test_connection\n",
    "\n",
    "# --- Connect to DWH ---\n",
    "creds = read_credentials()\n",
    "dwh_pg_engine = create_postgres_engine(creds)\n",
    "\n",
    "# --- Test connection ---\n",
    "test_connection()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data Extraction\n",
    "In this section we extract the flagging performance analysis data from the DWH."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import numpy as np\n",
    "from datetime import date"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Query to extract data\n",
    "data_extraction_query = \"\"\"\n",
    "select *\n",
    "from intermediate.int_flagging_performance_analysis\n",
    "\"\"\"\n",
    "\n",
    "# Retrieve Data from Query\n",
    "df = query_to_dataframe(engine=dwh_pg_engine, query=data_extraction_query)\n",
    "print(df.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_confusion_matrix_from_df(df, flagging_analysis_type):\n",
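    "    \"\"\"Plot a confusion-matrix style summary of flagging performance.\n",
    "\n",
    "    Uses the first row of `df` whose `flagging_analysis_type` matches the requested type\n",
    "    and renders the quadrant counts and percentage scores together with the\n",
    "    recall / precision / F1 / F2 scores contained in that row.\n",
    "    \"\"\"\n",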
"\n",
|
|||
|
|
" # Subset - just retrieve one row depending on the flagging_analysis_type\n",
|
|||
|
|
" row = df[df['flagging_analysis_type'] == flagging_analysis_type].iloc[0]\n",
|
|||
|
|
"\n",
|
|||
|
|
" # Define custom x-axis labels and wording\n",
|
|||
|
|
" if flagging_analysis_type == 'RISK_VS_CLAIM':\n",
|
|||
|
|
" x_labels = ['With Submitted Claim', 'Without Submitted Claim']\n",
|
|||
|
|
" outcome_label = \"submitted claim\"\n",
|
|||
|
|
" elif flagging_analysis_type == 'RISK_VS_SUBMITTED_PAYOUT':\n",
|
|||
|
|
" x_labels = ['With Submitted Payout', 'Without Submitted Payout']\n",
|
|||
|
|
" outcome_label = \"submitted payout\"\n",
|
|||
|
|
" else:\n",
|
|||
|
|
" x_labels = ['Actual Positive', 'Actual Negative'] \n",
|
|||
|
|
" outcome_label = \"outcome\"\n",
|
|||
|
|
"\n",
|
|||
|
|
" # Confusion matrix structure\n",
|
|||
|
|
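    "    # Rows follow the prediction (Flagged as Risk / Flagged as No Risk),\n",
    "    # columns the actual outcome, matching the heatmap tick labels below.\n",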
" cm = np.array([\n",
|
|||
|
|
" [row['count_true_positive'], row['count_false_positive']],\n",
|
|||
|
|
" [row['count_false_negative'], row['count_true_negative']]\n",
|
|||
|
|
" ])\n",
|
|||
|
|
"\n",
|
|||
|
|
" # Create annotations for the confusion matrix\n",
|
|||
|
|
" labels = [['True Positives', 'False Positives'], ['False Negatives', 'True Negatives']]\n",
|
|||
|
|
" counts = [[f\"{v:,}\" for v in [row['count_true_positive'], row['count_false_positive']]],\n",
|
|||
|
|
" [f\"{v:,}\" for v in [row['count_false_negative'], row['count_true_negative']]]]\n",
|
|||
|
|
" percentages = [[f\"{round(100*v,2):,}\" for v in [row['true_positive_score'], row['false_positive_score']]],\n",
|
|||
|
|
" [f\"{round(100*v,2):,}\" for v in [row['false_negative_score'], row['true_negative_score']]]]\n",
|
|||
|
|
" annot = [[f\"{labels[i][j]}\\n{counts[i][j]} ({percentages[i][j]}%)\" for j in range(2)] for i in range(2)]\n",
|
|||
|
|
"\n",
|
|||
|
|
" # Scores formatted as percentages\n",
|
|||
|
|
" recall = row['recall_score'] * 100\n",
|
|||
|
|
" precision = row['precision_score'] * 100\n",
|
|||
|
|
" f1 = row['f1_score'] * 100\n",
|
|||
|
|
" f2 = row['f2_score'] * 100\n",
|
|||
|
|
"\n",
|
|||
|
|
" # Set up figure and axes manually for precise control\n",
|
|||
|
|
" fig = plt.figure(figsize=(9, 8))\n",
|
|||
|
|
" grid = fig.add_gridspec(nrows=4, height_ratios=[2, 3, 15, 2])\n",
|
|||
|
|
"\n",
|
|||
|
|
" \n",
|
|||
|
|
" ax_main_title = fig.add_subplot(grid[0])\n",
|
|||
|
|
" ax_main_title.axis('off')\n",
|
|||
|
|
" ax_main_title.set_title(f\"Flagged as Risk vs. {outcome_label.title()}\", fontsize=14, weight='bold')\n",
|
|||
|
|
" \n",
|
|||
|
|
" # Business explanation text\n",
|
|||
|
|
" ax_text = fig.add_subplot(grid[1])\n",
|
|||
|
|
" ax_text.axis('off')\n",
|
|||
|
|
" business_text = (\n",
|
|||
|
|
" f\"Flagging performance analysis:\\n\\n\"\n",
|
|||
|
|
" f\"- Of all the bookings we flagged as at Risk, {precision:.2f}% actually turned into a {outcome_label}.\\n\"\n",
|
|||
|
|
" f\"- Of all the bookings that resulted in a {outcome_label}, we correctly flagged {recall:.2f}% of them.\\n\"\n",
|
|||
|
|
" f\"- The pure balance between these two is summarized by a score of {f1:.2f}%.\\n\"\n",
|
|||
|
|
" f\"- If we prioritise better probability of detection of a {outcome_label}, the balanced score is {f2:.2f}%.\\n\"\n",
|
|||
|
|
" )\n",
|
|||
|
|
" ax_text.text(0.0, 0.0, business_text, fontsize=10.5, ha='left', va='bottom', wrap=False, linespacing=1.5)\n",
|
|||
|
|
"\n",
|
|||
|
|
" # Heatmap\n",
|
|||
|
|
" ax_heatmap = fig.add_subplot(grid[2])\n",
|
|||
|
|
" ax_heatmap.set_title(f\"Confusion Matrix – Risk vs. {outcome_label.title()}\", fontsize=12, weight='bold', ha='center', va='center', wrap=False)\n",
|
|||
|
|
"\n",
|
|||
|
|
" cmap = sns.light_palette(\"#318450\", as_cmap=True)\n",
|
|||
|
|
"\n",
|
|||
|
|
" sns.heatmap(cm, annot=annot, fmt='', cmap=cmap, cbar=False,\n",
|
|||
|
|
" xticklabels=x_labels,\n",
|
|||
|
|
" yticklabels=['Flagged as Risk', 'Flagged as No Risk'],\n",
|
|||
|
|
" ax=ax_heatmap,\n",
|
|||
|
|
" linewidths=1.0,\n",
|
|||
|
|
" annot_kws={'fontsize': 10, 'linespacing': 1.2})\n",
|
|||
|
|
" ax_heatmap.set_xlabel(\"Resolution Outcome (Actual)\", fontsize=11, labelpad=10)\n",
|
|||
|
|
" ax_heatmap.set_ylabel(\"Booking Status (Prediction)\", fontsize=11, labelpad=10)\n",
|
|||
|
|
" \n",
|
|||
|
|
" # Make borders visible\n",
|
|||
|
|
" for _, spine in ax_heatmap.spines.items():\n",
|
|||
|
|
" spine.set_visible(True)\n",
|
|||
|
|
"\n",
|
|||
|
|
" # Footer with metrics and date\n",
|
|||
|
|
" ax_footer = fig.add_subplot(grid[3])\n",
|
|||
|
|
" ax_footer.axis('off')\n",
|
|||
|
|
" metrics_text = f\"Total Booking Count: {row['count_total']} | Recall: {recall:.2f}% | Precision: {precision:.2f}% | F1 Score: {f1:.2f}% | F2 Score: {f2:.2f}%\"\n",
|
|||
|
|
" date_text = f\"Generated on {date.today().strftime('%B %d, %Y')}\"\n",
|
|||
|
|
" ax_footer.text(0.5, 0.7, metrics_text, ha='center', fontsize=9)\n",
|
|||
|
|
" ax_footer.text(0.5, 0.1, date_text, ha='center', fontsize=8, color='gray')\n",
|
|||
|
|
"\n",
|
|||
|
|
" plt.tight_layout()\n",
|
|||
|
|
" plt.show()"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot confusion matrix for claim scenario\n",
    "plot_confusion_matrix_from_df(df, 'RISK_VS_CLAIM')\n",
    "\n",
    "# Plot confusion matrix for submitted payout scenario\n",
    "plot_confusion_matrix_from_df(df, 'RISK_VS_SUBMITTED_PAYOUT')"
   ]
  }
 ],
"metadata": {
|
|||
|
|
"kernelspec": {
|
|||
|
|
"display_name": "venv",
|
|||
|
|
"language": "python",
|
|||
|
|
"name": "python3"
|
|||
|
|
},
|
|||
|
|
"language_info": {
|
|||
|
|
"codemirror_mode": {
|
|||
|
|
"name": "ipython",
|
|||
|
|
"version": 3
|
|||
|
|
},
|
|||
|
|
"file_extension": ".py",
|
|||
|
|
"mimetype": "text/x-python",
|
|||
|
|
"name": "python",
|
|||
|
|
"nbconvert_exporter": "python",
|
|||
|
|
"pygments_lexer": "ipython3",
|
|||
|
|
"version": "3.12.3"
|
|||
|
|
}
|
|||
|
|
},
|
|||
|
|
"nbformat": 4,
|
|||
|
|
"nbformat_minor": 2
|
|||
|
|
}
|