data-jupyter-notebooks/data_driven_risk_assessment/flagging_performance_monitoring.ipynb

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Flagging Performance Monitoring\n",
"\n",
"## Initial setup\n",
"This first section just ensures that the connection to DWH works correctly."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# This script connects to a Data Warehouse (DWH) using PostgreSQL. \n",
"# This should be common for all Notebooks, but you might need to adjust the path to the `dwh_utils` module.\n",
"\n",
"import sys\n",
"import os\n",
"sys.path.append(os.path.abspath(\"../utils\")) # Adjust path if needed\n",
"\n",
"from dwh_utils import read_credentials, create_postgres_engine, query_to_dataframe, test_connection\n",
"\n",
"# --- Connect to DWH ---\n",
"creds = read_credentials()\n",
"dwh_pg_engine = create_postgres_engine(creds)\n",
"\n",
"# --- Test Query ---\n",
"test_connection()\n"
]
},
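{
"cell_type": "markdown",
"metadata": {},
"source": [
"If the `dwh_utils` helper module is not available, a minimal sketch of an equivalent setup with SQLAlchemy is shown below. The environment variable names (`DWH_USER`, `DWH_PASSWORD`, `DWH_HOST`, `DWH_PORT`, `DWH_DB`) are placeholders, not values defined in this repository."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Fallback sketch without dwh_utils; the environment variable names below are\n",
"# assumptions - adjust them to however credentials are actually stored.\n",
"import os\n",
"from sqlalchemy import create_engine, text\n",
"\n",
"url = (\n",
"    f\"postgresql+psycopg2://{os.environ['DWH_USER']}:{os.environ['DWH_PASSWORD']}\"\n",
"    f\"@{os.environ['DWH_HOST']}:{os.environ.get('DWH_PORT', '5432')}/{os.environ['DWH_DB']}\"\n",
")\n",
"fallback_engine = create_engine(url)\n",
"\n",
"# Simple connectivity check: run a trivial query and print the result.\n",
"with fallback_engine.connect() as conn:\n",
"    print(conn.execute(text(\"select 1\")).scalar())"
]
},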
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data Extraction\n",
"In this section we extract the data from the Flagging Performance Analysis within DWH."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import numpy as np\n",
"from datetime import date"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Query to extract data\n",
"data_extraction_query = \"\"\"\n",
"select *\n",
"from intermediate.int_flagging_performance_analysis \n",
"\"\"\"\n",
"\n",
"# Retrieve Data from Query\n",
"df = query_to_dataframe(engine=dwh_pg_engine, query=data_extraction_query)\n",
"print(df.head())"
]
},
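{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check before plotting, we can recompute precision, recall and F1 from the raw confusion-matrix counts and compare them with the `*_score` columns delivered by the DWH model. This is a minimal sketch and assumes the score columns are stored as fractions in [0, 1]; a noticeable difference would point to a data-quality issue in the upstream model."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity check: recompute the headline scores from the raw counts and compare\n",
"# them with the stored values (assumed to be fractions in [0, 1]).\n",
"for _, r in df.iterrows():\n",
"    tp, fp = r['count_true_positive'], r['count_false_positive']\n",
"    fn = r['count_false_negative']\n",
"\n",
"    precision = tp / (tp + fp) if (tp + fp) else 0.0\n",
"    recall = tp / (tp + fn) if (tp + fn) else 0.0\n",
"    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0\n",
"\n",
"    print(f\"{r['flagging_analysis_type']}: \"\n",
"          f\"precision diff={abs(precision - r['precision_score']):.4f}, \"\n",
"          f\"recall diff={abs(recall - r['recall_score']):.4f}, \"\n",
"          f\"f1 diff={abs(f1 - r['f1_score']):.4f}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Confusion Matrix Plotting\n",
"The function defined below renders an annotated confusion matrix for a given `flagging_analysis_type`, together with a plain-language summary of its headline scores."
]
},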
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def plot_confusion_matrix_from_df(df, flagging_analysis_type):\n",
"\n",
" # Subset - just retrieve one row depending on the flagging_analysis_type\n",
" row = df[df['flagging_analysis_type'] == flagging_analysis_type].iloc[0]\n",
"\n",
" # Define custom x-axis labels and wording\n",
" if flagging_analysis_type == 'RISK_VS_CLAIM':\n",
" x_labels = ['With Submitted Claim', 'Without Submitted Claim']\n",
" outcome_label = \"submitted claim\"\n",
" elif flagging_analysis_type == 'RISK_VS_SUBMITTED_PAYOUT':\n",
" x_labels = ['With Submitted Payout', 'Without Submitted Payout']\n",
" outcome_label = \"submitted payout\"\n",
" else:\n",
" x_labels = ['Actual Positive', 'Actual Negative'] \n",
" outcome_label = \"outcome\"\n",
"\n",
" # Confusion matrix structure\n",
" cm = np.array([\n",
" [row['count_true_positive'], row['count_false_positive']],\n",
" [row['count_false_negative'], row['count_true_negative']]\n",
" ])\n",
"\n",
" # Create annotations for the confusion matrix\n",
" labels = [['True Positives', 'False Positives'], ['False Negatives', 'True Negatives']]\n",
" counts = [[f\"{v:,}\" for v in [row['count_true_positive'], row['count_false_positive']]],\n",
" [f\"{v:,}\" for v in [row['count_false_negative'], row['count_true_negative']]]]\n",
" percentages = [[f\"{round(100*v,2):,}\" for v in [row['true_positive_score'], row['false_positive_score']]],\n",
" [f\"{round(100*v,2):,}\" for v in [row['false_negative_score'], row['true_negative_score']]]]\n",
" annot = [[f\"{labels[i][j]}\\n{counts[i][j]} ({percentages[i][j]}%)\" for j in range(2)] for i in range(2)]\n",
"\n",
" # Scores formatted as percentages\n",
" recall = row['recall_score'] * 100\n",
" precision = row['precision_score'] * 100\n",
" f1 = row['f1_score'] * 100\n",
" f2 = row['f2_score'] * 100\n",
"\n",
" # Set up figure and axes manually for precise control\n",
" fig = plt.figure(figsize=(9, 8))\n",
" grid = fig.add_gridspec(nrows=4, height_ratios=[2, 3, 15, 2])\n",
"\n",
" \n",
" ax_main_title = fig.add_subplot(grid[0])\n",
" ax_main_title.axis('off')\n",
" ax_main_title.set_title(f\"Flagged as Risk vs. {outcome_label.title()}\", fontsize=14, weight='bold')\n",
" \n",
" # Business explanation text\n",
" ax_text = fig.add_subplot(grid[1])\n",
" ax_text.axis('off')\n",
" business_text = (\n",
" f\"Flagging performance analysis:\\n\\n\"\n",
" f\"- Of all the bookings we flagged as at Risk, {precision:.2f}% actually turned into a {outcome_label}.\\n\"\n",
" f\"- Of all the bookings that resulted in a {outcome_label}, we correctly flagged {recall:.2f}% of them.\\n\"\n",
" f\"- The pure balance between these two is summarized by a score of {f1:.2f}%.\\n\"\n",
" f\"- If we prioritise better probability of detection of a {outcome_label}, the balanced score is {f2:.2f}%.\\n\"\n",
" )\n",
" ax_text.text(0.0, 0.0, business_text, fontsize=10.5, ha='left', va='bottom', wrap=False, linespacing=1.5)\n",
"\n",
" # Heatmap\n",
" ax_heatmap = fig.add_subplot(grid[2])\n",
" ax_heatmap.set_title(f\"Confusion Matrix Risk vs. {outcome_label.title()}\", fontsize=12, weight='bold', ha='center', va='center', wrap=False)\n",
"\n",
" cmap = sns.light_palette(\"#318450\", as_cmap=True)\n",
"\n",
" sns.heatmap(cm, annot=annot, fmt='', cmap=cmap, cbar=False,\n",
" xticklabels=x_labels,\n",
" yticklabels=['Flagged as Risk', 'Flagged as No Risk'],\n",
" ax=ax_heatmap,\n",
" linewidths=1.0,\n",
" annot_kws={'fontsize': 10, 'linespacing': 1.2})\n",
" ax_heatmap.set_xlabel(\"Resolution Outcome (Actual)\", fontsize=11, labelpad=10)\n",
" ax_heatmap.set_ylabel(\"Booking Status (Prediction)\", fontsize=11, labelpad=10)\n",
" \n",
" # Make borders visible\n",
" for _, spine in ax_heatmap.spines.items():\n",
" spine.set_visible(True)\n",
"\n",
" # Footer with metrics and date\n",
" ax_footer = fig.add_subplot(grid[3])\n",
" ax_footer.axis('off')\n",
" metrics_text = f\"Total Booking Count: {row['count_total']} | Recall: {recall:.2f}% | Precision: {precision:.2f}% | F1 Score: {f1:.2f}% | F2 Score: {f2:.2f}%\"\n",
" date_text = f\"Generated on {date.today().strftime('%B %d, %Y')}\"\n",
" ax_footer.text(0.5, 0.7, metrics_text, ha='center', fontsize=9)\n",
" ax_footer.text(0.5, 0.1, date_text, ha='center', fontsize=8, color='gray')\n",
"\n",
" plt.tight_layout()\n",
" plt.show()"
]
},
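{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, the figures below report the standard confusion-matrix scores. With $TP$, $FP$, $FN$ and $TN$ denoting the four cell counts:\n",
"\n",
"$$\\text{precision} = \\frac{TP}{TP + FP} \\qquad \\text{recall} = \\frac{TP}{TP + FN}$$\n",
"\n",
"$$F_\\beta = (1 + \\beta^2) \\cdot \\frac{\\text{precision} \\cdot \\text{recall}}{\\beta^2 \\cdot \\text{precision} + \\text{recall}}$$\n",
"\n",
"$F_1$ ($\\beta = 1$) balances precision and recall equally; $F_2$ ($\\beta = 2$) weights recall - the probability of detecting a risky booking - more heavily."
]
},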
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Plot confusion matrix for claim scenario\n",
"plot_confusion_matrix_from_df(df, 'RISK_VS_CLAIM')\n",
"\n",
"# Plot confusion matrix for submitted payout scenario\n",
"plot_confusion_matrix_from_df(df, 'RISK_VS_SUBMITTED_PAYOUT')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}