{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Flagging Performance Monitoring\n", "\n", "## Initial setup\n", "This first section just ensures that the connection to DWH works correctly." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# This script connects to a Data Warehouse (DWH) using PostgreSQL. \n", "# This should be common for all Notebooks, but you might need to adjust the path to the `dwh_utils` module.\n", "\n", "import sys\n", "import os\n", "sys.path.append(os.path.abspath(\"../utils\")) # Adjust path if needed\n", "\n", "from dwh_utils import read_credentials, create_postgres_engine, query_to_dataframe, test_connection\n", "\n", "# --- Connect to DWH ---\n", "creds = read_credentials()\n", "dwh_pg_engine = create_postgres_engine(creds)\n", "\n", "# --- Test Query ---\n", "test_connection()\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data Extraction\n", "In this section we extract the data from the Flagging Performance Analysis within DWH." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "import numpy as np\n", "from datetime import date" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Query to extract data\n", "data_extraction_query = \"\"\"\n", "select *\n", "from intermediate.int_flagging_performance_analysis \n", "\"\"\"\n", "\n", "# Retrieve Data from Query\n", "df = query_to_dataframe(engine=dwh_pg_engine, query=data_extraction_query)\n", "print(df.head())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def plot_confusion_matrix_from_df(df, flagging_analysis_type):\n", "\n", " # Subset - just retrieve one row depending on the flagging_analysis_type\n", " row = df[df['flagging_analysis_type'] == flagging_analysis_type].iloc[0]\n", "\n", " # Define custom x-axis labels and wording\n", " if flagging_analysis_type == 'RISK_VS_CLAIM':\n", " x_labels = ['With Submitted Claim', 'Without Submitted Claim']\n", " outcome_label = \"submitted claim\"\n", " elif flagging_analysis_type == 'RISK_VS_SUBMITTED_PAYOUT':\n", " x_labels = ['With Submitted Payout', 'Without Submitted Payout']\n", " outcome_label = \"submitted payout\"\n", " else:\n", " x_labels = ['Actual Positive', 'Actual Negative'] \n", " outcome_label = \"outcome\"\n", "\n", " # Confusion matrix structure\n", " cm = np.array([\n", " [row['count_true_positive'], row['count_false_positive']],\n", " [row['count_false_negative'], row['count_true_negative']]\n", " ])\n", "\n", " # Create annotations for the confusion matrix\n", " labels = [['True Positives', 'False Positives'], ['False Negatives', 'True Negatives']]\n", " counts = [[f\"{v:,}\" for v in [row['count_true_positive'], row['count_false_positive']]],\n", " [f\"{v:,}\" for v in [row['count_false_negative'], row['count_true_negative']]]]\n", " percentages = [[f\"{round(100*v,2):,}\" for v in [row['true_positive_score'], row['false_positive_score']]],\n", " [f\"{round(100*v,2):,}\" for v in [row['false_negative_score'], row['true_negative_score']]]]\n", " annot = [[f\"{labels[i][j]}\\n{counts[i][j]} ({percentages[i][j]}%)\" for j in range(2)] for i in range(2)]\n", "\n", " # Scores formatted as percentages\n", " recall = row['recall_score'] * 100\n", " precision = row['precision_score'] * 100\n", " f1 = row['f1_score'] * 100\n", " f2 = row['f2_score'] * 100\n", "\n", " # Set up figure and axes manually for precise control\n", " fig = plt.figure(figsize=(9, 8))\n", " grid = fig.add_gridspec(nrows=4, height_ratios=[2, 3, 15, 2])\n", "\n", " \n", " ax_main_title = fig.add_subplot(grid[0])\n", " ax_main_title.axis('off')\n", " ax_main_title.set_title(f\"Flagged as Risk vs. {outcome_label.title()}\", fontsize=14, weight='bold')\n", " \n", " # Business explanation text\n", " ax_text = fig.add_subplot(grid[1])\n", " ax_text.axis('off')\n", " business_text = (\n", " f\"Flagging performance analysis:\\n\\n\"\n", " f\"- Of all the bookings we flagged as at Risk, {precision:.2f}% actually turned into a {outcome_label}.\\n\"\n", " f\"- Of all the bookings that resulted in a {outcome_label}, we correctly flagged {recall:.2f}% of them.\\n\"\n", " f\"- The pure balance between these two is summarized by a score of {f1:.2f}%.\\n\"\n", " f\"- If we prioritise better probability of detection of a {outcome_label}, the balanced score is {f2:.2f}%.\\n\"\n", " )\n", " ax_text.text(0.0, 0.0, business_text, fontsize=10.5, ha='left', va='bottom', wrap=False, linespacing=1.5)\n", "\n", " # Heatmap\n", " ax_heatmap = fig.add_subplot(grid[2])\n", " ax_heatmap.set_title(f\"Confusion Matrix – Risk vs. {outcome_label.title()}\", fontsize=12, weight='bold', ha='center', va='center', wrap=False)\n", "\n", " cmap = sns.light_palette(\"#318450\", as_cmap=True)\n", "\n", " sns.heatmap(cm, annot=annot, fmt='', cmap=cmap, cbar=False,\n", " xticklabels=x_labels,\n", " yticklabels=['Flagged as Risk', 'Flagged as No Risk'],\n", " ax=ax_heatmap,\n", " linewidths=1.0,\n", " annot_kws={'fontsize': 10, 'linespacing': 1.2})\n", " ax_heatmap.set_xlabel(\"Resolution Outcome (Actual)\", fontsize=11, labelpad=10)\n", " ax_heatmap.set_ylabel(\"Booking Status (Prediction)\", fontsize=11, labelpad=10)\n", " \n", " # Make borders visible\n", " for _, spine in ax_heatmap.spines.items():\n", " spine.set_visible(True)\n", "\n", " # Footer with metrics and date\n", " ax_footer = fig.add_subplot(grid[3])\n", " ax_footer.axis('off')\n", " metrics_text = f\"Total Booking Count: {row['count_total']} | Recall: {recall:.2f}% | Precision: {precision:.2f}% | F1 Score: {f1:.2f}% | F2 Score: {f2:.2f}%\"\n", " date_text = f\"Generated on {date.today().strftime('%B %d, %Y')}\"\n", " ax_footer.text(0.5, 0.7, metrics_text, ha='center', fontsize=9)\n", " ax_footer.text(0.5, 0.1, date_text, ha='center', fontsize=8, color='gray')\n", "\n", " plt.tight_layout()\n", " plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Plot confusion matrix for claim scenario\n", "plot_confusion_matrix_from_df(df, 'RISK_VS_CLAIM')\n", "\n", "# Plot confusion matrix for submitted payout scenario\n", "plot_confusion_matrix_from_df(df, 'RISK_VS_SUBMITTED_PAYOUT')" ] } ], "metadata": { "kernelspec": { "display_name": "venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.3" } }, "nbformat": 4, "nbformat_minor": 2 }