2171 lines
1 MiB
Text
2171 lines
1 MiB
Text
|
|
{
|
|||
|
|
"cells": [
|
|||
|
|
{
|
|||
|
|
"cell_type": "markdown",
|
|||
|
|
"id": "84dcd475",
|
|||
|
|
"metadata": {},
|
|||
|
|
"source": [
|
|||
|
|
"# DDRA - Contactless (Reduced)\n",
|
|||
|
|
"\n",
|
|||
|
|
"## General Idea\n",
|
|||
|
|
"The idea is to play only with numeric features (floats, integers or booleans) that are CONTACTLESS.\n",
|
|||
|
|
"\n",
|
|||
|
|
"This considers a subset of the features. This is mostly a copy from 002_contactless_full_attributes that just selects the most relevant attributes.\n",
|
|||
|
|
"\n",
|
|||
|
|
"## Initial setup\n",
|
|||
|
|
"This first section just ensures that the connection to the DWH works correctly."
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"id": "12368ce1",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"name": "stdout",
|
|||
|
|
"output_type": "stream",
|
|||
|
|
"text": [
|
|||
|
|
"🔌 Testing connection using credentials at: /home/uri/.superhog-dwh/credentials.yml\n",
|
|||
|
|
"✅ Connection successful.\n"
|
|||
|
|
]
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"# This script connects to a Data Warehouse (DWH) using PostgreSQL. \n",
|
|||
|
|
"# This should be common for all Notebooks, but you might need to adjust the path to the `dwh_utils` module.\n",
|
|||
|
|
"\n",
|
|||
|
|
"import sys\n",
|
|||
|
|
"import os\n",
|
|||
|
|
"sys.path.append(os.path.abspath(\"../../utils\")) # Adjust path if needed\n",
|
|||
|
|
"\n",
|
|||
|
|
"from dwh_utils import read_credentials, create_postgres_engine, query_to_dataframe, test_connection\n",
|
|||
|
|
"\n",
|
|||
|
|
"# --- Connect to DWH ---\n",
|
|||
|
|
"creds = read_credentials()\n",
|
|||
|
|
"dwh_pg_engine = create_postgres_engine(creds)\n",
|
|||
|
|
"\n",
|
|||
|
|
"# --- Test Query ---\n",
|
|||
|
|
"test_connection()"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "markdown",
|
|||
|
|
"id": "c86f94f1",
|
|||
|
|
"metadata": {},
|
|||
|
|
"source": [
|
|||
|
|
"## Data Extraction\n",
|
|||
|
|
"In this section we extract the data.\n",
|
|||
|
|
"\n",
|
|||
|
|
"This SQL query retrieves a clean and relevant subset of booking data for our model. It includes:\n",
|
|||
|
|
"- A **unique booking ID**\n",
|
|||
|
|
"- Key **numeric features** such as number of services, time between booking creation and check-in, number of nights, etc.\n",
|
|||
|
|
"- A **boolean feature** (`is_contactless_booking`), set when the booking has no verification request\n",
|
|||
|
|
"- A **target variable** (`has_resolution_incident`) indicating whether a resolution incident occurred\n",
|
|||
|
|
"\n",
|
|||
|
|
"The following filters are applied:\n",
|
|||
|
|
"1. Bookings from **\"New Dash\" users** with a valid deal ID\n",
|
|||
|
|
"2. Only **protected bookings**, i.e., those with Protection or Deposit Management services\n",
|
|||
|
|
"3. Bookings flagged for **risk categorisation** (excluding cancelled/incomplete/rejected ones)\n",
|
|||
|
|
"4. Bookings that are **already completed**\n",
|
|||
|
|
"\n",
|
|||
|
|
"The result is converted into a pandas DataFrame for further processing and modeling.\n"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"id": "3e3ed391",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"# Initialise all imports needed for the Notebook\n",
|
|||
|
|
"from sklearn.model_selection import (\n",
|
|||
|
|
" train_test_split, \n",
|
|||
|
|
" GridSearchCV\n",
|
|||
|
|
")\n",
|
|||
|
|
"from sklearn.ensemble import RandomForestClassifier\n",
|
|||
|
|
"from sklearn.pipeline import Pipeline\n",
|
|||
|
|
"from sklearn.preprocessing import StandardScaler\n",
|
|||
|
|
"import pandas as pd\n",
|
|||
|
|
"import numpy as np\n",
|
|||
|
|
"from datetime import date\n",
|
|||
|
|
"from sklearn.metrics import (\n",
|
|||
|
|
" roc_auc_score, \n",
|
|||
|
|
" average_precision_score,\n",
|
|||
|
|
" classification_report,\n",
|
|||
|
|
" roc_curve, \n",
|
|||
|
|
" auc,\n",
|
|||
|
|
" precision_recall_curve,\n",
|
|||
|
|
" precision_score,\n",
|
|||
|
|
" recall_score,\n",
|
|||
|
|
" fbeta_score,\n",
|
|||
|
|
" confusion_matrix\n",
|
|||
|
|
")\n",
|
|||
|
|
"import matplotlib.pyplot as plt\n",
|
|||
|
|
"import shap\n",
|
|||
|
|
"import seaborn as sns"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"id": "db5e3098",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"name": "stdout",
|
|||
|
|
"output_type": "stream",
|
|||
|
|
"text": [
|
|||
|
|
"Total Bookings: 21,384\n"
|
|||
|
|
]
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"# Query to extract data\n",
|
|||
|
|
"data_extraction_query = \"\"\"\n",
|
|||
|
|
"WITH \n",
|
|||
|
|
"service_information AS (\n",
|
|||
|
|
"\tSELECT\n",
|
|||
|
|
"\t\tid_booking,\n",
|
|||
|
|
"\t\tcount(DISTINCT CASE WHEN service_business_type = 'SCREENING' THEN id_booking_service_detail ELSE NULL END) AS number_of_applied_screening_services,\n",
|
|||
|
|
"\t\tcount(DISTINCT CASE WHEN service_business_type = 'DEPOSIT_MANAGEMENT' THEN id_booking_service_detail ELSE NULL END) AS number_of_applied_deposit_management_services,\n",
|
|||
|
|
"\t\tcount(DISTINCT CASE WHEN service_business_type = 'PROTECTION' THEN id_booking_service_detail ELSE NULL END) AS number_of_applied_protection_services,\n",
|
|||
|
|
"\t\tcount(DISTINCT CASE WHEN service_name = 'WAIVER PRO' THEN id_booking ELSE NULL END)>0 AS has_waiver_pro,\n",
|
|||
|
|
"\t\tcount(DISTINCT CASE WHEN service_name IN ('BASIC DAMAGE DEPOSIT','BASIC DAMAGE DEPOSIT OR BASIC WAIVER','BASIC DAMAGE DEPOSIT OR WAIVER PLUS','BASIC WAIVER','WAIVER PLUS') THEN id_booking ELSE NULL END)>0 AS has_guest_facing_waiver_or_deposit,\n",
|
|||
|
|
"\t\tcount(DISTINCT CASE WHEN service_name = 'GUEST AGREEMENT' THEN id_booking ELSE NULL END)>0 AS has_guest_agreement,\n",
|
|||
|
|
"\t\tcount(DISTINCT CASE WHEN service_name = 'BASIC PROTECTION' THEN id_booking ELSE NULL END)>0 AS has_basic_protection,\n",
|
|||
|
|
"\t\tcount(DISTINCT CASE WHEN service_name = 'PROTECTION PLUS' THEN id_booking ELSE NULL END)>0 AS has_protection_plus,\n",
|
|||
|
|
"\t\tcount(DISTINCT CASE WHEN service_name = 'PROTECTION PRO' THEN id_booking ELSE NULL END)>0 AS has_protection_pro,\n",
|
|||
|
|
"\t\tcount(DISTINCT CASE WHEN service_name = 'ID VERIFICATION' THEN id_booking ELSE NULL END)>0 AS has_id_verification,\n",
|
|||
|
|
"\t\tcount(DISTINCT CASE WHEN service_name = 'SCREENING PLUS' THEN id_booking ELSE NULL END)>0 AS has_screening_plus,\n",
|
|||
|
|
"\t\tcount(DISTINCT CASE WHEN service_name = 'SEX OFFENDER CHECK' THEN id_booking ELSE NULL END)>0 AS has_sex_offender_check\n",
|
|||
|
|
"\tFROM\n",
|
|||
|
|
"\t\tintermediate.int_core__booking_service_detail\n",
|
|||
|
|
"\tGROUP BY\n",
|
|||
|
|
"\t\t1\n",
|
|||
|
|
"),\n",
|
|||
|
|
"listing_information AS (\n",
|
|||
|
|
"SELECT \n",
|
|||
|
|
"\tica.id_accommodation,\n",
|
|||
|
|
"\t-- Defaults to 0 if null\n",
|
|||
|
|
"\tCOALESCE(ica.number_of_bedrooms, 0) AS listing_number_of_bedrooms,\n",
|
|||
|
|
"\t-- Defaults to 0 if null\n",
|
|||
|
|
"\tCOALESCE(ica.number_of_bathrooms, 0) AS listing_number_of_bathrooms\n",
|
|||
|
|
"\tFROM intermediate.int_core__accommodation ica \n",
|
|||
|
|
"),\n",
|
|||
|
|
"raw_bookings_checked_in_prior_to_TCR AS (\n",
|
|||
|
|
"\tSELECT\n",
|
|||
|
|
"\t\tb.id_booking,\n",
|
|||
|
|
"\t\t-- Using group by on check-in date to remove booking duplicates\n",
|
|||
|
|
"\t\tb2.booking_check_in_date_utc,\n",
|
|||
|
|
"\t\t-- Using min as a conservative approach to reduce outliers\n",
|
|||
|
|
"\t\tmin(b2.booking_number_of_nights) AS min_booking_number_of_nights\n",
|
|||
|
|
"\tFROM\n",
|
|||
|
|
"\t\tintermediate.int_booking_summary b\n",
|
|||
|
|
"\t-- Note that by joining with BS we're only considering New Dash bookings\n",
|
|||
|
|
"\tLEFT JOIN intermediate.int_booking_summary b2\n",
|
|||
|
|
" ON\n",
|
|||
|
|
"\t\tb2.id_accommodation = b.id_accommodation\n",
|
|||
|
|
"\t\t-- Exclusion based on actual booking creation!\n",
|
|||
|
|
"\t\tAND b2.booking_check_in_date_utc >= b.booking_created_date_utc - INTERVAL '30 days'\n",
|
|||
|
|
"\t\tAND b2.booking_check_in_date_utc < b.booking_created_date_utc\n",
|
|||
|
|
"\t\t-- Note that since is based on TCR we can remove Cancelled\n",
|
|||
|
|
"\t\tAND b2.booking_status NOT IN ('CANCELLED')\n",
|
|||
|
|
"\tGROUP BY\n",
|
|||
|
|
"\t\tb.id_booking,\n",
|
|||
|
|
"\t\tb2.booking_check_in_date_utc\n",
|
|||
|
|
"),\n",
|
|||
|
|
"bookings_checked_in_prior_to_TCR AS (\n",
|
|||
|
|
"\tSELECT\n",
|
|||
|
|
"\t\tid_booking,\n",
|
|||
|
|
"\t\tLEAST(\n",
|
|||
|
|
"\t\t\tcount(booking_check_in_date_utc),\n",
|
|||
|
|
"\t\t\t30\n",
|
|||
|
|
"\t\t) AS listing_check_ins_prior_to_TCR_in_30_days,\n",
|
|||
|
|
"\t\t-- Capping\n",
|
|||
|
|
"\t\tLEAST(\n",
|
|||
|
|
"\t\t\tGREATEST(\n",
|
|||
|
|
"\t\t\t\tsum(min_booking_number_of_nights),\n",
|
|||
|
|
"\t\t\t\t0\n",
|
|||
|
|
"\t\t\t),\n",
|
|||
|
|
"\t\t\t30\n",
|
|||
|
|
"\t\t) AS listing_occupancy_prior_to_TCR_in_30_days\n",
|
|||
|
|
"\tFROM\n",
|
|||
|
|
"\t\traw_bookings_checked_in_prior_to_TCR\n",
|
|||
|
|
"\tGROUP BY\n",
|
|||
|
|
"\t\t1\n",
|
|||
|
|
"),\n",
|
|||
|
|
"raw_known_bookings_checking_in_prior_to_TCI AS (\n",
|
|||
|
|
"\tSELECT\n",
|
|||
|
|
"\t\tb.id_booking,\n",
|
|||
|
|
"\t\tb.booking_check_in_date_utc,\n",
|
|||
|
|
"\t\t-- Using group by on check-in date to remove booking duplicates\n",
|
|||
|
|
"\t\tb2.booking_check_in_date_utc AS other_bookings_check_in_date_utc,\n",
|
|||
|
|
"\t\t-- Using min as a conservative approach to reduce outliers\n",
|
|||
|
|
"\t\tmin(b2.booking_number_of_nights) AS min_booking_number_of_nights\n",
|
|||
|
|
"\tFROM\n",
|
|||
|
|
"\t\tintermediate.int_booking_summary b\n",
|
|||
|
|
"\t-- Note that by joining with BS we're only considering New Dash bookings\n",
|
|||
|
|
"\tLEFT JOIN intermediate.int_booking_summary b2\n",
|
|||
|
|
" ON\n",
|
|||
|
|
"\t\tb2.id_accommodation = b.id_accommodation\n",
|
|||
|
|
"\t\t-- Exclusion based on check-in\n",
|
|||
|
|
"\t\tAND b2.booking_check_in_date_utc >= b.booking_check_in_date_utc - INTERVAL '30 days'\n",
|
|||
|
|
"\t\tAND b2.booking_check_in_date_utc < b.booking_check_in_date_utc\n",
|
|||
|
|
"\t\t-- that are known!\n",
|
|||
|
|
"\t\tAND b2.booking_created_date_utc < b.booking_created_date_utc\n",
|
|||
|
|
"\t\t-- Note that since is based on TCI we cannot remove Cancelled\n",
|
|||
|
|
"\tGROUP BY\n",
|
|||
|
|
"\t\tb.id_booking,\n",
|
|||
|
|
"\t\tb.booking_check_in_date_utc,\n",
|
|||
|
|
"\t\tb2.booking_check_in_date_utc\n",
|
|||
|
|
"),\n",
|
|||
|
|
"known_bookings_checking_in_prior_to_TCI AS (\n",
|
|||
|
|
"\tSELECT\n",
|
|||
|
|
"\t\tid_booking,\n",
|
|||
|
|
"\t\tLEAST(\n",
|
|||
|
|
"\t\t\tcount(other_bookings_check_in_date_utc),\n",
|
|||
|
|
"\t\t\t30\n",
|
|||
|
|
"\t\t) AS listing_known_check_ins_prior_to_TCI_in_30_days,\n",
|
|||
|
|
"\t\t-- Capping\n",
|
|||
|
|
"\t\tLEAST(\n",
|
|||
|
|
"\t\t\tGREATEST(\n",
|
|||
|
|
"\t\t\t\tsum(min_booking_number_of_nights),\n",
|
|||
|
|
"\t\t\t\t0\n",
|
|||
|
|
"\t\t\t),\n",
|
|||
|
|
"\t\t\t30\n",
|
|||
|
|
"\t\t) AS listing_known_occupancy_prior_to_TCI_in_30_days,\n",
|
|||
|
|
"\t\tCOALESCE(\n",
|
|||
|
|
"\t\t\tbooking_check_in_date_utc - max(other_bookings_check_in_date_utc),\n",
|
|||
|
|
"\t\t\t30\n",
|
|||
|
|
"\t\t) AS lead_time_between_prior_known_check_in_to_TCI_30_days\n",
|
|||
|
|
"\tFROM\n",
|
|||
|
|
"\t\traw_known_bookings_checking_in_prior_to_TCI\n",
|
|||
|
|
"\tGROUP BY\n",
|
|||
|
|
"\t\tid_booking, \n",
|
|||
|
|
"\t\tbooking_check_in_date_utc\n",
|
|||
|
|
"),\n",
|
|||
|
|
"incidents_prior_to_TCP AS (\n",
|
|||
|
|
"\tSELECT\n",
|
|||
|
|
"\t\tb.id_booking,\n",
|
|||
|
|
"\t\t-- Using distinct count on check-in date to remove booking duplicates\n",
|
|||
|
|
"\t\tCOUNT(DISTINCT b2.booking_check_in_date_utc) AS listing_incidents_prior_to_TCP_in_30_days\n",
|
|||
|
|
"\tFROM\n",
|
|||
|
|
"\t\tintermediate.int_booking_summary b\n",
|
|||
|
|
"\tLEFT JOIN intermediate.int_booking_summary b2\n",
|
|||
|
|
" ON\n",
|
|||
|
|
"\t\tb2.id_accommodation = b.id_accommodation\n",
|
|||
|
|
"\t\t-- Filter on Check Out date\n",
|
|||
|
|
"\t\tAND b2.booking_completed_date_utc >= b.booking_created_date_utc - INTERVAL '30 days'\n",
|
|||
|
|
"\t\tAND b2.booking_completed_date_utc < b.booking_created_date_utc\n",
|
|||
|
|
"\t\tAND b2.has_resolution_incident = TRUE\n",
|
|||
|
|
"\tGROUP BY\n",
|
|||
|
|
"\t\tb.id_booking\n",
|
|||
|
|
")\n",
|
|||
|
|
"SELECT\n",
|
|||
|
|
"\t-- UNIQUE BOOKING ID --\n",
|
|||
|
|
"\tbooking_summary.id_booking,\n",
|
|||
|
|
"\t\n",
|
|||
|
|
"\t-- CONTEXTUAL SERVICE INFORMATION --\n",
|
|||
|
|
"\t-- We're not including number_of_applied_services as it 1-correlates with upgraded services\n",
|
|||
|
|
"\tbooking_summary.number_of_applied_upgraded_services,\n",
|
|||
|
|
" --Removed! booking_summary.number_of_applied_billable_services,\n",
|
|||
|
|
"\tservice_information.number_of_applied_screening_services,\n",
|
|||
|
|
"\tservice_information.number_of_applied_deposit_management_services,\n",
|
|||
|
|
"\tservice_information.number_of_applied_protection_services,\n",
|
|||
|
|
"\t--Removed! service_information.has_waiver_pro,\n",
|
|||
|
|
"\t--Removed! service_information.has_guest_facing_waiver_or_deposit,\n",
|
|||
|
|
"\t--Removed! service_information.has_guest_agreement,\n",
|
|||
|
|
"\t--Removed! service_information.has_basic_protection,\n",
|
|||
|
|
"\t--Removed! service_information.has_protection_plus,\n",
|
|||
|
|
"\t--Removed! service_information.has_protection_pro,\n",
|
|||
|
|
"\t--Removed! service_information.has_id_verification,\n",
|
|||
|
|
"\t--Removed! service_information.has_screening_plus,\n",
|
|||
|
|
"\t--Removed! service_information.has_sex_offender_check,\n",
|
|||
|
|
"\tNOT booking_summary.has_verification_request AS is_contactless_booking,\n",
|
|||
|
|
"\t\n",
|
|||
|
|
"\t-- CONTEXTUAL LISTING INFORMATION --\n",
|
|||
|
|
"\tlisting_information.listing_number_of_bedrooms,\n",
|
|||
|
|
"\tlisting_information.listing_number_of_bathrooms,\n",
|
|||
|
|
"\t\n",
|
|||
|
|
"\t-- CONTEXTUAL TIMELINE OF OUR BOOKING\n",
|
|||
|
|
"\t-- Defaults to 0 if booking_created_date_utc > booking_check_in_date_utc\n",
|
|||
|
|
"\tGREATEST(booking_summary.booking_check_in_date_utc - booking_summary.booking_created_date_utc, 0) AS booking_lead_time,\n",
|
|||
|
|
"\tbooking_summary.booking_check_out_date_utc - booking_summary.booking_check_in_date_utc AS booking_duration,\n",
|
|||
|
|
"\t\n",
|
|||
|
|
"\t-- SAME-LISTING, OTHER BOOKING INTERACTIONS: PRIOR TO TCR\n",
|
|||
|
|
"\t-- Removed! bookings_checked_in_prior_to_TCR.listing_check_ins_prior_to_TCR_in_30_days,\n",
|
|||
|
|
"\tbookings_checked_in_prior_to_TCR.listing_occupancy_prior_to_TCR_in_30_days,\n",
|
|||
|
|
"\t\n",
|
|||
|
|
"\t-- SAME-LISTING, OTHER BOOKING INTERACTIONS: PRIOR TO TCI (KNOWN)\n",
|
|||
|
|
"\t-- Removed! known_bookings_checking_in_prior_to_TCI.listing_known_check_ins_prior_to_TCI_in_30_days,\n",
|
|||
|
|
"\tknown_bookings_checking_in_prior_to_TCI.listing_known_occupancy_prior_to_TCI_in_30_days,\n",
|
|||
|
|
"\tknown_bookings_checking_in_prior_to_TCI.lead_time_between_prior_known_check_in_to_TCI_30_days,\n",
|
|||
|
|
"\t\n",
|
|||
|
|
"\t-- SAME-LISTING, OTHER BOOKING INTERACTIONS: INCIDENTAL BOOKINGS\n",
|
|||
|
|
"\t-- Removed! incidents_prior_to_TCP.listing_incidents_prior_to_TCP_in_30_days,\n",
|
|||
|
|
"\t\n",
|
|||
|
|
"\t-- TARGET (BOOLEAN) --\n",
|
|||
|
|
"\tbooking_summary.has_resolution_incident\n",
|
|||
|
|
"\n",
|
|||
|
|
"FROM\n",
|
|||
|
|
"\tintermediate.int_booking_summary booking_summary\n",
|
|||
|
|
"LEFT JOIN service_information \n",
|
|||
|
|
"\tON\n",
|
|||
|
|
"\tbooking_summary.id_booking = service_information.id_booking\n",
|
|||
|
|
"LEFT JOIN listing_information \n",
|
|||
|
|
"\tON booking_summary.id_accommodation = listing_information.id_accommodation\n",
|
|||
|
|
"LEFT JOIN bookings_checked_in_prior_to_TCR\n",
|
|||
|
|
"\tON booking_summary.id_booking = bookings_checked_in_prior_to_TCR.id_booking\n",
|
|||
|
|
"LEFT JOIN known_bookings_checking_in_prior_to_TCI\n",
|
|||
|
|
"\tON booking_summary.id_booking = known_bookings_checking_in_prior_to_TCI.id_booking\n",
|
|||
|
|
"LEFT JOIN incidents_prior_to_TCP\n",
|
|||
|
|
"\tON booking_summary.id_booking = incidents_prior_to_TCP.id_booking\n",
|
|||
|
|
"WHERE\n",
|
|||
|
|
"\t-- 1. Bookings from New Dash users with Id Deal\n",
|
|||
|
|
"\tbooking_summary.is_user_in_new_dash = TRUE\n",
|
|||
|
|
"\tAND \n",
|
|||
|
|
" booking_summary.is_missing_id_deal = FALSE\n",
|
|||
|
|
"\tAND\n",
|
|||
|
|
"\t-- 2. Protected Bookings with a Protection or a Deposit Management service\n",
|
|||
|
|
" (\n",
|
|||
|
|
"\t\tbooking_summary.has_protection_service_business_type\n",
|
|||
|
|
"\t\t\tOR \n",
|
|||
|
|
" booking_summary.has_deposit_management_service_business_type\n",
|
|||
|
|
"\t)\n",
|
|||
|
|
"\tAND\n",
|
|||
|
|
"\t-- 3. Bookings with flagging categorisation (this excludes Cancelled/Incomplete/Rejected bookings)\n",
|
|||
|
|
"\tbooking_summary.is_booking_flagged_as_risk IS NOT NULL\n",
|
|||
|
|
"\tAND\n",
|
|||
|
|
"\t-- 4. Booking is completed\n",
|
|||
|
|
"\tbooking_summary.is_booking_past_completion_date = TRUE\n",
|
|||
|
|
"\n",
|
|||
|
|
"\n",
|
|||
|
|
"\"\"\"\n",
|
|||
|
|
"\n",
|
|||
|
|
"# Retrieve Data from Query\n",
|
|||
|
|
"df_extraction = query_to_dataframe(engine=dwh_pg_engine, query=data_extraction_query)\n",
|
|||
|
|
"print(f\"Total Bookings: {len(df_extraction):,}\")\n"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "markdown",
|
|||
|
|
"metadata": {},
|
|||
|
|
"source": [
|
|||
|
|
"## Preprocessing\n",
|
|||
|
|
"Preprocessing in this notebook is quite straightforward: we just drop the booking ID and split the data into features and target."
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"# Drop ID column\n",
|
|||
|
|
"df = df_extraction.copy().drop(columns=['id_booking'])\n",
|
|||
|
|
"\n",
|
|||
|
|
"# Separate features and target\n",
|
|||
|
|
"target_col = 'has_resolution_incident'\n",
|
|||
|
|
"X = df.drop(columns=[target_col])\n",
|
|||
|
|
"y = df[target_col]\n"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "markdown",
|
|||
|
|
"metadata": {},
|
|||
|
|
"source": [
|
|||
|
|
"## Exploratory Data Analysis\n",
|
|||
|
|
"In this section we explore the different features."
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "markdown",
|
|||
|
|
"metadata": {},
|
|||
|
|
"source": [
|
|||
|
|
"### EDA - Dataset Overview"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"name": "stdout",
|
|||
|
|
"output_type": "stream",
|
|||
|
|
"text": [
|
|||
|
|
"Shape: (21384, 12)\n",
|
|||
|
|
"has_resolution_incident\n",
|
|||
|
|
"False 98.8\n",
|
|||
|
|
"True 1.2\n",
|
|||
|
|
"Name: proportion, dtype: float64\n"
|
|||
|
|
]
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"# Shape of the feature matrix\n",
|
|||
|
|
"print(f\"Shape: {X.shape}\")\n",
|
|||
|
|
"\n",
|
|||
|
|
"# Target distribution\n",
|
|||
|
|
"print(round(100*df[target_col].value_counts(normalize=True),2))\n"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"text/html": [
|
|||
|
|
"<div>\n",
|
|||
|
|
"<style scoped>\n",
|
|||
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
|
" vertical-align: middle;\n",
|
|||
|
|
" }\n",
|
|||
|
|
"\n",
|
|||
|
|
" .dataframe tbody tr th {\n",
|
|||
|
|
" vertical-align: top;\n",
|
|||
|
|
" }\n",
|
|||
|
|
"\n",
|
|||
|
|
" .dataframe thead th {\n",
|
|||
|
|
" text-align: right;\n",
|
|||
|
|
" }\n",
|
|||
|
|
"</style>\n",
|
|||
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
|
" <thead>\n",
|
|||
|
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
|
" <th></th>\n",
|
|||
|
|
" <th>count</th>\n",
|
|||
|
|
" <th>mean</th>\n",
|
|||
|
|
" <th>std</th>\n",
|
|||
|
|
" <th>min</th>\n",
|
|||
|
|
" <th>5%</th>\n",
|
|||
|
|
" <th>25%</th>\n",
|
|||
|
|
" <th>50%</th>\n",
|
|||
|
|
" <th>75%</th>\n",
|
|||
|
|
" <th>95%</th>\n",
|
|||
|
|
" <th>99%</th>\n",
|
|||
|
|
" <th>max</th>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" </thead>\n",
|
|||
|
|
" <tbody>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>number_of_applied_upgraded_services</th>\n",
|
|||
|
|
" <td>21384.0</td>\n",
|
|||
|
|
" <td>2.664282</td>\n",
|
|||
|
|
" <td>1.532038</td>\n",
|
|||
|
|
" <td>1.0</td>\n",
|
|||
|
|
" <td>1.0</td>\n",
|
|||
|
|
" <td>1.0</td>\n",
|
|||
|
|
" <td>2.0</td>\n",
|
|||
|
|
" <td>4.0</td>\n",
|
|||
|
|
" <td>5.0</td>\n",
|
|||
|
|
" <td>6.0</td>\n",
|
|||
|
|
" <td>7.0</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>number_of_applied_screening_services</th>\n",
|
|||
|
|
" <td>21384.0</td>\n",
|
|||
|
|
" <td>2.007903</td>\n",
|
|||
|
|
" <td>0.985649</td>\n",
|
|||
|
|
" <td>1.0</td>\n",
|
|||
|
|
" <td>1.0</td>\n",
|
|||
|
|
" <td>1.0</td>\n",
|
|||
|
|
" <td>2.0</td>\n",
|
|||
|
|
" <td>3.0</td>\n",
|
|||
|
|
" <td>4.0</td>\n",
|
|||
|
|
" <td>4.0</td>\n",
|
|||
|
|
" <td>4.0</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>number_of_applied_deposit_management_services</th>\n",
|
|||
|
|
" <td>21384.0</td>\n",
|
|||
|
|
" <td>0.620651</td>\n",
|
|||
|
|
" <td>0.485814</td>\n",
|
|||
|
|
" <td>0.0</td>\n",
|
|||
|
|
" <td>0.0</td>\n",
|
|||
|
|
" <td>0.0</td>\n",
|
|||
|
|
" <td>1.0</td>\n",
|
|||
|
|
" <td>1.0</td>\n",
|
|||
|
|
" <td>1.0</td>\n",
|
|||
|
|
" <td>1.0</td>\n",
|
|||
|
|
" <td>2.0</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>number_of_applied_protection_services</th>\n",
|
|||
|
|
" <td>21384.0</td>\n",
|
|||
|
|
" <td>0.727132</td>\n",
|
|||
|
|
" <td>0.445444</td>\n",
|
|||
|
|
" <td>0.0</td>\n",
|
|||
|
|
" <td>0.0</td>\n",
|
|||
|
|
" <td>0.0</td>\n",
|
|||
|
|
" <td>1.0</td>\n",
|
|||
|
|
" <td>1.0</td>\n",
|
|||
|
|
" <td>1.0</td>\n",
|
|||
|
|
" <td>1.0</td>\n",
|
|||
|
|
" <td>1.0</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>listing_number_of_bedrooms</th>\n",
|
|||
|
|
" <td>21384.0</td>\n",
|
|||
|
|
" <td>2.049476</td>\n",
|
|||
|
|
" <td>1.755499</td>\n",
|
|||
|
|
" <td>0.0</td>\n",
|
|||
|
|
" <td>0.0</td>\n",
|
|||
|
|
" <td>1.0</td>\n",
|
|||
|
|
" <td>2.0</td>\n",
|
|||
|
|
" <td>3.0</td>\n",
|
|||
|
|
" <td>5.0</td>\n",
|
|||
|
|
" <td>8.0</td>\n",
|
|||
|
|
" <td>15.0</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>listing_number_of_bathrooms</th>\n",
|
|||
|
|
" <td>21384.0</td>\n",
|
|||
|
|
" <td>1.590816</td>\n",
|
|||
|
|
" <td>1.312573</td>\n",
|
|||
|
|
" <td>0.0</td>\n",
|
|||
|
|
" <td>0.0</td>\n",
|
|||
|
|
" <td>1.0</td>\n",
|
|||
|
|
" <td>1.0</td>\n",
|
|||
|
|
" <td>2.0</td>\n",
|
|||
|
|
" <td>4.0</td>\n",
|
|||
|
|
" <td>6.0</td>\n",
|
|||
|
|
" <td>17.0</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>booking_lead_time</th>\n",
|
|||
|
|
" <td>21384.0</td>\n",
|
|||
|
|
" <td>18.151422</td>\n",
|
|||
|
|
" <td>24.349579</td>\n",
|
|||
|
|
" <td>0.0</td>\n",
|
|||
|
|
" <td>0.0</td>\n",
|
|||
|
|
" <td>2.0</td>\n",
|
|||
|
|
" <td>9.0</td>\n",
|
|||
|
|
" <td>25.0</td>\n",
|
|||
|
|
" <td>69.0</td>\n",
|
|||
|
|
" <td>113.0</td>\n",
|
|||
|
|
" <td>220.0</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>booking_duration</th>\n",
|
|||
|
|
" <td>21384.0</td>\n",
|
|||
|
|
" <td>4.175084</td>\n",
|
|||
|
|
" <td>4.851055</td>\n",
|
|||
|
|
" <td>0.0</td>\n",
|
|||
|
|
" <td>1.0</td>\n",
|
|||
|
|
" <td>2.0</td>\n",
|
|||
|
|
" <td>3.0</td>\n",
|
|||
|
|
" <td>5.0</td>\n",
|
|||
|
|
" <td>10.0</td>\n",
|
|||
|
|
" <td>28.0</td>\n",
|
|||
|
|
" <td>116.0</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>listing_occupancy_prior_to_tcr_in_30_days</th>\n",
|
|||
|
|
" <td>21384.0</td>\n",
|
|||
|
|
" <td>8.780817</td>\n",
|
|||
|
|
" <td>9.260855</td>\n",
|
|||
|
|
" <td>0.0</td>\n",
|
|||
|
|
" <td>0.0</td>\n",
|
|||
|
|
" <td>0.0</td>\n",
|
|||
|
|
" <td>6.0</td>\n",
|
|||
|
|
" <td>16.0</td>\n",
|
|||
|
|
" <td>27.0</td>\n",
|
|||
|
|
" <td>30.0</td>\n",
|
|||
|
|
" <td>30.0</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>listing_known_occupancy_prior_to_tci_in_30_days</th>\n",
|
|||
|
|
" <td>21384.0</td>\n",
|
|||
|
|
" <td>9.470913</td>\n",
|
|||
|
|
" <td>9.715511</td>\n",
|
|||
|
|
" <td>0.0</td>\n",
|
|||
|
|
" <td>0.0</td>\n",
|
|||
|
|
" <td>0.0</td>\n",
|
|||
|
|
" <td>6.0</td>\n",
|
|||
|
|
" <td>17.0</td>\n",
|
|||
|
|
" <td>30.0</td>\n",
|
|||
|
|
" <td>30.0</td>\n",
|
|||
|
|
" <td>30.0</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>lead_time_between_prior_known_check_in_to_tci_30_days</th>\n",
|
|||
|
|
" <td>21384.0</td>\n",
|
|||
|
|
" <td>15.287318</td>\n",
|
|||
|
|
" <td>11.424657</td>\n",
|
|||
|
|
" <td>1.0</td>\n",
|
|||
|
|
" <td>2.0</td>\n",
|
|||
|
|
" <td>5.0</td>\n",
|
|||
|
|
" <td>11.0</td>\n",
|
|||
|
|
" <td>30.0</td>\n",
|
|||
|
|
" <td>30.0</td>\n",
|
|||
|
|
" <td>30.0</td>\n",
|
|||
|
|
" <td>30.0</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" </tbody>\n",
|
|||
|
|
"</table>\n",
|
|||
|
|
"</div>"
|
|||
|
|
],
|
|||
|
|
"text/plain": [
|
|||
|
|
" count mean \\\n",
|
|||
|
|
"number_of_applied_upgraded_services 21384.0 2.664282 \n",
|
|||
|
|
"number_of_applied_screening_services 21384.0 2.007903 \n",
|
|||
|
|
"number_of_applied_deposit_management_services 21384.0 0.620651 \n",
|
|||
|
|
"number_of_applied_protection_services 21384.0 0.727132 \n",
|
|||
|
|
"listing_number_of_bedrooms 21384.0 2.049476 \n",
|
|||
|
|
"listing_number_of_bathrooms 21384.0 1.590816 \n",
|
|||
|
|
"booking_lead_time 21384.0 18.151422 \n",
|
|||
|
|
"booking_duration 21384.0 4.175084 \n",
|
|||
|
|
"listing_occupancy_prior_to_tcr_in_30_days 21384.0 8.780817 \n",
|
|||
|
|
"listing_known_occupancy_prior_to_tci_in_30_days 21384.0 9.470913 \n",
|
|||
|
|
"lead_time_between_prior_known_check_in_to_tci_3... 21384.0 15.287318 \n",
|
|||
|
|
"\n",
|
|||
|
|
" std min 5% 25% \\\n",
|
|||
|
|
"number_of_applied_upgraded_services 1.532038 1.0 1.0 1.0 \n",
|
|||
|
|
"number_of_applied_screening_services 0.985649 1.0 1.0 1.0 \n",
|
|||
|
|
"number_of_applied_deposit_management_services 0.485814 0.0 0.0 0.0 \n",
|
|||
|
|
"number_of_applied_protection_services 0.445444 0.0 0.0 0.0 \n",
|
|||
|
|
"listing_number_of_bedrooms 1.755499 0.0 0.0 1.0 \n",
|
|||
|
|
"listing_number_of_bathrooms 1.312573 0.0 0.0 1.0 \n",
|
|||
|
|
"booking_lead_time 24.349579 0.0 0.0 2.0 \n",
|
|||
|
|
"booking_duration 4.851055 0.0 1.0 2.0 \n",
|
|||
|
|
"listing_occupancy_prior_to_tcr_in_30_days 9.260855 0.0 0.0 0.0 \n",
|
|||
|
|
"listing_known_occupancy_prior_to_tci_in_30_days 9.715511 0.0 0.0 0.0 \n",
|
|||
|
|
"lead_time_between_prior_known_check_in_to_tci_3... 11.424657 1.0 2.0 5.0 \n",
|
|||
|
|
"\n",
|
|||
|
|
" 50% 75% 95% 99% \\\n",
|
|||
|
|
"number_of_applied_upgraded_services 2.0 4.0 5.0 6.0 \n",
|
|||
|
|
"number_of_applied_screening_services 2.0 3.0 4.0 4.0 \n",
|
|||
|
|
"number_of_applied_deposit_management_services 1.0 1.0 1.0 1.0 \n",
|
|||
|
|
"number_of_applied_protection_services 1.0 1.0 1.0 1.0 \n",
|
|||
|
|
"listing_number_of_bedrooms 2.0 3.0 5.0 8.0 \n",
|
|||
|
|
"listing_number_of_bathrooms 1.0 2.0 4.0 6.0 \n",
|
|||
|
|
"booking_lead_time 9.0 25.0 69.0 113.0 \n",
|
|||
|
|
"booking_duration 3.0 5.0 10.0 28.0 \n",
|
|||
|
|
"listing_occupancy_prior_to_tcr_in_30_days 6.0 16.0 27.0 30.0 \n",
|
|||
|
|
"listing_known_occupancy_prior_to_tci_in_30_days 6.0 17.0 30.0 30.0 \n",
|
|||
|
|
"lead_time_between_prior_known_check_in_to_tci_3... 11.0 30.0 30.0 30.0 \n",
|
|||
|
|
"\n",
|
|||
|
|
" max \n",
|
|||
|
|
"number_of_applied_upgraded_services 7.0 \n",
|
|||
|
|
"number_of_applied_screening_services 4.0 \n",
|
|||
|
|
"number_of_applied_deposit_management_services 2.0 \n",
|
|||
|
|
"number_of_applied_protection_services 1.0 \n",
|
|||
|
|
"listing_number_of_bedrooms 15.0 \n",
|
|||
|
|
"listing_number_of_bathrooms 17.0 \n",
|
|||
|
|
"booking_lead_time 220.0 \n",
|
|||
|
|
"booking_duration 116.0 \n",
|
|||
|
|
"listing_occupancy_prior_to_tcr_in_30_days 30.0 \n",
|
|||
|
|
"listing_known_occupancy_prior_to_tci_in_30_days 30.0 \n",
|
|||
|
|
"lead_time_between_prior_known_check_in_to_tci_3... 30.0 "
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"metadata": {},
|
|||
|
|
"output_type": "display_data"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"text/html": [
|
|||
|
|
"<div>\n",
|
|||
|
|
"<style scoped>\n",
|
|||
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
|
" vertical-align: middle;\n",
|
|||
|
|
" }\n",
|
|||
|
|
"\n",
|
|||
|
|
" .dataframe tbody tr th {\n",
|
|||
|
|
" vertical-align: top;\n",
|
|||
|
|
" }\n",
|
|||
|
|
"\n",
|
|||
|
|
" .dataframe thead th {\n",
|
|||
|
|
" text-align: right;\n",
|
|||
|
|
" }\n",
|
|||
|
|
"</style>\n",
|
|||
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
|
" <thead>\n",
|
|||
|
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
|
" <th></th>\n",
|
|||
|
|
" <th>count</th>\n",
|
|||
|
|
" <th>unique</th>\n",
|
|||
|
|
" <th>top</th>\n",
|
|||
|
|
" <th>freq</th>\n",
|
|||
|
|
" <th>freq/count</th>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" </thead>\n",
|
|||
|
|
" <tbody>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>is_contactless_booking</th>\n",
|
|||
|
|
" <td>21384</td>\n",
|
|||
|
|
" <td>2</td>\n",
|
|||
|
|
" <td>False</td>\n",
|
|||
|
|
" <td>13185</td>\n",
|
|||
|
|
" <td>0.616582</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>has_resolution_incident</th>\n",
|
|||
|
|
" <td>21384</td>\n",
|
|||
|
|
" <td>2</td>\n",
|
|||
|
|
" <td>False</td>\n",
|
|||
|
|
" <td>21127</td>\n",
|
|||
|
|
" <td>0.987982</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" </tbody>\n",
|
|||
|
|
"</table>\n",
|
|||
|
|
"</div>"
|
|||
|
|
],
|
|||
|
|
"text/plain": [
|
|||
|
|
" count unique top freq freq/count\n",
|
|||
|
|
"is_contactless_booking 21384 2 False 13185 0.616582\n",
|
|||
|
|
"has_resolution_incident 21384 2 False 21127 0.987982"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"metadata": {},
|
|||
|
|
"output_type": "display_data"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"# Summary statistics for numerical features\n",
|
|||
|
|
"display(df.describe(include= ['number'], percentiles=[.05,.25,.5,.75,.95,.99]).T)\n",
|
|||
|
|
"# Summary statistics for boolean features\n",
|
|||
|
|
"summary = df.describe(include= ['bool']).T\n",
|
|||
|
|
"summary['freq/count'] = summary['freq']/summary['count']\n",
|
|||
|
|
"display(summary)"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABJcAAAQwCAYAAABYEL++AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzddXQUVxvA4d8mhLgH4sQF10CCa3FKcWnxIiW4FwvQ4g6lxaEUd3cILe4OAYIESwJRosS+PwILS4QQlqbt9z7n7DnZmffeeWd2MsneufeOIi0tLQ0hhBBCCCGEEEIIIXJBI68TEEIIIYQQQgghhBD/XtK4JIQQQgghhBBCCCFyTRqXhBBCCCGEEEIIIUSuSeOSEEIIIYQQQgghhMg1aVwSQgghhBBCCCGEELkmjUtCCCGEEEIIIYQQItekcUkIIYQQQgghhBBC5Jo0LgkhhBBCCCGEEEKIXJPGJSGEEEIIIYQQQgiRa9K4JIQQQggh/u+sWLEChULBw4cP1Vbnw4cPUSgUrFixQm11/ttVr16d6tWr53UaQgghvjBpXBJCCCGEEGoRGBhIjx49cHZ2RkdHByMjIypVqsScOXOIj4/P6/TUZs2aNcyePTuv01DRqVMnFAoFRkZGmR7ru3fvolAoUCgUTJ8+/ZPrf/bsGX5+fly+fFkN2QohhPivyZfXCQghhBBCiH+/3bt307JlS7S1tenQoQPFihXj9evXHD9+nCFDhnDjxg0WLVqU12mqxZo1a7h+/Tr9+/dXWe7g4EB8fDxaWlp5kle+fPmIi4tj586dtGrVSmXd6tWr0dHRISEhIVd1P3v2jHHjxuHo6EipUqVyXO7AgQO52p4QQoh/F2lcEkIIIYQQn+XBgwe0adMGBwcHjhw5grW1tXJd7969uXfvHrt37/7s7aSlpZGQkICurm6GdQkJCeTPnx8NjbzrmK9QKNDR0cmz7Wtra1OpUiXWrl2boXFpzZo1NGzYkM2bN/8tucTFxaGnp0f+/Pn/lu0JIYTIWzIsTgghhBBCfJapU6cSExPD0qVLVRqW3nJ1daVfv37K98nJyUyYMAEXFxe0tbVxdHTkxx9/JDExUaWco6MjjRo1Yv/+/ZQrVw5dXV0WLlyIv78/CoWCdevWMWrUKGxtbdHT0yM6OhqAM2fOUK9ePYyNjdHT06NatWqcOHHio/uxfft2GjZsiI2NDdra2ri4uDBhwgRSUlKUMdWrV2f37t08evRIOczM0dERyHrOpSNHjlClShX09fUxMTHh66+/5tatWyoxfn5+KBQK7t27R6dOnTAxMcHY2JjOnTsTFxf30dzfateuHXv37iUyMlK57Ny5c9y9e5d27dpliA8PD2fw4MEUL14cAwMDjIyMqF+/PleuXFHG+Pv74+XlBUDnzp2V+/12P6tXr06xYsW4cOECVatWRU9Pjx9//FG57v05lzp27IiOjk6G/a9bty6mpqY8e/Ysx/sqhBDin0N6LgkhhBBCiM+yc+dOnJ2dqVixYo7iu3XrxsqVK2nRogWDBg3izJkzTJo0iVu3brF161aV2ICAANq2bUuPHj34/vvv8fDwUK6bMGEC+fPnZ/DgwSQmJpI/f36OHDlC/fr1KVu2LGPHjkVDQ4Ply5dTs2ZN/vrrL8qXL59lXitWrMDAwICBAwdiYGDAkSNHGDNmDNHR0UybNg2AkSNHEhUVxZMnT5g1axYABgYGWdZ56NAh6tevj7OzM35+fsTHxzNv3jwqVarExYsXlQ1Tb7Vq1QonJycmTZrExYsXWbJkCQULFmTKlCk5OrbNmjWjZ8+ebNmyhS5dugDpvZY8PT0pU6ZMhvj79++zbds2WrZsiZOTEyEhISxcuJBq1apx8+ZNbGxsKFy4MOPHj2fMmDF0796dKlWqAKh83mFhYdSvX582bdrw7bffYmlpmWl+c+bM4ciRI3Ts2JFTp06hqanJwoULOXDgAKtWrcLGxiZH+ymEEOIfJk
0IIYQQQohcioqKSgPSvv766xzFX758OQ1I69atm8rywYMHpwFpR44cUS5zcHBIA9L27dunEnv06NE0IM3Z2TktLi5OuTw1NTXNzc0trW7dummpqanK5XFxcWlOTk5pderUUS5bvnx5GpD24MEDlbgP9ejRI01PTy8tISFBuaxhw4ZpDg4OGWIfPHiQBqQtX75cuaxUqVJpBQsWTAsLC1Muu3LlSpqGhkZahw4dlMvGjh2bBqR16dJFpc5vvvkmzdzcPMO2PtSxY8c0fX39tLS0tLQWLVqk1apVKy0tLS0tJSUlzcrKKm3cuHHK/KZNm6Ysl5CQkJaSkpJhP7S1tdPGjx+vXHbu3LkM+/ZWtWrV0oC03377LdN11apVU1m2f//+NCDtp59+Srt//36agYFBWtOmTT+6j0IIIf65ZFicEEIIIYTItbdD0QwNDXMUv2fPHgAGDhyosnzQoEEAGeZmcnJyom7dupnW1bFjR5X5ly5fvqwc/hUWFsbLly95+fIlsbGx1KpViz///JPU1NQsc3u/rlevXvHy5UuqVKlCXFwct2/fztH+ve/58+dcvnyZTp06YWZmplxeokQJ6tSpozwW7+vZs6fK+ypVqhAWFqY8zjnRrl07/P39CQ4O5siRIwQHB2c6JA7S52l6O09VSkoKYWFhGBgY4OHhwcWLF3O8TW1tbTp37pyj2K+++ooePXowfvx4mjVrho6ODgsXLszxtoQQQvzzyLA4IYQQQgiRa0ZGRkB6Y0xOPHr0CA0NDVxdXVWWW1lZYWJiwqNHj1SWOzk5ZVnXh+vu3r0LpDc6ZSUqKgpTU9NM1924cYNRo0Zx5MiRDI05UVFRWdaZlbf78v5QvrcKFy7M/v37iY2NRV9fX7m8UKFCKnFvc42IiFAe649p0KABhoaGrF+/nsuXL+Pl5YWrqysPHz7MEJuamsqcOXNYsGABDx48UJlfytzcPEfbA7C1tf2kybunT5/O9u3buXz5MmvWrKFgwYI5LiuEEOKfRxqXhBBCCCFErhkZGWFjY8P169c/qZxCochRXGZPhstq3dteSdOmTaNUqVKZlslqfqTIyEiqVauGkZER48ePx8XFBR0dHS5evMiwYcOy7fGkTpqampkuT0tLy3Ed2traNGvWjJUrV3L//n38/PyyjJ04cSKjR4+mS5cuTJgwATMzMzQ0NOjfv/8n7XN2n1NmLl26RGhoKADXrl2jbdu2n1ReCCHEP4s0LgkhhBBCiM/SqFEjFi1axKlTp/Dx8ck21sHBgdTUVO7evUvhwoWVy0NCQoiMjMTBwSHXebi4uADpDV61a9f+pLL+/v6EhYWxZcsWqlatqlz+4MGDDLE5bRh7uy8BAQEZ1t2+fRsLCwuVXkvq1K5dO5YtW4aGhgZt2rTJMm7Tpk3UqFGDpUuXqiyPjIzEwsJC+T6n+5wTsbGxdO7cmSJFilCxYkWmTp3KN998o3winRBCiH8fmXNJCCGEEEJ8lqFDh6Kvr0+3bt0ICQnJsD4wMJA5c+YA6UO2AGbPnq0SM3PmTAAaNmyY6zzKli2Li4sL06dPJyYmJsP6Fy9eZFn2bY+h93sIvX79mgULFmSI1dfXz9EwOWtra0qVKsXKlSuJjIxULr9+/ToHDhxQHosvoUaNGkyYMIH58+djZWWVZZympmaGXlEbN27k6dOnKsveNoK9vx+5NWzYMIKCgli5ciUzZ87E0dGRjh07kpiY+Nl1CyGEyBvSc0kIIYQQQnwWFxcX1qxZQ+vWrSlcuDAdOnSgWLFivH79mpMnT7Jx40Y6deoEQMmSJenYsSOLFi1SDkU7e/YsK1eupGnTptSoUSPXeWhoaLBkyRLq169P0aJF6dy5M7a2tjx9+pSjR49iZGTEzp07My1bsWJFTE1N6dixI3379kWhULBq1apMh6OVLVuW9evXM3DgQLy8vDAwMKBx48aZ1jtt2jTq16+Pj48PXbt2JT4+nnnz5mFsbJztcLXPpaGhwahRoz
4a16hRI8aPH0/nzp2pWLEi165dY/Xq1Tg7O6vEubi4YGJiwm+//YahoSH6+vpUqFAh2zmxMnPkyBEWLFjA2LFjKVO
|
|||
|
|
"text/plain": [
|
|||
|
|
"<Figure size 1000x800 with 2 Axes>"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"metadata": {},
|
|||
|
|
"output_type": "display_data"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"# Correlation heatmap\n",
|
|||
|
|
"plt.figure(figsize=(10, 8))\n",
|
|||
|
|
"cmap = sns.diverging_palette(220, 20, as_cmap=True)\n",
|
|||
|
|
"sns.heatmap(df.corr(), annot=True, cmap=cmap, fmt=\".2f\", linewidths=.5,)\n",
|
|||
|
|
"plt.title(\"Correlation Matrix\")\n",
|
|||
|
|
"plt.show()"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "markdown",
|
|||
|
|
"metadata": {},
|
|||
|
|
"source": [
|
|||
|
|
"## Processing for modelling\n",
|
|||
|
|
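"Afterwards, we split the dataset into training and test sets and display their sizes and target distributions. The split is random rather than stratified, so with a positive class of only about 1% the positive rate can differ slightly between the two sets."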
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"name": "stdout",
|
|||
|
|
"output_type": "stream",
|
|||
|
|
"text": [
|
|||
|
|
"Training set size: 14968 rows\n",
|
|||
|
|
"Test set size: 6416 rows\n",
|
|||
|
|
"\n",
|
|||
|
|
"Training target distribution:\n",
|
|||
|
|
"has_resolution_incident\n",
|
|||
|
|
"False 0.98744\n",
|
|||
|
|
"True 0.01256\n",
|
|||
|
|
"Name: proportion, dtype: float64\n",
|
|||
|
|
"\n",
|
|||
|
|
"Test target distribution:\n",
|
|||
|
|
"has_resolution_incident\n",
|
|||
|
|
"False 0.989246\n",
|
|||
|
|
"True 0.010754\n",
|
|||
|
|
"Name: proportion, dtype: float64\n"
|
|||
|
|
]
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"# Split the data\n",
|
|||
|
|
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(f\"Training set size: {X_train.shape[0]} rows\")\n",
|
|||
|
|
"print(f\"Test set size: {X_test.shape[0]} rows\")\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(\"\\nTraining target distribution:\")\n",
|
|||
|
|
"print(y_train.value_counts(normalize=True))\n",
|
|||
|
|
"\n",
|
|||
|
|
"print(\"\\nTest target distribution:\")\n",
|
|||
|
|
"print(y_test.value_counts(normalize=True))"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "markdown",
|
|||
|
|
"id": "d36c9276",
|
|||
|
|
"metadata": {},
|
|||
|
|
"source": [
|
|||
|
|
"## Classification Model with Random Forest\n",
|
|||
|
|
"\n",
|
|||
|
|
"We define a machine learning pipeline that includes:\n",
|
|||
|
|
"- **Scaling numeric features** with `StandardScaler`\n",
|
|||
|
|
"- **Training a Random Forest classifier** with balanced class weights to handle the imbalanced dataset\n",
|
|||
|
|
"\n",
|
|||
|
|
"We then use `GridSearchCV` to perform a **grid search with cross-validation** over a range of key hyperparameters (e.g., number of trees, max depth). \n",
|
|||
|
|
"The model is evaluated using **Average Precision**, which is better suited to imbalanced classification tasks than plain accuracy.\n",
|
|||
|
|
"\n",
|
|||
|
|
"The best combination of parameters is selected, and the resulting model is used to make predictions on the test set.\n"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"id": "943ef7d6",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"name": "stdout",
|
|||
|
|
"output_type": "stream",
|
|||
|
|
"text": [
|
|||
|
|
"Fitting 4 folds for each of 72 candidates, totalling 288 fits\n"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "stdout",
|
|||
|
|
"output_type": "stream",
|
|||
|
|
"text": [
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100; total time= 1.9s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100; total time= 1.9s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100; total time= 1.9s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=100; total time= 1.8s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=100; total time= 1.9s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200; total time= 3.5s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200; total time= 3.7s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200; total time= 4.2s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200; total time= 4.2s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=200; total time= 3.4s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=200; total time= 3.5s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=200; total time= 3.7s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=200; total time= 3.8s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=300; total time= 5.7s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=100; total time= 4.0s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=100; total time= 4.1s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=300; total time= 6.1s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=300; total time= 6.3s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100; total time= 4.1s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100; total time= 2.0s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100; total time= 1.7s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100; total time= 1.7s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100; total time= 1.7s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=300; total time= 6.2s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=300; total time= 5.6s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=100; total time= 2.1s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=100; total time= 2.2s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=100; total time= 2.3s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=200; total time= 4.4s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=200; total time= 4.6s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=300; total time= 6.6s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=300; total time= 6.1s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=100; total time= 2.2s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=200; total time= 6.3s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100; total time= 1.8s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=300; total time= 10.5s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=200; total time= 6.1s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=300; total time= 6.1s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=300; total time= 6.2s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=200; total time= 3.5s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=200; total time= 3.5s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=200; total time= 3.7s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=300; total time= 6.6s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=200; total time= 3.5s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100; total time= 1.8s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100; total time= 2.6s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100; total time= 2.7s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=300; total time= 5.2s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=300; total time= 8.9s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=300; total time= 5.6s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200; total time= 3.4s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=100; total time= 2.0s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=300; total time= 6.1s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=300; total time= 7.0s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200; total time= 4.8s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200; total time= 4.9s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=100; total time= 2.1s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=100; total time= 2.7s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=100; total time= 2.5s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=300; total time= 5.2s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200; total time= 5.4s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=300; total time= 5.3s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=300; total time= 5.1s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=300; total time= 6.2s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100; total time= 1.8s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=200; total time= 3.4s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=200; total time= 3.8s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100; total time= 1.7s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=200; total time= 3.8s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100; total time= 2.3s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100; total time= 2.6s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=200; total time= 4.9s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=200; total time= 3.2s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=200; total time= 3.3s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=100; total time= 1.9s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=200; total time= 3.5s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=300; total time= 6.2s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=100; total time= 3.0s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=200; total time= 5.0s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=300; total time= 5.8s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=100; total time= 3.6s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=300; total time= 6.5s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=300; total time= 6.5s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=100; total time= 3.8s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=300; total time= 5.5s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=300; total time= 5.6s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=300; total time= 6.1s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=200; total time= 4.1s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=200; total time= 4.1s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=300; total time= 6.1s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100; total time= 2.0s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100; total time= 2.0s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=200; total time= 4.3s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=200; total time= 4.3s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100; total time= 3.2s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100; total time= 3.6s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=100; total time= 1.9s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200; total time= 3.6s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=100; total time= 1.8s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200; total time= 3.6s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200; total time= 3.5s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200; total time= 3.6s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=300; total time= 7.0s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=300; total time= 6.9s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=300; total time= 5.1s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=300; total time= 5.2s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=100; total time= 4.0s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=200; total time= 3.5s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=300; total time= 5.7s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=100; total time= 3.9s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=300; total time= 5.8s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=200; total time= 3.2s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=200; total time= 3.4s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=200; total time= 3.3s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100; total time= 1.7s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100; total time= 1.8s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100; total time= 1.8s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=300; total time= 4.9s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=300; total time= 4.9s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=300; total time= 11.0s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100; total time= 3.0s\n",
|
|||
|
|
"[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=300; total time= 11.1s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=300; total time= 5.1s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=100; total time= 1.7s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=200; total time= 3.5s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=300; total time= 5.0s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=200; total time= 3.3s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=200; total time= 3.5s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=100; total time= 1.6s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=100; total time= 1.7s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=100; total time= 3.1s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=300; total time= 5.3s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100; total time= 2.0s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=300; total time= 5.1s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=300; total time= 5.2s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100; total time= 2.0s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=200; total time= 6.1s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=300; total time= 5.3s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=200; total time= 3.7s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=200; total time= 4.0s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100; total time= 1.9s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100; total time= 2.4s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=200; total time= 5.5s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=200; total time= 5.4s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=300; total time= 5.3s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=300; total time= 5.7s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=300; total time= 5.7s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200; total time= 3.3s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200; total time= 3.4s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=100; total time= 1.7s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200; total time= 3.5s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200; total time= 3.4s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=300; total time= 5.8s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=100; total time= 2.1s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=100; total time= 2.1s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=100; total time= 2.2s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100; total time= 1.7s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=300; total time= 5.0s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100; total time= 1.9s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=300; total time= 4.9s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=200; total time= 3.2s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100; total time= 1.9s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100; total time= 1.9s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=300; total time= 5.1s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=300; total time= 6.2s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=200; total time= 3.1s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=200; total time= 3.3s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=200; total time= 3.3s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=100; total time= 1.5s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=300; total time= 4.7s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=300; total time= 4.7s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=300; total time= 4.8s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=100; total time= 1.6s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=100; total time= 1.7s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=300; total time= 4.8s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=100; total time= 2.0s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=200; total time= 3.1s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=200; total time= 3.4s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=200; total time= 3.2s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=200; total time= 4.3s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100; total time= 1.9s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100; total time= 2.2s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=300; total time= 4.9s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100; total time= 1.9s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=200; total time= 3.4s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=200; total time= 3.5s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=200; total time= 3.4s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=200; total time= 3.4s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=300; total time= 5.3s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100; total time= 3.0s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=300; total time= 7.1s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=300; total time= 7.1s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=300; total time= 5.1s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=300; total time= 5.3s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=100; total time= 1.9s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=300; total time= 5.3s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200; total time= 3.8s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200; total time= 3.7s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200; total time= 3.6s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200; total time= 3.8s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=100; total time= 2.1s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=100; total time= 2.1s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=100; total time= 2.1s\n",
|
|||
|
|
"[CV] END model__max_depth=10, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=300; total time= 7.3s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=300; total time= 5.3s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=300; total time= 5.2s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=200; total time= 3.3s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=200; total time= 3.3s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=300; total time= 5.6s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=200; total time= 3.5s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=200; total time= 3.6s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=300; total time= 5.5s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100; total time= 2.4s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100; total time= 2.4s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100; total time= 2.3s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100; total time= 2.2s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=300; total time= 5.7s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=300; total time= 6.1s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=300; total time= 5.9s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=200; total time= 4.2s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=200; total time= 4.3s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=200; total time= 4.2s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=200; total time= 4.2s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=300; total time= 6.1s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=100; total time= 3.5s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=100; total time= 3.8s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=100; total time= 4.0s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=100; total time= 3.9s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=300; total time= 5.9s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=300; total time= 6.1s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=300; total time= 6.0s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=300; total time= 6.1s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=200; total time= 3.5s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=200; total time= 3.6s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=200; total time= 3.9s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100; total time= 3.1s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100; total time= 2.9s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100; total time= 2.7s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=200; total time= 3.6s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100; total time= 3.2s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=300; total time= 5.3s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=100; total time= 1.9s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=300; total time= 5.4s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=300; total time= 5.5s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200; total time= 3.5s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=300; total time= 5.5s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200; total time= 3.7s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200; total time= 3.8s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200; total time= 3.6s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=100; total time= 2.4s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=100; total time= 2.4s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=100; total time= 2.2s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100; total time= 2.2s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100; total time= 2.1s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=300; total time= 5.5s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=300; total time= 5.8s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=300; total time= 5.8s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=200; total time= 3.8s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100; total time= 3.4s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=200; total time= 4.0s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100; total time= 3.4s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=200; total time= 3.9s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=200; total time= 4.0s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=100; total time= 1.7s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=300; total time= 5.5s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=300; total time= 7.9s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=300; total time= 5.6s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=100; total time= 1.7s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=100; total time= 2.0s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=300; total time= 5.5s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=200; total time= 3.4s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=200; total time= 3.3s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=200; total time= 3.4s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=200; total time= 3.3s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=100; total time= 1.7s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=300; total time= 7.3s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=300; total time= 4.7s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=300; total time= 4.6s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=300; total time= 4.6s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=200; total time= 2.9s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=200; total time= 3.0s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=200; total time= 2.9s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=300; total time= 4.9s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=200; total time= 3.0s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=300; total time= 3.6s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=300; total time= 3.6s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=300; total time= 3.8s\n",
|
|||
|
|
"[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=300; total time= 3.5s\n",
|
|||
|
|
"Best hyperparameters: {'model__max_depth': None, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 5, 'model__n_estimators': 300}\n"
|
|||
|
|
]
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"\n",
|
|||
|
|
"# Hyperparameter search for the Random Forest classifier.\n",
"# Produces: pipeline, param_grid, grid_search, best_pipeline, y_pred_proba, y_pred.\n",
"\n",
"# Define pipeline: the scaler standardises every column it receives\n",
"# (assumes X_train only holds numeric features - TODO confirm against the feature-selection cell).\n",
"pipeline = Pipeline([\n",
"    ('scaler', StandardScaler()),\n",
"    ('model', RandomForestClassifier(\n",
"        class_weight='balanced',  # We have an imbalanced dataset\n",
"        random_state=123,         # Fixed seed so repeated runs build the same forests\n",
"    )),\n",
"])\n",
"\n",
"# Define parameter grid. Keys use the `model__` prefix to reach the 'model' step of the pipeline.\n",
"# 3 * 3 * 2 * 2 * 2 = 72 candidates.\n",
"# NOTE(review): in the saved cv_results_ the 'sqrt' and 'log2' candidates report identical fold scores,\n",
"# which suggests both resolve to the same number of features for this feature set - confirm, and if so\n",
"# drop one of them to halve the search.\n",
"param_grid = {\n",
"    'model__n_estimators': [100, 200, 300],\n",
"    'model__max_depth': [None, 10, 20],\n",
"    'model__min_samples_split': [2, 5],\n",
"    'model__min_samples_leaf': [1, 2],\n",
"    'model__max_features': ['sqrt', 'log2'],\n",
"}\n",
"\n",
"# GridSearchCV\n",
"grid_search = GridSearchCV(\n",
"    estimator=pipeline,\n",
"    param_grid=param_grid,\n",
"    scoring='average_precision',  # For imbalanced classification\n",
"    cv=4,        # 4-fold cross-validation -> 72 * 4 = 288 fits\n",
"    n_jobs=-1,   # Use all available cores\n",
"    verbose=1,   # Summary line only: verbose=2 prints one line per fit (288 lines) and bloats the saved notebook\n",
"    refit=True,  # Refit the best candidate on the entire training set (already the default, kept explicit)\n",
")\n",
"\n",
"# Fit the grid search on training data only; the test set is not touched during model selection\n",
"grid_search.fit(X_train, y_train)\n",
"\n",
"# Best model: the pipeline refitted on all of X_train with the winning hyperparameters\n",
"best_pipeline = grid_search.best_estimator_\n",
"print(\"Best hyperparameters:\", grid_search.best_params_)\n",
"\n",
"# Predict on test set\n",
"# Column 1 of predict_proba is the second entry of classes_ (presumably the positive class - verify).\n",
"y_pred_proba = best_pipeline.predict_proba(X_test)[:, 1]\n",
"# Hard labels from the classifier's own decision rule\n",
"y_pred = best_pipeline.predict(X_test)\n"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"text/html": [
|
|||
|
|
"<div>\n",
|
|||
|
|
"<style scoped>\n",
|
|||
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
|
" vertical-align: middle;\n",
|
|||
|
|
" }\n",
|
|||
|
|
"\n",
|
|||
|
|
" .dataframe tbody tr th {\n",
|
|||
|
|
" vertical-align: top;\n",
|
|||
|
|
" }\n",
|
|||
|
|
"\n",
|
|||
|
|
" .dataframe thead th {\n",
|
|||
|
|
" text-align: right;\n",
|
|||
|
|
" }\n",
|
|||
|
|
"</style>\n",
|
|||
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
|
" <thead>\n",
|
|||
|
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
|
" <th></th>\n",
|
|||
|
|
" <th>mean_fit_time</th>\n",
|
|||
|
|
" <th>std_fit_time</th>\n",
|
|||
|
|
" <th>mean_score_time</th>\n",
|
|||
|
|
" <th>std_score_time</th>\n",
|
|||
|
|
" <th>param_model__max_depth</th>\n",
|
|||
|
|
" <th>param_model__max_features</th>\n",
|
|||
|
|
" <th>param_model__min_samples_leaf</th>\n",
|
|||
|
|
" <th>param_model__min_samples_split</th>\n",
|
|||
|
|
" <th>param_model__n_estimators</th>\n",
|
|||
|
|
" <th>params</th>\n",
|
|||
|
|
" <th>split0_test_score</th>\n",
|
|||
|
|
" <th>split1_test_score</th>\n",
|
|||
|
|
" <th>split2_test_score</th>\n",
|
|||
|
|
" <th>split3_test_score</th>\n",
|
|||
|
|
" <th>mean_test_score</th>\n",
|
|||
|
|
" <th>std_test_score</th>\n",
|
|||
|
|
" <th>rank_test_score</th>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" </thead>\n",
|
|||
|
|
" <tbody>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>5</th>\n",
|
|||
|
|
" <td>5.800567</td>\n",
|
|||
|
|
" <td>0.367533</td>\n",
|
|||
|
|
" <td>0.309758</td>\n",
|
|||
|
|
" <td>0.016611</td>\n",
|
|||
|
|
" <td>None</td>\n",
|
|||
|
|
" <td>sqrt</td>\n",
|
|||
|
|
" <td>1</td>\n",
|
|||
|
|
" <td>5</td>\n",
|
|||
|
|
" <td>300</td>\n",
|
|||
|
|
" <td>{'model__max_depth': None, 'model__max_feature...</td>\n",
|
|||
|
|
" <td>0.032795</td>\n",
|
|||
|
|
" <td>0.020415</td>\n",
|
|||
|
|
" <td>0.031554</td>\n",
|
|||
|
|
" <td>0.052539</td>\n",
|
|||
|
|
" <td>0.034326</td>\n",
|
|||
|
|
" <td>0.011568</td>\n",
|
|||
|
|
" <td>1</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>17</th>\n",
|
|||
|
|
" <td>5.748260</td>\n",
|
|||
|
|
" <td>0.156803</td>\n",
|
|||
|
|
" <td>0.519434</td>\n",
|
|||
|
|
" <td>0.307019</td>\n",
|
|||
|
|
" <td>None</td>\n",
|
|||
|
|
" <td>log2</td>\n",
|
|||
|
|
" <td>1</td>\n",
|
|||
|
|
" <td>5</td>\n",
|
|||
|
|
" <td>300</td>\n",
|
|||
|
|
" <td>{'model__max_depth': None, 'model__max_feature...</td>\n",
|
|||
|
|
" <td>0.032795</td>\n",
|
|||
|
|
" <td>0.020415</td>\n",
|
|||
|
|
" <td>0.031554</td>\n",
|
|||
|
|
" <td>0.052539</td>\n",
|
|||
|
|
" <td>0.034326</td>\n",
|
|||
|
|
" <td>0.011568</td>\n",
|
|||
|
|
" <td>1</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>29</th>\n",
|
|||
|
|
" <td>4.784500</td>\n",
|
|||
|
|
" <td>0.083096</td>\n",
|
|||
|
|
" <td>0.176412</td>\n",
|
|||
|
|
" <td>0.006317</td>\n",
|
|||
|
|
" <td>10</td>\n",
|
|||
|
|
" <td>sqrt</td>\n",
|
|||
|
|
" <td>1</td>\n",
|
|||
|
|
" <td>5</td>\n",
|
|||
|
|
" <td>300</td>\n",
|
|||
|
|
" <td>{'model__max_depth': 10, 'model__max_features'...</td>\n",
|
|||
|
|
" <td>0.032233</td>\n",
|
|||
|
|
" <td>0.018502</td>\n",
|
|||
|
|
" <td>0.027846</td>\n",
|
|||
|
|
" <td>0.058432</td>\n",
|
|||
|
|
" <td>0.034253</td>\n",
|
|||
|
|
" <td>0.014815</td>\n",
|
|||
|
|
" <td>3</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>41</th>\n",
|
|||
|
|
" <td>4.521759</td>\n",
|
|||
|
|
" <td>0.073640</td>\n",
|
|||
|
|
" <td>0.206560</td>\n",
|
|||
|
|
" <td>0.009525</td>\n",
|
|||
|
|
" <td>10</td>\n",
|
|||
|
|
" <td>log2</td>\n",
|
|||
|
|
" <td>1</td>\n",
|
|||
|
|
" <td>5</td>\n",
|
|||
|
|
" <td>300</td>\n",
|
|||
|
|
" <td>{'model__max_depth': 10, 'model__max_features'...</td>\n",
|
|||
|
|
" <td>0.032233</td>\n",
|
|||
|
|
" <td>0.018502</td>\n",
|
|||
|
|
" <td>0.027846</td>\n",
|
|||
|
|
" <td>0.058432</td>\n",
|
|||
|
|
" <td>0.034253</td>\n",
|
|||
|
|
" <td>0.014815</td>\n",
|
|||
|
|
" <td>3</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>16</th>\n",
|
|||
|
|
" <td>3.828132</td>\n",
|
|||
|
|
" <td>0.550318</td>\n",
|
|||
|
|
" <td>0.146853</td>\n",
|
|||
|
|
" <td>0.016658</td>\n",
|
|||
|
|
" <td>None</td>\n",
|
|||
|
|
" <td>log2</td>\n",
|
|||
|
|
" <td>1</td>\n",
|
|||
|
|
" <td>5</td>\n",
|
|||
|
|
" <td>200</td>\n",
|
|||
|
|
" <td>{'model__max_depth': None, 'model__max_feature...</td>\n",
|
|||
|
|
" <td>0.033227</td>\n",
|
|||
|
|
" <td>0.020472</td>\n",
|
|||
|
|
" <td>0.030666</td>\n",
|
|||
|
|
" <td>0.051437</td>\n",
|
|||
|
|
" <td>0.033951</td>\n",
|
|||
|
|
" <td>0.011166</td>\n",
|
|||
|
|
" <td>5</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>...</th>\n",
|
|||
|
|
" <td>...</td>\n",
|
|||
|
|
" <td>...</td>\n",
|
|||
|
|
" <td>...</td>\n",
|
|||
|
|
" <td>...</td>\n",
|
|||
|
|
" <td>...</td>\n",
|
|||
|
|
" <td>...</td>\n",
|
|||
|
|
" <td>...</td>\n",
|
|||
|
|
" <td>...</td>\n",
|
|||
|
|
" <td>...</td>\n",
|
|||
|
|
" <td>...</td>\n",
|
|||
|
|
" <td>...</td>\n",
|
|||
|
|
" <td>...</td>\n",
|
|||
|
|
" <td>...</td>\n",
|
|||
|
|
" <td>...</td>\n",
|
|||
|
|
" <td>...</td>\n",
|
|||
|
|
" <td>...</td>\n",
|
|||
|
|
" <td>...</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>1</th>\n",
|
|||
|
|
" <td>3.745528</td>\n",
|
|||
|
|
" <td>0.295001</td>\n",
|
|||
|
|
" <td>0.159567</td>\n",
|
|||
|
|
" <td>0.023629</td>\n",
|
|||
|
|
" <td>None</td>\n",
|
|||
|
|
" <td>sqrt</td>\n",
|
|||
|
|
" <td>1</td>\n",
|
|||
|
|
" <td>2</td>\n",
|
|||
|
|
" <td>200</td>\n",
|
|||
|
|
" <td>{'model__max_depth': None, 'model__max_feature...</td>\n",
|
|||
|
|
" <td>0.029798</td>\n",
|
|||
|
|
" <td>0.017825</td>\n",
|
|||
|
|
" <td>0.030080</td>\n",
|
|||
|
|
" <td>0.039780</td>\n",
|
|||
|
|
" <td>0.029371</td>\n",
|
|||
|
|
" <td>0.007784</td>\n",
|
|||
|
|
" <td>67</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>61</th>\n",
|
|||
|
|
" <td>3.490800</td>\n",
|
|||
|
|
" <td>0.096002</td>\n",
|
|||
|
|
" <td>0.163926</td>\n",
|
|||
|
|
" <td>0.008971</td>\n",
|
|||
|
|
" <td>20</td>\n",
|
|||
|
|
" <td>log2</td>\n",
|
|||
|
|
" <td>1</td>\n",
|
|||
|
|
" <td>2</td>\n",
|
|||
|
|
" <td>200</td>\n",
|
|||
|
|
" <td>{'model__max_depth': 20, 'model__max_features'...</td>\n",
|
|||
|
|
" <td>0.031250</td>\n",
|
|||
|
|
" <td>0.017032</td>\n",
|
|||
|
|
" <td>0.028030</td>\n",
|
|||
|
|
" <td>0.040588</td>\n",
|
|||
|
|
" <td>0.029225</td>\n",
|
|||
|
|
" <td>0.008416</td>\n",
|
|||
|
|
" <td>69</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>49</th>\n",
|
|||
|
|
" <td>3.569089</td>\n",
|
|||
|
|
" <td>0.091260</td>\n",
|
|||
|
|
" <td>0.151084</td>\n",
|
|||
|
|
" <td>0.003098</td>\n",
|
|||
|
|
" <td>20</td>\n",
|
|||
|
|
" <td>sqrt</td>\n",
|
|||
|
|
" <td>1</td>\n",
|
|||
|
|
" <td>2</td>\n",
|
|||
|
|
" <td>200</td>\n",
|
|||
|
|
" <td>{'model__max_depth': 20, 'model__max_features'...</td>\n",
|
|||
|
|
" <td>0.031250</td>\n",
|
|||
|
|
" <td>0.017032</td>\n",
|
|||
|
|
" <td>0.028030</td>\n",
|
|||
|
|
" <td>0.040588</td>\n",
|
|||
|
|
" <td>0.029225</td>\n",
|
|||
|
|
" <td>0.008416</td>\n",
|
|||
|
|
" <td>69</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>0</th>\n",
|
|||
|
|
" <td>2.332713</td>\n",
|
|||
|
|
" <td>0.932992</td>\n",
|
|||
|
|
" <td>0.105683</td>\n",
|
|||
|
|
" <td>0.037904</td>\n",
|
|||
|
|
" <td>None</td>\n",
|
|||
|
|
" <td>sqrt</td>\n",
|
|||
|
|
" <td>1</td>\n",
|
|||
|
|
" <td>2</td>\n",
|
|||
|
|
" <td>100</td>\n",
|
|||
|
|
" <td>{'model__max_depth': None, 'model__max_feature...</td>\n",
|
|||
|
|
" <td>0.030112</td>\n",
|
|||
|
|
" <td>0.017368</td>\n",
|
|||
|
|
" <td>0.028927</td>\n",
|
|||
|
|
" <td>0.039004</td>\n",
|
|||
|
|
" <td>0.028853</td>\n",
|
|||
|
|
" <td>0.007690</td>\n",
|
|||
|
|
" <td>71</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" <tr>\n",
|
|||
|
|
" <th>12</th>\n",
|
|||
|
|
" <td>2.138600</td>\n",
|
|||
|
|
" <td>0.426632</td>\n",
|
|||
|
|
" <td>0.101666</td>\n",
|
|||
|
|
" <td>0.020386</td>\n",
|
|||
|
|
" <td>None</td>\n",
|
|||
|
|
" <td>log2</td>\n",
|
|||
|
|
" <td>1</td>\n",
|
|||
|
|
" <td>2</td>\n",
|
|||
|
|
" <td>100</td>\n",
|
|||
|
|
" <td>{'model__max_depth': None, 'model__max_feature...</td>\n",
|
|||
|
|
" <td>0.030112</td>\n",
|
|||
|
|
" <td>0.017368</td>\n",
|
|||
|
|
" <td>0.028927</td>\n",
|
|||
|
|
" <td>0.039004</td>\n",
|
|||
|
|
" <td>0.028853</td>\n",
|
|||
|
|
" <td>0.007690</td>\n",
|
|||
|
|
" <td>71</td>\n",
|
|||
|
|
" </tr>\n",
|
|||
|
|
" </tbody>\n",
|
|||
|
|
"</table>\n",
|
|||
|
|
"<p>72 rows × 17 columns</p>\n",
|
|||
|
|
"</div>"
|
|||
|
|
],
|
|||
|
|
"text/plain": [
|
|||
|
|
" mean_fit_time std_fit_time mean_score_time std_score_time \\\n",
|
|||
|
|
"5 5.800567 0.367533 0.309758 0.016611 \n",
|
|||
|
|
"17 5.748260 0.156803 0.519434 0.307019 \n",
|
|||
|
|
"29 4.784500 0.083096 0.176412 0.006317 \n",
|
|||
|
|
"41 4.521759 0.073640 0.206560 0.009525 \n",
|
|||
|
|
"16 3.828132 0.550318 0.146853 0.016658 \n",
|
|||
|
|
".. ... ... ... ... \n",
|
|||
|
|
"1 3.745528 0.295001 0.159567 0.023629 \n",
|
|||
|
|
"61 3.490800 0.096002 0.163926 0.008971 \n",
|
|||
|
|
"49 3.569089 0.091260 0.151084 0.003098 \n",
|
|||
|
|
"0 2.332713 0.932992 0.105683 0.037904 \n",
|
|||
|
|
"12 2.138600 0.426632 0.101666 0.020386 \n",
|
|||
|
|
"\n",
|
|||
|
|
" param_model__max_depth param_model__max_features \\\n",
|
|||
|
|
"5 None sqrt \n",
|
|||
|
|
"17 None log2 \n",
|
|||
|
|
"29 10 sqrt \n",
|
|||
|
|
"41 10 log2 \n",
|
|||
|
|
"16 None log2 \n",
|
|||
|
|
".. ... ... \n",
|
|||
|
|
"1 None sqrt \n",
|
|||
|
|
"61 20 log2 \n",
|
|||
|
|
"49 20 sqrt \n",
|
|||
|
|
"0 None sqrt \n",
|
|||
|
|
"12 None log2 \n",
|
|||
|
|
"\n",
|
|||
|
|
" param_model__min_samples_leaf param_model__min_samples_split \\\n",
|
|||
|
|
"5 1 5 \n",
|
|||
|
|
"17 1 5 \n",
|
|||
|
|
"29 1 5 \n",
|
|||
|
|
"41 1 5 \n",
|
|||
|
|
"16 1 5 \n",
|
|||
|
|
".. ... ... \n",
|
|||
|
|
"1 1 2 \n",
|
|||
|
|
"61 1 2 \n",
|
|||
|
|
"49 1 2 \n",
|
|||
|
|
"0 1 2 \n",
|
|||
|
|
"12 1 2 \n",
|
|||
|
|
"\n",
|
|||
|
|
" param_model__n_estimators \\\n",
|
|||
|
|
"5 300 \n",
|
|||
|
|
"17 300 \n",
|
|||
|
|
"29 300 \n",
|
|||
|
|
"41 300 \n",
|
|||
|
|
"16 200 \n",
|
|||
|
|
".. ... \n",
|
|||
|
|
"1 200 \n",
|
|||
|
|
"61 200 \n",
|
|||
|
|
"49 200 \n",
|
|||
|
|
"0 100 \n",
|
|||
|
|
"12 100 \n",
|
|||
|
|
"\n",
|
|||
|
|
" params split0_test_score \\\n",
|
|||
|
|
"5 {'model__max_depth': None, 'model__max_feature... 0.032795 \n",
|
|||
|
|
"17 {'model__max_depth': None, 'model__max_feature... 0.032795 \n",
|
|||
|
|
"29 {'model__max_depth': 10, 'model__max_features'... 0.032233 \n",
|
|||
|
|
"41 {'model__max_depth': 10, 'model__max_features'... 0.032233 \n",
|
|||
|
|
"16 {'model__max_depth': None, 'model__max_feature... 0.033227 \n",
|
|||
|
|
".. ... ... \n",
|
|||
|
|
"1 {'model__max_depth': None, 'model__max_feature... 0.029798 \n",
|
|||
|
|
"61 {'model__max_depth': 20, 'model__max_features'... 0.031250 \n",
|
|||
|
|
"49 {'model__max_depth': 20, 'model__max_features'... 0.031250 \n",
|
|||
|
|
"0 {'model__max_depth': None, 'model__max_feature... 0.030112 \n",
|
|||
|
|
"12 {'model__max_depth': None, 'model__max_feature... 0.030112 \n",
|
|||
|
|
"\n",
|
|||
|
|
" split1_test_score split2_test_score split3_test_score mean_test_score \\\n",
|
|||
|
|
"5 0.020415 0.031554 0.052539 0.034326 \n",
|
|||
|
|
"17 0.020415 0.031554 0.052539 0.034326 \n",
|
|||
|
|
"29 0.018502 0.027846 0.058432 0.034253 \n",
|
|||
|
|
"41 0.018502 0.027846 0.058432 0.034253 \n",
|
|||
|
|
"16 0.020472 0.030666 0.051437 0.033951 \n",
|
|||
|
|
".. ... ... ... ... \n",
|
|||
|
|
"1 0.017825 0.030080 0.039780 0.029371 \n",
|
|||
|
|
"61 0.017032 0.028030 0.040588 0.029225 \n",
|
|||
|
|
"49 0.017032 0.028030 0.040588 0.029225 \n",
|
|||
|
|
"0 0.017368 0.028927 0.039004 0.028853 \n",
|
|||
|
|
"12 0.017368 0.028927 0.039004 0.028853 \n",
|
|||
|
|
"\n",
|
|||
|
|
" std_test_score rank_test_score \n",
|
|||
|
|
"5 0.011568 1 \n",
|
|||
|
|
"17 0.011568 1 \n",
|
|||
|
|
"29 0.014815 3 \n",
|
|||
|
|
"41 0.014815 3 \n",
|
|||
|
|
"16 0.011166 5 \n",
|
|||
|
|
".. ... ... \n",
|
|||
|
|
"1 0.007784 67 \n",
|
|||
|
|
"61 0.008416 69 \n",
|
|||
|
|
"49 0.008416 69 \n",
|
|||
|
|
"0 0.007690 71 \n",
|
|||
|
|
"12 0.007690 71 \n",
|
|||
|
|
"\n",
|
|||
|
|
"[72 rows x 17 columns]"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"execution_count": 19,
|
|||
|
|
"metadata": {},
|
|||
|
|
"output_type": "execute_result"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"# Retrieve cv results\n",
|
|||
|
|
"pd.DataFrame(grid_search.cv_results_).sort_values(by='mean_test_score', ascending=False)"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "markdown",
|
|||
|
|
"metadata": {},
|
|||
|
|
"source": [
|
|||
|
|
"We apply a threshold selector to find a proper value for F2 optimisation, rather than defaulting to 0.5."
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"# Find the best threshold for F2 score\n",
|
|||
|
|
"\n",
|
|||
|
|
"def find_best_threshold(y_true, y_proba, beta=2.0):\n",
|
|||
|
|
" thresholds = np.linspace(0, 1, 200)\n",
|
|||
|
|
" f2_scores = []\n",
|
|||
|
|
"\n",
|
|||
|
|
" for t in thresholds:\n",
|
|||
|
|
" preds = (y_proba >= t).astype(int)\n",
|
|||
|
|
" score = fbeta_score(y_true, preds, beta=beta)\n",
|
|||
|
|
" f2_scores.append(score)\n",
|
|||
|
|
"\n",
|
|||
|
|
" best_index = np.argmax(f2_scores)\n",
|
|||
|
|
" return thresholds[best_index], f2_scores[best_index]"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"name": "stdout",
|
|||
|
|
"output_type": "stream",
|
|||
|
|
"text": [
|
|||
|
|
"Best threshold: 5.0% — F2 score: 13.95%\n"
|
|||
|
|
]
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"# Predict probabilities\n",
|
|||
|
|
"y_pred_proba = best_pipeline.predict_proba(X_test)[:, 1]\n",
|
|||
|
|
"\n",
|
|||
|
|
"# Find best threshold for F2\n",
|
|||
|
|
"best_thresh, best_f2 = find_best_threshold(y_test, y_pred_proba, beta=2.0)\n",
|
|||
|
|
"print(f\"Best threshold: {100*best_thresh:.1f}% — F2 score: {100*best_f2:.2f}%\")\n",
|
|||
|
|
"\n",
|
|||
|
|
"# Use that threshold for final classification\n",
|
|||
|
|
"y_pred_opt = (y_pred_proba >= best_thresh).astype(int)"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "markdown",
|
|||
|
|
"id": "fc2fcc89",
|
|||
|
|
"metadata": {},
|
|||
|
|
"source": [
|
|||
|
|
"## Evaluation\n",
|
|||
|
|
"This section aims to evaluate how good the new model is vs. the actual Resolution Incidents.\n",
|
|||
|
|
"\n",
|
|||
|
|
"We start by computing and displaying the classification report, ROC Curve, PR Curve and the respective Area Under the Curve (AUC)."
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"id": "30786f7c",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"name": "stdout",
|
|||
|
|
"output_type": "stream",
|
|||
|
|
"text": [
|
|||
|
|
" precision recall f1-score support\n",
|
|||
|
|
"\n",
|
|||
|
|
" No Incident 0.99 0.88 0.93 6347\n",
|
|||
|
|
" Incident 0.04 0.43 0.07 69\n",
|
|||
|
|
"\n",
|
|||
|
|
" accuracy 0.87 6416\n",
|
|||
|
|
" macro avg 0.52 0.66 0.50 6416\n",
|
|||
|
|
"weighted avg 0.98 0.87 0.92 6416\n",
|
|||
|
|
"\n"
|
|||
|
|
]
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"# Print classification report\n",
|
|||
|
|
"print(classification_report(y_test, y_pred_opt, target_names=['No Incident', 'Incident']))"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "markdown",
|
|||
|
|
"metadata": {},
|
|||
|
|
"source": [
|
|||
|
|
"### Interpreting the Classification Report\n",
|
|||
|
|
"\n",
|
|||
|
|
"The **Classification Report** provides key metrics to evaluate how well the model performed on each class.\n",
|
|||
|
|
"\n",
|
|||
|
|
"It includes the following metrics for each class (0 and 1):\n",
|
|||
|
|
"* Precision: Out of all predicted positives, how many were actually positive?\n",
|
|||
|
|
"* Recall: Out of all actual positives, how many did we correctly identify?\n",
|
|||
|
|
"* F1-score: Harmonic mean of precision and recall (balances both)\n",
|
|||
|
|
"* Support: Number of true samples of that class in the test data\n",
|
|||
|
|
"\n",
|
|||
|
|
"Interpretation:\n",
|
|||
|
|
"* Class 0 = No incident\n",
|
|||
|
|
"* Class 1 = Has resolution incident (rare, but important!)\n",
|
|||
|
|
"\n",
|
|||
|
|
"A few explanatory cases:\n",
|
|||
|
|
"* A high recall for class 1 means we're catching most incidents.\n",
|
|||
|
|
"* A high precision for class 1 means when we predict an incident, we're often correct.\n",
|
|||
|
|
"* The F1-score gives a single balanced measure (good for imbalanced data).\n",
|
|||
|
|
"\n",
|
|||
|
|
"Special note for imbalanced data:\n",
|
|||
|
|
"Since class 1 (or just True) is rare (1% in our case), metrics for that class are more critical.\n",
|
|||
|
|
"We want to maximize recall to catch as many real incidents as possible — without letting precision drop too low (to avoid too many false alarms)."
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"id": "4b4da914",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAhgAAAHWCAYAAAA1jvBJAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB6s0lEQVR4nO3dd1hTZ/8G8DsBwpAlIoKIIgruiaPuhaKtqy5Q62prl7b96dtWbavWDu1bW2uHra3WWq0C7lEVV6t1a1XcooA4QeV1IDPr+f1BiSBBCZ7kJHB/riuXyck5J3ceA/lyznOeRyGEECAiIiKSkFLuAERERFT2sMAgIiIiybHAICIiIsmxwCAiIiLJscAgIiIiybHAICIiIsmxwCAiIiLJscAgIiIiybHAICIiIsmxwCAiIiLJscAgKgcWL14MhUJhuNnb28Pf3x+jR4/G9evXjW4jhMDSpUvRsWNHeHp6wsXFBY0aNcLHH3+MzMzMYl9r7dq16NWrF7y9vaFSqVC1alUMGTIEf/75Z4my5uTk4Ouvv0br1q3h4eEBJycnhISEYPz48bhw4UKp3j8RWZ6Cc5EQlX2LFy/GmDFj8PHHH6NmzZrIycnBwYMHsXjxYgQGBuL06dNwcnIyrK/T6TBs2DCsWLECHTp0wIABA+Di4oI9e/Zg+fLlqF+/Pnbs2IEqVaoYthFC4MUXX8TixYvRrFkzDBo0CL6+vkhJScHatWtx9OhR7Nu3D23bti02Z1paGnr27ImjR4+id+/eCAsLg6urK+Lj4xEdHY3U1FSo1WqzthURSUQQUZn366+/CgDiyJEjhZZPmjRJABAxMTGFls+cOVMAEO+8806RfW3YsEEolUrRs2fPQstnz54tAIj/+7//E3q9vsh2S5YsEYcOHXpszueee04olUqxatWqIs/l5OSI//znP4/dvqQ0Go3Izc2VZF9EZBwLDKJyoLgC448//hAAxMyZMw3LsrKyRMWKFUVISIjQaDRG9zdmzBgBQBw4cMCwjZeXl6hbt67QarWlynjw4EEBQIwdO7ZE63fq1El06tSpyPJRo0aJGjVqGB5funRJABCzZ88WX3/9tQgKChJKpVIcPHhQ2NnZiY8++qjIPs6fPy8AiO+++86w7O7du+Ltt98W1apVEyqVStSqVUt8/vnnQqfTmfxeicoD9sEgKseSk5MBABUrVjQs27t3L+7evYthw4bB3t7e6HYjR44EAPzxxx+Gbe7cuYNhw4bBzs6uVFk2bNgAABgxYkSptn+SX3/9Fd999x1eeeUVfPXVV/Dz80OnTp2wYsWKIuvGxMTAzs4OgwcPBgBkZWWhU6dO+P333zFy5Eh8++23aNeuHaZMmYKJEyeaJS+RrTP+24OIyqT79+8jLS0NOTk5OHToEGbMmAFHR0f07t3bsM7Zs2cBAE2aNCl2P/nPnTt3rtC/jRo1KnU2KfbxONeuXUNCQgIqV65sWBYREYFXX30Vp0+fRsOGDQ3LY2Ji0KlTJ0Mfkzlz5iAxMRHHjx9HcHAwAODVV19F1apVMXv2bPznP/9BQECAWXIT2SoewSAqR8LCwlC5cmUEBARg0KBBqFChAjZs2IBq1aoZ1nnw4AEAwM3Nrdj95D+Xnp5e6N/HbfMkUuzjcQYOHFiouACAAQMGwN7eHjExMYZlp0+fxtmzZxEREWFYtnLlSnTo0AEVK1ZEWlqa4RYWFgadToe///7bLJmJbBmPYBCVI/PmzUNISAju37+PRYsW4e+//4ajo2OhdfK/4PMLDWMeLULc3d2fuM2TFNyHp6dnqfdTnJo1axZZ5u3tjW7dumHFihX45JNPAOQdvbC3t8eAAQMM6128eBEnT54sUqDku3XrluR5iWwdCwyicqRVq1Zo0aIFAKB///5o3749hg0bhvj4eLi6ugIA6tWrBwA4efIk+vfvb3Q/J0+eBADUr18fAFC3bl0AwKlTp4rd5kkK7qNDhw5PXF+hUEAYuc
pep9MZXd/Z2dno8sjISIwZMwZxcXFo2rQpVqxYgW7dusHb29uwjl6vR/fu3fHee+8Z3UdISMgT8xKVNzxFQlRO2dnZYdasWbhx4wa+//57w/L27dvD09MTy5cvL/bLesmSJQBg6LvRvn17VKxYEVFRUcVu8yR9+vQBAPz+++8lWr9ixYq4d+9ekeWXL1826XX79+8PlUqFmJgYxMXF4cKFC4iMjCy0Tq1atZCRkYGwsDCjt+rVq5v0mkTlAQsMonKsc+fOaNWqFebOnYucnBwAgIuLC9555x3Ex8fjgw8+KLLNpk2bsHjxYoSHh+OZZ54xbDNp0iScO3cOkyZNMnpk4ffff8fhw4eLzdKmTRv07NkTCxcuxLp164o8r1ar8c477xge16pVC+fPn8ft27cNy06cOIF9+/aV+P0DgKenJ8LDw7FixQpER0dDpVIVOQozZMgQHDhwAFu3bi2y/b1796DVak16TaLygCN5EpUD+SN5HjlyxHCKJN+qVaswePBg/Pjjj3jttdcA5J1miIiIwOrVq9GxY0cMHDgQzs7O2Lt3L37//XfUq1cPO3fuLDSSp16vx+jRo7F06VI0b97cMJJnamoq1q1bh8OHD2P//v1o06ZNsTlv376NHj164MSJE+jTpw+6deuGChUq4OLFi4iOjkZKSgpyc3MB5F110rBhQzRp0gQvvfQSbt26hfnz56NKlSpIT083XIKbnJyMmjVrYvbs2YUKlIKWLVuGF154AW5ubujcubPhktl8WVlZ6NChA06ePInRo0cjNDQUmZmZOHXqFFatWoXk5ORCp1SICBzJk6g8KG6gLSGE0Ol0olatWqJWrVqFBsnS6XTi119/Fe3atRPu7u7CyclJNGjQQMyYMUNkZGQU+1qrVq0SPXr0EF5eXsLe3l74+fmJiIgIsWvXrhJlzcrKEl9++aVo2bKlcHV1FSqVSgQHB4s333xTJCQkFFr3999/F0FBQUKlUommTZuKrVu3PnagreKkp6cLZ2dnAUD8/vvvRtd58OCBmDJliqhdu7ZQqVTC29tbtG3bVnz55ZdCrVaX6L0RlSc8gkFERESSYx8MIiIikhwLDCIiIpIcCwwiIiKSHAsMIiIikhwLDCIiIpIcCwwiIiKSXLmbi0Sv1+PGjRtwc3ODQqGQOw4REZHNEELgwYMHqFq1KpTKxx+jKHcFxo0bNxAQECB3DCIiIpt19epVVKtW7bHrlLsCI3966atXrxqmh35aGo0G27ZtQ48ePeDg4CDJPss7tqn02KbSYntKj20qLXO0Z3p6OgICAgzfpY9T7gqM/NMi7u7ukhYYLi4ucHd35w+FRNim0mObSovtKT22qbTM2Z4l6WLATp5EREQkORYYREREJDkWGERERCQ5FhhEREQkORYYREREJDkWGERERCQ5FhhEREQkORYYREREJDkWGERERCQ5FhhEREQkOVkLjL///ht9+vRB1apVoVAosG7duidus2vXLjRv3hyOjo6oXbs2Fi9ebPacREREZBpZC4zMzEw0adIE8+bNK9H6ly5dwnPPPYcuXbogLi4O//d//4eXX34ZW7duNXNSIiIiMoWsk5316tULvXr1KvH68+fPR82aNfHVV18BAOrVq4e9e/fi66+/Rnh4uLliEhER2RQhgNRU4Nw5LzRqBAQFWT6DTc2meuDAAYSFhRVaFh4ejv/7v/8rdpvc3Fzk5uYaHqenpwPIm2VOo9FIkit/P1Ltj9im5sA2lRbbU3psU9PodMC1a0BSkgKJiUBiosJwS07WwcfnMhITO0ChUGPiRGm/70rCpgqM1NRUVKlSpdCyKlWqID09HdnZ2XB2di6yzaxZszBjxowiy7dt2wYXFxdJ823fvl3S/RHb1BzYptJie0qPbfqQRqPArVsuSE2tgJSUCkhNLXhzgVZrV2QbF5csDBmyAjVqXEZU1FD8/bc96tY9JUmerKysEq9rUwVGaUyZMgUTJ040PE5PT0dAQAB69OgBd3
d3SV5Do9Fg+/bt6N69OxwcHCTZZ3nHNpUe21RabE/pldc2zcoCkpIeHoHIv5+UpMDly4Beryjxvnx8bmHYsCh4et6
|
|||
|
|
"text/plain": [
|
|||
|
|
"<Figure size 600x500 with 1 Axes>"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"metadata": {},
|
|||
|
|
"output_type": "display_data"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"# ROC Curve\n",
|
|||
|
|
"fpr, tpr, _ = roc_curve(y_test, y_pred_proba)\n",
|
|||
|
|
"roc_auc = auc(fpr, tpr)\n",
|
|||
|
|
"\n",
|
|||
|
|
"plt.figure(figsize=(6, 5))\n",
|
|||
|
|
"plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (AUC = {roc_auc:.4f})')\n",
|
|||
|
|
"plt.plot([0, 1], [0, 1], color='gray', linestyle='--')\n",
|
|||
|
|
"plt.xlabel('False Positive Rate')\n",
|
|||
|
|
"plt.ylabel('True Positive Rate')\n",
|
|||
|
|
"plt.title('ROC Curve')\n",
|
|||
|
|
"plt.legend(loc='lower right')\n",
|
|||
|
|
"plt.grid(True)\n",
|
|||
|
|
"plt.show()"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "markdown",
|
|||
|
|
"metadata": {},
|
|||
|
|
"source": [
|
|||
|
|
"### Interpreting the ROC Curve\n",
|
|||
|
|
"\n",
|
|||
|
|
"The **Receiver Operating Characteristic (ROC) curve** shows how well the model distinguishes between the positive and negative classes across all decision thresholds.\n",
|
|||
|
|
"\n",
|
|||
|
|
"A quick reminder of the definitions:\n",
|
|||
|
|
"* True Positive Rate (TPR) = Recall\n",
|
|||
|
|
"* False Positive Rate (FPR) = Proportion of negatives wrongly classified as positives\n",
|
|||
|
|
"\n",
|
|||
|
|
"What we display in this plot is:\n",
|
|||
|
|
"* The x-axis is False Positive Rate\n",
|
|||
|
|
"* The y-axis is True Positive Rate\n",
|
|||
|
|
"\n",
|
|||
|
|
"The curve shows how TPR and FPR change as the threshold varies\n",
|
|||
|
|
"\n",
|
|||
|
|
"It's important to note that:\n",
|
|||
|
|
"* A model with no skill will produce a diagonal line (AUC = 0.5)\n",
|
|||
|
|
"* A model with perfect discrimination will hug the top-left corner (AUC = 1.0)\n",
|
|||
|
|
"\n",
|
|||
|
|
"The Area Under the Curve (ROC AUC) gives a single performance score:\n",
|
|||
|
|
"* Closer to 1 means better at ranking positive cases higher than negative ones\n",
|
|||
|
|
"\n",
|
|||
|
|
"**Important!**\n",
|
|||
|
|
"\n",
|
|||
|
|
"While useful, the ROC curve can sometimes overestimate performance when the dataset is imbalanced, because it includes negatives (which dominate in our case, around 99%!). That’s why we also MUST check the Precision-Recall curve."
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"id": "6790d41d",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAhgAAAHWCAYAAAA1jvBJAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABYUUlEQVR4nO3deVhUZf8G8HtmmBlAQFA2QRR3TU0Nl9DMDUFNyzZNza00N963JCvNlMwSt1wqlzK3+mmaVma5EkpulKVii/uKqSCoLLIMszy/P3g5Oc6ggA+M6P25Li6ZZ55zzne+gNycbVRCCAEiIiIiidSOLoCIiIjuPwwYREREJB0DBhEREUnHgEFERETSMWAQERGRdAwYREREJB0DBhEREUnHgEFERETSMWAQERGRdAwYRBXUkCFDEBwcXKJl4uPjoVKpEB8fXyY1VXQdO3ZEx44dlcfnzp2DSqXCihUrHFYTUUXFgEFUTCtWrIBKpVI+nJ2dUb9+fURGRiIlJcXR5d3zCn9ZF36o1WpUqVIF3bt3R0JCgqPLkyIlJQXjxo1Dw4YN4erqikqVKiEkJATvv/8+0tPTHV0eUblycnQBRBXNe++9h1q1aiEvLw979uzBokWLsHnzZvz1119wdXUttzqWLFkCi8VSomUef/xx5ObmQqfTlVFVd9avXz/06NEDZrMZJ06cwMKFC9GpUyf89ttvaNq0qcPqulu//fYbevTogRs3buDFF19ESEgIAOD333/H9OnTsWvXLmzfvt3BVRKVHwYMohLq3r07WrZsCQAYNmwYqlatijlz5uD7779Hv3797C6TnZ2NSpUqSa1Dq9WWeBm1Wg1nZ2epdZTUI488ghdffFF53L59e3Tv3h2LFi3CwoULHVhZ6aWnp+Ppp5+GRqPBoUOH0LBhQ6vnP/jgAyxZskTKtsrie4moLPAQCdFd6ty5MwDg7NmzAArOjXBzc8Pp06fRo0cPuLu7Y8CAAQAAi8WCefPmoXHjxnB2doafnx9GjBiB69ev26x3y5Yt6NChA9zd3eHh4YFWrVph9erVyvP2zsFYs2YNQkJClGWaNm2K+fPnK88XdQ7GunXrEBISAhcXF3h7e+PFF1/ExYsXreYUvq6LFy+id+/ecHNzg4+PD8aNGwez2Vzq/rVv3x4AcPr0aavx9PR0vPbaawgKCoJer0fdunUxY8YMm702FosF8+fPR9OmTeHs7AwfHx9069YNv//+uzJn+fLl6Ny5M3x9faHX6/HQQw9h0aJFpa75Vp9++ikuXryIOXPm2IQLAPDz88M777yjPFapVHj33Xdt5gUHB2PIkCHK48LDcj///DNGjx4NX19fVK9eHevXr1fG7dWiUqnw119/KWPHjh3Dc889hypVqsDZ2RktW7bExo0b7+5FE90B92AQ3aXCX4xVq1ZVxkwmEyIiIvDYY49h9uzZyqGTESNGYMWKFRg6dCj++9//4uzZs/jkk09w6NAh7N27V9krsWLFCrz00kto3LgxJkyYAE9PTxw6dAhbt25F//797dYRGxuLfv36oUuXLpgxYwYA4OjRo9i7dy9effXVIusvrKdVq1aIiYlBSkoK5s+fj7179+LQoUPw9PRU5prNZkRERKBNmzaYPXs2fvrpJ3z44YeoU6cORo0aVar+nTt3DgDg5eWljOXk5KBDhw64ePEiRowYgRo1amDfvn2YMGECLl++jHnz5ilzX375ZaxYsQLdu3fHsGHDYDKZsHv3bvzyyy/KnqZFixahcePGePLJJ+Hk5IQffvgBo0ePhsViwZgxY0pV9802btwIFxcXPPfcc3e9LntGjx4NHx8fTJ48GdnZ2XjiiSfg5uaGr7/+Gh06dLCau3btWjRu3BhNmjQBAPz9999o164dAgMDMX78eFSqVAlff/01evfujW+++QZPP/10mdRMBEFExbJ8+XIBQPz0008iNTVVXLhwQaxZs0ZUrVpVuLi4iH/++UcIIcTgwY
MFADF+/Hir5Xfv3i0AiFWrVlmNb9261Wo8PT1duLu7izZt2ojc3FyruRaLRfl88ODBombNmsrjV199VXh4eAiTyVTka9i5c6cAIHbu3CmEECI/P1/4+vqKJk2aWG3rxx9/FADE5MmTrbYHQLz33ntW62zRooUICQkpcpuFzp49KwCIKVOmiNTUVJGcnCx2794tWrVqJQCIdevWKXOnTp0qKlWqJE6cOGG1jvHjxwuNRiOSkpKEEELs2LFDABD//e9/bbZ3c69ycnJsno+IiBC1a9e2GuvQoYPo0KGDTc3Lly+/7Wvz8vISzZo1u+2cmwEQ0dHRNuM1a9YUgwcPVh4Xfs899thjNl/Xfv36CV9fX6vxy5cvC7VabfU16tKli2jatKnIy8tTxiwWi2jbtq2oV69esWsmKikeIiEqobCwMPj4+CAoKAgvvPAC3Nzc8N133yEwMNBq3q1/0a9btw6VK1dG165dkZaWpnyEhITAzc0NO3fuBFCwJyIrKwvjx4+3OV9CpVIVWZenpyeys7MRGxtb7Nfy+++/48qVKxg9erTVtp544gk0bNgQmzZtsllm5MiRVo/bt2+PM2fOFHub0dHR8PHxgb+/P9q3b4+jR4/iww8/tPrrf926dWjfvj28vLysehUWFgaz2Yxdu3YBAL755huoVCpER0fbbOfmXrm4uCifZ2RkIC0tDR06dMCZM2eQkZFR7NqLkpmZCXd397teT1GGDx8OjUZjNda3b19cuXLF6nDX+vXrYbFY0LdvXwDAtWvXsGPHDvTp0wdZWVlKH69evYqIiAicPHnS5lAYkSw8REJUQgsWLED9+vXh5OQEPz8/NGjQAGq1dVZ3cnJC9erVrcZOnjyJjIwM+Pr62l3vlStXAPx7yKVwF3dxjR49Gl9//TW6d++OwMBAhIeHo0+fPujWrVuRy5w/fx4A0KBBA5vnGjZsiD179liNFZ7jcDMvLy+rc0hSU1Otzslwc3ODm5ub8viVV17B888/j7y8POzYsQMfffSRzTkcJ0+exB9//GGzrUI39yogIABVqlQp8jUCwN69exEdHY2EhATk5ORYPZeRkYHKlSvfdvk78fDwQFZW1l2t43Zq1aplM9atWzdUrlwZa9euRZcuXQAUHB5p3rw56tevDwA4deoUhBCYNGkSJk2aZHfdV65csQnHRDIwYBCVUOvWrZVj+0XR6/U2ocNiscDX1xerVq2yu0xRv0yLy9fXF4mJidi2bRu2bNmCLVu2YPny5Rg0aBBWrlx5V+sudOtf0fa0atVKCS5AwR6Lm09orFevHsLCwgAAPXv2hEajwfjx49GpUyelrxaLBV27dsWbb75pdxuFv0CL4/Tp0+jSpQsaNmyIOXPmICgoCDqdDps3b8bcuXNLfKmvPQ0bNkRiYiLy8/Pv6hLgok6WvXkPTCG9Xo/evXvju+++w8KFC5GSkoK9e/di2rRpypzC1zZu3DhERETYXXfdunVLXS/R7TBgEJWTOnXq4KeffkK7du3s/sK4eR4A/PXXXyX+z1+n06FXr17o1asXLBYLRo8ejU8//RSTJk2yu66aNWsCAI4fP65cDVPo+PHjyvMlsWrVKuTm5iqPa9eufdv5EydOxJIlS/DOO+9g69atAAp6cOPGDSWIFKVOnTrYtm0brl27VuRejB9++AEGgwEbN25EjRo1lPHCQ1Iy9OrVCwkJCfjmm2+KvFT5Zl5eXjY33srPz8fly5dLtN2+ffti5cqViIuLw9GjRyGEUA6PAP/2XqvV3rGXRLLxHAyictKnTx+YzWZMnTrV5jmTyaT8wgkPD4e7uztiYmKQl5dnNU8IUeT6r169avVYrVbj4YcfBgAYDAa7y7Rs2RK+vr5YvHix1ZwtW7bg6NGjeOKJJ4r12m7Wrl07hIWFKR93Chienp4YMWIEtm3bhsTERAAFvUpISMC2bdts5qenp8NkMgEAnn32WQghMGXKFJt5hb0q3Otyc+8yMjKwfPnyEr+2oo
wcORLVqlXD66+/jhMnTtg8f+XKFbz//vvK4zp16ijnkRT67LPPSny5b1hYGKpUqYK1a9di7dq1aN26tdXhFF9fX3T
|
|||
|
|
"text/plain": [
|
|||
|
|
"<Figure size 600x500 with 1 Axes>"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"metadata": {},
|
|||
|
|
"output_type": "display_data"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"# PR Curve\n",
|
|||
|
|
"precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)\n",
|
|||
|
|
"pr_auc = average_precision_score(y_test, y_pred_proba)\n",
|
|||
|
|
"\n",
|
|||
|
|
"plt.figure(figsize=(6, 5))\n",
|
|||
|
|
"plt.plot(recall, precision, color='green', lw=2, label=f'PR curve (AUC = {pr_auc:.4f})')\n",
|
|||
|
|
"plt.xlabel('Recall')\n",
|
|||
|
|
"plt.ylabel('Precision')\n",
|
|||
|
|
"plt.title('Precision-Recall Curve')\n",
|
|||
|
|
"plt.legend(loc='lower left')\n",
|
|||
|
|
"plt.grid(True)\n",
|
|||
|
|
"plt.show()"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "markdown",
|
|||
|
|
"metadata": {},
|
|||
|
|
"source": [
|
|||
|
|
"### Interpreting the Precision-Recall (PR) Curve\n",
|
|||
|
|
"\n",
|
|||
|
|
"The **Precision-Recall (PR) curve** helps evaluate model performance, especially on imbalanced datasets like ours (where positive cases are rare).\n",
|
|||
|
|
"\n",
|
|||
|
|
"A quick reminder of the definitions:\n",
|
|||
|
|
"* Precision = How many of the predicted positives are actually positive\n",
|
|||
|
|
"* Recall = How many of the actual positives the model correctly identifies\n",
|
|||
|
|
"\n",
|
|||
|
|
"What we display in this plot is:\n",
|
|||
|
|
"* The x-axis is Recall \n",
|
|||
|
|
"* The y-axis is Precision \n",
|
|||
|
|
"\n",
|
|||
|
|
"The curve shows the trade-off between them at different model thresholds\n",
|
|||
|
|
"\n",
|
|||
|
|
"In imbalanced datasets, accuracy can be misleading — the PR curve focuses only on the positive class, making it much more meaningful:\n",
|
|||
|
|
"* A higher curve means better performance\n",
|
|||
|
|
"* The area under the curve (PR AUC) summarizes this: closer to 1 is better"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"\n",
|
|||
|
|
"# Compute confusion matrix: [ [TN, FP], [FN, TP] ]\n",
|
|||
|
|
"tn, fp, fn, tp = confusion_matrix(y_test, y_pred_opt).ravel()\n",
|
|||
|
|
"\n",
|
|||
|
|
"# Total predictions\n",
|
|||
|
|
"total = tp + tn + fp + fn\n",
|
|||
|
|
"\n",
|
|||
|
|
"# Compute all requested metrics\n",
|
|||
|
|
"recall = recall_score(y_test, y_pred_opt)\n",
|
|||
|
|
"precision = precision_score(y_test, y_pred_opt)\n",
|
|||
|
|
"f1 = fbeta_score(y_test, y_pred_opt, beta=1)\n",
|
|||
|
|
"f2 = fbeta_score(y_test, y_pred_opt, beta=2)\n",
|
|||
|
|
"f3 = fbeta_score(y_test, y_pred_opt, beta=3)\n",
|
|||
|
|
"fpr = fp / (fp + tn) if (fp + tn) != 0 else 0\n",
|
|||
|
|
"\n",
|
|||
|
|
"# Scores relative to total\n",
|
|||
|
|
"tp_score = tp / total\n",
|
|||
|
|
"tn_score = tn / total\n",
|
|||
|
|
"fp_score = fp / total\n",
|
|||
|
|
"fn_score = fn / total\n",
|
|||
|
|
"\n",
|
|||
|
|
"# Create DataFrame\n",
|
|||
|
|
"summary_df = pd.DataFrame([{\n",
|
|||
|
|
" \"flagging_analysis_type\": \"RISK_VS_CLAIM\",\n",
|
|||
|
|
" \"count_total\": total,\n",
|
|||
|
|
" \"count_true_positive\": tp,\n",
|
|||
|
|
" \"count_true_negative\": tn,\n",
|
|||
|
|
" \"count_false_positive\": fp,\n",
|
|||
|
|
" \"count_false_negative\": fn,\n",
|
|||
|
|
" \"true_positive_score\": tp_score,\n",
|
|||
|
|
" \"true_negative_score\": tn_score,\n",
|
|||
|
|
" \"false_positive_score\": fp_score,\n",
|
|||
|
|
" \"false_negative_score\": fn_score,\n",
|
|||
|
|
" \"recall_score\": recall,\n",
|
|||
|
|
" \"precision_score\": precision,\n",
|
|||
|
|
" \"false_positive_rate_score\": fpr,\n",
|
|||
|
|
" \"f1_score\": f1,\n",
|
|||
|
|
" \"f2_score\": f2,\n",
|
|||
|
|
" \"f3_score\": f3,\n",
|
|||
|
|
" \"roc_auc_score\": roc_auc,\n",
|
|||
|
|
" \"pr_auc_score\": pr_auc\n",
|
|||
|
|
"}])"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [],
|
|||
|
|
"source": [
|
|||
|
|
"def plot_confusion_matrix_from_df(df, flagging_analysis_type, name_of_the_experiment=\"\"):\n",
|
|||
|
|
"\n",
|
|||
|
|
" # Subset - just retrieve one row depending on the flagging_analysis_type\n",
|
|||
|
|
" row = df[df['flagging_analysis_type'] == flagging_analysis_type].iloc[0]\n",
|
|||
|
|
"\n",
|
|||
|
|
" # Define custom x-axis labels and wording\n",
|
|||
|
|
" if flagging_analysis_type == 'RISK_VS_CLAIM':\n",
|
|||
|
|
" x_labels = ['With Submitted Claim', 'Without Submitted Claim']\n",
|
|||
|
|
" outcome_label = \"submitted claim\"\n",
|
|||
|
|
" elif flagging_analysis_type == 'RISK_VS_SUBMITTED_PAYOUT':\n",
|
|||
|
|
" x_labels = ['With Submitted Payout', 'Without Submitted Payout']\n",
|
|||
|
|
" outcome_label = \"submitted payout\"\n",
|
|||
|
|
" else:\n",
|
|||
|
|
" x_labels = ['Actual Positive', 'Actual Negative'] \n",
|
|||
|
|
" outcome_label = \"outcome\"\n",
|
|||
|
|
"\n",
|
|||
|
|
" # Confusion matrix structure\n",
|
|||
|
|
" cm = np.array([\n",
|
|||
|
|
" [row['count_true_positive'], row['count_false_positive']],\n",
|
|||
|
|
" [row['count_false_negative'], row['count_true_negative']]\n",
|
|||
|
|
" ])\n",
|
|||
|
|
"\n",
|
|||
|
|
" # Create annotations for the confusion matrix\n",
|
|||
|
|
" labels = [['True Positives', 'False Positives'], ['False Negatives', 'True Negatives']]\n",
|
|||
|
|
" counts = [[f\"{v:,}\" for v in [row['count_true_positive'], row['count_false_positive']]],\n",
|
|||
|
|
" [f\"{v:,}\" for v in [row['count_false_negative'], row['count_true_negative']]]]\n",
|
|||
|
|
" percentages = [[f\"{round(100*v,2):,}\" for v in [row['true_positive_score'], row['false_positive_score']]],\n",
|
|||
|
|
" [f\"{round(100*v,2):,}\" for v in [row['false_negative_score'], row['true_negative_score']]]]\n",
|
|||
|
|
" annot = [[f\"{labels[i][j]}\\n{counts[i][j]} ({percentages[i][j]}%)\" for j in range(2)] for i in range(2)]\n",
|
|||
|
|
"\n",
|
|||
|
|
" # Scores formatted as percentages\n",
|
|||
|
|
" recall = row['recall_score'] * 100\n",
|
|||
|
|
" precision = row['precision_score'] * 100\n",
|
|||
|
|
" f1 = row['f1_score'] * 100\n",
|
|||
|
|
" f2 = row['f2_score'] * 100\n",
|
|||
|
|
" f3 = row['f3_score'] * 100\n",
|
|||
|
|
" roc_auc = row['roc_auc_score'] * 100\n",
|
|||
|
|
" pr_auc = row['pr_auc_score'] * 100\n",
|
|||
|
|
"\n",
|
|||
|
|
" # Set up figure and axes manually for precise control\n",
|
|||
|
|
" fig = plt.figure(figsize=(9, 8))\n",
|
|||
|
|
" grid = fig.add_gridspec(nrows=3, height_ratios=[1, 15, 2])\n",
|
|||
|
|
"\n",
|
|||
|
|
" \n",
|
|||
|
|
" ax_main_title = fig.add_subplot(grid[0])\n",
|
|||
|
|
" ax_main_title.axis('off')\n",
|
|||
|
|
" ax_main_title.set_title(f\"{name_of_the_experiment} - Flagged as Risk vs. {outcome_label.title()}\", fontsize=14, weight='bold')\n",
|
|||
|
|
"\n",
|
|||
|
|
" # Heatmap\n",
|
|||
|
|
" ax_heatmap = fig.add_subplot(grid[1])\n",
|
|||
|
|
" ax_heatmap.set_title(f\"Confusion Matrix – Risk vs. {outcome_label.title()}\", fontsize=12, weight='bold', ha='center', va='center', wrap=False)\n",
|
|||
|
|
"\n",
|
|||
|
|
" cmap = sns.light_palette(\"#A73A52\", as_cmap=True)\n",
|
|||
|
|
"\n",
|
|||
|
|
" sns.heatmap(cm, annot=annot, fmt='', cmap=cmap, cbar=False,\n",
|
|||
|
|
" xticklabels=x_labels,\n",
|
|||
|
|
" yticklabels=['Flagged as Risk', 'Flagged as No Risk'],\n",
|
|||
|
|
" ax=ax_heatmap,\n",
|
|||
|
|
" linewidths=1.0,\n",
|
|||
|
|
" annot_kws={'fontsize': 10, 'linespacing': 1.2})\n",
|
|||
|
|
" ax_heatmap.set_xlabel(\"Resolution Outcome (Actual)\", fontsize=11, labelpad=10)\n",
|
|||
|
|
" ax_heatmap.set_ylabel(\"Flagging (Prediction)\", fontsize=11, labelpad=10)\n",
|
|||
|
|
" \n",
|
|||
|
|
" # Make borders visible\n",
|
|||
|
|
" for _, spine in ax_heatmap.spines.items():\n",
|
|||
|
|
" spine.set_visible(True)\n",
|
|||
|
|
"\n",
|
|||
|
|
" # Footer with metrics and date\n",
|
|||
|
|
" ax_footer = fig.add_subplot(grid[2])\n",
|
|||
|
|
" ax_footer.axis('off')\n",
|
|||
|
|
" metrics_text = f\"Total Booking Count: {row['count_total']} | Recall: {recall:.2f}% | Precision: {precision:.2f}% | F1 Score: {f1:.2f}% | F2 Score: {f2:.2f}% | ROC AUC: {roc_auc:.2f}% | PR AUC: {pr_auc:.2f}%\"\n",
|
|||
|
|
" date_text = f\"Generated on {date.today().strftime('%B %d, %Y')}\"\n",
|
|||
|
|
" ax_footer.text(0.5, 0.7, metrics_text, ha='center', fontsize=9)\n",
|
|||
|
|
" ax_footer.text(0.5, 0.1, date_text, ha='center', fontsize=8, color='gray')\n",
|
|||
|
|
"\n",
|
|||
|
|
" plt.tight_layout()\n",
|
|||
|
|
" plt.show()"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA5wAAAMVCAYAAAAbDfvBAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd5wN1//H8ffdvmxfltV2sXrvvUTvEoIo0SIEIVpI00IiCCLElyQ6QQjRa0IQJbpEL6v3VVYvu/P7w28ne22xdl3Xxuv5eNzH3nvmzMznzp07O597zpyxGIZhCAAAAACA58zB3gEAAAAAAP6bSDgBAAAAADZBwgkAAAAAsAkSTgAAAACATZBwAgAAAABsgoQTAAAAAGATJJwAAAAAAJsg4QQAAAAA2AQJJwAAAADAJkg4ASTZiRMnZLFYzMe6devsHVKyMGXKFKvt9qpbt26d1fY4ceKEvUOyqQEDBpjvNTg4ONHLYT+yjef1+SRVcv18X3TcrVu3NtdVsWLFJC8vODjYXN6AAQOSvDzgVUbCiVfaxYsXNWjQIFWoUEFp0qSRi4uLUqZMqTx58uidd97R8uXLZRiGXWJ7WU6+SSYTJvrJaXyPKVOm2DtUPEdPnlRHPZycnOTv76+SJUtq8ODBunHjhr1D/c+YPXu2qlevrjRp0sjZ2Vne3t7KnDmzKlasqA8++EArV660d4gvTEKTuuR4DHr06JFmz56txo0bK0uWLPLw8JCLi4syZMig2rVra+zYsbp27Zq9wwSQAE72DgCwl3Hjxqlnz566d++eVfnDhw+1f/9+7d+/X5MmTVJoaKhdf90GkPxERETo6tWr2rp1q7Zu3aqZM2fqr7/+kqenp1mnWrVq8vDwkCR5e3vbK9RkpWXLlpo+fbpVWXh4uMLDw3XixAn98ccfOnnypKpXr26nCJ+/YsWKafjw4fYO44X6559/1KRJE+3fvz/GtLNnz+rs2bNatmyZrly5YrPWx08//dT8oah06dI2WQfwqiDhxCtp2LBh6tOnj/na0dFRtWvXVpEiRWSxWHT06FGtXLlSFy9etGOUSM4++eQT+fr6xigvVqyYHaLBi/Lee+8pa9asCgsL0+zZs82eCQcPHtTkyZPVtWtXs27p0qU5kX0GK1assEo2ixQpourVq8vDw0OXL1/Wzp07tXnzZjtGaBt58uRRnjx57B3GC3Pw4EFVqFBBV69eNcvy5s2rGjVqyM/PT5cuXdKGDRu0Y8cOm8bx7rvv2nT5wCvFAF4x+/btMxwdHQ1JhiQjICDA2LlzZ4x6Dx48ML7//nvj4sWLVuVnzpwxevXqZeTNm9dImTKl4erqagQFBRnNmzc3tm7dGmM5/fv3N9cVFBRkXL9+3ejVq5eRKVMmw9nZ2cicObPxxRdfGJGRkeY8UfXjerRq1cowDMN4+PCh8dlnnxk1a9Y0smTJYnh7extOTk6Gn5+fUbZsWePbb781Hjx4EOt2OH36tNG7d2+jYMGChqenp+Hq6mpkzJjRqF+/vrFq1SrDMAwjKCgo3jgqVKhgGIZhhIaGWpWvXbs2xvoWLVpk1KtXz0ibNq3h7Oxs+Pj4GK+99poxY8YMq/ceZf369cbrr79upEuXznB2djZSpkxpBAUFGTVq1DD69+9vXL9+3ax769YtY+DAgUahQoUMDw8Pw8nJyUidOrVRoEABo127dsby5ctj3QbPU/TPWZIRGhr61HkmT55sNU90a9euNdq2bWsUKlTISJs2reHi4mK4u7sbWbNmNVq3bm3s3bs31mWeOHHCaNq0qeHn52ekTJnSKFeunPHbb7/Fuy7DMIy9e/caderUMTw9PQ1PT0+jRo0axq5du2Lsv0+6ceOG8eWXXxrFixc3vLy8DGdnZyNjxoxGq1atjH/++SfWGK9cuWJ06NDBCAgIMNzc3IwiRYoYs2fPNtauXfvM29AwDGP+/PlGixYtjHz58hkBAQHm/pIrVy6jc+fOsS
7n8uXLRs+ePY3cuXMbKVKkMJydnY00adIYxYoVMzp37mxs3rw5Qet+crtG3/cPHDhgNa1Dhw5W88a3bU+cOGG0b9/eCAkJMdzc3AxXV1cjXbp0RunSpY3u3bsb+/fvjzOG6D744AOz3MHBwZg4cWKc7yUiIsLIlCmTWb9///4x6vTu3ducni1bNrN87969RvPmzY2goCDDxcXFcHNzMzJmzGi89tprxkcffWScOXMmAVszft27dzfXHRISYjx69ChGnRs3bhgbN260KotvO8d37HpyvvDwcKNHjx5GhgwZDFdXVyNXrlzGmDFjYhy/WrVqZXWMPHTokPH6668bXl5ehq+vr9G0aVPjwoULhmEYxpo1a4yyZcsa7u7uRqpUqYy2bdsaV69etVpebJ/vk3HH9ujfv79RoUKFeOs8uT0uXLhgfPzxx0aBAgUMDw8Pw9XV1ciaNavRqVMn4+TJk7F+LidOnDDeeustw9fX10iRIoVRrlw5Y/Xq1U895sSlVKlSVvN9+eWXsf6P2L59u7Fw4cI4t3t0EydONBo1amTkzJnT8Pf3N5ycnAxPT0+jQIECRu/evY3Lly/HWH70/3/RvwtPHqcOHjxo9OvXz8iUKZPh7u5uFCtWzPyfc+nSJaNt27ZGqlSpDDc3N6NMmTLG+vXrE7wtgP8KEk68ct577z2rfxa//PJLguf9448/DF9f3zj/eTs4OBgjRoywmif6SYu/v7+RK1euWOft27evOU9CE86bN28+tW6VKlVinJgtXbrU8PT0jHOeDz74wDCM55NwRkREGG+//Xa8y2nUqJFVjGvWrLH6USC2x4EDB8z6FStWjLdukyZNEvwZJ9bzTjh79uwZ73tycXExVq9ebTVPaGiokTZt2lj3y9q1a8e5rm3bthkeHh4x5nNzczOqVq0a58np4cOHjeDg4DhjdHV1NX7++Werea5du2bkzJkz1vpPxpjQhLNhw4bxbisvLy+rBP3u3btGjhw54p2nT58+CVp3fAlneHi41bRPP/3Uat64EqGLFy8aqVOnjje+//3vf3HGEOXDDz80yxwdHY2ZM2c+9f307dvXnCd79uxW0yIjI60S0i+//NIwjMc/4qVIkSLeeJ/Hjz5dunQxl5cqVSrj6NGjCZrveSScadKkMYoWLRrre+vSpYvVMqMnPpkzZ471f0aOHDmMadOmGQ4ODjGmlS9f3mp5Lyrh3LRpk5EqVao463p7e8dIluI65lgsFqNWrVpxHnPismXLFqt56tatm6D5ntzuTyacRYoUiXc7pE+f3jh79qzVPAlNOGNbtoODgzF79mwjc+bMMaa5urpa/WAEvAroUotXzm+//WY+9/X11euvv56g+a5fv64GDRqYgxS4u7urTZs28vLy0qxZs3Ty5ElFRkaqV69eKlKkiCpUqBBjGWFhYbp27ZpatmypdOnS6ccff9SVK1ckSaNHj9Znn30mFxcXDR8+XMeOHdP48ePNeaN30cybN6+kxwNBZMmSRSVLllT69Onl6+urhw8f6uDBg5o7d64ePXqkNWvW6JdfflHjxo0lSSdPnlSjRo10584dcxn16tVTwYIFdfnyZf3+++/mOj/99FOdOHFCX375pVkW1WVQkjJmzPjU7TZs2DCzG5zFYlHDhg1VoEABhYaGavr06Xr48KHmzp2rggUL6pNPPpEkff/994qIiJAk5cyZU40aNZKTk5NOnTql3bt3a+fOnebyDxw4YA5k5ODgoJYtWyp79uy6cuWKQkND7TbI0Q8//BBrl9pevXolaP6UKVOqQoUKypcvn/z8/OTu7q6wsDAtXbpUBw4c0IMHD9S1a1era5zef/99XbhwwXxdq1YtFSlSREuXLtXSpUtjXY9hGGrbtq1u3bplljVt2lRZsmTRzz//rNWrV8c6X0REhN544w2zy2jq1KnVrFkz+fn5aeXKldq0aZPu37+vli1bqkiRIsqSJYsk6bPPPtPBgwfN5VSoUEEVKlTQn3/+GW
eMT+Pj46Nq1aopV65c8vX1lYuLiy5evKgFCxbo1KlTCg8PV58+fbRs2TJJ0tq1a3Xo0CFJkpubm9555x2lT59eFy5
|
|||
|
|
"text/plain": [
|
|||
|
|
"<Figure size 900x800 with 3 Axes>"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"metadata": {},
|
|||
|
|
"output_type": "display_data"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"# Plot confusion matrix for claim scenario\n",
|
|||
|
|
"plot_confusion_matrix_from_df(summary_df, 'RISK_VS_CLAIM', 'Contactless')"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "markdown",
|
|||
|
|
"metadata": {},
|
|||
|
|
"source": [
|
|||
|
|
"## Feature Importance\n",
|
|||
|
|
"Understanding what drives the predictions is useful both for future experiments and for business knowledge. Here we look at the native feature importances of the trees as well as a heavier (more time-consuming) SHAP value analysis.\n",
|
|||
|
|
"\n",
|
|||
|
|
"Important! Be aware that the SHAP analysis can take a long time: in the run saved in this notebook, explaining about 6,400 test rows took roughly 1 hour 26 minutes."
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"id": "d66ffe2c",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAx0AAAMWCAYAAACUXutQAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAADqGUlEQVR4nOzdeXhN1/v//+cJmUdBhAgxRCSamLXmqLQISgdB1VhT1Rutsb9Wxawq5lJFE1U6mapEK1VSYoohaohQQ6Mt1ZqHlkjO7w/f7I/TBEk4gr4e13WuK2cPa99rZZ9k32ettbfJbDabERERERERsRKb/A5AREREREQeb0o6RERERETEqpR0iIiIiIiIVSnpEBERERERq1LSISIiIiIiVqWkQ0RERERErEpJh4iIiIiIWJWSDhERERERsSolHSIiIiIiYlVKOkRERERExKqUdIiIiAgAMTExmEymbF/Dhg2zyjE3b95MZGQk58+ft0r59yKzPXbs2JHfoeTZrFmziImJye8wRCiY3wGIiIjIw2XUqFGUKVPGYtkTTzxhlWNt3ryZkSNH0qVLFzw8PKxyjP+yWbNmUaRIEbp06ZLfoch/nJIOERERsdCsWTNq1KiR32HckytXruDs7JzfYeSbq1ev4uTklN9hiBg0vEpERERyZc2aNdSvXx9nZ2dcXV1p3rw5+/fvt9jmp59+okuXLpQtWxYHBwe8vb3p1q0bZ86cMbaJjIxk8ODBAJQpU8YYynX8+HGOHz+OyWTKdmiQyWQiMjLSohyTycSBAwd4+eWXKVSoEPXq1TPWf/rpp1SvXh1HR0c8PT1p164dJ06cyFPdu3TpgouLC6mpqbRo0QIXFxd8fHz44IMPANi7dy9PP/00zs7OlC5dmsWLF1vsnzlk68cff6RXr14ULlwYNzc3OnXqxLlz57Icb9asWVSqVAl7e3tKlCjB66+/nmUoWmhoKE888QQ7d+6kQYMGODk58f/9f/8ffn5+7N+/n/j4eKNtQ0NDATh79iyDBg0iODgYFxcX3NzcaNasGXv27LEoe8OGDZhMJr788kvGjh1LyZIlcXBwoHHjxvz8889Z4t22bRvh4eEUKlQIZ2dnQkJCmDZtmsU2Bw8e5KWXXsLT0xMHBwdq1KjBypUrLbZJS0tj5MiR+Pv74+DgQOHChalXrx5xcXE5+j3Jw0c9HSIiImLhwoUL/PXXXxbLihQpAsDChQvp3LkzTZo04b333uPq1avMnj2bevXqsXv3bvz8/ACIi4vj6NGjdO3aFW9vb/bv389HH33E/v372bp1KyaTiRdeeIFDhw7x2WefMWXKFOMYRYsW5c8//8x13G3atMHf359x48ZhNpsBGDt2LMOHDyciIoLu3bvz559/MmPGDBo0aMDu3bvzNKQrPT2dZs2a0aBBAyZOnMiiRYvo27cvzs7OvP3223To0IEXXniBDz/8kE6dOlG7du0sw9X69u2Lh4cHkZGRpKSkMHv2bH755RfjIh9uJlMjR44kLCyM1157zdguMTGRhIQEbG1tjfLOnDlDs2bNaNeuHa+88grFihUjNDSU//3vf7i4uPD2228DUKxYMQCOHj3KihUraNOmDWXKlOGPP/5gzpw5NGzYkAMHDlCiRAmLeCdMmICNjQ2DBg3iwoULTJw4kQ4dOrBt2zZjm7i4OFq0aEHx4sXp378/3t7eJCcns2rVKvr37w/A/v37qVu3Lj4+PgwbNgxnZ2e+/PJLWrduzdKlS3n++eeNuo8fP57u3btTq1YtLl68yI4dO9i1axfPPPNMrn9n8hAwi4iIiJjN5ujoaDOQ7ctsNpsvXbpk9vDwMPfo0cNiv1OnTpnd3d0tll+9ejVL+Z999pkZMP/444/Gsvfff98MmI8dO2ax7bFjx8yAOTo6Oks5gHnEiBHG+xEjRpgBc/v27S22O378uLlAgQLmsWPHWizfu3evuWDBglmW3649EhMTjWWdO3c2A+Zx48
YZy86dO2d2dHQ0m0wm8+eff24sP3jwYJZYM8usXr26+fr168byiRMnmgHz119/bTabzebTp0+b7ezszM8++6w5PT3d2G7mzJlmwPzxxx8byxo2bGgGzB9++GGWOlSqVMncsGHDLMv/+ecfi3LN5pttbm9vbx41apSxbP369WbAHBgYaL527ZqxfNq0aWbAvHfvXrPZbDbfuHHDXKZMGXPp0qXN586dsyg3IyPD+Llx48bm4OBg8z///GOxvk6dOmZ/f39jWeXKlc3NmzfPErc8ujS8SkRERCx88MEHxMXFWbzg5jfZ58+fp3379vz111/Gq0CBAjz55JOsX7/eKMPR0dH4+Z9//uGvv/7iqaeeAmDXrl1Wibt3794W75ctW0ZGRgYREREW8Xp7e+Pv728Rb251797d+NnDw4OAgACcnZ2JiIgwlgcEBODh4cHRo0ez7N+zZ0+LnorXXnuNggULEhsbC8D333/P9evXGTBgADY2/3e51qNHD9zc3Fi9erVFefb29nTt2jXH8dvb2xvlpqenc+bMGVxcXAgICMj299O1a1fs7OyM9/Xr1wcw6rZ7926OHTvGgAEDsvQeZfbcnD17lh9++IGIiAguXbpk/D7OnDlDkyZNOHz4ML/99htws03379/P4cOHc1wnebhpeJWIiIhYqFWrVrYTyTMvAJ9++uls93NzczN+Pnv2LCNHjuTzzz/n9OnTFttduHDhPkb7f/49hOnw4cOYzWb8/f2z3f7Wi/7ccHBwoGjRohbL3N3dKVmypHGBfevy7OZq/DsmFxcXihcvzvHjxwH45ZdfgJuJy63s7OwoW7assT6Tj4+PRVJwNxkZGUybNo1Zs2Zx7Ngx0tPTjXWFCxfOsn2pUqUs3hcqVAjAqNuRI0eAO9/l7Oeff8ZsNjN8+HCGDx+e7TanT5/Gx8eHUaNG0apVKypUqMATTzxB06ZN6dixIyEhITmuozxclHSIiIhIjmRkZAA353V4e3tnWV+w4P9dVkRERLB582YGDx5MlSpVcHFxISMjg6ZNmxrl3Mm/L94z3Xpx/G+39q5kxmsymVizZg0FChTIsr2Li8td48hOdmXdabn5/80vsaZ/1/1uxo0bx/Dhw+nWrRujR4/G09MTGxsbBgwYkO3v537ULbPcQYMG0aRJk2y3KV++PAANGjTgyJEjfP3116xdu5Z58+YxZcoUPvzwQ4teJnl0KOkQERGRHClXrhwAXl5ehIWF3Xa7c+fOsW7dOkaOHMm7775rLM9uqMztkovMb9L/faemf3/Df7d4zWYzZcqUoUKFCjne70E4fPgwjRo1Mt5fvnyZkydPEh4eDkDp0qUBSElJoWzZssZ2169f59ixY3ds/1vdrn2XLFlCo0aNmD9/vsXy8+fPGxP6cyPz3Ni3b99tY8ush62tbY7i9/T0pGvXrnTt2pXLly/ToEEDIiMjlXQ8ojSnQ0RERHKkSZMmuLm5MW7cONLS0rKsz7zjVOa34v/+Fnzq1KlZ9sl8lsa/kws3NzeKFCnCjz/+aLF81qxZOY73hRdeoECBAowcOTJLLGaz2eL2vQ/aRx99ZNGGs2fP5saNGzRr1gyAsLAw7OzsmD59ukXs8+fP58KFCzRv3jxHx3F2ds72ae8FChTI0iZfffWVMacit6pVq0aZMmWYOnVqluNlHsfLy4vQ0FDmzJnDyZMns5Rx6x3L/v27cXFxoXz58ly7di1P8Un+U0+HiIiI5IibmxuzZ8+mY8eOVKtWjXbt2lG0aFFSU1NZvXo1devWZebMmbi5uRm3k01LS8PHx4e1a9dy7NixLGVWr14dgLfffpt27dpha2tLy5YtcXZ2pnv37kyYMIHu3btTo0YNfvzxRw4dOpTjeMuVK8eYMWN46623OH78OK1bt8bV1ZVjx46xfPlyevbsyaBBg+5b++TG9evXady4MREREaSkpDBr1izq1avHc889B9y8bfBbb73FyJEjadq0Kc8995yxXc2aNXnllV
dydJzq1asze/ZsxowZQ/ny5fHy8uLpp5+mRYsWjBo1iq5du1KnTh327t3LokWLLHpVcsPGxobZs2fTsmVLqlSpQte
|
|||
|
|
"text/plain": [
|
|||
|
|
"<Figure size 800x800 with 1 Axes>"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"metadata": {},
|
|||
|
|
"output_type": "display_data"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"## BUILT-IN\n",
|
|||
|
|
"\n",
|
|||
|
|
"# Get feature importances from the model\n",
|
|||
|
|
"importances = best_pipeline.named_steps['model'].feature_importances_\n",
|
|||
|
|
"features = X.columns\n",
|
|||
|
|
"\n",
|
|||
|
|
"# Create a Series and sort\n",
|
|||
|
|
"feat_series = pd.Series(importances, index=features).sort_values(ascending=True) # ascending=True for horizontal plot\n",
|
|||
|
|
"\n",
|
|||
|
|
"# Plot Feature Importances\n",
|
|||
|
|
"plt.figure(figsize=(8, 8))\n",
|
|||
|
|
"feat_series.plot(kind='barh', color='skyblue')\n",
|
|||
|
|
"plt.title('Feature Importances')\n",
|
|||
|
|
"plt.xlabel('Importance')\n",
|
|||
|
|
"plt.grid(axis='x')\n",
|
|||
|
|
"plt.tight_layout()\n",
|
|||
|
|
"plt.show()"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "markdown",
|
|||
|
|
"metadata": {},
|
|||
|
|
"source": [
|
|||
|
|
"### Interpreting the Feature Importance Plot\n",
|
|||
|
|
"The **feature importance plot** shows how much each feature contributes to the model’s overall decision-making.\n",
|
|||
|
|
"\n",
|
|||
|
|
"For tree-based models like Random Forest, importance is based on how often and how effectively a feature is used to split the data across all trees.\n",
|
|||
|
|
"A higher score means the feature plays a bigger role in improving prediction accuracy.\n",
|
|||
|
|
"\n",
|
|||
|
|
"In the graph you will see that:\n",
|
|||
|
|
"* Features are ranked from most important (top bar) to least important (bottom bar).\n",
|
|||
|
|
"* The values are relative and model-specific — not directly interpretable as weights or probabilities.\n",
|
|||
|
|
"\n",
|
|||
|
|
"This helps us identify which features the model relies on most when making predictions.\n",
|
|||
|
|
"\n",
|
|||
|
|
"**Important!**\n",
|
|||
|
|
"Unlike SHAP values, native importance doesn't show how a feature affects predictions — only how useful it is to the model overall. For deeper interpretability (e.g., direction and context), SHAP is better (but it takes more time to run)."
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"id": "e2197cea",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"name": "stderr",
|
|||
|
|
"output_type": "stream",
|
|||
|
|
"text": [
|
|||
|
|
"PermutationExplainer explainer: 6417it [1:26:06, 1.24it/s] \n"
|
|||
|
|
]
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"## SHAP VALUES\n",
|
|||
|
|
"\n",
|
|||
|
|
"# SHAP requires that all features passed to Explainer be numeric (floats/ints)\n",
|
|||
|
|
"X_test_shap = X_test.copy()\n",
|
|||
|
|
"X_test_shap = X_test_shap.astype(float)\n",
|
|||
|
|
"\n",
|
|||
|
|
"# Function that returns the probability of the positive class\n",
|
|||
|
|
"def model_predict(data):\n",
|
|||
|
|
" return best_pipeline.predict_proba(data)[:, 1]\n",
|
|||
|
|
"\n",
|
|||
|
|
"# Ensure input to SHAP is numeric\n",
|
|||
|
|
"X_test_shap = X_test.astype(float)\n",
|
|||
|
|
"\n",
|
|||
|
|
"# Create SHAP explainer\n",
|
|||
|
|
"explainer = shap.Explainer(model_predict, X_test_shap)\n",
|
|||
|
|
"\n",
|
|||
|
|
"# Compute SHAP values\n",
|
|||
|
|
"shap_values = explainer(X_test_shap)"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"id": "9cae1a51",
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"name": "stderr",
|
|||
|
|
"output_type": "stream",
|
|||
|
|
"text": [
|
|||
|
|
"/tmp/ipykernel_795/3711913411.py:2: FutureWarning: The NumPy global RNG was seeded by calling `np.random.seed`. In a future version this function will no longer use the global RNG. Pass `rng` explicitly to opt-in to the new behaviour and silence this warning.\n",
|
|||
|
|
" shap.summary_plot(shap_values.values, X_test_shap)\n"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAzYAAAJsCAYAAAAvLTZkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3gUZdfA4d9sS+8JIRAIvYNAQlNApIl0aWJBBKWDAoIiFtBXP+XVF2lSRAVpShcSAaWDIF1R6b0kBAjp2STb5vsjZMmyCSQIJMC5r2svss88O3N2dnaZM08ZRVVVFSGEEEIIIYR4gGkKOwAhhBBCCCGE+LcksRFCCCGEEEI88CSxEUIIIYQQQjzwJLERQgghhBBCPPAksRFCCCGEEEI88CSxEUIIIYQQQjzwJLERQgghhBBCPPAksRFCCCGEEEI88CSxEUIIIYQQQjzwJLERQgghhBDiITN+/Hg8PT1vu+zs2bMoisKyZcsKtP47fd29pCvsAIQQQgghhBCFIyQkhN9//51KlSoVdij/miQ2QgghhBBCPKJcXFxo2LBhYYdxV0hXNCGEEEIIIR5RuXUpM5lMvP766/j7++Pr68uAAQNYtGgRiqJw9uxZh9dnZGQwdOhQ/Pz8CAkJYdSoUVgslvv8LrJIYiOEEEIIIcRDymKxOD1sNtstXzNmzBhmzZrF22+/zeLFi7HZbIwZMybXuu+++y4ajYYlS5YwcOBA/ve///HNN9/ci7dyW9IVTQghhBBCiIdQWloaer0+12UeHh65lsfHxzNjxgzee+893n77bQCefvppWrZsyYULF5zqN2jQgClTpgDQqlUrNm/ezLJlyxg4cOBdehf5J4mNEEIIIUQRZzabmTNnDgB9+vTJ82RVPKSULnkvU1fkucjNzY1t27Y5lX/99dcsWrQo19f8/fffZGRk0LFjR4fyTp06sXHjRqf6rVu3dnherVo1Nm3alHe895AkNkIIIYQQQjyENBoNERERTuVRUVF5vubSpUsABAUFOZQXK1Ys1/q+vr4Ozw0GAxkZGQWM9O6QMTZCCCGEEEIUacotHndXSEgIAFevXnUov3Llyl3f1t0miY0QQgghhBACgBo1auDq6sqqVascyn/66afCCagApCuaEEIIIYQQRdrdb5nJS0BAAIMGDeKTTz7B1dWV2rVrs3TpUo4fPw5kdW8rqopuZEIIIYQQQoj77rPPPqN///58+umndO/eHbPZbJ/u2cfHp5Cjy5uiqqpa2EEIIYQQQoi8yaxojzile97L1KX3JYRevXrx22+/cebMmfuyvTshXdGEEEIIIYQQdlu3bmXHjh2Eh4djs9mIiopi4cKFTJw4sbBDuyVJbIQQQgghhBB2np6eREVFMWHCBNLT0ylbtiwTJ05k+PDhhR3aLUliI4QQQgghRJF2/yYPAAgPD2fnzp33dZt3g0weIIQQQgghhHjgSWIjhBBCCCGEeOBJYiOEEEIIIYR44MkYGyGEEEIIIYq0+zvG5kElLTZCCCGEEEKIB5602AghhBBCCFGkSYtNfkiLjRBCCCGEEOKBJy02QgghhBBCFGnSYpMf0mIjhBBCCCGEeOBJi40QQgghhBBFmrTY5Ie02AghhBBCCCEeeNJiI4QQQgghRJEmLTb5IS02QgghhBBCiAeetNgIIYQQQghRpEmLTX5Ii40QQgghxINm9BxQumQ9tF1h48HCjkiIQieJjRBCCCHEg6TXFPgi8sZzmwotP4RMU+HFJEQRIImNEEIIIcQDwjsmGd3iHbkvfOqD+xuMuG9UlDwf4gZJbIQQQgghHhAdPt2Z96ns78fvZyhCFDmS2AghhBBCFEE//pXJh6viiTda7GVu0ttMiDzJrGhCCCGEEEXIlTQb5Sem4mY1E37xNN/sCUV1N/BBUGFHJkTRJomNEEIIIUQR0vireLr/fYAZK77BxWrBotHwfuseTKvUhH6su/WL95+E8Ar3J1BxH8lYmv
yQrmhCCCGEEEXI5XQdU3+ag4s1qwuazmbj418W0/jcidu/+Kn373F0QhRd0mIjhBBCCFFEJGXYqBIXg4c506Fcq6pc9A3CBmhvtYKUzFstFQ+oW81+Jm05N0iLjRBCCCFEEdFm2lWSDW4ku7g5lJs1WvaXKk+m9tbXpNV7GZwQRZwkNkIIIYQQRcQuqy8ni4XQr9sAUlxcAUjX6RnesTcxPv5YNbc+dVMAwt+894GK+0y5xUNkk8TmIRETE0NERASzZs0q7FDuiqVLl9K1a1caNWpEREQEMTExhR1Sgezbt4+IiAgiI2/cGfpefkazZs266/spIiKC8ePH37X13W8PevxCFFRkZCQRERHs27evsEMRd+jnUxa8M4xYtDqW1H6cEu/N4onBHxH63kymP9EGgHh3r9uuRz1wBpLS7nW4QhQ5MsZGFDn79u1jwoQJPPnkk/Tu3RudToefn19hhyUeArNmzaJy5co0a9assEMRhSQmJobIyEiaNWtG5cqVCzscIez2HU2h78J0kr1v/H+X6urGzrJV7M/rXDxNWNK1265LAfDtBamLwMP1HkQr7j9pmckPSWxEkbN7924APvjgA3x8fAo5mrsnJCSEHTt2oNXectinuEty29ezZ8+mffv2ktg8wmJiYpg9ezYlSpR46BKbtm3b0rp1a/R6fWGHIgqo2Xsn6L5xPZ9npNP7xTfyrHcyMIQ/S4RRO+bcbdepAhbPF7B90RuXNzvdxWiFKLoksRF3xGKxYLVacXFxuevrjouLA3iokhoARVHuyf4SuZN9LR5kGRkZ6HQ6dLr8/zet1WrlwkkRpqoqZxKsvLsuDeO+s9T4+xC/l63MrrDKeOgCSG7QkiqxF265jhRXN3q8NIJj/x1+2+v3CqAH1FHfkzlqHoqLgqFHExjbBSqVBEXJeogHwq1mRRM3SGJTAJGRkXz44YfMmDGDo0ePsmzZMq5cuUJISAh9+/alffv2QNYVwY4dO9KvXz8GDBjgsI5Zs2Yxe/ZsVq9eTYkSJQAYP348UVFRbNiwgUmTJrF9+3bMZjP16tXjnXfeITAwkBUrVrBo0SJiYmIICQlh2LBheV51XrduHXPnzuX8+fP4+fnRsWNHXn31Vaf/IOPi4pg9eza//fYb165dw9fXlyZNmjBo0CD8/f2dYl68eDGrVq1iw4YNxMXFMX36dCIiIvK9/7Zs2cK8efM4fvw4iqJQsWJFXn75Zfv7yN5v2bLXXbduXb7++ut8bePq1assWLCAvXv3cunSJTIzMylZsiTt2rWjV69eDv/pZ3+eX331FX/++SeRkZFcu3aNsLAw+vTpw9NPP+2w7g4dOhASEsLIkSOZNGkShw4dQq/X06RJE9544w2HfZabWx0Xv/76K4sXL+bEiRNYrVYqVKhAr169aNmypUM9m83G999/z8qVK4mLiyM0NJQ+ffrka9/k5dSpU0yaNIk//vgDg8HA448/zsiRI/Osn99YIyIiaN++Pc888wwzZszgxIkTeHp60qpVKwYPHoy7u7vT/pkxYwa7d+8mJSWFYsWK0bp1a1599VVcXW90pUhKSuKbb75h27ZtXL16FTc3N0JCQmjdujUvv/yy0/bHjx/vcGxFRUURFRVlr1eQ8QjZx8CoUaOYNGkSf//9N66urrRt25Zhw4ZhtVqZMWMGv/zyC0lJSVSvXp2xY8dStmxZ+zrS0tL4/vvv2b17NxcvXsRoNBIcHEyLFi3o16+fw3vdt28fAwcOZNy4caiqyoIFC7hw4QIBAQF0796d3r17O8S3a9cuVq1axeHDh4mLi0Ov11O9enX69u1LeHi40/vZuHEj33zzDefOncPPz49OnTrx2GOPMWTIEMaNG0eHDh3sdU0mEwsWLGDdunVcvHgRg8FAnTp1GDBgAFWq3OgqkzPmjIwMfvjhB2JjYylVqhRDhw6lSZMmnDx5ksmTJ/PXX3+h0+lo06YNI0aMcPqNOn/+PLNnz2bPnj0kJSURFBREy5
Yt6d+/P25uN2aMyv4N3bJlC1OnTmXTpk2kpaVRpUoVRo4cSY0aNYAb33mADz/80P53QX5jIOsYWrJkCefPn8disRA
|
|||
|
|
"text/plain": [
|
|||
|
|
"<Figure size 800x630 with 2 Axes>"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"metadata": {},
|
|||
|
|
"output_type": "display_data"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"# Plot summary\n",
|
|||
|
|
"shap.summary_plot(shap_values.values, X_test_shap)"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "markdown",
|
|||
|
|
"metadata": {},
|
|||
|
|
"source": [
|
|||
|
|
"### Interpreting the SHAP Summary Plot\n",
|
|||
|
|
"\n",
|
|||
|
|
"Each row of the plot corresponds to one feature, and each point on that row is the SHAP value of that feature for a single prediction.\n",
|
|||
|
|
"The x-axis shows how much the feature contributed to increasing or decreasing the prediction.\n",
|
|||
|
|
"* Right (positive SHAP value): pushes prediction toward the positive class (i.e., higher chance of incident).\n",
|
|||
|
|
"* Left (negative SHAP value): pushes prediction toward the negative class (i.e., lower chance of incident).\n",
|
|||
|
|
"\n",
|
|||
|
|
"Color shows the actual feature value for that point:\n",
|
|||
|
|
"* Red = high value\n",
|
|||
|
|
"* Blue = low value\n",
|
|||
|
|
"\n",
|
|||
|
|
"In other words:\n",
|
|||
|
|
"* The position tells you impact.\n",
|
|||
|
|
"* The color tells you feature value.\n",
|
|||
|
|
"* The density (vertical thickness) of the dots shows how many predictions share a similar SHAP value."
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABMcAAAKlCAYAAADPf4s8AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd1hT1xsH8G8gYQ/ZUwQcuMA90Lr3qlsrWq2jjqq11lFHt9a2au2ytlat/dWtuOreWxQnuMGBiyFT9sz5/UETiQkQEAzI9/M8eZRzz73nvTc3gbw5QyKEECAiIiIiIiIiIqqA9HQdABERERERERERka4wOUZERERERERERBUWk2NERERERERERFRhMTlGREREREREREQVFpNjRERERERERERUYTE5RkREREREREREFRaTY0REREREREREVGExOUZERERERERERBUWk2NERERERERERFRhMTlGREREREREREQVFpNjRERERERERERUYTE5RkRERERERERUwuRyOebPn4+qVatCJpOhatWqWLhwIWrWrAm5XF7k4/3xxx9wc3NDRkZGKURbsUmEEELXQRARERERERERvUmWLl2KDz/8ENOmTYOPjw8sLS0xcuRILF68GCNHjizy8dLT0+Hu7o45c+bgww8/LIWINcvJycHFixcRGhqKjIwMWFtbo0mTJnB1dS3ysS5fvoyLFy/CysoKAwcOVNkWHR2NCxcuICoqCgBgb2+PZs2awdbWVqVeVlYWgoKC8OzZM0RHRyMjIwNt2rSBl5dXsc+RPceIiIiIiIiIiErY6tWr0alTJyxatAjvvvsu7t+/j+zsbAwZMqRYxzMyMsKIESOwZMkSvM5+TsePH0dwcDCqVauGFi1aQE9PD/v27UNkZGSRjpOcnIyrV69CKpWqbYuJicG///6LpKQkNGrUCA0bNkRiYiJ27dqFhIQElbrp6em4fPkyEhISYG1t/SqnpsTkGBERERERERFRCUpPT0dQUBBat26tLFu9ejXefvttGBkZFfu4gwYNwsOHD3Hs2LGSCLNQz549w71799C0aVM0b94ctWrVQo8ePWBubo7z588X6Vjnzp2Dvb097Ozs1LZduHABUqkUvXv3ho+PD+rVq4fevXsDAAIDA1XqmpiYYNiwYfDz80Pz5s2Lf3J5MDlGRERERERERFRCRo8eDWNjY+Tk5ODTTz+FRCKBk5MTgoOD0bFjR7X6T58+hZGREUaNGqVSfvjwYchkMkydOlVZ1qhRI1hbW2Pnzp2lfh4AcP/+fUgkEtSqVUtZJpVK4eXlhaioKCQnJ2t1nIiICDx48AAtWrTQuD0yMhIuLi4qiUMTExM4OTnh0aNHyMrKUpbr6+vDxMSkmGekGZNjREREREREREQlZOjQoRg3bhwA4Oeff8aaNWswfvx4AEDDhg3V6ru4uGDMmDFYu3YtHj58CAC4ffs2Bg4ciG7duuGHH35Qqd+wYUOcOXOm0DjkcjnS09O1euQ3TDM2NhaWlpYwMDBQKbe3t1du1yaOM2fOoGbNmvkOg8zJyYG+vr5auVQqhVwuR1xcXKHtvAr1gZ5ERERERERERFQs7du3x5EjR2BqaopJkyZBT08Pn332GQDAw8ND4z6zZ8/GypUr8f3332PevHno2bMn3N3dsWHDBujpqfZr8vT0xJo1awqNIzIyErt379Yq5iFDhsDc3FytPDU1VWMvLUVZSkpKoce+desWkpOT0aNHj3zrVKpUCc+ePYNcLleeb05ODp49e6Z1O6+CyTEiIiIiIiIiohIUHByMOnXqKBM9sbGxkEqlMDMz01jfxcUF77//PlasWIHLly8jLS0NJ06cgKmpqVpdKysrpKWl5Zu4UrCxsUH37t21itfY2FhjeXZ2tsYeXYqynJycAo+bnp6OixcvomHDhvm2AQC1a9fG6dOncfLkSdSrVw9CCFy+fBmpqalatfOqmBwjIiIiIiIiIipBQUFB6NKlS5
H2mT59OpYuXYrg4GCcOnUKLi4uGusphkBKJJICj2doaAhXV9cixfAyqVSqMTGlKNOUOMvrwoULMDQ0RJ06dQqsV7t2bSQnJyM4OBghISEAADs7O9SrVw9XrlyBTCYr5hloh8kxIiIiIiIiIqISkpCQgMePH8Pb21tZZmNjg+zsbCQlJWkcvggA33zzDYDc3lr5zc0FAPHx8TAxMSmwJxaQm8DKyMjQKmYjIyO14ZtA7vBJTUMaFT26NPVsU3j+/Dlu374NX19fZX1FXHK5HElJSZDJZMpJ+Js2bYp69eohPj4eBgYGsLa2Vq5UaWlpqdV5FBeTY0REREREREREJSQ4OBgA4OPjoyyrWbMmAODBgwcq5QqLFi3CypUrsXTpUsyYMQPffPMNVq5cqfH4Dx48UFk9Mj9RUVGvPOeYjY0NwsPDkZmZqTIpv2IuMBsbm3yPmZKSAiEEzp49i7Nnz6pt37BhA+rWrauygqWhoSEcHR2VPz99+hSmpqaoVKmSVudRXEyOERERERERERGVkKCgIACqyTFfX18AwMWLF9WSYzt27MCsWbMwb948TJw4EaGhoVi2bBnmzp2rcQL/y5cvY+jQoYXGURJzjnl6eiI4OBi3bt1CvXr1AOT2/Lpz5w7s7e2Vc6hlZ2cjOTkZRkZGyp5g1tbW6Ny5s9oxL1y4gKysLLRo0QIWFhb5xnTv3j1ER0ejefPmhQ4hfVVMjhERERERERERlZDg4GC4uLioDI309PRE3bp1cfjwYYwaNUpZfunSJQwdOhRDhw7F3LlzAQAzZ87EH3/8obH32KVLlxAXF4fevXsXGkdJzDlmb28PT09PBAYGIi0tDZaWlggJCUFSUhLatGmjrPfs2TPs3r0bDRs2ROPGjQHkDtV0d3dXO+a1a9cAQGVbREQELl++DBcXFxgZGeHZs2e4c+cOKleujLp166od4/r168jMzFQO13z48KFy+GfdunVVerlpg8kxIiIiIiIiIqISEhwcrHHo5KhRo/D5558jLS0NxsbGePLkCXr16oUGDRpgxYoVynrOzs4YNWoUVq5cqdZ7bMuWLXBzc0P79u1fy7kAQNu2bWFmZobQ0FBkZmbC2toaXbt2hZOTU4m1YWpqColEguDgYGRlZcHc3BxNmjSBt7e3xrnQgoODkZycrPw5LCwMYWFhAIDq1asXOTkmEYplDoiIiIiIiIiIqFQ8f/4cnp6eWLhwIUaPHl3k/TMyMuDu7o5Zs2ZhypQppRBhxaWefiMiIiIiIiIiohJlaWmJmTNnYtGiRZDL5UXef/Xq1ZDJZBg/fnwpRFexsecYERERERERERFVWOw5RkREREREREREFRaTY0REREREREREVGExOUZERERERERERBUWk2NERERERERERFRhMTlGREREREREREQVFpNjREREREREREQ6kJiYiLZt2yIxMVHXoVRoTI4REREREREREelAYmIiTpw4weSYjjE5RkREREREREREFRaTY0REREREREREVGExOUZERERERERERBUWk2NERERERERERDpgYWGBVq1awcLCQtehVGgSIYTQdRBERERERERERBVRcnIyzMzMdB1GhcaeY0REREREREREOnL37l1dh1DhMTlGREREREREREQVFpNjREREREREREQ64u7urusQKjwmx4iIiIiIiIiIdCQlJUXXIVR4TI4REREREREREelIdHS0rkOo8JgcIyIiIiIiIiKiCksihBC6DoKIiIiIiIiIqCISQkAikeg6jAqNPceIiIiIiIiIiHTk1q1bug6hwmNyjIiIiIiIiIhIRzIzM3UdQoXH5BgRERERERERkY5YWFjoOoQKj3OOERERERERERHpSGpqKkxMTHQdRoXGnmNERERERERERDoSEhKi6xAqPCbHiIiIiIiIiIiowmJyjIiIiIiIiIhIR9zc3HQdQoXH5BgRERERERERkY5kZGToOoQKj8kxIiIiIiIiIiIdiYqK0nUIFR6TY0
REREREREREVGFJhBBC10EQEREREREREVVEOTk50NfX13UYFRp7jhERERERERER6UhoaKiuQ6jwmBwjIiIiIiIiItK
|
|||
|
|
"text/plain": [
|
|||
|
|
"<Figure size 800x750 with 3 Axes>"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"metadata": {},
|
|||
|
|
"output_type": "display_data"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"# Show the individual prediction for the highest predicted instance\n",
|
|||
|
|
"highest_pred_index = np.argmax(shap_values.values[:, 0]) \n",
|
|||
|
|
"\n",
|
|||
|
|
"# Use waterfall plot for a single instance\n",
|
|||
|
|
"shap.plots.waterfall(shap_values[highest_pred_index], max_display=20)"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"cell_type": "code",
|
|||
|
|
"execution_count": null,
|
|||
|
|
"metadata": {},
|
|||
|
|
"outputs": [
|
|||
|
|
{
|
|||
|
|
"data": {
|
|||
|
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABMcAAAKlCAYAAADPf4s8AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd1hTZxsG8DuBsDcIMgRBFHHgVnDvhVonjlK31larraute1ZbR9tPbeveWhUcrXtvFMWBG5y4WSKyBc73R5pISICAQJDcv+vKpbznPed9zslJIE/eIRIEQQAREREREREREZEWEms6ACIiIiIiIiIiIk1hcoyIiIiIiIiIiLQWk2NERERERERERKS1mBwjIiIiIiIiIiKtxeQYERERERERERFpLSbHiIiIiIiIiIhIazE5RkREREREREREWovJMSIiIiIiIiIi0lpMjhERERERERERkdZicoyIiIiIiIiIiLQWk2NERERERERERKS1mBwjIiIiIiIirZGZmYk5c+agQoUKkEgkqFChAn755RdUrlwZmZmZ+T7eX3/9BWdnZ6SmphZBtERUHESCIAiaDoKIiIiIiIioOCxduhSjR4/GuHHj4OXlBXNzcwwaNAgLFy7EoEGD8n28lJQUlC9fHpMmTcLo0aOLIGLVMjIycPnyZYSHhyM1NRVWVlaoV68enJyc8tz3/fv3uH79OiIjIxEVFYXU1FQ0a9YMHh4eCvUiIyMRFhaGly9f4t27d9DX14ednR3q1q0LCwuLIjozouLHnmNERERERESkNdauXYs2bdpgwYIF+OKLL/Dw4UOkp6ejb9++BTqegYEBBgwYgMWLF6M4+56cPHkSoaGhcHd3R8OGDSEWi3HgwAG8evUqz31TUlJw5coVxMXFwcrKKsd6169fx6NHj+Dg4ICGDRvC09MTL1++xM6dOxEbG1uYp0OkUUyOERERERERkVZISUnB9evX0bRpU3nZ2rVr0aVLFxgYGBT4uH5+fnjy5AlOnDhRGGHmKTIyEg8ePED9+vXh7e0NT09P+Pr6wtTUFBcvXsxzfyMjI/j7+6Nfv37w9vbOsV716tXRr18/NGrUCJUrV0bt2rXRpUsXCIKAa9euFeIZEWkWk2NERERERERU6g0ZMgSGhobIyMjAlClTIBKJYG9vj9DQULRu3Vqp/vPnz2FgYIDBgwcrlB89ehQSiQTfffedvKxOnTqwsrLCnj17ivw8AODhw4cQiUTw9PSUl+nq6sLDwwOvX79GQkJCrvvr6OjAyMgoz3bKli0LHR0dhTJzc3NYWloiLi6uQLETlURMjhEREREREVGp9/nnn+PLL78EAPz+++/YuHEjRowYAQCoXbu2Un1HR0cMHToUmzZtwpMnTwAAd+/eRa9evdChQwcsWrRIoX7t2rVx7ty5POPIzMxESkqKWo+chmnGxMTA3Nwcenp6CuW2trby7UVFEAQkJyd/VE87opJGV9MBEBERERERERW1li1b4tixYzA2NsaoUaMgFosxdepUAICrq6vKfX788UesWrUKP//8M2bPno1OnTqhfPny2Lp1K8Rixb4mbm5u2LhxY55xvHr1Cnv37lUr5r59+8LU1FSpPCkpSWXPL1lZYmKiWscviPv37yMxMRF16tQpsjaIihuTY0RERERERKQVQkNDUbVqVXliKyYmBrq6ujAxMVFZ39HREcOGDcPKlStx5coVJCcn49SpUzA2Nlaqa2lpieTk5BwTVzLW1tbo2LGjWvEaGhqqLE9PT1ca7ghAXpaRkaHW8fMrLi4OZ8+ehZ2dHSpVqlQkbRBpApNjREREREREpBWuX7+Odu3a5Wuf8ePHY+nSpQgNDcWZM2fg6Oiosp5sCKRIJMr1ePr6+nBycspXDNnp6uqqTIDJylQlzj5WUlISDhw4AD09PbRu3Vqp5xzRp4zJMSIiIiIiIir14uLi8PTpU1SvXl1eZm1tjfT0dLx7907l8EUAmDt3Lg
Bpby0rK6scj//mzRsYGRnl2NtLJiMjA6mpqWrFbGBgoDIJZWRkpHLoZFJSEgCo7Nn2MdLS0nDgwAGkpaWhS5cuhX58Ik1jcoyIiIiIiIhKvdDQUACAl5eXvKxy5coAgEePHimUyyxYsACrVq3C0qVLMWHCBMydOxerVq1SefxHjx4prB6Zk9evX3/0nGPW1tZ48eIF0tLSFCblj4yMlG8vLOnp6Th48CDevn0LX19fWFpaFtqxiUoKJseIiIiIiIio1Lt+/ToAxeSYj48PAODy5ctKybHdu3fjhx9+wOzZszFy5EiEh4fjjz/+wOTJk1VO4H/lyhV8/vnnecZRGHOOubm5ITQ0FHfu3EGNGjUASHuk3bt3D7a2tvI51NLT05GQkAADA4MCrS6ZmZmJY8eO4fXr12jXrh3s7OzyfQyiT4FIyGltWCIiIiIiIqJSYtiwYThw4ACePXumUF69enVUr14dW7ZskZeFhISgadOm6NGjBzZs2AAAePHiBdzc3ODv76/UeywkJAR169bF0aNH0apVq6I/GQBHjx7Fo0ePUL16dZibmyMsLAyRkZHo1KkT7O3t5THv3bsXtWvXRt26dRX2v3nzJtLS0pCUlITbt2+jfPnysLGxAQBUq1YNenp6OH/+PG7evAlnZ2dUqFBBKYaKFSsW/YkSFQP2HCMiIiIiIqJSLzQ0VOXQycGDB2PatGlITk6GoaEhnj17hs6dO6NWrVpYuXKlvJ6DgwMGDx6MVatWKfUe27FjB5ydndGyZctiORcAaN68OUxMTBAeHo60tDRYWVmhffv28sRYXkJDQ5GQkCD/+fHjx3j8+DEAadJLT08PMTExAICIiAhEREQoHYPJMSot2HOMiIiIiIiItNbbt2/h5uaGX375BUOGDMn3/qmpqShfvjx++OEHjBkzpggiJKKixrVXiYiIiIiISGuZm5tj4sSJWLBgATIzM/O9/9q1ayGRSDBixIgiiI6IigN7jhERERERERERkdZizzEiIiIiIiIiItJaTI4REREREREREZHWYnKMiIiIiIiIiIi0FpNjRERERERERESktZgcIyIiIiIiIiIircXkGBEREREREVEW8fHxaN68OeLj4zUdChEVAybHiIiIiIiIiLKIj4/HqVOnmBwj0hJMjhERERERERERkdZicoyIiIiIiIiIiLQWk2NERERERERERKS1mBwjIiIiIiIiysLMzAxNmjSBmZmZpkMhomIgEgRB0HQQRERERERERCVJQkICTExMNB0GERUD9hwjIiIiIiIiyub+/fuaDoGIigmTY0REREREREREpLWYHCMiIiIiIiLKpnz58poOgYiKCZNjRERERERERNkkJiZqOgQiKiZMjhERERERERFlExUVpekQiKiYMDlGRERERERERERaSyQIgqDpIIiIiIiIiIhKEkEQIBKJNB0GERUD9hwjIiIiIiIiyubOnTuaDoGIigmTY0RERERERETZpKWlaToEIiomTI4RERERERERZWNmZqbpEIiomHDOMSIiIiIiIqJskpKSYGRkpOkwiKgYsOcYERERERERUTZhYWGaDoGIigmTY0REREREREREpLWYHCMiIiIiIiLKxtnZWdMhEFExYXKMiIiIiIiIKJvU1FRNh0BExYTJMSIiIiIiIqJsXr9+rekQiKiYMDlGRERERERERERaSyQIgqDpIIiIiIiIiIhKkoyMDOjo6Gg6DCIqBuw5RkRERERERJRNeHi4pkMgomLC5BgRERERERFRNikpKZoOgYiKCZNjRERERERERNmYmppqOgQiKiacc4yIiIiIiIgom5SUFBgYGGg6DCIqBuw5RkRERERERJTN3bt3NR0CERUTXU0HQERERERERPQpSX4v4OJLDsKiomVrJEIVG5Gmw9AKTI4RERERERERZePk5JTjttU3BHxzPLMYoyFtpCsCbgwUobK1jqZDKfU4rJKIiIiIiIgom4yMDNXlmQIWXGJijIpeugA8iU
rQdBhagckxIiIiIiIiomxevnypsnzPfQER74o5GNJar1690nQIWoHJMSIiIiIiIiI1zQ/OhJjTQBGVKpxzjIiIiIi
|
|||
|
|
"text/plain": [
|
|||
|
|
"<Figure size 800x750 with 3 Axes>"
|
|||
|
|
]
|
|||
|
|
},
|
|||
|
|
"metadata": {},
|
|||
|
|
"output_type": "display_data"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"source": [
|
|||
|
|
"# Show the individual prediction for the lowest predicted instance\n",
|
|||
|
|
"lowest_pred_index = np.argmin(shap_values.values[:, 0]) \n",
|
|||
|
|
"\n",
|
|||
|
|
"# Use waterfall plot for a single instance\n",
|
|||
|
|
"shap.plots.waterfall(shap_values[lowest_pred_index], max_display=20)"
|
|||
|
|
]
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"metadata": {
|
|||
|
|
"kernelspec": {
|
|||
|
|
"display_name": "venv",
|
|||
|
|
"language": "python",
|
|||
|
|
"name": "python3"
|
|||
|
|
},
|
|||
|
|
"language_info": {
|
|||
|
|
"codemirror_mode": {
|
|||
|
|
"name": "ipython",
|
|||
|
|
"version": 3
|
|||
|
|
},
|
|||
|
|
"file_extension": ".py",
|
|||
|
|
"mimetype": "text/x-python",
|
|||
|
|
"name": "python",
|
|||
|
|
"nbconvert_exporter": "python",
|
|||
|
|
"pygments_lexer": "ipython3",
|
|||
|
|
"version": "3.12.3"
|
|||
|
|
}
|
|||
|
|
},
|
|||
|
|
"nbformat": 4,
|
|||
|
|
"nbformat_minor": 5
|
|||
|
|
}
|