{ "cells": [ { "cell_type": "markdown", "id": "84dcd475", "metadata": {}, "source": [ "# DDRA - Pablo's EDA\n", "\n", "General exploration to understand what might drive claims.\n", "\n", "A handful of ideas I want to give a first shot to:\n", "- Time features\n", " - Length of stay\n", " - Check In and Check out dates\n", " - Lead time between creation and check out\n", " - checkin as week/month/year cycle, checkout as week/month/year cycle\n", " - Duration between starting GJ and booking checkin\n", "- Same country, same town features\n", "- Tokenize listing names and correlate them\n", " - And specifically get bedrooms with regex\n", "- Number of active listings of host\n", "- Number of bookings created by host in last 12 months (and monthly/per listing average?)\n", "- Number of bookings cancelled on the host in last 12 months (and monthly/per listing average?)\n", "- Number of claims created by host in last 12 months (and monthly/per listing average?)\n", "- Number of claims with positive settlement by host in last 12 months (and monthly/per listing average?)\n", "- Total invoiced to host in last 12 months (and monthly/per listing average?)\n", "- Guest age\n", "- Paid for waiver\n", "- Paid for CIH\n", "\n" ] }, { "cell_type": "code", "execution_count": 1, "id": "12368ce1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "š Testing connection using credentials at: /home/pablo/.superhog-dwh/credentials.yml\n", "ā Connection successful.\n" ] } ], "source": [ "import sys\n", "import os\n", "sys.path.append(os.path.abspath(\"../../utils\")) # Adjust path if needed\n", "\n", "from dwh_utils import read_credentials, create_postgres_engine, query_to_dataframe, test_connection\n", "\n", "# --- Connect to DWH ---\n", "creds = read_credentials()\n", "dwh_pg_engine = create_postgres_engine(creds)\n", "\n", "# --- Test Query ---\n", "test_connection()" ] }, { "cell_type": "code", "execution_count": 29, "id": "385c350b", "metadata": {}, 
"outputs": [], "source": [ "# Other imports\n", "import pandas as pd\n", "import numpy as np\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.decomposition import TruncatedSVD\n", "import string" ] }, { "cell_type": "markdown", "id": "78cbf43d", "metadata": {}, "source": [ "# Getting data" ] }, { "cell_type": "code", "execution_count": null, "id": "ba87ff74", "metadata": {}, "outputs": [], "source": [ "# Get all bookings and their claims\n", "df_bookings_and_claims = query_to_dataframe(\n", " engine=dwh_pg_engine, \n", " query=\"\"\"\n", " select \n", " -- Unique ID --\n", " ibs.id_booking,\n", " -- Target (Boolean) --\n", " ibs.has_resolution_incident,\n", " -- Various features -- \n", " ibs.booking_created_date_utc, \n", " ibs.booking_check_in_date_utc,\n", " ibs.booking_check_out_date_utc,\n", " vr.link_used_date_utc as guest_journey_started_date_utc,\n", "\n", " -- Other --\n", " ibs.id_accommodation\n", "\n", "from intermediate.int_booking_summary ibs\n", "left join intermediate.int_core__verification_requests vr\n", " on ibs.id_verification_request = vr.id_verification_request\n", "where \n", " -- 1. Bookings from New Dash users with Id Deal\n", " ibs.is_user_in_new_dash = True and \n", " ibs.is_missing_id_deal = False and\n", " -- 2. Protected Bookings with a Protection or a Deposit Management service\n", " (ibs.has_protection_service_business_type or \n", " ibs.has_deposit_management_service_business_type) and\n", " -- 3. Bookings with flagging categorisation (this excludes Cancelled/Incomplete/Rejected bookings)\n", " ibs.is_booking_flagged_as_risk is not null and \n", " -- 4. 
Booking is completed\n", " ibs.is_booking_past_completion_date = True\n", " \"\"\"\n", ")\n", "\n", "# Get listing details\n", "\n", "df_listing_details = query_to_dataframe(\n", " engine=dwh_pg_engine,\n", " query=\"\"\"\n", " select\n", " a.id_accommodation,\n", " a.friendly_name\n", " from intermediate.int_core__accommodation a\n", " where a.id_accommodation in (\n", " select distinct id_accommodation\n", " from intermediate.int_booking_summary ibs\n", " where \n", " -- 1. Bookings from New Dash users with Id Deal\n", " ibs.is_user_in_new_dash = True and \n", " ibs.is_missing_id_deal = False and\n", " -- 2. Protected Bookings with a Protection or a Deposit Management service\n", " (ibs.has_protection_service_business_type or \n", " ibs.has_deposit_management_service_business_type) and\n", " -- 3. Bookings with flagging categorisation (this excludes Cancelled/Incomplete/Rejected bookings)\n", " ibs.is_booking_flagged_as_risk is not null and \n", " -- 4. Booking is completed\n", " ibs.is_booking_past_completion_date = True)\n", " \"\"\"\n", ")\n", "\n", "# Get last 12 months host KPIs \n", "\n", "# Get guest data\n", "\n", "# Get host data\n", "\n", "# Get guest journey sales\n" ] }, { "cell_type": "code", "execution_count": 3, "id": "9848916e", "metadata": {}, "outputs": [ { "data": { "application/vnd.microsoft.datawrangler.viewer.v0+json": { "columns": [ { "name": "index", "rawType": "int64", "type": "integer" }, { "name": "id_booking", "rawType": "int64", "type": "integer" }, { "name": "has_resolution_incident", "rawType": "bool", "type": "boolean" }, { "name": "booking_created_date_utc", "rawType": "object", "type": "unknown" }, { "name": "booking_check_in_date_utc", "rawType": "object", "type": "unknown" }, { "name": "booking_check_out_date_utc", "rawType": "object", "type": "unknown" }, { "name": "guest_journey_started_date_utc", "rawType": "object", "type": "unknown" } ], "ref": "b9c31069-7ef4-4cf3-8954-d283b42f6e64", "rows": [ [ "0", "975057", "False", 
"2024-12-04", "2024-12-05", "2025-03-31", null ], [ "1", "975059", "False", "2024-12-04", "2024-12-06", "2024-12-08", "2024-12-07" ], [ "2", "975060", "False", "2024-12-04", "2025-01-26", "2025-01-29", "2024-12-07" ], [ "3", "975061", "False", "2024-12-04", "2024-12-15", "2025-03-15", null ], [ "4", "975062", "False", "2024-12-04", "2024-12-11", "2025-03-10", "2024-12-10" ], [ "5", "975063", "False", "2024-12-04", "2024-12-10", "2025-03-11", "2024-12-04" ], [ "6", "975065", "False", "2024-12-04", "2024-12-05", "2024-12-10", "2024-12-04" ], [ "7", "975066", "False", "2024-12-04", "2024-12-20", "2024-12-23", "2024-12-04" ], [ "8", "975067", "False", "2024-12-04", "2025-01-14", "2025-01-18", "2025-01-03" ], [ "9", "975068", "False", "2024-12-04", "2025-05-20", "2025-05-23", null ], [ "10", "975070", "False", "2024-12-04", "2025-01-25", "2025-01-27", "2024-12-04" ], [ "11", "975071", "False", "2024-12-04", "2025-01-27", "2025-01-30", "2024-12-04" ], [ "12", "982700", "False", "2024-12-11", "2024-12-13", "2024-12-14", "2024-12-11" ], [ "13", "984590", "False", "2024-12-12", "2025-02-05", "2025-02-10", "2024-12-12" ], [ "14", "985483", "False", "2024-12-13", "2024-12-25", "2024-12-29", "2024-12-14" ], [ "15", "986720", "False", "2024-12-14", "2025-01-23", "2025-01-25", "2024-12-14" ], [ "16", "987812", "False", "2024-12-15", "2025-02-10", "2025-02-15", "2024-12-15" ], [ "17", "989579", "False", "2024-12-17", "2024-12-23", "2024-12-26", "2024-12-17" ], [ "18", "989580", "False", "2024-12-17", "2024-12-20", "2024-12-22", "2024-12-17" ], [ "19", "989581", "True", "2024-12-17", "2024-12-31", "2025-01-02", "2024-12-24" ], [ "20", "989582", "False", "2024-12-17", "2024-12-23", "2024-12-26", "2024-12-17" ], [ "21", "990071", "False", "2024-12-17", "2024-12-31", "2025-01-02", "2024-12-19" ], [ "22", "990606", "False", "2024-12-17", "2024-12-18", "2024-12-22", "2024-12-17" ], [ "23", "991162", "False", "2024-12-18", "2024-12-29", "2024-12-31", "2024-12-19" ], [ "24", "991894", 
"False", "2024-12-18", "2025-02-28", "2025-03-03", "2024-12-18" ], [ "25", "993698", "False", "2024-12-20", "2024-12-30", "2024-12-31", "2024-12-20" ], [ "26", "994300", "True", "2024-12-20", "2025-01-18", "2025-01-20", "2024-12-20" ], [ "27", "994888", "False", "2024-12-21", "2024-12-23", "2024-12-24", "2024-12-21" ], [ "28", "994974", "False", "2024-12-21", "2024-12-30", "2025-01-02", "2024-12-21" ], [ "29", "995617", "False", "2024-12-22", "2024-12-28", "2024-12-30", "2024-12-22" ], [ "30", "995692", "True", "2024-12-22", "2024-12-30", "2025-01-02", "2024-12-22" ], [ "31", "996081", "False", "2024-12-22", "2025-01-27", "2025-02-02", "2024-12-29" ], [ "32", "996092", "False", "2024-12-22", "2025-01-30", "2025-02-04", "2024-12-22" ], [ "33", "996397", "False", "2024-12-22", "2025-01-11", "2025-01-15", "2024-12-22" ], [ "34", "997018", "False", "2024-12-23", "2025-02-15", "2025-02-21", "2024-12-24" ], [ "35", "997710", "False", "2024-12-24", "2025-01-09", "2025-01-13", "2024-12-24" ], [ "36", "997777", "False", "2024-12-24", "2024-12-23", "2024-12-26", "2024-12-24" ], [ "37", "998900", "False", "2024-12-25", "2025-01-02", "2025-01-05", "2024-12-27" ], [ "38", "998926", "False", "2024-12-25", "2024-12-26", "2024-12-31", "2024-12-25" ], [ "39", "999495", "False", "2024-12-25", "2024-12-27", "2024-12-28", "2024-12-25" ], [ "40", "999663", "False", "2024-12-26", "2024-12-26", "2024-12-30", "2024-12-26" ], [ "41", "1000059", "False", "2024-12-26", "2024-12-27", "2024-12-30", "2024-12-27" ], [ "42", "1000743", "False", "2024-12-27", "2025-03-22", "2025-03-29", "2024-12-27" ], [ "43", "1000745", "False", "2024-12-27", "2024-12-27", "2024-12-29", "2024-12-27" ], [ "44", "1000746", "False", "2024-12-27", "2024-12-29", "2025-01-02", "2024-12-27" ], [ "45", "1000808", "False", "2024-12-27", "2024-12-27", "2024-12-29", null ], [ "46", "1000809", "False", "2024-12-27", "2025-02-06", "2025-02-07", "2024-12-28" ], [ "47", "1000883", "False", "2024-12-27", "2025-01-01", 
"2025-01-05", "2024-12-27" ], [ "48", "1000951", "True", "2024-12-27", "2025-01-09", "2025-01-15", "2024-12-27" ], [ "49", "1001807", "False", "2024-12-27", "2024-12-27", "2024-12-28", "2024-12-27" ] ], "shape": { "columns": 6, "rows": 20280 } }, "text/html": [ "
| \n", " | id_booking | \n", "has_resolution_incident | \n", "booking_created_date_utc | \n", "booking_check_in_date_utc | \n", "booking_check_out_date_utc | \n", "guest_journey_started_date_utc | \n", "
|---|---|---|---|---|---|---|
| 0 | \n", "975057 | \n", "False | \n", "2024-12-04 | \n", "2024-12-05 | \n", "2025-03-31 | \n", "None | \n", "
| 1 | \n", "975059 | \n", "False | \n", "2024-12-04 | \n", "2024-12-06 | \n", "2024-12-08 | \n", "2024-12-07 | \n", "
| 2 | \n", "975060 | \n", "False | \n", "2024-12-04 | \n", "2025-01-26 | \n", "2025-01-29 | \n", "2024-12-07 | \n", "
| 3 | \n", "975061 | \n", "False | \n", "2024-12-04 | \n", "2024-12-15 | \n", "2025-03-15 | \n", "None | \n", "
| 4 | \n", "975062 | \n", "False | \n", "2024-12-04 | \n", "2024-12-11 | \n", "2025-03-10 | \n", "2024-12-10 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 20275 | \n", "931096 | \n", "False | \n", "2024-10-31 | \n", "2024-11-08 | \n", "2024-11-13 | \n", "None | \n", "
| 20276 | \n", "931086 | \n", "False | \n", "2024-10-31 | \n", "2024-11-15 | \n", "2024-11-18 | \n", "None | \n", "
| 20277 | \n", "931082 | \n", "False | \n", "2024-10-31 | \n", "2024-12-20 | \n", "2024-12-27 | \n", "None | \n", "
| 20278 | \n", "926634 | \n", "False | \n", "2024-10-27 | \n", "2025-02-13 | \n", "2025-02-16 | \n", "None | \n", "
| 20279 | \n", "919656 | \n", "False | \n", "2024-10-21 | \n", "2025-01-16 | \n", "2025-01-20 | \n", "None | \n", "
20280 rows × 6 columns
\n", "| \n", " | id_accommodation | \n", "friendly_name | \n", "
|---|---|---|
| 0 | \n", "10368 | \n", "Maddox St | \n", "
| 1 | \n", "11059 | \n", "HIL-1 | \n", "
| 2 | \n", "14345 | \n", "SUS-2 | \n", "
| 3 | \n", "277469 | \n", "4000 sqft Lakefront Retreat | Private Hot Tub #NH | \n", "
| 4 | \n", "28561 | \n", "LAN-3 | \n", "
| ... | \n", "... | \n", "... | \n", "
| 3627 | \n", "197269 | \n", "Luxury Glamping | Hot Tub, Firepit & Grill | \n", "
| 3628 | \n", "198130 | \n", "Brick Haven House: 10min Walk to Shakespeare F... | \n", "
| 3629 | \n", "205403 | \n", "NO FEES! Pool+Hot Tub/Volley&Bocce Ball+Firepit | \n", "
| 3630 | \n", "263762 | \n", "Brasada Ranch | Hot Tub | Guest Casita | 5 Bed | \n", "
| 3631 | \n", "267589 | \n", "10% Off July 6-10 • Creek • 3 Dogs • Fenced Yard | \n", "
3632 rows × 2 columns
\n", "| \n", " | id_booking | \n", "has_resolution_incident | \n", "booking_created_date_utc | \n", "booking_check_in_date_utc | \n", "booking_check_out_date_utc | \n", "guest_journey_started_date_utc | \n", "id_accommodation | \n", "
|---|---|---|---|---|---|---|---|
| 0 | \n", "975057 | \n", "False | \n", "2024-12-04 | \n", "2024-12-05 | \n", "2025-03-31 | \n", "None | \n", "196871 | \n", "
| 1 | \n", "975059 | \n", "False | \n", "2024-12-04 | \n", "2024-12-06 | \n", "2024-12-08 | \n", "2024-12-07 | \n", "196875 | \n", "
| 2 | \n", "975060 | \n", "False | \n", "2024-12-04 | \n", "2025-01-26 | \n", "2025-01-29 | \n", "2024-12-07 | \n", "196876 | \n", "
| 3 | \n", "975061 | \n", "False | \n", "2024-12-04 | \n", "2024-12-15 | \n", "2025-03-15 | \n", "None | \n", "196872 | \n", "
| 4 | \n", "975062 | \n", "False | \n", "2024-12-04 | \n", "2024-12-11 | \n", "2025-03-10 | \n", "2024-12-10 | \n", "196875 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 20275 | \n", "931096 | \n", "False | \n", "2024-10-31 | \n", "2024-11-08 | \n", "2024-11-13 | \n", "None | \n", "187560 | \n", "
| 20276 | \n", "931086 | \n", "False | \n", "2024-10-31 | \n", "2024-11-15 | \n", "2024-11-18 | \n", "None | \n", "187585 | \n", "
| 20277 | \n", "931082 | \n", "False | \n", "2024-10-31 | \n", "2024-12-20 | \n", "2024-12-27 | \n", "None | \n", "187585 | \n", "
| 20278 | \n", "926634 | \n", "False | \n", "2024-10-27 | \n", "2025-02-13 | \n", "2025-02-16 | \n", "None | \n", "185004 | \n", "
| 20279 | \n", "919656 | \n", "False | \n", "2024-10-21 | \n", "2025-01-16 | \n", "2025-01-20 | \n", "None | \n", "185004 | \n", "
20280 rows × 7 columns
\n", "| \n", " | id_accommodation | \n", "friendly_name | \n", "clean_name | \n", "char_count | \n", "word_count | \n", "unique_word_count | \n", "tfidf_svd_0 | \n", "tfidf_svd_1 | \n", "tfidf_svd_2 | \n", "tfidf_svd_3 | \n", "... | \n", "tfidf_svd_20 | \n", "tfidf_svd_21 | \n", "tfidf_svd_22 | \n", "tfidf_svd_23 | \n", "tfidf_svd_24 | \n", "tfidf_svd_25 | \n", "tfidf_svd_26 | \n", "tfidf_svd_27 | \n", "tfidf_svd_28 | \n", "tfidf_svd_29 | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "10368 | \n", "Maddox St | \n", "maddox st | \n", "9 | \n", "2 | \n", "2 | \n", "0.010855 | \n", "-0.029763 | \n", "-0.021051 | \n", "0.009316 | \n", "... | \n", "0.004607 | \n", "-0.010669 | \n", "0.022564 | \n", "0.009577 | \n", "-0.004964 | \n", "0.002377 | \n", "0.008182 | \n", "0.020271 | \n", "0.011362 | \n", "0.014186 | \n", "
| 1 | \n", "11059 | \n", "HIL-1 | \n", "hil1 | \n", "4 | \n", "1 | \n", "1 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "... | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "
| 2 | \n", "14345 | \n", "SUS-2 | \n", "sus2 | \n", "4 | \n", "1 | \n", "1 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "... | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "
| 3 | \n", "277469 | \n", "4000 sqft Lakefront Retreat | Private Hot Tub #NH | \n", "4000 sqft lakefront retreat private hot tub nh | \n", "47 | \n", "8 | \n", "8 | \n", "0.118079 | \n", "-0.331729 | \n", "-0.036329 | \n", "-0.111210 | \n", "... | \n", "-0.036141 | \n", "-0.052405 | \n", "0.093227 | \n", "0.019269 | \n", "-0.006774 | \n", "0.070027 | \n", "0.102554 | \n", "-0.007378 | \n", "0.041281 | \n", "0.015410 | \n", "
| 4 | \n", "28561 | \n", "LAN-3 | \n", "lan3 | \n", "4 | \n", "1 | \n", "1 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "... | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 3627 | \n", "197269 | \n", "Luxury Glamping | Hot Tub, Firepit & Grill | \n", "luxury glamping hot tub firepit grill | \n", "39 | \n", "6 | \n", "6 | \n", "0.086114 | \n", "-0.296208 | \n", "-0.030307 | \n", "-0.076856 | \n", "... | \n", "0.100707 | \n", "0.100131 | \n", "-0.025985 | \n", "0.049267 | \n", "0.009551 | \n", "0.043639 | \n", "0.076095 | \n", "-0.009465 | \n", "0.057498 | \n", "0.044279 | \n", "
| 3628 | \n", "198130 | \n", "Brick Haven House: 10min Walk to Shakespeare F... | \n", "brick haven house 10min walk to shakespeare fest | \n", "48 | \n", "8 | \n", "8 | \n", "0.152979 | \n", "-0.182920 | \n", "0.248066 | \n", "0.178298 | \n", "... | \n", "0.018835 | \n", "0.034150 | \n", "-0.033340 | \n", "0.000108 | \n", "-0.001094 | \n", "0.028110 | \n", "0.066771 | \n", "-0.008312 | \n", "0.007832 | \n", "0.015454 | \n", "
| 3629 | \n", "205403 | \n", "NO FEES! Pool+Hot Tub/Volley&Bocce Ball+Firepit | \n", "no fees poolhot tubvolleybocce ballfirepit | \n", "42 | \n", "5 | \n", "5 | \n", "0.005290 | \n", "-0.007561 | \n", "0.000197 | \n", "-0.004329 | \n", "... | \n", "0.004708 | \n", "-0.005517 | \n", "0.004417 | \n", "-0.006661 | \n", "-0.006363 | \n", "-0.003132 | \n", "-0.005854 | \n", "0.012305 | \n", "-0.005357 | \n", "-0.006805 | \n", "
| 3630 | \n", "263762 | \n", "Brasada Ranch | Hot Tub | Guest Casita | 5 Bed | \n", "brasada ranch hot tub guest casita 5 bed | \n", "43 | \n", "8 | \n", "8 | \n", "0.076370 | \n", "-0.220398 | \n", "-0.006391 | \n", "-0.042772 | \n", "... | \n", "0.033248 | \n", "0.035303 | \n", "0.025965 | \n", "0.063088 | \n", "0.014874 | \n", "0.003055 | \n", "-0.013777 | \n", "-0.012340 | \n", "-0.018223 | \n", "0.016823 | \n", "
| 3631 | \n", "267589 | \n", "10% Off July 6-10 • Creek • 3 Dogs • Fenced Yard | \n", "10 off july 610 • creek • 3 dogs • fenced yard | \n", "46 | \n", "12 | \n", "10 | \n", "0.015270 | \n", "-0.046531 | \n", "0.000051 | \n", "-0.017550 | \n", "... | \n", "-0.074968 | \n", "-0.004176 | \n", "-0.051906 | \n", "-0.114376 | \n", "-0.010725 | \n", "-0.071384 | \n", "0.044820 | \n", "-0.035996 | \n", "-0.038547 | \n", "0.000560 | \n", "
3632 rows × 36 columns
\n", "