data-dwh-dbt-project/tests/kpis_global_metrics_outlier_detection.sql

/*
This test is applied in the reporting layer for Main KPIs,
specifically on reporting.mtd_aggregated_metrics.

It's supposed to run every day for the latest upate of KPIs.

There's chances that false positives are risen by these test. If at some
point it becomes too sensitive, just adapt the following parameters.

*/
-- Add here additive metrics that you would like to check
-- Recommended to exclude metrics that represent new products,
-- since there will be no history to check against. 
-- Do NOT include rates/percentages/ratios.
{% set metric_names = (
    "Cancelled Check Out Bookings",
    "Cancelled Created Bookings",
    "Check Out Bookings (Excl. Cancelled)",
    "Created Bookings (Excl. Cancelled)",
    "Damage Host-Waiver Payments",
    "Deposit Fees Revenue",
    "Est. Billable Bookings",
    "First Time Booked Deals",
    "First Time Booked Listings",
    "Guest Journey Completed",
    "Guest Journey Created",
    "Guest Journey Started",
    "Guest Journey with Payment",
    "Guest Revenue",
    "Host Resolutions Amount Paid",
    "Host Resolutions Payment Count",
    "Invoiced APIs Revenue",
    "Invoiced Athena Revenue",
    "Invoiced Old Dashboard Booking Fees Revenue",
    "Invoiced Total Booking Fees Revenue",
    "Invoiced E-Deposit Revenue",
    "Invoiced Listing Fees Revenue",
    "Invoiced Operator Revenue",
    "Invoiced Verification Fees Revenue",
    "New Deals",
    "New Listings",
    "Revenue Retained",
    "Revenue Retained Post-Resolutions",
    "Total Check Out Bookings",
    "Total Created Bookings",
    "Total Revenue",
    "Waiver Revenue",
    "Waiver Retained",
) %}

-- Specify here the day of the month that will start to be considered
-- for outlier detection. Keep in mind that 1st of every month is quite
-- unreliable thus false positives could appear. Recommended minimum 2.
{% set start_validating_on_this_day_month = 2 %}

-- Specify here the strength of the detector. A higher value
-- means that this test will allow for more variance to be accepted,
-- thus it will be more tolerant.
-- A lower value means that the chances of detecting outliers 
-- and false positives will be higher. Recommended around 10.
{% set detector_tolerance = 5 %}

-- Specify here the number of days in the past that will be used
-- to compare against. Keep in mind that we only keep the daily
-- information for the current month, thus having 180 days here
-- it means that we will take 1) all values of the current month
-- except the latest update and 2) the end of month figures for the 
-- past 6 months max.
{% set timeline_to_compare_against = 180 %}

with
    max_date as (
        select max(date) as max_date
        from {{ ref("mtd_aggregated_metrics") }}
        -- First days of the month is usually not estable to run
        where day >= {{ start_validating_on_this_day_month }}

    ),
    metric_data as (
        select
            date,
            metric,
            value,
            coalesce(abs(value), 0) / day as abs_daily_value,
            case when date = max_date then 1 else 0 end as is_max_date
        from {{ ref("mtd_aggregated_metrics") }}
        cross join max_date
        where
            day >= {{ start_validating_on_this_day_month }}
            and dimension = 'Global'
            and date between max_date -{{ timeline_to_compare_against }} and max_date
            and metric in {{ metric_names }}
    ),
    metrics_to_validate as (
        select date, metric, value, abs_daily_value
        from metric_data
        where is_max_date = 1
    ),
    metrics_to_compare_against as (
        select
            metric,
            avg(abs_daily_value) as avg_daily_value_previous_dates,
            stddev(abs_daily_value) as std_daily_value_previous_dates,
            greatest(
                avg(abs_daily_value)
                - {{ detector_tolerance }} * stddev(abs_daily_value),
                0
            ) as lower_bound,
            avg(abs_daily_value)
            + {{ detector_tolerance }} * stddev(abs_daily_value) as upper_bound
        from metric_data
        where is_max_date = 0
        group by 1
    ),
    metrics_comparison as (
        select
            mtv.date,
            mtv.metric,
            mtv.value,
            mtv.abs_daily_value,
            mtca.avg_daily_value_previous_dates,
            mtca.std_daily_value_previous_dates,
            mtca.lower_bound,
            mtca.upper_bound,
            case
                when
                    mtv.abs_daily_value >= mtca.lower_bound
                    and mtv.abs_daily_value <= mtca.upper_bound
                then true
                else false
            end as is_abs_daily_value_accepted,
            abs(mtv.abs_daily_value - mtca.avg_daily_value_previous_dates)
            / mtca.std_daily_value_previous_dates as signal_to_noise_factor
        from metrics_to_validate mtv
        inner join metrics_to_compare_against mtca on mtv.metric = mtca.metric
    )
select *
from metrics_comparison
where is_abs_daily_value_accepted = false
order by signal_to_noise_factor desc