diff --git a/dbt_project.yml b/dbt_project.yml index f6d8a00..13ba36f 100644 --- a/dbt_project.yml +++ b/dbt_project.yml @@ -47,4 +47,6 @@ seeds: vars: "dbt_date:time_zone": "Europe/London" - "start_date": "'2020-01-01'" + # A general cutoff date for relevancy. Many models assume this to be the point + # in time after which they should work. + "start_date": "'2020-01-01'" \ No newline at end of file diff --git a/models/intermediate/cross/int_daily_currency_exchange_rates.sql b/models/intermediate/cross/int_daily_currency_exchange_rates.sql new file mode 100644 index 0000000..bb5eb88 --- /dev/null +++ b/models/intermediate/cross/int_daily_currency_exchange_rates.sql @@ -0,0 +1,147 @@ +/* +How deep into the future since the last available real rate should rates be +projected. Must follow be a valid postgres interval string, such as '1 years' +See examples here: +https://www.postgresql.org/docs/current/datatype-datetime.html#DATATYPE-INTERVAL-INPUT-EXAMPLES +*/ +{% set interval_into_the_future = "'3 years'" %} + +with + stg_xedotcom__exchange_rates as ( + select * from {{ ref("stg_xedotcom__exchange_rates") }} + ), + -- CTEs for projecting forward + most_recent_xe_date_per_curr_pair as ( + select from_currency, to_currency, max(rate_date_utc) as most_recent_date_utc + from stg_xedotcom__exchange_rates + group by 1, 2 + ), + last_xe_rate_per_curr_pair as ( + select xe.from_currency, xe.to_currency, xe.rate + from stg_xedotcom__exchange_rates xe + inner join + most_recent_xe_date_per_curr_pair recent + on xe.from_currency = recent.from_currency + and xe.to_currency = recent.to_currency + and xe.rate_date_utc = recent.most_recent_date_utc + ), + future_dates as ( + select date_trunc('day', day_series)::date as future_date + from + generate_series + ( + ( + select max(most_recent_date_utc) + from most_recent_xe_date_per_curr_pair + )::timestamp + + interval '1 days', -- we don't want to pick the last real day + ( + select max(most_recent_date_utc) + from most_recent_xe_date_per_curr_pair + )::timestamp + + interval {{ interval_into_the_future }}, + '1 day'::interval + ) day_series + ), + projected_future_rates as ( + select + concat( + last_rate.from_currency, + last_rate.to_currency, + future_dates.future_date, + 'xe_inferred', + 'forecast' + ) as id_exchange_rate, + last_rate.from_currency, + last_rate.to_currency, + last_rate.rate, + future_dates.future_date as rate_date_utc, + 'xe_inferred' as source, + 'forecast' as rate_version, + now() as updated_at_utc + from future_dates future_dates + cross join last_xe_rate_per_curr_pair last_rate + ), + -- CTEs for projecting backward + oldest_xe_date_per_curr_pair as ( + select from_currency, to_currency, max(rate_date_utc) as oldest_date_utc + from stg_xedotcom__exchange_rates + group by 1, 2 + ), + first_xe_rate_per_curr_pair as ( + select xe.from_currency, xe.to_currency, xe.rate + from stg_xedotcom__exchange_rates xe + inner join + oldest_xe_date_per_curr_pair recent + on xe.from_currency = recent.from_currency + and xe.to_currency = recent.to_currency + and xe.rate_date_utc = recent.oldest_date_utc + ), + past_dates as ( + select date_trunc('day', day_series)::date as past_date + from + generate_series + ( + {{ var("start_date") }}::timestamp, + ( + select max(most_recent_date_utc) + from most_recent_xe_date_per_curr_pair + )::timestamp + - interval '2 days', -- we don't want to pick the last real day, + '1 day'::interval + ) day_series + ), + projected_past_rates as ( + select + concat( + first_rate.from_currency, + first_rate.to_currency, + past_dates.past_date, + 'xe_inferred', + 'guess' + ) as id_exchange_rate, + first_rate.from_currency, + first_rate.to_currency, + first_rate.rate, + past_dates.past_date as rate_date_utc, + 'xe_inferred' as source, + 'guess' as rate_version, + now() as updated_at_utc + from past_dates + cross join first_xe_rate_per_curr_pair first_rate + ) +-- here we union actual stuff, future projected stuff and past assumed stuff +select + concat( + xe.from_currency, xe.to_currency, xe.rate_date_utc, 'xedotcom', 'actual' + ) as id_exchange_rate, -- the id from staging is not useful here because dimensionality is different: here we have rate_source and rate_version + xe.from_currency, + xe.to_currency, + xe.rate, + xe.rate_date_utc, + 'xedotcom' as source, + 'actual' as rate_version, + xe.dwh_extracted_at_utc as updated_at_utc +from stg_xedotcom__exchange_rates xe +union all +select + fu.id_exchange_rate, + fu.from_currency, + fu.to_currency, + fu.rate, + fu.rate_date_utc, + fu.source, + fu.rate_version, + fu.updated_at_utc +from projected_future_rates fu +union all +select + pa.id_exchange_rate, + pa.from_currency, + pa.to_currency, + pa.rate, + pa.rate_date_utc, + pa.source, + pa.rate_version, + pa.updated_at_utc +from projected_past_rates pa diff --git a/models/intermediate/int_dates.sql b/models/intermediate/cross/int_dates.sql similarity index 100% rename from models/intermediate/int_dates.sql rename to models/intermediate/cross/int_dates.sql diff --git a/models/intermediate/int_dates_mtd.sql b/models/intermediate/cross/int_dates_mtd.sql similarity index 100% rename from models/intermediate/int_dates_mtd.sql rename to models/intermediate/cross/int_dates_mtd.sql diff --git a/models/intermediate/int_hardcoded_historical_currency_rates.sql b/models/intermediate/cross/int_hardcoded_historical_currency_rates.sql similarity index 79% rename from models/intermediate/int_hardcoded_historical_currency_rates.sql rename to models/intermediate/cross/int_hardcoded_historical_currency_rates.sql index 32d225e..8e6d81e 100644 --- a/models/intermediate/int_hardcoded_historical_currency_rates.sql +++ b/models/intermediate/cross/int_hardcoded_historical_currency_rates.sql @@ -4,4 +4,4 @@ with select d.date_day as rate_date, r.* from dates d cross join hardcoded_rates r -where d.date_day > cast('2020-01-01' as date) +where d.date_day > cast({{ var("start_date") }} as date) diff --git a/models/intermediate/cross/schema.yml b/models/intermediate/cross/schema.yml new file mode 100644 index 0000000..809bbcc --- /dev/null +++ b/models/intermediate/cross/schema.yml @@ -0,0 +1,86 @@ +models: + - name: int_daily_currency_exchange_rates + description: >- + This model holds a lot of data on currency exchange rates. The time + granularity is daily. Each record holds a currency pair for a specific + day, source and version. + + Actual rates are sourced from xe.com data. The `guessed` and `forecast` + versions are built by simply 'pushing' the first/last exchange rate on + record. Basically, wherever we dont' have data for a date, we pick the + closest actual data point that comes from xe.com. Bear in mind this means + that `forecast` version records will change on a daily basis as actual + data moves forwards, meaning you shouldn't assume your money amounts + converted in the future should always stay put. + + Note that, given the dimensionality, getting a simple time series for a + currency pair will require a bit of filtering. + + Reverse rates are explicit. This means that, for any given day and any + given currency pair, you will find two records with opposite from/to + positions. So, for 2024-01-01, you will find both a EUR->USD record and a + USD->EUR record with the opposite rate (1/rate). + columns: + - name: id_exchange_rate + data_type: text + description: A unique ID for the record, derived from concatenating the + currencies, date, source and version. Currency order is relevant + (EURUSD != USDEUR). + tests: + - not_null + - unique + - name: from_currency + data_type: character + description: The source currency, represented as an ISO 4217 code. + tests: + - not_null + - name: to_currency + data_type: character + description: The target currency, represented as an ISO 4217 code. + tests: + - not_null + - name: rate + data_type: numeric + description: >- + The exchange rate, represented as the units of the target currency + that one unit of source currency gets you. So, from_currency=USD to + to_currency=PLN with rate=4.2 should be read as '1 US Dollar buys me + 4.2 Polish Zlotys'. + + For same currency pairs (EUR to EUR, USD to USD, etc). The rate will + always be one. + + The rate can be smaller than one, but can't be negative. + tests: + - not_negative_or_zero + - not_null + - name: rate_date_utc + data_type: date + description: The date in which the rate record is relevant. + tests: + - not_null + - name: source + data_type: text + description: Where is the data coming from. Records that are composed from + making assumptions on real data will contain `_inferred`. + - name: rate_version + data_type: text + description: The version of the rate. This can be one of `actual` (the rate is a + reality fact), `forecast` (the rate sits in the future and is a guess + in nature) or `guess` (the rate sits in the past and is a guess in + nature). Note that one currency pair can have multiple rate versions + on the same date. + tests: + - accepted_values: + values: + - guess + - actual + - forecast + - not_null + - name: updated_at_utc + data_type: timestamp with time zone + description: For external sources, this will be the point in time when the + information was obtained from them. For stuff we make up here in the + DWH, this will be the point in time when we made the assumption. + tests: + - not_null