From 1e53b3895cf57e9e1237c9c9ee3b20ec3ca9e48a Mon Sep 17 00:00:00 2001 From: Pablo Martin Date: Thu, 2 Nov 2023 17:05:44 +0100 Subject: [PATCH] Thingies --- .../dbtlearn/snapshots/scd_raw_listings.sql | 17 +++++++++++ notes/8.md | 28 ++++++++++++++++++- 2 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 code_thingies/dbtlearn/snapshots/scd_raw_listings.sql diff --git a/code_thingies/dbtlearn/snapshots/scd_raw_listings.sql b/code_thingies/dbtlearn/snapshots/scd_raw_listings.sql new file mode 100644 index 0000000..4108a21 --- /dev/null +++ b/code_thingies/dbtlearn/snapshots/scd_raw_listings.sql @@ -0,0 +1,17 @@ +{% snapshot scd_raw_listings %} +/* SCD snapshot of source('airbnb', 'listings'), stored in the 'dev' schema: rows are keyed by id and versioned with the timestamp strategy on updated_at; invalidate_hard_deletes is on (presumably closes out rows removed from the source; confirm in the dbt docs). */ +{{ + config( + target_schema = 'dev', + unique_key = 'id', + strategy = 'timestamp', + updated_at = 'updated_at', + invalidate_hard_deletes = True + ) +}} + +SELECT * +FROM + {{ source('airbnb', 'listings')}} + +{% endsnapshot %} diff --git a/notes/8.md b/notes/8.md index 22ed54d..f96b728 100644 --- a/notes/8.md +++ b/notes/8.md @@ -47,4 +47,30 @@ Bear in mind that how to define the strategy to determine what should be loaded Seeds are local files that you upload to a DWH from dbt. You place them as CSVs in the `seeds` folder. -Sources are an abstraction layer on top of the input tables. They are not strictly necessary, but can help make the project more structured. To create sources, you create a `sources.yml` file and place it in the `models` dir. \ No newline at end of file +Sources are an abstraction layer on top of the input tables. They are not strictly necessary, but can help make the project more structured. To create sources, you create a `sources.yml` file and place it in the `models` dir. Here, you can reference models created in the `models` dir to mark them as sources. 
You can reference sources in other models like this: + +```jinja +{{ source('domain_name', 'source_name')}} +``` + +Sources can define _freshness_ constraints that will provide warnings or errors when there is a significant delay. + + +## Snapshots + +Snapshots are a way to build SCD2s. There are two strategies to get this done: + - Timestamp: all records have a unique key and an `updated_at` field. dbt will consider that a new record is necessary in the SCD2 whenever the `updated_at` field increases. + - Check: dbt will monitor a set of columns and consider any changes in any of the columns as a new version of the record. + +Snapshots get defined with a SQL file in the `snapshots` folder using the `snapshot` macro block. + +Once snapshots are defined, "snapshotting" can be triggered at any time by running `dbt snapshot`. dbt will create the SCD tables in the defined schema and play the `valid_from`, `valid_to` game whenever changes are detected. + +## Tests + +There are two kinds of tests: + +- Singular tests: you write any `SELECT` statement you want. If the `SELECT` statement is run and any data is found, the test is considered failed. If the statement is run and no rows are returned, the test is considered passed. +- Built-in tests: just a bunch of typical stuff: uniqueness, nullability, enum validations and relationships (referential integrity). + +You can also define your own custom generic tests. \ No newline at end of file