From f85adbde931ab7d6ea3c7f6909f69a37eb87d32c Mon Sep 17 00:00:00 2001 From: Pablo Martin Date: Mon, 30 Oct 2023 18:59:39 +0100 Subject: [PATCH 01/10] More thingies --- code_thingies/dbtlearn/dbt_project.yml | 2 ++ code_thingies/dbtlearn/models/dim/dim_hosts_cleansed.sql | 6 ++++++ code_thingies/dbtlearn/models/dim/dim_listings_cleansed.sql | 6 ++++++ 3 files changed, 14 insertions(+) diff --git a/code_thingies/dbtlearn/dbt_project.yml b/code_thingies/dbtlearn/dbt_project.yml index 3c2a10f..b8a4515 100644 --- a/code_thingies/dbtlearn/dbt_project.yml +++ b/code_thingies/dbtlearn/dbt_project.yml @@ -30,5 +30,7 @@ clean-targets: # directories to be removed by `dbt clean` models: dbtlearn: +materialized: view # Default way to materialize is view + src: + +materialized: ephemeral dim: +materialized: table diff --git a/code_thingies/dbtlearn/models/dim/dim_hosts_cleansed.sql b/code_thingies/dbtlearn/models/dim/dim_hosts_cleansed.sql index c489d1b..debe188 100644 --- a/code_thingies/dbtlearn/models/dim/dim_hosts_cleansed.sql +++ b/code_thingies/dbtlearn/models/dim/dim_hosts_cleansed.sql @@ -1,3 +1,9 @@ +{{ + config( + materialized = 'view' + ) +}} + WITH src_hosts AS( SELECT * FROM {{ ref('src_hosts') }} diff --git a/code_thingies/dbtlearn/models/dim/dim_listings_cleansed.sql b/code_thingies/dbtlearn/models/dim/dim_listings_cleansed.sql index 6e5de55..c3371aa 100644 --- a/code_thingies/dbtlearn/models/dim/dim_listings_cleansed.sql +++ b/code_thingies/dbtlearn/models/dim/dim_listings_cleansed.sql @@ -1,3 +1,9 @@ +{{ + config( + materialized = 'view' + ) +}} + WITH src_listings AS ( SELECT * FROM From 2b6c385b8cfad0adfc03a689cc554690061ae6ff Mon Sep 17 00:00:00 2001 From: Pablo Martin Date: Tue, 31 Oct 2023 17:00:58 +0100 Subject: [PATCH 02/10] Add csv --- .../dbtlearn/seeds/seed_full_moon_dates.csv | 273 ++++++++++++++++++ notes/sections1-7.md | 10 +- 2 files changed, 282 insertions(+), 1 deletion(-) create mode 100644 code_thingies/dbtlearn/seeds/seed_full_moon_dates.csv 
diff --git a/code_thingies/dbtlearn/seeds/seed_full_moon_dates.csv b/code_thingies/dbtlearn/seeds/seed_full_moon_dates.csv new file mode 100644 index 0000000..c9d373e --- /dev/null +++ b/code_thingies/dbtlearn/seeds/seed_full_moon_dates.csv @@ -0,0 +1,273 @@ +full_moon_date +2009-01-11 +2009-02-09 +2009-03-11 +2009-04-09 +2009-05-09 +2009-06-07 +2009-07-07 +2009-08-06 +2009-09-04 +2009-10-04 +2009-11-02 +2009-12-02 +2009-12-31 +2010-01-30 +2010-02-28 +2010-03-30 +2010-04-28 +2010-05-28 +2010-06-26 +2010-07-26 +2010-08-24 +2010-09-23 +2010-10-23 +2010-11-21 +2010-12-21 +2011-01-19 +2011-02-18 +2011-03-19 +2011-04-18 +2011-05-17 +2011-06-15 +2011-07-15 +2011-08-13 +2011-09-12 +2011-10-12 +2011-11-10 +2011-12-10 +2012-01-09 +2012-02-07 +2012-03-08 +2012-04-06 +2012-05-06 +2012-06-04 +2012-07-03 +2012-08-02 +2012-08-31 +2012-09-30 +2012-10-29 +2012-11-28 +2012-12-28 +2013-01-27 +2013-02-25 +2013-03-27 +2013-04-25 +2013-05-25 +2013-06-23 +2013-07-22 +2013-08-21 +2013-09-19 +2013-10-19 +2013-11-17 +2013-12-17 +2014-01-16 +2014-02-15 +2014-03-16 +2014-04-15 +2014-05-14 +2014-06-13 +2014-07-12 +2014-08-10 +2014-09-09 +2014-10-08 +2014-11-06 +2014-12-06 +2015-01-05 +2015-02-04 +2015-03-05 +2015-04-04 +2015-05-04 +2015-06-02 +2015-07-02 +2015-07-31 +2015-08-29 +2015-09-28 +2015-10-27 +2015-11-25 +2015-12-25 +2016-01-24 +2016-02-22 +2016-03-23 +2016-04-22 +2016-05-21 +2016-06-20 +2016-07-20 +2016-08-18 +2016-09-16 +2016-10-16 +2016-11-14 +2016-12-14 +2017-01-12 +2017-02-11 +2017-03-12 +2017-04-11 +2017-05-10 +2017-06-09 +2017-07-09 +2017-08-07 +2017-09-06 +2017-10-05 +2017-11-04 +2017-12-03 +2018-01-02 +2018-01-31 +2018-03-02 +2018-03-31 +2018-04-30 +2018-05-29 +2018-06-28 +2018-07-27 +2018-08-26 +2018-09-25 +2018-10-24 +2018-11-23 +2018-12-22 +2019-01-21 +2019-02-19 +2019-03-21 +2019-04-19 +2019-05-18 +2019-06-17 +2019-07-16 +2019-08-15 +2019-09-14 +2019-10-13 +2019-11-12 +2019-12-12 +2020-01-10 +2020-02-09 +2020-03-09 +2020-04-08 +2020-05-07 +2020-06-05 +2020-07-05 
+2020-08-03 +2020-09-02 +2020-10-01 +2020-10-31 +2020-11-30 +2020-12-30 +2021-01-28 +2021-02-27 +2021-03-28 +2021-04-27 +2021-05-26 +2021-06-24 +2021-07-24 +2021-08-22 +2021-09-21 +2021-10-20 +2021-11-19 +2021-12-19 +2022-01-18 +2022-02-16 +2022-03-18 +2022-04-16 +2022-05-16 +2022-06-14 +2022-07-13 +2022-08-12 +2022-09-10 +2022-10-09 +2022-11-08 +2022-12-08 +2023-01-07 +2023-02-05 +2023-03-07 +2023-04-06 +2023-05-05 +2023-06-04 +2023-07-03 +2023-08-01 +2023-08-31 +2023-09-29 +2023-10-28 +2023-11-27 +2023-12-27 +2024-01-25 +2024-02-24 +2024-03-25 +2024-04-24 +2024-05-23 +2024-06-22 +2024-07-21 +2024-08-19 +2024-09-18 +2024-10-17 +2024-11-15 +2024-12-15 +2025-01-13 +2025-02-12 +2025-03-14 +2025-04-13 +2025-05-12 +2025-06-11 +2025-07-10 +2025-08-09 +2025-09-07 +2025-10-07 +2025-11-05 +2025-12-05 +2026-01-03 +2026-02-01 +2026-03-03 +2026-04-02 +2026-05-01 +2026-05-31 +2026-06-30 +2026-07-29 +2026-08-28 +2026-09-26 +2026-10-26 +2026-11-24 +2026-12-24 +2027-01-22 +2027-02-21 +2027-03-22 +2027-04-21 +2027-05-20 +2027-06-19 +2027-07-18 +2027-08-17 +2027-09-16 +2027-10-15 +2027-11-14 +2027-12-13 +2028-01-12 +2028-02-10 +2028-03-11 +2028-04-09 +2028-05-08 +2028-06-07 +2028-07-06 +2028-08-05 +2028-09-04 +2028-10-03 +2028-11-02 +2028-12-02 +2028-12-31 +2029-01-30 +2029-02-28 +2029-03-30 +2029-04-28 +2029-05-27 +2029-06-26 +2029-07-25 +2029-08-24 +2029-09-22 +2029-10-22 +2029-11-21 +2029-12-20 +2030-01-19 +2030-02-18 +2030-03-19 +2030-04-18 +2030-05-17 +2030-06-15 +2030-07-15 +2030-08-13 +2030-09-11 +2030-10-11 +2030-11-10 +2030-12-09 diff --git a/notes/sections1-7.md b/notes/sections1-7.md index 4e25c8a..22bc677 100644 --- a/notes/sections1-7.md +++ b/notes/sections1-7.md @@ -105,4 +105,12 @@ dbt makes sense nowadays because the modern data stack makes transformations wit - `dbt_project.yml`: header of the project, with stuff like versioning, the default profile for the project, the paths to different folders, etc. 
-This is a pic of the data flow we are going to build: ![img.png](../images/dataflow_overview.png) \ No newline at end of file +This is a pic of the data flow we are going to build: ![img.png](../images/dataflow_overview.png) + +## Sources and seeds + +Seeds are local files that you upload to a DWH from dbt. You place them as CSVs in the `seeds` folder. + + +Sources are an abstraction layer on top of the input tables. + From 7480222cc765e815212ca3f8eb0dd1d7a6d310ee Mon Sep 17 00:00:00 2001 From: Pablo Martin Date: Tue, 31 Oct 2023 17:22:51 +0100 Subject: [PATCH 03/10] Thingies --- .../models/mart/mart_fullmoon_reviews.sql | 27 +++++++++++++++++++ code_thingies/dbtlearn/models/sources.yml | 12 +++++++++ .../dbtlearn/models/src/src_hosts.sql | 2 +- .../dbtlearn/models/src/src_listings.sql | 2 +- .../dbtlearn/models/src/src_reviews.sql | 2 +- notes/8.md | 7 ++++- notes/sections1-7.md | 10 +------ 7 files changed, 49 insertions(+), 13 deletions(-) create mode 100644 code_thingies/dbtlearn/models/mart/mart_fullmoon_reviews.sql create mode 100644 code_thingies/dbtlearn/models/sources.yml diff --git a/code_thingies/dbtlearn/models/mart/mart_fullmoon_reviews.sql b/code_thingies/dbtlearn/models/mart/mart_fullmoon_reviews.sql new file mode 100644 index 0000000..199b808 --- /dev/null +++ b/code_thingies/dbtlearn/models/mart/mart_fullmoon_reviews.sql @@ -0,0 +1,27 @@ +{{ + config( + materialized = 'table' + ) +}} + +WITH fact_reviews AS ( + SELECT * + FROM + {{ ref('fact_reviews') }} +), +full_moon_dates AS ( + SELECT * + FROM + {{ ref('seed_full_moon_dates')}} +) + +SELECT + fr.*, + CASE + WHEN fm.full_moon_date IS NULL THEN 'not full moon' + ELSE 'full moon' + END AS is_full_moon +FROM + fact_reviews fr + LEFT JOIN full_moon_dates fm + ON (fr.review_date::date) = (fm.full_moon_date + interval '1' day) \ No newline at end of file diff --git a/code_thingies/dbtlearn/models/sources.yml b/code_thingies/dbtlearn/models/sources.yml new file mode 100644 index 0000000..0e2db1c --- 
/dev/null +++ b/code_thingies/dbtlearn/models/sources.yml @@ -0,0 +1,12 @@ +version: 2 + +sources: + - name: airbnb + schema: raw + tables: + - name: listings + identifier: raw_listings + - name: hosts + identifier: raw_hosts + - name: reviews + identifier: raw_reviews \ No newline at end of file diff --git a/code_thingies/dbtlearn/models/src/src_hosts.sql b/code_thingies/dbtlearn/models/src/src_hosts.sql index b33b25d..9d9d5b3 100644 --- a/code_thingies/dbtlearn/models/src/src_hosts.sql +++ b/code_thingies/dbtlearn/models/src/src_hosts.sql @@ -1,6 +1,6 @@ WITH raw_hosts AS ( SELECT * - FROM raw.raw_hosts + FROM {{ source ('airbnb', 'hosts')}} ) SELECT id as host_id, diff --git a/code_thingies/dbtlearn/models/src/src_listings.sql b/code_thingies/dbtlearn/models/src/src_listings.sql index c68838c..4c09b3e 100644 --- a/code_thingies/dbtlearn/models/src/src_listings.sql +++ b/code_thingies/dbtlearn/models/src/src_listings.sql @@ -1,6 +1,6 @@ WITH raw_listings AS ( SELECT * - FROM raw.raw_listings + FROM {{ source ('airbnb', 'listings')}} ) SELECT id AS listing_id, diff --git a/code_thingies/dbtlearn/models/src/src_reviews.sql b/code_thingies/dbtlearn/models/src/src_reviews.sql index 59d8167..987faeb 100644 --- a/code_thingies/dbtlearn/models/src/src_reviews.sql +++ b/code_thingies/dbtlearn/models/src/src_reviews.sql @@ -1,6 +1,6 @@ WITH raw_reviews AS ( SELECT * - FROM raw.raw_reviews + FROM {{ source ('airbnb', 'reviews')}} ) SELECT listing_id, diff --git a/notes/8.md b/notes/8.md index 0d1c83f..22ed54d 100644 --- a/notes/8.md +++ b/notes/8.md @@ -42,4 +42,9 @@ WHERE Bear in mind that how to define the strategy to determine what should be loaded is up to the engineer. Any SQL can be placed within the `if is_incremental()` block. In the example above, we have a date field that easily signals what's the most recent date the table has currently seen. -## \ No newline at end of file +## Sources and seeds + +Seeds are local files that you upload to a DWH from dbt. 
You place them as CSVs in the `seeds` folder. + + +Sources are an abstraction layer on top of the input tables. They are not strictly necessary, but can help make the project more structured. To create sources, you create a `sources.yml` file and place it in the `models` dir. \ No newline at end of file diff --git a/notes/sections1-7.md b/notes/sections1-7.md index 22bc677..4e25c8a 100644 --- a/notes/sections1-7.md +++ b/notes/sections1-7.md @@ -105,12 +105,4 @@ dbt makes sense nowadays because the modern data stack makes transformations wit - `dbt_project.yml`: header of the project, with stuff like versioning, the default profile for the project, the paths to different folders, etc. -This is a pic of the data flow we are going to build: ![img.png](../images/dataflow_overview.png) - -## Sources and seeds - -Seeds are local files that you upload to a DWH from dbt. You place them as CSVs in the `seeds` folder. - - -Sources are an abstraction layer on top of the input tables. - +This is a pic of the data flow we are going to build: ![img.png](../images/dataflow_overview.png) \ No newline at end of file From 1e53b3895cf57e9e1237c9c9ee3b20ec3ca9e48a Mon Sep 17 00:00:00 2001 From: Pablo Martin Date: Thu, 2 Nov 2023 17:05:44 +0100 Subject: [PATCH 04/10] Thingies --- .../dbtlearn/snapshots/scd_raw_listings.sql | 17 +++++++++++ notes/8.md | 28 ++++++++++++++++++- 2 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 code_thingies/dbtlearn/snapshots/scd_raw_listings.sql diff --git a/code_thingies/dbtlearn/snapshots/scd_raw_listings.sql b/code_thingies/dbtlearn/snapshots/scd_raw_listings.sql new file mode 100644 index 0000000..4108a21 --- /dev/null +++ b/code_thingies/dbtlearn/snapshots/scd_raw_listings.sql @@ -0,0 +1,17 @@ +{% snapshot scd_raw_listings %} + +{{ + config( + target_schema = 'dev', + unique_key = 'id', + strategy = 'timestamp', + updated_at = 'updated_at', + invalidate_hard_deletes = True + ) +}} + +SELECT * +FROM + {{ source('airbnb', 
'listings')}} + +{% endsnapshot %} diff --git a/notes/8.md b/notes/8.md index 22ed54d..f96b728 100644 --- a/notes/8.md +++ b/notes/8.md @@ -47,4 +47,30 @@ Bear in mind that how to define the strategy to determine what should be loaded Seeds are local files that you upload to a DWH from dbt. You place them as CSVs in the `seeds` folder. -Sources are an abstraction layer on top of the input tables. They are not strictly necessary, but can help make the project more structured. To create sources, you create a `sources.yml` file and place it in the `models` dir. \ No newline at end of file +Sources are an abstraction layer on top of the input tables. They are not strictly necessary, but can help make the project more structured. To create sources, you create a `sources.yml` file and place it in the `models` dir. Here, you can reference models created in the `models` dir to mark them as sources. You can reference sources in other models like this: + +```python +{{ source('domain_name', 'source_name')}} +``` + +Sources can define _freshness_ constraints that will provide warnings or errors when there is a significant delay. + + +## Snapshots + +Snapshots are a way to build SCD2s. There are two strategies to get this done: + - Timestamp: all records have a unique key and an `updated_at` field. dbt will consider a new record is necessary in the SCD2 whenever the `updated_at` field increases. + - Check: dbt will monitor a set of columns and consider any changes in any of the columns as a new version of the record. + +Snapshots get defined with a sql file in the `snapshots` folder using the `snapshot` macro block. + +Once snapshots are defined, "snapshooting" can be triggered at any time by running `dbt snapshot`. dbt will create the SCD tables in the defined schema and play the `valid_from`, `valid_to` game whenever changes are detected. + +## Tests + +There are two kinds of tests: + +- Singular tests: you make any `SELECT` statement you want. 
If the `SELECT` statement is run and any data is found, the test is considered failed. If the statement is run and no rows are returned, the test is considered passed. +- Built-in test: just a bunch of typical stuff: uniqueness, nullability, enum validations and relationship (referential integrity) + +You can also define your own custom generic tests. \ No newline at end of file From bccbff7205622291276b7920859e1c991076cf2f Mon Sep 17 00:00:00 2001 From: Pablo Martin Date: Thu, 2 Nov 2023 17:45:20 +0100 Subject: [PATCH 05/10] Thingies --- code_thingies/dbtlearn/models/schema.yml | 27 +++++++++++++++++++ .../dbtlearn/tests/consistent_created_at.sql | 9 +++++++ .../tests/dim_listings_minimum_nights.sql | 5 ++++ 3 files changed, 41 insertions(+) create mode 100644 code_thingies/dbtlearn/models/schema.yml create mode 100644 code_thingies/dbtlearn/tests/consistent_created_at.sql create mode 100644 code_thingies/dbtlearn/tests/dim_listings_minimum_nights.sql diff --git a/code_thingies/dbtlearn/models/schema.yml b/code_thingies/dbtlearn/models/schema.yml new file mode 100644 index 0000000..b1f9258 --- /dev/null +++ b/code_thingies/dbtlearn/models/schema.yml @@ -0,0 +1,27 @@ +version: 2 + +models: + - name: dim_listings_cleansed + columns: + - name: listing_id + tests: + - unique + - not_null + + - name: host_id + tests: + - not_null + - relationships: + to: ref('dim_hosts_cleansed') + field: host_id + + - name: room_type + tests: + - accepted_values: + values: [ + 'Entire home/apt', + 'Private room', + 'Shared room', + 'Hotel room' + ] + diff --git a/code_thingies/dbtlearn/tests/consistent_created_at.sql b/code_thingies/dbtlearn/tests/consistent_created_at.sql new file mode 100644 index 0000000..2fb6893 --- /dev/null +++ b/code_thingies/dbtlearn/tests/consistent_created_at.sql @@ -0,0 +1,9 @@ +SELECT * +FROM + {{ ref('fact_reviews') }} fr +LEFT JOIN + {{ ref('dim_listings_cleansed') }} dl +ON + fr.listing_id = dl.listing_id +WHERE + fr.review_date < dl.created_at \ No 
newline at end of file diff --git a/code_thingies/dbtlearn/tests/dim_listings_minimum_nights.sql b/code_thingies/dbtlearn/tests/dim_listings_minimum_nights.sql new file mode 100644 index 0000000..8dfaa20 --- /dev/null +++ b/code_thingies/dbtlearn/tests/dim_listings_minimum_nights.sql @@ -0,0 +1,5 @@ +SELECT * +FROM + {{ ref('dim_listings_cleansed') }} +WHERE + mininum_nights < 1 From c2f2739e7ea51433051efac65ee84849e0c22264 Mon Sep 17 00:00:00 2001 From: Pablo Martin Date: Thu, 2 Nov 2023 18:13:56 +0100 Subject: [PATCH 06/10] Thingies --- code_thingies/dbtlearn/macros/no_nulls_in_columns.sql | 10 ++++++++++ .../dbtlearn/tests/no_nulls_in_dim_listings.sql | 1 + notes/8.md | 9 ++++++++- 3 files changed, 19 insertions(+), 1 deletion(-) create mode 100644 code_thingies/dbtlearn/macros/no_nulls_in_columns.sql create mode 100644 code_thingies/dbtlearn/tests/no_nulls_in_dim_listings.sql diff --git a/code_thingies/dbtlearn/macros/no_nulls_in_columns.sql b/code_thingies/dbtlearn/macros/no_nulls_in_columns.sql new file mode 100644 index 0000000..40af5d4 --- /dev/null +++ b/code_thingies/dbtlearn/macros/no_nulls_in_columns.sql @@ -0,0 +1,10 @@ +{% macro no_nulls_in_columns(model) %} + SELECT * + FROM + {{ model }} + WHERE + {% for col in adapter.get_columns_in_relation(model) -%} + {{col.column}} IS NULL OR + {% endfor %} + FALSE +{% endmacro %} \ No newline at end of file diff --git a/code_thingies/dbtlearn/tests/no_nulls_in_dim_listings.sql b/code_thingies/dbtlearn/tests/no_nulls_in_dim_listings.sql new file mode 100644 index 0000000..e2a6fb5 --- /dev/null +++ b/code_thingies/dbtlearn/tests/no_nulls_in_dim_listings.sql @@ -0,0 +1 @@ +{{ no_nulls_in_columns(ref('dim_listings_cleansed')) }} \ No newline at end of file diff --git a/notes/8.md b/notes/8.md index f96b728..bd7e45a 100644 --- a/notes/8.md +++ b/notes/8.md @@ -73,4 +73,11 @@ There are two kinds of tests: - Singular tests: you make any `SELECT` statement you want. 
If the `SELECT` statement is run and any data is found, the test is considered failed. If the statement is run and no rows are returned, the test is considered passed. - Built-in test: just a bunch of typical stuff: uniqueness, nullability, enum validations and relationship (referential integrity) -You can also define your own custom generic tests. \ No newline at end of file +You can also define your own custom generic tests. + + +## Macros + +- Macros are jinja templates. +- There are many built-in macros in dbt, but you can also use your own macros. +- dbt packages exist and you can use them to have more tests and macros that you can use. \ No newline at end of file From 045ce1ec45bc5521e10b859932beaf087ca8933d Mon Sep 17 00:00:00 2001 From: Pablo Martin Date: Thu, 2 Nov 2023 18:24:30 +0100 Subject: [PATCH 07/10] Fix typo in col name, move test to generic test --- code_thingies/dbtlearn/macros/positive_value.sql | 8 ++++++++ .../dbtlearn/models/dim/dim_listings_cleansed.sql | 2 +- .../dbtlearn/models/dim/dim_listings_with_hosts.sql | 2 +- code_thingies/dbtlearn/models/schema.yml | 4 ++++ .../dbtlearn/tests/dim_listings_minimum_nights.sql | 5 ----- 5 files changed, 14 insertions(+), 7 deletions(-) create mode 100644 code_thingies/dbtlearn/macros/positive_value.sql delete mode 100644 code_thingies/dbtlearn/tests/dim_listings_minimum_nights.sql diff --git a/code_thingies/dbtlearn/macros/positive_value.sql b/code_thingies/dbtlearn/macros/positive_value.sql new file mode 100644 index 0000000..148a7bc --- /dev/null +++ b/code_thingies/dbtlearn/macros/positive_value.sql @@ -0,0 +1,8 @@ +{% test positive_value(model, column_name) %} +SELECT + * +FROM + {{ model }} +WHERE + {{ column_name }} < 1 +{% endtest %} \ No newline at end of file diff --git a/code_thingies/dbtlearn/models/dim/dim_listings_cleansed.sql b/code_thingies/dbtlearn/models/dim/dim_listings_cleansed.sql index c3371aa..de68ced 100644 --- a/code_thingies/dbtlearn/models/dim/dim_listings_cleansed.sql +++ 
b/code_thingies/dbtlearn/models/dim/dim_listings_cleansed.sql @@ -16,7 +16,7 @@ SELECT CASE WHEN minimum_nights = 0 THEN 1 ELSE minimum_nights - END AS mininum_nights, + END AS minimum_nights, host_id, REPLACE(price_str,'$','')::money AS price, created_at, diff --git a/code_thingies/dbtlearn/models/dim/dim_listings_with_hosts.sql b/code_thingies/dbtlearn/models/dim/dim_listings_with_hosts.sql index ee05cc2..6ccf1f0 100644 --- a/code_thingies/dbtlearn/models/dim/dim_listings_with_hosts.sql +++ b/code_thingies/dbtlearn/models/dim/dim_listings_with_hosts.sql @@ -13,7 +13,7 @@ SELECT listings.listing_id, listings.listing_name, listings.room_type, - listings.mininum_nights, + listings.minimum_nights, listings.price, listings.host_id, hosts.host_name, diff --git a/code_thingies/dbtlearn/models/schema.yml b/code_thingies/dbtlearn/models/schema.yml index b1f9258..cefd2d4 100644 --- a/code_thingies/dbtlearn/models/schema.yml +++ b/code_thingies/dbtlearn/models/schema.yml @@ -25,3 +25,7 @@ models: 'Hotel room' ] + - name: minimum_nights + tests: + - positive_value + diff --git a/code_thingies/dbtlearn/tests/dim_listings_minimum_nights.sql b/code_thingies/dbtlearn/tests/dim_listings_minimum_nights.sql deleted file mode 100644 index 8dfaa20..0000000 --- a/code_thingies/dbtlearn/tests/dim_listings_minimum_nights.sql +++ /dev/null @@ -1,5 +0,0 @@ -SELECT * -FROM - {{ ref('dim_listings_cleansed') }} -WHERE - mininum_nights < 1 From 3fce12819bcc798ada1782c2c613db61e719a2ca Mon Sep 17 00:00:00 2001 From: Pablo Martin Date: Fri, 3 Nov 2023 10:26:11 +0100 Subject: [PATCH 08/10] Thingies --- .../dbtlearn/models/fact/fact_reviews.sql | 4 ++- .../dbtlearn/models/src/src_reviews.sql | 1 + code_thingies/dbtlearn/packages.yml | 3 ++ notes/8.md | 29 ++++++++++++++++++- 4 files changed, 35 insertions(+), 2 deletions(-) create mode 100644 code_thingies/dbtlearn/packages.yml diff --git a/code_thingies/dbtlearn/models/fact/fact_reviews.sql b/code_thingies/dbtlearn/models/fact/fact_reviews.sql 
index 8a707fa..9fd3dc8 100644 --- a/code_thingies/dbtlearn/models/fact/fact_reviews.sql +++ b/code_thingies/dbtlearn/models/fact/fact_reviews.sql @@ -9,7 +9,9 @@ WITH src_reviews AS ( FROM {{ ref('src_reviews') }} ) -SELECT * +SELECT + {{ dbt_utils.surrogate_key(['listing_id', 'review_date', 'reviewer_name', 'review_text']) }} as review_id, + * FROM src_reviews WHERE diff --git a/code_thingies/dbtlearn/models/src/src_reviews.sql b/code_thingies/dbtlearn/models/src/src_reviews.sql index 987faeb..2a99151 100644 --- a/code_thingies/dbtlearn/models/src/src_reviews.sql +++ b/code_thingies/dbtlearn/models/src/src_reviews.sql @@ -5,6 +5,7 @@ WITH raw_reviews AS ( SELECT listing_id, date AS review_date, + reviewer_name AS reviewer_name, comments AS review_text, sentiment AS review_sentiment FROM diff --git a/code_thingies/dbtlearn/packages.yml b/code_thingies/dbtlearn/packages.yml new file mode 100644 index 0000000..9e74ff0 --- /dev/null +++ b/code_thingies/dbtlearn/packages.yml @@ -0,0 +1,3 @@ +packages: + - package: dbt-labs/dbt_utils + version: 0.8.0 \ No newline at end of file diff --git a/notes/8.md b/notes/8.md index bd7e45a..33003e5 100644 --- a/notes/8.md +++ b/notes/8.md @@ -80,4 +80,31 @@ You can also define your own custom generic tests. - Macros are jinja templates. - There are many built-in macros in dbt, but you can also use your own macros. -- dbt packages exist and you can use them to have more tests and macros that you can use. \ No newline at end of file +- dbt packages exist and you can use them to have more tests and macros that you can use. + + +## Documentation + +- Documentation is kept in the repo (yay) +- Documentation can be defined in yaml files or in standalone markdown files. For example, the landing page can be customized with an `overview.md` file. +- Documentation can be quick-served with dbt, but ideally you should compile it and serve it with a regular web server, like Nginx. 
+- + + + + + + + + + + + + + + + + + + + From fb4534db58edb2616911851509176349b4b723c7 Mon Sep 17 00:00:00 2001 From: Pablo Martin Date: Mon, 27 Nov 2023 17:58:10 +0100 Subject: [PATCH 09/10] Alternative to AWS download --- code_thingies/database/README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/code_thingies/database/README.md b/code_thingies/database/README.md index 4dba9cd..f767fff 100644 --- a/code_thingies/database/README.md +++ b/code_thingies/database/README.md @@ -70,6 +70,12 @@ After, you will have to download some CSV files with the data to populate the da aws s3 cp s3://dbtlearn/listings.csv listings.csv aws s3 cp s3://dbtlearn/reviews.csv reviews.csv aws s3 cp s3://dbtlearn/hosts.csv hosts.csv + +# or, to avoid using aws + +wget http://dbtlearn.s3.amazonaws.com/listings.csv +wget http://dbtlearn.s3.amazonaws.com/reviews.csv +wget http://dbtlearn.s3.amazonaws.com/hosts.csv ``` How to put the data into the databases is up to you. I've done it successfully using the import functionality of DBeaver. From cc644be0bca44854e17cecec416038cd20027156 Mon Sep 17 00:00:00 2001 From: Pablo Martin Date: Tue, 28 Nov 2023 18:06:41 +0100 Subject: [PATCH 10/10] Few small upgrades --- code_thingies/dbtlearn/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/code_thingies/dbtlearn/README.md b/code_thingies/dbtlearn/README.md index d470985..0ab7c8e 100644 --- a/code_thingies/dbtlearn/README.md +++ b/code_thingies/dbtlearn/README.md @@ -4,6 +4,8 @@ This is the dbt project for the course. ## Set up +Make a venv and install the requirements listed in `requirements.txt`. + You need to place a profile for the local postgres instance in `~/.dbt/profiles.yaml`. See below a sample config that should be a good starting point if you follow the instructions in the `database` dir of this project. 
```yaml @@ -25,6 +27,8 @@ dbtlearn: Once you have set this up and the database as well, you can run `dbt debug` to ensure everything is set up correctly and dbt can reach the database. +To install the required dbt packages, run `dbt deps`. + You should also delete the lines under the `dbtlearn` key in the `dbt_project.yml` file. Also delete the contents of the `models` folder.