From 426f0fbb0d05c0c00499783a3f16f3500024b709 Mon Sep 17 00:00:00 2001 From: Pablo Martin Date: Thu, 18 Jan 2024 17:25:41 +0100 Subject: [PATCH] many things --- .vscode/settings.json | 12 ++- README.md | 73 ++++++++++++++++--- dbt_project.yml | 2 +- .../int_core_deal_id_master_list.sql | 4 + .../int_core_unified_user.sql} | 0 models/reporting/core_deal_id_master_list.sql | 3 + .../rpg_core_deal_id_master_list.sql | 3 - models/{sources.yml => sync/sync_core.yml} | 0 .../working/wkg_core_deal_id_master_list.sql | 4 - profiles.yml.example | 23 ++++++ 10 files changed, 103 insertions(+), 21 deletions(-) create mode 100644 models/intermediate/int_core_deal_id_master_list.sql rename models/{working/wkg_core_unified_user.sql => intermediate/int_core_unified_user.sql} (100%) create mode 100644 models/reporting/core_deal_id_master_list.sql delete mode 100644 models/reporting/rpg_core_deal_id_master_list.sql rename models/{sources.yml => sync/sync_core.yml} (100%) delete mode 100644 models/working/wkg_core_deal_id_master_list.sql create mode 100644 profiles.yml.example diff --git a/.vscode/settings.json b/.vscode/settings.json index 930d475..d74be45 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,5 +1,13 @@ { - "sql-formatter.uppercase": true, "dbt.queryLimit": 500, - "dbt.enableNewQueryPanel": true + "dbt.enableNewQueryPanel": true, + "emeraldwalk.runonsave": { + "commands": [ + { + "match": ".*\\.sql(\\.jinja)?", + "isAsync": true, + "cmd": "sqlfmt ${file}" + } + ] + } } \ No newline at end of file diff --git a/README.md b/README.md index b9017dc..fcada0b 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,67 @@ -Welcome to your new dbt project! +# DWH dbt -### Using the starter project +Welcome to Superhog's DWH dbt project. Here we model the entire DWH. -Try running the following commands: +## How to set up your environment -- dbt run -- dbt test +- Pre-requisites + - You need a Linux environment. That can be Linux, macOS or WSL. 
+ - You need to install Python `>=3.10` and `poetry`. + - All docs will assume you are using VSCode. +- Prepare SSH tunnels + - We currently use SSH tunnels to reach both the `dev` and `prd` instances. You can ask Pablo how to set these up. + - You will need to activate the tunnels in order to run the dbt models on the databases. It will probably pay off to make them easy to activate in your terminal; for example, you can make an alias. +- Set up + - Create an entry for this project in the `profiles.yml` file at `~/.dbt/profiles.yml`. You have a suggested template at `profiles.yml.example`. + - Make sure that the `profiles.yml` host and port settings are consistent with the tunnels. + - Use `poetry install` to get dependencies in place. +- Check + - Ensure you are running in the project venv, either by setting the VSCode Python interpreter to the one created by `poetry`, or by running `poetry shell` in the console when in the root dir. + - Turn on your tunnel to `dev` and run `dbt debug`. If it runs well, you are all set. If it fails, there's something wrong with your setup. Grab the terminal output and pull the thread. +- Complements + - If you are in VSCode, you most probably want to have this extension installed: [dbt Power User](https://marketplace.visualstudio.com/items?itemName=innoverio.vscode-dbt-power-user) + - It is advised to use [this autoformatter](https://sqlfmt.com/) and to automatically [run it on save](https://docs.sqlfmt.com/integrations/vs-code). 
-### Resources +## Branching strategy -- Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) -- Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers -- Join the [chat](https://community.getdbt.com/) on Slack for live discussions and support -- Find [dbt events](https://events.getdbt.com) near you -- Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices +This repo follows a trunk-based development philosophy (). + +Open a feature branch (`feature/your-branch-name`) for any changes and make it short-lived. It's fine and encouraged to build incrementally towards a `mart` level table with multiple PRs as long as you keep the model buildable along the way. + +## Project organization + +We organize models in four folders: + +- `sync` + - Dedicated to sources. + - One `.yml` per `sync` schema. + - No SQL models go here. +- `staging` + - Pretty much this: + - All models go prefixed with `stg_`. + - Avoid `SELECT *`. We don't know what dirty stuff can come from the `sync` schemas. +- `intermediate` + - Pretty much this: + - It's strictly forbidden to expose tables here to end users. + - Make an effort to practice DRY. +- `reporting` + - Pretty much this: + - For now, we follow a monolithic approach and just have one `reporting` schema. When this becomes insufficient, we will consider splitting into several schemas. + - Make an effort to keep this layer stable like you would do with a library's API so that downstream dependencies don't break unexpectedly. + +## Conventions + +- Always use CTEs in your models to `source` and `ref` other models. +- We follow [snake case](https://en.wikipedia.org/wiki/Snake_case). +- Identifier columns should begin with `id_`, not finish with `_id`. +- Use binary question-like column names for binary, bool, and flag columns (e.g. 
not `active` but `is_active`, not `verified` but `has_been_verified`, not `imported` but `was_imported`). +- Datetime columns should either finish in `_utc` or `_local`. If they finish in `_local`, the table should contain a `local_timezone` column that contains the [timezone identifier](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones). +- We work with many currencies and lack a single main one. Hence, any money fields will be ambiguous on their own. To address this, any table that has money-related columns should also have a column named `currency`. We currently have no policy for tables where a single record has columns in different currencies. If you face this, assemble the data team and decide on something. + +## Stuff that we haven't done but we would like to + +- Automate formatting with git pre-commit. +- Define conventions on testing (and enforce them). +- Define conventions on documentation (and enforce them). +- Replace SSH tunneling with WireGuard VPN access. +- Prepare a quick way to replicate parts of the `prd` dwh on our local machines. 
diff --git a/dbt_project.yml b/dbt_project.yml index e235877..71bd1d1 100644 --- a/dbt_project.yml +++ b/dbt_project.yml @@ -33,7 +33,7 @@ clean-targets: # directories to be removed by `dbt clean` models: dwh_dbt: staging: - +materialized: table + +materialized: view +schema: staging working: +materialized: view diff --git a/models/intermediate/int_core_deal_id_master_list.sql b/models/intermediate/int_core_deal_id_master_list.sql new file mode 100644 index 0000000..9732842 --- /dev/null +++ b/models/intermediate/int_core_deal_id_master_list.sql @@ -0,0 +1,4 @@ +with int_core_unified_user as (select * from {{ ref("int_core_unified_user") }}) +select id_deal, count(1) as users_with_this_id_deal +from int_core_unified_user +group by id_deal diff --git a/models/working/wkg_core_unified_user.sql b/models/intermediate/int_core_unified_user.sql similarity index 100% rename from models/working/wkg_core_unified_user.sql rename to models/intermediate/int_core_unified_user.sql diff --git a/models/reporting/core_deal_id_master_list.sql b/models/reporting/core_deal_id_master_list.sql new file mode 100644 index 0000000..668a415 --- /dev/null +++ b/models/reporting/core_deal_id_master_list.sql @@ -0,0 +1,3 @@ +with int_core_deal_id_master_list as (select * from {{ ref("int_core_deal_id_master_list") }}) +select * +from int_core_deal_id_master_list diff --git a/models/reporting/rpg_core_deal_id_master_list.sql b/models/reporting/rpg_core_deal_id_master_list.sql deleted file mode 100644 index c8d9628..0000000 --- a/models/reporting/rpg_core_deal_id_master_list.sql +++ /dev/null @@ -1,3 +0,0 @@ -with wkg_core_deal_id_master_list as (select * from {{ ref("wkg_core_deal_id_master_list") }}) -select * -from wkg_core_deal_id_master_list diff --git a/models/sources.yml b/models/sync/sync_core.yml similarity index 100% rename from models/sources.yml rename to models/sync/sync_core.yml diff --git a/models/working/wkg_core_deal_id_master_list.sql 
b/models/working/wkg_core_deal_id_master_list.sql deleted file mode 100644 index b8147e8..0000000 --- a/models/working/wkg_core_deal_id_master_list.sql +++ /dev/null @@ -1,4 +0,0 @@ -with wkg_core_unified_user as (select * from {{ ref("wkg_core_unified_user") }}) -select id_deal, count(1) as users_with_this_id_deal -from wkg_core_unified_user -group by id_deal diff --git a/profiles.yml.example b/profiles.yml.example new file mode 100644 index 0000000..087342c --- /dev/null +++ b/profiles.yml.example @@ -0,0 +1,23 @@ +dwh_dbt: + outputs: + dev: + dbname: dwh + host: localhost + user: + pass: + port: + schema: working + threads: 4 + type: postgres + + prd: + dbname: dwh + host: localhost + user: + pass: + port: + schema: working + threads: 4 + type: postgres + + target: dev \ No newline at end of file