From db0d97d2e88248e5fc111db06a066f685354d47c Mon Sep 17 00:00:00 2001 From: Pablo Martin Date: Wed, 2 Apr 2025 15:00:22 +0200 Subject: [PATCH 1/7] wip --- README.md | 6 +++++- ci/.azure-pipelines.pr.yml | 2 +- ci/README.md | 30 +++++++++++++++++++++++++++++- ci/build-master-artifacts.sh | 24 ++++++++++++++++++++++++ ci/ci-vm-setup.sh | 8 ++++++++ ci/ci.env | 10 ++++++++++ ci/ci.profiles.yml | 13 +++++++++++++ ci/docker-compose.yml | 35 +++++++++++++++++++++++++++++++++++ ci/postgres-initial-setup.sql | 33 +++++++++++++++++++++++++++++++++ 9 files changed, 158 insertions(+), 3 deletions(-) create mode 100644 ci/build-master-artifacts.sh create mode 100644 ci/ci-vm-setup.sh create mode 100644 ci/ci.env create mode 100644 ci/ci.profiles.yml create mode 100644 ci/docker-compose.yml create mode 100644 ci/postgres-initial-setup.sql diff --git a/README.md b/README.md index 408ca5b..54d3f71 100644 --- a/README.md +++ b/README.md @@ -136,7 +136,11 @@ This goes beyond the scope of this project: to understand how you can serve thes ## CI -TBD. +CI can be setup to review PRs and make the developer experience more solid and less error prone. + +You can find more details on the topic in the `ci` folder. + +Note that this is an optional part of the project: you can happily work without CI if needed. ## Stuff that we haven't done but we would like to diff --git a/ci/.azure-pipelines.pr.yml b/ci/.azure-pipelines.pr.yml index ab0fcce..c12223a 100644 --- a/ci/.azure-pipelines.pr.yml +++ b/ci/.azure-pipelines.pr.yml @@ -37,7 +37,7 @@ steps: displayName: 'Sync Foreign Data Wrappers schemas' - script: | - cd ~/dbt-ci + cd ci /bin/bash build-master-artifacts.sh displayName: 'Build master artifacts' diff --git a/ci/README.md b/ci/README.md index 1df486a..eb0e374 100644 --- a/ci/README.md +++ b/ci/README.md @@ -1,3 +1,31 @@ # CI -This folder contains things we use for Continuous Integration. \ No newline at end of file +You can setup CI pipelines for the project if you want. 
This enables performing certain checks in PRs and master commits, which is useful to minimize errors and ensure certain quality levels are met. + +The details here are specific to Azure Devops. If you need to set things up in a different Git/CI env, you'll have to adapt these instructions accordingly. + +## CI VM Setup + +### Requirements + +These instructions assume that: +- You have a VM ready to be set up as the CI server. +- You can SSH into it. +- The VM has Docker and Docker Compose installed and ready to run. +- The VM has `psql` installed. +- The VM has the Azure CI agent installed. +- That you have cloned this repository in the home folder of the user you use in that VM. +- The DWH production instance has a CI dedicated user that can read from all sync schemas as well as `staging`, `intermediate` and `reporting`, and you have the credentials. + +### Setting things up + +- Create a folder in the user home directory named `dbt-ci`. +- Create a copy of the `ci.env` file there naming it `.env` (`cp ci.env ~/dbt-ci/.env`) and fill it with values of your choice. +- Execute the script named `ci-vm-setup.sh` in this folder. This script will take care of most of the setup that need to be executed, including: + - Preparing the postgres database. + - Setting up the dockerized postgres with the right database, FDW, etc. + - Prepare the `profiles.yml` file. 
+ +### Connecting to Devops + +- TBD \ No newline at end of file diff --git a/ci/build-master-artifacts.sh b/ci/build-master-artifacts.sh new file mode 100644 index 0000000..0b39bab --- /dev/null +++ b/ci/build-master-artifacts.sh @@ -0,0 +1,24 @@ +#!/bin/bash + + +cd ~/data-dwh-dbt-project + +git checkout master +git pull + +rm -rf venv +python3 -m venv venv +source venv/bin/activate +pip install -r requirements.txt +dbt deps + +rm .env +cp ~/dbt-ci/.env .env +set -a && source .env && set +a + +rm -rf target/ + +dbt compile + +mkdir -p ~/dbt-ci/master-artifacts/ +cp target/manifest.json ~/dbt-ci/master-artifacts/manifest.json \ No newline at end of file diff --git a/ci/ci-vm-setup.sh b/ci/ci-vm-setup.sh new file mode 100644 index 0000000..997ce86 --- /dev/null +++ b/ci/ci-vm-setup.sh @@ -0,0 +1,8 @@ +# Start container +docker compose build -d + +# Run script to set things up in Postgres (DB, FDWs, etc) +envsubst < postgres-initial-setup.sql | psql -h $POSTGRES_HOST -U $POSTGRES_USER -d postgres + +# Copy profiles file +cp ci.profiles.yml ~/.dbt/profiles.yml diff --git a/ci/ci.env b/ci/ci.env new file mode 100644 index 0000000..6ad6370 --- /dev/null +++ b/ci/ci.env @@ -0,0 +1,10 @@ +POSTGRES_HOST=localhost +POSTGRES_USER=place a user here +PGPASSWORD=place a password here +POSTGRES_PORT=5432 +PRD_SCHEMAS_TO_SYNC="'sync_xero_superhog_limited','sync_xedotcom_currency_rates','sync_stripe_us','sync_stripe_uk','sync_hubspot','sync_guest_product','sync_default','sync_core','sync_cdb_screening','sync_cdb_screen_and_protect','sync_cdb_resolutions','sync_cdb_edeposit','sync_cdb_check_in_hero','sync_cdb_athena','staging','reporting','intermediate'" +PRD_CI_USER='ci_reader' +PRD_CI_PASSWORD= +PRD_HOST=the host +PRD_DB=the database +PRD_PORT=the port \ No newline at end of file diff --git a/ci/ci.profiles.yml b/ci/ci.profiles.yml new file mode 100644 index 0000000..06190b1 --- /dev/null +++ b/ci/ci.profiles.yml @@ -0,0 +1,13 @@ +dwh_dbt: + outputs: + prd-pointer: + dbname: 
prd-pointer + host: "{{ env_var('POSTGRES_HOST') }}" + port: "{{ env_var('POSTGRES_PORT') | as_number }}" + schema: public + user: "{{ env_var('POSTGRES_USER') }}" + pass: "{{ env_var('PGPASSWORD') }}" + type: postgres + threads: 1 + + target: prd-pointer \ No newline at end of file diff --git a/ci/docker-compose.yml b/ci/docker-compose.yml new file mode 100644 index 0000000..c1c14ec --- /dev/null +++ b/ci/docker-compose.yml @@ -0,0 +1,35 @@ +services: + postgres: + image: postgres:16 + container_name: postgres_db + environment: + POSTGRES_USER: ${POSTGRES_USER} + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} + POSTGRES_DB: postgres + ports: + - "5432:5432" + volumes: + - postgres_data:/var/lib/postgresql/data + # Note that some of the values below are very HW specific. You should + # absolutely adjust them to the available hardware where this will be + # running. This might help if you feel lost: + command: [ + "-c", "max_connections=XX", + "-c", "shared_buffers=XGB", + "-c", "effective_cache_size=XXXGB", + "-c", "maintenance_work_mem=XXXMB", + "-c", "checkpoint_completion_target=0.9", + "-c", "wal_buffers=XXXMB", + "-c", "default_statistics_target=XXX", + "-c", "random_page_cost=1.1", + "-c", "effective_io_concurrency=XXX", + "-c", "work_mem=XXXkB", + "-c", "huge_pages=off", + "-c", "min_wal_size=XXXGB", + "-c", "max_wal_size=XXXGB" + ] + restart: unless-stopped + +volumes: + postgres_data: + driver: local \ No newline at end of file diff --git a/ci/postgres-initial-setup.sql b/ci/postgres-initial-setup.sql new file mode 100644 index 0000000..dc3928c --- /dev/null +++ b/ci/postgres-initial-setup.sql @@ -0,0 +1,33 @@ +CREATE DATABASE prd-pointer; +\c prd-pointer + +CREATE EXTENSION postgres_fdw; + +CREATE SERVER dwh_prd +FOREIGN DATA WRAPPER postgres_fdw +OPTIONS (host '$PRD_HOST', dbname '$PRD_DB', port '$PRD_PORT'); + +CREATE USER MAPPING FOR current_user +SERVER dwh_prd +OPTIONS (user '$PRD_CI_USER', password '$PRD_CI_PASSWORD'); + +CREATE OR REPLACE FUNCTION 
refresh_foreign_schemas(schema_list TEXT[]) RETURNS void AS $$ +DECLARE + schema_name TEXT; +BEGIN + -- Loop through each schema in the provided list + FOREACH schema_name IN ARRAY schema_list LOOP + + -- Drop and recreate the schema to avoid conflicts + EXECUTE format('DROP SCHEMA IF EXISTS %I CASCADE', schema_name); + EXECUTE format('CREATE SCHEMA %I', schema_name); + + -- Import all tables from the foreign server + EXECUTE format( + 'IMPORT FOREIGN SCHEMA %I FROM SERVER dwh_prd INTO %I', + schema_name, schema_name + ); + + END LOOP; +END; +$$ LANGUAGE plpgsql; From 38ee399156d0c25f3a9ae3fc1aed90cbbc487ba9 Mon Sep 17 00:00:00 2001 From: Pablo Martin Date: Wed, 2 Apr 2025 15:54:41 +0200 Subject: [PATCH 2/7] more stuff in readme --- ci/README.md | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/ci/README.md b/ci/README.md index eb0e374..a4bcf90 100644 --- a/ci/README.md +++ b/ci/README.md @@ -17,8 +17,11 @@ These instructions assume that: - That you have cloned this repository in the home folder of the user you use in that VM. - The DWH production instance has a CI dedicated user that can read from all sync schemas as well as `staging`, `intermediate` and `reporting`, and you have the credentials. +If you don't have this, it probably means you need to review our Infrastructure repository where we describe how to set a VM up with all of this. + ### Setting things up +- SSH into the CI VM. - Create a folder in the user home directory named `dbt-ci`. - Create a copy of the `ci.env` file there naming it `.env` (`cp ci.env ~/dbt-ci/.env`) and fill it with values of your choice. - Execute the script named `ci-vm-setup.sh` in this folder. This script will take care of most of the setup that need to be executed, including: @@ -26,6 +29,13 @@ These instructions assume that: - Setting up the dockerized postgres with the right database, FDW, etc. - Prepare the `profiles.yml` file. 
-### Connecting to Devops +### Testing -- TBD \ No newline at end of file +- If the infra was set up correctly and you followed the previous steps, you should be ready to roll. +- You might want to activate pipeline executions in Devops if you had it off while preparing everything. +- Once that's done: + - Create a branch in this repository. + - Add some silly change to any dbt model. + - Open a PR in Devops from the branch. +- If everything is fine, you should see in Devops the pipeline getting triggered automatically and walking through all the steps described in `.azure-pipelines.pr.yml`. +- Once you make a commit to `master` or merge a PR to `master`, you should also see pipelines getting triggered automatically, following the steps in `.azure-pipelines.master.yml`. From 913f001465287fe1330966f3247a685c74a2ccb9 Mon Sep 17 00:00:00 2001 From: Pablo Martin Date: Wed, 2 Apr 2025 16:10:12 +0200 Subject: [PATCH 3/7] add file inventory --- ci/README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/ci/README.md b/ci/README.md index a4bcf90..534c42e 100644 --- a/ci/README.md +++ b/ci/README.md @@ -39,3 +39,17 @@ If you don't have this, it probably means you need to review our Infrastructure - Open a PR in Devops from the branch. - If everything is fine, you should see in Devops the pipeline getting triggered automatically and walking through all the steps described in `.azure-pipelines.pr.yml`. - Once you make a commit to `master` or merge a PR to `master`, you should also see pipelines getting triggered automatically, following the steps in `.azure-pipelines.master.yml`. + +### What the hell are these files + +A small inventory of the funky files here: +- `ci-vm-setup.sh`: executes some set up steps that are needed the first time you prepare the CI VM. +- `ci.env`: template for the `.env` file that needs to be placed in the CI VM. +- `ci.profiles.yml`: template for the dbt `profiles.yml` file that needs to be placed in the CI VM. 
+- `ci-requirements.txt`: CI specific Python packages that need to be installed in CI runs (but not for running or developing on this project). +- `docker-compose.yml`: the docker compose file that defines the Postgres that runs in the CI VM. +- `postgres-initial-setup.sql`: a SQL file that completes set up steps required in the CI Postgres in the one-off initial setup. +- `sqlfluff-check.sh`: a script to check a folder's SQL files and validate them. Fails if any SQL is not parseable. +- `.sqlfluff`: some config for sqlfluff. +- `build-master-artifacts.sh`: a script that generates the `manifest.json` for the master branch and places it in a target folder. +- `.azure-pipelines.blablabla.yml`: the actual pipeline definitions for Azure. From 901d39a045f910cffb96e7a6fe739882e98b8b4f Mon Sep 17 00:00:00 2001 From: Pablo Martin Date: Thu, 3 Apr 2025 13:50:42 +0200 Subject: [PATCH 4/7] change fetch size --- ci/postgres-initial-setup.sql | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci/postgres-initial-setup.sql b/ci/postgres-initial-setup.sql index dc3928c..e0c28ff 100644 --- a/ci/postgres-initial-setup.sql +++ b/ci/postgres-initial-setup.sql @@ -7,6 +7,8 @@ CREATE SERVER dwh_prd FOREIGN DATA WRAPPER postgres_fdw OPTIONS (host '$PRD_HOST', dbname '$PRD_DB', port '$PRD_PORT'); +ALTER SERVER dwh_prd OPTIONS (fetch_size '100000'); + CREATE USER MAPPING FOR current_user SERVER dwh_prd OPTIONS (user '$PRD_CI_USER', password '$PRD_CI_PASSWORD'); From 8ea3f0b6a27b12ce914276cfc20e33400e6dab07 Mon Sep 17 00:00:00 2001 From: Pablo Martin Date: Fri, 4 Apr 2025 14:46:31 +0200 Subject: [PATCH 5/7] testing adjustmets --- ci/README.md | 5 +++-- ci/ci-vm-setup.sh | 4 +++- ci/docker-compose.yml | 4 ++-- ci/postgres-initial-setup.sql | 4 ++-- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/ci/README.md b/ci/README.md index 534c42e..9094747 100644 --- a/ci/README.md +++ b/ci/README.md @@ -23,8 +23,9 @@ If you don't have this, it probably means you need to review our 
Infrastructure - SSH into the CI VM. - Create a folder in the user home directory named `dbt-ci`. -- Create a copy of the `ci.env` file there naming it `.env` (`cp ci.env ~/dbt-ci/.env`) and fill it with values of your choice. -- Execute the script named `ci-vm-setup.sh` in this folder. This script will take care of most of the setup that need to be executed, including: +- Create a copy of the `ci/ci.env` file there naming it `.env` (assuming you're in the repo root dir, `cp ci/ci.env ~/dbt-ci/.env`) and fill it with values of your choice. +- Modify the `docker-compose.yml` file with values for the Postgres server parameters. Which values to set depend on your hardware. If you don't want or can't decide values for these parameters, you can just comment the lines. +- Enter the `ci` folder and execute the script named `ci-vm-setup.sh` in with `.env` file you just filled in sourced (you can run this: `(set -a && source ~/dbt-ci/.env && set +a && bash ci-vm-setup.sh)`). This script will take care of most of the setup that need to be executed, including: - Preparing the postgres database. - Setting up the dockerized postgres with the right database, FDW, etc. - Prepare the `profiles.yml` file. 
diff --git a/ci/ci-vm-setup.sh b/ci/ci-vm-setup.sh index 997ce86..a2b8ca3 100644 --- a/ci/ci-vm-setup.sh +++ b/ci/ci-vm-setup.sh @@ -1,8 +1,10 @@ # Start container -docker compose build -d +docker compose --env-file ~/dbt-ci/.env up -d # Run script to set things up in Postgres (DB, FDWs, etc) + envsubst < postgres-initial-setup.sql | psql -h $POSTGRES_HOST -U $POSTGRES_USER -d postgres # Copy profiles file +mkdir -p ~/.dbt cp ci.profiles.yml ~/.dbt/profiles.yml diff --git a/ci/docker-compose.yml b/ci/docker-compose.yml index c1c14ec..31f0d6d 100644 --- a/ci/docker-compose.yml +++ b/ci/docker-compose.yml @@ -4,7 +4,7 @@ services: container_name: postgres_db environment: POSTGRES_USER: ${POSTGRES_USER} - POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} + POSTGRES_PASSWORD: ${PGPASSWORD} POSTGRES_DB: postgres ports: - "5432:5432" @@ -12,7 +12,7 @@ services: - postgres_data:/var/lib/postgresql/data # Note that some of the values below are very HW specific. You should # absolutely adjust them to the available hardware where this will be - # running. This might help if you feel lost: + # running. 
This might help if you feel lost: https://pgtune.leopard.in.ua/ command: [ "-c", "max_connections=XX", "-c", "shared_buffers=XGB", diff --git a/ci/postgres-initial-setup.sql b/ci/postgres-initial-setup.sql index e0c28ff..805c065 100644 --- a/ci/postgres-initial-setup.sql +++ b/ci/postgres-initial-setup.sql @@ -1,5 +1,5 @@ -CREATE DATABASE prd-pointer; -\c prd-pointer +CREATE DATABASE prd_pointer; +\c prd_pointer CREATE EXTENSION postgres_fdw; From 184ea4dec591960eea85512112e54fbb6e3a85b8 Mon Sep 17 00:00:00 2001 From: Pablo Martin Date: Fri, 4 Apr 2025 15:30:26 +0200 Subject: [PATCH 6/7] rename prd_pointer --- ci/.azure-pipelines.pr.yml | 6 +++--- ci/ci.profiles.yml | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ci/.azure-pipelines.pr.yml b/ci/.azure-pipelines.pr.yml index c12223a..3f5eb2c 100644 --- a/ci/.azure-pipelines.pr.yml +++ b/ci/.azure-pipelines.pr.yml @@ -33,7 +33,7 @@ steps: - script: | set -a && source .env && set +a - psql -h $POSTGRES_HOST -U $POSTGRES_USER -d prd-pointer -c "select refresh_foreign_schemas(ARRAY[$PRD_SCHEMAS_TO_SYNC]);" + psql -h $POSTGRES_HOST -U $POSTGRES_USER -d prd_pointer -c "select refresh_foreign_schemas(ARRAY[$PRD_SCHEMAS_TO_SYNC]);" displayName: 'Sync Foreign Data Wrappers schemas' - script: | @@ -83,7 +83,7 @@ steps: - script: | set -a && source .env && set +a - psql -h $POSTGRES_HOST -U $POSTGRES_USER -d prd-pointer -c "DROP SCHEMA IF EXISTS $CI_SCHEMA_NAME CASCADE;" + psql -h $POSTGRES_HOST -U $POSTGRES_USER -d prd_pointer -c "DROP SCHEMA IF EXISTS $CI_SCHEMA_NAME CASCADE;" displayName: "Preemptive DROP SCHEMA" @@ -125,7 +125,7 @@ steps: - script: | set -a && source .env && set +a - psql -h $POSTGRES_HOST -U $POSTGRES_USER -d prd-pointer -c "DROP SCHEMA IF EXISTS $CI_SCHEMA_NAME CASCADE;" + psql -h $POSTGRES_HOST -U $POSTGRES_USER -d prd_pointer -c "DROP SCHEMA IF EXISTS $CI_SCHEMA_NAME CASCADE;" condition: always() displayName: 'Delete PR schema' \ No newline at end of file diff --git 
a/ci/ci.profiles.yml b/ci/ci.profiles.yml index 06190b1..fab5dfa 100644 --- a/ci/ci.profiles.yml +++ b/ci/ci.profiles.yml @@ -1,7 +1,7 @@ dwh_dbt: outputs: - prd-pointer: - dbname: prd-pointer + prd_pointer: + dbname: prd_pointer host: "{{ env_var('POSTGRES_HOST') }}" port: "{{ env_var('POSTGRES_PORT') | as_number }}" schema: public @@ -10,4 +10,4 @@ dwh_dbt: type: postgres threads: 1 - target: prd-pointer \ No newline at end of file + target: prd_pointer \ No newline at end of file From 3428231ad7bcbe89faabf3ff8f482eeaaa743ecb Mon Sep 17 00:00:00 2001 From: Pablo Martin Date: Tue, 8 Apr 2025 12:32:56 +0200 Subject: [PATCH 7/7] typos and touches --- ci/README.md | 4 ++-- ci/ci-vm-setup.sh | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/README.md b/ci/README.md index 9094747..4bbb78a 100644 --- a/ci/README.md +++ b/ci/README.md @@ -24,8 +24,8 @@ If you don't have this, it probably means you need to review our Infrastructure - SSH into the CI VM. - Create a folder in the user home directory named `dbt-ci`. - Create a copy of the `ci/ci.env` file there naming it `.env` (assuming you're in the repo root dir, `cp ci/ci.env ~/dbt-ci/.env`) and fill it with values of your choice. -- Modify the `docker-compose.yml` file with values for the Postgres server parameters. Which values to set depend on your hardware. If you don't want or can't decide values for these parameters, you can just comment the lines. -- Enter the `ci` folder and execute the script named `ci-vm-setup.sh` in with `.env` file you just filled in sourced (you can run this: `(set -a && source ~/dbt-ci/.env && set +a && bash ci-vm-setup.sh)`). This script will take care of most of the setup that need to be executed, including: +- Copy the `docker-compose.yml` file into `dbt-ci`. Modify your copy with values for the Postgres server parameters. Which values to set depend on your hardware. If you don't want or can't decide values for these parameters, you can just comment the lines. 
+- Enter the `ci` folder and execute the script named `ci-vm-setup.sh` with the `.env` file you just filled in sourced (you can run this: `(set -a && source ~/dbt-ci/.env && set +a && bash ci-vm-setup.sh)`). This script will take care of most of the setup that needs to be executed, including: - Preparing the postgres database. - Setting up the dockerized postgres with the right database, FDW, etc. - Prepare the `profiles.yml` file. diff --git a/ci/ci-vm-setup.sh b/ci/ci-vm-setup.sh index a2b8ca3..7bc18ca 100644 --- a/ci/ci-vm-setup.sh +++ b/ci/ci-vm-setup.sh @@ -1,5 +1,5 @@ # Start container -docker compose --env-file ~/dbt-ci/.env up -d +docker compose -f ~/dbt-ci/docker-compose.yml --env-file ~/dbt-ci/.env up -d # Run script to set things up in Postgres (DB, FDWs, etc)