From 2044ad002bd61a9c5a8a31ccf4e5790180aa6189 Mon Sep 17 00:00:00 2001
From: Pablo Martin
Date: Tue, 28 May 2024 18:50:02 +0200
Subject: [PATCH 01/12] refurbish user permissions setup for dwh

---
 human-script.md | 108 +++++++++++++++++++++++++++++++++++-------------
 1 file changed, 79 insertions(+), 29 deletions(-)

diff --git a/human-script.md b/human-script.md
index cd07f69..0935305 100644
--- a/human-script.md
+++ b/human-script.md
@@ -410,60 +410,110 @@ Follow this to deploy the entire data infra.

 - Validate the deployment by trying to log into the database with the `dwh_admin_<env>` user from your favourite SQL client (you can use DBeaver, for example). Be aware that your VPN connection should be active so that the DWH is reachable from your device.

-### 4.2 Create database and schemas
+### 4.2 Create database

-- Run the following commands to create a new database and the needed schemas
+- Run the following commands to create a new database

  ```sql
  CREATE DATABASE dwh;
-  -- Change active DB to dwh
-  CREATE SCHEMA staging;
-  CREATE SCHEMA intermediate;
-  CREATE SCHEMA reporting;
  ```

-### 4.3 Create users and roles
+- From now on, use this database for everything
+
+### 4.3 Create schemas, roles and users

 - Run the following script to create:
-  - A `modeler` role, owner of the `staging`, `intermediate` and `reporting` schemas.
-  - A `consumer` role, capable of reading the `reporting` schema.
-  - A dbt user, with `modeler` role.
-  - An airbyte user, with permission to create new schemas.
-  - A Power BI user, with `consumer` role.
+  - A `dwh_builder` role, which:
+    - Owns the `staging`, `intermediate` and `reporting` schemas, and can therefore do whatever it needs in them.
+    - Can read `sync_` schemas.
+    - Is designed to support `dbt run`.
+  - A `modeler` role, which:
+    - Can read the `staging`, `intermediate` and `reporting` schemas.
+    - Can read `sync_` schemas.
+    - Is designed for data team members and power users to be able to read everything within the DWH.
+  - A `consumer` role, which:
+    - Can read the `reporting` schema.
+  - A user `dbt_user`, with the `dwh_builder` role.
+  - A user `powerbi_user`, with the `consumer` role.
+  - A user `airbyte_user`, with permission to create new schemas.
- *Note: replace the password fields with serious passwords and note them down.*
- *Note: replace the name of the admin user.*

  ```sql
  -- Start here, logged in as the dwh admin
  CREATE ROLE airbyte_user LOGIN PASSWORD 'password' VALID UNTIL 'infinity';
  GRANT CREATE ON DATABASE dwh TO airbyte_user;

-  CREATE ROLE modeler INHERIT;
-  GRANT USAGE ON SCHEMA staging TO modeler;
-  GRANT USAGE ON SCHEMA intermediate TO modeler;
-  GRANT USAGE ON SCHEMA reporting TO modeler;
-  GRANT ALL ON ALL TABLES IN SCHEMA staging TO modeler;
-  GRANT ALL ON ALL TABLES IN SCHEMA intermediate TO modeler;
-  GRANT ALL ON ALL TABLES IN SCHEMA reporting TO modeler;
+  -- Login as the airbyte_user
+  CREATE SCHEMA sync_default;

-  GRANT modeler TO dwh_admin_<env>;
-  ALTER SCHEMA staging OWNER TO modeler;
-  ALTER SCHEMA intermediate OWNER TO modeler;
-  ALTER SCHEMA reporting OWNER TO modeler;
+  -- Login as the dwh admin again
+  CREATE SCHEMA staging;
+  CREATE SCHEMA intermediate;
+  CREATE SCHEMA reporting;
+
+  CREATE ROLE dwh_builder INHERIT;
+  ALTER SCHEMA staging OWNER TO dwh_builder;
+  ALTER SCHEMA intermediate OWNER TO dwh_builder;
+  ALTER SCHEMA reporting OWNER TO dwh_builder;

  CREATE ROLE dbt_user LOGIN PASSWORD 'password' VALID UNTIL 'infinity';
-  GRANT modeler TO dbt_user;
+  GRANT dwh_builder TO dbt_user;

  CREATE ROLE consumer INHERIT;
+  CREATE ROLE powerbi_user LOGIN PASSWORD 'password' VALID UNTIL 'infinity';
+  GRANT consumer TO powerbi_user;
+
+  CREATE ROLE modeler INHERIT;
+  -- You might want to create a first personal user with modeler role here
+
+  -- Login as airbyte_user
+
+  GRANT USAGE ON SCHEMA sync_default TO dwh_builder;
+  GRANT SELECT ON ALL TABLES IN SCHEMA sync_default TO dwh_builder;
+  ALTER DEFAULT PRIVILEGES IN SCHEMA sync_default GRANT SELECT ON TABLES TO dwh_builder;
+
+  GRANT USAGE ON SCHEMA sync_default TO modeler;
+  GRANT SELECT ON ALL TABLES IN SCHEMA sync_default TO modeler;
+  ALTER DEFAULT PRIVILEGES IN SCHEMA sync_default GRANT SELECT ON TABLES TO modeler;
+
+  -- Login as dbt_user
+
+  GRANT USAGE ON SCHEMA staging TO modeler;
+  GRANT SELECT ON ALL TABLES IN SCHEMA staging TO modeler;
+  ALTER DEFAULT PRIVILEGES IN SCHEMA staging GRANT SELECT ON TABLES TO modeler;
+  GRANT USAGE ON SCHEMA intermediate TO modeler;
+  GRANT SELECT ON ALL TABLES IN SCHEMA intermediate TO modeler;
+  ALTER DEFAULT PRIVILEGES IN SCHEMA intermediate GRANT SELECT ON TABLES TO modeler;
+  GRANT USAGE ON SCHEMA reporting TO modeler;
+  GRANT SELECT ON ALL TABLES IN SCHEMA reporting TO modeler;
+  ALTER DEFAULT PRIVILEGES IN SCHEMA reporting GRANT SELECT ON TABLES TO modeler;
+
+  GRANT USAGE ON SCHEMA reporting TO consumer;
  GRANT SELECT ON ALL TABLES IN SCHEMA reporting TO consumer;
  ALTER DEFAULT PRIVILEGES IN SCHEMA reporting GRANT SELECT ON TABLES TO consumer;
-
-  CREATE ROLE powerbi_user LOGIN PASSWORD 'password' VALID UNTIL 'infinity';
-  GRANT consumer TO powerbi_user;
  ```

-- If you want, you might also want to create more users depending on your needs. Typically, data team members should also have the `modeler` role.
+- On adding new users:
+  - Typically, you will want to create personal accounts for data team members with the `modeler` role so that they can query everything in the DWH (see the sketch below).
+  - Any other services or users that need to access the reporting layer can be given the `consumer` role.
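As a quick illustration of that pattern, here is a minimal sketch of how a personal account could be created from a terminal. Treat it as an assumption-laden example: the user name, password and host are placeholders, not values from this guide.

```bash
# Hedged sketch: create a personal account with the modeler role.
# "jane_doe", the password and the host are illustrative placeholders.
psql "host=<your-dwh-host> dbname=dwh user=dwh_admin_<env>" <<'SQL'
CREATE ROLE jane_doe LOGIN PASSWORD 'a-serious-password' VALID UNTIL 'infinity';
GRANT modeler TO jane_doe;
SQL
```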
+- Furthermore, `sync_` schema permissions need to be dynamically managed from this point on. This means that:
+  - Generally, all `sync_` schemas should be created by the `airbyte_user`.
+  - Whenever a new `sync_` schema comes to life, both the `modeler` and `dwh_builder` roles should receive access. You can use the following command template (a reusable wrapper is sketched right after it):
+
+  ```sql
+  -- Login as airbyte_user
+
+  GRANT USAGE ON SCHEMA sync_<name> TO dwh_builder;
+  GRANT SELECT ON ALL TABLES IN SCHEMA sync_<name> TO dwh_builder;
+  ALTER DEFAULT PRIVILEGES IN SCHEMA sync_<name> GRANT SELECT ON TABLES TO dwh_builder;
+
+  GRANT USAGE ON SCHEMA sync_<name> TO modeler;
+  GRANT SELECT ON ALL TABLES IN SCHEMA sync_<name> TO modeler;
+  ALTER DEFAULT PRIVILEGES IN SCHEMA sync_<name> GRANT SELECT ON TABLES TO modeler;
+  ```
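Since these grants have to be repeated for every new `sync_` schema, you may want to wrap the template in a small helper. This is only a sketch under assumptions: it presumes `psql` access as `airbyte_user`, and the host name is a placeholder.

```bash
#!/bin/bash
# Hedged sketch: apply the standard grants to a freshly created sync_ schema.
# Usage: ./grant_sync_schema.sh sync_mysource
set -e
SCHEMA="$1"  # e.g. sync_mysource; assumed to already exist
psql "host=<your-dwh-host> dbname=dwh user=airbyte_user" <<SQL
GRANT USAGE ON SCHEMA ${SCHEMA} TO dwh_builder;
GRANT SELECT ON ALL TABLES IN SCHEMA ${SCHEMA} TO dwh_builder;
ALTER DEFAULT PRIVILEGES IN SCHEMA ${SCHEMA} GRANT SELECT ON TABLES TO dwh_builder;
GRANT USAGE ON SCHEMA ${SCHEMA} TO modeler;
GRANT SELECT ON ALL TABLES IN SCHEMA ${SCHEMA} TO modeler;
ALTER DEFAULT PRIVILEGES IN SCHEMA ${SCHEMA} GRANT SELECT ON TABLES TO modeler;
SQL
```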
 ## 5. Airbyte

From f64b5d20c2054c6d9bd7195918717d144b91d064 Mon Sep 17 00:00:00 2001
From: Pablo Martin
Date: Fri, 14 Jun 2024 10:52:40 +0200
Subject: [PATCH 02/12] fix headers

---
 human-script.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/human-script.md b/human-script.md
index cd07f69..235e70c 100644
--- a/human-script.md
+++ b/human-script.md
@@ -631,20 +631,20 @@ WIP: we need support to learn how to use statuspage.io

 - If you are working on a dev or staging environment, you might want to skip this section.

-## 9.1 DWH
+### 9.1 DWH

 - Backups are managed with Azure. In the Azure Portal page for the PostgreSQL service, visit section `Backup and restore`. Production servers should have 14 days as a retention period.

-## 9.2 Jumphost
+### 9.2 Jumphost

 - Jumphosts barely hold any data at all. Although it's quite tempting to forget about this and simply raise another VM if something goes wrong, it would be annoying to have to regenerate the keys of both the VPN server and other clients.
 - To solve this, make a habit of making regular copies of the Wireguard config file in another machine. Theoretically, only making a copy every time it gets modified should be enough.

-## 9.3 Airbyte
+### 9.3 Airbyte

 - Our strategy for backing up Airbyte is to back up the entire VM.
 - WIP

-## 9.4 PBI Gateway
+### 9.4 PBI Gateway

 - The PBI Gateway is pretty much stateless. Given this, if there are any issues or disasters on the current VM, simply create another one and set up the gateway again.

From c986715e984cfe4176a77f08c123e46dc39fd73b Mon Sep 17 00:00:00 2001
From: Pablo Martin
Date: Thu, 14 Nov 2024 16:19:22 +0100
Subject: [PATCH 03/12] wip

---
 human-script.md | 177 +++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 145 insertions(+), 32 deletions(-)

diff --git a/human-script.md b/human-script.md
index dee7d18..ae30413 100644
--- a/human-script.md
+++ b/human-script.md
@@ -115,7 +115,7 @@ Follow this to deploy the entire data infra.
   - Protocol: TCP
   - Action: Allow
   - Priority: 110
-  - Airbyte web rule
+  - Web server rule
   - Name: AllowWebFromJumphostInbound
   - Source: the address range for the `jumphost-subnet`. In this example, `10.69.0.0/29`.
   - Source port ranges: *
@@ -515,6 +515,62 @@ Follow this to deploy the entire data infra.
  ALTER DEFAULT PRIVILEGES IN SCHEMA sync_<name> GRANT SELECT ON TABLES TO modeler;
  ```

+## 5. Web Gateway
+
+We will deploy a dedicated VM to act as a web server for internal services.
+
+### 5.1 Deploy Web Gateway VM
+
+- Create a new VM following these steps.
+  - Basic settings
+    - Name it: `web-gateway-<env>`
+    - Use Ubuntu Server 22.04
+    - Use size: `Standard_B1s`
+    - Use username: `azureuser`
+    - Use the SSH Key: `superhog-data-general-ssh-<env>`
+    - Select the option `None` for Public inbound ports.
+  - Disk settings
+    - Defaults are fine. This barely needs any disk.
+  - Networking
+    - Attach to the virtual network `superhog-data-vnet-<env>`
+    - Attach to the subnet `services-subnet`
+    - Assign no public IP.
+    - For setting `NIC network security group` select option `None`
+  - Management settings
+    - Defaults are fine.
+  - Monitoring
+    - Defaults are fine.
+  - Advanced
+    - Defaults are fine.
+  - Add tags:
+    - `team: data`
+    - `environment: <env>`
+    - `project: network`
+- Once the VM is running, you should be able to ssh into the machine when your VPN is active.
+
+### 9.2 Deploying Caddy
+
+- We need to install caddy in the VM. You can do so with the following commands:
+
+  ```bash
+  sudo apt install -y debian-keyring debian-archive-keyring apt-transport-https curl
+  curl -1sLf 'https://dl.cloudsmith.io/public/caddy/stable/gpg.key' | sudo gpg --dearmor -o /usr/share/keyrings/caddy-stable-archive-keyring.gpg
+  curl -1sLf 'https://dl.cloudsmith.io/public/caddy/stable/debian.deb.txt' | sudo tee /etc/apt/sources.list.d/caddy-stable.list
+  sudo apt update
+  sudo apt install caddy
+  ```
+
+- After the previous commands, you can verify that caddy is running properly as a systemd service with: `systemctl status caddy`
+- You can also verify that Caddy is reachable (it should be) by running the following command from your laptop while connected to the VPN: `curl web-gateway-<env>.<env>.data.superhog.com`. If you see a wall of HTML that looks like Caddy's demo page, it means Caddy is working as expected.
+
+### 9.3 Pointing Caddy to internal services
+
+- Caddy will need to be configured to act as the web server or reverse proxy for the different services within the services subnet. The details of these configurations are defined in sections below.
+- As a general note, the pattern will generally be:
+  - You will need to include the right entry in the `Caddyfile` at `/etc/caddy/Caddyfile`.
+  - You will need to reload caddy with `sudo systemctl reload caddy.service`.
+  - If the web server needs to reach a specific port in some other VM, you will need to sort networking security out. If the VM you need to reach from the web server is within the internal services subnet, you'll have to add the necessary Inbound rules in the NSG `superhog-data-nsg-services-<env>`.

 ## 5. Airbyte

 ### 5.1 Deploying Airbyte VM

@@ -556,8 +612,6 @@ Follow this to deploy the entire data infra.
   AIRBYTE_ADMIN_USER=your-user-here
   AIRBYTE_ADMIN_PASSWORD=your-password-here

-  YOUR_ENV=<your-env>
-  PRIVATE_DNS_ZONE_NAME=${YOUR_ENV}.data.superhog.com

   echo "Installing docker."
   apt-get update -y
@@ -585,34 +639,64 @@ Follow this to deploy the entire data infra.
   echo "Restarting Airbyte."
   docker compose down; docker compose up -d

-  echo "Deploying Caddy Webserver"
-  apt install -y debian-keyring debian-archive-keyring apt-transport-https curl
-  curl -1sLf 'https://dl.cloudsmith.io/public/caddy/stable/gpg.key' | sudo gpg --dearmor -o /usr/share/keyrings/caddy-stable-archive-keyring.gpg
-  curl -1sLf 'https://dl.cloudsmith.io/public/caddy/stable/debian.deb.txt' | sudo tee /etc/apt/sources.list.d/caddy-stable.list
-  apt update
-  apt install caddy
-
-  echo "Write caddyfile"
-
-  touch /etc/caddy/Caddyfile
-  cat > /etc/caddy/Caddyfile << EOL
-
-  # Airbyte web UI
-  http://airbyte-${YOUR_ENV}.${PRIVATE_DNS_ZONE_NAME} {
-    reverse_proxy localhost:8000
-  }
-
-  EOL
-
-  echo "Restart caddy"
-  systemctl restart caddy
-
-  echo "You can now access at http://airbyte-${YOUR_ENV}.${PRIVATE_DNS_ZONE_NAME}"
+  echo "You can now access at http://localhost:8000"

   echo "Finished."
  ```

-- Visit `airbyte-<env>.<env>.data.superhog.com`. If you are prompted for user and password, it means Airbyte is running properly and is reachable.
+- To check that Airbyte is running fine, run this command from a terminal within the Airbyte VM: `curl localhost:8000`. You should see some HTML for Airbyte's access denied page.
+
+### 5.3 Making Airbyte Web UI reachable
+
+- To provide access to the Airbyte UI, we will have to integrate it with the web gateway and our networking configurations.
+- First, we need to allow the web gateway to reach Airbyte's locally served web server.
+  - Use the Azure portal to navigate to the NSG `superhog-data-nsg-services-<env>` page.
+  - Add a new Inbound rule with the following details:
+    - Name: `Allow8000TCPWithinSubnet`
+    - Source: the address range for the `services-subnet`. In this example, `10.69.0.64/26`.
+    - Source port ranges: *
+    - Destination: the address range for the `services-subnet`. In this example, `10.69.0.64/26`.
+    - Destination port ranges: 8000
+    - Protocol: TCP
+    - Action: Allow
+    - Priority: pick a number higher than the existing Allow rules but lower than the `DenyAllInbound` rules, so it is evaluated after the former and before the latter.
+- Next, we need to set a DNS entry to generate the URL that will be used to navigate to the Airbyte UI.
+  - Use the Azure portal to navigate to the Private DNS Zone `<env>.data.superhog.com` page.
+  - Create a new record with the following details:
+    - Name: `airbyte`
+    - Type: `A`
+    - IP Address: Look for the private IP address that was assigned to the VM `web-gateway-<env>` and place it here.
+- Finally, we must create an entry in caddy's config file.
+  - SSH into the web gateway VM.
+  - Make a script with these commands and run it with `sudo`:
+
+  ```bash
+
+  YOUR_ENV=<your-env>
+  PRIVATE_DNS_ZONE_NAME=${YOUR_ENV}.data.superhog.com
+  AIRBYTE_SUBDOMAIN=airbyte # If you followed this guide for the DNS bit, leave this value. If you chose a different subdomain, adjust accordingly
+  FULL_AIRBYTE_URL=${AIRBYTE_SUBDOMAIN}.${PRIVATE_DNS_ZONE_NAME}
+
+  echo "Write caddyfile"
+
+  cat > /etc/caddy/Caddyfile << EOL
+
+  # Airbyte web UI
+  http://${FULL_AIRBYTE_URL} {
+    reverse_proxy http://airbyte-${YOUR_ENV}.${PRIVATE_DNS_ZONE_NAME}:8000
+  }
+
+  EOL
+
+  echo "Restart caddy"
+  systemctl restart caddy
+
+  echo "You can now access at http://${FULL_AIRBYTE_URL}"
+  ```
+
+- If everything is working properly, you should now be able to reach airbyte at the printed URL.
+- If something doesn't work, I would advise troubleshooting through the chain of VMs to find where the connection is breaking down.
+#TODO CONTINUE HERE

 ## 6. Power BI

@@ -677,24 +761,53 @@ WIP: we are planning on using Azure Dashboards with metrics.

 WIP: we need support to learn how to use statuspage.io

-## 9. Backups
+### 9.3 Configuring Caddy
+
+- Now that caddy is running, you can configure it to serve whatever you need.
+- This instance is designed to be the external entrypoint that serves any internal webpages to users of web services within the data virtual network. It's possible that, by the time you are reading this, there are more services than we planned originally.
+- As an example, we will now show how to reverse proxy the Airbyte UI. For other services, you can follow a similar pattern.
  - Edit the caddy config file with `sudo nano /etc/caddy/Caddyfile`
  - To add a reverse proxy for Airbyte, add this entry:

  ```bash
  http://airbyte.prd.data.superhog.com {
    reverse_proxy http://airbyte-<env>.<env>.data.superhog.com {
    #reverse_proxy http://10.69.0.68:80 {
      header_up Cookie {>Cookie}
      header_up Host airbyte-prd.prd.data.superhog.com
      header_up X-Real-IP {remote}
      header_up X-Forwarded-For {remote}
      header_up X-Forwarded-Proto {scheme}
    }
  }
  ```

- Note that, if you need to make more configuration changes, you can have Caddy pick up the changes by running `sudo systemctl reload caddy`. This will reload the configuration without incurring the downtime that `stop` and `start` would.

### 9.4 Additional networking actions

- Allow internal service VMs to reach each other on port 80; this is configured in the NSG for the services subnet.

## 10. Backups

- If you are working on a dev or staging environment, you might want to skip this section.

### 10.1 DWH

- Backups are managed with Azure. In the Azure Portal page for the PostgreSQL service, visit section `Backup and restore`. Production servers should have 14 days as a retention period.

### 10.2 Jumphost

- Jumphosts barely hold any data at all. Although it's quite tempting to forget about this and simply raise another VM if something goes wrong, it would be annoying to have to regenerate the keys of both the VPN server and other clients.
- To solve this, make a habit of making regular copies of the Wireguard config file in another machine. Theoretically, only making a copy every time it gets modified should be enough.

### 10.3 Airbyte

- Our strategy for backing up Airbyte is to back up the entire VM.
- WIP

### 10.4 PBI Gateway

- The PBI Gateway is pretty much stateless. Given this, if there are any issues or disasters on the current VM, simply create another one and set up the gateway again.

From 8b1c8b5c9c4a85b7fd38e6cf4f71f185eb8873ae Mon Sep 17 00:00:00 2001
From: Pablo Martin
Date: Tue, 26 Nov 2024 11:05:27 +0100
Subject: [PATCH 04/12] wip

---
 human-script.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/human-script.md b/human-script.md
index ae30413..7017394 100644
--- a/human-script.md
+++ b/human-script.md
@@ -695,8 +695,7 @@ We will deploy a dedicated VM to act as a web server for internal services.
  ```

 - If everything is working properly, you should now be able to reach airbyte at the printed URL.
-- If something doesn't work, I would advise troubleshooting through the chain of VMs to find where the connection is breaking down.
-#TODO CONTINUE HERE
+- If something doesn't work, I would advise troubleshooting through the chain of machines (your device to the VPN box, then to the web gateway, then to the airbyte machine) to find where the connection is breaking down.
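To make that hop-by-hop troubleshooting concrete, here is a rough sketch of the checks you could run at each link of the chain. The host names assume the DNS conventions from this guide; adjust them to your environment.

```bash
# Hedged sketch: walk the chain from your device towards Airbyte.

# 1. From your laptop (VPN active): can you resolve and reach the web gateway?
curl -v http://airbyte.<env>.data.superhog.com

# 2. From the web gateway VM: can Caddy's upstream be reached?
curl -v http://airbyte-<env>.<env>.data.superhog.com:8000

# 3. From the Airbyte VM itself: is the UI served locally at all?
curl -v http://localhost:8000
```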
 ## 6. Power BI

From 94191161b6f21172ed1a990d92fd73b3e0ec45f6 Mon Sep 17 00:00:00 2001
From: Pablo Martin
Date: Tue, 26 Nov 2024 11:13:18 +0100
Subject: [PATCH 05/12] change section numbering

---
 human-script.md | 100 +++++++++++++++++------------------------------
 1 file changed, 36 insertions(+), 64 deletions(-)

diff --git a/human-script.md b/human-script.md
index 7017394..feda00d 100644
--- a/human-script.md
+++ b/human-script.md
@@ -2,14 +2,14 @@ Follow this to deploy the entire data infra.

-## 0. Pre-requisites and conventions
+## 000. Pre-requisites and conventions

 - You need an Azure subscription and a user with administrator rights in it.
 - Whenever you see `<env>`, you should replace that with `dev`, `uat`, `prd` or whatever fits your environment.
 - We traditionally deploy resources on the `UK South` region. Unless stated otherwise, you should deploy resources there.
 - You have an SSH key pair ready to use for access to the different machines. You can always add more pairs later.

-## 1. Resource group and SSH Keypair
+## 010. Resource group and SSH Keypair

 ### 1.1 Create Resource Group

@@ -30,9 +30,9 @@ Follow this to deploy the entire data infra.
 - Pay attention when storing the private key. You probably want to store it in a safe password manager, like Keeper.
 - Optionally, you can also be extra paranoid, generate the SSH key locally and only upload the public key to Azure. Up to you.

-## 2. Networking
+## 020. Networking

-### 2.1 VNET
+### 020.1 VNET

 - Create a virtual network. This virtual network is where all our infra will live. For the rest of this guide, assume this is the network where you must connect services.
   - Name it: `superhog-data-vnet-<env>`
@@ -60,7 +60,7 @@ Follow this to deploy the entire data infra.
   - `environment: <env>`
   - `project: network`

-### 2.2 Network security groups
+### 020.2 Network security groups

 - You will create three network security groups (NSG)
   - Jumphost NSG
@@ -172,7 +172,7 @@ Follow this to deploy the entire data infra.
   - Visit the virtual network page and look for the subnets list
   - For each subnet, select its NSG and attach it

-### 2.3 Private DNS Zone
+### 020.3 Private DNS Zone

 - We will set up a private DNS Zone to avoid using hardcoded IPs to refer to services within the virtual network. This makes integrations more resilient because a service can change its IP and still be reached by other services (as long as other network configs like firewalls are still fine).
 - Create the Private DNS Zone
@@ -186,7 +186,7 @@ Follow this to deploy the entire data infra.
   - Associate it to the virtual network.
   - Enable autoregistration

-### 2.4 Public IP
+### 020.4 Public IP

 - We will need a public IP for the jumphost.
 - Create the public IP
@@ -197,9 +197,9 @@ Follow this to deploy the entire data infra.
   - `environment: <env>`
   - `project: network`

-## 3. Jumphost
+## 030. Jumphost

-### 3.1 Deploy Jumphost VM
+### 030.1 Deploy Jumphost VM

 - The first VM we must deploy is a jumphost, since that will be our door to all other services inside the virtual network.
 - Create the VM
@@ -228,7 +228,7 @@ Follow this to deploy the entire data infra.
   - `environment: <env>`
   - `project: network`

-### 3.2 Configure a VPN Server
+### 030.2 Configure a VPN Server

 - The jumphost we just created is not accessible via SSH from WAN due to the NSG set in the jumphost subnet.
 - To make it so, you should temporarily create a new rule like this in the NSG `superhog-data-nsg-jumphost-<env>`.
@@ -322,7 +322,7 @@ Follow this to deploy the entire data infra.
   - Look for the jumphost VM Network Interface.
   - In the `IP configurations` section, activate the flag `Enable IP forwarding`.

-### 3.3 Configure a DNS Server
+### 030.3 Configure a DNS Server

 - The jumphost is now ready. When the VPN is active on our local device, we can access the services within the virtual network.
 - There is one issue, though: we would like to access services through names, not IPs.
@@ -379,14 +379,14 @@ Follow this to deploy the entire data infra.
 - In your client Wireguard configuration, uncomment the DNS server line we left before
 - Check that the service is running fine by running `dig google.com`. You should see in the output that your laptop has relied on our new DNS to do the name resolution.
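Beyond the public lookup, you may also want to confirm that names in the private zone resolve through the new server. A hedged sketch, with placeholder names:

```bash
# Hedged sketch: verify private-zone resolution through the jumphost DNS.
# The record name is illustrative; use any A record that exists in your zone.
dig airbyte.<env>.data.superhog.com

# If in doubt, query the jumphost's DNS server explicitly by its private IP:
dig @<jumphost-private-ip> airbyte.<env>.data.superhog.com
```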
-### 3.4 Harden the Jumphost VM
+### 030.4 Harden the Jumphost VM

 - In the Jumphost, run the following command to fully disable password-based SSH authentication. This way, access can only be granted with SSH key pairs, which is way more secure: `sudo sed -i -e 's/#PasswordAuthentication yes/PasswordAuthentication no/g' /etc/ssh/sshd_config; sudo systemctl restart ssh`.
 - Remove the AllowSSHInboundTemporarily rule that you added to the NSG `superhog-data-nsg-jumphost-<env>`. We don't need that anymore since we can SSH through the VPN tunnel.

-## 4. DWH
+## 040. DWH

-### 4.1 Deploy PostgreSQL Server
+### 040.1 Deploy PostgreSQL Server

 - Next, we will deploy a Postgres server to act as the DWH.
 - Create a new Azure Database for PostgreSQL flexible servers.
@@ -410,7 +410,7 @@ Follow this to deploy the entire data infra.

 - Validate the deployment by trying to log into the database with the `dwh_admin_<env>` user from your favourite SQL client (you can use DBeaver, for example). Be aware that your VPN connection should be active so that the DWH is reachable from your device.

-### 4.2 Create database
+### 040.2 Create database

 - Run the following commands to create a new database
@@ -420,7 +420,7 @@ Follow this to deploy the entire data infra.

 - From now on, use this database for everything

-### 4.3 Create schemas, roles and users
+### 040.3 Create schemas, roles and users

 - Run the following script to create:
   - A `dwh_builder` role, which:
@@ -515,11 +515,11 @@ Follow this to deploy the entire data infra.
  ALTER DEFAULT PRIVILEGES IN SCHEMA sync_<name> GRANT SELECT ON TABLES TO modeler;
  ```

-## 5. Web Gateway
+## 050. Web Gateway

 We will deploy a dedicated VM to act as a web server for internal services.

-### 5.1 Deploy Web Gateway VM
+### 050.1 Deploy Web Gateway VM

 - Create a new VM following these steps.
   - Basic settings
@@ -548,7 +548,7 @@ Follow this to deploy the entire data infra.
   - `project: network`
 - Once the VM is running, you should be able to ssh into the machine when your VPN is active.

-### 9.2 Deploying Caddy
+### 050.2 Deploying Caddy

 - We need to install caddy in the VM. You can do so with the following commands:
@@ -563,7 +563,7 @@ Follow this to deploy the entire data infra.
 - After the previous commands, you can verify that caddy is running properly as a systemd service with: `systemctl status caddy`
 - You can also verify that Caddy is reachable (it should be) by running the following command from your laptop while connected to the VPN: `curl web-gateway-<env>.<env>.data.superhog.com`. If you see a wall of HTML that looks like Caddy's demo page, it means Caddy is working as expected.

-### 9.3 Pointing Caddy to internal services
+### 050.3 Pointing Caddy to internal services

 - Caddy will need to be configured to act as the web server or reverse proxy for the different services within the services subnet. The details of these configurations are defined in sections below.
 - As a general note, the pattern will generally be (see the sketch after this list):
   - You will need to include the right entry in the `Caddyfile` at `/etc/caddy/Caddyfile`.
   - You will need to reload caddy with `sudo systemctl reload caddy.service`.
   - If the web server needs to reach a specific port in some other VM, you will need to sort networking security out. If the VM you need to reach from the web server is within the internal services subnet, you'll have to add the necessary Inbound rules in the NSG `superhog-data-nsg-services-<env>`.
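As a sketch of that pattern end to end; the service name and port are placeholders for whatever internal service you are exposing, not something defined elsewhere in this guide.

```bash
# Hedged sketch: expose a hypothetical internal service through the web gateway.
# 1. Append an entry to the Caddyfile ("myservice" and port 8080 are placeholders).
sudo tee -a /etc/caddy/Caddyfile > /dev/null << 'EOL'

http://myservice.<env>.data.superhog.com {
    reverse_proxy http://myservice-<env>.<env>.data.superhog.com:8080
}
EOL

# 2. Reload Caddy without downtime.
sudo systemctl reload caddy.service

# 3. Remember the other two steps: an A record pointing at the web gateway,
#    and an NSG inbound rule allowing the gateway to reach port 8080.
```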
-## 5. Airbyte
+## 060. Airbyte

-### 5.1 Deploying Airbyte VM
+### 060.1 Deploying Airbyte VM

 - Airbyte lives on its own VM. To do so, create a new VM following these steps.
   - Basic settings
@@ -602,7 +602,7 @@ Follow this to deploy the entire data infra.
   - `project: airbyte`
 - Once the VM is running, you should be able to ssh into the machine when your VPN is active.

-### 5.2 Deploying Airbyte
+### 060.2 Deploying Airbyte

 - SSH into the VM.
 - Run the following script to install docker and deploy Airbyte
@@ -646,7 +646,7 @@ Follow this to deploy the entire data infra.

 - To check that Airbyte is running fine, run this command from a terminal within the Airbyte VM: `curl localhost:8000`. You should see some HTML for Airbyte's access denied page.

-### 5.3 Making Airbyte Web UI reachable
+### 060.3 Making Airbyte Web UI reachable

 - To provide access to the Airbyte UI, we will have to integrate it with the web gateway and our networking configurations.
 - First, we need to allow the web gateway to reach Airbyte's locally served web server.
@@ -697,9 +697,9 @@ Follow this to deploy the entire data infra.

 - If everything is working properly, you should now be able to reach airbyte at the printed URL.
 - If something doesn't work, I would advise troubleshooting through the chain of machines (your device to the VPN box, then to the web gateway, then to the airbyte machine) to find where the connection is breaking down.

-## 6. Power BI
+## 070. Power BI

-### 6.1 Deploying Power BI VM
+### 070.1 Deploying Power BI VM

 - We need to deploy a Windows VM.
 - Create the VM
@@ -729,7 +729,7 @@ Follow this to deploy the entire data infra.
   - `project: pbi`
 - Try to connect with RDP at `pbi-gateway-<env>.<env>.data.superhog.com`.

-### 6.2 Installing Power BI Data Gateway
+### 070.2 Installing Power BI Data Gateway

 - Log into the VM.
 - Follow the instructions here to download the installer in the VM and set it up:
@@ -745,68 +745,40 @@ Follow this to deploy the entire data infra.
   - Turn the `require_secure_transport` parameter to `Off`.
 - Once you are done, you should be able to visit the PBI Service (the online UI), visit the gateways page in settings and see the gateway listed in the `On-premises data gateways` section.

-## 7. dbt
+## 080. dbt

 - Our dbt project () can be deployed on any Linux VM within the virtual network. The instructions on how to deploy and schedule it are in the project repository.
 - You can opt to deploy it in the same machine where airbyte is stored, since that machine is probably fairly underutilized.

-## 8. Monitoring
+## 090. Monitoring

-### 8.1 Infra monitoring
+### 090.1 Infra monitoring

 WIP: we are planning on using Azure Dashboards with metrics.

-### 8.2 Service status
+### 090.2 Service status

 WIP: we need support to learn how to use statuspage.io

-### 9.3 Configuring Caddy
-
-- Now that caddy is running, you can configure it to serve whatever you need.
-- This instance is designed to be the external entrypoint that serves any internal webpages to users of web services within the data virtual network. It's possible that, by the time you are reading this, there are more services than we planned originally.
-- As an example, we will now show how to reverse proxy the Airbyte UI. For other services, you can follow a similar pattern.
-  - Edit the caddy config file with `sudo nano /etc/caddy/Caddyfile`
-  - To add a reverse proxy for Airbyte, add this entry:
-
-  ```bash
-  http://airbyte.prd.data.superhog.com {
-    reverse_proxy http://airbyte-<env>.<env>.data.superhog.com {
-    #reverse_proxy http://10.69.0.68:80 {
-      header_up Cookie {>Cookie}
-      header_up Host airbyte-prd.prd.data.superhog.com
-      header_up X-Real-IP {remote}
-      header_up X-Forwarded-For {remote}
-      header_up X-Forwarded-Proto {scheme}
-    }
-  }
-  ```
-
-- Note that, if you need to make more configuration changes, you can have Caddy pick up the changes by running `sudo systemctl reload caddy`. This will reload the configuration without incurring the downtime that `stop` and `start` would.
-
-### 9.4 Additional networking actions
-
-- Allow internal service VMs to reach each other on port 80; this is configured in the NSG for the services subnet.

-## 10. Backups
+## 100. Backups

 - If you are working on a dev or staging environment, you might want to skip this section.

-### 10.1 DWH
+### 100.1 DWH

 - Backups are managed with Azure. In the Azure Portal page for the PostgreSQL service, visit section `Backup and restore`. Production servers should have 14 days as a retention period.

-### 10.2 Jumphost
+### 100.2 Jumphost

 - Jumphosts barely hold any data at all. Although it's quite tempting to forget about this and simply raise another VM if something goes wrong, it would be annoying to have to regenerate the keys of both the VPN server and other clients.
 - To solve this, make a habit of making regular copies of the Wireguard config file in another machine. Theoretically, only making a copy every time it gets modified should be enough.

-### 10.3 Airbyte
+### 100.3 Airbyte

 - Our strategy for backing up Airbyte is to back up the entire VM.
 - WIP

-### 10.4 PBI Gateway
+### 100.4 PBI Gateway

 - The PBI Gateway is pretty much stateless. Given this, if there are any issues or disasters on the current VM, simply create another one and set up the gateway again.

From 5cd91f8f678f852030c165e7fae6cd1dbc7ac968 Mon Sep 17 00:00:00 2001
From: Pablo Martin
Date: Tue, 26 Nov 2024 11:27:49 +0100
Subject: [PATCH 06/12] add missing dns reference

---
 human-script.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/human-script.md b/human-script.md
index feda00d..7539da5 100644
--- a/human-script.md
+++ b/human-script.md
@@ -7,11 +7,10 @@ Follow this to deploy the entire data infra.

 - You need an Azure subscription and a user with administrator rights in it.
 - Whenever you see `<env>`, you should replace that with `dev`, `uat`, `prd` or whatever fits your environment.
 - We traditionally deploy resources on the `UK South` region. Unless stated otherwise, you should deploy resources there.
-- You have an SSH key pair ready to use for access to the different machines. You can always add more pairs later.

 ## 010. Resource group and SSH Keypair

-### 1.1 Create Resource Group
+### 010.1 Create Resource Group

 - Create a resource group. This resource group will hold all the resources. For the rest of this guide, assume this is the resource group where you must create resources.
   - Name it: `superhog-data-rg-<env>`
   - Add tags:
     - `team: data`
     - `environment: <env>`

-### 1.2 SSH Keypair
+### 010.2 SSH Keypair

 - We will create an SSH Keypair for this deployment. It will be used to access VMs, Git repos and other services.
 - Create the SSH Key pair
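If you prefer the paranoid option mentioned in this section (generating the key locally and uploading only the public half to Azure), a minimal sketch could look like this; the file name and comment are placeholders:

```bash
# Hedged sketch: generate the key pair locally, keep the private half off Azure.
ssh-keygen -t ed25519 -f ~/.ssh/superhog-data-general-ssh-<env> -C "superhog-data-<env>"

# Upload only the .pub file when Azure asks for the public key:
cat ~/.ssh/superhog-data-general-ssh-<env>.pub
```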
@@ -567,6 +566,7 @@ We will deploy a dedicated VM to act as a web server for internal services.

 - Caddy will need to be configured to act as the web server or reverse proxy for the different services within the services subnet. The details of these configurations are defined in sections below.
 - As a general note, the pattern will generally be:
+  - Create the right A record in the Private DNS records so that you point users with some subdomain towards the web gateway.
   - You will need to include the right entry in the `Caddyfile` at `/etc/caddy/Caddyfile`.
   - You will need to reload caddy with `sudo systemctl reload caddy.service`.
   - If the web server needs to reach a specific port in some other VM, you will need to sort networking security out. If the VM you need to reach from the web server is within the internal services subnet, you'll have to add the necessary Inbound rules in the NSG `superhog-data-nsg-services-<env>`.

From 6a5f6ad0ff7de27138f96e84dbb897177abe1cf7 Mon Sep 17 00:00:00 2001
From: Pablo Martin
Date: Wed, 18 Dec 2024 12:56:55 +0100
Subject: [PATCH 07/12] add user creation and permissions pattern

---
 human-script.md | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/human-script.md b/human-script.md
index 7539da5..03b581b 100644
--- a/human-script.md
+++ b/human-script.md
@@ -436,6 +436,7 @@ Follow this to deploy the entire data infra.
   - A user `dbt_user`, with the `dwh_builder` role.
   - A user `powerbi_user`, with the `consumer` role.
   - A user `airbyte_user`, with permission to create new schemas.
+  - A user `billingdb_reader`, with permission to read some tables from the reporting schema.

 - *Note: replace the password fields with serious passwords and note them down.*
 - *Note: replace the name of the admin user.*
@@ -464,6 +465,8 @@ Follow this to deploy the entire data infra.
  CREATE ROLE powerbi_user LOGIN PASSWORD 'password' VALID UNTIL 'infinity';
  GRANT consumer TO powerbi_user;

+  CREATE ROLE billingdb_reader LOGIN PASSWORD 'password' VALID UNTIL 'infinity';
+
  CREATE ROLE modeler INHERIT;
  -- You might want to create a first personal user with modeler role here

  -- Login as airbyte_user
@@ -513,6 +516,17 @@ Follow this to deploy the entire data infra.
  GRANT SELECT ON ALL TABLES IN SCHEMA sync_<name> TO modeler;
  ALTER DEFAULT PRIVILEGES IN SCHEMA sync_<name> GRANT SELECT ON TABLES TO modeler;
  ```

+- This script also doesn't specify exactly which tables the `billingdb_reader` should read from, since providing full access to the entire reporting schema would be excessive. You can specify which tables should be readable by the user like this:
+
+  ```sql
+  -- Login as dbt_user
+
+  GRANT USAGE ON SCHEMA reporting TO billingdb_reader;
+  GRANT SELECT ON TABLE reporting.<table_name> TO billingdb_reader;
+  GRANT SELECT ON TABLE reporting.<table_name> TO billingdb_reader;
+  ...

From 70d296594bb0684c4f2ab1e1886554e65b4a8669 Mon Sep 17 00:00:00 2001
From: Pablo Martin
Date: Wed, 18 Dec 2024 14:38:01 +0100
Subject: [PATCH 08/12] typo

---
 human-script.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/human-script.md b/human-script.md
index 03b581b..c262f02 100644
--- a/human-script.md
+++ b/human-script.md
@@ -524,7 +524,7 @@ Follow this to deploy the entire data infra.

  GRANT USAGE ON SCHEMA reporting TO billingdb_reader;
  GRANT SELECT ON TABLE reporting.<table_name> TO billingdb_reader;
-  GRANT SELECT ON TABLE reporting.<table_name> TO billingdb_reader;
+  GRANT SELECT ON TABLE reporting.<table_name> TO billingdb_reader;
  ...
  ```
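To double-check that those table-level grants landed as intended, a quick hedged verification from a terminal; the host is a placeholder:

```bash
# Hedged sketch: list the access privileges on the reporting tables.
# In psql's output, billingdb_reader should appear only on the granted tables.
psql "host=<your-dwh-host> dbname=dwh user=dwh_admin_<env>" -c '\dp reporting.*'
```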
From 797e9506bfbdbdadef659fb6168d6144e3876ac5 Mon Sep 17 00:00:00 2001
From: Pablo Martin
Date: Wed, 2 Apr 2025 15:58:55 +0200
Subject: [PATCH 09/12] add new steps

---
 human-script.md | 122 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 122 insertions(+)

diff --git a/human-script.md b/human-script.md
index c262f02..6a23fad 100644
--- a/human-script.md
+++ b/human-script.md
@@ -437,6 +437,7 @@ Follow this to deploy the entire data infra.
   - A user `powerbi_user`, with the `consumer` role.
   - A user `airbyte_user`, with permission to create new schemas.
   - A user `billingdb_reader`, with permission to read some tables from the reporting schema.
+  - A user `ci_reader`, with the `modeler` role.

 - *Note: replace the password fields with serious passwords and note them down.*
 - *Note: replace the name of the admin user.*
@@ -468,6 +469,10 @@ Follow this to deploy the entire data infra.
  CREATE ROLE billingdb_reader LOGIN PASSWORD 'password' VALID UNTIL 'infinity';

  CREATE ROLE modeler INHERIT;
+
+  CREATE ROLE ci_reader LOGIN PASSWORD 'password' VALID UNTIL 'infinity';
+  GRANT modeler TO ci_reader;
+
  -- You might want to create a first personal user with modeler role here

  -- Login as airbyte_user
@@ -764,6 +769,123 @@ Follow this to deploy the entire data infra.
 - Our dbt project () can be deployed on any Linux VM within the virtual network. The instructions on how to deploy and schedule it are in the project repository.
 - You can opt to deploy it in the same machine where airbyte is stored, since that machine is probably fairly underutilized.

+### 080.1 dbt CI server
+
+Having CI pipelines in the dbt git project is a great way to automate certain quality checks around the DWH code. The way our CI strategy is designed, you need to prepare a VM within our Data private network for CI jobs to run in. This section explains how to set up the VM. Note that we will only cover infrastructure topics here: you'll have to check the dbt repository for the full story on how to set up the CI. We recommend covering the steps described here before jumping into the dbt-specific part of things.

#### 080.1.1 Deploying the CI VM

- We will have a dedicated VM for the CI pipelines. The pipelines can be resource hungry at times, so I recommend having a dedicated VM that is not shared with other workloads so you can assign resources adequately and avoid resource competition with other services.
- Create a new VM following these steps (an equivalent CLI sketch follows this list).
  - Basic settings
    - Name it: `pipeline-host-<env>`
    - Use Ubuntu Server 22.04
    - Size should be adjusted to the needs of the dbt project. I suggest starting on a `B2s` instance and driving upgrade decisions based on what you observe during normal usage.
    - Use username: `azureuser`
    - Use the SSH Key: `superhog-data-general-ssh-<env>`
    - Select the option `None` for Public inbound ports.
  - Disk settings
    - Disk requirements will vary depending on the nature of the dbt project state and the PRs. I suggest starting with the default 30 GB and monitoring usage. If you see spikes that get close to 100%, increase the size to prevent a particularly heavy PR from consuming all space.
  - Networking
    - Attach to the virtual network `superhog-data-vnet-<env>`
    - Attach to the subnet `services-subnet`
    - Assign no public IP.
    - For setting `NIC network security group` select option `None`
  - Management settings
    - Defaults are fine.
  - Monitoring
    - Defaults are fine.
  - Advanced
    - Defaults are fine.
  - Add tags:
    - `team: data`
    - `environment: <env>`
    - `project: dbt`
- Once the VM is running, you should be able to ssh into the machine when your VPN is active.
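If you prefer the CLI over the portal, the VM creation can be sketched with the Azure CLI roughly as below. Take it as an assumption-heavy sketch: the image URN, resource names and flags should be checked against your subscription before use.

```bash
# Hedged sketch: create the CI VM with the Azure CLI instead of the portal.
# Names, the image URN and the size are placeholders to verify before running.
az vm create \
  --resource-group superhog-data-rg-<env> \
  --name pipeline-host-<env> \
  --image Canonical:0001-com-ubuntu-server-jammy:22_04-lts-gen2:latest \
  --size Standard_B2s \
  --admin-username azureuser \
  --ssh-key-values ~/.ssh/superhog-data-general-ssh-<env>.pub \
  --vnet-name superhog-data-vnet-<env> \
  --subnet services-subnet \
  --public-ip-address "" \
  --nsg "" \
  --tags team=data environment=<env> project=dbt
```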
#### 080.1.2 Install docker and docker compose

- We will use docker and docker compose to run a dockerized Postgres server in the VM.
- You can install docker and docker compose by placing the following code in a script and running it:

```bash
#!/bin/bash
set -e # Exit on error

echo "🔄 Updating system packages..."
sudo apt update && sudo apt upgrade -y

echo "📦 Installing dependencies..."
sudo apt install -y \
    apt-transport-https \
    ca-certificates \
    curl \
    software-properties-common \
    lsb-release \
    gnupg2 \
    jq

echo "🔑 Adding Docker GPG key..."
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg

echo "🖋️ Adding Docker repository..."
echo "deb [arch=amd64 signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null

echo "📦 Installing Docker..."
sudo apt update
sudo apt install -y docker-ce docker-ce-cli containerd.io

echo "✅ Docker installed successfully!"

echo "🔧 Enabling Docker to start on boot..."
sudo systemctl enable docker

echo "🔄 Installing Docker Compose..."
DOCKER_COMPOSE_VERSION=$(curl -s https://api.github.com/repos/docker/compose/releases/latest | jq -r .tag_name)
sudo curl -L "https://github.com/docker/compose/releases/download/${DOCKER_COMPOSE_VERSION}/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose

echo "📂 Setting permissions for Docker Compose..."
sudo chmod +x /usr/local/bin/docker-compose

echo "✅ Docker Compose installed successfully!"

# Verifying installation
echo "🔍 Verifying Docker and Docker Compose versions..."
docker --version
docker-compose --version

usermod -a -G docker $USER
newgrp docker

echo "✅ Docker and Docker Compose installation completed!"
```

#### 080.1.3 Install psql

- CI pipelines require `psql`, the Postgres CLI client, to be available.
- You can install it simply with `sudo apt-get install postgresql-client-16`.

#### 080.1.4 Create user in DWH

- The CI Postgres will use some Foreign Data Wrappers (FDW) pointing at the DWH. We will need a dedicated user in the DWH instance to control the permissions received by the CI server.
- The section of this guide dedicated to setting up the DWH explains how to create this user. If you have followed it, you might have already created the user. Otherwise, head there to complete this part.

#### 080.1.5 Install the Azure Devops agent and sync with Devops

- The VM needs to have a Microsoft provided Azure agent to be reachable by Devops. This agent listens to requests from Devops, basically allowing Devops to execute things on the VM.
- Some configuration needs to be done in the Azure Devops project to allow Azure Devops to reach the VM.
- You can find how to set this up in Ubuntu in these links:
  - Official MSFT docs: https://learn.microsoft.com/en-us/azure/devops/pipelines/agents/linux-agent?view=azure-devops
  - Helpful walkthrough video: https://www.youtube.com/watch?v=Hy6fne9oQJM

#### 080.1.6 Clone the project and further steps

- We are going to need a local clone of the git repository to perform some setup steps, as well as for business as usual execution.
- To do this:
  - Use or create some SSH key to have access to clone repos from Azure Devops. This could be the key `superhog-data-general-ssh-<env>` or some other key. This guide leaves this detail up to you. You can read more on how to use SSH keys with Azure Devops here: https://learn.microsoft.com/en-us/azure/devops/repos/git/use-ssh-keys-to-authenticate?view=azure-devops.
  - Once the CI VM is capable, clone the dbt project into the `azureuser` home dir (a hedged sketch of the clone step follows).
  - There are several steps after this, for which you should find instructions in the dbt repository itself.
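For reference, the clone step could look roughly like this once the key is in place; the organization, project and repository names are placeholders, not the real ones.

```bash
# Hedged sketch: clone the dbt project over SSH from Azure DevOps.
# <org>, <project> and <repo> are placeholders for your DevOps coordinates.
eval "$(ssh-agent -s)"
ssh-add ~/.ssh/superhog-data-general-ssh-<env>
cd /home/azureuser
git clone git@ssh.dev.azure.com:v3/<org>/<project>/<repo>
```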
From 1996bab59501f8d4aa3666b17ee2bffce8d2ed3b Mon Sep 17 00:00:00 2001
From: Pablo Martin
Date: Fri, 4 Apr 2025 11:54:22 +0200
Subject: [PATCH 10/12] fix psql install

---
 human-script.md | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/human-script.md b/human-script.md
index 6a23fad..6816a13 100644
--- a/human-script.md
+++ b/human-script.md
@@ -863,7 +863,18 @@ echo "✅ Docker and Docker Compose installation completed!"

 #### 080.1.3 Install psql

 - CI pipelines require `psql`, the Postgres CLI client, to be available.
-- You can install it simply with `sudo apt-get install postgresql-client-16`.
+- You can install it with the following script:
+
+```bash
+sudo apt-get update
+sudo apt-get install -y gnupg2 wget nano
+
+sudo sh -c 'echo "deb http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" > /etc/apt/sources.list.d/pgdg.list'
+curl -fsSL https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo gpg --dearmor -o /etc/apt/trusted.gpg.d/postgresql.gpg
+sudo apt-get update
+
+sudo apt-get install -y postgresql-client-16
+```

From 4035d96369ba753d789fa59703a2ec0b5a13697a Mon Sep 17 00:00:00 2001
From: Pablo Martin
Date: Fri, 4 Apr 2025 15:36:19 +0200
Subject: [PATCH 11/12] final touches from real life testing

---
 human-script.md | 31 ++++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/human-script.md b/human-script.md
index 6816a13..53b3674 100644
--- a/human-script.md
+++ b/human-script.md
@@ -854,7 +854,7 @@ echo "🔍 Verifying Docker and Docker Compose versions..."
 docker --version
 docker-compose --version

-usermod -a -G docker $USER
+sudo usermod -a -G docker $USER
 newgrp docker

 echo "✅ Docker and Docker Compose installation completed!"
@@ -876,25 +876,42 @@ sudo apt-get update

 sudo apt-get install -y postgresql-client-16
 ```

-#### 080.1.4 Create user in DWH
+#### 080.1.4 Install Python
+
+- Python is needed to create virtual environments and run dbt and other commands.
+- You can use the following script to install Python and some required packages:
+
+```bash
+sudo apt-get install python3.10 python3.10-venv
+```
+
+#### 080.1.5 Create user in DWH

 - The CI Postgres will use some Foreign Data Wrappers (FDW) pointing at the DWH. We will need a dedicated user in the DWH instance to control the permissions received by the CI server.
 - The section of this guide dedicated to setting up the DWH explains how to create this user. If you have followed it, you might have already created the user. Otherwise, head there to complete this part.

-#### 080.1.5 Install the Azure Devops agent and sync with Devops
+#### 080.1.6 Install the Azure Devops agent and sync with Devops

 - The VM needs to have a Microsoft provided Azure agent to be reachable by Devops. This agent listens to requests from Devops, basically allowing Devops to execute things on the VM.
-- Some configuration needs to be done in the Azure Devops project to allow Azure Devops to reach the VM.
+- Some configuration needs to be done in the Azure Devops project to allow Azure Devops to reach the VM. This might include creating a pool if it doesn't exist.
 - You can find how to set this up in Ubuntu in these links:
   - Official MSFT docs: https://learn.microsoft.com/en-us/azure/devops/pipelines/agents/linux-agent?view=azure-devops
   - Helpful walkthrough video: https://www.youtube.com/watch?v=Hy6fne9oQJM
+- Make sure to install the agent as a systemd service to have it always run on boot. The details are explained in Microsoft's documentation (a hedged sketch of the flow follows this list).
+- Once the agent is installed and correctly linked to one of our Pools in Devops, you should see the agent listed in the Devops UI for that pool, with status online. Don't move on if you haven't succeeded on this point.
-#### 080.1.6 Clone the project and further steps
+#### 080.1.7 Clone the project and further steps

 - We are going to need a local clone of the git repository to perform some setup steps, as well as for business as usual execution.
 - To do this:
-  - Use or create some SSH key to have access to clone repos from Azure Devops. This could be the key `superhog-data-general-ssh-<env>` or some other key. This guide leaves this detail up to you. You can read more on how to use SSH keys with Azure Devops here: https://learn.microsoft.com/en-us/azure/devops/repos/git/use-ssh-keys-to-authenticate?view=azure-devops.
+  - Add some SSH key to the VM to have access to clone repos from Azure Devops. This could be the key `superhog-data-general-ssh-<env>` or some other key. This guide leaves this detail up to you. You can read more on how to use SSH keys with Azure Devops here: https://learn.microsoft.com/en-us/azure/devops/repos/git/use-ssh-keys-to-authenticate?view=azure-devops.
+  - Also add this config to make SSH cloning work. Note that the details might have changed since this guide was written, so your mileage may vary.
+
+    ```
+    Host ssh.dev.azure.com
+      Hostname ssh.dev.azure.com
+      IdentityFile ~/.ssh/<your-key>
+    ```
+
-  - Once the CI VM is capable, clone the dbt project into the `azureuser` home dir.
+  - Once the CI VM is SSH capable, clone the dbt project into the `azureuser` home dir.
   - There are several steps after this, for which you should find instructions in the dbt repository itself.

From 6a3cfb7f758e3de032b881bbbf871318335276e7 Mon Sep 17 00:00:00 2001
From: Pablo Martin
Date: Tue, 8 Apr 2025 12:08:53 +0200
Subject: [PATCH 12/12] tiny arg

---
 human-script.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/human-script.md b/human-script.md
index 53b3674..f197d76 100644
--- a/human-script.md
+++ b/human-script.md
@@ -882,7 +882,7 @@ sudo apt-get install -y postgresql-client-16

 - You can use the following script to install Python and some required packages:

 ```bash
-sudo apt-get install python3.10 python3.10-venv
+sudo apt-get install -y python3.10 python3.10-venv
 ```

 #### 080.1.5 Create user in DWH