diff --git a/README.md b/README.md
index 8ce5d98..bd575ab 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,8 @@
 # The Complete dbt Bootcamp
 
 This is my personal repository for the Udemy course: The Complete dbt (Data Build Tool) Bootcamp: Zero to Hero.
+
+
+## Links
+
+- GitHub for the course: https://github.com/nordquant/complete-dbt-bootcamp-zero-to-hero
\ No newline at end of file
diff --git a/code_thingies/database/README.md b/code_thingies/database/README.md
new file mode 100644
index 0000000..823e65d
--- /dev/null
+++ b/code_thingies/database/README.md
@@ -0,0 +1,128 @@
+# Database
+
+The course is designed to be done on Snowflake. But I am a stubborn idiot, and I want to try dbt with PostgreSQL, so I'll just do that.
+
+This directory contains the pieces needed to spin up a local PostgreSQL instance with Docker that resembles the Snowflake environment proposed in the course as closely as possible.
+
+## Setup steps
+
+- Run `docker compose up` with the `docker-compose.yaml` file in this directory.
+- Run the following commands to get the database into its starting state:
+
+```sql
+CREATE USER transformation_user WITH ENCRYPTED PASSWORD 'transformation_user_password';
+
+CREATE DATABASE airbnb;
+
+-- Connect to your newly created `airbnb` database for the next commands.
+
+CREATE SCHEMA raw;
+
+-- Schema-qualify the tables so they end up in `raw` rather than `public`.
+CREATE TABLE raw.raw_listings (
+    id INTEGER,
+    listing_url VARCHAR(1000),
+    name VARCHAR(256),
+    room_type VARCHAR(256),
+    minimum_nights INTEGER,
+    host_id INTEGER,
+    price VARCHAR(256),
+    created_at TIMESTAMP,
+    updated_at TIMESTAMP
+);
+
+CREATE TABLE raw.raw_reviews (
+    listing_id INTEGER,
+    date TIMESTAMP,
+    reviewer_name VARCHAR(256),
+    comments TEXT,
+    sentiment TEXT
+);
+
+CREATE TABLE raw.raw_hosts (
+    id INTEGER,
+    name VARCHAR(256),
+    is_superhost VARCHAR(256),
+    created_at TIMESTAMP,
+    updated_at TIMESTAMP
+);
+```
+
+Afterwards, download the CSV files that populate the database. The AWS CLI commands below will do that:
+
+```bash
+aws s3 cp s3://dbtlearn/listings.csv listings.csv
+aws s3 cp s3://dbtlearn/reviews.csv reviews.csv
+aws s3 cp s3://dbtlearn/hosts.csv hosts.csv
+```
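+
+The Snowflake instructions below load these CSVs straight from S3; for the local PostgreSQL setup they still have to be loaded by hand. Here is a minimal sketch using psql's `\copy` meta-command, assuming the tables live in the `raw` schema as created above and the CSVs keep their header row:
+
+```bash
+# Load the downloaded CSVs into the raw tables (credentials match docker-compose.yaml)
+export PGPASSWORD=dbt_postgres_password
+psql -h localhost -p 5432 -U dbt_postgres_user -d airbnb \
+  -c "\copy raw.raw_listings FROM 'listings.csv' WITH (FORMAT csv, HEADER true)" \
+  -c "\copy raw.raw_reviews  FROM 'reviews.csv'  WITH (FORMAT csv, HEADER true)" \
+  -c "\copy raw.raw_hosts    FROM 'hosts.csv'    WITH (FORMAT csv, HEADER true)"
+```
+
+To point dbt at this database instead of Snowflake, a `profiles.yml` along these lines should work with the `dbt-postgres` adapter. The profile name, target schema and thread count are placeholders, and `transformation_user` will still need the relevant grants before it can read `raw` and build its own schemas:
+
+```yaml
+# ~/.dbt/profiles.yml -- sketch for the dbt-postgres adapter
+airbnb:
+  target: dev
+  outputs:
+    dev:
+      type: postgres
+      host: localhost
+      port: 5432
+      user: transformation_user
+      password: transformation_user_password
+      dbname: airbnb
+      schema: dev          # dbt builds its models here
+      threads: 4
+```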
+
+## Introduction and Environment Setup
+
+### Snowflake user creation
+
+Copy these SQL statements into a Snowflake Worksheet, select all and execute them (i.e. by pressing the play button).
+
+### Snowflake data import
+
+Copy these SQL statements into a Snowflake Worksheet, select all and execute them (i.e. by pressing the play button).
+
+```sql
+-- Set up the defaults
+USE WAREHOUSE COMPUTE_WH;
+USE DATABASE airbnb;
+USE SCHEMA RAW;
+
+-- Create our three tables and import the data from S3
+CREATE OR REPLACE TABLE raw_listings (
+    id integer,
+    listing_url string,
+    name string,
+    room_type string,
+    minimum_nights integer,
+    host_id integer,
+    price string,
+    created_at datetime,
+    updated_at datetime
+);
+
+COPY INTO raw_listings (id, listing_url, name, room_type, minimum_nights, host_id, price, created_at, updated_at)
+  FROM 's3://dbtlearn/listings.csv'
+  FILE_FORMAT = (type = 'CSV' skip_header = 1 FIELD_OPTIONALLY_ENCLOSED_BY = '"');
+
+CREATE OR REPLACE TABLE raw_reviews (
+    listing_id integer,
+    date datetime,
+    reviewer_name string,
+    comments string,
+    sentiment string
+);
+
+COPY INTO raw_reviews (listing_id, date, reviewer_name, comments, sentiment)
+  FROM 's3://dbtlearn/reviews.csv'
+  FILE_FORMAT = (type = 'CSV' skip_header = 1 FIELD_OPTIONALLY_ENCLOSED_BY = '"');
+
+CREATE OR REPLACE TABLE raw_hosts (
+    id integer,
+    name string,
+    is_superhost string,
+    created_at datetime,
+    updated_at datetime
+);
+
+COPY INTO raw_hosts (id, name, is_superhost, created_at, updated_at)
+  FROM 's3://dbtlearn/hosts.csv'
+  FILE_FORMAT = (type = 'CSV' skip_header = 1 FIELD_OPTIONALLY_ENCLOSED_BY = '"');
+```
diff --git a/code_thingies/database/docker-compose.yaml b/code_thingies/database/docker-compose.yaml
new file mode 100644
index 0000000..b31f477
--- /dev/null
+++ b/code_thingies/database/docker-compose.yaml
@@ -0,0 +1,43 @@
+version: '3.5'
+
+services:
+  postgres:
+    container_name: dbt_postgres
+    image: postgres:16
+    environment:
+      POSTGRES_USER: dbt_postgres_user
+      POSTGRES_PASSWORD: dbt_postgres_password
+      PGDATA: /data/postgres
+    volumes:
+      - postgres:/data/postgres
+    ports:
+      - "5432:5432"
+    networks:
+      - postgres
+    restart: unless-stopped
+
+  pgadmin:
+    container_name: pgadmin_container
+    image: dpage/pgadmin4
+    environment:
+      PGADMIN_DEFAULT_EMAIL: pgadmin_user@email.com
+      PGADMIN_DEFAULT_PASSWORD: pgadmin_password
+      PGADMIN_CONFIG_SERVER_MODE: 'False'
+      MASTER_PASSWORD: amasterpasswordshouldbelong
+    volumes:
+      - pgadmin:/var/lib/pgadmin
+    ports:
+      - "12345:80"
+    networks:
+      - postgres
+    restart: unless-stopped
+
+networks:
+  postgres:
+    driver: bridge
+
+volumes:
+  postgres:
+  pgadmin:
\ No newline at end of file
diff --git a/notes/1.md b/notes/1.md
index 596ffef..ba1f52b 100644
--- a/notes/1.md
+++ b/notes/1.md
@@ -97,4 +97,5 @@ dbt makes sense nowadays because the modern data stack makes transformations wit
 
 - ELT in Airbnb.
 - Data from insideairbnb.com/berlin/
-- The project will use snowflake as a DWH and preset (managed superset) as a BI tool
\ No newline at end of file
+- The project will use Snowflake as a DWH and Preset (managed Superset) as a BI tool.
+