Preparing database stuff

Pablo Martin 2023-10-27 10:05:53 +02:00
parent 2a76bdb73f
commit 9c96234a1f
4 changed files with 178 additions and 1 deletions


@@ -1,3 +1,8 @@
# The Complete dbt Bootcamp
This is my personal repository for the Udemy course: The Complete dbt (Data Build Tool) Bootcamp: Zero to Hero.
## Links
- Github for the course: https://github.com/nordquant/complete-dbt-bootcamp-zero-to-hero


@@ -0,0 +1,128 @@
# Database
The course is designed to be done on Snowflake. But I am a stubborn idiot, and I want to try dbt with PostgreSQL, so I'll just do that.
This dir contains some useful bits to spin up a local PostgreSQL instance with Docker that resembles, as closely as possible, the Snowflake environment proposed in the course.
## Setup steps
- Run `docker compose up` with the YAML file in this dir.
- Run the following commands to get the database ready in its starting state:
```sql
CREATE USER transformation_user WITH ENCRYPTED PASSWORD 'transformation_user_password';
CREATE DATABASE airbnb;
-- Connect to your newly created `airbnb` database for the next commands.
CREATE SCHEMA raw;
-- Qualify the tables with the `raw` schema so they don't land in `public`.
CREATE TABLE raw.raw_listings (
    id INTEGER,
    listing_url VARCHAR(1000),
    name VARCHAR(256),
    room_type VARCHAR(256),
    minimum_nights INTEGER,
    host_id INTEGER,
    price VARCHAR(256),
    created_at TIMESTAMP,
    updated_at TIMESTAMP
);
CREATE TABLE raw.raw_reviews (
    listing_id INTEGER,
    date TIMESTAMP,
    reviewer_name VARCHAR(256),
    comments TEXT,
    sentiment TEXT
);
CREATE TABLE raw.raw_hosts (
    id INTEGER,
    name VARCHAR(256),
    is_superhost VARCHAR(256),
    created_at TIMESTAMP,
    updated_at TIMESTAMP
);
```
Afterwards, you will have to download the CSV files with the data to populate the database. The AWS CLI commands below will do that:
```bash
aws s3 cp s3://dbtlearn/listings.csv listings.csv
aws s3 cp s3://dbtlearn/reviews.csv reviews.csv
aws s3 cp s3://dbtlearn/hosts.csv hosts.csv
```
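The course then loads these files with Snowflake's `COPY INTO` (see the next section). For the local PostgreSQL setup, something like the following psql `\copy` calls should work. This is a minimal sketch of my own, assuming the Postgres container from this dir's compose file is running, the `raw.*` tables above exist, and the CSV column order matches the table definitions:
```bash
# Hedged sketch, not part of the course material: load the CSVs into the raw schema.
export PGPASSWORD=dbt_postgres_password
psql -h localhost -p 5432 -U dbt_postgres_user -d airbnb \
  -c "\copy raw.raw_listings FROM 'listings.csv' WITH (FORMAT csv, HEADER true)"
psql -h localhost -p 5432 -U dbt_postgres_user -d airbnb \
  -c "\copy raw.raw_reviews FROM 'reviews.csv' WITH (FORMAT csv, HEADER true)"
psql -h localhost -p 5432 -U dbt_postgres_user -d airbnb \
  -c "\copy raw.raw_hosts FROM 'hosts.csv' WITH (FORMAT csv, HEADER true)"
```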
# Introduction and Environment Setup
## Snowflake user creation
Copy these SQL statements into a Snowflake Worksheet, select them all, and execute them (i.e., press the play button).
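The user-creation statements themselves are not reproduced here. As a rough, hedged sketch of what such a setup usually looks like, where the role, warehouse, user, password, and database names are my assumptions rather than the course's exact script:
```sql
-- Hedged sketch only; names and password are placeholders, adjust to the course material.
USE ROLE ACCOUNTADMIN;
CREATE ROLE IF NOT EXISTS transform;
GRANT ROLE transform TO ROLE ACCOUNTADMIN;
CREATE WAREHOUSE IF NOT EXISTS COMPUTE_WH;
CREATE USER IF NOT EXISTS dbt
  PASSWORD = 'a_strong_password'
  LOGIN_NAME = 'dbt'
  MUST_CHANGE_PASSWORD = FALSE
  DEFAULT_WAREHOUSE = 'COMPUTE_WH'
  DEFAULT_ROLE = transform
  DEFAULT_NAMESPACE = 'AIRBNB.RAW'
  COMMENT = 'dbt user for transformations';
GRANT ROLE transform TO USER dbt;
CREATE DATABASE IF NOT EXISTS AIRBNB;
CREATE SCHEMA IF NOT EXISTS AIRBNB.RAW;
GRANT ALL ON WAREHOUSE COMPUTE_WH TO ROLE transform;
GRANT ALL ON DATABASE AIRBNB TO ROLE transform;
GRANT ALL ON ALL SCHEMAS IN DATABASE AIRBNB TO ROLE transform;
GRANT ALL ON FUTURE SCHEMAS IN DATABASE AIRBNB TO ROLE transform;
GRANT ALL ON ALL TABLES IN SCHEMA AIRBNB.RAW TO ROLE transform;
GRANT ALL ON FUTURE TABLES IN SCHEMA AIRBNB.RAW TO ROLE transform;
```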
## Snowflake data import
Copy these SQL statements into a Snowflake Worksheet, select them all, and execute them (i.e., press the play button).
```sql
-- Set up the defaults
USE WAREHOUSE COMPUTE_WH;
USE DATABASE airbnb;
USE SCHEMA RAW;

-- Create our three tables and import the data from S3
CREATE OR REPLACE TABLE raw_listings (
    id INTEGER,
    listing_url STRING,
    name STRING,
    room_type STRING,
    minimum_nights INTEGER,
    host_id INTEGER,
    price STRING,
    created_at DATETIME,
    updated_at DATETIME
);

COPY INTO raw_listings (id, listing_url, name, room_type, minimum_nights,
                        host_id, price, created_at, updated_at)
FROM 's3://dbtlearn/listings.csv'
FILE_FORMAT = (TYPE = 'CSV' SKIP_HEADER = 1 FIELD_OPTIONALLY_ENCLOSED_BY = '"');

CREATE OR REPLACE TABLE raw_reviews (
    listing_id INTEGER,
    date DATETIME,
    reviewer_name STRING,
    comments STRING,
    sentiment STRING
);

COPY INTO raw_reviews (listing_id, date, reviewer_name, comments, sentiment)
FROM 's3://dbtlearn/reviews.csv'
FILE_FORMAT = (TYPE = 'CSV' SKIP_HEADER = 1 FIELD_OPTIONALLY_ENCLOSED_BY = '"');

CREATE OR REPLACE TABLE raw_hosts (
    id INTEGER,
    name STRING,
    is_superhost STRING,
    created_at DATETIME,
    updated_at DATETIME
);

COPY INTO raw_hosts (id, name, is_superhost, created_at, updated_at)
FROM 's3://dbtlearn/hosts.csv'
FILE_FORMAT = (TYPE = 'CSV' SKIP_HEADER = 1 FIELD_OPTIONALLY_ENCLOSED_BY = '"');
```
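After the loads complete, a quick row-count check (my own hedged addition, not part of the course script) confirms that the three tables were populated:
```sql
-- Sanity check; exact counts depend on the current CSVs in the dbtlearn bucket.
SELECT 'raw_listings' AS table_name, COUNT(*) AS row_count FROM raw_listings
UNION ALL
SELECT 'raw_reviews', COUNT(*) FROM raw_reviews
UNION ALL
SELECT 'raw_hosts', COUNT(*) FROM raw_hosts;
```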


@@ -0,0 +1,43 @@
version: '3.5'

services:
  postgres:
    container_name: dbt_postgres
    image: postgres:16
    environment:
      POSTGRES_USER: dbt_postgres_user
      POSTGRES_PASSWORD: dbt_postgres_password
      PGDATA: /data/postgres
    volumes:
      - postgres:/data/postgres
    ports:
      - "5432:5432"
    networks:
      - postgres
    restart: unless-stopped

  pgadmin:
    container_name: pgadmin_container
    image: dpage/pgadmin4
    environment:
      PGADMIN_DEFAULT_EMAIL: pgadmin_user@email.com
      PGADMIN_DEFAULT_PASSWORD: pgadmin_password
      PGADMIN_CONFIG_SERVER_MODE: 'False'
      MASTER_PASSWORD: amasterpasswordshouldbelong
    volumes:
      - pgadmin:/var/lib/pgadmin
    ports:
      - "12345:80"
    networks:
      - postgres
    restart: unless-stopped

networks:
  postgres:
    driver: bridge

volumes:
  postgres:
  pgadmin:


@@ -97,4 +97,5 @@ dbt makes sense nowadays because the modern data stack makes transformations wit
- ELT at Airbnb.
- Data from insideairbnb.com/berlin/
- The project will use Snowflake as a DWH and Preset (managed Superset) as a BI tool.