Preparing database stuff
This commit is contained in:
parent
2a76bdb73f
commit
9c96234a1f
4 changed files with 178 additions and 1 deletions
|
|
@ -1,3 +1,8 @@
|
|||
# The Complete dbt Bootcamp
|
||||
|
||||
This is my personal repository for the Udemy course: The Complete dbt (Data Build Tool) Bootcamp: Zero to Hero.
|
||||
|
||||
|
||||
## Links
|
||||
|
||||
- Github for the course: https://github.com/nordquant/complete-dbt-bootcamp-zero-to-hero
|
||||
128
code_thingies/database/README.md
Normal file
128
code_thingies/database/README.md
Normal file
|
|
@ -0,0 +1,128 @@
|
|||
# Database
|
||||
|
||||
The course is designed to be done on Snowflake. But I am a stubborn idiot, and I want to try dbt with PostgreSQL, so I'll just do that.
|
||||
|
||||
This dir contains some useful bits to raise a local PostgreSQL instance with Docker that resembles as much as possible the Snowflake environment proposed in the course.
|
||||
|
||||
## Setup steps
|
||||
|
||||
- Run a `docker compose up` with the yaml file of this dir.
|
||||
- Run the following commands to get the database ready in its starting state
|
||||
|
||||
```SQL
|
||||
-- One-off bootstrap for the local PostgreSQL container. Goal (per the README
-- above): mirror the AIRBNB database / RAW schema layout the course builds
-- on Snowflake as closely as PostgreSQL allows.

-- Service account that dbt will connect as.
CREATE USER transformation_user WITH ENCRYPTED PASSWORD 'transformation_user_password';

CREATE DATABASE airbnb;

-- Connect to your newly created `airbnb` database for the next commands.

CREATE SCHEMA raw;

-- Schema-qualify the tables: a bare CREATE TABLE would put them in `public`
-- (the default search_path), not in the `raw` schema created above.
CREATE TABLE raw.raw_listings (
    id             INTEGER,
    listing_url    VARCHAR(1000),
    name           VARCHAR(256),
    room_type      VARCHAR(256),
    minimum_nights INTEGER,
    host_id        INTEGER,
    price          VARCHAR(256),  -- kept as text to match the source CSV as-is
    created_at     TIMESTAMP,
    updated_at     TIMESTAMP
);

CREATE TABLE raw.raw_reviews (
    listing_id    INTEGER,
    date          TIMESTAMP,
    reviewer_name VARCHAR(256),
    comments      TEXT,
    sentiment     TEXT
);

CREATE TABLE raw.raw_hosts (
    id           INTEGER,
    name         VARCHAR(256),
    is_superhost VARCHAR(256),
    created_at   TIMESTAMP,
    updated_at   TIMESTAMP
);

-- transformation_user was created with no privileges at all; without these
-- grants dbt cannot read the raw data or create its own target schemas.
GRANT USAGE ON SCHEMA raw TO transformation_user;
GRANT SELECT ON ALL TABLES IN SCHEMA raw TO transformation_user;
GRANT CREATE ON DATABASE airbnb TO transformation_user;
|
||||
|
||||
```
|
||||
|
||||
After, you will have to download some CSV files with the data to populate the database. The AWS CLI commands below will do that:
|
||||
|
||||
```bash
|
||||
aws s3 cp s3://dbtlearn/listings.csv listings.csv
|
||||
aws s3 cp s3://dbtlearn/reviews.csv reviews.csv
|
||||
aws s3 cp s3://dbtlearn/hosts.csv hosts.csv
|
||||
```
|
||||
|
||||
# Introduction and Environment Setup
|
||||
|
||||
## Snowflake user creation
|
||||
Copy these SQL statements into a Snowflake Worksheet, select all and execute them (i.e. pressing the play button).
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## Snowflake data import
|
||||
|
||||
Copy these SQL statements into a Snowflake Worksheet, select all and execute them (i.e. pressing the play button).
|
||||
|
||||
```sql
|
||||
-- Session defaults: every statement below runs against AIRBNB.RAW
-- on the COMPUTE_WH warehouse.
USE WAREHOUSE COMPUTE_WH;
USE DATABASE airbnb;
USE SCHEMA RAW;

-- Create the three raw tables and load each one straight from the
-- course's public S3 bucket.
CREATE OR REPLACE TABLE raw_listings (
    id             INTEGER,
    listing_url    STRING,
    name           STRING,
    room_type      STRING,
    minimum_nights INTEGER,
    host_id        INTEGER,
    price          STRING,
    created_at     DATETIME,
    updated_at     DATETIME
);

COPY INTO raw_listings (id, listing_url, name, room_type, minimum_nights,
                        host_id, price, created_at, updated_at)
FROM 's3://dbtlearn/listings.csv'
FILE_FORMAT = (TYPE = 'CSV' SKIP_HEADER = 1 FIELD_OPTIONALLY_ENCLOSED_BY = '"');


CREATE OR REPLACE TABLE raw_reviews (
    listing_id    INTEGER,
    date          DATETIME,
    reviewer_name STRING,
    comments      STRING,
    sentiment     STRING
);

COPY INTO raw_reviews (listing_id, date, reviewer_name, comments, sentiment)
FROM 's3://dbtlearn/reviews.csv'
FILE_FORMAT = (TYPE = 'CSV' SKIP_HEADER = 1 FIELD_OPTIONALLY_ENCLOSED_BY = '"');


CREATE OR REPLACE TABLE raw_hosts (
    id           INTEGER,
    name         STRING,
    is_superhost STRING,
    created_at   DATETIME,
    updated_at   DATETIME
);

COPY INTO raw_hosts (id, name, is_superhost, created_at, updated_at)
FROM 's3://dbtlearn/hosts.csv'
FILE_FORMAT = (TYPE = 'CSV' SKIP_HEADER = 1 FIELD_OPTIONALLY_ENCLOSED_BY = '"');
|
||||
43
code_thingies/database/docker-compose.yaml
Normal file
43
code_thingies/database/docker-compose.yaml
Normal file
|
|
@ -0,0 +1,43 @@
|
|||
# Local PostgreSQL + pgAdmin stack for the dbt course exercises.
# NOTE(review): the top-level `version` key is obsolete in the current
# Compose spec (v2 CLI ignores it with a warning) — confirm whether any
# consumer still needs it before removing.
version: '3.5'

services:
  postgres:
    container_name: dbt_postgres
    image: postgres:16
    environment:
      POSTGRES_USER: dbt_postgres_user
      POSTGRES_PASSWORD: dbt_postgres_password
      PGDATA: /data/postgres
    volumes:
      # Named volume keeps data across container recreation.
      - postgres:/data/postgres
    ports:
      - "5432:5432"
    networks:
      - postgres
    restart: unless-stopped

  pgadmin:
    container_name: pgadmin_container
    image: dpage/pgadmin4
    environment:
      PGADMIN_DEFAULT_EMAIL: pgadmin_user@email.com
      PGADMIN_DEFAULT_PASSWORD: pgadmin_password
      # Desktop mode: skip the pgAdmin login screen for local use.
      PGADMIN_CONFIG_SERVER_MODE: 'False'
      # NOTE(review): MASTER_PASSWORD does not look like a documented pgAdmin
      # container variable (they are usually PGADMIN_*-prefixed) — verify it
      # actually has an effect.
      MASTER_PASSWORD: amasterpasswordshouldbelong
    volumes:
      - pgadmin:/var/lib/pgadmin

    ports:
      # pgAdmin web UI on http://localhost:12345
      - "12345:80"
    networks:
      - postgres
    restart: unless-stopped


networks:
  postgres:
    driver: bridge

volumes:
  postgres:
  pgadmin:
|
||||
|
|
@ -97,4 +97,5 @@ dbt makes sense nowadays because the modern data stack makes transformations wit
|
|||
|
||||
- ELT in Airbnb.
|
||||
- Data from insideairbnb.com/berlin/
|
||||
- The project will use snowflake as a DWH and preset (managed superset) as a BI tool
|
||||
- The project will use snowflake as a DWH and preset (managed superset) as a BI tool.
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue