First commit

This commit is contained in:
uri 2024-11-21 11:36:30 +01:00
parent 1af838b661
commit fb36843b4f
5 changed files with 314 additions and 14 deletions

162
.gitignore vendored Normal file
View file

@ -0,0 +1,162 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

View file

@ -1,20 +1,30 @@
# Introduction
TODO: Give a short introduction of your project. Let this section explain the objectives or the motivation behind this project.
A small repository for saving and sharing Jupyter notebooks within the Data Team.
# Getting Started
TODO: Guide users through getting your code up and running on their own system. In this section you can talk about:
1. Installation process
2. Software dependencies
3. Latest releases
4. API references
# Build and Test
TODO: Describe and show how to build your code and run the tests.
### Basics
# Contribute
TODO: Explain how other users and developers can contribute to make your code better.
- Pre-requisites
- You need a Unix-like environment: Linux, macOS, or WSL.
- You need to have Python `>=3.10` installed.
- All docs will assume you are using VSCode.
- Also install the following VSCode Python extension: ms-python.python
- Set up
- Create a virtual environment for the project with `python3 -m venv venv`.
- It's recommended that you set the new `venv` as your default interpreter in VSCode. To do this, press Ctrl+Shift+P, search for the `Python: Select Interpreter` command, and choose the new `venv`.
- Ensure that VS Code is using this virtual environment. You can activate it in a terminal by running `source venv/bin/activate`.
- With the virtual environment active, run `pip install -r requirements.txt` to install the dependencies.
- Lastly, install the following extension so that VS Code can render the notebooks:
https://marketplace.visualstudio.com/items?itemName=ms-toolsai.jupyter
If you want to learn more about creating good readme files then refer the following [guidelines](https://docs.microsoft.com/en-us/azure/devops/repos/git/create-a-readme?view=azure-devops). You can also seek inspiration from the below readme files:
- [ASP.NET Core](https://github.com/aspnet/Home)
- [Visual Studio Code](https://github.com/Microsoft/vscode)
- [Chakra Core](https://github.com/Microsoft/ChakraCore)
### DWH connection
To connect to the DWH, you need to create a local file with your credentials. You can use the file `credentials_example.yml` as a template. Remember to fill in the user and password.
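Once done, save the credentials file at the following path inside your home directory (on macOS, your home directory is `/Users/{your_user}` rather than `/home/{your_user}`):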
`/home/{your_user}/.superhog-dwh/credentials.yml`
Since this file contains credentials, you need to secure it so that only your user can read and write it. Run:
`chmod 600 /home/{your_user}/.superhog-dwh/credentials.yml`
Once you've completed the previous steps, try running the code in the `template.ipynb` file. If it runs without errors, your setup is complete. If not, check with someone in the Data Team.

7
credentials_example.yml Normal file
View file

@ -0,0 +1,7 @@
envs:
prd:
user: <dwh-user>
password: <dwh-password>
host: superhog-dwh-prd.postgres.database.azure.com
port: 5432
database: dwh

6
requirements.txt Normal file
View file

@ -0,0 +1,6 @@
jupyter
pandas
pyyaml
sqlalchemy
psycopg2-binary
seaborn

115
template.ipynb Normal file
View file

@ -0,0 +1,115 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This is a template for general jupyter notebooks."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"import pathlib\n",
"import yaml\n",
"import pandas as pd\n",
"import numpy as np\n",
"from sqlalchemy import create_engine\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/home/uri/.superhog-dwh/credentials.yml\n"
]
}
],
"source": [
"CREDS_FILEPATH = pathlib.Path.home() / \".superhog-dwh\" / \"credentials.yml\"\n",
"print(CREDS_FILEPATH)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# Prepare connection to DWH\n",
"# Function to read credentials from the YAML file\n",
"def read_credentials(yaml_path: str, env: str = \"prd\"):\n",
" with open(yaml_path, \"r\") as file:\n",
" credentials = yaml.safe_load(file)\n",
" return credentials[\"envs\"][env]\n",
"# Function to create a PostgreSQL connection string\n",
"def create_postgres_engine(creds: dict):\n",
" user = creds[\"user\"]\n",
" password = creds[\"password\"]\n",
" host = creds[\"host\"]\n",
" port = creds[\"port\"]\n",
" database = creds[\"database\"]\n",
" # Create the connection string for SQLAlchemy\n",
" connection_string = f\"postgresql://{user}:{password}@{host}:{port}/{database}\"\n",
" engine = create_engine(connection_string)\n",
" return engine\n",
"# Function to execute a query and return the result as a pandas DataFrame\n",
"def query_to_dataframe(engine, query: str):\n",
" with engine.connect() as connection:\n",
" df = pd.read_sql(query, connection)\n",
" return df\n",
"dwh_creds = read_credentials(yaml_path=CREDS_FILEPATH, env=\"prd\")\n",
"dwh_pg_engine = create_postgres_engine(creds=dwh_creds)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ?column?\n",
"0 1\n"
]
}
],
"source": [
"# Silly query to test things out\n",
"test_df = query_to_dataframe(engine=dwh_pg_engine, query=\"SELECT 1;\")\n",
"print(test_df.head())"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}