data-dwh-dbt-project/find_orphan_models_in_db.sh

147 lines
4.5 KiB
Bash
Raw Normal View History

2025-07-04 12:25:21 +02:00
#!/bin/bash
set -euo pipefail
STARTING_DIR="/home/azureuser"
2025-07-08 17:05:45 +02:00
cd "$STARTING_DIR"
2025-07-04 12:25:21 +02:00
# === CONFIGURATION ===
DBT_PROJECT="dwh_dbt"
DBT_TARGET="prd"
PROFILE_YML="$STARTING_DIR/.dbt/profiles.yml"
# === Flag defaults ===
SEND_SLACK=false
# === Parse flags ===
while [[ $# -gt 0 ]]; do
case "$1" in
-s|--slack)
SEND_SLACK=true
shift
;;
-*)
echo "❌ Unknown option: $1"
exit 1
;;
*)
break
;;
esac
done
# === Positional arguments ===
SCHEMAS="$1"
MANIFEST_PATH="$2"
shift 2
IFS=',' read -r -a SCHEMA_ARRAY <<< "$SCHEMAS"
# === Tool check/install ===
install_tool_if_missing() {
TOOL_CALL_NAME=$1
TOOL_APT_NAME=$2
if ! command -v "$TOOL_CALL_NAME" &>/dev/null; then
echo "🔧 Installing missing tool: $TOOL_APT_NAME"
sudo apt-get update -qq
sudo apt-get install -y "$TOOL_APT_NAME"
else
echo "$TOOL_APT_NAME is installed"
fi
}
install_tool_if_missing jq jq
install_tool_if_missing yq yq
install_tool_if_missing psql postgresql-client
# === Slack webhook setup ===
script_dir=$(dirname "$0")
webhooks_file="slack_webhook_urls.txt"
env_file="$script_dir/$webhooks_file"
if [ -f "$env_file" ]; then
export $(grep -v '^#' "$env_file" | xargs)
else
echo "Error: $webhooks_file file not found in the script directory."
exit 1
fi
# === Load DB credentials from profiles.yml ===
echo "🔐 Loading DB credentials from $PROFILE_YML..."
DB_NAME=$(yq e ".${DBT_PROJECT}.outputs.${DBT_TARGET}.dbname" "$PROFILE_YML")
DB_USER=$(yq e ".${DBT_PROJECT}.outputs.${DBT_TARGET}.user" "$PROFILE_YML")
DB_HOST=$(yq e ".${DBT_PROJECT}.outputs.${DBT_TARGET}.host" "$PROFILE_YML")
DB_PORT=$(yq e ".${DBT_PROJECT}.outputs.${DBT_TARGET}.port" "$PROFILE_YML")
export PGPASSWORD=$(yq e ".${DBT_PROJECT}.outputs.${DBT_TARGET}.pass" "$PROFILE_YML")
# === Get list of tables/views from Postgres ===
echo "🗃️ Reading current tables/views from PostgreSQL..."
POSTGRES_OBJECTS=()
for SCHEMA in "${SCHEMA_ARRAY[@]}"; do
echo "🔎 Scanning schema: $SCHEMA"
TABLES=$(psql -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER" -d "$DB_NAME" -Atc "
2025-07-08 17:05:45 +02:00
SELECT LOWER(table_schema || '.' || table_name)
2025-07-04 12:25:21 +02:00
FROM information_schema.tables
WHERE table_schema = '$SCHEMA'
2025-07-08 17:05:45 +02:00
AND table_type IN ('BASE TABLE', 'VIEW')
AND table_name NOT LIKE 'pg_%'
ORDER BY table_schema, table_name;
2025-07-04 12:25:21 +02:00
")
while IFS= read -r tbl; do
2025-07-08 17:05:45 +02:00
tbl_cleaned=$(echo "$tbl" | tr -d '[:space:]')
[[ -n "$tbl_cleaned" ]] && POSTGRES_OBJECTS+=("$tbl_cleaned")
2025-07-04 12:25:21 +02:00
done <<< "$TABLES"
done
POSTGRES_OBJECTS=($(printf "%s\n" "${POSTGRES_OBJECTS[@]}" | sort -u))
# === Parse manifest.json for dbt model output names ===
echo "📦 Extracting model output names from dbt manifest..."
DBT_OBJECTS=()
DBT_ENTRIES=$(jq -r '
.nodes | to_entries[] |
select(.value.resource_type == "model" or .value.resource_type == "seed") |
.value.schema + "." + .value.alias
' "$MANIFEST_PATH")
while IFS= read -r entry; do
2025-07-08 17:05:45 +02:00
entry_cleaned=$(echo "$entry" | tr -d '[:space:]' | tr '[:upper:]' '[:lower:]')
[[ -n "$entry_cleaned" ]] && DBT_OBJECTS+=("$entry_cleaned")
2025-07-04 12:25:21 +02:00
done <<< "$DBT_ENTRIES"
DBT_OBJECTS=($(printf "%s\n" "${DBT_OBJECTS[@]}" | sort -u))
# === Compare ===
echo "📊 Comparing DBT models vs Postgres state..."
2025-07-08 17:05:45 +02:00
RELEVANT_MODELS=($(comm -12 <(printf "%s\n" "${POSTGRES_OBJECTS[@]}" | sort) <(printf "%s\n" "${DBT_OBJECTS[@]}" | sort)))
STALE_MODELS=($(comm -23 <(printf "%s\n" "${POSTGRES_OBJECTS[@]}" | sort) <(printf "%s\n" "${DBT_OBJECTS[@]}" | sort)))
2025-07-04 12:25:21 +02:00
# === Output ===
echo ""
echo "✅ Relevant models (in both DB and DBT):"
printf "%s\n" "${RELEVANT_MODELS[@]}" | sort
echo ""
echo "⚠️ Stale models (in DB but NOT in DBT):"
printf "%s\n" "${STALE_MODELS[@]}" | sort
# === Format stale models for Slack ===
if [ "$SEND_SLACK" = true ]; then
echo "✅ Sending slack message with results."
if [ ${#STALE_MODELS[@]} -eq 0 ]; then
SLACK_MSG=":white_check_mark::white_check_mark::white_check_mark: dbt models reviewed. No stale models found in the database! :white_check_mark::white_check_mark::white_check_mark:"
curl -X POST -H 'Content-type: application/json' \
--data "{\"text\":\"$SLACK_MSG\"}" \
"$SLACK_RECEIPT_WEBHOOK_URL"
else
SLACK_MSG=":rotating_light::rotating_light::rotating_light: Stale models detected in Postgres (not in dbt manifest): :rotating_light::rotating_light::rotating_light:\n"
for model in "${STALE_MODELS[@]}"; do
SLACK_MSG+="- \`$model\`\n"
done
curl -X POST -H 'Content-type: application/json' \
--data "{\"text\":\"$SLACK_MSG\"}" \
"$SLACK_ALERT_WEBHOOK_URL"
fi
fi