Unfinished work

Use single quotes so that query works both in MySQL and Trino.
Naive duplication of MySQL elements but for Trino. Not working yet.
2023-04-20 15:36:16 +02:00 · 2023-03-30 13:58:15 +02:00 · 2023-03-29 17:28:16 +02:00 · 2023-03-29 17:18:41 +02:00 · 2023-03-29 17:18:14 +02:00 · 2023-03-29 17:18:00 +02:00
3 changed files with 237 additions and 79 deletions
--- a/README.md
+++ b/README.md
@ -143,6 +143,28 @@ with Flow(...) as flow:
        print("The data is bad!!!")
 ```

+**Run a Great Expectations validation on a Trino query**
+
+```python
+from lolafect.data_testing import run_data_test_on_trino
+
+with Flow(...) as flow:
+      
+    my_query = """SELECT something FROM somewhere"""
+    my_expectations = {...} # A bunch of things you want to validate on the result of the query
+    
+    validation_results = run_data_test_on_trino(
+        name="my-cool-validation",
+        trino_credentials={...},
+        query=my_query,
+        expectations=my_expectations
+    )
+
+    if not validation_results["success"]:
+        print("The data is bad!!!")
+```
+
+
 ### Slack

 **Send a warning message to slack if your tasks fails**
--- a/lolafect/data_testing.py
+++ b/lolafect/data_testing.py
@ -60,6 +60,51 @@ def run_data_test_on_mysql(

    return results

+@task()
+def run_data_test_on_trino(
+    name: str,
+    trino_credentials: dict,
+    query: str,
+    expectation_configurations: List[ExpectationConfiguration],
+    great_expectations_s3_bucket: str = DEFAULT_GREAT_EXPECTATIONS_S3_BUCKET,
+) -> dict:
+    """
+    Validate a query and an expectation suite against a given Trino server.
+
+    :param name: a unique name for the data test.
+    :param trino_credentials: credentials for the Trino cluster.
+    :param query: the query to test against.
+    :param expectation_configurations: the expectations on the dataset.
+    :param great_expectations_s3_bucket: the bucket where Great Expectations
+    files live.
+    :return: the result of the data test.
+    """
+    logger = prefect.context.get("logger")
+
+    logger.info("Creating data context.")
+    data_context = _create_in_memory_data_context_for_trino(
+        trino_credentials, great_expectations_s3_bucket
+    )
+    logger.info("Data context created.")
+    logger.info("Creating expectation suite.")
+    data_context = _create_expectation_suite(
+        data_context, name, expectation_configurations
+    )
+    logger.info("Expectation suite created.")
+    logger.info("Creating checkpoint.")
+    data_context = _create_checkpoint(
+        data_context,
+        f"{trino_credentials['host']}:{trino_credentials['port']}",
+        query,
+        name,
+    )
+    logger.info("Checkpoint created.")
+    logger.info("Running checkpoint.")
+    results = data_context.run_checkpoint(f"{name}_checkpoint")
+    logger.info("Checkpoint finished.")
+    logger.info(f"Validation result: {results['success']}")
+
+    return results

 def _create_in_memory_data_context_for_mysql(
    mysql_credentials: dict,
@ -111,6 +156,58 @@ def _create_in_memory_data_context_for_mysql(

    return data_context

+def _create_in_memory_data_context_for_trino(
+    trino_credentials: dict,
+    great_expectations_s3_bucket: str,
+) -> AbstractDataContext:
+    """
+    Create a DataContext without a YAML config file and specify a Trino
+    datasource.
+
+    :param trino_credentials: the creds to the mysql where the query will be
+    executed.
+    :param great_expectations_s3_bucket: the name of the bucket where Great
+    Exepctations files while be stored.
+    :return: the data context.
+    """
+
+    data_context = BaseDataContext(
+        project_config=DataContextConfig(
+            datasources={
+                f"{trino_credentials['host']}:{trino_credentials['port']}": DatasourceConfig(
+                    class_name="Datasource",
+                    execution_engine={
+                        "class_name": "SqlAlchemyExecutionEngine",
+                        "connection_string": f"trino://%s:%s@%s:%s/%s/%s"
+                        % (
+                                                 trino_credentials["user"],
+                                                 urlquote(trino_credentials["password"]),
+                                                 trino_credentials["host"],
+                                                 trino_credentials["port"],
+                                                 trino_credentials["catalog"],
+                                                 trino_credentials["schema"],
+                        ),
+                    },
+                    data_connectors={
+                        "default_runtime_data_connector_name": {
+                            "class_name": "RuntimeDataConnector",
+                            "batch_identifiers": ["default_identifier_name"],
+                        },
+                        "default_inferred_data_connector_name": {
+                            "class_name": "InferredAssetSqlDataConnector",
+                            "name": "whole_table",
+                        },
+                    },
+                )
+            },
+            store_backend_defaults=S3StoreBackendDefaults(
+                default_bucket_name=great_expectations_s3_bucket
+            ),
+        )
+    )
+
+    return data_context
+

 def _create_expectation_suite(
    data_context: AbstractDataContext,
--- a/tests/test_integration/test_data_testing.py
+++ b/tests/test_integration/test_data_testing.py
@ -1,7 +1,7 @@
 from great_expectations.core.expectation_configuration import ExpectationConfiguration

 from lolafect.lolaconfig import build_lolaconfig
-from lolafect.data_testing import run_data_test_on_mysql
+from lolafect.data_testing import run_data_test_on_mysql, run_data_test_on_trino
 from lolafect.connections import open_ssh_tunnel_with_s3_pkey, close_ssh_tunnel

 # __          __     _____  _   _ _____ _   _  _____ _
@ -18,100 +18,139 @@ from lolafect.connections import open_ssh_tunnel_with_s3_pkey, close_ssh_tunnel
 #   are working properly.

 TEST_LOLACONFIG = build_lolaconfig(flow_name="testing-suite")
+#1 AS a_one,
+#       'lol' AS a_string,
+#       NULL AS a_null
+TEST_QUERY = """
+SELECT *
+from app_lm_mysql_pl.comprea.market
+where id = 1
+"""

+TEST_EXPECTATIONS_THAT_FIT_DATA = [
+    ExpectationConfiguration(
+        expectation_type="expect_column_values_to_be_between",
+        kwargs={"column": "a_one", "min_value": 1, "max_value": 1},
+    ),
+    ExpectationConfiguration(
+        expectation_type="expect_column_values_to_match_like_pattern",
+        kwargs={"column": "a_string", "like_pattern": "%lol%"},
+    ),
+    ExpectationConfiguration(
+        expectation_type="expect_column_values_to_be_null",
+        kwargs={"column": "a_null"},
+    ),
+]

-def test_validation_on_mysql_succeeds():
+TEST_EXPECTATIONS_THAT_DONT_FIT_DATA = [
+    ExpectationConfiguration(
+        expectation_type="expect_column_values_to_be_between",
+        kwargs={"column": "a_one", "min_value": 2, "max_value": 2},
+    ),
+    ExpectationConfiguration(
+        expectation_type="expect_column_values_to_match_like_pattern",
+        kwargs={"column": "a_string", "like_pattern": "%xD%"},
+    ),
+    ExpectationConfiguration(
+        expectation_type="expect_column_values_to_not_be_null",
+        kwargs={"column": "a_null"},
+    ),
+]

-    test_query = """
-        SELECT 1 AS a_one,
-            "lol" AS a_string,
-             NULL AS a_null
-     """
-    test_expectations = [
-        ExpectationConfiguration(
-            expectation_type="expect_column_values_to_be_between",
-            kwargs={"column": "a_one", "min_value": 1, "max_value": 1},
-        ),
-        ExpectationConfiguration(
-            expectation_type="expect_column_values_to_match_like_pattern",
-            kwargs={"column": "a_string", "like_pattern": "%lol%"},
-        ),
-        ExpectationConfiguration(
-            expectation_type="expect_column_values_to_be_null",
-            kwargs={"column": "a_null"},
-        ),
-    ]
+#
+# def test_validation_on_mysql_succeeds():
+#     ssh_tunnel = open_ssh_tunnel_with_s3_pkey.run(
+#         s3_bucket_name=TEST_LOLACONFIG.S3_BUCKET_NAME,
+#         ssh_tunnel_credentials=TEST_LOLACONFIG.SSH_TUNNEL_CREDENTIALS,
+#         remote_target_host=TEST_LOLACONFIG.DW_CREDENTIALS["host"],
+#         remote_target_port=TEST_LOLACONFIG.DW_CREDENTIALS["port"],
+#     )
+#
+#     validation_result = run_data_test_on_mysql.run(
+#         name="lolafect-testing-test_validation_on_mysql_succeeds",
+#         mysql_credentials={
+#             "host": ssh_tunnel.local_bind_address[0],
+#             "port": ssh_tunnel.local_bind_address[1],
+#             "user": TEST_LOLACONFIG.DW_CREDENTIALS["user"],
+#             "password": TEST_LOLACONFIG.DW_CREDENTIALS["password"],
+#             "db": TEST_LOLACONFIG.DW_CREDENTIALS["default_db"],
+#         },
+#         query=TEST_QUERY,
+#         expectation_configurations=TEST_EXPECTATIONS_THAT_FIT_DATA,
+#     )
+#
+#     closed_tunnel = close_ssh_tunnel.run(ssh_tunnel)
+#
+#     data_test_passed = validation_result["success"] == True
+#
+#     assert data_test_passed
+#
+#
+# def test_validation_on_mysql_fails():
+#     ssh_tunnel = open_ssh_tunnel_with_s3_pkey.run(
+#         s3_bucket_name=TEST_LOLACONFIG.S3_BUCKET_NAME,
+#         ssh_tunnel_credentials=TEST_LOLACONFIG.SSH_TUNNEL_CREDENTIALS,
+#         remote_target_host=TEST_LOLACONFIG.DW_CREDENTIALS["host"],
+#         remote_target_port=TEST_LOLACONFIG.DW_CREDENTIALS["port"],
+#     )
+#
+#     validation_result = run_data_test_on_mysql.run(
+#         name="lolafect-testing-test_validation_on_mysql_fails",
+#         mysql_credentials={
+#             "host": ssh_tunnel.local_bind_address[0],
+#             "port": ssh_tunnel.local_bind_address[1],
+#             "user": TEST_LOLACONFIG.DW_CREDENTIALS["user"],
+#             "password": TEST_LOLACONFIG.DW_CREDENTIALS["password"],
+#             "db": TEST_LOLACONFIG.DW_CREDENTIALS["default_db"],
+#         },
+#         query=TEST_QUERY,
+#         expectation_configurations=TEST_EXPECTATIONS_THAT_DONT_FIT_DATA,
+#     )
+#
+#     closed_tunnel = close_ssh_tunnel.run(ssh_tunnel)
+#
+#     data_test_failed = validation_result["success"] == False
+#
+#     assert data_test_failed
+#

-    ssh_tunnel = open_ssh_tunnel_with_s3_pkey.run(
-        s3_bucket_name=TEST_LOLACONFIG.S3_BUCKET_NAME,
-        ssh_tunnel_credentials=TEST_LOLACONFIG.SSH_TUNNEL_CREDENTIALS,
-        remote_target_host=TEST_LOLACONFIG.DW_CREDENTIALS["host"],
-        remote_target_port=TEST_LOLACONFIG.DW_CREDENTIALS["port"],
-    )
-
-    validation_result = run_data_test_on_mysql.run(
-        name="lolafect-testing-test_validation_on_mysql_succeeds",
-        mysql_credentials={
-            "host": ssh_tunnel.local_bind_address[0],
-            "port": ssh_tunnel.local_bind_address[1],
-            "user": TEST_LOLACONFIG.DW_CREDENTIALS["user"],
-            "password": TEST_LOLACONFIG.DW_CREDENTIALS["password"],
-            "db": TEST_LOLACONFIG.DW_CREDENTIALS["default_db"],
+def test_validation_on_trino_succeeds():
+    validation_result = run_data_test_on_trino.run(
+        name="lolafect-testing-test_validation_on_mysql_fails",
+        trino_credentials={
+            "host": TEST_LOLACONFIG.TRINO_CREDENTIALS["host"],
+            "port": TEST_LOLACONFIG.TRINO_CREDENTIALS["port"],
+            "user": TEST_LOLACONFIG.TRINO_CREDENTIALS["user"],
+            "password": TEST_LOLACONFIG.TRINO_CREDENTIALS["password"],
+            "catalog": "data_dw",
+            "schema": "sandbox"
        },
-        query=test_query,
-        expectation_configurations=test_expectations,
+        query=TEST_QUERY,
+        expectation_configurations=TEST_EXPECTATIONS_THAT_FIT_DATA,
    )
-
-    closed_tunnel = close_ssh_tunnel.run(ssh_tunnel)
+    print("###############\n" * 20)
+    print(validation_result)

    data_test_passed = validation_result["success"] == True

    assert data_test_passed


-def test_validation_on_mysql_fails():
-    test_query = """
-        SELECT 1 AS a_one,
-            "lol" AS a_string,
-             NULL AS a_null
-    """
-    test_expectations = [
-        ExpectationConfiguration(
-            expectation_type="expect_column_values_to_be_between",
-            kwargs={"column": "a_one", "min_value": 2, "max_value": 2},
-        ),
-        ExpectationConfiguration(
-            expectation_type="expect_column_values_to_match_like_pattern",
-            kwargs={"column": "a_string", "like_pattern": "%xD%"},
-        ),
-        ExpectationConfiguration(
-            expectation_type="expect_column_values_to_not_be_null",
-            kwargs={"column": "a_null"},
-        ),
-    ]
-
-    ssh_tunnel = open_ssh_tunnel_with_s3_pkey.run(
-        s3_bucket_name=TEST_LOLACONFIG.S3_BUCKET_NAME,
-        ssh_tunnel_credentials=TEST_LOLACONFIG.SSH_TUNNEL_CREDENTIALS,
-        remote_target_host=TEST_LOLACONFIG.DW_CREDENTIALS["host"],
-        remote_target_port=TEST_LOLACONFIG.DW_CREDENTIALS["port"],
-    )
-
-    validation_result = run_data_test_on_mysql.run(
+def test_validation_on_trino_fails():
+    validation_result = run_data_test_on_trino.run(
        name="lolafect-testing-test_validation_on_mysql_fails",
-        mysql_credentials={
-            "host": ssh_tunnel.local_bind_address[0],
-            "port": ssh_tunnel.local_bind_address[1],
-            "user": TEST_LOLACONFIG.DW_CREDENTIALS["user"],
-            "password": TEST_LOLACONFIG.DW_CREDENTIALS["password"],
-            "db": TEST_LOLACONFIG.DW_CREDENTIALS["default_db"],
+        trino_credentials={
+            "host": TEST_LOLACONFIG.TRINO_CREDENTIALS["host"],
+            "port": TEST_LOLACONFIG.TRINO_CREDENTIALS["port"],
+            "user": TEST_LOLACONFIG.TRINO_CREDENTIALS["user"],
+            "password": TEST_LOLACONFIG.TRINO_CREDENTIALS["password"],
+            "catalog": "data_dw",
+            "schema": "sandbox"
        },
-        query=test_query,
-        expectation_configurations=test_expectations,
+        query=TEST_QUERY,
+        expectation_configurations=TEST_EXPECTATIONS_THAT_DONT_FIT_DATA,
    )

-    closed_tunnel = close_ssh_tunnel.run(ssh_tunnel)
-
    data_test_failed = validation_result["success"] == False

    assert data_test_failed
Author	SHA1	Message	Date
Pablo Martin	d9c131dcc6	Unfinished work	2023-04-20 15:36:16 +02:00
Pablo Martin	5066fe4382	Use single quotes so that query works both in MySQL and Trino.	2023-03-30 13:58:15 +02:00
Pablo Martin	19211eee73	Naive duplication of MySQL elements but for Trino. Not working yet.	2023-03-29 17:28:16 +02:00
Pablo Martin	5b97864e8d	Signature for new task	2023-03-29 17:18:41 +02:00
Pablo Martin	814a376e02	Add entry in readme showcasing new trino feature	2023-03-29 17:18:14 +02:00
Pablo Martin	b51c810a39	Add tests for trino feature	2023-03-29 17:18:00 +02:00
Pablo Martin	c3507cccec	Refactor shared elements to follow DRY	2023-03-29 17:17:44 +02:00