Typing, docstrings, formatting for capturer.py

2020-11-03 13:50:36 +01:00 · 2020-11-03 13:50:36 +01:00 · 43236c2884
commit 43236c2884
parent 3cf7dd8bd9
1 changed files with 101 additions and 26 deletions
--- a/capturer/capturer.py
+++ b/capturer/capturer.py
@ -15,23 +15,31 @@ import logging


 class Capturer:
-    sleep_time_no_work = 15
+    """
+    Daemon with the full flow of execution of individual ad requesting, data
+    scraping and db storage.
+    """

-    def __init__(self):
+    def __init__(self) -> None:
        self.last_try_datetime = datetime.datetime.now()

-    def start(self):
+    def start(self) -> None:
+        """
+        Full flow of execution. Checks whether it should capture a URL, tries
+        to do so and stores the result if successful.
+        :return: None
+        """

        logging.info("Starting capturer")
        while True:

-            if not self.in_working_hours():
+            if not self._in_working_hours():
                sleep(1800)
                logging.info("Waiting...")
                continue

            seconds_to_next_capture = (
-                minimum_seconds_between_tries() - self.seconds_since_last_try()
+                minimum_seconds_between_tries() - self._seconds_since_last_try()
            )
            if seconds_to_next_capture > 0:
                sleep(seconds_to_next_capture)
@ -39,7 +47,9 @@ class Capturer:

            pending_task = capturing_interface.get_pending_task()
            if not pending_task:
+                logging.info("No pending tasks.")
                continue
+
            task = CapturingTask(pending_task)
            self.last_try_datetime = datetime.datetime.now()
            task.capture()
@ -47,26 +57,46 @@ class Capturer:
            if task.status == "Data ready":
                ad_data = task.get_ad_data()
            else:
+                logging.warning("Something went wrong, not adding data.")
                continue

            capturas_interface.insert_captura(ad_data)
            task._update_status("Captura inserted")
+            logging.info("New ad inserted.")

-    def in_working_hours(self):
+    def _in_working_hours(self) -> bool:
+        """
+        Checks whether now is within the working hours of the daemon.
+        :return: True if so, false if not
+        """
        return (
            working_hours["start"]
            <= datetime.datetime.now().time()
            <= working_hours["end"]
        )

-    def seconds_since_last_try(self):
+    def _seconds_since_last_try(self) -> float:
+        """
+        Computes how many seconds have passed since the last capturing attempt.
+        :return: seconds since last try as a float
+        """
        return (datetime.datetime.now() - self.last_try_datetime).total_seconds()


 class CapturingTask:
+    """
+    Task object wrapping the process of attempting to capture an ad, parsing
+    the data and sending to db.
+    """
+
    sleep_time_failed_request = 180

-    def __init__(self, parameters):
+    def __init__(self, parameters) -> None:
+        """
+        Initialize with task parameters and mark the task as being worked on
+        in the task queue.
+        :param parameters: dict with the necessary parameters for the task
+        """
        self.uuid = parameters["uuid"]
        self.ad_url = parameters["ad_url"]
        self.uuid_exploring = parameters["fk_uuid_exploring"]
@ -76,15 +106,20 @@ class CapturingTask:

        self._update_status("Loading")

-    def _update_status(self, new_status):
+    def _update_status(self, new_status) -> None:
+        """
+        Updates the task status and persists it in the task queue.
+        :param new_status: string describing the new status
+        :return: None
+        """
        self.status = new_status
        capturing_interface.update_capturing_task(
            self.uuid, self.uuid_exploring, self.status, self.ad_url
        )

-    def capture(self):
+    def capture(self) -> None:
        """
-        Metodo principal que contiene el flujo de captura
+        Main flow of work
        """
        self._update_status("WIP")

@ -94,18 +129,19 @@ class CapturingTask:

            if attack.success:
                self.html = attack.get_text()
-
                self._extract_data()
                self._check_data()
                return

-            else:
+            if not attack.success:
                try:
                    if Refresher.dead_ad_checker(attack.get_text()):
                        self._update_status("Dead ad")
                        return
                except AttributeError:
-                    pass
+                    logging.error(
+                        "Something went wrong when checking if the ad is gone"
+                    )

                self._update_status("Fail {}".format(self.request_failures))
                self.request_failures += 1
@ -115,11 +151,20 @@ class CapturingTask:
        self._update_status("Surrender")
        logging.warning(f"A task has surrendered. {self.ad_url}")

-    def _extract_data(self):
+    def _extract_data(self) -> None:
+        """
+        Parses the obtained html to extract the ad information.
+        :return: None
+        """
        self.parser = AdHtmlParser(self.html)
        self.parser.parse()

-    def _check_data(self):
+    def _check_data(self) -> None:
+        """
+        Validates that all compulsory fields have been obtained and that their
+        values are valid. Sets the status of the task accordingly.
+        :return: None
+        """
        if self.parser.fields_missing():
            self._update_status("Fields missing")
            return
@ -130,12 +175,24 @@ class CapturingTask:

        self._update_status("Data ready")

-    def get_ad_data(self):
+    def get_ad_data(self) -> dict:
+        """
+        Returns the extracted data.
+        :return: dictionary with the data of the ad.
+        """
        return self.parser.get_data()


 class AdHtmlParser:
-    def __init__(self, html_string):
+    """
+    Object for parsing, storing and validating the data of the HTML of an ad.
+    """
+
+    def __init__(self, html_string: str) -> None:
+        """
+        Initializes an instance of the parser with the HTML of an ad.
+        :param html_string: the full HTML code of the ad page
+        """
        self.html = html_string

        self.ad_fields = {
@ -156,9 +213,12 @@ class AdHtmlParser:
            "personal": {"found": False, "optional": False, "value": None},
            "telefono": {"found": False, "optional": True, "value": None},
        }
-        # TODO añadir campos de visitas

-    def parse(self):
+    def parse(self) -> None:
+        """
+        Parses the HTML and stores the ad data.
+        :return: None
+        """

        soup = BeautifulSoup(self.html, "html5lib")

@ -272,9 +332,12 @@ class AdHtmlParser:
            ).text.replace(" ", "")
            self.ad_fields["telefono"]["found"] = True

-        # TODO capturar datos de visitas
-
-    def _validate(self):
+    def _validate(self) -> None:
+        """
+        Checks whether the extracted values match the expected format of
+        each field. Stores the results.
+        :return: None
+        """
        self.invalid_fields = []

        if not re.match(r"[0-9]{4,20}", self.ad_fields["referencia"]["value"]):
@ -306,20 +369,32 @@ class AdHtmlParser:
            self.invalid_fields.append("telefono")
        # TODO añadir + a caracteres validos

-    def all_fields_are_valid(self):
+    def all_fields_are_valid(self) -> bool:
+        """
+        Reports on whether the extracted data is valid.
+        :return: True if values are valid, false if not
+        """
        self._validate()
        if self.invalid_fields:
            return False
        else:
            return True

-    def fields_missing(self):
+    def fields_missing(self) -> None:
+        """
+        Reports on whether all compulsory fields are present.
+        :return: True if some field is missing, false if not
+        """
        for key, contents in self.ad_fields.items():
            if not contents["optional"] and not contents["found"]:
                return True
        return False

-    def get_data(self):
+    def get_data(self) -> dict:
+        """
+        Returns the extracted data in the form of a dictionary.
+        :return: dictionary with the extracted data
+        """
        data = {}

        for ad_field in self.ad_fields.keys():