From 43236c2884770f80a60a3703a2783ce4354b3f7e Mon Sep 17 00:00:00 2001
From: pablo <pablomartincalvo@gmail.com>
Date: Tue, 3 Nov 2020 13:50:36 +0100
Subject: [PATCH] Typing, docstrings, formatting for capturer.py

---
 capturer/capturer.py | 127 ++++++++++++++++++++++++++++++++++---------
 1 file changed, 101 insertions(+), 26 deletions(-)

diff --git a/capturer/capturer.py b/capturer/capturer.py
index 455eb15..d8a1bff 100644
--- a/capturer/capturer.py
+++ b/capturer/capturer.py
@@ -15,23 +15,31 @@ import logging
 
 
 class Capturer:
-    sleep_time_no_work = 15
+    """
+    Daemon with the full flow of execution of individual ad requesting, data
+    scraping and db storage.
+    """
 
-    def __init__(self):
+    def __init__(self) -> None:
         self.last_try_datetime = datetime.datetime.now()
 
-    def start(self):
+    def start(self) -> None:
+        """
+        Full flow of execution. Checks whether it should capture a URL, tries
+        to do so and stores the result if successful.
+        :return: None
+        """
 
         logging.info("Starting capturer")
         while True:
 
-            if not self.in_working_hours():
+            if not self._in_working_hours():
                 sleep(1800)
                 logging.info("Waiting...")
                 continue
 
             seconds_to_next_capture = (
-                minimum_seconds_between_tries() - self.seconds_since_last_try()
+                minimum_seconds_between_tries() - self._seconds_since_last_try()
             )
             if seconds_to_next_capture > 0:
                 sleep(seconds_to_next_capture)
@@ -39,7 +47,9 @@ class Capturer:
 
             pending_task = capturing_interface.get_pending_task()
             if not pending_task:
+                logging.info("No pending tasks.")
                 continue
+
             task = CapturingTask(pending_task)
             self.last_try_datetime = datetime.datetime.now()
             task.capture()
@@ -47,26 +57,46 @@ class Capturer:
             if task.status == "Data ready":
                 ad_data = task.get_ad_data()
             else:
+                logging.warning("Something went wrong, not adding data.")
                 continue
 
             capturas_interface.insert_captura(ad_data)
             task._update_status("Captura inserted")
+            logging.info("New ad inserted.")
 
-    def in_working_hours(self):
+    def _in_working_hours(self) -> bool:
+        """
+        Checks whether now is within the working hours of the daemon.
+        :return: True if so, false if not
+        """
         return (
             working_hours["start"]
             <= datetime.datetime.now().time()
             <= working_hours["end"]
         )
 
-    def seconds_since_last_try(self):
+    def _seconds_since_last_try(self) -> float:
+        """
+        Computes how many seconds have passed since the last capturing attempt
+        :return: seconds since last try as a float
+        """
         return (datetime.datetime.now() - self.last_try_datetime).total_seconds()
 
 
 class CapturingTask:
+    """
+    Task object wrapping the process of attempting to capture an ad, parsing
+    the data and sending to db.
+    """
+
     sleep_time_failed_request = 180
 
-    def __init__(self, parameters):
+    def __init__(self, parameters) -> None:
+        """
+        Initialize with task parameters and mark the task as being worked on
+        in the task queue.
+        :param parameters: dict with the necessary parameters for the task
+        """
         self.uuid = parameters["uuid"]
         self.ad_url = parameters["ad_url"]
         self.uuid_exploring = parameters["fk_uuid_exploring"]
@@ -76,15 +106,20 @@ class CapturingTask:
 
         self._update_status("Loading")
 
-    def _update_status(self, new_status):
+    def _update_status(self, new_status: str) -> None:
+        """
+        Updates the task status and persists it in the task queue.
+        :param new_status: string describing the new status
+        :return: None
+        """
         self.status = new_status
         capturing_interface.update_capturing_task(
             self.uuid, self.uuid_exploring, self.status, self.ad_url
         )
 
 
-    def capture(self):
+    def capture(self) -> None:
         """
-        Metodo principal que contiene el flujo de captura
+        Main flow of work
         """
         self._update_status("WIP")
 
@@ -94,18 +129,19 @@ class CapturingTask:
 
             if attack.success:
                 self.html = attack.get_text()
-
                 self._extract_data()
                 self._check_data()
                 return
 
-            else:
+            if not attack.success:
                 try:
                     if Refresher.dead_ad_checker(attack.get_text()):
                         self._update_status("Dead ad")
                         return
                 except AttributeError:
-                    pass
+                    logging.error(
+                        "Something went wrong when checking if the ad is gone"
+                    )
 
                 self._update_status("Fail {}".format(self.request_failures))
                 self.request_failures += 1
@@ -115,11 +151,20 @@ class CapturingTask:
         self._update_status("Surrender")
         logging.warning(f"A task has surrendered. {self.ad_url}")
 
-    def _extract_data(self):
+    def _extract_data(self) -> None:
+        """
+        Parses the obtained html to extract the ad information.
+        :return: None
+        """
         self.parser = AdHtmlParser(self.html)
         self.parser.parse()
 
-    def _check_data(self):
+    def _check_data(self) -> None:
+        """
+        Validates that all compulsory fields have been obtained and that the
+        values are within the expected. Sets the status of task accordingly.
+        :return: None
+        """
         if self.parser.fields_missing():
             self._update_status("Fields missing")
             return
@@ -130,12 +175,24 @@ class CapturingTask:
 
         self._update_status("Data ready")
 
-    def get_ad_data(self):
+    def get_ad_data(self) -> dict:
+        """
+        Returns the extracted data.
+        :return: dictionary with the data of the ad.
+        """
         return self.parser.get_data()
 
 
 class AdHtmlParser:
-    def __init__(self, html_string):
+    """
+    Object for parsing, storing and validating the data of the HTML of an ad.
+    """
+
+    def __init__(self, html_string: str) -> None:
+        """
+        Initializes an instance of the parser with the HTML of an ad.
+        :param html_string: the full HTML code of the ad page
+        """
         self.html = html_string
 
         self.ad_fields = {
@@ -156,9 +213,12 @@ class AdHtmlParser:
             "personal": {"found": False, "optional": False, "value": None},
             "telefono": {"found": False, "optional": True, "value": None},
         }
-        # TODO añadir campos de visitas
 
-    def parse(self):
+    def parse(self) -> None:
+        """
+        Parses the HTML and stores the ad data.
+        :return: None
+        """
 
         soup = BeautifulSoup(self.html, "html5lib")
 
@@ -272,9 +332,12 @@ class AdHtmlParser:
             ).text.replace(" ", "")
             self.ad_fields["telefono"]["found"] = True
 
-        # TODO capturar datos de visitas
-
-    def _validate(self):
+    def _validate(self) -> None:
+        """
+        Checks whether the extracted values are valid against the expected
+        typology. Stores the results.
+        :return: None
+        """
         self.invalid_fields = []
 
         if not re.match(r"[0-9]{4,20}", self.ad_fields["referencia"]["value"]):
@@ -306,20 +369,32 @@ class AdHtmlParser:
             self.invalid_fields.append("telefono")
         # TODO añadir + a caracteres validos
 
-    def all_fields_are_valid(self):
+    def all_fields_are_valid(self) -> bool:
+        """
+        Reports on whether the extracted data is valid.
+        :return: True if values are valid, false if not
+        """
         self._validate()
         if self.invalid_fields:
             return False
         else:
             return True
 
-    def fields_missing(self):
+    def fields_missing(self) -> bool:
+        """
+        Reports on whether all compulsory fields are present.
+        :return: True if some field is missing, false if not
+        """
         for key, contents in self.ad_fields.items():
             if not contents["optional"] and not contents["found"]:
                 return True
         return False
 
-    def get_data(self):
+    def get_data(self) -> dict:
+        """
+        Returns the extracted data in the form of a dictionary.
+        :return: dictionary with the extracted data
+        """
         data = {}
 
         for ad_field in self.ad_fields.keys():