From cbf1643fb56797d93451dca0acac44bb978bcb69 Mon Sep 17 00:00:00 2001
From: pablo
Date: Mon, 4 Jan 2021 22:17:40 +0100
Subject: [PATCH] Formatting, docstrings and other chores.

---
 capturer/capturer.py        | 269 +++----------------------------------
 core/scrapping_utils.py     |   2 +-
 tests/parsing_utils_test.py |  16 +--
 3 files changed, 32 insertions(+), 255 deletions(-)

diff --git a/capturer/capturer.py b/capturer/capturer.py
index 7397af6..6971cfb 100644
--- a/capturer/capturer.py
+++ b/capturer/capturer.py
@@ -1,5 +1,3 @@
-import sys
-
 from time import sleep
 import datetime
 
@@ -33,6 +31,19 @@ class Capturer:
         url_acquisition_object: Type[UrlAttack],
         dead_ad_checker: Callable,
     ) -> None:
+        """
+        Receive all required objects.
+        :param throttling_manager: decides whether a given task should
+        be started
+        :param capturing_tasks_interface: interface to interact with the tasks
+        database
+        :param capturas_interface: interface to interact with the ad database
+        :param parsing_flow_generator: an object capable of generating empty
+        parsing flows, so each task gets a fresh one
+        :param url_acquisition_object: gateway for obtaining the HTML of a URL
+        :param dead_ad_checker: callable capable of checking whether an ad is
+        dead from its HTML
+        """
         self._throttling_manager = throttling_manager
         self._capturing_tasks_interface = capturing_tasks_interface
         self._capturas_interface = capturas_interface
@@ -103,6 +114,10 @@ class CapturingTask:
         Initialize with task parameters and mark the task as being worked on
         in the task queue.
         :param task_parameters: dict with the necessary parameters for the task
+        :param capturing_interface: interface to interact with the ad database
+        :param new_parsing_flow: an empty parsing flow
+        :param url_acquisition_object: gateway for obtaining the HTML of a URL
+        :param dead_ad_checker: callable capable of checking whether an ad is dead
         """
         self.uuid = task_parameters["uuid"]
         self.ad_url = task_parameters["ad_url"]
@@ -160,30 +175,6 @@ class CapturingTask:
             self.update_status("Surrender")
             logging.warning(f"A task has surrendered. {self.ad_url}")
 
-    def _extract_data(self) -> None:
-        """
-        Parses the obtained html to extract the ad information.
-        :return: None
-        """
-        self.parser = AdHtmlParser(self.html)
-        self.parser.parse()
-
-    def _check_data(self) -> None:
-        """
-        Validates that all compulsory fields have been obtained and that the
-        values are within the expected. Sets the status of task accordingly.
-        :return: None
-        """
-        if self.parser.fields_missing():
-            self.update_status("Fields missing")
-            return
-
-        if not self.parser.all_fields_are_valid():
-            self.update_status("Invalid value fields")
-            return
-
-        self.update_status("Data ready")
-
     def get_ad_data(self) -> dict:
         """
         Returns the extracted data.
@@ -192,6 +183,12 @@ class CapturingTask:
         return self._parsing_flow.field_values
 
     def _parse_html(self, html: str) -> None:
+        """
+        Execute the complete parsing flow and report the task status depending
+        on the outcome.
+        :param html: the HTML of the ad
+        :return: None
+        """
         self._parsing_flow.execute_flow(soup=BeautifulSoup(html, "html5lib"))
 
         if not self._parsing_flow.issues:
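The docstrings added above describe a contract rather than an implementation: a parsing flow exposes execute_flow(soup=...), an issues collection, and a field_values dict, and _parse_html maps an empty issues list to the "Data ready" status. As a minimal sketch of that contract, with a hypothetical stand-in for the real flow object (which is defined outside this patch):

from bs4 import BeautifulSoup

class StubParsingFlow:
    # Hypothetical stand-in: the real parsing flow is not part of this patch.
    def __init__(self):
        self.issues = []        # filled in by execute_flow on failures
        self.field_values = {}  # filled in by execute_flow on successes

    def execute_flow(self, soup):
        # A single "instruction": find the title, record an issue otherwise.
        title = soup.find("title")
        if title is None:
            self.issues.append("title not found")
        else:
            self.field_values["title"] = title.text

flow = StubParsingFlow()
flow.execute_flow(soup=BeautifulSoup("<title>Garaje en venta</title>", "html5lib"))
status = "Data ready" if not flow.issues else "Fields missing"

Collecting failures in issues instead of raising keeps the status reporting in _parse_html linear.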
@@ -211,226 +208,6 @@
 
         return
 
 
-class AdHtmlParser:
-    """
-    Object for parsing, storing and validating the data of the HTML of an ad.
-    """
-
-    def __init__(self, html_string: str) -> None:
-        """
-        Initializes an instance of the parser with the HTML of an ad.
-        :param html_string: the full HTML code of the ad page
-        """
-        self.html = html_string
-
-        self.ad_fields = {
-            "referencia": {"found": False, "optional": False, "value": None},
-            "precio": {"found": False, "optional": False, "value": None},
-            "tamano_categorico": {"found": False, "optional": True, "value": None},
-            "m2": {"found": False, "optional": True, "value": None},
-            "tipo_anuncio": {"found": False, "optional": False, "value": None},
-            "calle": {"found": False, "optional": True, "value": None},
-            "barrio": {"found": False, "optional": False, "value": None},
-            "distrito": {"found": False, "optional": False, "value": None},
-            "ciudad": {"found": False, "optional": False, "value": None},
-            "cubierta": {"found": False, "optional": False, "value": None},
-            "puerta_auto": {"found": False, "optional": False, "value": None},
-            "ascensor": {"found": False, "optional": False, "value": None},
-            "alarma": {"found": False, "optional": False, "value": None},
-            "circuito": {"found": False, "optional": False, "value": None},
-            "personal": {"found": False, "optional": False, "value": None},
-            "telefono": {"found": False, "optional": True, "value": None},
-        }
-
-    def parse(self) -> None:
-        """
-        Parses the HTML and stores the ad data.
-        :return: None
-        """
-
-        soup = BeautifulSoup(self.html, "html5lib")
-
-        if soup.find_all("link", {"rel": "canonical"}) is not None:
-            self.ad_fields["referencia"]["value"] = re.findall(
-                r"[0-9]{5,20}", str(soup.find_all("link", {"rel": "canonical"})[0])
-            )[0]
-            self.ad_fields["referencia"]["found"] = True
-
-        if soup.find_all("strong", {"class": "price"}) is not None:
-            self.ad_fields["precio"]["value"] = "".join(
-                re.findall(
-                    r"[0-9]", str(soup.find_all("strong", {"class": "price"})[0])
-                )
-            )
-            self.ad_fields["precio"]["found"] = True
-
-        if soup.find("div", {"class": "info-features"}) is not None:
-            try:
-                if (
-                    "m²"
-                    not in soup.find("div", {"class": "info-features"})
-                    .find("span")
-                    .find("span")
-                    .text
-                ):
-                    self.ad_fields["tamano_categorico"]["value"] = (
-                        soup.find("div", {"class": "info-features"})
-                        .find("span")
-                        .find("span")
-                        .text
-                    )
-                    self.ad_fields["tamano_categorico"]["found"] = True
-            except:
-                pass
-
-            posible_m2 = [
-                tag.text
-                for tag in soup.find("div", {"class": "info-features"}).find_all("span")
-            ]
-            if [posible for posible in posible_m2 if "m²" in posible]:
-                self.ad_fields["m2"]["value"] = [
-                    "".join(re.findall(r"[0-9]+,*[0-9]*", posible))
-                    for posible in posible_m2
-                    if "m²" in posible
-                ][0].replace(",", ".")
-                self.ad_fields["m2"]["found"] = True
-
-        if soup.find("title") is not None:
-            if "venta" in soup.find("title").text:
-                self.ad_fields["tipo_anuncio"]["value"] = 1
-            else:
-                self.ad_fields["tipo_anuncio"]["value"] = 2
-            self.ad_fields["tipo_anuncio"]["found"] = True
-
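Worth noting while reviewing the deletion: BeautifulSoup's find_all never returns None — it returns a (possibly empty) list-like ResultSet — so the `is not None` guards above always pass, and an ad without a price would raise IndexError at the `[0]`. The intended guard is a truthiness check; a small demonstration:

from bs4 import BeautifulSoup

soup = BeautifulSoup("<p>no price tag here</p>", "html5lib")
prices = soup.find_all("strong", {"class": "price"})

assert prices is not None  # always true: an empty ResultSet, never None
assert not prices          # the emptiness check the guard was meant to be

if prices:                 # safe: only index into a non-empty result
    first_price = prices[0]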
"headerMap"}).find_all("li")[0].text.strip() - ) - self.ad_fields["calle"]["found"] = True - - features_lists = soup.find_all("div", {"class": "details-property_features"}) - features = [ - feature.text - for feature_list in features_lists - for feature in feature_list.find_all("li") - ] - self.ad_fields["cubierta"]["value"] = 1 * any( - "Cubierta" in feature for feature in features - ) - self.ad_fields["puerta_auto"]["value"] = 1 * any( - "Puerta" in feature for feature in features - ) - self.ad_fields["ascensor"]["value"] = 1 * any( - "ascensor" in feature for feature in features - ) - self.ad_fields["alarma"]["value"] = 1 * any( - "Alarma" in feature for feature in features - ) - self.ad_fields["circuito"]["value"] = 1 * any( - "Cámaras" in feature for feature in features - ) - self.ad_fields["personal"]["value"] = 1 * any( - "Personal" in feature for feature in features - ) - - self.ad_fields["cubierta"]["found"] = True - self.ad_fields["puerta_auto"]["found"] = True - self.ad_fields["ascensor"]["found"] = True - self.ad_fields["alarma"]["found"] = True - self.ad_fields["circuito"]["found"] = True - self.ad_fields["personal"]["found"] = True - - if soup.find("p", {"class": "txt-bold _browserPhone icon-phone"}) is not None: - self.ad_fields["telefono"]["value"] = soup.find( - "p", {"class": "txt-bold _browserPhone icon-phone"} - ).text.replace(" ", "") - self.ad_fields["telefono"]["found"] = True - - def _validate(self) -> None: - """ - Checks whether the extracted values are valid against the expected - typology. Stores the results. - :return: None - """ - self.invalid_fields = [] - - if not re.match(r"[0-9]{4,20}", self.ad_fields["referencia"]["value"]): - self.invalid_fields.append("referencia") - - if not re.match(r"[0-9]{1,20}", self.ad_fields["precio"]["value"]): - self.invalid_fields.append("precio") - - possible_values_tamano = [ - "2 coches o más", - "coche y moto", - "coche grande", - "coche pequeño", - "moto", - None, - ] - if self.ad_fields["tamano_categorico"]["value"] not in possible_values_tamano: - self.invalid_fields.append("tamano_categorico") - - if not "Barrio" in self.ad_fields["barrio"]["value"]: - self.invalid_fields.append("barrio") - - if not "Distrito" in self.ad_fields["distrito"]["value"]: - self.invalid_fields.append("distrito") - - if self.ad_fields["telefono"]["found"] and not re.match( - r"\s*\+?[0-9\s]*", self.ad_fields["telefono"]["value"] - ): - self.invalid_fields.append("telefono") - # TODO añadir + a caracteres validos - - def all_fields_are_valid(self) -> bool: - """ - Reports on whether the extracted data is valid. - :return: True if values are valid, false if not - """ - self._validate() - if self.invalid_fields: - return False - else: - return True - - def fields_missing(self) -> bool: - """ - Reports on whether all compulsory fields are present. - :return: True if some field is missing, false if not - """ - for key, contents in self.ad_fields.items(): - if not contents["optional"] and not contents["found"]: - return True - return False - - def get_data(self) -> dict: - """ - Returns the extracted data in the form of a dictionary. 
-    def get_data(self) -> dict:
-        """
-        Returns the extracted data in the form of a dictionary.
-        :return: dictionary with the extracted data
-        """
-        data = {}
-
-        for ad_field in self.ad_fields.keys():
-            data[ad_field] = self.ad_fields[ad_field]["value"]
-
-        return data
-
-
 if __name__ == "__main__":
 
     capturing_tasks_interface = CapturingTasksInterface()
diff --git a/core/scrapping_utils.py b/core/scrapping_utils.py
index 6fff33b..3ab6386 100644
--- a/core/scrapping_utils.py
+++ b/core/scrapping_utils.py
@@ -92,7 +92,7 @@ class UrlAttack:
 
             if self.response.ok:
                 self.success = True
-        except Exception as e:
+        except Exception:
             self.success = False
 
         if (
diff --git a/tests/parsing_utils_test.py b/tests/parsing_utils_test.py
index de0beeb..9d9d56d 100644
--- a/tests/parsing_utils_test.py
+++ b/tests/parsing_utils_test.py
@@ -2518,8 +2518,8 @@ def test_referencia_instructions_extract_correctly(real_ad_html):
     referencia_instructions.validate()
 
     assert (
-        referencia_instructions.found == True
-        and referencia_instructions.valid == True
+        referencia_instructions.found is True
+        and referencia_instructions.valid is True
         and referencia_instructions.value is not None
         and referencia_instructions.search_issue is None
     )
@@ -2534,7 +2534,7 @@ def test_referencia_instructions_find_nothing_in_unrelated_html(unrelated_html):
     referencia_instructions.validate()
 
     assert (
-        referencia_instructions.found == False
+        referencia_instructions.found is False
         and referencia_instructions.valid is None
         and referencia_instructions.value is None
         and referencia_instructions.search_issue is not None
@@ -2580,8 +2580,8 @@ def test_all_instructions_extract_correctly(real_ad_html):
 
     assert all(
         [
-            instruction.found == True
-            and instruction.valid == True
+            instruction.found is True
+            and instruction.valid is True
             and instruction.value is not None
             and instruction.search_issue is None
             for instruction in all_instructions
@@ -2628,8 +2628,8 @@ def test_all_instructions_fail_on_unrelated_html(unrelated_html):
 
     assert all(
         [
-            instruction.found == False
-            and (instruction.valid == False or instruction.valid == None)
+            instruction.found is False
+            and (instruction.valid is False or instruction.valid is None)
             and instruction.value is None
             for instruction in all_instructions
         ]
@@ -2725,7 +2725,7 @@ def test_parsing_flow_fails_for_unrelated_html(unrelated_html):
 
     assert not parsing_flow.all_non_optional_fields_were_found and len(
         parsing_flow.issues
-    ) == len(all_instructions)
+    ) == len([field for field in all_instructions if not field.is_optional])
 
 
 def test_parsing_flow_generator_returns_proper_flows():
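On the test changes: swapping == True for is True is more than style. bool is a subclass of int, so equality comparisons coerce and a truthy non-bool slips through, while identity pins the exact True singleton:

found = 1  # a type slip: an int where a bool was expected

assert found == True             # passes anyway: 1 == True under int coercion
assert (found is True) is False  # identity catches the slip: 1 is not the bool True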