Formatting, docstrings and other chores.

This commit is contained in:
pablo 2021-01-04 22:17:40 +01:00
parent adf2cd26ba
commit cbf1643fb5
3 changed files with 32 additions and 255 deletions

View file

@ -1,5 +1,3 @@
import sys
from time import sleep
import datetime
@ -33,6 +31,19 @@ class Capturer:
url_acquisition_object: Type[UrlAttack],
dead_ad_checker: Callable,
) -> None:
"""
Receive all required objects.
:param throttling_manager: takes care of deciding whether a task should
be started
:param capturing_tasks_interface: interface to interact with the tasks
database
:param capturas_interface: interface to interact with the ad database
:param parsing_flow_generator: an object capable of generating empty
parsing flows to give each task a new one
:param url_acquisition_object: gateway to obtaining the HTML of an url
:param dead_ad_checker: callable capable of checking if an ad is dead
through its HTML
"""
self._throttling_manager = throttling_manager
self._capturing_tasks_interface = capturing_tasks_interface
self._capturas_interface = capturas_interface
@ -103,6 +114,10 @@ class CapturingTask:
Initialize with task parameters and mark the task as being worked on
in the task queue.
:param task_parameters: dict with the necessary parameters for the task
:param capturing_interface: interface to interact with the ad database
:param new_parsing_flow: an empty parsing flow
:param url_acquisition_object: gateway to obtaining the HTML of an url
:param dead_ad_checker: callable capable of checking if an ad is dead
"""
self.uuid = task_parameters["uuid"]
self.ad_url = task_parameters["ad_url"]
@ -160,30 +175,6 @@ class CapturingTask:
self.update_status("Surrender")
logging.warning(f"A task has surrendered. {self.ad_url}")
def _extract_data(self) -> None:
"""
Parses the obtained html to extract the ad information.
:return: None
"""
self.parser = AdHtmlParser(self.html)
self.parser.parse()
def _check_data(self) -> None:
"""
Validates that all compulsory fields have been obtained and that the
values are within the expected. Sets the status of task accordingly.
:return: None
"""
if self.parser.fields_missing():
self.update_status("Fields missing")
return
if not self.parser.all_fields_are_valid():
self.update_status("Invalid value fields")
return
self.update_status("Data ready")
def get_ad_data(self) -> dict:
"""
Returns the extracted data.
@ -192,6 +183,12 @@ class CapturingTask:
return self._parsing_flow.field_values
def _parse_html(self, html: str) -> None:
"""
Execute the complete parsing flow and report the task status depending
on the outcome.
:param html: the HTML of the ad
:return: None
"""
self._parsing_flow.execute_flow(soup=BeautifulSoup(html, "html5lib"))
if not self._parsing_flow.issues:
@ -211,226 +208,6 @@ class CapturingTask:
return
class AdHtmlParser:
    """
    Object for parsing, storing and validating the data of the HTML of an ad.
    """

    def __init__(self, html_string: str) -> None:
        """
        Initialize an instance of the parser with the HTML of an ad.

        :param html_string: the full HTML code of the ad page
        """
        self.html = html_string
        # Per-field bookkeeping: whether the field was found in the HTML,
        # whether it is optional for a capture, and its extracted value.
        self.ad_fields = {
            "referencia": {"found": False, "optional": False, "value": None},
            "precio": {"found": False, "optional": False, "value": None},
            "tamano_categorico": {"found": False, "optional": True, "value": None},
            "m2": {"found": False, "optional": True, "value": None},
            "tipo_anuncio": {"found": False, "optional": False, "value": None},
            "calle": {"found": False, "optional": True, "value": None},
            "barrio": {"found": False, "optional": False, "value": None},
            "distrito": {"found": False, "optional": False, "value": None},
            "ciudad": {"found": False, "optional": False, "value": None},
            "cubierta": {"found": False, "optional": False, "value": None},
            "puerta_auto": {"found": False, "optional": False, "value": None},
            "ascensor": {"found": False, "optional": False, "value": None},
            "alarma": {"found": False, "optional": False, "value": None},
            "circuito": {"found": False, "optional": False, "value": None},
            "personal": {"found": False, "optional": False, "value": None},
            "telefono": {"found": False, "optional": True, "value": None},
        }

    def parse(self) -> None:
        """
        Parse the HTML and store the extracted ad data in ``self.ad_fields``.

        :return: None
        """
        soup = BeautifulSoup(self.html, "html5lib")
        # BUG FIX: BeautifulSoup's find_all() returns a (possibly empty)
        # list, never None, so the previous ``is not None`` checks were
        # always true and the ``[0]`` indexing crashed on pages missing the
        # element. Rely on list truthiness instead.
        canonical_links = soup.find_all("link", {"rel": "canonical"})
        if canonical_links:
            self.ad_fields["referencia"]["value"] = re.findall(
                r"[0-9]{5,20}", str(canonical_links[0])
            )[0]
            self.ad_fields["referencia"]["found"] = True
        price_tags = soup.find_all("strong", {"class": "price"})
        if price_tags:
            self.ad_fields["precio"]["value"] = "".join(
                re.findall(r"[0-9]", str(price_tags[0]))
            )
            self.ad_fields["precio"]["found"] = True
        info_features = soup.find("div", {"class": "info-features"})
        if info_features is not None:
            # NOTE(review): the "" literals below look like characters lost
            # during text extraction (plausibly "m²") — confirm against the
            # original source; they are preserved byte-for-byte here.
            try:
                if "" not in info_features.find("span").find("span").text:
                    self.ad_fields["tamano_categorico"]["value"] = (
                        info_features.find("span").find("span").text
                    )
                    self.ad_fields["tamano_categorico"]["found"] = True
            except AttributeError:
                # A missing nested <span> makes .find() return None; the
                # original silenced this with a bare except.
                pass
            posible_m2 = [tag.text for tag in info_features.find_all("span")]
            matches_m2 = [posible for posible in posible_m2 if "" in posible]
            if matches_m2:
                # Normalize the decimal separator to a dot.
                self.ad_fields["m2"]["value"] = "".join(
                    re.findall(r"[0-9]+,*[0-9]*", matches_m2[0])
                ).replace(",", ".")
                self.ad_fields["m2"]["found"] = True
        title = soup.find("title")
        if title is not None:
            # 1 = sale ("venta" in the page title), 2 = anything else.
            self.ad_fields["tipo_anuncio"]["value"] = (
                1 if "venta" in title.text else 2
            )
            self.ad_fields["tipo_anuncio"]["found"] = True
        # ROBUSTNESS: guard against a missing #headerMap div, which
        # previously raised AttributeError on .find_all.
        header_map = soup.find("div", {"id": "headerMap"})
        if header_map is not None:
            location_items = header_map.find_all("li")
            if len(location_items) > 3:
                # Location is read from the end of the list: city, district,
                # neighbourhood; the street is only present with > 4 items.
                self.ad_fields["calle"]["value"] = ""
                self.ad_fields["ciudad"]["value"] = location_items[-2].text.strip()
                self.ad_fields["ciudad"]["found"] = True
                self.ad_fields["distrito"]["value"] = (
                    location_items[-3].text.strip()
                )
                self.ad_fields["distrito"]["found"] = True
                self.ad_fields["barrio"]["value"] = location_items[-4].text.strip()
                self.ad_fields["barrio"]["found"] = True
                if len(location_items) > 4:
                    self.ad_fields["calle"]["value"] = (
                        location_items[0].text.strip()
                    )
                    self.ad_fields["calle"]["found"] = True
        features_lists = soup.find_all(
            "div", {"class": "details-property_features"}
        )
        features = [
            feature.text
            for feature_list in features_lists
            for feature in feature_list.find_all("li")
        ]
        # Boolean amenity fields are stored as 0/1 depending on whether the
        # keyword appears in any feature line (case-sensitive, as before).
        amenity_keywords = {
            "cubierta": "Cubierta",
            "puerta_auto": "Puerta",
            "ascensor": "ascensor",
            "alarma": "Alarma",
            "circuito": "Cámaras",
            "personal": "Personal",
        }
        for field, keyword in amenity_keywords.items():
            self.ad_fields[field]["value"] = 1 * any(
                keyword in feature for feature in features
            )
            self.ad_fields[field]["found"] = True
        phone_tag = soup.find("p", {"class": "txt-bold _browserPhone icon-phone"})
        if phone_tag is not None:
            self.ad_fields["telefono"]["value"] = phone_tag.text.replace(" ", "")
            self.ad_fields["telefono"]["found"] = True

    def _validate(self) -> None:
        """
        Check whether the extracted values are valid against the expected
        typology and store the offending field names in ``invalid_fields``.

        :return: None
        """
        self.invalid_fields = []
        # ROBUSTNESS: treat a missing value (None) as invalid instead of
        # raising TypeError inside re.match / ``in``.
        referencia = self.ad_fields["referencia"]["value"]
        if referencia is None or not re.match(r"[0-9]{4,20}", referencia):
            self.invalid_fields.append("referencia")
        precio = self.ad_fields["precio"]["value"]
        if precio is None or not re.match(r"[0-9]{1,20}", precio):
            self.invalid_fields.append("precio")
        possible_values_tamano = [
            "2 coches o más",
            "coche y moto",
            "coche grande",
            "coche pequeño",
            "moto",
            None,
        ]
        if (
            self.ad_fields["tamano_categorico"]["value"]
            not in possible_values_tamano
        ):
            self.invalid_fields.append("tamano_categorico")
        barrio = self.ad_fields["barrio"]["value"]
        if barrio is None or "Barrio" not in barrio:
            self.invalid_fields.append("barrio")
        distrito = self.ad_fields["distrito"]["value"]
        if distrito is None or "Distrito" not in distrito:
            self.invalid_fields.append("distrito")
        # TODO: add "+" to the set of valid characters (original Spanish
        # TODO; NOTE(review): the pattern below already allows a leading
        # "+" — confirm whether this TODO is stale).
        if self.ad_fields["telefono"]["found"] and not re.match(
            r"\s*\+?[0-9\s]*", self.ad_fields["telefono"]["value"]
        ):
            self.invalid_fields.append("telefono")

    def all_fields_are_valid(self) -> bool:
        """
        Report on whether the extracted data is valid.

        :return: True if all values are valid, False if not
        """
        self._validate()
        return not self.invalid_fields

    def fields_missing(self) -> bool:
        """
        Report on whether any compulsory field is absent.

        :return: True if some compulsory field is missing, False if not
        """
        return any(
            not contents["optional"] and not contents["found"]
            for contents in self.ad_fields.values()
        )

    def get_data(self) -> dict:
        """
        Return the extracted data in the form of a dictionary.

        :return: dictionary mapping field name to extracted value
        """
        return {name: field["value"] for name, field in self.ad_fields.items()}
if __name__ == "__main__":
    # Script entry point: build the interface to the capturing-task queue.
    # (The remainder of the main routine is outside this excerpt.)
    capturing_tasks_interface = CapturingTasksInterface()

View file

@ -92,7 +92,7 @@ class UrlAttack:
if self.response.ok:
self.success = True
except Exception as e:
except Exception:
self.success = False
if (

View file

@ -2518,8 +2518,8 @@ def test_referencia_instructions_extract_correctly(real_ad_html):
referencia_instructions.validate()
assert (
referencia_instructions.found == True
and referencia_instructions.valid == True
referencia_instructions.found is True
and referencia_instructions.valid is True
and referencia_instructions.value is not None
and referencia_instructions.search_issue is None
)
@ -2534,7 +2534,7 @@ def test_referencia_instructions_find_nothing_in_unrelated_html(unrelated_html):
referencia_instructions.validate()
assert (
referencia_instructions.found == False
referencia_instructions.found is False
and referencia_instructions.valid is None
and referencia_instructions.value is None
and referencia_instructions.search_issue is not None
@ -2580,8 +2580,8 @@ def test_all_instructions_extract_correctly(real_ad_html):
assert all(
[
instruction.found == True
and instruction.valid == True
instruction.found is True
and instruction.valid is True
and instruction.value is not None
and instruction.search_issue is None
for instruction in all_instructions
@ -2628,8 +2628,8 @@ def test_all_instructions_fail_on_unrelated_html(unrelated_html):
assert all(
[
instruction.found == False
and (instruction.valid == False or instruction.valid == None)
instruction.found is False
and (instruction.valid is False or instruction.valid is None)
and instruction.value is None
for instruction in all_instructions
]
@ -2725,7 +2725,7 @@ def test_parsing_flow_fails_for_unrelated_html(unrelated_html):
assert not parsing_flow.all_non_optional_fields_were_found and len(
parsing_flow.issues
) == len(all_instructions)
) == len([field for field in all_instructions if not field.is_optional])
def test_parsing_flow_generator_returns_proper_flows():