Formatting, docstrings and other chores.

pablo 2021-01-04 22:17:40 +01:00
parent adf2cd26ba
commit cbf1643fb5
3 changed files with 32 additions and 255 deletions

View file

@@ -1,5 +1,3 @@
-import sys
 from time import sleep
 import datetime
@@ -33,6 +31,19 @@ class Capturer:
         url_acquisition_object: Type[UrlAttack],
         dead_ad_checker: Callable,
     ) -> None:
+        """
+        Receive all required objects.
+        :param throttling_manager: takes care of deciding whether a task should
+        be started
+        :param capturing_tasks_interface: interface to interact with the tasks
+        database
+        :param capturas_interface: interface to interact with the ad database
+        :param parsing_flow_generator: an object capable of generating empty
+        parsing flows to give each task a new one
+        :param url_acquisition_object: gateway to obtaining the HTML of a URL
+        :param dead_ad_checker: callable capable of checking if an ad is dead
+        from its HTML
+        """
         self._throttling_manager = throttling_manager
         self._capturing_tasks_interface = capturing_tasks_interface
         self._capturas_interface = capturas_interface
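
The new docstring makes the dependency injection explicit. Below is a minimal, hypothetical wiring sketch: only the parameter names and the Type[UrlAttack] annotation come from the diff, every Stub* class is an invented stand-in for a real project class this commit does not show, and the sketch assumes the module defining Capturer is importable.

class StubThrottlingManager:
    def task_can_start(self) -> bool:  # assumed responsibility: pacing task starts
        return True

class StubTasksInterface: ...      # stands in for the task-queue gateway
class StubCapturasInterface: ...   # stands in for the ad-database gateway
class StubFlowGenerator: ...       # stands in for the parsing-flow factory
class StubUrlAttack: ...           # stands in for UrlAttack

def stub_dead_ad_checker(html: str) -> bool:
    return "anuncio no disponible" in html  # hypothetical dead-ad marker

capturer = Capturer(
    throttling_manager=StubThrottlingManager(),
    capturing_tasks_interface=StubTasksInterface(),
    capturas_interface=StubCapturasInterface(),
    parsing_flow_generator=StubFlowGenerator(),
    url_acquisition_object=StubUrlAttack,  # a class, matching Type[UrlAttack]
    dead_ad_checker=stub_dead_ad_checker,  # any Callable is accepted
)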
@@ -103,6 +114,10 @@ class CapturingTask:
         Initialize with task parameters and mark the task as being worked on
         in the task queue.
         :param task_parameters: dict with the necessary parameters for the task
+        :param capturing_interface: interface to interact with the ad database
+        :param new_parsing_flow: an empty parsing flow
+        :param url_acquisition_object: gateway to obtaining the HTML of a URL
+        :param dead_ad_checker: callable capable of checking if an ad is dead
         """
         self.uuid = task_parameters["uuid"]
         self.ad_url = task_parameters["ad_url"]
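
The four new :param entries complete the docstring, but task_parameters itself is only visible through the two keys read above. A hedged sketch of its shape; both values are invented for illustration, and the real queue may supply further keys not shown in this hunk.

task_parameters = {
    "uuid": "3f2b9c1e-0000-0000-0000-000000000000",  # hypothetical task id
    "ad_url": "https://example.com/ad/123456",       # hypothetical ad URL
}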
@@ -160,30 +175,6 @@ class CapturingTask:
         self.update_status("Surrender")
         logging.warning(f"A task has surrendered. {self.ad_url}")

-    def _extract_data(self) -> None:
-        """
-        Parses the obtained html to extract the ad information.
-        :return: None
-        """
-        self.parser = AdHtmlParser(self.html)
-        self.parser.parse()
-
-    def _check_data(self) -> None:
-        """
-        Validates that all compulsory fields have been obtained and that the
-        values are within the expected. Sets the status of task accordingly.
-        :return: None
-        """
-        if self.parser.fields_missing():
-            self.update_status("Fields missing")
-            return
-        if not self.parser.all_fields_are_valid():
-            self.update_status("Invalid value fields")
-            return
-        self.update_status("Data ready")
-
     def get_ad_data(self) -> dict:
         """
         Returns the extracted data.
@@ -192,6 +183,12 @@ class CapturingTask:
         return self._parsing_flow.field_values

     def _parse_html(self, html: str) -> None:
+        """
+        Execute the complete parsing flow and report the task status depending
+        on the outcome.
+        :param html: the HTML of the ad
+        :return: None
+        """
         self._parsing_flow.execute_flow(soup=BeautifulSoup(html, "html5lib"))
         if not self._parsing_flow.issues:
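
The flow is fed a tree built with html5lib. A self-contained illustration of that parser choice follows; the rationale is an assumption on my part, since the commit does not state why html5lib is used.

from bs4 import BeautifulSoup

# html5lib builds the same tree a browser would (lenient, spec-compliant),
# at the cost of speed; "html.parser" and "lxml" are faster but stricter.
soup = BeautifulSoup("<strong class='price'>35.000 &euro;</strong>", "html5lib")
print(soup.find("strong", {"class": "price"}).text)  # -> 35.000 €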
@@ -211,226 +208,6 @@ class CapturingTask:
             return

-
-class AdHtmlParser:
-    """
-    Object for parsing, storing and validating the data of the HTML of an ad.
-    """
-
-    def __init__(self, html_string: str) -> None:
-        """
-        Initializes an instance of the parser with the HTML of an ad.
-        :param html_string: the full HTML code of the ad page
-        """
-        self.html = html_string
-        self.ad_fields = {
-            "referencia": {"found": False, "optional": False, "value": None},
-            "precio": {"found": False, "optional": False, "value": None},
-            "tamano_categorico": {"found": False, "optional": True, "value": None},
-            "m2": {"found": False, "optional": True, "value": None},
-            "tipo_anuncio": {"found": False, "optional": False, "value": None},
-            "calle": {"found": False, "optional": True, "value": None},
-            "barrio": {"found": False, "optional": False, "value": None},
-            "distrito": {"found": False, "optional": False, "value": None},
-            "ciudad": {"found": False, "optional": False, "value": None},
-            "cubierta": {"found": False, "optional": False, "value": None},
-            "puerta_auto": {"found": False, "optional": False, "value": None},
-            "ascensor": {"found": False, "optional": False, "value": None},
-            "alarma": {"found": False, "optional": False, "value": None},
-            "circuito": {"found": False, "optional": False, "value": None},
-            "personal": {"found": False, "optional": False, "value": None},
-            "telefono": {"found": False, "optional": True, "value": None},
-        }
-
-    def parse(self) -> None:
-        """
-        Parses the HTML and stores the ad data.
-        :return: None
-        """
-        soup = BeautifulSoup(self.html, "html5lib")
-        if soup.find_all("link", {"rel": "canonical"}) is not None:
-            self.ad_fields["referencia"]["value"] = re.findall(
-                r"[0-9]{5,20}", str(soup.find_all("link", {"rel": "canonical"})[0])
-            )[0]
-            self.ad_fields["referencia"]["found"] = True
-        if soup.find_all("strong", {"class": "price"}) is not None:
-            self.ad_fields["precio"]["value"] = "".join(
-                re.findall(
-                    r"[0-9]", str(soup.find_all("strong", {"class": "price"})[0])
-                )
-            )
-            self.ad_fields["precio"]["found"] = True
-        if soup.find("div", {"class": "info-features"}) is not None:
-            try:
-                if (
-                    ""
-                    not in soup.find("div", {"class": "info-features"})
-                    .find("span")
-                    .find("span")
-                    .text
-                ):
-                    self.ad_fields["tamano_categorico"]["value"] = (
-                        soup.find("div", {"class": "info-features"})
-                        .find("span")
-                        .find("span")
-                        .text
-                    )
-                    self.ad_fields["tamano_categorico"]["found"] = True
-            except:
-                pass
-            posible_m2 = [
-                tag.text
-                for tag in soup.find("div", {"class": "info-features"}).find_all("span")
-            ]
-            if [posible for posible in posible_m2 if "" in posible]:
-                self.ad_fields["m2"]["value"] = [
-                    "".join(re.findall(r"[0-9]+,*[0-9]*", posible))
-                    for posible in posible_m2
-                    if "" in posible
-                ][0].replace(",", ".")
-                self.ad_fields["m2"]["found"] = True
-        if soup.find("title") is not None:
-            if "venta" in soup.find("title").text:
-                self.ad_fields["tipo_anuncio"]["value"] = 1
-            else:
-                self.ad_fields["tipo_anuncio"]["value"] = 2
-            self.ad_fields["tipo_anuncio"]["found"] = True
-        if len(soup.find("div", {"id": "headerMap"}).find_all("li")) > 3:
-            self.ad_fields["calle"]["value"] = ""
-            self.ad_fields["ciudad"]["value"] = (
-                soup.find("div", {"id": "headerMap"}).find_all("li")[-2].text.strip()
-            )
-            self.ad_fields["ciudad"]["found"] = True
-            self.ad_fields["distrito"]["value"] = (
-                soup.find("div", {"id": "headerMap"}).find_all("li")[-3].text.strip()
-            )
-            self.ad_fields["distrito"]["found"] = True
-            self.ad_fields["barrio"]["value"] = (
-                soup.find("div", {"id": "headerMap"}).find_all("li")[-4].text.strip()
-            )
-            self.ad_fields["barrio"]["found"] = True
-        if len(soup.find("div", {"id": "headerMap"}).find_all("li")) > 4:
-            self.ad_fields["calle"]["value"] = (
-                soup.find("div", {"id": "headerMap"}).find_all("li")[0].text.strip()
-            )
-            self.ad_fields["calle"]["found"] = True
-        features_lists = soup.find_all("div", {"class": "details-property_features"})
-        features = [
-            feature.text
-            for feature_list in features_lists
-            for feature in feature_list.find_all("li")
-        ]
-        self.ad_fields["cubierta"]["value"] = 1 * any(
-            "Cubierta" in feature for feature in features
-        )
-        self.ad_fields["puerta_auto"]["value"] = 1 * any(
-            "Puerta" in feature for feature in features
-        )
-        self.ad_fields["ascensor"]["value"] = 1 * any(
-            "ascensor" in feature for feature in features
-        )
-        self.ad_fields["alarma"]["value"] = 1 * any(
-            "Alarma" in feature for feature in features
-        )
-        self.ad_fields["circuito"]["value"] = 1 * any(
-            "Cámaras" in feature for feature in features
-        )
-        self.ad_fields["personal"]["value"] = 1 * any(
-            "Personal" in feature for feature in features
-        )
-        self.ad_fields["cubierta"]["found"] = True
-        self.ad_fields["puerta_auto"]["found"] = True
-        self.ad_fields["ascensor"]["found"] = True
-        self.ad_fields["alarma"]["found"] = True
-        self.ad_fields["circuito"]["found"] = True
-        self.ad_fields["personal"]["found"] = True
-        if soup.find("p", {"class": "txt-bold _browserPhone icon-phone"}) is not None:
-            self.ad_fields["telefono"]["value"] = soup.find(
-                "p", {"class": "txt-bold _browserPhone icon-phone"}
-            ).text.replace(" ", "")
-            self.ad_fields["telefono"]["found"] = True
-
-    def _validate(self) -> None:
-        """
-        Checks whether the extracted values are valid against the expected
-        typology. Stores the results.
-        :return: None
-        """
-        self.invalid_fields = []
-        if not re.match(r"[0-9]{4,20}", self.ad_fields["referencia"]["value"]):
-            self.invalid_fields.append("referencia")
-        if not re.match(r"[0-9]{1,20}", self.ad_fields["precio"]["value"]):
-            self.invalid_fields.append("precio")
-        possible_values_tamano = [
-            "2 coches o más",
-            "coche y moto",
-            "coche grande",
-            "coche pequeño",
-            "moto",
-            None,
-        ]
-        if self.ad_fields["tamano_categorico"]["value"] not in possible_values_tamano:
-            self.invalid_fields.append("tamano_categorico")
-        if not "Barrio" in self.ad_fields["barrio"]["value"]:
-            self.invalid_fields.append("barrio")
-        if not "Distrito" in self.ad_fields["distrito"]["value"]:
-            self.invalid_fields.append("distrito")
-        if self.ad_fields["telefono"]["found"] and not re.match(
-            r"\s*\+?[0-9\s]*", self.ad_fields["telefono"]["value"]
-        ):
-            self.invalid_fields.append("telefono")
-        # TODO add + to the valid characters
-
-    def all_fields_are_valid(self) -> bool:
-        """
-        Reports on whether the extracted data is valid.
-        :return: True if values are valid, false if not
-        """
-        self._validate()
-        if self.invalid_fields:
-            return False
-        else:
-            return True
-
-    def fields_missing(self) -> bool:
-        """
-        Reports on whether all compulsory fields are present.
-        :return: True if some field is missing, false if not
-        """
-        for key, contents in self.ad_fields.items():
-            if not contents["optional"] and not contents["found"]:
-                return True
-        return False
-
-    def get_data(self) -> dict:
-        """
-        Returns the extracted data in the form of a dictionary.
-        :return: dictionary with the extracted data
-        """
-        data = {}
-        for ad_field in self.ad_fields.keys():
-            data[ad_field] = self.ad_fields[ad_field]["value"]
-        return data
 if __name__ == "__main__":
     capturing_tasks_interface = CapturingTasksInterface()
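
The bulk of the deletions is AdHtmlParser, whose parse/validate/report-missing responsibilities have evidently moved to per-field parsing-flow instructions; the tests in the third file exercise that replacement. A minimal sketch of the instruction interface those tests imply: the attribute and method names (found, valid, value, search_issue, is_optional, validate) are taken from the tests below, everything else is an assumption.

import re
from typing import Optional

class FieldInstructions:
    def __init__(self, pattern: str, is_optional: bool = False) -> None:
        self.pattern = pattern
        self.is_optional = is_optional
        self.found = False
        self.valid: Optional[bool] = None
        self.value: Optional[str] = None
        self.search_issue: Optional[str] = None

    def search(self, text: str) -> None:
        match = re.search(self.pattern, text)
        if match:
            self.found = True
            self.value = match.group()
        else:
            self.search_issue = "pattern not found"

    def validate(self) -> None:
        # Validity stays None until something was actually extracted,
        # matching the found-is-False branch asserted in the tests below.
        if self.found:
            self.valid = bool(self.value)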

View file

@@ -92,7 +92,7 @@ class UrlAttack:
             if self.response.ok:
                 self.success = True
-        except Exception as e:
+        except Exception:
             self.success = False
         if (
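
The only change in this file drops the unused exception binding, which linters flag (flake8 F841). A standalone illustration of the pattern; requests is assumed from the response.ok attribute in context, and the url parameter and timeout are invented.

import requests

def fetch_ok(url: str) -> bool:
    try:
        return requests.get(url, timeout=10).ok
    except Exception:  # no "as e": the exception object is never inspected
        return False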

View file

@@ -2518,8 +2518,8 @@ def test_referencia_instructions_extract_correctly(real_ad_html):
     referencia_instructions.validate()
     assert (
-        referencia_instructions.found == True
-        and referencia_instructions.valid == True
+        referencia_instructions.found is True
+        and referencia_instructions.valid is True
         and referencia_instructions.value is not None
         and referencia_instructions.search_issue is None
     )
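
The switch from == to is tightens the assertions: equality passes for anything that compares equal to True, identity only for the bool singleton itself. A self-contained demonstration:

flag = 1
assert flag == True      # passes: in Python, the int 1 equals True
assert flag is not True  # also passes: flag is the int 1, not the bool True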
@@ -2534,7 +2534,7 @@ def test_referencia_instructions_find_nothing_in_unrelated_html(unrelated_html):
     referencia_instructions.validate()
     assert (
-        referencia_instructions.found == False
+        referencia_instructions.found is False
         and referencia_instructions.valid is None
         and referencia_instructions.value is None
         and referencia_instructions.search_issue is not None
@@ -2580,8 +2580,8 @@ def test_all_instructions_extract_correctly(real_ad_html):
     assert all(
         [
-            instruction.found == True
-            and instruction.valid == True
+            instruction.found is True
+            and instruction.valid is True
             and instruction.value is not None
             and instruction.search_issue is None
             for instruction in all_instructions
@@ -2628,8 +2628,8 @@ def test_all_instructions_fail_on_unrelated_html(unrelated_html):
     assert all(
         [
-            instruction.found == False
-            and (instruction.valid == False or instruction.valid == None)
+            instruction.found is False
+            and (instruction.valid is False or instruction.valid is None)
             and instruction.value is None
             for instruction in all_instructions
         ]
@@ -2725,7 +2725,7 @@ def test_parsing_flow_fails_for_unrelated_html(unrelated_html):
     assert not parsing_flow.all_non_optional_fields_were_found and len(
         parsing_flow.issues
-    ) == len(all_instructions)
+    ) == len([field for field in all_instructions if not field.is_optional])

 def test_parsing_flow_generator_returns_proper_flows():
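
The reworked expectation counts only non-optional fields as issues. A self-contained illustration of the new arithmetic; Instr and the three-element list are invented stand-ins for the test fixtures.

class Instr:
    def __init__(self, is_optional: bool) -> None:
        self.is_optional = is_optional

all_instructions = [Instr(False), Instr(True), Instr(False)]
expected = len([field for field in all_instructions if not field.is_optional])
assert expected == 2  # the optional field no longer contributes an issue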