From cbf1643fb56797d93451dca0acac44bb978bcb69 Mon Sep 17 00:00:00 2001
From: pablo
Date: Mon, 4 Jan 2021 22:17:40 +0100
Subject: [PATCH] Formatting, docstrings and other chores.

---
 capturer/capturer.py        | 269 +++----------------------------------
 core/scrapping_utils.py     |   2 +-
 tests/parsing_utils_test.py |  16 +--
 3 files changed, 32 insertions(+), 255 deletions(-)

diff --git a/capturer/capturer.py b/capturer/capturer.py
index 7397af6..6971cfb 100644
--- a/capturer/capturer.py
+++ b/capturer/capturer.py
@@ -1,5 +1,3 @@
-import sys
-
 from time import sleep
 import datetime
 
@@ -33,6 +31,19 @@ class Capturer:
         url_acquisition_object: Type[UrlAttack],
         dead_ad_checker: Callable,
     ) -> None:
+        """
+        Receive all required objects.
+        :param throttling_manager: decides whether a given task should
+        be started
+        :param capturing_tasks_interface: interface to interact with the tasks
+        database
+        :param capturas_interface: interface to interact with the ad database
+        :param parsing_flow_generator: an object capable of generating empty
+        parsing flows, so each task gets a fresh one
+        :param url_acquisition_object: gateway for obtaining the HTML of a URL
+        :param dead_ad_checker: callable capable of checking whether an ad is
+        dead from its HTML
+        """
         self._throttling_manager = throttling_manager
         self._capturing_tasks_interface = capturing_tasks_interface
         self._capturas_interface = capturas_interface
@@ -103,6 +114,10 @@ class CapturingTask:
         Initialize with task parameters and mark the task as being worked on
         in the task queue.
         :param task_parameters: dict with the necessary parameters for the task
+        :param capturing_interface: interface to interact with the ad database
+        :param new_parsing_flow: an empty parsing flow
+        :param url_acquisition_object: gateway for obtaining the HTML of a URL
+        :param dead_ad_checker: callable capable of checking whether an ad is dead
         """
         self.uuid = task_parameters["uuid"]
         self.ad_url = task_parameters["ad_url"]
@@ -160,30 +175,6 @@ class CapturingTask:
             self.update_status("Surrender")
             logging.warning(f"A task has surrendered. {self.ad_url}")
 
-    def _extract_data(self) -> None:
-        """
-        Parses the obtained html to extract the ad information.
-        :return: None
-        """
-        self.parser = AdHtmlParser(self.html)
-        self.parser.parse()
-
-    def _check_data(self) -> None:
-        """
-        Validates that all compulsory fields have been obtained and that the
-        values are within the expected. Sets the status of task accordingly.
-        :return: None
-        """
-        if self.parser.fields_missing():
-            self.update_status("Fields missing")
-            return
-
-        if not self.parser.all_fields_are_valid():
-            self.update_status("Invalid value fields")
-            return
-
-        self.update_status("Data ready")
-
     def get_ad_data(self) -> dict:
         """
         Returns the extracted data.
@@ -192,6 +183,12 @@ class CapturingTask:
         return self._parsing_flow.field_values
 
     def _parse_html(self, html: str) -> None:
+        """
+        Execute the complete parsing flow and report the task status depending
+        on the outcome.
+        :param html: the HTML of the ad
+        :return: None
+        """
         self._parsing_flow.execute_flow(soup=BeautifulSoup(html, "html5lib"))
 
         if not self._parsing_flow.issues:
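The docstrings added above describe a contract rather than an implementation: a parsing flow exposes execute_flow(soup=...), an issues collection, and a field_values dict, and _parse_html maps an empty issues list to the "Data ready" status. As a minimal sketch of that contract, with a hypothetical stand-in for the real flow object (which is defined outside this patch):

from bs4 import BeautifulSoup

class StubParsingFlow:
    # Hypothetical stand-in: the real parsing flow is not part of this patch.
    def __init__(self):
        self.issues = []        # filled in by execute_flow on failures
        self.field_values = {}  # filled in by execute_flow on successes

    def execute_flow(self, soup):
        # A single "instruction": find the title, record an issue otherwise.
        title = soup.find("title")
        if title is None:
            self.issues.append("title not found")
        else:
            self.field_values["title"] = title.text

flow = StubParsingFlow()
flow.execute_flow(soup=BeautifulSoup("<title>Garaje en venta</title>", "html5lib"))
status = "Data ready" if not flow.issues else "Fields missing"

Collecting failures in issues instead of raising keeps the status reporting in _parse_html linear.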
@@ -211,226 +208,6 @@
 
         return
 
 
-class AdHtmlParser:
-    """
-    Object for parsing, storing and validating the data of the HTML of an ad.
-    """
-
-    def __init__(self, html_string: str) -> None:
-        """
-        Initializes an instance of the parser with the HTML of an ad.
-        :param html_string: the full HTML code of the ad page
-        """
-        self.html = html_string
-
-        self.ad_fields = {
-            "referencia": {"found": False, "optional": False, "value": None},
-            "precio": {"found": False, "optional": False, "value": None},
-            "tamano_categorico": {"found": False, "optional": True, "value": None},
-            "m2": {"found": False, "optional": True, "value": None},
-            "tipo_anuncio": {"found": False, "optional": False, "value": None},
-            "calle": {"found": False, "optional": True, "value": None},
-            "barrio": {"found": False, "optional": False, "value": None},
-            "distrito": {"found": False, "optional": False, "value": None},
-            "ciudad": {"found": False, "optional": False, "value": None},
-            "cubierta": {"found": False, "optional": False, "value": None},
-            "puerta_auto": {"found": False, "optional": False, "value": None},
-            "ascensor": {"found": False, "optional": False, "value": None},
-            "alarma": {"found": False, "optional": False, "value": None},
-            "circuito": {"found": False, "optional": False, "value": None},
-            "personal": {"found": False, "optional": False, "value": None},
-            "telefono": {"found": False, "optional": True, "value": None},
-        }
-
-    def parse(self) -> None:
-        """
-        Parses the HTML and stores the ad data.
-        :return: None
-        """
-
-        soup = BeautifulSoup(self.html, "html5lib")
-
-        if soup.find_all("link", {"rel": "canonical"}) is not None:
-            self.ad_fields["referencia"]["value"] = re.findall(
-                r"[0-9]{5,20}", str(soup.find_all("link", {"rel": "canonical"})[0])
-            )[0]
-            self.ad_fields["referencia"]["found"] = True
-
-        if soup.find_all("strong", {"class": "price"}) is not None:
-            self.ad_fields["precio"]["value"] = "".join(
-                re.findall(
-                    r"[0-9]", str(soup.find_all("strong", {"class": "price"})[0])
-                )
-            )
-            self.ad_fields["precio"]["found"] = True
-
-        if soup.find("div", {"class": "info-features"}) is not None:
-            try:
-                if (
-                    "m²"
-                    not in soup.find("div", {"class": "info-features"})
-                    .find("span")
-                    .find("span")
-                    .text
-                ):
-                    self.ad_fields["tamano_categorico"]["value"] = (
-                        soup.find("div", {"class": "info-features"})
-                        .find("span")
-                        .find("span")
-                        .text
-                    )
-                    self.ad_fields["tamano_categorico"]["found"] = True
-            except:
-                pass
-
-            posible_m2 = [
-                tag.text
-                for tag in soup.find("div", {"class": "info-features"}).find_all("span")
-            ]
-            if [posible for posible in posible_m2 if "m²" in posible]:
-                self.ad_fields["m2"]["value"] = [
-                    "".join(re.findall(r"[0-9]+,*[0-9]*", posible))
-                    for posible in posible_m2
-                    if "m²" in posible
-                ][0].replace(",", ".")
-                self.ad_fields["m2"]["found"] = True
-
-        if soup.find("title") is not None:
-            if "venta" in soup.find("title").text:
-                self.ad_fields["tipo_anuncio"]["value"] = 1
-            else:
-                self.ad_fields["tipo_anuncio"]["value"] = 2
-            self.ad_fields["tipo_anuncio"]["found"] = True
-
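Worth noting while reviewing the deletion: BeautifulSoup's find_all never returns None — it returns a (possibly empty) list-like ResultSet — so the `is not None` guards above always pass, and an ad without a price would raise IndexError at the `[0]`. The intended guard is a truthiness check; a small demonstration:

from bs4 import BeautifulSoup

soup = BeautifulSoup("<p>no price tag here</p>", "html5lib")
prices = soup.find_all("strong", {"class": "price"})

assert prices is not None  # always true: an empty ResultSet, never None
assert not prices          # the emptiness check the guard was meant to be

if prices:                 # safe: only index into a non-empty result
    first_price = prices[0]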
"headerMap"}).find_all("li")[0].text.strip() - ) - self.ad_fields["calle"]["found"] = True - - features_lists = soup.find_all("div", {"class": "details-property_features"}) - features = [ - feature.text - for feature_list in features_lists - for feature in feature_list.find_all("li") - ] - self.ad_fields["cubierta"]["value"] = 1 * any( - "Cubierta" in feature for feature in features - ) - self.ad_fields["puerta_auto"]["value"] = 1 * any( - "Puerta" in feature for feature in features - ) - self.ad_fields["ascensor"]["value"] = 1 * any( - "ascensor" in feature for feature in features - ) - self.ad_fields["alarma"]["value"] = 1 * any( - "Alarma" in feature for feature in features - ) - self.ad_fields["circuito"]["value"] = 1 * any( - "Cámaras" in feature for feature in features - ) - self.ad_fields["personal"]["value"] = 1 * any( - "Personal" in feature for feature in features - ) - - self.ad_fields["cubierta"]["found"] = True - self.ad_fields["puerta_auto"]["found"] = True - self.ad_fields["ascensor"]["found"] = True - self.ad_fields["alarma"]["found"] = True - self.ad_fields["circuito"]["found"] = True - self.ad_fields["personal"]["found"] = True - - if soup.find("p", {"class": "txt-bold _browserPhone icon-phone"}) is not None: - self.ad_fields["telefono"]["value"] = soup.find( - "p", {"class": "txt-bold _browserPhone icon-phone"} - ).text.replace(" ", "") - self.ad_fields["telefono"]["found"] = True - - def _validate(self) -> None: - """ - Checks whether the extracted values are valid against the expected - typology. Stores the results. - :return: None - """ - self.invalid_fields = [] - - if not re.match(r"[0-9]{4,20}", self.ad_fields["referencia"]["value"]): - self.invalid_fields.append("referencia") - - if not re.match(r"[0-9]{1,20}", self.ad_fields["precio"]["value"]): - self.invalid_fields.append("precio") - - possible_values_tamano = [ - "2 coches o más", - "coche y moto", - "coche grande", - "coche pequeño", - "moto", - None, - ] - if self.ad_fields["tamano_categorico"]["value"] not in possible_values_tamano: - self.invalid_fields.append("tamano_categorico") - - if not "Barrio" in self.ad_fields["barrio"]["value"]: - self.invalid_fields.append("barrio") - - if not "Distrito" in self.ad_fields["distrito"]["value"]: - self.invalid_fields.append("distrito") - - if self.ad_fields["telefono"]["found"] and not re.match( - r"\s*\+?[0-9\s]*", self.ad_fields["telefono"]["value"] - ): - self.invalid_fields.append("telefono") - # TODO añadir + a caracteres validos - - def all_fields_are_valid(self) -> bool: - """ - Reports on whether the extracted data is valid. - :return: True if values are valid, false if not - """ - self._validate() - if self.invalid_fields: - return False - else: - return True - - def fields_missing(self) -> bool: - """ - Reports on whether all compulsory fields are present. - :return: True if some field is missing, false if not - """ - for key, contents in self.ad_fields.items(): - if not contents["optional"] and not contents["found"]: - return True - return False - - def get_data(self) -> dict: - """ - Returns the extracted data in the form of a dictionary. 
-    def get_data(self) -> dict:
-        """
-        Returns the extracted data in the form of a dictionary.
-        :return: dictionary with the extracted data
-        """
-        data = {}
-
-        for ad_field in self.ad_fields.keys():
-            data[ad_field] = self.ad_fields[ad_field]["value"]
-
-        return data
-
-
 if __name__ == "__main__":
 
     capturing_tasks_interface = CapturingTasksInterface()
diff --git a/core/scrapping_utils.py b/core/scrapping_utils.py
index 6fff33b..3ab6386 100644
--- a/core/scrapping_utils.py
+++ b/core/scrapping_utils.py
@@ -92,7 +92,7 @@ class UrlAttack:
 
             if self.response.ok:
                 self.success = True
-        except Exception as e:
+        except Exception:
             self.success = False
 
         if (
diff --git a/tests/parsing_utils_test.py b/tests/parsing_utils_test.py
index de0beeb..9d9d56d 100644
--- a/tests/parsing_utils_test.py
+++ b/tests/parsing_utils_test.py
@@ -2518,8 +2518,8 @@ def test_referencia_instructions_extract_correctly(real_ad_html):
     referencia_instructions.validate()
 
     assert (
-        referencia_instructions.found == True
-        and referencia_instructions.valid == True
+        referencia_instructions.found is True
+        and referencia_instructions.valid is True
         and referencia_instructions.value is not None
         and referencia_instructions.search_issue is None
     )
@@ -2534,7 +2534,7 @@ def test_referencia_instructions_find_nothing_in_unrelated_html(unrelated_html):
     referencia_instructions.validate()
 
     assert (
-        referencia_instructions.found == False
+        referencia_instructions.found is False
         and referencia_instructions.valid is None
         and referencia_instructions.value is None
         and referencia_instructions.search_issue is not None
@@ -2580,8 +2580,8 @@ def test_all_instructions_extract_correctly(real_ad_html):
 
     assert all(
         [
-            instruction.found == True
-            and instruction.valid == True
+            instruction.found is True
+            and instruction.valid is True
             and instruction.value is not None
             and instruction.search_issue is None
             for instruction in all_instructions
@@ -2628,8 +2628,8 @@ def test_all_instructions_fail_on_unrelated_html(unrelated_html):
 
     assert all(
         [
-            instruction.found == False
-            and (instruction.valid == False or instruction.valid == None)
+            instruction.found is False
+            and (instruction.valid is False or instruction.valid is None)
             and instruction.value is None
             for instruction in all_instructions
         ]
@@ -2725,7 +2725,7 @@ def test_parsing_flow_fails_for_unrelated_html(unrelated_html):
 
     assert not parsing_flow.all_non_optional_fields_were_found and len(
         parsing_flow.issues
-    ) == len(all_instructions)
+    ) == len([field for field in all_instructions if not field.is_optional])
 
 
 def test_parsing_flow_generator_returns_proper_flows():
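On the test changes: swapping == True for is True is more than style. bool is a subclass of int, so equality comparisons coerce and a truthy non-bool slips through, while identity pins the exact True singleton:

found = 1  # a type slip: an int where a bool was expected

assert found == True             # passes anyway: 1 == True under int coercion
assert (found is True) is False  # identity catches the slip: 1 is not the bool True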