diff --git a/capturer/capturer.py b/capturer/capturer.py
index 3b25056..7397af6 100644
--- a/capturer/capturer.py
+++ b/capturer/capturer.py
@@ -1,13 +1,10 @@
 import sys
-sys.path.append("..")
 
 from time import sleep
 from bs4 import BeautifulSoup
-import re
 import datetime
+from typing import Callable, Type
 
-from db_layer.capturing_tasks_interface import capturing_interface
-from db_layer.capturas_interface import capturas_interface
+from db_layer.capturing_tasks_interface import CapturingTasksInterface
+from db_layer.capturas_interface import CapturasInterface
 from core.scrapping_utils import UrlAttack
 from core.config import working_hours, minimum_seconds_between_tries
 from core.throttling_utils import (
@@ -17,6 +14,7 @@ from core.throttling_utils import (
     DynamicThrottlingRule,
 )
 from refresher.refresher import Refresher
+from core.parsing_utils import *
 import logging
@@ -26,8 +24,22 @@ class Capturer:
     """
     scraping and db storage.
     """
-    def __init__(self, throttling_manager: ThrottleManager) -> None:
+    def __init__(
+        self,
+        throttling_manager: ThrottleManager,
+        capturing_tasks_interface: CapturingTasksInterface,
+        capturas_interface: CapturasInterface,
+        parsing_flow_generator: ParsingFlowGenerator,
+        url_acquisition_object: Type[UrlAttack],
+        dead_ad_checker: Callable,
+    ) -> None:
         self._throttling_manager = throttling_manager
+        self._capturing_tasks_interface = capturing_tasks_interface
+        self._capturas_interface = capturas_interface
+        self._parsing_flow_generator = parsing_flow_generator
+        self._url_acquisition_object = url_acquisition_object
+        self._dead_ad_checker = dead_ad_checker
+
         self.last_try_datetime = datetime.datetime.now()
 
     def start(self) -> None:
@@ -46,11 +58,17 @@ class Capturer:
                 sleep(10)
                 logging.info("Waiting...")
 
-            pending_task = capturing_interface.get_pending_task()
+            pending_task = self._capturing_tasks_interface.get_pending_task()
             logging.info("Got a task")
 
-            task = CapturingTask(pending_task)
+            task = CapturingTask(
+                pending_task,
+                capturing_interface=self._capturing_tasks_interface,
+                new_parsing_flow=self._parsing_flow_generator.get_new_flow(),
+                url_acquisition_object=self._url_acquisition_object,
+                dead_ad_checker=self._dead_ad_checker,
+            )
             self.last_try_datetime = datetime.datetime.now()
             task.capture()
@@ -60,8 +78,8 @@ class Capturer:
                 logging.warning("Something went wrong, not adding data.")
                 continue
 
-            capturas_interface.insert_captura(ad_data)
-            task._update_status("Captura inserted")
+            self._capturas_interface.insert_captura(ad_data)
+            task.update_status("Captura inserted")
             logging.info("New ad inserted.")
@@ -73,29 +91,40 @@ class CapturingTask:
 
     sleep_time_failed_request = 180
 
-    def __init__(self, parameters) -> None:
+    def __init__(
+        self,
+        task_parameters: dict,
+        capturing_interface: CapturingTasksInterface,
+        new_parsing_flow: ParsingFlow,
+        url_acquisition_object: Type[UrlAttack],
+        dead_ad_checker: Callable,
+    ) -> None:
         """
         Initialize with task parameters and mark the task as being worked on
         in the task queue.
-        :param parameters: dict with the necessary parameters for the task
+        :param task_parameters: dict with the necessary parameters for the task
         """
-        self.uuid = parameters["uuid"]
-        self.ad_url = parameters["ad_url"]
-        self.uuid_exploring = parameters["fk_uuid_exploring"]
-        self.status = parameters["status"]
+        self.uuid = task_parameters["uuid"]
+        self.ad_url = task_parameters["ad_url"]
+        self.uuid_exploring = task_parameters["fk_uuid_exploring"]
+        self.status = task_parameters["status"]
         self.request_failures = 1
         self.html = None
+        self._parsing_flow = new_parsing_flow
+        self._capturing_interface = capturing_interface
+        self._url_acquisition_object = url_acquisition_object
+        self._is_dead_ad = dead_ad_checker
 
-        self._update_status("Loading")
+        self.update_status("Loading")
 
-    def _update_status(self, new_status) -> None:
+    def update_status(self, new_status) -> None:
         """
         Updates the task status and persists it in the task queue.
 
         :param new_status: string describing the new status
         :return: None
         """
         self.status = new_status
-        capturing_interface.update_capturing_task(
+        self._capturing_interface.update_capturing_task(
             self.uuid, self.uuid_exploring, self.status, self.ad_url
         )
@@ -103,34 +132,32 @@ class CapturingTask:
         """
         Main flow of work
         """
-        self._update_status("WIP")
+        self.update_status("WIP")
 
         while self.request_failures < 4:
-            attack = UrlAttack(self.ad_url)
+            attack = self._url_acquisition_object(self.ad_url)
             attack.attack()
 
             if attack.success:
-                self.html = attack.get_text()
-                self._extract_data()
-                self._check_data()
+                self._parse_html(html=attack.get_text())
                 return
 
             if not attack.success:
                 try:
-                    if Refresher.dead_ad_checker(attack.get_text()):
-                        self._update_status("Dead ad")
+                    if self._is_dead_ad(attack.get_text()):
+                        self.update_status("Dead ad")
                         return
                 except AttributeError:
                     logging.error(
                         "Something went wrong when checking if the ad is gone"
                     )
 
-                self._update_status("Fail {}".format(self.request_failures))
+                self.update_status("Fail {}".format(self.request_failures))
                 self.request_failures += 1
                 sleep(CapturingTask.sleep_time_failed_request)
                 continue
 
-        self._update_status("Surrender")
+        self.update_status("Surrender")
         logging.warning(f"A task has surrendered. {self.ad_url}")
 
     def _extract_data(self) -> None:
         """
@@ -148,21 +175,40 @@ class CapturingTask:
         :return: None
         """
         if self.parser.fields_missing():
-            self._update_status("Fields missing")
+            self.update_status("Fields missing")
             return
         if not self.parser.all_fields_are_valid():
-            self._update_status("Invalid value fields")
+            self.update_status("Invalid value fields")
             return
-        self._update_status("Data ready")
+        self.update_status("Data ready")
 
     def get_ad_data(self) -> dict:
         """
         Returns the extracted data.
 
         :return: dictionary with the data of the ad.
""" - return self.parser.get_data() + return self._parsing_flow.field_values + + def _parse_html(self, html: str) -> None: + self._parsing_flow.execute_flow(soup=BeautifulSoup(html, "html5lib")) + + if not self._parsing_flow.issues: + self.update_status("Data ready") + return + + if not self._parsing_flow.all_found_fields_are_valid: + self.update_status("Invalid value fields") + logging.warning(f"Invalid fields found in ad: {self.ad_url}") + logging.warning(f"{self._parsing_flow.issues}") + return + if not self._parsing_flow.all_non_optional_fields_were_found: + self.update_status("Fields missing") + logging.warning( + f"Couldn't scrap necessary fields: {self._parsing_flow.issues}" + ) + return class AdHtmlParser: @@ -362,7 +408,7 @@ class AdHtmlParser: else: return True - def fields_missing(self) -> None: + def fields_missing(self) -> bool: """ Reports on whether all compulsory fields are present. :return: True if some field is missing, false if not @@ -387,13 +433,65 @@ class AdHtmlParser: if __name__ == "__main__": + capturing_tasks_interface = CapturingTasksInterface() + capturas_interface = CapturasInterface() + throttling_manager = ThrottleManager() throttling_manager.add_rule(WorkingHoursThrottlingRule(working_hours)).add_rule( CooldownThrottlingRule(minimum_seconds_between_tries), required_argument_names=["last_attempt_timestamp"], ).add_rule( - DynamicThrottlingRule(lambda: bool(capturing_interface.get_pending_task())) + DynamicThrottlingRule( + lambda: bool(capturing_tasks_interface.get_pending_task()) + ) ) - capturer = Capturer(throttling_manager=throttling_manager) + parsing_flow_generator = ParsingFlowGenerator( + ParsingFlow, + ( + (ReferenciaFieldInstructions, {}), + (PrecioFieldInstructions, {}), + (TamanoCategoricoFieldInstructions, {}), + (M2FieldInstructions, {}), + (TipoAnuncioFieldInstructions, {}), + (CalleFieldInstructions, {}), + (BarrioFieldInstructions, {}), + (DistritoFieldInstructions, {}), + (CiudadFieldInstructions, {}), + ( + SecondaryFeaturesFieldInstructions, + {"field_name": "cubierta", "search_keyword": "Cubierta"}, + ), + ( + SecondaryFeaturesFieldInstructions, + {"field_name": "puerta_auto", "search_keyword": "Puerta"}, + ), + ( + SecondaryFeaturesFieldInstructions, + {"field_name": "ascensor", "search_keyword": "ascensor"}, + ), + ( + SecondaryFeaturesFieldInstructions, + {"field_name": "alarma", "search_keyword": "Alarma"}, + ), + ( + SecondaryFeaturesFieldInstructions, + {"field_name": "circuito", "search_keyword": "Cámaras"}, + ), + ( + SecondaryFeaturesFieldInstructions, + {"field_name": "personal", "search_keyword": "Personal"}, + ), + (TelefonoFieldInstructions, {}), + ), + ) + + capturer = Capturer( + throttling_manager=throttling_manager, + capturing_tasks_interface=capturing_tasks_interface, + capturas_interface=capturas_interface, + parsing_flow_generator=parsing_flow_generator, + url_acquisition_object=UrlAttack, + dead_ad_checker=Refresher.dead_ad_checker, + ) capturer.start() diff --git a/core/parsing_utils.py b/core/parsing_utils.py index fd51b00..7d81ad5 100644 --- a/core/parsing_utils.py +++ b/core/parsing_utils.py @@ -258,7 +258,7 @@ class TipoAnuncioFieldInstructions(BaseTargetFieldInstructions): if "venta" in soup.find("title").text: self.value = 1 self.found = True - if "alquiler" in soup.find("title").text: + if "Alquiler" in soup.find("title").text: self.value = 2 self.found = True @@ -542,11 +542,11 @@ class ParsingFlow: if (field.found or field.is_optional) and field.valid: continue this_field_issues = {} - if not field.found: 
+            if not field.found and not field.is_optional:
                 this_field_issues["found"] = "Not found"
             if field.search_issue:
                 this_field_issues["search_issue"] = field.search_issue
-            if not field.valid:
+            if not field.valid and field.valid is not None:
                 this_field_issues["validity"] = "Not valid"
                 this_field_issues["value"] = field.value
diff --git a/core/scrapping_utils.py b/core/scrapping_utils.py
index 67a47cf..6fff33b 100644
--- a/core/scrapping_utils.py
+++ b/core/scrapping_utils.py
@@ -95,7 +95,10 @@ class UrlAttack:
         except Exception as e:
             self.success = False
 
-        if random.randrange(0, 100) < UrlAttack.identity_change_probability:
+        if (
+            not self.success
+            or random.randrange(0, 100) < UrlAttack.identity_change_probability
+        ):
             self._change_identity()
 
     def _change_identity(self) -> None: