import datetime
import logging
from time import sleep
from typing import Callable, Type

from db_layer.capturing_tasks_interface import CapturingTasksInterface
from db_layer.capturas_interface import CapturasInterface
from core.scrapping_utils import UrlAttack
from core.config import working_hours, minimum_seconds_between_tries
from core.throttling_utils import (
    ThrottleManager,
    WorkingHoursThrottlingRule,
    CooldownThrottlingRule,
    DynamicThrottlingRule,
)
from refresher.refresher import Refresher
from core.parsing_utils import *


class Capturer:
    """
    Daemon implementing the full flow of requesting individual ads, scraping
    their data and storing the results in the database.
    """

    def __init__(
        self,
        throttling_manager: ThrottleManager,
        capturing_tasks_interface: CapturingTasksInterface,
        capturas_interface: CapturasInterface,
        parsing_flow_generator: ParsingFlowGenerator,
        url_acquisition_object: Type[UrlAttack],
        dead_ad_checker: Callable,
    ) -> None:
        self._throttling_manager = throttling_manager
        self._capturing_tasks_interface = capturing_tasks_interface
        self._capturas_interface = capturas_interface
        self._parsing_flow_generator = parsing_flow_generator
        self._url_acquisition_object = url_acquisition_object
        self._dead_ad_checker = dead_ad_checker
        self.last_try_datetime = datetime.datetime.now()

    def start(self) -> None:
        """
        Full flow of execution. Checks whether it should capture a URL,
        tries to do so and stores the result if successful.

        :return: None
        """
        logging.info("Starting capturer")
        while True:
            while not self._throttling_manager.allow_next_task(
                last_attempt_timestamp=self.last_try_datetime
            ):
                sleep(10)
                logging.info("Waiting...")
            pending_task = self._capturing_tasks_interface.get_pending_task()
            logging.info("Got a task")
            task = CapturingTask(
                pending_task,
                capturing_interface=self._capturing_tasks_interface,
                new_parsing_flow=self._parsing_flow_generator.get_new_flow(),
                url_acquisition_object=self._url_acquisition_object,
                dead_ad_checker=self._dead_ad_checker,
            )
            self.last_try_datetime = datetime.datetime.now()
            task.capture()
            if task.status == "Data ready":
                ad_data = task.get_ad_data()
            else:
                logging.warning("Something went wrong, not adding data.")
                continue
            self._capturas_interface.insert_captura(ad_data)
            task.update_status("Captura inserted")
            logging.info("New ad inserted.")
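
# A minimal sketch of the callable contract expected for dead_ad_checker: it
# receives the raw response text and returns a truthy value when the page
# reports the listing as gone. The production implementation is
# Refresher.dead_ad_checker; the marker string below is a hypothetical
# placeholder, not the real site's wording.
def _example_dead_ad_checker(response_text: str) -> bool:
    return "anuncio no disponible" in response_text.lower()
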
class CapturingTask:
    """
    Task object wrapping the process of attempting to capture an ad,
    parsing the data and sending it to the database.
    """

    sleep_time_failed_request = 180

    def __init__(
        self,
        task_parameters: dict,
        capturing_interface: CapturingTasksInterface,
        new_parsing_flow: ParsingFlow,
        url_acquisition_object: Type[UrlAttack],
        dead_ad_checker: Callable,
    ) -> None:
        """
        Initialize with task parameters and mark the task as being worked on
        in the task queue.

        :param task_parameters: dict with the necessary parameters for the task
        """
        self.uuid = task_parameters["uuid"]
        self.ad_url = task_parameters["ad_url"]
        self.uuid_exploring = task_parameters["fk_uuid_exploring"]
        self.status = task_parameters["status"]
        self.request_failures = 1
        self.html = None
        self._parsing_flow = new_parsing_flow
        self._capturing_interface = capturing_interface
        self._url_acquisition_object = url_acquisition_object
        self._is_dead_ad = dead_ad_checker
        self.update_status("Loading")

    def update_status(self, new_status) -> None:
        """
        Updates the task status and persists it in the task queue.

        :param new_status: string describing the new status
        :return: None
        """
        self.status = new_status
        self._capturing_interface.update_capturing_task(
            self.uuid, self.uuid_exploring, self.status, self.ad_url
        )

    def capture(self) -> None:
        """
        Main flow of work: request the ad URL, retrying on failure, and parse
        the HTML once a response is obtained.
        """
        self.update_status("WIP")
        while self.request_failures < 4:
            attack = self._url_acquisition_object(self.ad_url)
            attack.attack()
            if attack.success:
                self._parse_html(html=attack.get_text())
                return
            # The request failed: check whether the ad is simply gone before
            # counting this as a retryable failure.
            try:
                if self._is_dead_ad(attack.get_text()):
                    self.update_status("Dead ad")
                    return
            except AttributeError:
                logging.error("Something went wrong when checking if the ad is gone")
            self.update_status(f"Fail {self.request_failures}")
            self.request_failures += 1
            sleep(CapturingTask.sleep_time_failed_request)
        self.update_status("Surrender")
        logging.warning(f"A task has surrendered. {self.ad_url}")

    def _extract_data(self) -> None:
        """
        Parses the obtained HTML to extract the ad information.

        :return: None
        """
        self.parser = AdHtmlParser(self.html)
        self.parser.parse()

    def _check_data(self) -> None:
        """
        Validates that all compulsory fields have been obtained and that the
        values are within the expected ranges. Sets the status of the task
        accordingly.

        :return: None
        """
        if self.parser.fields_missing():
            self.update_status("Fields missing")
            return
        if not self.parser.all_fields_are_valid():
            self.update_status("Invalid value fields")
            return
        self.update_status("Data ready")

    def get_ad_data(self) -> dict:
        """
        Returns the extracted data.

        :return: dictionary with the data of the ad.
        """
        return self._parsing_flow.field_values

    def _parse_html(self, html: str) -> None:
        self._parsing_flow.execute_flow(soup=BeautifulSoup(html, "html5lib"))
        if not self._parsing_flow.issues:
            self.update_status("Data ready")
            return
        if not self._parsing_flow.all_found_fields_are_valid:
            self.update_status("Invalid value fields")
            logging.warning(f"Invalid fields found in ad: {self.ad_url}")
            logging.warning(f"{self._parsing_flow.issues}")
            return
        if not self._parsing_flow.all_non_optional_fields_were_found:
            self.update_status("Fields missing")
            logging.warning(
                f"Couldn't scrape necessary fields: {self._parsing_flow.issues}"
            )
            return
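
# For reference, the pending-task dict consumed by CapturingTask.__init__
# carries at least the four keys read above. A hypothetical example (the
# values are placeholders, not real data):
#
#     {
#         "uuid": "00000000-0000-0000-0000-000000000000",
#         "ad_url": "https://www.example.com/anuncio/12345678/",
#         "fk_uuid_exploring": "00000000-0000-0000-0000-000000000001",
#         "status": "Pending",
#     }
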
class AdHtmlParser:
    """
    Object for parsing, storing and validating the data of the HTML of an ad.
    """

    def __init__(self, html_string: str) -> None:
        """
        Initializes an instance of the parser with the HTML of an ad.

        :param html_string: the full HTML code of the ad page
        """
        self.html = html_string
        self.ad_fields = {
            "referencia": {"found": False, "optional": False, "value": None},
            "precio": {"found": False, "optional": False, "value": None},
            "tamano_categorico": {"found": False, "optional": True, "value": None},
            "m2": {"found": False, "optional": True, "value": None},
            "tipo_anuncio": {"found": False, "optional": False, "value": None},
            "calle": {"found": False, "optional": True, "value": None},
            "barrio": {"found": False, "optional": False, "value": None},
            "distrito": {"found": False, "optional": False, "value": None},
            "ciudad": {"found": False, "optional": False, "value": None},
            "cubierta": {"found": False, "optional": False, "value": None},
            "puerta_auto": {"found": False, "optional": False, "value": None},
            "ascensor": {"found": False, "optional": False, "value": None},
            "alarma": {"found": False, "optional": False, "value": None},
            "circuito": {"found": False, "optional": False, "value": None},
            "personal": {"found": False, "optional": False, "value": None},
            "telefono": {"found": False, "optional": True, "value": None},
        }

    def parse(self) -> None:
        """
        Parses the HTML and stores the ad data.

        :return: None
        """
        soup = BeautifulSoup(self.html, "html5lib")
        # find_all always returns a list, so check for a non-empty result
        # rather than comparing against None.
        canonical_links = soup.find_all("link", {"rel": "canonical"})
        if canonical_links:
            self.ad_fields["referencia"]["value"] = re.findall(
                r"[0-9]{5,20}", str(canonical_links[0])
            )[0]
            self.ad_fields["referencia"]["found"] = True
        price_tags = soup.find_all("strong", {"class": "price"})
        if price_tags:
            self.ad_fields["precio"]["value"] = "".join(
                re.findall(r"[0-9]", str(price_tags[0]))
            )
            self.ad_fields["precio"]["found"] = True
        info_features = soup.find("div", {"class": "info-features"})
        if info_features is not None:
            try:
                first_span_text = info_features.find("span").find("span").text
                if "m²" not in first_span_text:
                    self.ad_fields["tamano_categorico"]["value"] = first_span_text
                    self.ad_fields["tamano_categorico"]["found"] = True
            except AttributeError:
                # Nested spans are missing; the categorical size is optional.
                pass
            posible_m2 = [tag.text for tag in info_features.find_all("span")]
            if [posible for posible in posible_m2 if "m²" in posible]:
                self.ad_fields["m2"]["value"] = [
                    "".join(re.findall(r"[0-9]+,*[0-9]*", posible))
                    for posible in posible_m2
                    if "m²" in posible
                ][0].replace(",", ".")
                self.ad_fields["m2"]["found"] = True
        if soup.find("title") is not None:
            if "venta" in soup.find("title").text:
                self.ad_fields["tipo_anuncio"]["value"] = 1
            else:
                self.ad_fields["tipo_anuncio"]["value"] = 2
            self.ad_fields["tipo_anuncio"]["found"] = True
        header_map = soup.find("div", {"id": "headerMap"})
        if header_map is not None:
            location_items = header_map.find_all("li")
            if len(location_items) > 3:
                self.ad_fields["calle"]["value"] = ""
                self.ad_fields["ciudad"]["value"] = location_items[-2].text.strip()
                self.ad_fields["ciudad"]["found"] = True
                self.ad_fields["distrito"]["value"] = location_items[-3].text.strip()
                self.ad_fields["distrito"]["found"] = True
                self.ad_fields["barrio"]["value"] = location_items[-4].text.strip()
                self.ad_fields["barrio"]["found"] = True
            if len(location_items) > 4:
                self.ad_fields["calle"]["value"] = location_items[0].text.strip()
                self.ad_fields["calle"]["found"] = True
        features_lists = soup.find_all("div", {"class": "details-property_features"})
        features = [
            feature.text
            for feature_list in features_lists
            for feature in feature_list.find_all("li")
        ]
        # Secondary boolean features: 1 if the keyword appears in any feature.
        secondary_features = {
            "cubierta": "Cubierta",
            "puerta_auto": "Puerta",
            "ascensor": "ascensor",
            "alarma": "Alarma",
            "circuito": "Cámaras",
            "personal": "Personal",
        }
        for field_name, keyword in secondary_features.items():
            self.ad_fields[field_name]["value"] = int(
                any(keyword in feature for feature in features)
            )
            self.ad_fields[field_name]["found"] = True
        phone_tag = soup.find("p", {"class": "txt-bold _browserPhone icon-phone"})
        if phone_tag is not None:
            self.ad_fields["telefono"]["value"] = phone_tag.text.replace(" ", "")
            self.ad_fields["telefono"]["found"] = True
    def _validate(self) -> None:
        """
        Checks whether the extracted values are valid against the expected
        typology. Stores the results.

        :return: None
        """
        self.invalid_fields = []
        if not re.match(r"[0-9]{4,20}", self.ad_fields["referencia"]["value"]):
            self.invalid_fields.append("referencia")
        if not re.match(r"[0-9]{1,20}", self.ad_fields["precio"]["value"]):
            self.invalid_fields.append("precio")
        possible_values_tamano = [
            "2 coches o más",
            "coche y moto",
            "coche grande",
            "coche pequeño",
            "moto",
            None,
        ]
        if self.ad_fields["tamano_categorico"]["value"] not in possible_values_tamano:
            self.invalid_fields.append("tamano_categorico")
        if "Barrio" not in self.ad_fields["barrio"]["value"]:
            self.invalid_fields.append("barrio")
        if "Distrito" not in self.ad_fields["distrito"]["value"]:
            self.invalid_fields.append("distrito")
        if self.ad_fields["telefono"]["found"] and not re.match(
            r"\s*\+?[0-9\s]*", self.ad_fields["telefono"]["value"]
        ):
            self.invalid_fields.append("telefono")
            # TODO: add + to the set of valid characters

    def all_fields_are_valid(self) -> bool:
        """
        Reports on whether the extracted data is valid.

        :return: True if all values are valid, False if not
        """
        self._validate()
        return not self.invalid_fields

    def fields_missing(self) -> bool:
        """
        Reports on whether all compulsory fields are present.

        :return: True if some compulsory field is missing, False if not
        """
        for contents in self.ad_fields.values():
            if not contents["optional"] and not contents["found"]:
                return True
        return False

    def get_data(self) -> dict:
        """
        Returns the extracted data in the form of a dictionary.

        :return: dictionary with the extracted data
        """
        return {field: contents["value"] for field, contents in self.ad_fields.items()}
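
# A quick, illustrative driver for AdHtmlParser (the HTML fragment below is
# made up for the example; real ad pages carry far more markup):
#
#     parser = AdHtmlParser(
#         "<html><head><title>Garaje en venta</title></head><body></body></html>"
#     )
#     parser.parse()
#     parser.fields_missing()  # True: most compulsory fields are absent
#     parser.get_data()        # dict mapping field name -> extracted value
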
if __name__ == "__main__":
    capturing_tasks_interface = CapturingTasksInterface()
    capturas_interface = CapturasInterface()
    throttling_manager = ThrottleManager()
    throttling_manager.add_rule(WorkingHoursThrottlingRule(working_hours)).add_rule(
        CooldownThrottlingRule(minimum_seconds_between_tries),
        required_argument_names=["last_attempt_timestamp"],
    ).add_rule(
        DynamicThrottlingRule(
            lambda: bool(capturing_tasks_interface.get_pending_task())
        )
    )
    parsing_flow_generator = ParsingFlowGenerator(
        ParsingFlow,
        (
            (ReferenciaFieldInstructions, {}),
            (PrecioFieldInstructions, {}),
            (TamanoCategoricoFieldInstructions, {}),
            (M2FieldInstructions, {}),
            (TipoAnuncioFieldInstructions, {}),
            (CalleFieldInstructions, {}),
            (BarrioFieldInstructions, {}),
            (DistritoFieldInstructions, {}),
            (CiudadFieldInstructions, {}),
            (
                SecondaryFeaturesFieldInstructions,
                {"field_name": "cubierta", "search_keyword": "Cubierta"},
            ),
            (
                SecondaryFeaturesFieldInstructions,
                {"field_name": "puerta_auto", "search_keyword": "Puerta"},
            ),
            (
                SecondaryFeaturesFieldInstructions,
                {"field_name": "ascensor", "search_keyword": "ascensor"},
            ),
            (
                SecondaryFeaturesFieldInstructions,
                {"field_name": "alarma", "search_keyword": "Alarma"},
            ),
            (
                SecondaryFeaturesFieldInstructions,
                {"field_name": "circuito", "search_keyword": "Cámaras"},
            ),
            (
                SecondaryFeaturesFieldInstructions,
                {"field_name": "personal", "search_keyword": "Personal"},
            ),
            (TelefonoFieldInstructions, {}),
        ),
    )
    capturer = Capturer(
        throttling_manager=throttling_manager,
        capturing_tasks_interface=capturing_tasks_interface,
        capturas_interface=capturas_interface,
        parsing_flow_generator=parsing_flow_generator,
        url_acquisition_object=UrlAttack,
        dead_ad_checker=Refresher.dead_ad_checker,
    )
    capturer.start()