From a61fac72f747191ddfa56a1dadc71a306b722e13 Mon Sep 17 00:00:00 2001 From: pablo Date: Tue, 3 Nov 2020 21:55:09 +0100 Subject: [PATCH] Typing, docstrings and formatting of explorer.py --- explorer/explorer.py | 192 +++++++++++++++++++++++++++---------------- 1 file changed, 119 insertions(+), 73 deletions(-) diff --git a/explorer/explorer.py b/explorer/explorer.py index 3cad2c2..43f9eb1 100644 --- a/explorer/explorer.py +++ b/explorer/explorer.py @@ -8,6 +8,7 @@ from time import sleep from bs4 import BeautifulSoup import re from random import randint +import mysql.connector from core.mysql_wrapper import get_anunciosdb, get_tasksdb from core.config import monthly_new_ads_target, working_hours from core.scrapping_utils import UrlAttack @@ -18,12 +19,20 @@ import logging class Explorer: + """ + Daemon with the full flow of execution of generating a listing page url, + requesting the page, scraping the ad references and storing logs in the + task database + """ sleep_time_no_work = 60 sleep_time_no_service = 600 ad_types = {"1": "alquiler", "2": "venta"} - def __init__(self): + def __init__(self) -> None: + """ + Connect to database and set up initial parameters. + """ try: self.anunciosdb = get_anunciosdb() self.tasksdb = get_tasksdb() @@ -35,23 +44,28 @@ class Explorer: self.max_queue_retries = 3 self.queue_retries = 0 - def start(self): + def start(self) -> None: + """ + Full flow of execution. Checks whether it should capture a URL, tries + to do so and stores the result if successful. + :return: None + """ logging.info("Starting explorer") while True: - if not self.there_is_work(): + if not self._is_there_work(): print("{}: Waiting. No work".format(datetime.datetime.now())) sleep(Explorer.sleep_time_no_work) continue logging.info("Waiting") - if not self.database_is_up(): + if not self._database_is_up(): alert_master( "SQL DOWN", "El explorer informa de que SQL esta caida. Actividad detenida", ) - self.stop() + raise ConnectionError("Unable to connect to database") - current_task = ExploringTask(self.compose_listing_url()) + current_task = ExploringTask(self._compose_listing_url()) current_task.explore() logging.info("Exploring task done...") @@ -66,33 +80,32 @@ class Explorer: continue - def stop(self): - # TODO Detener el servicio - # Detener el servicio - pass - - def there_is_work(self): + def _is_there_work(self) -> bool: """ - Funcion que agrupa las condiciones que se deben cumplir para poder trabajar + Checks whether it should try to scrap a listing page according to + limits and cooldowns. + :return: True if it should work, false otherwise """ - if self.check_if_recent_task(): - return False - - if not self.in_working_hours(): - return False - - if ( - self.get_referencias_acquired_today() - >= self.get_max_referencias_for_today() + if any( + [ + self._check_if_recent_task(), + not self._in_working_hours(), + ( + self._get_referencias_acquired_today() + >= self._get_max_referencias_for_today() + ), + (self._get_tasks_created_today() >= self._get_max_tasks_today()), + ] ): return False - if self.get_tasks_created_today() >= self.get_max_tasks_today(): - return False - return True - def database_is_up(self): + def _database_is_up(self) -> bool: + """ + Checks whether the db is reachable with some retries. + :return: True if db is reachable, false if not + """ while self.db_retries <= self.max_db_retries: try: self.anunciosdb.ping() @@ -104,16 +117,22 @@ class Explorer: return False - def in_working_hours(self): + @staticmethod + def _in_working_hours() -> None: + """ + Checks whether now is within the working hours of the daemon. + :return: True if so, false if not + """ return ( working_hours["start"] <= datetime.datetime.now().time() <= working_hours["end"] ) - def get_referencias_acquired_today(self): + def _get_referencias_acquired_today(self) -> int: """ - Cuenta cuantas nuevas referencias han aparecido en las ultimas 24 horas + Queries the database to obtain the count of scraped ads in the last 24h. + :return: the resulting count """ query_statement = """ SELECT count(referencia) @@ -125,10 +144,11 @@ class Explorer: return cursor_result.fetchone()[0] - def get_max_referencias_for_today(self): + def _get_max_referencias_for_today(self) -> float: """ - Calcula la cantidad objetivo para las ultimas 24 horas en base a la - diferencia con el objetivo mensual + Queries the database for the number of captured ads in the last 30 days + and computes the max number of ad references to obtain today. + :return: the max number of references """ query_statement = """ SELECT count(referencia) FROM primera_captura_full @@ -144,9 +164,11 @@ class Explorer: return max_referencias - def get_tasks_created_today(self): + def _get_tasks_created_today(self) -> int: """ - Mira en el task log cuantas tareas se han iniciado en las ultimas 24 horas + Queries the database for the number of exploring tasks created in the + last 24h, returns it. + :return: number of exploring tasks created """ query_statement = """ SELECT count(uuid) FROM exploring_tasks_logs @@ -158,16 +180,17 @@ class Explorer: return tasks_created_today - def get_max_tasks_today(self): + def _get_max_tasks_today(self) -> float: """ - Calcula el maximo diario de intentos en forma de tareas, en base al - maximo de capturas mas un multiplicador + Computes the current task goal + :return: max current tasks target """ - return (self.get_max_referencias_for_today() / 30) * 6 + return (self._get_max_referencias_for_today() / 30) * 6 - def check_if_recent_task(self): + def _check_if_recent_task(self) -> int: """ - Mira si se ha creado alguna tarea recientemente + Queries the db for the number of tasks created in the last 10 minutes. + :return: the number of recently created tasks """ query_statement = """ SELECT count(uuid) FROM exploring_tasks_logs @@ -178,10 +201,11 @@ class Explorer: return cursor_result.fetchone()[0] - def compose_listing_url(self): + @staticmethod + def _compose_listing_url() -> str: """ - Genera URLs de manera aleatoria - :return: + Generates a listing page URL randomly. + :return: the listing page URL """ root = "https://www.idealista.com/" type = Explorer.ad_types[str(randint(1, 2))] @@ -204,38 +228,58 @@ class Explorer: class ExploringTask: - def __init__(self, url): + """ + Task object wrapping the process of attempting to capture a listing page, + parsing the ad references and sending to db. + """ + + def __init__(self, url: str) -> None: + """ + Initialize with task parameters and mark the task as being worked on + in the task queue. + :param url: string with the listing page url to be captured + """ self.anunciosdb = get_anunciosdb() self.tasksdb = get_tasksdb() self.target_url = url self.id = str(uuid.uuid4()) self._update_status("Pending") - def _update_status(self, new_status): + def _update_status(self, new_status: str) -> None: + """ + Updates the task status and persists it in the task queue. + :param new_status: string describing the new status + :return: None + """ self.status = new_status self._log_in_tasksdb() - def explore(self): + def explore(self) -> None: + """ + Main flow of work. + :return: None + """ attack = UrlAttack(self.target_url) attack.attack() self._update_status("Attacked") - if attack.success: - self._validate_referencias(attack.get_text()) - self._extract_referencias(attack.get_text()) - if self.referencias: - self._update_status("Referencias ready") - elif self.there_are_referencias: - self._update_status("Failure - No new referencias in HTML") - else: - self._update_status("Failure - HTML with no referencias") - else: + if not attack.success: self._update_status("Failure - Bad request") + return - def _log_in_tasksdb(self): + self._validate_referencias(attack.get_text()) + self._extract_referencias(attack.get_text()) + if self.referencias: + self._update_status("Referencias ready") + elif self.there_are_referencias: + self._update_status("Failure - No new referencias in HTML") + else: + self._update_status("Failure - HTML with no referencias") + + def _log_in_tasksdb(self) -> None: """ - Graba en la base de datos de tareas un registro con el UUID de la tarea, - un timestamp y el status + Logs status in the task db. + :return: None """ query_statement = """INSERT INTO exploring_tasks_logs @@ -246,10 +290,11 @@ class ExploringTask: self.tasksdb.query(query_statement, query_parameters) - def _validate_referencias(self, html): + def _validate_referencias(self, html: str) -> None: """ - Comprueba que las etiquetas sigan el formato de un anuncio. - Lanza una advertencia si no es así. + Checks that the ad references are in the HTML code. + :param html: string with HTML code of the listing page + :return: None """ soup = BeautifulSoup(html, "html5lib") ads = soup.find_all(class_="item") @@ -267,10 +312,11 @@ class ExploringTask: ) break - def _extract_referencias(self, html): + def _extract_referencias(self, html: str) -> None: """ - Saca referencias de HTML, descarta las que ya exiten en la base de datos - de capturas, y guarda si han aparecido listings y si hay alguno nuevo + Scraps the ad references out of the HTML code and stores them. + :param html: string with HTML code of the listing page + :return: None """ soup = BeautifulSoup(html, "html5lib") @@ -281,9 +327,11 @@ class ExploringTask: if self._is_new_listing(ad["data-adid"]): self.referencias.append(ad["data-adid"]) - def _is_new_listing(self, referencia): + def _is_new_listing(self, referencia: str) -> bool: """ - Comprueba si el listing ya existe en la base de datos de anuncios + Checks if an ad reference already exists in the db. + :param referencia: + :return: True if it is new, false if not """ query_statement = """SELECT count(referencia) FROM capturas @@ -297,14 +345,12 @@ class ExploringTask: else: return True - def get_referencias(self): + def get_referencias(self) -> list: """ - Devuelve las referencias, si las hay + Gets the references. + :return: list of ad references """ - if self.referencias: - return self.referencias - else: - return None + return self.referencias if __name__ == "__main__":