Typing, docstrings and formatting of explorer.py

This commit is contained in:
pablo 2020-11-03 21:55:09 +01:00
parent f53a65834b
commit a61fac72f7

View file

@ -8,6 +8,7 @@ from time import sleep
from bs4 import BeautifulSoup
import re
from random import randint
import mysql.connector
from core.mysql_wrapper import get_anunciosdb, get_tasksdb
from core.config import monthly_new_ads_target, working_hours
from core.scrapping_utils import UrlAttack
@ -18,12 +19,20 @@ import logging
class Explorer:
"""
Daemon with the full flow of execution of generating a listing page url,
requesting the page, scraping the ad references and storing logs in the
task database
"""
sleep_time_no_work = 60
sleep_time_no_service = 600
ad_types = {"1": "alquiler", "2": "venta"}
def __init__(self):
def __init__(self) -> None:
"""
Connect to database and set up initial parameters.
"""
try:
self.anunciosdb = get_anunciosdb()
self.tasksdb = get_tasksdb()
@ -35,23 +44,28 @@ class Explorer:
self.max_queue_retries = 3
self.queue_retries = 0
def start(self):
def start(self) -> None:
"""
Full flow of execution. Checks whether it should capture a URL, tries
to do so and stores the result if successful.
:return: None
"""
logging.info("Starting explorer")
while True:
if not self.there_is_work():
if not self._is_there_work():
print("{}: Waiting. No work".format(datetime.datetime.now()))
sleep(Explorer.sleep_time_no_work)
continue
logging.info("Waiting")
if not self.database_is_up():
if not self._database_is_up():
alert_master(
"SQL DOWN",
"El explorer informa de que SQL esta caida. Actividad detenida",
)
self.stop()
raise ConnectionError("Unable to connect to database")
current_task = ExploringTask(self.compose_listing_url())
current_task = ExploringTask(self._compose_listing_url())
current_task.explore()
logging.info("Exploring task done...")
@ -66,33 +80,32 @@ class Explorer:
continue
def stop(self):
# TODO Detener el servicio
# Detener el servicio
pass
def there_is_work(self):
def _is_there_work(self) -> bool:
"""
Funcion que agrupa las condiciones que se deben cumplir para poder trabajar
Checks whether it should try to scrape a listing page according to
limits and cooldowns.
:return: True if it should work, false otherwise
"""
if self.check_if_recent_task():
return False
if not self.in_working_hours():
return False
if (
self.get_referencias_acquired_today()
>= self.get_max_referencias_for_today()
if any(
[
self._check_if_recent_task(),
not self._in_working_hours(),
(
self._get_referencias_acquired_today()
>= self._get_max_referencias_for_today()
),
(self._get_tasks_created_today() >= self._get_max_tasks_today()),
]
):
return False
if self.get_tasks_created_today() >= self.get_max_tasks_today():
return False
return True
def database_is_up(self):
def _database_is_up(self) -> bool:
"""
Checks whether the db is reachable with some retries.
:return: True if db is reachable, false if not
"""
while self.db_retries <= self.max_db_retries:
try:
self.anunciosdb.ping()
@ -104,16 +117,22 @@ class Explorer:
return False
def in_working_hours(self):
@staticmethod
def _in_working_hours() -> bool:
"""
Checks whether now is within the working hours of the daemon.
:return: True if so, false if not
"""
return (
working_hours["start"]
<= datetime.datetime.now().time()
<= working_hours["end"]
)
def get_referencias_acquired_today(self):
def _get_referencias_acquired_today(self) -> int:
"""
Cuenta cuantas nuevas referencias han aparecido en las ultimas 24 horas
Queries the database to obtain the count of scraped ads in the last 24h.
:return: the resulting count
"""
query_statement = """ SELECT count(referencia)
@ -125,10 +144,11 @@ class Explorer:
return cursor_result.fetchone()[0]
def get_max_referencias_for_today(self):
def _get_max_referencias_for_today(self) -> float:
"""
Calcula la cantidad objetivo para las ultimas 24 horas en base a la
diferencia con el objetivo mensual
Queries the database for the number of captured ads in the last 30 days
and computes the max number of ad references to obtain today.
:return: the max number of references
"""
query_statement = """ SELECT count(referencia)
FROM primera_captura_full
@ -144,9 +164,11 @@ class Explorer:
return max_referencias
def get_tasks_created_today(self):
def _get_tasks_created_today(self) -> int:
"""
Mira en el task log cuantas tareas se han iniciado en las ultimas 24 horas
Queries the database for the number of exploring tasks created in the
last 24h, returns it.
:return: number of exploring tasks created
"""
query_statement = """ SELECT count(uuid)
FROM exploring_tasks_logs
@ -158,16 +180,17 @@ class Explorer:
return tasks_created_today
def get_max_tasks_today(self):
def _get_max_tasks_today(self) -> float:
"""
Calcula el maximo diario de intentos en forma de tareas, en base al
maximo de capturas mas un multiplicador
Computes the current task goal
:return: max current tasks target
"""
return (self.get_max_referencias_for_today() / 30) * 6
return (self._get_max_referencias_for_today() / 30) * 6
def check_if_recent_task(self):
def _check_if_recent_task(self) -> int:
"""
Mira si se ha creado alguna tarea recientemente
Queries the db for the number of tasks created in the last 10 minutes.
:return: the number of recently created tasks
"""
query_statement = """ SELECT count(uuid)
FROM exploring_tasks_logs
@ -178,10 +201,11 @@ class Explorer:
return cursor_result.fetchone()[0]
def compose_listing_url(self):
@staticmethod
def _compose_listing_url() -> str:
"""
Genera URLs de manera aleatoria
:return:
Generates a listing page URL randomly.
:return: the listing page URL
"""
root = "https://www.idealista.com/"
type = Explorer.ad_types[str(randint(1, 2))]
@ -204,38 +228,58 @@ class Explorer:
class ExploringTask:
def __init__(self, url):
"""
Task object wrapping the process of attempting to capture a listing page,
parsing the ad references and sending to db.
"""
def __init__(self, url: str) -> None:
"""
Initialize with task parameters and mark the task as being worked on
in the task queue.
:param url: string with the listing page url to be captured
"""
self.anunciosdb = get_anunciosdb()
self.tasksdb = get_tasksdb()
self.target_url = url
self.id = str(uuid.uuid4())
self._update_status("Pending")
def _update_status(self, new_status):
def _update_status(self, new_status: str) -> None:
"""
Updates the task status and persists it in the task queue.
:param new_status: string describing the new status
:return: None
"""
self.status = new_status
self._log_in_tasksdb()
def explore(self):
def explore(self) -> None:
"""
Main flow of work.
:return: None
"""
attack = UrlAttack(self.target_url)
attack.attack()
self._update_status("Attacked")
if attack.success:
self._validate_referencias(attack.get_text())
self._extract_referencias(attack.get_text())
if self.referencias:
self._update_status("Referencias ready")
elif self.there_are_referencias:
self._update_status("Failure - No new referencias in HTML")
else:
self._update_status("Failure - HTML with no referencias")
else:
if not attack.success:
self._update_status("Failure - Bad request")
return
def _log_in_tasksdb(self):
self._validate_referencias(attack.get_text())
self._extract_referencias(attack.get_text())
if self.referencias:
self._update_status("Referencias ready")
elif self.there_are_referencias:
self._update_status("Failure - No new referencias in HTML")
else:
self._update_status("Failure - HTML with no referencias")
def _log_in_tasksdb(self) -> None:
"""
Graba en la base de datos de tareas un registro con el UUID de la tarea,
un timestamp y el status
Logs status in the task db.
:return: None
"""
query_statement = """INSERT INTO exploring_tasks_logs
@ -246,10 +290,11 @@ class ExploringTask:
self.tasksdb.query(query_statement, query_parameters)
def _validate_referencias(self, html):
def _validate_referencias(self, html: str) -> None:
"""
Comprueba que las etiquetas sigan el formato de un anuncio.
Lanza una advertencia si no es así.
Checks that the ad references are in the HTML code.
:param html: string with HTML code of the listing page
:return: None
"""
soup = BeautifulSoup(html, "html5lib")
ads = soup.find_all(class_="item")
@ -267,10 +312,11 @@ class ExploringTask:
)
break
def _extract_referencias(self, html):
def _extract_referencias(self, html: str) -> None:
"""
Saca referencias de HTML, descarta las que ya exiten en la base de datos
de capturas, y guarda si han aparecido listings y si hay alguno nuevo
Scrapes the ad references out of the HTML code and stores them.
:param html: string with HTML code of the listing page
:return: None
"""
soup = BeautifulSoup(html, "html5lib")
@ -281,9 +327,11 @@ class ExploringTask:
if self._is_new_listing(ad["data-adid"]):
self.referencias.append(ad["data-adid"])
def _is_new_listing(self, referencia):
def _is_new_listing(self, referencia: str) -> bool:
"""
Comprueba si el listing ya existe en la base de datos de anuncios
Checks if an ad reference already exists in the db.
:param referencia:
:return: True if it is new, false if not
"""
query_statement = """SELECT count(referencia)
FROM capturas
@ -297,14 +345,12 @@ class ExploringTask:
else:
return True
def get_referencias(self):
def get_referencias(self) -> list:
"""
Devuelve las referencias, si las hay
Gets the references.
:return: list of ad references
"""
if self.referencias:
return self.referencias
else:
return None
return self.referencias
if __name__ == "__main__":