From a61fac72f747191ddfa56a1dadc71a306b722e13 Mon Sep 17 00:00:00 2001
From: pablo <pablomartincalvo@gmail.com>
Date: Tue, 3 Nov 2020 21:55:09 +0100
Subject: [PATCH] Typing, docstrings and formatting of explorer.py

---
 explorer/explorer.py | 192 +++++++++++++++++++++++++++----------------
 1 file changed, 119 insertions(+), 73 deletions(-)

diff --git a/explorer/explorer.py b/explorer/explorer.py
index 3cad2c2..43f9eb1 100644
--- a/explorer/explorer.py
+++ b/explorer/explorer.py
@@ -8,6 +8,7 @@ from time import sleep
 from bs4 import BeautifulSoup
 import re
 from random import randint
+import mysql.connector
 from core.mysql_wrapper import get_anunciosdb, get_tasksdb
 from core.config import monthly_new_ads_target, working_hours
 from core.scrapping_utils import UrlAttack
@@ -18,12 +19,20 @@ import logging
 
 
 class Explorer:
+    """
+    Daemon with the full flow of execution of generating a listing page url,
+    requesting the page, scraping the ad references and storing logs in the
+    task database
+    """
 
     sleep_time_no_work = 60
     sleep_time_no_service = 600
     ad_types = {"1": "alquiler", "2": "venta"}
 
-    def __init__(self):
+    def __init__(self) -> None:
+        """
+        Connect to database and set up initial parameters.
+        """
         try:
             self.anunciosdb = get_anunciosdb()
             self.tasksdb = get_tasksdb()
@@ -35,23 +44,28 @@ class Explorer:
         self.max_queue_retries = 3
         self.queue_retries = 0
 
-    def start(self):
+    def start(self) -> None:
+        """
+        Full flow of execution. Checks whether it should capture a URL, tries
+        to do so and stores the result if successful.
+        :return: None
+        """
         logging.info("Starting explorer")
         while True:
-            if not self.there_is_work():
+            if not self._is_there_work():
                 print("{}: Waiting. No work".format(datetime.datetime.now()))
                 sleep(Explorer.sleep_time_no_work)
                 continue
                 logging.info("Waiting")
 
-            if not self.database_is_up():
+            if not self._database_is_up():
                 alert_master(
                     "SQL DOWN",
                     "El explorer informa de que SQL esta caida. Actividad detenida",
                 )
-                self.stop()
+                raise ConnectionError("Unable to connect to database")
 
-            current_task = ExploringTask(self.compose_listing_url())
+            current_task = ExploringTask(self._compose_listing_url())
             current_task.explore()
             logging.info("Exploring task done...")
 
@@ -66,33 +80,32 @@ class Explorer:
 
             continue
 
-    def stop(self):
-        # TODO Detener el servicio
-        # Detener el servicio
-        pass
-
-    def there_is_work(self):
+    def _is_there_work(self) -> bool:
         """
-        Funcion que agrupa las condiciones que se deben cumplir para poder trabajar
+        Checks whether it should try to scrape a listing page according to
+        limits and cooldowns.
+        :return: True if it should work, false otherwise
         """
-        if self.check_if_recent_task():
-            return False
-
-        if not self.in_working_hours():
-            return False
-
-        if (
-            self.get_referencias_acquired_today()
-            >= self.get_max_referencias_for_today()
+        # Chained with `or` (not any([...])) so the checks short-circuit and
+        # the database queries are skipped once an earlier condition holds.
+        if (
+            self._check_if_recent_task()
+            or not self._in_working_hours()
+            or (
+                self._get_referencias_acquired_today()
+                >= self._get_max_referencias_for_today()
+            )
+            or self._get_tasks_created_today() >= self._get_max_tasks_today()
         ):
             return False
 
-        if self.get_tasks_created_today() >= self.get_max_tasks_today():
-            return False
-
         return True
 
-    def database_is_up(self):
+    def _database_is_up(self) -> bool:
+        """
+        Checks whether the db is reachable with some retries.
+        :return: True if db is reachable, false if not
+        """
         while self.db_retries <= self.max_db_retries:
             try:
                 self.anunciosdb.ping()
@@ -104,16 +117,22 @@ class Explorer:
 
         return False
 
-    def in_working_hours(self):
+    @staticmethod
+    def _in_working_hours() -> bool:
+        """
+        Checks whether now is within the working hours of the daemon.
+        :return: True if so, false if not
+        """
         return (
             working_hours["start"]
             <= datetime.datetime.now().time()
             <= working_hours["end"]
         )
 
-    def get_referencias_acquired_today(self):
+    def _get_referencias_acquired_today(self) -> int:
         """
-        Cuenta cuantas nuevas referencias han aparecido en las ultimas 24 horas
+        Queries the database to obtain the count of scraped ads in the last 24h.
+        :return: the resulting count
         """
 
         query_statement = """ SELECT count(referencia)
@@ -125,10 +144,11 @@ class Explorer:
 
         return cursor_result.fetchone()[0]
 
-    def get_max_referencias_for_today(self):
+    def _get_max_referencias_for_today(self) -> float:
         """
-        Calcula la cantidad objetivo para las ultimas 24 horas  en base a la
-        diferencia con el objetivo mensual
+        Queries the database for the number of captured ads in the last 30 days
+        and computes the max number of ad references to obtain today.
+        :return: the max number of references
         """
         query_statement = """ SELECT count(referencia)
                               FROM primera_captura_full
@@ -144,9 +164,11 @@ class Explorer:
 
         return max_referencias
 
-    def get_tasks_created_today(self):
+    def _get_tasks_created_today(self) -> int:
         """
-        Mira en el task log cuantas tareas se han iniciado en las ultimas 24 horas
+        Queries the database for the number of exploring tasks created in the
+        last 24h, returns it.
+        :return: number of exploring tasks created
         """
         query_statement = """ SELECT count(uuid)
                               FROM exploring_tasks_logs
@@ -158,16 +180,17 @@ class Explorer:
 
         return tasks_created_today
 
-    def get_max_tasks_today(self):
+    def _get_max_tasks_today(self) -> float:
         """
-        Calcula el maximo diario de intentos en forma de tareas, en base al
-        maximo de capturas mas un multiplicador
+        Computes the current task goal
+        :return: max current tasks target
         """
-        return (self.get_max_referencias_for_today() / 30) * 6
+        return (self._get_max_referencias_for_today() / 30) * 6
 
-    def check_if_recent_task(self):
+    def _check_if_recent_task(self) -> int:
         """
-        Mira si se ha creado alguna tarea recientemente
+        Queries the db for the number of tasks created in the last 10 minutes.
+        :return: the number of recently created tasks
         """
         query_statement = """ SELECT count(uuid)
                               FROM exploring_tasks_logs
@@ -178,10 +201,11 @@ class Explorer:
 
         return cursor_result.fetchone()[0]
 
-    def compose_listing_url(self):
+    @staticmethod
+    def _compose_listing_url() -> str:
         """
-        Genera URLs de manera aleatoria
-        :return:
+        Generates a listing page URL randomly.
+        :return: the listing page URL
         """
         root = "https://www.idealista.com/"
         type = Explorer.ad_types[str(randint(1, 2))]
@@ -204,38 +228,58 @@ class Explorer:
 
 
 class ExploringTask:
-    def __init__(self, url):
+    """
+    Task object wrapping the process of attempting to capture a listing page,
+    parsing the ad references and sending to db.
+    """
+
+    def __init__(self, url: str) -> None:
+        """
+        Initialize with task parameters and mark the task as being worked on
+        in the task queue.
+        :param url: string with the listing page url to be captured
+        """
         self.anunciosdb = get_anunciosdb()
         self.tasksdb = get_tasksdb()
         self.target_url = url
         self.id = str(uuid.uuid4())
         self._update_status("Pending")
 
-    def _update_status(self, new_status):
+    def _update_status(self, new_status: str) -> None:
+        """
+        Updates the task status and persists it in the task queue.
+        :param new_status: string describing the new status
+        :return: None
+        """
         self.status = new_status
         self._log_in_tasksdb()
 
-    def explore(self):
+    def explore(self) -> None:
+        """
+        Attacks the target URL and logs a status describing the outcome.
+        :return: None
+        """
         attack = UrlAttack(self.target_url)
         attack.attack()
         self._update_status("Attacked")
 
-        if attack.success:
-            self._validate_referencias(attack.get_text())
-            self._extract_referencias(attack.get_text())
-            if self.referencias:
-                self._update_status("Referencias ready")
-            elif self.there_are_referencias:
-                self._update_status("Failure - No new referencias in HTML")
-            else:
-                self._update_status("Failure - HTML with no referencias")
-        else:
+        if not attack.success:
             self._update_status("Failure - Bad request")
+            return
 
-    def _log_in_tasksdb(self):
+        self._validate_referencias(attack.get_text())
+        self._extract_referencias(attack.get_text())
+        if self.referencias:
+            self._update_status("Referencias ready")
+        elif self.there_are_referencias:
+            self._update_status("Failure - No new referencias in HTML")
+        else:
+            self._update_status("Failure - HTML with no referencias")
+
+    def _log_in_tasksdb(self) -> None:
         """
-        Graba en la base de datos de tareas un registro con el UUID de la tarea,
-        un timestamp y el status
+        Logs status in the task db.
+        :return: None
         """
 
         query_statement = """INSERT INTO exploring_tasks_logs
@@ -246,10 +290,11 @@ class ExploringTask:
 
         self.tasksdb.query(query_statement, query_parameters)
 
-    def _validate_referencias(self, html):
+    def _validate_referencias(self, html: str) -> None:
         """
-        Comprueba que las etiquetas sigan el formato de un anuncio.
-        Lanza una advertencia si no es así.
+        Checks that the ad tags follow the expected format, warns if not.
+        :param html: string with HTML code of the listing page
+        :return: None
         """
         soup = BeautifulSoup(html, "html5lib")
         ads = soup.find_all(class_="item")
@@ -267,10 +312,11 @@ class ExploringTask:
                 )
                 break
 
-    def _extract_referencias(self, html):
+    def _extract_referencias(self, html: str) -> None:
         """
-        Saca referencias de HTML, descarta las que ya exiten en la base de datos
-        de capturas, y guarda si han aparecido listings y si hay alguno nuevo
+        Scrapes the new ad references out of the HTML code and stores them.
+        :param html: string with HTML code of the listing page
+        :return: None
         """
 
         soup = BeautifulSoup(html, "html5lib")
@@ -281,9 +327,11 @@ class ExploringTask:
             if self._is_new_listing(ad["data-adid"]):
                 self.referencias.append(ad["data-adid"])
 
-    def _is_new_listing(self, referencia):
+    def _is_new_listing(self, referencia: str) -> bool:
         """
-        Comprueba si el listing ya existe en la base de datos de anuncios
+        Checks if an ad reference already exists in the db.
+        :param referencia: ad reference to look up in the db
+        :return: True if it is new, false if not
         """
         query_statement = """SELECT count(referencia)
                              FROM capturas
@@ -297,14 +345,12 @@ class ExploringTask:
         else:
             return True
 
-    def get_referencias(self):
+    def get_referencias(self) -> list:
         """
-        Devuelve las referencias, si las hay
+        Gets the references.
+        :return: list of ad references
         """
-        if self.referencias:
-            return self.referencias
-        else:
-            return None
+        return self.referencias
 
 
 if __name__ == "__main__":