Typing, docstrings and formatting of explorer.py

This commit is contained in:
pablo 2020-11-03 21:55:09 +01:00
parent f53a65834b
commit a61fac72f7

View file

@ -8,6 +8,7 @@ from time import sleep
from bs4 import BeautifulSoup
import re
from random import randint
import mysql.connector
from core.mysql_wrapper import get_anunciosdb, get_tasksdb
from core.config import monthly_new_ads_target, working_hours
from core.scrapping_utils import UrlAttack
@ -18,12 +19,20 @@ import logging
class Explorer:
"""
Daemon with the full flow of execution of generating a listing page url,
requesting the page, scraping the ad references and storing logs in the
task database
"""
sleep_time_no_work = 60
sleep_time_no_service = 600
ad_types = {"1": "alquiler", "2": "venta"}
def __init__(self):
def __init__(self) -> None:
"""
Connect to database and set up initial parameters.
"""
try:
self.anunciosdb = get_anunciosdb()
self.tasksdb = get_tasksdb()
@ -35,23 +44,28 @@ class Explorer:
self.max_queue_retries = 3
self.queue_retries = 0
def start(self):
def start(self) -> None:
"""
Full flow of execution. Checks whether it should capture a URL, tries
to do so and stores the result if successful.
:return: None
"""
logging.info("Starting explorer")
while True:
if not self.there_is_work():
if not self._is_there_work():
print("{}: Waiting. No work".format(datetime.datetime.now()))
sleep(Explorer.sleep_time_no_work)
continue
logging.info("Waiting")
if not self.database_is_up():
if not self._database_is_up():
alert_master(
"SQL DOWN",
"El explorer informa de que SQL esta caida. Actividad detenida",
)
self.stop()
raise ConnectionError("Unable to connect to database")
current_task = ExploringTask(self.compose_listing_url())
current_task = ExploringTask(self._compose_listing_url())
current_task.explore()
logging.info("Exploring task done...")
@ -66,33 +80,32 @@ class Explorer:
continue
def stop(self):
# TODO Detener el servicio
# Detener el servicio
pass
def there_is_work(self):
def _is_there_work(self) -> bool:
"""
Funcion que agrupa las condiciones que se deben cumplir para poder trabajar
Checks whether it should try to scrape a listing page according to
limits and cooldowns.
:return: True if it should work, false otherwise
"""
if self.check_if_recent_task():
return False
if not self.in_working_hours():
return False
if (
self.get_referencias_acquired_today()
>= self.get_max_referencias_for_today()
if any(
[
self._check_if_recent_task(),
not self._in_working_hours(),
(
self._get_referencias_acquired_today()
>= self._get_max_referencias_for_today()
),
(self._get_tasks_created_today() >= self._get_max_tasks_today()),
]
):
return False
if self.get_tasks_created_today() >= self.get_max_tasks_today():
return False
return True
def database_is_up(self):
def _database_is_up(self) -> bool:
"""
Checks whether the db is reachable with some retries.
:return: True if db is reachable, false if not
"""
while self.db_retries <= self.max_db_retries:
try:
self.anunciosdb.ping()
@ -104,16 +117,22 @@ class Explorer:
return False
def in_working_hours(self):
@staticmethod
def _in_working_hours() -> bool:
"""
Checks whether now is within the working hours of the daemon.
:return: True if so, false if not
"""
return (
working_hours["start"]
<= datetime.datetime.now().time()
<= working_hours["end"]
)
def get_referencias_acquired_today(self):
def _get_referencias_acquired_today(self) -> int:
"""
Cuenta cuantas nuevas referencias han aparecido en las ultimas 24 horas
Queries the database to obtain the count of scraped ads in the last 24h.
:return: the resulting count
"""
query_statement = """ SELECT count(referencia)
@ -125,10 +144,11 @@ class Explorer:
return cursor_result.fetchone()[0]
def get_max_referencias_for_today(self):
def _get_max_referencias_for_today(self) -> float:
"""
Calcula la cantidad objetivo para las ultimas 24 horas en base a la
diferencia con el objetivo mensual
Queries the database for the number of captured ads in the last 30 days
and computes the max number of ad references to obtain today.
:return: the max number of references
"""
query_statement = """ SELECT count(referencia)
FROM primera_captura_full
@ -144,9 +164,11 @@ class Explorer:
return max_referencias
def get_tasks_created_today(self):
def _get_tasks_created_today(self) -> int:
"""
Mira en el task log cuantas tareas se han iniciado en las ultimas 24 horas
Queries the database for the number of exploring tasks created in the
last 24h, returns it.
:return: number of exploring tasks created
"""
query_statement = """ SELECT count(uuid)
FROM exploring_tasks_logs
@ -158,16 +180,17 @@ class Explorer:
return tasks_created_today
def get_max_tasks_today(self):
def _get_max_tasks_today(self) -> float:
"""
Calcula el maximo diario de intentos en forma de tareas, en base al
maximo de capturas mas un multiplicador
Computes the current task goal
:return: max current tasks target
"""
return (self.get_max_referencias_for_today() / 30) * 6
return (self._get_max_referencias_for_today() / 30) * 6
def check_if_recent_task(self):
def _check_if_recent_task(self) -> int:
"""
Mira si se ha creado alguna tarea recientemente
Queries the db for the number of tasks created in the last 10 minutes.
:return: the number of recently created tasks
"""
query_statement = """ SELECT count(uuid)
FROM exploring_tasks_logs
@ -178,10 +201,11 @@ class Explorer:
return cursor_result.fetchone()[0]
def compose_listing_url(self):
@staticmethod
def _compose_listing_url() -> str:
"""
Genera URLs de manera aleatoria
:return:
Generates a listing page URL randomly.
:return: the listing page URL
"""
root = "https://www.idealista.com/"
type = Explorer.ad_types[str(randint(1, 2))]
@ -204,38 +228,58 @@ class Explorer:
class ExploringTask:
def __init__(self, url):
"""
Task object wrapping the process of attempting to capture a listing page,
parsing the ad references and sending to db.
"""
def __init__(self, url: str) -> None:
"""
Initialize with task parameters and mark the task as being worked on
in the task queue.
:param url: string with the listing page url to be captured
"""
self.anunciosdb = get_anunciosdb()
self.tasksdb = get_tasksdb()
self.target_url = url
self.id = str(uuid.uuid4())
self._update_status("Pending")
def _update_status(self, new_status):
def _update_status(self, new_status: str) -> None:
"""
Updates the task status and persists it in the task queue.
:param new_status: string describing the new status
:return: None
"""
self.status = new_status
self._log_in_tasksdb()
def explore(self):
def explore(self) -> None:
"""
Main flow of work.
:return: None
"""
attack = UrlAttack(self.target_url)
attack.attack()
self._update_status("Attacked")
if attack.success:
self._validate_referencias(attack.get_text())
self._extract_referencias(attack.get_text())
if self.referencias:
self._update_status("Referencias ready")
elif self.there_are_referencias:
self._update_status("Failure - No new referencias in HTML")
else:
self._update_status("Failure - HTML with no referencias")
else:
if not attack.success:
self._update_status("Failure - Bad request")
return
def _log_in_tasksdb(self):
self._validate_referencias(attack.get_text())
self._extract_referencias(attack.get_text())
if self.referencias:
self._update_status("Referencias ready")
elif self.there_are_referencias:
self._update_status("Failure - No new referencias in HTML")
else:
self._update_status("Failure - HTML with no referencias")
def _log_in_tasksdb(self) -> None:
"""
Graba en la base de datos de tareas un registro con el UUID de la tarea,
un timestamp y el status
Logs status in the task db.
:return: None
"""
query_statement = """INSERT INTO exploring_tasks_logs
@ -246,10 +290,11 @@ class ExploringTask:
self.tasksdb.query(query_statement, query_parameters)
def _validate_referencias(self, html):
def _validate_referencias(self, html: str) -> None:
"""
Comprueba que las etiquetas sigan el formato de un anuncio.
Lanza una advertencia si no es así.
Checks that the ad references are in the HTML code.
:param html: string with HTML code of the listing page
:return: None
"""
soup = BeautifulSoup(html, "html5lib")
ads = soup.find_all(class_="item")
@ -267,10 +312,11 @@ class ExploringTask:
)
break
def _extract_referencias(self, html):
def _extract_referencias(self, html: str) -> None:
"""
Saca referencias de HTML, descarta las que ya exiten en la base de datos
de capturas, y guarda si han aparecido listings y si hay alguno nuevo
Scrapes the ad references out of the HTML code and stores them.
:param html: string with HTML code of the listing page
:return: None
"""
soup = BeautifulSoup(html, "html5lib")
@ -281,9 +327,11 @@ class ExploringTask:
if self._is_new_listing(ad["data-adid"]):
self.referencias.append(ad["data-adid"])
def _is_new_listing(self, referencia):
def _is_new_listing(self, referencia: str) -> bool:
"""
Comprueba si el listing ya existe en la base de datos de anuncios
Checks if an ad reference already exists in the db.
:param referencia:
:return: True if it is new, false if not
"""
query_statement = """SELECT count(referencia)
FROM capturas
@ -297,14 +345,12 @@ class ExploringTask:
else:
return True
def get_referencias(self):
def get_referencias(self) -> list:
"""
Devuelve las referencias, si las hay
Gets the references.
:return: list of ad references
"""
if self.referencias:
return self.referencias
else:
return None
return self.referencias
if __name__ == "__main__":