Typing, docstrings and formatting of explorer.py
This commit is contained in:
parent
f53a65834b
commit
a61fac72f7
1 changed files with 119 additions and 73 deletions
|
|
@ -8,6 +8,7 @@ from time import sleep
|
|||
from bs4 import BeautifulSoup
|
||||
import re
|
||||
from random import randint
|
||||
import mysql.connector
|
||||
from core.mysql_wrapper import get_anunciosdb, get_tasksdb
|
||||
from core.config import monthly_new_ads_target, working_hours
|
||||
from core.scrapping_utils import UrlAttack
|
||||
|
|
@ -18,12 +19,20 @@ import logging
|
|||
|
||||
|
||||
class Explorer:
|
||||
"""
|
||||
Daemon with the full flow of execution of generating a listing page url,
|
||||
requesting the page, scraping the ad references and storing logs in the
|
||||
task database
|
||||
"""
|
||||
|
||||
sleep_time_no_work = 60
|
||||
sleep_time_no_service = 600
|
||||
ad_types = {"1": "alquiler", "2": "venta"}
|
||||
|
||||
def __init__(self):
|
||||
def __init__(self) -> None:
|
||||
"""
|
||||
Connect to database and set up initial parameters.
|
||||
"""
|
||||
try:
|
||||
self.anunciosdb = get_anunciosdb()
|
||||
self.tasksdb = get_tasksdb()
|
||||
|
|
@ -35,23 +44,28 @@ class Explorer:
|
|||
self.max_queue_retries = 3
|
||||
self.queue_retries = 0
|
||||
|
||||
def start(self):
|
||||
def start(self) -> None:
|
||||
"""
|
||||
Full flow of execution. Checks whether it should capture a URL, tries
|
||||
to do so and stores the result if successful.
|
||||
:return: None
|
||||
"""
|
||||
logging.info("Starting explorer")
|
||||
while True:
|
||||
if not self.there_is_work():
|
||||
if not self._is_there_work():
|
||||
print("{}: Waiting. No work".format(datetime.datetime.now()))
|
||||
sleep(Explorer.sleep_time_no_work)
|
||||
continue
|
||||
logging.info("Waiting")
|
||||
|
||||
if not self.database_is_up():
|
||||
if not self._database_is_up():
|
||||
alert_master(
|
||||
"SQL DOWN",
|
||||
"El explorer informa de que SQL esta caida. Actividad detenida",
|
||||
)
|
||||
self.stop()
|
||||
raise ConnectionError("Unable to connect to database")
|
||||
|
||||
current_task = ExploringTask(self.compose_listing_url())
|
||||
current_task = ExploringTask(self._compose_listing_url())
|
||||
current_task.explore()
|
||||
logging.info("Exploring task done...")
|
||||
|
||||
|
|
@ -66,33 +80,32 @@ class Explorer:
|
|||
|
||||
continue
|
||||
|
||||
def stop(self):
|
||||
# TODO Detener el servicio
|
||||
# Detener el servicio
|
||||
pass
|
||||
|
||||
def there_is_work(self):
|
||||
def _is_there_work(self) -> bool:
|
||||
"""
|
||||
Funcion que agrupa las condiciones que se deben cumplir para poder trabajar
|
||||
Checks whether it should try to scrap a listing page according to
|
||||
limits and cooldowns.
|
||||
:return: True if it should work, false otherwise
|
||||
"""
|
||||
if self.check_if_recent_task():
|
||||
return False
|
||||
|
||||
if not self.in_working_hours():
|
||||
return False
|
||||
|
||||
if (
|
||||
self.get_referencias_acquired_today()
|
||||
>= self.get_max_referencias_for_today()
|
||||
if any(
|
||||
[
|
||||
self._check_if_recent_task(),
|
||||
not self._in_working_hours(),
|
||||
(
|
||||
self._get_referencias_acquired_today()
|
||||
>= self._get_max_referencias_for_today()
|
||||
),
|
||||
(self._get_tasks_created_today() >= self._get_max_tasks_today()),
|
||||
]
|
||||
):
|
||||
return False
|
||||
|
||||
if self.get_tasks_created_today() >= self.get_max_tasks_today():
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def database_is_up(self):
|
||||
def _database_is_up(self) -> bool:
|
||||
"""
|
||||
Checks whether the db is reachable with some retries.
|
||||
:return: True if db is reachable, false if not
|
||||
"""
|
||||
while self.db_retries <= self.max_db_retries:
|
||||
try:
|
||||
self.anunciosdb.ping()
|
||||
|
|
@ -104,16 +117,22 @@ class Explorer:
|
|||
|
||||
return False
|
||||
|
||||
def in_working_hours(self):
|
||||
@staticmethod
|
||||
def _in_working_hours() -> None:
|
||||
"""
|
||||
Checks whether now is within the working hours of the daemon.
|
||||
:return: True if so, false if not
|
||||
"""
|
||||
return (
|
||||
working_hours["start"]
|
||||
<= datetime.datetime.now().time()
|
||||
<= working_hours["end"]
|
||||
)
|
||||
|
||||
def get_referencias_acquired_today(self):
|
||||
def _get_referencias_acquired_today(self) -> int:
|
||||
"""
|
||||
Cuenta cuantas nuevas referencias han aparecido en las ultimas 24 horas
|
||||
Queries the database to obtain the count of scraped ads in the last 24h.
|
||||
:return: the resulting count
|
||||
"""
|
||||
|
||||
query_statement = """ SELECT count(referencia)
|
||||
|
|
@ -125,10 +144,11 @@ class Explorer:
|
|||
|
||||
return cursor_result.fetchone()[0]
|
||||
|
||||
def get_max_referencias_for_today(self):
|
||||
def _get_max_referencias_for_today(self) -> float:
|
||||
"""
|
||||
Calcula la cantidad objetivo para las ultimas 24 horas en base a la
|
||||
diferencia con el objetivo mensual
|
||||
Queries the database for the number of captured ads in the last 30 days
|
||||
and computes the max number of ad references to obtain today.
|
||||
:return: the max number of references
|
||||
"""
|
||||
query_statement = """ SELECT count(referencia)
|
||||
FROM primera_captura_full
|
||||
|
|
@ -144,9 +164,11 @@ class Explorer:
|
|||
|
||||
return max_referencias
|
||||
|
||||
def get_tasks_created_today(self):
|
||||
def _get_tasks_created_today(self) -> int:
|
||||
"""
|
||||
Mira en el task log cuantas tareas se han iniciado en las ultimas 24 horas
|
||||
Queries the database for the number of exploring tasks created in the
|
||||
last 24h, returns it.
|
||||
:return: number of exploring tasks created
|
||||
"""
|
||||
query_statement = """ SELECT count(uuid)
|
||||
FROM exploring_tasks_logs
|
||||
|
|
@ -158,16 +180,17 @@ class Explorer:
|
|||
|
||||
return tasks_created_today
|
||||
|
||||
def get_max_tasks_today(self):
|
||||
def _get_max_tasks_today(self) -> float:
|
||||
"""
|
||||
Calcula el maximo diario de intentos en forma de tareas, en base al
|
||||
maximo de capturas mas un multiplicador
|
||||
Computes the current task goal
|
||||
:return: max current tasks target
|
||||
"""
|
||||
return (self.get_max_referencias_for_today() / 30) * 6
|
||||
return (self._get_max_referencias_for_today() / 30) * 6
|
||||
|
||||
def check_if_recent_task(self):
|
||||
def _check_if_recent_task(self) -> int:
|
||||
"""
|
||||
Mira si se ha creado alguna tarea recientemente
|
||||
Queries the db for the number of tasks created in the last 10 minutes.
|
||||
:return: the number of recently created tasks
|
||||
"""
|
||||
query_statement = """ SELECT count(uuid)
|
||||
FROM exploring_tasks_logs
|
||||
|
|
@ -178,10 +201,11 @@ class Explorer:
|
|||
|
||||
return cursor_result.fetchone()[0]
|
||||
|
||||
def compose_listing_url(self):
|
||||
@staticmethod
|
||||
def _compose_listing_url() -> str:
|
||||
"""
|
||||
Genera URLs de manera aleatoria
|
||||
:return:
|
||||
Generates a listing page URL randomly.
|
||||
:return: the listing page URL
|
||||
"""
|
||||
root = "https://www.idealista.com/"
|
||||
type = Explorer.ad_types[str(randint(1, 2))]
|
||||
|
|
@ -204,38 +228,58 @@ class Explorer:
|
|||
|
||||
|
||||
class ExploringTask:
|
||||
def __init__(self, url):
|
||||
"""
|
||||
Task object wrapping the process of attempting to capture a listing page,
|
||||
parsing the ad references and sending to db.
|
||||
"""
|
||||
|
||||
def __init__(self, url: str) -> None:
|
||||
"""
|
||||
Initialize with task parameters and mark the task as being worked on
|
||||
in the task queue.
|
||||
:param url: string with the listing page url to be captured
|
||||
"""
|
||||
self.anunciosdb = get_anunciosdb()
|
||||
self.tasksdb = get_tasksdb()
|
||||
self.target_url = url
|
||||
self.id = str(uuid.uuid4())
|
||||
self._update_status("Pending")
|
||||
|
||||
def _update_status(self, new_status):
|
||||
def _update_status(self, new_status: str) -> None:
|
||||
"""
|
||||
Updates the task status and persists it in the task queue.
|
||||
:param new_status: string describing the new status
|
||||
:return: None
|
||||
"""
|
||||
self.status = new_status
|
||||
self._log_in_tasksdb()
|
||||
|
||||
def explore(self):
|
||||
def explore(self) -> None:
|
||||
"""
|
||||
Main flow of work.
|
||||
:return: None
|
||||
"""
|
||||
attack = UrlAttack(self.target_url)
|
||||
attack.attack()
|
||||
self._update_status("Attacked")
|
||||
|
||||
if attack.success:
|
||||
self._validate_referencias(attack.get_text())
|
||||
self._extract_referencias(attack.get_text())
|
||||
if self.referencias:
|
||||
self._update_status("Referencias ready")
|
||||
elif self.there_are_referencias:
|
||||
self._update_status("Failure - No new referencias in HTML")
|
||||
else:
|
||||
self._update_status("Failure - HTML with no referencias")
|
||||
else:
|
||||
if not attack.success:
|
||||
self._update_status("Failure - Bad request")
|
||||
return
|
||||
|
||||
def _log_in_tasksdb(self):
|
||||
self._validate_referencias(attack.get_text())
|
||||
self._extract_referencias(attack.get_text())
|
||||
if self.referencias:
|
||||
self._update_status("Referencias ready")
|
||||
elif self.there_are_referencias:
|
||||
self._update_status("Failure - No new referencias in HTML")
|
||||
else:
|
||||
self._update_status("Failure - HTML with no referencias")
|
||||
|
||||
def _log_in_tasksdb(self) -> None:
|
||||
"""
|
||||
Graba en la base de datos de tareas un registro con el UUID de la tarea,
|
||||
un timestamp y el status
|
||||
Logs status in the task db.
|
||||
:return: None
|
||||
"""
|
||||
|
||||
query_statement = """INSERT INTO exploring_tasks_logs
|
||||
|
|
@ -246,10 +290,11 @@ class ExploringTask:
|
|||
|
||||
self.tasksdb.query(query_statement, query_parameters)
|
||||
|
||||
def _validate_referencias(self, html):
|
||||
def _validate_referencias(self, html: str) -> None:
|
||||
"""
|
||||
Comprueba que las etiquetas sigan el formato de un anuncio.
|
||||
Lanza una advertencia si no es así.
|
||||
Checks that the ad references are in the HTML code.
|
||||
:param html: string with HTML code of the listing page
|
||||
:return: None
|
||||
"""
|
||||
soup = BeautifulSoup(html, "html5lib")
|
||||
ads = soup.find_all(class_="item")
|
||||
|
|
@ -267,10 +312,11 @@ class ExploringTask:
|
|||
)
|
||||
break
|
||||
|
||||
def _extract_referencias(self, html):
|
||||
def _extract_referencias(self, html: str) -> None:
|
||||
"""
|
||||
Saca referencias de HTML, descarta las que ya exiten en la base de datos
|
||||
de capturas, y guarda si han aparecido listings y si hay alguno nuevo
|
||||
Scraps the ad references out of the HTML code and stores them.
|
||||
:param html: string with HTML code of the listing page
|
||||
:return: None
|
||||
"""
|
||||
|
||||
soup = BeautifulSoup(html, "html5lib")
|
||||
|
|
@ -281,9 +327,11 @@ class ExploringTask:
|
|||
if self._is_new_listing(ad["data-adid"]):
|
||||
self.referencias.append(ad["data-adid"])
|
||||
|
||||
def _is_new_listing(self, referencia):
|
||||
def _is_new_listing(self, referencia: str) -> bool:
|
||||
"""
|
||||
Comprueba si el listing ya existe en la base de datos de anuncios
|
||||
Checks if an ad reference already exists in the db.
|
||||
:param referencia:
|
||||
:return: True if it is new, false if not
|
||||
"""
|
||||
query_statement = """SELECT count(referencia)
|
||||
FROM capturas
|
||||
|
|
@ -297,14 +345,12 @@ class ExploringTask:
|
|||
else:
|
||||
return True
|
||||
|
||||
def get_referencias(self):
|
||||
def get_referencias(self) -> list:
|
||||
"""
|
||||
Devuelve las referencias, si las hay
|
||||
Gets the references.
|
||||
:return: list of ad references
|
||||
"""
|
||||
if self.referencias:
|
||||
return self.referencias
|
||||
else:
|
||||
return None
|
||||
return self.referencias
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue