Typing, docstrings and formatting of explorer.py
This commit is contained in:
parent
f53a65834b
commit
a61fac72f7
1 changed file with 119 additions and 73 deletions
|
|
@ -8,6 +8,7 @@ from time import sleep
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
import re
|
import re
|
||||||
from random import randint
|
from random import randint
|
||||||
|
import mysql.connector
|
||||||
from core.mysql_wrapper import get_anunciosdb, get_tasksdb
|
from core.mysql_wrapper import get_anunciosdb, get_tasksdb
|
||||||
from core.config import monthly_new_ads_target, working_hours
|
from core.config import monthly_new_ads_target, working_hours
|
||||||
from core.scrapping_utils import UrlAttack
|
from core.scrapping_utils import UrlAttack
|
||||||
|
|
@ -18,12 +19,20 @@ import logging
|
||||||
|
|
||||||
|
|
||||||
class Explorer:
|
class Explorer:
|
||||||
|
"""
|
||||||
|
Daemon with the full flow of execution of generating a listing page url,
|
||||||
|
requesting the page, scraping the ad references and storing logs in the
|
||||||
|
task database
|
||||||
|
"""
|
||||||
|
|
||||||
sleep_time_no_work = 60
|
sleep_time_no_work = 60
|
||||||
sleep_time_no_service = 600
|
sleep_time_no_service = 600
|
||||||
ad_types = {"1": "alquiler", "2": "venta"}
|
ad_types = {"1": "alquiler", "2": "venta"}
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
|
"""
|
||||||
|
Connect to database and set up initial parameters.
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
self.anunciosdb = get_anunciosdb()
|
self.anunciosdb = get_anunciosdb()
|
||||||
self.tasksdb = get_tasksdb()
|
self.tasksdb = get_tasksdb()
|
||||||
|
|
@ -35,23 +44,28 @@ class Explorer:
|
||||||
self.max_queue_retries = 3
|
self.max_queue_retries = 3
|
||||||
self.queue_retries = 0
|
self.queue_retries = 0
|
||||||
|
|
||||||
def start(self):
|
def start(self) -> None:
|
||||||
|
"""
|
||||||
|
Full flow of execution. Checks whether it should capture a URL, tries
|
||||||
|
to do so and stores the result if successful.
|
||||||
|
:return: None
|
||||||
|
"""
|
||||||
logging.info("Starting explorer")
|
logging.info("Starting explorer")
|
||||||
while True:
|
while True:
|
||||||
if not self.there_is_work():
|
if not self._is_there_work():
|
||||||
print("{}: Waiting. No work".format(datetime.datetime.now()))
|
print("{}: Waiting. No work".format(datetime.datetime.now()))
|
||||||
sleep(Explorer.sleep_time_no_work)
|
sleep(Explorer.sleep_time_no_work)
|
||||||
continue
|
continue
|
||||||
logging.info("Waiting")
|
logging.info("Waiting")
|
||||||
|
|
||||||
if not self.database_is_up():
|
if not self._database_is_up():
|
||||||
alert_master(
|
alert_master(
|
||||||
"SQL DOWN",
|
"SQL DOWN",
|
||||||
"El explorer informa de que SQL esta caida. Actividad detenida",
|
"El explorer informa de que SQL esta caida. Actividad detenida",
|
||||||
)
|
)
|
||||||
self.stop()
|
raise ConnectionError("Unable to connect to database")
|
||||||
|
|
||||||
current_task = ExploringTask(self.compose_listing_url())
|
current_task = ExploringTask(self._compose_listing_url())
|
||||||
current_task.explore()
|
current_task.explore()
|
||||||
logging.info("Exploring task done...")
|
logging.info("Exploring task done...")
|
||||||
|
|
||||||
|
|
@ -66,33 +80,32 @@ class Explorer:
|
||||||
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
def stop(self):
|
def _is_there_work(self) -> bool:
|
||||||
# TODO Detener el servicio
|
|
||||||
# Detener el servicio
|
|
||||||
pass
|
|
||||||
|
|
||||||
def there_is_work(self):
|
|
||||||
"""
|
"""
|
||||||
Funcion que agrupa las condiciones que se deben cumplir para poder trabajar
|
Checks whether it should try to scrape a listing page according to
|
||||||
|
limits and cooldowns.
|
||||||
|
:return: True if it should work, false otherwise
|
||||||
"""
|
"""
|
||||||
if self.check_if_recent_task():
|
if any(
|
||||||
return False
|
[
|
||||||
|
self._check_if_recent_task(),
|
||||||
if not self.in_working_hours():
|
not self._in_working_hours(),
|
||||||
return False
|
(
|
||||||
|
self._get_referencias_acquired_today()
|
||||||
if (
|
>= self._get_max_referencias_for_today()
|
||||||
self.get_referencias_acquired_today()
|
),
|
||||||
>= self.get_max_referencias_for_today()
|
(self._get_tasks_created_today() >= self._get_max_tasks_today()),
|
||||||
|
]
|
||||||
):
|
):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
if self.get_tasks_created_today() >= self.get_max_tasks_today():
|
|
||||||
return False
|
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def database_is_up(self):
|
def _database_is_up(self) -> bool:
|
||||||
|
"""
|
||||||
|
Checks whether the db is reachable with some retries.
|
||||||
|
:return: True if db is reachable, false if not
|
||||||
|
"""
|
||||||
while self.db_retries <= self.max_db_retries:
|
while self.db_retries <= self.max_db_retries:
|
||||||
try:
|
try:
|
||||||
self.anunciosdb.ping()
|
self.anunciosdb.ping()
|
||||||
|
|
@ -104,16 +117,22 @@ class Explorer:
|
||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def in_working_hours(self):
|
@staticmethod
|
||||||
|
def _in_working_hours() -> None:
|
||||||
|
"""
|
||||||
|
Checks whether now is within the working hours of the daemon.
|
||||||
|
:return: True if so, false if not
|
||||||
|
"""
|
||||||
return (
|
return (
|
||||||
working_hours["start"]
|
working_hours["start"]
|
||||||
<= datetime.datetime.now().time()
|
<= datetime.datetime.now().time()
|
||||||
<= working_hours["end"]
|
<= working_hours["end"]
|
||||||
)
|
)
|
||||||
|
|
||||||
def get_referencias_acquired_today(self):
|
def _get_referencias_acquired_today(self) -> int:
|
||||||
"""
|
"""
|
||||||
Cuenta cuantas nuevas referencias han aparecido en las ultimas 24 horas
|
Queries the database to obtain the count of scraped ads in the last 24h.
|
||||||
|
:return: the resulting count
|
||||||
"""
|
"""
|
||||||
|
|
||||||
query_statement = """ SELECT count(referencia)
|
query_statement = """ SELECT count(referencia)
|
||||||
|
|
@ -125,10 +144,11 @@ class Explorer:
|
||||||
|
|
||||||
return cursor_result.fetchone()[0]
|
return cursor_result.fetchone()[0]
|
||||||
|
|
||||||
def get_max_referencias_for_today(self):
|
def _get_max_referencias_for_today(self) -> float:
|
||||||
"""
|
"""
|
||||||
Calcula la cantidad objetivo para las ultimas 24 horas en base a la
|
Queries the database for the number of captured ads in the last 30 days
|
||||||
diferencia con el objetivo mensual
|
and computes the max number of ad references to obtain today.
|
||||||
|
:return: the max number of references
|
||||||
"""
|
"""
|
||||||
query_statement = """ SELECT count(referencia)
|
query_statement = """ SELECT count(referencia)
|
||||||
FROM primera_captura_full
|
FROM primera_captura_full
|
||||||
|
|
@ -144,9 +164,11 @@ class Explorer:
|
||||||
|
|
||||||
return max_referencias
|
return max_referencias
|
||||||
|
|
||||||
def get_tasks_created_today(self):
|
def _get_tasks_created_today(self) -> int:
|
||||||
"""
|
"""
|
||||||
Mira en el task log cuantas tareas se han iniciado en las ultimas 24 horas
|
Queries the database for the number of exploring tasks created in the
|
||||||
|
last 24h, returns it.
|
||||||
|
:return: number of exploring tasks created
|
||||||
"""
|
"""
|
||||||
query_statement = """ SELECT count(uuid)
|
query_statement = """ SELECT count(uuid)
|
||||||
FROM exploring_tasks_logs
|
FROM exploring_tasks_logs
|
||||||
|
|
@ -158,16 +180,17 @@ class Explorer:
|
||||||
|
|
||||||
return tasks_created_today
|
return tasks_created_today
|
||||||
|
|
||||||
def get_max_tasks_today(self):
|
def _get_max_tasks_today(self) -> float:
|
||||||
"""
|
"""
|
||||||
Calcula el maximo diario de intentos en forma de tareas, en base al
|
Computes the current task goal
|
||||||
maximo de capturas mas un multiplicador
|
:return: max current tasks target
|
||||||
"""
|
"""
|
||||||
return (self.get_max_referencias_for_today() / 30) * 6
|
return (self._get_max_referencias_for_today() / 30) * 6
|
||||||
|
|
||||||
def check_if_recent_task(self):
|
def _check_if_recent_task(self) -> int:
|
||||||
"""
|
"""
|
||||||
Mira si se ha creado alguna tarea recientemente
|
Queries the db for the number of tasks created in the last 10 minutes.
|
||||||
|
:return: the number of recently created tasks
|
||||||
"""
|
"""
|
||||||
query_statement = """ SELECT count(uuid)
|
query_statement = """ SELECT count(uuid)
|
||||||
FROM exploring_tasks_logs
|
FROM exploring_tasks_logs
|
||||||
|
|
@ -178,10 +201,11 @@ class Explorer:
|
||||||
|
|
||||||
return cursor_result.fetchone()[0]
|
return cursor_result.fetchone()[0]
|
||||||
|
|
||||||
def compose_listing_url(self):
|
@staticmethod
|
||||||
|
def _compose_listing_url() -> str:
|
||||||
"""
|
"""
|
||||||
Genera URLs de manera aleatoria
|
Generates a listing page URL randomly.
|
||||||
:return:
|
:return: the listing page URL
|
||||||
"""
|
"""
|
||||||
root = "https://www.idealista.com/"
|
root = "https://www.idealista.com/"
|
||||||
type = Explorer.ad_types[str(randint(1, 2))]
|
type = Explorer.ad_types[str(randint(1, 2))]
|
||||||
|
|
@ -204,23 +228,45 @@ class Explorer:
|
||||||
|
|
||||||
|
|
||||||
class ExploringTask:
|
class ExploringTask:
|
||||||
def __init__(self, url):
|
"""
|
||||||
|
Task object wrapping the process of attempting to capture a listing page,
|
||||||
|
parsing the ad references and sending to db.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, url: str) -> None:
|
||||||
|
"""
|
||||||
|
Initialize with task parameters and mark the task as being worked on
|
||||||
|
in the task queue.
|
||||||
|
:param url: string with the listing page url to be captured
|
||||||
|
"""
|
||||||
self.anunciosdb = get_anunciosdb()
|
self.anunciosdb = get_anunciosdb()
|
||||||
self.tasksdb = get_tasksdb()
|
self.tasksdb = get_tasksdb()
|
||||||
self.target_url = url
|
self.target_url = url
|
||||||
self.id = str(uuid.uuid4())
|
self.id = str(uuid.uuid4())
|
||||||
self._update_status("Pending")
|
self._update_status("Pending")
|
||||||
|
|
||||||
def _update_status(self, new_status):
|
def _update_status(self, new_status: str) -> None:
|
||||||
|
"""
|
||||||
|
Updates the task status and persists it in the task queue.
|
||||||
|
:param new_status: string describing the new status
|
||||||
|
:return: None
|
||||||
|
"""
|
||||||
self.status = new_status
|
self.status = new_status
|
||||||
self._log_in_tasksdb()
|
self._log_in_tasksdb()
|
||||||
|
|
||||||
def explore(self):
|
def explore(self) -> None:
|
||||||
|
"""
|
||||||
|
Main flow of work.
|
||||||
|
:return: None
|
||||||
|
"""
|
||||||
attack = UrlAttack(self.target_url)
|
attack = UrlAttack(self.target_url)
|
||||||
attack.attack()
|
attack.attack()
|
||||||
self._update_status("Attacked")
|
self._update_status("Attacked")
|
||||||
|
|
||||||
if attack.success:
|
if not attack.success:
|
||||||
|
self._update_status("Failure - Bad request")
|
||||||
|
return
|
||||||
|
|
||||||
self._validate_referencias(attack.get_text())
|
self._validate_referencias(attack.get_text())
|
||||||
self._extract_referencias(attack.get_text())
|
self._extract_referencias(attack.get_text())
|
||||||
if self.referencias:
|
if self.referencias:
|
||||||
|
|
@ -229,13 +275,11 @@ class ExploringTask:
|
||||||
self._update_status("Failure - No new referencias in HTML")
|
self._update_status("Failure - No new referencias in HTML")
|
||||||
else:
|
else:
|
||||||
self._update_status("Failure - HTML with no referencias")
|
self._update_status("Failure - HTML with no referencias")
|
||||||
else:
|
|
||||||
self._update_status("Failure - Bad request")
|
|
||||||
|
|
||||||
def _log_in_tasksdb(self):
|
def _log_in_tasksdb(self) -> None:
|
||||||
"""
|
"""
|
||||||
Graba en la base de datos de tareas un registro con el UUID de la tarea,
|
Logs status in the task db.
|
||||||
un timestamp y el status
|
:return: None
|
||||||
"""
|
"""
|
||||||
|
|
||||||
query_statement = """INSERT INTO exploring_tasks_logs
|
query_statement = """INSERT INTO exploring_tasks_logs
|
||||||
|
|
@ -246,10 +290,11 @@ class ExploringTask:
|
||||||
|
|
||||||
self.tasksdb.query(query_statement, query_parameters)
|
self.tasksdb.query(query_statement, query_parameters)
|
||||||
|
|
||||||
def _validate_referencias(self, html):
|
def _validate_referencias(self, html: str) -> None:
|
||||||
"""
|
"""
|
||||||
Comprueba que las etiquetas sigan el formato de un anuncio.
|
Checks that the ad references are in the HTML code.
|
||||||
Lanza una advertencia si no es así.
|
:param html: string with HTML code of the listing page
|
||||||
|
:return: None
|
||||||
"""
|
"""
|
||||||
soup = BeautifulSoup(html, "html5lib")
|
soup = BeautifulSoup(html, "html5lib")
|
||||||
ads = soup.find_all(class_="item")
|
ads = soup.find_all(class_="item")
|
||||||
|
|
@ -267,10 +312,11 @@ class ExploringTask:
|
||||||
)
|
)
|
||||||
break
|
break
|
||||||
|
|
||||||
def _extract_referencias(self, html):
|
def _extract_referencias(self, html: str) -> None:
|
||||||
"""
|
"""
|
||||||
Saca referencias de HTML, descarta las que ya exiten en la base de datos
|
Scrapes the ad references out of the HTML code and stores them.
|
||||||
de capturas, y guarda si han aparecido listings y si hay alguno nuevo
|
:param html: string with HTML code of the listing page
|
||||||
|
:return: None
|
||||||
"""
|
"""
|
||||||
|
|
||||||
soup = BeautifulSoup(html, "html5lib")
|
soup = BeautifulSoup(html, "html5lib")
|
||||||
|
|
@ -281,9 +327,11 @@ class ExploringTask:
|
||||||
if self._is_new_listing(ad["data-adid"]):
|
if self._is_new_listing(ad["data-adid"]):
|
||||||
self.referencias.append(ad["data-adid"])
|
self.referencias.append(ad["data-adid"])
|
||||||
|
|
||||||
def _is_new_listing(self, referencia):
|
def _is_new_listing(self, referencia: str) -> bool:
|
||||||
"""
|
"""
|
||||||
Comprueba si el listing ya existe en la base de datos de anuncios
|
Checks if an ad reference already exists in the db.
|
||||||
|
:param referencia: the ad reference to look up in the db
|
||||||
|
:return: True if it is new, false if not
|
||||||
"""
|
"""
|
||||||
query_statement = """SELECT count(referencia)
|
query_statement = """SELECT count(referencia)
|
||||||
FROM capturas
|
FROM capturas
|
||||||
|
|
@ -297,14 +345,12 @@ class ExploringTask:
|
||||||
else:
|
else:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def get_referencias(self):
|
def get_referencias(self) -> list:
|
||||||
"""
|
"""
|
||||||
Devuelve las referencias, si las hay
|
Gets the references.
|
||||||
|
:return: list of ad references
|
||||||
"""
|
"""
|
||||||
if self.referencias:
|
|
||||||
return self.referencias
|
return self.referencias
|
||||||
else:
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue