Formatting

This commit is contained in:
pablo 2020-03-26 11:38:08 +01:00
parent 9c2565f5d8
commit acfeeef0d1

View file

@ -1,6 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import sys import sys
sys.path.append('..')
sys.path.append("..")
import uuid import uuid
import datetime import datetime
from time import sleep from time import sleep
@ -12,13 +13,15 @@ from core.config import monthly_new_ads_target, working_hours
from core.scrapping_utils import UrlAttack from core.scrapping_utils import UrlAttack
from core.alerts import alert_master from core.alerts import alert_master
from db_layer.capturing_tasks_interface import capturing_interface from db_layer.capturing_tasks_interface import capturing_interface
from core import my_logger
import logging
class Explorer:
class Explorer():
sleep_time_no_work = 60 sleep_time_no_work = 60
sleep_time_no_service = 600 sleep_time_no_service = 600
ad_types = {'1': 'alquiler', ad_types = {"1": "alquiler", "2": "venta"}
'2': 'venta'}
def __init__(self): def __init__(self):
try: try:
@ -33,35 +36,39 @@ class Explorer():
self.queue_retries = 0 self.queue_retries = 0
def start(self):
    """
    Run the exploring loop.

    On every iteration the explorer:
    1. sleeps and retries while there_is_work() reports nothing to do;
    2. alerts, calls stop() and leaves the loop if the database is down;
    3. otherwise explores one listing URL and, when the task ends in
       'Referencias ready', creates a capturing task per referencia and
       marks the exploring task as 'Sent to queue'.

    :return: None, and only when the database is reported as down
    """
    while True:
        if not self.there_is_work():
            print("{}: Waiting. No work".format(datetime.datetime.now()))
            sleep(Explorer.sleep_time_no_work)
            continue

        if not self.database_is_up():
            alert_master(
                "SQL DOWN",
                "El explorer informa de que SQL esta caida. Actividad detenida",
            )
            self.stop()
            # Leave the loop here: the alert announces that activity has
            # stopped, and creating an ExploringTask would immediately try
            # to write its first status to the database that is down.
            return

        current_task = ExploringTask(self.compose_listing_url())
        current_task.explore()
        print("{}: Exploring done".format(datetime.datetime.now()))

        if current_task.status == "Referencias ready":
            for referencia in current_task.get_referencias():
                capturing_interface.create_capturing_task(
                    referencia, current_task.id
                )
            current_task._update_status("Sent to queue")
def stop(self):
    """
    Stop the service.

    Placeholder: the shutdown logic is not implemented yet, so calling
    this method currently does nothing.
    """
    # TODO: stop the service
    return None
def there_is_work(self): def there_is_work(self):
""" """
Funcion que agrupa las condiciones que se deben cumplir para poder trabajar Funcion que agrupa las condiciones que se deben cumplir para poder trabajar
@ -71,15 +78,18 @@ class Explorer():
if not self.in_working_hours(): if not self.in_working_hours():
return False return False
if self.get_referencias_acquired_today() >= self.get_max_referencias_for_today(): if (
self.get_referencias_acquired_today()
>= self.get_max_referencias_for_today()
):
return False return False
if self.get_tasks_created_today() >= self.get_max_tasks_today(): if self.get_tasks_created_today() >= self.get_max_tasks_today():
return False return False
return True return True
def database_is_up(self): def database_is_up(self):
while self.db_retries <= self.max_db_retries: while self.db_retries <= self.max_db_retries:
try: try:
@ -89,26 +99,30 @@ class Explorer():
except: except:
sleep(Explorer.sleep_time_no_service) sleep(Explorer.sleep_time_no_service)
self.db_retries = self.db_retries + 1 self.db_retries = self.db_retries + 1
return False return False
def in_working_hours(self):
    """
    Tell whether the current time falls inside the configured window.

    :return: True when working_hours["start"] <= now <= working_hours["end"]
    """
    opening = working_hours["start"]
    current_time = datetime.datetime.now().time()
    # Too early: the closing bound does not need to be looked at.
    if not opening <= current_time:
        return False
    return current_time <= working_hours["end"]
def get_referencias_acquired_today(self):
    """
    Count how many new referencias have appeared in the last 24 hours.

    :return: first column of the single row returned by the count query
    """
    count_query = """ SELECT count(referencia)
                        FROM primera_captura_full
                        WHERE fecha_captura >= now() - INTERVAL 1 DAY;
                  """
    row = self.anunciosdb.query(count_query).fetchone()
    return row[0]
def get_max_referencias_for_today(self): def get_max_referencias_for_today(self):
""" """
Calcula la cantidad objetivo para las ultimas 24 horas en base a la Calcula la cantidad objetivo para las ultimas 24 horas en base a la
@ -121,7 +135,9 @@ class Explorer():
cursor_result = self.anunciosdb.query(query_statement) cursor_result = self.anunciosdb.query(query_statement)
new_referencias_last_30 = cursor_result.fetchone()[0] new_referencias_last_30 = cursor_result.fetchone()[0]
deviation = (monthly_new_ads_target - new_referencias_last_30) / monthly_new_ads_target deviation = (
monthly_new_ads_target - new_referencias_last_30
) / monthly_new_ads_target
max_referencias = (monthly_new_ads_target / 30) * (1 + deviation) max_referencias = (monthly_new_ads_target / 30) * (1 + deviation)
return max_referencias return max_referencias
@ -165,76 +181,88 @@ class Explorer():
Genera URLs de manera aleatoria Genera URLs de manera aleatoria
:return: :return:
""" """
root = 'https://www.idealista.com/' root = "https://www.idealista.com/"
type = Explorer.ad_types[str(randint(1,2))] type = Explorer.ad_types[str(randint(1, 2))]
city = 'barcelona' city = "barcelona"
page_number = str(randint(1,30)) page_number = str(randint(1, 30))
url = root + type + '-garajes/' + city + '-' + city + '/' + \ url = (
'pagina-' + page_number + '.htm' root
+ type
+ "-garajes/"
+ city
+ "-"
+ city
+ "/"
+ "pagina-"
+ page_number
+ ".htm"
)
return url return url
class ExploringTask: class ExploringTask:
def __init__(self, url):
    """
    Create an exploring task for a listing URL and log it as 'Pending'.

    :param url: listing URL this task will explore
    """
    self.target_url = url
    self.id = str(uuid.uuid4())

    self.anunciosdb = get_anunciosdb()
    self.tasksdb = get_tasksdb()

    # Setting the first status also writes the initial log record.
    self._update_status("Pending")
def _update_status(self, new_status): def _update_status(self, new_status):
self.status = new_status self.status = new_status
self._log_in_tasksdb() self._log_in_tasksdb()
def explore(self):
    """
    Attack the target URL and set the task status from the outcome.

    The status goes to 'Attacked' once the request has been made, then to:
    - 'Failure - Bad request' if the attack did not succeed;
    - 'Referencias ready' if new referencias were extracted;
    - 'Failure - No new referencias in HTML' if ads were found but none
      of them is new;
    - 'Failure - HTML with no referencias' if no ads were found.
    """
    attack = UrlAttack(self.target_url)
    attack.attack()
    self._update_status("Attacked")

    if not attack.success:
        self._update_status("Failure - Bad request")
        return

    self._validate_referencias(attack.get_text())
    self._extract_referencias(attack.get_text())

    if self.referencias:
        outcome = "Referencias ready"
    elif self.there_are_referencias:
        outcome = "Failure - No new referencias in HTML"
    else:
        outcome = "Failure - HTML with no referencias"
    self._update_status(outcome)
def _log_in_tasksdb(self): def _log_in_tasksdb(self):
""" """
Graba en la base de datos de tareas un registro con el UUID de la tarea, Graba en la base de datos de tareas un registro con el UUID de la tarea,
un timestamp y el status un timestamp y el status
""" """
query_statement = """INSERT INTO exploring_tasks_logs query_statement = """INSERT INTO exploring_tasks_logs
(uuid, write_time, status) (uuid, write_time, status)
VALUES (%(uuid)s, NOW(), %(status)s)""" VALUES (%(uuid)s, NOW(), %(status)s)"""
query_parameters = {'uuid': self.id, query_parameters = {"uuid": self.id, "status": self.status}
'status': self.status}
self.tasksdb.query(query_statement, query_parameters) self.tasksdb.query(query_statement, query_parameters)
def _validate_referencias(self, html):
    """
    Check that the ads in the HTML carry a well-formed referencia.

    Every element with class "item" is expected to have a "data-adid"
    attribute made of 3 to 20 digits. An alert is sent for the first ad
    that does not comply, including an ad with no "data-adid" attribute
    at all, and the check stops there.

    :param html: HTML text of a listing page
    """
    soup = BeautifulSoup(html, "html5lib")
    ads = soup.find_all(class_="item")
    # fullmatch() rejects a trailing newline, which "$" would let through.
    pattern = re.compile(r"[0-9]{3,20}")

    for ad in ads:
        # .get() yields None for a missing attribute instead of raising,
        # so a malformed ad triggers the alert rather than a KeyError.
        adid = ad.get("data-adid")
        if adid is None or not pattern.fullmatch(adid):
            alert_master(
                "Alerta - Referencias no válidas",
                """Una tarea de exploración ha considerado inválida
                una referencia. El texto de la referencia era : {}
                """.format(
                    adid
                ),
            )
            break
def _extract_referencias(self, html): def _extract_referencias(self, html):
@ -243,13 +271,13 @@ class ExploringTask:
de capturas, y guarda si han aparecido listings y si hay alguno nuevo de capturas, y guarda si han aparecido listings y si hay alguno nuevo
""" """
soup = BeautifulSoup(html, 'html5lib') soup = BeautifulSoup(html, "html5lib")
ads = soup.find_all(class_ = "item") ads = soup.find_all(class_="item")
self.there_are_referencias = bool(ads) self.there_are_referencias = bool(ads)
self.referencias = [] self.referencias = []
for ad in ads: for ad in ads:
if self._is_new_listing(ad["data-adid"]): if self._is_new_listing(ad["data-adid"]):
self.referencias.append(ad["data-adid"]) self.referencias.append(ad["data-adid"])
def _is_new_listing(self, referencia): def _is_new_listing(self, referencia):
""" """
@ -260,13 +288,13 @@ class ExploringTask:
WHERE referencia = %s""" WHERE referencia = %s"""
query_params = (referencia,) query_params = (referencia,)
cursor_result = self.anunciosdb.query(query_statement, query_params) cursor_result = self.anunciosdb.query(query_statement, query_params)
result = cursor_result.fetchone() result = cursor_result.fetchone()
if result[0] > 0: if result[0] > 0:
return False return False
else: else:
return True return True
def get_referencias(self): def get_referencias(self):
""" """
Devuelve las referencias, si las hay Devuelve las referencias, si las hay
@ -277,6 +305,6 @@ class ExploringTask:
return None return None
if __name__ == "__main__":
    # Script entry point: build an Explorer and run its exploring loop.
    explorer = Explorer()
    explorer.start()