From acfeeef0d11d5a19d6b37e51814ba46fdccec5d5 Mon Sep 17 00:00:00 2001 From: pablo Date: Thu, 26 Mar 2020 11:38:08 +0100 Subject: [PATCH] Formatting --- explorer/explorer.py | 158 +++++++++++++++++++++++++------------------ 1 file changed, 93 insertions(+), 65 deletions(-) diff --git a/explorer/explorer.py b/explorer/explorer.py index 4c65350..374b5fe 100644 --- a/explorer/explorer.py +++ b/explorer/explorer.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import sys -sys.path.append('..') + +sys.path.append("..") import uuid import datetime from time import sleep @@ -12,13 +13,15 @@ from core.config import monthly_new_ads_target, working_hours from core.scrapping_utils import UrlAttack from core.alerts import alert_master from db_layer.capturing_tasks_interface import capturing_interface +from core import my_logger +import logging + + +class Explorer: -class Explorer(): - sleep_time_no_work = 60 sleep_time_no_service = 600 - ad_types = {'1': 'alquiler', - '2': 'venta'} + ad_types = {"1": "alquiler", "2": "venta"} def __init__(self): try: @@ -33,35 +36,39 @@ class Explorer(): self.queue_retries = 0 def start(self): - + while True: if not self.there_is_work(): - print('{}: Waiting. No work'.format(datetime.datetime.now())) + print("{}: Waiting. No work".format(datetime.datetime.now())) sleep(Explorer.sleep_time_no_work) continue - + if not self.database_is_up(): - alert_master("SQL DOWN", "El explorer informa de que SQL esta caida. Actividad detenida") + alert_master( + "SQL DOWN", + "El explorer informa de que SQL esta caida. Actividad detenida", + ) self.stop() current_task = ExploringTask(self.compose_listing_url()) current_task.explore() - print('{}: Exploring done'.format(datetime.datetime.now())) + print("{}: Exploring done".format(datetime.datetime.now())) - if current_task.status == 'Referencias ready': + if current_task.status == "Referencias ready": referencias = current_task.get_referencias() for referencia in referencias: - capturing_interface.create_capturing_task(referencia, current_task.id) + capturing_interface.create_capturing_task( + referencia, current_task.id + ) current_task._update_status("Sent to queue") - continue - + continue def stop(self): - #TODO Detener el servicio - #Detener el servicio + # TODO Detener el servicio + # Detener el servicio pass - + def there_is_work(self): """ Funcion que agrupa las condiciones que se deben cumplir para poder trabajar @@ -71,15 +78,18 @@ class Explorer(): if not self.in_working_hours(): return False - - if self.get_referencias_acquired_today() >= self.get_max_referencias_for_today(): + + if ( + self.get_referencias_acquired_today() + >= self.get_max_referencias_for_today() + ): return False - + if self.get_tasks_created_today() >= self.get_max_tasks_today(): return False return True - + def database_is_up(self): while self.db_retries <= self.max_db_retries: try: @@ -89,26 +99,30 @@ class Explorer(): except: sleep(Explorer.sleep_time_no_service) self.db_retries = self.db_retries + 1 - + return False - + def in_working_hours(self): - return working_hours['start'] <= datetime.datetime.now().time() <= working_hours['end'] - + return ( + working_hours["start"] + <= datetime.datetime.now().time() + <= working_hours["end"] + ) + def get_referencias_acquired_today(self): """ Cuenta cuantas nuevas referencias han aparecido en las ultimas 24 horas """ - + query_statement = """ SELECT count(referencia) FROM primera_captura_full WHERE fecha_captura >= now() - INTERVAL 1 DAY; """ - + cursor_result = self.anunciosdb.query(query_statement) - + return cursor_result.fetchone()[0] - + def get_max_referencias_for_today(self): """ Calcula la cantidad objetivo para las ultimas 24 horas en base a la @@ -121,7 +135,9 @@ class Explorer(): cursor_result = self.anunciosdb.query(query_statement) new_referencias_last_30 = cursor_result.fetchone()[0] - deviation = (monthly_new_ads_target - new_referencias_last_30) / monthly_new_ads_target + deviation = ( + monthly_new_ads_target - new_referencias_last_30 + ) / monthly_new_ads_target max_referencias = (monthly_new_ads_target / 30) * (1 + deviation) return max_referencias @@ -165,76 +181,88 @@ class Explorer(): Genera URLs de manera aleatoria :return: """ - root = 'https://www.idealista.com/' - type = Explorer.ad_types[str(randint(1,2))] - city = 'barcelona' - page_number = str(randint(1,30)) - url = root + type + '-garajes/' + city + '-' + city + '/' + \ - 'pagina-' + page_number + '.htm' - + root = "https://www.idealista.com/" + type = Explorer.ad_types[str(randint(1, 2))] + city = "barcelona" + page_number = str(randint(1, 30)) + url = ( + root + + type + + "-garajes/" + + city + + "-" + + city + + "/" + + "pagina-" + + page_number + + ".htm" + ) + return url - + class ExploringTask: - def __init__(self, url): self.anunciosdb = get_anunciosdb() self.tasksdb = get_tasksdb() self.target_url = url self.id = str(uuid.uuid4()) - self._update_status('Pending') - + self._update_status("Pending") + def _update_status(self, new_status): self.status = new_status self._log_in_tasksdb() - + def explore(self): attack = UrlAttack(self.target_url) attack.attack() - self._update_status('Attacked') - + self._update_status("Attacked") + if attack.success: self._validate_referencias(attack.get_text()) self._extract_referencias(attack.get_text()) if self.referencias: - self._update_status('Referencias ready') + self._update_status("Referencias ready") elif self.there_are_referencias: - self._update_status('Failure - No new referencias in HTML') + self._update_status("Failure - No new referencias in HTML") else: - self._update_status('Failure - HTML with no referencias') + self._update_status("Failure - HTML with no referencias") else: - self._update_status('Failure - Bad request') + self._update_status("Failure - Bad request") def _log_in_tasksdb(self): """ Graba en la base de datos de tareas un registro con el UUID de la tarea, un timestamp y el status """ - + query_statement = """INSERT INTO exploring_tasks_logs (uuid, write_time, status) VALUES (%(uuid)s, NOW(), %(status)s)""" - - query_parameters = {'uuid': self.id, - 'status': self.status} - + + query_parameters = {"uuid": self.id, "status": self.status} + self.tasksdb.query(query_statement, query_parameters) - + def _validate_referencias(self, html): """ Comprueba que las etiquetas sigan el formato de un anuncio. Lanza una advertencia si no es así. """ - soup = BeautifulSoup(html, 'html5lib') + soup = BeautifulSoup(html, "html5lib") ads = soup.find_all(class_="item") pattern = "^[0-9]{3,20}$" - + for ad in ads: if not re.match(pattern, ad["data-adid"]): - alert_master("Alerta - Referencias no válidas", - """Una tarea de exploración ha considerado inválida + alert_master( + "Alerta - Referencias no válidas", + """Una tarea de exploración ha considerado inválida una referencia. El texto de la referencia era : {} - """.format(ad["data-adid"])) + """.format( + ad["data-adid"] + ), + ) break def _extract_referencias(self, html): @@ -243,13 +271,13 @@ class ExploringTask: de capturas, y guarda si han aparecido listings y si hay alguno nuevo """ - soup = BeautifulSoup(html, 'html5lib') - ads = soup.find_all(class_ = "item") + soup = BeautifulSoup(html, "html5lib") + ads = soup.find_all(class_="item") self.there_are_referencias = bool(ads) self.referencias = [] for ad in ads: if self._is_new_listing(ad["data-adid"]): - self.referencias.append(ad["data-adid"]) + self.referencias.append(ad["data-adid"]) def _is_new_listing(self, referencia): """ @@ -260,13 +288,13 @@ class ExploringTask: WHERE referencia = %s""" query_params = (referencia,) cursor_result = self.anunciosdb.query(query_statement, query_params) - + result = cursor_result.fetchone() if result[0] > 0: return False else: return True - + def get_referencias(self): """ Devuelve las referencias, si las hay @@ -277,6 +305,6 @@ class ExploringTask: return None -if __name__ == '__main__': +if __name__ == "__main__": explorer = Explorer() explorer.start()