Formatting

This commit is contained in:
pablo 2020-03-26 11:38:08 +01:00
parent 9c2565f5d8
commit acfeeef0d1

View file

@ -1,6 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import sys import sys
sys.path.append('..')
sys.path.append("..")
import uuid import uuid
import datetime import datetime
from time import sleep from time import sleep
@ -12,13 +13,15 @@ from core.config import monthly_new_ads_target, working_hours
from core.scrapping_utils import UrlAttack from core.scrapping_utils import UrlAttack
from core.alerts import alert_master from core.alerts import alert_master
from db_layer.capturing_tasks_interface import capturing_interface from db_layer.capturing_tasks_interface import capturing_interface
from core import my_logger
import logging
class Explorer():
class Explorer:
sleep_time_no_work = 60 sleep_time_no_work = 60
sleep_time_no_service = 600 sleep_time_no_service = 600
ad_types = {'1': 'alquiler', ad_types = {"1": "alquiler", "2": "venta"}
'2': 'venta'}
def __init__(self): def __init__(self):
try: try:
@ -36,30 +39,34 @@ class Explorer():
while True: while True:
if not self.there_is_work(): if not self.there_is_work():
print('{}: Waiting. No work'.format(datetime.datetime.now())) print("{}: Waiting. No work".format(datetime.datetime.now()))
sleep(Explorer.sleep_time_no_work) sleep(Explorer.sleep_time_no_work)
continue continue
if not self.database_is_up(): if not self.database_is_up():
alert_master("SQL DOWN", "El explorer informa de que SQL esta caida. Actividad detenida") alert_master(
"SQL DOWN",
"El explorer informa de que SQL esta caida. Actividad detenida",
)
self.stop() self.stop()
current_task = ExploringTask(self.compose_listing_url()) current_task = ExploringTask(self.compose_listing_url())
current_task.explore() current_task.explore()
print('{}: Exploring done'.format(datetime.datetime.now())) print("{}: Exploring done".format(datetime.datetime.now()))
if current_task.status == 'Referencias ready': if current_task.status == "Referencias ready":
referencias = current_task.get_referencias() referencias = current_task.get_referencias()
for referencia in referencias: for referencia in referencias:
capturing_interface.create_capturing_task(referencia, current_task.id) capturing_interface.create_capturing_task(
referencia, current_task.id
)
current_task._update_status("Sent to queue") current_task._update_status("Sent to queue")
continue continue
def stop(self): def stop(self):
#TODO Detener el servicio # TODO Detener el servicio
#Detener el servicio # Detener el servicio
pass pass
def there_is_work(self): def there_is_work(self):
@ -72,7 +79,10 @@ class Explorer():
if not self.in_working_hours(): if not self.in_working_hours():
return False return False
if self.get_referencias_acquired_today() >= self.get_max_referencias_for_today(): if (
self.get_referencias_acquired_today()
>= self.get_max_referencias_for_today()
):
return False return False
if self.get_tasks_created_today() >= self.get_max_tasks_today(): if self.get_tasks_created_today() >= self.get_max_tasks_today():
@ -93,7 +103,11 @@ class Explorer():
return False return False
def in_working_hours(self): def in_working_hours(self):
return working_hours['start'] <= datetime.datetime.now().time() <= working_hours['end'] return (
working_hours["start"]
<= datetime.datetime.now().time()
<= working_hours["end"]
)
def get_referencias_acquired_today(self): def get_referencias_acquired_today(self):
""" """
@ -121,7 +135,9 @@ class Explorer():
cursor_result = self.anunciosdb.query(query_statement) cursor_result = self.anunciosdb.query(query_statement)
new_referencias_last_30 = cursor_result.fetchone()[0] new_referencias_last_30 = cursor_result.fetchone()[0]
deviation = (monthly_new_ads_target - new_referencias_last_30) / monthly_new_ads_target deviation = (
monthly_new_ads_target - new_referencias_last_30
) / monthly_new_ads_target
max_referencias = (monthly_new_ads_target / 30) * (1 + deviation) max_referencias = (monthly_new_ads_target / 30) * (1 + deviation)
return max_referencias return max_referencias
@ -165,24 +181,33 @@ class Explorer():
Genera URLs de manera aleatoria Genera URLs de manera aleatoria
:return: :return:
""" """
root = 'https://www.idealista.com/' root = "https://www.idealista.com/"
type = Explorer.ad_types[str(randint(1,2))] type = Explorer.ad_types[str(randint(1, 2))]
city = 'barcelona' city = "barcelona"
page_number = str(randint(1,30)) page_number = str(randint(1, 30))
url = root + type + '-garajes/' + city + '-' + city + '/' + \ url = (
'pagina-' + page_number + '.htm' root
+ type
+ "-garajes/"
+ city
+ "-"
+ city
+ "/"
+ "pagina-"
+ page_number
+ ".htm"
)
return url return url
class ExploringTask: class ExploringTask:
def __init__(self, url): def __init__(self, url):
self.anunciosdb = get_anunciosdb() self.anunciosdb = get_anunciosdb()
self.tasksdb = get_tasksdb() self.tasksdb = get_tasksdb()
self.target_url = url self.target_url = url
self.id = str(uuid.uuid4()) self.id = str(uuid.uuid4())
self._update_status('Pending') self._update_status("Pending")
def _update_status(self, new_status): def _update_status(self, new_status):
self.status = new_status self.status = new_status
@ -191,19 +216,19 @@ class ExploringTask:
def explore(self): def explore(self):
attack = UrlAttack(self.target_url) attack = UrlAttack(self.target_url)
attack.attack() attack.attack()
self._update_status('Attacked') self._update_status("Attacked")
if attack.success: if attack.success:
self._validate_referencias(attack.get_text()) self._validate_referencias(attack.get_text())
self._extract_referencias(attack.get_text()) self._extract_referencias(attack.get_text())
if self.referencias: if self.referencias:
self._update_status('Referencias ready') self._update_status("Referencias ready")
elif self.there_are_referencias: elif self.there_are_referencias:
self._update_status('Failure - No new referencias in HTML') self._update_status("Failure - No new referencias in HTML")
else: else:
self._update_status('Failure - HTML with no referencias') self._update_status("Failure - HTML with no referencias")
else: else:
self._update_status('Failure - Bad request') self._update_status("Failure - Bad request")
def _log_in_tasksdb(self): def _log_in_tasksdb(self):
""" """
@ -215,8 +240,7 @@ class ExploringTask:
(uuid, write_time, status) (uuid, write_time, status)
VALUES (%(uuid)s, NOW(), %(status)s)""" VALUES (%(uuid)s, NOW(), %(status)s)"""
query_parameters = {'uuid': self.id, query_parameters = {"uuid": self.id, "status": self.status}
'status': self.status}
self.tasksdb.query(query_statement, query_parameters) self.tasksdb.query(query_statement, query_parameters)
@ -225,16 +249,20 @@ class ExploringTask:
Comprueba que las etiquetas sigan el formato de un anuncio. Comprueba que las etiquetas sigan el formato de un anuncio.
Lanza una advertencia si no es así. Lanza una advertencia si no es así.
""" """
soup = BeautifulSoup(html, 'html5lib') soup = BeautifulSoup(html, "html5lib")
ads = soup.find_all(class_="item") ads = soup.find_all(class_="item")
pattern = "^[0-9]{3,20}$" pattern = "^[0-9]{3,20}$"
for ad in ads: for ad in ads:
if not re.match(pattern, ad["data-adid"]): if not re.match(pattern, ad["data-adid"]):
alert_master("Alerta - Referencias no válidas", alert_master(
"""Una tarea de exploración ha considerado inválida "Alerta - Referencias no válidas",
"""Una tarea de exploración ha considerado inválida
una referencia. El texto de la referencia era : {} una referencia. El texto de la referencia era : {}
""".format(ad["data-adid"])) """.format(
ad["data-adid"]
),
)
break break
def _extract_referencias(self, html): def _extract_referencias(self, html):
@ -243,8 +271,8 @@ class ExploringTask:
de capturas, y guarda si han aparecido listings y si hay alguno nuevo de capturas, y guarda si han aparecido listings y si hay alguno nuevo
""" """
soup = BeautifulSoup(html, 'html5lib') soup = BeautifulSoup(html, "html5lib")
ads = soup.find_all(class_ = "item") ads = soup.find_all(class_="item")
self.there_are_referencias = bool(ads) self.there_are_referencias = bool(ads)
self.referencias = [] self.referencias = []
for ad in ads: for ad in ads:
@ -277,6 +305,6 @@ class ExploringTask:
return None return None
if __name__ == '__main__': if __name__ == "__main__":
explorer = Explorer() explorer = Explorer()
explorer.start() explorer.start()