Formatting

This commit is contained in:
pablo 2020-03-26 11:38:08 +01:00
parent 9c2565f5d8
commit acfeeef0d1

View file

@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
import sys
sys.path.append('..')
sys.path.append("..")
import uuid
import datetime
from time import sleep
@ -12,13 +13,15 @@ from core.config import monthly_new_ads_target, working_hours
from core.scrapping_utils import UrlAttack
from core.alerts import alert_master
from db_layer.capturing_tasks_interface import capturing_interface
from core import my_logger
import logging
class Explorer:
class Explorer():
sleep_time_no_work = 60
sleep_time_no_service = 600
ad_types = {'1': 'alquiler',
'2': 'venta'}
ad_types = {"1": "alquiler", "2": "venta"}
def __init__(self):
try:
@ -33,35 +36,39 @@ class Explorer():
self.queue_retries = 0
def start(self):
    """
    Run the exploring loop.

    Each iteration:
      * sleeps and retries while ``there_is_work()`` is false;
      * if ``database_is_up()`` is false, alerts the master, calls
        ``stop()`` and leaves the loop;
      * otherwise explores one listing URL and, when the task ends in the
        "Referencias ready" status, creates one capturing task per
        referencia and marks the exploring task as "Sent to queue".
    """
    while True:
        if not self.there_is_work():
            print("{}: Waiting. No work".format(datetime.datetime.now()))
            sleep(Explorer.sleep_time_no_work)
            continue

        if not self.database_is_up():
            alert_master(
                "SQL DOWN",
                "El explorer informa de que SQL esta caida. Actividad detenida",
            )
            self.stop()
            # The alert states that activity has stopped, and building an
            # ExploringTask needs the database, so do not fall through.
            return

        current_task = ExploringTask(self.compose_listing_url())
        current_task.explore()
        print("{}: Exploring done".format(datetime.datetime.now()))

        if current_task.status == "Referencias ready":
            # Enqueue every referencia exactly once.
            for referencia in current_task.get_referencias():
                capturing_interface.create_capturing_task(
                    referencia, current_task.id
                )
            current_task._update_status("Sent to queue")
def stop(self):
    """Placeholder hook for stopping the service; currently does nothing."""
    # TODO: stop the service.
    return None
def there_is_work(self):
"""
Funcion que agrupa las condiciones que se deben cumplir para poder trabajar
@ -71,15 +78,18 @@ class Explorer():
if not self.in_working_hours():
return False
if self.get_referencias_acquired_today() >= self.get_max_referencias_for_today():
if (
self.get_referencias_acquired_today()
>= self.get_max_referencias_for_today()
):
return False
if self.get_tasks_created_today() >= self.get_max_tasks_today():
return False
return True
def database_is_up(self):
while self.db_retries <= self.max_db_retries:
try:
@ -89,26 +99,30 @@ class Explorer():
except:
sleep(Explorer.sleep_time_no_service)
self.db_retries = self.db_retries + 1
return False
def in_working_hours(self):
    """
    Tell whether the current local time of day lies inside the configured
    working window.

    :return: True when ``working_hours["start"] <= now <= working_hours["end"]``
    """
    return (
        working_hours["start"]
        <= datetime.datetime.now().time()
        <= working_hours["end"]
    )
def get_referencias_acquired_today(self):
    """
    Count the referencias in ``primera_captura_full`` whose capture date
    falls within the last 24 hours.

    :return: first column of the single row returned by the count query
    """
    count_statement = """ SELECT count(referencia)
FROM primera_captura_full
WHERE fecha_captura >= now() - INTERVAL 1 DAY;
"""
    first_row = self.anunciosdb.query(count_statement).fetchone()
    return first_row[0]
def get_max_referencias_for_today(self):
"""
Calcula la cantidad objetivo para las ultimas 24 horas en base a la
@ -121,7 +135,9 @@ class Explorer():
cursor_result = self.anunciosdb.query(query_statement)
new_referencias_last_30 = cursor_result.fetchone()[0]
deviation = (monthly_new_ads_target - new_referencias_last_30) / monthly_new_ads_target
deviation = (
monthly_new_ads_target - new_referencias_last_30
) / monthly_new_ads_target
max_referencias = (monthly_new_ads_target / 30) * (1 + deviation)
return max_referencias
@ -165,76 +181,88 @@ class Explorer():
Genera URLs de manera aleatoria
:return:
"""
root = 'https://www.idealista.com/'
type = Explorer.ad_types[str(randint(1,2))]
city = 'barcelona'
page_number = str(randint(1,30))
url = root + type + '-garajes/' + city + '-' + city + '/' + \
'pagina-' + page_number + '.htm'
root = "https://www.idealista.com/"
type = Explorer.ad_types[str(randint(1, 2))]
city = "barcelona"
page_number = str(randint(1, 30))
url = (
root
+ type
+ "-garajes/"
+ city
+ "-"
+ city
+ "/"
+ "pagina-"
+ page_number
+ ".htm"
)
return url
class ExploringTask:
def __init__(self, url):
    """
    Create an exploring task for ``url``.

    Obtains the database handles through ``get_anunciosdb()`` and
    ``get_tasksdb()``, assigns the task a fresh UUID4 string as its id and
    records the initial "Pending" status once.

    :param url: listing URL this task will attack
    """
    self.anunciosdb = get_anunciosdb()
    self.tasksdb = get_tasksdb()
    self.target_url = url
    self.id = str(uuid.uuid4())
    self._update_status("Pending")
def _update_status(self, new_status):
self.status = new_status
self._log_in_tasksdb()
def explore(self):
    """
    Attack the target URL and classify the outcome in the task status.

    The status becomes "Attacked" once the request has been made and then
    exactly one of:
      * "Referencias ready" - new referencias were extracted;
      * "Failure - No new referencias in HTML" - listings found, none new;
      * "Failure - HTML with no referencias" - no listings in the page;
      * "Failure - Bad request" - the attack did not succeed.
    """
    attack = UrlAttack(self.target_url)
    attack.attack()
    self._update_status("Attacked")

    if not attack.success:
        self._update_status("Failure - Bad request")
        return

    self._validate_referencias(attack.get_text())
    self._extract_referencias(attack.get_text())

    # Each outcome is recorded once so the task log has one row per
    # transition.
    if self.referencias:
        self._update_status("Referencias ready")
    elif self.there_are_referencias:
        self._update_status("Failure - No new referencias in HTML")
    else:
        self._update_status("Failure - HTML with no referencias")
def _log_in_tasksdb(self):
"""
Graba en la base de datos de tareas un registro con el UUID de la tarea,
un timestamp y el status
"""
query_statement = """INSERT INTO exploring_tasks_logs
(uuid, write_time, status)
VALUES (%(uuid)s, NOW(), %(status)s)"""
query_parameters = {'uuid': self.id,
'status': self.status}
query_parameters = {"uuid": self.id, "status": self.status}
self.tasksdb.query(query_statement, query_parameters)
def _validate_referencias(self, html):
    """
    Check that the listing tags still follow the expected ad format.

    Every element with class "item" must carry a ``data-adid`` attribute
    made of 3 to 20 digits. On the first element that does not, a single
    alert is sent through ``alert_master`` and the check stops.

    :param html: HTML text of the listing page
    """
    soup = BeautifulSoup(html, "html5lib")
    ads = soup.find_all(class_="item")
    pattern = "^[0-9]{3,20}$"

    for ad in ads:
        # .get() returns None for a missing attribute, so an item without
        # data-adid is reported as invalid instead of raising KeyError.
        referencia = ad.get("data-adid")
        if referencia is None or not re.match(pattern, referencia):
            alert_master(
                "Alerta - Referencias no válidas",
                """Una tarea de exploración ha considerado inválida
una referencia. El texto de la referencia era : {}
""".format(
                    referencia
                ),
            )
            break
def _extract_referencias(self, html):
@ -243,13 +271,13 @@ class ExploringTask:
de capturas, y guarda si han aparecido listings y si hay alguno nuevo
"""
soup = BeautifulSoup(html, 'html5lib')
ads = soup.find_all(class_ = "item")
soup = BeautifulSoup(html, "html5lib")
ads = soup.find_all(class_="item")
self.there_are_referencias = bool(ads)
self.referencias = []
for ad in ads:
if self._is_new_listing(ad["data-adid"]):
self.referencias.append(ad["data-adid"])
self.referencias.append(ad["data-adid"])
def _is_new_listing(self, referencia):
"""
@ -260,13 +288,13 @@ class ExploringTask:
WHERE referencia = %s"""
query_params = (referencia,)
cursor_result = self.anunciosdb.query(query_statement, query_params)
result = cursor_result.fetchone()
if result[0] > 0:
return False
else:
return True
def get_referencias(self):
"""
Devuelve las referencias, si las hay
@ -277,6 +305,6 @@ class ExploringTask:
return None
if __name__ == "__main__":
    # Script entry point: build the explorer and run its work loop.
    explorer = Explorer()
    explorer.start()