2018-08-09 20:55:04 +02:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
import sys
|
|
|
|
|
sys.path.append('..')
|
2018-08-12 23:14:47 +02:00
|
|
|
import uuid
|
2018-09-09 19:22:21 +02:00
|
|
|
import datetime
|
2018-08-09 20:55:04 +02:00
|
|
|
from time import sleep
|
2018-08-13 23:55:17 +02:00
|
|
|
from bs4 import BeautifulSoup
|
2018-08-30 19:38:31 +02:00
|
|
|
import re
|
|
|
|
|
from random import randint
|
|
|
|
|
from core.mysql_wrapper import get_anunciosdb, get_tasksdb
|
2018-10-29 21:57:20 +01:00
|
|
|
from core.config import monthly_new_ads_target, working_hours
|
2018-08-30 19:38:31 +02:00
|
|
|
from core.scrapping_utils import UrlAttack
|
2018-09-09 19:22:21 +02:00
|
|
|
from core.alerts import alert_master
|
2018-10-13 18:07:32 +02:00
|
|
|
from db_layer.capturing_tasks_interface import capturing_interface
|
2018-08-09 20:55:04 +02:00
|
|
|
|
|
|
|
|
class Explorer():
    """Service loop that finds new listing referencias to capture.

    Repeatedly builds a random listing URL, runs an ExploringTask on it
    and enqueues every new referencia found, while respecting working
    hours and daily quotas derived from the monthly capture target.
    """

    # Seconds to wait before re-checking when there is no work to do.
    sleep_time_no_work = 60

    # Seconds to wait between retries when the database does not answer.
    sleep_time_no_service = 600

    # Maps a random digit (as a string) to the listing ad type to explore.
    ad_types = {'1': 'alquiler',
                '2': 'venta'}

    def __init__(self):
        """Connect to the ads and tasks databases and init retry counters."""
        try:
            self.anunciosdb = get_anunciosdb()
            self.tasksdb = get_tasksdb()
        except Exception:
            # Bug fix: the original bare `except:` swallowed the error and
            # continued with the DB attributes unset, crashing later with
            # AttributeError. Report and re-raise instead.
            print("Could not connect to anuncios DB")
            raise

        self.max_db_retries = 3
        self.db_retries = 0
        self.max_queue_retries = 3
        self.queue_retries = 0

    def start(self):
        """Run the main loop: wait for work, explore, enqueue referencias."""
        while True:
            if not self.there_is_work():
                print('{}: Waiting. No work'.format(datetime.datetime.now()))
                sleep(Explorer.sleep_time_no_work)
                continue

            if not self.database_is_up():
                alert_master("SQL DOWN", "El explorer informa de que SQL esta caida. Actividad detenida")
                self.stop()
                # Bug fix: previously execution fell through and kept
                # creating tasks against a database known to be down,
                # despite the alert announcing activity was stopped.
                return

            current_task = ExploringTask(self.compose_listing_url())
            current_task.explore()
            print('{}: Exploring done'.format(datetime.datetime.now()))

            if current_task.status == 'Referencias ready':
                for referencia in current_task.get_referencias():
                    capturing_interface.create_capturing_task(referencia, current_task.id)
                current_task._update_status("Sent to queue")

    def stop(self):
        """Stop the service. TODO: actually stop the service process."""
        pass

    def there_is_work(self):
        """Return True when every condition required to work is met."""
        # Back off while a task was attacked within the last few minutes.
        if self.check_if_recent_task():
            return False

        if not self.in_working_hours():
            return False

        # Daily quota of new referencias already reached.
        if self.get_referencias_acquired_today() >= self.get_max_referencias_for_today():
            return False

        # Daily quota of exploration attempts already reached.
        if self.get_tasks_created_today() >= self.get_max_tasks_today():
            return False

        return True

    def database_is_up(self):
        """Ping the ads database, retrying up to ``max_db_retries`` times.

        Returns True (and resets the retry counter) on a successful ping,
        False once the retry budget is exhausted.
        """
        while self.db_retries <= self.max_db_retries:
            try:
                self.anunciosdb.ping()
                self.db_retries = 0
                return True
            except Exception:
                # Narrowed from a bare except; any failure counts as
                # "database down" and we wait before retrying.
                sleep(Explorer.sleep_time_no_service)
                self.db_retries = self.db_retries + 1

        return False

    def in_working_hours(self):
        """Return True when the current local time is inside working hours."""
        return working_hours['start'] <= datetime.datetime.now().time() <= working_hours['end']

    def get_referencias_acquired_today(self):
        """Count how many new referencias appeared in the last 24 hours."""
        query_statement = """SELECT count(referencia)
                             FROM primera_captura_full
                             WHERE fecha_captura >= now() - INTERVAL 1 DAY;
                          """
        cursor_result = self.anunciosdb.query(query_statement)
        return cursor_result.fetchone()[0]

    def get_max_referencias_for_today(self):
        """Compute the 24h referencia target from the monthly-target gap.

        The daily base quota (monthly target / 30) is scaled by the
        relative deviation from the monthly target over the last 30 days:
        the further behind target we are, the higher today's quota.
        """
        query_statement = """SELECT count(referencia)
                             FROM primera_captura_full
                             WHERE fecha_captura >= now() - INTERVAL 30 DAY;
                          """
        cursor_result = self.anunciosdb.query(query_statement)
        new_referencias_last_30 = cursor_result.fetchone()[0]

        deviation = (monthly_new_ads_target - new_referencias_last_30) / monthly_new_ads_target
        max_referencias = (monthly_new_ads_target / 30) * (1 + deviation)

        return max_referencias

    def get_tasks_created_today(self):
        """Count exploring tasks started ('Attacked') in the last 24 hours."""
        query_statement = """SELECT count(uuid)
                             FROM exploring_tasks_logs
                             WHERE status = 'Attacked'
                             AND write_time >= now() - INTERVAL 1 DAY;
                          """
        cursor_result = self.tasksdb.query(query_statement)
        return cursor_result.fetchone()[0]

    def get_max_tasks_today(self):
        """Daily cap of exploration attempts: capture quota times a multiplier."""
        return (self.get_max_referencias_for_today() / 30) * 6

    def check_if_recent_task(self):
        """Count tasks attacked in the last 10 minutes (truthy when recent)."""
        query_statement = """SELECT count(uuid)
                             FROM exploring_tasks_logs
                             WHERE status = 'Attacked'
                             AND write_time >= now() - INTERVAL 10 MINUTE
                          """
        cursor_result = self.tasksdb.query(query_statement)
        return cursor_result.fetchone()[0]

    def compose_listing_url(self):
        """Build a random idealista garage-listing URL for Barcelona.

        :return: a listing-page URL with a random ad type and page number.
        """
        root = 'https://www.idealista.com/'
        # Renamed from `type`, which shadowed the builtin.
        ad_type = Explorer.ad_types[str(randint(1, 2))]
        city = 'barcelona'
        page_number = str(randint(1, 30))

        url = (root + ad_type + '-garajes/' + city + '-' + city + '/' +
               'pagina-' + page_number + '.htm')

        return url
|
2018-08-09 20:55:04 +02:00
|
|
|
|
|
|
|
|
|
2018-09-21 18:19:33 +02:00
|
|
|
class ExploringTask:
    """One exploration of a single listing URL.

    Downloads the page, validates and extracts the ad referencias it
    contains, discards those already captured, and logs every status
    change to the tasks database under a unique task UUID.
    """

    def __init__(self, url):
        """Prepare DB handles, assign a UUID and log the 'Pending' status."""
        self.anunciosdb = get_anunciosdb()
        self.tasksdb = get_tasksdb()
        self.target_url = url
        self.id = str(uuid.uuid4())
        # Bug fix: safe defaults so get_referencias() cannot raise
        # AttributeError when explore() fails before extraction.
        self.referencias = []
        self.there_are_referencias = False
        self._update_status('Pending')

    def _update_status(self, new_status):
        """Set the task status and persist it in the tasks log."""
        self.status = new_status
        self._log_in_tasksdb()

    def explore(self):
        """Download the target URL, extract referencias and update status."""
        attack = UrlAttack(self.target_url)
        attack.attack()
        self._update_status('Attacked')

        # Guard clause: nothing to parse when the request failed.
        if not attack.success:
            self._update_status('Failure - Bad request')
            return

        self._validate_referencias(attack.get_text())
        self._extract_referencias(attack.get_text())
        if self.referencias:
            self._update_status('Referencias ready')
        elif self.there_are_referencias:
            self._update_status('Failure - No new referencias in HTML')
        else:
            self._update_status('Failure - HTML with no referencias')

    def _log_in_tasksdb(self):
        """Insert a log row with the task UUID, a timestamp and the status."""
        query_statement = """INSERT INTO exploring_tasks_logs
                             (uuid, write_time, status)
                             VALUES (%(uuid)s, NOW(), %(status)s)"""
        query_parameters = {'uuid': self.id,
                            'status': self.status}
        self.tasksdb.query(query_statement, query_parameters)

    def _validate_referencias(self, html):
        """Alert the master if any ad id does not look like a referencia.

        A valid referencia is a 3-to-20 digit number; on the first invalid
        one an alert is sent and checking stops.
        """
        soup = BeautifulSoup(html, 'html5lib')
        ads = soup.find_all(class_="item")
        pattern = "^[0-9]{3,20}$"

        for ad in ads:
            if not re.match(pattern, ad["data-adid"]):
                alert_master("Alerta - Referencias no válidas",
                             """Una tarea de exploración ha considerado inválida
                             una referencia. El texto de la referencia era : {}
                             """.format(ad["data-adid"]))
                break

    def _extract_referencias(self, html):
        """Extract referencias from the HTML, keeping only unseen ones.

        Also records whether the page contained listings at all, so
        explore() can distinguish "no ads" from "no new ads".
        """
        soup = BeautifulSoup(html, 'html5lib')
        ads = soup.find_all(class_="item")
        self.there_are_referencias = bool(ads)
        self.referencias = [ad["data-adid"] for ad in ads
                            if self._is_new_listing(ad["data-adid"])]

    def _is_new_listing(self, referencia):
        """Return True when the referencia is not yet in the captures table."""
        query_statement = """SELECT count(referencia)
                             FROM capturas
                             WHERE referencia = %s"""
        query_params = (referencia,)
        cursor_result = self.anunciosdb.query(query_statement, query_params)

        return cursor_result.fetchone()[0] == 0

    def get_referencias(self):
        """Return the extracted referencias, or None when there are none."""
        return self.referencias or None
|
2018-09-09 19:22:21 +02:00
|
|
|
|
2018-10-13 18:17:05 +02:00
|
|
|
|
2018-10-14 17:19:48 +02:00
|
|
|
if __name__ == '__main__':
    # Entry point: build the service object and run its main loop.
    Explorer().start()
|