import sys

sys.path.append('..')

import uuid
from time import sleep

from core.mysql_wrapper import get_anunciosdb, get_tasksdb
from core.scrapping_utils import UrlAttack
from core.alerts import alert_master

ads_root = 'https://www.idealista.com/inmueble/'

# TODO Create the list of fields
ad_fields_parameters = []
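
# NOTE: hedged illustration only, not part of the original code. Based on what
# ScrapTargetField.__init__ reads from its target_parameters dict, each entry
# here would presumably carry a field name plus two callables; the 'price'
# field and the lambdas below are hypothetical placeholders.
#
# ad_fields_parameters = [
#     {'name': 'price',
#      # extractor: returns the value found in the HTML, or None if absent
#      'search_method': lambda html: None,
#      # validator: returns True when the extracted value is usable
#      'validation_method': lambda value: value is not None},
# ]

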
def create_capturing_task(referencia, db_wrapper, uuid_exploring=None):
    """
    Inserts a 'Pending' capture task row for the given ad reference into
    capturing_tasks_logs, optionally linking it to an exploring task
    """
    query_parameters = {'ad_url': ads_root + referencia,
                        'uuid': str(uuid.uuid4()),
                        'status': 'Pending'}

    if uuid_exploring is None:
        query_statement = """INSERT INTO capturing_tasks_logs
                             (uuid, write_time, status, url)
                             VALUES (%(uuid)s, NOW(), %(status)s, %(ad_url)s)"""
    else:
        query_parameters['uuid_exploring'] = uuid_exploring
        query_statement = """INSERT INTO capturing_tasks_logs
                             (uuid, write_time, status, url, fk_uuid_exploring)
                             VALUES (%(uuid)s, NOW(), %(status)s, %(ad_url)s,
                                     %(uuid_exploring)s)"""

    db_wrapper.query(query_statement, query_parameters)
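
# Hedged usage sketch (not part of the original module): the ad reference
# '12345678' is hypothetical; get_tasksdb() is assumed to return a wrapper
# exposing the .query(statement, parameters) method used above.
#
# tasksdb = get_tasksdb()
# create_capturing_task('12345678', tasksdb)
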
class CapturingTask:

    sleep_time_failed_request = 60

    def __init__(self, parameters):
        self.uuid = parameters['uuid']
        self.ad_url = parameters['ad_url']
        self.uuid_exploring = parameters['uuid_exploring']
        self.request_failures = 0

        self.tasksdb = get_tasksdb()

        self._update_status('Loading')

    def _update_status(self, new_status):
        self.status = new_status
        self._log_in_tasksdb()

    def _log_in_tasksdb(self):
        """
        Writes a record to the tasks database with the task UUID,
        a timestamp and the status
        """
        query_statement = """INSERT INTO capturing_tasks_logs
                             (uuid, write_time, status, ad_url, fk_uuid_exploring)
                             VALUES (%(uuid)s, NOW(), %(status)s, %(ad_url)s,
                                     %(fk_uuid_exploring)s)"""

        query_parameters = {'uuid': self.uuid,
                            'status': self.status,
                            'ad_url': self.ad_url,
                            'fk_uuid_exploring': self.uuid_exploring}

        self.tasksdb.query(query_statement, query_parameters)

    def capture(self):
        """
        Main method containing the capture flow
        """
        # TODO Develop the capture flow
        self._update_status('WIP')

        self._read_fields()

        while self.request_failures < 4:
            attack = UrlAttack(self.ad_url)
            attack.attack()

            if attack.success():
                self.html = attack.get_text()
                if not self._html_is_valid():
                    alert_master('ERROR CAPTURER',
                                 'The HTML of an ad page is invalid')

                missing_fields = self._fields_not_present()
                if missing_fields:
                    alert_master('ERROR CAPTURER',
                                 'The following fields were not present: {}. '
                                 'URL = {}'.format(missing_fields, self.ad_url))
                    self._update_status('Dead ad')
                    return

                invalid_fields = self._fields_not_valid()
                if invalid_fields:
                    alert_master('ERROR CAPTURER',
                                 'The following fields did not have valid values: {}. '
                                 'URL = {}'.format(invalid_fields, self.ad_url))
                    self._update_status('Dead ad')
                    return

                # Extract data
                self.extract_data()

                # Geocode
                self.geocode()

                # Capture finished: leave the retry loop so the task is not
                # marked as 'Surrender' after a successful run
                return

            else:
                self.request_failures += 1
                self._update_status('Fail {}'.format(self.request_failures))
                sleep(self.sleep_time_failed_request)
                continue

        self._update_status('Surrender')

    def _read_fields(self):
        self.fields = []
        for field_parameters in ad_fields_parameters:
            self.fields.append(ScrapTargetField(field_parameters))

    def _html_is_valid(self, html=None):
        """
        Reads the HTML and applies content validation rules
        """
        if html is None:
            html = self.html

        # TODO Check whether the HTML is a blocking page

        # TODO Length check

        # Placeholder until the checks above exist: only reject empty responses
        return bool(html)

    def _fields_not_present(self, html=None):
        """
        Reads the HTML and returns the fields that are not present
        """
        if html is None:
            html = self.html

        fields_not_present = []
        for field in self.fields:
            if not field.exists(html):
                fields_not_present.append(field.name)

        return fields_not_present

    def _fields_not_valid(self, html=None):
        """
        Reads the HTML and returns the fields that do not have valid values
        """
        if html is None:
            html = self.html

        fields_not_valid = []
        for field in self.fields:
            if not field.validate_value(html):
                fields_not_valid.append(field.name)

        return fields_not_valid

    def extract_data(self):
        self.ad_data = {}

        for field in self.fields:
            self.ad_data[field.name] = field.get_value(self.html)

    def geocode(self):
        # TODO Build the geocoding method. Maybe in another class?
        pass

    def get_ad_data(self):
        return self.ad_data


class ScrapTargetField:

    def __init__(self, target_parameters):
        self.name = target_parameters['name']
        self.search_method = target_parameters['search_method']
        self.validation_method = target_parameters['validation_method']

    def exists(self, html):
        """
        Searches for the data in the HTML
        """
        return self.search_method(html) is not None

    def validate_value(self, dato):
        """
        Checks the value against its validation rule to confirm it is
        as expected
        """
        return self.validation_method(dato)

    def get_value(self, html):
        """
        Extracts the data from the HTML
        """
        return self.search_method(html)
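

# Hedged end-to-end sketch (assumption, not part of the original module) of how
# the pieces fit together: a hypothetical regex-backed field and a manually
# built task. Kept commented out because CapturingTask.__init__ writes to the
# tasks database and capture() performs HTTP requests.
#
# import re
#
# price_field = ScrapTargetField({
#     'name': 'price',
#     # extractor: first price-like figure in the page, or None
#     'search_method': lambda html: re.search(r'\d[\d.,]*\s*€', html),
#     # validator: capture() passes the full HTML here, per the current flow
#     'validation_method': lambda html: True,
# })
#
# task = CapturingTask({'uuid': str(uuid.uuid4()),
#                       'ad_url': ads_root + '12345678',
#                       'uuid_exploring': None})
# task.capture()
# print(task.get_ad_data())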