import sys
# Extend the import path with the parent directory before the `core` imports
# below (presumably so the `core` package resolves when run from this folder).
sys.path.append('..')
import uuid
from core.mysql_wrapper import get_anunciosdb, get_tasksdb
from core.scrapping_utils import UrlAttack

ads_root = 'https://www.idealista.com/inmueble/'

# TODO: create the list of fields


def create_capturing_task(referencia, db_wrapper, uuid_exploring=None):
    """Insert a new 'Pending' row into ``capturing_tasks_logs``.

    The row gets a freshly generated UUID4, the database's ``NOW()`` as
    write time and the ad URL built as ``ads_root + referencia``.

    Args:
        referencia: Ad reference, appended verbatim to ``ads_root``.
        db_wrapper: Object exposing ``query(statement, parameters)``.
        uuid_exploring: Optional UUID stored in ``fk_uuid_exploring``.
            When ``None`` that column is left out of the INSERT.
    """
    query_parameters = {
        'ad_url': ads_root + referencia,
        'uuid': str(uuid.uuid4()),
        'status': 'Pending',
    }

    # NOTE(review): these statements write to a column named `url`, while
    # CapturingTask._log_in_tasksdb writes to `ad_url` on the same table.
    # One of the two is probably wrong -- confirm against the table schema.
    if uuid_exploring is None:
        query_statement = (
            "INSERT INTO capturing_tasks_logs "
            "(uuid, write_time, status, url) "
            "VALUES (%(uuid)s, NOW(), %(status)s, %(ad_url)s)"
        )
    else:
        query_parameters['uuid_exploring'] = uuid_exploring
        query_statement = (
            "INSERT INTO capturing_tasks_logs "
            "(uuid, write_time, status, url, fk_uuid_exploring) "
            "VALUES (%(uuid)s, NOW(), %(status)s, %(ad_url)s, "
            "%(uuid_exploring)s)"
        )

    db_wrapper.query(query_statement, query_parameters)


class CapturingTask:
    """A single ad-capturing task that logs its status changes to the tasks DB."""

    def __init__(self, parameters):
        """Build the task from a parameters mapping and log it as 'Loading'.

        Args:
            parameters: Mapping with the keys ``'uuid'`` and ``'ad_url'`` and,
                optionally, ``'uuid_exploring'``.
        """
        self.uuid = parameters['uuid']
        self.status = 'Loading'
        self.ad_url = parameters['ad_url']
        # create_capturing_task() treats the exploring UUID as optional, so a
        # missing key is tolerated here instead of raising KeyError.
        self.uuid_exploring = parameters.get('uuid_exploring')
        # Default HTML for the validation helpers; nothing in this class
        # assigns it yet because the capture flow is still to be written.
        self.html = None

        self.tasksdb = get_tasksdb()
        self._log_in_tasksdb()

    def _update_status(self, new_status):
        """Set ``self.status`` to ``new_status`` and log the change."""
        self.status = new_status
        self._log_in_tasksdb()

    def _log_in_tasksdb(self):
        """Write a record to the tasks database.

        The record holds the task UUID, a timestamp (``NOW()``), the current
        status, the ad URL and the exploring-task UUID.
        """
        query_statement = (
            "INSERT INTO capturing_tasks_logs "
            "(uuid, write_time, status, ad_url, fk_uuid_exploring) "
            "VALUES (%(uuid)s, NOW(), %(status)s, %(ad_url)s, "
            "%(fk_uuid_exploring)s)"
        )
        query_parameters = {
            'uuid': self.uuid,
            'status': self.status,
            'ad_url': self.ad_url,
            'fk_uuid_exploring': self.uuid_exploring,
        }

        self.tasksdb.query(query_statement, query_parameters)

    def capture(self):
        """Main method holding the capture flow (not implemented yet)."""
        # TODO: develop the capture flow

    def _html_is_valid(self, html=None):
        """Read the HTML and apply the content validation rules.

        Not implemented yet: no check is performed and ``None`` is returned.

        Args:
            html: HTML to validate. Defaults to ``self.html``.
        """
        # Default arguments are evaluated when the function is defined, where
        # `self` does not exist, so the instance fallback is resolved here.
        if html is None:
            html = self.html
        # TODO: check whether the HTML is a blocking page
        # TODO: length check

    def _fields_not_present(self, field_list, html=None):
        """Return the names of the fields that are not found in the HTML.

        Args:
            field_list: Iterable of parameter mappings, each one accepted by
                the ``ScrapTargetField`` constructor.
            html: HTML to search. Defaults to ``self.html``.

        Returns:
            List with the ``name`` of every field whose ``exists(html)`` is
            false, in the order given by ``field_list``.
        """
        # Resolved at call time; `self` is not available in the signature.
        if html is None:
            html = self.html

        fields = (ScrapTargetField(field_parameters)
                  for field_parameters in field_list)
        return [field.name for field in fields if not field.exists(html)]


class ScrapTargetField:
    """A named piece of data to look for in an HTML document."""

    def __init__(self, target_parameters):
        """Store the field name and its search and validation callables.

        Args:
            target_parameters: Mapping with the keys ``'name'``,
                ``'search_method'`` (called with the HTML) and
                ``'validation_method'`` (called with a value).
        """
        self.name = target_parameters['name']
        self.search_method = target_parameters['search_method']
        self.validation_method = target_parameters['validation_method']

    def exists(self, html):
        """Return True when ``search_method(html)`` is not ``None``."""
        return self.search_method(html) is not None

    def validate_value(self, dato):
        """Check the value against this field's rule.

        Returns whatever ``validation_method(dato)`` returns.
        """
        return self.validation_method(dato)

    def get_value(self, html):
        """Search the HTML for the data; returns ``search_method(html)``."""
        return self.search_method(html)