"""Capturing tasks: download an idealista ad, check its fields and geocode it."""
import sys
sys.path.append('..')
import uuid
from time import sleep

from core.mysql_wrapper import get_anunciosdb, get_tasksdb
from core.scrapping_utils import UrlAttack
from core.alerts import alert_master
from capturer.geocoder import GeocodingTask

ads_root = 'https://www.idealista.com/inmueble/'

# TODO: build the list of field definitions. Each entry is consumed by
# ScrapTargetField and needs the keys 'name', 'search_method' and
# 'validation_method'.
ad_fields_parameters = []


def create_capturing_task(referencia, db_wrapper, uuid_exploring=None):
    """
    Insert a new 'Pending' capturing task into capturing_tasks_logs.

    :param referencia: ad reference, appended to ``ads_root`` to build the URL
    :param db_wrapper: object exposing ``query(statement, parameters)``
    :param uuid_exploring: optional UUID of the exploring task that found the
        ad; when given it is stored in ``fk_uuid_exploring``
    """
    # NOTE(review): this statement writes the column `url`, while
    # CapturingTask._log_in_tasksdb writes `ad_url` on the same table.
    # One of the two is presumably wrong -- confirm against the schema.
    query_parameters = {'ad_url': ads_root + referencia,
                        'uuid': str(uuid.uuid4()),
                        'status': 'Pending'}

    if uuid_exploring is None:
        query_statement = """INSERT INTO capturing_tasks_logs
                    (uuid, write_time, status, url)
                    VALUES (%(uuid)s, NOW(), %(status)s, %(ad_url)s)"""
    else:
        query_parameters['uuid_exploring'] = uuid_exploring
        query_statement = """INSERT INTO capturing_tasks_logs
                    (uuid, write_time, status, url, fk_uuid_exploring)
                    VALUES (%(uuid)s, NOW(), %(status)s, %(ad_url)s, %(uuid_exploring)s)"""

    db_wrapper.query(query_statement, query_parameters)


class CapturingTask:
    """
    One attempt at capturing a single ad: request its HTML, check that the
    configured fields are present and valid, extract them and geocode the ad.
    Every status change is logged to the tasks database.
    """

    # Seconds to wait after a failed request before trying again.
    sleep_time_failed_request = 60

    def __init__(self, parameters):
        """
        :param parameters: dict with the keys 'uuid' and 'ad_url' and,
            optionally, 'uuid_exploring'
        """
        self.uuid = parameters['uuid']
        self.ad_url = parameters['ad_url']
        # Optional: create_capturing_task allows tasks without an exploring UUID.
        self.uuid_exploring = parameters.get('uuid_exploring')

        # Starts at 1 and capture() loops while it is below 3, so at most two
        # requests are made.
        self.request_failures = 1
        self.geocode_status = "Pending"

        # Filled in by capture(); defined here so accessors never hit a
        # missing attribute.
        self.html = None
        self.ad_data = {}
        self.geocode_results = None

        self.tasksdb = get_tasksdb()

        self._update_status('Loading')

    def _update_status(self, new_status):
        """Set the current status and log it to the tasks database."""
        self.status = new_status
        self._log_in_tasksdb()

    def _log_in_tasksdb(self):
        """
        Write a record to the tasks database with the task UUID, a timestamp
        and the current status.
        """
        query_statement = """INSERT INTO capturing_tasks_logs
                    (uuid, write_time, status, ad_url, fk_uuid_exploring)
                    VALUES (%(uuid)s, NOW(), %(status)s, %(ad_url)s, %(fk_uuid_exploring)s)"""

        query_parameters = {'uuid': self.uuid,
                            'status': self.status,
                            'ad_url': self.ad_url,
                            'fk_uuid_exploring': self.uuid_exploring}

        self.tasksdb.query(query_statement, query_parameters)

    def capture(self):
        """
        Main method holding the capture flow.

        Requests the ad, retrying after a pause on failure. On a successful
        request it alerts and marks the ad as 'Dead ad' if any field is
        missing or invalid; otherwise it extracts the data and geocodes it.
        If every request fails the status ends as 'Surrender'.
        """
        # TODO: finish the capture flow
        self._update_status('WIP')
        self._read_fields()

        while self.request_failures < 3:
            attack = UrlAttack(self.ad_url)
            attack.attack()

            if not attack.success():
                self.request_failures += 1
                self._update_status('Fail {}'.format(self.request_failures))
                # The wait time is a class attribute, not a module global.
                sleep(self.sleep_time_failed_request)
                continue

            self.html = attack.get_text()

            # The helpers return plain lists, which are not context managers,
            # so they are assigned directly instead of used in a `with`.
            missing_fields = self._fields_not_present()
            if missing_fields:
                alert_master('ERROR CAPTURER',
                             'Los siguientes campos no estaban presentes {}. '
                             'URL = {}'.format(missing_fields, self.ad_url))
                self._update_status('Dead ad')
                return

            unvalid_fields = self._fields_not_valid()
            if unvalid_fields:
                alert_master('ERROR CAPTURER',
                             'Los siguientes campos no tenian valores presentes {}. '
                             'URL = {}'.format(unvalid_fields, self.ad_url))
                self._update_status('Dead ad')
                return

            # Extract the data
            self.extract_data()
            # Geocode
            self.geocode()
            # TODO: deal with the geocoding result
            # TODO: handle the cache
            # The capture succeeded: leave the loop instead of requesting the
            # same ad again.
            return

        self._update_status('Surrender')

    def _read_fields(self):
        """Build one ScrapTargetField per entry in ad_fields_parameters."""
        self.fields = [ScrapTargetField(field_parameters)
                       for field_parameters in ad_fields_parameters]

    def _fields_not_present(self, html=None):
        """
        Read the HTML and return the names of the fields that are not present.

        :param html: HTML to inspect; defaults to ``self.html``
        :return: list of field names
        """
        # `self` is not available when defaults are evaluated, so the
        # fallback to self.html is resolved at call time.
        if html is None:
            html = self.html
        return [field.name for field in self.fields if not field.exists(html)]

    def _fields_not_valid(self, html=None):
        """
        Read the HTML and return the names of the fields without valid values.

        :param html: HTML to inspect; defaults to ``self.html``
        :return: list of field names
        """
        if html is None:
            html = self.html
        # NOTE(review): validate_value names its parameter `dato`, which
        # suggests it expects the extracted value rather than the raw HTML.
        # Confirm once the validation methods exist.
        return [field.name for field in self.fields
                if not field.validate_value(html)]

    def extract_data(self):
        """Store the value of every field found in ``self.html`` in ``self.ad_data``."""
        self.ad_data = {field.name: field.get_value(self.html)
                        for field in self.fields}

    def get_ad_data(self):
        """Return the dict of extracted values, keyed by field name."""
        return self.ad_data

    def geocode(self, formated_address=None):
        """
        Geocode the ad address, trying up to three times.

        Sets ``self.geocode_status`` to 'Success' (and stores the results in
        ``self.geocode_results``) or to 'Surrender'.

        :param formated_address: address string handed to GeocodingTask. When
            omitted there is nothing to geocode and the task surrenders
            without making a request.
        """
        # TODO: build the address in the proper format from the ad data
        if formated_address is None:
            self.geocode_status = 'Surrender'
            return

        geo_task = GeocodingTask(formated_address)

        # A bounded loop: every outcome other than 'Success'/'Surrender'
        # (a non-200 response, 'Retry', or an unexpected status) uses up one
        # try, so the loop always terminates.
        for _ in range(3):
            geo_task.geocode()
            if geo_task.get_request_status() != 200:
                continue

            google_status = geo_task.success_surrender_retry()
            if google_status == 'Success':
                self.geocode_status = 'Success'
                self.geocode_results = geo_task.get_results()
                return
            if google_status == 'Surrender':
                break

        self.geocode_status = 'Surrender'


class ScrapTargetField:
    """
    A single piece of data to look for in an ad's HTML, described by a name,
    a search callable and a validation callable.
    """

    def __init__(self, target_parameters):
        """
        :param target_parameters: dict with the keys 'name', 'search_method'
            and 'validation_method'
        """
        self.name = target_parameters['name']
        self.search_method = target_parameters['search_method']
        self.validation_method = target_parameters['validation_method']

    def exists(self, html):
        """
        Look for the datum in an HTML document.

        :return: False when the search method returns None, True otherwise
        """
        return self.search_method(html) is not None

    def validate_value(self, dato):
        """
        Check the value against this field's validation method.

        :return: whatever the validation method returns
        """
        return self.validation_method(dato)

    def get_value(self, html):
        """Search the HTML and return what the search method finds."""
        return self.search_method(html)