drogon/capturer/capturer.py

258 lines
8.9 KiB
Python

import sys
sys.path.append('..')
import uuid
from time import sleep
from core.mysql_wrapper import get_anunciosdb, get_tasksdb
from core.scrapping_utils import UrlAttack
from core.alerts import alert_master
from capturer.geocoder import GeocodingTask
# Base URL of an ad page; an ad reference is appended to it to build the URL.
ads_root = 'https://www.idealista.com/inmueble/'

# TODO: Create the list of fields.
# One entry per field scraped from an ad page. Both methods are still
# empty-string placeholders for every field.
ad_fields_parameters = [
    {'name': field_name, 'search_method': '', 'validation_method': ''}
    for field_name in (
        'referencia',
        'precio',
        'tamano_categorico',
        'm2',
        'telefono',
        'texto_tipo',
        'ciudad',
        'distrito',
        'barrio',
        'calle',
        'cubierta',
        'puerta_auto',
        'ascensor',
        'alarma',
        'circuito',
        'personal',
        'texto_libre',
    )
]
def create_capturing_task(referencia, db_wrapper, uuid_exploring=None):
    """
    Insert a 'Pending' row for a new capturing task.

    A fresh UUID is generated for the task and the ad URL is built by
    appending ``referencia`` to ``ads_root``. The row is written through
    ``db_wrapper.query``.

    :param referencia: ad reference, appended to ``ads_root``
    :param db_wrapper: object exposing ``query(statement, parameters)``
    :param uuid_exploring: optional UUID stored in ``fk_uuid_exploring``;
        when None the column is left out of the INSERT
    """
    # NOTE(review): this statement writes the URL to a column named `url`,
    # while CapturingTask._log_in_tasksdb writes to `ad_url` in the same
    # table - confirm against the schema which one is correct.
    task_row = {'ad_url': ads_root + referencia,
                'uuid': str(uuid.uuid4()),
                'status': 'Pending'}

    if uuid_exploring is not None:
        task_row['uuid_exploring'] = uuid_exploring
        insert_sql = (
            "INSERT INTO capturing_tasks_logs\n"
            "(uuid, write_time, status, url, fk_uuid_exploring)\n"
            "VALUES (%(uuid)s, NOW(), %(status)s, %(ad_url)s, %(uuid_exploring)s)"
        )
    else:
        insert_sql = (
            "INSERT INTO capturing_tasks_logs\n"
            "(uuid, write_time, status, url)\n"
            "VALUES (%(uuid)s, NOW(), %(status)s, %(ad_url)s)"
        )

    db_wrapper.query(insert_sql, task_row)
class CapturingTask:
    """
    Capture flow for a single ad.

    Downloads the ad page, checks that every configured field is present
    and valid, extracts the field values and logs each status change of the
    task in the tasks database.
    """

    # Seconds to wait after a failed request before the next attempt.
    sleep_time_failed_request = 60
    # capture() gives up once request_failures reaches this value.
    max_request_failures = 3
    # Upper bound on the number of geocoding attempts made by geocode().
    max_geocode_tries = 3

    def __init__(self, parameters):
        """
        Set up the task and log it as 'Loading'.

        :param parameters: mapping with the keys 'uuid' and 'ad_url' and,
            optionally, 'uuid_exploring'
        """
        self.uuid = parameters['uuid']
        self.ad_url = parameters['ad_url']
        # create_capturing_task allows tasks without an exploring UUID, so a
        # missing key is treated as "no exploring task".
        try:
            self.uuid_exploring = parameters['uuid_exploring']
        except KeyError:
            self.uuid_exploring = None
        # NOTE(review): the counter starts at 1, so with a limit of 3 at most
        # two requests are made and the first failure is logged as 'Fail 2'
        # - confirm this is intended.
        self.request_failures = 1
        self.geocode_status = "Pending"

        # Results filled in by capture() / geocode().
        self.html = None
        self.fields = []
        self.ad_data = {}
        self.geocode_results = None

        self.tasksdb = get_tasksdb()
        self._update_status('Loading')

    def _update_status(self, new_status):
        """
        Store the new status and log it in the tasks database.
        """
        self.status = new_status
        self._log_in_tasksdb()

    def _log_in_tasksdb(self):
        """
        Write a record to the tasks database with the task UUID, a timestamp
        and the status.
        """
        # NOTE(review): this statement uses the column `ad_url`, while
        # create_capturing_task writes to `url` in the same table - confirm
        # against the schema which one is correct.
        query_statement = (
            "INSERT INTO capturing_tasks_logs\n"
            "(uuid, write_time, status, ad_url, fk_uuid_exploring)\n"
            "VALUES (%(uuid)s, NOW(), %(status)s, %(ad_url)s, %(fk_uuid_exploring)s)"
        )
        query_parameters = {'uuid': self.uuid,
                            'status': self.status,
                            'ad_url': self.ad_url,
                            'fk_uuid_exploring': self.uuid_exploring}

        self.tasksdb.query(query_statement, query_parameters)

    def capture(self):
        """
        Main method containing the capture flow.

        Requests the ad URL until a request succeeds or the failure limit is
        reached. On a successful request the HTML is checked for missing and
        invalid fields (either case alerts and marks the task 'Dead ad') and
        the field values are extracted. If no request succeeds the task is
        marked 'Surrender'.
        """
        # TODO: Finish developing the capture flow
        self._update_status('WIP')
        self._read_fields()

        while self.request_failures < self.max_request_failures:
            attack = UrlAttack(self.ad_url)
            attack.attack()

            if not attack.success():
                self.request_failures += 1
                self._update_status('Fail {}'.format(self.request_failures))
                sleep(self.sleep_time_failed_request)
                continue

            self.html = attack.get_text()

            missing_fields = self._fields_not_present()
            if missing_fields:
                alert_master('ERROR CAPTURER',
                             'Los siguientes campos no estaban presentes {}. '
                             'URL = {}'.format(missing_fields, self.ad_url))
                self._update_status('Dead ad')
                return

            unvalid_fields = self._fields_not_valid()
            if unvalid_fields:
                alert_master('ERROR CAPTURER',
                             'Los siguientes campos no tenian valores presentes {}. '
                             'URL = {}'.format(unvalid_fields, self.ad_url))
                self._update_status('Dead ad')
                return

            # Extract the data and stop: without this return the loop would
            # request the same URL again forever.
            # NOTE(review): no status for a successful capture is defined in
            # this file, so the task stays in 'WIP' - add one when the flow
            # is finalised.
            self.extract_data()
            return

        self._update_status('Surrender')

    def _read_fields(self):
        """
        Build one ScrapTargetField per entry of ad_fields_parameters.
        """
        self.fields = [ScrapTargetField(field_parameters)
                       for field_parameters in ad_fields_parameters]

    def _fields_not_present(self, html=None):
        """
        Read the HTML and return the names of the fields that are not present.

        :param html: HTML to inspect; defaults to the HTML stored by capture()
        :return: list of field names
        """
        # TODO: Implement optional fields
        if html is None:
            html = self.html
        return [field.name for field in self.fields if not field.exists(html)]

    def _fields_not_valid(self, html=None):
        """
        Read the HTML and return the names of the fields without valid values.

        :param html: HTML to inspect; defaults to the HTML stored by capture()
        :return: list of field names
        """
        if html is None:
            html = self.html
        return [field.name for field in self.fields
                if not field.validate_value(html)]

    def extract_data(self):
        """
        Store the value of every field, read from the captured HTML, in
        ``ad_data`` keyed by field name.
        """
        self.ad_data = {field.name: field.get_value(self.html)
                        for field in self.fields}

    def get_ad_data(self):
        """
        Return the dictionary of extracted field values.
        """
        return self.ad_data

    def _build_address(self):
        """
        Join the non-empty address fields of ``ad_data`` with ', '.
        """
        # TODO: Build the address in the proper format. The field order and
        # separator are an assumption - confirm what GeocodingTask expects.
        address_parts = (self.ad_data.get(key)
                         for key in ('calle', 'barrio', 'distrito', 'ciudad'))
        return ', '.join(str(part) for part in address_parts if part)

    def geocode(self, formatted_address=None):
        """
        Geocode the ad address, making at most ``max_geocode_tries`` attempts.

        Sets ``geocode_status`` to 'Success' (and stores the results in
        ``geocode_results``) or to 'Surrender'.

        :param formatted_address: address to geocode; when None it is built
            from the extracted ad data
        """
        # TODO: Implement this function properly
        if formatted_address is None:
            formatted_address = self._build_address()
        if not formatted_address:
            # Nothing to geocode.
            self.geocode_status = 'Surrender'
            return

        geo_task = GeocodingTask(formatted_address)
        # Every attempt counts towards the limit, including failed requests
        # and unexpected statuses, so the loop always terminates.
        for _ in range(self.max_geocode_tries):
            geo_task.geocode()
            if geo_task.get_request_status() != 200:
                continue

            google_status = geo_task.success_surrender_retry()
            if google_status == 'Success':
                self.geocode_status = 'Success'
                self.geocode_results = geo_task.get_results()
                return
            if google_status == 'Surrender':
                break

        self.geocode_status = 'Surrender'
class ScrapTargetField:
    """
    A single field to scrape, described by its name, the callable used to
    search for it and the callable used to validate it.
    """

    def __init__(self, target_parameters):
        """
        :param target_parameters: mapping with the keys 'name',
            'search_method' and 'validation_method'
        """
        self.name, self.search_method, self.validation_method = (
            target_parameters[key]
            for key in ('name', 'search_method', 'validation_method')
        )

    def exists(self, html):
        """
        Search for the field in an HTML; True unless the search returns None.
        """
        return self.search_method(html) is not None

    def validate_value(self, dato):
        """
        Check the value against the field's validation rule and return the
        result of that rule.
        """
        verdict = self.validation_method(dato)
        return verdict

    def get_value(self, html):
        """
        Search for the field in an HTML and return what the search found.
        """
        found = self.search_method(html)
        return found