Avanzado en desarrollo de capturing task.

This commit is contained in:
pablomartincalvo 2018-09-23 21:04:00 +02:00
parent 7b47b1e766
commit 1d68575fd7
2 changed files with 125 additions and 40 deletions

View file

@ -1,13 +1,18 @@
import sys
sys.path.append('..')
import uuid
from time import sleep
from core.mysql_wrapper import get_anunciosdb, get_tasksdb
from core.scrapping_utils import UrlAttack
from core.alerts import alert_master
# Root URL of the ad pages on idealista.
# NOTE(review): presumably an ad reference is appended to build the ad URL;
# the code doing so is not visible here - confirm.
ads_root = 'https://www.idealista.com/inmueble/'
# TODO: Create the list of fields.
# Each entry of this list is passed to ScrapTargetField(...) by
# CapturingTask._read_fields.
ad_fields_parameters = []
def create_capturing_task(referencia, db_wrapper, uuid_exploring=None):
@ -29,15 +34,17 @@ def create_capturing_task(referencia, db_wrapper, uuid_exploring=None):
class CapturingTask:
sleep_time_failed_request = 60
def __init__(self, parameters):
# Build a capturing task from `parameters`, which is indexed by the
# keys 'uuid', 'ad_url' and 'uuid_exploring'.
self.uuid = parameters['uuid']
self.status = 'Loading'
self.ad_url = parameters['ad_url']
self.uuid_exploring = parameters['uuid_exploring']
# Number of failed requests so far; the capture loop retries while this
# counter is below 4.
self.request_failures = 0
self.tasksdb = get_tasksdb()
# NOTE(review): _log_in_tasksdb is defined outside this view; presumably
# it records this task in tasksdb - confirm.
self._log_in_tasksdb()
# _update_status assigns self.status again (same value as above); the rest
# of that method is not visible here.
self._update_status('Loading')
def _update_status(self, new_status):
self.status = new_status
@ -65,9 +72,57 @@ class CapturingTask:
Metodo principal que contiene el flujo de captura
"""
#TODO Desarrollar flujo de captura
self._update_status('WIP')
self._read_fields()
while self.request_failures < 4:
attack = UrlAttack(self.ad_url)
attack.attack()
if attack.success():
self.html = attack.get_text()
if not self._html_is_valid():
alert_master('ERROR CAPTURER',
'El HTML de una pagina de anuncio es invalido')
with self._fields_not_present() as missing_fields:
if missing_fields:
alert_master('ERROR CAPTURER',
'Los siguientes campos no estaban presentes {}. '
'URL = {}'.format(missing_fields, self.ad_url))
self._update_status('Dead ad')
return
with self._fields_not_valid() as unvalid_fields:
if unvalid_fields:
alert_master('ERROR CAPTURER',
'Los siguientes campos no tenian valores presentes {}'
'URL = {}'.format(unvalid_fields, self.ad_url))
self._update_status('Dead ad')
return
#Extraer datos
self.extract_data()
#Geocodear
self.geocode()
else:
self.request_failures += 1
self._update_status('Fail {}'.format(self.request_failures))
sleep(sleep_time_failed_request)
continue
self._update_status('Surrender')
def _html_is_valid(selfself, html=self.html):
def _read_fields(self):
    """
    Build one ScrapTargetField per entry of ad_fields_parameters and
    store them, in the same order, in self.fields.
    """
    self.fields = []
    # extend() consumes the generator item by item, so the list is filled
    # incrementally exactly as a manual append loop would do.
    self.fields.extend(
        ScrapTargetField(parameters) for parameters in ad_fields_parameters
    )
def _html_is_valid(self, html=self.html):
"""
Lee el HTML y aplica normas de validación del contenido
"""
@ -78,18 +133,42 @@ class CapturingTask:
#TODO Check de longitud
pass
def _fields_not_present(self, html=None):
    """
    Read the HTML and return the fields that are not present in it.

    :param html: HTML text to inspect. When omitted (None), self.html
        is used.
    :return: list with the name of every field in self.fields whose
        exists(html) check is falsy, in the order of self.fields.
    """
    # The default must be resolved at call time: `self` does not exist
    # when the signature is evaluated, so `html=self.html` raises
    # NameError as soon as the class body is executed.
    if html is None:
        html = self.html
    return [field.name for field in self.fields if not field.exists(html)]
def _fields_not_valid(self, html=None):
    """
    Read the HTML and return the fields that do not hold valid values.

    :param html: HTML text to inspect. When omitted (None), self.html
        is used.
    :return: list with the name of every field in self.fields whose
        validate_value(html) check is falsy, in the order of self.fields.
    """
    # The default must be resolved at call time: `self` does not exist
    # when the signature is evaluated, so `html=self.html` raises
    # NameError as soon as the class body is executed.
    if html is None:
        html = self.html
    return [
        field.name for field in self.fields
        if not field.validate_value(html)
    ]
def extract_data(self):
    """
    Collect the value of every field from the stored HTML.

    The result is left in self.ad_data as a dict that maps each field
    name to whatever the field's get_value(html) returns.
    """
    html = self.html
    self.ad_data = {}
    # update() inserts the pairs one by one, mirroring a per-field
    # assignment loop.
    self.ad_data.update(
        (target.name, target.get_value(html)) for target in self.fields
    )
def geocode(self):
# Placeholder step of the capture flow: currently does nothing.
# TODO: Build the geocoding method. Maybe in another class?
pass
def get_ad_data(self):
# Return self.ad_data. In the visible code that attribute is only assigned
# by extract_data(), so presumably this must be called after it - confirm.
return self.ad_data
class ScrapTargetField: