Avanzado en desarrollo de capturing task. Creada clase GeocodingTask

This commit is contained in:
pablomartincalvo 2018-09-26 22:56:45 +02:00
parent 1d68575fd7
commit 9d947f7e2b
3 changed files with 152 additions and 52 deletions

View file

@ -5,7 +5,7 @@ from time import sleep
from core.mysql_wrapper import get_anunciosdb, get_tasksdb
from core.scrapping_utils import UrlAttack
from core.alerts import alert_master
from capturer.geocoder import GeocodingTask
ads_root = 'https://www.idealista.com/inmueble/'
@ -40,7 +40,8 @@ class CapturingTask:
self.uuid = parameters['uuid']
self.ad_url = parameters['ad_url']
self.uuid_exploring = parameters['uuid_exploring']
self.request_failures = 0
self.request_failures = 1
self.geocode_status = "Pending"
self.tasksdb = get_tasksdb()
@ -76,15 +77,12 @@ class CapturingTask:
self._read_fields()
while self.request_failures < 4:
while self.request_failures < 3:
attack = UrlAttack(self.ad_url)
attack.attack()
if attack.success():
self.html = attack.get_text()
if not self._html_is_valid():
alert_master('ERROR CAPTURER',
'El HTML de una pagina de anuncio es invalido')
with self._fields_not_present() as missing_fields:
if missing_fields:
@ -107,6 +105,9 @@ class CapturingTask:
#Geocodear
self.geocode()
#TODO Lidiar con el resultado del geocoding
#TODO Manejar tema cache
else:
self.request_failures += 1
self._update_status('Fail {}'.format(self.request_failures))
@ -121,18 +122,6 @@ class CapturingTask:
for field_parameters in ad_fields_parameters:
self.fields.append(ScrapTargetField(field_parameters))
def _html_is_valid(self, html=self.html):
"""
Lee el HTML y aplica normas de validación del contenido
"""
#TODO Comprobar si HTML es pagina de bloqueo
#TODO Check de longitud
pass
def _fields_not_present(self, html=self.html):
"""
Lee el HTML y devuelve los campos que no esten presentes
@ -161,14 +150,35 @@ class CapturingTask:
for field in self.fields:
self.ad_data[field.name] = field.get_value(self.html)
def geocode(self):
    """Placeholder for the geocoding step; it does nothing yet."""
    #TODO Build the geocoding method. Maybe in another class?
    return None
def get_ad_data(self):
    """Return the object stored in self.ad_data."""
    ad_data = self.ad_data
    return ad_data
def geocode(self):
    """
    Geocode the ad address through a GeocodingTask, with at most 3 attempts.

    On success, sets self.geocode_status to 'Success' and stores the
    geocoder results in self.geocode_results. Otherwise sets
    self.geocode_status to 'Surrender', either because the geocoder said so
    or because the attempts ran out.
    """
    # TODO Build the address in the proper format.
    # NOTE(review): `formated_address` is not defined anywhere in the visible
    # code; it has to be built before this call can work.
    geo_task = GeocodingTask(formated_address)

    # Every attempt counts against the budget. Counting only explicit
    # 'Retry' answers let a non-200 response (or an unexpected status)
    # keep the loop spinning forever.
    for _ in range(3):
        geo_task.geocode()
        if geo_task.get_request_status() != 200:
            continue

        google_status = geo_task.success_surrender_retry()
        if google_status == 'Success':
            self.geocode_status = 'Success'
            self.geocode_results = geo_task.get_results()
            return
        if google_status == 'Surrender':
            self.geocode_status = 'Surrender'
            return
        # 'Retry' (or anything unrecognised): try again while attempts remain.

    self.geocode_status = 'Surrender'
class ScrapTargetField: