diff --git a/.idea/workspace.xml b/.idea/workspace.xml index 4b8460b..4149dd7 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -2,6 +2,7 @@ + @@ -15,36 +16,36 @@ - + - + - + - + - + - - + + - - + + @@ -64,13 +65,27 @@ - - + + - - - - + + + + + + + + + + + + + + + + + + @@ -152,6 +167,7 @@ @@ -289,7 +305,14 @@ @@ -305,8 +328,8 @@ - - + + @@ -361,7 +384,8 @@ - @@ -383,6 +407,16 @@ + + + + + + + + + + @@ -400,25 +434,27 @@ - + - - + + - + + + - - + + - - - - + + + + diff --git a/capturer/capturer.py b/capturer/capturer.py index 7747016..019cd7a 100644 --- a/capturer/capturer.py +++ b/capturer/capturer.py @@ -5,7 +5,7 @@ from time import sleep from core.mysql_wrapper import get_anunciosdb, get_tasksdb from core.scrapping_utils import UrlAttack from core.alerts import alert_master - +from capturer.geocoder import GeocodingTask ads_root = 'https://www.idealista.com/inmueble/' @@ -40,7 +40,8 @@ class CapturingTask: self.uuid = parameters['uuid'] self.ad_url = parameters['ad_url'] self.uuid_exploring = parameters['uuid_exploring'] - self.request_failures = 0 + self.request_failures = 1 + self.geocode_status = "Pending" self.tasksdb = get_tasksdb() @@ -76,15 +77,12 @@ class CapturingTask: self._read_fields() - while self.request_failures < 4: + while self.request_failures < 3: attack = UrlAttack(self.ad_url) attack.attack() if attack.success(): self.html = attack.get_text() - if not self._html_is_valid(): - alert_master('ERROR CAPTURER', - 'El HTML de una pagina de anuncio es invalido') with self._fields_not_present() as missing_fields: if missing_fields: @@ -107,6 +105,9 @@ class CapturingTask: #Geocodear self.geocode() + #TODO Lidiar con el resultado del geocoding + #TODO Manejar tema cache + else: self.request_failures += 1 self._update_status('Fail {}'.format(self.request_failures)) @@ -121,18 +122,6 @@ class CapturingTask: for field_parameters in ad_fields_parameters: self.fields.append(ScrapTargetField(field_parameters)) - - def _html_is_valid(self, 
class GeocodingTask:
    """
    One geocoding request against the Google Geocoding API.

    Typical use: build the task with an address, call ``geocode()``, check
    ``get_request_status()`` for the HTTP status, then call
    ``success_surrender_retry()`` to decide what to do and, on success,
    ``get_results()`` to read the coordinates.
    """

    url = 'https://maps.googleapis.com/maps/api/geocode/json'

    # NOTE(security): the API key is hard-coded in source control. It is kept
    # here so existing behaviour does not change, but it should be rotated and
    # loaded from configuration or the environment instead.
    request_parameters = {'region': 'es',
                          'key': 'AIzaSyCnKj0WnsxVZcaoxeAYkuRw3cKRNGiISYA'}

    # Seconds to wait for Google before giving up; without a timeout
    # requests can block forever on a stalled connection.
    request_timeout = 10

    geocoding_status_success = ['OK']
    geocoding_status_surrender = ['ZERO_RESULTS']
    geocoding_status_retry = ['OVER_QUERY_LIMIT',
                              'REQUEST_DENIED',
                              'INVALID_REQUEST',
                              'UNKNOWN_ERROR']

    def __init__(self, address):
        """
        Prepare a request for ``address``.

        :param address: free-text address to geocode.
        """
        # Copy the shared defaults so the address of one task never leaks
        # into the class-level dict (and therefore into other tasks).
        self.request_parameters = dict(type(self).request_parameters,
                                       address=address)
        self.response = None
        self.response_json = None

    def geocode(self):
        """
        Send the geocoding request to Google and store the response.

        The raw response is kept in ``self.response`` and its decoded JSON
        body in ``self.response_json``. If the body is not valid JSON an
        empty dict is stored instead, which ``success_surrender_retry()``
        reports as "Retry".
        """
        # Imported here so the module can be imported, and its parsing
        # helpers used, without the HTTP dependency being loaded.
        import requests

        self.response = requests.get(self.url,
                                     params=self.request_parameters,
                                     timeout=self.request_timeout)
        try:
            self.response_json = self.response.json()
        except ValueError:
            # Error pages are not always JSON; let the caller decide based on
            # the HTTP status instead of crashing here.
            self.response_json = {}

    def get_request_status(self):
        """
        Return the HTTP status code of the last request.
        """
        return self.response.status_code

    def success_surrender_retry(self):
        """
        Classify the last response according to Google's ``status`` field.

        :return: "Success" when results are available, "Surrender" when
                 Google found nothing for the address, and "Retry" for any
                 other (or missing) status.
        """
        status = (self.response_json or {}).get('status')

        if status in self.geocoding_status_success:
            return "Success"
        if status in self.geocoding_status_surrender:
            return "Surrender"
        return "Retry"

    def get_results(self):
        """
        Extract the first result from the response JSON.

        Only meaningful after ``success_surrender_retry()`` returned
        "Success".

        :return: dict with ``latitude``, ``longitude`` and ``precision``
                 (Google's ``location_type``).
        """
        geometry = self.response_json['results'][0]['geometry']
        location = geometry['location']

        # Google names the longitude field 'lng', not 'lon'.
        return {'latitude': location['lat'],
                'longitude': location['lng'],
                'precision': geometry['location_type']}