diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index cc0d79c..4b8460b 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -4,7 +4,6 @@
-
@@ -16,12 +15,13 @@
-
+
-
+
+
@@ -37,14 +37,14 @@
-
-
+
+
-
-
+
+
@@ -64,13 +64,13 @@
-
-
+
+
-
-
-
-
+
+
+
+
@@ -83,10 +83,9 @@
-
-
+
+
-
@@ -105,7 +104,7 @@
-
+
@@ -152,8 +151,8 @@
-
+
@@ -283,7 +282,14 @@
1537651070019
-
+
+ 1537653361355
+
+
+
+ 1537653361355
+
+
@@ -299,8 +305,8 @@
-
-
+
+
@@ -354,7 +360,8 @@
-
+
+
@@ -385,7 +392,7 @@
-
+
@@ -395,10 +402,9 @@
-
-
+
+
-
@@ -406,13 +412,13 @@
-
-
+
+
-
-
-
-
+
+
+
+
diff --git a/capturer/capturer.py b/capturer/capturer.py
index a2f126d..7747016 100644
--- a/capturer/capturer.py
+++ b/capturer/capturer.py
@@ -1,13 +1,18 @@
import sys
sys.path.append('..')
import uuid
+from time import sleep
from core.mysql_wrapper import get_anunciosdb, get_tasksdb
from core.scrapping_utils import UrlAttack
+from core.alerts import alert_master
+
ads_root = 'https://www.idealista.com/inmueble/'
#TODO Crear la lista de campos
+ad_fields_parameters = []
+
def create_capturing_task(referencia, db_wrapper, uuid_exploring=None):
@@ -29,15 +34,17 @@ def create_capturing_task(referencia, db_wrapper, uuid_exploring=None):
class CapturingTask:
+ sleep_time_failed_request = 60  # seconds to wait after a failed request (value handed to time.sleep)
+
def __init__(self, parameters):
self.uuid = parameters['uuid']
- self.status = 'Loading'
self.ad_url = parameters['ad_url']
self.uuid_exploring = parameters['uuid_exploring']
+ self.request_failures = 0  # failed requests so far; the capture loop gives up when this reaches 4
self.tasksdb = get_tasksdb()
- self._log_in_tasksdb()
+ self._update_status('Loading')
def _update_status(self, new_status):
self.status = new_status
@@ -65,9 +72,57 @@ class CapturingTask:
         Metodo principal que contiene el flujo de captura
         """
         #TODO Desarrollar flujo de captura
+        self._update_status('WIP')
+
+        self._read_fields()
+
+        while self.request_failures < 4:
+            attack = UrlAttack(self.ad_url)
+            attack.attack()
+
+            if attack.success():
+                self.html = attack.get_text()
+                if not self._html_is_valid():
+                    alert_master('ERROR CAPTURER',
+                                 'El HTML de una pagina de anuncio es invalido')
+                # Both checks return plain lists (not context managers): bind them directly
+                missing_fields = self._fields_not_present()
+                if missing_fields:
+                    alert_master('ERROR CAPTURER',
+                                 'Los siguientes campos no estaban presentes {}. '
+                                 'URL = {}'.format(missing_fields, self.ad_url))
+                    self._update_status('Dead ad')
+                    return
+
+                unvalid_fields = self._fields_not_valid()
+                if unvalid_fields:
+                    alert_master('ERROR CAPTURER',
+                                 'Los siguientes campos no tenian valores presentes {}. '
+                                 'URL = {}'.format(unvalid_fields, self.ad_url))
+                    self._update_status('Dead ad')
+                    return
+
+                # Extract the data
+                self.extract_data()
+                # Geocode
+                self.geocode()
+                # Done: without this return the loop would request the same ad forever
+                return
+            else:
+                self.request_failures += 1
+                self._update_status('Fail {}'.format(self.request_failures))
+                sleep(self.sleep_time_failed_request)
+
+        self._update_status('Surrender')
-    def _html_is_valid(selfself, html=self.html):
+    def _read_fields(self):
+        """Build one ScrapTargetField per entry of ad_fields_parameters."""
+        self.fields = [ScrapTargetField(field_parameters)
+                       for field_parameters in ad_fields_parameters]
+
+    # None means "use self.html"; html=self.html fails because self is undefined in a signature
+    def _html_is_valid(self, html=None):
         """
         Lee el HTML y aplica normas de validación del contenido
         """
@@ -78,18 +133,42 @@ class CapturingTask:
         #TODO Check de longitud
         pass
-    def _fields_not_present(self, field_list, html=self.html):
+    def _fields_not_present(self, html=None):
         """
         Lee el HTML y devuelve los campos que no esten presentes
         """
         fields_not_present = []
-        for field_parameters in field_list:
-            field = ScrapTargetField(field_parameters)
-            if not field.exists(html):
-                fields_not_present.append(field.name)
+        html = self.html if html is None else html
+        for field in self.fields:
+            if not field.exists(html):
+                fields_not_present.append(field.name)
         return fields_not_present
+    def _fields_not_valid(self, html=None):
+        """
+        Return the names of the fields whose value in the HTML is not valid.
+        Uses self.html when html is None.
+        """
+        html = self.html if html is None else html
+        return [field.name for field in self.fields
+                if not field.validate_value(html)]
+
+    def extract_data(self):
+        """Store the value of every field, keyed by field name, in self.ad_data."""
+        self.ad_data = {}
+        for field in self.fields:
+            self.ad_data[field.name] = field.get_value(self.html)
+
+    def geocode(self):
+        """Placeholder: geocoding is not implemented yet."""
+        # TODO: build the geocoding method. Maybe in another class?
+        pass
+
+    def get_ad_data(self):
+        """Return the dict built by extract_data()."""
+        return self.ad_data
+
class ScrapTargetField: