Iniciadas clases de capturing_task y scraptargetfield.
This commit is contained in:
parent
80d65b7a7c
commit
7b47b1e766
3 changed files with 165 additions and 57 deletions
|
|
@ -1,7 +1,13 @@
|
|||
import sys
|
||||
sys.path.append('..')
|
||||
import uuid
|
||||
from core.mysql_wrapper import get_anunciosdb, get_tasksdb
|
||||
from core.scrapping_utils import UrlAttack
|
||||
|
||||
# Root URL of idealista ad detail pages.
# NOTE(review): presumably an ad reference is appended to build the ad URL —
# confirm against create_capturing_task.
ads_root = 'https://www.idealista.com/inmueble/'
|
||||
|
||||
# TODO: create the list of fields
|
||||
|
||||
|
||||
def create_capturing_task(referencia, db_wrapper, uuid_exploring=None):
|
||||
|
||||
|
|
@ -19,6 +25,98 @@ def create_capturing_task(referencia, db_wrapper, uuid_exploring=None):
|
|||
(uuid, write_time, status, url, fk_uuid_exploring)
|
||||
VALUES (%(uuid)s, NOW(), %(status)s, %(ad_url)s, %(uuid_exploring)s)"""
|
||||
|
||||
|
||||
db_wrapper.query(query_statement, query_parameters)
|
||||
|
||||
class CapturingTask:
    """
    A single ad-capturing task whose status changes are logged in the tasks
    database.

    Every status change (including the initial 'Loading' status set on
    construction) inserts one row into ``capturing_tasks_logs``.
    """

    def __init__(self, parameters):
        """
        Build the task from a parameters mapping and log its initial status.

        :param parameters: mapping with the keys 'uuid', 'ad_url' and
            'uuid_exploring'.
        :raises KeyError: if any of those keys is missing.
        """
        self.uuid = parameters['uuid']
        self.status = 'Loading'
        self.ad_url = parameters['ad_url']
        self.uuid_exploring = parameters['uuid_exploring']
        # HTML of the ad; nothing in this class fetches it yet, so it starts
        # empty. The validation helpers fall back to this attribute.
        self.html = None

        self.tasksdb = get_tasksdb()

        self._log_in_tasksdb()

    def _update_status(self, new_status):
        """
        Set a new status on the task and log the change in the tasks database.

        :param new_status: value stored in ``self.status`` and written to the
            log row.
        """
        self.status = new_status
        self._log_in_tasksdb()

    def _log_in_tasksdb(self):
        """
        Insert a row in the tasks database with the task UUID, a timestamp
        (``NOW()`` evaluated by the database), the current status, the ad URL
        and the UUID of the related exploring task.
        """
        # NOTE(review): create_capturing_task in this file inserts into a
        # column named `url`, while this statement uses `ad_url`; if both
        # target the same table one of them is wrong — confirm the schema.
        query_statement = """INSERT INTO capturing_tasks_logs
                            (uuid, write_time, status, ad_url, fk_uuid_exploring)
                            VALUES (%(uuid)s, NOW(), %(status)s, %(ad_url)s, %(fk_uuid_exploring)s)"""

        query_parameters = {'uuid': self.uuid,
                            'status': self.status,
                            'ad_url': self.ad_url,
                            'fk_uuid_exploring': self.uuid_exploring}

        self.tasksdb.query(query_statement, query_parameters)

    def capture(self):
        """
        Main method holding the capture flow.

        Not implemented yet; currently does nothing and returns None.
        """
        # TODO: develop the capture flow

    def _html_is_valid(self, html=None):
        """
        Read the HTML and apply the content validation rules.

        Not implemented yet: no check is performed and None is returned.

        :param html: HTML to validate; defaults to ``self.html``.
        """
        # A default of `self.html` cannot be written in the signature: default
        # values are evaluated when the `def` statement runs, where `self`
        # does not exist. Resolve the fallback at call time instead.
        if html is None:
            html = self.html

        # TODO: check whether the HTML is a blocking page

        # TODO: length check

    def _fields_not_present(self, field_list, html=None):
        """
        Read the HTML and return the names of the fields that are not present.

        :param field_list: iterable of parameter mappings, each one accepted
            by ``ScrapTargetField``.
        :param html: HTML to search in; defaults to ``self.html``.
        :return: list with the ``name`` of every field whose ``exists(html)``
            is falsy, in the order given by ``field_list``.
        """
        # Same call-time fallback as in _html_is_valid.
        if html is None:
            html = self.html

        fields = (ScrapTargetField(field_parameters)
                  for field_parameters in field_list)
        return [field.name for field in fields if not field.exists(html)]
|
||||
|
||||
|
||||
class ScrapTargetField:
    """
    One piece of data to look for in an HTML document, described by a name,
    a search callable and a validation callable.
    """

    def __init__(self, target_parameters):
        """
        Store the field configuration.

        :param target_parameters: mapping with the keys 'name',
            'search_method' and 'validation_method'.
        :raises KeyError: if any of those keys is missing.
        """
        params = target_parameters
        self.name = params['name']
        self.search_method = params['search_method']
        self.validation_method = params['validation_method']

    def exists(self, html):
        """
        Tell whether the field can be found in an HTML.

        :return: False when ``search_method(html)`` returns None, True
            otherwise.
        """
        found = self.search_method(html)
        return found is not None

    def validate_value(self, dato):
        """
        Check the value against the field's validation rule.

        :return: whatever ``validation_method(dato)`` returns.
        """
        check = self.validation_method
        return check(dato)

    def get_value(self, html):
        """
        Look the field up in an HTML.

        :return: whatever ``search_method(html)`` returns.
        """
        search = self.search_method
        return search(html)
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue