and the capturas table. Moved everything related to Geocoding into a service independent of the capturer. Completely reworked the HTML parsing, creating a new object.
import sys

sys.path.append('..')

import uuid
from time import sleep
from bs4 import BeautifulSoup
import re
from mysql.capturing_tasks_interface import capturing_interface
from mysql.capturas_interface import capturas_interface
from core.scrapping_utils import UrlAttack
from core.alerts import alert_master


class Capturer:
    """Polls the capturing task table and stores the data of each captured ad."""

    sleep_time_no_work = 60
    minimum_seconds_between_tries = 120

    def start(self):

        # Decide whether there is work to do
        while True:

            if capturing_interface.get_pending_task() is None:
                sleep(Capturer.sleep_time_no_work)
                continue

            if capturing_interface.seconds_since_last_try() < Capturer.minimum_seconds_between_tries:
                sleep(Capturer.sleep_time_no_work)
                continue

            task_parameters = capturing_interface.get_pending_task()

            task = CapturingTask(task_parameters)
            task.capture()

            if task.status == 'Data ready':
                ad_data = task.get_ad_data()
            else:
                continue

            capturas_interface.insert_captura(ad_data)


class CapturingTask:

    sleep_time_failed_request = 60

    def __init__(self, parameters):
        self.uuid = parameters['uuid']
        self.ad_url = parameters['ad_url']
        self.uuid_exploring = parameters['fk_uuid_exploring']
        self.status = parameters['status']
        self.request_failures = 1

        self._update_status('Loading')

    def _update_status(self, new_status):
        self.status = new_status
        capturing_interface.update_capturing_task(self.uuid, self.uuid_exploring,
                                                  self.status, self.ad_url)

    def capture(self):
        """
        Main method containing the capture flow
        """
        # TODO Develop the capture flow
        self._update_status('WIP')

        while self.request_failures < 3:
            attack = UrlAttack(self.ad_url)
            attack.attack()

            if attack.success():
                self.html = attack.get_text()

                # Extract data
                self.extract_data()

                self._update_status('Data ready')
                return

            else:
                self.request_failures += 1
                self._update_status('Fail {}'.format(self.request_failures))
                sleep(CapturingTask.sleep_time_failed_request)
                continue

        self._update_status('Surrender')

    def extract_data(self):
        # TODO Create a parser object and check that everything is fine
        pass
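        # A possible wiring (a sketch, not necessarily the intended flow): this method
        # could delegate to the new AdHtmlParser object defined below, for example:
        #   parser = AdHtmlParser(self.html)
        #   parser.parse()
        #   if not parser.fields_missing():
        #       self.ad_data = parser.ad_fields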

    def get_ad_data(self):
        return self.ad_data


class AdHtmlParser:

    def __init__(self, html_string):
        self.html = html_string
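
        # Each field to extract tracks whether it has been found in the HTML,
        # whether it is optional, and its parsed value.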
        self.ad_fields = {
            'referencia':        {'found': False, 'optional': False, 'value': None},
            'precio':            {'found': False, 'optional': False, 'value': None},
            'tamano_categorico': {'found': False, 'optional': False, 'value': None},
            'm2':                {'found': False, 'optional': True,  'value': None},
            'tipo_anuncio':      {'found': False, 'optional': False, 'value': None},
            'calle':             {'found': False, 'optional': False, 'value': None},
            'barrio':            {'found': False, 'optional': False, 'value': None},
            'distrito':          {'found': False, 'optional': False, 'value': None},
            'ciudad':            {'found': False, 'optional': False, 'value': None},
            'cubierta':          {'found': False, 'optional': False, 'value': None},
            'puerta_auto':       {'found': False, 'optional': False, 'value': None},
            'ascensor':          {'found': False, 'optional': False, 'value': None},
            'alarma':            {'found': False, 'optional': False, 'value': None},
            'circuito':          {'found': False, 'optional': False, 'value': None},
            'personal':          {'found': False, 'optional': False, 'value': None},
            'telefono':          {'found': False, 'optional': True,  'value': None}}

    def parse(self):

        soup = BeautifulSoup(self.html, 'html5lib')

        if soup.find_all('link', {'rel': 'canonical'}):
            self.ad_fields['referencia']['value'] = re.findall(r'[0-9]{5,20}',
                str(soup.find_all('link', {'rel': 'canonical'})[0]))[0]
            self.ad_fields['referencia']['found'] = True

        if soup.find_all('strong', {'class': 'price'}):
            self.ad_fields['precio']['value'] = ''.join(re.findall(r'[0-9]',
                str(soup.find_all('strong', {'class': 'price'})[0])))
            self.ad_fields['precio']['found'] = True

        if soup.find('div', {'class': 'info-features'}) is not None:
            self.ad_fields['tamano_categorico']['value'] = soup.find('div',
                {'class': 'info-features'}).find('span').find('span').text
            self.ad_fields['tamano_categorico']['found'] = True

        # TODO Continue with the parsing methods

    def validate(self):
        # TODO Implement validation for the fields that need it
        pass

    def fields_missing(self):
        # TODO Iterate the dictionary to check that all mandatory fields are present
        pass
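        # One possible approach (a sketch, not necessarily the intended one): return
        # the mandatory fields that the parser did not find, for example:
        #   return [name for name, spec in self.ad_fields.items()
        #           if not spec['optional'] and not spec['found']]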
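

# The entry point is not shown in this commit; a minimal sketch, assuming the
# capturer is meant to run as a standalone polling process, could be:
if __name__ == '__main__':
    Capturer().start()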