diff --git a/capturer/capturer.py b/capturer/capturer.py
index c96e668..efe4a68 100644
--- a/capturer/capturer.py
+++ b/capturer/capturer.py
@@ -1,40 +1,32 @@
 import sys
+
 sys.path.append('..')
-import uuid
 from time import sleep
 from bs4 import BeautifulSoup
 import re
 from mysql.capturing_tasks_interface import capturing_interface
 from mysql.capturas_interface import capturas_interface
 from core.scrapping_utils import UrlAttack
-from core.alerts import alert_master
-
 
 class Capturer:
-
     sleep_time_no_work = 60
     minimum_seconds_between_tries = 120
 
     def start(self):
-        #Juzgar si hay que currar
         while True:
-            if capturing_interface.get_pending_task() is None:
+            # Back off when there is no pending task OR the last attempt is too recent.
+            # Must be 'or': with 'and', an empty queue past the cooldown falls through to CapturingTask(None).
+            if (capturing_interface.get_pending_task() is None
+                    or capturing_interface.seconds_since_last_try() < Capturer.minimum_seconds_between_tries):
                 sleep(Capturer.sleep_time_no_work)
                 continue
 
-            if capturing_interface.seconds_since_last_try() < minimum_seconds_between_tries:
-                sleep(Capturer.sleep_time_no_work)
-                continue
-
-            task_parameters = capturing_interface.get_pending_task()
-
-            task = CapturingTask(task_parameters)
+            task = CapturingTask(capturing_interface.get_pending_task())  # NOTE(review): second fetch may race with the guard above — verify
             task.capture()
 
-            if tasks.status = 'Data ready':
+            if task.status == 'Data ready':
                 ad_data = task.get_ad_data()
             else:
                 continue
@@ -42,9 +34,7 @@ class Capturer:
 
             capturas_interface.insert_captura(ad_data)
 
-
 class CapturingTask:
-
     sleep_time_failed_request = 60
 
     def __init__(self, parameters):
@@ -65,11 +55,8 @@ class CapturingTask:
         """
        Metodo principal que contiene el flujo de captura
         """
-        #TODO Desarrollar flujo de captura
         self._update_status('WIP')
-        self._read_fields()
-
         while self.request_failures < 3:
             attack = UrlAttack(self.ad_url)
             attack.attack()
@@ -77,26 +64,40 @@ class CapturingTask:
 
             if attack.success():
                 self.html = attack.get_text()
-                #Extraer datos
-                self.extract_data()
-
-
-                self._update_status('Data ready')
+                self._extract_data()
+                self._check_data()
+                # final status already recorded by _check_data(); without this
+                # return the loop re-attacks forever and ends in 'Surrender'
+                return
             else:
                 self.request_failures += 1
                 self._update_status('Fail {}'.format(self.request_failures))
-                sleep(sleep_time_failed_request)
+                sleep(CapturingTask.sleep_time_failed_request)
                 continue
 
         self._update_status('Surrender')
 
-    def extract_data(self):
-        #TODO Crear un objeto parser y ver que todo esta bien
+    def _extract_data(self):
+        # Build the parser over the captured HTML and run it.
+        self.parser = AdHtmlParser(self.html)
+        self.parser.parse()
+
+    def _check_data(self):
+        # Validate parsed fields and record the task's final status.
+        self.parser.validate()
+
+        if not self.parser.all_fields_are_valid():
+            self._update_status('Invalid value fields')
+            return
+
+        if self.parser.fields_missing():  # True when a mandatory field was NOT found
+            self._update_status('Fields missing')
+            return
+
+        self._update_status('Data ready')
 
     def get_ad_data(self):
-        return self.ad_data
-
+        return self.parser.get_data()
 
 class AdHtmlParser:
@@ -126,7 +127,7 @@ class AdHtmlParser:
             'value': None},
         'calle': {
             'found': False,
-            'optional': False,
+            'optional': True,
             'value': None},
         'barrio': {
             'found': False,
@@ -171,40 +172,117 @@ class AdHtmlParser:
 
     def parse(self):
-        soup = BeautifulSoup(self.html, 'html5lib' )
-
-
-        if soup.findall('link', {'rel': 'canonical'}) is not None:
+        soup = BeautifulSoup(self.html, 'html5lib')
+        # find_all returns a list (empty == not found); 'findall' is not a bs4 method
+        if soup.find_all('link', {'rel': 'canonical'}):
             self.ad_fields['referencia']['value'] = re.findall(r'[0-9]{5,20}',
-                str(soup.findall('link', {'rel': 'canonical'})[0]))[0]
+                                                               str(soup.find_all('link', {'rel': 'canonical'})[0]))[0]
             self.ad_fields['referencia']['found'] = True
 
-        if sopa.find_all('strong', {'class': 'price'}) is not None:
-            self.ad_fields['precio']['value'] = ''.join(re.findall(r'[0-9]',
-                str(sopa.find_all('strong', {'class': 'price'})[0])))
+        if soup.find_all('strong', {'class': 'price'}):  # empty list is falsy; 'is not None' was always true
+            self.ad_fields['precio']['value'] = ''.join(re.findall(r'[0-9]',
+                                                                   str(soup.find_all('strong', {'class': 'price'})[0])))
             self.ad_fields['precio']['found'] = True
 
-        if soup.find('div', {'class':'info-features'}) is not None:
-            self.ad_fields['tamano_categorico']['value'] = sopa.find('div',
-                {'class':'info-features'}).find('span').find('span').text
+        if soup.find('div', {'class': 'info-features'}) is not None:
+            self.ad_fields['tamano_categorico']['value'] = soup.find('div',
+                                                                     {'class': 'info-features'}).find('span').find(
+                'span').text
             self.ad_fields['tamano_categorico']['found'] = True
-        #TODO Seguir con los metodos de parseo
+            posible_m2 = [tag.text for tag in soup.find('div', {'class': 'info-features'}).findAll('span')]
+            if [posible for posible in posible_m2 if 'm²' in posible]:
+                self.ad_fields['m2']['value'] = \
+                    [''.join(re.findall(r'[0-9]', posible)) for posible in posible_m2 if 'm²' in posible][0]
+                self.ad_fields['m2']['found'] = True
+        if soup.find('title') is not None:
+            # must test the title *text*: 'in Tag' iterates child nodes, not characters
+            if 'venta' in soup.find('title').text:
+                self.ad_fields['tipo_anuncio']['value'] = 1
+            else:
+                self.ad_fields['tipo_anuncio']['value'] = 2
+            self.ad_fields['tipo_anuncio']['found'] = True
+        if len(soup.find('div', {'id': 'headerMap'}).find_all('li')) > 3:  # NOTE(review): assumes 'headerMap' div exists — verify
+            self.ad_fields['calle']['value'] = ''
+            self.ad_fields['ciudad']['value'] = soup.find('div', {'id': 'headerMap'}).find_all('li')[-2].text
+            self.ad_fields['ciudad']['found'] = True
+            self.ad_fields['distrito']['value'] = soup.find('div', {'id': 'headerMap'}).find_all('li')[-3].text
+            self.ad_fields['distrito']['found'] = True
+            self.ad_fields['barrio']['value'] = soup.find('div', {'id': 'headerMap'}).find_all('li')[-4].text
+            self.ad_fields['barrio']['found'] = True
+            if len(soup.find('div', {'id': 'headerMap'}).find_all('li')) > 4:
+                self.ad_fields['calle']['value'] = soup.find('div', {'id': 'headerMap'}).find_all('li')[0].text
+                self.ad_fields['calle']['found'] = True
+
+        features_lists = soup.find_all('div', {'class': 'details-property_features'})
+        features = [feature.text for feature_list in features_lists for feature in feature_list.find_all('li')]
+        self.ad_fields['cubierta']['value'] = 1 * any('Cubierta' in feature for feature in features)
+        self.ad_fields['puerta_auto']['value'] = 1 * any('Puerta' in feature for feature in features)
+        self.ad_fields['ascensor']['value'] = 1 * any('ascensor' in feature for feature in features)
+        self.ad_fields['alarma']['value'] = 1 * any('Alarma' in feature for feature in features)
+        self.ad_fields['circuito']['value'] = 1 * any('Cámaras' in feature for feature in features)
+        self.ad_fields['personal']['value'] = 1 * any('Personal' in feature for feature in features)
+
+        self.ad_fields['cubierta']['found'] = True
+        self.ad_fields['puerta_auto']['found'] = True
+        self.ad_fields['ascensor']['found'] = True
+        self.ad_fields['alarma']['found'] = True
+        self.ad_fields['circuito']['found'] = True
+        self.ad_fields['personal']['found'] = True
+
+        if soup.find('p', {'class': 'txt-bold _browserPhone icon-phone'}) is not None:
+            self.ad_fields['telefono']['value'] = soup.find('p', {'class': 'txt-bold _browserPhone icon-phone'}) \
+                .text.replace(' ', '')
+            self.ad_fields['telefono']['found'] = True
 
     def validate(self):
-        #TODO Implementar validacion para aquellos campos que lo necesiten
+        # Flag wrong-looking parsed values; fields never found are left to fields_missing().
+        self.invalid_fields = []
+        if self.ad_fields['referencia']['found'] and not re.match(r"[0-9]{4,20}", self.ad_fields['referencia']['value']):
+            self.invalid_fields.append('referencia')
+
+        if self.ad_fields['precio']['found'] and not re.match(r"[0-9]{1,20}", self.ad_fields['precio']['value']):
+            self.invalid_fields.append('precio')
+
+        possible_values_tamano = ['2 coches o más',
+                                  'coche y moto',
+                                  'coche grande',
+                                  'coche pequeño',
+                                  'moto',
+                                  None]
+        if self.ad_fields['tamano_categorico']['value'] not in possible_values_tamano:
+            self.invalid_fields.append('tamano_categorico')
+
+        if self.ad_fields['barrio']['found'] and 'Barrio' not in self.ad_fields['barrio']['value']:
+            self.invalid_fields.append('barrio')
+
+        if self.ad_fields['distrito']['found'] and 'Distrito' not in self.ad_fields['distrito']['value']:
+            self.invalid_fields.append('distrito')
+
+        if self.ad_fields['telefono']['found'] and not re.match(r"[0-9]{1,20}", self.ad_fields['telefono']['value']):
+            self.invalid_fields.append('telefono')
+
+    def all_fields_are_valid(self):
+        if self.invalid_fields:
+            return False
+        else:
+            return True
 
     def fields_missing(self):
-        #TODO Iterar el diccionario para ver que todos los campos obligatorios estan
-
-
-
-
-
+        # True when any mandatory field was not found during parse().
+        for ad_field in self.ad_fields.values():  # iterate the per-field dicts, not the key strings
+            if not ad_field['optional'] and not ad_field['found']:
+                return True
+        return False
+
+    def get_data(self):
+        data = {}
+        for ad_field in self.ad_fields.keys():
+            data[ad_field] = self.ad_fields[ad_field]['value']
+        return data