drogon/capturer/capturer.py
2018-11-27 20:37:21 +01:00

304 lines
11 KiB
Python

import sys
sys.path.append('..')
from time import sleep
from bs4 import BeautifulSoup
import re
import datetime
from db_layer.capturing_tasks_interface import capturing_interface
from db_layer.capturas_interface import capturas_interface
from core.scrapping_utils import UrlAttack
from refresher.refresher import Refresher
class Capturer:
    """Endless worker that captures pending ads and stores their data.

    The worker repeatedly asks ``capturing_interface`` for a pending task,
    runs it through ``CapturingTask`` and, when the task ends in the
    ``'Data ready'`` status, inserts the extracted data through
    ``capturas_interface``.
    """

    # Seconds to sleep whenever the loop decides there is nothing to do.
    sleep_time_no_work = 60
    # Lower bound compared against capturing_interface.seconds_since_last_try().
    minimum_seconds_between_tries = 120
    # Inclusive daily window (naive local time) in which capturing is allowed.
    working_hours = {'start': datetime.time(9, 0, 0),
                     'end': datetime.time(21, 0, 0)}

    def start(self):
        """Run the capture loop forever; this method never returns."""
        while True:
            # Fetch the pending task exactly once per iteration. Asking the
            # interface a second time to build the task could return a
            # different row (or None) than the one that was just checked.
            pending_task = capturing_interface.get_pending_task()
            if (pending_task is None
                    or capturing_interface.seconds_since_last_try()
                    < Capturer.minimum_seconds_between_tries
                    or not self.in_working_hours()):
                sleep(Capturer.sleep_time_no_work)
                continue

            task = CapturingTask(pending_task)
            task.capture()
            if task.status != 'Data ready':
                # The task already recorded its own failure status.
                continue

            capturas_interface.insert_captura(task.get_ad_data())
            task._update_status('Captura inserted')

    def in_working_hours(self, now=None):
        """Tell whether a time of day falls inside ``working_hours``.

        :param now: ``datetime.time`` to check; defaults to the current
            local time when omitted.
        :return: ``True`` when ``start <= now <= end`` (both ends inclusive).
        """
        if now is None:
            now = datetime.datetime.now().time()
        return (Capturer.working_hours['start']
                <= now
                <= Capturer.working_hours['end'])
class CapturingTask:
    """One capture job: download an ad page, parse it and track its status.

    Every status change is stored on the instance and forwarded to
    ``capturing_interface.update_capturing_task``.
    """

    # Seconds to wait after a failed request before the next attempt.
    sleep_time_failed_request = 60

    def __init__(self, parameters):
        """Build the task from a mapping describing the pending task.

        :param parameters: mapping providing the keys ``'uuid'``,
            ``'ad_url'``, ``'fk_uuid_exploring'`` and ``'status'``.
        """
        self.uuid = parameters['uuid']
        self.ad_url = parameters['ad_url']
        self.uuid_exploring = parameters['fk_uuid_exploring']
        self.status = parameters['status']
        # The failure counter starts at 1; capture() loops while it is < 3.
        self.request_failures = 1
        self.html = None
        self._update_status('Loading')

    def _update_status(self, new_status):
        """Remember ``new_status`` and report it to the capturing interface."""
        self.status = new_status
        capturing_interface.update_capturing_task(
            self.uuid, self.uuid_exploring, self.status, self.ad_url)

    def capture(self):
        """
        Main method holding the capture flow.

        Requests the ad URL while the failure counter is below 3. A
        successful request is parsed and checked; a failed one is either
        recognised as a dead ad or counted as a failure followed by a sleep.
        When the loop runs out of attempts the status becomes 'Surrender'.
        """
        self._update_status('WIP')

        while self.request_failures < 3:
            request = UrlAttack(self.ad_url)
            request.attack()

            if request.success:
                self.html = request.get_text()
                self._extract_data()
                self._check_data()
                return

            # Failed request: the response may still reveal a removed ad.
            # An AttributeError raised in this step is ignored and the
            # attempt is counted as an ordinary failure.
            try:
                ad_is_dead = Refresher.dead_ad_checker(request.get_text())
                if ad_is_dead:
                    self._update_status('Dead ad')
                    return
            except AttributeError:
                pass

            failure_status = 'Fail {}'.format(self.request_failures)
            self._update_status(failure_status)
            self.request_failures += 1
            sleep(CapturingTask.sleep_time_failed_request)

        self._update_status('Surrender')

    def _extract_data(self):
        """Parse the downloaded HTML with a fresh ``AdHtmlParser``."""
        html_parser = AdHtmlParser(self.html)
        self.parser = html_parser
        html_parser.parse()

    def _check_data(self):
        """Set the status that matches the quality of the parsed data."""
        if self.parser.fields_missing():
            outcome = 'Fields missing'
        elif not self.parser.all_fields_are_valid():
            outcome = 'Invalid value fields'
        else:
            outcome = 'Data ready'
        self._update_status(outcome)

    def get_ad_data(self):
        """Return the data dict produced by the parser."""
        return self.parser.get_data()
class AdHtmlParser:
    """Extract the fields of a single ad from the HTML of its page.

    ``ad_fields`` maps each field name to a dict with three keys:
    ``'found'`` (set to ``True`` by ``parse`` when the field was located),
    ``'optional'`` (whether ``fields_missing`` tolerates its absence) and
    ``'value'`` (the extracted value, ``None`` until parsed).
    """

    # (field name, optional) pairs, in the order get_data() reports them.
    _FIELD_SPECS = (
        ('referencia', False),
        ('precio', False),
        ('tamano_categorico', True),
        ('m2', True),
        ('tipo_anuncio', False),
        ('calle', True),
        ('barrio', False),
        ('distrito', False),
        ('ciudad', False),
        ('cubierta', False),
        ('puerta_auto', False),
        ('ascensor', False),
        ('alarma', False),
        ('circuito', False),
        ('personal', False),
        ('telefono', True),
    )

    # (field name, text looked for in the feature list items) pairs.
    _FEATURE_KEYWORDS = (
        ('cubierta', 'Cubierta'),
        ('puerta_auto', 'Puerta'),
        ('ascensor', 'ascensor'),
        ('alarma', 'Alarma'),
        ('circuito', 'Cámaras'),
        ('personal', 'Personal'),
    )

    # Text that identifies the span holding the surface of the ad.
    # NOTE(review): the previous test was `'' in text`, which is true for
    # every string, so the first span was always taken as the surface. The
    # square-metre sign is assumed to be the intended marker -- confirm
    # against a live ad page.
    _M2_MARKER = 'm²'

    _VALID_TAMANOS = ('2 coches o más',
                      'coche y moto',
                      'coche grande',
                      'coche pequeño',
                      'moto',
                      None)

    def __init__(self, html_string):
        """Store the HTML and create the empty field table.

        :param html_string: HTML source of the ad page.
        """
        self.html = html_string
        self.ad_fields = {
            name: {'found': False, 'optional': optional, 'value': None}
            for name, optional in self._FIELD_SPECS
        }
        # Filled by _validate(); defined here so it always exists.
        self.invalid_fields = []

    def parse(self):
        """Fill ``ad_fields`` from the HTML.

        Elements that are absent from the page leave their fields untouched
        (``found`` stays ``False``) instead of raising.
        """
        soup = BeautifulSoup(self.html, 'html5lib')
        self._parse_referencia(soup)
        self._parse_precio(soup)
        self._parse_size(soup)
        self._parse_tipo_anuncio(soup)
        self._parse_location(soup)
        self._parse_features(soup)
        self._parse_telefono(soup)

    def _set_field(self, name, value):
        """Store ``value`` for field ``name`` and mark the field as found."""
        self.ad_fields[name]['value'] = value
        self.ad_fields[name]['found'] = True

    def _parse_referencia(self, soup):
        """Read the ad reference: the first 5-20 digit run in the canonical link."""
        # find() yields None when the tag is absent; find_all() yields an
        # empty list, which made the old `is not None` guard useless.
        canonical = soup.find('link', {'rel': 'canonical'})
        if canonical is None:
            return
        references = re.findall(r'[0-9]{5,20}', str(canonical))
        if references:
            self._set_field('referencia', references[0])

    def _parse_precio(self, soup):
        """Read the price as the digits of the first ``strong.price`` tag."""
        price_tag = soup.find('strong', {'class': 'price'})
        if price_tag is None:
            return
        # A tag without digits yields '' here; _validate() rejects it later.
        self._set_field('precio', ''.join(re.findall(r'[0-9]', str(price_tag))))

    def _parse_size(self, soup):
        """Read ``tamano_categorico`` and ``m2`` from the info-features div."""
        info = soup.find('div', {'class': 'info-features'})
        if info is None:
            return

        # The categorical size is the text of the first span nested in the
        # first span of the div.
        outer_span = info.find('span')
        inner_span = outer_span.find('span') if outer_span is not None else None
        if inner_span is not None:
            self._set_field('tamano_categorico', inner_span.text)

        surface_texts = [span.text for span in info.find_all('span')
                         if self._M2_MARKER in span.text]
        if surface_texts:
            digits = ''.join(re.findall(r'[0-9]+,*[0-9]*', surface_texts[0]))
            # Decimal comma -> decimal point.
            self._set_field('m2', digits.replace(',', '.'))

    def _parse_tipo_anuncio(self, soup):
        """Set ``tipo_anuncio`` to 1 when the title contains 'venta', else 2."""
        title = soup.find('title')
        if title is not None:
            self._set_field('tipo_anuncio', 1 if 'venta' in title.text else 2)

    def _parse_location(self, soup):
        """Read street, neighbourhood, district and city from the header map."""
        header = soup.find('div', {'id': 'headerMap'})
        items = header.find_all('li') if header is not None else []
        if len(items) <= 3:
            return

        # Without a dedicated street item the street value is an empty
        # string and the field stays flagged as not found.
        self.ad_fields['calle']['value'] = ''
        self._set_field('ciudad', items[-2].text.strip())
        self._set_field('distrito', items[-3].text.strip())
        self._set_field('barrio', items[-4].text.strip())
        if len(items) > 4:
            self._set_field('calle', items[0].text.strip())

    def _parse_features(self, soup):
        """Set the 0/1 feature flags from the property feature lists."""
        feature_lists = soup.find_all('div', {'class': 'details-property_features'})
        features = [item.text
                    for feature_list in feature_lists
                    for item in feature_list.find_all('li')]
        for name, keyword in self._FEATURE_KEYWORDS:
            # A flag is always "found": no matching item simply means 0.
            self._set_field(name, int(any(keyword in feature for feature in features)))

    def _parse_telefono(self, soup):
        """Read the phone number, with spaces removed, when it is shown."""
        phone_tag = soup.find('p', {'class': 'txt-bold _browserPhone icon-phone'})
        if phone_tag is not None:
            self._set_field('telefono', phone_tag.text.replace(' ', ''))

    @staticmethod
    def _matches(pattern, value):
        """Tell whether ``value`` is a string that ``pattern`` matches at its start."""
        return isinstance(value, str) and re.match(pattern, value) is not None

    def _validate(self):
        """Rebuild ``invalid_fields`` with the names of the fields that fail.

        Values that are still ``None`` count as invalid instead of raising
        ``TypeError``.
        """
        self.invalid_fields = []

        if not self._matches(r"[0-9]{4,20}", self.ad_fields['referencia']['value']):
            self.invalid_fields.append('referencia')

        if not self._matches(r"[0-9]{1,20}", self.ad_fields['precio']['value']):
            self.invalid_fields.append('precio')

        if self.ad_fields['tamano_categorico']['value'] not in self._VALID_TAMANOS:
            self.invalid_fields.append('tamano_categorico')

        barrio = self.ad_fields['barrio']['value']
        if not isinstance(barrio, str) or 'Barrio' not in barrio:
            self.invalid_fields.append('barrio')

        distrito = self.ad_fields['distrito']['value']
        if not isinstance(distrito, str) or 'Distrito' not in distrito:
            self.invalid_fields.append('distrito')

        # The phone is only validated when the page actually showed one.
        if (self.ad_fields['telefono']['found']
                and not self._matches(r"[0-9]{1,20}", self.ad_fields['telefono']['value'])):
            self.invalid_fields.append('telefono')

    def all_fields_are_valid(self):
        """Validate the fields and return ``True`` when none is invalid."""
        self._validate()
        return not self.invalid_fields

    def fields_missing(self):
        """Return ``True`` when a non-optional field has not been found."""
        return any(not contents['optional'] and not contents['found']
                   for contents in self.ad_fields.values())

    def get_data(self):
        """Return a ``{field name: value}`` dict covering every field."""
        return {name: contents['value'] for name, contents in self.ad_fields.items()}
# Script entry point: build a capturer and run its endless loop.
if __name__ == '__main__':
    Capturer().start()