Logging and formatting

This commit is contained in:
pablo 2020-03-26 11:37:32 +01:00
parent cdbb6b5325
commit 9c2565f5d8

View file

@ -1,6 +1,6 @@
import sys import sys
sys.path.append('..') sys.path.append("..")
from time import sleep from time import sleep
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import re import re
@ -10,6 +10,8 @@ from db_layer.capturas_interface import capturas_interface
from core.scrapping_utils import UrlAttack from core.scrapping_utils import UrlAttack
from core.config import working_hours, minimum_seconds_between_tries from core.config import working_hours, minimum_seconds_between_tries
from refresher.refresher import Refresher from refresher.refresher import Refresher
from core import my_logger
import logging
class Capturer: class Capturer:
@ -20,11 +22,15 @@ class Capturer:
def start(self): def start(self):
logging.info("Starting capturer")
while True: while True:
if (self.seconds_since_last_try() < minimum_seconds_between_tries if (
or not self.in_working_hours()): self.seconds_since_last_try() < minimum_seconds_between_tries
or not self.in_working_hours()
):
sleep(Capturer.sleep_time_no_work) sleep(Capturer.sleep_time_no_work)
logging.info("Waiting...")
continue continue
pending_task = capturing_interface.get_pending_task() pending_task = capturing_interface.get_pending_task()
@ -34,16 +40,20 @@ class Capturer:
self.last_try_datetime = datetime.datetime.now() self.last_try_datetime = datetime.datetime.now()
task.capture() task.capture()
if task.status == 'Data ready': if task.status == "Data ready":
ad_data = task.get_ad_data() ad_data = task.get_ad_data()
else: else:
continue continue
capturas_interface.insert_captura(ad_data) capturas_interface.insert_captura(ad_data)
task._update_status('Captura inserted') task._update_status("Captura inserted")
def in_working_hours(self):
    """
    Tell whether the current time falls inside the configured working hours.

    Both limits are inclusive. The limits are read from the ``start`` and
    ``end`` entries of ``working_hours``.
    """
    current_time = datetime.datetime.now().time()
    opening, closing = working_hours["start"], working_hours["end"]
    return opening <= current_time <= closing
def seconds_since_last_try(self): def seconds_since_last_try(self):
return (datetime.datetime.now() - self.last_try_datetime).total_seconds() return (datetime.datetime.now() - self.last_try_datetime).total_seconds()
@ -53,25 +63,26 @@ class CapturingTask:
sleep_time_failed_request = 180 sleep_time_failed_request = 180
def __init__(self, parameters):
    """
    Build a capturing task from a parameters mapping.

    ``parameters`` must provide the keys ``uuid``, ``ad_url``,
    ``fk_uuid_exploring`` and ``status``. Once the attributes are set, the
    task immediately moves itself to the ``Loading`` status.
    """
    source_keys = ("uuid", "ad_url", "fk_uuid_exploring", "status")
    self.uuid, self.ad_url, self.uuid_exploring, self.status = (
        parameters[key] for key in source_keys
    )
    # The counter starts at 1: it numbers the attempt in progress.
    self.request_failures = 1
    self.html = None

    self._update_status("Loading")
def _update_status(self, new_status):
    """
    Store ``new_status`` on the task and forward the task's current state to
    ``capturing_interface.update_capturing_task``.
    """
    self.status = new_status
    task_state = (self.uuid, self.uuid_exploring, self.status, self.ad_url)
    capturing_interface.update_capturing_task(*task_state)
def capture(self): def capture(self):
""" """
Metodo principal que contiene el flujo de captura Metodo principal que contiene el flujo de captura
""" """
self._update_status('WIP') self._update_status("WIP")
while self.request_failures < 4: while self.request_failures < 4:
attack = UrlAttack(self.ad_url) attack = UrlAttack(self.ad_url)
@ -87,17 +98,18 @@ class CapturingTask:
else: else:
try: try:
if Refresher.dead_ad_checker(attack.get_text()): if Refresher.dead_ad_checker(attack.get_text()):
self._update_status('Dead ad') self._update_status("Dead ad")
return return
except AttributeError: except AttributeError:
pass pass
self._update_status('Fail {}'.format(self.request_failures)) self._update_status("Fail {}".format(self.request_failures))
self.request_failures += 1 self.request_failures += 1
sleep(CapturingTask.sleep_time_failed_request) sleep(CapturingTask.sleep_time_failed_request)
continue continue
self._update_status('Surrender') self._update_status("Surrender")
logging.warning(f"A task has surrendered. {self.ad_url}")
def _extract_data(self): def _extract_data(self):
self.parser = AdHtmlParser(self.html) self.parser = AdHtmlParser(self.html)
@ -105,189 +117,190 @@ class CapturingTask:
def _check_data(self): def _check_data(self):
if self.parser.fields_missing(): if self.parser.fields_missing():
self._update_status('Fields missing') self._update_status("Fields missing")
return return
if not self.parser.all_fields_are_valid(): if not self.parser.all_fields_are_valid():
self._update_status('Invalid value fields') self._update_status("Invalid value fields")
return return
self._update_status('Data ready') self._update_status("Data ready")
def get_ad_data(self): def get_ad_data(self):
return self.parser.get_data() return self.parser.get_data()
class AdHtmlParser: class AdHtmlParser:
def __init__(self, html_string): def __init__(self, html_string):
self.html = html_string self.html = html_string
self.ad_fields = {'referencia': { self.ad_fields = {
'found': False, "referencia": {"found": False, "optional": False, "value": None},
'optional': False, "precio": {"found": False, "optional": False, "value": None},
'value': None}, "tamano_categorico": {"found": False, "optional": True, "value": None},
'precio': { "m2": {"found": False, "optional": True, "value": None},
'found': False, "tipo_anuncio": {"found": False, "optional": False, "value": None},
'optional': False, "calle": {"found": False, "optional": True, "value": None},
'value': None}, "barrio": {"found": False, "optional": False, "value": None},
'tamano_categorico': { "distrito": {"found": False, "optional": False, "value": None},
'found': False, "ciudad": {"found": False, "optional": False, "value": None},
'optional': True, "cubierta": {"found": False, "optional": False, "value": None},
'value': None}, "puerta_auto": {"found": False, "optional": False, "value": None},
'm2': { "ascensor": {"found": False, "optional": False, "value": None},
'found': False, "alarma": {"found": False, "optional": False, "value": None},
'optional': True, "circuito": {"found": False, "optional": False, "value": None},
'value': None}, "personal": {"found": False, "optional": False, "value": None},
'tipo_anuncio': { "telefono": {"found": False, "optional": True, "value": None},
'found': False, }
'optional': False, # TODO añadir campos de visitas
'value': None},
'calle': {
'found': False,
'optional': True,
'value': None},
'barrio': {
'found': False,
'optional': False,
'value': None},
'distrito': {
'found': False,
'optional': False,
'value': None},
'ciudad': {
'found': False,
'optional': False,
'value': None},
'cubierta': {
'found': False,
'optional': False,
'value': None},
'puerta_auto': {
'found': False,
'optional': False,
'value': None},
'ascensor': {
'found': False,
'optional': False,
'value': None},
'alarma': {
'found': False,
'optional': False,
'value': None},
'circuito': {
'found': False,
'optional': False,
'value': None},
'personal': {
'found': False,
'optional': False,
'value': None},
'telefono': {
'found': False,
'optional': True,
'value': None}}
#TODO añadir campos de visitas
def parse(self):
    """
    Parse the ad HTML and fill ``self.ad_fields`` with what can be extracted.

    Every field that is located gets its ``value`` set and its ``found`` flag
    switched to True. Elements that are absent from the page are skipped
    without raising, so gaps stay visible through each field's ``found`` flag.
    """
    soup = BeautifulSoup(self.html, "html5lib")
    fields = self.ad_fields

    # Reference: first run of 5-20 digits inside the canonical <link> tag.
    # find() yields None when nothing matches; find_all() returns an empty
    # result set instead, so comparing its result with None never detects a
    # missing tag.
    canonical = soup.find("link", {"rel": "canonical"})
    if canonical is not None:
        reference = re.search(r"[0-9]{5,20}", str(canonical))
        if reference is not None:
            fields["referencia"]["value"] = reference.group(0)
            fields["referencia"]["found"] = True

    # Price: every digit found in the price tag, concatenated.
    price_tag = soup.find("strong", {"class": "price"})
    if price_tag is not None:
        fields["precio"]["value"] = "".join(re.findall(r"[0-9]", str(price_tag)))
        fields["precio"]["found"] = True

    # NOTE(review): the marker below is an empty string in this file as
    # received. With "" every containment test is true, so the categorical
    # size is never stored and m2 is read from the first <span>. This looks
    # like a lost character (presumably the square-metre unit) -- confirm and
    # restore the intended marker.
    size_marker = ""
    info_features = soup.find("div", {"class": "info-features"})
    if info_features is not None:
        try:
            size_text = info_features.find("span").find("span").text
        except AttributeError:
            # No nested <span>: the categorical size is not on the page.
            size_text = None
        if size_text is not None and size_marker not in size_text:
            fields["tamano_categorico"]["value"] = size_text
            fields["tamano_categorico"]["found"] = True

        m2_candidates = [
            tag.text
            for tag in info_features.find_all("span")
            if size_marker in tag.text
        ]
        if m2_candidates:
            # Decimal comma is converted to a decimal point.
            fields["m2"]["value"] = "".join(
                re.findall(r"[0-9]+,*[0-9]*", m2_candidates[0])
            ).replace(",", ".")
            fields["m2"]["found"] = True

    # Ad type: 1 when the title mentions "venta", 2 otherwise.
    title = soup.find("title")
    if title is not None:
        fields["tipo_anuncio"]["value"] = 1 if "venta" in title.text else 2
        fields["tipo_anuncio"]["found"] = True

    # Location: the entries are read from the end of the header list; the
    # street is only present when the list has more than four entries.
    header_map = soup.find("div", {"id": "headerMap"})
    location_items = header_map.find_all("li") if header_map is not None else []
    if len(location_items) > 3:
        fields["calle"]["value"] = ""
        fields["ciudad"]["value"] = location_items[-2].text.strip()
        fields["ciudad"]["found"] = True
        fields["distrito"]["value"] = location_items[-3].text.strip()
        fields["distrito"]["found"] = True
        fields["barrio"]["value"] = location_items[-4].text.strip()
        fields["barrio"]["found"] = True
    if len(location_items) > 4:
        fields["calle"]["value"] = location_items[0].text.strip()
        fields["calle"]["found"] = True

    # Boolean features: 1 when the keyword appears in any listed feature,
    # 0 otherwise. They are always marked as found.
    feature_texts = [
        item.text
        for feature_list in soup.find_all(
            "div", {"class": "details-property_features"}
        )
        for item in feature_list.find_all("li")
    ]
    feature_keywords = {
        "cubierta": "Cubierta",
        "puerta_auto": "Puerta",
        "ascensor": "ascensor",
        "alarma": "Alarma",
        "circuito": "Cámaras",
        "personal": "Personal",
    }
    for field_name, keyword in feature_keywords.items():
        fields[field_name]["value"] = int(
            any(keyword in text for text in feature_texts)
        )
        fields[field_name]["found"] = True

    # Phone: the tag text with spaces removed.
    phone_tag = soup.find("p", {"class": "txt-bold _browserPhone icon-phone"})
    if phone_tag is not None:
        fields["telefono"]["value"] = phone_tag.text.replace(" ", "")
        fields["telefono"]["found"] = True

    # TODO: capture visit data
def _validate(self): def _validate(self):
self.invalid_fields = [] self.invalid_fields = []
if not re.match(r"[0-9]{4,20}", self.ad_fields['referencia']['value']): if not re.match(r"[0-9]{4,20}", self.ad_fields["referencia"]["value"]):
self.invalid_fields.append('referencia') self.invalid_fields.append("referencia")
if not re.match(r"[0-9]{1,20}", self.ad_fields['precio']['value']): if not re.match(r"[0-9]{1,20}", self.ad_fields["precio"]["value"]):
self.invalid_fields.append('precio') self.invalid_fields.append("precio")
possible_values_tamano = ['2 coches o más', possible_values_tamano = [
'coche y moto', "2 coches o más",
'coche grande', "coche y moto",
'coche pequeño', "coche grande",
'moto', "coche pequeño",
None] "moto",
if self.ad_fields['tamano_categorico']['value'] not in possible_values_tamano: None,
self.invalid_fields.append('tamano_categorico') ]
if self.ad_fields["tamano_categorico"]["value"] not in possible_values_tamano:
self.invalid_fields.append("tamano_categorico")
if not 'Barrio' in self.ad_fields['barrio']['value']: if not "Barrio" in self.ad_fields["barrio"]["value"]:
self.invalid_fields.append('barrio') self.invalid_fields.append("barrio")
if not 'Distrito' in self.ad_fields['distrito']['value']: if not "Distrito" in self.ad_fields["distrito"]["value"]:
self.invalid_fields.append('distrito') self.invalid_fields.append("distrito")
if (self.ad_fields['telefono']['found'] if self.ad_fields["telefono"]["found"] and not re.match(
and not re.match(r"\s*\+?[0-9\s]*", self.ad_fields['telefono']['value'])): r"\s*\+?[0-9\s]*", self.ad_fields["telefono"]["value"]
self.invalid_fields.append('telefono') ):
#TODO añadir + a caracteres validos self.invalid_fields.append("telefono")
# TODO añadir + a caracteres validos
def all_fields_are_valid(self): def all_fields_are_valid(self):
self._validate() self._validate()
@ -298,7 +311,7 @@ class AdHtmlParser:
def fields_missing(self): def fields_missing(self):
for key, contents in self.ad_fields.items(): for key, contents in self.ad_fields.items():
if not contents['optional'] and not contents['found']: if not contents["optional"] and not contents["found"]:
return True return True
return False return False
@ -306,11 +319,11 @@ class AdHtmlParser:
data = {} data = {}
for ad_field in self.ad_fields.keys(): for ad_field in self.ad_fields.keys():
data[ad_field] = self.ad_fields[ad_field]['value'] data[ad_field] = self.ad_fields[ad_field]["value"]
return data return data
if __name__ == "__main__":
    # Script entry point: create the capturer and hand control to its loop.
    capturer = Capturer()
    capturer.start()