diff --git a/capturer/capturer.py b/capturer/capturer.py index 3faba41..d9cd826 100644 --- a/capturer/capturer.py +++ b/capturer/capturer.py @@ -1,6 +1,6 @@ import sys -sys.path.append('..') +sys.path.append("..") from time import sleep from bs4 import BeautifulSoup import re @@ -10,6 +10,8 @@ from db_layer.capturas_interface import capturas_interface from core.scrapping_utils import UrlAttack from core.config import working_hours, minimum_seconds_between_tries from refresher.refresher import Refresher +from core import my_logger +import logging class Capturer: @@ -20,11 +22,15 @@ class Capturer: def start(self): + logging.info("Starting capturer") while True: - if (self.seconds_since_last_try() < minimum_seconds_between_tries - or not self.in_working_hours()): + if ( + self.seconds_since_last_try() < minimum_seconds_between_tries + or not self.in_working_hours() + ): sleep(Capturer.sleep_time_no_work) + logging.info("Waiting...") continue pending_task = capturing_interface.get_pending_task() @@ -34,16 +40,20 @@ class Capturer: self.last_try_datetime = datetime.datetime.now() task.capture() - if task.status == 'Data ready': + if task.status == "Data ready": ad_data = task.get_ad_data() else: continue capturas_interface.insert_captura(ad_data) - task._update_status('Captura inserted') + task._update_status("Captura inserted") def in_working_hours(self): - return working_hours['start'] <= datetime.datetime.now().time() <= working_hours['end'] + return ( + working_hours["start"] + <= datetime.datetime.now().time() + <= working_hours["end"] + ) def seconds_since_last_try(self): return (datetime.datetime.now() - self.last_try_datetime).total_seconds() @@ -53,25 +63,26 @@ class CapturingTask: sleep_time_failed_request = 180 def __init__(self, parameters): - self.uuid = parameters['uuid'] - self.ad_url = parameters['ad_url'] - self.uuid_exploring = parameters['fk_uuid_exploring'] - self.status = parameters['status'] + self.uuid = parameters["uuid"] + self.ad_url = parameters["ad_url"] + self.uuid_exploring = parameters["fk_uuid_exploring"] + self.status = parameters["status"] self.request_failures = 1 self.html = None - self._update_status('Loading') + self._update_status("Loading") def _update_status(self, new_status): self.status = new_status - capturing_interface.update_capturing_task(self.uuid, self.uuid_exploring, - self.status, self.ad_url) + capturing_interface.update_capturing_task( + self.uuid, self.uuid_exploring, self.status, self.ad_url + ) def capture(self): """ Metodo principal que contiene el flujo de captura """ - self._update_status('WIP') + self._update_status("WIP") while self.request_failures < 4: attack = UrlAttack(self.ad_url) @@ -87,17 +98,18 @@ class CapturingTask: else: try: if Refresher.dead_ad_checker(attack.get_text()): - self._update_status('Dead ad') + self._update_status("Dead ad") return except AttributeError: pass - self._update_status('Fail {}'.format(self.request_failures)) + self._update_status("Fail {}".format(self.request_failures)) self.request_failures += 1 sleep(CapturingTask.sleep_time_failed_request) continue - self._update_status('Surrender') + self._update_status("Surrender") + logging.warning(f"A task has surrendered. {self.ad_url}") def _extract_data(self): self.parser = AdHtmlParser(self.html) @@ -105,189 +117,190 @@ class CapturingTask: def _check_data(self): if self.parser.fields_missing(): - self._update_status('Fields missing') + self._update_status("Fields missing") return if not self.parser.all_fields_are_valid(): - self._update_status('Invalid value fields') + self._update_status("Invalid value fields") return - self._update_status('Data ready') + self._update_status("Data ready") def get_ad_data(self): return self.parser.get_data() class AdHtmlParser: - def __init__(self, html_string): self.html = html_string - self.ad_fields = {'referencia': { - 'found': False, - 'optional': False, - 'value': None}, - 'precio': { - 'found': False, - 'optional': False, - 'value': None}, - 'tamano_categorico': { - 'found': False, - 'optional': True, - 'value': None}, - 'm2': { - 'found': False, - 'optional': True, - 'value': None}, - 'tipo_anuncio': { - 'found': False, - 'optional': False, - 'value': None}, - 'calle': { - 'found': False, - 'optional': True, - 'value': None}, - 'barrio': { - 'found': False, - 'optional': False, - 'value': None}, - 'distrito': { - 'found': False, - 'optional': False, - 'value': None}, - 'ciudad': { - 'found': False, - 'optional': False, - 'value': None}, - 'cubierta': { - 'found': False, - 'optional': False, - 'value': None}, - 'puerta_auto': { - 'found': False, - 'optional': False, - 'value': None}, - 'ascensor': { - 'found': False, - 'optional': False, - 'value': None}, - 'alarma': { - 'found': False, - 'optional': False, - 'value': None}, - 'circuito': { - 'found': False, - 'optional': False, - 'value': None}, - 'personal': { - 'found': False, - 'optional': False, - 'value': None}, - 'telefono': { - 'found': False, - 'optional': True, - 'value': None}} - #TODO añadir campos de visitas + self.ad_fields = { + "referencia": {"found": False, "optional": False, "value": None}, + "precio": {"found": False, "optional": False, "value": None}, + "tamano_categorico": {"found": False, "optional": True, "value": None}, + "m2": {"found": False, "optional": True, "value": None}, + "tipo_anuncio": {"found": False, "optional": False, "value": None}, + "calle": {"found": False, "optional": True, "value": None}, + "barrio": {"found": False, "optional": False, "value": None}, + "distrito": {"found": False, "optional": False, "value": None}, + "ciudad": {"found": False, "optional": False, "value": None}, + "cubierta": {"found": False, "optional": False, "value": None}, + "puerta_auto": {"found": False, "optional": False, "value": None}, + "ascensor": {"found": False, "optional": False, "value": None}, + "alarma": {"found": False, "optional": False, "value": None}, + "circuito": {"found": False, "optional": False, "value": None}, + "personal": {"found": False, "optional": False, "value": None}, + "telefono": {"found": False, "optional": True, "value": None}, + } + # TODO añadir campos de visitas def parse(self): - soup = BeautifulSoup(self.html, 'html5lib') + soup = BeautifulSoup(self.html, "html5lib") - if soup.find_all('link', {'rel': 'canonical'}) is not None: - self.ad_fields['referencia']['value'] = re.findall(r'[0-9]{5,20}', - str(soup.find_all('link', {'rel': 'canonical'})[0]))[0] - self.ad_fields['referencia']['found'] = True + if soup.find_all("link", {"rel": "canonical"}) is not None: + self.ad_fields["referencia"]["value"] = re.findall( + r"[0-9]{5,20}", str(soup.find_all("link", {"rel": "canonical"})[0]) + )[0] + self.ad_fields["referencia"]["found"] = True - if soup.find_all('strong', {'class': 'price'}) is not None: - self.ad_fields['precio']['value'] = ''.join(re.findall(r'[0-9]', - str(soup.find_all('strong', {'class': 'price'})[0]))) - self.ad_fields['precio']['found'] = True + if soup.find_all("strong", {"class": "price"}) is not None: + self.ad_fields["precio"]["value"] = "".join( + re.findall( + r"[0-9]", str(soup.find_all("strong", {"class": "price"})[0]) + ) + ) + self.ad_fields["precio"]["found"] = True - if soup.find('div', {'class': 'info-features'}) is not None: + if soup.find("div", {"class": "info-features"}) is not None: try: - if 'm²' not in soup.find('div', {'class': 'info-features'}).find('span').find('span').text: - self.ad_fields['tamano_categorico']['value'] = soup.find('div', {'class': 'info-features'}).find( - 'span').find('span').text - self.ad_fields['tamano_categorico']['found'] = True + if ( + "m²" + not in soup.find("div", {"class": "info-features"}) + .find("span") + .find("span") + .text + ): + self.ad_fields["tamano_categorico"]["value"] = ( + soup.find("div", {"class": "info-features"}) + .find("span") + .find("span") + .text + ) + self.ad_fields["tamano_categorico"]["found"] = True except: pass - posible_m2 = [tag.text for tag in soup.find('div', {'class': 'info-features'}).find_all('span')] - if [posible for posible in posible_m2 if 'm²' in posible]: - self.ad_fields['m2']['value'] = \ - [''.join(re.findall(r'[0-9]+,*[0-9]*', posible)) for posible in posible_m2 if 'm²' in posible][0].replace(',', '.') - self.ad_fields['m2']['found'] = True + posible_m2 = [ + tag.text + for tag in soup.find("div", {"class": "info-features"}).find_all("span") + ] + if [posible for posible in posible_m2 if "m²" in posible]: + self.ad_fields["m2"]["value"] = [ + "".join(re.findall(r"[0-9]+,*[0-9]*", posible)) + for posible in posible_m2 + if "m²" in posible + ][0].replace(",", ".") + self.ad_fields["m2"]["found"] = True - if soup.find('title') is not None: - if 'venta' in soup.find('title').text: - self.ad_fields['tipo_anuncio']['value'] = 1 + if soup.find("title") is not None: + if "venta" in soup.find("title").text: + self.ad_fields["tipo_anuncio"]["value"] = 1 else: - self.ad_fields['tipo_anuncio']['value'] = 2 - self.ad_fields['tipo_anuncio']['found'] = True + self.ad_fields["tipo_anuncio"]["value"] = 2 + self.ad_fields["tipo_anuncio"]["found"] = True - if len(soup.find('div', {'id': 'headerMap'}).find_all('li')) > 3: - self.ad_fields['calle']['value'] = '' - self.ad_fields['ciudad']['value'] = soup.find('div', {'id': 'headerMap'}).find_all('li')[-2].text.strip() - self.ad_fields['ciudad']['found'] = True - self.ad_fields['distrito']['value'] = soup.find('div', {'id': 'headerMap'}).find_all('li')[-3].text.strip() - self.ad_fields['distrito']['found'] = True - self.ad_fields['barrio']['value'] = soup.find('div', {'id': 'headerMap'}).find_all('li')[-4].text.strip() - self.ad_fields['barrio']['found'] = True - if len(soup.find('div', {'id': 'headerMap'}).find_all('li')) > 4: - self.ad_fields['calle']['value'] = soup.find('div', {'id': 'headerMap'}).find_all('li')[0].text.strip() - self.ad_fields['calle']['found'] = True + if len(soup.find("div", {"id": "headerMap"}).find_all("li")) > 3: + self.ad_fields["calle"]["value"] = "" + self.ad_fields["ciudad"]["value"] = ( + soup.find("div", {"id": "headerMap"}).find_all("li")[-2].text.strip() + ) + self.ad_fields["ciudad"]["found"] = True + self.ad_fields["distrito"]["value"] = ( + soup.find("div", {"id": "headerMap"}).find_all("li")[-3].text.strip() + ) + self.ad_fields["distrito"]["found"] = True + self.ad_fields["barrio"]["value"] = ( + soup.find("div", {"id": "headerMap"}).find_all("li")[-4].text.strip() + ) + self.ad_fields["barrio"]["found"] = True + if len(soup.find("div", {"id": "headerMap"}).find_all("li")) > 4: + self.ad_fields["calle"]["value"] = ( + soup.find("div", {"id": "headerMap"}).find_all("li")[0].text.strip() + ) + self.ad_fields["calle"]["found"] = True - features_lists = soup.find_all('div', {'class': 'details-property_features'}) - features = [feature.text for feature_list in features_lists for feature in feature_list.find_all('li')] - self.ad_fields['cubierta']['value'] = 1 * any('Cubierta' in feature for feature in features) - self.ad_fields['puerta_auto']['value'] = 1 * any('Puerta' in feature for feature in features) - self.ad_fields['ascensor']['value'] = 1 * any('ascensor' in feature for feature in features) - self.ad_fields['alarma']['value'] = 1 * any('Alarma' in feature for feature in features) - self.ad_fields['circuito']['value'] = 1 * any('Cámaras' in feature for feature in features) - self.ad_fields['personal']['value'] = 1 * any('Personal' in feature for feature in features) + features_lists = soup.find_all("div", {"class": "details-property_features"}) + features = [ + feature.text + for feature_list in features_lists + for feature in feature_list.find_all("li") + ] + self.ad_fields["cubierta"]["value"] = 1 * any( + "Cubierta" in feature for feature in features + ) + self.ad_fields["puerta_auto"]["value"] = 1 * any( + "Puerta" in feature for feature in features + ) + self.ad_fields["ascensor"]["value"] = 1 * any( + "ascensor" in feature for feature in features + ) + self.ad_fields["alarma"]["value"] = 1 * any( + "Alarma" in feature for feature in features + ) + self.ad_fields["circuito"]["value"] = 1 * any( + "Cámaras" in feature for feature in features + ) + self.ad_fields["personal"]["value"] = 1 * any( + "Personal" in feature for feature in features + ) - self.ad_fields['cubierta']['found'] = True - self.ad_fields['puerta_auto']['found'] = True - self.ad_fields['ascensor']['found'] = True - self.ad_fields['alarma']['found'] = True - self.ad_fields['circuito']['found'] = True - self.ad_fields['personal']['found'] = True + self.ad_fields["cubierta"]["found"] = True + self.ad_fields["puerta_auto"]["found"] = True + self.ad_fields["ascensor"]["found"] = True + self.ad_fields["alarma"]["found"] = True + self.ad_fields["circuito"]["found"] = True + self.ad_fields["personal"]["found"] = True - if soup.find('p', {'class': 'txt-bold _browserPhone icon-phone'}) is not None: - self.ad_fields['telefono']['value'] = soup.find('p', {'class': 'txt-bold _browserPhone icon-phone'}) \ - .text.replace(' ', '') - self.ad_fields['telefono']['found'] = True + if soup.find("p", {"class": "txt-bold _browserPhone icon-phone"}) is not None: + self.ad_fields["telefono"]["value"] = soup.find( + "p", {"class": "txt-bold _browserPhone icon-phone"} + ).text.replace(" ", "") + self.ad_fields["telefono"]["found"] = True # TODO capturar datos de visitas def _validate(self): self.invalid_fields = [] - if not re.match(r"[0-9]{4,20}", self.ad_fields['referencia']['value']): - self.invalid_fields.append('referencia') + if not re.match(r"[0-9]{4,20}", self.ad_fields["referencia"]["value"]): + self.invalid_fields.append("referencia") - if not re.match(r"[0-9]{1,20}", self.ad_fields['precio']['value']): - self.invalid_fields.append('precio') + if not re.match(r"[0-9]{1,20}", self.ad_fields["precio"]["value"]): + self.invalid_fields.append("precio") - possible_values_tamano = ['2 coches o más', - 'coche y moto', - 'coche grande', - 'coche pequeño', - 'moto', - None] - if self.ad_fields['tamano_categorico']['value'] not in possible_values_tamano: - self.invalid_fields.append('tamano_categorico') + possible_values_tamano = [ + "2 coches o más", + "coche y moto", + "coche grande", + "coche pequeño", + "moto", + None, + ] + if self.ad_fields["tamano_categorico"]["value"] not in possible_values_tamano: + self.invalid_fields.append("tamano_categorico") - if not 'Barrio' in self.ad_fields['barrio']['value']: - self.invalid_fields.append('barrio') + if not "Barrio" in self.ad_fields["barrio"]["value"]: + self.invalid_fields.append("barrio") - if not 'Distrito' in self.ad_fields['distrito']['value']: - self.invalid_fields.append('distrito') + if not "Distrito" in self.ad_fields["distrito"]["value"]: + self.invalid_fields.append("distrito") - if (self.ad_fields['telefono']['found'] - and not re.match(r"\s*\+?[0-9\s]*", self.ad_fields['telefono']['value'])): - self.invalid_fields.append('telefono') - #TODO añadir + a caracteres validos + if self.ad_fields["telefono"]["found"] and not re.match( + r"\s*\+?[0-9\s]*", self.ad_fields["telefono"]["value"] + ): + self.invalid_fields.append("telefono") + # TODO añadir + a caracteres validos def all_fields_are_valid(self): self._validate() @@ -298,7 +311,7 @@ class AdHtmlParser: def fields_missing(self): for key, contents in self.ad_fields.items(): - if not contents['optional'] and not contents['found']: + if not contents["optional"] and not contents["found"]: return True return False @@ -306,11 +319,11 @@ class AdHtmlParser: data = {} for ad_field in self.ad_fields.keys(): - data[ad_field] = self.ad_fields[ad_field]['value'] + data[ad_field] = self.ad_fields[ad_field]["value"] return data -if __name__ == '__main__': +if __name__ == "__main__": capturer = Capturer() capturer.start()