import sys

sys.path.append("..")

import datetime
import logging
import re
from time import sleep

from bs4 import BeautifulSoup

from core import my_logger
from core.config import working_hours, minimum_seconds_between_tries
from core.scrapping_utils import UrlAttack
from db_layer.capturas_interface import capturas_interface
from db_layer.capturing_tasks_interface import capturing_interface
from refresher.refresher import Refresher


class Capturer:
    """Worker loop that polls pending capture tasks and stores the results.

    Respects the configured working hours and a minimum delay between
    attempts so the target site is not hit too aggressively.
    """

    # Seconds to sleep when there is nothing to do (throttled, off-hours,
    # or the task queue is empty).
    sleep_time_no_work = 15

    def __init__(self):
        # Timestamp of the last capture attempt; used for rate limiting.
        self.last_try_datetime = datetime.datetime.now()

    def start(self):
        """Run the main loop forever: wait, fetch a task, capture, persist."""
        logging.info("Starting capturer")
        while True:
            if (
                self.seconds_since_last_try() < minimum_seconds_between_tries()
                or not self.in_working_hours()
            ):
                sleep(Capturer.sleep_time_no_work)
                logging.info("Waiting...")
                continue

            pending_task = capturing_interface.get_pending_task()
            if not pending_task:
                # BUGFIX: the original spun through this branch with no pause,
                # hammering the database whenever the queue was empty.
                sleep(Capturer.sleep_time_no_work)
                continue

            task = CapturingTask(pending_task)
            self.last_try_datetime = datetime.datetime.now()
            task.capture()

            # Only persist when the full extract/validate pipeline succeeded.
            if task.status != "Data ready":
                continue
            ad_data = task.get_ad_data()
            capturas_interface.insert_captura(ad_data)
            task._update_status("Captura inserted")

    def in_working_hours(self):
        """Return True when the current time is inside the allowed window."""
        return (
            working_hours["start"]
            <= datetime.datetime.now().time()
            <= working_hours["end"]
        )

    def seconds_since_last_try(self):
        """Return seconds elapsed since the last capture attempt."""
        return (datetime.datetime.now() - self.last_try_datetime).total_seconds()


class CapturingTask:
    """A single ad-capture job: download the page, parse it, validate it.

    Progress is mirrored to the database through ``_update_status`` so the
    task's state survives worker restarts.
    """

    # Seconds to wait before retrying after a failed HTTP request.
    sleep_time_failed_request = 180

    def __init__(self, parameters):
        self.uuid = parameters["uuid"]
        self.ad_url = parameters["ad_url"]
        self.uuid_exploring = parameters["fk_uuid_exploring"]
        self.status = parameters["status"]
        # Starts at 1 so statuses read "Fail 1", "Fail 2", "Fail 3".
        self.request_failures = 1
        self.html = None
        self._update_status("Loading")

    def _update_status(self, new_status):
        """Set the task status locally and persist it to the database."""
        self.status = new_status
        capturing_interface.update_capturing_task(
            self.uuid, self.uuid_exploring, self.status, self.ad_url
        )

    def capture(self):
        """Main capture flow: fetch the page (up to 3 tries), parse, validate.

        Ends with one of the statuses: "Data ready", "Dead ad",
        "Fields missing", "Invalid value fields" or "Surrender".
        """
        self._update_status("WIP")
        while self.request_failures < 4:
            attack = UrlAttack(self.ad_url)
            attack.attack()
            if attack.success:
                self.html = attack.get_text()
                self._extract_data()
                self._check_data()
                return
            # Request failed: the ad may simply be gone — check before retrying.
            try:
                if Refresher.dead_ad_checker(attack.get_text()):
                    self._update_status("Dead ad")
                    return
            except AttributeError:
                # get_text() yielded nothing usable; treat as a plain failure.
                pass
            self._update_status("Fail {}".format(self.request_failures))
            self.request_failures += 1
            sleep(CapturingTask.sleep_time_failed_request)
        self._update_status("Surrender")
        logging.warning(f"A task has surrendered. {self.ad_url}")

    def _extract_data(self):
        """Parse the downloaded HTML into structured ad fields."""
        self.parser = AdHtmlParser(self.html)
        self.parser.parse()

    def _check_data(self):
        """Validate parsed fields and set the final task status."""
        if self.parser.fields_missing():
            self._update_status("Fields missing")
            return
        if not self.parser.all_fields_are_valid():
            self._update_status("Invalid value fields")
            return
        self._update_status("Data ready")

    def get_ad_data(self):
        """Return the parsed ad fields as a flat ``{field: value}`` dict."""
        return self.parser.get_data()


class AdHtmlParser:
    """Extract garage-ad fields from a listing page's HTML.

    Each field is tracked as ``{"found": bool, "optional": bool, "value": ...}``
    so the caller can distinguish "missing" from "present but invalid".
    """

    def __init__(self, html_string):
        self.html = html_string
        _optional = {"tamano_categorico", "m2", "calle", "telefono"}
        self.ad_fields = {
            name: {"found": False, "optional": name in _optional, "value": None}
            for name in (
                "referencia",
                "precio",
                "tamano_categorico",
                "m2",
                "tipo_anuncio",
                "calle",
                "barrio",
                "distrito",
                "ciudad",
                "cubierta",
                "puerta_auto",
                "ascensor",
                "alarma",
                "circuito",
                "personal",
                "telefono",
            )
        }
        # TODO: add visit-count fields

    def parse(self):
        """Fill ``self.ad_fields`` from the stored HTML.

        NOTE: ``find_all`` returns a list (never None), so presence is
        checked by truthiness — the original ``is not None`` guards were
        always true and crashed with IndexError on missing elements.
        """
        soup = BeautifulSoup(self.html, "html5lib")

        # Reference id: digits embedded in the canonical link.
        canonical_links = soup.find_all("link", {"rel": "canonical"})
        if canonical_links:
            refs = re.findall(r"[0-9]{5,20}", str(canonical_links[0]))
            if refs:
                self.ad_fields["referencia"]["value"] = refs[0]
                self.ad_fields["referencia"]["found"] = True

        # Price: concatenate every digit found inside the price tag.
        price_tags = soup.find_all("strong", {"class": "price"})
        if price_tags:
            self.ad_fields["precio"]["value"] = "".join(
                re.findall(r"[0-9]", str(price_tags[0]))
            )
            self.ad_fields["precio"]["found"] = True

        info_features = soup.find("div", {"class": "info-features"})
        if info_features is not None:
            # Categorical size (e.g. "coche grande") lives in a nested span
            # unless the listing states an exact m² figure instead.
            try:
                size_label = info_features.find("span").find("span").text
                if "m²" not in size_label:
                    self.ad_fields["tamano_categorico"]["value"] = size_label
                    self.ad_fields["tamano_categorico"]["found"] = True
            except AttributeError:
                # Nested span structure absent; the field is optional.
                pass

            # Exact surface: first span mentioning m², digits normalised
            # to a dot decimal separator.
            span_texts = [tag.text for tag in info_features.find_all("span")]
            m2_texts = [text for text in span_texts if "m²" in text]
            if m2_texts:
                self.ad_fields["m2"]["value"] = "".join(
                    re.findall(r"[0-9]+,*[0-9]*", m2_texts[0])
                ).replace(",", ".")
                self.ad_fields["m2"]["found"] = True

        # Listing type: 1 = sale ("venta" in the title), 2 = rental.
        title_tag = soup.find("title")
        if title_tag is not None:
            self.ad_fields["tipo_anuncio"]["value"] = (
                1 if "venta" in title_tag.text else 2
            )
            self.ad_fields["tipo_anuncio"]["found"] = True

        # Location breadcrumb: last items are city / district / neighbourhood;
        # a street item appears first only when there are more than 4 entries.
        header_map = soup.find("div", {"id": "headerMap"})
        if header_map is not None:  # BUGFIX: original crashed when absent
            location_items = header_map.find_all("li")
            if len(location_items) > 3:
                self.ad_fields["calle"]["value"] = ""
                self.ad_fields["ciudad"]["value"] = location_items[-2].text.strip()
                self.ad_fields["ciudad"]["found"] = True
                self.ad_fields["distrito"]["value"] = location_items[-3].text.strip()
                self.ad_fields["distrito"]["found"] = True
                self.ad_fields["barrio"]["value"] = location_items[-4].text.strip()
                self.ad_fields["barrio"]["found"] = True
                if len(location_items) > 4:
                    self.ad_fields["calle"]["value"] = (
                        location_items[0].text.strip()
                    )
                    self.ad_fields["calle"]["found"] = True

        # Boolean amenities: 1 if the keyword appears in any feature line.
        features_lists = soup.find_all("div", {"class": "details-property_features"})
        features = [
            feature.text
            for feature_list in features_lists
            for feature in feature_list.find_all("li")
        ]
        amenity_keywords = {
            "cubierta": "Cubierta",
            "puerta_auto": "Puerta",
            "ascensor": "ascensor",
            "alarma": "Alarma",
            "circuito": "Cámaras",
            "personal": "Personal",
        }
        for field, keyword in amenity_keywords.items():
            self.ad_fields[field]["value"] = 1 * any(
                keyword in feature for feature in features
            )
            self.ad_fields[field]["found"] = True

        # Contact phone, with internal spaces stripped.
        phone_tag = soup.find("p", {"class": "txt-bold _browserPhone icon-phone"})
        if phone_tag is not None:
            self.ad_fields["telefono"]["value"] = phone_tag.text.replace(" ", "")
            self.ad_fields["telefono"]["found"] = True
        # TODO: capture visit statistics

    def _validate(self):
        """Populate ``self.invalid_fields`` with names of fields whose
        values fail basic sanity checks.

        Only called after ``fields_missing`` has confirmed every mandatory
        field was found, so mandatory values are not None here.
        """
        self.invalid_fields = []
        if not re.match(r"[0-9]{4,20}", self.ad_fields["referencia"]["value"]):
            self.invalid_fields.append("referencia")
        if not re.match(r"[0-9]{1,20}", self.ad_fields["precio"]["value"]):
            self.invalid_fields.append("precio")
        possible_values_tamano = [
            "2 coches o más",
            "coche y moto",
            "coche grande",
            "coche pequeño",
            "moto",
            None,
        ]
        if self.ad_fields["tamano_categorico"]["value"] not in possible_values_tamano:
            self.invalid_fields.append("tamano_categorico")
        if "Barrio" not in self.ad_fields["barrio"]["value"]:
            self.invalid_fields.append("barrio")
        if "Distrito" not in self.ad_fields["distrito"]["value"]:
            self.invalid_fields.append("distrito")
        # NOTE(review): this pattern can match the empty string, so the phone
        # check never actually fails — tighten to fullmatch if real
        # validation is wanted.  TODO: add '+' to the valid characters.
        if self.ad_fields["telefono"]["found"] and not re.match(
            r"\s*\+?[0-9\s]*", self.ad_fields["telefono"]["value"]
        ):
            self.invalid_fields.append("telefono")

    def all_fields_are_valid(self):
        """Run validation; return True when no field was flagged invalid."""
        self._validate()
        return not self.invalid_fields

    def fields_missing(self):
        """Return True if any mandatory field was not found during parsing."""
        return any(
            not contents["optional"] and not contents["found"]
            for contents in self.ad_fields.values()
        )

    def get_data(self):
        """Return a flat ``{field_name: value}`` dict of all parsed fields."""
        return {name: field["value"] for name, field in self.ad_fields.items()}


if __name__ == "__main__":
    capturer = Capturer()
    capturer.start()