From 43236c2884770f80a60a3703a2783ce4354b3f7e Mon Sep 17 00:00:00 2001 From: pablo Date: Tue, 3 Nov 2020 13:50:36 +0100 Subject: [PATCH] Typing, docstrings, formatting for capturer.py --- capturer/capturer.py | 127 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 101 insertions(+), 26 deletions(-) diff --git a/capturer/capturer.py b/capturer/capturer.py index 455eb15..d8a1bff 100644 --- a/capturer/capturer.py +++ b/capturer/capturer.py @@ -15,23 +15,31 @@ import logging class Capturer: - sleep_time_no_work = 15 + """ + Daemon with the full flow of execution of individual ad requesting, data + scraping and db storage. + """ - def __init__(self): + def __init__(self) -> None: self.last_try_datetime = datetime.datetime.now() - def start(self): + def start(self) -> None: + """ + Full flow of execution. Checks whether it should capture a URL, tries + to do so and stores the result if successful. + :return: None + """ logging.info("Starting capturer") while True: - if not self.in_working_hours(): + if not self._in_working_hours(): sleep(1800) logging.info("Waiting...") continue seconds_to_next_capture = ( - minimum_seconds_between_tries() - self.seconds_since_last_try() + minimum_seconds_between_tries() - self._seconds_since_last_try() ) if seconds_to_next_capture > 0: sleep(seconds_to_next_capture) @@ -39,7 +47,9 @@ class Capturer: pending_task = capturing_interface.get_pending_task() if not pending_task: + logging.info("No pending tasks.") continue + task = CapturingTask(pending_task) self.last_try_datetime = datetime.datetime.now() task.capture() @@ -47,26 +57,46 @@ class Capturer: if task.status == "Data ready": ad_data = task.get_ad_data() else: + logging.warning("Something went wrong, not adding data.") continue capturas_interface.insert_captura(ad_data) task._update_status("Captura inserted") + logging.info("New ad inserted.") - def in_working_hours(self): + def _in_working_hours(self) -> bool: + """ + Checks whether now is within the working hours of 
the daemon. + :return: True if so, false if not + """ return ( working_hours["start"] <= datetime.datetime.now().time() <= working_hours["end"] ) - def seconds_since_last_try(self): + def _seconds_since_last_try(self) -> float: + """ + Computes how many seconds have passed since the last capturing attempt + :return: seconds since last try as a float + """ return (datetime.datetime.now() - self.last_try_datetime).total_seconds() class CapturingTask: + """ + Task object wrapping the process of attempting to capture an ad, parsing + the data and sending to db. + """ + sleep_time_failed_request = 180 - def __init__(self, parameters): + def __init__(self, parameters) -> None: + """ + Initialize with task parameters and mark the task as being worked on + in the task queue. + :param parameters: dict with the necessary parameters for the task + """ self.uuid = parameters["uuid"] self.ad_url = parameters["ad_url"] self.uuid_exploring = parameters["fk_uuid_exploring"] @@ -76,15 +106,20 @@ class CapturingTask: self._update_status("Loading") - def _update_status(self, new_status): + def _update_status(self, new_status) -> None: + """ + Updates the task status and persists it in the task queue. 
+ :param new_status: string describing the new status + :return: None + """ self.status = new_status capturing_interface.update_capturing_task( self.uuid, self.uuid_exploring, self.status, self.ad_url ) - def capture(self): + def capture(self) -> None: """ - Metodo principal que contiene el flujo de captura + Main flow of work """ self._update_status("WIP") @@ -94,18 +129,19 @@ class CapturingTask: if attack.success: self.html = attack.get_text() - self._extract_data() self._check_data() return - else: + if not attack.success: try: if Refresher.dead_ad_checker(attack.get_text()): self._update_status("Dead ad") return except AttributeError: - pass + logging.error( + "Something went wrong when checking if the ad is gone" + ) self._update_status("Fail {}".format(self.request_failures)) self.request_failures += 1 @@ -115,11 +151,20 @@ class CapturingTask: self._update_status("Surrender") logging.warning(f"A task has surrendered. {self.ad_url}") - def _extract_data(self): + def _extract_data(self) -> None: + """ + Parses the obtained html to extract the ad information. + :return: None + """ self.parser = AdHtmlParser(self.html) self.parser.parse() - def _check_data(self): + def _check_data(self) -> None: + """ + Validates that all compulsory fields have been obtained and that the + values are as expected. Sets the status of the task accordingly. + :return: None + """ if self.parser.fields_missing(): self._update_status("Fields missing") return @@ -130,12 +175,24 @@ class CapturingTask: self._update_status("Data ready") - def get_ad_data(self): + def get_ad_data(self) -> dict: + """ + Returns the extracted data. + :return: dictionary with the data of the ad. + """ return self.parser.get_data() class AdHtmlParser: - def __init__(self, html_string): + """ + Object for parsing, storing and validating the data of the HTML of an ad. + """ + + def __init__(self, html_string: str) -> None: + """ + Initializes an instance of the parser with the HTML of an ad. 
+ :param html_string: the full HTML code of the ad page + """ self.html = html_string self.ad_fields = { @@ -156,9 +213,12 @@ class AdHtmlParser: "personal": {"found": False, "optional": False, "value": None}, "telefono": {"found": False, "optional": True, "value": None}, } - # TODO añadir campos de visitas - def parse(self): + def parse(self) -> None: + """ + Parses the HTML and stores the ad data. + :return: None + """ soup = BeautifulSoup(self.html, "html5lib") @@ -272,9 +332,12 @@ class AdHtmlParser: ).text.replace(" ", "") self.ad_fields["telefono"]["found"] = True - # TODO capturar datos de visitas - - def _validate(self): + def _validate(self) -> None: + """ + Checks whether the extracted values are valid against the expected + typology. Stores the results. + :return: None + """ self.invalid_fields = [] if not re.match(r"[0-9]{4,20}", self.ad_fields["referencia"]["value"]): @@ -306,20 +369,32 @@ class AdHtmlParser: self.invalid_fields.append("telefono") # TODO añadir + a caracteres validos - def all_fields_are_valid(self): + def all_fields_are_valid(self) -> bool: + """ + Reports on whether the extracted data is valid. + :return: True if values are valid, false if not + """ self._validate() if self.invalid_fields: return False else: return True - def fields_missing(self): + def fields_missing(self) -> bool: + """ + Reports on whether any compulsory field is missing. + :return: True if some field is missing, false if not + """ for key, contents in self.ad_fields.items(): if not contents["optional"] and not contents["found"]: return True return False - def get_data(self): + def get_data(self) -> dict: + """ + Returns the extracted data in the form of a dictionary. + :return: dictionary with the extracted data + """ data = {} for ad_field in self.ad_fields.keys():