Typing, docstrings, formatting for capturer.py

This commit is contained in:
pablo 2020-11-03 13:50:36 +01:00
parent 3cf7dd8bd9
commit 43236c2884

View file

@ -15,23 +15,31 @@ import logging
class Capturer: class Capturer:
sleep_time_no_work = 15 """
Daemon with the full flow of execution of individual ad requesting, data
scraping and db storage.
"""
def __init__(self): def __init__(self) -> None:
self.last_try_datetime = datetime.datetime.now() self.last_try_datetime = datetime.datetime.now()
def start(self): def start(self) -> None:
"""
Full flow of execution. Checks whether it should capture a URL, tries
to do so and stores the result if successful.
:return: None
"""
logging.info("Starting capturer") logging.info("Starting capturer")
while True: while True:
if not self.in_working_hours(): if not self._in_working_hours():
sleep(1800) sleep(1800)
logging.info("Waiting...") logging.info("Waiting...")
continue continue
seconds_to_next_capture = ( seconds_to_next_capture = (
minimum_seconds_between_tries() - self.seconds_since_last_try() minimum_seconds_between_tries() - self._seconds_since_last_try()
) )
if seconds_to_next_capture > 0: if seconds_to_next_capture > 0:
sleep(seconds_to_next_capture) sleep(seconds_to_next_capture)
@ -39,7 +47,9 @@ class Capturer:
pending_task = capturing_interface.get_pending_task() pending_task = capturing_interface.get_pending_task()
if not pending_task: if not pending_task:
logging.info("No pending tasks.")
continue continue
task = CapturingTask(pending_task) task = CapturingTask(pending_task)
self.last_try_datetime = datetime.datetime.now() self.last_try_datetime = datetime.datetime.now()
task.capture() task.capture()
@ -47,26 +57,46 @@ class Capturer:
if task.status == "Data ready": if task.status == "Data ready":
ad_data = task.get_ad_data() ad_data = task.get_ad_data()
else: else:
logging.warning("Something went wrong, not adding data.")
continue continue
capturas_interface.insert_captura(ad_data) capturas_interface.insert_captura(ad_data)
task._update_status("Captura inserted") task._update_status("Captura inserted")
logging.info("New ad inserted.")
def _in_working_hours(self) -> bool:
    """
    Tells whether the current time, as given by datetime.datetime.now(),
    lies between working_hours["start"] and working_hours["end"], both
    limits included.
    :return: True if now is within the working hours, False if not
    """
    current_time = datetime.datetime.now().time()
    return working_hours["start"] <= current_time <= working_hours["end"]
def seconds_since_last_try(self): def _seconds_since_last_try(self) -> float:
"""
Computes how many seconds have passed since the last capturing attempt
:return: seconds since last try as integer
"""
return (datetime.datetime.now() - self.last_try_datetime).total_seconds() return (datetime.datetime.now() - self.last_try_datetime).total_seconds()
class CapturingTask: class CapturingTask:
"""
Task object wrapping the process of attempting to capture an ad, parsing
the data and sending to db.
"""
sleep_time_failed_request = 180 sleep_time_failed_request = 180
def __init__(self, parameters): def __init__(self, parameters) -> None:
"""
Initialize with task parameters and mark the task as being worked on
in the task queue.
:param parameters: dict with the necessary parameters for the task
"""
self.uuid = parameters["uuid"] self.uuid = parameters["uuid"]
self.ad_url = parameters["ad_url"] self.ad_url = parameters["ad_url"]
self.uuid_exploring = parameters["fk_uuid_exploring"] self.uuid_exploring = parameters["fk_uuid_exploring"]
@ -76,15 +106,20 @@ class CapturingTask:
self._update_status("Loading") self._update_status("Loading")
def _update_status(self, new_status) -> None:
    """
    Stores the new status on the task and forwards it, together with the
    task identifiers and the ad URL, to
    capturing_interface.update_capturing_task.
    :param new_status: string describing the new status
    :return: None
    """
    self.status = new_status
    task_fields = (self.uuid, self.uuid_exploring, self.status, self.ad_url)
    capturing_interface.update_capturing_task(*task_fields)
def capture(self): def capture(self) -> None:
""" """
Metodo principal que contiene el flujo de captura Main flow of work
""" """
self._update_status("WIP") self._update_status("WIP")
@ -94,18 +129,19 @@ class CapturingTask:
if attack.success: if attack.success:
self.html = attack.get_text() self.html = attack.get_text()
self._extract_data() self._extract_data()
self._check_data() self._check_data()
return return
else: if not attack.success:
try: try:
if Refresher.dead_ad_checker(attack.get_text()): if Refresher.dead_ad_checker(attack.get_text()):
self._update_status("Dead ad") self._update_status("Dead ad")
return return
except AttributeError: except AttributeError:
pass logging.error(
"Something went wrong when checking if the ad is gone"
)
self._update_status("Fail {}".format(self.request_failures)) self._update_status("Fail {}".format(self.request_failures))
self.request_failures += 1 self.request_failures += 1
@ -115,11 +151,20 @@ class CapturingTask:
self._update_status("Surrender") self._update_status("Surrender")
logging.warning(f"A task has surrendered. {self.ad_url}") logging.warning(f"A task has surrendered. {self.ad_url}")
def _extract_data(self) -> None:
    """
    Builds an AdHtmlParser from the stored html, keeps it in self.parser
    and runs its parsing step.
    :return: None
    """
    # The parser is attached to the task before parsing starts, so it stays
    # available on the instance even if parse() fails.
    html_parser = self.parser = AdHtmlParser(self.html)
    html_parser.parse()
def _check_data(self): def _check_data(self) -> None:
"""
Validates that all compulsory fields have been obtained and that the
values are within the expected. Sets the status of task accordingly.
:return: None
"""
if self.parser.fields_missing(): if self.parser.fields_missing():
self._update_status("Fields missing") self._update_status("Fields missing")
return return
@ -130,12 +175,24 @@ class CapturingTask:
self._update_status("Data ready") self._update_status("Data ready")
def get_ad_data(self) -> dict:
    """
    Hands back whatever the task's parser reports through get_data().
    :return: dictionary with the data of the ad
    """
    ad_data = self.parser.get_data()
    return ad_data
class AdHtmlParser: class AdHtmlParser:
def __init__(self, html_string): """
Object for parsing, storing and validating the data of the HTML of an ad.
"""
def __init__(self, html_string: str) -> None:
"""
Initializes an instance of the parser with the HTML of an ad.
:param html_string: the full HTML code of the ad page
"""
self.html = html_string self.html = html_string
self.ad_fields = { self.ad_fields = {
@ -156,9 +213,12 @@ class AdHtmlParser:
"personal": {"found": False, "optional": False, "value": None}, "personal": {"found": False, "optional": False, "value": None},
"telefono": {"found": False, "optional": True, "value": None}, "telefono": {"found": False, "optional": True, "value": None},
} }
# TODO añadir campos de visitas
def parse(self): def parse(self) -> None:
"""
Parses the HTML and stores the ad data.
:return: None
"""
soup = BeautifulSoup(self.html, "html5lib") soup = BeautifulSoup(self.html, "html5lib")
@ -272,9 +332,12 @@ class AdHtmlParser:
).text.replace(" ", "") ).text.replace(" ", "")
self.ad_fields["telefono"]["found"] = True self.ad_fields["telefono"]["found"] = True
# TODO capturar datos de visitas def _validate(self) -> None:
"""
def _validate(self): Checks whether the extracted values are valid against the expected
typology. Stores the results.
:return: None
"""
self.invalid_fields = [] self.invalid_fields = []
if not re.match(r"[0-9]{4,20}", self.ad_fields["referencia"]["value"]): if not re.match(r"[0-9]{4,20}", self.ad_fields["referencia"]["value"]):
@ -306,20 +369,32 @@ class AdHtmlParser:
self.invalid_fields.append("telefono") self.invalid_fields.append("telefono")
# TODO añadir + a caracteres validos # TODO añadir + a caracteres validos
def all_fields_are_valid(self) -> bool:
    """
    Runs the validation and reports on whether the extracted data is valid.
    :return: True if no field was flagged as invalid, False if not
    """
    # _validate() rebuilds self.invalid_fields, so it has to run before the
    # list is inspected.
    self._validate()
    return not self.invalid_fields
def fields_missing(self) -> bool:
    """
    Reports on whether any compulsory field is still missing, that is, a
    field in self.ad_fields that is neither optional nor found.
    :return: True if some compulsory field is missing, False if not
    """
    return any(
        not contents["optional"] and not contents["found"]
        for contents in self.ad_fields.values()
    )
def get_data(self): def get_data(self) -> dict:
"""
Returns the extracted data in the form of a dictionary.
:return: dictionary with the extracted data
"""
data = {} data = {}
for ad_field in self.ad_fields.keys(): for ad_field in self.ad_fields.keys():