Typing, docstrings, formatting for capturer.py

This commit is contained in:
pablo 2020-11-03 13:50:36 +01:00
parent 3cf7dd8bd9
commit 43236c2884

View file

@ -15,23 +15,31 @@ import logging
class Capturer:
sleep_time_no_work = 15
"""
Daemon with the full flow of execution of individual ad requesting, data
scraping and db storage.
"""
def __init__(self):
def __init__(self) -> None:
self.last_try_datetime = datetime.datetime.now()
def start(self):
def start(self) -> None:
"""
Full flow of execution. Checks whether it should capture a URL, tries
to do so and stores the result if successful.
:return: None
"""
logging.info("Starting capturer")
while True:
if not self.in_working_hours():
if not self._in_working_hours():
sleep(1800)
logging.info("Waiting...")
continue
seconds_to_next_capture = (
minimum_seconds_between_tries() - self.seconds_since_last_try()
minimum_seconds_between_tries() - self._seconds_since_last_try()
)
if seconds_to_next_capture > 0:
sleep(seconds_to_next_capture)
@ -39,7 +47,9 @@ class Capturer:
pending_task = capturing_interface.get_pending_task()
if not pending_task:
logging.info("No pending tasks.")
continue
task = CapturingTask(pending_task)
self.last_try_datetime = datetime.datetime.now()
task.capture()
@ -47,26 +57,46 @@ class Capturer:
if task.status == "Data ready":
ad_data = task.get_ad_data()
else:
logging.warning("Something went wrong, not adding data.")
continue
capturas_interface.insert_captura(ad_data)
task._update_status("Captura inserted")
logging.info("New ad inserted.")
def _in_working_hours(self) -> bool:
    """
    Checks whether now is within the working hours of the daemon.

    The current wall-clock time (naive, as returned by
    ``datetime.datetime.now()``) is compared against the ``"start"`` and
    ``"end"`` entries of ``working_hours``. Both bounds are inclusive.

    :return: True if so, False if not
    """
    # NOTE(review): working_hours is defined outside this block; its values
    # are presumably datetime.time objects -- confirm against the config.
    return (
        working_hours["start"]
        <= datetime.datetime.now().time()
        <= working_hours["end"]
    )
def seconds_since_last_try(self):
def _seconds_since_last_try(self) -> float:
"""
Computes how many seconds have passed since the last capturing attempt
:return: seconds since last try as integer
"""
return (datetime.datetime.now() - self.last_try_datetime).total_seconds()
class CapturingTask:
"""
Task object wrapping the process of attempting to capture an ad, parsing
the data and sending to db.
"""
sleep_time_failed_request = 180
def __init__(self, parameters):
def __init__(self, parameters) -> None:
"""
Initialize with task parameters and mark the task as being worked on
in the task queue.
:param parameters: dict with the necessary parameters for the task
"""
self.uuid = parameters["uuid"]
self.ad_url = parameters["ad_url"]
self.uuid_exploring = parameters["fk_uuid_exploring"]
@ -76,15 +106,20 @@ class CapturingTask:
self._update_status("Loading")
def _update_status(self, new_status: str) -> None:
    """
    Updates the task status and persists it in the task queue.

    The new status is stored on the instance first and then sent, together
    with the task identifiers and the ad URL, to
    ``capturing_interface.update_capturing_task``.

    :param new_status: string describing the new status
    :return: None
    """
    self.status = new_status
    capturing_interface.update_capturing_task(
        self.uuid, self.uuid_exploring, self.status, self.ad_url
    )
def capture(self):
def capture(self) -> None:
"""
Metodo principal que contiene el flujo de captura
Main flow of work
"""
self._update_status("WIP")
@ -94,18 +129,19 @@ class CapturingTask:
if attack.success:
self.html = attack.get_text()
self._extract_data()
self._check_data()
return
else:
if not attack.success:
try:
if Refresher.dead_ad_checker(attack.get_text()):
self._update_status("Dead ad")
return
except AttributeError:
pass
logging.error(
"Something went wrong when checking if the ad is gone"
)
self._update_status("Fail {}".format(self.request_failures))
self.request_failures += 1
@ -115,11 +151,20 @@ class CapturingTask:
self._update_status("Surrender")
logging.warning(f"A task has surrendered. {self.ad_url}")
def _extract_data(self) -> None:
    """
    Parses the obtained html to extract the ad information.

    A new ``AdHtmlParser`` is built from ``self.html`` and kept in
    ``self.parser`` so later steps can query the parsed fields.

    :return: None
    """
    # Assign before parsing so self.parser exists even if parse() raises.
    self.parser = AdHtmlParser(self.html)
    self.parser.parse()
def _check_data(self):
def _check_data(self) -> None:
"""
Validates that all compulsory fields have been obtained and that the
values are within the expected. Sets the status of task accordingly.
:return: None
"""
if self.parser.fields_missing():
self._update_status("Fields missing")
return
@ -130,12 +175,24 @@ class CapturingTask:
self._update_status("Data ready")
def get_ad_data(self) -> dict:
    """
    Returns the extracted data.

    Delegates to ``self.parser.get_data()``, so ``self.parser`` must
    already have been set before this is called.

    :return: dictionary with the data of the ad.
    """
    return self.parser.get_data()
class AdHtmlParser:
def __init__(self, html_string):
"""
Object for parsing, storing and validating the data of the HTML of an ad.
"""
def __init__(self, html_string: str) -> None:
"""
Initializes an instance of the parser with the HTML of an ad.
:param html_string: the full HTML code of the ad page
"""
self.html = html_string
self.ad_fields = {
@ -156,9 +213,12 @@ class AdHtmlParser:
"personal": {"found": False, "optional": False, "value": None},
"telefono": {"found": False, "optional": True, "value": None},
}
# TODO: add fields for visits data
def parse(self):
def parse(self) -> None:
"""
Parses the HTML and stores the ad data.
:return: None
"""
soup = BeautifulSoup(self.html, "html5lib")
@ -272,9 +332,12 @@ class AdHtmlParser:
).text.replace(" ", "")
self.ad_fields["telefono"]["found"] = True
# TODO: capture visits data
def _validate(self):
def _validate(self) -> None:
"""
Checks whether the extracted values are valid against the expected
typology. Stores the results.
:return: None
"""
self.invalid_fields = []
if not re.match(r"[0-9]{4,20}", self.ad_fields["referencia"]["value"]):
@ -306,20 +369,32 @@ class AdHtmlParser:
self.invalid_fields.append("telefono")
# TODO: add + to the valid characters
def all_fields_are_valid(self) -> bool:
    """
    Reports on whether the extracted data is valid.

    Runs ``self._validate()`` first so that ``self.invalid_fields`` reflects
    the current field values, then checks that it is empty.

    :return: True if values are valid, False if not
    """
    self._validate()
    return not self.invalid_fields
def fields_missing(self) -> bool:
    """
    Reports on whether all compulsory fields are present.

    A field counts as missing when its entry in ``self.ad_fields`` is
    marked as neither optional nor found.

    :return: True if some field is missing, False if not
    """
    return any(
        not contents["optional"] and not contents["found"]
        for contents in self.ad_fields.values()
    )
def get_data(self):
def get_data(self) -> dict:
"""
Returns the extracted data in the form of a dictionary.
:return: dictionary with the extracted data
"""
data = {}
for ad_field in self.ad_fields.keys():