Typing, docstrings, formatting for capturer.py
This commit is contained in:
parent
3cf7dd8bd9
commit
43236c2884
1 changed files with 101 additions and 26 deletions
|
|
@ -15,23 +15,31 @@ import logging
|
|||
|
||||
|
||||
class Capturer:
|
||||
sleep_time_no_work = 15
|
||||
"""
|
||||
Daemon with the full flow of execution of individual ad requesting, data
|
||||
scraping and db storage.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
def __init__(self) -> None:
|
||||
self.last_try_datetime = datetime.datetime.now()
|
||||
|
||||
def start(self):
|
||||
def start(self) -> None:
|
||||
"""
|
||||
Full flow of execution. Checks whether it should capture a URL, tries
|
||||
to do so and stores the result if successful.
|
||||
:return: None
|
||||
"""
|
||||
|
||||
logging.info("Starting capturer")
|
||||
while True:
|
||||
|
||||
if not self.in_working_hours():
|
||||
if not self._in_working_hours():
|
||||
sleep(1800)
|
||||
logging.info("Waiting...")
|
||||
continue
|
||||
|
||||
seconds_to_next_capture = (
|
||||
minimum_seconds_between_tries() - self.seconds_since_last_try()
|
||||
minimum_seconds_between_tries() - self._seconds_since_last_try()
|
||||
)
|
||||
if seconds_to_next_capture > 0:
|
||||
sleep(seconds_to_next_capture)
|
||||
|
|
@ -39,7 +47,9 @@ class Capturer:
|
|||
|
||||
pending_task = capturing_interface.get_pending_task()
|
||||
if not pending_task:
|
||||
logging.info("No pending tasks.")
|
||||
continue
|
||||
|
||||
task = CapturingTask(pending_task)
|
||||
self.last_try_datetime = datetime.datetime.now()
|
||||
task.capture()
|
||||
|
|
@ -47,26 +57,46 @@ class Capturer:
|
|||
if task.status == "Data ready":
|
||||
ad_data = task.get_ad_data()
|
||||
else:
|
||||
logging.warning("Something went wrong, not adding data.")
|
||||
continue
|
||||
|
||||
capturas_interface.insert_captura(ad_data)
|
||||
task._update_status("Captura inserted")
|
||||
logging.info("New ad inserted.")
|
||||
|
||||
def in_working_hours(self):
|
||||
def _in_working_hours(self) -> bool:
|
||||
"""
|
||||
Checks whether now is within the working hours of the daemon.
|
||||
:return: True if so, false if not
|
||||
"""
|
||||
return (
|
||||
working_hours["start"]
|
||||
<= datetime.datetime.now().time()
|
||||
<= working_hours["end"]
|
||||
)
|
||||
|
||||
def seconds_since_last_try(self):
|
||||
def _seconds_since_last_try(self) -> float:
|
||||
"""
|
||||
Computes how many seconds have passed since the last capturing attempt
|
||||
:return: seconds since last try as a float
|
||||
"""
|
||||
return (datetime.datetime.now() - self.last_try_datetime).total_seconds()
|
||||
|
||||
|
||||
class CapturingTask:
|
||||
"""
|
||||
Task object wrapping the process of attempting to capture an ad, parsing
|
||||
the data and sending to db.
|
||||
"""
|
||||
|
||||
sleep_time_failed_request = 180
|
||||
|
||||
def __init__(self, parameters):
|
||||
def __init__(self, parameters) -> None:
|
||||
"""
|
||||
Initialize with task parameters and mark the task as being worked on
|
||||
in the task queue.
|
||||
:param parameters: dict with the necessary parameters for the task
|
||||
"""
|
||||
self.uuid = parameters["uuid"]
|
||||
self.ad_url = parameters["ad_url"]
|
||||
self.uuid_exploring = parameters["fk_uuid_exploring"]
|
||||
|
|
@ -76,15 +106,20 @@ class CapturingTask:
|
|||
|
||||
self._update_status("Loading")
|
||||
|
||||
def _update_status(self, new_status):
|
||||
def _update_status(self, new_status) -> None:
|
||||
"""
|
||||
Updates the task status and persists it in the task queue.
|
||||
:param new_status: string describing the new status
|
||||
:return: None
|
||||
"""
|
||||
self.status = new_status
|
||||
capturing_interface.update_capturing_task(
|
||||
self.uuid, self.uuid_exploring, self.status, self.ad_url
|
||||
)
|
||||
|
||||
def capture(self):
|
||||
def capture(self) -> None:
|
||||
"""
|
||||
Metodo principal que contiene el flujo de captura
|
||||
Main flow of work
|
||||
"""
|
||||
self._update_status("WIP")
|
||||
|
||||
|
|
@ -94,18 +129,19 @@ class CapturingTask:
|
|||
|
||||
if attack.success:
|
||||
self.html = attack.get_text()
|
||||
|
||||
self._extract_data()
|
||||
self._check_data()
|
||||
return
|
||||
|
||||
else:
|
||||
if not attack.success:
|
||||
try:
|
||||
if Refresher.dead_ad_checker(attack.get_text()):
|
||||
self._update_status("Dead ad")
|
||||
return
|
||||
except AttributeError:
|
||||
pass
|
||||
logging.error(
|
||||
"Something went wrong when checking if the ad is gone"
|
||||
)
|
||||
|
||||
self._update_status("Fail {}".format(self.request_failures))
|
||||
self.request_failures += 1
|
||||
|
|
@ -115,11 +151,20 @@ class CapturingTask:
|
|||
self._update_status("Surrender")
|
||||
logging.warning(f"A task has surrendered. {self.ad_url}")
|
||||
|
||||
def _extract_data(self):
|
||||
def _extract_data(self) -> None:
|
||||
"""
|
||||
Parses the obtained html to extract the ad information.
|
||||
:return: None
|
||||
"""
|
||||
self.parser = AdHtmlParser(self.html)
|
||||
self.parser.parse()
|
||||
|
||||
def _check_data(self):
|
||||
def _check_data(self) -> None:
|
||||
"""
|
||||
Validates that all compulsory fields have been obtained and that the
|
||||
values are within the expected. Sets the status of task accordingly.
|
||||
:return: None
|
||||
"""
|
||||
if self.parser.fields_missing():
|
||||
self._update_status("Fields missing")
|
||||
return
|
||||
|
|
@ -130,12 +175,24 @@ class CapturingTask:
|
|||
|
||||
self._update_status("Data ready")
|
||||
|
||||
def get_ad_data(self):
|
||||
def get_ad_data(self) -> dict:
|
||||
"""
|
||||
Returns the extracted data.
|
||||
:return: dictionary with the data of the ad.
|
||||
"""
|
||||
return self.parser.get_data()
|
||||
|
||||
|
||||
class AdHtmlParser:
|
||||
def __init__(self, html_string):
|
||||
"""
|
||||
Object for parsing, storing and validating the data of the HTML of an ad.
|
||||
"""
|
||||
|
||||
def __init__(self, html_string: str) -> None:
|
||||
"""
|
||||
Initializes an instance of the parser with the HTML of an ad.
|
||||
:param html_string: the full HTML code of the ad page
|
||||
"""
|
||||
self.html = html_string
|
||||
|
||||
self.ad_fields = {
|
||||
|
|
@ -156,9 +213,12 @@ class AdHtmlParser:
|
|||
"personal": {"found": False, "optional": False, "value": None},
|
||||
"telefono": {"found": False, "optional": True, "value": None},
|
||||
}
|
||||
# TODO add visit fields
|
||||
|
||||
def parse(self):
|
||||
def parse(self) -> None:
|
||||
"""
|
||||
Parses the HTML and stores the ad data.
|
||||
:return: None
|
||||
"""
|
||||
|
||||
soup = BeautifulSoup(self.html, "html5lib")
|
||||
|
||||
|
|
@ -272,9 +332,12 @@ class AdHtmlParser:
|
|||
).text.replace(" ", "")
|
||||
self.ad_fields["telefono"]["found"] = True
|
||||
|
||||
# TODO capture visit data
|
||||
|
||||
def _validate(self):
|
||||
def _validate(self) -> None:
|
||||
"""
|
||||
Checks whether the extracted values are valid against the expected
|
||||
typology. Stores the results.
|
||||
:return: None
|
||||
"""
|
||||
self.invalid_fields = []
|
||||
|
||||
if not re.match(r"[0-9]{4,20}", self.ad_fields["referencia"]["value"]):
|
||||
|
|
@ -306,20 +369,32 @@ class AdHtmlParser:
|
|||
self.invalid_fields.append("telefono")
|
||||
# TODO add + to the valid characters
|
||||
|
||||
def all_fields_are_valid(self):
|
||||
def all_fields_are_valid(self) -> bool:
|
||||
"""
|
||||
Reports on whether the extracted data is valid.
|
||||
:return: True if values are valid, false if not
|
||||
"""
|
||||
self._validate()
|
||||
if self.invalid_fields:
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
def fields_missing(self):
|
||||
def fields_missing(self) -> bool:
|
||||
"""
|
||||
Reports on whether all compulsory fields are present.
|
||||
:return: True if some field is missing, false if not
|
||||
"""
|
||||
for key, contents in self.ad_fields.items():
|
||||
if not contents["optional"] and not contents["found"]:
|
||||
return True
|
||||
return False
|
||||
|
||||
def get_data(self):
|
||||
def get_data(self) -> dict:
|
||||
"""
|
||||
Returns the extracted data in the form of a dictionary.
|
||||
:return: dictionary with the extracted data
|
||||
"""
|
||||
data = {}
|
||||
|
||||
for ad_field in self.ad_fields.keys():
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue