Formatting, docstrings and other chores.

pablo 2021-01-04 22:17:40 +01:00
parent adf2cd26ba
commit cbf1643fb5
3 changed files with 32 additions and 255 deletions

View file

@@ -1,5 +1,3 @@
-import sys
 from time import sleep
 import datetime
@@ -33,6 +31,19 @@ class Capturer:
         url_acquisition_object: Type[UrlAttack],
         dead_ad_checker: Callable,
     ) -> None:
+        """
+        Receive all required objects.
+        :param throttling_manager: takes care of deciding whether a task should
+        be started
+        :param capturing_tasks_interface: interface to interact with the tasks
+        database
+        :param capturas_interface: interface to interact with the ad database
+        :param parsing_flow_generator: an object capable of generating empty
+        parsing flows to give each task a new one
+        :param url_acquisition_object: gateway to obtaining the HTML of a URL
+        :param dead_ad_checker: callable capable of checking if an ad is dead
+        from its HTML
+        """
         self._throttling_manager = throttling_manager
         self._capturing_tasks_interface = capturing_tasks_interface
         self._capturas_interface = capturas_interface
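
The new docstring makes the dependency injection explicit. Below is a minimal, hypothetical wiring sketch: only the parameter names and the Type[UrlAttack] annotation come from the diff, every Stub* class is an invented stand-in for a real project class this commit does not show, and the sketch assumes the module defining Capturer is importable.

class StubThrottlingManager:
    def task_can_start(self) -> bool:  # assumed responsibility: pacing task starts
        return True

class StubTasksInterface: ...      # stands in for the task-queue gateway
class StubCapturasInterface: ...   # stands in for the ad-database gateway
class StubFlowGenerator: ...       # stands in for the parsing-flow factory
class StubUrlAttack: ...           # stands in for UrlAttack

def stub_dead_ad_checker(html: str) -> bool:
    return "anuncio no disponible" in html  # hypothetical dead-ad marker

capturer = Capturer(
    throttling_manager=StubThrottlingManager(),
    capturing_tasks_interface=StubTasksInterface(),
    capturas_interface=StubCapturasInterface(),
    parsing_flow_generator=StubFlowGenerator(),
    url_acquisition_object=StubUrlAttack,  # a class, matching Type[UrlAttack]
    dead_ad_checker=stub_dead_ad_checker,  # any Callable is accepted
)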
@@ -103,6 +114,10 @@ class CapturingTask:
         Initialize with task parameters and mark the task as being worked on
         in the task queue.
         :param task_parameters: dict with the necessary parameters for the task
+        :param capturing_interface: interface to interact with the ad database
+        :param new_parsing_flow: an empty parsing flow
+        :param url_acquisition_object: gateway to obtaining the HTML of a URL
+        :param dead_ad_checker: callable capable of checking if an ad is dead
         """
         self.uuid = task_parameters["uuid"]
         self.ad_url = task_parameters["ad_url"]
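
The four new :param entries complete the docstring, but task_parameters itself is only visible through the two keys read above. A hedged sketch of its shape; both values are invented for illustration, and the real queue may supply further keys not shown in this hunk.

task_parameters = {
    "uuid": "3f2b9c1e-0000-0000-0000-000000000000",  # hypothetical task id
    "ad_url": "https://example.com/ad/123456",       # hypothetical ad URL
}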
@@ -160,30 +175,6 @@ class CapturingTask:
         self.update_status("Surrender")
         logging.warning(f"A task has surrendered. {self.ad_url}")

-    def _extract_data(self) -> None:
-        """
-        Parses the obtained html to extract the ad information.
-        :return: None
-        """
-        self.parser = AdHtmlParser(self.html)
-        self.parser.parse()
-
-    def _check_data(self) -> None:
-        """
-        Validates that all compulsory fields have been obtained and that the
-        values are within the expected. Sets the status of task accordingly.
-        :return: None
-        """
-        if self.parser.fields_missing():
-            self.update_status("Fields missing")
-            return
-        if not self.parser.all_fields_are_valid():
-            self.update_status("Invalid value fields")
-            return
-        self.update_status("Data ready")
-
     def get_ad_data(self) -> dict:
         """
         Returns the extracted data.
@@ -192,6 +183,12 @@ class CapturingTask:
         return self._parsing_flow.field_values

     def _parse_html(self, html: str) -> None:
+        """
+        Execute the complete parsing flow and report the task status depending
+        on the outcome.
+        :param html: the HTML of the ad
+        :return: None
+        """
         self._parsing_flow.execute_flow(soup=BeautifulSoup(html, "html5lib"))
         if not self._parsing_flow.issues:
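
The flow is fed a tree built with html5lib. A self-contained illustration of that parser choice follows; the rationale is an assumption on my part, since the commit does not state why html5lib is used.

from bs4 import BeautifulSoup

# html5lib builds the same tree a browser would (lenient, spec-compliant),
# at the cost of speed; "html.parser" and "lxml" are faster but stricter.
soup = BeautifulSoup("<strong class='price'>35.000 &euro;</strong>", "html5lib")
print(soup.find("strong", {"class": "price"}).text)  # -> 35.000 €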
@@ -211,226 +208,6 @@ class CapturingTask:
             return

-
-class AdHtmlParser:
-    """
-    Object for parsing, storing and validating the data of the HTML of an ad.
-    """
-
-    def __init__(self, html_string: str) -> None:
-        """
-        Initializes an instance of the parser with the HTML of an ad.
-        :param html_string: the full HTML code of the ad page
-        """
-        self.html = html_string
-        self.ad_fields = {
-            "referencia": {"found": False, "optional": False, "value": None},
-            "precio": {"found": False, "optional": False, "value": None},
-            "tamano_categorico": {"found": False, "optional": True, "value": None},
-            "m2": {"found": False, "optional": True, "value": None},
-            "tipo_anuncio": {"found": False, "optional": False, "value": None},
-            "calle": {"found": False, "optional": True, "value": None},
-            "barrio": {"found": False, "optional": False, "value": None},
-            "distrito": {"found": False, "optional": False, "value": None},
-            "ciudad": {"found": False, "optional": False, "value": None},
-            "cubierta": {"found": False, "optional": False, "value": None},
-            "puerta_auto": {"found": False, "optional": False, "value": None},
-            "ascensor": {"found": False, "optional": False, "value": None},
-            "alarma": {"found": False, "optional": False, "value": None},
-            "circuito": {"found": False, "optional": False, "value": None},
-            "personal": {"found": False, "optional": False, "value": None},
-            "telefono": {"found": False, "optional": True, "value": None},
-        }
-
-    def parse(self) -> None:
-        """
-        Parses the HTML and stores the ad data.
-        :return: None
-        """
-        soup = BeautifulSoup(self.html, "html5lib")
-        if soup.find_all("link", {"rel": "canonical"}) is not None:
-            self.ad_fields["referencia"]["value"] = re.findall(
-                r"[0-9]{5,20}", str(soup.find_all("link", {"rel": "canonical"})[0])
-            )[0]
-            self.ad_fields["referencia"]["found"] = True
-        if soup.find_all("strong", {"class": "price"}) is not None:
-            self.ad_fields["precio"]["value"] = "".join(
-                re.findall(
-                    r"[0-9]", str(soup.find_all("strong", {"class": "price"})[0])
-                )
-            )
-            self.ad_fields["precio"]["found"] = True
-        if soup.find("div", {"class": "info-features"}) is not None:
-            try:
-                if (
-                    ""
-                    not in soup.find("div", {"class": "info-features"})
-                    .find("span")
-                    .find("span")
-                    .text
-                ):
-                    self.ad_fields["tamano_categorico"]["value"] = (
-                        soup.find("div", {"class": "info-features"})
-                        .find("span")
-                        .find("span")
-                        .text
-                    )
-                    self.ad_fields["tamano_categorico"]["found"] = True
-            except:
-                pass
-            posible_m2 = [
-                tag.text
-                for tag in soup.find("div", {"class": "info-features"}).find_all("span")
-            ]
-            if [posible for posible in posible_m2 if "" in posible]:
-                self.ad_fields["m2"]["value"] = [
-                    "".join(re.findall(r"[0-9]+,*[0-9]*", posible))
-                    for posible in posible_m2
-                    if "" in posible
-                ][0].replace(",", ".")
-                self.ad_fields["m2"]["found"] = True
-        if soup.find("title") is not None:
-            if "venta" in soup.find("title").text:
-                self.ad_fields["tipo_anuncio"]["value"] = 1
-            else:
-                self.ad_fields["tipo_anuncio"]["value"] = 2
-            self.ad_fields["tipo_anuncio"]["found"] = True
-        if len(soup.find("div", {"id": "headerMap"}).find_all("li")) > 3:
-            self.ad_fields["calle"]["value"] = ""
-            self.ad_fields["ciudad"]["value"] = (
-                soup.find("div", {"id": "headerMap"}).find_all("li")[-2].text.strip()
-            )
-            self.ad_fields["ciudad"]["found"] = True
-            self.ad_fields["distrito"]["value"] = (
-                soup.find("div", {"id": "headerMap"}).find_all("li")[-3].text.strip()
-            )
-            self.ad_fields["distrito"]["found"] = True
-            self.ad_fields["barrio"]["value"] = (
-                soup.find("div", {"id": "headerMap"}).find_all("li")[-4].text.strip()
-            )
-            self.ad_fields["barrio"]["found"] = True
-        if len(soup.find("div", {"id": "headerMap"}).find_all("li")) > 4:
-            self.ad_fields["calle"]["value"] = (
-                soup.find("div", {"id": "headerMap"}).find_all("li")[0].text.strip()
-            )
-            self.ad_fields["calle"]["found"] = True
-        features_lists = soup.find_all("div", {"class": "details-property_features"})
-        features = [
-            feature.text
-            for feature_list in features_lists
-            for feature in feature_list.find_all("li")
-        ]
-        self.ad_fields["cubierta"]["value"] = 1 * any(
-            "Cubierta" in feature for feature in features
-        )
-        self.ad_fields["puerta_auto"]["value"] = 1 * any(
-            "Puerta" in feature for feature in features
-        )
-        self.ad_fields["ascensor"]["value"] = 1 * any(
-            "ascensor" in feature for feature in features
-        )
-        self.ad_fields["alarma"]["value"] = 1 * any(
-            "Alarma" in feature for feature in features
-        )
-        self.ad_fields["circuito"]["value"] = 1 * any(
-            "Cámaras" in feature for feature in features
-        )
-        self.ad_fields["personal"]["value"] = 1 * any(
-            "Personal" in feature for feature in features
-        )
-        self.ad_fields["cubierta"]["found"] = True
-        self.ad_fields["puerta_auto"]["found"] = True
-        self.ad_fields["ascensor"]["found"] = True
-        self.ad_fields["alarma"]["found"] = True
-        self.ad_fields["circuito"]["found"] = True
-        self.ad_fields["personal"]["found"] = True
-        if soup.find("p", {"class": "txt-bold _browserPhone icon-phone"}) is not None:
-            self.ad_fields["telefono"]["value"] = soup.find(
-                "p", {"class": "txt-bold _browserPhone icon-phone"}
-            ).text.replace(" ", "")
-            self.ad_fields["telefono"]["found"] = True
-
-    def _validate(self) -> None:
-        """
-        Checks whether the extracted values are valid against the expected
-        typology. Stores the results.
-        :return: None
-        """
-        self.invalid_fields = []
-        if not re.match(r"[0-9]{4,20}", self.ad_fields["referencia"]["value"]):
-            self.invalid_fields.append("referencia")
-        if not re.match(r"[0-9]{1,20}", self.ad_fields["precio"]["value"]):
-            self.invalid_fields.append("precio")
-        possible_values_tamano = [
-            "2 coches o más",
-            "coche y moto",
-            "coche grande",
-            "coche pequeño",
-            "moto",
-            None,
-        ]
-        if self.ad_fields["tamano_categorico"]["value"] not in possible_values_tamano:
-            self.invalid_fields.append("tamano_categorico")
-        if not "Barrio" in self.ad_fields["barrio"]["value"]:
-            self.invalid_fields.append("barrio")
-        if not "Distrito" in self.ad_fields["distrito"]["value"]:
-            self.invalid_fields.append("distrito")
-        if self.ad_fields["telefono"]["found"] and not re.match(
-            r"\s*\+?[0-9\s]*", self.ad_fields["telefono"]["value"]
-        ):
-            self.invalid_fields.append("telefono")
-        # TODO add + to the valid characters
-
-    def all_fields_are_valid(self) -> bool:
-        """
-        Reports on whether the extracted data is valid.
-        :return: True if values are valid, false if not
-        """
-        self._validate()
-        if self.invalid_fields:
-            return False
-        else:
-            return True
-
-    def fields_missing(self) -> bool:
-        """
-        Reports on whether all compulsory fields are present.
-        :return: True if some field is missing, false if not
-        """
-        for key, contents in self.ad_fields.items():
-            if not contents["optional"] and not contents["found"]:
-                return True
-        return False
-
-    def get_data(self) -> dict:
-        """
-        Returns the extracted data in the form of a dictionary.
-        :return: dictionary with the extracted data
-        """
-        data = {}
-        for ad_field in self.ad_fields.keys():
-            data[ad_field] = self.ad_fields[ad_field]["value"]
-        return data
 if __name__ == "__main__":
     capturing_tasks_interface = CapturingTasksInterface()
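
The bulk of the deletions is AdHtmlParser, whose parse/validate/report-missing responsibilities have evidently moved to per-field parsing-flow instructions; the tests in the third file exercise that replacement. A minimal sketch of the instruction interface those tests imply: the attribute and method names (found, valid, value, search_issue, is_optional, validate) are taken from the tests below, everything else is an assumption.

import re
from typing import Optional

class FieldInstructions:
    def __init__(self, pattern: str, is_optional: bool = False) -> None:
        self.pattern = pattern
        self.is_optional = is_optional
        self.found = False
        self.valid: Optional[bool] = None
        self.value: Optional[str] = None
        self.search_issue: Optional[str] = None

    def search(self, text: str) -> None:
        match = re.search(self.pattern, text)
        if match:
            self.found = True
            self.value = match.group()
        else:
            self.search_issue = "pattern not found"

    def validate(self) -> None:
        # Validity stays None until something was actually extracted,
        # matching the found-is-False branch asserted in the tests below.
        if self.found:
            self.valid = bool(self.value)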

View file

@@ -92,7 +92,7 @@ class UrlAttack:
             if self.response.ok:
                 self.success = True
-        except Exception as e:
+        except Exception:
             self.success = False
         if (
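
The only change in this file drops the unused exception binding, which linters flag (flake8 F841). A standalone illustration of the pattern; requests is assumed from the response.ok attribute in context, and the url parameter and timeout are invented.

import requests

def fetch_ok(url: str) -> bool:
    try:
        return requests.get(url, timeout=10).ok
    except Exception:  # no "as e": the exception object is never inspected
        return False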

View file

@@ -2518,8 +2518,8 @@ def test_referencia_instructions_extract_correctly(real_ad_html):
     referencia_instructions.validate()
     assert (
-        referencia_instructions.found == True
-        and referencia_instructions.valid == True
+        referencia_instructions.found is True
+        and referencia_instructions.valid is True
         and referencia_instructions.value is not None
         and referencia_instructions.search_issue is None
     )
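
The switch from == to is tightens the assertions: equality passes for anything that compares equal to True, identity only for the bool singleton itself. A self-contained demonstration:

flag = 1
assert flag == True      # passes: in Python, the int 1 equals True
assert flag is not True  # also passes: flag is the int 1, not the bool True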
@@ -2534,7 +2534,7 @@ def test_referencia_instructions_find_nothing_in_unrelated_html(unrelated_html):
     referencia_instructions.validate()
     assert (
-        referencia_instructions.found == False
+        referencia_instructions.found is False
         and referencia_instructions.valid is None
         and referencia_instructions.value is None
         and referencia_instructions.search_issue is not None
@@ -2580,8 +2580,8 @@ def test_all_instructions_extract_correctly(real_ad_html):
     assert all(
         [
-            instruction.found == True
-            and instruction.valid == True
+            instruction.found is True
+            and instruction.valid is True
             and instruction.value is not None
             and instruction.search_issue is None
             for instruction in all_instructions
@@ -2628,8 +2628,8 @@ def test_all_instructions_fail_on_unrelated_html(unrelated_html):
     assert all(
         [
-            instruction.found == False
-            and (instruction.valid == False or instruction.valid == None)
+            instruction.found is False
+            and (instruction.valid is False or instruction.valid is None)
             and instruction.value is None
             for instruction in all_instructions
         ]
@@ -2725,7 +2725,7 @@ def test_parsing_flow_fails_for_unrelated_html(unrelated_html):
     assert not parsing_flow.all_non_optional_fields_were_found and len(
         parsing_flow.issues
-    ) == len(all_instructions)
+    ) == len([field for field in all_instructions if not field.is_optional])

 def test_parsing_flow_generator_returns_proper_flows():
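
The reworked expectation counts only non-optional fields as issues. A self-contained illustration of the new arithmetic; Instr and the three-element list are invented stand-ins for the test fixtures.

class Instr:
    def __init__(self, is_optional: bool) -> None:
        self.is_optional = is_optional

all_instructions = [Instr(False), Instr(True), Instr(False)]
expected = len([field for field in all_instructions if not field.is_optional])
assert expected == 2  # the optional field no longer contributes an issue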