Formatting, docstrings and other chores.

This commit is contained in:
pablo 2021-01-04 22:17:40 +01:00
parent adf2cd26ba
commit cbf1643fb5
3 changed files with 32 additions and 255 deletions

View file

@ -1,5 +1,3 @@
import sys
from time import sleep
import datetime
@ -33,6 +31,19 @@ class Capturer:
url_acquisition_object: Type[UrlAttack],
dead_ad_checker: Callable,
) -> None:
"""
Receive all required objects.
:param throttling_manager: takes care of deciding whether a task should
be started
:param capturing_tasks_interface: interface to interact with the tasks
database
:param capturas_interface: interface to interact with the ad database
:param parsing_flow_generator: an object capable of generating empty
parsing flows to give each task a new one
:param url_acquisition_object: gateway to obtaining the HTML of an url
:param dead_ad_checker: callable capable of checking if an ad is dead
through its HTML
"""
self._throttling_manager = throttling_manager
self._capturing_tasks_interface = capturing_tasks_interface
self._capturas_interface = capturas_interface
@ -103,6 +114,10 @@ class CapturingTask:
Initialize with task parameters and mark the task as being worked on
in the task queue.
:param task_parameters: dict with the necessary parameters for the task
:param capturing_interface: interface to interact with the ad database
:param new_parsing_flow: an empty parsing flow
:param url_acquisition_object: gateway to obtaining the HTML of an url
:param dead_ad_checker: callable capable of checking if an ad is dead
"""
self.uuid = task_parameters["uuid"]
self.ad_url = task_parameters["ad_url"]
@ -160,30 +175,6 @@ class CapturingTask:
self.update_status("Surrender")
logging.warning(f"A task has surrendered. {self.ad_url}")
def _extract_data(self) -> None:
"""
Parses the obtained html to extract the ad information.
:return: None
"""
self.parser = AdHtmlParser(self.html)
self.parser.parse()
def _check_data(self) -> None:
"""
Validates that all compulsory fields have been obtained and that the
values are within the expected. Sets the status of task accordingly.
:return: None
"""
if self.parser.fields_missing():
self.update_status("Fields missing")
return
if not self.parser.all_fields_are_valid():
self.update_status("Invalid value fields")
return
self.update_status("Data ready")
def get_ad_data(self) -> dict:
"""
Returns the extracted data.
@ -192,6 +183,12 @@ class CapturingTask:
return self._parsing_flow.field_values
def _parse_html(self, html: str) -> None:
"""
Execute the complete parsing flow and report the task status depending
on the outcome.
:param html: the HTML of the ad
:return: None
"""
self._parsing_flow.execute_flow(soup=BeautifulSoup(html, "html5lib"))
if not self._parsing_flow.issues:
@ -211,226 +208,6 @@ class CapturingTask:
return
class AdHtmlParser:
    """
    Object for parsing, storing and validating the data of the HTML of an ad.
    """

    def __init__(self, html_string: str) -> None:
        """
        Initialize an instance of the parser with the HTML of an ad.

        :param html_string: the full HTML code of the ad page
        """
        self.html = html_string
        # Per-field bookkeeping: whether the field was found in the HTML,
        # whether it is optional for a capture, and its extracted value.
        self.ad_fields = {
            "referencia": {"found": False, "optional": False, "value": None},
            "precio": {"found": False, "optional": False, "value": None},
            "tamano_categorico": {"found": False, "optional": True, "value": None},
            "m2": {"found": False, "optional": True, "value": None},
            "tipo_anuncio": {"found": False, "optional": False, "value": None},
            "calle": {"found": False, "optional": True, "value": None},
            "barrio": {"found": False, "optional": False, "value": None},
            "distrito": {"found": False, "optional": False, "value": None},
            "ciudad": {"found": False, "optional": False, "value": None},
            "cubierta": {"found": False, "optional": False, "value": None},
            "puerta_auto": {"found": False, "optional": False, "value": None},
            "ascensor": {"found": False, "optional": False, "value": None},
            "alarma": {"found": False, "optional": False, "value": None},
            "circuito": {"found": False, "optional": False, "value": None},
            "personal": {"found": False, "optional": False, "value": None},
            "telefono": {"found": False, "optional": True, "value": None},
        }

    def parse(self) -> None:
        """
        Parse the HTML and store the extracted ad data in ``self.ad_fields``.

        :return: None
        """
        soup = BeautifulSoup(self.html, "html5lib")
        # BUG FIX: BeautifulSoup's find_all() returns a (possibly empty)
        # list, never None, so the previous ``is not None`` checks were
        # always true and the ``[0]`` indexing crashed on pages missing the
        # element. Rely on list truthiness instead.
        canonical_links = soup.find_all("link", {"rel": "canonical"})
        if canonical_links:
            self.ad_fields["referencia"]["value"] = re.findall(
                r"[0-9]{5,20}", str(canonical_links[0])
            )[0]
            self.ad_fields["referencia"]["found"] = True
        price_tags = soup.find_all("strong", {"class": "price"})
        if price_tags:
            self.ad_fields["precio"]["value"] = "".join(
                re.findall(r"[0-9]", str(price_tags[0]))
            )
            self.ad_fields["precio"]["found"] = True
        info_features = soup.find("div", {"class": "info-features"})
        if info_features is not None:
            # NOTE(review): the "" literals below look like characters lost
            # during text extraction (plausibly "m²") — confirm against the
            # original source; they are preserved byte-for-byte here.
            try:
                if "" not in info_features.find("span").find("span").text:
                    self.ad_fields["tamano_categorico"]["value"] = (
                        info_features.find("span").find("span").text
                    )
                    self.ad_fields["tamano_categorico"]["found"] = True
            except AttributeError:
                # A missing nested <span> makes .find() return None; the
                # original silenced this with a bare except.
                pass
            posible_m2 = [tag.text for tag in info_features.find_all("span")]
            matches_m2 = [posible for posible in posible_m2 if "" in posible]
            if matches_m2:
                # Normalize the decimal separator to a dot.
                self.ad_fields["m2"]["value"] = "".join(
                    re.findall(r"[0-9]+,*[0-9]*", matches_m2[0])
                ).replace(",", ".")
                self.ad_fields["m2"]["found"] = True
        title = soup.find("title")
        if title is not None:
            # 1 = sale ("venta" in the page title), 2 = anything else.
            self.ad_fields["tipo_anuncio"]["value"] = (
                1 if "venta" in title.text else 2
            )
            self.ad_fields["tipo_anuncio"]["found"] = True
        # ROBUSTNESS: guard against a missing #headerMap div, which
        # previously raised AttributeError on .find_all.
        header_map = soup.find("div", {"id": "headerMap"})
        if header_map is not None:
            location_items = header_map.find_all("li")
            if len(location_items) > 3:
                # Location is read from the end of the list: city, district,
                # neighbourhood; the street is only present with > 4 items.
                self.ad_fields["calle"]["value"] = ""
                self.ad_fields["ciudad"]["value"] = location_items[-2].text.strip()
                self.ad_fields["ciudad"]["found"] = True
                self.ad_fields["distrito"]["value"] = (
                    location_items[-3].text.strip()
                )
                self.ad_fields["distrito"]["found"] = True
                self.ad_fields["barrio"]["value"] = location_items[-4].text.strip()
                self.ad_fields["barrio"]["found"] = True
                if len(location_items) > 4:
                    self.ad_fields["calle"]["value"] = (
                        location_items[0].text.strip()
                    )
                    self.ad_fields["calle"]["found"] = True
        features_lists = soup.find_all(
            "div", {"class": "details-property_features"}
        )
        features = [
            feature.text
            for feature_list in features_lists
            for feature in feature_list.find_all("li")
        ]
        # Boolean amenity fields are stored as 0/1 depending on whether the
        # keyword appears in any feature line (case-sensitive, as before).
        amenity_keywords = {
            "cubierta": "Cubierta",
            "puerta_auto": "Puerta",
            "ascensor": "ascensor",
            "alarma": "Alarma",
            "circuito": "Cámaras",
            "personal": "Personal",
        }
        for field, keyword in amenity_keywords.items():
            self.ad_fields[field]["value"] = 1 * any(
                keyword in feature for feature in features
            )
            self.ad_fields[field]["found"] = True
        phone_tag = soup.find("p", {"class": "txt-bold _browserPhone icon-phone"})
        if phone_tag is not None:
            self.ad_fields["telefono"]["value"] = phone_tag.text.replace(" ", "")
            self.ad_fields["telefono"]["found"] = True

    def _validate(self) -> None:
        """
        Check whether the extracted values are valid against the expected
        typology and store the offending field names in ``invalid_fields``.

        :return: None
        """
        self.invalid_fields = []
        # ROBUSTNESS: treat a missing value (None) as invalid instead of
        # raising TypeError inside re.match / ``in``.
        referencia = self.ad_fields["referencia"]["value"]
        if referencia is None or not re.match(r"[0-9]{4,20}", referencia):
            self.invalid_fields.append("referencia")
        precio = self.ad_fields["precio"]["value"]
        if precio is None or not re.match(r"[0-9]{1,20}", precio):
            self.invalid_fields.append("precio")
        possible_values_tamano = [
            "2 coches o más",
            "coche y moto",
            "coche grande",
            "coche pequeño",
            "moto",
            None,
        ]
        if (
            self.ad_fields["tamano_categorico"]["value"]
            not in possible_values_tamano
        ):
            self.invalid_fields.append("tamano_categorico")
        barrio = self.ad_fields["barrio"]["value"]
        if barrio is None or "Barrio" not in barrio:
            self.invalid_fields.append("barrio")
        distrito = self.ad_fields["distrito"]["value"]
        if distrito is None or "Distrito" not in distrito:
            self.invalid_fields.append("distrito")
        # TODO: add "+" to the set of valid characters (original Spanish
        # TODO; NOTE(review): the pattern below already allows a leading
        # "+" — confirm whether this TODO is stale).
        if self.ad_fields["telefono"]["found"] and not re.match(
            r"\s*\+?[0-9\s]*", self.ad_fields["telefono"]["value"]
        ):
            self.invalid_fields.append("telefono")

    def all_fields_are_valid(self) -> bool:
        """
        Report on whether the extracted data is valid.

        :return: True if all values are valid, False if not
        """
        self._validate()
        return not self.invalid_fields

    def fields_missing(self) -> bool:
        """
        Report on whether any compulsory field is absent.

        :return: True if some compulsory field is missing, False if not
        """
        return any(
            not contents["optional"] and not contents["found"]
            for contents in self.ad_fields.values()
        )

    def get_data(self) -> dict:
        """
        Return the extracted data in the form of a dictionary.

        :return: dictionary mapping field name to extracted value
        """
        return {name: field["value"] for name, field in self.ad_fields.items()}
if __name__ == "__main__":
    # Script entry point: build the interface to the capturing-task queue.
    # (The remainder of the main routine is outside this excerpt.)
    capturing_tasks_interface = CapturingTasksInterface()

View file

@ -92,7 +92,7 @@ class UrlAttack:
if self.response.ok:
self.success = True
except Exception as e:
except Exception:
self.success = False
if (

View file

@ -2518,8 +2518,8 @@ def test_referencia_instructions_extract_correctly(real_ad_html):
referencia_instructions.validate()
assert (
referencia_instructions.found == True
and referencia_instructions.valid == True
referencia_instructions.found is True
and referencia_instructions.valid is True
and referencia_instructions.value is not None
and referencia_instructions.search_issue is None
)
@ -2534,7 +2534,7 @@ def test_referencia_instructions_find_nothing_in_unrelated_html(unrelated_html):
referencia_instructions.validate()
assert (
referencia_instructions.found == False
referencia_instructions.found is False
and referencia_instructions.valid is None
and referencia_instructions.value is None
and referencia_instructions.search_issue is not None
@ -2580,8 +2580,8 @@ def test_all_instructions_extract_correctly(real_ad_html):
assert all(
[
instruction.found == True
and instruction.valid == True
instruction.found is True
and instruction.valid is True
and instruction.value is not None
and instruction.search_issue is None
for instruction in all_instructions
@ -2628,8 +2628,8 @@ def test_all_instructions_fail_on_unrelated_html(unrelated_html):
assert all(
[
instruction.found == False
and (instruction.valid == False or instruction.valid == None)
instruction.found is False
and (instruction.valid is False or instruction.valid is None)
and instruction.value is None
for instruction in all_instructions
]
@ -2725,7 +2725,7 @@ def test_parsing_flow_fails_for_unrelated_html(unrelated_html):
assert not parsing_flow.all_non_optional_fields_were_found and len(
parsing_flow.issues
) == len(all_instructions)
) == len([field for field in all_instructions if not field.is_optional])
def test_parsing_flow_generator_returns_proper_flows():