Formatting, docstrings and other chores.
parent adf2cd26ba
commit cbf1643fb5
3 changed files with 32 additions and 255 deletions
@@ -1,5 +1,3 @@
-import sys
-from time import sleep
 import datetime
 
 
@@ -33,6 +31,19 @@ class Capturer:
         url_acquisition_object: Type[UrlAttack],
         dead_ad_checker: Callable,
     ) -> None:
+        """
+        Receive all required objects.
+        :param throttling_manager: takes care of deciding whether a task
+            should be started
+        :param capturing_tasks_interface: interface to interact with the tasks
+            database
+        :param capturas_interface: interface to interact with the ad database
+        :param parsing_flow_generator: an object capable of generating empty
+            parsing flows to give each task a new one
+        :param url_acquisition_object: gateway to obtaining the HTML of a URL
+        :param dead_ad_checker: callable capable of checking if an ad is dead
+            through its HTML
+        """
         self._throttling_manager = throttling_manager
         self._capturing_tasks_interface = capturing_tasks_interface
         self._capturas_interface = capturas_interface
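
A note on the pattern: the docstring above documents pure constructor injection — every collaborator is handed in rather than created inside Capturer, which keeps the class testable. A minimal wiring sketch; the stub classes are hypothetical stand-ins (only UrlAttack and CapturingTasksInterface are named elsewhere in this diff):

    # Hypothetical stubs standing in for the real collaborators.
    class ThrottlingManager: ...
    class CapturingTasksInterface: ...
    class CapturasInterface: ...
    class ParsingFlowGenerator: ...
    class UrlAttack: ...

    capturer = Capturer(
        throttling_manager=ThrottlingManager(),
        capturing_tasks_interface=CapturingTasksInterface(),
        capturas_interface=CapturasInterface(),
        parsing_flow_generator=ParsingFlowGenerator(),
        url_acquisition_object=UrlAttack,  # the class itself, per the Type[UrlAttack] annotation
        dead_ad_checker=lambda html: "no disponible" in html,  # illustrative heuristic only
    )
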
@@ -103,6 +114,10 @@ class CapturingTask:
         Initialize with task parameters and mark the task as being worked on
         in the task queue.
         :param task_parameters: dict with the necessary parameters for the task
+        :param capturing_interface: interface to interact with the ad database
+        :param new_parsing_flow: an empty parsing flow
+        :param url_acquisition_object: gateway to obtaining the HTML of a URL
+        :param dead_ad_checker: callable capable of checking if an ad is dead
         """
         self.uuid = task_parameters["uuid"]
         self.ad_url = task_parameters["ad_url"]
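
From the assignments that follow the docstring, task_parameters carries at least the "uuid" and "ad_url" keys. A sketch of the expected shape (both values are illustrative placeholders, not real data):

    task_parameters = {
        "uuid": "00000000-0000-0000-0000-000000000000",  # placeholder task id
        "ad_url": "https://example.com/anuncio/1234567",  # placeholder ad URL
    }
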
@@ -160,30 +175,6 @@ class CapturingTask:
         self.update_status("Surrender")
         logging.warning(f"A task has surrendered. {self.ad_url}")
 
-    def _extract_data(self) -> None:
-        """
-        Parses the obtained html to extract the ad information.
-        :return: None
-        """
-        self.parser = AdHtmlParser(self.html)
-        self.parser.parse()
-
-    def _check_data(self) -> None:
-        """
-        Validates that all compulsory fields have been obtained and that the
-        values are within the expected. Sets the status of task accordingly.
-        :return: None
-        """
-        if self.parser.fields_missing():
-            self.update_status("Fields missing")
-            return
-
-        if not self.parser.all_fields_are_valid():
-            self.update_status("Invalid value fields")
-            return
-
-        self.update_status("Data ready")
-
     def get_ad_data(self) -> dict:
         """
         Returns the extracted data.
@@ -192,6 +183,12 @@ class CapturingTask:
         return self._parsing_flow.field_values
 
     def _parse_html(self, html: str) -> None:
+        """
+        Execute the complete parsing flow and report the task status depending
+        on the outcome.
+        :param html: the HTML of the ad
+        :return: None
+        """
         self._parsing_flow.execute_flow(soup=BeautifulSoup(html, "html5lib"))
 
         if not self._parsing_flow.issues:
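
_parse_html builds the soup with the html5lib tree builder, the most lenient of BeautifulSoup's parsers: it repairs malformed markup the way a browser would, which suits scraped ad pages. A standalone illustration (requires the beautifulsoup4 and html5lib packages):

    from bs4 import BeautifulSoup

    broken = '<div class="info-features"><span>102 m²</span>'  # unclosed div
    soup = BeautifulSoup(broken, "html5lib")
    # html5lib closes the div and builds a full html/head/body tree around it:
    print(soup.find("div", {"class": "info-features"}).find("span").text)  # 102 m²
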
@@ -211,226 +208,6 @@ class CapturingTask:
             return
 
 
-class AdHtmlParser:
-    """
-    Object for parsing, storing and validating the data of the HTML of an ad.
-    """
-
-    def __init__(self, html_string: str) -> None:
-        """
-        Initializes an instance of the parser with the HTML of an ad.
-        :param html_string: the full HTML code of the ad page
-        """
-        self.html = html_string
-
-        self.ad_fields = {
-            "referencia": {"found": False, "optional": False, "value": None},
-            "precio": {"found": False, "optional": False, "value": None},
-            "tamano_categorico": {"found": False, "optional": True, "value": None},
-            "m2": {"found": False, "optional": True, "value": None},
-            "tipo_anuncio": {"found": False, "optional": False, "value": None},
-            "calle": {"found": False, "optional": True, "value": None},
-            "barrio": {"found": False, "optional": False, "value": None},
-            "distrito": {"found": False, "optional": False, "value": None},
-            "ciudad": {"found": False, "optional": False, "value": None},
-            "cubierta": {"found": False, "optional": False, "value": None},
-            "puerta_auto": {"found": False, "optional": False, "value": None},
-            "ascensor": {"found": False, "optional": False, "value": None},
-            "alarma": {"found": False, "optional": False, "value": None},
-            "circuito": {"found": False, "optional": False, "value": None},
-            "personal": {"found": False, "optional": False, "value": None},
-            "telefono": {"found": False, "optional": True, "value": None},
-        }
-
-    def parse(self) -> None:
-        """
-        Parses the HTML and stores the ad data.
-        :return: None
-        """
-
-        soup = BeautifulSoup(self.html, "html5lib")
-
-        if soup.find_all("link", {"rel": "canonical"}) is not None:
-            self.ad_fields["referencia"]["value"] = re.findall(
-                r"[0-9]{5,20}", str(soup.find_all("link", {"rel": "canonical"})[0])
-            )[0]
-            self.ad_fields["referencia"]["found"] = True
-
-        if soup.find_all("strong", {"class": "price"}) is not None:
-            self.ad_fields["precio"]["value"] = "".join(
-                re.findall(
-                    r"[0-9]", str(soup.find_all("strong", {"class": "price"})[0])
-                )
-            )
-            self.ad_fields["precio"]["found"] = True
-
-        if soup.find("div", {"class": "info-features"}) is not None:
-            try:
-                if (
-                    "m²"
-                    not in soup.find("div", {"class": "info-features"})
-                    .find("span")
-                    .find("span")
-                    .text
-                ):
-                    self.ad_fields["tamano_categorico"]["value"] = (
-                        soup.find("div", {"class": "info-features"})
-                        .find("span")
-                        .find("span")
-                        .text
-                    )
-                    self.ad_fields["tamano_categorico"]["found"] = True
-            except:
-                pass
-
-            posible_m2 = [
-                tag.text
-                for tag in soup.find("div", {"class": "info-features"}).find_all("span")
-            ]
-            if [posible for posible in posible_m2 if "m²" in posible]:
-                self.ad_fields["m2"]["value"] = [
-                    "".join(re.findall(r"[0-9]+,*[0-9]*", posible))
-                    for posible in posible_m2
-                    if "m²" in posible
-                ][0].replace(",", ".")
-                self.ad_fields["m2"]["found"] = True
-
-        if soup.find("title") is not None:
-            if "venta" in soup.find("title").text:
-                self.ad_fields["tipo_anuncio"]["value"] = 1
-            else:
-                self.ad_fields["tipo_anuncio"]["value"] = 2
-            self.ad_fields["tipo_anuncio"]["found"] = True
-
-        if len(soup.find("div", {"id": "headerMap"}).find_all("li")) > 3:
-            self.ad_fields["calle"]["value"] = ""
-            self.ad_fields["ciudad"]["value"] = (
-                soup.find("div", {"id": "headerMap"}).find_all("li")[-2].text.strip()
-            )
-            self.ad_fields["ciudad"]["found"] = True
-            self.ad_fields["distrito"]["value"] = (
-                soup.find("div", {"id": "headerMap"}).find_all("li")[-3].text.strip()
-            )
-            self.ad_fields["distrito"]["found"] = True
-            self.ad_fields["barrio"]["value"] = (
-                soup.find("div", {"id": "headerMap"}).find_all("li")[-4].text.strip()
-            )
-            self.ad_fields["barrio"]["found"] = True
-            if len(soup.find("div", {"id": "headerMap"}).find_all("li")) > 4:
-                self.ad_fields["calle"]["value"] = (
-                    soup.find("div", {"id": "headerMap"}).find_all("li")[0].text.strip()
-                )
-                self.ad_fields["calle"]["found"] = True
-
-        features_lists = soup.find_all("div", {"class": "details-property_features"})
-        features = [
-            feature.text
-            for feature_list in features_lists
-            for feature in feature_list.find_all("li")
-        ]
-        self.ad_fields["cubierta"]["value"] = 1 * any(
-            "Cubierta" in feature for feature in features
-        )
-        self.ad_fields["puerta_auto"]["value"] = 1 * any(
-            "Puerta" in feature for feature in features
-        )
-        self.ad_fields["ascensor"]["value"] = 1 * any(
-            "ascensor" in feature for feature in features
-        )
-        self.ad_fields["alarma"]["value"] = 1 * any(
-            "Alarma" in feature for feature in features
-        )
-        self.ad_fields["circuito"]["value"] = 1 * any(
-            "Cámaras" in feature for feature in features
-        )
-        self.ad_fields["personal"]["value"] = 1 * any(
-            "Personal" in feature for feature in features
-        )
-
-        self.ad_fields["cubierta"]["found"] = True
-        self.ad_fields["puerta_auto"]["found"] = True
-        self.ad_fields["ascensor"]["found"] = True
-        self.ad_fields["alarma"]["found"] = True
-        self.ad_fields["circuito"]["found"] = True
-        self.ad_fields["personal"]["found"] = True
-
-        if soup.find("p", {"class": "txt-bold _browserPhone icon-phone"}) is not None:
-            self.ad_fields["telefono"]["value"] = soup.find(
-                "p", {"class": "txt-bold _browserPhone icon-phone"}
-            ).text.replace(" ", "")
-            self.ad_fields["telefono"]["found"] = True
-
-    def _validate(self) -> None:
-        """
-        Checks whether the extracted values are valid against the expected
-        typology. Stores the results.
-        :return: None
-        """
-        self.invalid_fields = []
-
-        if not re.match(r"[0-9]{4,20}", self.ad_fields["referencia"]["value"]):
-            self.invalid_fields.append("referencia")
-
-        if not re.match(r"[0-9]{1,20}", self.ad_fields["precio"]["value"]):
-            self.invalid_fields.append("precio")
-
-        possible_values_tamano = [
-            "2 coches o más",
-            "coche y moto",
-            "coche grande",
-            "coche pequeño",
-            "moto",
-            None,
-        ]
-        if self.ad_fields["tamano_categorico"]["value"] not in possible_values_tamano:
-            self.invalid_fields.append("tamano_categorico")
-
-        if not "Barrio" in self.ad_fields["barrio"]["value"]:
-            self.invalid_fields.append("barrio")
-
-        if not "Distrito" in self.ad_fields["distrito"]["value"]:
-            self.invalid_fields.append("distrito")
-
-        if self.ad_fields["telefono"]["found"] and not re.match(
-            r"\s*\+?[0-9\s]*", self.ad_fields["telefono"]["value"]
-        ):
-            self.invalid_fields.append("telefono")
-        # TODO añadir + a caracteres validos
-
-    def all_fields_are_valid(self) -> bool:
-        """
-        Reports on whether the extracted data is valid.
-        :return: True if values are valid, false if not
-        """
-        self._validate()
-        if self.invalid_fields:
-            return False
-        else:
-            return True
-
-    def fields_missing(self) -> bool:
-        """
-        Reports on whether all compulsory fields are present.
-        :return: True if some field is missing, false if not
-        """
-        for key, contents in self.ad_fields.items():
-            if not contents["optional"] and not contents["found"]:
-                return True
-        return False
-
-    def get_data(self) -> dict:
-        """
-        Returns the extracted data in the form of a dictionary.
-        :return: dictionary with the extracted data
-        """
-        data = {}
-
-        for ad_field in self.ad_fields.keys():
-            data[ad_field] = self.ad_fields[ad_field]["value"]
-
-        return data
 
 
 if __name__ == "__main__":
 
     capturing_tasks_interface = CapturingTasksInterface()
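
One pitfall in the removed parser worth recording: guards like `if soup.find_all(...) is not None:` never fail, because find_all always returns a list (empty when nothing matches), so it was the subsequent [0] index that actually blew up on missing elements. A safer version of the referencia extraction, as a sketch:

    from bs4 import BeautifulSoup
    import re

    soup = BeautifulSoup('<link rel="canonical" href="/anuncio/1234567">', "html5lib")

    links = soup.find_all("link", {"rel": "canonical"})
    if links:  # an empty result set is falsy, but never None
        match = re.search(r"[0-9]{5,20}", str(links[0]))
        if match:
            referencia = match.group(0)
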
@@ -92,7 +92,7 @@ class UrlAttack:
 
             if self.response.ok:
                 self.success = True
-        except Exception as e:
+        except Exception:
             self.success = False
 
         if (
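
The handler is narrowed to drop the unused `as e` binding. Judging by `self.response.ok`, UrlAttack likely wraps the requests library (not shown in this diff); in that case, catching the library's base exception would be tighter still than a bare Exception. A hedged sketch of the pattern:

    import requests

    def fetch_ok(url: str) -> bool:
        # Sketch only: True when the URL answered with a status below 400.
        try:
            response = requests.get(url, timeout=10)
            return response.ok  # requests sets .ok for status codes < 400
        except requests.RequestException:  # narrower than a bare Exception
            return False
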
@@ -2518,8 +2518,8 @@ def test_referencia_instructions_extract_correctly(real_ad_html):
     referencia_instructions.validate()
 
     assert (
-        referencia_instructions.found == True
-        and referencia_instructions.valid == True
+        referencia_instructions.found is True
+        and referencia_instructions.valid is True
         and referencia_instructions.value is not None
         and referencia_instructions.search_issue is None
     )
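
The switch from == to `is` is stricter than it looks: equality also passes for truthy non-booleans that compare equal to True, while identity only passes for the actual True singleton. A quick demonstration:

    found = 1                    # truthy int, not a bool
    assert found == True         # passes: 1 compares equal to True
    assert not (found is True)   # identity fails: 1 is not the bool singleton
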
@@ -2534,7 +2534,7 @@ def test_referencia_instructions_find_nothing_in_unrelated_html(unrelated_html):
     referencia_instructions.validate()
 
     assert (
-        referencia_instructions.found == False
+        referencia_instructions.found is False
         and referencia_instructions.valid is None
         and referencia_instructions.value is None
         and referencia_instructions.search_issue is not None
@@ -2580,8 +2580,8 @@ def test_all_instructions_extract_correctly(real_ad_html):
 
     assert all(
         [
-            instruction.found == True
-            and instruction.valid == True
+            instruction.found is True
+            and instruction.valid is True
             and instruction.value is not None
             and instruction.search_issue is None
             for instruction in all_instructions
@@ -2628,8 +2628,8 @@ def test_all_instructions_fail_on_unrelated_html(unrelated_html):
 
     assert all(
         [
-            instruction.found == False
-            and (instruction.valid == False or instruction.valid == None)
+            instruction.found is False
+            and (instruction.valid is False or instruction.valid is None)
             and instruction.value is None
             for instruction in all_instructions
         ]
@@ -2725,7 +2725,7 @@ def test_parsing_flow_fails_for_unrelated_html(unrelated_html):
 
     assert not parsing_flow.all_non_optional_fields_were_found and len(
         parsing_flow.issues
-    ) == len(all_instructions)
+    ) == len([field for field in all_instructions if not field.is_optional])
 
 
 def test_parsing_flow_generator_returns_proper_flows():
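
The corrected assertion encodes that, on unrelated HTML, every non-optional instruction contributes exactly one issue while optional instructions fail silently. A toy illustration of the counting (FakeInstruction is a hypothetical stand-in for the project's instruction objects):

    from dataclasses import dataclass

    @dataclass
    class FakeInstruction:  # stand-in, not the project's class
        is_optional: bool
        found: bool = False

    instructions = [FakeInstruction(False), FakeInstruction(False), FakeInstruction(True)]

    # Only non-optional misses count as issues, so the two lengths agree:
    issues = [i for i in instructions if not i.found and not i.is_optional]
    assert len(issues) == len([i for i in instructions if not i.is_optional])  # 2, not 3
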