# drogon/capturer/capturer.py

import datetime
import logging
import re
from time import sleep
from typing import Callable, Type

from bs4 import BeautifulSoup

from db_layer.capturing_tasks_interface import CapturingTasksInterface
from db_layer.capturas_interface import CapturasInterface
from core.scrapping_utils import UrlAttack
from core.config import working_hours, minimum_seconds_between_tries
from core.throttling_utils import (
    ThrottleManager,
    WorkingHoursThrottlingRule,
    CooldownThrottlingRule,
    DynamicThrottlingRule,
)
from refresher.refresher import Refresher
from core.parsing_utils import *


class Capturer:
"""
Daemon with the full flow of execution of individual ad requesting, data
scraping and db storage.
"""
def __init__(
self,
throttling_manager: ThrottleManager,
capturing_tasks_interface: CapturingTasksInterface,
capturas_interface: CapturasInterface,
parsing_flow_generator: ParsingFlowGenerator,
url_acquisition_object: Type[UrlAttack],
dead_ad_checker: Callable,
) -> None:
self._throttling_manager = throttling_manager
self._capturing_tasks_interface = capturing_tasks_interface
self._capturas_interface = capturas_interface
self._parsing_flow_generator = parsing_flow_generator
self._url_acquisition_object = url_acquisition_object
self._dead_ad_checker = dead_ad_checker
        self.last_try_datetime = datetime.datetime.now()

    def start(self) -> None:
"""
Full flow of execution. Checks whether it should capture a URL, tries
to do so and stores the result if successful.
:return: None
"""
logging.info("Starting capturer")
while True:
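            # Poll the throttling rules every 10 seconds until a new attempt is allowed.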
while not self._throttling_manager.allow_next_task(
last_attempt_timestamp=self.last_try_datetime
):
sleep(10)
logging.info("Waiting...")
pending_task = self._capturing_tasks_interface.get_pending_task()
logging.info("Got a task")
task = CapturingTask(
pending_task,
capturing_interface=self._capturing_tasks_interface,
new_parsing_flow=self._parsing_flow_generator.get_new_flow(),
url_acquisition_object=self._url_acquisition_object,
dead_ad_checker=self._dead_ad_checker,
)
self.last_try_datetime = datetime.datetime.now()
task.capture()
if task.status == "Data ready":
ad_data = task.get_ad_data()
else:
logging.warning("Something went wrong, not adding data.")
continue
self._capturas_interface.insert_captura(ad_data)
task.update_status("Captura inserted")
logging.info("New ad inserted.")
class CapturingTask:
"""
Task object wrapping the process of attempting to capture and ad, parsing
the data and sending to db.
"""
sleep_time_failed_request = 180
def __init__(
self,
task_parameters: dict,
capturing_interface: CapturingTasksInterface,
new_parsing_flow: ParsingFlow,
url_acquisition_object: Type[UrlAttack],
dead_ad_checker: Callable,
) -> None:
"""
Initialize with task parameters and mark the task as being worked on
in the task queue.
:param task_parameters: dict with the necessary parameters for the task
"""
self.uuid = task_parameters["uuid"]
self.ad_url = task_parameters["ad_url"]
self.uuid_exploring = task_parameters["fk_uuid_exploring"]
self.status = task_parameters["status"]
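        # Attempt counter; capture() gives up after the third failed request.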
self.request_failures = 1
self.html = None
self._parsing_flow = new_parsing_flow
self._capturing_interface = capturing_interface
        self._url_acquisition_object = url_acquisition_object
self._is_dead_ad = dead_ad_checker
self.update_status("Loading")
def update_status(self, new_status) -> None:
"""
Updates the task status and persists it in the task queue.
:param new_status: string describing the new status
:return: None
"""
self.status = new_status
self._capturing_interface.update_capturing_task(
self.uuid, self.uuid_exploring, self.status, self.ad_url
        )

    def capture(self) -> None:
"""
Main flow of work
"""
self.update_status("WIP")
        while self.request_failures < 4:
            attack = self._url_acquisition_object(self.ad_url)
            attack.attack()
            if attack.success:
                self._parse_html(html=attack.get_text())
                return
            # The request failed: check whether the ad itself is gone before
            # counting this as a retryable failure.
            try:
                if self._is_dead_ad(attack.get_text()):
                    self.update_status("Dead ad")
                    return
            except AttributeError:
                logging.error(
                    "Something went wrong when checking if the ad is gone"
                )
            self.update_status("Fail {}".format(self.request_failures))
            self.request_failures += 1
            sleep(CapturingTask.sleep_time_failed_request)
self.update_status("Surrender")
logging.warning(f"A task has surrendered. {self.ad_url}")
def _extract_data(self) -> None:
"""
Parses the obtained html to extract the ad information.
:return: None
"""
self.parser = AdHtmlParser(self.html)
        self.parser.parse()

    def _check_data(self) -> None:
"""
        Validates that all compulsory fields have been obtained and that the
        values are within the expected ranges. Sets the status of the task
        accordingly.
:return: None
"""
if self.parser.fields_missing():
self.update_status("Fields missing")
return
if not self.parser.all_fields_are_valid():
self.update_status("Invalid value fields")
return
self.update_status("Data ready")
def get_ad_data(self) -> dict:
"""
Returns the extracted data.
:return: dictionary with the data of the ad.
"""
        return self._parsing_flow.field_values

    def _parse_html(self, html: str) -> None:
self._parsing_flow.execute_flow(soup=BeautifulSoup(html, "html5lib"))
if not self._parsing_flow.issues:
self.update_status("Data ready")
return
if not self._parsing_flow.all_found_fields_are_valid:
self.update_status("Invalid value fields")
logging.warning(f"Invalid fields found in ad: {self.ad_url}")
logging.warning(f"{self._parsing_flow.issues}")
return
if not self._parsing_flow.all_non_optional_fields_were_found:
self.update_status("Fields missing")
            logging.warning(
                f"Couldn't scrape necessary fields: {self._parsing_flow.issues}"
            )
            return


class AdHtmlParser:
"""
Object for parsing, storing and validating the data of the HTML of an ad.
"""
def __init__(self, html_string: str) -> None:
"""
Initializes an instance of the parser with the HTML of an ad.
:param html_string: the full HTML code of the ad page
"""
self.html = html_string
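        # Per-field parsing state: whether the field was found, whether it is
        # optional, and the extracted value. Missing non-optional fields make
        # fields_missing() return True.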
self.ad_fields = {
"referencia": {"found": False, "optional": False, "value": None},
"precio": {"found": False, "optional": False, "value": None},
"tamano_categorico": {"found": False, "optional": True, "value": None},
"m2": {"found": False, "optional": True, "value": None},
"tipo_anuncio": {"found": False, "optional": False, "value": None},
"calle": {"found": False, "optional": True, "value": None},
"barrio": {"found": False, "optional": False, "value": None},
"distrito": {"found": False, "optional": False, "value": None},
"ciudad": {"found": False, "optional": False, "value": None},
"cubierta": {"found": False, "optional": False, "value": None},
"puerta_auto": {"found": False, "optional": False, "value": None},
"ascensor": {"found": False, "optional": False, "value": None},
"alarma": {"found": False, "optional": False, "value": None},
"circuito": {"found": False, "optional": False, "value": None},
"personal": {"found": False, "optional": False, "value": None},
"telefono": {"found": False, "optional": True, "value": None},
        }

    def parse(self) -> None:
"""
Parses the HTML and stores the ad data.
:return: None
"""
soup = BeautifulSoup(self.html, "html5lib")
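        # html5lib is lenient with malformed markup, at the cost of speed.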
if soup.find_all("link", {"rel": "canonical"}) is not None:
self.ad_fields["referencia"]["value"] = re.findall(
r"[0-9]{5,20}", str(soup.find_all("link", {"rel": "canonical"})[0])
)[0]
self.ad_fields["referencia"]["found"] = True
if soup.find_all("strong", {"class": "price"}) is not None:
self.ad_fields["precio"]["value"] = "".join(
re.findall(
r"[0-9]", str(soup.find_all("strong", {"class": "price"})[0])
)
)
self.ad_fields["precio"]["found"] = True
if soup.find("div", {"class": "info-features"}) is not None:
try:
if (
""
not in soup.find("div", {"class": "info-features"})
.find("span")
.find("span")
.text
):
self.ad_fields["tamano_categorico"]["value"] = (
soup.find("div", {"class": "info-features"})
.find("span")
.find("span")
.text
)
self.ad_fields["tamano_categorico"]["found"] = True
except:
pass
posible_m2 = [
tag.text
for tag in soup.find("div", {"class": "info-features"}).find_all("span")
]
if [posible for posible in posible_m2 if "" in posible]:
self.ad_fields["m2"]["value"] = [
"".join(re.findall(r"[0-9]+,*[0-9]*", posible))
for posible in posible_m2
if "" in posible
][0].replace(",", ".")
self.ad_fields["m2"]["found"] = True
if soup.find("title") is not None:
if "venta" in soup.find("title").text:
self.ad_fields["tipo_anuncio"]["value"] = 1
else:
self.ad_fields["tipo_anuncio"]["value"] = 2
self.ad_fields["tipo_anuncio"]["found"] = True
if len(soup.find("div", {"id": "headerMap"}).find_all("li")) > 3:
self.ad_fields["calle"]["value"] = ""
self.ad_fields["ciudad"]["value"] = (
soup.find("div", {"id": "headerMap"}).find_all("li")[-2].text.strip()
)
self.ad_fields["ciudad"]["found"] = True
self.ad_fields["distrito"]["value"] = (
soup.find("div", {"id": "headerMap"}).find_all("li")[-3].text.strip()
)
self.ad_fields["distrito"]["found"] = True
self.ad_fields["barrio"]["value"] = (
soup.find("div", {"id": "headerMap"}).find_all("li")[-4].text.strip()
)
self.ad_fields["barrio"]["found"] = True
if len(soup.find("div", {"id": "headerMap"}).find_all("li")) > 4:
self.ad_fields["calle"]["value"] = (
soup.find("div", {"id": "headerMap"}).find_all("li")[0].text.strip()
)
self.ad_fields["calle"]["found"] = True
features_lists = soup.find_all("div", {"class": "details-property_features"})
features = [
feature.text
for feature_list in features_lists
for feature in feature_list.find_all("li")
]
self.ad_fields["cubierta"]["value"] = 1 * any(
"Cubierta" in feature for feature in features
)
self.ad_fields["puerta_auto"]["value"] = 1 * any(
"Puerta" in feature for feature in features
)
self.ad_fields["ascensor"]["value"] = 1 * any(
"ascensor" in feature for feature in features
)
self.ad_fields["alarma"]["value"] = 1 * any(
"Alarma" in feature for feature in features
)
self.ad_fields["circuito"]["value"] = 1 * any(
"Cámaras" in feature for feature in features
)
self.ad_fields["personal"]["value"] = 1 * any(
"Personal" in feature for feature in features
)
self.ad_fields["cubierta"]["found"] = True
self.ad_fields["puerta_auto"]["found"] = True
self.ad_fields["ascensor"]["found"] = True
self.ad_fields["alarma"]["found"] = True
self.ad_fields["circuito"]["found"] = True
self.ad_fields["personal"]["found"] = True
if soup.find("p", {"class": "txt-bold _browserPhone icon-phone"}) is not None:
self.ad_fields["telefono"]["value"] = soup.find(
"p", {"class": "txt-bold _browserPhone icon-phone"}
).text.replace(" ", "")
self.ad_fields["telefono"]["found"] = True
def _validate(self) -> None:
"""
        Checks whether the extracted values match the expected formats and
        value sets. Stores the results.
:return: None
"""
self.invalid_fields = []
if not re.match(r"[0-9]{4,20}", self.ad_fields["referencia"]["value"]):
self.invalid_fields.append("referencia")
if not re.match(r"[0-9]{1,20}", self.ad_fields["precio"]["value"]):
self.invalid_fields.append("precio")
possible_values_tamano = [
"2 coches o más",
"coche y moto",
"coche grande",
"coche pequeño",
"moto",
None,
]
if self.ad_fields["tamano_categorico"]["value"] not in possible_values_tamano:
self.invalid_fields.append("tamano_categorico")
if not "Barrio" in self.ad_fields["barrio"]["value"]:
self.invalid_fields.append("barrio")
if not "Distrito" in self.ad_fields["distrito"]["value"]:
self.invalid_fields.append("distrito")
if self.ad_fields["telefono"]["found"] and not re.match(
r"\s*\+?[0-9\s]*", self.ad_fields["telefono"]["value"]
):
self.invalid_fields.append("telefono")
# TODO añadir + a caracteres validos
def all_fields_are_valid(self) -> bool:
"""
Reports on whether the extracted data is valid.
:return: True if values are valid, false if not
"""
self._validate()
if self.invalid_fields:
return False
else:
return True
def fields_missing(self) -> bool:
"""
Reports on whether all compulsory fields are present.
:return: True if some field is missing, false if not
"""
for key, contents in self.ad_fields.items():
if not contents["optional"] and not contents["found"]:
return True
return False
def get_data(self) -> dict:
"""
Returns the extracted data in the form of a dictionary.
:return: dictionary with the extracted data
"""
data = {}
for ad_field in self.ad_fields.keys():
data[ad_field] = self.ad_fields[ad_field]["value"]
return data
if __name__ == "__main__":
capturing_tasks_interface = CapturingTasksInterface()
capturas_interface = CapturasInterface()
throttling_manager = ThrottleManager()
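    # Allow attempts only inside working hours, after a cooldown since the
    # last try, and while there is a pending task to work on.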
throttling_manager.add_rule(WorkingHoursThrottlingRule(working_hours)).add_rule(
CooldownThrottlingRule(minimum_seconds_between_tries),
required_argument_names=["last_attempt_timestamp"],
).add_rule(
DynamicThrottlingRule(
lambda: bool(capturing_tasks_interface.get_pending_task())
)
)
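    # One instruction class per ad field; the binary secondary features share
    # a single instruction class parameterized by field name and keyword.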
parsing_flow_generator = ParsingFlowGenerator(
ParsingFlow,
(
(ReferenciaFieldInstructions, {}),
(PrecioFieldInstructions, {}),
(TamanoCategoricoFieldInstructions, {}),
(M2FieldInstructions, {}),
(TipoAnuncioFieldInstructions, {}),
(CalleFieldInstructions, {}),
(BarrioFieldInstructions, {}),
(DistritoFieldInstructions, {}),
(CiudadFieldInstructions, {}),
(
SecondaryFeaturesFieldInstructions,
{"field_name": "cubierta", "search_keyword": "Cubierta"},
),
(
SecondaryFeaturesFieldInstructions,
{"field_name": "puerta_auto", "search_keyword": "Puerta"},
),
(
SecondaryFeaturesFieldInstructions,
{"field_name": "ascensor", "search_keyword": "ascensor"},
),
(
SecondaryFeaturesFieldInstructions,
{"field_name": "alarma", "search_keyword": "Alarma"},
),
(
SecondaryFeaturesFieldInstructions,
{"field_name": "circuito", "search_keyword": "Cámaras"},
),
(
SecondaryFeaturesFieldInstructions,
{"field_name": "personal", "search_keyword": "Personal"},
),
(TelefonoFieldInstructions, {}),
),
)
capturer = Capturer(
throttling_manager=throttling_manager,
capturing_tasks_interface=capturing_tasks_interface,
capturas_interface=capturas_interface,
parsing_flow_generator=parsing_flow_generator,
url_acquisition_object=UrlAttack,
dead_ad_checker=Refresher.dead_ad_checker,
)
capturer.start()