# drogon/capturer/capturer.py

import datetime
import logging
import re
from time import sleep
from typing import Callable, Type

from bs4 import BeautifulSoup

from db_layer.capturing_tasks_interface import CapturingTasksInterface
from db_layer.capturas_interface import CapturasInterface
from core.scrapping_utils import UrlAttack
from core.config import working_hours, minimum_seconds_between_tries
from core.throttling_utils import (
    ThrottleManager,
    WorkingHoursThrottlingRule,
    CooldownThrottlingRule,
    DynamicThrottlingRule,
)
from refresher.refresher import Refresher
from core.parsing_utils import *


class Capturer:
"""
Daemon with the full flow of execution of individual ad requesting, data
scraping and db storage.
"""
def __init__(
self,
throttling_manager: ThrottleManager,
capturing_tasks_interface: CapturingTasksInterface,
capturas_interface: CapturasInterface,
parsing_flow_generator: ParsingFlowGenerator,
url_acquisition_object: Type[UrlAttack],
dead_ad_checker: Callable,
) -> None:
self._throttling_manager = throttling_manager
self._capturing_tasks_interface = capturing_tasks_interface
self._capturas_interface = capturas_interface
self._parsing_flow_generator = parsing_flow_generator
self._url_acquisition_object = url_acquisition_object
self._dead_ad_checker = dead_ad_checker
        self.last_try_datetime = datetime.datetime.now()

    def start(self) -> None:
"""
Full flow of execution. Checks whether it should capture a URL, tries
to do so and stores the result if successful.
:return: None
"""
logging.info("Starting capturer")
while True:
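            # Poll the throttling rules every 10 seconds until a new attempt is allowed.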
while not self._throttling_manager.allow_next_task(
last_attempt_timestamp=self.last_try_datetime
):
sleep(10)
logging.info("Waiting...")
pending_task = self._capturing_tasks_interface.get_pending_task()
logging.info("Got a task")
task = CapturingTask(
pending_task,
capturing_interface=self._capturing_tasks_interface,
new_parsing_flow=self._parsing_flow_generator.get_new_flow(),
url_acquisition_object=self._url_acquisition_object,
dead_ad_checker=self._dead_ad_checker,
)
self.last_try_datetime = datetime.datetime.now()
task.capture()
if task.status == "Data ready":
ad_data = task.get_ad_data()
else:
logging.warning("Something went wrong, not adding data.")
continue
self._capturas_interface.insert_captura(ad_data)
task.update_status("Captura inserted")
logging.info("New ad inserted.")
class CapturingTask:
"""
Task object wrapping the process of attempting to capture and ad, parsing
the data and sending to db.
"""
sleep_time_failed_request = 180
def __init__(
self,
task_parameters: dict,
capturing_interface: CapturingTasksInterface,
new_parsing_flow: ParsingFlow,
url_acquisition_object: Type[UrlAttack],
dead_ad_checker: Callable,
) -> None:
"""
Initialize with task parameters and mark the task as being worked on
in the task queue.
:param task_parameters: dict with the necessary parameters for the task
"""
self.uuid = task_parameters["uuid"]
self.ad_url = task_parameters["ad_url"]
self.uuid_exploring = task_parameters["fk_uuid_exploring"]
self.status = task_parameters["status"]
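        # Attempt counter; capture() gives up after the third failed request.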
self.request_failures = 1
self.html = None
self._parsing_flow = new_parsing_flow
self._capturing_interface = capturing_interface
        self._url_acquisition_object = url_acquisition_object
self._is_dead_ad = dead_ad_checker
self.update_status("Loading")
def update_status(self, new_status) -> None:
"""
Updates the task status and persists it in the task queue.
:param new_status: string describing the new status
:return: None
"""
self.status = new_status
self._capturing_interface.update_capturing_task(
self.uuid, self.uuid_exploring, self.status, self.ad_url
        )

    def capture(self) -> None:
"""
Main flow of work
"""
self.update_status("WIP")
        while self.request_failures < 4:
            attack = self._url_acquisition_object(self.ad_url)
            attack.attack()
            if attack.success:
                self._parse_html(html=attack.get_text())
                return
            # The request failed: check whether the ad itself is gone before
            # counting this as a retryable failure.
            try:
                if self._is_dead_ad(attack.get_text()):
                    self.update_status("Dead ad")
                    return
            except AttributeError:
                logging.error(
                    "Something went wrong when checking if the ad is gone"
                )
            self.update_status("Fail {}".format(self.request_failures))
            self.request_failures += 1
            sleep(CapturingTask.sleep_time_failed_request)
self.update_status("Surrender")
logging.warning(f"A task has surrendered. {self.ad_url}")
def _extract_data(self) -> None:
"""
Parses the obtained html to extract the ad information.
:return: None
"""
self.parser = AdHtmlParser(self.html)
        self.parser.parse()

    def _check_data(self) -> None:
"""
        Validates that all compulsory fields have been obtained and that the
        values are within the expected ranges. Sets the status of the task
        accordingly.
:return: None
"""
if self.parser.fields_missing():
self.update_status("Fields missing")
return
if not self.parser.all_fields_are_valid():
self.update_status("Invalid value fields")
return
self.update_status("Data ready")
def get_ad_data(self) -> dict:
"""
Returns the extracted data.
:return: dictionary with the data of the ad.
"""
        return self._parsing_flow.field_values

    def _parse_html(self, html: str) -> None:
self._parsing_flow.execute_flow(soup=BeautifulSoup(html, "html5lib"))
if not self._parsing_flow.issues:
self.update_status("Data ready")
return
if not self._parsing_flow.all_found_fields_are_valid:
self.update_status("Invalid value fields")
logging.warning(f"Invalid fields found in ad: {self.ad_url}")
logging.warning(f"{self._parsing_flow.issues}")
return
if not self._parsing_flow.all_non_optional_fields_were_found:
self.update_status("Fields missing")
            logging.warning(
                f"Couldn't scrape necessary fields: {self._parsing_flow.issues}"
            )
            return


class AdHtmlParser:
"""
Object for parsing, storing and validating the data of the HTML of an ad.
"""
def __init__(self, html_string: str) -> None:
"""
Initializes an instance of the parser with the HTML of an ad.
:param html_string: the full HTML code of the ad page
"""
self.html = html_string
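        # Per-field parsing state: whether the field was found, whether it is
        # optional, and the extracted value. Missing non-optional fields make
        # fields_missing() return True.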
self.ad_fields = {
"referencia": {"found": False, "optional": False, "value": None},
"precio": {"found": False, "optional": False, "value": None},
"tamano_categorico": {"found": False, "optional": True, "value": None},
"m2": {"found": False, "optional": True, "value": None},
"tipo_anuncio": {"found": False, "optional": False, "value": None},
"calle": {"found": False, "optional": True, "value": None},
"barrio": {"found": False, "optional": False, "value": None},
"distrito": {"found": False, "optional": False, "value": None},
"ciudad": {"found": False, "optional": False, "value": None},
"cubierta": {"found": False, "optional": False, "value": None},
"puerta_auto": {"found": False, "optional": False, "value": None},
"ascensor": {"found": False, "optional": False, "value": None},
"alarma": {"found": False, "optional": False, "value": None},
"circuito": {"found": False, "optional": False, "value": None},
"personal": {"found": False, "optional": False, "value": None},
"telefono": {"found": False, "optional": True, "value": None},
        }

    def parse(self) -> None:
"""
Parses the HTML and stores the ad data.
:return: None
"""
soup = BeautifulSoup(self.html, "html5lib")
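        # html5lib is lenient with malformed markup, at the cost of speed.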
if soup.find_all("link", {"rel": "canonical"}) is not None:
self.ad_fields["referencia"]["value"] = re.findall(
r"[0-9]{5,20}", str(soup.find_all("link", {"rel": "canonical"})[0])
)[0]
self.ad_fields["referencia"]["found"] = True
if soup.find_all("strong", {"class": "price"}) is not None:
self.ad_fields["precio"]["value"] = "".join(
re.findall(
r"[0-9]", str(soup.find_all("strong", {"class": "price"})[0])
)
)
self.ad_fields["precio"]["found"] = True
if soup.find("div", {"class": "info-features"}) is not None:
try:
if (
""
not in soup.find("div", {"class": "info-features"})
.find("span")
.find("span")
.text
):
self.ad_fields["tamano_categorico"]["value"] = (
soup.find("div", {"class": "info-features"})
.find("span")
.find("span")
.text
)
self.ad_fields["tamano_categorico"]["found"] = True
except:
pass
posible_m2 = [
tag.text
for tag in soup.find("div", {"class": "info-features"}).find_all("span")
]
if [posible for posible in posible_m2 if "" in posible]:
self.ad_fields["m2"]["value"] = [
"".join(re.findall(r"[0-9]+,*[0-9]*", posible))
for posible in posible_m2
if "" in posible
][0].replace(",", ".")
self.ad_fields["m2"]["found"] = True
if soup.find("title") is not None:
if "venta" in soup.find("title").text:
self.ad_fields["tipo_anuncio"]["value"] = 1
else:
self.ad_fields["tipo_anuncio"]["value"] = 2
self.ad_fields["tipo_anuncio"]["found"] = True
if len(soup.find("div", {"id": "headerMap"}).find_all("li")) > 3:
self.ad_fields["calle"]["value"] = ""
self.ad_fields["ciudad"]["value"] = (
soup.find("div", {"id": "headerMap"}).find_all("li")[-2].text.strip()
)
self.ad_fields["ciudad"]["found"] = True
self.ad_fields["distrito"]["value"] = (
soup.find("div", {"id": "headerMap"}).find_all("li")[-3].text.strip()
)
self.ad_fields["distrito"]["found"] = True
self.ad_fields["barrio"]["value"] = (
soup.find("div", {"id": "headerMap"}).find_all("li")[-4].text.strip()
)
self.ad_fields["barrio"]["found"] = True
if len(soup.find("div", {"id": "headerMap"}).find_all("li")) > 4:
self.ad_fields["calle"]["value"] = (
soup.find("div", {"id": "headerMap"}).find_all("li")[0].text.strip()
)
self.ad_fields["calle"]["found"] = True
features_lists = soup.find_all("div", {"class": "details-property_features"})
features = [
feature.text
for feature_list in features_lists
for feature in feature_list.find_all("li")
]
self.ad_fields["cubierta"]["value"] = 1 * any(
"Cubierta" in feature for feature in features
)
self.ad_fields["puerta_auto"]["value"] = 1 * any(
"Puerta" in feature for feature in features
)
self.ad_fields["ascensor"]["value"] = 1 * any(
"ascensor" in feature for feature in features
)
self.ad_fields["alarma"]["value"] = 1 * any(
"Alarma" in feature for feature in features
)
self.ad_fields["circuito"]["value"] = 1 * any(
"Cámaras" in feature for feature in features
)
self.ad_fields["personal"]["value"] = 1 * any(
"Personal" in feature for feature in features
)
self.ad_fields["cubierta"]["found"] = True
self.ad_fields["puerta_auto"]["found"] = True
self.ad_fields["ascensor"]["found"] = True
self.ad_fields["alarma"]["found"] = True
self.ad_fields["circuito"]["found"] = True
self.ad_fields["personal"]["found"] = True
if soup.find("p", {"class": "txt-bold _browserPhone icon-phone"}) is not None:
self.ad_fields["telefono"]["value"] = soup.find(
"p", {"class": "txt-bold _browserPhone icon-phone"}
).text.replace(" ", "")
self.ad_fields["telefono"]["found"] = True
def _validate(self) -> None:
"""
        Checks whether the extracted values match the expected formats and
        value sets. Stores the results.
:return: None
"""
self.invalid_fields = []
if not re.match(r"[0-9]{4,20}", self.ad_fields["referencia"]["value"]):
self.invalid_fields.append("referencia")
if not re.match(r"[0-9]{1,20}", self.ad_fields["precio"]["value"]):
self.invalid_fields.append("precio")
possible_values_tamano = [
"2 coches o más",
"coche y moto",
"coche grande",
"coche pequeño",
"moto",
None,
]
if self.ad_fields["tamano_categorico"]["value"] not in possible_values_tamano:
self.invalid_fields.append("tamano_categorico")
if not "Barrio" in self.ad_fields["barrio"]["value"]:
self.invalid_fields.append("barrio")
if not "Distrito" in self.ad_fields["distrito"]["value"]:
self.invalid_fields.append("distrito")
if self.ad_fields["telefono"]["found"] and not re.match(
r"\s*\+?[0-9\s]*", self.ad_fields["telefono"]["value"]
):
self.invalid_fields.append("telefono")
# TODO añadir + a caracteres validos
def all_fields_are_valid(self) -> bool:
"""
Reports on whether the extracted data is valid.
:return: True if values are valid, false if not
"""
self._validate()
if self.invalid_fields:
return False
else:
return True
def fields_missing(self) -> bool:
"""
Reports on whether all compulsory fields are present.
:return: True if some field is missing, false if not
"""
for key, contents in self.ad_fields.items():
if not contents["optional"] and not contents["found"]:
return True
return False
def get_data(self) -> dict:
"""
Returns the extracted data in the form of a dictionary.
:return: dictionary with the extracted data
"""
data = {}
for ad_field in self.ad_fields.keys():
data[ad_field] = self.ad_fields[ad_field]["value"]
return data
if __name__ == "__main__":
capturing_tasks_interface = CapturingTasksInterface()
capturas_interface = CapturasInterface()
throttling_manager = ThrottleManager()
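    # Allow attempts only inside working hours, after a cooldown since the
    # last try, and while there is a pending task to work on.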
throttling_manager.add_rule(WorkingHoursThrottlingRule(working_hours)).add_rule(
CooldownThrottlingRule(minimum_seconds_between_tries),
required_argument_names=["last_attempt_timestamp"],
).add_rule(
DynamicThrottlingRule(
lambda: bool(capturing_tasks_interface.get_pending_task())
)
)
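    # One instruction class per ad field; the binary secondary features share
    # a single instruction class parameterized by field name and keyword.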
parsing_flow_generator = ParsingFlowGenerator(
ParsingFlow,
(
(ReferenciaFieldInstructions, {}),
(PrecioFieldInstructions, {}),
(TamanoCategoricoFieldInstructions, {}),
(M2FieldInstructions, {}),
(TipoAnuncioFieldInstructions, {}),
(CalleFieldInstructions, {}),
(BarrioFieldInstructions, {}),
(DistritoFieldInstructions, {}),
(CiudadFieldInstructions, {}),
(
SecondaryFeaturesFieldInstructions,
{"field_name": "cubierta", "search_keyword": "Cubierta"},
),
(
SecondaryFeaturesFieldInstructions,
{"field_name": "puerta_auto", "search_keyword": "Puerta"},
),
(
SecondaryFeaturesFieldInstructions,
{"field_name": "ascensor", "search_keyword": "ascensor"},
),
(
SecondaryFeaturesFieldInstructions,
{"field_name": "alarma", "search_keyword": "Alarma"},
),
(
SecondaryFeaturesFieldInstructions,
{"field_name": "circuito", "search_keyword": "Cámaras"},
),
(
SecondaryFeaturesFieldInstructions,
{"field_name": "personal", "search_keyword": "Personal"},
),
(TelefonoFieldInstructions, {}),
),
)
capturer = Capturer(
throttling_manager=throttling_manager,
capturing_tasks_interface=capturing_tasks_interface,
capturas_interface=capturas_interface,
parsing_flow_generator=parsing_flow_generator,
url_acquisition_object=UrlAttack,
dead_ad_checker=Refresher.dead_ad_checker,
)
capturer.start()