Minor fixes.

pablo 2021-01-03 20:05:34 +01:00
parent e34a34acaf
commit 007f458cd5
3 changed files with 140 additions and 39 deletions


@@ -1,13 +1,10 @@
 import sys
-sys.path.append("..")
 from time import sleep
-from bs4 import BeautifulSoup
-import re
 import datetime
-from db_layer.capturing_tasks_interface import capturing_interface
-from db_layer.capturas_interface import capturas_interface
+from db_layer.capturing_tasks_interface import CapturingTasksInterface
+from db_layer.capturas_interface import CapturasInterface
 from core.scrapping_utils import UrlAttack
 from core.config import working_hours, minimum_seconds_between_tries
 from core.throttling_utils import (
@@ -17,6 +14,7 @@ from core.throttling_utils import (
     DynamicThrottlingRule,
 )
 from refresher.refresher import Refresher
+from core.parsing_utils import *
 import logging
@@ -26,8 +24,22 @@ class Capturer:
     scraping and db storage.
     """

-    def __init__(self, throttling_manager: ThrottleManager) -> None:
+    def __init__(
+        self,
+        throttling_manager: ThrottleManager,
+        capturing_tasks_interface: CapturingTasksInterface,
+        capturas_interface: CapturasInterface,
+        parsing_flow_generator: ParsingFlowGenerator,
+        url_acquisition_object: Type[UrlAttack],
+        dead_ad_checker: Callable,
+    ) -> None:
         self._throttling_manager = throttling_manager
+        self._capturing_tasks_interface = capturing_tasks_interface
+        self._capturas_interface = capturas_interface
+        self._parsing_flow_generator = parsing_flow_generator
+        self._url_acquisition_object = url_acquisition_object
+        self._dead_ad_checker = dead_ad_checker
         self.last_try_datetime = datetime.datetime.now()

     def start(self) -> None:
@@ -46,11 +58,17 @@ class Capturer:
                 sleep(10)
                 logging.info("Waiting...")

-            pending_task = capturing_interface.get_pending_task()
+            pending_task = self._capturing_tasks_interface.get_pending_task()
             logging.info("Got a task")
-            task = CapturingTask(pending_task)
+            task = CapturingTask(
+                pending_task,
+                capturing_interface=self._capturing_tasks_interface,
+                new_parsing_flow=self._parsing_flow_generator.get_new_flow(),
+                url_acquisition_object=self._url_acquisition_object,
+                dead_ad_checker=self._dead_ad_checker,
+            )
             self.last_try_datetime = datetime.datetime.now()
             task.capture()
@@ -60,8 +78,8 @@ class Capturer:
                 logging.warning("Something went wrong, not adding data.")
                 continue

-            capturas_interface.insert_captura(ad_data)
-            task._update_status("Captura inserted")
+            self._capturas_interface.insert_captura(ad_data)
+            task.update_status("Captura inserted")
             logging.info("New ad inserted.")
@@ -73,29 +91,40 @@ class CapturingTask:
     sleep_time_failed_request = 180

-    def __init__(self, parameters) -> None:
+    def __init__(
+        self,
+        task_parameters: dict,
+        capturing_interface: CapturingTasksInterface,
+        new_parsing_flow: ParsingFlow,
+        url_acquisition_object: Type[UrlAttack],
+        dead_ad_checker: Callable,
+    ) -> None:
         """
         Initialize with task parameters and mark the task as being worked on
         in the task queue.

-        :param parameters: dict with the necessary parameters for the task
+        :param task_parameters: dict with the necessary parameters for the task
         """
-        self.uuid = parameters["uuid"]
-        self.ad_url = parameters["ad_url"]
-        self.uuid_exploring = parameters["fk_uuid_exploring"]
-        self.status = parameters["status"]
+        self.uuid = task_parameters["uuid"]
+        self.ad_url = task_parameters["ad_url"]
+        self.uuid_exploring = task_parameters["fk_uuid_exploring"]
+        self.status = task_parameters["status"]
         self.request_failures = 1
         self.html = None
+        self._parsing_flow = new_parsing_flow
+        self._capturing_interface = capturing_interface
+        self._url_acquisition_object = url_acquisition_object
+        self._is_dead_ad = dead_ad_checker
-        self._update_status("Loading")
+        self.update_status("Loading")

-    def _update_status(self, new_status) -> None:
+    def update_status(self, new_status) -> None:
         """
         Updates the task status and persists it in the task queue.

         :param new_status: string describing the new status
         :return: None
         """
         self.status = new_status
-        capturing_interface.update_capturing_task(
+        self._capturing_interface.update_capturing_task(
             self.uuid, self.uuid_exploring, self.status, self.ad_url
         )
@@ -103,34 +132,32 @@ class CapturingTask:
         """
         Main flow of work
         """
-        self._update_status("WIP")
+        self.update_status("WIP")
         while self.request_failures < 4:
-            attack = UrlAttack(self.ad_url)
+            attack = self._url_acquisition_object(self.ad_url)
             attack.attack()
             if attack.success:
-                self.html = attack.get_text()
-                self._extract_data()
-                self._check_data()
+                self._parse_html(html=attack.get_text())
                 return
             if not attack.success:
                 try:
-                    if Refresher.dead_ad_checker(attack.get_text()):
-                        self._update_status("Dead ad")
+                    if self._is_dead_ad(attack.get_text()):
+                        self.update_status("Dead ad")
                         return
                 except AttributeError:
                     logging.error(
                         "Something went wrong when checking if the ad is gone"
                     )
-                self._update_status("Fail {}".format(self.request_failures))
+                self.update_status("Fail {}".format(self.request_failures))
                 self.request_failures += 1
                 sleep(CapturingTask.sleep_time_failed_request)
                 continue
-        self._update_status("Surrender")
+        self.update_status("Surrender")
         logging.warning(f"A task has surrendered. {self.ad_url}")

     def _extract_data(self) -> None:
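Worth noting when reading the loop above: request_failures starts at 1, so the while guard allows at most three attempts. A minimal sketch of that retry shape, with try_fetch as a hypothetical stand-in for the injected attack object:

import time

MAX_FAILURES = 4           # mirrors: while self.request_failures < 4
SLEEP_BETWEEN_TRIES = 180  # mirrors: CapturingTask.sleep_time_failed_request

def fetch_with_retries(try_fetch) -> bool:
    # The counter starts at 1, so at most three attempts are made.
    failures = 1
    while failures < MAX_FAILURES:
        if try_fetch():
            return True
        failures += 1
        time.sleep(SLEEP_BETWEEN_TRIES)
    return False  # the real method sets the "Surrender" status here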
@@ -148,21 +175,40 @@ class CapturingTask:
         :return: None
         """
         if self.parser.fields_missing():
-            self._update_status("Fields missing")
+            self.update_status("Fields missing")
             return
         if not self.parser.all_fields_are_valid():
-            self._update_status("Invalid value fields")
+            self.update_status("Invalid value fields")
             return
-        self._update_status("Data ready")
+        self.update_status("Data ready")

     def get_ad_data(self) -> dict:
         """
         Returns the extracted data.

         :return: dictionary with the data of the ad.
         """
-        return self.parser.get_data()
+        return self._parsing_flow.field_values
+
+    def _parse_html(self, html: str) -> None:
+        self._parsing_flow.execute_flow(soup=BeautifulSoup(html, "html5lib"))
+        if not self._parsing_flow.issues:
+            self.update_status("Data ready")
+            return
+        if not self._parsing_flow.all_found_fields_are_valid:
+            self.update_status("Invalid value fields")
+            logging.warning(f"Invalid fields found in ad: {self.ad_url}")
+            logging.warning(f"{self._parsing_flow.issues}")
+            return
+        if not self._parsing_flow.all_non_optional_fields_were_found:
+            self.update_status("Fields missing")
+            logging.warning(
+                f"Couldn't scrape necessary fields: {self._parsing_flow.issues}"
+            )
+            return

 class AdHtmlParser:
@@ -362,7 +408,7 @@ class AdHtmlParser:
         else:
             return True

-    def fields_missing(self) -> None:
+    def fields_missing(self) -> bool:
         """
         Reports on whether all compulsory fields are present.

         :return: True if some field is missing, False if not
@@ -387,13 +433,65 @@ class AdHtmlParser:

 if __name__ == "__main__":
+    capturing_tasks_interface = CapturingTasksInterface()
+    capturas_interface = CapturasInterface()
     throttling_manager = ThrottleManager()
     throttling_manager.add_rule(WorkingHoursThrottlingRule(working_hours)).add_rule(
         CooldownThrottlingRule(minimum_seconds_between_tries),
         required_argument_names=["last_attempt_timestamp"],
     ).add_rule(
-        DynamicThrottlingRule(lambda: bool(capturing_interface.get_pending_task()))
+        DynamicThrottlingRule(
+            lambda: bool(capturing_tasks_interface.get_pending_task())
+        )
     )
-    capturer = Capturer(throttling_manager=throttling_manager)
+    parsing_flow_generator = ParsingFlowGenerator(
+        ParsingFlow,
+        (
+            (ReferenciaFieldInstructions, {}),
+            (PrecioFieldInstructions, {}),
+            (TamanoCategoricoFieldInstructions, {}),
+            (M2FieldInstructions, {}),
+            (TipoAnuncioFieldInstructions, {}),
+            (CalleFieldInstructions, {}),
+            (BarrioFieldInstructions, {}),
+            (DistritoFieldInstructions, {}),
+            (CiudadFieldInstructions, {}),
+            (
+                SecondaryFeaturesFieldInstructions,
+                {"field_name": "cubierta", "search_keyword": "Cubierta"},
+            ),
+            (
+                SecondaryFeaturesFieldInstructions,
+                {"field_name": "puerta_auto", "search_keyword": "Puerta"},
+            ),
+            (
+                SecondaryFeaturesFieldInstructions,
+                {"field_name": "ascensor", "search_keyword": "ascensor"},
+            ),
+            (
+                SecondaryFeaturesFieldInstructions,
+                {"field_name": "alarma", "search_keyword": "Alarma"},
+            ),
+            (
+                SecondaryFeaturesFieldInstructions,
+                {"field_name": "circuito", "search_keyword": "Cámaras"},
+            ),
+            (
+                SecondaryFeaturesFieldInstructions,
+                {"field_name": "personal", "search_keyword": "Personal"},
+            ),
+            (TelefonoFieldInstructions, {}),
+        ),
+    )
+    capturer = Capturer(
+        throttling_manager=throttling_manager,
+        capturing_tasks_interface=capturing_tasks_interface,
+        capturas_interface=capturas_interface,
+        parsing_flow_generator=parsing_flow_generator,
+        url_acquisition_object=UrlAttack,
+        dead_ad_checker=Refresher.dead_ad_checker,
+    )
     capturer.start()
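The point of the new constructor arguments is that every collaborator of Capturer and CapturingTask is now injected rather than reached for at module level. A minimal sketch (not part of this commit) of how those seams could be faked in a test; FakeAttack and FakeTaskQueue are hypothetical stand-ins for UrlAttack and CapturingTasksInterface:

class FakeAttack:
    # Stands in for UrlAttack: pretends every request succeeds with canned HTML.
    def __init__(self, url):
        self.url = url
        self.success = False

    def attack(self):
        self.success = True

    def get_text(self):
        return "<html><head><title>venta de garaje</title></head></html>"

class FakeTaskQueue:
    # Stands in for CapturingTasksInterface: records status updates in memory.
    def __init__(self):
        self.updates = []

    def update_capturing_task(self, uuid, uuid_exploring, status, ad_url):
        self.updates.append(status)

Passing FakeAttack as url_acquisition_object and a FakeTaskQueue instance as capturing_interface lets CapturingTask.capture() run without touching the network or the real task queue.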


@@ -258,7 +258,7 @@ class TipoAnuncioFieldInstructions(BaseTargetFieldInstructions):
         if "venta" in soup.find("title").text:
             self.value = 1
             self.found = True
-        if "alquiler" in soup.find("title").text:
+        if "Alquiler" in soup.find("title").text:
             self.value = 2
             self.found = True
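The fix above matches the capitalized "Alquiler" the site apparently uses in its titles. If the casing is not guaranteed, a case-insensitive comparison (a sketch, not what this commit does) would cover both spellings:

title = soup.find("title").text.lower()
if "venta" in title:
    self.value = 1
    self.found = True
if "alquiler" in title:
    self.value = 2
    self.found = True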
@@ -542,11 +542,11 @@ class ParsingFlow:
             if (field.found or field.is_optional) and field.valid:
                 continue
             this_field_issues = {}
-            if not field.found:
+            if not field.found and not field.is_optional:
                 this_field_issues["found"] = "Not found"
             if field.search_issue:
                 this_field_issues["search_issue"] = field.search_issue
-            if not field.valid:
+            if not field.valid and field.valid is not None:
                 this_field_issues["validity"] = "Not valid"
                 this_field_issues["value"] = field.value
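The two new guards change what counts as an issue: a field that was not found but is optional is no longer reported, and valid=None (validation never ran) is no longer conflated with valid=False. A minimal sketch of that predicate in isolation, with Field as a hypothetical stand-in for the flow's field objects:

from dataclasses import dataclass
from typing import Optional

@dataclass
class Field:
    found: bool
    is_optional: bool
    valid: Optional[bool]  # None when validation never ran
    value: object = None
    search_issue: str = ""

def field_issues(field: Field) -> dict:
    issues = {}
    if not field.found and not field.is_optional:
        issues["found"] = "Not found"
    if field.search_issue:
        issues["search_issue"] = field.search_issue
    if not field.valid and field.valid is not None:
        issues["validity"] = "Not valid"
        issues["value"] = field.value
    return issues

# A missing optional field is clean; a found-but-invalid field still reports.
assert field_issues(Field(found=False, is_optional=True, valid=None)) == {}
assert "validity" in field_issues(Field(found=True, is_optional=False, valid=False))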


@@ -95,7 +95,10 @@ class UrlAttack:
         except Exception as e:
             self.success = False

-        if random.randrange(0, 100) < UrlAttack.identity_change_probability:
+        if (
+            not self.success
+            or random.randrange(0, 100) < UrlAttack.identity_change_probability
+        ):
             self._change_identity()

     def _change_identity(self) -> None:
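The widened condition means a failed request now always rotates the identity, while successful requests keep rotating it at random. The decision rule in isolation (a sketch; the probability is whatever UrlAttack.identity_change_probability is set to elsewhere in the class):

import random

def should_change_identity(success: bool, change_probability: int) -> bool:
    # Always rotate after a failure; otherwise rotate with the configured probability.
    return (not success) or random.randrange(0, 100) < change_probability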