diff --git a/capturer/capturer.py b/capturer/capturer.py index 3acda04..3b25056 100644 --- a/capturer/capturer.py +++ b/capturer/capturer.py @@ -5,12 +5,18 @@ from time import sleep from bs4 import BeautifulSoup import re import datetime + from db_layer.capturing_tasks_interface import capturing_interface from db_layer.capturas_interface import capturas_interface from core.scrapping_utils import UrlAttack from core.config import working_hours, minimum_seconds_between_tries +from core.throttling_utils import ( + ThrottleManager, + WorkingHoursThrottlingRule, + CooldownThrottlingRule, + DynamicThrottlingRule, +) from refresher.refresher import Refresher -from core import my_logger import logging @@ -20,7 +26,8 @@ class Capturer: scraping and db storage. """ - def __init__(self) -> None: + def __init__(self, throttling_manager: ThrottleManager) -> None: + self._throttling_manager = throttling_manager self.last_try_datetime = datetime.datetime.now() def start(self) -> None: @@ -33,22 +40,15 @@ class Capturer: logging.info("Starting capturer") while True: - if not self._in_working_hours(): - sleep(1800) - logging.info("Waiting...") - continue - - seconds_to_next_capture = ( - minimum_seconds_between_tries() - self._seconds_since_last_try() - ) - if seconds_to_next_capture > 0: - sleep(seconds_to_next_capture) + while not self._throttling_manager.allow_next_task( + last_attempt_timestamp=self.last_try_datetime + ): + sleep(10) logging.info("Waiting...") pending_task = capturing_interface.get_pending_task() - if not pending_task: - logging.info("No pending tasks.") - continue + + logging.info("Got a task") task = CapturingTask(pending_task) self.last_try_datetime = datetime.datetime.now() @@ -64,25 +64,6 @@ class Capturer: task._update_status("Captura inserted") logging.info("New ad inserted.") - @staticmethod - def _in_working_hours() -> bool: - """ - Checks whether now is within the working hours of the daemon. - :return: True if so, false if not - """ - return ( - working_hours["start"] - <= datetime.datetime.now().time() - <= working_hours["end"] - ) - - def _seconds_since_last_try(self) -> float: - """ - Computes how many seconds have passed since the last capturing attempt - :return: seconds since last try as integer - """ - return (datetime.datetime.now() - self.last_try_datetime).total_seconds() - class CapturingTask: """ @@ -405,5 +386,14 @@ class AdHtmlParser: if __name__ == "__main__": - capturer = Capturer() + + throttling_manager = ThrottleManager() + throttling_manager.add_rule(WorkingHoursThrottlingRule(working_hours)).add_rule( + CooldownThrottlingRule(minimum_seconds_between_tries), + required_argument_names=["last_attempt_timestamp"], + ).add_rule( + DynamicThrottlingRule(lambda: bool(capturing_interface.get_pending_task())) + ) + + capturer = Capturer(throttling_manager=throttling_manager) capturer.start()