Integrated throttling in capturer.

This commit is contained in:
pablo 2020-12-27 12:35:02 +01:00
parent d136144a4e
commit 3f9a6d8e53

View file

@ -5,12 +5,18 @@ from time import sleep
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import re import re
import datetime import datetime
from db_layer.capturing_tasks_interface import capturing_interface from db_layer.capturing_tasks_interface import capturing_interface
from db_layer.capturas_interface import capturas_interface from db_layer.capturas_interface import capturas_interface
from core.scrapping_utils import UrlAttack from core.scrapping_utils import UrlAttack
from core.config import working_hours, minimum_seconds_between_tries from core.config import working_hours, minimum_seconds_between_tries
from core.throttling_utils import (
ThrottleManager,
WorkingHoursThrottlingRule,
CooldownThrottlingRule,
DynamicThrottlingRule,
)
from refresher.refresher import Refresher from refresher.refresher import Refresher
from core import my_logger
import logging import logging
@ -20,7 +26,8 @@ class Capturer:
scraping and db storage. scraping and db storage.
""" """
def __init__(self) -> None: def __init__(self, throttling_manager: ThrottleManager) -> None:
self._throttling_manager = throttling_manager
self.last_try_datetime = datetime.datetime.now() self.last_try_datetime = datetime.datetime.now()
def start(self) -> None: def start(self) -> None:
@ -33,22 +40,15 @@ class Capturer:
logging.info("Starting capturer") logging.info("Starting capturer")
while True: while True:
if not self._in_working_hours(): while not self._throttling_manager.allow_next_task(
sleep(1800) last_attempt_timestamp=self.last_try_datetime
logging.info("Waiting...") ):
continue sleep(10)
seconds_to_next_capture = (
minimum_seconds_between_tries() - self._seconds_since_last_try()
)
if seconds_to_next_capture > 0:
sleep(seconds_to_next_capture)
logging.info("Waiting...") logging.info("Waiting...")
pending_task = capturing_interface.get_pending_task() pending_task = capturing_interface.get_pending_task()
if not pending_task:
logging.info("No pending tasks.") logging.info("Got a task")
continue
task = CapturingTask(pending_task) task = CapturingTask(pending_task)
self.last_try_datetime = datetime.datetime.now() self.last_try_datetime = datetime.datetime.now()
@ -64,25 +64,6 @@ class Capturer:
task._update_status("Captura inserted") task._update_status("Captura inserted")
logging.info("New ad inserted.") logging.info("New ad inserted.")
@staticmethod
def _in_working_hours() -> bool:
"""
Checks whether now is within the working hours of the daemon.
:return: True if so, false if not
"""
return (
working_hours["start"]
<= datetime.datetime.now().time()
<= working_hours["end"]
)
def _seconds_since_last_try(self) -> float:
"""
Computes how many seconds have passed since the last capturing attempt
:return: seconds since last try as integer
"""
return (datetime.datetime.now() - self.last_try_datetime).total_seconds()
class CapturingTask: class CapturingTask:
""" """
@ -405,5 +386,14 @@ class AdHtmlParser:
if __name__ == "__main__": if __name__ == "__main__":
capturer = Capturer()
throttling_manager = ThrottleManager()
throttling_manager.add_rule(WorkingHoursThrottlingRule(working_hours)).add_rule(
CooldownThrottlingRule(minimum_seconds_between_tries),
required_argument_names=["last_attempt_timestamp"],
).add_rule(
DynamicThrottlingRule(lambda: bool(capturing_interface.get_pending_task()))
)
capturer = Capturer(throttling_manager=throttling_manager)
capturer.start() capturer.start()