Integrated throttling in capturer.
This commit is contained in:
parent
d136144a4e
commit
3f9a6d8e53
1 changed files with 25 additions and 35 deletions
|
|
@ -5,12 +5,18 @@ from time import sleep
|
|||
from bs4 import BeautifulSoup
|
||||
import re
|
||||
import datetime
|
||||
|
||||
from db_layer.capturing_tasks_interface import capturing_interface
|
||||
from db_layer.capturas_interface import capturas_interface
|
||||
from core.scrapping_utils import UrlAttack
|
||||
from core.config import working_hours, minimum_seconds_between_tries
|
||||
from core.throttling_utils import (
|
||||
ThrottleManager,
|
||||
WorkingHoursThrottlingRule,
|
||||
CooldownThrottlingRule,
|
||||
DynamicThrottlingRule,
|
||||
)
|
||||
from refresher.refresher import Refresher
|
||||
from core import my_logger
|
||||
import logging
|
||||
|
||||
|
||||
|
|
@ -20,7 +26,8 @@ class Capturer:
|
|||
scraping and db storage.
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
def __init__(self, throttling_manager: ThrottleManager) -> None:
|
||||
self._throttling_manager = throttling_manager
|
||||
self.last_try_datetime = datetime.datetime.now()
|
||||
|
||||
def start(self) -> None:
|
||||
|
|
@ -33,22 +40,15 @@ class Capturer:
|
|||
logging.info("Starting capturer")
|
||||
while True:
|
||||
|
||||
if not self._in_working_hours():
|
||||
sleep(1800)
|
||||
logging.info("Waiting...")
|
||||
continue
|
||||
|
||||
seconds_to_next_capture = (
|
||||
minimum_seconds_between_tries() - self._seconds_since_last_try()
|
||||
)
|
||||
if seconds_to_next_capture > 0:
|
||||
sleep(seconds_to_next_capture)
|
||||
while not self._throttling_manager.allow_next_task(
|
||||
last_attempt_timestamp=self.last_try_datetime
|
||||
):
|
||||
sleep(10)
|
||||
logging.info("Waiting...")
|
||||
|
||||
pending_task = capturing_interface.get_pending_task()
|
||||
if not pending_task:
|
||||
logging.info("No pending tasks.")
|
||||
continue
|
||||
|
||||
logging.info("Got a task")
|
||||
|
||||
task = CapturingTask(pending_task)
|
||||
self.last_try_datetime = datetime.datetime.now()
|
||||
|
|
@ -64,25 +64,6 @@ class Capturer:
|
|||
task._update_status("Captura inserted")
|
||||
logging.info("New ad inserted.")
|
||||
|
||||
@staticmethod
|
||||
def _in_working_hours() -> bool:
|
||||
"""
|
||||
Checks whether now is within the working hours of the daemon.
|
||||
:return: True if so, false if not
|
||||
"""
|
||||
return (
|
||||
working_hours["start"]
|
||||
<= datetime.datetime.now().time()
|
||||
<= working_hours["end"]
|
||||
)
|
||||
|
||||
def _seconds_since_last_try(self) -> float:
|
||||
"""
|
||||
Computes how many seconds have passed since the last capturing attempt
|
||||
:return: seconds since last try as integer
|
||||
"""
|
||||
return (datetime.datetime.now() - self.last_try_datetime).total_seconds()
|
||||
|
||||
|
||||
class CapturingTask:
|
||||
"""
|
||||
|
|
@ -405,5 +386,14 @@ class AdHtmlParser:
|
|||
|
||||
|
||||
if __name__ == "__main__":
|
||||
capturer = Capturer()
|
||||
|
||||
throttling_manager = ThrottleManager()
|
||||
throttling_manager.add_rule(WorkingHoursThrottlingRule(working_hours)).add_rule(
|
||||
CooldownThrottlingRule(minimum_seconds_between_tries),
|
||||
required_argument_names=["last_attempt_timestamp"],
|
||||
).add_rule(
|
||||
DynamicThrottlingRule(lambda: bool(capturing_interface.get_pending_task()))
|
||||
)
|
||||
|
||||
capturer = Capturer(throttling_manager=throttling_manager)
|
||||
capturer.start()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue