import sys

sys.path.append("..")

from time import sleep

from bs4 import BeautifulSoup

import re
import datetime

from db_layer.capturing_tasks_interface import capturing_interface
from db_layer.capturas_interface import capturas_interface
from core.scrapping_utils import UrlAttack
from core.config import working_hours, minimum_seconds_between_tries
from core.throttling_utils import (
    ThrottleManager,
    WorkingHoursThrottlingRule,
    CooldownThrottlingRule,
    DynamicThrottlingRule,
)
from refresher.refresher import Refresher
import logging


class Capturer:
    """
    Daemon running the full execution flow: requesting individual ads,
    scraping their data and storing them in the db.
    """

    def __init__(self, throttling_manager: ThrottleManager) -> None:
        self._throttling_manager = throttling_manager
        self.last_try_datetime = datetime.datetime.now()

    def start(self) -> None:
        """
        Full flow of execution. Checks whether it should capture a URL, tries
        to do so and stores the result if successful.
        :return: None
        """
        logging.info("Starting capturer")
        while True:
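            # Wait until the throttling manager allows the next capture attempt.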
            while not self._throttling_manager.allow_next_task(
                last_attempt_timestamp=self.last_try_datetime
            ):
                sleep(10)
                logging.info("Waiting...")

            pending_task = capturing_interface.get_pending_task()
            logging.info("Got a task")

            task = CapturingTask(pending_task)
            self.last_try_datetime = datetime.datetime.now()
            task.capture()

            if task.status == "Data ready":
                ad_data = task.get_ad_data()
            else:
                logging.warning("Something went wrong, not adding data.")
                continue

            capturas_interface.insert_captura(ad_data)
            task._update_status("Captura inserted")
            logging.info("New ad inserted.")


class CapturingTask:
    """
    Task object wrapping the process of attempting to capture an ad, parsing
    the data and sending it to the db.
    """

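    # Seconds to wait before retrying after a failed request.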
    sleep_time_failed_request = 180

    def __init__(self, parameters) -> None:
        """
        Initialize with task parameters and mark the task as being worked on
        in the task queue.
        :param parameters: dict with the necessary parameters for the task
        """
        self.uuid = parameters["uuid"]
        self.ad_url = parameters["ad_url"]
        self.uuid_exploring = parameters["fk_uuid_exploring"]
        self.status = parameters["status"]
        self.request_failures = 1
        self.html = None

        self._update_status("Loading")

    def _update_status(self, new_status) -> None:
        """
        Updates the task status and persists it in the task queue.
        :param new_status: string describing the new status
        :return: None
        """
        self.status = new_status
        capturing_interface.update_capturing_task(
            self.uuid, self.uuid_exploring, self.status, self.ad_url
        )

    def capture(self) -> None:
        """
        Main flow of work.
        """
        self._update_status("WIP")

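        # Try the URL up to three times (request_failures runs 1-3) before surrendering.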
        while self.request_failures < 4:
            attack = UrlAttack(self.ad_url)
            attack.attack()

            if attack.success:
                self.html = attack.get_text()
                self._extract_data()
                self._check_data()
                return

            if not attack.success:
                try:
                    if Refresher.dead_ad_checker(attack.get_text()):
                        self._update_status("Dead ad")
                        return
                except AttributeError:
                    logging.error(
                        "Something went wrong when checking if the ad is gone"
                    )

                self._update_status("Fail {}".format(self.request_failures))
                self.request_failures += 1
                sleep(CapturingTask.sleep_time_failed_request)
                continue

        self._update_status("Surrender")
        logging.warning(f"A task has surrendered. {self.ad_url}")

    def _extract_data(self) -> None:
        """
        Parses the obtained html to extract the ad information.
        :return: None
        """
        self.parser = AdHtmlParser(self.html)
        self.parser.parse()

    def _check_data(self) -> None:
        """
        Validates that all compulsory fields have been obtained and that the
        values are within the expected ranges. Sets the status of the task
        accordingly.
        :return: None
        """
        if self.parser.fields_missing():
            self._update_status("Fields missing")
            return

        if not self.parser.all_fields_are_valid():
            self._update_status("Invalid value fields")
            return

        self._update_status("Data ready")

    def get_ad_data(self) -> dict:
        """
        Returns the extracted data.
        :return: dictionary with the data of the ad.
        """
        return self.parser.get_data()


class AdHtmlParser:
    """
    Object for parsing, storing and validating the data of the HTML of an ad.
    """

    def __init__(self, html_string: str) -> None:
        """
        Initializes an instance of the parser with the HTML of an ad.
        :param html_string: the full HTML code of the ad page
        """
        self.html = html_string

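        # Each entry records whether the field was found in the HTML, whether it
        # is optional (missing compulsory fields block insertion) and the value.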
        self.ad_fields = {
            "referencia": {"found": False, "optional": False, "value": None},
            "precio": {"found": False, "optional": False, "value": None},
            "tamano_categorico": {"found": False, "optional": True, "value": None},
            "m2": {"found": False, "optional": True, "value": None},
            "tipo_anuncio": {"found": False, "optional": False, "value": None},
            "calle": {"found": False, "optional": True, "value": None},
            "barrio": {"found": False, "optional": False, "value": None},
            "distrito": {"found": False, "optional": False, "value": None},
            "ciudad": {"found": False, "optional": False, "value": None},
            "cubierta": {"found": False, "optional": False, "value": None},
            "puerta_auto": {"found": False, "optional": False, "value": None},
            "ascensor": {"found": False, "optional": False, "value": None},
            "alarma": {"found": False, "optional": False, "value": None},
            "circuito": {"found": False, "optional": False, "value": None},
            "personal": {"found": False, "optional": False, "value": None},
            "telefono": {"found": False, "optional": True, "value": None},
        }

    def parse(self) -> None:
        """
        Parses the HTML and stores the ad data.
        :return: None
        """
        soup = BeautifulSoup(self.html, "html5lib")

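        # The ad reference id is the numeric part of the canonical link URL.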
if soup.find_all("link", {"rel": "canonical"}) is not None:
|
|
|
|
|
self.ad_fields["referencia"]["value"] = re.findall(
|
|
|
|
|
r"[0-9]{5,20}", str(soup.find_all("link", {"rel": "canonical"})[0])
|
|
|
|
|
)[0]
|
|
|
|
|
self.ad_fields["referencia"]["found"] = True
|
2018-10-06 19:09:44 +02:00
|
|
|
|
2020-03-26 11:37:32 +01:00
|
|
|
if soup.find_all("strong", {"class": "price"}) is not None:
|
|
|
|
|
self.ad_fields["precio"]["value"] = "".join(
|
|
|
|
|
re.findall(
|
|
|
|
|
r"[0-9]", str(soup.find_all("strong", {"class": "price"})[0])
|
|
|
|
|
)
|
|
|
|
|
)
|
|
|
|
|
self.ad_fields["precio"]["found"] = True
|
2018-10-06 19:09:44 +02:00
|
|
|
|
2020-03-26 11:37:32 +01:00
|
|
|
if soup.find("div", {"class": "info-features"}) is not None:
|
2018-10-13 02:08:58 +02:00
|
|
|
try:
|
2020-03-26 11:37:32 +01:00
|
|
|
if (
|
|
|
|
|
"m²"
|
|
|
|
|
not in soup.find("div", {"class": "info-features"})
|
|
|
|
|
.find("span")
|
|
|
|
|
.find("span")
|
|
|
|
|
.text
|
|
|
|
|
):
|
|
|
|
|
self.ad_fields["tamano_categorico"]["value"] = (
|
|
|
|
|
soup.find("div", {"class": "info-features"})
|
|
|
|
|
.find("span")
|
|
|
|
|
.find("span")
|
|
|
|
|
.text
|
|
|
|
|
)
|
|
|
|
|
self.ad_fields["tamano_categorico"]["found"] = True
|
2018-10-13 02:08:58 +02:00
|
|
|
except:
|
|
|
|
|
pass
|
2018-10-06 19:09:44 +02:00
|
|
|
|
2020-03-26 11:37:32 +01:00
|
|
|
posible_m2 = [
|
|
|
|
|
tag.text
|
|
|
|
|
for tag in soup.find("div", {"class": "info-features"}).find_all("span")
|
|
|
|
|
]
|
|
|
|
|
if [posible for posible in posible_m2 if "m²" in posible]:
|
|
|
|
|
self.ad_fields["m2"]["value"] = [
|
|
|
|
|
"".join(re.findall(r"[0-9]+,*[0-9]*", posible))
|
|
|
|
|
for posible in posible_m2
|
|
|
|
|
if "m²" in posible
|
|
|
|
|
][0].replace(",", ".")
|
|
|
|
|
self.ad_fields["m2"]["found"] = True
|
|
|
|
|
|
|
|
|
|
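        # Ad type inferred from the page title: 1 when "venta" appears, 2 otherwise.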
if soup.find("title") is not None:
|
|
|
|
|
if "venta" in soup.find("title").text:
|
|
|
|
|
self.ad_fields["tipo_anuncio"]["value"] = 1
|
2018-10-12 20:08:18 +02:00
|
|
|
else:
|
2020-03-26 11:37:32 +01:00
|
|
|
self.ad_fields["tipo_anuncio"]["value"] = 2
|
|
|
|
|
self.ad_fields["tipo_anuncio"]["found"] = True
|
|
|
|
|
|
|
|
|
|
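        # Location hierarchy (ciudad, distrito, barrio and, when present, calle)
        # comes from the breadcrumb list inside the headerMap div.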
        if len(soup.find("div", {"id": "headerMap"}).find_all("li")) > 3:
            self.ad_fields["calle"]["value"] = ""
            self.ad_fields["ciudad"]["value"] = (
                soup.find("div", {"id": "headerMap"}).find_all("li")[-2].text.strip()
            )
            self.ad_fields["ciudad"]["found"] = True
            self.ad_fields["distrito"]["value"] = (
                soup.find("div", {"id": "headerMap"}).find_all("li")[-3].text.strip()
            )
            self.ad_fields["distrito"]["found"] = True
            self.ad_fields["barrio"]["value"] = (
                soup.find("div", {"id": "headerMap"}).find_all("li")[-4].text.strip()
            )
            self.ad_fields["barrio"]["found"] = True
            if len(soup.find("div", {"id": "headerMap"}).find_all("li")) > 4:
                self.ad_fields["calle"]["value"] = (
                    soup.find("div", {"id": "headerMap"}).find_all("li")[0].text.strip()
                )
                self.ad_fields["calle"]["found"] = True

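        # Binary (1/0) feature flags derived from the details-property_features lists.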
        features_lists = soup.find_all("div", {"class": "details-property_features"})
        features = [
            feature.text
            for feature_list in features_lists
            for feature in feature_list.find_all("li")
        ]
        self.ad_fields["cubierta"]["value"] = 1 * any(
            "Cubierta" in feature for feature in features
        )
        self.ad_fields["puerta_auto"]["value"] = 1 * any(
            "Puerta" in feature for feature in features
        )
        self.ad_fields["ascensor"]["value"] = 1 * any(
            "ascensor" in feature for feature in features
        )
        self.ad_fields["alarma"]["value"] = 1 * any(
            "Alarma" in feature for feature in features
        )
        self.ad_fields["circuito"]["value"] = 1 * any(
            "Cámaras" in feature for feature in features
        )
        self.ad_fields["personal"]["value"] = 1 * any(
            "Personal" in feature for feature in features
        )

        self.ad_fields["cubierta"]["found"] = True
        self.ad_fields["puerta_auto"]["found"] = True
        self.ad_fields["ascensor"]["found"] = True
        self.ad_fields["alarma"]["found"] = True
        self.ad_fields["circuito"]["found"] = True
        self.ad_fields["personal"]["found"] = True

        if soup.find("p", {"class": "txt-bold _browserPhone icon-phone"}) is not None:
            self.ad_fields["telefono"]["value"] = soup.find(
                "p", {"class": "txt-bold _browserPhone icon-phone"}
            ).text.replace(" ", "")
            self.ad_fields["telefono"]["found"] = True

    def _validate(self) -> None:
        """
        Checks whether the extracted values are valid against the expected
        typology. Stores the results.
        :return: None
        """
        self.invalid_fields = []

        if not re.match(r"[0-9]{4,20}", self.ad_fields["referencia"]["value"]):
            self.invalid_fields.append("referencia")

        if not re.match(r"[0-9]{1,20}", self.ad_fields["precio"]["value"]):
            self.invalid_fields.append("precio")

        possible_values_tamano = [
            "2 coches o más",
            "coche y moto",
            "coche grande",
            "coche pequeño",
            "moto",
            None,
        ]
        if self.ad_fields["tamano_categorico"]["value"] not in possible_values_tamano:
            self.invalid_fields.append("tamano_categorico")

        if "Barrio" not in self.ad_fields["barrio"]["value"]:
            self.invalid_fields.append("barrio")

        if "Distrito" not in self.ad_fields["distrito"]["value"]:
            self.invalid_fields.append("distrito")

        if self.ad_fields["telefono"]["found"] and not re.match(
            r"\s*\+?[0-9\s]*", self.ad_fields["telefono"]["value"]
        ):
            self.invalid_fields.append("telefono")
        # TODO: add "+" to the set of valid characters

    def all_fields_are_valid(self) -> bool:
        """
        Reports on whether the extracted data is valid.
        :return: True if all values are valid, False if not
        """
        self._validate()
        if self.invalid_fields:
            return False
        else:
            return True

    def fields_missing(self) -> bool:
        """
        Reports on whether all compulsory fields are present.
        :return: True if some compulsory field is missing, False if not
        """
        for contents in self.ad_fields.values():
            if not contents["optional"] and not contents["found"]:
                return True
        return False

    def get_data(self) -> dict:
        """
        Returns the extracted data in the form of a dictionary.
        :return: dictionary with the extracted data
        """
        data = {}

        for ad_field in self.ad_fields.keys():
            data[ad_field] = self.ad_fields[ad_field]["value"]

        return data


if __name__ == "__main__":
    throttling_manager = ThrottleManager()

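    # Rule chain: a working-hours window, a minimum cooldown between tries, and a
    # dynamic rule gated on whether a pending capturing task exists.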
    throttling_manager.add_rule(WorkingHoursThrottlingRule(working_hours)).add_rule(
        CooldownThrottlingRule(minimum_seconds_between_tries),
        required_argument_names=["last_attempt_timestamp"],
    ).add_rule(
        DynamicThrottlingRule(lambda: bool(capturing_interface.get_pending_task()))
    )

    capturer = Capturer(throttling_manager=throttling_manager)
    capturer.start()