# drogon/explorer/explorer.py
# -*- coding: utf-8 -*-
import sys
sys.path.append("..")
import uuid
import datetime
from time import sleep
from bs4 import BeautifulSoup
import re
from random import randint
import mysql.connector
from core.mysql_wrapper import get_anunciosdb, get_tasksdb
from core.config import monthly_new_ads_target, working_hours
from core.scrapping_utils import UrlAttack
from core.alerts import alert_master
from db_layer.capturing_tasks_interface import capturing_interface
from core import my_logger
import logging
class Explorer:
    """
    Daemon with the full flow of execution of generating a listing page url,
    requesting the page, scraping the ad references and storing logs in the
    task database.
    """

    # Seconds to wait when there is currently no work to do.
    sleep_time_no_work = 60
    # Seconds to wait between retries while the database is unreachable.
    sleep_time_no_service = 600

    # Maps a random draw ("1"/"2") to the listing type segment of the URL.
    ad_types = {"1": "alquiler", "2": "venta"}

    def __init__(self) -> None:
        """
        Connect to the ads and tasks databases and set up retry counters.
        """
        try:
            self.anunciosdb = get_anunciosdb()
            self.tasksdb = get_tasksdb()
        except Exception:
            # Narrowed from a bare `except:` (which also swallowed
            # KeyboardInterrupt/SystemExit).
            # NOTE(review): execution continues without DB handles, so later
            # attribute access will fail with AttributeError — consider
            # re-raising here instead.
            print("Could not connect to anuncios DB")
        self.max_db_retries = 3
        self.db_retries = 0
        self.max_queue_retries = 3
        self.queue_retries = 0

    def start(self) -> None:
        """
        Full flow of execution. Checks whether it should capture a URL, tries
        to do so and stores the result if successful.

        :raises ConnectionError: when the database stays unreachable after
            the configured retries
        :return: None
        """
        logging.info("Starting explorer")
        while True:
            if not self._is_there_work():
                print("{}: Waiting. No work".format(datetime.datetime.now()))
                sleep(Explorer.sleep_time_no_work)
                continue
            logging.info("Waiting")
            if not self._database_is_up():
                alert_master(
                    "SQL DOWN",
                    "El explorer informa de que SQL esta caida. Actividad detenida",
                )
                raise ConnectionError("Unable to connect to database")
            current_task = ExploringTask(self._compose_listing_url())
            current_task.explore()
            logging.info("Exploring task done...")
            if current_task.status == "Referencias ready":
                referencias = current_task.get_referencias()
                for referencia in referencias:
                    capturing_interface.create_capturing_task(
                        referencia, current_task.id
                    )
                current_task._update_status("Sent to queue")
                logging.info("The task was successful.")
            # Removed a redundant trailing `continue` (it was the last
            # statement of the loop body).

    def _is_there_work(self) -> bool:
        """
        Checks whether it should try to scrap a listing page according to
        limits and cooldowns.

        :return: True if it should work, False otherwise
        """
        blockers = [
            # A task was already created in the last 10 minutes.
            self._check_if_recent_task(),
            # Outside the configured working hours.
            not self._in_working_hours(),
            # Daily reference quota already met.
            (
                self._get_referencias_acquired_today()
                >= self._get_max_referencias_for_today()
            ),
            # Daily task quota already met.
            (self._get_tasks_created_today() >= self._get_max_tasks_today()),
        ]
        return not any(blockers)

    def _database_is_up(self) -> bool:
        """
        Checks whether the db is reachable, retrying with a cooldown between
        attempts.

        :return: True if db is reachable, False if not
        """
        while self.db_retries <= self.max_db_retries:
            try:
                self.anunciosdb.ping()
                self.db_retries = 0
                return True
            except Exception:
                # Narrowed from a bare `except:`; wait before retrying.
                sleep(Explorer.sleep_time_no_service)
                self.db_retries = self.db_retries + 1
        # NOTE(review): db_retries is never reset after exhaustion, so every
        # later call returns False immediately — confirm this is intended.
        return False

    @staticmethod
    def _in_working_hours() -> bool:
        """
        Checks whether now is within the working hours of the daemon.

        :return: True if so, False if not
        """
        # Fixed: the return annotation said `None` although a bool is returned.
        return (
            working_hours["start"]
            <= datetime.datetime.now().time()
            <= working_hours["end"]
        )

    def _get_referencias_acquired_today(self) -> int:
        """
        Queries the database to obtain the count of scraped ads in the last 24h.

        :return: the resulting count
        """
        query_statement = """ SELECT count(referencia)
            FROM primera_captura_full
            WHERE fecha_captura >= now() - INTERVAL 1 DAY;
        """
        cursor_result = self.anunciosdb.query(query_statement)
        return cursor_result.fetchone()[0]

    def _get_max_referencias_for_today(self) -> float:
        """
        Queries the database for the number of captured ads in the last 30 days
        and computes the max number of ad references to obtain today.

        :return: the max number of references
        """
        query_statement = """ SELECT count(referencia)
            FROM primera_captura_full
            WHERE fecha_captura >= now() - INTERVAL 30 DAY;
        """
        cursor_result = self.anunciosdb.query(query_statement)
        new_referencias_last_30 = cursor_result.fetchone()[0]
        # Scale the daily quota up (or down) by how far behind (or ahead of)
        # the monthly target the last 30 days were.
        deviation = (
            monthly_new_ads_target - new_referencias_last_30
        ) / monthly_new_ads_target
        max_referencias = (monthly_new_ads_target / 30) * (1 + deviation)
        return max_referencias

    def _get_tasks_created_today(self) -> int:
        """
        Queries the database for the number of exploring tasks created in the
        last 24h.

        :return: number of exploring tasks created
        """
        query_statement = """ SELECT count(uuid)
            FROM exploring_tasks_logs
            WHERE status = 'Attacked'
            AND write_time >= now() - INTERVAL 1 DAY;
        """
        cursor_result = self.tasksdb.query(query_statement)
        tasks_created_today = cursor_result.fetchone()[0]
        return tasks_created_today

    def _get_max_tasks_today(self) -> float:
        """
        Computes the current task goal.

        :return: max current tasks target
        """
        # Heuristic: allow 6 tasks per 30th of today's reference quota.
        return (self._get_max_referencias_for_today() / 30) * 6

    def _check_if_recent_task(self) -> int:
        """
        Queries the db for the number of tasks created in the last 10 minutes.

        :return: the number of recently created tasks (truthy when a task was
            created recently; used as a boolean by `_is_there_work`)
        """
        query_statement = """ SELECT count(uuid)
            FROM exploring_tasks_logs
            WHERE status = 'Attacked'
            AND write_time >= now() - INTERVAL 10 MINUTE
        """
        cursor_result = self.tasksdb.query(query_statement)
        return cursor_result.fetchone()[0]

    @staticmethod
    def _compose_listing_url() -> str:
        """
        Generates a garage listing page URL for a random ad type and a random
        page number.

        :return: the listing page URL
        """
        root = "https://www.idealista.com/"
        # Renamed from `type`, which shadowed the builtin.
        ad_type = Explorer.ad_types[str(randint(1, 2))]
        city = "barcelona"
        page_number = randint(1, 30)
        return f"{root}{ad_type}-garajes/{city}-{city}/pagina-{page_number}.htm"
class ExploringTask:
    """
    Task object wrapping the process of attempting to capture a listing page,
    parsing the ad references and sending to db.
    """

    def __init__(self, url: str) -> None:
        """
        Initialize with task parameters and mark the task as being worked on
        in the task queue.

        :param url: string with the listing page url to be captured
        """
        self.anunciosdb = get_anunciosdb()
        self.tasksdb = get_tasksdb()
        self.target_url = url
        # Random task id used as the log key in the tasks db.
        self.id = str(uuid.uuid4())
        self._update_status("Pending")

    def _update_status(self, new_status: str) -> None:
        """
        Updates the task status and persists it in the task queue.

        :param new_status: string describing the new status
        :return: None
        """
        self.status = new_status
        self._log_in_tasksdb()

    def explore(self) -> None:
        """
        Main flow of work: request the listing page, validate and extract the
        ad references, and record the outcome as the task status.

        :return: None
        """
        attack = UrlAttack(self.target_url)
        attack.attack()
        self._update_status("Attacked")
        if not attack.success:
            self._update_status("Failure - Bad request")
            return
        # Fetch the response body once instead of calling get_text() twice.
        html = attack.get_text()
        self._validate_referencias(html)
        self._extract_referencias(html)
        if self.referencias:
            self._update_status("Referencias ready")
        elif self.there_are_referencias:
            self._update_status("Failure - No new referencias in HTML")
        else:
            self._update_status("Failure - HTML with no referencias")

    def _log_in_tasksdb(self) -> None:
        """
        Logs the current status in the task db.

        :return: None
        """
        query_statement = """INSERT INTO exploring_tasks_logs
            (uuid, write_time, status)
            VALUES (%(uuid)s, NOW(), %(status)s)"""
        query_parameters = {"uuid": self.id, "status": self.status}
        self.tasksdb.query(query_statement, query_parameters)

    def _validate_referencias(self, html: str) -> None:
        """
        Checks that the ad references found in the HTML look like numeric ids
        and alerts the master about the first invalid one.

        :param html: string with HTML code of the listing page
        :return: None
        """
        soup = BeautifulSoup(html, "html5lib")
        ads = soup.find_all(class_="item")
        # References are expected to be 3-20 digit numbers (raw string added).
        pattern = r"^[0-9]{3,20}$"
        for ad in ads:
            if not re.match(pattern, ad["data-adid"]):
                alert_master(
                    "Alerta - Referencias no válidas",
                    """Una tarea de exploración ha considerado inválida
                    una referencia. El texto de la referencia era : {}
                    """.format(
                        ad["data-adid"]
                    ),
                )
                # Only alert once per page.
                break

    def _extract_referencias(self, html: str) -> None:
        """
        Scraps the ad references out of the HTML code and stores the ones not
        already present in the db on the instance.

        :param html: string with HTML code of the listing page
        :return: None
        """
        soup = BeautifulSoup(html, "html5lib")
        ads = soup.find_all(class_="item")
        # Distinguishes "page had no ads" from "all ads already known".
        self.there_are_referencias = bool(ads)
        self.referencias = [
            ad["data-adid"]
            for ad in ads
            if self._is_new_listing(ad["data-adid"])
        ]

    def _is_new_listing(self, referencia: str) -> bool:
        """
        Checks if an ad reference already exists in the db.

        :param referencia: ad reference id
        :return: True if it is new, False if not
        """
        query_statement = """SELECT count(referencia)
            FROM capturas
            WHERE referencia = %s"""
        query_params = (referencia,)
        cursor_result = self.anunciosdb.query(query_statement, query_params)
        # Simplified from an if/else returning boolean literals.
        return cursor_result.fetchone()[0] == 0

    def get_referencias(self) -> list:
        """
        Gets the extracted ad references.

        :return: list of ad references
        """
        return self.referencias
if __name__ == "__main__":
    # Entry point: run the exploring daemon until interrupted.
    Explorer().start()