Minor fixes: inject the capturing/capturas db interfaces, parsing flow generator, URL acquisition class and dead-ad checker into Capturer and CapturingTask, route ad parsing through ParsingFlow, and tighten the TipoAnuncio title match, ParsingFlow issue reporting and UrlAttack identity rotation.
parent e34a34acaf
commit 007f458cd5
3 changed files with 140 additions and 39 deletions
@@ -1,13 +1,10 @@
 import sys

-sys.path.append("..")
 from time import sleep
-from bs4 import BeautifulSoup
-import re
 import datetime

-from db_layer.capturing_tasks_interface import capturing_interface
-from db_layer.capturas_interface import capturas_interface
+from db_layer.capturing_tasks_interface import CapturingTasksInterface
+from db_layer.capturas_interface import CapturasInterface
 from core.scrapping_utils import UrlAttack
 from core.config import working_hours, minimum_seconds_between_tries
 from core.throttling_utils import (
@@ -17,6 +14,7 @@ from core.throttling_utils import (
     DynamicThrottlingRule,
 )
 from refresher.refresher import Refresher
+from core.parsing_utils import *
 import logging

@@ -26,8 +24,22 @@ class Capturer:
     scraping and db storage.
     """

-    def __init__(self, throttling_manager: ThrottleManager) -> None:
+    def __init__(
+        self,
+        throttling_manager: ThrottleManager,
+        capturing_tasks_interface: CapturingTasksInterface,
+        capturas_interface: CapturasInterface,
+        parsing_flow_generator: ParsingFlowGenerator,
+        url_acquisition_object: Type[UrlAttack],
+        dead_ad_checker: Callable,
+    ) -> None:
         self._throttling_manager = throttling_manager
+        self._capturing_tasks_interface = capturing_tasks_interface
+        self._capturas_interface = capturas_interface
+        self._parsing_flow_generator = parsing_flow_generator
+        self._url_acquisition_object = url_acquisition_object
+        self._dead_ad_checker = dead_ad_checker

         self.last_try_datetime = datetime.datetime.now()

     def start(self) -> None:
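For orientation (not part of the commit): with the widened constructor above, Capturer's collaborators are injected rather than imported as module-level singletons, so the class can presumably be exercised with lightweight stand-ins. A minimal sketch, assuming it runs in the same module as Capturer and that the fake classes below are hypothetical test doubles exposing only the methods the diff shows being called:

# Sketch only, not from the commit: wiring Capturer with hypothetical stand-ins.
class FakeTasksInterface:
    def get_pending_task(self):
        return None  # nothing queued

    def update_capturing_task(self, uuid, uuid_exploring, status, ad_url):
        print(uuid, status)


class FakeCapturasInterface:
    def insert_captura(self, ad_data):
        print("stored:", ad_data)


class FakeFlowGenerator:
    def get_new_flow(self):
        raise NotImplementedError  # never reached while no task is pending


capturer = Capturer(
    throttling_manager=ThrottleManager(),
    capturing_tasks_interface=FakeTasksInterface(),
    capturas_interface=FakeCapturasInterface(),
    parsing_flow_generator=FakeFlowGenerator(),
    url_acquisition_object=UrlAttack,      # any callable taking a URL would do
    dead_ad_checker=lambda html: False,
)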
@@ -46,11 +58,17 @@
                 sleep(10)
                 logging.info("Waiting...")

-            pending_task = capturing_interface.get_pending_task()
+            pending_task = self._capturing_tasks_interface.get_pending_task()

             logging.info("Got a task")

-            task = CapturingTask(pending_task)
+            task = CapturingTask(
+                pending_task,
+                capturing_interface=self._capturing_tasks_interface,
+                new_parsing_flow=self._parsing_flow_generator.get_new_flow(),
+                url_acquisition_object=self._url_acquisition_object,
+                dead_ad_checker=self._dead_ad_checker,
+            )
             self.last_try_datetime = datetime.datetime.now()
             task.capture()

@@ -60,8 +78,8 @@
                 logging.warning("Something went wrong, not adding data.")
                 continue

-            capturas_interface.insert_captura(ad_data)
-            task._update_status("Captura inserted")
+            self._capturas_interface.insert_captura(ad_data)
+            task.update_status("Captura inserted")
             logging.info("New ad inserted.")

@@ -73,29 +91,40 @@ class CapturingTask:

     sleep_time_failed_request = 180

-    def __init__(self, parameters) -> None:
+    def __init__(
+        self,
+        task_parameters: dict,
+        capturing_interface: CapturingTasksInterface,
+        new_parsing_flow: ParsingFlow,
+        url_acquisition_object: Type[UrlAttack],
+        dead_ad_checker: Callable,
+    ) -> None:
         """
         Initialize with task parameters and mark the task as being worked on
         in the task queue.
-        :param parameters: dict with the necessary parameters for the task
+        :param task_parameters: dict with the necessary parameters for the task
         """
-        self.uuid = parameters["uuid"]
-        self.ad_url = parameters["ad_url"]
-        self.uuid_exploring = parameters["fk_uuid_exploring"]
-        self.status = parameters["status"]
+        self.uuid = task_parameters["uuid"]
+        self.ad_url = task_parameters["ad_url"]
+        self.uuid_exploring = task_parameters["fk_uuid_exploring"]
+        self.status = task_parameters["status"]
         self.request_failures = 1
         self.html = None
+        self._parsing_flow = new_parsing_flow
+        self._capturing_interface = capturing_interface
+        self._url_acquistion_object = url_acquisition_object
+        self._is_dead_ad = dead_ad_checker

-        self._update_status("Loading")
+        self.update_status("Loading")

-    def _update_status(self, new_status) -> None:
+    def update_status(self, new_status) -> None:
         """
         Updates the task status and persists it in the task queue.
         :param new_status: string describing the new status
         :return: None
         """
         self.status = new_status
-        capturing_interface.update_capturing_task(
+        self._capturing_interface.update_capturing_task(
             self.uuid, self.uuid_exploring, self.status, self.ad_url
         )
@@ -103,34 +132,32 @@
         """
         Main flow of work
         """
-        self._update_status("WIP")
+        self.update_status("WIP")

         while self.request_failures < 4:
-            attack = UrlAttack(self.ad_url)
+            attack = self._url_acquistion_object(self.ad_url)
             attack.attack()

             if attack.success:
-                self.html = attack.get_text()
-                self._extract_data()
-                self._check_data()
+                self._parse_html(html=attack.get_text())
                 return

             if not attack.success:
                 try:
-                    if Refresher.dead_ad_checker(attack.get_text()):
-                        self._update_status("Dead ad")
+                    if self._is_dead_ad(attack.get_text()):
+                        self.update_status("Dead ad")
                         return
                 except AttributeError:
                     logging.error(
                         "Something went wrong when checking if the ad is gone"
                     )

-            self._update_status("Fail {}".format(self.request_failures))
+            self.update_status("Fail {}".format(self.request_failures))
             self.request_failures += 1
             sleep(CapturingTask.sleep_time_failed_request)
             continue

-        self._update_status("Surrender")
+        self.update_status("Surrender")
         logging.warning(f"A task has surrendered. {self.ad_url}")

     def _extract_data(self) -> None:
@@ -148,21 +175,40 @@
         :return: None
         """
         if self.parser.fields_missing():
-            self._update_status("Fields missing")
+            self.update_status("Fields missing")
             return

         if not self.parser.all_fields_are_valid():
-            self._update_status("Invalid value fields")
+            self.update_status("Invalid value fields")
             return

-        self._update_status("Data ready")
+        self.update_status("Data ready")

     def get_ad_data(self) -> dict:
         """
         Returns the extracted data.
         :return: dictionary with the data of the ad.
         """
-        return self.parser.get_data()
+        return self._parsing_flow.field_values
+
+    def _parse_html(self, html: str) -> None:
+        self._parsing_flow.execute_flow(soup=BeautifulSoup(html, "html5lib"))
+
+        if not self._parsing_flow.issues:
+            self.update_status("Data ready")
+            return
+
+        if not self._parsing_flow.all_found_fields_are_valid:
+            self.update_status("Invalid value fields")
+            logging.warning(f"Invalid fields found in ad: {self.ad_url}")
+            logging.warning(f"{self._parsing_flow.issues}")
+            return
+        if not self._parsing_flow.all_non_optional_fields_were_found:
+            self.update_status("Fields missing")
+            logging.warning(
+                f"Couldn't scrap necessary fields: {self._parsing_flow.issues}"
+            )
+            return


 class AdHtmlParser:
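A note on the new _parse_html above: the status is resolved in a fixed order: no recorded issues means "Data ready"; otherwise invalid found fields are reported before missing non-optional fields; and a flow whose issues are only informational (for example a search_issue on a field that was found and valid) falls through all three branches, leaving the task at its previous status. A standalone sketch of that decision order, assuming only the flow attributes used above:

# Sketch only, not from the commit: the branch order of _parse_html, isolated.
from typing import Optional


def resolve_status(flow) -> Optional[str]:
    """Return the status _parse_html would set, or None if it sets none."""
    if not flow.issues:
        return "Data ready"
    if not flow.all_found_fields_are_valid:
        return "Invalid value fields"
    if not flow.all_non_optional_fields_were_found:
        return "Fields missing"
    return None  # issues exist, but every found field is valid and nothing required is missing


class _CleanFlow:  # hypothetical stand-in with the attributes used above
    issues = {}
    all_found_fields_are_valid = True
    all_non_optional_fields_were_found = True


print(resolve_status(_CleanFlow()))  # Data ready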
@@ -362,7 +408,7 @@ class AdHtmlParser:
         else:
             return True

-    def fields_missing(self) -> None:
+    def fields_missing(self) -> bool:
         """
         Reports on whether all compulsory fields are present.
         :return: True if some field is missing, false if not
@@ -387,13 +433,65 @@


 if __name__ == "__main__":

+    capturing_tasks_interface = CapturingTasksInterface()
+    capturas_interface = CapturasInterface()
+
     throttling_manager = ThrottleManager()
     throttling_manager.add_rule(WorkingHoursThrottlingRule(working_hours)).add_rule(
         CooldownThrottlingRule(minimum_seconds_between_tries),
         required_argument_names=["last_attempt_timestamp"],
     ).add_rule(
-        DynamicThrottlingRule(lambda: bool(capturing_interface.get_pending_task()))
+        DynamicThrottlingRule(
+            lambda: bool(capturing_tasks_interface.get_pending_task())
+        )
     )

-    capturer = Capturer(throttling_manager=throttling_manager)
+    parsing_flow_generator = ParsingFlowGenerator(
+        ParsingFlow,
+        (
+            (ReferenciaFieldInstructions, {}),
+            (PrecioFieldInstructions, {}),
+            (TamanoCategoricoFieldInstructions, {}),
+            (M2FieldInstructions, {}),
+            (TipoAnuncioFieldInstructions, {}),
+            (CalleFieldInstructions, {}),
+            (BarrioFieldInstructions, {}),
+            (DistritoFieldInstructions, {}),
+            (CiudadFieldInstructions, {}),
+            (
+                SecondaryFeaturesFieldInstructions,
+                {"field_name": "cubierta", "search_keyword": "Cubierta"},
+            ),
+            (
+                SecondaryFeaturesFieldInstructions,
+                {"field_name": "puerta_auto", "search_keyword": "Puerta"},
+            ),
+            (
+                SecondaryFeaturesFieldInstructions,
+                {"field_name": "ascensor", "search_keyword": "ascensor"},
+            ),
+            (
+                SecondaryFeaturesFieldInstructions,
+                {"field_name": "alarma", "search_keyword": "Alarma"},
+            ),
+            (
+                SecondaryFeaturesFieldInstructions,
+                {"field_name": "circuito", "search_keyword": "Cámaras"},
+            ),
+            (
+                SecondaryFeaturesFieldInstructions,
+                {"field_name": "personal", "search_keyword": "Personal"},
+            ),
+            (TelefonoFieldInstructions, {}),
+        ),
+    )
+
+    capturer = Capturer(
+        throttling_manager=throttling_manager,
+        capturing_tasks_interface=capturing_tasks_interface,
+        capturas_interface=capturas_interface,
+        parsing_flow_generator=parsing_flow_generator,
+        url_acquisition_object=UrlAttack,
+        dead_ad_checker=Refresher.dead_ad_checker,
+    )
     capturer.start()
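ParsingFlowGenerator itself is not shown in this commit; from its use above (constructed with the ParsingFlow class plus (instructions class, kwargs) pairs, with get_new_flow() called once per task in Capturer.start()), it presumably acts as a factory that builds fresh field-instruction instances for every capture, so per-field state such as value/found/valid is never shared between tasks. A hypothetical sketch consistent with those calls; the real class lives in core.parsing_utils and may differ:

# Sketch only, not the project's implementation.
class ParsingFlowGeneratorSketch:
    def __init__(self, flow_class, instruction_specs):
        self._flow_class = flow_class                # e.g. ParsingFlow
        self._instruction_specs = instruction_specs  # ((InstructionsClass, kwargs), ...)

    def get_new_flow(self):
        # Instantiate the instruction classes afresh on every call, then hand
        # them to the flow class (constructor signature assumed here).
        fields = tuple(cls(**kwargs) for cls, kwargs in self._instruction_specs)
        return self._flow_class(fields)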
@@ -258,7 +258,7 @@ class TipoAnuncioFieldInstructions(BaseTargetFieldInstructions):
         if "venta" in soup.find("title").text:
             self.value = 1
             self.found = True
-        if "alquiler" in soup.find("title").text:
+        if "Alquiler" in soup.find("title").text:
             self.value = 2
             self.found = True
@@ -542,11 +542,11 @@ class ParsingFlow:
             if (field.found or field.is_optional) and field.valid:
                 continue
             this_field_issues = {}
-            if not field.found:
+            if not field.found and not field.is_optional:
                 this_field_issues["found"] = "Not found"
             if field.search_issue:
                 this_field_issues["search_issue"] = field.search_issue
-            if not field.valid:
+            if not field.valid and field.valid is not None:
                 this_field_issues["validity"] = "Not valid"
                 this_field_issues["value"] = field.value

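The two extra guards change what lands in a field's issue record: an optional field that was never found no longer gets a "Not found" entry, and a field whose valid flag is still None (never evaluated, typically because the field was not found) no longer gets a "Not valid" entry. A small self-contained sketch of the same conditions on a bare stand-in record, assuming the attributes used above:

# Sketch only, not from the commit: the per-field issue rules in isolation.
from dataclasses import dataclass
from typing import Any, Optional


@dataclass
class FieldRecord:  # hypothetical stand-in for a field-instructions object
    found: bool = False
    is_optional: bool = False
    valid: Optional[bool] = None  # None: validity was never evaluated
    search_issue: str = ""
    value: Any = None


def issues_for(field: FieldRecord) -> dict:
    issues = {}
    if not field.found and not field.is_optional:
        issues["found"] = "Not found"
    if field.search_issue:
        issues["search_issue"] = field.search_issue
    if not field.valid and field.valid is not None:
        issues["validity"] = "Not valid"
        issues["value"] = field.value
    return issues


print(issues_for(FieldRecord(found=False, is_optional=True)))           # {} now, not "Not found"
print(issues_for(FieldRecord(found=True, valid=False, value="-3 m2")))  # still reports "Not valid"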
@@ -95,7 +95,10 @@ class UrlAttack:
         except Exception as e:
             self.success = False

-        if random.randrange(0, 100) < UrlAttack.identity_change_probability:
+        if (
+            not self.success
+            or random.randrange(0, 100) < UrlAttack.identity_change_probability
+        ):
             self._change_identity()

     def _change_identity(self) -> None:
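The reworked condition forces an identity change after every failed request, while successful requests keep the old probabilistic rotation (identity_change_probability reads as a percentage, given the comparison against randrange(0, 100)). The predicate in isolation, as a sketch:

# Sketch only, not from the commit: the rotation predicate shown above.
import random


def should_change_identity(success: bool, identity_change_probability: int) -> bool:
    return (not success) or random.randrange(0, 100) < identity_change_probability


print(should_change_identity(False, 10))  # always True after a failure
print(should_change_identity(True, 10))   # True on roughly 10% of successful runs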