From eeb8672f0dfda7a098c129533d09b74afa57f956 Mon Sep 17 00:00:00 2001
From: pablomartincalvo
Date: Sun, 2 Dec 2018 18:53:28 +0100
Subject: [PATCH 1/2] Skeleton of the changes needed to add visit information
 to the system.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 capturer/capturer.py                          | 7 +++++++
 db_layer/capturas_interface.py                | 2 +-
 db_layer/db_init_scripts/3_alter_capturas.sql | 0
 3 files changed, 8 insertions(+), 1 deletion(-)
 create mode 100644 db_layer/db_init_scripts/3_alter_capturas.sql

diff --git a/capturer/capturer.py b/capturer/capturer.py
index e37f032..001b7c7 100644
--- a/capturer/capturer.py
+++ b/capturer/capturer.py
@@ -177,6 +177,7 @@ class AdHtmlParser:
                            'found': False,
                            'optional': True,
                            'value': None}}
+        #TODO añadir campos de visitas

     def parse(self):

@@ -246,6 +247,9 @@ class AdHtmlParser:
                 .text.replace(' ', '')
             self.ad_fields['telefono']['found'] = True

+        # TODO capturar datos de visitas
+
+
     def _validate(self):
         self.invalid_fields = []

@@ -274,6 +278,9 @@ class AdHtmlParser:
                 and not re.match(r"[0-9]{1,20}", self.ad_fields['telefono']['value'])):
             self.invalid_fields.append('telefono')
+
+
+

     def all_fields_are_valid(self):
         self._validate()
         if self.invalid_fields:
diff --git a/db_layer/capturas_interface.py b/db_layer/capturas_interface.py
index f06c2d5..8892c45 100644
--- a/db_layer/capturas_interface.py
+++ b/db_layer/capturas_interface.py
@@ -1,7 +1,7 @@
 from core.mysql_wrapper import get_anunciosdb


-class CapturasInterface():
+class CapturasInterface:


     def __init__(self):
diff --git a/db_layer/db_init_scripts/3_alter_capturas.sql b/db_layer/db_init_scripts/3_alter_capturas.sql
new file mode 100644
index 0000000..e69de29

From 5aba6309f03adba1707db278f0471ffd493e1880 Mon Sep 17 00:00:00 2001
From: pablomartincalvo
Date: Tue, 4 Dec 2018 21:02:30 +0100
Subject: [PATCH 2/2] Moved the spacing between attempts into Python memory
 instead of a database check. Adjusted some timings.

---
 capturer/capturer.py | 20 ++++++++++++++++----
 core/config.py       |  4 ++--
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/capturer/capturer.py b/capturer/capturer.py
index 001b7c7..8ef69b3 100644
--- a/capturer/capturer.py
+++ b/capturer/capturer.py
@@ -13,18 +13,23 @@ from refresher.refresher import Refresher


 class Capturer:
-    sleep_time_no_work = 60
+    sleep_time_no_work = 15
+
+    def __init__(self):
+        self.last_try_datetime = datetime.datetime.now()
+
     def start(self):
         while True:
             if (capturing_interface.get_pending_task() is None
-                    or capturing_interface.seconds_since_last_try() < minimum_seconds_between_tries
+                    or self.seconds_since_last_try() < minimum_seconds_between_tries
                     or not self.in_working_hours()):
                 sleep(Capturer.sleep_time_no_work)
                 continue

             task = CapturingTask(capturing_interface.get_pending_task())
+            self.last_try_datetime = datetime.datetime.now()
             task.capture()

             if task.status == 'Data ready':
@@ -38,9 +43,13 @@ class Capturer:
     def in_working_hours(self):
         return working_hours['start'] <= datetime.datetime.now().time() <= working_hours['end']

+    def seconds_since_last_try(self):
+        return (datetime.datetime.now() - self.last_try_datetime).total_seconds()
+
+
 class CapturingTask:
-    sleep_time_failed_request = 60
+    sleep_time_failed_request = 180

     def __init__(self, parameters):
         self.uuid = parameters['uuid']
@@ -63,7 +72,7 @@ class CapturingTask:
         """
         self._update_status('WIP')

-        while self.request_failures < 3:
+        while self.request_failures < 4:
             attack = UrlAttack(self.ad_url)
             attack.attack()

@@ -88,6 +97,9 @@ class CapturingTask:
                 continue

             self._update_status('Surrender')
+            print(datetime.datetime.now())
+            print(self.html)
+            print(attack.get_response())

     def _extract_data(self):
         self.parser = AdHtmlParser(self.html)
diff --git a/core/config.py b/core/config.py
index 594c282..07d3f6c 100644
--- a/core/config.py
+++ b/core/config.py
@@ -25,6 +25,6 @@ working_hours = {'start': datetime.time(9, 0, 0),
                  'end': datetime.time(21, 0, 0)}
 monthly_new_ads_target = 1200
 google_api_key = 'AIzaSyCnKj0WnsxVZcaoxeAYkuRw3cKRNGiISYA'
-minimum_seconds_between_tries = 45
-geocoder_delay = 30
+minimum_seconds_between_tries = 60
+geocoder_delay = 10
 refresher_delay = 10
\ No newline at end of file
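
The second patch boils down to a small in-memory throttling pattern: Capturer remembers the time of its last attempt on the instance and compares it against minimum_seconds_between_tries, instead of asking the database via capturing_interface.seconds_since_last_try(). Below is a minimal standalone sketch of that pattern, assuming the config values changed in the patch (60 seconds between tries, 15 seconds of idle sleep); the Throttler class, its run() method and the do_work callback are illustrative names and are not part of the repository.

import datetime
from time import sleep

# Values mirroring core/config.py and Capturer after the patch.
MINIMUM_SECONDS_BETWEEN_TRIES = 60
SLEEP_TIME_NO_WORK = 15


class Throttler:
    """Keeps the time of the last attempt in memory instead of querying the database."""

    def __init__(self):
        # Same idea as Capturer.__init__ storing self.last_try_datetime.
        self.last_try_datetime = datetime.datetime.now()

    def seconds_since_last_try(self):
        # Mirrors Capturer.seconds_since_last_try() from the patch.
        return (datetime.datetime.now() - self.last_try_datetime).total_seconds()

    def run(self, do_work, attempts=2):
        # Simplified version of Capturer.start(): the pending-task and
        # working-hours checks are omitted; only the spacing between
        # attempts is shown.
        done = 0
        while done < attempts:
            if self.seconds_since_last_try() < MINIMUM_SECONDS_BETWEEN_TRIES:
                sleep(SLEEP_TIME_NO_WORK)
                continue
            self.last_try_datetime = datetime.datetime.now()
            do_work()
            done += 1


if __name__ == '__main__':
    # With the values above, the two attempts end up roughly a minute apart.
    Throttler().run(lambda: print('attempt at', datetime.datetime.now()))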