diff --git a/.idea/workspace.xml b/.idea/workspace.xml index f9387ed..832f210 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -1,8 +1,11 @@ - + + + + @@ -248,7 +263,7 @@ - + + + + @@ -437,7 +474,14 @@ @@ -451,19 +495,20 @@ + - - + + - + - + @@ -513,9 +558,38 @@ - + + + + + file://$PROJECT_DIR$/explorer/explorer.py + 38 + + + + + + + + + + + + + + + Explorer.working_hours['start'] <= datetime.datetime.now().time() <= Explorer.working_hours['end'] + Python + CODE_FRAGMENT + + capturing_interface.seconds_since_last_try() @@ -523,12 +597,42 @@ EXPRESSION - cursor.fetchone() + cursor.fetchone()[0] Python EXPRESSION - cursor.fetchone()[0] + return cursor.fetchone()[0] + Python + EXPRESSION + + + capturing_interface.get_pending_task() is None + Python + EXPRESSION + + + execution_cursor.execute(query_statement, query_parameters) + Python + EXPRESSION + + + uuid_exploring + Python + EXPRESSION + + + self.database_is_up() + Python + EXPRESSION + + + self.get_max_tasks_today() + Python + EXPRESSION + + + self.get_tasks_created_today() Python EXPRESSION @@ -630,13 +734,6 @@ - - - - - - - @@ -644,16 +741,6 @@ - - - - - - - - - - @@ -661,13 +748,6 @@ - - - - - - - @@ -675,17 +755,51 @@ + + + + + + + + + + + + + + + + + + + + + + + + - - + + + + + + + + + + + + - - + + diff --git a/capturer/capturer.py b/capturer/capturer.py index 8d83496..480f370 100644 --- a/capturer/capturer.py +++ b/capturer/capturer.py @@ -3,6 +3,7 @@ sys.path.append('..') from time import sleep from bs4 import BeautifulSoup import re +import datetime from db_layer.capturing_tasks_interface import capturing_interface from db_layer.capturas_interface import capturas_interface from core.scrapping_utils import UrlAttack @@ -11,13 +12,15 @@ from core.scrapping_utils import UrlAttack class Capturer: sleep_time_no_work = 60 minimum_seconds_between_tries = 120 - + working_hours = {'start': datetime.time(9, 0, 0), + 'end': datetime.time(21, 0, 0)} def start(self): while True: if (capturing_interface.get_pending_task() is None - or capturing_interface.seconds_since_last_try() < Capturer.minimum_seconds_between_tries): + or capturing_interface.seconds_since_last_try() < Capturer.minimum_seconds_between_tries + or not self.in_working_hours()): sleep(Capturer.sleep_time_no_work) continue @@ -32,6 +35,10 @@ class Capturer: capturas_interface.insert_captura(ad_data) task._update_status('Captura inserted') + def in_working_hours(self): + return Capturer.working_hours['start'] <= datetime.datetime.now().time() <= Capturer.working_hours['end'] + + class CapturingTask: sleep_time_failed_request = 60 diff --git a/db_layer/capturing_tasks_interface.py b/db_layer/capturing_tasks_interface.py index 34d91be..27e917f 100644 --- a/db_layer/capturing_tasks_interface.py +++ b/db_layer/capturing_tasks_interface.py @@ -1,3 +1,4 @@ +import uuid from core.mysql_wrapper import get_tasksdb class CapturingTasksInterface: @@ -36,9 +37,9 @@ class CapturingTasksInterface: """ cursor = self.tasksdb.query(query_statement, dictionary=True) - if cursor.rowcount: + try: return cursor.fetchone() - else: + except: return None def update_capturing_task(self, uuid, uuid_exploring, status, ad_url): @@ -62,7 +63,7 @@ class CapturingTasksInterface: query_statement = """SELECT TIME_TO_SEC(TIMEDIFF(now(), write_time)) FROM capturing_tasks_logs WHERE status = 'Loading' - ORDER BY write_time + ORDER BY write_time DESC LIMIT 1 """ diff --git a/explorer/explorer.py b/explorer/explorer.py index adc1533..23b9fe6 100644 --- a/explorer/explorer.py +++ b/explorer/explorer.py @@ -17,7 +17,7 @@ class Explorer(): sleep_time_no_work = 60 sleep_time_no_service = 600 working_hours = {'start': datetime.time(9, 0, 0), - 'end': datetime.time(18, 0, 0)} + 'end': datetime.time(21, 0, 0)} monthly_capture_target = 1000 ad_types = {'1': 'alquiler', '2': 'venta'} @@ -38,6 +38,7 @@ class Explorer(): while True: if not self.there_is_work(): + print('{}: Waiting. No work'.format(datetime.datetime.now())) sleep(Explorer.sleep_time_no_work) continue @@ -47,17 +48,16 @@ class Explorer(): current_task = ExploringTask(self.compose_listing_url()) current_task.explore() + print('{}: Exploring done'.format(datetime.datetime.now())) if current_task.status == 'Referencias ready': referencias = current_task.get_referencias() for referencia in referencias: - capturing_interface.create_capturing_task(referencia) - - current_task._update_status("Sent to queue") + capturing_interface.create_capturing_task(referencia, current_task.id) + current_task._update_status("Sent to queue") continue - - self.stop() + def stop(self): #TODO Detener el servicio @@ -160,7 +160,7 @@ class Explorer(): """ cursor_result = self.tasksdb.query(query_statement) - return cursor_result.row_count + return cursor_result.fetchone()[0] def compose_listing_url(self): """ @@ -168,7 +168,7 @@ class Explorer(): :return: """ root = 'https://www.idealista.com/' - type = ad_type[str(randint(1,2))] + type = Explorer.ad_types[str(randint(1,2))] city = 'barcelona' page_number = str(randint(1,30)) url = root + type + '-garajes/' + city + '-' + city + '/' + \