Retoques menores en capturer y explorer. De vuelta al testing.

This commit is contained in:
pablomartincalvo 2018-10-14 18:41:12 +02:00
parent 6342a95f40
commit e0216060b9
4 changed files with 190 additions and 68 deletions

224
.idea/workspace.xml generated
View file

@ -1,8 +1,11 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ChangeListManager">
<list default="true" id="6e2fbba0-85ff-42d6-8e70-e4cdef1000c8" name="Default Changelist" comment="">
<list default="true" id="6e2fbba0-85ff-42d6-8e70-e4cdef1000c8" name="Default Changelist" comment="Puesto bien mains en explorer y capturer.">
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/capturer/capturer.py" beforeDir="false" afterPath="$PROJECT_DIR$/capturer/capturer.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/db_layer/capturing_tasks_interface.py" beforeDir="false" afterPath="$PROJECT_DIR$/db_layer/capturing_tasks_interface.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/explorer/explorer.py" beforeDir="false" afterPath="$PROJECT_DIR$/explorer/explorer.py" afterDir="false" />
</list>
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="SHOW_DIALOG" value="false" />
@ -27,29 +30,29 @@
</usages-collector>
<usages-collector id="statistics.file.extensions.open">
<counts>
<entry key="py" value="43" />
<entry key="py" value="46" />
</counts>
</usages-collector>
<usages-collector id="statistics.file.types.open">
<counts>
<entry key="Python" value="40" />
<entry key="Python" value="43" />
<entry key="Scratch" value="3" />
</counts>
</usages-collector>
<usages-collector id="statistics.file.extensions.edit">
<counts>
<entry key="Python Console" value="1555" />
<entry key="Python Console" value="1579" />
<entry key="capturer" value="862" />
<entry key="dummy" value="14" />
<entry key="py" value="15226" />
<entry key="py" value="15563" />
<entry key="scratch_1" value="489" />
<entry key="txt" value="985" />
<entry key="txt" value="990" />
</counts>
</usages-collector>
<usages-collector id="statistics.file.types.edit">
<counts>
<entry key="PLAIN_TEXT" value="999" />
<entry key="Python" value="17962" />
<entry key="PLAIN_TEXT" value="1004" />
<entry key="Python" value="18323" />
<entry key="Scratch" value="170" />
</counts>
</usages-collector>
@ -71,8 +74,8 @@
<file pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/capturer/capturer.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="344">
<caret line="228" column="82" lean-forward="true" selection-start-line="228" selection-start-column="82" selection-end-line="228" selection-end-column="82" />
<state relative-caret-position="285">
<caret line="19" lean-forward="true" selection-start-line="19" selection-end-line="19" />
</state>
</provider>
</entry>
@ -95,11 +98,23 @@
</split-first>
<split-second>
<leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
<file pinned="false" current-in-tab="true">
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/explorer/explorer.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="479">
<caret line="282" lean-forward="true" selection-start-line="282" selection-end-line="284" selection-end-column="20" />
<state relative-caret-position="401">
<caret line="283" selection-start-line="283" selection-end-line="283" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/db_layer/capturing_tasks_interface.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="265">
<caret line="66" column="36" lean-forward="true" selection-start-line="66" selection-start-column="36" selection-end-line="66" selection-end-column="36" />
<folding>
<element signature="e#0#11#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
@ -107,8 +122,8 @@
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/core/mysql_wrapper.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="675">
<caret line="45" column="23" selection-start-line="45" selection-start-column="23" selection-end-line="45" selection-end-column="23" />
<state relative-caret-position="263">
<caret line="47" selection-start-line="47" selection-end-line="47" />
<folding>
<element signature="e#24#46#0" expanded="true" />
</folding>
@ -178,9 +193,9 @@
<option value="$PROJECT_DIR$/tests/capturer_tests.py" />
<option value="$PROJECT_DIR$/core/mysql_wrapper.py" />
<option value="$PROJECT_DIR$/db_layer/capturas_interface.py" />
<option value="$PROJECT_DIR$/db_layer/capturing_tasks_interface.py" />
<option value="$PROJECT_DIR$/explorer/explorer.py" />
<option value="$PROJECT_DIR$/capturer/capturer.py" />
<option value="$PROJECT_DIR$/db_layer/capturing_tasks_interface.py" />
</list>
</option>
</component>
@ -248,7 +263,7 @@
</list>
</option>
</component>
<component name="RunManager" selected="Python.capturer_tests">
<component name="RunManager" selected="Python.capturer">
<configuration name="alerts" type="PythonConfigurationType" factoryName="Python" temporary="true">
<module name="Drogon" />
<option name="INTERPRETER_OPTIONS" value="" />
@ -312,6 +327,27 @@
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="explorer" type="PythonConfigurationType" factoryName="Python" temporary="true">
<module name="Drogon" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/explorer" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/explorer/explorer.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="scratch_1" type="PythonConfigurationType" factoryName="Python" temporary="true">
<module name="Drogon" />
<option name="INTERPRETER_OPTIONS" value="" />
@ -338,14 +374,15 @@
<item itemvalue="Python.scratch_1" />
<item itemvalue="Python.capturer" />
<item itemvalue="Python.capturer_tests" />
<item itemvalue="Python.explorer" />
</list>
<recent_temporary>
<list>
<item itemvalue="Python.capturer" />
<item itemvalue="Python.explorer" />
<item itemvalue="Python.capturer_tests" />
<item itemvalue="Python.capturer" />
<item itemvalue="Python.capturer" />
<item itemvalue="Python.capturer" />
<item itemvalue="Python.capturer" />
</list>
</recent_temporary>
</component>
@ -437,7 +474,14 @@
<option name="project" value="LOCAL" />
<updated>1539447425957</updated>
</task>
<option name="localTasksCounter" value="12" />
<task id="LOCAL-00012" summary="Puesto bien mains en explorer y capturer.">
<created>1539530388252</created>
<option name="number" value="00012" />
<option name="presentableId" value="LOCAL-00012" />
<option name="project" value="LOCAL" />
<updated>1539530388252</updated>
</task>
<option name="localTasksCounter" value="13" />
<servers />
</component>
<component name="TodoView" selected-index="1">
@ -451,19 +495,20 @@
</component>
<component name="ToolWindowManager">
<frame x="0" y="-2" width="1920" height="1082" extended-state="6" />
<editor active="true" />
<layout>
<window_info active="true" content_ui="combo" id="Project" order="0" sideWeight="0.48345324" visible="true" weight="0.14918292" />
<window_info id="Structure" order="1" sideWeight="0.5165468" side_tool="true" visible="true" weight="0.14918292" />
<window_info content_ui="combo" id="Project" order="0" sideWeight="0.48251748" visible="true" weight="0.14918292" />
<window_info id="Structure" order="1" sideWeight="0.5174825" side_tool="true" visible="true" weight="0.14918292" />
<window_info id="Favorites" order="2" sideWeight="0.5015674" side_tool="true" weight="0.14918292" />
<window_info active="true" id="Repositories" order="3" sideWeight="0.49529782" visible="true" weight="0.32999474" />
<window_info anchor="bottom" id="Message" order="0" />
<window_info anchor="bottom" id="Find" order="1" weight="0.32983193" />
<window_info anchor="bottom" id="Run" order="2" weight="0.32983193" />
<window_info anchor="bottom" id="Debug" order="3" visible="true" weight="0.39915967" />
<window_info active="true" anchor="bottom" id="Debug" order="3" visible="true" weight="0.39915967" />
<window_info anchor="bottom" id="Cvs" order="4" weight="0.25" />
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
<window_info anchor="bottom" id="TODO" order="6" sideWeight="0.49973643" weight="0.32878152" />
<window_info anchor="bottom" id="Version Control" order="7" sideWeight="0.49973643" visible="true" weight="0.269958" />
<window_info anchor="bottom" id="Version Control" order="7" sideWeight="0.49973643" weight="0.269958" />
<window_info anchor="bottom" id="Terminal" order="8" weight="0.32983193" />
<window_info anchor="bottom" id="Event Log" order="9" sideWeight="0.5007907" side_tool="true" weight="0.32983193" />
<window_info anchor="bottom" id="Python Console" order="10" sideWeight="0.49920928" weight="0.32983193" />
@ -513,9 +558,38 @@
<MESSAGE value="Creado cache de Geocoding. Avanzado en Geocoding Task. Decido mover parte de la gestion del geocoding al capturer." />
<MESSAGE value="Testeos en desarrollo del sistema de capturas. Pequeños retoques." />
<MESSAGE value="Puesto mains en explorer y capturer." />
<option name="LAST_COMMIT_MESSAGE" value="Puesto mains en explorer y capturer." />
<MESSAGE value="Puesto bien mains en explorer y capturer." />
<option name="LAST_COMMIT_MESSAGE" value="Puesto bien mains en explorer y capturer." />
</component>
<component name="XDebuggerManager">
<breakpoint-manager>
<breakpoints>
<line-breakpoint enabled="true" suspend="THREAD" type="python-line">
<url>file://$PROJECT_DIR$/explorer/explorer.py</url>
<line>38</line>
<option name="timeStamp" value="7" />
</line-breakpoint>
</breakpoints>
<breakpoints-dialog>
<breakpoints-dialog />
</breakpoints-dialog>
<default-breakpoints>
<breakpoint type="python-exception">
<properties notifyOnTerminate="true" exception="BaseException">
<option name="notifyOnTerminate" value="true" />
</properties>
</breakpoint>
</default-breakpoints>
</breakpoint-manager>
</component>
<component name="debuggerHistoryManager">
<expressions id="evaluateCodeFragment">
<expression>
<expression-string>Explorer.working_hours['start'] &lt;= datetime.datetime.now().time() &lt;= Explorer.working_hours['end']</expression-string>
<language-id>Python</language-id>
<evaluation-mode>CODE_FRAGMENT</evaluation-mode>
</expression>
</expressions>
<expressions id="evaluateExpression">
<expression>
<expression-string>capturing_interface.seconds_since_last_try()</expression-string>
@ -523,12 +597,42 @@
<evaluation-mode>EXPRESSION</evaluation-mode>
</expression>
<expression>
<expression-string>cursor.fetchone()</expression-string>
<expression-string>cursor.fetchone()[0]</expression-string>
<language-id>Python</language-id>
<evaluation-mode>EXPRESSION</evaluation-mode>
</expression>
<expression>
<expression-string>cursor.fetchone()[0]</expression-string>
<expression-string>return cursor.fetchone()[0]</expression-string>
<language-id>Python</language-id>
<evaluation-mode>EXPRESSION</evaluation-mode>
</expression>
<expression>
<expression-string>capturing_interface.get_pending_task() is None</expression-string>
<language-id>Python</language-id>
<evaluation-mode>EXPRESSION</evaluation-mode>
</expression>
<expression>
<expression-string>execution_cursor.execute(query_statement, query_parameters)</expression-string>
<language-id>Python</language-id>
<evaluation-mode>EXPRESSION</evaluation-mode>
</expression>
<expression>
<expression-string>uuid_exploring</expression-string>
<language-id>Python</language-id>
<evaluation-mode>EXPRESSION</evaluation-mode>
</expression>
<expression>
<expression-string>self.database_is_up()</expression-string>
<language-id>Python</language-id>
<evaluation-mode>EXPRESSION</evaluation-mode>
</expression>
<expression>
<expression-string>self.get_max_tasks_today()</expression-string>
<language-id>Python</language-id>
<evaluation-mode>EXPRESSION</evaluation-mode>
</expression>
<expression>
<expression-string>self.get_tasks_created_today()</expression-string>
<language-id>Python</language-id>
<evaluation-mode>EXPRESSION</evaluation-mode>
</expression>
@ -630,13 +734,6 @@
</state>
</provider>
</entry>
<entry file="file:///usr/lib/python3/dist-packages/apport_python_hook.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="211">
<caret line="48" selection-start-line="48" selection-end-line="48" />
</state>
</provider>
</entry>
<entry file="file:///usr/lib/python3/dist-packages/mysql/connector/cursor.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="203">
@ -644,16 +741,6 @@
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/core/mysql_wrapper.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="675">
<caret line="45" column="23" selection-start-line="45" selection-start-column="23" selection-end-line="45" selection-end-column="23" />
<folding>
<element signature="e#24#46#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/db_layer/capturas_interface.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="300">
@ -661,13 +748,6 @@
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/db_layer/capturing_tasks_interface.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="930">
<caret line="62" column="15" selection-start-line="62" selection-start-column="15" selection-end-line="62" selection-end-column="15" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/tests/capturer_tests.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="225">
@ -675,17 +755,51 @@
</state>
</provider>
</entry>
<entry file="file://$APPLICATION_HOME_DIR$/helpers/pydev/_pydev_imps/_pydev_execfile.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="255">
<caret line="17" selection-start-line="17" selection-end-line="17" />
</state>
</provider>
</entry>
<entry file="file:///usr/lib/python3/dist-packages/apport_python_hook.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="211">
<caret line="46" selection-start-line="46" selection-end-line="46" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/core/mysql_wrapper.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="263">
<caret line="47" selection-start-line="47" selection-end-line="47" />
<folding>
<element signature="e#24#46#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/explorer/explorer.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="479">
<caret line="282" lean-forward="true" selection-start-line="282" selection-end-line="284" selection-end-column="20" />
<state relative-caret-position="401">
<caret line="283" selection-start-line="283" selection-end-line="283" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/db_layer/capturing_tasks_interface.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="265">
<caret line="66" column="36" lean-forward="true" selection-start-line="66" selection-start-column="36" selection-end-line="66" selection-end-column="36" />
<folding>
<element signature="e#0#11#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/capturer/capturer.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="344">
<caret line="228" column="82" lean-forward="true" selection-start-line="228" selection-start-column="82" selection-end-line="228" selection-end-column="82" />
<state relative-caret-position="285">
<caret line="19" lean-forward="true" selection-start-line="19" selection-end-line="19" />
</state>
</provider>
</entry>

View file

@ -3,6 +3,7 @@ sys.path.append('..')
from time import sleep
from bs4 import BeautifulSoup
import re
import datetime
from db_layer.capturing_tasks_interface import capturing_interface
from db_layer.capturas_interface import capturas_interface
from core.scrapping_utils import UrlAttack
@ -11,13 +12,15 @@ from core.scrapping_utils import UrlAttack
class Capturer:
sleep_time_no_work = 60
minimum_seconds_between_tries = 120
working_hours = {'start': datetime.time(9, 0, 0),
'end': datetime.time(21, 0, 0)}
def start(self):
while True:
if (capturing_interface.get_pending_task() is None
or capturing_interface.seconds_since_last_try() < Capturer.minimum_seconds_between_tries):
or capturing_interface.seconds_since_last_try() < Capturer.minimum_seconds_between_tries
or not self.in_working_hours()):
sleep(Capturer.sleep_time_no_work)
continue
@ -32,6 +35,10 @@ class Capturer:
capturas_interface.insert_captura(ad_data)
task._update_status('Captura inserted')
def in_working_hours(self):
    """
    Tell whether the current local time falls inside the capturing window.

    :return: True when the present time of day lies between
             Capturer.working_hours['start'] and
             Capturer.working_hours['end'], both ends included
    """
    current_time = datetime.datetime.now().time()
    window = Capturer.working_hours
    # Too early: the window has not opened yet.
    if current_time < window['start']:
        return False
    # Otherwise we are in hours as long as the window has not closed.
    return current_time <= window['end']
class CapturingTask:
sleep_time_failed_request = 60

View file

@ -1,3 +1,4 @@
import uuid
from core.mysql_wrapper import get_tasksdb
class CapturingTasksInterface:
@ -36,9 +37,9 @@ class CapturingTasksInterface:
"""
cursor = self.tasksdb.query(query_statement, dictionary=True)
if cursor.rowcount:
try:
return cursor.fetchone()
else:
except:
return None
def update_capturing_task(self, uuid, uuid_exploring, status, ad_url):
@ -62,7 +63,7 @@ class CapturingTasksInterface:
query_statement = """SELECT TIME_TO_SEC(TIMEDIFF(now(), write_time))
FROM capturing_tasks_logs
WHERE status = 'Loading'
ORDER BY write_time
ORDER BY write_time DESC
LIMIT 1
"""

View file

@ -17,7 +17,7 @@ class Explorer():
sleep_time_no_work = 60
sleep_time_no_service = 600
working_hours = {'start': datetime.time(9, 0, 0),
'end': datetime.time(18, 0, 0)}
'end': datetime.time(21, 0, 0)}
monthly_capture_target = 1000
ad_types = {'1': 'alquiler',
'2': 'venta'}
@ -38,6 +38,7 @@ class Explorer():
while True:
if not self.there_is_work():
print('{}: Waiting. No work'.format(datetime.datetime.now()))
sleep(Explorer.sleep_time_no_work)
continue
@ -47,17 +48,16 @@ class Explorer():
current_task = ExploringTask(self.compose_listing_url())
current_task.explore()
print('{}: Exploring done'.format(datetime.datetime.now()))
if current_task.status == 'Referencias ready':
referencias = current_task.get_referencias()
for referencia in referencias:
capturing_interface.create_capturing_task(referencia)
current_task._update_status("Sent to queue")
capturing_interface.create_capturing_task(referencia, current_task.id)
current_task._update_status("Sent to queue")
continue
self.stop()
def stop(self):
#TODO Detener el servicio
@ -160,7 +160,7 @@ class Explorer():
"""
cursor_result = self.tasksdb.query(query_statement)
return cursor_result.row_count
return cursor_result.fetchone()[0]
def compose_listing_url(self):
"""
@ -168,7 +168,7 @@ class Explorer():
:return:
"""
root = 'https://www.idealista.com/'
type = ad_type[str(randint(1,2))]
type = Explorer.ad_types[str(randint(1,2))]
city = 'barcelona'
page_number = str(randint(1,30))
url = root + type + '-garajes/' + city + '-' + city + '/' + \