Iniciadas clases de capturing_task y scraptargetfield.
This commit is contained in:
parent
80d65b7a7c
commit
7b47b1e766
3 changed files with 165 additions and 57 deletions
120
.idea/workspace.xml
generated
120
.idea/workspace.xml
generated
|
|
@ -26,25 +26,25 @@
|
|||
</usages-collector>
|
||||
<usages-collector id="statistics.file.extensions.open">
|
||||
<counts>
|
||||
<entry key="py" value="10" />
|
||||
<entry key="py" value="14" />
|
||||
</counts>
|
||||
</usages-collector>
|
||||
<usages-collector id="statistics.file.types.open">
|
||||
<counts>
|
||||
<entry key="Python" value="10" />
|
||||
<entry key="Python" value="14" />
|
||||
</counts>
|
||||
</usages-collector>
|
||||
<usages-collector id="statistics.file.extensions.edit">
|
||||
<counts>
|
||||
<entry key="Python Console" value="1519" />
|
||||
<entry key="py" value="3070" />
|
||||
<entry key="txt" value="472" />
|
||||
<entry key="py" value="4904" />
|
||||
<entry key="txt" value="569" />
|
||||
</counts>
|
||||
</usages-collector>
|
||||
<usages-collector id="statistics.file.types.edit">
|
||||
<counts>
|
||||
<entry key="PLAIN_TEXT" value="472" />
|
||||
<entry key="Python" value="4589" />
|
||||
<entry key="PLAIN_TEXT" value="569" />
|
||||
<entry key="Python" value="6423" />
|
||||
</counts>
|
||||
</usages-collector>
|
||||
<usages-collector id="statistics.vcs.git.usages">
|
||||
|
|
@ -61,34 +61,33 @@
|
|||
<splitter split-orientation="horizontal" split-proportion="0.5">
|
||||
<split-first>
|
||||
<leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
|
||||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/explorer/explorer.py">
|
||||
<file pinned="false" current-in-tab="true">
|
||||
<entry file="file://$PROJECT_DIR$/capturer/capturer.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="2117">
|
||||
<caret line="176" column="18" lean-forward="true" selection-start-line="176" selection-start-column="18" selection-end-line="176" selection-end-column="18" />
|
||||
<state relative-caret-position="285">
|
||||
<caret line="102" column="32" lean-forward="true" selection-start-line="102" selection-start-column="32" selection-end-line="102" selection-end-column="32" />
|
||||
<folding>
|
||||
<marker date="1537650937299" expanded="true" signature="5092:5544" ph="..." />
|
||||
<element signature="e#1322#1377#0" />
|
||||
<element signature="e#1418#1920#0" />
|
||||
<marker date="1537653289723" expanded="true" signature="1082:1083" ph="..." />
|
||||
<marker date="1537653289723" expanded="true" signature="2990:3080" ph="..." />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
</leaf>
|
||||
</split-first>
|
||||
<split-second>
|
||||
<leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
|
||||
<file pinned="false" current-in-tab="true">
|
||||
<entry file="file://$PROJECT_DIR$/capturer/capturer.py">
|
||||
<entry file="file://$PROJECT_DIR$/explorer/explorer.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="315">
|
||||
<caret line="21" lean-forward="true" selection-start-line="21" selection-end-line="21" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/core/alerts.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="75">
|
||||
<caret line="7" column="34" selection-start-line="7" selection-start-column="34" selection-end-line="7" selection-end-column="34" />
|
||||
<state relative-caret-position="260">
|
||||
<caret line="52" column="29" selection-start-line="52" selection-start-column="29" selection-end-line="52" selection-end-column="29" />
|
||||
<folding>
|
||||
<element signature="e#0#46#0" expanded="true" />
|
||||
<element signature="e#6100#9639#0" />
|
||||
<marker date="1537653289735" expanded="true" signature="5088:5540" ph="..." />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
|
|
@ -103,17 +102,13 @@
|
|||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
</leaf>
|
||||
</split-first>
|
||||
<split-second>
|
||||
<leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
|
||||
<file pinned="false" current-in-tab="true">
|
||||
<entry file="file://$PROJECT_DIR$/explorer/explorer.py">
|
||||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/core/alerts.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="317">
|
||||
<caret line="273" column="44" selection-start-line="273" selection-start-column="44" selection-end-line="273" selection-end-column="44" />
|
||||
<state relative-caret-position="105">
|
||||
<caret line="7" column="34" selection-start-line="7" selection-start-column="34" selection-end-line="7" selection-end-column="34" />
|
||||
<folding>
|
||||
<marker date="1537650937299" expanded="true" signature="5092:5544" ph="..." />
|
||||
<element signature="e#0#46#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
|
|
@ -157,8 +152,8 @@
|
|||
<option value="$PROJECT_DIR$/core/alerts.py" />
|
||||
<option value="$PROJECT_DIR$/core/task.py" />
|
||||
<option value="$PROJECT_DIR$/capturer/__init__.py" />
|
||||
<option value="$PROJECT_DIR$/explorer/explorer.py" />
|
||||
<option value="$PROJECT_DIR$/capturer/capturer.py" />
|
||||
<option value="$PROJECT_DIR$/explorer/explorer.py" />
|
||||
</list>
|
||||
</option>
|
||||
</component>
|
||||
|
|
@ -281,7 +276,14 @@
|
|||
<option name="project" value="LOCAL" />
|
||||
<updated>1537546774036</updated>
|
||||
</task>
|
||||
<option name="localTasksCounter" value="4" />
|
||||
<task id="LOCAL-00004" summary="Corregidos pequeños errores y type tras primer test del servicio.">
|
||||
<created>1537651070019</created>
|
||||
<option name="number" value="00004" />
|
||||
<option name="presentableId" value="LOCAL-00004" />
|
||||
<option name="project" value="LOCAL" />
|
||||
<updated>1537651070019</updated>
|
||||
</task>
|
||||
<option name="localTasksCounter" value="5" />
|
||||
<servers />
|
||||
</component>
|
||||
<component name="TodoView" selected-index="1">
|
||||
|
|
@ -297,17 +299,17 @@
|
|||
<frame x="0" y="-2" width="1920" height="1082" extended-state="6" />
|
||||
<editor active="true" />
|
||||
<layout>
|
||||
<window_info content_ui="combo" id="Project" order="0" sideWeight="0.4905956" weight="0.14918292" />
|
||||
<window_info id="Structure" order="1" sideWeight="0.50940436" side_tool="true" visible="true" weight="0.14918292" />
|
||||
<window_info content_ui="combo" id="Project" order="0" sideWeight="0.48982784" visible="true" weight="0.14918292" />
|
||||
<window_info id="Structure" order="1" sideWeight="0.5101721" side_tool="true" visible="true" weight="0.14918292" />
|
||||
<window_info id="Favorites" order="2" sideWeight="0.5015674" side_tool="true" weight="0.14918292" />
|
||||
<window_info active="true" id="Repositories" order="3" sideWeight="0.49529782" visible="true" weight="0.32999474" />
|
||||
<window_info anchor="bottom" id="Message" order="0" />
|
||||
<window_info active="true" anchor="bottom" id="Find" order="1" visible="true" weight="0.32983193" />
|
||||
<window_info anchor="bottom" id="Find" order="1" weight="0.32983193" />
|
||||
<window_info anchor="bottom" id="Run" order="2" weight="0.32983193" />
|
||||
<window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
|
||||
<window_info anchor="bottom" id="Cvs" order="4" weight="0.25" />
|
||||
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
|
||||
<window_info anchor="bottom" id="TODO" order="6" sideWeight="0.49973643" weight="0.32914045" />
|
||||
<window_info active="true" anchor="bottom" id="TODO" order="6" sideWeight="0.49973643" visible="true" weight="0.32878152" />
|
||||
<window_info anchor="bottom" id="Version Control" order="7" sideWeight="0.49973643" weight="0.269958" />
|
||||
<window_info anchor="bottom" id="Terminal" order="8" weight="0.32983193" />
|
||||
<window_info anchor="bottom" id="Event Log" order="9" sideWeight="0.5007907" side_tool="true" weight="0.32983193" />
|
||||
|
|
@ -351,7 +353,8 @@
|
|||
<MESSAGE value="Finalizado modulo de alertas. Testeado clase ExploringTask a fondo." />
|
||||
<MESSAGE value="Pequeños detalles en Explorer." />
|
||||
<MESSAGE value="Refactorizado Explorer para que el Explorer, y no el exploring_task, se encargue de postear las tareas de captura. Creado una funcion independiente de creacion de capturas para que sea compartida entre todos aquellos servicios que las creen." />
|
||||
<option name="LAST_COMMIT_MESSAGE" value="Refactorizado Explorer para que el Explorer, y no el exploring_task, se encargue de postear las tareas de captura. Creado una funcion independiente de creacion de capturas para que sea compartida entre todos aquellos servicios que las creen." />
|
||||
<MESSAGE value="Corregidos pequeños errores y type tras primer test del servicio." />
|
||||
<option name="LAST_COMMIT_MESSAGE" value="Corregidos pequeños errores y type tras primer test del servicio." />
|
||||
</component>
|
||||
<component name="editorHistoryManager">
|
||||
<entry file="file://$PROJECT_DIR$/explorer/test_explorer.py" />
|
||||
|
|
@ -373,16 +376,6 @@
|
|||
<entry file="file://$PROJECT_DIR$/capturer/__init__.py">
|
||||
<provider selected="true" editor-type-id="text-editor" />
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/core/alerts.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="75">
|
||||
<caret line="7" column="34" selection-start-line="7" selection-start-column="34" selection-end-line="7" selection-end-column="34" />
|
||||
<folding>
|
||||
<element signature="e#0#46#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/core/mysql_wrapper.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="540">
|
||||
|
|
@ -390,20 +383,37 @@
|
|||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/core/alerts.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="105">
|
||||
<caret line="7" column="34" selection-start-line="7" selection-start-column="34" selection-end-line="7" selection-end-column="34" />
|
||||
<folding>
|
||||
<element signature="e#0#46#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/explorer/explorer.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="2117">
|
||||
<caret line="176" column="18" lean-forward="true" selection-start-line="176" selection-start-column="18" selection-end-line="176" selection-end-column="18" />
|
||||
<state relative-caret-position="260">
|
||||
<caret line="52" column="29" selection-start-line="52" selection-start-column="29" selection-end-line="52" selection-end-column="29" />
|
||||
<folding>
|
||||
<marker date="1537650937299" expanded="true" signature="5092:5544" ph="..." />
|
||||
<element signature="e#6100#9639#0" />
|
||||
<marker date="1537653289735" expanded="true" signature="5088:5540" ph="..." />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/capturer/capturer.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="315">
|
||||
<caret line="21" lean-forward="true" selection-start-line="21" selection-end-line="21" />
|
||||
<state relative-caret-position="285">
|
||||
<caret line="102" column="32" lean-forward="true" selection-start-line="102" selection-start-column="32" selection-end-line="102" selection-end-column="32" />
|
||||
<folding>
|
||||
<element signature="e#1322#1377#0" />
|
||||
<element signature="e#1418#1920#0" />
|
||||
<marker date="1537653289723" expanded="true" signature="1082:1083" ph="..." />
|
||||
<marker date="1537653289723" expanded="true" signature="2990:3080" ph="..." />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
|
|
|
|||
|
|
@ -1,7 +1,13 @@
|
|||
import sys
|
||||
sys.path.append('..')
|
||||
import uuid
|
||||
from core.mysql_wrapper import get_anunciosdb, get_tasksdb
|
||||
from core.scrapping_utils import UrlAttack
|
||||
|
||||
ads_root = 'https://www.idealista.com/inmueble/'
|
||||
|
||||
#TODO Crear la lista de campos
|
||||
|
||||
|
||||
def create_capturing_task(referencia, db_wrapper, uuid_exploring=None):
|
||||
|
||||
|
|
@ -19,6 +25,98 @@ def create_capturing_task(referencia, db_wrapper, uuid_exploring=None):
|
|||
(uuid, write_time, status, url, fk_uuid_exploring)
|
||||
VALUES (%(uuid)s, NOW(), %(status)s, %(ad_url)s, %(uuid_exploring)s)"""
|
||||
|
||||
|
||||
db_wrapper.query(query_statement, query_parameters)
|
||||
|
||||
class CapturingTask:
    """
    One ad-capturing task.

    Keeps the task identifiers and current status, and writes a row to the
    ``capturing_tasks_logs`` table on creation and on every status change.
    """

    def __init__(self, parameters):
        """
        Store the task data, obtain the tasks database wrapper and log the
        initial 'Loading' status.

        :param parameters: mapping with the keys 'uuid', 'ad_url' and
            'uuid_exploring'
        """
        self.uuid = parameters['uuid']
        self.status = 'Loading'
        self.ad_url = parameters['ad_url']
        self.uuid_exploring = parameters['uuid_exploring']
        # Default HTML used by the validation helpers below. Nothing in this
        # class assigns it yet (the capture flow is still a TODO), so it
        # starts as None instead of being a missing attribute.
        self.html = None

        self.tasksdb = get_tasksdb()

        self._log_in_tasksdb()

    def _update_status(self, new_status):
        """
        Set a new status and immediately log it in the tasks database.

        :param new_status: value stored in ``self.status`` and written to
            the log row
        """
        self.status = new_status
        self._log_in_tasksdb()

    def _log_in_tasksdb(self):
        """
        Insert a row in the tasks database with the task UUID, a timestamp
        (SQL ``NOW()``), the current status, the ad URL and the UUID of the
        exploring task.
        """
        query_statement = """INSERT INTO capturing_tasks_logs
                             (uuid, write_time, status, ad_url, fk_uuid_exploring)
                             VALUES (%(uuid)s, NOW(), %(status)s, %(ad_url)s, %(fk_uuid_exploring)s)"""

        query_parameters = {'uuid': self.uuid,
                            'status': self.status,
                            'ad_url': self.ad_url,
                            'fk_uuid_exploring': self.uuid_exploring}

        self.tasksdb.query(query_statement, query_parameters)

    def capture(self):
        """
        Main method that holds the capture flow.
        """
        # TODO Develop the capture flow

    def _html_is_valid(self, html=None):
        """
        Read the HTML and apply the content validation rules.

        Not implemented yet: currently always returns None.

        :param html: HTML to validate; when omitted (None), ``self.html``
            is used
        """
        # A default of ``self.html`` cannot be written in the signature:
        # defaults are evaluated when the class body runs, where ``self``
        # does not exist. Resolve it at call time instead.
        if html is None:
            html = self.html

        # TODO Check whether the HTML is a blocking page

        # TODO Length check
        pass

    def _fields_not_present(self, field_list, html=None):
        """
        Read the HTML and return the names of the fields that are missing.

        :param field_list: iterable of parameter mappings, each one passed
            to ``ScrapTargetField``
        :param html: HTML to inspect; when omitted (None), ``self.html``
            is used
        :return: list with the ``name`` of every field whose ``exists(html)``
            is falsy, in the order of ``field_list``
        """
        # Same call-time resolution of the default as in _html_is_valid.
        if html is None:
            html = self.html

        fields = (ScrapTargetField(field_parameters)
                  for field_parameters in field_list)
        return [field.name for field in fields if not field.exists(html)]
|
||||
|
||||
|
||||
class ScrapTargetField:
    """
    A named piece of data to look for in an HTML document, together with
    the callables used to find it and to validate its value.
    """

    def __init__(self, target_parameters):
        """
        :param target_parameters: mapping with the keys 'name',
            'search_method' and 'validation_method'; the two methods are
            stored and later called with one argument
        """
        self.name = target_parameters['name']
        self.search_method = target_parameters['search_method']
        self.validation_method = target_parameters['validation_method']

    def exists(self, html):
        """
        Tell whether the field can be found in the given HTML.

        :return: False when ``search_method(html)`` returns None, True
            otherwise
        """
        found = self.search_method(html)
        return found is not None

    def validate_value(self, dato):
        """
        Check the value against this field's validation rule.

        :return: whatever ``validation_method(dato)`` returns
        """
        return self.validation_method(dato)

    def get_value(self, html):
        """
        Look up the field in the given HTML.

        :return: whatever ``search_method(html)`` returns
        """
        return self.search_method(html)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -43,7 +43,7 @@ class Explorer():
|
|||
|
||||
if not self.database_is_up():
|
||||
alert_master("SQL DOWN", "El explorer informa de que SQL esta caida. Actividad detenida")
|
||||
self.stop(self)
|
||||
self.stop()
|
||||
|
||||
current_task = ExploringTask(self.compose_listing_url())
|
||||
current_task.explore()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue