Avanzado en desarrollo de capturing task.
This commit is contained in:
parent
7b47b1e766
commit
1d68575fd7
2 changed files with 125 additions and 40 deletions
70
.idea/workspace.xml
generated
70
.idea/workspace.xml
generated
|
|
@ -4,7 +4,6 @@
|
|||
<list default="true" id="6e2fbba0-85ff-42d6-8e70-e4cdef1000c8" name="Default Changelist" comment="">
|
||||
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/capturer/capturer.py" beforeDir="false" afterPath="$PROJECT_DIR$/capturer/capturer.py" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/explorer/explorer.py" beforeDir="false" afterPath="$PROJECT_DIR$/explorer/explorer.py" afterDir="false" />
|
||||
</list>
|
||||
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
|
||||
<option name="SHOW_DIALOG" value="false" />
|
||||
|
|
@ -16,12 +15,13 @@
|
|||
<session id="1687213926">
|
||||
<usages-collector id="statistics.lifecycle.project">
|
||||
<counts>
|
||||
<entry key="project.closed" value="1" />
|
||||
<entry key="project.closed" value="2" />
|
||||
<entry key="project.open.time.0" value="1" />
|
||||
<entry key="project.open.time.12" value="1" />
|
||||
<entry key="project.open.time.13" value="2" />
|
||||
<entry key="project.open.time.14" value="2" />
|
||||
<entry key="project.opened" value="6" />
|
||||
<entry key="project.open.time.21" value="1" />
|
||||
<entry key="project.opened" value="7" />
|
||||
</counts>
|
||||
</usages-collector>
|
||||
<usages-collector id="statistics.file.extensions.open">
|
||||
|
|
@ -37,14 +37,14 @@
|
|||
<usages-collector id="statistics.file.extensions.edit">
|
||||
<counts>
|
||||
<entry key="Python Console" value="1519" />
|
||||
<entry key="py" value="4904" />
|
||||
<entry key="txt" value="569" />
|
||||
<entry key="py" value="6444" />
|
||||
<entry key="txt" value="647" />
|
||||
</counts>
|
||||
</usages-collector>
|
||||
<usages-collector id="statistics.file.types.edit">
|
||||
<counts>
|
||||
<entry key="PLAIN_TEXT" value="569" />
|
||||
<entry key="Python" value="6423" />
|
||||
<entry key="PLAIN_TEXT" value="647" />
|
||||
<entry key="Python" value="7963" />
|
||||
</counts>
|
||||
</usages-collector>
|
||||
<usages-collector id="statistics.vcs.git.usages">
|
||||
|
|
@ -64,13 +64,13 @@
|
|||
<file pinned="false" current-in-tab="true">
|
||||
<entry file="file://$PROJECT_DIR$/capturer/capturer.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="285">
|
||||
<caret line="102" column="32" lean-forward="true" selection-start-line="102" selection-start-column="32" selection-end-line="102" selection-end-column="32" />
|
||||
<state relative-caret-position="377">
|
||||
<caret line="164" column="8" selection-start-line="164" selection-start-column="8" selection-end-line="164" selection-end-column="8" />
|
||||
<folding>
|
||||
<element signature="e#1322#1377#0" />
|
||||
<element signature="e#1418#1920#0" />
|
||||
<marker date="1537653289723" expanded="true" signature="1082:1083" ph="..." />
|
||||
<marker date="1537653289723" expanded="true" signature="2990:3080" ph="..." />
|
||||
<marker date="1537729307226" expanded="true" signature="1206:1207" ph="..." />
|
||||
<marker date="1537729307226" expanded="true" signature="4143:4296" ph="..." />
|
||||
<marker date="1537729307226" expanded="true" signature="4911:5209" ph="..." />
|
||||
<marker date="1537729307226" expanded="true" signature="5383:5398" ph="..." />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
|
|
@ -83,10 +83,9 @@
|
|||
<file pinned="false" current-in-tab="true">
|
||||
<entry file="file://$PROJECT_DIR$/explorer/explorer.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="260">
|
||||
<caret line="52" column="29" selection-start-line="52" selection-start-column="29" selection-end-line="52" selection-end-column="29" />
|
||||
<state relative-caret-position="75">
|
||||
<caret line="5" selection-start-line="5" selection-end-line="5" selection-end-column="22" />
|
||||
<folding>
|
||||
<element signature="e#6100#9639#0" />
|
||||
<marker date="1537653289735" expanded="true" signature="5088:5540" ph="..." />
|
||||
</folding>
|
||||
</state>
|
||||
|
|
@ -105,7 +104,7 @@
|
|||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/core/alerts.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="105">
|
||||
<state relative-caret-position="75">
|
||||
<caret line="7" column="34" selection-start-line="7" selection-start-column="34" selection-end-line="7" selection-end-column="34" />
|
||||
<folding>
|
||||
<element signature="e#0#46#0" expanded="true" />
|
||||
|
|
@ -152,8 +151,8 @@
|
|||
<option value="$PROJECT_DIR$/core/alerts.py" />
|
||||
<option value="$PROJECT_DIR$/core/task.py" />
|
||||
<option value="$PROJECT_DIR$/capturer/__init__.py" />
|
||||
<option value="$PROJECT_DIR$/capturer/capturer.py" />
|
||||
<option value="$PROJECT_DIR$/explorer/explorer.py" />
|
||||
<option value="$PROJECT_DIR$/capturer/capturer.py" />
|
||||
</list>
|
||||
</option>
|
||||
</component>
|
||||
|
|
@ -283,7 +282,14 @@
|
|||
<option name="project" value="LOCAL" />
|
||||
<updated>1537651070019</updated>
|
||||
</task>
|
||||
<option name="localTasksCounter" value="5" />
|
||||
<task id="LOCAL-00005" summary="Iniciadas clases de capturing_task y scraptargetfield.">
|
||||
<created>1537653361355</created>
|
||||
<option name="number" value="00005" />
|
||||
<option name="presentableId" value="LOCAL-00005" />
|
||||
<option name="project" value="LOCAL" />
|
||||
<updated>1537653361355</updated>
|
||||
</task>
|
||||
<option name="localTasksCounter" value="6" />
|
||||
<servers />
|
||||
</component>
|
||||
<component name="TodoView" selected-index="1">
|
||||
|
|
@ -299,8 +305,8 @@
|
|||
<frame x="0" y="-2" width="1920" height="1082" extended-state="6" />
|
||||
<editor active="true" />
|
||||
<layout>
|
||||
<window_info content_ui="combo" id="Project" order="0" sideWeight="0.48982784" visible="true" weight="0.14918292" />
|
||||
<window_info id="Structure" order="1" sideWeight="0.5101721" side_tool="true" visible="true" weight="0.14918292" />
|
||||
<window_info content_ui="combo" id="Project" order="0" sideWeight="0.48826292" visible="true" weight="0.14918292" />
|
||||
<window_info id="Structure" order="1" sideWeight="0.5117371" side_tool="true" visible="true" weight="0.14918292" />
|
||||
<window_info id="Favorites" order="2" sideWeight="0.5015674" side_tool="true" weight="0.14918292" />
|
||||
<window_info active="true" id="Repositories" order="3" sideWeight="0.49529782" visible="true" weight="0.32999474" />
|
||||
<window_info anchor="bottom" id="Message" order="0" />
|
||||
|
|
@ -354,7 +360,8 @@
|
|||
<MESSAGE value="Pequeños detalles en Explorer." />
|
||||
<MESSAGE value="Refactorizado Explorer para que el Explorer, y no el exploring_task, se encargue de postear las tareas de captura. Creado una funcion independiente de creacion de capturas para que sea compartida entre todos aquellos servicios que las creen." />
|
||||
<MESSAGE value="Corregidos pequeños errores y type tras primer test del servicio." />
|
||||
<option name="LAST_COMMIT_MESSAGE" value="Corregidos pequeños errores y type tras primer test del servicio." />
|
||||
<MESSAGE value="Iniciadas clases de capturing_task y scraptargetfield." />
|
||||
<option name="LAST_COMMIT_MESSAGE" value="Iniciadas clases de capturing_task y scraptargetfield." />
|
||||
</component>
|
||||
<component name="editorHistoryManager">
|
||||
<entry file="file://$PROJECT_DIR$/explorer/test_explorer.py" />
|
||||
|
|
@ -385,7 +392,7 @@
|
|||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/core/alerts.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="105">
|
||||
<state relative-caret-position="75">
|
||||
<caret line="7" column="34" selection-start-line="7" selection-start-column="34" selection-end-line="7" selection-end-column="34" />
|
||||
<folding>
|
||||
<element signature="e#0#46#0" expanded="true" />
|
||||
|
|
@ -395,10 +402,9 @@
|
|||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/explorer/explorer.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="260">
|
||||
<caret line="52" column="29" selection-start-line="52" selection-start-column="29" selection-end-line="52" selection-end-column="29" />
|
||||
<state relative-caret-position="75">
|
||||
<caret line="5" selection-start-line="5" selection-end-line="5" selection-end-column="22" />
|
||||
<folding>
|
||||
<element signature="e#6100#9639#0" />
|
||||
<marker date="1537653289735" expanded="true" signature="5088:5540" ph="..." />
|
||||
</folding>
|
||||
</state>
|
||||
|
|
@ -406,13 +412,13 @@
|
|||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/capturer/capturer.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="285">
|
||||
<caret line="102" column="32" lean-forward="true" selection-start-line="102" selection-start-column="32" selection-end-line="102" selection-end-column="32" />
|
||||
<state relative-caret-position="377">
|
||||
<caret line="164" column="8" selection-start-line="164" selection-start-column="8" selection-end-line="164" selection-end-column="8" />
|
||||
<folding>
|
||||
<element signature="e#1322#1377#0" />
|
||||
<element signature="e#1418#1920#0" />
|
||||
<marker date="1537653289723" expanded="true" signature="1082:1083" ph="..." />
|
||||
<marker date="1537653289723" expanded="true" signature="2990:3080" ph="..." />
|
||||
<marker date="1537729307226" expanded="true" signature="1206:1207" ph="..." />
|
||||
<marker date="1537729307226" expanded="true" signature="4143:4296" ph="..." />
|
||||
<marker date="1537729307226" expanded="true" signature="4911:5209" ph="..." />
|
||||
<marker date="1537729307226" expanded="true" signature="5383:5398" ph="..." />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
|
|
|
|||
|
|
@ -1,13 +1,18 @@
|
|||
import sys
|
||||
sys.path.append('..')
|
||||
import uuid
|
||||
from time import sleep
|
||||
from core.mysql_wrapper import get_anunciosdb, get_tasksdb
|
||||
from core.scrapping_utils import UrlAttack
|
||||
from core.alerts import alert_master
|
||||
|
||||
|
||||
ads_root = 'https://www.idealista.com/inmueble/'
|
||||
|
||||
# TODO: Create the list of fields
|
||||
|
||||
ad_fields_parameters = []
|
||||
|
||||
|
||||
def create_capturing_task(referencia, db_wrapper, uuid_exploring=None):
|
||||
|
||||
|
|
@ -29,15 +34,17 @@ def create_capturing_task(referencia, db_wrapper, uuid_exploring=None):
|
|||
|
||||
class CapturingTask:
|
||||
|
||||
sleep_time_failed_request = 60
|
||||
|
||||
def __init__(self, parameters):
    """
    Set up a capturing task from its parameter mapping.

    `parameters` is indexed with the keys 'uuid', 'ad_url' and
    'uuid_exploring'; a missing key raises KeyError.
    """
    initial_status = 'Loading'

    # Identity and bookkeeping of the task.
    self.uuid = parameters['uuid']
    self.status = initial_status
    self.ad_url = parameters['ad_url']
    self.uuid_exploring = parameters['uuid_exploring']
    self.request_failures = 0

    # Handle to the tasks database, obtained before the task is logged.
    self.tasksdb = get_tasksdb()

    self._log_in_tasksdb()
    self._update_status(initial_status)
|
||||
|
||||
def _update_status(self, new_status):
|
||||
self.status = new_status
|
||||
|
|
@ -65,9 +72,57 @@ class CapturingTask:
|
|||
Metodo principal que contiene el flujo de captura
|
||||
"""
|
||||
#TODO Desarrollar flujo de captura
|
||||
self._update_status('WIP')
|
||||
|
||||
self._read_fields()
|
||||
|
||||
while self.request_failures < 4:
|
||||
attack = UrlAttack(self.ad_url)
|
||||
attack.attack()
|
||||
|
||||
if attack.success():
|
||||
self.html = attack.get_text()
|
||||
if not self._html_is_valid():
|
||||
alert_master('ERROR CAPTURER',
|
||||
'El HTML de una pagina de anuncio es invalido')
|
||||
|
||||
with self._fields_not_present() as missing_fields:
|
||||
if missing_fields:
|
||||
alert_master('ERROR CAPTURER',
|
||||
'Los siguientes campos no estaban presentes {}. '
|
||||
'URL = {}'.format(missing_fields, self.ad_url))
|
||||
self._update_status('Dead ad')
|
||||
return
|
||||
|
||||
with self._fields_not_valid() as unvalid_fields:
|
||||
if unvalid_fields:
|
||||
alert_master('ERROR CAPTURER',
|
||||
'Los siguientes campos no tenian valores presentes {}'
|
||||
'URL = {}'.format(unvalid_fields, self.ad_url))
|
||||
self._update_status('Dead ad')
|
||||
return
|
||||
|
||||
#Extraer datos
|
||||
self.extract_data()
|
||||
#Geocodear
|
||||
self.geocode()
|
||||
|
||||
else:
|
||||
self.request_failures += 1
|
||||
self._update_status('Fail {}'.format(self.request_failures))
|
||||
sleep(sleep_time_failed_request)
|
||||
continue
|
||||
|
||||
self._update_status('Surrender')
|
||||
|
||||
|
||||
def _html_is_valid(selfself, html=self.html):
|
||||
def _read_fields(self):
    """
    Fill `self.fields` with one ScrapTargetField per entry of the
    module-level `ad_fields_parameters` list, in the same order.
    """
    self.fields = []
    self.fields.extend(
        ScrapTargetField(spec) for spec in ad_fields_parameters
    )
|
||||
|
||||
|
||||
def _html_is_valid(self, html=self.html):
|
||||
"""
|
||||
Lee el HTML y aplica normas de validación del contenido
|
||||
"""
|
||||
|
|
@ -78,18 +133,42 @@ class CapturingTask:
|
|||
#TODO Check de longitud
|
||||
pass
|
||||
|
||||
def _fields_not_present(self, field_list, html=self.html):
|
||||
def _fields_not_present(self, html=None):
    """
    Return the names of the fields in `self.fields` that are missing from
    the HTML.

    :param html: page text to inspect; when None, `self.html` is used.
    :return: list with the `name` of every field whose `exists(html)` is
        falsy, in the order of `self.fields`.
    """
    # The default is resolved here rather than in the signature: a
    # `html=self.html` default is evaluated when the `def` statement runs,
    # where no `self` exists, and fails with NameError.
    if html is None:
        html = self.html

    return [field.name for field in self.fields if not field.exists(html)]
|
||||
|
||||
def _fields_not_valid(self, html=None):
    """
    Return the names of the fields in `self.fields` whose value in the
    HTML is not valid.

    :param html: page text to inspect; when None, `self.html` is used.
    :return: list with the `name` of every field whose
        `validate_value(html)` is falsy, in the order of `self.fields`.
    """
    # The default is resolved here rather than in the signature: a
    # `html=self.html` default is evaluated when the `def` statement runs,
    # where no `self` exists, and fails with NameError.
    if html is None:
        html = self.html

    return [
        field.name for field in self.fields
        if not field.validate_value(html)
    ]
|
||||
|
||||
def extract_data(self):
    """
    Rebuild `self.ad_data` from the page stored in `self.html`.

    The result maps each field's `name` to whatever that field's
    `get_value` returns for the page. Any previous `ad_data` is replaced.
    """
    collected = self.ad_data = {}

    for target in self.fields:
        collected[target.name] = target.get_value(self.html)
|
||||
|
||||
def geocode(self):
    """
    Placeholder for the geocoding step; it currently does nothing.
    """
    # TODO: Build the geocoding method. Perhaps in another class?
    return None
|
||||
|
||||
|
||||
def get_ad_data(self):
    """
    Return the object currently stored in `self.ad_data`.

    The same object is handed back, not a copy. In the code visible here
    the attribute is assigned by `extract_data`, so this presumably
    raises AttributeError if called before extraction -- confirm against
    the rest of the class.
    """
    captured = self.ad_data
    return captured
|
||||
|
||||
|
||||
class ScrapTargetField:
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue