Avanzado en desarrollo de capturing task. Creado clase GeocodingTask
This commit is contained in:
parent
1d68575fd7
commit
9d947f7e2b
3 changed files with 152 additions and 52 deletions
94
.idea/workspace.xml
generated
94
.idea/workspace.xml
generated
|
|
@ -2,6 +2,7 @@
|
||||||
<project version="4">
|
<project version="4">
|
||||||
<component name="ChangeListManager">
|
<component name="ChangeListManager">
|
||||||
<list default="true" id="6e2fbba0-85ff-42d6-8e70-e4cdef1000c8" name="Default Changelist" comment="">
|
<list default="true" id="6e2fbba0-85ff-42d6-8e70-e4cdef1000c8" name="Default Changelist" comment="">
|
||||||
|
<change afterPath="$PROJECT_DIR$/capturer/geocoder.py" afterDir="false" />
|
||||||
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
|
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
|
||||||
<change beforePath="$PROJECT_DIR$/capturer/capturer.py" beforeDir="false" afterPath="$PROJECT_DIR$/capturer/capturer.py" afterDir="false" />
|
<change beforePath="$PROJECT_DIR$/capturer/capturer.py" beforeDir="false" afterPath="$PROJECT_DIR$/capturer/capturer.py" afterDir="false" />
|
||||||
</list>
|
</list>
|
||||||
|
|
@ -15,36 +16,36 @@
|
||||||
<session id="1687213926">
|
<session id="1687213926">
|
||||||
<usages-collector id="statistics.lifecycle.project">
|
<usages-collector id="statistics.lifecycle.project">
|
||||||
<counts>
|
<counts>
|
||||||
<entry key="project.closed" value="2" />
|
<entry key="project.closed" value="3" />
|
||||||
<entry key="project.open.time.0" value="1" />
|
<entry key="project.open.time.0" value="1" />
|
||||||
<entry key="project.open.time.12" value="1" />
|
<entry key="project.open.time.12" value="1" />
|
||||||
<entry key="project.open.time.13" value="2" />
|
<entry key="project.open.time.13" value="2" />
|
||||||
<entry key="project.open.time.14" value="2" />
|
<entry key="project.open.time.14" value="3" />
|
||||||
<entry key="project.open.time.21" value="1" />
|
<entry key="project.open.time.21" value="1" />
|
||||||
<entry key="project.opened" value="7" />
|
<entry key="project.opened" value="8" />
|
||||||
</counts>
|
</counts>
|
||||||
</usages-collector>
|
</usages-collector>
|
||||||
<usages-collector id="statistics.file.extensions.open">
|
<usages-collector id="statistics.file.extensions.open">
|
||||||
<counts>
|
<counts>
|
||||||
<entry key="py" value="14" />
|
<entry key="py" value="15" />
|
||||||
</counts>
|
</counts>
|
||||||
</usages-collector>
|
</usages-collector>
|
||||||
<usages-collector id="statistics.file.types.open">
|
<usages-collector id="statistics.file.types.open">
|
||||||
<counts>
|
<counts>
|
||||||
<entry key="Python" value="14" />
|
<entry key="Python" value="15" />
|
||||||
</counts>
|
</counts>
|
||||||
</usages-collector>
|
</usages-collector>
|
||||||
<usages-collector id="statistics.file.extensions.edit">
|
<usages-collector id="statistics.file.extensions.edit">
|
||||||
<counts>
|
<counts>
|
||||||
<entry key="Python Console" value="1519" />
|
<entry key="Python Console" value="1519" />
|
||||||
<entry key="py" value="6444" />
|
<entry key="py" value="8493" />
|
||||||
<entry key="txt" value="647" />
|
<entry key="txt" value="692" />
|
||||||
</counts>
|
</counts>
|
||||||
</usages-collector>
|
</usages-collector>
|
||||||
<usages-collector id="statistics.file.types.edit">
|
<usages-collector id="statistics.file.types.edit">
|
||||||
<counts>
|
<counts>
|
||||||
<entry key="PLAIN_TEXT" value="647" />
|
<entry key="PLAIN_TEXT" value="692" />
|
||||||
<entry key="Python" value="7963" />
|
<entry key="Python" value="10012" />
|
||||||
</counts>
|
</counts>
|
||||||
</usages-collector>
|
</usages-collector>
|
||||||
<usages-collector id="statistics.vcs.git.usages">
|
<usages-collector id="statistics.vcs.git.usages">
|
||||||
|
|
@ -64,13 +65,27 @@
|
||||||
<file pinned="false" current-in-tab="true">
|
<file pinned="false" current-in-tab="true">
|
||||||
<entry file="file://$PROJECT_DIR$/capturer/capturer.py">
|
<entry file="file://$PROJECT_DIR$/capturer/capturer.py">
|
||||||
<provider selected="true" editor-type-id="text-editor">
|
<provider selected="true" editor-type-id="text-editor">
|
||||||
<state relative-caret-position="377">
|
<state relative-caret-position="2325">
|
||||||
<caret line="164" column="8" selection-start-line="164" selection-start-column="8" selection-end-line="164" selection-end-column="8" />
|
<caret line="155" lean-forward="true" selection-start-line="155" selection-end-line="155" />
|
||||||
<folding>
|
<folding>
|
||||||
<marker date="1537729307226" expanded="true" signature="1206:1207" ph="..." />
|
<marker date="1537995271039" expanded="true" signature="1249:1250" ph="..." />
|
||||||
<marker date="1537729307226" expanded="true" signature="4143:4296" ph="..." />
|
<marker date="1537995271039" expanded="true" signature="4109:4257" ph="..." />
|
||||||
<marker date="1537729307226" expanded="true" signature="4911:5209" ph="..." />
|
<marker date="1537995271039" expanded="true" signature="4640:4938" ph="..." />
|
||||||
<marker date="1537729307226" expanded="true" signature="5383:5398" ph="..." />
|
<marker date="1537995271039" expanded="true" signature="5089:5094" ph="..." />
|
||||||
|
</folding>
|
||||||
|
</state>
|
||||||
|
</provider>
|
||||||
|
</entry>
|
||||||
|
</file>
|
||||||
|
<file pinned="false" current-in-tab="false">
|
||||||
|
<entry file="file://$PROJECT_DIR$/capturer/geocoder.py">
|
||||||
|
<provider selected="true" editor-type-id="text-editor">
|
||||||
|
<state relative-caret-position="77">
|
||||||
|
<caret line="17" column="14" selection-start-line="17" selection-start-column="14" selection-end-line="17" selection-end-column="14" />
|
||||||
|
<folding>
|
||||||
|
<marker date="1537995271051" expanded="true" signature="519:524" ph="..." />
|
||||||
|
<marker date="1537995271051" expanded="true" signature="708:826" ph="..." />
|
||||||
|
<marker date="1537995271051" expanded="true" signature="1349:1354" ph="..." />
|
||||||
</folding>
|
</folding>
|
||||||
</state>
|
</state>
|
||||||
</provider>
|
</provider>
|
||||||
|
|
@ -152,6 +167,7 @@
|
||||||
<option value="$PROJECT_DIR$/core/task.py" />
|
<option value="$PROJECT_DIR$/core/task.py" />
|
||||||
<option value="$PROJECT_DIR$/capturer/__init__.py" />
|
<option value="$PROJECT_DIR$/capturer/__init__.py" />
|
||||||
<option value="$PROJECT_DIR$/explorer/explorer.py" />
|
<option value="$PROJECT_DIR$/explorer/explorer.py" />
|
||||||
|
<option value="$PROJECT_DIR$/capturer/geocoder.py" />
|
||||||
<option value="$PROJECT_DIR$/capturer/capturer.py" />
|
<option value="$PROJECT_DIR$/capturer/capturer.py" />
|
||||||
</list>
|
</list>
|
||||||
</option>
|
</option>
|
||||||
|
|
@ -289,7 +305,14 @@
|
||||||
<option name="project" value="LOCAL" />
|
<option name="project" value="LOCAL" />
|
||||||
<updated>1537653361355</updated>
|
<updated>1537653361355</updated>
|
||||||
</task>
|
</task>
|
||||||
<option name="localTasksCounter" value="6" />
|
<task id="LOCAL-00006" summary="Avanzado en desarrollo de capturing task.">
|
||||||
|
<created>1537729440310</created>
|
||||||
|
<option name="number" value="00006" />
|
||||||
|
<option name="presentableId" value="LOCAL-00006" />
|
||||||
|
<option name="project" value="LOCAL" />
|
||||||
|
<updated>1537729440311</updated>
|
||||||
|
</task>
|
||||||
|
<option name="localTasksCounter" value="7" />
|
||||||
<servers />
|
<servers />
|
||||||
</component>
|
</component>
|
||||||
<component name="TodoView" selected-index="1">
|
<component name="TodoView" selected-index="1">
|
||||||
|
|
@ -305,8 +328,8 @@
|
||||||
<frame x="0" y="-2" width="1920" height="1082" extended-state="6" />
|
<frame x="0" y="-2" width="1920" height="1082" extended-state="6" />
|
||||||
<editor active="true" />
|
<editor active="true" />
|
||||||
<layout>
|
<layout>
|
||||||
<window_info content_ui="combo" id="Project" order="0" sideWeight="0.48826292" visible="true" weight="0.14918292" />
|
<window_info content_ui="combo" id="Project" order="0" sideWeight="0.48669797" visible="true" weight="0.14918292" />
|
||||||
<window_info id="Structure" order="1" sideWeight="0.5117371" side_tool="true" visible="true" weight="0.14918292" />
|
<window_info id="Structure" order="1" sideWeight="0.513302" side_tool="true" visible="true" weight="0.14918292" />
|
||||||
<window_info id="Favorites" order="2" sideWeight="0.5015674" side_tool="true" weight="0.14918292" />
|
<window_info id="Favorites" order="2" sideWeight="0.5015674" side_tool="true" weight="0.14918292" />
|
||||||
<window_info active="true" id="Repositories" order="3" sideWeight="0.49529782" visible="true" weight="0.32999474" />
|
<window_info active="true" id="Repositories" order="3" sideWeight="0.49529782" visible="true" weight="0.32999474" />
|
||||||
<window_info anchor="bottom" id="Message" order="0" />
|
<window_info anchor="bottom" id="Message" order="0" />
|
||||||
|
|
@ -361,7 +384,8 @@
|
||||||
<MESSAGE value="Refactorizado Explorer para que el Explorer, y no el exploring_task, se encargue de postear las tareas de captura. Creado una funcion independiente de creacion de capturas para que sea compartida entre todos aquellos servicios que las creen." />
|
<MESSAGE value="Refactorizado Explorer para que el Explorer, y no el exploring_task, se encargue de postear las tareas de captura. Creado una funcion independiente de creacion de capturas para que sea compartida entre todos aquellos servicios que las creen." />
|
||||||
<MESSAGE value="Corregidos pequeños errores y type tras primer test del servicio." />
|
<MESSAGE value="Corregidos pequeños errores y type tras primer test del servicio." />
|
||||||
<MESSAGE value="Iniciadas clases de capturing_task y scraptargetfield." />
|
<MESSAGE value="Iniciadas clases de capturing_task y scraptargetfield." />
|
||||||
<option name="LAST_COMMIT_MESSAGE" value="Iniciadas clases de capturing_task y scraptargetfield." />
|
<MESSAGE value="Avanzado en desarrollo de capturing task." />
|
||||||
|
<option name="LAST_COMMIT_MESSAGE" value="Avanzado en desarrollo de capturing task." />
|
||||||
</component>
|
</component>
|
||||||
<component name="editorHistoryManager">
|
<component name="editorHistoryManager">
|
||||||
<entry file="file://$PROJECT_DIR$/explorer/test_explorer.py" />
|
<entry file="file://$PROJECT_DIR$/explorer/test_explorer.py" />
|
||||||
|
|
@ -383,6 +407,16 @@
|
||||||
<entry file="file://$PROJECT_DIR$/capturer/__init__.py">
|
<entry file="file://$PROJECT_DIR$/capturer/__init__.py">
|
||||||
<provider selected="true" editor-type-id="text-editor" />
|
<provider selected="true" editor-type-id="text-editor" />
|
||||||
</entry>
|
</entry>
|
||||||
|
<entry file="file://$PROJECT_DIR$/explorer/explorer.py">
|
||||||
|
<provider selected="true" editor-type-id="text-editor">
|
||||||
|
<state relative-caret-position="75">
|
||||||
|
<caret line="5" selection-start-line="5" selection-end-line="5" selection-end-column="22" />
|
||||||
|
<folding>
|
||||||
|
<marker date="1537653289735" expanded="true" signature="5088:5540" ph="..." />
|
||||||
|
</folding>
|
||||||
|
</state>
|
||||||
|
</provider>
|
||||||
|
</entry>
|
||||||
<entry file="file://$PROJECT_DIR$/core/mysql_wrapper.py">
|
<entry file="file://$PROJECT_DIR$/core/mysql_wrapper.py">
|
||||||
<provider selected="true" editor-type-id="text-editor">
|
<provider selected="true" editor-type-id="text-editor">
|
||||||
<state relative-caret-position="540">
|
<state relative-caret-position="540">
|
||||||
|
|
@ -400,25 +434,27 @@
|
||||||
</state>
|
</state>
|
||||||
</provider>
|
</provider>
|
||||||
</entry>
|
</entry>
|
||||||
<entry file="file://$PROJECT_DIR$/explorer/explorer.py">
|
<entry file="file://$PROJECT_DIR$/capturer/geocoder.py">
|
||||||
<provider selected="true" editor-type-id="text-editor">
|
<provider selected="true" editor-type-id="text-editor">
|
||||||
<state relative-caret-position="75">
|
<state relative-caret-position="77">
|
||||||
<caret line="5" selection-start-line="5" selection-end-line="5" selection-end-column="22" />
|
<caret line="17" column="14" selection-start-line="17" selection-start-column="14" selection-end-line="17" selection-end-column="14" />
|
||||||
<folding>
|
<folding>
|
||||||
<marker date="1537653289735" expanded="true" signature="5088:5540" ph="..." />
|
<marker date="1537995271051" expanded="true" signature="519:524" ph="..." />
|
||||||
|
<marker date="1537995271051" expanded="true" signature="708:826" ph="..." />
|
||||||
|
<marker date="1537995271051" expanded="true" signature="1349:1354" ph="..." />
|
||||||
</folding>
|
</folding>
|
||||||
</state>
|
</state>
|
||||||
</provider>
|
</provider>
|
||||||
</entry>
|
</entry>
|
||||||
<entry file="file://$PROJECT_DIR$/capturer/capturer.py">
|
<entry file="file://$PROJECT_DIR$/capturer/capturer.py">
|
||||||
<provider selected="true" editor-type-id="text-editor">
|
<provider selected="true" editor-type-id="text-editor">
|
||||||
<state relative-caret-position="377">
|
<state relative-caret-position="2325">
|
||||||
<caret line="164" column="8" selection-start-line="164" selection-start-column="8" selection-end-line="164" selection-end-column="8" />
|
<caret line="155" lean-forward="true" selection-start-line="155" selection-end-line="155" />
|
||||||
<folding>
|
<folding>
|
||||||
<marker date="1537729307226" expanded="true" signature="1206:1207" ph="..." />
|
<marker date="1537995271039" expanded="true" signature="1249:1250" ph="..." />
|
||||||
<marker date="1537729307226" expanded="true" signature="4143:4296" ph="..." />
|
<marker date="1537995271039" expanded="true" signature="4109:4257" ph="..." />
|
||||||
<marker date="1537729307226" expanded="true" signature="4911:5209" ph="..." />
|
<marker date="1537995271039" expanded="true" signature="4640:4938" ph="..." />
|
||||||
<marker date="1537729307226" expanded="true" signature="5383:5398" ph="..." />
|
<marker date="1537995271039" expanded="true" signature="5089:5094" ph="..." />
|
||||||
</folding>
|
</folding>
|
||||||
</state>
|
</state>
|
||||||
</provider>
|
</provider>
|
||||||
|
|
|
||||||
|
|
@ -5,7 +5,7 @@ from time import sleep
|
||||||
from core.mysql_wrapper import get_anunciosdb, get_tasksdb
|
from core.mysql_wrapper import get_anunciosdb, get_tasksdb
|
||||||
from core.scrapping_utils import UrlAttack
|
from core.scrapping_utils import UrlAttack
|
||||||
from core.alerts import alert_master
|
from core.alerts import alert_master
|
||||||
|
from capturer.geocoder import GeocodingTask
|
||||||
|
|
||||||
ads_root = 'https://www.idealista.com/inmueble/'
|
ads_root = 'https://www.idealista.com/inmueble/'
|
||||||
|
|
||||||
|
|
@ -40,7 +40,8 @@ class CapturingTask:
|
||||||
self.uuid = parameters['uuid']
|
self.uuid = parameters['uuid']
|
||||||
self.ad_url = parameters['ad_url']
|
self.ad_url = parameters['ad_url']
|
||||||
self.uuid_exploring = parameters['uuid_exploring']
|
self.uuid_exploring = parameters['uuid_exploring']
|
||||||
self.request_failures = 0
|
self.request_failures = 1
|
||||||
|
self.geocode_status = "Pending"
|
||||||
|
|
||||||
self.tasksdb = get_tasksdb()
|
self.tasksdb = get_tasksdb()
|
||||||
|
|
||||||
|
|
@ -76,15 +77,12 @@ class CapturingTask:
|
||||||
|
|
||||||
self._read_fields()
|
self._read_fields()
|
||||||
|
|
||||||
while self.request_failures < 4:
|
while self.request_failures < 3:
|
||||||
attack = UrlAttack(self.ad_url)
|
attack = UrlAttack(self.ad_url)
|
||||||
attack.attack()
|
attack.attack()
|
||||||
|
|
||||||
if attack.success():
|
if attack.success():
|
||||||
self.html = attack.get_text()
|
self.html = attack.get_text()
|
||||||
if not self._html_is_valid():
|
|
||||||
alert_master('ERROR CAPTURER',
|
|
||||||
'El HTML de una pagina de anuncio es invalido')
|
|
||||||
|
|
||||||
with self._fields_not_present() as missing_fields:
|
with self._fields_not_present() as missing_fields:
|
||||||
if missing_fields:
|
if missing_fields:
|
||||||
|
|
@ -107,6 +105,9 @@ class CapturingTask:
|
||||||
#Geocodear
|
#Geocodear
|
||||||
self.geocode()
|
self.geocode()
|
||||||
|
|
||||||
|
#TODO Lidiar con el resultado del geocoding
|
||||||
|
#TODO Manejar tema cache
|
||||||
|
|
||||||
else:
|
else:
|
||||||
self.request_failures += 1
|
self.request_failures += 1
|
||||||
self._update_status('Fail {}'.format(self.request_failures))
|
self._update_status('Fail {}'.format(self.request_failures))
|
||||||
|
|
@ -121,18 +122,6 @@ class CapturingTask:
|
||||||
for field_parameters in ad_fields_parameters:
|
for field_parameters in ad_fields_parameters:
|
||||||
self.fields.append(ScrapTargetField(field_parameters))
|
self.fields.append(ScrapTargetField(field_parameters))
|
||||||
|
|
||||||
|
|
||||||
def _html_is_valid(self, html=self.html):
|
|
||||||
"""
|
|
||||||
Lee el HTML y aplica normas de validación del contenido
|
|
||||||
"""
|
|
||||||
|
|
||||||
#TODO Comprobar si HTML es pagina de bloqueo
|
|
||||||
|
|
||||||
|
|
||||||
#TODO Check de longitud
|
|
||||||
pass
|
|
||||||
|
|
||||||
def _fields_not_present(self, html=self.html):
|
def _fields_not_present(self, html=self.html):
|
||||||
"""
|
"""
|
||||||
Lee el HTML y devuelve los campos que no esten presentes
|
Lee el HTML y devuelve los campos que no esten presentes
|
||||||
|
|
@ -161,14 +150,35 @@ class CapturingTask:
|
||||||
for field in self.fields:
|
for field in self.fields:
|
||||||
self.ad_data[field.name] = field.get_value(self.html)
|
self.ad_data[field.name] = field.get_value(self.html)
|
||||||
|
|
||||||
def geocode(self):
|
|
||||||
#TODO Construir metodo de Geocoding. Quizas en otra clase?
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
def get_ad_data(self):
|
def get_ad_data(self):
|
||||||
return self.ad_data
|
return self.ad_data
|
||||||
|
|
||||||
|
def geocode(self):
    """
    Geocode the ad's address through a GeocodingTask, with bounded retries.

    Sets ``self.geocode_status`` to ``'Success'`` (and stores the parsed
    coordinates in ``self.geocode_results``) or to ``'Surrender'`` when
    Google reports there is nothing to find or every attempt failed.
    """
    max_tries = 3

    # NOTE(review): `formated_address` is not defined anywhere in the visible
    # code. The properly formatted address still has to be built here
    # (presumably from the captured ad data -- confirm) before this runs.
    geo_task = GeocodingTask(formated_address)

    for _ in range(max_tries):
        geo_task.geocode()

        # A non-200 HTTP response counts as a failed attempt as well.
        # Previously only a Google-level 'Retry' consumed a try, so a
        # persistently failing endpoint kept this loop spinning forever.
        if geo_task.get_request_status() != 200:
            continue

        google_status = geo_task.success_surrender_retry()

        if google_status == 'Success':
            self.geocode_status = 'Success'
            self.geocode_results = geo_task.get_results()
            return

        if google_status == 'Surrender':
            self.geocode_status = 'Surrender'
            return

        # Any other outcome means 'Retry': fall through to the next attempt.

    # All attempts were used up without a definitive answer.
    self.geocode_status = 'Surrender'
|
||||||
|
|
||||||
|
|
||||||
class ScrapTargetField:
|
class ScrapTargetField:
|
||||||
|
|
||||||
|
|
|
||||||
54
capturer/geocoder.py
Normal file
54
capturer/geocoder.py
Normal file
|
|
@ -0,0 +1,54 @@
|
||||||
|
import requests
|
||||||
|
|
||||||
|
|
||||||
|
class GeocodingTask:
    """
    Single geocoding request against the Google Geocoding web service.

    Usage: build the task with an address, call ``geocode()``, check
    ``get_request_status()`` for the HTTP status, then use
    ``success_surrender_retry()`` and ``get_results()`` to read the outcome.
    """

    url = 'https://maps.googleapis.com/maps/api/geocode/json'

    # Base query parameters shared by every request. Each instance works on
    # its own copy (see __init__) so addresses never leak between tasks.
    # NOTE(review): the API key is committed in source control; it should be
    # rotated and loaded from configuration instead of being hard-coded.
    request_parameters = {'region': 'es',
                          'key': 'AIzaSyCnKj0WnsxVZcaoxeAYkuRw3cKRNGiISYA'}

    # Google "status" values grouped by how the caller should react.
    geocoding_status_success = ['OK']
    geocoding_status_surrender = ['ZERO_RESULTS']
    geocoding_status_retry = ['OVER_QUERY_LIMIT',
                              'REQUEST_DENIED',
                              'INVALID_REQUEST',
                              'UNKNOWN_ERROR']

    def __init__(self, address):
        """
        Prepare a request for ``address`` (the string sent as the
        ``address`` query parameter).
        """
        # Copy the class-level defaults instead of mutating them in place.
        self.request_parameters = dict(type(self).request_parameters,
                                       address=address)
        self.response = None
        self.response_json = {}

    def geocode(self):
        """
        Send the geocoding request to the Google service and store both the
        raw response and its decoded JSON body.
        """
        self.response = requests.get(self.url, params=self.request_parameters)
        try:
            self.response_json = self.response.json()
        except ValueError:
            # Error responses may not carry a JSON body; keep an empty
            # payload so the caller can still inspect the HTTP status.
            self.response_json = {}

    def get_request_status(self):
        """
        Return the HTTP status code of the last request.
        """
        return self.response.status_code

    def success_surrender_retry(self):
        """
        Classify the last result from Google's point of view.

        Returns ``"Success"``, ``"Surrender"`` or ``"Retry"``; any status
        that is missing or not explicitly listed is treated as ``"Retry"``.
        """
        status = self.response_json.get('status')
        if status in self.geocoding_status_success:
            return "Success"
        if status in self.geocoding_status_surrender:
            return "Surrender"
        return "Retry"

    def get_results(self):
        """
        Extract latitude, longitude and precision of the first result from
        the response JSON and return them as a dict.
        """
        geometry = self.response_json['results'][0]['geometry']
        location = geometry['location']

        # The Geocoding API names the longitude field "lng", not "lon".
        return {'latitude': location['lat'],
                'longitude': location['lng'],
                'precision': geometry['location_type']}
|
||||||
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue