Cambios notables. Creadas interfaces para la tabla de capturing tasks y la tabla de capturas.

Traslado todo lo relacionado a Geocoding a un servicio independiente
del capturer.

Replanteo totalmente el parseo del html, creando un objeto nuevo.
This commit is contained in:
pablomartincalvo 2018-10-06 19:09:44 +02:00
parent 3bd8de0e02
commit 240a61649c
7 changed files with 474 additions and 262 deletions

278
.idea/workspace.xml generated
View file

@ -2,9 +2,13 @@
<project version="4">
<component name="ChangeListManager">
<list default="true" id="6e2fbba0-85ff-42d6-8e70-e4cdef1000c8" name="Default Changelist" comment="">
<change afterPath="$PROJECT_DIR$/mysql/capturas_interface.py" afterDir="false" />
<change afterPath="$PROJECT_DIR$/mysql/capturing_tasks_interface.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/capturer/capturer.py" beforeDir="false" afterPath="$PROJECT_DIR$/capturer/capturer.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/capturer/geocoder.py" beforeDir="false" afterPath="$PROJECT_DIR$/capturer/geocoder.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/capturer/geocoder.py" beforeDir="false" afterPath="$PROJECT_DIR$/geocoder/geocoder.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/core/mysql_wrapper.py" beforeDir="false" afterPath="$PROJECT_DIR$/core/mysql_wrapper.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/explorer/explorer.py" beforeDir="false" afterPath="$PROJECT_DIR$/explorer/explorer.py" afterDir="false" />
</list>
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="SHOW_DIALOG" value="false" />
@ -16,37 +20,41 @@
<session id="1687213926">
<usages-collector id="statistics.lifecycle.project">
<counts>
<entry key="project.closed" value="4" />
<entry key="project.closed" value="5" />
<entry key="project.open.time.0" value="1" />
<entry key="project.open.time.12" value="1" />
<entry key="project.open.time.13" value="2" />
<entry key="project.open.time.14" value="3" />
<entry key="project.open.time.17" value="1" />
<entry key="project.open.time.18" value="1" />
<entry key="project.open.time.21" value="1" />
<entry key="project.opened" value="9" />
<entry key="project.opened" value="10" />
</counts>
</usages-collector>
<usages-collector id="statistics.file.extensions.open">
<counts>
<entry key="py" value="15" />
<entry key="py" value="20" />
</counts>
</usages-collector>
<usages-collector id="statistics.file.types.open">
<counts>
<entry key="Python" value="15" />
<entry key="Python" value="18" />
<entry key="Scratch" value="2" />
</counts>
</usages-collector>
<usages-collector id="statistics.file.extensions.edit">
<counts>
<entry key="Python Console" value="1519" />
<entry key="py" value="9871" />
<entry key="txt" value="745" />
<entry key="py" value="14320" />
<entry key="scratch_1" value="489" />
<entry key="txt" value="880" />
</counts>
</usages-collector>
<usages-collector id="statistics.file.types.edit">
<counts>
<entry key="PLAIN_TEXT" value="745" />
<entry key="Python" value="11390" />
<entry key="PLAIN_TEXT" value="880" />
<entry key="Python" value="16173" />
<entry key="Scratch" value="155" />
</counts>
</usages-collector>
<usages-collector id="statistics.vcs.git.usages">
@ -66,53 +74,99 @@
<file pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/capturer/capturer.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="180">
<caret line="12" lean-forward="true" selection-start-line="12" selection-end-line="12" />
<state relative-caret-position="76">
<caret line="82" column="16" selection-start-line="82" selection-start-column="16" selection-end-line="82" selection-end-column="16" />
<folding>
<marker date="1538514781483" expanded="true" signature="3640:3641" ph="..." />
<marker date="1538514781483" expanded="true" signature="6381:6529" ph="..." />
<marker date="1538514781483" expanded="true" signature="6955:7253" ph="..." />
<marker date="1538514781483" expanded="true" signature="7404:7409" ph="..." />
<element signature="e#3455#6755#0" />
<marker date="1538845705076" expanded="true" signature="395:1123" ph="..." />
<marker date="1538845705076" expanded="true" signature="417:427" ph="..." />
<marker date="1538845705076" expanded="true" signature="1122:1123" ph="..." />
<marker date="1538845705076" expanded="true" signature="1179:1180" ph="..." />
<marker date="1538845705076" expanded="true" signature="2538:2547" ph="..." />
<marker date="1538845705076" expanded="true" signature="2538:2606" ph="..." />
<marker date="1538845705076" expanded="true" signature="2601:2606" ph="..." />
<marker date="1538845705076" expanded="true" signature="2679:6045" ph="..." />
<marker date="1538845705076" expanded="true" signature="4773:5825" ph="..." />
<marker date="1538845705076" expanded="true" signature="5844:5853" ph="..." />
<marker date="1538845705076" expanded="true" signature="5844:5926" ph="..." />
<marker date="1538845705076" expanded="true" signature="5951:5960" ph="..." />
<marker date="1538845705076" expanded="true" signature="6036:6039" ph="..." />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/capturer/geocoder.py">
<entry file="file://$APPLICATION_CONFIG_DIR$/scratches/scratch_1.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="615">
<caret line="41" lean-forward="true" selection-start-line="41" selection-end-line="41" />
<state relative-caret-position="466">
<caret line="261" column="15" lean-forward="true" selection-start-line="261" selection-start-column="15" selection-end-line="261" selection-end-column="15" />
<folding>
<marker date="1538514781491" expanded="true" signature="66:1353" ph="..." />
<marker date="1538514781491" expanded="true" signature="91:134" ph="..." />
<marker date="1538514781491" expanded="true" signature="1854:1859" ph="..." />
<element signature="e#95512#95521#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/mysql/capturas_interface.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="405">
<caret line="27" lean-forward="true" selection-start-line="27" selection-end-line="27" />
<folding>
<marker date="1538837294625" expanded="true" signature="74:75" ph="..." />
<marker date="1538837294625" expanded="true" signature="74:76" ph="..." />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/mysql/capturing_tasks_interface.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="60">
<caret line="4" selection-start-line="4" selection-end-line="6" selection-end-column="36" />
<folding>
<marker date="1538834627813" expanded="true" signature="74:80" ph="..." />
<marker date="1538834627813" expanded="true" signature="1110:1701" ph="..." />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/geocoder/geocoder.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-298">
<caret line="41" selection-start-line="41" selection-end-line="41" />
</state>
</provider>
</entry>
</file>
</leaf>
</split-first>
<split-second>
<leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
<file pinned="false" current-in-tab="true">
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/explorer/explorer.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-3499">
<caret line="16" column="27" lean-forward="true" selection-start-line="16" selection-start-column="27" selection-end-line="16" selection-end-column="27" />
<state relative-caret-position="165">
<caret line="11" selection-start-line="11" selection-end-line="11" selection-end-column="36" />
<folding>
<marker date="1537653289735" expanded="true" signature="5088:5540" ph="..." />
<marker date="1538826138348" expanded="true" signature="5106:5558" ph="..." />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<file pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/core/mysql_wrapper.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="540">
<caret line="36" column="22" selection-start-line="36" selection-start-column="22" selection-end-line="36" selection-end-column="22" />
<state relative-caret-position="435">
<caret line="45" column="23" selection-start-line="45" selection-start-column="23" selection-end-line="45" selection-end-column="23" />
<folding>
<element signature="e#24#46#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
@ -148,6 +202,7 @@
<find>datetime</find>
<find>task</find>
<find>exploring_tasks</find>
<find>ge</find>
</findStrings>
</component>
<component name="Git.Settings">
@ -163,13 +218,17 @@
<list>
<option value="$PROJECT_DIR$/core/scrapping_utils.py" />
<option value="$PROJECT_DIR$/explorer/test_explorer.py" />
<option value="$PROJECT_DIR$/core/mysql_wrapper.py" />
<option value="$PROJECT_DIR$/core/alerts.py" />
<option value="$PROJECT_DIR$/core/task.py" />
<option value="$PROJECT_DIR$/capturer/__init__.py" />
<option value="$PROJECT_DIR$/explorer/explorer.py" />
<option value="$PROJECT_DIR$/capturer/capturer.py" />
<option value="$PROJECT_DIR$/capturer/geocoder.py" />
<option value="$PROJECT_DIR$/geocoder/geocoder.py" />
<option value="$PROJECT_DIR$/explorer/explorer.py" />
<option value="$PROJECT_DIR$/mysql/capturing_tasks_interface.py" />
<option value="$PROJECT_DIR$/core/mysql_wrapper.py" />
<option value="$PROJECT_DIR$/mysql/capturas_interface.py" />
<option value="$APPLICATION_CONFIG_DIR$/scratches/scratch_1.py" />
<option value="$PROJECT_DIR$/capturer/capturer.py" />
</list>
</option>
</component>
@ -183,6 +242,7 @@
<foldersAlwaysOnTop value="true" />
</navigator>
<panes>
<pane id="Scope" />
<pane id="ProjectPane">
<subPane>
<expand>
@ -198,28 +258,37 @@
<path>
<item name="Drogon" type="b2602c69:ProjectViewProjectNode" />
<item name="Drogon" type="462c0819:PsiDirectoryNode" />
<item name="core" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="Drogon" type="b2602c69:ProjectViewProjectNode" />
<item name="Drogon" type="462c0819:PsiDirectoryNode" />
<item name="explorer" type="462c0819:PsiDirectoryNode" />
<item name="mysql" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="Drogon" type="b2602c69:ProjectViewProjectNode" />
<item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
</path>
<path>
<item name="Drogon" type="b2602c69:ProjectViewProjectNode" />
<item name="Scratches and Consoles" type="1a2a3e82:ScratchProjectViewPane$MyProjectNode" />
</path>
<path>
<item name="Drogon" type="b2602c69:ProjectViewProjectNode" />
<item name="Scratches and Consoles" type="1a2a3e82:ScratchProjectViewPane$MyProjectNode" />
<item name="Scratches" type="d62648e6:ScratchProjectViewPane$MyRootNode" />
</path>
</expand>
<select />
</subPane>
</pane>
<pane id="Scope" />
</panes>
</component>
<component name="PropertiesComponent">
<property name="TODO_SCOPE" value="All Places" />
<property name="com.intellij.ide.scratch.LRUPopupBuilder$1/New Scratch File" value="Python" />
<property name="last_opened_file_path" value="$PROJECT_DIR$" />
</component>
<component name="RecentsManager">
<key name="MoveFile.RECENT_KEYS">
<recent name="$PROJECT_DIR$/geocoder" />
</key>
</component>
<component name="RunDashboard">
<option name="ruleStates">
<list>
@ -232,7 +301,7 @@
</list>
</option>
</component>
<component name="RunManager">
<component name="RunManager" selected="Python.scratch_1">
<configuration name="alerts" type="PythonConfigurationType" factoryName="Python" temporary="true">
<module name="Drogon" />
<option name="INTERPRETER_OPTIONS" value="" />
@ -254,8 +323,34 @@
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="scratch_1" type="PythonConfigurationType" factoryName="Python" temporary="true">
<module name="Drogon" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$APPLICATION_CONFIG_DIR$/scratches" />
<option name="IS_MODULE_SDK" value="false" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<option name="SCRIPT_NAME" value="$APPLICATION_CONFIG_DIR$/scratches/scratch_1.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="true" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<list>
<item itemvalue="Python.alerts" />
<item itemvalue="Python.scratch_1" />
</list>
<recent_temporary>
<list>
<item itemvalue="Python.scratch_1" />
<item itemvalue="Python.scratch_1" />
<item itemvalue="Python.scratch_1" />
<item itemvalue="Python.scratch_1" />
<item itemvalue="Python.alerts" />
</list>
</recent_temporary>
@ -320,7 +415,14 @@
<option name="project" value="LOCAL" />
<updated>1537995406032</updated>
</task>
<option name="localTasksCounter" value="8" />
<task id="LOCAL-00008" summary="Creado cache de Geocoding. Avanzado en Geocoding Task. Decido mover parte de la gestion del geocoding al capturer.">
<created>1538514864934</created>
<option name="number" value="00008" />
<option name="presentableId" value="LOCAL-00008" />
<option name="project" value="LOCAL" />
<updated>1538514864935</updated>
</task>
<option name="localTasksCounter" value="9" />
<servers />
</component>
<component name="TodoView" selected-index="1">
@ -336,21 +438,21 @@
<frame x="0" y="-2" width="1920" height="1082" extended-state="6" />
<editor active="true" />
<layout>
<window_info content_ui="combo" id="Project" order="0" sideWeight="0.48513302" visible="true" weight="0.14918292" />
<window_info id="Structure" order="1" sideWeight="0.514867" side_tool="true" visible="true" weight="0.14918292" />
<window_info content_ui="combo" id="Project" order="0" sideWeight="0.484326" visible="true" weight="0.14918292" />
<window_info id="Structure" order="1" sideWeight="0.515674" side_tool="true" visible="true" weight="0.14918292" />
<window_info id="Favorites" order="2" sideWeight="0.5015674" side_tool="true" weight="0.14918292" />
<window_info active="true" id="Repositories" order="3" sideWeight="0.49529782" visible="true" weight="0.32999474" />
<window_info anchor="bottom" id="Message" order="0" />
<window_info anchor="bottom" id="Find" order="1" weight="0.32983193" />
<window_info anchor="bottom" id="Run" order="2" weight="0.32983193" />
<window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
<window_info anchor="bottom" id="Debug" order="3" weight="0.39915967" />
<window_info anchor="bottom" id="Cvs" order="4" weight="0.25" />
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
<window_info active="true" anchor="bottom" id="TODO" order="6" sideWeight="0.49973643" visible="true" weight="0.32878152" />
<window_info anchor="bottom" id="TODO" order="6" sideWeight="0.49973643" weight="0.32878152" />
<window_info anchor="bottom" id="Version Control" order="7" sideWeight="0.49973643" weight="0.269958" />
<window_info anchor="bottom" id="Terminal" order="8" weight="0.32983193" />
<window_info anchor="bottom" id="Event Log" order="9" sideWeight="0.5007907" side_tool="true" weight="0.32983193" />
<window_info anchor="bottom" id="Python Console" order="10" sideWeight="0.49920928" weight="0.32983193" />
<window_info active="true" anchor="bottom" id="Python Console" order="10" sideWeight="0.49920928" visible="true" weight="0.32983193" />
<window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" />
<window_info anchor="right" id="Ant Build" order="1" weight="0.25" />
<window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
@ -394,7 +496,8 @@
<MESSAGE value="Iniciadas clases de capturing_task y scraptargetfield." />
<MESSAGE value="Avanzado en desarrollo de capturing task." />
<MESSAGE value="Avanzado en desarrollo de capturing task. Creado clase GeocodingTask" />
<option name="LAST_COMMIT_MESSAGE" value="Avanzado en desarrollo de capturing task. Creado clase GeocodingTask" />
<MESSAGE value="Creado cache de Geocoding. Avanzado en Geocoding Task. Decido mover parte de la gestion del geocoding al capturer." />
<option name="LAST_COMMIT_MESSAGE" value="Creado cache de Geocoding. Avanzado en Geocoding Task. Decido mover parte de la gestion del geocoding al capturer." />
</component>
<component name="editorHistoryManager">
<entry file="file://$PROJECT_DIR$/explorer/test_explorer.py" />
@ -416,10 +519,10 @@
<entry file="file://$PROJECT_DIR$/capturer/__init__.py">
<provider selected="true" editor-type-id="text-editor" />
</entry>
<entry file="file://$PROJECT_DIR$/core/mysql_wrapper.py">
<entry file="file://$PROJECT_DIR$/geocoder/geocoder.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="540">
<caret line="36" column="22" selection-start-line="36" selection-start-column="22" selection-end-line="36" selection-end-column="22" />
<state relative-caret-position="615">
<caret line="41" lean-forward="true" selection-start-line="41" selection-end-line="41" />
</state>
</provider>
</entry>
@ -433,37 +536,84 @@
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/geocoder/geocoder.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-298">
<caret line="41" selection-start-line="41" selection-end-line="41" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/explorer/explorer.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-3499">
<caret line="16" column="27" lean-forward="true" selection-start-line="16" selection-start-column="27" selection-end-line="16" selection-end-column="27" />
<state relative-caret-position="165">
<caret line="11" selection-start-line="11" selection-end-line="11" selection-end-column="36" />
<folding>
<marker date="1537653289735" expanded="true" signature="5088:5540" ph="..." />
<marker date="1538826138348" expanded="true" signature="5106:5558" ph="..." />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/capturer/geocoder.py">
<entry file="file://$PROJECT_DIR$/mysql/capturing_tasks_interface.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="615">
<caret line="41" lean-forward="true" selection-start-line="41" selection-end-line="41" />
<state relative-caret-position="60">
<caret line="4" selection-start-line="4" selection-end-line="6" selection-end-column="36" />
<folding>
<marker date="1538514781491" expanded="true" signature="66:1353" ph="..." />
<marker date="1538514781491" expanded="true" signature="91:134" ph="..." />
<marker date="1538514781491" expanded="true" signature="1854:1859" ph="..." />
<marker date="1538834627813" expanded="true" signature="74:80" ph="..." />
<marker date="1538834627813" expanded="true" signature="1110:1701" ph="..." />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/mysql/capturas_interface.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="405">
<caret line="27" lean-forward="true" selection-start-line="27" selection-end-line="27" />
<folding>
<marker date="1538837294625" expanded="true" signature="74:75" ph="..." />
<marker date="1538837294625" expanded="true" signature="74:76" ph="..." />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/core/mysql_wrapper.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="435">
<caret line="45" column="23" selection-start-line="45" selection-start-column="23" selection-end-line="45" selection-end-column="23" />
<folding>
<element signature="e#24#46#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$APPLICATION_CONFIG_DIR$/scratches/scratch_1.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="466">
<caret line="261" column="15" lean-forward="true" selection-start-line="261" selection-start-column="15" selection-end-line="261" selection-end-column="15" />
<folding>
<element signature="e#95512#95521#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/capturer/capturer.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="180">
<caret line="12" lean-forward="true" selection-start-line="12" selection-end-line="12" />
<state relative-caret-position="76">
<caret line="82" column="16" selection-start-line="82" selection-start-column="16" selection-end-line="82" selection-end-column="16" />
<folding>
<marker date="1538514781483" expanded="true" signature="3640:3641" ph="..." />
<marker date="1538514781483" expanded="true" signature="6381:6529" ph="..." />
<marker date="1538514781483" expanded="true" signature="6955:7253" ph="..." />
<marker date="1538514781483" expanded="true" signature="7404:7409" ph="..." />
<element signature="e#3455#6755#0" />
<marker date="1538845705076" expanded="true" signature="395:1123" ph="..." />
<marker date="1538845705076" expanded="true" signature="417:427" ph="..." />
<marker date="1538845705076" expanded="true" signature="1122:1123" ph="..." />
<marker date="1538845705076" expanded="true" signature="1179:1180" ph="..." />
<marker date="1538845705076" expanded="true" signature="2538:2547" ph="..." />
<marker date="1538845705076" expanded="true" signature="2538:2606" ph="..." />
<marker date="1538845705076" expanded="true" signature="2601:2606" ph="..." />
<marker date="1538845705076" expanded="true" signature="2679:6045" ph="..." />
<marker date="1538845705076" expanded="true" signature="4773:5825" ph="..." />
<marker date="1538845705076" expanded="true" signature="5844:5853" ph="..." />
<marker date="1538845705076" expanded="true" signature="5844:5926" ph="..." />
<marker date="1538845705076" expanded="true" signature="5951:5960" ph="..." />
<marker date="1538845705076" expanded="true" signature="6036:6039" ph="..." />
</folding>
</state>
</provider>

View file

@ -2,85 +2,45 @@ import sys
sys.path.append('..')
import uuid
from time import sleep
from core.mysql_wrapper import get_anunciosdb, get_tasksdb
from bs4 import BeautifulSoup
import re
from mysql.capturing_tasks_interface import capturing_interface
from mysql.capturas_interface import capturas_interface
from core.scrapping_utils import UrlAttack
from core.alerts import alert_master
from capturer.geocoder import GeocodingTask
ads_root = 'https://www.idealista.com/inmueble/'
#TODO Crear la lista de campos
ad_fields_parameters = [{'name': 'referencia',
'search_method': '',
'validation_method': ''},
{'name': 'precio',
'search_method': '',
'validation_method': ''},
{'name': 'tamano_categorico',
'search_method': '',
'validation_method': ''},
{'name': 'm2',
'search_method': '',
'validation_method': ''},
{'name': 'telefono',
'search_method': '',
'validation_method': ''},
{'name': 'texto_tipo',
'search_method': '',
'validation_method': ''},
{'name': 'ciudad',
'search_method': '',
'validation_method': ''},
{'name': 'distrito',
'search_method': '',
'validation_method': ''},
{'name': 'barrio',
'search_method': '',
'validation_method': ''},
{'name': 'calle',
'search_method': '',
'validation_method': ''},
{'name': 'cubierta',
'search_method': '',
'validation_method': ''},
{'name': 'puerta_auto',
'search_method': '',
'validation_method': ''},
{'name': 'ascensor',
'search_method': '',
'validation_method': ''},
{'name': 'alarma',
'search_method': '',
'validation_method': ''},
{'name': 'circuito',
'search_method': '',
'validation_method': ''},
{'name': 'personal',
'search_method': '',
'validation_method': ''},
{'name': 'texto_libre',
'search_method': '',
'validation_method': ''}]
def create_capturing_task(referencia, db_wrapper, uuid_exploring=None):
query_parameters = {'ad_url': ads_root + referencia,
'uuid': str(uuid.uuid4()),
'status': 'Pending'}
class Capturer:
if uuid_exploring is None:
query_statement = """INSERT INTO capturing_tasks_logs
(uuid, write_time, status, url)
VALUES (%(uuid)s, NOW(), %(status)s, %(ad_url)s)"""
else:
query_parameters['uuid_exploring'] = uuid_exploring
query_statement = """INSERT INTO capturing_tasks_logs
(uuid, write_time, status, url, fk_uuid_exploring)
VALUES (%(uuid)s, NOW(), %(status)s, %(ad_url)s, %(uuid_exploring)s)"""
sleep_time_no_work = 60
minimum_seconds_between_tries = 120
def start(self):
    """
    Run the capturing loop forever.

    On every iteration the oldest pending task is requested from
    capturing_interface. When there is no pending task, or when the time
    elapsed since the last try is below
    Capturer.minimum_seconds_between_tries, the loop sleeps for
    Capturer.sleep_time_no_work seconds and starts over. Otherwise the task
    is captured and, if it ends with status 'Data ready', its ad data is
    stored through capturas_interface.

    This method never returns.
    """
    while True:
        # Ask for the pending task only once per iteration and reuse the
        # answer, instead of querying the database a second time.
        task_parameters = capturing_interface.get_pending_task()
        if task_parameters is None:
            sleep(Capturer.sleep_time_no_work)
            continue

        # Class attributes are not visible as bare names inside a method,
        # so the threshold has to be qualified with the class name.
        seconds_since_last_try = capturing_interface.seconds_since_last_try()
        if seconds_since_last_try < Capturer.minimum_seconds_between_tries:
            sleep(Capturer.sleep_time_no_work)
            continue

        task = CapturingTask(task_parameters)
        task.capture()

        # Only tasks that finished with data are written to the database.
        if task.status != 'Data ready':
            continue

        capturas_interface.insert_captura(task.get_ad_data())
db_wrapper.query(query_statement, query_parameters)
class CapturingTask:
@ -90,34 +50,16 @@ class CapturingTask:
def __init__(self, parameters):
self.uuid = parameters['uuid']
self.ad_url = parameters['ad_url']
self.uuid_exploring = parameters['uuid_exploring']
self.uuid_exploring = parameters['fk_uuid_exploring']
self.status = parameters['status']
self.request_failures = 1
self.geocode_status = "Pending"
self.tasksdb = get_tasksdb()
self._update_status('Loading')
def _update_status(self, new_status):
self.status = new_status
self._log_in_tasksdb()
def _log_in_tasksdb(self):
"""
Graba en la base de datos de tareas un registro con el UUID de la tarea,
un timestamp y el status
"""
query_statement = """INSERT INTO capturing_tasks_logs
(uuid, write_time, status, ad_url, fk_uuid_exploring)
VALUES (%(uuid)s, NOW(), %(status)s, %(ad_url)s, %(fk_uuid_exploring)s)"""
query_parameters = {'uuid': self.uuid,
'status': self.status,
'ad_url': self.ad_url,
'fk_uuid_exploring': self.uuid_exploring}
self.tasksdb.query(query_statement, query_parameters)
capturing_interface.update_capturing_task(self.uuid, self.uuid_exploring,
self.status, self.ad_url)
def capture(self):
"""
@ -135,25 +77,12 @@ class CapturingTask:
if attack.success():
self.html = attack.get_text()
with self._fields_not_present() as missing_fields:
if missing_fields:
alert_master('ERROR CAPTURER',
'Los siguientes campos no estaban presentes {}. '
'URL = {}'.format(missing_fields, self.ad_url))
self._update_status('Dead ad')
return
with self._fields_not_valid() as unvalid_fields:
if unvalid_fields:
alert_master('ERROR CAPTURER',
'Los siguientes campos no tenian valores presentes {}'
'URL = {}'.format(unvalid_fields, self.ad_url))
self._update_status('Dead ad')
return
#Extraer datos
self.extract_data()
self._update_status('Data ready')
else:
self.request_failures += 1
self._update_status('Fail {}'.format(self.request_failures))
@ -162,97 +91,120 @@ class CapturingTask:
self._update_status('Surrender')
def _read_fields(self):
self.fields = []
for field_parameters in ad_fields_parameters:
self.fields.append(ScrapTargetField(field_parameters))
def _fields_not_present(self, html=self.html):
"""
Lee el HTML y devuelve los campos que no esten presentes
"""
#TODO Implementar campos optativos
fields_not_present = []
for field in self.fields:
if not field.exists(html):
fields_not_present.append(field.name)
return fields_not_present
def _fields_not_valid(self, html=self.html):
"""
Lee el HTML y devuelve los campos que no tengan valores validos
"""
fields_not_valid = []
for field in self.fields:
if not field.validate_value(html):
fields_not_valid.append(field.name)
return fields_not_valid
def extract_data(self):
self.ad_data = {}
for field in self.fields:
self.ad_data[field.name] = field.get_value(self.html)
#TODO Crear un objeto parser y ver que todo esta bien
def get_ad_data(self):
return self.ad_data
def geocode(self):
#TODO Hacer esta funcion bien
# Construir direccion con formato adecuado
geocode_tries = 0
geo_task = GeocodingTask(formated_address)
while geocode_tries < 3:
geo_task.geocode()
if geo_task.get_request_status() == 200:
google_status = geo_task.success_surrender_retry()
if google_status == 'Success':
self.geocode_status = 'Success'
self.geocode_results = geo_task.get_results()
return
elif google_status == 'Surrender':
self.geocode_status = 'Surrender'
return
elif google_status == 'Retry':
geocode_tries += 1
self.geocode_status = 'Surrender'
return
class ScrapTargetField:
class AdHtmlParser:
    """
    Parse the HTML of an ad page into a dictionary of fields.

    Every entry of ``ad_fields`` is keyed by the field name and holds three
    values: ``found`` (whether parse() located the field), ``optional``
    (whether the field may be absent) and ``value`` (the extracted value,
    None until found).
    """

    # (field name, optional) pairs, in the order they are stored in ad_fields.
    _FIELD_DEFINITIONS = (
        ('referencia', False),
        ('precio', False),
        ('tamano_categorico', False),
        ('m2', True),
        ('tipo_anuncio', False),
        ('calle', False),
        ('barrio', False),
        ('distrito', False),
        ('ciudad', False),
        ('cubierta', False),
        ('puerta_auto', False),
        ('ascensor', False),
        ('alarma', False),
        ('circuito', False),
        ('personal', False),
        ('telefono', True),
    )

    def __init__(self, html_string):
        """
        Store the HTML and create an empty (not found) entry for every field.

        :param html_string: HTML of the ad page, as text
        """
        self.html = html_string
        # A fresh inner dict per field, so that updating one field never
        # leaks into another.
        self.ad_fields = {
            name: {'found': False, 'optional': optional, 'value': None}
            for name, optional in self._FIELD_DEFINITIONS
        }

    def _store(self, name, value):
        """Record the value of a field and flag it as found."""
        self.ad_fields[name]['value'] = value
        self.ad_fields[name]['found'] = True

    def parse(self):
        """
        Read the HTML and fill ``ad_fields`` with every field located.

        Fields whose tag is missing, or whose tag does not contain the
        expected content, are left untouched (``found`` stays False).
        """
        soup = BeautifulSoup(self.html, 'html5lib')

        # find() returns None when nothing matches, unlike find_all(), which
        # returns an empty list that is never None.
        canonical_link = soup.find('link', {'rel': 'canonical'})
        if canonical_link is not None:
            reference_match = re.search(r'[0-9]{5,20}', str(canonical_link))
            if reference_match is not None:
                self._store('referencia', reference_match.group(0))

        price_tag = soup.find('strong', {'class': 'price'})
        if price_tag is not None:
            price_digits = ''.join(re.findall(r'[0-9]', str(price_tag)))
            # A price tag without any digit is not a usable price.
            if price_digits:
                self._store('precio', price_digits)

        # The value lives in a span nested inside the first span of the
        # features block; any of the three levels may be absent.
        features = soup.find('div', {'class': 'info-features'})
        outer_span = features.find('span') if features is not None else None
        inner_span = outer_span.find('span') if outer_span is not None else None
        if inner_span is not None:
            self._store('tamano_categorico', inner_span.text)

        # TODO: continue with the parsing of the remaining fields

    def validate(self):
        """
        Validate the values of the fields that need it.

        Currently a no-op: no validation rule has been defined yet.
        """
        # TODO: implement validation for the fields that need it

    def fields_missing(self):
        """
        Return the names of the mandatory fields that have not been found.

        Optional fields are never reported.

        :return: list of field names, in the order of ``ad_fields``
        """
        return [
            name
            for name, field in self.ad_fields.items()
            if not field['optional'] and not field['found']
        ]
def __init__(self, target_parameters):
self.name = target_parameters['name']
self.search_method = target_parameters['search_method']
self.validation_method = target_parameters['validation_method']
def exists(self, html):
"""
Busca el dato en un HTML
"""
if self.search_method(html) is None:
return False
else:
return True
def validate_value(self, dato):
"""
Comprueba el valor y valida con la norma respectiva que sea lo esperado
"""
return self.validation_method(dato)
def get_value(self, html):
"""
Busca en un HTML el dato
"""
return self.search_method(html)

View file

@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
import mysql.connector
from core.alerts import alert_master
anuncios_db_parameters = {'host': '185.166.215.170',
'database': 'anuncios',
@ -37,13 +38,21 @@ class DatabaseWrapper():
self.connect()
self.disconnect()
def query(self, query_statement, query_parameters=None, dictionary=False):
    """
    Connect, execute a statement and return the cursor that ran it.

    :param query_statement: SQL statement to execute
    :param query_parameters: parameters bound to the statement, if any
    :param dictionary: passed to connection.cursor(); selects dictionary rows
    :return: the cursor after execution, or None when the execution failed
        (in that case an alert is sent through alert_master)
    :raises Exception: when the connection could not be established
    """
    self.connect()
    if not self.connection.is_connected():
        raise Exception("Could not connect to the database.")

    try:
        execution_cursor = self.connection.cursor(dictionary=dictionary)
        execution_cursor.execute(query_statement, query_parameters)
        return execution_cursor
    except Exception:
        # Best effort: the failure is reported and the caller receives None.
        # The placeholders are "{}" because the text is filled with
        # str.format(), which leaves "%s" untouched.
        alert_master("SQL ERROR", """Se ha producido un error ejecutando la
                     siguiente query: {}.
                     Con los siguientes parametros: {}
                     """.format(query_statement,
                                query_parameters))
        return None
    finally:
        # Runs on success and on failure, so the connection is never left
        # open when execute() raises.
        # NOTE(review): the cursor is handed back after disconnecting;
        # confirm that callers can still fetch rows from it.
        self.disconnect()

View file

@ -10,7 +10,7 @@ from random import randint
from core.mysql_wrapper import get_anunciosdb, get_tasksdb
from core.scrapping_utils import UrlAttack
from core.alerts import alert_master
from capturer.capturer import create_capturing_task
from mysql.capturing_tasks_interface import capturing_interface
class Explorer():
@ -51,7 +51,7 @@ class Explorer():
if current_task.status == 'Referencias ready':
referencias = current_task.get_referencias()
for referencia in referencias:
create_capturing_task(referencia, self.tasksdb)
capturing_interface.create_capturing_task(referencia)
current_task._update_status("Sent to queue")

View file

@ -0,0 +1,27 @@
from core.mysql_wrapper import get_anunciosdb
class CapturasInterface():
    """
    Interface to the ``capturas`` table of the anuncios database.
    """

    def __init__(self):
        """Obtain the database wrapper used to run the queries."""
        self.anunciosdb = get_anunciosdb()

    def insert_captura(self, ad_data):
        """
        Insert one row in ``capturas``.

        :param ad_data: dictionary mapping column names to their values
        """
        # Column names cannot be bound as query parameters, so they are
        # written into the statement; the keys of ad_data must therefore
        # come from trusted code, never from scraped content.
        columns = ', '.join(ad_data.keys())
        # One "%s" placeholder per value. Joining a list is required here:
        # joining the string '%s%s...' would split it character by character.
        placeholders_string = ', '.join(['%s'] * len(ad_data))

        # "{}" is filled by str.format(); the "%s" placeholders are left in
        # the statement for the database driver to bind the values.
        query_statement = """ INSERT INTO capturas
                              ({})
                              VALUES({})""".format(columns, placeholders_string)
        # A dict view is not a sequence; hand the driver a real tuple.
        query_parameters = tuple(ad_data.values())

        self.anunciosdb.query(query_statement, query_parameters)
# Module-level instance, created when this module is imported (which calls
# get_anunciosdb() through __init__).
capturas_interface = CapturasInterface()

View file

@ -0,0 +1,74 @@
from core.mysql_wrapper import get_tasksdb
class CapturingTasksInterface:
    """
    Interface to the capturing tasks tables of the tasks database.

    Every change of a task is stored as a new row of
    ``capturing_tasks_logs``.
    """

    def __init__(self):
        """Obtain the database wrapper used to run the queries."""
        self.tasksdb = get_tasksdb()

    def _insert_log(self, task_uuid, uuid_exploring, status, ad_url):
        """Insert one row in capturing_tasks_logs, stamped with NOW()."""
        query_parameters = {'ad_url': ad_url,
                            'uuid': task_uuid,
                            'status': status}

        if uuid_exploring is None:
            query_statement = """INSERT INTO capturing_tasks_logs
                                 (uuid, write_time, status, url)
                                 VALUES (%(uuid)s, NOW(), %(status)s, %(ad_url)s)"""
        else:
            query_parameters['uuid_exploring'] = uuid_exploring
            query_statement = """INSERT INTO capturing_tasks_logs
                                 (uuid, write_time, status, url, fk_uuid_exploring)
                                 VALUES (%(uuid)s, NOW(), %(status)s, %(ad_url)s, %(uuid_exploring)s)"""

        self.tasksdb.query(query_statement, query_parameters)

    def create_capturing_task(self, referencia, uuid_exploring=None):
        """
        Register a new task, with status 'Pending', for an ad.

        :param referencia: reference of the ad, appended to the ads root URL
        :param uuid_exploring: UUID of the exploring task that found the ad,
            or None when there is none
        """
        # Imported here because update_capturing_task() has a parameter
        # called "uuid"; this keeps the module name local to the one method
        # that needs it.
        import uuid

        ads_root = 'https://www.idealista.com/inmueble/'
        self._insert_log(str(uuid.uuid4()), uuid_exploring,
                         'Pending', ads_root + referencia)

    def get_pending_task(self):
        """
        Return the oldest log row of a task whose last status is 'Pending'.

        :return: the row as a dictionary, or None when there is no such task
        """
        query_statement = """SELECT logs.*
                             FROM capturing_tasks_logs as logs
                             INNER JOIN capturing_last as last
                             ON logs.uuid = last.uuid
                             WHERE last.status = 'Pending'
                             ORDER BY logs.write_time ASC
                             LIMIT 1
                          """
        cursor = self.tasksdb.query(query_statement, dictionary=True)
        # fetchone() already yields None when the result set is empty.
        return cursor.fetchone()

    def update_capturing_task(self, uuid, uuid_exploring, status, ad_url):
        """
        Log a new status for an existing task.

        :param uuid: UUID of the capturing task
        :param uuid_exploring: UUID of the related exploring task, or None
        :param status: new status of the task
        :param ad_url: URL of the ad the task captures
        """
        self._insert_log(uuid, uuid_exploring, status, ad_url)

    def seconds_since_last_try(self):
        """
        Return the seconds elapsed since the write time of the first
        'Pending' row of capturing_tasks_logs, ordered by write_time.

        :return: number of seconds, as returned by the database
        """
        # TIMEDIFF is the MySQL function name; the table name matches the
        # one used by every other statement of this class.
        query_statement = """SELECT TIME_TO_SEC(TIMEDIFF(now(), write_time))
                             FROM capturing_tasks_logs
                             WHERE status = 'Pending'
                             ORDER BY write_time
                             LIMIT 1
                          """
        cursor = self.tasksdb.query(query_statement)
        return cursor.fetchone()[0]
# Module-level instance, created when this module is imported (which calls
# get_tasksdb() through __init__).
capturing_interface = CapturingTasksInterface()