Creado clase de Exploring Task. Creado modulo de scraping utils
This commit is contained in:
parent
b1b7de13f8
commit
c0b398b9c6
13 changed files with 258 additions and 14 deletions
11
.idea/Drogon.iml
generated
Normal file
11
.idea/Drogon.iml
generated
Normal file
|
|
@ -0,0 +1,11 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<module type="PYTHON_MODULE" version="4">
|
||||||
|
<component name="NewModuleRootManager">
|
||||||
|
<content url="file://$MODULE_DIR$" />
|
||||||
|
<orderEntry type="inheritedJdk" />
|
||||||
|
<orderEntry type="sourceFolder" forTests="false" />
|
||||||
|
</component>
|
||||||
|
<component name="TestRunnerService">
|
||||||
|
<option name="PROJECT_TEST_RUNNER" value="Unittests" />
|
||||||
|
</component>
|
||||||
|
</module>
|
||||||
4
.idea/misc.xml
generated
Normal file
4
.idea/misc.xml
generated
Normal file
|
|
@ -0,0 +1,4 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6" project-jdk-type="Python SDK" />
|
||||||
|
</project>
|
||||||
8
.idea/modules.xml
generated
Normal file
8
.idea/modules.xml
generated
Normal file
|
|
@ -0,0 +1,8 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="ProjectModuleManager">
|
||||||
|
<modules>
|
||||||
|
<module fileurl="file://$PROJECT_DIR$/.idea/Drogon.iml" filepath="$PROJECT_DIR$/.idea/Drogon.iml" />
|
||||||
|
</modules>
|
||||||
|
</component>
|
||||||
|
</project>
|
||||||
6
.idea/vcs.xml
generated
Normal file
6
.idea/vcs.xml
generated
Normal file
|
|
@ -0,0 +1,6 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="VcsDirectoryMappings">
|
||||||
|
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
||||||
|
</component>
|
||||||
|
</project>
|
||||||
141
.idea/workspace.xml
generated
Normal file
141
.idea/workspace.xml
generated
Normal file
|
|
@ -0,0 +1,141 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="ChangeListManager">
|
||||||
|
<list default="true" id="6e2fbba0-85ff-42d6-8e70-e4cdef1000c8" name="Default Changelist" comment="">
|
||||||
|
<change beforePath="$PROJECT_DIR$/explorer/explorer.py" beforeDir="false" afterPath="$PROJECT_DIR$/explorer/explorer.py" afterDir="false" />
|
||||||
|
</list>
|
||||||
|
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
|
||||||
|
<option name="SHOW_DIALOG" value="false" />
|
||||||
|
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
||||||
|
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
|
||||||
|
<option name="LAST_RESOLUTION" value="IGNORE" />
|
||||||
|
</component>
|
||||||
|
<component name="FUSProjectUsageTrigger">
|
||||||
|
<session id="1687213926">
|
||||||
|
<usages-collector id="statistics.lifecycle.project">
|
||||||
|
<counts>
|
||||||
|
<entry key="project.open.time.0" value="1" />
|
||||||
|
<entry key="project.opened" value="1" />
|
||||||
|
</counts>
|
||||||
|
</usages-collector>
|
||||||
|
</session>
|
||||||
|
</component>
|
||||||
|
<component name="FileEditorManager">
|
||||||
|
<leaf>
|
||||||
|
<file pinned="false" current-in-tab="true">
|
||||||
|
<entry file="file://$PROJECT_DIR$/core/scrapping_utils.py">
|
||||||
|
<provider selected="true" editor-type-id="text-editor">
|
||||||
|
<state relative-caret-position="60">
|
||||||
|
<caret line="4" lean-forward="true" selection-start-line="4" selection-end-line="4" />
|
||||||
|
</state>
|
||||||
|
</provider>
|
||||||
|
</entry>
|
||||||
|
</file>
|
||||||
|
</leaf>
|
||||||
|
</component>
|
||||||
|
<component name="Git.Settings">
|
||||||
|
<option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
|
||||||
|
</component>
|
||||||
|
<component name="IdeDocumentHistory">
|
||||||
|
<option name="CHANGED_PATHS">
|
||||||
|
<list>
|
||||||
|
<option value="$PROJECT_DIR$/core/scrapping_utils.py" />
|
||||||
|
</list>
|
||||||
|
</option>
|
||||||
|
</component>
|
||||||
|
<component name="ProjectFrameBounds" extendedState="6">
|
||||||
|
<option name="x" value="215" />
|
||||||
|
<option name="width" value="1289" />
|
||||||
|
<option name="height" value="728" />
|
||||||
|
</component>
|
||||||
|
<component name="ProjectView">
|
||||||
|
<navigator proportions="" version="1">
|
||||||
|
<foldersAlwaysOnTop value="true" />
|
||||||
|
</navigator>
|
||||||
|
<panes>
|
||||||
|
<pane id="ProjectPane">
|
||||||
|
<subPane>
|
||||||
|
<expand>
|
||||||
|
<path>
|
||||||
|
<item name="Drogon" type="b2602c69:ProjectViewProjectNode" />
|
||||||
|
<item name="Drogon" type="462c0819:PsiDirectoryNode" />
|
||||||
|
</path>
|
||||||
|
<path>
|
||||||
|
<item name="Drogon" type="b2602c69:ProjectViewProjectNode" />
|
||||||
|
<item name="Drogon" type="462c0819:PsiDirectoryNode" />
|
||||||
|
<item name="core" type="462c0819:PsiDirectoryNode" />
|
||||||
|
</path>
|
||||||
|
<path>
|
||||||
|
<item name="Drogon" type="b2602c69:ProjectViewProjectNode" />
|
||||||
|
<item name="Drogon" type="462c0819:PsiDirectoryNode" />
|
||||||
|
<item name="explorer" type="462c0819:PsiDirectoryNode" />
|
||||||
|
</path>
|
||||||
|
<path>
|
||||||
|
<item name="Drogon" type="b2602c69:ProjectViewProjectNode" />
|
||||||
|
<item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
|
||||||
|
</path>
|
||||||
|
</expand>
|
||||||
|
<select />
|
||||||
|
</subPane>
|
||||||
|
</pane>
|
||||||
|
<pane id="Scope" />
|
||||||
|
</panes>
|
||||||
|
</component>
|
||||||
|
<component name="PropertiesComponent">
|
||||||
|
<property name="last_opened_file_path" value="$PROJECT_DIR$" />
|
||||||
|
</component>
|
||||||
|
<component name="RunDashboard">
|
||||||
|
<option name="ruleStates">
|
||||||
|
<list>
|
||||||
|
<RuleState>
|
||||||
|
<option name="name" value="ConfigurationTypeDashboardGroupingRule" />
|
||||||
|
</RuleState>
|
||||||
|
<RuleState>
|
||||||
|
<option name="name" value="StatusDashboardGroupingRule" />
|
||||||
|
</RuleState>
|
||||||
|
</list>
|
||||||
|
</option>
|
||||||
|
</component>
|
||||||
|
<component name="TaskManager">
|
||||||
|
<task active="true" id="Default" summary="Default task">
|
||||||
|
<changelist id="6e2fbba0-85ff-42d6-8e70-e4cdef1000c8" name="Default Changelist" comment="" />
|
||||||
|
<created>1534095188685</created>
|
||||||
|
<option name="number" value="Default" />
|
||||||
|
<option name="presentableId" value="Default" />
|
||||||
|
<updated>1534095188685</updated>
|
||||||
|
</task>
|
||||||
|
<servers />
|
||||||
|
</component>
|
||||||
|
<component name="ToolWindowManager">
|
||||||
|
<frame x="0" y="-2" width="1920" height="1082" extended-state="6" />
|
||||||
|
<editor active="true" />
|
||||||
|
<layout>
|
||||||
|
<window_info id="Favorites" side_tool="true" />
|
||||||
|
<window_info content_ui="combo" id="Project" order="0" visible="true" weight="0.24986821" />
|
||||||
|
<window_info id="Structure" order="1" side_tool="true" weight="0.25" />
|
||||||
|
<window_info anchor="bottom" id="Version Control" weight="0.32983193" />
|
||||||
|
<window_info anchor="bottom" id="Python Console" weight="0.32983193" />
|
||||||
|
<window_info anchor="bottom" id="Event Log" side_tool="true" />
|
||||||
|
<window_info active="true" anchor="bottom" id="Terminal" visible="true" weight="0.32983193" />
|
||||||
|
<window_info anchor="bottom" id="Message" order="0" />
|
||||||
|
<window_info anchor="bottom" id="Find" order="1" />
|
||||||
|
<window_info anchor="bottom" id="Run" order="2" />
|
||||||
|
<window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
|
||||||
|
<window_info anchor="bottom" id="Cvs" order="4" weight="0.25" />
|
||||||
|
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
|
||||||
|
<window_info anchor="bottom" id="TODO" order="6" />
|
||||||
|
<window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" />
|
||||||
|
<window_info anchor="right" id="Ant Build" order="1" weight="0.25" />
|
||||||
|
<window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
|
||||||
|
</layout>
|
||||||
|
</component>
|
||||||
|
<component name="editorHistoryManager">
|
||||||
|
<entry file="file://$PROJECT_DIR$/core/scrapping_utils.py">
|
||||||
|
<provider selected="true" editor-type-id="text-editor">
|
||||||
|
<state relative-caret-position="60">
|
||||||
|
<caret line="4" lean-forward="true" selection-start-line="4" selection-end-line="4" />
|
||||||
|
</state>
|
||||||
|
</provider>
|
||||||
|
</entry>
|
||||||
|
</component>
|
||||||
|
</project>
|
||||||
BIN
__pycache__/__init__.cpython-36.pyc
Normal file
BIN
__pycache__/__init__.cpython-36.pyc
Normal file
Binary file not shown.
Binary file not shown.
BIN
core/__pycache__/scrapping_utils.cpython-36.pyc
Normal file
BIN
core/__pycache__/scrapping_utils.cpython-36.pyc
Normal file
Binary file not shown.
36
core/scrapping_utils.py
Normal file
36
core/scrapping_utils.py
Normal file
|
|
@ -0,0 +1,36 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import requests
|
||||||
|
|
||||||
|
|
||||||
|
class UrlAttack():
    """Perform one HTTP GET against a URL and remember how it went.

    Typical use: build the object with the target URL, call ``attack()``
    once, then inspect ``success`` or call ``get_response()`` /
    ``get_text()``.

    Attributes:
        url: The URL that will be requested.
        success: ``None`` until ``attack()`` has run; afterwards ``True``
            when the server answered with an OK status and ``False``
            otherwise (bad status or request error).
        has_been_attacked: ``True`` once ``attack()`` has been called.
        response: The ``requests`` response of the last attack, or
            ``None`` when there is none (not attacked yet, or the
            request itself failed).
    """

    # Browser-like headers sent with every request.
    # NOTE(review): advertising "br" presumably requires a brotli decoder
    # to be installed for the body to be decoded - confirm.
    headers = {
        'Upgrade-Insecure-Requests': "1",
        'Accept': ("text/html,application/xhtml+xml,application/xml;q=0.9,"
                   "image/webp,image/apng,*/*;q=0.8"),
        'Accept-Encoding': "gzip, deflate, br",
        'Accept-Language': "en-US,en;q=0.9",
        'user-agent': ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Ubuntu Chromium/67.0.3396.99 '
                       'Chrome/67.0.3396.99 Safari/537.36'),
    }

    # Seconds to wait for the server before giving up on the request.
    timeout = 20

    def __init__(self, url):
        """Store the target URL; no network traffic happens here."""
        self.url = url
        self.success = None
        self.has_been_attacked = False
        # Defined up front so the getters never hit a missing attribute.
        self.response = None

    def attack(self):
        """Request ``self.url`` and record the outcome in ``self.success``.

        Request-level failures (connection error, timeout, invalid URL,
        ...) are not propagated: they are recorded as ``success = False``.
        """
        self.has_been_attacked = True
        try:
            # headers/timeout are class attributes, so they must be reached
            # through ``self``; as bare names they raise NameError.
            self.response = requests.get(self.url,
                                         headers=self.headers,
                                         timeout=self.timeout)
        except requests.RequestException:
            self.response = None
            self.success = False
        else:
            # A non-OK status is an explicit failure rather than leaving
            # ``success`` as None or as a stale value from a previous call.
            self.success = bool(self.response.ok)

    def get_response(self):
        """Return the response object if the attack succeeded, else ``None``."""
        if self.success:
            return self.response
        return None

    def get_text(self):
        """Return the response body as text if the attack succeeded, else ``None``."""
        if self.success:
            # ``text`` is a property of the response, not a method.
            return self.response.text
        return None
|
||||||
2
explorer/__init__.py
Normal file
2
explorer/__init__.py
Normal file
|
|
@ -0,0 +1,2 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
BIN
explorer/__pycache__/__init__.cpython-36.pyc
Normal file
BIN
explorer/__pycache__/__init__.cpython-36.pyc
Normal file
Binary file not shown.
BIN
explorer/__pycache__/mock.cpython-36.pyc
Normal file
BIN
explorer/__pycache__/mock.cpython-36.pyc
Normal file
Binary file not shown.
|
|
@ -1,9 +1,12 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
import sys
|
import sys
|
||||||
sys.path.append('..')
|
sys.path.append('..')
|
||||||
|
import uuid
|
||||||
|
import datetime
|
||||||
from time import sleep
|
from time import sleep
|
||||||
|
|
||||||
from core.mysql_wrapper import get_anunciosdb
|
from core.mysql_wrapper import get_anunciosdb
|
||||||
|
from core.scrapping_utils import UrlAttack
|
||||||
|
|
||||||
class Explorer():
|
class Explorer():
|
||||||
|
|
||||||
|
|
@ -39,7 +42,7 @@ class Explorer():
|
||||||
if not self.queue_is_up():
|
if not self.queue_is_up():
|
||||||
break
|
break
|
||||||
|
|
||||||
self.compose_listing_url
|
current_task = ExploringTask(self.compose_listing_url)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -80,19 +83,52 @@ class Explorer():
|
||||||
|
|
||||||
def compose_listing_url(self):
|
def compose_listing_url(self):
|
||||||
#Decide que url hay que componer y la compone
|
#Decide que url hay que componer y la compone
|
||||||
|
raiz = 'https://www.idealista.com/'
|
||||||
raiz, orden = componer_raiz(p_orden, tipo_anuncios, ciudad)
|
tipo = #Logica random
|
||||||
cantidad = calcular_cantidad_listado(anuncios_por_capturar)
|
ciudad = 'barcelona'
|
||||||
|
numero = #logica random
|
||||||
lista_urls = []
|
url = raiz + tipo + '-garajes/' + ciudad + '-' + ciudad + '/' +
|
||||||
|
'pagina-' + numero + '.htm'
|
||||||
for num in range(primera_pagina, cantidad + primera_pagina):
|
|
||||||
lista_urls.append(raiz + 'pagina-' + str(num) + '.htm' + orde$
|
return url
|
||||||
|
|
||||||
|
|
||||||
return lista_urls
|
class ExploringTask():
    """One exploration job: fetch a single listing URL and track its status.

    Attributes:
        creation_date: When the task object was created.
        target_url: The URL this task will request.
        id: Random UUID4 string identifying the task.
        status: ``'Pending'`` until ``explore()`` runs, then ``'Success'``
            or ``'Failure'``.
        attack_date: When ``explore()`` was last started, ``None`` before.
        listings_ok: Whether listings were obtained from the page; stays
            ``False`` until ``get_listings()`` is implemented and sets it.
    """

    def __init__(self, url):
        """Create a pending task for ``url`` and log its creation."""
        # The module is imported as ``datetime``; ``Datetime`` is undefined.
        self.creation_date = datetime.datetime.now()
        self.target_url = url
        self.id = str(uuid.uuid4())
        self.status = 'Pending'
        # Initialised here so explore() can read them unconditionally.
        self.attack_date = None
        self.listings_ok = False

        self.log()

    def explore(self):
        """Request the target URL and update ``status`` with the outcome.

        The task is a ``'Success'`` only when the request succeeded and
        listings were obtained; anything else is a ``'Failure'``.
        """
        self.attack_date = datetime.datetime.now()
        # The URL is stored as ``target_url``; there is no ``url`` attribute.
        attack = UrlAttack(self.target_url)

        attack.attack()

        if attack.success:
            self.get_listings()

        if attack.success and self.listings_ok:
            self.status = 'Success'
        else:
            self.status = 'Failure'

        # Must be called; a bare ``self.log`` does nothing.
        self.log()

    def get_listings(self):
        """Placeholder: should return the cleaned-up listings.

        TODO: implement; expected to set ``self.listings_ok``.
        """

    def log(self):
        """Placeholder: should persist the task state (e.g. in a MongoDB).

        TODO: implement.
        """

    def extract_listings(self):
        """Placeholder: should pull listing references out of a page's HTML.

        TODO: implement. If no reference is found the task is to be
        considered a failure.
        """
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue