Correcciones en wrapper_mysql y avance en metodos de explorer. Iniciado modulo de alertas.
This commit is contained in:
parent
e883f9031b
commit
5eedb037ed
6 changed files with 359 additions and 105 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -0,0 +1 @@
|
|||
/data_backups
|
||||
156
.idea/workspace.xml
generated
156
.idea/workspace.xml
generated
|
|
@ -1,7 +1,12 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ChangeListManager">
|
||||
<list default="true" id="6e2fbba0-85ff-42d6-8e70-e4cdef1000c8" name="Default Changelist" comment="">
|
||||
<list default="true" id="6e2fbba0-85ff-42d6-8e70-e4cdef1000c8" name="Default Changelist" comment="Correcciones en wrapper_mysql y avance en metodos de explorer. Iniciado modulo de alertas.">
|
||||
<change afterPath="$PROJECT_DIR$/core/alerts.py" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/.gitignore" beforeDir="false" afterPath="$PROJECT_DIR$/.gitignore" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/core/mysql_wrapper.py" beforeDir="false" afterPath="$PROJECT_DIR$/core/mysql_wrapper.py" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/core/scrapping_utils.py" beforeDir="false" afterPath="$PROJECT_DIR$/core/scrapping_utils.py" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/explorer/explorer.py" beforeDir="false" afterPath="$PROJECT_DIR$/explorer/explorer.py" afterDir="false" />
|
||||
</list>
|
||||
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
|
||||
|
|
@ -15,24 +20,90 @@
|
|||
<usages-collector id="statistics.lifecycle.project">
|
||||
<counts>
|
||||
<entry key="project.open.time.0" value="1" />
|
||||
<entry key="project.opened" value="1" />
|
||||
<entry key="project.open.time.13" value="1" />
|
||||
<entry key="project.opened" value="2" />
|
||||
</counts>
|
||||
</usages-collector>
|
||||
<usages-collector id="statistics.file.extensions.open">
|
||||
<counts>
|
||||
<entry key="py" value="4" />
|
||||
</counts>
|
||||
</usages-collector>
|
||||
<usages-collector id="statistics.file.types.open">
|
||||
<counts>
|
||||
<entry key="Python" value="4" />
|
||||
</counts>
|
||||
</usages-collector>
|
||||
<usages-collector id="statistics.file.extensions.edit">
|
||||
<counts>
|
||||
<entry key="py" value="634" />
|
||||
<entry key="txt" value="92" />
|
||||
</counts>
|
||||
</usages-collector>
|
||||
<usages-collector id="statistics.file.types.edit">
|
||||
<counts>
|
||||
<entry key="PLAIN_TEXT" value="92" />
|
||||
<entry key="Python" value="634" />
|
||||
</counts>
|
||||
</usages-collector>
|
||||
</session>
|
||||
</component>
|
||||
<component name="FileEditorManager">
|
||||
<leaf>
|
||||
<file pinned="false" current-in-tab="true">
|
||||
<leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
|
||||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/explorer/explorer.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="150">
|
||||
<caret line="11" column="18" lean-forward="true" selection-start-line="11" selection-start-column="18" selection-end-line="11" selection-end-column="18" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/core/alerts.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="45">
|
||||
<caret line="3" lean-forward="true" selection-start-line="2" selection-start-column="14" selection-end-line="3" />
|
||||
<folding>
|
||||
<element signature="e#0#46#0" expanded="true" />
|
||||
<marker date="1535650379609" expanded="true" signature="219:256" ph="..." />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/core/scrapping_utils.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="60">
|
||||
<caret line="4" lean-forward="true" selection-start-line="4" selection-end-line="4" />
|
||||
<state relative-caret-position="525">
|
||||
<caret line="35" column="37" lean-forward="true" selection-start-line="35" selection-start-column="37" selection-end-line="35" selection-end-column="37" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
<file pinned="false" current-in-tab="true">
|
||||
<entry file="file://$PROJECT_DIR$/core/mysql_wrapper.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="391">
|
||||
<caret line="36" column="22" lean-forward="true" selection-start-line="36" selection-start-column="22" selection-end-line="36" selection-end-column="22" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
</leaf>
|
||||
</component>
|
||||
<component name="FileTemplateManagerImpl">
|
||||
<option name="RECENT_TEMPLATES">
|
||||
<list>
|
||||
<option value="Python Script" />
|
||||
</list>
|
||||
</option>
|
||||
</component>
|
||||
<component name="FindInProjectRecents">
|
||||
<findStrings>
|
||||
<find>queue_retries</find>
|
||||
</findStrings>
|
||||
</component>
|
||||
<component name="Git.Settings">
|
||||
<option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
|
||||
</component>
|
||||
|
|
@ -40,6 +111,10 @@
|
|||
<option name="CHANGED_PATHS">
|
||||
<list>
|
||||
<option value="$PROJECT_DIR$/core/scrapping_utils.py" />
|
||||
<option value="$PROJECT_DIR$/explorer/test_explorer.py" />
|
||||
<option value="$PROJECT_DIR$/explorer/explorer.py" />
|
||||
<option value="$PROJECT_DIR$/core/alerts.py" />
|
||||
<option value="$PROJECT_DIR$/core/mysql_wrapper.py" />
|
||||
</list>
|
||||
</option>
|
||||
</component>
|
||||
|
|
@ -96,6 +171,9 @@
|
|||
</list>
|
||||
</option>
|
||||
</component>
|
||||
<component name="SvnConfiguration">
|
||||
<configuration />
|
||||
</component>
|
||||
<component name="TaskManager">
|
||||
<task active="true" id="Default" summary="Default task">
|
||||
<changelist id="6e2fbba0-85ff-42d6-8e70-e4cdef1000c8" name="Default Changelist" comment="" />
|
||||
|
|
@ -106,34 +184,82 @@
|
|||
</task>
|
||||
<servers />
|
||||
</component>
|
||||
<component name="TodoView" selected-index="4">
|
||||
<todo-panel id="selected-file">
|
||||
<is-autoscroll-to-source value="true" />
|
||||
</todo-panel>
|
||||
<todo-panel id="all">
|
||||
<are-packages-shown value="true" />
|
||||
<is-autoscroll-to-source value="true" />
|
||||
</todo-panel>
|
||||
</component>
|
||||
<component name="ToolWindowManager">
|
||||
<frame x="0" y="-2" width="1920" height="1082" extended-state="6" />
|
||||
<editor active="true" />
|
||||
<layout>
|
||||
<window_info id="Favorites" side_tool="true" />
|
||||
<window_info content_ui="combo" id="Project" order="0" visible="true" weight="0.24986821" />
|
||||
<window_info content_ui="combo" id="Project" order="0" visible="true" weight="0.14918292" />
|
||||
<window_info id="Structure" order="1" side_tool="true" weight="0.25" />
|
||||
<window_info anchor="bottom" id="Version Control" weight="0.32983193" />
|
||||
<window_info anchor="bottom" id="Python Console" weight="0.32983193" />
|
||||
<window_info anchor="bottom" id="Event Log" side_tool="true" />
|
||||
<window_info active="true" anchor="bottom" id="Terminal" visible="true" weight="0.32983193" />
|
||||
<window_info id="Favorites" order="2" side_tool="true" />
|
||||
<window_info anchor="bottom" id="Message" order="0" />
|
||||
<window_info anchor="bottom" id="Find" order="1" />
|
||||
<window_info anchor="bottom" id="Run" order="2" />
|
||||
<window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
|
||||
<window_info anchor="bottom" id="Cvs" order="4" weight="0.25" />
|
||||
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
|
||||
<window_info anchor="bottom" id="TODO" order="6" />
|
||||
<window_info active="true" anchor="bottom" id="TODO" order="6" visible="true" weight="0.32983193" />
|
||||
<window_info anchor="bottom" id="Version Control" order="7" weight="0.32983193" />
|
||||
<window_info anchor="bottom" id="Terminal" order="8" weight="0.32983193" />
|
||||
<window_info anchor="bottom" id="Event Log" order="9" side_tool="true" />
|
||||
<window_info anchor="bottom" id="Python Console" order="10" weight="0.32983193" />
|
||||
<window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" />
|
||||
<window_info anchor="right" id="Ant Build" order="1" weight="0.25" />
|
||||
<window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
|
||||
</layout>
|
||||
</component>
|
||||
<component name="VcsContentAnnotationSettings">
|
||||
<option name="myLimit" value="2678400000" />
|
||||
</component>
|
||||
<component name="VcsManagerConfiguration">
|
||||
<MESSAGE value="Correcciones en wrapper_mysql y avance en metodos de explorer. Iniciado modulo de alertas." />
|
||||
<option name="LAST_COMMIT_MESSAGE" value="Correcciones en wrapper_mysql y avance en metodos de explorer. Iniciado modulo de alertas." />
|
||||
</component>
|
||||
<component name="editorHistoryManager">
|
||||
<entry file="file://$PROJECT_DIR$/explorer/test_explorer.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="90">
|
||||
<caret line="6" lean-forward="true" selection-start-line="6" selection-end-line="6" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/core/scrapping_utils.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="60">
|
||||
<caret line="4" lean-forward="true" selection-start-line="4" selection-end-line="4" />
|
||||
<state relative-caret-position="525">
|
||||
<caret line="35" column="37" lean-forward="true" selection-start-line="35" selection-start-column="37" selection-end-line="35" selection-end-column="37" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/explorer/explorer.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="150">
|
||||
<caret line="11" column="18" lean-forward="true" selection-start-line="11" selection-start-column="18" selection-end-line="11" selection-end-column="18" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/core/alerts.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="45">
|
||||
<caret line="3" lean-forward="true" selection-start-line="2" selection-start-column="14" selection-end-line="3" />
|
||||
<folding>
|
||||
<element signature="e#0#46#0" expanded="true" />
|
||||
<marker date="1535650379609" expanded="true" signature="219:256" ph="..." />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/core/mysql_wrapper.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="391">
|
||||
<caret line="36" column="22" lean-forward="true" selection-start-line="36" selection-start-column="22" selection-end-line="36" selection-end-column="22" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
|
|
|
|||
39
core/alerts.py
Normal file
39
core/alerts.py
Normal file
|
|
@ -0,0 +1,39 @@
|
|||
from email.mime.multipart import MIMEMultipart
|
||||
from email.mime.text import MIMEText
|
||||
import smtplib
|
||||
|
||||
|
||||
my_adress = 'drogonalerts@gmail.com'
|
||||
master_address = 'pablomartincalvo@gmail.com'
|
||||
|
||||
def alert_master(header, message):
    """Send a plain-text alert email from the alerts account to the master.

    The sender and recipient are the module-level ``my_adress`` and
    ``master_address`` constants. The sender password is read from the
    ``DROGON_ALERTS_PASSWORD`` environment variable so that no credential
    is stored in the source code.

    Args:
        header: Subject line of the alert email.
        message: Plain-text body of the alert email.

    Raises:
        RuntimeError: If ``DROGON_ALERTS_PASSWORD`` is not set.
        smtplib.SMTPException: If the SMTP session or the delivery fails.
    """
    import os  # local import: only this function needs the environment

    password = os.environ.get('DROGON_ALERTS_PASSWORD')
    if not password:
        raise RuntimeError("DROGON_ALERTS_PASSWORD is not set; cannot send alert email.")

    # Build the message from the caller's arguments
    msg = MIMEMultipart()
    msg['From'] = my_adress
    msg['To'] = master_address
    msg['Subject'] = header
    msg.attach(MIMEText(message, 'plain'))

    # The context manager issues QUIT even when login or delivery fails
    with smtplib.SMTP('smtp.gmail.com', 587) as server:
        server.starttls()
        server.login(my_adress, password)
        server.sendmail(my_adress, master_address, msg.as_string())

    print("successfully sent email to %s:" % (msg['To']))
|
||||
|
|
@ -1,10 +1,15 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
import mysql.connector
|
||||
|
||||
anuncios_db_parameters = {'host': '46.183.115.154',
|
||||
anuncios_db_parameters = {'host': '185.166.215.170',
|
||||
'database': 'anuncios',
|
||||
'user': 'pablo',
|
||||
'password': 'noesfacilvivirsinpin'}
|
||||
'user': 'drogon',
|
||||
'password': 'noesfacilvivirsindrogon'}
|
||||
|
||||
tasks_db_parameters = {'host': '185.166.215.170',
|
||||
'database': 'tasks',
|
||||
'user': 'drogon',
|
||||
'password': 'noesfacilvivirsindrogon'}
|
||||
|
||||
class DatabaseWrapper():
|
||||
|
||||
|
|
@ -18,14 +23,11 @@ class DatabaseWrapper():
|
|||
self.ping()
|
||||
|
||||
def connect(self):
|
||||
try:
|
||||
self.connection = mysql.connector.connect(host = self.host,
|
||||
database = self.database,
|
||||
user = self.user,
|
||||
password = self.password)
|
||||
except Exception as e:
|
||||
print("Could not connect to the database.")
|
||||
print(e)
|
||||
password = self.password,
|
||||
autocommit = True)
|
||||
|
||||
def disconnect(self):
|
||||
if self.connection.is_connected():
|
||||
|
|
@ -42,13 +44,19 @@ class DatabaseWrapper():
|
|||
execution_cursor.execute(query_statement, query_parameters)
|
||||
self.disconnect()
|
||||
return execution_cursor
|
||||
else:
|
||||
raise Exception("Could not connect to the database.")
|
||||
|
||||
|
||||
def query_dict(self, query_statement, query_parameters = None):
    """Run a query through ``self.query`` with ``dictionary = True``.

    Args:
        query_statement: SQL statement forwarded unchanged to ``self.query``.
        query_parameters: Optional parameters forwarded unchanged to
            ``self.query``.

    Returns:
        Whatever ``self.query`` returns for the given arguments.
    """
    return self.query(query_statement, query_parameters, dictionary = True)
|
||||
|
||||
|
||||
def get_anunciosdb():
    """Return a new DatabaseWrapper built from ``anuncios_db_parameters``."""
    return DatabaseWrapper(anuncios_db_parameters)
|
||||
|
||||
def get_tasksdb():
    """Return a new DatabaseWrapper built from ``tasks_db_parameters``."""
    return DatabaseWrapper(tasks_db_parameters)
|
||||
|
||||
|
||||
|
||||
|
|
@ -33,4 +33,4 @@ class UrlAttack():
|
|||
|
||||
def get_text(self):
|
||||
if self.success:
|
||||
return self.response.text()
|
||||
return self.response.text
|
||||
|
|
@ -2,28 +2,28 @@
|
|||
import sys
|
||||
sys.path.append('..')
|
||||
import uuid
|
||||
import datetime
|
||||
from datetime import datetime
|
||||
from time import sleep
|
||||
from bs4 import BeautifulSoup
|
||||
from core.mysql_wrapper import get_anunciosdb
|
||||
import re
|
||||
from random import randint
|
||||
from core.mysql_wrapper import get_anunciosdb, get_tasksdb
|
||||
from core.scrapping_utils import UrlAttack
|
||||
import core.alerts
|
||||
|
||||
class Explorer():
|
||||
|
||||
sleep_time_no_work = 60
|
||||
sleep_time_no_service = 600
|
||||
|
||||
working_hours = {start: datetime.time(9, 0, 0),
|
||||
end: datetime.time(18, 0, 0)}
|
||||
monthly_capture_target = 1000
|
||||
|
||||
def __init__(self):
|
||||
try:
|
||||
self.anunciosdb = get_anunciosdb()
|
||||
|
||||
except:
|
||||
print("Could not connect to anuncios DB")
|
||||
try:
|
||||
self.task_log_db = #get_task_log_db()
|
||||
except:
|
||||
print("Could not connect to task log DB")
|
||||
|
||||
self.max_db_retries = 3
|
||||
self.db_retries = 0
|
||||
|
|
@ -31,7 +31,6 @@ class Explorer():
|
|||
self.queue_retries = 0
|
||||
|
||||
def start(self):
|
||||
#Arrancar el servicio
|
||||
|
||||
while True:
|
||||
if not self.there_is_work():
|
||||
|
|
@ -45,16 +44,7 @@ class Explorer():
|
|||
break
|
||||
|
||||
current_task = ExploringTask(self.compose_listing_url)
|
||||
if current_task.is_ready_to_explore:
|
||||
current_task.explore()
|
||||
else:
|
||||
break
|
||||
if current_task.status == 'referencias ready':
|
||||
current_referencias = current_task.get_referencias()
|
||||
|
||||
for referencia in current_referencias:
|
||||
self.post_task_to_queue(referencia)
|
||||
current_task.update_status('Sent to queue')
|
||||
|
||||
continue
|
||||
|
||||
|
|
@ -64,12 +54,22 @@ class Explorer():
|
|||
def stop(self):
|
||||
#TODO
|
||||
#Detener el servicio
|
||||
pass
|
||||
|
||||
def there_is_work(self):
|
||||
#TODO
|
||||
#Comprueba si hay trabajo por hacer
|
||||
#Mirando en la bd de tasks cuantas se han hecho ultimamente, mensualmente
|
||||
#etc.
|
||||
"""
|
||||
Funcion que agrupa las condiciones que se deben cumplir para poder trabajar
|
||||
"""
|
||||
if not self.in_working_hours():
|
||||
return False
|
||||
|
||||
if self.get_referencias_acquired_today() >= self.get_max_referencias_for_today():
|
||||
return False
|
||||
|
||||
if self.get_tasks_created_today() >= self.get_max_tasks_today():
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def database_is_up(self):
|
||||
while self.db_retries <= self.max_db_retries:
|
||||
|
|
@ -83,7 +83,6 @@ class Explorer():
|
|||
|
||||
return False
|
||||
|
||||
|
||||
def queue_is_up(self):
|
||||
#TODO
|
||||
while self.queue_retries <= self.max_queue_retries:
|
||||
|
|
@ -97,75 +96,138 @@ class Explorer():
|
|||
|
||||
return False
|
||||
|
||||
def in_working_hours(self):
    """Tell whether the current local time lies inside the working window.

    Returns:
        bool: True when ``self.working_hours['start']`` <= now <=
        ``self.working_hours['end']`` (both bounds inclusive).
    """
    # working_hours is an attribute of the explorer, so it must be reached
    # through self; a bare name would raise NameError here.
    current_time = datetime.now().time()
    return self.working_hours['start'] <= current_time <= self.working_hours['end']
|
||||
|
||||
def get_referencias_acquired_today(self):
    """Count the new referencias that appeared during the last 24 hours.

    Returns:
        The count produced by the query as a scalar, or 0 when the cursor
        yields no row.
    """
    query_statement = """ SELECT count(referencias)
                          FROM primera_captura_full
                          WHERE fecha_captura >= now() - INTERVAL 1 DAY;
                      """

    cursor_result = self.anunciosdb.query(query_statement)

    # fetchone() yields a one-column row; unwrap it so callers can compare
    # the result against numeric limits.
    row = cursor_result.fetchone()
    return row[0] if row else 0
|
||||
|
||||
def get_max_referencias_for_today(self):
    """Compute the capture target for the last 24 hours.

    The daily share of ``self.monthly_capture_target`` is scaled by how far
    the captures of the last 30 days deviate from the monthly target: fewer
    captures than the target raise the daily limit, more captures lower it.

    Returns:
        float: the adjusted daily limit of new referencias.
    """
    query_statement = """ SELECT count(referencias)
                          FROM primera_captura_full
                          WHERE fecha_captura >= now() - INTERVAL 30 DAY;
                      """
    cursor_result = self.anunciosdb.query(query_statement)

    # fetchone() yields a one-column row; unwrap it before doing arithmetic
    row = cursor_result.fetchone()
    new_referencias_last_30 = row[0] if row else 0

    # The target is an attribute of the explorer, so it is read through self
    target = self.monthly_capture_target
    deviation = (target - new_referencias_last_30) / target
    max_referencias = (target / 30) * (1 + deviation)

    return max_referencias
|
||||
|
||||
def get_tasks_created_today(self):
    """Count the tasks logged as 'Attacked' during the last 24 hours.

    Returns:
        The count produced by the query as a scalar, or 0 when the cursor
        yields no row.
    """
    # NOTE(review): this reads from `exploring_tasks`, while the task logger
    # in this file writes to `exploring_tasks_logs` -- confirm the table name.
    # NOTE(review): relies on `self.tasksdb` being set by the constructor -- confirm.
    query_statement = """ SELECT count(uuid)
                          FROM exploring_tasks
                          WHERE status = 'Attacked'
                          AND write_time >= now() - INTERVAL 1 DAY;
                      """
    cursor_result = self.tasksdb.query(query_statement)

    # fetchone() yields a one-column row; unwrap it so callers can compare
    # the result against numeric limits.
    row = cursor_result.fetchone()
    tasks_created_today = row[0] if row else 0

    return tasks_created_today
|
||||
|
||||
def get_max_tasks_today(self):
    """Derive the daily ceiling of task attempts from the capture limit.

    Returns:
        The value of ``get_max_referencias_for_today()`` divided by 30 and
        multiplied by 6.
    """
    # NOTE(review): the capture limit is divided by 30 again here -- confirm
    # that this second division is intended.
    referencias_limit = self.get_max_referencias_for_today()
    per_slot = referencias_limit / 30
    return per_slot * 6
|
||||
|
||||
def compose_listing_url(self):
    """Build a randomly chosen garage-listing URL for Barcelona.

    Returns:
        str: a URL of the form
        ``https://www.idealista.com/<tipo>-garajes/barcelona-barcelona/pagina-<n>.htm``
        where ``<n>`` is a random page number between 1 and 30.
    """
    raiz = 'https://www.idealista.com/'
    # randint() yields an int, which cannot be concatenated to a str; map it
    # to the URL slug instead.
    # NOTE(review): 'venta' matches the URL used by testear_exploring_task;
    # 'alquiler' is assumed to be the other listing type -- confirm.
    tipo = {1: 'venta', 2: 'alquiler'}[randint(1, 2)]
    ciudad = 'barcelona'
    numero = randint(1, 30)
    url = raiz + tipo + '-garajes/' + ciudad + '-' + ciudad + '/' + \
          'pagina-' + str(numero) + '.htm'

    return url
|
||||
|
||||
def post_task_to_queue(self, referencia):
    """Placeholder for sending a task for ``referencia`` to the redis queue.

    Not implemented yet: the call currently does nothing and returns None.

    Args:
        referencia: Listing reference the queued task would be about.
    """
    # TODO Send the task to the redis queue
    # An explicit body is required: a def holding only comments does not parse.
    pass
|
||||
|
||||
|
||||
|
||||
class ExploringTask():
|
||||
|
||||
def __init__(self, url):
|
||||
self.anunciosdb = get_anunciosdb()
|
||||
self.tasksdb = get_tasksdb()
|
||||
self.target_url = url
|
||||
self.id = str(uuid.uuid4())
|
||||
self.update_status('Pending')
|
||||
self._update_status('Pending')
|
||||
|
||||
try:
|
||||
self.anunciosdb = get_anunciosdb()
|
||||
except:
|
||||
self.anunciosdb = None
|
||||
self.update_status('Unable to connect to anuncios DB')
|
||||
|
||||
try:
|
||||
#TODO
|
||||
#Pendiente de implementar wraper para MongoDB
|
||||
#self.task_log_db =
|
||||
except:
|
||||
self.update_status('Unable to connect to task log DB')
|
||||
#self.task_log_db = None
|
||||
|
||||
def update_status(self, new_status):
|
||||
def _update_status(self, new_status):
|
||||
self.status = new_status
|
||||
self._log_in_taskdb()
|
||||
|
||||
|
||||
def is_ready_to_explore(self):
|
||||
if self.anunciosdb is not None and self.task_log_db is not None:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
self._log_in_tasksdb()
|
||||
|
||||
def explore(self):
|
||||
attack = UrlAttack(self.url)
|
||||
attack = UrlAttack(self.target_url)
|
||||
attack.attack()
|
||||
self._update_status('Attacked')
|
||||
|
||||
if attack.success:
|
||||
self._validate_referencias(attack.get_text())
|
||||
self._extract_referencias(attack.get_text())
|
||||
if self.new_listings:
|
||||
self.update_status('referencias ready')
|
||||
if self.referencias:
|
||||
self._update_status('Referencias ready')
|
||||
self._post_tasks_to_queue()
|
||||
self._update_status('Sent to Queue')
|
||||
elif self.there_are_referencias:
|
||||
self._update_status('Failure - No new referencias in HTML')
|
||||
else:
|
||||
self.update_status('Failure - No listings in HTML')
|
||||
self._update_status('Failure - HTML with no referencias')
|
||||
else:
|
||||
self.update_status('Failure - Bad request')
|
||||
self._update_status('Failure - Bad request')
|
||||
|
||||
def get_referencias(self):
|
||||
return self.referencias
|
||||
|
||||
def _log_in_taskdb(self):
|
||||
#TODO
|
||||
#Funcion que grabe estado y demas en una mongodb o argo azin
|
||||
def _log_in_tasksdb(self):
    """Write a log row for this task into the tasks database.

    The row holds the task UUID (``self.id``), the current status
    (``self.status``) and a timestamp generated by the database with NOW().
    """
    insert_statement = """INSERT INTO exploring_tasks_logs
                         (uuid, write_time, status)
                         VALUES (%(uuid)s, NOW(), %(status)s)"""

    self.tasksdb.query(insert_statement,
                       {'uuid': self.id, 'status': self.status})
|
||||
|
||||
def _validate_referencias(self, html):
    """Check that the listing tags still follow the expected ad format.

    Every element with class "item" is expected to carry a "data-adid"
    attribute made of 3 to 20 digits. A warning is emitted for each element
    where the attribute is missing or does not match.

    Args:
        html: HTML text of the listing page to check.
    """
    import warnings  # local import: only this check emits warnings

    soup = BeautifulSoup(html, 'html5lib')
    ads = soup.find_all(class_ = "item")
    pattern = re.compile("^[0-9]{3,20}$")

    for ad in ads:
        # .get() returns None instead of raising KeyError when the element
        # lacks the attribute altogether.
        ad_id = ad.get("data-adid")
        if ad_id is None or not pattern.match(ad_id):
            warnings.warn("Listing item with unexpected data-adid: %r" % (ad_id,))
|
||||
|
||||
|
||||
def _extract_referencias(self, html):
|
||||
"""
|
||||
|
|
@ -173,17 +235,19 @@ class ExploringTask():
|
|||
de capturas, y guarda si han aparecido listings y si hay alguno nuevo
|
||||
"""
|
||||
|
||||
soup = BeautifulSoup(self.html, 'html5lib')
|
||||
ads = sopa.find_all(class_ = "item")
|
||||
soup = BeautifulSoup(html, 'html5lib')
|
||||
ads = soup.find_all(class_ = "item")
|
||||
self.there_are_referencias = bool(ads)
|
||||
self.referencias = []
|
||||
for ad in ads:
|
||||
if self._is_new_listing(ad["data-adid"]):
|
||||
self.referencias.append(ad["data-adid"])
|
||||
self.new_listings = bool(self.referencias)
|
||||
|
||||
|
||||
def _is_new_listing(self, referencia):
|
||||
#TODO
|
||||
#Comprobar contra base de datos si la referencia existe en base de datos
|
||||
"""
|
||||
Comprueba si el listing ya existe en la base de datos de anuncios
|
||||
"""
|
||||
query_statement = """SELECT count(referencia)
|
||||
FROM capturas
|
||||
WHERE referencia = %s"""
|
||||
|
|
@ -196,3 +260,19 @@ class ExploringTask():
|
|||
else:
|
||||
return True
|
||||
|
||||
def _post_tasks_to_queue(self):
    """Placeholder: currently does nothing and returns None."""
    #TODO Send the referencias to redis
    pass
|
||||
|
||||
|
||||
def testear_exploring_task():
    """Manual smoke test: explore one fixed listing URL and print the result.

    Builds an ExploringTask for the Barcelona garage-sale listing page, runs
    ``explore()`` on it and prints the task's ``referencias`` attribute.
    """
    task = ExploringTask('https://www.idealista.com/venta-garajes/barcelona-barcelona/')
    task.explore()
    print(task.referencias)
|
||||
|
||||
|
||||
testear_exploring_task()
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue