Correcciones en wrapper_mysql y avance en metodos de explorer. Iniciado modulo de alertas.

This commit is contained in:
pablomartincalvo 2018-08-30 19:38:31 +02:00
parent e883f9031b
commit 5eedb037ed
6 changed files with 359 additions and 105 deletions

1
.gitignore vendored
View file

@ -0,0 +1 @@
/data_backups

156
.idea/workspace.xml generated
View file

@ -1,7 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<project version="4"> <project version="4">
<component name="ChangeListManager"> <component name="ChangeListManager">
<list default="true" id="6e2fbba0-85ff-42d6-8e70-e4cdef1000c8" name="Default Changelist" comment=""> <list default="true" id="6e2fbba0-85ff-42d6-8e70-e4cdef1000c8" name="Default Changelist" comment="Correcciones en wrapper_mysql y avance en metodos de explorer. Iniciado modulo de alertas.">
<change afterPath="$PROJECT_DIR$/core/alerts.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/.gitignore" beforeDir="false" afterPath="$PROJECT_DIR$/.gitignore" afterDir="false" />
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/core/mysql_wrapper.py" beforeDir="false" afterPath="$PROJECT_DIR$/core/mysql_wrapper.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/core/scrapping_utils.py" beforeDir="false" afterPath="$PROJECT_DIR$/core/scrapping_utils.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/explorer/explorer.py" beforeDir="false" afterPath="$PROJECT_DIR$/explorer/explorer.py" afterDir="false" /> <change beforePath="$PROJECT_DIR$/explorer/explorer.py" beforeDir="false" afterPath="$PROJECT_DIR$/explorer/explorer.py" afterDir="false" />
</list> </list>
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" /> <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
@ -15,24 +20,90 @@
<usages-collector id="statistics.lifecycle.project"> <usages-collector id="statistics.lifecycle.project">
<counts> <counts>
<entry key="project.open.time.0" value="1" /> <entry key="project.open.time.0" value="1" />
<entry key="project.opened" value="1" /> <entry key="project.open.time.13" value="1" />
<entry key="project.opened" value="2" />
</counts>
</usages-collector>
<usages-collector id="statistics.file.extensions.open">
<counts>
<entry key="py" value="4" />
</counts>
</usages-collector>
<usages-collector id="statistics.file.types.open">
<counts>
<entry key="Python" value="4" />
</counts>
</usages-collector>
<usages-collector id="statistics.file.extensions.edit">
<counts>
<entry key="py" value="634" />
<entry key="txt" value="92" />
</counts>
</usages-collector>
<usages-collector id="statistics.file.types.edit">
<counts>
<entry key="PLAIN_TEXT" value="92" />
<entry key="Python" value="634" />
</counts> </counts>
</usages-collector> </usages-collector>
</session> </session>
</component> </component>
<component name="FileEditorManager"> <component name="FileEditorManager">
<leaf> <leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
<file pinned="false" current-in-tab="true"> <file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/explorer/explorer.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="150">
<caret line="11" column="18" lean-forward="true" selection-start-line="11" selection-start-column="18" selection-end-line="11" selection-end-column="18" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/core/alerts.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="45">
<caret line="3" lean-forward="true" selection-start-line="2" selection-start-column="14" selection-end-line="3" />
<folding>
<element signature="e#0#46#0" expanded="true" />
<marker date="1535650379609" expanded="true" signature="219:256" ph="..." />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/core/scrapping_utils.py"> <entry file="file://$PROJECT_DIR$/core/scrapping_utils.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="60"> <state relative-caret-position="525">
<caret line="4" lean-forward="true" selection-start-line="4" selection-end-line="4" /> <caret line="35" column="37" lean-forward="true" selection-start-line="35" selection-start-column="37" selection-end-line="35" selection-end-column="37" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/core/mysql_wrapper.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="391">
<caret line="36" column="22" lean-forward="true" selection-start-line="36" selection-start-column="22" selection-end-line="36" selection-end-column="22" />
</state> </state>
</provider> </provider>
</entry> </entry>
</file> </file>
</leaf> </leaf>
</component> </component>
<component name="FileTemplateManagerImpl">
<option name="RECENT_TEMPLATES">
<list>
<option value="Python Script" />
</list>
</option>
</component>
<component name="FindInProjectRecents">
<findStrings>
<find>queue_retries</find>
</findStrings>
</component>
<component name="Git.Settings"> <component name="Git.Settings">
<option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" /> <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
</component> </component>
@ -40,6 +111,10 @@
<option name="CHANGED_PATHS"> <option name="CHANGED_PATHS">
<list> <list>
<option value="$PROJECT_DIR$/core/scrapping_utils.py" /> <option value="$PROJECT_DIR$/core/scrapping_utils.py" />
<option value="$PROJECT_DIR$/explorer/test_explorer.py" />
<option value="$PROJECT_DIR$/explorer/explorer.py" />
<option value="$PROJECT_DIR$/core/alerts.py" />
<option value="$PROJECT_DIR$/core/mysql_wrapper.py" />
</list> </list>
</option> </option>
</component> </component>
@ -96,6 +171,9 @@
</list> </list>
</option> </option>
</component> </component>
<component name="SvnConfiguration">
<configuration />
</component>
<component name="TaskManager"> <component name="TaskManager">
<task active="true" id="Default" summary="Default task"> <task active="true" id="Default" summary="Default task">
<changelist id="6e2fbba0-85ff-42d6-8e70-e4cdef1000c8" name="Default Changelist" comment="" /> <changelist id="6e2fbba0-85ff-42d6-8e70-e4cdef1000c8" name="Default Changelist" comment="" />
@ -106,34 +184,82 @@
</task> </task>
<servers /> <servers />
</component> </component>
<component name="TodoView" selected-index="4">
<todo-panel id="selected-file">
<is-autoscroll-to-source value="true" />
</todo-panel>
<todo-panel id="all">
<are-packages-shown value="true" />
<is-autoscroll-to-source value="true" />
</todo-panel>
</component>
<component name="ToolWindowManager"> <component name="ToolWindowManager">
<frame x="0" y="-2" width="1920" height="1082" extended-state="6" /> <frame x="0" y="-2" width="1920" height="1082" extended-state="6" />
<editor active="true" /> <editor active="true" />
<layout> <layout>
<window_info id="Favorites" side_tool="true" /> <window_info content_ui="combo" id="Project" order="0" visible="true" weight="0.14918292" />
<window_info content_ui="combo" id="Project" order="0" visible="true" weight="0.24986821" />
<window_info id="Structure" order="1" side_tool="true" weight="0.25" /> <window_info id="Structure" order="1" side_tool="true" weight="0.25" />
<window_info anchor="bottom" id="Version Control" weight="0.32983193" /> <window_info id="Favorites" order="2" side_tool="true" />
<window_info anchor="bottom" id="Python Console" weight="0.32983193" />
<window_info anchor="bottom" id="Event Log" side_tool="true" />
<window_info active="true" anchor="bottom" id="Terminal" visible="true" weight="0.32983193" />
<window_info anchor="bottom" id="Message" order="0" /> <window_info anchor="bottom" id="Message" order="0" />
<window_info anchor="bottom" id="Find" order="1" /> <window_info anchor="bottom" id="Find" order="1" />
<window_info anchor="bottom" id="Run" order="2" /> <window_info anchor="bottom" id="Run" order="2" />
<window_info anchor="bottom" id="Debug" order="3" weight="0.4" /> <window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
<window_info anchor="bottom" id="Cvs" order="4" weight="0.25" /> <window_info anchor="bottom" id="Cvs" order="4" weight="0.25" />
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" /> <window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
<window_info anchor="bottom" id="TODO" order="6" /> <window_info active="true" anchor="bottom" id="TODO" order="6" visible="true" weight="0.32983193" />
<window_info anchor="bottom" id="Version Control" order="7" weight="0.32983193" />
<window_info anchor="bottom" id="Terminal" order="8" weight="0.32983193" />
<window_info anchor="bottom" id="Event Log" order="9" side_tool="true" />
<window_info anchor="bottom" id="Python Console" order="10" weight="0.32983193" />
<window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" /> <window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" />
<window_info anchor="right" id="Ant Build" order="1" weight="0.25" /> <window_info anchor="right" id="Ant Build" order="1" weight="0.25" />
<window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" /> <window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
</layout> </layout>
</component> </component>
<component name="VcsContentAnnotationSettings">
<option name="myLimit" value="2678400000" />
</component>
<component name="VcsManagerConfiguration">
<MESSAGE value="Correcciones en wrapper_mysql y avance en metodos de explorer. Iniciado modulo de alertas." />
<option name="LAST_COMMIT_MESSAGE" value="Correcciones en wrapper_mysql y avance en metodos de explorer. Iniciado modulo de alertas." />
</component>
<component name="editorHistoryManager"> <component name="editorHistoryManager">
<entry file="file://$PROJECT_DIR$/explorer/test_explorer.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="90">
<caret line="6" lean-forward="true" selection-start-line="6" selection-end-line="6" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/core/scrapping_utils.py"> <entry file="file://$PROJECT_DIR$/core/scrapping_utils.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="60"> <state relative-caret-position="525">
<caret line="4" lean-forward="true" selection-start-line="4" selection-end-line="4" /> <caret line="35" column="37" lean-forward="true" selection-start-line="35" selection-start-column="37" selection-end-line="35" selection-end-column="37" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/explorer/explorer.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="150">
<caret line="11" column="18" lean-forward="true" selection-start-line="11" selection-start-column="18" selection-end-line="11" selection-end-column="18" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/core/alerts.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="45">
<caret line="3" lean-forward="true" selection-start-line="2" selection-start-column="14" selection-end-line="3" />
<folding>
<element signature="e#0#46#0" expanded="true" />
<marker date="1535650379609" expanded="true" signature="219:256" ph="..." />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/core/mysql_wrapper.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="391">
<caret line="36" column="22" lean-forward="true" selection-start-line="36" selection-start-column="22" selection-end-line="36" selection-end-column="22" />
</state> </state>
</provider> </provider>
</entry> </entry>

39
core/alerts.py Normal file
View file

@ -0,0 +1,39 @@
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
import smtplib
# Presumably the account the alert e-mails are sent from -- TODO confirm.
# NOTE(review): the name is misspelled ("adress"); it is kept because other
# modules may already import it under this spelling.
my_adress = 'drogonalerts@gmail.com'
# Presumably the recipient of the alert e-mails -- TODO confirm.
master_address = 'pablomartincalvo@gmail.com'
def alert_master(header, message, password=None):
    """
    Send a plain-text alert e-mail from ``my_adress`` to ``master_address``.

    The mail is submitted through smtp.gmail.com on port 587 using STARTTLS.

    :param header: text used as the subject of the e-mail.
    :param message: text used as the body of the e-mail.
    :param password: password of the sender account. When omitted it is read
        from the ``DROGON_ALERTS_PASSWORD`` environment variable so that the
        credential does not need to live in the source code.
    :raises ValueError: if no password is given and the environment variable
        is unset or empty.
    :raises smtplib.SMTPException: if the TLS handshake, login or delivery
        fails (connection errors from the socket layer propagate as well).
    """
    import os

    # Resolve the credential first so a misconfiguration fails fast, before
    # any network connection is opened.
    if password is None:
        password = os.environ.get('DROGON_ALERTS_PASSWORD')
    if not password:
        raise ValueError("No SMTP password available: pass `password` or set "
                         "the DROGON_ALERTS_PASSWORD environment variable.")

    # Build the message from the caller's header and body.
    msg = MIMEMultipart()
    msg['From'] = my_adress
    msg['To'] = master_address
    msg['Subject'] = header
    msg.attach(MIMEText(message, 'plain'))

    server = smtplib.SMTP('smtp.gmail.com', 587)
    try:
        server.starttls()
        server.login(my_adress, password)
        server.sendmail(my_adress, master_address, msg.as_string())
    finally:
        # Always close the session, even when login or delivery fails.
        server.quit()

    print("successfully sent email to %s:" % (msg['To']))

View file

@ -1,10 +1,15 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import mysql.connector import mysql.connector
anuncios_db_parameters = {'host': '46.183.115.154', anuncios_db_parameters = {'host': '185.166.215.170',
'database': 'anuncios', 'database': 'anuncios',
'user': 'pablo', 'user': 'drogon',
'password': 'noesfacilvivirsinpin'} 'password': 'noesfacilvivirsindrogon'}
# Connection parameters for the `tasks` database; get_tasksdb() passes this
# dict to DatabaseWrapper.
# NOTE(review): host, user and password are hard-coded and committed to
# version control. Consider loading them from the environment or an untracked
# config file, and rotating this password.
tasks_db_parameters = {'host': '185.166.215.170',
'database': 'tasks',
'user': 'drogon',
'password': 'noesfacilvivirsindrogon'}
class DatabaseWrapper(): class DatabaseWrapper():
@ -18,14 +23,11 @@ class DatabaseWrapper():
self.ping() self.ping()
def connect(self): def connect(self):
try: self.connection = mysql.connector.connect(host = self.host,
self.connection = mysql.connector.connect(host = self.host, database = self.database,
database = self.database, user = self.user,
user = self.user, password = self.password,
password = self.password) autocommit = True)
except Exception as e:
print("Could not connect to the database.")
print(e)
def disconnect(self): def disconnect(self):
if self.connection.is_connected(): if self.connection.is_connected():
@ -42,13 +44,19 @@ class DatabaseWrapper():
execution_cursor.execute(query_statement, query_parameters) execution_cursor.execute(query_statement, query_parameters)
self.disconnect() self.disconnect()
return execution_cursor return execution_cursor
else:
raise Exception("Could not connect to the database.")
def query_dict(self, query_statement, query_parameters = None): def query_dict(self, query_statement, query_parameters = None):
return self.query(query_statement, query_parameters, dictionary = True) return self.query(query_statement, query_parameters, dictionary = True)
def get_anunciosdb(): def get_anunciosdb():
return DatabaseWrapper(anuncios_db_parameters) return DatabaseWrapper(anuncios_db_parameters)
def get_tasksdb():
    """Build a DatabaseWrapper configured with ``tasks_db_parameters``."""
    tasks_wrapper = DatabaseWrapper(tasks_db_parameters)
    return tasks_wrapper

View file

@ -33,4 +33,4 @@ class UrlAttack():
def get_text(self): def get_text(self):
if self.success: if self.success:
return self.response.text() return self.response.text

View file

@ -2,36 +2,35 @@
import sys import sys
sys.path.append('..') sys.path.append('..')
import uuid import uuid
import datetime from datetime import datetime
from time import sleep from time import sleep
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from core.mysql_wrapper import get_anunciosdb import re
from core.scrapping_utils import UrlAttack from random import randint
from core.mysql_wrapper import get_anunciosdb, get_tasksdb
from core.scrapping_utils import UrlAttack
import core.alerts
class Explorer(): class Explorer():
sleep_time_no_work = 60 sleep_time_no_work = 60
sleep_time_no_service = 600 sleep_time_no_service = 600
# Daily window in which the explorer is allowed to work; in_working_hours()
# reads the 'start' and 'end' keys, so they must be string literals.
# `datetime` is the class (imported with `from datetime import datetime`),
# so each wall-clock time is built as a full datetime and then truncated
# with .time(); calling `datetime.time(9, 0, 0)` on the class raises
# TypeError.
working_hours = {'start': datetime(1900, 1, 1, 9, 0, 0).time(),
                 'end': datetime(1900, 1, 1, 18, 0, 0).time()}
# Target number of new referencias per 30-day window; used by
# get_max_referencias_for_today().
monthly_capture_target = 1000
def __init__(self): def __init__(self):
try: try:
self.anunciosdb = get_anunciosdb() self.anunciosdb = get_anunciosdb()
except: except:
print("Could not connect to anuncios DB") print("Could not connect to anuncios DB")
try:
self.task_log_db = #get_task_log_db()
except:
print("Could not connect to task log DB")
self.max_db_retries = 3 self.max_db_retries = 3
self.db_retries = 0 self.db_retries = 0
self.max_queue_retries = 3 self.max_queue_retries = 3
self.queue_retries = 0 self.queue_retries = 0
def start(self): def start(self):
#Arrancar el servicio
while True: while True:
if not self.there_is_work(): if not self.there_is_work():
@ -45,16 +44,7 @@ class Explorer():
break break
current_task = ExploringTask(self.compose_listing_url) current_task = ExploringTask(self.compose_listing_url)
if current_task.is_ready_to_explore: current_task.explore()
current_task.explore()
else:
break
if current_task.status == 'referencias ready':
current_referencias = current_task.get_referencias()
for referencia in current_referencias:
self.post_task_to_queue(referencia)
current_task.update_status('Sent to queue')
continue continue
@ -64,12 +54,22 @@ class Explorer():
def stop(self): def stop(self):
#TODO #TODO
#Detener el servicio #Detener el servicio
pass
def there_is_work(self): def there_is_work(self):
#TODO """
#Comprueba si hay trabajo por hacer Funcion que agrupa las condiciones que se deben cumplir para poder trabajar
#Mirando en la bd de tasks cuantas se han hecho ultimamente, mensualmente """
#etc. if not self.in_working_hours():
return False
if self.get_referencias_acquired_today() >= self.get_max_referencias_for_today():
return False
if self.get_tasks_created_today() >= self.get_max_tasks_today():
return False
return True
def database_is_up(self): def database_is_up(self):
while self.db_retries <= self.max_db_retries: while self.db_retries <= self.max_db_retries:
@ -82,8 +82,7 @@ class Explorer():
self.db_retries = self.db_retries + 1 self.db_retries = self.db_retries + 1
return False return False
def queue_is_up(self): def queue_is_up(self):
#TODO #TODO
while self.queue_retries <= self.max_queue_retries: while self.queue_retries <= self.max_queue_retries:
@ -95,77 +94,140 @@ class Explorer():
sleep(sleep_time_no_service) sleep(sleep_time_no_service)
self.queue_retries = self.queue_retries + 1 self.queue_retries = self.queue_retries + 1
return False return False
def in_working_hours(self):
    """
    Tell whether the current local time falls inside ``self.working_hours``.

    :return: True when now is between the 'start' and 'end' times
        (both inclusive), False otherwise.
    """
    # working_hours is a class attribute, so it has to be reached through
    # self; a bare `working_hours` is not in scope inside a method.
    current_time = datetime.now().time()
    return self.working_hours['start'] <= current_time <= self.working_hours['end']
def get_referencias_acquired_today(self):
    """
    Count the new referencias that appeared in the last 24 hours.

    :return: the count as a single number, so it can be compared directly
        with the daily maximum in there_is_work().
    """
    # NOTE(review): the column is spelled `referencias` here but `referencia`
    # in ExploringTask._is_new_listing -- confirm against the table schema.
    query_statement = """ SELECT count(referencias)
                          FROM primera_captura_full
                          WHERE fecha_captura >= now() - INTERVAL 1 DAY;
                      """
    cursor_result = self.anunciosdb.query(query_statement)
    # fetchone() returns the row as a one-element sequence; unwrap the count.
    return cursor_result.fetchone()[0]
def get_max_referencias_for_today(self):
    """
    Compute the capture target for the last 24 hours from the gap between
    the captures of the last 30 days and the monthly target.

    The base daily quota is ``monthly_capture_target / 30``; it is scaled up
    when the last 30 days are below the monthly target and scaled down when
    they are above it.

    :return: the daily target as a float.
    """
    # NOTE(review): the column is spelled `referencias` here but `referencia`
    # in ExploringTask._is_new_listing -- confirm against the table schema.
    query_statement = """ SELECT count(referencias)
                          FROM primera_captura_full
                          WHERE fecha_captura >= now() - INTERVAL 30 DAY;
                      """
    cursor_result = self.anunciosdb.query(query_statement)
    # fetchone() returns the row as a one-element sequence; unwrap the count.
    new_referencias_last_30 = cursor_result.fetchone()[0]

    # monthly_capture_target is a class attribute and must be read via self.
    target = self.monthly_capture_target
    # float() keeps the ratios fractional regardless of integer-division rules.
    deviation = (target - new_referencias_last_30) / float(target)
    max_referencias = (target / 30.0) * (1 + deviation)
    return max_referencias
def get_tasks_created_today(self):
    """
    Count in the task log how many tasks reached the 'Attacked' status in
    the last 24 hours.

    :return: the count as a single number, so it can be compared directly
        with the daily maximum in there_is_work().
    """
    # Explorer.__init__ only opens the anuncios DB, so the tasks DB wrapper
    # is created on first use and cached on the instance.
    tasksdb = getattr(self, 'tasksdb', None)
    if tasksdb is None:
        tasksdb = self.tasksdb = get_tasksdb()

    # NOTE(review): this reads `exploring_tasks`, while
    # ExploringTask._log_in_tasksdb writes to `exploring_tasks_logs` --
    # confirm which table (or view) is intended.
    query_statement = """ SELECT count(uuid)
                          FROM exploring_tasks
                          WHERE status = 'Attacked'
                          AND write_time >= now() - INTERVAL 1 DAY;
                      """
    cursor_result = tasksdb.query(query_statement)
    # fetchone() returns the row as a one-element sequence; unwrap the count.
    tasks_created_today = cursor_result.fetchone()[0]
    return tasks_created_today
def get_max_tasks_today(self):
    """
    Compute the daily cap of attempts (tasks): today's referencias target
    divided by 30 and multiplied by 6.
    """
    # NOTE(review): the 30 presumably stands for listings per page and the 6
    # for a safety multiplier -- confirm with the author.
    referencias_target = self.get_max_referencias_for_today()
    return (referencias_target / 30) * 6
def compose_listing_url(self): def compose_listing_url(self):
#TODO """
#Decide que url hay que componer y la compone Genera URLs de manera aleatoria
:return:
"""
raiz = 'https://www.idealista.com/' raiz = 'https://www.idealista.com/'
tipo = #Logica random tipo = randint(1,2)
ciudad = 'barcelona' ciudad = 'barcelona'
numero = #logica random numero = randint(1,30)
url = raiz + tipo + '-garajes/' + ciudad + '-' + ciudad + '/' + url = raiz + tipo + '-garajes/' + ciudad + '-' + ciudad + '/' + \
'pagina-' + numero + '.htm' 'pagina-' + numero + '.htm'
return url return url
def post_task_to_queue(self, referencia):
#TODO
#Manda la task a la cola redis
class ExploringTask(): class ExploringTask():
def __init__(self, url): def __init__(self, url):
self.anunciosdb = get_anunciosdb()
self.tasksdb = get_tasksdb()
self.target_url = url self.target_url = url
self.id = str(uuid.uuid4()) self.id = str(uuid.uuid4())
self.update_status('Pending') self._update_status('Pending')
try: def _update_status(self, new_status):
self.anunciosdb = get_anunciosdb()
except:
self.anunciosdb = None
self.update_status('Unable to connect to anuncios DB')
try:
#TODO
#Pendiente de implementar wraper para MongoDB
#self.task_log_db =
except:
self.update_status('Unable to connect to task log DB')
#self.task_log_db = None
def update_status(self, new_status):
self.status = new_status self.status = new_status
self._log_in_taskdb() self._log_in_tasksdb()
def is_ready_to_explore(self):
if self.anunciosdb is not None and self.task_log_db is not None:
return True
else:
return False
def explore(self): def explore(self):
attack = UrlAttack(self.url) attack = UrlAttack(self.target_url)
attack.attack() attack.attack()
self._update_status('Attacked')
if attack.success: if attack.success:
self._validate_referencias(attack.get_text())
self._extract_referencias(attack.get_text()) self._extract_referencias(attack.get_text())
if self.new_listings: if self.referencias:
self.update_status('referencias ready') self._update_status('Referencias ready')
self._post_tasks_to_queue()
self._update_status('Sent to Queue')
elif self.there_are_referencias:
self._update_status('Failure - No new referencias in HTML')
else: else:
self.update_status('Failure - No listings in HTML') self._update_status('Failure - HTML with no referencias')
else: else:
self.update_status('Failure - Bad request') self._update_status('Failure - Bad request')
def get_referencias(self):
return self.referencias
def _log_in_taskdb(self): def _log_in_tasksdb(self):
#TODO """
#Funcion que grabe estado y demas en una mongodb o argo azin Graba en la base de datos de tareas un registro con el UUID de la tarea,
un timestamp y el status
"""
query_statement = """INSERT INTO exploring_tasks_logs
(uuid, write_time, status)
VALUES (%(uuid)s, NOW(), %(status)s)"""
query_parameters = {'uuid': self.id,
'status': self.status}
self.tasksdb.query(query_statement, query_parameters)
def _validate_referencias(self, html):
    """
    Check that the listing tags still follow the expected ad format.

    Every element with class "item" must carry a ``data-adid`` attribute
    made of 3 to 20 digits. A ``UserWarning`` is emitted for each element
    that does not, since that suggests the page markup has changed.

    :param html: HTML text of a listing page.
    """
    import warnings

    soup = BeautifulSoup(html, 'html5lib')
    ads = soup.find_all(class_="item")
    # Compiled once, outside the loop.
    pattern = re.compile(r"^[0-9]{3,20}$")

    for ad in ads:
        # .get() avoids a KeyError on items that lack the attribute; the
        # empty default fails the pattern, so such items are reported too.
        adid = ad.get("data-adid", "")
        if not pattern.match(adid):
            # TODO: escalate through core.alerts once alert_master is wired in.
            warnings.warn("Unexpected data-adid %r found in %s"
                          % (adid, self.target_url))
def _extract_referencias(self, html): def _extract_referencias(self, html):
""" """
@ -173,17 +235,19 @@ class ExploringTask():
de capturas, y guarda si han aparecido listings y si hay alguno nuevo de capturas, y guarda si han aparecido listings y si hay alguno nuevo
""" """
soup = BeautifulSoup(self.html, 'html5lib') soup = BeautifulSoup(html, 'html5lib')
ads = sopa.find_all(class_ = "item") ads = soup.find_all(class_ = "item")
self.there_are_referencias = bool(ads)
self.referencias = [] self.referencias = []
for ad in ads: for ad in ads:
if self._is_new_listing(ad["data-adid"]): if self._is_new_listing(ad["data-adid"]):
self.referencias.append(ad["data-adid"]) self.referencias.append(ad["data-adid"])
self.new_listings = bool(self.referencias)
def _is_new_listing(self, referencia): def _is_new_listing(self, referencia):
#TODO """
#Comprobar contra base de datos si la referencia existe en base de datos Comprueba si el listing ya existe en la base de datos de anuncios
"""
query_statement = """SELECT count(referencia) query_statement = """SELECT count(referencia)
FROM capturas FROM capturas
WHERE referencia = %s""" WHERE referencia = %s"""
@ -195,4 +259,20 @@ class ExploringTask():
return False return False
else: else:
return True return True
def _post_tasks_to_queue(self):
    """Placeholder for queue publishing; currently does nothing."""
    # TODO: send the referencias to redis
    pass
def testear_exploring_task():
    """
    Manual smoke test: run one ExploringTask against a fixed idealista
    listing URL and print the referencias it collected.
    """
    url = 'https://www.idealista.com/venta-garajes/barcelona-barcelona/'
    task = ExploringTask(url)
    task.explore()
    # `referencias` is only set when the request succeeded; fall back to an
    # empty list so a failed attack does not end in an AttributeError.
    print(getattr(task, 'referencias', []))


# Run the smoke test only when the file is executed as a script, so that
# importing this module does not trigger a live request and database writes.
if __name__ == '__main__':
    testear_exploring_task()