drogon/explorer/explorer.py

186 lines
5 KiB
Python
Raw Normal View History

# -*- coding: utf-8 -*-
import sys
sys.path.append('..')
import uuid
import datetime
from time import sleep
from bs4 import BeautifulSoup
from core.mysql_wrapper import get_anunciosdb
from core.scrapping_utils import UrlAttack
class Explorer:
    """Service loop that explores listing pages and queues the ads found.

    Each iteration of ``start`` checks that there is work to do and that the
    database and the queue are reachable, builds an ``ExploringTask`` for a
    listing URL, runs it and, when the task succeeds, posts every reference
    it collected to the queue.

    Several pieces are still pending implementation: ``stop``,
    ``there_is_work``, ``post_task_to_queue`` and the actual queue probe in
    ``queue_is_up``.
    """

    # Seconds to wait before polling again when there is nothing to do.
    sleep_time_no_work = 60
    # Seconds to wait between retries while the DB or the queue is down.
    sleep_time_no_service = 600
    # NOTE(review): the original left the listing type and page number as
    # "random logic" placeholders; these candidate values are assumptions
    # and must be confirmed against the target site.
    listing_types = ('venta', 'alquiler')
    max_listing_page = 30

    def __init__(self):
        try:
            self.anunciosdb = get_anunciosdb()
        except Exception:
            # Keep the attribute defined so later checks fail cleanly
            # instead of raising AttributeError.
            self.anunciosdb = None
            print("Could not connect to DB")
        self.max_db_retries = 3
        self.db_retries = 0
        self.max_queue_retries = 3
        self.queue_retries = 0

    def start(self):
        """Run the service until the database or the queue stays down."""
        while True:
            if not self.there_is_work():
                sleep(self.sleep_time_no_work)
                continue
            if not self.database_is_up():
                break
            if not self.queue_is_up():
                break

            current_task = ExploringTask(self.compose_listing_url())
            if not current_task.is_ready_to_explore():
                # NOTE(review): the original left this branch undecided.
                # Backing off and retrying with a fresh task avoids both a
                # crash and a tight loop; confirm the intended policy.
                sleep(self.sleep_time_no_work)
                continue
            current_task.explore()

            if current_task.status == 'Success':
                # NOTE(review): the original loop was truncated after
                # "for referencia in current"; posting each reference to the
                # queue is the presumed intent.
                for referencia in current_task.get_referencias():
                    self.post_task_to_queue(referencia)
        self.stop()

    def stop(self):
        """Stop the service."""
        # TODO: shutdown logic was never written.
        raise NotImplementedError("Explorer.stop is not implemented yet")

    def there_is_work(self):
        """Return whether there is work to do."""
        # TODO: the work-detection logic was never written.
        raise NotImplementedError(
            "Explorer.there_is_work is not implemented yet")

    def database_is_up(self):
        """Ping the database, retrying with a pause between attempts.

        Returns:
            True as soon as a ping succeeds (the retry counter is reset),
            False once ``max_db_retries`` has been exceeded.
        """
        while self.db_retries <= self.max_db_retries:
            try:
                self.anunciosdb.ping()
            except Exception:
                # Also covers ``self.anunciosdb`` being None because the
                # initial connection failed.
                sleep(self.sleep_time_no_service)
                self.db_retries += 1
            else:
                self.db_retries = 0
                return True
        return False

    def queue_is_up(self):
        """Check that the queue is reachable, retrying like the DB check.

        The probe itself is not implemented yet, so this currently always
        reports the queue as up.
        """
        while self.queue_retries <= self.max_queue_retries:
            try:
                # TODO: code that tests whether redis is alive goes here.
                pass
            except Exception:
                sleep(self.sleep_time_no_service)
                self.queue_retries += 1
            else:
                self.queue_retries = 0
                return True
        return False

    def compose_listing_url(self, tipo=None, numero=None):
        """Build the URL of a listing page to explore.

        Args:
            tipo: listing type used as the prefix of the ``-garajes`` path
                segment. Picked at random from ``listing_types`` when None.
            numero: page number. Picked at random between 1 and
                ``max_listing_page`` when None.

        Returns:
            The listing page URL as a string.
        """
        import random  # local import: only this method needs it

        raiz = 'https://www.idealista.com/'
        ciudad = 'barcelona'
        if tipo is None:
            tipo = random.choice(self.listing_types)
        if numero is None:
            numero = random.randint(1, self.max_listing_page)
        # format() also takes care of converting the page number to text.
        return '{0}{1}-garajes/{2}-{2}/pagina-{3}.htm'.format(
            raiz, tipo, ciudad, numero)

    def post_task_to_queue(self, referencia):
        """Post a task for the given ad reference to the queue."""
        # TODO: the queue client was never written.
        raise NotImplementedError(
            "Explorer.post_task_to_queue is not implemented yet")
class ExploringTask:
    """A single attempt to fetch one listing page and collect ad references.

    ``status`` starts as ``'Pending'`` and is updated by ``explore`` to
    ``'Success'`` or to one of the ``'Failure - ...'`` values.
    """

    def __init__(self, url):
        """Create a pending task for ``url`` and try to connect to the DB."""
        self.creation_date = datetime.datetime.now()
        self.target_url = url
        self.id = str(uuid.uuid4())
        self.status = 'Pending'
        # Defined up front so get_referencias()/explore() never hit a
        # missing attribute before extract_referencias() has run.
        self.referencias = []
        self.listings_ok = False
        try:
            self.anunciosdb = get_anunciosdb()
        except Exception:
            self.anunciosdb = None
            print("Could not connect to DB")
        # TODO: the MongoDB wrapper is pending implementation. Until it
        # exists there is no task-log database, so the task never reports
        # itself as ready to explore.
        self.task_log_db = None
        self.log_in_taskdb()

    def is_ready_to_explore(self):
        """Return True when both database handles are available."""
        return self.anunciosdb is not None and self.task_log_db is not None

    def explore(self):
        """Fetch the target URL and update ``status`` with the outcome."""
        self.attack_date = datetime.datetime.now()
        attack = UrlAttack(self.target_url)
        attack.attack()
        if attack.success:
            self.extract_referencias(attack.get_text())
            if self.listings_ok:
                self.status = 'Success'
            else:
                self.status = 'Failure - No listings in HTML'
        else:
            self.status = 'Failure - Bad request'
        self.log_in_taskdb()

    def get_referencias(self):
        """Return the ad references collected by the last extraction."""
        return self.referencias

    def log_in_taskdb(self):
        """Record the task state in the task-log database.

        Does nothing while no task-log database is configured.
        """
        if self.task_log_db is None:
            return
        # TODO: write the status and related fields to MongoDB or similar.
        raise NotImplementedError(
            "ExploringTask.log_in_taskdb is not implemented yet")

    def extract_referencias(self, html):
        """Collect the references of new ads from a listing page.

        Sets ``self.referencias`` to the ad ids that ``is_new_listing``
        accepts, and ``self.listings_ok`` to whether the page contained any
        ad id at all.

        Args:
            html: markup of the listing page.
        """
        soup = BeautifulSoup(html, 'html5lib')
        ads = soup.find_all(class_="item")
        # Skip "item" elements that carry no ad id instead of raising.
        found = [ad.get("data-adid") for ad in ads]
        found = [referencia for referencia in found if referencia is not None]
        # NOTE(review): the original comment says that catching no
        # references counts as a failure; here that is read as "the page had
        # no listings", matching the 'No listings in HTML' status. Confirm
        # whether already-known listings should count as a failure too.
        self.listings_ok = bool(found)
        self.referencias = [
            referencia for referencia in found
            if self.is_new_listing(referencia)
        ]

    def is_new_listing(self, referencia):
        """Return whether ``referencia`` is not yet stored in the database."""
        # TODO: check the reference against the database.
        raise NotImplementedError(
            "ExploringTask.is_new_listing is not implemented yet")