# -*- coding: utf-8 -*-

import sys

sys.path.append('..')

import uuid
import datetime

from time import sleep

from bs4 import BeautifulSoup

from core.mysql_wrapper import get_anunciosdb
from core.scrapping_utils import UrlAttack


class Explorer():

    sleep_time_no_work = 60
    sleep_time_no_service = 600

    def __init__(self):
        try:
            self.anunciosdb = get_anunciosdb()
        except:
            print("Could not connect to DB")

        self.max_db_retries = 3
        self.db_retries = 0
        self.max_queue_retries = 3
        self.queue_retries = 0

    def start(self):
        # Start the service
        while True:

            if not self.there_is_work():
                sleep(self.sleep_time_no_work)
                continue

            if not self.database_is_up():
                break

            if not self.queue_is_up():
                break

            current_task = ExploringTask(self.compose_listing_url())

            if current_task.is_ready_to_explore():
                current_task.explore()
            else:
                # TODO: what should happen here?
                pass

            if current_task.status == 'Success':
                current_referencias = current_task.get_referencias()

                for referencia in current_referencias:
                    self.post_task_to_queue(referencia)

        self.stop()

    def stop(self):
        # TODO
        # Stop the service
        pass

    def there_is_work(self):
        # TODO
        # Check whether there is work to do
        pass
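
    # One possible reading of there_is_work(), left here as a commented sketch:
    # only generate new exploring tasks while the downstream Redis queue has a
    # manageable backlog. The queue name 'exploring_tasks', the threshold and
    # the get_queue() helper are hypothetical, not part of this codebase.
    #
    #     def there_is_work(self):
    #         queue = get_queue()
    #         return queue.llen('exploring_tasks') < 100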

    def database_is_up(self):
        while self.db_retries <= self.max_db_retries:
            try:
                self.anunciosdb.ping()
                self.db_retries = 0
                return True
            except:
                sleep(self.sleep_time_no_service)
                self.db_retries = self.db_retries + 1

        return False

    def queue_is_up(self):
        # TODO
        while self.queue_retries <= self.max_queue_retries:
            try:
                # Code that checks whether Redis is alive goes here
                self.queue_retries = 0
                return True
            except:
                sleep(self.sleep_time_no_service)
                self.queue_retries = self.queue_retries + 1

        return False
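
    # A minimal sketch of the liveness check above, assuming the queue is a
    # Redis instance reached through redis-py (connection parameters are
    # illustrative, not taken from this codebase):
    #
    #     import redis
    #     client = redis.Redis(host='localhost', port=6379)
    #     client.ping()  # raises redis.exceptions.ConnectionError when down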

    def compose_listing_url(self):
        # Decide which URL needs to be composed and compose it
        raiz = 'https://www.idealista.com/'
        tipo = ...  # TODO: random logic
        ciudad = 'barcelona'
        numero = ...  # TODO: random logic
        url = (raiz + tipo + '-garajes/' + ciudad + '-' + ciudad + '/' +
               'pagina-' + numero + '.htm')

        return url
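
    # A possible fill-in for the two "random logic" gaps above, assuming the
    # intent is to pick the listing type and the page number at random (the
    # candidate values and the page range are guesses, not confirmed anywhere
    # in this codebase):
    #
    #     import random
    #     tipo = random.choice(['alquiler', 'venta'])
    #     numero = str(random.randint(1, 60))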

    def post_task_to_queue(self, referencia):
        # TODO
        pass
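
    # A commented sketch of post_task_to_queue(), assuming the queue is the
    # same Redis instance checked in queue_is_up(); the list name and the
    # get_queue() helper are hypothetical:
    #
    #     def post_task_to_queue(self, referencia):
    #         get_queue().rpush('scraping_tasks', referencia)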


class ExploringTask():

    def __init__(self, url):
        self.creation_date = datetime.datetime.now()
        self.target_url = url
        self.id = str(uuid.uuid4())
        self.status = 'Pending'

        try:
            self.anunciosdb = get_anunciosdb()
        except:
            self.anunciosdb = None
            print("Could not connect to DB")

        try:
            # TODO
            # Wrapper for MongoDB still pending implementation
            self.task_log_db = None
        except:
            #self.task_log_db = None
            print("Could not connect to MongoDB")

        self.log_in_taskdb()

    def is_ready_to_explore(self):
        if self.anunciosdb is not None and self.task_log_db is not None:
            return True
        else:
            return False

    def explore(self):
        self.attack_date = datetime.datetime.now()
        attack = UrlAttack(self.target_url)
        attack.attack()

        if attack.success:
            self.extract_referencias(attack.get_text())

            if self.listings_ok:
                self.status = 'Success'
            else:
                self.status = 'Failure - No listings in HTML'
        else:
            self.status = 'Failure - Bad request'

        self.log_in_taskdb()

    def get_referencias(self):
        return self.referencias

    def log_in_taskdb(self):
        # TODO
        # Record the task status and other details in a MongoDB or something similar
        pass
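
    # A commented sketch of what log_in_taskdb() could record once the MongoDB
    # wrapper exists, using pymongo directly (database and collection names are
    # illustrative):
    #
    #     from pymongo import MongoClient
    #     tasks = MongoClient()['scraper']['exploring_tasks']
    #     tasks.insert_one({'task_id': self.id,
    #                       'url': self.target_url,
    #                       'status': self.status,
    #                       'created': self.creation_date})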

    def extract_referencias(self, html):
        # TODO
        # Take the HTML of a listings page and pull out the references.
        # If none are found, the task is considered a failure.
        # Sets the listings_ok property.
        soup = BeautifulSoup(html, 'html5lib')
        ads = soup.find_all(class_="item")
        self.referencias = []

        for ad in ads:
            if self.is_new_listing(ad["data-adid"]):
                self.referencias.append(ad["data-adid"])

        self.listings_ok = len(self.referencias) > 0
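
    # For reference, the parsing above expects listing markup roughly of this
    # shape (illustrative snippet, not captured from idealista):
    #
    #     <article class="item" data-adid="12345678"> ... </article>
    #
    # find_all(class_="item") returns those tags and ad["data-adid"] reads the
    # reference out of each one.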

    def is_new_listing(self, referencia):
        # TODO
        # Check against the database whether the reference already exists
        pass
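

# How the service might be started if this module is run directly (a usage
# sketch; the original file defines no entry point, so this is an assumption):
if __name__ == '__main__':
    explorer = Explorer()
    explorer.start()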