Formatting
This commit is contained in:
parent
9c2565f5d8
commit
acfeeef0d1
1 changed files with 93 additions and 65 deletions
|
|
@ -1,6 +1,7 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
import sys
|
import sys
|
||||||
sys.path.append('..')
|
|
||||||
|
sys.path.append("..")
|
||||||
import uuid
|
import uuid
|
||||||
import datetime
|
import datetime
|
||||||
from time import sleep
|
from time import sleep
|
||||||
|
|
@ -12,13 +13,15 @@ from core.config import monthly_new_ads_target, working_hours
|
||||||
from core.scrapping_utils import UrlAttack
|
from core.scrapping_utils import UrlAttack
|
||||||
from core.alerts import alert_master
|
from core.alerts import alert_master
|
||||||
from db_layer.capturing_tasks_interface import capturing_interface
|
from db_layer.capturing_tasks_interface import capturing_interface
|
||||||
|
from core import my_logger
|
||||||
|
import logging
|
||||||
|
|
||||||
class Explorer():
|
|
||||||
|
class Explorer:
|
||||||
|
|
||||||
sleep_time_no_work = 60
|
sleep_time_no_work = 60
|
||||||
sleep_time_no_service = 600
|
sleep_time_no_service = 600
|
||||||
ad_types = {'1': 'alquiler',
|
ad_types = {"1": "alquiler", "2": "venta"}
|
||||||
'2': 'venta'}
|
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
try:
|
try:
|
||||||
|
|
@ -36,30 +39,34 @@ class Explorer():
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
if not self.there_is_work():
|
if not self.there_is_work():
|
||||||
print('{}: Waiting. No work'.format(datetime.datetime.now()))
|
print("{}: Waiting. No work".format(datetime.datetime.now()))
|
||||||
sleep(Explorer.sleep_time_no_work)
|
sleep(Explorer.sleep_time_no_work)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if not self.database_is_up():
|
if not self.database_is_up():
|
||||||
alert_master("SQL DOWN", "El explorer informa de que SQL esta caida. Actividad detenida")
|
alert_master(
|
||||||
|
"SQL DOWN",
|
||||||
|
"El explorer informa de que SQL esta caida. Actividad detenida",
|
||||||
|
)
|
||||||
self.stop()
|
self.stop()
|
||||||
|
|
||||||
current_task = ExploringTask(self.compose_listing_url())
|
current_task = ExploringTask(self.compose_listing_url())
|
||||||
current_task.explore()
|
current_task.explore()
|
||||||
print('{}: Exploring done'.format(datetime.datetime.now()))
|
print("{}: Exploring done".format(datetime.datetime.now()))
|
||||||
|
|
||||||
if current_task.status == 'Referencias ready':
|
if current_task.status == "Referencias ready":
|
||||||
referencias = current_task.get_referencias()
|
referencias = current_task.get_referencias()
|
||||||
for referencia in referencias:
|
for referencia in referencias:
|
||||||
capturing_interface.create_capturing_task(referencia, current_task.id)
|
capturing_interface.create_capturing_task(
|
||||||
|
referencia, current_task.id
|
||||||
|
)
|
||||||
current_task._update_status("Sent to queue")
|
current_task._update_status("Sent to queue")
|
||||||
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
|
||||||
def stop(self):
|
def stop(self):
|
||||||
#TODO Detener el servicio
|
# TODO Detener el servicio
|
||||||
#Detener el servicio
|
# Detener el servicio
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def there_is_work(self):
|
def there_is_work(self):
|
||||||
|
|
@ -72,7 +79,10 @@ class Explorer():
|
||||||
if not self.in_working_hours():
|
if not self.in_working_hours():
|
||||||
return False
|
return False
|
||||||
|
|
||||||
if self.get_referencias_acquired_today() >= self.get_max_referencias_for_today():
|
if (
|
||||||
|
self.get_referencias_acquired_today()
|
||||||
|
>= self.get_max_referencias_for_today()
|
||||||
|
):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
if self.get_tasks_created_today() >= self.get_max_tasks_today():
|
if self.get_tasks_created_today() >= self.get_max_tasks_today():
|
||||||
|
|
@ -93,7 +103,11 @@ class Explorer():
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def in_working_hours(self):
|
def in_working_hours(self):
|
||||||
return working_hours['start'] <= datetime.datetime.now().time() <= working_hours['end']
|
return (
|
||||||
|
working_hours["start"]
|
||||||
|
<= datetime.datetime.now().time()
|
||||||
|
<= working_hours["end"]
|
||||||
|
)
|
||||||
|
|
||||||
def get_referencias_acquired_today(self):
|
def get_referencias_acquired_today(self):
|
||||||
"""
|
"""
|
||||||
|
|
@ -121,7 +135,9 @@ class Explorer():
|
||||||
cursor_result = self.anunciosdb.query(query_statement)
|
cursor_result = self.anunciosdb.query(query_statement)
|
||||||
new_referencias_last_30 = cursor_result.fetchone()[0]
|
new_referencias_last_30 = cursor_result.fetchone()[0]
|
||||||
|
|
||||||
deviation = (monthly_new_ads_target - new_referencias_last_30) / monthly_new_ads_target
|
deviation = (
|
||||||
|
monthly_new_ads_target - new_referencias_last_30
|
||||||
|
) / monthly_new_ads_target
|
||||||
max_referencias = (monthly_new_ads_target / 30) * (1 + deviation)
|
max_referencias = (monthly_new_ads_target / 30) * (1 + deviation)
|
||||||
|
|
||||||
return max_referencias
|
return max_referencias
|
||||||
|
|
@ -165,24 +181,33 @@ class Explorer():
|
||||||
Genera URLs de manera aleatoria
|
Genera URLs de manera aleatoria
|
||||||
:return:
|
:return:
|
||||||
"""
|
"""
|
||||||
root = 'https://www.idealista.com/'
|
root = "https://www.idealista.com/"
|
||||||
type = Explorer.ad_types[str(randint(1,2))]
|
type = Explorer.ad_types[str(randint(1, 2))]
|
||||||
city = 'barcelona'
|
city = "barcelona"
|
||||||
page_number = str(randint(1,30))
|
page_number = str(randint(1, 30))
|
||||||
url = root + type + '-garajes/' + city + '-' + city + '/' + \
|
url = (
|
||||||
'pagina-' + page_number + '.htm'
|
root
|
||||||
|
+ type
|
||||||
|
+ "-garajes/"
|
||||||
|
+ city
|
||||||
|
+ "-"
|
||||||
|
+ city
|
||||||
|
+ "/"
|
||||||
|
+ "pagina-"
|
||||||
|
+ page_number
|
||||||
|
+ ".htm"
|
||||||
|
)
|
||||||
|
|
||||||
return url
|
return url
|
||||||
|
|
||||||
|
|
||||||
class ExploringTask:
|
class ExploringTask:
|
||||||
|
|
||||||
def __init__(self, url):
|
def __init__(self, url):
|
||||||
self.anunciosdb = get_anunciosdb()
|
self.anunciosdb = get_anunciosdb()
|
||||||
self.tasksdb = get_tasksdb()
|
self.tasksdb = get_tasksdb()
|
||||||
self.target_url = url
|
self.target_url = url
|
||||||
self.id = str(uuid.uuid4())
|
self.id = str(uuid.uuid4())
|
||||||
self._update_status('Pending')
|
self._update_status("Pending")
|
||||||
|
|
||||||
def _update_status(self, new_status):
|
def _update_status(self, new_status):
|
||||||
self.status = new_status
|
self.status = new_status
|
||||||
|
|
@ -191,19 +216,19 @@ class ExploringTask:
|
||||||
def explore(self):
|
def explore(self):
|
||||||
attack = UrlAttack(self.target_url)
|
attack = UrlAttack(self.target_url)
|
||||||
attack.attack()
|
attack.attack()
|
||||||
self._update_status('Attacked')
|
self._update_status("Attacked")
|
||||||
|
|
||||||
if attack.success:
|
if attack.success:
|
||||||
self._validate_referencias(attack.get_text())
|
self._validate_referencias(attack.get_text())
|
||||||
self._extract_referencias(attack.get_text())
|
self._extract_referencias(attack.get_text())
|
||||||
if self.referencias:
|
if self.referencias:
|
||||||
self._update_status('Referencias ready')
|
self._update_status("Referencias ready")
|
||||||
elif self.there_are_referencias:
|
elif self.there_are_referencias:
|
||||||
self._update_status('Failure - No new referencias in HTML')
|
self._update_status("Failure - No new referencias in HTML")
|
||||||
else:
|
else:
|
||||||
self._update_status('Failure - HTML with no referencias')
|
self._update_status("Failure - HTML with no referencias")
|
||||||
else:
|
else:
|
||||||
self._update_status('Failure - Bad request')
|
self._update_status("Failure - Bad request")
|
||||||
|
|
||||||
def _log_in_tasksdb(self):
|
def _log_in_tasksdb(self):
|
||||||
"""
|
"""
|
||||||
|
|
@ -215,8 +240,7 @@ class ExploringTask:
|
||||||
(uuid, write_time, status)
|
(uuid, write_time, status)
|
||||||
VALUES (%(uuid)s, NOW(), %(status)s)"""
|
VALUES (%(uuid)s, NOW(), %(status)s)"""
|
||||||
|
|
||||||
query_parameters = {'uuid': self.id,
|
query_parameters = {"uuid": self.id, "status": self.status}
|
||||||
'status': self.status}
|
|
||||||
|
|
||||||
self.tasksdb.query(query_statement, query_parameters)
|
self.tasksdb.query(query_statement, query_parameters)
|
||||||
|
|
||||||
|
|
@ -225,16 +249,20 @@ class ExploringTask:
|
||||||
Comprueba que las etiquetas sigan el formato de un anuncio.
|
Comprueba que las etiquetas sigan el formato de un anuncio.
|
||||||
Lanza una advertencia si no es así.
|
Lanza una advertencia si no es así.
|
||||||
"""
|
"""
|
||||||
soup = BeautifulSoup(html, 'html5lib')
|
soup = BeautifulSoup(html, "html5lib")
|
||||||
ads = soup.find_all(class_="item")
|
ads = soup.find_all(class_="item")
|
||||||
pattern = "^[0-9]{3,20}$"
|
pattern = "^[0-9]{3,20}$"
|
||||||
|
|
||||||
for ad in ads:
|
for ad in ads:
|
||||||
if not re.match(pattern, ad["data-adid"]):
|
if not re.match(pattern, ad["data-adid"]):
|
||||||
alert_master("Alerta - Referencias no válidas",
|
alert_master(
|
||||||
"""Una tarea de exploración ha considerado inválida
|
"Alerta - Referencias no válidas",
|
||||||
|
"""Una tarea de exploración ha considerado inválida
|
||||||
una referencia. El texto de la referencia era : {}
|
una referencia. El texto de la referencia era : {}
|
||||||
""".format(ad["data-adid"]))
|
""".format(
|
||||||
|
ad["data-adid"]
|
||||||
|
),
|
||||||
|
)
|
||||||
break
|
break
|
||||||
|
|
||||||
def _extract_referencias(self, html):
|
def _extract_referencias(self, html):
|
||||||
|
|
@ -243,8 +271,8 @@ class ExploringTask:
|
||||||
de capturas, y guarda si han aparecido listings y si hay alguno nuevo
|
de capturas, y guarda si han aparecido listings y si hay alguno nuevo
|
||||||
"""
|
"""
|
||||||
|
|
||||||
soup = BeautifulSoup(html, 'html5lib')
|
soup = BeautifulSoup(html, "html5lib")
|
||||||
ads = soup.find_all(class_ = "item")
|
ads = soup.find_all(class_="item")
|
||||||
self.there_are_referencias = bool(ads)
|
self.there_are_referencias = bool(ads)
|
||||||
self.referencias = []
|
self.referencias = []
|
||||||
for ad in ads:
|
for ad in ads:
|
||||||
|
|
@ -277,6 +305,6 @@ class ExploringTask:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == "__main__":
|
||||||
explorer = Explorer()
|
explorer = Explorer()
|
||||||
explorer.start()
|
explorer.start()
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue