Avances en las clases ExploringTask y Explorer.

2018-08-13 23:55:17 +02:00 · 2018-08-13 23:55:17 +02:00 · 7e8daab6ce
commit 7e8daab6ce
parent c0b398b9c6
1 changed files with 64 additions and 13 deletions
--- a/explorer/explorer.py
+++ b/explorer/explorer.py
@ -4,7 +4,7 @@ sys.path.append('..')
 import uuid
 import datetime
 from time import sleep
-
+from bs4 import BeautifulSoup
 from core.mysql_wrapper import get_anunciosdb
 from core.scrapping_utils import UrlAttack 
@ -44,15 +44,27 @@ class Explorer():
            current_task = ExploringTask(self.compose_listing_url)
            if current_task.is_ready_to_explore:
                current_task.explore()
            else:
                #Qué?
            if current_task.status == 'Success':
                current_referencias = current_task.get_referencias()
            for referencia in current
        self.stop()        
    def stop(self):
        #TODO
        #Detener el servicio
    def there_is_work(self):
        #TODO
        #Comprueba si hay trabajo por hacer
    def database_is_up(self):
@ -69,7 +81,7 @@ class Explorer():
    def queue_is_up(self):
-        #WIP
+        #TODO
        while self.queue_retries <= self.max_queue_retries:
            try:
                #codigo que testea si redis esta vivo
@ -92,6 +104,9 @@ class Explorer():
        return url
    def post_task_to_queue(self, referencia):
 class ExploringTask():
@ -101,34 +116,70 @@ class ExploringTask():
        self.id = str(uuid.uuid4())
        self.status = 'Pending'
        try:
            self.anunciosdb = get_anunciosdb()
        except:
            self.anunciosdb = None
            print("Could not connect to DB")
        try:
            #TODO
            #Pendiente de implementar wraper para MongoDB
            #self.task_log_db = None
        except:
            #self.task_log_db = None
            print("Could not connect to MongoDB")        
        self.log()
    def is_ready_to_explore(self):
        if self.anunciosdb is not None and self.task_log_db is not None:
            return True
        else:
            return False
    def explore(self):
        self.attack_date = Datetime.Datetime.now()
        attack = UrlAttack(self.url)
        attack.attack()
        if attack.success:
-            self.get_listings()
+            
-        if self.listings_ok:
+            self.extract_listings(attack.get_text())
-            self.status = 'Success'
+            if self.listings_ok:
                self.status = 'Success'
            else:
                self.status = 'Failure - No listings in HTML'
        else:
-            self.status = 'Failure'
+            self.status = 'Failure - Bad request
        self.log
-    def get_listings(self):
+        self.log()
        #TODO
        #Funcion que devuelva los listings limpitos
-    def log(self):
+    def get_referencias(self):
        return self.referencias
    def log_in_taskdb(self):
        #TODO
        #Funcion que grabe estado y demas en una mongodb o argo azin
-    def extract_listings(self):
+    def extract_referencias(self, html):
        #TODO
        #Coge el html de una pagina de listado y pilla las referencias.
        #Si no cae ninguna, se considera un fail
        #Registra la propiedad listings_ok
        if self.status == 'Success':
            soup = BeautifulSoup(self.html, 'html5lib')
            ads = sopa.find_all(class_ = "item")
            self.referencias = []
            for ad in ads:
                if self.is_new_listing(ad["data-adid"]):
                    self.referencias.append(ad["data-adid"])  
    def is_new_listing(self, referencia):
        #TODO
        #Comprobar contra base de datos si la referencia existe en base de datos