Más avances en el explorer

2018-08-14 20:02:40 +02:00 · 2018-08-14 20:02:40 +02:00 · e466986721
commit e466986721
parent 7e8daab6ce
3 changed files with 60 additions and 49 deletions
--- a/core/mysql_wrapper.py
+++ b/core/mysql_wrapper.py
@@ -30,8 +30,6 @@ class DatabaseWrapper():
    def disconnect(self):
        if self.connection.is_connected():
            self.connection.disconnect()
-        else:
-            print("Connection was not active.")

    def ping(self):
        self.connect()
--- a/core/scrapping_utils.py
+++ b/core/scrapping_utils.py
@@ -20,8 +20,8 @@ class UrlAttack():
    def attack(self):
        self.has_been_attacked = True
        try:
-            self.response = requests.get(self.url, headers = headers, 
-                                            timeout = timeout)
+            self.response = requests.get(self.url, headers = self.headers, 
+                                            timeout = self.timeout)
            if self.response.ok:
                self.success = True
        except Exception:
--- a/explorer/explorer.py
+++ b/explorer/explorer.py
@@ -17,17 +17,19 @@ class Explorer():
    def __init__(self):
        try:
            self.anunciosdb = get_anunciosdb()
+
        except:
-            print("Could not connect to DB")
+            print("Could not connect to anuncios DB")
+        try:
+            self.task_log_db = #get_task_log_db()
+        except:
+            print("Could not connect to task log DB")
        
        self.max_db_retries = 3
        self.db_retries = 0
        self.max_queue_retries = 3
        self.queue_retries = 0
-        
-        
-            
-        
+
    def start(self):
        #Arrancar el servicio
        
@@ -43,18 +45,18 @@ class Explorer():
                break
            
            current_task = ExploringTask(self.compose_listing_url)
-            
            if current_task.is_ready_to_explore:
                current_task.explore()
            else:
-                #Qué?
-            if current_task.status == 'Success':
+                break
+            if current_task.status == 'referencias ready':
                current_referencias = current_task.get_referencias()
+                        
+            for referencia in current_referencias:
+                self.post_task_to_queue(referencia)
+            current_task.update_status('Sent to queue')
            
-            
-            for referencia in current
-
-
+            continue                
        
        self.stop()        
                
@@ -66,6 +68,8 @@ class Explorer():
    def there_is_work(self):
        #TODO
        #Comprueba si hay trabajo por hacer
+        #Mirando en la bd de tasks cuantas se han hecho ultimamente, mensualmente
+        #etc.
        
    def database_is_up(self):
        while self.db_retries <= self.max_db_retries:
@@ -94,6 +98,7 @@ class Explorer():
        return False  
    
    def compose_listing_url(self):
+        #TODO
        #Decide que url hay que componer y la compone
        raiz = 'https://www.idealista.com/'
        tipo = #Logica random
@@ -105,32 +110,36 @@ class Explorer():
        return url
    
    def post_task_to_queue(self, referencia):
+        #TODO
+        #Manda la task a la cola redis
        

    
 class ExploringTask():
    
    def __init__(self, url):
-        self.creation_date = Datetime.Datetime.now()
        self.target_url = url
        self.id = str(uuid.uuid4())
-        self.status = 'Pending'
+        self.update_status('Pending')
        
        try:
            self.anunciosdb = get_anunciosdb()
        except:
            self.anunciosdb = None
-            print("Could not connect to DB")
+            self.update_status('Unable to connect to anuncios DB')
            
        try:
            #TODO
            #Pendiente de implementar wraper para MongoDB
-            #self.task_log_db = None
+            #self.task_log_db = 
        except:
-            #self.task_log_db = None
-            print("Could not connect to MongoDB")        
+            self.update_status('Unable to connect to task log DB')
+            #self.task_log_db = None      
+        
+    def update_status(self, new_status):
+        self.status = new_status
+        self._log_in_taskdb()
        
-        self.log()
        
    def is_ready_to_explore(self):
        if self.anunciosdb is not None and self.task_log_db is not None:
@@ -139,47 +148,51 @@ class ExploringTask():
            return False
        
    def explore(self):
-        self.attack_date = Datetime.Datetime.now()
        attack = UrlAttack(self.url)
        attack.attack()
        
        if attack.success:
-            
-            self.extract_listings(attack.get_text())
-            if self.listings_ok:
-                self.status = 'Success'
+            self._extract_referencias(attack.get_text())
+            if self.new_listings:
+                self.update_status('referencias ready')
            else:
-                self.status = 'Failure - No listings in HTML'
+                self.update_status('Failure - No listings in HTML')
        else:
-            self.status = 'Failure - Bad request
+            self.update_status('Failure - Bad request')

-        
-        self.log()
-        
    def get_referencias(self):
        return self.referencias
        
-    def log_in_taskdb(self):
+    def _log_in_taskdb(self):
        #TODO
        #Funcion que grabe estado y demas en una mongodb o argo azin
        
-    def extract_referencias(self, html):
-        #TODO
-        #Coge el html de una pagina de listado y pilla las referencias.
-        #Si no cae ninguna, se considera un fail
-        #Registra la propiedad listings_ok
-        if self.status == 'Success':
-            soup = BeautifulSoup(self.html, 'html5lib')
-            ads = sopa.find_all(class_ = "item")
-            self.referencias = []
-            
-            
-            for ad in ads:
-                if self.is_new_listing(ad["data-adid"]):
-                    self.referencias.append(ad["data-adid"])  
+    def _extract_referencias(self, html):
+        """
+        Saca referencias de HTML, descarta las que ya exiten en la base de datos
+        de capturas, y guarda si han aparecido listings y si hay alguno nuevo
+        """
+
+        soup = BeautifulSoup(self.html, 'html5lib')
+        ads = sopa.find_all(class_ = "item")
+        self.referencias = []
+        for ad in ads:
+            if self._is_new_listing(ad["data-adid"]):
+                self.referencias.append(ad["data-adid"])        
+        self.new_listings = bool(self.referencias)
        
-    def is_new_listing(self, referencia):
+    def _is_new_listing(self, referencia):
        #TODO
        #Comprobar contra base de datos si la referencia existe en base de datos
+        query_statement = """SELECT count(referencia)
+                             FROM capturas
+                             WHERE referencia = %s"""
+        query_params = (referencia,)
+        cursor_result = self.anunciosdb.query(query_statement, query_params)
        
+        result = cursor_result.fetchone()
+        if result[0] > 0:
+            return False
+        else:
+            return True