Testeos en desarrollo del sistema de capturas. Pequeños retoques.

This commit is contained in:
pablomartincalvo 2018-10-13 02:08:58 +02:00
parent e97bbba274
commit 6a0baf4de6
7 changed files with 257 additions and 177 deletions

View file

@ -1,11 +1,10 @@
import sys
sys.path.append('..')
from time import sleep
from bs4 import BeautifulSoup
import re
from mysql.capturing_tasks_interface import capturing_interface
from mysql.capturas_interface import capturas_interface
from db_layer.capturing_tasks_interface import capturing_interface
from db_layer.capturas_interface import capturas_interface
from core.scrapping_utils import UrlAttack
@ -60,7 +59,7 @@ class CapturingTask:
attack = UrlAttack(self.ad_url)
attack.attack()
if attack.success():
if attack.success:
self.html = attack.get_text()
self._extract_data()
@ -79,8 +78,6 @@ class CapturingTask:
self.parser.parse()
def _check_data(self):
self.parser.validate()
if not self.parser.all_fields_are_valid():
self._update_status('Invalid value fields')
return
@ -169,9 +166,9 @@ class AdHtmlParser:
soup = BeautifulSoup(self.html, 'html5lib')
if soup.findall('link', {'rel': 'canonical'}) is not None:
if soup.find_all('link', {'rel': 'canonical'}) is not None:
self.ad_fields['referencia']['value'] = re.findall(r'[0-9]{5,20}',
str(soup.findall('link', {'rel': 'canonical'})[0]))[0]
str(soup.find_all('link', {'rel': 'canonical'})[0]))[0]
self.ad_fields['referencia']['found'] = True
if soup.find_all('strong', {'class': 'price'}) is not None:
@ -180,19 +177,20 @@ class AdHtmlParser:
self.ad_fields['precio']['found'] = True
if soup.find('div', {'class': 'info-features'}) is not None:
self.ad_fields['tamano_categorico']['value'] = soup.find('div',
{'class': 'info-features'}).find('span').find(
'span').text
self.ad_fields['tamano_categorico']['found'] = True
try:
self.ad_fields['tamano_categorico']['value'] = soup.find('div', {'class': 'info-features'}).find('span').find('span').text
self.ad_fields['tamano_categorico']['found'] = True
except:
pass
posible_m2 = [tag.text for tag in soup.find('div', {'class': 'info-features'}).findAll('span')]
posible_m2 = [tag.text for tag in soup.find('div', {'class': 'info-features'}).find_all('span')]
if [posible for posible in posible_m2 if '' in posible]:
self.ad_fields['m2']['value'] = \
[''.join(re.findall(r'[0-9]', posible)) for posible in posible_m2 if '' in posible][0]
self.ad_fields['m2']['found'] = True
if soup.find('title') is not None:
if 'venta' in soup.find('title'):
if 'venta' in soup.find('title').text:
self.ad_fields['tipo_anuncio']['value'] = 1
else:
self.ad_fields['tipo_anuncio']['value'] = 2
@ -200,14 +198,14 @@ class AdHtmlParser:
if len(soup.find('div', {'id': 'headerMap'}).find_all('li')) > 3:
self.ad_fields['calle']['value'] = ''
self.ad_fields['ciudad']['value'] = soup.find('div', {'id': 'headerMap'}).find_all('li')[-2].text
self.ad_fields['ciudad']['value'] = soup.find('div', {'id': 'headerMap'}).find_all('li')[-2].text.strip()
self.ad_fields['ciudad']['found'] = True
self.ad_fields['distrito']['value'] = soup.find('div', {'id': 'headerMap'}).find_all('li')[-3].text
self.ad_fields['distrito']['value'] = soup.find('div', {'id': 'headerMap'}).find_all('li')[-3].text.strip()
self.ad_fields['distrito']['found'] = True
self.ad_fields['barrio']['value'] = soup.find('div', {'id': 'headerMap'}).find_all('li')[-4].text
self.ad_fields['barrio']['value'] = soup.find('div', {'id': 'headerMap'}).find_all('li')[-4].text.strip()
self.ad_fields['barrio']['found'] = True
if len(soup.find('div', {'id': 'headerMap'}).find_all('li')) > 4:
self.ad_fields['calle']['value'] = soup.find('div', {'id': 'headerMap'}).find_all('li')[0].text
self.ad_fields['calle']['value'] = soup.find('div', {'id': 'headerMap'}).find_all('li')[0].text.strip()
self.ad_fields['calle']['found'] = True
features_lists = soup.find_all('div', {'class': 'details-property_features'})
@ -231,13 +229,13 @@ class AdHtmlParser:
.text.replace(' ', '')
self.ad_fields['telefono']['found'] = True
def validate(self):
def _validate(self):
self.invalid_fields = []
if re.match(r"[0-9]{4,20}", self.ad_fields['referencia']['value']):
if not re.match(r"[0-9]{4,20}", self.ad_fields['referencia']['value']):
self.invalid_fields.append('referencia')
if re.match(r"[0-9]{1,20}", self.ad_fields['precio']['value']):
if not re.match(r"[0-9]{1,20}", self.ad_fields['precio']['value']):
self.invalid_fields.append('precio')
possible_values_tamano = ['2 coches o más',
@ -255,18 +253,19 @@ class AdHtmlParser:
if not 'Distrito' in self.ad_fields['distrito']['value']:
self.invalid_fields.append('distrito')
if re.match(r"[0-9]{1,20}", self.ad_fields['telefono']['value']):
if not re.match(r"[0-9]{1,20}", self.ad_fields['telefono']['value']):
self.invalid_fields.append('telefono')
def all_fields_are_valid(self):
self._validate()
if self.invalid_fields:
return False
else:
return True
def fields_missing(self):
for ad_field in self.ad_fields:
if not ad_field['optional'] and not ad_field['found']:
for key, contents in self.ad_fields.items():
if not contents['optional'] and not contents['found']:
return True
return False