Testeos en desarrollo del sistema de capturas. Pequeños retoques.
This commit is contained in:
parent
e97bbba274
commit
6a0baf4de6
7 changed files with 257 additions and 177 deletions
|
|
@ -1,11 +1,10 @@
|
|||
import sys
|
||||
|
||||
sys.path.append('..')
|
||||
from time import sleep
|
||||
from bs4 import BeautifulSoup
|
||||
import re
|
||||
from mysql.capturing_tasks_interface import capturing_interface
|
||||
from mysql.capturas_interface import capturas_interface
|
||||
from db_layer.capturing_tasks_interface import capturing_interface
|
||||
from db_layer.capturas_interface import capturas_interface
|
||||
from core.scrapping_utils import UrlAttack
|
||||
|
||||
|
||||
|
|
@ -60,7 +59,7 @@ class CapturingTask:
|
|||
attack = UrlAttack(self.ad_url)
|
||||
attack.attack()
|
||||
|
||||
if attack.success():
|
||||
if attack.success:
|
||||
self.html = attack.get_text()
|
||||
|
||||
self._extract_data()
|
||||
|
|
@ -79,8 +78,6 @@ class CapturingTask:
|
|||
self.parser.parse()
|
||||
|
||||
def _check_data(self):
|
||||
self.parser.validate()
|
||||
|
||||
if not self.parser.all_fields_are_valid():
|
||||
self._update_status('Invalid value fields')
|
||||
return
|
||||
|
|
@ -169,9 +166,9 @@ class AdHtmlParser:
|
|||
|
||||
soup = BeautifulSoup(self.html, 'html5lib')
|
||||
|
||||
if soup.findall('link', {'rel': 'canonical'}) is not None:
|
||||
if soup.find_all('link', {'rel': 'canonical'}) is not None:
|
||||
self.ad_fields['referencia']['value'] = re.findall(r'[0-9]{5,20}',
|
||||
str(soup.findall('link', {'rel': 'canonical'})[0]))[0]
|
||||
str(soup.find_all('link', {'rel': 'canonical'})[0]))[0]
|
||||
self.ad_fields['referencia']['found'] = True
|
||||
|
||||
if soup.find_all('strong', {'class': 'price'}) is not None:
|
||||
|
|
@ -180,19 +177,20 @@ class AdHtmlParser:
|
|||
self.ad_fields['precio']['found'] = True
|
||||
|
||||
if soup.find('div', {'class': 'info-features'}) is not None:
|
||||
self.ad_fields['tamano_categorico']['value'] = soup.find('div',
|
||||
{'class': 'info-features'}).find('span').find(
|
||||
'span').text
|
||||
self.ad_fields['tamano_categorico']['found'] = True
|
||||
try:
|
||||
self.ad_fields['tamano_categorico']['value'] = soup.find('div', {'class': 'info-features'}).find('span').find('span').text
|
||||
self.ad_fields['tamano_categorico']['found'] = True
|
||||
except:
|
||||
pass
|
||||
|
||||
posible_m2 = [tag.text for tag in soup.find('div', {'class': 'info-features'}).findAll('span')]
|
||||
posible_m2 = [tag.text for tag in soup.find('div', {'class': 'info-features'}).find_all('span')]
|
||||
if [posible for posible in posible_m2 if 'm²' in posible]:
|
||||
self.ad_fields['m2']['value'] = \
|
||||
[''.join(re.findall(r'[0-9]', posible)) for posible in posible_m2 if 'm²' in posible][0]
|
||||
self.ad_fields['m2']['found'] = True
|
||||
|
||||
if soup.find('title') is not None:
|
||||
if 'venta' in soup.find('title'):
|
||||
if 'venta' in soup.find('title').text:
|
||||
self.ad_fields['tipo_anuncio']['value'] = 1
|
||||
else:
|
||||
self.ad_fields['tipo_anuncio']['value'] = 2
|
||||
|
|
@ -200,14 +198,14 @@ class AdHtmlParser:
|
|||
|
||||
if len(soup.find('div', {'id': 'headerMap'}).find_all('li')) > 3:
|
||||
self.ad_fields['calle']['value'] = ''
|
||||
self.ad_fields['ciudad']['value'] = soup.find('div', {'id': 'headerMap'}).find_all('li')[-2].text
|
||||
self.ad_fields['ciudad']['value'] = soup.find('div', {'id': 'headerMap'}).find_all('li')[-2].text.strip()
|
||||
self.ad_fields['ciudad']['found'] = True
|
||||
self.ad_fields['distrito']['value'] = soup.find('div', {'id': 'headerMap'}).find_all('li')[-3].text
|
||||
self.ad_fields['distrito']['value'] = soup.find('div', {'id': 'headerMap'}).find_all('li')[-3].text.strip()
|
||||
self.ad_fields['distrito']['found'] = True
|
||||
self.ad_fields['barrio']['value'] = soup.find('div', {'id': 'headerMap'}).find_all('li')[-4].text
|
||||
self.ad_fields['barrio']['value'] = soup.find('div', {'id': 'headerMap'}).find_all('li')[-4].text.strip()
|
||||
self.ad_fields['barrio']['found'] = True
|
||||
if len(soup.find('div', {'id': 'headerMap'}).find_all('li')) > 4:
|
||||
self.ad_fields['calle']['value'] = soup.find('div', {'id': 'headerMap'}).find_all('li')[0].text
|
||||
self.ad_fields['calle']['value'] = soup.find('div', {'id': 'headerMap'}).find_all('li')[0].text.strip()
|
||||
self.ad_fields['calle']['found'] = True
|
||||
|
||||
features_lists = soup.find_all('div', {'class': 'details-property_features'})
|
||||
|
|
@ -231,13 +229,13 @@ class AdHtmlParser:
|
|||
.text.replace(' ', '')
|
||||
self.ad_fields['telefono']['found'] = True
|
||||
|
||||
def validate(self):
|
||||
def _validate(self):
|
||||
self.invalid_fields = []
|
||||
|
||||
if re.match(r"[0-9]{4,20}", self.ad_fields['referencia']['value']):
|
||||
if not re.match(r"[0-9]{4,20}", self.ad_fields['referencia']['value']):
|
||||
self.invalid_fields.append('referencia')
|
||||
|
||||
if re.match(r"[0-9]{1,20}", self.ad_fields['precio']['value']):
|
||||
if not re.match(r"[0-9]{1,20}", self.ad_fields['precio']['value']):
|
||||
self.invalid_fields.append('precio')
|
||||
|
||||
possible_values_tamano = ['2 coches o más',
|
||||
|
|
@ -255,18 +253,19 @@ class AdHtmlParser:
|
|||
if not 'Distrito' in self.ad_fields['distrito']['value']:
|
||||
self.invalid_fields.append('distrito')
|
||||
|
||||
if re.match(r"[0-9]{1,20}", self.ad_fields['telefono']['value']):
|
||||
if not re.match(r"[0-9]{1,20}", self.ad_fields['telefono']['value']):
|
||||
self.invalid_fields.append('telefono')
|
||||
|
||||
def all_fields_are_valid(self):
|
||||
self._validate()
|
||||
if self.invalid_fields:
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
def fields_missing(self):
|
||||
for ad_field in self.ad_fields:
|
||||
if not ad_field['optional'] and not ad_field['found']:
|
||||
for key, contents in self.ad_fields.items():
|
||||
if not contents['optional'] and not contents['found']:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue