Arreglos menores en capturer y refresher.

This commit is contained in:
pablomartincalvo 2018-10-19 17:22:09 +02:00
parent e379708c04
commit c3c16e7015
5 changed files with 224 additions and 152 deletions

View file

@ -1,4 +1,5 @@
import sys
sys.path.append('..')
from time import sleep
from bs4 import BeautifulSoup
@ -9,11 +10,13 @@ from db_layer.capturas_interface import capturas_interface
from core.scrapping_utils import UrlAttack
from refresher.refresher import Refresher
class Capturer:
sleep_time_no_work = 60
minimum_seconds_between_tries = 120
working_hours = {'start': datetime.time(9, 0, 0),
'end': datetime.time(21, 0, 0)}
def start(self):
while True:
@ -42,7 +45,6 @@ class Capturer:
return Capturer.working_hours['start'] <= datetime.datetime.now().time() <= Capturer.working_hours['end']
class CapturingTask:
sleep_time_failed_request = 60
@ -90,14 +92,14 @@ class CapturingTask:
self.parser.parse()
# Classify the parsed ad data and record the outcome through
# self._update_status(): 'Invalid value fields', 'Fields missing' or
# 'Data ready'. Returns None on every path.
# NOTE(review): this listing has no indentation and shows the validity check
# twice (before and after the missing-fields check). That looks like the old
# and new lines of a diff hunk rendered together -- confirm against the real
# file which of the two positions is the current one.
def _check_data(self):
# Stop at the first failing check; each failure records its own status.
if not self.parser.all_fields_are_valid():
self._update_status('Invalid value fields')
return
if self.parser.fields_missing():
self._update_status('Fields missing')
return
# Same test as the first one above; it can only fire here if
# all_fields_are_valid() answers differently on a second call.
if not self.parser.all_fields_are_valid():
self._update_status('Invalid value fields')
return
# Neither check failed.
self._update_status('Data ready')
def get_ad_data(self):
@ -190,7 +192,8 @@ class AdHtmlParser:
if soup.find('div', {'class': 'info-features'}) is not None:
try:
self.ad_fields['tamano_categorico']['value'] = soup.find('div', {'class': 'info-features'}).find('span').find('span').text
self.ad_fields['tamano_categorico']['value'] = soup.find('div', {'class': 'info-features'}).find(
'span').find('span').text
self.ad_fields['tamano_categorico']['found'] = True
except:
pass
@ -198,7 +201,7 @@ class AdHtmlParser:
posible_m2 = [tag.text for tag in soup.find('div', {'class': 'info-features'}).find_all('span')]
if [posible for posible in posible_m2 if '' in posible]:
self.ad_fields['m2']['value'] = \
[''.join(re.findall(r'[0-9]', posible)) for posible in posible_m2 if '' in posible][0]
[''.join(re.findall(r'[0-9]', posible)) for posible in posible_m2 if '' in posible][0]
self.ad_fields['m2']['found'] = True
if soup.find('title') is not None:
@ -265,7 +268,8 @@ class AdHtmlParser:
if not 'Distrito' in self.ad_fields['distrito']['value']:
self.invalid_fields.append('distrito')
if not re.match(r"[0-9]{1,20}", self.ad_fields['telefono']['value']):
if (self.ad_fields['telefono']['found']
and not re.match(r"[0-9]{1,20}", self.ad_fields['telefono']['value'])):
self.invalid_fields.append('telefono')
def all_fields_are_valid(self):
@ -292,4 +296,4 @@ class AdHtmlParser:
# Script entry point: build a Capturer and call its start() method.
# NOTE(review): `capturer.start()` is listed twice below. Presumably these are
# the removed and added versions of the same line in this diff hunk (e.g. an
# end-of-file newline change) rather than two real calls -- confirm against
# the actual file before treating the second call as intentional.
if __name__ == '__main__':
capturer = Capturer()
capturer.start()
capturer.start()