Merge branch 'dev'

This commit is contained in:
pablomartincalvo 2018-12-25 18:54:06 +01:00
commit 368f8a00bb
10 changed files with 491 additions and 10 deletions

49
analysis/index_batch.py Normal file
View file

@ -0,0 +1,49 @@
from analysis.market_snapshot import Market, available_date_ranges
from db_layer.capturas_interface import capturas_interface
from db_layer.indices_interface import indices_interface
class IndexMM:
    """Count-weighted mean price index over the car-sized listing categories.

    The index is the sum, over each category in ``_CATEGORIES``, of
    (number of listings * mean price) divided by the total number of
    listings in those categories.
    """

    # Size categories that contribute to the index, in accumulation order.
    _CATEGORIES = ('coche grande', 'coche y moto', 'coche pequeño')

    def __init__(self):
        self.name = 'indexmm'
        self.market = None
        self.date = None
        self.data = None
        self.value = None

    def calculate(self, market):
        """Compute the index for ``market`` and store it in ``self.value``.

        ``market`` must expose ``end_date`` and ``get_market_data()``; the
        latter has to return a DataFrame with the ``tamano_categorico`` and
        ``precio`` columns.

        Categories without any listing are skipped: their mean price is NaN
        and ``0 * NaN`` would otherwise turn the whole index into NaN. When
        none of the categories has listings the value is NaN.
        """
        self.market = market
        self.date = self.market.end_date
        self.data = self.market.get_market_data()

        weighted_sum = 0.0
        total_count = 0
        for category in self._CATEGORIES:
            prices = self.data.loc[self.data['tamano_categorico'] == category, 'precio']
            count = prices.shape[0]
            if count == 0:
                # Nothing to weigh in; mean() of an empty selection is NaN.
                continue
            weighted_sum += count * prices.mean()
            total_count += count

        self.value = weighted_sum / total_count if total_count else float('nan')

    def get_data(self):
        """Return the index as a dict with the keys 'name', 'date' and 'value'.

        The value is converted to a built-in float so it can be handed to
        code that does not understand numpy scalars. ``calculate`` must have
        been called first.
        """
        return {'name': self.name,
                'date': self.date,
                'value': float(self.value)}
if __name__ == '__main__':
    # Batch entry point: compute and store one IndexMM value for every
    # configured date window.
    for date_range in available_date_ranges:
        snapshot_rows = capturas_interface.get_market_snapshot(date_range['start'],
                                                               date_range['end'])

        window_market = Market()
        window_market.load_market(snapshot_rows, date_range=date_range)
        window_market.clean_market('index')

        window_index = IndexMM()
        window_index.calculate(window_market)

        indices_interface.write_index(window_index.get_data())

View file

@ -0,0 +1,76 @@
from datetime import datetime, timedelta
import pandas as pd
# Twelve rolling windows: each starts on the first day of a month, from
# October 2017 through September 2018, and ends on the first day of the
# month three months later. Month arithmetic is done on a zero-based month
# counter (9 == October 2017) to roll over the year boundary.
available_date_ranges = [
    {'start': datetime(2017 + (9 + offset) // 12, (9 + offset) % 12 + 1, 1),
     'end': datetime(2017 + (12 + offset) // 12, (12 + offset) % 12 + 1, 1)}
    for offset in range(12)
]
class Market:
    """Snapshot of listings for one date range, held as a pandas DataFrame."""

    # Cleaning methods understood by clean_market().
    _CLEAN_METHODS = ('index', 'valoracion')

    # Size categories that clean_market() always discards.
    _EXCLUDED_SIZES = ['2 coches o más', 'moto']

    # Columns that identify a repeated listing when de-duplicating.
    _DUPLICATE_KEY = ['tamano_categorico', 'precio', 'latitud', 'longitud', 'telefono']

    # Accepted price interval per (tipo_anuncio, tamano_categorico) pair.
    # NOTE(review): judging by the price ranges, tipo_anuncio 1 looks like
    # sale listings and 2 like rentals -- confirm against the data model.
    _PRICE_LIMITS = (
        {'tipo_anuncio': 1, 'tamano_categorico': 'coche grande',
         'min_precio': 1000, 'max_precio': 150000},
        {'tipo_anuncio': 1, 'tamano_categorico': 'coche pequeño',
         'min_precio': 1000, 'max_precio': 150000},
        {'tipo_anuncio': 1, 'tamano_categorico': 'coche y moto',
         'min_precio': 1000, 'max_precio': 200000},
        {'tipo_anuncio': 1, 'tamano_categorico': 'moto',
         'min_precio': 1000, 'max_precio': 40000},
        {'tipo_anuncio': 2, 'tamano_categorico': 'coche grande',
         'min_precio': 10, 'max_precio': 300},
        {'tipo_anuncio': 2, 'tamano_categorico': 'coche pequeño',
         'min_precio': 10, 'max_precio': 300},
        {'tipo_anuncio': 2, 'tamano_categorico': 'coche y moto',
         'min_precio': 10, 'max_precio': 3000},
        {'tipo_anuncio': 2, 'tamano_categorico': 'moto',
         'min_precio': 10, 'max_precio': 150},
    )

    def __init__(self):
        self.start_date = None
        self.end_date = None
        self.market = None

    def load_market(self, market_query_results, date_range):
        """Load query results into a DataFrame and remember the date range.

        ``market_query_results`` is anything ``pd.DataFrame`` accepts;
        ``date_range`` is a dict with the keys 'start' and 'end'.
        """
        self.market = pd.DataFrame(market_query_results)
        self.start_date = date_range['start']
        self.end_date = date_range['end']

    def clean_market(self, method):
        """Filter the loaded listings in place according to ``method``.

        Both 'index' and 'valoracion' drop rows without a size category,
        discard the excluded size categories, de-duplicate (keeping the last
        occurrence), keep only ``tipo_anuncio == 1`` and finally remove price
        outliers. 'valoracion' additionally keeps only rows whose
        ``precision`` is 'ROOFTOP', and does so before de-duplicating.
        Any other ``method`` leaves the data untouched.

        An empty snapshot is left as is instead of failing on the missing
        columns.
        """
        if method not in self._CLEAN_METHODS:
            return
        if self.market.empty:
            # A DataFrame built from no rows has no columns to filter on.
            return

        listings = self.market.dropna(subset=['tamano_categorico'])
        listings = listings[~listings['tamano_categorico'].isin(self._EXCLUDED_SIZES)]
        if method == 'valoracion':
            listings = listings[listings['precision'].isin(['ROOFTOP'])]
        # De-duplication runs before the tipo_anuncio filter, so the row kept
        # for a group may itself be removed by that filter.
        listings = listings.drop_duplicates(subset=self._DUPLICATE_KEY, keep='last')
        self.market = listings[listings['tipo_anuncio'] == 1]
        self.delete_outliers()

    def delete_outliers(self):
        """Drop rows whose price is outside the interval set for their group.

        Rows whose (tipo_anuncio, tamano_categorico) pair has no configured
        interval, and rows with a missing price, are kept.
        """
        if self.market.empty:
            return
        for limits in self._PRICE_LIMITS:
            in_group = ((self.market['tipo_anuncio'] == limits['tipo_anuncio'])
                        & (self.market['tamano_categorico'] == limits['tamano_categorico']))
            out_of_range = ((self.market['precio'] < limits['min_precio'])
                            | (self.market['precio'] > limits['max_precio']))
            self.market = self.market.loc[~(in_group & out_of_range)]

    def get_market_data(self):
        """Return the current DataFrame of listings."""
        return self.market

View file

@ -261,7 +261,6 @@ class AdHtmlParser:
# TODO capturar datos de visitas # TODO capturar datos de visitas
def _validate(self): def _validate(self):
self.invalid_fields = [] self.invalid_fields = []
@ -287,11 +286,9 @@ class AdHtmlParser:
self.invalid_fields.append('distrito') self.invalid_fields.append('distrito')
if (self.ad_fields['telefono']['found'] if (self.ad_fields['telefono']['found']
and not re.match(r"[0-9]{1,20}", self.ad_fields['telefono']['value'])): and not re.match(r".\+?.[0-9]{1,20}", self.ad_fields['telefono']['value'])):
self.invalid_fields.append('telefono') self.invalid_fields.append('telefono')
#TODO añadir + a caracteres validos
def all_fields_are_valid(self): def all_fields_are_valid(self):
self._validate() self._validate()

View file

@ -14,7 +14,7 @@ databases = {'dev':{'host': '185.166.215.170 ',
try: try:
current_db = environ['DROGON_ENV'] current_db = environ['DROGON_ENV']
except KeyError: except KeyError:
current_db = 'pro' current_db = 'dev'
try: try:
current_db_parameters = databases[current_db] current_db_parameters = databases[current_db]

View file

@ -96,6 +96,30 @@ class CapturasInterface:
self.anunciosdb.query(query_statement, query_parameters) self.anunciosdb.query(query_statement, query_parameters)
def get_market_snapshot(self, start_date, end_date):
    """Fetch the latest capture of each listing when it falls inside a window.

    The statement selects every row of ``anuncios.capturas`` whose
    ``fecha_captura`` is the maximum for its ``referencia`` and lies
    BETWEEN the two bounds. Both bounds are rendered as midnight of the
    given day, so any time-of-day on ``start_date``/``end_date`` is ignored.

    Returns whatever ``fetchall()`` yields on the cursor obtained with
    ``dictionary=True`` (presumably a list of dict rows -- confirm against
    the MySQL wrapper).
    """
    query_statement = """
SELECT *
FROM `anuncios`.`capturas` `t1`
WHERE
(
(
`t1`.`fecha_captura` =
(
SELECT
max(`t2`.`fecha_captura`)
FROM `anuncios`.`capturas` `t2`
WHERE (`t1`.`referencia` = `t2`.`referencia`)
)
)
AND (`t1`.`fecha_captura` BETWEEN %(start_date)s AND %(end_date)s)
)
"""
    midnight_format = '%Y-%m-%d 00:00:00'
    bounds = {'start_date': start_date.strftime(midnight_format),
              'end_date': end_date.strftime(midnight_format)}
    return self.anunciosdb.query(query_statement, bounds, dictionary=True).fetchall()
capturas_interface = CapturasInterface() capturas_interface = CapturasInterface()

View file

@ -0,0 +1,6 @@
-- Stores computed index values, one row per (date, index name).
-- IndicesInterface.write_index fills it with REPLACE INTO, which relies on
-- the composite primary key below to overwrite an existing value.
CREATE TABLE `valores_indices` (
`fecha_valor` datetime NOT NULL, -- date the value refers to
`nombre_indice` varchar(255) NOT NULL, -- index identifier
`valor_indice` double NOT NULL, -- index value
PRIMARY KEY (`fecha_valor`,`nombre_indice`)
) ENGINE=InnoDB DEFAULT CHARSET=latin1 ;

View file

@ -0,0 +1,21 @@
from core.mysql_wrapper import get_anunciosdb
class IndicesInterface:
    """Write access to the ``valores_indices`` table."""

    def __init__(self):
        # Database handle obtained from the project's MySQL wrapper.
        self.anunciosdb = get_anunciosdb()

    def write_index(self, index_data):
        """Store one index value, replacing any row with the same key.

        ``index_data`` is passed unchanged as the statement parameters, so
        it must provide the 'date', 'name' and 'value' entries the
        placeholders refer to.
        """
        self.anunciosdb.query(
            """ REPLACE INTO valores_indices
(fecha_valor, nombre_indice, valor_indice)
VALUES
(%(date)s, %(name)s, %(value)s)
""",
            index_data,
        )
# Shared module-level instance. Creating it calls get_anunciosdb() as soon as
# this module is imported.
indices_interface = IndicesInterface()

View file

@ -1,4 +1,5 @@
bs4 bs4
mysql-connector-python mysql-connector-python
requests==2.18.4 requests==2.18.4
html5lib html5lib
pandas==0.23.4

File diff suppressed because one or more lines are too long

106
tests/index_batch_tests.py Normal file
View file

@ -0,0 +1,106 @@
from analysis.market_snapshot import Market
from analysis.index_batch import IndexMM
import pandas as pd
# Fixture listings. Every row shares the same coordinates; the tuples below
# hold (tamano_categorico, tipo_anuncio, precio, calle, telefono).
sample_market = [
    {'tamano_categorico': size,
     'tipo_anuncio': ad_type,
     'precio': price,
     'calle': street,
     'telefono': phone,
     'latitud': 2.1,
     'longitud': 1.2}
    for size, ad_type, price, street, phone in [
        ('coche pequeño', 1, 15000, 'B1', 123),
        ('coche pequeño', 1, 20000, 'B2', 321),
        ('coche grande', 1, 20000, 'B2', 321),
        ('coche grande', 1, 25000, 'B2', 123),
        ('coche y moto', 1, 22000, 'B1', 456),
        ('coche y moto', 1, 26000, 'B3', 789),
        # Row without a size category.
        (None, 1, 15000, 'abc', 456),
        ('moto', 1, 3000, 'B4', 123),
        ('2 coches o más', 1, 60000, 'B4', 123),
        # Same category, price, coordinates and phone as the second row.
        ('coche pequeño', 1, 20000, 'B2', 321),
        # The only row with tipo_anuncio 2.
        ('coche pequeño', 2, 50, 'B4', 123),
        ('moto', 1, 300000, 'B4', 123),
    ]
]
# Date window attached to the snapshot; IndexMM reports its 'end' as the
# index date.
date_range = {'start': '2018-01-01 00:00:00',
              'end': '2018-02-01 00:00:00'}

market = Market()
market.load_market(sample_market,
                   date_range=date_range)
# Turn missing values (None) into NaN. float('nan') is used instead of the
# pd.np.nan alias, which newer pandas releases no longer provide.
market.market.fillna(value=float('nan'), inplace=True)
print(market.market.to_string())
market.clean_market('index')
print(market.market.to_string())

index = IndexMM()
index.calculate(market)
result = index.get_data()

# After cleaning, two listings remain in each indexed category:
# coche pequeño 15000/20000, coche grande 20000/25000, coche y moto
# 22000/26000. With equal counts the index is the plain mean of the six.
expected_value = (15000 + 20000 + 20000 + 25000 + 22000 + 26000) / 6
assert result['name'] == 'indexmm'
assert result['date'] == date_range['end']
assert abs(result['value'] - expected_value) < 1e-6