class IndexMM:
    """Average sale-price index over the car-sized categories of a market.

    The index is the count-weighted mean of ``precio`` across the values of
    ``tamano_categorico`` listed in ``CATEGORIES``.
    """

    # Values of ``tamano_categorico`` that contribute to the index.
    CATEGORIES = ('coche pequeño', 'coche grande', 'coche y moto')

    def __init__(self):
        self.name = 'indexmm'
        self.market = None
        self.date = None
        self.data = None
        self.value = None

    def calculate(self, market):
        """Compute the index for ``market`` and store it in ``self.value``.

        Args:
            market: object exposing ``end_date`` and ``get_market_data()``;
                the latter returns a DataFrame with ``tamano_categorico``
                and ``precio`` columns.

        ``self.value`` is left as ``None`` when no row belongs to any of the
        indexed categories, instead of becoming NaN.
        """
        self.market = market
        self.date = self.market.end_date
        self.data = self.market.get_market_data()
        self.value = None

        # A snapshot built from an empty query result has no columns at all.
        if self.data.empty:
            return

        weighted_sum = 0.0
        total_count = 0
        for category in self.CATEGORIES:
            in_category = self.data['tamano_categorico'] == category
            prices = self.data.loc[in_category, 'precio']
            count = prices.shape[0]
            if count == 0:
                # mean() of an empty selection is NaN and 0 * NaN is NaN,
                # which would poison the whole index.
                continue
            weighted_sum += count * prices.mean()
            total_count += count

        if total_count:
            self.value = weighted_sum / total_count

    def get_data(self):
        """Return the index as a dict with ``name``, ``date`` and ``value``.

        ``value`` is a plain Python float, or ``None`` when no index could be
        calculated.
        """
        value = None if self.value is None else float(self.value)
        return {'name': self.name,
                'date': self.date,
                'value': value}


if __name__ == '__main__':
    # Project imports are only needed by the batch run. Keeping them inside
    # the guard lets other modules import IndexMM without triggering the
    # module-level interface objects of db_layer.
    from analysis.market_snapshot import Market, available_date_ranges
    from db_layer.capturas_interface import capturas_interface
    from db_layer.indices_interface import indices_interface

    for date_range in available_date_ranges:
        market = Market()
        market.load_market(
            capturas_interface.get_market_snapshot(date_range['start'],
                                                   date_range['end']),
            date_range=date_range)
        market.clean_market('index')

        index = IndexMM()
        index.calculate(market)

        if index.value is None:
            # No usable ads in this window: there is nothing to store.
            continue

        indices_interface.write_index(index.get_data())
from datetime import datetime, timedelta
import pandas as pd

# Three-month windows, shifted one month at a time, for which an index is
# calculated. Each entry holds the 'start' and 'end' datetime of a window.
available_date_ranges = [
    {'start': datetime(2017, 10, 1), 'end': datetime(2018, 1, 1)},
    {'start': datetime(2017, 11, 1), 'end': datetime(2018, 2, 1)},
    {'start': datetime(2017, 12, 1), 'end': datetime(2018, 3, 1)},
    {'start': datetime(2018, 1, 1), 'end': datetime(2018, 4, 1)},
    {'start': datetime(2018, 2, 1), 'end': datetime(2018, 5, 1)},
    {'start': datetime(2018, 3, 1), 'end': datetime(2018, 6, 1)},
    {'start': datetime(2018, 4, 1), 'end': datetime(2018, 7, 1)},
    {'start': datetime(2018, 5, 1), 'end': datetime(2018, 8, 1)},
    {'start': datetime(2018, 6, 1), 'end': datetime(2018, 9, 1)},
    {'start': datetime(2018, 7, 1), 'end': datetime(2018, 10, 1)},
    {'start': datetime(2018, 8, 1), 'end': datetime(2018, 11, 1)},
    {'start': datetime(2018, 9, 1), 'end': datetime(2018, 12, 1)},
]


class Market:
    """In-memory snapshot of captured ads for one date range.

    The snapshot is held as a pandas DataFrame in ``self.market`` and is
    narrowed in place by ``clean_market`` / ``delete_outliers``.
    """

    # Cleaning methods understood by clean_market.
    _CLEANING_METHODS = ('index', 'valoracion')
    # Size categories that are always discarded when cleaning.
    _EXCLUDED_SIZES = ['2 coches o más', 'moto']
    # Columns that identify the same ad captured more than once.
    _DUPLICATE_KEY = ['tamano_categorico', 'precio', 'latitud', 'longitud',
                      'telefono']
    # (tipo_anuncio, tamano_categorico, min_precio, max_precio): rows of that
    # type and size whose price falls outside [min, max] are outliers.
    _PRICE_LIMITS = (
        (1, 'coche grande', 1000, 150000),
        (1, 'coche pequeño', 1000, 150000),
        (1, 'coche y moto', 1000, 200000),
        (1, 'moto', 1000, 40000),
        (2, 'coche grande', 10, 300),
        (2, 'coche pequeño', 10, 300),
        (2, 'coche y moto', 10, 3000),
        (2, 'moto', 10, 150),
    )

    def __init__(self):
        self.start_date = None
        self.end_date = None
        self.market = None

    def load_market(self, market_query_results, date_range):
        """Load query rows into a DataFrame and remember the date range.

        Args:
            market_query_results: rows accepted by ``pd.DataFrame`` (the
                callers in this project pass a list of dicts).
            date_range: dict with ``'start'`` and ``'end'`` entries.
        """
        self.market = pd.DataFrame(market_query_results)
        self.start_date = date_range['start']
        self.end_date = date_range['end']

    def clean_market(self, method):
        """Filter the snapshot in place according to ``method``.

        ``'index'`` and ``'valoracion'`` share the same pipeline: drop rows
        without size category, drop the excluded sizes, de-duplicate, keep
        only ``tipo_anuncio == 1`` and remove price outliers. ``'valoracion'``
        additionally keeps only rows whose ``precision`` is ``'ROOFTOP'``.
        Any other value leaves the snapshot untouched.
        """
        if method not in self._CLEANING_METHODS:
            return
        # An empty query result yields a DataFrame without columns; every
        # column lookup below would raise KeyError, and there is nothing to
        # clean anyway.
        if self.market.empty:
            return

        market = self.market.dropna(subset=['tamano_categorico'])
        market = market[~market['tamano_categorico'].isin(self._EXCLUDED_SIZES)]
        if method == 'valoracion':
            market = market[market['precision'].isin(['ROOFTOP'])]
        # De-duplication runs before the tipo_anuncio filter, as it always
        # has; the duplicate key does not include tipo_anuncio.
        market = market.drop_duplicates(subset=self._DUPLICATE_KEY, keep='last')
        market = market[market['tipo_anuncio'] == 1]

        self.market = market
        self.delete_outliers()

    def delete_outliers(self):
        """Drop rows whose price is outside the limits for their type/size."""
        if self.market.empty:
            return

        market = self.market
        for ad_type, size, min_price, max_price in self._PRICE_LIMITS:
            out_of_range = ((market['precio'] < min_price)
                            | (market['precio'] > max_price))
            is_outlier = ((market['tipo_anuncio'] == ad_type)
                          & (market['tamano_categorico'] == size)
                          & out_of_range)
            market = market.loc[~is_outlier]
        self.market = market

    def get_market_data(self):
        """Return the current (possibly cleaned) snapshot DataFrame."""
        return self.market
def get_market_snapshot(self, start_date, end_date):
    """Fetch the latest capture of every ad, limited to a date window.

    A row of ``anuncios.capturas`` is returned when its ``fecha_captura`` is
    the most recent one for its ``referencia`` and lies between the two
    dates (both truncated to midnight, both inclusive).

    NOTE(review): the inner ``max`` is taken over all captures, not only
    those inside the window, so an ad captured again after ``end_date`` is
    left out of the result -- confirm this is intended.

    Args:
        start_date: object with ``strftime`` marking the start of the window.
        end_date: object with ``strftime`` marking the end of the window.

    Returns:
        Whatever ``fetchall()`` of the cursor yields; the query is issued
        with ``dictionary=True``.
    """
    midnight_format = '%Y-%m-%d 00:00:00'
    window = {'start_date': start_date.strftime(midnight_format),
              'end_date': end_date.strftime(midnight_format)}

    latest_capture_in_window = """
        SELECT *
        FROM `anuncios`.`capturas` `t1`
        WHERE
        (
            (
                `t1`.`fecha_captura` =
                (
                    SELECT
                        max(`t2`.`fecha_captura`)
                    FROM `anuncios`.`capturas` `t2`
                    WHERE (`t1`.`referencia` = `t2`.`referencia`)
                )
            )
            AND (`t1`.`fecha_captura` BETWEEN %(start_date)s AND %(end_date)s)
        )
        """

    cursor = self.anunciosdb.query(latest_capture_in_window, window,
                                   dictionary=True)
    return cursor.fetchall()
from core.mysql_wrapper import get_anunciosdb


class IndicesInterface:
    """Writes calculated index values to the ``valores_indices`` table."""

    # REPLACE INTO overwrites an existing row with the same key instead of
    # failing, so re-running a calculation updates the stored value.
    _REPLACE_INDEX_VALUE = """ REPLACE INTO valores_indices
                            (fecha_valor, nombre_indice, valor_indice)
                            VALUES
                            (%(date)s, %(name)s, %(value)s)
                        """

    def __init__(self):
        self.anunciosdb = get_anunciosdb()

    def write_index(self, index_data):
        """Store one index value.

        Args:
            index_data: mapping with the ``date``, ``name`` and ``value``
                entries referenced by the statement's placeholders.
        """
        self.anunciosdb.query(self._REPLACE_INDEX_VALUE, index_data)


indices_interface = IndicesInterface()

Alquiler de Garaje en calle de Balmes, 138 La Dreta de l'Eixample, Barcelona Ver mapa

30 €/mes
1 m²

Comentario del anunciante

Características básicas

  • 1 m²

¿Hay algún error en este anuncio?

Infórmanos para corregirlo y ayudar a otros usuarios.

Cuéntanos qué error has visto

¿Cuánto vale este inmueble?

Te enviamos un informe con la estimación de precio para este inmueble y con información de la zona.

Comprar estimación de precio

Ubicación

  • Calle de Balmes, 138
  • Urb. Eixample esquerra
  • Barrio La Dreta de l'Eixample
  • Distrito Eixample
  • Barcelona
  • Área de Barcelona, Barcelona

Estadísticas

from analysis.market_snapshot import Market
from analysis.index_batch import IndexMM
import pandas as pd


def _ad(size, ad_type, price, street, phone):
    """Build one sample ad row; every sample shares the same coordinates."""
    return {'tamano_categorico': size,
            'tipo_anuncio': ad_type,
            'precio': price,
            'calle': street,
            'telefono': phone,
            'latitud': 2.1,
            'longitud': 1.2}


sample_market = [
    _ad('coche pequeño', 1, 15000, 'B1', 123),
    _ad('coche pequeño', 1, 20000, 'B2', 321),
    _ad('coche grande', 1, 20000, 'B2', 321),
    _ad('coche grande', 1, 25000, 'B2', 123),
    _ad('coche y moto', 1, 22000, 'B1', 456),
    _ad('coche y moto', 1, 26000, 'B3', 789),
    _ad(None, 1, 15000, 'abc', 456),               # no size category
    _ad('moto', 1, 3000, 'B4', 123),               # excluded size
    _ad('2 coches o más', 1, 60000, 'B4', 123),    # excluded size
    _ad('coche pequeño', 1, 20000, 'B2', 321),     # duplicate of the 2nd ad
    _ad('coche pequeño', 2, 50, 'B4', 123),        # tipo_anuncio 2
    _ad('moto', 1, 300000, 'B4', 123),             # excluded size
]
date_range = {'start': '2018-01-01 00:00:00',
              'end': '2018-02-01 00:00:00'}


market = Market()
market.load_market(sample_market,
                   date_range=date_range)
# float('nan') replaces the pd.np.nan alias, which newer pandas no longer has.
market.market.fillna(value=float('nan'), inplace=True)
print(market.market.to_string())
market.clean_market('index')
print(market.market.to_string())

# Six ads survive the cleaning: two per indexed size category.
assert not pd.isna(market.market['tamano_categorico']).any()
assert sorted(market.market['precio'].tolist()) == [15000, 20000, 20000,
                                                    22000, 25000, 26000]

index = IndexMM()
index.calculate(market)
result = index.get_data()

assert result['name'] == 'indexmm'
assert result['date'] == date_range['end']
# (2 * 17500 + 2 * 22500 + 2 * 24000) / 6
assert abs(result['value'] - 128000 / 6) < 1e-6