diff --git a/analysis/index_batch.py b/analysis/index_batch.py
new file mode 100644
index 0000000..336b248
--- /dev/null
+++ b/analysis/index_batch.py
@@ -0,0 +1,61 @@
+from analysis.market_snapshot import Market, available_date_ranges
+
+
+class IndexMM:
+    """Market price index computed from a cleaned ``Market`` snapshot.
+
+    The index is the count-weighted average of the mean 'precio' of every
+    size category listed in ``CATEGORIES``.
+    """
+
+    # Values of the 'tamano_categorico' column that take part in the index.
+    CATEGORIES = ('coche pequeño', 'coche grande', 'coche y moto')
+
+    def __init__(self):
+        self.name = 'indexmm'
+        self.date = None
+        self.data = None
+        self.market = None
+        # Stays None until calculate() succeeds, so get_data() never raises
+        # AttributeError on a fresh instance.
+        self.value = None
+
+    def calculate(self, market):
+        """Compute the index for ``market`` and store it in ``self.value``.
+
+        ``market`` must provide ``end_date`` and ``get_market_data()``; the
+        data must have 'tamano_categorico' and 'precio' columns.
+        ``self.value`` is None when there is no data or no priced row falls
+        in any of ``CATEGORIES``.
+        """
+        self.market = market
+        self.date = market.end_date
+        self.data = market.get_market_data()
+        self.value = None
+
+        if self.data is None or self.data.empty:
+            return
+
+        weighted_sum = 0
+        total_count = 0
+        for category in self.CATEGORIES:
+            in_category = self.data['tamano_categorico'] == category
+            prices = self.data.loc[in_category, 'precio'].dropna()
+            count = len(prices)
+            if count == 0:
+                # The mean of an empty selection is NaN and would poison the
+                # whole index, so empty categories are skipped.
+                continue
+            weighted_sum += count * prices.mean()
+            total_count += count
+
+        if total_count:
+            self.value = weighted_sum / total_count
+
+        # TODO: continue here
+
+    def get_data(self):
+        """Return the index as a dict with 'name', 'date' and 'value' keys."""
+        return {'name': self.name,
+                'date': self.date,
+                'value': self.value}
diff --git a/analysis/market_snapshot.py b/analysis/market_snapshot.py
new file mode 100644
index 0000000..b735ca9
--- /dev/null
+++ b/analysis/market_snapshot.py
@@ -0,0 +1,96 @@
+from datetime import datetime, timedelta
+import pandas as pd
+
+# Rolling three-month windows; each one starts a month after the previous.
+available_date_ranges = [{'start': datetime(2017, 10, 1), 'end': datetime(2018, 1, 1)},
+                         {'start': datetime(2017, 11, 1), 'end': datetime(2018, 2, 1)},
+                         {'start': datetime(2017, 12, 1), 'end': datetime(2018, 3, 1)},
+                         {'start': datetime(2018, 1, 1), 'end': datetime(2018, 4, 1)},
+                         {'start': datetime(2018, 2, 1), 'end': datetime(2018, 5, 1)},
+                         {'start': datetime(2018, 3, 1), 'end': datetime(2018, 6, 1)},
+                         {'start': datetime(2018, 4, 1), 'end': datetime(2018, 7, 1)},
+                         {'start': datetime(2018, 5, 1), 'end': datetime(2018, 8, 1)},
+                         {'start': datetime(2018, 6, 1), 'end': datetime(2018, 9, 1)},
+                         {'start': datetime(2018, 7, 1), 'end': datetime(2018, 10, 1)},
+                         {'start': datetime(2018, 8, 1), 'end': datetime(2018, 11, 1)},
+                         {'start': datetime(2018, 9, 1), 'end': datetime(2018, 12, 1)}]
+
+
+class Market:
+    """Snapshot of listings held in a pandas DataFrame (``self.market``)."""
+
+    # 'tamano_categorico' values removed by clean_market().
+    EXCLUDED_CATEGORIES = ['2 coches o más', 'moto']
+    # Columns whose repeated combination marks a duplicated listing.
+    DUPLICATE_KEYS = ['precio', 'latitud', 'longitud']
+
+    def __init__(self):
+        # Read the clock once so both bounds derive from the same instant.
+        today = datetime.today()
+        self.start_date = today - timedelta(days=90)
+        self.end_date = today
+        self.market = None
+
+    def load_market(self, market_query_results):
+        """Build the snapshot DataFrame from ``market_query_results``."""
+        self.market = pd.DataFrame(market_query_results)
+
+    def clean_market(self, method):
+        """Filter the loaded snapshot for ``method``.
+
+        Supported methods are 'index' and 'valoracion'; any other value
+        leaves the snapshot untouched. Both drop rows without a size
+        category, the excluded categories, duplicated listings, rows whose
+        'tipo_anuncio' is not 1 and price outliers. 'valoracion' also keeps
+        only rows whose 'precision' is 'ROOFTOP'.
+        """
+        if method not in ('index', 'valoracion'):
+            return
+
+        # dropna() and drop_duplicates() return new frames, so their results
+        # have to be kept; calling them as bare statements is a no-op.
+        market = self.market.dropna(subset=['tamano_categorico'])
+        market = market[~market['tamano_categorico'].isin(self.EXCLUDED_CATEGORIES)]
+        if method == 'valoracion':
+            market = market[market['precision'].isin(['ROOFTOP'])]
+        market = market.drop_duplicates(subset=self.DUPLICATE_KEYS, keep='last')
+        self.market = market[market['tipo_anuncio'] == 1]
+        self.delete_outliers()
+
+    def delete_outliers(self):
+        """Drop rows whose 'precio' is outside the allowed range.
+
+        Ranges are defined per 'tipo_anuncio' / 'tamano_categorico' pair; rows
+        that match no pair are kept.
+        """
+        outlier_combinations = [{'tipo_anuncio': 1, 'tamano_categorico': 'coche grande',
+                                 'min_precio': 1000, 'max_precio': 150000},
+                                {'tipo_anuncio': 1, 'tamano_categorico': 'coche pequeño',
+                                 'min_precio': 1000, 'max_precio': 150000},
+                                {'tipo_anuncio': 1, 'tamano_categorico': 'coche y moto',
+                                 'min_precio': 1000, 'max_precio': 200000},
+                                {'tipo_anuncio': 1, 'tamano_categorico': 'moto',
+                                 'min_precio': 1000, 'max_precio': 40000},
+                                {'tipo_anuncio': 2, 'tamano_categorico': 'coche grande',
+                                 'min_precio': 10, 'max_precio': 300},
+                                {'tipo_anuncio': 2, 'tamano_categorico': 'coche pequeño',
+                                 'min_precio': 10, 'max_precio': 300},
+                                {'tipo_anuncio': 2, 'tamano_categorico': 'coche y moto',
+                                 'min_precio': 10, 'max_precio': 3000},
+                                {'tipo_anuncio': 2, 'tamano_categorico': 'moto',
+                                 'min_precio': 10, 'max_precio': 150}]
+
+        for combination in outlier_combinations:
+            same_group = (
+                (self.market['tipo_anuncio'] == combination['tipo_anuncio']) &
+                (self.market['tamano_categorico'] == combination['tamano_categorico'])
+            )
+            out_of_range = (
+                (self.market['precio'] < combination['min_precio']) |
+                (self.market['precio'] > combination['max_precio'])
+            )
+            self.market = self.market.loc[~(same_group & out_of_range)]
+
+    def get_market_data(self):
+        """Return the snapshot DataFrame (None until load_market() is called)."""
+        return self.market
diff --git a/db_layer/capturas_interface.py b/db_layer/capturas_interface.py
index 8892c45..46a810a 100644
--- a/db_layer/capturas_interface.py
+++ b/db_layer/capturas_interface.py
@@ -96,6 +96,40 @@ class CapturasInterface:
 
         self.anunciosdb.query(query_statement, query_parameters)
 
+    def get_market_snapshot(self, start_date, end_date):
+        """Return the latest capture of every listing as a list of rows.
+
+        Only rows whose 'fecha_captura' is the most recent one for their
+        'referencia' and lies between ``start_date`` and ``end_date`` (both
+        taken at 00:00:00) are returned.
+
+        NOTE(review): a listing captured again after ``end_date`` is left out
+        entirely, because the subquery picks its overall latest capture --
+        confirm that this is intended for historical ranges.
+        """
+        query_statement = """
+        SELECT *
+        FROM `anuncios`.`capturas` `t1`
+        WHERE
+        (
+            (
+                `t1`.`fecha_captura` =
+                (
+                    SELECT
+                        max(`t2`.`fecha_captura`)
+                    FROM `anuncios`.`capturas` `t2`
+                    WHERE (`t1`.`referencia` = `t2`.`referencia`)
+                )
+            )
+            AND (`t1`.`fecha_captura` BETWEEN %(start_date)s AND %(end_date)s)
+        )
+        """
+        query_parameters = {'start_date': start_date.strftime('%Y-%m-%d 00:00:00'),
+                            'end_date': end_date.strftime('%Y-%m-%d 00:00:00')}
+
+        cursor_result = self.anunciosdb.query(query_statement, query_parameters, dictionary=True)
+        return cursor_result.fetchall()
+
 
 capturas_interface = CapturasInterface()
 
diff --git a/requirements.txt b/requirements.txt
index 42a487e..7de728a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 bs4
 mysql-connector-python
 requests==2.18.4
-html5lib
\ No newline at end of file
+html5lib
+pandas==0.23.4
\ No newline at end of file