PortalLVAM / apps /Clustering.py
bullm's picture
ldsakjl
025632f
from io import BytesIO
import streamlit as st
import pandas as pd
from scipy.cluster.hierarchy import linkage
from datetime import date
from dateutil.relativedelta import relativedelta
import plotly.figure_factory as ff
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean
import numpy as np
from modules import tables
@st.cache(suppress_st_warning=True)
def data_request(country_to_request, start, currency='USD', end=date.today()):
data = tables.EquityMaster(country=country_to_request, field='IQ_CLOSEPRICE_ADJ', currency=currency).query(rename=['asset'],
start=start, end=str(end))
adtv = tables.EquityMaster(country=country_to_request, field='IQ_VALUE_TRADED', currency=currency).query(rename=['asset'],
start=start, end=str(end)).median()
marketcap = tables.EquityMaster(country=country_to_request, field='IQ_MARKETCAP', currency=currency).query(rename=['asset'],
start=start, end=str(end)).median()
return data, adtv, marketcap
@st.cache(suppress_st_warning=True)
def data_filter(data, adtv, marketcap, adtv_threshold, mktcap_threshold, p):
adtv_filter = (adtv >= adtv_threshold)
adtv = adtv.loc[adtv_filter]
marketcap_filter = (marketcap >= mktcap_threshold)
marketcap = marketcap.loc[marketcap_filter]
data = data.loc[:, data.columns.isin(adtv.index)]
data = data.loc[:, data.columns.isin(marketcap.index)]
file_to_read = 'Data/Company_Base_Definitivo.xlsx'
company_base = pd.read_excel(file_to_read, sheet_name='Compilado')
id_to_ticker = {str(row['ID_Quant']): str(row['Ticker Bloomberg']).split()[0] for i, row in company_base.iterrows()}
data = data.loc[:, data.columns.isin(id_to_ticker.keys())]
data.columns = [id_to_ticker[col] for col in data.columns]
if isinstance(p, str):
returns_final = data.resample(p).last().pct_change().fillna(0)
else:
returns_final = data.iloc[::p].pct_change().fillna(0)
return returns_final
def dist(correlation):
return ((1-correlation)/2.)**.5
def to_excel(df_to_write):
output = BytesIO()
writer = pd.ExcelWriter(output, engine='xlsxwriter')
df_to_write.to_excel(writer, index=False, sheet_name='Sheet1')
workbook = writer.book
worksheet = writer.sheets['Sheet1']
format1 = workbook.add_format({'num_format': '0.00'})
worksheet.set_column('A:A', None, format1)
writer.save()
processed_data = output.getvalue()
return processed_data
@st.cache(suppress_st_warning=True)
def get_dtw_distance(x, y):
distance_dtw = fastdtw(x, y, dist=euclidean)[0]
return distance_dtw
def clustering_basado_en_correlacion():
form = st.form("Correlation Clustering")
posible_countries = ('Todos', 'Argentina', 'Brazil', 'Chile', 'Colombia', 'Mexico', 'Peru')
countries = form.multiselect('¿Qué países desea visualizar?', posible_countries)
if 'Todos' in countries:
countries = ('Argentina', 'Brazil', 'Chile', 'Colombia', 'Mexico', 'Peru')
adtv_p = form.number_input('Ingrese el mínimo Average Daily Traded Value que desea considerar', value=1., format="%.2f")
mktcap_thresh = form.number_input('Ingrese el mínimo Market Cap que desea considerar', value=200., format="%.2f")
start_date = form.selectbox('Ingrese la fecha de inicio que desea considerar', ('3 Meses', '6 Meses', '1 Año'))
period = form.number_input('Defina la frecuencia en la que desea las observaciones (en días)', value=1)
accept = form.form_submit_button('Aceptar')
if accept:
start_date = str(date.today() - relativedelta(months=int(start_date[0])))
for country in countries:
data_, adtv_, marketcap_ = data_request(country, start_date)
# Filtramos para que se cumplan los filtros del usuario en los datos
returns = data_filter(data_, adtv_, marketcap_, adtv_p, mktcap_thresh, period)
# Normalizamos
base = (returns.subtract(returns.mean(0), axis=1)).div(returns.std(axis=0), axis=1)
base = base.sort_index(axis=1)
# Procedemos a calcular correlación y covarianza
corr, covs = base.corr(), base.cov()
file = to_excel(corr)
# Definimos la matriz de distancia
dist_matrix = dist(corr)
hierarchy = linkage(dist_matrix)
ct = 0.54 * max(hierarchy[:, 2])
fig = ff.create_dendrogram(dist_matrix, orientation='left', labels=list(base.columns),
color_threshold=ct, linkagefun=linkage)
fig.update_layout(title='{} desde {} hasta {}'.format(country,
returns.index[0].date(),
returns.index[-1].date()))
if country == 'Brazil':
fig.update_layout(height=2000)
else:
fig.update_layout(height=900)
st.plotly_chart(fig, use_container_width=True)
st.download_button(label='Descargar Matriz de Correlación para {}'.format(country), data=file,
file_name='{}_correlacion.xlsx'.format(country))
def clustering_con_dtw():
form = st.form("Dynamic Time Warping Clustering")
posible_countries = ('Todos', 'Brazil', 'Argentina', 'Chile', 'Colombia', 'Mexico', 'Peru')
countries = form.multiselect('¿Qué países desea visualizar?', posible_countries)
if 'Todos' in countries:
countries = ('Brazil', 'Argentina', 'Chile', 'Colombia', 'Mexico', 'Peru')
adtv_p = form.number_input('Ingrese el mínimo Average Daily Traded Value que desea considerar', value=1.,
format="%.2f")
mktcap_thresh = form.number_input('Ingrese el Mínimo Market Cap que desea considerar', value=200., format="%.2f")
start_date = form.selectbox('Ingrese la fecha de inicio que desea considerar', ('3 Meses', '6 Meses', '1 Año'))
period = form.number_input('Defina la frecuencia en la que desea las observaciones (en días)', value=1)
accept = form.form_submit_button('Aceptar')
if accept:
start_date = str(date.today() - relativedelta(months=int(start_date[0])))
for country in countries:
data_, adtv_, marketcap_ = data_request(country, start_date)
# Filtramos para que se cumplan los filtros del usuario en los datos
returns = data_filter(data_, adtv_, marketcap_, adtv_p, mktcap_thresh, period)
# Normalizamos returns
base = (returns.subtract(returns.mean(0), axis=1)).div(returns.std(axis=0), axis=1)
base = base.sort_index(axis=1)
# Procedemos a calcular correlación y covarianza
N = len(base[:base.index[0]].T)
# Creamos la Matriz de Distancias para DTW
Dist = np.zeros((N, N))
place = st.empty()
for i in range(N):
place.write("Cargando: " + str(round(i*100/N)) + " %")
for j in range(i - 1, N):
company_1 = base.columns[i]
company_2 = base.columns[j]
Dist[i, j] = get_dtw_distance(base[company_1], base[company_2])
# La matriz es simétrica
Dist[j, i] = Dist[i, j]
# Creamos un DataFrame con la matriz de distancias
df = pd.DataFrame(Dist)
df.index = base.columns
df.columns = base.columns
# Pasamos el df a excel para descarga del usuario
file = to_excel(df)
hierarchy = linkage(Dist)
ct = 0.54 * max(hierarchy[:, 2])
fig = ff.create_dendrogram(Dist, orientation='left', labels=list(base.columns),
color_threshold=ct, linkagefun=linkage)
fig.update_layout(
title='{} desde {} hasta {}'.format(country, returns.index[0].date(), returns.index[-1].date()))
if country == 'Brazil':
fig.update_layout(height=2000)
else:
fig.update_layout(height=900)
st.plotly_chart(fig, use_container_width=True)
st.download_button(label='Descargar Matriz de distancias con DTW para {}'.format(country), data=file,
file_name='{}_correlacion.xlsx'.format(country))