Spaces:

oort
/

cc_clusters

Runtime error

App Files Files Community

cc_clusters / app.py

oort

Update app.py

06fc9c7 about 2 years ago

raw

history blame contribute delete

7.31 kB

	# -*- coding: utf-8 -*-
	# File: app.py
	# Project: 'Homework #3 OTUS.ML.Advanced'
	# Created by Gennady Matveev (gm@og.ly) on 02-01-2022.
	# %%
	# Import libraries
	import re
	import pandas as pd
	import numpy as np
	import streamlit as st
	import requests
	import pickle
	from sklearn.preprocessing import StandardScaler
	from sklearn.cluster import KMeans
	import tsfel
	from kneed import KneeLocator
	import cryptocompare as cc
	import matplotlib.pyplot as plt
	import plotly.express as px
	from umap import UMAP
	import warnings

	# Matplotlib defaults: ggplot style and a default figure size
	# (individual plots may pass their own figsize).
	plt.style.use("ggplot")
	plt.rcParams["figure.figsize"] = (10, 5)
	# Silence every Python warning for the whole process.
	warnings.filterwarnings("ignore")
	# pd.options.display.precision = 4

	# Seed handed to the KMeans / UMAP estimators created further down.
	random_state = 17
	# Single module-level scaler; each fit_transform call below refits it.
	scaler = StandardScaler()
	# NOTE(review): n_jobs is not referenced anywhere in the visible code.
	n_jobs = -1


	# %%
	# Page-level Streamlit configuration; must run before other st.* calls.
	st.set_page_config(page_title="Cryptocurrencies clustering",
	                   page_icon='./head.ico', layout='centered',
	                   initial_sidebar_state='expanded')  # wide

	# Inject CSS that sets the padding around the main block container.
	padding = 0
	st.markdown(f""" <style>
	.reportview-container .main .block-container{{
	padding-top: {padding}rem;
	padding-right: {padding}rem;
	padding-left: {padding}rem;
	padding-bottom: {padding}rem;
	}} </style> """, unsafe_allow_html=True)

	# Static page header and introduction text.
	st.image('./mundus.png')
	st.subheader('Clustering analysis of cryptocurrencies')
	st.markdown(
	    'Explore similarities in statistical, temporal and spectral domains')
	st.markdown('''Top 100 cryptocurrencies' daily closing prices are downloaded.
	Their dynamics can be analyzed in search of similarities between coins.
	Up to 8 currencies from each cluster are shown below.''')
	st.markdown("""---""")

	# %%
	# Set cryptocompare API key:
	# Read from Streamlit secrets; this lookup happens on every run.
	api_key = st.secrets["api_key"]
	# %%
	# Browser-like User-Agent sent with the top-100 request.
	headers = {
	    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36"
	}
	# Endpoint listing the top 100 coins by market cap; the API key is embedded
	# in the query string, so avoid logging or displaying this URL.
	req = f"https://min-api.cryptocompare.com/data/top/mktcapfull?limit=100&tsym=USD&api_key={api_key}"

	# Utility functions for data download


	@st.cache(ttl=600)
	def get_price(ticker: str, limit: int):
	    """Fetch the daily price history of one coin, quoted in USD.

	    Thin wrapper around ``cc.get_historical_price_day``; the result is
	    memoised by ``st.cache`` with ``ttl=600``.

	    Args:
	        ticker: Coin symbol forwarded to cryptocompare.
	        limit: Forwarded unchanged as the ``limit`` keyword.

	    Returns:
	        Whatever ``cc.get_historical_price_day`` returns, unmodified.
	    """
	    daily_history = cc.get_historical_price_day(
	        ticker, currency="USD", limit=limit)
	    return daily_history


	@st.cache(ttl=600)
	def get_all_cc(limit: int):
	    """Collect the "close" price series of every symbol in ``tickers``.

	    Iterates over the module-level ``tickers`` (assigned by the analysis
	    script before this function is called), downloads each coin through
	    ``get_price`` and places its "close" values in a column named after
	    the ticker.

	    Args:
	        limit: Forwarded to ``get_price``; also the length of the initial
	            integer index of the returned frame.

	    Returns:
	        A DataFrame with one column per ticker that could be downloaded.
	        Tickers whose download or parsing failed are left out, so the
	        frame may have fewer columns than ``tickers`` (possibly none).
	    """
	    # Start from an empty frame so the result keeps the range(limit) index
	    # even when no ticker could be fetched.
	    pieces = [pd.DataFrame(index=range(limit))]
	    for tick in tickers:
	        try:
	            history = get_price(tick, limit)
	            closes = pd.DataFrame.from_dict(history)["close"]
	        except Exception:
	            # Deliberate best effort: a coin without usable data is skipped.
	            # ``Exception`` (rather than a bare except) lets
	            # KeyboardInterrupt / SystemExit propagate.
	            continue
	        # Passing a scalar to Series.rename sets the series name, which
	        # becomes the column label after concatenation.
	        pieces.append(closes.rename(tick))

	    # Concatenate once instead of rebuilding the frame for every ticker.
	    return pd.concat(pieces, axis=1)

	# Utility functions for clustering analysis


	def elbow_study(data, k_max: int = 10, model=KMeans):
	    """Pick a cluster count from the knee of the inertia curve.

	    ``data`` is standardised with the module-level ``scaler``; ``model`` is
	    then fitted for every k in ``range(2, k_max)`` and the square root of
	    each fit's ``inertia_`` is recorded. ``KneeLocator`` searches that
	    convex, decreasing curve for a knee.

	    Args:
	        data: Sample matrix accepted by ``scaler.fit_transform``.
	        k_max: Exclusive upper bound of the cluster counts tried.
	        model: Estimator class constructed with ``n_clusters`` and
	            ``random_state`` and exposing ``inertia_`` after ``fit``.

	    Returns:
	        The knee reported by ``KneeLocator``, or 3 when it reports none.
	    """
	    scaled = scaler.fit_transform(data)
	    candidate_ks = range(2, k_max)

	    root_inertias = [
	        np.sqrt(
	            model(n_clusters=k, random_state=random_state)
	            .fit(scaled)
	            .inertia_
	        )
	        for k in candidate_ks
	    ]

	    # Locate the knee of the curve.
	    knee = KneeLocator(candidate_ks, root_inertias, S=2,
	                       curve="convex", direction="decreasing").knee

	    # Fall back to 3 clusters when kneed does not find a knee.
	    return knee if knee else 3


	def plot_clusters_2(data, Xt, n_clusters, random_state):
	    """Cluster the columns of ``Xt`` and plot up to 8 price series per cluster.

	    ``Xt`` is standardised with the module-level ``scaler`` and KMeans is
	    fitted on the transpose of the result, so every column of ``Xt``
	    receives one label. The labels are stored as an extra row named
	    "cluster" in a copy of ``data``. For each cluster a 2x4 grid of line
	    plots is rendered with ``st.pyplot``.

	    Args:
	        data: Price DataFrame whose columns are in the same order as the
	            columns of ``Xt``.
	        Xt: Feature matrix with one column per series to be clustered.
	        n_clusters: Number of KMeans clusters.
	        random_state: Seed passed to KMeans.

	    Returns:
	        The copy of ``data`` with the additional "cluster" row.
	    """
	    clusterer = KMeans(n_clusters=n_clusters, max_iter=100,
	                       random_state=random_state)
	    # NOTE(review): scaling here is applied to Xt itself, whereas
	    # elbow_study is called with Xt.T by the script - confirm the two are
	    # meant to differ.
	    X = scaler.fit_transform(Xt)
	    dd = data.copy()
	    dd.loc["cluster"] = clusterer.fit_predict(X.T)

	    palette = ["red", "green", "blue", "purple",
	               "orange", "magenta", "goldenrod"]
	    # Hoisted out of the loop: label row and the price rows (everything
	    # except the trailing "cluster" row).
	    labels = dd.loc["cluster"]
	    prices = dd.iloc[:-1]

	    for c in range(n_clusters):
	        # Wrap around so more clusters than palette entries cannot raise
	        # IndexError.
	        line_color = palette[c % len(palette)]
	        fig, ax = plt.subplots(2, 4, sharex='col', figsize=(15, 5))
	        cluster_ticks = dd.columns[(labels == c).to_numpy()]
	        for i, tick in enumerate(cluster_ticks[:8]):
	            # Fill the grid column by column: (0,0), (1,0), (0,1), ...
	            panel = ax[i % 2, i // 2]
	            panel.plot(prices[tick], color=line_color)
	            panel.set_title(tick)
	        fig.suptitle(f"Cluster {c}, {len(cluster_ticks)} items\n", y=1.02)
	        st.pyplot(fig)
	        # Release the figure; pyplot otherwise keeps every figure alive
	        # across Streamlit reruns.
	        plt.close(fig)
	    return dd

	def visualize(Xt, n_clusters):
	    """Render a 3-D UMAP scatter of the clustered samples, coloured by cluster.

	    The transpose of ``Xt`` is standardised with the module-level
	    ``scaler`` and clustered with KMeans, giving one label per column of
	    ``Xt``. The same standardised matrix is projected to three dimensions
	    with UMAP, so the number of plotted points matches the number of
	    labels. (Projecting ``Xt.values`` directly would yield one point per
	    row of ``Xt`` and a length mismatch with the colour array.)

	    Args:
	        Xt: Feature DataFrame with one column per sample to be plotted.
	        n_clusters: Number of KMeans clusters.

	    Returns:
	        None. The Plotly figure is written to the page with ``st.write``.
	    """
	    clusterer = KMeans(n_clusters=n_clusters, max_iter=100,
	                       random_state=random_state)

	    X = scaler.fit_transform(Xt.T)
	    # String labels make Plotly treat the clusters as discrete categories.
	    X_color = clusterer.fit_predict(X).astype(str)

	    # UMAP
	    umap_3d = UMAP(n_components=3, init='random',
	                   random_state=random_state)

	    # Project the very matrix that was clustered: one row per label.
	    proj_3d = umap_3d.fit_transform(X)

	    fig_3d = px.scatter_3d(
	        proj_3d, x=0, y=1, z=2,
	        color=X_color, labels={'color': 'clusters'},
	        color_discrete_sequence=["red", "green", "blue",
	                                 "purple", "orange", "magenta", "goldenrod"],
	        title="UMAP projection from feature space",
	        width=800, height=600,
	    )
	    fig_3d.update_traces(marker_size=5)
	    st.write(fig_3d)
	# %%
	# START Sidebar ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


	# Sidebar widgets, rendered in the order they are created.
	st.sidebar.image('./blau.png')
	# Checked (default): use the bundled demo data; unchecked: download data.
	demo = st.sidebar.checkbox(label="Use demo data?", value=True, help="Use demo data or fetch actual?")
	# Length of the analysed history in days, limited to 30..100 (default 60).
	days=st.sidebar.number_input('Number of days for analysis',
	min_value=30, max_value=100, value=60)
	# Feature domain used for extraction; index=1 preselects 'temporal'.
	domain=st.sidebar.selectbox('Domain', ('statistical', 'temporal', 'spectral', 'all'),
	index=1, help='Domain to use feature extraction')
	st.sidebar.markdown("""---""")
	# True only on the rerun triggered by pressing the button.
	analyze=st.sidebar.button('Start analysis')

	# END Sidebar ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

	# Analysis
	# Runs only on the rerun triggered by the "Start analysis" button.
	if analyze:
	    with st.spinner('Downloading data...'):
	        if demo:
	            # NOTE: pickle.load can execute arbitrary code; acceptable only
	            # because demo_data.pkl is a file shipped with the app.
	            with open("./demo_data.pkl", "rb") as f:
	                demo_data = pickle.load(f)
	            dl = demo_data.shape[0]
	            # Keep the last `days` rows. Clamp at 0 so that asking for more
	            # days than are stored returns everything instead of a short,
	            # negatively indexed slice.
	            data_day = demo_data.iloc[max(dl - days, 0):]
	            tickers = demo_data.columns
	        else:
	            # The timeout keeps the app from hanging on a stalled request.
	            top100 = requests.get(req, headers=headers, timeout=30)
	            # Pull every "Name":"<SYMBOL>" occurrence out of the raw body.
	            rs = re.compile(r"\"Name\":\"(?P<ticker>[A-Z0-9]+)\"")
	            # get_all_cc reads this module-level name.
	            tickers = rs.findall(top100.text)
	            data_day = get_all_cc(limit=days).copy()

	    # Without any price column there is nothing to analyse.
	    if data_day.shape[1] == 0:
	        st.error('No price data available for analysis.')
	        st.stop()

	    with st.spinner(f'Extracting {domain} features...'):
	        # tsfel takes None to mean "features from every domain".
	        dom = domain if domain != 'all' else None
	        cfg_file = tsfel.get_features_by_domain(dom)
	        # tsfel analysis
	        # One extraction on a reference column yields the feature names.
	        # BTC is used when present; otherwise fall back to the first column
	        # rather than raising KeyError.
	        ref_col = "BTC" if "BTC" in data_day.columns else data_day.columns[0]
	        x_temp = tsfel.time_series_features_extractor(
	            cfg_file, data_day[ref_col], window_size=days)
	        tf_columns = x_temp.columns
	        xtf = pd.DataFrame(columns=data_day.columns, index=tf_columns)
	        # Fill df with features
	        for col in xtf.columns:
	            xtf[col] = tsfel.time_series_features_extractor(
	                cfg_file, data_day[col], window_size=days
	            ).T
	        # Drop features that are missing for at least one coin.
	        xtf.dropna(inplace=True)

	        # Features dataframe
	        Xt = pd.DataFrame(scaler.fit_transform(
	            xtf), columns=data_day.columns, index=xtf.index)

	    with st.spinner('Calculating optimal number of clusters...'):
	        # Get optimal no of clusters
	        n_clusters = elbow_study(Xt.T, model=KMeans)  # metric="euclidean",

	    # Plot clusters
	    plot_clusters_2(data_day, Xt=Xt, n_clusters=n_clusters,
	                    random_state=random_state
	                    )

	    # Plot umap
	    # visualize(Xt, n_clusters)