# app.py — "Update app.py" (author: shreyk31, commit d344c88, verified)
import streamlit as st
import pandas as pd
import numpy as np
from PIL import Image
import seaborn as sns
import codecs
import streamlit.components.v1 as components
import dagshub
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from shapash.explainer.smart_explainer import SmartExplainer
from prophet import Prophet
import mlflow
import mlflow.sklearn
from mlflow import log_metric, log_param, log_artifact
from sklearn.model_selection import train_test_split, GridSearchCV
import joblib
from datetime import datetime
from prophet.plot import plot_components_plotly
# Stylesheet injected into the page to give the app its U.S.-themed colors.
# NOTE(review): the `.css-*` selectors look like Streamlit-generated class
# names — confirm they still match the deployed Streamlit version.
_US_THEME_CSS = """
<style>
.css-18e3th9 {
background-color: #f0f8ff; /* Light white/blue background */
}
.css-1d391kg {
background-color: #f0f8ff; /* For widgets */
}
.stButton button {
background-color: #ff4b4b; /* Red buttons */
color: white;
}
.css-1offfwp h1 {
color: #1e90ff; /* Blue titles */
}
.stMarkdown p, .stMarkdown h2, .stMarkdown h3 {
color: #2e8b57; /* U.S.-themed green text for subheaders */
}
</style>
"""

# unsafe_allow_html is required so the <style> tag is emitted as raw HTML.
st.markdown(_US_THEME_CSS, unsafe_allow_html=True)
# Header shown on every page: the app title followed by the Fed image.
st.title("U.S. Fed's Next Move Prediction")
image_path = Image.open("fed.png")
st.image(image_path, width=400)

# Sidebar navigation. The chosen label is compared against these exact
# strings by the page branches further down the script.
_PAGE_NAMES = [
    'Business Case',
    'Data Exploration & Visualization',
    'Prediction & Feature Importance',
    'Hyperparameter Tuning',
    'Conclusions and Data Insights',
]
app_page = st.sidebar.selectbox("Select Page", _PAGE_NAMES)

# Raw dataset, reloaded on each script run; the pages below clean it in place.
df = pd.read_csv('econdataset.csv')
if app_page == 'Business Case':
    # Static copy for the page. Each entry pairs a Streamlit text renderer
    # with the string it displays; entries are emitted top to bottom.
    business_case_sections = (
        (st.title, "1. Business Case"),
        (st.subheader, "Objective:"),
        (st.write, "The purpose of this dashboard is to analyze the Federal Reserve’s interest rate decisions from 1955 to 2024 and explore potential relationships between various economic indicators and the Federal Funds Rate. The dashboard applies time series analysis to forecast future Federal Funds Rate changes and utilizes linear regression to identify and quantify relationships between key economic variables and monetary policy decisions. These tools provide a comprehensive view of historical trends and predictive insights, enabling users to better understand the dynamics of Federal Reserve interest rate adjustments."),
        (st.subheader, "Key Questions:"),
        (st.write, "1. How have key economic indicators (such as unemployment rate, inflation rate, CPI, and GDP) evolved over the past decades, and how do these trends correlate with the Federal Reserve’s interest rate decisions?"),
        (st.write, "2. What patterns emerge in the Federal Funds Rate over time, and how do other variables such as stock market indices (S&P 500, DOW Jones), bond yields, and GDP relate to these changes?"),
        (st.write, "3. Can we establish reliable relationships between economic indicators (e.g., unemployment rate, inflation, CPI, S&P 500, Bond Yield, Real GDP) to predict future Federal Reserve interest rate adjustments?"),
        (st.subheader, "Use of Analytical Models"),
        (st.write, "This dashboard showcases two key analytical approaches to provide insights:"),
        (st.subheader, "1.Time Series Analysis:"),
        (st.write, "Time series models are employed to forecast future Federal Funds Rate changes by analyzing historical trends in the data. These forecasts help identify patterns that may indicate upcoming rate hikes or cuts."),
        (st.subheader, "2.Linear Regression:"),
        (st.write, "Linear regression models are used to uncover relationships between variables such as unemployment rate, inflation, bond yields, and the Federal Funds Rate. This analysis highlights the economic factors most closely associated with interest rate changes, providing a framework for understanding how these decisions are influenced by broader economic conditions."),
        (st.subheader, "Relevance and Value"),
        (st.write, "By integrating predictive analytics, the dashboard provides valuable insights for businesses and decision-makers:"),
        (st.write, "For Businesses:"),
        (st.write, "Understanding and predicting interest rates is crucial for managing borrowing costs, planning capital investments, and making strategic financial decisions."),
        (st.write, "For Financial Institutions:"),
        (st.write, "Insights into how economic conditions drive rate changes can improve decision-making around interest-sensitive products, such as loans and mortgages."),
        (st.write, "For Investors:"),
        (st.write, "Predicting rate changes provides an advantage in portfolio management, particularly in optimizing the allocation of equities, bonds, and other interest rate-sensitive assets. Ultimately, understanding the dynamics of Federal Reserve decisions can inform strategies across multiple sectors."),
    )
    for render, text in business_case_sections:
        render(text)
if app_page == 'Data Exploration & Visualization':
    # Page 2: show the raw data, clean it in place, summarise it, optionally
    # embed a pre-generated HTML report, and chart two user-chosen columns.
    st.title("2. Data Exploration & Visualization")
    st.write("Sample dataset loaded:")
    st.dataframe(df.head(5))

    # Cleaning the data
    st.write("Cleaning the data:")
    # Convert numeric columns stored as objects by removing special characters
    # like "$" and commas. The pattern is a raw string because "\$" in a plain
    # literal is an invalid escape sequence.
    for column in ['S&P 500 Price', 'Nominal GDP Index (in billion USD)', 'Real GDP Index', 'DOW Jones Price']:
        df[column] = df[column].replace(r'[\$,]', '', regex=True).astype(float)

    # For every year up to 1991, copy January's GDP values onto the other
    # months of that year.
    # NOTE(review): presumably GDP was only recorded once per year in that
    # period — confirm against the dataset.
    is_january = df['Month'] == 'January'
    for year in df.loc[df['Year'] <= 1991, 'Year'].unique():
        in_year = df['Year'] == year
        for gdp_column in ['Nominal GDP Index (in billion USD)', 'Real GDP Index']:
            # .values[0] raises IndexError if the year has no January row.
            january_value = df.loc[in_year & is_january, gdp_column].values[0]
            df.loc[in_year & ~is_january, gdp_column] = january_value

    df.dropna(inplace=True)  # This drops a single row in 2024 on which we don't have data

    # Check the updated data types and missing values after conversion
    cleaning_summary = {
        "dtypes_after_conversion": df.dtypes,
        "missing_values_after_conversion": df.isnull().sum()
    }
    st.write(cleaning_summary)

    st.subheader("01 Description of the dataset")
    st.dataframe(df.describe())
    st.write("This dataset contains key statistics from various observations.")

    st.subheader("02 Missing values")
    # Percentage of missing values per column.
    dfnull = df.isnull().sum() / len(df) * 100
    st.write(dfnull)
    if dfnull.sum() == 0:
        st.success("No missing values found!")

    st.write("Generate an automated report:")
    if st.button("Generate Report"):
        st.balloons()

        def read_html_report(file_path):
            """Return the UTF-8 decoded contents of the HTML file at file_path."""
            with codecs.open(file_path, 'r', encoding="utf-8") as f:
                return f.read()

        # The report is read from a pre-existing file, not generated here.
        html_report = read_html_report("report.html")
        st.components.v1.html(html_report, height=1000, scrolling=True)

    # Placeholder visualization options
    st.write("Visualize key relationships and importance of variables.")
    list_columns = df.columns
    values = st.multiselect("Select two variables to compare:", list_columns, ["Fed Effective Funds Rate", "Bond Yield (US 10Y TN)"])
    # The charts need an x and a y column; indexing a shorter selection would
    # raise IndexError. Extra selections beyond the first two are ignored.
    if len(values) >= 2:
        st.line_chart(df, x=values[0], y=values[1])
        st.bar_chart(df, x=values[0], y=values[1])
    else:
        st.warning("Please select at least two variables to compare.")
if app_page == 'Prediction & Feature Importance':
    # Pages 3 and 4: fit Linear Regression and Random Forest on user-selected
    # features, forecast the Fed rate with Prophet, then explain the Random
    # Forest with Shapash.
    st.title("3. Prediction")

    # Strip "$" and "," from the price/GDP columns and cast to float. Raw
    # string: "\$" in a plain literal is an invalid escape sequence.
    for column in ['S&P 500 Price', 'Nominal GDP Index (in billion USD)', 'Real GDP Index', 'DOW Jones Price']:
        df[column] = df[column].replace(r'[\$,]', '', regex=True).astype(float)

    # Snapshot for the Prophet section, taken BEFORE the GDP back-fill and the
    # month encoding below, so it keeps month names and only complete rows.
    # .copy() makes it an independent frame so columns can be added safely.
    df2 = df.dropna().copy()

    # For every year up to 1991, copy January's GDP values onto the other
    # months of that year.
    is_january = df['Month'] == 'January'
    for year in df.loc[df['Year'] <= 1991, 'Year'].unique():
        in_year = df['Year'] == year
        for gdp_column in ['Nominal GDP Index (in billion USD)', 'Real GDP Index']:
            # .values[0] raises IndexError if the year has no January row.
            january_value = df.loc[in_year & is_january, gdp_column].values[0]
            df.loc[in_year & ~is_january, gdp_column] = january_value

    df.dropna(inplace=True)  # This drops a single row in 2024 on which we don't have data

    # Convert Month to a categorical column
    df['Month'] = pd.Categorical(df['Month'], categories=[
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December'
    ], ordered=True)
    # Get numerical codes for each month
    df['Month'] = df['Month'].cat.codes + 1  # Adding 1 to start from 1 instead of 0

    st.write("Now, we have all our numerical features ready to be used in our model")
    st.dataframe(df.head())

    list_columns = df.columns.drop("Fed Effective Funds Rate")
    input_lr = st.multiselect("Select variables:", list_columns, ["Bond Yield (US 10Y TN)", "Unemployment Rate"])
    # Fitting on zero features would raise, so stop this run with a hint.
    if not input_lr:
        st.warning("Please select at least one variable to train the models.")
        st.stop()

    # Step 1 splitting the dataset into X and y
    X = df[input_lr]
    # target variable
    y = df["Fed Effective Funds Rate"]

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Linear Regression model
    lr_model = LinearRegression()
    lr_model.fit(X_train, y_train)

    # Random Forest model
    rf_model = RandomForestRegressor(random_state=42)
    rf_model.fit(X_train, y_train)

    # Predictions, indexed like y_test so the three series align in one table.
    lr_pred = lr_model.predict(X_test)
    rf_pred = rf_model.predict(X_test)
    predictions_df_lr = pd.Series(lr_pred, index=y_test.index, name="LR Predictions")
    predictions_df_rf = pd.Series(rf_pred, index=y_test.index, name="RF Predictions")
    result = pd.concat([predictions_df_lr, y_test, predictions_df_rf], axis=1)
    st.dataframe(result)

    # Compare model performance
    st.subheader("Model Performance on Test Data")
    st.write(f"Linear Regression RMSE: **{metrics.root_mean_squared_error(y_test, lr_pred):.2f}**")
    st.write(f"Random Forest RMSE: **{metrics.root_mean_squared_error(y_test, rf_pred):.2f}**")
    st.bar_chart(result)

    # Time Series model
    st.subheader("Time Series Modeling using FB Prophet")
    st.write("Dropping all values before January 1992")
    # Build a date per row from Year, month name and a fixed day of 15.
    df2['Day'] = 15
    df2['ds'] = pd.to_datetime(df2[['Year', 'Month', 'Day']].astype(str).agg('-'.join, axis=1))
    df2 = df2[(df2['ds'] >= "1992-01-15 00:00:00")]
    # Prophet expects the columns to be named 'ds' and 'y'.
    df_fp = pd.concat([df2['ds'], df2['Fed Effective Funds Rate']], axis=1)
    df_fp.rename(columns={'ds': 'ds', 'Fed Effective Funds Rate': 'y'}, inplace=True)
    st.dataframe(df_fp)

    st.write("Future predictions after September 2024")
    # NOTE(review): weekly seasonality is enabled although each row is dated
    # the 15th of a month — confirm this is intended.
    fp_model = Prophet(weekly_seasonality=True)
    fp_model.fit(df_fp)
    future = fp_model.make_future_dataframe(periods=48, freq='M')
    fp_pred = fp_model.predict(future)
    # Only the trend component is shown, not the full forecast column.
    fp_pred = fp_pred[["ds", "trend"]]
    fp_pred = fp_pred[fp_pred['ds'] > "2024-09-15 00:00:00"]
    st.dataframe(fp_pred)
    st.line_chart(fp_pred, x='ds', y='trend')

    st.write("Time Series Predictions between 2020 and 2024")
    # Refit on data up to January 2020 only, then project forward.
    df2 = df2[(df2['ds'] >= "1992-01-15 00:00:00") & (df2['ds'] <= "2020-01-15 00:00:00")]
    df_fp = pd.concat([df2['ds'], df2['Fed Effective Funds Rate']], axis=1)
    df_fp.rename(columns={'ds': 'ds', 'Fed Effective Funds Rate': 'y'}, inplace=True)
    st.dataframe(df_fp)
    fp_model = Prophet()
    fp_model.fit(df_fp)
    future = fp_model.make_future_dataframe(periods=57, freq='M')
    fp_pred = fp_model.predict(future)
    # Keep the full prediction frame for the components plot below.
    chart_fp_pred = fp_pred
    fp_pred = fp_pred[["ds", "trend"]]
    fp_pred = fp_pred[fp_pred['ds'] > "2020-01-15 00:00:00"]
    st.dataframe(fp_pred)

    # Extract seasonality components
    fig = fp_model.plot_components(chart_fp_pred)
    st.pyplot(fig)

    st.title("4. Feature Importance")
    # Initialize SmartExplainer
    xpl = SmartExplainer(rf_model)
    y_pred = pd.Series(rf_pred)
    # Reset X_test index so it lines up with y_pred's default 0..n-1 index.
    X_test = X_test.reset_index(drop=True)
    # Compile the explainer with test data and predictions
    xpl.compile(x=X_test, y_pred=y_pred)

    # Display overall feature importance
    st.subheader("Overall Feature Importance")
    fig = xpl.plot.features_importance()
    st.plotly_chart(fig)

    # Select a subset of rows for detailed importance
    st.subheader("Feature Importance for a Subset")
    # Cap the sample at the test-set size; sampling more rows than exist
    # (without replacement) raises ValueError.
    subset = X_test.sample(n=min(50, len(X_test)), random_state=42).index
    fig_subset = xpl.plot.features_importance(selection=subset)
    st.plotly_chart(fig_subset)

    # Show contribution plot for a specific feature. Bond yield is preferred,
    # but it is only available if the user kept it selected; otherwise fall
    # back to the first selected feature.
    default_feature = 'Bond Yield (US 10Y TN)'
    contribution_feature = default_feature if default_feature in input_lr else input_lr[0]
    st.subheader(f"Contribution Plot: {contribution_feature}")
    fig_contribution = xpl.plot.contribution_plot(contribution_feature)
    st.plotly_chart(fig_contribution)
if app_page == 'Hyperparameter Tuning':
    # Page 5: grid-search a Random Forest on all features and log the best
    # parameters, the model and its test RMSE to MLflow (tracked via DagsHub).
    st.title("5. Hyperparameter Tuning")
    dagshub.init(repo_owner='shreykharbanda31', repo_name='fed-interest-rate-prediction', mlflow=True)

    def process_data(X, y, test_size=0.3, random_state=42):
        """Split X and y into train/test parts.

        Returns (X_train, X_test, y_train, y_test).
        """
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
        return X_train, X_test, y_train, y_test

    def train_and_optimize_model(X_train, y_train, model, param_grid, cv=5):
        """Fit a GridSearchCV over param_grid and return the fitted search.

        No scoring is passed, so GridSearchCV uses the estimator's own score.
        """
        grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv)
        grid_search.fit(X_train, y_train)
        return grid_search

    def evaluate_model(model, X_test, y_test):
        """Log the model's test RMSE to the active MLflow run and return it."""
        y_pred = model.predict(X_test)
        rmse = metrics.root_mean_squared_error(y_test, y_pred)
        mlflow.log_metric("RMSE", rmse)
        return rmse

    # Usage of the functions within an MLflow run
    with mlflow.start_run():
        # Strip "$" and "," from the price/GDP columns and cast to float. Raw
        # string: "\$" in a plain literal is an invalid escape sequence.
        for column in ['S&P 500 Price', 'Nominal GDP Index (in billion USD)', 'Real GDP Index', 'DOW Jones Price']:
            df[column] = df[column].replace(r'[\$,]', '', regex=True).astype(float)

        # For every year up to 1991, copy January's GDP values onto the other
        # months of that year.
        is_january = df['Month'] == 'January'
        for year in df.loc[df['Year'] <= 1991, 'Year'].unique():
            in_year = df['Year'] == year
            for gdp_column in ['Nominal GDP Index (in billion USD)', 'Real GDP Index']:
                # .values[0] raises IndexError if the year has no January row.
                january_value = df.loc[in_year & is_january, gdp_column].values[0]
                df.loc[in_year & ~is_january, gdp_column] = january_value

        df.dropna(inplace=True)  # This drops a single row in 2024 on which we don't have data

        # Convert Month to a categorical column
        df['Month'] = pd.Categorical(df['Month'], categories=[
            'January', 'February', 'March', 'April', 'May', 'June',
            'July', 'August', 'September', 'October', 'November', 'December'
        ], ordered=True)
        # Get numerical codes for each month
        df['Month'] = df['Month'].cat.codes + 1  # Adding 1 to start from 1 instead of 0

        # Every column except the target is used as a feature.
        list_columns = df.columns.drop("Fed Effective Funds Rate")
        X, y = df[list_columns], df["Fed Effective Funds Rate"]
        X_train, X_test, y_train, y_test = process_data(X, y)

        rfg_param_grid = {
            'n_estimators': [50, 100, 200],
            'max_depth': [10, 20, None],
            'min_samples_split': [2, 5, 10]
        }
        rf_model = RandomForestRegressor(random_state=42)
        rf_grid_search = train_and_optimize_model(X_train, y_train, rf_model, rfg_param_grid)
        best_rf = rf_grid_search.best_estimator_

        mlflow.log_params(rf_grid_search.best_params_)
        # The timestamp prefix gives each run a distinct model path.
        mlflow.sklearn.log_model(best_rf, datetime.now().strftime("%Y-%m-%d %H:%M:%S")+"best_rf")
        mlflow.sklearn.save_model(best_rf, datetime.now().strftime("%Y-%m-%d %H:%M:%S")+"best_rf_model")
        rmse = evaluate_model(best_rf, X_test, y_test)

    # Show the outcome on the page; previously it was only logged to MLflow.
    st.subheader("Best Random Forest Parameters")
    st.write(rf_grid_search.best_params_)
    st.write(f"Tuned Random Forest RMSE: **{rmse:.2f}**")
if app_page == 'Conclusions and Data Insights':
    # Static closing copy. Each entry pairs a Streamlit text renderer with
    # the string it displays; entries are emitted top to bottom.
    conclusion_sections = (
        (st.title, "6. Conclusions and Data Insights"),
        (st.subheader, "Linear Regression & Random Forest Regressor"),
        (st.write, "While these two go about creating the models in different ways their use cases are similarly two fold. The first use case for these models would be as a sort of extension of a correlation matrix. And the second use case would be as a hypothetical predictive model."),
        (st.write, "The models and the correlation matrix are similar in that they are both very effective at giving the correlation between variables, so you could decipher from either which economic factor creates the largest impact on the interest rate. The value in this instance specifically for the models is that they can give correlation for a set of variables, while a correlation matrix can only compare one value to another at a given time."),
        (st.write, "For the second use case this would come into play if you wanted to construct a hypothetical scenario, and see what the resulting change in the interest rate would be. For example if you wanted to predict the interest rate after the economic changes of covid had reverted back to the “base” form that we expect from the US economy, you could input those values into the model and spit out a prediction for what the interest rate would be."),
        (st.subheader, "Prophet Time Series Model"),
        (st.write, "The time series model is primarily useful compared to the other models for its ability to predict the future. But this comes at the cost of accuracy."),
        (st.write, "Based on the limited number of variables used it is particularly blind to socio-political factors that influence economic outcomes. For example the model’s prediction is entirely decimated covid, and resulting policy changes. The model is going to be behind humans in the ability to predict the depressionary effect that a pandemic like covid would have on an economy before government intervention, and because it cannot look at other economic factors like inflation rate is unable to see the precise point at which interest rate would shift."),
        (st.write, "A time series is meant to look at slightly longer term trends, because typically small changes aren’t too telling, but the interest rate is a number set intentionally by the Fed, meaning that small changes are quite telling."),
    )
    for render, text in conclusion_sections:
        render(text)