import io
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import streamlit as st
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)


@st.cache_resource(show_spinner=False)
def get_llm():
    """Cached LLM initialization to prevent reloading on every rerun."""
    import os

    from langchain_google_genai import ChatGoogleGenerativeAI
    from langchain_groq import ChatGroq

    # Prefer Groq; fall back to Gemini, and finally to None so callers can
    # degrade to static metric explanations.
    try:
        return ChatGroq(
            model="gemma2-9b-it",
            groq_api_key=os.getenv("GROQ_API_KEY"),
        )
    except Exception:
        try:
            return ChatGoogleGenerativeAI(
                model="gemini-2.0-flash-lite-preview-02-05",
                google_api_key=os.getenv("GEMINI_API_KEY"),
            )
        except Exception:
            return None


llm_insights = get_llm()
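
# The fallbacks in get_llm() read API keys from the environment. A minimal
# setup sketch (assuming a POSIX shell; the key values are placeholders):
#
#   export GROQ_API_KEY="..."      # used by ChatGroq
#   export GEMINI_API_KEY="..."    # used by ChatGoogleGenerativeAI
#
# If neither key is set (or both clients fail), llm_insights is None and the
# helpers below fall back to static metric explanations.

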
@st.cache_data(show_spinner=False)
def _compute_classification_metrics(y_test, y_pred):
    """Cached metric computation for classification."""
    return {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, average="weighted", zero_division=0),
        'recall': recall_score(y_test, y_pred, average="weighted", zero_division=0),
        'f1': f1_score(y_test, y_pred, average="weighted", zero_division=0),
        'cm': confusion_matrix(y_test, y_pred),
    }


@st.cache_data(show_spinner=False)
def _compute_regression_metrics(y_test, y_pred):
    """Cached metric computation for regression."""
    mse = mean_squared_error(y_test, y_pred)
    return {
        'mae': mean_absolute_error(y_test, y_pred),
        'mse': mse,
        'rmse': np.sqrt(mse),
        'r2': r2_score(y_test, y_pred),
    }


@st.cache_data(show_spinner=False)
def _plot_confusion_matrix(cm, classes):
    """Cached confusion matrix plot; returns PNG bytes suitable for st.image."""
    fig, ax = plt.subplots(figsize=(2, 2), dpi=200)
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=classes,
        yticklabels=classes,
        annot_kws={"size": 8},
        ax=ax,
    )
    ax.tick_params(axis="both", labelsize=5)

    buf = io.BytesIO()
    fig.savefig(buf, format="png", bbox_inches="tight", dpi=200)
    plt.close(fig)  # release the figure so repeated reruns don't leak memory
    # Return raw bytes rather than the BytesIO handle: bytes are picklable,
    # which st.cache_data requires for cached return values.
    return buf.getvalue()


@st.cache_data(show_spinner=False)
def _get_insights_classification(accuracy, precision, recall, f1, cm_shape):
    """Cached insights generation based on the computed metrics."""
    if llm_insights is None:
        return (
            f"### Classification Metrics Explained\n\n"
            f"**Accuracy** ({accuracy:.3f}): Correct predictions ratio\n"
            f"**Precision** ({precision:.3f}): Positive prediction accuracy\n"
            f"**Recall** ({recall:.3f}): Actual positives found\n"
            f"**F1 Score** ({f1:.3f}): Precision-Recall balance\n"
            f"Confusion Matrix ({cm_shape[0]}x{cm_shape[1]}): Prediction vs Actual distribution"
        )

    try:
        response = llm_insights.invoke(f"""
            Briefly explain these classification metrics (accuracy={accuracy:.3f},
            precision={precision:.3f}, recall={recall:.3f}, f1={f1:.3f})
            and the {cm_shape[0]}x{cm_shape[1]} confusion matrix.
            Use markdown bullet points.
        """)
        return response.content.strip()
    except Exception:
        return "Could not generate AI insights - showing basic metrics explanation."


def display_test_results(trained_model, X_test, y_test, task_type, label_encoder=None):
    """
    Displays test results, including metrics, confusion matrix (if classification),
    and LLM-based or fallback insights about the metrics.
    """
    st.markdown("## Test Results")
    loading_placeholder = st.empty()

    with loading_placeholder.container():
        st.info("Evaluating model performance on test data. This may take a moment for large datasets.")
        progress_bar = st.progress(0)

    if "test_results_calculated" not in st.session_state:
        st.session_state.test_results_calculated = False

    if not st.session_state.test_results_calculated:
        sampling_message = None
        # Cap evaluation at MAX_SAMPLES rows so large test sets stay responsive.
        MAX_SAMPLES = 5000

        with loading_placeholder.container():
            progress_bar.progress(10)

        if len(X_test) <= MAX_SAMPLES:
            X_test_sample = X_test
            y_test_sample = y_test
            st.info("Using all test data for evaluation...")
        else:
            sampling_message = (
                f"Using {MAX_SAMPLES} samples from the test set for visualization "
                f"(out of {len(X_test)} total)"
            )
            st.info("Sampling test data for evaluation...")

            # Works for both pandas objects and plain NumPy arrays.
            idx = np.random.choice(len(X_test), size=MAX_SAMPLES, replace=False)
            X_test_sample = X_test.iloc[idx] if hasattr(X_test, 'iloc') else X_test[idx]
            y_test_sample = y_test.iloc[idx] if hasattr(y_test, 'iloc') else y_test[idx]

        with loading_placeholder.container():
            progress_bar.progress(30)
            st.info("Generating predictions... Please wait")

        with st.spinner("Model working..."):
            if task_type == "regression":
                y_pred = trained_model.predict(X_test_sample)
            elif task_type == "classification":
                # trained_model may already be a (pipeline, label encoder) tuple;
                # otherwise the encoder is supplied separately via label_encoder.
                pipeline, enc = trained_model if label_encoder is None else (trained_model, label_encoder)
                y_pred = pipeline.predict(X_test_sample)

                if enc:
                    y_pred = enc.inverse_transform(y_pred)
                    y_test_decoded = enc.inverse_transform(y_test_sample)
                else:
                    y_test_decoded = y_test_sample

        with loading_placeholder.container():
            progress_bar.progress(60)
            st.info("Computing metrics...")

        if task_type == "regression":
            metrics = _compute_regression_metrics(y_test_sample, y_pred)
        else:
            metrics = _compute_classification_metrics(y_test_decoded, y_pred)

        with loading_placeholder.container():
            progress_bar.progress(90)
            st.info("Preparing visualizations...")

        if task_type == "classification":
            # Warm the cached plot and insights so the display section below
            # renders without recomputing them.
            _ = _plot_confusion_matrix(metrics['cm'], np.unique(y_test_decoded))
            _ = _get_insights_classification(
                metrics['accuracy'],
                metrics['precision'],
                metrics['recall'],
                metrics['f1'],
                metrics['cm'].shape,
            )

        with loading_placeholder.container():
            progress_bar.progress(100)
            st.success("Test results ready!")

        st.session_state.test_results_calculated = True
        st.session_state.test_metrics = metrics
        st.session_state.test_y_pred = y_pred
        if task_type == "classification":
            st.session_state.test_y_test = y_test_decoded
        else:
            st.session_state.test_y_test = y_test_sample

        st.session_state.sampling_message = sampling_message

        time.sleep(0.5)

    if "sampling_message" in st.session_state and st.session_state.sampling_message:
        st.info(st.session_state.sampling_message)

    if task_type == "regression":
        st.subheader("Regression Metrics")

        if "test_metrics" in st.session_state and st.session_state.test_results_calculated:
            metrics = st.session_state.test_metrics
            y_pred = st.session_state.test_y_pred
            y_test = st.session_state.test_y_test

            mae, mse, rmse, r2 = metrics['mae'], metrics['mse'], metrics['rmse'], metrics['r2']

            col1, col2, col3, col4 = st.columns(4)
            col1.metric("MAE", f"{mae:.4f}")
            col2.metric("MSE", f"{mse:.4f}")
            col3.metric("RMSE", f"{rmse:.4f}")
            col4.metric("R² Score", f"{r2:.4f}")

            st.subheader("Prediction vs Actual")
            df_results = pd.DataFrame({
                'Actual': y_test,
                'Predicted': y_pred,
            })
            fig = px.scatter(
                df_results,
                x='Actual',
                y='Predicted',
                title='Predicted vs Actual Values',
                labels={'Actual': 'Actual Values', 'Predicted': 'Predicted Values'},
            )
            # Reference line y = x marks a perfect prediction.
            fig.add_shape(
                type='line',
                x0=min(y_test), y0=min(y_test),
                x1=max(y_test), y1=max(y_test),
                line=dict(color='red', dash='dash'),
            )
            st.plotly_chart(fig, use_container_width=True)

    elif task_type == "classification":
        st.subheader("Classification Metrics")

        if "test_metrics" in st.session_state and st.session_state.test_results_calculated:
            metrics = st.session_state.test_metrics
            y_pred = st.session_state.test_y_pred
            y_test_decoded = st.session_state.test_y_test

            accuracy, precision, recall, f1 = metrics['accuracy'], metrics['precision'], metrics['recall'], metrics['f1']

            col1, col2, col3, col4 = st.columns(4)
            col1.metric("Accuracy", f"{accuracy:.4f}")
            col2.metric("Precision", f"{precision:.4f}")
            col3.metric("Recall", f"{recall:.4f}")
            col4.metric("F1 Score", f"{f1:.4f}")

            st.subheader("Confusion Matrix")
            cm_png = _plot_confusion_matrix(metrics['cm'], np.unique(y_test_decoded))
            st.image(cm_png, width=450)

            st.markdown("---")
            st.markdown("#### Test Insights")
            classification_insights = _get_insights_classification(
                accuracy, precision, recall, f1, metrics['cm'].shape
            )
            st.markdown(classification_insights)
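

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): one way this module might be wired into a
# Streamlit page. The session-state keys and the surrounding flow below are
# hypothetical assumptions, not part of this module.
#
#   if st.session_state.get("model_trained"):
#       display_test_results(
#           trained_model=st.session_state["trained_model"],      # pipeline or (pipeline, encoder)
#           X_test=st.session_state["X_test"],
#           y_test=st.session_state["y_test"],
#           task_type=st.session_state["task_type"],              # "classification" or "regression"
#           label_encoder=st.session_state.get("label_encoder"),  # only needed for classification
#       )
# ---------------------------------------------------------------------------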