Spaces:

jannisborn
/

NumberTokenLoss

Running

App Files Files Community

jannisborn committed on May 29

Commit

0dc70d1

unverified ·

1 Parent(s): 9914a10

update

Browse files

Files changed (3) hide show

.gitignore +2 -0
src/scenarios.py +33 -20
src/streamlit_app.py +273 -214

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ *.DS_Store
2	+ *__pycache__

src/scenarios.py CHANGED Viewed

@@ -1,9 +1,12 @@
 import numpy as np
 #  (1) A one-hot moving from token 0 to token 10 (“Text”)
 dirac = [
     {
-        "name": f"Dirac: all mass on token {i}",
         "values": [1.0 if j == i else 0.0 for j in range(11)],
         "ground_truth": "4",
         "explanation": "A Dirac distribution: all probability on a single token.",
@@ -29,7 +32,7 @@ def make_gauss_values(center, n=11, sigma=1.5, peak_mass=0.6):
 gauss = [
     {
-        "name": f"Gaussian: center at token {c}",
         "values": make_gauss_values(c),
         "ground_truth": "4",
         "explanation": "Gaussian-style: 0.6 mass at the highlighted token, 0.4 spread smoothly to its neighbors.",
@@ -38,23 +41,33 @@ gauss = [
 ]
-# (3) Bimodal: two spikes of 0.5 mass each, symmetrically offset from the GT=4 ---
-def make_bimodal_values(offset, n=11, gt=4):
-    # clamp to [0,n-1]
-    left = max(0, gt - offset)
-    right = min(n - 1, gt + offset)
-    vals = [0.0] * n
-    vals[left] = 0.5
-    vals[right] = 0.5
-    return vals
-bimodal = [
-    {
-        "name": f"Bimodal: peaks at tokens {max(0, 4 - d)} & {min(10, 4 + d)}",
-        "values": make_bimodal_values(d),
-        "ground_truth": "4",
-        "explanation": "Two-point (bimodal) distribution: equal 0.5 mass on each peak, which move ±offset from the ground truth.",
-    }
-    for d in range(11)
-]

 import numpy as np
+options = [str(i) for i in range(10)] + ["Text"]
 #  (1) A one-hot moving from token 0 to token 10 (“Text”)
 dirac = [
     {
+        "name": f"Dirac: all mass on token {options[i]}",
         "values": [1.0 if j == i else 0.0 for j in range(11)],
         "ground_truth": "4",
         "explanation": "A Dirac distribution: all probability on a single token.",
 gauss = [
     {
+        "name": f"Gaussian: center at token {options[c]}",
         "values": make_gauss_values(c),
         "ground_truth": "4",
         "explanation": "Gaussian-style: 0.6 mass at the highlighted token, 0.4 spread smoothly to its neighbors.",
 ]
def make_bimodal_scenarios(gt_token: str, options: list[str]) -> list[dict]:
    """
    Build the bimodal demo scenarios around a ground-truth token.

    One scenario is produced per offset in ``range(len(options))``. For each
    offset the probability mass is split 50/50 between the tokens at
    ``gt_idx - offset`` and ``gt_idx + offset``; both indices wrap around the
    ends of ``options`` via Python's ``%`` operator. When the two indices
    coincide (offset 0, or offset ``n / 2`` for an even number of options)
    that single token receives the full mass of 1.0.

    Args:
        gt_token: Ground-truth token; must be an element of ``options``.
        options: Ordered token labels the probability vector ranges over.

    Returns:
        A list of dicts with the keys ``"name"`` (label of the form
        ``"(left, right)"``), ``"values"`` (probability vector of length
        ``len(options)``) and ``"explanation"``.

    Raises:
        ValueError: If ``gt_token`` is not contained in ``options``.
    """
    n = len(options)
    gt_idx = options.index(gt_token)
    scenarios = []
    for offset in range(n):
        left = (gt_idx - offset) % n
        right = (gt_idx + offset) % n
        vals = [0.0] * n
        if left == right:
            # Both peaks land on the same token: it carries all the mass, so
            # describing it as a 50/50 split would be wrong.
            vals[left] = 1.0
            explanation = "All mass on a single token (both peaks coincide)."
        else:
            vals[left] = 0.5
            vals[right] = 0.5
            explanation = "50/50 mass at these two tokens (wrapping)."
        scenarios.append(
            {
                "name": f"({options[left]}, {options[right]})",
                "values": vals,
                "explanation": explanation,
            }
        )
    return scenarios

src/streamlit_app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import time
 import altair as alt
@@ -7,25 +8,38 @@ import streamlit as st
 import streamlit_vertical_slider as svs
 import torch
-from scenarios import bimodal, dirac, gauss
 DEMO_INTERVAL = 1.5
-NTL_MSE_SCALING = 0.5
-MAX_LOSS_PLOT = 15
 LAST_STEP = -1
-# """TODO:
-# - Remove flickering of loss evolution scenario plot (lower ylim?)
-# - Move manual part down (predicted token probabilities)
-# - Allow to set GT token for each demo
-# - Add text token to loss evolution barplot
-# - pick good default (4?)
-# """
 # Define options globally as it's used in initialization and UI
 options = [str(i) for i in range(10)] + ["Text"]
 # --- Session State Initialization ---
 # Ensure all session state variables are initialized before first use, especially by widgets.
 if "running_demo" not in st.session_state:
@@ -44,37 +58,44 @@ if "active_scenarios" not in st.session_state:
 if "loss_history" not in st.session_state:
     st.session_state.loss_history = []
 # Initialize states for sliders and ground_truth selector
 # Using len(options) to correctly size for 0-9 + "Text"
 for i in range(len(options)):
     if f"slider_{i}" not in st.session_state:
-        st.session_state[f"slider_{i}"] = 1.0 / len(options)
 if "ground_truth" not in st.session_state:
-    st.session_state["ground_truth"] = options[0]  # Default to "0"
-st.title("Number Token Loss - Demo")
 st.markdown(
-    """
-    **Instructions**
-    1. **Pick a ground truth token (0–9).**
-    2. **Select one of the three automated demos:**
-    - **Dirac**: a one-hot (Dirac) distribution whose single 1.0 mass moves from token 0 all the way to “Text.”
-    - **Gaussian**: a peaked Gaussian (0.6 mass at center, 0.4 spread) that slides its center from token 0 to “Text.”
-    - **Bimodal**: two equal peaks (0.5 each) that start at (0,8) and then move symmetrically away from the GT token.
     """
 )
 if "ground_truth" not in st.session_state:
     st.session_state["ground_truth"] = "4"
-gt = st.selectbox(
-    "Ground Truth Token",
-    options=options,
-    index=options.index(st.session_state["ground_truth"]),
-    key="ground_truth",
-)
 def apply_scenario(step_idx):
@@ -84,7 +105,9 @@ def apply_scenario(step_idx):
 def start_dirac_demo():
     st.session_state.active_scenarios = dirac
     st.session_state.running_demo = True
     st.session_state.demo_step = 0
     st.session_state.last_update_time = time.time()
@@ -92,7 +115,9 @@ def start_dirac_demo():
 def start_gauss_demo():
     st.session_state.active_scenarios = gauss
     st.session_state.running_demo = True
     st.session_state.demo_step = 0
     st.session_state.last_update_time = time.time()
@@ -100,7 +125,11 @@ def start_gauss_demo():
 def start_bimodal_demo():
-    st.session_state.active_scenarios = bimodal
     st.session_state.running_demo = True
     st.session_state.demo_step = 0
     st.session_state.last_update_time = time.time()
@@ -118,11 +147,15 @@ if st.session_state.running_demo:
     scenario = st.session_state.active_scenarios
     current_time = time.time()
     if current_time - st.session_state.last_update_time > DEMO_INTERVAL:
-        next_step = (st.session_state.demo_step + 1) % len(scenario)
-        st.session_state.demo_step = next_step
-        apply_scenario(next_step)  # Update session state for the new scenario
-        st.session_state.last_update_time = time.time()  # Reset timer
-        st.rerun()  # Crucial: Rerun to reflect changes in widgets and charts
 # --- UI Rendering ---
 # This section renders the main UI. It executes after any potential rerun from the block above.
@@ -151,12 +184,9 @@ else:
             start_bimodal_demo()
             st.rerun()
-# Placeholder for charts and loss calculations that will be updated
-# This section always reads the current st.session_state to generate its content.
 current_prob_values_from_state = [
-    st.session_state.get(f"slider_{j}", 1.0 / len(options)) for j in range(len(options))
 ]
 total_from_state = sum(current_prob_values_from_state)
 probs_for_charts = (
@@ -165,7 +195,12 @@ probs_for_charts = (
     else torch.tensor([v / total_from_state for v in current_prob_values_from_state])
 )
-gt_choice_for_charts = st.session_state.get("ground_truth", options[0])
 if gt_choice_for_charts == "Text":
     gt_index_for_charts = 10  # Assuming "Text" is the 11th item (index 10)
     gt_numeric_for_charts = None
@@ -174,8 +209,9 @@ else:
     gt_numeric_for_charts = gt_index_for_charts
 gt = st.session_state["ground_truth"]
-st.markdown(f"#### Predicted Probability Distribution — Ground truth token {gt}")
 df_dist = pd.DataFrame(
     {"token": options, "probability": probs_for_charts.numpy().round(2)}
 )
@@ -183,26 +219,22 @@ df_dist["type"] = [
     "Ground Truth" if token == gt_choice_for_charts else "Prediction"
     for token in options
 ]
-bg = (
-    alt.Chart(pd.DataFrame({"token": [gt]}))
-    .mark_bar(size=40, color="lightgray", opacity=0.4)
-    .encode(
-        x=alt.X("token:N", sort=options),
-        x2=alt.X2("token:N"),  # pin the right edge to the same category
-        y=alt.value(0),  # bottom at y=0
-        y2=alt.value(1),  # top at y=1 (full height)
-    )
-)
 bars = (
     alt.Chart(df_dist)
-    .mark_bar()
     .encode(
         x=alt.X(
             "token:N",
             title="Token",
             sort=options,
-            axis=alt.Axis(labelAngle=0, labelFontSize=14, titleFontSize=16),
         ),
         y=alt.Y(
             "probability:Q",
@@ -210,21 +242,34 @@ bars = (
             scale=alt.Scale(domain=[0, 1]),
             axis=alt.Axis(format=".2f", labelFontSize=14, titleFontSize=16),
         ),
-        color=alt.Color(
-            "type:N",
-            scale=alt.Scale(
-                domain=["Ground Truth", "Prediction"], range=["green", "steelblue"]
-            ),
-            legend=alt.Legend(title="Token Type", titleFontSize=16, labelFontSize=14),
-        ),
         tooltip=[
             alt.Tooltip("token:N", title="Token"),
-            alt.Tooltip("probability:Q", title="Probability", format=".2f"),
-            alt.Tooltip("type:N", title="Type"),
         ],
     )
-    .properties(height=300)
 )
 annot1 = (
     alt.Chart(pd.DataFrame({"token": [gt]}))
     .mark_text(
@@ -233,12 +278,11 @@ annot1 = (
         dx=25,
         fontSize=14,
         fontWeight="bold",
-        color="green",
     )
     .encode(x=alt.X("token:N", sort=options), y=alt.value(1))
 )
-# second line: “truth=4”
 annot2 = (
     alt.Chart(pd.DataFrame({"token": [gt]}))
     .mark_text(
@@ -247,185 +291,164 @@ annot2 = (
         dx=35,
         fontSize=14,
         fontWeight="bold",
-        color="green",
     )
     .encode(x=alt.X("token:N", sort=options), y=alt.value(1))
 )
 # 4) Layer them in order: background, bars, annotation
-final_chart = (bg + bars + annot1 + annot2).properties(height=300)
 st.altair_chart(final_chart, use_container_width=True)
-ce_loss = -torch.log(torch.clamp(probs_for_charts[gt_index_for_charts], min=1e-9))
-if gt_numeric_for_charts is None:  # Text token
-    ntl_mse_loss = torch.tensor(float("nan"))  # MSE not applicable for text
-    ntl_was_loss = torch.tensor(float("nan"))  # WAS not applicable for text
-else:  # Numeric token
-    numeric_probs_for_loss = probs_for_charts[:10]  # Probabilities for 0-9
-    # Ensure numeric_probs_for_loss sums to 1 for NTL calculations if it's a subset
-    numeric_probs_sum = torch.sum(numeric_probs_for_loss)
-    if numeric_probs_sum > 1e-6:  # Avoid division by zero
-        normalized_numeric_probs = numeric_probs_for_loss / numeric_probs_sum
-    else:
-        normalized_numeric_probs = torch.zeros_like(numeric_probs_for_loss)
-    loss_values_tensor = torch.arange(0, 10, dtype=torch.float32)
-    # Use normalized probabilities for NTL if only considering numeric tokens
-    if gt_choice_for_charts != "Text" and torch.sum(probs_for_charts[:10]) > 1e-6:
-        pred_value = torch.sum(
-            (probs_for_charts[:10] / torch.sum(probs_for_charts[:10]))
-            * loss_values_tensor
-        )
-    elif (
-        gt_choice_for_charts != "Text"
-    ):  # if sum is zero, pred_value is ill-defined or 0
-        pred_value = torch.tensor(0.0)
-    else:  # Should not happen if gt_numeric_for_charts is not None
-        pred_value = torch.tensor(float("nan"))
-    if not torch.isnan(pred_value):
-        ntl_mse_loss = ntl_mse_loss = (
-            NTL_MSE_SCALING * (pred_value - float(gt_numeric_for_charts)) ** 2
-        )
-        abs_diff = torch.abs(loss_values_tensor - float(gt_numeric_for_charts))
-        if gt_choice_for_charts != "Text" and torch.sum(probs_for_charts[:10]) > 1e-6:
-            ntl_was_loss = torch.sum(
-                (probs_for_charts[:10] / torch.sum(probs_for_charts[:10])) * abs_diff
-            )
-        elif gt_choice_for_charts != "Text":
-            ntl_was_loss = torch.tensor(0.0)
-        else:
-            ntl_was_loss = torch.tensor(float("nan"))
     else:
-        ntl_mse_loss = torch.tensor(float("nan"))
-        ntl_was_loss = torch.tensor(float("nan"))
-ce_val = round(ce_loss.item(), 3)
-mse_val = round(ntl_mse_loss.item(), 3) if not torch.isnan(ntl_mse_loss) else "N/A"
-was_val = round(ntl_was_loss.item(), 3) if not torch.isnan(ntl_was_loss) else "N/A"
-if len(st.session_state.loss_history) < st.session_state.demo_step + 1:
     st.session_state.loss_history.append(
         {
-            "token_index": np.argmax(
-                st.session_state.active_scenarios[st.session_state["demo_step"]][
-                    "values"
-                ]
-            ),
-            # int(np.argmax(st.session_state['values']))
-            # int(),
-            "CE": ce_val,
-            "NTL-MSE": mse_val if mse_val != "N/A" else None,
-            "NTL-WAS": was_val if was_val != "N/A" else None,
         }
     )
-    last_step = st.session_state.demo_step
-if st.session_state.loss_history:
-    loss_plot_data = []
-    for entry in st.session_state.loss_history:
-        for loss_type in ["CE", "NTL-MSE", "NTL-WAS"]:
-            if entry[loss_type] is not None:
-                loss_plot_data.append(
-                    {
-                        "Token Index": entry["token_index"],
-                        "Loss Type": loss_type,
-                        "Loss Value": entry[loss_type],  # TODO: clip to MAX_LOSS_PLOT?
-                    }
-                )
-    df_loss_plot = pd.DataFrame(loss_plot_data)
 loss_data = {"Loss": ["Cross Entropy"], "Value": [ce_val]}
 if was_val != "N/A":
     loss_data["Loss"].append("NTL-WAS")
     loss_data["Value"].append(was_val)
-if mse_val != "N/A":
-    loss_data["Loss"].append("NTL-MSE")
-    loss_data["Value"].append(mse_val)
 loss_df = pd.DataFrame(loss_data)
-# ============== Chart Display ==============
-st.subheader("Loss Evolution Over Scenarios")
-x_domain = list(range(10))
 grouped_chart = (
     alt.Chart(df_loss_plot)
     .mark_bar()
     .encode(
         x=alt.X(
-            "Token Index:O",
-            title="Predicted Token Index",
-            axis=alt.Axis(labelAngle=0),
-            scale=alt.Scale(domain=x_domain),
         ),
         y=alt.Y(
-            "Loss Value:Q", title="Loss", scale=alt.Scale(domain=[0, MAX_LOSS_PLOT])
         ),
-        color=alt.Color("Loss Type:N", legend=alt.Legend(title="Loss")),
-        xOffset="Loss Type:N",  # <== this causes the grouping instead of stacking
     )
-    .properties(height=300)
 )
 st.altair_chart(grouped_chart, use_container_width=True)
 # Create a single chart for loss visualization
-st.subheader("Loss Comparison")
-st.markdown("""
-Adjust the sliders to set a predicted probability for each token (0-9 and "Text").
-The sliders are vertical and compact. The app normalizes the slider values
-to form a valid probability distribution, visualizes it, and computes the corresponding
-Cross Entropy, NTL-MSE, and NTL-WAS losses.
-""")
-# Create an Altair chart that will look good and redraw cleanly
-chart = (
-    alt.Chart(loss_df)
-    .mark_bar()
-    .encode(
-        x=alt.X("Loss:N", sort=loss_df["Loss"].tolist()),
-        y=alt.Y(
-            "Value:Q",
-            scale=alt.Scale(
-                domain=[
-                    0,
-                    max(
-                        loss_df["Value"].max() * 1.2,
-                        20 if st.session_state.running_demo else 0.5,
-                    ),
-                ]
-            ),
-        ),
-        color=alt.Color(
-            "Loss:N",
-            scale=alt.Scale(
-                domain=["Cross Entropy", "NTL-WAS", "NTL-MSE"],
-                range=["steelblue", "red", "forestgreen"],
-            ),
-        ),
-        tooltip=["Loss", "Value"],
     )
-    .properties(height=300)
-)
-# Sliders and Ground Truth Selector
-# These widgets will read their initial values from st.session_state.
-# User interactions will update st.session_state directly due to their keys.
-if not st.session_state.running_demo:
-    st.markdown("#### Predicted Token Probabilities")
     cols = st.columns(len(options))
     for i, col in enumerate(cols):
         label = options[i]  # Use token name directly for label
@@ -436,23 +459,58 @@ if not st.session_state.running_demo:
                 max_value=1.0,
                 step=0.01,
                 height=50,
-                key=f"slider_{i}",  # This key links the widget to st.session_state[f"slider_{i}"]
                 slider_color="green",
                 track_color="lightgray",
                 thumb_color="black",
             )
-# Add value labels on top of bars
-text = chart.mark_text(align="center", baseline="bottom", dy=-5, fontSize=14).encode(
-    text=alt.Text("Value:Q", format=".3f")
-)
-# Combine chart and text
-final_chart = chart + text
 # Display chart with the full container width
-st.altair_chart(final_chart, use_container_width=True)
 # --- Polling Rerun for Demo Mode ---
 # If the demo is running and we haven't just advanced (which would have caused a rerun),
@@ -463,17 +521,18 @@ if st.session_state.running_demo:
     time.sleep(0.1)
     st.rerun()
-# Add explanation of the demonstration
 st.markdown("""
-### What Does This Demo Show?
-- **Cross Entropy Loss**: Only cares if the prediction is exactly right or wrong - it doesn't consider how "close" a numerical prediction is.
-- **Number Token Loss (NTL)**: Considers numerical proximity - predicting "7" when the true value is "8" is better than predicting "2".
 """)
-# References / resources section with links (common to both modes)
-st.markdown("### Resources")
 st.markdown("""
-- [Paper: Number Token Loss (ArXiv)](https://arxiv.org/abs/2411.02083)
-- [GitHub: Number Token Loss](https://github.com/tum-ai/number-token-loss)
 """)

+import logging
 import time
 import altair as alt
 import streamlit_vertical_slider as svs
 import torch
+from scenarios import dirac, gauss, make_bimodal_scenarios
+logging.getLogger("streamlit.watcher.local_sources_watcher").setLevel(logging.ERROR)
 DEMO_INTERVAL = 1.5
+CE_SCALING = 0.25
+MAX_LOSS_PLOT = 6
 LAST_STEP = -1
 # Define options globally as it's used in initialization and UI
 options = [str(i) for i in range(10)] + ["Text"]
def compute_losses(probs: torch.Tensor, gt_token: str) -> tuple[float, float, float]:
    """Compute CE, NTL-MAE and NTL-WAS losses for a probability vector.

    Args:
        probs: 1-D tensor of token probabilities aligned with the module-level
            ``options`` list; its first 10 entries are read as the digits 0-9.
        gt_token: Ground-truth token; must be an element of ``options``.

    Returns:
        ``(ce, mae, was)`` as Python floats, each rounded to 3 decimals.
        ``ce`` is the cross entropy scaled by ``CE_SCALING``. ``mae`` and
        ``was`` are ``0.0`` when the ground truth is ``"Text"`` or when the
        probability mass on the digit tokens is below ``1e-6``.

    Raises:
        ValueError: If ``gt_token`` is not contained in ``options``.
    """
    # Clamp before the log so a zero probability yields a large finite loss
    # instead of infinity.
    ce_loss = CE_SCALING * -torch.log(
        torch.clamp(probs[options.index(gt_token)], min=1e-9)
    )
    # Round on every return path so callers always get the same precision.
    ce_val = round(ce_loss.item(), 3)

    numeric_probs = probs[:10]
    numeric_mass = numeric_probs.sum()
    if gt_token == "Text" or numeric_mass < 1e-6:
        return ce_val, 0.0, 0.0

    gt_numeric = int(gt_token)
    # Match the dtype of `probs`; torch.dot requires both operands to agree.
    token_vals = torch.arange(10, dtype=probs.dtype)
    # NOTE(review): the expectation below uses the un-normalised digit
    # probabilities, so mass placed on "Text" pulls it towards 0 — confirm
    # this is intended.
    expected_value = torch.dot(token_vals, numeric_probs)
    mae = numeric_mass * torch.abs(expected_value - gt_numeric)
    was = numeric_mass * torch.dot(numeric_probs, torch.abs(token_vals - gt_numeric))
    return ce_val, round(mae.item(), 3), round(was.item(), 3)
 # --- Session State Initialization ---
 # Ensure all session state variables are initialized before first use, especially by widgets.
 if "running_demo" not in st.session_state:
 if "loss_history" not in st.session_state:
     st.session_state.loss_history = []
 # Initialize states for sliders and ground_truth selector
 # Using len(options) to correctly size for 0-9 + "Text"
 for i in range(len(options)):
     if f"slider_{i}" not in st.session_state:
+        st.session_state[f"slider_{i}"] = 0
 if "ground_truth" not in st.session_state:
+    st.session_state["ground_truth"] = options[5]
+if "manual_ground_truth" not in st.session_state:
+    st.session_state["manual_ground_truth"] = options[5]
+if "demo_name" not in st.session_state:
+    st.session_state["demo_name"] = "Dirac"
+st.title("NTL -- The Number Token Loss 🚀")
 st.markdown(
+    """This is the interactive demo for our [ICML 2025](https://arxiv.org/abs/2411.02083) paper!🎉
+    ➡️ NTL augments cross-entropy to help LMs reason better with numbers 🧠
     """
 )
+st.subheader("Demo 1 — NTL vs. Cross Entropy in 3 Scenarios")
+st.markdown("""
+1️⃣ Pick a ground truth token: a digit (0–9) or "Text" 📝 (simulates generic text tokens).
+2️⃣ Choose a demo:
+- **Dirac** ⚡: All probability mass on one token.
+- **Gaussian** 🌊: Soft bell-curve around the true number.
+- **Bimodal** 🎯: Two peaks moving away from the target.
+Watch how losses evolve as predictions get worse — and see how NTL shines compared to CE! 🌟
+""")
 if "ground_truth" not in st.session_state:
     st.session_state["ground_truth"] = "4"
+gt = st.selectbox("Ground Truth Token", options=options, key="ground_truth")
 def apply_scenario(step_idx):
 def start_dirac_demo():
+    st.session_state.loss_history = []
     st.session_state.active_scenarios = dirac
+    st.session_state.demo_name = "Dirac"
     st.session_state.running_demo = True
     st.session_state.demo_step = 0
     st.session_state.last_update_time = time.time()
 def start_gauss_demo():
+    st.session_state.loss_history = []
     st.session_state.active_scenarios = gauss
+    st.session_state.demo_name = "Gauss"
     st.session_state.running_demo = True
     st.session_state.demo_step = 0
     st.session_state.last_update_time = time.time()
 def start_bimodal_demo():
+    st.session_state.loss_history = []
+    gt = st.session_state["ground_truth"]
+    st.session_state.active_scenarios = make_bimodal_scenarios(gt, options)
+    st.session_state.demo_name = f"Bimodal (GT={gt})"
     st.session_state.running_demo = True
     st.session_state.demo_step = 0
     st.session_state.last_update_time = time.time()
     scenario = st.session_state.active_scenarios
     current_time = time.time()
     if current_time - st.session_state.last_update_time > DEMO_INTERVAL:
+        # if we haven’t yet shown the last scenario, advance
+        if st.session_state.demo_step < len(scenario) - 1:
+            st.session_state.demo_step += 1
+            apply_scenario(st.session_state.demo_step)
+            st.session_state.last_update_time = current_time
+            st.rerun()
+        else:
+            # we just displayed the final case → stop
+            st.session_state.running_demo = False
 # --- UI Rendering ---
 # This section renders the main UI. It executes after any potential rerun from the block above.
             start_bimodal_demo()
             st.rerun()
 current_prob_values_from_state = [
+    st.session_state.get(f"slider_{j}", 0)
+    for j in range(len(options))  # 1.0 / len(options)) for j in range(len(options))
 ]
 total_from_state = sum(current_prob_values_from_state)
 probs_for_charts = (
     else torch.tensor([v / total_from_state for v in current_prob_values_from_state])
 )
+# Use manual GT token when not in running demo
+gt_choice_for_charts = (
+    st.session_state["manual_ground_truth"]
+    if not st.session_state.running_demo
+    else st.session_state["ground_truth"]
+)
 if gt_choice_for_charts == "Text":
     gt_index_for_charts = 10  # Assuming "Text" is the 11th item (index 10)
     gt_numeric_for_charts = None
     gt_numeric_for_charts = gt_index_for_charts
 gt = st.session_state["ground_truth"]
+demo_name = st.session_state["demo_name"]
+st.markdown(f"#### Predicted distribution — ground truth: {gt}")
 df_dist = pd.DataFrame(
     {"token": options, "probability": probs_for_charts.numpy().round(2)}
 )
     "Ground Truth" if token == gt_choice_for_charts else "Prediction"
     for token in options
 ]
 bars = (
     alt.Chart(df_dist)
+    .mark_bar(color="dodgerblue", size=40)
     .encode(
         x=alt.X(
             "token:N",
             title="Token",
             sort=options,
+            axis=alt.Axis(
+                labelAngle=0,
+                labelFontSize=14,
+                titleFontSize=16,
+                labelAlign="center",
+                labelFlush=False,
+            ),
         ),
         y=alt.Y(
             "probability:Q",
             scale=alt.Scale(domain=[0, 1]),
             axis=alt.Axis(format=".2f", labelFontSize=14, titleFontSize=16),
         ),
         tooltip=[
             alt.Tooltip("token:N", title="Token"),
+            alt.Tooltip("probability:Q", title="Predicted Prob.", format=".2f"),
         ],
     )
 )
+bg_bar = pd.DataFrame({"token": [gt], "height": [1.0]})
+gt_bar = (
+    alt.Chart(bg_bar)
+    .mark_bar(
+        color="darkgreen",
+        size=20,
+        opacity=0.3,
+        stroke="gray",
+        strokeWidth=2,
+        strokeDash=[4, 4],
+    )
+    .encode(
+        x=alt.X("token:N", sort=options),
+        y=alt.Y("height:Q", scale=alt.Scale(domain=[0, 1])),
+        tooltip=[
+            alt.Tooltip("token:N", title="Ground Truth"),
+            alt.Tooltip("height:Q", title="Desired mass", format=".2f"),
+        ],
+    )
+)
 annot1 = (
     alt.Chart(pd.DataFrame({"token": [gt]}))
     .mark_text(
         dx=25,
         fontSize=14,
         fontWeight="bold",
+        color="darkgreen",
     )
     .encode(x=alt.X("token:N", sort=options), y=alt.value(1))
 )
 annot2 = (
     alt.Chart(pd.DataFrame({"token": [gt]}))
     .mark_text(
         dx=35,
         fontSize=14,
         fontWeight="bold",
+        color="darkgreen",
     )
     .encode(x=alt.X("token:N", sort=options), y=alt.value(1))
 )
 # 4) Layer them in order: background, bars, annotation
+final_chart = (gt_bar + bars + annot1 + annot2).properties(height=200)
 st.altair_chart(final_chart, use_container_width=True)
# Losses for the distribution currently shown; also reused for the history entry below.
ce_val, mae_val, was_val = compute_losses(probs_for_charts, gt_choice_for_charts)
# Record at most one history entry per demo step: append only while the demo
# runs and the current step has not been logged yet.
if (
    st.session_state.running_demo
    and len(st.session_state.loss_history) < st.session_state.demo_step + 1
):
    step = st.session_state.demo_step
    scenario = st.session_state.active_scenarios[step]
    # pick x_val differently for bimodal vs others
    if st.session_state.demo_name.startswith("Bimodal"):
        x_val = scenario["name"]  # e.g. "(4, 4)", "(3, 5)", …
    else:
        # label the entry with the token that carries the largest probability
        best_idx = np.argmax(scenario["values"])
        x_val = options[best_idx]  # "0", "1", …, or "Text"
    st.session_state.loss_history.append(
        {
            "step": step,
            "x_val": x_val,
            # Same inputs as above, so reuse the values instead of calling
            # compute_losses a second time.
            "Cross Entropy": ce_val,
            "NTL-MAE": mae_val,
            "NTL-WAS": was_val,
        }
    )
+#  1) build a raw DF from histories
+df = pd.DataFrame(st.session_state.loss_history)
+if df.empty:
+    # define an empty "melted" DataFrame with the right columns
+    df_loss_plot = pd.DataFrame(columns=["step", "x_val", "Loss Type", "Loss Value"])
+else:
+    # now it's safe to melt
+    df_loss_plot = df.melt(
+        id_vars=["step", "x_val"],
+        value_vars=["Cross Entropy", "NTL-MAE", "NTL-WAS"],
+        var_name="Loss Type",
+        value_name="Loss Value",
+    )
 loss_data = {"Loss": ["Cross Entropy"], "Value": [ce_val]}
 if was_val != "N/A":
     loss_data["Loss"].append("NTL-WAS")
     loss_data["Value"].append(was_val)
+if mae_val != "N/A":
+    loss_data["Loss"].append("NTL-MAE")
+    loss_data["Value"].append(mae_val)
 loss_df = pd.DataFrame(loss_data)
+if st.session_state.demo_name.startswith("Bimodal"):
+    domain = [sc["name"] for sc in st.session_state.active_scenarios]
+    x_title = f"Offset from GT {st.session_state['ground_truth']}"
+else:
+    domain = options
+    x_title = f"Maximum of predicted {st.session_state['demo_name']} distribution"
+# ============== Chart Display ==============
+st.markdown("#### Loss as a function of predicted distribution")
 grouped_chart = (
     alt.Chart(df_loss_plot)
     .mark_bar()
     .encode(
         x=alt.X(
+            "x_val:N",
+            title=x_title,
+            sort=domain,
+            scale=alt.Scale(domain=domain),
+            axis=alt.Axis(labelAngle=0, labelFontSize=14, titleFontSize=16),
         ),
         y=alt.Y(
+            "Loss Value:Q",
+            title="Loss Value",
+            scale=alt.Scale(domain=[0, MAX_LOSS_PLOT], nice=False, clamp=True),
+            axis=alt.Axis(labelFontSize=14, titleFontSize=16),
         ),
+        color=alt.Color(
+            "Loss Type:N",
+            scale=alt.Scale(
+                domain=["Cross Entropy", "NTL-WAS", "NTL-MAE"],
+                range=["red", "limegreen", "blueviolet"],
+            ),
+            legend=alt.Legend(
+                title="",
+                orient="top",
+                direction="horizontal",
+                columns=3,
+            ),
+        ),
+        xOffset="Loss Type:N",  # grouped bars
+        tooltip=[
+            alt.Tooltip("x_val:N", title="Scenario"),
+            alt.Tooltip("Loss Type:N", title="Loss Type"),
+            alt.Tooltip("Loss Value:Q", title="Value", format=".3f"),
+        ],
     )
+    .properties(height=250)
 )
 st.altair_chart(grouped_chart, use_container_width=True)
 # Create a single chart for loss visualization
+if not st.session_state.running_demo:
+    for i in range(len(options)):
+        st.session_state[f"slider_{i}"] = 0.0
+    st.session_state.demo_step = 0
+    st.subheader("Demo 2 -- Manual loss comparison")
+    st.subheader("🧪 Demo 2 — Craft your own distribution")
+    st.markdown("""
+    This demo gives you more control but is harder to interpret. See it as a playground! 🎨
+    Manually adjust the sliders to change the predicted probabilities for each token.
+    The demo normalizes the values to form a valid probability distribution and calculates the losses.
+    👣 **Steps:**
+    - Use the **vertical sliders** to allocate probability to each token.
+    - Choose the correct **Ground Truth Token** (0–9 or "Text" 📜).
+    - Observe how each loss function reacts.
+    💡 **Tip:** Want to trick the loss? Try putting all mass on the wrong token or spread it wildly. See how NTL handles it! 😈
+    """)
+    manual_gt = st.selectbox(
+        "Ground Truth Token",
+        options=options,
+        key="manual_ground_truth",
+    )
+    loss_df = pd.DataFrame(
+        {
+            "Loss": ["Cross Entropy", "NTL-MAE", "NTL-WAS"],
+            "Value": [ce_val, mae_val, was_val],
+        }
     )
+    # Sliders and Ground Truth Selector
+    # These widgets will read their initial values from st.session_state.
+    # User interactions will update st.session_state directly due to their keys.
+    st.markdown("#### Adjust the predicted token probability")
     cols = st.columns(len(options))
     for i, col in enumerate(cols):
         label = options[i]  # Use token name directly for label
                 max_value=1.0,
                 step=0.01,
                 height=50,
+                key=f"slider_{i}",
                 slider_color="green",
                 track_color="lightgray",
                 thumb_color="black",
             )
+    chart = (
+        alt.Chart(loss_df)
+        .mark_bar()
+        .encode(
+            x=alt.X("Loss:N", sort=loss_df["Loss"].tolist()),
+            y=alt.Y(
+                "Value:Q",
+                scale=alt.Scale(
+                    domain=[
+                        0,
+                        max(
+                            loss_df["Value"].max() * 1.2,
+                            20 if st.session_state.running_demo else 0.5,
+                        ),
+                    ]
+                ),
+            ),
+            color=alt.Color(
+                "Loss:N",
+                scale=alt.Scale(
+                    domain=["Cross Entropy", "NTL-WAS", "NTL-MAE"],
+                    range=["orangered", "limegreen", "blueviolet"],
+                ),
+            ),
+            tooltip=["Loss", "Value"],
+        )
+        .properties(height=300)
+    )
+    text = chart.mark_text(
+        align="center", baseline="bottom", dy=-5, fontSize=14
+    ).encode(text=alt.Text("Value:Q", format=".3f"))
+    final_chart = chart + text
+    st.altair_chart(final_chart, use_container_width=True)
+# # Add value labels on top of bars
+# text = chart.mark_text(align="center", baseline="bottom", dy=-5, fontSize=14).encode(
+#     text=alt.Text("Value:Q", format=".3f")
+# )
+# # Combine chart and text
+# final_chart = chart + text
 # Display chart with the full container width
+# st.altair_chart(final_chart, use_container_width=True)
 # --- Polling Rerun for Demo Mode ---
 # If the demo is running and we haven't just advanced (which would have caused a rerun),
     time.sleep(0.1)
     st.rerun()
 st.markdown("""
+### 🤔 TL;DR — Why NTL?
+Cross Entropy only cares if the prediction is exactly right or wrong ❌✅ — it doesn’t care *how close* a guess is!
+That’s bad for LLMs doing math and numeric reasoning 🧮.
+💥 NTL fixes that: it behaves like a regression loss on the token head, rewarding predictions that are numerically close.
 """)
+st.markdown("#### 📚 Further Resources")
 st.markdown("""
+- 📄 [ICML 2025 Paper](https://arxiv.org/abs/2411.02083)
+- 🌐 [NTL Landing Page](https://tum-ai.github.io/number-token-loss/)
+- 💻 [GitHub Code](https://github.com/tum-ai/number-token-loss)
 """)