RishiRP committed (verified)
Commit db91bf5 · Parent: 3e2cf36

Update app.py

Files changed (1):
  1. app.py (+145 -67)

app.py CHANGED
@@ -1,8 +1,8 @@
  # app.py
- # From Talk to Task — Accuracy & Diagnostics (stable / fast)
  # Model: swiss-ai/Apertus-8B-Instruct-2509
- # Few-shot (EN/FR/DE/IT one each), deterministic by default, optional fallback sampling toggle,
- # soft token cap = 1024 by default, CUDA fp16 + optional 4-bit, GT scoring & downloads.

  import os
  import re
@@ -46,28 +46,20 @@ CONTEXT_GUIDE = (
  "- update_kyc_*: KYC updates (activity, purpose, origin of assets, total assets)\n"
  )

- # Few-shot: exactly ONE compact example per language
  FEW_SHOTS = [
  # EN
- {
- "transcript": "Agent: Can we meet Friday 3pm on Teams?\nClient: Yes, Friday 3pm works.\nAgent: I’ll send the invite.",
- "labels": ["schedule_meeting"]
- },
  # FR
- {
- "transcript": "Client: Mon numéro a changé: +41 44 000 00 00.\nConseiller: Merci, je mets à jour vos coordonnées.",
- "labels": ["update_contact_info_non_postal"]
- },
  # DE
- {
- "transcript": "Kunde: Neue Postadresse: Musterstrasse 1, 8000 Zürich.\nBerater: Danke, ich aktualisiere die Postadresse.",
- "labels": ["update_contact_info_postal_address"]
- },
  # IT
- {
- "transcript": "Cliente: Totale patrimonio confermato a 8 milioni CHF.\nConsulente: Perfetto, aggiorno i dati KYC sul totale degli asset.",
- "labels": ["update_kyc_total_assets"]
- },
  ]

  # --------------------- WRITABLE HF CACHE -----------------------------
@@ -101,6 +93,27 @@ RE_DISCLAIMER = re.compile(r"^\s*disclaimer\s*:", re.IGNORECASE)
  RE_DROP = re.compile(r"(readme|terms|synthetic transcript)", re.IGNORECASE)
  SMALLTALK_RE = re.compile(r"\b(thanks?|merci|grazie|danke|tsch(ü|u)ss|ciao|bye|ok(ay)?)\b", re.IGNORECASE)

  def _json_from_text(text: str) -> str:
  s = text.strip()
  if s.startswith("{") and s.endswith("}"):
@@ -114,8 +127,7 @@ def safe_json_labels(s: str, allowed: List[str]) -> List[str]:
  except Exception:
  return []
  labels = data.get("labels", [])
- clean = []
- seen = set()
  for lab in labels:
  if lab in allowed and lab not in seen:
  clean.append(lab); seen.add(lab)
@@ -221,6 +233,44 @@ def card_markdown(title: str, value: str, hint: str = "") -> str:
  </div>
  """

  # -------------------------- MODEL -----------------------------------

  class HFModel:
@@ -264,10 +314,10 @@ class HFModel:
  self.model = self.model.to(DEVICE)

  @torch.inference_mode()
- def generate_json(self, prompt: str, max_new_tokens=48, allow_sampling=False) -> Tuple[str, Dict[str, int]]:
  """
  Deterministic by default. If allow_sampling=True (toggle), we use mild temperature.
- Returns (json_text, token_stats)
  """
  tok = self.tokenizer
  mdl = self.model
@@ -282,12 +332,13 @@ class HFModel:
  eos_token_id=tok.eos_token_id,
  )
  if allow_sampling:
- # mild sampling; disabled by default to avoid CUDA multinomial issues on T4
  kwargs.update(dict(do_sample=True, temperature=0.25, top_p=0.9))
  else:
  kwargs.update(dict(do_sample=False, temperature=0.0, top_p=1.0))

  out = mdl.generate(**inputs, **kwargs)

  prompt_tokens = int(inputs.input_ids.shape[-1])
  output_tokens = int(out.shape[-1] - inputs.input_ids.shape[-1])
@@ -299,7 +350,7 @@ class HFModel:
  "prompt_tokens": prompt_tokens,
  "output_tokens": output_tokens,
  "total_tokens": total_tokens,
- }

  _MODEL_CACHE: Dict[Tuple[str, Optional[str], bool], HFModel] = {}

@@ -323,6 +374,19 @@ def preprocess_text(txt: str, add_header: bool, strip_smalltalk: bool) -> str:
  cleaned = "\n".join(lines[-32768:])
  return f"[EMAIL/MESSAGE SIGNAL]\n{cleaned}" if add_header else cleaned

  def run_single(
  custom_repo_id: str,
  rules_json: Optional[gr.File],
@@ -341,45 +405,41 @@
  ):
  """Returns: repo, revision, predicted_json, metric_cards_md, diag_cards_md, raw_metrics_json"""

  repo = (custom_repo_id or DEFAULT_REPO).strip()
  revision = "main"
  allowed = read_rules_labels(rules_json) or DEFAULT_LABEL_SET

- # Preprocess + cap
- effective_len = len(transcript)
  if preprocess:
  transcript = preprocess_text(transcript, add_header, strip_smalltalk)
- effective_len = len(transcript)
-
- cap_info = ""
- if soft_token_cap and soft_token_cap > 0:
- approx_chars = int(soft_token_cap * 4)
- if len(transcript) > approx_chars:
- transcript = transcript[-approx_chars:]
- cap_info = f"(soft cap ~{soft_token_cap}t)"

  # Build prompt
  system = system_instructions or SYSTEM_INSTRUCTIONS_BASE
- prompt = build_prompt(system, context_text or CONTEXT_GUIDE, transcript, allowed, use_fewshot)

  model = get_model(repo, revision, load_in_4bit)

- # Deterministic pass only (fast & stable)
- t0 = time.perf_counter()
- raw_json, tok_stats = model.generate_json(prompt, max_new_tokens=48, allow_sampling=False)
  pred_labels = safe_json_labels(raw_json, allowed)

  # Optional fallback sampling (OFF by default)
  fallback_used = False
  if enable_fallback_sampling and not pred_labels:
- raw_json2, tok_stats2 = model.generate_json(prompt, max_new_tokens=48, allow_sampling=True)
  pred_labels2 = safe_json_labels(raw_json2, allowed)
  if pred_labels2:
  pred_labels = pred_labels2
  tok_stats = tok_stats2
  fallback_used = True

- total_latency = time.perf_counter() - t0
  est_cost = (total_latency / 3600.0) * max(0.0, float(hourly_rate or 0.0))

  # Ground truth
@@ -406,16 +466,17 @@
  metric_cards += card("Missing labels", json.dumps(missing, ensure_ascii=False) if gt_labels is not None else "—", "Expected but not predicted")
  metric_cards += card("Extra labels", json.dumps(extra, ensure_ascii=False) if gt_labels is not None else "—", "Predicted but not expected")

- # Diagnostics cards
  diag_cards = ""
  diag_cards += card("Model / Rev", f"{repo} / {revision}")
  diag_cards += card("Device", f"{DEVICE} ({GPU_NAME})")
  diag_cards += card("Precision dtype", f"{DTYPE_FALLBACK}")
  diag_cards += card("4-bit", f"{bool(load_in_4bit)}")
  diag_cards += card("Allowed labels", json.dumps(allowed, ensure_ascii=False))
- diag_cards += card("Effective text length", f"{effective_len} chars {cap_info}")
  diag_cards += card("Tokens", f"prompt={tok_stats['prompt_tokens']}, output={tok_stats['output_tokens']}, total={tok_stats['total_tokens']}", "Token counts influence latency & cost")
- diag_cards += card("Latency", f"{total_latency:.2f} s", "End-to-end time")
  diag_cards += card("Cost (est.)", f"${(est_cost):.6f} @ {hourly_rate:.4f}/hr")
  diag_cards += card("Fallback sampling used", "Yes" if fallback_used else "No", "Sampling can be slower/unstable on T4; off by default")

@@ -431,9 +492,11 @@
  "extra": extra if gt_labels is not None else None,
  "per_label": per_label if gt_labels is not None else None,
  "token_stats": tok_stats,
- "latency_seconds": round(total_latency, 3),
  "estimated_cost_usd": round(est_cost, 6),
  "fallback_used": fallback_used,
  }

  return (
@@ -475,10 +538,11 @@
  model = get_model(repo, revision, load_in_4bit)

  rows = [["filename","labels"]]
- per_sample_rows = [["filename","pred_labels","gold_labels","precision","recall","f1","exact_match","hamming_loss","missing","extra"]]
  totals = {"tp":0,"fp":0,"fn":0,"pred_total":0,"gold_total":0}
  label_global = {lab: {"tp":0,"fp":0,"fn":0} for lab in allowed}
- total_prompt_tokens = 0; total_output_tokens = 0; total_secs = 0.0; n=0; with_gt=0

  system = system_instructions or SYSTEM_INSTRUCTIONS_BASE

@@ -488,29 +552,28 @@
  except Exception:
  rows.append([name, "[] # unreadable"]); continue

  if preprocess:
  txt = preprocess_text(txt, add_header, strip_smalltalk)

- if soft_token_cap and soft_token_cap > 0:
- approx_chars = int(soft_token_cap * 4)
- if len(txt) > approx_chars:
- txt = txt[-approx_chars:]
-
- prompt = build_prompt(system, context_text or CONTEXT_GUIDE, txt, allowed, use_fewshot)

- t0 = time.perf_counter()
- raw_json, tok_stats = model.generate_json(prompt, max_new_tokens=48, allow_sampling=False)
  pred = safe_json_labels(raw_json, allowed)
  if enable_fallback_sampling and not pred:
- raw_json2, tok_stats2 = model.generate_json(prompt, max_new_tokens=48, allow_sampling=True)
  pred2 = safe_json_labels(raw_json2, allowed)
  if pred2:
- pred = pred2
- tok_stats = tok_stats2

- total_secs += (time.perf_counter() - t0)
  total_prompt_tokens += tok_stats["prompt_tokens"]
  total_output_tokens += tok_stats["output_tokens"]
  n += 1

  rows.append([name, json.dumps(pred, ensure_ascii=False)])
@@ -538,13 +601,21 @@
  round(ham,4),
  json.dumps(missing, ensure_ascii=False),
  json.dumps(extra, ensure_ascii=False),
  ])

  tp, fp, fn = totals["tp"], totals["fp"], totals["fn"]
  prec = tp / (tp + fp) if (tp + fp) else 0.0
  rec = tp / (tp + fn) if (tp + fn) else 0.0
  f1 = 2 * prec * rec / (prec + rec) if (prec + rec) else 0.0
- est_cost = (total_secs / 3600.0) * max(0.0, float(hourly_rate or 0.0))

  coverage = {lab: 0 for lab in allowed}
  for r in rows[1:]:
@@ -572,8 +643,10 @@
  "avg_prompt_tokens": round(total_prompt_tokens / n, 2) if n else 0.0,
  "avg_output_tokens": round(total_output_tokens / n, 2) if n else 0.0,
  },
- "latency_seconds_total": round(total_secs, 3),
- "avg_latency_seconds": round(total_secs / n, 3) if n else 0.0,
  "estimated_cost_usd": round(est_cost, 6),
  }

@@ -587,7 +660,8 @@
  diag_cards += card("4-bit", f"{bool(load_in_4bit)}")
  diag_cards += card("Files processed", f"{n} (with GT: {with_gt})")
  diag_cards += card("Tokens (totals)", f"prompt={total_prompt_tokens}, output={total_output_tokens}")
- diag_cards += card("Latency", f"total={summary['latency_seconds_total']} s, avg={summary['avg_latency_seconds']} s")
  diag_cards += card("Cost (est.)", f"${summary['estimated_cost_usd']} @ {hourly_rate:.4f}/hr")
  diag_cards += card("Allowed labels", json.dumps(allowed, ensure_ascii=False))

@@ -612,13 +686,17 @@

  # ----------------------------- UI -----------------------------------

- with gr.Blocks(title="From Talk to Task — Accuracy & Diagnostics (stable)") as demo:
  gr.Markdown(
  f"""
  # From Talk to Task — Accuracy & Diagnostics (EN/FR/DE/IT)

  **Default model:** `{DEFAULT_REPO}` (GPU + 4-bit recommended).
- Upload ground truth to compute **Precision / Recall / F1 / Exact match / Hamming loss**.
  Upload a **Rules JSON** (`{{"labels":[...]}}`) to override allowed labels.

  **Model output schema:** `{{"labels": [...]}}`
@@ -640,7 +718,7 @@ with gr.Blocks(title="From Talk to Task — Accuracy & Diagnostics (stable)") as
  context = gr.Textbox(label="Context (User prefix)", value=CONTEXT_GUIDE, lines=6)

  with gr.Row():
- soft_cap = gr.Slider(512, 32768, value=1024, step=1, label="Soft token cap (approx)")
  preprocess = gr.Checkbox(value=True, label="Enable preprocessing")
  add_header = gr.Checkbox(value=True, label="Add cues header")
  strip_smalltalk = gr.Checkbox(value=False, label="Strip smalltalk")

  # app.py
+ # From Talk to Task — Windowed extraction + two latency measures
  # Model: swiss-ai/Apertus-8B-Instruct-2509
+ # Few-shot: 1 each EN/FR/DE/IT; deterministic by default; optional sampling fallback toggle.
+ # Soft token cap: 1024 by default. CUDA fp16 + optional 4-bit. GT scoring + downloads.

  import os
  import re

  "- update_kyc_*: KYC updates (activity, purpose, origin of assets, total assets)\n"
  )

+ # Few-shot: exactly one per language (compact)
  FEW_SHOTS = [
  # EN
+ {"transcript": "Agent: Can we meet Friday 3pm on Teams?\nClient: Yes, Friday 3pm works.\nAgent: I’ll send the invite.",
+ "labels": ["schedule_meeting"]},
  # FR
+ {"transcript": "Client: Mon numéro a changé: +41 44 000 00 00.\nConseiller: Merci, je mets à jour vos coordonnées.",
+ "labels": ["update_contact_info_non_postal"]},
  # DE
+ {"transcript": "Kunde: Neue Postadresse: Musterstrasse 1, 8000 Zürich.\nBerater: Danke, ich aktualisiere die Postadresse.",
+ "labels": ["update_contact_info_postal_address"]},
  # IT
+ {"transcript": "Cliente: Totale patrimonio confermato a 8 milioni CHF.\nConsulente: Aggiorno i dati KYC sul totale degli asset.",
+ "labels": ["update_kyc_total_assets"]},
  ]

  # --------------------- WRITABLE HF CACHE -----------------------------
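For orientation, a minimal sketch of how the FEW_SHOTS entries above could be serialized into the prompt. build_prompt itself is not part of this diff, so the helper name and the "Transcript / Labels" layout below are illustrative assumptions, not the app's actual formatting.

```python
import json

def render_few_shots(few_shots):
    # Hypothetical helper (not in app.py): one "transcript -> JSON labels" pair per example.
    blocks = []
    for ex in few_shots:
        blocks.append(
            "Transcript:\n" + ex["transcript"] + "\n"
            "Labels: " + json.dumps({"labels": ex["labels"]}, ensure_ascii=False)
        )
    return "\n\n".join(blocks)

# e.g. the EN example would render as:
# Transcript:
# Agent: Can we meet Friday 3pm on Teams?
# ...
# Labels: {"labels": ["schedule_meeting"]}
```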
 
  RE_DROP = re.compile(r"(readme|terms|synthetic transcript)", re.IGNORECASE)
  SMALLTALK_RE = re.compile(r"\b(thanks?|merci|grazie|danke|tsch(ü|u)ss|ciao|bye|ok(ay)?)\b", re.IGNORECASE)

+ # keyword windows (EN/FR/DE/IT) — expand as needed
+ WINDOW_KEYWORDS = [
+ # meeting / schedule
+ r"\b(meet|meeting|schedule|appointment|teams|zoom|google meet|calendar)\b",
+ r"\b(rendez[- ]?vous|réunion|planifier|calendrier|teams|zoom)\b",
+ r"\b(termin|treffen|besprechung|kalender|teams|zoom)\b",
+ r"\b(appuntamento|riunione|calendario|teams|zoom)\b",
+ # address / phone / email
+ r"\b(address|street|avenue|road|postcode|phone|email)\b",
+ r"\b(adresse|rue|avenue|code postal|téléphone|courriel|email)\b",
+ r"\b(adresse|straße|strasse|plz|telefon|e-?mail)\b",
+ r"\b(indirizzo|via|cap|telefono|e-?mail)\b",
+ # KYC assets / totals / origin / purpose
+ r"\b(total assets|net worth|portfolio|real estate|origin of assets|source of wealth|purpose of relationship)\b",
+ r"\b(actifs totaux|patrimoine|immobilier|origine des fonds|source de richesse|but de la relation)\b",
+ r"\b(gesamtverm(ö|o)gen|verm(ö|o)gen|immobilien|herkunft der verm(ö|o)genswerte|zweck der gesch(ä|a)ftsbeziehung)\b",
+ r"\b(patrimonio totale|immobiliare|origine dei fondi|scopo della relazione)\b",
+ r"\b(chf|eur|usd|cur[13]|francs?)\b",
+ r"\b(\d{1,3}([.'’ ]\d{3})*(,\d+)?)(\s?(chf|eur|usd))\b",
+ ]
+
  def _json_from_text(text: str) -> str:
  s = text.strip()
  if s.startswith("{") and s.endswith("}"):

  except Exception:
  return []
  labels = data.get("labels", [])
+ clean, seen = [], set()
  for lab in labels:
  if lab in allowed and lab not in seen:
  clean.append(lab); seen.add(lab)

  </div>
  """

+ # ------------------- WINDOWED EXTRACTION (fix for empty labels) -------------------
+
+ def extract_windows(text: str, max_windows: int = 6, half_span_lines: int = 3) -> str:
+ """
+ Find up to `max_windows` windows around keyword hits; each window is ±`half_span_lines` lines.
+ If no hits, return the opening chunk (first lines) instead of the tail (a common cause of misses).
+ """
+ lines = text.splitlines()
+ n = len(lines)
+ # collect hit line indices
+ hits: List[int] = []
+ pattern = re.compile("|".join(WINDOW_KEYWORDS), re.IGNORECASE)
+ for i, ln in enumerate(lines):
+ if pattern.search(ln):
+ hits.append(i)
+ # de-duplicate and cap
+ unique_hits = []
+ seen = set()
+ for idx in hits:
+ # bucket nearby hits to avoid redundant windows
+ bucket = idx // 2  # coarse bucketing
+ if bucket not in seen:
+ seen.add(bucket)
+ unique_hits.append(idx)
+ unique_hits = unique_hits[:max_windows]
+
+ if not unique_hits:
+ # return the opening chunk; most KYC/context often appears early
+ return "\n".join(lines[: min(2000, n)])
+
+ # Build windows and merge
+ windows = []
+ for idx in unique_hits:
+ a = max(0, idx - half_span_lines)
+ b = min(n, idx + half_span_lines + 1)
+ windows.append("\n".join(lines[a:b]))
+ return "\n...\n".join(windows)
+
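A quick illustrative check of extract_windows on an invented transcript (run in the context of app.py, where WINDOW_KEYWORDS is defined): lines matching the keyword patterns become window centres, and each window keeps ±half_span_lines surrounding lines.

```python
sample = "\n".join([
    "Agent: Good morning, thanks for joining.",
    "Client: My phone number changed, it is +41 44 000 00 00.",
    "Agent: Noted, I will update it.",
    "Client: Can we also schedule a meeting on Teams next Friday?",
    "Agent: Sure, I will send a calendar invite.",
])
print(extract_windows(sample, max_windows=6, half_span_lines=1))
# "phone", "schedule"/"meeting"/"teams" and "calendar" are keyword hits, so three
# (here overlapping) windows are produced; disjoint windows are joined with "\n...\n".
```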
  # -------------------------- MODEL -----------------------------------

  class HFModel:

  self.model = self.model.to(DEVICE)

  @torch.inference_mode()
+ def generate_json(self, prompt: str, max_new_tokens=48, allow_sampling=False) -> Tuple[str, Dict[str, int], float]:
  """
  Deterministic by default. If allow_sampling=True (toggle), we use mild temperature.
+ Returns (json_text, token_stats, model_latency_seconds)
  """
  tok = self.tokenizer
  mdl = self.model

  eos_token_id=tok.eos_token_id,
  )
  if allow_sampling:
  kwargs.update(dict(do_sample=True, temperature=0.25, top_p=0.9))
  else:
  kwargs.update(dict(do_sample=False, temperature=0.0, top_p=1.0))

+ t0 = time.perf_counter()
  out = mdl.generate(**inputs, **kwargs)
+ model_latency = time.perf_counter() - t0

  prompt_tokens = int(inputs.input_ids.shape[-1])
  output_tokens = int(out.shape[-1] - inputs.input_ids.shape[-1])

  "prompt_tokens": prompt_tokens,
  "output_tokens": output_tokens,
  "total_tokens": total_tokens,
+ }, model_latency

  _MODEL_CACHE: Dict[Tuple[str, Optional[str], bool], HFModel] = {}

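A minimal caller-side sketch of the new three-value return, using build_prompt, get_model and the defaults defined elsewhere in app.py (the transcript string and the argument order, which mirrors the calls in run_single, are the only assumptions here):

```python
prompt = build_prompt(SYSTEM_INSTRUCTIONS_BASE, CONTEXT_GUIDE,
                      "Client: Neue Postadresse: Musterstrasse 1, 8000 Zürich.",
                      DEFAULT_LABEL_SET, True)
model = get_model(DEFAULT_REPO, "main", True)
raw_json, tok_stats, model_latency = model.generate_json(prompt, max_new_tokens=48, allow_sampling=False)
print(safe_json_labels(raw_json, DEFAULT_LABEL_SET), tok_stats["total_tokens"], f"{model_latency:.2f}s")
```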
  cleaned = "\n".join(lines[-32768:])
  return f"[EMAIL/MESSAGE SIGNAL]\n{cleaned}" if add_header else cleaned

+ def window_then_cap(text: str, soft_token_cap: int) -> Tuple[str, str]:
+ """
+ Apply keyword windowing; then hard cap by approximate chars (~4 chars/token).
+ Returns (final_text, info_string).
+ """
+ windowed = extract_windows(text)
+ approx_chars = int(max(soft_token_cap, 0) * 4) if soft_token_cap else 0
+ info = "windowed"
+ if approx_chars and len(windowed) > approx_chars:
+ windowed = windowed[:approx_chars]
+ info = f"windowed + soft cap ~{soft_token_cap}t"
+ return windowed, info
+
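The soft cap is approximate: it budgets roughly 4 characters per token, so the default cap of 1024 tokens trims the windowed text to about 4096 characters. A small illustration (invented text, run in the context of app.py):

```python
text = "Client: total assets are 8 million CHF. " * 400   # ~16,400 chars, keyword-bearing
final_text, info = window_then_cap(text, soft_token_cap=1024)
assert len(final_text) <= 1024 * 4                        # character budget = 4096
print(info)                                               # "windowed + soft cap ~1024t"
```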
  def run_single(
  custom_repo_id: str,
  rules_json: Optional[gr.File],

  ):
  """Returns: repo, revision, predicted_json, metric_cards_md, diag_cards_md, raw_metrics_json"""

+ total_t0 = time.perf_counter()  # TOTAL latency starts here
+
  repo = (custom_repo_id or DEFAULT_REPO).strip()
  revision = "main"
  allowed = read_rules_labels(rules_json) or DEFAULT_LABEL_SET

+ # Preprocess + window + cap
+ effective_len_before = len(transcript)
  if preprocess:
  transcript = preprocess_text(transcript, add_header, strip_smalltalk)
+ windowed, cap_info = window_then_cap(transcript, soft_token_cap)
+ effective_len_after = len(windowed)

  # Build prompt
  system = system_instructions or SYSTEM_INSTRUCTIONS_BASE
+ prompt = build_prompt(system, context_text or CONTEXT_GUIDE, windowed, allowed, use_fewshot)

  model = get_model(repo, revision, load_in_4bit)

+ # Deterministic pass only
+ raw_json, tok_stats, model_latency = model.generate_json(prompt, max_new_tokens=48, allow_sampling=False)
  pred_labels = safe_json_labels(raw_json, allowed)

  # Optional fallback sampling (OFF by default)
  fallback_used = False
  if enable_fallback_sampling and not pred_labels:
+ raw_json2, tok_stats2, model_latency2 = model.generate_json(prompt, max_new_tokens=48, allow_sampling=True)
  pred_labels2 = safe_json_labels(raw_json2, allowed)
  if pred_labels2:
  pred_labels = pred_labels2
  tok_stats = tok_stats2
+ model_latency = model_latency2
  fallback_used = True

+ total_latency = time.perf_counter() - total_t0
  est_cost = (total_latency / 3600.0) * max(0.0, float(hourly_rate or 0.0))

  # Ground truth

  metric_cards += card("Missing labels", json.dumps(missing, ensure_ascii=False) if gt_labels is not None else "—", "Expected but not predicted")
  metric_cards += card("Extra labels", json.dumps(extra, ensure_ascii=False) if gt_labels is not None else "—", "Predicted but not expected")

+ # Diagnostics cards — now with TWO latency measures
  diag_cards = ""
  diag_cards += card("Model / Rev", f"{repo} / {revision}")
  diag_cards += card("Device", f"{DEVICE} ({GPU_NAME})")
  diag_cards += card("Precision dtype", f"{DTYPE_FALLBACK}")
  diag_cards += card("4-bit", f"{bool(load_in_4bit)}")
  diag_cards += card("Allowed labels", json.dumps(allowed, ensure_ascii=False))
+ diag_cards += card("Effective text length", f"before={effective_len_before} chars → after={effective_len_after} ({cap_info})")
  diag_cards += card("Tokens", f"prompt={tok_stats['prompt_tokens']}, output={tok_stats['output_tokens']}, total={tok_stats['total_tokens']}", "Token counts influence latency & cost")
+ diag_cards += card("Model latency", f"{model_latency:.2f} s", "Time spent in model.generate(...) only")
+ diag_cards += card("Total latency", f"{total_latency:.2f} s", "End-to-end time (preprocess → model → postprocess)")
  diag_cards += card("Cost (est.)", f"${(est_cost):.6f} @ {hourly_rate:.4f}/hr")
  diag_cards += card("Fallback sampling used", "Yes" if fallback_used else "No", "Sampling can be slower/unstable on T4; off by default")

  "extra": extra if gt_labels is not None else None,
  "per_label": per_label if gt_labels is not None else None,
  "token_stats": tok_stats,
+ "model_latency_seconds": round(model_latency, 3),
+ "total_latency_seconds": round(total_latency, 3),
  "estimated_cost_usd": round(est_cost, 6),
  "fallback_used": fallback_used,
+ "cap_info": cap_info,
  }

  return (

  model = get_model(repo, revision, load_in_4bit)

  rows = [["filename","labels"]]
+ per_sample_rows = [["filename","pred_labels","gold_labels","precision","recall","f1","exact_match","hamming_loss","missing","extra","model_latency_s","total_latency_s","prompt_tokens","output_tokens"]]
  totals = {"tp":0,"fp":0,"fn":0,"pred_total":0,"gold_total":0}
  label_global = {lab: {"tp":0,"fp":0,"fn":0} for lab in allowed}
+ total_prompt_tokens = 0; total_output_tokens = 0; sum_model_s = 0.0; sum_total_s = 0.0
+ n=0; with_gt=0

  system = system_instructions or SYSTEM_INSTRUCTIONS_BASE

  except Exception:
  rows.append([name, "[] # unreadable"]); continue

+ total_t0 = time.perf_counter()  # TOTAL latency per file
+
  if preprocess:
  txt = preprocess_text(txt, add_header, strip_smalltalk)
+ txt_windowed, cap_info = window_then_cap(txt, soft_token_cap)

+ prompt = build_prompt(system, context_text or CONTEXT_GUIDE, txt_windowed, allowed, use_fewshot)

+ raw_json, tok_stats, model_latency = model.generate_json(prompt, max_new_tokens=48, allow_sampling=False)
  pred = safe_json_labels(raw_json, allowed)
  if enable_fallback_sampling and not pred:
+ raw_json2, tok_stats2, model_latency2 = model.generate_json(prompt, max_new_tokens=48, allow_sampling=True)
  pred2 = safe_json_labels(raw_json2, allowed)
  if pred2:
+ pred = pred2; tok_stats = tok_stats2; model_latency = model_latency2
+
+ total_latency = time.perf_counter() - total_t0

  total_prompt_tokens += tok_stats["prompt_tokens"]
  total_output_tokens += tok_stats["output_tokens"]
+ sum_model_s += model_latency
+ sum_total_s += total_latency
  n += 1

  rows.append([name, json.dumps(pred, ensure_ascii=False)])

  round(ham,4),
  json.dumps(missing, ensure_ascii=False),
  json.dumps(extra, ensure_ascii=False),
+ round(model_latency,3), round(total_latency,3),
+ tok_stats["prompt_tokens"], tok_stats["output_tokens"],
+ ])
+ else:
+ per_sample_rows.append([
+ name, json.dumps(pred, ensure_ascii=False), None, None, None, None, None, None, None, None,
+ round(model_latency,3), round(total_latency,3),
+ tok_stats["prompt_tokens"], tok_stats["output_tokens"],
  ])

  tp, fp, fn = totals["tp"], totals["fp"], totals["fn"]
  prec = tp / (tp + fp) if (tp + fp) else 0.0
  rec = tp / (tp + fn) if (tp + fn) else 0.0
  f1 = 2 * prec * rec / (prec + rec) if (prec + rec) else 0.0
+ est_cost = (sum_total_s / 3600.0) * max(0.0, float(hourly_rate or 0.0))

  coverage = {lab: 0 for lab in allowed}
  for r in rows[1:]:
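For reference, a tiny worked example of the micro-averaged metrics computed above (counts invented):

```python
tp, fp, fn = 6, 2, 3
prec = tp / (tp + fp)                 # 6/8 = 0.75
rec = tp / (tp + fn)                  # 6/9 ≈ 0.667
f1 = 2 * prec * rec / (prec + rec)    # ≈ 0.706
```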
 
  "avg_prompt_tokens": round(total_prompt_tokens / n, 2) if n else 0.0,
  "avg_output_tokens": round(total_output_tokens / n, 2) if n else 0.0,
  },
+ "latency_seconds_model_total": round(sum_model_s, 3),
+ "latency_seconds_total": round(sum_total_s, 3),
+ "avg_model_latency_seconds": round(sum_model_s / n, 3) if n else 0.0,
+ "avg_total_latency_seconds": round(sum_total_s / n, 3) if n else 0.0,
  "estimated_cost_usd": round(est_cost, 6),
  }

  diag_cards += card("4-bit", f"{bool(load_in_4bit)}")
  diag_cards += card("Files processed", f"{n} (with GT: {with_gt})")
  diag_cards += card("Tokens (totals)", f"prompt={total_prompt_tokens}, output={total_output_tokens}")
+ diag_cards += card("Latency (model)", f"total={summary['latency_seconds_model_total']} s, avg={summary['avg_model_latency_seconds']} s")
+ diag_cards += card("Latency (total)", f"total={summary['latency_seconds_total']} s, avg={summary['avg_total_latency_seconds']} s")
  diag_cards += card("Cost (est.)", f"${summary['estimated_cost_usd']} @ {hourly_rate:.4f}/hr")
  diag_cards += card("Allowed labels", json.dumps(allowed, ensure_ascii=False))


  # ----------------------------- UI -----------------------------------

+ with gr.Blocks(title="From Talk to Task — Windowed + Two Latencies") as demo:
  gr.Markdown(
  f"""
  # From Talk to Task — Accuracy & Diagnostics (EN/FR/DE/IT)

  **Default model:** `{DEFAULT_REPO}` (GPU + 4-bit recommended).
+ Now includes **keyword windowing** (keeps early cues) and **two latency measures**:
+ - **Model latency:** time spent inside the model generate call
+ - **Total latency:** end-to-end time (preprocess → model → postprocess)
+
+ Upload ground truth to compute **Precision / Recall / F1 / Exact match / Hamming loss**.
  Upload a **Rules JSON** (`{{"labels":[...]}}`) to override allowed labels.

  **Model output schema:** `{{"labels": [...]}}`
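As an illustration of the Rules JSON mentioned above, a file with this structure overrides the allowed label set (the label names here are the ones used in FEW_SHOTS; the app's full DEFAULT_LABEL_SET is defined elsewhere in app.py and may be larger):

```python
# Contents of an example rules.json, shown as a Python dict for brevity:
rules = {"labels": [
    "schedule_meeting",
    "update_contact_info_non_postal",
    "update_contact_info_postal_address",
    "update_kyc_total_assets",
]}
```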
 
  context = gr.Textbox(label="Context (User prefix)", value=CONTEXT_GUIDE, lines=6)

  with gr.Row():
+ soft_cap = gr.Slider(512, 32768, value=1024, step=1, label="Soft token cap (approx; applied after keyword windows)")
  preprocess = gr.Checkbox(value=True, label="Enable preprocessing")
  add_header = gr.Checkbox(value=True, label="Add cues header")
  strip_smalltalk = gr.Checkbox(value=False, label="Strip smalltalk")