RishiRP committed on
Commit b80450d · verified · 1 Parent(s): 5a60238

Update app.py

Files changed (1)
  1. app.py +372 -741
app.py CHANGED
@@ -1,788 +1,419 @@
1
- # app.py
2
- # From Talk to Task — Windowed extraction + two latency measures
3
- # Model: swiss-ai/Apertus-8B-Instruct-2509
4
- # Few-shot: 1 each EN/FR/DE/IT; deterministic by default; optional sampling fallback toggle.
5
- # Soft token cap: 1024 by default. CUDA fp16 + optional 4-bit. GT scoring + downloads.
6
-
7
- import os
8
- import re
9
- import time
10
- import json
11
- import csv
12
- import zipfile
13
- from pathlib import Path
14
- from typing import Dict, Tuple, Optional, List
15
-
16
- import gradio as gr
17
-
18
- # --------------------------- MODEL / LABELS ---------------------------------
19
-
20
- DEFAULT_REPO = "swiss-ai/Apertus-8B-Instruct-2509"
21
-
22
- DEFAULT_LABEL_SET = [
23
- "plan_contact",
24
- "schedule_meeting",
25
- "update_contact_info_non_postal",
26
- "update_contact_info_postal_address",
27
- "update_kyc_activity",
28
- "update_kyc_origin_of_assets",
29
- "update_kyc_purpose_of_businessrelation",
30
- "update_kyc_total_assets",
31
- ]
32
 
33
- SYSTEM_INSTRUCTIONS_BASE = (
34
- "You are a task extraction assistant. Input transcript language may be English, French, "
35
- "German, or Italian. Return ONLY valid JSON with a single field:\n"
36
- '"labels": a list of strings chosen ONLY from the allowed label set.\n'
37
- "Do NOT add other fields or prose. Do NOT translate labels. If multiple labels apply, return all.\n"
38
- "If none apply, return an empty list."
39
- )
40
-
41
- CONTEXT_GUIDE = (
42
- "- plan_contact: conversation without a firm date/time\n"
43
- "- schedule_meeting: explicit date/time/modality is agreed\n"
44
- "- update_contact_info_non_postal: email/phone updates\n"
45
- "- update_contact_info_postal_address: mailing address updates\n"
46
- "- update_kyc_*: KYC updates (activity, purpose, origin of assets, total assets)\n"
47
- )
48
-
49
- # Few-shot: exactly one per language (compact)
50
- FEW_SHOTS = [
51
- # EN
52
- {"transcript": "Agent: Can we meet Friday 3pm on Teams?\nClient: Yes, Friday 3pm works.\nAgent: I’ll send the invite.",
53
- "labels": ["schedule_meeting"]},
54
- # FR
55
- {"transcript": "Client: Mon numéro a changé: +41 44 000 00 00.\nConseiller: Merci, je mets à jour vos coordonnées.",
56
- "labels": ["update_contact_info_non_postal"]},
57
- # DE
58
- {"transcript": "Kunde: Neue Postadresse: Musterstrasse 1, 8000 Zürich.\nBerater: Danke, ich aktualisiere die Postadresse.",
59
- "labels": ["update_contact_info_postal_address"]},
60
- # IT
61
- {"transcript": "Cliente: Totale patrimonio confermato a 8 milioni CHF.\nConsulente: Aggiorno i dati KYC sul totale degli asset.",
62
- "labels": ["update_kyc_total_assets"]},
63
- ]
64
 
65
- # --------------------- WRITABLE HF CACHE -----------------------------
66
-
67
- HOME = Path(os.environ.get("HOME", "/home/user"))
68
- CACHE_DIR = HOME / ".cache" / "huggingface"
69
- CACHE_DIR.mkdir(parents=True, exist_ok=True)
70
- os.environ.setdefault("HF_HOME", str(CACHE_DIR))
71
- os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
72
-
73
- HF_TOKEN = (os.environ.get("HF_TOKEN") or "").strip() or None
74
-
75
- # -------------------- TRANSFORMERS / TORCH ---------------------------
76
-
77
- try:
78
- import torch
79
- from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
80
- except Exception as e:
81
- raise RuntimeError(
82
- "Missing deps. requirements.txt must include: transformers>=4.56.0, torch, accelerate, huggingface_hub, bitsandbytes, gradio"
83
- ) from e
84
-
85
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
86
- GPU_NAME = torch.cuda.get_device_name(0) if DEVICE == "cuda" else "cpu"
87
- # T4 doesn't support bf16, so use fp16; CPU uses fp32
88
- DTYPE_FALLBACK = torch.float16 if DEVICE == "cuda" else torch.float32
89
-
90
- # -------------------------- HELPERS ---------------------------------
91
-
92
- RE_DISCLAIMER = re.compile(r"^\s*disclaimer\s*:", re.IGNORECASE)
93
- RE_DROP = re.compile(r"(readme|terms|synthetic transcript)", re.IGNORECASE)
94
- SMALLTALK_RE = re.compile(r"\b(thanks?|merci|grazie|danke|tsch(ü|u)ss|ciao|bye|ok(ay)?)\b", re.IGNORECASE)
95
-
96
- # keyword windows (EN/FR/DE/IT) — expand as needed
97
- WINDOW_KEYWORDS = [
98
- # meeting / schedule
99
- r"\b(meet|meeting|schedule|appointment|teams|zoom|google meet|calendar)\b",
100
- r"\b(rendez[- ]?vous|réunion|planifier|calendrier|teams|zoom)\b",
101
- r"\b(termin|treffen|besprechung|kalender|teams|zoom)\b",
102
- r"\b(appuntamento|riunione|calendario|teams|zoom)\b",
103
- # address / phone / email
104
- r"\b(address|street|avenue|road|postcode|phone|email)\b",
105
- r"\b(adresse|rue|avenue|code postal|téléphone|courriel|email)\b",
106
- r"\b(adresse|straße|strasse|plz|telefon|e-?mail)\b",
107
- r"\b(indirizzo|via|cap|telefono|e-?mail)\b",
108
- # KYC assets / totals / origin / purpose
109
- r"\b(total assets|net worth|portfolio|real estate|origin of assets|source of wealth|purpose of relationship)\b",
110
- r"\b(actifs totaux|patrimoine|immobilier|origine des fonds|source de richesse|but de la relation)\b",
111
- r"\b(gesamtverm(ö|o)gen|verm(ö|o)gen|immobilien|herkunft der verm(ö|o)genswerte|zweck der gesch(ä|a)ftsbeziehung)\b",
112
- r"\b(patrimonio totale|immobiliare|origine dei fondi|scopo della relazione)\b",
113
- r"\b(chf|eur|usd|cur[13]|francs?)\b",
114
- r"\b(\d{1,3}([.'’ ]\d{3})*(,\d+)?)(\s?(chf|eur|usd))\b",
115
- ]
116
 
117
- def _json_from_text(text: str) -> str:
118
- s = text.strip()
119
- if s.startswith("{") and s.endswith("}"):
120
- return s
121
- m = re.search(r"\{.*\}", s, re.DOTALL)
122
- return m.group(0) if m else '{"labels": []}'
123
 
124
- def safe_json_labels(s: str, allowed: List[str]) -> List[str]:
125
- try:
126
- data = json.loads(s)
127
- except Exception:
128
- return []
129
- labels = data.get("labels", [])
130
- clean, seen = [], set()
131
- for lab in labels:
132
- if lab in allowed and lab not in seen:
133
- clean.append(lab); seen.add(lab)
134
- return clean
135
-
136
- def read_rules_labels(file_obj: Optional[gr.File]) -> Optional[List[str]]:
137
- if not file_obj:
138
- return None
139
- try:
140
- data = json.loads(Path(file_obj.name).read_text(encoding="utf-8"))
141
- labs = data.get("labels", [])
142
- return [x for x in labs if isinstance(x, str)]
143
- except Exception:
144
- return None
145
 
146
- def read_single_ground_truth(file_obj: Optional[gr.File]) -> Optional[List[str]]:
147
- if not file_obj:
148
- return None
149
  try:
150
- data = json.loads(Path(file_obj.name).read_text(encoding="utf-8"))
151
- labels = data.get("labels", [])
152
- return [lab for lab in labels if isinstance(lab, str)]
153
  except Exception:
154
- return None
155
 
156
- def read_batch_ground_truth_zip(zip_file: Optional[gr.File]) -> Dict[str, List[str]]:
157
- out: Dict[str, List[str]] = {}
158
- if not zip_file:
159
  return out
160
- try:
161
- with zipfile.ZipFile(zip_file.name) as z:
162
- for name in z.namelist():
163
- if not name.lower().endswith(".json"):
164
- continue
165
- try:
166
- data = json.loads(z.read(name).decode("utf-8", errors="replace"))
167
- labs = [lab for lab in data.get("labels", []) if isinstance(lab, str)]
168
- out[Path(name).with_suffix("").name] = labs
169
- except Exception:
170
- pass
171
- except Exception:
172
- pass
173
  return out
174
 
175
- def build_fewshot_block(allowed: List[str]) -> str:
176
- shots = []
177
- for ex in FEW_SHOTS:
178
- shots.append(
179
- f"- Transcript:\n{ex['transcript']}\n- Correct labels (choose subset from {allowed}): {ex['labels']}\n"
180
- )
181
- return "\n".join(shots)
182
-
183
- def build_prompt(system: str, context: str, transcript: str, allowed: List[str], use_fewshot: bool) -> str:
184
- fewshot_section = f"\n### Examples\n{build_fewshot_block(allowed)}\n" if use_fewshot else ""
185
- return (
186
- f"### System\n{system}\n\n"
187
- f"### Allowed label set\n{allowed}\n\n"
188
- f"### Context\n{context}\n"
189
- f"{fewshot_section}\n"
190
- f"### Transcript\n{transcript}\n\n"
191
- "### Output\nReturn JSON only: {\"labels\": [...]}"
192
- )
193
-
194
- def prf1_accuracy(pred: List[str], gold: List[str]) -> Tuple[float, float, float, float, Dict[str, int]]:
195
- pset, gset = set(pred), set(gold)
196
- tp = len(pset & gset); fp = len(pset - gset); fn = len(gset - pset)
197
- prec = tp / (tp + fp) if (tp + fp) else 0.0
198
- rec = tp / (tp + fn) if (tp + fn) else 0.0
199
- f1 = 2 * prec * rec / (prec + rec) if (prec + rec) else 0.0
200
- denom = len(pset | gset)
201
- acc = (tp / denom) if denom else 1.0
202
- return prec, rec, f1, acc, {"tp": tp, "fp": fp, "fn": fn, "pred_total": len(pset), "gold_total": len(gset)}
203
-
204
- def per_label_counts(pred: List[str], gold: List[str], all_labels: List[str]) -> Dict[str, Dict[str, int]]:
205
- pset, gset = set(pred), set(gold)
206
- out = {}
207
- for lab in all_labels:
208
- tp = 1 if (lab in pset and lab in gset) else 0
209
- fp = 1 if (lab in pset and lab not in gset) else 0
210
- fn = 1 if (lab in gset and lab not in pset) else 0
211
- out[lab] = {"tp": tp, "fp": fp, "fn": fn}
212
- return out
213
-
214
- def hamming_loss(pred: List[str], gold: List[str], all_labels: List[str]) -> float:
215
- pset, gset = set(pred), set(gold)
216
- wrong = 0
217
- for lab in all_labels:
218
- in_p, in_g = (lab in pset), (lab in gset)
219
- wrong += int(in_p != in_g)
220
- return wrong / max(1, len(all_labels))
221
-
222
- def write_csv(path: Path, rows: List[List[str]]):
223
- with path.open("w", newline="", encoding="utf-8") as f:
224
- w = csv.writer(f); w.writerows(rows)
225
-
226
- def card_markdown(title: str, value: str, hint: str = "") -> str:
227
- hint_md = f"<div style='font-size:12px;opacity:0.8'>{hint}</div>" if hint else ""
228
- return f"""
229
- <div style="border:1px solid #3a3a3a;border-radius:10px;padding:10px;margin:6px">
230
- <div style="font-weight:600">{title}</div>
231
- <div style="font-size:20px;margin-top:4px">{value}</div>
232
- {hint_md}
233
- </div>
234
- """
235
-
236
- # ------------------- WINDOWED EXTRACTION (fix for empty labels) -------------------
237
-
238
- def extract_windows(text: str, max_windows: int = 6, half_span_lines: int = 3) -> str:
239
- """
240
- Find up to `max_windows` windows around keyword hits; each window is ±`half_span_lines` lines.
241
- If no hits, return the opening chunk (first ~2000 lines) rather than the tail (a common cause of misses).
242
- """
243
- lines = text.splitlines()
244
- n = len(lines)
245
- # collect hit line indices
246
- hits: List[int] = []
247
- pattern = re.compile("|".join(WINDOW_KEYWORDS), re.IGNORECASE)
248
- for i, ln in enumerate(lines):
249
- if pattern.search(ln):
250
- hits.append(i)
251
- # de-duplicate and cap
252
- unique_hits = []
253
- seen = set()
254
- for idx in hits:
255
- # bucket nearby hits to avoid redundant windows
256
- bucket = idx // 2 # coarse bucketing
257
- if bucket not in seen:
258
- seen.add(bucket)
259
- unique_hits.append(idx)
260
- unique_hits = unique_hits[:max_windows]
261
-
262
- if not unique_hits:
263
- # return the opening chunk; most KYC/context often appears early
264
- return "\n".join(lines[: min(2000, n)])
265
-
266
- # Build windows and merge
267
- windows = []
268
- for idx in unique_hits:
269
- a = max(0, idx - half_span_lines)
270
- b = min(n, idx + half_span_lines + 1)
271
- windows.append("\n".join(lines[a:b]))
272
- return "\n...\n".join(windows)
273
-
274
- # -------------------------- MODEL -----------------------------------
275
-
276
- class HFModel:
277
- def __init__(
278
- self,
279
- repo_id: str,
280
- revision: Optional[str],
281
- token: Optional[str],
282
- load_in_4bit: bool,
283
- dtype
284
- ):
285
  self.repo_id = repo_id
286
- self.revision = revision or "main"
287
- self.token = token
288
- self.load_in_4bit = load_in_4bit and (DEVICE == "cuda")
289
- self.dtype = dtype
290
  self.tokenizer = None
291
  self.model = None
292
 
293
  def load(self):
294
  qcfg = None
295
- if self.load_in_4bit:
296
  qcfg = BitsAndBytesConfig(
297
  load_in_4bit=True,
298
  bnb_4bit_quant_type="nf4",
299
  bnb_4bit_compute_dtype=torch.float16,
300
  bnb_4bit_use_double_quant=True,
301
  )
302
- self.tokenizer = AutoTokenizer.from_pretrained(
303
- self.repo_id, revision=self.revision, token=self.token,
304
- cache_dir=str(CACHE_DIR), use_fast=True, trust_remote_code=True
305
  )
306
- self.model = AutoModelForCausalLM.from_pretrained(
307
- self.repo_id, revision=self.revision, token=self.token,
308
- cache_dir=str(CACHE_DIR), trust_remote_code=True,
309
- torch_dtype=self.dtype,
310
  device_map="auto" if DEVICE == "cuda" else None,
311
- quantization_config=qcfg, low_cpu_mem_usage=True
 
 
312
  )
313
- if DEVICE == "cpu":
314
- self.model = self.model.to(DEVICE)
315
 
316
  @torch.inference_mode()
317
- def generate_json(self, prompt: str, max_new_tokens=48, allow_sampling=False) -> Tuple[str, Dict[str, int], float]:
318
- """
319
- Deterministic by default. If allow_sampling=True (toggle), we use mild temperature.
320
- Returns (json_text, token_stats, model_latency_seconds)
321
- """
322
- tok = self.tokenizer
323
- mdl = self.model
324
-
325
- messages = [{"role": "user", "content": prompt}]
326
- templated = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
327
- inputs = tok([templated], return_tensors="pt", add_special_tokens=False).to(mdl.device)
328
-
329
- kwargs = dict(
330
- max_new_tokens=max_new_tokens,
331
- pad_token_id=tok.eos_token_id,
332
- eos_token_id=tok.eos_token_id,
333
- )
334
- if allow_sampling:
335
- kwargs.update(dict(do_sample=True, temperature=0.25, top_p=0.9))
336
  else:
337
- kwargs.update(dict(do_sample=False, temperature=0.0, top_p=1.0))
338
-
339
- t0 = time.perf_counter()
340
- out = mdl.generate(**inputs, **kwargs)
341
- model_latency = time.perf_counter() - t0
342
-
343
- prompt_tokens = int(inputs.input_ids.shape[-1])
344
- output_tokens = int(out.shape[-1] - inputs.input_ids.shape[-1])
345
- total_tokens = prompt_tokens + output_tokens
346
-
347
- decoded = tok.decode(out[0], skip_special_tokens=True)
348
- gen = decoded[len(templated):].strip() if decoded.startswith(templated) else decoded
349
- return _json_from_text(gen), {
350
- "prompt_tokens": prompt_tokens,
351
- "output_tokens": output_tokens,
352
- "total_tokens": total_tokens,
353
- }, model_latency
354
-
355
- _MODEL_CACHE: Dict[Tuple[str, Optional[str], bool], HFModel] = {}
356
-
357
- def get_model(repo_id: str, revision: Optional[str], load_in_4bit: bool) -> HFModel:
358
- key = (repo_id, revision, load_in_4bit)
359
- if key in _MODEL_CACHE:
360
- return _MODEL_CACHE[key]
361
- mdl = HFModel(repo_id, revision, HF_TOKEN, load_in_4bit, DTYPE_FALLBACK)
362
- mdl.load()
363
- _MODEL_CACHE[key] = mdl
364
- return mdl
365
-
366
- # ---------------------- INFERENCE ROUTES ----------------------------
367
-
368
- def preprocess_text(txt: str, add_header: bool, strip_smalltalk: bool) -> str:
369
- lines = [ln.rstrip() for ln in txt.splitlines()]
370
- lines = [ln for ln in lines if not RE_DISCLAIMER.match(ln)]
371
- lines = [ln for ln in lines if not RE_DROP.search(ln)]
372
- if strip_smalltalk:
373
- lines = [ln for ln in lines if not SMALLTALK_RE.search(ln)]
374
- cleaned = "\n".join(lines[-32768:])
375
- return f"[EMAIL/MESSAGE SIGNAL]\n{cleaned}" if add_header else cleaned
376
-
377
- def window_then_cap(text: str, soft_token_cap: int) -> Tuple[str, str]:
378
- """
379
- Apply keyword windowing; then hard cap by approximate chars (~4 chars/token).
380
- Returns (final_text, info_string).
381
- """
382
- windowed = extract_windows(text)
383
- approx_chars = int(max(soft_token_cap, 0) * 4) if soft_token_cap else 0
384
- info = "windowed"
385
- if approx_chars and len(windowed) > approx_chars:
386
- windowed = windowed[:approx_chars]
387
- info = f"windowed + soft cap ~{soft_token_cap}t"
388
- return windowed, info
389
-
390
- def run_single(
391
- custom_repo_id: str,
392
- rules_json: Optional[gr.File],
393
- system_instructions: str,
394
- context_text: str,
395
- transcript: str,
396
- soft_token_cap: int,
397
- preprocess: bool,
398
- add_header: bool,
399
- strip_smalltalk: bool,
400
- load_in_4bit: bool,
401
- hourly_rate: float,
402
- gt_json_file: Optional[gr.File],
403
- use_fewshot: bool,
404
- enable_fallback_sampling: bool,
405
- ):
406
- """Returns: repo, revision, predicted_json, metric_cards_md, diag_cards_md, raw_metrics_json"""
407
-
408
- total_t0 = time.perf_counter() # TOTAL latency starts here
409
-
410
- repo = (custom_repo_id or DEFAULT_REPO).strip()
411
- revision = "main"
412
- allowed = read_rules_labels(rules_json) or DEFAULT_LABEL_SET
413
-
414
- # Preprocess + window + cap
415
- effective_len_before = len(transcript)
416
- if preprocess:
417
- transcript = preprocess_text(transcript, add_header, strip_smalltalk)
418
- windowed, cap_info = window_then_cap(transcript, soft_token_cap)
419
- effective_len_after = len(windowed)
420
-
421
- # Build prompt
422
- system = system_instructions or SYSTEM_INSTRUCTIONS_BASE
423
- prompt = build_prompt(system, context_text or CONTEXT_GUIDE, windowed, allowed, use_fewshot)
424
-
425
- model = get_model(repo, revision, load_in_4bit)
426
-
427
- # Deterministic pass only
428
- raw_json, tok_stats, model_latency = model.generate_json(prompt, max_new_tokens=48, allow_sampling=False)
429
- pred_labels = safe_json_labels(raw_json, allowed)
430
-
431
- # Optional fallback sampling (OFF by default)
432
- fallback_used = False
433
- if enable_fallback_sampling and not pred_labels:
434
- raw_json2, tok_stats2, model_latency2 = model.generate_json(prompt, max_new_tokens=48, allow_sampling=True)
435
- pred_labels2 = safe_json_labels(raw_json2, allowed)
436
- if pred_labels2:
437
- pred_labels = pred_labels2
438
- tok_stats = tok_stats2
439
- model_latency = model_latency2
440
- fallback_used = True
441
-
442
- total_latency = time.perf_counter() - total_t0
443
- est_cost = (total_latency / 3600.0) * max(0.0, float(hourly_rate or 0.0))
444
-
445
- # Ground truth
446
- gt_labels = read_single_ground_truth(gt_json_file)
447
- pr = rc = f1 = acc = 0.0
448
- ham = None
449
- missing = []; extra = []; per_label = {}
450
- if gt_labels is not None:
451
- pr, rc, f1, acc, counts = prf1_accuracy(pred_labels, gt_labels)
452
- ham = hamming_loss(pred_labels, gt_labels, allowed)
453
- per_label = per_label_counts(pred_labels, gt_labels, allowed)
454
- missing = sorted(list(set(gt_labels) - set(pred_labels)))
455
- extra = sorted(list(set(pred_labels) - set(gt_labels)))
456
-
457
- # Metric cards
458
- def card(title, val, hint=""):
459
- return card_markdown(title, val, hint)
460
- metric_cards = ""
461
- metric_cards += card("Precision", f"{pr:.3f}" if gt_labels is not None else "—", "Correct positives / All predicted positives")
462
- metric_cards += card("Recall", f"{rc:.3f}" if gt_labels is not None else "—", "Correct positives / All actual positives")
463
- metric_cards += card("F1 score", f"{f1:.3f}" if gt_labels is not None else "—", "Harmonic mean of Precision & Recall")
464
- metric_cards += card("Exact match", f"{1.0 if gt_labels and set(pred_labels)==set(gt_labels) else 0.0 if gt_labels is not None else '—'}", "1.0 if predicted set equals truth")
465
- metric_cards += card("Hamming loss", f"{ham:.3f}" if ham is not None else "—", "Fraction of labels where prediction disagrees with truth (lower better)")
466
- metric_cards += card("Missing labels", json.dumps(missing, ensure_ascii=False) if gt_labels is not None else "—", "Expected but not predicted")
467
- metric_cards += card("Extra labels", json.dumps(extra, ensure_ascii=False) if gt_labels is not None else "—", "Predicted but not expected")
468
-
469
- # Diagnostics cards — now with TWO latency measures
470
- diag_cards = ""
471
- diag_cards += card("Model / Rev", f"{repo} / {revision}")
472
- diag_cards += card("Device", f"{DEVICE} ({GPU_NAME})")
473
- diag_cards += card("Precision dtype", f"{DTYPE_FALLBACK}")
474
- diag_cards += card("4-bit", f"{bool(load_in_4bit)}")
475
- diag_cards += card("Allowed labels", json.dumps(allowed, ensure_ascii=False))
476
- diag_cards += card("Effective text length", f"before={effective_len_before} chars → after={effective_len_after} ({cap_info})")
477
- diag_cards += card("Tokens", f"prompt={tok_stats['prompt_tokens']}, output={tok_stats['output_tokens']}, total={tok_stats['total_tokens']}", "Token counts influence latency & cost")
478
- diag_cards += card("Model latency", f"{model_latency:.2f} s", "Time spent in model.generate(...) only")
479
- diag_cards += card("Total latency", f"{total_latency:.2f} s", "End-to-end time (preprocess → model → postprocess)")
480
- diag_cards += card("Cost (est.)", f"${(est_cost):.6f} @ {hourly_rate:.4f}/hr")
481
- diag_cards += card("Fallback sampling used", "Yes" if fallback_used else "No", "Sampling can be slower/unstable on T4; off by default")
482
-
483
- raw_metrics = {
484
- "labels_pred": pred_labels,
485
- "ground_truth_labels": gt_labels,
486
- "precision": round(pr, 4) if gt_labels is not None else None,
487
- "recall": round(rc, 4) if gt_labels is not None else None,
488
- "f1": round(f1, 4) if gt_labels is not None else None,
489
- "exact_match": 1.0 if gt_labels and set(pred_labels)==set(gt_labels) else (0.0 if gt_labels is not None else None),
490
- "hamming_loss": round(ham, 4) if ham is not None else None,
491
- "missing": missing if gt_labels is not None else None,
492
- "extra": extra if gt_labels is not None else None,
493
- "per_label": per_label if gt_labels is not None else None,
494
- "token_stats": tok_stats,
495
- "model_latency_seconds": round(model_latency, 3),
496
- "total_latency_seconds": round(total_latency, 3),
497
- "estimated_cost_usd": round(est_cost, 6),
498
- "fallback_used": fallback_used,
499
- "cap_info": cap_info,
500
- }
501
-
502
- return (
503
- repo, revision,
504
- json.dumps({"labels": pred_labels}, ensure_ascii=False),
505
- metric_cards, diag_cards,
506
- json.dumps(raw_metrics, indent=2)
507
- )
508
 
509
- def run_batch(
510
- custom_repo_id: str,
511
- rules_json: Optional[gr.File],
512
- system_instructions: str,
513
- context_text: str,
514
- transcripts_zip: Optional[gr.File],
515
- gt_zip: Optional[gr.File],
516
- soft_token_cap: int,
517
- preprocess: bool,
518
- add_header: bool,
519
- strip_smalltalk: bool,
520
- load_in_4bit: bool,
521
- hourly_rate: float,
522
- use_fewshot: bool,
523
- enable_fallback_sampling: bool,
524
- ):
525
- repo = (custom_repo_id or DEFAULT_REPO).strip()
526
- revision = "main"
527
- if not transcripts_zip:
528
- return repo, revision, "filename,labels\n", "<div>No transcript ZIP provided.</div>", "{}", None, None, None
529
-
530
- allowed = read_rules_labels(rules_json) or DEFAULT_LABEL_SET
531
  try:
532
- z = zipfile.ZipFile(transcripts_zip.name)
533
- txt_names = [n for n in z.namelist() if n.lower().endswith(".txt")]
534
  except Exception as e:
535
- return repo, revision, "filename,labels\n", f"<div>Bad transcript ZIP: {e}</div>", "{}", None, None, None
536
-
537
- gt_map = read_batch_ground_truth_zip(gt_zip)
538
- model = get_model(repo, revision, load_in_4bit)
539
-
540
- rows = [["filename","labels"]]
541
- per_sample_rows = [["filename","pred_labels","gold_labels","precision","recall","f1","exact_match","hamming_loss","missing","extra","model_latency_s","total_latency_s","prompt_tokens","output_tokens"]]
542
- totals = {"tp":0,"fp":0,"fn":0,"pred_total":0,"gold_total":0}
543
- label_global = {lab: {"tp":0,"fp":0,"fn":0} for lab in allowed}
544
- total_prompt_tokens = 0; total_output_tokens = 0; sum_model_s = 0.0; sum_total_s = 0.0
545
- n=0; with_gt=0
546
 
547
- system = system_instructions or SYSTEM_INSTRUCTIONS_BASE
 
548
 
549
- for name in txt_names:
550
- try:
551
- txt = z.read(name).decode("utf-8", errors="replace")
552
- except Exception:
553
- rows.append([name, "[] # unreadable"]); continue
554
-
555
- total_t0 = time.perf_counter() # TOTAL latency per file
556
-
557
- if preprocess:
558
- txt = preprocess_text(txt, add_header, strip_smalltalk)
559
- txt_windowed, cap_info = window_then_cap(txt, soft_token_cap)
560
-
561
- prompt = build_prompt(system, context_text or CONTEXT_GUIDE, txt_windowed, allowed, use_fewshot)
562
-
563
- raw_json, tok_stats, model_latency = model.generate_json(prompt, max_new_tokens=48, allow_sampling=False)
564
- pred = safe_json_labels(raw_json, allowed)
565
- if enable_fallback_sampling and not pred:
566
- raw_json2, tok_stats2, model_latency2 = model.generate_json(prompt, max_new_tokens=48, allow_sampling=True)
567
- pred2 = safe_json_labels(raw_json2, allowed)
568
- if pred2:
569
- pred = pred2; tok_stats = tok_stats2; model_latency = model_latency2
570
-
571
- total_latency = time.perf_counter() - total_t0
572
-
573
- total_prompt_tokens += tok_stats["prompt_tokens"]
574
- total_output_tokens += tok_stats["output_tokens"]
575
- sum_model_s += model_latency
576
- sum_total_s += total_latency
577
- n += 1
578
-
579
- rows.append([name, json.dumps(pred, ensure_ascii=False)])
580
-
581
- stem = Path(name).with_suffix("").name
582
- gold = gt_map.get(stem)
583
- if gold is not None:
584
- with_gt += 1
585
- pr, rc, f1, acc, counts = prf1_accuracy(pred, gold)
586
- ham = hamming_loss(pred, gold, allowed)
587
- missing = sorted(list(set(gold) - set(pred)))
588
- extra = sorted(list(set(pred) - set(gold)))
589
- for k in ["tp","fp","fn","pred_total","gold_total"]:
590
- totals[k] += counts[k]
591
- pl = per_label_counts(pred, gold, allowed)
592
- for lab, c in pl.items():
593
- for k in ["tp","fp","fn"]:
594
- label_global[lab][k] += c[k]
595
- per_sample_rows.append([
596
- name,
597
- json.dumps(pred, ensure_ascii=False),
598
- json.dumps(gold, ensure_ascii=False),
599
- round(pr,4), round(rc,4), round(f1,4),
600
- 1.0 if set(pred)==set(gold) else 0.0,
601
- round(ham,4),
602
- json.dumps(missing, ensure_ascii=False),
603
- json.dumps(extra, ensure_ascii=False),
604
- round(model_latency,3), round(total_latency,3),
605
- tok_stats["prompt_tokens"], tok_stats["output_tokens"],
606
- ])
607
- else:
608
- per_sample_rows.append([
609
- name, json.dumps(pred, ensure_ascii=False), None, None, None, None, None, None, None, None,
610
- round(model_latency,3), round(total_latency,3),
611
- tok_stats["prompt_tokens"], tok_stats["output_tokens"],
612
- ])
613
-
614
- tp, fp, fn = totals["tp"], totals["fp"], totals["fn"]
615
- prec = tp / (tp + fp) if (tp + fp) else 0.0
616
- rec = tp / (tp + fn) if (tp + fn) else 0.0
617
- f1 = 2 * prec * rec / (prec + rec) if (prec + rec) else 0.0
618
- est_cost = (sum_total_s / 3600.0) * max(0.0, float(hourly_rate or 0.0))
619
-
620
- coverage = {lab: 0 for lab in allowed}
621
- for r in rows[1:]:
622
- try:
623
- labs = set(json.loads(r[1]))
624
- for lab in labs:
625
- if lab in coverage:
626
- coverage[lab] += 1
627
- except Exception:
628
- pass
629
-
630
- summary = {
631
- "files_processed": n,
632
- "files_with_ground_truth": with_gt,
633
- "labels_allowed": allowed,
634
- "precision_micro": round(prec, 4),
635
- "recall_micro": round(rec, 4),
636
- "f1_micro": round(f1, 4),
637
- "per_label_counts": label_global,
638
- "coverage_counts": coverage,
639
- "token_stats": {
640
- "prompt_tokens_total": total_prompt_tokens,
641
- "output_tokens_total": total_output_tokens,
642
- "total_tokens": total_prompt_tokens + total_output_tokens,
643
- "avg_prompt_tokens": round(total_prompt_tokens / n, 2) if n else 0.0,
644
- "avg_output_tokens": round(total_output_tokens / n, 2) if n else 0.0,
645
- },
646
- "latency_seconds_model_total": round(sum_model_s, 3),
647
- "latency_seconds_total": round(sum_total_s, 3),
648
- "avg_model_latency_seconds": round(sum_model_s / n, 3) if n else 0.0,
649
- "avg_total_latency_seconds": round(sum_total_s / n, 3) if n else 0.0,
650
- "estimated_cost_usd": round(est_cost, 6),
651
- }
652
-
653
- # Diagnostic cards (HTML)
654
- diag_cards = ""
655
- def card(title, val, hint=""):
656
- return card_markdown(title, val, hint)
657
- diag_cards += card("Model / Rev", f"{repo} / {revision}")
658
- diag_cards += card("Device", f"{DEVICE} ({GPU_NAME})")
659
- diag_cards += card("Precision dtype", f"{DTYPE_FALLBACK}")
660
- diag_cards += card("4-bit", f"{bool(load_in_4bit)}")
661
- diag_cards += card("Files processed", f"{n} (with GT: {with_gt})")
662
- diag_cards += card("Tokens (totals)", f"prompt={total_prompt_tokens}, output={total_output_tokens}")
663
- diag_cards += card("Latency (model)", f"total={summary['latency_seconds_model_total']} s, avg={summary['avg_model_latency_seconds']} s")
664
- diag_cards += card("Latency (total)", f"total={summary['latency_seconds_total']} s, avg={summary['avg_total_latency_seconds']} s")
665
- diag_cards += card("Cost (est.)", f"${summary['estimated_cost_usd']} @ {hourly_rate:.4f}/hr")
666
- diag_cards += card("Allowed labels", json.dumps(allowed, ensure_ascii=False))
667
-
668
- # Artifacts
669
- tmp_dir = Path("/tmp")
670
- pred_csv = tmp_dir / "predictions.csv"
671
- per_sample_csv = tmp_dir / "per_sample_metrics.csv"
672
- summary_json = tmp_dir / "summary_metrics.json"
673
- with pred_csv.open("w", newline="", encoding="utf-8") as f:
674
- w = csv.writer(f); w.writerows(rows)
675
- with per_sample_csv.open("w", newline="", encoding="utf-8") as f:
676
- w = csv.writer(f); w.writerows(per_sample_rows)
677
- summary_json.write_text(json.dumps(summary, indent=2), encoding="utf-8")
678
-
679
- return (
680
- repo, revision,
681
- "\n".join([",".join(r) for r in rows]),
682
- diag_cards,
683
- json.dumps(summary, indent=2),
684
- str(pred_csv), str(per_sample_csv), str(summary_json)
685
  )
686
 
687
- # ----------------------------- UI -----------------------------------
688
 
689
- with gr.Blocks(title="From Talk to Task — Windowed + Two Latencies") as demo:
 
690
  gr.Markdown(
691
- f"""
692
- # From Talk to Task: Accuracy & Diagnostics (EN/FR/DE/IT)
693
-
694
- **Default model:** `{DEFAULT_REPO}` (GPU + 4-bit recommended).
695
- Now includes **keyword windowing** (keeps early cues) and **two latency measures**:
696
- - **Model latency:** time spent inside the model generate call
697
- - **Total latency:** end-to-end time (preprocess → model → postprocess)
698
-
699
- Upload ground truth to compute **Precision / Recall / F1 / Exact match / Hamming loss**.
700
- Upload a **Rules JSON** (`{{"labels":[...]}}`) to override allowed labels.
701
-
702
- **Model output schema:** `{{"labels": [...]}}`
703
- """
704
  )
705
 
706
  with gr.Row():
707
- custom_repo = gr.Textbox(
708
- label="Model repo (empty → default)",
709
- placeholder="e.g. swiss-ai/Apertus-8B-Instruct-2509"
710
- )
711
- load_4bit = gr.Checkbox(value=True, label="Load in 4-bit (GPU only)")
712
- use_fewshot = gr.Checkbox(value=True, label="Use few-shot (1 per EN/FR/DE/IT)")
713
- enable_fallback_sampling = gr.Checkbox(value=False, label="Enable fallback sampling (slower/unstable on T4)")
714
-
715
- rules_file = gr.File(label="Rules JSON (optional) — overrides allowed labels", file_types=[".json"])
716
-
717
- system = gr.Textbox(label="Instructions (System)", value=SYSTEM_INSTRUCTIONS_BASE, lines=6)
718
- context = gr.Textbox(label="Context (User prefix)", value=CONTEXT_GUIDE, lines=6)
719
-
720
- with gr.Row():
721
- soft_cap = gr.Slider(512, 32768, value=1024, step=1, label="Soft token cap (approx; applied after keyword windows)")
722
- preprocess = gr.Checkbox(value=True, label="Enable preprocessing")
723
- add_header = gr.Checkbox(value=True, label="Add cues header")
724
- strip_smalltalk = gr.Checkbox(value=False, label="Strip smalltalk")
725
- hourly_rate = gr.Number(value=0.40, precision=4, label="Hourly hardware price (USD) for cost estimate")
726
-
727
- with gr.Tabs():
728
- with gr.Tab("Single Transcript"):
729
- transcript = gr.Textbox(label="Paste transcript (EN/FR/DE/IT)", lines=14)
730
- gt_single = gr.File(label="Ground truth JSON — {\"labels\": [..]}", file_types=[".json"])
731
- run_btn = gr.Button("Run (Single)", variant="primary")
732
-
733
- repo_used = gr.Textbox(label="Repo used", interactive=False)
734
- rev_used = gr.Textbox(label="Revision", interactive=False)
735
- json_out = gr.Code(label="Predicted JSON", language="json")
736
-
737
- metric_cards_md = gr.HTML(label="Metrics (cards)")
738
- diag_cards_md = gr.HTML(label="Diagnostics (cards)")
739
- raw_metrics = gr.Code(label="Raw metrics JSON", language="json")
740
-
741
- def _single(*args):
742
- return run_single(*args)
743
-
744
- run_btn.click(
745
- _single,
746
- inputs=[
747
- custom_repo, rules_file, system, context, transcript,
748
- soft_cap, preprocess, add_header, strip_smalltalk,
749
- load_4bit, hourly_rate, gt_single, use_fewshot, enable_fallback_sampling
750
- ],
751
- outputs=[repo_used, rev_used, json_out, metric_cards_md, diag_cards_md, raw_metrics],
752
  )
753
-
754
- with gr.Tab("Batch (ZIP)"):
755
- zip_in = gr.File(label="Upload ZIP of .txt transcripts", file_types=[".zip"])
756
- gt_zip = gr.File(label="Upload ZIP of ground truth .json (match basenames)", file_types=[".zip"])
757
- run_batch_btn = gr.Button("Run (Batch)", variant="primary")
758
-
759
- repo_used_b = gr.Textbox(label="Repo used", interactive=False)
760
- rev_used_b = gr.Textbox(label="Revision", interactive=False)
761
- csv_out = gr.Textbox(label="Predictions CSV (filename,labels)", lines=12)
762
-
763
- diag_cards_b = gr.HTML(label="Diagnostics (cards)")
764
- metrics_out_b = gr.Code(label="Summary metrics JSON", language="json")
765
-
766
- preds_file = gr.File(label="Download predictions.csv")
767
- per_sample_file = gr.File(label="Download per_sample_metrics.csv")
768
- summary_file = gr.File(label="Download summary_metrics.json")
769
-
770
- def _batch(*args):
771
- return run_batch(*args)
772
-
773
- run_batch_btn.click(
774
- _batch,
775
- inputs=[
776
- custom_repo, rules_file, system, context, zip_in, gt_zip,
777
- soft_cap, preprocess, add_header, strip_smalltalk,
778
- load_4bit, hourly_rate, use_fewshot, enable_fallback_sampling
779
- ],
780
- outputs=[repo_used_b, rev_used_b, csv_out, diag_cards_b, metrics_out_b, preds_file, per_sample_file, summary_file],
781
  )
782
 
783
- gr.Markdown(
784
- f"- **HF_TOKEN:** {'✅ set' if HF_TOKEN else '⚠️ not set (only needed for gated/private)'} \n"
785
- f"- **Device:** {DEVICE} ({GPU_NAME}) | **DType:** {DTYPE_FALLBACK} | **Cache dir:** `{CACHE_DIR}`"
786
  )
787
 
788
  if __name__ == "__main__":
1
 
2
+ Allowed Labels (strict, case-insensitive match; output must use canonical label text exactly):
3
+ {allowed_labels_list}
4
+
5
+ Instructions:
6
+ 1) Extract every concrete task the advisor or client must complete.
7
+ 2) For each, choose ONE label from Allowed Labels (or leave empty if none match).
8
+ 3) Output STRICT JSON only, no prose:
9
+ {{
10
+ "labels": ["LabelA","LabelB", ...],
11
+ "tasks": [
12
+ {{"label": "LabelA", "explanation": "…", "evidence": "…"}},
13
+ {{"label": "LabelB", "explanation": "…", "evidence": "…"}}
14
+ ]
15
+ }}
16
+ """
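For reference, a hypothetical response that satisfies this schema might look like the following (the doubled braces above are only format-string escapes; the label name here is illustrative and would come from DEFAULT_ALLOWED_LABELS or the UI override):

{
  "labels": ["schedule_meeting"],
  "tasks": [
    {"label": "schedule_meeting", "explanation": "Client agreed to meet Friday at 3pm on Teams.", "evidence": "Client: Yes, Friday 3pm works."}
  ]
}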
17
 
18
+ # =========================
19
+ # Utilities
20
+ # =========================
21
+ def _now_ms() -> int:
22
+ return int(time.time() * 1000)
23
+
24
+ def read_file_to_text(file) -> str:
+ # gr.File(type="filepath") passes a path string, so open the file rather than calling .read() on it
+ if not file:
+ return ""
+ path = file if isinstance(file, str) else file.name
+ name = path.lower()
+ with open(path, "rb") as fh:
+ data = fh.read()
29
+ # Restrict to light parsers (txt/md/json) for speed/reliability
30
+ if name.endswith(".json"):
31
+ try:
32
+ obj = json.loads(data.decode("utf-8", errors="ignore"))
33
+ # Accept either {"transcript": "..."} or list/str
34
+ if isinstance(obj, dict) and "transcript" in obj:
35
+ return str(obj["transcript"])
36
+ return json.dumps(obj, ensure_ascii=False)
37
+ except Exception:
38
+ return data.decode("utf-8", errors="ignore")
39
+ else:
40
+ # txt / md or anything texty
41
+ try:
42
+ return data.decode("utf-8", errors="ignore")
43
+ except Exception:
44
+ try:
45
+ return data.decode("latin-1", errors="ignore")
46
+ except Exception:
47
+ return ""
48
 
49
+ def normalize_labels(labels: List[str]) -> List[str]:
50
+ return list(dict.fromkeys([l.strip() for l in labels if isinstance(l, str) and l.strip()]))
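A quick sanity check of the intended strip/dedupe behaviour (hypothetical input):

normalize_labels(["plan_contact ", "plan_contact", "", "schedule_meeting"])
# -> ["plan_contact", "schedule_meeting"]  (whitespace stripped, empties dropped, order-preserving dedupe)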
51
 
52
+ def canonicalize_map(allowed: List[str]) -> Dict[str, str]:
53
+ """
54
+ Build a case-insensitive map: lowercase -> canonical label
55
+ """
56
+ m = {}
57
+ for lab in allowed:
58
+ m[lab.lower()] = lab
59
+ return m
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
+ def robust_json_extract(text: str) -> Dict[str, Any]:
62
+ """
63
+ Try to parse strict JSON from model output.
64
+ If the model added extra tokens, strip to first {...} block.
65
+ """
66
+ if not text:
67
+ return {"labels": [], "tasks": []}
68
+
69
+ # Find first JSON object
70
+ start = text.find("{")
71
+ end = text.rfind("}")
72
+ if start != -1 and end != -1 and end > start:
73
+ candidate = text[start : end + 1]
74
+ else:
75
+ candidate = text
76
+
77
+ # Remove trailing junk commas and try json.loads
78
  try:
79
+ return json.loads(candidate)
 
 
80
  except Exception:
81
+ # Fallback: try to repair common issues
82
+ candidate = re.sub(r",\s*}", "}", candidate)
83
+ candidate = re.sub(r",\s*]", "]", candidate)
84
+ try:
85
+ return json.loads(candidate)
86
+ except Exception:
87
+ return {"labels": [], "tasks": []}
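A sanity check of the repair branch (hypothetical model output with surrounding prose and trailing commas):

robust_json_extract('Sure! {"labels": ["LabelA",], "tasks": [],} end of answer')
# -> {"labels": ["LabelA"], "tasks": []}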
88
 
89
+ def restrict_to_allowed(pred: Dict[str, Any], allowed: List[str]) -> Dict[str, Any]:
90
+ """
91
+ Keep only tasks whose label ∈ allowed; map case-insensitively to canonical.
92
+ """
93
+ out = {"labels": [], "tasks": []}
94
+ if not isinstance(pred, dict):
95
  return out
96
+ raw_labels = pred.get("labels", []) or []
97
+ raw_tasks = pred.get("tasks", []) or []
98
+
99
+ allowed_map = canonicalize_map(allowed)
100
+
101
+ # Filter labels
102
+ filt_labels: List[str] = []
103
+ for l in raw_labels:
104
+ if not isinstance(l, str):
105
+ continue
106
+ k = l.strip().lower()
107
+ if k in allowed_map:
108
+ filt_labels.append(allowed_map[k])
109
+ filt_labels = normalize_labels(filt_labels)
110
+
111
+ # Filter tasks
112
+ filt_tasks = []
113
+ for t in raw_tasks:
114
+ if not isinstance(t, dict):
115
+ continue
116
+ lbl = t.get("label", "")
117
+ k = str(lbl).strip().lower()
118
+ if k in allowed_map:
119
+ new_t = dict(t)
120
+ new_t["label"] = allowed_map[k]
121
+ filt_tasks.append(new_t)
122
+
123
+ # Ensure labels reflect tasks (union)
124
+ from_tasks = [tt["label"] for tt in filt_tasks if isinstance(tt.get("label"), str)]
125
+ merged = normalize_labels(list(set(filt_labels) | set(from_tasks)))
126
+
127
+ out["labels"] = merged
128
+ out["tasks"] = filt_tasks
129
  return out
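A hypothetical round trip through this filter, assuming "schedule_meeting" is the only allowed label:

pred = {"labels": ["SCHEDULE_MEETING", "made_up_label"],
        "tasks": [{"label": "Schedule_Meeting", "explanation": "Meeting agreed for Friday.", "evidence": "Friday 3pm works."}]}
restrict_to_allowed(pred, ["schedule_meeting"])
# -> {"labels": ["schedule_meeting"],
#     "tasks": [{"label": "schedule_meeting", "explanation": "Meeting agreed for Friday.", "evidence": "Friday 3pm works."}]}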
130
 
131
+ def truncate_tokens(tokenizer, text: str, max_input_tokens: int) -> str:
132
+ if max_input_tokens <= 0:
133
+ return text
134
+ toks = tokenizer(text, add_special_tokens=False, return_attention_mask=False, return_tensors=None)["input_ids"]
135
+ if len(toks) <= max_input_tokens:
136
+ return text
137
+ # Keep the tail (most recent part of the convo often carries actionable tasks)
138
+ keep_ids = toks[-max_input_tokens:]
139
+ return tokenizer.decode(keep_ids, skip_special_tokens=True)
140
+
141
+ # =========================
142
+ # Model Loading
143
+ # =========================
144
+ class ModelWrapper:
145
+ def __init__(self, repo_id: str, hf_token: Optional[str], load_in_4bit: bool):
146
  self.repo_id = repo_id
147
+ self.hf_token = hf_token
148
+ self.load_in_4bit = load_in_4bit
 
 
149
  self.tokenizer = None
150
  self.model = None
151
 
152
  def load(self):
153
  qcfg = None
154
+ if self.load_in_4bit and DEVICE == "cuda":
155
  qcfg = BitsAndBytesConfig(
156
  load_in_4bit=True,
157
  bnb_4bit_quant_type="nf4",
158
  bnb_4bit_compute_dtype=torch.float16,
159
  bnb_4bit_use_double_quant=True,
160
  )
161
+
162
+ tok = AutoTokenizer.from_pretrained(
163
+ self.repo_id,
164
+ token=self.hf_token,
165
+ cache_dir=str(SPACE_CACHE),
166
+ trust_remote_code=True,
167
+ use_fast=True,
168
  )
169
+ # Some models lack pad token—safe default
170
+ if tok.pad_token is None and tok.eos_token is not None:
171
+ tok.pad_token = tok.eos_token
172
+
173
+ model = AutoModelForCausalLM.from_pretrained(
174
+ self.repo_id,
175
+ token=self.hf_token,
176
+ cache_dir=str(SPACE_CACHE),
177
+ trust_remote_code=True,
178
+ torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
179
  device_map="auto" if DEVICE == "cuda" else None,
180
+ low_cpu_mem_usage=True,
181
+ quantization_config=qcfg,
182
+ attn_implementation="sdpa", # T4-safe and faster than 'eager'
183
  )
184
+ self.tokenizer = tok
185
+ self.model = model
186
 
187
  @torch.inference_mode()
188
+ def generate(self, system_prompt: str, user_prompt: str) -> str:
189
+ # Chat template if available; otherwise a simple format
190
+ if hasattr(self.tokenizer, "apply_chat_template"):
191
+ messages = [
192
+ {"role": "system", "content": system_prompt},
193
+ {"role": "user", "content": user_prompt},
194
+ ]
195
+ # apply_chat_template returns a bare tensor here; wrap it in a dict so **input_ids works in generate()
+ ids = self.tokenizer.apply_chat_template(
+ messages,
+ add_generation_prompt=True,
+ return_tensors="pt",
+ ).to(self.model.device)
+ input_ids = {"input_ids": ids}
200
  else:
201
+ text = f"<s>[SYSTEM]\n{system_prompt}\n[/SYSTEM]\n[USER]\n{user_prompt}\n[/USER]\n"
202
+ input_ids = self.tokenizer(text, return_tensors="pt").to(self.model.device)
203
+
204
+ with torch.cuda.amp.autocast(enabled=(DEVICE == "cuda")):
205
+ out_ids = self.model.generate(
206
+ **input_ids,
207
+ generation_config=GEN_CONFIG,
208
+ eos_token_id=self.tokenizer.eos_token_id,
209
+ pad_token_id=self.tokenizer.pad_token_id,
210
+ )
211
+ out = self.tokenizer.decode(out_ids[0], skip_special_tokens=True)
212
+ # Heuristic: strip the prompting part if the model echoes input
213
+ if "}" in out:
214
+ tail = out[out.rfind("}") + 1 :]
215
+ body = out[: out.rfind("}") + 1]
216
+ # Prefer the last JSON object if multiple
217
+ if "{" in tail and "}" in tail:
218
+ # do nothing—rare; handled by robust_json_extract
219
+ pass
220
+ return body
221
+ return out
222
 
223
+ # Keep one live model per repo for snappy re-runs
224
+ _MODEL_CACHE: Dict[str, ModelWrapper] = {}
225
+
226
+ def get_model(repo_id: str, hf_token: Optional[str], load_in_4bit: bool) -> ModelWrapper:
227
+ key = f"{repo_id}::{'4bit' if (load_in_4bit and DEVICE=='cuda') else 'full'}"
228
+ if key not in _MODEL_CACHE:
229
+ mw = ModelWrapper(repo_id, hf_token, load_in_4bit)
230
+ mw.load()
231
+ _MODEL_CACHE[key] = mw
232
+ return _MODEL_CACHE[key]
233
+
234
+ # =========================
235
+ # Inference Pipeline
236
+ # =========================
237
+ def run_extraction(
238
+ transcript_text: str,
239
+ transcript_file: gr.File,
240
+ allowed_labels_text: str,
241
+ model_repo: str,
242
+ use_4bit: bool,
243
+ max_input_tokens: int,
244
+ hf_token: str,
245
+ ) -> Tuple[str, str, str, str]:
246
+
247
+ t0 = _now_ms()
248
+
249
+ # 1) Get transcript: prefer file (drag-drop), else textarea
250
+ raw_text = ""
251
+ if transcript_file:
252
+ raw_text = read_file_to_text(transcript_file)
253
+ if not raw_text:
254
+ raw_text = transcript_text or ""
255
+ raw_text = raw_text.strip()
256
+
257
+ if not raw_text:
258
+ return "", "", "No transcript provided.", json.dumps({"labels": [], "tasks": []}, ensure_ascii=False, indent=2)
259
+
260
+ # 2) Allowed labels: combine UI text with default (so we NEVER end up empty)
261
+ user_allowed = [ln.strip() for ln in (allowed_labels_text or "").splitlines() if ln.strip()]
262
+ allowed = normalize_labels(user_allowed or DEFAULT_ALLOWED_LABELS)
263
+
264
+ # 3) Load model
265
+ hf_tok = hf_token.strip() or None
266
  try:
267
+ model = get_model(model_repo, hf_tok, load_in_4bit=use_4bit)
 
268
  except Exception as e:
269
+ msg = (
270
+ f"Model load failed for '{model_repo}'. If gated/private, set HF_TOKEN in Space secrets.\n"
271
+ f"Error: {e}"
272
+ )
273
+ return "", "", msg, json.dumps({"labels": [], "tasks": []}, ensure_ascii=False, indent=2)
274
 
275
+ # 4) Truncate input to speed up
276
+ trunc_text = truncate_tokens(model.tokenizer, raw_text, max_input_tokens=max_input_tokens)
277
 
278
+ # 5) Build prompts
279
+ allowed_list_str = "\n".join(f"- {lab}" for lab in allowed)
280
+ user_prompt = USER_PROMPT_TEMPLATE.format(
281
+ transcript=trunc_text,
282
+ allowed_labels_list=allowed_list_str,
283
  )
284
 
285
+ # 6) Generate
286
+ t1 = _now_ms()
287
+ try:
288
+ model_out = model.generate(SYSTEM_PROMPT, user_prompt)
289
+ except Exception as e:
290
+ return "", "", f"Generation error: {e}", json.dumps({"labels": [], "tasks": []}, ensure_ascii=False, indent=2)
291
+ t2 = _now_ms()
292
+
293
+ # 7) Parse & filter strictly to allowed
294
+ parsed = robust_json_extract(model_out)
295
+ filtered = restrict_to_allowed(parsed, allowed)
296
+
297
+ # 8) Compose UI outputs
298
+ # Diagnostics
299
+ diag = [
300
+ f"Device: {DEVICE} (4-bit: {'Yes' if (use_4bit and DEVICE=='cuda') else 'No'})",
301
+ f"Model: {model_repo}",
302
+ f"Tokens (input, approx): ≤ {max_input_tokens}",
303
+ f"Latency: load+prep {(t1 - t0)} ms, generate {(t2 - t1)} ms, total {(t2 - t0)} ms",
304
+ f"Allowed Labels Used (n={len(allowed)}): {', '.join(allowed)}",
305
+ ]
306
+ diag_str = "\n".join(diag)
307
+
308
+ # Summary plain text
309
+ labs = filtered.get("labels", [])
310
+ tasks = filtered.get("tasks", [])
311
+ summ_lines = []
312
+ if labs:
313
+ summ_lines.append("Detected labels:\n - " + "\n - ".join(labs))
314
+ else:
315
+ summ_lines.append("Detected labels: (none)")
316
+
317
+ if tasks:
318
+ summ_lines.append("\nTasks:")
319
+ for t in tasks:
320
+ lab = t.get("label", "")
321
+ expl = t.get("explanation", "")
322
+ ev = t.get("evidence", "")
323
+ summ_lines.append(f"• [{lab}] {expl} | evidence: {ev[:140]}{'…' if len(ev)>140 else ''}")
324
+ else:
325
+ summ_lines.append("\nTasks: (none)")
326
+
327
+ summary = "\n".join(summ_lines)
328
+
329
+ # JSON pretty
330
+ json_str = json.dumps(filtered, ensure_ascii=False, indent=2)
331
+
332
+ # Raw model text (to help debug label empty issues)
333
+ raw_out = model_out.strip()
334
+
335
+ return summary, json_str, diag_str, raw_out
336
+
337
+ # =========================
338
+ # UI
339
+ # =========================
340
+ MODEL_CHOICES = [
341
+ "swiss-ai/Apertus-8B-Instruct-2509", # default
342
+ "meta-llama/Meta-Llama-3-8B-Instruct", # may be gated; handled in code
343
+ "mistralai/Mistral-7B-Instruct-v0.3", # widely available, strong baseline
344
+ ]
345
 
346
+ with gr.Blocks(theme=gr.themes.Soft(), fill_height=True) as demo:
347
+ gr.Markdown("# Talk2Task — Task Extraction Demo")
348
  gr.Markdown(
349
+ "Drop a transcript file **or** paste text, choose a model, and get strict JSON back. "
350
+ "For best speed, keep inputs concise or lower the input token limit."
351
  )
352
 
353
  with gr.Row():
354
+ with gr.Column(scale=3):
355
+ transcript_file = gr.File(
356
+ label="Drag & drop transcript (.txt / .md / .json)",
357
+ file_types=[".txt", ".md", ".json"],
358
+ type="filepath",
359
  )
360
+ transcript_text = gr.Textbox(
361
+ label="Or paste transcript here",
362
+ lines=14,
363
+ placeholder="Paste conversation transcript…",
364
  )
365
+ allowed_labels_text = gr.Textbox(
366
+ label="Allowed Labels (one per line) — leave empty to use defaults",
367
+ value="",
368
+ lines=8,
369
+ )
370
+ with gr.Column(scale=2):
371
+ model_repo = gr.Dropdown(
372
+ label="Model Repository",
373
+ choices=MODEL_CHOICES,
374
+ value=MODEL_CHOICES[0],
375
+ )
376
+ use_4bit = gr.Checkbox(
377
+ label="Use 4-bit quantization (recommended on GPU/T4)",
378
+ value=True,
379
+ )
380
+ max_input_tokens = gr.Slider(
381
+ label="Max input tokens (truncate from end for speed)",
382
+ minimum=1024,
383
+ maximum=8192,
384
+ step=512,
385
+ value=4096,
386
+ )
387
+ hf_token = gr.Textbox(
388
+ label="HF_TOKEN (only needed for gated/private models)",
389
+ type="password",
390
+ value=os.environ.get("HF_TOKEN", ""),
391
+ )
392
+ run_btn = gr.Button("Run Extraction", variant="primary")
393
 
394
+ with gr.Row():
395
+ with gr.Column():
396
+ summary_out = gr.Textbox(label="Summary", lines=10)
397
+ with gr.Column():
398
+ json_out = gr.Code(label="Strict JSON Output", language="json")
399
+ with gr.Row():
400
+ with gr.Column():
401
+ diag_out = gr.Textbox(label="Diagnostics & Timing", lines=8)
402
+ with gr.Column():
403
+ raw_out = gr.Textbox(label="Raw Model Output (debug)", lines=8)
404
+
405
+ run_btn.click(
406
+ fn=run_extraction,
407
+ inputs=[
408
+ transcript_text,
409
+ transcript_file,
410
+ allowed_labels_text,
411
+ model_repo,
412
+ use_4bit,
413
+ max_input_tokens,
414
+ hf_token,
415
+ ],
416
+ outputs=[summary_out, json_out, diag_out, raw_out],
417
  )
418
 
419
  if __name__ == "__main__":