RishiRP committed on
Commit 6acd2cc · verified · 1 Parent(s): 7cd757f

Update app.py

Files changed (1): app.py (+447 -624)
app.py CHANGED
@@ -1,232 +1,149 @@
1
- """
2
- Gradio application for the Swiss {ai} Weeks "From Talk to Task" challenge.
3
-
4
- This app provides two modes of operation:
5
-
6
- * **Single transcript** – Paste or upload a single conversation transcript
7
- and the model will extract actionable tasks according to a
8
- predefined list of labels. It outputs a human‑readable summary,
9
- strict JSON, and diagnostic information (e.g. device, latency).
10
-
11
- * **Batch evaluation** – Upload a ZIP archive containing one `.txt`
12
- transcript per call and a matching `.json` file with the ground
13
- truth labels. The app runs the model on each transcript, compares
14
- the predictions against the true labels and computes the official
15
- weighted score used by the challenge organisers. It also reports
16
- precision, recall and F1, and provides a per‑sample results table
17
- that can be downloaded as CSV.
18
-
19
- The official allowed labels and evaluation function are taken from
20
- the challenge repository README. False negatives are penalised twice as
21
- heavily as false positives, so recall is especially important.
22
- """
23
-
24
- import os
25
- import io
26
- import re
27
- import json
28
- import time
29
- import zipfile
30
- from pathlib import Path
31
- from typing import List, Dict, Any, Tuple, Optional
32
-
33
- import gradio as gr
34
- import numpy as np
35
- import pandas as pd
36
- import torch
37
- from transformers import (
38
- AutoTokenizer,
39
- AutoModelForCausalLM,
40
- BitsAndBytesConfig,
41
- GenerationConfig,
42
- )
43
-
44
-
45
- # =============================================================================
46
- # Configuration and Constants
47
- # =============================================================================
48
-
49
- # Cache directory for HuggingFace models
50
- SPACE_CACHE = Path.home() / ".cache" / "huggingface"
51
- SPACE_CACHE.mkdir(parents=True, exist_ok=True)
52
-
53
- # Device selection
54
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
55
-
56
- # Generation parameters tuned for speed/quality
57
- GEN_CONFIG = GenerationConfig(
58
- temperature=0.2,
59
- top_p=0.9,
60
- do_sample=False,
61
- max_new_tokens=256,
62
- )
63
-
64
- # Official allowed task labels
65
- DEFAULT_ALLOWED_LABELS = [
66
- "plan_contact",
67
- "schedule_meeting",
68
- "update_contact_info_non_postal",
69
- "update_contact_info_postal_address",
70
- "update_kyc_activity",
71
- "update_kyc_origin_of_assets",
72
- "update_kyc_purpose_of_businessrelation",
73
- "update_kyc_total_assets",
74
- ]
75
-
76
- # System and user prompt templates
77
- SYSTEM_PROMPT = (
78
- "You are a precise banking assistant that extracts ACTIONABLE TASKS "
79
- "from client–advisor transcripts. Return STRICT JSON with fields: "
80
- '{"labels": ["<Label1>", ...], "tasks": [{"label": "<Label1>", "explanation": "<why>", "evidence": "<span>"}]} '"
81
- "Only use labels from the provided Allowed Labels list; if none apply, return an empty list."
82
- )
83
 
84
- USER_PROMPT_TEMPLATE = """Transcript:
85
- ```
86
- {transcript}
87
- ```
88
-
89
- Allowed Labels:
90
  {allowed_labels_list}
91
 
92
- Output STRICT JSON only, no prose:
93
- {{
94
- "labels": ["LabelA", "LabelB", ...],
95
- "tasks": [
96
- {{"label": "LabelA", "explanation": "…", "evidence": "…"}},
97
- {{"label": "LabelB", "explanation": "…", "evidence": "…"}}
98
- ]
99
- }}
100
- """
101
-
102
 
103
- # =============================================================================
104
- # Utility Functions
105
- # =============================================================================
 
 
106
 
 
 
 
107
  def _now_ms() -> int:
108
- """Return the current time in milliseconds."""
109
  return int(time.time() * 1000)
110
 
111
-
112
- def read_file_to_text(file: Optional[gr.File]) -> str:
113
- """
114
- Read an uploaded file (txt/md/json) to a string. For JSON files,
115
- return the value of the "transcript" field if present, otherwise
116
- return the entire JSON as a compact string.
117
- """
118
- if not file or not file.name:
119
- return ""
120
- name = file.name.lower()
121
- data = file.read()
122
- if name.endswith(".json"):
123
- try:
124
- obj = json.loads(data.decode("utf-8", errors="ignore"))
125
- if isinstance(obj, dict) and "transcript" in obj:
126
- return str(obj["transcript"])
127
- return json.dumps(obj, ensure_ascii=False)
128
- except Exception:
129
- return data.decode("utf-8", errors="ignore")
130
- return data.decode("utf-8", errors="ignore")
131
-
132
-
133
  def normalize_labels(labels: List[str]) -> List[str]:
134
- """Deduplicate and strip whitespace from a list of labels."""
135
  return list(dict.fromkeys([l.strip() for l in labels if isinstance(l, str) and l.strip()]))
136
 
137
-
138
  def canonicalize_map(allowed: List[str]) -> Dict[str, str]:
139
- """Map lowercase labels to their canonical names."""
140
  return {lab.lower(): lab for lab in allowed}
141
 
142
-
143
  def robust_json_extract(text: str) -> Dict[str, Any]:
144
- """
145
- Extract the first JSON object from a string. Removes common
146
- trailing comma mistakes. Returns an empty prediction if no JSON
147
- object is found.
148
- """
149
  if not text:
150
  return {"labels": [], "tasks": []}
151
  start, end = text.find("{"), text.rfind("}")
152
- candidate = text[start:end + 1] if (start != -1 and end != -1) else text
153
- candidate = re.sub(r",\s*}\s*", "}", candidate)
154
- candidate = re.sub(r",\s*]\s*", "]", candidate)
155
  try:
156
  return json.loads(candidate)
157
  except Exception:
158
- return {"labels": [], "tasks": []}
159
-
 
 
 
 
160
 
161
  def restrict_to_allowed(pred: Dict[str, Any], allowed: List[str]) -> Dict[str, Any]:
162
- """
163
- Restrict predicted labels and tasks to those in the allowed list.
164
- Case‑insensitive matching is performed and the canonical form is
165
- returned. Duplicates are removed.
166
- """
167
  out = {"labels": [], "tasks": []}
168
  allowed_map = canonicalize_map(allowed)
169
- # Filter labels
170
- filt_labels: List[str] = []
171
  for l in pred.get("labels", []) or []:
172
- if not isinstance(l, str):
173
- continue
174
- k = l.strip().lower()
175
  if k in allowed_map:
176
  filt_labels.append(allowed_map[k])
177
  filt_labels = normalize_labels(filt_labels)
178
- # Filter tasks
179
  filt_tasks = []
180
  for t in pred.get("tasks", []) or []:
181
  if not isinstance(t, dict):
182
  continue
183
- lbl = t.get("label", "")
184
- k = str(lbl).strip().lower()
185
  if k in allowed_map:
186
  new_t = dict(t)
187
  new_t["label"] = allowed_map[k]
188
  filt_tasks.append(new_t)
189
- # Merge labels from tasks
190
- from_tasks = [tt["label"] for tt in filt_tasks if isinstance(tt.get("label"), str)]
191
- merged = normalize_labels(list(set(filt_labels) | set(from_tasks)))
192
  out["labels"] = merged
193
  out["tasks"] = filt_tasks
194
  return out
195
 
196
 
197
- def truncate_tokens(tokenizer, text: str, max_tokens: int) -> str:
198
- """
199
- Keep only the last `max_tokens` tokens of a string according to
200
- the provided tokenizer. This is useful for long transcripts to
201
- reduce inference time.
202
- """
203
- if max_tokens <= 0:
204
- return text
205
- tok = tokenizer(text, add_special_tokens=False)["input_ids"]
206
- if len(tok) <= max_tokens:
207
  return text
208
- return tokenizer.decode(tok[-max_tokens:], skip_special_tokens=True)
209
-
210
 
211
- # =============================================================================
212
- # Model Wrapper and Loader
213
- # =============================================================================
 
 
214
 
 
 
 
215
  class ModelWrapper:
216
- """
217
- Wraps a HuggingFace model and tokenizer, with optional 4‑bit
218
- quantisation. Instances are cached per model and quantisation
219
- setting.
220
- """
221
  def __init__(self, repo_id: str, hf_token: Optional[str], load_in_4bit: bool):
222
  self.repo_id = repo_id
223
  self.hf_token = hf_token
224
  self.load_in_4bit = load_in_4bit
225
- self.tokenizer: Optional[AutoTokenizer] = None
226
- self.model: Optional[AutoModelForCausalLM] = None
227
 
228
  def load(self):
229
- # 4‑bit quantisation config
230
  qcfg = None
231
  if self.load_in_4bit and DEVICE == "cuda":
232
  qcfg = BitsAndBytesConfig(
@@ -235,527 +152,433 @@ class ModelWrapper:
235
  bnb_4bit_compute_dtype=torch.float16,
236
  bnb_4bit_use_double_quant=True,
237
  )
238
- # Tokenizer
239
- self.tokenizer = AutoTokenizer.from_pretrained(
240
- self.repo_id,
241
- token=self.hf_token,
242
- cache_dir=str(SPACE_CACHE),
243
- trust_remote_code=True,
244
- use_fast=True,
245
  )
246
- if self.tokenizer.pad_token is None and self.tokenizer.eos_token:
247
- self.tokenizer.pad_token = self.tokenizer.eos_token
248
- # Model
249
- self.model = AutoModelForCausalLM.from_pretrained(
250
- self.repo_id,
251
- token=self.hf_token,
252
- cache_dir=str(SPACE_CACHE),
253
  trust_remote_code=True,
254
  torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
255
  device_map="auto" if DEVICE == "cuda" else None,
256
- low_cpu_mem_usage=True,
257
- quantization_config=qcfg,
258
  attn_implementation="sdpa",
259
  )
 
 
260
 
261
  @torch.inference_mode()
262
  def generate(self, system_prompt: str, user_prompt: str) -> str:
263
- """
264
- Generate text from system and user prompts. Chat templates are
265
- used if defined on the tokenizer.
266
- """
267
- assert self.tokenizer is not None and self.model is not None
268
- # Chat template support
269
  if hasattr(self.tokenizer, "apply_chat_template"):
270
- messages = [
271
- {"role": "system", "content": system_prompt},
272
- {"role": "user", "content": user_prompt},
273
- ]
274
- input_ids = self.tokenizer.apply_chat_template(
275
- messages, add_generation_prompt=True, return_tensors="pt"
276
  ).to(self.model.device)
277
  else:
278
- text = f"<s>[SYSTEM]{system_prompt}[/SYSTEM][USER]{user_prompt}[/USER]"
279
- input_ids = self.tokenizer(text, return_tensors="pt").to(self.model.device)
 
280
  with torch.cuda.amp.autocast(enabled=(DEVICE == "cuda")):
281
  out_ids = self.model.generate(
282
- **input_ids,
283
  generation_config=GEN_CONFIG,
284
  eos_token_id=self.tokenizer.eos_token_id,
285
  pad_token_id=self.tokenizer.pad_token_id,
286
  )
287
  return self.tokenizer.decode(out_ids[0], skip_special_tokens=True)
288
 
289
-
290
- # Model cache keyed by (repo_id, quantisation)
291
  _MODEL_CACHE: Dict[str, ModelWrapper] = {}
292
-
293
-
294
  def get_model(repo_id: str, hf_token: Optional[str], load_in_4bit: bool) -> ModelWrapper:
295
- """Retrieve or load a ModelWrapper from the cache."""
296
  key = f"{repo_id}::{'4bit' if (load_in_4bit and DEVICE=='cuda') else 'full'}"
297
  if key not in _MODEL_CACHE:
298
- mw = ModelWrapper(repo_id, hf_token, load_in_4bit)
299
- mw.load()
300
- _MODEL_CACHE[key] = mw
301
  return _MODEL_CACHE[key]
302
 
303
-
304
- # =============================================================================
305
- # Evaluation Function
306
- # =============================================================================
307
-
308
  def evaluate_predictions(y_true: List[List[str]], y_pred: List[List[str]]) -> float:
309
- """
310
- Official weighted score for the challenge. False negatives
311
- incur double the penalty of false positives. Returns a score
312
- between 0.0 and 1.0, where 1.0 is perfect.
313
- """
314
- ALLOWED_LABELS = DEFAULT_ALLOWED_LABELS
315
  LABEL_TO_IDX = {label: idx for idx, label in enumerate(ALLOWED_LABELS)}
316
  FN_PENALTY = 2.0
317
  FP_PENALTY = 1.0
318
  if len(y_true) != len(y_pred):
319
- raise ValueError(f"y_true and y_pred lengths differ: {len(y_true)} vs {len(y_pred)}")
 
320
  n_samples = len(y_true)
321
- n_labels = len(ALLOWED_LABELS)
322
  y_true_binary = np.zeros((n_samples, n_labels), dtype=int)
323
  y_pred_binary = np.zeros((n_samples, n_labels), dtype=int)
324
- for i, labels in enumerate(y_true):
325
- for l in labels:
326
- if l not in LABEL_TO_IDX:
327
- raise ValueError(f"Invalid true label '{l}'")
328
- y_true_binary[i, LABEL_TO_IDX[l]] = 1
329
- for i, labels in enumerate(y_pred):
330
- for l in labels:
331
- if l not in LABEL_TO_IDX:
332
- raise ValueError(f"Invalid predicted label '{l}'")
333
- y_pred_binary[i, LABEL_TO_IDX[l]] = 1
334
- false_negatives = np.sum((y_true_binary == 1) & (y_pred_binary == 0), axis=1)
335
- false_positives = np.sum((y_true_binary == 0) & (y_pred_binary == 1), axis=1)
336
- weighted_errors = FN_PENALTY * false_negatives + FP_PENALTY * false_positives
337
- max_errors_per_sample = FN_PENALTY * np.sum(y_true_binary, axis=1) + FP_PENALTY * (
338
- n_labels - np.sum(y_true_binary, axis=1)
339
- )
340
- per_sample_scores = np.where(
341
- max_errors_per_sample > 0,
342
- 1.0 - (weighted_errors / max_errors_per_sample),
343
- 1.0,
344
- )
345
- final_score = float(np.mean(per_sample_scores))
346
- return max(0.0, min(1.0, final_score))
347
-
348
-
349
- # =============================================================================
350
- # Prediction Utilities
351
- # =============================================================================
352
-
353
- def predict_labels_for_text(
354
- model: ModelWrapper,
355
- transcript: str,
356
- allowed: List[str],
357
- max_tokens: int,
358
- ) -> List[str]:
359
- """
360
- Predict labels for a transcript string using the given model.
361
- The transcript is truncated to the last `max_tokens` tokens to
362
- reduce inference time. Only labels in `allowed` are returned.
363
- """
364
- # Truncate transcript
365
- truncated = truncate_tokens(model.tokenizer, transcript, max_tokens)
366
- allowed_list_str = "\n".join(f"- {lab}" for lab in allowed)
367
- user_prompt = USER_PROMPT_TEMPLATE.format(
368
- transcript=truncated,
369
- allowed_labels_list=allowed_list_str,
370
- )
371
- raw_out = model.generate(SYSTEM_PROMPT, user_prompt)
372
- parsed = robust_json_extract(raw_out)
373
- filtered = restrict_to_allowed(parsed, allowed)
374
- return filtered.get("labels", []) or []
375
 
376
-
377
- # =============================================================================
378
- # Single Transcript Handler
379
- # =============================================================================
380
 
381
  def run_single(
382
  transcript_text: str,
383
- transcript_file: Optional[gr.File],
 
384
  allowed_labels_text: str,
385
  model_repo: str,
386
  use_4bit: bool,
387
  max_input_tokens: int,
388
  hf_token: str,
389
  ) -> Tuple[str, str, str, str]:
390
- """
391
- Process a single transcript and return (summary, json_output,
392
- diagnostics, raw_model_output). The summary is human‑readable,
393
- json_output is the strict JSON string, diagnostics contains
394
- performance information, and raw_model_output is the unfiltered
395
- model response for debugging.
396
- """
397
  t0 = _now_ms()
398
- # Determine transcript text
399
- raw_text = read_file_to_text(transcript_file) if transcript_file else (transcript_text or "")
400
- raw_text = raw_text.strip()
 
401
  if not raw_text:
402
- return (
403
- "",
404
- "",
405
- "No transcript provided.",
406
- json.dumps({"labels": [], "tasks": []}, indent=2),
407
- )
408
- # Determine allowed labels
409
  user_allowed = [ln.strip() for ln in (allowed_labels_text or "").splitlines() if ln.strip()]
410
- allowed = normalize_labels(user_allowed or DEFAULT_ALLOWED_LABELS)
411
- # Load model
 
412
  try:
413
- model = get_model(model_repo, hf_token.strip() or None, use_4bit)
414
  except Exception as e:
415
- return (
416
- "",
417
- "",
418
- f"Model load failed: {e}",
419
- json.dumps({"labels": [], "tasks": []}, indent=2),
420
- )
421
- t1 = _now_ms()
422
- # Truncate transcript
423
- truncated = truncate_tokens(model.tokenizer, raw_text, max_input_tokens)
424
- allowed_list_str = "\n".join(f"- {lab}" for lab in allowed)
425
  user_prompt = USER_PROMPT_TEMPLATE.format(
426
- transcript=truncated,
427
  allowed_labels_list=allowed_list_str,
 
428
  )
 
429
  # Generate
 
430
  try:
431
- model_out = model.generate(SYSTEM_PROMPT, user_prompt)
432
  except Exception as e:
433
- return (
434
- "",
435
- "",
436
- f"Generation error: {e}",
437
- json.dumps({"labels": [], "tasks": []}, indent=2),
438
- )
439
  t2 = _now_ms()
440
- # Parse and filter
441
- parsed = robust_json_extract(model_out)
 
442
  filtered = restrict_to_allowed(parsed, allowed)
443
- # Compose summary
444
  labs = filtered.get("labels", [])
445
  tasks = filtered.get("tasks", [])
446
- summ_lines: List[str] = []
447
- if labs:
448
- summ_lines.append("Detected labels:\n - " + "\n - ".join(labs))
449
- else:
450
- summ_lines.append("Detected labels: (none)")
451
  if tasks:
452
- summ_lines.append("\nTasks:")
453
- for t in tasks:
454
- lab = t.get("label", "")
455
- expl = t.get("explanation", "")
456
- ev = t.get("evidence", "")
457
- trimmed = ev[:140] + ("…" if len(ev) > 140 else "")
458
- summ_lines.append(f"• [{lab}] {expl} | evidence: {trimmed}")
459
  else:
460
- summ_lines.append("\nTasks: (none)")
461
- summary = "\n".join(summ_lines)
462
- # Diagnostics
463
- diag = [
464
- f"Device: {DEVICE} (4‑bit: {'Yes' if (use_4bit and DEVICE=='cuda') else 'No'})",
465
- f"Model: {model_repo}",
466
- f"Tokens (input ≤): {max_input_tokens}",
467
- f"Latency: load/prep {t1 - t0} ms, generate {t2 - t1} ms, total {t2 - t0} ms",
468
- f"Allowed labels (n={len(allowed)}): {', '.join(allowed)}",
469
- ]
470
- diag_str = "\n".join(diag)
471
- json_str = json.dumps(filtered, ensure_ascii=False, indent=2)
472
- raw_out = model_out.strip()
473
- return summary, json_str, diag_str, raw_out
474
-
475
-
476
- # =============================================================================
477
- # Batch Evaluation Handler
478
- # =============================================================================
479
 
480
  def run_batch(
481
- zip_file: Optional[gr.File],
482
- allowed_labels_text: str,
483
  model_repo: str,
484
  use_4bit: bool,
485
  max_input_tokens: int,
486
  hf_token: str,
487
- max_files: int,
488
  ) -> Tuple[str, str, str, pd.DataFrame, str]:
489
- """
490
- Run batch evaluation on a ZIP archive of transcripts and ground
491
- truths. Returns (score_str, recall_precision_f1_str, extra_info,
492
- dataframe, download_path).
493
- """
494
- if zip_file is None or not zip_file.name.lower().endswith(".zip"):
495
- return ("No ZIP file provided.", "", "", pd.DataFrame(), "")
496
- # Allowed labels
497
- user_allowed = [ln.strip() for ln in (allowed_labels_text or "").splitlines() if ln.strip()]
498
- allowed = normalize_labels(user_allowed or DEFAULT_ALLOWED_LABELS)
499
- # Load model once
500
  try:
501
- model = get_model(model_repo, hf_token.strip() or None, use_4bit)
502
  except Exception as e:
503
  return (f"Model load failed: {e}", "", "", pd.DataFrame(), "")
504
- # Extract ZIP to temp directory
505
- timestamp = int(time.time())
506
- extract_root = Path("/tmp") / f"batch_{timestamp}"
507
- extract_root.mkdir(parents=True, exist_ok=True)
508
- try:
509
- with zipfile.ZipFile(io.BytesIO(zip_file.read())) as zf:
510
- zf.extractall(extract_root)
511
- except Exception as e:
512
- return (f"Failed to extract ZIP: {e}", "", "", pd.DataFrame(), "")
513
- # Collect transcript and label paths
514
- transcript_paths: Dict[str, Path] = {}
515
- truth_paths: Dict[str, Path] = {}
516
- for path in extract_root.rglob("*"):
517
- if path.is_file():
518
- stem = path.stem
519
- ext = path.suffix.lower()
520
- if ext == ".txt":
521
- transcript_paths[stem] = path
522
- elif ext == ".json":
523
- truth_paths[stem] = path
524
- # Pair transcripts and truth files
525
- paired = [
526
- (stem, transcript_paths[stem], truth_paths.get(stem))
527
- for stem in sorted(transcript_paths.keys())
528
- ]
529
- if not paired:
530
- return ("No transcript files found in ZIP.", "", "", pd.DataFrame(), "")
531
- # Optionally limit number of files
532
- if max_files > 0:
533
- paired = paired[: max_files]
534
- # Lists for evaluation and per‑sample results
535
- y_true_list: List[List[str]] = []
536
- y_pred_list: List[List[str]] = []
537
- result_rows: List[Dict[str, Any]] = []
538
- total_tp = total_fp = total_fn = 0
539
- # Iterate through samples
540
- for stem, txt_path, truth_path in paired:
541
- # Read transcript
542
- try:
543
- with open(txt_path, "r", encoding="utf-8", errors="ignore") as f:
544
- transcript = f.read().strip()
545
- except Exception:
546
- transcript = ""
547
- # Read ground truth labels
548
- true_labels: List[str] = []
549
- if truth_path and truth_path.is_file():
550
- try:
551
- with open(truth_path, "r", encoding="utf-8", errors="ignore") as f:
552
- obj = json.load(f)
553
- if isinstance(obj, dict) and "labels" in obj:
554
- true_labels = [str(l).strip() for l in obj["labels"] if isinstance(l, str)]
555
- elif isinstance(obj, list):
556
- true_labels = [str(l).strip() for l in obj if isinstance(l, str)]
557
- except Exception:
558
- true_labels = []
559
- # Predict labels
560
- pred_labels: List[str] = []
561
- if transcript:
562
  try:
563
- pred_labels = predict_labels_for_text(model, transcript, allowed, max_input_tokens)
 
 
564
  except Exception:
565
- pred_labels = []
566
- # Compute per‑sample metrics
567
- true_set = set(true_labels)
568
- pred_set = set(pred_labels)
569
- tp = len(true_set & pred_set)
570
- fp = len(pred_set - true_set)
571
- fn = len(true_set - pred_set)
572
- total_tp += tp
573
- total_fp += fp
574
- total_fn += fn
575
- y_true_list.append(list(true_set))
576
- y_pred_list.append(list(pred_set))
577
- result_rows.append(
578
- {
579
- "file": stem,
580
- "true_labels": ", ".join(sorted(true_set)) if true_set else "",
581
- "pred_labels": ", ".join(sorted(pred_set)) if pred_set else "",
582
- "true_positives": tp,
583
- "false_positives": fp,
584
- "false_negatives": fn,
585
- }
586
- )
587
- # Compute metrics
588
- if y_true_list:
589
- try:
590
- weighted_score = evaluate_predictions(y_true_list, y_pred_list)
591
- except Exception:
592
- weighted_score = 0.0
593
- precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 1.0
594
- recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 1.0
595
- f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0.0
596
- else:
597
- weighted_score = precision = recall = f1 = 0.0
598
- df = pd.DataFrame(result_rows)
599
- score_str = f"Weighted score: {weighted_score:.3f}"
600
- metrics_str = f"Recall: {recall:.3f} | Precision: {precision:.3f} | F1: {f1:.3f}"
601
- extra_str = f"Processed {len(paired)} transcripts | TP={total_tp} FP={total_fp} FN={total_fn}"
602
- # Write CSV
603
- csv_path = extract_root / "batch_results.csv"
604
- try:
605
- df.to_csv(csv_path, index=False)
606
- csv_path_str = str(csv_path)
607
- except Exception:
608
- csv_path_str = ""
609
- return (score_str, metrics_str, extra_str, df, csv_path_str)
610
-
611
-
612
- # =============================================================================
613
- # Interface
614
- # =============================================================================
615
-
616
- def build_ui() -> gr.Blocks:
617
- """Construct the Gradio interface."""
618
- with gr.Blocks(theme=gr.themes.Soft(), fill_height=True) as demo:
619
- gr.Markdown("# Talk2Task – Transcript Task Extraction and Evaluation")
620
- gr.Markdown(
621
- "This app extracts actionable tasks from client–advisor transcripts using "
622
- "a selectable language model. You can run it on a single transcript "
623
- "or evaluate a batch of transcripts against provided ground truth labels."
624
- )
625
- with gr.Tab("Single Transcript"):
626
- with gr.Row():
627
- with gr.Column(scale=3):
628
- transcript_file = gr.File(
629
- label="Upload transcript (.txt/.md/.json)",
630
- file_types=[".txt", ".md", ".json"],
631
- type="filepath",
632
- )
633
- transcript_text = gr.Textbox(
634
- label="Or paste transcript here",
635
- lines=12,
636
- placeholder="Paste conversation transcript…",
637
- )
638
- allowed_labels_text = gr.Textbox(
639
- label="Allowed Labels (one per line; leave blank for defaults)",
640
- lines=8,
641
- )
642
- with gr.Column(scale=2):
643
- model_repo = gr.Dropdown(
644
- label="Model Repository",
645
- choices=[
646
- "swiss-ai/Apertus-8B-Instruct-2509",
647
- "meta-llama/Meta-Llama-3-8B-Instruct",
648
- "mistralai/Mistral-7B-Instruct-v0.3",
649
- ],
650
- value="swiss-ai/Apertus-8B-Instruct-2509",
651
- )
652
- use_4bit = gr.Checkbox(
653
- label="Use 4-bit quantisation (GPU only)", value=True
654
- )
655
- max_input_tokens = gr.Slider(
656
- label="Max input tokens (truncate from end)",
657
- minimum=1024,
658
- maximum=8192,
659
- step=512,
660
- value=4096,
661
- )
662
- hf_token = gr.Textbox(
663
- label="HF_TOKEN (for gated/private models)",
664
- type="password",
665
- value=os.environ.get("HF_TOKEN", ""),
666
- )
667
- single_button = gr.Button("Run Extraction", variant="primary")
668
- with gr.Row():
669
- summary = gr.Textbox(label="Summary", lines=12)
670
- json_out = gr.Code(label="Strict JSON Output", language="json")
671
- with gr.Row():
672
- diag = gr.Textbox(label="Diagnostics", lines=6)
673
- raw_out = gr.Textbox(label="Raw Model Output", lines=6)
674
- # Hook up single button
675
- single_button.click(
676
- fn=run_single,
677
- inputs=[
678
- transcript_text,
679
- transcript_file,
680
- allowed_labels_text,
681
- model_repo,
682
- use_4bit,
683
- max_input_tokens,
684
- hf_token,
685
- ],
686
- outputs=[summary, json_out, diag, raw_out],
687
- )
688
- with gr.Tab("Batch Evaluation"):
689
- with gr.Row():
690
- with gr.Column(scale=3):
691
- zip_input = gr.File(
692
- label="ZIP of transcripts and labels", file_types=[".zip"], type="filepath"
693
- )
694
- batch_allowed_labels = gr.Textbox(
695
- label="Allowed Labels (one per line; leave blank for defaults)",
696
- lines=8,
697
- )
698
- max_files_slider = gr.Slider(
699
- label="Max files to process (0 = no limit)",
700
- minimum=0,
701
- maximum=1000,
702
- step=1,
703
- value=0,
704
- )
705
- with gr.Column(scale=2):
706
- batch_model_repo = gr.Dropdown(
707
- label="Model Repository",
708
- choices=[
709
- "swiss-ai/Apertus-8B-Instruct-2509",
710
- "meta-llama/Meta-Llama-3-8B-Instruct",
711
- "mistralai/Mistral-7B-Instruct-v0.3",
712
- ],
713
- value="swiss-ai/Apertus-8B-Instruct-2509",
714
- )
715
- batch_use_4bit = gr.Checkbox(
716
- label="Use 4-bit quantisation (GPU only)", value=True
717
- )
718
- batch_max_input_tokens = gr.Slider(
719
- label="Max input tokens (truncate from end)",
720
- minimum=1024,
721
- maximum=8192,
722
- step=512,
723
- value=4096,
724
- )
725
- batch_hf_token = gr.Textbox(
726
- label="HF_TOKEN (for gated/private models)",
727
- type="password",
728
- value=os.environ.get("HF_TOKEN", ""),
729
- )
730
- batch_button = gr.Button("Run Batch Evaluation", variant="primary")
731
- # Outputs
732
- batch_score = gr.Textbox(label="Score")
733
- batch_metrics = gr.Textbox(label="Recall / Precision / F1")
734
- batch_extra = gr.Textbox(label="Summary", lines=2)
735
- batch_df = gr.Dataframe(label="Per‑sample results", interactive=True, wrap=True)
736
- batch_download = gr.File(label="Download results (CSV)")
737
- # Hook up batch button
738
- def on_batch(zip_file, allowed_text, repo, use4, max_tok, token, max_f):
739
- score, metrics, extra, df, csv_path = run_batch(
740
- zip_file, allowed_text, repo, use4, max_tok, token, int(max_f)
741
  )
742
- return score, metrics, extra, df, csv_path
743
- batch_button.click(
744
- fn=on_batch,
745
- inputs=[
746
- zip_input,
747
- batch_allowed_labels,
748
- batch_model_repo,
749
- batch_use_4bit,
750
- batch_max_input_tokens,
751
- batch_hf_token,
752
- max_files_slider,
753
- ],
754
- outputs=[batch_score, batch_metrics, batch_extra, batch_df, batch_download],
755
- )
756
- return demo
758
 
759
  if __name__ == "__main__":
760
- demo = build_ui()
761
- demo.launch()
1
 
2
+ Allowed Labels (canonical; use only these):
 
 
 
 
 
3
  {allowed_labels_list}
4
 
5
+ Context cues (keywords/phrases that often indicate each label):
6
+ {keyword_context}
 
 
 
 
 
 
 
 
7
 
8
+ Instructions:
9
+ 1) Identify EVERY concrete task implied by the conversation.
10
+ 2) Choose ONE label from Allowed Labels for each task (or none if truly inapplicable).
11
+ 3) Return STRICT JSON only in the exact schema described by the system prompt.
12
+ """
13
 
14
+ # =========================
15
+ # Utilities
16
+ # =========================
17
  def _now_ms() -> int:
 
18
  return int(time.time() * 1000)
19
 
20
  def normalize_labels(labels: List[str]) -> List[str]:
 
21
  return list(dict.fromkeys([l.strip() for l in labels if isinstance(l, str) and l.strip()]))
22
 
 
23
  def canonicalize_map(allowed: List[str]) -> Dict[str, str]:
 
24
  return {lab.lower(): lab for lab in allowed}
25
 
 
26
  def robust_json_extract(text: str) -> Dict[str, Any]:
 
 
 
 
 
27
  if not text:
28
  return {"labels": [], "tasks": []}
29
  start, end = text.find("{"), text.rfind("}")
30
+ candidate = text[start:end+1] if (start != -1 and end != -1 and end > start) else text
 
 
31
  try:
32
  return json.loads(candidate)
33
  except Exception:
34
+ candidate = re.sub(r",\s*}", "}", candidate)
35
+ candidate = re.sub(r",\s*]", "]", candidate)
36
+ try:
37
+ return json.loads(candidate)
38
+ except Exception:
39
+ return {"labels": [], "tasks": []}
40
 
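As a rough, hypothetical sketch of what that repair path recovers from a slightly malformed model reply (surrounding prose plus trailing commas):

messy = 'Sure! Here you go: {"labels": ["plan_contact",], "tasks": [],}'
print(robust_json_extract(messy))
# -> {'labels': ['plan_contact'], 'tasks': []}
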
41
  def restrict_to_allowed(pred: Dict[str, Any], allowed: List[str]) -> Dict[str, Any]:
 
 
 
 
 
42
  out = {"labels": [], "tasks": []}
43
  allowed_map = canonicalize_map(allowed)
44
+ # labels
45
+ filt_labels = []
46
  for l in pred.get("labels", []) or []:
47
+ k = str(l).strip().lower()
 
 
48
  if k in allowed_map:
49
  filt_labels.append(allowed_map[k])
50
  filt_labels = normalize_labels(filt_labels)
51
+ # tasks
52
  filt_tasks = []
53
  for t in pred.get("tasks", []) or []:
54
  if not isinstance(t, dict):
55
  continue
56
+ k = str(t.get("label", "")).strip().lower()
 
57
  if k in allowed_map:
58
  new_t = dict(t)
59
  new_t["label"] = allowed_map[k]
60
  filt_tasks.append(new_t)
61
+ merged = normalize_labels(list(set(filt_labels) | {tt["label"] for tt in filt_tasks}))
 
 
62
  out["labels"] = merged
63
  out["tasks"] = filt_tasks
64
  return out
65
 
66
+ # =========================
67
+ # Default pre-processing
68
+ # =========================
69
+ # These are conservative; they remove boilerplate that appears in many files
70
+ # and does not affect tasks. You can toggle this in the UI.
71
+ _DISCLAIMER_PATTERNS = [
72
+ r"(?is)^\s*(?:disclaimer|legal notice|confidentiality notice).+?(?:\n{2,}|$)",
73
+ r"(?is)^\s*the information contained.+?(?:\n{2,}|$)",
74
+ r"(?is)^\s*this message \(including any attachments\).+?(?:\n{2,}|$)",
75
+ ]
76
+ _FOOTER_PATTERNS = [
77
+ r"(?is)\n+kind regards[^\n]*\n.*$", r"(?is)\n+best regards[^\n]*\n.*$",
78
+ r"(?is)\n+sent from my.*$", r"(?is)\n+ubs ag.*$",
79
+ ]
80
+ _TIMESTAMP_SPEAKER = [
81
+ r"\[\d{1,2}:\d{2}(:\d{2})?\]", # [00:01] or [00:01:02]
82
+ r"^\s*(advisor|client)\s*:\s*", # Advisor: / Client:
83
+ r"^\s*(speaker\s*\d+)\s*:\s*", # Speaker 1:
84
+ ]
85
 
86
+ def clean_transcript(text: str) -> str:
87
+ if not text:
 
 
 
 
 
 
 
 
88
  return text
89
+ s = text
90
+
91
+ # Remove common timestamps and speaker prefixes (line-wise)
92
+ lines = []
93
+ for ln in s.splitlines():
94
+ ln2 = ln
95
+ for pat in _TIMESTAMP_SPEAKER:
96
+ ln2 = re.sub(pat, "", ln2, flags=re.IGNORECASE)
97
+ lines.append(ln2)
98
+ s = "\n".join(lines)
99
+
100
+ # Remove top disclaimers
101
+ for pat in _DISCLAIMER_PATTERNS:
102
+ s = re.sub(pat, "", s).strip()
103
+
104
+ # Remove trailing footers/signatures
105
+ for pat in _FOOTER_PATTERNS:
106
+ s = re.sub(pat, "", s)
107
+
108
+ # Collapse repeated whitespace
109
+ s = re.sub(r"[ \t]+", " ", s)
110
+ s = re.sub(r"\n{3,}", "\n\n", s).strip()
111
+ return s
112
+
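For a rough feel of the default cleaning, a hypothetical before/after (made-up lines, not taken from the dataset):

sample = "[00:01] Advisor: Good morning.\n[00:02] Client: Please update my phone number."
print(clean_transcript(sample))
# -> "Good morning.\nPlease update my phone number."
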
113
+ def read_text_from_file(file: gr.File) -> str:
114
+ if not file or not file.name:
115
+ return ""
116
+ name = file.name.lower()
117
+ data = file.read()
118
+ if name.endswith(".json"):
119
+ try:
120
+ obj = json.loads(data.decode("utf-8", errors="ignore"))
121
+ if isinstance(obj, dict) and "transcript" in obj:
122
+ return str(obj["transcript"])
123
+ return json.dumps(obj, ensure_ascii=False)
124
+ except Exception:
125
+ return data.decode("utf-8", errors="ignore")
126
+ else:
127
+ return data.decode("utf-8", errors="ignore")
128
 
129
+ def truncate_tokens(tokenizer, text: str, max_tokens: int) -> str:
130
+ toks = tokenizer(text, add_special_tokens=False)["input_ids"]
131
+ if len(toks) <= max_tokens:
132
+ return text
133
+ return tokenizer.decode(toks[-max_tokens:], skip_special_tokens=True)
134
 
135
+ # =========================
136
+ # HF model wrapper
137
+ # =========================
138
  class ModelWrapper:
 
 
 
 
 
139
  def __init__(self, repo_id: str, hf_token: Optional[str], load_in_4bit: bool):
140
  self.repo_id = repo_id
141
  self.hf_token = hf_token
142
  self.load_in_4bit = load_in_4bit
143
+ self.tokenizer = None
144
+ self.model = None
145
 
146
  def load(self):
 
147
  qcfg = None
148
  if self.load_in_4bit and DEVICE == "cuda":
149
  qcfg = BitsAndBytesConfig(
 
152
  bnb_4bit_compute_dtype=torch.float16,
153
  bnb_4bit_use_double_quant=True,
154
  )
155
+ tok = AutoTokenizer.from_pretrained(
156
+ self.repo_id, token=self.hf_token, cache_dir=str(SPACE_CACHE),
157
+ trust_remote_code=True, use_fast=True,
 
 
 
 
158
  )
159
+ if tok.pad_token is None and tok.eos_token:
160
+ tok.pad_token = tok.eos_token
161
+ model = AutoModelForCausalLM.from_pretrained(
162
+ self.repo_id, token=self.hf_token, cache_dir=str(SPACE_CACHE),
 
 
 
163
  trust_remote_code=True,
164
  torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
165
  device_map="auto" if DEVICE == "cuda" else None,
166
+ low_cpu_mem_usage=True, quantization_config=qcfg,
 
167
  attn_implementation="sdpa",
168
  )
169
+ self.tokenizer = tok
170
+ self.model = model
171
 
172
  @torch.inference_mode()
173
  def generate(self, system_prompt: str, user_prompt: str) -> str:
 
 
 
 
 
 
174
  if hasattr(self.tokenizer, "apply_chat_template"):
175
+ msgs = [{"role": "system", "content": system_prompt},
176
+ {"role": "user", "content": user_prompt}]
177
+ inputs = self.tokenizer.apply_chat_template(
178
+ msgs, add_generation_prompt=True, return_tensors="pt"
 
 
179
  ).to(self.model.device)
180
  else:
181
+ text = f"<s>[SYSTEM]\n{system_prompt}\n[/SYSTEM]\n[USER]\n{user_prompt}\n[/USER]\n"
182
+ inputs = self.tokenizer(text, return_tensors="pt").to(self.model.device)
183
+
184
  with torch.cuda.amp.autocast(enabled=(DEVICE == "cuda")):
185
  out_ids = self.model.generate(
186
+ **inputs,
187
  generation_config=GEN_CONFIG,
188
  eos_token_id=self.tokenizer.eos_token_id,
189
  pad_token_id=self.tokenizer.pad_token_id,
190
  )
191
  return self.tokenizer.decode(out_ids[0], skip_special_tokens=True)
192
 
 
 
193
  _MODEL_CACHE: Dict[str, ModelWrapper] = {}
 
 
194
  def get_model(repo_id: str, hf_token: Optional[str], load_in_4bit: bool) -> ModelWrapper:
 
195
  key = f"{repo_id}::{'4bit' if (load_in_4bit and DEVICE=='cuda') else 'full'}"
196
  if key not in _MODEL_CACHE:
197
+ m = ModelWrapper(repo_id, hf_token, load_in_4bit)
198
+ m.load()
199
+ _MODEL_CACHE[key] = m
200
  return _MODEL_CACHE[key]
201
 
202
+ # =========================
203
+ # Official evaluation (from README)
204
+ # =========================
 
 
205
  def evaluate_predictions(y_true: List[List[str]], y_pred: List[List[str]]) -> float:
206
+ ALLOWED_LABELS = OFFICIAL_LABELS
 
 
 
 
 
207
  LABEL_TO_IDX = {label: idx for idx, label in enumerate(ALLOWED_LABELS)}
208
  FN_PENALTY = 2.0
209
  FP_PENALTY = 1.0
210
+
211
+ def _process_sample_labels(sample_labels: List[str], sample_name: str) -> List[str]:
212
+ if not isinstance(sample_labels, list):
213
+ raise ValueError(f"{sample_name} must be a list of strings, got {type(sample_labels)}")
214
+ # dedupe
215
+ seen, uniq = set(), []
216
+ for label in sample_labels:
217
+ if not isinstance(label, str):
218
+ raise ValueError(f"{sample_name} contains non-string: {label} (type: {type(label)})")
219
+ if label in seen:
220
+ raise ValueError(f"{sample_name} contains duplicate label: '{label}'")
221
+ seen.add(label); uniq.append(label)
222
+ # validity
223
+ valid = []
224
+ for label in uniq:
225
+ if label not in ALLOWED_LABELS:
226
+ raise ValueError(f"{sample_name} contains invalid label: '{label}'. Allowed: {ALLOWED_LABELS}")
227
+ valid.append(label)
228
+ return valid
229
+
230
  if len(y_true) != len(y_pred):
231
+ raise ValueError(f"y_true and y_pred must have same length. Got {len(y_true)} vs {len(y_pred)}")
232
+
233
  n_samples = len(y_true)
234
+ n_labels = len(OFFICIAL_LABELS)
235
  y_true_binary = np.zeros((n_samples, n_labels), dtype=int)
236
  y_pred_binary = np.zeros((n_samples, n_labels), dtype=int)
237
 
238
+ for i, sample_labels in enumerate(y_true):
239
+ for label in _process_sample_labels(sample_labels, f"y_true[{i}]"):
240
+ y_true_binary[i, LABEL_TO_IDX[label]] = 1
241
+
242
+ for i, sample_labels in enumerate(y_pred):
243
+ for label in _process_sample_labels(sample_labels, f"y_pred[{i}]"):
244
+ y_pred_binary[i, LABEL_TO_IDX[label]] = 1
245
+
246
+ fn = np.sum((y_true_binary == 1) & (y_pred_binary == 0), axis=1)
247
+ fp = np.sum((y_true_binary == 0) & (y_pred_binary == 1), axis=1)
248
+ weighted = 2.0 * fn + 1.0 * fp
249
+ max_err = 2.0 * np.sum(y_true_binary, axis=1) + 1.0 * (n_labels - np.sum(y_true_binary, axis=1))
250
+ per_sample = np.where(max_err > 0, 1.0 - (weighted / max_err), 1.0)
251
+ return float(max(0.0, min(1.0, np.mean(per_sample))))
252
+
253
+ # =========================
254
+ # Inference helpers
255
+ # =========================
256
+ def build_keyword_context(allowed: List[str]) -> str:
257
+ parts = []
258
+ for lab in allowed:
259
+ kws = LABEL_KEYWORDS.get(lab, [])
260
+ if kws:
261
+ parts.append(f"- {lab}: " + ", ".join(kws))
262
+ else:
263
+ parts.append(f"- {lab}: (no default cues)")
264
+ return "\n".join(parts)
265
 
266
  def run_single(
267
  transcript_text: str,
268
+ transcript_file: gr.File,
269
+ use_cleaning: bool,
270
  allowed_labels_text: str,
271
  model_repo: str,
272
  use_4bit: bool,
273
  max_input_tokens: int,
274
  hf_token: str,
275
  ) -> Tuple[str, str, str, str]:
276
+
 
 
 
 
 
 
277
  t0 = _now_ms()
278
+
279
+ # Get transcript
280
+ raw_text = read_text_from_file(transcript_file) if transcript_file else (transcript_text or "")
281
+ raw_text = (raw_text or "").strip()
282
  if not raw_text:
283
+ return "", "", "No transcript provided.", json.dumps({"labels": [], "tasks": []}, indent=2)
284
+
285
+ # Cleaning
286
+ text = clean_transcript(raw_text) if use_cleaning else raw_text
287
+
288
+ # Allowed labels
 
289
  user_allowed = [ln.strip() for ln in (allowed_labels_text or "").splitlines() if ln.strip()]
290
+ allowed = normalize_labels(user_allowed or OFFICIAL_LABELS)
291
+
292
+ # Model
293
  try:
294
+ model = get_model(model_repo, (hf_token or "").strip() or None, use_4bit)
295
  except Exception as e:
296
+ return "", "", f"Model load failed: {e}", json.dumps({"labels": [], "tasks": []}, indent=2)
297
+
298
+ # Truncate
299
+ trunc = truncate_tokens(model.tokenizer, text, max_input_tokens)
300
+
301
+ # Build prompt
302
+ allowed_list_str = "\n".join(f"- {l}" for l in allowed)
303
+ keyword_ctx = build_keyword_context(allowed)
 
 
304
  user_prompt = USER_PROMPT_TEMPLATE.format(
305
+ transcript=trunc,
306
  allowed_labels_list=allowed_list_str,
307
+ keyword_context=keyword_ctx,
308
  )
309
+
310
  # Generate
311
+ t1 = _now_ms()
312
  try:
313
+ out = model.generate(SYSTEM_PROMPT, user_prompt)
314
  except Exception as e:
315
+ return "", "", f"Generation error: {e}", json.dumps({"labels": [], "tasks": []}, indent=2)
 
 
 
 
 
316
  t2 = _now_ms()
317
+
318
+ # Parse + filter
319
+ parsed = robust_json_extract(out)
320
  filtered = restrict_to_allowed(parsed, allowed)
321
+
322
+ # Diagnostics
323
+ diag = "\n".join([
324
+ f"Device: {DEVICE} (4-bit: {'Yes' if (use_4bit and DEVICE=='cuda') else 'No'})",
325
+ f"Model: {model_repo}",
326
+ f"Input cleaned: {'Yes' if use_cleaning else 'No'}",
327
+ f"Tokens (input, approx): ≤ {max_input_tokens}",
328
+ f"Latency: prep {t1-t0} ms, gen {t2-t1} ms, total {t2-t0} ms",
329
+ f"Allowed labels: {', '.join(allowed)}",
330
+ ])
331
+
332
+ # Summary
333
  labs = filtered.get("labels", [])
334
  tasks = filtered.get("tasks", [])
335
+ summary = "Detected labels:\n" + ("\n".join(f"- {l}" for l in labs) if labs else "(none)")
 
 
 
 
336
  if tasks:
337
+ summary += "\n\nTasks:\n" + "\n".join(
338
+ f"• [{t['label']}] {t.get('explanation','')} | ev: {t.get('evidence','')[:140]}{'…' if len(t.get('evidence',''))>140 else ''}"
339
+ for t in tasks
340
+ )
 
 
 
341
  else:
342
+ summary += "\n\nTasks: (none)"
343
+
344
+ return summary, json.dumps(filtered, indent=2, ensure_ascii=False), diag, out.strip()
345
+
346
+ # =========================
347
+ # Batch mode (ZIP with transcripts + truths)
348
+ # =========================
349
+ def read_zip(fileobj: io.BytesIO, exdir: Path) -> List[Path]:
350
+ exdir.mkdir(parents=True, exist_ok=True)
351
+ with zipfile.ZipFile(fileobj) as zf:
352
+ zf.extractall(exdir)
353
+ out = []
354
+ for p in exdir.rglob("*"):
355
+ if p.is_file():
356
+ out.append(p)
357
+ return out
 
 
 
358
 
359
  def run_batch(
360
+ zip_file: gr.File,
361
+ use_cleaning: bool,
362
  model_repo: str,
363
  use_4bit: bool,
364
  max_input_tokens: int,
365
  hf_token: str,
366
+ limit_files: int,
367
  ) -> Tuple[str, str, str, pd.DataFrame, str]:
368
+
369
+ if not zip_file:
370
+ return ("No ZIP provided.", "", "", pd.DataFrame(), "")
371
+
372
+ work = Path("/tmp/batch")
373
+ if work.exists():
374
+ for p in work.rglob("*"):
375
+ try: p.unlink()
376
+ except Exception: pass
377
+ try: work.rmdir()
378
+ except Exception: pass
379
+ work.mkdir(parents=True, exist_ok=True)
380
+
381
+ # Unzip
382
+ data = zip_file.read()
383
+ files = read_zip(io.BytesIO(data), work)
384
+
385
+ # Gather pairs by stem
386
+ txts: Dict[str, Path] = {}
387
+ gts: Dict[str, Path] = {}
388
+ for p in files:
389
+ if p.suffix.lower() == ".txt":
390
+ txts[p.stem] = p
391
+ elif p.suffix.lower() == ".json":
392
+ gts[p.stem] = p
393
+
394
+ stems = sorted(txts.keys())
395
+ if limit_files > 0:
396
+ stems = stems[:limit_files]
397
+ if not stems:
398
+ return ("No .txt transcripts found in ZIP.", "", "", pd.DataFrame(), "")
399
+
400
+ # Model
401
  try:
402
+ model = get_model(model_repo, (hf_token or "").strip() or None, use_4bit)
403
  except Exception as e:
404
  return (f"Model load failed: {e}", "", "", pd.DataFrame(), "")
405
+
406
+ allowed = OFFICIAL_LABELS[:] # fixed for scoring
407
+ allowed_list_str = "\n".join(f"- {l}" for l in allowed)
408
+ keyword_ctx = build_keyword_context(allowed)
409
+
410
+ y_true, y_pred = [], []
411
+ rows = []
412
+ t_start = _now_ms()
413
+
414
+ for stem in stems:
415
+ raw = txts[stem].read_text(encoding="utf-8", errors="ignore")
416
+ text = clean_transcript(raw) if use_cleaning else raw
417
+ trunc = truncate_tokens(model.tokenizer, text, max_input_tokens)
418
+
419
+ user_prompt = USER_PROMPT_TEMPLATE.format(
420
+ transcript=trunc,
421
+ allowed_labels_list=allowed_list_str,
422
+ keyword_context=keyword_ctx,
423
+ )
424
+
425
+ t0 = _now_ms()
426
+ out = model.generate(SYSTEM_PROMPT, user_prompt)
427
+ t1 = _now_ms()
428
+
429
+ parsed = robust_json_extract(out)
430
+ filtered = restrict_to_allowed(parsed, allowed)
431
+ pred_labels = filtered.get("labels", [])
432
+ y_pred.append(pred_labels)
433
+
434
+ # Ground truth (optional)
435
+ gt_labels = []
436
+ if stem in gts:
437
  try:
438
+ gt_obj = json.loads(gts[stem].read_text(encoding="utf-8", errors="ignore"))
439
+ if isinstance(gt_obj, dict) and "labels" in gt_obj and isinstance(gt_obj["labels"], list):
440
+ gt_labels = [x for x in gt_obj["labels"] if x in OFFICIAL_LABELS]
441
  except Exception:
442
+ pass
443
+ y_true.append(gt_labels)
444
+
445
+ # FP/FN counts for table
446
+ gt_set = set(gt_labels)
447
+ pr_set = set(pred_labels)
448
+ tp = sorted(gt_set & pr_set)
449
+ fp = sorted(pr_set - gt_set)
450
+ fn = sorted(gt_set - pr_set)
451
+
452
+ rows.append({
453
+ "file": stem,
454
+ "true_labels": ", ".join(gt_labels),
455
+ "pred_labels": ", ".join(pred_labels),
456
+ "TP": len(tp), "FP": len(fp), "FN": len(fn),
457
+ "gen_ms": t1 - t0
458
+ })
459
+
460
+ # Metrics
461
+ # If there is no ground truth in the ZIP, we still compute a table and skip score.
462
+ have_truth = any(len(v) > 0 for v in y_true)
463
+ score = evaluate_predictions(y_true, y_pred) if have_truth else None
464
+
465
+ df = pd.DataFrame(rows).sort_values(["FN", "FP", "file"])
466
+ diag = [
467
+ f"Processed files: {len(stems)}",
468
+ f"Device: {DEVICE} (4-bit: {'Yes' if (use_4bit and DEVICE=='cuda') else 'No'})",
469
+ f"Model: {model_repo}",
470
+ f"Input cleaned: {'Yes' if use_cleaning else 'No'}",
471
+ f"Tokens (input, approx): {max_input_tokens}",
472
+ f"Batch time: {_now_ms()-t_start} ms",
473
+ ]
474
+ if have_truth and score is not None:
475
+ # Simple derived metrics
476
+ total_tp = int(df["TP"].sum())
477
+ total_fp = int(df["FP"].sum())
478
+ total_fn = int(df["FN"].sum())
479
+ recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) else 1.0
480
+ precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) else 1.0
481
+ f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) else 1.0
482
+ diag += [
483
+ f"Official weighted score (0–1): {score:.3f}",
484
+ f"Recall: {recall:.3f} | Precision: {precision:.3f} | F1: {f1:.3f}",
485
+ f"Total TP={total_tp} FP={total_fp} FN={total_fn}",
486
+ ]
487
+ diag_str = "\n".join(diag)
488
+
489
+ # Build CSV text and write it to a temp file for download
490
+ csv_buf = io.StringIO()
491
+ df.to_csv(csv_buf, index=False)
492
+ csv_data = csv_buf.getvalue()
493
+
494
+ csv_path = Path("/tmp/batch_results.csv")
+ csv_path.write_text(csv_data, encoding="utf-8")
+ return ("Batch done.", diag_str, str(csv_path), df, csv_data)
495
+
496
+ # =========================
497
+ # UI
498
+ # =========================
499
+ MODEL_CHOICES = [
500
+ "swiss-ai/Apertus-8B-Instruct-2509",
501
+ "meta-llama/Meta-Llama-3-8B-Instruct",
502
+ "mistralai/Mistral-7B-Instruct-v0.3",
503
+ ]
504
+
505
+ with gr.Blocks(theme=gr.themes.Soft(), fill_height=True) as demo:
506
+ gr.Markdown("# Talk2Task — Task Extraction (UBS Challenge)")
507
+ gr.Markdown(
508
+ "This tool extracts challenge labels from transcripts. "
509
+ "Use **Single** for quick tests; use **Batch** to score a ZIP with transcripts + truths. "
510
+ "_Note: False negatives are penalised twice as much as false positives in the official metric; "
511
+ "we bias for recall._"
512
+ )
513
+
514
+ with gr.Tab("Single transcript"):
515
+ with gr.Row():
516
+ with gr.Column(scale=3):
517
+ file = gr.File(
518
+ label="Drag & drop transcript (.txt / .md / .json)",
519
+ file_types=[".txt", ".md", ".json"],
520
+ type="filepath",
521
  )
522
+ text = gr.Textbox(label="Or paste transcript", lines=14)
523
+ use_cleaning = gr.Checkbox(label="Apply default cleaning (remove disclaimers, timestamps, footers)", value=True)
524
+ labels_text = gr.Textbox(
525
+ label="Allowed Labels (one per line; leave empty to use official list)",
526
+ value="",
527
+ lines=8,
528
+ )
529
+ with gr.Column(scale=2):
530
+ repo = gr.Dropdown(label="Model", choices=MODEL_CHOICES, value=MODEL_CHOICES[0])
531
+ use_4bit = gr.Checkbox(label="Use 4-bit (GPU only)", value=True)
532
+ max_tokens = gr.Slider(label="Max input tokens", minimum=1024, maximum=8192, step=512, value=4096)
533
+ hf_token = gr.Textbox(label="HF_TOKEN (only for gated models)", type="password", value=os.environ.get("HF_TOKEN",""))
534
+ run_btn = gr.Button("Run Extraction", variant="primary")
535
+
536
+ with gr.Row():
537
+ summary = gr.Textbox(label="Summary", lines=12)
538
+ json_out = gr.Code(label="Strict JSON Output", language="json")
539
+ with gr.Row():
540
+ diag = gr.Textbox(label="Diagnostics", lines=8)
541
+ raw = gr.Textbox(label="Raw Model Output", lines=8)
542
+
543
+ run_btn.click(
544
+ fn=run_single,
545
+ inputs=[text, file, use_cleaning, labels_text, repo, use_4bit, max_tokens, hf_token],
546
+ outputs=[summary, json_out, diag, raw],
547
+ )
548
 
549
+ with gr.Tab("Batch evaluation"):
550
+ with gr.Row():
551
+ with gr.Column(scale=3):
552
+ zip_in = gr.File(label="ZIP with transcripts (.txt) and truths (.json)", file_types=[".zip"], type="filepath")
553
+ use_cleaning_b = gr.Checkbox(label="Apply default cleaning", value=True)
554
+ with gr.Column(scale=2):
555
+ repo_b = gr.Dropdown(label="Model", choices=MODEL_CHOICES, value=MODEL_CHOICES[0])
556
+ use_4bit_b = gr.Checkbox(label="Use 4-bit (GPU only)", value=True)
557
+ max_tokens_b = gr.Slider(label="Max input tokens", minimum=1024, maximum=8192, step=512, value=4096)
558
+ hf_token_b = gr.Textbox(label="HF_TOKEN (only for gated models)", type="password", value=os.environ.get("HF_TOKEN",""))
559
+ limit_files = gr.Slider(label="Process at most N files (0 = all)", minimum=0, maximum=2000, step=10, value=0)
560
+ run_batch_btn = gr.Button("Run Batch", variant="primary")
561
+
562
+ with gr.Row():
563
+ status = gr.Textbox(label="Status", lines=1)
564
+ diag_b = gr.Textbox(label="Batch diagnostics & metrics", lines=10)
565
+
566
+ with gr.Row():
567
+ df_out = gr.Dataframe(label="Per-file results (TP/FP/FN, times)", interactive=False)
568
+ csv_out = gr.File(label="Download CSV (click to save)", interactive=False)
569
+
570
+ def _save_csv(csv_text: str) -> str:
571
+ if not csv_text:
572
+ return ""
573
+ out_path = Path("/tmp/batch_results.csv")
574
+ out_path.write_text(csv_text, encoding="utf-8")
575
+ return str(out_path)
576
+
577
+ run_batch_btn.click(
578
+ fn=run_batch,
579
+ inputs=[zip_in, use_cleaning_b, repo_b, use_4bit_b, max_tokens_b, hf_token_b, limit_files],
580
+ outputs=[status, diag_b, csv_out, df_out, gr.Textbox(visible=False)],
581
+ )
582
 
583
  if __name__ == "__main__":
584
+ demo.launch()