Spaces:
Sleeping
Sleeping
| """ | |
| Inference Script — Clinical Note Scribe | |
| ======================================== | |
| MANDATORY | |
| - Before submitting, ensure the following variables are defined: | |
| API_BASE_URL The API endpoint for the LLM. | |
| MODEL_NAME The model identifier to use for inference. | |
| HF_TOKEN Your Hugging Face / API key. | |
| LOCAL_IMAGE_NAME The name of the local image for the environment. | |
| - Defaults are set only for API_BASE_URL and MODEL_NAME: | |
| API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1") | |
| MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct") | |
| - The inference script must be named inference.py and placed in the root directory. | |
| - Participants must use OpenAI Client for all LLM calls using above variables. | |
| STDOUT FORMAT | |
| - The script must emit exactly three line types to stdout, in this order: | |
| [START] task=<task_name> env=<benchmark> model=<model_name> | |
| [STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null> | |
| [END] success=<true|false> steps=<n> score=<score> rewards=<r1,r2,...,rn> | |
| Rules: | |
| - One [START] line at episode begin. | |
| - One [STEP] line per step, immediately after env.step() returns. | |
| - One [END] line after the task finishes, always emitted (even on exception). | |
| - reward and rewards are formatted to 2 decimal places. | |
| - done and success are lowercase booleans: true or false. | |
| - error is the raw last_action_error string, or null if none. | |
| - All fields on a single line with no newlines within a line. | |
| - Each task should return score in [0, 1]. | |
| Designed to complete in under 20 minutes on 2 vCPU / 8 GB RAM. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import logging | |
| import os | |
| import sys | |
| import textwrap | |
| from typing import Any, List, Optional | |
| from openai import OpenAI | |
# Keep stdout reserved for the [START]/[STEP]/[END] lines: the environment's
# own logger gets a single stderr handler and no longer propagates upward.
env_logger = logging.getLogger("clinical_note_scribe")
env_logger.propagate = False
env_logger.setLevel(logging.INFO)
env_logger.handlers.clear()  # drop any handlers installed before this point
stderr_handler = logging.StreamHandler(sys.stderr)
env_logger.addHandler(stderr_handler)
| # Environment imports | |
| sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) | |
| from environment import ClinicalNoteScribeEnv, Action, SOAPNote # noqa: E402 | |
| from environment.tasks import TASK_REGISTRY # noqa: E402 | |
| # Config | |
# Settings without a default: these are None when the variable is unset.
LOCAL_IMAGE_NAME = os.environ.get("LOCAL_IMAGE_NAME")
HF_TOKEN = os.environ.get("HF_TOKEN")

# Settings with a default endpoint / model.
API_BASE_URL = os.environ.get("API_BASE_URL", "https://router.huggingface.co/v1")
MODEL_NAME = os.environ.get("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")

# Benchmark name printed in the [START] line, and the tasks to run in order.
BENCHMARK = "clinical-note-scribe"
TASK_IDS = [*TASK_REGISTRY.keys()]

# Per-episode step cap and sampling parameters for each completion request.
MAX_STEPS = 5
MAX_TOKENS = 1024
TEMPERATURE = 0.2
# System prompt
# Sent as the "system" message on every completion request. It instructs the
# model to answer with one bare JSON object; the reply is later parsed and
# unpacked into Action(**...), so the schema below presumably mirrors the
# Action / SOAPNote models -- verify against the environment package.
SYSTEM_PROMPT = textwrap.dedent("""\
You are a clinical documentation assistant. Given a doctor-patient transcript
and patient context, generate a concise, clinically accurate SOAP note.
RULES:
1. Use professional medical language. Avoid over-certain phrasing such as
"patient definitely has", "diagnosis is certain", or "100% certain".
2. Keep the note concise - aim for under 400 words total across all four sections.
3. Return your output as a **single valid JSON object** matching this schema exactly:
{
"action_type": "submit_note",
"soap_note": {
"subjective": "<patient's reported symptoms, history, and concerns>",
"objective": "<exam findings, vitals, lab results, imaging>",
"assessment": "<differential diagnoses and clinical reasoning>",
"plan": "<treatment plan, medications, follow-up, referrals>"
}
}
Return ONLY the JSON object. No markdown fences, no commentary, no extra keys.
""").strip()
| # Stdout logging — mandatory hackathon format | |
def log_start(task: str, env: str, model: str) -> None:
    """Print the single ``[START]`` line that opens an episode.

    Args:
        task: Task identifier being run.
        env: Benchmark / environment name.
        model: Model identifier used for inference.
    """
    fields = ("[START]", f"task={task}", f"env={env}", f"model={model}")
    print(" ".join(fields), flush=True)
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
    """Print one ``[STEP]`` line describing the step that just completed.

    The output format requires every record to occupy exactly one line, but
    error strings (for example multi-line validation errors) can contain line
    breaks. Line breaks in ``action`` and ``error`` are therefore folded into
    single spaces before printing.

    Args:
        step: 1-based step number.
        action: Short description of the submitted action.
        reward: Reward returned for this step; printed with 2 decimals.
        done: Whether the episode finished; printed as ``true``/``false``.
        error: Error text for this step, or ``None``/empty for ``null``.
    """

    def _one_line(text: str) -> str:
        # Join the non-blank lines so the record never spans several lines.
        return " ".join(seg.strip() for seg in str(text).splitlines() if seg.strip())

    # An error made only of whitespace/newlines is reported as null as well.
    error_val = (_one_line(error) if error else "") or "null"
    done_val = str(done).lower()
    print(
        f"[STEP] step={step} action={_one_line(action)} reward={reward:.2f} "
        f"done={done_val} error={error_val}",
        flush=True,
    )
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Print the closing ``[END]`` line for an episode.

    Args:
        success: Episode outcome; printed in lowercase.
        steps: Number of steps taken.
        score: Final score; printed with 2 decimals.
        rewards: Per-step rewards; printed comma-separated with 2 decimals.
    """
    formatted_rewards = ",".join(map("{:.2f}".format, rewards))
    success_flag = str(success).lower()
    line = (
        f"[END] success={success_flag} steps={steps} "
        f"score={score:.2f} rewards={formatted_rewards}"
    )
    print(line, flush=True)
| # Helpers | |
def _build_user_prompt(transcript: str, patient_context: dict[str, Any]) -> str:
    """Assemble the user message from the patient context and the transcript.

    The context is rendered as indented JSON (values that are not JSON
    serialisable are stringified) and the transcript as a fenced block,
    followed by the instruction to produce the SOAP note.
    """
    context_json = json.dumps(patient_context, indent=2, default=str)
    sections = [
        f"## Patient Context\n```json\n{context_json}\n```",
        f"## Doctor-Patient Transcript\n```\n{transcript}\n```",
        "Generate the SOAP note as a JSON Action object.",
    ]
    return "\n\n".join(sections)
def _parse_json(raw: str) -> dict[str, Any]:
    """Parse the model's raw text output into a dict.

    Tolerates the common ways a model wraps its answer:

    * a markdown fence on its own lines (with or without a language tag),
    * a fence on the same line as the JSON (no newline after the backticks),
    * commentary before or after the JSON object.

    Args:
        raw: Text returned by the model.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: If no parseable JSON can be recovered. The
            failure and the raw output are written to stderr first.
    """
    cleaned = raw.strip()
    if cleaned.startswith("```"):
        _fence_line, newline, remainder = cleaned.partition("\n")
        # With a newline, the whole opening fence line (incl. language tag)
        # is dropped; without one only the backticks can be removed safely.
        cleaned = remainder if newline else cleaned[3:]
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    cleaned = cleaned.strip()

    try:
        return json.loads(cleaned)
    except json.JSONDecodeError as exc:
        # Fallback: take the outermost {...} span, which skips a leftover
        # language tag or surrounding commentary.
        start, end = cleaned.find("{"), cleaned.rfind("}")
        if 0 <= start < end:
            candidate = cleaned[start:end + 1]
            if candidate != cleaned:
                try:
                    return json.loads(candidate)
                except json.JSONDecodeError:
                    pass  # report the original failure below
        print(f"[DEBUG] Failed to parse model output as JSON: {exc}", file=sys.stderr, flush=True)
        print(f"[DEBUG] Raw output:\n{raw}", file=sys.stderr, flush=True)
        raise
def get_soap_note(client: OpenAI, transcript: str, patient_context: dict[str, Any]) -> dict[str, Any]:
    """Request a SOAP note from the model and return the parsed action dict.

    Args:
        client: OpenAI-compatible client used for the chat completion.
        transcript: Doctor-patient transcript text.
        patient_context: Patient context embedded in the user prompt.

    Returns:
        The JSON value decoded from the model's reply.

    Raises:
        Exception: Any error raised by the request or by reading the reply
            is logged to stderr and re-raised unchanged. JSON parsing errors
            from ``_parse_json`` propagate as well.
    """
    user_prompt = _build_user_prompt(transcript, patient_context)
    try:
        completion = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
            temperature=TEMPERATURE,
            max_tokens=MAX_TOKENS,
            stream=False,
        )
        # content may be None; treat that like an empty reply.
        raw = (completion.choices[0].message.content or "").strip()
    except Exception as exc:
        print(f"[DEBUG] Model request failed: {exc}", file=sys.stderr, flush=True)
        raise
    # Parsing stays outside the try: _parse_json already reports its own
    # failures, so a malformed reply is not also logged as a request failure.
    return _parse_json(raw)
| # Per-task runner | |
def run_task(client: OpenAI, env: ClinicalNoteScribeEnv, task_id: str) -> dict[str, Any]:
    """Run a single task episode and return the result dict.

    Emits one ``[START]`` line, one ``[STEP]`` line per environment step and
    always one ``[END]`` line, even if the episode raises. When the model call
    or action construction fails, an empty SOAP note is submitted instead so
    the episode still advances.

    Args:
        client: OpenAI-compatible client used to generate notes.
        env: Environment instance; reset with ``task_id`` at the start.
        task_id: Identifier of the task to run.

    Returns:
        A dict with keys ``task_id``, ``score`` (last reward clamped to
        [0, 1]; 0.0 on failure), ``steps``, ``rewards`` and ``success``
        (True when the score is above zero).
    """
    rewards: List[float] = []
    steps_taken = 0
    score = 0.0
    success = False

    log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)
    try:
        obs = env.reset(task_id)
        for step in range(1, MAX_STEPS + 1):
            # Scoped to this iteration so that a model failure can never be
            # reported against a later, successful step.
            model_error: Optional[str] = None
            try:
                action_dict = get_soap_note(client, obs.transcript, obs.patient_context)
                action = Action(**action_dict)
                action_str = "submit_note(sections=S,O,A,P)"
            except Exception as exc:
                # On model / parse / validation failure, submit an empty note.
                action = Action(
                    action_type="submit_note",
                    soap_note=SOAPNote(
                        subjective="",
                        objective="",
                        assessment="",
                        plan="",
                    ),
                )
                action_str = "submit_note(fallback)"
                model_error = str(exc)

            obs, reward_obj, done, _info = env.step(action)
            reward_val = reward_obj.value
            rewards.append(reward_val)
            steps_taken = step

            # The environment's most recent error takes precedence; otherwise
            # report this step's model failure, if any.
            if obs.errors_so_far:
                error_msg = obs.errors_so_far[-1]
            else:
                error_msg = model_error or None

            log_step(
                step=step,
                action=action_str,
                reward=reward_val,
                done=done,
                error=error_msg,
            )
            if done:
                break

        # The score is the final step's reward, clamped to [0, 1].
        score = rewards[-1] if rewards else 0.0
        score = min(max(score, 0.0), 1.0)
        success = score > 0.0
    except Exception as exc:
        print(f"[DEBUG] Task {task_id} failed: {exc}", file=sys.stderr, flush=True)
        score = 0.0
        success = False
    finally:
        # Always emitted, even when the episode raised.
        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)

    return {
        "task_id": task_id,
        "score": score,
        "steps": steps_taken,
        "rewards": rewards,
        "success": success,
    }
| # Main | |
def main() -> None:
    """Run every registered task once and print a summary table to stderr.

    A missing ``HF_TOKEN`` only produces a warning. The OpenAI client refuses
    to be constructed when no API key is available at all, which would abort
    the run before any ``[START]``/``[END]`` line is printed, so a placeholder
    key is supplied in that case and failures surface per request instead.
    """
    if not HF_TOKEN:
        print(
            "[DEBUG] WARNING: HF_TOKEN is not set. "
            "Model calls will fail unless the endpoint requires no auth.",
            file=sys.stderr,
            flush=True,
        )

    # Keep the client's own OPENAI_API_KEY fallback, then use a placeholder so
    # that construction cannot raise for a missing key.
    api_key = HF_TOKEN or os.getenv("OPENAI_API_KEY") or "missing-api-key"
    client = OpenAI(base_url=API_BASE_URL, api_key=api_key)
    env = ClinicalNoteScribeEnv()

    results: List[dict[str, Any]] = []
    for task_id in TASK_IDS:
        results.append(run_task(client, env, task_id))

    _print_summary(results)


def _print_summary(results: List[dict[str, Any]]) -> None:
    """Write the per-task score table and the average score to stderr."""

    def emit(line: str = "") -> None:
        print(line, file=sys.stderr, flush=True)

    emit()
    emit("=" * 60)
    emit(" SUMMARY")
    emit("=" * 60)

    # default=0 keeps this valid when no task was run at all.
    longest_id = max((len(r["task_id"]) for r in results), default=0)
    col_task = max(len("Task"), longest_id)
    header = f" {'Task':<{col_task}} {'Score':>7} {'Steps':>5}"
    sep = f" {'-' * col_task} {'-' * 7} {'-' * 5}"
    emit(header)
    emit(sep)

    total_score = 0.0
    for r in results:
        s = f"{r['score']:.4f}" if r["success"] else "ERROR"
        emit(f" {r['task_id']:<{col_task}} {s:>7} {r['steps']:>5}")
        total_score += r["score"]

    emit(sep)
    avg = total_score / len(results) if results else 0.0
    emit(f" {'AVERAGE':<{col_task}} {avg:>7.4f}")
    emit()
# Run the tasks only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()