Spaces:

Mungert
/

GradLLM

Running

App Files Files Community

johnbridges committed on Sep 18

Commit

1175344

1 Parent(s): b416f51

.

Browse files

Files changed (1) hide show

hf_backend.py +24 -6

hf_backend.py CHANGED Viewed

@@ -1,4 +1,3 @@
-# hf_backend.py
 import time, logging, json, asyncio
 from contextlib import nullcontext
 from typing import Any, Dict, AsyncIterable, Tuple
@@ -33,16 +32,38 @@ except Exception as e:
     load_error = f"Failed to load tokenizer: {e}"
     logger.exception(load_error)
 def _pick_cpu_dtype() -> torch.dtype:
     try:
-        if hasattr(torch, "cpu") and hasattr(torch.cpu, "is_bf16_supported") and torch.cpu.is_bf16_supported():
-            logger.info("[dtype] CPU BF16 supported -> torch.bfloat16")
             return torch.bfloat16
     except Exception as e:
         logger.warning(f"[dtype] BF16 probe failed: {e}")
     logger.info("[dtype] fallback -> torch.float32")
     return torch.float32
 _MODEL_CACHE: Dict[tuple[str, torch.dtype], AutoModelForCausalLM] = {}
 def _get_model(device: str, dtype: torch.dtype) -> Tuple[AutoModelForCausalLM, torch.dtype]:
@@ -153,12 +174,10 @@ class HFChatBackend(ChatBackend):
             zero_client.HEADERS["X-IP-Token"] = x_ip_token
             logger.info("[req] injected X-IP-Token into ZeroGPU headers")
-        # Build prompt (pass tools to template)
         if hasattr(tokenizer, "apply_chat_template") and getattr(tokenizer, "chat_template", None):
             try:
                 prompt = tokenizer.apply_chat_template(
                     messages,
-                    #tools=tools,
                     tokenize=False,
                     add_generation_prompt=True,
                 )
@@ -212,7 +231,6 @@ class HFChatBackend(ChatBackend):
             logger.info(f"[gen] text len={len(text)}\n{_snippet(text, 1200)}")
             return text
-        # Offload heavy work to a worker thread so asyncio heartbeats continue
         if spaces:
             @spaces.GPU(duration=120)
             def run_once_sync(prompt: str) -> str:

 import time, logging, json, asyncio
 from contextlib import nullcontext
 from typing import Any, Dict, AsyncIterable, Tuple
     load_error = f"Failed to load tokenizer: {e}"
     logger.exception(load_error)
+def probe_bf16_runtime() -> bool:  # True only when torch reports CPU BF16 support and a small BF16 matmul succeeds
+    """Check if BF16 is both reported and actually used in ops on CPU."""
+    if not (hasattr(torch, "cpu") and hasattr(torch.cpu, "is_bf16_supported")):  # capability API missing on this torch build
+        return False
+    if not torch.cpu.is_bf16_supported():  # torch itself reports no CPU BF16 support
+        return False
+    try:  # smoke test: run an actual BF16 op instead of trusting the flag alone
+        a = torch.randn(16, 16, dtype=torch.bfloat16)
+        b = torch.randn(16, 16, dtype=torch.bfloat16)
+        c = a @ b  # 16x16 BF16 matrix multiply
+        return c.dtype == torch.bfloat16  # only the result dtype is checked, not its values
+    except Exception:  # any failure while running the op counts as "not supported"
+        return False
 def _pick_cpu_dtype() -> torch.dtype:  # returns torch.bfloat16 if the BF16 probe passes, otherwise torch.float32
     try:  # a raising probe must not prevent the float32 fallback below
+        if probe_bf16_runtime():
+            logger.info("[dtype] Verified BF16 execution on CPU -> torch.bfloat16")
             return torch.bfloat16
     except Exception as e:  # e.g. torch.cpu.is_bf16_supported() raising inside the probe
         logger.warning(f"[dtype] BF16 probe failed: {e}")
     logger.info("[dtype] fallback -> torch.float32")  # reached when the probe returned False or raised
     return torch.float32
+# Pick the CPU dtype once at module load and log it (NOTE(review): no reader of CPU_DTYPE is visible in this diff - confirm it is used elsewhere)
+CPU_DTYPE = _pick_cpu_dtype()
+logger.info(f"[init] Default CPU dtype = {CPU_DTYPE}")
 _MODEL_CACHE: Dict[tuple[str, torch.dtype], AutoModelForCausalLM] = {}
 def _get_model(device: str, dtype: torch.dtype) -> Tuple[AutoModelForCausalLM, torch.dtype]:
             zero_client.HEADERS["X-IP-Token"] = x_ip_token
             logger.info("[req] injected X-IP-Token into ZeroGPU headers")
         if hasattr(tokenizer, "apply_chat_template") and getattr(tokenizer, "chat_template", None):
             try:
                 prompt = tokenizer.apply_chat_template(
                     messages,
                     tokenize=False,
                     add_generation_prompt=True,
                 )
             logger.info(f"[gen] text len={len(text)}\n{_snippet(text, 1200)}")
             return text
         if spaces:
             @spaces.GPU(duration=120)
             def run_once_sync(prompt: str) -> str: