Commit d279e64
1 Parent(s): 849364d

hf_backend.py  +40 -12  CHANGED
@@ -33,11 +33,10 @@ except Exception as e:
 
 # ---------------- helpers ----------------
 def _pick_cpu_dtype() -> torch.dtype:
-    # Prefer BF16 if CPU supports it
     if hasattr(torch, "cpu") and hasattr(torch.cpu, "is_bf16_supported"):
         try:
             if torch.cpu.is_bf16_supported():
-                logger.info("CPU BF16 supported,
+                logger.info("CPU BF16 supported, will attempt torch.bfloat16")
                 return torch.bfloat16
         except Exception:
             pass
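For context, _pick_cpu_dtype follows the standard defensive probe for CPU BF16 support: torch.cpu.is_bf16_supported() only exists on newer PyTorch builds, so both the attribute lookup and the call are guarded. A minimal self-contained sketch of that pattern is below; the FP32 default at the end is an assumption, since the rest of the function falls outside this hunk, and the logger setup is illustrative.

import logging

import torch

logger = logging.getLogger(__name__)


def pick_cpu_dtype() -> torch.dtype:
    # torch.cpu.is_bf16_supported() is only present on newer PyTorch builds,
    # so guard both the attribute lookup and the call itself.
    if hasattr(torch, "cpu") and hasattr(torch.cpu, "is_bf16_supported"):
        try:
            if torch.cpu.is_bf16_supported():
                logger.info("CPU BF16 supported, will attempt torch.bfloat16")
                return torch.bfloat16
        except Exception:
            pass
    # Assumed fallback: FP32 is always safe on CPU.
    return torch.float32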
@@ -57,17 +56,32 @@ def _get_model(device: str, dtype: torch.dtype):
     cfg = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=True)
     if hasattr(cfg, "quantization_config"):
         logger.warning("Removing quantization_config from model config")
-        delattr(cfg, "quantization_config")
+        delattr(cfg, "quantization_config")
+
+    try:
+        model = AutoModelForCausalLM.from_pretrained(
+            MODEL_ID,
+            config=cfg,
+            torch_dtype=dtype,
+            trust_remote_code=True,
+            device_map="auto" if device != "cpu" else {"": "cpu"},
+        )
+    except Exception as e:
+        if device == "cpu" and dtype == torch.bfloat16:
+            logger.warning(f"BF16 load failed on CPU: {e}. Retrying with FP32.")
+            model = AutoModelForCausalLM.from_pretrained(
+                MODEL_ID,
+                config=cfg,
+                torch_dtype=torch.float32,
+                trust_remote_code=True,
+                device_map={"": "cpu"},
+            )
+            dtype = torch.float32
+        else:
+            raise
 
-    model = AutoModelForCausalLM.from_pretrained(
-        MODEL_ID,
-        config=cfg,
-        torch_dtype=dtype,
-        trust_remote_code=True,
-        device_map="auto" if device != "cpu" else {"": "cpu"},
-    )
     model.eval()
-    _MODEL_CACHE[
+    _MODEL_CACHE[(device, dtype)] = model
     return model
 
 
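The new try/except is a load-then-downgrade pattern: attempt the requested dtype, and if a BF16 load fails on CPU, reload in FP32 and cache under the dtype that actually succeeded. A condensed sketch of that pattern, assuming a module-level _MODEL_CACHE dict and omitting the config handling shown above; the helper name is illustrative and device_map requires accelerate to be installed.

import logging

import torch
from transformers import AutoModelForCausalLM

logger = logging.getLogger(__name__)
_MODEL_CACHE: dict[tuple[str, torch.dtype], torch.nn.Module] = {}


def load_with_fp32_fallback(model_id: str, device: str, dtype: torch.dtype):
    # Serve repeat requests for the same (device, dtype) pair from the cache.
    if (device, dtype) in _MODEL_CACHE:
        return _MODEL_CACHE[(device, dtype)]
    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=dtype,
            device_map="auto" if device != "cpu" else {"": "cpu"},
        )
    except Exception as exc:
        if device == "cpu" and dtype == torch.bfloat16:
            # Some CPU/PyTorch combinations cannot materialize BF16 weights;
            # retry in FP32 and remember the dtype that actually loaded.
            logger.warning("BF16 load failed on CPU: %s. Retrying with FP32.", exc)
            model = AutoModelForCausalLM.from_pretrained(
                model_id,
                torch_dtype=torch.float32,
                device_map={"": "cpu"},
            )
            dtype = torch.float32
        else:
            raise
    model.eval()
    _MODEL_CACHE[(device, dtype)] = model
    return model

One consequence of caching under the downgraded dtype: if the lookup earlier in _get_model keys on the requested dtype, a later BF16 request on the same CPU misses the cache and repeats the failing load before falling back again. Whether that cost matters depends on how often the helper is called with BF16.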
@@ -78,7 +92,6 @@ class HFChatBackend(ChatBackend):
             raise RuntimeError(load_error)
 
         messages = request.get("messages", [])
-        prompt = messages[-1]["content"] if messages else "(empty)"
         temperature = float(request.get("temperature", settings.LlmTemp or 0.7))
         max_tokens = int(request.get("max_tokens", settings.LlmOpenAICtxSize or 512))
 
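A side note on the unchanged default handling here: settings.LlmTemp or 0.7 falls back whenever the setting is falsy, so a deliberately configured temperature of 0.0 is silently replaced by 0.7. The snippet below only demonstrates that Python behaviour; the settings object is a stand-in, not the repository's settings module.

class FakeSettings:
    LlmTemp = 0.0  # operator explicitly asks for near-deterministic sampling


settings = FakeSettings()
request = {}  # no per-request override

temperature = float(request.get("temperature", settings.LlmTemp or 0.7))
print(temperature)  # prints 0.7: the configured 0.0 is treated as "unset"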
@@ -91,6 +104,21 @@ class HFChatBackend(ChatBackend):
             zero_client.HEADERS["X-IP-Token"] = x_ip_token
             logger.debug("Injected X-IP-Token into ZeroGPU headers")
 
+        # Build prompt using chat template if available
+        if hasattr(tokenizer, "apply_chat_template") and tokenizer.chat_template:
+            try:
+                prompt = tokenizer.apply_chat_template(
+                    messages,
+                    tokenize=False,
+                    add_generation_prompt=True,
+                )
+                logger.debug("Applied chat template for prompt")
+            except Exception as e:
+                logger.warning(f"Failed to apply chat template: {e}, using fallback")
+                prompt = messages[-1]["content"] if messages else "(empty)"
+        else:
+            prompt = messages[-1]["content"] if messages else "(empty)"
+
         def _run_once(prompt: str, device: str, dtype: torch.dtype) -> str:
             model = _get_model(device, dtype)
             inputs = tokenizer(prompt, return_tensors="pt").to(device)
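The added prompt-building block is the usual tokenizer.apply_chat_template flow with a plain-text fallback when no template exists or rendering fails, and it is why the old prompt = messages[-1]["content"] line was dropped in the previous hunk: building the prompt from the whole messages list lets system and history turns reach the model instead of only the latest user message. A standalone sketch of that flow; the model name is only an example, and the repository's MODEL_ID and tokenizer wiring are assumed rather than shown.

from transformers import AutoTokenizer

# Example model; any chat model whose tokenizer ships a chat_template works.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]

if getattr(tokenizer, "chat_template", None):
    # Render the full conversation into the model's prompt format and
    # append the assistant generation prefix.
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
else:
    # Mirrors the diff's fallback: last message only, or a placeholder.
    prompt = messages[-1]["content"] if messages else "(empty)"

print(prompt)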