Spaces:

Mungert
/

GradLLM

Running

App Files Community

johnbridges committed on Aug 17

Commit

60a9595

1 Parent(s): 5b32c71

.

Browse files

Files changed (1) hide show

hf_backend.py +42 -63

hf_backend.py CHANGED Viewed

@@ -1,5 +1,5 @@
-# hf_backend.py (patched)
-import time, logging, os
 from typing import Any, Dict, AsyncIterable
 import torch
@@ -7,23 +7,24 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 from backends_base import ChatBackend, ImagesBackend
 from config import settings
 try:
     import spaces
-    from spaces.zero.client import SpaceZeroClient
 except ImportError:
-    spaces, SpaceZeroClient = None, None
-logger = logging.getLogger(__name__)
 MODEL_ID = settings.LlmHFModelID or "Qwen/Qwen2.5-1.5B-Instruct"
-logger.info(f"Loading {MODEL_ID} on CPU at startup (ZeroGPU safe)...")
 tokenizer, model, load_error = None, None, None
 try:
     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, use_fast=False)
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
-        torch_dtype=torch.float32,
         trust_remote_code=True,
     )
     model.eval()
@@ -32,22 +33,7 @@ except Exception as e:
     logger.exception(load_error)
-def pick_device() -> str:
-    if torch.cuda.is_available():
-        return "cuda"
-    if getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
-        return "mps"
-    return "cpu"
-def pick_dtype(device: str) -> torch.dtype:
-    if device == "cuda":
-        major, _ = torch.cuda.get_device_capability()
-        return torch.bfloat16 if major >= 8 else torch.float16
-    if device == "mps":
-        return torch.float16
-    return torch.float32
 class HFChatBackend(ChatBackend):
     async def stream(self, request: Dict[str, Any]) -> AsyncIterable[Dict[str, Any]]:
         if load_error:
@@ -61,19 +47,24 @@ class HFChatBackend(ChatBackend):
         rid = f"chatcmpl-hf-{int(time.time())}"
         now = int(time.time())
-        # --- ✅ Extract X-IP-Token from RabbitMQ message
         x_ip_token = request.get("x_ip_token")
-        headers = {}
-        if x_ip_token:
-            headers["X-IP-Token"] = x_ip_token
-            logger.info("Using X-IP-Token from request for ZeroGPU attribution")
-        def _gpu_inference_fn(prompt: str) -> str:
-            device = pick_device()
-            dtype = pick_dtype(device)
-            model.to(device=device, dtype=dtype).eval()
             inputs = tokenizer(prompt, return_tensors="pt").to(device)
             with torch.inference_mode(), torch.autocast(device_type=device, dtype=dtype):
                 outputs = model.generate(
                     **inputs,
@@ -83,35 +74,23 @@ class HFChatBackend(ChatBackend):
                 )
             return tokenizer.decode(outputs[0], skip_special_tokens=True)
-        if spaces and SpaceZeroClient:
-            # Use a custom SpaceZeroClient with headers
-            client = SpaceZeroClient(headers=headers or None)
-            try:
-                text = await client.run(_gpu_inference_fn, args=[prompt], duration=120)
-            except Exception:
-                logger.exception("HF inference (ZeroGPU) failed")
-                raise
-        else:
-            # CPU fallback
-            inputs = tokenizer(prompt, return_tensors="pt")
-            with torch.inference_mode():
-                outputs = model.generate(
-                    **inputs,
-                    max_new_tokens=max_tokens,
-                    temperature=temperature,
-                    do_sample=True,
-                )
-            text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        yield {
-            "id": rid,
-            "object": "chat.completion.chunk",
-            "created": now,
-            "model": MODEL_ID,
-            "choices": [
-                {"index": 0, "delta": {"content": text}, "finish_reason": "stop"}
-            ],
-        }
 class StubImagesBackend(ImagesBackend):
     """
     Stub backend for images since HFChatBackend is text-only.

+# hf_backend.py
+import time, logging
 from typing import Any, Dict, AsyncIterable
 import torch
 from backends_base import ChatBackend, ImagesBackend
 from config import settings
+logger = logging.getLogger(__name__)
 try:
     import spaces
+    from spaces.zero import client as zero_client
 except ImportError:
+    spaces, zero_client = None, None
+# --- Model setup (CPU-safe load, real inference on GPU only) ---
 MODEL_ID = settings.LlmHFModelID or "Qwen/Qwen2.5-1.5B-Instruct"
+logger.info(f"Preloading tokenizer for {MODEL_ID} on CPU (ZeroGPU safe)...")
 tokenizer, model, load_error = None, None, None
 try:
     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, use_fast=False)
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
+        torch_dtype=torch.float32,   # dummy dtype for CPU preload
         trust_remote_code=True,
     )
     model.eval()
     logger.exception(load_error)
+# ---------------- Chat Backend ----------------
 class HFChatBackend(ChatBackend):
     async def stream(self, request: Dict[str, Any]) -> AsyncIterable[Dict[str, Any]]:
         if load_error:
         rid = f"chatcmpl-hf-{int(time.time())}"
         now = int(time.time())
+        if not spaces:
+            raise RuntimeError("ZeroGPU (spaces) is required but not available!")
+        # --- Inject X-IP-Token into global headers ---
         x_ip_token = request.get("x_ip_token")
+        if x_ip_token and zero_client:
+            zero_client.HEADERS["X-IP-Token"] = x_ip_token
+            logger.debug("Injected X-IP-Token into ZeroGPU headers")
+        # --- Define the GPU-only inference function ---
+        @spaces.GPU(duration=120)
+        def run_once(prompt: str) -> str:
+            device = "cuda"   # force CUDA
+            dtype = torch.float16
+            model.to(device=device, dtype=dtype).eval()
             inputs = tokenizer(prompt, return_tensors="pt").to(device)
             with torch.inference_mode(), torch.autocast(device_type=device, dtype=dtype):
                 outputs = model.generate(
                     **inputs,
                 )
             return tokenizer.decode(outputs[0], skip_special_tokens=True)
+        try:
+            text = run_once(prompt)
+            yield {
+                "id": rid,
+                "object": "chat.completion.chunk",
+                "created": now,
+                "model": MODEL_ID,
+                "choices": [
+                    {"index": 0, "delta": {"content": text}, "finish_reason": "stop"}
+                ],
+            }
+        except Exception:
+            logger.exception("HF inference failed")
+            raise
+# ---------------- Stub Images Backend ----------------
 class StubImagesBackend(ImagesBackend):
     """
     Stub backend for images since HFChatBackend is text-only.