Spaces:
Running on Zero
Running on Zero
File size: 3,348 Bytes
from __future__ import annotations
import importlib.util
from dataclasses import dataclass
from importlib import import_module
from pathlib import Path
from typing import Any
from models.base import BackendStatus
from models.model_catalog import ModelInfo
from models.response_parsing import extract_chat_response
@dataclass(frozen=True)
class LlamaCppPythonConfig:
    """Immutable settings for loading a GGUF model and sampling chat replies."""

    # Filesystem path of the GGUF model; the empty default means "not configured".
    model_path: str = ""
    # Forwarded as ``n_ctx`` when the llama-cpp-python model object is built.
    n_ctx: int = 4096
    # Forwarded as ``n_gpu_layers`` when the llama-cpp-python model object is built.
    n_gpu_layers: int = 0
    # Sampling temperature forwarded to the chat-completion call.
    temperature: float = 0.7
    # ``max_tokens`` value forwarded to the chat-completion call.
    max_tokens: int = 512
class LlamaCppPythonService:
    """Direct llama-cpp-python GGUF inference service.

    The ``llama_cpp`` package is imported lazily, so this module can be
    imported when the package is not installed. The loaded model object is
    kept on the instance and reused by later ``chat`` calls instead of being
    rebuilt for every request.
    """

    def __init__(
        self,
        model: ModelInfo,
        config: LlamaCppPythonConfig | None = None,
    ) -> None:
        """Store the model entry and runtime configuration.

        Args:
            model: Catalog entry this service was created for.
            config: Runtime settings; a default ``LlamaCppPythonConfig`` is
                used when omitted.
        """
        self.model = model
        self.config = config or LlamaCppPythonConfig()
        # ``(load parameters, Llama instance)`` of the most recent load, or None.
        self._llama_cache = None

    @staticmethod
    def status(model_path: str = "") -> BackendStatus:
        """Report whether the backend can run with ``model_path``.

        Checks, in order: the ``llama_cpp`` package can be found, a model
        path is configured, and that path is an existing file.

        Args:
            model_path: Filesystem path of the GGUF model to check.

        Returns:
            A ``BackendStatus`` built from the backend name, an availability
            flag and a human-readable detail message.
        """
        if importlib.util.find_spec("llama_cpp") is None:
            return BackendStatus(
                "llama-cpp-python",
                False,
                "Python package llama-cpp-python is not installed in the current environment.",
            )
        if not model_path:
            return BackendStatus(
                "llama-cpp-python",
                False,
                "llama-cpp-python is installed, but no GGUF model path is configured.",
            )
        # is_file() rather than exists(): a directory cannot be loaded as a
        # model, so it must not be reported as ready.
        if not Path(model_path).is_file():
            return BackendStatus(
                "llama-cpp-python",
                False,
                f"Configured GGUF model was not found: {model_path}",
            )
        return BackendStatus("llama-cpp-python", True, "llama-cpp-python is ready.")

    def chat(self, system_prompt: str, user_prompt: str) -> str:
        """Run one system + user chat turn against the configured model.

        Args:
            system_prompt: Content of the ``system`` message.
            user_prompt: Content of the ``user`` message.

        Returns:
            The reply text extracted from the completion, or a diagnostic
            message when ``status`` reports the backend as unavailable.
        """
        status = self.status(self.config.model_path)
        if not status.available:
            return (
                "[llama-cpp-python unavailable]\n\n"
                f"{status.detail}\n\n"
                "Install llama-cpp-python and configure a local GGUF path before retrying."
            )
        llama = self._load_llama()
        response = llama.create_chat_completion(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=self.config.temperature,
            max_tokens=self.config.max_tokens,
        )
        return self._extract_response(response)

    def vision_chat(self, has_image: bool, prompt: str, image: Any = None) -> str:
        """Answer a prompt that may come with an image.

        Args:
            has_image: Whether the request carries an image.
            prompt: User prompt text.
            image: Accepted for interface compatibility; never read.

        Returns:
            A fixed explanatory note when ``has_image`` is true, otherwise
            the result of ``chat`` with an empty system prompt.
        """
        del image  # Intentionally unused: images are not handled on this path.
        if has_image:
            return (
                "[llama-cpp-python vision note]\n\n"
                "Direct multimodal llama-cpp-python support requires model-specific mmproj "
                "wiring and image serialization. Use llama-server for the current vision path."
            )
        return self.chat("", prompt)

    def _load_llama(self) -> Any:
        """Return the ``llama_cpp.Llama`` instance for the current config.

        The instance is built on first use and reused while the load
        parameters (model path, context size, GPU layer count) stay the
        same; a change in any of them triggers a fresh load.
        """
        cfg = self.config
        load_key = (cfg.model_path, cfg.n_ctx, cfg.n_gpu_layers)
        # getattr keeps this working for instances created without __init__.
        cached = getattr(self, "_llama_cache", None)
        if cached is not None and cached[0] == load_key:
            return cached[1]
        # Imported here so the module stays importable without llama_cpp.
        llama_class = import_module("llama_cpp").Llama
        llama = llama_class(
            model_path=cfg.model_path,
            n_ctx=cfg.n_ctx,
            n_gpu_layers=cfg.n_gpu_layers,
            verbose=False,
        )
        self._llama_cache = (load_key, llama)
        return llama

    @staticmethod
    def _extract_response(data: dict[str, Any]) -> str:
        """Delegate reply extraction to ``extract_chat_response``."""
        return extract_chat_response(data)
|