Spaces:

build-small-hackathon
/

workbench

Running on Zero

File size: 3,348 Bytes

7f9dfed

from __future__ import annotations

import importlib.util
import threading
from dataclasses import dataclass
from importlib import import_module
from pathlib import Path
from typing import Any

from models.base import BackendStatus
from models.model_catalog import ModelInfo
from models.response_parsing import extract_chat_response


@dataclass(frozen=True)
class LlamaCppPythonConfig:
    """Immutable settings bundle consumed by ``LlamaCppPythonService``.

    The first three fields are handed to the ``llama_cpp.Llama`` constructor
    when the model is loaded; the last two are passed on each chat-completion
    request. All fields have defaults, so ``LlamaCppPythonConfig()`` is valid,
    but the empty default ``model_path`` makes the service report itself as
    unavailable until a real path is supplied.
    """

    # Filesystem path of the GGUF model; empty means "not configured".
    model_path: str = ""
    # Forwarded as ``n_ctx`` when constructing the Llama instance.
    n_ctx: int = 4096
    # Forwarded as ``n_gpu_layers`` when constructing the Llama instance.
    n_gpu_layers: int = 0
    # Forwarded as ``temperature`` on every chat-completion request.
    temperature: float = 0.7
    # Forwarded as ``max_tokens`` on every chat-completion request.
    max_tokens: int = 512


class LlamaCppPythonService:
    """Direct llama-cpp-python GGUF inference service.

    The ``llama_cpp.Llama`` instance is created lazily on the first ``chat``
    call and cached on the service, so repeated calls do not reload the GGUF
    file. The cache is rebuilt when the load-relevant config fields
    (``model_path``, ``n_ctx``, ``n_gpu_layers``) change, and can be released
    explicitly with ``unload``.
    """

    def __init__(
        self,
        model: ModelInfo,
        config: LlamaCppPythonConfig | None = None,
    ) -> None:
        """Store the model descriptor and runtime configuration.

        Args:
            model: Catalog entry this service is associated with.
            config: Runtime settings; a default ``LlamaCppPythonConfig`` is
                used when omitted.
        """
        self.model = model
        self.config = config or LlamaCppPythonConfig()
        # Lazily populated cache of the loaded model and the settings it was
        # loaded with (see ``_get_llama``).
        self._llama: Any = None
        self._llama_key: tuple[str, int, int] | None = None
        # Guards the cached instance: it is shared across calls, so loading
        # and inference on it are serialized.
        self._llama_lock = threading.Lock()

    @staticmethod
    def status(model_path: str = "") -> BackendStatus:
        """Report whether the backend can run with ``model_path``.

        Checks, in order: the ``llama_cpp`` package is importable, a model
        path is configured, and that path points at an existing file.

        Args:
            model_path: Filesystem path of the GGUF model to check.

        Returns:
            A ``BackendStatus`` whose flag is ``True`` only when every check
            passes; otherwise the detail text names the first failing check.
        """
        if importlib.util.find_spec("llama_cpp") is None:
            return BackendStatus(
                "llama-cpp-python",
                False,
                "Python package llama-cpp-python is not installed in the current environment.",
            )
        if not model_path:
            return BackendStatus(
                "llama-cpp-python",
                False,
                "llama-cpp-python is installed, but no GGUF model path is configured.",
            )
        # ``is_file`` rather than ``exists``: a directory at this path cannot
        # be loaded as a model, so it must not be reported as ready.
        if not Path(model_path).is_file():
            return BackendStatus(
                "llama-cpp-python",
                False,
                f"Configured GGUF model was not found: {model_path}",
            )
        return BackendStatus("llama-cpp-python", True, "llama-cpp-python is ready.")

    def chat(self, system_prompt: str, user_prompt: str) -> str:
        """Run one system+user chat completion and return the reply text.

        Args:
            system_prompt: Content of the system message.
            user_prompt: Content of the user message.

        Returns:
            The text extracted from the completion, or a bracketed
            "[llama-cpp-python unavailable]" notice when ``status`` reports
            the backend is not usable. Errors raised while loading the model
            or running inference propagate to the caller.
        """
        status = self.status(self.config.model_path)
        if not status.available:
            return (
                "[llama-cpp-python unavailable]\n\n"
                f"{status.detail}\n\n"
                "Install llama-cpp-python and configure a local GGUF path before retrying."
            )

        # Hold the lock for load + inference so concurrent callers never
        # drive the same cached Llama instance at once.
        with self._llama_lock:
            llama = self._get_llama()
            response = llama.create_chat_completion(
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
                temperature=self.config.temperature,
                max_tokens=self.config.max_tokens,
            )
        return self._extract_response(response)

    def vision_chat(self, has_image: bool, prompt: str, image: Any = None) -> str:
        """Answer a prompt, declining image input on this backend.

        Args:
            has_image: Whether the request carries an image.
            prompt: User prompt text.
            image: Accepted for interface compatibility; never used.

        Returns:
            A fixed explanatory note when ``has_image`` is true; otherwise the
            result of ``chat`` with an empty system prompt.
        """
        del image
        if has_image:
            return (
                "[llama-cpp-python vision note]\n\n"
                "Direct multimodal llama-cpp-python support requires model-specific mmproj "
                "wiring and image serialization. Use llama-server for the current vision path."
            )
        return self.chat("", prompt)

    def unload(self) -> None:
        """Drop the cached model so its memory can be reclaimed."""
        with self._llama_lock:
            self._llama = None
            self._llama_key = None

    def _get_llama(self) -> Any:
        """Return the cached Llama instance, (re)loading it when needed.

        Callers must hold ``self._llama_lock``.
        """
        cfg = self.config
        key = (cfg.model_path, cfg.n_ctx, cfg.n_gpu_layers)
        if self._llama is None or self._llama_key != key:
            # Release any stale model before loading its replacement so both
            # are not referenced at the same time.
            self._llama = None
            self._llama = self._load_llama()
            self._llama_key = key
        return self._llama

    def _load_llama(self) -> Any:
        """Construct a fresh ``llama_cpp.Llama`` from the current config.

        The module is imported here rather than at file level so this file
        can be imported when llama-cpp-python is not installed.
        """
        llama_module = import_module("llama_cpp")
        llama_class = llama_module.Llama

        return llama_class(
            model_path=self.config.model_path,
            n_ctx=self.config.n_ctx,
            n_gpu_layers=self.config.n_gpu_layers,
            verbose=False,
        )

    @staticmethod
    def _extract_response(data: dict[str, Any]) -> str:
        """Delegate reply-text extraction to ``extract_chat_response``."""
        return extract_chat_response(data)