"""Model registry and chat-request schema for a GGUF-backed chat server.

Importing this module eagerly downloads (or verifies the local cache of)
every supported model from the Hugging Face Hub, so the first chat request
does not stall on a multi-gigabyte download.
"""

import logging
from typing import (
    List,
    Literal,
    Optional,
)

from huggingface_hub import hf_hub_download
from pydantic import BaseModel

# Hub coordinates for every model this server can serve. The keys double as
# the public model identifiers accepted by ChatRequest.model below — keep the
# two in sync when adding a model.
MODEL_ARGS = {
    "llama3.2": dict(
        repo_id="hugging-quants/Llama-3.2-3B-Instruct-Q8_0-GGUF",
        filename="llama-3.2-3b-instruct-q8_0.gguf",
    ),
    "falcon-mamba": dict(
        repo_id="bartowski/falcon-mamba-7b-GGUF",
        filename="falcon-mamba-7b-Q4_K_M.gguf",
    ),
    "mistral-nemo": dict(
        repo_id="lmstudio-community/Mistral-Nemo-Instruct-2407-GGUF",
        filename="Mistral-Nemo-Instruct-2407-Q4_K_M.gguf",
    ),
}

# Attach to uvicorn's error logger so these messages share the server's
# configured handlers and formatting.
logger = logging.getLogger("uvicorn.error")

# NOTE: import-time network side effect — fetches each model into the local
# HF cache (a no-op when already cached). A failed download raises here and
# aborts server startup. Lazy %-style args avoid eager string formatting.
for model_arg in MODEL_ARGS.values():
    logger.info("Checking for %s", model_arg["repo_id"])
    hf_hub_download(**model_arg)


class Message(BaseModel):
    """A single chat turn."""

    # Role label, e.g. "system" / "user" / "assistant" — presumably passed
    # through to the model's chat template; not validated here (TODO confirm).
    role: str
    content: str


class ChatRequest(BaseModel):
    """Request body for a chat completion.

    Sampling parameters and their defaults mirror the llama.cpp / llama-cpp-
    python completion API; they are forwarded to the backend unchanged.
    """

    chat_history: List[Message]
    # Must be one of the MODEL_ARGS keys above.
    model: Literal["llama3.2", "falcon-mamba", "mistral-nemo"] = "llama3.2"
    max_tokens: Optional[int] = 65536
    temperature: float = 0.8
    top_p: float = 0.95
    min_p: float = 0.05
    typical_p: float = 1.0
    frequency_penalty: float = 0.0
    presence_penalty: float = 0.0
    repeat_penalty: float = 1.0
    top_k: int = 40
    # None lets the backend pick a seed (non-deterministic sampling).
    seed: Optional[int] = None
    tfs_z: float = 1.0
    mirostat_mode: int = 0
    mirostat_tau: float = 5.0
    mirostat_eta: float = 0.1
    # logprobs: Optional[int] = None
    # logit_bias: Optional[Dict[str, float]] = None