# Spaces: Running  (HF Spaces status banner captured with the source; kept as a comment)
from pydantic import BaseModel
from huggingface_hub import hf_hub_download
import logging
from typing import (
    List,
    Optional,
    Literal,
)
# Registry of the GGUF checkpoints this service can run.
# Each entry is the exact keyword-argument set for hf_hub_download:
# the HF Hub repository and the quantized file to fetch from it.
MODEL_ARGS = {
    "llama3.2": {
        "repo_id": "hugging-quants/Llama-3.2-3B-Instruct-Q8_0-GGUF",
        "filename": "llama-3.2-3b-instruct-q8_0.gguf",
    },
    "falcon-mamba": {
        "repo_id": "bartowski/falcon-mamba-7b-GGUF",
        "filename": "falcon-mamba-7b-Q4_K_M.gguf",
    },
    "mistral-nemo": {
        "repo_id": "lmstudio-community/Mistral-Nemo-Instruct-2407-GGUF",
        "filename": "Mistral-Nemo-Instruct-2407-Q4_K_M.gguf",
    },
}
# "uvicorn.error" is uvicorn's pre-configured logger, so these messages
# appear in the server log with the standard uvicorn formatting.
logger = logging.getLogger("uvicorn.error")

# Eagerly fetch every registered model file at import time so the first
# chat request does not stall on a multi-gigabyte download.
# hf_hub_download returns the cached path without re-downloading when the
# file is already present in the local HF cache.
for model_arg in MODEL_ARGS.values():
    # Lazy %-style args: formatting happens only if INFO is enabled.
    logger.info("Checking for %s", model_arg["repo_id"])
    hf_hub_download(**model_arg)
class Message(BaseModel):
    """A single chat turn in the conversation history.

    Presumably follows the OpenAI-style chat shape (role + content)
    expected by the downstream completion backend -- TODO confirm
    against the handler that consumes ChatRequest.
    """

    # Speaker of the turn; typically "system", "user", or "assistant",
    # but no validation is applied here -- any string is accepted.
    role: str
    # Plain-text content of the turn.
    content: str
class ChatRequest(BaseModel):
    """Request body for a chat completion.

    Bundles the conversation history, the model selection, and a set of
    sampling knobs. NOTE(review): the sampling fields look like
    llama.cpp / llama-cpp-python completion options (top_p, min_p,
    tfs_z, mirostat_*); their semantics are assumed from the names --
    confirm against the endpoint that forwards them.
    """

    # Full conversation so far; presumably oldest message first
    # (ordering is not enforced here).
    chat_history: List[Message]
    # Which pre-downloaded model to run; values match MODEL_ARGS keys.
    model: Literal["llama3.2", "falcon-mamba", "mistral-nemo"] = "llama3.2"
    # Generation cap in tokens; None presumably defers to the backend's
    # default -- verify with the consumer.
    max_tokens: Optional[int] = 65536
    temperature: float = 0.8
    top_p: float = 0.95
    min_p: float = 0.05
    typical_p: float = 1.0
    frequency_penalty: float = 0.0
    presence_penalty: float = 0.0
    repeat_penalty: float = 1.0
    top_k: int = 40
    # None presumably lets the backend choose a random seed -- verify.
    seed: Optional[int] = None
    tfs_z: float = 1.0
    # Mirostat adaptive sampling; 0 looks like "disabled", with tau/eta
    # only relevant for modes 1/2 -- assumed, confirm with the backend.
    mirostat_mode: int = 0
    mirostat_tau: float = 5.0
    mirostat_eta: float = 0.1
    # Not currently exposed to clients:
    # logprobs: Optional[int] = None
    # logit_bias: Optional[Dict[str, float]] = None