from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import psutil
import multiprocessing

app = FastAPI(title="LLM Agent API", version="1.0.0")


class ChatRequest(BaseModel):
    message: str
    max_tokens: int = 100
    temperature: float = 0.7


class ChatResponse(BaseModel):
    response: str


class LocalLLMAgent:
    def __init__(self):
        # Download the model if it is not already present locally
        model_path = "./llama-2-7b-chat.Q4_K_M.gguf"
        if not os.path.exists(model_path):
            print("📥 Downloading Llama-2-7B-Chat model (Q4_K_M)...", flush=True)
            print("   This may take a few minutes...", flush=True)
            model_path = hf_hub_download(
                repo_id="TheBloke/Llama-2-7B-Chat-GGUF",
                filename="llama-2-7b-chat.Q4_K_M.gguf",
                local_dir="./"
            )
            print("✅ Model downloaded successfully!", flush=True)
        else:
            print("📁 Model already present, loading...", flush=True)

        # Configure llama-cpp-python to use every available CPU core
        n_threads = multiprocessing.cpu_count()
        print("🔧 Configuring llama-cpp-python:", flush=True)
        print(f"   - Available CPUs: {n_threads}", flush=True)
        print(f"   - Threads: {n_threads}", flush=True)
        print("   - Context: 2048 tokens", flush=True)

        print("🚀 Initializing model...", flush=True)
        self.llm = Llama(
            model_path=model_path,
            chat_format="llama-2",
            n_ctx=2048,
            n_threads=n_threads,
            n_threads_batch=n_threads,
            verbose=False
        )
        print(f"✅ Model loaded! Using {n_threads} threads", flush=True)

        # Conversation history, seeded with the system prompt
        self.messages = [
            {"role": "system", "content": "Always respond in Brazilian Portuguese, in a natural and conversational way."}
        ]

    def chat(self, message: str, max_tokens: int = 100, temperature: float = 0.75) -> str:
        print(f"💬 New message: {message[:50]}{'...' if len(message) > 50 else ''}")
        print(f"   Parameters: max_tokens={max_tokens}, temperature={temperature}")
        self.messages.append({"role": "user", "content": message})
        response = self.llm.create_chat_completion(
            messages=self.messages,
            max_tokens=max_tokens,
            temperature=temperature
        )
        assistant_message = response['choices'][0]['message']['content']
        self.messages.append({"role": "assistant", "content": assistant_message})
        print(f"✅ Response generated ({len(assistant_message)} chars)")
        return assistant_message
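
# Note: self.messages accumulates the full conversation history and is re-sent on every
# call, so long sessions can eventually exceed the 2048-token context window (n_ctx).
# A more robust version would likely truncate or summarize older turns first.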


# Initialize the agent globally
agent = None


@app.on_event("startup")
async def startup_event():
    print("=== STARTING LLM AGENT API ===", flush=True)
    print(f"Available CPUs: {multiprocessing.cpu_count()}", flush=True)
    print(f"Total memory: {round(psutil.virtual_memory().total / (1024**3), 2)} GB", flush=True)
    global agent
    agent = LocalLLMAgent()
    print("✅ API ready to use!", flush=True)
    print("Available endpoints:", flush=True)
    print("  - POST /chat", flush=True)
    print("  - GET /health", flush=True)
    print("  - GET /system", flush=True)


@app.post("/chat", response_model=ChatResponse)
async def chat_endpoint(request: ChatRequest):
    if agent is None:
        return ChatResponse(response="Model is still loading, please try again.")
    response = agent.chat(request.message, request.max_tokens, request.temperature)
    return ChatResponse(response=response)


@app.get("/health")
async def health_check():
    return {"status": "healthy"}


@app.get("/system")
async def system_info():
    cpu_count = multiprocessing.cpu_count()
    cpu_percent = psutil.cpu_percent(interval=1, percpu=True)
    memory = psutil.virtual_memory()
    return {
        "cpu_cores": cpu_count,
        "cpu_usage_per_core": cpu_percent,
        "cpu_usage_total": psutil.cpu_percent(interval=1),
        "memory_total_gb": round(memory.total / (1024**3), 2),
        "memory_used_gb": round(memory.used / (1024**3), 2),
        "memory_percent": memory.percent
    }

# Removed: uvicorn will be run by the Dockerfile
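
# Example request (a minimal sketch; the host and port are assumptions that depend on how
# the Dockerfile starts uvicorn, so adjust them to match your deployment):
#
#   curl -X POST http://localhost:8000/chat \
#        -H "Content-Type: application/json" \
#        -d '{"message": "Hello!", "max_tokens": 100, "temperature": 0.7}'
#
# The reply is a JSON object with a single "response" field, as defined by ChatResponse.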