# app.py — FastAPI service exposing a local Llama-2-7B-Chat (GGUF, Q4_K_M)
# model via llama-cpp-python. Originally hosted as a Hugging Face Space.
from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import psutil
import multiprocessing
# Module-level ASGI application; served externally (uvicorn via Dockerfile).
app = FastAPI(title="LLM Agent API", version="1.0.0")
class ChatRequest(BaseModel):
    """Body of a POST /chat call."""

    message: str            # user prompt forwarded to the model
    max_tokens: int = 100   # generation cap passed to llama-cpp
    temperature: float = 0.7  # sampling temperature
class ChatResponse(BaseModel):
    """Body returned by POST /chat."""

    response: str  # assistant reply text
class LocalLLMAgent:
    """Thin wrapper around a local Llama-2-7B-Chat GGUF model (llama-cpp-python).

    Downloads the quantized model on first run, then keeps a running
    conversation history in ``self.messages``.
    NOTE(review): the history grows without bound while n_ctx is fixed at
    2048 — a long session may eventually overflow the context window;
    confirm whether trimming is intended.
    """

    def _ensure_model(self):
        """Return the local GGUF path, downloading the file on first run."""
        path = "./llama-2-7b-chat.Q4_K_M.gguf"
        if os.path.exists(path):
            print("📁 Modelo já existe, carregando...", flush=True)
            return path
        print("📥 Baixando modelo Llama-2-7B-Chat (Q4_K_M)...", flush=True)
        print(" Isso pode levar alguns minutos...", flush=True)
        path = hf_hub_download(
            repo_id="TheBloke/Llama-2-7B-Chat-GGUF",
            filename="llama-2-7b-chat.Q4_K_M.gguf",
            local_dir="./",
        )
        print("✅ Modelo baixado com sucesso!", flush=True)
        return path

    def __init__(self):
        model_path = self._ensure_model()
        # Use every available core for both prompt and batch processing.
        n_threads = multiprocessing.cpu_count()
        print("🔧 Configurando llama-cpp-python:", flush=True)
        print(f" - CPUs disponíveis: {n_threads}", flush=True)
        print(f" - Threads: {n_threads}", flush=True)
        print(" - Contexto: 2048 tokens", flush=True)
        print("🚀 Inicializando modelo...", flush=True)
        self.llm = Llama(
            model_path=model_path,
            chat_format="llama-2",
            n_ctx=2048,
            n_threads=n_threads,
            n_threads_batch=n_threads,
            verbose=False,
        )
        print(f"✅ Modelo carregado! Usando {n_threads} threads", flush=True)
        # Seed the history with the system prompt (Brazilian Portuguese persona).
        self.messages = [
            {
                "role": "system",
                "content": "Responda sempre em português brasileiro de forma natural e conversacional.",
            }
        ]

    def chat(self, message: str, max_tokens: int = 100, temperature: float = 0.75) -> str:
        """Append *message* to the history, query the model, and return the reply.

        Both the user message and the assistant reply are recorded in
        ``self.messages`` so subsequent calls keep conversational context.
        """
        preview = message[:50] + ("..." if len(message) > 50 else "")
        print(f"💬 Nova mensagem: {preview}")
        print(f" Parâmetros: max_tokens={max_tokens}, temperature={temperature}")
        self.messages.append({"role": "user", "content": message})
        completion = self.llm.create_chat_completion(
            messages=self.messages,
            max_tokens=max_tokens,
            temperature=temperature,
        )
        reply = completion['choices'][0]['message']['content']
        self.messages.append({"role": "assistant", "content": reply})
        print(f"✅ Resposta gerada ({len(reply)} chars)")
        return reply
# Global agent instance; stays None until the startup hook finishes loading the model.
agent = None
@app.on_event("startup")
async def startup_event():
    """Load the model once at application startup and announce readiness.

    NOTE(review): ``on_event`` is the legacy FastAPI startup hook; behavior
    is preserved here as-is.
    """
    print("=== INICIANDO LLM AGENT API ===", flush=True)
    print(f"CPUs disponíveis: {multiprocessing.cpu_count()}", flush=True)
    total_gb = round(psutil.virtual_memory().total / (1024**3), 2)
    print(f"Memória total: {total_gb} GB", flush=True)
    global agent
    agent = LocalLLMAgent()
    print("✅ API pronta para uso!", flush=True)
    print("Endpoints disponíveis:", flush=True)
    for line in (" - POST /chat", " - GET /health", " - GET /system"):
        print(line, flush=True)
@app.post("/chat", response_model=ChatResponse)
def chat_endpoint(request: ChatRequest):
    """Generate a chat reply for *request*.

    Returns a placeholder message while the model is still loading.

    Declared as a plain ``def`` (not ``async def``) on purpose: llama-cpp
    inference is blocking and CPU-bound, and inside an ``async def`` it
    would freeze the event loop (including /health) for the whole
    generation. FastAPI runs sync endpoints in its threadpool instead.
    """
    if agent is None:
        return ChatResponse(response="Modelo ainda carregando, tente novamente.")
    response = agent.chat(request.message, request.max_tokens, request.temperature)
    return ChatResponse(response=response)
@app.get("/health")
async def health_check():
    """Liveness probe: reports healthy as soon as the app is serving."""
    return dict(status="healthy")
@app.get("/system")
def system_info():
    """Report CPU and memory usage of the host.

    Fix: the original issued two separate ``psutil.cpu_percent(interval=1)``
    calls (per-core, then total), blocking ~2 seconds inside an ``async def``
    and stalling the event loop. Now a single 1-second per-core sample is
    taken, the total is derived from it, and the endpoint is a plain ``def``
    so FastAPI offloads the blocking sample to its threadpool.
    """
    per_core = psutil.cpu_percent(interval=1, percpu=True)
    # Overall usage = mean of per-core readings from the same sample window.
    total = round(sum(per_core) / len(per_core), 1) if per_core else 0.0
    memory = psutil.virtual_memory()
    return {
        "cpu_cores": multiprocessing.cpu_count(),
        "cpu_usage_per_core": per_core,
        "cpu_usage_total": total,
        "memory_total_gb": round(memory.total / (1024**3), 2),
        "memory_used_gb": round(memory.used / (1024**3), 2),
        "memory_percent": memory.percent,
    }
# Removed: uvicorn is launched by the Dockerfile, not from this module.