Update app.py
Browse files
app.py
CHANGED
|
@@ -4,6 +4,7 @@ from pathlib import Path
|
|
| 4 |
from fastapi import FastAPI, HTTPException
|
| 5 |
from pydantic import BaseModel
|
| 6 |
from llama_cpp import Llama
|
|
|
|
| 7 |
|
| 8 |
# ================= CONFIG =================
|
| 9 |
MODEL_URL = "https://huggingface.co/Neon-AI/Kushina/resolve/main/model.gguf"
|
|
@@ -16,37 +17,49 @@ TEMPERATURE = 0.7
|
|
| 16 |
TOP_P = 0.9
|
| 17 |
# ==========================================
|
| 18 |
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
|
|
|
| 21 |
llm = None # lazy-loaded
|
| 22 |
|
| 23 |
# ---------- Download GGUF if not present ----------
|
| 24 |
if not Path(MODEL_PATH).exists():
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
|
|
|
| 28 |
with open(MODEL_PATH, "wb") as f:
|
| 29 |
for chunk in r.iter_content(chunk_size=8192):
|
| 30 |
f.write(chunk)
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
|
|
|
| 34 |
|
| 35 |
# ---------- Lazy load llama.cpp ----------
|
| 36 |
def get_llm():
|
| 37 |
global llm
|
| 38 |
if llm is None:
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
return llm
|
| 51 |
|
| 52 |
# ---------- Request schema ----------
|
|
@@ -55,7 +68,6 @@ class PromptRequest(BaseModel):
|
|
| 55 |
|
| 56 |
# ---------- System prompt ----------
|
| 57 |
SYSTEM_PROMPT = """You are Kushina.
|
| 58 |
-
|
| 59 |
Modes: CHAT or CODE
|
| 60 |
Rules:
|
| 61 |
- CHAT: mirror user tone, short responses, no explanations unless asked.
|
|
@@ -67,18 +79,18 @@ Otherwise use CHAT.
|
|
| 67 |
def build_prompt(user_text: str) -> str:
|
| 68 |
return f"<|system|>\n{SYSTEM_PROMPT}\n<|user|>\n{user_text}\n<|assistant|>\n"
|
| 69 |
|
| 70 |
-
# ---------- API
|
| 71 |
@app.get("/")
|
| 72 |
def root():
|
| 73 |
return {"status": "ok"}
|
| 74 |
-
|
| 75 |
@app.post("/generate")
|
| 76 |
def generate(req: PromptRequest):
|
| 77 |
-
llm_instance = get_llm() # lazy load
|
| 78 |
-
full_prompt = build_prompt(req.prompt)
|
| 79 |
-
output_text = ""
|
| 80 |
-
|
| 81 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
for chunk in llm_instance(
|
| 83 |
full_prompt,
|
| 84 |
max_tokens=MAX_TOKENS,
|
|
@@ -89,6 +101,10 @@ def generate(req: PromptRequest):
|
|
| 89 |
):
|
| 90 |
if "choices" in chunk:
|
| 91 |
output_text += chunk["choices"][0]["text"]
|
|
|
|
|
|
|
| 92 |
return {"response": output_text}
|
|
|
|
| 93 |
except Exception as e:
|
| 94 |
-
|
|
|
|
|
|
| 4 |
from fastapi import FastAPI, HTTPException
|
| 5 |
from pydantic import BaseModel
|
| 6 |
from llama_cpp import Llama
|
| 7 |
+
import logging
|
| 8 |
|
| 9 |
# ================= CONFIG =================
|
| 10 |
MODEL_URL = "https://huggingface.co/Neon-AI/Kushina/resolve/main/model.gguf"
|
|
|
|
| 17 |
TOP_P = 0.9
|
| 18 |
# ==========================================
|
| 19 |
|
| 20 |
+
# ---------- Logging setup ----------
|
| 21 |
+
logging.basicConfig(
|
| 22 |
+
level=logging.INFO,
|
| 23 |
+
format="%(asctime)s [%(levelname)s] %(message)s"
|
| 24 |
+
)
|
| 25 |
+
logger = logging.getLogger("KushinaAPI")
|
| 26 |
|
| 27 |
+
app = FastAPI(title="Kushina API", version="1.0")
|
| 28 |
llm = None # lazy-loaded
|
| 29 |
|
| 30 |
# ---------- Download GGUF if not present ----------
if not Path(MODEL_PATH).exists():
    try:
        logger.info("Downloading model.gguf from Hugging Face...")
        # stream=True keeps the multi-GB file out of memory; the timeout
        # (connect, read) prevents a hung connection from stalling startup
        # forever. The context manager guarantees the response is closed.
        with requests.get(MODEL_URL, stream=True, timeout=(10, 60)) as r:
            r.raise_for_status()
            with open(MODEL_PATH, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        logger.info("Download complete ✅")
    except Exception as e:
        # Delete any partially written file: otherwise the next startup
        # sees MODEL_PATH exists, skips the download, and tries to load
        # a truncated model.
        Path(MODEL_PATH).unlink(missing_ok=True)
        logger.exception("Failed to download model.gguf")
        raise RuntimeError(f"Failed to download model.gguf: {e}") from e
|
| 43 |
|
| 44 |
# ---------- Lazy load llama.cpp ----------
def get_llm():
    """Return the process-wide Llama instance, loading it on first use.

    Loading a GGUF model is slow and memory-heavy, so it is deferred to
    the first /generate request instead of happening at import time.

    Returns:
        The shared ``llama_cpp.Llama`` instance.

    Raises:
        RuntimeError: if llama.cpp fails to load the model file.
    """
    global llm
    if llm is None:
        try:
            logger.info("Loading GGUF model into llama.cpp…")
            llm = Llama(
                model_path=MODEL_PATH,
                n_ctx=N_CTX,
                n_threads=N_THREADS,
                n_batch=N_BATCH,
                f16_kv=True,    # half-precision KV cache to reduce memory
                use_mmap=True,  # mmap weights rather than copying into RAM
                verbose=False,
            )
            logger.info("Model loaded ✅")
        except Exception as e:
            logger.exception("Failed to load GGUF model")
            # Chain the cause so the original llama.cpp error survives
            # in the traceback (__cause__) instead of being discarded.
            raise RuntimeError(f"Failed to load GGUF model: {e}") from e
    return llm
|
| 64 |
|
| 65 |
# ---------- Request schema ----------
|
|
|
|
| 68 |
|
| 69 |
# ---------- System prompt ----------
|
| 70 |
SYSTEM_PROMPT = """You are Kushina.
|
|
|
|
| 71 |
Modes: CHAT or CODE
|
| 72 |
Rules:
|
| 73 |
- CHAT: mirror user tone, short responses, no explanations unless asked.
|
|
|
|
| 79 |
def build_prompt(user_text: str) -> str:
    """Wrap *user_text* in the chat template the model expects.

    Produces: system block, user block, then an open assistant block
    for the model to complete.
    """
    segments = (
        "<|system|>",
        SYSTEM_PROMPT,
        "<|user|>",
        user_text,
        "<|assistant|>",
        "",  # trailing element yields the final newline after the tag
    )
    return "\n".join(segments)
|
| 81 |
|
| 82 |
+
# ---------- API endpoints ----------
@app.get("/")
def root():
    """Health-check endpoint: reports that the service is running."""
    health_payload = {"status": "ok"}
    return health_payload
|
| 86 |
+
|
| 87 |
@app.post("/generate")
|
| 88 |
def generate(req: PromptRequest):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
try:
|
| 90 |
+
llm_instance = get_llm() # lazy load
|
| 91 |
+
full_prompt = build_prompt(req.prompt)
|
| 92 |
+
output_text = ""
|
| 93 |
+
|
| 94 |
for chunk in llm_instance(
|
| 95 |
full_prompt,
|
| 96 |
max_tokens=MAX_TOKENS,
|
|
|
|
| 101 |
):
|
| 102 |
if "choices" in chunk:
|
| 103 |
output_text += chunk["choices"][0]["text"]
|
| 104 |
+
|
| 105 |
+
logger.info(f"Prompt processed successfully: {req.prompt[:50]}...")
|
| 106 |
return {"response": output_text}
|
| 107 |
+
|
| 108 |
except Exception as e:
|
| 109 |
+
logger.exception("Error during generation")
|
| 110 |
+
raise HTTPException(status_code=500, detail=f"Generation error: {str(e)}")
|