Update app.py
Browse files
app.py
CHANGED
|
@@ -4,6 +4,7 @@ from pathlib import Path
|
|
| 4 |
from fastapi import FastAPI, HTTPException
|
| 5 |
from pydantic import BaseModel
|
| 6 |
from llama_cpp import Llama
|
|
|
|
| 7 |
|
| 8 |
# ================= CONFIG =================
|
| 9 |
MODEL_URL = "https://huggingface.co/Neon-AI/Kushina/resolve/main/model.gguf"
|
|
@@ -16,37 +17,49 @@ TEMPERATURE = 0.7
|
|
| 16 |
TOP_P = 0.9
|
| 17 |
# ==========================================
|
| 18 |
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
|
|
|
| 21 |
llm = None # lazy-loaded
|
| 22 |
|
| 23 |
# ---------- Download GGUF if not present ----------
|
| 24 |
if not Path(MODEL_PATH).exists():
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
|
|
|
| 28 |
with open(MODEL_PATH, "wb") as f:
|
| 29 |
for chunk in r.iter_content(chunk_size=8192):
|
| 30 |
f.write(chunk)
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
|
|
|
| 34 |
|
| 35 |
# ---------- Lazy load llama.cpp ----------
|
| 36 |
def get_llm():
|
| 37 |
global llm
|
| 38 |
if llm is None:
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
return llm
|
| 51 |
|
| 52 |
# ---------- Request schema ----------
|
|
@@ -55,7 +68,6 @@ class PromptRequest(BaseModel):
|
|
| 55 |
|
| 56 |
# ---------- System prompt ----------
|
| 57 |
SYSTEM_PROMPT = """You are Kushina.
|
| 58 |
-
|
| 59 |
Modes: CHAT or CODE
|
| 60 |
Rules:
|
| 61 |
- CHAT: mirror user tone, short responses, no explanations unless asked.
|
|
@@ -67,18 +79,18 @@ Otherwise use CHAT.
|
|
| 67 |
def build_prompt(user_text: str) -> str:
|
| 68 |
return f"<|system|>\n{SYSTEM_PROMPT}\n<|user|>\n{user_text}\n<|assistant|>\n"
|
| 69 |
|
| 70 |
-
# ---------- API
|
| 71 |
@app.get("/")
|
| 72 |
def root():
|
| 73 |
return {"status": "ok"}
|
| 74 |
-
|
| 75 |
@app.post("/generate")
|
| 76 |
def generate(req: PromptRequest):
|
| 77 |
-
llm_instance = get_llm() # lazy load
|
| 78 |
-
full_prompt = build_prompt(req.prompt)
|
| 79 |
-
output_text = ""
|
| 80 |
-
|
| 81 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
for chunk in llm_instance(
|
| 83 |
full_prompt,
|
| 84 |
max_tokens=MAX_TOKENS,
|
|
@@ -89,6 +101,10 @@ def generate(req: PromptRequest):
|
|
| 89 |
):
|
| 90 |
if "choices" in chunk:
|
| 91 |
output_text += chunk["choices"][0]["text"]
|
|
|
|
|
|
|
| 92 |
return {"response": output_text}
|
|
|
|
| 93 |
except Exception as e:
|
| 94 |
-
|
|
|
|
|
|
| 4 |
from fastapi import FastAPI, HTTPException
|
| 5 |
from pydantic import BaseModel
|
| 6 |
from llama_cpp import Llama
|
| 7 |
+
import logging
|
| 8 |
|
| 9 |
# ================= CONFIG =================
|
| 10 |
MODEL_URL = "https://huggingface.co/Neon-AI/Kushina/resolve/main/model.gguf"
|
|
|
|
| 17 |
TOP_P = 0.9
|
| 18 |
# ==========================================
|
| 19 |
|
| 20 |
+
# ---------- Logging setup ----------
|
| 21 |
+
logging.basicConfig(
|
| 22 |
+
level=logging.INFO,
|
| 23 |
+
format="%(asctime)s [%(levelname)s] %(message)s"
|
| 24 |
+
)
|
| 25 |
+
logger = logging.getLogger("KushinaAPI")
|
| 26 |
|
| 27 |
+
app = FastAPI(title="Kushina API", version="1.0")
|
| 28 |
llm = None # lazy-loaded
|
| 29 |
|
| 30 |
# ---------- Download GGUF if not present ----------
if not Path(MODEL_PATH).exists():
    try:
        logger.info("Downloading model.gguf from Hugging Face...")
        # stream=True keeps the multi-GB file out of memory; the timeout
        # (connect, read) prevents a hung connection from stalling startup
        # forever. The context manager guarantees the response is closed.
        with requests.get(MODEL_URL, stream=True, timeout=(10, 60)) as r:
            r.raise_for_status()
            with open(MODEL_PATH, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        logger.info("Download complete ✅")
    except Exception as e:
        # Delete any partially written file: otherwise the next startup
        # sees MODEL_PATH exists, skips the download, and tries to load
        # a truncated model.
        Path(MODEL_PATH).unlink(missing_ok=True)
        logger.exception("Failed to download model.gguf")
        raise RuntimeError(f"Failed to download model.gguf: {e}") from e
|
| 43 |
|
| 44 |
# ---------- Lazy load llama.cpp ----------
def get_llm():
    """Return the process-wide Llama instance, loading it on first use.

    Loading a GGUF model is slow and memory-heavy, so it is deferred to
    the first /generate request instead of happening at import time.

    Returns:
        The shared ``llama_cpp.Llama`` instance.

    Raises:
        RuntimeError: if llama.cpp fails to load the model file.
    """
    global llm
    if llm is None:
        try:
            logger.info("Loading GGUF model into llama.cpp…")
            llm = Llama(
                model_path=MODEL_PATH,
                n_ctx=N_CTX,
                n_threads=N_THREADS,
                n_batch=N_BATCH,
                f16_kv=True,    # half-precision KV cache to reduce memory
                use_mmap=True,  # mmap weights rather than copying into RAM
                verbose=False,
            )
            logger.info("Model loaded ✅")
        except Exception as e:
            logger.exception("Failed to load GGUF model")
            # Chain the cause so the original llama.cpp error survives
            # in the traceback (__cause__) instead of being discarded.
            raise RuntimeError(f"Failed to load GGUF model: {e}") from e
    return llm
|
| 64 |
|
| 65 |
# ---------- Request schema ----------
|
|
|
|
| 68 |
|
| 69 |
# ---------- System prompt ----------
|
| 70 |
SYSTEM_PROMPT = """You are Kushina.
|
|
|
|
| 71 |
Modes: CHAT or CODE
|
| 72 |
Rules:
|
| 73 |
- CHAT: mirror user tone, short responses, no explanations unless asked.
|
|
|
|
| 79 |
def build_prompt(user_text: str) -> str:
    """Wrap *user_text* in the chat template the model expects.

    Produces: system block, user block, then an open assistant block
    for the model to complete.
    """
    segments = (
        "<|system|>",
        SYSTEM_PROMPT,
        "<|user|>",
        user_text,
        "<|assistant|>",
        "",  # trailing element yields the final newline after the tag
    )
    return "\n".join(segments)
|
| 81 |
|
| 82 |
+
# ---------- API endpoints ----------
@app.get("/")
def root():
    """Health-check endpoint: reports that the service is running."""
    health_payload = {"status": "ok"}
    return health_payload
|
| 86 |
+
|
| 87 |
@app.post("/generate")
|
| 88 |
def generate(req: PromptRequest):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
try:
|
| 90 |
+
llm_instance = get_llm() # lazy load
|
| 91 |
+
full_prompt = build_prompt(req.prompt)
|
| 92 |
+
output_text = ""
|
| 93 |
+
|
| 94 |
for chunk in llm_instance(
|
| 95 |
full_prompt,
|
| 96 |
max_tokens=MAX_TOKENS,
|
|
|
|
| 101 |
):
|
| 102 |
if "choices" in chunk:
|
| 103 |
output_text += chunk["choices"][0]["text"]
|
| 104 |
+
|
| 105 |
+
logger.info(f"Prompt processed successfully: {req.prompt[:50]}...")
|
| 106 |
return {"response": output_text}
|
| 107 |
+
|
| 108 |
except Exception as e:
|
| 109 |
+
logger.exception("Error during generation")
|
| 110 |
+
raise HTTPException(status_code=500, detail=f"Generation error: {str(e)}")
|