Neon-AI committed on
Commit
d0c5b7c
·
verified ·
1 Parent(s): cb317d4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -26
app.py CHANGED
@@ -4,6 +4,7 @@ from pathlib import Path
4
  from fastapi import FastAPI, HTTPException
5
  from pydantic import BaseModel
6
  from llama_cpp import Llama
 
7
 
8
  # ================= CONFIG =================
9
  MODEL_URL = "https://huggingface.co/Neon-AI/Kushina/resolve/main/model.gguf"
@@ -16,37 +17,49 @@ TEMPERATURE = 0.7
16
  TOP_P = 0.9
17
  # ==========================================
18
 
19
- app = FastAPI(title="Kushina API", version="1.0")
 
 
 
 
 
20
 
 
21
  llm = None # lazy-loaded
22
 
23
  # ---------- Download GGUF if not present ----------
24
  if not Path(MODEL_PATH).exists():
25
- print("Downloading model.gguf from Hugging Face...")
26
- r = requests.get(MODEL_URL, stream=True)
27
- if r.status_code == 200:
 
28
  with open(MODEL_PATH, "wb") as f:
29
  for chunk in r.iter_content(chunk_size=8192):
30
  f.write(chunk)
31
- print("Download complete ✅")
32
- else:
33
- raise RuntimeError(f"Failed to download model.gguf: {r.status_code}")
 
34
 
35
  # ---------- Lazy load llama.cpp ----------
36
  def get_llm():
37
  global llm
38
  if llm is None:
39
- print("Loading GGUF model into llama.cpp…")
40
- llm = Llama(
41
- model_path=MODEL_PATH,
42
- n_ctx=N_CTX,
43
- n_threads=N_THREADS,
44
- n_batch=N_BATCH,
45
- f16_kv=True,
46
- use_mmap=True,
47
- verbose=False,
48
- )
49
- print("Model loaded ✅")
 
 
 
 
50
  return llm
51
 
52
  # ---------- Request schema ----------
@@ -55,7 +68,6 @@ class PromptRequest(BaseModel):
55
 
56
  # ---------- System prompt ----------
57
  SYSTEM_PROMPT = """You are Kushina.
58
-
59
  Modes: CHAT or CODE
60
  Rules:
61
  - CHAT: mirror user tone, short responses, no explanations unless asked.
@@ -67,18 +79,18 @@ Otherwise use CHAT.
67
  def build_prompt(user_text: str) -> str:
68
  return f"<|system|>\n{SYSTEM_PROMPT}\n<|user|>\n{user_text}\n<|assistant|>\n"
69
 
70
- # ---------- API endpoint ----------
71
  @app.get("/")
72
  def root():
73
  return {"status": "ok"}
74
-
75
  @app.post("/generate")
76
  def generate(req: PromptRequest):
77
- llm_instance = get_llm() # lazy load
78
- full_prompt = build_prompt(req.prompt)
79
- output_text = ""
80
-
81
  try:
 
 
 
 
82
  for chunk in llm_instance(
83
  full_prompt,
84
  max_tokens=MAX_TOKENS,
@@ -89,6 +101,10 @@ def generate(req: PromptRequest):
89
  ):
90
  if "choices" in chunk:
91
  output_text += chunk["choices"][0]["text"]
 
 
92
  return {"response": output_text}
 
93
  except Exception as e:
94
- raise HTTPException(status_code=500, detail=str(e))
 
 
4
  from fastapi import FastAPI, HTTPException
5
  from pydantic import BaseModel
6
  from llama_cpp import Llama
7
+ import logging
8
 
9
  # ================= CONFIG =================
10
  MODEL_URL = "https://huggingface.co/Neon-AI/Kushina/resolve/main/model.gguf"
 
17
  TOP_P = 0.9
18
  # ==========================================
19
 
20
# ---------- Logging setup ----------
# One basicConfig call at import time; all module logging goes through
# the named "KushinaAPI" logger below.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("KushinaAPI")
26
 
27
# ---------- App & model handle ----------
app = FastAPI(title="Kushina API", version="1.0")
llm = None  # Llama instance; created on first request by get_llm()
29
 
30
  # ---------- Download GGUF if not present ----------
31
# ---------- Download GGUF if not present ----------
if not Path(MODEL_PATH).exists():
    try:
        logger.info("Downloading model.gguf from Hugging Face...")
        r = requests.get(MODEL_URL, stream=True)
        r.raise_for_status()  # turn HTTP errors into exceptions
        with open(MODEL_PATH, "wb") as f:
            # Stream in 8 KiB chunks so the whole model never sits in RAM.
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
        logger.info("Download complete ✅")
    except Exception as e:
        # A failed/interrupted download may leave a truncated file on disk;
        # if we kept it, the next startup would skip the download and try to
        # load a corrupt model. Remove it so a restart retries cleanly.
        Path(MODEL_PATH).unlink(missing_ok=True)
        logger.exception("Failed to download model.gguf")
        raise RuntimeError(f"Failed to download model.gguf: {e}") from e
43
 
44
  # ---------- Lazy load llama.cpp ----------
45
# ---------- Lazy load llama.cpp ----------
def get_llm():
    """Return the shared Llama instance, loading the GGUF model on first use.

    The loaded model is cached in the module-level ``llm`` global so the
    (expensive) load happens at most once per process.

    Raises:
        RuntimeError: if llama.cpp fails to load the model file. The original
            exception is attached as ``__cause__`` for debugging.
    """
    global llm
    if llm is None:
        try:
            logger.info("Loading GGUF model into llama.cpp…")
            llm = Llama(
                model_path=MODEL_PATH,
                n_ctx=N_CTX,
                n_threads=N_THREADS,
                n_batch=N_BATCH,
                f16_kv=True,
                use_mmap=True,  # map the file instead of copying it into RAM
                verbose=False,
            )
            logger.info("Model loaded ✅")
        except Exception as e:
            logger.exception("Failed to load GGUF model")
            # Chain the cause so the real llama.cpp error survives the re-raise.
            raise RuntimeError(f"Failed to load GGUF model: {e}") from e
    return llm
64
 
65
  # ---------- Request schema ----------
 
68
 
69
  # ---------- System prompt ----------
70
  SYSTEM_PROMPT = """You are Kushina.
 
71
  Modes: CHAT or CODE
72
  Rules:
73
  - CHAT: mirror user tone, short responses, no explanations unless asked.
 
79
def build_prompt(user_text: str) -> str:
    """Wrap *user_text* in the chat template the model was trained on."""
    segments = (
        "<|system|>\n",
        SYSTEM_PROMPT,
        "\n<|user|>\n",
        user_text,
        "\n<|assistant|>\n",
    )
    return "".join(segments)
81
 
82
+ # ---------- API endpoints ----------
83
@app.get("/")
def root():
    """Health-check endpoint: reports that the service is up."""
    payload = {"status": "ok"}
    return payload
86
+
87
  @app.post("/generate")
88
  def generate(req: PromptRequest):
 
 
 
 
89
  try:
90
+ llm_instance = get_llm() # lazy load
91
+ full_prompt = build_prompt(req.prompt)
92
+ output_text = ""
93
+
94
  for chunk in llm_instance(
95
  full_prompt,
96
  max_tokens=MAX_TOKENS,
 
101
  ):
102
  if "choices" in chunk:
103
  output_text += chunk["choices"][0]["text"]
104
+
105
+ logger.info(f"Prompt processed successfully: {req.prompt[:50]}...")
106
  return {"response": output_text}
107
+
108
  except Exception as e:
109
+ logger.exception("Error during generation")
110
+ raise HTTPException(status_code=500, detail=f"Generation error: {str(e)}")