import os

import requests
from fastapi import FastAPI
from llama_cpp import Llama

# Use all available CPU cores for inference; fall back to 2 if undetectable.
threads = os.cpu_count() or 2

# Download the quantized Gemma weights from the Hugging Face Hub (cached
# locally after the first run) and load them.
llm = Llama.from_pretrained(
    repo_id="google/gemma-3-1b-it-GGUF",
    filename="*q4_k_m.gguf",  # glob matching the Q4_K_M quantized file
    n_ctx=2048,
    n_threads=threads,
    verbose=False,
)

main = FastAPI()

def web_search(query: str) -> str:
    """Fetch a short abstract from the DuckDuckGo Instant Answer API."""
    try:
        response = requests.get(
            "https://api.duckduckgo.com/",
            params={"q": query, "format": "json"},  # requests URL-encodes the query
            timeout=5,
        ).json()
        return response.get("AbstractText") or "No data."
    except (requests.RequestException, ValueError):
        return "Search failed."


@main.post("/v1/chat")
async def chat(data: dict):
    user_query = data.get("message", "")

    system_instr = (
        "You are Inachi AI, developed by the Inachi Team. "
        "You are an expert system architect."
    )

    # Only call the search helper when the user explicitly mentions "search".
    search_context = ""
    if "search" in user_query.lower():
        search_context = f"\nContext: {web_search(user_query)}"

    # Gemma's chat template defines no system role, so the system instructions
    # are folded into the user turn. The literal "<bos>" is omitted because
    # llama-cpp-python already prepends the BOS token during tokenization.
    prompt = (
        "<start_of_turn>user\n"
        f"{system_instr}{search_context}\n\n{user_query}<end_of_turn>\n"
        "<start_of_turn>model\n"
    )

    output = llm(
        prompt,
        max_tokens=512,
        stop=["<end_of_turn>"],
        echo=False,
    )

    reply = output["choices"][0]["text"].strip()
    return {"reply": reply}


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(main, host="0.0.0.0", port=7860)
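

# Example request once the server is running (port 7860, as configured above):
#
#   curl -X POST http://localhost:7860/v1/chat \
#        -H "Content-Type: application/json" \
#        -d '{"message": "search REST API design basics"}'
#
# The endpoint responds with JSON of the form {"reply": "..."}.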