"""Minimal FastAPI service exposing a local llama.cpp chat model over HTTP."""

import threading

import gradio as gr  # noqa: F401  # existing file-level import, intentionally kept
from fastapi import FastAPI, HTTPException, Request
from fastapi.concurrency import run_in_threadpool
from fastapi.responses import JSONResponse
from llama_cpp import Llama

SYSTEM_PROMPT = "You are OpenChat, a useful AI assistant."

app = FastAPI()
# Llama comes from llama_cpp; gradio does not provide it.
llm = Llama(model_path="model.gguf", n_ctx=4000, n_threads=2, chat_format="chatml")

# Generation runs in a worker thread (see chat_post). The lock keeps calls on
# the single shared model instance strictly one-at-a-time.
# NOTE(review): assumes one Llama instance must not generate concurrently —
# confirm against the llama-cpp-python documentation.
_llm_lock = threading.Lock()


def _build_messages(message, history):
    """Return the chat-format message list for *message* preceded by *history*.

    Args:
        message: The new user message.
        history: Iterable of ``[user_prompt, bot_response]`` pairs.

    Raises:
        HTTPException: 400 if a history entry is not a two-item list/tuple.
    """
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    for turn in history:
        if not isinstance(turn, (list, tuple)) or len(turn) != 2:
            raise HTTPException(
                status_code=400,
                detail="Each 'history' entry must be a [user, assistant] pair.",
            )
        user_prompt, bot_response = turn
        messages.append({"role": "user", "content": user_prompt})
        messages.append({"role": "assistant", "content": bot_response})
    messages.append({"role": "user", "content": message})
    return messages


def _generate_reply(messages, temperature, max_tokens):
    """Run one blocking chat completion and return the full reply text."""
    with _llm_lock:
        stream = llm.create_chat_completion(
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
            stream=True,
        )
        pieces = []
        for chunk in stream:
            # Some chunks carry an empty delta or only a role; skip those.
            content = chunk["choices"][0]["delta"].get("content")
            if content:
                pieces.append(content)
    return "".join(pieces)


@app.post("/api/v1/chat")
async def chat_post(request: Request):
    """Generate a chat reply for the JSON request body.

    Expected body keys:
        message: Required non-empty string with the user's new message.
        history: Optional list of ``[user_prompt, bot_response]`` pairs.
        temperature: Optional sampling temperature (default 0.3).
        max_tokens: Optional generation limit (default 512).

    Returns:
        JSONResponse with ``{"response": <full reply text>}``.

    Raises:
        HTTPException: 400 when the body is not a JSON object, ``message`` is
            missing or empty, or ``history`` is malformed.
    """
    try:
        data = await request.json()
    except ValueError as err:  # json.JSONDecodeError is a ValueError subclass
        raise HTTPException(
            status_code=400, detail="Request body must be valid JSON."
        ) from err
    if not isinstance(data, dict):
        raise HTTPException(
            status_code=400, detail="Request body must be a JSON object."
        )

    message = data.get("message")
    if not isinstance(message, str) or not message.strip():
        raise HTTPException(
            status_code=400, detail="'message' must be a non-empty string."
        )

    # `or []` also covers an explicit JSON null.
    history = data.get("history") or []
    if not isinstance(history, list):
        raise HTTPException(status_code=400, detail="'history' must be a list.")

    temperature = data.get("temperature", 0.3)
    max_tokens = data.get("max_tokens", 512)

    messages = _build_messages(message, history)
    # The completion call is blocking and CPU-bound; run it off the event loop
    # so other requests (e.g. the GET endpoint) stay responsive.
    reply = await run_in_threadpool(
        _generate_reply, messages, temperature, max_tokens
    )
    return JSONResponse(content={"response": reply})


@app.get("/api/v1/chat")
async def chat_get():
    """Return a short usage hint for clients that issue a GET request."""
    return {"message": "Send a POST request to this endpoint to chat."}


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)