from quart import Quart, request
from llama_cpp import Llama

app = Quart(__name__)

# System prompt prepended to every user request before inference.
with open('system.prompt', 'r', encoding='utf-8') as f:
    prompt = f.read()

# Server-side cap on the caller-supplied token budget.
MAX_TOKENS = 500

# NOTE(review): the original handler returned `output` without ever assigning
# it (NameError on every request) and never used the imported Llama class --
# the model load and inference call were evidently lost.  Reconstructed here;
# confirm the model path against the deployment.
llm = Llama(model_path="model.bin")


@app.post("/request")
async def echo():
    """Run the model on the JSON body's 'request' field.

    Expects a JSON object with a mandatory 'request' string and an
    optional 'max_tokens' int (clamped to MAX_TOKENS).  Returns
    {"output": <completion>} on success, or a 400 with an error
    payload when the body is missing or malformed.
    """
    try:
        data = await request.get_json()
        # Clamp the requested budget; fall back to the cap when absent.
        max_tokens = data.get("max_tokens")
        if max_tokens is None or max_tokens > MAX_TOKENS:
            max_tokens = MAX_TOKENS
        user_prompt = prompt + "\n\nUser: " + data['request'] + "\nAssistant: "
    except (TypeError, KeyError):
        # Body was not a JSON object, or 'request' was missing.
        return {"error": "Not enough data"}, 400
    # Stop at the next "User:" turn so the model answers only once.
    output = llm(user_prompt, max_tokens=max_tokens, stop=["User:"])
    return {"output": output}


@app.get("/")
async def get():
    """Serve the static landing page."""
    return '''

Hello, world!

This is showcase how to make own server with OpenBuddy's model.
I'm using here 3b model just for example. Also here's only CPU power.
But you can use GPU power as well!

How to GPU?

'''