from quart import Quart, request from llama_cpp import Llama app = Quart(__name__) with open('system.prompt', 'r', encoding='utf-8') as f: prompt = f.read() @app.post("/request") async def echo(): try: data = await request.get_json() if data.get("max_tokens") != None and data.get("max_tokens") > 500: data['max_tokens'] = 500 userPrompt = prompt + "\n\nUser: " + data['request'] + "\nAssistant: " except: return {"error": "Not enough data"}, 400 return {"output": output} @app.get("/") async def get(): return '''