```python
from quart import Quart, request
from llama_cpp import Llama

app = Quart(__name__)
llm = Llama(model_path="./model.bin")

# Load the system prompt that is prepended to every user request
with open('system.prompt', 'r', encoding='utf-8') as f:
    prompt = f.read()

@app.post("/request")
async def echo():
    try:
        data = await request.get_json()
        max_tokens = data.get("max_tokens", 64)
        user_prompt = prompt + "\n\nUser: " + data['request'] + "\nAssistant: "
    except Exception:
        # Body was missing, not valid JSON, or lacked the required "request" field
        return {"error": "Not enough data"}, 400
    try:
        output = llm(user_prompt, max_tokens=max_tokens, stop=["User:", "\n"], echo=False)
        return {"output": output["choices"][0]["text"]}
    except Exception as e:
        print(e)
        return {"error": "Server error"}, 500

@app.get("/")
async def get():
    # The index page HTML is truncated in this excerpt
    return '''...'''
```
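Once the server is up, you can hit the endpoint with a plain HTTP POST. A minimal sketch, assuming the API is reachable on `localhost:8000` (adjust the host and port to however you run the container):

```sh
curl -X POST http://localhost:8000/request \
  -H "Content-Type: application/json" \
  -d '{"request": "Hello, who are you?", "max_tokens": 64}'
```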
You can change `CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"` in the Dockerfile to `CMAKE_ARGS="-DLLAMA_CUBLAS=on"` to build with CUDA support instead of OpenBLAS. You can also try `-DLLAMA_CLBLAST=on` or `-DLLAMA_METAL=on`.
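For reference, the override ends up on the pip install line. A sketch, assuming the Dockerfile installs `llama-cpp-python` via pip (your exact install line may differ):

```sh
# Build llama-cpp-python with cuBLAS instead of OpenBLAS
CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python
```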
There's a `run-docker.sh` script for ya. To stop the container, run `docker ps`, find the name of your container, and run `docker stop _dockerContainerName_`.
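If you'd rather skip looking up the name, here is a hypothetical one-liner, assuming the image is tagged `llama-api` (replace with your actual image tag):

```sh
# Stop every running container started from the llama-api image
docker stop $(docker ps -q --filter ancestor=llama-api)
```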