File size: 863 Bytes
b9f8edf
 
 
83dacf1
b9f8edf
 
 
 
83dacf1
b9f8edf
 
 
 
1227086
 
83dacf1
b9f8edf
 
 
 
 
 
 
 
 
83dacf1
b9f8edf
83dacf1
b9f8edf
 
 
 
83dacf1
 
1227086
83dacf1
 
 
b9f8edf
 
83dacf1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import logging
from typing import Union

from fastapi import FastAPI
from llama_cpp import Llama
from pydantic import BaseModel


class InferenceRequest(BaseModel):
    """Request body for POST /inference."""
    # Prompt text forwarded verbatim to the model. A missing/None value
    # results in the endpoint's empty {} fallback response.
    input: Union[str, None] = None
    # Requested completion length. NOTE(review): the declared default is 0,
    # but the endpoint substitutes 256 when coercion fails — confirm whether
    # 0 is intentional (llama_cpp treats it as "no limit"?) or a typo.
    max_tokens: Union[int, None] = 0


# FastAPI application instance; routes below attach via decorators.
app = FastAPI()

# Load the GGUF model once at import time so all requests share the same
# in-memory weights. n_ctx=8192 sets the context window; verbose=False
# silences llama.cpp's stderr chatter. NOTE(review): the relative path
# means the server must be started from the project root — confirm.
llm = Llama(model_path="./models/mistral-7b-openorca.Q4_K_S.gguf",
            verbose=False, n_ctx=8192)


@app.get("/")
async def root():
    """Liveness endpoint: always responds with a static greeting."""
    payload = {"message": "Hello World"}
    return payload


@app.post('/inference')
async def inference(request: InferenceRequest):
    """Run a single completion against the loaded model.

    Falls back to max_tokens=256 when the request value is missing or not
    coercible to int. On missing input or generation failure the endpoint
    stays best-effort and returns an empty dict, matching the original
    contract — but failures are now logged instead of silently swallowed.
    """
    input_text = request.input

    # Coerce max_tokens defensively: the model's declared default is 0,
    # which is almost never what a caller wants, so substitute 256.
    max_tokens = 256
    try:
        max_tokens = int(request.max_tokens)
    except (TypeError, ValueError):
        # None or a non-numeric value: keep the 256 fallback.
        pass

    if input_text is None:
        # Nothing to generate from; preserve the original empty response
        # rather than letting llm(None) raise.
        return {}

    try:
        result = llm(input_text, temperature=0.2,
                     top_k=5, max_tokens=max_tokens, stop=["<|im_end|>"])
        return result
    except Exception:
        # Best-effort endpoint: record the failure (previously a bare
        # `except: pass` hid it entirely) but still return {} so callers
        # keep working.
        logging.exception("inference failed for input of length %d",
                          len(input_text))
        return {}