vllm / main.py
Wolf369's picture
Update endpoint to make it more generic
8b64a94
raw
history blame
No virus
490 Bytes
from functools import lru_cache
from typing import List

from fastapi import FastAPI
from vllm import LLM, SamplingParams
# FastAPI application instance; the route decorator below registers its handler on it.
app = FastAPI()
@lru_cache(maxsize=1)
def _get_llm(model: str) -> LLM:
    """Return a vLLM engine for *model*, reusing the most recently built one."""
    # maxsize=1: consecutive requests for the same model share one engine
    # instead of constructing a new one per request, while a request for a
    # different model evicts the previous engine so only one is kept here.
    return LLM(model=model)


@app.get("/llm_inference")
def read_root(
    prompts: List[str],
    model: str = "meta-llama/Llama-2-7b-hf",
    temperature: float = 0.0,
    max_tokens: int = 1024,
) -> List:
    """Generate completions for a batch of prompts with a vLLM model.

    Args:
        prompts: Prompt strings, all passed to ``LLM.generate`` in one call.
        model: Model identifier handed to ``vllm.LLM``.
        temperature: Sampling temperature forwarded to ``SamplingParams``.
        max_tokens: Token limit forwarded to ``SamplingParams``.

    Returns:
        Whatever ``LLM.generate`` returns for ``prompts``, passed back as-is.
    """
    # NOTE(review): FastAPI treats a bare ``List[str]`` parameter as a request
    # body, which is unusual for a GET route; confirm clients really send a
    # body, or consider ``Query(...)`` / POST in a coordinated API change.

    # Build the sampling parameters first so invalid values fail before any
    # model is loaded.
    sampling_params = SamplingParams(temperature=temperature, max_tokens=max_tokens)

    # Reuse the cached engine rather than constructing a new LLM per request.
    llm = _get_llm(model)
    response = llm.generate(prompts, sampling_params)

    # NOTE(review): the vLLM output objects are returned unchanged; verify
    # that FastAPI can serialise them for the deployed vLLM/pydantic versions.
    return response