from fastapi import FastAPI
from typing import List
from vllm import LLM, SamplingParams
import os
from dotenv import load_dotenv

# Load the Hugging Face token from a local .env file and expose it as
# HF_TOKEN so huggingface_hub can authenticate when pulling gated models.
load_dotenv()
token: str = os.environ.get("HUGGINGFACE_TOKEN", "")
if token:
    os.environ["HF_TOKEN"] = token

app = FastAPI()


@app.get("/llm_inference")
def llm_inference(
    prompt: str,
    model: str = "mistralai/Mistral-7B-v0.1",
    temperature: float = 0.0,
    max_tokens: int = 1024,
) -> List[str]:
    sampling_params = SamplingParams(temperature=temperature, max_tokens=max_tokens)
    # Note: constructing the LLM here reloads the model weights on every
    # request; cache the instance at module level for production use.
    llm = LLM(model=model)
    outputs = llm.generate([prompt], sampling_params)
    # vLLM's RequestOutput objects are not JSON-serializable, so return
    # only the generated text of each completion.
    return [completion.text for output in outputs for completion in output.outputs]
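
# Example usage, a minimal sketch assuming this file is saved as main.py
# (a hypothetical filename) and vLLM is installed on a GPU machine:
#
#   uvicorn main:app --host 0.0.0.0 --port 8000
#   curl "http://localhost:8000/llm_inference?prompt=What%20is%20vLLM%3F"
#
# The response is a JSON list containing the generated completion text.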