from fastapi import FastAPI, HTTPException
from llama_cpp import Llama

app = FastAPI()

# Load the TinyLlama chat model once at startup so every request reuses it.
llm = Llama(model_path="./tinyllama-1.1b-chat.gguf")


@app.post("/llm")
async def stream(item: dict):
    if "prompt" not in item:
        # Return a proper 400 instead of letting a ValueError become a 500.
        raise HTTPException(status_code=400, detail="prompt is required")

    # Wrap the user prompt in the TinyLlama chat template.
    prompt = (
        "<|system|>You are a helpful assistant<|user|>"
        + item["prompt"]
        + "<|assistant|>"
    )

    # Optional generation parameters with defaults.
    temperatura = item.get("temperatura", 0.2)
    max_tokens = item.get("max_tokens", 512)

    return llm(prompt, max_tokens=max_tokens, temperature=temperatura)
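# Example client call (a minimal sketch, not part of the original source): it
# assumes the app is served with uvicorn on the default http://localhost:8000,
# e.g. `uvicorn main:app`. The "temperatura" and "max_tokens" fields are the
# optional parameters accepted by the /llm endpoint above; the prompt text is
# just a placeholder.
import requests

response = requests.post(
    "http://localhost:8000/llm",
    json={
        "prompt": "Explain what a GGUF file is.",
        "temperatura": 0.2,
        "max_tokens": 256,
    },
)
print(response.json())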