from fastapi import FastAPI
from pydantic import BaseModel
from ctransformers import AutoModelForCausalLM

# Load the quantized TinyLlama chat model on CPU via ctransformers
llm = AutoModelForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v0.6",
    model_file="ggml-model-q4_0.gguf",
)

# Pydantic object: validates the request body
class Validation(BaseModel):
    prompt: str

# FastAPI app
app = FastAPI()

@app.post("/llm_on_cpu")
async def stream(item: Validation):
    # Wrap the user prompt in the model's chat template
    prefix = "<|user|>\n"
    suffix = "<|endoftext|><|assistant|>"
    prompt = f"{prefix}{item.prompt}{suffix}"
    return llm(prompt)
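
Once the server is running, the endpoint can be exercised with a short client script. The sketch below assumes the file above is saved as app.py (a hypothetical name) and served with uvicorn on the default port 8000; adjust both to match your setup:

# Hypothetical client for the /llm_on_cpu endpoint.
# Start the server first, e.g.: uvicorn app:app --host 0.0.0.0 --port 8000
# ("app" here is the assumed module name; change it if your file is named differently.)
import requests

response = requests.post(
    "http://localhost:8000/llm_on_cpu",
    json={"prompt": "Explain what quantization does to a language model."},
)
print(response.json())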