"""Minimal FastAPI service exposing a local GGUF chat model (CPU inference)."""

from ctransformers import AutoModelForCausalLM
from fastapi import FastAPI
from pydantic import BaseModel

# --- Model loading -----------------------------------------------------------
# BUG FIX: the original requested model_file="Meta-Llama-3-8B-Instruct-Q4_K_M.gguf"
# from the repo "TheBloke/CodeLlama-7B-Python-GGUF", where that file does not
# exist, so startup would fail.  The prompt template below is the Llama-3 chat
# format, so a Llama-3 GGUF repo is the consistent choice.
# TODO(review): confirm the exact repo/quant-file pair before deploying.
llm = AutoModelForCausalLM.from_pretrained(
    "bartowski/Meta-Llama-3-8B-Instruct-GGUF",
    model_file="Meta-Llama-3-8B-Instruct-Q4_K_M.gguf",
    model_type="llama",
    # max_new_tokens=1096,  # present but disabled in the original; kept as-is
    threads=3,  # CPU threads used for generation
)

# Instruction preamble injected into every request (unchanged from original).
SYSTEM_PROMPT = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request."
)


# NOTE(review): lowercase class name kept for backward compatibility with any
# code importing `validation` from this module; PascalCase would be conventional.
class validation(BaseModel):
    """Request body for /llm_on_cpu."""

    prompt: str  # user instruction to complete


app = FastAPI()


# Llama-3-Instruct completion endpoint (the original comment said "Zephyr",
# but the template below is the Llama-3 chat format).
@app.post("/llm_on_cpu")
def stream(item: validation):
    """Generate a completion for ``item.prompt`` and return the raw model text.

    Deliberately a sync ``def`` (not ``async def``): ``llm(...)`` is a
    blocking, CPU-bound call, and FastAPI executes sync endpoints in its
    worker thread pool instead of stalling the event loop.
    """
    # Llama-3 chat template: each <|end_header_id|> is followed by a blank
    # line; no source-indentation whitespace leaks into the prompt.
    prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{SYSTEM_PROMPT}<|eot_id|><|start_header_id|>user<|end_header_id|>

{item.prompt.strip()}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""
    return llm(prompt)