"""FastAPI service that serves a Zephyr-7B GGUF model on CPU.

On import, the module makes sure the model file is present locally
(downloading it if needed), loads it with ctransformers, and exposes a
single POST endpoint that returns a completion for a prompt.
"""

import os

import requests
from ctransformers import AutoModelForCausalLM
from fastapi import FastAPI
from pydantic import BaseModel

# Define the public URL for the model file
# NOTE(review): the URL points at the Q5_K_S quantization while the local
# file name says Q4_K_S. The values are left untouched because it is not
# clear which one is intended -- confirm and align them.
MODEL_URL = "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF/resolve/main/zephyr-7b-beta.Q5_K_S.gguf"
MODEL_PATH = "zephyr-7b-beta.Q4_K_S.gguf"

# (connect, read) timeouts in seconds; the read timeout applies between
# received chunks, not to the download as a whole.
_DOWNLOAD_TIMEOUT = (10, 60)


# Download the model file if not already present
def download_model(model_url, model_path):
    """Fetch ``model_url`` into ``model_path`` unless the file already exists.

    The payload is streamed into ``<model_path>.part`` and moved into place
    only after the whole body has been written, so an interrupted or failed
    download never leaves a truncated file that a later run would mistake
    for a complete model.

    Args:
        model_url: HTTP(S) URL of the model file.
        model_path: Local destination path.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
        OSError: If the file cannot be written or moved into place.
    """
    if os.path.exists(model_path):
        print("Model already exists locally.")
        return

    print(f"Downloading model from {model_url}...")
    partial_path = f"{model_path}.part"
    try:
        with requests.get(
            model_url, stream=True, timeout=_DOWNLOAD_TIMEOUT
        ) as response:
            # Fail loudly instead of saving an error page as the model.
            response.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        # Atomic rename: model_path only ever names a complete file.
        os.replace(partial_path, model_path)
    except BaseException:
        # Also covers Ctrl-C: drop the partial file, then re-raise.
        try:
            os.remove(partial_path)
        except FileNotFoundError:
            pass
        raise
    print("Model download complete.")


# Ensure the model file is downloaded
download_model(MODEL_URL, MODEL_PATH)

# Load the model
llm = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    model_type="mistral",
    max_new_tokens=1096,
    threads=3,
)


# Pydantic object for request validation
class Validation(BaseModel):
    """Request body for the completion endpoint."""

    # Raw user prompt; surrounding whitespace is stripped by the endpoint.
    prompt: str


# Initialize FastAPI app
app = FastAPI()


# Zephyr completion endpoint
@app.post("/llm_on_cpu")
async def stream(item: Validation):
    """Wrap the user's prompt in the chat template and return ``llm``'s output.

    Args:
        item: Validated request body carrying the user's prompt.

    Returns:
        Whatever ``llm(prompt)`` returns for the assembled prompt.
    """
    system_prompt = 'Below is an instruction that describes a task. Write a response that appropriately completes the request.'
    # NOTE(review): E_INST is empty, so no end-of-turn marker is emitted
    # between the sections -- confirm this is intentional.
    E_INST = ""
    user, assistant = "<|user|>", "<|assistant|>"
    prompt = f"{system_prompt}{E_INST}\n{user}\n{item.prompt.strip()}{E_INST}\n{assistant}\n"
    # NOTE(review): llm(prompt) is a plain synchronous call inside a
    # coroutine, so nothing is awaited while it runs -- confirm that is
    # acceptable for the expected request load.
    return llm(prompt)