import os
import urllib.request

from fastapi import FastAPI
from llama_cpp import Llama

app = FastAPI(docs_url="/")


def download_file(file_link, filename):
    # Check whether the file already exists before downloading
    if not os.path.isfile(filename):
        urllib.request.urlretrieve(file_link, filename)
        print("File downloaded successfully.")
    else:
        print("File already exists.")


# Download the GGUF model from Hugging Face
ggml_model_path = "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF/resolve/main/zephyr-7b-beta.Q4_0.gguf"
filename = "zephyr-7b-beta.Q4_0.gguf"

download_file(ggml_model_path, filename)

# Load the downloaded model; n_ctx sets the context window size and
# n_batch the prompt-processing batch size.
llm = Llama(
    model_path=filename,
    n_ctx=512,
    n_batch=126,
    chat_format="chatml",
)


def generate_text(
    prompt="What is the color of an apple?",
    max_tokens=256,
    temperature=0.7,
    top_p=0.5,
    echo=False,
    stop=["#"],
):
    output = llm(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        echo=echo,
        stop=stop,
    )
    output_text = output["choices"][0]["text"]
    return output_text


def generate_prompt_from_template(input):
    # ChatML-style prompt; the trailing <|im_start|>assistant tag cues the
    # model to begin its reply. Adjust these special tokens if your model
    # expects a different chat format.
    chat_prompt_template = f"""<|im_start|>system
You are a helpful chatbot.<|im_end|>
<|im_start|>user
{input}<|im_end|>
<|im_start|>assistant
"""
    return chat_prompt_template


@app.get("/generate")
def generate(text: str):
    prompt = generate_prompt_from_template(text)
    return generate_text(prompt, max_tokens=356)
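# A minimal usage sketch (assuming this file is saved as main.py and
# uvicorn is installed; neither is stated in the listing above):
#
#   uvicorn main:app --reload
#
# Then query the endpoint, for example:
#
#   curl "http://127.0.0.1:8000/generate?text=Tell%20me%20a%20joke"
#
# The interactive Swagger docs are served at the root path, since the app
# was created with docs_url="/".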