| import gradio as gr |
| from llama_cpp import Llama |
| from huggingface_hub import hf_hub_download |
|
|
| |
| |
# Hugging Face Hub coordinates of the quantized GGUF weights to serve.
REPO_ID = "bartowski/Qwen2.5-14B-Instruct-GGUF"
FILENAME = "Qwen2.5-14B-Instruct-Q4_K_M.gguf"

# Fetch the weights file and keep the local path that the Hub client returns.
print(f"Downloading {FILENAME} from {REPO_ID}...")
model_path = hf_hub_download(REPO_ID, FILENAME)

# Load the model once at import time; `llm` is the shared module-level handle.
print("Loading model...")
llm = Llama(
    model_path,
    verbose=False,
    n_threads=2,
    n_ctx=8192,
)
|
|
def _iter_history_turns(history):
    """Yield ``(role, text)`` pairs from either supported chat-history layout."""
    for entry in history or ():
        if isinstance(entry, dict):
            # Role/content layout: one dict per message.
            yield entry.get("role"), entry.get("content")
        else:
            # Pair layout: one (user, assistant) pair per exchange.
            user_msg, bot_msg = entry
            yield "user", user_msg
            yield "assistant", bot_msg


def _build_chatml_prompt(message, history):
    """Render the past turns plus the new user message as one prompt string."""
    segments = [
        f"<|im_start|>{role}\n{text}<|im_end|>\n"
        for role, text in _iter_history_turns(history)
        # Skip empty slots (e.g. a pair whose reply is None) so the literal
        # text "None" is never fed to the model.
        if role and text is not None
    ]
    # Leave the assistant turn open so the model continues from that point.
    segments.append(
        f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
    )
    return "".join(segments)


def generate_pro(message, history):
    """Stream the model's reply to ``message`` given the prior conversation.

    Args:
        message: The newest user message.
        history: Earlier turns, either as ``(user, assistant)`` pairs or as
            dicts with ``"role"`` and ``"content"`` keys. ``None`` or an
            empty sequence means there is no prior conversation.

    Yields:
        The reply accumulated so far, growing with every streamed chunk, so
        a caller can redraw the whole partial answer each time.
    """
    prompt = _build_chatml_prompt(message, history)

    stream = llm.create_completion(
        prompt,
        max_tokens=2048,
        stop=["<|im_end|>"],
        stream=True,
        temperature=0.7,
        top_p=0.9,
    )

    partial_text = ""
    for output in stream:
        partial_text += output["choices"][0]["text"]
        yield partial_text
|
|
# Chat UI whose handler is the streaming generator `generate_pro`.
chat_interface = gr.ChatInterface(
    generate_pro,
    description="Running Qwen2.5-14B-Instruct (GGUF). Balanced Power.",
    title="🌟 Lumin Pro (Qwen 14B)",
)


if __name__ == "__main__":
    # Listen on every network interface, on port 7860.
    chat_interface.launch(server_port=7860, server_name="0.0.0.0")