| |
| """ |
| llama.cpp chat with zindango-slm (GGUF) for English chat verification. |
| Uses llama-cpp-python with the Q8_0 quantized model from Hugging Face. |
| """ |
| import os |
| import sys |
|
|
|
|
def main():
    """Interactive English-verification chat with zindango-slm (Q8_0 GGUF).

    Checks that llama-cpp-python is importable, ensures the GGUF model file
    is present (downloading it from Hugging Face if needed), then runs a
    streaming chat REPL.

    Returns:
        int: process exit code — 0 on clean exit, 1 when the runtime or the
        model file could not be obtained.
    """
    try:
        from llama_cpp import Llama
    except ImportError:
        print("llama-cpp-python not installed.")
        print("Install: pip install llama-cpp-python")
        print("Or use pre-built wheels: pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu")
        print("For GPU: pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121")
        print("\nAlternatively run: ./scripts/llamacpp_chat.sh (requires llama-cli from llama.cpp)")
        return 1

    gguf_path = _ensure_gguf()
    if gguf_path is None:
        return 1

    print("Loading zindango-slm (Q8_0)...")
    llm = Llama(
        model_path=gguf_path,
        n_ctx=2048,
        n_threads=os.cpu_count() or 4,  # use all cores; fall back to 4 if undetectable
        chat_format="chatml",
        verbose=False,
    )

    _chat_loop(llm)
    return 0


def _ensure_gguf():
    """Locate the Q8_0 GGUF under <project>/models/zindango-slm, downloading if missing.

    Returns:
        str | None: path to the model file, or None when it could not be obtained.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(script_dir)
    model_dir = os.path.join(project_root, "models", "zindango-slm")
    gguf_path = os.path.join(model_dir, "zindango-slm-Q8_0.gguf")

    if os.path.isfile(gguf_path):
        return gguf_path

    print(f"GGUF not found at {gguf_path}")
    print("Download with: huggingface-cli download ksjpswaroop/zindango-slm zindango-slm-Q8_0.gguf --local-dir models/zindango-slm")
    os.makedirs(model_dir, exist_ok=True)
    try:
        from huggingface_hub import hf_hub_download
        print("Downloading zindango-slm-Q8_0.gguf from Hugging Face...")
        return hf_hub_download(
            repo_id="ksjpswaroop/zindango-slm",
            filename="zindango-slm-Q8_0.gguf",
            local_dir=model_dir,
            # NOTE(review): deprecated and ignored by recent huggingface_hub;
            # kept for compatibility with older installed versions.
            local_dir_use_symlinks=False,
        )
    except Exception as e:
        # Best-effort download: report the failure and let the caller bail out
        # (covers missing huggingface_hub as well as network/auth errors).
        print(f"Download failed: {e}")
        return None


def _chat_loop(llm):
    """Run the interactive chat REPL until EOF, Ctrl-C, or a quit command.

    Args:
        llm: a loaded ``llama_cpp.Llama`` instance with a chat format set.
    """
    # System prompt pins the model to English for this verification run.
    messages = [
        {"role": "system", "content": "You are a helpful assistant. Always respond in English."},
    ]

    print("\n" + "=" * 60)
    print("zindango-slm Chat (llama.cpp) - English verification")
    print("=" * 60)
    print("Type your message and press Enter. Commands: /quit, /clear")
    print()

    while True:
        try:
            user_input = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            print("\nBye!")
            break

        if not user_input:
            continue
        lowered = user_input.lower()  # hoist: checked against two command sets
        if lowered in ("/quit", "/exit", "quit", "exit"):
            print("Bye!")
            break
        if lowered == "/clear":
            messages = [messages[0]]  # keep only the system prompt
            print("[Context cleared]")
            continue

        messages.append({"role": "user", "content": user_input})

        # Stream the reply token-by-token so the user sees output immediately.
        print("Assistant: ", end="", flush=True)
        stream = llm.create_chat_completion(
            messages=messages,
            max_tokens=512,
            temperature=0.7,
            stream=True,
        )
        full_reply = ""
        for chunk in stream:
            delta = chunk["choices"][0].get("delta", {})
            content = delta.get("content", "")
            if content:
                print(content, end="", flush=True)
                full_reply += content
        print()

        # Only record non-empty replies so the context stays well-formed.
        if full_reply:
            messages.append({"role": "assistant", "content": full_reply})
|
|
|
|
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status
    # (raising SystemExit is equivalent to sys.exit()).
    raise SystemExit(main())
|
|