# Hugging Face Spaces app (recovered from a Spaces "Build error" log paste).
import gradio as gr
from llama_cpp import Llama

# Load the quantized model once at module import.
# verbose=False keeps llama.cpp's internal logging out of the Space logs.
llm = Llama(
    model_path="llama3.2-1b-Uncensored-Q4_K_M.gguf",
    n_ctx=512,    # small context window — enough for single-turn prompts
    n_threads=4,  # CPU-only Spaces hardware
    verbose=False,
)
def stream_chat(message, history):
    """Stream a completion for *message*, cutting off meta-commentary.

    Yields the accumulated response text after every token so the chat UI
    can render it incrementally. *history* is required by the
    gr.ChatInterface contract but is intentionally unused (single-turn
    prompting keeps the model from drifting).
    """
    # Minimalist prompt to prevent the model from entering "test/quiz" mode
    prompt = f"User: {message}\nAssistant:"

    token_stream = llm(
        prompt,
        max_tokens=256,
        stop=["User:", "Assistant:", "\n"],
        stream=True,
        temperature=0,  # greedy decoding: prevents "wandering" into reasoning
        repeat_penalty=1.2,
    )

    # Openers that usually signal the model is starting a monologue.
    forbidden_start = ("Okay", "I think", "First", "Let me", "The user")

    accumulated = ""
    for chunk in token_stream:
        piece = chunk["choices"][0]["text"]
        # Abort generation the moment the reply begins with meta-commentary.
        # str.startswith accepts a tuple, testing all prefixes in one call.
        if (accumulated + piece).startswith(forbidden_start):
            break
        accumulated += piece
        yield accumulated.strip()
# Assemble the UI: a single chat panel backed by the streaming generator.
with gr.Blocks() as demo:
    gr.ChatInterface(
        fn=stream_chat,
        title="DIRECT-LLAMA-MAX",
        description="Fast, direct, and uncensored.",
    )

if __name__ == "__main__":
    # Bind to all interfaces on 7860, the port Spaces expects the app on.
    demo.launch(server_name="0.0.0.0", server_port=7860)