#
# SPDX-FileCopyrightText: Hadad
# SPDX-License-Identifier: Apache-2.0
#
import os
from ollama import AsyncClient
import gradio as gr


async def playground(
    message,
    history,
    num_ctx,
    temperature,
    repeat_penalty,
    min_p,
    top_k,
    top_p,
    presence_penalty
):
    # Ignore empty or non-string messages instead of calling the model.
    if not isinstance(message, str) or not message.strip():
        yield []
        return
    # Connect to the Ollama server; the base URL and API key come from
    # environment variables so no credentials live in the source.
    client = AsyncClient(
        host=os.getenv("OLLAMA_API_BASE_URL"),
        headers={
            "Authorization": f"Bearer {os.getenv('OLLAMA_API_KEY')}"
        }
    )
    # Rebuild the conversation from Gradio's message-style history, keeping
    # only well-formed {"role": ..., "content": ...} entries.
    messages = []
    for item in history:
        if isinstance(item, dict) and "role" in item and "content" in item:
            messages.append({
                "role": item["role"],
                "content": item["content"]
            })
    messages.append({"role": "user", "content": message})
    # Stream the reply chunk by chunk, yielding the accumulated text so the
    # chat window updates progressively.
    response = ""
    async for part in await client.chat(
        model="qwen3:0.6b",
        messages=messages,
        options={
            "num_ctx": int(num_ctx),
            "temperature": float(temperature),
            "repeat_penalty": float(repeat_penalty),
            "min_p": float(min_p),
            "top_k": int(top_k),
            "top_p": float(top_p),
            "presence_penalty": float(presence_penalty)
        },
        stream=True
    ):
        response += part.get("message", {}).get("content", "")
        yield response


with gr.Blocks(
    fill_height=True,
    fill_width=True
) as app:
    with gr.Sidebar():
        gr.Markdown("## Ollama Playground by UltimaX Intelligence")
        gr.HTML(
            """
This space runs the Qwen 3 (0.6B) model from Alibaba Cloud, hosted on a server using Ollama and accessed via the Ollama Python SDK.

Official documentation for using Ollama with the Python SDK can be found here.

Qwen 3 (0.6B) runs entirely on a dual-core CPU. Thanks to its small size, the model can operate efficiently on minimal hardware.

The Qwen 3 (0.6B) model can also be viewed or downloaded from the official Ollama website here.

Like this project? You can support me by buying a coffee.
            """
        )
        gr.Markdown("---")
        gr.Markdown("## Model Parameters")
        # Sampling and decoding controls, passed straight through to Ollama
        # as generation options.
        num_ctx = gr.Slider(
            minimum=512,
            maximum=1024,
            value=512,
            step=128,
            label="Context Length (num_ctx)",
            info="Maximum context window size. Kept small because the model runs on CPU only."
        )
        gr.Markdown("")
        temperature = gr.Slider(
            minimum=0.1,
            maximum=2.0,
            value=0.6,
            step=0.1,
            label="Temperature",
            info="Controls randomness in generation"
        )
        gr.Markdown("")
        repeat_penalty = gr.Slider(
            minimum=0.1,
            maximum=2.0,
            value=1.0,
            step=0.1,
            label="Repeat Penalty",
            info="Penalty for repeating tokens"
        )
        gr.Markdown("")
        min_p = gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=0.00,
            step=0.01,
            label="Min P",
            info="Minimum probability threshold"
        )
        gr.Markdown("")
        top_k = gr.Slider(
            minimum=0,
            maximum=100,
            value=20,
            step=1,
            label="Top K",
            info="Number of top tokens to consider"
        )
        gr.Markdown("")
        top_p = gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top P",
            info="Cumulative probability threshold"
        )
        gr.Markdown("")
        presence_penalty = gr.Slider(
            minimum=0.0,
            maximum=2.0,
            value=1.5,
            step=0.1,
            label="Presence Penalty",
            info="Penalty for introducing new tokens"
        )
    # The chat UI streams playground() output; the sidebar sliders above are
    # wired in as additional inputs.
    gr.ChatInterface(
        fn=playground,
        additional_inputs=[
            num_ctx,
            temperature,
            repeat_penalty,
            min_p,
            top_k,
            top_p,
            presence_penalty
        ],
        chatbot=gr.Chatbot(
            label="Ollama | Qwen 3 (0.6B)",
            type="messages",
            show_copy_button=True,
            allow_tags=["think"],
            scale=1
        ),
        type="messages",
        examples=[
            ["Please introduce yourself."],
            ["What caused World War II?"],
            ["Give me a short introduction to large language models."],
            ["Explain quantum computers."]
        ],
        cache_examples=False,
        show_api=False
    )

# Bind to all interfaces so the app is reachable when run inside a container.
app.launch(
    server_name="0.0.0.0",
    pwa=True
)
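
# ---------------------------------------------------------------------------
# Reference sketch, kept commented out so the app's behavior is unchanged
# (app.launch() above blocks while serving, so live code here would not run):
# a minimal standalone way to exercise the same streaming chat call with the
# Ollama Python SDK, assuming the same environment variables are set. The
# function name "smoke_test" and the "Hello" prompt are illustrative
# assumptions, not part of the original app.
#
# import asyncio
#
# async def smoke_test():
#     client = AsyncClient(
#         host=os.getenv("OLLAMA_API_BASE_URL"),
#         headers={"Authorization": f"Bearer {os.getenv('OLLAMA_API_KEY')}"}
#     )
#     async for part in await client.chat(
#         model="qwen3:0.6b",
#         messages=[{"role": "user", "content": "Hello"}],
#         stream=True
#     ):
#         print(part.get("message", {}).get("content", ""), end="", flush=True)
#
# asyncio.run(smoke_test())
# ---------------------------------------------------------------------------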