import gradio as gr
from llama_cpp import Llama

# Try to download the quantized Gemma GGUF model from the Hugging Face Hub;
# fall back to a local copy of the file if the download fails.
try:
    llm = Llama.from_pretrained(
        repo_id="operablepattern/gemma-2b-it-Q",
        filename="*Q5_K_M.gguf",
        chat_format="gemma",
        verbose=True,
    )
except Exception:
    llm = Llama(
        model_path="./gemma-2b-it-Q5_K_M.gguf",
        chat_format="gemma",
        verbose=True,
    )

def response(message, history):
    # Rebuild the conversation so the Gemma chat template sees the full
    # context; Gradio passes history as a list of (user, assistant) pairs.
    messages = []
    for user_message, assistant_message in history:
        messages.append({"role": "user", "content": user_message})
        messages.append({"role": "assistant", "content": assistant_message})
    messages.append({"role": "user", "content": message})
    # Use the chat-completion API so chat_format="gemma" is actually applied;
    # max_tokens is kept small to keep demo responses short.
    output = llm.create_chat_completion(messages=messages, max_tokens=32)
    print(output)
    return output["choices"][0]["message"]["content"].strip()

gr.ChatInterface(
    fn=response,
    title="Chat with Gemma",
).queue().launch()
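
# Usage note (a minimal sketch; the file name app.py is an assumption, not
# given in the original):
#     python app.py
# launch() starts a local Gradio server, by default at http://127.0.0.1:7860.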