import gradio as gr
from llama_cpp import Llama

# Try to download the quantized Gemma GGUF model from the Hugging Face Hub;
# fall back to a local copy of the file if the download fails.
try:
    llm = Llama.from_pretrained(
        repo_id="operablepattern/gemma-2b-it-Q",
        filename="*Q5_K_M.gguf",
        chat_format="gemma",
        verbose=True,
    )
except Exception:
    llm = Llama(
        model_path="./gemma-2b-it-Q5_K_M.gguf",
        chat_format="gemma",
        verbose=True,
    )

def response(message, history):
    # Rebuild the conversation so the Gemma chat template sees the full
    # context; Gradio passes history as a list of (user, assistant) pairs.
    messages = []
    for user_message, assistant_message in history:
        messages.append({"role": "user", "content": user_message})
        messages.append({"role": "assistant", "content": assistant_message})
    messages.append({"role": "user", "content": message})
    # Use the chat-completion API so chat_format="gemma" is actually applied;
    # max_tokens is kept small to keep demo responses short.
    output = llm.create_chat_completion(messages=messages, max_tokens=32)
    print(output)
    return output["choices"][0]["message"]["content"].strip()

gr.ChatInterface(
    fn=response,
    title="Chat with Gemma",
).queue().launch()
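
# Usage note (a minimal sketch; the file name app.py is an assumption, not
# given in the original):
#     python app.py
# launch() starts a local Gradio server, by default at http://127.0.0.1:7860.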