import os
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Set the Hugging Face API token
#os.environ["HUGGINGFACE_TOKEN"] = os.getenv("HUGGINGFACE_TOKEN")

# Hugging Face Hub repository and the GGUF file to load from it.
model_name_or_path = "TheBloke/OpenBuddy-Llama2-13B-v11.1-GGUF"
model_basename = "openbuddy-llama2-13b-v11.1.Q2_K.gguf"

# Fetch the model file from the "main" revision of the repository, then hand
# the resulting path to llama.cpp to load the model once at import time.
model_path = hf_hub_download(
    filename=model_basename,
    repo_id=model_name_or_path,
    revision="main",
)
llama = Llama(model_path=model_path)

def predict(message, history):
    """Stream the model's reply to ``message`` given the earlier chat turns.

    Args:
        message: The latest user input text.
        history: Earlier turns as an iterable of ``(user_text, reply_text)``
            pairs. An element that is ``None`` is skipped.

    Yields:
        The reply text accumulated so far, once for every streamed chunk
        that carries content.
    """
    messages = []
    for user_text, reply_text in history:
        if user_text is not None:
            messages.append({"role": "user", "content": user_text + "\n"})
        if reply_text is not None:
            # Earlier model replies must use the "assistant" role; tagging
            # them as "system" misrepresents the conversation to the model.
            messages.append({"role": "assistant", "content": reply_text + "\n"})
    messages.append({"role": "user", "content": message + "\n"})

    # Get the answer from Llama (streaming enabled)
    streamer = llama.create_chat_completion(messages, stream=True)

    partial_message = ""
    for chunk in streamer:
        delta = chunk["choices"][0]["delta"]
        # Not every chunk carries text; only content-bearing deltas extend
        # the reply. A separate name avoids rebinding the `message` parameter.
        piece = delta.get("content")
        if piece is not None:
            partial_message += piece
            yield partial_message

# Build the chat UI around `predict` and start the server.
# Queueing is enabled through .queue() instead of launch(enable_queue=True):
# that keyword was deprecated and is rejected by newer Gradio releases.
gr.ChatInterface(
    predict,
    examples=[
        "What's the relationship between Harry Potter and Hermione ?",
        "请解释下面的emoji符号描述的情景👨👩🔥❄️",
        "明朝内阁制度的特点是什么?",
        "如何进行经济建设?",
        "你听说过马克思吗？",
    ],
    cache_examples=False,
).queue().launch()