from huggingface_hub import InferenceClient
import gradio as gr

# Shared inference client, created once at import time and reused by every
# chat request. The model id is kept in a named constant so it is easy to spot.
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"
client = InferenceClient(model=MODEL_ID)

def format_prompt(message, history):
    """Build a Mistral-instruct prompt from past turns plus the new message.

    Args:
        message: The user's latest message.
        history: Iterable of ``(user_prompt, bot_response)`` pairs for the
            earlier turns, oldest first. ``None`` or empty means no history.

    Returns:
        A single prompt string: ``<s>`` followed by every past turn as
        ``[INST] user [/INST] reply</s>`` and finally the new message as
        ``[INST] message [/INST]``.
    """
    pieces = ["<s>"]
    for user_prompt, bot_response in history or []:
        # Each *completed* assistant reply is closed with the end-of-sequence
        # marker so the model can tell where earlier turns stop.
        pieces.append(f"[INST] {user_prompt} [/INST] {bot_response}</s>")
    # The prompt must stay open after the final [/INST]: appending </s> here
    # would signal that the sequence is already finished before the model
    # has produced its answer.
    pieces.append(f"[INST] {message} [/INST]")
    return "".join(pieces)

def generate(
    prompt, history, temperature=0.9, max_new_tokens=256, top_p=0.95, repetition_penalty=1.0,
):
    """Stream a model reply for ``prompt`` given the prior chat ``history``.

    This is a generator: after every received token it yields the reply text
    accumulated so far, so the caller can display the answer as it grows.

    Args:
        prompt: The user's latest message.
        history: Earlier turns, forwarded unchanged to ``format_prompt``.
        temperature: Sampling temperature; values below 0.01 are raised to 0.01.
        max_new_tokens: Upper bound on generated tokens; coerced to an int >= 1.
        top_p: Nucleus-sampling threshold. Values >= 1.0 are sent as ``None``
            (no nucleus filtering) and values <= 0.0 are raised to 0.01.
        repetition_penalty: Penalty applied to repeated tokens.

    Yields:
        The cumulative reply text after each non-special token.

    Returns:
        The complete reply text (available as ``StopIteration.value``).
    """
    # UI sliders may deliver floats and boundary values; normalise them here so
    # the request is well-formed regardless of what the front end sends.
    temperature = max(float(temperature), 1e-2)  # Prevent temperature going below 0.01
    max_new_tokens = max(int(max_new_tokens), 1)  # 0 new tokens is not a valid request
    repetition_penalty = float(repetition_penalty)

    top_p = float(top_p)
    if top_p >= 1.0:
        # NOTE(review): text-generation-inference backends are expected to
        # reject top_p outside the open interval (0, 1) - confirm. Omitting
        # the parameter is the equivalent of "keep the full distribution".
        top_p = None
    elif top_p <= 0.0:
        top_p = 1e-2

    # Keyword arguments for generation configuration
    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,  # Fixed seed: identical inputs give identical samples
    )

    # Format the prompt with the user's message and history
    formatted_prompt = format_prompt(prompt, history)

    # Call the text generation endpoint; stream=True + details=True makes each
    # item carry a `.token` object instead of a bare string.
    stream = client.text_generation(
        formatted_prompt,
        **generate_kwargs,
        stream=True,
        details=True,
        return_full_text=False,
    )
    output = ""

    # Stream the response token by token
    for response in stream:
        token = response.token
        # Control tokens (e.g. the closing "</s>") are flagged as special and
        # must not leak into the text shown to the user.
        if getattr(token, "special", False):
            continue
        output += token.text
        yield output  # Yield partial output for real-time display
    return output

# Additional inputs (sliders) for controlling generation parameters.
# Their values are handed to `generate` after (message, history) in list order,
# so the order here must stay: temperature, max_new_tokens, top_p,
# repetition_penalty.
additional_inputs = [
    gr.Slider(
        label="Temperature", value=0.9, minimum=0.0, maximum=1.0, step=0.05,
        interactive=True, info="Higher values produce more diverse outputs",
    ),
    # Range is kept on the 64-step grid and starts above zero: a request for
    # 0 new tokens is invalid, and the previous maximum of 1048 was not a
    # multiple of the step.
    gr.Slider(
        label="Max new tokens", value=256, minimum=64, maximum=1024, step=64,
        interactive=True, info="The maximum numbers of new tokens",
    ),
    gr.Slider(
        label="Top-p (nucleus sampling)", value=0.90, minimum=0.0, maximum=1.0, step=0.05,
        interactive=True, info="Higher values sample more low-probability tokens",
    ),
    gr.Slider(
        label="Repetition penalty", value=1.2, minimum=1.0, maximum=2.0, step=0.05,
        interactive=True, info="Penalize repeated tokens",
    ),
]

# Build the chat UI in three steps: the message panel, the interface that wires
# it to `generate` and the parameter sliders, and finally the server launch.
chat_panel = gr.Chatbot(
    show_label=False,
    show_share_button=False,
    show_copy_button=True,
    likeable=True,
    layout="panel",
)

demo = gr.ChatInterface(
    fn=generate,  # Invoked for every submitted user message
    chatbot=chat_panel,
    additional_inputs=additional_inputs,  # Sliders for the generation parameters
    title="Mistral 7B v0.3 ChatGPT Clone",
    description="A ChatGPT clone using Mistral 7B model. Adjust parameters to fine-tune the generation.",
)

# show_api=False hides Gradio's auto-generated API documentation from the page;
# it is unrelated to API keys or credentials.
demo.launch(show_api=False)