File size: 2,926 Bytes

1a8ce84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9851082
1a8ce84
 
3a5bc19
1a8ce84
 
 
 
 
3a5bc19
1a8ce84
 
3a5bc19
1a8ce84
 
 
 
 
3a5bc19
1a8ce84
 
3a5bc19
1a8ce84
 
 
 
 
3a5bc19
1a8ce84
 
3a5bc19
1a8ce84
 
 
 
 
3a5bc19
1a8ce84
 
 
ca365ca
1a8ce84
3a5bc19
ca365ca
3a5bc19
ca365ca
1a8ce84
3a5bc19
1a8ce84
 
3a5bc19

from huggingface_hub import InferenceClient
import gradio as gr

class MistralChatbot:
    """Gradio chat front-end for the hosted Mistral-7B-Instruct model.

    The class owns a single ``InferenceClient`` and exposes three steps:
    building the instruction-formatted prompt, streaming a completion, and
    wiring both into a ``gr.ChatInterface``.
    """

    # Lower bounds applied to sampling parameters before they are sent.
    # NOTE(review): the hosted text-generation backend is expected to reject
    # temperature <= 0, top_p outside (0, 1) and max_new_tokens < 1 -- confirm
    # against the backend's validation rules if the endpoint changes.
    _MIN_TEMPERATURE = 1e-2
    _MIN_TOP_P = 1e-2
    _MIN_NEW_TOKENS = 1

    def __init__(self):
        """Create the inference client bound to the Mistral instruct model."""
        self.client = InferenceClient("mistralai/Mistral-7B-Instruct-v0.1")

    def format_prompt(self, message, history):
        """Build the ``[INST]``-style prompt for the next model call.

        Args:
            message: The new user message to answer.
            history: Iterable of ``(user_prompt, bot_response)`` pairs for the
                turns that have already been completed.

        Returns:
            A single string: ``<s>``, then every past turn rendered as
            ``[INST] user [/INST] reply</s> ``, then the new message wrapped
            in ``[INST] ... [/INST]``.
        """
        past_turns = [
            f"[INST] {user_prompt} [/INST] {bot_response}</s> "
            for user_prompt, bot_response in history
        ]
        return "<s>" + "".join(past_turns) + f"[INST] {message} [/INST]"

    def generate(self, prompt, history, temperature=0.9, max_new_tokens=256, top_p=0.95, repetition_penalty=1.0):
        """Stream a reply to ``prompt``, yielding the text accumulated so far.

        Args:
            prompt: The new user message.
            history: Completed ``(user_prompt, bot_response)`` pairs.
            temperature: Sampling temperature; raised to 0.01 if lower.
            max_new_tokens: Generation length cap; coerced to ``int`` and
                raised to 1 if lower (the UI slider allows 0).
            top_p: Nucleus-sampling mass. Values >= 1 are sent as ``None``
                (no nucleus filtering); values below 0.01 are raised to 0.01.
            repetition_penalty: Passed through unchanged.

        Yields:
            The reply text generated so far, growing with each streamed token.

        Returns:
            The final reply text (available as ``StopIteration.value``).
        """
        temperature = max(float(temperature), self._MIN_TEMPERATURE)
        max_new_tokens = max(int(max_new_tokens), self._MIN_NEW_TOKENS)

        top_p = float(top_p)
        if top_p >= 1.0:
            # top_p == 1 means "keep the whole distribution"; omitting the
            # parameter expresses that without tripping range validation.
            top_p = None
        else:
            top_p = max(top_p, self._MIN_TOP_P)

        generate_kwargs = dict(
            temperature=temperature,
            max_new_tokens=max_new_tokens,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            do_sample=True,
            seed=42,
        )

        formatted_prompt = self.format_prompt(prompt, history)

        stream = self.client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
        output = ""

        for response in stream:
            token = response.token
            # Skip control tokens such as the end-of-sequence marker: they are
            # not part of the reply, and leaving "</s>" in the text would put
            # it into the chat history, where format_prompt appends another.
            if getattr(token, "special", False):
                continue
            output += token.text
            yield output
        return output

    def launch_chat(self):
        """Build the chat UI and start the Gradio server.

        The sliders are listed in the same order as the optional parameters
        of ``generate`` (temperature, max_new_tokens, top_p,
        repetition_penalty), which is how their values reach that method.
        """
        additional_inputs = [
            gr.Slider(
                label="Temperature",
                value=0.9,
                minimum=0.0,
                maximum=1.0,
                step=0.05,
                interactive=True,
                info="Higher values produce more diverse outputs",
            ),
            gr.Slider(
                label="Max new tokens",
                value=256,
                minimum=0,
                maximum=1048,
                step=64,
                interactive=True,
                info="The maximum numbers of new tokens",
            ),
            gr.Slider(
                label="Top-p (nucleus sampling)",
                value=0.90,
                minimum=0.0,
                maximum=1,
                step=0.05,
                interactive=True,
                info="Higher values sample more low-probability tokens",
            ),
            gr.Slider(
                label="Repetition penalty",
                value=1.2,
                minimum=1.0,
                maximum=2.0,
                step=0.05,
                interactive=True,
                info="Penalize repeated tokens",
            ),
        ]

        gr.ChatInterface(
            fn=self.generate,
            chatbot=gr.Chatbot(show_label=False, show_share_button=False, show_copy_button=True, likeable=True, layout="panel"),
            additional_inputs=additional_inputs,
            title="Mistral 7B"
        ).launch(show_api=False)

# Script entry point: build the chatbot and serve the Gradio UI.
if __name__ == "__main__":
    MistralChatbot().launch_chat()