Spaces:

mii-llm
/

maestrale-chat-v0.4-beta

Sleeping

File size: 4,014 Bytes

import gradio as gr
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TextIteratorStreamer,
)
import os
from threading import Thread
import spaces
import time
import subprocess

# HTML handed to gr.Chatbot(placeholder=...) below: a centered image with an
# Italian prompt ("Fammi una domanda!" = "Ask me a question!").
PLACEHOLDER = """
<div style="padding: 40px; text-align: center; display: flex; flex-direction: column; align-items: center;">
   <img src="https://i.imgur.com/yu0sVwC.png" style="width: 90%; max-width: 650px; height: auto; opacity: 0.8; border-radius: 20px;"> 
   <p style="font-size: 18px; margin-bottom: 2px; opacity: 0.65;">Fammi una domanda!</p>
</div>
"""

# Custom stylesheet handed to gr.ChatInterface(css=...) below. It adjusts the
# layout of message rows, the bubble border radius, and the colors applied
# under the `.dark` class.
css = """
.message-row {
    justify-content: space-evenly !important;
}
.message-bubble-border {
    border-radius: 6px !important;
}
.dark.message-bubble-border {
    border-color: #21293b !important;
}
.dark.user {
    background: #0a1120 !important;
}
.dark.assistant {
    background: transparent !important;
}
"""

# HTML blurb handed to gr.ChatInterface(description=...) below; it links to the
# model card on the Hugging Face Hub.
DESCRIPTION = """<div>
<p>🇮🇹 Italian LLM <a href="https://huggingface.co/mii-llm/maestrale-chat-v0.4-beta"><b>Maestrale Chat v0.4 beta</b></a>. Maestrale is a powerful language model for Italian, trained by mii-llm, based on Mistral 7B.</p>
<p>🔎 For more details about Maestrale and how to use it with <code>transformers</code>, visit the <a href="https://huggingface.co/mii-llm/maestrale-chat-v0.4-beta">model card</a>.</p>
</div>"""

# Hub repository that supplies both the tokenizer and the model weights.
_MODEL_ID = "mii-llm/maestrale-chat-v0.4-beta"

tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    _MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Token ids that end generation: the tokenizer's EOS token and the
# "<|im_end|>" marker (passed to generate() as `eos_token_id` in chat()).
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|im_end|>"),
]

# Choose the device used for the tokenized inputs in chat(): CUDA when
# available, otherwise the CPU. The choice is reported on stdout.
_cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if _cuda_available else "cpu")
if _cuda_available:
    print(f"Using GPU: {torch.cuda.get_device_name(device)}")
else:
    print("Using CPU")


# NOTE(review): the model is loaded with device_map="auto" earlier in this
# file, so this explicit move may be redundant -- confirm before removing.
model = model.to(device)


@spaces.GPU()
def chat(message, history, system, temperature, do_sample, max_tokens):
    """Stream the model's reply to ``message`` given the prior conversation.

    Args:
        message: Latest user message.
        history: Earlier turns as ``(user, assistant)`` pairs.
        system: Optional system prompt; left out of the prompt when empty.
        temperature: Sampling temperature. A non-positive value (the UI
            slider allows 0) is treated as a request for greedy decoding.
        do_sample: Whether to sample instead of decoding greedily.
        max_tokens: Upper bound on the number of newly generated tokens.

    Yields:
        The reply accumulated so far; each yield extends the previous one.
    """
    # Named ``conversation`` (not ``chat``) so it does not shadow this function.
    conversation = [{"role": "system", "content": system}] if system else []
    for user, assistant in history:
        conversation.append({"role": "user", "content": user})
        conversation.append({"role": "assistant", "content": assistant})
    conversation.append({"role": "user", "content": message})

    prompt = tokenizer.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )
    model_inputs = tokenizer([prompt], return_tensors="pt").to(device)

    streamer = TextIteratorStreamer(
        tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True
    )

    # Sampling with temperature <= 0 makes generate() raise inside the worker
    # thread, which would leave the streamer waiting until its timeout. A
    # zero temperature is the greedy limit, so decode greedily instead.
    sampling = bool(do_sample) and temperature > 0

    generate_kwargs = {
        **model_inputs,
        "streamer": streamer,
        "max_new_tokens": max_tokens,
        "do_sample": sampling,
        "eos_token_id": terminators,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if sampling:
        # Temperature only has an effect when sampling; passing it alongside
        # greedy decoding merely triggers a configuration warning.
        generate_kwargs["temperature"] = temperature

    # generate() blocks, so it runs in a background thread while this
    # generator consumes the streamer and forwards text to the UI.
    thread = Thread(target=model.generate, kwargs=generate_kwargs)
    thread.start()

    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield partial_text

    # Guarantees at least one yield even when nothing was streamed.
    yield partial_text


# Conversation panel shown by the interface.
chatbot = gr.Chatbot(
    height=550,
    placeholder=PLACEHOLDER,
    label='Conversazione',
    show_copy_button=True,
)

_theme = gr.themes.Soft()

# Accordion (initially closed) that groups the generation settings.
_settings_accordion = gr.Accordion(
    label="⚙️ Parametri", open=False, render=False
)

# Extra controls, listed in the order of chat()'s trailing parameters:
# system, temperature, do_sample, max_tokens.
_system_box = gr.Textbox(label="System", value="Sei un assistente utile.")
_temperature_slider = gr.Slider(
    minimum=0,
    maximum=1,
    step=0.1,
    value=0.7,
    label="Temperature",
    render=False,
)
_sampling_checkbox = gr.Checkbox(label="Sampling", value=True)
_max_tokens_slider = gr.Slider(
    minimum=128,
    maximum=4096,
    step=1,
    value=768,
    label="Max new tokens",
    render=False,
)

demo = gr.ChatInterface(
    fn=chat,
    chatbot=chatbot,
    fill_height=True,
    theme=_theme,
    css=css,
    additional_inputs_accordion=_settings_accordion,
    additional_inputs=[
        _system_box,
        _temperature_slider,
        _sampling_checkbox,
        _max_tokens_slider,
    ],
    stop_btn="Stop Generation",
    cache_examples=False,
    title="Maestrale Chat v0.4 beta",
    description=DESCRIPTION,
)

demo.launch()