import pprint
import subprocess
from threading import Thread

import gradio as gr
from optimum.intel.openvino import OVModelForCausalLM
from transformers import AutoTokenizer, TextIteratorStreamer

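# Log CPU information at startup so the host hardware is visible in the logs.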
result = subprocess.run(["lscpu"], text=True, capture_output=True)
pprint.pprint(result.stdout)

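# original_model_id is the source PyTorch checkpoint; model_id points to a copy
# already converted to OpenVINO IR, so no conversion happens at load time.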
original_model_id = "mistralai/Mistral-7B-Instruct-v0.2"
model_id = "helenai/mistralai-Mistral-7B-Instruct-v0.2-ov"

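# OVModelForCausalLM mirrors the transformers AutoModelForCausalLM API but runs
# inference with the OpenVINO runtime.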
model = OVModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
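
# If only the original PyTorch checkpoint were available, optimum-intel could
# convert it on the fly (a sketch; this downloads and exports the full model):
# model = OVModelForCausalLM.from_pretrained(original_model_id, export=True)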


def run_generation(user_text, top_p, temperature, top_k, max_new_tokens):
    # message = [{"role": "user", "content": "You are a helpful assistant"}, {"role": "assistant", "content": "How can I help?"}, {"role":"user", "content":user_text}]
    message = [{"role": "user", "content": user_text}]

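    # apply_chat_template wraps the message in the model's expected prompt format
    # and returns the tokenized inputs as PyTorch tensors.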
    model_inputs = tokenizer.apply_chat_template(message, return_tensors="pt", return_dict=True)

    # Start generation on a separate thread so that the UI is not blocked; the text is pulled from
    # the streamer in the main thread. The streamer's timeout prevents the UI from hanging if an
    # exception occurs in the generation thread.
    streamer = TextIteratorStreamer(
        tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
    )
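    # Unpack the tokenized inputs and add the user-selected sampling parameters.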
    generate_kwargs = dict(
        model_inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        temperature=float(temperature),
        top_k=top_k,
    )
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    # Pull the generated text from the streamer, and update the model output.
    model_output = ""
    for new_text in streamer:
        model_output += new_text
        yield model_output
    return model_output


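# Clear the user input textbox (defined for convenience; not wired to an event below).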
def reset_textbox():
    return gr.update(value="")


with gr.Blocks() as demo:
    original_link = "https://huggingface.co/spaces/joaogante/transformers_streaming"
    gr.Markdown(
        "# OpenVINO and 🤗 Transformers 🔥Streaming🔥 on Gradio\n"
        "This demo showcases the use of the "
        "[streaming feature](https://huggingface.co/docs/transformers/main/en/generation_strategies#streaming) "
        "of 🤗 Transformers with OpenVINO models and Gradio to generate text in real-time. It uses "
        f"[{original_model_id}](https://huggingface.co/{original_model_id}), "
        "converted to OpenVINO.\n\n"
        f"This space was duplicated from {original_link} and modified for OpenVINO models."
    )

    with gr.Row():
        with gr.Column(scale=4):
            user_text = gr.Textbox(
                label="User input",
            )
            model_output = gr.Textbox(label="Model output", lines=10, interactive=False)
            button_submit = gr.Button(value="Submit")

        with gr.Column(scale=1):
            max_new_tokens = gr.Slider(
                minimum=1,
                maximum=1000,
                value=250,
                step=1,
                interactive=True,
                label="Max New Tokens",
            )
            top_p = gr.Slider(
                minimum=0.05,
                maximum=1.0,
                value=0.95,
                step=0.05,
                interactive=True,
                label="Top-p (nucleus sampling)",
            )
            top_k = gr.Slider(
                minimum=1,
                maximum=50,
                value=50,
                step=1,
                interactive=True,
                label="Top-k",
            )
            temperature = gr.Slider(
                minimum=0.1,
                maximum=5.0,
                value=0.8,
                step=0.1,
                interactive=True,
                label="Temperature",
            )

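    # Run generation when the user presses Enter in the textbox or clicks Submit.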
    user_text.submit(
        run_generation,
        [user_text, top_p, temperature, top_k, max_new_tokens],
        model_output,
    )
    button_submit.click(
        run_generation,
        [user_text, top_p, temperature, top_k, max_new_tokens],
        model_output,
    )

    # .queue() already enables queuing, so the deprecated enable_queue launch flag is not needed.
    demo.queue(max_size=32).launch(server_name="0.0.0.0")
    # For local use:
    # demo.launch(server_name="0.0.0.0")