lightmate committed on
Commit
8f9fe18
·
verified ·
1 Parent(s): 4acb2ad

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +92 -60
app.py CHANGED
@@ -13,18 +13,62 @@ import openvino.properties.streams as streams
13
  import gradio as gr
14
 
15
  from llm_config import SUPPORTED_LLM_MODELS
16
- from notebook_utils import device_widget
17
 
18
  # Initialize model language options
19
  model_languages = list(SUPPORTED_LLM_MODELS)
20
 
 
 
 
 
 
 
 
 
21
  def update_model_id(model_language_value):
22
  model_ids = list(SUPPORTED_LLM_MODELS[model_language_value])
23
  return model_ids[0], gr.update(choices=model_ids)
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  # Function to download the model if not already present
26
  def download_model_if_needed(model_language_value, model_id_value):
27
  model_configuration, int4_model_dir, pt_model_name = get_model_path(model_language_value, model_id_value)
 
28
  int4_weights = int4_model_dir / "openvino_model.bin"
29
 
30
  if not int4_weights.exists():
@@ -34,12 +78,14 @@ def download_model_if_needed(model_language_value, model_id_value):
34
  # r = requests.get(model_configuration["model_url"])
35
  # with open(int4_weights, "wb") as f:
36
  # f.write(r.content)
37
-
38
  return int4_model_dir
39
 
40
  # Load the model
41
  def load_model(model_language_value, model_id_value):
42
  int4_model_dir = download_model_if_needed(model_language_value, model_id_value)
 
 
43
  ov_config = {hints.performance_mode(): hints.PerformanceMode.LATENCY, streams.num(): "1", props.cache_dir(): ""}
44
  core = ov.Core()
45
 
@@ -49,7 +95,7 @@ def load_model(model_language_value, model_id_value):
49
  tok = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
50
  ov_model = OVModelForCausalLM.from_pretrained(
51
  model_dir,
52
- device=device.value,
53
  ov_config=ov_config,
54
  config=AutoConfig.from_pretrained(model_dir, trust_remote_code=True),
55
  trust_remote_code=True
@@ -60,9 +106,18 @@ def load_model(model_language_value, model_id_value):
60
  # Gradio interface function for generating text responses
61
  def generate_response(history, temperature, top_p, top_k, repetition_penalty, model_language_value, model_id_value):
62
  tok, ov_model, model_configuration = load_model(model_language_value, model_id_value)
63
- input_ids = tok(" ".join([msg[0] for msg in history]), return_tensors="pt").input_ids
 
 
 
 
 
 
 
 
64
  streamer = gr.Textbox.update()
65
 
 
66
  generate_kwargs = dict(
67
  input_ids=input_ids,
68
  max_new_tokens=256,
@@ -73,72 +128,49 @@ def generate_response(history, temperature, top_p, top_k, repetition_penalty, mo
73
  streamer=streamer
74
  )
75
 
 
76
  event = Event()
 
77
  def generate_and_signal_complete():
78
  ov_model.generate(**generate_kwargs)
79
  event.set()
80
-
81
  t1 = Thread(target=generate_and_signal_complete)
82
  t1.start()
83
-
 
84
  partial_text = ""
85
  for new_text in streamer:
86
  partial_text += new_text
87
  history[-1][1] = partial_text
88
  yield history
89
 
90
- # Gradio UI within a Blocks context
91
- with gr.Blocks() as iface:
92
- model_language = gr.Dropdown(
93
- choices=model_languages,
94
- value=model_languages[0],
95
- label="Model Language"
96
- )
97
-
98
- model_id = gr.Dropdown(
99
- choices=[], # dynamically populated
100
- label="Model",
101
- value=None
102
- )
103
-
104
- model_language.change(update_model_id, inputs=model_language, outputs=[model_id])
105
-
106
- prepare_int4_model = gr.Checkbox(
107
- value=True,
108
- label="Prepare INT4 Model"
109
- )
110
- enable_awq = gr.Checkbox(
111
- value=False,
112
- label="Enable AWQ",
113
- visible=False
114
- )
115
-
116
- device = device_widget("CPU", exclude=["NPU"])
117
-
118
- temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.7, label="Temperature")
119
- top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.9, label="Top P")
120
- top_k = gr.Slider(minimum=0, maximum=50, value=50, label="Top K")
121
- repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.1, label="Repetition Penalty")
122
-
123
- history = gr.State([])
124
-
125
- iface_interface = gr.Interface(
126
- fn=generate_response,
127
- inputs=[
128
- history,
129
- temperature,
130
- top_p,
131
- top_k,
132
- repetition_penalty,
133
- model_language,
134
- model_id
135
- ],
136
- outputs=[gr.Textbox(label="Conversation History")],
137
- live=True,
138
- title="OpenVINO Chatbot"
139
- )
140
-
141
- iface_interface.launch(debug=True, share=True, server_name="0.0.0.0", server_port=7860)
142
-
143
  if __name__ == "__main__":
144
- iface.launch()
 
13
  import gradio as gr
14
 
15
  from llm_config import SUPPORTED_LLM_MODELS
 
16
 
17
  # Initialize model language options
18
  model_languages = list(SUPPORTED_LLM_MODELS)
19
 
20
+ # Gradio components for selecting model language and model ID
21
+ model_language = gr.Dropdown(
22
+ choices=model_languages,
23
+ value=model_languages[0],
24
+ label="Model Language"
25
+ )
26
+
27
+ # Gradio dropdown for selecting model ID based on language
28
  def update_model_id(model_language_value):
29
  model_ids = list(SUPPORTED_LLM_MODELS[model_language_value])
30
  return model_ids[0], gr.update(choices=model_ids)
31
 
32
+ model_id = gr.Dropdown(
33
+ choices=[], # will be dynamically populated
34
+ label="Model",
35
+ value=None
36
+ )
37
+
38
+ model_language.change(update_model_id, inputs=model_language, outputs=[model_id])
39
+
40
+ # Gradio checkbox for preparing INT4 model
41
+ prepare_int4_model = gr.Checkbox(
42
+ value=True,
43
+ label="Prepare INT4 Model"
44
+ )
45
+
46
+ # Gradio checkbox for enabling AWQ (depends on INT4 checkbox)
47
+ enable_awq = gr.Checkbox(
48
+ value=False,
49
+ label="Enable AWQ",
50
+ visible=False
51
+ )
52
+
53
+ # Gradio dropdown for device selection (replaces device_widget)
54
+ device = gr.Dropdown(
55
+ choices=["CPU", "GPU"],
56
+ value="CPU",
57
+ label="Device"
58
+ )
59
+
60
+ # Model directory and setup based on selections
61
+ def get_model_path(model_language_value, model_id_value):
62
+ model_configuration = SUPPORTED_LLM_MODELS[model_language_value][model_id_value]
63
+ pt_model_id = model_configuration["model_id"]
64
+ pt_model_name = model_id_value.split("-")[0]
65
+ int4_model_dir = Path(model_id_value) / "INT4_compressed_weights"
66
+ return model_configuration, int4_model_dir, pt_model_name
67
+
68
  # Function to download the model if not already present
69
  def download_model_if_needed(model_language_value, model_id_value):
70
  model_configuration, int4_model_dir, pt_model_name = get_model_path(model_language_value, model_id_value)
71
+
72
  int4_weights = int4_model_dir / "openvino_model.bin"
73
 
74
  if not int4_weights.exists():
 
78
  # r = requests.get(model_configuration["model_url"])
79
  # with open(int4_weights, "wb") as f:
80
  # f.write(r.content)
81
+
82
  return int4_model_dir
83
 
84
  # Load the model
85
  def load_model(model_language_value, model_id_value):
86
  int4_model_dir = download_model_if_needed(model_language_value, model_id_value)
87
+
88
+ # Load the OpenVINO model
89
  ov_config = {hints.performance_mode(): hints.PerformanceMode.LATENCY, streams.num(): "1", props.cache_dir(): ""}
90
  core = ov.Core()
91
 
 
95
  tok = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
96
  ov_model = OVModelForCausalLM.from_pretrained(
97
  model_dir,
98
+ device=device.value, # Use Gradio dropdown value for device
99
  ov_config=ov_config,
100
  config=AutoConfig.from_pretrained(model_dir, trust_remote_code=True),
101
  trust_remote_code=True
 
106
  # Gradio interface function for generating text responses
107
  def generate_response(history, temperature, top_p, top_k, repetition_penalty, model_language_value, model_id_value):
108
  tok, ov_model, model_configuration = load_model(model_language_value, model_id_value)
109
+
110
+ # Convert history to tokens
111
+ def convert_history_to_token(history):
112
+ # (Your history conversion logic here)
113
+ # Use model_configuration to determine the exact format
114
+ input_tokens = tok(" ".join([msg[0] for msg in history]), return_tensors="pt").input_ids
115
+ return input_tokens
116
+
117
+ input_ids = convert_history_to_token(history)
118
  streamer = gr.Textbox.update()
119
 
120
+ # Adjust generation kwargs
121
  generate_kwargs = dict(
122
  input_ids=input_ids,
123
  max_new_tokens=256,
 
128
  streamer=streamer
129
  )
130
 
131
+ # Start streaming response
132
  event = Event()
133
+
134
  def generate_and_signal_complete():
135
  ov_model.generate(**generate_kwargs)
136
  event.set()
137
+
138
  t1 = Thread(target=generate_and_signal_complete)
139
  t1.start()
140
+
141
+ # Collect generated text
142
  partial_text = ""
143
  for new_text in streamer:
144
  partial_text += new_text
145
  history[-1][1] = partial_text
146
  yield history
147
 
148
+ # Gradio UI components
149
+ temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.7, label="Temperature")
150
+ top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.9, label="Top P")
151
+ top_k = gr.Slider(minimum=0, maximum=50, value=50, label="Top K")
152
+ repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.1, label="Repetition Penalty")
153
+
154
+ # Conversation history input/output
155
+ history = gr.State([]) # store the conversation history
156
+
157
+ # Gradio Interface
158
+ iface = gr.Interface(
159
+ fn=generate_response,
160
+ inputs=[
161
+ history,
162
+ temperature,
163
+ top_p,
164
+ top_k,
165
+ repetition_penalty,
166
+ model_language,
167
+ model_id
168
+ ],
169
+ outputs=[gr.Textbox(label="Conversation History")],
170
+ live=True,
171
+ title="OpenVINO Chatbot"
172
+ )
173
+
174
+ # Launch Gradio app
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  if __name__ == "__main__":
176
+ iface.launch(debug=True, share=True, server_name="0.0.0.0", server_port=7860)