Spaces:

lightmate
/

llm-chatbot

Running

App Files Community

lightmate committed on Nov 6, 2024

Commit

8f9fe18

verified ·

1 Parent(s): 4acb2ad

Update app.py

Browse files

Files changed (1) hide show

app.py +92 -60

app.py CHANGED Viewed

@@ -13,18 +13,62 @@ import openvino.properties.streams as streams
 import gradio as gr
 from llm_config import SUPPORTED_LLM_MODELS
-from notebook_utils import device_widget
 # Initialize model language options
 model_languages = list(SUPPORTED_LLM_MODELS)
 def update_model_id(model_language_value):
     model_ids = list(SUPPORTED_LLM_MODELS[model_language_value])
     return model_ids[0], gr.update(choices=model_ids)
 # Function to download the model if not already present
 def download_model_if_needed(model_language_value, model_id_value):
     model_configuration, int4_model_dir, pt_model_name = get_model_path(model_language_value, model_id_value)
     int4_weights = int4_model_dir / "openvino_model.bin"
     if not int4_weights.exists():
@@ -34,12 +78,14 @@ def download_model_if_needed(model_language_value, model_id_value):
         # r = requests.get(model_configuration["model_url"])
         # with open(int4_weights, "wb") as f:
         #     f.write(r.content)
     return int4_model_dir
 # Load the model
 def load_model(model_language_value, model_id_value):
     int4_model_dir = download_model_if_needed(model_language_value, model_id_value)
     ov_config = {hints.performance_mode(): hints.PerformanceMode.LATENCY, streams.num(): "1", props.cache_dir(): ""}
     core = ov.Core()
@@ -49,7 +95,7 @@ def load_model(model_language_value, model_id_value):
     tok = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
     ov_model = OVModelForCausalLM.from_pretrained(
         model_dir,
-        device=device.value,
         ov_config=ov_config,
         config=AutoConfig.from_pretrained(model_dir, trust_remote_code=True),
         trust_remote_code=True
@@ -60,9 +106,18 @@ def load_model(model_language_value, model_id_value):
 # Gradio interface function for generating text responses
 def generate_response(history, temperature, top_p, top_k, repetition_penalty, model_language_value, model_id_value):
     tok, ov_model, model_configuration = load_model(model_language_value, model_id_value)
-    input_ids = tok(" ".join([msg[0] for msg in history]), return_tensors="pt").input_ids
     streamer = gr.Textbox.update()
     generate_kwargs = dict(
         input_ids=input_ids,
         max_new_tokens=256,
@@ -73,72 +128,49 @@ def generate_response(history, temperature, top_p, top_k, repetition_penalty, mo
         streamer=streamer
     )
     event = Event()
     def generate_and_signal_complete():
         ov_model.generate(**generate_kwargs)
         event.set()
     t1 = Thread(target=generate_and_signal_complete)
     t1.start()
     partial_text = ""
     for new_text in streamer:
         partial_text += new_text
         history[-1][1] = partial_text
         yield history
-# Gradio UI within a Blocks context
-with gr.Blocks() as iface:
-    model_language = gr.Dropdown(
-        choices=model_languages,
-        value=model_languages[0],
-        label="Model Language"
-    )
-    model_id = gr.Dropdown(
-        choices=[],  # dynamically populated
-        label="Model",
-        value=None
-    )
-    model_language.change(update_model_id, inputs=model_language, outputs=[model_id])
-    prepare_int4_model = gr.Checkbox(
-        value=True,
-        label="Prepare INT4 Model"
-    )
-    enable_awq = gr.Checkbox(
-        value=False,
-        label="Enable AWQ",
-        visible=False
-    )
-    device = device_widget("CPU", exclude=["NPU"])
-    temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.7, label="Temperature")
-    top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.9, label="Top P")
-    top_k = gr.Slider(minimum=0, maximum=50, value=50, label="Top K")
-    repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.1, label="Repetition Penalty")
-    history = gr.State([])
-    iface_interface = gr.Interface(
-        fn=generate_response,
-        inputs=[
-            history,
-            temperature,
-            top_p,
-            top_k,
-            repetition_penalty,
-            model_language,
-            model_id
-        ],
-        outputs=[gr.Textbox(label="Conversation History")],
-        live=True,
-        title="OpenVINO Chatbot"
-    )
-    iface_interface.launch(debug=True, share=True, server_name="0.0.0.0", server_port=7860)
 if __name__ == "__main__":
-    iface.launch()

 import gradio as gr
 from llm_config import SUPPORTED_LLM_MODELS
 # Initialize model language options
 model_languages = list(SUPPORTED_LLM_MODELS)
+# Gradio components for selecting model language and model ID
+model_language = gr.Dropdown(
+    choices=model_languages,
+    value=model_languages[0],
+    label="Model Language"
+)
+# Gradio dropdown for selecting model ID based on language
 def update_model_id(model_language_value):
     model_ids = list(SUPPORTED_LLM_MODELS[model_language_value])
     return model_ids[0], gr.update(choices=model_ids)
+model_id = gr.Dropdown(
+    choices=[],  # will be dynamically populated
+    label="Model",
+    value=None
+)
+model_language.change(update_model_id, inputs=model_language, outputs=[model_id])
+# Gradio checkbox for preparing INT4 model
+prepare_int4_model = gr.Checkbox(
+    value=True,
+    label="Prepare INT4 Model"
+)
+# Gradio checkbox for enabling AWQ (depends on INT4 checkbox)
+enable_awq = gr.Checkbox(
+    value=False,
+    label="Enable AWQ",
+    visible=False
+)
+# Gradio dropdown for device selection (replaces device_widget)
+device = gr.Dropdown(
+    choices=["CPU", "GPU"],
+    value="CPU",
+    label="Device"
+)
+# Model directory and setup based on selections
+def get_model_path(model_language_value, model_id_value):
+    model_configuration = SUPPORTED_LLM_MODELS[model_language_value][model_id_value]
+    pt_model_id = model_configuration["model_id"]
+    pt_model_name = model_id_value.split("-")[0]
+    int4_model_dir = Path(model_id_value) / "INT4_compressed_weights"
+    return model_configuration, int4_model_dir, pt_model_name
 # Function to download the model if not already present
 def download_model_if_needed(model_language_value, model_id_value):
     model_configuration, int4_model_dir, pt_model_name = get_model_path(model_language_value, model_id_value)
     int4_weights = int4_model_dir / "openvino_model.bin"
     if not int4_weights.exists():
         # r = requests.get(model_configuration["model_url"])
         # with open(int4_weights, "wb") as f:
         #     f.write(r.content)
     return int4_model_dir
 # Load the model
 def load_model(model_language_value, model_id_value):
     int4_model_dir = download_model_if_needed(model_language_value, model_id_value)
+    # Load the OpenVINO model
     ov_config = {hints.performance_mode(): hints.PerformanceMode.LATENCY, streams.num(): "1", props.cache_dir(): ""}
     core = ov.Core()
     tok = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
     ov_model = OVModelForCausalLM.from_pretrained(
         model_dir,
+        device=device.value,  # Use Gradio dropdown value for device
         ov_config=ov_config,
         config=AutoConfig.from_pretrained(model_dir, trust_remote_code=True),
         trust_remote_code=True
 # Gradio interface function for generating text responses
 def generate_response(history, temperature, top_p, top_k, repetition_penalty, model_language_value, model_id_value):
     tok, ov_model, model_configuration = load_model(model_language_value, model_id_value)
+    # Convert history to tokens
+    def convert_history_to_token(history):
+        # (Your history conversion logic here)
+        # Use model_configuration to determine the exact format
+        input_tokens = tok(" ".join([msg[0] for msg in history]), return_tensors="pt").input_ids
+        return input_tokens
+    input_ids = convert_history_to_token(history)
     streamer = gr.Textbox.update()
+    # Adjust generation kwargs
     generate_kwargs = dict(
         input_ids=input_ids,
         max_new_tokens=256,
         streamer=streamer
     )
+    # Start streaming response
     event = Event()
     def generate_and_signal_complete():
         ov_model.generate(**generate_kwargs)
         event.set()
     t1 = Thread(target=generate_and_signal_complete)
     t1.start()
+    # Collect generated text
     partial_text = ""
     for new_text in streamer:
         partial_text += new_text
         history[-1][1] = partial_text
         yield history
+# Gradio UI components
+temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.7, label="Temperature")
+top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.9, label="Top P")
+top_k = gr.Slider(minimum=0, maximum=50, value=50, label="Top K")
+repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.1, label="Repetition Penalty")
+# Conversation history input/output
+history = gr.State([])  # store the conversation history
+# Gradio Interface
+iface = gr.Interface(
+    fn=generate_response,
+    inputs=[
+        history,
+        temperature,
+        top_p,
+        top_k,
+        repetition_penalty,
+        model_language,
+        model_id
+    ],
+    outputs=[gr.Textbox(label="Conversation History")],
+    live=True,
+    title="OpenVINO Chatbot"
+)
+# Launch Gradio app
 if __name__ == "__main__":
+    iface.launch(debug=True, share=True, server_name="0.0.0.0", server_port=7860)