lightmate committed
Commit d8164ce (1 parent: e9d91d2)

Update app.py

Files changed (1): app.py (+128, −113)
app.py CHANGED
@@ -1,158 +1,173 @@
  import os
- import torch
- import gradio as gr
  from pathlib import Path
  from transformers import AutoConfig, AutoTokenizer
  from optimum.intel.openvino import OVModelForCausalLM
- from typing import List, Tuple
- from threading import Event, Thread
- from gradio_helper import make_demo  # Your helper function for Gradio demo
- from llm_config import SUPPORTED_LLM_MODELS  # Model configuration
- from notebook_utils import device_widget  # Device selection utility
  import openvino as ov
  import openvino.properties as props
  import openvino.properties.hint as hints
  import openvino.properties.streams as streams
- import requests
-
- # Define the model loading function (same as in your notebook)
- def convert_to_int4(model_id, model_configuration, enable_awq=False):
-     compression_configs = {
-         "qwen2.5-0.5b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0},
-         "default": {"sym": False, "group_size": 128, "ratio": 0.8},
-     }
-     model_compression_params = compression_configs.get(model_id, compression_configs["default"])
-
-     # Example conversion logic
-     int4_model_dir = Path(model_id) / "INT4_compressed_weights"
-     if (int4_model_dir / "openvino_model.xml").exists():
-         return int4_model_dir
-     remote_code = model_configuration.get("remote_code", False)
-     export_command_base = f"optimum-cli export openvino --model {model_configuration['model_id']} --task text-generation-with-past --weight-format int4"
-     int4_compression_args = f" --group-size {model_compression_params['group_size']} --ratio {model_compression_params['ratio']}"
-     if model_compression_params["sym"]:
-         int4_compression_args += " --sym"
-     if enable_awq:
-         int4_compression_args += " --awq --dataset wikitext2 --num-samples 128"
-     export_command_base += int4_compression_args
-     if remote_code:
-         export_command_base += " --trust-remote-code"
-     export_command = export_command_base + f" {str(int4_model_dir)}"
-
-     # Execute export command (shell command)
-     os.system(export_command)
      return int4_model_dir

- # Model and tokenizer loading
- def load_model(model_dir, device):
      ov_config = {hints.performance_mode(): hints.PerformanceMode.LATENCY, streams.num(): "1", props.cache_dir(): ""}
      core = ov.Core()
-     model_name = model_configuration["model_id"]
      tok = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
-
      ov_model = OVModelForCausalLM.from_pretrained(
          model_dir,
-         device=device,
          ov_config=ov_config,
          config=AutoConfig.from_pretrained(model_dir, trust_remote_code=True),
-         trust_remote_code=True,
      )

-     return ov_model, tok
-
- # Gradio Interface for Bot interaction
- def bot(history, temperature, top_p, top_k, repetition_penalty, conversation_id):
-     input_ids = convert_history_to_token(history)
-     if input_ids.shape[1] > 2000:
-         history = [history[-1]]  # Limit input size
-         input_ids = convert_history_to_token(history)
-
-     streamer = TextIteratorStreamer(tok, timeout=3600.0, skip_prompt=True, skip_special_tokens=True)

      generate_kwargs = dict(
          input_ids=input_ids,
          max_new_tokens=256,
          temperature=temperature,
-         do_sample=temperature > 0.0,
          top_p=top_p,
          top_k=top_k,
          repetition_penalty=repetition_penalty,
-         streamer=streamer,
      )

-     # Function to generate response in a separate thread
      def generate_and_signal_complete():
          ov_model.generate(**generate_kwargs)
-         stream_complete.set()

      t1 = Thread(target=generate_and_signal_complete)
      t1.start()

-     # Process partial text and return updated history
      partial_text = ""
      for new_text in streamer:
-         partial_text = text_processor(partial_text, new_text)
          history[-1][1] = partial_text
          yield history

- # Define a Gradio interface for user interaction
- def create_gradio_interface():
-     # Dropdown for selecting model language and model ID
-     model_language = list(SUPPORTED_LLM_MODELS.keys())  # List of model languages
-     model_id = gr.Dropdown(choices=model_language, value=model_language[0], label="Model Language")
-
-     # Once model language is selected, show the respective model IDs
-     def update_model_ids(model_language):
-         model_ids = list(SUPPORTED_LLM_MODELS[model_language].keys())
-         return gr.Dropdown.update(choices=model_ids, value=model_ids[0])
-
-     model_id_selector = gr.Dropdown(choices=model_language, value=model_language[0], label="Model ID")
-
-     # Set up a checkbox for enabling AWQ compression
-     enable_awq = gr.Checkbox(value=False, label="Enable AWQ for Compression")
-
-     # Initialize model selection based on language and ID
-     def load_model_on_select(model_language, model_id, enable_awq):
-         model_configuration = SUPPORTED_LLM_MODELS[model_language][model_id]
-         int4_model_dir = convert_to_int4(model_id, model_configuration, enable_awq)
-
-         # Load the model and tokenizer
-         device = device_widget("CPU")  # or any device you want to use
-         ov_model, tok = load_model(int4_model_dir, device)
-
-         # Return the loaded model and tokenizer
-         return ov_model, tok
-
-     # Create the Gradio chatbot interface
-     chatbot = gr.Chatbot()
-
-     # Parameters for bot generation
-     temperature = gr.Slider(minimum=0, maximum=1, step=0.1, label="Temperature", value=0.7)
-     top_p = gr.Slider(minimum=0, maximum=1, step=0.1, label="Top-p", value=0.9)
-     top_k = gr.Slider(minimum=0, maximum=50, step=1, label="Top-k", value=50)
-     repetition_penalty = gr.Slider(minimum=0, maximum=2, step=0.1, label="Repetition Penalty", value=1.0)
-
-     with gr.Blocks() as demo:
-         # Create the Gradio components and add them to the Blocks context
-         model_id_selector.change(update_model_ids, inputs=model_language, outputs=model_id_selector)
-         load_button = gr.Button("Load Model")
-         load_button.click(load_model_on_select, inputs=[model_language, model_id, enable_awq], outputs=[gr.Textbox(label="Model Status")])
-
-         # Set up the chatbot UI with all the required components
-         gr.Row([model_id_selector, enable_awq])  # Arrange the dropdowns and checkbox in a row
-         gr.Row([load_button])  # Add the button below the inputs
-         gr.Row([chatbot])  # Add the chatbot output
-
-         # Parameters for generation
-         gr.Row([temperature, top_p, top_k, repetition_penalty])  # Add sliders in a row
-
-     # Define bot function and run the interface
-     demo.queue()  # Queue inputs and outputs, handling concurrent generation calls
-     demo.launch(debug=True, share=True)  # For public access
-
-     return demo
-
- # Run the Gradio app
  if __name__ == "__main__":
-     app = create_gradio_interface()
-     app.launch(debug=True, share=True)  # share=True for public access
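
Both revisions hinge on the same streaming pattern: generate() blocks, so it runs on a worker thread while the caller iterates a TextIteratorStreamer. A self-contained sketch with plain transformers (the model name is illustrative and no OpenVINO is involved):

    from threading import Thread
    from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

    tok = AutoTokenizer.from_pretrained("gpt2")  # any causal LM works; gpt2 keeps it small
    model = AutoModelForCausalLM.from_pretrained("gpt2")
    streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)
    inputs = tok("OpenVINO is", return_tensors="pt")

    # Run the blocking generate() on a worker thread; the streamer yields text pieces here
    Thread(target=model.generate, kwargs=dict(**inputs, max_new_tokens=20, streamer=streamer)).start()
    for piece in streamer:
        print(piece, end="", flush=True)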
 
  import os
  from pathlib import Path
+ import requests
+ import shutil
+ import torch
+ from threading import Event, Thread
  from transformers import AutoConfig, AutoTokenizer
+ from transformers import TextIteratorStreamer  # needed by the streaming loop in generate_response
  from optimum.intel.openvino import OVModelForCausalLM
  import openvino as ov
  import openvino.properties as props
  import openvino.properties.hint as hints
  import openvino.properties.streams as streams
+ import gradio as gr
+
+ from llm_config import SUPPORTED_LLM_MODELS
+ from notebook_utils import device_widget
+
+ # Initialize model language options
+ model_languages = list(SUPPORTED_LLM_MODELS)
+
+ # Gradio component for selecting the model language
+ model_language = gr.Dropdown(
+     choices=model_languages,
+     value=model_languages[0],
+     label="Model Language"
+ )
+
+ # Repopulate the model-ID dropdown whenever the language selection changes
+ def update_model_id(model_language_value):
+     model_ids = list(SUPPORTED_LLM_MODELS[model_language_value])
+     return gr.update(choices=model_ids, value=model_ids[0])
+
+ model_id = gr.Dropdown(
+     choices=[],  # populated dynamically by update_model_id
+     label="Model",
+     value=None
+ )
+
+ # NOTE: event wiring like this normally has to happen inside a gr.Blocks() context
+ model_language.change(update_model_id, inputs=model_language, outputs=[model_id])
+
+ # Gradio checkbox for preparing an INT4 model
+ prepare_int4_model = gr.Checkbox(
+     value=True,
+     label="Prepare INT4 Model"
+ )
+
+ # Gradio checkbox for enabling AWQ (only meaningful when INT4 is selected)
+ enable_awq = gr.Checkbox(
+     value=False,
+     label="Enable AWQ",
+     visible=False
+ )
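
The AWQ checkbox starts hidden and nothing in this file toggles it. A minimal sketch of such wiring, assuming it runs inside a gr.Blocks() context (the handler is hypothetical, not part of this commit):

    # Illustrative only: reveal "Enable AWQ" while "Prepare INT4 Model" is ticked
    prepare_int4_model.change(
        lambda prepare: gr.update(visible=prepare),
        inputs=prepare_int4_model,
        outputs=enable_awq,
    )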
+ # Device selection widget (e.g., CPU or GPU)
+ device = device_widget("CPU", exclude=["NPU"])
+
+ # Resolve the configuration and INT4 weight directory for the current selection
+ def get_model_path(model_language_value, model_id_value):
+     model_configuration = SUPPORTED_LLM_MODELS[model_language_value][model_id_value]
+     pt_model_id = model_configuration["model_id"]
+     pt_model_name = model_id_value.split("-")[0]
+     int4_model_dir = Path(model_id_value) / "INT4_compressed_weights"
+     return model_configuration, int4_model_dir, pt_model_name
+
+ # Download the model if it is not already present
+ def download_model_if_needed(model_language_value, model_id_value):
+     model_configuration, int4_model_dir, pt_model_name = get_model_path(model_language_value, model_id_value)
+
+     int4_weights = int4_model_dir / "openvino_model.bin"
+
+     if not int4_weights.exists():
+         print(f"Downloading model {model_id_value}...")
+         # Add your download logic here (e.g., from a URL). Example:
+         # int4_model_dir.mkdir(parents=True, exist_ok=True)
+         # r = requests.get(model_configuration["model_url"])
+         # with open(int4_weights, "wb") as f:
+         #     f.write(r.content)
+
      return int4_model_dir
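
The download stub leaves open where the INT4 weights come from. One option, mirroring the convert_to_int4 helper removed above, is to export them locally with optimum-cli; a minimal sketch, where the model ID and output directory are illustrative and the flags copy the removed helper's qwen2.5-0.5b-instruct defaults (group size 128, ratio 1.0, symmetric):

    import subprocess

    # Export an INT4 OpenVINO IR into the directory download_model_if_needed expects
    subprocess.run(
        [
            "optimum-cli", "export", "openvino",
            "--model", "Qwen/Qwen2.5-0.5B-Instruct",  # illustrative model ID
            "--task", "text-generation-with-past",
            "--weight-format", "int4",
            "--group-size", "128",
            "--ratio", "1.0",
            "--sym",
            "qwen2.5-0.5b-instruct/INT4_compressed_weights",
        ],
        check=True,
    )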
+ # Load the tokenizer and OpenVINO model for the current selection
+ def load_model(model_language_value, model_id_value):
+     int4_model_dir = download_model_if_needed(model_language_value, model_id_value)
+
+     # OpenVINO config: latency-oriented, single inference stream, model cache disabled
      ov_config = {hints.performance_mode(): hints.PerformanceMode.LATENCY, streams.num(): "1", props.cache_dir(): ""}
      core = ov.Core()
+
+     model_dir = int4_model_dir
+     model_configuration = SUPPORTED_LLM_MODELS[model_language_value][model_id_value]
+
      tok = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
      ov_model = OVModelForCausalLM.from_pretrained(
          model_dir,
+         device=device.value,
          ov_config=ov_config,
          config=AutoConfig.from_pretrained(model_dir, trust_remote_code=True),
+         trust_remote_code=True
      )
+
+     return tok, ov_model, model_configuration
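
A usage sketch, assuming "English" and "qwen2.5-0.5b-instruct" are valid keys in SUPPORTED_LLM_MODELS (they are illustrative, not confirmed by this file):

    # Hypothetical keys; substitute entries that exist in llm_config
    tok, ov_model, cfg = load_model("English", "qwen2.5-0.5b-instruct")
    print(type(ov_model).__name__)  # -> OVModelForCausalLM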
+ # Gradio callback: stream a response for the conversation history
+ def generate_response(history, temperature, top_p, top_k, repetition_penalty, model_language_value, model_id_value):
+     tok, ov_model, model_configuration = load_model(model_language_value, model_id_value)
+
+     # Convert history to tokens (placeholder: joins the user turns with spaces;
+     # use model_configuration to apply the model's actual prompt format)
+     def convert_history_to_token(history):
+         input_tokens = tok(" ".join([msg[0] for msg in history]), return_tensors="pt").input_ids
+         return input_tokens
+
+     input_ids = convert_history_to_token(history)
+     streamer = TextIteratorStreamer(tok, timeout=3600.0, skip_prompt=True, skip_special_tokens=True)
+
+     # Generation parameters; greedy decoding when temperature is 0
      generate_kwargs = dict(
          input_ids=input_ids,
          max_new_tokens=256,
          temperature=temperature,
+         do_sample=temperature > 0.0,
          top_p=top_p,
          top_k=top_k,
          repetition_penalty=repetition_penalty,
+         streamer=streamer
      )
+
+     # Run generation on a worker thread and signal when it completes
+     event = Event()

      def generate_and_signal_complete():
          ov_model.generate(**generate_kwargs)
+         event.set()

      t1 = Thread(target=generate_and_signal_complete)
      t1.start()

+     # Drain the streamer, growing the last bot turn as tokens arrive
      partial_text = ""
      for new_text in streamer:
+         partial_text += new_text
          history[-1][1] = partial_text
          yield history
+ # Gradio UI components
+ temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.7, label="Temperature")
+ top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.9, label="Top P")
+ top_k = gr.Slider(minimum=0, maximum=50, value=50, label="Top K")
+ repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.1, label="Repetition Penalty")
+
+ # Conversation history state: a list of [user_text, bot_text] pairs
+ history = gr.State([])
+
+ # Gradio Interface
+ iface = gr.Interface(
+     fn=generate_response,
+     inputs=[
+         history,
+         temperature,
+         top_p,
+         top_k,
+         repetition_penalty,
+         model_language,
+         model_id
+     ],
+     outputs=[gr.Textbox(label="Conversation History")],
+     live=True,
+     title="OpenVINO Chatbot"
+ )
+
+ # Launch the Gradio app
  if __name__ == "__main__":
+     # queue() lets the generator-based callback stream partial results to the client
+     iface.queue().launch(debug=True, share=True, server_name="0.0.0.0", server_port=7860)