Spaces:

lightmate
/

llm-chatbot

Running

App Files Files Community

lightmate committed on Nov 6, 2024

Commit

210ec4a

•

1 Parent(s): fb42888

Update app.py

Browse files

Files changed (1) hide show

app.py +96 -44

app.py CHANGED Viewed

@@ -1,50 +1,78 @@
 import os
 import torch
-from transformers import AutoTokenizer, AutoConfig
-from optimum.intel.openvino import OVModelForCausalLM
-import openvino as ov
 import gradio as gr
 from typing import List, Tuple
 from threading import Event, Thread
-from gradio_helper import make_demo
-from llm_config import SUPPORTED_LLM_MODELS
-# Define model configuration
-model_language = "English"  # For example, set the model language to English
-model_id = "qwen2.5-0.5b-instruct"  # For example, select a model ID
-# Load model configuration
-model_configuration = SUPPORTED_LLM_MODELS[model_language][model_id]
-pt_model_id = model_configuration["model_id"]
-int4_model_dir = os.path.join(model_id, "INT4_compressed_weights")
-# Load the OpenVINO model and tokenizer
-device = "CPU"  # Or GPU if available
-core = ov.Core()
-model_name = model_configuration["model_id"]
-tok = AutoTokenizer.from_pretrained(int4_model_dir, trust_remote_code=True)
-# Load the OpenVINO model
-ov_model = OVModelForCausalLM.from_pretrained(
-    int4_model_dir,
-    device=device,
-    config=AutoConfig.from_pretrained(int4_model_dir, trust_remote_code=True),
-    trust_remote_code=True,
-)
-def convert_history_to_token(history: List[Tuple[str, str]]):
-    """
-    Converts conversation history to tokens based on model configuration.
-    """
-    input_ids = tok.encode(history[-1][0])  # Simple example for tokenizing the last user input.
-    return torch.LongTensor([input_ids])
 def bot(history, temperature, top_p, top_k, repetition_penalty, conversation_id):
-    """
-    Generates the next part of the conversation.
-    """
     input_ids = convert_history_to_token(history)
     streamer = TextIteratorStreamer(tok, timeout=3600.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
         input_ids=input_ids,
         max_new_tokens=256,
@@ -55,20 +83,44 @@ def bot(history, temperature, top_p, top_k, repetition_penalty, conversation_id)
         repetition_penalty=repetition_penalty,
         streamer=streamer,
     )
-    # Generation process
-    ov_model.generate(**generate_kwargs)
-    # Stream and update history
     partial_text = ""
     for new_text in streamer:
-        partial_text += new_text
         history[-1][1] = partial_text
         yield history
-def request_cancel():
-    ov_model.request.cancel()
-# Gradio UI
-demo = make_demo(run_fn=bot, stop_fn=request_cancel, title="OpenVINO Chatbot", language="en")
-demo.launch(debug=True, share=True)

 import os
 import torch
 import gradio as gr
+import ipywidgets as widgets
+from pathlib import Path
+from transformers import AutoConfig, AutoTokenizer
+from optimum.intel.openvino import OVModelForCausalLM
 from typing import List, Tuple
 from threading import Event, Thread
+from gradio_helper import make_demo  # Your helper function for Gradio demo
+from llm_config import SUPPORTED_LLM_MODELS  # Model configuration
+from notebook_utils import device_widget  # Device selection utility
+import openvino as ov
+import openvino.properties as props
+import openvino.properties.hint as hints
+import openvino.properties.streams as streams
+import requests
+# Define the model loading function (same as in your notebook)
def convert_to_int4(model_id, model_configuration, enable_awq=False):
    """Export a model to OpenVINO format with INT4-compressed weights.

    The export is skipped when
    ``<model_id>/INT4_compressed_weights/openvino_model.xml`` already exists.

    Args:
        model_id: Short model name. Selects the compression preset and names
            the output directory.
        model_configuration: Mapping holding the source ``"model_id"`` given
            to ``optimum-cli --model`` and an optional ``"remote_code"`` flag.
        enable_awq: When true, append
            ``--awq --dataset wikitext2 --num-samples 128`` to the export.

    Returns:
        ``Path`` of the directory that holds the INT4 model.

    Raises:
        subprocess.CalledProcessError: ``optimum-cli`` exited with a non-zero
            status.
        FileNotFoundError: ``optimum-cli`` is not on ``PATH``.
    """
    # Imported here so the function does not depend on the file's import block.
    import subprocess

    compression_configs = {
        "qwen2.5-0.5b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0},
        "default": {"sym": False, "group_size": 128, "ratio": 0.8},
    }
    params = compression_configs.get(model_id, compression_configs["default"])

    int4_model_dir = Path(model_id) / "INT4_compressed_weights"
    if (int4_model_dir / "openvino_model.xml").exists():
        return int4_model_dir

    # Build an argv list rather than a shell string: no shell is involved, so
    # model names or paths containing spaces or metacharacters are passed
    # through verbatim instead of being re-parsed.
    export_command = [
        "optimum-cli", "export", "openvino",
        "--model", model_configuration["model_id"],
        "--task", "text-generation-with-past",
        "--weight-format", "int4",
        "--group-size", str(params["group_size"]),
        "--ratio", str(params["ratio"]),
    ]
    if params["sym"]:
        export_command.append("--sym")
    if enable_awq:
        export_command += ["--awq", "--dataset", "wikitext2", "--num-samples", "128"]
    if model_configuration.get("remote_code", False):
        export_command.append("--trust-remote-code")
    export_command.append(str(int4_model_dir))

    # check=True surfaces a failed export here instead of returning a
    # directory that does not contain a model.
    subprocess.run(export_command, check=True)
    return int4_model_dir
+# Model and tokenizer loading
def load_model(model_dir, device):
    """Load the OpenVINO causal language model and its tokenizer.

    Args:
        model_dir: Directory handed to the tokenizer, config and model
            ``from_pretrained`` calls.
        device: Device argument forwarded to
            ``OVModelForCausalLM.from_pretrained``.

    Returns:
        A ``(ov_model, tok)`` tuple: the loaded model and its tokenizer.
    """
    # Latency-oriented single-stream configuration; the empty cache_dir value
    # is kept exactly as the file had it.
    ov_config = {
        hints.performance_mode(): hints.PerformanceMode.LATENCY,
        streams.num(): "1",
        props.cache_dir(): "",
    }
    tok = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
    ov_model = OVModelForCausalLM.from_pretrained(
        model_dir,
        device=device,
        ov_config=ov_config,
        config=AutoConfig.from_pretrained(model_dir, trust_remote_code=True),
        trust_remote_code=True,
    )
    return ov_model, tok
+# Define the bot function that interacts with Gradio UI
 def bot(history, temperature, top_p, top_k, repetition_penalty, conversation_id):
     input_ids = convert_history_to_token(history)
+    if input_ids.shape[1] > 2000:
+        history = [history[-1]]  # Limit input size
+        input_ids = convert_history_to_token(history)
     streamer = TextIteratorStreamer(tok, timeout=3600.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
         input_ids=input_ids,
         max_new_tokens=256,
         repetition_penalty=repetition_penalty,
         streamer=streamer,
     )
+    # Function to generate response in a separate thread
+    def generate_and_signal_complete():
+        ov_model.generate(**generate_kwargs)
+        stream_complete.set()
+    t1 = Thread(target=generate_and_signal_complete)
+    t1.start()
+    # Process partial text and return updated history
     partial_text = ""
     for new_text in streamer:
+        partial_text = text_processor(partial_text, new_text)
         history[-1][1] = partial_text
         yield history
+# Gradio interface setup
def create_gradio_interface():
    """Prepare the default model and build the Gradio chat demo.

    Uses the first language listed in ``SUPPORTED_LLM_MODELS`` and the first
    model listed for that language, converts the model to INT4 if needed,
    loads it, and wires ``bot`` into the demo returned by ``make_demo``.

    Returns:
        The demo object produced by ``make_demo``.

    Side effects:
        Binds the module-level names ``ov_model`` and ``tok``.
    """
    # bot() reads ``ov_model`` and ``tok`` as module-level names, so they must
    # be bound globally rather than as locals of this function.
    global ov_model, tok

    # A dict keys view cannot be indexed; materialise it before taking [0].
    languages = list(SUPPORTED_LLM_MODELS)
    model_language = languages[0]

    # Model ids are the keys of the per-language mapping, not language names.
    models_for_language = SUPPORTED_LLM_MODELS[model_language]
    model_id = next(iter(models_for_language))
    model_configuration = models_for_language[model_id]

    # Prepare model (convert to INT4 unless already exported).
    int4_model_dir = convert_to_int4(model_id, model_configuration)

    # NOTE(review): device_widget presumably returns a selection widget whose
    # ``.value`` is the device name - confirm against notebook_utils. A plain
    # string is passed through unchanged.
    device = device_widget("CPU")
    device_name = getattr(device, "value", device)
    ov_model, tok = load_model(int4_model_dir, device_name)

    def request_cancel():
        # Stop callback for the demo: cancels the model's current request.
        ov_model.request.cancel()

    demo = make_demo(
        run_fn=bot,
        stop_fn=request_cancel,
        title="OpenVINO Chatbot",
        language=model_language,
    )
    return demo
+# Run the Gradio app
if __name__ == "__main__":
    # Build the demo only when executed as a script, then serve it.
    # share=True requests a public link; debug=True keeps debug mode on.
    app = create_gradio_interface()
    app.launch(share=True, debug=True)