lightmate committed
Commit 210ec4a
1 Parent(s): fb42888

Update app.py

Files changed (1):
  1. app.py +96 -44
app.py CHANGED
@@ -1,50 +1,82 @@
 import os
 import torch
-from transformers import AutoTokenizer, AutoConfig
-from optimum.intel.openvino import OVModelForCausalLM
-import openvino as ov
 import gradio as gr
+import ipywidgets as widgets
+from pathlib import Path
+from transformers import AutoConfig, AutoTokenizer, TextIteratorStreamer
+from optimum.intel.openvino import OVModelForCausalLM
 from typing import List, Tuple
 from threading import Event, Thread
-from gradio_helper import make_demo
-from llm_config import SUPPORTED_LLM_MODELS
+from gradio_helper import make_demo  # Helper that builds the Gradio demo
+from llm_config import SUPPORTED_LLM_MODELS  # Supported model configurations
+from notebook_utils import device_widget  # Device selection widget
+import openvino as ov
+import openvino.properties as props
+import openvino.properties.hint as hints
+import openvino.properties.streams as streams
+import requests
 
-# Define model configuration
-model_language = "English"  # For example, set the model language to English
-model_id = "qwen2.5-0.5b-instruct"  # For example, select a model ID
-
-# Load model configuration
-model_configuration = SUPPORTED_LLM_MODELS[model_language][model_id]
-pt_model_id = model_configuration["model_id"]
-int4_model_dir = os.path.join(model_id, "INT4_compressed_weights")
-
-# Load the OpenVINO model and tokenizer
-device = "CPU"  # Or GPU if available
-core = ov.Core()
-model_name = model_configuration["model_id"]
-tok = AutoTokenizer.from_pretrained(int4_model_dir, trust_remote_code=True)
-
-# Load the OpenVINO model
-ov_model = OVModelForCausalLM.from_pretrained(
-    int4_model_dir,
-    device=device,
-    config=AutoConfig.from_pretrained(int4_model_dir, trust_remote_code=True),
-    trust_remote_code=True,
-)
+# Export the model to INT4-compressed OpenVINO IR (same flow as in the notebook)
+def convert_to_int4(model_id, model_configuration, enable_awq=False):
+    # Per-model compression parameters; unknown models fall back to the defaults
+    compression_configs = {
+        "qwen2.5-0.5b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0},
+        "default": {"sym": False, "group_size": 128, "ratio": 0.8},
+    }
+    model_compression_params = compression_configs.get(model_id, compression_configs["default"])
+
+    # Skip the export if compressed weights already exist
+    int4_model_dir = Path(model_id) / "INT4_compressed_weights"
+    if (int4_model_dir / "openvino_model.xml").exists():
+        return int4_model_dir
+    remote_code = model_configuration.get("remote_code", False)
+    export_command_base = f"optimum-cli export openvino --model {model_configuration['model_id']} --task text-generation-with-past --weight-format int4"
+    int4_compression_args = f" --group-size {model_compression_params['group_size']} --ratio {model_compression_params['ratio']}"
+    if model_compression_params["sym"]:
+        int4_compression_args += " --sym"
+    if enable_awq:
+        int4_compression_args += " --awq --dataset wikitext2 --num-samples 128"
+    export_command_base += int4_compression_args
+    if remote_code:
+        export_command_base += " --trust-remote-code"
+    export_command = export_command_base + f" {str(int4_model_dir)}"
+
+    # Run the export as a shell command
+    os.system(export_command)
+    return int4_model_dir
+
+# Load the OpenVINO model and tokenizer from the exported directory
+def load_model(model_dir, device):
+    # Latency-oriented inference: one stream, no on-disk model cache
+    ov_config = {hints.performance_mode(): hints.PerformanceMode.LATENCY, streams.num(): "1", props.cache_dir(): ""}
+    tok = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
+
+    ov_model = OVModelForCausalLM.from_pretrained(
+        model_dir,
+        device=device,
+        ov_config=ov_config,
+        config=AutoConfig.from_pretrained(model_dir, trust_remote_code=True),
+        trust_remote_code=True,
+    )
+
+    return ov_model, tok
 
 def convert_history_to_token(history: List[Tuple[str, str]]):
     """
     Converts conversation history to tokens based on model configuration.
     """
     input_ids = tok.encode(history[-1][0])  # Simple example for tokenizing the last user input.
     return torch.LongTensor([input_ids])
 
+# Bot callback that drives the Gradio chat UI
 def bot(history, temperature, top_p, top_k, repetition_penalty, conversation_id):
-    """
-    Generates the next part of the conversation.
-    """
     input_ids = convert_history_to_token(history)
+    if input_ids.shape[1] > 2000:
+        history = [history[-1]]  # Keep only the last turn to bound the prompt size
+        input_ids = convert_history_to_token(history)
+
     streamer = TextIteratorStreamer(tok, timeout=3600.0, skip_prompt=True, skip_special_tokens=True)
+
     generate_kwargs = dict(
         input_ids=input_ids,
         max_new_tokens=256,
@@ -55,20 +87,52 @@ def bot(history, temperature, top_p, top_k, repetition_penalty, conversation_id)
         repetition_penalty=repetition_penalty,
         streamer=streamer,
     )
 
-    # Generation process
-    ov_model.generate(**generate_kwargs)
-
-    # Stream and update history
+    stream_complete = Event()  # Signals when generation has finished
+
+    # Run generation in a separate thread so tokens can be streamed as they arrive
+    def generate_and_signal_complete():
+        ov_model.generate(**generate_kwargs)
+        stream_complete.set()
+
+    t1 = Thread(target=generate_and_signal_complete)
+    t1.start()
+
+    # Accumulate partial text and yield the updated history
     partial_text = ""
     for new_text in streamer:
-        partial_text += new_text
+        partial_text += new_text  # Swap in a model-specific text processor here if one is defined
         history[-1][1] = partial_text
         yield history
 
 def request_cancel():
     ov_model.request.cancel()
 
-# Gradio UI
-demo = make_demo(run_fn=bot, stop_fn=request_cancel, title="OpenVINO Chatbot", language="en")
-demo.launch(debug=True, share=True)
+# Gradio interface setup
+def create_gradio_interface():
+    global ov_model, tok  # Used by bot() and request_cancel()
+
+    # Default to the first supported language and model
+    model_languages = list(SUPPORTED_LLM_MODELS.keys())
+    model_language = widgets.Dropdown(options=model_languages, value=model_languages[0], description="Model Language:")
+    model_id = list(SUPPORTED_LLM_MODELS[model_language.value].keys())[0]
+
+    # Choose the model configuration for the selected language
+    model_configuration = SUPPORTED_LLM_MODELS[model_language.value][model_id]
+
+    # Prepare the model (convert to INT4 compressed weights if needed)
+    int4_model_dir = convert_to_int4(model_id, model_configuration)
+
+    # Load model and tokenizer
+    device = device_widget("CPU")
+    ov_model, tok = load_model(int4_model_dir, device.value)
+
+    # Create the Gradio app
+    demo = make_demo(run_fn=bot, stop_fn=request_cancel, title="OpenVINO Chatbot", language=model_language.value)
+
+    return demo
+
+# Run the Gradio app
+if __name__ == "__main__":
+    app = create_gradio_interface()
+    app.launch(debug=True, share=True)  # share=True exposes a public link
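
The ov_config passed to OVModelForCausalLM in load_model() uses OpenVINO property objects as dictionary keys; these serialize to plain string keys, so the same latency-oriented configuration can also be written literally (an equivalent form, handy where the properties API is unavailable):

ov_config = {
    "PERFORMANCE_HINT": "LATENCY",  # optimize for single-request latency
    "NUM_STREAMS": "1",             # a single inference stream
    "CACHE_DIR": "",                # disable on-disk compiled-model caching
}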
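
convert_history_to_token() above still encodes only the last user turn, so earlier turns never reach the model. A fuller sketch that encodes the whole conversation, assuming the tokenizer ships a chat template (qwen2.5-0.5b-instruct does), could replace it:

def convert_history_to_token(history: List[Tuple[str, str]]):
    # Flatten [(user, assistant), ...] pairs into chat-template messages
    messages = []
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    # add_generation_prompt=True appends the assistant header so the model continues the chat
    return tok.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")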
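
gradio_helper.make_demo is imported but not included in this commit. A minimal stand-in with the same call signature, assuming run_fn is a generator with bot()'s parameter list and stop_fn takes no arguments (the real helper may differ), would look like:

import gradio as gr

def make_demo(run_fn, stop_fn, title, language):
    with gr.Blocks(title=title) as demo:
        gr.Markdown(f"# {title} ({language})")
        chatbot = gr.Chatbot()
        msg = gr.Textbox(label="Message")
        with gr.Row():
            temperature = gr.Slider(0.1, 2.0, value=0.7, label="Temperature")
            top_p = gr.Slider(0.0, 1.0, value=0.9, label="Top-p")
            top_k = gr.Slider(0, 100, value=50, step=1, label="Top-k")
            repetition_penalty = gr.Slider(1.0, 2.0, value=1.1, label="Repetition penalty")
        stop = gr.Button("Stop")
        conversation_id = gr.State("")

        # Append the user turn, then stream the bot reply into the last history slot
        def add_user_message(message, history):
            return "", history + [[message, ""]]

        msg.submit(add_user_message, [msg, chatbot], [msg, chatbot]).then(
            run_fn, [chatbot, temperature, top_p, top_k, repetition_penalty, conversation_id], chatbot
        )
        stop.click(stop_fn, None, None)
    return demo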
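
To sanity-check the exported weights without launching the UI, the load-and-generate path can be exercised directly; this sketch assumes the INT4 export in qwen2.5-0.5b-instruct/INT4_compressed_weights already exists:

from pathlib import Path
from transformers import AutoTokenizer
from optimum.intel.openvino import OVModelForCausalLM

int4_dir = Path("qwen2.5-0.5b-instruct") / "INT4_compressed_weights"
tok = AutoTokenizer.from_pretrained(int4_dir, trust_remote_code=True)
model = OVModelForCausalLM.from_pretrained(int4_dir, device="CPU", trust_remote_code=True)

# Single prompt, default decoding: enough to confirm the INT4 weights load and run
inputs = tok("What is OpenVINO?", return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=32)
print(tok.decode(out[0], skip_special_tokens=True))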