lightmate committed (verified)
Commit fb42888 · 1 Parent(s): 6fbaf70

Update app.py

Files changed (1):
app.py  +27 -61
app.py CHANGED
@@ -4,81 +4,44 @@ from transformers import AutoTokenizer, AutoConfig
from optimum.intel.openvino import OVModelForCausalLM
import openvino as ov
import gradio as gr
+ from typing import List, Tuple
+ from threading import Event, Thread
from gradio_helper import make_demo
from llm_config import SUPPORTED_LLM_MODELS
- from pathlib import Path

# Define model configuration
- model_language = "English"  # Example: set to English
- model_id = "qwen2.5-0.5b-instruct"  # Example model ID
+ model_language = "English"  # For example, set the model language to English
+ model_id = "qwen2.5-0.5b-instruct"  # For example, select a model ID

- # Define model directories
- pt_model_id = SUPPORTED_LLM_MODELS[model_language][model_id]["model_id"]
- int4_model_dir = Path(model_id) / "INT4_compressed_weights"
+ # Load model configuration
+ model_configuration = SUPPORTED_LLM_MODELS[model_language][model_id]
+ pt_model_id = model_configuration["model_id"]
+ int4_model_dir = os.path.join(model_id, "INT4_compressed_weights")

- # Load tokenizer
+ # Load the OpenVINO model and tokenizer
+ device = "CPU"  # Or GPU if available
+ core = ov.Core()
+ model_name = model_configuration["model_id"]
tok = AutoTokenizer.from_pretrained(int4_model_dir, trust_remote_code=True)

- # Ensure INT4 weights exist; if not, attempt conversion (locally before deployment)
- def check_and_convert_model():
-     if not (int4_model_dir / "openvino_model.xml").exists():
-         print("INT4 model weights not found. Attempting compression...")
-         convert_to_int4()
-
- def convert_to_int4():
-     """
-     Converts a model to INT4 precision using the optimum-cli tool.
-     This function should only be run locally or in an environment that supports shell commands.
-     """
-     # Define compression parameters
-     compression_configs = {
-         "qwen2.5-0.5b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0},
-         "default": {"sym": False, "group_size": 128, "ratio": 0.8},
-     }
-
-     model_compression_params = compression_configs.get(model_id, compression_configs["default"])
-
-     # Check if the INT4 model already exists
-     if (int4_model_dir / "openvino_model.xml").exists():
-         print("INT4 model already exists.")
-         return  # Exit if the model is already converted
-
-     # Run model compression using `optimum-cli`
-     export_command_base = f"optimum-cli export openvino --model {pt_model_id} --task text-generation-with-past --weight-format int4"
-     int4_compression_args = f" --group-size {model_compression_params['group_size']} --ratio {model_compression_params['ratio']}"
-     if model_compression_params["sym"]:
-         int4_compression_args += " --sym"
-
-     # You can add other custom compression arguments here (like AWQ)
-     export_command = export_command_base + int4_compression_args
-     print(f"Running compression command: {export_command}")
-
-     # Execute the export command (this is typically done locally, not in Hugging Face Spaces)
-     # For deployment, the model needs to be pre-compressed and uploaded
-     os.system(export_command)
-
- # Check if the INT4 model exists or needs conversion
- check_and_convert_model()
-
- # Initialize OpenVINO model
- core = ov.Core()
+ # Load the OpenVINO model
ov_model = OVModelForCausalLM.from_pretrained(
-     str(int4_model_dir),
-     device="CPU",  # Adjust device as needed (e.g., "GPU" or "CPU")
-     config=AutoConfig.from_pretrained(str(int4_model_dir), trust_remote_code=True),
+     int4_model_dir,
+     device=device,
+     config=AutoConfig.from_pretrained(int4_model_dir, trust_remote_code=True),
    trust_remote_code=True,
)

- def convert_history_to_token(history):
+ def convert_history_to_token(history: List[Tuple[str, str]]):
    """
-     Convert the history of the conversation into tokens for the model.
+     Converts conversation history to tokens based on model configuration.
    """
-     input_ids = tok.encode(history[-1][0])  # Example tokenization
+     input_ids = tok.encode(history[-1][0])  # Simple example for tokenizing the last user input.
    return torch.LongTensor([input_ids])

def bot(history, temperature, top_p, top_k, repetition_penalty, conversation_id):
    """
-     Bot logic to process conversation history and generate responses.
+     Generates the next part of the conversation.
    """
    input_ids = convert_history_to_token(history)
    streamer = TextIteratorStreamer(tok, timeout=3600.0, skip_prompt=True, skip_special_tokens=True)
@@ -93,16 +56,19 @@ def bot(history, temperature, top_p, top_k, repetition_penalty, conversation_id)
        streamer=streamer,
    )

-     # Generate response
+     # Generation process
    ov_model.generate(**generate_kwargs)

-     # Stream and update history with generated response
+     # Stream and update history
    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        history[-1][1] = partial_text
        yield history

- # Gradio interface setup
- demo = make_demo(run_fn=bot, stop_fn=None, title="OpenVINO Chatbot", language="en")
+ def request_cancel():
+     ov_model.request.cancel()
+
+ # Gradio UI
+ demo = make_demo(run_fn=bot, stop_fn=request_cancel, title="OpenVINO Chatbot", language="en")
demo.launch(debug=True, share=True)
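
With the on-the-fly check_and_convert_model()/convert_to_int4() step removed, app.py now assumes the INT4_compressed_weights folder already exists when the app starts. The sketch below shows how that folder could be produced offline before deployment, reusing the optimum-cli command and the qwen2.5-0.5b-instruct parameters (group size 128, ratio 1.0, --sym) from the removed code; the subprocess call, the existence check, and the Hub model ID are illustrative assumptions, since the real pt_model_id comes from llm_config.

import subprocess
from pathlib import Path

# Hypothetical offline preparation step, mirroring the removed convert_to_int4().
pt_model_id = "Qwen/Qwen2.5-0.5B-Instruct"  # assumption: the Hub ID resolved by SUPPORTED_LLM_MODELS
int4_model_dir = Path("qwen2.5-0.5b-instruct") / "INT4_compressed_weights"

if not (int4_model_dir / "openvino_model.xml").exists():
    # Same optimum-cli invocation the removed code built as a string, run once before deployment.
    subprocess.run(
        [
            "optimum-cli", "export", "openvino",
            "--model", pt_model_id,
            "--task", "text-generation-with-past",
            "--weight-format", "int4",
            "--group-size", "128",
            "--ratio", "1.0",
            "--sym",
            str(int4_model_dir),
        ],
        check=True,
    )

The resulting directory (openvino_model.xml, openvino_model.bin and the tokenizer files) is then committed to the Space alongside app.py, which is what the removed comments already recommended for deployment.
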
 
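The new loading block passes a device string and an AutoConfig but no OpenVINO runtime options. optimum-intel's from_pretrained also accepts an ov_config dictionary of runtime hints; a minimal sketch, assuming the latency-oriented values commonly seen in OpenVINO examples (none of this is set by this commit):

# Assumption: optional runtime hints, not part of this commit.
ov_config = {
    "PERFORMANCE_HINT": "LATENCY",  # favor first-token latency for interactive chat
    "NUM_STREAMS": "1",             # a single inference stream is enough for one user
    "CACHE_DIR": "",                # empty string disables the on-disk compiled-model cache
}

ov_model = OVModelForCausalLM.from_pretrained(
    int4_model_dir,
    device=device,
    ov_config=ov_config,
    config=AutoConfig.from_pretrained(int4_model_dir, trust_remote_code=True),
    trust_remote_code=True,
)
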
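Both the old and the new convert_history_to_token() encode only the last user message (history[-1][0]), so earlier turns never reach the model even though the docstring talks about the whole history. A fuller version could rely on the tokenizer's apply_chat_template; this is a hedged sketch of one way to do it, not what the commit implements:

from typing import List, Tuple

def convert_history_to_token(history: List[Tuple[str, str]]):
    """Build input_ids from the whole conversation, not just the last user turn."""
    messages = []
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg:  # the final turn has no assistant reply yet
            messages.append({"role": "assistant", "content": assistant_msg})
    # apply_chat_template formats the turns with the model's chat template and
    # appends the generation prompt for the next assistant reply.
    return tok.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    )
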
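TextIteratorStreamer is meant to be iterated while generate() runs in a background thread; as written, bot() calls ov_model.generate(**generate_kwargs) synchronously, so the loop over streamer only starts once generation has finished and nothing streams incrementally. The commit imports Thread but never uses it; a minimal sketch of the usual threaded pattern follows. The generate_kwargs shown are a plausible reconstruction from bot()'s parameters (the actual lines between the two hunks are not part of this diff), and max_new_tokens is an assumed value.

from threading import Thread

def bot(history, temperature, top_p, top_k, repetition_penalty, conversation_id):
    input_ids = convert_history_to_token(history)
    streamer = TextIteratorStreamer(tok, timeout=3600.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        max_new_tokens=256,          # assumption: not visible in this diff
        temperature=temperature,
        do_sample=temperature > 0.0,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        streamer=streamer,
    )

    # Run generation in the background so the streamer can be consumed live.
    t = Thread(target=ov_model.generate, kwargs=generate_kwargs)
    t.start()

    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        history[-1][1] = partial_text
        yield history
    t.join()
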
 
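make_demo() is now wired with stop_fn=request_cancel, and request_cancel() cancels the in-flight OpenVINO infer request. Event is imported by this commit but never used; one common companion pattern (purely an assumption here, not something the commit wires up) is a stop flag that also breaks out of the streaming loop:

from threading import Event

stop_event = Event()  # hypothetical flag, not part of this commit

def request_cancel():
    stop_event.set()
    ov_model.request.cancel()  # abort the current OpenVINO infer request

# Inside bot()'s streaming loop the flag would be checked like this:
#     for new_text in streamer:
#         if stop_event.is_set():
#             stop_event.clear()
#             break
#         partial_text += new_text
#         ...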