helenai committed
Commit 72324f9
1 Parent(s): b1b5ce7

Switch to Mistral model

Files changed (2)
  1. README.md +3 -0
  2. app.py +10 -8
README.md CHANGED
@@ -8,6 +8,9 @@ sdk_version: 3.23.0
 app_file: app.py
 pinned: false
 duplicated_from: joaogante/transformers_streaming
+
+preload_from_hub:
+- helenai/mistralai-Mistral-7B-Instruct-v0.2-ov
 ---
 
 # Environment
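The added `preload_from_hub` entry makes the Space download `helenai/mistralai-Mistral-7B-Instruct-v0.2-ov` at build time, so the OpenVINO model is already in the local Hugging Face cache when `app.py` starts. That repository holds an OpenVINO IR export of the original checkpoint; the export itself is not part of this commit, but with optimum-intel it would look roughly like the sketch below (the output directory name is illustrative):

```python
# Sketch only: converting the original checkpoint to OpenVINO IR for a "-ov" repo.
# Not part of this commit; the output directory name is an assumption.
from optimum.intel.openvino import OVModelForCausalLM
from transformers import AutoTokenizer

original_model_id = "mistralai/Mistral-7B-Instruct-v0.2"

# export=True converts the Transformers weights to OpenVINO IR while loading
model = OVModelForCausalLM.from_pretrained(original_model_id, export=True)
tokenizer = AutoTokenizer.from_pretrained(original_model_id)

model.save_pretrained("mistralai-Mistral-7B-Instruct-v0.2-ov")
tokenizer.save_pretrained("mistralai-Mistral-7B-Instruct-v0.2-ov")
```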
app.py CHANGED
@@ -3,23 +3,24 @@ import subprocess
 from threading import Thread
 
 import gradio as gr
-from optimum.intel.openvino import OVModelForSeq2SeqLM
+from optimum.intel.openvino import OVModelForCausalLM
 from transformers import AutoTokenizer, TextIteratorStreamer
 
 result = subprocess.run(["lscpu"], text=True, capture_output=True)
 pprint.pprint(result.stdout)
 
-# original_model_id = "declare-lab/flan-alpaca-xl"
-original_model_id = "declare-lab/flan-alpaca-large"
-model_id = f"helenai/{original_model_id.replace('/','-')}-ov"
+original_model_id = "mistralai/Mistral-7B-Instruct-v0.2"
+model_id = "helenai/mistralai-Mistral-7B-Instruct-v0.2-ov"
 
-model = OVModelForSeq2SeqLM.from_pretrained(model_id)
+model = OVModelForCausalLM.from_pretrained(model_id)
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 
 
 def run_generation(user_text, top_p, temperature, top_k, max_new_tokens):
-    # Get the model and tokenizer, and tokenize the user text.
-    model_inputs = tokenizer([user_text], return_tensors="pt")
+    # message = [{"role": "user", "content": "You are a helpful assistant"}, {"role": "assistant", "content": "How can I help?"}, {"role":"user", "content":user_text}]
+    message = [{"role": "user", "content": user_text}]
+
+    model_inputs = tokenizer.apply_chat_template(message, return_tensors="pt", return_dict=True)
 
     # Start generation on a separate thread, so that we don't block the UI. The text is pulled from the streamer
     # in the main thread. Adds timeout to the streamer to handle exceptions in the generation thread.
@@ -65,7 +66,6 @@ with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column(scale=4):
             user_text = gr.Textbox(
-                placeholder="Write an email about an alpaca that likes flan",
                 label="User input",
             )
             model_output = gr.Textbox(label="Model output", lines=10, interactive=False)
@@ -117,3 +117,5 @@ with gr.Blocks() as demo:
     )
 
 demo.queue(max_size=32).launch(enable_queue=True, server_name="0.0.0.0")
+# For local use:
+# demo.launch(server_name="0.0.0.0")
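The diff only touches the top of `run_generation`; the streaming part that its comments refer to (running `generate` on a worker thread and pulling text from the streamer) is unchanged and not shown here. For context, that pattern typically looks like the sketch below, reusing the module-level `model` and `tokenizer`; the exact generation kwargs used in the Space may differ:

```python
# Sketch of the full generation path, assuming the standard TextIteratorStreamer pattern.
from threading import Thread
from transformers import TextIteratorStreamer

def run_generation(user_text, top_p, temperature, top_k, max_new_tokens):
    # Format the user text with the Mistral chat template, as in the diff above.
    message = [{"role": "user", "content": user_text}]
    model_inputs = tokenizer.apply_chat_template(message, return_tensors="pt", return_dict=True)

    # Stream decoded tokens as they are produced; the timeout surfaces errors from the worker thread.
    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        **model_inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        top_k=top_k,
        temperature=float(temperature),
    )

    # Run generation on a separate thread so the Gradio UI is not blocked.
    Thread(target=model.generate, kwargs=generate_kwargs).start()

    # Yield partial output so the Gradio Textbox updates as tokens arrive.
    model_output = ""
    for new_text in streamer:
        model_output += new_text
        yield model_output
    return model_output
```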