rasyosef committed on
Commit
bb35b84
1 Parent(s): fd20f2c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -19
app.py CHANGED
@@ -1,18 +1,29 @@
1
  import gradio as gr
2
 
3
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
4
  import torch
 
 
5
 
6
  # The huggingface model id for Microsoft's phi-2 model
7
  checkpoint = "microsoft/phi-2"
8
 
9
  # Download and load model and tokenizer
10
  tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
11
- model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.float32, device_map="cpu", trust_remote_code=True)
12
- model.config.eos_token_id = tokenizer.eos_token_id
 
 
13
 
14
  # Text generation pipeline
15
- phi2 = pipeline("text-generation", tokenizer=tokenizer, model=model)
 
 
 
 
 
 
 
 
16
 
17
  # Function that accepts a prompt and generates text using the phi2 pipeline
18
  def generate(prompt, chat_history, max_new_tokens):
@@ -27,18 +38,25 @@ def generate(prompt, chat_history, max_new_tokens):
27
  final_prompt += "User: " + prompt + "\n"
28
  final_prompt += "Output:"
29
 
30
- generated_text = phi2(final_prompt, max_new_tokens=max_new_tokens)[0]["generated_text"]
31
- response = generated_text[len(final_prompt):].strip()
 
 
 
 
 
 
32
 
33
- if "User:" in response:
34
- response = response.split("User:")[0].strip()
35
 
36
- if "Assistant:" in response:
37
- response = response.split("Assistant:")[1].strip()
38
 
39
- chat_history.append((prompt, response))
 
40
 
41
- return "", chat_history
42
 
43
  # Chat interface with gradio
44
  with gr.Blocks() as demo:
@@ -46,15 +64,20 @@ with gr.Blocks() as demo:
46
  # Phi-2 Chatbot Demo
47
  This chatbot was created using Microsoft's 2.7 billion parameter [phi-2](https://huggingface.co/microsoft/phi-2) Transformer model.
48
 
49
- In order to reduce the response time on this hardware, `max_new_tokens` has been set to `24` in the text generation pipeline. With this default configuration, it takes approximately `60 seconds` for each response to be generated. Use the slider below to increase or decrease the length of the generated text.
50
  """)
51
 
52
- tokens_slider = gr.Slider(8, 128, value=24, label="Maximum new tokens", info="A larger `max_new_tokens` parameter value gives you longer text responses but at the cost of a slower response time.")
53
 
54
  chatbot = gr.Chatbot(label="Phi-2 Chatbot")
55
  msg = gr.Textbox(label="Message", placeholder="Enter text here")
56
-
57
- clear = gr.ClearButton([msg, chatbot])
58
- msg.submit(fn=generate, inputs=[msg, chatbot, tokens_slider], outputs=[msg, chatbot])
59
-
60
- demo.launch()
 
 
 
 
 
 
1
  import gradio as gr
2
 
 
3
  import torch
4
+ from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer, pipeline
5
+ from threading import Thread
6
 
7
  # The huggingface model id for Microsoft's phi-2 model
8
  checkpoint = "microsoft/phi-2"
9
 
10
  # Download and load model and tokenizer
11
  tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
12
+ model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.float32, device_map="auto", trust_remote_code=True)
13
+
14
+ # Streamer
15
+ streamer = TextIteratorStreamer(tokenizer=tokenizer, skip_prompt=True)
16
 
17
  # Text generation pipeline
18
+ phi2 = pipeline(
19
+ "text-generation",
20
+ tokenizer=tokenizer,
21
+ model=model,
22
+ streamer=streamer,
23
+ pad_token_id=tokenizer.eos_token_id,
24
+ eos_token_id=tokenizer.eos_token_id,
25
+ device_map="auto"
26
+ )
27
 
28
  # Function that accepts a prompt and generates text using the phi2 pipeline
29
  def generate(prompt, chat_history, max_new_tokens):
 
38
  final_prompt += "User: " + prompt + "\n"
39
  final_prompt += "Output:"
40
 
41
+ thread = Thread(target=phi2, kwargs={"text_inputs":final_prompt, "max_new_tokens":max_new_tokens})
42
+ thread.start()
43
+
44
+ generated_text = ""
45
+ chat_history.append((prompt, ""))
46
+ for word in streamer:
47
+ generated_text += word
48
+ response = generated_text.strip()
49
 
50
+ if "User:" in response:
51
+ response = response.split("User:")[0].strip()
52
 
53
+ if "Assistant:" in response:
54
+ response = response.split("Assistant:")[1].strip()
55
 
56
+ chat_history.pop()
57
+ chat_history.append((prompt, response))
58
 
59
+ yield "", chat_history
60
 
61
  # Chat interface with gradio
62
  with gr.Blocks() as demo:
 
64
  # Phi-2 Chatbot Demo
65
  This chatbot was created using Microsoft's 2.7 billion parameter [phi-2](https://huggingface.co/microsoft/phi-2) Transformer model.
66
 
67
+ In order to reduce the response time on this hardware, `max_new_tokens` has been set to `32` in the text generation pipeline. With this default configuration, it takes approximately `60 seconds` for each response to be generated. Use the slider below to increase or decrease the length of the generated text.
68
  """)
69
 
70
+ tokens_slider = gr.Slider(8, 128, value=32, label="Maximum new tokens", info="A larger `max_new_tokens` parameter value gives you longer text responses but at the cost of a slower response time.")
71
 
72
  chatbot = gr.Chatbot(label="Phi-2 Chatbot")
73
  msg = gr.Textbox(label="Message", placeholder="Enter text here")
74
+ with gr.Row():
75
+ with gr.Column():
76
+ btn = gr.Button("Submit")
77
+ with gr.Column():
78
+ clear = gr.ClearButton([msg, chatbot])
79
+
80
+ btn.click(fn=generate, inputs=[msg, chatbot, tokens_slider], outputs=[msg, chatbot])
81
+ examples = gr.Examples(examples=["Who is Leonhard Euler?"], inputs=[msg])
82
+
83
+ demo.queue().launch()