arad1367 committed
Commit 3c17b0f
1 Parent(s): c305876

Update app.py
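As rendered in the diff below, the only functional change in this commit is the device selection near the top of app.py: instead of hard-coding "cuda", the script now falls back to CPU when no GPU is visible to PyTorch. A minimal standalone sketch of that pattern (assuming only that torch is installed):

    import torch

    # Prefer the GPU when CUDA is available, otherwise fall back to CPU
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(device)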

Files changed (1)
  1. app.py +276 -276
app.py CHANGED
@@ -1,277 +1,277 @@
 import os
 import time
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig, AutoProcessor
 import gradio as gr
 from threading import Thread
 from PIL import Image
 import subprocess
 import spaces
 
 # Install flash-attn if not already installed
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
 # Model and tokenizer for the chatbot
 MODEL_ID1 = "microsoft/Phi-3.5-mini-instruct"
 MODEL_LIST1 = ["microsoft/Phi-3.5-mini-instruct"]
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 
-device = "cuda" # for GPU usage or "cpu" for CPU usage / But you need GPU :)
+device = "cuda" if torch.cuda.is_available() else "cpu" # for GPU usage or "cpu" for CPU usage / But you need GPU :)
 
 quantization_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_compute_dtype=torch.bfloat16,
     bnb_4bit_use_double_quant=True,
     bnb_4bit_quant_type="nf4")
 
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID1)
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID1,
     torch_dtype=torch.bfloat16,
     device_map="auto",
     quantization_config=quantization_config)
 
 # Chatbot tab function
 @spaces.GPU()
 def stream_chat(
     message: str,
     history: list,
     system_prompt: str,
     temperature: float = 0.8,
     max_new_tokens: int = 1024,
     top_p: float = 1.0,
     top_k: int = 20,
     penalty: float = 1.2,
 ):
     print(f'message: {message}')
     print(f'history: {history}')
 
     conversation = [
         {"role": "system", "content": system_prompt}
     ]
     for prompt, answer in history:
         conversation.extend([
             {"role": "user", "content": prompt},
             {"role": "assistant", "content": answer},
         ])
 
     conversation.append({"role": "user", "content": message})
 
     input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt").to(model.device)
 
     streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
 
     generate_kwargs = dict(
         input_ids=input_ids,
         max_new_tokens = max_new_tokens,
         do_sample = False if temperature == 0 else True,
         top_p = top_p,
         top_k = top_k,
         temperature = temperature,
         eos_token_id=[128001,128008,128009],
         streamer=streamer,
     )
 
     with torch.no_grad():
         thread = Thread(target=model.generate, kwargs=generate_kwargs)
         thread.start()
 
     buffer = ""
     for new_text in streamer:
         buffer += new_text
         yield buffer
 
 # Vision model setup
 models = {
     "microsoft/Phi-3.5-vision-instruct": AutoModelForCausalLM.from_pretrained("microsoft/Phi-3.5-vision-instruct", trust_remote_code=True, torch_dtype="auto", _attn_implementation="flash_attention_2").cuda().eval()
 }
 
 processors = {
     "microsoft/Phi-3.5-vision-instruct": AutoProcessor.from_pretrained("microsoft/Phi-3.5-vision-instruct", trust_remote_code=True)
 }
 
 user_prompt = '\n'
 assistant_prompt = '\n'
 prompt_suffix = "\n"
 
 # Vision model tab function
 @spaces.GPU()
 def stream_vision(image, text_input=None, model_id="microsoft/Phi-3.5-vision-instruct"):
     model = models[model_id]
     processor = processors[model_id]
 
     # Prepare the image list and corresponding tags
     images = [Image.fromarray(image).convert("RGB")]
     placeholder = "<|image_1|>\n" # Using the image tag as per the example
 
     # Construct the prompt with the image tag and the user's text input
     if text_input:
         prompt_content = placeholder + text_input
     else:
         prompt_content = placeholder
 
     messages = [
         {"role": "user", "content": prompt_content},
     ]
 
     # Apply the chat template to the messages
     prompt = processor.tokenizer.apply_chat_template(
         messages,
         tokenize=False,
         add_generation_prompt=True
     )
 
     # Process the inputs with the processor
     inputs = processor(prompt, images, return_tensors="pt").to("cuda:0")
 
     # Generation parameters
     generation_args = {
         "max_new_tokens": 1000,
         "temperature": 0.0,
         "do_sample": False,
     }
 
     # Generate the response
     generate_ids = model.generate(
         **inputs,
         eos_token_id=processor.tokenizer.eos_token_id,
         **generation_args
     )
 
     # Remove input tokens from the generated response
     generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
 
     # Decode the generated output
     response = processor.batch_decode(
         generate_ids,
         skip_special_tokens=True,
         clean_up_tokenization_spaces=False
     )[0]
 
     return response
 
 # CSS for the interface
 CSS = """
 .duplicate-button {
     margin: auto !important;
     color: white !important;
     background: black !important;
     border-radius: 100vh !important;
 }
 h3 {
     text-align: center;
 }
 """
 
 PLACEHOLDER = """
 <center>
 <p>Hi! I'm your assistant. Feel free to ask your questions</p>
 </center>
 """
 
 TITLE = "<h1><center>Phi-3.5 Chatbot & Phi-3.5 Vision</center></h1>"
 
 EXPLANATION = """
 <div style="text-align: center; margin-top: 20px;">
     <p>This app supports both the microsoft/Phi-3.5-mini-instruct model for chat bot and the microsoft/Phi-3.5-vision-instruct model for multimodal model.</p>
     <p>Phi-3.5-vision is a lightweight, state-of-the-art open multimodal model built upon datasets which include - synthetic data and filtered publicly available websites - with a focus on very high-quality, reasoning dense data both on text and vision. The model belongs to the Phi-3 model family, and the multimodal version comes with 128K context length (in tokens) it can support. The model underwent a rigorous enhancement process, incorporating both supervised fine-tuning and direct preference optimization to ensure precise instruction adherence and robust safety measures.</p>
     <p>Phi-3.5-mini is a lightweight, state-of-the-art open model built upon datasets used for Phi-3 - synthetic data and filtered publicly available websites - with a focus on very high-quality, reasoning dense data. The model belongs to the Phi-3 model family and supports 128K token context length. The model underwent a rigorous enhancement process, incorporating both supervised fine-tuning, proximal policy optimization, and direct preference optimization to ensure precise instruction adherence and robust safety measures.</p>
 </div>
 """
 
 footer = """
 <div style="text-align: center; margin-top: 20px;">
     <a href="https://www.linkedin.com/in/pejman-ebrahimi-4a60151a7/" target="_blank">LinkedIn</a> |
     <a href="https://github.com/arad1367" target="_blank">GitHub</a> |
     <a href="https://arad1367.pythonanywhere.com/" target="_blank">Live demo of my PhD defense</a> |
     <a href="https://huggingface.co/microsoft/Phi-3.5-mini-instruct" target="_blank">microsoft/Phi-3.5-mini-instruct</a> |
     <a href="https://huggingface.co/microsoft/Phi-3.5-vision-instruct" target="_blank">microsoft/Phi-3.5-vision-instruct</a>
     <br>
     Made with 💖 by Pejman Ebrahimi
 </div>
 """
 
 # Gradio app with two tabs
 with gr.Blocks(css=CSS, theme="small_and_pretty") as demo:
     gr.HTML(TITLE)
     gr.HTML(EXPLANATION)
     with gr.Tab("Chatbot"):
         chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)
         gr.ChatInterface(
             fn=stream_chat,
             chatbot=chatbot,
             fill_height=True,
             additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
             additional_inputs=[
                 gr.Textbox(
                     value="You are a helpful assistant",
                     label="System Prompt",
                     render=False,
                 ),
                 gr.Slider(
                     minimum=0,
                     maximum=1,
                     step=0.1,
                     value=0.8,
                     label="Temperature",
                     render=False,
                 ),
                 gr.Slider(
                     minimum=128,
                     maximum=8192,
                     step=1,
                     value=1024,
                     label="Max new tokens",
                     render=False,
                 ),
                 gr.Slider(
                     minimum=0.0,
                     maximum=1.0,
                     step=0.1,
                     value=1.0,
                     label="top_p",
                     render=False,
                 ),
                 gr.Slider(
                     minimum=1,
                     maximum=20,
                     step=1,
                     value=20,
                     label="top_k",
                     render=False,
                 ),
                 gr.Slider(
                     minimum=0.0,
                     maximum=2.0,
                     step=0.1,
                     value=1.2,
                     label="Repetition penalty",
                     render=False,
                 ),
             ],
             examples=[
                 ["How to make a self-driving car?"],
                 ["Give me a creative idea to establish a startup"],
                 ["How can I improve my programming skills?"],
                 ["Show me a code snippet of a website's sticky header in CSS and JavaScript."],
             ],
             cache_examples=False,
         )
     with gr.Tab("Vision"):
         with gr.Row():
             input_img = gr.Image(label="Input Picture")
         with gr.Row():
             model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="microsoft/Phi-3.5-vision-instruct")
         with gr.Row():
             text_input = gr.Textbox(label="Question")
         with gr.Row():
             submit_btn = gr.Button(value="Submit")
         with gr.Row():
             output_text = gr.Textbox(label="Output Text")
 
         submit_btn.click(stream_vision, [input_img, text_input, model_selector], [output_text])
 
     gr.HTML(footer)
 
 # Launch the combined app
 demo.launch(debug=True)