Dongxu Li committed on
Commit
4ecd25d
1 Parent(s): 0314a2f
Files changed (1)
  1. app.py +65 -49
app.py CHANGED
@@ -125,12 +125,20 @@ def inference_caption(
      return output[0]


+ def clear_fn(image_input, chatbot, chat_input, caption_output, state):
+     if image_input is None:
+         return (None, "", "", [])
+     else:
+         return chatbot, chat_input, caption_output, state
+
+
  title = """<h1 align="center">BLIP-2</h1>"""
- description = """Gradio demo for BLIP-2, image-to-text generation from Salesforce Research. To use it, simply upload your image, or click one of the examples to load them.</p>
- <p> <strong>Disclaimer</strong>: This is a research prototype and is not intended for production use. No data including but not restricted to text and images is collected. </p>"""
+ description = """Gradio demo for BLIP-2, image-to-text generation from Salesforce Research. To use it, simply upload your image, or click one of the examples to load them.
+ <br> <strong>Disclaimer</strong>: This is a research prototype and is not intended for production use. No data including but not restricted to text and images is collected."""
  article = """<strong>Paper</strong>: <a href='https://arxiv.org/abs/2301.12597' target='_blank'>BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models</a>
  <br> <strong>Code</strong>: BLIP2 is now integrated into GitHub repo: <a href='https://github.com/salesforce/LAVIS' target='_blank'>LAVIS: a One-stop Library for Language and Vision</a>
  <br> <strong>Project Page</strong>: <a href='https://github.com/salesforce/LAVIS/tree/main/projects/blip2' target='_blank'> BLIP2 on LAVIS</a>
+ <br> <strong>Description</strong>: Captioning results from <strong>BLIP2_OPT_6.7B</strong>. Chat results from <strong>BLIP2_FlanT5xxl</strong>.
  """

  endpoint = Endpoint()
@@ -147,6 +155,7 @@ with gr.Blocks() as iface:
      gr.Markdown(title)
      gr.Markdown(description)
      gr.Markdown(article)
+
      with gr.Row():
          with gr.Column():
              image_input = gr.Image(type="pil")
@@ -189,54 +198,61 @@ with gr.Blocks() as iface:
          with gr.Column():

              with gr.Column():
-                 # with gr.Row():
-                 caption_output = gr.Textbox(lines=1, label="Caption Output (from OPT)")
-                 caption_button = gr.Button(
-                     value="Caption it!", interactive=True, variant="primary"
-                 )
-                 caption_button.click(
-                     inference_caption,
-                     [
-                         image_input,
-                         sampling,
-                         temperature,
-                         len_penalty,
-                         rep_penalty,
-                     ],
-                     [caption_output],
-                 )
-
-             with gr.Column():
-                 chat_input = gr.Textbox(lines=1, label="Chat Input (recommend prompt for QA, Question: {} Answer:)")
-                 with gr.Row():
+                 caption_output = gr.Textbox(lines=1, label="Caption Output")
+                 caption_button = gr.Button(
+                     value="Caption it!", interactive=True, variant="primary"
+                 )
+                 caption_button.click(
+                     inference_caption,
+                     [
+                         image_input,
+                         sampling,
+                         temperature,
+                         len_penalty,
+                         rep_penalty,
+                     ],
+                     [caption_output],
+                 )
+
+             gr.Markdown("""Trying prompting your input for chat; e.g. recommended prompt for QA, \"Question: {} Answer:\"""")
+             with gr.Row():
+                 with gr.Column():
                      chatbot = gr.Chatbot(label="Chat Output (from FlanT5)")
-                     image_input.change(lambda: (None, "", "", []), [], [chatbot, chat_input, caption_output, state])
-
-                 with gr.Row():
-
-                     clear_button = gr.Button(value="Clear", interactive=True)
-                     clear_button.click(
-                         lambda: ("", None, [], []),
-                         [],
-                         [chat_input, image_input, chatbot, state],
-                     )
-
-                     submit_button = gr.Button(
-                         value="Submit", interactive=True, variant="primary"
-                     )
-                     submit_button.click(
-                         inference_chat,
-                         [
-                             image_input,
-                             chat_input,
-                             sampling,
-                             temperature,
-                             len_penalty,
-                             rep_penalty,
-                             state,
-                         ],
-                         [chatbot, state],
-                     )
+
+                 # with gr.Row():
+                 with gr.Column():
+                     chat_input = gr.Textbox(lines=1, label="Chat Input")
+
+                     with gr.Row():
+                         clear_button = gr.Button(value="Clear", interactive=True)
+                         clear_button.click(
+                             lambda: ("", [], []),
+                             [],
+                             [chat_input, chatbot, state],
+                         )
+
+                         submit_button = gr.Button(
+                             value="Submit", interactive=True, variant="primary"
+                         )
+                         submit_button.click(
+                             inference_chat,
+                             [
+                                 image_input,
+                                 chat_input,
+                                 sampling,
+                                 temperature,
+                                 len_penalty,
+                                 rep_penalty,
+                                 state,
+                             ],
+                             [chatbot, state],
+                         )
+
+                     image_input.change(
+                         clear_fn,
+                         [image_input, chatbot, chat_input, caption_output, state],
+                         [chatbot, chat_input, caption_output, state]
+                     )

      examples = gr.Examples(
          examples=examples,
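
The functional core of this commit is the new clear_fn hook: the old inline lambda on image_input.change wiped the chat, caption, and state on every image event, while clear_fn resets them only when the image is removed and passes them through otherwise. Below is a minimal, self-contained sketch of that wiring, assuming the Gradio 3.x Blocks API; the component names mirror the diff, but the flat layout, the demo variable, and the launch() call are illustrative and not part of the commit.

# Minimal sketch of the clear-on-image-change wiring (not the Space's actual app.py).
# Assumes Gradio 3.x; the caption/chat model calls are omitted.
import gradio as gr


def clear_fn(image_input, chatbot, chat_input, caption_output, state):
    # If the image was removed, reset chat history, chat input, caption,
    # and conversation state; otherwise pass the current values through.
    if image_input is None:
        return None, "", "", []
    return chatbot, chat_input, caption_output, state


with gr.Blocks() as demo:  # "demo" is illustrative; the Space uses "iface"
    state = gr.State([])
    image_input = gr.Image(type="pil")
    caption_output = gr.Textbox(lines=1, label="Caption Output")
    chatbot = gr.Chatbot(label="Chat Output")
    chat_input = gr.Textbox(lines=1, label="Chat Input")

    # Fires whenever the image changes; outputs map 1:1 to clear_fn's return values.
    image_input.change(
        clear_fn,
        [image_input, chatbot, chat_input, caption_output, state],
        [chatbot, chat_input, caption_output, state],
    )

demo.launch()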