Keyven committed on
Commit
0ddcdf3
·
1 Parent(s): e70cad0

updating UI

Browse files
Files changed (1) hide show
  1. app.py +21 -30
app.py CHANGED
@@ -5,12 +5,6 @@ import re
5
  import copy
6
  import secrets
7
  from pathlib import Path
8
- import os
9
- os.system("pip install git+https://github.com/openai/whisper.git")
10
- import whisper
11
-
12
-
13
- model_whisper = whisper.load_model("small")
14
 
15
  # Constants
16
  BOX_TAG_PATTERN = r"<box>([\s\S]*?)</box>"
@@ -52,15 +46,6 @@ def format_text(text):
52
  text = "".join(lines)
53
  return text
54
 
55
- def transcribe_audio(audio):
56
- audio = whisper.load_audio(audio)
57
- audio = whisper.pad_or_trim(audio)
58
- mel = whisper.log_mel_spectrogram(audio).to(model_whisper.device)
59
- _, probs = model_whisper.detect_language(mel)
60
- options = whisper.DecodingOptions(fp16 = False)
61
- result = whisper.decode(model_whisper, mel, options)
62
- return result.text
63
-
64
 
65
  def get_chat_response(chatbot, task_history):
66
  global model, tokenizer
@@ -148,28 +133,36 @@ def handle_regeneration(chatbot, task_history):
148
 
149
 
150
  with gr.Blocks(theme='gradio/soft') as demo:
151
- audio = gr.Audio(
152
- label="Input Audio",
153
- show_label=False,
154
- source="microphone",
155
- type="filepath"
156
- )
157
  gr.Markdown("# Qwen-VL Multimodal-Vision-Insight")
158
  gr.Markdown(
159
  "## Developed by Keyvan Hardani (Keyvven on [Twitter](https://twitter.com/Keyvven))\n"
160
  "Special thanks to [@Artificialguybr](https://twitter.com/artificialguybr) for the inspiration from his code.\n"
161
  "### Qwen-VL: A Multimodal Large Vision Language Model by Alibaba Cloud\n"
162
  )
163
- chatbot = gr.Chatbot(label='Qwen-VL-Chat', elem_classes="control-height", height=520)
164
- query = gr.Textbox(lines=2, label='Input')
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  task_history = gr.State([])
166
 
 
167
  with gr.Row():
168
- upload_btn = gr.UploadButton("πŸ“ Upload", file_types=["image"], elem_classes="control-width")
169
- submit_btn = gr.Button("πŸš€ Submit", elem_classes="control-width", variant="primary")
170
- regen_btn = gr.Button("πŸ€”οΈ Regenerate", elem_classes="control-width")
171
- clear_btn = gr.Button("🧹 Clear History", elem_classes="control-width", variant="secondary")
172
-
173
  gr.Markdown("### Key Features:\n- **Strong Performance**: Surpasses existing LVLMs on multiple English benchmarks including Zero-shot Captioning and VQA.\n- **Multi-lingual Support**: Supports English, Chinese, and multi-lingual conversation.\n- **High Resolution**: Utilizes 448*448 resolution for fine-grained recognition and understanding.")
174
  submit_btn.click(handle_text_input, [chatbot, task_history, query], [chatbot, task_history]).then(
175
  get_chat_response, [chatbot, task_history], [chatbot], show_progress=True
@@ -179,8 +172,6 @@ with gr.Blocks(theme='gradio/soft') as demo:
179
  clear_btn.click(clear_history, [task_history], [chatbot], show_progress=True)
180
  regen_btn.click(handle_regeneration, [chatbot, task_history], [chatbot], show_progress=True)
181
  upload_btn.upload(handle_file_upload, [chatbot, task_history, upload_btn], [chatbot, task_history], show_progress=True)
182
- audio.change(transcribe_audio, inputs=[audio], outputs=[query])
183
-
184
 
185
 
186
  demo.launch()
 
5
  import copy
6
  import secrets
7
  from pathlib import Path
 
 
 
 
 
 
8
 
9
  # Constants
10
  BOX_TAG_PATTERN = r"<box>([\s\S]*?)</box>"
 
46
  text = "".join(lines)
47
  return text
48
 
 
 
 
 
 
 
 
 
 
49
 
50
  def get_chat_response(chatbot, task_history):
51
  global model, tokenizer
 
133
 
134
 
135
  with gr.Blocks(theme='gradio/soft') as demo:
 
 
 
 
 
 
136
  gr.Markdown("# Qwen-VL Multimodal-Vision-Insight")
137
  gr.Markdown(
138
  "## Developed by Keyvan Hardani (Keyvven on [Twitter](https://twitter.com/Keyvven))\n"
139
  "Special thanks to [@Artificialguybr](https://twitter.com/artificialguybr) for the inspiration from his code.\n"
140
  "### Qwen-VL: A Multimodal Large Vision Language Model by Alibaba Cloud\n"
141
  )
142
+ chatbot = gr.Chatbot([("Hello", "Hi"), ("Describe the image", "I can describe images. Please upload one.")], label='Qwen-VL-Chat', elem_classes="control-height", height=520)
143
+
144
+ gr.Markdown(
145
+ "### Chat with Qwen-VL\n"
146
+ "You can ask questions or make statements in the chat input below. "
147
+ "You can also upload an image and ask questions about it like "
148
+ "'Describe this image', 'What can you see in this image?', or "
149
+ "'Explain what's happening in this image'."
150
+ )
151
+ query = gr.Textbox(
152
+ lines=2,
153
+ label='Chat Input',
154
+ placeholder='Type your question or statement here, or upload an image and ask about it...',
155
+ hint='E.g., "Describe this image" or "What is the capital of France?"'
156
+ )
157
  task_history = gr.State([])
158
 
159
+
160
  with gr.Row():
161
+ upload_btn = gr.File("πŸ–ΌοΈ Upload", file_types=["image"], elem_classes="control-width", label='Upload File')
162
+ submit_btn = gr.Button("πŸš€ Submit", elem_classes="control-width", variant="primary")
163
+ regen_btn = gr.Button("πŸ”„ Regenerate", elem_classes="control-width")
164
+ clear_btn = gr.Button("🧹 Clear History", elem_classes="control-width", variant="secondary")
165
+
166
  gr.Markdown("### Key Features:\n- **Strong Performance**: Surpasses existing LVLMs on multiple English benchmarks including Zero-shot Captioning and VQA.\n- **Multi-lingual Support**: Supports English, Chinese, and multi-lingual conversation.\n- **High Resolution**: Utilizes 448*448 resolution for fine-grained recognition and understanding.")
167
  submit_btn.click(handle_text_input, [chatbot, task_history, query], [chatbot, task_history]).then(
168
  get_chat_response, [chatbot, task_history], [chatbot], show_progress=True
 
172
  clear_btn.click(clear_history, [task_history], [chatbot], show_progress=True)
173
  regen_btn.click(handle_regeneration, [chatbot, task_history], [chatbot], show_progress=True)
174
  upload_btn.upload(handle_file_upload, [chatbot, task_history, upload_btn], [chatbot, task_history], show_progress=True)
 
 
175
 
176
 
177
  demo.launch()