Spaces:
Runtime error
Runtime error
Add application file
Browse files- README.md +28 -4
- app.py +150 -0
- requirements.txt +4 -0
README.md
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
---
|
2 |
title: Multimodal Vision Insight
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: gradio
|
7 |
sdk_version: 3.45.2
|
8 |
app_file: app.py
|
@@ -10,4 +10,28 @@ pinned: false
|
|
10 |
license: apache-2.0
|
11 |
---
|
12 |
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
title: Multimodal Vision Insight
|
3 |
+
emoji: π
|
4 |
+
colorFrom: blue
|
5 |
+
colorTo: purple
|
6 |
sdk: gradio
|
7 |
sdk_version: 3.45.2
|
8 |
app_file: app.py
|
|
|
10 |
license: apache-2.0
|
11 |
---
|
12 |
|
13 |
+
Explore the world of multimodal interactions with the Multimodal Vision Insight (MVI) application. With the power of Vision Language Models (VLMs), MVI provides an interface for users to interact with text and images seamlessly. Built on top of Gradio, this application serves as a bridge between human inputs and machine understanding, fostering a cooperative environment for solving real-world tasks.
|
14 |
+
|
15 |
+
[Check out the configuration reference for more details on configuring your space.](https://huggingface.co/docs/hub/spaces-config-reference)
|
16 |
+
|
17 |
+
## Features:
|
18 |
+
- **Multimodal Interaction**: Engage in a conversation with the model using both text and images.
|
19 |
+
- **Real-time Feedback**: Receive instant responses from the model to navigate through tasks efficiently.
|
20 |
+
- **High-Resolution Image Understanding**: Utilize high-resolution images for fine-grained recognition and understanding, enhancing the quality of interaction.
|
21 |
+
- **User-Friendly Interface**: With a clean and intuitive UI, exploring multimodal interactions has never been easier.
|
22 |
+
|
23 |
+
## Usage:
|
24 |
+
1. Input your text or upload an image to start the conversation.
|
25 |
+
2. Use the available controls to navigate through the conversation, regenerate responses, or clear the history.
|
26 |
+
3. Explore the potential of Vision Language Models in understanding and interacting with multimodal data.
|
27 |
+
|
28 |
+
## Developers:
|
29 |
+
Developed by Keyvan Hardani (Keyvven on [Twitter](https://twitter.com/Keyvven) and on [Hugging Face](https://huggingface.co/Keyvven)).
|
30 |
+
Special thanks to [@Artificialguybr](https://twitter.com/artificialguybr) for the inspiration from his code.
|
31 |
+
|
32 |
+
## Acknowledgments:
|
33 |
+
This project is powered by Alibaba Cloud's Qwen-VL, a state-of-the-art multimodal large vision language model.
|
34 |
+
|
35 |
+
Feel free to explore, contribute, and raise issues on the project repository (repository link to be added).
|
36 |
+
|
37 |
+
|
app.py
ADDED
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
3 |
+
from PIL import Image
|
4 |
+
import re
|
5 |
+
import copy
|
6 |
+
import secrets
|
7 |
+
from pathlib import Path
|
8 |
+
|
# Hub identifier shared by the tokenizer and model loaders.
MODEL_ID = "Qwen/Qwen-VL-Chat-Int4"

# Regex capturing the payload of <box>...</box> spans in model output.
BOX_TAG_PATTERN = r"<box>([\s\S]*?)</box>"
# ASCII punctuation used to trim a single trailing mark from user queries.
PUNCTUATION = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"

# Load the Qwen-VL chat tokenizer and quantized model once at import time;
# device_map="auto" places the weights on the available accelerator(s).
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, device_map="auto", trust_remote_code=True
).eval()
def format_text(text):
    """Convert model output into HTML for the Gradio chat transcript.

    Splits *text* into non-empty lines, turns fenced ``` blocks into
    <pre><code class="language-..."> sections, and — inside code fences —
    HTML-escapes characters that Markdown/HTML would otherwise interpret,
    so code renders verbatim. Lines after the first are joined with <br>.

    Args:
        text: Raw response text, possibly containing ``` fences.

    Returns:
        A single HTML string.
    """
    lines = [line for line in text.split("\n") if line != ""]
    fence_count = 0  # parity tells us whether we are inside a code block
    for i, line in enumerate(lines):
        if "```" in line:
            fence_count += 1
            items = line.split("`")
            if fence_count % 2 == 1:
                # Opening fence: the language tag follows the backticks.
                lines[i] = f'<pre><code class="language-{items[-1]}">'
            else:
                lines[i] = "<br></code></pre>"
        elif i > 0:
            if fence_count % 2 == 1:
                # Inside a code block: escape markup-sensitive characters.
                # NOTE(review): these replacements were garbled into no-ops
                # in the source copy (e.g. replace("<", "<")); restored to
                # standard HTML entities — confirm against the original app.
                line = line.replace("`", r"\`")
                line = line.replace("<", "&lt;")
                line = line.replace(">", "&gt;")
                line = line.replace(" ", "&nbsp;")
                line = line.replace("*", "&ast;")
                line = line.replace("_", "&lowbar;")
                line = line.replace("-", "&#45;")
                line = line.replace(".", "&#46;")
                line = line.replace("!", "&#33;")
                line = line.replace("(", "&#40;")
                line = line.replace(")", "&#41;")
                line = line.replace("$", "&#36;")
            lines[i] = "<br>" + line
    return "".join(lines)
def get_chat_response(chatbot, task_history):
    """Run the model on the latest user turn and update the transcript.

    Builds a Qwen-VL style history where image turns are folded into the
    following text turn as 'Picture N: <img>path</img>' markers, asks the
    model for a reply, and writes the formatted reply back into the
    visible chatbot.

    Args:
        chatbot: Gradio chatbot pairs [(display_query, display_reply), ...].
        task_history: Raw pairs; image turns are tuples/lists of file paths.

    Returns:
        The updated ``chatbot`` list with the last reply filled in.
    """
    chat_query = chatbot[-1][0]
    query = task_history[-1][0]
    # Work on a copy so assembling the model-facing history cannot disturb
    # the state the UI keeps between events.
    history_cp = copy.deepcopy(task_history)

    history_filter = []
    pic_idx = 1
    pre = ""
    for q, a in history_cp:
        if isinstance(q, (tuple, list)):
            # Image turn: accumulate it as an <img> tag for the next text turn.
            q = f'Picture {pic_idx}: <img>{q[0]}</img>'
            pre += q + '\n'
            pic_idx += 1
        else:
            pre += q
            history_filter.append((pre, a))
            pre = ""
    history, message = history_filter[:-1], history_filter[-1][0]
    response, history = model.chat(tokenizer, message, history=history)

    # Fix: the original computed `response` but never returned anything, so
    # the Gradio callback bound to outputs=[chatbot] would blank the chat.
    # Record the reply in both histories and return the updated chatbot.
    chatbot[-1] = (chat_query, format_text(response))
    task_history[-1] = (query, response)
    return chatbot
def handle_text_input(history, task_history, text):
    """Append a user text message to both chat histories.

    A single trailing punctuation mark is stripped from the copy sent to
    the model, while the display copy keeps the text verbatim. Returns
    the new histories plus an empty string used to clear the input box.
    """
    task_text = text
    ends_with_single_punct = (
        len(text) >= 2
        and text[-1] in PUNCTUATION
        and text[-2] not in PUNCTUATION
    )
    if ends_with_single_punct:
        task_text = text[:-1]
    return (
        history + [(format_text(text), None)],
        task_history + [(task_text, None)],
        "",
    )
def handle_file_upload(history, task_history, file):
    """Record an uploaded image in both histories.

    The file path is stored as a one-element tuple (Gradio's convention
    for image messages) paired with a pending ``None`` reply.
    """
    entry = ((file.name,), None)
    return history + [entry], task_history + [entry]
def clear_input():
    """Return a Gradio update that resets the query textbox to empty."""
    return gr.update(value="")
def clear_history(task_history):
    """Empty the raw task history in place and blank the chat transcript.

    Returns an empty list so the chatbot component bound to this
    callback's output is cleared as well.
    """
    del task_history[:]
    return []
def handle_regeneration(chatbot, task_history):
    """Regenerate the last model response.

    Drops the last answer from both histories (keeping the user's query)
    and re-runs generation via get_chat_response. Returns the chatbot
    unchanged when there is nothing to regenerate.
    """
    # Debug tracing left in by the author.
    print("Regenerate clicked")
    print("Before:", task_history, chatbot)
    if not task_history:
        # Nothing has been asked yet.
        return chatbot
    item = task_history[-1]
    if item[1] is None:
        # Last turn has no answer yet (generation pending) — nothing to redo.
        return chatbot
    # Clear the stored answer so get_chat_response produces a fresh one.
    task_history[-1] = (item[0], None)
    chatbot_item = chatbot.pop(-1)
    if chatbot_item[0] is None:
        # Popped entry was answer-only; blank the answer on the previous pair.
        # NOTE(review): assumes chatbot still has an entry here — confirm
        # this invariant holds for all click sequences.
        chatbot[-1] = (chatbot[-1][0], None)
    else:
        # Re-append the user query with its answer cleared.
        chatbot.append((chatbot_item[0], None))
    print("After:", task_history, chatbot)
    return get_chat_response(chatbot, task_history)
# Custom CSS: cap the app width; extend as needed.
css = '''
.gradio-container {
    max-width: 800px !important;
}
/* ... (add more custom CSS if needed) */
'''

# Build and launch the UI
with gr.Blocks(css=css) as demo:
    # Header and attribution.
    gr.Markdown("# Qwen-VL-Chat Bot")
    gr.Markdown(
        "## Developed by Keyvan Hardani (Keyvven on [Twitter](https://twitter.com/Keyvven) and on [Hugging Face](https://huggingface.co/Keyvven))\n"
        "Special thanks to [@Artificialguybr](https://twitter.com/artificialguybr) for the inspiration from his code.\n"
        "### Qwen-VL: A Multimodal Large Vision Language Model by Alibaba Cloud\n"
    )
    # Transcript, input box, and server-side raw history. task_history
    # mirrors chatbot but keeps unformatted text / image-path tuples that
    # get_chat_response feeds to the model.
    chatbot = gr.Chatbot(label='Qwen-VL-Chat', elem_classes="control-height", height=520)
    query = gr.Textbox(lines=2, label='Input')
    task_history = gr.State([])

    with gr.Row():
        # NOTE(review): button labels look mojibake-garbled in this copy
        # (likely lost emoji); preserved byte-for-byte.
        upload_btn = gr.UploadButton("π Upload", file_types=["image"])
        submit_btn = gr.Button("π Submit")
        regen_btn = gr.Button("π€οΈ Regenerate")
        clear_btn = gr.Button("π§Ή Clear History")

    gr.Markdown("### Key Features:\n- **Strong Performance**: Surpasses existing LVLMs on multiple English benchmarks including Zero-shot Captioning and VQA.\n- **Multi-lingual Support**: Supports English, Chinese, and multi-lingual conversation.\n- **High Resolution**: Utilizes 448*448 resolution for fine-grained recognition and understanding.")
    # Submit: record the user text, then generate the model response.
    submit_btn.click(handle_text_input, [chatbot, task_history, query], [chatbot, task_history]).then(
        get_chat_response, [chatbot, task_history], [chatbot], show_progress=True
    )
    # Second listener on the same click clears the textbox.
    # NOTE(review): relies on Gradio snapshotting input values at click
    # time so handle_text_input still sees the text — confirm there is no
    # race between the two listeners.
    submit_btn.click(clear_input, [], [query])
    clear_btn.click(clear_history, [task_history], [chatbot], show_progress=True)
    regen_btn.click(handle_regeneration, [chatbot, task_history], [chatbot], show_progress=True)
    upload_btn.upload(handle_file_upload, [chatbot, task_history, upload_btn], [chatbot, task_history], show_progress=True)

# Launch the demo
demo.launch()
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio
|
2 |
+
transformers
|
3 |
+
Pillow
|
4 |
+
transformers_stream_generator
|