File size: 7,510 Bytes
5c6427d
 
 
 
 
 
 
2449b43
 
 
 
 
 
5c6427d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2449b43
 
 
 
 
 
 
 
 
75992d9
52ae6ba
 
75992d9
 
5c6427d
75992d9
5c6427d
 
 
 
 
 
 
 
 
 
 
 
 
 
75992d9
 
 
 
 
 
 
 
 
3d3aea0
75992d9
 
3d3aea0
75992d9
 
 
11df678
 
 
 
2f4b8e0
dfc4fa8
5c6427d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c35d25e
5c6427d
 
 
2d256fb
5c6427d
2d256fb
 
 
 
 
 
 
5c6427d
2d256fb
 
5c6427d
 
 
2d256fb
2449b43
 
 
 
 
 
75992d9
64f1def
 
 
 
 
a076c9d
 
 
 
 
5def0b1
 
 
 
 
 
 
 
64f1def
a076c9d
 
c35d25e
 
9f0f596
a076c9d
 
2d256fb
a076c9d
2449b43
a076c9d
64f1def
9f0f596
2f4b8e0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import re
import copy
import secrets
from pathlib import Path
import os
os.system("pip install git+https://github.com/openai/whisper.git")
import whisper


model_whisper = whisper.load_model("small")

# Constants
BOX_TAG_PATTERN = r"<box>([\s\S]*?)</box>"
PUNCTUATION = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"

# Initialize model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-VL-Chat-Int4", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-VL-Chat-Int4", device_map="auto", trust_remote_code=True).eval()

def format_text(text):
    """Format text for rendering in the chat UI."""
    lines = text.split("\n")
    lines = [line for line in lines if line != ""]
    count = 0
    for i, line in enumerate(lines):
        if "```" in line:
            count += 1
            items = line.split("`")
            if count % 2 == 1:
                lines[i] = f'<pre><code class="language-{items[-1]}">'
            else:
                lines[i] = f"<br></code></pre>"
        else:
            if i > 0:
                if count % 2 == 1:
                    line = line.replace("`", r"\`")
                    line = line.replace("<", "&lt;")
                    line = line.replace(">", "&gt;")
                    line = line.replace(" ", "&nbsp;")
                    line = line.replace("*", "&ast;")
                    line = line.replace("_", "&lowbar;")
                    line = line.replace("-", "&#45;")
                    line = line.replace(".", "&#46;")
                    line = line.replace("!", "&#33;")
                    line = line.replace("(", "&#40;")
                    line = line.replace(")", "&#41;")
                    line = line.replace("$", "&#36;")
                lines[i] = "<br>" + line
    text = "".join(lines)
    return text

def transcribe_audio(audio):
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(model_whisper.device)
    _, probs = model_whisper.detect_language(mel)
    options = whisper.DecodingOptions(fp16 = False)
    result = whisper.decode(model_whisper, mel, options)
    return result.text


def get_chat_response(chatbot, task_history):
    global model, tokenizer 
    chat_query = chatbot[-1][0]
    query = task_history[-1][0]
    history_cp = copy.deepcopy(task_history)
    full_response = ""

    history_filter = []
    pic_idx = 1
    pre = ""
    for i, (q, a) in enumerate(history_cp):
        if isinstance(q, (tuple, list)):
            q = f'Picture {pic_idx}: <img>{q[0]}</img>'
            pre += q + '\n'
            pic_idx += 1
        else:
            pre += q
            history_filter.append((pre, a))
            pre = ""
    history, message = history_filter[:-1], history_filter[-1][0]
    response, history = model.chat(tokenizer, message, history=history)
    image = tokenizer.draw_bbox_on_latest_picture(response, history)
    if image is not None:
        temp_dir = secrets.token_hex(20)
        temp_dir = Path("/tmp") / temp_dir
        temp_dir.mkdir(exist_ok=True, parents=True)
        name = f"tmp{secrets.token_hex(5)}.jpg"
        filename = temp_dir / name
        image.save(str(filename))
        chatbot[-1] = (format_text(chat_query), (str(filename),))  # Hier verwenden wir format_text statt _parse_text
        chat_response = response.replace("<ref>", "")
        chat_response = chat_response.replace(r"</ref>", "")
        chat_response = re.sub(BOX_TAG_PATTERN, "", chat_response)
        if chat_response != "":
            chatbot.append((None, chat_response))
    else:
        chatbot[-1] = (format_text(chat_query), response)
    full_response = format_text(response)
    task_history[-1] = (query, full_response)
    return chatbot


def handle_text_input(history, task_history, text):
    """Handle text input from the user."""
    task_text = text
    if len(text) >= 2 and text[-1] in PUNCTUATION and text[-2] not in PUNCTUATION:
        task_text = text[:-1]
    history = history + [(format_text(text), None)]
    task_history = task_history + [(task_text, None)]
    return history, task_history, ""

def handle_file_upload(history, task_history, file):
    """Handle file upload from the user."""
    history = history + [((file.name,), None)]
    task_history = task_history + [((file.name,), None)]
    return history, task_history

def clear_input():
    """Clear the user input."""
    return gr.update(value="")

def clear_history(task_history):
    """Clear the chat history."""
    task_history.clear()
    return []

def handle_regeneration(chatbot, task_history):
    """Handle the regeneration of the last response."""
    print("Regenerate clicked")
    print("Before:", task_history, chatbot)
    if not task_history:
        return chatbot
    item = task_history[-1]
    if item[1] is None:
        return chatbot
    task_history[-1] = (item[0], None)
    chatbot_item = chatbot.pop(-1)
    if chatbot_item[0] is None:
        chatbot[-1] = (chatbot[-1][0], None)
    else:
        chatbot.append((chatbot_item[0], None))
    print("After:", task_history, chatbot)
    return get_chat_response(chatbot, task_history)


with gr.Blocks(theme='gradio/soft') as demo:
    audio = gr.Audio(
        label="Input Audio",
        show_label=False,
        source="microphone",
        type="filepath"
    )
    gr.Markdown("# Qwen-VL Multimodal-Vision-Insight")
    gr.Markdown(
        "## Developed by Keyvan Hardani (Keyvven on [Twitter](https://twitter.com/Keyvven))\n"
        "Special thanks to [@Artificialguybr](https://twitter.com/artificialguybr) for the inspiration from his code.\n"
        "### Qwen-VL: A Multimodal Large Vision Language Model by Alibaba Cloud\n"
    )
    chatbot = gr.Chatbot(label='Qwen-VL-Chat', elem_classes="control-height", height=520)
    query = gr.Textbox(lines=2, label='Input')
    task_history = gr.State([])

    with gr.Row():
        with gr.Column(width=4):
            upload_btn = gr.UploadButton("πŸ“ Upload", file_types=["image"], elem_classes="control-width")
        with gr.Column(width=2):
            submit_btn = gr.Button("πŸš€ Submit", elem_classes="control-width")
        with gr.Column(width=2):
            regen_btn = gr.Button("πŸ€”οΈ Regenerate", elem_classes="control-width")
        with gr.Column(width=2):
            clear_btn = gr.Button("🧹 Clear History", elem_classes="control-width")
    
    gr.Markdown("### Key Features:\n- **Strong Performance**: Surpasses existing LVLMs on multiple English benchmarks including Zero-shot Captioning and VQA.\n- **Multi-lingual Support**: Supports English, Chinese, and multi-lingual conversation.\n- **High Resolution**: Utilizes 448*448 resolution for fine-grained recognition and understanding.")
    submit_btn.click(handle_text_input, [chatbot, task_history, query], [chatbot, task_history]).then(
        get_chat_response, [chatbot, task_history], [chatbot], show_progress=True
    )

    submit_btn.click(clear_input, [], [query])
    clear_btn.click(clear_history, [task_history], [chatbot], show_progress=True)
    regen_btn.click(handle_regeneration, [chatbot, task_history], [chatbot], show_progress=True)
    upload_btn.upload(handle_file_upload, [chatbot, task_history, upload_btn], [chatbot, task_history], show_progress=True)
    audio.on_change(transcribe_audio, inputs=[audio], outputs=[query])


demo.launch()