File size: 7,844 Bytes
8366b03
c2f7915
8366b03
 
 
 
 
 
 
 
 
 
 
c2f7915
8366b03
111455d
8366b03
 
 
 
 
 
 
 
 
fd75694
c2f7915
8366b03
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c2f7915
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8366b03
 
 
 
 
 
 
 
 
 
 
 
 
 
ff1d74e
12ef62d
8366b03
c2f7915
8366b03
c2f7915
8366b03
 
 
 
 
 
 
 
 
 
 
c2f7915
 
 
 
 
 
 
 
 
 
 
 
8366b03
 
 
 
 
 
 
 
c2f7915
 
8366b03
 
 
 
 
c2f7915
bde1dff
fad9462
bde1dff
 
 
 
8366b03
 
c2f7915
8366b03
c2f7915
8366b03
c2f7915
8366b03
c2f7915
8366b03
 
c2f7915
8366b03
 
 
 
 
c2f7915
 
 
 
8366b03
 
c2f7915
 
 
8366b03
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c2f7915
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8366b03
 
 
 
 
 
 
 
 
 
 
c2f7915
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
from transformers import AutoModel, AutoTokenizer
from copy import deepcopy
import os
import ipdb
import gradio as gr
import mdtex2html
from model.openllama import OpenLLAMAPEFTModel
import torch
import json

# init the model
# Settings handed to OpenLLAMAPEFTModel; both the Vicuna base and the delta
# checkpoint referenced here are the 7B variants.
args = {
    'model': 'openllama_peft',
    'imagebind_ckpt_path': 'pretrained_ckpt/imagebind_ckpt',
    'vicuna_ckpt_path': 'openllmplayground/vicuna_7b_v0',
    'delta_ckpt_path': 'pretrained_ckpt/pandagpt_ckpt/7b/pytorch_model.pt',
    'stage': 2,
    'max_tgt_len': 128,
    'lora_r': 32,
    'lora_alpha': 32,
    'lora_dropout': 0.1,
}
model = OpenLLAMAPEFTModel(**args)
# Deserialize the delta weights onto the CPU so loading works without a GPU.
delta_ckpt = torch.load(args['delta_ckpt_path'], map_location=torch.device('cpu'))
# strict=False tolerates keys that are missing from / unexpected in the
# checkpoint (presumably it only stores the fine-tuned subset -- confirm).
model.load_state_dict(delta_ckpt, strict=False)
# Half precision on GPU when one is available; otherwise stay on CPU.
model = model.half().cuda().eval() if torch.cuda.is_available() else model.eval()
# The message used to say "13b" although the 7B checkpoints are loaded above.
print('[!] init the 7b model over ...')

"""Override Chatbot.postprocess"""


def postprocess(self, y):
    """Convert every chat turn in ``y`` with ``mdtex2html``.

    ``y`` is a list of ``(message, response)`` pairs. Each pair is replaced
    in place by a tuple of converted strings; a ``None`` element stays
    ``None``. The same list object is returned. A ``None`` log yields ``[]``.
    """
    if y is None:
        return []

    def _render(text):
        # None marks an absent side of the turn and must pass through untouched.
        return text if text is None else mdtex2html.convert(text)

    for index, pair in enumerate(y):
        user_msg, bot_msg = pair
        y[index] = (_render(user_msg), _render(bot_msg))
    return y


# Monkey-patch the class attribute so gr.Chatbot uses the module-level
# `postprocess` function instead of its built-in implementation.
gr.Chatbot.postprocess = postprocess


# Characters rewritten inside fenced code blocks. None of the replacement
# strings contains a key of this table, so a single translate() pass yields
# exactly what the original chain of str.replace() calls produced.
_CODE_ESCAPES = str.maketrans({
    "`": "\\`",
    "<": "&lt;",
    ">": "&gt;",
    " ": "&nbsp;",
    "*": "&ast;",
    "_": "&lowbar;",
    "-": "&#45;",
    ".": "&#46;",
    "!": "&#33;",
    "(": "&#40;",
    ")": "&#41;",
    "$": "&#36;",
})


def parse_text(text):
    """Turn plain chat text into an HTML fragment.

    copy from https://github.com/GaiZhenbiao/ChuanhuChatGPT/

    Empty lines are dropped. A line containing a triple backtick toggles a
    code block: odd occurrences open ``<pre><code class="language-X">`` (X is
    whatever follows the last backtick on that line), even occurrences close
    it. Every other line except the very first is prefixed with ``<br>``;
    lines inside an open code block are additionally escaped.

    Args:
        text: Raw text, lines separated by ``"\\n"``.

    Returns:
        All processed lines concatenated without a separator.
    """
    lines = [line for line in text.split("\n") if line != ""]
    fence_count = 0
    for i, line in enumerate(lines):
        if "```" in line:
            fence_count += 1
            if fence_count % 2 == 1:
                language = line.split("`")[-1]
                lines[i] = f'<pre><code class="language-{language}">'
            else:
                lines[i] = "<br></code></pre>"
        elif i > 0:
            # The first line is left untouched so the output has no leading <br>.
            if fence_count % 2 == 1:
                line = line.translate(_CODE_ESCAPES)
            lines[i] = "<br>" + line
    return "".join(lines)


def re_predict(
    input,
    image_path,
    audio_path,
    video_path,
    thermal_path,
    chatbot,
    max_length,
    top_p,
    temperature,
    history,
    modality_cache,
):
    """Regenerate the answer to the most recent question.

    The last (question, answer) pair is removed from ``history`` and the last
    entry from ``chatbot``; the removed question is then sent through
    ``predict`` again. ``input`` (the current textbox content) is accepted
    for signature parity with ``predict`` but is not used.

    Returns:
        The ``(chatbot, history, modality_cache)`` triple produced by
        ``predict``, or the unchanged arguments when there is no previous
        turn to regenerate.
    """
    if not history:
        # Resubmit pressed before any question was asked: popping would raise
        # IndexError, so leave the state exactly as it is.
        return chatbot, history, modality_cache
    # drop the latest query and answers and generate again
    last_query, _ = history.pop()
    if chatbot:
        chatbot.pop()
    return predict(last_query, image_path, audio_path, video_path, thermal_path, chatbot, max_length, top_p, temperature, history, modality_cache)


def predict(
    input,
    image_path,
    audio_path,
    video_path,
    thermal_path,
    chatbot,
    max_length,
    top_p,
    temperature,
    history,
    modality_cache,
):
    """Answer ``input`` given the uploaded media and the conversation so far.

    Args:
        input: The user's new question.
        image_path, audio_path, video_path, thermal_path: Uploaded media;
            each may be ``None``. At least one must be set.
        chatbot: Rendered chat log; the new turn is appended in place.
        max_length: Passed to the model as ``max_tgt_len``.
        top_p, temperature: Sampling parameters passed to the model.
        history: Raw ``(question, answer)`` pairs; the new turn is appended
            in place.
        modality_cache: Passed to the model as ``modality_embeds``
            (presumably a cache the model fills in -- confirm in
            ``model.generate``).

    Returns:
        The tuple ``(chatbot, history, modality_cache)``.

    Raises:
        gr.Error: If no image, audio, video or thermal input was provided.
    """
    if image_path is None and audio_path is None and video_path is None and thermal_path is None:
        # gr.Error is an exception type: it has to be raised to abort the
        # request. Merely instantiating it (as before) let generation proceed
        # without any modality input.
        raise gr.Error("图片和音频以及视频为空!请重新上传才能开启对话。")
    print(f'[!] image path: {image_path}\n[!] audio path: {audio_path}\n[!] video path: {video_path}\n[!] thermal path: {thermal_path}')

    # prepare the prompt: every finished turn contributes
    # "<q>\n### Assistant: <a>\n### Human: ", and the new question goes last.
    # With an empty history the prompt is just the question itself.
    prompt_text = ''.join(
        f'{q}\n### Assistant: {a}\n### Human: ' for q, a in history
    ) + f'{input}'

    with torch.no_grad():
        response = model.generate({
            'prompt': prompt_text,
            'image_paths': [image_path] if image_path else [],
            'audio_paths': [audio_path] if audio_path else [],
            'video_paths': [video_path] if video_path else [],
            'thermal_paths': [thermal_path] if thermal_path else [],
            'top_p': top_p,
            'temperature': temperature,
            'max_tgt_len': max_length,
            'modality_embeds': modality_cache
        })
    # The chat widget gets HTML-formatted text, the history keeps the raw text.
    chatbot.append((parse_text(input), parse_text(response)))
    history.append((input, response))
    return chatbot, history, modality_cache


def reset_user_input():
    """Build a Gradio update whose value is the empty string."""
    cleared = gr.update(value='')
    return cleared

def reset_dialog():
    """Return a pair of new, independent empty lists."""
    first, second = [], []
    return first, second

def reset_state():
    """Return cleared values: four ``None`` entries followed by three new empty lists."""
    cleared_media = (None,) * 4
    return cleared_media + ([], [], [])


# Build the demo page: header text, a row of four media inputs, the chat log,
# then the prompt box with its buttons next to the generation sliders.
with gr.Blocks(scale=4) as demo:
    gr.HTML("""<h1 align="center">PandaGPT</h1>""")
    gr.Markdown('''We note that the current online demo uses the 7B version of PandaGPT due to the limitation of computation resource. 
    
    Better results should be expected when switching to the 13B version of PandaGPT. 
    
    For more details on how to run 13B PandaGPT, please refer to our [main project repository](https://github.com/yxuansu/PandaGPT).''')

    # One column per modality; each component's value is passed to predict / re_predict.
    with gr.Row(scale=4):
        with gr.Column(scale=1):
            image_path = gr.Image(type="filepath", label="Image", value=None)
        with gr.Column(scale=1):
            audio_path = gr.Audio(type="filepath", label="Audio", value=None)
        with gr.Column(scale=1):
            # NOTE(review): uses type='file' while the other inputs use
            # "filepath" -- confirm the installed Gradio version accepts it.
            video_path = gr.Video(type='file', label="Video")
        with gr.Column(scale=1):
            thermal_path = gr.Image(type="filepath", label="Thermal Image", value=None)

    chatbot = gr.Chatbot().style(height=300)
    with gr.Row():
        # Left side: prompt textbox plus the Submit / Resubmit buttons.
        with gr.Column(scale=4):
            with gr.Column(scale=12):
                user_input = gr.Textbox(show_label=False, placeholder="Input...", lines=10).style(container=False)
            with gr.Column(min_width=32, scale=1):
                with gr.Row(scale=1):
                    submitBtn = gr.Button("Submit", variant="primary")
                with gr.Row(scale=1):
                    resubmitBtn = gr.Button("Resubmit", variant="primary")
        # Right side: clear button and the sliders passed to predict as
        # max_length, top_p and temperature.
        with gr.Column(scale=1):
            emptyBtn = gr.Button("Clear History")
            max_length = gr.Slider(0, 400, value=256, step=1.0, label="Maximum length", interactive=True)
            top_p = gr.Slider(0, 1, value=0.01, step=0.01, label="Top P", interactive=True)
            temperature = gr.Slider(0, 1, value=1.0, step=0.01, label="Temperature", interactive=True)

    # State holders: raw (question, answer) history and the modality cache
    # that predict passes to the model as 'modality_embeds'.
    history = gr.State([])
    modality_cache = gr.State([])

    # Submit: run predict and write back the chat log, history and cache.
    submitBtn.click(
        predict, [
            user_input, 
            image_path, 
            audio_path, 
            video_path, 
            thermal_path, 
            chatbot, 
            max_length, 
            top_p, 
            temperature, 
            history, 
            modality_cache,
        ], [
            chatbot, 
            history,
            modality_cache
        ],
        show_progress=True
    )

    # Resubmit: same inputs and outputs as Submit, but routed through re_predict.
    resubmitBtn.click(
        re_predict, [
            user_input, 
            image_path, 
            audio_path, 
            video_path, 
            thermal_path, 
            chatbot, 
            max_length, 
            top_p, 
            temperature, 
            history, 
            modality_cache,
        ], [
            chatbot, 
            history,
            modality_cache
        ],
        show_progress=True
    )


    # Second handler on Submit: empty the textbox.
    submitBtn.click(reset_user_input, [], [user_input])
    # Clear History: reset the four media inputs, the chat log, the history
    # and the modality cache with the values returned by reset_state.
    emptyBtn.click(reset_state, outputs=[
        image_path,
        audio_path,
        video_path,
        thermal_path,
        chatbot, 
        history, 
        modality_cache
    ], show_progress=True)

# Start the web server with request queuing enabled.
demo.launch(enable_queue=True)