File size: 8,408 Bytes
96fe658
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import os

# Pin the process to GPU 0. This is set at module import time, before the
# deferred `from swift.llm import VllmEngine, ...` in the __main__ block below,
# so the engine (and the CUDA libraries it pulls in) only ever see one device.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'


def _infer_audio(model, use_chat_template: bool = True, max_model_len=8192, system=None, limit_mm_per_prompt=None):
    """Run a single audio-description request through VllmEngine.

    Args:
        model: Model id forwarded to ``VllmEngine``.
        use_chat_template: When False, disable the chat template on the
            engine's default template (raw completion mode).
        max_model_len: Context length forwarded to ``VllmEngine``.
        system: Optional system prompt prepended to the message list.
        limit_mm_per_prompt: Optional per-prompt multimodal limits; falls back
            to ``{'audio': 2}``, the value previously hard-coded here (added
            for consistency with ``_infer_video``; default behavior unchanged).

    Returns:
        The generated text of the first choice of the first response.
    """
    limit_mm_per_prompt = limit_mm_per_prompt or {'audio': 2}
    engine = VllmEngine(model, max_model_len=max_model_len, limit_mm_per_prompt=limit_mm_per_prompt)
    if not use_chat_template:
        engine.default_template.use_chat_template = False
    # Remote sample clip: a short Mandarin remark about the weather.
    audios = ['http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/weather.wav']
    messages = []
    if system is not None:
        messages += [{'role': 'system', 'content': system}]
    messages.append({'role': 'user', 'content': 'describe the audio.'})
    # temperature=0 keeps decoding greedy/deterministic so callers can pin exact strings.
    resp_list = engine.infer([InferRequest(messages=messages, audios=audios)],
                             RequestConfig(temperature=0, max_tokens=64, repetition_penalty=1.))
    return resp_list[0].choices[0].message.content


def _infer_image(model, use_chat_template: bool = True, max_model_len=8192, system=None, limit_mm_per_prompt=None):
    """Run a single image-description request through VllmEngine.

    Args:
        model: Model id forwarded to ``VllmEngine``.
        use_chat_template: When False, disable the chat template on the
            engine's default template (raw completion mode).
        max_model_len: Context length forwarded to ``VllmEngine``.
        system: Optional system prompt prepended to the message list.
        limit_mm_per_prompt: Optional per-prompt multimodal limits; falls back
            to ``{'image': 5, 'video': 2}``, the value previously hard-coded
            here (added for consistency with ``_infer_video``; default
            behavior unchanged).

    Returns:
        The generated text of the first choice of the first response.
    """
    limit_mm_per_prompt = limit_mm_per_prompt or {'image': 5, 'video': 2}
    engine = VllmEngine(model, max_model_len=max_model_len, limit_mm_per_prompt=limit_mm_per_prompt)
    if not use_chat_template:
        engine.default_template.use_chat_template = False
    # Remote sample image: a kitten photo used by all image tests below.
    images = ['http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png']
    messages = []
    if system is not None:
        messages += [{'role': 'system', 'content': system}]
    messages.append({'role': 'user', 'content': 'describe the image.'})
    # temperature=0 keeps decoding greedy/deterministic so callers can pin exact strings.
    resp_list = engine.infer([InferRequest(messages=messages, images=images)],
                             RequestConfig(temperature=0, max_tokens=64, repetition_penalty=1.))
    return resp_list[0].choices[0].message.content


def _infer_video(model, use_chat_template: bool = True, max_model_len=8192, system=None, limit_mm_per_prompt=None):
    """Run a single video-description request and return the generated text.

    ``limit_mm_per_prompt`` falls back to ``{'image': 16, 'video': 2}`` when
    not given (or falsy). Decoding is greedy (temperature=0) so the output is
    deterministic for exact-string comparisons.
    """
    mm_limits = limit_mm_per_prompt or {'image': 16, 'video': 2}
    engine = VllmEngine(model, max_model_len=max_model_len, limit_mm_per_prompt=mm_limits)
    if not use_chat_template:
        engine.default_template.use_chat_template = False
    # Remote sample video: a baby reading a book, shared by the video tests.
    videos = ['https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4']
    conversation = []
    if system is not None:
        conversation.append({'role': 'system', 'content': system})
    conversation.append({'role': 'user', 'content': 'describe the video.'})
    request = InferRequest(messages=conversation, videos=videos)
    config = RequestConfig(temperature=0, max_tokens=64, repetition_penalty=1.)
    resp_list = engine.infer([request], config)
    return resp_list[0].choices[0].message.content


def test_qwen2_audio():
    """Qwen2-Audio must return the pinned description of the weather clip."""
    expected = "The audio is a man speaking in Mandarin saying '今天天气真好呀'."
    assert _infer_audio('Qwen/Qwen2-Audio-7B-Instruct') == expected


def test_qwen2_vl():
    """Qwen2-VL-2B must return the pinned description of the cat image."""
    expected = (
        'The image depicts a cute kitten with a fluffy, white and gray striped coat. The kitten has large, '
        'expressive blue eyes and is looking directly at the camera. Its ears are perked up, and it has a '
        'small red mark on its left ear. The background is blurred, focusing attention on the kitten. The overall')
    assert _infer_image('Qwen/Qwen2-VL-2B-Instruct') == expected


def test_qwen2_5_vl():
    """Qwen2.5-VL-3B must return the pinned description of the cat image."""
    expected = (
        'The image depicts a cute, fluffy kitten with striking blue eyes and a white and gray fur pattern. '
        'The kitten has a small, pink nose and is looking directly at the camera with a curious expression. '
        "The background is blurred, drawing attention to the kitten's face. "
        'The overall appearance is very endearing and charming.')
    assert _infer_image('Qwen/Qwen2.5-VL-3B-Instruct') == expected


def test_deepseek_vl_v2():
    """deepseek-vl2-tiny must return the pinned description (reduced context)."""
    # Expected text is raw model output — kept byte-for-byte (including 'a adorable').
    expected = ('The image depicts a close-up of a adorable kitten with large, expressive eyes. The kitten has '
                'a mix of white and gray fur with distinct black stripes, giving it a tabby-like appearance. '
                'Its ears are perked up, and its whiskers are prominently visible. The background is blurred, '
                'focusing attention on the kitten')
    assert _infer_image('deepseek-ai/deepseek-vl2-tiny', max_model_len=4096) == expected


def test_internvl2():
    """InternVL2-2B (empty system prompt) must return the pinned description."""
    expected = ('The image features a kitten with striking blue eyes and a mix of white and black fur. '
                'The kitten has large, expressive eyes and a small, pink nose. Its ears are perked up, '
                'and it appears to be looking directly at the camera. The fur is soft and fluffy, with a mix')
    assert _infer_image('OpenGVLab/InternVL2-2B', max_model_len=4096, system='') == expected


def test_minicpmv_2_5():
    """MiniCPM-Llama3-V-2_5 must return the pinned description."""
    expected = (
        "The image is a digital painting of a kitten that captures the essence of a young feline's innocence "
        "and curiosity. The kitten's fur is rendered with a mix of gray, white, and black stripes, "
        'giving it a realistic and adorable appearance. Its large, expressive eyes are a striking blue, '
        "which draws the viewer's")
    assert _infer_image('OpenBMB/MiniCPM-Llama3-V-2_5', max_model_len=4096) == expected


def test_minicpmv_2_6():
    """MiniCPM-V-2_6 must return the pinned description."""
    expected = (
        'The image features a close-up of a kitten with striking blue eyes and a mix of '
        "white and dark fur, possibly gray or black. The kitten's gaze is directed forward, giving it an "
        "expressive and captivating look. The background is blurred, drawing focus to the kitten's face. "
        "The overall composition emphasizes the kitten's features")
    assert _infer_image('OpenBMB/MiniCPM-V-2_6', max_model_len=4096) == expected


def test_minicpmo_2_6_video():
    """MiniCPM-o-2_6 must return the pinned description of the baby video."""
    expected = ('The video features a young child sitting on a bed, deeply engaged in reading a book. '
                'The child, dressed in a light blue sleeveless top and pink pants, is surrounded by a '
                'cozy and homely environment. The bed is adorned with a patterned blanket, and a white cloth '
                'is casually draped over the side.')
    assert _infer_video('OpenBMB/MiniCPM-o-2_6') == expected


def test_qwen2_5_vl_video():
    """Qwen2.5-VL-3B must return the pinned description of the baby video."""
    expected = ('A baby wearing sunglasses is sitting on a bed and reading a book. '
                'The baby is holding the book with both hands and is looking at the pages. '
                'The baby is wearing a light blue shirt and pink pants. The baby is sitting '
                'on a white blanket. The baby is looking at the book and is smiling. The baby')
    assert _infer_video('Qwen/Qwen2.5-VL-3B-Instruct') == expected


def test_qwen2_5_omni():
    """Qwen2.5-Omni smoke test: only checks that a non-empty response comes back."""
    mm_limits = {'image': 1, 'video': 1, 'audio': 1}
    response = _infer_video('Qwen/Qwen2.5-Omni-7B', limit_mm_per_prompt=mm_limits)
    # response = _infer_audio('Qwen/Qwen2.5-Omni-7B')
    assert response


def test_ovis2():
    """Ovis2-1B: only the first 200 characters of the description are pinned."""
    expected_prefix = ('The image showcases a charming digital painting of a kitten, capturing its '
                       'adorable features in a unique style. The kitten has a predominantly white face '
                       'with black stripes and spots, giving it a stri')
    response = _infer_image('AIDC-AI/Ovis2-1B', max_model_len=4096)
    assert response[:200] == expected_prefix


def test_keye_vl():
    """Keye-VL-8B: only the first 200 characters (incl. <analysis> block) are pinned."""
    expected_prefix = ('<analysis>This question asks for a description of the image, which is '
                       'straightforward and involves observing the visual content. Therefore, '
                       '/no_think is more appropriate.</analysis>The image features ')
    response = _infer_image('Kwai-Keye/Keye-VL-8B-Preview', max_model_len=4096)
    assert response[:200] == expected_prefix


def test_kimi_vl():
    """Kimi-VL smoke test: print the output and require it to be non-empty.

    Previously this test only printed the response and could never fail; a
    minimal non-empty assertion is added, matching test_qwen2_5_omni's style.
    """
    response = _infer_image('moonshotai/Kimi-VL-A3B-Instruct', max_model_len=4096)
    print(f'response: {response}')
    assert response


def test_glm4v():
    """glm-4v-9b smoke test: print the output and require it to be non-empty.

    Previously this test only printed the response and could never fail; a
    minimal non-empty assertion is added, matching test_qwen2_5_omni's style.
    """
    response = _infer_image('ZhipuAI/glm-4v-9b', max_model_len=4096)
    print(f'response: {response}')
    assert response


def test_glm4_1v():
    """GLM-4.1V-Thinking smoke test: print the output and require it to be non-empty.

    Previously this test only printed the response and could never fail; a
    minimal non-empty assertion is added, matching test_qwen2_5_omni's style.
    """
    response = _infer_image('ZhipuAI/GLM-4.1V-9B-Thinking', max_model_len=4096)
    print(f'response: {response}')
    assert response


if __name__ == '__main__':
    # Deliberately deferred import: CUDA_VISIBLE_DEVICES (set at the top of
    # this file) must be in the environment before swift.llm / vllm load.
    # NOTE(review): the helper functions above reference these names, so this
    # file only works when run directly; under pytest collection the names
    # would be undefined — confirm whether that is intended.
    from swift.llm import VllmEngine, InferRequest, RequestConfig
    # Each test builds a full vLLM engine, so only one is enabled per run.
    # test_qwen2_vl()
    # test_qwen2_5_vl()
    # test_deepseek_vl_v2()
    # test_internvl2()
    # test_qwen2_audio()
    # test_minicpmv_2_5()
    # test_minicpmv_2_6()
    # test_minicpmo_2_6_video()
    # test_qwen2_5_vl_video()
    # test_qwen2_5_omni()
    # test_ovis2()
    # test_keye_vl()
    # test_kimi_vl()
    # test_glm4v()
    test_glm4_1v()