|
import os |
|
|
|
# Pin CUDA_VISIBLE_DEVICES to '0' for this process.
# NOTE(review): presumably this has to run before swift/vLLM are imported further down - confirm.
os.environ.update(CUDA_VISIBLE_DEVICES='0')
|
|
|
|
|
def _infer_audio(model, use_chat_template: bool = True, max_model_len=8192, system=None):
    """Send one "describe the audio." request through a vLLM engine and return the reply text.

    Args:
        model: Model identifier forwarded to ``VllmEngine``.
        use_chat_template: When False, ``use_chat_template`` is switched off on
            the engine's default template before inference.
        max_model_len: Forwarded to ``VllmEngine`` as ``max_model_len``.
        system: Optional system prompt. Any value other than ``None``
            (including an empty string) is sent as a leading system message.

    Returns:
        The ``content`` of the first choice of the first response.
    """
    # Imported locally so the helper also works when this module is merely
    # imported (e.g. collected by a test runner): the only other import of
    # these names in this file sits under the ``__main__`` guard.
    from swift.llm import InferRequest, RequestConfig, VllmEngine

    engine = VllmEngine(model, max_model_len=max_model_len, limit_mm_per_prompt={'audio': 2})
    if not use_chat_template:
        engine.default_template.use_chat_template = False

    messages = []
    if system is not None:
        messages.append({'role': 'system', 'content': system})
    messages.append({'role': 'user', 'content': 'describe the audio.'})

    audios = ['http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/weather.wav']
    # NOTE(review): temperature=0 is presumably what keeps the exact-match
    # assertions in the tests reproducible - confirm.
    request_config = RequestConfig(temperature=0, max_tokens=64, repetition_penalty=1.)
    resp_list = engine.infer([InferRequest(messages=messages, audios=audios)], request_config)
    return resp_list[0].choices[0].message.content
|
|
|
|
|
def _infer_image(model, use_chat_template: bool = True, max_model_len=8192, system=None):
    """Send one "describe the image." request through a vLLM engine and return the reply text.

    Args:
        model: Model identifier forwarded to ``VllmEngine``.
        use_chat_template: When False, ``use_chat_template`` is switched off on
            the engine's default template before inference.
        max_model_len: Forwarded to ``VllmEngine`` as ``max_model_len``.
        system: Optional system prompt. Any value other than ``None``
            (including an empty string) is sent as a leading system message.

    Returns:
        The ``content`` of the first choice of the first response.
    """
    # Imported locally so the helper also works when this module is merely
    # imported (e.g. collected by a test runner): the only other import of
    # these names in this file sits under the ``__main__`` guard.
    from swift.llm import InferRequest, RequestConfig, VllmEngine

    engine = VllmEngine(model, max_model_len=max_model_len, limit_mm_per_prompt={'image': 5, 'video': 2})
    if not use_chat_template:
        engine.default_template.use_chat_template = False

    messages = []
    if system is not None:
        messages.append({'role': 'system', 'content': system})
    messages.append({'role': 'user', 'content': 'describe the image.'})

    images = ['http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png']
    # NOTE(review): temperature=0 is presumably what keeps the exact-match
    # assertions in the tests reproducible - confirm.
    request_config = RequestConfig(temperature=0, max_tokens=64, repetition_penalty=1.)
    resp_list = engine.infer([InferRequest(messages=messages, images=images)], request_config)
    return resp_list[0].choices[0].message.content
|
|
|
|
|
def _infer_video(model, use_chat_template: bool = True, max_model_len=8192, system=None, limit_mm_per_prompt=None):
    """Send one "describe the video." request through a vLLM engine and return the reply text.

    Args:
        model: Model identifier forwarded to ``VllmEngine``.
        use_chat_template: When False, ``use_chat_template`` is switched off on
            the engine's default template before inference.
        max_model_len: Forwarded to ``VllmEngine`` as ``max_model_len``.
        system: Optional system prompt. Any value other than ``None``
            (including an empty string) is sent as a leading system message.
        limit_mm_per_prompt: Forwarded to ``VllmEngine``. A falsy value
            (``None`` or an empty mapping) falls back to
            ``{'image': 16, 'video': 2}``.

    Returns:
        The ``content`` of the first choice of the first response.
    """
    # Imported locally so the helper also works when this module is merely
    # imported (e.g. collected by a test runner): the only other import of
    # these names in this file sits under the ``__main__`` guard.
    from swift.llm import InferRequest, RequestConfig, VllmEngine

    limit_mm_per_prompt = limit_mm_per_prompt or {'image': 16, 'video': 2}
    engine = VllmEngine(model, max_model_len=max_model_len, limit_mm_per_prompt=limit_mm_per_prompt)
    if not use_chat_template:
        engine.default_template.use_chat_template = False

    messages = []
    if system is not None:
        messages.append({'role': 'system', 'content': system})
    messages.append({'role': 'user', 'content': 'describe the video.'})

    videos = ['https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4']
    # NOTE(review): temperature=0 is presumably what keeps the exact-match
    # assertions in the tests reproducible - confirm.
    request_config = RequestConfig(temperature=0, max_tokens=64, repetition_penalty=1.)
    resp_list = engine.infer([InferRequest(messages=messages, videos=videos)], request_config)
    return resp_list[0].choices[0].message.content
|
|
|
|
|
def test_qwen2_audio():
    """Check the Qwen2-Audio reply for the sample clip against the pinned text."""
    expected = "The audio is a man speaking in Mandarin saying '今天天气真好呀'."
    answer = _infer_audio('Qwen/Qwen2-Audio-7B-Instruct')
    assert answer == expected
|
|
|
|
|
def test_qwen2_vl():
    """Check the Qwen2-VL reply for the sample image against the pinned text."""
    expected = (
        'The image depicts a cute kitten with a fluffy, white and gray striped coat. The kitten has large, '
        'expressive blue eyes and is looking directly at the camera. Its ears are perked up, and it has a '
        'small red mark on its left ear. The background is blurred, focusing attention on the kitten. The overall')
    answer = _infer_image('Qwen/Qwen2-VL-2B-Instruct')
    assert answer == expected
|
|
|
|
|
def test_qwen2_5_vl():
    """Check the Qwen2.5-VL reply for the sample image against the pinned text."""
    expected = (
        'The image depicts a cute, fluffy kitten with striking blue eyes and a white and gray fur pattern. '
        'The kitten has a small, pink nose and is looking directly at the camera with a curious expression. '
        "The background is blurred, drawing attention to the kitten's face. "
        'The overall appearance is very endearing and charming.')
    answer = _infer_image('Qwen/Qwen2.5-VL-3B-Instruct')
    assert answer == expected
|
|
|
|
|
def test_deepseek_vl_v2():
    """Check the DeepSeek-VL2 (tiny) reply for the sample image against the pinned text."""
    expected = (
        'The image depicts a close-up of a adorable kitten with large, expressive eyes. The kitten has '
        'a mix of white and gray fur with distinct black stripes, giving it a tabby-like appearance. '
        'Its ears are perked up, and its whiskers are prominently visible. The background is blurred, '
        'focusing attention on the kitten')
    answer = _infer_image('deepseek-ai/deepseek-vl2-tiny', max_model_len=4096)
    assert answer == expected
|
|
|
|
|
def test_internvl2():
    """Check the InternVL2 reply (run with an empty system prompt) against the pinned text."""
    expected = (
        'The image features a kitten with striking blue eyes and a mix of white and black fur. '
        'The kitten has large, expressive eyes and a small, pink nose. Its ears are perked up, '
        'and it appears to be looking directly at the camera. The fur is soft and fluffy, with a mix')
    answer = _infer_image('OpenGVLab/InternVL2-2B', max_model_len=4096, system='')
    assert answer == expected
|
|
|
|
|
def test_minicpmv_2_5():
    """Check the MiniCPM-Llama3-V-2.5 reply for the sample image against the pinned text."""
    expected = (
        "The image is a digital painting of a kitten that captures the essence of a young feline's innocence "
        "and curiosity. The kitten's fur is rendered with a mix of gray, white, and black stripes, "
        'giving it a realistic and adorable appearance. Its large, expressive eyes are a striking blue, '
        "which draws the viewer's")
    answer = _infer_image('OpenBMB/MiniCPM-Llama3-V-2_5', max_model_len=4096)
    assert answer == expected
|
|
|
|
|
def test_minicpmv_2_6():
    """Check the MiniCPM-V-2.6 reply for the sample image against the pinned text."""
    expected = (
        'The image features a close-up of a kitten with striking blue eyes and a mix of '
        "white and dark fur, possibly gray or black. The kitten's gaze is directed forward, giving it an "
        "expressive and captivating look. The background is blurred, drawing focus to the kitten's face. "
        "The overall composition emphasizes the kitten's features")
    answer = _infer_image('OpenBMB/MiniCPM-V-2_6', max_model_len=4096)
    assert answer == expected
|
|
|
|
|
def test_minicpmo_2_6_video():
    """Check the MiniCPM-o-2.6 reply for the sample video against the pinned text."""
    expected = (
        'The video features a young child sitting on a bed, deeply engaged in reading a book. '
        'The child, dressed in a light blue sleeveless top and pink pants, is surrounded by a '
        'cozy and homely environment. The bed is adorned with a patterned blanket, and a white cloth '
        'is casually draped over the side.')
    answer = _infer_video('OpenBMB/MiniCPM-o-2_6')
    assert answer == expected
|
|
|
|
|
def test_qwen2_5_vl_video():
    """Check the Qwen2.5-VL reply for the sample video against the pinned text."""
    expected = (
        'A baby wearing sunglasses is sitting on a bed and reading a book. '
        'The baby is holding the book with both hands and is looking at the pages. '
        'The baby is wearing a light blue shirt and pink pants. The baby is sitting '
        'on a white blanket. The baby is looking at the book and is smiling. The baby')
    answer = _infer_video('Qwen/Qwen2.5-VL-3B-Instruct')
    assert answer == expected
|
|
|
|
|
def test_qwen2_5_omni():
    """Smoke-test Qwen2.5-Omni on the sample video: only a truthy reply is required."""
    answer = _infer_video(
        'Qwen/Qwen2.5-Omni-7B',
        # One item per modality is allowed for this run.
        limit_mm_per_prompt={'image': 1, 'video': 1, 'audio': 1},
    )
    assert answer
|
|
|
|
|
def test_ovis2():
    """Check the first 200 characters of the Ovis2 reply against the pinned prefix."""
    expected_prefix = (
        'The image showcases a charming digital painting of a kitten, capturing its '
        'adorable features in a unique style. The kitten has a predominantly white face '
        'with black stripes and spots, giving it a stri')
    answer = _infer_image('AIDC-AI/Ovis2-1B', max_model_len=4096)
    # Only the leading 200 characters are compared.
    head = answer[:200]
    assert head == expected_prefix
|
|
|
|
|
def test_keye_vl():
    """Check the first 200 characters of the Keye-VL reply against the pinned prefix."""
    expected_prefix = (
        '<analysis>This question asks for a description of the image, which is '
        'straightforward and involves observing the visual content. Therefore, '
        '/no_think is more appropriate.</analysis>The image features ')
    answer = _infer_image('Kwai-Keye/Keye-VL-8B-Preview', max_model_len=4096)
    # Only the leading 200 characters are compared.
    head = answer[:200]
    assert head == expected_prefix
|
|
|
|
|
def test_kimi_vl():
    """Run Kimi-VL on the sample image and print the reply; nothing is asserted."""
    answer = _infer_image(
        'moonshotai/Kimi-VL-A3B-Instruct',
        max_model_len=4096,
    )
    print(f'response: {answer}')
|
|
|
|
|
def test_glm4v():
    """Run GLM-4V on the sample image and print the reply; nothing is asserted."""
    answer = _infer_image(
        'ZhipuAI/glm-4v-9b',
        max_model_len=4096,
    )
    print(f'response: {answer}')
|
|
|
|
|
def test_glm4_1v():
    """Run GLM-4.1V-Thinking on the sample image and print the reply; nothing is asserted."""
    answer = _infer_image(
        'ZhipuAI/GLM-4.1V-9B-Thinking',
        max_model_len=4096,
    )
    print(f'response: {answer}')
|
|
|
|
|
if __name__ == '__main__':
    # Bind the swift.llm inference classes as module globals for script runs.
    from swift.llm import (
        VllmEngine,
        InferRequest,
        RequestConfig,
    )

    # Only this test is invoked when the file is executed directly.
    test_glm4_1v()
|
|