sparse / ms-swift /tests /test_align /test_template /test_video.py

Upload folder using huggingface_hub

96fe658 verified about 1 month ago

9.62 kB

	import os

	import torch

	os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'
	os.environ['SWIFT_DEBUG'] = '1'


	def _infer_model(pt_engine, system=None, messages=None, videos=None, max_tokens=128):
	seed_everything(42)
	request_config = RequestConfig(max_tokens=max_tokens, temperature=0)
	if messages is None:
	messages = []
	if not messages:
	if system is not None:
	messages += [{'role': 'system', 'content': system}]
	messages += [{'role': 'user', 'content': '你好'}]
	resp = pt_engine.infer([{'messages': messages}], request_config=request_config)
	response = resp[0].choices[0].message.content
	messages += [{'role': 'assistant', 'content': response}, {'role': 'user', 'content': '<video>描述视频'}]
	else:
	messages = messages.copy()
	if videos is None:
	videos = ['https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4']
	resp = pt_engine.infer([{'messages': messages, 'videos': videos}], request_config=request_config)
	response = resp[0].choices[0].message.content
	messages += [{'role': 'assistant', 'content': response}]
	logger.info(f'model: {pt_engine.model_info.model_name}, messages: {messages}')
	return response


	def test_qwen2_vl():
	os.environ['FPS_MAX_FRAMES'] = '24'
	os.environ['MAX_PIXELS'] = '100352'
	os.environ['VIDEO_MAX_PIXELS'] = str(100352 // 4)
	pt_engine = PtEngine('Qwen/Qwen2-VL-2B-Instruct')
	response = _infer_model(pt_engine)
	pt_engine.default_template.template_backend = 'jinja'
	response2 = _infer_model(pt_engine)
	assert response == response2


	def test_internvl2_5():
	pt_engine = PtEngine('OpenGVLab/InternVL2_5-2B')
	_infer_model(pt_engine)
	pt_engine.default_template.template_backend = 'jinja'
	_infer_model(pt_engine, system='你是书生·万象，英文名是InternVL，是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。')


	def test_internvl2_5_mpo():
	pt_engine = PtEngine('OpenGVLab/InternVL2_5-1B-MPO', model_type='internvl2_5')
	response = _infer_model(pt_engine, messages=[{'role': 'user', 'content': '<video>这是什么'}])
	assert response == ('这是一段婴儿在阅读的视频。婴儿穿着浅绿色的上衣和粉色的裤子，戴着黑框眼镜，坐在床上，正在翻阅一本打开的书。'
	'背景中可以看到婴儿床、衣物和一些家具。视频中可以看到“clipo.com”的水印。婴儿看起来非常专注，似乎在认真地阅读。')


	def test_xcomposer2_5():
	pt_engine = PtEngine('Shanghai_AI_Laboratory/internlm-xcomposer2d5-ol-7b:base', torch.float16)
	messages = [{'role': 'user', 'content': '<video>Describe the video'}]
	messages_with_system = messages.copy()
	messages_with_system.insert(0, {'role': 'system', 'content': ''})
	response = _infer_model(pt_engine, messages=messages_with_system)
	pt_engine.default_template.template_backend = 'jinja'
	response2 = _infer_model(pt_engine, messages=messages, system='')
	assert response == response2

	response = _infer_model(pt_engine, messages=messages)
	std_response = (
	'The video features a young child sitting on a bed, deeply engaged in reading a book. '
	'The child is dressed in a light blue sleeveless top and pink pants, and is wearing glasses. '
	'The bed is covered with a textured white blanket, and there are various items scattered on it, '
	'including a white cloth and a striped piece of clothing. In the background, '
	'a wooden crib and a dresser with a mirror can be seen. The child flips through the pages of the book, '
	'occasionally pausing to look at the illustrations. The child appears to be enjoying the book, '
	'and the overall atmosphere is one of quiet concentration and enjoyment.')

	assert response == std_response[:len(response)]


	def test_mplug3():
	pt_engine = PtEngine('iic/mPLUG-Owl3-7B-240728')
	# pt_engine = PtEngine('iic/mPLUG-Owl3-7B-241101')
	_infer_model(pt_engine, system='')
	pt_engine.default_template.template_backend = 'jinja'
	_infer_model(pt_engine, system='')


	def test_minicpmv():
	pt_engine = PtEngine('OpenBMB/MiniCPM-V-2_6')
	_infer_model(pt_engine)
	pt_engine.default_template.template_backend = 'jinja'
	_infer_model(pt_engine)


	def test_minicpmo():
	os.environ['VIDEO_MAX_SLICE_NUMS'] = '2'
	pt_engine = PtEngine('OpenBMB/MiniCPM-o-2_6')
	messages = [{'role': 'user', 'content': '<video>Describe the video'}]
	response = _infer_model(pt_engine, messages=messages)
	pt_engine.default_template.template_backend = 'jinja'
	response2 = _infer_model(pt_engine, messages=messages)
	assert response == response2 == (
	'The video features a young child sitting on a bed, deeply engrossed in reading a large book. The child, '
	'dressed in a light blue sleeveless top and pink pants, is surrounded by a cozy and homely environment. '
	'The bed is adorned with a patterned blanket, and a white cloth is casually draped over the side. '
	'In the background, a crib and a television are visible, adding to the domestic setting. '
	'The child is seen flipping through the pages of the book, occasionally pausing to look at the pages, '
	'and then continuing to turn them. The video captures the child\'s focused and curious demeanor as they '
	'explore the contents of the book, creating a heartwarming '
	'scene of a young reader immersed in their world of stories.')[:len(response)]


	def test_valley():
	pt_engine = PtEngine('bytedance-research/Valley-Eagle-7B')
	_infer_model(pt_engine)


	def test_qwen2_5_vl():
	os.environ['FPS'] = '1'
	os.environ['VIDEO_MAX_PIXELS'] = str(360 * 420)
	pt_engine = PtEngine('Qwen/Qwen2.5-VL-7B-Instruct')
	messages = [{'role': 'user', 'content': '<video>What happened in the video?'}]
	videos = ['https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4']
	response = _infer_model(pt_engine, messages=messages, videos=videos)
	pt_engine.default_template.template_backend = 'jinja'
	response2 = _infer_model(pt_engine, messages=messages, videos=videos)
	assert response == response2 == (
	'In the video, a young child is sitting on a bed and appears to be reading or flipping '
	'through a book. The child is wearing sunglasses and seems focused on the book. '
	'The setting looks like a cozy bedroom with various items such as clothes and '
	"possibly toys around. The child's actions suggest they might be exploring or "
	'learning about the book.')


	def test_qwen2_5_omni():
	USE_AUDIO_IN_VIDEO = True
	os.environ['USE_AUDIO_IN_VIDEO'] = str(USE_AUDIO_IN_VIDEO)
	pt_engine = PtEngine('Qwen/Qwen2.5-Omni-7B', attn_impl='flash_attn')
	system = ('You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, '
	'capable of perceiving auditory and visual inputs, as well as generating text and speech.')
	messages = [{'role': 'system', 'content': system}, {'role': 'user', 'content': '<video>'}]
	videos = ['https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/draw.mp4']
	response = _infer_model(pt_engine, messages=messages, videos=videos)
	pt_engine.default_template.template_backend = 'jinja'
	response2 = _infer_model(pt_engine, messages=messages, videos=videos)
	if USE_AUDIO_IN_VIDEO:
	ground_truth = ("Oh, that's a really cool drawing! It looks like a guitar. You've got the body "
	'and the neck drawn in a simple yet effective way. The lines are clean and the '
	'shape is well-defined. What made you choose to draw a guitar?')
	else:
	ground_truth = ('嗯，你是在用平板画画呢。你画的这把吉他，看起来很简洁明了。你用的笔触也很流畅，线条很清晰。你对颜色的运用也很不错，整体看起来很协调。你要是还有啥想法或者问题，随时跟我说哈。')
	assert response == response2 == ground_truth


	def test_glm4_1v():
	messages = [{'role': 'user', 'content': '<video>What happened in the video?'}]
	videos = ['https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4']
	pt_engine = PtEngine('ZhipuAI/GLM-4.1V-9B-Thinking')
	response = _infer_model(pt_engine, messages=messages, videos=videos)
	pt_engine.default_template.template_backend = 'jinja'
	response2 = _infer_model(pt_engine, messages=messages, videos=videos)
	assert response == response2


	def test_keye_vl():
	pt_engine = PtEngine('Kwai-Keye/Keye-VL-8B-Preview', attn_impl='flash_attention_2')
	messages = [{'role': 'user', 'content': '<video>What happened in the video?'}]
	videos = ['https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4']
	response = _infer_model(pt_engine, messages=messages, videos=videos)
	pt_engine.default_template.template_backend = 'jinja'
	response2 = _infer_model(pt_engine, messages=messages, videos=videos)
	assert response == response2


	if __name__ == '__main__':
	from swift.llm import PtEngine, RequestConfig
	from swift.utils import get_logger, seed_everything
	logger = get_logger()
	# test_qwen2_vl()
	# test_internvl2_5()
	# test_xcomposer2_5()
	# test_internvl2_5_mpo()
	# test_mplug3()
	# test_minicpmv()
	# test_minicpmo()
	# test_valley()
	# test_qwen2_5_vl()
	# test_qwen2_5_omni()
	# test_glm4_1v() # bug now, wait model fix
	test_keye_vl()