|
|
import torch |
|
|
from transformers import AutoTokenizer, AutoProcessor |
|
|
from qwen_vl_utils import process_vision_info |
|
|
|
|
|
from modeling_unite import UniteQwen2VL |
|
|
|
|
|
|
|
|
# Hugging Face Hub repo id of the Unite embedding model (Qwen2-VL-2B backbone).
model_path = 'friedrichor/Unite-Base-Qwen2-VL-2B'

# Load the embedding model in bfloat16 and place it on the GPU.
# NOTE(review): device_map="cuda" assumes a CUDA device is available; the
# tensors produced below are also moved to "cuda" to match.
model = UniteQwen2VL.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    device_map="cuda"
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Slow (Python) tokenizer for the model.
# NOTE(review): `tokenizer` is not referenced elsewhere in this file — possibly
# kept for interactive inspection; confirm before removing.
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)

# Multimodal processor; min/max_pixels bound the visual token budget per image
# (28x28-pixel patches, so 256..1280 visual tokens).
processor = AutoProcessor.from_pretrained(model_path, min_pixels=256*28*28, max_pixels=1280*28*28)
|
|
|
|
|
def process_messages(msg, device="cuda"):
    """Render chat-style messages into model-ready tensors.

    Args:
        msg: Qwen2-VL chat messages — a list of ``{"role", "content"}`` dicts
            whose content items may be text, image, or video entries.
        device: Device the returned tensors are moved to. Defaults to
            ``"cuda"`` (the device the model is loaded on), so existing
            callers are unaffected.

    Returns:
        The processor's batched output (tokenized text plus any image/video
        tensors), moved to ``device``.
    """
    # Render the conversation with the model's chat template; no generation
    # prompt is appended because the assistant turn is already in `msg`.
    text = processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=False)
    print(text)  # echo the rendered prompt so the demo shows the exact model input
    # Decode/resize any image or video entries referenced in the messages.
    image_inputs, video_inputs = process_vision_info(msg)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    return inputs.to(device)
|
|
|
|
|
|
|
|
|
|
|
# Text-only input: the sentence to embed followed by the fixed one-word
# summarization instruction. The assistant turn contains only <|endoftext|> —
# presumably the position whose hidden state is taken as the embedding;
# TODO(review): confirm against modeling_unite.
messages_txt = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "The book titled 'Riding with Reindeer - A Bicycle Odyssey through Finland, Lapland, and the Arctic' provides a detailed account of a journey that explores the regions of Lapland and the Arctic, focusing on the experience of riding with reindeer."},
            {"type": "text", "text": "\nSummary above sentence in one word:"},
        ],
    },
    {
        "role": "assistant",
        "content": [
            {"type": "text", "text": "<|endoftext|>"},
        ],
    }
]
|
|
|
|
|
# Image input: a remote image URL plus the image-summarization instruction.
# The assistant turn again holds only <|endoftext|>, matching the text-side
# prompt format so the two embeddings are comparable.
messages_img = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "https://images-na.ssl-images-amazon.com/images/I/518L0uDGe0L.jpg"},
            {"type": "text", "text": "\nSummary above image in one word:"},
        ],
    },
    {
        "role": "assistant",
        "content": [
            {"type": "text", "text": "<|endoftext|>"},
        ],
    }
]
|
|
|
|
|
# Embed both sides of the text↔image pair without tracking gradients.
with torch.no_grad():
    inputs_txt = process_messages(messages_txt)
    embeddings_txt = model(**inputs_txt)
    inputs_img = process_messages(messages_img)
    embeddings_img = model(**inputs_img)

# Text-to-image similarity: dot product of the two embeddings (cosine
# similarity if the model L2-normalizes its outputs — see modeling_unite).
print(embeddings_txt @ embeddings_img.T)
|
|
|
|
|
|
|
|
|
|
|
# Text-only input for the text↔video example; same prompt format as above:
# caption text, the one-word summarization instruction, and an assistant turn
# containing only <|endoftext|> (presumably the embedding anchor position —
# TODO(review): confirm against modeling_unite).
messages_txt = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Pictorial upper view sunset beams light waterfall stone cascade and fresh green tropical trees"},
            {"type": "text", "text": "\nSummary above sentence in one word:"},
        ],
    },
    {
        "role": "assistant",
        "content": [
            {"type": "text", "text": "<|endoftext|>"},
        ],
    }
]
|
|
|
|
|
# Video input: a local .mp4 with per-video sampling controls — frames capped
# at 360*420 pixels each, sampled at 1 fps, at most 32 frames. The assistant
# turn again ends with <|endoftext|>, matching the text-side format.
messages_vid = [
    {
        "role": "user",
        "content": [
            {
                "type": "video",
                "video": "./examples/stock-footage-pictorial-upper-view-sunset-beams-light-waterfall-stone-cascade-and-fresh-green-tropical-trees.mp4",
                "max_pixels": 360 * 420,
                "fps": 1,
                "max_frames": 32
            },
            {"type": "text", "text": "\nSummary above video in one word:"},
        ],
    },
    {
        "role": "assistant",
        "content": [
            {"type": "text", "text": "<|endoftext|>"},
        ],
    }
]
|
|
|
|
|
# Embed the caption and the video clip under a single no-grad context.
with torch.no_grad():
    inputs_txt = process_messages(messages_txt)
    embeddings_txt = model(**inputs_txt)
    inputs_vid = process_messages(messages_vid)
    embeddings_vid = model(**inputs_vid)

# Text-to-video similarity score (dot product of the embeddings).
print(embeddings_txt @ embeddings_vid.T)
|
|
|
|
|
|
|
|
|
|
|
# Fused (interleaved) query: an image plus a question about it, summarized
# jointly — note the instruction says "sentence and image". The assistant
# turn holds only <|endoftext|>, as in the other prompt formats.
messages_qry = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "./examples/1408px-Lilium_philadelphicum_var._philadelphicum.jpg"},
            {"type": "text", "text": "What part of the us is this plant native to?"},
            {"type": "text", "text": "\nSummary above sentence and image in one word:"},
        ],
    },
    {
        "role": "assistant",
        "content": [
            {"type": "text", "text": "<|endoftext|>"},
        ],
    }
]
|
|
|
|
|
# Retrieval target: the short answer text ("Midwestern") in the standard
# text-summarization prompt format, to be matched against the fused query.
messages_tgt = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Midwestern"},
            {"type": "text", "text": "\nSummary above sentence in one word:"},
        ],
    },
    {
        "role": "assistant",
        "content": [
            {"type": "text", "text": "<|endoftext|>"},
        ],
    }
]
|
|
|
|
|
# Embed the fused image+text query and the candidate answer text.
with torch.no_grad():
    inputs_qry = process_messages(messages_qry)
    embeddings_qry = model(**inputs_qry)
    inputs_tgt = process_messages(messages_tgt)
    embeddings_tgt = model(**inputs_tgt)

# Query-to-target similarity score (dot product of the embeddings).
print(embeddings_qry @ embeddings_tgt.T)
|
|
|