|
|
import torch |
|
|
from transformers import AutoTokenizer, AutoProcessor |
|
|
from qwen_vl_utils import process_vision_info |
|
|
|
|
|
from modeling_unite import UniteQwen2VL |
|
|
|
|
|
|
|
|
# Hugging Face Hub repo id of the Unite embedding model (Qwen2-VL-2B backbone).
model_path = 'friedrichor/Unite-Base-Qwen2-VL-2B'

# Load the embedding model in bfloat16 and place it on the GPU.
# NOTE(review): device_map="cuda" assumes a CUDA device is available; the
# tensors produced below are also moved to "cuda" to match.
model = UniteQwen2VL.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    device_map="cuda"
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Slow (Python) tokenizer for the model.
# NOTE(review): `tokenizer` is not referenced elsewhere in this file — possibly
# kept for interactive inspection; confirm before removing.
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)

# Multimodal processor; min/max_pixels bound the visual token budget per image
# (28x28-pixel patches, so 256..1280 visual tokens).
processor = AutoProcessor.from_pretrained(model_path, min_pixels=256*28*28, max_pixels=1280*28*28)
|
|
|
|
|
def process_messages(msg, device="cuda"):
    """Render chat-style messages into model-ready tensors.

    Args:
        msg: Qwen2-VL chat messages — a list of ``{"role", "content"}`` dicts
            whose content items may be text, image, or video entries.
        device: Device the returned tensors are moved to. Defaults to
            ``"cuda"`` (the device the model is loaded on), so existing
            callers are unaffected.

    Returns:
        The processor's batched output (tokenized text plus any image/video
        tensors), moved to ``device``.
    """
    # Render the conversation with the model's chat template; no generation
    # prompt is appended because the assistant turn is already in `msg`.
    text = processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=False)
    print(text)  # echo the rendered prompt so the demo shows the exact model input
    # Decode/resize any image or video entries referenced in the messages.
    image_inputs, video_inputs = process_vision_info(msg)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    return inputs.to(device)
|
|
|
|
|
|
|
|
|
|
|
# Text-only input: the sentence to embed followed by the fixed one-word
# summarization instruction. The assistant turn contains only <|endoftext|> —
# presumably the position whose hidden state is taken as the embedding;
# TODO(review): confirm against modeling_unite.
messages_txt = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "The book titled 'Riding with Reindeer - A Bicycle Odyssey through Finland, Lapland, and the Arctic' provides a detailed account of a journey that explores the regions of Lapland and the Arctic, focusing on the experience of riding with reindeer."},
            {"type": "text", "text": "\nSummary above sentence in one word:"},
        ],
    },
    {
        "role": "assistant",
        "content": [
            {"type": "text", "text": "<|endoftext|>"},
        ],
    }
]
|
|
|
|
|
# Image input: a remote image URL plus the image-summarization instruction.
# The assistant turn again holds only <|endoftext|>, matching the text-side
# prompt format so the two embeddings are comparable.
messages_img = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "https://images-na.ssl-images-amazon.com/images/I/518L0uDGe0L.jpg"},
            {"type": "text", "text": "\nSummary above image in one word:"},
        ],
    },
    {
        "role": "assistant",
        "content": [
            {"type": "text", "text": "<|endoftext|>"},
        ],
    }
]
|
|
|
|
|
# Embed both sides of the text↔image pair without tracking gradients.
with torch.no_grad():
    inputs_txt = process_messages(messages_txt)
    embeddings_txt = model(**inputs_txt)
    inputs_img = process_messages(messages_img)
    embeddings_img = model(**inputs_img)

# Text-to-image similarity: dot product of the two embeddings (cosine
# similarity if the model L2-normalizes its outputs — see modeling_unite).
print(embeddings_txt @ embeddings_img.T)
|
|
|
|
|
|
|
|
|
|
|
# Text-only input for the text↔video example; same prompt format as above:
# caption text, the one-word summarization instruction, and an assistant turn
# containing only <|endoftext|> (presumably the embedding anchor position —
# TODO(review): confirm against modeling_unite).
messages_txt = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Pictorial upper view sunset beams light waterfall stone cascade and fresh green tropical trees"},
            {"type": "text", "text": "\nSummary above sentence in one word:"},
        ],
    },
    {
        "role": "assistant",
        "content": [
            {"type": "text", "text": "<|endoftext|>"},
        ],
    }
]
|
|
|
|
|
# Video input: a local .mp4 with per-video sampling controls — frames capped
# at 360*420 pixels each, sampled at 1 fps, at most 32 frames. The assistant
# turn again ends with <|endoftext|>, matching the text-side format.
messages_vid = [
    {
        "role": "user",
        "content": [
            {
                "type": "video",
                "video": "./examples/stock-footage-pictorial-upper-view-sunset-beams-light-waterfall-stone-cascade-and-fresh-green-tropical-trees.mp4",
                "max_pixels": 360 * 420,
                "fps": 1,
                "max_frames": 32
            },
            {"type": "text", "text": "\nSummary above video in one word:"},
        ],
    },
    {
        "role": "assistant",
        "content": [
            {"type": "text", "text": "<|endoftext|>"},
        ],
    }
]
|
|
|
|
|
# Embed the caption and the video clip under a single no-grad context.
with torch.no_grad():
    inputs_txt = process_messages(messages_txt)
    embeddings_txt = model(**inputs_txt)
    inputs_vid = process_messages(messages_vid)
    embeddings_vid = model(**inputs_vid)

# Text-to-video similarity score (dot product of the embeddings).
print(embeddings_txt @ embeddings_vid.T)
|
|
|
|
|
|
|
|
|
|
|
# Fused (interleaved) query: an image plus a question about it, summarized
# jointly — note the instruction says "sentence and image". The assistant
# turn holds only <|endoftext|>, as in the other prompt formats.
messages_qry = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "./examples/1408px-Lilium_philadelphicum_var._philadelphicum.jpg"},
            {"type": "text", "text": "What part of the us is this plant native to?"},
            {"type": "text", "text": "\nSummary above sentence and image in one word:"},
        ],
    },
    {
        "role": "assistant",
        "content": [
            {"type": "text", "text": "<|endoftext|>"},
        ],
    }
]
|
|
|
|
|
# Retrieval target: the short answer text ("Midwestern") in the standard
# text-summarization prompt format, to be matched against the fused query.
messages_tgt = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Midwestern"},
            {"type": "text", "text": "\nSummary above sentence in one word:"},
        ],
    },
    {
        "role": "assistant",
        "content": [
            {"type": "text", "text": "<|endoftext|>"},
        ],
    }
]
|
|
|
|
|
# Embed the fused image+text query and the candidate answer text.
with torch.no_grad():
    inputs_qry = process_messages(messages_qry)
    embeddings_qry = model(**inputs_qry)
    inputs_tgt = process_messages(messages_tgt)
    embeddings_tgt = model(**inputs_tgt)

# Query-to-target similarity score (dot product of the embeddings).
print(embeddings_qry @ embeddings_tgt.T)
|
|
|