import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import warnings
# silence library warnings and progress bars
transformers.logging.set_verbosity_error()
transformers.logging.disable_progress_bar()
warnings.filterwarnings("ignore")
# set device
device = "cuda"  # or "cpu" if no GPU is available
# create model
model = AutoModelForCausalLM.from_pretrained(
    "Zero-Vision/Llama-3-MixSense",
    torch_dtype=torch.float16,  # use torch.float32 on CPU
    device_map="auto",
    trust_remote_code=True,
)
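# device_map="auto" lets transformers/accelerate place the weights, so only
# the inputs below need an explicit .to(device).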
tokenizer = AutoTokenizer.from_pretrained(
    "Zero-Vision/Llama-3-MixSense",
    trust_remote_code=True,
)
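# Note: text_process and image_process below are helpers defined by this
# model's remote code (loaded via trust_remote_code=True); they are not part
# of the standard transformers API.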
qs = "Describe the image in detail."
input_ids = model.text_process(qs, tokenizer).to(device)
image = Image.open("example.jpg")
image_tensor = model.image_process([image]).to(dtype=model.dtype, device=device)
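# <|eot_id|> is Llama-3's end-of-turn token; passing it alongside
# eos_token_id below makes generation stop at the end of the assistant turn.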
# generate
with torch.inference_mode():
    output_ids = model.generate(
        input_ids,
        images=image_tensor,
        max_new_tokens=2048,
        use_cache=True,
        eos_token_id=[
            tokenizer.eos_token_id,
            tokenizer.convert_tokens_to_ids(["<|eot_id|>"])[0],
        ],
    )
print(tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip())
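# A minimal sketch of a CPU-only setup, per the comments above (assumes the
# same remote-code API; shown commented out so the script above runs as-is):
#
#   device = "cpu"
#   model = AutoModelForCausalLM.from_pretrained(
#       "Zero-Vision/Llama-3-MixSense",
#       torch_dtype=torch.float32,  # float16 is poorly supported on CPU
#       trust_remote_code=True,
#   )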