Llama-3-MixSense / demo.py
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import warnings
# silence library warnings and progress bars
transformers.logging.set_verbosity_error()
transformers.logging.disable_progress_bar()
warnings.filterwarnings("ignore")
# set device
device = "cuda" # or cpu
# create model
model = AutoModelForCausalLM.from_pretrained(
    "Zero-Vision/Llama-3-MixSense",
    torch_dtype=torch.float16,  # float32 for cpu
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(
    "Zero-Vision/Llama-3-MixSense",
    trust_remote_code=True,
)
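# a minimal CPU-only variant, following the comments above (untested sketch):
# device = "cpu"
# model = AutoModelForCausalLM.from_pretrained(
#     "Zero-Vision/Llama-3-MixSense",
#     torch_dtype=torch.float32,  # float16 generally underperforms on cpu
#     trust_remote_code=True,
# ).to(device)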
qs = "describe the image detailly."
input_ids = model.text_process(qs, tokenizer).to(device)
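# load the example image and convert it to the model's input tensor
# (image_process is likewise a remote-code helper)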
image = Image.open("example.jpg")
image_tensor = model.image_process([image]).to(dtype=model.dtype, device=device)
# generate
with torch.inference_mode():
    output_ids = model.generate(
        input_ids,
        images=image_tensor,
        max_new_tokens=2048,
        use_cache=True,
        # stop on either the default EOS token or Llama-3's <|eot_id|> turn terminator
        eos_token_id=[
            tokenizer.eos_token_id,
            tokenizer.convert_tokens_to_ids(["<|eot_id|>"])[0],
        ],
    )
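# decode the generated ids and print the model's answer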
print(tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip())