# Credit for this script goes to @ariG23498, who opened a PR on the apple/ml-fastvlm repo to add this model to Hugging Face Transformers.
# The Apple team still needs to convert the weights for it to be officially available.
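#
# Example invocation (a minimal sketch; the script filename "predict.py" is assumed here).
# Note that the last component of --model-path is used as the repo name under riddhimanrana/
# on the Hugging Face Hub, since the tokenizer, model, and image processor are all loaded
# from f"riddhimanrana/{model_id}" below:
#   python predict.py --model-path ./llava-v1.5-0.5b --image-file ./example.jpg --prompt "Describe the image."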
import os
import argparse
import torch
from PIL import Image
from llava.conversation import conv_templates
from llava.mm_utils import tokenizer_image_token, process_images
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from transformers import AutoTokenizer, AutoModelForCausalLM, CLIPImageProcessor
def predict(args):
    model_id = args.model_path.split("/")[-1]
    print(f"{model_id=}")

    # Remove generation config from model folder
    # to read generation parameters from args
    model_path = os.path.expanduser(args.model_path)
    generation_config = None
    if os.path.exists(os.path.join(model_path, 'generation_config.json')):
        generation_config = os.path.join(model_path, '.generation_config.json')
        os.rename(os.path.join(model_path, 'generation_config.json'),
                  generation_config)

    tokenizer = AutoTokenizer.from_pretrained(f"riddhimanrana/{model_id}")
    model = AutoModelForCausalLM.from_pretrained(f"riddhimanrana/{model_id}", torch_dtype=torch.float16, device_map="cuda")
    image_processor = CLIPImageProcessor.from_pretrained(f"riddhimanrana/{model_id}")

    # Construct prompt
    qs = args.prompt
    if model.config.mm_use_im_start_end:
        qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
    else:
        qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
    conv = conv_templates[args.conv_mode].copy()
    conv.append_message(conv.roles[0], qs)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    # Set the pad token id for generation
    model.generation_config.pad_token_id = tokenizer.pad_token_id

    # Tokenize prompt
    input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(torch.device("cuda"))

    # Load and preprocess image
    image = Image.open(args.image_file).convert('RGB')
    image_tensor = process_images([image], image_processor, model.config)[0]

    # Run inference
    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=image_tensor.unsqueeze(0).half(),
            image_sizes=[image.size],
            do_sample=True if args.temperature > 0 else False,
            temperature=args.temperature,
            top_p=args.top_p,
            num_beams=args.num_beams,
            max_new_tokens=256,
            use_cache=True)

    outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
    print(outputs)

    # Restore generation config
    if generation_config is not None:
        os.rename(generation_config, os.path.join(model_path, 'generation_config.json'))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", type=str, default="./llava-v1.5-0.5b")
    parser.add_argument("--model-base", type=str, default=None)
    parser.add_argument("--image-file", type=str, default=None, help="location of image file")
    parser.add_argument("--prompt", type=str, default="Describe the image.", help="Prompt for VLM.")
    parser.add_argument("--conv-mode", type=str, default="qwen_2")
    parser.add_argument("--temperature", type=float, default=0.0)
    parser.add_argument("--top_p", type=float, default=None)
    parser.add_argument("--num_beams", type=int, default=1)
    args = parser.parse_args()

    predict(args)