LlavaMistral1

Runtime error

App Files Files Community

LlavaMistral1 / script.py

lorocksUMD

Update script.py

43605d2 verified 4 days ago

raw

history blame

4.56 kB

	import gradio as gr
	from huggingface_hub import InferenceClient

	from transformers import AutoTokenizer
	from llava.model.language_model.llava_mistral import LlavaMistralForCausalLM
	from llava.model.builder import load_pretrained_model
	from llava.mm_utils import (
	process_images,
	tokenizer_image_token,
	get_model_name_from_path,
	)
	from llava.constants import (
	IMAGE_TOKEN_INDEX,
	DEFAULT_IMAGE_TOKEN,
	DEFAULT_IM_START_TOKEN,
	DEFAULT_IM_END_TOKEN,
	IMAGE_PLACEHOLDER,
	)
	from llava.conversation import conv_templates, SeparatorStyle

	import argparse
	import torch
	import requests
	from PIL import Image
	from io import BytesIO
	import re

	# Command-line interface: which checkpoint to load, which image(s) to
	# describe, how to place the model on devices, and what to ask.
	parser = argparse.ArgumentParser(
	    description="Run one LLaVA image + text query and print the model's answer."
	)
	parser.add_argument(
	    "--model-path",
	    type=str,
	    default="liuhaotian/llava-v1.6-mistral-7b",
	    help="Model path handed to load_pretrained_model.",
	)
	parser.add_argument(
	    "--image-file",
	    type=str,
	    required=True,
	    help="Image path or http(s) URL; separate several entries with commas.",
	)
	parser.add_argument(
	    "--inference-type",
	    type=str,
	    default="auto",
	    help="Value forwarded as device_map when the model is loaded.",
	)
	parser.add_argument(
	    "--prompt",
	    type=str,
	    default="Explain this image",
	    help="Question or instruction to ask about the image.",
	)
	cmd_args = parser.parse_args()

	# `device` is forwarded as `device_map` when the model is loaded. The
	# tokenized prompt built later in the script has to live on the same device
	# as the model for generation on a GPU.
	device = cmd_args.inference_type

	prompt = cmd_args.prompt
	image_file = cmd_args.image_file

	model_path = cmd_args.model_path



	# Functions for inference
	def image_parser(args):
	    """Split the separator-delimited image list carried by *args*.

	    Args:
	        args: Object exposing ``image_file`` (the delimited string of image
	            paths/URLs) and ``sep`` (the delimiter string).

	    Returns:
	        A list of image locations with surrounding whitespace removed.
	        Empty entries (e.g. from a trailing separator) are dropped so they
	        are never passed on to the image loader.
	    """
	    stripped = (part.strip() for part in args.image_file.split(args.sep))
	    return [entry for entry in stripped if entry]


	def load_image(image_file, timeout=30):
	    """Load one image from a local path or an http(s) URL as RGB.

	    Args:
	        image_file: Filesystem path, or a URL starting with ``http://`` or
	            ``https://``.
	        timeout: Seconds to wait for the HTTP server before giving up; only
	            used for URLs.

	    Returns:
	        The image converted to RGB mode.

	    Raises:
	        requests.HTTPError: If the server answers with an error status.
	    """
	    # Match full URL schemes only, so a local file whose name merely begins
	    # with "http" is still opened from disk.
	    if image_file.startswith(("http://", "https://")):
	        response = requests.get(image_file, timeout=timeout)
	        # Fail on 4xx/5xx here instead of handing an error page to PIL.
	        response.raise_for_status()
	        source = BytesIO(response.content)
	    else:
	        source = image_file

	    with Image.open(source) as opened:
	        return opened.convert("RGB")


	def load_images(image_files):
	    """Load every image named in *image_files*, preserving order.

	    Args:
	        image_files: Iterable of paths/URLs, each passed to ``load_image``.

	    Returns:
	        A list with one loaded image per input entry (empty for no input).
	    """
	    return [load_image(image_file) for image_file in image_files]


	# Derive the model name from the path that is actually loaded, so that
	# `--model-path` also drives everything keyed on the name (it is passed to
	# load_pretrained_model and used later to pick the conversation template).
	# The name was previously derived from a fixed string regardless of
	# `--model-path`.
	model_name = get_model_name_from_path(model_path)

	# Generation settings bundled on one attribute-style object; `conv_mode`
	# starts as None and is filled in once the template has been inferred.
	args = argparse.Namespace(
	    model_path=model_path,
	    model_base=None,
	    model_name=model_name,
	    query=prompt,
	    conv_mode=None,
	    image_file=image_file,
	    sep=",",
	    temperature=0,
	    top_p=None,
	    num_beams=1,
	    max_new_tokens=512,
	)

	# `device` (from --inference-type) is used as the device_map.
	tokenizer, model, image_processor, context_len = load_pretrained_model(
	    model_path, None, model_name, device_map=device
	)

	# --- Build the user turn: make sure the query carries an image token ---
	qs = args.query
	image_token_se = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN

	# The model config decides whether the image token is wrapped in
	# start/end markers or used bare.
	image_token = image_token_se if model.config.mm_use_im_start_end else DEFAULT_IMAGE_TOKEN

	if IMAGE_PLACEHOLDER in qs:
	    # The query marks where the image goes: substitute in place.
	    qs = re.sub(IMAGE_PLACEHOLDER, image_token, qs)
	else:
	    # No placeholder: put the image token on its own line before the query.
	    qs = image_token + "\n" + qs

	# --- Infer the conversation template from the model name ---
	# Order matters: the first marker found in the lowercased name wins.
	conv_mode_markers = (
	    ("llama-2", "llava_llama_2"),
	    ("mistral", "mistral_instruct"),
	    ("v1.6-34b", "chatml_direct"),
	    ("v1", "llava_v1"),
	    ("mpt", "mpt"),
	)
	lowered_model_name = model_name.lower()
	conv_mode = next(
	    (mode for marker, mode in conv_mode_markers if marker in lowered_model_name),
	    "llava_v0",
	)

	if args.conv_mode is None or args.conv_mode == conv_mode:
	    args.conv_mode = conv_mode
	else:
	    # An explicitly requested mode is kept; only report the disagreement.
	    print(
	        "[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}".format(
	            conv_mode, args.conv_mode, args.conv_mode
	        )
	    )

	# --- Render the final prompt: user message, then an empty assistant slot ---
	conv = conv_templates[args.conv_mode].copy()
	user_role, assistant_role = conv.roles[0], conv.roles[1]
	conv.append_message(user_role, qs)
	conv.append_message(assistant_role, None)
	prompt = conv.get_prompt()

	# Load the requested image(s) and preprocess them for the vision tower.
	image_files = image_parser(args)
	images = load_images(image_files)
	image_sizes = [image.size for image in images]
	images_tensor = process_images(images, image_processor, model.config).to(
	    model.device, dtype=torch.float16
	)

	# Tokenize the prompt, add a batch dimension, and place the ids on the same
	# device as the model. `.to(model.device)` works for CPU and GPU alike, so
	# no manual `.cuda()` toggle is needed.
	input_ids = (
	    tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
	    .unsqueeze(0)
	    .to(model.device)
	)

	with torch.inference_mode():
	    output_ids = model.generate(
	        input_ids,
	        images=images_tensor,
	        image_sizes=image_sizes,
	        # Sample only when a positive temperature was requested.
	        do_sample=args.temperature > 0,
	        temperature=args.temperature,
	        top_p=args.top_p,
	        num_beams=args.num_beams,
	        max_new_tokens=args.max_new_tokens,
	        use_cache=True,
	    )

	# Decode the first (only) generated sequence into plain text.
	outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

	# Whitespace-separated word count. The "dataset1" branch previously printed
	# len(outputs), which is the number of characters, not words.
	word_count = len(outputs.split())

	if "dataset1" in image_file:
	    print("Num of words: ", word_count)
	elif "dataset2" in image_file:
	    print()
	else:
	    print("Is single word?", word_count == 1)

	print(outputs)
	# End Llava inference