Spaces:

Nemil
/

ImageCaptionGenerator

Runtime error

App Files Files Community

ImageCaptionGenerator / app.py

Nemil

Upload app.py

83ff097 verified about 1 year ago

raw

history blame contribute delete

10.1 kB

	from transformers import AutoProcessor, AutoModelForCausalLM, BitsAndBytesConfig
	import torch
	from PIL import Image
	import requests
	import traceback
	import os

	from huggingface_hub import login
	# Authenticate with the Hugging Face Hub only when a token is configured.
	# login(token=None) falls back to an interactive prompt, which cannot be
	# answered in a headless deployment, so skip the call when HF_TOKEN is unset.
	_hf_token = os.getenv("HF_TOKEN")
	if _hf_token:
	    login(token=_hf_token)

	class Image2Text:
	    """Describe images with the pretrained ``microsoft/git-large-coco`` model.

	    The processor and the model are loaded once in ``__init__``; the model is
	    moved to CUDA when ``torch.cuda.is_available()`` is true, else to the CPU.
	    """

	    MODEL_ID = "microsoft/git-large-coco"
	    SAMPLE_IMAGE_URL = "http://images.cocodataset.org/val2017/000000039769.jpg"
	    # Seconds to wait for the image server before giving up.
	    REQUEST_TIMEOUT = 30

	    def __init__(self):
	        self.device = "cuda" if torch.cuda.is_available() else "cpu"

	        # Load the GIT coco processor/model pair
	        self.preprocessor = AutoProcessor.from_pretrained(self.MODEL_ID)
	        self.model = AutoModelForCausalLM.from_pretrained(self.MODEL_ID)
	        self.model.to(self.device)

	    def image_description(
	        self,
	        image_url,
	        max_length=50,
	        temperature=0.1,
	        use_sample_image=False,
	    ):
	        """
	        Download an image and generate a description for it.

	        -----
	        Parameters
	        image_url: str
	            URL of the image to describe. Ignored when ``use_sample_image``
	            is true.
	        max_length: int
	            The max length of the generated description.
	        temperature: float
	            Accepted for interface compatibility; not used by this method.
	        use_sample_image: bool
	            When true, describe the bundled COCO sample image instead of
	            ``image_url``.

	        -----
	        Returns
	        str or None
	            The stripped description, or ``None`` when generation failed
	            (the error and its traceback are printed).

	        -----
	        Raises
	        Download and image-decoding errors (e.g. ``requests`` exceptions)
	        propagate to the caller; only generation errors are swallowed.
	        """
	        if use_sample_image:
	            image_url = self.SAMPLE_IMAGE_URL

	        image = self._download_image(image_url)

	        # Generate the description using the GIT coco model
	        try:
	            return self._generate_description(image, max_length, False).strip()
	        except Exception as e:
	            print(e)
	            traceback.print_exc()
	            return None

	    def _download_image(self, image_url):
	        """Fetch ``image_url`` and return it as a fully loaded PIL image."""
	        with requests.get(image_url, stream=True, timeout=self.REQUEST_TIMEOUT) as response:
	            # Fail on 4xx/5xx instead of handing an error page to PIL.
	            response.raise_for_status()
	            # Have urllib3 undo any Content-Encoding before PIL reads the bytes.
	            response.raw.decode_content = True
	            image = Image.open(response.raw)
	            # Force decoding now, while the connection is still open.
	            image.load()
	        return image

	    def _generate_description(
	        self,
	        image,
	        max_length=50,
	        use_float_16=False,
	    ):
	        """
	        Generate a description for an already loaded image.

	        -----
	        Parameters
	        image: PIL.Image
	            The image to describe.
	        max_length: int
	            The max length of the generated description.
	        use_float_16: bool
	            Accepted for interface compatibility; not used by this method.

	        -----
	        Returns
	        str
	            The first decoded sequence, with special tokens removed.
	        """
	        model_inputs = self.preprocessor(images=image, return_tensors="pt")
	        pixel_values = model_inputs.pixel_values.to(self.device)
	        generated_ids = self.model.generate(
	            pixel_values=pixel_values,
	            max_length=max_length,
	        )
	        decoded = self.preprocessor.batch_decode(generated_ids, skip_special_tokens=True)
	        return decoded[0]

	import json
	from pprint import pprint

	import bitsandbytes as bnb
	import pandas as pd

	import torch
	import torch.nn as nn
	import transformers
	from datasets import load_dataset
	from huggingface_hub import notebook_login
	from peft import (
	LoraConfig ,
	PeftConfig ,
	PeftModel ,
	get_peft_model ,
	prepare_model_for_kbit_training,
	)
	from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig
	from peft import LoraConfig, get_peft_model


	# Limit CUDA to the GPU with index 0 via the CUDA_VISIBLE_DEVICES variable.
	# NOTE(review): this runs after `import torch`; presumably it still takes
	# effect because no CUDA call has been made yet at this point — confirm.
	os.environ["CUDA_VISIBLE_DEVICES"] = "0"

	class Social_Media_Captioner:
	def __init__(self, use_finetuned: bool=True, temp=0.1):
	self.use_finetuned = use_finetuned
	self.MODEL_NAME = "vilsonrodrigues/falcon-7b-instruct-sharded"
	self.peft_model_name = "ayush-vatsal/caption_qlora_finetune"
	self.model_loaded = False
	self.device = "cuda:0"

	self._load_model()

	self.generation_config = self.model.generation_config
	self.generation_config.max_new_tokens = 50
	self.generation_config.temperature = temp
	self.generation_config.top_p = 0.7
	self.generation_config.num_return_sequences = 1
	self.generation_config.pad_token_id = self.tokenizer.eos_token_id
	self.generation_config.eos_token_id = self.tokenizer.eos_token_id

	self.cache: list[dict] = [] # [{"image_decription": "A man", "caption": ["A man"]}]


	def _load_model(self):
	self.bnb_config = BitsAndBytesConfig(
	load_in_4bit = True,
	llm_int8_enable_fp32_cpu_offload=True,
	bnb_4bit_use_double_quant = True,
	bnb_4bit_quant_type= "nf4",
	bnb_4bit_compute_dtype=torch.bfloat16,
	load_in_8bit_fp32_cpu_offload=True
	)
	self.model = AutoModelForCausalLM.from_pretrained(
	self.MODEL_NAME,
	device_map = "auto",
	trust_remote_code = True,
	quantization_config = self.bnb_config
	)

	# Defining the tokenizers
	self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_NAME)
	self.tokenizer.pad_token = self.tokenizer.eos_token

	# if self.use_finetuned:
	# # LORA Config Model
	# self.lora_config = LoraConfig(
	# r=16,
	# lora_alpha=32,
	# target_modules=["query_key_value"],
	# lora_dropout=0.05,
	# bias="none",
	# task_type="CAUSAL_LM"
	# )
	# self.model = get_peft_model(self.model, self.lora_config)

	# # Fitting the adapters
	# self.peft_config = PeftConfig.from_pretrained(self.peft_model_name)
	# self.model = AutoModelForCausalLM.from_pretrained(
	# self.peft_config.base_model_name_or_path,
	# return_dict = True,
	# quantization_config = self.bnb_config,
	# device_map= "auto",
	# trust_remote_code = True
	# )
	# self.model = PeftModel.from_pretrained(self.model, self.peft_model_name)

	# # Defining the tokenizers
	# self.tokenizer = AutoTokenizer.from_pretrained(self.peft_config.base_model_name_or_path)
	# self.tokenizer.pad_token = self.tokenizer.eos_token

	self.model_loaded = True
	print("Model Loaded successfully")

	def inference(self, input_text: str, use_cached=True, cache_generation=True) -> str \| None:
	if not self.model_loaded:
	raise Exception("Model not loaded")

	try:
	prompt = Social_Media_Captioner._prompt(input_text)
	if use_cached:
	for item in self.cache:
	if item['image_description'] == input_text:
	return item['caption']

	encoding = self.tokenizer(prompt, return_tensors = "pt").to(self.device)
	with torch.inference_mode():
	outputs = self.model.generate(
	input_ids = encoding.input_ids,
	attention_mask = encoding.attention_mask,
	generation_config = self.generation_config
	)
	generated_caption = (self.tokenizer.decode(outputs[0], skip_special_tokens=True).split('Caption: "')[-1]).split('"')[0]

	if cache_generation:
	for item in self.cache:
	if item['image_description'] == input_text:
	item['caption'].append(generated_caption)
	break
	else:
	self.cache.append({
	'image_description': input_text,
	'caption': [generated_caption]
	})

	return generated_caption
	except Exception as e:
	print(e)
	return None


	def _prompt(input_text="A man walking alone in the road"):
	if input_text is None:
	raise Exception("Enter a valid input text to generate a valid prompt")

	return f"""
	Convert the given image description to a appropriate metaphoric caption
	Description: {input_text}
	Caption:
	""".strip()

	@staticmethod
	def get_trainable_parameters(model):
	trainable_params = 0
	all_param = 0
	for _, param in model.named_parameters():
	all_param += param.numel()
	if param.requires_grad:
	trainable_params += param.numel()
	return f"trainable_params: {trainable_params} \|\| all_params: {all_param} \|\| Percentage of trainable params: {100*trainable_params / all_param}"


	def __repr__(self):
	return f"""
	Base Model Name: {self.MODEL_NAME}
	PEFT Model Name: {self.peft_model_name}
	Using PEFT Finetuned Model: {self.use_finetuned}
	Model: {self.model}

	------------------------------------------------------------

	{Social_Media_Captioner.get_trainable_parameters(self.model)}
	"""

	class Captions:
	    """Two-stage pipeline: describe an image, then turn that into a caption.

	    ``image_to_text`` produces the plain description and ``LLM`` rewrites it.
	    """

	    def __init__(self, use_finetuned_LLM: bool = True, temp_LLM=0.1):
	        self.image_to_text = Image2Text()
	        self.LLM = Social_Media_Captioner(use_finetuned_LLM, temp_LLM)

	    def generate_captions(
	        self,
	        image,
	        image_url=None,
	        max_length_GIT=50,
	        temperature_GIT=0.1,
	        use_sample_image_GIT=False,
	        use_cached_LLM=True,
	        cache_generation_LLM=True
	    ):
	        """
	        Generate a caption for an image.

	        -----
	        Parameters
	        image: PIL.Image or None
	            The image to caption. Used when neither ``image_url`` nor
	            ``use_sample_image_GIT`` is given.
	        image_url: str or None
	            URL of the image to caption; takes precedence over ``image``.
	        max_length_GIT: int
	            Max length of the intermediate image description.
	        temperature_GIT: float
	            Forwarded to ``Image2Text.image_description``.
	        use_sample_image_GIT: bool
	            When true, caption the sample image of ``Image2Text`` instead.
	        use_cached_LLM: bool
	            Forwarded to ``Social_Media_Captioner.inference`` as ``use_cached``.
	        cache_generation_LLM: bool
	            Forwarded to ``Social_Media_Captioner.inference`` as
	            ``cache_generation``.

	        -----
	        Returns
	        Whatever ``self.LLM.inference`` returns, or ``None`` when no image
	        description could be produced.

	        -----
	        Raises
	        ValueError
	            If no image, no URL and no sample-image request was supplied.
	        """
	        if image_url or use_sample_image_GIT:
	            # The sample-image flag is resolved inside image_description, so
	            # this branch must also be taken when only the flag is set.
	            image_description = self.image_to_text.image_description(
	                image_url,
	                max_length=max_length_GIT,
	                temperature=temperature_GIT,
	                use_sample_image=use_sample_image_GIT,
	            )
	        elif image is not None:
	            image_description = self.image_to_text._generate_description(image, max_length=max_length_GIT)
	        else:
	            raise ValueError("Provide an image, an image_url, or set use_sample_image_GIT=True")

	        # image_description() yields None when generation failed; there is
	        # nothing meaningful to send to the LLM in that case.
	        if image_description is None:
	            return None

	        return self.LLM.inference(
	            image_description,
	            use_cached=use_cached_LLM,
	            cache_generation=cache_generation_LLM,
	        )

	# Build the shared pipeline once at import time. Captions.__init__ constructs
	# both Image2Text and Social_Media_Captioner, each of which loads its model.
	caption_generator = Captions()

	import gradio as gr

	def setup(image):
	    """Gradio callback: caption ``image`` with the shared caption generator."""
	    caption = caption_generator.generate_captions(image=image)
	    return caption

	# Web UI: one PIL image in, one caption text box out.
	# The components are taken from the top-level gradio namespace because the
	# legacy gr.inputs / gr.outputs namespaces no longer exist in current Gradio.
	iface = gr.Interface(
	    fn=setup,
	    inputs=gr.Image(type="pil", label="Upload Image"),
	    outputs=gr.Textbox(label="Caption"),
	)

	iface.launch()