"""Gradio app that turns an image into a social-media style caption.

Pipeline:
    1. ``Image2Text`` describes the image with the GIT (``git-large-coco``) model.
    2. ``Social_Media_Captioner`` rewrites that description into a caption
       with a 4-bit quantized Falcon-7B-Instruct model.
    3. ``Captions`` chains both steps and is exposed through a Gradio UI.
"""

import json
import os
import traceback
from pprint import pprint

import bitsandbytes as bnb
import gradio as gr
import pandas as pd
import requests
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset
from huggingface_hub import login, notebook_login
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from PIL import Image
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoProcessor,
    AutoTokenizer,
    BitsAndBytesConfig,
)

# Authenticate with the Hugging Face Hub only when a token is configured;
# calling ``login(token=None)`` would fall back to an interactive prompt,
# which blocks a headless deployment.
_hf_token = os.getenv("HF_TOKEN")
if _hf_token:
    login(token=_hf_token)

os.environ["CUDA_VISIBLE_DEVICES"] = "0"


class Image2Text:
    """Generate a plain-language description of an image with GIT."""

    MODEL_NAME = "microsoft/git-large-coco"
    # URL used when ``use_sample_image=True``. No sample URL is configured
    # here; set one before relying on that flag.
    SAMPLE_IMAGE_URL = ""
    # Seconds to wait for the image download before giving up.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        # Load the GIT coco processor and model.
        self.preprocessor = AutoProcessor.from_pretrained(self.MODEL_NAME)
        self.model = AutoModelForCausalLM.from_pretrained(self.MODEL_NAME)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def image_description(
        self,
        image_url,
        max_length=50,
        temperature=0.1,
        use_sample_image=False,
    ):
        """Download an image and generate a description for it.

        Parameters
        ----------
        image_url : str
            URL of the image to describe.
        max_length : int
            Maximum length of the generated description.
        temperature : float
            Accepted for interface compatibility; not used by the
            generation call.
        use_sample_image : bool
            When true, ``SAMPLE_IMAGE_URL`` is used instead of ``image_url``.

        Returns
        -------
        str | None
            The stripped description, or ``None`` when generation fails
            (the error is printed together with its traceback).

        Raises
        ------
        ValueError
            If there is no URL to download from.
        requests.RequestException
            If the download fails or the server returns an error status.
        """
        if use_sample_image:
            image_url = self.SAMPLE_IMAGE_URL
        if not image_url:
            raise ValueError("An image URL is required to generate a description")

        response = requests.get(
            image_url, stream=True, timeout=self.REQUEST_TIMEOUT
        )
        response.raise_for_status()
        # Let urllib3 undo any Content-Encoding before PIL reads the bytes.
        response.raw.decode_content = True
        image = Image.open(response.raw)

        # Generate the description with the GIT coco model (best effort).
        try:
            return self._generate_description(image, max_length, False).strip()
        except Exception as e:
            print(e)
            traceback.print_exc()
            return None

    def _generate_description(
        self,
        image,
        max_length=50,
        use_float_16=False,
    ):
        """Generate a description for an already-loaded image.

        Parameters
        ----------
        image : PIL.Image
            The image to describe.
        max_length : int
            Maximum length of the generated description.
        use_float_16 : bool
            Accepted for interface compatibility; currently has no effect.

        Returns
        -------
        str
            The generated description.
        """
        inputs = self.preprocessor(images=image, return_tensors="pt")
        # ``generate`` needs the pixel tensor itself (not the processor's
        # output container), placed on the same device as the model.
        pixel_values = inputs.pixel_values.to(self.model.device)
        generated_ids = self.model.generate(
            pixel_values=pixel_values,
            max_length=max_length,
        )
        decoded = self.preprocessor.batch_decode(
            generated_ids, skip_special_tokens=True
        )
        return decoded[0]


class Social_Media_Captioner:
    """Rewrite an image description into a caption with Falcon-7B-Instruct."""

    def __init__(self, use_finetuned: bool = True, temp=0.1):
        """Load the model and configure generation.

        Parameters
        ----------
        use_finetuned : bool
            Recorded on the instance and shown by ``__repr__``. The PEFT
            adapter is not loaded at the moment, so this flag does not
            change which weights are used.
        temp : float
            Sampling temperature stored in the generation config.
        """
        self.use_finetuned = use_finetuned
        self.MODEL_NAME = "vilsonrodrigues/falcon-7b-instruct-sharded"
        self.peft_model_name = "ayush-vatsal/caption_qlora_finetune"
        self.model_loaded = False
        self.device = "cuda:0"

        self._load_model()

        self.generation_config = self.model.generation_config
        self.generation_config.max_new_tokens = 50
        self.generation_config.temperature = temp
        self.generation_config.top_p = 0.7
        self.generation_config.num_return_sequences = 1
        self.generation_config.pad_token_id = self.tokenizer.eos_token_id
        self.generation_config.eos_token_id = self.tokenizer.eos_token_id

        # [{"image_description": "A man", "caption": ["A man"]}]
        self.cache: list[dict] = []

    def _load_model(self):
        """Load the 4-bit quantized base model and its tokenizer."""
        self.bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            llm_int8_enable_fp32_cpu_offload=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            self.MODEL_NAME,
            device_map="auto",
            trust_remote_code=True,
            quantization_config=self.bnb_config,
        )

        # Defining the tokenizers
        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_NAME)
        self.tokenizer.pad_token = self.tokenizer.eos_token

        # NOTE: loading the fine-tuned adapter (``self.peft_model_name``) is
        # disabled. The previous, commented-out implementation resolved the
        # base model through ``PeftConfig.from_pretrained`` and wrapped it
        # with ``PeftModel.from_pretrained(self.model, self.peft_model_name)``.

        self.model_loaded = True
        print("Model Loaded successfully")

    def inference(
        self, input_text: str, use_cached=True, cache_generation=True
    ) -> str | None:
        """Generate a caption for an image description.

        Parameters
        ----------
        input_text : str
            The image description to convert.
        use_cached : bool
            Return a previously generated caption for the same description
            when one exists.
        cache_generation : bool
            Store the newly generated caption in ``self.cache``.

        Returns
        -------
        str | None
            The caption, or ``None`` if anything fails during generation
            (the error is printed).

        Raises
        ------
        Exception
            If the model has not been loaded.
        """
        if not self.model_loaded:
            raise Exception("Model not loaded")

        try:
            prompt = Social_Media_Captioner._prompt(input_text)

            cached_entry = next(
                (
                    item
                    for item in self.cache
                    if item['image_description'] == input_text
                ),
                None,
            )
            if use_cached and cached_entry is not None and cached_entry['caption']:
                # Each entry holds a list of captions; return a single
                # string (the most recent one), as the signature promises.
                return cached_entry['caption'][-1]

            encoding = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            with torch.inference_mode():
                outputs = self.model.generate(
                    input_ids=encoding.input_ids,
                    attention_mask=encoding.attention_mask,
                    generation_config=self.generation_config,
                )

            decoded = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            # Keep only the text between the quotes that follow 'Caption: "'.
            generated_caption = decoded.split('Caption: "')[-1].split('"')[0]

            if cache_generation:
                if cached_entry is not None:
                    cached_entry['caption'].append(generated_caption)
                else:
                    self.cache.append({
                        'image_description': input_text,
                        'caption': [generated_caption],
                    })

            return generated_caption
        except Exception as e:
            print(e)
            return None

    @staticmethod
    def _prompt(input_text="A man walking alone in the road"):
        """Build the instruction prompt for ``input_text``.

        Raises
        ------
        Exception
            If ``input_text`` is ``None``.
        """
        if input_text is None:
            raise Exception("Enter a valid input text to generate a valid prompt")

        return f"""
Convert the given image description to a appropriate metaphoric caption
Description: {input_text}
Caption:
""".strip()

    @staticmethod
    def get_trainable_parameters(model):
        """Return a summary string of trainable vs. total parameter counts."""
        trainable_params = 0
        all_param = 0
        for _, param in model.named_parameters():
            count = param.numel()
            all_param += count
            if param.requires_grad:
                trainable_params += count

        # Guard the percentage against a model that has no parameters.
        percentage = 100 * trainable_params / all_param if all_param else 0.0
        return f"trainable_params: {trainable_params} || all_params: {all_param} || Percentage of trainable params: {percentage}"

    def __repr__(self):
        return f"""
        Base Model Name: {self.MODEL_NAME}
        PEFT Model Name: {self.peft_model_name}
        Using PEFT Finetuned Model: {self.use_finetuned}

        Model: {self.model}

        ------------------------------------------------------------

        {Social_Media_Captioner.get_trainable_parameters(self.model)}
        """


class Captions:
    """End-to-end pipeline: image -> description -> social-media caption."""

    def __init__(self, use_finetuned_LLM: bool = True, temp_LLM=0.1):
        self.image_to_text = Image2Text()
        self.LLM = Social_Media_Captioner(use_finetuned_LLM, temp_LLM)

    def generate_captions(
        self,
        image,
        image_url=None,
        max_length_GIT=50,
        temperature_GIT=0.1,
        use_sample_image_GIT=False,
        use_cached_LLM=True,
        cache_generation_LLM=True,
    ):
        """Caption an image given either a PIL image or an image URL.

        Parameters
        ----------
        image : PIL.Image
            Image to caption; used when ``image_url`` is falsy.
        image_url : str | None
            When truthy, the image is downloaded from this URL instead.
        max_length_GIT, temperature_GIT, use_sample_image_GIT
            Forwarded to ``Image2Text.image_description`` (URL path);
            only ``max_length_GIT`` is used on the PIL-image path.
        use_cached_LLM, cache_generation_LLM
            Forwarded to ``Social_Media_Captioner.inference``.

        Returns
        -------
        str | None
            The caption, or ``None`` if the description or the caption
            could not be generated.
        """
        if image_url:
            image_description = self.image_to_text.image_description(
                image_url,
                max_length=max_length_GIT,
                temperature=temperature_GIT,
                use_sample_image=use_sample_image_GIT,
            )
        else:
            image_description = self.image_to_text._generate_description(
                image, max_length=max_length_GIT
            )

        # Without a description there is nothing for the LLM to rewrite.
        if image_description is None:
            return None

        captions = self.LLM.inference(
            image_description,
            use_cached=use_cached_LLM,
            cache_generation=cache_generation_LLM,
        )
        return captions


caption_generator = Captions()


def setup(image):
    """Gradio callback: caption the uploaded PIL image."""
    return caption_generator.generate_captions(image=image)


# ``gr.inputs`` / ``gr.outputs`` are legacy namespaces; the top-level
# components are the supported equivalents.
iface = gr.Interface(
    fn=setup,
    inputs=gr.Image(type="pil", label="Upload Image"),
    outputs=gr.Textbox(label="Caption"),
)

iface.launch()