Spaces:
Runtime error
Runtime error
| from transformers import AutoProcessor, AutoModelForCausalLM, BitsAndBytesConfig | |
| import torch | |
| from PIL import Image | |
| import requests | |
| import traceback | |
| import os | |
| from huggingface_hub import login | |
| login(token=os.getenv("HF_TOKEN")) | |
| class Image2Text: | |
| def __init__(self): | |
| # Load the GIT coco model | |
| preprocessor_git_large_coco = AutoProcessor.from_pretrained("microsoft/git-large-coco") | |
| model_git_large_coco = AutoModelForCausalLM.from_pretrained("microsoft/git-large-coco") | |
| self.device = "cuda" if torch.cuda.is_available() else "cpu" | |
| self.preprocessor = preprocessor_git_large_coco | |
| self.model = model_git_large_coco | |
| self.model.to(self.device) | |
| def image_description( | |
| self, | |
| image_url, | |
| max_length=50, | |
| temperature=0.1, | |
| use_sample_image=False, | |
| ): | |
| """ | |
| Generate captions for the given image. | |
| ----- | |
| Parameters | |
| image_url: Image URL | |
| The image to generate captions for. | |
| max_length: int | |
| The max length of the generated descriptions. | |
| ----- | |
| Returns | |
| str | |
| The generated image description. | |
| """ | |
| caption_git_large_coco = "" | |
| if use_sample_image: | |
| image_url = "http://images.cocodataset.org/val2017/000000039769.jpg" | |
| image = Image.open(requests.get(image_url, stream=True).raw) | |
| # Generate captions for the image using the GIT coco model | |
| try: | |
| caption_git_large_coco = self._generate_description(image, max_length, False).strip() | |
| return caption_git_large_coco | |
| except Exception as e: | |
| print(e) | |
| traceback.print_exc() | |
| def _generate_description( | |
| self, | |
| image, | |
| max_length=50, | |
| use_float_16=False, | |
| ): | |
| """ | |
| Generate captions for the given image. | |
| ----- | |
| Parameters | |
| image: PIL.Image | |
| The image to generate captions for. | |
| max_length: int | |
| The max length of the generated descriptions. | |
| use_float_16: bool | |
| Whether to use float16 precision. This can speed up inference, but may lead to worse results. | |
| ----- | |
| Returns | |
| str | |
| The generated caption. | |
| """ | |
| # inputs = preprocessor(image, return_tensors="pt").to(device) | |
| pixel_values = self.preprocessor(images=image, return_tensors="pt").pixel_values.to(self.device) | |
| generated_ids = self.model.generate( | |
| pixel_values=pixel_values, | |
| max_length=max_length, | |
| ) | |
| generated_caption = self.preprocessor.batch_decode(generated_ids, skip_special_tokens=True)[0] | |
| return generated_caption | |
| import json | |
| from pprint import pprint | |
| import bitsandbytes as bnb | |
| import pandas as pd | |
| import torch | |
| import torch.nn as nn | |
| import transformers | |
| from datasets import load_dataset | |
| from huggingface_hub import notebook_login | |
| from peft import ( | |
| LoraConfig , | |
| PeftConfig , | |
| PeftModel , | |
| get_peft_model , | |
| prepare_model_for_kbit_training, | |
| ) | |
| from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig | |
| from peft import LoraConfig, get_peft_model | |
| os.environ["CUDA_VISIBLE_DEVICES"] = "0" | |
| class Social_Media_Captioner: | |
| def __init__(self, use_finetuned: bool=True, temp=0.1): | |
| self.use_finetuned = use_finetuned | |
| self.MODEL_NAME = "vilsonrodrigues/falcon-7b-instruct-sharded" | |
| self.peft_model_name = "ayush-vatsal/caption_qlora_finetune" | |
| self.model_loaded = False | |
| self.device = "cuda:0" | |
| self._load_model() | |
| self.generation_config = self.model.generation_config | |
| self.generation_config.max_new_tokens = 50 | |
| self.generation_config.temperature = temp | |
| self.generation_config.top_p = 0.7 | |
| self.generation_config.num_return_sequences = 1 | |
| self.generation_config.pad_token_id = self.tokenizer.eos_token_id | |
| self.generation_config.eos_token_id = self.tokenizer.eos_token_id | |
| self.cache: list[dict] = [] # [{"image_decription": "A man", "caption": ["A man"]}] | |
| def _load_model(self): | |
| self.bnb_config = BitsAndBytesConfig( | |
| load_in_4bit = True, | |
| llm_int8_enable_fp32_cpu_offload=True, | |
| bnb_4bit_use_double_quant = True, | |
| bnb_4bit_quant_type= "nf4", | |
| bnb_4bit_compute_dtype=torch.bfloat16, | |
| load_in_8bit_fp32_cpu_offload=True | |
| ) | |
| self.model = AutoModelForCausalLM.from_pretrained( | |
| self.MODEL_NAME, | |
| device_map = "auto", | |
| trust_remote_code = True, | |
| quantization_config = self.bnb_config | |
| ) | |
| # Defining the tokenizers | |
| self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_NAME) | |
| self.tokenizer.pad_token = self.tokenizer.eos_token | |
| # if self.use_finetuned: | |
| # # LORA Config Model | |
| # self.lora_config = LoraConfig( | |
| # r=16, | |
| # lora_alpha=32, | |
| # target_modules=["query_key_value"], | |
| # lora_dropout=0.05, | |
| # bias="none", | |
| # task_type="CAUSAL_LM" | |
| # ) | |
| # self.model = get_peft_model(self.model, self.lora_config) | |
| # # Fitting the adapters | |
| # self.peft_config = PeftConfig.from_pretrained(self.peft_model_name) | |
| # self.model = AutoModelForCausalLM.from_pretrained( | |
| # self.peft_config.base_model_name_or_path, | |
| # return_dict = True, | |
| # quantization_config = self.bnb_config, | |
| # device_map= "auto", | |
| # trust_remote_code = True | |
| # ) | |
| # self.model = PeftModel.from_pretrained(self.model, self.peft_model_name) | |
| # # Defining the tokenizers | |
| # self.tokenizer = AutoTokenizer.from_pretrained(self.peft_config.base_model_name_or_path) | |
| # self.tokenizer.pad_token = self.tokenizer.eos_token | |
| self.model_loaded = True | |
| print("Model Loaded successfully") | |
| def inference(self, input_text: str, use_cached=True, cache_generation=True) -> str | None: | |
| if not self.model_loaded: | |
| raise Exception("Model not loaded") | |
| try: | |
| prompt = Social_Media_Captioner._prompt(input_text) | |
| if use_cached: | |
| for item in self.cache: | |
| if item['image_description'] == input_text: | |
| return item['caption'] | |
| encoding = self.tokenizer(prompt, return_tensors = "pt").to(self.device) | |
| with torch.inference_mode(): | |
| outputs = self.model.generate( | |
| input_ids = encoding.input_ids, | |
| attention_mask = encoding.attention_mask, | |
| generation_config = self.generation_config | |
| ) | |
| generated_caption = (self.tokenizer.decode(outputs[0], skip_special_tokens=True).split('Caption: "')[-1]).split('"')[0] | |
| if cache_generation: | |
| for item in self.cache: | |
| if item['image_description'] == input_text: | |
| item['caption'].append(generated_caption) | |
| break | |
| else: | |
| self.cache.append({ | |
| 'image_description': input_text, | |
| 'caption': [generated_caption] | |
| }) | |
| return generated_caption | |
| except Exception as e: | |
| print(e) | |
| return None | |
| def _prompt(input_text="A man walking alone in the road"): | |
| if input_text is None: | |
| raise Exception("Enter a valid input text to generate a valid prompt") | |
| return f""" | |
| Convert the given image description to a appropriate metaphoric caption | |
| Description: {input_text} | |
| Caption: | |
| """.strip() | |
| def get_trainable_parameters(model): | |
| trainable_params = 0 | |
| all_param = 0 | |
| for _, param in model.named_parameters(): | |
| all_param += param.numel() | |
| if param.requires_grad: | |
| trainable_params += param.numel() | |
| return f"trainable_params: {trainable_params} || all_params: {all_param} || Percentage of trainable params: {100*trainable_params / all_param}" | |
| def __repr__(self): | |
| return f""" | |
| Base Model Name: {self.MODEL_NAME} | |
| PEFT Model Name: {self.peft_model_name} | |
| Using PEFT Finetuned Model: {self.use_finetuned} | |
| Model: {self.model} | |
| ------------------------------------------------------------ | |
| {Social_Media_Captioner.get_trainable_parameters(self.model)} | |
| """ | |
| class Captions: | |
| def __init__(self, use_finetuned_LLM: bool=True, temp_LLM=0.1): | |
| self.image_to_text = Image2Text() | |
| self.LLM = Social_Media_Captioner(use_finetuned_LLM, temp_LLM) | |
| def generate_captions( | |
| self, | |
| image, | |
| image_url=None, | |
| max_length_GIT=50, | |
| temperature_GIT=0.1, | |
| use_sample_image_GIT=False, | |
| use_cached_LLM=True, | |
| cache_generation_LLM=True | |
| ): | |
| if image_url: | |
| image_description = self.image_to_text.image_description(image_url, max_length=max_length_GIT, temperature=temperature_GIT, use_sample_image=use_sample_image_GIT) | |
| else: | |
| image_description = self.image_to_text._generate_description(image, max_length=max_length_GIT) | |
| captions = self.LLM.inference(image_description, use_cached=use_cached_LLM, cache_generation=cache_generation_LLM) | |
| return captions | |
| caption_generator = Captions() | |
| import gradio as gr | |
| def setup(image): | |
| return caption_generator.generate_captions(image = image) | |
| iface = gr.Interface( | |
| fn=setup, | |
| inputs=gr.inputs.Image(type="pil", label="Upload Image"), | |
| outputs=gr.outputs.Textbox(label="Caption") | |
| ) | |
| iface.launch() |