# -*- coding: utf-8 -*-
"""Image caption generator.

Captions images of humans, cats and/or dogs, and annotates detected
objects with bounding boxes and labels.  Exported from Colaboratory;
original notebook:
https://colab.research.google.com/drive/1kJdblTHuqDn8HCKTuEpoApkN05Gzjpot

Install dependencies before running (bare ``pip install`` lines are not
valid Python syntax, so they live here instead of in the code):

    pip install transformers gradio timm huggingface_hub
"""

import io

import gradio as gr
import matplotlib.pyplot as plt
import requests  # kept from the notebook export; not used directly here
from matplotlib.patches import Rectangle
from PIL import Image
from transformers import AutoProcessor, BlipForConditionalGeneration, pipeline

# BLIP model: generates a natural-language caption for an image.
processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
)

# DETR model: detects objects, returning labels, scores and bounding boxes.
object_detector = pipeline("object-detection", model="facebook/detr-resnet-50")

# Keywords identifying humans, cats and dogs — shared by both filter
# strategies below (previously duplicated in each function).
KEYWORDS = [
    "dog", "dogs", "cat", "cats", "human", "humans", "man", "men",
    "woman", "women", "child", "children", "adult", "adults", "person",
]


def caption_generator(input_img):
    """Generate a caption for *input_img* (a PIL image) using BLIP.

    Returns the decoded caption string with special tokens stripped.
    """
    inputs = processor(input_img, return_tensors="pt")
    out = model.generate(**inputs, max_new_tokens=500)
    return processor.decode(out[0], skip_special_tokens=True)


def filter_caption(object_detection_results):
    """Return True if any detected-object label matches a keyword.

    This is the method used in this project: it checks the object
    detector's labels (e.g. "person", "dog", "cat") for humans, cats
    and/or dogs, rather than scanning the generated caption text.
    """
    labels = [result["label"] for result in object_detection_results]
    return any(keyword in labels for keyword in KEYWORDS)


def filter_by_caption(caption):
    """Return True if the generated caption mentions humans, cats or dogs.

    Initial method considered; superseded by :func:`filter_caption`.
    Renamed from ``filter`` so it no longer shadows the builtin.
    """
    caption = caption.lower()
    return any(keyword in caption for keyword in KEYWORDS)


def create_image_bbx_w_label(image, results):
    """Draw bounding boxes and labels from *results* onto *image*.

    Parameters
    ----------
    image : PIL.Image.Image
        The input image.
    results : list[dict]
        Object-detection results, each with ``box`` (xmin/ymin/xmax/ymax),
        ``label`` and ``score`` keys.

    Returns
    -------
    PIL.Image.Image
        The annotated image.
    """
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.imshow(image)

    # Plot each bounding box with its label and confidence score.
    for res in results:
        box = res["box"]
        width = box["xmax"] - box["xmin"]
        height = box["ymax"] - box["ymin"]
        rect = Rectangle(
            (box["xmin"], box["ymin"]), width, height,
            linewidth=1, edgecolor="r", facecolor="none",
        )
        ax.add_patch(rect)
        # Position the label just above the rectangle.
        label_text = f"{res['label']}: {res['score']:.2f}"
        ax.text(
            box["xmin"], box["ymin"] - 10, label_text,
            color="white", fontsize=8, bbox=dict(facecolor="red", alpha=0.5),
        )

    ax.axis("off")

    # Render to an in-memory buffer instead of a temp file on disk:
    # avoids leftover files and races between concurrent requests.
    buffer = io.BytesIO()
    plt.savefig(buffer, format="png", bbox_inches="tight", pad_inches=0)
    plt.close(fig)
    buffer.seek(0)
    return Image.open(buffer)


def image_caption_generator(input_image):
    """Caption *input_image* and annotate it with detected objects.

    Returns a ``(caption_text, annotated_image)`` pair; the caption is
    replaced by a notice when no human, cat or dog is detected.
    """
    # Detect objects, then draw their boxes/labels onto the image.
    object_detection_results = object_detector(input_image)
    annotated_img = create_image_bbx_w_label(input_image, object_detection_results)

    # Generate the caption of the input image.
    caption = caption_generator(input_image)

    # Filter for the specific case (humans and/or cats/dogs) using the
    # object-detection labels.  To filter on the caption text instead:
    # filtered_caption = filter_by_caption(caption)
    filtered_caption = filter_caption(object_detection_results)

    if filtered_caption:
        return caption, annotated_img
    return "There are no humans, cats or dogs in this image!", annotated_img


demo = gr.Interface(
    fn=image_caption_generator,
    inputs=[gr.Image(label="Upload image", type="pil")],
    outputs=[gr.Textbox(label="Caption"), "image"],
    title="CaptionPlus - Image Caption Generator",
    description="Captioning images of humans, cats and/or dogs with object detection",
    allow_flagging="never",
    examples=["/content/Example.jpg", "/content/OIP.jpg"],
)
demo.launch(share=True)