# CaptionQuest / app.py
import re

import torch
import spacy
import textstat
from PIL import Image
from torchvision import transforms
from transformers import AutoProcessor, AutoModelForCausalLM, pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity

# Initialize the processor and model for the large COCO model
processor = AutoProcessor.from_pretrained("microsoft/git-large-coco")
model = AutoModelForCausalLM.from_pretrained("microsoft/git-large-coco")
detection_pipe = pipeline("object-detection", model="facebook/detr-resnet-50")
classification_pipe = pipeline("zero-shot-image-classification", model="openai/clip-vit-large-patch14")
# Initialize the pipeline for the VIT model
vit_pipeline = pipeline(task="image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
# Move the COCO model to the device
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

def generate_text_and_caption(image):
    # Preprocessing pipeline for the COCO model input.
    # Note: the model's own processor could be used instead, e.g.
    # processor(images=image, return_tensors="pt").pixel_values
    preprocess = transforms.Compose([
        transforms.Resize((256, 256)),   # Resize to 256x256; change to match the required dimensions
        transforms.CenterCrop(224),      # Center crop to 224x224; change to match the required dimensions
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # ImageNet mean and std
    ])

    # Apply the preprocessing pipeline and add a batch dimension
    preprocessed_image = preprocess(image).unsqueeze(0).to(device)

    # Caption from the large COCO (GIT) model
    generated_ids = model.generate(pixel_values=preprocessed_image, max_length=20)
    caption1 = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Caption from the ViT-GPT2 model
    vit_output = vit_pipeline(image)
    caption2_info = vit_output[0] if vit_output else {"generated_text": "N/A"}
    caption2 = caption2_info.get('generated_text', 'N/A')

    return caption1, caption2

def get_unique_refined_labels(image):
    original_output = detection_pipe(image)
    filtered_output = [item for item in original_output if item['score'] >= 0.95]
    unique_refined_labels = {}
    for item in filtered_output:
        box = item['box']
        label = item['label']
        xmin, ymin, xmax, ymax = box['xmin'], box['ymin'], box['xmax'], box['ymax']
        cropped_image = image.crop((xmin, ymin, xmax, ymax))
        # Re-check each detected crop with zero-shot CLIP classification
        predictions = classification_pipe(cropped_image, candidate_labels=[label])
        if predictions:
            top_prediction = sorted(predictions, key=lambda x: x['score'], reverse=True)[0]
            top_label = top_prediction['label']
            top_score = top_prediction['score']
            # Keep only the highest score seen for each label
            if top_label not in unique_refined_labels or unique_refined_labels[top_label] < top_score:
                unique_refined_labels[top_label] = top_score
    return unique_refined_labels, original_output, filtered_output

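# Example of the returned structures (hypothetical image containing a dog and a
# frisbee, both detections clearing the 0.95 threshold):
#   unique_refined_labels -> {"dog": 0.99, "frisbee": 0.97}
#   original_output / filtered_output keep the raw DETR dictionaries, each with
#   'score', 'label', and 'box' keys.
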
# Load the spaCy model used for noun and entity extraction
nlp = spacy.load("en_core_web_sm")


def extract_main_words(text):
    # Return the lemmas of all nouns in the text
    doc = nlp(text)
    main_words = [token.lemma_ for token in doc if token.pos_ == 'NOUN']
    return main_words

def get_topics(text):
    # Vectorize the text
    vectorizer = CountVectorizer()
    text_vec = vectorizer.fit_transform([text])
    # Fit an LDA model to get topics
    lda = LatentDirichletAllocation(n_components=1, random_state=0)
    lda.fit(text_vec)
    # Get the top 10 words per topic (assuming one topic for simplicity)
    feature_names = vectorizer.get_feature_names_out()
    top_words = [feature_names[i] for i in lda.components_[0].argsort()[:-10 - 1:-1]]
    return top_words

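# Note: with a single short caption and n_components=1, the LDA topic weights
# essentially track token counts, so this roughly returns the caption's
# multi-character tokens ranked by frequency, capped at 10. For example
# (hypothetical caption):
#   get_topics("a dog runs through the tall green grass")
#   -> words such as ['dog', 'runs', 'through', 'the', 'tall', 'green', 'grass']
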
def check_readability(caption):
    # Compute the Flesch Reading Ease score of the caption
    reading_ease_score = textstat.flesch_reading_ease(caption)
    return reading_ease_score

def compute_similarity(caption1, caption2):
    vectorizer = TfidfVectorizer().fit_transform([caption1, caption2])
    vectors = vectorizer.toarray()
    cosine_sim = cosine_similarity(vectors)
    # The similarity between the two captions is the off-diagonal value of the cosine_sim matrix
    similarity_score = cosine_sim[0, 1]
    return similarity_score

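# Quick sanity check (hypothetical captions): identical texts score ~1.0,
# captions sharing no words score 0.0.
#   compute_similarity("a dog on the grass", "a dog on the grass")      -> 1.0
#   compute_similarity("a dog on the grass", "two people riding bikes") -> 0.0
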
def evaluate_caption(image, caption1, caption2, unique_refined_labels):
    # Score initialization
    score_caption1 = 0
    score_caption2 = 0

    # Object-presence scores based on the main nouns (kept separate; not folded
    # into the totals below)
    object_presence_score1 = 0
    object_presence_score2 = 0

    # Extract the main nouns from each caption
    main_words_caption1 = extract_main_words(caption1)
    main_words_caption2 = extract_main_words(caption2)

    # Check for object presence using unique_refined_labels
    object_presence_score1 += sum([1 for word in main_words_caption1 if word in unique_refined_labels])
    object_presence_score2 += sum([1 for word in main_words_caption2 if word in unique_refined_labels])

    # Entity extraction
    entities_caption1 = [ent.text for ent in nlp(caption1).ents]
    entities_caption2 = [ent.text for ent in nlp(caption2).ents]

    # Check for entity presence using unique_refined_labels
    score_caption1 += sum([1 for entity in entities_caption1 if entity in unique_refined_labels])
    score_caption2 += sum([1 for entity in entities_caption2 if entity in unique_refined_labels])

    # Topic modeling
    topics_caption1 = get_topics(caption1)
    topics_caption2 = get_topics(caption2)

    # Check for topic relevance using unique_refined_labels
    score_caption1 += sum([1 for topic in topics_caption1 if topic in unique_refined_labels])
    score_caption2 += sum([1 for topic in topics_caption2 if topic in unique_refined_labels])

    # Custom formatting rules: each violation subtracts one point
    def custom_rules(caption):
        score = 0
        if not caption:  # guard: an empty caption fails both rules
            return score - 2
        # Rule: caption should start with a capital letter
        if not caption[0].isupper():
            score -= 1
        # Rule: caption should end with punctuation
        if caption[-1] not in ['.', '!', '?']:
            score -= 1
        return score

    # Custom rule scores (zero or negative penalties)
    custom_score1 = custom_rules(caption1)
    custom_score2 = custom_rules(caption2)

    # Update scores based on custom rules
    score_caption1 += custom_score1
    score_caption2 += custom_score2

    # Penalize very short captions
    length_caption1 = len(caption1.split())
    length_caption2 = len(caption2.split())
    if length_caption1 < 3:  # a reasonable caption should have at least 3 words
        score_caption1 -= 3  # arbitrary penalty
    if length_caption2 < 3:
        score_caption2 -= 3  # arbitrary penalty

    # If the scores are close and the captions are very similar, prefer the more
    # readable one; otherwise prefer the higher-scoring one
    similarity_score = compute_similarity(caption1, caption2)
    similarity_threshold = 0.9  # adjust to whatever you consider "close enough"
    score_difference = abs(score_caption1 - score_caption2)
    score_threshold = 2  # adjust to whatever you consider "close enough"
    if score_difference <= score_threshold:
        if similarity_score > similarity_threshold:
            readability_score_caption1 = check_readability(caption1)
            readability_score_caption2 = check_readability(caption2)
            return caption1 if readability_score_caption1 > readability_score_caption2 else caption2
        else:
            return caption1 if score_caption1 > score_caption2 else caption2

    # Fallback: return whichever caption scored higher
    return caption2 if score_caption2 > score_caption1 else caption1

def post_process_caption(caption):
    # Remove [unusedX] tokens, where X is any number
    cleaned_caption = re.sub(r'\[\s*unused\d+\s*\](, )? ?', '', caption)
    return cleaned_caption

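# Example (hypothetical GIT output containing a stray special token):
#   post_process_caption("a dog [unused0] running in the park")
#   -> "a dog running in the park"
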
def process_image(image_path):
    image = Image.open(image_path).convert("RGB")
    caption1, caption2 = generate_text_and_caption(image)
    unique_refined_labels, _, _ = get_unique_refined_labels(image)

    # Clean up the COCO-model caption
    caption1 = post_process_caption(caption1)

    # Evaluate the captions and pick the better one
    better_caption = evaluate_caption(image, caption1, caption2, unique_refined_labels)
    return caption1, caption2, better_caption

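# Local smoke test without the Gradio UI (uses one of the example images below):
#   c1, c2, best = process_image("image_31.jpg")
#   print(c1, c2, best, sep="\n")
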
import gradio as gr

img_cap_ui = gr.Interface(
    fn=process_image,
    title="Image Captioning with Automatic Evaluation",
    description="Caution: this is a research experiment for personal use; please review the captions before using them.",
    inputs=gr.Image(type="filepath", label="Add your image"),
    outputs=[gr.Textbox(label="Caption from the git-coco model", show_copy_button=True),
             gr.Textbox(label="Caption from the nlp-connect model", show_copy_button=True),
             gr.Textbox(label="Suggested caption after automatic evaluation", show_copy_button=True)],
    examples=["image_31.jpg", "image_41.jpg", "image_48.jpg", "image_50.jpg"],
    article="The caption evaluation method uses a simple voting scheme based on the outputs of two additional models. This is an experiment; please edit the generated caption before using it.",
    theme=gr.themes.Soft(),
)

img_cap_ui.launch()