import re

import torch
from PIL import Image
from torchvision import transforms
from transformers import AutoProcessor, AutoModelForCausalLM, pipeline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import textstat
import spacy

# Initialize the processor and model for the GIT-large-COCO captioning model
processor = AutoProcessor.from_pretrained("microsoft/git-large-coco")
model = AutoModelForCausalLM.from_pretrained("microsoft/git-large-coco")

# Object detection (DETR) and zero-shot classification (CLIP) pipelines,
# used to verify which objects actually appear in the image
detection_pipe = pipeline("object-detection", model="facebook/detr-resnet-50")
classification_pipe = pipeline("zero-shot-image-classification", model="openai/clip-vit-large-patch14")

# Initialize the ViT-GPT2 image-captioning pipeline
vit_pipeline = pipeline(task="image-to-text", model="nlpconnect/vit-gpt2-image-captioning")

# Move the GIT model to the GPU if one is available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
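
# Pipeline overview:
#   1. generate_text_and_caption() produces two candidate captions
#      (GIT-large-COCO and ViT-GPT2).
#   2. get_unique_refined_labels() detects objects with DETR and verifies each
#      crop with CLIP zero-shot classification.
#   3. evaluate_caption() scores both captions against the detected objects and
#      simple writing rules, then returns the better one.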

def generate_text_and_caption(image):
    """Generate two candidate captions: one from GIT-large-COCO, one from ViT-GPT2."""
    # Preprocessing for the GIT model: resize, center-crop to 224x224,
    # and normalize with the ImageNet mean and std
    preprocess = transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # Apply the preprocessing pipeline to the image
    preprocessed_image = preprocess(image).unsqueeze(0).to(device)  # unsqueeze to add batch dimension

    # For large COCO model
    generated_ids = model.generate(pixel_values=preprocessed_image, max_length=20)
    caption1 = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # For VIT model
    vit_output = vit_pipeline(image)
    caption2_info = vit_output[0] if vit_output else {"generated_text": "N/A"}
    caption2 = caption2_info.get('generated_text', 'N/A')

    return caption1, caption2

def get_unique_refined_labels(image):
    """Detect objects with DETR, verify each crop with CLIP zero-shot
    classification, and keep the best score per label."""
    original_output = detection_pipe(image)
    # Keep only high-confidence detections
    filtered_output = [item for item in original_output if item['score'] >= 0.95]
    unique_refined_labels = {}
    for item in filtered_output:
        box = item['box']
        label = item['label']
        xmin, ymin, xmax, ymax = box['xmin'], box['ymin'], box['xmax'], box['ymax']
        cropped_image = image.crop((xmin, ymin, xmax, ymax))
        predictions = classification_pipe(cropped_image, candidate_labels=[label])
        if predictions:
            top_prediction = sorted(predictions, key=lambda x: x['score'], reverse=True)[0]
            top_label = top_prediction['label']
            top_score = top_prediction['score']
            if top_label not in unique_refined_labels or unique_refined_labels[top_label] < top_score:
                unique_refined_labels[top_label] = top_score
    return unique_refined_labels, original_output, filtered_output


# Load the spaCy model used for noun and entity extraction
nlp = spacy.load("en_core_web_sm")


def extract_main_words(text):
    doc = nlp(text)
    main_words = [token.lemma_ for token in doc if token.pos_ == 'NOUN']
    return main_words
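
# Rough example: extract_main_words("a dog running in the park")
# would typically return ["dog", "park"] (nouns, lemmatized by spaCy).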

def get_topics(text):
    # Vectorize the text
    vectorizer = CountVectorizer()
    text_vec = vectorizer.fit_transform([text])
    # Fit LDA model to get topics
    lda = LatentDirichletAllocation(n_components=1, random_state=0)
    lda.fit(text_vec)
    # Get the top words per topic (assuming one topic for simplicity)
    feature_names = vectorizer.get_feature_names_out()
    top_words = [feature_names[i] for i in lda.components_[0].argsort()[:-10 - 1:-1]]
    return top_words
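
# Note: fitting LDA on a single short caption is effectively a keyword
# frequency heuristic rather than true topic modelling; it mostly surfaces
# the most prominent words in the caption.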

def check_readability(caption):
    # Compute the Flesch Reading Ease score of the caption
    reading_ease_score = textstat.flesch_reading_ease(caption)
    return reading_ease_score
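
# Higher Flesch Reading Ease scores indicate text that is easier to read.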

def compute_similarity(caption1, caption2):
    vectorizer = TfidfVectorizer().fit_transform([caption1, caption2])
    vectors = vectorizer.toarray()
    cosine_sim = cosine_similarity(vectors)
    # The similarity between the captions is the off-diagonal value of the cosine_sim matrix
    similarity_score = cosine_sim[0, 1]
    return similarity_score

def evaluate_caption(image, caption1, caption2, unique_refined_labels):
    """Score both captions against the detected objects and simple writing
    rules, then return the one that scores (or reads) better."""
    # Scores initialization
    score_caption1 = 0
    score_caption2 = 0

    # Initialize object presence scores
    object_presence_score1 = 0
    object_presence_score2 = 0

    # Extract the main nouns from each caption
    main_words_caption1 = extract_main_words(caption1)
    main_words_caption2 = extract_main_words(caption2)

    # Check whether the caption's nouns match objects detected in the image
    object_presence_score1 += sum([1 for word in main_words_caption1 if word in unique_refined_labels.keys()])
    object_presence_score2 += sum([1 for word in main_words_caption2 if word in unique_refined_labels.keys()])

    # Fold the object-presence counts into the overall caption scores
    score_caption1 += object_presence_score1
    score_caption2 += object_presence_score2

    # Entity Extraction
    entities_caption1 = [ent.text for ent in nlp(caption1).ents]
    entities_caption2 = [ent.text for ent in nlp(caption2).ents]

    # Check whether named entities in the caption match detected objects
    score_caption1 += sum([1 for entity in entities_caption1 if entity in unique_refined_labels.keys()])
    score_caption2 += sum([1 for entity in entities_caption2 if entity in unique_refined_labels.keys()])

    # Topic Modeling
    topics_caption1 = get_topics(caption1)
    topics_caption2 = get_topics(caption2)

    # Check for topic relevance using unique_refined_labels
    score_caption1 += sum([1 for topic in topics_caption1 if topic in unique_refined_labels.keys()])
    score_caption2 += sum([1 for topic in topics_caption2 if topic in unique_refined_labels.keys()])


    # Simple surface-form rules for caption quality
    def custom_rules(caption):
        score = 0

        # Guard against empty captions
        if not caption:
            return score - 2

        # Rule for starting with a capital letter
        if not caption[0].isupper():
            score -= 1

        # Rule for ending with punctuation
        if caption[-1] not in ['.', '!', '?']:
            score -= 1

        return score

    # Custom rule scores
    custom_score1 = custom_rules(caption1)
    custom_score2 = custom_rules(caption2)

    # Apply the rule penalties (custom_rules returns negative values for violations)
    score_caption1 += custom_score1
    score_caption2 += custom_score2


    # Check length
    length_caption1 = len(caption1.split())
    length_caption2 = len(caption2.split())

    if length_caption1 < 3:  # assuming a reasonable caption should have at least 3 words
        score_caption1 -= 3  # arbitrary penalty
    if length_caption2 < 3:
        score_caption2 -= 3  # arbitrary penalty

    # Compare the two captions for textual similarity
    similarity_score = compute_similarity(caption1, caption2)

    similarity_threshold = 0.9  # Adjust to whatever you consider "close enough"

    score_difference = abs(score_caption1 - score_caption2)
    score_threshold = 2  # Adjust to whatever you consider "close enough"

    if score_difference <= score_threshold:
        # Scores are close: if the captions are also very similar,
        # prefer the more readable one
        if similarity_score > similarity_threshold:
            readability_score_caption1 = check_readability(caption1)
            readability_score_caption2 = check_readability(caption2)

            return caption1 if readability_score_caption1 > readability_score_caption2 else caption2
    else:
        return caption1 if score_caption1 > score_caption2 else caption2

    # Fallback: scores are close but the captions differ; keep the higher-scoring one
    return caption2 if score_caption2 > score_caption1 else caption1

# Define the post_process_caption function
def post_process_caption(caption):
    # Remove [unusedX] tokens, where X is any number
    cleaned_caption = re.sub(r'\[\s*unused\d+\s*\](, )? ?', '', caption)
    return cleaned_caption
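
# Example: post_process_caption("a dog [unused0] running") -> "a dog running"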

def process_image(image_path):
    image = Image.open(image_path).convert("RGB")
    caption1, caption2 = generate_text_and_caption(image)
    unique_refined_labels, _, _ = get_unique_refined_labels(image)

    # Clean up stray [unusedX] tokens in the GIT caption
    caption1 = post_process_caption(caption1)

    # Evaluate the captions and pick the better one
    better_caption = evaluate_caption(image, caption1, caption2, unique_refined_labels)

    return caption1, caption2, better_caption
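
# Example direct usage without the UI (assuming the example image exists locally):
#   caption1, caption2, best = process_image("image_31.jpg")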

import gradio as gr

img_cap_ui = gr.Interface(
    fn=process_image,
    title="Image Captioning with Automatic Evaluation",
    description="Caution: this is a research experiment for personal use, please review the captions before using.",
    inputs=gr.inputs.Image(type="filepath",label="Add your image"),
    outputs=[gr.Textbox(label="Caption from the git-coco model", show_copy_button=True),
    gr.Textbox(label="Caption from the nlp-connect model", show_copy_button=True),
    gr.Textbox(label="Suggested caption after automatic evaluation", show_copy_button=True)],
    examples=["image_31.jpg","image_41.jpg","image_48.jpg", "image_50.jpg"],
    article="The caption evaluation method use a simple voting scheme from outputs of 2 additional models. This is an experiment, please make edit if you use the generated caption.",
    theme=gr.themes.Soft()
)

img_cap_ui.launch()