import re

import gradio as gr
import spacy
import textstat
import torch
from PIL import Image
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from torchvision import transforms
from transformers import AutoModelForCausalLM, AutoProcessor, pipeline

# Initialize the processor and model for the GIT large COCO captioning model
processor = AutoProcessor.from_pretrained("microsoft/git-large-coco")
model = AutoModelForCausalLM.from_pretrained("microsoft/git-large-coco")

# Pipelines used to ground the captions: object detection + zero-shot image classification
detection_pipe = pipeline("object-detection", model="facebook/detr-resnet-50")
classification_pipe = pipeline("zero-shot-image-classification", model="openai/clip-vit-large-patch14")

# Initialize the pipeline for the ViT-GPT2 captioning model
vit_pipeline = pipeline(task="image-to-text", model="nlpconnect/vit-gpt2-image-captioning")

# Move the COCO model to the device
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)


def generate_text_and_caption(image):
    # Preprocessing pipeline for the GIT model input
    preprocess = transforms.Compose([
        transforms.Resize((256, 256)),   # Resize to 256x256; adjust to match the required dimensions
        transforms.CenterCrop(224),      # Center crop to 224x224; adjust to match the required dimensions
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),  # Normalize with ImageNet mean and std
    ])

    # Apply the preprocessing pipeline and add a batch dimension
    preprocessed_image = preprocess(image).unsqueeze(0).to(device)

    # Caption from the GIT large COCO model
    generated_ids = model.generate(pixel_values=preprocessed_image, max_length=20)
    caption1 = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Caption from the ViT-GPT2 model
    vit_output = vit_pipeline(image)
    caption2_info = vit_output[0] if vit_output else {"generated_text": "N/A"}
    caption2 = caption2_info.get("generated_text", "N/A")

    return caption1, caption2


def get_unique_refined_labels(image):
    original_output = detection_pipe(image)
    filtered_output = [item for item in original_output if item["score"] >= 0.95]

    unique_refined_labels = {}
    for item in filtered_output:
        box = item["box"]
        label = item["label"]
        xmin, ymin, xmax, ymax = box["xmin"], box["ymin"], box["xmax"], box["ymax"]
        cropped_image = image.crop((xmin, ymin, xmax, ymax))
        # Re-check the detected label on the cropped region with CLIP
        predictions = classification_pipe(cropped_image, candidate_labels=[label])
        if predictions:
            top_prediction = sorted(predictions, key=lambda x: x["score"], reverse=True)[0]
            top_label = top_prediction["label"]
            top_score = top_prediction["score"]
            # Keep only the highest score seen for each label
            if top_label not in unique_refined_labels or unique_refined_labels[top_label] < top_score:
                unique_refined_labels[top_label] = top_score

    return unique_refined_labels, original_output, filtered_output


# Load the spaCy model for entity and noun extraction
nlp = spacy.load("en_core_web_sm")


def extract_main_words(text):
    doc = nlp(text)
    main_words = [token.lemma_ for token in doc if token.pos_ == "NOUN"]
    return main_words


def get_topics(text):
    # Vectorize the text
    vectorizer = CountVectorizer()
    text_vec = vectorizer.fit_transform([text])

    # Fit an LDA model to get topics (a single topic keeps this simple)
    lda = LatentDirichletAllocation(n_components=1, random_state=0)
    lda.fit(text_vec)

    # Get the top 10 words of the single topic
    feature_names = vectorizer.get_feature_names_out()
    top_words = [feature_names[i] for i in lda.components_[0].argsort()[:-10 - 1:-1]]
    return top_words
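
# Illustrative sketch only (nothing here is executed): both helpers above reduce a
# caption to keyword candidates that evaluate_caption later matches against the
# detector labels. With the small spaCy English model one would expect roughly:
#
#   extract_main_words("Two dogs playing with a frisbee")  # e.g. ['dog', 'frisbee']
#   get_topics("Two dogs playing with a frisbee")          # e.g. ['dogs', 'frisbee', 'playing', ...]
#
# The exact tokens depend on the spaCy model version and the LDA fit, so treat the
# outputs shown here as assumptions rather than guaranteed results.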
def check_readability(caption):
    # Flesch Reading Ease score of the caption (higher means easier to read)
    reading_ease_score = textstat.flesch_reading_ease(caption)
    return reading_ease_score


def compute_similarity(caption1, caption2):
    tfidf_matrix = TfidfVectorizer().fit_transform([caption1, caption2])
    vectors = tfidf_matrix.toarray()
    cosine_sim = cosine_similarity(vectors)
    # The similarity between the two captions is the off-diagonal value of the cosine_sim matrix
    similarity_score = cosine_sim[0, 1]
    return similarity_score


def evaluate_caption(image, caption1, caption2, unique_refined_labels):
    # Score initialization
    score_caption1 = 0
    score_caption2 = 0

    # Object presence scores
    object_presence_score1 = 0
    object_presence_score2 = 0

    # Main (noun) words extracted from each caption
    main_words_caption1 = extract_main_words(caption1)
    main_words_caption2 = extract_main_words(caption2)

    # Check for object presence using unique_refined_labels
    object_presence_score1 += sum(1 for word in main_words_caption1 if word in unique_refined_labels)
    object_presence_score2 += sum(1 for word in main_words_caption2 if word in unique_refined_labels)

    # Object-presence votes count toward the overall scores
    score_caption1 += object_presence_score1
    score_caption2 += object_presence_score2

    # Entity extraction
    entities_caption1 = [ent.text for ent in nlp(caption1).ents]
    entities_caption2 = [ent.text for ent in nlp(caption2).ents]

    # Check for entity presence using unique_refined_labels
    score_caption1 += sum(1 for entity in entities_caption1 if entity in unique_refined_labels)
    score_caption2 += sum(1 for entity in entities_caption2 if entity in unique_refined_labels)

    # Topic modeling
    topics_caption1 = get_topics(caption1)
    topics_caption2 = get_topics(caption2)

    # Check for topic relevance using unique_refined_labels
    score_caption1 += sum(1 for topic in topics_caption1 if topic in unique_refined_labels)
    score_caption2 += sum(1 for topic in topics_caption2 if topic in unique_refined_labels)

    # Custom formatting rules: each violation subtracts a point
    def custom_rules(caption):
        score = 0
        # Rule: the caption should start with a capital letter
        if not caption[0].isupper():
            score -= 1
        # Rule: the caption should end with punctuation
        if caption[-1] not in ['.', '!', '?']:
            score -= 1
        return score

    # Custom rule scores are zero or negative, so adding them applies the penalty
    custom_score1 = custom_rules(caption1)
    custom_score2 = custom_rules(caption2)
    score_caption1 += custom_score1
    score_caption2 += custom_score2

    # Length check: a reasonable caption should have at least 3 words
    length_caption1 = len(caption1.split())
    length_caption2 = len(caption2.split())
    if length_caption1 < 3:
        score_caption1 -= 3  # arbitrary penalty
    if length_caption2 < 3:
        score_caption2 -= 3  # arbitrary penalty

    # Similarity and score thresholds
    similarity_score = compute_similarity(caption1, caption2)
    similarity_threshold = 0.9  # adjust to whatever you consider "close enough"
    score_difference = abs(score_caption1 - score_caption2)
    score_threshold = 2  # adjust to whatever you consider "close enough"

    # If the scores are close and the captions are nearly identical, prefer the more readable one
    if score_difference <= score_threshold:
        if similarity_score > similarity_threshold:
            readability_score_caption1 = check_readability(caption1)
            readability_score_caption2 = check_readability(caption2)
            return caption1 if readability_score_caption1 > readability_score_caption2 else caption2
        else:
            return caption1 if score_caption1 > score_caption2 else caption2

    # Fallback: return the higher-scoring caption
    return caption2 if score_caption2 > score_caption1 else caption1
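
# Sketch of the voting behavior (assumed values, nothing here is executed): given
# detector labels such as {'dog': 0.98, 'frisbee': 0.97}, a caption whose nouns,
# entities, or topic words include "dog" and "frisbee" collects positive votes,
# while a missing leading capital, missing end punctuation, or fewer than three
# words only subtract points. When the two captions score within score_threshold
# of each other and are nearly identical by TF-IDF cosine similarity, the Flesch
# Reading Ease score breaks the tie.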
def post_process_caption(caption):
    # Remove [unusedX] tokens, where X is any number
    cleaned_caption = re.sub(r'\[\s*unused\d+\s*\](, )? ?', '', caption)
    return cleaned_caption


def process_image(image_path):
    image = Image.open(image_path).convert("RGB")
    caption1, caption2 = generate_text_and_caption(image)
    unique_refined_labels, _, _ = get_unique_refined_labels(image)

    # Clean up caption1 before evaluation
    caption1 = post_process_caption(caption1)

    # Evaluate the captions and pick the better one
    better_caption = evaluate_caption(image, caption1, caption2, unique_refined_labels)
    return caption1, caption2, better_caption


img_cap_ui = gr.Interface(
    fn=process_image,
    title="Image Captioning with Automatic Evaluation",
    description="Caution: this is a research experiment for personal use; please review the captions before using them.",
    inputs=gr.Image(type="filepath", label="Add your image"),
    outputs=[
        gr.Textbox(label="Caption from the git-coco model", show_copy_button=True),
        gr.Textbox(label="Caption from the nlp-connect model", show_copy_button=True),
        gr.Textbox(label="Suggested caption after automatic evaluation", show_copy_button=True),
    ],
    examples=["image_31.jpg", "image_41.jpg", "image_48.jpg", "image_50.jpg"],
    article="The caption evaluation method uses a simple voting scheme over the outputs of two additional models. This is an experiment; please edit the generated caption before using it.",
    theme=gr.themes.Soft(),
)

img_cap_ui.launch()
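
# The evaluator can also be used without the UI (sketch; "example.jpg" is a
# placeholder path, not a file shipped with this demo):
#
#   caption1, caption2, better = process_image("example.jpg")
#   print(better)
#
# launch() serves the interface locally and blocks; share=True (a standard Gradio
# option) can be passed to launch() if a temporary public link is needed.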