# ==============================================================================
# Josh Guimond
# Unit 8 Assignment: End-to-End AI Solution Implementation
# ARIN 460
# 12/03/2025
# Description: This script implements a multimodal AI web app using Gradio to
# run two image captioning models, a text “vibe” classifier, and NLP metrics on
# uploaded images, allowing direct comparison of model captions to ground-truth
# descriptions.
# ==============================================================================
# ==============================================================================
# SECTION 1: SETUP & INSTALLATIONS
# ==============================================================================
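# Suggested installs before running (exact package set may vary by environment):
#   pip install gradio transformers sentence-transformers evaluate rouge_score torch pillow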
# Import libraries
import gradio as gr
from transformers import pipeline, AutoTokenizer
from sentence_transformers import SentenceTransformer, util
import evaluate
import warnings
import logging
# Silence FutureWarning/UserWarning and reduce transformers logging to keep the console clean
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
logging.getLogger("transformers").setLevel(logging.ERROR)
# ==============================================================================
# SECTION 2: LOAD MODELS
# ==============================================================================
# --- 1. Load Image Captioning Models ---
# Model 1: BLIP (Base)
print("Loading Model 1 (BLIP)...")
captioner_model1 = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
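# Note: each pipeline downloads and caches its model weights from the Hugging Face Hub on first run.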
# Model 2: ViT-GPT2 (With Tokenizer Fix)
print("Loading Model 2 (ViT-GPT2)...")
# Load the tokenizer manually so we can set pad_token and avoid the padding warning
vit_tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
vit_tokenizer.pad_token = vit_tokenizer.eos_token  # GPT-2 defines no pad token by default; reuse the EOS token
captioner_model2 = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning", tokenizer=vit_tokenizer)
# --- 2. Load NLP Analysis Models (Unit 4 Techniques) ---
# A. Zero-Shot Classifier (For Nuanced Vibe/Sentiment)
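# Zero-shot classification scores a caption against arbitrary candidate labels without
# task-specific fine-tuning; results come back ranked by score, so index 0 is the top label.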
| print("Loading Zero-Shot Classifier...") | |
| classifier = pipeline("zero-shot-classification", model="MoritzLaurer/deberta-v3-xsmall-zeroshot-v1.1-all-33") | |
| # B. Semantic Similarity (For Model Agreement) | |
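# all-MiniLM-L6-v2 encodes each caption as a 384-dimensional sentence embedding; cosine
# similarity between the two embeddings measures how closely the models agree.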
| print("Loading Sentence Transformer...") | |
| similarity_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2') | |
| # C. ROUGE Metric (For Accuracy vs Ground Truth) | |
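# ROUGE-L measures longest-common-subsequence overlap between a generated caption and the
# reference description (0.0 = no overlap, 1.0 = exact match).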
| print("Loading ROUGE Metric...") | |
| rouge = evaluate.load("rouge") | |
| # Define Nuanced Labels based on the image list | |
| # These cover: Peaceful dog, Sad funeral, Happy kids, Angry man, Scared people, Fighting tigers | |
| VIBE_LABELS = ["Peaceful/Calm", "Happy/Joy", "Sad/Sorrow", "Angry/Upset", "Fear/Scared", "Action/Violence"] | |
| # ============================================================================== | |
| # SECTION 3: ANALYSIS FUNCTIONS | |
| # ============================================================================== | |
| # --- Analysis Function --- | |
def analyze_image(image, ground_truth):
    # -- A. Generate Captions --
    res1 = captioner_model1(image)
    cap1 = res1[0]['generated_text']
    res2 = captioner_model2(image)
    cap2 = res2[0]['generated_text']

    # -- B. Analyze Vibe (Zero-Shot) --
    # Model 1 Vibe
    vibe1_result = classifier(cap1, VIBE_LABELS)
    vibe1_label = vibe1_result['labels'][0]
    vibe1_score = vibe1_result['scores'][0]
    # Model 2 Vibe
    vibe2_result = classifier(cap2, VIBE_LABELS)
    vibe2_label = vibe2_result['labels'][0]
    vibe2_score = vibe2_result['scores'][0]

    # -- C. Calculate Statistics --
    # 1. Semantic Similarity (Do the models agree?)
    emb1 = similarity_model.encode(cap1, convert_to_tensor=True)
    emb2 = similarity_model.encode(cap2, convert_to_tensor=True)
    sim_score = util.cos_sim(emb1, emb2).item()

    # 2. ROUGE Scores (How accurate are they vs Ground Truth?)
    rouge_output = "N/A (No Ground Truth provided)"
    if ground_truth and ground_truth.strip() != "":
        # Calculate scores
        r1 = rouge.compute(predictions=[cap1], references=[ground_truth])
        r2 = rouge.compute(predictions=[cap2], references=[ground_truth])
        # Format the ROUGE output nicely
        rouge_output = (
            f"Model 1 ROUGE-L: {r1['rougeL']:.3f}\n"
            f"Model 2 ROUGE-L: {r2['rougeL']:.3f}\n"
            "(Higher is better)"
        )

    # -- D. Format Output Strings --
    # Create clean, formatted strings for the large textboxes
    out1 = (
        f"CAPTION: {cap1}\n"
        "-----------------------------\n"
        f"DETECTED VIBE: {vibe1_label}\n"
        f"CONFIDENCE: {vibe1_score:.1%}"
    )
    out2 = (
        f"CAPTION: {cap2}\n"
        "-----------------------------\n"
        f"DETECTED VIBE: {vibe2_label}\n"
        f"CONFIDENCE: {vibe2_score:.1%}"
    )
    stats = (
        "--- 1. MODEL AGREEMENT (Semantic Similarity) ---\n"
        f"Score: {sim_score:.3f}\n"
        "(Scale: 0.0 = Different, 1.0 = Identical)\n\n"
        "--- 2. OBJECT IDENTIFICATION ACCURACY (ROUGE) ---\n"
        f"Ground Truth: '{ground_truth}'\n"
        f"{rouge_output}"
    )
    return out1, out2, stats
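# Optional quick sanity check without the UI (uses one of the bundled example images):
#   from PIL import Image
#   out1, out2, stats = analyze_image(Image.open("images/1.png"), "A peaceful dog on a sunny beach")
#   print(stats)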
# ==============================================================================
# SECTION 4: GRADIO INTERFACE
# ==============================================================================
# Define Inputs
image_input = gr.Image(type="pil", label="Upload Image")
text_input = gr.Textbox(label="Ground Truth Description", placeholder="e.g. 'A peaceful dog on a beach'")
# Define Outputs with larger viewing areas (lines=4 for the captions, lines=10 for the stats)
output_m1 = gr.Textbox(label="Model 1 (BLIP) Analysis", lines=4)
output_m2 = gr.Textbox(label="Model 2 (ViT-GPT2) Analysis", lines=4)
output_stats = gr.Textbox(label="Comparison Metrics & Statistics", lines=10)
# Create Interface
interface = gr.Interface(
    fn=analyze_image,
    inputs=[image_input, text_input],
    outputs=[output_m1, output_m2, output_stats],
    title="Multimodal AI: Nuanced Image Analysis",
    description="This application uses two Image Captioning models (BLIP & ViT-GPT2) to identify objects, Zero-Shot Classification to detect emotional vibes (Happy, Sad, Angry, etc.), and ROUGE/Similarity metrics to compare the captions against a ground-truth description.",
    examples=[
        ["images/1.png", "A peaceful dog on a sunny beach"],
        ["images/2.png", "Sad men carrying a casket at a funeral"],
        ["images/3.png", "Happy kids at a birthday party"],
        ["images/4.png", "An angry man in a car"],
        ["images/5.png", "Two people happy mountain biking"],
        ["images/6.png", "A man upset about his food at a restaurant"],
        ["images/7.png", "A couple happy at a restaurant"],
        ["images/8.png", "A sad woman reading a book"],
        ["images/9.png", "People scared at a movie"],
        ["images/10.png", "Two tigers fighting"]
    ]
)

if __name__ == "__main__":
    interface.launch()
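    # Tip: interface.launch(share=True) serves a temporary public link (useful on Colab or a remote host).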