"""
Hugging Face Spaces App - Image Captioning
Deploy this to HF Spaces for free hosting
"""
import gradio as gr
import torch  # transformers' tensor backend; also used for the optional device check below
from PIL import Image  # gradio's type="pil" image input yields PIL.Image objects
import time
def load_models():
    """Load models with error handling."""
    models = {}

    try:
        from transformers import BlipProcessor, BlipForConditionalGeneration
        print("Loading BLIP model...")
        models['blip_processor'] = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        models['blip_model'] = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
        print("βœ… BLIP loaded successfully")
    except Exception as e:
        print(f"❌ BLIP failed to load: {e}")
        models['blip_error'] = str(e)

    try:
        from transformers import AutoProcessor, AutoModelForCausalLM
        print("Loading GIT model...")
        models['git_processor'] = AutoProcessor.from_pretrained("microsoft/git-base")
        models['git_model'] = AutoModelForCausalLM.from_pretrained("microsoft/git-base")
        print("βœ… GIT loaded successfully")
    except Exception as e:
        print(f"❌ GIT failed to load: {e}")
        models['git_error'] = str(e)

    return models
# Load models at startup
print("πŸš€ Loading AI models...")
models = load_models()
print(f"πŸ“¦ Models loading completed")
def generate_captions(image, true_caption=""):
"""Generate captions using available models"""
if image is None:
return "❌ Please upload an image first."
# Ensure image is in RGB format
if image.mode != 'RGB':
image = image.convert('RGB')
results = []
start_time = time.time()
# Add true caption if provided
if true_caption.strip():
results.append(f"**🎯 True Caption:**")
results.append(f"{true_caption.strip()}")
results.append("")
# BLIP model
if 'blip_model' in models:
try:
blip_start = time.time()
inputs = models['blip_processor'](image, return_tensors="pt")
out = models['blip_model'].generate(**inputs, max_length=50, num_beams=5)
blip_caption = models['blip_processor'].decode(out[0], skip_special_tokens=True)
blip_time = time.time() - blip_start
results.append(f"**πŸ€– BLIP Model:** ({blip_time:.2f}s)")
results.append(f"{blip_caption}")
results.append("")
except Exception as e:
results.append(f"**πŸ€– BLIP Model:** Error - {str(e)}")
results.append("")
elif 'blip_error' in models:
results.append(f"**πŸ€– BLIP Model:** Not available - {models['blip_error']}")
results.append("")
# GIT model
if 'git_model' in models:
try:
git_start = time.time()
inputs = models['git_processor'](images=image, return_tensors="pt")
generated_ids = models['git_model'].generate(
pixel_values=inputs.pixel_values,
max_length=50,
num_beams=5
)
git_caption = models['git_processor'].batch_decode(generated_ids, skip_special_tokens=True)[0]
git_time = time.time() - git_start
results.append(f"**🧠 GIT Model:** ({git_time:.2f}s)")
results.append(f"{git_caption}")
results.append("")
except Exception as e:
results.append(f"**🧠 GIT Model:** Error - {str(e)}")
results.append("")
elif 'git_error' in models:
results.append(f"**🧠 GIT Model:** Not available - {models['git_error']}")
results.append("")
total_time = time.time() - start_time
results.append("---")
results.append(f"**⏱️ Total Processing Time:** {total_time:.2f} seconds")
results.append("")
results.append("**πŸ“Š About the Models:**")
results.append("β€’ **BLIP**: Salesforce's Bootstrapping Language-Image Pre-training")
results.append("β€’ **GIT**: Microsoft's Generative Image-to-text Transformer")
return "\n".join(results)
# Create Gradio interface
with gr.Blocks(
    title="AI Image Captioning",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container {
        max-width: 1200px !important;
    }
    """
) as demo:
    gr.Markdown("""
    # πŸ€– AI Image Captioning

    Upload an image and get captions from multiple state-of-the-art AI models!

    **Available Models:**
    - πŸ€– **BLIP** (Salesforce): Fast and accurate image captioning
    - 🧠 **GIT** (Microsoft): Advanced generative image-to-text model

    *Simply upload an image or try one of the examples below!*
    """)

    with gr.Row():
        with gr.Column(scale=1):
            image_input = gr.Image(
                type="pil",
                label="πŸ“Έ Upload Your Image",
                height=400
            )
            true_caption_input = gr.Textbox(
                label="🎯 True Caption (Optional)",
                placeholder="Enter the correct caption to compare with AI predictions...",
                lines=2
            )
            generate_btn = gr.Button(
                "✨ Generate Captions",
                variant="primary",
                size="lg"
            )
        with gr.Column(scale=1):
            output = gr.Textbox(
                label="πŸ€– AI Generated Captions",
                lines=20,
                max_lines=25,
                show_copy_button=True
            )

    # Example images
    gr.Markdown("### πŸ“‹ Try These Examples:")
    example_images = [
        ["https://huggingface.co/datasets/mishig/sample_images/resolve/main/cat.jpg", "A cat sitting on a surface"],
        ["https://huggingface.co/datasets/mishig/sample_images/resolve/main/dog.jpg", "A dog in a field"],
        ["https://images.unsplash.com/photo-1506905925346-21bda4d32df4?w=500", "A mountain landscape with snow"],
        ["https://images.unsplash.com/photo-1549298916-b41d501d3772?w=500", "A red sports car"],
        ["https://images.unsplash.com/photo-1551963831-b3b1ca40c98e?w=500", "A breakfast with coffee and pastries"],
    ]
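    # Note (an assumption about runtime behavior): the examples above are remote
    # URLs, so selecting one requires outbound network access from the Space.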
    gr.Examples(
        examples=example_images,
        inputs=[image_input, true_caption_input],
        outputs=output,
        fn=generate_captions,
        cache_examples=False
    )
    # Event handlers
    generate_btn.click(
        fn=generate_captions,
        inputs=[image_input, true_caption_input],
        outputs=output
    )

    # Auto-generate when an image is uploaded
    image_input.change(
        fn=generate_captions,
        inputs=[image_input, true_caption_input],
        outputs=output
    )
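    # Note: .change also fires when the image is cleared (the value becomes None,
    # which generate_captions handles). To caption only on fresh uploads, gradio's
    # image_input.upload event is an alternative.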
    gr.Markdown("""
    ---
    **πŸ”§ Technical Details:**
    - Models run on Hugging Face's infrastructure
    - Processing time varies based on image size and complexity
    - All models are open-source and publicly available

    **πŸ“ Tips:**
    - Try different types of images (people, objects, landscapes, etc.)
    - Compare the AI captions with your own description
    - Larger images may take longer to process
    """)
# Launch the app
if __name__ == "__main__":
    demo.launch()