Spaces:
Runtime error
Runtime error
"""
Hugging Face Spaces App - Image Captioning
Deploy this to HF Spaces for free hosting
"""
import gradio as gr | |
import torch | |
from PIL import Image | |
import time | |
def load_models():
    """Load the BLIP and GIT captioning models, tolerating failures.

    Each model family is loaded inside its own ``try`` block, so a failure
    in one family does not stop the other from being attempted.

    Returns:
        dict: ``'<name>_processor'`` and ``'<name>_model'`` entries for
        whatever was loaded, plus ``'<name>_error'`` (the exception text)
        for a family that raised, where ``<name>`` is ``blip`` or ``git``.
    """
    blip_checkpoint = "Salesforce/blip-image-captioning-base"
    git_checkpoint = "microsoft/git-base"
    loaded = {}

    # --- BLIP ---
    try:
        # Imported here (not at module top) so that an import failure is
        # recorded in the returned dict like any other load failure.
        from transformers import BlipProcessor, BlipForConditionalGeneration
        print("Loading BLIP model...")
        loaded['blip_processor'] = BlipProcessor.from_pretrained(blip_checkpoint)
        loaded['blip_model'] = BlipForConditionalGeneration.from_pretrained(blip_checkpoint)
        print("β BLIP loaded successfully")
    except Exception as err:
        print(f"β BLIP failed to load: {err}")
        loaded['blip_error'] = str(err)

    # --- GIT ---
    try:
        from transformers import AutoProcessor, AutoModelForCausalLM
        print("Loading GIT model...")
        loaded['git_processor'] = AutoProcessor.from_pretrained(git_checkpoint)
        loaded['git_model'] = AutoModelForCausalLM.from_pretrained(git_checkpoint)
        print("β GIT loaded successfully")
    except Exception as err:
        print(f"β GIT failed to load: {err}")
        loaded['git_error'] = str(err)

    return loaded
# Populate the module-level ``models`` registry once, when the module is
# imported, so generate_captions() can look models up on every request.
print("π Loading AI models...")
models = load_models()
print("π¦ Models loading completed")
def _blip_caption(image):
    """Return the BLIP caption for *image* using the entries in ``models``."""
    processor = models['blip_processor']
    inputs = processor(image, return_tensors="pt")
    out = models['blip_model'].generate(**inputs, max_length=50, num_beams=5)
    return processor.decode(out[0], skip_special_tokens=True)


def _git_caption(image):
    """Return the GIT caption for *image* using the entries in ``models``."""
    processor = models['git_processor']
    inputs = processor(images=image, return_tensors="pt")
    generated_ids = models['git_model'].generate(
        pixel_values=inputs.pixel_values,
        max_length=50,
        num_beams=5,
    )
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]


def _model_section(heading, key, captioner, image):
    """Return the report lines for one model.

    Args:
        heading: Display name placed between the ``**`` markers.
        key: Prefix of the ``models`` entries (``'blip'`` or ``'git'``).
        captioner: Callable taking the image and returning the caption text.
        image: The RGB image to caption.

    Returns:
        list[str]: Timed caption lines when ``<key>_model`` is loaded, an
        error line when captioning raised, a "Not available" line when only
        ``<key>_error`` is present, or an empty list when neither key exists.
    """
    if f"{key}_model" in models:
        started = time.time()
        try:
            caption = captioner(image)
        except Exception as e:
            # Report the failure in the output instead of failing the whole
            # request; the other model may still produce a caption.
            return [f"**{heading}:** Error - {e}", ""]
        elapsed = time.time() - started
        return [f"**{heading}:** ({elapsed:.2f}s)", f"{caption}", ""]

    error_key = f"{key}_error"
    if error_key in models:
        return [f"**{heading}:** Not available - {models[error_key]}", ""]
    return []


def generate_captions(image, true_caption=""):
    """Generate captions for *image* with every available model.

    Args:
        image: Object exposing ``mode`` and ``convert`` (the UI passes a PIL
            image), or ``None`` when nothing was uploaded.
        true_caption: Optional reference caption echoed at the top of the
            report. ``None`` is treated the same as an empty string.

    Returns:
        str: Newline-joined report containing the optional reference
        caption, one section per model, the total processing time and a
        short description of the models; or a prompt to upload an image
        when *image* is ``None``.
    """
    # NOTE(review): the leading glyphs in the user-facing strings look like
    # mis-decoded emoji; they are kept exactly as found.
    if image is None:
        return "β Please upload an image first."

    # Ensure image is in RGB format
    if image.mode != 'RGB':
        image = image.convert('RGB')

    results = []
    start_time = time.time()

    # Guard against None so a missing caption is skipped rather than
    # raising AttributeError on .strip().
    reference = (true_caption or "").strip()
    if reference:
        results += ["**π― True Caption:**", reference, ""]

    results += _model_section("π€ BLIP Model", "blip", _blip_caption, image)
    results += _model_section("π§ GIT Model", "git", _git_caption, image)

    total_time = time.time() - start_time
    results += [
        "---",
        f"**β±οΈ Total Processing Time:** {total_time:.2f} seconds",
        "",
        "**π About the Models:**",
        "β’ **BLIP**: Salesforce's Bootstrapping Language-Image Pre-training",
        "β’ **GIT**: Microsoft's Generative Image-to-text Transformer",
    ]
    return "\n".join(results)
# Create Gradio interface
# Builds the Blocks app bound to the module-level name `demo`. Components are
# created in the order they should appear, so statement order here is layout.
with gr.Blocks(
    title="AI Image Captioning",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container {
    max-width: 1200px !important;
    }
    """
) as demo:
    # Page header describing the app and the two models it uses.
    gr.Markdown("""
    # π€ AI Image Captioning
    Upload an image and get captions from multiple state-of-the-art AI models!
    **Available Models:**
    - π€ **BLIP** (Salesforce): Fast and accurate image captioning
    - π§ **GIT** (Microsoft): Advanced generative image-to-text model
    *Simply upload an image or try one of the examples below!*
    """)
    with gr.Row():
        # First column: the inputs and the trigger button.
        with gr.Column(scale=1):
            # type="pil" matches generate_captions, which reads image.mode
            # and calls image.convert().
            image_input = gr.Image(
                type="pil",
                label="πΈ Upload Your Image",
                height=400
            )
            true_caption_input = gr.Textbox(
                label="π― True Caption (Optional)",
                placeholder="Enter the correct caption to compare with AI predictions...",
                lines=2
            )
            generate_btn = gr.Button(
                "β¨ Generate Captions",
                variant="primary",
                size="lg"
            )
        # Second column: the text report returned by generate_captions.
        with gr.Column(scale=1):
            output = gr.Textbox(
                label="π€ AI Generated Captions",
                lines=20,
                max_lines=25,
                show_copy_button=True
            )
    # Example images
    gr.Markdown("### π Try These Examples:")
    # Each row is [image URL, reference caption], in the same order as the
    # `inputs` list passed to gr.Examples below.
    example_images = [
        ["https://huggingface.co/datasets/mishig/sample_images/resolve/main/cat.jpg", "A cat sitting on a surface"],
        ["https://huggingface.co/datasets/mishig/sample_images/resolve/main/dog.jpg", "A dog in a field"],
        ["https://images.unsplash.com/photo-1506905925346-21bda4d32df4?w=500", "A mountain landscape with snow"],
        ["https://images.unsplash.com/photo-1549298916-b41d501d3772?w=500", "A red sports car"],
        ["https://images.unsplash.com/photo-1551963831-b3b1ca40c98e?w=500", "A breakfast with coffee and pastries"],
    ]
    gr.Examples(
        examples=example_images,
        inputs=[image_input, true_caption_input],
        outputs=output,
        fn=generate_captions,
        cache_examples=False
    )
    # Event handlers
    # The button and the image's change event both call generate_captions
    # with the same inputs and write to the same output box.
    generate_btn.click(
        fn=generate_captions,
        inputs=[image_input, true_caption_input],
        outputs=output
    )
    # Auto-generate when image is uploaded
    image_input.change(
        fn=generate_captions,
        inputs=[image_input, true_caption_input],
        outputs=output
    )
    # Page footer with background notes and usage tips.
    gr.Markdown("""
    ---
    **π§ Technical Details:**
    - Models run on Hugging Face's infrastructure
    - Processing time varies based on image size and complexity
    - All models are open-source and publicly available
    **π Tips:**
    - Try different types of images (people, objects, landscapes, etc.)
    - Compare the AI captions with your own description
    - Larger images may take longer to process
    """)
# Launch the app
# Only start the server when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    demo.launch()