import gradio as gr
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import numpy as np

# Initialize the model and processor once at import time; BLIP-base is a
# relatively small captioning checkpoint, so it also runs on CPU
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Move the model to GPU if one is available; otherwise it stays on CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
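
# An optional tweak (an assumption, not part of the original app): on a CUDA
# machine you could load the weights in half precision to roughly halve the
# model's memory footprint, e.g.:
#
#     model = BlipForConditionalGeneration.from_pretrained(
#         "Salesforce/blip-image-captioning-base", torch_dtype=torch.float16
#     ).to("cuda")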

def process_input(image, text=""):
    """Generate a description for an image, optionally guided by a text prompt."""
    try:
        # Gradio delivers the upload as a numpy array; convert it to PIL
        if isinstance(image, np.ndarray):
            pil_image = Image.fromarray(image)
        else:
            return "Please provide a valid image"

        # Conditional captioning when a prompt is given; with no prompt, let the
        # model caption the image unconditionally (the "automatic caption" case).
        # The processor returns pixel_values, plus input_ids when text is given.
        if text:
            inputs = processor(pil_image, text=text, return_tensors="pt").to(device)
        else:
            inputs = processor(pil_image, return_tensors="pt").to(device)

        # Beam search with a repetition penalty to curb degenerate loops;
        # max_new_tokens bounds the caption length
        output = model.generate(
            **inputs,
            max_new_tokens=100,
            num_beams=5,
            length_penalty=1.0,
            repetition_penalty=1.5,
        )

        # Decode the generated token IDs back to text, dropping special tokens
        result = processor.decode(output[0], skip_special_tokens=True)
        return result.strip()
    except Exception as e:
        return f"Error processing input: {e}"

# Create the Gradio interface
demo = gr.Interface(
    fn=process_input,
    inputs=[
        gr.Image(type="numpy", label="Upload Image"),
        gr.Textbox(
            label="Prompt (Optional)",
            placeholder="Guide the description or leave empty for automatic caption",
            lines=2,
        ),
    ],
    outputs=gr.Textbox(label="Generated Description", lines=6),
    title="Scene Description Generator",
    description="Upload an image and optionally add a prompt to guide the description. Created by <a href='https://justlab.ai'>Justlab.ai</a>",
)

if __name__ == "__main__":
    demo.launch()
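
# To run: `python app.py`; Gradio serves the UI at http://127.0.0.1:7860 by
# default. Passing share=True to demo.launch() creates a temporary public link.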