File size: 2,146 Bytes
f90e7b1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import gradio as gr
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import numpy as np

# Choose the compute device up front: CUDA when torch reports it, else CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the BLIP base captioning processor and model once, at module level,
# so they are shared by every call; the model is placed on the chosen device.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
).to(device)

def process_input(image, text=""):
    """Generate a text description for an image, optionally guided by a prompt.

    Args:
        image: The picture to describe. Only ``numpy.ndarray`` values are
            accepted; anything else yields the validation message.
        text: Optional conditioning text handed to the processor together
            with the image. When falsy, ``"a video of"`` is used instead.

    Returns:
        str: The decoded output with surrounding whitespace removed, the
        message ``"Please provide a valid image"`` for non-array input, or an
        ``"Error processing input: ..."`` message if any step raises.
    """
    try:
        # Guard clause: reject anything that is not a NumPy array.
        if not isinstance(image, np.ndarray):
            return "Please provide a valid image"

        picture = Image.fromarray(image)

        # Fall back to the default conditioning text when no prompt is given.
        prompt = text or "a video of"

        # Encode image + prompt as PyTorch tensors, then move them to the
        # same device the model was placed on.
        encoded = processor(picture, text=prompt, return_tensors="pt")
        encoded = encoded.to(device)

        # Generation settings, kept together so they are easy to tune.
        generation_options = {
            "max_new_tokens": 100,
            "num_beams": 5,
            "length_penalty": 1.0,
            "repetition_penalty": 1.5,
        }
        generated = model.generate(**encoded, **generation_options)

        # Only the first returned sequence is decoded.
        caption = processor.decode(generated[0], skip_special_tokens=True)
        return caption.strip()

    except Exception as err:
        # UI boundary: report the failure as text instead of raising.
        return f"Error processing input: {str(err)}"

# Build the Gradio UI: an image and an optional prompt go in, and the text
# returned by process_input comes out.
demo = gr.Interface(
    title="Scene Description Generator",
    description="Upload an image and optionally add a prompt to guide the description. Created by <a href='https://justlab.ai'>Justlab.ai</a>",
    fn=process_input,
    inputs=[
        # type="numpy" matches the ndarray check inside process_input.
        gr.Image(label="Upload Image", type="numpy"),
        gr.Textbox(
            lines=2,
            label="Prompt (Optional)",
            placeholder="Guide the description or leave empty for automatic caption",
        ),
    ],
    outputs=gr.Textbox(lines=6, label="Generated Description"),
)

# Launch the app only when this file is executed directly, not on import.
if __name__ == "__main__":
    demo.launch()