Spaces:

aavetis
/

ugen-image-captioning

Running on Zero

File size: 1,634 Bytes

cdcc865
 
 
109b906
bf23625
cdcc865
 
e292dd7
cdcc865
 
bf23625
cdcc865
 
b9c1647
cdcc865
 
109b906
 
 
 
 
 
 
 
e292dd7
cdcc865
109b906
 
 
 
cdcc865
 
4e87b4a
 
 
 
cdcc865
 
4e87b4a
582a256
4e87b4a
cdcc865

import gradio as gr
from uform import gen_model
from PIL import Image
import torch
import spaces

# Both the captioning model and its processor come from the same checkpoint,
# so the identifier is spelled once and shared by the two loaders.
_CHECKPOINT = "unum-cloud/uform-gen"

# Load the weights first, then move the model onto the CUDA device.
model = gen_model.VLMForCausalLM.from_pretrained(_CHECKPOINT)
model = model.to("cuda")

# The processor prepares the (text, image) inputs and decodes generated ids.
processor = gen_model.VLMProcessor.from_pretrained(_CHECKPOINT)

@spaces.GPU
def generate_caption(image, prompt):
    """Generate a caption for ``image`` guided by ``prompt``.

    Args:
        image: The picture to describe. The Gradio interface in this file
            supplies it as a PIL image (``gr.Image(type="pil")``); it is
            ``None`` when the user submits without uploading anything.
        prompt: Instruction text passed to the model together with the image.

    Returns:
        The decoded text of the newly generated tokens only; the tokens that
        belong to the prompt are sliced off before decoding.

    Raises:
        gr.Error: If ``image`` is ``None``, so the UI shows a readable message
            instead of an internal traceback from the processor.
    """
    # Guard against an empty image slot: without this the ``None`` would be
    # forwarded to the processor and fail with an opaque error.
    if image is None:
        raise gr.Error("Please upload an image before generating a caption.")

    # Turn the prompt and image into model inputs and move them to the GPU,
    # where the module-level model lives.
    inputs = processor(texts=[prompt], images=[image], return_tensors="pt").to('cuda')

    # Run generation without autograd bookkeeping.
    with torch.inference_mode():
        output = model.generate(
            **inputs,
            do_sample=False,  # sampling disabled
            use_cache=True,
            max_new_tokens=128,  # upper bound on the caption length in tokens
            # NOTE(review): 32001 is presumably the model's end-of-sequence
            # token id -- confirm against the tokenizer's vocabulary.
            eos_token_id=32001,
            pad_token_id=processor.tokenizer.pad_token_id
        )

    # ``output`` starts with the prompt tokens; keep only what was generated
    # after them, then decode the single batch entry.
    prompt_len = inputs["input_ids"].shape[1]
    decoded_text = processor.batch_decode(output[:, prompt_len:])[0]

    return decoded_text

# Text shown under the app title (rendered by Gradio as Markdown).
description = """Quick demonstration of the new Unum uForm-gen for image captioning. Upload an image to generate a detailed caption. Modify the Prompt to change the level of detail in the caption.

The model used in this app is available at [Hugging Face Model Hub](https://huggingface.co/unum-cloud/uform-gen) and the source code can be found on [GitHub](https://github.com/unum-cloud/uform)."""

# Input widgets: the picture to caption (delivered as a PIL image) and an
# editable prompt pre-filled with a default instruction.
image_input = gr.Image(type="pil", label="Upload Image")
prompt_input = gr.Textbox(label="Prompt", value="Describe the image in great detail")

# Output widget: plain text box holding the generated caption.
caption_output = gr.Textbox(label="Generated Caption")

# Wire the widgets to the captioning function.
iface = gr.Interface(
    fn=generate_caption,
    inputs=[image_input, prompt_input],
    outputs=caption_output,
    description=description,
)

# Start serving the app.
iface.launch()