File size: 1,164 Bytes
fabaa3c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 |
import torch
import gradio as gr
from transformers import AutoModel
from transformers import AutoProcessor
import spaces
# Load pre-trained models for image captioning and language modeling
model3 = AutoModel.from_pretrained("unum-cloud/uform-gen2-dpo", trust_remote_code=True)
processor = AutoProcessor.from_pretrained("unum-cloud/uform-gen2-dpo", trust_remote_code=True)
# Define a function for image captioning
@spaces.GPU(queue=False)
def videochat(image3, prompt3):
# Process input image and prompt
inputs = processor(text=[prompt3], images=[image3], return_tensors="pt")
# Generate captions
with torch.inference_mode():
output = model3.generate(
**inputs,
do_sample=False,
use_cache=True,
max_new_tokens=256,
eos_token_id=151645,
pad_token_id=processor.tokenizer.pad_token_id
)
prompt_len = inputs["input_ids"].shape[1]
# Decode and return the generated captions
decoded_text = processor.batch_decode(output[:, prompt_len:])[0]
if decoded_text.endswith("<|im_end|>"):
decoded_text = decoded_text[:-10]
yield decoded_text |