Spaces:
Runtime error
Runtime error
File size: 2,723 Bytes
4d426dd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
from decord import VideoReader
import torch
from transformers import AutoImageProcessor, AutoTokenizer, VisionEncoderDecoderModel
import gradio as gr
# Run inference on the GPU when PyTorch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the pretrained frame preprocessor, the caption tokenizer, and the
# encoder-decoder captioning model, then move the model to the chosen device.
image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = VisionEncoderDecoderModel.from_pretrained(
    "Neleac/timesformer-gpt2-video-captioning"
)
model = model.to(device)
def _sample_frame_indices(total_frames, num_samples):
    """Return ``num_samples`` frame indices spread evenly over a video.

    Index ``i`` is ``i * total_frames // num_samples``, so the result always
    has exactly ``num_samples`` entries, starts at frame 0 and never reaches
    ``total_frames``. When the video has fewer frames than requested, some
    indices repeat instead of the computation failing.

    Args:
        total_frames: Number of frames available in the video.
        num_samples: Number of frames to pick.

    Returns:
        A list of ``num_samples`` non-decreasing indices, or an empty list
        when either argument is not positive.
    """
    if total_frames <= 0 or num_samples <= 0:
        return []
    return [i * total_frames // num_samples for i in range(num_samples)]


def generate_caption(video, max_length, min_length, beam_size, througputs):
    """Caption a video with the module-level captioning model.

    Opens the video with decord, samples ``througputs * clip_len`` frames
    (``clip_len`` is ``model.config.encoder.num_frames``), preprocesses them
    with ``image_processor``, runs ``model.generate`` and decodes the first
    returned sequence with ``tokenizer``.

    Args:
        video: Value of the ``gr.Video`` input, passed straight to
            ``VideoReader``; falsy when nothing was uploaded.
        max_length: Upper bound passed to ``model.generate`` as ``max_length``.
        min_length: Lower bound passed to ``model.generate`` as ``min_length``.
        beam_size: Passed to ``model.generate`` as ``num_beams``.
        througputs: Multiplier applied to ``clip_len`` to decide how many
            frames are sampled from the video.

    Returns:
        The decoded caption string.

    Raises:
        gr.Error: If no video was supplied or the video has no frames.
    """
    if not video:
        raise gr.Error("Please upload a video before generating a caption.")

    # Widget values may arrive as floats depending on the Gradio version
    # (TODO confirm); int() is a no-op for ints and keeps range/generate happy.
    max_length = int(max_length)
    min_length = int(min_length)
    beam_size = int(beam_size)
    througputs = int(througputs)

    # read video
    container = VideoReader(video)
    total_frames = len(container)
    if total_frames == 0:
        raise gr.Error("The uploaded video contains no readable frames.")

    clip_len = model.config.encoder.num_frames
    # A fixed integer stride becomes 0 for short videos (range() then raises)
    # and yields a frame count that varies with the video length; evenly
    # spaced indices always give exactly the requested number of frames.
    indices = _sample_frame_indices(total_frames, througputs * clip_len)
    frames = list(container.get_batch(indices).asnumpy())

    # process frames
    pixel_values = image_processor(frames, return_tensors="pt").pixel_values.to(
        device
    )

    # generate caption
    gen_kwargs = {
        "min_length": min_length,
        "max_length": max_length,
        "num_beams": beam_size,
    }
    tokens = model.generate(pixel_values, **gen_kwargs)
    caption = tokenizer.batch_decode(tokens, skip_special_tokens=True)[0]
    return caption


# NOTE(review): the source had no indentation, so the layout nesting below
# (which widgets sit in which Row/Column/Accordion) is reconstructed from the
# statement order — confirm against the intended page layout.
with gr.Blocks() as demo:
    demo.title = "Semantic Summarization of Videos using DLSG"
    gr.Markdown('# Semantic Summarization of Videos using DLSG, Demo by Batch_B29')
    with gr.Row():
        # Left column: video upload and the trigger button.
        with gr.Column(scale=2):
            video = gr.Video(label="Upload Video", format="mp4")
            generate = gr.Button(value="Generate Caption")
        # Right column: caption output and generation settings.
        with gr.Column(scale=1):
            text = gr.Textbox(label="Caption", placeholder="Caption will appear here")
            with gr.Accordion("Settings", open=True):
                with gr.Row():
                    max_length = gr.Slider(
                        label="Max Length", minimum=10, maximum=100, value=20, step=1
                    )
                    min_length = gr.Slider(
                        label="Min Length", minimum=1, maximum=10, value=10, step=1
                    )
                beam_size = gr.Slider(label="Beam size", minimum=1, maximum=8, value=8, step=1)
                througputs = gr.Radio(
                    label="througputs", choices=[1, 2, 3], value=1
                )

    # Event listeners must be registered while the Blocks context is open.
    generate.click(
        generate_caption,
        inputs=[video, max_length, min_length, beam_size, througputs],
        outputs=text,
    )
# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()