# Hugging Face Space: IbrahimHasani/ActionDetectionVideo
# Space status at capture time: Runtime error
# File size: 4,206 bytes
# Commits touching this file (from the blame gutter): 56de2d4, b8466ce, 10696ac

import gradio as gr
import torch
import numpy as np
from transformers import AutoProcessor, AutoModel
from PIL import Image
from decord import VideoReader, cpu
import cv2

# Report the accelerator at startup. The device name is queried only when CUDA
# is present: torch.cuda.current_device() raises on a CPU-only host, which
# would abort the whole app before the interface is ever built.
print(f"Is CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # e.g. "Tesla T4" on the GPU host this was first run on.
    print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
else:
    print("CUDA device: none (running on CPU)")

def sample_uniform_frame_indices(clip_len, seg_len):
    """Pick ``clip_len`` frame indices from a video that has ``seg_len`` frames.

    When the video is shorter than the clip, the indices ``0 .. seg_len - 1``
    are cycled until ``clip_len`` entries exist. Otherwise frames are taken at
    a fixed stride of ``seg_len // clip_len`` starting from frame 0.

    Args:
        clip_len: Number of indices to return; must be positive.
        seg_len: Number of frames available in the video; must be positive.

    Returns:
        A 1-D ``np.int64`` array of length ``clip_len``.

    Raises:
        ValueError: If ``clip_len`` or ``seg_len`` is not positive.
    """
    # Fail with a readable message instead of a ZeroDivisionError further down.
    if clip_len <= 0 or seg_len <= 0:
        raise ValueError(
            f"clip_len and seg_len must be positive, got clip_len={clip_len}, "
            f"seg_len={seg_len}"
        )

    positions = np.arange(clip_len, dtype=np.int64)
    if seg_len < clip_len:
        # Too few frames: wrap around so the sequence 0..seg_len-1 repeats.
        return positions % seg_len
    # Enough frames: constant integer stride, anchored at the first frame.
    return positions * (seg_len // clip_len)

def read_video_decord(file_path, indices):
    """Decode the frames at ``indices`` from ``file_path`` with decord.

    The reader is opened single-threaded on the CPU context and the requested
    batch is returned after conversion through ``asnumpy()``.
    """
    reader = VideoReader(file_path, num_threads=1, ctx=cpu(0))
    return reader.get_batch(indices).asnumpy()

def read_video_opencv(file_path, indices):
    """Decode the frames at ``indices`` from ``file_path`` with OpenCV.

    Each requested frame is located by seeking, read, and converted from
    OpenCV's BGR channel order to RGB. Frames that fail to decode are skipped,
    so the result may hold fewer frames than ``indices`` has entries.

    Args:
        file_path: Path of the video file to open.
        indices: Iterable of frame positions to read.

    Returns:
        ``np.array`` built from the list of successfully decoded RGB frames.

    Raises:
        IOError: If OpenCV cannot open ``file_path``.
    """
    vidcap = cv2.VideoCapture(file_path)
    if not vidcap.isOpened():
        vidcap.release()
        raise IOError(f"Could not open video file: {file_path}")

    frames = []
    try:
        for idx in indices:
            # Plain int so the seek position does not depend on NumPy scalar
            # conversion inside the OpenCV bindings.
            vidcap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
            success, image = vidcap.read()
            if success:
                # Convert BGR to RGB
                frames.append(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    finally:
        # Always free the capture handle, even if decoding raises.
        vidcap.release()
    return np.array(frames)


def concatenate_frames(frames, clip_len):
    """Tile the sampled frames into a single contact-sheet image.

    The grid shape is looked up from ``clip_len`` (32 -> 4x8, 16 -> 4x4,
    8 -> 2x4) and frames are pasted row by row, left to right. Every tile uses
    the height and width of the first frame. If fewer frames than grid cells
    are supplied, the remaining cells are left unpainted; surplus frames are
    ignored.

    Args:
        frames: Sequence of image arrays; ``frames[0].shape`` supplies
            (height, width, ...).
        clip_len: Clip length used to select the grid layout.

    Returns:
        A PIL ``Image`` in RGB mode containing the tiled frames.

    Raises:
        KeyError: If ``clip_len`` is not one of 32, 16 or 8.
        ValueError: If ``frames`` is empty.
    """
    layout = {
        32: (4, 8),
        16: (4, 4),
        8:  (2, 4)
    }
    rows, cols = layout[clip_len]
    if len(frames) == 0:
        raise ValueError("No frames were provided to concatenate")

    frame_height, frame_width = frames[0].shape[:2]
    combined_image = Image.new('RGB', (frame_width * cols, frame_height * rows))
    # Iterating the frames themselves (rather than the grid cells) means a
    # short frame list just leaves trailing cells empty instead of raising
    # StopIteration from an exhausted iterator.
    for position, frame in enumerate(frames[:rows * cols]):
        row, col = divmod(position, cols)
        combined_image.paste(
            Image.fromarray(frame),
            (col * frame_width, row * frame_height),
        )
    return combined_image

# Most recently loaded (processor, model) pair, keyed by checkpoint name.
# Holds at most one entry so repeated requests with the same model skip the
# expensive reload without keeping several checkpoints in memory.
_LOADED_MODEL = {}


def _load_processor_and_model(model_choice):
    """Return the (processor, model) pair for ``model_choice``, reusing the last load."""
    cached = _LOADED_MODEL.get(model_choice)
    if cached is None:
        # Drop the previous checkpoint first so two models are not held at once.
        _LOADED_MODEL.clear()
        cached = (
            AutoProcessor.from_pretrained(model_choice),
            AutoModel.from_pretrained(model_choice),
        )
        _LOADED_MODEL[model_choice] = cached
    return cached


def model_interface(uploaded_video, model_choice, activity):
    """Score how well an uploaded video matches ``activity`` versus "other".

    Frames are sampled from the video, tiled into a preview image, and passed
    together with the two text labels to the selected checkpoint. The
    per-video logits are turned into probabilities with a softmax.

    Args:
        uploaded_video: Path of the uploaded video file.
        model_choice: Checkpoint name; when empty, the zero-shot checkpoint is
            used (the same one the 32-frame default clip length belongs to).
        activity: Text label of the activity to recognize.

    Returns:
        A 4-tuple of: the tiled preview image, a list of
        ``(label, "Probability: ...%")`` pairs, a list of
        ``(label, "Raw Score: ...")`` pairs, and ``[top_label, top_percent]``.

    Raises:
        ValueError: If no video was uploaded, or fewer frames than the model's
            clip length could be decoded.
    """
    if not uploaded_video:
        raise ValueError("Please upload a video file.")
    if not model_choice:
        # The dropdown may be left unselected; fall back to the checkpoint
        # that matches the default clip length below.
        model_choice = "microsoft/xclip-base-patch16-zero-shot"

    clip_len = {
        "microsoft/xclip-base-patch16-zero-shot": 32,
        "microsoft/xclip-base-patch32-16-frames": 16,
        "microsoft/xclip-base-patch32": 8
    }.get(model_choice, 32)
    indices = sample_uniform_frame_indices(clip_len, seg_len=len(VideoReader(uploaded_video)))
    video = read_video_opencv(uploaded_video, indices)
    # The OpenCV reader skips frames it cannot decode; stop here with a clear
    # message rather than failing later inside tiling or the model.
    if len(video) < clip_len:
        raise ValueError(
            f"Could only decode {len(video)} of the {clip_len} frames required "
            f"by {model_choice}."
        )
    concatenated_image = concatenate_frames(video, clip_len)

    # Appending "other" to the list of activities
    activities_list = [activity, "other"]
    processor, model = _load_processor_and_model(model_choice)
    inputs = processor(
        text=activities_list,
        videos=list(video),
        return_tensors="pt",
        padding=True,
    )

    with torch.no_grad():
        outputs = model(**inputs)

    logits_per_video = outputs.logits_per_video
    probs = logits_per_video.softmax(dim=1)

    # Only one video is submitted, so row 0 carries all label scores.
    video_probs = probs[0].tolist()
    video_logits = logits_per_video[0].tolist()

    results_probs = []
    results_logits = []
    for current_activity, prob, logit in zip(activities_list, video_probs, video_logits):
        results_probs.append((current_activity, f"Probability: {prob * 100:.2f}%"))
        results_logits.append((current_activity, f"Raw Score: {logit:.2f}"))

    max_prob_index = torch.argmax(probs[0]).item()
    likely_label = activities_list[max_prob_index]
    likely_probability = video_probs[max_prob_index] * 100

    return concatenated_image, results_probs, results_logits, [likely_label, likely_probability]

# Gradio UI: a video upload, a model selector and an activity prompt feed
# model_interface; its four return values map onto the four outputs in order.
iface = gr.Interface(
    fn=model_interface,
    inputs=[
        gr.components.Video(label="Upload a video file"),
        gr.components.Dropdown(
            choices=[
                "microsoft/xclip-base-patch16-zero-shot",
                "microsoft/xclip-base-patch32-16-frames",
                "microsoft/xclip-base-patch32"
            ],
            # Preselect a model so a submit without touching the dropdown
            # does not send an empty choice to the handler.
            value="microsoft/xclip-base-patch16-zero-shot",
            label="Model Choice",
        ),
        # `value` is the initial-content parameter of gr.components.Textbox;
        # `default` belonged to the older gr.inputs API.
        gr.components.Textbox(value="dancing", label="Desired Activity to Recognize"),
    ],
    outputs=[
        gr.components.Image(type="pil", label="Sampled Frames"),
        gr.components.Textbox(type="text", label="Probabilities"),
        gr.components.Textbox(type="text", label="Raw Scores"),
        gr.components.Textbox(type="text", label="Top Prediction")
    ],
    live=False
)

iface.launch()