# NOTE(review): the original file began with non-Python residue captured from a
# Hugging Face Spaces web page during extraction (status badges, file size,
# commit hashes, and gutter line numbers). It was not part of the program and
# would not parse, so it has been removed.
import time
import torch
import pandas as pd
import gradio as gr
import onnxruntime as ort
from mediapipe.python.solutions import holistic
from utils.model import get_predictions
from utils.data import preprocess
# Markdown placeholders for the Gradio UI (currently left empty).
title = '''
'''
cite_markdown = '''
'''
description = '''
'''
# Example videos shown in the UI (none provided yet).
examples = []
# Load the ONNX model (SAM-SLR-v2 joint stream, judging by the filename)
# into an ONNX Runtime inference session shared by all requests.
ort_session = ort.InferenceSession('VSL_SAM_SLR_V2_joint.onnx')
# Load id-to-gloss mapping: class index -> gloss (sign label) string.
# pd.read_csv(...).to_dict()['gloss'] yields {row_index: gloss}.
id2gloss = pd.read_csv('gloss.csv', names=['id', 'gloss']).to_dict()['gloss']
def inference(
    video: str,
    progress: gr.Progress = gr.Progress(),
) -> str:
    '''
    Video-based inference for Vietnamese Sign Language recognition.

    Extracts holistic keypoints from the video, runs the ONNX model on them,
    and formats the top-3 predictions together with timing information.

    Parameters
    ----------
    video : str
        The path to the video.
    progress : gr.Progress, optional
        The progress bar, by default gr.Progress()

    Returns
    -------
    str
        The inference message.
    '''
    keypoints_detector = holistic.Holistic(
        static_image_mode=False,
        model_complexity=2,
        enable_segmentation=True,
        refine_face_landmarks=True,
    )
    try:
        progress(0, desc='Preprocessing video')
        start_time = time.time()
        inputs = preprocess(
            source=video,
            keypoints_detector=keypoints_detector,
        )
        data_time = time.time() - start_time

        progress(1/2, desc='Getting predictions')
        start_time = time.time()
        predictions = get_predictions(
            inputs=inputs, ort_session=ort_session, id2gloss=id2gloss, k=3
        )
        model_time = time.time() - start_time
    finally:
        # Fix: the detector holds native (C++) mediapipe resources that are
        # not reclaimed by garbage collection alone — close it even on error.
        keypoints_detector.close()

    if not predictions:
        output_message = 'No sign language detected in the video. Please try again.'
    else:
        output_message = 'The top-3 predictions are:\n'
        for i, prediction in enumerate(predictions):
            # Fix: ':2f' (min-width 2, default 6 decimals) was missing the
            # dot; ':.2f' gives the intended 2-decimal score.
            output_message += f'\t{i+1}. {prediction["label"]} ({prediction["score"]:.2f})\n'
        output_message += f'Data processing time: {data_time:.2f} seconds\n'
        output_message += f'Model inference time: {model_time:.2f} seconds\n'
        output_message += f'Total time: {data_time + model_time:.2f} seconds'
        output_message += f'\nInput shape: {inputs.shape}'
    # Fix: report full completion (1), not 1/2.
    progress(1, desc='Completed')
    return output_message
# Build the Gradio app: a single video upload mapped to a text output.
# NOTE(review): `cite_markdown` defined above is never passed here — presumably
# intended as an `article=` argument; confirm with the author.
iface = gr.Interface(
    fn=inference,
    inputs='video',
    outputs='text',
    examples=examples,
    title=title,
    description=description,
)
# Start the web server (blocks until shut down).
iface.launch()
# Local smoke test without the web UI:
# print(inference('000_con_cho.mp4'))
|