import time
import torch
import pandas as pd
import gradio as gr
import onnxruntime as ort
from mediapipe.python.solutions import holistic
from utils.model import get_predictions
from utils.data import preprocess

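# UI text for the Gradio interface (left empty here).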
title = '''

'''

cite_markdown = '''

'''

description = '''

'''

examples = []

# Load the ONNX model into an inference session.
ort_session = ort.InferenceSession('VSL_SAM_SLR_V2_joint.onnx')

# Load id-to-gloss mapping.
id2gloss = pd.read_csv('gloss.csv', names=['id', 'gloss']).to_dict()['gloss']


def inference(
    video: str,
    progress: gr.Progress = gr.Progress(),
) -> str:
    '''
    Video-based inference for Vietnamese Sign Language recognition.

    Parameters
    ----------
    video : str
        The path to the video.
    progress : gr.Progress, optional
        The progress bar, by default gr.Progress()

    Returns
    -------
    str
        The inference message.
    '''
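    # MediaPipe Holistic provides pose, face, and hand landmarks for every frame.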
    keypoints_detector = holistic.Holistic(
        static_image_mode=False,
        model_complexity=2,
        enable_segmentation=True,
        refine_face_landmarks=True,
    )

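    # preprocess (utils.data) is assumed to run the keypoint detector over the video frames and build the model input.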
    progress(0, desc='Preprocessing video')
    start_time = time.time()
    inputs = preprocess(
        source=video,
        keypoints_detector=keypoints_detector,
    )
    end_time = time.time()
    data_time = end_time - start_time

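    # get_predictions (utils.model) is assumed to run the ONNX session and return the top-k glosses with scores.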
    progress(1/2, desc='Getting predictions')
    start_time = time.time()
    predictions = get_predictions(
        inputs=inputs, ort_session=ort_session, id2gloss=id2gloss, k=3
    )
    end_time = time.time()
    model_time = end_time - start_time

    if len(predictions) == 0:
        output_message = 'No sign language detected in the video. Please try again.'
    else:
        output_message = 'The top-3 predictions are:\n'
        for i, prediction in enumerate(predictions):
            output_message += f'\t{i+1}. {prediction["label"]} ({prediction["score"]:.2f})\n'
        output_message += f'Data processing time: {data_time:.2f} seconds\n'
        output_message += f'Model inference time: {model_time:.2f} seconds\n'
        output_message += f'Total time: {data_time + model_time:.2f} seconds'
        output_message += f'\nInput shape: {inputs.shape}'

    progress(1, desc='Completed')

    return output_message


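# Wire the inference function into a Gradio app: one video input, one text output.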
iface = gr.Interface(
    fn=inference,
    inputs='video',
    outputs='text',
    examples=examples,
    title=title,
    description=description,
)
iface.launch()
# print(inference('000_con_cho.mp4'))