# NOTE(review): the original file began with non-Python residue captured from a
# Hugging Face Spaces web page during extraction (status badges, file size,
# commit hashes, and gutter line numbers). It was not part of the program and
# would not parse, so it has been removed.
import time
import torch
import pandas as pd
import gradio as gr
import onnxruntime as ort
from mediapipe.python.solutions import holistic
from utils.model import get_predictions
from utils.data import preprocess
# Markdown placeholders for the Gradio UI (currently left empty).
title = '''
'''
cite_markdown = '''
'''
description = '''
'''
# Example videos shown in the UI (none provided yet).
examples = []
# Load the ONNX model (SAM-SLR-v2 joint stream, judging by the filename)
# into an ONNX Runtime inference session shared by all requests.
ort_session = ort.InferenceSession('VSL_SAM_SLR_V2_joint.onnx')
# Load id-to-gloss mapping: class index -> gloss (sign label) string.
# pd.read_csv(...).to_dict()['gloss'] yields {row_index: gloss}.
id2gloss = pd.read_csv('gloss.csv', names=['id', 'gloss']).to_dict()['gloss']
def inference(
    video: str,
    progress: gr.Progress = gr.Progress(),
) -> str:
    '''
    Video-based inference for Vietnamese Sign Language recognition.

    Extracts holistic keypoints from the video, runs the ONNX model on them,
    and formats the top-3 predictions together with timing information.

    Parameters
    ----------
    video : str
        The path to the video.
    progress : gr.Progress, optional
        The progress bar, by default gr.Progress()

    Returns
    -------
    str
        The inference message.
    '''
    keypoints_detector = holistic.Holistic(
        static_image_mode=False,
        model_complexity=2,
        enable_segmentation=True,
        refine_face_landmarks=True,
    )
    try:
        progress(0, desc='Preprocessing video')
        start_time = time.time()
        inputs = preprocess(
            source=video,
            keypoints_detector=keypoints_detector,
        )
        data_time = time.time() - start_time

        progress(1/2, desc='Getting predictions')
        start_time = time.time()
        predictions = get_predictions(
            inputs=inputs, ort_session=ort_session, id2gloss=id2gloss, k=3
        )
        model_time = time.time() - start_time
    finally:
        # Fix: the detector holds native (C++) mediapipe resources that are
        # not reclaimed by garbage collection alone — close it even on error.
        keypoints_detector.close()

    if not predictions:
        output_message = 'No sign language detected in the video. Please try again.'
    else:
        output_message = 'The top-3 predictions are:\n'
        for i, prediction in enumerate(predictions):
            # Fix: ':2f' (min-width 2, default 6 decimals) was missing the
            # dot; ':.2f' gives the intended 2-decimal score.
            output_message += f'\t{i+1}. {prediction["label"]} ({prediction["score"]:.2f})\n'
        output_message += f'Data processing time: {data_time:.2f} seconds\n'
        output_message += f'Model inference time: {model_time:.2f} seconds\n'
        output_message += f'Total time: {data_time + model_time:.2f} seconds'
        output_message += f'\nInput shape: {inputs.shape}'
    # Fix: report full completion (1), not 1/2.
    progress(1, desc='Completed')
    return output_message
# Build the Gradio app: a single video upload mapped to a text output.
# NOTE(review): `cite_markdown` defined above is never passed here — presumably
# intended as an `article=` argument; confirm with the author.
iface = gr.Interface(
    fn=inference,
    inputs='video',
    outputs='text',
    examples=examples,
    title=title,
    description=description,
)
# Start the web server (blocks until shut down).
iface.launch()
# Local smoke test without the web UI:
# print(inference('000_con_cho.mp4'))
|