import copy
import torch
import numpy as np
import gradio as gr
from spoter_mod.skeleton_extractor import obtain_pose_data
from spoter_mod.normalization.body_normalization import normalize_single_dict as normalize_single_body_dict, BODY_IDENTIFIERS
from spoter_mod.normalization.hand_normalization import normalize_single_dict as normalize_single_hand_dict, HAND_IDENTIFIERS
# Load the pickled recognition model onto the CPU.
# NOTE(review): torch.load unpickles arbitrary Python objects -- only ever
# point this at a checkpoint obtained from a trusted source.
model = torch.load("spoter-checkpoint.pth", map_location=torch.device('cpu'))
# This script only runs inference; eval mode disables dropout and freezes
# normalization statistics in case the checkpoint was saved in training mode.
model.eval()

# The imported hand identifiers are side-agnostic; expand them into one entry
# per hand (all "_Left" names first, then all "_Right" names).
HAND_IDENTIFIERS = [name + "_Left" for name in HAND_IDENTIFIERS] + [name + "_Right" for name in HAND_IDENTIFIERS]

# Class labels; `greet` pairs the i-th entry with the i-th model output.
GLOSS = ['book', 'drink', 'computer', 'before', 'chair', 'go', 'clothes', 'who', 'candy', 'cousin', 'deaf', 'fine',
         'help', 'no', 'thin', 'walk', 'year', 'yes', 'all', 'black', 'cool', 'finish', 'hot', 'like', 'many', 'mother',
         'now', 'orange', 'table', 'thanksgiving', 'what', 'woman', 'bed', 'blue', 'bowling', 'can', 'dog', 'family',
         'fish', 'graduate', 'hat', 'hearing', 'kiss', 'language', 'later', 'man', 'shirt', 'study', 'tall', 'white',
         'wrong', 'accident', 'apple', 'bird', 'change', 'color', 'corn', 'cow', 'dance', 'dark', 'doctor', 'eat',
         'enjoy', 'forget', 'give', 'last', 'meet', 'pink', 'pizza', 'play', 'school', 'secretary', 'short', 'time',
         'want', 'work', 'africa', 'basketball', 'birthday', 'brown', 'but', 'cheat', 'city', 'cook', 'decide', 'full',
         'how', 'jacket', 'letter', 'medicine', 'need', 'paint', 'paper', 'pull', 'purple', 'right', 'same', 'son',
         'tell', 'thursday']

# Device the input tensors are sent to: CUDA when available, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def tensor_to_dictionary(landmarks_tensor: torch.Tensor) -> dict:
    """Split a landmarks tensor into one NumPy slice per landmark identifier.

    The tensor is converted with ``.numpy()`` and indexed along axis 1 in the
    order of ``BODY_IDENTIFIERS + HAND_IDENTIFIERS``.

    :param landmarks_tensor: Tensor whose second axis enumerates the landmarks
    :return: Dictionary mapping each identifier to ``array[:, index]``
    """
    landmarks = landmarks_tensor.numpy()
    identifiers = BODY_IDENTIFIERS + HAND_IDENTIFIERS
    return {name: landmarks[:, position] for position, name in enumerate(identifiers)}
def dictionary_to_tensor(landmarks_dict: dict) -> torch.Tensor:
    """Assemble a per-identifier landmarks dictionary into a single tensor.

    The frame count is taken from the ``"leftEar"`` entry. For every
    identifier in ``BODY_IDENTIFIERS + HAND_IDENTIFIERS`` the first two
    components of each frame are written to the last axis.

    :param landmarks_dict: Dictionary mapping identifiers to per-frame
        sequences with at least two components each
    :return: Tensor of shape ``(frames, identifiers, 2)``
    """
    frame_count = len(landmarks_dict["leftEar"])
    identifiers = BODY_IDENTIFIERS + HAND_IDENTIFIERS
    stacked = np.empty(shape=(frame_count, len(identifiers), 2))

    for column, name in enumerate(identifiers):
        frames = landmarks_dict[name]
        stacked[:, column, 0] = [point[0] for point in frames]
        stacked[:, column, 1] = [point[1] for point in frames]

    return torch.from_numpy(stacked)
def greet(label, video0, video1):
    """Run sign recognition on a video and return a probability per gloss.

    Args:
        label: Input selector. ``"Webcam"`` picks ``video0``, ``"Video"`` picks
            ``video1``, ``"X"`` returns a fixed placeholder distribution, and
            any other value yields an empty result.
        video0: Webcam recording, passed as-is to ``obtain_pose_data``.
        video1: Uploaded video, passed as-is to ``obtain_pose_data``.

    Returns:
        A dict mapping each entry of ``GLOSS`` to its softmax probability as a
        Python float. ``{}`` is returned when ``label`` is not recognised or
        the selected video is ``None``.
    """
    if label == "Webcam":
        video = video0
    elif label == "Video":
        video = video1
    elif label == "X":
        # Fixed placeholder response; no video is processed on this path.
        return {"A": 0.8, "B": 0.1, "C": 0.1}
    else:
        return {}

    # Nothing was recorded/uploaded for the selected input type.
    if video is None:
        return {}

    identifiers = BODY_IDENTIFIERS + HAND_IDENTIFIERS
    pose_data = obtain_pose_data(video)

    # Pack the per-landmark "<identifier>_X" / "<identifier>_Y" series into a
    # (frames, landmarks, 2) array; "nose_X" supplies the frame count.
    depth_map = np.empty(shape=(len(pose_data.data_hub["nose_X"]), len(identifiers), 2))
    for index, identifier in enumerate(identifiers):
        depth_map[:, index, 0] = pose_data.data_hub[identifier + "_X"]
        depth_map[:, index, 1] = pose_data.data_hub[identifier + "_Y"]

    depth_map = tensor_to_dictionary(torch.from_numpy(depth_map))

    # The normalization helpers are fed "_0"/"_1" hand suffixes instead of the
    # "_Left"/"_Right" ones used in this file, so rename on the way in and
    # restore the names on the way out.
    depth_map = {
        key.replace("_Left", "_0").replace("_Right", "_1"): values
        for key, values in depth_map.items()
    }
    depth_map = normalize_single_body_dict(depth_map)
    depth_map = normalize_single_hand_dict(depth_map)
    depth_map = {
        key.replace("_0", "_Left").replace("_1", "_Right"): values
        for key, values in depth_map.items()
    }

    # NOTE(review): the 0.5 shift presumably mirrors the training-time
    # preprocessing -- confirm against the training pipeline.
    landmarks = dictionary_to_tensor(depth_map) - 0.5

    # dictionary_to_tensor always yields (frames, landmarks, 2) with no batch
    # axis, so nothing is squeezed here: squeeze(0) would only ever drop the
    # frame axis of a single-frame clip.
    # The checkpoint is loaded onto the CPU, so make sure the model lives on
    # the same device as its inputs (a no-op once it has been moved).
    model.to(device)
    inputs = landmarks.to(device)

    with torch.no_grad():
        outputs = model(inputs).expand(1, -1, -1)
        probabilities = torch.nn.functional.softmax(outputs, dim=2)

    # Bring the result back to host memory before converting to NumPy;
    # .numpy() is not available on CUDA tensors.
    results = probabilities.cpu().numpy()[0, 0]

    return {gloss: float(probability) for gloss, probability in zip(GLOSS, results)}
# Output component wired to `greet` through the interface's `outputs=`:
# configured to show the top 3 classes under the given caption.
label = gr.outputs.Label(num_top_classes=3, label="Top class probabilities")
demo = gr.Interface(fn=greet, inputs=[gr.Dropdown(["Webcam", "Video"], label="Please select the input type:", type="value"), gr.Video(source="webcam", label="Webcam recording", type="mp4"), gr.Video(source="upload", label="Video upload", type="mp4")], outputs=label,
title="SPOTER Sign language recognition",
- Upload or record a video.
- Ensure that there is only a single person in the shot.
- The signer should be front-facing and have a calm background.
- Click "Submit".
- Results will appear in "Results" panel on the right shortly.
We do not collect any user information. The videos are deleted from our servers after the inference is completed, unless you flag any of them for further inspection.
article="by [Matyáš Boháček](",
@font-face {
font-family: Graphik;
font-weight: regular;
src: url("") format("opentype");
@font-face {
font-family: Graphik;
font-weight: bold;
src: url("") format("opentype");
@font-face {
font-family: MonumentExpanded;
font-weight: regular;
src: url("") format("opentype");
@font-face {
font-family: MonumentExpanded;
font-weight: bold;
src: url("") format("opentype");
html {
font-family: "Graphik";
h1 {
font-family: "MonumentExpanded";
#12 {
- background-image: linear-gradient(to left, #61D836, #6CB346) !important;
background-color: #61D836 !important;
#12:hover {
- background-image: linear-gradient(to left, #61D836, #6CB346) !important;
background-color: #6CB346 !important;
border: 0 !important;
border-color: 0 !important;
.dark .gr-button-primary {
--tw-gradient-from: #61D836;
--tw-gradient-to: #6CB346;
border: 0 !important;
border-color: 0 !important;
.dark .gr-button-primary:hover {
--tw-gradient-from: #64A642;
--tw-gradient-to: #58933B;
border: 0 !important;
border-color: 0 !important;
.gr-prose li {
margin-top: 0 !important;
margin-bottom: 0 !important;
.gr-prose ol ol, .gr-prose ol ul, .gr-prose ul ol, .gr-prose ul ul {
margin-top: 0 !important;
margin-bottom: 0 !important;
.gr-prose h1 {
font-size: 1.75em !important;
text-align: left !important;
.unselectable {
-webkit-user-select: none;
-moz-user-select: none;
-ms-user-select: none;
user-select: none;
footer {
opacity: 0 !important;
alpha: 0 !important;