Spaces:
Build error
Build error
File size: 5,906 Bytes
ccdf9bb 9ff5263 ccdf9bb 9f09b81 d1d9d76 9f09b81 d21fd5e 9f09b81 ccdf9bb ada447d 039e5e3 ccdf9bb 25cbc9c ac2a7ce ad6702d ac2a7ce ad6702d 31f9148 ad6702d 3a0d867 ad6702d 3a0d867 ad6702d 3a0d867 ad6702d 31f9148 ad6702d 31f9148 ad6702d e75d68f ac2a7ce 525f7c4 ac2a7ce d2bad0b ccdf9bb 4cb7892 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 |
import copy
import os
# Reinstall Gradio at start-up so that the pinned 3.16.0 release is the one
# picked up by the `import gradio` further down; these two calls must stay
# ahead of that import.
# NOTE(review): presumably this works around a different Gradio version being
# preinstalled by the hosting environment — confirm before removing.
os.system("pip uninstall -y gradio")
os.system("pip install gradio==3.16.0")
import torch
import numpy as np
import gradio as gr
# Project-local helpers: pose extraction from a video plus body/hand landmark
# normalisation, together with the landmark identifier lists they operate on.
from spoter_mod.skeleton_extractor import obtain_pose_data
from spoter_mod.normalization.body_normalization import normalize_single_dict as normalize_single_body_dict, BODY_IDENTIFIERS
from spoter_mod.normalization.hand_normalization import normalize_single_dict as normalize_single_hand_dict, HAND_IDENTIFIERS
# Load the pickled model checkpoint onto the CPU and switch it to inference
# mode (disables training-only behaviour such as dropout).
model = torch.load("spoter-checkpoint.pth", map_location=torch.device('cpu'))
model.eval()

# Duplicate every imported hand landmark name for both hands: all "_Left"
# entries first, followed by all "_Right" entries.
HAND_IDENTIFIERS = [
    name + suffix
    for suffix in ("_Left", "_Right")
    for name in HAND_IDENTIFIERS
]

# Class names; position i in this list is the label for model output index i.
GLOSS = [
    'book', 'drink', 'computer', 'before', 'chair', 'go', 'clothes', 'who', 'candy', 'cousin',
    'deaf', 'fine', 'help', 'no', 'thin', 'walk', 'year', 'yes', 'all', 'black',
    'cool', 'finish', 'hot', 'like', 'many', 'mother', 'now', 'orange', 'table', 'thanksgiving',
    'what', 'woman', 'bed', 'blue', 'bowling', 'can', 'dog', 'family', 'fish', 'graduate',
    'hat', 'hearing', 'kiss', 'language', 'later', 'man', 'shirt', 'study', 'tall', 'white',
    'wrong', 'accident', 'apple', 'bird', 'change', 'color', 'corn', 'cow', 'dance', 'dark',
    'doctor', 'eat', 'enjoy', 'forget', 'give', 'last', 'meet', 'pink', 'pizza', 'play',
    'school', 'secretary', 'short', 'time', 'want', 'work', 'africa', 'basketball', 'birthday', 'brown',
    'but', 'cheat', 'city', 'cook', 'decide', 'full', 'how', 'jacket', 'letter', 'medicine',
    'need', 'paint', 'paper', 'pull', 'purple', 'right', 'same', 'son', 'tell', 'thursday',
]

# Device the input tensors are sent to: CUDA when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def tensor_to_dictionary(landmarks_tensor: torch.Tensor) -> dict:
    """Split a landmark tensor into a dict keyed by landmark identifier.

    The tensor is converted to a NumPy array and sliced along its second
    axis; slice ``i`` is stored under the ``i``-th name of
    ``BODY_IDENTIFIERS + HAND_IDENTIFIERS``.

    :param landmarks_tensor: Tensor whose second axis enumerates landmarks
    :return: Mapping of landmark identifier to its slice of the array
    """
    frames = landmarks_tensor.numpy()
    names = BODY_IDENTIFIERS + HAND_IDENTIFIERS
    return {name: frames[:, position] for position, name in enumerate(names)}
def dictionary_to_tensor(landmarks_dict: dict) -> torch.Tensor:
    """Pack a per-landmark dict back into a single tensor.

    The result has one row per entry of ``landmarks_dict["leftEar"]``, one
    column per name in ``BODY_IDENTIFIERS + HAND_IDENTIFIERS`` and two
    channels holding element 0 and element 1 of every stored frame.

    :param landmarks_dict: Mapping of landmark identifier to a sequence of
        frames, each indexable at positions 0 and 1
    :return: Tensor built from the filled NumPy array
    """
    names = BODY_IDENTIFIERS + HAND_IDENTIFIERS
    # The "leftEar" track defines how many frames the output holds.
    frame_count = len(landmarks_dict["leftEar"])
    packed = np.empty(shape=(frame_count, len(names), 2))

    for column, name in enumerate(names):
        track = landmarks_dict[name]
        for axis in (0, 1):
            packed[:, column, axis] = [frame[axis] for frame in track]

    return torch.from_numpy(packed)
def greet(label, video0, video1):
    """Classify the sign shown in a video and return per-gloss probabilities.

    :param label: Selected input source; ``"Webcam"`` uses ``video0`` and
        ``"Video"`` uses ``video1``
    :param video0: Webcam recording, forwarded to ``obtain_pose_data``
    :param video1: Uploaded video, forwarded to ``obtain_pose_data``
    :return: Dict mapping each entry of ``GLOSS`` to its softmax probability.
        An empty dict is returned when ``label`` is not recognised or when the
        selected source holds no video (``None``).
    """
    if label == "Webcam":
        video = video0
    elif label == "Video":
        video = video1
    elif label == "X":
        # Stub branch kept for backward compatibility: returns fixed dummy
        # scores without touching the model.
        return {"A": 0.8, "B": 0.1, "C": 0.1}
    else:
        return {}

    # The UI passes None for the source the user left empty; bail out the same
    # way as for an unknown label instead of failing inside pose extraction.
    if video is None:
        return {}

    identifiers = BODY_IDENTIFIERS + HAND_IDENTIFIERS
    hub = obtain_pose_data(video).data_hub

    # Gather the per-frame X/Y series of every landmark into one array; the
    # "nose_X" series defines the number of frames.
    coordinates = np.empty(shape=(len(hub["nose_X"]), len(identifiers), 2))
    for index, identifier in enumerate(identifiers):
        coordinates[:, index, 0] = hub[identifier + "_X"]
        coordinates[:, index, 1] = hub[identifier + "_Y"]

    landmarks = tensor_to_dictionary(torch.from_numpy(coordinates))

    # The normalisation helpers are fed "_0"/"_1" hand suffixes, while this
    # file names hands "_Left"/"_Right"; translate there and back.
    landmarks = _rename_keys(landmarks, (("_Left", "_0"), ("_Right", "_1")))
    landmarks = normalize_single_body_dict(landmarks)
    landmarks = normalize_single_hand_dict(landmarks)
    landmarks = _rename_keys(landmarks, (("_0", "_Left"), ("_1", "_Right")))

    # NOTE(review): the 0.5 shift presumably centres normalised coordinates
    # around zero as the model expects — confirm against training code.
    features = dictionary_to_tensor(landmarks) - 0.5

    # Run on whatever device the model's weights actually live on, so inputs
    # and weights can never end up on different devices; fall back to the
    # module-level `device` for a parameter-less model.
    first_parameter = next(model.parameters(), None)
    run_device = first_parameter.device if first_parameter is not None else device
    inputs = features.squeeze(0).to(run_device)

    with torch.no_grad():
        outputs = model(inputs).expand(1, -1, -1)
        probabilities = torch.nn.functional.softmax(outputs, dim=2)

    # Move to host memory before converting; .numpy() rejects CUDA tensors.
    scores = probabilities.cpu().numpy()[0, 0]
    return {gloss: float(score) for gloss, score in zip(GLOSS, scores)}


def _rename_keys(mapping, replacements):
    """Return a new dict whose keys have each (old, new) substring pair replaced in order."""
    renamed = {}
    for key, value in mapping.items():
        for old, new in replacements:
            key = key.replace(old, new)
        renamed[key] = value
    return renamed
# Output widget listing the three highest-scoring classes.
label = gr.outputs.Label(num_top_classes=3, label="Top class probabilities")

# Input widgets, in the order `greet` receives them: source selector, webcam
# recording, uploaded video.
_source_choice = gr.Dropdown(["Webcam", "Video"], label="Input source", type="value")
_webcam_video = gr.Video(source="webcam", label="Webcam recording", type="mp4")
_uploaded_video = gr.Video(source="upload", label="Video upload", type="mp4")

# HTML shown above the interface: logo, heading, usage instructions and the
# privacy note.
_description_html = """
<img src="https://www.signlanguagerecognition.com/spoter-logo.png" style="width: 120px; margin-left: -5px;">
<h1 style="margin-top: -20px; color: #F7B832;">ASL Recognition Demo</h1>
<details>
<summary style="font-family: MonumentExpanded; font-size: 1em !important;" class="unselectable">
Instructions
</summary>
<ol>
<li> Upload or record a video.
<ul>
<li> Ensure that there is only a single person in the shot.
<li> The signer should be front-facing and have a calm background.
</ul>
<li> Click "Submit".
<li> Results will appear in "Results" panel on the right shortly.
</ol>
</details>
<details>
<summary style="font-family: MonumentExpanded;font-size: 1em !important;" class="unselectable">
Privacy
</summary>
We do not collect any user information. The videos are deleted from our servers after the inference is completed, unless you flag any of them for further inspection.
</details>
"""

# Example rows: (input source, webcam recording, uploaded video).
_example_rows = [
    ["Video", None, "examples/chair.mp4"],
    ["Video", None, "examples/computer.mp4"],
    ["Video", None, "examples/work.mp4"]
]

demo = gr.Interface(
    fn=greet,
    inputs=[_source_choice, _webcam_video, _uploaded_video],
    outputs=label,
    title="",
    description=_description_html,
    article="by [Matyáš Boháček](https://www.matyasbohacek.com)",
    css="""styles.css""",
    cache_examples=True,
    examples=_example_rows,
)

demo.launch(debug=True)
|