import os

# Select the torch backend before Keras is imported.
os.environ["KERAS_BACKEND"] = "torch"

import json
import pickle
import subprocess
import tempfile
import time
from typing import NamedTuple

import cv2
import ffmpeg
import keras
import numpy as np
import pandas as pd
import streamlit as st
from keras.layers import LSTM, Dense, Bidirectional, Dropout, Input, BatchNormalization
from keras.models import Sequential
from PIL import Image

import util
from expression_mapping import expression_mapping
from ISL_Model_parameter import ISLSignPosTranslator
from model import handpose_model, bodypose_25_model

class FFProbeResult(NamedTuple):
    return_code: int
    json: str
    error: str


def ffprobe(file_path) -> FFProbeResult:
    command_array = ["ffprobe",
                     "-v", "quiet",
                     "-print_format", "json",
                     "-show_format",
                     "-show_streams",
                     file_path]
    result = subprocess.run(command_array, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
    return FFProbeResult(return_code=result.returncode,
                         json=result.stdout,
                         error=result.stderr)
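

# A minimal usage sketch (assumes an `ffprobe` binary on PATH; 'sample.mp4' is
# a hypothetical file), mirroring how the result is consumed further below:
#
#   probe = ffprobe('sample.mp4')
#   if probe.return_code == 0:
#       streams = json.loads(probe.json)["streams"]
#       video = [s for s in streams if s["codec_type"] == "video"][0]
#       print(video["avg_frame_rate"], video["pix_fmt"], video["codec_name"])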

X_body_test = [f'bodypeaks_x_{i}' for i in range(15)] + [f'bodypeaks_y_{i}' for i in range(15)]
X_hand0_test = [f'hand0peaks_x_{i}' for i in range(21)] + [f'hand0peaks_y_{i}' for i in range(21)] + [f'hand0peaks_peaktxt{i}' for i in range(21)]
X_hand1_test = [f'hand1peaks_x_{i}' for i in range(21)] + [f'hand1peaks_y_{i}' for i in range(21)] + [f'hand1peaks_peaktxt{i}' for i in range(21)]

feature_columns_new = X_body_test + X_hand0_test + X_hand1_test
label_columns = ['Expression_encoded']
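
# Each frame is therefore a 156-dimensional feature vector:
#   body:      15 x + 15 y keypoints                 -> 30 values
#   each hand: 21 x + 21 y keypoints + 21 'peaktxt'  -> 63 values
# 30 + 63 + 63 = 156, which matches the model's Input(shape=(20, 156)).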


# Cached as data (not a resource): the function returns plain Python lists.
@st.cache_data
def create_timeseries_data(isl_data, feature_columns, label_columns, window_size=20):
    """
    Creates timeseries data from a DataFrame with a specified window size,
    left-padding short windows with zero frames.

    Args:
        isl_data (pandas.DataFrame): The input DataFrame of per-frame features.
        feature_columns (list): Names of the feature columns to extract.
        label_columns (list): Names of the label columns (kept for the caller's use).
        window_size (int, optional): The window size for creating timeseries data. Defaults to 20.

    Returns:
        tuple: (X, y), where each element of X is one window of timeseries data
        and each element of y is the corresponding encoded expression.
    """

    if isl_data.empty:
        return [], []

    X = []
    y = []
    for group, file_df in isl_data.groupby(['Type', 'Expression_encoded', 'FileName']):
        expr_types, exprs, filepaths = group

        # Zero frame used to pad windows shorter than window_size; the model's
        # Masking(mask_value=0.) layer skips these padded steps.
        first_frame = np.zeros((1, 156))
        for idx, x in enumerate([file_df[i:i + window_size] for i in range(file_df.shape[0])]):
            if x.shape[0] < window_size:
                X.append(np.concatenate((np.repeat(first_frame, (window_size - x.shape[0]), axis=0),
                                         x[feature_columns].values), axis=0))
                y.append(exprs)
                continue

            X.append(x[feature_columns].values)
            y.append(exprs)

    return X, y
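
# Usage sketch (hypothetical DataFrame): every start frame of a clip yields one
# window, and windows that would run past the end of the clip are left-padded
# with zero frames, so each element of X has shape (window_size, 156).
#
#   X, y = create_timeseries_data(testing_df, feature_columns_new, label_columns)
#   np.array(X).shape  # -> (num_windows, 20, 156)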


@st.cache_resource
def get_translator_model():
    translation_model = Sequential()
    translation_model.add(Input(shape=(20, 156)))
    translation_model.add(keras.layers.Masking(mask_value=0.))
    translation_model.add(BatchNormalization())
    translation_model.add(Bidirectional(LSTM(32, recurrent_dropout=0.2, return_sequences=True)))

    translation_model.add(Dropout(0.2))
    translation_model.add(Bidirectional(LSTM(32, recurrent_dropout=0.2)))

    translation_model.add(keras.layers.Activation('elu'))
    translation_model.add(Dense(32, use_bias=False, kernel_initializer='he_normal'))

    translation_model.add(BatchNormalization())
    translation_model.add(Dropout(0.2))
    translation_model.add(keras.layers.Activation('elu'))
    translation_model.add(Dense(32, kernel_initializer='he_normal', use_bias=False))

    translation_model.add(BatchNormalization())
    translation_model.add(keras.layers.Activation('elu'))
    translation_model.add(Dropout(0.2))
    translation_model.add(Dense(len(expression_mapping), activation='softmax'))
    translation_model.load_weights('isl_model_final.keras')
    return translation_model
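

# Inference sketch: the cached model maps a batch of 20-frame feature windows
# to a probability distribution over the known expressions (hypothetical input):
#
#   model = get_translator_model()
#   window = np.zeros((1, 20, 156), dtype=np.float32)
#   probs = model(window)  # shape (1, len(expression_mapping)), rows sum to 1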


# Pre-computed per-frame keypoint features and the index of test videos.
testing_df = pd.read_csv('testing_cleaned.csv')
test_files_df = pd.read_csv('test_files.csv')


class Writer():
    """Pipes raw BGR frames into an ffmpeg encoding process."""

    def __init__(self, output_file, input_fps, input_framesize, input_pix_fmt,
                 input_vcodec):
        # input_framesize is (height, width), as returned by frame.shape[:2];
        # ffmpeg expects the size as 'widthxheight'.
        self.ff_proc = (
            ffmpeg
            .input('pipe:',
                   format='rawvideo',
                   pix_fmt="bgr24",
                   s='%sx%s' % (input_framesize[1], input_framesize[0]),
                   r=input_fps)
            .output(output_file, pix_fmt=input_pix_fmt, vcodec=input_vcodec)
            .overwrite_output()
            .run_async(pipe_stdin=True)
        )

    def __call__(self, frame):
        self.ff_proc.stdin.write(frame.tobytes())

    def close(self):
        self.ff_proc.stdin.close()
        self.ff_proc.wait()
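

# Usage sketch (hypothetical values): stream annotated BGR frames straight
# into an ffmpeg encode. input_framesize comes from frame.shape[:2], i.e.
# (h, w), and input_fps may be a fraction string such as '25/1' from ffprobe.
#
#   writer = Writer('out.mp4', input_fps='25/1', input_framesize=(1080, 1920),
#                   input_pix_fmt='yuv420p', input_vcodec='h264')
#   writer(frame)   # frame: np.uint8 array of shape (1080, 1920, 3)
#   writer.close()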


st.title('ISL: Indian Sign Language Translation using LSTM')

st.markdown(
    """
    <style>
    [data-testid="stSidebar"][aria-expanded="true"] > div:first-child {
        width: 350px;
    }
    [data-testid="stSidebar"][aria-expanded="false"] > div:first-child {
        width: 350px;
        margin-left: -350px;
    }
    </style>
    """,
    unsafe_allow_html=True,
)

st.sidebar.title('ISL Sign Language Translation using Openpose')
st.sidebar.subheader('Parameters')

# Per-expression history of top-3 probabilities across processed windows.
frame_wise_outputs = {}


def weighted_average(nums, weights):
    if sum(weights) == 0:
        return 0
    return sum(x * y for x, y in zip(nums, weights)) / sum(weights)
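
# A quick worked example (hypothetical numbers):
#   weighted_average([1, 3], [1, 3]) == (1*1 + 3*3) / (1 + 3) == 2.5
# All-zero weights return 0 rather than dividing by zero.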


@st.cache_data
def image_resize(image, width=None, height=None, inter=cv2.INTER_AREA):
    """Resize an image to the given width or height, preserving aspect ratio."""
    dim = None
    (h, w) = image.shape[:2]

    if width is None and height is None:
        return image

    if width is None:
        r = height / float(h)
        dim = (int(w * r), height)
    else:
        r = width / float(w)
        dim = (width, int(h * r))

    resized = cv2.resize(image, dim, interpolation=inter)
    return resized
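
# Usage sketch (hypothetical frame): shrink a 1920x1080 frame to a 640-pixel
# width; the height follows from the aspect ratio.
#
#   frame = np.zeros((1080, 1920, 3), dtype=np.uint8)
#   small = image_resize(frame, width=640)  # -> shape (360, 640, 3)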


app_mode = st.sidebar.selectbox('Choose the App mode',
                                ['About App', 'Run on Test Videos']
                                )

if app_mode == 'About App':
    st.markdown('This application demonstrates a model developed for translating Indian Sign Language (ISL) using an LSTM.')
    st.markdown(
        """
        <style>
        [data-testid="stSidebar"][aria-expanded="true"] > div:first-child {
            width: 400px;
        }
        [data-testid="stSidebar"][aria-expanded="false"] > div:first-child {
            width: 400px;
            margin-left: -400px;
        }
        </style>
        """,
        unsafe_allow_html=True,
    )

    st.markdown('''
# Dataset Used

This model is trained on the [INCLUDE](https://zenodo.org/records/4010759) dataset.

### Key statistics of the dataset

| Characteristic       | INCLUDE dataset |
|----------------------|-----------------|
| Categories           | 15              |
| Words                | 263             |
| Videos               | 4287            |
| Avg videos per class | 16.3            |
| Avg video length     | 2.57s           |
| Min video length     | 1.28s           |
| Max video length     | 6.16s           |
| Frame rate           | 25fps           |
| Resolution           | 1920x1080       |

#### Size of each category

| Category           | Number of Classes | Number of Videos |
|--------------------|-------------------|------------------|
| Adjectives         | 59                | 791              |
| Animals            | 8                 | 166              |
| Clothes            | 10                | 198              |
| Colours            | 11                | 222              |
| Days and Time      | 22                | 306              |
| Electronics        | 10                | 140              |
| Greetings          | 9                 | 185              |
| Means of Transport | 9                 | 186              |
| Objects at Home    | 27                | 379              |
| Occupations        | 16                | 225              |
| People             | 26                | 513              |
| Places             | 19                | 399              |
| Pronouns           | 8                 | 168              |
| Seasons            | 6                 | 85               |
| Society            | 23                | 324              |
| **Total**          | **263**           | **4287**         |

Below is the count of videos we were able to process (1,986 of 4,287). We processed a limited subset of records due to time/compute constraints.
''')

    image = np.array(Image.open('eda/categories_processed.png'))
    st.image(image)

    st.markdown('#### Count of videos per label in each dataframe')
    image = np.array(Image.open('eda/distribution_of_data.png'))
    st.image(image)

    st.markdown('### Data Pipeline')
    image = np.array(Image.open('DataPipeline.png'))
    st.image(image)
    st.markdown('''
### Model structure

```
translation_model = Sequential()
translation_model.add(Input(shape=(20, 156)))
translation_model.add(keras.layers.Masking(mask_value=0.))
translation_model.add(BatchNormalization())
translation_model.add(Bidirectional(LSTM(32, recurrent_dropout=0.2, return_sequences=True)))

translation_model.add(Dropout(0.2))
translation_model.add(Bidirectional(LSTM(32, recurrent_dropout=0.2)))

translation_model.add(keras.layers.Activation('elu'))
translation_model.add(Dense(32, use_bias=False, kernel_initializer='he_normal'))

translation_model.add(BatchNormalization())
translation_model.add(Dropout(0.2))
translation_model.add(keras.layers.Activation('elu'))
translation_model.add(Dense(32, kernel_initializer='he_normal', use_bias=False))

translation_model.add(BatchNormalization())
translation_model.add(keras.layers.Activation('elu'))
translation_model.add(Dropout(0.2))
translation_model.add(Dense(len(expression_mapping), activation='softmax'))
isl_translator = ISLSignPosTranslator(bodypose_25_model(), handpose_model(), translation_model)
```

Total params: 82,679 (322.96 KB)

Trainable params: 82,239 (321.25 KB)

Non-trainable params: 440 (1.72 KB)
''')
    image = np.array(Image.open('model-graph.png'))
    st.image(image)

    st.markdown('''
# Training

[Tensorboard](https://huggingface.co/cdsteameight/ISL-SignLanguageTranslation/tensorboard)
''')

elif app_mode == 'Run on Test Videos':

    category = st.sidebar.selectbox('Choose Category',
                                    np.sort(test_files_df['Category'].unique(), axis=-1, kind='mergesort'))

    mask = (test_files_df['Category'] == category)
    test_files_df_category = test_files_df[mask]
    cls = st.sidebar.selectbox('Choose Class',
                               np.sort(test_files_df_category['Class'].unique(), axis=-1, kind='mergesort')
                               )
    # Filter on the already category-filtered frame so the boolean index aligns.
    mask = (test_files_df_category['Class'] == cls)
    filename = st.sidebar.selectbox('Choose File',
                                    np.sort(test_files_df_category[mask]['Filename'].unique(), axis=-1, kind='mergesort')
                                    )

    if st.sidebar.button("Start", type="primary"):
        # Restrict the pre-computed keypoint features to the selected video.
        mask = (testing_df['FileName'] == filename) & (testing_df['Type'] == category) & (testing_df['Expression'] == cls)

        window_size = 20
        current_test_df = testing_df[mask]
        X_test_filtered, y_test_filtered = create_timeseries_data(current_test_df, feature_columns_new, label_columns, window_size=window_size)

        X_test_filtered = np.array(X_test_filtered)

        st.sidebar.markdown('---')
        st.markdown(
            """
            <style>
            [data-testid="stSidebar"][aria-expanded="true"] > div:first-child {
                width: 400px;
            }
            [data-testid="stSidebar"][aria-expanded="false"] > div:first-child {
                width: 400px;
                margin-left: -400px;
            }
            </style>
            """,
            unsafe_allow_html=True,
        )

        st.sidebar.markdown('---')

        st.markdown('## Output')

        # Placeholder for the running "frames processed / detected class" table.
        runtime_progress = st.empty()
        with runtime_progress.container():
            df1 = pd.DataFrame([['--', '--']], columns=['Frames Processed', 'Detected Class'])
            my_table = st.table(df1)

        # Placeholder that always shows the most recently annotated frame.
        view = st.empty()
        st.markdown("<hr/>", unsafe_allow_html=True)
        stframes = st.empty()

        vid = cv2.VideoCapture(f'test/{category}/{cls}/{filename}')

        # Probe the container so the output video can reuse the input's
        # frame rate, pixel format, codec and file extension.
        ffprobe_result = ffprobe(f'test/{category}/{cls}/{filename}')
        info = json.loads(ffprobe_result.json)
        videoinfo = [i for i in info["streams"] if i["codec_type"] == "video"][0]
        input_fps = videoinfo["avg_frame_rate"]
        input_pix_fmt = videoinfo["pix_fmt"]
        input_vcodec = videoinfo["codec_name"]
        postfix = info["format"]["format_name"].split(",")[0]

        width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps_input = int(vid.get(cv2.CAP_PROP_FPS))

        window_size = 20
        window = []

        with tempfile.NamedTemporaryFile(suffix=f'.{postfix}', delete=False) as tmp_file:
            output_file = tmp_file.name

        writer = None
        weighted_avg_dict = {}

        idx = 0

        for _, row in current_test_df.iterrows():

            if vid.isOpened():
                ret, frame = vid.read()
                if not ret:
                    break

                if len(window) < window_size:
                    # Not enough frames for a prediction yet: draw the detected
                    # pose (serialised as Python literals in the CSV) with empty
                    # bar plots.
                    canvas = util.drawStickmodel(frame, eval(row['bodypose_circles']), eval(row['bodypose_sticks']), eval(row['handpose_edges']), eval(row['handpose_peaks']))
                    canvas_with_plot = util.draw_bar_plot_below_image(canvas, {}, f'Prediction bar plot - Frame number {idx+1} [** no predictions]', canvas)
                    canvas_with_plot = util.draw_bar_plot_below_image(canvas_with_plot, weighted_avg_dict, f'Weighted avg - Frame number {idx+1} [** no predictions]', canvas)
                    canvas_with_plot = util.add_padding_to_bottom(canvas_with_plot, (255, 255, 255), 100)

                    if writer is None:
                        input_framesize = canvas_with_plot.shape[:2]
                        writer = Writer(output_file, input_fps, input_framesize, input_pix_fmt,
                                        input_vcodec)

                    writer(canvas_with_plot)

                    with runtime_progress.container():
                        df1 = pd.DataFrame([[f'{idx+1}/{current_test_df.shape[0]}', '<model will output after 20 frames>']], columns=['Frames Processed', 'Detected Class'])
                        my_table = st.table(df1)
                    window.append(frame)

                    with view.container():
                        st.image(canvas_with_plot, channels='BGR', use_column_width=True)
                else:
                    # Slide the window forward by one frame.
                    window[:-1] = window[1:]
                    window[-1] = frame
                    # Cached by st.cache_resource, so this is cheap after the first call.
                    translation_model = get_translator_model()

                    # Classify the pre-computed 20-frame feature window aligned
                    # with this frame.
                    encoded_translation = translation_model(X_test_filtered[idx - 20].reshape(1, X_test_filtered[idx - 20].shape[0], X_test_filtered[idx - 20].shape[1]))
                    encoded_translation = encoded_translation[0].cpu().detach().numpy()

                    # Keep the three most probable expressions for this window.
                    top_3_indices = encoded_translation.argsort()[-3:][::-1]
                    top_3_categories = [expression_mapping[i] for i in top_3_indices]
                    top_3_values = encoded_translation[top_3_indices]

                    current_prob = {}
                    for expr, prob in zip(top_3_categories, top_3_values):
                        if expr not in frame_wise_outputs:
                            frame_wise_outputs[expr] = []
                        frame_wise_outputs[expr].append(prob)
                        current_prob[expr] = prob

                    # With equal weights this reduces to the running mean of each
                    # expression's per-window probabilities.
                    for key in frame_wise_outputs:
                        weighted_avg_dict[key] = weighted_average(frame_wise_outputs[key], [1] * len(frame_wise_outputs[key]))

                    canvas = util.drawStickmodel(frame, eval(row['bodypose_circles']), eval(row['bodypose_sticks']), eval(row['handpose_edges']), eval(row['handpose_peaks']))
                    canvas_with_plot = util.draw_bar_plot_below_image(canvas, current_prob, f'Prediction at frame window({idx-20+1}-{idx+1})', canvas)
                    canvas_with_plot = util.draw_bar_plot_below_image(canvas_with_plot, weighted_avg_dict, f'Weighted avg till window {idx+1}', canvas)
                    canvas_with_plot = util.add_padding_to_bottom(canvas_with_plot, (255, 255, 255), 100)
                    writer(canvas_with_plot)

                    # Report the expression with the highest running average so far.
                    max_prob = float('-inf')
                    max_key = None
                    for exp, prob in weighted_avg_dict.items():
                        if prob > max_prob:
                            max_prob = prob
                            max_key = exp

                    with runtime_progress.container():
                        df1 = pd.DataFrame([[f'{idx+1}/{current_test_df.shape[0]}', f'{max_key} ({max_prob*100:.2f}%)']], columns=['Frames Processed', 'Detected Class'])
                        my_table = st.table(df1)

                    with view.container():
                        st.image(canvas_with_plot, channels='BGR', use_column_width=True)

                idx = idx + 1

        if writer is not None:
            writer.close()

        with view.container():
            with open(output_file, 'rb') as output_video:
                out_bytes = output_video.read()
            st.video(out_bytes)

        print(f'Output file - {output_file}')
        cv2.destroyAllWindows()
        vid.release()