# shin-mashita
# Added documentation
# 2a7c856
import torch
import cv2
import videotransforms
import numpy as np
import gradio as gr
from einops import rearrange
from torchvision import transforms
from pytorch_i3d import InceptionI3d
def preprocess(vidpath):
    """Load a video file and turn it into a normalized input tensor.

    Every decodable frame is read with OpenCV, upscaled when its shorter
    side is below 226 pixels, rescaled from [0, 255] to [-1, 1],
    center-cropped with ``videotransforms.CenterCrop(224)`` and stacked
    into a single-item batch.

    Args:
        vidpath: Path to the video file, passed to ``cv2.VideoCapture``.

    Returns:
        A float32 ``torch.Tensor`` rearranged from (t, rows, cols, c) to
        (1, c, t, rows, cols).

    Raises:
        ValueError: If no frame could be decoded from ``vidpath``.
    """
    # Fetch video
    cap = cv2.VideoCapture(vidpath)
    frames = []
    try:
        cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
        num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # Extract frames from video
        for _ in range(num):
            _, img = cap.read()

            # Skip NoneType frames (the reported frame count can exceed
            # the number of frames that actually decode).
            if img is None:
                continue

            # OpenCV images are indexed (rows, cols, channels).
            rows, cols, _ = img.shape
            short_side = min(rows, cols)

            # Upscale so that the shorter side reaches 226 pixels.
            if short_side < 226:
                d = 226. - short_side
                sc = 1 + d / short_side
                img = cv2.resize(img, dsize=(0, 0), fx=sc, fy=sc)

            # Normalize pixel values from [0, 255] to [-1, 1]
            img = (img / 255.) * 2 - 1
            frames.append(img)
    finally:
        # Always free the decoder handle, even if a frame fails to process.
        cap.release()

    if not frames:
        raise ValueError(f"No frames could be read from video: {vidpath!r}")

    frames = torch.from_numpy(np.asarray(frames, dtype=np.float32))

    # Crop, then move channels first and add a leading batch axis.
    # NOTE(review): CenterCrop is assumed to crop the two spatial axes of a
    # (t, rows, cols, c) tensor -- confirm against videotransforms.
    transform = transforms.Compose([videotransforms.CenterCrop(224)])
    frames = transform(frames)

    # The 'w'/'h' names in the pattern are just axis labels; order is kept.
    frames = rearrange(frames, 't w h c-> 1 c t w h')

    return frames
def classify(video, dataset='WLASL100'):
    """Classify a sign-language video and return its ten best predictions.

    Args:
        video: Path to the video file; forwarded to ``preprocess``.
        dataset: Name of the fine-tuned checkpoint to use, either
            ``'WLASL100'`` or ``'WLASL2000'``.

    Returns:
        A dict mapping label -> softmax probability (as ``float``) for the
        ten highest-scoring classes.

    Raises:
        KeyError: If ``dataset`` is not one of the known checkpoint names.
    """
    to_load = {
        'WLASL100': {'logits': 100, 'path': 'weights/asl100/FINAL_nslt_100_iters=896_top1=65.89_top5=84.11_top10=89.92.pt'},
        'WLASL2000': {'logits': 2000, 'path': 'weights/asl2000/FINAL_nslt_2000_iters=5104_top1=32.48_top5=57.31_top10=66.31.pt'}
    }
    # Look the checkpoint up first so an unknown name fails before the
    # (comparatively expensive) video decoding.
    checkpoint = to_load[dataset]

    # Run on cpu. Spaces environment is limited to CPU for free users.
    cpu = torch.device('cpu')

    # Preprocess video
    clip = preprocess(video)

    # Load model: base weights first, then swap the logits layer to the
    # dataset's class count before loading the fine-tuned weights.
    model = InceptionI3d()
    model.load_state_dict(torch.load('weights/rgb_imagenet.pt', map_location=cpu))
    model.replace_logits(checkpoint['logits'])
    model.load_state_dict(torch.load(checkpoint['path'], map_location=cpu))
    model.cpu()

    # Evaluation mode
    model.eval()

    with torch.no_grad():  # Disable gradient computation
        per_frame_logits = model(clip)  # Inference

    # Drop the batch axis and average over the last axis.
    # NOTE(review): presumably (classes, time) -- confirm against the model.
    predictions = rearrange(per_frame_logits, '1 j k -> j k')
    predictions = torch.mean(predictions, dim=1)

    # Fetch top 10 predictions
    _, index = torch.topk(predictions, 10)
    top_indices = index.cpu().tolist()

    # Load labels: each line is "<index> <label>". Split only once so that a
    # label containing whitespace is kept whole, and skip blank lines.
    idx2label = {}
    with open('wlasl_class_list.txt', encoding='utf-8') as f:
        for line in f:
            fields = line.split(maxsplit=1)
            if len(fields) < 2:
                continue
            idx2label[int(fields[0])] = fields[1].strip()

    # Get probabilities
    probabilities = torch.nn.functional.softmax(predictions, dim=0).cpu().numpy()

    # Return dict {label:pred}
    return {idx2label[i]: float(probabilities[i]) for i in top_indices}
# Gradio App config: static text and example inputs shown in the demo UI.
title = "I3D Sign Language Recognition"

# Short blurb about the demo and the two dataset variants.
description = (
    "Gradio demo of word-level sign language classification using I3D model pretrained on the WLASL video dataset. "
    "WLASL is a large-scale dataset containing more than 2000 words in American Sign Language. "
    "Examples used in the demo are videos from the test subset. "
    "Note that WLASL100 contains 100 words while WLASL2000 contains 2000."
)

# Each example is [video path, dataset choice].
examples = [
    ['videos/no.mp4', 'WLASL100'],
    ['videos/all.mp4', 'WLASL100'],
    ['videos/before.mp4', 'WLASL100'],
    ['videos/blue.mp4', 'WLASL2000'],
    ['videos/white.mp4', 'WLASL2000'],
    ['videos/accident2.mp4', 'WLASL2000'],
]

# Disclaimer text containing an HTML link to the upstream WLASL project.
article = (
    "NOTE: This is not the official demonstration of the I3D sign language classification on the WLASL dataset. "
    "More information about the WLASL dataset and pretrained I3D models can be found <a href=https://github.com/dxli94/WLASL>here</a>."
)
# Gradio App interface: a video upload plus a dataset selector feed
# `classify`; the result is rendered as a label widget.
demo = gr.Interface(
    fn=classify,
    inputs=[
        gr.inputs.Video(label="Video (*.mp4)"),
        gr.inputs.Radio(
            choices=['WLASL100', 'WLASL2000'],
            default='WLASL100',
            label='Trained on:',
        ),
    ],
    outputs=[gr.outputs.Label(num_top_classes=5, label='Top 5 Predictions')],
    allow_flagging="never",
    title=title,
    description=description,
    examples=examples,
    article=article,
)
demo.launch()