Spaces:

cedssama
/

I3D_Sign_Language_Classification

Running

App Files Files Community

shin-mashita committed on Mar 27, 2022

Commit

2a7c856

•

1 Parent(s): abe3bfd

Added documentation

Browse files

Files changed (1) hide show

app.py +33 -12

app.py CHANGED Viewed

@@ -9,33 +9,39 @@ from pytorch_i3d import InceptionI3d
 def preprocess(vidpath):
     cap = cv2.VideoCapture(vidpath)
     frames = []
     cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
     num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
     for _ in range(num):
         _, img = cap.read()
         if img is None:
             continue
         w, h, c = img.shape
         if w < 226 or h < 226:
             d = 226. - min(w, h)
             sc = 1 + d / min(w, h)
             img = cv2.resize(img, dsize=(0, 0), fx=sc, fy=sc)
         img = (img / 255.) * 2 - 1
         frames.append(img)
-    # frames = torch.cuda.FloatTensor(np.asarray(frames, dtype=np.float32)) if torch.cuda.is_available() else torch.Tensor(np.asarray(frames, dtype=np.float32))
     frames = torch.Tensor(np.asarray(frames, dtype=np.float32))
     transform = transforms.Compose([videotransforms.CenterCrop(224)])
     frames = transform(frames)
-    frames = rearrange(frames, 't h w c-> 1 c t h w')
     return frames
@@ -45,42 +51,53 @@ def classify(video,dataset='WLASL100'):
         'WLASL2000':{'logits':2000,'path':'weights/asl2000/FINAL_nslt_2000_iters=5104_top1=32.48_top5=57.31_top10=66.31.pt'}
         }
     input = preprocess(video)
     model = InceptionI3d()
     model.load_state_dict(torch.load('weights/rgb_imagenet.pt',map_location=torch.device('cpu')))
     model.replace_logits(to_load[dataset]['logits'])
     model.load_state_dict(torch.load(to_load[dataset]['path'],map_location=torch.device('cpu')))
-    # device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
-    # model.to(device)
     model.cpu()
     model.eval()
-    with torch.no_grad():
-        per_frame_logits = model(input)
     per_frame_logits.cpu()
     model.cpu()
     predictions = rearrange(per_frame_logits,'1 j k -> j k')
     predictions = torch.mean(predictions, dim = 1)
-    top = torch.argmax(predictions).item()
     _, index = torch.topk(predictions,10)
     index = index.cpu().numpy()
     with open('wlasl_class_list.txt') as f:
         idx2label = dict()
         for line in f:
             idx2label[int(line.split()[0])]=line.split()[1]
     predictions = torch.nn.functional.softmax(predictions, dim=0).cpu().numpy()
     return {idx2label[i]:float(predictions[i]) for i in index}
 title = "I3D Sign Language Recognition"
-description = "Description here"
 examples = [
         ['videos/no.mp4','WLASL100'],
         ['videos/all.mp4','WLASL100'],
@@ -90,11 +107,15 @@ examples = [
         ['videos/accident2.mp4','WLASL2000']
     ]
 gr.Interface(   fn=classify,
-                inputs=[gr.inputs.Video(label="VIDEO"),gr.inputs.Dropdown(choices=['WLASL100','WLASL2000'], default='WLASL100', label='DATASET USED')],
                 outputs=[gr.outputs.Label(num_top_classes=5, label='Top 5 Predictions')],
                 allow_flagging="never",
                 title=title,
                 description=description,
-                examples=examples).launch()

 def preprocess(vidpath):
+    # Fetch video
     cap = cv2.VideoCapture(vidpath)
     frames = []
     cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
     num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    # Extract frames from video
     for _ in range(num):
         _, img = cap.read()
+        # Skip NoneType frames
         if img is None:
             continue
+        # Resize if (w,h) < (226,226)
         w, h, c = img.shape
         if w < 226 or h < 226:
             d = 226. - min(w, h)
             sc = 1 + d / min(w, h)
             img = cv2.resize(img, dsize=(0, 0), fx=sc, fy=sc)
+        # Normalize
         img = (img / 255.) * 2 - 1
         frames.append(img)
     frames = torch.Tensor(np.asarray(frames, dtype=np.float32))
+    # Transform tensor and reshape to (1, c, t ,w, h)
     transform = transforms.Compose([videotransforms.CenterCrop(224)])
     frames = transform(frames)
+    frames = rearrange(frames, 't w h c-> 1 c t w h')
     return frames
         'WLASL2000':{'logits':2000,'path':'weights/asl2000/FINAL_nslt_2000_iters=5104_top1=32.48_top5=57.31_top10=66.31.pt'}
         }
+    # Preprocess video
     input = preprocess(video)
+    # Load model
     model = InceptionI3d()
     model.load_state_dict(torch.load('weights/rgb_imagenet.pt',map_location=torch.device('cpu')))
     model.replace_logits(to_load[dataset]['logits'])
     model.load_state_dict(torch.load(to_load[dataset]['path'],map_location=torch.device('cpu')))
+    # Run on cpu. Spaces environment is limited to CPU for free users.
     model.cpu()
+    # Evaluation mode
     model.eval()
+    with torch.no_grad(): # Disable gradient computation
+        per_frame_logits = model(input) # Inference
     per_frame_logits.cpu()
     model.cpu()
+    # Load predictions
     predictions = rearrange(per_frame_logits,'1 j k -> j k')
     predictions = torch.mean(predictions, dim = 1)
+    # Fetch top 10 predictions
     _, index = torch.topk(predictions,10)
     index = index.cpu().numpy()
+    # Load labels
     with open('wlasl_class_list.txt') as f:
         idx2label = dict()
         for line in f:
             idx2label[int(line.split()[0])]=line.split()[1]
+    # Get probabilities
     predictions = torch.nn.functional.softmax(predictions, dim=0).cpu().numpy()
+    # Return dict {label:pred}
     return {idx2label[i]:float(predictions[i]) for i in index}
+# Gradio App config
 title = "I3D Sign Language Recognition"
+description =   "Gradio demo of word-level sign language classification using I3D model pretrained on the WLASL video dataset. " \
+                "WLASL is a large-scale dataset containing more than 2000 words in American Sign Language. " \
+                "Examples used in the demo are videos from the the test subset. "  \
+                "Note that WLASL100 contains 100 words while WLASL2000 contains 2000."
 examples = [
         ['videos/no.mp4','WLASL100'],
         ['videos/all.mp4','WLASL100'],
         ['videos/accident2.mp4','WLASL2000']
     ]
+article =   "NOTE: This is not the official demonstration of the I3D sign language classification on the WLASL dataset. "\
+            "More information about the WLASL dataset and pretrained I3D models can be found <a href=https://github.com/dxli94/WLASL>here</a>."
+# Gradio App interface
 gr.Interface(   fn=classify,
+                inputs=[gr.inputs.Video(label="Video (*.mp4)"),gr.inputs.Radio(choices=['WLASL100','WLASL2000'], default='WLASL100', label='Trained on:')],
                 outputs=[gr.outputs.Label(num_top_classes=5, label='Top 5 Predictions')],
                 allow_flagging="never",
                 title=title,
                 description=description,
+                examples=examples,
+                article=article).launch()