Ahsen Khaliq
committed on
Commit
•
c6db8a6
1
Parent(s):
e3ff57d
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
# Choose the `x3d_s` model
|
3 |
+
import json
|
4 |
+
import urllib
|
5 |
+
from pytorchvideo.data.encoded_video import EncodedVideo
|
6 |
+
|
7 |
+
from torchvision.transforms import Compose, Lambda
|
8 |
+
from torchvision.transforms._transforms_video import (
|
9 |
+
CenterCropVideo,
|
10 |
+
NormalizeVideo,
|
11 |
+
)
|
12 |
+
from pytorchvideo.transforms import (
|
13 |
+
ApplyTransformToKey,
|
14 |
+
ShortSideScale,
|
15 |
+
UniformTemporalSubsample
|
16 |
+
)
|
17 |
+
|
18 |
+
import gradio as gr
|
19 |
+
|
20 |
+
# Download the example clip that the demo UI offers as a ready-made input.
torch.hub.download_url_to_file('https://dl.fbaipublicfiles.com/pytorchvideo/projects/archery.mp4', 'archery.mp4')

# Load the pretrained X3D variant from the PyTorchVideo hub repository.
model_name = 'x3d_s'
model = torch.hub.load('facebookresearch/pytorchvideo', model_name, pretrained=True)

# Use the GPU when one is available; otherwise fall back to the CPU so the
# app still starts on CPU-only hosts instead of failing in `model.to(...)`.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.eval()
model = model.to(device)
|
30 |
+
|
31 |
+
# Download the Kinetics class-name file next to the app.
# `urllib.request` is imported explicitly: a bare `import urllib` does not
# guarantee that the `request` submodule is loaded. The former Python 2
# `urllib.URLopener` attempt (and the bare `except:` that hid its failure)
# is dropped, so a real download error now surfaces directly.
import urllib.request

json_url = "https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json"
json_filename = "kinetics_classnames.json"
urllib.request.urlretrieve(json_url, json_filename)
|
35 |
+
|
36 |
+
# Read the downloaded class-name file.
with open(json_filename, "r") as classnames_file:
    kinetics_classnames = json.load(classnames_file)

# Invert the mapping so a predicted class id can be turned into its label.
# Any double quotes embedded in the label text are stripped.
kinetics_id_to_classname = {
    class_id: str(label).replace('"', "")
    for label, class_id in kinetics_classnames.items()
}
|
43 |
+
|
44 |
+
# Per-channel normalisation constants handed to NormalizeVideo.
mean = [0.45] * 3
std = [0.225] * 3

# Frame rate used to convert a frame count into a clip duration in seconds.
frames_per_second = 30

# Preprocessing parameters for each supported X3D variant, keyed by model name.
model_transform_params = {
    "x3d_xs": dict(side_size=182, crop_size=182, num_frames=4, sampling_rate=12),
    "x3d_s": dict(side_size=182, crop_size=182, num_frames=13, sampling_rate=6),
    "x3d_m": dict(side_size=256, crop_size=256, num_frames=16, sampling_rate=5),
}
|
67 |
+
|
68 |
+
# Pick the preprocessing parameters that belong to the selected model.
transform_params = model_transform_params[model_name]

# Preprocessing applied to the "video" entry of a clip dict. The number of
# sampled frames, the short-side scale and the crop size all come from the
# parameters of the selected X3D variant.
_crop_size = transform_params["crop_size"]
_video_steps = [
    UniformTemporalSubsample(transform_params["num_frames"]),
    Lambda(lambda x: x/255.0),  # divide the raw values by 255 before normalising
    NormalizeVideo(mean, std),
    ShortSideScale(size=transform_params["side_size"]),
    CenterCropVideo(crop_size=(_crop_size, _crop_size)),
]
transform = ApplyTransformToKey(key="video", transform=Compose(_video_steps))

# Clip length in seconds, which also depends on the model:
# frames sampled * sampling rate, divided by the assumed frame rate.
clip_duration = transform_params["num_frames"] * transform_params["sampling_rate"] / frames_per_second
|
89 |
+
|
90 |
+
def x3dpred(video):
    """Return the top-5 predicted class labels for a video file.

    Args:
        video: Path of the video to classify, passed to
            ``EncodedVideo.from_path``.

    Returns:
        A single string holding the five highest-scoring class names,
        joined with ", ".
    """
    # Only the first `clip_duration` seconds are classified. Change
    # start_sec if the action of interest occurs later in the video.
    start_sec = 0
    end_sec = start_sec + clip_duration

    # Open the video and load the selected clip. A separate name is used so
    # the `video` argument is not rebound to the decoder object.
    encoded_video = EncodedVideo.from_path(video)
    video_data = encoded_video.get_clip(start_sec=start_sec, end_sec=end_sec)

    # Apply the module-level preprocessing pipeline to the clip.
    video_data = transform(video_data)

    # Move the preprocessed frames to the same device as the model.
    inputs = video_data["video"].to(device)

    # Inference only: disable autograd so no graph is built for the forward
    # pass. `inputs[None, ...]` adds a leading dimension before the call.
    with torch.no_grad():
        preds = model(inputs[None, ...])
        post_act = torch.nn.Softmax(dim=1)
        preds = post_act(preds)

    # Indices of the five highest scores for the first (only) batch item.
    pred_classes = preds.topk(k=5).indices[0]

    # Map the predicted class ids to their label names.
    pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]
    return ", ".join(pred_class_names)
|
122 |
+
|
123 |
+
# UI components: one video input and one text box for the predicted labels.
inputs = gr.inputs.Video(label="Input Video")
outputs = gr.outputs.Textbox(label="Top 5 predicted labels")

# Text shown around the demo.
title = "X3D"
description = "Gradio demo for X3D networks pretrained on the Kinetics 400 dataset. To use it, simply upload your video, or click one of the examples to load them. Read more at the links below."
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2004.04730'>X3D: Expanding Architectures for Efficient Video Recognition</a> | <a href='https://github.com/facebookresearch/pytorchvideo'>Github Repo</a></p>"

# Example inputs offered in the UI; each row is one set of input values.
examples = [['archery.mp4']]

# Build the interface around `x3dpred` and start serving it.
interface_options = dict(
    title=title,
    description=description,
    article=article,
    examples=examples,
    analytics_enabled=False,
)
gr.Interface(x3dpred, inputs, outputs, **interface_options).launch(debug=True)
|