Ahsen Khaliq committed
Commit c6db8a6 • 1 Parent(s): e3ff57d

Create app.py

Files changed (1): app.py (+134, -0)
app.py ADDED
@@ -0,0 +1,134 @@
import torch
import json
import urllib.request
from pytorchvideo.data.encoded_video import EncodedVideo

from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample
)

import gradio as gr

# Download a sample video to use as an example
torch.hub.download_url_to_file('https://dl.fbaipublicfiles.com/pytorchvideo/projects/archery.mp4', 'archery.mp4')

# Choose the `x3d_s` model
model_name = 'x3d_s'
model = torch.hub.load('facebookresearch/pytorchvideo', model_name, pretrained=True)

# Set to GPU if available, otherwise CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.eval()
model = model.to(device)

# Download the id-to-label mapping for the Kinetics 400 dataset
json_url = "https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json"
json_filename = "kinetics_classnames.json"
urllib.request.urlretrieve(json_url, json_filename)

with open(json_filename, "r") as f:
    kinetics_classnames = json.load(f)

# Create an id to label name mapping
kinetics_id_to_classname = {}
for k, v in kinetics_classnames.items():
    kinetics_id_to_classname[v] = str(k).replace('"', "")

mean = [0.45, 0.45, 0.45]
std = [0.225, 0.225, 0.225]
frames_per_second = 30
model_transform_params = {
    "x3d_xs": {
        "side_size": 182,
        "crop_size": 182,
        "num_frames": 4,
        "sampling_rate": 12,
    },
    "x3d_s": {
        "side_size": 182,
        "crop_size": 182,
        "num_frames": 13,
        "sampling_rate": 6,
    },
    "x3d_m": {
        "side_size": 256,
        "crop_size": 256,
        "num_frames": 16,
        "sampling_rate": 5,
    }
}

# Get transform parameters based on model
transform_params = model_transform_params[model_name]

# Note that this transform is specific to the X3D models parameterized above.
transform = ApplyTransformToKey(
    key="video",
    transform=Compose(
        [
            UniformTemporalSubsample(transform_params["num_frames"]),
            Lambda(lambda x: x / 255.0),
            NormalizeVideo(mean, std),
            ShortSideScale(size=transform_params["side_size"]),
            CenterCropVideo(
                crop_size=(transform_params["crop_size"], transform_params["crop_size"])
            )
        ]
    ),
)

# The duration of the input clip is also specific to the model.
clip_duration = (transform_params["num_frames"] * transform_params["sampling_rate"]) / frames_per_second

def x3dpred(video):
    # Select the duration of the clip to load by specifying the start and end duration.
    # The start_sec should correspond to where the action occurs in the video.
    start_sec = 0
    end_sec = start_sec + clip_duration

    # Initialize an EncodedVideo helper class and load the video
    video = EncodedVideo.from_path(video)

    # Load the desired clip
    video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)

    # Apply a transform to normalize the video input
    video_data = transform(video_data)

    # Move the inputs to the desired device
    inputs = video_data["video"]
    inputs = inputs.to(device)

    # Pass the input clip through the model
    preds = model(inputs[None, ...])

    # Get the predicted classes
    post_act = torch.nn.Softmax(dim=1)
    preds = post_act(preds)
    pred_classes = preds.topk(k=5).indices[0]

    # Map the predicted classes to the label names
    pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]
    return ", ".join(pred_class_names)
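
# For a quick check outside the web UI, x3dpred can also be called directly
# with a local file path, e.g. x3dpred("archery.mp4"); this mirrors what the
# Gradio Video input does, since it passes the uploaded file's path.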
inputs = gr.inputs.Video(label="Input Video")
outputs = gr.outputs.Textbox(label="Top 5 predicted labels")

title = "X3D"
description = "Gradio demo for X3D networks pretrained on the Kinetics 400 dataset. To use it, simply upload your video, or click one of the examples to load them. Read more at the links below."
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2004.04730'>X3D: Expanding Architectures for Efficient Video Recognition</a> | <a href='https://github.com/facebookresearch/pytorchvideo'>Github Repo</a></p>"

examples = [
    ['archery.mp4']
]

gr.Interface(x3dpred, inputs, outputs, title=title, description=description, article=article, examples=examples, analytics_enabled=False).launch(debug=True)
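
As a sanity check, the clip duration implied by each variant's transform parameters follows from the same formula the app uses; a minimal sketch, assuming it runs in the same scope as the code above:

    # num_frames * sampling_rate / frames_per_second, at 30 fps
    for name, params in model_transform_params.items():
        print(name, params["num_frames"] * params["sampling_rate"] / frames_per_second)
    # x3d_xs: 4 * 12 / 30 = 1.6 s
    # x3d_s:  13 * 6 / 30 = 2.6 s
    # x3d_m:  16 * 5 / 30 ~ 2.67 s

With x3d_s and start_sec = 0, the demo therefore classifies only the first 2.6 seconds of the uploaded video.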