shivi committed on
Commit 3d12539 • 1 Parent(s): 77d621b

Adding all app files with examples

app.py ADDED
@@ -0,0 +1,52 @@
+ import gradio as gr
+ from utils.predict import predict_action
+ import os
+ import glob
+
+ ## Create Dataset for loading examples
+ example_list = glob.glob("examples/*")
+ example_list = list(map(lambda el: [el], example_list))
+
+
+ def load_example(video):
+     return video[0]
+
+ demo = gr.Blocks()
+
+
+
+ with demo:
+
+     gr.Markdown("# **<p align='center'>Video Classification with Transformers</p>**")
+     gr.Markdown("This space demonstrates the use of hybrid Transformer-based models for video classification that operate on CNN feature maps.")
+
+     with gr.Tabs():
+
+         with gr.TabItem("Upload & Predict"):
+             with gr.Box():
+
+                 with gr.Row():
+                     input_video = gr.Video(label="Input Video", show_label=True)
+                     output_label = gr.Label(label="Model Output", show_label=True)
+                     output_gif = gr.Image(label="Video Gif", show_label=True)
+
+                 gr.Markdown("**Predict**")
+
+                 with gr.Box():
+                     with gr.Row():
+                         submit_button = gr.Button("Submit")
+
+             gr.Markdown("**Examples:**")
+             gr.Markdown("The model is trained to classify videos belonging to the following classes:")
+             gr.Markdown("CricketShot, PlayingCello, Punch, ShavingBeard, TennisSwing")
+
+             with gr.Column():
+                 # gr.Examples("examples", [input_video], [output_label, output_gif], predict_action, cache_examples=True)
+                 examples = gr.components.Dataset(components=[input_video], samples=example_list, type='values')
+                 examples.click(load_example, examples, input_video)
+
+     submit_button.click(predict_action, inputs=input_video, outputs=[output_label, output_gif])
+
+     gr.Markdown('\n Author: <a href=\"https://www.linkedin.com/in/shivalika-singh/\">Shivalika Singh</a> <br> Based on this <a href=\"https://keras.io/examples/vision/video_transformers/\">Keras example</a> by <a href=\"https://twitter.com/RisingSayak\">Sayak Paul</a> <br> Demo Powered by this <a href=\"https://huggingface.co/shivi/video-transformers/\"> Video Classification</a> model')
+
+ demo.launch()
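Note: the commented-out gr.Examples call above is the higher-level equivalent of the manual Dataset + click wiring. A minimal sketch of that variant, assuming the same names defined in app.py (illustrative only, not code shipped in this commit):

with gr.Column():
    gr.Examples(
        examples=example_list,               # [[path], ...] built from examples/*
        inputs=[input_video],
        outputs=[output_label, output_gif],
        fn=predict_action,
        cache_examples=False,                # True would precompute predictions when the Space builds
    )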
examples/v_CricketShot_g01_c01.mp4 ADDED
Binary file (138 kB). View file
examples/v_PlayingCello_g11_c03.mp4 ADDED
Binary file (401 kB). View file
examples/v_Punch_g09_c07.mp4 ADDED
Binary file (705 kB). View file
examples/v_ShavingBeard_g09_c03.mp4 ADDED
Binary file (206 kB). View file
examples/v_TennisSwing_g10_c04.mp4 ADDED
Binary file (75.6 kB). View file
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ tensorflow
+ gradio
+ opencv-python
+ imageio
utils/constants.py ADDED
@@ -0,0 +1,4 @@
+ MAX_SEQ_LENGTH = 20
+ NUM_FEATURES = 1024
+ IMG_SIZE = 128
+ CLASS_VOCAB = ['CricketShot', 'PlayingCello', 'Punch', 'ShavingBeard', 'TennisSwing']
utils/custom_layers.py ADDED
@@ -0,0 +1,67 @@
+ import tensorflow as tf
2
+ from tensorflow import keras
3
+ from keras import layers
4
+
5
+
6
+ class PositionalEmbedding(layers.Layer):
7
+ def __init__(self, sequence_length, output_dim, **kwargs):
8
+ super().__init__(**kwargs)
9
+ self.position_embeddings = layers.Embedding(
10
+ input_dim=sequence_length, output_dim=output_dim
11
+ )
12
+ self.sequence_length = sequence_length
13
+ self.output_dim = output_dim
14
+
15
+ def call(self, inputs):
16
+ # The inputs are of shape: `(batch_size, frames, num_features)`
17
+ length = tf.shape(inputs)[1]
18
+ positions = tf.range(start=0, limit=length, delta=1)
19
+ embedded_positions = self.position_embeddings(positions)
20
+ return inputs + embedded_positions
21
+
22
+ def compute_mask(self, inputs, mask=None):
23
+ mask = tf.reduce_any(tf.cast(inputs, "bool"), axis=-1)
24
+ return mask
25
+
26
+ def get_config(self):
27
+ config = super().get_config()
28
+ config.update({
29
+ "sequence_length": self.sequence_length,
30
+ "output_dim": self.output_dim,
31
+ })
32
+ return config
33
+
34
+
35
+ class TransformerEncoder(layers.Layer):
36
+ def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
37
+ super().__init__(**kwargs)
38
+ self.embed_dim = embed_dim
39
+ self.dense_dim = dense_dim
40
+ self.num_heads = num_heads
41
+ self.attention = layers.MultiHeadAttention(
42
+ num_heads=num_heads, key_dim=embed_dim, dropout=0.3
43
+ )
44
+ self.dense_proj = keras.Sequential(
45
+ [layers.Dense(dense_dim, activation=tf.nn.gelu), layers.Dense(embed_dim),]
46
+ )
47
+ self.layernorm_1 = layers.LayerNormalization()
48
+ self.layernorm_2 = layers.LayerNormalization()
49
+
50
+ def call(self, inputs, mask=None):
51
+ if mask is not None:
52
+ mask = mask[:, tf.newaxis, :]
53
+
54
+ attention_output = self.attention(inputs, inputs, attention_mask=mask)
55
+ proj_input = self.layernorm_1(inputs + attention_output)
56
+ proj_output = self.dense_proj(proj_input)
57
+ return self.layernorm_2(proj_input + proj_output)
58
+
59
+
60
+ def get_config(self):
61
+ config = super().get_config()
62
+ config.update({
63
+ "embed_dim": self.embed_dim,
64
+ "dense_dim": self.dense_dim,
65
+ "num_heads": self.num_heads,
66
+ })
67
+ return config
utils/predict.py ADDED
@@ -0,0 +1,102 @@
+ from .custom_layers import TransformerEncoder, PositionalEmbedding
+ from .constants import MAX_SEQ_LENGTH, NUM_FEATURES, IMG_SIZE, CLASS_VOCAB
+ from huggingface_hub import from_pretrained_keras
+ from tensorflow import keras
+ from keras import layers
+ import numpy as np
+ import imageio
+ import cv2
+
+ model = from_pretrained_keras("shivi/video-classification", custom_objects={"PositionalEmbedding": PositionalEmbedding, "TransformerEncoder": TransformerEncoder})
+
+ # model = from_pretrained_keras("shivi/video-transformers")
+
+ def build_feature_extractor():
+     feature_extractor = keras.applications.DenseNet121(
+         weights="imagenet",
+         include_top=False,
+         pooling="avg",
+         input_shape=(IMG_SIZE, IMG_SIZE, 3),
+     )
+     preprocess_input = keras.applications.densenet.preprocess_input
+
+     inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
+     preprocessed = preprocess_input(inputs)
+
+     outputs = feature_extractor(preprocessed)
+     return keras.Model(inputs, outputs, name="feature_extractor")
+
+
+ feature_extractor = build_feature_extractor()
+
+
+
+ def crop_center(frame):
+     center_crop_layer = layers.CenterCrop(IMG_SIZE, IMG_SIZE)
+     cropped = center_crop_layer(frame[None, ...])
+     cropped = cropped.numpy().squeeze()
+     return cropped
+
+ def load_video(path, max_frames=0):
+     cap = cv2.VideoCapture(path)
+     frames = []
+     try:
+         while True:
+             ret, frame = cap.read()
+             if not ret:
+                 break
+             frame = crop_center(frame)
+             frame = frame[:, :, [2, 1, 0]]  # BGR (OpenCV) -> RGB
+             frames.append(frame)
+
+             if len(frames) == max_frames:
+                 break
+     finally:
+         cap.release()
+     return np.array(frames)
+
+ def prepare_single_video(frames):
+     frame_features = np.zeros(shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")
+
+     # Pad shorter videos.
+     if len(frames) < MAX_SEQ_LENGTH:
+         diff = MAX_SEQ_LENGTH - len(frames)
+         padding = np.zeros((diff, IMG_SIZE, IMG_SIZE, 3))
+         frames = np.concatenate((frames, padding))
+
+     frames = frames[None, ...]
+
+     # Extract features from the frames of the current video.
+     for i, batch in enumerate(frames):
+         video_length = batch.shape[0]
+         length = min(MAX_SEQ_LENGTH, video_length)
+         for j in range(length):
+             if np.mean(batch[j, :]) > 0.0:
+                 frame_features[i, j, :] = feature_extractor.predict(batch[None, j, :])
+             else:
+                 frame_features[i, j, :] = 0.0
+
+     return frame_features
+
+
+ def predict_action(path):
+     frames = load_video(path)
+     frame_features = prepare_single_video(frames)
+     probabilities = model.predict(frame_features)[0]
+     confidences = {}
+
+     for i in np.argsort(probabilities)[::-1]:
+         confidences[CLASS_VOCAB[i]] = float(probabilities[i])
+
+     gif_out = to_gif(frames[:MAX_SEQ_LENGTH])
+     # gif_out = gen_moviepy_gif(path, start_seconds, end_seconds)
+
+     print(confidences)
+     return confidences, gif_out
+
+
+ def to_gif(images):
+     converted_images = images.astype(np.uint8)
+     imageio.mimsave("animation.gif", converted_images, fps=10)
+     return "animation.gif"
+
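For a quick local sanity check (hypothetical usage, assuming the repository is cloned, the requirements are installed, and there is network access to fetch the model and DenseNet weights), predict_action can be called directly on one of the bundled example clips:

from utils.predict import predict_action

# Hypothetical smoke test; not part of this commit.
confidences, gif_path = predict_action("examples/v_TennisSwing_g10_c04.mp4")
print(max(confidences, key=confidences.get))  # top-scoring class, expected to be "TennisSwing"
print(gif_path)                               # "animation.gif", written to the working directory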