import cv2
import imageio
import numpy as np
from huggingface_hub import from_pretrained_keras
from tensorflow import keras
from keras import layers

from .constants import MAX_SEQ_LENGTH, NUM_FEATURES, IMG_SIZE, CLASS_VOCAB

# Alternative: load a checkpoint whose custom layers must be passed in explicitly.
# from .custom_layers import TransformerEncoder, PositionalEmbedding
# model = from_pretrained_keras(
#     "shivi/video-classification",
#     custom_objects={
#         "PositionalEmbedding": PositionalEmbedding,
#         "TransformerEncoder": TransformerEncoder,
#     },
# )

model = from_pretrained_keras("keras-io/video-transformers")
"""
Below code is taken from the Video-Transformers example on keras-io by Sayak Paul
"""
def build_feature_extractor():
    # DenseNet121 pretrained on ImageNet, with global average pooling so that
    # each frame is reduced to a single feature vector.
    feature_extractor = keras.applications.DenseNet121(
        weights="imagenet",
        include_top=False,
        pooling="avg",
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
    )
    preprocess_input = keras.applications.densenet.preprocess_input

    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
    preprocessed = preprocess_input(inputs)
    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="feature_extractor")


feature_extractor = build_feature_extractor()
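# Illustrative sanity check (not executed on import; assumes NUM_FEATURES
# matches DenseNet121's 1024-dim pooled output):
#
#   dummy = np.zeros((1, IMG_SIZE, IMG_SIZE, 3), dtype="float32")
#   feats = feature_extractor(dummy)  # shape: (1, NUM_FEATURES)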
# Build the crop layer once at module level rather than on every call.
center_crop_layer = layers.CenterCrop(IMG_SIZE, IMG_SIZE)


def crop_center(frame):
    cropped = center_crop_layer(frame[None, ...])
    cropped = cropped.numpy().squeeze()
    return cropped
def load_video(path, max_frames=0):
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = crop_center(frame)
            frame = frame[:, :, [2, 1, 0]]  # BGR (OpenCV) -> RGB
            frames.append(frame)

            if len(frames) == max_frames:
                break
    finally:
        cap.release()
    return np.array(frames)
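# Example usage (illustrative; "sample.mp4" is a hypothetical local clip):
#
#   frames = load_video("sample.mp4")
#   # frames.shape == (num_frames, IMG_SIZE, IMG_SIZE, 3), RGB frames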
def prepare_single_video(frames):
    frame_features = np.zeros(
        shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
    )

    # Pad shorter videos with all-zero frames.
    if len(frames) < MAX_SEQ_LENGTH:
        diff = MAX_SEQ_LENGTH - len(frames)
        padding = np.zeros((diff, IMG_SIZE, IMG_SIZE, 3))
        frames = np.concatenate((frames, padding))

    frames = frames[None, ...]

    # Extract features from the frames of the current video.
    for i, batch in enumerate(frames):
        video_length = batch.shape[0]
        length = min(MAX_SEQ_LENGTH, video_length)
        for j in range(length):
            if np.mean(batch[j, :]) > 0.0:
                frame_features[i, j, :] = feature_extractor.predict(batch[None, j, :])
            else:
                # Zero-padded frames get zero features.
                frame_features[i, j, :] = 0.0

    return frame_features
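# Illustrative check: the classifier consumes a fixed-length feature sequence,
# regardless of the clip's original frame count.
#
#   features = prepare_single_video(load_video("sample.mp4"))  # hypothetical clip
#   # features.shape == (1, MAX_SEQ_LENGTH, NUM_FEATURES)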
def predict_action(path):
    frames = load_video(path)
    frame_features = prepare_single_video(frames)
    probabilities = model.predict(frame_features)[0]

    # Map class probabilities to label names, highest first.
    confidences = {}
    for i in np.argsort(probabilities)[::-1]:
        confidences[CLASS_VOCAB[i]] = float(probabilities[i])

    gif_out = to_gif(frames[:MAX_SEQ_LENGTH])
    print(confidences)
    return confidences, gif_out
def to_gif(images):
    # Write the cropped frames out as a GIF for display alongside the scores.
    converted_images = images.astype(np.uint8)
    imageio.mimsave("animation.gif", converted_images, fps=10)
    return "animation.gif"