#from .custom_layers import TransformerEncoder, PositionalEmbedding
from .constants import MAX_SEQ_LENGTH, NUM_FEATURES, IMG_SIZE, CLASS_VOCAB
from huggingface_hub import from_pretrained_keras
from tensorflow import keras
from keras import layers
import numpy as np
import imageio
import cv2

#model = from_pretrained_keras("shivi/video-classification", custom_objects={"PositionalEmbedding": PositionalEmbedding, "TransformerEncoder": TransformerEncoder})
model = from_pretrained_keras("keras-io/video-transformers")

"""
The code below is taken from the Video-Transformers example on keras.io by Sayak Paul.
"""


def build_feature_extractor():
    # DenseNet121 pretrained on ImageNet, used as a per-frame feature extractor.
    feature_extractor = keras.applications.DenseNet121(
        weights="imagenet",
        include_top=False,
        pooling="avg",
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
    )
    preprocess_input = keras.applications.densenet.preprocess_input

    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
    preprocessed = preprocess_input(inputs)
    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="feature_extractor")


feature_extractor = build_feature_extractor()


def crop_center(frame):
    # Center-crop a single frame to IMG_SIZE x IMG_SIZE.
    center_crop_layer = layers.CenterCrop(IMG_SIZE, IMG_SIZE)
    cropped = center_crop_layer(frame[None, ...])
    cropped = cropped.numpy().squeeze()
    return cropped


def load_video(path, max_frames=0):
    # Read up to `max_frames` frames (0 = read all), center-crop them and convert BGR -> RGB.
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = crop_center(frame)
            frame = frame[:, :, [2, 1, 0]]
            frames.append(frame)

            if len(frames) == max_frames:
                break
    finally:
        cap.release()
    return np.array(frames)


def prepare_single_video(frames):
    frame_features = np.zeros(shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")

    # Pad shorter videos with all-zero frames.
    if len(frames) < MAX_SEQ_LENGTH:
        diff = MAX_SEQ_LENGTH - len(frames)
        padding = np.zeros((diff, IMG_SIZE, IMG_SIZE, 3))
        frames = np.concatenate((frames, padding))

    frames = frames[None, ...]

    # Extract features from the frames of the current video.
    for i, batch in enumerate(frames):
        video_length = batch.shape[0]
        length = min(MAX_SEQ_LENGTH, video_length)
        for j in range(length):
            if np.mean(batch[j, :]) > 0.0:
                frame_features[i, j, :] = feature_extractor.predict(batch[None, j, :])
            else:
                # Padded (all-zero) frames get zero features.
                frame_features[i, j, :] = 0.0

    return frame_features


def to_gif(images):
    # Save the frames as a GIF so the prediction can be shown alongside a preview.
    converted_images = images.astype(np.uint8)
    imageio.mimsave("animation.gif", converted_images, fps=10)
    return "animation.gif"


def predict_action(path):
    frames = load_video(path)
    frame_features = prepare_single_video(frames)
    probabilities = model.predict(frame_features)[0]
    confidences = {}

    # Map class names to probabilities, highest first.
    for i in np.argsort(probabilities)[::-1]:
        confidences[CLASS_VOCAB[i]] = float(probabilities[i])

    gif_out = to_gif(frames[:MAX_SEQ_LENGTH])
    print(confidences)
    return confidences, gif_out
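

# --- Usage sketch (not part of the original module; file path and entry point are assumptions) ---
# predict_action() is typically wired to a UI (e.g. a Gradio app) by a separate script.
# A minimal manual invocation, assuming a local clip "sample_video.avi" exists, might look
# like the block below. Because of the relative import of constants above, run it as a
# module (python -m <package>.<this_module>) rather than as a standalone script.
if __name__ == "__main__":
    confidences, gif_path = predict_action("sample_video.avi")  # hypothetical input path
    top_class = max(confidences, key=confidences.get)
    print(f"Top predicted action: {top_class} ({confidences[top_class]:.2%})")
    print(f"Preview GIF written to: {gif_path}")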