import cv2
import gradio as gr
import numpy as np
import pandas as pd
from tensorflow import keras

# Constants; these must match the values used when the model was trained
IMG_SIZE = 224
MAX_SEQ_LENGTH = 30
NUM_FEATURES = 2048

# Load the trained sequence model
model_filepath = "lstm_model.h5"  # Replace with the actual path
loaded_model = keras.models.load_model(model_filepath)

# Rebuild the label vocabulary. The tags (and their sorted order) must match
# training, since StringLookup indices map directly to the model's output units.
train_df = pd.DataFrame({"tag": ["BabyCrawling", "CricketShot"]})
label_processor = keras.layers.StringLookup(
    num_oov_indices=0, vocabulary=np.unique(train_df["tag"])
)


def crop_center_square(frame):
    """Crop the largest centered square out of a frame."""
    y, x = frame.shape[0:2]
    min_dim = min(y, x)
    start_x = (x // 2) - (min_dim // 2)
    start_y = (y // 2) - (min_dim // 2)
    return frame[start_y : start_y + min_dim, start_x : start_x + min_dim]


def load_video(path, max_frames=0, resize=(IMG_SIZE, IMG_SIZE)):
    """Read a video, center-crop and resize each frame, and convert BGR to RGB.

    With max_frames=0, the whole video is read.
    """
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = crop_center_square(frame)
            frame = cv2.resize(frame, resize)
            frame = frame[:, :, [2, 1, 0]]  # BGR -> RGB
            frames.append(frame)
            if len(frames) == max_frames:
                break
    finally:
        cap.release()
    return np.array(frames)


# Build the frame-level feature extractor: InceptionV3 with global average
# pooling, yielding NUM_FEATURES (2048) features per frame
def build_feature_extractor():
    feature_extractor = keras.applications.InceptionV3(
        weights="imagenet",
        include_top=False,
        pooling="avg",
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
    )
    preprocess_input = keras.applications.inception_v3.preprocess_input

    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
    preprocessed = preprocess_input(inputs)
    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="feature_extractor")


feature_extractor = build_feature_extractor()


# Prepare a single video for prediction: per-frame CNN features plus a
# boolean mask marking which sequence positions hold real frames
def prepare_single_video(frames):
    frames = frames[None, ...]  # add a batch dimension
    frame_mask = np.zeros(shape=(1, MAX_SEQ_LENGTH), dtype="bool")
    frame_features = np.zeros(shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")

    for i, batch in enumerate(frames):
        video_length = batch.shape[0]
        length = min(MAX_SEQ_LENGTH, video_length)
        for j in range(length):
            frame_features[i, j, :] = feature_extractor.predict(batch[None, j, :])
        frame_mask[i, :length] = 1  # 1 = not masked, 0 = masked

    return frame_features, frame_mask


# Run the full pipeline on an uploaded video and return the predicted tag
def sequence_prediction(video_file):
    class_vocab = label_processor.get_vocabulary()

    # Load the video frames, capped at MAX_SEQ_LENGTH (later frames would be
    # discarded by prepare_single_video anyway)
    frames = load_video(video_file, max_frames=MAX_SEQ_LENGTH)

    # Prepare the frames for prediction
    frame_features, frame_mask = prepare_single_video(frames)

    # Make predictions using the loaded model
    probabilities = loaded_model.predict([frame_features, frame_mask])[0]

    # Return the label with the highest probability
    return class_vocab[np.argmax(probabilities)]


example_list = [
    ["video-1.mp4"],
    ["video-2.mp4"],
]

# Gradio interface
iface = gr.Interface(
    fn=sequence_prediction,
    inputs=gr.Video(label="Upload a video file"),
    outputs="text",
    examples=example_list,
)

# Launch the Gradio app
iface.launch()
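
# --- Notes (sketches, not part of the original pipeline) ---
# A minimal smoke test, assuming "video-1.mp4" from the example list exists
# locally; run it before iface.launch() (which blocks) to exercise the
# pipeline end to end without the UI:
#
#     print(sequence_prediction("video-1.mp4"))
#
# Performance sketch: prepare_single_video calls predict() once per frame.
# Keras models accept batched input, so the inner loop could be replaced by
# a single call over all frames of the video:
#
#     length = min(MAX_SEQ_LENGTH, batch.shape[0])
#     frame_features[i, :length, :] = feature_extractor.predict(batch[:length])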