import gradio as gr
import os
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import cv2
# Define constants
IMG_SIZE = 224
MAX_SEQ_LENGTH = 30
NUM_FEATURES = 2048
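
# IMG_SIZE is the square frame size fed to the extractor; NUM_FEATURES (2048)
# matches the length of InceptionV3's average-pooled output; MAX_SEQ_LENGTH
# caps how many frames the sequence model sees per video.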

# Load the trained model
model_filepath = "lstm_model.h5"  # Replace with the actual path
loaded_model = keras.models.load_model(model_filepath)

train_df = pd.DataFrame({
    "tag": ["BabyCrawling", "CricketShot"],
})
label_processor = keras.layers.StringLookup(
    num_oov_indices=0, vocabulary=np.unique(train_df["tag"])
)
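
# StringLookup fixes the class order (np.unique sorts alphabetically), so
# get_vocabulary() later maps prediction indices back to class names.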

def crop_center_square(frame):
    """Crop the largest centered square from a frame."""
    y, x = frame.shape[0:2]
    min_dim = min(y, x)
    start_x = (x // 2) - (min_dim // 2)
    start_y = (y // 2) - (min_dim // 2)
    return frame[start_y : start_y + min_dim, start_x : start_x + min_dim]

def load_video(path, max_frames=0, resize=(IMG_SIZE, IMG_SIZE)):
    """Read a video, center-crop each frame to a square, resize, and convert BGR -> RGB.

    max_frames=0 reads every frame in the clip.
    """
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = crop_center_square(frame)
            frame = cv2.resize(frame, resize)
            frame = frame[:, :, [2, 1, 0]]  # OpenCV decodes BGR; reorder to RGB
            frames.append(frame)
            if len(frames) == max_frames:
                break
    finally:
        cap.release()
    return np.array(frames)
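
# Example (hypothetical clip name): load_video("some_clip.mp4") returns an
# array of shape (num_frames, IMG_SIZE, IMG_SIZE, 3) holding RGB uint8 frames.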

# Build the frame-level feature extractor
def build_feature_extractor():
    """InceptionV3 (ImageNet weights, average-pooled) with its preprocessing baked in."""
    feature_extractor = keras.applications.InceptionV3(
        weights="imagenet",
        include_top=False,
        pooling="avg",
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
    )
    preprocess_input = keras.applications.inception_v3.preprocess_input

    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
    preprocessed = preprocess_input(inputs)
    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="feature_extractor")
feature_extractor = build_feature_extractor()
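
# Sanity check: one frame in, one (1, NUM_FEATURES) vector out, e.g.
#   feature_extractor.predict(np.zeros((1, IMG_SIZE, IMG_SIZE, 3))).shape == (1, 2048)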

def prepare_single_video(frames):
    """Run every frame through the feature extractor and build the padding mask."""
    frames = frames[None, ...]  # add a batch axis: (1, num_frames, H, W, 3)
    frame_mask = np.zeros(shape=(1, MAX_SEQ_LENGTH), dtype="bool")
    frame_features = np.zeros(shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")
    for i, batch in enumerate(frames):
        video_length = batch.shape[0]
        length = min(MAX_SEQ_LENGTH, video_length)
        for j in range(length):
            # verbose=0 silences the per-frame progress bar
            frame_features[i, j, :] = feature_extractor.predict(batch[None, j, :], verbose=0)
        frame_mask[i, :length] = 1  # 1 = not masked (real frame), 0 = masked (padding)
    return frame_features, frame_mask
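
# Clips shorter than MAX_SEQ_LENGTH leave zero rows in frame_features; the
# mask marks those timesteps as padding so the sequence model ignores them.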

def sequence_prediction(video_file):
    """Classify an uploaded video and return the predicted class name."""
    class_vocab = label_processor.get_vocabulary()
    # Load the video frames
    frames = load_video(video_file)
    # Extract per-frame features and the padding mask
    frame_features, frame_mask = prepare_single_video(frames)
    # Make predictions using the loaded model
    probabilities = loaded_model.predict([frame_features, frame_mask])[0]
    # Get the predicted label
    predicted_label = class_vocab[np.argmax(probabilities)]
    return predicted_label
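
# Quick sanity check with one of the bundled example clips:
#   sequence_prediction("video-1.mp4")  # -> "BabyCrawling" or "CricketShot"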

example_list = [
    ["video-1.mp4"],
    ["video-2.mp4"],
]

# Gradio interface
iface = gr.Interface(
    fn=sequence_prediction,
    inputs=gr.Video(label="Upload a video file"),
    outputs="text",
    examples=example_list,
)

# Launch the Gradio app
iface.launch()