import cv2
import gradio as gr
import numpy as np
import pandas as pd
from tensorflow import keras

# Constants; these must match the values used when the model was trained
IMG_SIZE = 224
MAX_SEQ_LENGTH = 30
NUM_FEATURES = 2048

# Load the trained sequence model
model_filepath = "lstm_model.h5"  # Replace with the actual path
loaded_model = keras.models.load_model(model_filepath)

# Rebuild the label vocabulary. The tags (and their sorted order) must match
# training, since StringLookup indices map directly to the model's output units.
train_df = pd.DataFrame({"tag": ["BabyCrawling", "CricketShot"]})
label_processor = keras.layers.StringLookup(
    num_oov_indices=0, vocabulary=np.unique(train_df["tag"])
)


def crop_center_square(frame):
    """Crop the largest centered square out of a frame."""
    y, x = frame.shape[0:2]
    min_dim = min(y, x)
    start_x = (x // 2) - (min_dim // 2)
    start_y = (y // 2) - (min_dim // 2)
    return frame[start_y : start_y + min_dim, start_x : start_x + min_dim]


def load_video(path, max_frames=0, resize=(IMG_SIZE, IMG_SIZE)):
    """Read a video, center-crop and resize each frame, and convert BGR to RGB.

    With max_frames=0, the whole video is read.
    """
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = crop_center_square(frame)
            frame = cv2.resize(frame, resize)
            frame = frame[:, :, [2, 1, 0]]  # BGR -> RGB
            frames.append(frame)
            if len(frames) == max_frames:
                break
    finally:
        cap.release()
    return np.array(frames)


# Build the frame-level feature extractor: InceptionV3 with global average
# pooling, yielding NUM_FEATURES (2048) features per frame
def build_feature_extractor():
    feature_extractor = keras.applications.InceptionV3(
        weights="imagenet",
        include_top=False,
        pooling="avg",
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
    )
    preprocess_input = keras.applications.inception_v3.preprocess_input

    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
    preprocessed = preprocess_input(inputs)
    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="feature_extractor")


feature_extractor = build_feature_extractor()


# Prepare a single video for prediction: per-frame CNN features plus a
# boolean mask marking which sequence positions hold real frames
def prepare_single_video(frames):
    frames = frames[None, ...]  # add a batch dimension
    frame_mask = np.zeros(shape=(1, MAX_SEQ_LENGTH), dtype="bool")
    frame_features = np.zeros(shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")

    for i, batch in enumerate(frames):
        video_length = batch.shape[0]
        length = min(MAX_SEQ_LENGTH, video_length)
        for j in range(length):
            frame_features[i, j, :] = feature_extractor.predict(batch[None, j, :])
        frame_mask[i, :length] = 1  # 1 = not masked, 0 = masked

    return frame_features, frame_mask


# Run the full pipeline on an uploaded video and return the predicted tag
def sequence_prediction(video_file):
    class_vocab = label_processor.get_vocabulary()

    # Load the video frames, capped at MAX_SEQ_LENGTH (later frames would be
    # discarded by prepare_single_video anyway)
    frames = load_video(video_file, max_frames=MAX_SEQ_LENGTH)

    # Prepare the frames for prediction
    frame_features, frame_mask = prepare_single_video(frames)

    # Make predictions using the loaded model
    probabilities = loaded_model.predict([frame_features, frame_mask])[0]

    # Return the label with the highest probability
    return class_vocab[np.argmax(probabilities)]


example_list = [
    ["video-1.mp4"],
    ["video-2.mp4"],
]

# Gradio interface
iface = gr.Interface(
    fn=sequence_prediction,
    inputs=gr.Video(label="Upload a video file"),
    outputs="text",
    examples=example_list,
)

# Launch the Gradio app
iface.launch()
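
# --- Notes (sketches, not part of the original pipeline) ---
# A minimal smoke test, assuming "video-1.mp4" from the example list exists
# locally; run it before iface.launch() (which blocks) to exercise the
# pipeline end to end without the UI:
#
#     print(sequence_prediction("video-1.mp4"))
#
# Performance sketch: prepare_single_video calls predict() once per frame.
# Keras models accept batched input, so the inner loop could be replaced by
# a single call over all frames of the video:
#
#     length = min(MAX_SEQ_LENGTH, batch.shape[0])
#     frame_features[i, :length, :] = feature_extractor.predict(batch[:length])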