import tempfile

import cv2
import gradio as gr
import tensorflow as tf
from moviepy.editor import VideoFileClip
from moviepy.video.io.ImageSequenceClip import ImageSequenceClip

from configuration import Config
from model import load_classifier, load_detector
from inference import format_frame, detect_object, classify_action, draw_boxes

config = Config()

print(f'TensorFlow {tf.__version__}')

# Load the action classifier and freeze it for inference.
print(f'Load classifier from {config.classifier_path}')
classifier = load_classifier(config)
classifier.trainable = False
classifier.summary()

print('Load detector.')
detector = load_detector(config)


def fn(video: str):
    """Process an uploaded video, streaming preview frames while working.

    Gradio passes the uploaded video as a file path. This function is a
    generator: it yields (preview image path, None) while processing, and
    (last frame, output video path) once the annotated clip is written.
    """
    print('Process video.')

    # Reserve a temporary path for the final annotated video.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as f:
        output = f.name

    clip = VideoFileClip(video)
    processed_frames = []
    frames = []            # buffer of formatted frames for the action classifier
    actions = []
    detections = ([], [])  # latest detector output (a pair of empty lists until the first run)

    for i, frame in enumerate(clip.iter_frames()):
        # Sample every Nth frame for the action classifier.
        if i % config.classify_action_frame_step == 0:
            frames.append(format_frame(frame, config))

        # Re-run the object detector every few frames.
        if i % config.detect_object_frame_step == 0:
            print(f'Detect object: Frame {i}')
            detections = detect_object(detector, frame)

        # Once enough frames are buffered, classify the action and reset the buffer.
        if len(frames) == config.classify_action_num_frames:
            print(f'Classify action: Until frame {i}')
            actions = classify_action(classifier, frames, config.id_to_name)
            frames = []

        # Draw the latest detections and action labels onto the frame.
        frame = draw_boxes(frame, detections, actions)
        processed_frames.append(frame)

        # Periodically stream a low-quality JPEG preview to the UI.
        if i % config.yield_frame_steps == 0:
            quality = 9  # low JPEG quality keeps the preview stream light
            image_array = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
            _, image_encoded = cv2.imencode(
                '.jpg', image_array, [int(cv2.IMWRITE_JPEG_QUALITY), quality])
            # Yield inside the `with` block so the temp file still exists
            # when Gradio reads it; flush first so the bytes are on disk.
            with tempfile.NamedTemporaryFile(suffix='.jpeg') as f:
                f.write(image_encoded)
                f.flush()
                yield f.name, None

    # Reassemble the annotated frames into a video, keeping the original audio.
    processed_clip = ImageSequenceClip(processed_frames, clip.fps)
    processed_clip.audio = clip.audio
    processed_clip.write_videofile(output, fps=clip.fps, audio_codec='aac', logger=None)
    yield frame, output


inputs = gr.Video(sources=['upload'], label='Input Video')
outputs = [
    gr.Image(interactive=False, label='Last Frame Processed'),
    gr.Video(interactive=False, label='Aeroplane Position and Action Marked')]

examples = [
    ['examples/ZFLFDfovqls_001310_001320.mp4'],  # cspell: disable-line
    ['examples/Zv7GyH-fpEY_2023.0_2033.0.mp4']]

iface = gr.Interface(
    title='Aeroplane Position and Action Detection',
    description='Detect aeroplane position and action in a video.',
    theme='soft',
    fn=fn,
    inputs=inputs,
    outputs=outputs,
    examples=examples,
    cache_examples=False)

iface.launch()
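
# ---------------------------------------------------------------------------
# For reference, `Config` is expected to expose at least the attributes used
# above. A minimal sketch follows: the field names are taken from this script,
# but the values (and any extra fields in configuration.py) are illustrative
# assumptions only.
#
#     class Config:
#         classifier_path = 'weights/classifier'  # assumed location
#         classify_action_frame_step = 3   # sample every 3rd frame for the classifier
#         classify_action_num_frames = 8   # frames per action-classification window
#         detect_object_frame_step = 5     # run the detector every 5th frame
#         yield_frame_steps = 10           # stream a preview every 10th frame
#         id_to_name = {0: 'Flying'}       # action id -> label (assumed mapping)
# ---------------------------------------------------------------------------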