update
- .gitattributes +1 -0
- .gitignore +4 -1
- app.py +35 -3
- core/data.py +77 -0
- core/inference.py +91 -0
- core/model.py +44 -0
- movinet/data.py +0 -79
- playgrounds/load_video.py +0 -112
- playgrounds/movinet.py +0 -80
- playgrounds/verify_metal.py +0 -14
- playgrounds/yolo.py +0 -40
- requirements.txt +5 -1
- weights/classifier-7.keras +3 -0
- weights/yolov8n.pt +3 -0
.gitattributes
CHANGED
@@ -7,6 +7,7 @@
 *.gz filter=lfs diff=lfs merge=lfs -text
 *.h5 filter=lfs diff=lfs merge=lfs -text
 *.joblib filter=lfs diff=lfs merge=lfs -text
+*.keras filter=lfs diff=lfs merge=lfs -text
 *.lfs.* filter=lfs diff=lfs merge=lfs -text
 *.mlmodel filter=lfs diff=lfs merge=lfs -text
 *.model filter=lfs diff=lfs merge=lfs -text
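The added *.keras rule is the line Git LFS writes when a new pattern is tracked. Assuming Git LFS is installed, the same entry can be reproduced locally with:

git lfs track "*.keras"
git add .gitattributes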
.gitignore
CHANGED
@@ -4,7 +4,10 @@
 
 # data
 assets
-
+output
 
 # python
 __pycache__
+
+# gradio
+flagged
app.py
CHANGED
@@ -1,7 +1,39 @@
 import gradio as gr
+import tensorflow as tf
+from moviepy.editor import VideoFileClip
+from ultralytics import YOLO
 
+from core.data import ClassMapping
+from core.model import load_classifier
+from core.inference import FrameProcessor
 
+print("Tensorflow version " + tf.__version__)
+
+print('Load classifier.')
+classifier_path = 'weights/classifier-7.keras'
+classifier = load_classifier(classifier_path)
+
+print('Load detector.')
+detector_path = 'weights/yolov8n.pt'
+detector = YOLO(detector_path)
+
+def fn(video: gr.Video):
+    print('Process video.')
+    output = f'Marked-{str(video)}'
+    clip = VideoFileClip(video)
+    data_dir = 'storage/dataset'
+    id_to_name, _ = ClassMapping(data_dir)
+    process_frame = FrameProcessor(detector, classifier, id_to_name)
+    clip = clip.fl_image(process_frame)
+    clip.write_videofile(output, fps=clip.fps, audio_codec='aac', logger=None)
+    return output
+
+inputs = gr.Video(sources=['upload'], label='Input Video')
+outputs = gr.Video(interactive=False, label='Aeroplane Position and Action Marked')
+
+iface = gr.Interface(
+    fn=fn,
+    inputs=inputs,
+    outputs=outputs,
+)
 iface.launch()
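app.py leans on one moviepy contract: fl_image maps a function over every frame of the clip, handing it an RGB numpy array and expecting an array of the same shape back. A stripped-down sketch of that contract with a pass-through function, assuming hypothetical in.mp4/out.mp4 paths:

from moviepy.editor import VideoFileClip

def passthrough(frame):
    # frame: RGB numpy array of shape (height, width, 3)
    return frame

clip = VideoFileClip('in.mp4')
clip = clip.fl_image(passthrough)
clip.write_videofile('out.mp4', fps=clip.fps, audio_codec='aac', logger=None)

FrameProcessor plugs into this slot, drawing detections and action captions onto each frame before it is re-encoded.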
core/data.py
ADDED
@@ -0,0 +1,77 @@
+from pathlib import Path
+import random
+from typing import Literal
+import cv2
+import numpy as np
+import tensorflow as tf
+
+training_ratio = 0.7
+validation_ratio = 0.02
+num_frames = 8
+frame_step = 15
+frame_size = (224, 224)
+
+def format_frame(frame):
+    frame = tf.image.convert_image_dtype(frame, tf.float32)
+    frame = tf.image.resize_with_pad(frame, *frame_size)
+    return frame
+
+def pick_frames(video: str):
+    capture = cv2.VideoCapture(video)
+    if not capture.isOpened(): raise ValueError('Video file could not be opened.')
+    total_frames = capture.get(cv2.CAP_PROP_FRAME_COUNT)
+    need_frames = 1 + (num_frames - 1) * frame_step
+    if need_frames <= total_frames:
+        start = random.randint(0, total_frames - need_frames + 1)
+        capture.set(cv2.CAP_PROP_POS_FRAMES, start)
+    frames = []
+    for _ in range(num_frames):
+        for _ in range(frame_step):
+            ok, frame = capture.read()
+        if ok: frames.append(format_frame(frame))
+        else: frames.append(np.zeros(frame_size + (3,)))
+    capture.release()
+    frames = np.array(frames)
+    frames = frames[..., [2, 1, 0]]
+    return frames
+
+def Data(data_dir: str):
+    data_dir = Path(data_dir)
+    return {
+        'training': {
+            a.name: (lambda ps: ps[:int(len(ps) * training_ratio)])([x for x in a.iterdir()])
+            for a in data_dir.iterdir()
+        },
+        'validation': {
+            a.name: (lambda ps: ps[
+                int(len(ps) * training_ratio) :
+                int(len(ps) * (training_ratio + validation_ratio))
+            ])([x for x in a.iterdir()])
+            for a in data_dir.iterdir()
+        },
+    }
+
+def ClassMapping(data_dir: str):
+    data_dir = Path(data_dir)
+    id_to_name = sorted([x.name for x in data_dir.iterdir()])
+    name_to_id = {
+        name: i
+        for i, name in enumerate(id_to_name)
+    }
+    return (id_to_name, name_to_id)
+
+def FrameGenerator(data_dir: str, split: Literal['training', 'validation']):
+    _, name_to_id = ClassMapping(data_dir)
+    data = Data(data_dir)
+    def generator():
+        pairs = [
+            (video, class_name)
+            for class_name, videos in data[split].items()
+            for video in videos
+        ]
+        random.shuffle(pairs)
+        for video, class_name in pairs:
+            frames = pick_frames(video)
+            label = name_to_id[class_name]
+            yield frames, label
+    return generator
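How FrameGenerator is meant to be consumed is only implied by this commit; the deleted playgrounds/load_video.py wired the same generator shape into tf.data. A minimal sketch along those lines, assuming the storage/dataset layout referenced in app.py (one subdirectory per class, each holding video files); the batch size is illustrative:

import tensorflow as tf

from core.data import FrameGenerator

# Shapes mirror pick_frames: (num_frames, height, width, 3) float frames
# and a scalar integer label per clip.
output_signature = (
    tf.TensorSpec(shape=(None, None, None, 3), dtype=tf.float32),
    tf.TensorSpec(shape=(), dtype=tf.int16),
)
train_ds = tf.data.Dataset.from_generator(
    FrameGenerator('storage/dataset', 'training'),
    output_signature=output_signature,
)
train_ds = train_ds.batch(8)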
core/inference.py
ADDED
@@ -0,0 +1,91 @@
+from imgviz import instances2rgb
+import tensorflow as tf
+import numpy as np
+
+from core.data import format_frame
+
+# detections: (classes: list of class_name, boxes: list of [x1, y1, x2, y2])
+# actions: list of f'{action_name}: {confidence}'
+
+detect_object_frame_steps = 5
+classify_action_frame_steps = 15
+classify_action_num_frames = 8
+
+def detect_object(detector, frame):
+    result = detector(frame, classes=4, verbose=False)[0]
+    classes = result.boxes.cls.numpy()
+    boxes = result.boxes.xyxy.numpy()
+    predictions = [
+        (result.names[classes[i]].capitalize(), boxes[i])
+        for i in range(len(classes))
+    ]
+    detections = (
+        [result.names[i].capitalize() for i in classes],
+        boxes,
+    )
+    return detections
+
+def classify_action(classifier, frames, id_to_name):
+    actions = []
+    frames = np.array(frames)
+    frames = frames[..., [2, 1, 0]]
+    frames = tf.expand_dims(frames, 0)
+    output = classifier(frames, training=False)
+    confidences = tf.nn.softmax(output).numpy()[0]
+    for (class_id, confidence) in enumerate(confidences):
+        other_class_id = 2
+        if confidence > 0.3 and class_id != other_class_id:
+            actions.append(f'{id_to_name[class_id]}: {np.round(confidence, 2)}')
+    return actions
+
+def draw_boxes(frame, detections, actions):
+    (classes, boxes) = detections
+    max_area = 0
+    max_area_id = 0
+    for i, box in enumerate(boxes):
+        area = (box[3] - box[1]) * (box[2] - box[0])
+        if area > max_area:
+            max_area = area
+            max_area_id = i
+    labels = [0 for _ in classes]
+    colormap = [(0x39, 0xc5, 0xbb)]
+    line_width = 2
+    captions = [
+        f'{class_name}\n' + '\n'.join(actions if i == max_area_id else [])
+        for (i, class_name) in enumerate(classes)
+    ]
+    bboxes = [
+        [box[1], box[0], box[3], box[2]]
+        for box in boxes
+    ]
+    frame = instances2rgb(
+        frame,
+        labels=labels,
+        captions=captions,
+        bboxes=bboxes,
+        colormap=colormap,
+        font_size=20,
+        line_width=line_width,
+    )
+    return frame
+
+def FrameProcessor(detector, classifier, id_to_name):
+    current_frame = 0
+    frames = []
+    actions = []
+    detections = ([], [])
+    def process_frame(frame):
+        nonlocal current_frame, frames, actions, detections
+        current_frame += 1
+        if current_frame % classify_action_frame_steps == 0:
+            frames.append(format_frame(frame))
+        if current_frame % detect_object_frame_steps == 0:
+            print(f'Detect object: Frame {current_frame}')
+            detections = detect_object(detector, frame)
+        if len(frames) == classify_action_num_frames:
+            print(f'Classify action: Until frame {current_frame}')
+            actions = classify_action(classifier, frames, id_to_name)
+            frames = []
+        frame = draw_boxes(frame, detections, actions)
+        return frame
+    return process_frame
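A quick way to exercise detect_object and draw_boxes outside the video loop is a single-image smoke test. This is a sketch, not part of the commit: sample.jpg is a hypothetical input, and actions is left empty because the action classifier needs a window of 8 buffered frames.

import cv2
from ultralytics import YOLO

from core.inference import detect_object, draw_boxes

detector = YOLO('weights/yolov8n.pt')  # weights added in this commit
image = cv2.cvtColor(cv2.imread('sample.jpg'), cv2.COLOR_BGR2RGB)
detections = detect_object(detector, image)  # restricted to COCO class 4, aeroplane
marked = draw_boxes(image, detections, actions=[])
cv2.imwrite('marked.jpg', cv2.cvtColor(marked, cv2.COLOR_RGB2BGR))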
core/model.py
ADDED
@@ -0,0 +1,44 @@
+import tensorflow as tf
+from tensorflow import keras
+from official.projects.movinet.modeling import movinet
+from official.projects.movinet.modeling import movinet_model
+
+model_id = 'a1'
+num_classes = 6
+num_frames = 8
+resolution = 224
+
+batch_size = 32
+learning_rate = 0.001
+backbone_trainable = True
+
+def build_classifier_with_pretrained_weights(checkpoint_dir: str):
+    backbone = movinet.Movinet(model_id=model_id)
+    backbone.trainable = backbone_trainable
+    model = movinet_model.MovinetClassifier(backbone=backbone, num_classes=600)
+    checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)
+    checkpoint = tf.train.Checkpoint(model=model)
+    status = checkpoint.restore(checkpoint_path)
+    status.assert_existing_objects_matched()
+    model = movinet_model.MovinetClassifier(
+        backbone=backbone,
+        num_classes=num_classes,
+    )
+    model.build([batch_size, num_frames, resolution, resolution, 3])
+    return model
+
+def load_classifier(weights_path: str):
+    backbone = movinet.Movinet(model_id=model_id)
+    model = movinet_model.MovinetClassifier(
+        backbone=backbone,
+        num_classes=num_classes,
+    )
+    model.build([1, num_frames, resolution, resolution, 3])
+    model.load_weights(weights_path)
+    return model
+
+def compile_classifier(model):
+    loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
+    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
+    return model
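The commit ships only inference weights, but core/model.py also carries the training-side pieces. A sketch of how they would fit together, assuming a Kinetics-600 MoViNet-A1 checkpoint unpacked into a movinet_a1_base directory; the checkpoint path and epoch count are illustrative assumptions, not values recorded in this commit:

import tensorflow as tf

from core.data import FrameGenerator
from core.model import build_classifier_with_pretrained_weights, compile_classifier

output_signature = (
    tf.TensorSpec(shape=(None, None, None, 3), dtype=tf.float32),
    tf.TensorSpec(shape=(), dtype=tf.int16),
)
train_ds = tf.data.Dataset.from_generator(
    FrameGenerator('storage/dataset', 'training'),
    output_signature=output_signature,
).batch(32)  # matches batch_size in core/model.py

# Swap the 600-class Kinetics head for the 6-class head, then fine-tune.
model = build_classifier_with_pretrained_weights('movinet_a1_base')
model = compile_classifier(model)
model.fit(train_ds, epochs=2)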
movinet/data.py
DELETED
@@ -1,79 +0,0 @@
-from pathlib import Path
-import random
-from typing import Literal
-import cv2
-import numpy as np
-import tensorflow as tf
-
-TRAINING_RATIO = 0.1
-VALIDATION_RATIO = 0.01
-
-def format_frames(frame, output_size):
-    frame = tf.image.convert_image_dtype(frame, tf.float32)
-    frame = tf.image.resize_with_pad(frame, *output_size)
-    return frame
-
-def frames_from_video_file(video_path: str, n_frames: int, output_size=(256, 256), frame_step=15):
-    capture = cv2.VideoCapture(video_path)
-    if not capture.isOpened(): raise ValueError('Video file could not be opened.')
-    total_frames = capture.get(cv2.CAP_PROP_FRAME_COUNT)
-    need_frames = 1 + (n_frames - 1) * frame_step
-
-    if need_frames <= total_frames:
-        start = random.randint(0, total_frames - need_frames + 1)
-        capture.set(cv2.CAP_PROP_POS_FRAMES, start)
-
-    frames = []
-    for _ in range(n_frames - 1):
-        for _ in range(frame_step):
-            ok, frame = capture.read()
-        if ok:
-            frames.append(format_frames(frame, output_size))
-        else:
-            frames.append(np.zeros((output_size[0], output_size[1], 3)))
-    capture.release()
-
-    frames = np.array(frames)
-    frames = frames[..., [2, 1, 0]]
-    return frames
-
-def Data(data_dir: Path):
-    return {
-        'training': {
-            a.name: (lambda ps: ps[:int(len(ps) * TRAINING_RATIO)])([x for x in a.iterdir()])
-            for a in data_dir.iterdir()
-        },
-        'validation': {
-            a.name: (lambda ps: ps[
-                int(len(ps) * TRAINING_RATIO) :
-                int(len(ps) * (TRAINING_RATIO + VALIDATION_RATIO))
-            ])([x for x in a.iterdir()])
-            for a in data_dir.iterdir()
-        },
-    }
-
-def frame_generator(data_dir: Path, n_frames: int, split: Literal['training', 'validation']):
-    class_names = sorted([x.name for x in data_dir.iterdir()])
-    class_ids_for_name = {
-        name: i
-        for i, name in enumerate(class_names)
-    }
-    data = Data(data_dir)
-
-    def generator():
-        pairs = [
-            (path, name)
-            for name, paths in data[split].items()
-            for path in paths
-        ]
-        random.shuffle(pairs)
-        for path, name in pairs:
-            video_frames = frames_from_video_file(str(path), n_frames)
-            label = class_ids_for_name[name]
-            yield video_frames, label
-    return generator
-
-def total_steps(data_dir: Path):
-    data = Data(data_dir)
-    size = lambda d: sum([len(x) for x in d.values()])
-    return size(data['training']), size(data['validation'])
playgrounds/load_video.py
DELETED
@@ -1,112 +0,0 @@
-import random
-from typing import *
-import numpy as np
-import tensorflow as tf
-import cv2
-from pathlib import Path
-
-SPLIT_RATIO = 0.7
-BATCH_SIZE = 8
-NUM_FRAMES = 8
-
-def main():
-    data_dir = Path('assets/dataset')
-    output_signature = (
-        tf.TensorSpec(shape = (None, None, None, 3), dtype = tf.float32),
-        tf.TensorSpec(shape = (), dtype = tf.int16),
-    )
-    train_ds = tf.data.Dataset.from_generator(frame_generator(data_dir, NUM_FRAMES, 'training'), output_signature=output_signature)
-    train_ds = train_ds.batch(BATCH_SIZE)
-
-def format_frames(frame, output_size):
-    """
-    Pad and resize an image from a video.
-
-    Args:
-        frame: Image that needs to resized and padded.
-        output_size: Pixel size of the output frame image.
-
-    Return:
-        Formatted frame with padding of specified output size.
-    """
-    frame = tf.image.convert_image_dtype(frame, tf.float32)
-    frame = tf.image.resize_with_pad(frame, *output_size)
-    return frame
-
-def frames_from_video_file(video_path, n_frames, output_size=(224, 224), frame_step=15):
-    """
-    Creates frames from each video file present for each category.
-
-    Args:
-        video_path: File path to the video.
-        n_frames: Number of frames to be created per video file.
-        output_size: Pixel size of the output frame image.
-
-    Return:
-        An NumPy array of frames in the shape of (n_frames, height, width, channels).
-    """
-    # Read each video frame by frame
-    result = []
-    src = cv2.VideoCapture(str(video_path))
-
-    video_length = src.get(cv2.CAP_PROP_FRAME_COUNT)
-
-    need_length = 1 + (n_frames - 1) * frame_step
-
-    if need_length > video_length:
-        start = 0
-    else:
-        max_start = video_length - need_length
-        start = random.randint(0, max_start + 1)
-
-    src.set(cv2.CAP_PROP_POS_FRAMES, start)
-    # ret is a boolean indicating whether read was successful, frame is the image itself
-    ok, frame = src.read()
-    if not ok:
-        raise ValueError('read video not success')
-    result.append(format_frames(frame, output_size))
-
-    for _ in range(n_frames - 1):
-        for _ in range(frame_step):
-            ok, frame = src.read()
-        if ok:
-            frame = format_frames(frame, output_size)
-            result.append(frame)
-        else:
-            result.append(np.zeros_like(result[0]))
-    src.release()
-    result = np.array(result)[..., [2, 1, 0]]
-
-    return result
-
-def frame_generator(data_dir: Path, n_frames: int, split: Literal['training', 'validation']):
-    class_names = sorted([x.name for x in data_dir.iterdir()])
-    class_ids_for_name = {
-        name: i
-        for i, name in enumerate(class_names)
-    }
-    data = {
-        'training': {
-            a.name: (lambda ps: ps[:int(len(ps) * SPLIT_RATIO)])([x for x in a.iterdir()])
-            for a in data_dir.iterdir()
-        },
-        'validation': {
-            a.name: (lambda ps: ps[int(len(ps) * SPLIT_RATIO):])([x for x in a.iterdir()])
-            for a in data_dir.iterdir()
-        },
-    }
-
-    def generator():
-        pairs = [
-            (path, name)
-            for name, paths in data[split].items()
-            for path in paths
-        ]
-        random.shuffle(pairs)
-        for path, name in pairs:
-            video_frames = frames_from_video_file(path, n_frames)
-            label = class_ids_for_name[name]  # Encode labels
-            yield video_frames, label
-    return generator
playgrounds/movinet.py
DELETED
@@ -1,80 +0,0 @@
-import tensorflow as tf
-import numpy as np
-import tensorflow_hub as hub
-import keras
-
-labels_path = keras.utils.get_file(
-    fname='labels.txt',
-    origin='https://raw.githubusercontent.com/tensorflow/models/f8af2291cced43fc9f1d9b41ddbf772ae7b0d7d2/official/projects/movinet/files/kinetics_600_labels.txt'
-)
-
-with open(labels_path, 'r', encoding='utf-8') as file:
-    lines = file.read().splitlines()
-
-KINETICS_600_LABELS = np.array([line.strip() for line in lines])
-KINETICS_600_LABELS[:20]
-
-def main():
-    jumping_jack_path = 'assets/jumping_pack.gif'
-    jumping_jack = load_gif(jumping_jack_path)
-
-    id = 'a2'
-    mode = 'base'
-    version = '3'
-    hub_url = f'https://tfhub.dev/tensorflow/movinet/{id}/{mode}/kinetics-600/classification/{version}'
-    model = hub.load(hub_url)
-    sig = model.signatures['serving_default']
-    print('Model loaded.')
-
-    sig(image=jumping_jack[tf.newaxis, :1])
-    logits = sig(image=jumping_jack[tf.newaxis, ...])
-    logits = logits['classifier_head'][0]
-    probs = tf.nn.softmax(logits, axis=-1)
-    for label, p in get_top_k(probs):
-        print(f'{label:20s}: {p:.3f}')
-
-def get_top_k(probs, k=5, label_map=KINETICS_600_LABELS):
-    """Outputs the top k model labels and probabilities on the given video.
-
-    Args:
-        probs: probability tensor of shape (num_frames, num_classes) that represents
-            the probability of each class on each frame.
-        k: the number of top predictions to select.
-        label_map: a list of labels to map logit indices to label strings.
-
-    Returns:
-        a tuple of the top-k labels and probabilities.
-    """
-    # Sort predictions to find top_k
-    top_predictions = tf.argsort(probs, axis=-1, direction='DESCENDING')[:k]
-    # collect the labels of top_k predictions
-    top_labels = tf.gather(label_map, top_predictions, axis=-1)
-    # decode labels
-    top_labels = [label.decode('utf8') for label in top_labels.numpy()]
-    # top_k probabilities of the predictions
-    top_probs = tf.gather(probs, top_predictions, axis=-1).numpy()
-    return tuple(zip(top_labels, top_probs))
-
-def load_gif(file_path, image_size=(224, 224)):
-    """Loads a gif file into a TF tensor.
-
-    Use images resized to match what's expected by your model.
-    The model pages say the "A2" models expect 224 x 224 images at 5 fps
-
-    Args:
-        file_path: path to the location of a gif file.
-        image_size: a tuple of target size.
-
-    Returns:
-        a video of the gif file
-    """
-    # Load a gif file, convert it to a TF tensor
-    raw = tf.io.read_file(file_path)
-    video = tf.io.decode_gif(raw)
-    # Resize the video
-    video = tf.image.resize(video, image_size)
-    # change dtype to a float32
-    # Hub models always want images normalized to [0,1]
-    # ref: https://www.tensorflow.org/hub/common_signatures/images#input
-    video = tf.cast(video, tf.float32) / 255.
-    return video
playgrounds/verify_metal.py
DELETED
@@ -1,14 +0,0 @@
-import tensorflow as tf
-
-cifar = tf.keras.datasets.cifar100
-(x_train, y_train), (x_test, y_test) = cifar.load_data()
-model = tf.keras.applications.ResNet50(
-    include_top=True,
-    weights=None,
-    input_shape=(32, 32, 3),
-    classes=100,
-)
-
-loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
-model.compile(optimizer="adam", loss=loss_fn, metrics=["accuracy"])
-model.fit(x_train, y_train, epochs=5, batch_size=64)
playgrounds/yolo.py
DELETED
@@ -1,40 +0,0 @@
-import keras
-import keras_cv
-import numpy as np
-import tensorflow as tf
-
-from playgrounds.load_video import frames_from_video_file
-
-def main():
-    pretrained_model = keras_cv.models.YOLOV8Detector.from_preset(
-        "yolo_v8_m_pascalvoc", bounding_box_format="xywh"
-    )
-    print('Model loaded.')
-
-    inference_resizing = keras_cv.layers.Resizing(
-        640, 640, pad_to_aspect_ratio=True, bounding_box_format="xywh"
-    )
-
-    class_ids = [
-        "Aeroplane", "Bicycle", "Bird", "Boat", "Bottle", "Bus", "Car", "Cat", "Chair", "Cow", "Dining Table",
-        "Dog", "Horse", "Motorbike", "Person", "Potted Plant", "Sheep", "Sofa", "Train", "Tvmonitor", "Total",
-    ]
-    class_mapping = {i: c for (i, c) in enumerate(class_ids)}
-
-    # raw = tf.io.read_file('assets/IMG_9528.gif')
-    # video = tf.io.decode_gif(raw)
-    video = frames_from_video_file('assets/dataset/Flying/2kNjmM8BnD0_230.0_238.0.mp4', 3, (640,640))
-    image = video[0]
-    image = (image*255).astype(np.uint8)
-    file = tf.io.encode_png(image)
-    tf.io.write_file('out/t.png', file)
-    # image = keras.utils.load_img('assets/nick-morales-BwYcH78rcpI-unsplash.jpg')
-    # image = np.array(image)
-
-    image_batch = inference_resizing([image])
-
-    y_pred = pretrained_model.predict(image_batch)
-    classes = y_pred['classes']
-    boxes = y_pred["boxes"]
-    print(f'Classes: {classes}')
-    print(f'Boxes: {boxes}')
requirements.txt
CHANGED
@@ -1,3 +1,7 @@
-gradio
 tensorflow
+numpy
 opencv-python
+tf-models-official
+ultralytics
+imgviz
+moviepy
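Note that gradio is dropped from requirements.txt even though app.py imports it; on Hugging Face Spaces the Gradio SDK supplies the package itself. To mirror the environment locally (a local-setup assumption, not part of the commit), install it alongside the listed dependencies:

pip install -r requirements.txt
pip install gradio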
weights/classifier-7.keras
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:13a9436ec0971fe72b53f03d9dd57b89a7c48a4cb82380e14b298c3e2d712f50
+size 25261904
weights/yolov8n.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:31e20dde3def09e2cf938c7be6fe23d9150bbbe503982af13345706515f2ef95
+size 6534387