Frank Pacini committed on
Commit
6155c0e
β€’
1 Parent(s): e694ec3
CustomFile.py ADDED
@@ -0,0 +1,19 @@
+ import gradio as gr
+ # from typing import Dict
+ # import base64
+
+ # def encode_file_to_base64(f):
+ #     with open(f, "rb") as file:
+ #         encoded_string = base64.b64encode(file.read())
+ #         base64_str = str(encoded_string, "utf-8")
+ #         return base64_str
+
+ class CustomFile(gr.File):
+     # def postprocess(self, y: str) -> Dict:
+     #     res = super().postprocess(y)
+     #     if res is not None:
+     #         res['data'] = encode_file_to_base64(res['name'])
+     #     return res
+     def dummy(self):
+         return
+
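The class above keeps the base64 postprocess override commented out and only adds a `dummy` method. A minimal sketch of what re-enabling that override could look like, assuming Gradio 3.12's `gr.File.postprocess` still returns a dict with 'name' and 'data' keys (the subclass name `Base64File` is illustrative, not part of this commit):

import base64
from typing import Dict

import gradio as gr


def encode_file_to_base64(f):
    # Read the file bytes and return them as a UTF-8 base64 string.
    with open(f, "rb") as file:
        return str(base64.b64encode(file.read()), "utf-8")


class Base64File(gr.File):
    def postprocess(self, y: str) -> Dict:
        # Keep the default payload but replace the served file path with inline base64 data.
        res = super().postprocess(y)
        if res is not None:
            res['data'] = encode_file_to_base64(res['name'])
        return res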
README.md CHANGED
@@ -1,12 +1,13 @@
 ---
- title: Fal2022 Videoanalysis V2
- emoji: 💩
+ title: Fall2022 Videoanalysis
+ emoji: 📈
 colorFrom: yellow
- colorTo: red
+ colorTo: purple
 sdk: gradio
 sdk_version: 3.12.0
 app_file: app.py
 pinned: false
+ license: apache-2.0
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,35 @@
+ import gradio as gr
+
+ from slowfast import slow_fast_train
+ from video_object_extraction import video_object_extraction
+ from audio_feature_extraction_final import audio_feature_extraction
+ from CustomFile import CustomFile
+
+ import numpy as np
+ import pandas as pd
+ import pickle
+ import torch
+
+ try:
+     import detectron2
+ except ImportError:
+     import os
+     os.system('pip install git+https://github.com/facebookresearch/detectron2.git')
+
+
+ def predict(video_path, frames):
+     # gpu = torch.cuda.is_available()
+     # video_1, df1 = slow_fast_train(video_path, gpu)
+     # video_2, df2 = video_object_extraction(video_path, frames)
+     # audio_path = audio_feature_extraction(video_path, gpu)
+     # return ([video_1, video_2, audio_path], df1, df2)
+     audio_features = np.random.rand(2, 2)
+     audio_path = 'audio_embeddings'
+     with open(audio_path, 'wb') as f:
+         pickle.dump(audio_features, f)
+     df = pd.DataFrame()
+     return ([video_path, video_path, audio_path], df, df)
+
+
+ iface = gr.Interface(predict, inputs=[gr.Video(), gr.Slider(1, 100, value=15)], outputs=[gr.File(), gr.Dataframe(max_rows=10), gr.Dataframe(max_rows=10)])
+ iface.launch(show_error=True, debug=True)
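As wired above, `predict` currently returns placeholder outputs (the input video twice, a pickled random array, and two empty DataFrames) while the real pipeline calls stay commented out. A hedged way to exercise it outside the Gradio UI, with `sample.mp4` as a hypothetical local file:

files, df1, df2 = predict('sample.mp4', frames=15)
print(files)      # ['sample.mp4', 'sample.mp4', 'audio_embeddings']
print(df1.empty)  # True: the DataFrames are placeholders until the pipeline is re-enabled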
audio_feature_extraction_final.py ADDED
@@ -0,0 +1,125 @@
1
+ import torch
2
+ from torchaudio import load as torchaudio_load
3
+ from moviepy.editor import VideoFileClip
4
+
5
+ from pyannote.audio import Pipeline
6
+ from sklearn.preprocessing import LabelEncoder
7
+ from librosa import load as librosa_load
8
+ import librosa.display
9
+ import math
10
+ import pandas as pd
11
+
12
+ import sys
13
+ from tqdm import tqdm
14
+ import numpy as np
15
+ from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration, pipeline as transformers_pipeline
16
+ import pickle
17
+
18
+
19
+ """"Author: Frank"""
20
+ def extract_s2t_features(gpu):
21
+ model_name="medium"
22
+ processor = Speech2TextProcessor.from_pretrained("facebook/s2t-{}-librispeech-asr".format(model_name))
23
+ model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-{}-librispeech-asr".format(model_name))
24
+ if gpu:
25
+ model = model.cuda()
26
+ model.load_state_dict(torch.load('s2t_model'))
27
+ model.eval()
28
+
29
+ sample_rate = 16000
30
+ embedding_window = 10 # in secs
31
+
32
+ audio, _ = torchaudio_load('temp.wav')
33
+ audio = torch.mean(audio, dim=0)
34
+
35
+ embs = []
36
+ audio_clips = audio.split(embedding_window*sample_rate)
37
+ if len(audio_clips) > 1:
38
+ audio_clips = audio_clips[:-1]
39
+ for clip in tqdm(audio_clips):
40
+ with torch.no_grad():
41
+ inputs = processor(clip, sampling_rate=16000, return_tensors="pt")
42
+ features = inputs["input_features"]
43
+ decoder_input = torch.zeros(features.shape[:2], dtype=torch.int32)
44
+ if gpu:
45
+ features, decoder_input = features.cuda(), decoder_input.cuda()
46
+
47
+ h = model.model(features, decoder_input_ids=decoder_input).last_hidden_state.cpu()
48
+ emb = torch.mean(h,axis=1)
49
+ embs.append(emb)
50
+ return torch.cat(embs).numpy()
51
+
52
+
53
+ """"Author: Sichao"""
54
+ def extract_speaker_features(gpu):
55
+ x , sample_rate = librosa_load('temp.wav')
56
+ print('Input sample rate: {}, Length: {} s'.format(sample_rate, x.size/sample_rate))
57
+
58
+ # speaker diarization
59
+ print('Start speaker diarization...')
60
+ pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization@2.1", use_auth_token='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')
61
+ diarization = pipeline('temp.wav')
62
+ speaker_per_sec_dict = {i: 'UNKNOWN' for i in range(0, math.ceil(x.size/sample_rate))}
63
+
64
+ for turn, _, speaker in diarization.itertracks(yield_label=True):
65
+ for clip_start in range(math.ceil(turn.start), math.ceil(turn.end)):
66
+ if speaker_per_sec_dict[clip_start] == 'UNKNOWN':
67
+ speaker_per_sec_dict[clip_start] = speaker
68
+ elif speaker_per_sec_dict[clip_start] != speaker:
69
+ speaker_per_sec_dict[clip_start] = speaker_per_sec_dict[clip_start] + ' ' + speaker
70
+
71
+ speaker_per_clip = []
72
+ for i in range(0, math.ceil(x.size/sample_rate), 10):
73
+ speakers = []
74
+ for j in range(10):
75
+ if i + j in speaker_per_sec_dict and speaker_per_sec_dict[i + j] != 'UNKNOWN':
76
+ speakers.append(speaker_per_sec_dict[i + j])
77
+ if len(speakers) > 0:
78
+ is_single_speaker = all(s == speakers[0] for s in speakers)
79
+ if is_single_speaker:
80
+ speaker_per_clip.append(speakers[0])
81
+ else:
82
+ speaker_per_clip.append('MULTI SPEAKER')
83
+ else:
84
+ speaker_per_clip.append('UNKNOWN')
85
+
86
+ # Adult child classification
87
+ print('Start adult child classification...')
88
+ device = 0 if gpu else -1
89
+ audio_classifier = transformers_pipeline(task="audio-classification", model="bookbot/wav2vec2-adult-child-cls", device=device)
90
+ clip_idxs = [i for i in range(0, math.ceil(x.size/sample_rate), 10)]
91
+ classifications = []
92
+ for clip_start in tqdm(clip_idxs):
93
+ with torch.no_grad():
94
+ preds = audio_classifier(x[clip_start*sample_rate:(clip_start + 10)*sample_rate])
95
+ preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds]
96
+ classifications.append(preds[0]['label'])
97
+
98
+ # output
99
+ print('Output...')
100
+ output = {'clip_start':clip_idxs, 'diarization':speaker_per_clip, 'adult_child_classification':classifications}
101
+ output_df = pd.DataFrame(output)
102
+ # Create an instance of LabelEncoder.
103
+ le = LabelEncoder()
104
+
105
+ # Encode the string labels as numeric values.
106
+ output_df['diarization_numeric'] = le.fit_transform(output_df['diarization'])
107
+ output_df['adult_child_classification_numeric'] = le.fit_transform(output_df['adult_child_classification'])
108
+ return output_df['diarization_numeric'].values, output_df['adult_child_classification_numeric'].values
109
+
110
+ def audio_feature_extraction(input_path, gpu=False):
111
+ output_path = 'audio_embedding'
112
+ audioTrack = VideoFileClip(input_path).audio
113
+ audioTrack.write_audiofile('temp.wav', codec='pcm_s16le', fps=16000)
114
+
115
+ print('Extracting s2t features...')
116
+ s2t_features = extract_s2t_features(gpu)
117
+ print('Extracting speaker features...')
118
+ diarization_features, adult_child_class_features = extract_speaker_features(gpu)
119
+
120
+ if len(diarization_features) > 1:
121
+ diarization_features, adult_child_class_features = diarization_features[:-1], adult_child_class_features[:-1]
122
+ audio_features = np.concatenate((s2t_features, diarization_features[:, None], adult_child_class_features[:, None]), axis=1)
123
+ with open(output_path, 'wb') as f:
124
+ pickle.dump(audio_features, f)
125
+ return output_path
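`audio_feature_extraction` writes the concatenated feature matrix (the Speech2Text embeddings plus the diarization and adult/child label columns) to a pickle file and returns its path. A hedged usage sketch, with `input.mp4` as a hypothetical video:

import pickle

path = audio_feature_extraction('input.mp4', gpu=False)
with open(path, 'rb') as f:
    audio_features = pickle.load(f)
# One row per 10-second clip; the last two columns are the encoded diarization
# and adult/child labels appended after the Speech2Text embedding dimensions.
print(audio_features.shape)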
ava_action_list.pbtxt ADDED
@@ -0,0 +1,240 @@
1
+ item {
2
+ name: "bend/bow (at the waist)"
3
+ id: 1
4
+ }
5
+ item {
6
+ name: "crouch/kneel"
7
+ id: 3
8
+ }
9
+ item {
10
+ name: "dance"
11
+ id: 4
12
+ }
13
+ item {
14
+ name: "fall down"
15
+ id: 5
16
+ }
17
+ item {
18
+ name: "get up"
19
+ id: 6
20
+ }
21
+ item {
22
+ name: "jump/leap"
23
+ id: 7
24
+ }
25
+ item {
26
+ name: "lie/sleep"
27
+ id: 8
28
+ }
29
+ item {
30
+ name: "martial art"
31
+ id: 9
32
+ }
33
+ item {
34
+ name: "run/jog"
35
+ id: 10
36
+ }
37
+ item {
38
+ name: "sit"
39
+ id: 11
40
+ }
41
+ item {
42
+ name: "stand"
43
+ id: 12
44
+ }
45
+ item {
46
+ name: "swim"
47
+ id: 13
48
+ }
49
+ item {
50
+ name: "walk"
51
+ id: 14
52
+ }
53
+ item {
54
+ name: "answer phone"
55
+ id: 15
56
+ }
57
+ item {
58
+ name: "carry/hold (an object)"
59
+ id: 17
60
+ }
61
+ item {
62
+ name: "climb (e.g., a mountain)"
63
+ id: 20
64
+ }
65
+ item {
66
+ name: "close (e.g., a door, a box)"
67
+ id: 22
68
+ }
69
+ item {
70
+ name: "cut"
71
+ id: 24
72
+ }
73
+ item {
74
+ name: "dress/put on clothing"
75
+ id: 26
76
+ }
77
+ item {
78
+ name: "drink"
79
+ id: 27
80
+ }
81
+ item {
82
+ name: "drive (e.g., a car, a truck)"
83
+ id: 28
84
+ }
85
+ item {
86
+ name: "eat"
87
+ id: 29
88
+ }
89
+ item {
90
+ name: "enter"
91
+ id: 30
92
+ }
93
+ item {
94
+ name: "hit (an object)"
95
+ id: 34
96
+ }
97
+ item {
98
+ name: "lift/pick up"
99
+ id: 36
100
+ }
101
+ item {
102
+ name: "listen (e.g., to music)"
103
+ id: 37
104
+ }
105
+ item {
106
+ name: "open (e.g., a window, a car door)"
107
+ id: 38
108
+ }
109
+ item {
110
+ name: "play musical instrument"
111
+ id: 41
112
+ }
113
+ item {
114
+ name: "point to (an object)"
115
+ id: 43
116
+ }
117
+ item {
118
+ name: "pull (an object)"
119
+ id: 45
120
+ }
121
+ item {
122
+ name: "push (an object)"
123
+ id: 46
124
+ }
125
+ item {
126
+ name: "put down"
127
+ id: 47
128
+ }
129
+ item {
130
+ name: "read"
131
+ id: 48
132
+ }
133
+ item {
134
+ name: "ride (e.g., a bike, a car, a horse)"
135
+ id: 49
136
+ }
137
+ item {
138
+ name: "sail boat"
139
+ id: 51
140
+ }
141
+ item {
142
+ name: "shoot"
143
+ id: 52
144
+ }
145
+ item {
146
+ name: "smoke"
147
+ id: 54
148
+ }
149
+ item {
150
+ name: "take a photo"
151
+ id: 56
152
+ }
153
+ item {
154
+ name: "text on/look at a cellphone"
155
+ id: 57
156
+ }
157
+ item {
158
+ name: "throw"
159
+ id: 58
160
+ }
161
+ item {
162
+ name: "touch (an object)"
163
+ id: 59
164
+ }
165
+ item {
166
+ name: "turn (e.g., a screwdriver)"
167
+ id: 60
168
+ }
169
+ item {
170
+ name: "watch (e.g., TV)"
171
+ id: 61
172
+ }
173
+ item {
174
+ name: "work on a computer"
175
+ id: 62
176
+ }
177
+ item {
178
+ name: "write"
179
+ id: 63
180
+ }
181
+ item {
182
+ name: "fight/hit (a person)"
183
+ id: 64
184
+ }
185
+ item {
186
+ name: "give/serve (an object) to (a person)"
187
+ id: 65
188
+ }
189
+ item {
190
+ name: "grab (a person)"
191
+ id: 66
192
+ }
193
+ item {
194
+ name: "hand clap"
195
+ id: 67
196
+ }
197
+ item {
198
+ name: "hand shake"
199
+ id: 68
200
+ }
201
+ item {
202
+ name: "hand wave"
203
+ id: 69
204
+ }
205
+ item {
206
+ name: "hug (a person)"
207
+ id: 70
208
+ }
209
+ item {
210
+ name: "kiss (a person)"
211
+ id: 72
212
+ }
213
+ item {
214
+ name: "lift (a person)"
215
+ id: 73
216
+ }
217
+ item {
218
+ name: "listen to (a person)"
219
+ id: 74
220
+ }
221
+ item {
222
+ name: "push (another person)"
223
+ id: 76
224
+ }
225
+ item {
226
+ name: "sing to (e.g., self, a person, a group)"
227
+ id: 77
228
+ }
229
+ item {
230
+ name: "take (an object) from (a person)"
231
+ id: 78
232
+ }
233
+ item {
234
+ name: "talk to (e.g., self, a person, a group)"
235
+ id: 79
236
+ }
237
+ item {
238
+ name: "watch (a person)"
239
+ id: 80
240
+ }
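slowfast.py consumes this label map through PyTorchVideo's AVA helper; a minimal sketch of that lookup, mirroring the call in slowfast.py:

from pytorchvideo.data.ava import AvaLabeledVideoFramePaths

# Returns an {id: name} dict plus the set of allowed class ids.
label_map, allowed_class_ids = AvaLabeledVideoFramePaths.read_label_map('ava_action_list.pbtxt')
print(label_map[12])  # "stand"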
coco.names ADDED
@@ -0,0 +1,80 @@
1
+ person
2
+ bicycle
3
+ car
4
+ motorbike
5
+ aeroplane
6
+ bus
7
+ train
8
+ truck
9
+ boat
10
+ traffic light
11
+ fire hydrant
12
+ stop sign
13
+ parking meter
14
+ bench
15
+ bird
16
+ cat
17
+ dog
18
+ horse
19
+ sheep
20
+ cow
21
+ elephant
22
+ bear
23
+ zebra
24
+ giraffe
25
+ backpack
26
+ umbrella
27
+ handbag
28
+ tie
29
+ suitcase
30
+ frisbee
31
+ skis
32
+ snowboard
33
+ sports ball
34
+ kite
35
+ baseball bat
36
+ baseball glove
37
+ skateboard
38
+ surfboard
39
+ tennis racket
40
+ bottle
41
+ wine glass
42
+ cup
43
+ fork
44
+ knife
45
+ spoon
46
+ bowl
47
+ banana
48
+ apple
49
+ sandwich
50
+ orange
51
+ broccoli
52
+ carrot
53
+ hot dog
54
+ pizza
55
+ donut
56
+ cake
57
+ chair
58
+ sofa
59
+ pottedplant
60
+ bed
61
+ diningtable
62
+ toilet
63
+ tvmonitor
64
+ laptop
65
+ mouse
66
+ remote
67
+ keyboard
68
+ cell phone
69
+ microwave
70
+ oven
71
+ toaster
72
+ sink
73
+ refrigerator
74
+ book
75
+ clock
76
+ vase
77
+ scissors
78
+ teddy bear
79
+ hair drier
80
+ toothbrush
environment.yml ADDED
@@ -0,0 +1,5 @@
+ name: env
+ dependencies:
+   - cudatoolkit
+   - pip:
+     - -r requirements.txt
requirements.txt ADDED
@@ -0,0 +1,28 @@
+ imutils
+ matplotlib
+ numpy
+ pandas
+ opencv-python
+ ffmpeg-python
+ pytorchvideo
+
+ cython
+ scipy
+ tqdm
+ gdown
+ cmake
+
+ # Torch
+ --find-links https://download.pytorch.org/whl/cu111
+ torch==1.10.0
+ torchvision==0.11.1
+
+ # Detectron
+ --find-links https://dl.fbaipublicfiles.com/detectron2/wheels/cpu/torch1.10/index.html
+ detectron2
+
+ moviepy
+ pyannote.audio
+ scikit-learn
+ librosa
+ transformers
slowfast.py ADDED
@@ -0,0 +1,191 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ import cv2
4
+ import torch
5
+ import warnings
6
+ from detectron2.config import get_cfg
7
+ from detectron2 import model_zoo
8
+ from detectron2.engine import DefaultPredictor
9
+ import ffmpeg
10
+ import pytorchvideo
11
+ from pytorchvideo.transforms.functional import (
12
+ uniform_temporal_subsample,
13
+ short_side_scale_with_boxes,
14
+ clip_boxes_to_image
15
+ )
16
+ from torchvision.transforms._functional_video import normalize
17
+ from pytorchvideo.data.ava import AvaLabeledVideoFramePaths
18
+ from pytorchvideo.models.hub import slowfast_r50_detection # Another option is slow_r50_detection
19
+ from visualization import VideoVisualizer
20
+
21
+
22
+ # This method takes in an image and generates the bounding boxes for people in the image.
23
+ def get_person_bboxes(inp_img, predictor):
24
+ predictions = predictor(inp_img.cpu().detach().numpy())['instances'].to('cpu')
25
+ boxes = predictions.pred_boxes if predictions.has("pred_boxes") else None
26
+ scores = predictions.scores if predictions.has("scores") else None
27
+ classes = np.array(predictions.pred_classes.tolist() if predictions.has("pred_classes") else None)
28
+ predicted_boxes = boxes[np.logical_and(classes==0, scores>0.75 )].tensor.cpu() # only person
29
+ return predicted_boxes
30
+
31
+
32
+ def ava_inference_transform(
33
+ clip,
34
+ boxes,
35
+ num_frames = 32, # 4 if using slowfast_r50_detection, change this to 32
36
+ crop_size = 256,
37
+ data_mean = [0.45, 0.45, 0.45],
38
+ data_std = [0.225, 0.225, 0.225],
39
+ slow_fast_alpha = 4, # if using slowfast_r50_detection, change None to 4
40
+ device = 'cpu'):
41
+
42
+ boxes = np.array(boxes)
43
+ ori_boxes = boxes.copy()
44
+
45
+ # Image [0, 255] -> [0, 1].
46
+ clip = uniform_temporal_subsample(clip, num_frames)
47
+ clip = clip.float()
48
+ clip = clip / 255.0
49
+
50
+ height, width = clip.shape[2], clip.shape[3]
51
+ # The format of boxes is [x1, y1, x2, y2]. The input boxes are in the
52
+ # range of [0, width] for x and [0,height] for y
53
+ boxes = clip_boxes_to_image(boxes, height, width)
54
+
55
+ # Resize short side to crop_size. Non-local and STRG uses 256.
56
+ clip, boxes = short_side_scale_with_boxes(clip, size=crop_size, boxes=boxes)
57
+
58
+ # Normalize images by mean and std.
59
+ clip = normalize(clip, np.array(data_mean, dtype=np.float32), np.array(data_std, dtype=np.float32))
60
+
61
+ boxes = clip_boxes_to_image(boxes, clip.shape[2], clip.shape[3])
62
+
63
+ # Incase of slowfast, generate both pathways
64
+ if slow_fast_alpha is not None:
65
+ fast_pathway = clip
66
+ # Perform temporal sampling from the fast pathway.
67
+ slow_pathway = torch.index_select(clip, 1, torch.linspace(
68
+ 0, clip.shape[1] - 1, clip.shape[1] // slow_fast_alpha).long())
69
+ clip = [slow_pathway.unsqueeze(0).to(device), fast_pathway.unsqueeze(0).to(device)]
70
+
71
+ return clip, torch.from_numpy(boxes), ori_boxes
72
+
73
+ # get video info
74
+ def with_opencv(filename):
75
+ video = cv2.VideoCapture(filename)
76
+ frame_count = video.get(cv2.CAP_PROP_FRAME_COUNT)
77
+ fps = video.get(cv2.CAP_PROP_FPS)
78
+ s = round(frame_count / fps)
79
+ video.release()
80
+ return int(s), fps
81
+
82
+
83
+ def slow_fast_train(file_path, gpu=False):
84
+ device = 'cuda' if gpu else 'cpu'
85
+ top_k = 1
86
+
87
+ video_model = slowfast_r50_detection(True) # Another option is slow_r50_detection(True)
88
+ video_model = video_model.eval().to(device)
89
+ cfg = get_cfg()
90
+ cfg.merge_from_file(model_zoo.get_config_file("COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml"))
91
+ cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.55 # set threshold for this model
92
+ cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml")
93
+ cfg.MODEL.DEVICE = device
94
+ predictor = DefaultPredictor(cfg)
95
+ # Create an id to label name mapping
96
+ label_map, allowed_class_ids = AvaLabeledVideoFramePaths.read_label_map('ava_action_list.pbtxt')
97
+ # Create a video visualizer that can plot bounding boxes and visualize actions on bboxes.
98
+ video_visualizer = VideoVisualizer(81, label_map, top_k=top_k, mode="thres",thres=0.5) #get top3 predictions show in each bounding box
99
+
100
+ #preprocess video data
101
+ encoded_vid = pytorchvideo.data.encoded_video.EncodedVideo.from_path(file_path)
102
+
103
+ # Video predictions are generated each frame/second for the wholevideo.
104
+ total_sec, fps = with_opencv(file_path)
105
+ time_stamp_range = range(0, total_sec) # time stamps in video for which clip is sampled
106
+ clip_duration = 1.0 # Duration of clip used for each inference step.
107
+ gif_imgs = []
108
+ xleft, ytop, xright, ybottom = [], [], [], []
109
+ labels = []
110
+ time_frame = []
111
+ scores = []
112
+
113
+ for time_stamp in time_stamp_range:
114
+
115
+ # Generate clip around the designated time stamps
116
+ inp_imgs = encoded_vid.get_clip(
117
+ time_stamp - clip_duration/2.0,
118
+ time_stamp + clip_duration/2.0)
119
+ inp_imgs = inp_imgs['video']
120
+
121
+ #if time_stamp % 15 == 0:
122
+ # Generate people bbox predictions using Detectron2's off the self pre-trained predictor
123
+ # We use the the middle image in each clip to generate the bounding boxes.
124
+ inp_img = inp_imgs[:,inp_imgs.shape[1]//2,:,:]
125
+ inp_img = inp_img.permute(1,2,0)
126
+
127
+ # Predicted boxes are of the form List[(x_1, y_1, x_2, y_2)]
128
+ predicted_boxes = get_person_bboxes(inp_img, predictor)
129
+ if len(predicted_boxes) == 0:
130
+ print("Skipping clip no frames detected at time stamp: ", time_stamp)
131
+ continue
132
+
133
+ # Preprocess clip and bounding boxes for video action recognition.
134
+ inputs, inp_boxes, _ = ava_inference_transform(inp_imgs, predicted_boxes.numpy(), device=device)
135
+ # Prepend data sample id for each bounding box.
136
+ # For more details refere to the RoIAlign in Detectron2
137
+ inp_boxes = torch.cat([torch.zeros(inp_boxes.shape[0],1), inp_boxes], dim=1)
138
+
139
+ # Generate actions predictions for the bounding boxes in the clip.
140
+ # The model here takes in the pre-processed video clip and the detected bounding boxes.
141
+ preds = video_model(inputs, inp_boxes.to(device)) #change inputs to inputs.unsqueeze(0).to(device) if using slow_r50
142
+
143
+ preds = preds.to('cpu')
144
+ # The model is trained on AVA and AVA labels are 1 indexed so, prepend 0 to convert to 0 index.
145
+ preds = torch.cat([torch.zeros(preds.shape[0],1), preds], dim=1)
146
+
147
+ # Plot predictions on the video and save for later visualization.
148
+ inp_imgs = inp_imgs.permute(1,2,3,0)
149
+ inp_imgs = inp_imgs/255.0
150
+ out_img_pred = video_visualizer.draw_clip_range(inp_imgs, preds, predicted_boxes)
151
+ gif_imgs += out_img_pred
152
+
153
+ #format of bboxes(x_left, y_top, x_right, y_bottom)
154
+ predicted_boxes_lst = predicted_boxes.tolist()
155
+ topscores, topclasses = torch.topk(preds, k=1)
156
+ topscores, topclasses = topscores.tolist(), topclasses.tolist()
157
+ topclasses = np.concatenate(topclasses)
158
+ topscores = np.concatenate(topscores)
159
+
160
+ #add top 1 prediction of behaviors in each time step
161
+ for i in range(len(predicted_boxes_lst)):
162
+ xleft.append(predicted_boxes_lst[i][0])
163
+ ytop.append(predicted_boxes_lst[i][1])
164
+ xright.append(predicted_boxes_lst[i][2])
165
+ ybottom.append(predicted_boxes_lst[i][3])
166
+ labels.append(label_map.get(topclasses[i]))
167
+ time_frame.append(time_stamp)
168
+ scores.append(topscores[i])
169
+
170
+ print("Finished generating predictions.")
171
+ # Generate Metadata file
172
+ metadata = pd.DataFrame()
173
+ metadata['frame'] = time_frame
174
+ metadata['x_left'] = xleft
175
+ metadata['y_top'] = ytop
176
+ metadata['x_right'] = xright
177
+ metadata['y_bottom'] = ybottom
178
+ metadata['label'] = labels
179
+ metadata['confidence'] = scores
180
+
181
+ height, width = gif_imgs[0].shape[0], gif_imgs[0].shape[1]
182
+ video_save_path = 'activity_recognition.mp4'
183
+ video = cv2.VideoWriter(video_save_path, cv2.VideoWriter_fourcc(*'mp4v'), int(fps), (width, height))
184
+
185
+ for image in gif_imgs:
186
+ img = (255*image).astype(np.uint8)
187
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
188
+ video.write(img)
189
+ video.release()
190
+
191
+ return video_save_path, metadata
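`slow_fast_train` detects people once per second, classifies their actions with SlowFast, writes the annotated clip to activity_recognition.mp4, and returns that path along with a per-detection metadata table. A hedged usage sketch, with `classroom.mp4` as a hypothetical input:

video_out, metadata = slow_fast_train('classroom.mp4', gpu=torch.cuda.is_available())
print(video_out)  # 'activity_recognition.mp4'
# One row per detected person per sampled second: frame, box corners, label, confidence.
print(metadata[['frame', 'label', 'confidence']].head())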
video_object_extraction.py ADDED
@@ -0,0 +1,185 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Created on Tue Nov 8 16:18:28 2022
5
+
6
+ @author: ariellee
7
+ """
8
+
9
+ # import argparse
10
+ from pathlib import Path
11
+ import cv2
12
+ import numpy as np
13
+ from imutils.video import FPS
14
+ import pandas as pd
15
+ import os
16
+
17
+
18
+ # def str2bool(v):
19
+ # """
20
+ # Converts string to bool type, enables command line
21
+ # arguments in the format of '--arg1 true --arg2 false'
22
+ # """
23
+ # if isinstance(v, bool):
24
+ # return v
25
+ # if v.lower() in ('yes', 'true', 't', 'y', '1'):
26
+ # return True
27
+ # elif v.lower() in ('no', 'false', 'f', 'n', '0'):
28
+ # return False
29
+ # else:
30
+ # raise argparse.ArgumentTypeError('Boolean value expected (true/false)')
31
+
32
+
33
+ # def get_args_parser():
34
+ # parser = argparse.ArgumentParser('Wheelock evaluation script for classroom object detection',
35
+ # add_help=False)
36
+
37
+ # parser.add_argument('--output_dir', default='', type=str,
38
+ # help='path to save the feature extraction results')
39
+
40
+ # parser.add_argument('--output_name', default='video_out', type=str, help='name of csv \
41
+ # file with object features and annotated video with object tracking \
42
+ # and bounding boxes')
43
+
44
+ # parser.add_argument('--video_path', default='short',
45
+ # type=str, help='path to input video, do not include file extension')
46
+
47
+ # parser.add_argument('--is_mp4', type=str2bool, default=False,
48
+ # help='must be an mp4 file')
49
+
50
+ # parser.add_argument('--save_csv', type=str2bool, default=True,
51
+ # help='if true, a csv file of extracted features will be saved in output_dir')
52
+
53
+ # parser.add_argument('--labels', default='coco.names', type=str,
54
+ # help='labels for classes model can detect')
55
+
56
+ # parser.add_argument('--weights', default='yolov3.weights', type=str,
57
+ # help='weights for pretrained yolo model')
58
+
59
+ # parser.add_argument('--cfg', default='yolov3.cfg', type=str,
60
+ # help='model configuration parameters')
61
+ # return parser
62
+
63
+
64
+ def video_object_extraction(video_path, frames):
65
+ '''
66
+ Object detection and feature extraction with yolov3
67
+ Uses darknet repo by pjreddie
68
+
69
+ Returns: (1) csv file with extracted object features
70
+ columns: frame_number, x_start, y_start, x_end, y_end, label, confidence
71
+ (2) mp4 video with object bounding boxes and tracking
72
+
73
+ '''
74
+ # video_path = args.video_path + '.mp4'
75
+ print('Reading from video {}...'.format(video_path))
76
+ cap = cv2.VideoCapture(video_path)
77
+
78
+ # get total number of frames in the video
79
+ total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
80
+
81
+ # get height and width of video
82
+ H = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
83
+ W = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
84
+
85
+ fps = FPS().start()
86
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v')
87
+
88
+ # (cols, rows) format
89
+ # root = os.path.join(args.output_dir, args.output_name)
90
+ wp = 'object_detection.mp4'
91
+ g_fps = int(cap.get(cv2.CAP_PROP_FPS))
92
+ writer = cv2.VideoWriter(wp, fourcc, g_fps, (W, H))
93
+ # labels = open(args.labels).read().strip().split('\n')
94
+ labels = open('coco.names').read().strip().split('\n')
95
+ bbox_colors = np.random.randint(0, 255, size=(len(labels), 3), dtype='uint8')
96
+
97
+ yolo = cv2.dnn.readNetFromDarknet('yolov3.cfg', 'yolov3.weights')
98
+ out_layers = yolo.getLayerNames()
99
+ layers = [out_layers[i - 1] for i in yolo.getUnconnectedOutLayers()]
100
+ count = 0
101
+ stat_list = []
102
+
103
+ while count < total_frames:
104
+
105
+ _, frame = cap.read()
106
+
107
+ if count == 0 or count % frames == 0:
108
+ blob = cv2.dnn.blobFromImage(frame, 1 / 255.0, (416, 416), swapRB=True)
109
+ yolo.setInput(blob)
110
+
111
+ layer_outputs = yolo.forward(layers)
112
+ boxes = []
113
+ confidences = []
114
+ classes = []
115
+
116
+ # loop over layer outputs and objects detected
117
+ for output in layer_outputs:
118
+ for obj in output:
119
+
120
+ # extract class and detection likelihood of current object
121
+ scores = obj[5:]
122
+ obj_class = np.argmax(scores)
123
+ confidence = scores[obj_class]
124
+
125
+ # get rid of bad predictions
126
+ if confidence > 0.4:
127
+
128
+ # scale bbox coordinates relative to frame size
129
+ box = obj[0:4] * np.array([W, H, W, H])
130
+ centerX, centerY, width, height = box.astype('int')
131
+
132
+ # final coordiantes
133
+ x = int(centerX - (width / 2))
134
+ y = int(centerY - (height / 2))
135
+
136
+ # update list of bbox coordinates, confidences, classes
137
+ boxes.append([x, y, int(width), int(height)])
138
+ confidences.append(float(confidence))
139
+ classes.append(obj_class)
140
+
141
+ # non-max suppression for overlapping bounding boxes
142
+ idxs = cv2.dnn.NMSBoxes(boxes, confidences, 0.4, 0.4)
143
+
144
+ for i in idxs.flatten():
145
+
146
+ # extract coordinates
147
+ (x, y) = (boxes[i][0], boxes[i][1])
148
+ (w, h) = (boxes[i][2], boxes[i][3])
149
+
150
+ # set up + add bboxes to frame
151
+ color = [int(c) for c in bbox_colors[classes[i]]]
152
+ cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
153
+ text = "{}: {:.4f}".format(labels[classes[i]], confidences[i])
154
+ (text_width, text_height), _ = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)
155
+ cv2.rectangle(frame, (x, y - text_height), (x + text_width, y), color, cv2.FILLED)
156
+ cv2.putText(frame, text, (x, y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (38, 38, 38), 2)
157
+
158
+ # format of each csv file is: frame number / x / y / w / h / label / confidence
159
+ stat_list.append([count, x, y, w, h, labels[classes[i]], confidences[i]])
160
+
161
+ writer.write(frame)
162
+ fps.update()
163
+ count += 1
164
+
165
+ df = pd.DataFrame(stat_list, columns=['frame', 'x_left', 'y_top', 'x_right',
166
+ 'y_bottom', 'label', 'confidence'])
167
+ fps.stop()
168
+ print('Time elapsed (seconds): {:.2f}'.format(fps.elapsed()))
169
+ writer.release()
170
+ cap.release()
171
+
172
+ return wp, df
173
+
174
+
175
+ # if __name__ == '__main__':
176
+
177
+ # parser = argparse.ArgumentParser('Wheelock evaluation script for classroom object detection', parents=[get_args_parser()])
178
+ # args = parser.parse_args()
179
+
180
+ # if not args.is_mp4:
181
+ # print('Video must be an mp4 file.')
182
+ # else:
183
+ # if args.output_dir:
184
+ # Path(args.output_dir).mkdir(parents=True, exist_ok=True)
185
+ # main(args)
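The commented-out argparse entry point is superseded by direct calls from app.py; a hedged usage sketch with a hypothetical `classroom.mp4`, where `frames` is the stride between YOLO detection passes:

annotated_path, detections = video_object_extraction('classroom.mp4', frames=15)
print(annotated_path)  # 'object_detection.mp4'
print(detections[['frame', 'label', 'confidence']].head())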
visualization.py ADDED
@@ -0,0 +1,706 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ # Note: This file has been borrowed from the facebookresearch/slowfast repo. It is used to add the bounding boxes and predictions to the frame.
3
+ # TODO: Migrate this into the core PyTorchVideo library.
4
+ from __future__ import annotations
5
+
6
+ import itertools
7
+ # import logging
8
+ from types import SimpleNamespace
9
+ from typing import Dict, List, Optional, Tuple, Union
10
+
11
+ import matplotlib.pyplot as plt
12
+ import numpy as np
13
+ import torch
14
+ from detectron2.utils.visualizer import Visualizer
15
+
16
+
17
+ # logger = logging.getLogger(__name__)
18
+
19
+
20
+ def _create_text_labels(
21
+ classes: List[int],
22
+ scores: List[float],
23
+ class_names: List[str],
24
+ ground_truth: bool = False,
25
+ ) -> List[str]:
26
+ """
27
+ Create text labels.
28
+ Args:
29
+ classes (list[int]): a list of class ids for each example.
30
+ scores (list[float] or None): list of scores for each example.
31
+ class_names (list[str]): a list of class names, ordered by their ids.
32
+ ground_truth (bool): whether the labels are ground truth.
33
+ Returns:
34
+ labels (list[str]): formatted text labels.
35
+ """
36
+ try:
37
+ labels = [class_names.get(c, "n/a") for c in classes]
38
+ except IndexError:
39
+ # logger.error("Class indices get out of range: {}".format(classes))
40
+ return None
41
+
42
+ if ground_truth:
43
+ labels = ["[{}] {}".format("GT", label) for label in labels]
44
+ elif scores is not None:
45
+ assert len(classes) == len(scores)
46
+ labels = ["[{:.2f}] {}".format(s, label) for s, label in zip(scores, labels)]
47
+ return labels
48
+
49
+
50
+ class ImgVisualizer(Visualizer):
51
+ def __init__(
52
+ self, img_rgb: torch.Tensor, meta: Optional[SimpleNamespace] = None, **kwargs
53
+ ) -> None:
54
+ """
55
+ See https://github.com/facebookresearch/detectron2/blob/main/detectron2/utils/visualizer.py
56
+ for more details.
57
+ Args:
58
+ img_rgb: a tensor or numpy array of shape (H, W, C), where H and W correspond to
59
+ the height and width of the image respectively. C is the number of
60
+ color channels. The image is required to be in RGB format since that
61
+ is a requirement of the Matplotlib library. The image is also expected
62
+ to be in the range [0, 255].
63
+ meta (MetadataCatalog): image metadata.
64
+ See https://github.com/facebookresearch/detectron2/blob/81d5a87763bfc71a492b5be89b74179bd7492f6b/detectron2/data/catalog.py#L90
65
+ """
66
+ super(ImgVisualizer, self).__init__(img_rgb, meta, **kwargs)
67
+
68
+ def draw_text(
69
+ self,
70
+ text: str,
71
+ position: List[int],
72
+ *,
73
+ font_size: Optional[int] = None,
74
+ color: str = "w",
75
+ horizontal_alignment: str = "center",
76
+ vertical_alignment: str = "bottom",
77
+ box_facecolor: str = "black",
78
+ alpha: float = 0.5,
79
+ ) -> None:
80
+ """
81
+ Draw text at the specified position.
82
+ Args:
83
+ text (str): the text to draw on image.
84
+ position (list of 2 ints): the x,y coordinate to place the text.
85
+ font_size (Optional[int]): font of the text. If not provided, a font size
86
+ proportional to the image width is calculated and used.
87
+ color (str): color of the text. Refer to `matplotlib.colors` for full list
88
+ of formats that are accepted.
89
+ horizontal_alignment (str): see `matplotlib.text.Text`.
90
+ vertical_alignment (str): see `matplotlib.text.Text`.
91
+ box_facecolor (str): color of the box wrapped around the text. Refer to
92
+ `matplotlib.colors` for full list of formats that are accepted.
93
+ alpha (float): transparency level of the box.
94
+ """
95
+ if not font_size:
96
+ font_size = self._default_font_size
97
+ x, y = position
98
+ self.output.ax.text(
99
+ x,
100
+ y,
101
+ text,
102
+ size=font_size * self.output.scale,
103
+ family="monospace",
104
+ bbox={
105
+ "facecolor": box_facecolor,
106
+ "alpha": alpha,
107
+ "pad": 0.7,
108
+ "edgecolor": "none",
109
+ },
110
+ verticalalignment=vertical_alignment,
111
+ horizontalalignment=horizontal_alignment,
112
+ color=color,
113
+ zorder=10,
114
+ )
115
+
116
+ def draw_multiple_text(
117
+ self,
118
+ text_ls: List[str],
119
+ box_coordinate: torch.Tensor,
120
+ *,
121
+ top_corner: bool = True,
122
+ font_size: Optional[int] = None,
123
+ color: str = "w",
124
+ box_facecolors: str = "black",
125
+ alpha: float = 0.5,
126
+ ) -> None:
127
+ """
128
+ Draw a list of text labels for some bounding box on the image.
129
+ Args:
130
+ text_ls (list of strings): a list of text labels.
131
+ box_coordinate (tensor): shape (4,). The (x_left, y_top, x_right, y_bottom)
132
+ coordinates of the box.
133
+ top_corner (bool): If True, draw the text labels at (x_left, y_top) of the box.
134
+ Else, draw labels at (x_left, y_bottom).
135
+ font_size (Optional[int]): font of the text. If not provided, a font size
136
+ proportional to the image width is calculated and used.
137
+ color (str): color of the text. Refer to `matplotlib.colors` for full list
138
+ of formats that are accepted.
139
+ box_facecolors (str): colors of the box wrapped around the text. Refer to
140
+ `matplotlib.colors` for full list of formats that are accepted.
141
+ alpha (float): transparency level of the box.
142
+ """
143
+ if not isinstance(box_facecolors, list):
144
+ box_facecolors = [box_facecolors] * len(text_ls)
145
+ assert len(box_facecolors) == len(
146
+ text_ls
147
+ ), "Number of colors provided is not equal to the number of text labels."
148
+ if not font_size:
149
+ font_size = self._default_font_size
150
+ text_box_width = font_size + font_size // 2
151
+ # If the texts does not fit in the assigned location,
152
+ # we split the text and draw it in another place.
153
+ if top_corner:
154
+ num_text_split = self._align_y_top(
155
+ box_coordinate, len(text_ls), text_box_width
156
+ )
157
+ y_corner = 1
158
+ else:
159
+ num_text_split = len(text_ls) - self._align_y_bottom(
160
+ box_coordinate, len(text_ls), text_box_width
161
+ )
162
+ y_corner = 3
163
+
164
+ text_color_sorted = sorted(
165
+ zip(text_ls, box_facecolors), key=lambda x: x[0], reverse=True
166
+ )
167
+ if len(text_color_sorted) != 0:
168
+ text_ls, box_facecolors = zip(*text_color_sorted)
169
+ else:
170
+ text_ls, box_facecolors = [], []
171
+ text_ls, box_facecolors = list(text_ls), list(box_facecolors)
172
+ self.draw_multiple_text_upward(
173
+ text_ls[:num_text_split][::-1],
174
+ box_coordinate,
175
+ y_corner=y_corner,
176
+ font_size=font_size,
177
+ color=color,
178
+ box_facecolors=box_facecolors[:num_text_split][::-1],
179
+ alpha=alpha,
180
+ )
181
+ self.draw_multiple_text_downward(
182
+ text_ls[num_text_split:],
183
+ box_coordinate,
184
+ y_corner=y_corner,
185
+ font_size=font_size,
186
+ color=color,
187
+ box_facecolors=box_facecolors[num_text_split:],
188
+ alpha=alpha,
189
+ )
190
+
191
+ def draw_multiple_text_upward(
192
+ self,
193
+ text_ls: List[str],
194
+ box_coordinate: torch.Tensor,
195
+ *,
196
+ y_corner: int = 1,
197
+ font_size: Optional[int] = None,
198
+ color: str = "w",
199
+ box_facecolors: str = "black",
200
+ alpha: float = 0.5,
201
+ ) -> None:
202
+ """
203
+ Draw a list of text labels for some bounding box on the image in upward direction.
204
+ The next text label will be on top of the previous one.
205
+ Args:
206
+ text_ls (list of strings): a list of text labels.
207
+ box_coordinate (tensor): shape (4,). The (x_left, y_top, x_right, y_bottom)
208
+ coordinates of the box.
209
+ y_corner (int): Value of either 1 or 3. Indicate the index of the y-coordinate of
210
+ the box to draw labels around.
211
+ font_size (Optional[int]): font of the text. If not provided, a font size
212
+ proportional to the image width is calculated and used.
213
+ color (str): color of the text. Refer to `matplotlib.colors` for full list
214
+ of formats that are accepted.
215
+ box_facecolors (str or list of strs): colors of the box wrapped around the
216
+ text. Refer to `matplotlib.colors` for full list of formats that
217
+ are accepted.
218
+ alpha (float): transparency level of the box.
219
+ """
220
+ if not isinstance(box_facecolors, list):
221
+ box_facecolors = [box_facecolors] * len(text_ls)
222
+ assert len(box_facecolors) == len(
223
+ text_ls
224
+ ), "Number of colors provided is not equal to the number of text labels."
225
+
226
+ assert y_corner in [1, 3], "Y_corner must be either 1 or 3"
227
+ if not font_size:
228
+ font_size = self._default_font_size
229
+
230
+ x, horizontal_alignment = self._align_x_coordinate(box_coordinate)
231
+ y = box_coordinate[y_corner].item()
232
+ for i, text in enumerate(text_ls):
233
+ self.draw_text(
234
+ text,
235
+ (x, y),
236
+ font_size=font_size,
237
+ color=color,
238
+ horizontal_alignment=horizontal_alignment,
239
+ vertical_alignment="bottom",
240
+ box_facecolor=box_facecolors[i],
241
+ alpha=alpha,
242
+ )
243
+ y -= font_size + font_size // 2
244
+
245
+ def draw_multiple_text_downward(
246
+ self,
247
+ text_ls: List[str],
248
+ box_coordinate: torch.Tensor,
249
+ *,
250
+ y_corner: int = 1,
251
+ font_size: Optional[int] = None,
252
+ color: str = "w",
253
+ box_facecolors: str = "black",
254
+ alpha: float = 0.5,
255
+ ) -> None:
256
+ """
257
+ Draw a list of text labels for some bounding box on the image in downward direction.
258
+ The next text label will be below the previous one.
259
+ Args:
260
+ text_ls (list of strings): a list of text labels.
261
+ box_coordinate (tensor): shape (4,). The (x_left, y_top, x_right, y_bottom)
262
+ coordinates of the box.
263
+ y_corner (int): Value of either 1 or 3. Indicate the index of the y-coordinate of
264
+ the box to draw labels around.
265
+ font_size (Optional[int]): font of the text. If not provided, a font size
266
+ proportional to the image width is calculated and used.
267
+ color (str): color of the text. Refer to `matplotlib.colors` for full list
268
+ of formats that are accepted.
269
+ box_facecolors (str): colors of the box wrapped around the text. Refer to
270
+ `matplotlib.colors` for full list of formats that are accepted.
271
+ alpha (float): transparency level of the box.
272
+ """
273
+ if not isinstance(box_facecolors, list):
274
+ box_facecolors = [box_facecolors] * len(text_ls)
275
+ assert len(box_facecolors) == len(
276
+ text_ls
277
+ ), "Number of colors provided is not equal to the number of text labels."
278
+
279
+ assert y_corner in [1, 3], "Y_corner must be either 1 or 3"
280
+ if not font_size:
281
+ font_size = self._default_font_size
282
+
283
+ x, horizontal_alignment = self._align_x_coordinate(box_coordinate)
284
+ y = box_coordinate[y_corner].item()
285
+ for i, text in enumerate(text_ls):
286
+ self.draw_text(
287
+ text,
288
+ (x, y),
289
+ font_size=font_size,
290
+ color=color,
291
+ horizontal_alignment=horizontal_alignment,
292
+ vertical_alignment="top",
293
+ box_facecolor=box_facecolors[i],
294
+ alpha=alpha,
295
+ )
296
+ y += font_size + font_size // 2
297
+
298
+ def _align_x_coordinate(self, box_coordinate: torch.Tensor) -> Tuple[float, str]:
299
+ """
300
+ Choose an x-coordinate from the box to make sure the text label
301
+ does not go out of frames. By default, the left x-coordinate is
302
+ chosen and text is aligned left. If the box is too close to the
303
+ right side of the image, then the right x-coordinate is chosen
304
+ instead and the text is aligned right.
305
+ Args:
306
+ box_coordinate (array-like): shape (4,). The (x_left, y_top, x_right, y_bottom)
307
+ coordinates of the box.
308
+ Returns:
309
+ x_coordinate (float): the chosen x-coordinate.
310
+ alignment (str): whether to align left or right.
311
+ """
312
+ # If the x-coordinate is greater than 5/6 of the image width,
313
+ # then we align text to the right of the box. This is
314
+ # chosen by heuristics.
315
+ if box_coordinate[0] > (self.output.width * 5) // 6:
316
+ return box_coordinate[2], "right"
317
+
318
+ return box_coordinate[0], "left"
319
+
320
+ def _align_y_top(
321
+ self, box_coordinate: torch.Tensor, num_text: int, textbox_width: float
322
+ ) -> int:
323
+ """
324
+ Calculate the number of text labels to plot on top of the box
325
+ without going out of frames.
326
+ Args:
327
+ box_coordinate (array-like): shape (4,). The (x_left, y_top, x_right, y_bottom)
328
+ coordinates of the box.
329
+ num_text (int): the number of text labels to plot.
330
+ textbox_width (float): the width of the box wrapped around text label.
331
+ """
332
+ dist_to_top = box_coordinate[1]
333
+ num_text_top = dist_to_top // textbox_width
334
+
335
+ if isinstance(num_text_top, torch.Tensor):
336
+ num_text_top = int(num_text_top.item())
337
+
338
+ return min(num_text, num_text_top)
339
+
340
+ def _align_y_bottom(
341
+ self, box_coordinate: torch.Tensor, num_text: int, textbox_width: float
342
+ ) -> int:
343
+ """
344
+ Calculate the number of text labels to plot at the bottom of the box
345
+ without going out of frames.
346
+ Args:
347
+ box_coordinate (array-like): shape (4,). The (x_left, y_top, x_right, y_bottom)
348
+ coordinates of the box.
349
+ num_text (int): the number of text labels to plot.
350
+ textbox_width (float): the width of the box wrapped around text label.
351
+ """
352
+ dist_to_bottom = self.output.height - box_coordinate[3]
353
+ num_text_bottom = dist_to_bottom // textbox_width
354
+
355
+ if isinstance(num_text_bottom, torch.Tensor):
356
+ num_text_bottom = int(num_text_bottom.item())
357
+
358
+ return min(num_text, num_text_bottom)
359
+
360
+
361
+ class VideoVisualizer:
362
+ def __init__(
363
+ self,
364
+ num_classes: int,
365
+ class_names: Dict,
366
+ top_k: int = 1,
367
+ colormap: str = "rainbow",
368
+ thres: float = 0.7,
369
+ lower_thres: float = 0.3,
370
+ common_class_names: Optional[List[str]] = None,
371
+ mode: str = "top-k",
372
+ ) -> None:
373
+ """
374
+ Args:
375
+ num_classes (int): total number of classes.
376
+ class_names (dict): Dict mapping classID to name.
377
+ top_k (int): number of top predicted classes to plot.
378
+ colormap (str): the colormap to choose color for class labels from.
379
+ See https://matplotlib.org/tutorials/colors/colormaps.html
380
+ thres (float): threshold for picking predicted classes to visualize.
381
+ lower_thres (Optional[float]): If `common_class_names` if given,
382
+ this `lower_thres` will be applied to uncommon classes and
383
+ `thres` will be applied to classes in `common_class_names`.
384
+ common_class_names (Optional[list of str]): list of common class names
385
+ to apply `thres`. Class names not included in `common_class_names` will
386
+ have `lower_thres` as a threshold. If None, all classes will have
387
+ `thres` as a threshold. This is helpful for model trained on
388
+ highly imbalanced dataset.
389
+ mode (str): Supported modes are {"top-k", "thres"}.
390
+ This is used for choosing predictions for visualization.
391
+ """
392
+ assert mode in ["top-k", "thres"], "Mode {} is not supported.".format(mode)
393
+ self.mode = mode
394
+ self.num_classes = num_classes
395
+ self.class_names = class_names
396
+ self.top_k = top_k
397
+ self.thres = thres
398
+ self.lower_thres = lower_thres
399
+
400
+ if mode == "thres":
401
+ self._get_thres_array(common_class_names=common_class_names)
402
+
403
+ self.color_map = plt.get_cmap(colormap)
404
+
405
+ def _get_color(self, class_id: int) -> List[float]:
406
+ """
407
+ Get color for a class id.
408
+ Args:
409
+ class_id (int): class id.
410
+ """
411
+ return self.color_map(class_id / self.num_classes)[:3]
412
+
413
+ def draw_one_frame(
414
+ self,
415
+ frame: Union[torch.Tensor, np.ndarray],
416
+ preds: Union[torch.Tensor, List[float]],
417
+ bboxes: Optional[torch.Tensor] = None,
418
+ alpha: float = 0.5,
419
+ text_alpha: float = 0.7,
420
+ ground_truth: bool = False,
421
+ ) -> np.ndarray:
422
+ """
423
+ Draw labels and bounding boxes for one image. By default, predicted
424
+ labels are drawn in the top left corner of the image or corresponding
425
+ bounding boxes. For ground truth labels (setting True for ground_truth flag),
426
+ labels will be drawn in the bottom left corner.
427
+ Args:
428
+ frame (array-like): a tensor or numpy array of shape (H, W, C),
429
+ where H and W correspond to
430
+ the height and width of the image respectively. C is the number of
431
+ color channels. The image is required to be in RGB format since that
432
+ is a requirement of the Matplotlib library. The image is also expected
433
+ to be in the range [0, 255].
434
+ preds (tensor or list): If ground_truth is False, provide a float tensor of
435
+ shape (num_boxes, num_classes) that contains all of the confidence
436
+ scores of the model. For recognition task, input shape can be (num_classes,).
437
+ To plot true label (ground_truth is True), preds is a list contains int32
438
+ of the shape (num_boxes, true_class_ids) or (true_class_ids,).
439
+ bboxes (Optional[tensor]): shape (num_boxes, 4) that contains the coordinates
440
+ of the bounding boxes.
441
+ alpha (Optional[float]): transparency level of the bounding boxes.
442
+ text_alpha (Optional[float]): transparency level of the box wrapped around
443
+ text labels.
444
+ ground_truth (bool): whether the provided bounding boxes are ground-truth.
445
+ Returns:
446
+ An image with bounding box annotations and corresponding bbox
447
+ labels plotted on it.
448
+ """
449
+ if isinstance(preds, torch.Tensor):
450
+ if preds.ndim == 1:
451
+ preds = preds.unsqueeze(0)
452
+ n_instances = preds.shape[0]
453
+ elif isinstance(preds, list):
454
+ n_instances = len(preds)
455
+ else:
456
+ # logger.error("Unsupported type of prediction input.")
457
+ return
458
+
459
+ if ground_truth:
460
+ top_scores, top_classes = [None] * n_instances, preds
461
+
462
+ elif self.mode == "top-k":
463
+ top_scores, top_classes = torch.topk(preds, k=self.top_k)
464
+ top_scores, top_classes = top_scores.tolist(), top_classes.tolist()
465
+ elif self.mode == "thres":
466
+ top_scores, top_classes = [], []
467
+ for pred in preds:
468
+ mask = pred >= self.thres
469
+ top_scores.append(pred[mask].tolist())
470
+ top_class = torch.squeeze(torch.nonzero(mask), dim=-1).tolist()
471
+ top_classes.append(top_class)
472
+
473
+ # Create labels top k predicted classes with their scores.
474
+ text_labels = []
475
+ for i in range(n_instances):
476
+ text_labels.append(
477
+ _create_text_labels(
478
+ top_classes[i],
479
+ top_scores[i],
480
+ self.class_names,
481
+ ground_truth=ground_truth,
482
+ )
483
+ )
484
+ frame_visualizer = ImgVisualizer(frame, meta=None)
485
+ font_size = min(max(np.sqrt(frame.shape[0] * frame.shape[1]) // 25, 5), 9)
486
+ top_corner = not ground_truth
487
+ if bboxes is not None:
488
+ assert len(preds) == len(
489
+ bboxes
490
+ ), "Encounter {} predictions and {} bounding boxes".format(
491
+ len(preds), len(bboxes)
492
+ )
493
+ for i, box in enumerate(bboxes):
494
+ text = text_labels[i]
495
+ pred_class = top_classes[i]
496
+ colors = [self._get_color(pred) for pred in pred_class]
497
+
498
+ box_color = "r" if ground_truth else "g"
499
+ line_style = "--" if ground_truth else "-."
500
+ frame_visualizer.draw_box(
501
+ box,
502
+ alpha=alpha,
503
+ edge_color=box_color,
504
+ line_style=line_style,
505
+ )
506
+ frame_visualizer.draw_multiple_text(
507
+ text,
508
+ box,
509
+ top_corner=top_corner,
510
+ font_size=font_size,
511
+ box_facecolors=colors,
512
+ alpha=text_alpha,
513
+ )
514
+ else:
515
+ text = text_labels[0]
516
+ pred_class = top_classes[0]
517
+ colors = [self._get_color(pred) for pred in pred_class]
518
+ frame_visualizer.draw_multiple_text(
519
+ text,
520
+ torch.Tensor([0, 5, frame.shape[1], frame.shape[0] - 5]),
521
+ top_corner=top_corner,
522
+ font_size=font_size,
523
+ box_facecolors=colors,
524
+ alpha=text_alpha,
525
+ )
526
+
527
+ return frame_visualizer.output.get_image()
528
+
529
+ def draw_clip_range(
530
+ self,
531
+ frames: Union[torch.Tensor, np.ndarray],
532
+ preds: Union[torch.Tensor, List[float]],
533
+ bboxes: Optional[torch.Tensor] = None,
534
+ text_alpha: float = 0.5,
535
+ ground_truth: bool = False,
536
+ keyframe_idx: Optional[int] = None,
537
+ draw_range: Optional[List[int]] = None,
538
+ repeat_frame: int = 1,
539
+ ) -> List[np.ndarray]:
540
+ """
541
+ Draw predicted labels or ground truth classes to clip.
542
+ Draw bounding boxes to clip if bboxes is provided. Boxes will gradually
543
+ fade in and out the clip, centered around the clip's central frame,
544
+ within the provided `draw_range`.
545
+ Args:
546
+ frames (array-like): video data in the shape (T, H, W, C).
547
+ preds (tensor): a tensor of shape (num_boxes, num_classes) that
548
+ contains all of the confidence scores of the model. For recognition
549
+ task or for ground_truth labels, input shape can be (num_classes,).
550
+ bboxes (Optional[tensor]): shape (num_boxes, 4) that contains the coordinates
551
+ of the bounding boxes.
552
+ text_alpha (float): transparency label of the box wrapped around text labels.
553
+ ground_truth (bool): whether the provided bounding boxes are ground-truth.
554
+ keyframe_idx (int): the index of keyframe in the clip.
555
+ draw_range (Optional[list[ints]): only draw frames in range
556
+ [start_idx, end_idx] inclusively in the clip. If None, draw on
557
+ the entire clip.
558
+ repeat_frame (int): repeat each frame in draw_range for `repeat_frame`
559
+ time for slow-motion effect.
560
+ Returns:
561
+ A list of frames with bounding box annotations and corresponding
562
+ bbox labels plotted on them.
563
+ """
564
+ if draw_range is None:
565
+ draw_range = [0, len(frames) - 1]
566
+ if draw_range is not None:
567
+ draw_range[0] = max(0, draw_range[0])
568
+ left_frames = frames[: draw_range[0]]
569
+ right_frames = frames[draw_range[1] + 1 :]
570
+
571
+ draw_frames = frames[draw_range[0] : draw_range[1] + 1]
572
+ if keyframe_idx is None:
573
+ keyframe_idx = len(frames) // 2
574
+
575
+ img_ls = (
576
+ list(left_frames)
577
+ + self.draw_clip(
578
+ draw_frames,
579
+ preds,
580
+ bboxes=bboxes,
581
+ text_alpha=text_alpha,
582
+ ground_truth=ground_truth,
583
+ keyframe_idx=keyframe_idx - draw_range[0],
584
+ repeat_frame=repeat_frame,
585
+ )
586
+ + list(right_frames)
587
+ )
588
+
589
+ return img_ls
590
+
591
+ def draw_clip(
592
+ self,
593
+ frames: Union[torch.Tensor, np.ndarray],
594
+ preds: Union[torch.Tensor, List[float]],
595
+ bboxes: Optional[torch.Tensor] = None,
596
+ text_alpha: float = 0.5,
597
+ ground_truth: bool = False,
598
+ keyframe_idx: Optional[int] = None,
599
+ repeat_frame: int = 1,
600
+ ) -> List[np.ndarray]:
601
+ """
602
+ Draw predicted labels or ground truth classes to clip. Draw bounding boxes to clip
603
+ if bboxes is provided. Boxes will gradually fade in and out the clip, centered
604
+ around the clip's central frame.
605
+ Args:
606
+ frames (array-like): video data in the shape (T, H, W, C).
607
+ preds (tensor): a tensor of shape (num_boxes, num_classes) that contains
608
+ all of the confidence scores of the model. For recognition task or for
609
+ ground_truth labels, input shape can be (num_classes,).
610
+ bboxes (Optional[tensor]): shape (num_boxes, 4) that contains the coordinates
611
+ of the bounding boxes.
612
+ text_alpha (float): transparency label of the box wrapped around text labels.
613
+ ground_truth (bool): whether the provided bounding boxes are ground-truth.
614
+ keyframe_idx (int): the index of keyframe in the clip.
615
+ repeat_frame (int): repeat each frame in draw_range for `repeat_frame`
616
+ time for slow-motion effect.
617
+ Returns:
618
+ A list of frames with bounding box annotations and corresponding
619
+ bbox labels plotted on them.
620
+ """
621
+ assert repeat_frame >= 1, "`repeat_frame` must be a positive integer."
622
+
623
+ repeated_seq = range(0, len(frames))
624
+ repeated_seq = list(
625
+ itertools.chain.from_iterable(
626
+ itertools.repeat(x, repeat_frame) for x in repeated_seq
627
+ )
628
+ )
629
+
630
+ frames, adjusted = self._adjust_frames_type(frames)
631
+ if keyframe_idx is None:
632
+ half_left = len(repeated_seq) // 2
633
+ half_right = (len(repeated_seq) + 1) // 2
634
+ else:
635
+ mid = int((keyframe_idx / len(frames)) * len(repeated_seq))
636
+ half_left = mid
637
+ half_right = len(repeated_seq) - mid
638
+
639
+ alpha_ls = np.concatenate(
640
+ [
641
+ np.linspace(0, 1, num=half_left),
642
+ np.linspace(1, 0, num=half_right),
643
+ ]
644
+ )
645
+ text_alpha = text_alpha
646
+ frames = frames[repeated_seq]
647
+ img_ls = []
648
+ for alpha, frame in zip(alpha_ls, frames):
649
+ draw_img = self.draw_one_frame(
650
+ frame,
651
+ preds,
652
+ bboxes,
653
+ alpha=alpha,
654
+ text_alpha=text_alpha,
655
+ ground_truth=ground_truth,
656
+ )
657
+ if adjusted:
658
+ draw_img = draw_img.astype("float32") / 255
659
+
660
+ img_ls.append(draw_img)
661
+
662
+ return img_ls
663
+
664
+ def _adjust_frames_type(
665
+ self, frames: torch.Tensor
666
+ ) -> Tuple[List[np.ndarray], bool]:
667
+ """
668
+ Modify video data to have dtype of uint8 and values range in [0, 255].
669
+ Args:
670
+ frames (array-like): 4D array of shape (T, H, W, C).
671
+ Returns:
672
+ frames (list of frames): list of frames in range [0, 1].
673
+ adjusted (bool): whether the original frames needed to be adjusted.
674
+ """
675
+ assert (
676
+ frames is not None and len(frames) != 0
677
+ ), "Frames do not contain any values"
678
+ frames = np.array(frames)
679
+ assert np.array(frames).ndim == 4, "Frames must have 4 dimensions"
680
+ adjusted = False
681
+ if frames.dtype in [np.float32, np.float64]:
682
+ frames *= 255
683
+ frames = frames.astype(np.uint8)
684
+ adjusted = True
685
+
686
+ return frames, adjusted
687
+
688
+ def _get_thres_array(self, common_class_names: Optional[List[str]] = None) -> None:
689
+ """
690
+ Compute a thresholds array for all classes based on `self.thres` and `self.lower_thres`.
691
+ Args:
692
+ common_class_names (Optional[list of str]): a list of common class names.
693
+ """
694
+ common_class_ids = []
695
+ if common_class_names is not None:
696
+ common_classes = set(common_class_names)
697
+
698
+ for key, name in self.class_names.items():
699
+ if name in common_classes:
700
+ common_class_ids.append(key)
701
+ else:
702
+ common_class_ids = list(range(self.num_classes))
703
+
704
+ thres_array = np.full(shape=(self.num_classes,), fill_value=self.lower_thres)
705
+ thres_array[common_class_ids] = self.thres
706
+ self.thres = torch.from_numpy(thres_array)
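
The fade-in/fade-out effect in `draw_clip` above is driven entirely by the alpha ramp built around the keyframe: each frame index is repeated `repeat_frame` times, and box opacity rises linearly from 0 to 1 up to the keyframe position, then falls back to 0. The standalone sketch below reproduces just that schedule so the behaviour can be checked in isolation; the clip length, keyframe index, and repeat factor are illustrative values only, not taken from this repository.

import itertools
import numpy as np

def fade_alpha_schedule(num_frames: int, keyframe_idx: int, repeat_frame: int = 1) -> np.ndarray:
    """Per-frame box opacity, mirroring the keyframe branch of draw_clip."""
    # Repeat each frame index `repeat_frame` times (slow-motion effect).
    repeated_seq = list(
        itertools.chain.from_iterable(
            itertools.repeat(i, repeat_frame) for i in range(num_frames)
        )
    )
    # Split the (repeated) clip at the keyframe position.
    mid = int((keyframe_idx / num_frames) * len(repeated_seq))
    half_left, half_right = mid, len(repeated_seq) - mid
    # Opacity ramps up to the keyframe, then back down to zero.
    return np.concatenate(
        [np.linspace(0, 1, num=half_left), np.linspace(1, 0, num=half_right)]
    )

# Illustrative values only: a 32-frame clip, keyframe in the middle, 2x slow motion.
alphas = fade_alpha_schedule(num_frames=32, keyframe_idx=16, repeat_frame=2)
print(alphas.shape)  # (64,)
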
yolov3.cfg ADDED
@@ -0,0 +1,789 @@
1
+ [net]
2
+ # Testing
3
+ # batch=1
4
+ # subdivisions=1
5
+ # Training
6
+ batch=64
7
+ subdivisions=16
8
+ width=608
9
+ height=608
10
+ channels=3
11
+ momentum=0.9
12
+ decay=0.0005
13
+ angle=0
14
+ saturation = 1.5
15
+ exposure = 1.5
16
+ hue=.1
17
+
18
+ learning_rate=0.001
19
+ burn_in=1000
20
+ max_batches = 500200
21
+ policy=steps
22
+ steps=400000,450000
23
+ scales=.1,.1
24
+
25
+ [convolutional]
26
+ batch_normalize=1
27
+ filters=32
28
+ size=3
29
+ stride=1
30
+ pad=1
31
+ activation=leaky
32
+
33
+ # Downsample
34
+
35
+ [convolutional]
36
+ batch_normalize=1
37
+ filters=64
38
+ size=3
39
+ stride=2
40
+ pad=1
41
+ activation=leaky
42
+
43
+ [convolutional]
44
+ batch_normalize=1
45
+ filters=32
46
+ size=1
47
+ stride=1
48
+ pad=1
49
+ activation=leaky
50
+
51
+ [convolutional]
52
+ batch_normalize=1
53
+ filters=64
54
+ size=3
55
+ stride=1
56
+ pad=1
57
+ activation=leaky
58
+
59
+ [shortcut]
60
+ from=-3
61
+ activation=linear
62
+
63
+ # Downsample
64
+
65
+ [convolutional]
66
+ batch_normalize=1
67
+ filters=128
68
+ size=3
69
+ stride=2
70
+ pad=1
71
+ activation=leaky
72
+
73
+ [convolutional]
74
+ batch_normalize=1
75
+ filters=64
76
+ size=1
77
+ stride=1
78
+ pad=1
79
+ activation=leaky
80
+
81
+ [convolutional]
82
+ batch_normalize=1
83
+ filters=128
84
+ size=3
85
+ stride=1
86
+ pad=1
87
+ activation=leaky
88
+
89
+ [shortcut]
90
+ from=-3
91
+ activation=linear
92
+
93
+ [convolutional]
94
+ batch_normalize=1
95
+ filters=64
96
+ size=1
97
+ stride=1
98
+ pad=1
99
+ activation=leaky
100
+
101
+ [convolutional]
102
+ batch_normalize=1
103
+ filters=128
104
+ size=3
105
+ stride=1
106
+ pad=1
107
+ activation=leaky
108
+
109
+ [shortcut]
110
+ from=-3
111
+ activation=linear
112
+
113
+ # Downsample
114
+
115
+ [convolutional]
116
+ batch_normalize=1
117
+ filters=256
118
+ size=3
119
+ stride=2
120
+ pad=1
121
+ activation=leaky
122
+
123
+ [convolutional]
124
+ batch_normalize=1
125
+ filters=128
126
+ size=1
127
+ stride=1
128
+ pad=1
129
+ activation=leaky
130
+
131
+ [convolutional]
132
+ batch_normalize=1
133
+ filters=256
134
+ size=3
135
+ stride=1
136
+ pad=1
137
+ activation=leaky
138
+
139
+ [shortcut]
140
+ from=-3
141
+ activation=linear
142
+
143
+ [convolutional]
144
+ batch_normalize=1
145
+ filters=128
146
+ size=1
147
+ stride=1
148
+ pad=1
149
+ activation=leaky
150
+
151
+ [convolutional]
152
+ batch_normalize=1
153
+ filters=256
154
+ size=3
155
+ stride=1
156
+ pad=1
157
+ activation=leaky
158
+
159
+ [shortcut]
160
+ from=-3
161
+ activation=linear
162
+
163
+ [convolutional]
164
+ batch_normalize=1
165
+ filters=128
166
+ size=1
167
+ stride=1
168
+ pad=1
169
+ activation=leaky
170
+
171
+ [convolutional]
172
+ batch_normalize=1
173
+ filters=256
174
+ size=3
175
+ stride=1
176
+ pad=1
177
+ activation=leaky
178
+
179
+ [shortcut]
180
+ from=-3
181
+ activation=linear
182
+
183
+ [convolutional]
184
+ batch_normalize=1
185
+ filters=128
186
+ size=1
187
+ stride=1
188
+ pad=1
189
+ activation=leaky
190
+
191
+ [convolutional]
192
+ batch_normalize=1
193
+ filters=256
194
+ size=3
195
+ stride=1
196
+ pad=1
197
+ activation=leaky
198
+
199
+ [shortcut]
200
+ from=-3
201
+ activation=linear
202
+
203
+
204
+ [convolutional]
205
+ batch_normalize=1
206
+ filters=128
207
+ size=1
208
+ stride=1
209
+ pad=1
210
+ activation=leaky
211
+
212
+ [convolutional]
213
+ batch_normalize=1
214
+ filters=256
215
+ size=3
216
+ stride=1
217
+ pad=1
218
+ activation=leaky
219
+
220
+ [shortcut]
221
+ from=-3
222
+ activation=linear
223
+
224
+ [convolutional]
225
+ batch_normalize=1
226
+ filters=128
227
+ size=1
228
+ stride=1
229
+ pad=1
230
+ activation=leaky
231
+
232
+ [convolutional]
233
+ batch_normalize=1
234
+ filters=256
235
+ size=3
236
+ stride=1
237
+ pad=1
238
+ activation=leaky
239
+
240
+ [shortcut]
241
+ from=-3
242
+ activation=linear
243
+
244
+ [convolutional]
245
+ batch_normalize=1
246
+ filters=128
247
+ size=1
248
+ stride=1
249
+ pad=1
250
+ activation=leaky
251
+
252
+ [convolutional]
253
+ batch_normalize=1
254
+ filters=256
255
+ size=3
256
+ stride=1
257
+ pad=1
258
+ activation=leaky
259
+
260
+ [shortcut]
261
+ from=-3
262
+ activation=linear
263
+
264
+ [convolutional]
265
+ batch_normalize=1
266
+ filters=128
267
+ size=1
268
+ stride=1
269
+ pad=1
270
+ activation=leaky
271
+
272
+ [convolutional]
273
+ batch_normalize=1
274
+ filters=256
275
+ size=3
276
+ stride=1
277
+ pad=1
278
+ activation=leaky
279
+
280
+ [shortcut]
281
+ from=-3
282
+ activation=linear
283
+
284
+ # Downsample
285
+
286
+ [convolutional]
287
+ batch_normalize=1
288
+ filters=512
289
+ size=3
290
+ stride=2
291
+ pad=1
292
+ activation=leaky
293
+
294
+ [convolutional]
295
+ batch_normalize=1
296
+ filters=256
297
+ size=1
298
+ stride=1
299
+ pad=1
300
+ activation=leaky
301
+
302
+ [convolutional]
303
+ batch_normalize=1
304
+ filters=512
305
+ size=3
306
+ stride=1
307
+ pad=1
308
+ activation=leaky
309
+
310
+ [shortcut]
311
+ from=-3
312
+ activation=linear
313
+
314
+
315
+ [convolutional]
316
+ batch_normalize=1
317
+ filters=256
318
+ size=1
319
+ stride=1
320
+ pad=1
321
+ activation=leaky
322
+
323
+ [convolutional]
324
+ batch_normalize=1
325
+ filters=512
326
+ size=3
327
+ stride=1
328
+ pad=1
329
+ activation=leaky
330
+
331
+ [shortcut]
332
+ from=-3
333
+ activation=linear
334
+
335
+
336
+ [convolutional]
337
+ batch_normalize=1
338
+ filters=256
339
+ size=1
340
+ stride=1
341
+ pad=1
342
+ activation=leaky
343
+
344
+ [convolutional]
345
+ batch_normalize=1
346
+ filters=512
347
+ size=3
348
+ stride=1
349
+ pad=1
350
+ activation=leaky
351
+
352
+ [shortcut]
353
+ from=-3
354
+ activation=linear
355
+
356
+
357
+ [convolutional]
358
+ batch_normalize=1
359
+ filters=256
360
+ size=1
361
+ stride=1
362
+ pad=1
363
+ activation=leaky
364
+
365
+ [convolutional]
366
+ batch_normalize=1
367
+ filters=512
368
+ size=3
369
+ stride=1
370
+ pad=1
371
+ activation=leaky
372
+
373
+ [shortcut]
374
+ from=-3
375
+ activation=linear
376
+
377
+ [convolutional]
378
+ batch_normalize=1
379
+ filters=256
380
+ size=1
381
+ stride=1
382
+ pad=1
383
+ activation=leaky
384
+
385
+ [convolutional]
386
+ batch_normalize=1
387
+ filters=512
388
+ size=3
389
+ stride=1
390
+ pad=1
391
+ activation=leaky
392
+
393
+ [shortcut]
394
+ from=-3
395
+ activation=linear
396
+
397
+
398
+ [convolutional]
399
+ batch_normalize=1
400
+ filters=256
401
+ size=1
402
+ stride=1
403
+ pad=1
404
+ activation=leaky
405
+
406
+ [convolutional]
407
+ batch_normalize=1
408
+ filters=512
409
+ size=3
410
+ stride=1
411
+ pad=1
412
+ activation=leaky
413
+
414
+ [shortcut]
415
+ from=-3
416
+ activation=linear
417
+
418
+
419
+ [convolutional]
420
+ batch_normalize=1
421
+ filters=256
422
+ size=1
423
+ stride=1
424
+ pad=1
425
+ activation=leaky
426
+
427
+ [convolutional]
428
+ batch_normalize=1
429
+ filters=512
430
+ size=3
431
+ stride=1
432
+ pad=1
433
+ activation=leaky
434
+
435
+ [shortcut]
436
+ from=-3
437
+ activation=linear
438
+
439
+ [convolutional]
440
+ batch_normalize=1
441
+ filters=256
442
+ size=1
443
+ stride=1
444
+ pad=1
445
+ activation=leaky
446
+
447
+ [convolutional]
448
+ batch_normalize=1
449
+ filters=512
450
+ size=3
451
+ stride=1
452
+ pad=1
453
+ activation=leaky
454
+
455
+ [shortcut]
456
+ from=-3
457
+ activation=linear
458
+
459
+ # Downsample
460
+
461
+ [convolutional]
462
+ batch_normalize=1
463
+ filters=1024
464
+ size=3
465
+ stride=2
466
+ pad=1
467
+ activation=leaky
468
+
469
+ [convolutional]
470
+ batch_normalize=1
471
+ filters=512
472
+ size=1
473
+ stride=1
474
+ pad=1
475
+ activation=leaky
476
+
477
+ [convolutional]
478
+ batch_normalize=1
479
+ filters=1024
480
+ size=3
481
+ stride=1
482
+ pad=1
483
+ activation=leaky
484
+
485
+ [shortcut]
486
+ from=-3
487
+ activation=linear
488
+
489
+ [convolutional]
490
+ batch_normalize=1
491
+ filters=512
492
+ size=1
493
+ stride=1
494
+ pad=1
495
+ activation=leaky
496
+
497
+ [convolutional]
498
+ batch_normalize=1
499
+ filters=1024
500
+ size=3
501
+ stride=1
502
+ pad=1
503
+ activation=leaky
504
+
505
+ [shortcut]
506
+ from=-3
507
+ activation=linear
508
+
509
+ [convolutional]
510
+ batch_normalize=1
511
+ filters=512
512
+ size=1
513
+ stride=1
514
+ pad=1
515
+ activation=leaky
516
+
517
+ [convolutional]
518
+ batch_normalize=1
519
+ filters=1024
520
+ size=3
521
+ stride=1
522
+ pad=1
523
+ activation=leaky
524
+
525
+ [shortcut]
526
+ from=-3
527
+ activation=linear
528
+
529
+ [convolutional]
530
+ batch_normalize=1
531
+ filters=512
532
+ size=1
533
+ stride=1
534
+ pad=1
535
+ activation=leaky
536
+
537
+ [convolutional]
538
+ batch_normalize=1
539
+ filters=1024
540
+ size=3
541
+ stride=1
542
+ pad=1
543
+ activation=leaky
544
+
545
+ [shortcut]
546
+ from=-3
547
+ activation=linear
548
+
549
+ ######################
550
+
551
+ [convolutional]
552
+ batch_normalize=1
553
+ filters=512
554
+ size=1
555
+ stride=1
556
+ pad=1
557
+ activation=leaky
558
+
559
+ [convolutional]
560
+ batch_normalize=1
561
+ size=3
562
+ stride=1
563
+ pad=1
564
+ filters=1024
565
+ activation=leaky
566
+
567
+ [convolutional]
568
+ batch_normalize=1
569
+ filters=512
570
+ size=1
571
+ stride=1
572
+ pad=1
573
+ activation=leaky
574
+
575
+ [convolutional]
576
+ batch_normalize=1
577
+ size=3
578
+ stride=1
579
+ pad=1
580
+ filters=1024
581
+ activation=leaky
582
+
583
+ [convolutional]
584
+ batch_normalize=1
585
+ filters=512
586
+ size=1
587
+ stride=1
588
+ pad=1
589
+ activation=leaky
590
+
591
+ [convolutional]
592
+ batch_normalize=1
593
+ size=3
594
+ stride=1
595
+ pad=1
596
+ filters=1024
597
+ activation=leaky
598
+
599
+ [convolutional]
600
+ size=1
601
+ stride=1
602
+ pad=1
603
+ filters=255
604
+ activation=linear
605
+
606
+
607
+ [yolo]
608
+ mask = 6,7,8
609
+ anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
610
+ classes=80
611
+ num=9
612
+ jitter=.3
613
+ ignore_thresh = .7
614
+ truth_thresh = 1
615
+ random=1
616
+
617
+
618
+ [route]
619
+ layers = -4
620
+
621
+ [convolutional]
622
+ batch_normalize=1
623
+ filters=256
624
+ size=1
625
+ stride=1
626
+ pad=1
627
+ activation=leaky
628
+
629
+ [upsample]
630
+ stride=2
631
+
632
+ [route]
633
+ layers = -1, 61
634
+
635
+
636
+
637
+ [convolutional]
638
+ batch_normalize=1
639
+ filters=256
640
+ size=1
641
+ stride=1
642
+ pad=1
643
+ activation=leaky
644
+
645
+ [convolutional]
646
+ batch_normalize=1
647
+ size=3
648
+ stride=1
649
+ pad=1
650
+ filters=512
651
+ activation=leaky
652
+
653
+ [convolutional]
654
+ batch_normalize=1
655
+ filters=256
656
+ size=1
657
+ stride=1
658
+ pad=1
659
+ activation=leaky
660
+
661
+ [convolutional]
662
+ batch_normalize=1
663
+ size=3
664
+ stride=1
665
+ pad=1
666
+ filters=512
667
+ activation=leaky
668
+
669
+ [convolutional]
670
+ batch_normalize=1
671
+ filters=256
672
+ size=1
673
+ stride=1
674
+ pad=1
675
+ activation=leaky
676
+
677
+ [convolutional]
678
+ batch_normalize=1
679
+ size=3
680
+ stride=1
681
+ pad=1
682
+ filters=512
683
+ activation=leaky
684
+
685
+ [convolutional]
686
+ size=1
687
+ stride=1
688
+ pad=1
689
+ filters=255
690
+ activation=linear
691
+
692
+
693
+ [yolo]
694
+ mask = 3,4,5
695
+ anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
696
+ classes=80
697
+ num=9
698
+ jitter=.3
699
+ ignore_thresh = .7
700
+ truth_thresh = 1
701
+ random=1
702
+
703
+
704
+
705
+ [route]
706
+ layers = -4
707
+
708
+ [convolutional]
709
+ batch_normalize=1
710
+ filters=128
711
+ size=1
712
+ stride=1
713
+ pad=1
714
+ activation=leaky
715
+
716
+ [upsample]
717
+ stride=2
718
+
719
+ [route]
720
+ layers = -1, 36
721
+
722
+
723
+
724
+ [convolutional]
725
+ batch_normalize=1
726
+ filters=128
727
+ size=1
728
+ stride=1
729
+ pad=1
730
+ activation=leaky
731
+
732
+ [convolutional]
733
+ batch_normalize=1
734
+ size=3
735
+ stride=1
736
+ pad=1
737
+ filters=256
738
+ activation=leaky
739
+
740
+ [convolutional]
741
+ batch_normalize=1
742
+ filters=128
743
+ size=1
744
+ stride=1
745
+ pad=1
746
+ activation=leaky
747
+
748
+ [convolutional]
749
+ batch_normalize=1
750
+ size=3
751
+ stride=1
752
+ pad=1
753
+ filters=256
754
+ activation=leaky
755
+
756
+ [convolutional]
757
+ batch_normalize=1
758
+ filters=128
759
+ size=1
760
+ stride=1
761
+ pad=1
762
+ activation=leaky
763
+
764
+ [convolutional]
765
+ batch_normalize=1
766
+ size=3
767
+ stride=1
768
+ pad=1
769
+ filters=256
770
+ activation=leaky
771
+
772
+ [convolutional]
773
+ size=1
774
+ stride=1
775
+ pad=1
776
+ filters=255
777
+ activation=linear
778
+
779
+
780
+ [yolo]
781
+ mask = 0,1,2
782
+ anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
783
+ classes=80
784
+ num=9
785
+ jitter=.3
786
+ ignore_thresh = .7
787
+ truth_thresh = 1
788
+ random=1
789
+
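
yolov3.cfg follows the standard Darknet format: bracketed section headers such as [net], [convolutional], [shortcut], [route], [upsample], and [yolo], each followed by key=value options, with # lines treated as comments. The [net] block holds the global input and training hyperparameters (608x608x3 input, batch 64, and so on), and the three [yolo] blocks are the detection heads at the three output scales. The sketch below is a minimal illustration of how such a file can be read into layer blocks; it is not the loader used by the object-extraction code in this Space, and the path and printed values are illustrative.

from typing import Dict, List

def parse_darknet_cfg(path: str) -> List[Dict[str, str]]:
    """Parse a Darknet .cfg file into a list of blocks.

    Each block is a dict with a '_type' key ('net', 'convolutional', 'yolo', ...)
    plus the raw key=value options that follow the section header.
    """
    blocks: List[Dict[str, str]] = []
    with open(path) as f:
        for raw in f:
            line = raw.strip()
            if not line or line.startswith("#"):
                continue  # skip blank lines and comments
            if line.startswith("[") and line.endswith("]"):
                blocks.append({"_type": line[1:-1]})
            else:
                key, _, value = line.partition("=")
                blocks[-1][key.strip()] = value.strip()
    return blocks

# Example usage (path is illustrative):
blocks = parse_darknet_cfg("yolov3.cfg")
print(blocks[0]["_type"], blocks[0]["width"], blocks[0]["height"])  # net 608 608
print(sum(b["_type"] == "yolo" for b in blocks))                    # 3 detection heads
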