import project_path
import torch
from tqdm import tqdm
from functools import partial
import numpy as np
import json
import time
from unittest.mock import patch
import math
# assumes yolov5 on sys.path
from lib.yolov5.models.experimental import attempt_load
from lib.yolov5.utils.torch_utils import select_device
from lib.yolov5.utils.general import clip_boxes, scale_boxes, xywh2xyxy
from lib.yolov5.utils.metrics import box_iou
import torch
import torchvision
from lib.fish_eye.tracker import Tracker
### Configuration options
# NOTE(review): this looks like a directory prefix rather than a checkpoint
# file; confirm the intended weights path before running inference.
WEIGHTS = 'models/'
# will need to configure these based on GPU hardware
# NOTE(review): BATCH_SIZE and NMS_IOU are used as default arguments further
# down this module but were not defined; the values below are presumed
# (32 matches the setup_model() batch_size default) — TODO confirm.
BATCH_SIZE = 32 # frames per inference batch
CONF_THRES = 0.05 # detection
NMS_IOU = 0.25 # IOU threshold for non-maximum suppression
MAX_AGE = 14 # time until missing fish gets new id
MIN_HITS = 16 # minimum number of frames with a specific fish for it to count
MIN_LENGTH = 0.3 # minimum fish length, in meters
IOU_THRES = 0.01 # IOU threshold for tracking
def norm(bbox, w, h):
    """Normalize a bounding box by the image dimensions.

    Args:
        bbox: list of length 4. Can be [x,y,w,h] or [x0,y0,x1,y1]
        w: image width
        h: image height

    Returns:
        A copy of ``bbox`` whose elements 0 and 2 are divided by ``w`` and
        whose elements 1 and 3 are divided by ``h``. The input is not modified.
    """
    bb = bbox.copy()
    # Horizontal components scale with the width, vertical ones with the height.
    for idx, scale in enumerate((w, h, w, h)):
        bb[idx] /= scale
    return bb
def do_full_inference(dataloader, image_meter_width, image_meter_height, gp=None, weights=WEIGHTS, conf_thresh=CONF_THRES, nms_iou=NMS_IOU, min_hits=MIN_HITS, max_age=MAX_AGE):
    """Run the whole pipeline: detection, suppression, formatting, tracking.

    Args:
        dataloader: iterable of frame batches, passed straight to do_detection
        image_meter_width: the width of each image, in meters
        image_meter_height: the height of each image, in meters
        gp: optional progress callback, forwarded to every stage
        weights: path of the model weights handed to setup_model
        conf_thresh: confidence threshold used during suppression
        nms_iou: IOU threshold used during suppression
        min_hits: forwarded to do_tracking
        max_age: forwarded to do_tracking

    Returns:
        Whatever do_tracking returns (the tracker's finalized output).
    """
    model, device = setup_model(weights)

    # Debug switches: replay raw detections from / dump them to a JSON file
    # instead of (or in addition to) running the detector.
    load = False
    save = False
    cache_path = 'static/example/inference_output.json'

    if load:
        with open(cache_path, 'r') as f:
            json_object = json.load(f)
        inference = json_object['inference']
        width = json_object['width']
        height = json_object['height']
        image_shapes = json_object['image_shapes']
    else:
        inference, image_shapes, width, height = do_detection(dataloader, model, device, gp=gp)

    if save:
        # NOTE(review): the detector output presumably holds tensors, which
        # json.dumps cannot serialise as-is — confirm before enabling `save`.
        json_object = {
            'inference': inference,
            'width': width,
            'height': height,
            'image_shapes': image_shapes,
        }
        json_text = json.dumps(json_object, indent=4)
        with open(cache_path, 'w') as f:
            f.write(json_text)

    outputs = do_suppression(inference, conf_thres=conf_thresh, iou_thres=nms_iou, gp=gp)

    # Confidence boosting is currently disabled:
    #do_confidence_boost(inference, outputs, gp=gp)
    #new_outputs = do_suppression(inference, conf_thres=conf_thresh, iou_thres=nms_iou, gp=gp)

    # The pixel dimensions returned by format_predictions are not needed here.
    all_preds, _real_width, _real_height = format_predictions(image_shapes, outputs, width, height, gp=gp)

    results = do_tracking(all_preds, image_meter_width, image_meter_height, min_hits=min_hits, max_age=max_age, gp=gp)

    return results
def setup_model(weights_fp=WEIGHTS, imgsz=896, batch_size=32):
    """Load the detection model onto the best available device.

    Args:
        weights_fp: path of the weights handed to attempt_load
        imgsz: side length of the square dummy image used for the warm-up pass
        batch_size: forwarded to select_device

    Returns:
        (model, device)
    """
    if torch.cuda.is_available():
        device = select_device('0', batch_size=batch_size)
    else:
        print("CUDA not available. Using CPU inference.")
        device = select_device('cpu', batch_size=batch_size)

    # Setup model for inference
    model = attempt_load(weights_fp, device=device)
    half = device.type != 'cpu'  # half precision only supported on CUDA
    if half:
        model.half()
        # Warm-up pass on a dummy image (skipped on CPU); output is discarded.
        img = torch.zeros((1, 3, imgsz, imgsz), device=device)
        _ = model(img.half())  # run once

    return model, device
def do_detection(dataloader, model, device, gp=None, batch_size=BATCH_SIZE, verbose=True):
    """Run the detector over every batch and collect the raw outputs.

    Args:
        dataloader: iterable yielding (img, _, shapes) per batch; shapes[i] is
            kept as-is for later rescaling to the original frame size
        model: detection model, called as model(img, augment=False)
        device: torch device the images are moved to
        gp: a callback function which takes 2 parameters, the fraction
            complete and a status string
        batch_size: only used to scale the progress bar
        verbose: show the tqdm progress bar when True

    Returns:
        (inference, image_shapes, width, height) where inference holds one raw
        model output per batch, image_shapes holds for every batch a list of
        (input_shape, shapes[i]) pairs, and width/height are the input
        dimensions of the last batch (None if the dataloader was empty).
    """
    if (gp): gp(0, "Detection...")

    inference = []
    image_shapes = []
    # Defined up front so an empty dataloader does not raise on return.
    width = height = None
    n_batches = len(dataloader)

    # Run detection
    with tqdm(total=n_batches*batch_size, desc="Running detection", ncols=0, disable=not verbose) as pbar:
        for batch_i, (img, _, shapes) in enumerate(dataloader):
            if gp: gp(batch_i / n_batches, pbar.__str__())

            img = img.to(device, non_blocking=True)
            img = img.half() if device.type != 'cpu' else img.float()  # uint8 to fp16/32
            img /= 255.0  # 0 - 255 to 0.0 - 1.0
            _, _, height, width = tuple(img.shape)  # batch size, channels, height, width

            # Run model (suppression happens later, in do_suppression)
            with torch.no_grad():
                inf_out, _ = model(img, augment=False)

            # Save shapes for resizing to original shape
            batch_shape = [(img[si].shape[1:], shapes[si]) for si in range(len(inf_out))]

            # NOTE(review): the append/update lines below were missing from the
            # reviewed source and are reconstructed — confirm against history.
            image_shapes.append(batch_shape)
            inference.append(inf_out)
            pbar.update(batch_size)

    return inference, image_shapes, width, height
def do_suppression(inference, gp=None, batch_size=BATCH_SIZE, conf_thres=CONF_THRES, iou_thres=NMS_IOU, verbose=True):
    """Apply non-maximum suppression to every batch of raw detections.

    Args:
        inference: list of raw model outputs, one entry per batch
        gp: a callback function which takes 2 parameters, the fraction
            complete and a status string
        batch_size: only used to scale the progress bar
        conf_thres: confidence threshold forwarded to non_max_suppression
        iou_thres: IOU threshold forwarded to non_max_suppression
        verbose: show the tqdm progress bar when True

    Returns:
        List with one non_max_suppression result per batch, in input order.
    """
    if (gp): gp(0, "Suppression...")
    # keep predictions to feed them ordered into the Tracker
    # TODO: how to deal with large files?
    outputs = []
    n_batches = len(inference)
    with tqdm(total=n_batches*batch_size, desc="Running suppression", ncols=0, disable=not verbose) as pbar:
        for batch_i, inf_out in enumerate(inference):
            if gp: gp(batch_i / n_batches, pbar.__str__())

            with torch.no_grad():
                output = non_max_suppression(inf_out, conf_thres=conf_thres, iou_thres=iou_thres)

            # NOTE(review): append/update were missing from the reviewed
            # source and are reconstructed — confirm against history.
            outputs.append(output)
            pbar.update(batch_size)

    return outputs
def format_predictions(image_shapes, outputs, width, height, gp=None, batch_size=BATCH_SIZE, verbose=True):
    """Convert suppressed detections into the tracker's input format.

    Args:
        image_shapes: per batch, a list of (image_shape, original_shape) pairs
        outputs: per batch, a list of per-frame detection tensors whose first
            four columns are xyxy boxes and whose fifth column is confidence
        width: width used to clip boxes
        height: height used to clip boxes
        gp: a callback function which takes 2 parameters, the fraction
            complete and a status string
        batch_size: only used to scale the progress bar
        verbose: show the tqdm progress bar when True

    Returns:
        (all_preds, real_width, real_height). all_preds maps
        (batch index, index within batch) to an array of rows
        [x0, y0, x1, y1, conf] normalized by the original frame size, or to
        None for a frame without detections. real_width/real_height are taken
        from the last frame seen (None when there are no frames).
    """
    if (gp): gp(0, "Formatting...")
    # keep predictions to feed them ordered into the Tracker
    # TODO: how to deal with large files?
    all_preds = {}
    # Defined up front so clips without any detection do not raise on return.
    real_width = real_height = None
    with tqdm(total=len(image_shapes)*batch_size, desc="Running formatting", ncols=0, disable=not verbose) as pbar:
        for batch_i, batch in enumerate(outputs):
            if gp: gp(batch_i / len(image_shapes), pbar.__str__())

            batch_shapes = image_shapes[batch_i]

            # Format results
            for si, pred in enumerate(batch):
                (image_shape, original_shape) = batch_shapes[si]
                # original_shape[0] is indexed as (height, width) below.
                real_height, real_width = original_shape[0][0], original_shape[0][1]

                # Clip boxes to image bounds and resize to input shape
                clip_boxes(pred, (height, width))
                box = pred[:, :4].clone()  # xyxy
                confs = pred[:, 4].clone().tolist()
                scale_boxes(image_shape, box, original_shape[0], original_shape[1])  # to original shape

                # get boxes into tracker input format - normalized xyxy with confidence score
                boxes = None
                if box.shape[0]:
                    do_norm = partial(norm, w=real_width, h=real_height)
                    normed = list(map(do_norm, box[:, :4].tolist()))
                    boxes = np.stack([[*bb, conf] for bb, conf in zip(normed, confs)])

                all_preds[(batch_i, si)] = boxes
                pbar.update(1)

    return all_preds, real_width, real_height
def do_confidence_boost(inference, safe_preds, gp=None, batch_size=BATCH_SIZE, verbose=True):
    """Boost raw confidences in frames adjacent to confident detections.

    For every frame that has at least one suppressed ("safe") detection, the
    raw predictions of the next and the previous frame are passed to
    boost_frame, which rescales their confidence column in place. Neighbours
    are looked up across batch boundaries.

    Args:
        inference: list of raw model outputs, one entry per batch; modified
            in place
        safe_preds: suppressed detections with the same batch/frame layout
        gp: a callback function which takes 2 parameters, the fraction
            complete and a status string
        batch_size: unused, kept for signature parity with the other stages
        verbose: show the tqdm progress bar when True

    Returns:
        None; the result is the in-place modification of ``inference``.
    """
    if (gp): gp(0, "Confidence Boost...")

    n_batches = len(inference)
    with tqdm(total=n_batches, desc="Running confidence boost", ncols=0, disable=not verbose) as pbar:
        for batch_i, infer in enumerate(inference):
            if gp: gp(batch_i / n_batches, pbar.__str__())

            safe = safe_preds[batch_i]
            has_next_batch = batch_i + 1 < n_batches
            has_prev_batch = batch_i - 1 >= 0

            for i, safe_frame in enumerate(safe):
                # Nothing to propagate from a frame without safe detections.
                if len(safe_frame) == 0:
                    continue

                next_frame = None
                if i + 1 < len(infer):
                    next_frame = infer[i + 1]
                elif has_next_batch:
                    next_frame = inference[batch_i + 1][0]

                # Identity test: `!= None` on a tensor is not a plain bool check.
                if next_frame is not None:
                    boost_frame(safe_frame, next_frame, 1)

                prev_frame = None
                if i - 1 >= 0:
                    prev_frame = infer[i - 1]
                elif has_prev_batch:
                    prev_frame = inference[batch_i - 1][len(inference[batch_i - 1]) - 1]

                if prev_frame is not None:
                    boost_frame(safe_frame, prev_frame, -1)

            pbar.update(1)
def boost_frame(safe_frame, base_frame, dt):
    """Scale the confidences of ``base_frame`` by overlap with ``safe_frame``.

    Args:
        safe_frame: detections whose first four columns are used directly as
            boxes and whose fifth column is the confidence
        base_frame: predictions whose first four columns are converted with
            xywh2xyxy and whose fifth column is rescaled in place
        dt: frame offset between the two frames; the boost decays as
            exp(-dt^2)

    Returns:
        ``base_frame`` (the same object, modified in place).
    """
    reference_boxes = safe_frame[:, :4]
    reference_conf = safe_frame[:, 4]
    candidate_boxes = xywh2xyxy(base_frame[:, :4])  # center_x, center_y, width, height) to (x1, y1, x2, y2)

    overlap = box_iou(candidate_boxes, reference_boxes)
    # score = iou(safe_box, base_box) * confidence(safe_box)
    score = torch.matmul(overlap, reference_conf)

    decay = math.exp(-dt*dt)
    base_frame[:, 4] *= 1 + (score)*decay
    return base_frame
def do_tracking(all_preds, image_meter_width, image_meter_height, gp=None, max_age=MAX_AGE, iou_thres=IOU_THRES, min_hits=MIN_HITS, min_length=MIN_LENGTH, verbose=True):
    """Feed per-frame detections to the tracker and return its final output.

    Args:
        all_preds: dict mapping a sortable frame key to that frame's
            detections, or to None when the frame has none
        image_meter_width: the width of each image, in meters
        image_meter_height: the height of each image, in meters
        gp: a callback function which takes 2 parameters, the fraction
            complete and a status string
        max_age: passed to the Tracker as args['max_age']
        iou_thres: passed to the Tracker as args['iou_threshold']
        min_hits: passed to the Tracker constructor
        min_length: passed to Tracker.finalize
        verbose: show the tqdm progress bar when True

    Returns:
        The value returned by tracker.finalize(min_length=min_length).
    """
    if (gp): gp(0, "Tracking...")

    # Initialize tracker
    clip_info = {
        'start_frame': 0,
        'end_frame': len(all_preds),
        'image_meter_width': image_meter_width,
        'image_meter_height': image_meter_height,
    }
    tracker = Tracker(clip_info, args={ 'max_age': max_age, 'min_hits': 0, 'iou_threshold': iou_thres}, min_hits=min_hits)

    # Run tracking
    n_frames = len(all_preds)
    with tqdm(total=n_frames, desc="Running tracking", ncols=0, disable=not verbose) as pbar:
        # Sorted keys give the frames in order.
        for i, key in enumerate(sorted(all_preds)):
            if gp: gp(i / n_frames, pbar.__str__())
            boxes = all_preds[key]
            # NOTE(review): both update calls were missing from the reviewed
            # source and are reconstructed; this assumes Tracker.update()
            # without arguments registers a frame with no detections —
            # confirm against lib.fish_eye.tracker.
            if boxes is not None:
                tracker.update(boxes)
            else:
                tracker.update()
            pbar.update(1)

    json_data = tracker.finalize(min_length=min_length)

    return json_data
@patch('json.encoder.c_make_encoder', None)
def json_dump_round_float(some_object, out_path, num_digits=4):
    """Write a json file to disk with a specified level of precision.

    The C encoder is disabled for the duration of the call so that the
    pure-Python encoder is used, and its float formatter is swapped for one
    that emits a fixed number of decimals.

    Args:
        some_object: JSON-serialisable object to write
        out_path: destination file path (overwritten)
        num_digits: number of digits after the decimal point for every float

    Returns:
        The return value of json.dump (None).
    """
    # saving original method
    original_make_iterencode = json.encoder._make_iterencode
    fmt_str = '{:.' + str(num_digits) + 'f}'

    def inner(*args, **kwargs):
        args = list(args)
        # fifth argument is the float formatter, which we replace
        # NOTE: this also bypasses the encoder's special handling of nan/inf.
        args[4] = lambda o: fmt_str.format(o)
        return original_make_iterencode(*args, **kwargs)

    with patch('json.encoder._make_iterencode', wraps=inner):
        # Context manager so the file is flushed and closed on return or error.
        with open(out_path, 'w') as out_file:
            return json.dump(some_object, out_file, indent=2)
def non_max_suppression(
        prediction,
        conf_thres=0.25,
        iou_thres=0.45,
        max_det=300,
):
    """Non-Maximum Suppression (NMS) on inference results to reject overlapping detections

    NOTE(review): the parameter list was missing from the reviewed source; the
    names come from how the body and the callers use them, and the defaults
    follow upstream YOLOv5 — TODO confirm.

    Args:
        prediction: raw model output; per box the columns are read as
            [center_x, center_y, width, height, obj_conf, cls_conf, ...].
            A list/tuple is accepted, in which case element 0 is used.
        conf_thres: minimum confidence, in [0, 1]
        iou_thres: IOU threshold handed to torchvision.ops.nms, in [0, 1]
        max_det: maximum number of detections kept per image

    Returns:
         list of detections, on (n,6) tensor per image [xyxy, conf, cls]
    """

    # Checks
    assert 0 <= conf_thres <= 1, f'Invalid Confidence threshold {conf_thres}, valid values are between 0.0 and 1.0'
    assert 0 <= iou_thres <= 1, f'Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0'
    if isinstance(prediction, (list, tuple)):  # YOLOv5 model in validation model, output = (inference_out, loss_out)
        prediction = prediction[0]  # select only inference output

    device = prediction.device
    mps = 'mps' in device.type  # Apple MPS
    if mps:  # MPS not fully supported yet, convert tensors to CPU before NMS
        prediction = prediction.cpu()
    bs = prediction.shape[0]  # batch size
    xc = prediction[..., 4] > conf_thres  # candidates

    # Settings
    # min_wh = 2  # (pixels) minimum box width and height
    max_nms = 30000  # maximum number of boxes into torchvision.ops.nms()
    redundant = True  # require redundant detections
    merge = False  # use merge-NMS

    # One independent empty result per image (no shared placeholder tensor).
    output = [torch.zeros((0, 6), device=prediction.device) for _ in range(bs)]
    for xi, x in enumerate(prediction):  # image index, image inference
        # Keep boxes that pass confidence threshold
        x = x[xc[xi]]  # confidence

        # If none remain process next image
        if not x.shape[0]:
            continue

        # Compute conf
        x[:, 5:] *= x[:, 4:5]  # conf = obj_conf * cls_conf

        # Box/Mask
        box = xywh2xyxy(x[:, :4])  # center_x, center_y, width, height) to (x1, y1, x2, y2)
        mask = x[:, 6:]  # zero columns if no masks

        # Detections matrix nx6 (xyxy, conf, cls)
        conf, j = x[:, 5:6].max(1, keepdim=True)
        x = torch.cat((box, conf, j.float(), mask), 1)[conf.view(-1) > conf_thres]

        # Check shape
        n = x.shape[0]  # number of boxes
        if not n:  # no boxes
            continue
        x = x[x[:, 4].argsort(descending=True)[:max_nms]]  # sort by confidence and remove excess boxes

        # NMS over all boxes of the image
        boxes = x[:, :4]
        scores = x[:, 4]
        i = torchvision.ops.nms(boxes, scores, iou_thres)  # NMS
        i = i[:max_det]  # limit detections
        if merge and (1 < n < 3E3):  # Merge NMS (boxes merged using weighted mean)
            # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
            iou = box_iou(boxes[i], boxes) > iou_thres  # iou matrix
            weights = iou * scores[None]  # box weights
            x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True)  # merged boxes
            if redundant:
                i = i[iou.sum(1) > 1]  # require redundancy

        output[xi] = x[i]
        if mps:
            output[xi] = output[xi].to(device)

    return output