Commit e1832f4 by usiddiquee · Parent(s): 3b054ae
Commit message: hi

This view is limited to 50 files because it contains too many changes.

Files changed:
- app.py +176 -0
- boxmot/__init__.py +21 -0
- boxmot/appearance/__init__.py +0 -0
- boxmot/appearance/backbones/__init__.py +1 -0
- boxmot/appearance/backbones/clip/__init__.py +1 -0
- boxmot/appearance/backbones/clip/clip/__init__.py +1 -0
- boxmot/appearance/backbones/clip/clip/bpe_simple_vocab_16e6.txt.gz +3 -0
- boxmot/appearance/backbones/clip/clip/clip.py +222 -0
- boxmot/appearance/backbones/clip/clip/model.py +504 -0
- boxmot/appearance/backbones/clip/clip/simple_tokenizer.py +136 -0
- boxmot/appearance/backbones/clip/config/__init__.py +1 -0
- boxmot/appearance/backbones/clip/config/defaults.py +239 -0
- boxmot/appearance/backbones/clip/config/defaults_base.py +190 -0
- boxmot/appearance/backbones/clip/make_model.py +161 -0
- boxmot/appearance/backbones/clip/make_model_clipreid.py +247 -0
- boxmot/appearance/backbones/hacnn.py +406 -0
- boxmot/appearance/backbones/lmbn/__init__.py +1 -0
- boxmot/appearance/backbones/lmbn/attention.py +281 -0
- boxmot/appearance/backbones/lmbn/bnneck.py +166 -0
- boxmot/appearance/backbones/lmbn/lmbn_n.py +185 -0
- boxmot/appearance/backbones/mlfn.py +240 -0
- boxmot/appearance/backbones/mobilenetv2.py +246 -0
- boxmot/appearance/backbones/osnet.py +560 -0
- boxmot/appearance/backbones/osnet_ain.py +582 -0
- boxmot/appearance/backbones/resnet.py +517 -0
- boxmot/appearance/backends/base_backend.py +135 -0
- boxmot/appearance/backends/onnx_backend.py +42 -0
- boxmot/appearance/backends/openvino_backend.py +44 -0
- boxmot/appearance/backends/pytorch_backend.py +24 -0
- boxmot/appearance/backends/tensorrt_backend.py +126 -0
- boxmot/appearance/backends/tflite_backend.py +86 -0
- boxmot/appearance/backends/torchscript_backend.py +24 -0
- boxmot/appearance/exporters/base_exporter.py +56 -0
- boxmot/appearance/exporters/onnx_exporter.py +56 -0
- boxmot/appearance/exporters/openvino_exporter.py +26 -0
- boxmot/appearance/exporters/tensorrt_exporter.py +80 -0
- boxmot/appearance/exporters/tflite_exporter.py +37 -0
- boxmot/appearance/exporters/torchscript_exporter.py +15 -0
- boxmot/appearance/reid/__init__.py +16 -0
- boxmot/appearance/reid/auto_backend.py +128 -0
- boxmot/appearance/reid/config.py +73 -0
- boxmot/appearance/reid/export.py +227 -0
- boxmot/appearance/reid/factory.py +40 -0
- boxmot/appearance/reid/registry.py +87 -0
- boxmot/configs/__init__.py +1 -0
- boxmot/configs/boosttrack.yaml +90 -0
- boxmot/configs/botsort.yaml +39 -0
- boxmot/configs/bytetrack.yaml +24 -0
- boxmot/configs/deepocsort.yaml +74 -0
- boxmot/configs/hybridsort.yaml +49 -0
app.py
ADDED
@@ -0,0 +1,176 @@
import os
import gradio as gr
import subprocess
import tempfile
import shutil
from pathlib import Path
import sys
import importlib.util

# Ensure models directory exists
MODELS_DIR = Path("models")
os.makedirs(MODELS_DIR, exist_ok=True)

def ensure_dependencies():
    """Ensure all required dependencies are installed."""
    required_packages = [
        "ultralytics",
        "boxmot",
        "supervision"
    ]

    for package in required_packages:
        try:
            importlib.import_module(package)
            print(f"✅ {package} is installed")
        except ImportError:
            print(f"⚠️ {package} is not installed, attempting to install...")
            subprocess.run([sys.executable, "-m", "pip", "install", package], check=True)

# Apply tracker patches if tracker_patch.py exists
def apply_patches():
    patch_path = Path("tracker_patch.py")
    if patch_path.exists():
        spec = importlib.util.spec_from_file_location("tracker_patch", patch_path)
        if spec:
            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)
            if hasattr(module, "patch_trackers"):
                module.patch_trackers()
                print("✅ Applied tracker patches")
            else:
                print("⚠️ tracker_patch.py exists but has no patch_trackers function")
    else:
        print("⚠️ tracker_patch.py not found, skipping patches")

def run_tracking(video_file, yolo_model, reid_model, tracking_method, conf_threshold):
    """Run object tracking on the uploaded video."""
    try:
        # Create temporary workspace
        with tempfile.TemporaryDirectory() as temp_dir:
            # Prepare input
            input_path = os.path.join(temp_dir, "input_video.mp4")
            shutil.copy(video_file, input_path)

            # Prepare output directory
            output_dir = os.path.join(temp_dir, "output")
            os.makedirs(output_dir, exist_ok=True)

            # Build command
            cmd = [
                "python", "tracking/track.py",
                "--yolo-model", str(MODELS_DIR / yolo_model),
                "--reid-model", str(MODELS_DIR / reid_model),
                "--tracking-method", tracking_method,
                "--source", input_path,
                "--conf", str(conf_threshold),
                "--save",
                "--project", output_dir,
                "--name", "track",
                "--exist-ok"
            ]

            # Special handling for OcSort
            if tracking_method == "ocsort":
                cmd.append("--per-class")

            # Execute tracking with error handling
            process = subprocess.run(
                cmd,
                capture_output=True,
                text=True
            )

            # Check for errors in output
            if process.returncode != 0:
                error_message = process.stderr or process.stdout
                return None, f"Error in tracking process: {error_message}"

            # Find output video
            output_files = []
            for root, _, files in os.walk(output_dir):
                for file in files:
                    if file.lower().endswith((".mp4", ".avi", ".mov")):
                        output_files.append(os.path.join(root, file))

            if not output_files:
                return None, "No output video was generated. Check if tracking was successful."

            return output_files[0], "Processing completed successfully!"

    except Exception as e:
        return None, f"Error: {str(e)}"

# Define the Gradio interface
def process_video(video_path, yolo_model, reid_model, tracking_method, conf_threshold):
    # Validate inputs
    if not video_path:
        return None, "Please upload a video file"

    output_path, status = run_tracking(
        video_path,
        yolo_model,
        reid_model,
        tracking_method,
        conf_threshold
    )

    return output_path, status

# Available models and tracking methods
yolo_models = ["yolov8n.pt", "yolov8s.pt", "yolov8m.pt"]
reid_models = ["osnet_x0_25_msmt17.pt"]
tracking_methods = ["bytetrack", "botsort", "ocsort", "strongsort"]

# Ensure dependencies and apply patches at startup
ensure_dependencies()
apply_patches()

# Create the Gradio interface
with gr.Blocks(title="YOLO Object Tracking") as app:
    gr.Markdown("# 🚀 YOLO Object Tracking")
    gr.Markdown("Upload a video file to detect and track objects. Processing may take a few minutes depending on video length.")

    with gr.Row():
        with gr.Column():
            input_video = gr.Video(label="Input Video", sources=["upload"])

            with gr.Group():
                yolo_model = gr.Dropdown(
                    choices=yolo_models,
                    value="yolov8n.pt",
                    label="YOLO Model"
                )
                reid_model = gr.Dropdown(
                    choices=reid_models,
                    value="osnet_x0_25_msmt17.pt",
                    label="ReID Model"
                )
                tracking_method = gr.Dropdown(
                    choices=tracking_methods,
                    value="bytetrack",
                    label="Tracking Method"
                )
                conf_threshold = gr.Slider(
                    minimum=0.1,
                    maximum=0.9,
                    value=0.3,
                    step=0.05,
                    label="Confidence Threshold"
                )

            process_btn = gr.Button("Process Video", variant="primary")

        with gr.Column():
            output_video = gr.Video(label="Output Video with Tracking", autoplay=True)
            status_text = gr.Textbox(label="Status", value="Ready to process video")

    process_btn.click(
        fn=process_video,
        inputs=[input_video, yolo_model, reid_model, tracking_method, conf_threshold],
        outputs=[output_video, status_text]
    )

# Launch the app
if __name__ == "__main__":
    app.launch(debug=True, share=True)
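For reference, a minimal usage sketch (not from this commit): the Gradio callback above can also be driven directly from Python. The video file name below is hypothetical, and the weights are assumed to exist under models/ as the dropdown defaults suggest.

# Sketch: invoke the same callback the "Process Video" button uses.
# Importing app also runs ensure_dependencies() and apply_patches() at module level.
from app import process_video

output_path, status = process_video(
    "sample.mp4",               # hypothetical local video
    "yolov8n.pt",               # YOLO detector weights, resolved under models/
    "osnet_x0_25_msmt17.pt",    # ReID weights
    "bytetrack",                # tracking method
    0.3,                        # confidence threshold
)
print(status, output_path)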
boxmot/__init__.py
ADDED
@@ -0,0 +1,21 @@
# Mikel Broström 🔥 Yolo Tracking 🧾 AGPL-3.0 license

__version__ = '12.0.7'

from boxmot.postprocessing.gsi import gsi
from boxmot.tracker_zoo import create_tracker, get_tracker_config
from boxmot.trackers.botsort.botsort import BotSort
from boxmot.trackers.bytetrack.bytetrack import ByteTrack
from boxmot.trackers.deepocsort.deepocsort import DeepOcSort
from boxmot.trackers.hybridsort.hybridsort import HybridSort
from boxmot.trackers.ocsort.ocsort import OcSort
from boxmot.trackers.strongsort.strongsort import StrongSort
from boxmot.trackers.imprassoc.imprassoctrack import ImprAssocTrack
from boxmot.trackers.boosttrack.boosttrack import BoostTrack


TRACKERS = ['bytetrack', 'botsort', 'strongsort', 'ocsort', 'deepocsort', 'hybridsort', 'imprassoc', 'boosttrack']

__all__ = ("__version__",
           "StrongSort", "OcSort", "ByteTrack", "BotSort", "DeepOcSort", "HybridSort", "ImprAssocTrack", "BoostTrack",
           "create_tracker", "get_tracker_config", "gsi")
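For reference, a minimal sketch (not from this commit) of the public surface re-exported above; it only touches names defined in this __init__.py.

import boxmot

print(boxmot.__version__)   # '12.0.7'
print(boxmot.TRACKERS)      # ['bytetrack', 'botsort', 'strongsort', ...]
print(boxmot.ByteTrack)     # tracker classes are importable from the package root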
boxmot/appearance/__init__.py
ADDED
File without changes
boxmot/appearance/backbones/__init__.py
ADDED
@@ -0,0 +1 @@
# Mikel Broström 🔥 Yolo Tracking 🧾 AGPL-3.0 license
boxmot/appearance/backbones/clip/__init__.py
ADDED
@@ -0,0 +1 @@
# Mikel Broström 🔥 Yolo Tracking 🧾 AGPL-3.0 license
boxmot/appearance/backbones/clip/clip/__init__.py
ADDED
@@ -0,0 +1 @@
# Mikel Broström 🔥 Yolo Tracking 🧾 AGPL-3.0 license
boxmot/appearance/backbones/clip/clip/bpe_simple_vocab_16e6.txt.gz
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:924691ac288e54409236115652ad4aa250f48203de50a9e4722a6ecd48d6804a
size 1356917
boxmot/appearance/backbones/clip/clip/clip.py
ADDED
@@ -0,0 +1,222 @@
# Mikel Broström 🔥 Yolo Tracking 🧾 AGPL-3.0 license

import hashlib
import os
import urllib
import warnings
from typing import List, Union

import torch
from PIL import Image
from torchvision.transforms import (CenterCrop, Compose, Normalize, Resize,
                                    ToTensor)
from tqdm import tqdm

from .model import build_model
from .simple_tokenizer import SimpleTokenizer as _Tokenizer

try:
    from torchvision.transforms import InterpolationMode
    BICUBIC = InterpolationMode.BICUBIC
except ImportError:
    BICUBIC = Image.BICUBIC


__all__ = ["available_models", "load", "tokenize"]
_tokenizer = _Tokenizer()

_MODELS = {
    "RN50": "https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt",  # noqa: E501
    "RN101": "https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt",  # noqa: E501
    "RN50x4": "https://openaipublic.azureedge.net/clip/models/7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/RN50x4.pt",  # noqa: E501
    "RN50x16": "https://openaipublic.azureedge.net/clip/models/52378b407f34354e150460fe41077663dd5b39c54cd0bfd2b27167a4a06ec9aa/RN50x16.pt",  # noqa: E501
    "ViT-B-32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt",  # noqa: E501
    "ViT-B-16": "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt",  # noqa: E501
}


def _download(url: str, root: str = os.path.expanduser("~/.cache/clip")):
    os.makedirs(root, exist_ok=True)
    filename = os.path.basename(url)

    expected_sha256 = url.split("/")[-2]
    download_target = os.path.join(root, filename)

    if os.path.exists(download_target) and not os.path.isfile(download_target):
        raise RuntimeError(f"{download_target} exists and is not a regular file")

    if os.path.isfile(download_target):
        if hashlib.sha256(open(download_target, "rb").read()).hexdigest() == expected_sha256:
            return download_target
        else:
            warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")

    with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
        with tqdm(total=int(source.info().get("Content-Length")), ncols=80, unit='iB', unit_scale=True) as loop:
            while True:
                buffer = source.read(8192)
                if not buffer:
                    break

                output.write(buffer)
                loop.update(len(buffer))

    if hashlib.sha256(open(download_target, "rb").read()).hexdigest() != expected_sha256:
        raise RuntimeError("Model has been downloaded but the SHA256 checksum does not match")

    return download_target


def _transform(n_px):
    return Compose([
        Resize(n_px, interpolation=BICUBIC),
        CenterCrop(n_px),
        lambda image: image.convert("RGB"),
        ToTensor(),
        Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
    ])


def available_models() -> List[str]:
    """Returns the names of available CLIP models"""
    return list(_MODELS.keys())


def load(name: str, device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu", jit=False):
    """Load a CLIP model

    Parameters
    ----------
    name : str
        A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict

    device : Union[str, torch.device]
        The device to put the loaded model

    jit : bool
        Whether to load the optimized JIT model or more hackable non-JIT model (default).

    Returns
    -------
    model : torch.nn.Module
        The CLIP model

    preprocess : Callable[[PIL.Image], torch.Tensor]
        A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input
    """
    if name in _MODELS:
        model_path = _download(_MODELS[name])
    elif os.path.isfile(name):
        model_path = name
    else:
        raise RuntimeError(f"Model {name} not found; available models = {available_models()}")

    try:
        # loading JIT archive
        model = torch.jit.load(model_path, map_location=device if jit else "cpu").eval()
        state_dict = None
    except RuntimeError:
        # loading saved state dict
        if jit:
            warnings.warn(f"File {model_path} is not a JIT archive. Loading as a state dict instead")
            jit = False
        state_dict = torch.load(model_path, map_location="cpu")

    if not jit:
        model = build_model(state_dict or model.state_dict()).to(device)
        if str(device) == "cpu":
            model.float()
        return model, _transform(model.visual.input_resolution)

    # patch the device names
    device_holder = torch.jit.trace(lambda: torch.ones([]).to(torch.device(device)), example_inputs=[])
    device_node = [n for n in device_holder.graph.findAllNodes("prim::Constant") if "Device" in repr(n)][-1]

    def patch_device(module):
        try:
            graphs = [module.graph] if hasattr(module, "graph") else []
        except RuntimeError:
            graphs = []

        if hasattr(module, "forward1"):
            graphs.append(module.forward1.graph)

        for graph in graphs:
            for node in graph.findAllNodes("prim::Constant"):
                if "value" in node.attributeNames() and str(node["value"]).startswith("cuda"):
                    node.copyAttributes(device_node)

    model.apply(patch_device)
    patch_device(model.encode_image)
    patch_device(model.encode_text)

    # patch dtype to float32 on CPU
    if str(device) == "cpu":
        float_holder = torch.jit.trace(lambda: torch.ones([]).float(), example_inputs=[])
        float_input = list(float_holder.graph.findNode("aten::to").inputs())[1]
        float_node = float_input.node()

        def patch_float(module):
            try:
                graphs = [module.graph] if hasattr(module, "graph") else []
            except RuntimeError:
                graphs = []

            if hasattr(module, "forward1"):
                graphs.append(module.forward1.graph)

            for graph in graphs:
                for node in graph.findAllNodes("aten::to"):
                    inputs = list(node.inputs())
                    for i in [1, 2]:  # dtype can be the second or third argument to aten::to()
                        if inputs[i].node()["value"] == 5:
                            inputs[i].node().copyAttributes(float_node)

        model.apply(patch_float)
        patch_float(model.encode_image)
        patch_float(model.encode_text)

        model.float()

    return model, _transform(model.input_resolution.item())


def tokenize(texts: Union[str, List[str]], context_length: int = 77, truncate: bool = False) -> torch.LongTensor:
    """
    Returns the tokenized representation of given input string(s)

    Parameters
    ----------
    texts : Union[str, List[str]]
        An input string or a list of input strings to tokenize

    context_length : int
        The context length to use; all CLIP models use 77 as the context length

    truncate: bool
        Whether to truncate the text in case its encoding is longer than the context length

    Returns
    -------
    A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
    """
    if isinstance(texts, str):
        texts = [texts]  # ['a photo of a face.']

    sot_token = _tokenizer.encoder["<|startoftext|>"]  # 49406
    eot_token = _tokenizer.encoder["<|endoftext|>"]  # 49407
    all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] for text in texts]
    result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)  # 1,77

    for i, tokens in enumerate(all_tokens):
        if len(tokens) > context_length:  # context_length 77
            if truncate:
                tokens = tokens[:context_length]
                tokens[-1] = eot_token
            else:
                raise RuntimeError(f"Input {texts[i]} is too long for context length {context_length}")
        result[i, :len(tokens)] = torch.tensor(tokens)

    return result
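For reference, a minimal sketch (not from this commit) of tokenize as documented above, assuming boxmot (with its bundled BPE vocabulary), ftfy and regex are installed: strings are wrapped in start/end tokens and padded to the 77-token context length.

import torch
from boxmot.appearance.backbones.clip.clip.clip import tokenize

tokens = tokenize(["a photo of a face."])   # LongTensor of shape [1, 77]
assert tokens.shape == (1, 77)
assert tokens.dtype == torch.long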
boxmot/appearance/backbones/clip/clip/model.py
ADDED
@@ -0,0 +1,504 @@
# Mikel Broström 🔥 Yolo Tracking 🧾 AGPL-3.0 license

from collections import OrderedDict
from typing import Tuple, Union

import numpy as np
import torch
import torch.nn.functional as F
from torch import nn


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1):
        super().__init__()

        # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
        self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)

        self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()

        self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)

        self.relu = nn.ReLU(inplace=True)
        self.downsample = None
        self.stride = stride

        if stride > 1 or inplanes != planes * Bottleneck.expansion:
            # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
            self.downsample = nn.Sequential(OrderedDict([
                ("-1", nn.AvgPool2d(stride)),
                ("0", nn.Conv2d(inplanes, planes * self.expansion, 1, stride=1, bias=False)),
                ("1", nn.BatchNorm2d(planes * self.expansion))
            ]))

    def forward(self, x: torch.Tensor):
        identity = x

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.avgpool(out)
        out = self.bn3(self.conv3(out))

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)
        return out


class AttentionPool2d(nn.Module):
    def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
        super().__init__()
        self.positional_embedding = nn.Parameter(torch.randn(spacial_dim + 1, embed_dim) / embed_dim ** 0.5)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
        self.num_heads = num_heads

    def forward(self, x):
        # NCHW -> (HW)NC #32,2048,7,7 ->49, 32, 2048
        x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3]).permute(2, 0, 1)
        x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0)  # (HW+1)NC 50,32,2048
        x = x + self.positional_embedding[:, None, :].to(x.dtype)  # (HW+1)NC
        x, _ = F.multi_head_attention_forward(
            query=x, key=x, value=x,
            embed_dim_to_check=x.shape[-1],
            num_heads=self.num_heads,
            q_proj_weight=self.q_proj.weight,
            k_proj_weight=self.k_proj.weight,
            v_proj_weight=self.v_proj.weight,
            in_proj_weight=None,
            in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
            bias_k=None,
            bias_v=None,
            add_zero_attn=False,
            dropout_p=0,
            out_proj_weight=self.c_proj.weight,
            out_proj_bias=self.c_proj.bias,
            use_separate_proj_weight=True,
            training=self.training,
            need_weights=False
        )

        return x


class ModifiedResNet(nn.Module):
    """
    A ResNet class that is similar to torchvision's but contains the following changes:
    - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
    - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
    - The final pooling layer is a QKV attention instead of an average pool
    """

    def __init__(self, layers, output_dim, heads, input_resolution=224, width=64):
        super().__init__()
        self.output_dim = output_dim
        self.input_resolution = input_resolution

        # the 3-layer stem
        self.conv1 = nn.Conv2d(3, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(width // 2)
        self.conv2 = nn.Conv2d(width // 2, width // 2, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(width // 2)
        self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False)
        self.bn3 = nn.BatchNorm2d(width)
        self.avgpool = nn.AvgPool2d(2)
        self.relu = nn.ReLU(inplace=True)

        # residual layers
        self._inplanes = width  # this is a *mutable* variable used during construction
        self.layer1 = self._make_layer(width, layers[0])
        self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
        self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
        self.layer4 = self._make_layer(width * 8, layers[3], stride=1)
        embed_dim = width * 32  # the ResNet feature dimension
        self.attnpool = AttentionPool2d(input_resolution, embed_dim, heads, output_dim)

    def _make_layer(self, planes, blocks, stride=1):
        layers = [Bottleneck(self._inplanes, planes, stride)]

        self._inplanes = planes * Bottleneck.expansion
        for _ in range(1, blocks):
            layers.append(Bottleneck(self._inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        def stem(x):
            for conv, bn in [(self.conv1, self.bn1), (self.conv2, self.bn2), (self.conv3, self.bn3)]:
                x = self.relu(bn(conv(x)))
            x = self.avgpool(x)
            return x

        x = x.type(self.conv1.weight.dtype)
        x = stem(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x3 = self.layer3(x)
        x4 = self.layer4(x3)
        xproj = self.attnpool(x4)

        return x3, x4, xproj


class LayerNorm(nn.LayerNorm):
    """Subclass torch's LayerNorm to handle fp16."""

    def forward(self, x: torch.Tensor):
        orig_type = x.dtype
        for param in self.parameters():
            if param.dtype == torch.float16:
                param.data = param.data.to(torch.float32)
        ret = super().forward(x.to(torch.float32))
        return ret.to(orig_type)


class QuickGELU(nn.Module):
    def forward(self, x: torch.Tensor):
        return x * torch.sigmoid(1.702 * x)


class ResidualAttentionBlock(nn.Module):
    def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None):
        super().__init__()

        self.attn = nn.MultiheadAttention(d_model, n_head)
        self.ln_1 = LayerNorm(d_model)
        self.mlp = nn.Sequential(OrderedDict([
            ("c_fc", nn.Linear(d_model, d_model * 4)),
            ("gelu", QuickGELU()),
            ("c_proj", nn.Linear(d_model * 4, d_model))
        ]))
        self.ln_2 = LayerNorm(d_model)
        self.attn_mask = attn_mask

    def attention(self, x: torch.Tensor):
        self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
        return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]

    def forward(self, x: torch.Tensor):
        x = x + self.attention(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x


class Transformer(nn.Module):
    def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None):
        super().__init__()
        self.width = width
        self.layers = layers
        self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)])

    def forward(self, x: torch.Tensor):
        return self.resblocks(x)


class VisionTransformer(nn.Module):
    def __init__(
        self,
        h_resolution: int,
        w_resolution: int,
        patch_size: int,
        stride_size: int,
        width: int,
        layers: int,
        heads: int,
        output_dim: int
    ):
        super().__init__()
        self.h_resolution = h_resolution
        self.w_resolution = w_resolution
        self.output_dim = output_dim
        self.conv1 = nn.Conv2d(
            in_channels=3,
            out_channels=width,
            kernel_size=patch_size,
            stride=stride_size,
            bias=False
        )

        scale = width ** -0.5
        self.class_embedding = nn.Parameter(scale * torch.randn(width))
        self.positional_embedding = nn.Parameter(scale * torch.randn(h_resolution * w_resolution + 1, width))
        self.ln_pre = LayerNorm(width)

        self.transformer = Transformer(width, layers, heads)

        self.ln_post = LayerNorm(width)
        self.proj = nn.Parameter(scale * torch.randn(width, output_dim))

    def forward(self, x: torch.Tensor, cv_emb=None):
        x = self.conv1(x)  # shape = [*, width, grid, grid]
        x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
        x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
        x = torch.cat([self.class_embedding.to(x.dtype) +
                       # shape = [*, grid ** 2 + 1, width]
                       torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1)
        if cv_emb is not None:
            x[:, 0] = x[:, 0] + cv_emb
        x = x + self.positional_embedding.to(x.dtype)
        x = self.ln_pre(x)

        x = x.permute(1, 0, 2)  # NLD -> LND

        x11 = self.transformer.resblocks[:11](x)
        x12 = self.transformer.resblocks[11](x11)
        x11 = x11.permute(1, 0, 2)  # LND -> NLD
        x12 = x12.permute(1, 0, 2)  # LND -> NLD

        x12 = self.ln_post(x12)

        if self.proj is not None:
            xproj = x12 @ self.proj

        return x11, x12, xproj


class CLIP(nn.Module):
    def __init__(self,
                 embed_dim: int,
                 # vision
                 image_resolution: int,
                 vision_layers: Union[Tuple[int, int, int, int], int],
                 vision_width: int,
                 vision_patch_size: int,
                 vision_stride_size: int,
                 # text
                 context_length: int,
                 vocab_size: int,
                 transformer_width: int,
                 transformer_heads: int,
                 transformer_layers: int,
                 h_resolution: int,
                 w_resolution: int
                 ):
        super().__init__()

        self.context_length = context_length

        if isinstance(vision_layers, (tuple, list)):
            vision_heads = vision_width * 32 // 64
            self.visual = ModifiedResNet(
                layers=vision_layers,
                output_dim=embed_dim,
                heads=vision_heads,
                input_resolution=h_resolution * w_resolution,
                width=vision_width
            )
        else:
            vision_heads = vision_width // 64
            self.visual = VisionTransformer(
                h_resolution=h_resolution,
                w_resolution=w_resolution,
                patch_size=vision_patch_size,
                stride_size=vision_stride_size,
                width=vision_width,
                layers=vision_layers,
                heads=vision_heads,
                output_dim=embed_dim
            )

        self.transformer = Transformer(
            width=transformer_width,
            layers=transformer_layers,
            heads=transformer_heads,
            attn_mask=self.build_attention_mask()
        )

        self.vocab_size = vocab_size
        self.token_embedding = nn.Embedding(vocab_size, transformer_width)
        self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width))
        self.ln_final = LayerNorm(transformer_width)

        self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim))
        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))

        self.initialize_parameters()

    def initialize_parameters(self):
        nn.init.normal_(self.token_embedding.weight, std=0.02)
        nn.init.normal_(self.positional_embedding, std=0.01)

        if isinstance(self.visual, ModifiedResNet):
            if self.visual.attnpool is not None:
                std = self.visual.attnpool.c_proj.in_features ** -0.5
                nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std)
                nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std)
                nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std)
                nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std)

            for resnet_block in [self.visual.layer1, self.visual.layer2, self.visual.layer3, self.visual.layer4]:
                for name, param in resnet_block.named_parameters():
                    if name.endswith("bn3.weight"):
                        nn.init.zeros_(param)

        proj_std = (self.transformer.width ** -0.5) * ((2 * self.transformer.layers) ** -0.5)
        attn_std = self.transformer.width ** -0.5
        fc_std = (2 * self.transformer.width) ** -0.5
        for block in self.transformer.resblocks:
            nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
            nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
            nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
            nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)

        if self.text_projection is not None:
            nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5)

    def build_attention_mask(self):
        # lazily create causal attention mask, with full attention between the vision tokens
        # pytorch uses additive attention mask; fill with -inf
        mask = torch.empty(self.context_length, self.context_length)
        mask.fill_(float("-inf"))
        mask.triu_(1)  # zero out the lower diagonal
        return mask

    @property
    def dtype(self):
        return self.visual.conv1.weight.dtype

    def encode_image(self, image):
        return self.visual(image.type(self.dtype))

    def encode_text(self, text):
        x = self.token_embedding(text).type(self.dtype)

        x = x + self.positional_embedding.type(self.dtype)
        x = x.permute(1, 0, 2)
        x = self.transformer(x)
        x = x.permute(1, 0, 2)
        x = self.ln_final(x).type(self.dtype)

        x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection

        return x

    def forward(self, image, text):
        image_features = self.encode_image(image)
        text_features = self.encode_text(text)

        # normalized features
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        # cosine similarity as logits
        logit_scale = self.logit_scale.exp()
        logits_per_image = logit_scale * image_features @ text_features.t()
        logits_per_text = logit_scale * text_features @ image_features.t()

        # shape = [global_batch_size, global_batch_size]
        return logits_per_image, logits_per_text


def convert_weights(model: nn.Module):
    """Convert applicable model parameters to fp16"""

    def _convert_weights_to_fp16(l):
        if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)):
            l.weight.data = l.weight.data.float()
            if l.bias is not None:
                l.bias.data = l.bias.data.float()

        if isinstance(l, nn.MultiheadAttention):
            for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]:
                tensor = getattr(l, attr)
                if tensor is not None:
                    tensor.data = tensor.data.float()

        for name in ["text_projection", "proj"]:
            if hasattr(l, name):
                attr = getattr(l, name)
                if attr is not None:
                    attr.data = attr.data.float()

    model.apply(_convert_weights_to_fp16)


def build_model(state_dict: dict, h_resolution: int, w_resolution: int, vision_stride_size: int):
    vit = "visual.proj" in state_dict

    if vit:
        vision_width = state_dict["visual.conv1.weight"].shape[0]
        vision_layers = len([k for k in state_dict.keys() if k.startswith("visual.") and
                             k.endswith(".attn.in_proj_weight")])
        vision_patch_size = state_dict["visual.conv1.weight"].shape[-1]
        grid_size = round((state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5)
        image_resolution = vision_patch_size * grid_size
    else:  # RN50
        counts: list = [len(set(k.split(".")[2] for k in state_dict if
                                k.startswith(f"visual.layer{b}"))) for b in [1, 2, 3, 4]]
        vision_layers = tuple(counts)
        vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0]
        output_width = round((state_dict["visual.attnpool.positional_embedding"].shape[0] - 1) ** 0.5)
        vision_patch_size = None
        assert output_width ** 2 + 1 == state_dict["visual.attnpool.positional_embedding"].shape[0]
        image_resolution = output_width * 32

    embed_dim = state_dict["text_projection"].shape[1]
    context_length = state_dict["positional_embedding"].shape[0]  # 77 (77,512)
    vocab_size = state_dict["token_embedding.weight"].shape[0]
    transformer_width = state_dict["ln_final.weight"].shape[0]
    transformer_heads = transformer_width // 64
    transformer_layers = len(set(k.split(".")[2] for k in state_dict if k.startswith("transformer.resblocks")))

    model = CLIP(
        embed_dim,
        image_resolution, vision_layers, vision_width, vision_patch_size, vision_stride_size,
        context_length, vocab_size, transformer_width, transformer_heads, transformer_layers,
        h_resolution, w_resolution
    )
    if vit:
        state_dict["visual.positional_embedding"] = resize_pos_embed(
            state_dict["visual.positional_embedding"],
            model.visual.positional_embedding,
            h_resolution,
            w_resolution
        )
    else:  # RN50
        state_dict["visual.attnpool.positional_embedding"] = resize_pos_embed(
            state_dict["visual.attnpool.positional_embedding"],
            model.visual.attnpool.positional_embedding,
            h_resolution,
            w_resolution
        )

    for key in ["input_resolution", "context_length", "vocab_size"]:
        if key in state_dict:
            del state_dict[key]

    convert_weights(model)

    model.load_state_dict(state_dict)
    return model.eval()


import math


def resize_pos_embed(posemb, posemb_new, hight, width):
    # Rescale the grid of position embeddings when loading from state_dict. Adapted from
    # https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224
    print('Resized position embedding: %s to %s', posemb.shape, posemb_new.shape)

    ntok_new = posemb_new.shape[0]  # 129,2048

    posemb_token, posemb_grid = posemb[:1], posemb[1:]
    ntok_new -= 1

    gs_old = int(math.sqrt(len(posemb_grid)))  # 14
    print('Position embedding resize to height:{} width: {}'.format(hight, width))
    posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2)
    posemb_grid = F.interpolate(posemb_grid, size=(hight, width), mode='bilinear')
    posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, hight * width, -1)
    posemb = torch.cat([posemb_token, posemb_grid.squeeze()], dim=0)
    return posemb
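For reference (not from this commit), the additive causal mask produced by CLIP.build_attention_mask above follows this pattern; a standalone sketch with a context length of 4:

import torch

context_length = 4
mask = torch.empty(context_length, context_length)
mask.fill_(float("-inf"))
mask.triu_(1)  # -inf strictly above the diagonal, 0 elsewhere
print(mask)
# tensor([[0., -inf, -inf, -inf],
#         [0., 0., -inf, -inf],
#         [0., 0., 0., -inf],
#         [0., 0., 0., 0.]])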
boxmot/appearance/backbones/clip/clip/simple_tokenizer.py
ADDED
@@ -0,0 +1,136 @@
# Mikel Broström 🔥 Yolo Tracking 🧾 AGPL-3.0 license

import gzip
import html
from functools import lru_cache

import ftfy
import regex as re

from boxmot.utils import BOXMOT


@lru_cache()
def default_bpe():
    return BOXMOT / "appearance/backbones/clip/clip/bpe_simple_vocab_16e6.txt.gz"


@lru_cache()
def bytes_to_unicode():
    """
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a signficant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
    bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2 ** 8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))


def get_pairs(word):
    """Return set of symbol pairs in a word.
    Word is represented as tuple of symbols (symbols being variable-length strings).
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs


def basic_clean(text):
    text = ftfy.fix_text(text)
    text = html.unescape(html.unescape(text))
    return text.strip()


def whitespace_clean(text):
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text


class SimpleTokenizer(object):
    def __init__(self, bpe_path: str = default_bpe()):
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        merges = gzip.open(bpe_path).read().decode("utf-8").split('\n')
        merges = merges[1:49152 - 256 - 2 + 1]
        merges = [tuple(merge.split()) for merge in merges]
        vocab = list(bytes_to_unicode().values())
        vocab = vocab + [v + '</w>' for v in vocab]
        for merge in merges:
            vocab.append(''.join(merge))
        vocab.extend(['<|startoftext|>', '<|endoftext|>'])
        self.encoder = dict(zip(vocab, range(len(vocab))))
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'}
        self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)  # noqa: E501

    def bpe(self, token):
        if token in self.cache:
            return self.cache[token]
        word = tuple(token[:-1]) + (token[-1] + '</w>',)
        pairs = get_pairs(word)

        if not pairs:
            return token + '</w>'

        while True:
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except Exception:
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = ' '.join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        bpe_tokens = []
        text = whitespace_clean(basic_clean(text)).lower()
        for token in re.findall(self.pat, text):
            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
        return bpe_tokens

    def decode(self, tokens):
        text = ''.join([self.decoder[token] for token in tokens])
        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ')
        return text
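For reference, a minimal round-trip sketch (not from this commit), assuming boxmot is installed so the bundled bpe_simple_vocab_16e6.txt.gz can be located via default_bpe().

from boxmot.appearance.backbones.clip.clip.simple_tokenizer import SimpleTokenizer

tok = SimpleTokenizer()                  # loads the bundled BPE merges
ids = tok.encode("a photo of a face.")   # list of BPE token ids
print(tok.decode(ids))                   # roughly "a photo of a face ."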
boxmot/appearance/backbones/clip/config/__init__.py
ADDED
@@ -0,0 +1 @@
# Mikel Broström 🔥 Yolo Tracking 🧾 AGPL-3.0 license
boxmot/appearance/backbones/clip/config/defaults.py
ADDED
@@ -0,0 +1,239 @@
# Mikel Broström 🔥 Yolo Tracking 🧾 AGPL-3.0 license

from yacs.config import CfgNode as CN

# -----------------------------------------------------------------------------
# Convention about Training / Test specific parameters
# -----------------------------------------------------------------------------
# Whenever an argument can be either used for training or for testing, the
# corresponding name will be post-fixed by a _TRAIN for a training parameter,

# -----------------------------------------------------------------------------
# Config definition
# -----------------------------------------------------------------------------

_C = CN()
# -----------------------------------------------------------------------------
# MODEL
# -----------------------------------------------------------------------------
_C.MODEL = CN()
# Using cuda or cpu for training
_C.MODEL.DEVICE = "cuda"
# ID number of GPU
_C.MODEL.DEVICE_ID = '0'
# Name of backbone
_C.MODEL.NAME = 'ViT-B-16'
# Last stride of backbone
_C.MODEL.LAST_STRIDE = 1
# Path to pretrained model of backbone
_C.MODEL.PRETRAIN_PATH = '/home/mikel.brostrom/yolo_tracking/clip_market1501.pt'

# Use ImageNet pretrained model to initialize backbone or use self trained model to initialize the whole model
# Options: 'imagenet' , 'self' , 'finetune'
_C.MODEL.PRETRAIN_CHOICE = 'imagenet'

# If train with BNNeck, options: 'bnneck' or 'no'
_C.MODEL.NECK = 'bnneck'
# If train loss include center loss, options: 'yes' or 'no'. Loss with center loss has different optimizer configuration
_C.MODEL.IF_WITH_CENTER = 'no'

_C.MODEL.ID_LOSS_TYPE = 'softmax'
_C.MODEL.ID_LOSS_WEIGHT = 1.0
_C.MODEL.TRIPLET_LOSS_WEIGHT = 1.0
_C.MODEL.I2T_LOSS_WEIGHT = 1.0

_C.MODEL.METRIC_LOSS_TYPE = 'triplet'
# If train with multi-gpu ddp mode, options: 'True', 'False'
_C.MODEL.DIST_TRAIN = False
# If train with soft triplet loss, options: 'True', 'False'
_C.MODEL.NO_MARGIN = False
# If train with label smooth, options: 'on', 'off'
_C.MODEL.IF_LABELSMOOTH = 'on'
# If train with arcface loss, options: 'True', 'False'
_C.MODEL.COS_LAYER = False

# Transformer setting
_C.MODEL.DROP_PATH = 0.1
_C.MODEL.DROP_OUT = 0.0
_C.MODEL.ATT_DROP_RATE = 0.0
_C.MODEL.TRANSFORMER_TYPE = 'None'
_C.MODEL.STRIDE_SIZE = [16, 16]

# SIE Parameter
_C.MODEL.SIE_COE = 3.0
_C.MODEL.SIE_CAMERA = False
_C.MODEL.SIE_VIEW = False

# -----------------------------------------------------------------------------
# INPUT
# -----------------------------------------------------------------------------
_C.INPUT = CN()
# Size of the image during training
_C.INPUT.SIZE_TRAIN = [256, 128]
# Size of the image during test
_C.INPUT.SIZE_TEST = [256, 128]
# Random probability for image horizontal flip
_C.INPUT.PROB = 0.5
# Random probability for random erasing
_C.INPUT.RE_PROB = 0.5
# Values to be used for image normalization
_C.INPUT.PIXEL_MEAN = [0.485, 0.456, 0.406]
# Values to be used for image normalization
_C.INPUT.PIXEL_STD = [0.229, 0.224, 0.225]
# Value of padding size
_C.INPUT.PADDING = 10

# -----------------------------------------------------------------------------
# Dataset
# -----------------------------------------------------------------------------
_C.DATASETS = CN()
# List of the dataset names for training, as present in paths_catalog.py
_C.DATASETS.NAMES = ('market1501')
# Root directory where datasets should be used (and downloaded if not found)
_C.DATASETS.ROOT_DIR = ('../data')


# -----------------------------------------------------------------------------
# DataLoader
# -----------------------------------------------------------------------------
_C.DATALOADER = CN()
# Number of data loading threads
_C.DATALOADER.NUM_WORKERS = 8
# Sampler for data loading
_C.DATALOADER.SAMPLER = 'softmax'
# Number of instance for one batch
_C.DATALOADER.NUM_INSTANCE = 16

# ---------------------------------------------------------------------------- #
# Solver
_C.SOLVER = CN()
_C.SOLVER.SEED = 1234
_C.SOLVER.MARGIN = 0.3

# stage1
# ---------------------------------------------------------------------------- #
# Name of optimizer
_C.SOLVER.STAGE1 = CN()

_C.SOLVER.STAGE1.IMS_PER_BATCH = 64

_C.SOLVER.STAGE1.OPTIMIZER_NAME = "Adam"
# Number of max epoches
_C.SOLVER.STAGE1.MAX_EPOCHS = 100
# Base learning rate
_C.SOLVER.STAGE1.BASE_LR = 3e-4
# Momentum
_C.SOLVER.STAGE1.MOMENTUM = 0.9

# Settings of weight decay
_C.SOLVER.STAGE1.WEIGHT_DECAY = 0.0005
_C.SOLVER.STAGE1.WEIGHT_DECAY_BIAS = 0.0005

# warm up factor
_C.SOLVER.STAGE1.WARMUP_FACTOR = 0.01
# warm up epochs
_C.SOLVER.STAGE1.WARMUP_EPOCHS = 5
_C.SOLVER.STAGE1.WARMUP_LR_INIT = 0.01
_C.SOLVER.STAGE1.LR_MIN = 0.000016

_C.SOLVER.STAGE1.WARMUP_ITERS = 500
# method of warm up, option: 'constant','linear'
_C.SOLVER.STAGE1.WARMUP_METHOD = "linear"

_C.SOLVER.STAGE1.COSINE_MARGIN = 0.5
_C.SOLVER.STAGE1.COSINE_SCALE = 30

# epoch number of saving checkpoints
_C.SOLVER.STAGE1.CHECKPOINT_PERIOD = 10
# iteration of display training log
_C.SOLVER.STAGE1.LOG_PERIOD = 100
# epoch number of validation
# Number of images per batch
# This is global, so if we have 8 GPUs and IMS_PER_BATCH = 128, each GPU will
# contain 16 images per batch
# _C.SOLVER.STAGE1.IMS_PER_BATCH = 64
_C.SOLVER.STAGE1.EVAL_PERIOD = 10

# ---------------------------------------------------------------------------- #
# Solver
# stage1
# ---------------------------------------------------------------------------- #
_C.SOLVER.STAGE2 = CN()

_C.SOLVER.STAGE2.IMS_PER_BATCH = 64
# Name of optimizer
_C.SOLVER.STAGE2.OPTIMIZER_NAME = "Adam"
# Number of max epoches
_C.SOLVER.STAGE2.MAX_EPOCHS = 100
# Base learning rate
_C.SOLVER.STAGE2.BASE_LR = 3e-4
# Whether using larger learning rate for fc layer
_C.SOLVER.STAGE2.LARGE_FC_LR = False
# Factor of learning bias
_C.SOLVER.STAGE2.BIAS_LR_FACTOR = 1
# Momentum
_C.SOLVER.STAGE2.MOMENTUM = 0.9
# Margin of triplet loss
# Learning rate of SGD to learn the centers of center loss
_C.SOLVER.STAGE2.CENTER_LR = 0.5
# Balanced weight of center loss
_C.SOLVER.STAGE2.CENTER_LOSS_WEIGHT = 0.0005

# Settings of weight decay
_C.SOLVER.STAGE2.WEIGHT_DECAY = 0.0005
_C.SOLVER.STAGE2.WEIGHT_DECAY_BIAS = 0.0005

# decay rate of learning rate
_C.SOLVER.STAGE2.GAMMA = 0.1
# decay step of learning rate
_C.SOLVER.STAGE2.STEPS = (40, 70)
# warm up factor
_C.SOLVER.STAGE2.WARMUP_FACTOR = 0.01
# warm up epochs
_C.SOLVER.STAGE2.WARMUP_EPOCHS = 5
_C.SOLVER.STAGE2.WARMUP_LR_INIT = 0.01
_C.SOLVER.STAGE2.LR_MIN = 0.000016


_C.SOLVER.STAGE2.WARMUP_ITERS = 500
# method of warm up, option: 'constant','linear'
_C.SOLVER.STAGE2.WARMUP_METHOD = "linear"

_C.SOLVER.STAGE2.COSINE_MARGIN = 0.5
_C.SOLVER.STAGE2.COSINE_SCALE = 30

# epoch number of saving checkpoints
_C.SOLVER.STAGE2.CHECKPOINT_PERIOD = 10
# iteration of display training log
_C.SOLVER.STAGE2.LOG_PERIOD = 100
# epoch number of validation
_C.SOLVER.STAGE2.EVAL_PERIOD = 10
# Number of images per batch
# This is global, so if we have 8 GPUs and IMS_PER_BATCH = 128, each GPU will
# contain 16 images per batch

# ---------------------------------------------------------------------------- #
# TEST
# ---------------------------------------------------------------------------- #

_C.TEST = CN()
# Number of images per batch during test
_C.TEST.IMS_PER_BATCH = 128
# If test with re-ranking, options: 'True','False'
_C.TEST.RE_RANKING = False
# Path to trained model
_C.TEST.WEIGHT = ""
# Which feature of BNNeck to be used for test, before or after BNNneck, options: 'before' or 'after'
_C.TEST.NECK_FEAT = 'after'
# Whether feature is nomalized before test, if yes, it is equivalent to cosine distance
_C.TEST.FEAT_NORM = 'yes'

# Name for saving the distmat after testing.
_C.TEST.DIST_MAT = "dist_mat.npy"
# Whether calculate the eval score option: 'True', 'False'
_C.TEST.EVAL = False
# ---------------------------------------------------------------------------- #
# Misc options
# ---------------------------------------------------------------------------- #
# Path to checkpoint and saved log of trained model
_C.OUTPUT_DIR = ""
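Typical yacs usage of the node defined above, shown as a hedged sketch (not part of the commit; it relies only on standard CfgNode calls such as clone, merge_from_list and freeze):

# Illustrative sketch: work on a clone of the module-level _C, never on _C itself.
from boxmot.appearance.backbones.clip.config.defaults import _C

cfg = _C.clone()
cfg.merge_from_list(["MODEL.NAME", "RN50"])   # override individual keys
cfg.freeze()                                  # make the node read-only before passing it around
print(cfg.MODEL.NAME, cfg.INPUT.SIZE_TRAIN)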
boxmot/appearance/backbones/clip/config/defaults_base.py
ADDED
@@ -0,0 +1,190 @@
# Mikel Broström 🔥 Yolo Tracking 🧾 AGPL-3.0 license

from yacs.config import CfgNode as CN

# -----------------------------------------------------------------------------
# Convention about Training / Test specific parameters
# -----------------------------------------------------------------------------
# Whenever an argument can be either used for training or for testing, the
# corresponding name will be post-fixed by a _TRAIN for a training parameter,

# -----------------------------------------------------------------------------
# Config definition
# -----------------------------------------------------------------------------

_C = CN()
# -----------------------------------------------------------------------------
# MODEL
# -----------------------------------------------------------------------------
_C.MODEL = CN()
# Using cuda or cpu for training
_C.MODEL.DEVICE = "cuda"
# ID number of GPU
_C.MODEL.DEVICE_ID = '0'
# Name of backbone
_C.MODEL.NAME = 'resnet50'
# Last stride of backbone
_C.MODEL.LAST_STRIDE = 1
# Path to pretrained model of backbone
_C.MODEL.PRETRAIN_PATH = ''

# Use ImageNet pretrained model to initialize backbone or use self trained model to initialize the whole model
# Options: 'imagenet' , 'self' , 'finetune'
_C.MODEL.PRETRAIN_CHOICE = 'imagenet'

# If train with BNNeck, options: 'bnneck' or 'no'
_C.MODEL.NECK = 'bnneck'
# If train loss include center loss, options: 'yes' or 'no'. Loss with center loss has different optimizer configuration
_C.MODEL.IF_WITH_CENTER = 'no'

_C.MODEL.ID_LOSS_TYPE = 'softmax'
_C.MODEL.ID_LOSS_WEIGHT = 1.0
_C.MODEL.TRIPLET_LOSS_WEIGHT = 1.0
_C.MODEL.I2T_LOSS_WEIGHT = 1.0

_C.MODEL.METRIC_LOSS_TYPE = 'triplet'
# If train with multi-gpu ddp mode, options: 'True', 'False'
_C.MODEL.DIST_TRAIN = False
# If train with soft triplet loss, options: 'True', 'False'
_C.MODEL.NO_MARGIN = False
# If train with label smooth, options: 'on', 'off'
_C.MODEL.IF_LABELSMOOTH = 'on'
# If train with arcface loss, options: 'True', 'False'
_C.MODEL.COS_LAYER = False

# Transformer setting
_C.MODEL.DROP_PATH = 0.1
_C.MODEL.DROP_OUT = 0.0
_C.MODEL.ATT_DROP_RATE = 0.0
_C.MODEL.TRANSFORMER_TYPE = 'None'
_C.MODEL.STRIDE_SIZE = [16, 16]

# SIE Parameter
_C.MODEL.SIE_COE = 3.0
_C.MODEL.SIE_CAMERA = False
_C.MODEL.SIE_VIEW = False

# -----------------------------------------------------------------------------
# INPUT
# -----------------------------------------------------------------------------
_C.INPUT = CN()
# Size of the image during training
_C.INPUT.SIZE_TRAIN = [384, 128]
# Size of the image during test
_C.INPUT.SIZE_TEST = [384, 128]
# Random probability for image horizontal flip
_C.INPUT.PROB = 0.5
# Random probability for random erasing
_C.INPUT.RE_PROB = 0.5
# Values to be used for image normalization
_C.INPUT.PIXEL_MEAN = [0.485, 0.456, 0.406]
# Values to be used for image normalization
_C.INPUT.PIXEL_STD = [0.229, 0.224, 0.225]
# Value of padding size
_C.INPUT.PADDING = 10

# -----------------------------------------------------------------------------
# Dataset
# -----------------------------------------------------------------------------
_C.DATASETS = CN()
# List of the dataset names for training, as present in paths_catalog.py
_C.DATASETS.NAMES = ('market1501')
# Root directory where datasets should be used (and downloaded if not found)
_C.DATASETS.ROOT_DIR = ('../data')


# -----------------------------------------------------------------------------
# DataLoader
# -----------------------------------------------------------------------------
_C.DATALOADER = CN()
# Number of data loading threads
_C.DATALOADER.NUM_WORKERS = 8
# Sampler for data loading
_C.DATALOADER.SAMPLER = 'softmax'
# Number of instance for one batch
_C.DATALOADER.NUM_INSTANCE = 16

# ---------------------------------------------------------------------------- #
# Solver
# ---------------------------------------------------------------------------- #
_C.SOLVER = CN()
# Name of optimizer
_C.SOLVER.OPTIMIZER_NAME = "Adam"
# Number of max epoches
_C.SOLVER.MAX_EPOCHS = 100
# Base learning rate
_C.SOLVER.BASE_LR = 3e-4
# Whether using larger learning rate for fc layer
_C.SOLVER.LARGE_FC_LR = False
# Factor of learning bias
_C.SOLVER.BIAS_LR_FACTOR = 1
# Factor of learning bias
_C.SOLVER.SEED = 1234
# Momentum
_C.SOLVER.MOMENTUM = 0.9
# Margin of triplet loss
_C.SOLVER.MARGIN = 0.3
# Learning rate of SGD to learn the centers of center loss
_C.SOLVER.CENTER_LR = 0.5
# Balanced weight of center loss
_C.SOLVER.CENTER_LOSS_WEIGHT = 0.0005

# Settings of weight decay
_C.SOLVER.WEIGHT_DECAY = 0.0005
_C.SOLVER.WEIGHT_DECAY_BIAS = 0.0005

# decay rate of learning rate
_C.SOLVER.GAMMA = 0.1
# decay step of learning rate
_C.SOLVER.STEPS = (40, 70)
# warm up factor
_C.SOLVER.WARMUP_FACTOR = 0.01
# warm up epochs
_C.SOLVER.WARMUP_EPOCHS = 5
_C.SOLVER.WARMUP_LR_INIT = 0.01
_C.SOLVER.LR_MIN = 0.000016


_C.SOLVER.WARMUP_ITERS = 500
# method of warm up, option: 'constant','linear'
_C.SOLVER.WARMUP_METHOD = "linear"

_C.SOLVER.COSINE_MARGIN = 0.5
_C.SOLVER.COSINE_SCALE = 30

# epoch number of saving checkpoints
_C.SOLVER.CHECKPOINT_PERIOD = 10
# iteration of display training log
_C.SOLVER.LOG_PERIOD = 100
# epoch number of validation
_C.SOLVER.EVAL_PERIOD = 10
# Number of images per batch
# This is global, so if we have 8 GPUs and IMS_PER_BATCH = 128, each GPU will
# contain 16 images per batch
_C.SOLVER.IMS_PER_BATCH = 64

# ---------------------------------------------------------------------------- #
# TEST
# ---------------------------------------------------------------------------- #

_C.TEST = CN()
# Number of images per batch during test
_C.TEST.IMS_PER_BATCH = 128
# If test with re-ranking, options: 'True','False'
_C.TEST.RE_RANKING = False
# Path to trained model
_C.TEST.WEIGHT = ""
# Which feature of BNNeck to be used for test, before or after BNNneck, options: 'before' or 'after'
_C.TEST.NECK_FEAT = 'after'
# Whether feature is nomalized before test, if yes, it is equivalent to cosine distance
_C.TEST.FEAT_NORM = 'yes'

# Name for saving the distmat after testing.
_C.TEST.DIST_MAT = "dist_mat.npy"
# Whether calculate the eval score option: 'True', 'False'
_C.TEST.EVAL = False
# ---------------------------------------------------------------------------- #
# Misc options
# ---------------------------------------------------------------------------- #
# Path to checkpoint and saved log of trained model
_C.OUTPUT_DIR = ""
boxmot/appearance/backbones/clip/make_model.py
ADDED
@@ -0,0 +1,161 @@
# Mikel Broström 🔥 Yolo Tracking 🧾 AGPL-3.0 license

import torch
import torch.nn as nn

from .clip.simple_tokenizer import SimpleTokenizer as _Tokenizer

_tokenizer = _Tokenizer()


def weights_init_kaiming(m):
    classname = m.__class__.__name__
    if classname.find('Linear') != -1:
        nn.init.kaiming_normal_(m.weight, a=0, mode='fan_out')
        nn.init.constant_(m.bias, 0.0)

    elif classname.find('Conv') != -1:
        nn.init.kaiming_normal_(m.weight, a=0, mode='fan_in')
        if m.bias is not None:
            nn.init.constant_(m.bias, 0.0)
    elif classname.find('BatchNorm') != -1:
        if m.affine:
            nn.init.constant_(m.weight, 1.0)
            nn.init.constant_(m.bias, 0.0)


def weights_init_classifier(m):
    classname = m.__class__.__name__
    if classname.find('Linear') != -1:
        nn.init.normal_(m.weight, std=0.001)
        if m.bias:
            nn.init.constant_(m.bias, 0.0)


class build_transformer(nn.Module):
    def __init__(self, num_classes, camera_num, view_num, cfg):
        super(build_transformer, self).__init__()
        self.model_name = cfg.MODEL.NAME
        self.cos_layer = cfg.MODEL.COS_LAYER
        self.neck = cfg.MODEL.NECK
        self.neck_feat = cfg.TEST.NECK_FEAT
        if self.model_name == 'ViT-B-16':
            self.in_planes = 768
            self.in_planes_proj = 512
        elif self.model_name == 'RN50':
            self.in_planes = 2048
            self.in_planes_proj = 1024
        self.num_classes = num_classes
        self.camera_num = camera_num
        self.view_num = view_num
        self.sie_coe = cfg.MODEL.SIE_COE

        self.classifier = nn.Linear(self.in_planes, self.num_classes, bias=False)
        self.classifier.apply(weights_init_classifier)
        self.classifier_proj = nn.Linear(self.in_planes_proj, self.num_classes, bias=False)
        self.classifier_proj.apply(weights_init_classifier)

        self.bottleneck = nn.BatchNorm1d(self.in_planes)
        self.bottleneck.bias.requires_grad_(False)
        self.bottleneck.apply(weights_init_kaiming)
        self.bottleneck_proj = nn.BatchNorm1d(self.in_planes_proj)
        self.bottleneck_proj.bias.requires_grad_(False)
        self.bottleneck_proj.apply(weights_init_kaiming)

        self.h_resolution = int((cfg.INPUT.SIZE_TRAIN[0] - 16) // cfg.MODEL.STRIDE_SIZE[0] + 1)
        self.w_resolution = int((cfg.INPUT.SIZE_TRAIN[1] - 16) // cfg.MODEL.STRIDE_SIZE[1] + 1)
        self.vision_stride_size = cfg.MODEL.STRIDE_SIZE[0]
        clip_model = load_clip_to_cpu(self.model_name, self.h_resolution, self.w_resolution, self.vision_stride_size)

        self.image_encoder = clip_model.visual

        # if cfg.MODEL.SIE_CAMERA and cfg.MODEL.SIE_VIEW:
        #     self.cv_embed = nn.Parameter(torch.zeros(camera_num * view_num, self.in_planes))
        #     trunc_normal_(self.cv_embed, std=.02)
        #     print('camera number is : {}'.format(camera_num))
        # elif cfg.MODEL.SIE_CAMERA:
        #     self.cv_embed = nn.Parameter(torch.zeros(camera_num, self.in_planes))
        #     trunc_normal_(self.cv_embed, std=.02)
        #     print('camera number is : {}'.format(camera_num))
        # elif cfg.MODEL.SIE_VIEW:
        #     self.cv_embed = nn.Parameter(torch.zeros(view_num, self.in_planes))
        #     trunc_normal_(self.cv_embed, std=.02)
        #     print('camera number is : {}'.format(view_num))

    def forward(self, x, label=None, cam_label=None, view_label=None):
        if self.model_name == 'RN50':
            image_features_last, image_features, image_features_proj = self.image_encoder(x)  # B,512 B,128,512
            img_feature_last = nn.functional.avg_pool2d(
                image_features_last,
                image_features_last.shape[2:4]).view(x.shape[0], -1)
            img_feature = nn.functional.avg_pool2d(
                image_features,
                image_features.shape[2:4]).view(x.shape[0], -1)
            img_feature_proj = image_features_proj[0]

        elif self.model_name == 'ViT-B-16':
            if cam_label is not None and view_label is not None:
                cv_embed = self.sie_coe * self.cv_embed[cam_label * self.view_num + view_label]
            elif cam_label is not None:
                cv_embed = self.sie_coe * self.cv_embed[cam_label]
            elif view_label is not None:
                cv_embed = self.sie_coe * self.cv_embed[view_label]
            else:
                cv_embed = None
            # B,512 B,128,512
            image_features_last, image_features, image_features_proj = self.image_encoder(x, cv_embed)
            img_feature_last = image_features_last[:, 0]
            img_feature = image_features[:, 0]
            img_feature_proj = image_features_proj[:, 0]

        feat = self.bottleneck(img_feature)
        feat_proj = self.bottleneck_proj(img_feature_proj)

        if self.training:
            cls_score = self.classifier(feat)
            cls_score_proj = self.classifier_proj(feat_proj)
            return [cls_score, cls_score_proj], [img_feature_last, img_feature, img_feature_proj]

        else:
            if self.neck_feat == 'after':
                # print("Test with feature after BN")
                return torch.cat([feat, feat_proj], dim=1)
            else:
                return torch.cat([img_feature, img_feature_proj], dim=1)

    def load_param(self, trained_path):
        param_dict = torch.load(trained_path, map_location=torch.device("cpu"))
        for i in self.state_dict():
            self.state_dict()[i.replace('module.', '')].copy_(param_dict[i])
        # print('Loading pretrained model from {}'.format('/home/mikel.brostrom/yolo_tracking/clip_market1501.pt'))

    def load_param_finetune(self, model_path):
        param_dict = torch.load(model_path)
        for i in param_dict:
            self.state_dict()[i].copy_(param_dict[i])
        # print('Loading pretrained model for finetuning from {}'.format(model_path))


def make_model(cfg, num_class, camera_num, view_num):
    model = build_transformer(num_class, camera_num, view_num, cfg)
    return model


from .clip import clip


def load_clip_to_cpu(backbone_name, h_resolution, w_resolution, vision_stride_size):
    url = clip._MODELS[backbone_name]
    model_path = clip._download(url)

    try:
        # loading JIT archive
        model = torch.jit.load(model_path, map_location="cpu").eval()
        state_dict = None

    except RuntimeError:
        state_dict = torch.load(model_path, map_location="cpu")

    model = clip.build_model(state_dict or model.state_dict(), h_resolution, w_resolution, vision_stride_size)

    return model
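A hedged sketch of how this factory is typically driven by the defaults above (not part of the commit; the class count 751 is a Market-1501-style assumption, and load_clip_to_cpu downloads the official CLIP weights on first use):

# Illustrative sketch: build the ViT-B-16 CLIP ReID backbone for inference.
from boxmot.appearance.backbones.clip.config.defaults import _C
from boxmot.appearance.backbones.clip.make_model import make_model

cfg = _C.clone()                                                  # MODEL.NAME = 'ViT-B-16' by default
model = make_model(cfg, num_class=751, camera_num=6, view_num=1).eval()
# In eval mode forward() returns torch.cat([feat, feat_proj], dim=1),
# i.e. a 768 + 512 = 1280-dim embedding per crop for ViT-B-16.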
boxmot/appearance/backbones/clip/make_model_clipreid.py
ADDED
@@ -0,0 +1,247 @@
# Mikel Broström 🔥 Yolo Tracking 🧾 AGPL-3.0 license

import torch
import torch.nn as nn

from .clip.simple_tokenizer import SimpleTokenizer as _Tokenizer

_tokenizer = _Tokenizer()


def weights_init_kaiming(m):
    classname = m.__class__.__name__
    if classname.find('Linear') != -1:
        nn.init.kaiming_normal_(m.weight, a=0, mode='fan_out')
        nn.init.constant_(m.bias, 0.0)

    elif classname.find('Conv') != -1:
        nn.init.kaiming_normal_(m.weight, a=0, mode='fan_in')
        if m.bias is not None:
            nn.init.constant_(m.bias, 0.0)
    elif classname.find('BatchNorm') != -1:
        if m.affine:
            nn.init.constant_(m.weight, 1.0)
            nn.init.constant_(m.bias, 0.0)


def weights_init_classifier(m):
    classname = m.__class__.__name__
    if classname.find('Linear') != -1:
        nn.init.normal_(m.weight, std=0.001)
        if m.bias:
            nn.init.constant_(m.bias, 0.0)


class TextEncoder(nn.Module):
    def __init__(self, clip_model):
        super().__init__()
        self.transformer = clip_model.transformer
        self.positional_embedding = clip_model.positional_embedding
        self.ln_final = clip_model.ln_final
        self.text_projection = clip_model.text_projection
        self.dtype = clip_model.dtype

    def forward(self, prompts, tokenized_prompts):
        x = prompts + self.positional_embedding.type(self.dtype)
        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.transformer(x)
        x = x.permute(1, 0, 2)  # LND -> NLD
        x = self.ln_final(x).type(self.dtype)

        # x.shape = [batch_size, n_ctx, transformer.width]
        # take features from the eot embedding (eot_token is the highest number in each sequence)
        x = x[torch.arange(x.shape[0]), tokenized_prompts.argmax(dim=-1)] @ self.text_projection
        return x


class build_transformer(nn.Module):
    def __init__(self, num_classes, camera_num, view_num, cfg):
        super(build_transformer, self).__init__()
        self.model_name = cfg.MODEL.NAME
        self.cos_layer = cfg.MODEL.COS_LAYER
        self.neck = cfg.MODEL.NECK
        self.neck_feat = cfg.TEST.NECK_FEAT
        if self.model_name == 'ViT-B-16':
            self.in_planes = 768
            self.in_planes_proj = 512
        elif self.model_name == 'RN50':
            self.in_planes = 2048
            self.in_planes_proj = 1024
        self.num_classes = num_classes
        self.camera_num = camera_num
        self.view_num = view_num
        self.sie_coe = cfg.MODEL.SIE_COE

        self.classifier = nn.Linear(self.in_planes, self.num_classes, bias=False)
        self.classifier.apply(weights_init_classifier)
        self.classifier_proj = nn.Linear(self.in_planes_proj, self.num_classes, bias=False)
        self.classifier_proj.apply(weights_init_classifier)

        self.bottleneck = nn.BatchNorm1d(self.in_planes)
        self.bottleneck.bias.requires_grad_(False)
        self.bottleneck.apply(weights_init_kaiming)
        self.bottleneck_proj = nn.BatchNorm1d(self.in_planes_proj)
        self.bottleneck_proj.bias.requires_grad_(False)
        self.bottleneck_proj.apply(weights_init_kaiming)

        self.h_resolution = int((cfg.INPUT.SIZE_TRAIN[0] - 16) // cfg.MODEL.STRIDE_SIZE[0] + 1)
        self.w_resolution = int((cfg.INPUT.SIZE_TRAIN[1] - 16) // cfg.MODEL.STRIDE_SIZE[1] + 1)
        self.vision_stride_size = cfg.MODEL.STRIDE_SIZE[0]
        clip_model = load_clip_to_cpu(self.model_name, self.h_resolution, self.w_resolution, self.vision_stride_size)

        self.image_encoder = clip_model.visual

        # if cfg.MODEL.SIE_CAMERA and cfg.MODEL.SIE_VIEW:
        #     self.cv_embed = nn.Parameter(torch.zeros(camera_num * view_num, self.in_planes))
        #     trunc_normal_(self.cv_embed, std=.02)
        #     print('camera number is : {}'.format(camera_num))
        # elif cfg.MODEL.SIE_CAMERA:
        #     self.cv_embed = nn.Parameter(torch.zeros(camera_num, self.in_planes))
        #     trunc_normal_(self.cv_embed, std=.02)
        #     print('camera number is : {}'.format(camera_num))
        # elif cfg.MODEL.SIE_VIEW:
        #     self.cv_embed = nn.Parameter(torch.zeros(view_num, self.in_planes))
        #     trunc_normal_(self.cv_embed, std=.02)
        #     print('camera number is : {}'.format(view_num))

        dataset_name = cfg.DATASETS.NAMES
        self.prompt_learner = PromptLearner(num_classes, dataset_name, clip_model.dtype, clip_model.token_embedding)
        self.text_encoder = TextEncoder(clip_model)

    def forward(self, x=None, label=None, get_image=False, get_text=False, cam_label=None, view_label=None):
        if get_text is True:
            prompts = self.prompt_learner(label)
            text_features = self.text_encoder(prompts, self.prompt_learner.tokenized_prompts)
            return text_features

        if get_image is True:
            image_features_last, image_features, image_features_proj = self.image_encoder(x)
            if self.model_name == 'RN50':
                return image_features_proj[0]
            elif self.model_name == 'ViT-B-16':
                return image_features_proj[:, 0]

        if self.model_name == 'RN50':
            image_features_last, image_features, image_features_proj = self.image_encoder(x)
            img_feature_last = nn.functional.avg_pool2d(
                image_features_last,
                image_features_last.shape[2:4]).view(x.shape[0], -1)
            img_feature = nn.functional.avg_pool2d(
                image_features,
                image_features.shape[2:4]).view(x.shape[0], -1)
            img_feature_proj = image_features_proj[0]

        elif self.model_name == 'ViT-B-16':
            if cam_label is not None and view_label is not None:
                cv_embed = self.sie_coe * self.cv_embed[cam_label * self.view_num + view_label]
            elif cam_label is not None:
                cv_embed = self.sie_coe * self.cv_embed[cam_label]
            elif view_label is not None:
                cv_embed = self.sie_coe * self.cv_embed[view_label]
            else:
                cv_embed = None
            image_features_last, image_features, image_features_proj = self.image_encoder(x, cv_embed)
            img_feature_last = image_features_last[:, 0]
            img_feature = image_features[:, 0]
            img_feature_proj = image_features_proj[:, 0]

        feat = self.bottleneck(img_feature)
        feat_proj = self.bottleneck_proj(img_feature_proj)

        if self.training:
            cls_score = self.classifier(feat)
            cls_score_proj = self.classifier_proj(feat_proj)
            return [cls_score, cls_score_proj], [img_feature_last, img_feature, img_feature_proj], img_feature_proj

        else:
            if self.neck_feat == 'after':
                # print("Test with feature after BN")
                return torch.cat([feat, feat_proj], dim=1)
            else:
                return torch.cat([img_feature, img_feature_proj], dim=1)

    def load_param(self, trained_path):
        param_dict = torch.load(trained_path)
        for i in param_dict:
            self.state_dict()[i.replace('module.', '')].copy_(param_dict[i])
        print('Loaded pretrained model from {}'.format(trained_path))

    def load_param_finetune(self, model_path):
        param_dict = torch.load(model_path)
        for i in param_dict:
            self.state_dict()[i].copy_(param_dict[i])
        print('Loading pretrained model for finetuning from {}'.format(model_path))


def make_model(cfg, num_class, camera_num, view_num):
    model = build_transformer(num_class, camera_num, view_num, cfg)
    return model


from .clip import clip


def load_clip_to_cpu(backbone_name, h_resolution, w_resolution, vision_stride_size):
    url = clip._MODELS[backbone_name]
    model_path = clip._download(url)

    try:
        # loading JIT archive
        model = torch.jit.load(model_path, map_location="cpu").eval()
        state_dict = None

    except RuntimeError:
        state_dict = torch.load(model_path, map_location="cpu")

    model = clip.build_model(state_dict or model.state_dict(), h_resolution, w_resolution, vision_stride_size)

    return model


class PromptLearner(nn.Module):
    def __init__(self, num_class, dataset_name, dtype, token_embedding):
        super().__init__()
        if dataset_name == "VehicleID" or dataset_name == "veri":
            ctx_init = "A photo of a X X X X vehicle."
        else:
            ctx_init = "A photo of a X X X X person."

        ctx_dim = 512
        # use given words to initialize context vectors
        ctx_init = ctx_init.replace("_", " ")
        n_ctx = 4

        tokenized_prompts = clip.tokenize(ctx_init).cuda()
        with torch.no_grad():
            embedding = token_embedding(tokenized_prompts).type(dtype)
        self.tokenized_prompts = tokenized_prompts  # torch.Tensor

        n_cls_ctx = 4
        cls_vectors = torch.empty(num_class, n_cls_ctx, ctx_dim, dtype=dtype)
        nn.init.normal_(cls_vectors, std=0.02)
        self.cls_ctx = nn.Parameter(cls_vectors)

        # These token vectors will be saved when in save_model(),
        # but they should be ignored in load_model() as we want to use
        # those computed using the current class names
        self.register_buffer("token_prefix", embedding[:, :n_ctx + 1, :])
        self.register_buffer("token_suffix", embedding[:, n_ctx + 1 + n_cls_ctx:, :])
        self.num_class = num_class
        self.n_cls_ctx = n_cls_ctx

    def forward(self, label):
        cls_ctx = self.cls_ctx[label]
        b = label.shape[0]
        prefix = self.token_prefix.expand(b, -1, -1)
        suffix = self.token_suffix.expand(b, -1, -1)

        prompts = torch.cat(
            [
                prefix,  # (n_cls, 1, dim)
                cls_ctx,  # (n_cls, n_ctx, dim)
                suffix,  # (n_cls, *, dim)
            ],
            dim=1,
        )

        return prompts
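The prompt assembly in PromptLearner.forward can be summarised with a shape sketch, using dummy tensors in place of the real CLIP embeddings (an illustration, not part of the commit):

# Illustrative sketch of the concatenation done by PromptLearner.forward.
import torch

batch, ctx_dim, n_ctx, n_cls_ctx = 8, 512, 4, 4
prefix = torch.zeros(1, n_ctx + 1, ctx_dim).expand(batch, -1, -1)    # SOT + "a photo of a"
cls_ctx = torch.zeros(batch, n_cls_ctx, ctx_dim)                     # learned per-identity context tokens
suffix = torch.zeros(1, 77 - (n_ctx + 1) - n_cls_ctx, ctx_dim).expand(batch, -1, -1)  # "person." + EOT + padding
prompts = torch.cat([prefix, cls_ctx, suffix], dim=1)                # (batch, 77, ctx_dim), fed to TextEncoder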
boxmot/appearance/backbones/hacnn.py
ADDED
@@ -0,0 +1,406 @@
| 1 |
+
# Mikel Broström 🔥 Yolo Tracking 🧾 AGPL-3.0 license
|
| 2 |
+
|
| 3 |
+
from __future__ import absolute_import, division
|
| 4 |
+
|
| 5 |
+
import torch
|
| 6 |
+
from torch import nn
|
| 7 |
+
from torch.nn import functional as F
|
| 8 |
+
|
| 9 |
+
__all__ = ["HACNN"]
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class ConvBlock(nn.Module):
|
| 13 |
+
"""Basic convolutional block.
|
| 14 |
+
|
| 15 |
+
convolution + batch normalization + relu.
|
| 16 |
+
|
| 17 |
+
Args:
|
| 18 |
+
in_c (int): number of input channels.
|
| 19 |
+
out_c (int): number of output channels.
|
| 20 |
+
k (int or tuple): kernel size.
|
| 21 |
+
s (int or tuple): stride.
|
| 22 |
+
p (int or tuple): padding.
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
def __init__(self, in_c, out_c, k, s=1, p=0):
|
| 26 |
+
super(ConvBlock, self).__init__()
|
| 27 |
+
self.conv = nn.Conv2d(in_c, out_c, k, stride=s, padding=p)
|
| 28 |
+
self.bn = nn.BatchNorm2d(out_c)
|
| 29 |
+
|
| 30 |
+
def forward(self, x):
|
| 31 |
+
return F.relu(self.bn(self.conv(x)))
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class InceptionA(nn.Module):
|
| 35 |
+
def __init__(self, in_channels, out_channels):
|
| 36 |
+
super(InceptionA, self).__init__()
|
| 37 |
+
mid_channels = out_channels // 4
|
| 38 |
+
|
| 39 |
+
self.stream1 = nn.Sequential(
|
| 40 |
+
ConvBlock(in_channels, mid_channels, 1),
|
| 41 |
+
ConvBlock(mid_channels, mid_channels, 3, p=1),
|
| 42 |
+
)
|
| 43 |
+
self.stream2 = nn.Sequential(
|
| 44 |
+
ConvBlock(in_channels, mid_channels, 1),
|
| 45 |
+
ConvBlock(mid_channels, mid_channels, 3, p=1),
|
| 46 |
+
)
|
| 47 |
+
self.stream3 = nn.Sequential(
|
| 48 |
+
ConvBlock(in_channels, mid_channels, 1),
|
| 49 |
+
ConvBlock(mid_channels, mid_channels, 3, p=1),
|
| 50 |
+
)
|
| 51 |
+
self.stream4 = nn.Sequential(
|
| 52 |
+
nn.AvgPool2d(3, stride=1, padding=1),
|
| 53 |
+
ConvBlock(in_channels, mid_channels, 1),
|
| 54 |
+
)
|
| 55 |
+
|
| 56 |
+
def forward(self, x):
|
| 57 |
+
s1 = self.stream1(x)
|
| 58 |
+
s2 = self.stream2(x)
|
| 59 |
+
s3 = self.stream3(x)
|
| 60 |
+
s4 = self.stream4(x)
|
| 61 |
+
y = torch.cat([s1, s2, s3, s4], dim=1)
|
| 62 |
+
return y
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
class InceptionB(nn.Module):
|
| 66 |
+
def __init__(self, in_channels, out_channels):
|
| 67 |
+
super(InceptionB, self).__init__()
|
| 68 |
+
mid_channels = out_channels // 4
|
| 69 |
+
|
| 70 |
+
self.stream1 = nn.Sequential(
|
| 71 |
+
ConvBlock(in_channels, mid_channels, 1),
|
| 72 |
+
ConvBlock(mid_channels, mid_channels, 3, s=2, p=1),
|
| 73 |
+
)
|
| 74 |
+
self.stream2 = nn.Sequential(
|
| 75 |
+
ConvBlock(in_channels, mid_channels, 1),
|
| 76 |
+
ConvBlock(mid_channels, mid_channels, 3, p=1),
|
| 77 |
+
ConvBlock(mid_channels, mid_channels, 3, s=2, p=1),
|
| 78 |
+
)
|
| 79 |
+
self.stream3 = nn.Sequential(
|
| 80 |
+
nn.MaxPool2d(3, stride=2, padding=1),
|
| 81 |
+
ConvBlock(in_channels, mid_channels * 2, 1),
|
| 82 |
+
)
|
| 83 |
+
|
| 84 |
+
def forward(self, x):
|
| 85 |
+
s1 = self.stream1(x)
|
| 86 |
+
s2 = self.stream2(x)
|
| 87 |
+
s3 = self.stream3(x)
|
| 88 |
+
y = torch.cat([s1, s2, s3], dim=1)
|
| 89 |
+
return y
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
class SpatialAttn(nn.Module):
|
| 93 |
+
"""Spatial Attention (Sec. 3.1.I.1)"""
|
| 94 |
+
|
| 95 |
+
def __init__(self):
|
| 96 |
+
super(SpatialAttn, self).__init__()
|
| 97 |
+
self.conv1 = ConvBlock(1, 1, 3, s=2, p=1)
|
| 98 |
+
self.conv2 = ConvBlock(1, 1, 1)
|
| 99 |
+
|
| 100 |
+
def forward(self, x):
|
| 101 |
+
# global cross-channel averaging
|
| 102 |
+
x = x.mean(1, keepdim=True)
|
| 103 |
+
# 3-by-3 conv
|
| 104 |
+
x = self.conv1(x)
|
| 105 |
+
# bilinear resizing
|
| 106 |
+
x = F.upsample(
|
| 107 |
+
x, (x.size(2) * 2, x.size(3) * 2), mode="bilinear", align_corners=True
|
| 108 |
+
)
|
| 109 |
+
# scaling conv
|
| 110 |
+
x = self.conv2(x)
|
| 111 |
+
return x
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
class ChannelAttn(nn.Module):
|
| 115 |
+
"""Channel Attention (Sec. 3.1.I.2)"""
|
| 116 |
+
|
| 117 |
+
def __init__(self, in_channels, reduction_rate=16):
|
| 118 |
+
super(ChannelAttn, self).__init__()
|
| 119 |
+
assert in_channels % reduction_rate == 0
|
| 120 |
+
self.conv1 = ConvBlock(in_channels, in_channels // reduction_rate, 1)
|
| 121 |
+
self.conv2 = ConvBlock(in_channels // reduction_rate, in_channels, 1)
|
| 122 |
+
|
| 123 |
+
def forward(self, x):
|
| 124 |
+
# squeeze operation (global average pooling)
|
| 125 |
+
x = F.avg_pool2d(x, x.size()[2:])
|
| 126 |
+
# excitation operation (2 conv layers)
|
| 127 |
+
x = self.conv1(x)
|
| 128 |
+
x = self.conv2(x)
|
| 129 |
+
return x
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
class SoftAttn(nn.Module):
|
| 133 |
+
"""Soft Attention (Sec. 3.1.I)
|
| 134 |
+
|
| 135 |
+
Aim: Spatial Attention + Channel Attention
|
| 136 |
+
|
| 137 |
+
Output: attention maps with shape identical to input.
|
| 138 |
+
"""
|
| 139 |
+
|
| 140 |
+
def __init__(self, in_channels):
|
| 141 |
+
super(SoftAttn, self).__init__()
|
| 142 |
+
self.spatial_attn = SpatialAttn()
|
| 143 |
+
self.channel_attn = ChannelAttn(in_channels)
|
| 144 |
+
self.conv = ConvBlock(in_channels, in_channels, 1)
|
| 145 |
+
|
| 146 |
+
def forward(self, x):
|
| 147 |
+
y_spatial = self.spatial_attn(x)
|
| 148 |
+
y_channel = self.channel_attn(x)
|
| 149 |
+
y = y_spatial * y_channel
|
| 150 |
+
y = torch.sigmoid(self.conv(y))
|
| 151 |
+
return y
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
class HardAttn(nn.Module):
|
| 155 |
+
"""Hard Attention (Sec. 3.1.II)"""
|
| 156 |
+
|
| 157 |
+
def __init__(self, in_channels):
|
| 158 |
+
super(HardAttn, self).__init__()
|
| 159 |
+
self.fc = nn.Linear(in_channels, 4 * 2)
|
| 160 |
+
self.init_params()
|
| 161 |
+
|
| 162 |
+
def init_params(self):
|
| 163 |
+
self.fc.weight.data.zero_()
|
| 164 |
+
self.fc.bias.data.copy_(
|
| 165 |
+
torch.tensor([0, -0.75, 0, -0.25, 0, 0.25, 0, 0.75], dtype=torch.float)
|
| 166 |
+
)
|
| 167 |
+
|
| 168 |
+
def forward(self, x):
|
| 169 |
+
# squeeze operation (global average pooling)
|
| 170 |
+
x = F.avg_pool2d(x, x.size()[2:]).view(x.size(0), x.size(1))
|
| 171 |
+
# predict transformation parameters
|
| 172 |
+
theta = torch.tanh(self.fc(x))
|
| 173 |
+
theta = theta.view(-1, 4, 2)
|
| 174 |
+
return theta
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
class HarmAttn(nn.Module):
|
| 178 |
+
"""Harmonious Attention (Sec. 3.1)"""
|
| 179 |
+
|
| 180 |
+
def __init__(self, in_channels):
|
| 181 |
+
super(HarmAttn, self).__init__()
|
| 182 |
+
self.soft_attn = SoftAttn(in_channels)
|
| 183 |
+
self.hard_attn = HardAttn(in_channels)
|
| 184 |
+
|
| 185 |
+
def forward(self, x):
|
| 186 |
+
y_soft_attn = self.soft_attn(x)
|
| 187 |
+
theta = self.hard_attn(x)
|
| 188 |
+
return y_soft_attn, theta
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
class HACNN(nn.Module):
|
| 192 |
+
"""Harmonious Attention Convolutional Neural Network.
|
| 193 |
+
|
| 194 |
+
Reference:
|
| 195 |
+
Li et al. Harmonious Attention Network for Person Re-identification. CVPR 2018.
|
| 196 |
+
|
| 197 |
+
Public keys:
|
| 198 |
+
- ``hacnn``: HACNN.
|
| 199 |
+
"""
|
| 200 |
+
|
| 201 |
+
# Args:
|
| 202 |
+
# num_classes (int): number of classes to predict
|
| 203 |
+
# nchannels (list): number of channels AFTER concatenation
|
| 204 |
+
# feat_dim (int): feature dimension for a single stream
|
| 205 |
+
# learn_region (bool): whether to learn region features (i.e. local branch)
|
| 206 |
+
|
| 207 |
+
def __init__(
|
| 208 |
+
self,
|
| 209 |
+
num_classes,
|
| 210 |
+
loss="softmax",
|
| 211 |
+
nchannels=[128, 256, 384],
|
| 212 |
+
feat_dim=512,
|
| 213 |
+
learn_region=True,
|
| 214 |
+
use_gpu=True,
|
| 215 |
+
**kwargs
|
| 216 |
+
):
|
| 217 |
+
super(HACNN, self).__init__()
|
| 218 |
+
self.loss = loss
|
| 219 |
+
self.learn_region = learn_region
|
| 220 |
+
self.use_gpu = use_gpu
|
| 221 |
+
|
| 222 |
+
self.conv = ConvBlock(3, 32, 3, s=2, p=1)
|
| 223 |
+
|
| 224 |
+
# Construct Inception + HarmAttn blocks
|
| 225 |
+
# ============== Block 1 ==============
|
| 226 |
+
self.inception1 = nn.Sequential(
|
| 227 |
+
InceptionA(32, nchannels[0]),
|
| 228 |
+
InceptionB(nchannels[0], nchannels[0]),
|
| 229 |
+
)
|
| 230 |
+
self.ha1 = HarmAttn(nchannels[0])
|
| 231 |
+
|
| 232 |
+
# ============== Block 2 ==============
|
| 233 |
+
self.inception2 = nn.Sequential(
|
| 234 |
+
InceptionA(nchannels[0], nchannels[1]),
|
| 235 |
+
InceptionB(nchannels[1], nchannels[1]),
|
| 236 |
+
)
|
| 237 |
+
self.ha2 = HarmAttn(nchannels[1])
|
| 238 |
+
|
| 239 |
+
# ============== Block 3 ==============
|
| 240 |
+
self.inception3 = nn.Sequential(
|
| 241 |
+
InceptionA(nchannels[1], nchannels[2]),
|
| 242 |
+
InceptionB(nchannels[2], nchannels[2]),
|
| 243 |
+
)
|
| 244 |
+
self.ha3 = HarmAttn(nchannels[2])
|
| 245 |
+
|
| 246 |
+
self.fc_global = nn.Sequential(
|
| 247 |
+
nn.Linear(nchannels[2], feat_dim),
|
| 248 |
+
nn.BatchNorm1d(feat_dim),
|
| 249 |
+
nn.ReLU(),
|
| 250 |
+
)
|
| 251 |
+
self.classifier_global = nn.Linear(feat_dim, num_classes)
|
| 252 |
+
|
| 253 |
+
if self.learn_region:
|
| 254 |
+
self.init_scale_factors()
|
| 255 |
+
self.local_conv1 = InceptionB(32, nchannels[0])
|
| 256 |
+
self.local_conv2 = InceptionB(nchannels[0], nchannels[1])
|
| 257 |
+
self.local_conv3 = InceptionB(nchannels[1], nchannels[2])
|
| 258 |
+
self.fc_local = nn.Sequential(
|
| 259 |
+
nn.Linear(nchannels[2] * 4, feat_dim),
|
| 260 |
+
nn.BatchNorm1d(feat_dim),
|
| 261 |
+
nn.ReLU(),
|
| 262 |
+
)
|
| 263 |
+
self.classifier_local = nn.Linear(feat_dim, num_classes)
|
| 264 |
+
self.feat_dim = feat_dim * 2
|
| 265 |
+
else:
|
| 266 |
+
self.feat_dim = feat_dim
|
| 267 |
+
|
| 268 |
+
def init_scale_factors(self):
|
| 269 |
+
        # initialize scale factors (s_w, s_h) for four regions
        self.scale_factors = []
        self.scale_factors.append(torch.tensor([[1, 0], [0, 0.25]], dtype=torch.float))
        self.scale_factors.append(torch.tensor([[1, 0], [0, 0.25]], dtype=torch.float))
        self.scale_factors.append(torch.tensor([[1, 0], [0, 0.25]], dtype=torch.float))
        self.scale_factors.append(torch.tensor([[1, 0], [0, 0.25]], dtype=torch.float))

    def stn(self, x, theta):
        """Performs spatial transform

        x: (batch, channel, height, width)
        theta: (batch, 2, 3)
        """
        grid = F.affine_grid(theta, x.size())
        x = F.grid_sample(x, grid)
        return x

    def transform_theta(self, theta_i, region_idx):
        """Transforms theta to include (s_w, s_h), resulting in (batch, 2, 3)"""
        scale_factors = self.scale_factors[region_idx]
        theta = torch.zeros(theta_i.size(0), 2, 3)
        theta[:, :, :2] = scale_factors
        theta[:, :, -1] = theta_i
        if self.use_gpu:
            theta = theta.cuda()
        return theta

    def forward(self, x):
        assert (
            x.size(2) == 160 and x.size(3) == 64
        ), "Input size does not match, expected (160, 64) but got ({}, {})".format(
            x.size(2), x.size(3)
        )
        x = self.conv(x)

        # ============== Block 1 ==============
        # global branch
        x1 = self.inception1(x)
        x1_attn, x1_theta = self.ha1(x1)
        x1_out = x1 * x1_attn
        # local branch
        if self.learn_region:
            x1_local_list = []
            for region_idx in range(4):
                x1_theta_i = x1_theta[:, region_idx, :]
                x1_theta_i = self.transform_theta(x1_theta_i, region_idx)
                x1_trans_i = self.stn(x, x1_theta_i)
                x1_trans_i = F.upsample(
                    x1_trans_i, (24, 28), mode="bilinear", align_corners=True
                )
                x1_local_i = self.local_conv1(x1_trans_i)
                x1_local_list.append(x1_local_i)

        # ============== Block 2 ==============
        # global branch
        x2 = self.inception2(x1_out)
        x2_attn, x2_theta = self.ha2(x2)
        x2_out = x2 * x2_attn
        # local branch
        if self.learn_region:
            x2_local_list = []
            for region_idx in range(4):
                x2_theta_i = x2_theta[:, region_idx, :]
                x2_theta_i = self.transform_theta(x2_theta_i, region_idx)
                x2_trans_i = self.stn(x1_out, x2_theta_i)
                x2_trans_i = F.upsample(
                    x2_trans_i, (12, 14), mode="bilinear", align_corners=True
                )
                x2_local_i = x2_trans_i + x1_local_list[region_idx]
                x2_local_i = self.local_conv2(x2_local_i)
                x2_local_list.append(x2_local_i)

        # ============== Block 3 ==============
        # global branch
        x3 = self.inception3(x2_out)
        x3_attn, x3_theta = self.ha3(x3)
        x3_out = x3 * x3_attn
        # local branch
        if self.learn_region:
            x3_local_list = []
            for region_idx in range(4):
                x3_theta_i = x3_theta[:, region_idx, :]
                x3_theta_i = self.transform_theta(x3_theta_i, region_idx)
                x3_trans_i = self.stn(x2_out, x3_theta_i)
                x3_trans_i = F.upsample(
                    x3_trans_i, (6, 7), mode="bilinear", align_corners=True
                )
                x3_local_i = x3_trans_i + x2_local_list[region_idx]
                x3_local_i = self.local_conv3(x3_local_i)
                x3_local_list.append(x3_local_i)

        # ============== Feature generation ==============
        # global branch
        x_global = F.avg_pool2d(x3_out, x3_out.size()[2:]).view(
            x3_out.size(0), x3_out.size(1)
        )
        x_global = self.fc_global(x_global)
        # local branch
        if self.learn_region:
            x_local_list = []
            for region_idx in range(4):
                x_local_i = x3_local_list[region_idx]
                x_local_i = F.avg_pool2d(x_local_i, x_local_i.size()[2:]).view(
                    x_local_i.size(0), -1
                )
                x_local_list.append(x_local_i)
            x_local = torch.cat(x_local_list, 1)
            x_local = self.fc_local(x_local)

        if not self.training:
            # l2 normalization before concatenation
            if self.learn_region:
                x_global = x_global / x_global.norm(p=2, dim=1, keepdim=True)
                x_local = x_local / x_local.norm(p=2, dim=1, keepdim=True)
                return torch.cat([x_global, x_local], 1)
            else:
                return x_global

        prelogits_global = self.classifier_global(x_global)
        if self.learn_region:
            prelogits_local = self.classifier_local(x_local)

        if self.loss == "softmax":
            if self.learn_region:
                return (prelogits_global, prelogits_local)
            else:
                return prelogits_global

        elif self.loss == "triplet":
            if self.learn_region:
                return (prelogits_global, prelogits_local), (x_global, x_local)
            else:
                return prelogits_global, x_global

        else:
            raise KeyError("Unsupported loss: {}".format(self.loss))
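The forward pass above realises the regional branch with a spatial transformer: transform_theta freezes the scale block of the 2x3 affine matrix to the per-region constant [[1, 0], [0, 0.25]] and fills only the translation column from the learned attention output, and stn then resamples the feature map via affine_grid/grid_sample. A minimal standalone sketch of that mechanism; the helper names and toy shapes below are illustrative, not part of this file:

import torch
import torch.nn.functional as F

def build_theta(translation, scale=(1.0, 0.25)):
    """Assemble a (batch, 2, 3) affine matrix from a (batch, 2) translation,
    mirroring transform_theta above: the scale block is fixed, only t_x/t_y vary."""
    theta = torch.zeros(translation.size(0), 2, 3)
    theta[:, 0, 0] = scale[0]   # s_w
    theta[:, 1, 1] = scale[1]   # s_h
    theta[:, :, -1] = translation
    return theta

def crop_region(x, theta):
    """Spatial transform as in stn above: sample x on the grid defined by theta."""
    grid = F.affine_grid(theta, x.size(), align_corners=False)
    return F.grid_sample(x, grid, align_corners=False)

if __name__ == "__main__":
    feat = torch.randn(2, 128, 24, 28)        # toy feature map (illustrative shape)
    t = torch.tanh(torch.randn(2, 2))         # stand-in for one region's (t_x, t_y)
    region = crop_region(feat, build_theta(t))
    print(region.shape)                       # torch.Size([2, 128, 24, 28])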
boxmot/appearance/backbones/lmbn/__init__.py
ADDED
|
@@ -0,0 +1 @@
# Mikel Broström 🔥 Yolo Tracking 🧾 AGPL-3.0 license
boxmot/appearance/backbones/lmbn/attention.py
ADDED
|
@@ -0,0 +1,281 @@
| 1 |
+
# Mikel Broström 🔥 Yolo Tracking 🧾 AGPL-3.0 license
|
| 2 |
+
|
| 3 |
+
import math
|
| 4 |
+
import random
|
| 5 |
+
|
| 6 |
+
import torch
|
| 7 |
+
from torch import nn
|
| 8 |
+
from torch.nn import Conv2d, Module, Parameter, ReLU, Sigmoid, Softmax
|
| 9 |
+
from torch.nn import functional as F
|
| 10 |
+
|
| 11 |
+
torch_ver = torch.__version__[:3]
|
| 12 |
+
|
| 13 |
+
__all__ = [
|
| 14 |
+
"BatchDrop",
|
| 15 |
+
"BatchFeatureErase_Top",
|
| 16 |
+
"BatchRandomErasing",
|
| 17 |
+
"PAM_Module",
|
| 18 |
+
"CAM_Module",
|
| 19 |
+
"Dual_Module",
|
| 20 |
+
"SE_Module",
|
| 21 |
+
]
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class BatchRandomErasing(nn.Module):
|
| 25 |
+
def __init__(
|
| 26 |
+
self, probability=0.5, sl=0.02, sh=0.4, r1=0.3, mean=[0.4914, 0.4822, 0.4465]
|
| 27 |
+
):
|
| 28 |
+
super(BatchRandomErasing, self).__init__()
|
| 29 |
+
|
| 30 |
+
self.probability = probability
|
| 31 |
+
self.mean = mean
|
| 32 |
+
self.sl = sl
|
| 33 |
+
self.sh = sh
|
| 34 |
+
self.r1 = r1
|
| 35 |
+
|
| 36 |
+
def forward(self, img):
|
| 37 |
+
if self.training:
|
| 38 |
+
if random.uniform(0, 1) > self.probability:
|
| 39 |
+
return img
|
| 40 |
+
|
| 41 |
+
for attempt in range(100):
|
| 42 |
+
area = img.size()[2] * img.size()[3]
|
| 43 |
+
|
| 44 |
+
target_area = random.uniform(self.sl, self.sh) * area
|
| 45 |
+
aspect_ratio = random.uniform(self.r1, 1 / self.r1)
|
| 46 |
+
|
| 47 |
+
h = int(round(math.sqrt(target_area * aspect_ratio)))
|
| 48 |
+
w = int(round(math.sqrt(target_area / aspect_ratio)))
|
| 49 |
+
|
| 50 |
+
if w < img.size()[3] and h < img.size()[2]:
|
| 51 |
+
x1 = random.randint(0, img.size()[2] - h)
|
| 52 |
+
y1 = random.randint(0, img.size()[3] - w)
|
| 53 |
+
if img.size()[1] == 3:
|
| 54 |
+
img[:, 0, x1: x1 + h, y1: y1 + w] = self.mean[0]
|
| 55 |
+
img[:, 1, x1: x1 + h, y1: y1 + w] = self.mean[1]
|
| 56 |
+
img[:, 2, x1: x1 + h, y1: y1 + w] = self.mean[2]
|
| 57 |
+
else:
|
| 58 |
+
img[:, 0, x1: x1 + h, y1: y1 + w] = self.mean[0]
|
| 59 |
+
return img
|
| 60 |
+
|
| 61 |
+
return img
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
class BatchDrop(nn.Module):
|
| 65 |
+
"""
|
| 66 |
+
Ref: Batch DropBlock Network for Person Re-identification and Beyond
|
| 67 |
+
https://github.com/daizuozhuo/batch-dropblock-network/blob/master/models/networks.py
|
| 68 |
+
Created by: daizuozhuo
|
| 69 |
+
"""
|
| 70 |
+
|
| 71 |
+
def __init__(self, h_ratio, w_ratio):
|
| 72 |
+
super(BatchDrop, self).__init__()
|
| 73 |
+
self.h_ratio = h_ratio
|
| 74 |
+
self.w_ratio = w_ratio
|
| 75 |
+
|
| 76 |
+
def forward(self, x):
|
| 77 |
+
if self.training:
|
| 78 |
+
h, w = x.size()[-2:]
|
| 79 |
+
rh = round(self.h_ratio * h)
|
| 80 |
+
rw = round(self.w_ratio * w)
|
| 81 |
+
sx = random.randint(0, h - rh)
|
| 82 |
+
sy = random.randint(0, w - rw)
|
| 83 |
+
mask = x.new_ones(x.size())
|
| 84 |
+
mask[:, :, sx: sx + rh, sy: sy + rw] = 0
|
| 85 |
+
x = x * mask
|
| 86 |
+
return x
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
class BatchDropTop(nn.Module):
|
| 90 |
+
"""
|
| 91 |
+
Ref: Top-DB-Net: Top DropBlock for Activation Enhancement in Person Re-Identification
|
| 92 |
+
https://github.com/RQuispeC/top-dropblock/blob/master/torchreid/models/bdnet.py
|
| 93 |
+
Created by: RQuispeC
|
| 94 |
+
|
| 95 |
+
"""
|
| 96 |
+
|
| 97 |
+
def __init__(self, h_ratio):
|
| 98 |
+
super(BatchDropTop, self).__init__()
|
| 99 |
+
self.h_ratio = h_ratio
|
| 100 |
+
|
| 101 |
+
def forward(self, x, visdrop=False):
|
| 102 |
+
if self.training or visdrop:
|
| 103 |
+
b, c, h, w = x.size()
|
| 104 |
+
rh = round(self.h_ratio * h)
|
| 105 |
+
act = (x**2).sum(1)
|
| 106 |
+
act = act.view(b, h * w)
|
| 107 |
+
act = F.normalize(act, p=2, dim=1)
|
| 108 |
+
act = act.view(b, h, w)
|
| 109 |
+
max_act, _ = act.max(2)
|
| 110 |
+
ind = torch.argsort(max_act, 1)
|
| 111 |
+
ind = ind[:, -rh:]
|
| 112 |
+
mask = []
|
| 113 |
+
for i in range(b):
|
| 114 |
+
rmask = torch.ones(h)
|
| 115 |
+
rmask[ind[i]] = 0
|
| 116 |
+
mask.append(rmask.unsqueeze(0))
|
| 117 |
+
mask = torch.cat(mask)
|
| 118 |
+
mask = torch.repeat_interleave(mask, w, 1).view(b, h, w)
|
| 119 |
+
mask = torch.repeat_interleave(mask, c, 0).view(b, c, h, w)
|
| 120 |
+
if x.is_cuda:
|
| 121 |
+
mask = mask.cuda()
|
| 122 |
+
if visdrop:
|
| 123 |
+
return mask
|
| 124 |
+
x = x * mask
|
| 125 |
+
return x
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
class BatchFeatureErase_Top(nn.Module):
|
| 129 |
+
"""
|
| 130 |
+
Ref: Top-DB-Net: Top DropBlock for Activation Enhancement in Person Re-Identification
|
| 131 |
+
https://github.com/RQuispeC/top-dropblock/blob/master/torchreid/models/bdnet.py
|
| 132 |
+
Created by: RQuispeC
|
| 133 |
+
|
| 134 |
+
"""
|
| 135 |
+
|
| 136 |
+
def __init__(
|
| 137 |
+
self,
|
| 138 |
+
channels,
|
| 139 |
+
bottleneck_type,
|
| 140 |
+
h_ratio=0.33,
|
| 141 |
+
w_ratio=1.0,
|
| 142 |
+
double_bottleneck=False,
|
| 143 |
+
):
|
| 144 |
+
super(BatchFeatureErase_Top, self).__init__()
|
| 145 |
+
|
| 146 |
+
self.drop_batch_bottleneck = bottleneck_type(channels, 512)
|
| 147 |
+
|
| 148 |
+
self.drop_batch_drop_basic = BatchDrop(h_ratio, w_ratio)
|
| 149 |
+
self.drop_batch_drop_top = BatchDropTop(h_ratio)
|
| 150 |
+
|
| 151 |
+
def forward(self, x, drop_top=True, bottleneck_features=True, visdrop=False):
|
| 152 |
+
features = self.drop_batch_bottleneck(x)
|
| 153 |
+
|
| 154 |
+
if drop_top:
|
| 155 |
+
x = self.drop_batch_drop_top(features, visdrop=visdrop)
|
| 156 |
+
else:
|
| 157 |
+
x = self.drop_batch_drop_basic(features, visdrop=visdrop)
|
| 158 |
+
if visdrop:
|
| 159 |
+
return x # x is dropmask
|
| 160 |
+
if bottleneck_features:
|
| 161 |
+
return x, features
|
| 162 |
+
else:
|
| 163 |
+
return x
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
class SE_Module(Module):
|
| 167 |
+
def __init__(self, channels, reduction=4):
|
| 168 |
+
super(SE_Module, self).__init__()
|
| 169 |
+
self.fc1 = Conv2d(channels, channels // reduction, kernel_size=1, padding=0)
|
| 170 |
+
self.relu = ReLU(inplace=True)
|
| 171 |
+
self.fc2 = Conv2d(channels // reduction, channels, kernel_size=1, padding=0)
|
| 172 |
+
self.sigmoid = Sigmoid()
|
| 173 |
+
|
| 174 |
+
def forward(self, x):
|
| 175 |
+
module_input = x
|
| 176 |
+
x = self.fc1(x)
|
| 177 |
+
x = self.relu(x)
|
| 178 |
+
x = self.fc2(x)
|
| 179 |
+
x = self.sigmoid(x)
|
| 180 |
+
return module_input * x
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
class PAM_Module(Module):
|
| 184 |
+
"""Position attention module"""
|
| 185 |
+
|
| 186 |
+
# Ref from SAGAN
|
| 187 |
+
|
| 188 |
+
def __init__(self, in_dim):
|
| 189 |
+
super(PAM_Module, self).__init__()
|
| 190 |
+
self.chanel_in = in_dim
|
| 191 |
+
|
| 192 |
+
self.query_conv = Conv2d(
|
| 193 |
+
in_channels=in_dim, out_channels=in_dim // 8, kernel_size=1
|
| 194 |
+
)
|
| 195 |
+
self.key_conv = Conv2d(
|
| 196 |
+
in_channels=in_dim, out_channels=in_dim // 8, kernel_size=1
|
| 197 |
+
)
|
| 198 |
+
self.value_conv = Conv2d(in_channels=in_dim, out_channels=in_dim, kernel_size=1)
|
| 199 |
+
self.gamma = Parameter(torch.zeros(1))
|
| 200 |
+
|
| 201 |
+
self.softmax = Softmax(dim=-1)
|
| 202 |
+
|
| 203 |
+
def forward(self, x):
|
| 204 |
+
"""
|
| 205 |
+
inputs :
|
| 206 |
+
x : input feature maps( B X C X H X W)
|
| 207 |
+
returns :
|
| 208 |
+
out : attention value + input feature
|
| 209 |
+
attention: B X (HxW) X (HxW)
|
| 210 |
+
"""
|
| 211 |
+
m_batchsize, C, height, width = x.size()
|
| 212 |
+
proj_query = (
|
| 213 |
+
self.query_conv(x).view(m_batchsize, -1, width * height).permute(0, 2, 1)
|
| 214 |
+
)
|
| 215 |
+
proj_key = self.key_conv(x).view(m_batchsize, -1, width * height)
|
| 216 |
+
energy = torch.bmm(proj_query, proj_key)
|
| 217 |
+
attention = self.softmax(energy)
|
| 218 |
+
proj_value = self.value_conv(x).view(m_batchsize, -1, width * height)
|
| 219 |
+
|
| 220 |
+
out = torch.bmm(proj_value, attention.permute(0, 2, 1))
|
| 221 |
+
out = out.view(m_batchsize, C, height, width)
|
| 222 |
+
|
| 223 |
+
out = self.gamma * out + x
|
| 224 |
+
return out
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
class CAM_Module(Module):
|
| 228 |
+
"""Channel attention module"""
|
| 229 |
+
|
| 230 |
+
def __init__(self, in_dim):
|
| 231 |
+
super(CAM_Module, self).__init__()
|
| 232 |
+
self.chanel_in = in_dim
|
| 233 |
+
|
| 234 |
+
self.gamma = Parameter(torch.zeros(1))
|
| 235 |
+
self.softmax = Softmax(dim=-1)
|
| 236 |
+
|
| 237 |
+
def forward(self, x):
|
| 238 |
+
"""
|
| 239 |
+
inputs :
|
| 240 |
+
x : input feature maps( B X C X H X W)
|
| 241 |
+
returns :
|
| 242 |
+
out : attention value + input feature
|
| 243 |
+
attention: B X C X C
|
| 244 |
+
"""
|
| 245 |
+
m_batchsize, C, height, width = x.size()
|
| 246 |
+
proj_query = x.view(m_batchsize, C, -1)
|
| 247 |
+
proj_key = x.view(m_batchsize, C, -1).permute(0, 2, 1)
|
| 248 |
+
# proj_key = x.view(m_batchsize, C, -1).permute(0, 2, 1).contiguous()
|
| 249 |
+
energy = torch.bmm(proj_query, proj_key)
|
| 250 |
+
energy_new = torch.max(energy, -1, keepdim=True)[0].expand_as(energy) - energy
|
| 251 |
+
attention = self.softmax(energy_new)
|
| 252 |
+
proj_value = x.view(m_batchsize, C, -1)
|
| 253 |
+
|
| 254 |
+
out = torch.bmm(attention, proj_value)
|
| 255 |
+
out = out.view(m_batchsize, C, height, width)
|
| 256 |
+
|
| 257 |
+
out = self.gamma * out + x
|
| 258 |
+
return out
|
| 259 |
+
|
| 260 |
+
|
| 261 |
+
class Dual_Module(Module):
|
| 262 |
+
"""
|
| 263 |
+
# Created by: CASIA IVA
|
| 264 |
+
# Email: jliu@nlpr.ia.ac.cn
|
| 265 |
+
# Copyright (c) 2018
|
| 266 |
+
|
| 267 |
+
# Reference: Dual Attention Network for Scene Segmentation
|
| 268 |
+
# https://arxiv.org/pdf/1809.02983.pdf
|
| 269 |
+
# https://github.com/junfu1115/DANet/blob/master/encoding/nn/attention.py
|
| 270 |
+
"""
|
| 271 |
+
|
| 272 |
+
def __init__(self, in_dim):
|
| 273 |
+
super(Dual_Module).__init__()
|
| 274 |
+
self.indim = in_dim
|
| 275 |
+
self.pam = PAM_Module(in_dim)
|
| 276 |
+
self.cam = CAM_Module(in_dim)
|
| 277 |
+
|
| 278 |
+
def forward(self, x):
|
| 279 |
+
out1 = self.pam(x)
|
| 280 |
+
out2 = self.cam(x)
|
| 281 |
+
return out1 + out2
|
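Both attention modules above blend their result back into the input as gamma * out + x, with gamma initialised to zero, so an untrained module starts as an identity mapping and learns how much attention to mix in. A small usage sketch, assuming the package is importable under the path added in this commit; the dummy shapes are only for illustration:

import torch
from boxmot.appearance.backbones.lmbn.attention import CAM_Module, PAM_Module

x = torch.randn(4, 64, 16, 8)          # (batch, channels, height, width)

pam = PAM_Module(64)                    # position attention over the H*W locations
cam = CAM_Module(64)                    # channel attention over the C feature maps

y_pos = pam(x)
y_chn = cam(x)
print(y_pos.shape, y_chn.shape)         # both keep the input shape: (4, 64, 16, 8)

# gamma starts at zero, so before training both modules act as identity mappings
print(torch.allclose(y_pos, x), torch.allclose(y_chn, x))   # True True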
boxmot/appearance/backbones/lmbn/bnneck.py
ADDED
|
@@ -0,0 +1,166 @@
# Mikel Broström 🔥 Yolo Tracking 🧾 AGPL-3.0 license

from torch import nn


class BNNeck(nn.Module):
    def __init__(self, input_dim, class_num, return_f=False):
        super(BNNeck, self).__init__()
        self.return_f = return_f
        self.bn = nn.BatchNorm1d(input_dim)
        self.bn.bias.requires_grad_(False)
        self.classifier = nn.Linear(input_dim, class_num, bias=False)
        self.bn.apply(self.weights_init_kaiming)
        self.classifier.apply(self.weights_init_classifier)

    def forward(self, x):
        before_neck = x.view(x.size(0), x.size(1))
        after_neck = self.bn(before_neck)

        if self.return_f:
            score = self.classifier(after_neck)
            return after_neck, score, before_neck
        else:
            x = self.classifier(x)
            return x

    def weights_init_kaiming(self, m):
        classname = m.__class__.__name__
        if classname.find("Linear") != -1:
            nn.init.kaiming_normal_(m.weight, a=0, mode="fan_out")
            nn.init.constant_(m.bias, 0.0)
        elif classname.find("Conv") != -1:
            nn.init.kaiming_normal_(m.weight, a=0, mode="fan_in")
            if m.bias is not None:
                nn.init.constant_(m.bias, 0.0)
        elif classname.find("BatchNorm") != -1:
            if m.affine:
                nn.init.constant_(m.weight, 1.0)
                nn.init.constant_(m.bias, 0.0)

    def weights_init_classifier(self, m):
        classname = m.__class__.__name__
        if classname.find("Linear") != -1:
            nn.init.normal_(m.weight, std=0.001)
            if m.bias:
                nn.init.constant_(m.bias, 0.0)


class BNNeck3(nn.Module):
    def __init__(self, input_dim, class_num, feat_dim, return_f=False):
        super(BNNeck3, self).__init__()
        self.return_f = return_f
        # self.reduction = nn.Linear(input_dim, feat_dim)
        # self.bn = nn.BatchNorm1d(feat_dim)

        self.reduction = nn.Conv2d(input_dim, feat_dim, 1, bias=False)
        self.bn = nn.BatchNorm1d(feat_dim)

        self.bn.bias.requires_grad_(False)
        self.classifier = nn.Linear(feat_dim, class_num, bias=False)
        self.bn.apply(self.weights_init_kaiming)
        self.classifier.apply(self.weights_init_classifier)

    def forward(self, x):
        x = self.reduction(x)
        # before_neck = x.squeeze(dim=3).squeeze(dim=2)
        # after_neck = self.bn(x).squeeze(dim=3).squeeze(dim=2)
        before_neck = x.view(x.size(0), x.size(1))
        after_neck = self.bn(before_neck)
        if self.return_f:
            score = self.classifier(after_neck)
            return after_neck, score, before_neck
        else:
            x = self.classifier(x)
            return x

    def weights_init_kaiming(self, m):
        classname = m.__class__.__name__
        if classname.find("Linear") != -1:
            nn.init.kaiming_normal_(m.weight, a=0, mode="fan_out")
            nn.init.constant_(m.bias, 0.0)
        elif classname.find("Conv") != -1:
            nn.init.kaiming_normal_(m.weight, a=0, mode="fan_in")
            if m.bias is not None:
                nn.init.constant_(m.bias, 0.0)
        elif classname.find("BatchNorm") != -1:
            if m.affine:
                nn.init.constant_(m.weight, 1.0)
                nn.init.constant_(m.bias, 0.0)

    def weights_init_classifier(self, m):
        classname = m.__class__.__name__
        if classname.find("Linear") != -1:
            nn.init.normal_(m.weight, std=0.001)
            if m.bias:
                nn.init.constant_(m.bias, 0.0)


# Defines the new fc layer and classification layer
# |--Linear--|--bn--|--relu--|--Linear--|


class ClassBlock(nn.Module):
    def __init__(
        self,
        input_dim,
        class_num,
        droprate=0,
        relu=False,
        bnorm=True,
        num_bottleneck=512,
        linear=True,
        return_f=False,
    ):
        super(ClassBlock, self).__init__()
        self.return_f = return_f
        add_block = []
        if linear:
            add_block += [nn.Linear(input_dim, num_bottleneck)]
        else:
            num_bottleneck = input_dim
        if bnorm:
            add_block += [nn.BatchNorm1d(num_bottleneck)]
        if relu:
            add_block += [nn.LeakyReLU(0.1)]
        if droprate > 0:
            add_block += [nn.Dropout(p=droprate)]
        add_block = nn.Sequential(*add_block)
        add_block.apply(self.weights_init_kaiming)

        classifier = []
        classifier += [nn.Linear(num_bottleneck, class_num)]
        classifier = nn.Sequential(*classifier)
        classifier.apply(self.weights_init_classifier)

        self.add_block = add_block
        self.classifier = classifier

    def forward(self, x):
        x = self.add_block(x.squeeze(3).squeeze(2))
        if self.return_f:
            f = x
            x = self.classifier(x)
            return f, x, f
        else:
            x = self.classifier(x)
            return x

    def weights_init_kaiming(self, m):
        classname = m.__class__.__name__
        # print(classname)
        if classname.find("Conv") != -1:
            # For old pytorch, you may use kaiming_normal.
            nn.init.kaiming_normal_(m.weight.data, a=0, mode="fan_in")
        elif classname.find("Linear") != -1:
            nn.init.kaiming_normal_(m.weight.data, a=0, mode="fan_out")
            nn.init.constant_(m.bias.data, 0.0)
        elif classname.find("BatchNorm1d") != -1:
            nn.init.normal_(m.weight.data, 1.0, 0.02)
            nn.init.constant_(m.bias.data, 0.0)

    def weights_init_classifier(self, m):
        classname = m.__class__.__name__
        if classname.find("Linear") != -1:
            nn.init.normal_(m.weight.data, std=0.001)
            nn.init.constant_(m.bias.data, 0.0)
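BNNeck follows the usual batch-norm-neck recipe: the pre-BN feature (before_neck) is intended for metric losses, the batch-normalised feature (after_neck) for retrieval, and the bias-free linear layer produces the ID logits; BNNeck3 additionally reduces the channel dimension with a 1x1 convolution first. A quick sketch of the return_f=True calling convention; the class count and feature sizes below are made up for illustration:

import torch
from boxmot.appearance.backbones.lmbn.bnneck import BNNeck, BNNeck3

pooled = torch.randn(8, 512, 1, 1)             # globally pooled feature map

neck = BNNeck(512, class_num=751, return_f=True)
after_neck, score, before_neck = neck(pooled)
print(after_neck.shape, score.shape, before_neck.shape)
# torch.Size([8, 512]) torch.Size([8, 751]) torch.Size([8, 512])

# BNNeck3 first reduces channels with a 1x1 conv, then applies the same neck
neck3 = BNNeck3(2048, class_num=751, feat_dim=512, return_f=True)
after3, score3, before3 = neck3(torch.randn(8, 2048, 1, 1))
print(after3.shape)                             # torch.Size([8, 512])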
boxmot/appearance/backbones/lmbn/lmbn_n.py
ADDED
|
@@ -0,0 +1,185 @@
| 1 |
+
# Mikel Broström 🔥 Yolo Tracking 🧾 AGPL-3.0 license
|
| 2 |
+
|
| 3 |
+
import copy
|
| 4 |
+
|
| 5 |
+
import torch
|
| 6 |
+
from torch import nn
|
| 7 |
+
|
| 8 |
+
from boxmot.appearance.backbones.lmbn.attention import BatchFeatureErase_Top
|
| 9 |
+
from boxmot.appearance.backbones.lmbn.bnneck import BNNeck, BNNeck3
|
| 10 |
+
from boxmot.appearance.backbones.osnet import OSBlock, osnet_x1_0
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class LMBN_n(nn.Module):
|
| 14 |
+
def __init__(self, num_classes, loss, pretrained, use_gpu):
|
| 15 |
+
super(LMBN_n, self).__init__()
|
| 16 |
+
|
| 17 |
+
self.n_ch = 2
|
| 18 |
+
self.chs = 512 // self.n_ch
|
| 19 |
+
self.training = False
|
| 20 |
+
|
| 21 |
+
osnet = osnet_x1_0(pretrained=True)
|
| 22 |
+
|
| 23 |
+
self.backone = nn.Sequential(
|
| 24 |
+
osnet.conv1, osnet.maxpool, osnet.conv2, osnet.conv3[0]
|
| 25 |
+
)
|
| 26 |
+
|
| 27 |
+
conv3 = osnet.conv3[1:]
|
| 28 |
+
|
| 29 |
+
self.global_branch = nn.Sequential(
|
| 30 |
+
copy.deepcopy(conv3), copy.deepcopy(osnet.conv4), copy.deepcopy(osnet.conv5)
|
| 31 |
+
)
|
| 32 |
+
|
| 33 |
+
self.partial_branch = nn.Sequential(
|
| 34 |
+
copy.deepcopy(conv3), copy.deepcopy(osnet.conv4), copy.deepcopy(osnet.conv5)
|
| 35 |
+
)
|
| 36 |
+
|
| 37 |
+
self.channel_branch = nn.Sequential(
|
| 38 |
+
copy.deepcopy(conv3), copy.deepcopy(osnet.conv4), copy.deepcopy(osnet.conv5)
|
| 39 |
+
)
|
| 40 |
+
|
| 41 |
+
self.global_pooling = nn.AdaptiveMaxPool2d((1, 1))
|
| 42 |
+
self.partial_pooling = nn.AdaptiveAvgPool2d((2, 1))
|
| 43 |
+
self.channel_pooling = nn.AdaptiveAvgPool2d((1, 1))
|
| 44 |
+
|
| 45 |
+
reduction = BNNeck3(512, num_classes, 512, return_f=True)
|
| 46 |
+
|
| 47 |
+
self.reduction_0 = copy.deepcopy(reduction)
|
| 48 |
+
self.reduction_1 = copy.deepcopy(reduction)
|
| 49 |
+
self.reduction_2 = copy.deepcopy(reduction)
|
| 50 |
+
self.reduction_3 = copy.deepcopy(reduction)
|
| 51 |
+
self.reduction_4 = copy.deepcopy(reduction)
|
| 52 |
+
|
| 53 |
+
self.shared = nn.Sequential(
|
| 54 |
+
nn.Conv2d(self.chs, 512, 1, bias=False), nn.BatchNorm2d(512), nn.ReLU(True)
|
| 55 |
+
)
|
| 56 |
+
self.weights_init_kaiming(self.shared)
|
| 57 |
+
|
| 58 |
+
self.reduction_ch_0 = BNNeck(512, num_classes, return_f=True)
|
| 59 |
+
self.reduction_ch_1 = BNNeck(512, num_classes, return_f=True)
|
| 60 |
+
|
| 61 |
+
# if args.drop_block:
|
| 62 |
+
# print('Using batch random erasing block.')
|
| 63 |
+
# self.batch_drop_block = BatchRandomErasing()
|
| 64 |
+
# print('Using batch drop block.')
|
| 65 |
+
# self.batch_drop_block = BatchDrop(
|
| 66 |
+
# h_ratio=args.h_ratio, w_ratio=args.w_ratio)
|
| 67 |
+
self.batch_drop_block = BatchFeatureErase_Top(512, OSBlock)
|
| 68 |
+
|
| 69 |
+
self.activation_map = False
|
| 70 |
+
|
| 71 |
+
def forward(self, x):
|
| 72 |
+
# if self.batch_drop_block is not None:
|
| 73 |
+
# x = self.batch_drop_block(x)
|
| 74 |
+
|
| 75 |
+
x = self.backone(x)
|
| 76 |
+
|
| 77 |
+
glo = self.global_branch(x)
|
| 78 |
+
par = self.partial_branch(x)
|
| 79 |
+
cha = self.channel_branch(x)
|
| 80 |
+
|
| 81 |
+
if self.activation_map:
|
| 82 |
+
glo_ = glo
|
| 83 |
+
|
| 84 |
+
if self.batch_drop_block is not None:
|
| 85 |
+
glo_drop, glo = self.batch_drop_block(glo)
|
| 86 |
+
|
| 87 |
+
if self.activation_map:
|
| 88 |
+
_, _, h_par, _ = par.size()
|
| 89 |
+
|
| 90 |
+
fmap_p0 = par[:, :, :h_par // 2, :]
|
| 91 |
+
fmap_p1 = par[:, :, h_par // 2:, :]
|
| 92 |
+
fmap_c0 = cha[:, : self.chs, :, :]
|
| 93 |
+
fmap_c1 = cha[:, self.chs:, :, :]
|
| 94 |
+
print("Generating activation maps...")
|
| 95 |
+
|
| 96 |
+
return glo, glo_, fmap_c0, fmap_c1, fmap_p0, fmap_p1
|
| 97 |
+
|
| 98 |
+
glo_drop = self.global_pooling(glo_drop)
|
| 99 |
+
glo = self.channel_pooling(glo) # shape:(batchsize, 512,1,1)
|
| 100 |
+
g_par = self.global_pooling(par) # shape:(batchsize, 512,1,1)
|
| 101 |
+
p_par = self.partial_pooling(par) # shape:(batchsize, 512,2,1)
|
| 102 |
+
cha = self.channel_pooling(cha) # shape:(batchsize, 256,1,1)
|
| 103 |
+
|
| 104 |
+
p0 = p_par[:, :, 0:1, :]
|
| 105 |
+
p1 = p_par[:, :, 1:2, :]
|
| 106 |
+
|
| 107 |
+
f_glo = self.reduction_0(glo)
|
| 108 |
+
f_p0 = self.reduction_1(g_par)
|
| 109 |
+
f_p1 = self.reduction_2(p0)
|
| 110 |
+
f_p2 = self.reduction_3(p1)
|
| 111 |
+
f_glo_drop = self.reduction_4(glo_drop)
|
| 112 |
+
|
| 113 |
+
################
|
| 114 |
+
|
| 115 |
+
c0 = cha[:, : self.chs, :, :]
|
| 116 |
+
c1 = cha[:, self.chs:, :, :]
|
| 117 |
+
c0 = self.shared(c0)
|
| 118 |
+
c1 = self.shared(c1)
|
| 119 |
+
f_c0 = self.reduction_ch_0(c0)
|
| 120 |
+
f_c1 = self.reduction_ch_1(c1)
|
| 121 |
+
|
| 122 |
+
################
|
| 123 |
+
|
| 124 |
+
fea = [f_glo[-1], f_glo_drop[-1], f_p0[-1]]
|
| 125 |
+
|
| 126 |
+
if not self.training:
|
| 127 |
+
features = torch.stack(
|
| 128 |
+
[f_glo[0], f_glo_drop[0], f_p0[0], f_p1[0], f_p2[0], f_c0[0], f_c1[0]],
|
| 129 |
+
dim=2,
|
| 130 |
+
)
|
| 131 |
+
features = features.flatten(1, 2)
|
| 132 |
+
return features
|
| 133 |
+
|
| 134 |
+
return [
|
| 135 |
+
f_glo[1],
|
| 136 |
+
f_glo_drop[1],
|
| 137 |
+
f_p0[1],
|
| 138 |
+
f_p1[1],
|
| 139 |
+
f_p2[1],
|
| 140 |
+
f_c0[1],
|
| 141 |
+
f_c1[1],
|
| 142 |
+
], fea
|
| 143 |
+
|
| 144 |
+
def weights_init_kaiming(self, m):
|
| 145 |
+
classname = m.__class__.__name__
|
| 146 |
+
if classname.find("Linear") != -1:
|
| 147 |
+
nn.init.kaiming_normal_(m.weight, a=0, mode="fan_out")
|
| 148 |
+
nn.init.constant_(m.bias, 0.0)
|
| 149 |
+
elif classname.find("Conv") != -1:
|
| 150 |
+
nn.init.kaiming_normal_(m.weight, a=0, mode="fan_in")
|
| 151 |
+
if m.bias is not None:
|
| 152 |
+
nn.init.constant_(m.bias, 0.0)
|
| 153 |
+
elif classname.find("BatchNorm") != -1:
|
| 154 |
+
if m.affine:
|
| 155 |
+
nn.init.constant_(m.weight, 1.0)
|
| 156 |
+
nn.init.constant_(m.bias, 0.0)
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
if __name__ == "__main__":
|
| 160 |
+
# Here I left a simple forward function.
|
| 161 |
+
# Test the model, before you train it.
|
| 162 |
+
import argparse
|
| 163 |
+
|
| 164 |
+
parser = argparse.ArgumentParser(description="MGN")
|
| 165 |
+
parser.add_argument("--num_classes", type=int, default=751, help="")
|
| 166 |
+
parser.add_argument("--bnneck", type=bool, default=True)
|
| 167 |
+
parser.add_argument("--pool", type=str, default="max")
|
| 168 |
+
parser.add_argument("--feats", type=int, default=512)
|
| 169 |
+
parser.add_argument("--drop_block", type=bool, default=True)
|
| 170 |
+
parser.add_argument("--w_ratio", type=float, default=1.0, help="")
|
| 171 |
+
|
| 172 |
+
args = parser.parse_args()
|
| 173 |
+
# net = MCMP_n(args)
|
| 174 |
+
# net.classifier = nn.Sequential()
|
| 175 |
+
# print([p for p in net.parameters()])
|
| 176 |
+
# a=filter(lambda p: p.requires_grad, net.parameters())
|
| 177 |
+
# print(a)
|
| 178 |
+
|
| 179 |
+
# print(net)
|
| 180 |
+
# input = Variable(torch.FloatTensor(8, 3, 384, 128))
|
| 181 |
+
# net.eval()
|
| 182 |
+
# output = net(input)
|
| 183 |
+
# print(output.shape)
|
| 184 |
+
print("net output size:")
|
| 185 |
+
# print(len(output))
|
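At inference LMBN_n stacks seven 512-d BNNeck embeddings (global, drop-block global, whole part branch, two horizontal parts, two channel halves) and flattens them into a single descriptor. A usage sketch; note that the constructor calls osnet_x1_0(pretrained=True), which may try to fetch the OSNet weights, and the 384x128 input below simply mirrors the resolution used in the commented test code above:

import torch
from boxmot.appearance.backbones.lmbn.lmbn_n import LMBN_n

# num_classes / loss only matter for the training heads; use_gpu is unused at inference
model = LMBN_n(num_classes=751, loss="softmax", pretrained=True, use_gpu=False)
model.eval()

with torch.no_grad():
    img = torch.randn(1, 3, 384, 128)       # assumed ReID crop size, for illustration
    feat = model(img)

print(feat.shape)                            # (1, 3584) = 7 branches x 512 dims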
boxmot/appearance/backbones/mlfn.py
ADDED
|
@@ -0,0 +1,240 @@
| 1 |
+
# Mikel Broström 🔥 Yolo Tracking 🧾 AGPL-3.0 license
|
| 2 |
+
|
| 3 |
+
from __future__ import absolute_import, division
|
| 4 |
+
|
| 5 |
+
import torch
|
| 6 |
+
import torch.utils.model_zoo as model_zoo
|
| 7 |
+
from torch import nn
|
| 8 |
+
from torch.nn import functional as F
|
| 9 |
+
|
| 10 |
+
__all__ = ["mlfn"]
|
| 11 |
+
|
| 12 |
+
model_urls = {
|
| 13 |
+
# training epoch = 5, top1 = 51.6
|
| 14 |
+
"imagenet": "https://mega.nz/#!YHxAhaxC!yu9E6zWl0x5zscSouTdbZu8gdFFytDdl-RAdD2DEfpk",
|
| 15 |
+
}
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class MLFNBlock(nn.Module):
|
| 19 |
+
def __init__(self, in_channels, out_channels, stride, fsm_channels, groups=32):
|
| 20 |
+
super(MLFNBlock, self).__init__()
|
| 21 |
+
self.groups = groups
|
| 22 |
+
mid_channels = out_channels // 2
|
| 23 |
+
|
| 24 |
+
# Factor Modules
|
| 25 |
+
self.fm_conv1 = nn.Conv2d(in_channels, mid_channels, 1, bias=False)
|
| 26 |
+
self.fm_bn1 = nn.BatchNorm2d(mid_channels)
|
| 27 |
+
self.fm_conv2 = nn.Conv2d(
|
| 28 |
+
mid_channels,
|
| 29 |
+
mid_channels,
|
| 30 |
+
3,
|
| 31 |
+
stride=stride,
|
| 32 |
+
padding=1,
|
| 33 |
+
bias=False,
|
| 34 |
+
groups=self.groups,
|
| 35 |
+
)
|
| 36 |
+
self.fm_bn2 = nn.BatchNorm2d(mid_channels)
|
| 37 |
+
self.fm_conv3 = nn.Conv2d(mid_channels, out_channels, 1, bias=False)
|
| 38 |
+
self.fm_bn3 = nn.BatchNorm2d(out_channels)
|
| 39 |
+
|
| 40 |
+
# Factor Selection Module
|
| 41 |
+
self.fsm = nn.Sequential(
|
| 42 |
+
nn.AdaptiveAvgPool2d(1),
|
| 43 |
+
nn.Conv2d(in_channels, fsm_channels[0], 1),
|
| 44 |
+
nn.BatchNorm2d(fsm_channels[0]),
|
| 45 |
+
nn.ReLU(inplace=True),
|
| 46 |
+
nn.Conv2d(fsm_channels[0], fsm_channels[1], 1),
|
| 47 |
+
nn.BatchNorm2d(fsm_channels[1]),
|
| 48 |
+
nn.ReLU(inplace=True),
|
| 49 |
+
nn.Conv2d(fsm_channels[1], self.groups, 1),
|
| 50 |
+
nn.BatchNorm2d(self.groups),
|
| 51 |
+
nn.Sigmoid(),
|
| 52 |
+
)
|
| 53 |
+
|
| 54 |
+
self.downsample = None
|
| 55 |
+
if in_channels != out_channels or stride > 1:
|
| 56 |
+
self.downsample = nn.Sequential(
|
| 57 |
+
nn.Conv2d(in_channels, out_channels, 1, stride=stride, bias=False),
|
| 58 |
+
nn.BatchNorm2d(out_channels),
|
| 59 |
+
)
|
| 60 |
+
|
| 61 |
+
def forward(self, x):
|
| 62 |
+
residual = x
|
| 63 |
+
s = self.fsm(x)
|
| 64 |
+
|
| 65 |
+
# reduce dimension
|
| 66 |
+
x = self.fm_conv1(x)
|
| 67 |
+
x = self.fm_bn1(x)
|
| 68 |
+
x = F.relu(x, inplace=True)
|
| 69 |
+
|
| 70 |
+
# group convolution
|
| 71 |
+
x = self.fm_conv2(x)
|
| 72 |
+
x = self.fm_bn2(x)
|
| 73 |
+
x = F.relu(x, inplace=True)
|
| 74 |
+
|
| 75 |
+
# factor selection
|
| 76 |
+
b, c = x.size(0), x.size(1)
|
| 77 |
+
n = c // self.groups
|
| 78 |
+
ss = s.repeat(1, n, 1, 1) # from (b, g, 1, 1) to (b, g*n=c, 1, 1)
|
| 79 |
+
ss = ss.view(b, n, self.groups, 1, 1)
|
| 80 |
+
ss = ss.permute(0, 2, 1, 3, 4).contiguous()
|
| 81 |
+
ss = ss.view(b, c, 1, 1)
|
| 82 |
+
x = ss * x
|
| 83 |
+
|
| 84 |
+
# recover dimension
|
| 85 |
+
x = self.fm_conv3(x)
|
| 86 |
+
x = self.fm_bn3(x)
|
| 87 |
+
x = F.relu(x, inplace=True)
|
| 88 |
+
|
| 89 |
+
if self.downsample is not None:
|
| 90 |
+
residual = self.downsample(residual)
|
| 91 |
+
|
| 92 |
+
return F.relu(residual + x, inplace=True), s
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
class MLFN(nn.Module):
|
| 96 |
+
"""Multi-Level Factorisation Net.
|
| 97 |
+
|
| 98 |
+
Reference:
|
| 99 |
+
Chang et al. Multi-Level Factorisation Net for
|
| 100 |
+
Person Re-Identification. CVPR 2018.
|
| 101 |
+
|
| 102 |
+
Public keys:
|
| 103 |
+
- ``mlfn``: MLFN (Multi-Level Factorisation Net).
|
| 104 |
+
"""
|
| 105 |
+
|
| 106 |
+
def __init__(
|
| 107 |
+
self,
|
| 108 |
+
num_classes,
|
| 109 |
+
loss="softmax",
|
| 110 |
+
groups=32,
|
| 111 |
+
channels=[64, 256, 512, 1024, 2048],
|
| 112 |
+
embed_dim=1024,
|
| 113 |
+
**kwargs
|
| 114 |
+
):
|
| 115 |
+
super(MLFN, self).__init__()
|
| 116 |
+
self.loss = loss
|
| 117 |
+
self.groups = groups
|
| 118 |
+
|
| 119 |
+
# first convolutional layer
|
| 120 |
+
self.conv1 = nn.Conv2d(3, channels[0], 7, stride=2, padding=3)
|
| 121 |
+
self.bn1 = nn.BatchNorm2d(channels[0])
|
| 122 |
+
self.maxpool = nn.MaxPool2d(3, stride=2, padding=1)
|
| 123 |
+
|
| 124 |
+
# main body
|
| 125 |
+
self.feature = nn.ModuleList(
|
| 126 |
+
[
|
| 127 |
+
# layer 1-3
|
| 128 |
+
MLFNBlock(channels[0], channels[1], 1, [128, 64], self.groups),
|
| 129 |
+
MLFNBlock(channels[1], channels[1], 1, [128, 64], self.groups),
|
| 130 |
+
MLFNBlock(channels[1], channels[1], 1, [128, 64], self.groups),
|
| 131 |
+
# layer 4-7
|
| 132 |
+
MLFNBlock(channels[1], channels[2], 2, [256, 128], self.groups),
|
| 133 |
+
MLFNBlock(channels[2], channels[2], 1, [256, 128], self.groups),
|
| 134 |
+
MLFNBlock(channels[2], channels[2], 1, [256, 128], self.groups),
|
| 135 |
+
MLFNBlock(channels[2], channels[2], 1, [256, 128], self.groups),
|
| 136 |
+
# layer 8-13
|
| 137 |
+
MLFNBlock(channels[2], channels[3], 2, [512, 128], self.groups),
|
| 138 |
+
MLFNBlock(channels[3], channels[3], 1, [512, 128], self.groups),
|
| 139 |
+
MLFNBlock(channels[3], channels[3], 1, [512, 128], self.groups),
|
| 140 |
+
MLFNBlock(channels[3], channels[3], 1, [512, 128], self.groups),
|
| 141 |
+
MLFNBlock(channels[3], channels[3], 1, [512, 128], self.groups),
|
| 142 |
+
MLFNBlock(channels[3], channels[3], 1, [512, 128], self.groups),
|
| 143 |
+
# layer 14-16
|
| 144 |
+
MLFNBlock(channels[3], channels[4], 2, [512, 128], self.groups),
|
| 145 |
+
MLFNBlock(channels[4], channels[4], 1, [512, 128], self.groups),
|
| 146 |
+
MLFNBlock(channels[4], channels[4], 1, [512, 128], self.groups),
|
| 147 |
+
]
|
| 148 |
+
)
|
| 149 |
+
self.global_avgpool = nn.AdaptiveAvgPool2d(1)
|
| 150 |
+
|
| 151 |
+
# projection functions
|
| 152 |
+
self.fc_x = nn.Sequential(
|
| 153 |
+
nn.Conv2d(channels[4], embed_dim, 1, bias=False),
|
| 154 |
+
nn.BatchNorm2d(embed_dim),
|
| 155 |
+
nn.ReLU(inplace=True),
|
| 156 |
+
)
|
| 157 |
+
self.fc_s = nn.Sequential(
|
| 158 |
+
nn.Conv2d(self.groups * 16, embed_dim, 1, bias=False),
|
| 159 |
+
nn.BatchNorm2d(embed_dim),
|
| 160 |
+
nn.ReLU(inplace=True),
|
| 161 |
+
)
|
| 162 |
+
|
| 163 |
+
self.classifier = nn.Linear(embed_dim, num_classes)
|
| 164 |
+
|
| 165 |
+
self.init_params()
|
| 166 |
+
|
| 167 |
+
def init_params(self):
|
| 168 |
+
for m in self.modules():
|
| 169 |
+
if isinstance(m, nn.Conv2d):
|
| 170 |
+
nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
|
| 171 |
+
if m.bias is not None:
|
| 172 |
+
nn.init.constant_(m.bias, 0)
|
| 173 |
+
elif isinstance(m, nn.BatchNorm2d):
|
| 174 |
+
nn.init.constant_(m.weight, 1)
|
| 175 |
+
nn.init.constant_(m.bias, 0)
|
| 176 |
+
elif isinstance(m, nn.Linear):
|
| 177 |
+
nn.init.normal_(m.weight, 0, 0.01)
|
| 178 |
+
if m.bias is not None:
|
| 179 |
+
nn.init.constant_(m.bias, 0)
|
| 180 |
+
|
| 181 |
+
def forward(self, x):
|
| 182 |
+
x = self.conv1(x)
|
| 183 |
+
x = self.bn1(x)
|
| 184 |
+
x = F.relu(x, inplace=True)
|
| 185 |
+
x = self.maxpool(x)
|
| 186 |
+
|
| 187 |
+
s_hat = []
|
| 188 |
+
for block in self.feature:
|
| 189 |
+
x, s = block(x)
|
| 190 |
+
s_hat.append(s)
|
| 191 |
+
s_hat = torch.cat(s_hat, 1)
|
| 192 |
+
|
| 193 |
+
x = self.global_avgpool(x)
|
| 194 |
+
x = self.fc_x(x)
|
| 195 |
+
s_hat = self.fc_s(s_hat)
|
| 196 |
+
|
| 197 |
+
v = (x + s_hat) * 0.5
|
| 198 |
+
v = v.view(v.size(0), -1)
|
| 199 |
+
|
| 200 |
+
if not self.training:
|
| 201 |
+
return v
|
| 202 |
+
|
| 203 |
+
y = self.classifier(v)
|
| 204 |
+
|
| 205 |
+
if self.loss == "softmax":
|
| 206 |
+
return y
|
| 207 |
+
elif self.loss == "triplet":
|
| 208 |
+
return y, v
|
| 209 |
+
else:
|
| 210 |
+
raise KeyError("Unsupported loss: {}".format(self.loss))
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
def init_pretrained_weights(model, model_url):
|
| 214 |
+
"""Initializes model with pretrained weights.
|
| 215 |
+
|
| 216 |
+
Layers that don't match with pretrained layers in name or size are kept unchanged.
|
| 217 |
+
"""
|
| 218 |
+
pretrain_dict = model_zoo.load_url(model_url)
|
| 219 |
+
model_dict = model.state_dict()
|
| 220 |
+
pretrain_dict = {
|
| 221 |
+
k: v
|
| 222 |
+
for k, v in pretrain_dict.items()
|
| 223 |
+
if k in model_dict and model_dict[k].size() == v.size()
|
| 224 |
+
}
|
| 225 |
+
model_dict.update(pretrain_dict)
|
| 226 |
+
model.load_state_dict(model_dict)
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
def mlfn(num_classes, loss="softmax", pretrained=True, **kwargs):
|
| 230 |
+
model = MLFN(num_classes, loss, **kwargs)
|
| 231 |
+
if pretrained:
|
| 232 |
+
# init_pretrained_weights(model, model_urls['imagenet'])
|
| 233 |
+
import warnings
|
| 234 |
+
|
| 235 |
+
warnings.warn(
|
| 236 |
+
"The imagenet pretrained weights need to be manually downloaded from {}".format(
|
| 237 |
+
model_urls["imagenet"]
|
| 238 |
+
)
|
| 239 |
+
)
|
| 240 |
+
return model
|
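In eval mode the mlfn factory returns the fused 1024-d embedding (the average of the fc_x and fc_s projections); in training mode it returns logits, plus the embedding when loss="triplet". A short sketch with pretrained=False to skip the manual-download warning; the input resolution is chosen only for illustration:

import torch
from boxmot.appearance.backbones.mlfn import mlfn

model = mlfn(num_classes=751, loss="softmax", pretrained=False)
model.eval()

with torch.no_grad():
    v = model(torch.randn(2, 3, 256, 128))   # illustrative person-crop resolution

print(v.shape)                                # torch.Size([2, 1024]) embedding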
boxmot/appearance/backbones/mobilenetv2.py
ADDED
|
@@ -0,0 +1,246 @@
| 1 |
+
# Mikel Broström 🔥 Yolo Tracking 🧾 AGPL-3.0 license
|
| 2 |
+
|
| 3 |
+
from __future__ import absolute_import, division
|
| 4 |
+
|
| 5 |
+
import torch.utils.model_zoo as model_zoo
|
| 6 |
+
from torch import nn
|
| 7 |
+
from torch.nn import functional as F
|
| 8 |
+
|
| 9 |
+
__all__ = ["mobilenetv2_x1_0", "mobilenetv2_x1_4"]
|
| 10 |
+
|
| 11 |
+
model_urls = {
|
| 12 |
+
# 1.0: top-1 71.3
|
| 13 |
+
"mobilenetv2_x1_0": "https://mega.nz/#!NKp2wAIA!1NH1pbNzY_M2hVk_hdsxNM1NUOWvvGPHhaNr-fASF6c",
|
| 14 |
+
# 1.4: top-1 73.9
|
| 15 |
+
"mobilenetv2_x1_4": "https://mega.nz/#!RGhgEIwS!xN2s2ZdyqI6vQ3EwgmRXLEW3khr9tpXg96G9SUJugGk",
|
| 16 |
+
}
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class ConvBlock(nn.Module):
|
| 20 |
+
"""Basic convolutional block.
|
| 21 |
+
|
| 22 |
+
convolution (bias discarded) + batch normalization + relu6.
|
| 23 |
+
|
| 24 |
+
Args:
|
| 25 |
+
in_c (int): number of input channels.
|
| 26 |
+
out_c (int): number of output channels.
|
| 27 |
+
k (int or tuple): kernel size.
|
| 28 |
+
s (int or tuple): stride.
|
| 29 |
+
p (int or tuple): padding.
|
| 30 |
+
g (int): number of blocked connections from input channels
|
| 31 |
+
to output channels (default: 1).
|
| 32 |
+
"""
|
| 33 |
+
|
| 34 |
+
def __init__(self, in_c, out_c, k, s=1, p=0, g=1):
|
| 35 |
+
super(ConvBlock, self).__init__()
|
| 36 |
+
self.conv = nn.Conv2d(in_c, out_c, k, stride=s, padding=p, bias=False, groups=g)
|
| 37 |
+
self.bn = nn.BatchNorm2d(out_c)
|
| 38 |
+
|
| 39 |
+
def forward(self, x):
|
| 40 |
+
return F.relu6(self.bn(self.conv(x)))
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
class Bottleneck(nn.Module):
|
| 44 |
+
def __init__(self, in_channels, out_channels, expansion_factor, stride=1):
|
| 45 |
+
super(Bottleneck, self).__init__()
|
| 46 |
+
mid_channels = in_channels * expansion_factor
|
| 47 |
+
self.use_residual = stride == 1 and in_channels == out_channels
|
| 48 |
+
self.conv1 = ConvBlock(in_channels, mid_channels, 1)
|
| 49 |
+
self.dwconv2 = ConvBlock(
|
| 50 |
+
mid_channels, mid_channels, 3, stride, 1, g=mid_channels
|
| 51 |
+
)
|
| 52 |
+
self.conv3 = nn.Sequential(
|
| 53 |
+
nn.Conv2d(mid_channels, out_channels, 1, bias=False),
|
| 54 |
+
nn.BatchNorm2d(out_channels),
|
| 55 |
+
)
|
| 56 |
+
|
| 57 |
+
def forward(self, x):
|
| 58 |
+
m = self.conv1(x)
|
| 59 |
+
m = self.dwconv2(m)
|
| 60 |
+
m = self.conv3(m)
|
| 61 |
+
if self.use_residual:
|
| 62 |
+
return x + m
|
| 63 |
+
else:
|
| 64 |
+
return m
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
class MobileNetV2(nn.Module):
|
| 68 |
+
"""MobileNetV2.
|
| 69 |
+
|
| 70 |
+
Reference:
|
| 71 |
+
Sandler et al. MobileNetV2: Inverted Residuals and
|
| 72 |
+
Linear Bottlenecks. CVPR 2018.
|
| 73 |
+
|
| 74 |
+
Public keys:
|
| 75 |
+
- ``mobilenetv2_x1_0``: MobileNetV2 x1.0.
|
| 76 |
+
- ``mobilenetv2_x1_4``: MobileNetV2 x1.4.
|
| 77 |
+
"""
|
| 78 |
+
|
| 79 |
+
def __init__(
|
| 80 |
+
self,
|
| 81 |
+
num_classes,
|
| 82 |
+
width_mult=1,
|
| 83 |
+
loss="softmax",
|
| 84 |
+
fc_dims=None,
|
| 85 |
+
dropout_p=None,
|
| 86 |
+
**kwargs
|
| 87 |
+
):
|
| 88 |
+
super(MobileNetV2, self).__init__()
|
| 89 |
+
self.loss = loss
|
| 90 |
+
self.in_channels = int(32 * width_mult)
|
| 91 |
+
self.feature_dim = int(1280 * width_mult) if width_mult > 1 else 1280
|
| 92 |
+
|
| 93 |
+
# construct layers
|
| 94 |
+
self.conv1 = ConvBlock(3, self.in_channels, 3, s=2, p=1)
|
| 95 |
+
self.conv2 = self._make_layer(Bottleneck, 1, int(16 * width_mult), 1, 1)
|
| 96 |
+
self.conv3 = self._make_layer(Bottleneck, 6, int(24 * width_mult), 2, 2)
|
| 97 |
+
self.conv4 = self._make_layer(Bottleneck, 6, int(32 * width_mult), 3, 2)
|
| 98 |
+
self.conv5 = self._make_layer(Bottleneck, 6, int(64 * width_mult), 4, 2)
|
| 99 |
+
self.conv6 = self._make_layer(Bottleneck, 6, int(96 * width_mult), 3, 1)
|
| 100 |
+
self.conv7 = self._make_layer(Bottleneck, 6, int(160 * width_mult), 3, 2)
|
| 101 |
+
self.conv8 = self._make_layer(Bottleneck, 6, int(320 * width_mult), 1, 1)
|
| 102 |
+
self.conv9 = ConvBlock(self.in_channels, self.feature_dim, 1)
|
| 103 |
+
|
| 104 |
+
self.global_avgpool = nn.AdaptiveAvgPool2d(1)
|
| 105 |
+
self.fc = self._construct_fc_layer(fc_dims, self.feature_dim, dropout_p)
|
| 106 |
+
self.classifier = nn.Linear(self.feature_dim, num_classes)
|
| 107 |
+
|
| 108 |
+
self._init_params()
|
| 109 |
+
|
| 110 |
+
def _make_layer(self, block, t, c, n, s):
|
| 111 |
+
# t: expansion factor
|
| 112 |
+
# c: output channels
|
| 113 |
+
# n: number of blocks
|
| 114 |
+
# s: stride for first layer
|
| 115 |
+
layers = []
|
| 116 |
+
layers.append(block(self.in_channels, c, t, s))
|
| 117 |
+
self.in_channels = c
|
| 118 |
+
for i in range(1, n):
|
| 119 |
+
layers.append(block(self.in_channels, c, t))
|
| 120 |
+
return nn.Sequential(*layers)
|
| 121 |
+
|
| 122 |
+
def _construct_fc_layer(self, fc_dims, input_dim, dropout_p=None):
|
| 123 |
+
"""Constructs fully connected layer.
|
| 124 |
+
|
| 125 |
+
Args:
|
| 126 |
+
fc_dims (list or tuple): dimensions of fc layers, if None, no fc layers are constructed
|
| 127 |
+
input_dim (int): input dimension
|
| 128 |
+
dropout_p (float): dropout probability, if None, dropout is unused
|
| 129 |
+
"""
|
| 130 |
+
if fc_dims is None:
|
| 131 |
+
self.feature_dim = input_dim
|
| 132 |
+
return None
|
| 133 |
+
|
| 134 |
+
assert isinstance(
|
| 135 |
+
fc_dims, (list, tuple)
|
| 136 |
+
), "fc_dims must be either list or tuple, but got {}".format(type(fc_dims))
|
| 137 |
+
|
| 138 |
+
layers = []
|
| 139 |
+
for dim in fc_dims:
|
| 140 |
+
layers.append(nn.Linear(input_dim, dim))
|
| 141 |
+
layers.append(nn.BatchNorm1d(dim))
|
| 142 |
+
layers.append(nn.ReLU(inplace=True))
|
| 143 |
+
if dropout_p is not None:
|
| 144 |
+
layers.append(nn.Dropout(p=dropout_p))
|
| 145 |
+
input_dim = dim
|
| 146 |
+
|
| 147 |
+
self.feature_dim = fc_dims[-1]
|
| 148 |
+
|
| 149 |
+
return nn.Sequential(*layers)
|
| 150 |
+
|
| 151 |
+
def _init_params(self):
|
| 152 |
+
for m in self.modules():
|
| 153 |
+
if isinstance(m, nn.Conv2d):
|
| 154 |
+
nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
|
| 155 |
+
if m.bias is not None:
|
| 156 |
+
nn.init.constant_(m.bias, 0)
|
| 157 |
+
elif isinstance(m, nn.BatchNorm2d):
|
| 158 |
+
nn.init.constant_(m.weight, 1)
|
| 159 |
+
nn.init.constant_(m.bias, 0)
|
| 160 |
+
elif isinstance(m, nn.BatchNorm1d):
|
| 161 |
+
nn.init.constant_(m.weight, 1)
|
| 162 |
+
nn.init.constant_(m.bias, 0)
|
| 163 |
+
elif isinstance(m, nn.Linear):
|
| 164 |
+
nn.init.normal_(m.weight, 0, 0.01)
|
| 165 |
+
if m.bias is not None:
|
| 166 |
+
nn.init.constant_(m.bias, 0)
|
| 167 |
+
|
| 168 |
+
def featuremaps(self, x):
|
| 169 |
+
x = self.conv1(x)
|
| 170 |
+
x = self.conv2(x)
|
| 171 |
+
x = self.conv3(x)
|
| 172 |
+
x = self.conv4(x)
|
| 173 |
+
x = self.conv5(x)
|
| 174 |
+
x = self.conv6(x)
|
| 175 |
+
x = self.conv7(x)
|
| 176 |
+
x = self.conv8(x)
|
| 177 |
+
x = self.conv9(x)
|
| 178 |
+
return x
|
| 179 |
+
|
| 180 |
+
def forward(self, x):
|
| 181 |
+
f = self.featuremaps(x)
|
| 182 |
+
v = self.global_avgpool(f)
|
| 183 |
+
v = v.view(v.size(0), -1)
|
| 184 |
+
|
| 185 |
+
if self.fc is not None:
|
| 186 |
+
v = self.fc(v)
|
| 187 |
+
|
| 188 |
+
if not self.training:
|
| 189 |
+
return v
|
| 190 |
+
|
| 191 |
+
y = self.classifier(v)
|
| 192 |
+
|
| 193 |
+
if self.loss == "softmax":
|
| 194 |
+
return y
|
| 195 |
+
elif self.loss == "triplet":
|
| 196 |
+
return y, v
|
| 197 |
+
else:
|
| 198 |
+
raise KeyError("Unsupported loss: {}".format(self.loss))
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
def init_pretrained_weights(model, model_url):
|
| 202 |
+
"""Initializes model with pretrained weights.
|
| 203 |
+
|
| 204 |
+
Layers that don't match with pretrained layers in name or size are kept unchanged.
|
| 205 |
+
"""
|
| 206 |
+
pretrain_dict = model_zoo.load_url(model_url)
|
| 207 |
+
model_dict = model.state_dict()
|
| 208 |
+
pretrain_dict = {
|
| 209 |
+
k: v
|
| 210 |
+
for k, v in pretrain_dict.items()
|
| 211 |
+
if k in model_dict and model_dict[k].size() == v.size()
|
| 212 |
+
}
|
| 213 |
+
model_dict.update(pretrain_dict)
|
| 214 |
+
model.load_state_dict(model_dict)
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
def mobilenetv2_x1_0(num_classes, loss, pretrained=True, **kwargs):
|
| 218 |
+
model = MobileNetV2(
|
| 219 |
+
num_classes, loss=loss, width_mult=1, fc_dims=None, dropout_p=None, **kwargs
|
| 220 |
+
)
|
| 221 |
+
if pretrained:
|
| 222 |
+
# init_pretrained_weights(model, model_urls['mobilenetv2_x1_0'])
|
| 223 |
+
import warnings
|
| 224 |
+
|
| 225 |
+
warnings.warn(
|
| 226 |
+
"The imagenet pretrained weights need to be manually downloaded from {}".format(
|
| 227 |
+
model_urls["mobilenetv2_x1_0"]
|
| 228 |
+
)
|
| 229 |
+
)
|
| 230 |
+
return model
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
def mobilenetv2_x1_4(num_classes, loss, pretrained=True, **kwargs):
|
| 234 |
+
model = MobileNetV2(
|
| 235 |
+
num_classes, loss=loss, width_mult=1.4, fc_dims=None, dropout_p=None, **kwargs
|
| 236 |
+
)
|
| 237 |
+
if pretrained:
|
| 238 |
+
# init_pretrained_weights(model, model_urls['mobilenetv2_x1_4'])
|
| 239 |
+
import warnings
|
| 240 |
+
|
| 241 |
+
warnings.warn(
|
| 242 |
+
"The imagenet pretrained weights need to be manually downloaded from {}".format(
|
| 243 |
+
model_urls["mobilenetv2_x1_4"]
|
| 244 |
+
)
|
| 245 |
+
)
|
| 246 |
+
return model
|
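The MobileNetV2 factories follow the same convention: in eval mode they return the globally pooled feature vector (1280-d at width multiplier 1.0), in training mode the classifier output. A sketch under the same assumptions as above (no pretrained weights, illustrative input size):

import torch
from boxmot.appearance.backbones.mobilenetv2 import mobilenetv2_x1_0

model = mobilenetv2_x1_0(num_classes=751, loss="softmax", pretrained=False)
model.eval()

with torch.no_grad():
    v = model(torch.randn(2, 3, 256, 128))

print(v.shape)   # torch.Size([2, 1280]) -- feature_dim for width_mult = 1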
boxmot/appearance/backbones/osnet.py
ADDED
|
@@ -0,0 +1,560 @@
| 1 |
+
# Mikel Broström 🔥 Yolo Tracking 🧾 AGPL-3.0 license
|
| 2 |
+
|
| 3 |
+
from __future__ import absolute_import, division
|
| 4 |
+
|
| 5 |
+
import warnings
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
from torch import nn
|
| 9 |
+
from torch.nn import functional as F
|
| 10 |
+
|
| 11 |
+
__all__ = ["osnet_x1_0", "osnet_x0_75", "osnet_x0_5", "osnet_x0_25", "osnet_ibn_x1_0"]
|
| 12 |
+
|
| 13 |
+
pretrained_urls = {
|
| 14 |
+
"osnet_x1_0": "https://drive.google.com/uc?id=1LaG1EJpHrxdAxKnSCJ_i0u-nbxSAeiFY",
|
| 15 |
+
"osnet_x0_75": "https://drive.google.com/uc?id=1uwA9fElHOk3ZogwbeY5GkLI6QPTX70Hq",
|
| 16 |
+
"osnet_x0_5": "https://drive.google.com/uc?id=16DGLbZukvVYgINws8u8deSaOqjybZ83i",
|
| 17 |
+
"osnet_x0_25": "https://drive.google.com/uc?id=1rb8UN5ZzPKRc_xvtHlyDh-cSz88YX9hs",
|
| 18 |
+
"osnet_ibn_x1_0": "https://drive.google.com/uc?id=1sr90V6irlYYDd4_4ISU2iruoRG8J__6l",
|
| 19 |
+
}
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
##########
|
| 23 |
+
# Basic layers
|
| 24 |
+
##########
|
| 25 |
+
class ConvLayer(nn.Module):
|
| 26 |
+
"""Convolution layer (conv + bn + relu)."""
|
| 27 |
+
|
| 28 |
+
def __init__(
|
| 29 |
+
self,
|
| 30 |
+
in_channels,
|
| 31 |
+
out_channels,
|
| 32 |
+
kernel_size,
|
| 33 |
+
stride=1,
|
| 34 |
+
padding=0,
|
| 35 |
+
groups=1,
|
| 36 |
+
IN=False,
|
| 37 |
+
):
|
| 38 |
+
super(ConvLayer, self).__init__()
|
| 39 |
+
self.conv = nn.Conv2d(
|
| 40 |
+
in_channels,
|
| 41 |
+
out_channels,
|
| 42 |
+
kernel_size,
|
| 43 |
+
stride=stride,
|
| 44 |
+
padding=padding,
|
| 45 |
+
bias=False,
|
| 46 |
+
groups=groups,
|
| 47 |
+
)
|
| 48 |
+
if IN:
|
| 49 |
+
self.bn = nn.InstanceNorm2d(out_channels, affine=True)
|
| 50 |
+
else:
|
| 51 |
+
self.bn = nn.BatchNorm2d(out_channels)
|
| 52 |
+
self.relu = nn.ReLU(inplace=True)
|
| 53 |
+
|
| 54 |
+
def forward(self, x):
|
| 55 |
+
x = self.conv(x)
|
| 56 |
+
x = self.bn(x)
|
| 57 |
+
x = self.relu(x)
|
| 58 |
+
return x
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
class Conv1x1(nn.Module):
|
| 62 |
+
"""1x1 convolution + bn + relu."""
|
| 63 |
+
|
| 64 |
+
def __init__(self, in_channels, out_channels, stride=1, groups=1):
|
| 65 |
+
super(Conv1x1, self).__init__()
|
| 66 |
+
self.conv = nn.Conv2d(
|
| 67 |
+
in_channels,
|
| 68 |
+
out_channels,
|
| 69 |
+
1,
|
| 70 |
+
stride=stride,
|
| 71 |
+
padding=0,
|
| 72 |
+
bias=False,
|
| 73 |
+
groups=groups,
|
| 74 |
+
)
|
| 75 |
+
self.bn = nn.BatchNorm2d(out_channels)
|
| 76 |
+
self.relu = nn.ReLU(inplace=True)
|
| 77 |
+
|
| 78 |
+
def forward(self, x):
|
| 79 |
+
x = self.conv(x)
|
| 80 |
+
x = self.bn(x)
|
| 81 |
+
x = self.relu(x)
|
| 82 |
+
return x
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
class Conv1x1Linear(nn.Module):
|
| 86 |
+
"""1x1 convolution + bn (w/o non-linearity)."""
|
| 87 |
+
|
| 88 |
+
def __init__(self, in_channels, out_channels, stride=1):
|
| 89 |
+
super(Conv1x1Linear, self).__init__()
|
| 90 |
+
self.conv = nn.Conv2d(
|
| 91 |
+
in_channels, out_channels, 1, stride=stride, padding=0, bias=False
|
| 92 |
+
)
|
| 93 |
+
self.bn = nn.BatchNorm2d(out_channels)
|
| 94 |
+
|
| 95 |
+
def forward(self, x):
|
| 96 |
+
x = self.conv(x)
|
| 97 |
+
x = self.bn(x)
|
| 98 |
+
return x
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
class Conv3x3(nn.Module):
|
| 102 |
+
"""3x3 convolution + bn + relu."""
|
| 103 |
+
|
| 104 |
+
def __init__(self, in_channels, out_channels, stride=1, groups=1):
|
| 105 |
+
super(Conv3x3, self).__init__()
|
| 106 |
+
self.conv = nn.Conv2d(
|
| 107 |
+
in_channels,
|
| 108 |
+
out_channels,
|
| 109 |
+
3,
|
| 110 |
+
stride=stride,
|
| 111 |
+
padding=1,
|
| 112 |
+
bias=False,
|
| 113 |
+
groups=groups,
|
| 114 |
+
)
|
| 115 |
+
self.bn = nn.BatchNorm2d(out_channels)
|
| 116 |
+
self.relu = nn.ReLU(inplace=True)
|
| 117 |
+
|
| 118 |
+
def forward(self, x):
|
| 119 |
+
x = self.conv(x)
|
| 120 |
+
x = self.bn(x)
|
| 121 |
+
x = self.relu(x)
|
| 122 |
+
return x
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
class LightConv3x3(nn.Module):
|
| 126 |
+
"""Lightweight 3x3 convolution.
|
| 127 |
+
|
| 128 |
+
1x1 (linear) + dw 3x3 (nonlinear).
|
| 129 |
+
"""
|
| 130 |
+
|
| 131 |
+
def __init__(self, in_channels, out_channels):
|
| 132 |
+
super(LightConv3x3, self).__init__()
|
| 133 |
+
self.conv1 = nn.Conv2d(
|
| 134 |
+
in_channels, out_channels, 1, stride=1, padding=0, bias=False
|
| 135 |
+
)
|
| 136 |
+
self.conv2 = nn.Conv2d(
|
| 137 |
+
out_channels,
|
| 138 |
+
out_channels,
|
| 139 |
+
3,
|
| 140 |
+
stride=1,
|
| 141 |
+
padding=1,
|
| 142 |
+
bias=False,
|
| 143 |
+
groups=out_channels,
|
| 144 |
+
)
|
| 145 |
+
self.bn = nn.BatchNorm2d(out_channels)
|
| 146 |
+
self.relu = nn.ReLU(inplace=True)
|
| 147 |
+
|
| 148 |
+
def forward(self, x):
|
| 149 |
+
x = self.conv1(x)
|
| 150 |
+
x = self.conv2(x)
|
| 151 |
+
x = self.bn(x)
|
| 152 |
+
x = self.relu(x)
|
| 153 |
+
return x
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
##########
|
| 157 |
+
# Building blocks for omni-scale feature learning
|
| 158 |
+
##########
|
| 159 |
+
class ChannelGate(nn.Module):
|
| 160 |
+
"""A mini-network that generates channel-wise gates conditioned on input tensor."""
|
| 161 |
+
|
| 162 |
+
def __init__(
|
| 163 |
+
self,
|
| 164 |
+
in_channels,
|
| 165 |
+
num_gates=None,
|
| 166 |
+
return_gates=False,
|
| 167 |
+
gate_activation="sigmoid",
|
| 168 |
+
reduction=16,
|
| 169 |
+
layer_norm=False,
|
| 170 |
+
):
|
| 171 |
+
super(ChannelGate, self).__init__()
|
| 172 |
+
if num_gates is None:
|
| 173 |
+
num_gates = in_channels
|
| 174 |
+
self.return_gates = return_gates
|
| 175 |
+
self.global_avgpool = nn.AdaptiveAvgPool2d(1)
|
| 176 |
+
self.fc1 = nn.Conv2d(
|
| 177 |
+
in_channels, in_channels // reduction, kernel_size=1, bias=True, padding=0
|
| 178 |
+
)
|
| 179 |
+
self.norm1 = None
|
| 180 |
+
if layer_norm:
|
| 181 |
+
self.norm1 = nn.LayerNorm((in_channels // reduction, 1, 1))
|
| 182 |
+
self.relu = nn.ReLU(inplace=True)
|
| 183 |
+
self.fc2 = nn.Conv2d(
|
| 184 |
+
in_channels // reduction, num_gates, kernel_size=1, bias=True, padding=0
|
| 185 |
+
)
|
| 186 |
+
if gate_activation == "sigmoid":
|
| 187 |
+
self.gate_activation = nn.Sigmoid()
|
| 188 |
+
elif gate_activation == "relu":
|
| 189 |
+
self.gate_activation = nn.ReLU(inplace=True)
|
| 190 |
+
elif gate_activation == "linear":
|
| 191 |
+
self.gate_activation = None
|
| 192 |
+
else:
|
| 193 |
+
raise RuntimeError("Unknown gate activation: {}".format(gate_activation))
|
| 194 |
+
|
| 195 |
+
def forward(self, x):
|
| 196 |
+
input = x
|
| 197 |
+
x = self.global_avgpool(x)
|
| 198 |
+
x = self.fc1(x)
|
| 199 |
+
if self.norm1 is not None:
|
| 200 |
+
x = self.norm1(x)
|
| 201 |
+
x = self.relu(x)
|
| 202 |
+
x = self.fc2(x)
|
| 203 |
+
if self.gate_activation is not None:
|
| 204 |
+
x = self.gate_activation(x)
|
| 205 |
+
if self.return_gates:
|
| 206 |
+
return x
|
| 207 |
+
return input * x
|
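# Hypothetical check of ChannelGate above (my addition, not part of the committed file):
# the module acts as a squeeze-and-excitation style gate, returning the input rescaled
# channel-wise; the module path below is assumed from this repo's layout.
import torch
from boxmot.appearance.backbones.osnet import ChannelGate

gate = ChannelGate(in_channels=64)  # reduction=16 -> 4 hidden channels in the gate MLP
y = gate(torch.randn(1, 64, 16, 8))
print(y.shape)  # torch.Size([1, 64, 16, 8]) -> same shape as the input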
| 208 |
+
|
| 209 |
+
|
| 210 |
+
class OSBlock(nn.Module):
|
| 211 |
+
"""Omni-scale feature learning block."""
|
| 212 |
+
|
| 213 |
+
def __init__(
|
| 214 |
+
self, in_channels, out_channels, IN=False, bottleneck_reduction=4, **kwargs
|
| 215 |
+
):
|
| 216 |
+
super(OSBlock, self).__init__()
|
| 217 |
+
mid_channels = out_channels // bottleneck_reduction
|
| 218 |
+
self.conv1 = Conv1x1(in_channels, mid_channels)
|
| 219 |
+
self.conv2a = LightConv3x3(mid_channels, mid_channels)
|
| 220 |
+
self.conv2b = nn.Sequential(
|
| 221 |
+
LightConv3x3(mid_channels, mid_channels),
|
| 222 |
+
LightConv3x3(mid_channels, mid_channels),
|
| 223 |
+
)
|
| 224 |
+
self.conv2c = nn.Sequential(
|
| 225 |
+
LightConv3x3(mid_channels, mid_channels),
|
| 226 |
+
LightConv3x3(mid_channels, mid_channels),
|
| 227 |
+
LightConv3x3(mid_channels, mid_channels),
|
| 228 |
+
)
|
| 229 |
+
self.conv2d = nn.Sequential(
|
| 230 |
+
LightConv3x3(mid_channels, mid_channels),
|
| 231 |
+
LightConv3x3(mid_channels, mid_channels),
|
| 232 |
+
LightConv3x3(mid_channels, mid_channels),
|
| 233 |
+
LightConv3x3(mid_channels, mid_channels),
|
| 234 |
+
)
|
| 235 |
+
self.gate = ChannelGate(mid_channels)
|
| 236 |
+
self.conv3 = Conv1x1Linear(mid_channels, out_channels)
|
| 237 |
+
self.downsample = None
|
| 238 |
+
if in_channels != out_channels:
|
| 239 |
+
self.downsample = Conv1x1Linear(in_channels, out_channels)
|
| 240 |
+
self.IN = None
|
| 241 |
+
if IN:
|
| 242 |
+
self.IN = nn.InstanceNorm2d(out_channels, affine=True)
|
| 243 |
+
|
| 244 |
+
def forward(self, x):
|
| 245 |
+
identity = x
|
| 246 |
+
x1 = self.conv1(x)
|
| 247 |
+
x2a = self.conv2a(x1)
|
| 248 |
+
x2b = self.conv2b(x1)
|
| 249 |
+
x2c = self.conv2c(x1)
|
| 250 |
+
x2d = self.conv2d(x1)
|
| 251 |
+
x2 = self.gate(x2a) + self.gate(x2b) + self.gate(x2c) + self.gate(x2d)
|
| 252 |
+
x3 = self.conv3(x2)
|
| 253 |
+
if self.downsample is not None:
|
| 254 |
+
identity = self.downsample(identity)
|
| 255 |
+
out = x3 + identity
|
| 256 |
+
if self.IN is not None:
|
| 257 |
+
out = self.IN(out)
|
| 258 |
+
return F.relu(out)
|
| 259 |
+
|
| 260 |
+
|
| 261 |
+
##########
|
| 262 |
+
# Network architecture
|
| 263 |
+
##########
|
| 264 |
+
class OSNet(nn.Module):
|
| 265 |
+
"""Omni-Scale Network.
|
| 266 |
+
|
| 267 |
+
Reference:
|
| 268 |
+
- Zhou et al. Omni-Scale Feature Learning for Person Re-Identification. ICCV, 2019.
|
| 269 |
+
- Zhou et al. Learning Generalisable Omni-Scale Representations
|
| 270 |
+
for Person Re-Identification. TPAMI, 2021.
|
| 271 |
+
"""
|
| 272 |
+
|
| 273 |
+
def __init__(
|
| 274 |
+
self,
|
| 275 |
+
num_classes,
|
| 276 |
+
blocks,
|
| 277 |
+
layers,
|
| 278 |
+
channels,
|
| 279 |
+
feature_dim=512,
|
| 280 |
+
loss="softmax",
|
| 281 |
+
IN=False,
|
| 282 |
+
**kwargs
|
| 283 |
+
):
|
| 284 |
+
super(OSNet, self).__init__()
|
| 285 |
+
num_blocks = len(blocks)
|
| 286 |
+
assert num_blocks == len(layers)
|
| 287 |
+
assert num_blocks == len(channels) - 1
|
| 288 |
+
self.loss = loss
|
| 289 |
+
self.feature_dim = feature_dim
|
| 290 |
+
|
| 291 |
+
# convolutional backbone
|
| 292 |
+
self.conv1 = ConvLayer(3, channels[0], 7, stride=2, padding=3, IN=IN)
|
| 293 |
+
self.maxpool = nn.MaxPool2d(3, stride=2, padding=1)
|
| 294 |
+
self.conv2 = self._make_layer(
|
| 295 |
+
blocks[0],
|
| 296 |
+
layers[0],
|
| 297 |
+
channels[0],
|
| 298 |
+
channels[1],
|
| 299 |
+
reduce_spatial_size=True,
|
| 300 |
+
IN=IN,
|
| 301 |
+
)
|
| 302 |
+
self.conv3 = self._make_layer(
|
| 303 |
+
blocks[1], layers[1], channels[1], channels[2], reduce_spatial_size=True
|
| 304 |
+
)
|
| 305 |
+
self.conv4 = self._make_layer(
|
| 306 |
+
blocks[2], layers[2], channels[2], channels[3], reduce_spatial_size=False
|
| 307 |
+
)
|
| 308 |
+
self.conv5 = Conv1x1(channels[3], channels[3])
|
| 309 |
+
self.global_avgpool = nn.AdaptiveAvgPool2d(1)
|
| 310 |
+
# fully connected layer
|
| 311 |
+
self.fc = self._construct_fc_layer(
|
| 312 |
+
self.feature_dim, channels[3], dropout_p=None
|
| 313 |
+
)
|
| 314 |
+
# identity classification layer
|
| 315 |
+
self.classifier = nn.Linear(self.feature_dim, num_classes)
|
| 316 |
+
|
| 317 |
+
self._init_params()
|
| 318 |
+
|
| 319 |
+
def _make_layer(
|
| 320 |
+
self, block, layer, in_channels, out_channels, reduce_spatial_size, IN=False
|
| 321 |
+
):
|
| 322 |
+
layers = []
|
| 323 |
+
|
| 324 |
+
layers.append(block(in_channels, out_channels, IN=IN))
|
| 325 |
+
for i in range(1, layer):
|
| 326 |
+
layers.append(block(out_channels, out_channels, IN=IN))
|
| 327 |
+
|
| 328 |
+
if reduce_spatial_size:
|
| 329 |
+
layers.append(
|
| 330 |
+
nn.Sequential(
|
| 331 |
+
Conv1x1(out_channels, out_channels), nn.AvgPool2d(2, stride=2)
|
| 332 |
+
)
|
| 333 |
+
)
|
| 334 |
+
|
| 335 |
+
return nn.Sequential(*layers)
|
| 336 |
+
|
| 337 |
+
def _construct_fc_layer(self, fc_dims, input_dim, dropout_p=None):
|
| 338 |
+
if fc_dims is None or fc_dims < 0:
|
| 339 |
+
self.feature_dim = input_dim
|
| 340 |
+
return None
|
| 341 |
+
|
| 342 |
+
if isinstance(fc_dims, int):
|
| 343 |
+
fc_dims = [fc_dims]
|
| 344 |
+
|
| 345 |
+
layers = []
|
| 346 |
+
for dim in fc_dims:
|
| 347 |
+
layers.append(nn.Linear(input_dim, dim))
|
| 348 |
+
layers.append(nn.BatchNorm1d(dim))
|
| 349 |
+
layers.append(nn.ReLU(inplace=True))
|
| 350 |
+
if dropout_p is not None:
|
| 351 |
+
layers.append(nn.Dropout(p=dropout_p))
|
| 352 |
+
input_dim = dim
|
| 353 |
+
|
| 354 |
+
self.feature_dim = fc_dims[-1]
|
| 355 |
+
|
| 356 |
+
return nn.Sequential(*layers)
|
| 357 |
+
|
| 358 |
+
def _init_params(self):
|
| 359 |
+
for m in self.modules():
|
| 360 |
+
if isinstance(m, nn.Conv2d):
|
| 361 |
+
nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
|
| 362 |
+
if m.bias is not None:
|
| 363 |
+
nn.init.constant_(m.bias, 0)
|
| 364 |
+
|
| 365 |
+
elif isinstance(m, nn.BatchNorm2d):
|
| 366 |
+
nn.init.constant_(m.weight, 1)
|
| 367 |
+
nn.init.constant_(m.bias, 0)
|
| 368 |
+
|
| 369 |
+
elif isinstance(m, nn.BatchNorm1d):
|
| 370 |
+
nn.init.constant_(m.weight, 1)
|
| 371 |
+
nn.init.constant_(m.bias, 0)
|
| 372 |
+
|
| 373 |
+
elif isinstance(m, nn.Linear):
|
| 374 |
+
nn.init.normal_(m.weight, 0, 0.01)
|
| 375 |
+
if m.bias is not None:
|
| 376 |
+
nn.init.constant_(m.bias, 0)
|
| 377 |
+
|
| 378 |
+
def featuremaps(self, x):
|
| 379 |
+
x = self.conv1(x)
|
| 380 |
+
x = self.maxpool(x)
|
| 381 |
+
x = self.conv2(x)
|
| 382 |
+
x = self.conv3(x)
|
| 383 |
+
x = self.conv4(x)
|
| 384 |
+
x = self.conv5(x)
|
| 385 |
+
return x
|
| 386 |
+
|
| 387 |
+
def forward(self, x, return_featuremaps=False):
|
| 388 |
+
x = self.featuremaps(x)
|
| 389 |
+
if return_featuremaps:
|
| 390 |
+
return x
|
| 391 |
+
v = self.global_avgpool(x)
|
| 392 |
+
v = v.view(v.size(0), -1)
|
| 393 |
+
if self.fc is not None:
|
| 394 |
+
v = self.fc(v)
|
| 395 |
+
if not self.training:
|
| 396 |
+
return v
|
| 397 |
+
y = self.classifier(v)
|
| 398 |
+
if self.loss == "softmax":
|
| 399 |
+
return y
|
| 400 |
+
elif self.loss == "triplet":
|
| 401 |
+
return y, v
|
| 402 |
+
else:
|
| 403 |
+
raise KeyError("Unsupported loss: {}".format(self.loss))
|
| 404 |
+
|
| 405 |
+
|
| 406 |
+
def init_pretrained_weights(model, key=""):
|
| 407 |
+
"""Initializes model with pretrained weights.
|
| 408 |
+
|
| 409 |
+
Layers that don't match with pretrained layers in name or size are kept unchanged.
|
| 410 |
+
"""
|
| 411 |
+
import errno
|
| 412 |
+
import os
|
| 413 |
+
from collections import OrderedDict
|
| 414 |
+
|
| 415 |
+
import gdown
|
| 416 |
+
|
| 417 |
+
def _get_torch_home():
|
| 418 |
+
ENV_TORCH_HOME = "TORCH_HOME"
|
| 419 |
+
ENV_XDG_CACHE_HOME = "XDG_CACHE_HOME"
|
| 420 |
+
DEFAULT_CACHE_DIR = "~/.cache"
|
| 421 |
+
torch_home = os.path.expanduser(
|
| 422 |
+
os.getenv(
|
| 423 |
+
ENV_TORCH_HOME,
|
| 424 |
+
os.path.join(os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), "torch"),
|
| 425 |
+
)
|
| 426 |
+
)
|
| 427 |
+
return torch_home
|
| 428 |
+
|
| 429 |
+
torch_home = _get_torch_home()
|
| 430 |
+
model_dir = os.path.join(torch_home, "checkpoints")
|
| 431 |
+
try:
|
| 432 |
+
os.makedirs(model_dir)
|
| 433 |
+
except OSError as e:
|
| 434 |
+
if e.errno == errno.EEXIST:
|
| 435 |
+
# Directory already exists, ignore.
|
| 436 |
+
pass
|
| 437 |
+
else:
|
| 438 |
+
# Unexpected OSError, re-raise.
|
| 439 |
+
raise
|
| 440 |
+
filename = key + "_imagenet.pth"
|
| 441 |
+
cached_file = os.path.join(model_dir, filename)
|
| 442 |
+
|
| 443 |
+
if not os.path.exists(cached_file):
|
| 444 |
+
gdown.download(pretrained_urls[key], cached_file, quiet=False)
|
| 445 |
+
|
| 446 |
+
state_dict = torch.load(cached_file)
|
| 447 |
+
model_dict = model.state_dict()
|
| 448 |
+
new_state_dict = OrderedDict()
|
| 449 |
+
matched_layers, discarded_layers = [], []
|
| 450 |
+
|
| 451 |
+
for k, v in state_dict.items():
|
| 452 |
+
if k.startswith("module."):
|
| 453 |
+
k = k[7:] # discard module.
|
| 454 |
+
|
| 455 |
+
if k in model_dict and model_dict[k].size() == v.size():
|
| 456 |
+
new_state_dict[k] = v
|
| 457 |
+
matched_layers.append(k)
|
| 458 |
+
else:
|
| 459 |
+
discarded_layers.append(k)
|
| 460 |
+
|
| 461 |
+
model_dict.update(new_state_dict)
|
| 462 |
+
model.load_state_dict(model_dict)
|
| 463 |
+
|
| 464 |
+
if len(matched_layers) == 0:
|
| 465 |
+
warnings.warn(
|
| 466 |
+
'The pretrained weights from "{}" cannot be loaded, '
|
| 467 |
+
"please check the key names manually "
|
| 468 |
+
"(** ignored and continue **)".format(cached_file)
|
| 469 |
+
)
|
| 470 |
+
else:
|
| 471 |
+
print(
|
| 472 |
+
'Successfully loaded imagenet pretrained weights from "{}"'.format(
|
| 473 |
+
cached_file
|
| 474 |
+
)
|
| 475 |
+
)
|
| 476 |
+
if len(discarded_layers) > 0:
|
| 477 |
+
print(
|
| 478 |
+
"** The following layers are discarded "
|
| 479 |
+
"due to unmatched keys or layer size: {}".format(discarded_layers)
|
| 480 |
+
)
|
| 481 |
+
|
| 482 |
+
|
| 483 |
+
##########
|
| 484 |
+
# Instantiation
|
| 485 |
+
##########
|
| 486 |
+
def osnet_x1_0(num_classes=1000, pretrained=True, loss="softmax", **kwargs):
|
| 487 |
+
# standard size (width x1.0)
|
| 488 |
+
model = OSNet(
|
| 489 |
+
num_classes,
|
| 490 |
+
blocks=[OSBlock, OSBlock, OSBlock],
|
| 491 |
+
layers=[2, 2, 2],
|
| 492 |
+
channels=[64, 256, 384, 512],
|
| 493 |
+
loss=loss,
|
| 494 |
+
**kwargs
|
| 495 |
+
)
|
| 496 |
+
if pretrained:
|
| 497 |
+
init_pretrained_weights(model, key="osnet_x1_0")
|
| 498 |
+
return model
|
| 499 |
+
|
| 500 |
+
|
| 501 |
+
def osnet_x0_75(num_classes=1000, pretrained=True, loss="softmax", **kwargs):
|
| 502 |
+
# medium size (width x0.75)
|
| 503 |
+
model = OSNet(
|
| 504 |
+
num_classes,
|
| 505 |
+
blocks=[OSBlock, OSBlock, OSBlock],
|
| 506 |
+
layers=[2, 2, 2],
|
| 507 |
+
channels=[48, 192, 288, 384],
|
| 508 |
+
loss=loss,
|
| 509 |
+
**kwargs
|
| 510 |
+
)
|
| 511 |
+
if pretrained:
|
| 512 |
+
init_pretrained_weights(model, key="osnet_x0_75")
|
| 513 |
+
return model
|
| 514 |
+
|
| 515 |
+
|
| 516 |
+
def osnet_x0_5(num_classes=1000, pretrained=True, loss="softmax", **kwargs):
|
| 517 |
+
# tiny size (width x0.5)
|
| 518 |
+
model = OSNet(
|
| 519 |
+
num_classes,
|
| 520 |
+
blocks=[OSBlock, OSBlock, OSBlock],
|
| 521 |
+
layers=[2, 2, 2],
|
| 522 |
+
channels=[32, 128, 192, 256],
|
| 523 |
+
loss=loss,
|
| 524 |
+
**kwargs
|
| 525 |
+
)
|
| 526 |
+
if pretrained:
|
| 527 |
+
init_pretrained_weights(model, key="osnet_x0_5")
|
| 528 |
+
return model
|
| 529 |
+
|
| 530 |
+
|
| 531 |
+
def osnet_x0_25(num_classes=1000, pretrained=True, loss="softmax", **kwargs):
|
| 532 |
+
# very tiny size (width x0.25)
|
| 533 |
+
model = OSNet(
|
| 534 |
+
num_classes,
|
| 535 |
+
blocks=[OSBlock, OSBlock, OSBlock],
|
| 536 |
+
layers=[2, 2, 2],
|
| 537 |
+
channels=[16, 64, 96, 128],
|
| 538 |
+
loss=loss,
|
| 539 |
+
**kwargs
|
| 540 |
+
)
|
| 541 |
+
if pretrained:
|
| 542 |
+
init_pretrained_weights(model, key="osnet_x0_25")
|
| 543 |
+
return model
|
| 544 |
+
|
| 545 |
+
|
| 546 |
+
def osnet_ibn_x1_0(num_classes=1000, pretrained=True, loss="softmax", **kwargs):
|
| 547 |
+
# standard size (width x1.0) + IBN layer
|
| 548 |
+
# Ref: Pan et al. Two at Once: Enhancing Learning and Generalization Capacities via IBN-Net. ECCV, 2018.
|
| 549 |
+
model = OSNet(
|
| 550 |
+
num_classes,
|
| 551 |
+
blocks=[OSBlock, OSBlock, OSBlock],
|
| 552 |
+
layers=[2, 2, 2],
|
| 553 |
+
channels=[64, 256, 384, 512],
|
| 554 |
+
loss=loss,
|
| 555 |
+
IN=True,
|
| 556 |
+
**kwargs
|
| 557 |
+
)
|
| 558 |
+
if pretrained:
|
| 559 |
+
init_pretrained_weights(model, key="osnet_ibn_x1_0")
|
| 560 |
+
return model
|
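A minimal, hypothetical usage sketch for the factory functions above (not part of the committed file). It assumes the import path `boxmot.appearance.backbones.osnet` from this repo's layout and a typical 256x128 re-ID crop; `pretrained=False` skips the Google Drive download.

import torch
from boxmot.appearance.backbones.osnet import osnet_x1_0

# Build OSNet x1.0 as a plain feature extractor.
model = osnet_x1_0(num_classes=1000, pretrained=False, loss="softmax")
model.eval()  # in eval mode, forward() returns the feature vector instead of logits
with torch.no_grad():
    feats = model(torch.randn(1, 3, 256, 128))  # crop size is an assumption
print(feats.shape)  # torch.Size([1, 512]); feature_dim defaults to 512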
boxmot/appearance/backbones/osnet_ain.py
ADDED
|
@@ -0,0 +1,582 @@
# Mikel Broström 🔥 Yolo Tracking 🧾 AGPL-3.0 license

from __future__ import absolute_import, division

import warnings

import torch
from torch import nn
from torch.nn import functional as F

__all__ = ["osnet_ain_x1_0", "osnet_ain_x0_75", "osnet_ain_x0_5", "osnet_ain_x0_25"]

pretrained_urls = {
    "osnet_ain_x1_0": "https://drive.google.com/uc?id=1-CaioD9NaqbHK_kzSMW8VE4_3KcsRjEo",
    "osnet_ain_x0_75": "https://drive.google.com/uc?id=1apy0hpsMypqstfencdH-jKIUEFOW4xoM",
    "osnet_ain_x0_5": "https://drive.google.com/uc?id=1KusKvEYyKGDTUBVRxRiz55G31wkihB6l",
    "osnet_ain_x0_25": "https://drive.google.com/uc?id=1SxQt2AvmEcgWNhaRb2xC4rP6ZwVDP0Wt",
}


##########
# Basic layers
##########
| 24 |
+
class ConvLayer(nn.Module):
|
| 25 |
+
"""Convolution layer (conv + bn + relu)."""
|
| 26 |
+
|
| 27 |
+
def __init__(
|
| 28 |
+
self,
|
| 29 |
+
in_channels,
|
| 30 |
+
out_channels,
|
| 31 |
+
kernel_size,
|
| 32 |
+
stride=1,
|
| 33 |
+
padding=0,
|
| 34 |
+
groups=1,
|
| 35 |
+
IN=False,
|
| 36 |
+
):
|
| 37 |
+
super(ConvLayer, self).__init__()
|
| 38 |
+
self.conv = nn.Conv2d(
|
| 39 |
+
in_channels,
|
| 40 |
+
out_channels,
|
| 41 |
+
kernel_size,
|
| 42 |
+
stride=stride,
|
| 43 |
+
padding=padding,
|
| 44 |
+
bias=False,
|
| 45 |
+
groups=groups,
|
| 46 |
+
)
|
| 47 |
+
if IN:
|
| 48 |
+
self.bn = nn.InstanceNorm2d(out_channels, affine=True)
|
| 49 |
+
else:
|
| 50 |
+
self.bn = nn.BatchNorm2d(out_channels)
|
| 51 |
+
self.relu = nn.ReLU()
|
| 52 |
+
|
| 53 |
+
def forward(self, x):
|
| 54 |
+
x = self.conv(x)
|
| 55 |
+
x = self.bn(x)
|
| 56 |
+
return self.relu(x)
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
class Conv1x1(nn.Module):
|
| 60 |
+
"""1x1 convolution + bn + relu."""
|
| 61 |
+
|
| 62 |
+
def __init__(self, in_channels, out_channels, stride=1, groups=1):
|
| 63 |
+
super(Conv1x1, self).__init__()
|
| 64 |
+
self.conv = nn.Conv2d(
|
| 65 |
+
in_channels,
|
| 66 |
+
out_channels,
|
| 67 |
+
1,
|
| 68 |
+
stride=stride,
|
| 69 |
+
padding=0,
|
| 70 |
+
bias=False,
|
| 71 |
+
groups=groups,
|
| 72 |
+
)
|
| 73 |
+
self.bn = nn.BatchNorm2d(out_channels)
|
| 74 |
+
self.relu = nn.ReLU()
|
| 75 |
+
|
| 76 |
+
def forward(self, x):
|
| 77 |
+
x = self.conv(x)
|
| 78 |
+
x = self.bn(x)
|
| 79 |
+
return self.relu(x)
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
class Conv1x1Linear(nn.Module):
|
| 83 |
+
"""1x1 convolution + bn (w/o non-linearity)."""
|
| 84 |
+
|
| 85 |
+
def __init__(self, in_channels, out_channels, stride=1, bn=True):
|
| 86 |
+
super(Conv1x1Linear, self).__init__()
|
| 87 |
+
self.conv = nn.Conv2d(
|
| 88 |
+
in_channels, out_channels, 1, stride=stride, padding=0, bias=False
|
| 89 |
+
)
|
| 90 |
+
self.bn = None
|
| 91 |
+
if bn:
|
| 92 |
+
self.bn = nn.BatchNorm2d(out_channels)
|
| 93 |
+
|
| 94 |
+
def forward(self, x):
|
| 95 |
+
x = self.conv(x)
|
| 96 |
+
if self.bn is not None:
|
| 97 |
+
x = self.bn(x)
|
| 98 |
+
return x
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
class Conv3x3(nn.Module):
|
| 102 |
+
"""3x3 convolution + bn + relu."""
|
| 103 |
+
|
| 104 |
+
def __init__(self, in_channels, out_channels, stride=1, groups=1):
|
| 105 |
+
super(Conv3x3, self).__init__()
|
| 106 |
+
self.conv = nn.Conv2d(
|
| 107 |
+
in_channels,
|
| 108 |
+
out_channels,
|
| 109 |
+
3,
|
| 110 |
+
stride=stride,
|
| 111 |
+
padding=1,
|
| 112 |
+
bias=False,
|
| 113 |
+
groups=groups,
|
| 114 |
+
)
|
| 115 |
+
self.bn = nn.BatchNorm2d(out_channels)
|
| 116 |
+
self.relu = nn.ReLU()
|
| 117 |
+
|
| 118 |
+
def forward(self, x):
|
| 119 |
+
x = self.conv(x)
|
| 120 |
+
x = self.bn(x)
|
| 121 |
+
return self.relu(x)
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
class LightConv3x3(nn.Module):
|
| 125 |
+
"""Lightweight 3x3 convolution.
|
| 126 |
+
|
| 127 |
+
1x1 (linear) + dw 3x3 (nonlinear).
|
| 128 |
+
"""
|
| 129 |
+
|
| 130 |
+
def __init__(self, in_channels, out_channels):
|
| 131 |
+
super(LightConv3x3, self).__init__()
|
| 132 |
+
self.conv1 = nn.Conv2d(
|
| 133 |
+
in_channels, out_channels, 1, stride=1, padding=0, bias=False
|
| 134 |
+
)
|
| 135 |
+
self.conv2 = nn.Conv2d(
|
| 136 |
+
out_channels,
|
| 137 |
+
out_channels,
|
| 138 |
+
3,
|
| 139 |
+
stride=1,
|
| 140 |
+
padding=1,
|
| 141 |
+
bias=False,
|
| 142 |
+
groups=out_channels,
|
| 143 |
+
)
|
| 144 |
+
self.bn = nn.BatchNorm2d(out_channels)
|
| 145 |
+
self.relu = nn.ReLU()
|
| 146 |
+
|
| 147 |
+
def forward(self, x):
|
| 148 |
+
x = self.conv1(x)
|
| 149 |
+
x = self.conv2(x)
|
| 150 |
+
x = self.bn(x)
|
| 151 |
+
return self.relu(x)
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
class LightConvStream(nn.Module):
|
| 155 |
+
"""Lightweight convolution stream."""
|
| 156 |
+
|
| 157 |
+
def __init__(self, in_channels, out_channels, depth):
|
| 158 |
+
super(LightConvStream, self).__init__()
|
| 159 |
+
assert depth >= 1, "depth must be equal to or larger than 1, but got {}".format(
|
| 160 |
+
depth
|
| 161 |
+
)
|
| 162 |
+
layers = []
|
| 163 |
+
layers += [LightConv3x3(in_channels, out_channels)]
|
| 164 |
+
for i in range(depth - 1):
|
| 165 |
+
layers += [LightConv3x3(out_channels, out_channels)]
|
| 166 |
+
self.layers = nn.Sequential(*layers)
|
| 167 |
+
|
| 168 |
+
def forward(self, x):
|
| 169 |
+
return self.layers(x)
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
##########
|
| 173 |
+
# Building blocks for omni-scale feature learning
|
| 174 |
+
##########
|
| 175 |
+
class ChannelGate(nn.Module):
|
| 176 |
+
"""A mini-network that generates channel-wise gates conditioned on input tensor."""
|
| 177 |
+
|
| 178 |
+
def __init__(
|
| 179 |
+
self,
|
| 180 |
+
in_channels,
|
| 181 |
+
num_gates=None,
|
| 182 |
+
return_gates=False,
|
| 183 |
+
gate_activation="sigmoid",
|
| 184 |
+
reduction=16,
|
| 185 |
+
layer_norm=False,
|
| 186 |
+
):
|
| 187 |
+
super(ChannelGate, self).__init__()
|
| 188 |
+
if num_gates is None:
|
| 189 |
+
num_gates = in_channels
|
| 190 |
+
self.return_gates = return_gates
|
| 191 |
+
self.global_avgpool = nn.AdaptiveAvgPool2d(1)
|
| 192 |
+
self.fc1 = nn.Conv2d(
|
| 193 |
+
in_channels, in_channels // reduction, kernel_size=1, bias=True, padding=0
|
| 194 |
+
)
|
| 195 |
+
self.norm1 = None
|
| 196 |
+
if layer_norm:
|
| 197 |
+
self.norm1 = nn.LayerNorm((in_channels // reduction, 1, 1))
|
| 198 |
+
self.relu = nn.ReLU()
|
| 199 |
+
self.fc2 = nn.Conv2d(
|
| 200 |
+
in_channels // reduction, num_gates, kernel_size=1, bias=True, padding=0
|
| 201 |
+
)
|
| 202 |
+
if gate_activation == "sigmoid":
|
| 203 |
+
self.gate_activation = nn.Sigmoid()
|
| 204 |
+
elif gate_activation == "relu":
|
| 205 |
+
self.gate_activation = nn.ReLU()
|
| 206 |
+
elif gate_activation == "linear":
|
| 207 |
+
self.gate_activation = None
|
| 208 |
+
else:
|
| 209 |
+
raise RuntimeError("Unknown gate activation: {}".format(gate_activation))
|
| 210 |
+
|
| 211 |
+
def forward(self, x):
|
| 212 |
+
input = x
|
| 213 |
+
x = self.global_avgpool(x)
|
| 214 |
+
x = self.fc1(x)
|
| 215 |
+
if self.norm1 is not None:
|
| 216 |
+
x = self.norm1(x)
|
| 217 |
+
x = self.relu(x)
|
| 218 |
+
x = self.fc2(x)
|
| 219 |
+
if self.gate_activation is not None:
|
| 220 |
+
x = self.gate_activation(x)
|
| 221 |
+
if self.return_gates:
|
| 222 |
+
return x
|
| 223 |
+
return input * x
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
class OSBlock(nn.Module):
|
| 227 |
+
"""Omni-scale feature learning block."""
|
| 228 |
+
|
| 229 |
+
def __init__(self, in_channels, out_channels, reduction=4, T=4, **kwargs):
|
| 230 |
+
super(OSBlock, self).__init__()
|
| 231 |
+
assert T >= 1
|
| 232 |
+
assert out_channels >= reduction and out_channels % reduction == 0
|
| 233 |
+
mid_channels = out_channels // reduction
|
| 234 |
+
|
| 235 |
+
self.conv1 = Conv1x1(in_channels, mid_channels)
|
| 236 |
+
self.conv2 = nn.ModuleList()
|
| 237 |
+
for t in range(1, T + 1):
|
| 238 |
+
self.conv2 += [LightConvStream(mid_channels, mid_channels, t)]
|
| 239 |
+
self.gate = ChannelGate(mid_channels)
|
| 240 |
+
self.conv3 = Conv1x1Linear(mid_channels, out_channels)
|
| 241 |
+
self.downsample = None
|
| 242 |
+
if in_channels != out_channels:
|
| 243 |
+
self.downsample = Conv1x1Linear(in_channels, out_channels)
|
| 244 |
+
|
| 245 |
+
def forward(self, x):
|
| 246 |
+
identity = x
|
| 247 |
+
x1 = self.conv1(x)
|
| 248 |
+
x2 = 0
|
| 249 |
+
for conv2_t in self.conv2:
|
| 250 |
+
x2_t = conv2_t(x1)
|
| 251 |
+
x2 = x2 + self.gate(x2_t)
|
| 252 |
+
x3 = self.conv3(x2)
|
| 253 |
+
if self.downsample is not None:
|
| 254 |
+
identity = self.downsample(identity)
|
| 255 |
+
out = x3 + identity
|
| 256 |
+
return F.relu(out)
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
class OSBlockINin(nn.Module):
|
| 260 |
+
"""Omni-scale feature learning block with instance normalization."""
|
| 261 |
+
|
| 262 |
+
def __init__(self, in_channels, out_channels, reduction=4, T=4, **kwargs):
|
| 263 |
+
super(OSBlockINin, self).__init__()
|
| 264 |
+
assert T >= 1
|
| 265 |
+
assert out_channels >= reduction and out_channels % reduction == 0
|
| 266 |
+
mid_channels = out_channels // reduction
|
| 267 |
+
|
| 268 |
+
self.conv1 = Conv1x1(in_channels, mid_channels)
|
| 269 |
+
self.conv2 = nn.ModuleList()
|
| 270 |
+
for t in range(1, T + 1):
|
| 271 |
+
self.conv2 += [LightConvStream(mid_channels, mid_channels, t)]
|
| 272 |
+
self.gate = ChannelGate(mid_channels)
|
| 273 |
+
self.conv3 = Conv1x1Linear(mid_channels, out_channels, bn=False)
|
| 274 |
+
self.downsample = None
|
| 275 |
+
if in_channels != out_channels:
|
| 276 |
+
self.downsample = Conv1x1Linear(in_channels, out_channels)
|
| 277 |
+
self.IN = nn.InstanceNorm2d(out_channels, affine=True)
|
| 278 |
+
|
| 279 |
+
def forward(self, x):
|
| 280 |
+
identity = x
|
| 281 |
+
x1 = self.conv1(x)
|
| 282 |
+
x2 = 0
|
| 283 |
+
for conv2_t in self.conv2:
|
| 284 |
+
x2_t = conv2_t(x1)
|
| 285 |
+
x2 = x2 + self.gate(x2_t)
|
| 286 |
+
x3 = self.conv3(x2)
|
| 287 |
+
x3 = self.IN(x3) # IN inside residual
|
| 288 |
+
if self.downsample is not None:
|
| 289 |
+
identity = self.downsample(identity)
|
| 290 |
+
out = x3 + identity
|
| 291 |
+
return F.relu(out)
|
| 292 |
+
|
| 293 |
+
|
| 294 |
+
##########
|
| 295 |
+
# Network architecture
|
| 296 |
+
##########
|
| 297 |
+
class OSNet(nn.Module):
|
| 298 |
+
"""Omni-Scale Network.
|
| 299 |
+
|
| 300 |
+
Reference:
|
| 301 |
+
- Zhou et al. Omni-Scale Feature Learning for Person Re-Identification. ICCV, 2019.
|
| 302 |
+
- Zhou et al. Learning Generalisable Omni-Scale Representations
|
| 303 |
+
for Person Re-Identification. TPAMI, 2021.
|
| 304 |
+
"""
|
| 305 |
+
|
| 306 |
+
def __init__(
|
| 307 |
+
self,
|
| 308 |
+
num_classes,
|
| 309 |
+
blocks,
|
| 310 |
+
layers,
|
| 311 |
+
channels,
|
| 312 |
+
feature_dim=512,
|
| 313 |
+
loss="softmax",
|
| 314 |
+
conv1_IN=False,
|
| 315 |
+
**kwargs
|
| 316 |
+
):
|
| 317 |
+
super(OSNet, self).__init__()
|
| 318 |
+
num_blocks = len(blocks)
|
| 319 |
+
assert num_blocks == len(layers)
|
| 320 |
+
assert num_blocks == len(channels) - 1
|
| 321 |
+
self.loss = loss
|
| 322 |
+
self.feature_dim = feature_dim
|
| 323 |
+
|
| 324 |
+
# convolutional backbone
|
| 325 |
+
self.conv1 = ConvLayer(3, channels[0], 7, stride=2, padding=3, IN=conv1_IN)
|
| 326 |
+
self.maxpool = nn.MaxPool2d(3, stride=2, padding=1)
|
| 327 |
+
self.conv2 = self._make_layer(blocks[0], layers[0], channels[0], channels[1])
|
| 328 |
+
self.pool2 = nn.Sequential(
|
| 329 |
+
Conv1x1(channels[1], channels[1]), nn.AvgPool2d(2, stride=2)
|
| 330 |
+
)
|
| 331 |
+
self.conv3 = self._make_layer(blocks[1], layers[1], channels[1], channels[2])
|
| 332 |
+
self.pool3 = nn.Sequential(
|
| 333 |
+
Conv1x1(channels[2], channels[2]), nn.AvgPool2d(2, stride=2)
|
| 334 |
+
)
|
| 335 |
+
self.conv4 = self._make_layer(blocks[2], layers[2], channels[2], channels[3])
|
| 336 |
+
self.conv5 = Conv1x1(channels[3], channels[3])
|
| 337 |
+
self.global_avgpool = nn.AdaptiveAvgPool2d(1)
|
| 338 |
+
# fully connected layer
|
| 339 |
+
self.fc = self._construct_fc_layer(
|
| 340 |
+
self.feature_dim, channels[3], dropout_p=None
|
| 341 |
+
)
|
| 342 |
+
# identity classification layer
|
| 343 |
+
self.classifier = nn.Linear(self.feature_dim, num_classes)
|
| 344 |
+
|
| 345 |
+
self._init_params()
|
| 346 |
+
|
| 347 |
+
def _make_layer(self, blocks, layer, in_channels, out_channels):
|
| 348 |
+
layers = []
|
| 349 |
+
layers += [blocks[0](in_channels, out_channels)]
|
| 350 |
+
for i in range(1, len(blocks)):
|
| 351 |
+
layers += [blocks[i](out_channels, out_channels)]
|
| 352 |
+
return nn.Sequential(*layers)
|
| 353 |
+
|
| 354 |
+
def _construct_fc_layer(self, fc_dims, input_dim, dropout_p=None):
|
| 355 |
+
if fc_dims is None or fc_dims < 0:
|
| 356 |
+
self.feature_dim = input_dim
|
| 357 |
+
return None
|
| 358 |
+
|
| 359 |
+
if isinstance(fc_dims, int):
|
| 360 |
+
fc_dims = [fc_dims]
|
| 361 |
+
|
| 362 |
+
layers = []
|
| 363 |
+
for dim in fc_dims:
|
| 364 |
+
layers.append(nn.Linear(input_dim, dim))
|
| 365 |
+
layers.append(nn.BatchNorm1d(dim))
|
| 366 |
+
layers.append(nn.ReLU())
|
| 367 |
+
if dropout_p is not None:
|
| 368 |
+
layers.append(nn.Dropout(p=dropout_p))
|
| 369 |
+
input_dim = dim
|
| 370 |
+
|
| 371 |
+
self.feature_dim = fc_dims[-1]
|
| 372 |
+
|
| 373 |
+
return nn.Sequential(*layers)
|
| 374 |
+
|
| 375 |
+
def _init_params(self):
|
| 376 |
+
for m in self.modules():
|
| 377 |
+
if isinstance(m, nn.Conv2d):
|
| 378 |
+
nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
|
| 379 |
+
if m.bias is not None:
|
| 380 |
+
nn.init.constant_(m.bias, 0)
|
| 381 |
+
|
| 382 |
+
elif isinstance(m, nn.BatchNorm2d):
|
| 383 |
+
nn.init.constant_(m.weight, 1)
|
| 384 |
+
nn.init.constant_(m.bias, 0)
|
| 385 |
+
|
| 386 |
+
elif isinstance(m, nn.BatchNorm1d):
|
| 387 |
+
nn.init.constant_(m.weight, 1)
|
| 388 |
+
nn.init.constant_(m.bias, 0)
|
| 389 |
+
|
| 390 |
+
elif isinstance(m, nn.InstanceNorm2d):
|
| 391 |
+
nn.init.constant_(m.weight, 1)
|
| 392 |
+
nn.init.constant_(m.bias, 0)
|
| 393 |
+
|
| 394 |
+
elif isinstance(m, nn.Linear):
|
| 395 |
+
nn.init.normal_(m.weight, 0, 0.01)
|
| 396 |
+
if m.bias is not None:
|
| 397 |
+
nn.init.constant_(m.bias, 0)
|
| 398 |
+
|
| 399 |
+
def featuremaps(self, x):
|
| 400 |
+
x = self.conv1(x)
|
| 401 |
+
x = self.maxpool(x)
|
| 402 |
+
x = self.conv2(x)
|
| 403 |
+
x = self.pool2(x)
|
| 404 |
+
x = self.conv3(x)
|
| 405 |
+
x = self.pool3(x)
|
| 406 |
+
x = self.conv4(x)
|
| 407 |
+
x = self.conv5(x)
|
| 408 |
+
return x
|
| 409 |
+
|
| 410 |
+
def forward(self, x, return_featuremaps=False):
|
| 411 |
+
x = self.featuremaps(x)
|
| 412 |
+
if return_featuremaps:
|
| 413 |
+
return x
|
| 414 |
+
v = self.global_avgpool(x)
|
| 415 |
+
v = v.view(v.size(0), -1)
|
| 416 |
+
if self.fc is not None:
|
| 417 |
+
v = self.fc(v)
|
| 418 |
+
if not self.training:
|
| 419 |
+
return v
|
| 420 |
+
y = self.classifier(v)
|
| 421 |
+
if self.loss == "softmax":
|
| 422 |
+
return y
|
| 423 |
+
elif self.loss == "triplet":
|
| 424 |
+
return y, v
|
| 425 |
+
else:
|
| 426 |
+
raise KeyError("Unsupported loss: {}".format(self.loss))
|
| 427 |
+
|
| 428 |
+
|
| 429 |
+
def init_pretrained_weights(model, key=""):
|
| 430 |
+
"""Initializes model with pretrained weights.
|
| 431 |
+
|
| 432 |
+
Layers that don't match with pretrained layers in name or size are kept unchanged.
|
| 433 |
+
"""
|
| 434 |
+
import errno
|
| 435 |
+
import os
|
| 436 |
+
from collections import OrderedDict
|
| 437 |
+
|
| 438 |
+
import gdown
|
| 439 |
+
|
| 440 |
+
def _get_torch_home():
|
| 441 |
+
ENV_TORCH_HOME = "TORCH_HOME"
|
| 442 |
+
ENV_XDG_CACHE_HOME = "XDG_CACHE_HOME"
|
| 443 |
+
DEFAULT_CACHE_DIR = "~/.cache"
|
| 444 |
+
torch_home = os.path.expanduser(
|
| 445 |
+
os.getenv(
|
| 446 |
+
ENV_TORCH_HOME,
|
| 447 |
+
os.path.join(os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), "torch"),
|
| 448 |
+
)
|
| 449 |
+
)
|
| 450 |
+
return torch_home
|
| 451 |
+
|
| 452 |
+
torch_home = _get_torch_home()
|
| 453 |
+
model_dir = os.path.join(torch_home, "checkpoints")
|
| 454 |
+
try:
|
| 455 |
+
os.makedirs(model_dir)
|
| 456 |
+
except OSError as e:
|
| 457 |
+
if e.errno == errno.EEXIST:
|
| 458 |
+
# Directory already exists, ignore.
|
| 459 |
+
pass
|
| 460 |
+
else:
|
| 461 |
+
# Unexpected OSError, re-raise.
|
| 462 |
+
raise
|
| 463 |
+
filename = key + "_imagenet.pth"
|
| 464 |
+
cached_file = os.path.join(model_dir, filename)
|
| 465 |
+
|
| 466 |
+
if not os.path.exists(cached_file):
|
| 467 |
+
gdown.download(pretrained_urls[key], cached_file, quiet=False)
|
| 468 |
+
|
| 469 |
+
state_dict = torch.load(cached_file)
|
| 470 |
+
model_dict = model.state_dict()
|
| 471 |
+
new_state_dict = OrderedDict()
|
| 472 |
+
matched_layers, discarded_layers = [], []
|
| 473 |
+
|
| 474 |
+
for k, v in state_dict.items():
|
| 475 |
+
if k.startswith("module."):
|
| 476 |
+
k = k[7:] # discard module.
|
| 477 |
+
|
| 478 |
+
if k in model_dict and model_dict[k].size() == v.size():
|
| 479 |
+
new_state_dict[k] = v
|
| 480 |
+
matched_layers.append(k)
|
| 481 |
+
else:
|
| 482 |
+
discarded_layers.append(k)
|
| 483 |
+
|
| 484 |
+
model_dict.update(new_state_dict)
|
| 485 |
+
model.load_state_dict(model_dict)
|
| 486 |
+
|
| 487 |
+
if len(matched_layers) == 0:
|
| 488 |
+
warnings.warn(
|
| 489 |
+
'The pretrained weights from "{}" cannot be loaded, '
|
| 490 |
+
"please check the key names manually "
|
| 491 |
+
"(** ignored and continue **)".format(cached_file)
|
| 492 |
+
)
|
| 493 |
+
else:
|
| 494 |
+
print(
|
| 495 |
+
'Successfully loaded imagenet pretrained weights from "{}"'.format(
|
| 496 |
+
cached_file
|
| 497 |
+
)
|
| 498 |
+
)
|
| 499 |
+
if len(discarded_layers) > 0:
|
| 500 |
+
print(
|
| 501 |
+
"** The following layers are discarded "
|
| 502 |
+
"due to unmatched keys or layer size: {}".format(discarded_layers)
|
| 503 |
+
)
|
| 504 |
+
|
| 505 |
+
|
| 506 |
+
##########
|
| 507 |
+
# Instantiation
|
| 508 |
+
##########
|
| 509 |
+
def osnet_ain_x1_0(num_classes=1000, pretrained=True, loss="softmax", **kwargs):
|
| 510 |
+
model = OSNet(
|
| 511 |
+
num_classes,
|
| 512 |
+
blocks=[
|
| 513 |
+
[OSBlockINin, OSBlockINin],
|
| 514 |
+
[OSBlock, OSBlockINin],
|
| 515 |
+
[OSBlockINin, OSBlock],
|
| 516 |
+
],
|
| 517 |
+
layers=[2, 2, 2],
|
| 518 |
+
channels=[64, 256, 384, 512],
|
| 519 |
+
loss=loss,
|
| 520 |
+
conv1_IN=True,
|
| 521 |
+
**kwargs
|
| 522 |
+
)
|
| 523 |
+
if pretrained:
|
| 524 |
+
init_pretrained_weights(model, key="osnet_ain_x1_0")
|
| 525 |
+
return model
|
| 526 |
+
|
| 527 |
+
|
| 528 |
+
def osnet_ain_x0_75(num_classes=1000, pretrained=True, loss="softmax", **kwargs):
|
| 529 |
+
model = OSNet(
|
| 530 |
+
num_classes,
|
| 531 |
+
blocks=[
|
| 532 |
+
[OSBlockINin, OSBlockINin],
|
| 533 |
+
[OSBlock, OSBlockINin],
|
| 534 |
+
[OSBlockINin, OSBlock],
|
| 535 |
+
],
|
| 536 |
+
layers=[2, 2, 2],
|
| 537 |
+
channels=[48, 192, 288, 384],
|
| 538 |
+
loss=loss,
|
| 539 |
+
conv1_IN=True,
|
| 540 |
+
**kwargs
|
| 541 |
+
)
|
| 542 |
+
if pretrained:
|
| 543 |
+
init_pretrained_weights(model, key="osnet_ain_x0_75")
|
| 544 |
+
return model
|
| 545 |
+
|
| 546 |
+
|
| 547 |
+
def osnet_ain_x0_5(num_classes=1000, pretrained=True, loss="softmax", **kwargs):
|
| 548 |
+
model = OSNet(
|
| 549 |
+
num_classes,
|
| 550 |
+
blocks=[
|
| 551 |
+
[OSBlockINin, OSBlockINin],
|
| 552 |
+
[OSBlock, OSBlockINin],
|
| 553 |
+
[OSBlockINin, OSBlock],
|
| 554 |
+
],
|
| 555 |
+
layers=[2, 2, 2],
|
| 556 |
+
channels=[32, 128, 192, 256],
|
| 557 |
+
loss=loss,
|
| 558 |
+
conv1_IN=True,
|
| 559 |
+
**kwargs
|
| 560 |
+
)
|
| 561 |
+
if pretrained:
|
| 562 |
+
init_pretrained_weights(model, key="osnet_ain_x0_5")
|
| 563 |
+
return model
|
| 564 |
+
|
| 565 |
+
|
| 566 |
+
def osnet_ain_x0_25(num_classes=1000, pretrained=True, loss="softmax", **kwargs):
|
| 567 |
+
model = OSNet(
|
| 568 |
+
num_classes,
|
| 569 |
+
blocks=[
|
| 570 |
+
[OSBlockINin, OSBlockINin],
|
| 571 |
+
[OSBlock, OSBlockINin],
|
| 572 |
+
[OSBlockINin, OSBlock],
|
| 573 |
+
],
|
| 574 |
+
layers=[2, 2, 2],
|
| 575 |
+
channels=[16, 64, 96, 128],
|
| 576 |
+
loss=loss,
|
| 577 |
+
conv1_IN=True,
|
| 578 |
+
**kwargs
|
| 579 |
+
)
|
| 580 |
+
if pretrained:
|
| 581 |
+
init_pretrained_weights(model, key="osnet_ain_x0_25")
|
| 582 |
+
return model
|
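A similar hypothetical sketch for the AIN variants above (not part of the committed file). Each factory passes per-stage lists mixing OSBlock and OSBlockINin, but the eval-time output is still the 512-d feature vector; import path and input size are assumptions.

import torch
from boxmot.appearance.backbones.osnet_ain import osnet_ain_x0_25

model = osnet_ain_x0_25(num_classes=1000, pretrained=False)
model.eval()
with torch.no_grad():
    feats = model(torch.randn(2, 3, 256, 128))  # crop size is an assumption
print(feats.shape)  # torch.Size([2, 512]); feature_dim defaults to 512 for every width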
boxmot/appearance/backbones/resnet.py
ADDED
|
@@ -0,0 +1,517 @@
# Mikel Broström 🔥 Yolo Tracking 🧾 AGPL-3.0 license

"""
Code source: https://github.com/pytorch/vision
"""
from __future__ import absolute_import, division

import torch.utils.model_zoo as model_zoo
from torch import nn

__all__ = [
    "resnet18",
    "resnet34",
    "resnet50",
    "resnet101",
    "resnet152",
    "resnext50_32x4d",
    "resnext101_32x8d",
    "resnet50_fc512",
]

model_urls = {
    "resnet18": "https://download.pytorch.org/models/resnet18-5c106cde.pth",
    "resnet34": "https://download.pytorch.org/models/resnet34-333f7ec4.pth",
    "resnet50": "https://download.pytorch.org/models/resnet50-19c8e357.pth",
    "resnet101": "https://download.pytorch.org/models/resnet101-5d3b4d8f.pth",
    "resnet152": "https://download.pytorch.org/models/resnet152-b121ed2d.pth",
    "resnext50_32x4d": "https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth",
    "resnext101_32x8d": "https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth",
}


| 33 |
+
def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
|
| 34 |
+
"""3x3 convolution with padding"""
|
| 35 |
+
return nn.Conv2d(
|
| 36 |
+
in_planes,
|
| 37 |
+
out_planes,
|
| 38 |
+
kernel_size=3,
|
| 39 |
+
stride=stride,
|
| 40 |
+
padding=dilation,
|
| 41 |
+
groups=groups,
|
| 42 |
+
bias=False,
|
| 43 |
+
dilation=dilation,
|
| 44 |
+
)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def conv1x1(in_planes, out_planes, stride=1):
|
| 48 |
+
"""1x1 convolution"""
|
| 49 |
+
return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
class BasicBlock(nn.Module):
|
| 53 |
+
expansion = 1
|
| 54 |
+
|
| 55 |
+
def __init__(
|
| 56 |
+
self,
|
| 57 |
+
inplanes,
|
| 58 |
+
planes,
|
| 59 |
+
stride=1,
|
| 60 |
+
downsample=None,
|
| 61 |
+
groups=1,
|
| 62 |
+
base_width=64,
|
| 63 |
+
dilation=1,
|
| 64 |
+
norm_layer=None,
|
| 65 |
+
):
|
| 66 |
+
super(BasicBlock, self).__init__()
|
| 67 |
+
if norm_layer is None:
|
| 68 |
+
norm_layer = nn.BatchNorm2d
|
| 69 |
+
if groups != 1 or base_width != 64:
|
| 70 |
+
raise ValueError("BasicBlock only supports groups=1 and base_width=64")
|
| 71 |
+
if dilation > 1:
|
| 72 |
+
raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
|
| 73 |
+
# Both self.conv1 and self.downsample layers downsample the input when stride != 1
|
| 74 |
+
self.conv1 = conv3x3(inplanes, planes, stride)
|
| 75 |
+
self.bn1 = norm_layer(planes)
|
| 76 |
+
self.relu = nn.ReLU(inplace=True)
|
| 77 |
+
self.conv2 = conv3x3(planes, planes)
|
| 78 |
+
self.bn2 = norm_layer(planes)
|
| 79 |
+
self.downsample = downsample
|
| 80 |
+
self.stride = stride
|
| 81 |
+
|
| 82 |
+
def forward(self, x):
|
| 83 |
+
identity = x
|
| 84 |
+
|
| 85 |
+
out = self.conv1(x)
|
| 86 |
+
out = self.bn1(out)
|
| 87 |
+
out = self.relu(out)
|
| 88 |
+
|
| 89 |
+
out = self.conv2(out)
|
| 90 |
+
out = self.bn2(out)
|
| 91 |
+
|
| 92 |
+
if self.downsample is not None:
|
| 93 |
+
identity = self.downsample(x)
|
| 94 |
+
|
| 95 |
+
out += identity
|
| 96 |
+
out = self.relu(out)
|
| 97 |
+
|
| 98 |
+
return out
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
class Bottleneck(nn.Module):
|
| 102 |
+
expansion = 4
|
| 103 |
+
|
| 104 |
+
def __init__(
|
| 105 |
+
self,
|
| 106 |
+
inplanes,
|
| 107 |
+
planes,
|
| 108 |
+
stride=1,
|
| 109 |
+
downsample=None,
|
| 110 |
+
groups=1,
|
| 111 |
+
base_width=64,
|
| 112 |
+
dilation=1,
|
| 113 |
+
norm_layer=None,
|
| 114 |
+
):
|
| 115 |
+
super(Bottleneck, self).__init__()
|
| 116 |
+
if norm_layer is None:
|
| 117 |
+
norm_layer = nn.BatchNorm2d
|
| 118 |
+
width = int(planes * (base_width / 64.0)) * groups
|
| 119 |
+
# Both self.conv2 and self.downsample layers downsample the input when stride != 1
|
| 120 |
+
self.conv1 = conv1x1(inplanes, width)
|
| 121 |
+
self.bn1 = norm_layer(width)
|
| 122 |
+
self.conv2 = conv3x3(width, width, stride, groups, dilation)
|
| 123 |
+
self.bn2 = norm_layer(width)
|
| 124 |
+
self.conv3 = conv1x1(width, planes * self.expansion)
|
| 125 |
+
self.bn3 = norm_layer(planes * self.expansion)
|
| 126 |
+
self.relu = nn.ReLU(inplace=True)
|
| 127 |
+
self.downsample = downsample
|
| 128 |
+
self.stride = stride
|
| 129 |
+
|
| 130 |
+
def forward(self, x):
|
| 131 |
+
identity = x
|
| 132 |
+
|
| 133 |
+
out = self.conv1(x)
|
| 134 |
+
out = self.bn1(out)
|
| 135 |
+
out = self.relu(out)
|
| 136 |
+
|
| 137 |
+
out = self.conv2(out)
|
| 138 |
+
out = self.bn2(out)
|
| 139 |
+
out = self.relu(out)
|
| 140 |
+
|
| 141 |
+
out = self.conv3(out)
|
| 142 |
+
out = self.bn3(out)
|
| 143 |
+
|
| 144 |
+
if self.downsample is not None:
|
| 145 |
+
identity = self.downsample(x)
|
| 146 |
+
|
| 147 |
+
out += identity
|
| 148 |
+
out = self.relu(out)
|
| 149 |
+
|
| 150 |
+
return out
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
class ResNet(nn.Module):
|
| 154 |
+
"""Residual network.
|
| 155 |
+
|
| 156 |
+
Reference:
|
| 157 |
+
- He et al. Deep Residual Learning for Image Recognition. CVPR 2016.
|
| 158 |
+
- Xie et al. Aggregated Residual Transformations for Deep Neural Networks. CVPR 2017.
|
| 159 |
+
|
| 160 |
+
Public keys:
|
| 161 |
+
- ``resnet18``: ResNet18.
|
| 162 |
+
- ``resnet34``: ResNet34.
|
| 163 |
+
- ``resnet50``: ResNet50.
|
| 164 |
+
- ``resnet101``: ResNet101.
|
| 165 |
+
- ``resnet152``: ResNet152.
|
| 166 |
+
- ``resnext50_32x4d``: ResNeXt50.
|
| 167 |
+
- ``resnext101_32x8d``: ResNeXt101.
|
| 168 |
+
- ``resnet50_fc512``: ResNet50 + FC.
|
| 169 |
+
"""
|
| 170 |
+
|
| 171 |
+
def __init__(
|
| 172 |
+
self,
|
| 173 |
+
num_classes,
|
| 174 |
+
loss,
|
| 175 |
+
block,
|
| 176 |
+
layers,
|
| 177 |
+
zero_init_residual=False,
|
| 178 |
+
groups=1,
|
| 179 |
+
width_per_group=64,
|
| 180 |
+
replace_stride_with_dilation=None,
|
| 181 |
+
norm_layer=None,
|
| 182 |
+
last_stride=2,
|
| 183 |
+
fc_dims=None,
|
| 184 |
+
dropout_p=None,
|
| 185 |
+
**kwargs
|
| 186 |
+
):
|
| 187 |
+
super(ResNet, self).__init__()
|
| 188 |
+
if norm_layer is None:
|
| 189 |
+
norm_layer = nn.BatchNorm2d
|
| 190 |
+
self._norm_layer = norm_layer
|
| 191 |
+
self.loss = loss
|
| 192 |
+
self.feature_dim = 512 * block.expansion
|
| 193 |
+
self.inplanes = 64
|
| 194 |
+
self.dilation = 1
|
| 195 |
+
if replace_stride_with_dilation is None:
|
| 196 |
+
# each element in the tuple indicates if we should replace
|
| 197 |
+
# the 2x2 stride with a dilated convolution instead
|
| 198 |
+
replace_stride_with_dilation = [False, False, False]
|
| 199 |
+
if len(replace_stride_with_dilation) != 3:
|
| 200 |
+
raise ValueError(
|
| 201 |
+
"replace_stride_with_dilation should be None "
|
| 202 |
+
"or a 3-element tuple, got {}".format(replace_stride_with_dilation)
|
| 203 |
+
)
|
| 204 |
+
self.groups = groups
|
| 205 |
+
self.base_width = width_per_group
|
| 206 |
+
self.conv1 = nn.Conv2d(
|
| 207 |
+
3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False
|
| 208 |
+
)
|
| 209 |
+
self.bn1 = norm_layer(self.inplanes)
|
| 210 |
+
self.relu = nn.ReLU(inplace=True)
|
| 211 |
+
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
|
| 212 |
+
self.layer1 = self._make_layer(block, 64, layers[0])
|
| 213 |
+
self.layer2 = self._make_layer(
|
| 214 |
+
block, 128, layers[1], stride=2, dilate=replace_stride_with_dilation[0]
|
| 215 |
+
)
|
| 216 |
+
self.layer3 = self._make_layer(
|
| 217 |
+
block, 256, layers[2], stride=2, dilate=replace_stride_with_dilation[1]
|
| 218 |
+
)
|
| 219 |
+
self.layer4 = self._make_layer(
|
| 220 |
+
block,
|
| 221 |
+
512,
|
| 222 |
+
layers[3],
|
| 223 |
+
stride=last_stride,
|
| 224 |
+
dilate=replace_stride_with_dilation[2],
|
| 225 |
+
)
|
| 226 |
+
self.global_avgpool = nn.AdaptiveAvgPool2d((1, 1))
|
| 227 |
+
self.fc = self._construct_fc_layer(fc_dims, 512 * block.expansion, dropout_p)
|
| 228 |
+
self.classifier = nn.Linear(self.feature_dim, num_classes)
|
| 229 |
+
|
| 230 |
+
self._init_params()
|
| 231 |
+
|
| 232 |
+
# Zero-initialize the last BN in each residual branch,
|
| 233 |
+
# so that the residual branch starts with zeros, and each residual block behaves like an identity.
|
| 234 |
+
# This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
|
| 235 |
+
if zero_init_residual:
|
| 236 |
+
for m in self.modules():
|
| 237 |
+
if isinstance(m, Bottleneck):
|
| 238 |
+
nn.init.constant_(m.bn3.weight, 0)
|
| 239 |
+
elif isinstance(m, BasicBlock):
|
| 240 |
+
nn.init.constant_(m.bn2.weight, 0)
|
| 241 |
+
|
| 242 |
+
def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
|
| 243 |
+
norm_layer = self._norm_layer
|
| 244 |
+
downsample = None
|
| 245 |
+
previous_dilation = self.dilation
|
| 246 |
+
if dilate:
|
| 247 |
+
self.dilation *= stride
|
| 248 |
+
stride = 1
|
| 249 |
+
if stride != 1 or self.inplanes != planes * block.expansion:
|
| 250 |
+
downsample = nn.Sequential(
|
| 251 |
+
conv1x1(self.inplanes, planes * block.expansion, stride),
|
| 252 |
+
norm_layer(planes * block.expansion),
|
| 253 |
+
)
|
| 254 |
+
|
| 255 |
+
layers = []
|
| 256 |
+
layers.append(
|
| 257 |
+
block(
|
| 258 |
+
self.inplanes,
|
| 259 |
+
planes,
                    stride,
                    downsample,
                    self.groups,
                    self.base_width,
                    previous_dilation,
                    norm_layer,
                )
            )
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(
                block(
                    self.inplanes,
                    planes,
                    groups=self.groups,
                    base_width=self.base_width,
                    dilation=self.dilation,
                    norm_layer=norm_layer,
                )
            )

        return nn.Sequential(*layers)

    def _construct_fc_layer(self, fc_dims, input_dim, dropout_p=None):
        """Constructs fully connected layer

        Args:
            fc_dims (list or tuple): dimensions of fc layers, if None, no fc layers are constructed
            input_dim (int): input dimension
            dropout_p (float): dropout probability, if None, dropout is unused
        """
        if fc_dims is None:
            self.feature_dim = input_dim
            return None

        assert isinstance(
            fc_dims, (list, tuple)
        ), "fc_dims must be either list or tuple, but got {}".format(type(fc_dims))

        layers = []
        for dim in fc_dims:
            layers.append(nn.Linear(input_dim, dim))
            layers.append(nn.BatchNorm1d(dim))
            layers.append(nn.ReLU(inplace=True))
            if dropout_p is not None:
                layers.append(nn.Dropout(p=dropout_p))
            input_dim = dim

        self.feature_dim = fc_dims[-1]

        return nn.Sequential(*layers)

    def _init_params(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm1d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def featuremaps(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        return x

    def forward(self, x):
        f = self.featuremaps(x)
        v = self.global_avgpool(f)
        v = v.view(v.size(0), -1)

        if self.fc is not None:
            v = self.fc(v)

        if not self.training:
            return v

        y = self.classifier(v)

        if self.loss == "softmax":
            return y
        elif self.loss == "triplet":
            return y, v
        else:
            raise KeyError("Unsupported loss: {}".format(self.loss))


def init_pretrained_weights(model, model_url):
    """Initializes model with pretrained weights.

    Layers that don't match with pretrained layers in name or size are kept unchanged.
    """
    pretrain_dict = model_zoo.load_url(model_url)
    model_dict = model.state_dict()
    pretrain_dict = {
        k: v
        for k, v in pretrain_dict.items()
        if k in model_dict and model_dict[k].size() == v.size()
    }
    model_dict.update(pretrain_dict)
    model.load_state_dict(model_dict)


"""ResNet"""


def resnet18(num_classes, loss="softmax", pretrained=True, **kwargs):
    model = ResNet(
        num_classes=num_classes,
        loss=loss,
        block=BasicBlock,
        layers=[2, 2, 2, 2],
        last_stride=2,
        fc_dims=None,
        dropout_p=None,
        **kwargs
    )
    if pretrained:
        init_pretrained_weights(model, model_urls["resnet18"])
    return model


def resnet34(num_classes, loss="softmax", pretrained=True, **kwargs):
    model = ResNet(
        num_classes=num_classes,
        loss=loss,
        block=BasicBlock,
        layers=[3, 4, 6, 3],
        last_stride=2,
        fc_dims=None,
        dropout_p=None,
        **kwargs
    )
    if pretrained:
        init_pretrained_weights(model, model_urls["resnet34"])
    return model


def resnet50(num_classes, loss="softmax", pretrained=True, **kwargs):
    model = ResNet(
        num_classes=num_classes,
        loss=loss,
        block=Bottleneck,
        layers=[3, 4, 6, 3],
        last_stride=2,
        fc_dims=None,
        dropout_p=None,
        **kwargs
    )
    if pretrained:
        init_pretrained_weights(model, model_urls["resnet50"])
    return model


def resnet101(num_classes, loss="softmax", pretrained=True, **kwargs):
    model = ResNet(
        num_classes=num_classes,
        loss=loss,
        block=Bottleneck,
        layers=[3, 4, 23, 3],
        last_stride=2,
        fc_dims=None,
        dropout_p=None,
        **kwargs
    )
    if pretrained:
        init_pretrained_weights(model, model_urls["resnet101"])
    return model


def resnet152(num_classes, loss="softmax", pretrained=True, **kwargs):
    model = ResNet(
        num_classes=num_classes,
        loss=loss,
        block=Bottleneck,
        layers=[3, 8, 36, 3],
        last_stride=2,
        fc_dims=None,
        dropout_p=None,
        **kwargs
    )
    if pretrained:
        init_pretrained_weights(model, model_urls["resnet152"])
    return model


"""ResNeXt"""


def resnext50_32x4d(num_classes, loss="softmax", pretrained=True, **kwargs):
    model = ResNet(
        num_classes=num_classes,
        loss=loss,
        block=Bottleneck,
        layers=[3, 4, 6, 3],
        last_stride=2,
        fc_dims=None,
        dropout_p=None,
        groups=32,
        width_per_group=4,
        **kwargs
    )
    if pretrained:
        init_pretrained_weights(model, model_urls["resnext50_32x4d"])
    return model


def resnext101_32x8d(num_classes, loss="softmax", pretrained=True, **kwargs):
    model = ResNet(
        num_classes=num_classes,
        loss=loss,
        block=Bottleneck,
        layers=[3, 4, 23, 3],
        last_stride=2,
        fc_dims=None,
        dropout_p=None,
        groups=32,
        width_per_group=8,
        **kwargs
    )
    if pretrained:
        init_pretrained_weights(model, model_urls["resnext101_32x8d"])
    return model


"""
ResNet + FC
"""


def resnet50_fc512(num_classes, loss="softmax", pretrained=True, **kwargs):
    model = ResNet(
        num_classes=num_classes,
        loss=loss,
        block=Bottleneck,
        layers=[3, 4, 6, 3],
        last_stride=1,
        fc_dims=[512],
        dropout_p=None,
        **kwargs
    )
    if pretrained:
        init_pretrained_weights(model, model_urls["resnet50"])
    return model
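As a quick sanity check of the factory functions above, a minimal usage sketch (assuming this file is importable as boxmot.appearance.backbones.resnet; the class count and input size are illustrative):

# Minimal sketch, not part of the diff: build the fc512 variant and extract features.
import torch
from boxmot.appearance.backbones.resnet import resnet50_fc512  # assumed import path

model = resnet50_fc512(num_classes=751, loss="softmax", pretrained=False)
model.eval()  # in eval mode forward() returns the pooled feature vector, not logits
with torch.no_grad():
    feats = model(torch.randn(4, 3, 256, 128))  # ReID crops are 256x128 (HxW)
print(feats.shape)  # torch.Size([4, 512]) since fc_dims=[512]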
boxmot/appearance/backends/base_backend.py
ADDED
|
@@ -0,0 +1,135 @@
import cv2
import torch
import gdown
import numpy as np
from abc import ABC, abstractmethod
from boxmot.utils import logger as LOGGER
from boxmot.appearance.reid.registry import ReIDModelRegistry
from boxmot.utils.checks import RequirementsChecker


class BaseModelBackend:
    def __init__(self, weights, device, half):
        self.weights = weights[0] if isinstance(weights, list) else weights
        self.device = device
        self.half = half
        self.model = None
        self.cuda = torch.cuda.is_available() and self.device.type != "cpu"

        self.download_model(self.weights)
        self.model_name = ReIDModelRegistry.get_model_name(self.weights)

        self.model = ReIDModelRegistry.build_model(
            self.model_name,
            num_classes=ReIDModelRegistry.get_nr_classes(self.weights),
            pretrained=not (self.weights and self.weights.is_file()),
            use_gpu=device,
        )
        self.checker = RequirementsChecker()
        self.load_model(self.weights)

    def get_crops(self, xyxys, img):
        h, w = img.shape[:2]
        resize_dims = (128, 256)
        interpolation_method = cv2.INTER_LINEAR
        mean_array = torch.tensor([0.485, 0.456, 0.406], device=self.device).view(1, 3, 1, 1)
        std_array = torch.tensor([0.229, 0.224, 0.225], device=self.device).view(1, 3, 1, 1)

        # Preallocate tensor for crops
        num_crops = len(xyxys)
        crops = torch.empty((num_crops, 3, resize_dims[1], resize_dims[0]),
                            dtype=torch.half if self.half else torch.float, device=self.device)

        for i, box in enumerate(xyxys):
            x1, y1, x2, y2 = box.round().astype('int')
            x1, y1, x2, y2 = max(0, x1), max(0, y1), min(w, x2), min(h, y2)
            crop = img[y1:y2, x1:x2]

            # Resize and convert color in one step
            crop = cv2.resize(crop, resize_dims, interpolation=interpolation_method)
            crop = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)

            # Convert to tensor and normalize (convert to [0, 1] by dividing by 255 in batch later)
            crop = torch.from_numpy(crop).to(self.device, dtype=torch.half if self.half else torch.float)
            crops[i] = torch.permute(crop, (2, 0, 1))  # Change to (C, H, W)

        # Normalize the entire batch in one go
        crops = crops / 255.0

        # Standardize the batch
        crops = (crops - mean_array) / std_array

        return crops

    @torch.no_grad()
    def get_features(self, xyxys, img):
        if xyxys.size != 0:
            crops = self.get_crops(xyxys, img)
            crops = self.inference_preprocess(crops)
            features = self.forward(crops)
            features = self.inference_postprocess(features)
        else:
            features = np.array([])
        features = features / np.linalg.norm(features, axis=-1, keepdims=True)
        return features

    def warmup(self, imgsz=[(256, 128, 3)]):
        # warmup model by running inference once
        if self.device.type != "cpu":
            im = np.random.randint(0, 255, *imgsz, dtype=np.uint8)
            crops = self.get_crops(xyxys=np.array(
                [[0, 0, 64, 64], [0, 0, 128, 128]]),
                img=im
            )
            crops = self.inference_preprocess(crops)
            self.forward(crops)  # warmup

    def to_numpy(self, x):
        return x.cpu().numpy() if isinstance(x, torch.Tensor) else x

    def inference_preprocess(self, x):
        if self.half:
            if isinstance(x, torch.Tensor):
                if x.dtype != torch.float16:
                    x = x.half()
            elif isinstance(x, np.ndarray):
                if x.dtype != np.float16:
                    x = x.astype(np.float16)

        if self.nhwc:
            if isinstance(x, torch.Tensor):
                x = x.permute(0, 2, 3, 1)  # Convert from NCHW to NHWC
            elif isinstance(x, np.ndarray):
                x = np.transpose(x, (0, 2, 3, 1))  # Convert from NCHW to NHWC
        return x

    def inference_postprocess(self, features):
        if isinstance(features, (list, tuple)):
            return (
                self.to_numpy(features[0]) if len(features) == 1 else [self.to_numpy(x) for x in features]
            )
        else:
            return self.to_numpy(features)

    @abstractmethod
    def forward(self, im_batch):
        raise NotImplementedError("This method should be implemented by subclasses.")

    @abstractmethod
    def load_model(self, w):
        raise NotImplementedError("This method should be implemented by subclasses.")

    def download_model(self, w):
        if w.suffix == ".pt":
            model_url = ReIDModelRegistry.get_model_url(w)
            if not w.exists() and model_url is not None:
                gdown.download(model_url, str(w), quiet=False)
            elif not w.exists():
                LOGGER.error(
                    f"No URL associated with the chosen StrongSORT weights ({w}). Choose between:"
                )
                ReIDModelRegistry.show_downloadable_models()
                exit()
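A hedged sketch of how a concrete subclass of this base backend is typically driven (the file name, frame and boxes below are illustrative; download_model fetches the weights if a URL is registered for them):

# Illustrative only: run dummy detections through the PyTorch backend.
from pathlib import Path
import numpy as np
import torch
from boxmot.appearance.backends.pytorch_backend import PyTorchBackend

backend = PyTorchBackend(Path("osnet_x0_25_msmt17.pt"), torch.device("cpu"), half=False)
frame = np.random.randint(0, 255, (720, 1280, 3), dtype=np.uint8)  # BGR image
boxes = np.array([[100, 100, 200, 300], [400, 150, 480, 360]])     # xyxy detections
embs = backend.get_features(boxes, frame)  # L2-normalised appearance embeddings
print(embs.shape)  # (2, embedding_dim)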
boxmot/appearance/backends/onnx_backend.py
ADDED
|
@@ -0,0 +1,42 @@
import numpy as np
from pathlib import Path

from boxmot.appearance.backends.base_backend import BaseModelBackend


class ONNXBackend(BaseModelBackend):

    def __init__(self, weights, device, half):
        super().__init__(weights, device, half)
        self.nhwc = False
        self.half = half

    def load_model(self, w):
        # ONNXRuntime will attempt to use the first provider, and if it fails or is not
        # available for some reason, it will fall back to the next provider in the list
        if self.device == "mps":
            self.checker.check_packages(("onnxruntime-silicon==1.17.0",))
            providers = ["MPSExecutionProvider", "CPUExecutionProvider"]
        elif self.device == "cuda":
            self.checker.check_packages(("onnxruntime-gpu==1.17.0",))
            providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
        else:
            self.checker.check_packages(("onnxruntime==1.17.0",))
            providers = ["CPUExecutionProvider"]

        # Load the ONNX model using onnxruntime
        import onnxruntime
        self.session = onnxruntime.InferenceSession(str(w), providers=providers)

    def forward(self, im_batch):
        # Convert torch tensor to numpy (onnxruntime expects numpy arrays)
        im_batch = im_batch.cpu().numpy()

        # Run inference using ONNX session
        features = self.session.run(
            [self.session.get_outputs()[0].name],
            {self.session.get_inputs()[0].name: im_batch},
        )[0]

        return features
boxmot/appearance/backends/openvino_backend.py
ADDED
|
@@ -0,0 +1,44 @@
import numpy as np
from pathlib import Path
from boxmot.utils import logger as LOGGER

from boxmot.appearance.backends.base_backend import BaseModelBackend


class OpenVinoBackend(BaseModelBackend):

    def __init__(self, weights, device, half):
        super().__init__(weights, device, half)
        self.nhwc = False
        self.half = half

    def load_model(self, w):
        self.checker.check_packages(("openvino-dev>=2022.3",))

        LOGGER.info(f"Loading {w} for OpenVINO inference...")
        try:
            # requires openvino-dev: https://pypi.org/project/openvino-dev/
            from openvino.runtime import Core, Layout
        except ImportError:
            LOGGER.error(
                f"Running {self.__class__} with the specified OpenVINO weights\n{w.name}\n"
                "requires openvino pip package to be installed!\n"
                "$ pip install openvino-dev>=2022.3\n"
            )
        ie = Core()
        if not Path(w).is_file():  # if not *.xml
            w = next(
                Path(w).glob("*.xml")
            )  # get *.xml file from *_openvino_model dir
        network = ie.read_model(model=w, weights=Path(w).with_suffix(".bin"))
        if network.get_parameters()[0].get_layout().empty:
            network.get_parameters()[0].set_layout(Layout("NCWH"))
        self.executable_network = ie.compile_model(
            network, device_name="CPU"
        )  # device_name="MYRIAD" for Intel NCS2
        self.output_layer = next(iter(self.executable_network.outputs))

    def forward(self, im_batch):
        im_batch = im_batch.cpu().numpy()  # FP32
        features = self.executable_network([im_batch])[self.output_layer]
        return features
|
boxmot/appearance/backends/pytorch_backend.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
from boxmot.appearance.backends.base_backend import BaseModelBackend
|
| 5 |
+
from boxmot.appearance.reid.registry import ReIDModelRegistry
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class PyTorchBackend(BaseModelBackend):
|
| 9 |
+
|
| 10 |
+
def __init__(self, weights, device, half):
|
| 11 |
+
super().__init__(weights, device, half)
|
| 12 |
+
self.nhwc = False
|
| 13 |
+
self.half = half
|
| 14 |
+
|
| 15 |
+
def load_model(self, w):
|
| 16 |
+
# Load a PyTorch model
|
| 17 |
+
if w and w.is_file():
|
| 18 |
+
ReIDModelRegistry.load_pretrained_weights(self.model, w)
|
| 19 |
+
self.model.to(self.device).eval()
|
| 20 |
+
self.model.half() if self.half else self.model.float()
|
| 21 |
+
|
| 22 |
+
def forward(self, im_batch):
|
| 23 |
+
features = self.model(im_batch)
|
| 24 |
+
return features
|
boxmot/appearance/backends/tensorrt_backend.py
ADDED
|
@@ -0,0 +1,126 @@
import torch
import numpy as np
from pathlib import Path
from collections import OrderedDict, namedtuple
from boxmot.utils import logger as LOGGER
from boxmot.appearance.backends.base_backend import BaseModelBackend


class TensorRTBackend(BaseModelBackend):
    def __init__(self, weights, device, half):
        self.is_trt10 = False
        super().__init__(weights, device, half)
        self.nhwc = False
        self.half = half
        self.device = device
        self.weights = weights
        self.fp16 = False  # Will be updated in load_model
        self.load_model(self.weights)

    def load_model(self, w):
        LOGGER.info(f"Loading {w} for TensorRT inference...")
        self.checker.check_packages(("nvidia-tensorrt",))
        try:
            import tensorrt as trt  # TensorRT library
        except ImportError:
            raise ImportError("Please install tensorrt to use this backend.")

        if self.device.type == "cpu":
            if torch.cuda.is_available():
                self.device = torch.device("cuda:0")
            else:
                raise ValueError("CUDA device not available for TensorRT inference.")

        Binding = namedtuple("Binding", ("name", "dtype", "shape", "data", "ptr"))
        logger = trt.Logger(trt.Logger.INFO)

        # Deserialize the engine
        with open(w, "rb") as f, trt.Runtime(logger) as runtime:
            self.model_ = runtime.deserialize_cuda_engine(f.read())

        # Execution context
        self.context = self.model_.create_execution_context()
        self.bindings = OrderedDict()

        self.is_trt10 = not hasattr(self.model_, "num_bindings")
        num = range(self.model_.num_io_tensors) if self.is_trt10 else range(self.model_.num_bindings)

        # Parse bindings
        for index in num:
            if self.is_trt10:
                name = self.model_.get_tensor_name(index)
                dtype = trt.nptype(self.model_.get_tensor_dtype(name))
                is_input = self.model_.get_tensor_mode(name) == trt.TensorIOMode.INPUT
                if is_input and -1 in tuple(self.model_.get_tensor_shape(name)):
                    self.context.set_input_shape(name, tuple(self.model_.get_tensor_profile_shape(name, 0)[1]))
                if is_input and dtype == np.float16:
                    self.fp16 = True

                shape = tuple(self.context.get_tensor_shape(name))
            else:
                name = self.model_.get_binding_name(index)
                dtype = trt.nptype(self.model_.get_binding_dtype(index))
                is_input = self.model_.binding_is_input(index)

                # Handle dynamic shapes
                if is_input and -1 in self.model_.get_binding_shape(index):
                    profile_index = 0
                    min_shape, opt_shape, max_shape = self.model_.get_profile_shape(profile_index, index)
                    self.context.set_binding_shape(index, opt_shape)

                if is_input and dtype == np.float16:
                    self.fp16 = True

                shape = tuple(self.context.get_binding_shape(index))
            data = torch.from_numpy(np.empty(shape, dtype=dtype)).to(self.device)
            self.bindings[name] = Binding(name, dtype, shape, data, int(data.data_ptr()))

        self.binding_addrs = OrderedDict((n, d.ptr) for n, d in self.bindings.items())

    def forward(self, im_batch):
        temp_im_batch = im_batch.clone()
        batch_array = []
        inp_batch = im_batch.shape[0]
        out_batch = self.bindings["output"].shape[0]
        resultant_features = []

        # Divide batch to sub batches
        while inp_batch > out_batch:
            batch_array.append(temp_im_batch[:out_batch])
            temp_im_batch = temp_im_batch[out_batch:]
            inp_batch = temp_im_batch.shape[0]
        if temp_im_batch.shape[0] > 0:
            batch_array.append(temp_im_batch)

        for temp_batch in batch_array:
            # Adjust for dynamic shapes
            if temp_batch.shape != self.bindings["images"].shape:
                if self.is_trt10:
                    self.context.set_input_shape("images", temp_batch.shape)
                    self.bindings["images"] = self.bindings["images"]._replace(shape=temp_batch.shape)
                    self.bindings["output"].data.resize_(tuple(self.context.get_tensor_shape("output")))
                else:
                    i_in = self.model_.get_binding_index("images")
                    i_out = self.model_.get_binding_index("output")
                    self.context.set_binding_shape(i_in, temp_batch.shape)
                    self.bindings["images"] = self.bindings["images"]._replace(shape=temp_batch.shape)
                    output_shape = tuple(self.context.get_binding_shape(i_out))
                    self.bindings["output"].data.resize_(output_shape)

            s = self.bindings["images"].shape
            assert temp_batch.shape == s, f"Input size {temp_batch.shape} does not match model size {s}"

            self.binding_addrs["images"] = int(temp_batch.data_ptr())

            # Execute inference
            self.context.execute_v2(list(self.binding_addrs.values()))
            features = self.bindings["output"].data
            resultant_features.append(features.clone())

        if len(resultant_features) == 1:
            return resultant_features[0]
        else:
            rslt_features = torch.cat(resultant_features, dim=0)
            rslt_features = rslt_features[:im_batch.shape[0]]
            return rslt_features
boxmot/appearance/backends/tflite_backend.py
ADDED
|
@@ -0,0 +1,86 @@
import torch
import numpy as np
from pathlib import Path
from boxmot.utils import logger as LOGGER

from boxmot.appearance.backends.base_backend import BaseModelBackend


class TFLiteBackend(BaseModelBackend):
    """
    A class to handle TensorFlow Lite model inference with dynamic batch size support.

    Attributes:
        nhwc (bool): A flag indicating the order of dimensions.
        half (bool): A flag to indicate if half precision is used.
        interpreter (tf.lite.Interpreter): The TensorFlow Lite interpreter.
        current_allocated_batch_size (int): The current batch size allocated in the interpreter.
    """

    def __init__(self, weights: Path, device: str, half: bool):
        """
        Initializes the TFLiteBackend with given weights, device, and precision flag.

        Args:
            weights (Path): Path to the TFLite model file.
            device (str): Device type (e.g., 'cpu', 'gpu').
            half (bool): Flag to indicate if half precision is used.
        """
        super().__init__(weights, device, half)
        self.nhwc = True
        self.half = False
        # self.interpreter: tf.lite.Interpreter = None
        # self.current_allocated_batch_size: int = None

    def load_model(self, w):
        """
        Loads the TensorFlow Lite model and initializes the interpreter.

        Args:
            w (str): Path to the TFLite model file.
        """
        self.checker.check_packages(("tensorflow",))

        LOGGER.info(f"Loading {str(w)} for TensorFlow Lite inference...")

        import tensorflow as tf
        self.interpreter = tf.lite.Interpreter(model_path=str(w))

        self.interpreter.allocate_tensors()  # allocate
        self.input_details = self.interpreter.get_input_details()  # inputs
        self.output_details = self.interpreter.get_output_details()  # outputs
        self.current_allocated_batch_size = self.input_details[0]['shape'][0]

    def forward(self, im_batch: torch.Tensor) -> np.ndarray:
        """
        Runs forward pass for the given image batch through the TFLite model.

        Args:
            im_batch (torch.Tensor): Input image batch tensor.

        Returns:
            np.ndarray: Output features from the TFLite model.
        """
        im_batch = im_batch.cpu().numpy()

        # Extract batch size from im_batch
        batch_size = im_batch.shape[0]

        # Resize tensors if the new batch size is different from the current allocated batch size
        if batch_size != self.current_allocated_batch_size:
            # print(f"Resizing tensor input to batch size {batch_size}")
            self.interpreter.resize_tensor_input(self.input_details[0]['index'], [batch_size, 256, 128, 3])
            self.interpreter.allocate_tensors()
            self.current_allocated_batch_size = batch_size

        # Set the tensor to point to the input data
        self.interpreter.set_tensor(self.input_details[0]['index'], im_batch)

        # Run inference
        self.interpreter.invoke()

        # Get the output data
        features = self.interpreter.get_tensor(self.output_details[0]['index'])

        return features
boxmot/appearance/backends/torchscript_backend.py
ADDED
|
@@ -0,0 +1,24 @@
import torch
import numpy as np
from pathlib import Path
from boxmot.utils import logger as LOGGER

from boxmot.appearance.backends.base_backend import BaseModelBackend


class TorchscriptBackend(BaseModelBackend):

    def __init__(self, weights, device, half):
        super().__init__(weights, device, half)
        self.nhwc = False
        self.half = half

    def load_model(self, w):
        LOGGER.info(f"Loading {w} for TorchScript inference...")
        self.model = torch.jit.load(w)
        self.model.half() if self.half else self.model.float()

    def forward(self, im_batch):
        features = self.model(im_batch)
        return features
boxmot/appearance/exporters/base_exporter.py
ADDED
|
@@ -0,0 +1,56 @@
import logging
import torch
from pathlib import Path
from boxmot.utils.checks import RequirementsChecker
from boxmot.utils import logger as LOGGER


def export_decorator(export_func):
    def wrapper(self, *args, **kwargs):
        try:
            if hasattr(self, 'required_packages'):
                if hasattr(self, 'cmd'):
                    self.checker.check_packages(self.required_packages, cmd=self.cmd)
                else:
                    self.checker.check_packages(self.required_packages)

            LOGGER.info(f"\nStarting {self.file} export with {self.__class__.__name__}...")
            result = export_func(self, *args, **kwargs)
            if result:
                LOGGER.info(f"Export success, saved as {result} ({self.file_size(result):.1f} MB)")
            return result
        except Exception as e:
            LOGGER.error(f"Export failure: {e}")
            return None
    return wrapper


class BaseExporter:
    def __init__(self, model, im, file, optimize=False, dynamic=False, half=False, simplify=False):
        self.model = model
        self.im = im
        self.file = Path(file)
        self.optimize = optimize
        self.dynamic = dynamic
        self.half = half
        self.simplify = simplify
        self.checker = RequirementsChecker()
        self.workspace = 4

    @staticmethod
    def file_size(path):
        path = Path(path)
        if path.is_file():
            return path.stat().st_size / 1e6
        elif path.is_dir():
            return sum(f.stat().st_size for f in path.glob("**/*") if f.is_file()) / 1e6
        else:
            return 0.0

    def export(self):
        raise NotImplementedError("Export method must be implemented in subclasses.")

    def __init_subclass__(cls, **kwargs):
        super().__init_subclass__(**kwargs)
        if 'export' in cls.__dict__:
            cls.export = export_decorator(cls.export)
boxmot/appearance/exporters/onnx_exporter.py
ADDED
|
@@ -0,0 +1,56 @@
import torch
import onnx
from boxmot.appearance.exporters.base_exporter import BaseExporter
from boxmot.utils import logger as LOGGER


class ONNXExporter(BaseExporter):
    required_packages = ("onnx>=1.16.1",)

    def export(self):
        f = self.file.with_suffix(".onnx")

        dynamic = {"images": {0: "batch"}, "output": {0: "batch"}} if self.dynamic else None

        torch.onnx.export(
            self.model.cpu() if self.dynamic else self.model,
            self.im.cpu() if self.dynamic else self.im,
            f,
            verbose=False,
            opset_version=12,
            do_constant_folding=True,
            input_names=["images"],
            output_names=["output"],
            dynamic_axes=dynamic,
        )

        model_onnx = onnx.load(f)
        onnx.checker.check_model(model_onnx)
        onnx.save(model_onnx, f)

        if self.simplify:
            self.simplify_model(model_onnx, f)

        return f

    def simplify_model(self, model_onnx, f):
        try:
            cuda = torch.cuda.is_available()
            self.checker.check_packages(
                (
                    "onnxruntime-gpu" if cuda else "onnxruntime",
                    "onnx-simplifier>=0.4.1",
                )
            )
            import onnxsim

            LOGGER.info(
                f"Simplifying with onnx-simplifier {onnxsim.__version__}..."
            )
            model_onnx, check = onnxsim.simplify(model_onnx)
            assert check, "assert check failed"
            onnx.save(model_onnx, f)
        except Exception as e:
            LOGGER.error(f"Simplifier failure: {e}")
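A hedged sketch of driving this exporter directly (the model choice, dummy-input shape and file name are assumptions; export() is wrapped by the decorator in base_exporter.py, so it logs progress and returns the output path or None):

# Illustrative only: trace a ReID backbone to ONNX with a dynamic batch axis.
import torch
from boxmot.appearance.backbones.resnet import resnet50_fc512  # assumed import path
from boxmot.appearance.exporters.onnx_exporter import ONNXExporter

model = resnet50_fc512(num_classes=751, pretrained=False).eval()
dummy = torch.zeros(1, 3, 256, 128)                       # NCHW dummy crop batch
exporter = ONNXExporter(model, dummy, "resnet50_fc512.pt", dynamic=True)
onnx_path = exporter.export()                             # writes resnet50_fc512.onnx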
boxmot/appearance/exporters/openvino_exporter.py
ADDED
|
@@ -0,0 +1,26 @@
import os
from pathlib import Path
import openvino.runtime as ov
from openvino.tools import mo
from boxmot.appearance.exporters.base_exporter import BaseExporter
from boxmot.utils import logger as LOGGER


class OpenVINOExporter(BaseExporter):
    required_packages = ("openvino-dev>=2023.0",)

    def export(self):
        f = str(self.file).replace(self.file.suffix, f"_openvino_model{os.sep}")
        f_onnx = self.file.with_suffix(".onnx")
        f_ov = str(Path(f) / self.file.with_suffix(".xml").name)

        ov_model = mo.convert_model(
            f_onnx,
            model_name=self.file.with_suffix(".xml"),
            framework="onnx",
            compress_to_fp16=self.half,
        )
        ov.serialize(ov_model, f_ov)

        return f
boxmot/appearance/exporters/tensorrt_exporter.py
ADDED
|
@@ -0,0 +1,80 @@
import platform
import torch
from boxmot.appearance.exporters.base_exporter import BaseExporter
from boxmot.appearance.exporters.onnx_exporter import ONNXExporter
from boxmot.utils import logger as LOGGER


class EngineExporter(BaseExporter):
    required_packages = ("nvidia-tensorrt",)
    cmds = '--extra-index-url https://pypi.ngc.nvidia.com'

    def export(self):
        assert self.im.device.type != "cpu", "export running on CPU but must be on GPU, i.e. `python export.py --device 0`"
        try:
            import tensorrt as trt
        except ImportError:
            import tensorrt as trt

        onnx_file = self.export_onnx()
        LOGGER.info(f"\nStarting export with TensorRT {trt.__version__}...")
        is_trt10 = int(trt.__version__.split(".")[0]) >= 10  # is TensorRT >= 10
        assert onnx_file.exists(), f"Failed to export ONNX file: {onnx_file}"
        f = self.file.with_suffix(".engine")
        logger = trt.Logger(trt.Logger.INFO)
        if True:
            logger.min_severity = trt.Logger.Severity.VERBOSE

        builder = trt.Builder(logger)
        config = builder.create_builder_config()
        workspace = int(self.workspace * (1 << 30))
        if is_trt10:
            config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace)
        else:  # TensorRT versions 7, 8
            config.max_workspace_size = workspace

        flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
        network = builder.create_network(flag)
        parser = trt.OnnxParser(network, logger)
        if not parser.parse_from_file(str(onnx_file)):
            raise RuntimeError(f"Failed to load ONNX file: {onnx_file}")

        inputs = [network.get_input(i) for i in range(network.num_inputs)]
        outputs = [network.get_output(i) for i in range(network.num_outputs)]
        LOGGER.info("Network Description:")
        for inp in inputs:
            LOGGER.info(f'\tinput "{inp.name}" with shape {inp.shape} and dtype {inp.dtype}')
        for out in outputs:
            LOGGER.info(f'\toutput "{out.name}" with shape {out.shape} and dtype {out.dtype}')

        if self.dynamic:
            if self.im.shape[0] <= 1:
                LOGGER.warning("WARNING: --dynamic model requires maximum --batch-size argument")
            profile = builder.create_optimization_profile()
            for inp in inputs:
                if self.half:
                    inp.dtype = trt.float16
                profile.set_shape(
                    inp.name,
                    (1, *self.im.shape[1:]),
                    (max(1, self.im.shape[0] // 2), *self.im.shape[1:]),
                    self.im.shape,
                )
            config.add_optimization_profile(profile)

        LOGGER.info(f"Building FP{16 if builder.platform_has_fast_fp16 and self.half else 32} engine in {f}")
        if builder.platform_has_fast_fp16 and self.half:
            config.set_flag(trt.BuilderFlag.FP16)
        config.default_device_type = trt.DeviceType.GPU

        build = builder.build_serialized_network if is_trt10 else builder.build_engine
        with build(network, config) as engine, open(f, "wb") as t:
            t.write(engine if is_trt10 else engine.serialize())

        return f

    def export_onnx(self):
        onnx_exporter = ONNXExporter(self.model, self.im, self.file, self.optimize, self.dynamic, self.half, self.simplify)
        return onnx_exporter.export()
boxmot/appearance/exporters/tflite_exporter.py
ADDED
|
@@ -0,0 +1,37 @@
import os
from boxmot.appearance.exporters.base_exporter import BaseExporter
from boxmot.utils import logger as LOGGER


class TFLiteExporter(BaseExporter):
    required_packages = (
        "onnx2tf>=1.18.0",
        "onnx>=1.16.1",
        "tensorflow==2.17.0",
        "tf_keras",  # required by 'onnx2tf' package
        "sng4onnx>=1.0.1",  # required by 'onnx2tf' package
        "onnx_graphsurgeon>=0.3.26",  # required by 'onnx2tf' package
        "onnxslim>=0.1.31",
        "onnxruntime",
        "flatbuffers>=23.5.26",
        "psutil==5.9.5",
        "ml_dtypes==0.3.2",
        "ai_edge_litert>=1.2.0"
    )
    cmds = '--extra-index-url https://pypi.ngc.nvidia.com'

    def export(self):
        import onnx2tf
        input_onnx_file_path = str(self.file.with_suffix('.onnx'))
        output_folder_path = input_onnx_file_path.replace(".onnx", f"_saved_model{os.sep}")
        onnx2tf.convert(
            input_onnx_file_path=input_onnx_file_path,
            output_folder_path=output_folder_path,
            not_use_onnxsim=True,
            verbosity=True,
            # output_integer_quantized_tflite=self.args.int8,
            # quant_type="per-tensor",  # "per-tensor" (faster) or "per-channel" (slower but more accurate)
            # custom_input_op_name_np_data_path=np_data,
        )
        return output_folder_path
boxmot/appearance/exporters/torchscript_exporter.py
ADDED
|
@@ -0,0 +1,15 @@
import torch
from boxmot.appearance.exporters.base_exporter import BaseExporter
from boxmot.utils import logger as LOGGER


class TorchScriptExporter(BaseExporter):
    def export(self):
        f = self.file.with_suffix(".torchscript")
        ts = torch.jit.trace(self.model, self.im, strict=False)
        if self.optimize:
            torch.utils.mobile_optimizer.optimize_for_mobile(ts)._save_for_lite_interpreter(str(f))
        else:
            ts.save(str(f))

        return f
boxmot/appearance/reid/__init__.py
ADDED
|
@@ -0,0 +1,16 @@
# Mikel Broström 🔥 Yolo Tracking 🧾 AGPL-3.0 license

import pandas as pd


def export_formats():
    # yolo tracking export formats
    x = [
        ["PyTorch", "-", ".pt", True, True],
        ["TorchScript", "torchscript", ".torchscript", True, True],
        ["ONNX", "onnx", ".onnx", True, True],
        ["OpenVINO", "openvino", "_openvino_model", True, False],
        ["TensorRT", "engine", ".engine", False, True],
        ["TensorFlow Lite", "tflite", ".tflite", True, False],
    ]
    return pd.DataFrame(x, columns=["Format", "Argument", "Suffix", "CPU", "GPU"])
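For reference, printing this table shows which export targets are wired up and the device each one is expected to run on:

# Quick look at the declared export targets.
from boxmot.appearance.reid import export_formats

df = export_formats()
print(df)                    # Format / Argument / Suffix / CPU / GPU columns
print(list(df["Argument"]))  # ['-', 'torchscript', 'onnx', 'openvino', 'engine', 'tflite']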
boxmot/appearance/reid/auto_backend.py
ADDED
|
@@ -0,0 +1,128 @@
import torch
from pathlib import Path
from typing import Union, Tuple

from boxmot.utils import WEIGHTS
from boxmot.utils import logger as LOGGER
from boxmot.utils.torch_utils import select_device
from boxmot.appearance.reid import export_formats
from boxmot.appearance.backends.onnx_backend import ONNXBackend
from boxmot.appearance.backends.openvino_backend import OpenVinoBackend
from boxmot.appearance.backends.pytorch_backend import PyTorchBackend
from boxmot.appearance.backends.tensorrt_backend import TensorRTBackend
from boxmot.appearance.backends.tflite_backend import TFLiteBackend
from boxmot.appearance.backends.torchscript_backend import TorchscriptBackend
from boxmot.appearance.backends.base_backend import BaseModelBackend


class ReidAutoBackend():
    def __init__(
        self,
        weights: Path = WEIGHTS / "osnet_x0_25_msmt17.pt",
        device: torch.device = torch.device("cpu"),
        half: bool = False) -> None:
        """
        Initializes the ReidAutoBackend instance with specified weights, device, and precision mode.

        Args:
            weights (Union[str, List[str]]): Path to the model weights. Can be a string or a list of strings; if a list, the first element is used.
            device (torch.device): The device to run the model on, e.g., CPU or GPU.
            half (bool): Whether to use half precision for model inference.
        """
        super().__init__()
        w = weights[0] if isinstance(weights, list) else weights
        (
            self.pt,
            self.jit,
            self.onnx,
            self.xml,
            self.engine,
            self.tflite,
        ) = self.model_type(w)  # get backend

        self.weights = weights
        self.device = select_device(device)
        self.half = half
        self.model = self.get_backend()

    def get_backend(self) -> Union['PyTorchBackend', 'TorchscriptBackend', 'ONNXBackend', 'TensorRTBackend', 'OpenVinoBackend', 'TFLiteBackend']:
        """
        Returns an instance of the appropriate backend based on the model type.

        Returns:
            An instance of a backend class corresponding to the detected model type.

        Raises:
            SystemExit: If no supported model framework is detected.
        """

        # Mapping of conditions to backend constructors
        backend_map = {
            self.pt: PyTorchBackend,
            self.jit: TorchscriptBackend,
            self.onnx: ONNXBackend,
            self.engine: TensorRTBackend,
            self.xml: OpenVinoBackend,
            self.tflite: TFLiteBackend
        }

        # Iterate through the mapping and return the first matching backend
        for condition, backend_class in backend_map.items():
            if condition:
                return backend_class(self.weights, self.device, self.half)

        # If no condition is met, log an error and exit
        LOGGER.error("This model framework is not supported yet!")
        exit()

    def forward(self, im_batch: torch.Tensor) -> torch.Tensor:
        """
        Processes an image batch through the selected backend and returns the processed batch.

        Args:
            im_batch (torch.Tensor): The batch of images to process.

        Returns:
            torch.Tensor: The processed image batch.
        """
        im_batch = self.backend.preprocess_input(im_batch)
        return self.backend.get_features(im_batch)

    def check_suffix(self, file: Path = "osnet_x0_25_msmt17.pt", suffix: Union[str, Tuple[str, ...]] = (".pt",), msg: str = "") -> None:
        """
        Validates that the file or files have an acceptable suffix.

        Args:
            file (Union[str, List[str], Path]): The file or files to check.
            suffix (Union[str, Tuple[str, ...]]): Acceptable suffix or suffixes.
            msg (str): Additional message to log in case of an error.
        """

        suffix = [suffix] if isinstance(suffix, str) else list(suffix)
        files = [file] if isinstance(file, (str, Path)) else list(file)

        for f in files:
            file_suffix = Path(f).suffix.lower()
            if file_suffix and file_suffix not in suffix:
                LOGGER.error(f"File {f} does not have an acceptable suffix. Expected: {suffix}")

    def model_type(self, p: Path) -> Tuple[bool, ...]:
        """
        Determines the model type based on the file's suffix.

        Args:
            path (str): The file path to the model.

        Returns:
            Tuple[bool, ...]: A tuple of booleans indicating the model type, corresponding to pt, jit, onnx, xml, engine, and tflite.
        """

        sf = list(export_formats().Suffix)  # export suffixes
        self.check_suffix(p, sf)  # checks
        types = [s in Path(p).name for s in sf]
        return types
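A short usage sketch (the weights path is illustrative; note that the concrete backend chosen from the weight suffix is stored on the model attribute):

# Illustrative only: suffix-based backend selection and feature extraction.
import numpy as np
import torch
from boxmot.appearance.reid.auto_backend import ReidAutoBackend
from boxmot.utils import WEIGHTS

rab = ReidAutoBackend(weights=WEIGHTS / "osnet_x0_25_msmt17.pt",
                      device=torch.device("cpu"), half=False)
reid = rab.model                                   # e.g. a PyTorchBackend for .pt weights
img = np.zeros((480, 640, 3), dtype=np.uint8)      # dummy BGR frame
dets = np.array([[10, 10, 110, 210]])              # one xyxy detection
embs = reid.get_features(dets, img)                # (1, embedding_dim) numpy array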
boxmot/appearance/reid/config.py
ADDED
|
@@ -0,0 +1,73 @@
MODEL_TYPES = [
    "resnet50",
    "resnet101",
    "mlfn",
    "hacnn",
    "mobilenetv2_x1_0",
    "mobilenetv2_x1_4",
    "osnet_x1_0",
    "osnet_x0_75",
    "osnet_x0_5",
    "osnet_x0_25",
    "osnet_ibn_x1_0",
    "osnet_ain_x1_0",
    "lmbn_n",
    "clip",
]

TRAINED_URLS = {
    # resnet50
    "resnet50_market1501.pt": "https://drive.google.com/uc?id=1dUUZ4rHDWohmsQXCRe2C_HbYkzz94iBV",
    "resnet50_dukemtmcreid.pt": "https://drive.google.com/uc?id=17ymnLglnc64NRvGOitY3BqMRS9UWd1wg",
    "resnet50_msmt17.pt": "https://drive.google.com/uc?id=1ep7RypVDOthCRIAqDnn4_N-UhkkFHJsj",
    "resnet50_fc512_market1501.pt": "https://drive.google.com/uc?id=1kv8l5laX_YCdIGVCetjlNdzKIA3NvsSt",
    "resnet50_fc512_dukemtmcreid.pt": "https://drive.google.com/uc?id=13QN8Mp3XH81GK4BPGXobKHKyTGH50Rtx",
    "resnet50_fc512_msmt17.pt": "https://drive.google.com/uc?id=1fDJLcz4O5wxNSUvImIIjoaIF9u1Rwaud",
    # mlfn
    "mlfn_market1501.pt": "https://drive.google.com/uc?id=1wXcvhA_b1kpDfrt9s2Pma-MHxtj9pmvS",
    "mlfn_dukemtmcreid.pt": "https://drive.google.com/uc?id=1rExgrTNb0VCIcOnXfMsbwSUW1h2L1Bum",
    "mlfn_msmt17.pt": "https://drive.google.com/uc?id=18JzsZlJb3Wm7irCbZbZ07TN4IFKvR6p-",
    # hacnn
    "hacnn_market1501.pt": "https://drive.google.com/uc?id=1LRKIQduThwGxMDQMiVkTScBwR7WidmYF",
    "hacnn_dukemtmcreid.pt": "https://drive.google.com/uc?id=1zNm6tP4ozFUCUQ7Sv1Z98EAJWXJEhtYH",
    "hacnn_msmt17.pt": "https://drive.google.com/uc?id=1MsKRtPM5WJ3_Tk2xC0aGOO7pM3VaFDNZ",
    # mobilenetv2
    "mobilenetv2_x1_0_market1501.pt": "https://drive.google.com/uc?id=18DgHC2ZJkjekVoqBWszD8_Xiikz-fewp",
    "mobilenetv2_x1_0_dukemtmcreid.pt": "https://drive.google.com/uc?id=1q1WU2FETRJ3BXcpVtfJUuqq4z3psetds",
    "mobilenetv2_x1_0_msmt17.pt": "https://drive.google.com/uc?id=1j50Hv14NOUAg7ZeB3frzfX-WYLi7SrhZ",
    "mobilenetv2_x1_4_market1501.pt": "https://drive.google.com/uc?id=1t6JCqphJG-fwwPVkRLmGGyEBhGOf2GO5",
    "mobilenetv2_x1_4_dukemtmcreid.pt": "https://drive.google.com/uc?id=12uD5FeVqLg9-AFDju2L7SQxjmPb4zpBN",
    "mobilenetv2_x1_4_msmt17.pt": "https://drive.google.com/uc?id=1ZY5P2Zgm-3RbDpbXM0kIBMPvspeNIbXz",
    # osnet
    "osnet_x1_0_market1501.pt": "https://drive.google.com/uc?id=1vduhq5DpN2q1g4fYEZfPI17MJeh9qyrA",
    "osnet_x1_0_dukemtmcreid.pt": "https://drive.google.com/uc?id=1QZO_4sNf4hdOKKKzKc-TZU9WW1v6zQbq",
    "osnet_x1_0_msmt17.pt": "https://drive.google.com/uc?id=112EMUfBPYeYg70w-syK6V6Mx8-Qb9Q1M",
    "osnet_x0_75_market1501.pt": "https://drive.google.com/uc?id=1ozRaDSQw_EQ8_93OUmjDbvLXw9TnfPer",
    "osnet_x0_75_dukemtmcreid.pt": "https://drive.google.com/uc?id=1IE3KRaTPp4OUa6PGTFL_d5_KQSJbP0Or",
    "osnet_x0_75_msmt17.pt": "https://drive.google.com/uc?id=1QEGO6WnJ-BmUzVPd3q9NoaO_GsPNlmWc",
    "osnet_x0_5_market1501.pt": "https://drive.google.com/uc?id=1PLB9rgqrUM7blWrg4QlprCuPT7ILYGKT",
    "osnet_x0_5_dukemtmcreid.pt": "https://drive.google.com/uc?id=1KoUVqmiST175hnkALg9XuTi1oYpqcyTu",
    "osnet_x0_5_msmt17.pt": "https://drive.google.com/uc?id=1UT3AxIaDvS2PdxzZmbkLmjtiqq7AIKCv",
    "osnet_x0_25_market1501.pt": "https://drive.google.com/uc?id=1z1UghYvOTtjx7kEoRfmqSMu-z62J6MAj",
    "osnet_x0_25_dukemtmcreid.pt": "https://drive.google.com/uc?id=1eumrtiXT4NOspjyEV4j8cHmlOaaCGk5l",
    "osnet_x0_25_msmt17.pt": "https://drive.google.com/uc?id=1sSwXSUlj4_tHZequ_iZ8w_Jh0VaRQMqF",
    # osnet_ain | osnet_ibn
    "osnet_ibn_x1_0_msmt17.pt": "https://drive.google.com/uc?id=1q3Sj2ii34NlfxA4LvmHdWO_75NDRmECJ",
    "osnet_ain_x1_0_msmt17.pt": "https://drive.google.com/uc?id=1SigwBE6mPdqiJMqhuIY4aqC7--5CsMal",
    # lmbn
    "lmbn_n_duke.pt": "https://github.com/mikel-brostrom/yolov8_tracking/releases/download/v9.0/lmbn_n_duke.pth",
    "lmbn_n_market.pt": "https://github.com/mikel-brostrom/yolov8_tracking/releases/download/v9.0/lmbn_n_market.pth",
    "lmbn_n_cuhk03_d.pt": "https://github.com/mikel-brostrom/yolov8_tracking/releases/download/v9.0/lmbn_n_cuhk03_d.pth",
    # clip
    "clip_market1501.pt": "https://drive.google.com/uc?id=1GnyAVeNOg3Yug1KBBWMKKbT2x43O5Ch7",
    "clip_duke.pt": "https://drive.google.com/uc?id=1ldjSkj-7pXAWmx8on5x0EftlCaolU4dY",
    "clip_veri.pt": "https://drive.google.com/uc?id=1RyfHdOBI2pan_wIGSim5-l6cM4S2WN8e",
    "clip_vehicleid.pt": "https://drive.google.com/uc?id=168BLegHHxNqatW5wx1YyL2REaThWoof5"
}

NR_CLASSES_DICT = {
    "market1501": 751,
    "duke": 702,
    "veri": 576,
    "vehicleid": 576,
}
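A small sketch of how these tables can be consulted for a given weight file name; the substring matching below only illustrates the naming convention and is not necessarily how registry.py resolves it:

# Illustrative lookup based on the weight-name convention above.
from boxmot.appearance.reid.config import NR_CLASSES_DICT, TRAINED_URLS

weight_name = "osnet_x0_25_market1501.pt"
num_classes = next(v for k, v in NR_CLASSES_DICT.items() if k in weight_name)
print(num_classes)                  # 751 (Market-1501 identities)
print(weight_name in TRAINED_URLS)  # True -> a download URL is registered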
boxmot/appearance/reid/export.py
ADDED
@@ -0,0 +1,227 @@
+#!/usr/bin/env python3
+import argparse
+import time
+from pathlib import Path
+
+import torch
+
+from boxmot.appearance.exporters.base_exporter import BaseExporter
+from boxmot.appearance.exporters.onnx_exporter import ONNXExporter
+from boxmot.appearance.exporters.openvino_exporter import OpenVINOExporter
+from boxmot.appearance.exporters.tflite_exporter import TFLiteExporter
+from boxmot.appearance.exporters.torchscript_exporter import TorchScriptExporter
+from boxmot.appearance.exporters.tensorrt_exporter import EngineExporter
+from boxmot.appearance.reid import export_formats
+from boxmot.appearance.reid.auto_backend import ReidAutoBackend
+from boxmot.appearance.reid.registry import ReIDModelRegistry
+from boxmot.utils import WEIGHTS, logger as LOGGER
+from boxmot.utils.torch_utils import select_device
+
+
+def parse_args():
+    """
+    Parse command-line arguments for the ReID export script.
+    """
+    parser = argparse.ArgumentParser(description="ReID Export Script")
+    parser.add_argument("--batch-size", type=int, default=1, help="Batch size for export")
+    parser.add_argument("--imgsz", "--img", "--img-size",
+                        nargs="+", type=int, default=[256, 128],
+                        help="Image size in the format: height width")
+    parser.add_argument("--device", default="cpu",
+                        help="CUDA device (e.g., '0', '0,1,2,3', or 'cpu')")
+    parser.add_argument("--optimize", action="store_true",
+                        help="Optimize TorchScript for mobile (CPU export only)")
+    parser.add_argument("--dynamic", action="store_true",
+                        help="Enable dynamic axes for ONNX/TF/TensorRT export")
+    parser.add_argument("--simplify", action="store_true",
+                        help="Simplify ONNX model")
+    parser.add_argument("--opset", type=int, default=12,
+                        help="ONNX opset version")
+    parser.add_argument("--workspace", type=int, default=4,
+                        help="TensorRT workspace size (GB)")
+    parser.add_argument("--verbose", action="store_true",
+                        help="Enable verbose logging for TensorRT")
+    parser.add_argument("--weights", type=Path,
+                        default=WEIGHTS / "osnet_x0_25_msmt17.pt",
+                        help="Path to the model weights (.pt file)")
+    parser.add_argument("--half", action="store_true",
+                        help="Enable FP16 half-precision export (GPU only)")
+    parser.add_argument("--include", nargs="+",
+                        default=["torchscript"],
+                        help=("Export formats to include. Options: torchscript, onnx, "
+                              "openvino, engine, tflite"))
+    return parser.parse_args()
+
+
+def validate_export_formats(include):
+    """
+    Validate the provided export formats and return corresponding flags.
+
+    Args:
+        include (list): List of export formats provided via the command line.
+
+    Returns:
+        tuple: Boolean flags for each export format in the order:
+               (torchscript, onnx, openvino, engine, tflite)
+    """
+    available_formats = tuple(export_formats()["Argument"][1:])
+    include_lower = [fmt.lower() for fmt in include]
+    flags = [fmt in include_lower for fmt in available_formats]
+    if sum(flags) != len(include_lower):
+        raise AssertionError(
+            f"ERROR: Invalid --include {include}, valid arguments are {available_formats}"
+        )
+    return tuple(flags)
+
+
+def setup_model(args):
+    """
+    Initialize and prepare the ReID model for export.
+
+    Args:
+        args: Parsed command-line arguments.
+
+    Returns:
+        tuple: (model (torch.nn.Module), dummy_input (torch.Tensor))
+    """
+    # Select the correct device
+    args.device = select_device(args.device)
+    if args.half and args.device.type == "cpu":
+        raise AssertionError("--half only compatible with GPU export, use --device 0 for GPU")
+
+    # Initialize backend model using the auto backend
+    auto_backend = ReidAutoBackend(weights=args.weights, device=args.device, half=args.half)
+    _ = auto_backend.get_backend()  # Backend model is managed internally
+
+    # Build and load the ReID model from the registry
+    model_name = ReIDModelRegistry.get_model_name(args.weights)
+    nr_classes = ReIDModelRegistry.get_nr_classes(args.weights)
+    pretrained = not (args.weights and args.weights.is_file() and args.weights.suffix == ".pt")
+    model = ReIDModelRegistry.build_model(
+        model_name,
+        num_classes=nr_classes,
+        pretrained=pretrained,
+        use_gpu=args.device,
+    ).to(args.device)
+    ReIDModelRegistry.load_pretrained_weights(model, args.weights)
+    model.eval()
+
+    # Ensure --optimize is only used with CPU exports
+    if args.optimize and args.device.type != "cpu":
+        raise AssertionError("--optimize not compatible with CUDA devices, use --device cpu")
+
+    # Adjust image size if a specific weight type is detected
+    if "lmbn" in str(args.weights):
+        args.imgsz = [384, 128]
+
+    # Create dummy input tensor for warming up the model
+    dummy_input = torch.empty(args.batch_size, 3, args.imgsz[0], args.imgsz[1]).to(args.device)
+    for _ in range(2):
+        _ = model(dummy_input)
+
+    # Convert to half precision if required
+    if args.half:
+        dummy_input = dummy_input.half()
+        model = model.half()
+
+    return model, dummy_input
+
+
+def create_export_tasks(args, model, dummy_input):
+    """
+    Create a mapping of export tasks with associated flags, exporter classes, and parameters.
+
+    Args:
+        args: Parsed command-line arguments.
+        model: Prepared ReID model.
+        dummy_input: Dummy input tensor.
+
+    Returns:
+        dict: Mapping of export format to a tuple (flag, exporter_class, export_args)
+    """
+    torchscript_flag, onnx_flag, openvino_flag, engine_flag, tflite_flag = validate_export_formats(args.include)
+    return {
+        "torchscript": (
+            torchscript_flag,
+            TorchScriptExporter,
+            (model, dummy_input, args.weights, args.optimize)
+        ),
+        "engine": (
+            engine_flag,
+            EngineExporter,
+            (model, dummy_input, args.weights, args.half, args.dynamic, args.simplify, args.verbose)
+        ),
+        "onnx": (
+            onnx_flag,
+            ONNXExporter,
+            (model, dummy_input, args.weights, args.opset, args.dynamic, args.half, args.simplify)
+        ),
+        "tflite": (
+            tflite_flag,
+            TFLiteExporter,
+            (model, dummy_input, args.weights)
+        ),
+        "openvino": (
+            openvino_flag,
+            OpenVINOExporter,
+            (model, dummy_input, args.weights, args.half)
+        )
+    }
+
+
+def perform_exports(export_tasks):
+    """
+    Iterate over export tasks and perform export for enabled formats.
+
+    Args:
+        export_tasks (dict): Mapping of export tasks.
+
+    Returns:
+        dict: Mapping of export format to export results.
+    """
+    exported_files = {}
+    for fmt, (flag, exporter_class, exp_args) in export_tasks.items():
+        if flag:
+            exporter = exporter_class(*exp_args)
+            export_result = exporter.export()
+            exported_files[fmt] = export_result
+    return exported_files
+
+
+def main():
+    """Main function to execute the ReID export process."""
+    args = parse_args()
+    start_time = time.time()
+
+    # Ensure the weights directory exists
+    WEIGHTS.mkdir(parents=False, exist_ok=True)
+
+    # Setup model and create a dummy input tensor
+    model, dummy_input = setup_model(args)
+
+    # Log model output shape and file size
+    output = model(dummy_input)
+    output_tensor = output[0] if isinstance(output, tuple) else output
+    output_shape = tuple(output_tensor.shape)
+    LOGGER.info(
+        f"\nStarting from {args.weights} with output shape {output_shape} "
+        f"({BaseExporter.file_size(args.weights):.1f} MB)"
+    )
+
+    # Create export tasks
+    export_tasks = create_export_tasks(args, model, dummy_input)
+
+    # Perform exports for enabled formats
+    exported_files = perform_exports(export_tasks)
+
+    if exported_files:
+        elapsed_time = time.time() - start_time
+        LOGGER.info(
+            f"\nExport complete ({elapsed_time:.1f}s)"
+            f"\nResults saved to {args.weights.parent.resolve()}"
+            f"\nVisualize: https://netron.app"
+        )
+
+
+if __name__ == "__main__":
+    main()
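As a rough usage sketch (hypothetical, not part of the commit; the import path is assumed from the file location and the referenced weights must exist locally), the helpers above compose into the same flow that main() runs:

from boxmot.appearance.reid.export import (
    parse_args, setup_model, create_export_tasks, perform_exports,
)

# roughly equivalent to: python export.py --weights osnet_x0_25_msmt17.pt --include torchscript onnx
args = parse_args()                                    # CLI flags -> namespace
model, dummy_input = setup_model(args)                 # build, load and warm up the ReID model
tasks = create_export_tasks(args, model, dummy_input)  # format -> (flag, exporter class, exporter args)
results = perform_exports(tasks)                       # only formats whose flag is set are exported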
boxmot/appearance/reid/factory.py
ADDED
@@ -0,0 +1,40 @@
+from boxmot.appearance.backbones.clip.make_model import make_model
+from boxmot.appearance.backbones.hacnn import HACNN
+from boxmot.appearance.backbones.lmbn.lmbn_n import LMBN_n
+from boxmot.appearance.backbones.mlfn import mlfn
+from boxmot.appearance.backbones.mobilenetv2 import mobilenetv2_x1_0, mobilenetv2_x1_4
+from boxmot.appearance.backbones.osnet import (
+    osnet_ibn_x1_0,
+    osnet_x0_5,
+    osnet_x0_25,
+    osnet_x0_75,
+    osnet_x1_0,
+)
+from boxmot.appearance.backbones.osnet_ain import (
+    osnet_ain_x0_5,
+    osnet_ain_x0_25,
+    osnet_ain_x0_75,
+    osnet_ain_x1_0,
+)
+from boxmot.appearance.backbones.resnet import resnet50, resnet101
+
+# Map model names to their respective constructors
+MODEL_FACTORY = {
+    "resnet50": resnet50,
+    "resnet101": resnet101,
+    "mobilenetv2_x1_0": mobilenetv2_x1_0,
+    "mobilenetv2_x1_4": mobilenetv2_x1_4,
+    "hacnn": HACNN,
+    "mlfn": mlfn,
+    "osnet_x1_0": osnet_x1_0,
+    "osnet_x0_75": osnet_x0_75,
+    "osnet_x0_5": osnet_x0_5,
+    "osnet_x0_25": osnet_x0_25,
+    "osnet_ibn_x1_0": osnet_ibn_x1_0,
+    "osnet_ain_x1_0": osnet_ain_x1_0,
+    "osnet_ain_x0_75": osnet_ain_x0_75,
+    "osnet_ain_x0_5": osnet_ain_x0_5,
+    "osnet_ain_x0_25": osnet_ain_x0_25,
+    "lmbn_n": LMBN_n,
+    "clip": make_model,
+}
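A small sketch of how this mapping is typically consumed (hypothetical call; the keyword arguments follow the registry's build_model shown next):

from boxmot.appearance.reid.factory import MODEL_FACTORY

build_fn = MODEL_FACTORY["osnet_x0_25"]
model = build_fn(num_classes=751, loss="softmax", pretrained=False, use_gpu=False)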
boxmot/appearance/reid/registry.py
ADDED
@@ -0,0 +1,87 @@
+# model_registry.py
+import torch
+from collections import OrderedDict
+from boxmot.utils import logger as LOGGER
+
+from boxmot.appearance.reid.config import MODEL_TYPES, TRAINED_URLS, NR_CLASSES_DICT
+from boxmot.appearance.reid.factory import MODEL_FACTORY
+
+class ReIDModelRegistry:
+    """Encapsulates model registration and related utilities."""
+
+    @staticmethod
+    def show_downloadable_models():
+        LOGGER.info("Available .pt ReID models for automatic download")
+        LOGGER.info(list(TRAINED_URLS.keys()))
+
+    @staticmethod
+    def get_model_name(model):
+        for name in MODEL_TYPES:
+            if name in model.name:
+                return name
+        return None
+
+    @staticmethod
+    def get_model_url(model):
+        return TRAINED_URLS.get(model.name, None)
+
+    @staticmethod
+    def load_pretrained_weights(model, weight_path):
+        """
+        Loads pretrained weights into a model.
+        Chooses the proper map_location based on CUDA availability.
+        """
+        device = "cpu" if not torch.cuda.is_available() else None
+        checkpoint = torch.load(weight_path, map_location=torch.device("cpu") if device == "cpu" else None)
+        state_dict = checkpoint.get("state_dict", checkpoint)
+        model_dict = model.state_dict()
+
+        if "lmbn" in weight_path.parts:
+            model.load_state_dict(state_dict, strict=True)
+        else:
+            new_state_dict = OrderedDict()
+            matched_layers, discarded_layers = [], []
+            for k, v in state_dict.items():
+                # Remove 'module.' prefix if present
+                key = k[7:] if k.startswith("module.") else k
+                if key in model_dict and model_dict[key].size() == v.size():
+                    new_state_dict[key] = v
+                    matched_layers.append(key)
+                else:
+                    discarded_layers.append(key)
+            model_dict.update(new_state_dict)
+            model.load_state_dict(model_dict)
+
+            if not matched_layers:
+                LOGGER.debug(f"Pretrained weights from {weight_path} cannot be loaded. Check key names manually.")
+            else:
+                LOGGER.success(f"Loaded pretrained weights from {weight_path}")
+
+            if discarded_layers:
+                LOGGER.debug(f"Discarded layers due to unmatched keys or size: {discarded_layers}")
+
+    @staticmethod
+    def show_available_models():
+        LOGGER.info("Available models:")
+        LOGGER.info(list(MODEL_FACTORY.keys()))
+
+    @staticmethod
+    def get_nr_classes(weights):
+        # Extract dataset name from weights name, then look up in the class dictionary
+        dataset_key = weights.name.split('_')[1]
+        return NR_CLASSES_DICT.get(dataset_key, 1)
+
+    @staticmethod
+    def build_model(name, num_classes, loss="softmax", pretrained=True, use_gpu=True):
+        if name not in MODEL_FACTORY:
+            available = list(MODEL_FACTORY.keys())
+            raise KeyError(f"Unknown model '{name}'. Must be one of {available}")
+
+        # Special case handling for clip model
+        if 'clip' in name:
+            from boxmot.appearance.backbones.clip.config.defaults import _C as cfg
+            return MODEL_FACTORY[name](cfg, num_class=num_classes, camera_num=2, view_num=1)
+
+        return MODEL_FACTORY[name](
+            num_classes=num_classes, loss=loss, pretrained=pretrained, use_gpu=use_gpu
+        )
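Putting the registry helpers together (hypothetical sketch, not part of the commit; it assumes the osnet_x0_25_msmt17.pt checkpoint referenced by export.py is available locally and that "osnet_x0_25" is listed in MODEL_TYPES):

from pathlib import Path
from boxmot.appearance.reid.registry import ReIDModelRegistry

weights = Path("osnet_x0_25_msmt17.pt")
name = ReIDModelRegistry.get_model_name(weights)    # e.g. "osnet_x0_25"
n_cls = ReIDModelRegistry.get_nr_classes(weights)   # dataset lookup, falls back to 1
model = ReIDModelRegistry.build_model(name, num_classes=n_cls, pretrained=False, use_gpu=False)
ReIDModelRegistry.load_pretrained_weights(model, weights)
model.eval()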
boxmot/configs/__init__.py
ADDED
@@ -0,0 +1 @@
+# Mikel Broström 🔥 Yolo Tracking 🧾 AGPL-3.0 license
boxmot/configs/boosttrack.yaml
ADDED
@@ -0,0 +1,90 @@
+max_age:
+  type: uniform
+  default: 60
+  range: [15, 90]
+
+min_hits:
+  type: uniform
+  default: 3
+  range: [1, 5]
+
+det_thresh:
+  type: uniform
+  default: 0.6
+  range: [0.1, 0.9]
+
+iou_threshold:
+  type: uniform
+  default: 0.3
+  range: [0.1, 0.9]
+
+use_ecc:
+  type: choice
+  default: True
+  options: [False, True]
+
+min_box_area:
+  type: uniform
+  default: 10
+  range: [5, 100]
+
+aspect_ratio_thresh:
+  type: uniform
+  default: 1.6
+  range: [0.1, 2.0]
+
+lambda_iou:
+  type: uniform
+  default: 0.5
+  range: [0.3, 2.0]
+
+lambda_mhd:
+  type: uniform
+  default: 0.25
+  range: [0.5, 2.0]
+
+lambda_shape:
+  type: uniform
+  default: 0.25
+  range: [0.5, 2.0]
+
+use_dlo_boost:
+  type: choice
+  default: True
+  options: [False, True]
+
+use_duo_boost:
+  type: choice
+  default: True
+  options: [False, True]
+
+dlo_boost_coef:
+  type: uniform
+  default: 0.65
+  range: [0.3, 2.0]
+
+s_sim_corr:
+  type: choice
+  default: False
+  options: [False, True]
+
+use_rich_s:
+  type: choice
+  default: True # True for BoostTrack++
+  options: [False, True]
+
+use_sb:
+  type: choice
+  default: True # True for BoostTrack++
+  options: [False, True]
+
+use_vt:
+  type: choice
+  default: True # True for BoostTrack++
+  options: [False, True]
+
+with_reid:
+  type: choice
+  default: True # True for BoostTrack+ and BoostTrack++
+  options: [False, True]
+
boxmot/configs/botsort.yaml
ADDED
@@ -0,0 +1,39 @@
+track_high_thresh:
+  type: uniform
+  default: 0.6 # from the default parameters
+  range: [0.3, 0.7]
+
+track_low_thresh:
+  type: uniform
+  default: 0.1 # from the default parameters
+  range: [0.1, 0.3]
+
+new_track_thresh:
+  type: uniform
+  default: 0.7 # from the default parameters
+  range: [0.1, 0.8]
+
+track_buffer:
+  type: randint
+  default: 30 # from the default parameters
+  range: [20, 81]
+
+match_thresh:
+  type: uniform
+  default: 0.8 # from the default parameters
+  range: [0.1, 0.9]
+
+proximity_thresh:
+  type: uniform
+  default: 0.5 # from the default parameters
+  range: [0.25, 0.75]
+
+appearance_thresh:
+  type: uniform
+  default: 0.25 # from the default parameters
+  range: [0.1, 0.8]
+
+cmc_method:
+  type: choice
+  default: ecc # from the default parameters
+  options: [sof, ecc]
boxmot/configs/bytetrack.yaml
ADDED
@@ -0,0 +1,24 @@
+min_conf:
+  type: uniform
+  default: 0.1 # from the default parameters
+  range: [0.1, 0.3]
+
+track_thresh:
+  type: uniform
+  default: 0.6 # from the default parameters
+  range: [0.4, 0.6]
+
+track_buffer:
+  type: randint
+  default: 30 # from the default parameters
+  range: [10, 61, 10] # step size of 10, upper bound exclusive
+
+match_thresh:
+  type: uniform
+  default: 0.9 # from the default parameters
+  range: [0.7, 0.9]
+
+frame_rate:
+  type: choice
+  default: 30 # from the default parameters
+  choices: [30] # static choice for Ray Search
boxmot/configs/deepocsort.yaml
ADDED
@@ -0,0 +1,74 @@
+det_thresh:
+  type: uniform
+  default: 0.5 # from the default parameters
+  range: [0.3, 0.6]
+
+max_age:
+  type: randint
+  default: 30 # from the default parameters
+  range: [10, 61, 10] # step size of 10, upper bound exclusive
+
+min_hits:
+  type: randint
+  default: 3 # from the default parameters
+  range: [1, 6] # upper bound exclusive
+
+iou_thresh:
+  type: uniform
+  default: 0.3 # from the default parameters
+  range: [0.1, 0.4]
+
+delta_t:
+  type: randint
+  default: 3 # from the default parameters
+  range: [1, 6] # upper bound exclusive
+
+asso_func:
+  type: choice
+  default: iou # from the default parameters
+  options: ['iou', 'giou', 'diou', 'ciou', 'hmiou']
+
+inertia:
+  type: uniform
+  default: 0.2 # from the default parameters
+  range: [0.1, 0.4]
+
+w_association_emb:
+  type: uniform
+  default: 0.75 # from the default parameters
+  range: [0.5, 0.9]
+
+alpha_fixed_emb:
+  type: uniform
+  default: 0.95 # from the default parameters
+  range: [0.9, 0.999]
+
+aw_param:
+  type: uniform
+  default: 0.5 # from the default parameters
+  range: [0.3, 0.7]
+
+embedding_off:
+  type: choice
+  default: false # from the default parameters
+  options: [True, False]
+
+cmc_off:
+  type: choice
+  default: false # from the default parameters
+  options: [True, False]
+
+aw_off:
+  type: choice
+  default: false # from the default parameters
+  options: [True, False]
+
+Q_xy_scaling:
+  type: uniform
+  default: 0.01 # from the default parameters
+  range: [0.01, 1]
+
+Q_s_scaling:
+  type: uniform
+  default: 0.0001 # from the default parameters
+  range: [0.0001, 1]
boxmot/configs/hybridsort.yaml
ADDED
@@ -0,0 +1,49 @@
+det_thresh:
+  type: uniform
+  default: 0.12442660055370669 # from the default parameters
+  range: [0, 0.6]
+
+max_age:
+  type: randint
+  default: 30 # from the default parameters
+  range: [10, 151, 10] # step size of 10, upper bound exclusive
+
+min_hits:
+  type: randint
+  default: 1 # from the default parameters
+  range: [1, 6] # upper bound exclusive
+
+delta_t:
+  type: randint
+  default: 5 # from the default parameters
+  range: [1, 6] # upper bound exclusive
+
+asso_func:
+  type: choice
+  default: hmiou # from the default parameters
+  options: ['iou', 'giou', 'diou']
+
+iou_threshold:
+  type: uniform
+  default: 0.3 # from the default parameters
+  range: [0.1, 0.4]
+
+inertia:
+  type: uniform
+  default: 0.369525477649008 # from the default parameters
+  range: [0.1, 0.4]
+
+TCM_first_step_weight:
+  type: uniform
+  default: 0.2866529225304586 # from the default parameters
+  range: [0, 0.5]
+
+longterm_reid_weight:
+  type: uniform
+  default: 0.0509704360503877 # from the default parameters
+  range: [0, 0.5]
+
+use_byte:
+  type: choice
+  default: False # from the default parameters
+  options: [True, False]