George committed on
Commit
61214ab
1 Parent(s): 6c635c4

change to clean model

conda.txt DELETED
@@ -1,5 +0,0 @@
1
- python=3.9
2
- pytorch=1.12.1
3
- torchvision=0.13.1
4
- torchaudio=0.12.1
5
- cudatoolkit=11.3.1
gender_age.py DELETED
@@ -1,93 +0,0 @@
1
- import cv2
2
- import shutil
3
- import numpy as np
4
- from dataclasses import dataclass
5
- from tqdm import tqdm
6
- from mivolo.predictor import Predictor
7
- from utils import *
8
-
9
- import warnings
10
- warnings.filterwarnings("ignore")
11
-
12
-
13
- @dataclass
14
- class Cfg:
15
- detector_weights: str
16
- checkpoint: str
17
- device: str = "cuda"
18
- with_persons: bool = True
19
- disable_faces: bool = False
20
- draw: bool = True
21
-
22
-
23
- class ValidImgDetector:
24
-
25
- predictor = None
26
-
27
- def __init__(self):
28
- detector_path = "./model/yolov8x_person_face.pt"
29
- age_gender_path = "./model/model_imdb_cross_person_4.22_99.46.pth.tar"
30
- predictor_cfg = Cfg(detector_path, age_gender_path)
31
- self.predictor = Predictor(predictor_cfg)
32
-
33
- def _detect(
34
- self,
35
- image: np.ndarray,
36
- score_threshold: float,
37
- iou_threshold: float,
38
- mode: str,
39
- predictor: Predictor
40
- ) -> np.ndarray:
41
- # input is rgb image, output must be rgb too
42
- predictor.detector.detector_kwargs['conf'] = score_threshold
43
- predictor.detector.detector_kwargs['iou'] = iou_threshold
44
-
45
- if mode == "Use persons and faces":
46
- use_persons = True
47
- disable_faces = False
48
- elif mode == "Use persons only":
49
- use_persons = True
50
- disable_faces = True
51
- elif mode == "Use faces only":
52
- use_persons = False
53
- disable_faces = False
54
-
55
- predictor.age_gender_model.meta.use_persons = use_persons
56
- predictor.age_gender_model.meta.disable_faces = disable_faces
57
-
58
- # image = image[:, :, ::-1] # RGB -> BGR
59
- detected_objects, _ = predictor.recognize(image)
60
-
61
- has_child, has_female, has_male = False, False, False
62
- if len(detected_objects.ages) > 0:
63
- has_child = min(detected_objects.ages) < 18
64
- has_female = 'female' in detected_objects.genders
65
- has_male = 'male' in detected_objects.genders
66
-
67
- return has_child, has_female, has_male
68
-
69
- def valid_img(self, img_path):
70
- image = cv2.imread(img_path)
71
- has_child, has_female, has_male = self._detect(
72
- image, 0.4, 0.7, "Use persons and faces", self.predictor)
73
- return (not has_child) and (has_female) and (not has_male)
74
-
75
-
76
- def filter_img():
77
- detector = ValidImgDetector()
78
- create_dir('./output/valid')
79
- create_dir('./output/invalid')
80
-
81
- for _, _, files in os.walk('./images'):
82
- for file in tqdm(files):
83
- if file.endswith('.jpg'):
84
- src_path = f"./images/{file}"
85
- dst_path = "./output/invalid"
86
- if detector.valid_img(src_path):
87
- dst_path = "./output/valid"
88
-
89
- shutil.move(src_path, dst_path)
90
-
91
-
92
- if __name__ == "__main__":
93
- filter_img()
human_detect.py DELETED
@@ -1,38 +0,0 @@
1
- import torch
2
- import torchvision.transforms as transforms
3
- from PIL import Image
4
- from torchvision.models.detection import fasterrcnn_resnet50_fpn
5
-
6
-
7
- def has_person(image_path):
8
- # Load the pretrained Faster R-CNN model
9
- model = fasterrcnn_resnet50_fpn(pretrained=True)
10
- model.eval()
11
-
12
- # Load and preprocess the image
13
- img = Image.open(image_path)
14
- transform = transforms.Compose([transforms.ToTensor()])
15
- input_tensor = transform(img)
16
- input_batch = input_tensor.unsqueeze(0)
17
-
18
- # Run model inference
19
- with torch.no_grad():
20
- output = model(input_batch)
21
-
22
- # Parse the model outputs
23
- labels = output[0]['labels'].numpy()
24
- scores = output[0]['scores'].numpy()
25
-
26
- # Check whether a person was detected (label == 1 is the "person" class)
27
- person_detected = any(label == 1 and score >
28
- 0.5 for label, score in zip(labels, scores))
29
-
30
- return person_detected
31
-
32
-
33
- if __name__ == "__main__":
34
- image_path = './images/test.jpg'
35
- if has_person(image_path):
36
- print("图片中检测到人体。")
37
- else:
38
- print("图片中没有检测到人体。")
item2pic.py DELETED
@@ -1,173 +0,0 @@
1
- import json
2
- import requests
3
- from bs4 import BeautifulSoup
4
- from selenium import webdriver
5
- from tqdm import tqdm
6
- from utils import *
7
-
8
- DEBUG_MODE = False
9
-
10
-
11
- def add_to_failist(urlog, file_path="./output/failist.txt"):
12
- with open(file_path, 'a', encoding='utf-8') as file:
13
- file.write(urlog + "\n")
14
-
15
- if DEBUG_MODE:
16
- print(urlog)
17
-
18
-
19
- def download_image(img_dir='./images'):
20
- create_dir(img_dir)
21
- image_urls = load_urls()
22
- print('Downloading images...')
23
- trytime = 0
24
- while len(image_urls) > 0:
25
- failist = []
26
- for img in tqdm(image_urls):
27
- sleeps(0.5 + 0.1 * trytime, 1.0 + 0.1 * trytime)
28
- response = requests.get(img['url'], stream=True)
29
-
30
- if response.status_code == 200:
31
- # Build the image filename from the URL
32
- image_filename = f'{img_dir}/{img["pid"]}_{img["url"].split("/")[-1]}'
33
-
34
- # Open the file in binary write mode and write the image data
35
- with open(image_filename, 'wb') as file:
36
- for chunk in response.iter_content(chunk_size=8192):
37
- file.write(chunk)
38
-
39
- if DEBUG_MODE:
40
- print(f"{image_filename} 下载完成!")
41
-
42
- elif response.status_code == 420:
43
- failist.append(img)
44
-
45
- else:
46
- add_to_failist(
47
- f"下载 {img['url']} 失败: HTTP 错误码 {response.status_code}")
48
-
49
- trytime += 1
50
- print(
51
- f'[{len(failist)} / {len(image_urls)}] images failed to download in attempt [{trytime}].')
52
- image_urls = failist
53
-
54
- print('Download complete!')
55
-
56
-
57
- def fix_url(link):
58
- tmp_url = link.get('src')
59
-
60
- if tmp_url[:2] == '//':
61
- tmp_url = 'https:' + tmp_url
62
-
63
- if '.png_' in tmp_url:
64
- tmp_url = tmp_url.split('.png_')[0] + '.png'
65
-
66
- elif '.gif_' in tmp_url:
67
- tmp_url = tmp_url.split('.gif_')[0] + '.gif'
68
-
69
- else:
70
- tmp_url = tmp_url.split('.jpg_')[0] + '.jpg'
71
-
72
- return tmp_url
73
-
74
-
75
- def get_pics(id):
76
- sleeps(1.0, 1.5)
77
- # selenium
78
- option = webdriver.ChromeOptions()
79
- option.add_experimental_option('excludeSwitches', ['enable-automation'])
80
- option.add_argument("--disable-blink-features=AutomationControlled")
81
- # option.add_argument('--headless')
82
- browser = webdriver.Chrome(options=option)
83
- browser.get(f'https://www.taobao.com/list/item/{id}.htm')
84
- # browser.minimize_window()
85
- browser.maximize_window()
86
-
87
- skip_captcha()
88
-
89
- # bs4
90
- soup = BeautifulSoup(browser.page_source, 'html.parser')
91
- srcs = set()
92
-
93
- try:
94
- for link in soup.find_all('img', class_='item-thumbnail'):
95
- srcs.add(fix_url(link))
96
-
97
- for link in soup.find_all('img', class_='property-img'):
98
- srcs.add(fix_url(link))
99
-
100
- for link in soup.find('div', class_='detail-content').find('p').find_all('img'):
101
- srcs.add(fix_url(link))
102
-
103
- except Exception as err:
104
- print("Error: ", err)
105
-
106
- return srcs
107
-
108
-
109
- def load_items(items_jsonl_path='./output/items.jsonl'):
110
- ids = []
111
- with open(items_jsonl_path, 'r', encoding='utf-8') as items_jsonl:
112
- for line in items_jsonl:
113
- # Parse the JSON string into a Python object
114
- data = json.loads(line)
115
- # Get the value of the 'id' key and append it to the list
116
- id_value = data.get('id')
117
- if id_value is not None:
118
- ids.append(id_value)
119
-
120
- return ids
121
-
122
-
123
- def get_img_urls(ids, images_jsonl_path="./output/images.jsonl"):
124
- for id in ids:
125
- urls = get_pics(id)
126
- with open(images_jsonl_path, 'a', encoding='utf-8') as images_jsonl:
127
- for url in urls:
128
- img = {
129
- 'url': url,
130
- 'pid': id
131
- }
132
- json.dump(img, images_jsonl)
133
- images_jsonl.write('\n')
134
-
135
-
136
- def load_urls(images_jsonl_path="./output/images.jsonl"):
137
- urls = []
138
- with open(images_jsonl_path, 'r', encoding='utf-8') as items_jsonl:
139
- for line in items_jsonl:
140
- # Parse the JSON string into a Python object
141
- data = json.loads(line)
142
- tmp_dict = {
143
- 'url': data.get('url'),
144
- 'pid': data.get('pid')
145
- }
146
- if tmp_dict is not None:
147
- urls.append(tmp_dict)
148
-
149
- return urls
150
-
151
-
152
- def item_to_pic():
153
- create_dir('./images')
154
- ids = load_items()
155
- get_img_urls(ids)
156
- rm_duplicates_by_key(
157
- jsonl_path='./output/images.jsonl',
158
- key_to_check='url',
159
- failist_path='./output/duplicate_img.txt'
160
- )
161
- download_image()
162
-
163
-
164
- if __name__ == "__main__":
165
- # create_dir('./images')
166
- # ids = load_items()
167
- # get_img_urls(ids)
168
- # rm_duplicates_by_key(
169
- # jsonl_path='./output/images.jsonl',
170
- # key_to_check='url',
171
- # failist_path='./output/duplicate_img.txt'
172
- # )
173
- download_image()
main.py DELETED
@@ -1,9 +0,0 @@
1
- from product2item import product_to_items
2
- from item2pic import item_to_pic
3
- from gender_age import filter_img
4
-
5
-
6
- if __name__ == "__main__":
7
- product_to_items()
8
- item_to_pic()
9
- filter_img()
mivolo/data/data_reader.py DELETED
@@ -1,125 +0,0 @@
1
- import os
2
- from collections import defaultdict
3
- from dataclasses import dataclass, field
4
- from enum import Enum
5
- from typing import Dict, List, Optional, Tuple
6
-
7
- import pandas as pd
8
-
9
- IMAGES_EXT: Tuple = (".jpeg", ".jpg", ".png", ".webp", ".bmp", ".gif")
10
- VIDEO_EXT: Tuple = (".mp4", ".avi", ".mov", ".mkv", ".webm")
11
-
12
-
13
- @dataclass
14
- class PictureInfo:
15
- image_path: str
16
- age: Optional[str]  # age, or an age range in "start;end" format, or "-1"
17
- gender: Optional[str]  # "M" or "F" or "-1"
18
- bbox: List[int] = field(default_factory=lambda: [-1, -1, -1, -1]) # face bbox: xyxy
19
- person_bbox: List[int] = field(default_factory=lambda: [-1, -1, -1, -1]) # person bbox: xyxy
20
-
21
- @property
22
- def has_person_bbox(self) -> bool:
23
- return any(coord != -1 for coord in self.person_bbox)
24
-
25
- @property
26
- def has_face_bbox(self) -> bool:
27
- return any(coord != -1 for coord in self.bbox)
28
-
29
- def has_gt(self, only_age: bool = False) -> bool:
30
- if only_age:
31
- return self.age != "-1"
32
- else:
33
- return not (self.age == "-1" and self.gender == "-1")
34
-
35
- def clear_person_bbox(self):
36
- self.person_bbox = [-1, -1, -1, -1]
37
-
38
- def clear_face_bbox(self):
39
- self.bbox = [-1, -1, -1, -1]
40
-
41
-
42
- class AnnotType(Enum):
43
- ORIGINAL = "original"
44
- PERSONS = "persons"
45
- NONE = "none"
46
-
47
- @classmethod
48
- def _missing_(cls, value):
49
- print(f"WARN: Unknown annotation type {value}.")
50
- return AnnotType.NONE
51
-
52
-
53
- def get_all_files(path: str, extensions: Tuple = IMAGES_EXT):
54
- files_all = []
55
- for root, subFolders, files in os.walk(path):
56
- for name in files:
57
- # skip Linux ".directory" entries, which are still regular files
58
- if "directory" not in name and sum([ext.lower() in name.lower() for ext in extensions]) > 0:
59
- files_all.append(os.path.join(root, name))
60
- return files_all
61
-
62
-
63
- class InputType(Enum):
64
- Image = 0
65
- Video = 1
66
- VideoStream = 2
67
-
68
-
69
- def get_input_type(input_path: str) -> InputType:
70
- if os.path.isdir(input_path):
71
- print("Input is a folder, only images will be processed")
72
- return InputType.Image
73
- elif os.path.isfile(input_path):
74
- if input_path.endswith(VIDEO_EXT):
75
- return InputType.Video
76
- if input_path.endswith(IMAGES_EXT):
77
- return InputType.Image
78
- else:
79
- raise ValueError(
80
- f"Unknown or unsupported input file format {input_path}, \
81
- supported video formats: {VIDEO_EXT}, \
82
- supported image formats: {IMAGES_EXT}"
83
- )
84
- elif input_path.startswith("http") and not input_path.endswith(IMAGES_EXT):
85
- return InputType.VideoStream
86
- else:
87
- raise ValueError(f"Unknown input {input_path}")
88
-
89
-
90
- def read_csv_annotation_file(annotation_file: str, images_dir: str, ignore_without_gt=False):
91
- bboxes_per_image: Dict[str, List[PictureInfo]] = defaultdict(list)
92
-
93
- df = pd.read_csv(annotation_file, sep=",")
94
-
95
- annot_type = AnnotType("persons") if "person_x0" in df.columns else AnnotType("original")
96
- print(f"Reading {annotation_file} (type: {annot_type})...")
97
-
98
- missing_images = 0
99
- for index, row in df.iterrows():
100
- img_path = os.path.join(images_dir, row["img_name"])
101
- if not os.path.exists(img_path):
102
- missing_images += 1
103
- continue
104
-
105
- face_x1, face_y1, face_x2, face_y2 = row["face_x0"], row["face_y0"], row["face_x1"], row["face_y1"]
106
- age, gender = str(row["age"]), str(row["gender"])
107
-
108
- if ignore_without_gt and (age == "-1" or gender == "-1"):
109
- continue
110
-
111
- if annot_type == AnnotType.PERSONS:
112
- p_x1, p_y1, p_x2, p_y2 = row["person_x0"], row["person_y0"], row["person_x1"], row["person_y1"]
113
- person_bbox = list(map(int, [p_x1, p_y1, p_x2, p_y2]))
114
- else:
115
- person_bbox = [-1, -1, -1, -1]
116
-
117
- bbox = list(map(int, [face_x1, face_y1, face_x2, face_y2]))
118
- pic_info = PictureInfo(img_path, age, gender, bbox, person_bbox)
119
- assert isinstance(pic_info.person_bbox, list)
120
-
121
- bboxes_per_image[img_path].append(pic_info)
122
-
123
- if missing_images > 0:
124
- print(f"WARNING: Missing images: {missing_images}/{len(df)}")
125
- return bboxes_per_image, annot_type
mivolo/data/dataset/__init__.py DELETED
@@ -1,64 +0,0 @@
1
- from typing import Tuple
2
-
3
- import torch
4
- from mivolo.model.mi_volo import MiVOLO
5
-
6
- from .age_gender_dataset import AgeGenderDataset
7
- from .age_gender_loader import create_loader
8
- from .classification_dataset import AdienceDataset, FairFaceDataset
9
-
10
- DATASET_CLASS_MAP = {
11
- "utk": AgeGenderDataset,
12
- "lagenda": AgeGenderDataset,
13
- "imdb": AgeGenderDataset,
14
- "adience": AdienceDataset,
15
- "fairface": FairFaceDataset,
16
- }
17
-
18
-
19
- def build(
20
- name: str,
21
- images_path: str,
22
- annotations_path: str,
23
- split: str,
24
- mivolo_model: MiVOLO,
25
- workers: int,
26
- batch_size: int,
27
- ) -> Tuple[torch.utils.data.Dataset, torch.utils.data.DataLoader]:
28
-
29
- dataset_class = DATASET_CLASS_MAP[name]
30
-
31
- dataset: torch.utils.data.Dataset = dataset_class(
32
- images_path=images_path,
33
- annotations_path=annotations_path,
34
- name=name,
35
- split=split,
36
- target_size=mivolo_model.input_size,
37
- max_age=mivolo_model.meta.max_age,
38
- min_age=mivolo_model.meta.min_age,
39
- model_with_persons=mivolo_model.meta.with_persons_model,
40
- use_persons=mivolo_model.meta.use_persons,
41
- disable_faces=mivolo_model.meta.disable_faces,
42
- only_age=mivolo_model.meta.only_age,
43
- )
44
-
45
- data_config = mivolo_model.data_config
46
-
47
- in_chans = 3 if not mivolo_model.meta.with_persons_model else 6
48
- input_size = (in_chans, mivolo_model.input_size, mivolo_model.input_size)
49
-
50
- dataset_loader: torch.utils.data.DataLoader = create_loader(
51
- dataset,
52
- input_size=input_size,
53
- batch_size=batch_size,
54
- mean=data_config["mean"],
55
- std=data_config["std"],
56
- num_workers=workers,
57
- crop_pct=data_config["crop_pct"],
58
- crop_mode=data_config["crop_mode"],
59
- pin_memory=False,
60
- device=mivolo_model.device,
61
- target_type=dataset.target_dtype,
62
- )
63
-
64
- return dataset, dataset_loader
mivolo/data/dataset/age_gender_dataset.py DELETED
@@ -1,194 +0,0 @@
1
- import logging
2
- from typing import Any, List, Optional, Set
3
-
4
- import cv2
5
- import numpy as np
6
- import torch
7
- from mivolo.data.dataset.reader_age_gender import ReaderAgeGender
8
- from PIL import Image
9
- from torchvision import transforms
10
-
11
- _logger = logging.getLogger("AgeGenderDataset")
12
-
13
-
14
- class AgeGenderDataset(torch.utils.data.Dataset):
15
- def __init__(
16
- self,
17
- images_path,
18
- annotations_path,
19
- name=None,
20
- split="train",
21
- load_bytes=False,
22
- img_mode="RGB",
23
- transform=None,
24
- is_training=False,
25
- seed=1234,
26
- target_size=224,
27
- min_age=None,
28
- max_age=None,
29
- model_with_persons=False,
30
- use_persons=False,
31
- disable_faces=False,
32
- only_age=False,
33
- ):
34
- reader = ReaderAgeGender(
35
- images_path,
36
- annotations_path,
37
- split=split,
38
- seed=seed,
39
- target_size=target_size,
40
- with_persons=use_persons,
41
- disable_faces=disable_faces,
42
- only_age=only_age,
43
- )
44
-
45
- self.name = name
46
- self.model_with_persons = model_with_persons
47
- self.reader = reader
48
- self.load_bytes = load_bytes
49
- self.img_mode = img_mode
50
- self.transform = transform
51
- self._consecutive_errors = 0
52
- self.is_training = is_training
53
- self.random_flip = 0.0
54
-
55
- # Setting up classes.
56
- # If min and max ages are passed, use them so that validation gets the same preprocessing
57
- self.max_age: float = None
58
- self.min_age: float = None
59
- self.avg_age: float = None
60
- self.set_ages_min_max(min_age, max_age)
61
-
62
- self.genders = ["M", "F"]
63
- self.num_classes_gender = len(self.genders)
64
-
65
- self.age_classes: Optional[List[str]] = self.set_age_classes()
66
-
67
- self.num_classes_age = 1 if self.age_classes is None else len(self.age_classes)
68
- self.num_classes: int = self.num_classes_age + self.num_classes_gender
69
- self.target_dtype = torch.float32
70
-
71
- def set_age_classes(self) -> Optional[List[str]]:
72
- return None # for regression dataset
73
-
74
- def set_ages_min_max(self, min_age: Optional[float], max_age: Optional[float]):
75
-
76
- assert all(age is None for age in [min_age, max_age]) or all(
77
- age is not None for age in [min_age, max_age]
78
- ), "Both min and max age must be passed or none of them"
79
-
80
- if max_age is not None and min_age is not None:
81
- _logger.info(f"Received predefined min_age {min_age} and max_age {max_age}")
82
- self.max_age = max_age
83
- self.min_age = min_age
84
- else:
85
- # collect statistics from loaded dataset
86
- all_ages_set: Set[int] = set()
87
- for img_path, image_samples in self.reader._ann.items():
88
- for image_sample_info in image_samples:
89
- if image_sample_info.age == "-1":
90
- continue
91
- age = round(float(image_sample_info.age))
92
- all_ages_set.add(age)
93
-
94
- self.max_age = max(all_ages_set)
95
- self.min_age = min(all_ages_set)
96
-
97
- self.avg_age = (self.max_age + self.min_age) / 2.0
98
-
99
- def _norm_age(self, age):
100
- return (age - self.avg_age) / (self.max_age - self.min_age)
101
-
102
- def parse_gender(self, _gender: str) -> float:
103
- if _gender != "-1":
104
- gender = float(0 if _gender == "M" or _gender == "0" else 1)
105
- else:
106
- gender = -1
107
- return gender
108
-
109
- def parse_target(self, _age: str, gender: str) -> List[Any]:
110
- if _age != "-1":
111
- age = round(float(_age))
112
- age = self._norm_age(float(age))
113
- else:
114
- age = -1
115
-
116
- target: List[float] = [age, self.parse_gender(gender)]
117
- return target
118
-
119
- @property
120
- def transform(self):
121
- return self._transform
122
-
123
- @transform.setter
124
- def transform(self, transform):
125
- # Disable pretrained monkey-patched transforms
126
- if not transform:
127
- return
128
-
129
- _trans = []
130
- for trans in transform.transforms:
131
- if "Resize" in str(trans):
132
- continue
133
- if "Crop" in str(trans):
134
- continue
135
- _trans.append(trans)
136
- self._transform = transforms.Compose(_trans)
137
-
138
- def apply_tranforms(self, image: Optional[np.ndarray]) -> np.ndarray:
139
- if image is None:
140
- return None
141
-
142
- if self.transform is None:
143
- return image
144
-
145
- image = convert_to_pil(image, self.img_mode)
146
- for trans in self.transform.transforms:
147
- image = trans(image)
148
- return image
149
-
150
- def __getitem__(self, index):
151
- # get preprocessed face and person crops (np.ndarray)
152
- # resize + pad, for person crops: cut off other bboxes
153
- images, target = self.reader[index]
154
-
155
- target = self.parse_target(*target)
156
-
157
- if self.model_with_persons:
158
- face_image, person_image = images
159
- person_image: np.ndarray = self.apply_tranforms(person_image)
160
- else:
161
- face_image = images[0]
162
- person_image = None
163
-
164
- face_image: np.ndarray = self.apply_tranforms(face_image)
165
-
166
- if person_image is not None:
167
- img = np.concatenate([face_image, person_image], axis=0)
168
- else:
169
- img = face_image
170
-
171
- return img, target
172
-
173
- def __len__(self):
174
- return len(self.reader)
175
-
176
- def filename(self, index, basename=False, absolute=False):
177
- return self.reader.filename(index, basename, absolute)
178
-
179
- def filenames(self, basename=False, absolute=False):
180
- return self.reader.filenames(basename, absolute)
181
-
182
-
183
- def convert_to_pil(cv_im: Optional[np.ndarray], img_mode: str = "RGB") -> "Image":
184
- if cv_im is None:
185
- return None
186
-
187
- if img_mode == "RGB":
188
- cv_im = cv2.cvtColor(cv_im, cv2.COLOR_BGR2RGB)
189
- else:
190
- raise Exception("Incorrect image mode has been passed!")
191
-
192
- cv_im = np.ascontiguousarray(cv_im)
193
- pil_image = Image.fromarray(cv_im)
194
- return pil_image
mivolo/data/dataset/age_gender_loader.py DELETED
@@ -1,169 +0,0 @@
1
- """
2
- Code adapted from timm https://github.com/huggingface/pytorch-image-models
3
-
4
- Modifications and additions for mivolo by / Copyright 2023, Irina Tolstykh, Maxim Kuprashevich
5
- """
6
-
7
- import logging
8
- from contextlib import suppress
9
- from functools import partial
10
- from itertools import repeat
11
-
12
- import numpy as np
13
- import torch
14
- import torch.utils.data
15
- from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
16
- from timm.data.dataset import IterableImageDataset
17
- from timm.data.loader import PrefetchLoader, _worker_init
18
- from timm.data.transforms_factory import create_transform
19
-
20
- _logger = logging.getLogger(__name__)
21
-
22
-
23
- def fast_collate(batch, target_dtype=torch.uint8):
24
- """A fast collation function optimized for uint8 images (np array or torch) and target_dtype targets (labels)"""
25
- assert isinstance(batch[0], tuple)
26
- batch_size = len(batch)
27
- if isinstance(batch[0][0], np.ndarray):
28
- targets = torch.tensor([b[1] for b in batch], dtype=target_dtype)
29
- assert len(targets) == batch_size
30
- tensor = torch.zeros((batch_size, *batch[0][0].shape), dtype=torch.uint8)
31
- for i in range(batch_size):
32
- tensor[i] += torch.from_numpy(batch[i][0])
33
- return tensor, targets
34
- else:
35
- raise ValueError(f"Incorrect batch type: {type(batch[0][0])}")
36
-
37
-
38
- def adapt_to_chs(x, n):
39
- if not isinstance(x, (tuple, list)):
40
- x = tuple(repeat(x, n))
41
- elif len(x) != n:
42
- # doubled channels
43
- if len(x) * 2 == n:
44
- x = np.concatenate((x, x))
45
- _logger.warning(f"Pretrained mean/std different shape than model (doubled channels), using concat: {x}.")
46
- else:
47
- x_mean = np.mean(x).item()
48
- x = (x_mean,) * n
49
- _logger.warning(f"Pretrained mean/std different shape than model, using avg value {x}.")
50
- else:
51
- assert len(x) == n, "normalization stats must match image channels"
52
- return x
53
-
54
-
55
- class PrefetchLoaderForMultiInput(PrefetchLoader):
56
- def __init__(
57
- self,
58
- loader,
59
- mean=IMAGENET_DEFAULT_MEAN,
60
- std=IMAGENET_DEFAULT_STD,
61
- channels=3,
62
- device=torch.device("cuda"),
63
- img_dtype=torch.float32,
64
- ):
65
-
66
- mean = adapt_to_chs(mean, channels)
67
- std = adapt_to_chs(std, channels)
68
- normalization_shape = (1, channels, 1, 1)
69
-
70
- self.loader = loader
71
- self.device = device
72
- self.img_dtype = img_dtype
73
- self.mean = torch.tensor([x * 255 for x in mean], device=device, dtype=img_dtype).view(normalization_shape)
74
- self.std = torch.tensor([x * 255 for x in std], device=device, dtype=img_dtype).view(normalization_shape)
75
-
76
- self.is_cuda = torch.cuda.is_available() and device.type == "cuda"
77
-
78
- def __iter__(self):
79
- first = True
80
- if self.is_cuda:
81
- stream = torch.cuda.Stream()
82
- stream_context = partial(torch.cuda.stream, stream=stream)
83
- else:
84
- stream = None
85
- stream_context = suppress
86
-
87
- for next_input, next_target in self.loader:
88
-
89
- with stream_context():
90
- next_input = next_input.to(device=self.device, non_blocking=True)
91
- next_target = next_target.to(device=self.device, non_blocking=True)
92
- next_input = next_input.to(self.img_dtype).sub_(self.mean).div_(self.std)
93
-
94
- if not first:
95
- yield input, target # noqa: F823, F821
96
- else:
97
- first = False
98
-
99
- if stream is not None:
100
- torch.cuda.current_stream().wait_stream(stream)
101
-
102
- input = next_input
103
- target = next_target
104
-
105
- yield input, target
106
-
107
-
108
- def create_loader(
109
- dataset,
110
- input_size,
111
- batch_size,
112
- mean=IMAGENET_DEFAULT_MEAN,
113
- std=IMAGENET_DEFAULT_STD,
114
- num_workers=1,
115
- crop_pct=None,
116
- crop_mode=None,
117
- pin_memory=False,
118
- img_dtype=torch.float32,
119
- device=torch.device("cuda"),
120
- persistent_workers=True,
121
- worker_seeding="all",
122
- target_type=torch.int64,
123
- ):
124
-
125
- transform = create_transform(
126
- input_size,
127
- is_training=False,
128
- use_prefetcher=True,
129
- mean=mean,
130
- std=std,
131
- crop_pct=crop_pct,
132
- crop_mode=crop_mode,
133
- )
134
- dataset.transform = transform
135
-
136
- if isinstance(dataset, IterableImageDataset):
137
- # give Iterable datasets early knowledge of num_workers so that sample estimates
138
- # are correct before worker processes are launched
139
- dataset.set_loader_cfg(num_workers=num_workers)
140
- raise ValueError("Incorrect dataset type: IterableImageDataset")
141
-
142
- loader_class = torch.utils.data.DataLoader
143
- loader_args = dict(
144
- batch_size=batch_size,
145
- shuffle=False,
146
- num_workers=num_workers,
147
- sampler=None,
148
- collate_fn=lambda batch: fast_collate(batch, target_dtype=target_type),
149
- pin_memory=pin_memory,
150
- drop_last=False,
151
- worker_init_fn=partial(_worker_init, worker_seeding=worker_seeding),
152
- persistent_workers=persistent_workers,
153
- )
154
- try:
155
- loader = loader_class(dataset, **loader_args)
156
- except TypeError:
157
- loader_args.pop("persistent_workers") # only in Pytorch 1.7+
158
- loader = loader_class(dataset, **loader_args)
159
-
160
- loader = PrefetchLoaderForMultiInput(
161
- loader,
162
- mean=mean,
163
- std=std,
164
- channels=input_size[0],
165
- device=device,
166
- img_dtype=img_dtype,
167
- )
168
-
169
- return loader
mivolo/data/dataset/classification_dataset.py DELETED
@@ -1,48 +0,0 @@
1
- from typing import Any, List, Optional
2
-
3
- import torch
4
-
5
- from .age_gender_dataset import AgeGenderDataset
6
-
7
-
8
- class ClassificationDataset(AgeGenderDataset):
9
- def __init__(self, *args, **kwargs):
10
- super().__init__(*args, **kwargs)
11
-
12
- self.target_dtype = torch.int32
13
-
14
- def set_age_classes(self) -> Optional[List[str]]:
15
- raise NotImplementedError
16
-
17
- def parse_target(self, age: str, gender: str) -> List[Any]:
18
- assert self.age_classes is not None
19
- if age != "-1":
20
- assert age in self.age_classes, f"Unknown category in {self.name} dataset: {age}"
21
- age_ind = self.age_classes.index(age)
22
- else:
23
- age_ind = -1
24
-
25
- target: List[int] = [age_ind, int(self.parse_gender(gender))]
26
- return target
27
-
28
-
29
- class FairFaceDataset(ClassificationDataset):
30
- def set_age_classes(self) -> Optional[List[str]]:
31
- age_classes = ["0;2", "3;9", "10;19", "20;29", "30;39", "40;49", "50;59", "60;69", "70;120"]
32
- # a[i-1] <= v < a[i] => age_classes[i-1]
33
- self._intervals = torch.tensor([0, 3, 10, 20, 30, 40, 50, 60, 70])
34
-
35
- return age_classes
36
-
37
-
38
- class AdienceDataset(ClassificationDataset):
39
- def __init__(self, *args, **kwargs):
40
- super().__init__(*args, **kwargs)
41
-
42
- self.target_dtype = torch.int32
43
-
44
- def set_age_classes(self) -> Optional[List[str]]:
45
- age_classes = ["0;2", "4;6", "8;12", "15;20", "25;32", "38;43", "48;53", "60;100"]
46
- # a[i-1] <= v < a[i] => age_classes[i-1]
47
- self._intervals = torch.tensor([0, 4, 7, 14, 24, 36, 46, 57])
48
- return age_classes
mivolo/data/dataset/reader_age_gender.py DELETED
@@ -1,490 +0,0 @@
1
- import logging
2
- import os
3
- from functools import partial
4
- from multiprocessing.pool import ThreadPool
5
- from typing import Dict, List, Optional, Tuple
6
-
7
- import cv2
8
- import numpy as np
9
- from mivolo.data.data_reader import AnnotType, PictureInfo, get_all_files, read_csv_annotation_file
10
- from mivolo.data.misc import IOU, class_letterbox, cropout_black_parts
11
- from timm.data.readers.reader import Reader
12
- from tqdm import tqdm
13
-
14
- CROP_ROUND_TOL = 0.3
15
- MIN_PERSON_SIZE = 100
16
- MIN_PERSON_CROP_AFTERCUT_RATIO = 0.4
17
-
18
- _logger = logging.getLogger("ReaderAgeGender")
19
-
20
-
21
- class ReaderAgeGender(Reader):
22
- """
23
- Reader for almost original imdb-wiki cleaned dataset.
24
- Two changes:
25
- 1. Your annotation must be in ./annotation subdir of dataset root
26
- 2. Images must be in images subdir
27
-
28
- """
29
-
30
- def __init__(
31
- self,
32
- images_path,
33
- annotations_path,
34
- split="validation",
35
- target_size=224,
36
- min_size=5,
37
- seed=1234,
38
- with_persons=False,
39
- min_person_size=MIN_PERSON_SIZE,
40
- disable_faces=False,
41
- only_age=False,
42
- min_person_aftercut_ratio=MIN_PERSON_CROP_AFTERCUT_RATIO,
43
- crop_round_tol=CROP_ROUND_TOL,
44
- ):
45
- super().__init__()
46
-
47
- self.with_persons = with_persons
48
- self.disable_faces = disable_faces
49
- self.only_age = only_age
50
-
51
- # can be only black for now, even though it's not very good with further normalization
52
- self.crop_out_color = (0, 0, 0)
53
-
54
- self.empty_crop = np.ones((target_size, target_size, 3)) * self.crop_out_color
55
- self.empty_crop = self.empty_crop.astype(np.uint8)
56
-
57
- self.min_person_size = min_person_size
58
- self.min_person_aftercut_ratio = min_person_aftercut_ratio
59
- self.crop_round_tol = crop_round_tol
60
-
61
- self.split = split
62
- self.min_size = min_size
63
- self.seed = seed
64
- self.target_size = target_size
65
-
66
- # Reading annotations. There can be multiple files if annotations_path is a directory
67
- self._ann: Dict[str, List[PictureInfo]] = {} # list of samples for each image
68
- self._associated_objects: Dict[str, Dict[int, List[List[int]]]] = {}
69
- self._faces_list: List[Tuple[str, int]] = [] # samples from this list will be loaded in __getitem__
70
-
71
- self._read_annotations(images_path, annotations_path)
72
- _logger.info(f"Dataset length: {len(self._faces_list)} crops")
73
-
74
- def __getitem__(self, index):
75
- return self._read_img_and_label(index)
76
-
77
- def __len__(self):
78
- return len(self._faces_list)
79
-
80
- def _filename(self, index, basename=False, absolute=False):
81
- img_p = self._faces_list[index][0]
82
- return os.path.basename(img_p) if basename else img_p
83
-
84
- def _read_annotations(self, images_path, csvs_path):
85
- self._ann = {}
86
- self._faces_list = []
87
- self._associated_objects = {}
88
-
89
- csvs = get_all_files(csvs_path, [".csv"])
90
- csvs = [c for c in csvs if self.split in os.path.basename(c)]
91
-
92
- # load annotations per image
93
- for csv in csvs:
94
- db, ann_type = read_csv_annotation_file(csv, images_path)
95
- if self.with_persons and ann_type != AnnotType.PERSONS:
96
- raise ValueError(
97
- f"Annotation type in file {csv} contains no persons, "
98
- f"but annotations with persons are requested."
99
- )
100
- self._ann.update(db)
101
-
102
- if len(self._ann) == 0:
103
- raise ValueError("Annotations are empty!")
104
-
105
- self._ann, self._associated_objects = self.prepare_annotations()
106
- images_list = list(self._ann.keys())
107
-
108
- for img_path in images_list:
109
- for index, image_sample_info in enumerate(self._ann[img_path]):
110
- assert image_sample_info.has_gt(
111
- self.only_age
112
- ), "Annotations must be checked with self.prepare_annotations() func"
113
- self._faces_list.append((img_path, index))
114
-
115
- def _read_img_and_label(self, index):
116
- if not isinstance(index, int):
117
- raise TypeError("ReaderAgeGender expected index to be integer")
118
-
119
- img_p, face_index = self._faces_list[index]
120
- ann: PictureInfo = self._ann[img_p][face_index]
121
- img = cv2.imread(img_p)
122
-
123
- face_empty = True
124
- if ann.has_face_bbox and not (self.with_persons and self.disable_faces):
125
- face_crop, face_empty = self._get_crop(ann.bbox, img)
126
-
127
- if not self.with_persons and face_empty:
128
- # model without persons
129
- raise ValueError("Annotations must be checked with self.prepare_annotations() func")
130
-
131
- if face_empty:
132
- face_crop = self.empty_crop
133
-
134
- person_empty = True
135
- if self.with_persons or self.disable_faces:
136
- if ann.has_person_bbox:
137
- # cut off all associated objects from person crop
138
- objects = self._associated_objects[img_p][face_index]
139
- person_crop, person_empty = self._get_crop(
140
- ann.person_bbox,
141
- img,
142
- crop_out_color=self.crop_out_color,
143
- asced_objects=objects,
144
- )
145
-
146
- if face_empty and person_empty:
147
- raise ValueError("Annotations must be checked with self.prepare_annotations() func")
148
-
149
- if person_empty:
150
- person_crop = self.empty_crop
151
-
152
- return (face_crop, person_crop), [ann.age, ann.gender]
153
-
154
- def _get_crop(
155
- self,
156
- bbox,
157
- img,
158
- asced_objects=None,
159
- crop_out_color=(0, 0, 0),
160
- ) -> Tuple[np.ndarray, bool]:
161
-
162
- empty_bbox = False
163
-
164
- xmin, ymin, xmax, ymax = bbox
165
- assert not (
166
- ymax - ymin < self.min_size or xmax - xmin < self.min_size
167
- ), "Annotations must be checked with self.prepare_annotations() func"
168
-
169
- crop = img[ymin:ymax, xmin:xmax]
170
-
171
- if asced_objects:
172
- # cut off other objects for person crop
173
- crop, empty_bbox = _cropout_asced_objs(
174
- asced_objects,
175
- bbox,
176
- crop.copy(),
177
- crop_out_color=crop_out_color,
178
- min_person_size=self.min_person_size,
179
- crop_round_tol=self.crop_round_tol,
180
- min_person_aftercut_ratio=self.min_person_aftercut_ratio,
181
- )
182
- if empty_bbox:
183
- crop = self.empty_crop
184
-
185
- crop = class_letterbox(crop, new_shape=(self.target_size, self.target_size), color=crop_out_color)
186
- return crop, empty_bbox
187
-
188
- def prepare_annotations(self):
189
-
190
- good_anns: Dict[str, List[PictureInfo]] = {}
191
- all_associated_objects: Dict[str, Dict[int, List[List[int]]]] = {}
192
-
193
- if not self.with_persons:
194
- # remove all persons
195
- for img_path, bboxes in self._ann.items():
196
- for sample in bboxes:
197
- sample.clear_person_bbox()
198
-
199
- # check dataset and collect associated_objects
200
- verify_images_func = partial(
201
- verify_images,
202
- min_size=self.min_size,
203
- min_person_size=self.min_person_size,
204
- with_persons=self.with_persons,
205
- disable_faces=self.disable_faces,
206
- crop_round_tol=self.crop_round_tol,
207
- min_person_aftercut_ratio=self.min_person_aftercut_ratio,
208
- only_age=self.only_age,
209
- )
210
- num_threads = min(8, os.cpu_count())
211
-
212
- all_msgs = []
213
- broken = 0
214
- skipped = 0
215
- all_skipped_crops = 0
216
- desc = "Check annotations..."
217
- with ThreadPool(num_threads) as pool:
218
- pbar = tqdm(
219
- pool.imap_unordered(verify_images_func, list(self._ann.items())),
220
- desc=desc,
221
- total=len(self._ann),
222
- )
223
-
224
- for (img_info, associated_objects, msgs, is_corrupted, is_empty_annotations, skipped_crops) in pbar:
225
- broken += 1 if is_corrupted else 0
226
- all_msgs.extend(msgs)
227
- all_skipped_crops += skipped_crops
228
- skipped += 1 if is_empty_annotations else 0
229
- if img_info is not None:
230
- img_path, img_samples = img_info
231
- good_anns[img_path] = img_samples
232
- all_associated_objects.update({img_path: associated_objects})
233
-
234
- pbar.desc = (
235
- f"{desc} {skipped} images skipped ({all_skipped_crops} crops are incorrect); "
236
- f"{broken} images corrupted"
237
- )
238
-
239
- pbar.close()
240
-
241
- for msg in all_msgs:
242
- print(msg)
243
- print(f"\nLeft images: {len(good_anns)}")
244
-
245
- return good_anns, all_associated_objects
246
-
247
-
248
- def verify_images(
249
- img_info,
250
- min_size: int,
251
- min_person_size: int,
252
- with_persons: bool,
253
- disable_faces: bool,
254
- crop_round_tol: float,
255
- min_person_aftercut_ratio: float,
256
- only_age: bool,
257
- ):
258
- # If crop is too small, if image can not be read or if image does not exist
259
- # then filter out this sample
260
-
261
- disable_faces = disable_faces and with_persons
262
- kwargs = dict(
263
- min_person_size=min_person_size,
264
- disable_faces=disable_faces,
265
- with_persons=with_persons,
266
- crop_round_tol=crop_round_tol,
267
- min_person_aftercut_ratio=min_person_aftercut_ratio,
268
- only_age=only_age,
269
- )
270
-
271
- def bbox_correct(bbox, min_size, im_h, im_w) -> Tuple[bool, List[int]]:
272
- ymin, ymax, xmin, xmax = _correct_bbox(bbox, im_h, im_w)
273
- crop_h, crop_w = ymax - ymin, xmax - xmin
274
- if crop_h < min_size or crop_w < min_size:
275
- return False, [-1, -1, -1, -1]
276
- bbox = [xmin, ymin, xmax, ymax]
277
- return True, bbox
278
-
279
- msgs = []
280
- skipped_crops = 0
281
- is_corrupted = False
282
- is_empty_annotations = False
283
-
284
- img_path: str = img_info[0]
285
- img_samples: List[PictureInfo] = img_info[1]
286
- try:
287
- im_cv = cv2.imread(img_path)
288
- im_h, im_w = im_cv.shape[:2]
289
- except Exception:
290
- msgs.append(f"Can not load image {img_path}")
291
- is_corrupted = True
292
- return None, {}, msgs, is_corrupted, is_empty_annotations, skipped_crops
293
-
294
- out_samples: List[PictureInfo] = []
295
- for sample in img_samples:
296
- # correct face bbox
297
- if sample.has_face_bbox:
298
- is_correct, sample.bbox = bbox_correct(sample.bbox, min_size, im_h, im_w)
299
- if not is_correct and sample.has_gt(only_age):
300
- msgs.append("Small face. Passing..")
301
- skipped_crops += 1
302
-
303
- # correct person bbox
304
- if sample.has_person_bbox:
305
- is_correct, sample.person_bbox = bbox_correct(
306
- sample.person_bbox, max(min_person_size, min_size), im_h, im_w
307
- )
308
- if not is_correct and sample.has_gt(only_age):
309
- msgs.append(f"Small person {img_path}. Passing..")
310
- skipped_crops += 1
311
-
312
- if sample.has_face_bbox or sample.has_person_bbox:
313
- out_samples.append(sample)
314
- elif sample.has_gt(only_age):
315
- msgs.append("Sample has no face and no body. Passing..")
316
- skipped_crops += 1
317
-
318
- # sort so that samples with undefined age and gender come last
319
- out_samples = sorted(out_samples, key=lambda sample: 1 if not sample.has_gt(only_age) else 0)
320
-
321
- # for each person find other faces and persons bboxes, intersected with it
322
- associated_objects: Dict[int, List[List[int]]] = find_associated_objects(out_samples, only_age=only_age)
323
-
324
- out_samples, associated_objects, skipped_crops = filter_bad_samples(
325
- out_samples, associated_objects, im_cv, msgs, skipped_crops, **kwargs
326
- )
327
-
328
- out_img_info: Optional[Tuple[str, List]] = (img_path, out_samples)
329
- if len(out_samples) == 0:
330
- out_img_info = None
331
- is_empty_annotations = True
332
-
333
- return out_img_info, associated_objects, msgs, is_corrupted, is_empty_annotations, skipped_crops
334
-
335
-
336
- def filter_bad_samples(
337
- out_samples: List[PictureInfo],
338
- associated_objects: dict,
339
- im_cv: np.ndarray,
340
- msgs: List[str],
341
- skipped_crops: int,
342
- **kwargs,
343
- ):
344
- with_persons, disable_faces, min_person_size, crop_round_tol, min_person_aftercut_ratio, only_age = (
345
- kwargs["with_persons"],
346
- kwargs["disable_faces"],
347
- kwargs["min_person_size"],
348
- kwargs["crop_round_tol"],
349
- kwargs["min_person_aftercut_ratio"],
350
- kwargs["only_age"],
351
- )
352
-
353
- # keep only samples with annotations
354
- inds = [sample_ind for sample_ind, sample in enumerate(out_samples) if sample.has_gt(only_age)]
355
- out_samples, associated_objects = _filter_by_ind(out_samples, associated_objects, inds)
356
-
357
- if kwargs["disable_faces"]:
358
- # clear all faces
359
- for ind, sample in enumerate(out_samples):
360
- sample.clear_face_bbox()
361
-
362
- # left only samples with person_bbox
363
- inds = [sample_ind for sample_ind, sample in enumerate(out_samples) if sample.has_person_bbox]
364
- out_samples, associated_objects = _filter_by_ind(out_samples, associated_objects, inds)
365
-
366
- if with_persons or disable_faces:
367
- # check that the preprocessing func
368
- # _cropout_asced_objs() returns a non-empty person_image for each out sample
369
-
370
- inds = []
371
- for ind, sample in enumerate(out_samples):
372
- person_empty = True
373
- if sample.has_person_bbox:
374
- xmin, ymin, xmax, ymax = sample.person_bbox
375
- crop = im_cv[ymin:ymax, xmin:xmax]
376
- # cut off all associated objects from person crop
377
- _, person_empty = _cropout_asced_objs(
378
- associated_objects[ind],
379
- sample.person_bbox,
380
- crop.copy(),
381
- min_person_size=min_person_size,
382
- crop_round_tol=crop_round_tol,
383
- min_person_aftercut_ratio=min_person_aftercut_ratio,
384
- )
385
-
386
- if person_empty and not sample.has_face_bbox:
387
- msgs.append("Small person after preprocessing. Passing..")
388
- skipped_crops += 1
389
- else:
390
- inds.append(ind)
391
- out_samples, associated_objects = _filter_by_ind(out_samples, associated_objects, inds)
392
-
393
- assert len(associated_objects) == len(out_samples)
394
- return out_samples, associated_objects, skipped_crops
395
-
396
-
397
- def _filter_by_ind(out_samples, associated_objects, inds):
398
- _associated_objects = {}
399
- _out_samples = []
400
- for ind, sample in enumerate(out_samples):
401
- if ind in inds:
402
- _associated_objects[len(_out_samples)] = associated_objects[ind]
403
- _out_samples.append(sample)
404
-
405
- return _out_samples, _associated_objects
406
-
407
-
408
- def find_associated_objects(
409
- image_samples: List[PictureInfo], iou_thresh=0.0001, only_age=False
410
- ) -> Dict[int, List[List[int]]]:
411
- """
412
- For each person (which has gt age and gt gender) find other faces and persons bboxes, intersected with it
413
- """
414
- associated_objects: Dict[int, List[List[int]]] = {}
415
-
416
- for iindex, image_sample_info in enumerate(image_samples):
417
- # add own face
418
- associated_objects[iindex] = [image_sample_info.bbox] if image_sample_info.has_face_bbox else []
419
-
420
- if not image_sample_info.has_person_bbox or not image_sample_info.has_gt(only_age):
421
- # if the sample has no gt, it will not be used
422
- continue
423
-
424
- iperson_box = image_sample_info.person_bbox
425
- for jindex, other_image_sample in enumerate(image_samples):
426
- if iindex == jindex:
427
- continue
428
- if other_image_sample.has_face_bbox:
429
- jface_bbox = other_image_sample.bbox
430
- iou = _get_iou(jface_bbox, iperson_box)
431
- if iou >= iou_thresh:
432
- associated_objects[iindex].append(jface_bbox)
433
- if other_image_sample.has_person_bbox:
434
- jperson_bbox = other_image_sample.person_bbox
435
- iou = _get_iou(jperson_bbox, iperson_box)
436
- if iou >= iou_thresh:
437
- associated_objects[iindex].append(jperson_bbox)
438
-
439
- return associated_objects
440
-
441
-
442
- def _cropout_asced_objs(
443
- asced_objects,
444
- person_bbox,
445
- crop,
446
- min_person_size,
447
- crop_round_tol,
448
- min_person_aftercut_ratio,
449
- crop_out_color=(0, 0, 0),
450
- ):
451
- empty = False
452
- xmin, ymin, xmax, ymax = person_bbox
453
-
454
- for a_obj in asced_objects:
455
- aobj_xmin, aobj_ymin, aobj_xmax, aobj_ymax = a_obj
456
-
457
- aobj_ymin = int(max(aobj_ymin - ymin, 0))
458
- aobj_xmin = int(max(aobj_xmin - xmin, 0))
459
- aobj_ymax = int(min(aobj_ymax - ymin, ymax - ymin))
460
- aobj_xmax = int(min(aobj_xmax - xmin, xmax - xmin))
461
-
462
- crop[aobj_ymin:aobj_ymax, aobj_xmin:aobj_xmax] = crop_out_color
463
-
464
- crop, cropped_ratio = cropout_black_parts(crop, crop_round_tol)
465
- if (
466
- crop.shape[0] < min_person_size or crop.shape[1] < min_person_size
467
- ) or cropped_ratio < min_person_aftercut_ratio:
468
- crop = None
469
- empty = True
470
-
471
- return crop, empty
472
-
473
-
474
- def _correct_bbox(bbox, h, w):
475
- xmin, ymin, xmax, ymax = bbox
476
- ymin = min(max(ymin, 0), h)
477
- ymax = min(max(ymax, 0), h)
478
- xmin = min(max(xmin, 0), w)
479
- xmax = min(max(xmax, 0), w)
480
- return ymin, ymax, xmin, xmax
481
-
482
-
483
- def _get_iou(bbox1, bbox2):
484
- xmin1, ymin1, xmax1, ymax1 = bbox1
485
- xmin2, ymin2, xmax2, ymax2 = bbox2
486
- iou = IOU(
487
- [ymin1, xmin1, ymax1, xmax1],
488
- [ymin2, xmin2, ymax2, xmax2],
489
- )
490
- return iou
mivolo/data/misc.py DELETED
@@ -1,264 +0,0 @@
1
- import argparse
2
- import ast
3
- import re
4
- from typing import List, Optional, Tuple, Union
5
-
6
- import cv2
7
- import numpy as np
8
- import torch
9
- import torchvision.transforms.functional as F
10
- from scipy.optimize import linear_sum_assignment
11
- from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
12
-
13
- CROP_ROUND_RATE = 0.1
14
- MIN_PERSON_CROP_NONZERO = 0.5
15
-
16
-
17
- def aggregate_votes_winsorized(ages, max_age_dist=6):
18
- # Replace any annotation that is more than a max_age_dist away from the median
19
- # with median + max_age_dist if above, or median - max_age_dist if below
20
- median = np.median(ages)
21
- ages = np.clip(ages, median - max_age_dist, median + max_age_dist)
22
- return np.mean(ages)
23
-
24
-
25
- def cropout_black_parts(img, tol=0.3):
26
- # Create a binary mask of zero pixels
27
- zero_pixels_mask = np.all(img == 0, axis=2)
28
- # Calculate the threshold for zero pixels in rows and columns
29
- threshold = img.shape[0] - img.shape[0] * tol
30
- # Calculate row sums and column sums of zero pixels mask
31
- row_sums = np.sum(zero_pixels_mask, axis=1)
32
- col_sums = np.sum(zero_pixels_mask, axis=0)
33
- # Find the first and last rows with zero pixel sums above the threshold
34
- start_row = np.argmin(row_sums > threshold)
35
- end_row = img.shape[0] - np.argmin(row_sums[::-1] > threshold)
36
- # Find the first and last columns with zero pixel sums above the threshold
37
- start_col = np.argmin(col_sums > threshold)
38
- end_col = img.shape[1] - np.argmin(col_sums[::-1] > threshold)
39
- # Crop the image
40
- cropped_img = img[start_row:end_row, start_col:end_col, :]
41
- area = cropped_img.shape[0] * cropped_img.shape[1]
42
- area_orig = img.shape[0] * img.shape[1]
43
- return cropped_img, area / area_orig
44
-
45
-
46
- def natural_key(string_):
47
- """See http://www.codinghorror.com/blog/archives/001018.html"""
48
- return [int(s) if s.isdigit() else s for s in re.split(r"(\d+)", string_.lower())]
49
-
50
-
51
- def add_bool_arg(parser, name, default=False, help=""):
52
- dest_name = name.replace("-", "_")
53
- group = parser.add_mutually_exclusive_group(required=False)
54
- group.add_argument("--" + name, dest=dest_name, action="store_true", help=help)
55
- group.add_argument("--no-" + name, dest=dest_name, action="store_false", help=help)
56
- parser.set_defaults(**{dest_name: default})
57
-
58
-
59
- def cumulative_score(pred_ages, gt_ages, L, tol=1e-6):
60
- n = pred_ages.shape[0]
61
- num_correct = torch.sum(torch.abs(pred_ages - gt_ages) <= L + tol)
62
- cs_score = num_correct / n
63
- return cs_score
64
-
65
-
66
- def cumulative_error(pred_ages, gt_ages, L, tol=1e-6):
67
- n = pred_ages.shape[0]
68
- num_correct = torch.sum(torch.abs(pred_ages - gt_ages) >= L + tol)
69
- cs_score = num_correct / n
70
- return cs_score
71
-
72
-
73
- class ParseKwargs(argparse.Action):
74
- def __call__(self, parser, namespace, values, option_string=None):
75
- kw = {}
76
- for value in values:
77
- key, value = value.split("=")
78
- try:
79
- kw[key] = ast.literal_eval(value)
80
- except ValueError:
81
- kw[key] = str(value) # fallback to string (avoid need to escape on command line)
82
- setattr(namespace, self.dest, kw)
83
-
84
-
85
- def box_iou(box1, box2, over_second=False):
86
- """
87
- Return intersection-over-union (Jaccard index) of boxes.
88
- If over_second == True, return mean(intersection-over-union, (inter / area2))
89
-
90
- Both sets of boxes are expected to be in (x1, y1, x2, y2) format.
91
-
92
- Arguments:
93
- box1 (Tensor[N, 4])
94
- box2 (Tensor[M, 4])
95
- Returns:
96
- iou (Tensor[N, M]): the NxM matrix containing the pairwise
97
- IoU values for every element in boxes1 and boxes2
98
- """
99
-
100
- def box_area(box):
101
- # box = 4xn
102
- return (box[2] - box[0]) * (box[3] - box[1])
103
-
104
- area1 = box_area(box1.T)
105
- area2 = box_area(box2.T)
106
-
107
- # inter(N,M) = (rb(N,M,2) - lt(N,M,2)).clamp(0).prod(2)
108
- inter = (torch.min(box1[:, None, 2:], box2[:, 2:]) - torch.max(box1[:, None, :2], box2[:, :2])).clamp(0).prod(2)
109
-
110
- iou = inter / (area1[:, None] + area2 - inter) # iou = inter / (area1 + area2 - inter)
111
- if over_second:
112
- return (inter / area2 + iou) / 2 # mean(inter / area2, iou)
113
- else:
114
- return iou
115
-
116
-
117
- def split_batch(bs: int, dev: int) -> Tuple[int, int]:
118
- full_bs = (bs // dev) * dev
119
- part_bs = bs - full_bs
120
- return full_bs, part_bs
121
-
122
-
123
- def assign_faces(
124
- persons_bboxes: List[torch.tensor], faces_bboxes: List[torch.tensor], iou_thresh: float = 0.0001
125
- ) -> Tuple[List[Optional[int]], List[int]]:
126
- """
127
- Assign person to each face if it is possible.
128
- Return:
129
- - assigned_faces List[Optional[int]]: mapping of face_ind to person_ind
130
- ( assigned_faces[face_ind] = person_ind ). person_ind can be None
131
- - unassigned_persons_inds List[int]: persons indexes without any assigned face
132
- """
133
-
134
- assigned_faces: List[Optional[int]] = [None for _ in range(len(faces_bboxes))]
135
- unassigned_persons_inds: List[int] = [p_ind for p_ind in range(len(persons_bboxes))]
136
-
137
- if len(persons_bboxes) == 0 or len(faces_bboxes) == 0:
138
- return assigned_faces, unassigned_persons_inds
139
-
140
- cost_matrix = box_iou(torch.stack(persons_bboxes), torch.stack(faces_bboxes), over_second=True).cpu().numpy()
141
- persons_indexes, face_indexes = [], []
142
-
143
- if len(cost_matrix) > 0:
144
- persons_indexes, face_indexes = linear_sum_assignment(cost_matrix, maximize=True)
145
-
146
- matched_persons = set()
147
- for person_idx, face_idx in zip(persons_indexes, face_indexes):
148
- ciou = cost_matrix[person_idx][face_idx]
149
- if ciou > iou_thresh:
150
- if person_idx in matched_persons:
151
- # Person can not be assigned twice, in reality this should not happen
152
- continue
153
- assigned_faces[face_idx] = person_idx
154
- matched_persons.add(person_idx)
155
-
156
- unassigned_persons_inds = [p_ind for p_ind in range(len(persons_bboxes)) if p_ind not in matched_persons]
157
-
158
- return assigned_faces, unassigned_persons_inds
159
-
160
-
161
- def class_letterbox(im, new_shape=(640, 640), color=(0, 0, 0), scaleup=True):
162
- # Resize and pad image while meeting stride-multiple constraints
163
- shape = im.shape[:2] # current shape [height, width]
164
- if isinstance(new_shape, int):
165
- new_shape = (new_shape, new_shape)
166
-
167
- if im.shape[0] == new_shape[0] and im.shape[1] == new_shape[1]:
168
- return im
169
-
170
- # Scale ratio (new / old)
171
- r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
172
- if not scaleup: # only scale down, do not scale up (for better val mAP)
173
- r = min(r, 1.0)
174
-
175
- # Compute padding
176
- # ratio = r, r # width, height ratios
177
- new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
178
- dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding
179
-
180
- dw /= 2 # divide padding into 2 sides
181
- dh /= 2
182
-
183
- if shape[::-1] != new_unpad: # resize
184
- im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
185
- top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
186
- left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
187
- im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # add border
188
- return im
189
-
190
-
191
- def prepare_classification_images(
192
- img_list: List[Optional[np.ndarray]],
193
- target_size: int = 224,
194
- mean=IMAGENET_DEFAULT_MEAN,
195
- std=IMAGENET_DEFAULT_STD,
196
- device=None,
197
- ) -> torch.tensor:
198
-
199
- prepared_images: List[torch.tensor] = []
200
-
201
- for img in img_list:
202
- if img is None:
203
- img = torch.zeros((3, target_size, target_size), dtype=torch.float32)
204
- img = F.normalize(img, mean=mean, std=std)
205
- img = img.unsqueeze(0)
206
- prepared_images.append(img)
207
- continue
208
- img = class_letterbox(img, new_shape=(target_size, target_size))
209
- img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
210
-
211
- img = img / 255.0
212
- img = (img - mean) / std
213
- img = img.astype(dtype=np.float32)
214
-
215
- img = img.transpose((2, 0, 1))
216
- img = np.ascontiguousarray(img)
217
- img = torch.from_numpy(img)
218
- img = img.unsqueeze(0)
219
-
220
- prepared_images.append(img)
221
-
222
- prepared_input = torch.concat(prepared_images)
223
-
224
- if device:
225
- prepared_input = prepared_input.to(device)
226
-
227
- return prepared_input
228
-
229
-
230
- def IOU(bb1: Union[tuple, list], bb2: Union[tuple, list], norm_second_bbox: bool = False) -> float:
231
- # expects [ymin, xmin, ymax, xmax]; absolute or relative coordinates both work
232
- assert bb1[1] < bb1[3]
233
- assert bb1[0] < bb1[2]
234
- assert bb2[1] < bb2[3]
235
- assert bb2[0] < bb2[2]
236
-
237
- # determine the coordinates of the intersection rectangle
238
- x_left = max(bb1[1], bb2[1])
239
- y_top = max(bb1[0], bb2[0])
240
- x_right = min(bb1[3], bb2[3])
241
- y_bottom = min(bb1[2], bb2[2])
242
-
243
- if x_right < x_left or y_bottom < y_top:
244
- return 0.0
245
-
246
- # The intersection of two axis-aligned bounding boxes is always an
247
- # axis-aligned bounding box
248
- intersection_area = (x_right - x_left) * (y_bottom - y_top)
249
- # compute the area of both AABBs
250
- bb1_area = (bb1[3] - bb1[1]) * (bb1[2] - bb1[0])
251
- bb2_area = (bb2[3] - bb2[1]) * (bb2[2] - bb2[0])
252
- if not norm_second_bbox:
253
- # compute the intersection over union by taking the intersection
254
- # area and dividing it by the sum of prediction + ground-truth
255
- # areas minus the intersection area
256
- iou = intersection_area / float(bb1_area + bb2_area - intersection_area)
257
- else:
258
- # for cases where we check whether the second bbox lies inside the first one
259
- iou = intersection_area / float(bb2_area)
260
-
261
- assert iou >= 0.0
262
- assert iou <= 1.01
263
-
264
- return iou
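A minimal sketch of how the helpers above behave on dummy data. It assumes they live in mivolo/data/misc.py (the import path other modules in this diff use); the box coordinates and image size are illustrative values only.

import numpy as np
from mivolo.data.misc import IOU, class_letterbox, prepare_classification_images

person_box = (0, 0, 200, 100)   # [ymin, xmin, ymax, xmax]
face_box = (10, 30, 60, 70)
print(IOU(person_box, face_box))                          # plain IoU: 0.1
print(IOU(person_box, face_box, norm_second_bbox=True))   # fraction of the face inside the person box: 1.0

image = np.zeros((480, 640, 3), dtype=np.uint8)
padded = class_letterbox(image, new_shape=(224, 224))     # keeps aspect ratio, pads the rest
print(padded.shape)                                       # (224, 224, 3)

batch = prepare_classification_images([image, None], target_size=224)
print(batch.shape)                                        # torch.Size([2, 3, 224, 224]); None becomes a blank placeholder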
mivolo/model/create_timm_model.py DELETED
@@ -1,107 +0,0 @@
1
- """
2
- Code adapted from timm https://github.com/huggingface/pytorch-image-models
3
-
4
- Modifications and additions for mivolo by / Copyright 2023, Irina Tolstykh, Maxim Kuprashevich
5
- """
6
-
7
- import os
8
- from typing import Any, Dict, Optional, Union
9
-
10
- import timm
11
-
12
- # register new models
13
- from mivolo.model.mivolo_model import * # noqa: F403, F401
14
- from timm.layers import set_layer_config
15
- from timm.models._factory import parse_model_name
16
- from timm.models._helpers import load_state_dict, remap_checkpoint
17
- from timm.models._hub import load_model_config_from_hf
18
- from timm.models._pretrained import PretrainedCfg, split_model_name_tag
19
- from timm.models._registry import is_model, model_entrypoint
20
-
21
-
22
- def load_checkpoint(
23
- model, checkpoint_path, use_ema=True, strict=True, remap=False, filter_keys=None, state_dict_map=None
24
- ):
25
- if os.path.splitext(checkpoint_path)[-1].lower() in (".npz", ".npy"):
26
- # numpy checkpoint, try to load via model specific load_pretrained fn
27
- if hasattr(model, "load_pretrained"):
28
- timm.models._model_builder.load_pretrained(checkpoint_path)
29
- else:
30
- raise NotImplementedError("Model cannot load numpy checkpoint")
31
- return
32
- state_dict = load_state_dict(checkpoint_path, use_ema)
33
- if remap:
34
- state_dict = remap_checkpoint(model, state_dict)
35
- if filter_keys:
36
- for sd_key in list(state_dict.keys()):
37
- for filter_key in filter_keys:
38
- if filter_key in sd_key:
39
- if sd_key in state_dict:
40
- del state_dict[sd_key]
41
-
42
- rep = []
43
- if state_dict_map is not None:
44
- # 'patch_embed.conv1.' : 'patch_embed.conv.'
45
- for state_k in list(state_dict.keys()):
46
- for target_k, target_v in state_dict_map.items():
47
- if target_v in state_k:
48
- target_name = state_k.replace(target_v, target_k)
49
- state_dict[target_name] = state_dict[state_k]
50
- rep.append(state_k)
51
- for r in rep:
52
- if r in state_dict:
53
- del state_dict[r]
54
-
55
- incompatible_keys = model.load_state_dict(state_dict, strict=strict if filter_keys is None else False)
56
- return incompatible_keys
57
-
58
-
59
- def create_model(
60
- model_name: str,
61
- pretrained: bool = False,
62
- pretrained_cfg: Optional[Union[str, Dict[str, Any], PretrainedCfg]] = None,
63
- pretrained_cfg_overlay: Optional[Dict[str, Any]] = None,
64
- checkpoint_path: str = "",
65
- scriptable: Optional[bool] = None,
66
- exportable: Optional[bool] = None,
67
- no_jit: Optional[bool] = None,
68
- filter_keys=None,
69
- state_dict_map=None,
70
- **kwargs,
71
- ):
72
- """Create a model
73
- Lookup model's entrypoint function and pass relevant args to create a new model.
74
- """
75
- # Parameters that aren't supported by all models or are intended to only override model defaults if set
76
- # should default to None in command line args/cfg. Remove them if they are present and not set so that
77
- # non-supporting models don't break and default args remain in effect.
78
- kwargs = {k: v for k, v in kwargs.items() if v is not None}
79
-
80
- model_source, model_name = parse_model_name(model_name)
81
- if model_source == "hf-hub":
82
- assert not pretrained_cfg, "pretrained_cfg should not be set when sourcing model from Hugging Face Hub."
83
- # For model names specified in the form `hf-hub:path/architecture_name@revision`,
84
- # load model weights + pretrained_cfg from Hugging Face hub.
85
- pretrained_cfg, model_name = load_model_config_from_hf(model_name)
86
- else:
87
- model_name, pretrained_tag = split_model_name_tag(model_name)
88
- if not pretrained_cfg:
89
- # a valid pretrained_cfg argument takes priority over tag in model name
90
- pretrained_cfg = pretrained_tag
91
-
92
- if not is_model(model_name):
93
- raise RuntimeError("Unknown model (%s)" % model_name)
94
-
95
- create_fn = model_entrypoint(model_name)
96
- with set_layer_config(scriptable=scriptable, exportable=exportable, no_jit=no_jit):
97
- model = create_fn(
98
- pretrained=pretrained,
99
- pretrained_cfg=pretrained_cfg,
100
- pretrained_cfg_overlay=pretrained_cfg_overlay,
101
- **kwargs,
102
- )
103
-
104
- if checkpoint_path:
105
- load_checkpoint(model, checkpoint_path, filter_keys=filter_keys, state_dict_map=state_dict_map)
106
-
107
- return model
 
mivolo/model/cross_bottleneck_attn.py DELETED
@@ -1,116 +0,0 @@
1
- """
2
- Code based on timm https://github.com/huggingface/pytorch-image-models
3
-
4
- Modifications and additions for mivolo by / Copyright 2023, Irina Tolstykh, Maxim Kuprashevich
5
- """
6
-
7
- import torch
8
- import torch.nn as nn
9
- from timm.layers.bottleneck_attn import PosEmbedRel
10
- from timm.layers.helpers import make_divisible
11
- from timm.layers.mlp import Mlp
12
- from timm.layers.trace_utils import _assert
13
- from timm.layers.weight_init import trunc_normal_
14
-
15
-
16
- class CrossBottleneckAttn(nn.Module):
17
- def __init__(
18
- self,
19
- dim,
20
- dim_out=None,
21
- feat_size=None,
22
- stride=1,
23
- num_heads=4,
24
- dim_head=None,
25
- qk_ratio=1.0,
26
- qkv_bias=False,
27
- scale_pos_embed=False,
28
- ):
29
- super().__init__()
30
- assert feat_size is not None, "A concrete feature size matching expected input (H, W) is required"
31
- dim_out = dim_out or dim
32
- assert dim_out % num_heads == 0
33
-
34
- self.num_heads = num_heads
35
- self.dim_head_qk = dim_head or make_divisible(dim_out * qk_ratio, divisor=8) // num_heads
36
- self.dim_head_v = dim_out // self.num_heads
37
- self.dim_out_qk = num_heads * self.dim_head_qk
38
- self.dim_out_v = num_heads * self.dim_head_v
39
- self.scale = self.dim_head_qk**-0.5
40
- self.scale_pos_embed = scale_pos_embed
41
-
42
- self.qkv_f = nn.Conv2d(dim, self.dim_out_qk * 2 + self.dim_out_v, 1, bias=qkv_bias)
43
- self.qkv_p = nn.Conv2d(dim, self.dim_out_qk * 2 + self.dim_out_v, 1, bias=qkv_bias)
44
-
45
- # NOTE I'm only supporting relative pos embedding for now
46
- self.pos_embed = PosEmbedRel(feat_size, dim_head=self.dim_head_qk, scale=self.scale)
47
-
48
- self.norm = nn.LayerNorm([self.dim_out_v * 2, *feat_size])
49
- mlp_ratio = 4
50
- self.mlp = Mlp(
51
- in_features=self.dim_out_v * 2,
52
- hidden_features=int(dim * mlp_ratio),
53
- act_layer=nn.GELU,
54
- out_features=dim_out,
55
- drop=0,
56
- use_conv=True,
57
- )
58
-
59
- self.pool = nn.AvgPool2d(2, 2) if stride == 2 else nn.Identity()
60
- self.reset_parameters()
61
-
62
- def reset_parameters(self):
63
- trunc_normal_(self.qkv_f.weight, std=self.qkv_f.weight.shape[1] ** -0.5) # fan-in
64
- trunc_normal_(self.qkv_p.weight, std=self.qkv_p.weight.shape[1] ** -0.5) # fan-in
65
- trunc_normal_(self.pos_embed.height_rel, std=self.scale)
66
- trunc_normal_(self.pos_embed.width_rel, std=self.scale)
67
-
68
- def get_qkv(self, x, qvk_conv):
69
- B, C, H, W = x.shape
70
-
71
- x = qvk_conv(x) # B, (2 * dim_head_qk + dim_head_v) * num_heads, H, W
72
-
73
- q, k, v = torch.split(x, [self.dim_out_qk, self.dim_out_qk, self.dim_out_v], dim=1)
74
-
75
- q = q.reshape(B * self.num_heads, self.dim_head_qk, -1).transpose(-1, -2)
76
- k = k.reshape(B * self.num_heads, self.dim_head_qk, -1) # no transpose, for q @ k
77
- v = v.reshape(B * self.num_heads, self.dim_head_v, -1).transpose(-1, -2)
78
-
79
- return q, k, v
80
-
81
- def apply_attn(self, q, k, v, B, H, W, dropout=None):
82
- if self.scale_pos_embed:
83
- attn = (q @ k + self.pos_embed(q)) * self.scale # B * num_heads, H * W, H * W
84
- else:
85
- attn = (q @ k) * self.scale + self.pos_embed(q)
86
- attn = attn.softmax(dim=-1)
87
- if dropout:
88
- attn = dropout(attn)
89
-
90
- out = (attn @ v).transpose(-1, -2).reshape(B, self.dim_out_v, H, W) # B, dim_out, H, W
91
- return out
92
-
93
- def forward(self, x):
94
- B, C, H, W = x.shape
95
-
96
- dim = int(C / 2)
97
- x1 = x[:, :dim, :, :]
98
- x2 = x[:, dim:, :, :]
99
-
100
- _assert(H == self.pos_embed.height, "")
101
- _assert(W == self.pos_embed.width, "")
102
-
103
- q_f, k_f, v_f = self.get_qkv(x1, self.qkv_f)
104
- q_p, k_p, v_p = self.get_qkv(x2, self.qkv_p)
105
-
106
- # person to face
107
- out_f = self.apply_attn(q_f, k_p, v_p, B, H, W)
108
- # face to person
109
- out_p = self.apply_attn(q_p, k_f, v_f, B, H, W)
110
-
111
- x_pf = torch.cat((out_f, out_p), dim=1) # B, dim_out * 2, H, W
112
- x_pf = self.norm(x_pf)
113
- x_pf = self.mlp(x_pf) # B, dim_out, H, W
114
-
115
- out = self.pool(x_pf)
116
- return out
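A small shape-check sketch for CrossBottleneckAttn: the module expects the face and person feature maps concatenated along the channel dimension (so the input has 2 * dim channels); the dim, feat_size and batch size below are illustrative values.

import torch
from mivolo.model.cross_bottleneck_attn import CrossBottleneckAttn

attn = CrossBottleneckAttn(dim=384, dim_out=384, num_heads=1, feat_size=(14, 14))
x = torch.randn(2, 2 * 384, 14, 14)   # [face features | person features] along channels
out = attn(x)
print(out.shape)                      # torch.Size([2, 384, 14, 14])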
mivolo/model/mi_volo.py DELETED
@@ -1,229 +0,0 @@
1
- import logging
2
- from typing import Optional
3
-
4
- import numpy as np
5
- import torch
6
- from mivolo.data.misc import prepare_classification_images
7
- from mivolo.model.create_timm_model import create_model
8
- from mivolo.structures import PersonAndFaceCrops, PersonAndFaceResult
9
- from timm.data import resolve_data_config
10
-
11
- _logger = logging.getLogger("MiVOLO")
12
- has_compile = hasattr(torch, "compile")
13
-
14
-
15
- class Meta:
16
- def __init__(self):
17
- self.min_age = None
18
- self.max_age = None
19
- self.avg_age = None
20
- self.num_classes = None
21
-
22
- self.in_chans = 3
23
- self.with_persons_model = False
24
- self.disable_faces = False
25
- self.use_persons = True
26
- self.only_age = False
27
-
28
- self.num_classes_gender = 2
29
-
30
- def load_from_ckpt(self, ckpt_path: str, disable_faces: bool = False, use_persons: bool = True) -> "Meta":
31
-
32
- state = torch.load(ckpt_path, map_location="cpu")
33
-
34
- self.min_age = state["min_age"]
35
- self.max_age = state["max_age"]
36
- self.avg_age = state["avg_age"]
37
- self.only_age = state["no_gender"]
38
-
39
- only_age = state["no_gender"]
40
-
41
- self.disable_faces = disable_faces
42
- if "with_persons_model" in state:
43
- self.with_persons_model = state["with_persons_model"]
44
- else:
45
- self.with_persons_model = True if "patch_embed.conv1.0.weight" in state["state_dict"] else False
46
-
47
- self.num_classes = 1 if only_age else 3
48
- self.in_chans = 3 if not self.with_persons_model else 6
49
- self.use_persons = use_persons and self.with_persons_model
50
-
51
- if not self.with_persons_model and self.disable_faces:
52
- raise ValueError("You can not use disable-faces for faces-only model")
53
- if self.with_persons_model and self.disable_faces and not self.use_persons:
54
- raise ValueError("You can not disable faces and persons together")
55
-
56
- return self
57
-
58
- def __str__(self):
59
- attrs = vars(self)
60
- attrs.update({"use_person_crops": self.use_person_crops, "use_face_crops": self.use_face_crops})
61
- return ", ".join("%s: %s" % item for item in attrs.items())
62
-
63
- @property
64
- def use_person_crops(self) -> bool:
65
- return self.with_persons_model and self.use_persons
66
-
67
- @property
68
- def use_face_crops(self) -> bool:
69
- return not self.disable_faces or not self.with_persons_model
70
-
71
-
72
- class MiVOLO:
73
- def __init__(
74
- self,
75
- ckpt_path: str,
76
- device: str = "cuda",
77
- half: bool = True,
78
- disable_faces: bool = False,
79
- use_persons: bool = True,
80
- verbose: bool = False,
81
- torchcompile: Optional[str] = None,
82
- ):
83
- self.verbose = verbose
84
- self.device = torch.device(device)
85
- self.half = half and self.device.type != "cpu"
86
-
87
- self.meta: Meta = Meta().load_from_ckpt(ckpt_path, disable_faces, use_persons)
88
- if self.verbose:
89
- _logger.info(f"Model meta:\n{str(self.meta)}")
90
-
91
- model_name = "mivolo_d1_224"
92
- self.model = create_model(
93
- model_name=model_name,
94
- num_classes=self.meta.num_classes,
95
- in_chans=self.meta.in_chans,
96
- pretrained=False,
97
- checkpoint_path=ckpt_path,
98
- filter_keys=["fds."],
99
- )
100
- self.param_count = sum([m.numel() for m in self.model.parameters()])
101
- _logger.info(f"Model {model_name} created, param count: {self.param_count}")
102
-
103
- self.data_config = resolve_data_config(
104
- model=self.model,
105
- verbose=verbose,
106
- use_test_size=True,
107
- )
108
- self.data_config["crop_pct"] = 1.0
109
- c, h, w = self.data_config["input_size"]
110
- assert h == w, "Incorrect data_config"
111
- self.input_size = w
112
-
113
- self.model = self.model.to(self.device)
114
-
115
- if torchcompile:
116
- assert has_compile, "A version of torch w/ torch.compile() is required for --compile, possibly a nightly."
117
- torch._dynamo.reset()
118
- self.model = torch.compile(self.model, backend=torchcompile)
119
-
120
- self.model.eval()
121
- if self.half:
122
- self.model = self.model.half()
123
-
124
- def warmup(self, batch_size: int, steps=10):
125
- if self.meta.with_persons_model:
126
- input_size = (6, self.input_size, self.input_size)
127
- else:
128
- input_size = self.data_config["input_size"]
129
-
130
- input = torch.randn((batch_size,) + tuple(input_size)).to(self.device)
131
-
132
- for _ in range(steps):
133
- out = self.inference(input) # noqa: F841
134
-
135
- if torch.cuda.is_available():
136
- torch.cuda.synchronize()
137
-
138
- def inference(self, model_input: torch.tensor) -> torch.tensor:
139
-
140
- with torch.no_grad():
141
- if self.half:
142
- model_input = model_input.half()
143
- output = self.model(model_input)
144
- return output
145
-
146
- def predict(self, image: np.ndarray, detected_bboxes: PersonAndFaceResult):
147
- if detected_bboxes.n_objects == 0:
148
- return
149
-
150
- faces_input, person_input, faces_inds, bodies_inds = self.prepare_crops(image, detected_bboxes)
151
-
152
- if self.meta.with_persons_model:
153
- model_input = torch.cat((faces_input, person_input), dim=1)
154
- else:
155
- model_input = faces_input
156
- output = self.inference(model_input)
157
-
158
- # write gender and age results into detected_bboxes
159
- self.fill_in_results(output, detected_bboxes, faces_inds, bodies_inds)
160
-
161
- def fill_in_results(self, output, detected_bboxes, faces_inds, bodies_inds):
162
- if self.meta.only_age:
163
- age_output = output
164
- gender_probs, gender_indx = None, None
165
- else:
166
- age_output = output[:, 2]
167
- gender_output = output[:, :2].softmax(-1)
168
- gender_probs, gender_indx = gender_output.topk(1)
169
-
170
- assert output.shape[0] == len(faces_inds) == len(bodies_inds)
171
-
172
- # per face
173
- for index in range(output.shape[0]):
174
- face_ind = faces_inds[index]
175
- body_ind = bodies_inds[index]
176
-
177
- # get_age
178
- age = age_output[index].item()
179
- age = age * (self.meta.max_age - self.meta.min_age) + self.meta.avg_age
180
- age = round(age, 2)
181
-
182
- detected_bboxes.set_age(face_ind, age)
183
- detected_bboxes.set_age(body_ind, age)
184
-
185
- _logger.info(f"\tage: {age}")
186
-
187
- if gender_probs is not None:
188
- gender = "male" if gender_indx[index].item() == 0 else "female"
189
- gender_score = gender_probs[index].item()
190
-
191
- _logger.info(f"\tgender: {gender} [{int(gender_score * 100)}%]")
192
-
193
- detected_bboxes.set_gender(face_ind, gender, gender_score)
194
- detected_bboxes.set_gender(body_ind, gender, gender_score)
195
-
196
- def prepare_crops(self, image: np.ndarray, detected_bboxes: PersonAndFaceResult):
197
-
198
- if self.meta.use_person_crops and self.meta.use_face_crops:
199
- detected_bboxes.associate_faces_with_persons()
200
-
201
- crops: PersonAndFaceCrops = detected_bboxes.collect_crops(image)
202
- (bodies_inds, bodies_crops), (faces_inds, faces_crops) = crops.get_faces_with_bodies(
203
- self.meta.use_person_crops, self.meta.use_face_crops
204
- )
205
-
206
- if not self.meta.use_face_crops:
207
- assert all(f is None for f in faces_crops)
208
-
209
- faces_input = prepare_classification_images(
210
- faces_crops, self.input_size, self.data_config["mean"], self.data_config["std"], device=self.device
211
- )
212
-
213
- if not self.meta.use_person_crops:
214
- assert all(p is None for p in bodies_crops)
215
-
216
- person_input = prepare_classification_images(
217
- bodies_crops, self.input_size, self.data_config["mean"], self.data_config["std"], device=self.device
218
- )
219
-
220
- _logger.info(
221
- f"faces_input: {faces_input.shape if faces_input is not None else None}, "
222
- f"person_input: {person_input.shape if person_input is not None else None}"
223
- )
224
-
225
- return faces_input, person_input, faces_inds, bodies_inds
226
-
227
-
228
- if __name__ == "__main__":
229
- model = MiVOLO("../pretrained/checkpoint-377.pth.tar", half=True, device="cuda:0")
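A sketch of a full single-image pass with the detector plus MiVOLO; the weight paths below are placeholders, and the image is assumed to be read with OpenCV (BGR), matching how crops are handled elsewhere in this code.

import cv2
from mivolo.model.mi_volo import MiVOLO
from mivolo.model.yolo_detector import Detector

detector = Detector("person_face_weights.pt", device="cuda:0")                      # placeholder path
age_gender = MiVOLO("age_gender_checkpoint.pth.tar", device="cuda:0", half=True)    # placeholder path

image = cv2.imread("example.jpg")
detected = detector.predict(image)
age_gender.predict(image, detected)    # fills detected.ages / detected.genders in place
print(list(zip(detected.ages, detected.genders)))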
mivolo/model/mivolo_model.py DELETED
@@ -1,402 +0,0 @@
1
- """
2
- Code adapted from timm https://github.com/huggingface/pytorch-image-models
3
-
4
- Modifications and additions for mivolo by / Copyright 2023, Irina Tolstykh, Maxim Kuprashevich
5
- """
6
-
7
- import torch
8
- import torch.nn as nn
9
- from mivolo.model.cross_bottleneck_attn import CrossBottleneckAttn
10
- from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
11
- from timm.layers import trunc_normal_
12
- from timm.models._builder import build_model_with_cfg
13
- from timm.models._registry import register_model
14
- from timm.models.volo import VOLO
15
-
16
- __all__ = ["MiVOLOModel"] # model_registry will add each entrypoint fn to this
17
-
18
-
19
- def _cfg(url="", **kwargs):
20
- return {
21
- "url": url,
22
- "num_classes": 1000,
23
- "input_size": (3, 224, 224),
24
- "pool_size": None,
25
- "crop_pct": 0.96,
26
- "interpolation": "bicubic",
27
- "fixed_input_size": True,
28
- "mean": IMAGENET_DEFAULT_MEAN,
29
- "std": IMAGENET_DEFAULT_STD,
30
- "first_conv": None,
31
- "classifier": ("head", "aux_head"),
32
- **kwargs,
33
- }
34
-
35
-
36
- default_cfgs = {
37
- "mivolo_d1_224": _cfg(
38
- url="https://github.com/sail-sg/volo/releases/download/volo_1/d1_224_84.2.pth.tar", crop_pct=0.96
39
- ),
40
- "mivolo_d1_384": _cfg(
41
- url="https://github.com/sail-sg/volo/releases/download/volo_1/d1_384_85.2.pth.tar",
42
- crop_pct=1.0,
43
- input_size=(3, 384, 384),
44
- ),
45
- "mivolo_d2_224": _cfg(
46
- url="https://github.com/sail-sg/volo/releases/download/volo_1/d2_224_85.2.pth.tar", crop_pct=0.96
47
- ),
48
- "mivolo_d2_384": _cfg(
49
- url="https://github.com/sail-sg/volo/releases/download/volo_1/d2_384_86.0.pth.tar",
50
- crop_pct=1.0,
51
- input_size=(3, 384, 384),
52
- ),
53
- "mivolo_d3_224": _cfg(
54
- url="https://github.com/sail-sg/volo/releases/download/volo_1/d3_224_85.4.pth.tar", crop_pct=0.96
55
- ),
56
- "mivolo_d3_448": _cfg(
57
- url="https://github.com/sail-sg/volo/releases/download/volo_1/d3_448_86.3.pth.tar",
58
- crop_pct=1.0,
59
- input_size=(3, 448, 448),
60
- ),
61
- "mivolo_d4_224": _cfg(
62
- url="https://github.com/sail-sg/volo/releases/download/volo_1/d4_224_85.7.pth.tar", crop_pct=0.96
63
- ),
64
- "mivolo_d4_448": _cfg(
65
- url="https://github.com/sail-sg/volo/releases/download/volo_1/d4_448_86.79.pth.tar",
66
- crop_pct=1.15,
67
- input_size=(3, 448, 448),
68
- ),
69
- "mivolo_d5_224": _cfg(
70
- url="https://github.com/sail-sg/volo/releases/download/volo_1/d5_224_86.10.pth.tar", crop_pct=0.96
71
- ),
72
- "mivolo_d5_448": _cfg(
73
- url="https://github.com/sail-sg/volo/releases/download/volo_1/d5_448_87.0.pth.tar",
74
- crop_pct=1.15,
75
- input_size=(3, 448, 448),
76
- ),
77
- "mivolo_d5_512": _cfg(
78
- url="https://github.com/sail-sg/volo/releases/download/volo_1/d5_512_87.07.pth.tar",
79
- crop_pct=1.15,
80
- input_size=(3, 512, 512),
81
- ),
82
- }
83
-
84
-
85
- def get_output_size(input_shape, conv_layer):
86
- padding = conv_layer.padding
87
- dilation = conv_layer.dilation
88
- kernel_size = conv_layer.kernel_size
89
- stride = conv_layer.stride
90
-
91
- output_size = [
92
- ((input_shape[i] + 2 * padding[i] - dilation[i] * (kernel_size[i] - 1) - 1) // stride[i]) + 1 for i in range(2)
93
- ]
94
- return output_size
95
-
96
-
97
- def get_output_size_module(input_size, stem):
98
- output_size = input_size
99
-
100
- for module in stem:
101
- if isinstance(module, nn.Conv2d):
102
- output_size = [
103
- (
104
- (output_size[i] + 2 * module.padding[i] - module.dilation[i] * (module.kernel_size[i] - 1) - 1)
105
- // module.stride[i]
106
- )
107
- + 1
108
- for i in range(2)
109
- ]
110
-
111
- return output_size
112
-
113
-
114
- class PatchEmbed(nn.Module):
115
- """Image to Patch Embedding."""
116
-
117
- def __init__(
118
- self, img_size=224, stem_conv=False, stem_stride=1, patch_size=8, in_chans=3, hidden_dim=64, embed_dim=384
119
- ):
120
- super().__init__()
121
- assert patch_size in [4, 8, 16]
122
- assert in_chans in [3, 6]
123
- self.with_persons_model = in_chans == 6
124
- self.use_cross_attn = True
125
-
126
- if stem_conv:
127
- if not self.with_persons_model:
128
- self.conv = self.create_stem(stem_stride, in_chans, hidden_dim)
129
- else:
130
- self.conv = True # just to match interface
131
- # split
132
- self.conv1 = self.create_stem(stem_stride, 3, hidden_dim)
133
- self.conv2 = self.create_stem(stem_stride, 3, hidden_dim)
134
- else:
135
- self.conv = None
136
-
137
- if self.with_persons_model:
138
-
139
- self.proj1 = nn.Conv2d(
140
- hidden_dim, embed_dim, kernel_size=patch_size // stem_stride, stride=patch_size // stem_stride
141
- )
142
- self.proj2 = nn.Conv2d(
143
- hidden_dim, embed_dim, kernel_size=patch_size // stem_stride, stride=patch_size // stem_stride
144
- )
145
-
146
- stem_out_shape = get_output_size_module((img_size, img_size), self.conv1)
147
- self.proj_output_size = get_output_size(stem_out_shape, self.proj1)
148
-
149
- self.map = CrossBottleneckAttn(embed_dim, dim_out=embed_dim, num_heads=1, feat_size=self.proj_output_size)
150
-
151
- else:
152
- self.proj = nn.Conv2d(
153
- hidden_dim, embed_dim, kernel_size=patch_size // stem_stride, stride=patch_size // stem_stride
154
- )
155
-
156
- self.patch_dim = img_size // patch_size
157
- self.num_patches = self.patch_dim**2
158
-
159
- def create_stem(self, stem_stride, in_chans, hidden_dim):
160
- return nn.Sequential(
161
- nn.Conv2d(in_chans, hidden_dim, kernel_size=7, stride=stem_stride, padding=3, bias=False), # 112x112
162
- nn.BatchNorm2d(hidden_dim),
163
- nn.ReLU(inplace=True),
164
- nn.Conv2d(hidden_dim, hidden_dim, kernel_size=3, stride=1, padding=1, bias=False), # 112x112
165
- nn.BatchNorm2d(hidden_dim),
166
- nn.ReLU(inplace=True),
167
- nn.Conv2d(hidden_dim, hidden_dim, kernel_size=3, stride=1, padding=1, bias=False), # 112x112
168
- nn.BatchNorm2d(hidden_dim),
169
- nn.ReLU(inplace=True),
170
- )
171
-
172
- def forward(self, x):
173
- if self.conv is not None:
174
- if self.with_persons_model:
175
- x1 = x[:, :3]
176
- x2 = x[:, 3:]
177
-
178
- x1 = self.conv1(x1)
179
- x1 = self.proj1(x1)
180
-
181
- x2 = self.conv2(x2)
182
- x2 = self.proj2(x2)
183
-
184
- x = torch.cat([x1, x2], dim=1)
185
- x = self.map(x)
186
- else:
187
- x = self.conv(x)
188
- x = self.proj(x) # B, C, H, W
189
-
190
- return x
191
-
192
-
193
- class MiVOLOModel(VOLO):
194
- """
195
- Vision Outlooker, the main class of our model
196
- """
197
-
198
- def __init__(
199
- self,
200
- layers,
201
- img_size=224,
202
- in_chans=3,
203
- num_classes=1000,
204
- global_pool="token",
205
- patch_size=8,
206
- stem_hidden_dim=64,
207
- embed_dims=None,
208
- num_heads=None,
209
- downsamples=(True, False, False, False),
210
- outlook_attention=(True, False, False, False),
211
- mlp_ratio=3.0,
212
- qkv_bias=False,
213
- drop_rate=0.0,
214
- attn_drop_rate=0.0,
215
- drop_path_rate=0.0,
216
- norm_layer=nn.LayerNorm,
217
- post_layers=("ca", "ca"),
218
- use_aux_head=True,
219
- use_mix_token=False,
220
- pooling_scale=2,
221
- ):
222
- super().__init__(
223
- layers,
224
- img_size,
225
- in_chans,
226
- num_classes,
227
- global_pool,
228
- patch_size,
229
- stem_hidden_dim,
230
- embed_dims,
231
- num_heads,
232
- downsamples,
233
- outlook_attention,
234
- mlp_ratio,
235
- qkv_bias,
236
- drop_rate,
237
- attn_drop_rate,
238
- drop_path_rate,
239
- norm_layer,
240
- post_layers,
241
- use_aux_head,
242
- use_mix_token,
243
- pooling_scale,
244
- )
245
-
246
- self.patch_embed = PatchEmbed(
247
- stem_conv=True,
248
- stem_stride=2,
249
- patch_size=patch_size,
250
- in_chans=in_chans,
251
- hidden_dim=stem_hidden_dim,
252
- embed_dim=embed_dims[0],
253
- )
254
-
255
- trunc_normal_(self.pos_embed, std=0.02)
256
- self.apply(self._init_weights)
257
-
258
- def forward_features(self, x):
259
- x = self.patch_embed(x).permute(0, 2, 3, 1) # B,C,H,W-> B,H,W,C
260
-
261
- # step2: tokens learning in the two stages
262
- x = self.forward_tokens(x)
263
-
264
- # step3: post network, apply class attention or not
265
- if self.post_network is not None:
266
- x = self.forward_cls(x)
267
- x = self.norm(x)
268
- return x
269
-
270
- def forward_head(self, x, pre_logits: bool = False, targets=None, epoch=None):
271
- if self.global_pool == "avg":
272
- out = x.mean(dim=1)
273
- elif self.global_pool == "token":
274
- out = x[:, 0]
275
- else:
276
- out = x
277
- if pre_logits:
278
- return out
279
-
280
- features = out
281
- fds_enabled = hasattr(self, "_fds_forward")
282
- if fds_enabled:
283
- features = self._fds_forward(features, targets, epoch)
284
-
285
- out = self.head(features)
286
- if self.aux_head is not None:
287
- # generate classes in all feature tokens, see token labeling
288
- aux = self.aux_head(x[:, 1:])
289
- out = out + 0.5 * aux.max(1)[0]
290
-
291
- return (out, features) if (fds_enabled and self.training) else out
292
-
293
- def forward(self, x, targets=None, epoch=None):
294
- """simplified forward (without mix token training)"""
295
- x = self.forward_features(x)
296
- x = self.forward_head(x, targets=targets, epoch=epoch)
297
- return x
298
-
299
-
300
- def _create_mivolo(variant, pretrained=False, **kwargs):
301
- if kwargs.get("features_only", None):
302
- raise RuntimeError("features_only not implemented for Vision Transformer models.")
303
- return build_model_with_cfg(MiVOLOModel, variant, pretrained, **kwargs)
304
-
305
-
306
- @register_model
307
- def mivolo_d1_224(pretrained=False, **kwargs):
308
- model_args = dict(layers=(4, 4, 8, 2), embed_dims=(192, 384, 384, 384), num_heads=(6, 12, 12, 12), **kwargs)
309
- model = _create_mivolo("mivolo_d1_224", pretrained=pretrained, **model_args)
310
- return model
311
-
312
-
313
- @register_model
314
- def mivolo_d1_384(pretrained=False, **kwargs):
315
- model_args = dict(layers=(4, 4, 8, 2), embed_dims=(192, 384, 384, 384), num_heads=(6, 12, 12, 12), **kwargs)
316
- model = _create_mivolo("mivolo_d1_384", pretrained=pretrained, **model_args)
317
- return model
318
-
319
-
320
- @register_model
321
- def mivolo_d2_224(pretrained=False, **kwargs):
322
- model_args = dict(layers=(6, 4, 10, 4), embed_dims=(256, 512, 512, 512), num_heads=(8, 16, 16, 16), **kwargs)
323
- model = _create_mivolo("mivolo_d2_224", pretrained=pretrained, **model_args)
324
- return model
325
-
326
-
327
- @register_model
328
- def mivolo_d2_384(pretrained=False, **kwargs):
329
- model_args = dict(layers=(6, 4, 10, 4), embed_dims=(256, 512, 512, 512), num_heads=(8, 16, 16, 16), **kwargs)
330
- model = _create_mivolo("mivolo_d2_384", pretrained=pretrained, **model_args)
331
- return model
332
-
333
-
334
- @register_model
335
- def mivolo_d3_224(pretrained=False, **kwargs):
336
- model_args = dict(layers=(8, 8, 16, 4), embed_dims=(256, 512, 512, 512), num_heads=(8, 16, 16, 16), **kwargs)
337
- model = _create_mivolo("mivolo_d3_224", pretrained=pretrained, **model_args)
338
- return model
339
-
340
-
341
- @register_model
342
- def mivolo_d3_448(pretrained=False, **kwargs):
343
- model_args = dict(layers=(8, 8, 16, 4), embed_dims=(256, 512, 512, 512), num_heads=(8, 16, 16, 16), **kwargs)
344
- model = _create_mivolo("mivolo_d3_448", pretrained=pretrained, **model_args)
345
- return model
346
-
347
-
348
- @register_model
349
- def mivolo_d4_224(pretrained=False, **kwargs):
350
- model_args = dict(layers=(8, 8, 16, 4), embed_dims=(384, 768, 768, 768), num_heads=(12, 16, 16, 16), **kwargs)
351
- model = _create_mivolo("mivolo_d4_224", pretrained=pretrained, **model_args)
352
- return model
353
-
354
-
355
- @register_model
356
- def mivolo_d4_448(pretrained=False, **kwargs):
357
- """VOLO-D4 model, Params: 193M"""
358
- model_args = dict(layers=(8, 8, 16, 4), embed_dims=(384, 768, 768, 768), num_heads=(12, 16, 16, 16), **kwargs)
359
- model = _create_mivolo("mivolo_d4_448", pretrained=pretrained, **model_args)
360
- return model
361
-
362
-
363
- @register_model
364
- def mivolo_d5_224(pretrained=False, **kwargs):
365
- model_args = dict(
366
- layers=(12, 12, 20, 4),
367
- embed_dims=(384, 768, 768, 768),
368
- num_heads=(12, 16, 16, 16),
369
- mlp_ratio=4,
370
- stem_hidden_dim=128,
371
- **kwargs
372
- )
373
- model = _create_mivolo("mivolo_d5_224", pretrained=pretrained, **model_args)
374
- return model
375
-
376
-
377
- @register_model
378
- def mivolo_d5_448(pretrained=False, **kwargs):
379
- model_args = dict(
380
- layers=(12, 12, 20, 4),
381
- embed_dims=(384, 768, 768, 768),
382
- num_heads=(12, 16, 16, 16),
383
- mlp_ratio=4,
384
- stem_hidden_dim=128,
385
- **kwargs
386
- )
387
- model = _create_mivolo("mivolo_d5_448", pretrained=pretrained, **model_args)
388
- return model
389
-
390
-
391
- @register_model
392
- def mivolo_d5_512(pretrained=False, **kwargs):
393
- model_args = dict(
394
- layers=(12, 12, 20, 4),
395
- embed_dims=(384, 768, 768, 768),
396
- num_heads=(12, 16, 16, 16),
397
- mlp_ratio=4,
398
- stem_hidden_dim=128,
399
- **kwargs
400
- )
401
- model = _create_mivolo("mivolo_d5_512", pretrained=pretrained, **model_args)
402
- return model
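Because each entrypoint above is decorated with @register_model, the models should also be reachable through timm's own factory once this module has been imported; a quick shape sanity check, assuming the timm version pinned in requirements.txt.

import timm
import torch
import mivolo.model.mivolo_model  # noqa: F401  (importing registers the mivolo_* entrypoints)

model = timm.create_model("mivolo_d1_224", pretrained=False, num_classes=3, in_chans=3)
model.eval()
with torch.no_grad():
    out = model(torch.randn(1, 3, 224, 224))
print(out.shape)  # expected: torch.Size([1, 3])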
mivolo/model/yolo_detector.py DELETED
@@ -1,48 +0,0 @@
1
- import os
2
- from typing import Dict, Union
3
-
4
- import numpy as np
5
- import PIL
6
- import torch
7
- from mivolo.structures import PersonAndFaceResult
8
- from ultralytics import YOLO
9
- # from ultralytics.yolo.engine.results import Results
10
-
11
- # due to an ultralytics bug, CUBLAS_WORKSPACE_CONFIG must be unset after importing the module
12
- os.unsetenv("CUBLAS_WORKSPACE_CONFIG")
13
-
14
-
15
- class Detector:
16
- def __init__(
17
- self,
18
- weights: str,
19
- device: str = "cuda",
20
- half: bool = True,
21
- verbose: bool = False,
22
- conf_thresh: float = 0.4,
23
- iou_thresh: float = 0.7,
24
- ):
25
- self.yolo = YOLO(weights)
26
- self.yolo.fuse()
27
-
28
- self.device = torch.device(device)
29
- self.half = half and self.device.type != "cpu"
30
-
31
- if self.half:
32
- self.yolo.model = self.yolo.model.half()
33
-
34
- self.detector_names: Dict[int, str] = self.yolo.model.names
35
-
36
- # init yolo.predictor
37
- self.detector_kwargs = {
38
- "conf": conf_thresh, "iou": iou_thresh, "half": self.half, "verbose": verbose}
39
- # self.yolo.predict(**self.detector_kwargs)
40
-
41
- def predict(self, image: Union[np.ndarray, str, "PIL.Image"]) -> PersonAndFaceResult:
42
- results = self.yolo.predict(image, **self.detector_kwargs)[0]
43
- return PersonAndFaceResult(results)
44
-
45
- def track(self, image: Union[np.ndarray, str, "PIL.Image"]) -> PersonAndFaceResult:
46
- results = self.yolo.track(
47
- image, persist=True, **self.detector_kwargs)[0]
48
- return PersonAndFaceResult(results)
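A minimal sketch of the Detector on its own; the weights path is a placeholder for a YOLOv8 checkpoint trained on person and face classes, and the thresholds shown are simply the defaults made explicit.

import cv2
from mivolo.model.yolo_detector import Detector

detector = Detector("person_face_weights.pt", device="cpu", half=False, conf_thresh=0.4, iou_thresh=0.7)
result = detector.predict(cv2.imread("example.jpg"))
print(result.n_objects, result.get_bboxes_inds("face"), result.get_bboxes_inds("person"))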
mivolo/predictor.py DELETED
@@ -1,68 +0,0 @@
1
- from collections import defaultdict
2
- from typing import Dict, Generator, List, Optional, Tuple
3
-
4
- import cv2
5
- import numpy as np
6
- import tqdm
7
- from mivolo.model.mi_volo import MiVOLO
8
- from mivolo.model.yolo_detector import Detector
9
- from mivolo.structures import AGE_GENDER_TYPE, PersonAndFaceResult
10
-
11
-
12
- class Predictor:
13
- def __init__(self, config, verbose: bool = False):
14
- self.detector = Detector(config.detector_weights, config.device, verbose=verbose)
15
- self.age_gender_model = MiVOLO(
16
- config.checkpoint,
17
- config.device,
18
- half=True,
19
- use_persons=config.with_persons,
20
- disable_faces=config.disable_faces,
21
- verbose=verbose,
22
- )
23
- self.draw = config.draw
24
-
25
- def recognize(self, image: np.ndarray) -> Tuple[PersonAndFaceResult, Optional[np.ndarray]]:
26
- detected_objects: PersonAndFaceResult = self.detector.predict(image)
27
- self.age_gender_model.predict(image, detected_objects)
28
-
29
- out_im = None
30
- if self.draw:
31
- # plot results on image
32
- out_im = detected_objects.plot()
33
-
34
- return detected_objects, out_im
35
-
36
- def recognize_video(self, source: str) -> Generator:
37
- video_capture = cv2.VideoCapture(source)
38
- if not video_capture.isOpened():
39
- raise ValueError(f"Failed to open video source {source}")
40
-
41
- detected_objects_history: Dict[int, List[AGE_GENDER_TYPE]] = defaultdict(list)
42
-
43
- total_frames = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
44
- for _ in tqdm.tqdm(range(total_frames)):
45
- ret, frame = video_capture.read()
46
- if not ret:
47
- break
48
-
49
- detected_objects: PersonAndFaceResult = self.detector.track(frame)
50
- self.age_gender_model.predict(frame, detected_objects)
51
-
52
- current_frame_objs = detected_objects.get_results_for_tracking()
53
- cur_persons: Dict[int, AGE_GENDER_TYPE] = current_frame_objs[0]
54
- cur_faces: Dict[int, AGE_GENDER_TYPE] = current_frame_objs[1]
55
-
56
- # add cur_persons and cur_faces to the history
57
- for guid, data in cur_persons.items():
58
- # skip entries with missing age or gender; they are not useful for tracking
59
- if None not in data:
60
- detected_objects_history[guid].append(data)
61
- for guid, data in cur_faces.items():
62
- if None not in data:
63
- detected_objects_history[guid].append(data)
64
-
65
- detected_objects.set_tracked_age_gender(detected_objects_history)
66
- if self.draw:
67
- frame = detected_objects.plot()
68
- yield detected_objects_history, frame
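recognize_video is a generator that yields the accumulated per-id history together with the (optionally annotated) frame. A minimal consumer sketch follows; the config object is any namespace exposing the attributes Predictor reads, and the weight/video paths are placeholders.

from types import SimpleNamespace
from mivolo.predictor import Predictor

config = SimpleNamespace(
    detector_weights="person_face_weights.pt",     # placeholder
    checkpoint="age_gender_checkpoint.pth.tar",    # placeholder
    device="cuda:0",
    with_persons=True,
    disable_faces=False,
    draw=False,
)
predictor = Predictor(config)

for history, frame in predictor.recognize_video("clip.mp4"):
    pass  # history: Dict[track_id, List[(age, gender)]], frame: current frame (annotated if draw=True)
print({track_id: len(votes) for track_id, votes in history.items()})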
mivolo/structures.py DELETED
@@ -1,464 +0,0 @@
1
- import math
2
- import os
3
- from copy import deepcopy
4
- from typing import Dict, List, Optional, Tuple
5
-
6
- import cv2
7
- import numpy as np
8
- import torch
9
- from mivolo.data.misc import aggregate_votes_winsorized, assign_faces, box_iou, cropout_black_parts
10
- from ultralytics.yolo.engine.results import Results
11
- from ultralytics.yolo.utils.plotting import Annotator, colors
12
-
13
- # due to an ultralytics bug, CUBLAS_WORKSPACE_CONFIG must be unset after importing the module
14
- os.unsetenv("CUBLAS_WORKSPACE_CONFIG")
15
-
16
- AGE_GENDER_TYPE = Tuple[float, str]
17
-
18
-
19
- class PersonAndFaceCrops:
20
- def __init__(self):
21
- # int: index of person along results
22
- self.crops_persons: Dict[int, np.ndarray] = {}
23
-
24
- # int: index of face along results
25
- self.crops_faces: Dict[int, np.ndarray] = {}
26
-
27
- # int: index of face along results
28
- self.crops_faces_wo_body: Dict[int, np.ndarray] = {}
29
-
30
- # int: index of person along results
31
- self.crops_persons_wo_face: Dict[int, np.ndarray] = {}
32
-
33
- def _add_to_output(
34
- self, crops: Dict[int, np.ndarray], out_crops: List[np.ndarray], out_crop_inds: List[Optional[int]]
35
- ):
36
- inds_to_add = list(crops.keys())
37
- crops_to_add = list(crops.values())
38
- out_crops.extend(crops_to_add)
39
- out_crop_inds.extend(inds_to_add)
40
-
41
- def _get_all_faces(
42
- self, use_persons: bool, use_faces: bool
43
- ) -> Tuple[List[Optional[int]], List[Optional[np.ndarray]]]:
44
- """
45
- Returns
46
- if use_persons and use_faces
47
- faces: faces_with_bodies + faces_without_bodies + [None] * len(crops_persons_wo_face)
48
- if use_persons and not use_faces
49
- faces: [None] * n_persons
50
- if not use_persons and use_faces:
51
- faces: faces_with_bodies + faces_without_bodies
52
- """
53
-
54
- def add_none_to_output(faces_inds, faces_crops, num):
55
- faces_inds.extend([None for _ in range(num)])
56
- faces_crops.extend([None for _ in range(num)])
57
-
58
- faces_inds: List[Optional[int]] = []
59
- faces_crops: List[Optional[np.ndarray]] = []
60
-
61
- if not use_faces:
62
- add_none_to_output(faces_inds, faces_crops, len(self.crops_persons) + len(self.crops_persons_wo_face))
63
- return faces_inds, faces_crops
64
-
65
- self._add_to_output(self.crops_faces, faces_crops, faces_inds)
66
- self._add_to_output(self.crops_faces_wo_body, faces_crops, faces_inds)
67
-
68
- if use_persons:
69
- add_none_to_output(faces_inds, faces_crops, len(self.crops_persons_wo_face))
70
-
71
- return faces_inds, faces_crops
72
-
73
- def _get_all_bodies(
74
- self, use_persons: bool, use_faces: bool
75
- ) -> Tuple[List[Optional[int]], List[Optional[np.ndarray]]]:
76
- """
77
- Returns
78
- if use_persons and use_faces
79
- persons: bodies_with_faces + [None] * len(faces_without_bodies) + bodies_without_faces
80
- if use_persons and not use_faces
81
- persons: bodies_with_faces + bodies_without_faces
82
- if not use_persons and use_faces
83
- persons: [None] * n_faces
84
- """
85
-
86
- def add_none_to_output(bodies_inds, bodies_crops, num):
87
- bodies_inds.extend([None for _ in range(num)])
88
- bodies_crops.extend([None for _ in range(num)])
89
-
90
- bodies_inds: List[Optional[int]] = []
91
- bodies_crops: List[Optional[np.ndarray]] = []
92
-
93
- if not use_persons:
94
- add_none_to_output(bodies_inds, bodies_crops, len(self.crops_faces) + len(self.crops_faces_wo_body))
95
- return bodies_inds, bodies_crops
96
-
97
- self._add_to_output(self.crops_persons, bodies_crops, bodies_inds)
98
- if use_faces:
99
- add_none_to_output(bodies_inds, bodies_crops, len(self.crops_faces_wo_body))
100
-
101
- self._add_to_output(self.crops_persons_wo_face, bodies_crops, bodies_inds)
102
-
103
- return bodies_inds, bodies_crops
104
-
105
- def get_faces_with_bodies(self, use_persons: bool, use_faces: bool):
106
- """
107
- Return
108
- faces: faces_with_bodies, faces_without_bodies, [None] * len(crops_persons_wo_face)
109
- persons: bodies_with_faces, [None] * len(faces_without_bodies), bodies_without_faces
110
- """
111
-
112
- bodies_inds, bodies_crops = self._get_all_bodies(use_persons, use_faces)
113
- faces_inds, faces_crops = self._get_all_faces(use_persons, use_faces)
114
-
115
- return (bodies_inds, bodies_crops), (faces_inds, faces_crops)
116
-
117
- def save(self, out_dir="output"):
118
- ind = 0
119
- os.makedirs(out_dir, exist_ok=True)
120
- for crops in [self.crops_persons, self.crops_faces, self.crops_faces_wo_body, self.crops_persons_wo_face]:
121
- for crop in crops.values():
122
- if crop is None:
123
- continue
124
- out_name = os.path.join(out_dir, f"{ind}_crop.jpg")
125
- cv2.imwrite(out_name, crop)
126
- ind += 1
127
-
128
-
129
- class PersonAndFaceResult:
130
- def __init__(self, results: Results):
131
-
132
- self.yolo_results = results
133
- names = set(results.names.values())
134
- assert "person" in names and "face" in names
135
-
136
- # initially no faces and persons are associated to each other
137
- self.face_to_person_map: Dict[int, Optional[int]] = {ind: None for ind in self.get_bboxes_inds("face")}
138
- self.unassigned_persons_inds: List[int] = self.get_bboxes_inds("person")
139
- n_objects = len(self.yolo_results.boxes)
140
- self.ages: List[Optional[float]] = [None for _ in range(n_objects)]
141
- self.genders: List[Optional[str]] = [None for _ in range(n_objects)]
142
- self.gender_scores: List[Optional[float]] = [None for _ in range(n_objects)]
143
-
144
- @property
145
- def n_objects(self) -> int:
146
- return len(self.yolo_results.boxes)
147
-
148
- def get_bboxes_inds(self, category: str) -> List[int]:
149
- bboxes: List[int] = []
150
- for ind, det in enumerate(self.yolo_results.boxes):
151
- name = self.yolo_results.names[int(det.cls)]
152
- if name == category:
153
- bboxes.append(ind)
154
-
155
- return bboxes
156
-
157
- def get_distance_to_center(self, bbox_ind: int) -> float:
158
- """
159
- Calculate the Euclidean distance between the bbox center and the image center.
160
- """
161
- im_h, im_w = self.yolo_results[bbox_ind].orig_shape
162
- x1, y1, x2, y2 = self.get_bbox_by_ind(bbox_ind).cpu().numpy()
163
- center_x, center_y = (x1 + x2) / 2, (y1 + y2) / 2
164
- dist = math.dist([center_x, center_y], [im_w / 2, im_h / 2])
165
- return dist
166
-
167
- def plot(
168
- self,
169
- conf=False,
170
- line_width=None,
171
- font_size=None,
172
- font="Arial.ttf",
173
- pil=False,
174
- img=None,
175
- labels=True,
176
- boxes=True,
177
- probs=True,
178
- ages=True,
179
- genders=True,
180
- gender_probs=False,
181
- ):
182
- """
183
- Plots the detection results on an input RGB image. Accepts a numpy array (cv2) or a PIL Image.
184
- Args:
185
- conf (bool): Whether to plot the detection confidence score.
186
- line_width (float, optional): The line width of the bounding boxes. If None, it is scaled to the image size.
187
- font_size (float, optional): The font size of the text. If None, it is scaled to the image size.
188
- font (str): The font to use for the text.
189
- pil (bool): Whether to return the image as a PIL Image.
190
- img (numpy.ndarray): Plot to another image. if not, plot to original image.
191
- labels (bool): Whether to plot the label of bounding boxes.
192
- boxes (bool): Whether to plot the bounding boxes.
193
- probs (bool): Whether to plot classification probability
194
- ages (bool): Whether to plot the age of bounding boxes.
195
- genders (bool): Whether to plot the genders of bounding boxes.
196
- gender_probs (bool): Whether to plot gender classification probability
197
- Returns:
198
- (numpy.ndarray): A numpy array of the annotated image.
199
- """
200
-
201
- # return self.yolo_results.plot()
202
- colors_by_ind = {}
203
- for face_ind, person_ind in self.face_to_person_map.items():
204
- if person_ind is not None:
205
- colors_by_ind[face_ind] = face_ind + 2
206
- colors_by_ind[person_ind] = face_ind + 2
207
- else:
208
- colors_by_ind[face_ind] = 0
209
- for person_ind in self.unassigned_persons_inds:
210
- colors_by_ind[person_ind] = 1
211
-
212
- names = self.yolo_results.names
213
- annotator = Annotator(
214
- deepcopy(self.yolo_results.orig_img if img is None else img),
215
- line_width,
216
- font_size,
217
- font,
218
- pil,
219
- example=names,
220
- )
221
- pred_boxes, show_boxes = self.yolo_results.boxes, boxes
222
- pred_probs, show_probs = self.yolo_results.probs, probs
223
-
224
- if pred_boxes and show_boxes:
225
- for bb_ind, (d, age, gender, gender_score) in enumerate(
226
- zip(pred_boxes, self.ages, self.genders, self.gender_scores)
227
- ):
228
- c, conf, guid = int(d.cls), float(d.conf) if conf else None, None if d.id is None else int(d.id.item())
229
- name = ("" if guid is None else f"id:{guid} ") + names[c]
230
- label = (f"{name} {conf:.2f}" if conf else name) if labels else None
231
- if ages and age is not None:
232
- label += f" {age:.1f}"
233
- if genders and gender is not None:
234
- label += f" {'F' if gender == 'female' else 'M'}"
235
- if gender_probs and gender_score is not None:
236
- label += f" ({gender_score:.1f})"
237
- annotator.box_label(d.xyxy.squeeze(), label, color=colors(colors_by_ind[bb_ind], True))
238
-
239
- if pred_probs is not None and show_probs:
240
- text = f"{', '.join(f'{names[j] if names else j} {pred_probs.data[j]:.2f}' for j in pred_probs.top5)}, "
241
- annotator.text((32, 32), text, txt_color=(255, 255, 255)) # TODO: allow setting colors
242
-
243
- return annotator.result()
244
-
245
- def set_tracked_age_gender(self, tracked_objects: Dict[int, List[AGE_GENDER_TYPE]]):
246
- """
247
- Update age and gender for objects based on history from tracked_objects.
248
- Args:
249
- tracked_objects (dict[int, list[AGE_GENDER_TYPE]]): info about tracked objects by guid
250
- """
251
-
252
- for face_ind, person_ind in self.face_to_person_map.items():
253
- pguid = self._get_id_by_ind(person_ind)
254
- fguid = self._get_id_by_ind(face_ind)
255
-
256
- if fguid == -1 and pguid == -1:
257
- # YOLO might not assign ids for some objects in some cases:
258
- # https://github.com/ultralytics/ultralytics/issues/3830
259
- continue
260
- age, gender = self._gather_tracking_result(tracked_objects, fguid, pguid)
261
- if age is None or gender is None:
262
- continue
263
- self.set_age(face_ind, age)
264
- self.set_gender(face_ind, gender, 1.0)
265
- if pguid != -1:
266
- self.set_gender(person_ind, gender, 1.0)
267
- self.set_age(person_ind, age)
268
-
269
- for person_ind in self.unassigned_persons_inds:
270
- pid = self._get_id_by_ind(person_ind)
271
- if pid == -1:
272
- continue
273
- age, gender = self._gather_tracking_result(tracked_objects, -1, pid)
274
- if age is None or gender is None:
275
- continue
276
- self.set_gender(person_ind, gender, 1.0)
277
- self.set_age(person_ind, age)
278
-
279
- def _get_id_by_ind(self, ind: Optional[int] = None) -> int:
280
- if ind is None:
281
- return -1
282
- obj_id = self.yolo_results.boxes[ind].id
283
- if obj_id is None:
284
- return -1
285
- return obj_id.item()
286
-
287
- def get_bbox_by_ind(self, ind: int, im_h: int = None, im_w: int = None) -> torch.tensor:
288
- bb = self.yolo_results.boxes[ind].xyxy.squeeze().type(torch.int32)
289
- if im_h is not None and im_w is not None:
290
- bb[0] = torch.clamp(bb[0], min=0, max=im_w - 1)
291
- bb[1] = torch.clamp(bb[1], min=0, max=im_h - 1)
292
- bb[2] = torch.clamp(bb[2], min=0, max=im_w - 1)
293
- bb[3] = torch.clamp(bb[3], min=0, max=im_h - 1)
294
- return bb
295
-
296
- def set_age(self, ind: Optional[int], age: float):
297
- if ind is not None:
298
- self.ages[ind] = age
299
-
300
- def set_gender(self, ind: Optional[int], gender: str, gender_score: float):
301
- if ind is not None:
302
- self.genders[ind] = gender
303
- self.gender_scores[ind] = gender_score
304
-
305
- @staticmethod
306
- def _gather_tracking_result(
307
- tracked_objects: Dict[int, List[AGE_GENDER_TYPE]],
308
- fguid: int = -1,
309
- pguid: int = -1,
310
- minimum_sample_size: int = 10,
311
- ) -> AGE_GENDER_TYPE:
312
-
313
- assert fguid != -1 or pguid != -1, "Incorrect tracking behaviour"
314
-
315
- face_ages = [r[0] for r in tracked_objects[fguid] if r[0] is not None] if fguid in tracked_objects else []
316
- face_genders = [r[1] for r in tracked_objects[fguid] if r[1] is not None] if fguid in tracked_objects else []
317
- person_ages = [r[0] for r in tracked_objects[pguid] if r[0] is not None] if pguid in tracked_objects else []
318
- person_genders = [r[1] for r in tracked_objects[pguid] if r[1] is not None] if pguid in tracked_objects else []
319
-
320
- if not face_ages and not person_ages: # both empty
321
- return None, None
322
-
323
- # You can play here with different aggregation strategies
324
- # Face ages - predictions based on face or face + person, depends on history of object
325
- # Person ages - predictions based on person or face + person, depends on history of object
326
-
327
- if len(person_ages + face_ages) >= minimum_sample_size:
328
- age = aggregate_votes_winsorized(person_ages + face_ages)
329
- else:
330
- face_age = np.mean(face_ages) if face_ages else None
331
- person_age = np.mean(person_ages) if person_ages else None
332
- if face_age is None:
333
- face_age = person_age
334
- if person_age is None:
335
- person_age = face_age
336
- age = (face_age + person_age) / 2.0
337
-
338
- genders = face_genders + person_genders
339
- assert len(genders) > 0
340
- # take mode of genders
341
- gender = max(set(genders), key=genders.count)
342
-
343
- return age, gender
344
-
345
- def get_results_for_tracking(self) -> Tuple[Dict[int, AGE_GENDER_TYPE], Dict[int, AGE_GENDER_TYPE]]:
346
- """
347
- Get objects from current frame
348
- """
349
- persons: Dict[int, AGE_GENDER_TYPE] = {}
350
- faces: Dict[int, AGE_GENDER_TYPE] = {}
351
-
352
- names = self.yolo_results.names
353
- pred_boxes = self.yolo_results.boxes
354
- for _, (det, age, gender, _) in enumerate(zip(pred_boxes, self.ages, self.genders, self.gender_scores)):
355
- if det.id is None:
356
- continue
357
- cat_id, _, guid = int(det.cls), float(det.conf), int(det.id.item())
358
- name = names[cat_id]
359
- if name == "person":
360
- persons[guid] = (age, gender)
361
- elif name == "face":
362
- faces[guid] = (age, gender)
363
-
364
- return persons, faces
365
-
366
- def associate_faces_with_persons(self):
367
- face_bboxes_inds: List[int] = self.get_bboxes_inds("face")
368
- person_bboxes_inds: List[int] = self.get_bboxes_inds("person")
369
-
370
- face_bboxes: List[torch.tensor] = [self.get_bbox_by_ind(ind) for ind in face_bboxes_inds]
371
- person_bboxes: List[torch.tensor] = [self.get_bbox_by_ind(ind) for ind in person_bboxes_inds]
372
-
373
- self.face_to_person_map = {ind: None for ind in face_bboxes_inds}
374
- assigned_faces, unassigned_persons_inds = assign_faces(person_bboxes, face_bboxes)
375
-
376
- for face_ind, person_ind in enumerate(assigned_faces):
377
- face_ind = face_bboxes_inds[face_ind]
378
- person_ind = person_bboxes_inds[person_ind] if person_ind is not None else None
379
- self.face_to_person_map[face_ind] = person_ind
380
-
381
- self.unassigned_persons_inds = [person_bboxes_inds[person_ind] for person_ind in unassigned_persons_inds]
382
-
383
- def crop_object(
384
- self, full_image: np.ndarray, ind: int, cut_other_classes: Optional[List[str]] = None
385
- ) -> Optional[np.ndarray]:
386
-
387
- IOU_THRESH = 0.000001
388
- MIN_PERSON_CROP_AFTERCUT_RATIO = 0.4
389
- CROP_ROUND_RATE = 0.3
390
- MIN_PERSON_SIZE = 50
391
-
392
- obj_bbox = self.get_bbox_by_ind(ind, *full_image.shape[:2])
393
- x1, y1, x2, y2 = obj_bbox
394
- cur_cat = self.yolo_results.names[int(self.yolo_results.boxes[ind].cls)]
395
- # get crop of face or person
396
- obj_image = full_image[y1:y2, x1:x2].copy()
397
- crop_h, crop_w = obj_image.shape[:2]
398
-
399
- if cur_cat == "person" and (crop_h < MIN_PERSON_SIZE or crop_w < MIN_PERSON_SIZE):
400
- return None
401
-
402
- if not cut_other_classes:
403
- return obj_image
404
-
405
- # calc iou between obj_bbox and other bboxes
406
- other_bboxes: List[torch.tensor] = [
407
- self.get_bbox_by_ind(other_ind, *full_image.shape[:2]) for other_ind in range(len(self.yolo_results.boxes))
408
- ]
409
-
410
- iou_matrix = box_iou(torch.stack([obj_bbox]), torch.stack(other_bboxes)).cpu().numpy()[0]
411
-
412
- # cut out other objects in case of intersection
413
- for other_ind, (det, iou) in enumerate(zip(self.yolo_results.boxes, iou_matrix)):
414
- other_cat = self.yolo_results.names[int(det.cls)]
415
- if ind == other_ind or iou < IOU_THRESH or other_cat not in cut_other_classes:
416
- continue
417
- o_x1, o_y1, o_x2, o_y2 = det.xyxy.squeeze().type(torch.int32)
418
-
419
- # remap current_person_bbox to reference_person_bbox coordinates
420
- o_x1 = max(o_x1 - x1, 0)
421
- o_y1 = max(o_y1 - y1, 0)
422
- o_x2 = min(o_x2 - x1, crop_w)
423
- o_y2 = min(o_y2 - y1, crop_h)
424
-
425
- if other_cat != "face":
426
- if (o_y1 / crop_h) < CROP_ROUND_RATE:
427
- o_y1 = 0
428
- if ((crop_h - o_y2) / crop_h) < CROP_ROUND_RATE:
429
- o_y2 = crop_h
430
- if (o_x1 / crop_w) < CROP_ROUND_RATE:
431
- o_x1 = 0
432
- if ((crop_w - o_x2) / crop_w) < CROP_ROUND_RATE:
433
- o_x2 = crop_w
434
-
435
- obj_image[o_y1:o_y2, o_x1:o_x2] = 0
436
-
437
- obj_image, remain_ratio = cropout_black_parts(obj_image, CROP_ROUND_RATE)
438
- if remain_ratio < MIN_PERSON_CROP_AFTERCUT_RATIO:
439
- return None
440
-
441
- return obj_image
442
-
443
- def collect_crops(self, image) -> PersonAndFaceCrops:
444
-
445
- crops_data = PersonAndFaceCrops()
446
- for face_ind, person_ind in self.face_to_person_map.items():
447
- face_image = self.crop_object(image, face_ind, cut_other_classes=[])
448
-
449
- if person_ind is None:
450
- crops_data.crops_faces_wo_body[face_ind] = face_image
451
- continue
452
-
453
- person_image = self.crop_object(image, person_ind, cut_other_classes=["face", "person"])
454
-
455
- crops_data.crops_faces[face_ind] = face_image
456
- crops_data.crops_persons[person_ind] = person_image
457
-
458
- for person_ind in self.unassigned_persons_inds:
459
- person_image = self.crop_object(image, person_ind, cut_other_classes=["face", "person"])
460
- crops_data.crops_persons_wo_face[person_ind] = person_image
461
-
462
- # uncomment to save preprocessed crops
463
- # crops_data.save()
464
- return crops_data
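The typical flow through PersonAndFaceResult, continuing from a detector prediction as in the earlier sketches (the weights and image paths remain placeholders): associate faces with bodies, then collect the crops that feed the age/gender model.

import cv2
from mivolo.model.yolo_detector import Detector

image = cv2.imread("example.jpg")
detected = Detector("person_face_weights.pt", device="cpu", half=False).predict(image)

detected.associate_faces_with_persons()            # fills face_to_person_map / unassigned_persons_inds
crops = detected.collect_crops(image)              # PersonAndFaceCrops with the four crop dictionaries
(body_inds, bodies), (face_inds, faces) = crops.get_faces_with_bodies(use_persons=True, use_faces=True)
print(detected.face_to_person_map, len(faces), len(bodies))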
mivolo/version.py DELETED
@@ -1 +0,0 @@
1
- __version__ = "0.3.0dev"
model/model_imdb_cross_person_4.22_99.46.pth.tar → model_imdb_cross_person_4.22_99.46.pth.tar RENAMED
File without changes
product2item.py DELETED
@@ -1,95 +0,0 @@
- import json
- from urllib.parse import quote
- from tqdm import tqdm
- from bs4 import BeautifulSoup
- from selenium import webdriver
- from utils import *
-
- MAX_PAGE = 618
-
-
- def append_dict_to_jsonl(dictionary, file_path='./output/items.jsonl'):
-     with open(file_path, 'a', encoding='utf-8') as jsonl_file:
-         json.dump(dictionary, jsonl_file, ensure_ascii=False)
-         jsonl_file.write('\n')
-
-
- def get_second_links(keyword):
-     # selenium
-     option = webdriver.ChromeOptions()
-     option.add_experimental_option('excludeSwitches', ['enable-automation'])
-     option.add_argument("--disable-blink-features=AutomationControlled")
-     # option.add_argument('--headless')
-     browser = webdriver.Chrome(options=option)
-     browser.get(f'https://www.taobao.com/list/product/{quote(keyword)}.htm')
-     # browser.minimize_window()
-     browser.maximize_window()
-
-     skip_captcha()
-
-     # Scroll through every item on the product page until all of them have loaded
-     for i in tqdm(range(1, MAX_PAGE + 1)):
-         browser.execute_script(f'window.scrollTo(0, {i * 500})')
-         sleeps(0.5, 1.0)
-         page_str = str(browser.page_source)
-         if "<title>taobao | 淘寶</title>" in page_str:  # verification page title
-             print('Captcha encountered...')
-             return []
-
-         if "已加载全部商品" in page_str:  # page banner: "all items loaded"
-             print('All items loaded!')
-             break
-
-         if "加载错误,请重试" in page_str:  # page banner: "load error, please retry"
-             print('Load error, crawl aborted')
-             break
-
-     html_content = browser.page_source
-
-     # bs4
-     soup = BeautifulSoup(html_content, 'html.parser')
-     return [link.get('href') for link in soup.find_all('a', class_='item')]
-
-
- def read_lines_to_array(file_path):
-     create_dir('./' + os.path.dirname(file_path))
-     lines_array = []
-     with open(file_path, 'r', encoding='utf-8') as file:
-         for line in file:
-             lines_array.append(line.strip())
-
-     return lines_array
-
-
- def product_to_items():
-     keywords = read_lines_to_array('./input/keywords.txt')
-     create_dir('./output')
-
-     for key in keywords:
-         urls = list(get_second_links(key))
-         print(f'Saving url into jsonl for keyword [{key}]')
-         for url in tqdm(urls):
-             tmp_dict = {
-                 'keyword': key,
-                 'id': url.split('.htm?spm=')[0].split('//www.taobao.com/list/item/')[1]
-             }
-             append_dict_to_jsonl(tmp_dict)
-
-     rm_duplicates_by_key()
-
-
- if __name__ == "__main__":
-     product_to_items()
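A side note on the id parsing above: the chained split() calls assume an exact URL prefix. A more defensive variant could look like the sketch below, assuming item URLs keep the /list/item/<id>.htm shape (the helper name is illustrative only).

import re
from typing import Optional

def extract_item_id(url: str) -> Optional[str]:
    # Assumes URLs of the form //www.taobao.com/list/item/<id>.htm?spm=...
    match = re.search(r'/list/item/([^/.]+)\.htm', url)
    return match.group(1) if match else None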
 
 
requirements.txt DELETED
@@ -1,9 +0,0 @@
- requests
- beautifulsoup4
- selenium
- Cython==0.29.28
- ultralytics
- timm==0.8.13.dev0
- omegaconf
- tqdm
- opencv-python
 
 
utils.py DELETED
@@ -1,59 +0,0 @@
- import os
- import json
- import random
- from time import sleep
-
-
- def create_dir(dir_path):
-     if not os.path.exists(dir_path):
-         os.makedirs(dir_path)
-
-
- def skip_captcha():
-     print('Crawling links...')
-
-
- def sleeps(a, b):
-     if a > 0 and b > a:
-         sleep((b - a) * random.random() + a)
-
-     else:
-         print('Invalid params!')
-
-
- def save_to_file(data_list, file_path='./output/items.jsonl'):
-     with open(file_path, 'w', encoding='utf-8') as jsonl_file:
-         for data in data_list:
-             json.dump(data, jsonl_file, ensure_ascii=(
-                 file_path != './output/items.jsonl'))
-             jsonl_file.write('\n')
-
-
- def rm_duplicates_by_key(jsonl_path='./output/items.jsonl', key_to_check='id', failist_path='./output/duplicate_id.txt'):
-     print('Removing duplicates...')
-     if not os.path.exists(jsonl_path):
-         print('jsonl not exist')
-         return
-
-     data_set = set()
-     unique_data = []
-     duplicates = set()
-
-     with open(jsonl_path, 'r', encoding='utf-8') as jsonl_file:
-         for line in jsonl_file:
-             data = json.loads(line)
-
-             # Extract the value of the specified key; it is used to detect duplicates
-             key_value = data.get(key_to_check)
-
-             # If this key value has already been seen, the record is a duplicate
-             if key_value in data_set:
-                 duplicates.add(key_value)
-                 continue
-             else:
-                 data_set.add(key_value)
-                 unique_data.append(data)
-
-     save_to_file(unique_data, file_path=jsonl_path)
-     save_to_file(duplicates, file_path=failist_path)
-     print('Duplicates removed!')
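For reference, a small usage sketch of the deduplication helper above; the file path and the "url" field are illustrative only.

from utils import rm_duplicates_by_key

# Deduplicate another jsonl by its "url" field; duplicate key values go to a side file.
rm_duplicates_by_key(jsonl_path='./output/details.jsonl',
                     key_to_check='url',
                     failist_path='./output/duplicate_url.txt')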
 
 
model/yolov8x_person_face.pt → yolov8x_person_face.pt RENAMED
File without changes