WildlifeDatasets committed
Commit bb14d6a · unverified · 1 Parent(s): 7fddf9c

Added training scripts
training/segmentation_prepare.ipynb ADDED
@@ -0,0 +1,323 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "ebe0faa7",
+ "metadata": {},
+ "source": [
+ "This notebook prepares the datasets for training the turtle detection model. First, it goes through the SeaTurtleID2022 dataset and converts the existing masks into the YOLO format required by Ultralytics. Then it goes through the TurtlesOfSMSRC dataset, loads the masks created in the smsrc_prepare notebook and again converts them to the YOLO format. Finally, the metadata are merged and ready for the segmentation_train script, which first trains on SeaTurtleID2022 (underwater photos) and then fine-tunes on the combined dataset (with above-water photos added)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a2e66c17",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import json\n",
+ "import shutil\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from tqdm import tqdm\n",
+ "from wildlife_datasets.datasets import SeaTurtleID2022, TurtlesOfSMSRC\n",
+ "from wildlife_datasets.datasets.utils import find_images, parse_bbox_mask\n",
+ "from wildlife_datasets.splits import ClosedSetSplit\n",
+ "from turtle_detector import get_index, rle_to_yolo, uncompressed_rle_to_yolo"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "93be7212",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "root_out = '/data/wildlife_datasets/turtle-detector'\n",
+ "\n",
+ "for addition in ['images/train', 'images/val', 'labels/train', 'labels/val']:\n",
+ "    for dataset_name in ['SeaTurtleID2022', 'TurtlesOfSMSRC']:\n",
+ "        os.makedirs(os.path.join(root_out, addition, dataset_name), exist_ok=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "14b3e193",
+ "metadata": {},
+ "source": [
+ "# SeaTurtleID2022"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4c664fa1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dataset_name = 'SeaTurtleID2022'\n",
+ "root = '/data/wildlife_datasets/data/SeaTurtleID2022'\n",
+ "\n",
+ "dataset = SeaTurtleID2022(root)\n",
+ "if dataset.df['path'].nunique() != len(dataset):\n",
+ "    raise ValueError('path is not unique')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "64391cd5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "splitter = ClosedSetSplit(0.8)\n",
+ "idx_train, idx_test = splitter.split(dataset.df)[0]\n",
+ "idx_train += 1  # image ids in annotations.json are 1-based\n",
+ "idx_test += 1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "057b036e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "flipper_categories = {\n",
+ "    '': 0,\n",
+ "    'front_left': 2,\n",
+ "    'front_right': 3,\n",
+ "    'rear_left': 4,\n",
+ "    'rear_right': 5,\n",
+ "}\n",
+ "\n",
+ "root_ann = f'{root}/turtles-data/data'\n",
+ "with open(os.path.join(root_ann, 'annotations.json')) as file:\n",
+ "    annotations = json.load(file)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7ff91b8a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for ann_img in tqdm(annotations['images']):\n",
+ "    file_name = os.path.join(root_ann, ann_img['file_name'])\n",
+ "    if ann_img['id'] in idx_train:\n",
+ "        shutil.copy(file_name, f'{root_out}/images/train/{dataset_name}')\n",
+ "    elif ann_img['id'] in idx_test:\n",
+ "        shutil.copy(file_name, f'{root_out}/images/val/{dataset_name}')\n",
+ "    else:\n",
+ "        raise ValueError('Split wrong')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a5a19846",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for ann_ann in tqdm(annotations['annotations']):\n",
+ "    if ann_ann['category_id'] == 1:\n",
+ "        category_id = 0\n",
+ "    elif ann_ann['category_id'] == 3:\n",
+ "        category_id = 1\n",
+ "    else:\n",
+ "        location = ann_ann['attributes'].get('location', '')\n",
+ "        category_id = flipper_categories[location]\n",
+ "\n",
+ "    image_id = ann_ann['image_id']\n",
+ "    rle = ann_ann['segmentation']\n",
+ "    yolo_segments = uncompressed_rle_to_yolo(rle, class_id=category_id)\n",
+ "    ann_img = annotations['images'][image_id - 1]\n",
+ "    base_name = os.path.basename(ann_img['file_name'])\n",
+ "    base_name = os.path.splitext(base_name)[0] + '.txt'\n",
+ "\n",
+ "    if image_id != ann_img['id']:\n",
+ "        raise ValueError('Image ids are not ordered')\n",
+ "    if ann_img['id'] in idx_train:\n",
+ "        file_name = f'{root_out}/labels/train/{dataset_name}/{base_name}'\n",
+ "    elif ann_img['id'] in idx_test:\n",
+ "        file_name = f'{root_out}/labels/val/{dataset_name}/{base_name}'\n",
+ "    else:\n",
+ "        raise ValueError('Split wrong')\n",
+ "\n",
+ "    with open(file_name, 'a') as myfile:\n",
+ "        for yolo_segment in yolo_segments:\n",
+ "            myfile.write(yolo_segment + '\\n')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "65a8b7ce",
+ "metadata": {},
+ "source": [
+ "# TurtlesOfSMSRC"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "33d88ac1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dataset_name = 'TurtlesOfSMSRC'\n",
+ "root = '/data/wildlife_datasets/TurtlesOfSMSRC'\n",
+ "\n",
+ "dataset = TurtlesOfSMSRC(root)\n",
+ "masks = pd.read_csv(f'{root}/masks.csv')\n",
+ "masks['mask'] = masks['mask'].apply(parse_bbox_mask)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e632972b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "splitter = ClosedSetSplit(0.8)\n",
+ "idx_train, idx_test = splitter.split(dataset.df)[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e6acc2bd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "annotation_categories = {\n",
+ "    'turtle': 0,\n",
+ "    'head': 1,\n",
+ "    'flipper_fl': 2,\n",
+ "    'flipper_fr': 3,\n",
+ "    'flipper_rl': 4,\n",
+ "    'flipper_rr': 5,\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b3377df4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for image_id in tqdm(masks['image_id'].unique()):\n",
+ "    i = get_index(dataset, image_id)\n",
+ "    file_name = os.path.join(root, dataset.metadata.loc[i, 'path'])\n",
+ "    if i in idx_train:\n",
+ "        shutil.copy(file_name, f'{root_out}/images/train/{dataset_name}')\n",
+ "    elif i in idx_test:\n",
+ "        shutil.copy(file_name, f'{root_out}/images/val/{dataset_name}')\n",
+ "    else:\n",
+ "        raise ValueError('Split wrong')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d1eba0f2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for _, mask in tqdm(masks.iterrows(), total=len(masks)):\n",
+ "    category_id = annotation_categories[mask['label_side']]\n",
+ "    image_id = mask['image_id']\n",
+ "    rle = mask['mask']\n",
+ "    yolo_segments = rle_to_yolo(rle, class_id=category_id)\n",
+ "    i = get_index(dataset, image_id)\n",
+ "\n",
+ "    base_name = os.path.basename(dataset.metadata.loc[i, 'path'])\n",
+ "    base_name = os.path.splitext(base_name)[0] + '.txt'\n",
+ "\n",
+ "    if i in idx_train:\n",
+ "        file_name = f'{root_out}/labels/train/{dataset_name}/{base_name}'\n",
+ "    elif i in idx_test:\n",
+ "        file_name = f'{root_out}/labels/val/{dataset_name}/{base_name}'\n",
+ "    else:\n",
+ "        raise ValueError('Split wrong')\n",
+ "\n",
+ "    with open(file_name, 'a') as myfile:\n",
+ "        for yolo_segment in yolo_segments:\n",
+ "            myfile.write(yolo_segment + '\\n')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0b6f6683",
+ "metadata": {},
+ "source": [
+ "# Create metadata"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d5908b52",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "n_repeat = {\n",
+ "    'SeaTurtleID2022': 1,\n",
+ "    'TurtlesOfSMSRC': 30,\n",
+ "}\n",
+ "\n",
+ "# First split and only then oversample to prevent train-test leakage\n",
+ "images = find_images(root_out)\n",
+ "images = root_out + '/' + images['path'] + '/' + images['file']\n",
+ "images_train = images[images.str.contains('/train/')]\n",
+ "images_test = images[images.str.contains('/val/')]\n",
+ "if len(images_train) + len(images_test) != len(images):\n",
+ "    raise ValueError('The split into train and test images failed.')\n",
+ "\n",
+ "# Oversample (even the test set)\n",
+ "idx_train = []\n",
+ "idx_test = []\n",
+ "for dataset_name in ['SeaTurtleID2022', 'TurtlesOfSMSRC']:\n",
+ "    idx_part = list(images_train[images_train.str.contains(dataset_name)].index)\n",
+ "    idx_train += n_repeat[dataset_name] * idx_part\n",
+ "    idx_part = list(images_test[images_test.str.contains(dataset_name)].index)\n",
+ "    idx_test += n_repeat[dataset_name] * idx_part\n",
+ "images_train = images_train.loc[idx_train]\n",
+ "images_test = images_test.loc[idx_test]\n",
+ "\n",
+ "# Save the oversampled splits\n",
+ "images_train.to_csv(f'{root_out}/train.txt', header=False, index=False)\n",
+ "images_test.to_csv(f'{root_out}/val.txt', header=False, index=False)\n",
+ "for dataset_name in ['SeaTurtleID2022', 'TurtlesOfSMSRC']:\n",
+ "    subset_train = images_train[images_train.str.contains(dataset_name)]\n",
+ "    subset_train.to_csv(f'{root_out}/train_{dataset_name}.txt', header=False, index=False)\n",
+ "    subset_test = images_test[images_test.str.contains(dataset_name)]\n",
+ "    subset_test.to_csv(f'{root_out}/val_{dataset_name}.txt', header=False, index=False)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "sam3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
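
Each label file written by this notebook holds one line per polygon in the Ultralytics segmentation format: a class id followed by normalized x y pairs, as produced by mask_to_yolo below. A minimal sanity check over one of the generated label folders, assuming the directory layout created in the first cell, might look like this sketch:

import os

root_out = '/data/wildlife_datasets/turtle-detector'
label_dir = f'{root_out}/labels/train/SeaTurtleID2022'

for name in os.listdir(label_dir):
    with open(os.path.join(label_dir, name)) as file:
        for line in file:
            parts = line.split()
            class_id = int(parts[0])
            coords = [float(x) for x in parts[1:]]
            # class ids 0-5: turtle, head and the four flippers
            assert 0 <= class_id <= 5
            # each polygon has at least three points, all normalized to [0, 1]
            assert len(coords) >= 6 and len(coords) % 2 == 0
            assert all(0.0 <= c <= 1.0 for c in coords)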
training/segmentation_stage1.yaml ADDED
@@ -0,0 +1,12 @@
+ path: /data/wildlife_datasets/turtle-detector
+ train: train_SeaTurtleID2022.txt
+ val: val_SeaTurtleID2022.txt
+
+ nc: 6
+ names:
+   0: turtle
+   1: head
+   2: flipper_fl
+   3: flipper_fr
+   4: flipper_rl
+   5: flipper_rr
training/segmentation_stage2.yaml ADDED
@@ -0,0 +1,12 @@
+ path: /data/wildlife_datasets/turtle-detector
+ train: train.txt
+ val: val.txt
+
+ nc: 6
+ names:
+   0: turtle
+   1: head
+   2: flipper_fl
+   3: flipper_fr
+   4: flipper_rl
+   5: flipper_rr
training/segmentation_train.py ADDED
@@ -0,0 +1,35 @@
+ import os
+ from ultralytics import YOLO
+
+ project = f"{os.getcwd()}/runs"
+ device = "cuda:2"
+ imgsz = 640
+ epochs = 20
+
+ # Stage 1: Pretrain on SeaTurtleID2022 (large dataset)
+ model = YOLO("yolo11s-seg.pt")
+ model.train(
+     data="segmentation_stage1.yaml",
+     project=project,
+     name="stage1",
+     epochs=epochs,
+     imgsz=imgsz,
+     device=device,
+     fliplr=0,  # flips disabled: the left/right flipper classes are orientation-dependent
+     flipud=0,
+ )
+
+ # Stage 2: Fine-tune on combined dataset (balanced)
+ model = YOLO(f"{project}/stage1/weights/last.pt")
+ model.train(
+     data="segmentation_stage2.yaml",
+     project=project,
+     name="stage2",
+     epochs=epochs,
+     imgsz=imgsz,
+     device=device,
+     fliplr=0,
+     flipud=0,
+     # Freeze the first 5 layers to preserve the stage-1 features
+     freeze=5,
+ )
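
Once both stages finish, the stage-2 weights can be checked with the standard Ultralytics calls. A minimal sketch, where the weights path follows the project/name arguments above and the image path and confidence threshold are placeholders:

from ultralytics import YOLO

model = YOLO("runs/stage2/weights/best.pt")

# Re-evaluate mask quality on the combined validation split
metrics = model.val(data="segmentation_stage2.yaml", imgsz=640)
print(metrics.seg.map)  # mask mAP50-95

# Run a single prediction; results[0].masks holds the predicted polygons
results = model.predict("some_turtle.jpg", imgsz=640, conf=0.25)
results[0].save(filename="some_turtle_pred.jpg")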
training/smsrc_prepare.ipynb ADDED
@@ -0,0 +1,182 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "926f340c",
+ "metadata": {},
+ "source": [
+ "The notebook prepares the SMSRC data for training of the turtle detector. It uses SAM3 to detect the turtle, its head and its flippers. Then it uses a heuristic to assign the left/right and front/rear orientation of each flipper. These assignments were manually checked and fixed where incorrect.\n",
+ "\n",
+ "The output of the notebook is the masks.csv file, which is then used in the segmentation_prepare notebook to create the training dataset for detection."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6774dc0c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from wildlife_datasets.datasets import TurtlesOfSMSRC\n",
+ "from turtle_detector import assign_flippers, initialize_sam3, mask_to_rle, rle_to_mask, compute_iou, mask_to_bbox"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4c8a0449",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "root = '/data/wildlife_datasets/TurtlesOfSMSRC'\n",
+ "dataset = TurtlesOfSMSRC(root)\n",
+ "\n",
+ "idx_ranges = [\n",
+ "    (333582414, 333582440),\n",
+ "    (327367311, 327367335),\n",
+ "]\n",
+ "idx = np.zeros(len(dataset), dtype=bool)\n",
+ "for idx_min, idx_max in idx_ranges:\n",
+ "    encounter_id = dataset.metadata['encounter_id'].to_numpy()\n",
+ "    idx += (encounter_id >= idx_min) * (encounter_id <= idx_max)\n",
+ "\n",
+ "dataset = dataset.get_subset(idx)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7a35f952",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model, processor = initialize_sam3()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2f521710",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "prompt_map = {\n",
+ "    \"head\": \"turtle head\",\n",
+ "    \"flipper\": \"turtle flipper\",\n",
+ "    \"turtle\": \"turtle\",\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "aef61bcb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "min_area = 500\n",
+ "iou_threshold = 0.1\n",
+ "\n",
+ "masks = []\n",
+ "for i in range(len(dataset)):\n",
+ "    image_path = f\"{dataset.root}/{dataset.metadata['path'].iloc[i]}\"\n",
+ "    image = dataset[i]\n",
+ "    inference_state = processor.set_image(image)\n",
+ "\n",
+ "    for label, prompt in prompt_map.items():\n",
+ "        processor.reset_all_prompts(inference_state)\n",
+ "        inference_state = processor.set_text_prompt(state=inference_state, prompt=prompt)\n",
+ "\n",
+ "        for m in inference_state[\"masks\"]:\n",
+ "            m = m.cpu().numpy().astype(bool)\n",
+ "            if m.ndim == 3 and m.shape[0] == 1:\n",
+ "                m = m[0]\n",
+ "            if m.sum() > min_area:\n",
+ "                masks.append({\n",
+ "                    'image_id': dataset.metadata['image_id'].loc[i],\n",
+ "                    'mask': mask_to_rle(m),\n",
+ "                    'label': label,\n",
+ "                })\n",
+ "masks = pd.DataFrame(masks)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d4a052cd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "masks['keep'] = True\n",
+ "for _, masks_image in masks.groupby('image_id'):\n",
+ "    keep = masks_image['keep'].copy()\n",
+ "    for i, (j, mask_j) in enumerate(masks_image.iterrows()):\n",
+ "        for k, mask_k in masks_image.iloc[i+1:].iterrows():\n",
+ "            if not keep.loc[j] or not keep.loc[k]:\n",
+ "                continue\n",
+ "\n",
+ "            mj = rle_to_mask(masks.loc[j, 'mask'])\n",
+ "            mk = rle_to_mask(masks.loc[k, 'mask'])\n",
+ "\n",
+ "            iou = compute_iou(mj, mk)\n",
+ "            if iou < iou_threshold:\n",
+ "                continue\n",
+ "\n",
+ "            if mask_j['label'] == mask_k['label']:\n",
+ "                masks.at[j, 'mask'] = mask_to_rle(mj | mk)\n",
+ "                keep.loc[k] = False\n",
+ "\n",
+ "            elif {\"head\", \"flipper\"} == {mask_j['label'], mask_k['label']}:\n",
+ "                if (keep * (masks_image['label'] == 'head')).sum() == 1:\n",
+ "                    if mask_j['label'] == \"flipper\":\n",
+ "                        keep.loc[j] = False\n",
+ "                    else:\n",
+ "                        keep.loc[k] = False\n",
+ "                else:\n",
+ "                    if mask_j['label'] == \"head\":\n",
+ "                        keep.loc[j] = False\n",
+ "                    else:\n",
+ "                        keep.loc[k] = False\n",
+ "    masks.loc[masks_image.index, 'keep'] = keep\n",
+ "masks = masks[masks['keep']]\n",
+ "masks = masks.drop('keep', axis=1)\n",
+ "\n",
+ "for i, m in masks.iterrows():\n",
+ "    bbox = mask_to_bbox(rle_to_mask(m['mask']))\n",
+ "    x0, y0, x1, y1 = bbox\n",
+ "    masks.loc[i, 'bbox_x'] = x0\n",
+ "    masks.loc[i, 'bbox_y'] = y0\n",
+ "    masks.loc[i, 'bbox_w'] = x1 - x0\n",
+ "    masks.loc[i, 'bbox_h'] = y1 - y0\n",
+ "\n",
+ "for _, masks_image in masks.groupby('image_id'):\n",
+ "    masks.loc[masks_image.index, 'label_side'] = assign_flippers(masks_image)['label']\n",
+ "\n",
+ "masks.to_csv('masks.csv', index=False)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "sam3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
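
The overlap-resolution cell above keeps acting on a mask pair only when their IoU exceeds 0.1, merging same-label masks and discarding the conflicting one otherwise. The helpers it relies on behave as in this toy sketch, assuming the turtle_detector package from this commit is importable:

import numpy as np
from turtle_detector import compute_iou, mask_to_rle, rle_to_mask

a = np.zeros((10, 10), dtype=np.uint8)
b = np.zeros((10, 10), dtype=np.uint8)
a[2:6, 2:6] = 1  # 16-pixel square
b[4:8, 4:8] = 1  # overlapping 16-pixel square

print(compute_iou(a, b))  # 4 / 28 ~ 0.14, above the 0.1 threshold

# Same-label case: the notebook stores the union and drops one row
merged = mask_to_rle(a | b)
assert rle_to_mask(merged).sum() == 28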
training/smsrc_visualize.ipynb ADDED
@@ -0,0 +1,139 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "56e96915",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "import matplotlib.patches as patches\n",
+ "\n",
+ "from wildlife_datasets.datasets import TurtlesOfSMSRC\n",
+ "from wildlife_datasets.datasets.utils import parse_bbox_mask\n",
+ "from turtle_detector import rle_to_mask"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4c8a0449",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "root = '/data/wildlife_datasets/TurtlesOfSMSRC'\n",
+ "root_figures = 'figures'\n",
+ "dataset = TurtlesOfSMSRC(root)\n",
+ "masks = pd.read_csv('masks.csv')\n",
+ "masks['mask'] = masks['mask'].apply(parse_bbox_mask)\n",
+ "\n",
+ "os.makedirs(root_figures, exist_ok=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2f521710",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "colors_map = {\n",
+ "    \"head\": 0,\n",
+ "    \"flipper\": 1,\n",
+ "    \"turtle\": 2,\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e82f9db7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for image_id, masks_image in masks.groupby('image_id'):\n",
+ "    i = np.where(dataset.metadata.image_id == image_id)[0][0]\n",
+ "    image = dataset[i]\n",
+ "    width, height = image.size\n",
+ "\n",
+ "    overlay = np.zeros((height, width, 3), dtype=np.float32)\n",
+ "    for _, m in masks_image.iterrows():\n",
+ "        mask_bool = rle_to_mask(m['mask']).astype(bool)\n",
+ "        overlay[mask_bool, colors_map[m['label']]] = 1.0\n",
+ "\n",
+ "    fig, ax = plt.subplots(figsize=(8, 8))\n",
+ "    plt.imshow(image)\n",
+ "    plt.imshow(overlay, alpha=0.5)\n",
+ "\n",
+ "    for _, m in masks_image.iterrows():\n",
+ "        rect = patches.Rectangle(\n",
+ "            (m['bbox_x'], m['bbox_y']),\n",
+ "            m['bbox_w'],\n",
+ "            m['bbox_h'],\n",
+ "            linewidth=2,\n",
+ "            edgecolor=\"white\",\n",
+ "            facecolor=\"none\"\n",
+ "        )\n",
+ "        ax.add_patch(rect)\n",
+ "        ax.text(\n",
+ "            m['bbox_x'],\n",
+ "            m['bbox_y'] - 3,\n",
+ "            m['label_side'],\n",
+ "            color=\"white\",\n",
+ "            fontsize=10,\n",
+ "            weight=\"bold\",\n",
+ "            bbox=dict(facecolor=\"black\", alpha=0.5, pad=2)\n",
+ "        )\n",
+ "\n",
+ "    n_head = (masks_image['label'] == 'head').sum()\n",
+ "    n_flipper = (masks_image['label'] == 'flipper').sum()\n",
+ "    n_turtle = (masks_image['label'] == 'turtle').sum()\n",
+ "\n",
+ "    plt.axis(\"off\")\n",
+ "    plt.title(f'{n_head}, {n_flipper}, {n_turtle}')\n",
+ "    plt.savefig(f'{root_figures}/{image_id}.png', bbox_inches='tight', dpi=600)\n",
+ "    plt.close()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "54035a2d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for image_id, masks_image in masks.groupby('image_id'):\n",
+ "    if masks_image['label_side'].value_counts().max() > 1:\n",
+ "        print(f'Image id {image_id} has multiple annotations.')\n",
+ "        display(masks_image)\n",
+ "display(masks['label'].value_counts())\n",
+ "display(masks['label_side'].value_counts())"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "sam3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
training/turtle_detector/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .masks import *
+ from .utils import assign_flippers, get_index, initialize_sam3
training/turtle_detector/masks.py ADDED
@@ -0,0 +1,92 @@
+ import numpy as np
+ import cv2
+ import pycocotools.mask as mask_utils
+ from PIL import ImageDraw
+
+ def compute_iou(mask_a, mask_b):
+     intersection = np.logical_and(mask_a, mask_b).sum()
+     union = np.logical_or(mask_a, mask_b).sum()
+     return 0.0 if union == 0 else intersection / union
+
+ def mask_to_bbox(mask):
+     ys, xs = np.where(mask)
+     if len(xs) == 0:
+         return None
+     return xs.min(), ys.min(), xs.max(), ys.max()
+
+ def mask_to_rle(mask, json_safe=True):
+     rle = mask_utils.encode(np.asfortranarray(mask.astype(np.uint8)))
+     if json_safe:
+         rle["counts"] = rle["counts"].decode("ascii")
+     return rle
+
+ def rle_to_mask(rle):
+     rle = rle.copy()
+     if isinstance(rle["counts"], str):
+         rle["counts"] = rle["counts"].encode("ascii")
+     return mask_utils.decode(rle)
+
+ def uncompressed_rle_to_mask(rle):
+     """Decode COCO-style uncompressed RLE into a binary mask (0/1)."""
+     h, w = rle["size"]
+     counts = rle["counts"]
+
+     mask = np.zeros(h * w, dtype=np.uint8)
+     val = 0
+     idx = 0
+     for c in counts:
+         mask[idx:idx + c] = val
+         idx += c
+         val = 1 - val
+     mask = mask.reshape((h, w), order='F')
+     return mask
+
+ def mask_to_yolo(mask, class_id=0):
+     """Convert a binary mask (0/1) into YOLO polygon segmentation format."""
+     h, w = mask.shape
+
+     # ensure 8-bit binary mask
+     mask8 = (mask * 255).astype(np.uint8)
+
+     # find outer contours only
+     contours, _ = cv2.findContours(mask8, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+
+     yolo_segments = []
+     for contour in contours:
+         if cv2.contourArea(contour) < 100:  # ignore tiny noise
+             continue
+
+         contour = contour.squeeze().astype(float)
+         if contour.ndim != 2:
+             continue
+
+         # normalize to [0, 1]
+         contour[:, 0] = contour[:, 0] / float(w)
+         contour[:, 1] = contour[:, 1] / float(h)
+
+         coords = contour.flatten().tolist()
+         yolo_segments.append(f"{class_id} " + " ".join(f"{x:.6f}" for x in coords))
+
+     return yolo_segments
+
+ def rle_to_yolo(rle, class_id=0):
+     mask = rle_to_mask(rle)
+     return mask_to_yolo(mask, class_id)
+
+ def uncompressed_rle_to_yolo(rle, class_id=0):
+     mask = uncompressed_rle_to_mask(rle)
+     return mask_to_yolo(mask, class_id)
+
+ def draw_yolo_on_pil(image, yolo_segments, color=(0, 255, 0)):
+     img = image.convert("RGB")
+     draw = ImageDraw.Draw(img)
+     w, h = img.size
+
+     for seg in yolo_segments:
+         parts = seg.strip().split()
+         class_id = int(parts[0])
+         coords = np.array([float(x) for x in parts[1:]]).reshape(-1, 2)
+         points = [(x * w, y * h) for x, y in coords]
+         draw.line(points + [points[0]], fill=color, width=2)
+
+     return img
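
A quick round trip through these helpers on a synthetic mask shows how the pieces fit together (illustrative values only):

import numpy as np
from turtle_detector.masks import mask_to_rle, rle_to_mask, mask_to_bbox, mask_to_yolo

# Synthetic 200x300 mask with a single filled rectangle
mask = np.zeros((200, 300), dtype=np.uint8)
mask[50:150, 100:250] = 1

rle = mask_to_rle(mask)  # JSON-safe COCO RLE
assert (rle_to_mask(rle) == mask).all()  # lossless round trip

print(mask_to_bbox(mask))  # (x_min, y_min, x_max, y_max) = (100, 50, 249, 149)

segments = mask_to_yolo(mask, class_id=0)
# One entry per outer contour: "0 x1 y1 x2 y2 ..." with coordinates
# normalized by the image width and height
print(len(segments), segments[0][:30])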
training/turtle_detector/utils.py ADDED
@@ -0,0 +1,96 @@
+ import os
+ import numpy as np
+ import sam3
+ from sam3 import build_sam3_image_model
+ from sam3.model.sam3_image_processor import Sam3Processor
+ from .masks import rle_to_mask
+
+ def get_index(dataset, image_id):
+     idx = dataset.metadata['image_id'] == image_id
+     if idx.sum() != 1:
+         raise ValueError('image_id not found or found multiple times.')
+     return dataset.metadata[idx].index[0]
+
+ def mask_centroid(mask):
+     ys, xs = np.nonzero(mask)
+     return np.array([xs.mean(), ys.mean()])
+
+ def rle_centroid(rle):
+     return mask_centroid(rle_to_mask(rle))
+
+ def assign_flippers(df):
+     df = df.copy()
+
+     # Check that there is only one head
+     head_rows = df[df['label'] == 'head']
+     if len(head_rows) != 1:
+         return df
+
+     # Compute the head centroid
+     head_center = rle_centroid(head_rows.iloc[0]['mask'])
+
+     # Extract the flippers
+     flippers = df[df['label'] == 'flipper']
+     n_flippers = len(flippers)
+     if n_flippers == 0:
+         return df
+
+     # Compute the flipper centroids
+     flipper_centers = np.vstack([
+         rle_centroid(rle) for rle in flippers['mask']
+     ])
+
+     # Vector from turtle center to head defines "forward"
+     turtle_center = flipper_centers.mean(axis=0)
+     forward_vec = head_center - turtle_center
+     forward_vec /= np.linalg.norm(forward_vec)
+
+     # Perpendicular defines left/right
+     left_vec = np.array([-forward_vec[1], forward_vec[0]])
+
+     # Project flipper centroids relative to the turtle center
+     forward_proj = (flipper_centers - turtle_center) @ forward_vec
+     lateral_proj = (flipper_centers - turtle_center) @ left_vec
+
+     if n_flippers <= 2:
+         # Always front flippers
+         order = np.argsort(lateral_proj)
+         left_idx, right_idx = order[0], order[-1]
+
+         df.loc[flippers.index[left_idx], 'label'] = 'flipper_fl'
+         df.loc[flippers.index[right_idx], 'label'] = 'flipper_fr'
+         return df
+     elif n_flippers <= 4:
+         # Sort by forward distance
+         order_fwd = np.argsort(forward_proj)
+         rear_idxs = order_fwd[:-2]  # one rear flipper when only three are visible
+         front_idxs = order_fwd[-2:]
+
+         # Front flippers
+         front_l = front_idxs[np.argmin(lateral_proj[front_idxs])]
+         front_r = front_idxs[np.argmax(lateral_proj[front_idxs])]
+
+         df.loc[flippers.index[front_l], 'label'] = 'flipper_fl'
+         df.loc[flippers.index[front_r], 'label'] = 'flipper_fr'
+
+         # Rear flippers (if present)
+         if len(rear_idxs) == 2:
+             rear_l = rear_idxs[np.argmin(lateral_proj[rear_idxs])]
+             rear_r = rear_idxs[np.argmax(lateral_proj[rear_idxs])]
+
+             df.loc[flippers.index[rear_l], 'label'] = 'flipper_rl'
+             df.loc[flippers.index[rear_r], 'label'] = 'flipper_rr'
+         else:
+             # 3 flippers: assign only the most rear one
+             idx = rear_idxs[0]
+             side = 'l' if lateral_proj[idx] < 0 else 'r'
+             df.loc[flippers.index[idx], 'label'] = f'flipper_r{side}'
+
+     return df
+
+ def initialize_sam3():
+     sam3_root = os.path.join(os.path.dirname(sam3.__file__), "..")
+     bpe_path = f"{sam3_root}/sam3/assets/bpe_simple_vocab_16e6.txt.gz"
+     model = build_sam3_image_model(bpe_path=bpe_path)
+     processor = Sam3Processor(model, confidence_threshold=0.5)
+     return model, processor
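
The assign_flippers heuristic can be exercised on synthetic blobs. A toy example with a head at the top of the frame and one flipper on each side (positions are arbitrary, and the turtle_detector package is assumed importable):

import numpy as np
import pandas as pd
from turtle_detector import assign_flippers, mask_to_rle

def blob(y, x, size=10, shape=(100, 100)):
    # Square blob at (y, x), returned as a COCO RLE as the heuristic expects
    m = np.zeros(shape, dtype=np.uint8)
    m[y:y + size, x:x + size] = 1
    return mask_to_rle(m)

df = pd.DataFrame([
    {'label': 'head',    'mask': blob(10, 45)},
    {'label': 'flipper', 'mask': blob(50, 10)},
    {'label': 'flipper', 'mask': blob(50, 80)},
])

# Two flippers are treated as the front pair and split along the lateral axis
print(assign_flippers(df)['label'].tolist())
# ['head', 'flipper_fl', 'flipper_fr']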