# azure-scripts / train_v3.py
# vivekvar's picture
# azure home scripts: data gen, training, misc
# a70eb3d verified
#!/usr/bin/env python3
# v3 max-throughput: yolo26m on 6673-img dataset, batch=128, cache=ram
from ultralytics import YOLO
import torch
# ---------------------------------------------------------------------------
# Run configuration: values that are shared between the training stage and
# the evaluation stage live here so they are written exactly once.
# ---------------------------------------------------------------------------
DATA_YAML = '/home/azureuser/merged_v3/data.yaml'
PROJECT = 'runs_v3'
RUN_NAME = 'h100_3class_v3'
# Checkpoint reloaded for evaluation after training; built from the same
# project/name pair that is passed to train() below.
BEST_WEIGHTS = f'{PROJECT}/{RUN_NAME}/weights/best.pt'

# Report which GPU (device 0) is in use and its total memory in decimal GB.
gpu_name = torch.cuda.get_device_name(0)
gpu_total_gb = round(torch.cuda.get_device_properties(0).total_memory / 1e9)
print('GPU:', gpu_name, '|', gpu_total_gb, 'GB')

# Keyword overrides handed to model.train(); order mirrors the original call.
train_args = dict(
    data=DATA_YAML,
    epochs=200,
    imgsz=640,
    batch=128,  # 2x v2 — should hit ~70GB VRAM
    device=0,
    workers=16,  # feed data faster
    cache='ram',  # dataset is ~1GB, fits easily
    # Output location; exist_ok=True reuses the same run directory name.
    project=PROJECT,
    name=RUN_NAME,
    exist_ok=True,
    amp=True,
    cos_lr=True,
    close_mosaic=20,
    # Augmentation settings.
    mosaic=1.0,
    mixup=0.15,
    copy_paste=0.3,
    hsv_h=0.015,
    hsv_s=0.7,
    hsv_v=0.4,
    degrees=5.0,
    translate=0.1,
    scale=0.5,
    fliplr=0.5,
    # Loss gains (see the Ultralytics training configuration reference).
    cls=1.0,
    box=7.5,
    dfl=1.5,
    weight_decay=0.0005,
    optimizer='auto',
    patience=60,
    plots=True,
    verbose=True,
)

# Stage 1: train starting from the yolo26m.pt weights.
model = YOLO('yolo26m.pt')
model.train(**train_args)

# Stage 2: reload the best checkpoint and evaluate it three times:
# on the val split, on the test split, and on the test split with
# augment=True (the "TTA" pass announced by its banner).
print('TRAIN DONE — running val + test')
m = YOLO(BEST_WEIGHTS)
eval_passes = (
    ('--- VAL ---', {'split': 'val'}),
    ('--- TEST ---', {'split': 'test'}),
    ('--- TEST + TTA ---', {'split': 'test', 'augment': True}),
)
for banner, val_kwargs in eval_passes:
    print(banner)
    m.val(data=DATA_YAML, **val_kwargs)