vivekvar
/

azure-scripts

Model card Files Files and versions

azure-scripts / train_h100_clean.py

vivekvar's picture

azure home scripts: data gen, training, misc

a70eb3d verified 21 days ago

history blame contribute delete

1.43 kB

	#!/usr/bin/env python3
	# 3-class clean training on H100 NVL
	# Classes: 0 no-helmet | 1 with-helmet | 2 triple-riding
	from ultralytics import YOLO
	import torch, os

	def gpu_summary(index=0):
	    """Return a one-line description of CUDA device *index* for the run log.

	    Yields ``'GPU: <name> | <total memory> GB'`` when CUDA is usable, and a
	    clear "none detected" message otherwise, instead of letting the CUDA
	    query raise an opaque error on a machine without a GPU.
	    """
	    if not torch.cuda.is_available():
	        return 'GPU: none detected (CUDA unavailable)'
	    total_gb = torch.cuda.get_device_properties(index).total_memory / 1e9
	    # Plain '|' separator: the previous '\|' literal was an invalid escape
	    # sequence (SyntaxWarning on Python 3.12+) and printed a stray backslash.
	    return f'GPU: {torch.cuda.get_device_name(index)} | {total_gb:.1f} GB'

	# Log the device up front so a wrong or missing GPU is obvious before training starts.
	print(gpu_summary(0))

	# Load the pretrained yolo26m checkpoint as the starting point
	# (the original author notes it auto-downloads if missing).
	model = YOLO('yolo26m.pt')

	# Run identity, data source and hardware settings.
	run_cfg = dict(
	    data='/home/azureuser/clean_merged_data/data.yaml',
	    epochs=150,
	    imgsz=640,
	    batch=64,  # author's note: H100 NVL has 95GB, so the batch can be pushed high
	    device=0,
	    workers=8,
	    project='runs_clean',
	    name='h100_3class',
	    exist_ok=True,  # reuse the same run directory on re-runs
	    amp=True,
	    plots=True,
	    verbose=True,
	)

	# Augmentation — the author flags this as important for the ~10k image dataset.
	augment_cfg = dict(
	    mosaic=1.0,
	    close_mosaic=15,
	    mixup=0.15,
	    # Author's intent: boost the with-helmet class via cross-image pasting.
	    # NOTE(review): copy-paste may need segmentation masks in the labels to
	    # have any effect — confirm against the dataset format.
	    copy_paste=0.3,
	    hsv_h=0.015,
	    hsv_s=0.7,
	    hsv_v=0.4,
	    degrees=5.0,
	    translate=0.1,
	    scale=0.5,
	    fliplr=0.5,
	)

	# Loss component weights.
	loss_cfg = dict(
	    cls=1.0,  # classification loss weight (bump if classes are still confused)
	    box=7.5,
	    dfl=1.5,
	)

	# Regularization, optimizer and LR schedule.
	optim_cfg = dict(
	    weight_decay=0.0005,
	    dropout=0.0,
	    # NOTE(review): with optimizer='auto' the library may pick its own
	    # lr0/momentum and ignore the lr0 given here — confirm in the train log.
	    optimizer='auto',
	    lr0=0.01,
	    cos_lr=True,
	    patience=40,
	)

	results = model.train(**run_cfg, **augment_cfg, **loss_cfg, **optim_cfg)
	print('TRAIN DONE — running val on test split')
	# Take the checkpoint path from the trainer that just ran rather than
	# re-typing project/name as a cwd-relative string: the two can drift apart,
	# and with exist_ok=True a stale best.pt from an earlier run would then be
	# evaluated silently. Fall back to last.pt if no best checkpoint was written.
	trained_ckpt = model.trainer.best if model.trainer.best.exists() else model.trainer.last
	print('Evaluating checkpoint:', trained_ckpt)
	m = YOLO(str(trained_ckpt))
	# Evaluate on the held-out test split; save_json=True also writes the predictions as JSON.
	m.val(data='/home/azureuser/clean_merged_data/data.yaml', split='test', plots=True, save_json=True)