GLEN-model / scripts /train_full_vault.ps1

15-06-v2

08894ba 3 months ago

11.4 kB

	#!/usr/bin/env pwsh
	# Opening section of the GLEN full-training script: prints the banner,
	# defines the settings that the later stages of this script read, and echoes
	# them so they end up in the run log.

	Write-Host "==========================================="
	Write-Host "GLEN Full Training on The Vault Dataset"
	Write-Host "Processing 34M+ code samples"
	Write-Host "==========================================="

	# Production parameters
	$GPU_MEMORY_THRESHOLD = 0.85
	$GPU_CHECK_INTERVAL = 50
	$WANDB_PROJECT = "glen-vault-production"

	# Training configuration
	$PHASE1_EPOCHS = 3
	$PHASE2_EPOCHS = 5
	$PHASE1_BATCH_SIZE = 32
	$PHASE2_BATCH_SIZE = 16
	$GRADIENT_ACCUMULATION = 4
	$MAX_INPUT_LENGTH = 256
	$LEARNING_RATE = 5e-5

	Write-Host "🔧 Production Configuration:"
	Write-Host " - Phase 1 epochs: $PHASE1_EPOCHS"
	Write-Host " - Phase 2 epochs: $PHASE2_EPOCHS"
	Write-Host " - Phase 1 batch size: $PHASE1_BATCH_SIZE"
	Write-Host " - Phase 2 batch size: $PHASE2_BATCH_SIZE"
	Write-Host " - Gradient accumulation: $GRADIENT_ACCUMULATION"
	Write-Host " - Max input length: $MAX_INPUT_LENGTH"
	Write-Host " - Learning rate: $LEARNING_RATE"
	Write-Host ""

	Write-Host "🛡️ Memory Protection:"
	# The percentage is computed from the threshold so the label cannot drift
	# out of sync when the value above is edited.
	Write-Host " - GPU memory threshold: ${GPU_MEMORY_THRESHOLD} ($([int]($GPU_MEMORY_THRESHOLD * 100))%)"
	Write-Host " - Check interval: ${GPU_CHECK_INTERVAL} steps"
	Write-Host " - FP16 training enabled"
	Write-Host " - Automatic checkpoint saving on memory limit"
	Write-Host ""

	# Check prerequisites: the raw dataset directory must exist, and the
	# preprocessed training TSV is generated if it is not there yet.
	Write-Host "📋 Checking prerequisites..."

	# Check if full dataset exists
	if (-not (Test-Path "the_vault_dataset")) {
	    Write-Error "❌ The Vault dataset not found! Please download and extract to 'the_vault_dataset/'"
	    # NOTE(review): confirm this URL actually hosts The Vault dataset.
	    Write-Host " Download from: https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/CodeT5-learning-framework/data"
	    exit 1
	}

	# Ensure data preprocessing is done for full dataset
	Write-Host "Checking full dataset preprocessing..."
	$train_file = "data/the_vault/DOC_VAULT_train.tsv"
	if (-not (Test-Path $train_file)) {
	    Write-Host "🔄 Running full dataset preprocessing (this may take 30-60 minutes)..."
	    python scripts/preprocess_vault_dataset.py --input_dir the_vault_dataset/ --output_dir data/the_vault/ --full_dataset
	    if ($LASTEXITCODE -ne 0) {
	        Write-Error "❌ Data preprocessing failed!"
	        exit 1
	    }
	} else {
	    # Count lines by streaming the file. (Get-Content ...).Count would hold
	    # every line in memory at once, which is prohibitive for a TSV of this
	    # size. Convert-Path is needed because .NET resolves relative paths
	    # against the process working directory, not the PowerShell location.
	    $train_lines = 0
	    foreach ($line in [System.IO.File]::ReadLines((Convert-Path -LiteralPath $train_file))) {
	        $train_lines++
	    }
	    Write-Host "✅ Full dataset already preprocessed ($train_lines training samples)"
	}

	# Check GPU availability by asking nvidia-smi for the GPU names.
	# $gpu_count stays 0 when nvidia-smi is missing, fails, or prints nothing.
	$gpu_count = 0
	try {
	    $gpu_info = nvidia-smi '--query-gpu=name' '--format=csv,noheader,nounits' 2>$null
	    # Only trust the output when nvidia-smi succeeded; a failing run can
	    # still print a diagnostic on stdout that would be miscounted as a GPU.
	    if ($LASTEXITCODE -eq 0 -and $gpu_info) {
	        $gpu_count = ($gpu_info | Measure-Object).Count
	        Write-Host "🖥️ Detected $gpu_count GPU(s): $($gpu_info -join ', ')"
	    }
	} catch {
	    # Reached when the nvidia-smi command itself cannot be started.
	    Write-Host "⚠️ No GPU detected, will use CPU (training will be much slower)"
	}

	if ($gpu_count -eq 0) {
	    Write-Host "⚠️ Warning: Training on CPU will take days/weeks. Consider using GPU."
	    $response = Read-Host "Continue with CPU training? (y/N)"
	    # -ne is case-insensitive, so this accepts both "y" and "Y".
	    if ($response -ne "y") {
	        Write-Host "Training cancelled."
	        exit 0
	    }
	}

	Write-Host ""
	Write-Host "=== Phase 1 Training: Document ID Assignment ==="
	Write-Host "🎯 Learning to assign semantic identifiers to code documents"

	$PHASE1_OUTPUT = "logs/glen_vault_production/GLEN_P1"
	$env:CUDA_VISIBLE_DEVICES = "0"

	# Launch the Phase 1 trainer. The arguments are collected in an array and
	# splatted onto the command line; a non-zero exit code is turned into an
	# exception so that both launch errors and trainer failures end up in the
	# same catch handler.
	try {
	    $phase1_args = @(
	        '--output_dir', $PHASE1_OUTPUT
	        '--model_name_or_path', 't5-base'
	        '--query_type', 'gtq_doc'
	        '--per_device_train_batch_size', $PHASE1_BATCH_SIZE
	        '--per_device_eval_batch_size', '8'
	        '--gradient_accumulation_steps', $GRADIENT_ACCUMULATION
	        '--learning_rate', $LEARNING_RATE
	        '--dropout_rate', '0.1'
	        '--Rdrop', '0.15'
	        '--aug_query', 'True'
	        '--aug_query_type', 'corrupted_query'
	        '--input_dropout', '1'
	        '--id_class', 't5_bm25_truncate_3'
	        '--dataset_name', 'the_vault'
	        '--tree', '1'
	        '--pretrain_decoder', 'True'
	        '--max_input_length', $MAX_INPUT_LENGTH
	        '--val_check_interval', '0.1'
	        '--tie_word_embeddings', 'True'
	        '--decoder_input', 'doc_rep'
	        '--max_output_length', '10'
	        '--num_return_sequences', '10'
	        '--logging_steps', '100'
	        '--eval_steps', '1000'
	        '--save_steps', '2000'
	        '--overwrite_output_dir'
	        '--wandb_tag', 'phase1_production'
	        '--project_name', $WANDB_PROJECT
	        '--do_eval', 'True'
	        '--evaluation_strategy', 'steps'
	        '--num_train_epochs', $PHASE1_EPOCHS
	        '--save_strategy', 'steps'
	        '--save_total_limit', '5'
	        '--load_best_model_at_end', 'True'
	        '--metric_for_best_model', 'eval_loss'
	        '--greater_is_better', 'False'
	        '--seed', '42'
	        '--gpu_memory_threshold', $GPU_MEMORY_THRESHOLD
	        '--gpu_check_interval', $GPU_CHECK_INTERVAL
	        '--fp16', 'True'
	        '--dataloader_num_workers', '4'
	        '--warmup_ratio', '0.1'
	    )

	    python examples/glen_phase1/train_glen.py @phase1_args

	    if ($LASTEXITCODE -ne 0) {
	        throw "Phase 1 training failed!"
	    }
	}
	catch {
	    Write-Error "❌ Phase 1 training failed: $_"
	    Write-Host "📁 Check logs in: $PHASE1_OUTPUT"
	    exit 1
	}

	Write-Host "✅ Phase 1 training completed successfully!"

	# Validate Phase 1 checkpoint: the output directory must exist before a
	# checkpoint can be chosen from it.
	if (-not (Test-Path $PHASE1_OUTPUT)) {
	    Write-Error "❌ Phase 1 checkpoint not found at $PHASE1_OUTPUT"
	    exit 1
	}

	# Select the checkpoint-<step> directory with the highest step number.
	# NOTE(review): this is the most recent checkpoint, which is not necessarily
	# the best-scoring one — confirm that is the intended input for Phase 2.
	$best_checkpoint = Get-ChildItem -Path $PHASE1_OUTPUT -Directory -Filter "checkpoint-*" -Name |
	    Sort-Object { [int]($_.Split('-')[1]) } |
	    Select-Object -Last 1

	if ($best_checkpoint) {
	    Write-Host "📁 Using Phase 1 checkpoint: $best_checkpoint"
	    $PHASE1_CKPT = "$PHASE1_OUTPUT/$best_checkpoint"
	} else {
	    # No numbered checkpoint directories: fall back to the output directory.
	    $PHASE1_CKPT = $PHASE1_OUTPUT
	}

	Write-Host ""
	Write-Host "=== Phase 2 Training: Ranking-based Refinement ==="
	Write-Host "🎯 Learning to rank and refine document identifiers"

	$PHASE2_OUTPUT = "logs/glen_vault_production/GLEN_P2"

	# Launch the Phase 2 trainer, starting from the Phase 1 checkpoint path.
	# Arguments are collected in an array and splatted; a non-zero exit code is
	# converted into an exception handled by the catch block below.
	try {
	    $phase2_args = @(
	        '--output_dir', $PHASE2_OUTPUT
	        '--model_name_or_path', $PHASE1_CKPT
	        '--per_device_train_batch_size', $PHASE2_BATCH_SIZE
	        '--per_device_eval_batch_size', '4'
	        '--gradient_accumulation_steps', $GRADIENT_ACCUMULATION
	        '--learning_rate', $LEARNING_RATE
	        '--dropout_rate', '0.1'
	        '--warmup_ratio', '0.1'
	        '--id_class', 't5_bm25_truncate_3'
	        '--dataset_name', 'the_vault'
	        '--tree', '1'
	        '--q_max_len', '64'
	        '--p_max_len', $MAX_INPUT_LENGTH
	        '--negative_passage_type', 'self'
	        '--positive_passage_no_shuffle', 'True'
	        '--tie_word_embeddings', 'True'
	        '--num_return_sequences', '10'
	        '--logging_steps', '100'
	        '--eval_steps', '1000'
	        '--save_steps', '2000'
	        '--overwrite_output_dir'
	        '--wandb_tag', 'phase2_production'
	        '--project_name', $WANDB_PROJECT
	        '--do_eval', 'True'
	        '--evaluation_strategy', 'steps'
	        '--num_train_epochs', $PHASE2_EPOCHS
	        '--save_strategy', 'steps'
	        '--save_total_limit', '5'
	        '--load_best_model_at_end', 'True'
	        '--metric_for_best_model', 'eval_loss'
	        '--greater_is_better', 'False'
	        '--seed', '42'
	        '--gpu_memory_threshold', $GPU_MEMORY_THRESHOLD
	        '--gpu_check_interval', $GPU_CHECK_INTERVAL
	        '--fp16', 'True'
	        '--dataloader_num_workers', '4'
	    )

	    python examples/glen_phase2/train_glen.py @phase2_args

	    if ($LASTEXITCODE -ne 0) {
	        throw "Phase 2 training failed!"
	    }
	}
	catch {
	    Write-Error "❌ Phase 2 training failed: $_"
	    Write-Host "📁 Check logs in: $PHASE2_OUTPUT"
	    exit 1
	}

	Write-Host "✅ Phase 2 training completed successfully!"

	# Validate Phase 2 checkpoint: the output directory must exist before a
	# checkpoint can be chosen from it.
	if (-not (Test-Path $PHASE2_OUTPUT)) {
	    Write-Error "❌ Phase 2 checkpoint not found at $PHASE2_OUTPUT"
	    exit 1
	}

	# Select the checkpoint-<step> directory with the highest step number.
	# NOTE(review): this is the most recent checkpoint, which is not necessarily
	# the best-scoring one — confirm that is the intended model for inference.
	$best_checkpoint_p2 = Get-ChildItem -Path $PHASE2_OUTPUT -Directory -Filter "checkpoint-*" -Name |
	    Sort-Object { [int]($_.Split('-')[1]) } |
	    Select-Object -Last 1

	if ($best_checkpoint_p2) {
	    Write-Host "📁 Using Phase 2 checkpoint: $best_checkpoint_p2"
	    $PHASE2_CKPT = "$PHASE2_OUTPUT/$best_checkpoint_p2"
	} else {
	    # No numbered checkpoint directories: fall back to the output directory.
	    $PHASE2_CKPT = $PHASE2_OUTPUT
	}

	Write-Host ""
	Write-Host "=== Document ID Generation ==="
	Write-Host "🎯 Generating semantic IDs for all documents"

	# Run the document-ID generation script against the Phase 2 checkpoint.
	# Arguments are splatted from an array; a non-zero exit code is raised as
	# an exception so the catch block reports it and aborts the script.
	try {
	    $makeid_args = @(
	        '--model_name_or_path', $PHASE2_CKPT
	        '--infer_dir', $PHASE2_CKPT
	        '--dataset_name', 'the_vault'
	        '--docid_file_name', 'glen_vault_production_docids'
	        '--per_device_eval_batch_size', '16'
	        '--max_input_length', $MAX_INPUT_LENGTH
	        '--num_return_sequences', '20'
	    )

	    python examples/glen_phase2/makeid_glen.py @makeid_args

	    if ($LASTEXITCODE -ne 0) {
	        throw "Document ID generation failed!"
	    }
	}
	catch {
	    Write-Error "❌ Document ID generation failed: $_"
	    exit 1
	}

	# Validate docid file: abort if the expected TSV is missing, otherwise
	# report how many lines it contains.
	$docid_file = "logs/glen_vault_production/glen_vault_production_docids.tsv"
	if (-not (Test-Path $docid_file)) {
	    Write-Error "❌ Document ID file not created: $docid_file"
	    exit 1
	}

	# Count lines by streaming the file instead of loading it all into memory.
	# Convert-Path is needed because .NET resolves relative paths against the
	# process working directory, not the PowerShell location.
	$total_docs = 0
	foreach ($line in [System.IO.File]::ReadLines((Convert-Path -LiteralPath $docid_file))) {
	    $total_docs++
	}
	Write-Host "✅ Document ID generation completed! Generated $total_docs document IDs"

	Write-Host ""
	Write-Host "=== Model Evaluation ==="
	Write-Host "🎯 Evaluating model performance on test set"

	# Run the evaluation script against the Phase 2 checkpoint. Arguments are
	# splatted from an array; a non-zero exit code is raised as an exception so
	# the catch block reports it and aborts the script.
	try {
	    $eval_args = @(
	        '--model_name_or_path', $PHASE2_CKPT
	        '--infer_dir', $PHASE2_CKPT
	        '--dataset_name', 'the_vault'
	        '--docid_file_name', 'glen_vault_production_docids'
	        '--per_device_eval_batch_size', '8'
	        '--q_max_len', '64'
	        '--num_return_sequences', '20'
	        '--logs_dir', 'logs/glen_vault_production'
	    )

	    python examples/glen_phase2/evaluate_glen.py @eval_args

	    if ($LASTEXITCODE -ne 0) {
	        throw "Model evaluation failed!"
	    }
	}
	catch {
	    Write-Error "❌ Model evaluation failed: $_"
	    exit 1
	}

	Write-Host "✅ Model evaluation completed successfully!"

	# Training completion summary: capture the finish timestamp and print a
	# recap of the stages, artifact locations and settings used by this run.
	$training_time = Get-Date

	# Derived from the threshold so the label cannot drift from the setting.
	$threshold_percent = [int]($GPU_MEMORY_THRESHOLD * 100)

	$summary_lines = @(
	    ""
	    "==========================================="
	    "🎉 FULL TRAINING COMPLETED SUCCESSFULLY! 🎉"
	    "==========================================="
	    ""
	    "📊 Training Summary:"
	    " ✅ Phase 1: Document ID Assignment ($PHASE1_EPOCHS epochs)"
	    " ✅ Phase 2: Ranking Refinement ($PHASE2_EPOCHS epochs)"
	    " ✅ Document ID Generation ($total_docs documents)"
	    " ✅ Model Evaluation & Metrics"
	    ""
	    "📁 Production Model Artifacts:"
	    " 🏷️ Phase 1 Checkpoint: $PHASE1_CKPT"
	    " 🏷️ Phase 2 Checkpoint: $PHASE2_CKPT"
	    " 📄 Document IDs: $docid_file"
	    " 📊 Evaluation Results: logs/glen_vault_production/"
	    ""
	    "🛡️ Memory Protection Summary:"
	    " - GPU memory threshold: ${GPU_MEMORY_THRESHOLD} (${threshold_percent}%)"
	    " - Check interval: ${GPU_CHECK_INTERVAL} steps"
	    " - FP16 training enabled throughout"
	    " - Automatic checkpoint saving on memory limits"
	    ""
	    "📈 Performance Optimizations Used:"
	    " - Gradient accumulation: ${GRADIENT_ACCUMULATION}x"
	    " - Multi-worker data loading"
	    " - Mixed precision training (FP16)"
	    " - Memory-efficient batch sizes"
	    ""
	    "🚀 Your GLEN model is ready for production use!"
	    " - Use the Phase 2 checkpoint for inference"
	    " - Document IDs are saved for fast retrieval"
	    " - Evaluation metrics are in the logs directory"
	    ""
	    "Training completed at: $training_time"
	)

	foreach ($summary_line in $summary_lines) {
	    Write-Host $summary_line
	}