# GLEN full-training pipeline for The Vault dataset (PowerShell).
#
# Stages, in order: prerequisite checks, Phase 1 training, Phase 2 training,
# document ID generation, evaluation, final summary.
# Console messages are kept ASCII-only so they render the same regardless of
# the encoding the script file is saved or read with.

Write-Host "==========================================="
Write-Host "GLEN Full Training on The Vault Dataset"
Write-Host "Processing 34M+ code samples"
Write-Host "==========================================="

# --- Memory protection / experiment tracking ---------------------------------
# These values are forwarded to both training scripts via
# --gpu_memory_threshold, --gpu_check_interval and --project_name.
$GPU_MEMORY_THRESHOLD = 0.85
$GPU_CHECK_INTERVAL = 50
$WANDB_PROJECT = "glen-vault-production"

# --- Training hyperparameters -------------------------------------------------
$PHASE1_EPOCHS = 3
$PHASE2_EPOCHS = 5
$PHASE1_BATCH_SIZE = 32
$PHASE2_BATCH_SIZE = 16
$GRADIENT_ACCUMULATION = 4
$MAX_INPUT_LENGTH = 256
$LEARNING_RATE = 5e-5

# --- Echo the effective configuration -----------------------------------------
Write-Host "Production Configuration:"
Write-Host " - Phase 1 epochs: $PHASE1_EPOCHS"
Write-Host " - Phase 2 epochs: $PHASE2_EPOCHS"
Write-Host " - Phase 1 batch size: $PHASE1_BATCH_SIZE"
Write-Host " - Phase 2 batch size: $PHASE2_BATCH_SIZE"
Write-Host " - Gradient accumulation: $GRADIENT_ACCUMULATION"
Write-Host " - Max input length: $MAX_INPUT_LENGTH"
Write-Host " - Learning rate: $LEARNING_RATE"
Write-Host ""

Write-Host "Memory Protection:"
Write-Host " - GPU memory threshold: ${GPU_MEMORY_THRESHOLD} (85%)"
Write-Host " - Check interval: ${GPU_CHECK_INTERVAL} steps"
Write-Host " - FP16 training enabled"
Write-Host " - Automatic checkpoint saving on memory limit"
Write-Host ""
# --- Prerequisites: raw dataset present, preprocessed training TSV available ---
Write-Host "Checking prerequisites..."

# Abort early when the raw dataset directory is missing.
if (-not (Test-Path "the_vault_dataset")) {
    Write-Error "The Vault dataset not found! Please download and extract to 'the_vault_dataset/'"
    Write-Host " Download from: https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/CodeT5-learning-framework/data"
    exit 1
}

# Run preprocessing only when the training TSV does not exist yet.
Write-Host "Checking full dataset preprocessing..."
if (-not (Test-Path "data/the_vault/DOC_VAULT_train.tsv")) {
    Write-Host "Running full dataset preprocessing (this may take 30-60 minutes)..."
    python scripts/preprocess_vault_dataset.py --input_dir the_vault_dataset/ --output_dir data/the_vault/ --full_dataset
    if ($LASTEXITCODE -ne 0) {
        Write-Error "Data preprocessing failed!"
        exit 1
    }
} else {
    # Count lines in batches instead of materialising the whole file in memory;
    # @($_) keeps the count correct if a batch arrives as a single string.
    $train_lines = 0
    Get-Content -Path "data/the_vault/DOC_VAULT_train.tsv" -ReadCount 10000 |
        ForEach-Object { $train_lines += @($_).Count }
    Write-Host "[OK] Full dataset already preprocessed ($train_lines training samples)"
}
# --- GPU detection ------------------------------------------------------------
# $gpu_count stays 0 unless nvidia-smi runs successfully and lists devices.
$gpu_count = 0
try {
    # The comma-separated format list is quoted so it is passed as one argument.
    $gpu_info = nvidia-smi --query-gpu=name '--format=csv,noheader,nounits' 2>$null

    # Require a zero exit code so diagnostic text printed by a failing
    # nvidia-smi is not counted as GPU names.
    if ($LASTEXITCODE -eq 0 -and $gpu_info) {
        $gpu_count = @($gpu_info).Count
        Write-Host "Detected $gpu_count GPU(s): $($gpu_info -join ', ')"
    }
} catch {
    # Reached when nvidia-smi cannot be started at all (e.g. not on PATH).
    Write-Host "[WARN] No GPU detected, will use CPU (training will be much slower)"
}

# Without a GPU, require explicit confirmation before starting.
if ($gpu_count -eq 0) {
    Write-Host "[WARN] Warning: Training on CPU will take days/weeks. Consider using GPU."
    $response = Read-Host "Continue with CPU training? (y/N)"
    if ($response -ne "y" -and $response -ne "Y") {
        Write-Host "Training cancelled."
        exit 0
    }
}
# --- Phase 1 training ---------------------------------------------------------
Write-Host ""
Write-Host "=== Phase 1 Training: Document ID Assignment ==="
Write-Host "Learning to assign semantic identifiers to code documents"

$PHASE1_OUTPUT = "logs/glen_vault_production/GLEN_P1"

# Exported to child processes; presumably limits training to the first GPU.
$env:CUDA_VISIBLE_DEVICES = "0"

# Arguments are collected in an array and splatted onto the command. This
# avoids backtick line continuations, which silently break when anything
# (even a space) follows the backtick.
$phase1_args = @(
    '--output_dir', $PHASE1_OUTPUT
    '--model_name_or_path', 't5-base'
    '--query_type', 'gtq_doc'
    '--per_device_train_batch_size', $PHASE1_BATCH_SIZE
    '--per_device_eval_batch_size', '8'
    '--gradient_accumulation_steps', $GRADIENT_ACCUMULATION
    '--learning_rate', $LEARNING_RATE
    '--dropout_rate', '0.1'
    '--Rdrop', '0.15'
    '--aug_query', 'True'
    '--aug_query_type', 'corrupted_query'
    '--input_dropout', '1'
    '--id_class', 't5_bm25_truncate_3'
    '--dataset_name', 'the_vault'
    '--tree', '1'
    '--pretrain_decoder', 'True'
    '--max_input_length', $MAX_INPUT_LENGTH
    '--val_check_interval', '0.1'
    '--tie_word_embeddings', 'True'
    '--decoder_input', 'doc_rep'
    '--max_output_length', '10'
    '--num_return_sequences', '10'
    '--logging_steps', '100'
    '--eval_steps', '1000'
    '--save_steps', '2000'
    '--overwrite_output_dir'
    '--wandb_tag', 'phase1_production'
    '--project_name', $WANDB_PROJECT
    '--do_eval', 'True'
    '--evaluation_strategy', 'steps'
    '--num_train_epochs', $PHASE1_EPOCHS
    '--save_strategy', 'steps'
    '--save_total_limit', '5'
    '--load_best_model_at_end', 'True'
    '--metric_for_best_model', 'eval_loss'
    '--greater_is_better', 'False'
    '--seed', '42'
    '--gpu_memory_threshold', $GPU_MEMORY_THRESHOLD
    '--gpu_check_interval', $GPU_CHECK_INTERVAL
    '--fp16', 'True'
    '--dataloader_num_workers', '4'
    '--warmup_ratio', '0.1'
)

try {
    python examples/glen_phase1/train_glen.py @phase1_args

    # Native commands do not throw on failure; turn a non-zero exit code
    # into an exception so the catch block handles it.
    if ($LASTEXITCODE -ne 0) {
        throw "Phase 1 training failed!"
    }
} catch {
    Write-Error "Phase 1 training failed: $_"
    Write-Host "Check logs in: $PHASE1_OUTPUT"
    exit 1
}

Write-Host "[OK] Phase 1 training completed successfully!"
# --- Locate the Phase 1 checkpoint that Phase 2 starts from -------------------
if (-not (Test-Path $PHASE1_OUTPUT)) {
    Write-Error "Phase 1 checkpoint not found at $PHASE1_OUTPUT"
    exit 1
}

# List "checkpoint-<step>" directory names and keep the one with the highest
# numeric step. NOTE: despite the variable name, this is the most recent
# checkpoint, not necessarily the one with the best eval_loss.
$best_checkpoint = Get-ChildItem -Path $PHASE1_OUTPUT -Directory -Filter "checkpoint-*" -Name |
    Sort-Object { [int]($_.Split('-')[1]) } |
    Select-Object -Last 1

if ($best_checkpoint) {
    Write-Host "Using Phase 1 checkpoint: $best_checkpoint"
    $PHASE1_CKPT = "$PHASE1_OUTPUT/$best_checkpoint"
} else {
    # No checkpoint-* subdirectory: fall back to the output directory itself.
    $PHASE1_CKPT = $PHASE1_OUTPUT
}
# --- Phase 2 training ---------------------------------------------------------
Write-Host ""
Write-Host "=== Phase 2 Training: Ranking-based Refinement ==="
Write-Host "Learning to rank and refine document identifiers"

$PHASE2_OUTPUT = "logs/glen_vault_production/GLEN_P2"

# Phase 2 is initialised from the Phase 1 checkpoint ($PHASE1_CKPT).
# Arguments are splatted from an array rather than chained with backtick
# continuations, which break when anything follows the backtick.
$phase2_args = @(
    '--output_dir', $PHASE2_OUTPUT
    '--model_name_or_path', $PHASE1_CKPT
    '--per_device_train_batch_size', $PHASE2_BATCH_SIZE
    '--per_device_eval_batch_size', '4'
    '--gradient_accumulation_steps', $GRADIENT_ACCUMULATION
    '--learning_rate', $LEARNING_RATE
    '--dropout_rate', '0.1'
    '--warmup_ratio', '0.1'
    '--id_class', 't5_bm25_truncate_3'
    '--dataset_name', 'the_vault'
    '--tree', '1'
    '--q_max_len', '64'
    '--p_max_len', $MAX_INPUT_LENGTH
    '--negative_passage_type', 'self'
    '--positive_passage_no_shuffle', 'True'
    '--tie_word_embeddings', 'True'
    '--num_return_sequences', '10'
    '--logging_steps', '100'
    '--eval_steps', '1000'
    '--save_steps', '2000'
    '--overwrite_output_dir'
    '--wandb_tag', 'phase2_production'
    '--project_name', $WANDB_PROJECT
    '--do_eval', 'True'
    '--evaluation_strategy', 'steps'
    '--num_train_epochs', $PHASE2_EPOCHS
    '--save_strategy', 'steps'
    '--save_total_limit', '5'
    '--load_best_model_at_end', 'True'
    '--metric_for_best_model', 'eval_loss'
    '--greater_is_better', 'False'
    '--seed', '42'
    '--gpu_memory_threshold', $GPU_MEMORY_THRESHOLD
    '--gpu_check_interval', $GPU_CHECK_INTERVAL
    '--fp16', 'True'
    '--dataloader_num_workers', '4'
)

try {
    python examples/glen_phase2/train_glen.py @phase2_args

    # Native commands do not throw on failure; turn a non-zero exit code
    # into an exception so the catch block handles it.
    if ($LASTEXITCODE -ne 0) {
        throw "Phase 2 training failed!"
    }
} catch {
    Write-Error "Phase 2 training failed: $_"
    Write-Host "Check logs in: $PHASE2_OUTPUT"
    exit 1
}

Write-Host "[OK] Phase 2 training completed successfully!"
# --- Locate the Phase 2 checkpoint used for ID generation and evaluation ------
if (-not (Test-Path $PHASE2_OUTPUT)) {
    Write-Error "Phase 2 checkpoint not found at $PHASE2_OUTPUT"
    exit 1
}

# List "checkpoint-<step>" directory names and keep the one with the highest
# numeric step. NOTE: despite the variable name, this is the most recent
# checkpoint, not necessarily the one with the best eval_loss.
$best_checkpoint_p2 = Get-ChildItem -Path $PHASE2_OUTPUT -Directory -Filter "checkpoint-*" -Name |
    Sort-Object { [int]($_.Split('-')[1]) } |
    Select-Object -Last 1

if ($best_checkpoint_p2) {
    Write-Host "Using Phase 2 checkpoint: $best_checkpoint_p2"
    $PHASE2_CKPT = "$PHASE2_OUTPUT/$best_checkpoint_p2"
} else {
    # No checkpoint-* subdirectory: fall back to the output directory itself.
    $PHASE2_CKPT = $PHASE2_OUTPUT
}
# --- Document ID generation ---------------------------------------------------
Write-Host ""
Write-Host "=== Document ID Generation ==="
Write-Host "Generating semantic IDs for all documents"

# The Phase 2 checkpoint is passed both as the model and as --infer_dir.
# Arguments are splatted from an array instead of using backtick continuations.
$makeid_args = @(
    '--model_name_or_path', $PHASE2_CKPT
    '--infer_dir', $PHASE2_CKPT
    '--dataset_name', 'the_vault'
    '--docid_file_name', 'glen_vault_production_docids'
    '--per_device_eval_batch_size', '16'
    '--max_input_length', $MAX_INPUT_LENGTH
    '--num_return_sequences', '20'
)

try {
    python examples/glen_phase2/makeid_glen.py @makeid_args

    # Surface a non-zero exit code as an exception for the catch block.
    if ($LASTEXITCODE -ne 0) {
        throw "Document ID generation failed!"
    }
} catch {
    Write-Error "Document ID generation failed: $_"
    exit 1
}
# --- Verify that the document ID file was produced ----------------------------
$docid_file = "logs/glen_vault_production/glen_vault_production_docids.tsv"
if (-not (Test-Path $docid_file)) {
    Write-Error "Document ID file not created: $docid_file"
    exit 1
}

# Count lines in batches so the whole file is never held in memory at once;
# @($_) keeps the count correct if a batch arrives as a single string.
$total_docs = 0
Get-Content -Path $docid_file -ReadCount 10000 |
    ForEach-Object { $total_docs += @($_).Count }
Write-Host "[OK] Document ID generation completed! Generated $total_docs document IDs"
# --- Model evaluation ---------------------------------------------------------
Write-Host ""
Write-Host "=== Model Evaluation ==="
Write-Host "Evaluating model performance on test set"

# Evaluation uses the Phase 2 checkpoint and the document ID file name given
# to the ID generation step. Arguments are splatted from an array instead of
# using backtick continuations.
$eval_args = @(
    '--model_name_or_path', $PHASE2_CKPT
    '--infer_dir', $PHASE2_CKPT
    '--dataset_name', 'the_vault'
    '--docid_file_name', 'glen_vault_production_docids'
    '--per_device_eval_batch_size', '8'
    '--q_max_len', '64'
    '--num_return_sequences', '20'
    '--logs_dir', 'logs/glen_vault_production'
)

try {
    python examples/glen_phase2/evaluate_glen.py @eval_args

    # Surface a non-zero exit code as an exception for the catch block.
    if ($LASTEXITCODE -ne 0) {
        throw "Model evaluation failed!"
    }
} catch {
    Write-Error "Model evaluation failed: $_"
    exit 1
}

Write-Host "[OK] Model evaluation completed successfully!"
# --- Final summary ------------------------------------------------------------
# Timestamp taken once all stages have finished; it is a completion time,
# not a duration.
$training_time = Get-Date

Write-Host ""
Write-Host "==========================================="
Write-Host "FULL TRAINING COMPLETED SUCCESSFULLY!"
Write-Host "==========================================="
Write-Host ""

Write-Host "Training Summary:"
Write-Host " [OK] Phase 1: Document ID Assignment ($PHASE1_EPOCHS epochs)"
Write-Host " [OK] Phase 2: Ranking Refinement ($PHASE2_EPOCHS epochs)"
Write-Host " [OK] Document ID Generation ($total_docs documents)"
Write-Host " [OK] Model Evaluation & Metrics"
Write-Host ""

Write-Host "Production Model Artifacts:"
Write-Host " Phase 1 Checkpoint: $PHASE1_CKPT"
Write-Host " Phase 2 Checkpoint: $PHASE2_CKPT"
Write-Host " Document IDs: $docid_file"
Write-Host " Evaluation Results: logs/glen_vault_production/"
Write-Host ""

Write-Host "Memory Protection Summary:"
Write-Host " - GPU memory threshold: ${GPU_MEMORY_THRESHOLD} (85%)"
Write-Host " - Check interval: ${GPU_CHECK_INTERVAL} steps"
Write-Host " - FP16 training enabled throughout"
Write-Host " - Automatic checkpoint saving on memory limits"
Write-Host ""

Write-Host "Performance Optimizations Used:"
Write-Host " - Gradient accumulation: ${GRADIENT_ACCUMULATION}x"
Write-Host " - Multi-worker data loading"
Write-Host " - Mixed precision training (FP16)"
Write-Host " - Memory-efficient batch sizes"
Write-Host ""

Write-Host "Your GLEN model is ready for production use!"
Write-Host " - Use the Phase 2 checkpoint for inference"
Write-Host " - Document IDs are saved for fast retrieval"
Write-Host " - Evaluation metrics are in the logs directory"
Write-Host ""

Write-Host "Training completed at: $training_time"