# Smoke test for the GLEN pipeline on a small sample of The Vault dataset.
# Steps, in order: data preprocessing (only if needed), Phase 1 training,
# Phase 2 training, document ID generation and query inference.
# Every step exits with code 1 on failure.

Write-Host "==========================================="
Write-Host "Testing GLEN with small Vault dataset"
Write-Host "==========================================="

# GPU memory protection settings. Both training steps forward these values
# through --gpu_memory_threshold and --gpu_check_interval.
$GPU_MEMORY_THRESHOLD = 0.85
$GPU_CHECK_INTERVAL = 50

# Derive the displayed percentage from the threshold so the message cannot
# drift away from the configured value when the constant is edited.
$thresholdPercent = [int][math]::Round($GPU_MEMORY_THRESHOLD * 100)

Write-Host "GPU Memory Protection enabled:"
Write-Host "- Memory threshold: ${GPU_MEMORY_THRESHOLD} (${thresholdPercent}%)"
Write-Host "- Check interval: ${GPU_CHECK_INTERVAL} steps"
Write-Host ""

# Preprocess the dataset only when the training TSV is missing, so repeated
# runs reuse the sample that is already on disk.
Write-Host "Checking data preprocessing..."

$trainTsv = "data/the_vault/DOC_VAULT_train.tsv"

if (Test-Path $trainTsv) {
    Write-Host "Data already preprocessed."
} else {
    Write-Host "Running data preprocessing..."

    # Arguments are passed as a splatted array rather than with backtick line
    # continuations, which break silently when anything follows the backtick.
    $preprocessArgs = @(
        '--input_dir', 'the_vault_dataset/'
        '--output_dir', 'data/the_vault/'
        '--sample_size', '1000'
    )
    python scripts/preprocess_vault_dataset.py @preprocessArgs

    # Native commands do not throw on failure; the exit code must be checked.
    if ($LASTEXITCODE -ne 0) {
        Write-Error "Data preprocessing failed!"
        exit 1
    }
}

# ---------------------------------------------------------------------------
# Phase 1: train the document ID assignment model.
# ---------------------------------------------------------------------------
Write-Host ""
Write-Host "=== Phase 1 Training (Document ID Assignment) ==="

# Restrict the run to the first GPU.
$env:CUDA_VISIBLE_DEVICES = "0"

# Arguments are collected in an array and splatted. This avoids backtick line
# continuations, which stop working as soon as any character (even a space)
# follows the backtick.
$phase1Args = @(
    '--output_dir', 'logs/test_glen_vault/GLEN_P1_test'
    '--model_name_or_path', 't5-base'
    '--query_type', 'gtq_doc'
    '--per_device_train_batch_size', '8'
    '--per_device_eval_batch_size', '4'
    '--gradient_accumulation_steps', '2'
    '--dropout_rate', '0.1'
    '--Rdrop', '0.15'
    '--aug_query', 'True'
    '--aug_query_type', 'corrupted_query'
    '--input_dropout', '1'
    '--id_class', 't5_bm25_truncate_3'
    '--dataset_name', 'the_vault'
    '--test100', '1'
    '--tree', '1'
    '--pretrain_decoder', 'True'
    '--max_input_length', '128'
    '--val_check_interval', '1.0'
    '--tie_word_embeddings', 'True'
    '--decoder_input', 'doc_rep'
    '--max_output_length', '5'
    '--num_return_sequences', '5'
    '--logging_steps', '10'
    '--overwrite_output_dir'
    '--wandb_tag', 'test_glen_vault_p1'
    '--do_eval', 'False'
    '--num_train_epochs', '1'
    '--save_steps', '50'
    '--save_strategy', 'steps'
    '--evaluation_strategy', 'no'
    '--seed', '42'
    # Quoted so the values are expanded to text before being handed to python.
    '--gpu_memory_threshold', "$GPU_MEMORY_THRESHOLD"
    '--gpu_check_interval', "$GPU_CHECK_INTERVAL"
    '--fp16', 'True'
)

try {
    python examples/glen_phase1/train_glen.py @phase1Args

    # A failing native command does not raise; turn a non-zero exit code into
    # an exception so it is reported by the same handler as launch failures.
    if ($LASTEXITCODE -ne 0) {
        throw "python exited with code $LASTEXITCODE"
    }
} catch {
    Write-Error "Phase 1 training failed: $_"
    exit 1
}

# ASCII status marker: the original emoji was corrupted by a file-encoding
# mismatch and split this string across two lines.
Write-Host "[OK] Phase 1 training completed successfully!"

# Verify that Phase 1 produced an output directory containing model weights
# before Phase 2 tries to load it.
$PHASE1_CKPT = "logs/test_glen_vault/GLEN_P1_test"

if (-not (Test-Path $PHASE1_CKPT)) {
    Write-Error "Phase 1 checkpoint not found at $PHASE1_CKPT"
    exit 1
}

# Weight file names accepted as proof of a saved model. This list is also
# consulted by the Phase 2 checkpoint verification further down.
$model_files = @("pytorch_model.bin", "model.safetensors")

# First listed file that exists in the checkpoint directory, or $null.
$phase1_model_file = $model_files |
    Where-Object { Test-Path "$PHASE1_CKPT/$_" } |
    Select-Object -First 1

$found_model = [bool]$phase1_model_file

if (-not $found_model) {
    Write-Error "No model files found in Phase 1 checkpoint"
    exit 1
}

Write-Host "Found Phase 1 model: $phase1_model_file"

# ---------------------------------------------------------------------------
# Phase 2: ranking-based refinement, initialised from the Phase 1 output.
# ---------------------------------------------------------------------------
Write-Host ""
Write-Host "=== Phase 2 Training (Ranking-based Refinement) ==="

# Arguments are collected in an array and splatted. This avoids backtick line
# continuations, which stop working as soon as any character (even a space)
# follows the backtick.
$phase2Args = @(
    '--output_dir', 'logs/test_glen_vault/GLEN_P2_test'
    # Start from the model directory written by Phase 1.
    '--model_name_or_path', "$PHASE1_CKPT"
    '--per_device_train_batch_size', '4'
    '--per_device_eval_batch_size', '2'
    '--gradient_accumulation_steps', '4'
    '--dropout_rate', '0.1'
    '--warmup_ratio', '0.1'
    '--id_class', 't5_bm25_truncate_3'
    '--dataset_name', 'the_vault'
    '--test100', '1'
    '--tree', '1'
    '--q_max_len', '32'
    '--p_max_len', '128'
    '--negative_passage_type', 'self'
    '--positive_passage_no_shuffle', 'True'
    '--tie_word_embeddings', 'True'
    '--num_return_sequences', '5'
    '--logging_steps', '10'
    '--overwrite_output_dir'
    '--wandb_tag', 'test_glen_vault_p2'
    '--do_eval', 'False'
    '--num_train_epochs', '1'
    '--save_steps', '50'
    '--save_strategy', 'steps'
    '--evaluation_strategy', 'no'
    '--seed', '42'
    # Quoted so the values are expanded to text before being handed to python.
    '--gpu_memory_threshold', "$GPU_MEMORY_THRESHOLD"
    '--gpu_check_interval', "$GPU_CHECK_INTERVAL"
    '--fp16', 'True'
)

try {
    python examples/glen_phase2/train_glen.py @phase2Args

    # A failing native command does not raise; turn a non-zero exit code into
    # an exception so it is reported by the same handler as launch failures.
    if ($LASTEXITCODE -ne 0) {
        throw "python exited with code $LASTEXITCODE"
    }
} catch {
    Write-Error "Phase 2 training failed: $_"
    exit 1
}

# ASCII status marker: the original emoji was corrupted by a file-encoding
# mismatch and split this string across two lines.
Write-Host "[OK] Phase 2 training completed successfully!"

# Verify that Phase 2 produced model weights, either inside the newest
# numbered checkpoint-<step> directory or directly in the output directory.
$PHASE2_CKPT = "logs/test_glen_vault/GLEN_P2_test"

if (-not (Test-Path $PHASE2_CKPT)) {
    Write-Error "Phase 2 checkpoint not found at $PHASE2_CKPT"
    exit 1
}

# Name of the checkpoint-<step> directory with the highest step number.
# -Filter is spelled out (the original relied on positional binding after the
# -Name switch), and only purely numeric suffixes are considered so that the
# [int] sort key cannot fail on names such as "checkpoint-best".
$latest_checkpoint = Get-ChildItem -Path $PHASE2_CKPT -Directory -Filter 'checkpoint-*' -Name |
    Where-Object { $_ -match '^checkpoint-\d+$' } |
    Sort-Object { [int]($_ -replace '^checkpoint-', '') } |
    Select-Object -Last 1

if ($latest_checkpoint) {
    Write-Host "Found Phase 2 checkpoint: $latest_checkpoint"
    $checkpoint_path = "$PHASE2_CKPT/$latest_checkpoint"

    # Accept any of the known weight file names ($model_files is defined by
    # the Phase 1 checkpoint verification earlier in this script).
    $checkpoint_weights = $model_files | Where-Object { Test-Path "$checkpoint_path/$_" }
    if (-not $checkpoint_weights) {
        Write-Error "No model files in checkpoint directory"
        exit 1
    }
} else {
    # No numbered checkpoints: the weights must sit in the output directory.
    $found_model = $false
    foreach ($file in $model_files) {
        if (Test-Path "$PHASE2_CKPT/$file") {
            $found_model = $true
            Write-Host "Found Phase 2 model: $file"
            break
        }
    }

    if (-not $found_model) {
        Write-Error "No model files found in Phase 2 checkpoint"
        exit 1
    }
}

# ---------------------------------------------------------------------------
# Generate document IDs with the Phase 2 model.
# ---------------------------------------------------------------------------
Write-Host ""
Write-Host "=== Document ID Generation ==="

# NOTE(review): the model is loaded from the Phase 2 output directory itself,
# not from the newest checkpoint-<step> subdirectory located by the preceding
# verification step. Confirm makeid_glen.py resolves the checkpoint on its own.
$makeidArgs = @(
    '--model_name_or_path', "$PHASE2_CKPT"
    '--infer_dir', "$PHASE2_CKPT"
    '--dataset_name', 'the_vault'
    '--docid_file_name', 'GLEN_P2_test_docids'
    '--per_device_eval_batch_size', '4'
    '--max_input_length', '128'
    '--num_return_sequences', '10'
)

try {
    python examples/glen_phase2/makeid_glen.py @makeidArgs

    # A failing native command does not raise; turn a non-zero exit code into
    # an exception so it is reported by the same handler as launch failures.
    if ($LASTEXITCODE -ne 0) {
        throw "python exited with code $LASTEXITCODE"
    }
} catch {
    Write-Error "Document ID generation failed: $_"
    exit 1
}

# The generation step is expected to have written this file.
$docid_file = "logs/test_glen_vault/GLEN_P2_test_docids.tsv"

if (-not (Test-Path $docid_file)) {
    Write-Error "Document ID file not created: $docid_file"
    exit 1
}

# @() forces an array so .Count is the number of lines even when the file
# holds exactly one line (a scalar string) or nothing at all ($null).
$line_count = @(Get-Content -Path $docid_file).Count

# An empty file means nothing was generated; do not report that as success.
if ($line_count -eq 0) {
    Write-Error "Document ID file is empty: $docid_file"
    exit 1
}

# ASCII status marker: the original emoji was corrupted by a file-encoding
# mismatch and split this string across two lines.
Write-Host "[OK] Document ID generation completed! Generated $line_count document IDs"

# ---------------------------------------------------------------------------
# Run query inference / evaluation against the generated document IDs.
# ---------------------------------------------------------------------------
Write-Host ""
Write-Host "=== Query Inference ==="

# Arguments are collected in an array and splatted. This avoids backtick line
# continuations, which stop working as soon as any character (even a space)
# follows the backtick.
$evaluateArgs = @(
    '--model_name_or_path', "$PHASE2_CKPT"
    '--infer_dir', "$PHASE2_CKPT"
    '--dataset_name', 'the_vault'
    '--docid_file_name', 'GLEN_P2_test_docids'
    '--per_device_eval_batch_size', '4'
    '--q_max_len', '32'
    '--num_return_sequences', '5'
    '--logs_dir', 'logs/test_glen_vault'
)

try {
    python examples/glen_phase2/evaluate_glen.py @evaluateArgs

    # A failing native command does not raise; turn a non-zero exit code into
    # an exception so it is reported by the same handler as launch failures.
    if ($LASTEXITCODE -ne 0) {
        throw "python exited with code $LASTEXITCODE"
    }
} catch {
    Write-Error "Query inference failed: $_"
    exit 1
}

# ASCII status marker: the original emoji was corrupted by a file-encoding
# mismatch and split this string across two lines.
Write-Host "[OK] Query inference completed successfully!"

# ---------------------------------------------------------------------------
# Final summary. Only reached when every previous step succeeded, because
# each step exits with code 1 on failure.
# Status markers are plain ASCII: the original emoji were corrupted by a
# file-encoding mismatch and split several strings across two lines.
# ---------------------------------------------------------------------------
Write-Host ""
Write-Host "==========================================="
Write-Host "ALL TESTS COMPLETED SUCCESSFULLY!"
Write-Host "==========================================="
Write-Host ""

Write-Host "Summary:"
Write-Host " [OK] Phase 1 Training (Document ID Assignment)"
Write-Host " [OK] Phase 2 Training (Ranking-based Refinement)"
Write-Host " [OK] Document ID Generation ($line_count IDs)"
Write-Host " [OK] Query Inference & Evaluation"
Write-Host ""

Write-Host "Results saved in: logs/test_glen_vault/"
Write-Host "Document IDs: $docid_file"
Write-Host ""

# Derive the displayed percentage from the threshold so the message cannot
# drift away from the configured value when the constant is edited.
$summaryThresholdPercent = [int][math]::Round($GPU_MEMORY_THRESHOLD * 100)

Write-Host "Memory Protection Summary:"
Write-Host " - GPU memory threshold: ${GPU_MEMORY_THRESHOLD} (${summaryThresholdPercent}%)"
Write-Host " - Check interval: ${GPU_CHECK_INTERVAL} steps"
Write-Host " - FP16 training enabled"
Write-Host " - Optimized batch sizes used"
Write-Host ""

Write-Host "The system is ready for full training on The Vault dataset!"
Write-Host " Use scripts/train_full_vault.ps1 for production training."