"""Test script to debug VeRL training"""

import sys
import os

# Put two local checkouts on the import path. Both are inserted at index 0,
# so the second one (verl) ends up ahead of the first when modules are
# resolved. The project-local import further down depends on these entries.
sys.path.insert(0, '/home/ubuntu/RLVR/TestTime-RLVR-v2')
sys.path.insert(0, '/home/ubuntu/RLVR/verl')

import pandas as pd
import numpy as np

# Output layout for this debug run: the dummy parquet files are written to
# <output_dir>/training_data, which is created if it does not exist yet.
output_dir = "./test_time_output_debug"
training_data_path = os.path.join(output_dir, "training_data")
os.makedirs(training_data_path, exist_ok=True)

# Write one single-row parquet file per task type, named <task_type>.parquet.
for task_type in ['induction', 'deduction', 'abduction']:
    data = {
        'prompts': ['test prompt ' + task_type],
        'responses': ['test response ' + task_type],
        'rewards': [1.0],
        'problem_id': ['test_id'],
        # One array of ten 1.0 values stored in the row's single cell.
        'token_level_scores': [np.array([1.0] * 10)]
    }
    df = pd.DataFrame(data)
    df.to_parquet(os.path.join(training_data_path, f'{task_type}.parquet'))

print(f"Created dummy training data in {training_data_path}")

# Imported here, not at the top of the file, so that the sys.path entries
# added earlier in this script are in place when `test.train_ttrlvr_azr` is
# resolved.
from test.train_ttrlvr_azr import main
import argparse

# Hand-built argument namespace passed straight to main() in place of
# command-line parsing. data_path and output_dir reuse the directories
# created above for the dummy parquet files.
args = argparse.Namespace(
    benchmark='mbpp',
    problem_id='Mbpp/2',
    rounds=1,
    config='test/configs/ttrlvr_azr_ppo_4gpu.yaml',
    step5_only=True,
    data_path=training_data_path,
    output_dir=output_dir,
    model='Qwen/Qwen2.5-7B',
    debug=True,
    batch_size=24,
    batch_epochs=1,
    num_programs=4,
    input_generation_rounds=3,
    parallel_batch_size=4,
    eval_rounds=5,
    skip_task_eval=False,
    save_every_round=False,
    save_round_interval=5,
    problems=10,
    resume=1,
    gpu=None
)

# Replace the real command line with just a script name before calling main().
# NOTE(review): presumably this keeps any argument parsing inside the training
# code from seeing this process's own CLI arguments -- confirm against main().
sys.argv = ['test_debug_verl.py']

main(args)