| import os |
| import json |
| import datasets |
| import argparse |
| from verl.utils.hdfs_io import copy, makedirs |
|
|
| |
| |
# Load the shared prompt template once at import time; every mapped example
# formats this same template.
# NOTE(review): hard-coded absolute path — consider making this configurable.
with open(
    "/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/prompt",
    "r",
    encoding="utf-8",  # be explicit: the platform default encoding may not be UTF-8
) as f:
    PROMPT_TEMPLATE = f.read()
|
|
def make_map_fn(split, data_source):
    """Build a ``datasets.map`` callback that reshapes one raw record into
    the verl RL-training schema (prompt / reward_model / extra_info).

    Args:
        split: Split label ("train" or "test") recorded in ``extra_info``.
        data_source: Identifier of the originating dataset.

    Returns:
        A function ``(example, idx) -> dict`` suitable for
        ``Dataset.map(..., with_indices=True)``.
    """

    def process_fn(example, idx):
        # Pop consumed fields so they are not duplicated in the output row.
        source_document = example.pop('fulltext')
        reference_summary = example.pop('summary')
        document_claims = example.pop('fulltext_subclaims', None)
        summary_claims = example.pop('summary_subclaims', None)

        # Fill the module-level template with this record's text.
        user_message = PROMPT_TEMPLATE.format(
            source_lang="English",
            gold_summary=reference_summary,
            full_text=source_document,
        )

        ground_truth = {
            "summary_subclaims": summary_claims,
            "fulltext_subclaims": document_claims,
        }

        return {
            "data_source": data_source,
            "prompt": [{"role": "user", "content": user_message}],
            "ability": "summarization",
            "reward_model": {"style": "rule", "ground_truth": ground_truth},
            "extra_info": {
                "split": split,
                "index": idx,
                # Fall back to the row index when the record carries no id.
                "original_id": example.get('id', idx),
                "fulltext_subclaims": document_claims,
                "summary_subclaims": summary_claims,
            },
        }

    return process_fn
|
|
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Convert a raw subclaim-annotated JSON dataset into "
                    "train/test parquet files in the verl RL-training schema."
    )
    parser.add_argument(
        '--input_path',
        default='/home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_multiclinsum_test_en_full.json',
        help='Raw JSON file: a list of records with fulltext/summary/subclaim fields.',
    )
    parser.add_argument(
        '--local_dir',
        default='/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset',
        help='Directory where train.parquet / test.parquet are written.',
    )
    # Generalization: the data-source tag is now a CLI flag; the default
    # preserves the previous hard-coded behavior.
    parser.add_argument(
        '--data_source',
        default='multiclinsum',
        help='Dataset identifier recorded in every output row.',
    )
    args = parser.parse_args()

    data_source = args.data_source

    # Explicit encoding: the platform default may not be UTF-8.
    with open(args.input_path, 'r', encoding='utf-8') as f:
        raw_data = json.load(f)

    dataset = datasets.Dataset.from_list(raw_data)

    # Fixed seed keeps the 95/5 train/test split reproducible across runs.
    dataset_split = dataset.train_test_split(test_size=0.05, seed=42, shuffle=True)

    processed_train = dataset_split['train'].map(
        function=make_map_fn('train', data_source),
        with_indices=True,
    )
    processed_test = dataset_split['test'].map(
        function=make_map_fn('test', data_source),
        with_indices=True,
    )

    os.makedirs(args.local_dir, exist_ok=True)

    train_output_path = os.path.join(args.local_dir, 'train.parquet')
    test_output_path = os.path.join(args.local_dir, 'test.parquet')
    processed_train.to_parquet(train_output_path)
    processed_test.to_parquet(test_output_path)

    # Plain string: the original used an f-string with no placeholders here.
    print("--- Dataset Preparation Complete ---")
    print(f"Train file saved to: {train_output_path}")
    print(f"Test file saved to: {test_output_path}")
    print(f"Total train records: {len(processed_train)}")
    print(f"Total test records: {len(processed_test)}")