| |
| """ |
| Prepare dataset for LoRA training - Stack 2.9 Local Version |
| """ |
import argparse
import json
import os
from pathlib import Path

from datasets import Dataset
|
|
def prepare_dataset(input_path, output_dir, max_length=2048, test_split=0.1,
                    model_name="Qwen/Qwen2.5-Coder-7B", seed=None):
    """Load a JSONL file, format and tokenize it, and save train/eval splits.

    Each JSONL record is flattened into a single training string. Supported
    schemas, checked in order: ``prompt``+``completion``, ``input``+``output``,
    ``instruction``+``output``. Records matching none of these are skipped
    (and counted).

    Args:
        input_path: Path to the input .jsonl file.
        output_dir: Directory under which "train" and "eval" datasets are saved.
        max_length: Fixed tokenizer padding/truncation length.
        test_split: Fraction of examples held out for evaluation.
        model_name: HF tokenizer to load (default matches the original
            hard-coded training model).
        seed: Optional seed for a reproducible train/test split. ``None``
            (the default) keeps the original nondeterministic behavior.

    Raises:
        ValueError: If the input file yields no usable examples.
    """
    # Imported lazily so merely importing this module stays cheap and does
    # not require transformers to be installed.
    from transformers import AutoTokenizer

    print(f"Loading data from: {input_path}")

    with open(input_path, 'r', encoding='utf-8') as f:
        # Skip blank lines so a trailing newline doesn't raise JSONDecodeError.
        data = [json.loads(line) for line in f if line.strip()]

    print(f"Loaded {len(data)} examples")

    formatted_data = []
    skipped = 0
    for item in data:
        if 'prompt' in item and 'completion' in item:
            formatted_data.append({'text': item['prompt'] + item['completion']})
        elif 'input' in item and 'output' in item:
            formatted_data.append({'text': item['input'] + item['output']})
        elif 'instruction' in item and 'output' in item:
            # instruction/output pairs get a separating space.
            formatted_data.append({'text': item['instruction'] + ' ' + item['output']})
        else:
            skipped += 1  # record doesn't match any known schema

    print(f"Formatted {len(formatted_data)} examples ({skipped} skipped)")

    if not formatted_data:
        # train_test_split would fail on an empty dataset with an opaque
        # error; fail fast with a clear message instead.
        raise ValueError(f"No usable examples found in {input_path}")

    dataset = Dataset.from_list(formatted_data)

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    if tokenizer.pad_token is None:
        # Causal-LM tokenizers often ship without a pad token; reuse EOS.
        tokenizer.pad_token = tokenizer.eos_token

    def tokenize_function(examples):
        # Pad/truncate every example to max_length and return plain lists
        # (return_tensors=None) so the dataset can be saved to disk.
        return tokenizer(
            examples['text'],
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors=None,
        )

    dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=['text'],
    )

    split = dataset.train_test_split(test_size=test_split, seed=seed)
    train_data = split['train']
    eval_data = split['test']

    output_path = Path(output_dir)
    train_data.save_to_disk(str(output_path / "train"))
    eval_data.save_to_disk(str(output_path / "eval"))

    print(f"Saved to: {output_dir}")
    print(f"  Train: {len(train_data)} examples")
    print(f"  Eval: {len(eval_data)} examples")
|
|
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Prepare a JSONL dataset for LoRA training."
    )
    parser.add_argument(
        "--input",
        type=str,
        default="training-data/generated/synthetic_50k.jsonl",
        help="Input JSONL file; relative paths resolve against --base-dir.",
    )
    parser.add_argument(
        "--output",
        type=str,
        default="stack-2.9-training/data",
        help="Output directory for the train/eval datasets.",
    )
    parser.add_argument("--max-length", type=int, default=2048,
                        help="Tokenizer padding/truncation length.")
    parser.add_argument("--test-split", type=float, default=0.1,
                        help="Fraction of examples held out for evaluation.")
    # NOTE(review): the historical default below is a machine-specific
    # absolute path. It is kept as the default for backward compatibility,
    # but it can now be overridden on the command line or via the
    # STACK29_WORKSPACE environment variable instead of being hard-coded.
    parser.add_argument(
        "--base-dir",
        type=str,
        default=os.environ.get(
            "STACK29_WORKSPACE",
            "/Users/walidsobhi/.openclaw/workspace/stack-2.9",
        ),
        help="Base directory used to resolve a relative --input path.",
    )
    args = parser.parse_args()

    # Resolve a relative input path against the workspace base directory.
    input_path = Path(args.input)
    if not input_path.is_absolute():
        input_path = Path(args.base_dir) / input_path

    prepare_dataset(
        str(input_path),
        args.output,
        args.max_length,
        args.test_split,
    )