File size: 1,402 Bytes
d8dd7a1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
#!/usr/bin/env python3
"""
Sample Dataset Creation Script
Creates sample datasets for testing SmolLM3 fine-tuning
"""

import os
import json
import argparse
from data import create_sample_dataset

def main():
    """Parse CLI options, generate the sample dataset, and print a summary.

    Command-line options:
        --output_dir   Directory handed to ``create_sample_dataset``
                       (default: ``my_dataset``).
        --format       One of ``chat``, ``instruction``, ``user_assistant``
                       (default: ``chat``).
        --num_samples  Integer sample count (default: 100).

    After generation the function prints the returned output path, the
    expected ``train.json`` / ``validation.json`` locations inside it, and
    the first record of ``train.json`` as a preview.

    NOTE(review): ``--format`` and ``--num_samples`` are only echoed in the
    summary; they are not forwarded to ``create_sample_dataset``, so they
    presumably have no effect on the generated data. Confirm against the
    signature of ``data.create_sample_dataset`` before wiring them through.
    """
    parser = argparse.ArgumentParser(
        description='Create sample dataset for SmolLM3 fine-tuning'
    )
    parser.add_argument(
        '--output_dir',
        type=str,
        default='my_dataset',
        help='Output directory for the dataset',
    )
    parser.add_argument(
        '--format',
        type=str,
        default='chat',
        choices=['chat', 'instruction', 'user_assistant'],
        help='Dataset format',
    )
    parser.add_argument(
        '--num_samples',
        type=int,
        default=100,
        help='Number of samples to create',
    )

    args = parser.parse_args()

    # The return value is used below as the directory containing the files.
    output_path = create_sample_dataset(args.output_dir)

    train_path = os.path.join(output_path, 'train.json')
    validation_path = os.path.join(output_path, 'validation.json')

    print(f"Sample dataset created in: {output_path}")
    print(f"Format: {args.format}")
    print(f"Samples: {args.num_samples}")
    print("\nFiles created:")
    print(f"- {train_path}")
    print(f"- {validation_path}")

    # Explicit UTF-8 so decoding does not depend on the platform's locale
    # default (JSON text is UTF-8 by specification).
    with open(train_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Guard the preview: indexing an empty dataset would raise IndexError
    # after the files were already written successfully.
    if data:
        print("\nSample data:")
        print(json.dumps(data[0], indent=2))
    else:
        print("\nSample data: none (train.json contains no records)")

# Run the CLI only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()