WCNegentropy committed on
Commit
f4730af
·
verified ·
1 Parent(s): 47e4c3f

🚀 Refined BitTransformerLM: Organized codebase with best practices

Browse files
Files changed (1) hide show
  1. scripts/tools/create_dataset.py +61 -0
scripts/tools/create_dataset.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ BitTransformerLM Dataset Creation Script
4
+
5
+ Usage:
6
+ python create_dataset.py --token YOUR_HF_TOKEN --repo-id YOUR_REPO_NAME
7
+
8
+ This script creates a comprehensive dataset for BitTransformerLM training
9
+ and uploads it to HuggingFace Hub with proper metadata and organization.
10
+ """
11
+
12
+ import argparse
13
+ import sys
14
+ from pathlib import Path
15
+
16
+ # Add the bit_transformer module to path
17
+ sys.path.insert(0, str(Path(__file__).parent))
18
+
19
+ from bit_transformer.dataset_builder import create_bittransformerlm_dataset
20
+
21
+
22
def _build_parser():
    """Build the command-line parser for the dataset creation script."""
    parser = argparse.ArgumentParser(description="Create BitTransformerLM Dataset")
    parser.add_argument("--token", required=True, help="HuggingFace access token")
    parser.add_argument("--repo-id", default="BitTransformerLM", help="Dataset repository ID")
    # Private is the default. With store_true alone and default=True the value
    # could never become False, so --public writes False to the same dest.
    parser.add_argument(
        "--private",
        dest="private",
        action="store_true",
        default=True,
        help="Make dataset private (default)",
    )
    parser.add_argument(
        "--public",
        dest="private",
        action="store_false",
        help="Make dataset public",
    )
    parser.add_argument("--samples", type=int, default=25000, help="Total number of samples")
    return parser


def main(argv=None):
    """Parse the command line, build the dataset and upload it.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is what the script entry
            point relies on.

    Raises:
        SystemExit: With code 2 when argparse rejects the command line, and
            with code 1 when ``create_bittransformerlm_dataset`` raises.
    """
    args = _build_parser().parse_args(argv)

    print("🚀 Starting BitTransformerLM Dataset Creation")
    print(f"Repository: {args.repo_id}")
    print(f"Private: {args.private}")
    print(f"Target samples: {args.samples}")
    print("-" * 50)

    # NOTE(review): args.private and args.samples are echoed above but are not
    # forwarded to create_bittransformerlm_dataset. Its signature is not
    # visible from this script, so the call is left as-is -- confirm whether
    # the builder accepts these options and pass them through if it does.
    try:
        # Keep only the upload call inside the try so that unrelated failures
        # (e.g. a print error) are not reported as a token/permission problem.
        dataset_url = create_bittransformerlm_dataset(
            hf_token=args.token,
            repo_id=args.repo_id,
        )
    except Exception as e:  # top-level boundary: report the failure and exit non-zero
        print(f"\n❌ ERROR: {e}", file=sys.stderr)
        print("Please check your token and repository permissions.", file=sys.stderr)
        sys.exit(1)

    print("\n" + "=" * 50)
    print("🎉 SUCCESS! Dataset created and uploaded")
    # NOTE(review): the emoji below is a best-guess reconstruction; confirm
    # against the upstream file.
    print(f"📍 URL: {dataset_url}")
    print("=" * 50)

    print("\n📋 Next Steps:")
    print("1. View your dataset on HuggingFace Hub")
    print("2. Test loading with: `from datasets import load_dataset`")
    print("3. Integrate with BitTransformerLM training pipeline")
    print("4. Monitor dataset usage and performance metrics")


if __name__ == "__main__":
    main()