WCNegentropy committed
Commit 119fd59 · verified · 1 Parent(s): 4a5ea0f

🚀 Refined BitTransformerLM: Organized codebase with best practices

Files changed (1)
  1. scripts/tools/sync_to_hf.py +303 -0
scripts/tools/sync_to_hf.py ADDED
@@ -0,0 +1,303 @@
#!/usr/bin/env python3
"""
Sync BitTransformerLM repository to HuggingFace Hub for OS launch.
Uploads all cleaned documentation and code with proper commit message.
"""

import os
import logging
from pathlib import Path
from huggingface_hub import HfApi, login
from typing import Optional, List

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def get_files_to_sync(repo_root: Path) -> List[Path]:
    """Get the exact list of files that will be synced to HuggingFace."""
    # Files and directories to upload (excluding unnecessary files)
    include_patterns = [
        # Core code
        "bit_transformer/**/*.py",
        "tests/**/*.py",
        "scripts/**/*.py",  # Organized scripts
        "scripts/**/*.md",  # Script documentation

        # All root level files (filtered by type)
        "*.py",
        "*.md",
        "*.txt",
        "*.toml",
        "*.sh",
        "Dockerfile",

        # License files
        "LICENSE/**/*",
    ]

    # Files to exclude
    exclude_patterns = [
        "__pycache__/**",
        "*.pyc",
        ".git/**",
        ".pytest_cache/**",
        ".ipynb_checkpoints/**",
        "weights/**",
        "checkpoints/**",
        "*.log",
        "*.pt",   # Model weights
        "*.zip",  # Backup files
        # Temporary or generated files
        "*-checkpoint.*",
        "*.tmp",
        "*.swp",
        # OS files
        ".DS_Store",
        "Thumbs.db",
    ]

    # Get all files to upload
    files_to_upload = []
    for pattern in include_patterns:
        for file_path in repo_root.glob(pattern):
            if file_path.is_file():
                # Check if file should be excluded
                relative_path = file_path.relative_to(repo_root)
                should_exclude = any(
                    relative_path.match(exclude)
                    for exclude in exclude_patterns
                )
                if not should_exclude:
                    files_to_upload.append(file_path)

    return sorted(files_to_upload)


def preview_sync(repo_root: Optional[Path] = None) -> None:
    """Preview what files will be synced without actually uploading."""
    if repo_root is None:
        repo_root = Path(__file__).parent.parent.parent

    files_to_upload = get_files_to_sync(repo_root)

    print(f"\n📁 Repository root: {repo_root}")
    print(f"📦 Files to sync: {len(files_to_upload)}")
    print("\n📋 File list:")

    for file_path in files_to_upload:
        relative_path = file_path.relative_to(repo_root)
        file_size = file_path.stat().st_size
        print(f"  {relative_path} ({file_size:,} bytes)")

    total_size = sum(f.stat().st_size for f in files_to_upload)
    print(f"\n📊 Total size: {total_size:,} bytes ({total_size/1024/1024:.2f} MB)")


def sync_repository_to_hf(
    repo_id: str = "WCNegentropy/BitTransformerLM",
    token: Optional[str] = None,
    commit_message: str = "🚀 Refined BitTransformerLM: Organized codebase with best practices",
    preview_only: bool = False
):
    """
    Sync the entire cleaned BitTransformerLM repository to HuggingFace Hub.

    Args:
        repo_id: HuggingFace repository ID
        token: HF token (defaults to HF_TOKEN environment variable)
        commit_message: Commit message for the upload
        preview_only: If True, only list the files that would be uploaded
    """

    # Get token from environment if not provided
    if token is None:
        token = os.environ.get('HF_TOKEN')
        if not token:
            logger.error("HF_TOKEN environment variable not set and no token provided")
            return False

    try:
        # Login to HuggingFace
        login(token=token)
        api = HfApi()
        logger.info("Successfully authenticated with HuggingFace Hub")

        # Get the repository root directory (go up from scripts/tools/)
        repo_root = Path(__file__).parent.parent.parent
        logger.info(f"Repository root: {repo_root}")

        # Get files to sync using the centralized function
        files_to_upload = get_files_to_sync(repo_root)
        logger.info(f"Found {len(files_to_upload)} files to upload")

        # If preview only, just show the files and return
        if preview_only:
            preview_sync(repo_root)
            return True

        # Use upload_folder for exact sync - this will mirror the entire directory
        logger.info("Syncing entire repository structure to HuggingFace...")

        try:
            # First, let's create a temporary directory with only the files we want
            import tempfile
            import shutil

            with tempfile.TemporaryDirectory() as temp_dir:
                temp_path = Path(temp_dir)

                # Copy all files we want to upload to temp directory
                for file_path in files_to_upload:
                    relative_path = file_path.relative_to(repo_root)
                    dest_path = temp_path / relative_path
                    dest_path.parent.mkdir(parents=True, exist_ok=True)
                    shutil.copy2(file_path, dest_path)

                logger.info(f"Prepared {len(files_to_upload)} files for upload")

                # Upload the entire folder structure - this ensures exact mirroring
                api.upload_folder(
                    folder_path=str(temp_path),
                    repo_id=repo_id,
                    repo_type="model",
                    commit_message=commit_message,
                    commit_description="""
BitTransformerLM refined with ML engineering best practices:

✅ **Organized Codebase Structure**
- Cleaned up 30+ scattered scripts into organized directories
- Standardized imports and docstring formatting
- Consolidated configuration management
- Professional package metadata

✅ **Enhanced Developer Experience**
- Comprehensive CLI interface with standardized arguments
- Type-safe configuration system with presets
- Improved error handling and logging
- Better modular organization

✅ **Production Quality**
- PyProject.toml with proper dependencies and tooling
- Consistent code formatting and documentation
- Maintainable directory structure
- Ready for serious development and research

The bit-native transformer architecture with reversible layers, safety telemetry,
and distributed training capabilities is now properly packaged for research use.
""".strip(),
                    delete_patterns=["*"]  # This ensures old files are removed
                )

            uploaded_count = len(files_to_upload)

        except Exception as e:
            logger.error(f"Failed to upload folder: {e}")
            logger.info("Falling back to individual file upload...")

            # Fallback to individual file upload
            uploaded_count = 0
            for file_path in files_to_upload:
                try:
                    relative_path = file_path.relative_to(repo_root)
                    logger.info(f"Uploading: {relative_path}")

                    api.upload_file(
                        path_or_fileobj=str(file_path),
                        path_in_repo=str(relative_path),
                        repo_id=repo_id,
                        repo_type="model",
                        commit_message=commit_message,
                    )

                    uploaded_count += 1
                    if uploaded_count % 10 == 0:
                        logger.info(f"Progress: {uploaded_count}/{len(files_to_upload)} files uploaded")

                except Exception as e:
                    logger.warning(f"Failed to upload {relative_path}: {e}")
                    continue

        logger.info(f"✅ Successfully uploaded {uploaded_count}/{len(files_to_upload)} files")
        logger.info(f"🎉 Repository synced to: https://huggingface.co/{repo_id}")

        return True

    except Exception as e:
        logger.error(f"❌ Failed to sync repository: {e}")
        return False

def create_release_info():
    """Create a release information file for the OS launch."""
    release_info = """# BitTransformerLM v0.1.0 - Experimental Research Release

**Release Date:** August 2025
**Status:** Open Source Research Implementation
**License:** AGPLv3 + Commercial Licensing Available

## What's Included

This release provides a complete experimental framework for bit-native language modeling research:

- **Core Architecture:** 57 Python files implementing bit-native transformer with reversible layers
- **Safety Systems:** Real-time K/C/S telemetry and monitoring
- **Research Tools:** Interactive dashboard, distributed training, comprehensive testing
- **Documentation:** Professional model card, research status, and validation reports

## Important Notes

⚠️ **Experimental Status:** This is research code requiring rigorous baseline validation
⚠️ **Not Production Ready:** Needs extensive evaluation vs standard transformers
⚠️ **Research Use Only:** Intended for academic investigation and experimentation

## Licensing

- **Open Source:** AGPLv3 for research and open source use
- **Commercial:** Contact contact@wcnegentropy.com for commercial licensing

## Next Steps

The research community is invited to:
1. Conduct rigorous baseline comparisons vs standard transformers
2. Evaluate on established language modeling benchmarks
3. Validate (or refute) claimed memory efficiency benefits
4. Share findings openly to advance the field

**Research responsibly. Validate rigorously. Share openly.**
"""

    release_file = Path(__file__).parent / "RELEASE_INFO.md"
    with open(release_file, 'w') as f:
        f.write(release_info)

    logger.info("Created RELEASE_INFO.md")
    return release_file

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Sync BitTransformerLM to HuggingFace Hub")
    parser.add_argument("--preview", action="store_true", help="Preview files without uploading")
    parser.add_argument("--repo-id", default="WCNegentropy/BitTransformerLM", help="HuggingFace repo ID")
    parser.add_argument("--token", help="HuggingFace token (or set HF_TOKEN env var)")
    args = parser.parse_args()

    if args.preview:
        print("🔍 Preview mode: showing files that would be synced...")
        preview_sync()
        print("\n✅ Use --token YOUR_TOKEN to perform actual sync")
    else:
        # Create release info file
        create_release_info()

        # Sync to HuggingFace
        success = sync_repository_to_hf(
            repo_id=args.repo_id,
            token=args.token
        )

        if success:
            print(f"\n🚀 BitTransformerLM Sync Complete!")
            print(f"📁 Repository: https://huggingface.co/{args.repo_id}")
            print("\nRefined codebase with ML engineering best practices is now live! ✨")
        else:
            print("\n❌ Sync failed. Please check logs and try again.")
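
As a usage note (not part of the committed file): the sketch below shows how the script is typically driven, assuming it is run from the repository checkout with huggingface_hub installed and HF_TOKEN exported or passed via --token.

# Dry run: list the exact files and total size that would be synced, with no upload.
#   python scripts/tools/sync_to_hf.py --preview
#
# Real sync: writes RELEASE_INFO.md next to the script, then mirrors the file set to the Hub.
#   python scripts/tools/sync_to_hf.py --repo-id WCNegentropy/BitTransformerLM --token $HF_TOKEN

# The same flow is available programmatically (the import path is an assumption;
# it requires scripts/tools to be on sys.path):
from pathlib import Path
from sync_to_hf import preview_sync, sync_repository_to_hf

repo_root = Path(".").resolve()   # repository checkout root (assumption)
preview_sync(repo_root)           # prints the file list and total size only
# sync_repository_to_hf(repo_id="WCNegentropy/BitTransformerLM")  # performs the actual upload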