WCNegentropy
/

BitTransformerLM

+#!/usr/bin/env python3
+"""
+Sync BitTransformerLM repository to HuggingFace Hub for OS launch.
+Uploads all cleaned documentation and code with proper commit message.
+"""
+import os
+import logging
+from pathlib import Path
+from huggingface_hub import HfApi, login
+from typing import Optional, List
+# Setup logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
def get_files_to_sync(repo_root: Path) -> List[Path]:
    """Return the sorted, de-duplicated list of files to sync to HuggingFace.

    Candidates are collected by globbing ``repo_root`` with a fixed set of
    include patterns. A candidate is then dropped when any of its parent
    directories is an excluded directory, or when its name matches one of the
    excluded file-name patterns.

    Args:
        repo_root: Root directory of the repository to scan.

    Returns:
        Sorted list of file paths (all located under ``repo_root``).
    """
    # Files and directories to upload (excluding unnecessary files)
    include_patterns = [
        # Core code
        "bit_transformer/**/*.py",
        "tests/**/*.py",
        "scripts/**/*.py",  # Organized scripts
        "scripts/**/*.md",  # Script documentation
        # All root level files (filtered by type)
        "*.py",
        "*.md",
        "*.txt",
        "*.toml",
        "*.sh",
        "Dockerfile",
        # License files
        "LICENSE/**/*",
    ]
    # Directories whose entire contents are skipped, at any nesting depth.
    # These are checked per path component because ``PurePath.match`` treats
    # ``**`` like a single ``*`` and anchors on the right, so a pattern such
    # as "weights/**" would only catch direct children of the directory.
    excluded_dir_names = {
        "__pycache__",
        ".git",
        ".pytest_cache",
        ".ipynb_checkpoints",
        "weights",
        "checkpoints",
    }
    # File-name patterns to skip (matched against the final path component).
    excluded_file_patterns = [
        "*.pyc",
        "*.log",
        "*.pt",  # Model weights
        "*.zip",  # Backup files
        # Temporary or generated files
        "*-checkpoint.*",
        "*.tmp",
        "*.swp",
        # OS files
        ".DS_Store",
        "Thumbs.db",
    ]

    def is_excluded(relative_path: Path) -> bool:
        """Tell whether a repo-relative file path must not be uploaded."""
        if excluded_dir_names.intersection(relative_path.parts[:-1]):
            return True
        return any(relative_path.match(pattern) for pattern in excluded_file_patterns)

    # A set guards against the same file being yielded by two include patterns.
    selected = set()
    for pattern in include_patterns:
        for file_path in repo_root.glob(pattern):
            if not file_path.is_file():
                continue
            if is_excluded(file_path.relative_to(repo_root)):
                continue
            selected.add(file_path)
    return sorted(selected)
def preview_sync(repo_root: Optional[Path] = None) -> None:
    """Print the files that would be synced, without uploading anything.

    Prints the repository root, the number of selected files, one line per
    file with its size, and finally the total size in bytes and MB.

    Args:
        repo_root: Repository root to scan. When ``None``, the root is derived
            from this file's location (the directory two levels above the
            directory containing this script).
    """
    if repo_root is None:
        # resolve() first: with a relative __file__ the parent chain would
        # otherwise collapse to "." instead of climbing to the real root.
        repo_root = Path(__file__).resolve().parent.parent.parent
    files_to_upload = get_files_to_sync(repo_root)
    print(f"\n📁 Repository root: {repo_root}")
    print(f"📦 Files to sync: {len(files_to_upload)}")
    print("\n📋 File list:")
    # Stat each file once and accumulate, so the listed sizes and the total
    # always come from the same measurement.
    total_size = 0
    for file_path in files_to_upload:
        file_size = file_path.stat().st_size
        total_size += file_size
        print(f"  {file_path.relative_to(repo_root)} ({file_size:,} bytes)")
    print(f"\n📊 Total size: {total_size:,} bytes ({total_size/1024/1024:.2f} MB)")
# Long-form description attached to the commit created by the folder upload.
_COMMIT_DESCRIPTION = """
BitTransformerLM refined with ML engineering best practices:
✅ **Organized Codebase Structure**
- Cleaned up 30+ scattered scripts into organized directories
- Standardized imports and docstring formatting
- Consolidated configuration management
- Professional package metadata
✅ **Enhanced Developer Experience**
- Comprehensive CLI interface with standardized arguments
- Type-safe configuration system with presets
- Improved error handling and logging
- Better modular organization
✅ **Production Quality**
- PyProject.toml with proper dependencies and tooling
- Consistent code formatting and documentation
- Maintainable directory structure
- Ready for serious development and research
The bit-native transformer architecture with reversible layers, safety telemetry,
and distributed training capabilities is now properly packaged for research use.
""".strip()


def _upload_via_folder(
    api: "HfApi",
    repo_root: Path,
    files_to_upload: List[Path],
    repo_id: str,
    commit_message: str,
) -> int:
    """Stage the selected files in a temp directory and upload it in one call.

    Returns the number of files uploaded; any failure propagates as an exception.
    """
    import shutil
    import tempfile

    with tempfile.TemporaryDirectory() as temp_dir:
        staging_root = Path(temp_dir)
        # Copy only the selected files, preserving their repo-relative layout.
        for file_path in files_to_upload:
            dest_path = staging_root / file_path.relative_to(repo_root)
            dest_path.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(file_path, dest_path)
        logger.info(f"Prepared {len(files_to_upload)} files for upload")
        api.upload_folder(
            folder_path=str(staging_root),
            repo_id=repo_id,
            repo_type="model",
            commit_message=commit_message,
            commit_description=_COMMIT_DESCRIPTION,
            # NOTE(review): per the huggingface_hub docs this removes remote
            # files matching the pattern that are not part of this upload.
            # With "*" that presumably includes remote artifacts excluded from
            # the sync (e.g. weights) - confirm this is intended.
            delete_patterns=["*"],  # This ensures old files are removed
        )
    return len(files_to_upload)


def _upload_individually(
    api: "HfApi",
    repo_root: Path,
    files_to_upload: List[Path],
    repo_id: str,
    commit_message: str,
) -> int:
    """Upload files one by one, skipping failures; return how many succeeded."""
    total = len(files_to_upload)
    uploaded_count = 0
    for file_path in files_to_upload:
        # Computed outside the try so the warning below always names this file.
        relative_path = file_path.relative_to(repo_root)
        try:
            logger.info(f"Uploading: {relative_path}")
            api.upload_file(
                path_or_fileobj=str(file_path),
                # Repo paths use forward slashes regardless of the local OS.
                path_in_repo=relative_path.as_posix(),
                repo_id=repo_id,
                repo_type="model",
                commit_message=commit_message,
            )
        except Exception as e:
            logger.warning(f"Failed to upload {relative_path}: {e}")
            continue
        uploaded_count += 1
        if uploaded_count % 10 == 0:
            logger.info(f"Progress: {uploaded_count}/{total} files uploaded")
    return uploaded_count


def sync_repository_to_hf(
    repo_id: str = "WCNegentropy/BitTransformerLM",
    token: Optional[str] = None,
    commit_message: str = "🚀 Refined BitTransformerLM: Organized codebase with best practices",
    preview_only: bool = False,
) -> bool:
    """
    Sync the entire cleaned BitTransformerLM repository to HuggingFace Hub.

    The selected files are first uploaded as a single folder upload; if that
    fails, each file is uploaded individually as a fallback.

    Args:
        repo_id: HuggingFace repository ID
        token: HF token (defaults to HF_TOKEN environment variable)
        commit_message: Commit message for the upload
        preview_only: When True, only print the files that would be synced;
            no token is needed and nothing is uploaded.

    Returns:
        True when the preview completed or every selected file was uploaded;
        False on a missing token, an empty file selection, a partial upload,
        or any unexpected error.
    """
    try:
        # Get the repository root directory (go up from scripts/tools/).
        # resolve() keeps this correct when __file__ is a relative path.
        repo_root = Path(__file__).resolve().parent.parent.parent
        logger.info(f"Repository root: {repo_root}")

        # Previewing is purely local, so it runs before any authentication.
        if preview_only:
            preview_sync(repo_root)
            return True

        # Get token from environment if not provided
        token = token or os.environ.get('HF_TOKEN')
        if not token:
            logger.error("HF_TOKEN environment variable not set and no token provided")
            return False

        # Get files to sync using the centralized function
        files_to_upload = get_files_to_sync(repo_root)
        logger.info(f"Found {len(files_to_upload)} files to upload")
        if not files_to_upload:
            # An empty selection combined with delete_patterns=["*"] would
            # strip the remote repository, so refuse to continue.
            logger.error("No files matched the sync patterns; refusing to sync an empty repository")
            return False

        # Login to HuggingFace
        # NOTE(review): login() may persist the token on this machine;
        # HfApi(token=...) might be preferable - confirm before changing.
        login(token=token)
        api = HfApi()
        logger.info("Successfully authenticated with HuggingFace Hub")

        logger.info("Syncing entire repository structure to HuggingFace...")
        try:
            uploaded_count = _upload_via_folder(
                api, repo_root, files_to_upload, repo_id, commit_message
            )
        except Exception as e:
            logger.error(f"Failed to upload folder: {e}")
            logger.info("Falling back to individual file upload...")
            # The fallback only adds/updates files; it does not remove stale ones.
            uploaded_count = _upload_individually(
                api, repo_root, files_to_upload, repo_id, commit_message
            )

        total = len(files_to_upload)
        if uploaded_count < total:
            # Report partial uploads as failure instead of claiming success.
            logger.error(f"❌ Only uploaded {uploaded_count}/{total} files")
            return False

        logger.info(f"✅ Successfully uploaded {uploaded_count}/{total} files")
        logger.info(f"🎉 Repository synced to: https://huggingface.co/{repo_id}")
        return True
    except Exception as e:
        logger.error(f"❌ Failed to sync repository: {e}")
        return False
def create_release_info() -> Path:
    """Create a release information file for the OS launch.

    Writes ``RELEASE_INFO.md`` into the directory containing this script,
    overwriting any existing file of that name.

    Returns:
        Path of the written file.
    """
    release_info = """# BitTransformerLM v0.1.0 - Experimental Research Release
**Release Date:** August 2025
**Status:** Open Source Research Implementation
**License:** AGPLv3 + Commercial Licensing Available
## What's Included
This release provides a complete experimental framework for bit-native language modeling research:
- **Core Architecture:** 57 Python files implementing bit-native transformer with reversible layers
- **Safety Systems:** Real-time K/C/S telemetry and monitoring
- **Research Tools:** Interactive dashboard, distributed training, comprehensive testing
- **Documentation:** Professional model card, research status, and validation reports
## Important Notes
⚠️ **Experimental Status:** This is research code requiring rigorous baseline validation
⚠️ **Not Production Ready:** Needs extensive evaluation vs standard transformers
⚠️ **Research Use Only:** Intended for academic investigation and experimentation
## Licensing
- **Open Source:** AGPLv3 for research and open source use
- **Commercial:** Contact contact@wcnegentropy.com for commercial licensing
## Next Steps
The research community is invited to:
1. Conduct rigorous baseline comparisons vs standard transformers
2. Evaluate on established language modeling benchmarks
3. Validate (or refute) claimed memory efficiency benefits
4. Share findings openly to advance the field
**Research responsibly. Validate rigorously. Share openly.**
"""
    release_file = Path(__file__).parent / "RELEASE_INFO.md"
    # The text contains non-ASCII characters (emoji); writing with an explicit
    # UTF-8 encoding avoids UnicodeEncodeError where the locale default
    # encoding cannot represent them.
    release_file.write_text(release_info, encoding="utf-8")
    logger.info("Created RELEASE_INFO.md")
    return release_file
if __name__ == "__main__":
    # Command-line entry point: preview the selection, or create the release
    # notes and sync the repository to the HuggingFace Hub.
    import argparse

    parser = argparse.ArgumentParser(description="Sync BitTransformerLM to HuggingFace Hub")
    parser.add_argument("--preview", action="store_true", help="Preview files without uploading")
    parser.add_argument("--repo-id", default="WCNegentropy/BitTransformerLM", help="HuggingFace repo ID")
    parser.add_argument("--token", help="HuggingFace token (or set HF_TOKEN env var)")
    args = parser.parse_args()

    if args.preview:
        print("🔍 Preview mode: showing files that would be synced...")
        preview_sync()
        print("\n✅ Use --token YOUR_TOKEN to perform actual sync")
    else:
        # Create release info file
        create_release_info()
        # Sync to HuggingFace
        success = sync_repository_to_hf(
            repo_id=args.repo_id,
            token=args.token,
        )
        if success:
            print("\n🚀 BitTransformerLM Sync Complete!")
            print(f"📍 Repository: https://huggingface.co/{args.repo_id}")
            print("\nRefined codebase with ML engineering best practices is now live! ✨")
        else:
            print("\n❌ Sync failed. Please check logs and try again.")
            # Exit non-zero so shells and CI pipelines can detect the failure.
            raise SystemExit(1)