#!/usr/bin/env python3
# Copyright (C) 2024 Louis Chua Bean Chong
#
# This file is part of OpenLLM.
#
# OpenLLM is dual-licensed:
# 1. For open source use: GNU General Public License v3.0
# 2. For commercial use: Commercial License (contact for details)
#
# See LICENSE and docs/LICENSES.md for full license information.

"""
Train a SentencePiece tokenizer from scratch using the prepared training data.

OVERVIEW:
    This script trains a SentencePiece tokenizer on the cleaned text data from
    the SQuAD dataset or any other text corpus. SentencePiece is a subword
    tokenizer that works well for language models and supports multiple
    languages without requiring pre-tokenization.

FEATURES:
    - Supports BPE (Byte Pair Encoding) and Unigram tokenization algorithms
    - Configurable vocabulary size (recommended: 8k-64k for LLMs)
    - Handles special tokens (BOS, EOS, UNK, PAD)
    - Outputs tokenizer model files compatible with Hugging Face
    - Comprehensive statistics and vocabulary analysis

TOKENIZER OUTPUT:
    - tokenizer.model: SentencePiece model file
    - tokenizer.vocab: Human-readable vocabulary file
    - tokenizer_config.json: Configuration for Hugging Face integration

Usage:
    python core/src/train_tokenizer.py --input data/clean/training_data.txt --vocab_size 32000

Advanced usage:
    python core/src/train_tokenizer.py \\
        --input data/clean/training_data.txt \\
        --vocab_size 32000 \\
        --model_type bpe \\
        --output_dir data/tokenizer/ \\
        --character_coverage 0.9995

Requirements:
    pip install sentencepiece

Example setup:
    ```bash
    # If not already in a virtual environment
    python -m venv venv
    source venv/bin/activate          # Linux/macOS
    # .\\venv\\Scripts\\Activate.ps1  # Windows PowerShell

    # Install SentencePiece
    pip install sentencepiece

    # Train tokenizer
    python core/src/train_tokenizer.py --input data/clean/training_data.txt --vocab_size 32000
    ```
"""

import argparse
import json
import os
import sys
import time
from typing import Any, Dict, List, Optional

try:
    import sentencepiece as spm
except ImportError:
    print("ERROR: SentencePiece not installed. Run: pip install sentencepiece")
    sys.exit(1)


def validate_input_file(input_path: str) -> None:
    """
    Validate that the input training file exists and is readable.

    Args:
        input_path (str): Path to the training text file

    Raises:
        FileNotFoundError: If input file doesn't exist
        ValueError: If input file is empty or unreadable
    """
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"Training data file not found: {input_path}")

    # Check file size and readability
    file_size = os.path.getsize(input_path)
    if file_size == 0:
        raise ValueError(f"Training data file is empty: {input_path}")

    # Test that we can read the file
    try:
        with open(input_path, "r", encoding="utf-8") as f:
            first_line = f.readline()
            if not first_line.strip():
                raise ValueError(
                    "Training data file appears to be empty or contains only whitespace"
                )
    except UnicodeDecodeError as e:
        raise ValueError(f"Cannot read training data file as UTF-8: {e}")

    print(f"✓ Input file validated: {input_path} ({file_size:,} bytes)")

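# The training corpus is expected to be plain UTF-8 text with one sentence (or
# short passage) per line; blank lines are skipped by the counting step below.
# The helper below is an illustrative sketch only and is not called anywhere in
# this script: it writes a tiny toy corpus so the pipeline can be smoke-tested
# without the real SQuAD-derived data. The path is just an example, and a
# corpus this small would also need a much smaller --vocab_size than the
# 32000 default.
def write_toy_corpus(path: str = "data/clean/toy_corpus.txt") -> str:
    """Write a few sample sentences to `path` for a quick end-to-end test."""
    sentences = [
        "Hello, world! This is a test sentence.",
        "The quick brown fox jumps over the lazy dog.",
        "SentencePiece tokenization works well for language models.",
    ]
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.write("\n".join(sentences) + "\n")
    return path
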
def count_training_sentences(input_path: str) -> int:
    """
    Count the number of training sentences/lines in the input file.

    Args:
        input_path (str): Path to the training text file

    Returns:
        int: Number of non-empty lines in the file
    """
    print("Counting training sentences...")

    with open(input_path, "r", encoding="utf-8") as f:
        count = sum(1 for line in f if line.strip())

    print(f"✓ Found {count:,} training sentences")
    return count


def train_sentencepiece_tokenizer(
    input_path: str,
    output_dir: str,
    vocab_size: int = 32000,
    model_type: str = "bpe",
    character_coverage: float = 0.9995,
    max_sentence_length: int = 4192,
    input_sentence_size: int = 10000000,
    shuffle_input_sentence: bool = True,
) -> Dict[str, Any]:
    """
    Train a SentencePiece tokenizer with the specified parameters.

    Args:
        input_path (str): Path to training text file
        output_dir (str): Directory to save tokenizer files
        vocab_size (int): Target vocabulary size (recommended: 8k-64k)
        model_type (str): Algorithm type ('bpe' or 'unigram')
        character_coverage (float): Character coverage (0.9995 for English, 1.0 for Japanese)
        max_sentence_length (int): Maximum sentence length in characters
        input_sentence_size (int): Maximum number of sentences to use for training
        shuffle_input_sentence (bool): Whether to shuffle input sentences

    Returns:
        Dict[str, Any]: Training statistics and configuration
    """
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Define output paths
    model_prefix = os.path.join(output_dir, "tokenizer")

    # SentencePiece training parameters
    train_params = [
        f"--input={input_path}",
        f"--model_prefix={model_prefix}",
        f"--vocab_size={vocab_size}",
        f"--model_type={model_type}",
        f"--character_coverage={character_coverage}",
        f"--max_sentence_length={max_sentence_length}",
        f"--input_sentence_size={input_sentence_size}",
        f"--shuffle_input_sentence={str(shuffle_input_sentence).lower()}",
        # Special tokens for language modeling
        "--pad_id=0",   # Padding token
        "--unk_id=1",   # Unknown token
        "--bos_id=2",   # Beginning of sequence
        "--eos_id=3",   # End of sequence
        # Additional useful parameters
        "--split_by_unicode_script=true",   # Better handling of mixed scripts
        "--split_by_whitespace=true",       # Split on whitespace
        "--remove_extra_whitespaces=true",  # Clean up whitespace
        "--normalization_rule_name=identity",  # Keep original text as-is
    ]

    print("\nTraining SentencePiece tokenizer...")
    print(f"  Algorithm: {model_type.upper()}")
    print(f"  Vocabulary size: {vocab_size:,}")
    print(f"  Character coverage: {character_coverage}")
    print(f"  Output directory: {output_dir}")
    print(f"  Model files: {model_prefix}.model, {model_prefix}.vocab")

    # Record training start time
    start_time = time.time()

    # Train the tokenizer
    try:
        spm.SentencePieceTrainer.train(" ".join(train_params))
        training_time = time.time() - start_time
        print(f"✓ Tokenizer training completed in {training_time:.1f} seconds")
    except Exception as e:
        raise RuntimeError(f"SentencePiece training failed: {e}")

    # Verify output files were created
    model_file = f"{model_prefix}.model"
    vocab_file = f"{model_prefix}.vocab"

    if not os.path.exists(model_file):
        raise RuntimeError(f"Expected model file not created: {model_file}")
    if not os.path.exists(vocab_file):
        raise RuntimeError(f"Expected vocab file not created: {vocab_file}")

    print(f"✓ Model file created: {model_file} ({os.path.getsize(model_file):,} bytes)")
    print(f"✓ Vocab file created: {vocab_file} ({os.path.getsize(vocab_file):,} bytes)")

    # Return training configuration and statistics
    config = {
        "model_type": model_type,
        "vocab_size": vocab_size,
        "character_coverage": character_coverage,
        "max_sentence_length": max_sentence_length,
        "training_time_seconds": training_time,
        "input_file": input_path,
        "output_directory": output_dir,
        "model_file": model_file,
        "vocab_file": vocab_file,
    }

    return config

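# Note: the flag-string form above mirrors the SentencePiece command-line tool,
# but the Python bindings also accept the same parameters as keyword arguments,
# which avoids quoting/whitespace issues in file paths. The function below is an
# illustrative sketch of that alternative call style; it is not used by this
# script and covers only a subset of the parameters configured above.
def train_tokenizer_kwargs_sketch(
    input_path: str,
    model_prefix: str,
    vocab_size: int = 32000,
    model_type: str = "bpe",
) -> None:
    """Equivalent training call using keyword arguments instead of a flag string."""
    spm.SentencePieceTrainer.train(
        input=input_path,
        model_prefix=model_prefix,
        vocab_size=vocab_size,
        model_type=model_type,
        pad_id=0,
        unk_id=1,
        bos_id=2,
        eos_id=3,
    )
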
def test_tokenizer(model_path: str, test_sentences: Optional[List[str]] = None) -> None:
    """
    Test the trained tokenizer on sample sentences to verify it works correctly.

    Args:
        model_path (str): Path to the trained .model file
        test_sentences (list): Optional list of test sentences
    """
    print("\nTesting trained tokenizer...")

    # Load the trained tokenizer
    sp = spm.SentencePieceProcessor()
    sp.load(model_path)

    # Default test sentences if none provided
    if test_sentences is None:
        test_sentences = [
            "Hello, world! This is a test sentence.",
            "The quick brown fox jumps over the lazy dog.",
            "Machine learning and artificial intelligence are transforming technology.",
            "SentencePiece tokenization works well for language models.",
        ]

    print(f"Vocabulary size: {sp.vocab_size():,}")
    print(
        f"Special tokens: PAD={sp.pad_id()}, UNK={sp.unk_id()}, BOS={sp.bos_id()}, EOS={sp.eos_id()}"
    )

    print("\nTokenization examples:")
    for i, sentence in enumerate(test_sentences, 1):
        # Encode to token IDs and pieces
        token_ids = sp.encode(sentence)
        token_pieces = sp.encode(sentence, out_type=str)

        print(f"\n{i}. Input: {sentence}")
        print(f"   Tokens ({len(token_pieces)}): {token_pieces}")
        print(f"   IDs: {token_ids[:10]}{'...' if len(token_ids) > 10 else ''}")

        # Test decoding
        decoded = sp.decode(token_ids)
        print(f"   Decoded: {decoded}")

        # Verify round-trip encoding/decoding
        if decoded.strip() != sentence.strip():
            print("   ⚠️ Warning: Decode mismatch!")

    print("✓ Tokenizer testing completed")


def save_huggingface_config(output_dir: str, config: Dict[str, Any]) -> None:
    """
    Save a Hugging Face compatible tokenizer configuration file.

    Args:
        output_dir (str): Directory containing the tokenizer files
        config (Dict[str, Any]): Tokenizer configuration
    """
    # Create Hugging Face tokenizer config
    hf_config = {
        "tokenizer_class": "SentencePieceTokenizer",
        "model_type": config["model_type"],
        "vocab_size": config["vocab_size"],
        "model_file": "tokenizer.model",
        "special_tokens": {
            "pad_token": "<pad>",
            "unk_token": "<unk>",
            "bos_token": "<s>",
            "eos_token": "</s>",
        },
        "special_token_ids": {
            "pad_token_id": 0,
            "unk_token_id": 1,
            "bos_token_id": 2,
            "eos_token_id": 3,
        },
    }

    config_path = os.path.join(output_dir, "tokenizer_config.json")
    with open(config_path, "w", encoding="utf-8") as f:
        json.dump(hf_config, f, indent=2, ensure_ascii=False)

    print(f"✓ Hugging Face config saved: {config_path}")

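# The JSON written above is a lightweight description of the tokenizer rather
# than a full Hugging Face tokenizer artifact. The sketches below illustrate
# one way downstream code might consume it: read the config, load the
# SentencePiece model it points to, and wrap encoding with the configured
# BOS/EOS IDs. This is an assumption about downstream usage, not part of this
# script's pipeline, and neither function is called here.
def load_tokenizer_from_config_sketch(output_dir: str) -> spm.SentencePieceProcessor:
    """Load tokenizer.model as referenced by tokenizer_config.json."""
    config_path = os.path.join(output_dir, "tokenizer_config.json")
    with open(config_path, "r", encoding="utf-8") as f:
        cfg = json.load(f)

    sp = spm.SentencePieceProcessor()
    sp.load(os.path.join(output_dir, cfg["model_file"]))
    return sp


def encode_with_special_tokens_sketch(
    sp: spm.SentencePieceProcessor, text: str
) -> List[int]:
    """Encode `text` and wrap it with the BOS/EOS IDs configured during training."""
    return [sp.bos_id()] + sp.encode(text) + [sp.eos_id()]
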
def main():
    """Main function to handle command line arguments and orchestrate tokenizer training."""
    parser = argparse.ArgumentParser(
        description="Train a SentencePiece tokenizer for language model training",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Basic usage with SQuAD data
  python core/src/train_tokenizer.py --input data/clean/training_data.txt --vocab_size 32000

  # Advanced configuration
  python core/src/train_tokenizer.py \\
      --input data/clean/training_data.txt \\
      --vocab_size 32000 \\
      --model_type bpe \\
      --output_dir data/tokenizer/ \\
      --character_coverage 0.9995
        """,
    )

    # Required arguments
    parser.add_argument(
        "--input",
        required=True,
        help="Path to training text file (e.g., data/clean/training_data.txt)",
    )

    # Optional arguments with sensible defaults
    parser.add_argument(
        "--vocab_size",
        type=int,
        default=32000,
        help="Vocabulary size (default: 32000, recommended: 8k-64k)",
    )
    parser.add_argument(
        "--model_type",
        choices=["bpe", "unigram"],
        default="bpe",
        help="Tokenization algorithm (default: bpe)",
    )
    parser.add_argument(
        "--output_dir",
        default="data/tokenizer/",
        help="Output directory for tokenizer files (default: data/tokenizer/)",
    )
    parser.add_argument(
        "--character_coverage",
        type=float,
        default=0.9995,
        help="Character coverage (default: 0.9995 for English)",
    )
    parser.add_argument(
        "--max_sentence_length",
        type=int,
        default=4192,
        help="Maximum sentence length in characters (default: 4192)",
    )
    parser.add_argument(
        "--no_test", action="store_true", help="Skip tokenizer testing after training"
    )

    args = parser.parse_args()

    print("🔤 SentencePiece Tokenizer Training")
    print("=" * 50)

    try:
        # Step 1: Validate input file
        validate_input_file(args.input)

        # Step 2: Count training data
        sentence_count = count_training_sentences(args.input)

        # Step 3: Train tokenizer
        config = train_sentencepiece_tokenizer(
            input_path=args.input,
            output_dir=args.output_dir,
            vocab_size=args.vocab_size,
            model_type=args.model_type,
            character_coverage=args.character_coverage,
            max_sentence_length=args.max_sentence_length,
        )

        # Step 4: Save Hugging Face compatible config
        save_huggingface_config(args.output_dir, config)

        # Step 5: Test tokenizer (unless skipped)
        if not args.no_test:
            model_path = os.path.join(args.output_dir, "tokenizer.model")
            test_tokenizer(model_path)

        # Step 6: Print summary
        print("\n🎉 Tokenizer training completed successfully!")
        print(f"📁 Output directory: {args.output_dir}")
        print(f"📊 Vocabulary size: {config['vocab_size']:,}")
        print(f"⏱️ Training time: {config['training_time_seconds']:.1f}s")
        print(f"📄 Training sentences: {sentence_count:,}")
        print("\nFiles created:")
        print(f"  • {config['model_file']} - SentencePiece model")
        print(f"  • {config['vocab_file']} - Vocabulary file")
        print(f"  • {os.path.join(args.output_dir, 'tokenizer_config.json')} - Hugging Face config")
        print("\nTo use this tokenizer in your language model:")
        print("  import sentencepiece as spm")
        print("  sp = spm.SentencePieceProcessor()")
        print(f"  sp.load('{config['model_file']}')")

    except Exception as e:
        print(f"\n❌ Error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()