#!/usr/bin/env python3
# Copyright (C) 2024 Louis Chua Bean Chong
#
# This file is part of OpenLLM.
#
# OpenLLM is dual-licensed:
# 1. For open source use: GNU General Public License v3.0
# 2. For commercial use: Commercial License (contact for details)
#
# See LICENSE and docs/LICENSES.md for full license information.
"""
Train a SentencePiece tokenizer from scratch using the prepared training data.
OVERVIEW:
This script trains a SentencePiece tokenizer on the cleaned text data from the SQuAD dataset
or any other text corpus. SentencePiece is a subword tokenizer that works well for language
models and supports multiple languages without requiring pre-tokenization.
FEATURES:
- Supports BPE (Byte Pair Encoding) and Unigram tokenization algorithms
- Configurable vocabulary size (recommended: 8k-64k for LLMs)
- Handles special tokens (BOS, EOS, UNK, PAD)
- Outputs tokenizer model files compatible with Hugging Face
- Comprehensive statistics and vocabulary analysis
TOKENIZER OUTPUT:
- tokenizer.model: SentencePiece model file
- tokenizer.vocab: Human-readable vocabulary file
- tokenizer_config.json: Configuration for Hugging Face integration
Usage:
python core/src/train_tokenizer.py --input data/clean/training_data.txt --vocab_size 32000
Advanced usage:
python core/src/train_tokenizer.py \\
--input data/clean/training_data.txt \\
--vocab_size 32000 \\
--model_type bpe \\
--output_dir data/tokenizer/ \\
--character_coverage 0.9995
Requirements:
pip install sentencepiece
Example setup:
```bash
# If not already in virtual environment
python -m venv venv
source venv/bin/activate # Linux/macOS
# .\\venv\\Scripts\\Activate.ps1 # Windows PowerShell
# Install SentencePiece
pip install sentencepiece
# Train tokenizer
python core/src/train_tokenizer.py --input data/clean/training_data.txt --vocab_size 32000
```
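
Quick check after training (a minimal sketch; the model path assumes the
default --output_dir of data/tokenizer/):
```python
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.load("data/tokenizer/tokenizer.model")
print(sp.encode("Hello, world!", out_type=str))
```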
"""
import argparse
import json
import os
import sys
import time
from typing import Any, Dict, List, Optional
try:
    import sentencepiece as spm
except ImportError:
    print("ERROR: SentencePiece not installed. Run: pip install sentencepiece")
    sys.exit(1)
def validate_input_file(input_path: str) -> None:
"""
Validate that the input training file exists and is readable.
Args:
input_path (str): Path to the training text file
Raises:
FileNotFoundError: If input file doesn't exist
ValueError: If input file is empty or unreadable
"""
if not os.path.exists(input_path):
raise FileNotFoundError(f"Training data file not found: {input_path}")
# Check file size and readability
file_size = os.path.getsize(input_path)
if file_size == 0:
raise ValueError(f"Training data file is empty: {input_path}")
    # Test that we can read the file and that it contains non-whitespace text.
    # A small sample is read rather than just the first line, so a leading
    # blank line does not trigger a false "empty file" error.
    try:
        with open(input_path, "r", encoding="utf-8") as f:
            sample = f.read(4096)
        if not sample.strip():
            raise ValueError(
                "Training data file appears to be empty or contains only whitespace"
            )
    except UnicodeDecodeError as e:
        raise ValueError(f"Cannot read training data file as UTF-8: {e}") from e
print(f"βœ“ Input file validated: {input_path} ({file_size:,} bytes)")
def count_training_sentences(input_path: str) -> int:
"""
Count the number of training sentences/lines in the input file.
Args:
input_path (str): Path to the training text file
Returns:
        int: Number of non-empty lines in the file
"""
print("Counting training sentences...")
with open(input_path, "r", encoding="utf-8") as f:
count = sum(1 for line in f if line.strip())
print(f"βœ“ Found {count:,} training sentences")
return count
def train_sentencepiece_tokenizer(
input_path: str,
output_dir: str,
vocab_size: int = 32000,
model_type: str = "bpe",
character_coverage: float = 0.9995,
max_sentence_length: int = 4192,
input_sentence_size: int = 10000000,
shuffle_input_sentence: bool = True,
) -> Dict[str, Any]:
"""
Train a SentencePiece tokenizer with the specified parameters.
Args:
input_path (str): Path to training text file
output_dir (str): Directory to save tokenizer files
vocab_size (int): Target vocabulary size (recommended: 8k-64k)
model_type (str): Algorithm type ('bpe' or 'unigram')
        character_coverage (float): Character coverage (SentencePiece suggests 1.0
            for small character sets like English, 0.9995 for rich character
            sets like Japanese or Chinese)
max_sentence_length (int): Maximum sentence length in characters
input_sentence_size (int): Maximum number of sentences to use for training
shuffle_input_sentence (bool): Whether to shuffle input sentences
Returns:
Dict[str, Any]: Training statistics and configuration
"""
# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)
# Define output paths
model_prefix = os.path.join(output_dir, "tokenizer")
# SentencePiece training parameters
train_params = [
f"--input={input_path}",
f"--model_prefix={model_prefix}",
f"--vocab_size={vocab_size}",
f"--model_type={model_type}",
f"--character_coverage={character_coverage}",
f"--max_sentence_length={max_sentence_length}",
f"--input_sentence_size={input_sentence_size}",
        f"--shuffle_input_sentence={str(shuffle_input_sentence).lower()}",  # SentencePiece expects lowercase true/false
# Special tokens for language modeling
"--pad_id=0", # Padding token
"--unk_id=1", # Unknown token
"--bos_id=2", # Beginning of sequence
"--eos_id=3", # End of sequence
# Additional useful parameters
"--split_by_unicode_script=true", # Better handling of mixed scripts
"--split_by_whitespace=true", # Split on whitespace
"--remove_extra_whitespaces=true", # Clean up whitespace
"--normalization_rule_name=identity", # Keep original text as-is
]
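    # NOTE: the special token IDs above (pad=0, unk=1, bos=2, eos=3) are
    # duplicated in save_huggingface_config(); keep the two in sync if you
    # change them.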
print("\nTraining SentencePiece tokenizer...")
print(f" Algorithm: {model_type.upper()}")
print(f" Vocabulary size: {vocab_size:,}")
print(f" Character coverage: {character_coverage}")
print(f" Output directory: {output_dir}")
print(f" Model files: {model_prefix}.model, {model_prefix}.vocab")
# Record training start time
start_time = time.time()
    # Train the tokenizer. The flags are joined into a single command-line
    # string, so input paths containing spaces are not supported here.
    try:
        spm.SentencePieceTrainer.train(" ".join(train_params))
        training_time = time.time() - start_time
        print(f"βœ“ Tokenizer training completed in {training_time:.1f} seconds")
    except Exception as e:
        raise RuntimeError(f"SentencePiece training failed: {e}") from e
# Verify output files were created
model_file = f"{model_prefix}.model"
vocab_file = f"{model_prefix}.vocab"
if not os.path.exists(model_file):
raise RuntimeError(f"Expected model file not created: {model_file}")
if not os.path.exists(vocab_file):
raise RuntimeError(f"Expected vocab file not created: {vocab_file}")
print(f"βœ“ Model file created: {model_file} ({os.path.getsize(model_file):,} bytes)")
print(f"βœ“ Vocab file created: {vocab_file} ({os.path.getsize(vocab_file):,} bytes)")
# Return training configuration and statistics
config = {
"model_type": model_type,
"vocab_size": vocab_size,
"character_coverage": character_coverage,
"max_sentence_length": max_sentence_length,
"training_time_seconds": training_time,
"input_file": input_path,
"output_directory": output_dir,
"model_file": model_file,
"vocab_file": vocab_file,
}
return config
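# Example call (hypothetical paths, mirroring the invocation in main() below):
#   config = train_sentencepiece_tokenizer(
#       input_path="data/clean/training_data.txt",
#       output_dir="data/tokenizer/",
#       vocab_size=32000,
#   )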
def test_tokenizer(model_path: str, test_sentences: Optional[List[str]] = None) -> None:
    """
    Test the trained tokenizer on sample sentences to verify it works correctly.

    Args:
        model_path (str): Path to the trained .model file
        test_sentences (Optional[List[str]]): Optional list of test sentences
    """
print("\nTesting trained tokenizer...")
# Load the trained tokenizer
sp = spm.SentencePieceProcessor()
sp.load(model_path)
# Default test sentences if none provided
if test_sentences is None:
test_sentences = [
"Hello, world! This is a test sentence.",
"The quick brown fox jumps over the lazy dog.",
"Machine learning and artificial intelligence are transforming technology.",
"SentencePiece tokenization works well for language models.",
]
print(f"Vocabulary size: {sp.vocab_size():,}")
print(
f"Special tokens: PAD={sp.pad_id()}, UNK={sp.unk_id()}, BOS={sp.bos_id()}, EOS={sp.eos_id()}"
)
print("\nTokenization examples:")
for i, sentence in enumerate(test_sentences, 1):
# Encode to token IDs and pieces
token_ids = sp.encode(sentence)
token_pieces = sp.encode(sentence, out_type=str)
print(f"\n{i}. Input: {sentence}")
print(f" Tokens ({len(token_pieces)}): {token_pieces}")
print(f" IDs: {token_ids[:10]}{'...' if len(token_ids) > 10 else ''}")
# Test decoding
decoded = sp.decode(token_ids)
print(f" Decoded: {decoded}")
# Verify round-trip encoding/decoding
if decoded.strip() != sentence.strip():
print(" ⚠️ Warning: Decode mismatch!")
print("βœ“ Tokenizer testing completed")
def save_huggingface_config(output_dir: str, config: Dict[str, Any]) -> None:
"""
Save a Hugging Face compatible tokenizer configuration file.
Args:
output_dir (str): Directory containing the tokenizer files
config (Dict[str, Any]): Tokenizer configuration
"""
# Create Hugging Face tokenizer config
hf_config = {
"tokenizer_class": "SentencePieceTokenizer",
"model_type": config["model_type"],
"vocab_size": config["vocab_size"],
"model_file": "tokenizer.model",
"special_tokens": {
"pad_token": "<pad>",
"unk_token": "<unk>",
"bos_token": "<s>",
"eos_token": "</s>",
},
"special_token_ids": {
"pad_token_id": 0,
"unk_token_id": 1,
"bos_token_id": 2,
"eos_token_id": 3,
},
}
config_path = os.path.join(output_dir, "tokenizer_config.json")
with open(config_path, "w", encoding="utf-8") as f:
json.dump(hf_config, f, indent=2, ensure_ascii=False)
print(f"βœ“ Hugging Face config saved: {config_path}")
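# A minimal sketch of consuming the saved config from downstream code
# (illustrative only, not called by this script):
#
#     with open("data/tokenizer/tokenizer_config.json", encoding="utf-8") as f:
#         cfg = json.load(f)
#     pad_id = cfg["special_token_ids"]["pad_token_id"]  # 0 by construction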
def main():
"""Main function to handle command line arguments and orchestrate tokenizer training."""
parser = argparse.ArgumentParser(
description="Train a SentencePiece tokenizer for language model training",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
  # Basic usage with SQuAD data
python core/src/train_tokenizer.py --input data/clean/training_data.txt --vocab_size 32000
# Advanced configuration
python core/src/train_tokenizer.py \\
--input data/clean/training_data.txt \\
--vocab_size 32000 \\
--model_type bpe \\
--output_dir data/tokenizer/ \\
--character_coverage 0.9995
""",
)
# Required arguments
parser.add_argument(
"--input",
required=True,
help="Path to training text file (e.g., data/clean/training_data.txt)",
)
# Optional arguments with sensible defaults
parser.add_argument(
"--vocab_size",
type=int,
default=32000,
help="Vocabulary size (default: 32000, recommended: 8k-64k)",
)
parser.add_argument(
"--model_type",
choices=["bpe", "unigram"],
default="bpe",
help="Tokenization algorithm (default: bpe)",
)
parser.add_argument(
"--output_dir",
default="data/tokenizer/",
help="Output directory for tokenizer files (default: data/tokenizer/)",
)
parser.add_argument(
"--character_coverage",
type=float,
default=0.9995,
        help="Character coverage (default: 0.9995; SentencePiece suggests 1.0 for small character sets like English, 0.9995 for Japanese/Chinese)",
)
parser.add_argument(
"--max_sentence_length",
type=int,
default=4192,
help="Maximum sentence length in characters (default: 4192)",
)
parser.add_argument(
"--no_test", action="store_true", help="Skip tokenizer testing after training"
)
args = parser.parse_args()
print("πŸ”€ SentencePiece Tokenizer Training")
print("=" * 50)
try:
# Step 1: Validate input file
validate_input_file(args.input)
# Step 2: Count training data
sentence_count = count_training_sentences(args.input)
# Step 3: Train tokenizer
config = train_sentencepiece_tokenizer(
input_path=args.input,
output_dir=args.output_dir,
vocab_size=args.vocab_size,
model_type=args.model_type,
character_coverage=args.character_coverage,
max_sentence_length=args.max_sentence_length,
)
# Step 4: Save Hugging Face compatible config
save_huggingface_config(args.output_dir, config)
# Step 5: Test tokenizer (unless skipped)
if not args.no_test:
model_path = os.path.join(args.output_dir, "tokenizer.model")
test_tokenizer(model_path)
# Step 6: Print summary
print("\nπŸŽ‰ Tokenizer training completed successfully!")
print(f"πŸ“ Output directory: {args.output_dir}")
print(f"πŸ“Š Vocabulary size: {config['vocab_size']:,}")
print(f"⏱️ Training time: {config['training_time_seconds']:.1f}s")
print(f"πŸ“„ Training sentences: {sentence_count:,}")
print("\nFiles created:")
print(f" β€’ {config['model_file']} - SentencePiece model")
print(f" β€’ {config['vocab_file']} - Vocabulary file")
print(f" β€’ {os.path.join(args.output_dir, 'tokenizer_config.json')} - Hugging Face config")
print("\nTo use this tokenizer in your language model:")
print(" import sentencepiece as spm")
print(" sp = spm.SentencePieceProcessor()")
print(f" sp.load('{config['model_file']}')")
except Exception as e:
print(f"\n❌ Error: {e}")
        sys.exit(1)
if __name__ == "__main__":
main()