|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" |
|
Train a SentencePiece tokenizer from scratch using the prepared training data. |
|
|
|
OVERVIEW: |
|
This script trains a SentencePiece tokenizer on the cleaned text data from the SQuAD dataset
|
or any other text corpus. SentencePiece is a subword tokenizer that works well for language |
|
models and supports multiple languages without requiring pre-tokenization. |
|
|
|
FEATURES: |
|
- Supports BPE (Byte Pair Encoding) and Unigram tokenization algorithms |
|
- Configurable vocabulary size (recommended: 8k-64k for LLMs) |
|
- Handles special tokens (BOS, EOS, UNK, PAD); see the usage sketch after this list
|
- Outputs tokenizer model files compatible with Hugging Face |
|
- Comprehensive statistics and vocabulary analysis |
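
A hedged sketch of how the fixed special-token IDs this script assigns (PAD=0, UNK=1,
BOS=2, EOS=3) are typically consumed when preparing model inputs. The helper name
encode_for_model, the max_len value, and the model path are illustrative assumptions,
not part of this script:

```python
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.load("data/tokenizer/tokenizer.model")  # path assumes the default --output_dir


def encode_for_model(text: str, max_len: int = 128) -> list:
    # Illustrative helper: wrap the encoded IDs with BOS/EOS, truncate, then pad.
    ids = [sp.bos_id()] + sp.encode(text) + [sp.eos_id()]
    ids = ids[:max_len]
    return ids + [sp.pad_id()] * (max_len - len(ids))
```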
|
|
|
TOKENIZER OUTPUT (a loading sketch follows the file list):
|
- tokenizer.model: SentencePiece model file |
|
- tokenizer.vocab: Human-readable vocabulary file |
|
- tokenizer_config.json: Configuration for Hugging Face integration |
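
A hedged sketch of loading these files after training; the paths assume the default
--output_dir of data/tokenizer/:

```python
import sentencepiece as spm

# Load the trained model and round-trip a sentence.
sp = spm.SentencePieceProcessor()
sp.load("data/tokenizer/tokenizer.model")
print(sp.vocab_size())                           # trained vocabulary size
ids = sp.encode("Hello, world!")                 # token IDs
print(sp.encode("Hello, world!", out_type=str))  # subword pieces
print(sp.decode(ids))                            # back to text
```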
|
|
|
Usage: |
|
python core/src/train_tokenizer.py --input data/clean/training_data.txt --vocab_size 32000 |
|
|
|
Advanced usage: |
|
python core/src/train_tokenizer.py \\ |
|
--input data/clean/training_data.txt \\ |
|
--vocab_size 32000 \\ |
|
--model_type bpe \\ |
|
--output_dir data/tokenizer/ \\ |
|
--character_coverage 0.9995 |
|
|
|
Requirements: |
|
pip install sentencepiece |
|
|
|
Example setup: |
|
```bash |
|
# If not already in virtual environment |
|
python -m venv venv |
|
source venv/bin/activate # Linux/macOS |
|
# .\\venv\\Scripts\\Activate.ps1 # Windows PowerShell
|
|
|
# Install SentencePiece |
|
pip install sentencepiece |
|
|
|
# Train tokenizer |
|
python core/src/train_tokenizer.py --input data/clean/training_data.txt --vocab_size 32000 |
|
``` |
|
|
|
""" |
|
|
|
import argparse |
|
import json |
|
import os

import sys
|
import time |
|
from typing import Any, Dict, List, Optional
|
|
|
try: |
|
import sentencepiece as spm |
|
except ImportError: |
|
print("ERROR: SentencePiece not installed. Run: pip install sentencepiece") |
|
    sys.exit(1)
|
|
|
|
|
def validate_input_file(input_path: str) -> None: |
|
""" |
|
Validate that the input training file exists and is readable. |
|
|
|
Args: |
|
input_path (str): Path to the training text file |
|
|
|
Raises: |
|
FileNotFoundError: If input file doesn't exist |
|
ValueError: If input file is empty or unreadable |
|
""" |
|
if not os.path.exists(input_path): |
|
raise FileNotFoundError(f"Training data file not found: {input_path}") |
|
|
|
|
|
file_size = os.path.getsize(input_path) |
|
if file_size == 0: |
|
raise ValueError(f"Training data file is empty: {input_path}") |
|
|
|
|
|
try: |
|
with open(input_path, "r", encoding="utf-8") as f: |
|
first_line = f.readline() |
|
if not first_line.strip(): |
|
raise ValueError( |
|
"Training data file appears to be empty or contains only whitespace" |
|
) |
|
except UnicodeDecodeError as e: |
|
raise ValueError(f"Cannot read training data file as UTF-8: {e}") |
|
|
|
print(f"β Input file validated: {input_path} ({file_size:,} bytes)") |
|
|
|
|
|
def count_training_sentences(input_path: str) -> int: |
|
""" |
|
Count the number of training sentences/lines in the input file. |
|
|
|
Args: |
|
input_path (str): Path to the training text file |
|
|
|
Returns: |
|
        int: Number of non-empty lines in the file
|
""" |
|
print("Counting training sentences...") |
|
with open(input_path, "r", encoding="utf-8") as f: |
|
count = sum(1 for line in f if line.strip()) |
|
print(f"β Found {count:,} training sentences") |
|
return count |
|
|
|
|
|
def train_sentencepiece_tokenizer( |
|
input_path: str, |
|
output_dir: str, |
|
vocab_size: int = 32000, |
|
model_type: str = "bpe", |
|
character_coverage: float = 0.9995, |
|
max_sentence_length: int = 4192, |
|
input_sentence_size: int = 10000000, |
|
shuffle_input_sentence: bool = True, |
|
) -> Dict[str, Any]: |
|
""" |
|
Train a SentencePiece tokenizer with the specified parameters. |
|
|
|
Args: |
|
input_path (str): Path to training text file |
|
output_dir (str): Directory to save tokenizer files |
|
vocab_size (int): Target vocabulary size (recommended: 8k-64k) |
|
model_type (str): Algorithm type ('bpe' or 'unigram') |
|
        character_coverage (float): Character coverage; 1.0 suits small character
            sets such as English, 0.9995 suits rich sets such as Japanese or Chinese
|
        max_sentence_length (int): Maximum sentence length in bytes
|
input_sentence_size (int): Maximum number of sentences to use for training |
|
shuffle_input_sentence (bool): Whether to shuffle input sentences |
|
|
|
Returns: |
|
Dict[str, Any]: Training statistics and configuration |
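
    Example:
        A minimal sketch of a typical call; the paths are illustrative:

            config = train_sentencepiece_tokenizer(
                input_path="data/clean/training_data.txt",
                output_dir="data/tokenizer/",
                vocab_size=32000,
                model_type="bpe",
            )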
|
""" |
|
|
|
os.makedirs(output_dir, exist_ok=True) |
|
|
|
|
|
model_prefix = os.path.join(output_dir, "tokenizer") |
|
|
|
|
|
train_params = [ |
|
f"--input={input_path}", |
|
f"--model_prefix={model_prefix}", |
|
f"--vocab_size={vocab_size}", |
|
f"--model_type={model_type}", |
|
f"--character_coverage={character_coverage}", |
|
f"--max_sentence_length={max_sentence_length}", |
|
f"--input_sentence_size={input_sentence_size}", |
|
f"--shuffle_input_sentence={shuffle_input_sentence}", |
|
|
|
"--pad_id=0", |
|
"--unk_id=1", |
|
"--bos_id=2", |
|
"--eos_id=3", |
|
|
|
"--split_by_unicode_script=true", |
|
"--split_by_whitespace=true", |
|
"--remove_extra_whitespaces=true", |
|
"--normalization_rule_name=identity", |
|
] |
|
|
|
print("\nTraining SentencePiece tokenizer...") |
|
print(f" Algorithm: {model_type.upper()}") |
|
print(f" Vocabulary size: {vocab_size:,}") |
|
print(f" Character coverage: {character_coverage}") |
|
print(f" Output directory: {output_dir}") |
|
print(f" Model files: {model_prefix}.model, {model_prefix}.vocab") |
|
|
|
|
|
start_time = time.time() |
|
|
|
|
|
try: |
|
spm.SentencePieceTrainer.train(" ".join(train_params)) |
|
training_time = time.time() - start_time |
|
print(f"β Tokenizer training completed in {training_time:.1f} seconds") |
|
except Exception as e: |
|
raise RuntimeError(f"SentencePiece training failed: {e}") |
|
|
|
|
|
model_file = f"{model_prefix}.model" |
|
vocab_file = f"{model_prefix}.vocab" |
|
|
|
if not os.path.exists(model_file): |
|
raise RuntimeError(f"Expected model file not created: {model_file}") |
|
if not os.path.exists(vocab_file): |
|
raise RuntimeError(f"Expected vocab file not created: {vocab_file}") |
|
|
|
print(f"β Model file created: {model_file} ({os.path.getsize(model_file):,} bytes)") |
|
print(f"β Vocab file created: {vocab_file} ({os.path.getsize(vocab_file):,} bytes)") |
|
|
|
|
|
config = { |
|
"model_type": model_type, |
|
"vocab_size": vocab_size, |
|
"character_coverage": character_coverage, |
|
"max_sentence_length": max_sentence_length, |
|
"training_time_seconds": training_time, |
|
"input_file": input_path, |
|
"output_directory": output_dir, |
|
"model_file": model_file, |
|
"vocab_file": vocab_file, |
|
} |
|
|
|
return config |
|
|
|
|
|
def test_tokenizer(model_path: str, test_sentences: Optional[List[str]] = None) -> None:
|
""" |
|
Test the trained tokenizer on sample sentences to verify it works correctly. |
|
|
|
Args: |
|
model_path (str): Path to the trained .model file |
|
test_sentences (list): Optional list of test sentences |
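
    Example:
        Mirrors the call made in main(); the path assumes the default --output_dir:

            test_tokenizer("data/tokenizer/tokenizer.model")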
|
""" |
|
print("\nTesting trained tokenizer...") |
|
|
|
|
|
sp = spm.SentencePieceProcessor() |
|
sp.load(model_path) |
|
|
|
|
|
if test_sentences is None: |
|
test_sentences = [ |
|
"Hello, world! This is a test sentence.", |
|
"The quick brown fox jumps over the lazy dog.", |
|
"Machine learning and artificial intelligence are transforming technology.", |
|
"SentencePiece tokenization works well for language models.", |
|
] |
|
|
|
print(f"Vocabulary size: {sp.vocab_size():,}") |
|
print( |
|
f"Special tokens: PAD={sp.pad_id()}, UNK={sp.unk_id()}, BOS={sp.bos_id()}, EOS={sp.eos_id()}" |
|
) |
|
|
|
print("\nTokenization examples:") |
|
for i, sentence in enumerate(test_sentences, 1): |
|
|
|
token_ids = sp.encode(sentence) |
|
token_pieces = sp.encode(sentence, out_type=str) |
|
|
|
print(f"\n{i}. Input: {sentence}") |
|
print(f" Tokens ({len(token_pieces)}): {token_pieces}") |
|
print(f" IDs: {token_ids[:10]}{'...' if len(token_ids) > 10 else ''}") |
|
|
|
|
|
decoded = sp.decode(token_ids) |
|
print(f" Decoded: {decoded}") |
|
|
|
|
|
if decoded.strip() != sentence.strip(): |
|
print(" β οΈ Warning: Decode mismatch!") |
|
|
|
print("β Tokenizer testing completed") |
|
|
|
|
|
def save_huggingface_config(output_dir: str, config: Dict[str, Any]) -> None: |
|
""" |
|
Save a Hugging Face compatible tokenizer configuration file. |
|
|
|
Args: |
|
output_dir (str): Directory containing the tokenizer files |
|
config (Dict[str, Any]): Tokenizer configuration |
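
    Example:
        A sketch of reading the written file back in (path assumes the default
        --output_dir); the keys match the hf_config dict constructed below:

            with open("data/tokenizer/tokenizer_config.json", encoding="utf-8") as f:
                tok_cfg = json.load(f)
            pad_id = tok_cfg["special_token_ids"]["pad_token_id"]  # 0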
|
""" |
|
|
|
hf_config = { |
|
"tokenizer_class": "SentencePieceTokenizer", |
|
"model_type": config["model_type"], |
|
"vocab_size": config["vocab_size"], |
|
"model_file": "tokenizer.model", |
|
"special_tokens": { |
|
"pad_token": "<pad>", |
|
"unk_token": "<unk>", |
|
"bos_token": "<s>", |
|
"eos_token": "</s>", |
|
}, |
|
"special_token_ids": { |
|
"pad_token_id": 0, |
|
"unk_token_id": 1, |
|
"bos_token_id": 2, |
|
"eos_token_id": 3, |
|
}, |
|
} |
|
|
|
config_path = os.path.join(output_dir, "tokenizer_config.json") |
|
with open(config_path, "w", encoding="utf-8") as f: |
|
json.dump(hf_config, f, indent=2, ensure_ascii=False) |
|
|
|
print(f"β Hugging Face config saved: {config_path}") |
|
|
|
|
|
def main(): |
|
"""Main function to handle command line arguments and orchestrate tokenizer training.""" |
|
parser = argparse.ArgumentParser( |
|
description="Train a SentencePiece tokenizer for language model training", |
|
formatter_class=argparse.RawDescriptionHelpFormatter, |
|
epilog=""" |
|
Examples: |
|
  # Basic usage with SQuAD data
|
python core/src/train_tokenizer.py --input data/clean/training_data.txt --vocab_size 32000 |
|
|
|
# Advanced configuration |
|
python core/src/train_tokenizer.py \\ |
|
--input data/clean/training_data.txt \\ |
|
--vocab_size 32000 \\ |
|
--model_type bpe \\ |
|
--output_dir data/tokenizer/ \\ |
|
--character_coverage 0.9995 |
|
""", |
|
) |
|
|
|
|
|
parser.add_argument( |
|
"--input", |
|
required=True, |
|
help="Path to training text file (e.g., data/clean/training_data.txt)", |
|
) |
|
|
|
|
|
parser.add_argument( |
|
"--vocab_size", |
|
type=int, |
|
default=32000, |
|
help="Vocabulary size (default: 32000, recommended: 8k-64k)", |
|
) |
|
|
|
parser.add_argument( |
|
"--model_type", |
|
choices=["bpe", "unigram"], |
|
default="bpe", |
|
help="Tokenization algorithm (default: bpe)", |
|
) |
|
|
|
parser.add_argument( |
|
"--output_dir", |
|
default="data/tokenizer/", |
|
help="Output directory for tokenizer files (default: data/tokenizer/)", |
|
) |
|
|
|
parser.add_argument( |
|
"--character_coverage", |
|
type=float, |
|
default=0.9995, |
|
help="Character coverage (default: 0.9995 for English)", |
|
) |
|
|
|
parser.add_argument( |
|
"--max_sentence_length", |
|
type=int, |
|
default=4192, |
|
help="Maximum sentence length in characters (default: 4192)", |
|
) |
|
|
|
parser.add_argument( |
|
"--no_test", action="store_true", help="Skip tokenizer testing after training" |
|
) |
|
|
|
args = parser.parse_args() |
|
|
|
print("π€ SentencePiece Tokenizer Training") |
|
print("=" * 50) |
|
|
|
try: |
|
|
|
validate_input_file(args.input) |
|
|
|
|
|
sentence_count = count_training_sentences(args.input) |
|
|
|
|
|
config = train_sentencepiece_tokenizer( |
|
input_path=args.input, |
|
output_dir=args.output_dir, |
|
vocab_size=args.vocab_size, |
|
model_type=args.model_type, |
|
character_coverage=args.character_coverage, |
|
max_sentence_length=args.max_sentence_length, |
|
) |
|
|
|
|
|
save_huggingface_config(args.output_dir, config) |
|
|
|
|
|
if not args.no_test: |
|
model_path = os.path.join(args.output_dir, "tokenizer.model") |
|
test_tokenizer(model_path) |
|
|
|
|
|
print("\nπ Tokenizer training completed successfully!") |
|
print(f"π Output directory: {args.output_dir}") |
|
print(f"π Vocabulary size: {config['vocab_size']:,}") |
|
print(f"β±οΈ Training time: {config['training_time_seconds']:.1f}s") |
|
print(f"π Training sentences: {sentence_count:,}") |
|
|
|
print("\nFiles created:") |
|
print(f" β’ {config['model_file']} - SentencePiece model") |
|
print(f" β’ {config['vocab_file']} - Vocabulary file") |
|
print(f" β’ {os.path.join(args.output_dir, 'tokenizer_config.json')} - Hugging Face config") |
|
|
|
print("\nTo use this tokenizer in your language model:") |
|
print(" import sentencepiece as spm") |
|
print(" sp = spm.SentencePieceProcessor()") |
|
print(f" sp.load('{config['model_file']}')") |
|
|
|
except Exception as e: |
|
print(f"\nβ Error: {e}") |
|
        sys.exit(1)
|
|
|
|
|
if __name__ == "__main__": |
|
main() |
|
|