#!/usr/bin/env python3
# Copyright (C) 2024 Louis Chua Bean Chong
#
# This file is part of OpenLLM.
#
# OpenLLM is dual-licensed:
# 1. For open source use: GNU General Public License v3.0
# 2. For commercial use: Commercial License (contact for details)
#
# See LICENSE and docs/LICENSES.md for full license information.
"""
Train a SentencePiece tokenizer from scratch using the prepared training data.
OVERVIEW:
This script trains a SentencePiece tokenizer on the cleaned text data from the SQuAD dataset
or any other text corpus. SentencePiece is a subword tokenizer that works well for language
models and supports multiple languages without requiring pre-tokenization.
FEATURES:
- Supports BPE (Byte Pair Encoding) and Unigram tokenization algorithms
- Configurable vocabulary size (recommended: 8k-64k for LLMs)
- Handles special tokens (BOS, EOS, UNK, PAD)
- Outputs tokenizer model files compatible with Hugging Face
- Comprehensive statistics and vocabulary analysis
TOKENIZER OUTPUT:
- tokenizer.model: SentencePiece model file
- tokenizer.vocab: Human-readable vocabulary file
- tokenizer_config.json: Configuration for Hugging Face integration
Usage:
python core/src/train_tokenizer.py --input data/clean/training_data.txt --vocab_size 32000
Advanced usage:
python core/src/train_tokenizer.py \\
--input data/clean/training_data.txt \\
--vocab_size 32000 \\
--model_type bpe \\
--output_dir data/tokenizer/ \\
--character_coverage 0.9995
Requirements:
pip install sentencepiece
Example setup:
```bash
# If not already in virtual environment
python -m venv venv
source venv/bin/activate # Linux/macOS
# .\\venv\\Scripts\\Activate.ps1 # Windows PowerShell
# Install SentencePiece
pip install sentencepiece
# Train tokenizer
python core/src/train_tokenizer.py --input data/clean/training_data.txt --vocab_size 32000
```
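
Quick check after training (a minimal sketch; the model path assumes the
default --output_dir of data/tokenizer/):
```python
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.load("data/tokenizer/tokenizer.model")
print(sp.encode("Hello, world!", out_type=str))
```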
"""
import argparse
import json
import os
import sys
import time
from typing import Any, Dict, List, Optional
try:
    import sentencepiece as spm
except ImportError:
    print("ERROR: SentencePiece not installed. Run: pip install sentencepiece")
    sys.exit(1)
def validate_input_file(input_path: str) -> None:
"""
Validate that the input training file exists and is readable.
Args:
input_path (str): Path to the training text file
Raises:
FileNotFoundError: If input file doesn't exist
ValueError: If input file is empty or unreadable
"""
if not os.path.exists(input_path):
raise FileNotFoundError(f"Training data file not found: {input_path}")
# Check file size and readability
file_size = os.path.getsize(input_path)
if file_size == 0:
raise ValueError(f"Training data file is empty: {input_path}")
    # Test that we can read the file and that it contains non-whitespace text.
    # A small sample is read rather than just the first line, so a leading
    # blank line does not trigger a false "empty file" error.
    try:
        with open(input_path, "r", encoding="utf-8") as f:
            sample = f.read(4096)
        if not sample.strip():
            raise ValueError(
                "Training data file appears to be empty or contains only whitespace"
            )
    except UnicodeDecodeError as e:
        raise ValueError(f"Cannot read training data file as UTF-8: {e}") from e
print(f"βœ“ Input file validated: {input_path} ({file_size:,} bytes)")
def count_training_sentences(input_path: str) -> int:
"""
Count the number of training sentences/lines in the input file.
Args:
input_path (str): Path to the training text file
Returns:
        int: Number of non-empty lines in the file
"""
print("Counting training sentences...")
with open(input_path, "r", encoding="utf-8") as f:
count = sum(1 for line in f if line.strip())
print(f"βœ“ Found {count:,} training sentences")
return count
def train_sentencepiece_tokenizer(
input_path: str,
output_dir: str,
vocab_size: int = 32000,
model_type: str = "bpe",
character_coverage: float = 0.9995,
max_sentence_length: int = 4192,
input_sentence_size: int = 10000000,
shuffle_input_sentence: bool = True,
) -> Dict[str, Any]:
"""
Train a SentencePiece tokenizer with the specified parameters.
Args:
input_path (str): Path to training text file
output_dir (str): Directory to save tokenizer files
vocab_size (int): Target vocabulary size (recommended: 8k-64k)
model_type (str): Algorithm type ('bpe' or 'unigram')
        character_coverage (float): Character coverage (SentencePiece suggests 1.0
            for small character sets like English, 0.9995 for rich character
            sets like Japanese or Chinese)
max_sentence_length (int): Maximum sentence length in characters
input_sentence_size (int): Maximum number of sentences to use for training
shuffle_input_sentence (bool): Whether to shuffle input sentences
Returns:
Dict[str, Any]: Training statistics and configuration
"""
# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)
# Define output paths
model_prefix = os.path.join(output_dir, "tokenizer")
# SentencePiece training parameters
train_params = [
f"--input={input_path}",
f"--model_prefix={model_prefix}",
f"--vocab_size={vocab_size}",
f"--model_type={model_type}",
f"--character_coverage={character_coverage}",
f"--max_sentence_length={max_sentence_length}",
f"--input_sentence_size={input_sentence_size}",
        f"--shuffle_input_sentence={str(shuffle_input_sentence).lower()}",  # SentencePiece expects lowercase true/false
# Special tokens for language modeling
"--pad_id=0", # Padding token
"--unk_id=1", # Unknown token
"--bos_id=2", # Beginning of sequence
"--eos_id=3", # End of sequence
# Additional useful parameters
"--split_by_unicode_script=true", # Better handling of mixed scripts
"--split_by_whitespace=true", # Split on whitespace
"--remove_extra_whitespaces=true", # Clean up whitespace
"--normalization_rule_name=identity", # Keep original text as-is
]
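    # NOTE: the special token IDs above (pad=0, unk=1, bos=2, eos=3) are
    # duplicated in save_huggingface_config(); keep the two in sync if you
    # change them.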
print("\nTraining SentencePiece tokenizer...")
print(f" Algorithm: {model_type.upper()}")
print(f" Vocabulary size: {vocab_size:,}")
print(f" Character coverage: {character_coverage}")
print(f" Output directory: {output_dir}")
print(f" Model files: {model_prefix}.model, {model_prefix}.vocab")
# Record training start time
start_time = time.time()
    # Train the tokenizer. The flags are joined into a single command-line
    # string, so input paths containing spaces are not supported here.
    try:
        spm.SentencePieceTrainer.train(" ".join(train_params))
        training_time = time.time() - start_time
        print(f"βœ“ Tokenizer training completed in {training_time:.1f} seconds")
    except Exception as e:
        raise RuntimeError(f"SentencePiece training failed: {e}") from e
# Verify output files were created
model_file = f"{model_prefix}.model"
vocab_file = f"{model_prefix}.vocab"
if not os.path.exists(model_file):
raise RuntimeError(f"Expected model file not created: {model_file}")
if not os.path.exists(vocab_file):
raise RuntimeError(f"Expected vocab file not created: {vocab_file}")
print(f"βœ“ Model file created: {model_file} ({os.path.getsize(model_file):,} bytes)")
print(f"βœ“ Vocab file created: {vocab_file} ({os.path.getsize(vocab_file):,} bytes)")
# Return training configuration and statistics
config = {
"model_type": model_type,
"vocab_size": vocab_size,
"character_coverage": character_coverage,
"max_sentence_length": max_sentence_length,
"training_time_seconds": training_time,
"input_file": input_path,
"output_directory": output_dir,
"model_file": model_file,
"vocab_file": vocab_file,
}
return config
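# Example call (hypothetical paths, mirroring the invocation in main() below):
#   config = train_sentencepiece_tokenizer(
#       input_path="data/clean/training_data.txt",
#       output_dir="data/tokenizer/",
#       vocab_size=32000,
#   )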
def test_tokenizer(model_path: str, test_sentences: Optional[List[str]] = None) -> None:
    """
    Test the trained tokenizer on sample sentences to verify it works correctly.

    Args:
        model_path (str): Path to the trained .model file
        test_sentences (Optional[List[str]]): Optional list of test sentences
    """
print("\nTesting trained tokenizer...")
# Load the trained tokenizer
sp = spm.SentencePieceProcessor()
sp.load(model_path)
# Default test sentences if none provided
if test_sentences is None:
test_sentences = [
"Hello, world! This is a test sentence.",
"The quick brown fox jumps over the lazy dog.",
"Machine learning and artificial intelligence are transforming technology.",
"SentencePiece tokenization works well for language models.",
]
print(f"Vocabulary size: {sp.vocab_size():,}")
print(
f"Special tokens: PAD={sp.pad_id()}, UNK={sp.unk_id()}, BOS={sp.bos_id()}, EOS={sp.eos_id()}"
)
print("\nTokenization examples:")
for i, sentence in enumerate(test_sentences, 1):
# Encode to token IDs and pieces
token_ids = sp.encode(sentence)
token_pieces = sp.encode(sentence, out_type=str)
print(f"\n{i}. Input: {sentence}")
print(f" Tokens ({len(token_pieces)}): {token_pieces}")
print(f" IDs: {token_ids[:10]}{'...' if len(token_ids) > 10 else ''}")
# Test decoding
decoded = sp.decode(token_ids)
print(f" Decoded: {decoded}")
# Verify round-trip encoding/decoding
if decoded.strip() != sentence.strip():
print(" ⚠️ Warning: Decode mismatch!")
print("βœ“ Tokenizer testing completed")
def save_huggingface_config(output_dir: str, config: Dict[str, Any]) -> None:
"""
Save a Hugging Face compatible tokenizer configuration file.
Args:
output_dir (str): Directory containing the tokenizer files
config (Dict[str, Any]): Tokenizer configuration
"""
# Create Hugging Face tokenizer config
hf_config = {
"tokenizer_class": "SentencePieceTokenizer",
"model_type": config["model_type"],
"vocab_size": config["vocab_size"],
"model_file": "tokenizer.model",
"special_tokens": {
"pad_token": "<pad>",
"unk_token": "<unk>",
"bos_token": "<s>",
"eos_token": "</s>",
},
"special_token_ids": {
"pad_token_id": 0,
"unk_token_id": 1,
"bos_token_id": 2,
"eos_token_id": 3,
},
}
config_path = os.path.join(output_dir, "tokenizer_config.json")
with open(config_path, "w", encoding="utf-8") as f:
json.dump(hf_config, f, indent=2, ensure_ascii=False)
print(f"βœ“ Hugging Face config saved: {config_path}")
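# A minimal sketch of consuming the saved config from downstream code
# (illustrative only, not called by this script):
#
#     with open("data/tokenizer/tokenizer_config.json", encoding="utf-8") as f:
#         cfg = json.load(f)
#     pad_id = cfg["special_token_ids"]["pad_token_id"]  # 0 by construction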
def main():
"""Main function to handle command line arguments and orchestrate tokenizer training."""
parser = argparse.ArgumentParser(
description="Train a SentencePiece tokenizer for language model training",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
  # Basic usage with SQuAD data
python core/src/train_tokenizer.py --input data/clean/training_data.txt --vocab_size 32000
# Advanced configuration
python core/src/train_tokenizer.py \\
--input data/clean/training_data.txt \\
--vocab_size 32000 \\
--model_type bpe \\
--output_dir data/tokenizer/ \\
--character_coverage 0.9995
""",
)
# Required arguments
parser.add_argument(
"--input",
required=True,
help="Path to training text file (e.g., data/clean/training_data.txt)",
)
# Optional arguments with sensible defaults
parser.add_argument(
"--vocab_size",
type=int,
default=32000,
help="Vocabulary size (default: 32000, recommended: 8k-64k)",
)
parser.add_argument(
"--model_type",
choices=["bpe", "unigram"],
default="bpe",
help="Tokenization algorithm (default: bpe)",
)
parser.add_argument(
"--output_dir",
default="data/tokenizer/",
help="Output directory for tokenizer files (default: data/tokenizer/)",
)
parser.add_argument(
"--character_coverage",
type=float,
default=0.9995,
        help="Character coverage (default: 0.9995; SentencePiece suggests 1.0 for small character sets like English, 0.9995 for Japanese/Chinese)",
)
parser.add_argument(
"--max_sentence_length",
type=int,
default=4192,
help="Maximum sentence length in characters (default: 4192)",
)
parser.add_argument(
"--no_test", action="store_true", help="Skip tokenizer testing after training"
)
args = parser.parse_args()
print("πŸ”€ SentencePiece Tokenizer Training")
print("=" * 50)
try:
# Step 1: Validate input file
validate_input_file(args.input)
# Step 2: Count training data
sentence_count = count_training_sentences(args.input)
# Step 3: Train tokenizer
config = train_sentencepiece_tokenizer(
input_path=args.input,
output_dir=args.output_dir,
vocab_size=args.vocab_size,
model_type=args.model_type,
character_coverage=args.character_coverage,
max_sentence_length=args.max_sentence_length,
)
# Step 4: Save Hugging Face compatible config
save_huggingface_config(args.output_dir, config)
# Step 5: Test tokenizer (unless skipped)
if not args.no_test:
model_path = os.path.join(args.output_dir, "tokenizer.model")
test_tokenizer(model_path)
# Step 6: Print summary
print("\nπŸŽ‰ Tokenizer training completed successfully!")
print(f"πŸ“ Output directory: {args.output_dir}")
print(f"πŸ“Š Vocabulary size: {config['vocab_size']:,}")
print(f"⏱️ Training time: {config['training_time_seconds']:.1f}s")
print(f"πŸ“„ Training sentences: {sentence_count:,}")
print("\nFiles created:")
print(f" β€’ {config['model_file']} - SentencePiece model")
print(f" β€’ {config['vocab_file']} - Vocabulary file")
print(f" β€’ {os.path.join(args.output_dir, 'tokenizer_config.json')} - Hugging Face config")
print("\nTo use this tokenizer in your language model:")
print(" import sentencepiece as spm")
print(" sp = spm.SentencePieceProcessor()")
print(f" sp.load('{config['model_file']}')")
except Exception as e:
print(f"\n❌ Error: {e}")
        sys.exit(1)
if __name__ == "__main__":
main()