Transformers
English
Hindi
Sanskrit
sovereign-ai
ecological-intelligence
indian-llm
environmental-protection
ARAVALLI-1 / data /tokenizer_train.py
iamkoder001's picture
Create data/tokenizer_train.py
cd37343 verified
import os
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace, ByteLevel
from tokenizers.processors import TemplateProcessing
def train_sovereign_tokenizer(corpus_path, vocab_size=50257,
                              save_path="data/processed/aravalli_tokenizer.json"):
    """
    Train a custom byte-level BPE tokenizer optimized for Indic and
    Ecological technical terms.

    Args:
        corpus_path: Directory containing plain-text (.txt) training files.
        vocab_size: Target vocabulary size. Defaults to 50,257 to match
            model_config.yaml.
        save_path: Output path for the trained tokenizer JSON. The parent
            directory is created if it does not exist.

    Returns:
        The trained ``Tokenizer`` instance.

    Raises:
        FileNotFoundError: If ``corpus_path`` contains no .txt files.
    """
    # 1. Initialize an empty BPE model.
    # Byte-level pre-tokenization lets us represent any UTF-8 input
    # without falling back to the unknown token.
    tokenizer = Tokenizer(BPE(unk_token="<|unk|>"))

    # 2. Set the pre-tokenizer: treat the text as a sequence of bytes.
    tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)

    # 3. Configure the trainer, including special tokens for the GOEC
    # protocols (SN, NE, IPN).
    trainer = BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=2,
        special_tokens=[
            "<|endoftext|>",
            "<|unk|>",
            "<|pad|>",
            "CATEGORY_SN",
            "CATEGORY_NE",
            "CATEGORY_IPN",
        ],
        show_progress=True,
        # Seed with the full byte alphabet so every byte has a base token.
        initial_alphabet=ByteLevel.alphabet(),
    )

    # 4. Collect the corpus files, failing fast with a clear message when
    # none are found (tokenizer.train on an empty list fails opaquely).
    files = [
        os.path.join(corpus_path, f)
        for f in os.listdir(corpus_path)
        if f.endswith(".txt")
    ]
    if not files:
        raise FileNotFoundError(
            f"No .txt corpus files found in {corpus_path!r}; "
            "cannot train the tokenizer."
        )
    print(f"Commencing Tokenizer Training on {corpus_path}...")
    tokenizer.train(files, trainer)

    # 5. Post-processing: append <|endoftext|> to every encoded sequence.
    # Look up the actual token id instead of hard-coding 0, so the
    # template stays correct even if the special-token order changes.
    eot_id = tokenizer.token_to_id("<|endoftext|>")
    tokenizer.post_processor = TemplateProcessing(
        single="$A <|endoftext|>",
        special_tokens=[("<|endoftext|>", eot_id)],
    )

    # 6. Save the tokenizer, creating the output directory first — the
    # original crashed here on a fresh checkout where data/processed/
    # did not yet exist.
    out_dir = os.path.dirname(save_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    tokenizer.save(save_path)
    print(f"Sovereign Tokenizer Enacted and Saved to {save_path}")
    return tokenizer
if __name__ == "__main__":
    raw_dir = "data/raw/"
    # Create the raw-data directory idempotently so the user has a clear
    # place to drop corpus files on a fresh checkout.
    os.makedirs(raw_dir, exist_ok=True)
    # The original trained unconditionally — after creating an *empty*
    # directory — which guaranteed a confusing failure inside the
    # tokenizers library on first run. Only train when corpus files exist.
    if any(f.endswith(".txt") for f in os.listdir(raw_dir)):
        train_sovereign_tokenizer(raw_dir)
    else:
        print(f"No .txt corpus files found in {raw_dir}; "
              "add training text and rerun.")