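"""Convert a Hugging Face tokenizer (vocab.json) into a SentencePiece model.

The script dumps the vocabulary to a temporary TSV file, trains a BPE
SentencePiece model from it, and then compares the original and converted
tokenizers on a short sample string. The paths at the bottom are examples
for a local phi-4 checkout and should be adjusted to your environment.
"""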
import json
import sentencepiece as spm
import os
from transformers import AutoTokenizer

def convert_to_sentencepiece(input_dir, output_dir):
    print(f"Converting tokenizer from {input_dir} to {output_dir}")

    # Make sure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Read vocab.json to get the vocabulary
    vocab_path = os.path.join(input_dir, "vocab.json")
    with open(vocab_path, 'r', encoding='utf-8') as f:
        vocab = json.load(f)

    # Create a temporary vocabulary file for SentencePiece
    temp_vocab_file = os.path.join(output_dir, "temp_vocab.txt")
    with open(temp_vocab_file, "w", encoding="utf-8") as f:
        # Sort by token id so the pieces keep their original order
        for token, _ in sorted(vocab.items(), key=lambda item: item[1]):
            # SentencePiece's TSV input format is: token<TAB>frequency
            f.write(f"{token}\t1\n")

    print("\nCreating SentencePiece model...")

    # Train the SentencePiece model using the vocabulary
    spm.SentencePieceTrainer.train(
        input=temp_vocab_file,
        model_prefix=os.path.join(output_dir, "tokenizer"),
        vocab_size=len(vocab),
        model_type='bpe',
        character_coverage=1.0,
        input_format='tsv',
        train_extremely_large_corpus=True,
        bos_id=-1,  # No beginning of sentence token
        eos_id=-1,  # No end of sentence token
        pad_id=-1,  # No padding token
        unk_id=0,   # Unknown token ID
        max_sentence_length=16384
    )

    # Clean up temporary file
    os.remove(temp_vocab_file)

    print("SentencePiece model created successfully")

    # Test the original tokenizer for comparison
    test_text = "Hello, world!"
    tokenizer = AutoTokenizer.from_pretrained(input_dir)
    tokens_orig = tokenizer.encode(test_text)

    # Test the SentencePiece model
    sp = spm.SentencePieceProcessor()
    sp.load(os.path.join(output_dir, "tokenizer.model"))
    tokens_sp = sp.encode_as_ids(test_text)

    print("\nTokenizer comparison test:")
    print(f"Original tokenizer: {tokens_orig}")
    print(f"SentencePiece tokenizer: {tokens_sp}")

if __name__ == "__main__":
    input_dir = "/mnt/llm/models/phi-4/model"  # adjust to whichever directory holds the tokenizer files (vocab.json etc.)
    output_dir = "/mnt/llm/models/phi-4/converted_tokenizer"

    convert_to_sentencepiece(input_dir, output_dir)