# File size: 5,000 Bytes

from transformers import AutoTokenizer
import json
import os
import shutil

def safe_read_json(filepath):
    """Load and return the JSON document stored at ``filepath``.

    The file is opened as UTF-8 text. Failures are reported on stdout
    instead of being raised: a missing file and any other error each print
    their own message, and in both cases ``None`` is returned.
    """
    parsed = None
    try:
        with open(filepath, mode='r', encoding='utf-8') as handle:
            parsed = json.loads(handle.read())
    except FileNotFoundError:
        print(f"File not found: {filepath}")
    except Exception as err:
        # Best-effort helper: report the problem and fall through to None.
        print(f"Error reading {filepath}: {str(err)}")
    return parsed

def safe_copy_file(src, dst):
    """Copy ``src`` to ``dst`` with ``shutil.copy2`` and report the outcome.

    Returns ``True`` after a successful copy and ``False`` if anything in
    the attempt raised; the error is printed rather than propagated.
    """
    succeeded = False
    try:
        shutil.copy2(src, dst)
        print(f"Successfully copied {os.path.basename(src)}")
        succeeded = True
    except Exception as err:
        print(f"Error copying {src}: {str(err)}")
    return succeeded

def _copy_json_files(input_dir, output_dir, filenames):
    """Re-serialize each listed JSON file found in ``input_dir`` into ``output_dir``."""
    for filename in filenames:
        input_path = os.path.join(input_dir, filename)
        if not os.path.exists(input_path):
            continue
        print(f"\nProcessing {filename}")
        content = safe_read_json(input_path)
        if content is None:
            # safe_read_json has already printed why the file was unusable.
            continue
        output_path = os.path.join(output_dir, filename)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(content, f, indent=2)
        print(f"Successfully copied {filename}")


def _write_patched_config(input_dir, output_dir):
    """Write ``tokenizer_config.json`` to ``output_dir`` with fixed overrides applied."""
    config_path = os.path.join(input_dir, 'tokenizer_config.json')
    if not os.path.exists(config_path):
        return
    print("\nProcessing tokenizer config")
    config = safe_read_json(config_path)
    if config is None:
        return

    config.update({
        'add_prefix_space': False,
        'clean_up_tokenization_spaces': False,
        'model_max_length': 16384,
        'tokenizer_class': 'GPT2Tokenizer',
        'bos_token': '<|endoftext|>',
        'eos_token': '<|endoftext|>',
        'pad_token': '<|endoftext|>',
    })

    # This overwrites the plain copy of the config written earlier.
    output_config_path = os.path.join(output_dir, 'tokenizer_config.json')
    with open(output_config_path, 'w', encoding='utf-8') as f:
        json.dump(config, f, indent=2)
    print("Successfully updated config")


def _build_vocab(tokenizer_data):
    """Return ``model.vocab`` merged with ``added_tokens``, or ``None`` if it is not a dict."""
    model = tokenizer_data.get("model") if isinstance(tokenizer_data, dict) else None
    vocab = model.get("vocab") if isinstance(model, dict) else None
    if not isinstance(vocab, dict):
        return None

    # A null "added_tokens" entry is treated the same as a missing one.
    for token_data in tokenizer_data.get("added_tokens") or []:
        content = token_data["content"]
        # Entries already present in the model vocabulary keep their id.
        if content not in vocab:
            vocab[content] = token_data["id"]
    return vocab


def _smoke_test_tokenizer(output_dir):
    """Load the tokenizer from ``output_dir``, round-trip a sample, and re-save on success."""
    print("\nAttempting to test tokenizer...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(output_dir)
        test_text = "Hello, world!"
        tokens = tokenizer.encode(test_text)
        decoded = tokenizer.decode(tokens)
        print("Tokenizer test successful!")
        print(f"Test text: {test_text}")
        print(f"Encoded: {tokens}")
        print(f"Decoded: {decoded}")

        if test_text != decoded:
            print("Decoded text does not match original text!")
        else:
            print("Decoded text matches original text!")
            # Only a tokenizer that round-trips the sample is saved back.
            tokenizer.save_pretrained(output_dir)
            print(f"Tokenizer saved to {output_dir}")

    except Exception as e:
        # The test is best-effort: files written earlier are left in place.
        print(f"Error testing tokenizer: {e}")


def convert_phi_tokenizer(input_dir, output_dir):
    """Copy tokenizer files from ``input_dir`` to ``output_dir`` and adapt them.

    Steps, in order:

    1. Create ``output_dir`` if needed and print the contents of ``input_dir``.
    2. Re-serialize the known JSON files and copy ``merges.txt`` when present.
    3. Rewrite ``tokenizer_config.json`` with a fixed set of overrides
       (including ``tokenizer_class = 'GPT2Tokenizer'``).
    4. Build ``vocab.json`` from the ``model.vocab`` mapping in the copied
       ``tokenizer.json`` plus any ``added_tokens`` not already in it.
    5. Load the result with ``AutoTokenizer``, round-trip a sample string and,
       if it matches, save the tokenizer back into ``output_dir``.

    Progress and errors are printed. The function returns ``None`` in all
    cases; it stops early (after printing an error) when ``tokenizer.json``
    cannot be read from ``output_dir`` or has no dictionary ``model.vocab``.

    Args:
        input_dir: Directory holding the source tokenizer files.
        output_dir: Directory to write the converted files to.

    Raises:
        OSError: If ``input_dir`` cannot be listed or an output file cannot
            be written.
        KeyError: If an ``added_tokens`` entry lacks ``content`` or ``id``.
    """
    print(f"Converting tokenizer from {input_dir} to {output_dir}")

    os.makedirs(output_dir, exist_ok=True)

    # JSON files are parsed and re-written; the others are copied verbatim.
    json_files = [
        'tokenizer.json',
        'tokenizer_config.json',
        'special_tokens_map.json',
        'added_tokens.json',
    ]
    copy_files = [
        'merges.txt',
    ]

    print("\nFound files:")
    for name in os.listdir(input_dir):
        print(f"- {name}")

    _copy_json_files(input_dir, output_dir, json_files)

    for filename in copy_files:
        input_path = os.path.join(input_dir, filename)
        if os.path.exists(input_path):
            print(f"\nCopying {filename}")
            safe_copy_file(input_path, os.path.join(output_dir, filename))

    _write_patched_config(input_dir, output_dir)

    print("\nConstructing vocabulary...")
    # Read the copy in output_dir, so a failed copy above is detected here.
    tokenizer_path = os.path.join(output_dir, "tokenizer.json")
    tokenizer_data = safe_read_json(tokenizer_path)
    if tokenizer_data is None:
        print("Error: Unable to read tokenizer.json")
        return

    vocab = _build_vocab(tokenizer_data)
    if vocab is None:
        # Without a token -> id mapping there is nothing to write to vocab.json.
        print("Error: tokenizer.json has no dictionary 'model.vocab' section")
        return

    vocab_size = len(vocab)
    print(f"Vocabulary size: {vocab_size}")

    vocab_output_path = os.path.join(output_dir, "vocab.json")
    with open(vocab_output_path, 'w', encoding='utf-8') as f:
        json.dump(vocab, f, indent=2)
    print(f"Successfully saved vocabulary to {vocab_output_path}")

    _smoke_test_tokenizer(output_dir)

if __name__ == "__main__":
    # Imported here because argument parsing is only needed when run as a script.
    import argparse

    # Both paths are optional; with no arguments the script uses the same
    # locations that were previously hard-coded.
    parser = argparse.ArgumentParser(
        description="Copy and adapt tokenizer files from one directory to another."
    )
    parser.add_argument(
        "input_dir",
        nargs="?",
        default="/mnt/llm/models/phi-4/model",
        help="directory containing the source tokenizer files",
    )
    parser.add_argument(
        "output_dir",
        nargs="?",
        default="/mnt/llm/models/phi-4/converted_tokenizer",
        help="directory to write the converted tokenizer files to",
    )
    args = parser.parse_args()

    convert_phi_tokenizer(args.input_dir, args.output_dir)