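"""Convert a Phi-4 fast-tokenizer directory into files loadable as a slow
GPT2Tokenizer: copy the tokenizer JSON files, rewrite tokenizer_config.json,
build vocab.json from tokenizer.json, and round-trip test the result with
AutoTokenizer."""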
import json
import os
import shutil

from transformers import AutoTokenizer


def safe_read_json(filepath):
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"File not found: {filepath}")
        return None
    except Exception as e:
        print(f"Error reading {filepath}: {str(e)}")
        return None


def safe_copy_file(src, dst):
    try:
        shutil.copy2(src, dst)
        print(f"Successfully copied {os.path.basename(src)}")
        return True
    except Exception as e:
        print(f"Error copying {src}: {str(e)}")
        return False


def convert_phi_tokenizer(input_dir, output_dir):
    print(f"Converting tokenizer from {input_dir} to {output_dir}")
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)
    # JSON files to process
    json_files = [
        'tokenizer.json',
        'tokenizer_config.json',
        'special_tokens_map.json',
        'added_tokens.json'  # Moved added_tokens.json here
    ]
    # Files to copy directly (no JSON parsing)
    copy_files = [
        'merges.txt'
    ]
    # List what files we actually find
    print("\nFound files:")
    for f in os.listdir(input_dir):
        print(f"- {f}")
    # Process JSON files
    for filename in json_files:
        input_path = os.path.join(input_dir, filename)
        if os.path.exists(input_path):
            print(f"\nProcessing {filename}")
            content = safe_read_json(input_path)
            if content is not None:
                output_path = os.path.join(output_dir, filename)
                with open(output_path, 'w', encoding='utf-8') as f:
                    json.dump(content, f, indent=2)
                print(f"Successfully copied {filename}")
    # Copy non-JSON files directly
    for filename in copy_files:
        input_path = os.path.join(input_dir, filename)
        if os.path.exists(input_path):
            print(f"\nCopying {filename}")
            safe_copy_file(input_path, os.path.join(output_dir, filename))
    # Load and modify the tokenizer config
    config_path = os.path.join(input_dir, 'tokenizer_config.json')
    if os.path.exists(config_path):
        print("\nProcessing tokenizer config")
        config = safe_read_json(config_path)
        if config is not None:
            config.update({
                'add_prefix_space': False,
                'clean_up_tokenization_spaces': False,
                'model_max_length': 16384,
                'tokenizer_class': 'GPT2Tokenizer',  # Changed to GPT2Tokenizer
                'bos_token': '<|endoftext|>',
                'eos_token': '<|endoftext|>',
                'pad_token': '<|endoftext|>'
            })
            # Save the modified config
            output_config_path = os.path.join(output_dir, 'tokenizer_config.json')
            with open(output_config_path, 'w') as f:
                json.dump(config, f, indent=2)
            print("Successfully updated config")
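    # The slow GPT2Tokenizer loads its vocabulary from vocab.json and merges.txt
    # rather than tokenizer.json, so flatten the fast tokenizer's vocab
    # (including any added tokens) into a plain vocab.json.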
    # Construct the vocabulary with added tokens
    print("\nConstructing vocabulary...")
    tokenizer_path = os.path.join(output_dir, "tokenizer.json")
    tokenizer_data = safe_read_json(tokenizer_path)
    if tokenizer_data is None:
        print("Error: Unable to read tokenizer.json")
        return
    vocab = tokenizer_data["model"]["vocab"]
    added_tokens = tokenizer_data.get("added_tokens", [])
    for token_data in added_tokens:
        content = token_data["content"]
        if content not in vocab:
            vocab[content] = token_data["id"]
    vocab_size = len(vocab)
    print(f"Vocabulary size: {vocab_size}")
    # Save the vocabulary as vocab.json
    vocab_output_path = os.path.join(output_dir, "vocab.json")
    with open(vocab_output_path, 'w', encoding='utf-8') as f:
        json.dump(vocab, f, indent=2)
    print(f"Successfully saved vocabulary to {vocab_output_path}")
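    # Sanity check: load the converted files with AutoTokenizer and round-trip
    # a sample string to confirm encode/decode behaves as expected.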
    print("\nAttempting to test tokenizer...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(output_dir)
        test_text = "Hello, world!"
        tokens = tokenizer.encode(test_text)
        decoded = tokenizer.decode(tokens)
        print("Tokenizer test successful!")
        print(f"Test text: {test_text}")
        print(f"Encoded: {tokens}")
        print(f"Decoded: {decoded}")
        # check if they're the same
        if test_text != decoded:
            print("Decoded text does not match original text!")
        else:
            print("Decoded text matches original text!")
        # save the tokenizer
        tokenizer.save_pretrained(output_dir)
        print(f"Tokenizer saved to {output_dir}")
    except Exception as e:
        print(f"Error testing tokenizer: {e}")


if __name__ == "__main__":
    input_dir = "/mnt/llm/models/phi-4/model"  # or "model" depending on which directory you want to use
    output_dir = "/mnt/llm/models/phi-4/converted_tokenizer"
    convert_phi_tokenizer(input_dir, output_dir)