smcleod committed
Commit ea21fc1
1 Parent(s): e83cf6a

Update convert_tokenizer.py

Files changed (1)
  1. convert_tokenizer.py +107 -69
convert_tokenizer.py CHANGED
@@ -1,81 +1,119 @@
-# phi_create_tokenizer_model.py
-# This script converts tokenizer.json to tokenizer.model and vocab.json to vocab.txt
+from transformers import PreTrainedTokenizerFast, AutoTokenizer
 import json
 import os
 import shutil
 
-import sentencepiece as spm
-from transformers import AutoTokenizer
-
-def convert_to_sentencepiece(input_dir, output_dir):
+def safe_read_json(filepath):
+    try:
+        with open(filepath, 'r', encoding='utf-8') as f:
+            return json.load(f)
+    except FileNotFoundError:
+        print(f"File not found: {filepath}")
+        return None
+    except Exception as e:
+        print(f"Error reading {filepath}: {str(e)}")
+        return None
+
+def safe_copy_file(src, dst):
+    try:
+        shutil.copy2(src, dst)
+        print(f"Successfully copied {os.path.basename(src)}")
+        return True
+    except Exception as e:
+        print(f"Error copying {src}: {str(e)}")
+        return False
+
+def convert_phi_tokenizer(input_dir, output_dir):
     print(f"Converting tokenizer from {input_dir} to {output_dir}")
 
-    # Ensure a working tokenizer by copying all files
+    # Ensure output directory exists
     os.makedirs(output_dir, exist_ok=True)
-    for filename in os.listdir(input_dir):
-        if filename.startswith("tokenizer"):
-            shutil.copyfile(os.path.join(input_dir, filename), os.path.join(output_dir, filename))
-
-    # Read tokenizer.json to get the vocabulary and added_tokens
-    tokenizer_path = os.path.join(input_dir, "tokenizer.json")
-    with open(tokenizer_path, 'r', encoding='utf-8') as f:
-        tokenizer_data = json.load(f)
-
-    vocab = tokenizer_data["model"]["vocab"]
-    added_tokens = tokenizer_data["added_tokens"]
-
-    # Add the added tokens to the vocabulary with their correct IDs
-    for token_data in added_tokens:
-        vocab[token_data["content"]] = token_data["id"]
-
-    # Create a temporary vocabulary file for SentencePiece
-    temp_vocab_file = os.path.join(output_dir, "temp_vocab.txt")
-    with open(temp_vocab_file, "w", encoding="utf-8") as f:
-        # Sort by token ID to maintain correct order
-        for token, index in sorted(vocab.items(), key=lambda x: x[1]):
-            # SentencePiece expects tab-separated format: token<tab>score
-            f.write(f"{token}\t1.0\n")
-
-    print("\nCreating SentencePiece model...")
-
-    # Train the SentencePiece model using the vocabulary
-    spm.SentencePieceTrainer.train(
-        input=temp_vocab_file,
-        model_prefix=os.path.join(output_dir, "tokenizer"),
-        vocab_size=len(vocab),
-        model_type='bpe',
-        character_coverage=1.0,
-        input_format='tsv',
-        train_extremely_large_corpus=True,
-        bos_id=-1,  # No beginning of sentence token
-        eos_id=-1,  # No end of sentence token
-        pad_id=-1,  # No padding token
-        unk_id=0,  # Unknown token ID
-        max_sentence_length=131072,  # Increased to 128K tokens for RoPE
-        num_threads=16  # Adjust based on your system's capabilities
-    )
-
-    # Clean up temporary file
-    os.remove(temp_vocab_file)
-
-    print("SentencePiece model created successfully")
-
-    # Test the original tokenizer for comparison
-    test_text = "Hello, world!"
-    tokenizer = AutoTokenizer.from_pretrained(input_dir)
-    tokens_orig = tokenizer.encode(test_text)
-
-    # Test the SentencePiece model
-    sp = spm.SentencePieceProcessor()
-    sp.load(os.path.join(output_dir, "tokenizer.model"))
-    tokens_sp = sp.encode_as_ids(test_text)
 
-    print("\nTokenizer comparison test:")
-    print(f"Original tokenizer: {tokens_orig}")
-    print(f"SentencePiece tokenizer: {tokens_sp}")
+    # JSON files to process
+    json_files = [
+        'tokenizer.json',
+        'tokenizer_config.json',
+        'special_tokens_map.json',
+        'vocab.json',
+        'added_tokens.json'
+    ]
+
+    # Files to copy directly (no JSON parsing)
+    copy_files = [
+        'merges.txt'
+    ]
+
+    # List what files we actually find
+    print("\nFound files:")
+    for f in os.listdir(input_dir):
+        print(f"- {f}")
+
+    # Process JSON files
+    for filename in json_files:
+        input_path = os.path.join(input_dir, filename)
+        if os.path.exists(input_path):
+            print(f"\nProcessing {filename}")
+            content = safe_read_json(input_path)
+            if content is not None:
+                output_path = os.path.join(output_dir, filename)
+                with open(output_path, 'w', encoding='utf-8') as f:
+                    json.dump(content, f, indent=2)
+                print(f"Successfully copied {filename}")
+
+    # Copy non-JSON files directly
+    for filename in copy_files:
+        input_path = os.path.join(input_dir, filename)
+        if os.path.exists(input_path):
+            print(f"\nCopying {filename}")
+            safe_copy_file(input_path, os.path.join(output_dir, filename))
+
+    # Load and modify the tokenizer config
+    config_path = os.path.join(input_dir, 'tokenizer_config.json')
+    if os.path.exists(config_path):
+        print("\nProcessing tokenizer config")
+        config = safe_read_json(config_path)
+        if config is not None:
+            config.update({
+                'add_prefix_space': False,
+                'clean_up_tokenization_spaces': False,
+                'model_max_length': 16384,
+                'tokenizer_class': 'GPT2Tokenizer',
+                'bos_token': '<|endoftext|>',
+                'eos_token': '<|endoftext|>',
+                'pad_token': '<|endoftext|>'
+            })
+
+            # Save the modified config
+            output_config_path = os.path.join(output_dir, 'tokenizer_config.json')
+            with open(output_config_path, 'w') as f:
+                json.dump(config, f, indent=2)
+            print("Successfully updated config")
+
+    print("\nAttempting to test tokenizer...")
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(output_dir)
+        test_text = "Hello, world!"
+        tokens = tokenizer.encode(test_text)
+        decoded = tokenizer.decode(tokens)
+        print("Tokenizer test successful!")
+        print(f"Test text: {test_text}")
+        print(f"Encoded: {tokens}")
+        print(f"Decoded: {decoded}")
+
+        # check if they're the same
+        if test_text != decoded:
+            print("Decoded text does not match original text!")
+        else:
+            print("Decoded text matches original text!")
+        # save the tokenizer
+        tokenizer.save_pretrained(output_dir)
+        print(f"Tokenizer saved to {output_dir}")
+
+    except Exception as e:
+        print(f"Error testing tokenizer: {e}")
 
 if __name__ == "__main__":
-    input_dir = "/mnt/llm/models/phi-4/model"
+    input_dir = "/mnt/llm/models/phi-4/model"  # or "model" depending on which directory you want to use
     output_dir = "/mnt/llm/models/phi-4/converted_tokenizer"
 
-    convert_to_sentencepiece(input_dir, output_dir)
+    convert_phi_tokenizer(input_dir, output_dir)
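
For reference, a minimal sketch of loading the result after running the new script; the output path and the <|endoftext|> special-token mapping are assumed from the code above, and this simply repeats the round-trip check the script already performs:

from transformers import AutoTokenizer

# Path assumed from the script's output_dir; adjust for your setup.
tok = AutoTokenizer.from_pretrained("/mnt/llm/models/phi-4/converted_tokenizer")

# The updated tokenizer_config.json maps bos/eos/pad to <|endoftext|>.
print(tok.bos_token, tok.eos_token, tok.pad_token)

# Round-trip check, mirroring the test inside convert_phi_tokenizer().
ids = tok.encode("Hello, world!")
print(ids)
print(tok.decode(ids))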