smcleod
/

phi-4

@@ -1,81 +1,119 @@
-# phi_create_tokenizer_model.py
-# This script builds a SentencePiece tokenizer.model from the vocabulary and added tokens in tokenizer.json
 import json
 import os
 import shutil
-import sentencepiece as spm
-from transformers import AutoTokenizer
-def convert_to_sentencepiece(input_dir, output_dir):  # trains output_dir/tokenizer.model from input_dir/tokenizer.json
     print(f"Converting tokenizer from {input_dir} to {output_dir}")
-    # Copy every file whose name starts with "tokenizer" from input_dir so output_dir also holds the original tokenizer files
     os.makedirs(output_dir, exist_ok=True)
-    for filename in os.listdir(input_dir):
-        if filename.startswith("tokenizer"):
-            shutil.copyfile(os.path.join(input_dir, filename), os.path.join(output_dir, filename))
-    # Load tokenizer.json and pull out the base vocabulary (model.vocab) and the added_tokens list
-    tokenizer_path = os.path.join(input_dir, "tokenizer.json")
-    with open(tokenizer_path, 'r', encoding='utf-8') as f:
-        tokenizer_data = json.load(f)
-    vocab = tokenizer_data["model"]["vocab"]
-    added_tokens = tokenizer_data["added_tokens"]
-    # Merge the added tokens into the vocabulary under their declared IDs (an existing entry with the same content is overwritten)
-    for token_data in added_tokens:
-        vocab[token_data["content"]] = token_data["id"]
-    # Write the vocabulary to a temporary TSV file that is handed to the SentencePiece trainer as its input
-    temp_vocab_file = os.path.join(output_dir, "temp_vocab.txt")
-    with open(temp_vocab_file, "w", encoding="utf-8") as f:
-        # Emit tokens in ascending token-ID order
-        for token, index in sorted(vocab.items(), key=lambda x: x[1]):
-            # One line per token: token<tab>1.0 (every token gets the same value; matches input_format='tsv' below)
-            f.write(f"{token}\t1.0\n")
-    print("\nCreating SentencePiece model...")
-    # Train a BPE model on that file; outputs are written under the prefix output_dir/tokenizer. NOTE(review): the token IDs above are not passed to the trainer - confirm the resulting IDs match
-    spm.SentencePieceTrainer.train(
-        input=temp_vocab_file,
-        model_prefix=os.path.join(output_dir, "tokenizer"),
-        vocab_size=len(vocab),
-        model_type='bpe',
-        character_coverage=1.0,
-        input_format='tsv',
-        train_extremely_large_corpus=True,
-        bos_id=-1,  # No beginning of sentence token
-        eos_id=-1,  # No end of sentence token
-        pad_id=-1,  # No padding token
-        unk_id=0,   # Unknown token ID
-        max_sentence_length=131072,  # 128 * 1024; NOTE(review): SentencePiece documents this limit in bytes, not tokens - confirm intent
-        num_threads=16  # Adjust based on your system's capabilities
-    )
-    # Remove the temporary vocabulary file now that training has finished
-    os.remove(temp_vocab_file)
-    print("SentencePiece model created successfully")
-    # Encode a sample string with the original tokenizer loaded from input_dir
-    test_text = "Hello, world!"
-    tokenizer = AutoTokenizer.from_pretrained(input_dir)
-    tokens_orig = tokenizer.encode(test_text)
-    # Encode the same string with the freshly written tokenizer.model
-    sp = spm.SentencePieceProcessor()
-    sp.load(os.path.join(output_dir, "tokenizer.model"))
-    tokens_sp = sp.encode_as_ids(test_text)
-    print("\nTokenizer comparison test:")
-    print(f"Original tokenizer: {tokens_orig}")
-    print(f"SentencePiece tokenizer: {tokens_sp}")  # the two ID lists are only printed, not compared
 if __name__ == "__main__":
-    input_dir = "/mnt/llm/models/phi-4/model"  # directory tokenizer.json is read from
     output_dir = "/mnt/llm/models/phi-4/converted_tokenizer"
-    convert_to_sentencepiece(input_dir, output_dir)  # writes tokenizer.model under output_dir

+from transformers import PreTrainedTokenizerFast, AutoTokenizer
 import json
 import os
 import shutil
+def safe_read_json(filepath):
+    """Parse the UTF-8 JSON file at *filepath* and return the decoded object.
+
+    Failures are reported on stdout instead of being raised: a missing file
+    and any other ``Exception`` both yield ``None``.
+    """
+    try:
+        with open(filepath, 'r', encoding='utf-8') as handle:
+            parsed = json.load(handle)
+    except FileNotFoundError:
+        print(f"File not found: {filepath}")
+    except Exception as err:
+        print(f"Error reading {filepath}: {err!s}")
+    else:
+        return parsed
+    # Reached only after one of the handlers above has printed its message
+    return None
+def safe_copy_file(src, dst):
+    """Copy *src* to *dst* with ``shutil.copy2`` and report the outcome on stdout.
+
+    Returns ``True`` when the copy succeeds and ``False`` when any
+    ``Exception`` is raised; the exception is printed, not propagated.
+    """
+    try:
+        shutil.copy2(src, dst)
+        print(f"Successfully copied {os.path.basename(src)}")
+    except Exception as err:
+        print(f"Error copying {src}: {err!s}")
+        copied = False
+    else:
+        copied = True
+    return copied
+def convert_phi_tokenizer(input_dir, output_dir):
+    """Copy the tokenizer files from *input_dir* to *output_dir* and rewrite the config.
+
+    Steps, in order:
+      1. Create *output_dir* if needed and list what *input_dir* contains.
+      2. Re-serialise each known JSON file that exists (parsed, then dumped
+         with ``indent=2``) and copy ``merges.txt`` verbatim.
+      3. Write ``tokenizer_config.json`` again with a fixed set of overrides
+         applied on top of the original config.
+      4. Load the result with ``AutoTokenizer`` and round-trip a sample
+         string; the tokenizer is saved back to *output_dir* only when the
+         decoded text equals the input.
+
+    Progress and errors are printed to stdout; nothing is returned. Files
+    that do not exist are skipped, and any exception raised in step 4 is
+    printed rather than propagated. A missing *input_dir* is not handled
+    here and propagates from ``os.listdir``.
+    """
     print(f"Converting tokenizer from {input_dir} to {output_dir}")
+    # Ensure output directory exists
     os.makedirs(output_dir, exist_ok=True)
+    # JSON files to process: each is parsed and re-dumped, so a file that
+    # fails to parse is reported and skipped rather than copied as-is
+    json_files = [
+        'tokenizer.json',
+        'tokenizer_config.json',
+        'special_tokens_map.json',
+        'vocab.json',
+        'added_tokens.json'
+    ]
+    # Files to copy directly (no JSON parsing)
+    copy_files = [
+        'merges.txt'
+    ]
+    # List what files we actually find
+    print("\nFound files:")
+    for f in os.listdir(input_dir):
+        print(f"- {f}")
+    # Process JSON files
+    for filename in json_files:
+        input_path = os.path.join(input_dir, filename)
+        if os.path.exists(input_path):
+            print(f"\nProcessing {filename}")
+            content = safe_read_json(input_path)
+            if content is not None:
+                output_path = os.path.join(output_dir, filename)
+                with open(output_path, 'w', encoding='utf-8') as f:
+                    json.dump(content, f, indent=2)
+                print(f"Successfully copied {filename}")
+    # Copy non-JSON files directly
+    for filename in copy_files:
+        input_path = os.path.join(input_dir, filename)
+        if os.path.exists(input_path):
+            print(f"\nCopying {filename}")
+            safe_copy_file(input_path, os.path.join(output_dir, filename))
+    # Load and modify the tokenizer config. It is re-read from input_dir, so
+    # the copy written by the loop above is replaced by the version below.
+    config_path = os.path.join(input_dir, 'tokenizer_config.json')
+    if os.path.exists(config_path):
+        print("\nProcessing tokenizer config")
+        config = safe_read_json(config_path)
+        if config is not None:
+            # Hard-coded overrides; existing keys with these names are replaced
+            config.update({
+                'add_prefix_space': False,
+                'clean_up_tokenization_spaces': False,
+                'model_max_length': 16384,
+                'tokenizer_class': 'GPT2Tokenizer',
+                'bos_token': '<|endoftext|>',
+                'eos_token': '<|endoftext|>',
+                'pad_token': '<|endoftext|>'
+            })
+            # Save the modified config; the encoding is explicit so this write
+            # matches every other read/write in this script instead of
+            # depending on the platform default
+            output_config_path = os.path.join(output_dir, 'tokenizer_config.json')
+            with open(output_config_path, 'w', encoding='utf-8') as f:
+                json.dump(config, f, indent=2)
+            print("Successfully updated config")
+    print("\nAttempting to test tokenizer...")
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(output_dir)
+        test_text = "Hello, world!"
+        tokens = tokenizer.encode(test_text)
+        decoded = tokenizer.decode(tokens)
+        print("Tokenizer test successful!")
+        print(f"Test text: {test_text}")
+        print(f"Encoded: {tokens}")
+        print(f"Decoded: {decoded}")
+        # check if they're the same
+        if test_text != decoded:
+            print("Decoded text does not match original text!")
+        else:
+            print("Decoded text matches original text!")
+            # save the tokenizer (only reached when the round trip is exact)
+            tokenizer.save_pretrained(output_dir)
+            print(f"Tokenizer saved to {output_dir}")
+    except Exception as e:
+        # Best-effort check: a load/encode/save failure is reported, not raised
+        print(f"Error testing tokenizer: {e}")
 if __name__ == "__main__":
+    input_dir = "/mnt/llm/models/phi-4/model"  # directory holding the original tokenizer files; edit as needed (e.g. a relative "model" directory)
     output_dir = "/mnt/llm/models/phi-4/converted_tokenizer"
+    convert_phi_tokenizer(input_dir, output_dir)  # copies the files, rewrites the config, then round-trip tests the result