lemms committed on
Commit
e54c9be
·
verified ·
1 Parent(s): 5d375ac

Add sentencepiece-specific test script to diagnose tokenizer issues

Browse files
Files changed (1) hide show
  1. test_sentencepiece.py +127 -0
test_sentencepiece.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Simple test script to check sentencepiece installation and import.
4
+
5
+ This script specifically tests the sentencepiece library which is critical
6
+ for OpenLLM model tokenization.
7
+
8
+ Author: Louis Chua Bean Chong
9
+ License: GPL-3.0
10
+ """
11
+
12
+ import sys
13
+ import subprocess
14
+
15
def test_sentencepiece():
    """Check that sentencepiece is installed, importable, and tokenizers load.

    Runs three checks in order, printing progress for each one:

    1. Ask pip whether ``sentencepiece`` is installed and, if it is not,
       try to install ``sentencepiece>=0.1.99``. Problems in this step are
       reported but are not fatal; the import check below is the real test.
    2. Import ``sentencepiece`` and print its version.
    3. Import ``transformers.AutoTokenizer`` and load the ``gpt2`` tokenizer.

    Returns:
        bool: True when steps 2 and 3 both succeed, False otherwise.
    """
    import importlib

    # Invoke pip as a module of the interpreter running this script. A bare
    # "pip" on PATH can belong to a different Python installation, in which
    # case the answer would not describe the environment imported from below.
    pip_cmd = [sys.executable, "-m", "pip"]

    print("πŸ” Testing SentencePiece Installation")
    print("=" * 40)

    # Test 1: Check if sentencepiece is installed via pip
    print("\nπŸ“¦ Checking pip installation...")
    try:
        result = subprocess.run(
            pip_cmd + ["show", "sentencepiece"],
            capture_output=True,
            text=True,
        )
        if result.returncode == 0:
            print("βœ… sentencepiece is installed via pip")
            print(f"Info:\n{result.stdout}")
        else:
            print("❌ sentencepiece is NOT installed via pip")
            print("Installing sentencepiece...")
            install_result = subprocess.run(
                pip_cmd + ["install", "sentencepiece>=0.1.99"],
                capture_output=True,
                text=True,
            )
            if install_result.returncode == 0:
                print("βœ… sentencepiece installed successfully")
                # The package appeared while this process was running, so
                # make the import system rescan before the import below.
                importlib.invalidate_caches()
            else:
                print(f"❌ Failed to install sentencepiece: {install_result.stderr}")
    except Exception as e:
        # Deliberately best-effort: a broken pip must not prevent the
        # import check, which is what decides the outcome.
        print(f"❌ Error checking pip: {e}")

    # Test 2: Try to import sentencepiece
    print("\n🐍 Testing Python import...")
    try:
        import sentencepiece
        print("βœ… sentencepiece import successful")
        print(f"Version: {sentencepiece.__version__}")
    except ImportError as e:
        print(f"❌ sentencepiece import failed: {e}")
        return False

    # Test 3: Test tokenizer loading through transformers
    # NOTE(review): "gpt2" is presumably a BPE tokenizer that does not go
    # through sentencepiece, so this step appears to verify transformers
    # tokenizer loading in general rather than sentencepiece itself; it
    # likely also needs network access or a warm local cache - confirm.
    print("\nπŸ”€ Testing SentencePieceTokenizer...")
    try:
        from transformers import AutoTokenizer
        print("βœ… AutoTokenizer import successful")

        # Try to load a simple tokenizer to test
        print("Testing tokenizer loading...")
        AutoTokenizer.from_pretrained("gpt2")  # Simple test
        print("βœ… Basic tokenizer loading successful")

    except Exception as e:
        print(f"❌ Tokenizer test failed: {e}")
        return False

    print("\n" + "=" * 40)
    print("🎯 SentencePiece Test Complete!")
    return True
74
+
75
def test_openllm_model():
    """Try to load the OpenLLM tokenizer and model through transformers.

    Loads ``lemms/openllm-small-extended-7k`` with ``AutoTokenizer`` and
    ``AutoModelForCausalLM``, printing progress along the way and a short
    summary of the loaded object types on success.

    Returns:
        bool: True if both the tokenizer and the model load, False if any
        step (including the transformers import) raises.
    """
    print("\nπŸš€ Testing OpenLLM Model Loading")
    print("=" * 40)

    try:
        from transformers import AutoTokenizer, AutoModelForCausalLM

        print("Loading OpenLLM small model...")
        repo_id = "lemms/openllm-small-extended-7k"

        # Tokenizer first, then the model weights.
        print("Loading tokenizer...")
        loaded_tokenizer = AutoTokenizer.from_pretrained(repo_id)
        print("βœ… Tokenizer loaded successfully")

        print("Loading model...")
        loaded_model = AutoModelForCausalLM.from_pretrained(repo_id)
        print("βœ… Model loaded successfully")

        # Summary of what was loaded, one line per fact.
        print(
            "\nπŸŽ‰ OpenLLM model test successful!",
            f"Model: {repo_id}",
            f"Tokenizer type: {type(loaded_tokenizer).__name__}",
            f"Model type: {type(loaded_model).__name__}",
            sep="\n",
        )
    except Exception as e:
        print(f"❌ OpenLLM model test failed: {e}")
        return False
    else:
        return True
106
+
107
if __name__ == "__main__":
    # Script entry point: run the sentencepiece check, then the model check
    # only if the first one passed, and finish with follow-up hints.
    print("πŸ§ͺ SentencePiece and OpenLLM Model Test")
    print("=" * 50)

    if not test_sentencepiece():
        print("\n❌ SentencePiece test failed. Need to fix dependencies first.")
    elif test_openllm_model():
        print("\nπŸŽ‰ All tests passed! Training should work now.")
    else:
        print("\n⚠️ SentencePiece works but model loading failed.")

    # The hints are printed regardless of the outcome above.
    for hint in (
        "\nπŸ’‘ Next steps:",
        "1. If tests failed, run: python install_dependencies.py",
        "2. If tests passed, try the training again",
        "3. If still having issues, restart the Space",
    ):
        print(hint)