#!/usr/bin/env python3
"""
Simple test script to check sentencepiece installation and import.
This script specifically tests the sentencepiece library which is critical
for OpenLLM model tokenization.
Author: Louis Chua Bean Chong
License: GPL-3.0
"""
import sys
import subprocess
def test_sentencepiece():
    """Test sentencepiece installation and import.

    Runs three checks in order:
      1. pip metadata for the ``sentencepiece`` package (attempts an
         install if it is missing),
      2. that ``import sentencepiece`` succeeds,
      3. that ``transformers.AutoTokenizer`` can load a basic tokenizer.

    Returns:
        bool: True if the import and tokenizer checks pass, False otherwise.
        (A pip failure alone does not cause a False return — the import
        test is the authoritative check.)
    """
    print("[*] Testing SentencePiece Installation")
    print("=" * 40)

    # Test 1: Check if sentencepiece is installed via pip.
    print("\n[*] Checking pip installation...")
    try:
        # Use the current interpreter's pip (python -m pip) so we query and
        # install into the environment this script actually runs in; a bare
        # "pip" executable may belong to a different Python installation.
        result = subprocess.run(
            [sys.executable, "-m", "pip", "show", "sentencepiece"],
            capture_output=True,
            text=True,
        )
        if result.returncode == 0:
            print("[OK] sentencepiece is installed via pip")
            print(f"Info:\n{result.stdout}")
        else:
            print("[FAIL] sentencepiece is NOT installed via pip")
            print("Installing sentencepiece...")
            install_result = subprocess.run(
                [sys.executable, "-m", "pip", "install", "sentencepiece>=0.1.99"],
                capture_output=True,
                text=True,
            )
            if install_result.returncode == 0:
                print("[OK] sentencepiece installed successfully")
            else:
                print(f"[FAIL] Failed to install sentencepiece: {install_result.stderr}")
    except Exception as e:
        # Best-effort check only: a pip problem should not abort the
        # import test below, which is the real success criterion.
        print(f"[FAIL] Error checking pip: {e}")

    # Test 2: Try to import sentencepiece.
    print("\n[*] Testing Python import...")
    try:
        import sentencepiece
        print("[OK] sentencepiece import successful")
        print(f"Version: {sentencepiece.__version__}")
    except ImportError as e:
        print(f"[FAIL] sentencepiece import failed: {e}")
        return False

    # Test 3: Test SentencePiece-backed tokenization via transformers.
    print("\n[*] Testing SentencePieceTokenizer...")
    try:
        from transformers import AutoTokenizer
        print("[OK] AutoTokenizer import successful")
        # Load a small, well-known tokenizer as a smoke test.
        print("Testing tokenizer loading...")
        tokenizer = AutoTokenizer.from_pretrained("gpt2")  # simple test
        print("[OK] Basic tokenizer loading successful")
    except Exception as e:
        print(f"[FAIL] Tokenizer test failed: {e}")
        return False

    print("\n" + "=" * 40)
    print("[*] SentencePiece Test Complete!")
    return True
def test_openllm_model():
    """Test loading the OpenLLM model and tokenizer from the Hugging Face Hub.

    Downloads (or reads from cache) the ``lemms/openllm-small-extended-7k``
    tokenizer and causal-LM weights via ``transformers``.

    Returns:
        bool: True if both the tokenizer and the model load successfully,
        False on any failure (import error, network error, load error).
    """
    print("\n[*] Testing OpenLLM Model Loading")
    print("=" * 40)
    try:
        from transformers import AutoTokenizer, AutoModelForCausalLM

        print("Loading OpenLLM small model...")
        model_name = "lemms/openllm-small-extended-7k"

        # Load tokenizer first: it is small and fails fast if the repo
        # name is wrong or the sentencepiece backend is missing.
        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        print("[OK] Tokenizer loaded successfully")

        # Load model weights (the slow part).
        print("Loading model...")
        model = AutoModelForCausalLM.from_pretrained(model_name)
        print("[OK] Model loaded successfully")

        print("\n[*] OpenLLM model test successful!")
        print(f"Model: {model_name}")
        print(f"Tokenizer type: {type(tokenizer).__name__}")
        print(f"Model type: {type(model).__name__}")
        return True
    except Exception as e:
        # Broad catch is deliberate: any failure mode (HTTP, disk, config)
        # should be reported and turned into a False result, not a crash.
        print(f"[FAIL] OpenLLM model test failed: {e}")
        return False
if __name__ == "__main__":
    print("[*] SentencePiece and OpenLLM Model Test")
    print("=" * 50)

    # Run the sentencepiece checks first; the model test is pointless
    # without a working sentencepiece installation.
    sp_success = test_sentencepiece()

    if sp_success:
        model_success = test_openllm_model()
        if model_success:
            print("\n[OK] All tests passed! Training should work now.")
        else:
            print("\n[WARN] SentencePiece works but model loading failed.")
    else:
        print("\n[FAIL] SentencePiece test failed. Need to fix dependencies first.")

    # Guidance for the operator regardless of outcome.
    print("\nNext steps:")
    print("1. If tests failed, run: python install_dependencies.py")
    print("2. If tests passed, try the training again")
    print("3. If still having issues, restart the Space")