"""
Simple test script to check sentencepiece installation and import.

This script specifically tests the sentencepiece library, which is critical
for OpenLLM model tokenization.

Author: Louis Chua Bean Chong
License: GPL-3.0
"""
|
|
|
|
import sys
|
|
import subprocess
|
|
|
|
def test_sentencepiece():
    """Test sentencepiece installation and import.

    Checks whether sentencepiece is installed via pip (installing it when
    missing), verifies that the module imports, and confirms that a basic
    tokenizer can be loaded through transformers.

    Returns:
        bool: True when every check passed, False otherwise.
    """
    print("π Testing SentencePiece Installation")
    print("=" * 40)

    print("\nπ¦ Checking pip installation...")
    try:
        # Run pip through the current interpreter so we inspect (and, if
        # needed, modify) the environment this script actually runs in,
        # not whatever `pip` happens to be first on PATH.
        result = subprocess.run(
            [sys.executable, "-m", "pip", "show", "sentencepiece"],
            capture_output=True,
            text=True,
        )
        if result.returncode == 0:
            print("β sentencepiece is installed via pip")
            print(f"Info:\n{result.stdout}")
        else:
            print("β sentencepiece is NOT installed via pip")
            print("Installing sentencepiece...")
            install_result = subprocess.run(
                [sys.executable, "-m", "pip", "install", "sentencepiece>=0.1.99"],
                capture_output=True,
                text=True,
            )
            if install_result.returncode == 0:
                print("β sentencepiece installed successfully")
            else:
                print(f"β Failed to install sentencepiece: {install_result.stderr}")
    except Exception as e:
        # Best-effort check: a pip probe failure alone should not abort the
        # test — the import check below is the authoritative signal.
        print(f"β Error checking pip: {e}")

    print("\nπ Testing Python import...")
    try:
        import sentencepiece

        print("β sentencepiece import successful")
        print(f"Version: {sentencepiece.__version__}")
    except ImportError as e:
        print(f"β sentencepiece import failed: {e}")
        return False

    print("\nπ€ Testing SentencePieceTokenizer...")
    try:
        from transformers import AutoTokenizer

        print("β AutoTokenizer import successful")

        # gpt2 is a small, widely-mirrored model: a cheap smoke test that
        # tokenizer loading works at all before trying the OpenLLM model.
        print("Testing tokenizer loading...")
        tokenizer = AutoTokenizer.from_pretrained("gpt2")
        print("β Basic tokenizer loading successful")
    except Exception as e:
        print(f"β Tokenizer test failed: {e}")
        return False

    print("\n" + "=" * 40)
    print("π― SentencePiece Test Complete!")
    return True
|
def test_openllm_model():
    """Test loading the OpenLLM model specifically.

    Loads the lemms/openllm-small-extended-7k tokenizer and model through
    transformers and reports what was loaded.

    Returns:
        bool: True when both tokenizer and model loaded, False otherwise.
    """
    print("\nπ Testing OpenLLM Model Loading")
    print("=" * 40)

    try:
        from transformers import AutoTokenizer, AutoModelForCausalLM

        print("Loading OpenLLM small model...")
        model_name = "lemms/openllm-small-extended-7k"

        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        print("β Tokenizer loaded successfully")

        print("Loading model...")
        model = AutoModelForCausalLM.from_pretrained(model_name)
        print("β Model loaded successfully")

        print("\nπ OpenLLM model test successful!")
        print(f"Model: {model_name}")
        print(f"Tokenizer type: {type(tokenizer).__name__}")
        print(f"Model type: {type(model).__name__}")

        return True

    except Exception as e:
        # Any failure (missing deps, network, auth) is reported, not raised,
        # so the caller can print follow-up guidance.
        print(f"β OpenLLM model test failed: {e}")
        return False
|
if __name__ == "__main__":
    print("π§ͺ SentencePiece and OpenLLM Model Test")
    print("=" * 50)

    # Dependency check first; only attempt the model download when the
    # tokenization stack itself is healthy.
    if not test_sentencepiece():
        print("\nβ SentencePiece test failed. Need to fix dependencies first.")
    elif test_openllm_model():
        print("\nπ All tests passed! Training should work now.")
    else:
        print("\nβ οΈ SentencePiece works but model loading failed.")

    print("\nπ‘ Next steps:")
    print("1. If tests failed, run: python install_dependencies.py")
    print("2. If tests passed, try the training again")
    print("3. If still having issues, restart the Space")