#!/usr/bin/env python3
"""

Simple test script to check sentencepiece installation and import.



This script specifically tests the sentencepiece library which is critical

for OpenLLM model tokenization.



Author: Louis Chua Bean Chong

License: GPL-3.0

"""

import sys
import subprocess

def test_sentencepiece():
    """Test sentencepiece installation and import."""
    print("πŸ” Testing SentencePiece Installation")
    print("=" * 40)
    
    # Test 1: Check if sentencepiece is installed via pip
    print("\nπŸ“¦ Checking pip installation...")
    try:
        result = subprocess.run(
            [sys.executable, "-m", "pip", "show", "sentencepiece"],
            capture_output=True,
            text=True
        )
        if result.returncode == 0:
            print("βœ… sentencepiece is installed via pip")
            print(f"Info:\n{result.stdout}")
        else:
            print("❌ sentencepiece is NOT installed via pip")
            print("Installing sentencepiece...")
            install_result = subprocess.run(
                [sys.executable, "-m", "pip", "install", "sentencepiece>=0.1.99"],
                capture_output=True,
                text=True
            )
            if install_result.returncode == 0:
                print("βœ… sentencepiece installed successfully")
            else:
                print(f"❌ Failed to install sentencepiece: {install_result.stderr}")
    except Exception as e:
        print(f"❌ Error checking pip: {e}")
    
    # Test 2: Try to import sentencepiece
    print("\n🐍 Testing Python import...")
    try:
        import sentencepiece
        print("βœ… sentencepiece import successful")
        print(f"Version: {sentencepiece.__version__}")
    except ImportError as e:
        print(f"❌ sentencepiece import failed: {e}")
        return False
    
    # Test 3: Test SentencePieceTokenizer specifically
    print("\nπŸ”€ Testing SentencePieceTokenizer...")
    try:
        from transformers import AutoTokenizer
        print("βœ… AutoTokenizer import successful")
        
        # Try to load a simple tokenizer to test
        print("Testing tokenizer loading...")
        tokenizer = AutoTokenizer.from_pretrained("gpt2")  # Simple test
        print("βœ… Basic tokenizer loading successful")
        
    except Exception as e:
        print(f"❌ Tokenizer test failed: {e}")
        return False
    
    print("\n" + "=" * 40)
    print("🎯 SentencePiece Test Complete!")
    return True

def test_openllm_model():
    """Test loading the OpenLLM model specifically."""
    print("\nπŸš€ Testing OpenLLM Model Loading")
    print("=" * 40)
    
    try:
        from transformers import AutoTokenizer, AutoModelForCausalLM
        
        print("Loading OpenLLM small model...")
        model_name = "lemms/openllm-small-extended-7k"
        
        # Load tokenizer
        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        print("βœ… Tokenizer loaded successfully")
        
        # Load model
        print("Loading model...")
        model = AutoModelForCausalLM.from_pretrained(model_name)
        print("βœ… Model loaded successfully")
        
        print(f"\nπŸŽ‰ OpenLLM model test successful!")
        print(f"Model: {model_name}")
        print(f"Tokenizer type: {type(tokenizer).__name__}")
        print(f"Model type: {type(model).__name__}")
        
        return True
        
    except Exception as e:
        print(f"❌ OpenLLM model test failed: {e}")
        return False

if __name__ == "__main__":
    print("πŸ§ͺ SentencePiece and OpenLLM Model Test")
    print("=" * 50)
    
    # Test sentencepiece
    sp_success = test_sentencepiece()
    
    # Test OpenLLM model if sentencepiece works
    if sp_success:
        model_success = test_openllm_model()
        if model_success:
            print("\nπŸŽ‰ All tests passed! Training should work now.")
        else:
            print("\n⚠️ SentencePiece works but model loading failed.")
    else:
        print("\n❌ SentencePiece test failed. Need to fix dependencies first.")
    
    print("\nπŸ’‘ Next steps:")
    print("1. If tests failed, run: python install_dependencies.py")
    print("2. If tests passed, try the training again")
    print("3. If still having issues, restart the Space")