#!/usr/bin/env python3
"""
Test script to verify the fixes for the ASL gloss processing
"""

import asyncio
import re
from vectorizer import Vectorizer


def clean_gloss_token(token):
    """
    Clean a gloss token by removing brackets, newlines, and extra whitespace
    """
    # Remove brackets and newline/carriage-return characters
    cleaned = re.sub(r'[\[\]\n\r]', '', token)
    # Collapse internal whitespace and trim the ends
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    return cleaned.lower()


def test_gloss_parsing():
    """Test the gloss parsing functionality"""
    # Sample gloss output from the notebook
    sample_gloss = ("ASL [BEAR] [NAME] [OSKI] [LOVE] [HONEY] [BUT] [ALWAYS] "
                    "[GET-STUCK] [TREE]\n\n[ONE_DAY] [HE] [DISCOVER] [LADDER]\n\n"
                    "[PROBLEM] [SOLVE] [FINISH]")
    
    print("Original gloss:")
    print(sample_gloss)
    print("\n" + "="*50 + "\n")
    
    # Split on whitespace (str.split() also consumes the blank-line breaks) and clean each token
    gloss_tokens = sample_gloss.split()
    cleaned_tokens = []
    
    for token in gloss_tokens:
        cleaned = clean_gloss_token(token)
        if cleaned:  # Only add non-empty tokens
            cleaned_tokens.append(cleaned)
    
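    # Expected: 17 tokens, including the leading "asl" marker
    # (brackets stripped, whitespace collapsed, lowercased).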
    print("Cleaned tokens:")
    for i, token in enumerate(cleaned_tokens):
        print(f"{i+1:2d}. {token}")
    
    return cleaned_tokens


async def test_vectorizer():
    """Test the vectorizer functionality"""
    try:
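        # Everything below is wrapped in try/except so the script still
        # completes when the environment (e.g. Supabase credentials) is not set up.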
        vectorizer = Vectorizer()
        
        # Test with simple words that should be in the vocabulary
        test_words = ["BEAR", "LOVE", "TREE", "HE", "FINISH"]
        
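        # Query each word sequentially, awaiting one lookup before starting the next.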
        for word in test_words:
            print(f"\nTesting word: {word}")
            result = await vectorizer.vector_query_from_supabase(word)
            print(f"Result: {result}")
            
    except Exception as e:
        print(f"Error testing vectorizer: {e}")


async def main():
    """Main test function"""
    print("Testing ASL Gloss Processing Fixes")
    print("=" * 50)
    
    # Test 1: Gloss parsing
    print("\n1. Testing gloss parsing...")
    cleaned_tokens = test_gloss_parsing()
    print(f"Total cleaned tokens: {len(cleaned_tokens)}")
    
    # Test 2: Vectorizer (if environment is set up)
    print("\n2. Testing vectorizer...")
    await test_vectorizer()
    
    print("\n" + "=" * 50)
    print("Test completed!")


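# asyncio.run() creates an event loop, drives main() to completion, and closes the loop.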
if __name__ == "__main__":
    asyncio.run(main())