#!/usr/bin/env python3
"""
Test script to verify the fixes for the ASL gloss processing
"""
import asyncio
import re
from vectorizer import Vectorizer


def clean_gloss_token(token):
    """
    Clean a gloss token by removing brackets, newlines, and extra whitespace.
    """
    # Remove brackets and newlines
    cleaned = re.sub(r'[\[\]\n\r]', '', token)
    # Collapse runs of whitespace and trim
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    return cleaned.lower()


def test_gloss_parsing():
    """Test the gloss parsing functionality."""
    # Sample gloss output from the notebook
    sample_gloss = ("ASL [BEAR] [NAME] [OSKI] [LOVE] [HONEY] [BUT] [ALWAYS] "
                    "[GET-STUCK] [TREE]\n\n[ONE_DAY] [HE] [DISCOVER] [LADDER]\n\n"
                    "[PROBLEM] [SOLVE] [FINISH]")
    print("Original gloss:")
    print(sample_gloss)
    print("\n" + "=" * 50 + "\n")

    # Split on whitespace and clean each token
    gloss_tokens = sample_gloss.split()
    cleaned_tokens = []
    for token in gloss_tokens:
        cleaned = clean_gloss_token(token)
        if cleaned:  # Only keep non-empty tokens
            cleaned_tokens.append(cleaned)

    print("Cleaned tokens:")
    for i, token in enumerate(cleaned_tokens):
        print(f"{i + 1:2d}. {token}")
    return cleaned_tokens
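
# With the sample gloss above, test_gloss_parsing() is expected to yield
# 17 lowercase tokens (e.g. "bear", "get-stuck", "one_day", "finish").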


async def test_vectorizer():
    """Test the vectorizer functionality."""
    try:
        vectorizer = Vectorizer()
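        # Vectorizer is assumed to read its Supabase configuration from the
        # environment, so this test only succeeds when that is set up.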
        # Test with simple words that should be in the vocabulary
        test_words = ["BEAR", "LOVE", "TREE", "HE", "FINISH"]
        for word in test_words:
            print(f"\nTesting word: {word}")
            result = await vectorizer.vector_query_from_supabase(word)
            print(f"Result: {result}")
    except Exception as e:
        print(f"Error testing vectorizer: {e}")


async def main():
    """Main test function."""
    print("Testing ASL Gloss Processing Fixes")
    print("=" * 50)

    # Test 1: Gloss parsing
    print("\n1. Testing gloss parsing...")
    cleaned_tokens = test_gloss_parsing()
    print(f"Total cleaned tokens: {len(cleaned_tokens)}")

    # Test 2: Vectorizer (if environment is set up)
    print("\n2. Testing vectorizer...")
    await test_vectorizer()

    print("\n" + "=" * 50)
    print("Test completed!")
if __name__ == "__main__":
asyncio.run(main()) |