""" |
|
Test script to verify the fixes for the ASL gloss processing |
|
""" |
|
|
|
import asyncio |
|
import re |
|
from vectorizer import Vectorizer |
|
|
|
|
|
def clean_gloss_token(token): |
|
""" |
|
Clean a gloss token by removing brackets, newlines, and extra whitespace |
|
""" |
|
|
|
cleaned = re.sub(r'[\[\]\n\r]', '', token) |
|
|
|
cleaned = re.sub(r'\s+', ' ', cleaned).strip() |
|
return cleaned.lower() |
|
|
|
|
|
def test_gloss_parsing(): |
|
"""Test the gloss parsing functionality""" |
|
|
|
sample_gloss = ("ASL [BEAR] [NAME] [OSKI] [LOVE] [HONEY] [BUT] [ALWAYS] " |
|
"[GET-STUCK] [TREE]\n\n[ONE_DAY] [HE] [DISCOVER] [LADDER]\n\n" |
|
"[PROBLEM] [SOLVE] [FINISH]") |
|
|
|
print("Original gloss:") |
|
print(sample_gloss) |
|
print("\n" + "="*50 + "\n") |
|
|
|
|
|
gloss_tokens = sample_gloss.split() |
|
cleaned_tokens = [] |
|
|
|
for token in gloss_tokens: |
|
cleaned = clean_gloss_token(token) |
|
if cleaned: |
|
cleaned_tokens.append(cleaned) |
|
|
|
print("Cleaned tokens:") |
|
for i, token in enumerate(cleaned_tokens): |
|
print(f"{i+1:2d}. {token}") |
|
|
|
return cleaned_tokens |
|
|
|
|
|
async def test_vectorizer(): |
|
"""Test the vectorizer functionality""" |
|
try: |
|
vectorizer = Vectorizer() |
|
|
|
|
|
test_words = ["BEAR", "LOVE", "TREE", "HE", "FINISH"] |
|
|
|
for word in test_words: |
|
print(f"\nTesting word: {word}") |
|
result = await vectorizer.vector_query_from_supabase(word) |
|
print(f"Result: {result}") |
|
|
|
except Exception as e: |
|
print(f"Error testing vectorizer: {e}") |
|
|
|
|
|
async def main(): |
|
"""Main test function""" |
|
print("Testing ASL Gloss Processing Fixes") |
|
print("=" * 50) |
|
|
|
|
|
print("\n1. Testing gloss parsing...") |
|
cleaned_tokens = test_gloss_parsing() |
|
print(f"Total cleaned tokens: {len(cleaned_tokens)}") |
|
|
|
|
|
print("\n2. Testing vectorizer...") |
|
await test_vectorizer() |
|
|
|
print("\n" + "=" * 50) |
|
print("Test completed!") |
|
|
|
|
|
if __name__ == "__main__": |
|
    asyncio.run(main())