# Source: aramt5/src/test_t5.py (Hugging Face Hub page, uploader: crossroderick)
# Commit 8d1be51: "v3.2 hotfix with some corrections"
from transformers import AutoTokenizer, T5ForConditionalGeneration, pipeline
# Hugging Face Hub repository ID; passed to from_pretrained() for both the
# tokeniser and the model
model_path = "crossroderick/aramt5"
# Unicode bidirectional isolate controls, used when printing RTL (Syriac) text
RLI = "\u2067"  # RIGHT-TO-LEFT ISOLATE: opens an isolated right-to-left run
PDI = "\u2069"  # POP DIRECTIONAL ISOLATE: closes the most recent isolate


def rtl(text: str) -> str:
    """Return *text* enclosed between the RLI and PDI control characters.

    The isolate markers keep the right-to-left run from reordering the
    left-to-right text printed around it on a terminal.
    """
    return "{}{}{}".format(RLI, text, PDI)
# Load the tokeniser and the T5 weights identified by model_path, then wrap
# both in a text2text-generation pipeline (exposed module-wide as `pipe`).
print("Loading model and tokeniser...")
tokeniser = AutoTokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)
pipe = pipeline(
    task="text2text-generation",
    model=model,
    tokenizer=tokeniser,
)
print("Model loaded successfully.\n")
def transliterate(text: str, dialect: str = "west") -> str:
    """Transliterate Syriac-script text into Latin script.

    Args:
        text: Syriac text to transliterate.
        dialect: "east" selects the East Syriac task prefix; any other value
            (including the default "west") selects the West Syriac prefix.

    Returns:
        The "generated_text" field of the first result returned by the
        module-level pipeline.
    """
    task_prefix = "Syriac2EastLatin: " if dialect == "east" else "Syriac2WestLatin: "
    prompt = f"{task_prefix}{text}"

    # Beam search without sampling; at most 128 new tokens are generated.
    generations = pipe(
        prompt,
        max_new_tokens=128,
        num_beams=4,
        do_sample=False,
    )
    first_generation = generations[0]
    return first_generation["generated_text"]
# Sample inputs for the scripted run - a mix of words and short phrases.
# Each row is (Syriac text, dialect, description); the rows are expanded into
# the list of dicts that the rest of the script reads.
_SAMPLE_ROWS = (
    # Single words - West Syriac
    ("ܫܠܡܐ", "west", "Peace (West)"),
    ("ܐܠܗܐ", "west", "God (West)"),
    ("ܡܫܝܚܐ", "west", "Messiah/Christ (West)"),
    ("ܡܠܟܐ", "west", "King (West)"),
    ("ܒܝܬܐ", "west", "House (West)"),
    # Single words - East Syriac
    ("ܫܠܡܐ", "east", "Peace (East)"),
    ("ܐܠܗܐ", "east", "God (East)"),
    ("ܡܫܝܚܐ", "east", "Messiah/Christ (East)"),
    # Proclitic examples
    ("ܒܒܝܬܐ", "west", "In the house (West)"),
    ("ܘܡܠܟܐ", "west", "And the king (West)"),
    ("ܕܐܠܗܐ", "west", "Of God (West)"),
    ("ܠܡܠܟܐ", "west", "To the king (West)"),
    # Short phrases
    ("ܐܒܘܢ ܕܒܫܡܝܐ", "west", "Our Father in heaven (West)"),
    ("ܫܠܡܐ ܥܡܟ", "west", "Peace be with you (West)"),
)

test_samples = [
    {"text": syriac, "dialect": variety, "description": label}
    for syriac, variety, label in _SAMPLE_ROWS
]
# Scripted run: transliterate every sample and print the Syriac input next to
# the Latin output.
banner = "=" * 50
print(banner)
print("AramT5 Syriac Transliteration Test")
print(banner)

for entry in test_samples:
    syriac = entry["text"]
    latin = transliterate(syriac, entry["dialect"])
    print(f"\n{entry['description']}:")
    print(f" Syriac: {rtl(syriac)}")
    print(f" Latin: {latin}")
# Interactive mode: read lines from stdin and transliterate each one until the
# user quits.
print("\n" + "=" * 50)
print("Interactive mode - enter Syriac text to transliterate")
print("Format: [e/w] text (e=east, w=west, default=west)")
print("Enter 'q' to quit")
print("=" * 50)

while True:
    try:
        user_input = input("\n> ").strip()
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D, Ctrl-C or a closed stdin ends the session cleanly instead
        # of terminating the script with a traceback.
        print()
        break

    if user_input.lower() == "q":
        break

    # Parse the optional dialect selector: a lone "e" or "w" followed by
    # whitespace. Matching is case-insensitive, consistent with the quit
    # command, and splitting on any whitespace run keeps stray spaces or
    # tabs after the selector out of the model prompt.
    parts = user_input.split(maxsplit=1)
    if len(parts) == 2 and parts[0].lower() in ("e", "w"):
        dialect = "east" if parts[0].lower() == "e" else "west"
        text = parts[1]
    else:
        # No selector (or a selector with nothing after it): treat the whole
        # line as West Syriac text.
        dialect = "west"
        text = user_input

    if not text:
        # Blank line: nothing to transliterate, prompt again.
        continue

    result = transliterate(text, dialect)
    dialect_name = "East" if dialect == "east" else "West"
    # Separate the Syriac input from the Latin output; printing them
    # back-to-back made the result hard to tell apart from the input.
    print(f" [{dialect_name}] {rtl(text)} → {result}")