crossroderick
/

aramt5

Text Generation

Classical Syriac

text2text-generation

transliteration

Eval Results (legacy)

Model card Files Files and versions

aramt5 / src /test_t5.py

crossroderick's picture

v3.2 hotfix with some corrections

8d1be51 about 2 months ago

history blame contribute delete

3.8 kB

	from transformers import AutoTokenizer, T5ForConditionalGeneration, pipeline

	# HF Hub path config: repository ID passed to from_pretrained() for both
	# the tokeniser and the model weights
	model_path = "crossroderick/aramt5"

	# Unicode directional formatting characters used to display RTL text (Syriac)
	RLI = "\N{RIGHT-TO-LEFT ISOLATE}"  # U+2067
	PDI = "\N{POP DIRECTIONAL ISOLATE}"  # U+2069


	def rtl(text: str) -> str:
	    """Return ``text`` enclosed between RLI and PDI for correct terminal display."""
	    return "".join((RLI, text, PDI))


	# Load the tokeniser and the T5 model from model_path, then combine them
	# into a single text2text-generation pipeline
	print("Loading model and tokeniser...")
	tokeniser = AutoTokenizer.from_pretrained(model_path)
	model = T5ForConditionalGeneration.from_pretrained(model_path)
	pipe = pipeline(
	    task="text2text-generation",
	    model=model,
	    tokenizer=tokeniser,
	)
	print("Model loaded successfully.\n")


	def transliterate(text: str, dialect: str = "west") -> str:
	    """
	    Convert Syriac text into Latin script using the loaded pipeline.

	    Args:
	        text: Syriac text to transliterate
	        dialect: 'east' selects the East Syriac (Madnḥaya) prefix; any other
	            value selects the West Syriac (Serto) prefix

	    Returns:
	        The ``generated_text`` of the first pipeline result
	    """
	    # The dialect decides which task prefix is prepended to the input
	    prefix = "Syriac2EastLatin: " if dialect == "east" else "Syriac2WestLatin: "

	    # Beam search (4 beams) without sampling, at most 128 new tokens
	    generations = pipe(
	        f"{prefix}{text}",
	        max_new_tokens=128,
	        num_beams=4,
	        do_sample=False,
	    )
	    return generations[0]["generated_text"]


	# Test examples - mix of words and sentences.
	# Each row is (text, dialect, description) and becomes one dict below.
	test_samples = [
	    {"text": text, "dialect": dialect, "description": description}
	    for text, dialect, description in (
	        # Single words - West Syriac
	        ("ܫܠܡܐ", "west", "Peace (West)"),
	        ("ܐܠܗܐ", "west", "God (West)"),
	        ("ܡܫܝܚܐ", "west", "Messiah/Christ (West)"),
	        ("ܡܠܟܐ", "west", "King (West)"),
	        ("ܒܝܬܐ", "west", "House (West)"),
	        # Single words - East Syriac
	        ("ܫܠܡܐ", "east", "Peace (East)"),
	        ("ܐܠܗܐ", "east", "God (East)"),
	        ("ܡܫܝܚܐ", "east", "Messiah/Christ (East)"),
	        # Proclitic examples
	        ("ܒܒܝܬܐ", "west", "In the house (West)"),
	        ("ܘܡܠܟܐ", "west", "And the king (West)"),
	        ("ܕܐܠܗܐ", "west", "Of God (West)"),
	        ("ܠܡܠܟܐ", "west", "To the king (West)"),
	        # Short phrases
	        ("ܐܒܘܢ ܕܒܫܡܝܐ", "west", "Our Father in heaven (West)"),
	        ("ܫܠܡܐ ܥܡܟ", "west", "Peace be with you (West)"),
	    )
	]

	# Header for the batch run over the predefined samples
	banner = "=" * 50
	print(banner)
	print("AramT5 Syriac Transliteration Test")
	print(banner)

	# Transliterate every sample and show the source text next to the output
	for sample in test_samples:
	    latin = transliterate(sample["text"], sample["dialect"])
	    print(
	        f"\n{sample['description']}:\n"
	        f" Syriac: {rtl(sample['text'])}\n"
	        f" Latin: {latin}"
	    )

	print("\n" + "=" * 50)
	print("Interactive mode - enter Syriac text to transliterate")
	print("Format: [e/w] text (e=east, w=west, default=west)")
	print("Enter 'q' to quit")
	print("=" * 50)

	# Optional one-letter flag at the start of the input -> dialect name
	dialect_flags = {"e": "east", "w": "west"}

	# Read-transliterate-print loop; ends on 'q', end of input, or Ctrl-C
	while True:
	    try:
	        user_input = input("\n> ").strip()
	    except (EOFError, KeyboardInterrupt):
	        # Closed/exhausted stdin (Ctrl-D, piped input) or Ctrl-C at the
	        # prompt ends the session cleanly instead of with a traceback
	        print()
	        break
	    if user_input.lower() == "q":
	        break

	    # Parse dialect prefix: the flag must be followed by a space, and is
	    # matched case-insensitively like the quit command
	    flag, separator, remainder = user_input.partition(" ")
	    if separator and flag.lower() in dialect_flags:
	        dialect = dialect_flags[flag.lower()]
	        # Drop extra spaces between the flag and the text
	        text = remainder.strip()
	    else:
	        dialect = "west"
	        text = user_input

	    # Blank input is skipped without calling the model
	    if text:
	        result = transliterate(text, dialect)
	        dialect_name = "East" if dialect == "east" else "West"
	        print(f" [{dialect_name}] {rtl(text)} → {result}")