import streamlit as st
import os
import xml.etree.ElementTree as ET
import re
import sys
# Try importing transformers with detailed error handling
try:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
except ImportError as e:
st.error(f"""
### ❌ Transformers Import Error
Failed to import required transformers components: {e}
**Debug Info:**
- Python version: {sys.version}
- Torch available: {('torch' in sys.modules)}
**This usually means:**
1. The Docker container is still rebuilding (wait 2-5 minutes)
2. Dependencies weren't installed correctly
3. There's a version conflict in requirements.txt
Please check the HuggingFace Space build logs or try rebuilding the Space.
""")
st.stop()
from huggingface_hub import InferenceClient
from coptic_parser_core import CopticParserCore
# ========================================
# COPTIC TRANSLATOR PREPROCESSING FUNCTIONS
# ========================================
# These functions convert between Coptic Unicode and Greek transcription
# Required for Coptic translator models (MarianMT-based)
COPTIC_TO_GREEK = {
"ⲁ": "α", "ⲃ": "β", "ⲅ": "γ", "ⲇ": "δ", "ⲉ": "ε", "ⲋ": "ϛ",
"ⲍ": "ζ", "ⲏ": "η", "ⲑ": "θ", "ⲓ": "ι", "ⲕ": "κ", "ⲗ": "λ",
"ⲙ": "μ", "ⲛ": "ν", "ⲝ": "ξ", "ⲟ": "ο", "ⲡ": "π", "ⲣ": "ρ",
"ⲥ": "σ", "ⲧ": "τ", "ⲩ": "υ", "ⲫ": "φ", "ⲭ": "χ", "ⲯ": "ψ",
"ⲱ": "ω",
# Coptic-specific characters (must match model training)
"ϣ": "ʃ", "ϥ": "f", "ϧ": "x", "ϩ": "h", "ϫ": "ɟ",
"ϭ": "c", "ϯ": "ti",
# Uppercase variants
"Ⲁ": "Α", "Ⲃ": "Β", "Ⲅ": "Γ", "Ⲇ": "Δ", "Ⲉ": "Ε", "Ⲍ": "Ζ", "Ⲏ": "Η", "Ⲑ": "Θ",
"Ⲓ": "Ι", "Ⲕ": "Κ", "Ⲗ": "Λ", "Ⲙ": "Μ", "Ⲛ": "Ν", "Ⲝ": "Ξ", "Ⲟ": "Ο", "Ⲡ": "Π",
"Ⲣ": "Ρ", "Ⲥ": "Σ", "Ⲧ": "Τ", "Ⲩ": "Υ", "Ⲫ": "Φ", "Ⲭ": "Χ", "Ⲯ": "Ψ", "Ⲱ": "Ω",
"Ϣ": "Ʃ", "Ϥ": "F", "Ϧ": "X", "Ϩ": "H", "Ϫ": "Ɉ", "Ϭ": "C", "Ϯ": "TI"
}
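# Note: greekify() below lowercases each character before lookup, so only the
# lowercase mappings are exercised; the uppercase rows are kept for completeness.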
GREEK_TO_COPTIC = {
"α": "ⲁ", "β": "ⲃ", "γ": "ⲅ", "δ": "ⲇ", "ε": "ⲉ", "ϛ": "ⲋ",
"ζ": "ⲍ", "η": "ⲏ", "θ": "ⲑ", "ι": "ⲓ", "κ": "ⲕ", "λ": "ⲗ",
"μ": "ⲙ", "ν": "ⲛ", "ξ": "ⲝ", "ο": "ⲟ", "π": "ⲡ", "ρ": "ⲣ",
"σ": "ⲥ", "ς": "ⲥ", "τ": "ⲧ", "υ": "ⲩ", "φ": "ⲫ", "χ": "ⲭ", "ψ": "ⲯ",
"ω": "ⲱ",
# Coptic-specific characters (must match model training)
"ʃ": "ϣ", "f": "ϥ", "x": "ϧ", "h": "ϩ", "ɟ": "ϫ",
"c": "ϭ", "ti": "ϯ",
# Uppercase variants
"Α": "Ⲁ", "Β": "Ⲃ", "Γ": "Ⲅ", "Δ": "Ⲇ", "Ε": "Ⲉ", "Ζ": "Ⲍ", "Η": "Ⲏ", "Θ": "Ⲑ",
"Ι": "Ⲓ", "Κ": "Ⲕ", "Λ": "Ⲗ", "Μ": "Ⲙ", "Ν": "Ⲛ", "Ξ": "Ⲝ", "Ο": "Ⲟ", "Π": "Ⲡ",
"Ρ": "Ⲣ", "Σ": "Ⲥ", "Τ": "Ⲧ", "Υ": "Ⲩ", "Φ": "Ⲫ", "Χ": "Ⲭ", "Ψ": "Ⲯ", "Ω": "Ⲱ",
"Ʃ": "Ϣ", "F": "Ϥ", "X": "Ϧ", "H": "Ϩ", "Ɉ": "Ϫ", "C": "Ϭ", "TI": "Ϯ"
}
def greekify(coptic_text):
"""Convert Coptic Unicode to Greek transcription for Coptic translator models."""
chars = []
for c in coptic_text:
l_c = c.lower()
chars.append(COPTIC_TO_GREEK.get(l_c, l_c))
return "".join(chars)
def degreekify(greek_text):
"""Convert Greek transcription back to Coptic Unicode.
Handles two-character sequences like 'ti' → 'ϯ'
"""
result = []
i = 0
while i < len(greek_text):
# Check for two-character sequences first
if i < len(greek_text) - 1:
two_char = greek_text[i:i+2].lower()
if two_char == 'ti':
result.append(GREEK_TO_COPTIC.get(two_char, greek_text[i:i+2]))
i += 2
continue
# Single character
result.append(GREEK_TO_COPTIC.get(greek_text[i], greek_text[i]))
i += 1
return ''.join(result)
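# Worked examples (derived from the mapping and the two-character lookahead above):
#   degreekify("νουτε") -> "ⲛⲟⲩⲧⲉ"
#   degreekify("ti")    -> "ϯ"   (the 'ti' digraph is matched before single characters)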
# Coptic alphabet helper
COPTIC_ALPHABET = {
'Ⲁ': 'Alpha', 'Ⲃ': 'Beta', 'Ⲅ': 'Gamma', 'Ⲇ': 'Delta', 'Ⲉ': 'Epsilon', 'Ⲋ': 'Zeta',
'Ⲏ': 'Eta', 'Ⲑ': 'Theta', 'Ⲓ': 'Iota', 'Ⲕ': 'Kappa', 'Ⲗ': 'Lambda', 'Ⲙ': 'Mu',
'Ⲛ': 'Nu', 'Ⲝ': 'Xi', 'Ⲟ': 'Omicron', 'Ⲡ': 'Pi', 'Ⲣ': 'Rho', 'Ⲥ': 'Sigma',
'Ⲧ': 'Tau', 'Ⲩ': 'Upsilon', 'Ⲫ': 'Phi', 'Ⲭ': 'Chi', 'Ⲯ': 'Psi', 'Ⲱ': 'Omega',
'Ϣ': 'Shai', 'Ϥ': 'Fai', 'Ϧ': 'Khei', 'Ϩ': 'Hori', 'Ϫ': 'Gangia', 'Ϭ': 'Shima', 'Ϯ': 'Ti'
}
# Coptic linguistic prompts (will be formatted with target language)
def get_coptic_prompts(target_language):
"""Generate Coptic analysis prompts with specified target language"""
return {
'dialect_analysis': f"Analyze the Coptic dialect of this text and identify linguistic features. Respond in {target_language}:",
'translation': f"You are a professional Coptic translator. Translate the following Coptic text to {target_language}.\n\nIMPORTANT: Provide ONLY the direct translation. Do not include:\n- The original Coptic text\n- Explanations or commentary\n- Notes about context or meaning\n- Any text other than the {target_language} translation\n\nCoptic text to translate:",
'transcription': f"Provide a romanized transcription of this Coptic text. Respond in {target_language}:",
'morphology': f"Analyze the morphological structure of these Coptic words. Respond in {target_language}:",
'lexicon_lookup': f"Look up these Coptic words and provide definitions with Greek etymologies. Respond in {target_language}:"
}
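# Example: get_coptic_prompts("French")['morphology'] returns
#   "Analyze the morphological structure of these Coptic words. Respond in French:"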
# Lexicon loader
@st.cache_data
def load_coptic_lexicon(file_path=None, file_fingerprint=None):
    """Load Coptic lexicon from various formats including TEI XML.

    file_fingerprint (e.g. the upload's size) takes part in the st.cache_data
    key, so re-uploading a different file under the same temp path is not
    served a stale cached lexicon.
    """
if not file_path or not os.path.exists(file_path):
return {}
lexicon = {}
try:
# Handle XML format (TEI structure for Comprehensive Coptic Lexicon)
if file_path.endswith('.xml'):
tree = ET.parse(file_path)
root = tree.getroot()
# Handle TEI namespace
ns = {'tei': 'http://www.tei-c.org/ns/1.0'}
# Find entries in TEI format
entries = root.findall('.//tei:entry', ns)
for entry in entries[:100]: # Limit to first 100 entries for performance
coptic_word = ""
definition = ""
# Extract Coptic headword from TEI structure
form = entry.find('.//tei:form[@type="lemma"]', ns) or entry.find('.//tei:form', ns)
if form is not None:
orth = form.find('.//tei:orth', ns)
if orth is not None and orth.text:
coptic_word = orth.text.strip()
# Extract definition from sense elements
senses = entry.findall('.//tei:sense', ns)
definitions = []
for sense in senses[:2]: # Limit to first 2 senses
def_elem = sense.find('.//tei:def', ns)
if def_elem is not None and def_elem.text:
definitions.append(def_elem.text.strip())
if definitions:
definition = "; ".join(definitions)
# Clean and store
if coptic_word and definition:
# Clean Coptic word (preserve Coptic and Greek Unicode)
coptic_word = re.sub(r'[^\u2C80-\u2CFF\u03B0-\u03FF\u1F00-\u1FFF\w\s\-]', '', coptic_word).strip()
if coptic_word:
lexicon[coptic_word] = definition[:200] # Limit definition length
# Handle text formats
else:
with open(file_path, 'r', encoding='utf-8') as f:
for line in f:
line = line.strip()
if not line:
continue
# Support multiple separators
separator = None
for sep in ['\t', '|', ',', ';']:
if sep in line:
separator = sep
break
if separator:
parts = line.split(separator, 1)
if len(parts) >= 2:
coptic_word = parts[0].strip()
definition = parts[1].strip()
lexicon[coptic_word] = definition
except Exception as e:
st.error(f"Error loading lexicon: {str(e)}")
return lexicon
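# Example of an accepted plain-text lexicon line (TAB-separated; the entry is
# hypothetical and shown only to illustrate the format):
#   ⲛⲟⲩⲧⲉ\tgod, divine being
# which load_coptic_lexicon() stores as {"ⲛⲟⲩⲧⲉ": "god, divine being"}.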
# ========================================
# COPTIC TRANSLATOR MODEL LOADING
# ========================================
# Load and cache Coptic translation models
@st.cache_resource
def load_coptic_to_english_model():
"""Load Coptic → English translation model (Norelad/coptic-megalaa-finetuned)."""
try:
with st.spinner("📥 Loading Coptic→English model (first time only, ~600MB)..."):
model_name = "Norelad/coptic-megalaa-finetuned"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Move to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
st.success(f"✅ Coptic→English model loaded on {device.upper()}")
return tokenizer, model, device
except Exception as e:
st.error(f"Failed to load Coptic→English model: {e}")
return None, None, None
@st.cache_resource
def load_english_to_coptic_model():
"""Load English → Coptic translation model (megalaa/english-coptic-translator)."""
try:
with st.spinner("📥 Loading English→Coptic model (first time only, ~600MB)..."):
model_name = "megalaa/english-coptic-translator"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Move to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
st.success(f"✅ English→Coptic model loaded on {device.upper()}")
return tokenizer, model, device
except Exception as e:
st.error(f"Failed to load English→Coptic model: {e}")
return None, None, None
def translate_coptic_to_english(text, dialect='cop-sa'):
"""Translate Coptic text to English using local Coptic translator.
Args:
text: Coptic text to translate
dialect: Coptic dialect ('cop-sa' for Sahidic, 'cop-bo' for Bohairic, 'cop' defaults to Sahidic)
"""
tokenizer, model, device = load_coptic_to_english_model()
if tokenizer is None or model is None:
return "Error: Model not loaded. Please check your internet connection."
try:
# Dialect tags (required by the Norelad/coptic-megalaa-finetuned model)
DIALECT_TAGS = {
'cop-sa': 'з', # Sahidic (Cyrillic 'з')
'cop-bo': 'б', # Bohairic (Cyrillic 'б')
'cop': 'з' # Default to Sahidic for generic Coptic
}
dialect_tag = DIALECT_TAGS.get(dialect, 'з')
# Preprocessing: Convert Coptic Unicode to Greek transcription and add dialect tag
greek_input = greekify(text.lower())
greek_input = f"{dialect_tag} {greek_input}"
# Tokenize and generate
inputs = tokenizer(greek_input, return_tensors="pt", padding=True).to(device)
outputs = model.generate(
**inputs,
max_new_tokens=128,
num_beams=5,
early_stopping=True
)
# Decode translation
translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
return translation
except Exception as e:
return f"Translation error: {e}"
def translate_english_to_coptic(text):
"""Translate English text to Coptic using local Coptic translator."""
tokenizer, model, device = load_english_to_coptic_model()
if tokenizer is None or model is None:
return "Error: Model not loaded. Please check your internet connection."
try:
# Tokenize and generate (input is already in English)
inputs = tokenizer(text, return_tensors="pt", padding=True).to(device)
outputs = model.generate(
**inputs,
max_new_tokens=128,
num_beams=5,
early_stopping=True
)
# Decode and postprocess: Convert Greek transcription to Coptic Unicode
greek_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
coptic_output = degreekify(greek_output)
return coptic_output
except Exception as e:
return f"Translation error: {e}"
# Language detection and UI
LANGUAGES = {
'en': 'English', 'es': 'Español', 'fr': 'Français', 'de': 'Deutsch',
'zh': '中文', 'ja': '日本語', 'ar': 'العربية', 'hi': 'हिन्दी',
'cop': 'Coptic (ⲘⲉⲧⲢⲉⲙ̀ⲛⲭⲏⲙⲓ)', 'cop-sa': 'Sahidic Coptic', 'cop-bo': 'Bohairic Coptic'
}
st.set_page_config(page_title="Apertus Chat", layout="wide")
# Initialize variables (so they're accessible throughout the script)
analysis_type = None
target_lang = None
target_language_name = "English"
# Language selector
selected_lang = st.selectbox("Language / Langue / Idioma",
options=list(LANGUAGES.keys()),
format_func=lambda x: LANGUAGES[x])
# Sidebar for Coptic tools
with st.sidebar:
st.header("Coptic Tools")
# Translation Model Selection
st.subheader("🤖 Translation Model")
st.info("✨ **NEW:** Using specialized Coptic translator models (free, no API token needed!)")
st.markdown("Models: `Norelad/coptic-megalaa-finetuned` & `megalaa/english-coptic-translator`")
# Optional: HuggingFace API Token for advanced features
with st.expander("⚙️ Advanced: Use Apertus-8B (optional)"):
st.caption("For multi-language translation beyond English-Coptic")
hf_token_input = st.text_input(
"HuggingFace API Token",
type="password",
help="Optional: For Apertus-8B multi-language support"
)
use_apertus = st.checkbox("Use Apertus-8B instead of local Coptic translator", value=False)
if hf_token_input and use_apertus:
st.success("✅ Apertus-8B enabled")
elif not use_apertus:
hf_token_input = None # Disable API usage
st.divider()
# Lexicon file uploader
st.subheader("📚 Lexicon Upload")
lexicon_file = st.file_uploader(
"Upload Coptic Lexicon (optional)",
type=['txt', 'tsv', 'csv', 'xml'],
help="Supports: Text (TAB/pipe separated), XML (TEI format), CSV\nNote: Comprehensive lexicon is pre-loaded"
)
# Load lexicon
if lexicon_file:
try:
# Check file size (max 20MB)
file_size = len(lexicon_file.getvalue())
if file_size > 20 * 1024 * 1024:
st.error("❌ File too large (max 20MB)")
coptic_lexicon = {}
else:
# Save uploaded file temporarily
temp_path = f"temp_lexicon.{lexicon_file.name.split('.')[-1]}"
with open(temp_path, "wb") as f:
f.write(lexicon_file.getbuffer())
                coptic_lexicon = load_coptic_lexicon(temp_path, file_fingerprint=file_size)
if coptic_lexicon:
st.success(f"✅ Loaded {len(coptic_lexicon)} lexicon entries from {lexicon_file.name}")
else:
st.warning("⚠️ File uploaded but no valid entries found")
coptic_lexicon = {}
# Clean up temp file
if os.path.exists(temp_path):
os.remove(temp_path)
except Exception as e:
st.error(f"❌ Error loading file: {str(e)}")
st.info("💡 Supported formats: Plain text (TAB/pipe separated), XML (TEI), CSV")
coptic_lexicon = {}
else:
# Try to load the comprehensive lexicon if available
comprehensive_lexicon_path = "Comprehensive_Coptic_Lexicon-v1.2-2020.xml"
if os.path.exists(comprehensive_lexicon_path):
coptic_lexicon = load_coptic_lexicon(comprehensive_lexicon_path)
if coptic_lexicon:
st.info(f"📚 Loaded Comprehensive Coptic Lexicon: {len(coptic_lexicon)} entries")
else:
coptic_lexicon = {}
else:
coptic_lexicon = {}
# Coptic alphabet reference
if st.expander("Coptic Alphabet"):
for letter, name in COPTIC_ALPHABET.items():
st.text(f"{letter} - {name}")
# Lexicon search
if coptic_lexicon:
st.subheader("Lexicon Search")
# Initialize session state for search term
if "search_term" not in st.session_state:
st.session_state.search_term = ""
# Virtual Coptic keyboard
st.write("**Virtual Keyboard:**")
coptic_letters = ['ⲁ', 'ⲃ', 'ⲅ', 'ⲇ', 'ⲉ', 'ⲍ', 'ⲏ', 'ⲑ', 'ⲓ', 'ⲕ', 'ⲗ', 'ⲙ', 'ⲛ', 'ⲝ', 'ⲟ', 'ⲡ', 'ⲣ', 'ⲥ', 'ⲧ', 'ⲩ', 'ⲫ', 'ⲭ', 'ⲯ', 'ⲱ', 'ϣ', 'ϥ', 'ϧ', 'ϩ', 'ϫ', 'ϭ', 'ϯ']
        # Create keyboard layout in four rows of eight buttons
        keyboard_rows = [st.columns(8) for _ in range(4)]
        # Keyboard buttons - accumulate the search term in session state
        for i, letter in enumerate(coptic_letters):
            if keyboard_rows[i // 8][i % 8].button(letter, key=f"key_{letter}"):
                st.session_state.search_term += letter
                st.rerun()
# Control buttons
col_space, col_back, col_clear = st.columns(3)
with col_space:
if st.button("Space"):
st.session_state.search_term += " "
st.rerun()
with col_back:
if st.button("⌫ Backspace"):
st.session_state.search_term = st.session_state.search_term[:-1]
st.rerun()
with col_clear:
if st.button("Clear"):
st.session_state.search_term = ""
st.rerun()
# Search input - directly use session state WITHOUT key parameter to avoid conflicts
search_term = st.text_input("Search Coptic word:", value=st.session_state.search_term)
# Update session state if user types directly
if search_term != st.session_state.search_term:
st.session_state.search_term = search_term
if search_term:
if search_term in coptic_lexicon:
st.write(f"**{search_term}**")
st.write(coptic_lexicon[search_term])
else:
# Partial matches
matches = [k for k in coptic_lexicon.keys() if search_term in k]
if matches:
st.write("Partial matches:")
for match in matches[:5]: # Show first 5 matches
st.write(f"**{match}** → {coptic_lexicon[match][:100]}...")
else:
st.write("No matches found")
# Test Corpus Examples
if selected_lang in ['cop', 'cop-sa', 'cop-bo']:
st.divider()
st.subheader("📖 Example Texts")
try:
import json
from pathlib import Path
corpus_path = Path(__file__).parent / "coptic_test_corpus.json"
if corpus_path.exists():
with open(corpus_path, 'r', encoding='utf-8') as f:
corpus = json.load(f)
# Category selection
categories = {
"simple_sentences": "Simple Sentences",
"complex_sentences": "Complex Sentences",
"short_texts": "Short Texts (Paragraphs)",
"grammar_patterns": "Grammar Patterns"
}
selected_category = st.selectbox(
"Choose category:",
options=list(categories.keys()),
format_func=lambda x: categories[x],
key="corpus_category"
)
if selected_category in corpus['categories']:
category_data = corpus['categories'][selected_category]
if selected_category == 'grammar_patterns':
# Handle grammar patterns differently
pattern_names = [p['pattern'] for p in category_data['patterns']]
selected_pattern = st.selectbox("Select pattern:", pattern_names, key="pattern_select")
pattern_data = next(p for p in category_data['patterns'] if p['pattern'] == selected_pattern)
st.caption(f"**Structure:** {pattern_data['structure']}")
                        example_texts = [f"{ex['coptic']} → {ex['english']}" for ex in pattern_data['examples']]
selected_example_idx = st.selectbox(
"Select example:",
range(len(pattern_data['examples'])),
format_func=lambda i: example_texts[i],
key="pattern_example"
)
example = pattern_data['examples'][selected_example_idx]
else:
# Handle regular examples
examples = category_data['examples']
example_labels = []
for ex in examples:
label = ex.get('title', ex['coptic'][:30] + '...' if len(ex['coptic']) > 30 else ex['coptic'])
example_labels.append(label)
selected_example_idx = st.selectbox(
"Select example:",
range(len(examples)),
format_func=lambda i: example_labels[i],
key="example_select"
)
example = examples[selected_example_idx]
# Display example details
with st.expander("📝 View Example", expanded=True):
st.markdown(f"**Coptic:**")
st.code(example['coptic'], language="")
st.markdown(f"**English:**")
st.write(example['english'])
if 'grammar_notes' in example:
st.caption(f"*Grammar:* {example['grammar_notes']}")
elif 'analysis' in example:
st.caption(f"*Analysis:* {example['analysis']}")
if 'source' in example:
st.caption(f"*Source:* {example['source']}")
# Load button
if st.button("📥 Load This Example", key="load_example", use_container_width=True):
st.session_state['example_text'] = example['coptic']
st.success("✓ Example loaded! Scroll down to chat input.")
st.rerun()
except Exception as e:
st.info("💡 Test corpus not available")
# Linguistic analysis options for Coptic input
if selected_lang in ['cop', 'cop-sa', 'cop-bo']:
st.subheader("Analysis Type")
analysis_type = st.selectbox("Choose analysis:",
options=['dependency_parse', 'translation', 'parse_and_translate', 'dialect_analysis', 'transcription', 'morphology', 'lexicon_lookup'],
format_func=lambda x: x.replace('_', ' ').title())
# Target language selector for translation
if analysis_type in ['translation', 'parse_and_translate']:
st.subheader("Target Language")
target_lang = st.selectbox("Translate to:",
options=[k for k in LANGUAGES.keys() if k not in ['cop', 'cop-sa', 'cop-bo']],
format_func=lambda x: LANGUAGES[x],
index=0) # Default to English
target_language_name = LANGUAGES[target_lang]
else:
# For non-translation tasks, use English as default output language
target_language_name = "English"
# Get prompts for the target language (only for LLM-based tasks)
if analysis_type not in ['dependency_parse', 'parse_and_translate']:
COPTIC_PROMPTS = get_coptic_prompts(target_language_name)
# Use the HuggingFace Inference API for Apertus-8B instead of loading it locally
# (the 8B model would need a GPU; the small Coptic translators above run locally)
MODEL_NAME = "swiss-ai/Apertus-8B-Instruct-2509"
def get_inference_client(token=None):
"""Initialize HuggingFace Inference API client with provided token"""
try:
if token:
client = InferenceClient(token=token)
return client
else:
# Try to get token from Space secrets as fallback
if hasattr(st, 'secrets') and 'HF_TOKEN' in st.secrets:
client = InferenceClient(token=st.secrets['HF_TOKEN'])
return client
else:
return None
except Exception as e:
st.error(f"Error initializing inference client: {e}")
return None
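# Note: get_inference_client() returns None when no token is available from the
# sidebar or Space secrets; every caller below guards against that None.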
# Initialize Coptic Dependency Parser
@st.cache_resource
def get_parser():
"""Initialize and cache the Coptic parser"""
try:
parser = CopticParserCore()
# Note: Don't pre-load here, load on demand to avoid startup delays
# First use will trigger model download if needed
return parser
except Exception as e:
st.error(f"Failed to initialize parser: {e}")
return None
# Chat interface
if "messages" not in st.session_state:
st.session_state.messages = []
# Display chat history
for message in st.session_state.messages:
with st.chat_message(message["role"]):
st.markdown(message["content"])
# Check if an example was loaded from the test corpus
prompt = None
if 'example_text' in st.session_state:
prompt = st.session_state['example_text']
del st.session_state['example_text'] # Clear after using
# User input (or use loaded example)
if not prompt:
prompt = st.chat_input("Type your message...")
if prompt:
# Handle dependency parsing (doesn't need API token)
if selected_lang in ['cop', 'cop-sa', 'cop-bo'] and analysis_type == 'dependency_parse':
st.session_state.messages.append({"role": "user", "content": prompt})
with st.chat_message("user"):
st.markdown(f"**Parse this text:** {prompt}")
with st.chat_message("assistant"):
with st.spinner("🔍 Parsing Coptic text..."):
parser = get_parser()
if parser:
try:
parse_result = parser.parse_text(prompt)
if parse_result:
# Display parse results
st.success(f"✅ Parsed {parse_result['total_sentences']} sentence(s), {parse_result['total_tokens']} tokens")
# Show formatted table
table_output = parser.format_table(parse_result)
st.markdown(table_output)
# Display Prolog validation results if available
if 'prolog_validation' in parse_result and parse_result['prolog_validation']:
validation = parse_result['prolog_validation']
st.divider()
st.subheader("🔍 Prolog Validation (Walter Till Grammar)")
# Show detected patterns
if validation.get('patterns_detected'):
st.success("**✓ Grammatical Patterns Detected:**")
for pattern in validation['patterns_detected']:
if isinstance(pattern, dict):
if pattern.get('is_tripartite'):
st.write(f"- **Tripartite Sentence**: {pattern.get('description', '')}")
st.code(pattern.get('pattern', ''), language="")
else:
st.write(f"- {pattern}")
else:
st.write(f"- {pattern}")
# Show warnings if any
if validation.get('warnings'):
st.warning("**⚠ Grammatical Warnings:**")
for warning in validation['warnings']:
st.write(f"- {warning}")
# Show if no issues found
if not validation.get('warnings') and not validation.get('patterns_detected'):
st.info("✓ No grammatical issues detected")
# Offer CoNLL-U download
conllu_output = parser.format_conllu(parse_result)
st.download_button(
label="📥 Download CoNLL-U",
data=conllu_output,
file_name="coptic_parse.conllu",
mime="text/plain"
)
response = f"Parse complete. {parse_result['total_sentences']} sentences analyzed."
st.session_state.messages.append({"role": "assistant", "content": response})
else:
st.error("Failed to parse text. Please check the input.")
except Exception as e:
st.error(f"Parsing error: {e}")
else:
st.error("Parser not available. Please check Stanza installation.")
st.stop() # Don't continue to translation
# Initialize inference client if API token is provided (optional for local translator)
inference_client = None
if hf_token_input:
inference_client = get_inference_client(hf_token_input)
# Handle parse_and_translate mode
if selected_lang in ['cop', 'cop-sa', 'cop-bo'] and analysis_type == 'parse_and_translate':
st.session_state.messages.append({"role": "user", "content": prompt})
with st.chat_message("user"):
st.markdown(f"**Parse and translate:** {prompt}")
with st.chat_message("assistant"):
# First, parse
st.subheader("📊 Dependency Analysis")
with st.spinner("🔍 Parsing..."):
parser = get_parser()
if parser:
parse_result = parser.parse_text(prompt)
if parse_result:
table_output = parser.format_table(parse_result)
st.markdown(table_output)
# Then, translate
st.divider()
st.subheader(f"🌍 Translation to {LANGUAGES[target_lang]}")
with st.spinner("🤖 Translating with local Coptic translator..."):
try:
# Use local Coptic translator for Coptic→English translation
if target_lang == 'en':
translation = translate_coptic_to_english(prompt, dialect=selected_lang)
st.markdown(translation)
combined_response = f"Parse complete. Translation: {translation}"
st.session_state.messages.append({"role": "assistant", "content": combined_response})
else:
# For non-English targets, need Apertus or show message
if inference_client and hf_token_input:
COPTIC_PROMPTS_TRANSLATE = get_coptic_prompts(target_language_name)
translate_prompt = f"{COPTIC_PROMPTS_TRANSLATE['translation']} {prompt}"
messages = [
{"role": "system", "content": "You are a professional Coptic-to-modern-language translator. Provide only direct translations without explanations, commentary, or repeating the source text."},
{"role": "user", "content": translate_prompt}
]
response_stream = inference_client.chat_completion(
model=MODEL_NAME,
messages=messages,
max_tokens=512,
temperature=0.5,
top_p=0.9,
stream=True
)
# Stream the translation
response_placeholder = st.empty()
full_response = ""
for message in response_stream:
if message.choices[0].delta.content:
full_response += message.choices[0].delta.content
response_placeholder.markdown(full_response + "▌")
response_placeholder.markdown(full_response)
combined_response = f"Parse complete. Translation: {full_response}"
st.session_state.messages.append({"role": "assistant", "content": combined_response})
else:
st.warning(f"⚠️ Translation to {target_language_name} requires Apertus-8B. Please enable it in the sidebar.")
st.info("💡 Local Coptic translator currently supports English↔Coptic only.")
except Exception as e:
st.error(f"❌ Translation error: {e}")
st.stop() # Special handling complete
# Standard translation/analysis handling
if selected_lang in ['cop', 'cop-sa', 'cop-bo'] and analysis_type is not None:
# For translation, use raw text without prompt template
if analysis_type == 'translation':
full_prompt = prompt
else:
full_prompt = f"{COPTIC_PROMPTS[analysis_type]} {prompt}"
# Add lexicon context for lexicon lookup
if analysis_type == 'lexicon_lookup' and coptic_lexicon:
words_in_prompt = prompt.split()
lexicon_matches = []
for word in words_in_prompt:
if word in coptic_lexicon:
lexicon_matches.append(f"{word} = {coptic_lexicon[word]}")
if lexicon_matches:
full_prompt += f"\n\nLexicon entries found: {'; '.join(lexicon_matches)}"
else:
full_prompt = prompt
st.session_state.messages.append({"role": "user", "content": prompt})
with st.chat_message("user"):
st.markdown(prompt)
# Generate response using local Coptic translator or Apertus API
with st.chat_message("assistant"):
try:
# Check if this is a Coptic→English translation task
if selected_lang in ['cop', 'cop-sa', 'cop-bo'] and analysis_type == 'translation':
# Use local Coptic translator (Norelad/coptic-megalaa-finetuned)
if target_lang == 'en':
with st.spinner("🤖 Translating with local Coptic translator..."):
translation = translate_coptic_to_english(prompt, dialect=selected_lang)
st.markdown(translation)
st.session_state.messages.append({"role": "assistant", "content": translation})
else:
# Non-English target: requires Apertus
if inference_client and hf_token_input:
with st.spinner("🤖 Translating with Apertus-8B..."):
messages = [
{"role": "system", "content": "You are a professional Coptic-to-modern-language translator. Provide only direct translations without explanations, commentary, or repeating the source text."},
{"role": "user", "content": full_prompt}
]
response_stream = inference_client.chat_completion(
model=MODEL_NAME,
messages=messages,
max_tokens=512,
temperature=0.5,
top_p=0.9,
stream=True
)
response_placeholder = st.empty()
full_response = ""
for message in response_stream:
if message.choices[0].delta.content:
full_response += message.choices[0].delta.content
response_placeholder.markdown(full_response + "▌")
response_placeholder.markdown(full_response)
st.session_state.messages.append({"role": "assistant", "content": full_response})
else:
st.warning(f"⚠️ Translation to {target_language_name} requires Apertus-8B.")
st.info("💡 Enable Apertus-8B in the sidebar for multi-language support.")
st.info("💡 Local Coptic translator currently supports English↔Coptic only.")
# For non-translation tasks or other languages
else:
if inference_client and hf_token_input:
with st.spinner("🤖 Generating response..."):
messages = [{"role": "user", "content": full_prompt}]
response_stream = inference_client.chat_completion(
model=MODEL_NAME,
messages=messages,
max_tokens=512,
temperature=0.5,
top_p=0.9,
stream=True
)
response_placeholder = st.empty()
full_response = ""
for message in response_stream:
if message.choices[0].delta.content:
full_response += message.choices[0].delta.content
response_placeholder.markdown(full_response + "▌")
response_placeholder.markdown(full_response)
st.session_state.messages.append({"role": "assistant", "content": full_response})
else:
st.warning("⚠️ This feature requires Apertus-8B. Please enable it in the sidebar.")
st.info("💡 Coptic→English translation works without API token using local Coptic translator.")
except Exception as e:
st.error(f"❌ Error: {str(e)}")
st.info("💡 If using Apertus-8B, please verify your API token is valid.")