|
|
import streamlit as st |
|
|
import os |
|
|
import xml.etree.ElementTree as ET |
|
|
import re |
|
|
import sys |
|
|
|
|
|
|
|
|
try: |
|
|
import torch |
|
|
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM |
|
|
except ImportError as e: |
|
|
st.error(f""" |
|
|
### ❌ Transformers Import Error |
|
|
|
|
|
Failed to import required transformers components: {e} |
|
|
|
|
|
**Debug Info:** |
|
|
- Python version: {sys.version} |
|
|
- Torch available: {('torch' in sys.modules)} |
|
|
|
|
|
**This usually means:** |
|
|
1. The Docker container is still rebuilding (wait 2-5 minutes) |
|
|
2. Dependencies weren't installed correctly |
|
|
3. There's a version conflict in requirements.txt |
|
|
|
|
|
Please check the HuggingFace Space build logs or try rebuilding the Space. |
|
|
""") |
|
|
st.stop() |
|
|
|
|
|
from huggingface_hub import InferenceClient |
|
|
from coptic_parser_core import CopticParserCore |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Transliteration table: Coptic Unicode character -> transcription used as
# input for the Coptic translator models (see greekify()).
COPTIC_TO_GREEK = {
    # Lowercase letters that map onto Greek letters.
    "ⲁ": "α", "ⲃ": "β", "ⲅ": "γ", "ⲇ": "δ", "ⲉ": "ε",
    "ⲋ": "ϛ", "ⲍ": "ζ", "ⲏ": "η", "ⲑ": "θ", "ⲓ": "ι",
    "ⲕ": "κ", "ⲗ": "λ", "ⲙ": "μ", "ⲛ": "ν", "ⲝ": "ξ",
    "ⲟ": "ο", "ⲡ": "π", "ⲣ": "ρ", "ⲥ": "σ", "ⲧ": "τ",
    "ⲩ": "υ", "ⲫ": "φ", "ⲭ": "χ", "ⲯ": "ψ", "ⲱ": "ω",

    # Lowercase letters without a Greek counterpart: Latin/IPA stand-ins.
    # Note that "ϯ" expands to the two-character sequence "ti".
    "ϣ": "ʃ", "ϥ": "f", "ϧ": "x", "ϩ": "h", "ϫ": "ɟ",
    "ϭ": "c", "ϯ": "ti",

    # Uppercase counterparts.
    "Ⲁ": "Α", "Ⲃ": "Β", "Ⲅ": "Γ", "Ⲇ": "Δ", "Ⲉ": "Ε", "Ⲍ": "Ζ",
    "Ⲏ": "Η", "Ⲑ": "Θ", "Ⲓ": "Ι", "Ⲕ": "Κ", "Ⲗ": "Λ", "Ⲙ": "Μ",
    "Ⲛ": "Ν", "Ⲝ": "Ξ", "Ⲟ": "Ο", "Ⲡ": "Π", "Ⲣ": "Ρ", "Ⲥ": "Σ",
    "Ⲧ": "Τ", "Ⲩ": "Υ", "Ⲫ": "Φ", "Ⲭ": "Χ", "Ⲯ": "Ψ", "Ⲱ": "Ω",
    "Ϣ": "Ʃ", "Ϥ": "F", "Ϧ": "X", "Ϩ": "H", "Ϫ": "Ɉ", "Ϭ": "C",
    "Ϯ": "TI",
}
|
|
|
|
|
# Reverse transliteration table: transcription character(s) -> Coptic Unicode
# (see degreekify()). Both sigma forms ("σ" and final "ς") map to "ⲥ".
GREEK_TO_COPTIC = {
    # Lowercase Greek letters.
    "α": "ⲁ", "β": "ⲃ", "γ": "ⲅ", "δ": "ⲇ", "ε": "ⲉ",
    "ϛ": "ⲋ", "ζ": "ⲍ", "η": "ⲏ", "θ": "ⲑ", "ι": "ⲓ",
    "κ": "ⲕ", "λ": "ⲗ", "μ": "ⲙ", "ν": "ⲛ", "ξ": "ⲝ",
    "ο": "ⲟ", "π": "ⲡ", "ρ": "ⲣ", "σ": "ⲥ", "ς": "ⲥ",
    "τ": "ⲧ", "υ": "ⲩ", "φ": "ⲫ", "χ": "ⲭ", "ψ": "ⲯ",
    "ω": "ⲱ",

    # Lowercase Latin/IPA stand-ins; "ti" is a two-character key.
    "ʃ": "ϣ", "f": "ϥ", "x": "ϧ", "h": "ϩ", "ɟ": "ϫ",
    "c": "ϭ", "ti": "ϯ",

    # Uppercase counterparts.
    "Α": "Ⲁ", "Β": "Ⲃ", "Γ": "Ⲅ", "Δ": "Ⲇ", "Ε": "Ⲉ", "Ζ": "Ⲍ",
    "Η": "Ⲏ", "Θ": "Ⲑ", "Ι": "Ⲓ", "Κ": "Ⲕ", "Λ": "Ⲗ", "Μ": "Ⲙ",
    "Ν": "Ⲛ", "Ξ": "Ⲝ", "Ο": "Ⲟ", "Π": "Ⲡ", "Ρ": "Ⲣ", "Σ": "Ⲥ",
    "Τ": "Ⲧ", "Υ": "Ⲩ", "Φ": "Ⲫ", "Χ": "Ⲭ", "Ψ": "Ⲯ", "Ω": "Ⲱ",
    "Ʃ": "Ϣ", "F": "Ϥ", "X": "Ϧ", "H": "Ϩ", "Ɉ": "Ϫ", "C": "Ϭ",
    "TI": "Ϯ",
}
|
|
|
|
|
def greekify(coptic_text):
    """Transliterate Coptic Unicode text into the transcription the models expect.

    Each character is lowercased on its own and then looked up in
    ``COPTIC_TO_GREEK``; characters without an entry are passed through in
    their lowercased form. Because of the lowercasing, the result never
    contains the uppercase values of the table.

    Args:
        coptic_text: Text to transliterate (any iterable of characters).

    Returns:
        The transliterated string.
    """
    lowered = (symbol.lower() for symbol in coptic_text)
    return "".join(COPTIC_TO_GREEK.get(symbol, symbol) for symbol in lowered)
|
|
|
|
|
def degreekify(greek_text):
    """Convert a transcription string back to Coptic Unicode.

    The text is scanned left to right. The two-character sequence "ti"
    (matched case-insensitively) is converted as a unit to the single Coptic
    letter ϯ; every other character is looked up individually in
    ``GREEK_TO_COPTIC`` and passed through unchanged when it has no entry.

    The digraph is looked up with its original casing first, so "TI" yields
    the uppercase letter from the table ("Ϯ"). Mixed-case spellings such as
    "Ti", which have no table entry, fall back to the lowercase mapping.

    Args:
        greek_text: Transcribed text, e.g. the decoded output of the
            English -> Coptic model.

    Returns:
        The text with every mappable character replaced by its Coptic form.
    """
    converted = []
    pos = 0
    length = len(greek_text)
    while pos < length:
        pair = greek_text[pos:pos + 2]
        # At the last character `pair` has length 1 and can never equal "ti".
        if pair.lower() == 'ti':
            mapped = GREEK_TO_COPTIC.get(pair)
            if mapped is None:
                # No exact-case entry (e.g. "Ti"): use the lowercase digraph.
                mapped = GREEK_TO_COPTIC.get('ti', pair)
            converted.append(mapped)
            pos += 2
            continue

        char = greek_text[pos]
        converted.append(GREEK_TO_COPTIC.get(char, char))
        pos += 1
    return ''.join(converted)
|
|
|
|
|
|
|
|
# Reference chart shown in the UI: uppercase Coptic letter -> display name.
# The Zeta entry uses Ⲍ (U+2C8C), the same letter the transliteration tables
# pair with Greek "Ζ"; Ⲋ (U+2C8A) is a different letter and is not listed.
COPTIC_ALPHABET = {
    'Ⲁ': 'Alpha', 'Ⲃ': 'Beta', 'Ⲅ': 'Gamma', 'Ⲇ': 'Delta',
    'Ⲉ': 'Epsilon', 'Ⲍ': 'Zeta', 'Ⲏ': 'Eta', 'Ⲑ': 'Theta',
    'Ⲓ': 'Iota', 'Ⲕ': 'Kappa', 'Ⲗ': 'Lambda', 'Ⲙ': 'Mu',
    'Ⲛ': 'Nu', 'Ⲝ': 'Xi', 'Ⲟ': 'Omicron', 'Ⲡ': 'Pi',
    'Ⲣ': 'Rho', 'Ⲥ': 'Sigma', 'Ⲧ': 'Tau', 'Ⲩ': 'Upsilon',
    'Ⲫ': 'Phi', 'Ⲭ': 'Chi', 'Ⲯ': 'Psi', 'Ⲱ': 'Omega',
    'Ϣ': 'Shai', 'Ϥ': 'Fai', 'Ϧ': 'Khei', 'Ϩ': 'Hori',
    'Ϫ': 'Gangia', 'Ϭ': 'Shima', 'Ϯ': 'Ti',
}
|
|
|
|
|
|
|
|
def get_coptic_prompts(target_language):
    """Build the instruction prompts for each Coptic analysis type.

    Args:
        target_language: Human-readable name of the language the model
            should answer in (or translate into).

    Returns:
        A dict keyed by analysis type ('dialect_analysis', 'translation',
        'transcription', 'morphology', 'lexicon_lookup'). Each value is an
        instruction that the caller prefixes to the user's Coptic text.
    """
    respond_in = f" Respond in {target_language}:"

    translation_prompt = (
        f"You are a professional Coptic translator. "
        f"Translate the following Coptic text to {target_language}.\n\n"
        "IMPORTANT: Provide ONLY the direct translation. Do not include:\n"
        "- The original Coptic text\n"
        "- Explanations or commentary\n"
        "- Notes about context or meaning\n"
        f"- Any text other than the {target_language} translation\n\n"
        "Coptic text to translate:"
    )

    prompts = {}
    prompts['dialect_analysis'] = (
        "Analyze the Coptic dialect of this text and identify linguistic features."
        + respond_in
    )
    prompts['translation'] = translation_prompt
    prompts['transcription'] = (
        "Provide a romanized transcription of this Coptic text." + respond_in
    )
    prompts['morphology'] = (
        "Analyze the morphological structure of these Coptic words." + respond_in
    )
    prompts['lexicon_lookup'] = (
        "Look up these Coptic words and provide definitions with Greek etymologies."
        + respond_in
    )
    return prompts
|
|
|
|
|
|
|
|
def _read_tei_lexicon(file_path, lexicon, max_entries=100):
    """Add entries from a TEI XML lexicon file to ``lexicon`` in place.

    Only the first ``max_entries`` ``<entry>`` elements are read. For each
    entry the headword comes from the lemma ``<form>`` (or, failing that, the
    first ``<form>``) and the definition joins up to two ``<sense>/<def>``
    texts with "; ", truncated to 200 characters.
    """
    ns = {'tei': 'http://www.tei-c.org/ns/1.0'}
    root = ET.parse(file_path).getroot()

    for entry in root.findall('.//tei:entry', ns)[:max_entries]:
        # Compare against None explicitly: an Element with no children is
        # falsy, so `find(a) or find(b)` can silently discard a real match.
        candidate_forms = (
            entry.find('.//tei:form[@type="lemma"]', ns),
            entry.find('.//tei:form', ns),
        )
        coptic_word = ""
        for form in candidate_forms:
            if form is None:
                continue
            orth = form.find('.//tei:orth', ns)
            if orth is not None and orth.text:
                coptic_word = orth.text.strip()
                break

        definitions = []
        for sense in entry.findall('.//tei:sense', ns)[:2]:
            def_elem = sense.find('.//tei:def', ns)
            if def_elem is not None and def_elem.text:
                definitions.append(def_elem.text.strip())
        definition = "; ".join(definitions)

        if not (coptic_word and definition):
            continue

        # Strip anything that is not a Coptic/Greek character, word
        # character, whitespace, or hyphen from the headword.
        coptic_word = re.sub(
            r'[^\u2C80-\u2CFF\u03B0-\u03FF\u1F00-\u1FFF\w\s\-]', '', coptic_word
        ).strip()
        if coptic_word:
            lexicon[coptic_word] = definition[:200]


def _read_delimited_lexicon(file_path, lexicon):
    """Add ``word<sep>definition`` lines from a text file to ``lexicon`` in place.

    The separator is chosen per line as the first of TAB, '|', ',' and ';'
    that occurs in it; lines containing none of them are skipped.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue

            separator = next(
                (sep for sep in ('\t', '|', ',', ';') if sep in line), None
            )
            if separator is None:
                continue

            # The separator is known to be present, so this yields two parts.
            coptic_word, definition = line.split(separator, 1)
            lexicon[coptic_word.strip()] = definition.strip()


@st.cache_data
def load_coptic_lexicon(file_path=None):
    """Load a Coptic lexicon from a TEI XML file or a delimited text file.

    Files whose name ends in ``.xml`` (case-insensitive) are parsed as TEI
    XML; anything else is read as UTF-8 text with one ``word<sep>definition``
    pair per line.

    The result is cached by Streamlit keyed on ``file_path`` only, so callers
    that rewrite a file in place must use a different path for different
    content.

    Args:
        file_path: Path of the lexicon file. ``None``, an empty string, or a
            non-existent path yields an empty lexicon.

    Returns:
        dict mapping Coptic word -> definition. If reading fails part-way,
        the error is reported through ``st.error`` and the entries collected
        so far are returned.
    """
    if not file_path or not os.path.exists(file_path):
        return {}

    lexicon = {}
    try:
        if file_path.lower().endswith('.xml'):
            _read_tei_lexicon(file_path, lexicon)
        else:
            _read_delimited_lexicon(file_path, lexicon)
    except Exception as e:
        # Best-effort loader: surface the problem in the UI, keep what we have.
        st.error(f"Error loading lexicon: {str(e)}")

    return lexicon
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@st.cache_resource
def _load_seq2seq_model(model_name, label):
    """Load and cache a seq2seq translation model; raises on failure.

    Exceptions are deliberately NOT caught here: ``st.cache_resource`` stores
    whatever the function returns, so returning a ``(None, None, None)``
    failure marker from inside the cached function would make a transient
    download error permanent for the lifetime of the process.

    Args:
        model_name: Hugging Face model id.
        label: Direction label used in the progress/success messages.

    Returns:
        ``(tokenizer, model, device)`` with the model moved to ``device``
        ("cuda" when available, otherwise "cpu").
    """
    with st.spinner(f"📥 Loading {label} model (first time only, ~600MB)..."):
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = model.to(device)

    st.success(f"✅ {label} model loaded on {device.upper()}")
    return tokenizer, model, device


def load_coptic_to_english_model():
    """Load Coptic → English translation model (Norelad/coptic-megalaa-finetuned).

    Returns:
        ``(tokenizer, model, device)`` on success. On failure the error is
        shown with ``st.error`` and ``(None, None, None)`` is returned; the
        failure is not cached, so the next call retries the load.
    """
    try:
        return _load_seq2seq_model("Norelad/coptic-megalaa-finetuned", "Coptic→English")
    except Exception as e:
        st.error(f"Failed to load Coptic→English model: {e}")
        return None, None, None


def load_english_to_coptic_model():
    """Load English → Coptic translation model (megalaa/english-coptic-translator).

    Returns:
        ``(tokenizer, model, device)`` on success. On failure the error is
        shown with ``st.error`` and ``(None, None, None)`` is returned; the
        failure is not cached, so the next call retries the load.
    """
    try:
        return _load_seq2seq_model("megalaa/english-coptic-translator", "English→Coptic")
    except Exception as e:
        st.error(f"Failed to load English→Coptic model: {e}")
        return None, None, None
|
|
|
|
|
def translate_coptic_to_english(text, dialect='cop-sa'):
    """Translate Coptic text to English with the local translator model.

    The input is lowercased, transliterated with ``greekify`` and prefixed
    with a one-character dialect tag before being passed to the model.

    Args:
        text: Coptic text to translate.
        dialect: 'cop-sa' (Sahidic), 'cop-bo' (Bohairic) or 'cop'. Any other
            value is treated like Sahidic.

    Returns:
        The decoded translation, or a string starting with "Error:" /
        "Translation error:" when the model is unavailable or generation
        fails.
    """
    tokenizer, model, device = load_coptic_to_english_model()
    if tokenizer is None or model is None:
        return "Error: Model not loaded. Please check your internet connection."

    try:
        # Tag characters prepended to the model input to select the dialect.
        tag_by_dialect = {
            'cop-sa': 'з',
            'cop-bo': 'б',
            'cop': 'з',
        }
        tag = tag_by_dialect.get(dialect, 'з')

        model_input = f"{tag} {greekify(text.lower())}"

        encoded = tokenizer(model_input, return_tensors="pt", padding=True).to(device)
        generated = model.generate(
            **encoded,
            max_new_tokens=128,
            num_beams=5,
            early_stopping=True,
        )

        # Only the best beam (first sequence) is decoded.
        return tokenizer.decode(generated[0], skip_special_tokens=True)
    except Exception as e:
        return f"Translation error: {e}"
|
|
|
|
|
def translate_english_to_coptic(text):
    """Translate English text to Coptic with the local translator model.

    The model output is converted to Coptic Unicode with ``degreekify``.

    Args:
        text: English text to translate.

    Returns:
        The Coptic translation, or a string starting with "Error:" /
        "Translation error:" when the model is unavailable or generation
        fails.
    """
    tokenizer, model, device = load_english_to_coptic_model()
    if tokenizer is None or model is None:
        return "Error: Model not loaded. Please check your internet connection."

    try:
        encoded = tokenizer(text, return_tensors="pt", padding=True).to(device)
        generated = model.generate(
            **encoded,
            max_new_tokens=128,
            num_beams=5,
            early_stopping=True,
        )

        # Decode the best beam, then map the transcription back to Coptic.
        transcription = tokenizer.decode(generated[0], skip_special_tokens=True)
        return degreekify(transcription)
    except Exception as e:
        return f"Translation error: {e}"
|
|
|
|
|
|
|
|
# Language code -> label shown in the language selectors.
LANGUAGES = {
    'en': 'English',
    'es': 'Español',
    'fr': 'Français',
    'de': 'Deutsch',
    'zh': '中文',
    'ja': '日本語',
    'ar': 'العربية',
    'hi': 'हिन्दी',
    'cop': 'Coptic (ⲘⲉⲧⲢⲉⲙ̀ⲛⲭⲏⲙⲓ)',
    'cop-sa': 'Sahidic Coptic',
    'cop-bo': 'Bohairic Coptic',
}

st.set_page_config(page_title="Apertus Chat", layout="wide")

# Defaults for values that are otherwise only assigned when a Coptic
# language is selected further down the script.
analysis_type = None
target_lang = None
target_language_name = "English"

# Main language selector; the chosen code drives which tools are shown.
selected_lang = st.selectbox(
    "Language / Langue / Idioma",
    options=list(LANGUAGES),
    format_func=LANGUAGES.__getitem__,
)
|
|
|
|
|
|
|
|
# NOTE(review): the nesting of this section was reconstructed from the code's
# meaning, because the source carries no indentation. Everything up to the
# analysis selectors is assumed to live inside the sidebar — confirm.
import hashlib
import tempfile

with st.sidebar:
    st.header("Coptic Tools")

    # ---- Translation model selection --------------------------------------
    st.subheader("🤖 Translation Model")
    st.info("✨ **NEW:** Using specialized Coptic translator models (free, no API token needed!)")
    st.markdown("Models: `Norelad/coptic-megalaa-finetuned` & `megalaa/english-coptic-translator`")

    with st.expander("⚙️ Advanced: Use Apertus-8B (optional)"):
        st.caption("For multi-language translation beyond English-Coptic")
        hf_token_input = st.text_input(
            "HuggingFace API Token",
            type="password",
            help="Optional: For Apertus-8B multi-language support",
        )
        use_apertus = st.checkbox("Use Apertus-8B instead of local Coptic translator", value=False)
        if hf_token_input and use_apertus:
            st.success("✅ Apertus-8B enabled")
        elif not use_apertus:
            # Ignore a typed token unless Apertus is explicitly enabled.
            hf_token_input = None

    st.divider()

    # ---- Lexicon upload ---------------------------------------------------
    st.subheader("📚 Lexicon Upload")
    lexicon_file = st.file_uploader(
        "Upload Coptic Lexicon (optional)",
        type=['txt', 'tsv', 'csv', 'xml'],
        help="Supports: Text (TAB/pipe separated), XML (TEI format), CSV\nNote: Comprehensive lexicon is pre-loaded",
    )

    if lexicon_file:
        try:
            payload = lexicon_file.getvalue()
            if len(payload) > 20 * 1024 * 1024:
                st.error("❌ File too large (max 20MB)")
                coptic_lexicon = {}
            else:
                # Normalise the extension and never let the uploaded file
                # name contribute anything else to the path on disk.
                extension = os.path.splitext(lexicon_file.name)[1].lower().lstrip('.')
                if extension not in ('txt', 'tsv', 'csv', 'xml'):
                    extension = 'txt'

                # load_coptic_lexicon() is cached on the path alone, so the
                # path must change whenever the content does; otherwise a
                # second upload would return the first upload's entries.
                digest = hashlib.sha256(payload).hexdigest()[:16]
                temp_path = os.path.join(
                    tempfile.gettempdir(), f"coptic_lexicon_{digest}.{extension}"
                )
                try:
                    with open(temp_path, "wb") as f:
                        f.write(payload)
                    coptic_lexicon = load_coptic_lexicon(temp_path)
                finally:
                    # Remove the temporary copy even if loading raised.
                    if os.path.exists(temp_path):
                        os.remove(temp_path)

                if coptic_lexicon:
                    st.success(f"✅ Loaded {len(coptic_lexicon)} lexicon entries from {lexicon_file.name}")
                else:
                    st.warning("⚠️ File uploaded but no valid entries found")
                    coptic_lexicon = {}
        except Exception as e:
            st.error(f"❌ Error loading file: {str(e)}")
            st.info("💡 Supported formats: Plain text (TAB/pipe separated), XML (TEI), CSV")
            coptic_lexicon = {}
    else:
        # No upload: fall back to the lexicon file shipped next to the app.
        comprehensive_lexicon_path = "Comprehensive_Coptic_Lexicon-v1.2-2020.xml"
        if os.path.exists(comprehensive_lexicon_path):
            coptic_lexicon = load_coptic_lexicon(comprehensive_lexicon_path)
            if coptic_lexicon:
                st.info(f"📚 Loaded Comprehensive Coptic Lexicon: {len(coptic_lexicon)} entries")
            else:
                coptic_lexicon = {}
        else:
            coptic_lexicon = {}

    # ---- Alphabet reference -----------------------------------------------
    # Must be `with`, not `if`: the expander object is always truthy, so an
    # `if` leaves it empty and renders the letters outside of it.
    with st.expander("Coptic Alphabet"):
        for letter, name in COPTIC_ALPHABET.items():
            st.text(f"{letter} - {name}")

    # ---- Lexicon search with virtual keyboard -----------------------------
    if coptic_lexicon:
        st.subheader("Lexicon Search")

        if "search_term" not in st.session_state:
            st.session_state.search_term = ""

        st.write("**Virtual Keyboard:**")
        coptic_letters = ['ⲁ', 'ⲃ', 'ⲅ', 'ⲇ', 'ⲉ', 'ⲍ', 'ⲏ', 'ⲑ', 'ⲓ', 'ⲕ', 'ⲗ', 'ⲙ', 'ⲛ', 'ⲝ', 'ⲟ', 'ⲡ', 'ⲣ', 'ⲥ', 'ⲧ', 'ⲩ', 'ⲫ', 'ⲭ', 'ⲯ', 'ⲱ', 'ϣ', 'ϥ', 'ϧ', 'ϩ', 'ϫ', 'ϭ', 'ϯ']

        # Four rows of eight columns, created up front so the rows stack in
        # order; letter i goes to row i // 8, column i % 8.
        keyboard_rows = [st.columns(8) for _ in range(4)]
        for i, letter in enumerate(coptic_letters):
            if keyboard_rows[i // 8][i % 8].button(letter, key=f"key_{letter}"):
                st.session_state.search_term += letter
                st.rerun()

        col_space, col_back, col_clear = st.columns(3)
        with col_space:
            if st.button("Space"):
                st.session_state.search_term += " "
                st.rerun()
        with col_back:
            if st.button("⌫ Backspace"):
                st.session_state.search_term = st.session_state.search_term[:-1]
                st.rerun()
        with col_clear:
            if st.button("Clear"):
                st.session_state.search_term = ""
                st.rerun()

        search_term = st.text_input("Search Coptic word:", value=st.session_state.search_term)

        # Keep the session copy in sync when the user types directly.
        if search_term != st.session_state.search_term:
            st.session_state.search_term = search_term

        if search_term:
            if search_term in coptic_lexicon:
                st.write(f"**{search_term}**")
                st.write(coptic_lexicon[search_term])
            else:
                # No exact hit: show up to five headwords containing the term.
                matches = [k for k in coptic_lexicon if search_term in k]
                if matches:
                    st.write("Partial matches:")
                    for match in matches[:5]:
                        st.write(f"**{match}** → {coptic_lexicon[match][:100]}...")
                else:
                    st.write("No matches found")

    # ---- Example texts from the bundled test corpus -----------------------
    if selected_lang in ['cop', 'cop-sa', 'cop-bo']:
        st.divider()
        st.subheader("📖 Example Texts")

        try:
            import json
            from pathlib import Path

            corpus_path = Path(__file__).parent / "coptic_test_corpus.json"
            if corpus_path.exists():
                with open(corpus_path, 'r', encoding='utf-8') as f:
                    corpus = json.load(f)

                categories = {
                    "simple_sentences": "Simple Sentences",
                    "complex_sentences": "Complex Sentences",
                    "short_texts": "Short Texts (Paragraphs)",
                    "grammar_patterns": "Grammar Patterns",
                }

                selected_category = st.selectbox(
                    "Choose category:",
                    options=list(categories.keys()),
                    format_func=lambda x: categories[x],
                    key="corpus_category",
                )

                if selected_category in corpus['categories']:
                    category_data = corpus['categories'][selected_category]

                    if selected_category == 'grammar_patterns':
                        # Two-step choice: pattern first, then an example.
                        pattern_names = [p['pattern'] for p in category_data['patterns']]
                        selected_pattern = st.selectbox("Select pattern:", pattern_names, key="pattern_select")

                        pattern_data = next(
                            p for p in category_data['patterns'] if p['pattern'] == selected_pattern
                        )
                        st.caption(f"**Structure:** {pattern_data['structure']}")

                        example_texts = [
                            f"{ex['coptic']} → {ex['english']}" for ex in pattern_data['examples']
                        ]
                        selected_example_idx = st.selectbox(
                            "Select example:",
                            range(len(pattern_data['examples'])),
                            format_func=lambda i: example_texts[i],
                            key="pattern_example",
                        )
                        example = pattern_data['examples'][selected_example_idx]
                    else:
                        examples = category_data['examples']
                        example_labels = []
                        for ex in examples:
                            # Use the title when present, else the Coptic text
                            # (truncated to 30 characters).
                            coptic_preview = (
                                ex['coptic'][:30] + '...' if len(ex['coptic']) > 30 else ex['coptic']
                            )
                            example_labels.append(ex.get('title', coptic_preview))

                        selected_example_idx = st.selectbox(
                            "Select example:",
                            range(len(examples)),
                            format_func=lambda i: example_labels[i],
                            key="example_select",
                        )
                        example = examples[selected_example_idx]

                    with st.expander("📝 View Example", expanded=True):
                        st.markdown("**Coptic:**")
                        st.code(example['coptic'], language="")
                        st.markdown("**English:**")
                        st.write(example['english'])

                        if 'grammar_notes' in example:
                            st.caption(f"*Grammar:* {example['grammar_notes']}")
                        elif 'analysis' in example:
                            st.caption(f"*Analysis:* {example['analysis']}")

                        if 'source' in example:
                            st.caption(f"*Source:* {example['source']}")

                    # The chat section picks 'example_text' up on the rerun.
                    if st.button("📥 Load This Example", key="load_example", use_container_width=True):
                        st.session_state['example_text'] = example['coptic']
                        st.success("✓ Example loaded! Scroll down to chat input.")
                        st.rerun()
        except Exception as e:
            # Best-effort feature: a missing or malformed corpus only hides it.
            st.info("💡 Test corpus not available")

    # ---- Analysis type and target language --------------------------------
    if selected_lang in ['cop', 'cop-sa', 'cop-bo']:
        st.subheader("Analysis Type")
        analysis_type = st.selectbox(
            "Choose analysis:",
            options=['dependency_parse', 'translation', 'parse_and_translate', 'dialect_analysis', 'transcription', 'morphology', 'lexicon_lookup'],
            format_func=lambda x: x.replace('_', ' ').title(),
        )

        if analysis_type in ['translation', 'parse_and_translate']:
            st.subheader("Target Language")
            target_lang = st.selectbox(
                "Translate to:",
                options=[k for k in LANGUAGES.keys() if k not in ['cop', 'cop-sa', 'cop-bo']],
                format_func=lambda x: LANGUAGES[x],
                index=0,
            )
            target_language_name = LANGUAGES[target_lang]
        else:
            target_language_name = "English"

    # Prompts are only needed for the analysis types that call the LLM.
    if analysis_type not in ['dependency_parse', 'parse_and_translate']:
        COPTIC_PROMPTS = get_coptic_prompts(target_language_name)
|
|
|
|
|
|
|
|
|
|
|
# Model id passed to the HuggingFace Inference API chat-completion calls.
MODEL_NAME = "swiss-ai/Apertus-8B-Instruct-2509"


def get_inference_client(token=None):
    """Create a HuggingFace ``InferenceClient``.

    Args:
        token: API token to use. When falsy, the ``HF_TOKEN`` entry of
            ``st.secrets`` is used instead, if there is one.

    Returns:
        An ``InferenceClient``, or ``None`` when no token is available or
        the client could not be created (the error is shown with
        ``st.error``).
    """
    try:
        if token:
            return InferenceClient(token=token)

        # No explicit token: fall back to the app's secrets, when configured.
        has_secret = hasattr(st, 'secrets') and 'HF_TOKEN' in st.secrets
        if not has_secret:
            return None
        return InferenceClient(token=st.secrets['HF_TOKEN'])
    except Exception as e:
        st.error(f"Error initializing inference client: {e}")
        return None
|
|
|
|
|
|
|
|
@st.cache_resource
def get_parser():
    """Create the Coptic parser once and cache it.

    Returns:
        A ``CopticParserCore`` instance, or ``None`` if construction raised
        (the error is shown with ``st.error``).
    """
    try:
        return CopticParserCore()
    except Exception as e:
        st.error(f"Failed to initialize parser: {e}")
        return None
|
|
|
|
|
|
|
|
def _stream_chat_completion(client, chat_messages):
    """Stream a chat completion from MODEL_NAME into a placeholder.

    Renders the partial text with a trailing cursor while chunks arrive and
    the final text once the stream ends.

    Returns:
        The complete response text.
    """
    stream = client.chat_completion(
        model=MODEL_NAME,
        messages=chat_messages,
        max_tokens=512,
        temperature=0.5,
        top_p=0.9,
        stream=True,
    )

    placeholder = st.empty()
    text = ""
    for chunk in stream:
        # Skip chunks that carry no choices instead of raising IndexError.
        if not chunk.choices:
            continue
        piece = chunk.choices[0].delta.content
        if piece:
            text += piece
            placeholder.markdown(text + "▌")

    placeholder.markdown(text)
    return text


# System message shared by both Apertus translation paths.
_TRANSLATOR_SYSTEM_PROMPT = "You are a professional Coptic-to-modern-language translator. Provide only direct translations without explanations, commentary, or repeating the source text."

if "messages" not in st.session_state:
    st.session_state.messages = []

# Replay the conversation so far.
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# Always render the chat input, so it does not vanish on the run that
# consumes a loaded example; a loaded example takes precedence.
typed_prompt = st.chat_input("Type your message...")

prompt = None
if 'example_text' in st.session_state:
    prompt = st.session_state['example_text']
    del st.session_state['example_text']
if not prompt:
    prompt = typed_prompt

if prompt:
    is_coptic = selected_lang in ['cop', 'cop-sa', 'cop-bo']

    # ---- Dependency parse only --------------------------------------------
    if is_coptic and analysis_type == 'dependency_parse':
        st.session_state.messages.append({"role": "user", "content": prompt})

        with st.chat_message("user"):
            st.markdown(f"**Parse this text:** {prompt}")

        with st.chat_message("assistant"):
            with st.spinner("🔍 Parsing Coptic text..."):
                parser = get_parser()
                if parser:
                    try:
                        parse_result = parser.parse_text(prompt)

                        if parse_result:
                            st.success(f"✅ Parsed {parse_result['total_sentences']} sentence(s), {parse_result['total_tokens']} tokens")

                            table_output = parser.format_table(parse_result)
                            st.markdown(table_output)

                            # Optional grammar validation shipped with the parse.
                            if 'prolog_validation' in parse_result and parse_result['prolog_validation']:
                                validation = parse_result['prolog_validation']

                                st.divider()
                                st.subheader("🔍 Prolog Validation (Walter Till Grammar)")

                                if validation.get('patterns_detected'):
                                    st.success("**✓ Grammatical Patterns Detected:**")
                                    for pattern in validation['patterns_detected']:
                                        if isinstance(pattern, dict) and pattern.get('is_tripartite'):
                                            st.write(f"- **Tripartite Sentence**: {pattern.get('description', '')}")
                                            st.code(pattern.get('pattern', ''), language="")
                                        else:
                                            st.write(f"- {pattern}")

                                if validation.get('warnings'):
                                    st.warning("**⚠ Grammatical Warnings:**")
                                    for warning in validation['warnings']:
                                        st.write(f"- {warning}")

                                if not validation.get('warnings') and not validation.get('patterns_detected'):
                                    st.info("✓ No grammatical issues detected")

                            conllu_output = parser.format_conllu(parse_result)
                            st.download_button(
                                label="📥 Download CoNLL-U",
                                data=conllu_output,
                                file_name="coptic_parse.conllu",
                                mime="text/plain",
                            )

                            response = f"Parse complete. {parse_result['total_sentences']} sentences analyzed."
                            st.session_state.messages.append({"role": "assistant", "content": response})
                        else:
                            st.error("Failed to parse text. Please check the input.")
                    except Exception as e:
                        st.error(f"Parsing error: {e}")
                else:
                    st.error("Parser not available. Please check Stanza installation.")

        st.stop()

    # The remote client is only created when a token was supplied.
    inference_client = None
    if hf_token_input:
        inference_client = get_inference_client(hf_token_input)

    # ---- Parse, then translate --------------------------------------------
    if is_coptic and analysis_type == 'parse_and_translate':
        st.session_state.messages.append({"role": "user", "content": prompt})

        with st.chat_message("user"):
            st.markdown(f"**Parse and translate:** {prompt}")

        with st.chat_message("assistant"):
            st.subheader("📊 Dependency Analysis")
            with st.spinner("🔍 Parsing..."):
                parser = get_parser()
                if parser:
                    # Guarded like the parse-only path, so a parser failure
                    # does not abort the translation that follows.
                    try:
                        parse_result = parser.parse_text(prompt)
                        if parse_result:
                            table_output = parser.format_table(parse_result)
                            st.markdown(table_output)
                    except Exception as e:
                        st.error(f"Parsing error: {e}")
                else:
                    st.error("Parser not available. Please check Stanza installation.")

            st.divider()
            st.subheader(f"🌍 Translation to {LANGUAGES[target_lang]}")

            with st.spinner("🤖 Translating with local Coptic translator..."):
                try:
                    if target_lang == 'en':
                        translation = translate_coptic_to_english(prompt, dialect=selected_lang)
                        st.markdown(translation)

                        combined_response = f"Parse complete. Translation: {translation}"
                        st.session_state.messages.append({"role": "assistant", "content": combined_response})
                    elif inference_client and hf_token_input:
                        # Other targets go through Apertus.
                        COPTIC_PROMPTS_TRANSLATE = get_coptic_prompts(target_language_name)
                        translate_prompt = f"{COPTIC_PROMPTS_TRANSLATE['translation']} {prompt}"

                        full_response = _stream_chat_completion(
                            inference_client,
                            [
                                {"role": "system", "content": _TRANSLATOR_SYSTEM_PROMPT},
                                {"role": "user", "content": translate_prompt},
                            ],
                        )

                        combined_response = f"Parse complete. Translation: {full_response}"
                        st.session_state.messages.append({"role": "assistant", "content": combined_response})
                    else:
                        st.warning(f"⚠️ Translation to {target_language_name} requires Apertus-8B. Please enable it in the sidebar.")
                        st.info("💡 Local Coptic translator currently supports English↔Coptic only.")
                except Exception as e:
                    st.error(f"❌ Translation error: {e}")

        st.stop()

    # ---- Build the prompt for the remaining analysis types ----------------
    if is_coptic and analysis_type is not None:
        if analysis_type == 'translation':
            if target_lang == 'en':
                # The local translator takes the raw Coptic text.
                full_prompt = prompt
            else:
                # Apertus needs the instruction naming the target language;
                # the bare Coptic text alone does not say what to translate to.
                full_prompt = f"{get_coptic_prompts(target_language_name)['translation']} {prompt}"
        else:
            full_prompt = f"{COPTIC_PROMPTS[analysis_type]} {prompt}"

        # Append any exact lexicon hits for the words in the prompt.
        if analysis_type == 'lexicon_lookup' and coptic_lexicon:
            lexicon_matches = [
                f"{word} = {coptic_lexicon[word]}"
                for word in prompt.split()
                if word in coptic_lexicon
            ]
            if lexicon_matches:
                full_prompt += f"\n\nLexicon entries found: {'; '.join(lexicon_matches)}"
    else:
        full_prompt = prompt

    st.session_state.messages.append({"role": "user", "content": prompt})

    with st.chat_message("user"):
        st.markdown(prompt)

    with st.chat_message("assistant"):
        try:
            if is_coptic and analysis_type == 'translation':
                if target_lang == 'en':
                    with st.spinner("🤖 Translating with local Coptic translator..."):
                        translation = translate_coptic_to_english(prompt, dialect=selected_lang)
                        st.markdown(translation)
                        st.session_state.messages.append({"role": "assistant", "content": translation})
                elif inference_client and hf_token_input:
                    with st.spinner("🤖 Translating with Apertus-8B..."):
                        full_response = _stream_chat_completion(
                            inference_client,
                            [
                                {"role": "system", "content": _TRANSLATOR_SYSTEM_PROMPT},
                                {"role": "user", "content": full_prompt},
                            ],
                        )
                        st.session_state.messages.append({"role": "assistant", "content": full_response})
                else:
                    st.warning(f"⚠️ Translation to {target_language_name} requires Apertus-8B.")
                    st.info("💡 Enable Apertus-8B in the sidebar for multi-language support.")
                    st.info("💡 Local Coptic translator currently supports English↔Coptic only.")
            elif inference_client and hf_token_input:
                # Every other request is a single-turn Apertus call.
                with st.spinner("🤖 Generating response..."):
                    full_response = _stream_chat_completion(
                        inference_client,
                        [{"role": "user", "content": full_prompt}],
                    )
                    st.session_state.messages.append({"role": "assistant", "content": full_response})
            else:
                st.warning("⚠️ This feature requires Apertus-8B. Please enable it in the sidebar.")
                st.info("💡 Coptic→English translation works without API token using local Coptic translator.")
        except Exception as e:
            st.error(f"❌ Error: {str(e)}")
            st.info("💡 If using Apertus-8B, please verify your API token is valid.")
|
|
|