"""Streamlit app that transliterates orthographic text to IPA using Epitran."""

from collections import defaultdict
from operator import itemgetter
from pathlib import Path

import epitran
import langcodes
import streamlit as st
from langcodes import LanguageTagError

# TODO: reverse transliterate?

# Mappings excluded from the selection list.
# See https://github.com/dmort27/epitran/issues/98
_PROBLEM_MAPPINGS = frozenset(
    {
        "generic-Latn",
        "tur-Latn-bab",
        "ood-Latn-sax",
        "vie-Latn-so",
        "vie-Latn-ce",
        "vie-Latn-no",
        "kaz-Cyrl-bab",
    }
)

_DEFAULT_MAPPING = "swa-Latn"
_DEFAULT_EXAMPLE = (
    "Try typing some words in the language you chose, "
    "and they will be transliterated."
)
_EXAMPLES = {
    "cmn-Hans": "太初有道,道与神同在,道就是神。",
    "swa-Latn": (
        "Mwanzoni Kabla ulimwengu haujaumbwa alikuwepo Neno Huyo Neno "
        "alikuwa pamoja na Mungu, na Neno alikuwa Mungu."
    ),
}


def get_lang_description_from_mapping_name(string_to_check):
    """Build a Markdown description for an Epitran mapping name.

    Args:
        string_to_check: A mapping name such as ``"swa-Latn"`` or
            ``"amh-Ethi-red"``. Only the first two hyphen-separated parts
            are passed to ``langcodes``; later parts are treated as a
            variant suffix.

    Returns:
        A comma-separated ``"key: value"`` string built from
        ``lang.describe()``, with the language name linked to its
        ISO 639-3 page when an alpha-3 code is available, and
        ``" (reduced)"`` appended when the name ends in a ``red`` suffix.
        Returns ``None`` when the name is shorter than two characters,
        cannot be parsed as a language tag, or yields a falsy language
        object.
    """
    if string_to_check == "generic-Latn":
        return "Generic Latin Script"
    if len(string_to_check) < 2:
        return None

    parts = string_to_check.split("-")
    # First two parts are the ISO 639-3 language and ISO 15924 script.
    language_tag = "-".join(parts[:2])

    try:
        lang = langcodes.get(language_tag)
    except LanguageTagError:
        # A malformed tag should not take the whole page down.
        return None
    if not lang:
        return None

    items = []
    for key, value in lang.describe().items():
        if key == "language":
            try:
                iso_code = lang.to_alpha3()
            except LookupError:
                # No alpha-3 code known: show the plain name without a link.
                pass
            else:
                value = f"[{value}](https://iso639-3.sil.org/code/{iso_code})"
        items.append(f"{key}: {value}")
    description = ", ".join(items)

    # Check the suffix on the full, untruncated name. Testing the truncated
    # two-part list would compare against the script code and never match.
    if len(parts) > 2 and parts[-1] == "red":
        description += " (reduced)"
    return description


def get_valid_epitran_mappings_list():
    """Return the sorted mapping names offered in the language selector.

    Names are the stems of the files found in Epitran's ``data/map``
    directory, plus the special case ``"cmn-Hans"``, minus the entries in
    ``_PROBLEM_MAPPINGS``. Duplicates are removed and the result is sorted
    so the list does not depend on filesystem enumeration order.
    """
    map_dir = Path(epitran.__path__[0]) / "data" / "map"
    names = {map_file.stem for map_file in map_dir.glob("*.*")}
    names.add("cmn-Hans")  # special case
    return sorted(names - _PROBLEM_MAPPINGS)


def main():
    """Render the page: pick a mapping, read text, show its transliteration."""
    st.write("# Phonemize your text with [Epitran](https://github.com/dmort27/epitran)!")
    st.write(
        "Epitran is a library and tool for transliterating orthographic text "
        "as IPA (International Phonetic Alphabet), by Mortensen, David R. "
        "and Dalmia, Siddharth and Littell, Patrick."
    )

    valid_epitran_mappings = get_valid_epitran_mappings_list()
    try:
        default_index = valid_epitran_mappings.index(_DEFAULT_MAPPING)
    except ValueError:
        # The preferred default is not installed; fall back to the first entry.
        default_index = 0

    st.write(f"It supports converting many writing sytems to IPA symbols, including approximately {len(valid_epitran_mappings)} languages/scripts, listed below:")

    selected_mapping = st.selectbox(
        "Select input language/script:",
        valid_epitran_mappings,
        index=default_index,
    )
    description = get_lang_description_from_mapping_name(selected_mapping)
    st.write(f"Selected input language/script: {description}")

    if selected_mapping == "cmn-Hans":
        st.info("Chinese requires a special dictionary. Downloading now")
        epitran.download.cedict()

    st.info("attempting to instantiate epitran transliterator for your language/script")
    epi = epitran.Epitran(selected_mapping)

    input_text = st.text_area(
        label="Whatever you type here will be transliterated!",
        value=_EXAMPLES.get(selected_mapping, _DEFAULT_EXAMPLE),
    )

    st.info(f"transliterating `{input_text}`\n\tusing {selected_mapping}...")
    transliteration = epi.transliterate(input_text)
    output = {
        "original": input_text,
        "transliteration": transliteration,
    }
    st.write(output)


if __name__ == "__main__":
    main()