diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..4cb73be8f56a0054718160201a4341fe38a0f05d --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 Micah Geyman + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index 6ec6b1565d4782cce3662a701e9c1e9395ba313e..bbdb71a731c32e773345ae064f71a5994b96f3b5 100644 --- a/README.md +++ b/README.md @@ -1,13 +1 @@ ---- -title: Rhg Script Converter Ui -emoji: 👁 -colorFrom: blue -colorTo: blue -sdk: gradio -sdk_version: 4.8.0 -app_file: app.py -pinned: false -license: mit ---- - -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +# rhg-script-converter-ui \ No newline at end of file diff --git a/__pycache__/config.cpython-310.pyc b/__pycache__/config.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..421eed6915eb232a663ca61777162f8920f6103f Binary files /dev/null and b/__pycache__/config.cpython-310.pyc differ diff --git a/__pycache__/functions.cpython-310.pyc b/__pycache__/functions.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1c284c86f874e8b214e990f356d5c7ec40a94630 Binary files /dev/null and b/__pycache__/functions.cpython-310.pyc differ diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..5d3ed61dbbd5efc21928a97ebb7a2c0b50a1ffac --- /dev/null +++ b/app.py @@ -0,0 +1,37 @@ +import gradio as gr +from functions import convert_script +from config import scripts + +DEFAULT_INPUT_SCRIPT = list(scripts.keys())[0] +DEFAULT_OUTPUT_SCRIPT = list(scripts.keys())[1] + +def process_text(input_script, output_script, input_text, uploaded_file=None): + if uploaded_file is not None: + input_text = uploaded_file.decode("utf-8") + + output_text = convert_script(scripts[input_script], scripts[output_script], input_text) + + output_filename = "output.txt" + with open(output_filename, "w", encoding="utf-8") as file: + file.write(output_text) + + return output_text, output_filename + +with gr.Blocks(title="Rohingya Script Converter") as page: + gr.Markdown("## Rohingya Script Converter") + with gr.Row(): + input_script = gr.Dropdown(label="Choose the input script:", choices=list(scripts.keys()), value=DEFAULT_INPUT_SCRIPT) + output_script = gr.Dropdown(label="Choose the output script:", choices=list(scripts.keys()), value=DEFAULT_OUTPUT_SCRIPT) + with gr.Row(): + input_text = gr.Textbox(label="Input Text", placeholder="Enter text here or upload a file", 
lines=5) + output_text = gr.Textbox(label="Output Text", placeholder="Converted text will appear here", lines=5, interactive=False) + with gr.Row(): + input_file = gr.File(label="Upload Text File", file_count="single", type="binary") + download_link = gr.File(label="Download Converted File") + gr.Button("Convert").click( + process_text, + inputs=[input_script, output_script, input_text, input_file], + outputs=[output_text, download_link] + ) + +page.launch(share=True) \ No newline at end of file diff --git a/config.py b/config.py new file mode 100644 index 0000000000000000000000000000000000000000..6ee755b939e2a9ca9159ac186c5a7d4a62de8414 --- /dev/null +++ b/config.py @@ -0,0 +1,5 @@ +scripts = { + 'LearnRohingya':'rhg-lroh', + 'Rohingyalish':'rhg-roheng', + 'Rohingyalish (old)':'rhg-roheng-old' + } diff --git a/epitran/__init__.py b/epitran/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..40b4ebca9f0f4f7720200dbbad5bd794d81a9c6a --- /dev/null +++ b/epitran/__init__.py @@ -0,0 +1,2 @@ +from epitran._epitran import Epitran +from epitran.reromanize import ReRomanizer \ No newline at end of file diff --git a/epitran/__pycache__/__init__.cpython-310.pyc b/epitran/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9152fbd69d49dadc1a83129d4886e82676f1adac Binary files /dev/null and b/epitran/__pycache__/__init__.cpython-310.pyc differ diff --git a/epitran/__pycache__/__init__.cpython-311.pyc b/epitran/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..663e2eaf24501710d547ba49716cc9b579333b50 Binary files /dev/null and b/epitran/__pycache__/__init__.cpython-311.pyc differ diff --git a/epitran/__pycache__/_epitran.cpython-310.pyc b/epitran/__pycache__/_epitran.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a415260ce00e5207b46d66f18b835ac5ce227182 Binary files /dev/null and b/epitran/__pycache__/_epitran.cpython-310.pyc differ diff --git a/epitran/__pycache__/_epitran.cpython-311.pyc b/epitran/__pycache__/_epitran.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6b6f251cf7eddc321042a98b10607a416bc32a2f Binary files /dev/null and b/epitran/__pycache__/_epitran.cpython-311.pyc differ diff --git a/epitran/__pycache__/cedict.cpython-310.pyc b/epitran/__pycache__/cedict.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2d9e5258109286641ede55931f960c3279636d17 Binary files /dev/null and b/epitran/__pycache__/cedict.cpython-310.pyc differ diff --git a/epitran/__pycache__/download.cpython-310.pyc b/epitran/__pycache__/download.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eb80d590587f5d4798768ff23a9b525399f860dc Binary files /dev/null and b/epitran/__pycache__/download.cpython-310.pyc differ diff --git a/epitran/__pycache__/epihan.cpython-310.pyc b/epitran/__pycache__/epihan.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3f159ca9d1f4a9f61a1e3e88f5ebf9ba01e46bb0 Binary files /dev/null and b/epitran/__pycache__/epihan.cpython-310.pyc differ diff --git a/epitran/__pycache__/exceptions.cpython-310.pyc b/epitran/__pycache__/exceptions.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6357ed73f2d571652f3bc4b069f1c946ff438d48 Binary files /dev/null and b/epitran/__pycache__/exceptions.cpython-310.pyc differ diff --git 
a/epitran/__pycache__/flite.cpython-310.pyc b/epitran/__pycache__/flite.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..75d62c52b0102a80ae82902124d76b2334d407ff Binary files /dev/null and b/epitran/__pycache__/flite.cpython-310.pyc differ diff --git a/epitran/__pycache__/ligaturize.cpython-310.pyc b/epitran/__pycache__/ligaturize.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d043a3058944e2b94a5cb31a1d5f4bccfa9f7123 Binary files /dev/null and b/epitran/__pycache__/ligaturize.cpython-310.pyc differ diff --git a/epitran/__pycache__/ppprocessor.cpython-310.pyc b/epitran/__pycache__/ppprocessor.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7c0051137aa97b9843d0dc27de7ab8947c204196 Binary files /dev/null and b/epitran/__pycache__/ppprocessor.cpython-310.pyc differ diff --git a/epitran/__pycache__/puncnorm.cpython-310.pyc b/epitran/__pycache__/puncnorm.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7683945278a7c81c630e929c2f11ed0f981430a7 Binary files /dev/null and b/epitran/__pycache__/puncnorm.cpython-310.pyc differ diff --git a/epitran/__pycache__/reromanize.cpython-310.pyc b/epitran/__pycache__/reromanize.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d647cccc817f5c5334ebde8416f6663590919952 Binary files /dev/null and b/epitran/__pycache__/reromanize.cpython-310.pyc differ diff --git a/epitran/__pycache__/rules.cpython-310.pyc b/epitran/__pycache__/rules.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..473e962b909a87ec9122f98af06aaef6fcec0fd6 Binary files /dev/null and b/epitran/__pycache__/rules.cpython-310.pyc differ diff --git a/epitran/__pycache__/simple.cpython-310.pyc b/epitran/__pycache__/simple.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..22ae1f773a2e2bf62328c947898de9f2e6487b48 Binary files /dev/null and b/epitran/__pycache__/simple.cpython-310.pyc differ diff --git a/epitran/__pycache__/stripdiacritics.cpython-310.pyc b/epitran/__pycache__/stripdiacritics.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d38926292b44024bbfb2a400f43d6ebd3bc0094a Binary files /dev/null and b/epitran/__pycache__/stripdiacritics.cpython-310.pyc differ diff --git a/epitran/__pycache__/xsampa.cpython-310.pyc b/epitran/__pycache__/xsampa.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..57510d561b1c5ba7354a4b84334f201419882921 Binary files /dev/null and b/epitran/__pycache__/xsampa.cpython-310.pyc differ diff --git a/epitran/_epitran.py b/epitran/_epitran.py new file mode 100644 index 0000000000000000000000000000000000000000..d1d0c8467a4eb4dcfb44a5a98ab2533ccfdaa1ae --- /dev/null +++ b/epitran/_epitran.py @@ -0,0 +1,129 @@ +# -*- coding: utf-8 -*- +import logging +from typing import Union + +import panphon.featuretable +from epitran.epihan import Epihan, EpihanTraditional +from epitran.flite import FliteLexLookup +from epitran.puncnorm import PuncNorm +from epitran.simple import SimpleEpitran +from epitran.xsampa import XSampa + +logger = logging.getLogger('epitran') +logger.setLevel(logging.WARNING) + +class Epitran(object): + """Unified interface for IPA transliteration/transcription + + :param code str: ISO 639-3 plus "-" plus ISO 15924 code of the language/script pair that should be loaded + :param preproc bool: apply preprocessors + :param postproc bool: apply 
postprocessors + :param ligatures bool: use precomposed ligatures instead of standard IPA + :param cedict_file str: path to file containing the CC-CEDict dictionary + :param rev bool: use reverse transliteration + :param rev_preproc bool: if True, apply preprocessors when reverse transliterating + :param rev_postproc bool: if True, apply postprocessors when reverse transliterating + """ + special = {'eng-Latn': FliteLexLookup, + 'cmn-Hans': Epihan, + 'cmn-Hant': EpihanTraditional} + + def __init__(self, code: str, preproc: bool=True, postproc: bool=True, ligatures: bool=False, + cedict_file: Union[str, None]=None, rev: bool=False, + rev_preproc: bool=True, rev_postproc: bool=True, tones: bool=False): + """Constructor method""" + if code in self.special: + self.epi = self.special[code](ligatures=ligatures, cedict_file=cedict_file, tones=tones) + else: + self.epi = SimpleEpitran(code, preproc, postproc, ligatures, rev, rev_preproc, rev_postproc, tones=tones) + self.ft = panphon.featuretable.FeatureTable() + self.xsampa = XSampa() + self.puncnorm = PuncNorm() + + def transliterate(self, word: str, normpunc: bool=False, ligatures: bool=False) -> str: + """Transliterates/transcribes a word into IPA + + :param word str: word to transcribe + :param normpunc bool: if True, normalize punctuation + :param ligatures bool: if True, use precomposed ligatures instead of standard IPA + :return: An IPA string corresponding to the input orthographic string + :rtype: str + """ + return self.epi.transliterate(word, normpunc, ligatures) + + def reverse_transliterate(self, ipa: str) -> str: + """Reconstructs word from IPA. Does the reverse of transliterate() + + :param ipa str: An IPA representation of a word + :return: An orthographic representation of the word + :rtype: str + """ + return self.epi.reverse_transliterate(ipa) + + def strict_trans(self, word: str, normpunc: bool=False, ligatures: bool=False) -> str: + """Transliterate a word into IPA, ignoring all characters that cannot be recognized. + + :param word str: word to transcribe + :param normpunc bool, optional: if True, normalize punctuation + :param ligatures bool, optional: if True, use precomposed ligatures instead of standard IPA + :return: An IPA string corresponding to the input orthographic string, with all unconverted characters omitted + :rtype: str + """ + return self.epi.strict_trans(word, normpunc, ligatures) + + def trans_list(self, word: str, normpunc: bool=False, ligatures: bool=False) -> "list[str]": + """Transliterates/transcribes a word into list of IPA phonemes + + :param word str: word to transcribe + :param normpunc bool, optional: if True, normalize punctuation + :param ligatures bool, optional: if True, use precomposed ligatures instead of standard IPA + :return: list of IPA strings, each corresponding to a segment + :rtype: list[str] + """ + return self.ft.segs_safe(self.epi.transliterate(word, normpunc, ligatures)) + + def trans_delimiter(self, text: str, delimiter: str=str(' '), normpunc: bool=False, ligatures: bool=False): + """Return IPA transliteration with a delimiter between segments + + :param text str: An orthographic text + :param delimiter str, optional: A string to insert between segments + :param normpunc bool, optional: If True, normalize punctuation + :param ligatures bool, optional: If True, use precomposed ligatures instead of standard IPA + :return: String of IPA phonemes separated by `delimiter` + :rtype: str + """ + return delimiter.join(self.trans_list(text, normpunc=normpunc, + ligatures=ligatures)) + + def xsampa_list(self, word: str, normpunc: bool=False, ligaturize: bool=False): + """Transliterates/transcribes a word as X-SAMPA + + :param word str: An orthographic word + :param normpunc bool, optional: If True, normalize punctuation + :param ligaturize bool, optional: If True, use precomposed ligatures instead of standard IPA + :return: List of X-SAMPA strings corresponding to `word` + :rtype: list[str] + """ + ipa_segs = self.ft.ipa_segs(self.epi.strict_trans(word, normpunc, + ligaturize)) + return list(map(self.xsampa.ipa2xs, ipa_segs)) + + def word_to_tuples(self, word: str, normpunc: bool=False, _ligaturize: bool=False): + """Given a word, returns a list of tuples corresponding to IPA segments. The "feature + vectors" form a list consisting of (segment, vector) pairs. + For IPA segments, segment is a substring of phonetic_form such that the + concatenation of all segments in the list is equal to the phonetic_form. + The vectors are a sequence of integers drawn from the set {-1, 0, 1} + where -1 corresponds to '-', 0 corresponds to '0', and 1 corresponds to + '+'. + + :param word str: An orthographic word + :param normpunc bool, optional: If True, normalize punctuation + :param _ligaturize bool, optional: If True, use precomposed ligatures instead of standard IPA + :return: A list of tuples corresponding to IPA segments + :rtype: list[tuple[str, str, str, str, list[int]]] + """ + try: + return self.epi.word_to_tuples(word, normpunc) + except AttributeError as exc: + raise AttributeError('Method word_to_tuples not yet implemented for this language-script pair!') from exc diff --git a/epitran/backoff.py b/epitran/backoff.py new file mode 100644 index 0000000000000000000000000000000000000000..2554b207f19454ddd11e2792391798d4de3a05b7 --- /dev/null +++ b/epitran/backoff.py @@ -0,0 +1,89 @@ +# -*- coding: utf-8 -*- +from __future__ import (print_function, absolute_import, + unicode_literals) + +import regex as re +from . 
import _epitran +import panphon.featuretable +from epitran.puncnorm import PuncNorm +from epitran.xsampa import XSampa +from epitran.stripdiacritics import StripDiacritics + + +class Backoff(object): + """Implements rudimentary language ID and backoff.""" + + def __init__(self, lang_script_codes, cedict_file=None): + """Construct a Backoff object. + + Args: + lang_script_codes (list): codes for languages to try, starting + with the highest priority languages + cedict_file (str): path to the CC-CEdict dictionary file + (necessary only when cmn-Hans or cmn-Hant are used) + """ + self.langs = [_epitran.Epitran(c, cedict_file=cedict_file) + for c in lang_script_codes] + self.num_re = re.compile(r'\p{Number}+') + self.ft = panphon.featuretable.FeatureTable() + self.xsampa = XSampa() + self.puncnorm = PuncNorm() + self.dias = [StripDiacritics(c) for c in lang_script_codes] + + def transliterate(self, token): + """Return IPA transliteration given by first acceptable mode. + Args: + token (unicode): orthographic text + Returns: + unicode: transliteration as Unicode IPA string + """ + tr_list = [] + while token: + is_outside_lang = True + for dia, lang in zip(self.dias, self.langs): + source = '' + while True: + m = lang.epi.regexp.match(dia.process(token)) + if not m: + break + s = m.group() + token = token[len(s):] + source += s + is_outside_lang = False + tr_list.append(lang.transliterate(source)) + if is_outside_lang: + m = re.match(r'\p{Number}+', token) + if m: + source = m.group() + tr_list.append(source) + token = token[len(source):] + else: + tr_list.append(token[0]) + token = token[1:] + return ''.join(tr_list) + + def trans_list(self, token): + """Transliterate/transcribe a word into list of IPA phonemes. + + Args: + token (unicode): word to transcribe; unicode string + + Returns: + list: list of IPA unicode strings, each corresponding to a segment + """ + return self.ft.segs_safe(self.transliterate(token)) + + def xsampa_list(self, token): + """Transcribe a word into a list of X-SAMPA phonemes. 
+ + Args: + token (unicode): word to transcribe; unicode string + + Returns: + list: list of X-SAMPA strings, each corresponding to a segment + """ + if re.match(r'^\p{Number}+$', token): + return [] + else: + ipa_segs = self.ft.ipa_segs(self.transliterate(token)) + return list(map(self.xsampa.ipa2xs, ipa_segs)) diff --git a/epitran/bin/connl2engipaspace.py b/epitran/bin/connl2engipaspace.py new file mode 100644 index 0000000000000000000000000000000000000000..0908afc5c8440c2b163bcf4454157a9040f83448 --- /dev/null +++ b/epitran/bin/connl2engipaspace.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python + +import argparse +import codecs +import logging +from collections import Counter + +import unicodecsv as csv + +import epitran +import epitran.flite +import panphon + +logger = logging.getLogger('epitran') + + +def normpunc(flite, s): + def norm(c): + if c in flite.puncnorm: + return flite.puncnorm[c] + else: + return c + return ''.join(map(norm, s)) + + +def add_record(flite, ft, orth): + space = Counter() + orth = normpunc(flite, orth) + trans = flite.transliterate(orth) + while trans: + pref = ft.longest_one_seg_prefix(trans) + if pref != '': + space[pref] += 1 + trans = trans[len(pref):] + else: + if trans[0] in flite.puncnorm_vals: + space[trans[0]] += 1 + else: + space[trans[0]] += 1 + trans = trans[1:] + return space + + +def add_file(flite, ft, fn): + space = Counter() + with codecs.open(fn, 'r', 'utf-8') as f: + for line in f: + fields = line.split(u'\t') + if len(fields) > 0: + orth = fields[0] + space.update(add_record(flite, ft, orth)) + logger.debug(u'Length of counter:\t{}'.format(len(space))) + return space + + +def print_space(output, space): + pairs = enumerate(sorted(filter(lambda x: x, space.keys()))) + with open(output, 'wb') as f: + writer = csv.writer(f, encoding='utf-8') + for i, char in pairs: + writer.writerow((i, char)) + + +def main(infiles, output): + flite = epitran.flite.Flite() + ft = panphon.FeatureTable() + space = Counter() + for fn in infiles: + logger.debug(u'Scanning:\t{}'.format(fn).encode('utf-8')) + space.update(add_file(flite, ft, fn)) + print_space(output, space) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('-o', '--output', help='Output file.') + parser.add_argument('infiles', nargs='+', help='CONLL files serving as basis for segment space.') + args = parser.parse_args() + main(args.infiles, args.output) diff --git a/epitran/bin/connl2ipaspace.py b/epitran/bin/connl2ipaspace.py new file mode 100644 index 0000000000000000000000000000000000000000..b050db59366dbe34d41e4ba708771407386e0893 --- /dev/null +++ b/epitran/bin/connl2ipaspace.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python + +import argparse +import codecs +import logging +from collections import Counter + +import epitran +import panphon +import unicodecsv as csv + +logger = logging.getLogger('epitran') + + +def normpunc(epi, s): + def norm(c): + if c in epi.puncnorm: + return epi.puncnorm[c] + else: + return c + return ''.join(map(norm, s)) + + +def add_record_gen(epi, ft, orth): + space = Counter() + orth = normpunc(epi, orth) + trans = epi.transliterate(orth) + while trans: + pref = ft.longest_one_seg_prefix(trans) + if pref != '': + space[pref] += 1 + trans = trans[len(pref):] + else: + space[trans[0]] += 1 + trans = trans[1:] + return space + + +def add_file_gen(epi, ft, fn): + space = Counter() + with codecs.open(fn, 'r', 'utf-8') as f: + for line in f: + fields = line.split(u'\t') + if len(fields) > 0: + orth = fields[0] + 
space.update(add_record_gen(epi, ft, orth)) + logger.debug(u'Length of counter:\t{}'.format(len(space))) + return space + + +def add_file_op(epi, ft, fn): + space = Counter() + with codecs.open(fn, 'r', 'utf-8') as f: + for line in f: + fields = line.split(u'\t') + if len(fields) > 0: + orth = fields[0] + trans = epi.transliterate(orth) + while trans: + pref = ft.longest_one_seg_prefix(trans) + if pref != '': + space[pref] += 1 + trans = trans[len(pref):] + else: + if trans[0] in epi.puncnorm: + space[epi.puncnorm[trans[0]]] += 1 + else: + space[trans[0]] += 1 + trans = trans[1:] + logger.debug(u'Length of counter:\t{}'.format(len(space))) + return space + + +def print_space(output, space): + pairs = enumerate(sorted(filter(lambda x: x, space.keys()))) + with open(output, 'wb') as f: + writer = csv.writer(f, encoding='utf-8') + for i, char in pairs: + writer.writerow((i, char)) + + +def main(code, op, infiles, output): + epi = epitran.Epitran(code) + ft = panphon.FeatureTable() + space = Counter() + for fn in infiles: + logger.debug(u'Scanning:\t{}'.format(fn).encode('utf-8')) + add_file = add_file_op if op else add_file_gen + space.update(add_file(epi, ft, fn)) + print_space(output, space) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('-p', '--op', action='store_true', help='Script uses punctuation as (parts of) letters.') + parser.add_argument('-c', '--code', help='Script code for CONNL files.') + parser.add_argument('-o', '--output', help='Output file.') + parser.add_argument('infiles', nargs='+', help='CONLL files serving as basis for segment space.') + args = parser.parse_args() + main(args.code, args.op, args.infiles, args.output) diff --git a/epitran/bin/decompose.py b/epitran/bin/decompose.py new file mode 100644 index 0000000000000000000000000000000000000000..70a0ef6978d2d9abccf94e97437fb683f9fcee59 --- /dev/null +++ b/epitran/bin/decompose.py @@ -0,0 +1,13 @@ +#!/usr/bin/env python3 + +import unicodedata +import sys + + +def main(fn): + with open(fn, encoding='utf-8') as f: + print(unicodedata.normalize('NFD', f.read())) + + +if __name__ == '__main__': + main(sys.argv[1]) diff --git a/epitran/bin/detectcaps.py b/epitran/bin/detectcaps.py new file mode 100644 index 0000000000000000000000000000000000000000..474538599cd16e65889381e0c60ca98221135108 --- /dev/null +++ b/epitran/bin/detectcaps.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +from __future__ import print_function + +import unicodedata +import fileinput + + +def main(): + for line in fileinput.input(): + line = line.decode('utf-8') + token = line.strip() + if len(token) > 1 and unicodedata.category(token[1]) == 'Lu': + is_cap = 0 + elif len(token) > 0 and unicodedata.category(token[0]) == 'Lu': + is_cap = 1 + else: + is_cap = 0 + line = u'{}\t{}'.format(is_cap, token) + line = line.encode('utf-8') + print(line) + + +if __name__ == '__main__': + main() diff --git a/epitran/bin/epitranscribe.py b/epitran/bin/epitranscribe.py new file mode 100644 index 0000000000000000000000000000000000000000..87d00283ca6033032392dba9e41cf0dd05640c59 --- /dev/null +++ b/epitran/bin/epitranscribe.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import sys +import unicodedata +import epitran +import argparse + + +def main(code): + epi = epitran.Epitran(code) + for line in sys.stdin: + line = line.decode('utf-8') + line = unicodedata.normalize('NFD', line.lower()) + line = epi.transliterate(line) + line = line.encode('utf-8') + 
sys.stdout.write(line) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description=u'Converts text from STDIN (in the language specified),' + + ' into Unicode IPA and emits it to STDOUT.') + parser.add_argument('code', help=u'ISO 639-3 code for conversion language') + args = parser.parse_args() + main(args.code) diff --git a/epitran/bin/isbijective.py b/epitran/bin/isbijective.py new file mode 100644 index 0000000000000000000000000000000000000000..4de7ceb1f1e3e333f2d8134f3607d6d27e910ee8 --- /dev/null +++ b/epitran/bin/isbijective.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python +from __future__ import print_function + +import glob + +import unicodecsv as csv + + +def read_map(fn): + with open(fn, 'rb') as f: + reader = csv.reader(f, encoding='utf-8') + next(reader) + return [(a, b) for [a, b] in reader] + + +def is_bijection(mapping): + a, b = zip(*mapping) + distinct_a, distinct_b = set(a), set(b) + return len(distinct_a) == len(mapping) and len(distinct_b) == len(mapping) + + +def main(map_fns): + for fn in map_fns: + mapping = read_map(fn) + is_b = is_bijection(mapping) + print('{}\t{}'.format(fn, is_b)) + + +if __name__ == '__main__': + map_fns = glob.glob('../data/*.csv') + main(map_fns) diff --git a/epitran/bin/ltf2ipaspace.py b/epitran/bin/ltf2ipaspace.py new file mode 100644 index 0000000000000000000000000000000000000000..51b596767a319da251c5dc195f6a9ca912e16aeb --- /dev/null +++ b/epitran/bin/ltf2ipaspace.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +from __future__ import print_function + +import argparse +import glob +import os.path + +from lxml import etree +import unicodecsv as csv + +import epitran +import panphon.featuretable + + +def read_tokens(fn): + tree = etree.parse(fn) + root = tree.getroot() + return [tok.text for tok in root.findall('.//TOKEN')] + + +def read_input(input_, langscript): + space = set() + epi = epitran.Epitran(langscript) + ft = panphon.featuretable.FeatureTable() + for dirname in input_[0]: + for fn in glob.glob(os.path.join(dirname, '*.ltf.xml')): + for token in read_tokens(fn): + ipa = epi.transliterate(token) + for seg in ft.segs_safe(ipa): + space.add(seg) + return space + + +def write_output(output, space): + with open(output, 'wb') as f: + writer = csv.writer(f, encoding='utf-8') + for n, ch in enumerate(sorted(list(space))): + writer.writerow((n, ch)) + + +def main(langscript, input_, output): + space = read_input(input_, langscript) + write_output(output, space) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('-c', '--code', help='language-script code') + parser.add_argument('-i', '--input', nargs='+', action='append', help='Directories where input LTF files are found') + parser.add_argument('-o', '--output', help='Output file') + args = parser.parse_args() + main(args.code, args.input, args.output) diff --git a/epitran/bin/migraterules.py b/epitran/bin/migraterules.py new file mode 100644 index 0000000000000000000000000000000000000000..5b3be64903afc8a3f56cbc824d00c55b708fe3e6 --- /dev/null +++ b/epitran/bin/migraterules.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from __future__ import (print_function, unicode_literals, absolute_import) + +import glob +import re +import io + +import unicodecsv + + +def build_rule(fields): + try: + a, b, X, Y = fields + b = "0" if not b else b + a = "0" if not a else a + return '{} -> {} / {} _ {}'.format(a, b, X, Y) + except ValueError: + print('Malformed rule: {}'.format(','.join(fields))) + + +def main(): + for csv in glob.glob('*.csv'): + txt = re.match('[A-Za-z-]+', csv).group(0) + '.txt' + with open(csv, 'rb') as f, io.open(txt, 'w', encoding='utf-8') as g: + reader = unicodecsv.reader(f, encoding='utf-8') + next(reader) + for fields in reader: + if re.match(r'\s*%', fields[0]): + print(','.join([x for x in fields if x]), file=g) + else: + rule = build_rule(fields) + rule = re.sub('[ ]+', ' ', rule) + rule = re.sub('[ ]$', '', rule) + print(rule, file=g) + + +if __name__ == '__main__': + main() diff --git a/epitran/bin/reromanize.py b/epitran/bin/reromanize.py new file mode 100644 index 0000000000000000000000000000000000000000..8303432f6b2ded127d5cf282b25208ee156caaa3 --- /dev/null +++ b/epitran/bin/reromanize.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python2 + +from __future__ import print_function + +import epitran.reromanize +import argparse +import sys + +def main(code, table): + rr = epitran.reromanize.ReRomanizer(code, table) + for line in sys.stdin: + line = line.decode('utf-8') + tokens = line.strip().split('\t') + tokens = [rr.reromanize(x) for x in tokens] + print('\t'.join(tokens).encode('utf-8')) + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('-c', '--code', default='ori-Orya', type=str, help='Language and script code') + parser.add_argument('-t', '--table', default='anglocentric', type=str, help='Romanization table') + args = parser.parse_args() + main(args.code, args.table) diff --git a/epitran/bin/space2punc.py b/epitran/bin/space2punc.py new file mode 100644 index 0000000000000000000000000000000000000000..98881342d168a4c0d3eaf58a8f2e9d4fb493fa57 --- /dev/null +++ b/epitran/bin/space2punc.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python + +import sys +import unicodedata +import unicodecsv as csv + + +def main(fns, fnn): + punc = set() + for fn in fns: + print(fn) + with open(fn, 'rb') as f: + reader = csv.reader(f, encoding='utf-8') + for _, s in reader: + if len(s) == 1 and unicodedata.category(s)[0] == u'P': + punc.add(s) + with open(fnn, 'wb') as f: + writer = csv.writer(f, encoding='utf-8') + for mark in sorted(list(punc)): + writer.writerow([mark]) + + +if __name__ == '__main__': + main(sys.argv[1:-1], sys.argv[-1]) diff --git a/epitran/bin/testvectorgen.py b/epitran/bin/testvectorgen.py new file mode 100644 index 0000000000000000000000000000000000000000..ac85d88037be833310fab9c46cca5d7a962d34b0 --- /dev/null +++ b/epitran/bin/testvectorgen.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python + +from __future__ import print_function + +import argparse +import codecs + +import epitran.vector + + +def main(code, space, infile): + vec = epitran.vector.VectorsWithIPASpace(code, space) + with codecs.open(infile, 'r', 'utf-8') as f: + for line in f: + fields = line.split('\t') + if len(fields) > 1: + word = fields[0] + print(u"WORD: {}".format(word).encode('utf-8')) + segs = vec.word_to_segs(word) + for record in segs: + cat, case, orth, phon, id_, vector = record + print(u"Category: {}".format(cat).encode('utf-8')) + print(u"Case: {}".format(case).encode('utf-8')) + print(u"Orthographic: {}".format(orth).encode('utf-8')) + print(u"Phonetic: {}".format(phon).encode('utf-8')) + print(u"Vector: {}".format(vector).encode('utf-8')) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('-c', '--code', required=True, help='Script code.') + parser.add_argument('-s', '--space', required=True, help='Space.') + parser.add_argument('-i', '--infile', required=True, help='Input file.') + args = parser.parse_args() + 
main(args.code, args.space, args.infile) diff --git a/epitran/bin/transltf.py b/epitran/bin/transltf.py new file mode 100644 index 0000000000000000000000000000000000000000..d172fb51c607ad0def3a8ca895322bef59bceae1 --- /dev/null +++ b/epitran/bin/transltf.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python +from __future__ import print_function + +import sys + +from lxml import etree +import epitran +import epitran.vector + +def main(fn): + epi = epitran.Epitran('uig-Arab') + vwis = epitran.vector.VectorsWithIPASpace('uig-Arab', ['uig-Arab']) + tree = etree.parse(fn) + root = tree.getroot() + for token in root.findall('.//TOKEN'): + # print(token.text.encode('utf-8')) + print(epi.transliterate(unicode(token.text)).encode('utf-8')) + +if __name__ == '__main__': + main(sys.argv[1]) diff --git a/epitran/bin/uigtransliterate.py b/epitran/bin/uigtransliterate.py new file mode 100644 index 0000000000000000000000000000000000000000..90d207ac9a18bd04408902bdcc9c4b40af02b835 --- /dev/null +++ b/epitran/bin/uigtransliterate.py @@ -0,0 +1,10 @@ +#!/usr/bin/env python +from __future__ import print_function + +import fileinput +import epitran + +epi = epitran.Epitran('uig-Arab') +for line in fileinput.input(): + s = epi.transliterate(line.strip().decode('utf-8')) + print(s.encode('utf-8')) diff --git a/epitran/bin/vie-tones.py b/epitran/bin/vie-tones.py new file mode 100644 index 0000000000000000000000000000000000000000..27e55ffaef8d399335a0eef4fff10df8b1b1e17b --- /dev/null +++ b/epitran/bin/vie-tones.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 + +import csv +import re +import sys
 +import os.path +import unicodedata + + +tones = { + '\u0301': '˧˥', # combining acute = sac + '\u0300': '˨˩', # combining grave = huyen + '\u0303': '˧˥', # tilde = nga + '\u0309': '˧˩˧', # hook above = hoi + '\u0323': '˧˩', # dot below = nang +} + + +def shuffle_tone(orth, phon): + orth = unicodedata.normalize('NFD', orth) + if re.search('[aeiouơư]', orth): + for tone in tones: + if tone in orth: + phon += tones[tone] + if not re.search('[˩˨˧˦˥]', phon): + phon += '˧' + return phon + + +def main(): + fnin = sys.argv[1] + fnout = os.path.basename(fnin) + with open(fnin) as fin, open(fnout, 'w') as fout: + writer = csv.writer(fout) + reader = csv.reader(fin) + header = next(reader) + writer.writerow(header) + for orth, phon in reader: + phon = shuffle_tone(orth, phon) + writer.writerow([orth, phon]) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/epitran/cedict.py b/epitran/cedict.py new file mode 100644 index 0000000000000000000000000000000000000000..63550d13e97e6b674e7cdaa0afc2ad923c36bc66 --- /dev/null +++ b/epitran/cedict.py @@ -0,0 +1,76 @@ +# -*- coding: utf-8 -*- +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import codecs + +import marisa_trie +import regex as re + +ASCII_CHARS = ''.join([chr(i) for i in range(128)]) + + +class CEDictTrie(object): + def __init__(self, cedict_file, traditional=False): + """Construct a trie over CC-CEDict + + Args: + cedict_file (str): path to the CC-CEDict dictionary + traditional (bool): if True, use traditional characters + """ + self.hanzi = self._read_cedict(cedict_file, traditional=traditional) + self.trie = self._construct_trie(self.hanzi) + + def _read_cedict(self, cedict_file, traditional=False): + comment_re = re.compile(r'\s*#') + lemma_re = re.compile(r'(?P<hanzi>[^]]+) \[(?P<pinyin>[^]]+)\] /(?P<english>.+)/') + cedict = {} + with codecs.open(cedict_file, 'r', 'utf-8') as f: + for line in f: + if comment_re.match(line): + pass + elif 
lemma_re.match(line): + match = lemma_re.match(line) + hanzi = match.group('hanzi').split(' ') + pinyin = match.group('pinyin').split(' ') + english = match.group('english').split('/') + if traditional: + cedict[hanzi[0]] = (pinyin, english) # traditional characters only + else: + cedict[hanzi[1]] = (pinyin, english) # simplified characters only. + return cedict + + + def _construct_trie(self, hanzi): + pairs = [] + for hz, df in self.hanzi.items(): + py, en = df + py = str(''.join(filter(lambda x: x in ASCII_CHARS, ' '.join(py)))) + pairs.append((hz, (py.encode('utf-8'),))) + trie = marisa_trie.RecordTrie(str('@s'), pairs) + return trie + + def has_key(self, key): + return key in self.hanzi + + def prefixes(self, s): + return self.trie.prefixes(s) + + def longest_prefix(self, s): + prefixes = self.prefixes(s) + if not prefixes: + return '' + else: + return sorted(prefixes, key=len)[-1] # Sort by length and return last. + + def tokenize(self, s): + tokens = [] + while s: + token = self.longest_prefix(s) + if token: + tokens.append(token) + s = s[len(token):] + else: + tokens.append(s[0]) + s = s[1:] + return tokens diff --git a/epitran/data/arpabet.csv b/epitran/data/arpabet.csv new file mode 100644 index 0000000000000000000000000000000000000000..042d4d797eb44defb721898440893de8b9500967 --- /dev/null +++ b/epitran/data/arpabet.csv @@ -0,0 +1,46 @@ +pau, +null, +ey,ej +ae,æ +iy,i +eh,ɛ +ay,aj +ih,ɪ +ow,ow +aa,ɑ +ao,ɔ +aw,aw +oy,oj +ah,ʌ +ax,ə +uw,u +uh,ʊ +er,ɹ̩ +b,b +ch,t͡ʃ +d,d +dx,ɾ +f,f +g,ɡ +hh,h +jh,d͡ʒ +k,k +l,l +em,m̩ +m,m +en,n̩ +n,n +ng,ŋ +p,p +q,ʔ +r,ɹ +s,s +sh,ʃ +t,t +dh,ð +th,θ +v,v +w,w +y,j +z,z +zh,ʒ diff --git a/epitran/data/ipa-xsampa.csv b/epitran/data/ipa-xsampa.csv new file mode 100644 index 0000000000000000000000000000000000000000..a48ff4b3c06f4230a95afdc149d29919e3e9d0f9 --- /dev/null +++ b/epitran/data/ipa-xsampa.csv @@ -0,0 +1,175 @@ +IPA,X-SAMPA,Name +p,p,vl bilabial plosive +b,b,vd bilabial plosive +t,t,vl alveolar plosive +d,d,vd alveolar plosive +ʈ,t`,vl retroflex plosive +ɖ,d`,vd retroflex plosive +c,c,vl palatal plosive +ɟ,J\,vd palatal plosive +k,k,ld velar plosive +ɡ,g,vd velar plosive +q,q,vl uvular plosive +ɢ,G\,vd uvular plosive +ʔ,?,glottal plosive +m,m,bilabial nasal +ɱ,F,vl labiodental nasal +n,n,alveolar nasal +ɳ,n`,vl retroflex nasal +ɲ,J,vl palatal nasal +ŋ,N,vl velar nasal +ɴ,N\,vl uvular nasal +ʙ,B\,vd bilabial trill +r,r,vd alveolar trill +ʀ,R\,vl uvular trill +ɾ,4,vl alveolar tap +ɽ,r`,vl retroflex flap +ɸ,p\,vl bilabial fricative +β,B,vd bilabial fricative +f,f,vl labiodental fricative +v,v,vd labiodental fricative +θ,T,vl dental fricative +ð,D,vd dental fricative +s,s,vl alveolar fricative +z,z,vd alveolar fricative +ʃ,S,vl postalveolar fricative +ʒ,Z,vd postalveolar fricative +ʂ,s`,vl retroflex fricative +ʐ,z`,vd retroflex fricative +ç,C,vl palatal fricative +ʝ,j\,vd palatal fricative +x,x,vl velar fricative +ɣ,G,vd velar fricative +χ,X,vl uvular fricative +ʁ,R,vd uvular fricative +ħ,X\,vl pharyngeal fricative +ʕ,?\,vd pharyngeal fricative +h,h,vl glottal fricative +ʔ,?,glottal plosive +ɬ,K,vl alveolar lateral fricative +ɮ,K\,vd alveolar lateral fricative +ʋ,P,vd labiodental approximant +ɹ,r\,vd (post)alveolar approximant +ɻ,r\`,vd retroflex approximant +j,j,vd palatal approximant +ɰ,M\,vd velar approximant +l,l,vd alveolar lateral approximant +ɭ,l`,vd retroflex lateral approximant +ʎ,L,vd palatal lateral approximant +ʟ,L\,vd velar lateral approximant +pʼ,p_>,ejective +tʼ,t_>,ejective +ʈʼ,t`_>,ejective +cʼ,c_>,ejective +kʼ,k_>,ejective 
+qʼ,q_>,ejective +ɓ,b_<,vl bilabial implosive +ɗ,d_<,vl alveolar implosive +ƙ,k_<,vl velar implosive +ɠ,g_<,vl velar implosive +i,i,close front unrounded +y,y,close front rounded +ɨ,1,close central unrounded +ʉ,},close central rounded +ɯ,M,close back unrounded +u,u,close back rounded +ɪ,I,lax close front unrounded +ʏ,Y,lax close front rounded +ʊ,U,lax close back rounded +e,e,close-mid front unrounded +ø,2,front close-mid rounded +ɤ,7,close-mid back unrounded +o,o,close-mid back rounded +ə,@,schwa +ɘ,@\,close-mid central unrounded vowel +ɵ,8,rounded schwa +ɛ,E,open-mid front unrounded +œ,9,front open-mid rounded +ʌ,V,open-mid back unrounded +ɔ,O,open-mid back rounded +æ,{,mid-open front unrounded vowel +ɐ,6,open-mid schwa +a,a,open front unrounded +ă,a_X,extra short open front unrounded +ɶ,&,front open rounded +ɑ,A,open back unrounded +ɒ,Q,open back rounded +̥,_0,voiceless +̬,_v,voiced +ʰ,_h,aspirated +̤,_t,breathy voiced +̰,_k,creaky voiced +̼,_N,linguolabial +̪,_d,dental +̺,_a,apical +̻,_m,laminal +̹,_O,more rounded +̜,_c,less rounded +̟,_+,advanced +̠,_-,retracted +̈,"_""",centralized +̽,_x,mid-centralized +̩,=,syllabic +̯,_^,non-syllabic +ʷ,_w,labialized +ʲ,',palatalized +ˠ,_G,velarized +ˤ,_?\,pharyngealized +̴,_e,velarized or pharyngealized +̝,_r,raised +̞,_o,lowered +̃,~,nasalized +ⁿ,_n,nasal release +ˡ,_l,lateral release +̚,_},not audibly released +̘,_A,advanced tongue root +̙,_q,retracted tongue root +̋,_T,extra high tone +́,_H,high tone +̄,_M,mid tone +̀,_L,low tone +̏,_B,extra low tone +ˈ,"""",(primary) stress mark +ˌ,%,secondary stress +ː,:,length mark +ˑ,:\,half-length +̆,_X,extra-short +.,.,syllable break +ʍ,W,vl labial-velar fricative +w,w,vd labio-velar approximant +ɥ,H,labial-palatal approximant +ʜ,H\,vl epiglottal fricative +ʢ,<\,vl epiglottal fricative +ʡ,>\,vl epiglottal plosive +ɕ,s\,vl alveolopalatal fricative +ʑ,z\,vl alveolopalatal fricative +ʘ,O\,bilabial click +ǀ,|\,dental click +ǃ,!\,click +ǂ,'=\,alveolar click +ǁ,|\|\,alveolar lateral click +ɺ,l\,vl alveolar lateral flap +ɜ,3,open-mid central +ʛ,G\_<,vl uvular implosive +ɚ,@`,rhotacized schwa +ɞ,3\,open-mid central rounded +ɦ,h\,vd glottal fricative +ɫ,5,velarized vl alveolar lateral +ʄ,J\_<,vl palatal implosive +ʼ,_>,ejective +ɝ,3`,rhotacized open-mid central +t͡ʃ,tS,vl postalveolar affricate +d͡ʒ,dZ,vd postalveolar affricate +t͡ɕ,ts\,vl alveolo-palatal affricate +d͡ʑ,dz\,vd alveolo-palatal affricate +t͡ɬ,tK,vl alveolar lateral affricate +k͡p,kp,vl labial-velar plosive +g͡b,gb,vd labial-velar plosive +ŋ͡m,Nm,labial-velar nasal stop +ʈ͡ʂ,ts`,vl retroflex affricate +ɖ͡ʐ,tz`,vd retroflex affricate +˩,_B,extra low tone +˨,_L,low tone +˧,_M,mid tone +˦,_H,high tone +˥,_T,extra high tone \ No newline at end of file diff --git a/epitran/data/map/rhg-lroh.csv b/epitran/data/map/rhg-lroh.csv new file mode 100644 index 0000000000000000000000000000000000000000..bd6c31fc1ae1698ff90bf2de4039ff6a883b61c3 --- /dev/null +++ b/epitran/data/map/rhg-lroh.csv @@ -0,0 +1,33 @@ +Orth,Phon +b,b +d,d +ḍ,ɖ +f,f +g,g +h,h +j,d͡ʒ +k,k +l,l +m,m +n,n +p,p +r,ɾ +ṛ,ɽ +s,s +š,ʃ +t,t +ṭ,ʈ +v,v +w,w +y,j +z,z +ã,ɑ̃ +a,ɑ +ẽ,ẽ +e,e +ĩ,ĩ +i,i +õ,ɔ̃ +o,ɔ +ũ,ũ +u,u \ No newline at end of file diff --git a/epitran/data/map/rhg-roheng.csv b/epitran/data/map/rhg-roheng.csv new file mode 100644 index 0000000000000000000000000000000000000000..6e799657ba1b080bc50a6a132ee3b817395f5c49 --- /dev/null +++ b/epitran/data/map/rhg-roheng.csv @@ -0,0 +1,35 @@ +Orth,Phon +b,b +c,ʃ +ç,ɽ +d,d +f,f +g,g +h,h +j,d͡ʒ +k,k +l,l +m,m +n,n +p,p +q,q +r,ɾ +s,s +t,t +v,v 
+w,w +x,ks +y,j +z,z +dh,ɖ +th,ʈ +a,ɑ +añ,ɑ̃ +e,e +eñ,ẽ +i,i +iñ,ĩ +o,ɔ +oñ,ɔ̃ +u,u +uñ,ũ \ No newline at end of file diff --git a/epitran/data/post/rhg-lroh.txt b/epitran/data/post/rhg-lroh.txt new file mode 100644 index 0000000000000000000000000000000000000000..d939c21a155b2a0f22b46d090dacc82dfe211422 --- /dev/null +++ b/epitran/data/post/rhg-lroh.txt @@ -0,0 +1,19 @@ +ɑ̃ɑ -> ɑ̃ː / _ +ɑɑ̃ -> ɑ̃ː / _ +ɑɑ -> ɑː / _ + +ẽe -> ẽː / _ +eẽ -> ẽː / _ +ee -> eː / _ + +ĩi -> ĩː / _ +iĩ -> ĩː / _ +ii -> iː / _ + +ɔ̃ɔ -> ɔ̃ː / _ +ɔɔ̃ -> ɔ̃ː / _ +ɔɔ -> ɔː / _ + +ũu -> ũː / _ +uũ -> ũː / _ +uu -> uː / _ \ No newline at end of file diff --git a/epitran/data/post/rhg-roheng.txt b/epitran/data/post/rhg-roheng.txt new file mode 100644 index 0000000000000000000000000000000000000000..08c6acbb47af55de5e47c25bba9d2eec9f45a83f --- /dev/null +++ b/epitran/data/post/rhg-roheng.txt @@ -0,0 +1,14 @@ +ɑɑ̃ -> ɑ̃ː / _ +ɑɑ -> ɑː / _ + +eẽ -> ẽː / _ +ee -> eː / _ + +iĩ -> ĩː / _ +ii -> iː / _ + +ɔɔ̃ -> ɔ̃ː / _ +oo -> ɔː / _ + +uũ -> ũː / _ +uu -> uː / _ \ No newline at end of file diff --git a/epitran/data/pre/rhg-lroh.txt b/epitran/data/pre/rhg-lroh.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f4a9c7375d45325502e9a49afce9337fdff7334 --- /dev/null +++ b/epitran/data/pre/rhg-lroh.txt @@ -0,0 +1,17 @@ +::vowel:: = a|ã|e|ẽ|i|ĩ|o|õ|u|ũ +::consonant:: = b|d|ḍ|f|g|h|j|k|l|m|n|p|r|ṛ|s|š|t|ṭ|v|w|y|z + +% remove stress marks +á -> a / _ +é -> e / _ +í -> i / _ +ó -> o / _ +ú -> u / _ + +% vowel glides +w -> 0 / (u|ũ) _ (a|o|e) +y -> 0 / (i|ĩ) _ (a|e|o|u) + +% long vowels + +% gemination \ No newline at end of file diff --git a/epitran/data/pre/rhg-roheng.txt b/epitran/data/pre/rhg-roheng.txt new file mode 100644 index 0000000000000000000000000000000000000000..ee34209e4085090a18e1442093434b412cab0680 --- /dev/null +++ b/epitran/data/pre/rhg-roheng.txt @@ -0,0 +1,13 @@ +::vowel:: = a|e|i|o|u +::consonant:: = b|c|ç|d|f|g|h|j|k|l|m|n|p|q|r|s|t|v|w|x|y|z + +% remove stress marks +á -> a / _ +é -> e / _ +í -> i / _ +ó -> o / _ +ú -> u / _ + +% vowel glides +w -> 0 / (u|uñ) _ (a|o|e) +y -> 0 / (i|iñ) _ (a|e|o|u) \ No newline at end of file diff --git a/epitran/data/puncnorm.csv b/epitran/data/puncnorm.csv new file mode 100644 index 0000000000000000000000000000000000000000..f1a79f4ad69cf57f944ba26009dc055035eba82b --- /dev/null +++ b/epitran/data/puncnorm.csv @@ -0,0 +1,9 @@ +Punctuation,NormalizedForm +‘,' +’,' +ʼ,' +ʻ,' +”,"""" +“,"""" +。,. +,,"," diff --git a/epitran/dictfirst.py b/epitran/dictfirst.py new file mode 100644 index 0000000000000000000000000000000000000000..549f9275655414f2fd7c6a241189afd1123b304b --- /dev/null +++ b/epitran/dictfirst.py @@ -0,0 +1,32 @@ +import epitran + +class DictFirst: + """If words are in a dictionary, use one model; if words are not, use another fallback. 
+ + Args: + code1 (str): language-script code for dictionary language + code2 (str): language-script code for fall-back language + dict_fn (str): file path to text file containing dictionary, one word per line + """ + def __init__(self, code1, code2, dict_fn): + self.epi1 = epitran.Epitran(code1) + self.epi2 = epitran.Epitran(code2) + self.dictionary = self._read_dictionary(dict_fn) + + def _read_dictionary(self, dict_fn): + with open(dict_fn, encoding='utf-8') as f: + return {x.strip(): self.epi1.transliterate(x.strip()) for x in f} + + def transliterate(self, token): + """Convert token to IPA, falling back on second language + + Args: + token (str): token to convert to IPA + + Returns: + str: IPA equivalent of token + """ + if token in self.dictionary: + return self.dictionary[token] + else: + return self.epi2.transliterate(token) diff --git a/epitran/download.py b/epitran/download.py new file mode 100644 index 0000000000000000000000000000000000000000..ac95cbc8a5fba8b047dbfaa301ed28d68add2d28 --- /dev/null +++ b/epitran/download.py @@ -0,0 +1,28 @@ +import os +import requests +import gzip + +CEDICT_URL='https://www.mdbg.net/chinese/export/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz' + +def get_dir(): + data_dir = os.path.expanduser('~/epitran_data/') + os.makedirs(data_dir, exist_ok=True) + return data_dir + +def get_cedict_file(): + return os.path.join(get_dir(), 'cedict.txt') + +def cedict_exists(): + return os.path.exists(get_cedict_file()) + +def cedict(): + gzfilename = os.path.join(get_dir(), 'cedict.txt.gz') + txtfilename = os.path.join(get_dir(), 'cedict.txt') + r = requests.get(CEDICT_URL) + with open(gzfilename, 'wb') as f: + f.write(r.content) + with gzip.open(gzfilename, 'rb') as ip_byte, open(txtfilename, 'w', encoding='utf-8') as op: + op.write(ip_byte.read().decode('utf-8')) + + + diff --git a/epitran/epihan.py b/epitran/epihan.py new file mode 100644 index 0000000000000000000000000000000000000000..d4d178dc0f1cdb7113821a6014484b366ab613ca --- /dev/null +++ b/epitran/epihan.py @@ -0,0 +1,127 @@ +# -*- coding: utf-8 -*- +from __future__ import print_function, unicode_literals, division, absolute_import + +import os.path + +import pkg_resources +import regex as re + +from . import cedict + from . import rules +from . 
import download +from epitran.ligaturize import ligaturize + + +class MissingData(Exception): + pass + + +class Epihan(object): + punc = [(u'\uff0c', u','), + (u'\uff01', u'!'), + (u'\uff1f', u'?'), + (u'\uff1b', u';'), + (u'\uff1a', u':'), + (u'\uff08', u'('), + (u'\uff09', u')'), + (u'\uff3b', u'['), + (u'\uff3d', u']'), + (u'\u3010', u'['), + (u'\u3011', u']'), + ] + + def __init__(self, ligatures=False, cedict_file=None, + rules_file='pinyin-to-ipa.txt', tones=False): + """Construct epitran object for Chinese + + Args: + ligatures (bool): if True, use ligatures instead of standard IPA + cedict_file (str): path to CC-CEDict dictionary file + rules_file (str): name of file with rules for converting pinyin to + IPA + tones (bool): if True, output tones as Chao tone numbers; overrides + `rules_file` + """ + # If no cedict_file is specified, raise an error + if not cedict_file: + if download.cedict_exists(): + cedict_file = download.get_cedict_file() + else: + raise MissingData('Download CC-CEDICT with "epitran.download.cedict()".') + if tones: + rules_file = os.path.join('data', 'rules', 'pinyin-to-ipa-tones.txt') + else: + rules_file = os.path.join('data', 'rules', rules_file) + rules_file = pkg_resources.resource_filename(__name__, rules_file) + self.cedict = cedict.CEDictTrie(cedict_file) + self.rules = rules.Rules([rules_file]) + self.regexp = re.compile(r'\p{Han}') + + def normalize_punc(self, text): + """Normalize punctuation in a string + + Args: + text (unicode): an orthographic string + + Return: + unicode: an orthographic string with punctuation normalized to + Western equivalents + """ + for a, b in self.punc: + text = text.replace(a, b) + return text + + def transliterate(self, text, normpunc=False, ligatures=False): + """Transliterates/transcribes a word into IPA + + Args: + text (str): text to transcribe; Unicode string + normpunc (bool): normalize punctuation + ligatures (bool): use precomposed ligatures instead of standard IPA + + Returns: + str: Unicode IPA string + """ + tokens = self.cedict.tokenize(text) + ipa_tokens = [] + for token in tokens: + if token in self.cedict.hanzi: + (pinyin, _) = self.cedict.hanzi[token] + pinyin = u''.join(pinyin).lower() + ipa = self.rules.apply(pinyin) + ipa_tokens.append(ipa.replace(u',', u'')) + else: + if normpunc: + token = self.normalize_punc(token) + ipa_tokens.append(token) + ipa_tokens = map(ligaturize, ipa_tokens)\ if ligatures else ipa_tokens + return u''.join(ipa_tokens) + + def strict_trans(self, text, normpunc=False, ligatures=False): + return self.transliterate(text, normpunc, ligatures) + + +class EpihanTraditional(Epihan): + def __init__(self, ligatures=False, cedict_file=None, tones=False, rules_file='pinyin-to-ipa.txt'): + """Construct epitran object for Traditional Chinese + + Args: + ligatures (bool): if True, use ligatures instead of standard IPA + cedict_file (str): path to CC-CEDict dictionary file + rules_file (str): name of file with rules for converting pinyin to + IPA + """ + if not cedict_file: + if download.cedict_exists(): + cedict_file = download.get_cedict_file() + else: + raise MissingData('Download CC-CEDICT with "epitran.download.cedict()".') + if tones: + rules_file = os.path.join('data', 'rules', 'pinyin-to-ipa-tones.txt') + else: + rules_file = os.path.join('data', 'rules', rules_file) + rules_file = pkg_resources.resource_filename(__name__, rules_file) + self.cedict = cedict.CEDictTrie(cedict_file, traditional=True) + self.rules = rules.Rules([rules_file]) + self.regexp = re.compile(r'\p{Han}') diff 
--git a/epitran/exceptions.py b/epitran/exceptions.py new file mode 100644 index 0000000000000000000000000000000000000000..1985710c1895fd36a336d4b99a2eca89bad1a7c1 --- /dev/null +++ b/epitran/exceptions.py @@ -0,0 +1,13 @@ +# -*- coding: utf-8 -*- + + +class MappingError(Exception): + pass + + +class DatafileError(Exception): + pass + + +class FeatureValueError(Exception): + pass \ No newline at end of file diff --git a/epitran/flite.py b/epitran/flite.py new file mode 100644 index 0000000000000000000000000000000000000000..b3d3554e05ab054813b4cd9c47762e84c266c8e9 --- /dev/null +++ b/epitran/flite.py @@ -0,0 +1,215 @@ +# -*- coding: utf-8 -*- +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import logging +import os.path +import string +import sys +import unicodedata + +import pkg_resources +import regex as re + +import panphon +import unicodecsv as csv +from epitran.ligaturize import ligaturize +from epitran.puncnorm import PuncNorm + +if os.name == 'posix' and sys.version_info[0] < 3: + import subprocess32 as subprocess +else: + import subprocess + +logging.basicConfig(level=logging.CRITICAL) +logger = logging.getLogger('epitran') + + +if sys.version_info[0] == 3: + def unicode(x): + return x + + +class Flite(object): + """English G2P using the Flite speech synthesis system.""" + def __init__(self, arpabet='arpabet', ligatures=False, **kwargs): + """Construct a Flite "wrapper" + + Args: + arpabet (str): file containing ARPAbet to IPA mapping + ligatures (bool): if True, use non-standard ligatures instead of + standard IPA + """ + arpabet = pkg_resources.resource_filename(__name__, os.path.join('data', arpabet + '.csv')) + self.arpa_map = self._read_arpabet(arpabet) + self.chunk_re = re.compile(r"([A-Za-z'’]+|[^A-Za-z'’]+)", re.U) + self.letter_re = re.compile(r"[A-Za-z'’]+") + self.regexp = re.compile(r'[A-Za-z]') + self.puncnorm = PuncNorm() + self.ligatures = ligatures + self.ft = panphon.FeatureTable() + self.num_panphon_fts = len(self.ft.names) + + + def _read_arpabet(self, arpabet): + arpa_map = {} + with open(arpabet, 'rb') as f: + reader = csv.reader(f, encoding='utf-8') + for arpa, ipa in reader: + arpa_map[arpa] = ipa + return arpa_map + + def normalize(self, text): + text = unicode(text) + text = unicodedata.normalize('NFD', text) + text = ''.join(filter(lambda x: x in string.printable, text)) + return text + + def arpa_text_to_list(self, arpa_text): + return arpa_text.split(' ')[1:-1] + + def arpa_to_ipa(self, arpa_text, ligatures=False): + arpa_text = arpa_text.strip() + arpa_list = self.arpa_text_to_list(arpa_text) + arpa_list = map(lambda d: re.sub(r'\d', '', d), arpa_list) + ipa_list = map(lambda d: self.arpa_map[d], arpa_list) + text = ''.join(ipa_list) + return text + + def english_g2p(self, english): + """Stub for English G2P function to be overwritten by subclasses""" + return "" + + def transliterate(self, text, normpunc=False, ligatures=False): + """Convert English text to IPA transcription + + Args: + text (unicode): English text + normpunc (bool): if True, normalize punctuation downward + ligatures (bool): if True, use non-standard ligatures instead of + standard IPA + """ + text = unicodedata.normalize('NFC', text) + acc = [] + for chunk in self.chunk_re.findall(text): + if self.letter_re.match(chunk): + acc.append(self.english_g2p(chunk)) + else: + acc.append(chunk) + text = ''.join(acc) + text = self.puncnorm.norm(text) if normpunc else text + text = ligaturize(text) if (ligatures or self.ligatures) else text + return 
text + + def strict_trans(self, text, normpunc=False, ligatures=False): + return self.transliterate(text, normpunc, ligatures) + + def word_to_tuples(self, word, normpunc=False): + """Given a word, returns a list of tuples corresponding to IPA segments. + + Args: + word (unicode): word to transliterate + normpunc (bool): If True, normalizes punctuation to ASCII inventory + + Returns: + list: A list of (category, lettercase, orthographic_form, + phonetic_form, feature_vectors) tuples. + + The "feature vectors" form a list consisting of (segment, vector) pairs. + For IPA segments, segment is a substring of phonetic_form such that the + concatenation of all segments in the list is equal to the phonetic_form. + The vectors are a sequence of integers drawn from the set {-1, 0, 1} + where -1 corresponds to '-', 0 corresponds to '0', and 1 corresponds to + '+'. + """ + def cat_and_cap(c): + cat, case = tuple(unicodedata.category(c)) + case = 1 if case == 'u' else 0 + return unicode(cat), case + + def recode_ft(ft): + try: + return {'+': 1, '0': 0, '-': -1}[ft] + except KeyError: + return None + + def vec2bin(vec): + return map(recode_ft, vec) + + def to_vector(seg): + return seg, vec2bin(self.ft.segment_to_vector(seg)) + + def to_vectors(phon): + if phon == '': + return [(-1, [0] * self.num_panphon_fts)] + else: + return [to_vector(seg) for seg in self.ft.ipa_segs(phon)] + + tuples = [] + word = unicode(word) + # word = self.strip_diacritics.process(word) + word = unicodedata.normalize('NFKD', word) + word = unicodedata.normalize('NFC', word) + while word: + match = re.match('[A-Za-z]+', word) + if match: + span = match.group(0) + cat, case = cat_and_cap(span[0]) + phonword = self.transliterate(span) + phonsegs = self.ft.ipa_segs(phonword) + maxlen = max(len(phonsegs), len(span)) + orth = list(span) + [''] * (maxlen - len(span)) + phonsegs += [''] * (maxlen - len(phonsegs)) + for p, o in zip(phonsegs, orth): + tuples.append(('L', case, o, p, to_vectors(p))) + word = word[len(span):] + else: + span = word[0] + span = self.puncnorm.norm(span) if normpunc else span + cat, case = cat_and_cap(span) + cat = 'P' if normpunc and cat in self.puncnorm else cat + phon = '' + vecs = to_vectors(phon) + tuples.append((cat, case, span, phon, vecs)) + word = word[1:] + return tuples + + +class FliteT2P(Flite): + """Flite G2P using t2p.""" + + def english_g2p(self, text): + text = self.normalize(text) + try: + arpa_text = subprocess.check_output(['t2p', '"{}"'.format(text)]) + arpa_text = arpa_text.decode('utf-8') + except OSError: + logger.warning('t2p (from flite) is not installed.') + arpa_text = '' + except subprocess.CalledProcessError: + logger.warning('Non-zero exit status from t2p.') + arpa_text = '' + return self.arpa_to_ipa(arpa_text) + + +class FliteLexLookup(Flite): + """Flite G2P using lex_lookup.""" + + def arpa_text_to_list(self, arpa_text): + return arpa_text[1:-1].split(' ') + + def english_g2p(self, text): + text = self.normalize(text).lower() + try: + arpa_text = subprocess.check_output(['lex_lookup', text]) + arpa_text = arpa_text.decode('utf-8') + except OSError: + logger.warning('lex_lookup (from flite) is not installed.') + arpa_text = '' + except subprocess.CalledProcessError: + logger.warning('Non-zero exit status from lex_lookup.') + arpa_text = '' + # Split on newlines and take the first element (in case lex_lookup + # returns multiple lines). 
+ arpa_text = arpa_text.splitlines()[0] + return self.arpa_to_ipa(arpa_text) diff --git a/epitran/ligaturize.py b/epitran/ligaturize.py new file mode 100644 index 0000000000000000000000000000000000000000..0a852a768426a8d6decb41b6958c87e7b89a9bbb --- /dev/null +++ b/epitran/ligaturize.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- + +from __future__ import print_function, unicode_literals, division, absolute_import + + +def ligaturize(text): + """Convert text to employ non-standard ligatures + + Args: + text (unicode): IPA text to Convert + + Return: + unicode: non-standard IPA text with phonetic ligatures for affricates + """ + mapping = [(u't͡s', u'ʦ'), + (u't͡ʃ', u'ʧ'), + (u't͡ɕ', u'ʨ'), + (u'd͡z', u'ʣ'), + (u'd͡ʒ', u'ʤ'), + (u'd͡ʑ', u'ʥ'),] + for from_, to_ in mapping: + text = text.replace(from_, to_) + return text diff --git a/epitran/meta.py b/epitran/meta.py new file mode 100644 index 0000000000000000000000000000000000000000..0f63dbc180f685ea931798de5ac5f8cb5b80d555 --- /dev/null +++ b/epitran/meta.py @@ -0,0 +1,75 @@ +modes = { + 'aar': ['Latn'], + 'amh': ['Ethi-pp', 'Ethi-red', 'Ethi'], + 'ara': ['Arab'], + 'aze': ['Latn', 'Cyrl'], + 'ben': ['Beng', 'Beng-red'], + 'cat': ['Latn'], + 'ceb': ['Latn'], + 'ces': ['Latn'], + 'ckb': ['Arab'], + 'deu': ['Latn', 'Latn-np'], + 'fas': ['Arab'], + 'fra': ['Latn', 'Latn-np'], + 'hat': ['Latn-bab'], + 'hau': ['Latn'], + 'hin': ['Deva'], + 'hun': ['Latn'], + 'ilo': ['Latn'], + 'ind': ['Latn'], + 'ita': ['Latn'], + 'jav': ['Latn'], + 'kaz': ['Cyrl', 'Cyrl-bab', 'Latn'], + 'khm': ['Khmr'], + 'kin': ['Latn'], + 'kir': ['Cyrl', 'Arab', 'Latn'], + 'kmr': ['Latn', 'Latn-red'], + 'lao': ['Laoo'], + 'mar': ['Deva'], + 'mlt': ['Latn'], + 'mon': ['Cyrl-bab'], + 'msa': ['Latn'], + 'mya': ['Mymr'], + 'nld': ['Latn'], + 'nya': ['Latn'], + 'orm': ['Latn'], + 'pan': ['Guru'], + 'pol': ['Latn'], + 'por': ['Latn'], + 'ron': ['Latn'], + 'rus': ['Cyrl'], + 'sag': ['Latn'], + 'sna': ['Latn'], + 'som': ['Latn'], + 'spa': ['Latn'], + 'swa': ['Latn', 'Latn-red'], + 'swe': ['Latn'], + 'tam': ['Taml', 'Taml-red'], + 'tel': ['Telu'], + 'tgk': ['Cyrl'], + 'tgl': ['Latn', 'Latn-red'], + 'tha': ['Thai'], + 'tir': ['Ethi-pp', 'Ethi-red', 'Ethi'], + 'tuk': ['Latn', 'Cyrl'], + 'tur': ['Latn', 'Latn-bab', 'Latn-red'], + 'uig': ['Arab'], + 'ukr': ['Cyrl'], + 'urd': ['Arab'], + 'uzb': ['Latn', 'Cyrl'], + 'vie': ['Latn'], + 'xho': ['Latn'], + 'yor': ['Latn'], + 'zha': ['Latn'], + 'zul': ['Latn'] + } + + +def supported_lang(iso639): + return iso639 in modes + + +def get_default_mode(iso639): + try: + return '-'.join([iso639, modes[iso639][0]]) + except KeyError: + return None diff --git a/epitran/ppprocessor.py b/epitran/ppprocessor.py new file mode 100644 index 0000000000000000000000000000000000000000..a145eb95db0b8baf9da9af0836168aac9c00bbad --- /dev/null +++ b/epitran/ppprocessor.py @@ -0,0 +1,53 @@ +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import logging +import os.path +import unicodedata + +import pkg_resources + +from epitran.rules import Rules + +logging.basicConfig(level=logging.DEBUG) + + +class PrePostProcessor(object): + def __init__(self, code, fix, rev): + """Constructs a pre/post-processor for orthographic/IPA strings + + This class reads processor files consisting of context-sensitive rules + and compiles them into regular expression objects that can then be used + to perform regex replacements in cascades that capture feeding and + bleeding. 
+
+        Args:
+            code (str): ISO 639-3 code and ISO 15924 code joined with a hyphen
+            fix (str): 'pre' for preprocessors, 'post' for postprocessors
+            rev (bool): True for reverse transliterating pre/post-processors
+        """
+        self.rules = self._read_rules(code, fix, rev)
+
+    def _read_rules(self, code, fix, rev):
+        assert fix in ['pre', 'post']
+        code += '_rev' if rev else ''
+        fn = os.path.join('data', fix, code + '.txt')
+        try:
+            abs_fn = pkg_resources.resource_filename(__name__, fn)
+        except KeyError:
+            return Rules([])
+        if os.path.isfile(abs_fn):
+            return Rules([abs_fn])
+        else:
+            return Rules([])
+
+    def process(self, word):
+        """Apply processor to an input string
+
+        Args:
+            word (unicode): input string (orthographic or IPA)
+
+        Returns:
+            unicode: output string with all rules applied in order
+        """
+        return self.rules.apply(word)
diff --git a/epitran/puncnorm.py b/epitran/puncnorm.py
new file mode 100644
index 0000000000000000000000000000000000000000..5626180e2d7bb1f29de460a174e385e7b8d9fc58
--- /dev/null
+++ b/epitran/puncnorm.py
@@ -0,0 +1,41 @@
+# -*- coding: utf-8 -*-
+
+import pkg_resources
+import unicodecsv as csv
+
+
+class PuncNorm(object):
+    def __init__(self):
+        """Constructs a punctuation normalization object"""
+        self.puncnorm = self._load_punc_norm_map()
+
+    def _load_punc_norm_map(self):
+        """Load the map table for normalizing 'down' punctuation."""
+        path = pkg_resources.resource_filename(__name__, 'data/puncnorm.csv')
+        with open(path, 'rb') as f:
+            reader = csv.reader(f, encoding='utf-8', delimiter=str(','), quotechar=str('"'))
+            next(reader)
+            return {punc: norm for (punc, norm) in reader}
+
+    def norm(self, text):
+        """Apply punctuation normalization to a string of text
+
+        Args:
+            text (unicode): text to normalize
+
+        Returns:
+            unicode: text with normalized punctuation
+        """
+        new_text = []
+        for c in text:
+            if c in self.puncnorm:
+                new_text.append(self.puncnorm[c])
+            else:
+                new_text.append(c)
+        return ''.join(new_text)
+
+    def __iter__(self):
+        return iter(self.puncnorm)
+
+    def __getitem__(self, key):
+        return self.puncnorm[key]
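PuncNorm normalizes nonstandard punctuation one character at a time via a lookup table loaded from data/puncnorm.csv. A minimal sketch of the same mechanism, with a few invented entries standing in for the CSV table:

    # Toy stand-in for the data/puncnorm.csv table (illustrative entries only).
    PUNC_MAP = {'\u201c': '"', '\u201d': '"', '\u2018': "'", '\u2019': "'"}

    def norm(text):
        # Replace each character that has a table entry; pass the rest through.
        return ''.join(PUNC_MAP.get(c, c) for c in text)

    assert norm('\u201cdream\u201d') == '"dream"'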
diff --git a/epitran/reromanize.py b/epitran/reromanize.py
new file mode 100644
index 0000000000000000000000000000000000000000..49afe87289614facd20ffe0be179297837f4233e
--- /dev/null
+++ b/epitran/reromanize.py
@@ -0,0 +1,65 @@
+from __future__ import print_function, unicode_literals, division, absolute_import
+
+import os.path
+import sys
+from unicodedata import normalize
+
+import pkg_resources
+
+import epitran
+import unicodecsv as csv
+
+
+class ReRomanizer(object):
+    """Converts IPA representations to a readable roman form."""
+
+    def __init__(self, code, table, decompose=True, cedict_file=None):
+        """Construct object for re-romanizing Epitran output.
+
+        This class converts orthographic input, via Epitran, to a more
+        conventional romanization that should be more readable to most humans.
+
+        Args:
+            code (str): ISO 639-3 code and ISO 15924 code joined with a hyphen
+            table (str): Name of re-romanization table
+            decompose (bool): apply decomposing normalization
+        """
+        self.epi = epitran.Epitran(code, cedict_file=cedict_file)
+        self.mapping = self._load_reromanizer(table, decompose)
+
+    def _load_reromanizer(self, table, decompose):
+        path = os.path.join('data', 'reromanize', table + '.csv')
+        path = pkg_resources.resource_filename(__name__, path)
+        if os.path.isfile(path):
+            mapping = {}
+            with open(path, 'rb') as f:
+                reader = csv.reader(f, encoding='utf-8')
+                next(reader)
+                for ipa, rom in reader:
+                    rom = normalize('NFD', rom) if decompose else normalize('NFC', rom)
+                    mapping[ipa] = rom
+            return mapping
+        else:
+            print('File {} does not exist.'.format(path), file=sys.stderr)
+            return {}
+
+    def reromanize_ipa(self, tr_list):
+        re_rom_list = []
+        for seg in tr_list:
+            if seg in self.mapping:
+                re_rom_list.append(self.mapping[seg])
+            else:
+                re_rom_list.append(seg)
+        return re_rom_list
+
+    def reromanize(self, text):
+        """Convert orthographic text to romanized text
+
+        Args:
+            text (unicode): orthographic text
+
+        Returns:
+            unicode: romanized text
+        """
+        tr_list = self.epi.trans_list(text)
+        return ''.join(self.reromanize_ipa(tr_list))
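The re-romanization step is a plain segment-for-segment substitution over the IPA segment list that Epitran produces. A sketch with an invented mapping (real tables live under data/reromanize/):

    # Invented IPA-to-roman entries; real tables are loaded from CSV.
    mapping = {'ʃ': 'sh', 'ɑ': 'a', 'd͡ʒ': 'j'}

    def reromanize_ipa(tr_list):
        # Segments with a table entry are rewritten; unknown segments pass through.
        return [mapping.get(seg, seg) for seg in tr_list]

    print(''.join(reromanize_ipa(['ʃ', 'ɑ', 'd͡ʒ', 'ɑ'])))  # shaja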
diff --git a/epitran/rules.py b/epitran/rules.py
new file mode 100644
index 0000000000000000000000000000000000000000..95deceea98796083a371e1177fb20ce308ea9414
--- /dev/null
+++ b/epitran/rules.py
@@ -0,0 +1,114 @@
+# -*- coding: utf-8 -*-
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+
+import io
+import logging
+import unicodedata
+
+import regex as re
+
+from epitran.exceptions import DatafileError
+
+logger = logging.getLogger('epitran')
+
+
+def none2str(x):
+    return x if x else ''
+
+
+class RuleFileError(Exception):
+    pass
+
+
+class Rules(object):
+    def __init__(self, rule_files):
+        """Construct an object encoding context-sensitive rules
+
+        Args:
+            rule_files (list): list of names of rule files
+        """
+        self.rules = []
+        self.symbols = {}
+        for rule_file in rule_files:
+            rules = self._read_rule_file(rule_file)
+            self.rules = self.rules + rules
+
+    def _read_rule_file(self, rule_file):
+        rules = []
+        with io.open(rule_file, 'r', encoding='utf-8') as f:
+            for i, line in enumerate(f):
+                # Normalize the line to decomposed form
+                line = line.strip()
+                line = unicodedata.normalize('NFD', line)
+                if not re.match(r'\s*%', line):
+                    rules.append(self._read_rule(i, line))
+        return [rule for rule in rules if rule is not None]
+
+    def _sub_symbols(self, line):
+        while re.search(r'::\w+::', line):
+            s = re.search(r'::\w+::', line).group(0)
+            if s in self.symbols:
+                line = line.replace(s, self.symbols[s])
+            else:
+                raise RuleFileError('Undefined symbol: {}'.format(s))
+        return line
+
+    def _read_rule(self, i, line):
+        line = line.strip()
+        if line:
+            line = unicodedata.normalize('NFD', line)
+            s = re.match(r'(?P<symbol>::\w+::)\s*=\s*(?P<value>.+)', line)
+            if s:
+                self.symbols[s.group('symbol')] = s.group('value')
+            else:
+                line = self._sub_symbols(line)
+                r = re.match(r'(\S+)\s*->\s*(\S+)\s*/\s*(\S*)\s*[_]\s*(\S*)', line)
+                try:
+                    a, b, X, Y = r.groups()
+                except AttributeError:
+                    raise DatafileError('Line {}: "{}" cannot be parsed.'.format(i + 1, line))
+                X, Y = X.replace('#', '^'), Y.replace('#', '$')
+                a, b = a.replace('0', ''), b.replace('0', '')
+                try:
+                    if re.search(r'[?]P[<]sw1[>].+[?]P[<]sw2[>]', a):
+                        return self._fields_to_function_metathesis(a, X, Y)
+                    else:
+                        return self._fields_to_function(a, b, X, Y)
+                except Exception as e:
+                    raise DatafileError('Line {}: "{}" cannot be compiled as regex: {}'.format(i + 1, line, e))
+
+    def _fields_to_function_metathesis(self, a, X, Y):
+        left = r'(?P<X>{}){}(?P<Y>{})'.format(X, a, Y)
+        regexp = re.compile(left)
+
+        def rewrite(m):
+            d = {k: none2str(v) for k, v in m.groupdict().items()}
+            return '{}{}{}{}'.format(d['X'], d['sw2'], d['sw1'], d['Y'])
+
+        # Flags belong at compile time; sub()'s third positional argument is count.
+        return lambda w: regexp.sub(rewrite, w)
+
+    def _fields_to_function(self, a, b, X, Y):
+        left = r'(?P<X>{})(?P<a>{})(?P<Y>{})'.format(X, a, Y)
+        regexp = re.compile(left)
+
+        def rewrite(m):
+            d = {k: none2str(v) for k, v in m.groupdict().items()}
+            return '{}{}{}'.format(d['X'], b, d['Y'])
+
+        return lambda w: regexp.sub(rewrite, w)
+
+    def apply(self, text):
+        """Apply rules to input text
+
+        Args:
+            text (unicode): input text (e.g. Pinyin)
+
+        Returns:
+            unicode: output text (e.g. IPA)
+        """
+        for i, rule in enumerate(self.rules):
+            text = rule(text)
+            # print(i, text)
+        # return unicodedata.normalize('NFD', text)
+        return text
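Each rewrite rule "a -> b / X _ Y" is compiled into a single regex in which the left and right contexts are captured with named groups and re-emitted around the replacement. A small self-contained sketch of the non-metathesis case, mirroring _fields_to_function above:

    import regex as re

    # Compile the rule "a -> b / X _ Y" into a substitution function.
    def fields_to_function(a, b, X, Y):
        left = r'(?P<X>{})(?P<a>{})(?P<Y>{})'.format(X, a, Y)
        regexp = re.compile(left)

        def rewrite(m):
            d = {k: v if v else '' for k, v in m.groupdict().items()}
            return '{}{}{}'.format(d['X'], b, d['Y'])

        return lambda w: regexp.sub(rewrite, w)

    # The rule "n -> m / _ b" (n becomes m before b):
    rule = fields_to_function('n', 'm', '', 'b')
    print(rule('sunbeam'))  # sumbeam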
diff --git a/epitran/setup.cfg b/epitran/setup.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..2e9053c06e45cd65ebfe61ac591c280a1a9c6ee4
--- /dev/null
+++ b/epitran/setup.cfg
@@ -0,0 +1,5 @@
+[bdist_wheel]
+universal = 1
+
+[metadata]
+license_file = LICENSE.txt
diff --git a/epitran/simple.py b/epitran/simple.py
new file mode 100644
index 0000000000000000000000000000000000000000..91ba54c780ab639ce172535818ddae04bb0cb449
--- /dev/null
+++ b/epitran/simple.py
@@ -0,0 +1,335 @@
+"""Basic Epitran class for G2P in most languages."""
+import logging
+import os.path
+import sys
+import csv
+import unicodedata
+from collections import defaultdict
+from typing import DefaultDict, Callable  # pylint: disable=unused-import
+
+import pkg_resources
+import regex
+
+import panphon
+from epitran.exceptions import DatafileError, MappingError, FeatureValueError
+from epitran.ligaturize import ligaturize
+from epitran.ppprocessor import PrePostProcessor
+from epitran.puncnorm import PuncNorm
+from epitran.stripdiacritics import StripDiacritics
+
+logger = logging.getLogger('epitran')
+
+class SimpleEpitran(object):
+    """The backend object epitran uses for most languages
+
+    :param code str: ISO 639-3 code and ISO 15924 code joined with a hyphen
+    :param preproc bool, optional: if True, apply preprocessor
+    :param postproc bool, optional: if True, apply postprocessors
+    :param ligatures bool, optional: if True, use phonetic ligatures for affricates instead of
+        standard IPA
+    :param rev bool, optional: if True, load reverse transliteration
+    :param rev_preproc bool, optional: if True, apply preprocessor when reverse transliterating
+    :param rev_postproc bool, optional: if True, apply postprocessor when reverse transliterating
+    """
+    def __init__(self, code: str, preproc: bool=True, postproc: bool=True, ligatures: bool=False,
+                 rev: bool=False, rev_preproc: bool=True, rev_postproc: bool=True, tones: bool=False):
+        """Constructor"""
+        self.rev = rev
+        self.tones = tones
+        self.g2p = self._load_g2p_map(code, False)
+        self.regexp = self._construct_regex(self.g2p.keys())
+        self.puncnorm = PuncNorm()
+        self.ft = panphon.FeatureTable()
+        self.num_panphon_fts = len(self.ft.names)
+        self.preprocessor = PrePostProcessor(code, 'pre', False)
+        self.postprocessor = PrePostProcessor(code, 'post', False)
+        self.strip_diacritics = StripDiacritics(code)
+        self.preproc = preproc
+        self.postproc = postproc
+        self.ligatures = ligatures
+        self.rev_preproc = rev_preproc
+        self.rev_postproc = rev_postproc
+        if rev:
+            self.rev_g2p = self._load_g2p_map(code, True)
+            self.rev_regexp = self._construct_regex(self.rev_g2p.keys())
+            self.rev_preprocessor = PrePostProcessor(code, 'pre', True)
+            self.rev_postprocessor = PrePostProcessor(code, 'post', True)
+
+        self.nils = defaultdict(int)
+
+    def get_tones(self) -> bool:
+        """Returns True if support for tones is turned on.
+
+        :return: True if tone support is activated
+        :rtype: bool
+        """
+        return self.tones
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, _type_, _val, _trace_back):
+        for nil, count in self.nils.items():
+            sys.stderr.write(f'Unknown character "{nil}" occurred {count} times.\n')
+
+    # def _one_to_many_gr_by_line_map(self, gr_by_line: "dict[str, list[int]]") -> "tuple[str, list[int]]":
+    #     for g, ls in gr_by_line.items():
+    #         if len(ls) > 0:
+    #             return (g, ls)
+    #     return ("", [])
+
+    def _non_deterministic_mappings(self, gr_by_line: "dict[str, list[int]]") -> "list[tuple[str, list[int]]]":
+        return [(g, ls) for (g, ls) in gr_by_line.items() if len(ls) > 1]
+
+    def _load_g2p_map(self, code: str, rev: bool) -> "DefaultDict[str, list[str]]":
+        """Load the code table for the specified language.
+
+        :param code str: ISO 639-3 code plus "-" plus ISO 15924 code for the language/script to be loaded
+        :param rev bool: If True, reverse the table (for reverse transliterating)
+        :return: A mapping from graphemes to phonemes
+        :rtype: DefaultDict[str, list[str]]
+        """
+        g2p = defaultdict(list)
+        gr_by_line = defaultdict(list)
+        code += '_rev' if rev else ''
+        try:
+            path = os.path.join('data', 'map', code + '.csv')
+            path = pkg_resources.resource_filename(__name__, path)
+        except IndexError as malformed_data_file:
+            raise DatafileError('Add an appropriately-named mapping to the data/maps directory.') from malformed_data_file
+        with open(path, encoding='utf-8') as f:
+            reader = csv.reader(f)
+            orth, phon = next(reader)
+            if orth != 'Orth' or phon != 'Phon':
+                raise DatafileError(f'Header is ["{orth}", "{phon}"] instead of ["Orth", "Phon"].')
+            for (i, fields) in enumerate(reader):
+                try:
+                    graph, phon = fields
+                except ValueError as malformed_data_file:
+                    raise DatafileError(f'Map file is not well formed at line {i + 2}.') from malformed_data_file
+                graph = unicodedata.normalize('NFD', graph)
+                phon = unicodedata.normalize('NFD', phon)
+                if not self.tones:
+                    phon = regex.sub('[˩˨˧˦˥]', '', phon)
+                g2p[graph].append(phon)
+                gr_by_line[graph].append(i)
+        nondeterminisms = self._non_deterministic_mappings(gr_by_line)
+        if nondeterminisms:
+            message = ""
+            for graph, lines in nondeterminisms:
+                lines = [l + 2 for l in lines]
+                delim = ', '
+                message += '\n' + f'One-to-many G2P mapping for "{graph}" on lines {delim.join(map(str, lines))}'
+            raise MappingError(f'Invalid mapping for {code}:\n{message}')
+        return g2p
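The loaded table maps each grapheme to a list of phoneme strings, and _construct_regex (below) turns its keys into one alternation sorted longest-first, so multigraphs beat their single-character prefixes. A toy illustration with an invented mapping:

    import regex

    # Invented grapheme-to-phoneme entries; real tables come from data/map/*.csv.
    g2p = {'c': ['k'], 'h': ['h'], 'ch': ['tʃ']}

    # Longest-first alternation makes the regex engine try 'ch' before 'c'.
    graphemes = sorted(g2p, key=len, reverse=True)
    pattern = regex.compile('({})'.format('|'.join(graphemes)), regex.I)

    m = pattern.match('chip')
    print(m.group(0), '->', g2p[m.group(0)][0])  # ch -> tʃ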
+ """ + graphemes = sorted(g2p_keys, key=len, reverse=True) + return regex.compile(f"({r'|'.join(graphemes)})", regex.I) + + def general_trans(self, text: str, filter_func: "Callable[[tuple[str, bool]], bool]", + normpunc: bool=False, ligatures: bool=False): + """Transliaterates a word into IPA, filtering with filter_func + + :param text str: word to transcribe; unicode string + :param filter_func Callable[[tuple[str, bool]], bool]: function for filtering + segments; takes a tuple and returns a boolean. + :param normpunct bool: normalize punctuation + :param ligatures bool: use precompsed ligatures instead of standard IPA + :return: IPA string corresponding to the orthographic input `text` + :rtype: str + """ + text = unicodedata.normalize('NFD', text.lower()) + logger.debug('(after norm) text=%s', repr(list(text))) + text = self.strip_diacritics.process(text) + logger.debug('(after strip) text=%s', repr(list(text))) + if self.preproc: + text = self.preprocessor.process(text) + logger.debug('(after preproc) text=%s', repr(list(text))) + tr_list = [] + while text: + logger.debug('text=%s', repr(list(text))) + m = self.regexp.match(text) + if m: + source = m.group(0) + try: + target = self.g2p[source][0] + except KeyError: + logger.debug("source = '%s''", source) + logger.debug("self.g2p[source] = %s'", self.g2p[source]) + target = source + except IndexError: + logger.debug("self.g2p[source]= %s", self.g2p[source]) + target = source + tr_list.append((target, True)) + text = text[len(source):] + else: + tr_list.append((text[0], False)) + self.nils[text[0]] += 2 + text = text[1:] + text = ''.join([s for (s, _) in filter(filter_func, tr_list)]) + if self.postproc: + text = self.postprocessor.process(text) + if ligatures or self.ligatures: + text = ligaturize(text) + if normpunc: + text = self.puncnorm.norm(text) + return unicodedata.normalize('NFC', text) + + def transliterate(self, text: str, normpunc: bool=False, ligatures: bool=False): + """Transliterates/transcribes a word into IPA. Passes unmapped + characters through to output unchanged. + + :param text str: word to transcribe + :param normpunct bool: if True, normalize punctuation + :param ligatures bool: if True, use precomposed ligatures instead + of standard IPA + :return: IPA string corresponding to the orthographic string `text`. + All unrecognized characters are included. + :rtype: str + """ + return self.general_trans(text, lambda x: True, + normpunc, ligatures) + + def general_reverse_trans(self, text: str): + """Reconstructs word from IPA. Does the reverse of transliterate(). + Ignores unmapped characters. + + :param text str: Transcription to render in orthography + :return: Orthographic string corresponding to `text` + :rtype: str + """ + if self.rev_preproc: + text = self.rev_preprocessor.process(text) + tr_list = [] + while text: + m = self.rev_regexp.match(text) + if m: + source = m.group(0) + try: + target = self.rev_g2p[source][0] + except KeyError: + logger.debug("source = '%s'", source) + logger.debug("self.rev_g2p[source] = '%s'", self.g2p[source]) + target = source + tr_list.append((target, True)) + text = text[len(source):] + else: + tr_list.append((text[0], False)) + self.nils[text[0]] += 2 + text = text[1:] + text = ''.join([s for (s, _) in tr_list]) + if self.rev_postproc: + text = self.rev_postprocessor.process(text) + return unicodedata.normalize('NFC', text) + + def reverse_transliterate(self, ipa:str) -> str: + """Reconstructs word from IPA. 
+
+    def general_reverse_trans(self, text: str):
+        """Reconstructs word from IPA. Does the reverse of transliterate().
+        Passes unmapped characters through unchanged.
+
+        :param text str: Transcription to render in orthography
+        :return: Orthographic string corresponding to `text`
+        :rtype: str
+        """
+        if self.rev_preproc:
+            text = self.rev_preprocessor.process(text)
+        tr_list = []
+        while text:
+            m = self.rev_regexp.match(text)
+            if m:
+                source = m.group(0)
+                try:
+                    target = self.rev_g2p[source][0]
+                except KeyError:
+                    logger.debug("source = '%s'", source)
+                    logger.debug("self.rev_g2p[source] = '%s'", self.rev_g2p[source])
+                    target = source
+                tr_list.append((target, True))
+                text = text[len(source):]
+            else:
+                tr_list.append((text[0], False))
+                self.nils[text[0]] += 1
+                text = text[1:]
+        text = ''.join([s for (s, _) in tr_list])
+        if self.rev_postproc:
+            text = self.rev_postprocessor.process(text)
+        return unicodedata.normalize('NFC', text)
+
+    def reverse_transliterate(self, ipa: str) -> str:
+        """Reconstructs word from IPA. Does the reverse of transliterate().
+
+        :param ipa str: Word transcription in IPA
+        :return: Reconstructed word in orthography
+        :rtype: str
+        """
+        if not self.rev:
+            raise ValueError('This Epitran object was initialized '
+                             'with no reverse transliteration loaded')
+        return self.general_reverse_trans(ipa)
+
+    def strict_trans(self, text: str, normpunc: bool=False, ligatures: bool=False) -> str:
+        """Transliterates/transcribes a word into IPA, ignoring
+        unmapped characters.
+
+        :param text str: word to transcribe
+        :param normpunc bool: normalize punctuation
+        :param ligatures bool: use precomposed ligatures instead of standard IPA
+        :return: IPA string corresponding to orthographic `text`, ignoring
+            out-of-mapping characters
+        :rtype: str
+        """
+        return self.general_trans(text, lambda x: x[1],
+                                  normpunc, ligatures)
+
+    def word_to_tuples(self, text: str, normpunc: bool=False) -> "list[tuple[str, int, str, str, list[tuple[str, list[int]]]]]":
+        """Given a word, returns a list of tuples corresponding to IPA segments.
+
+        :param text str: Word to transcribe
+        :param normpunc bool: Normalize punctuation
+        :return: Word represented as (category, lettercase, orthographic_form,
+            phonetic_form, feature_vectors) tuples
+        :rtype: list[tuple[str, int, str, str, list[tuple[str, list[int]]]]]
+
+        The "feature vectors" form a list consisting of (segment, vector)
+        pairs. For IPA segments, segment is a substring of phonetic_form such
+        that the concatenation of all segments in the list is equal to
+        the phonetic_form. The vectors are a sequence of integers drawn from
+        the set {-1, 0, 1} where -1 corresponds to '-', 0 corresponds to '0',
+        and 1 corresponds to '+'.
+        """
+        def cat_and_cap(category: str) -> "tuple[str, int]":
+            cat, case = tuple(unicodedata.category(category))
+            case = 1 if case == 'u' else 0
+            return cat, case
+
+        def recode_ft(feature: str) -> int:
+            try:
+                return {'+': 1, '0': 0, '-': -1}[feature]
+            except KeyError:
+                raise FeatureValueError(f'Unknown feature value "{feature}"') from KeyError
+
+        def vec2bin(vec: "list[str]") -> "list[int]":
+            return list(map(recode_ft, vec))
+
+        def to_vector(seg: str) -> "tuple[str, list[int]]":
+            return seg, vec2bin(self.ft.segment_to_vector(seg))
+
+        def to_vectors(phon: str) -> "list[tuple[str, list[int]]]":
+            if phon == '':
+                return [('', [0] * self.num_panphon_fts)]
+            else:
+                return [to_vector(seg) for seg in self.ft.ipa_segs(phon)]
+
+        tuples = []
+        word = self.strip_diacritics.process(text)
+        word = unicodedata.normalize('NFD', word)
+        if self.preproc:
+            word = self.preprocessor.process(word)
+        while word:
+            match = self.regexp.match(word)
+            if match:
+                span: str = match.group(1)
+                cat, case = cat_and_cap(span[0])
+                phon: str = self.g2p[span.lower()][0]
+                vecs: "list[tuple[str, list[int]]]" = to_vectors(phon)
+                tuples.append(('L', case, span, phon, vecs))
+                word = word[len(span):]
+            else:
+                span = word[0]
+                span = self.puncnorm.norm(span) if normpunc else span
+                cat, case = cat_and_cap(span)
+                cat = 'P' if normpunc and cat in self.puncnorm else cat
+                phon = ''
+                vecs = to_vectors(phon)
+                tuples.append((cat, case, span, phon, vecs))
+                word = word[1:]
+        return tuples
+
+    def ipa_segs(self, ipa: str) -> "list[str]":
+        """Given an IPA string, decompose it into a list of segments
+
+        :param ipa str: A phonetic representation in IPA
+        :return: A list of segments corresponding to `ipa`
+        :rtype: list[str]
+        """
+        return self.ft.ipa_segs(ipa)
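Each entry that word_to_tuples produces bundles the Unicode category, a lettercase flag, the orthographic span, its phonetic form, and per-segment feature vectors. A sketch of the shape with invented phonetic values (real vectors come from panphon and have one entry per feature):

    # (category, case, orth, phon, [(segment, vector), ...]); values invented.
    tuples = [
        ('L', 1, 'S', 'ʃ', [('ʃ', [1, -1, 0])]),  # letter, uppercase
        ('L', 0, 'a', 'ɑ', [('ɑ', [0, 1, -1])]),  # letter, lowercase
        ('P', 0, '.', '',  [('', [0, 0, 0])]),    # punctuation, no phon
    ]
    for cat, case, orth, phon, vecs in tuples:
        print(cat, case, orth, phon, len(vecs))

diff --git a/epitran/space.py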
b/epitran/space.py new file mode 100644 index 0000000000000000000000000000000000000000..43483cc0276a905e4d811a74bb3b525aee6f5cb5 --- /dev/null +++ b/epitran/space.py @@ -0,0 +1,68 @@ +# -*- coding: utf-8 -*- +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import os + +import pkg_resources +import unicodecsv as csv +from epitran import Epitran + + +class Space(object): + def __init__(self, code, space_names): + """Construct a Space object + + Space objects take strings (corresponding to segments) and return + integers, placing them in an integer space that can be translated into + a one-hot vector. + + The resulting object has a dictionary-like interface that supports + indexing and iteration over "keys". + + Args: + code (str): ISO 639-3 code joined to ISO 15924 code with "-" + space_names (list): list of space names consisting of ISO 639-3 + codes joined to ISO 15924 codes with "-" + """ + self.epi = Epitran(code) + self.dict = self._load_space(space_names) + + def _load_space(self, space_names): + segs = set() + scripts = list(set([nm.split('-')[1] for nm in space_names])) + punc_fns = ['punc-{}.csv'.format(sc) for sc in scripts] + for punc_fn in punc_fns: + punc_fn = os.path.join('data', 'space', punc_fn) + punc_fn = pkg_resources.resource_filename(__name__, punc_fn) + with open(punc_fn, 'rb') as f: + reader = csv.reader(f, encoding='utf-8') + for (mark,) in reader: + segs.add(mark) + for name in space_names: + fn = os.path.join('data', 'space', name + '.csv') + fn = pkg_resources.resource_filename(__name__, fn) + with open(fn, 'rb') as f: + reader = csv.reader(f, encoding='utf-8') + for _, to_ in reader: + for seg in self.epi.ft.ipa_segs(to_): + segs.add(seg) + enum = enumerate(sorted(list(segs))) + return {seg: num for num, seg in enum} + + def __iter__(self): + return iter(self.dict) + + def __getitem__(self, key): + """Given a string as a key, return the corresponding integer + + Args: + key (unicode): a unicode key corresponding to a segment + + Returns: + int: the integer corresponding to the unicode string + """ + try: + return self.dict[key] + except KeyError: + return len(self.dict) diff --git a/epitran/stripdiacritics.py b/epitran/stripdiacritics.py new file mode 100644 index 0000000000000000000000000000000000000000..cf486994fa96c6025f2f8aeb773d43479e4c92ef --- /dev/null +++ b/epitran/stripdiacritics.py @@ -0,0 +1,49 @@ +# -*- coding: utf-8 -*- +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import os.path +import unicodedata + +import pkg_resources + +import unicodecsv as csv + + +class StripDiacritics(object): + def __init__(self, code): + """Constructs object to strip specified diacritics from text + + Args: + code (str): ISO 639-3 code and ISO 15924 code joined with a hyphen + """ + self.diacritics = self._read_diacritics(code) + + def _read_diacritics(self, code): + diacritics = [] + fn = os.path.join('data', 'strip', code + '.csv') + try: + abs_fn = pkg_resources.resource_filename(__name__, fn) + except KeyError: + return [] + if os.path.isfile(abs_fn): + with open(abs_fn, 'rb') as f: + reader = csv.reader(f, encoding='utf-8') + for [diacritic] in reader: + diacritics.append(diacritic) + return diacritics + + def process(self, word): + """Remove diacritics from an input string + + Args: + word (unicode): Unicode IPA string + + Returns: + unicode: Unicode IPA string with specified diacritics + removed + """ + # word = unicodedata.normalize('NFD', word) + word = ''.join(filter(lambda x: x 
not in self.diacritics, word))
+        # return unicodedata.normalize('NFC', word)
+        return word
diff --git a/epitran/tir2pp.py b/epitran/tir2pp.py
new file mode 100644
index 0000000000000000000000000000000000000000..9bf867a5a53df45d74b38f0bb44781848a7ca426
--- /dev/null
+++ b/epitran/tir2pp.py
@@ -0,0 +1,18 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, unicode_literals
+
+import os.path
+
+import pkg_resources
+from . import rules
+
+
+class Tir2PP(object):
+    def __init__(self):
+        fn = os.path.join('data', 'post', 'tir-Ethi-pp.txt')
+        fn = pkg_resources.resource_filename(__name__, fn)
+        self.rules = rules.Rules([fn])
+
+    def apply(self, word):
+        word = word.replace('ɨ', '')
+        return self.rules.apply(word)
diff --git a/epitran/vector.py b/epitran/vector.py
new file mode 100644
index 0000000000000000000000000000000000000000..e49264c54e18eb8e4a4020bf0e1855a4da3d6b56
--- /dev/null
+++ b/epitran/vector.py
@@ -0,0 +1,61 @@
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+
+import logging
+
+from epitran import Epitran
+from epitran.space import Space
+
+logger = logging.getLogger('epitran')
+
+
+class VectorsWithIPASpace(object):
+    def __init__(self, code, space_names):
+        """Constructs VectorsWithIPASpace object
+
+        A VectorsWithIPASpace object takes orthographic words, via the
+        word_to_segs method, and returns a list of tuples consisting of category
+        (letter or punctuation), lettercase, orthographic form, phonetic form,
+        id within an IPA space, and articulatory feature vector.
+
+        Args:
+            code (str): ISO 639-3 code joined to ISO 15924 code with "-"
+            space_names (list): list of space names consisting of ISO 639-3
+                codes joined to ISO 15924 codes with "-"
+        """
+        self.epi = Epitran(code)
+        self.space = Space(code, space_names)
+
+    def word_to_segs(self, word, normpunc=False):
+        """Returns feature vectors, etc. for segments and punctuation in a word
+
+        Args:
+            word (unicode): Unicode string representing a word in the
+                orthography specified when the class is
+                instantiated
+            normpunc (bool): normalize punctuation
+
+        Returns:
+            list: a list of tuples, each representing an IPA segment or a
+                punctuation character. Tuples consist of (category, case,
+                orthographic_form, phonetic_form, id_in_ipa_space, vector).
+
+        Category consists of the standard Unicode classes (e.g. 'L'
+        for letter and 'P' for punctuation). Case is binary: 1 for
+        uppercase and 0 for lowercase.
+ """ + segs = self.epi.word_to_tuples(word, normpunc) + new_segs = [] + for cat, case, orth, phon, id_vec_list in segs: + if not phon and normpunc: + if orth in self.epi.puncnorm: + orth = self.epi.puncnorm[orth] + for s, vector in id_vec_list: + if s in self.space: + id_ = int(self.space[s]) + elif orth in self.space: + id_ = int(self.space[orth]) + else: + id_ = -1 + new_segs.append((cat, case, orth, phon, id_, vector)) + return new_segs diff --git a/epitran/xsampa.py b/epitran/xsampa.py new file mode 100644 index 0000000000000000000000000000000000000000..1e8bd2159bc5ba3a14e2a53e31d6d05b10bc5dce --- /dev/null +++ b/epitran/xsampa.py @@ -0,0 +1,67 @@ +# -*- coding: utf-8 -*- +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import os.path +import unicodedata + +import pkg_resources + +import marisa_trie +import panphon +import unicodecsv as csv + + +class XSampa(object): + ipa2xs_fn = 'ipa-xsampa.csv' + + def __init__(self): + """Construct an IPA-XSampa conversion object + """ + self.trie = self._read_ipa2xs() + self.ft = panphon.FeatureTable() + + def _read_ipa2xs(self): + path = os.path.join('data', self.ipa2xs_fn) + path = pkg_resources.resource_filename(__name__, path) + pairs = [] + with open(path, 'rb') as f: + reader = csv.reader(f, encoding='utf-8') + next(reader) + for ipa, xs, _ in reader: + pairs.append((ipa, xs.encode('utf-8'),)) + trie = marisa_trie.BytesTrie(pairs) + return trie + + def prefixes(self, s): + return self.trie.prefixes(s) + + def longest_prefix(self, s): + prefixes = self.prefixes(s) + if not prefixes: + return '' + else: + return sorted(prefixes, key=len)[-1] # sort by length and return last + + def ipa2xs(self, ipa): + """Convert IPA string (unicode) to X-SAMPA string + + Args: + ipa (unicode): An IPA string as unicode + + Returns: + list: a list of strings corresponding to X-SAMPA segments + + Non-IPA segments are skipped. 
+ """ + xsampa = [] + ipa = unicodedata.normalize('NFD', ipa) + while ipa: + token = self.longest_prefix(ipa) + if token: + xs = self.trie[token][0] # take first member of the list + xsampa.append(xs.decode('utf-8')) + ipa = ipa[len(token):] + else: + ipa = ipa[1:] + return ''.join(xsampa) diff --git a/functions.py b/functions.py new file mode 100644 index 0000000000000000000000000000000000000000..0424b487f8b17b1f305b05def56db4c68415e10e --- /dev/null +++ b/functions.py @@ -0,0 +1,92 @@ +import epitran + +def to_lroh(s): + s = s.replace('ɖ', 'ḍ') + s = s.replace('ɾ', 'r') + s = s.replace('ɽ', 'ṛ') + s = s.replace('ʃ', 'š') + s = s.replace('ʈ', 'ṭ') + s = s.replace('j', 'y') + s = s.replace('d͡ʒ', 'j') + + + + s = s.replace('ɑ̃ː', 'ɑ̃ɑ') + s = s.replace('ɑː', 'ɑɑ') + s = s.replace('ẽː', 'eẽ') + s = s.replace('eː', 'ee') + s = s.replace('ĩː', 'iĩ') + s = s.replace('iː', 'ii') + s = s.replace('ɔ̃ː', 'ɔ̃ɔ') + s = s.replace('ɔː', 'ɔɔ') + s = s.replace('ũː', 'uũ') + s = s.replace('uː', 'uu') + + s = s.replace('ɑ', 'a') + s = s.replace('̃ɑ', 'ã') + + s = s.replace('ɔ̃', 'õ') + s = s.replace('ɔ', 'o') + + return s + +def to_roheng(s): + s = s.replace('ɖ', 'dh') + s = s.replace('ɾ', 'r') + s = s.replace('ɽ', 'ç') + s = s.replace('ʃ', 'c') + s = s.replace('ʈ', 'th') + s = s.replace('j', 'y') + s = s.replace('d͡ʒ', 'j') + + s = s.replace('ɑ', 'a') + s = s.replace('̃ɑ', 'ã') + + s = s.replace('ɑ̃ː', 'ɑ̃ɑ') + s = s.replace('ɑː', 'ɑɑ') + s = s.replace('ẽː', 'eẽ') + s = s.replace('eː', 'ee') + s = s.replace('ĩː', 'iĩ') + s = s.replace('iː', 'ii') + s = s.replace('ɔ̃ː', 'ɔ̃ɔ') + s = s.replace('ɔː', 'ɔɔ') + s = s.replace('ũː', 'uũ') + s = s.replace('uː', 'uu') + + s = s.replace('ɑ', 'a') + s = s.replace('̃ɑ', 'ã') + + s = s.replace('ɔ̃', 'õ') + s = s.replace('ɔ', 'o') + + return s + +def convert_script(input_script, output_script, input_text): + epi = epitran.Epitran(input_script) + + # store indices for capitalized words (will assume only first letter is capitalized) + words = input_text.split() + capital_indices = [i for i, word in enumerate(words) if word[0].isupper()] + + grapheme_text = epi.transliterate(input_text) + + if output_script == 'rhg-roheng': + inter_text = to_roheng(grapheme_text) + elif output_script == 'rhg-lroh': + inter_text = to_lroh(grapheme_text) + + # reapply capitalization + words = inter_text.split() + for i in capital_indices: + if i < len(words): + words[i] = words[i].capitalize() + output_text = ' '.join(words) + + return output_text + +# print (f'Number of script mismatches: {numScriptMismatch} / {numEntries}') + +# issues +# +# ou +# glides with only one vowel nasalized (i.e is the whole glide always nasalized) (.e.g thiañ/ṭĩya) - need a constant way to deal with glides and nasalization (i.e. which vowel is nasalized?) diff --git a/output.txt b/output.txt new file mode 100644 index 0000000000000000000000000000000000000000..2adc19c576f0a8a9d9af0eb9ec52dcf7de2ba926 --- /dev/null +++ b/output.txt @@ -0,0 +1 @@ +Tandil hodeyan fukorzoria odeyan tuãra hoi faro ne? Tandilor fuk okkol bicci cõrode etolla boli suke daha no zar. Fuk iin dase nakortun mukortun galortun goli ore gā buture bari fecwamaze sai zagyoi. E fuk iin ekzonortu ar ekzonor hãse fara. Ekbar hacile ekbar ãcaile tuãrar galortu nakortu kuti kuti fuk okkol bairo. Zehon fua waye acaierar fuk iin dase mayafua mukotmade sai foje. Mayafuar mukotun fuk iin nakbai galbai goli bari feshwa maze fuk iin saizargoi. Raito maze fuk iin bariore bicci oigyoi etolla boli mayafua biaramya oigyoi ebala. 
Toile fuk iin fara mesal ibar mukormaze fuk ase bafe dojerar mayafuar mukortun fuk iin faraye bafor atot. Toi fuk hode iin jinic okkolortuno fara. Fun maye loie martuno fuk faragyoi. Mayafuatun fuk bariore bafmar erio faraye. Hodinbade gororgucitiãtun biaram oigioi. Sefotuno fuk iin fara. Ho sor hombol okkolotuno fuk fara. Endoila gori fuk fara. Gororgucitiã okkolotun fuk faraiore ar biaram oigyoi. Itara oinor zonorio biaramyã gori falai fare. Fukor babute zani fari bade itarar solaifira adot hasolot boduli falaye. Maye gura baica muk fũsibolla alada sap hor estamal gore. Ibaye sabondiore ho sor hombol okkol due. Roitormare fuadde belor gorome fuk more. Ya bafio sef falaite sai si ti fala. Maince zere ni ki aça uça no gore eçe sef falar. Fukor babute zani fari at duio falaibade muk fũse. Etola funormare fuk no fara ar funormare fuk nai. Bafe biaramyã zerfuar muk dori bade atore sabondiore dui fala nizor muk doribar age. Itara ehon begune gonno gonno tarar hatore due. Etolla yanor zoria biaramwa bicci hom oizargyoi itaratun. Biaramyã fuawayo fukor bafote zani fari bade ebola ite ãcaile hacile nakor mukor guri de. Etola mayafua eçe fuk no fara. Etola mayafuartun gorormanco eçe fuk no fara. Itarar solarfira adot hasilot boduli falaye. Itara zodi etorika mozin solile ar manile biaram ciaram no oibo: hacile ar ãcaile nakore mukore guri dio! Hato kolore sabondiore bicci bicci gori duio! Ãça uça no gorede zagamare sef falayo! Cattat made homot ekbar ho sor hombol okkolore duio! Hamica hombol okkolore roitormare fua dio. Iinore manile fuk okkol no faraibo ar hono biaram ciaram no oibo. \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..84036e00c45435caca6f90dc190c29ffb3ef8258 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +gradio +panphon +setuptools +regex +marisa-trie +requests \ No newline at end of file
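For a quick local smoke test of the converter as wired together above (names taken from functions.py and config.py; this assumes the vendored epitran package can load the rhg-* mapping data):

    from functions import convert_script
    from config import scripts

    # Convert a short Rohingyalish sample to the LearnRohingya script.
    text = 'Tandil hodeyan'
    print(convert_script(scripts['Rohingyalish'], scripts['LearnRohingya'], text))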