Spaces:
Sleeping
Sleeping
Initial file upload
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- LICENSE +21 -0
- README.md +1 -13
- __pycache__/config.cpython-310.pyc +0 -0
- __pycache__/functions.cpython-310.pyc +0 -0
- app.py +37 -0
- config.py +5 -0
- epitran/__init__.py +2 -0
- epitran/__pycache__/__init__.cpython-310.pyc +0 -0
- epitran/__pycache__/__init__.cpython-311.pyc +0 -0
- epitran/__pycache__/_epitran.cpython-310.pyc +0 -0
- epitran/__pycache__/_epitran.cpython-311.pyc +0 -0
- epitran/__pycache__/cedict.cpython-310.pyc +0 -0
- epitran/__pycache__/download.cpython-310.pyc +0 -0
- epitran/__pycache__/epihan.cpython-310.pyc +0 -0
- epitran/__pycache__/exceptions.cpython-310.pyc +0 -0
- epitran/__pycache__/flite.cpython-310.pyc +0 -0
- epitran/__pycache__/ligaturize.cpython-310.pyc +0 -0
- epitran/__pycache__/ppprocessor.cpython-310.pyc +0 -0
- epitran/__pycache__/puncnorm.cpython-310.pyc +0 -0
- epitran/__pycache__/reromanize.cpython-310.pyc +0 -0
- epitran/__pycache__/rules.cpython-310.pyc +0 -0
- epitran/__pycache__/simple.cpython-310.pyc +0 -0
- epitran/__pycache__/stripdiacritics.cpython-310.pyc +0 -0
- epitran/__pycache__/xsampa.cpython-310.pyc +0 -0
- epitran/_epitran.py +129 -0
- epitran/backoff.py +89 -0
- epitran/bin/connl2engipaspace.py +79 -0
- epitran/bin/connl2ipaspace.py +100 -0
- epitran/bin/decompose.py +13 -0
- epitran/bin/detectcaps.py +25 -0
- epitran/bin/epitranscribe.py +26 -0
- epitran/bin/isbijective.py +31 -0
- epitran/bin/ltf2ipaspace.py +53 -0
- epitran/bin/migraterules.py +40 -0
- epitran/bin/reromanize.py +22 -0
- epitran/bin/space2punc.py +24 -0
- epitran/bin/testvectorgen.py +35 -0
- epitran/bin/transltf.py +20 -0
- epitran/bin/uigtransliterate.py +10 -0
- epitran/bin/vie-tones.py +44 -0
- epitran/cedict.py +76 -0
- epitran/data/arpabet.csv +46 -0
- epitran/data/ipa-xsampa.csv +175 -0
- epitran/data/map/rhg-lroh.csv +33 -0
- epitran/data/map/rhg-roheng.csv +35 -0
- epitran/data/post/rhg-lroh.txt +19 -0
- epitran/data/post/rhg-roheng.txt +14 -0
- epitran/data/pre/rhg-lroh.txt +17 -0
- epitran/data/pre/rhg-roheng.txt +13 -0
- epitran/data/puncnorm.csv +9 -0
LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2023 Micah Geyman
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
README.md
CHANGED
@@ -1,13 +1 @@
|
|
1 |
-
|
2 |
-
title: Rhg Script Converter Ui
|
3 |
-
emoji: 👁
|
4 |
-
colorFrom: blue
|
5 |
-
colorTo: blue
|
6 |
-
sdk: gradio
|
7 |
-
sdk_version: 4.8.0
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
-
license: mit
|
11 |
-
---
|
12 |
-
|
13 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
+
# rhg-script-converter-ui
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__pycache__/config.cpython-310.pyc
ADDED
Binary file (275 Bytes). View file
|
|
__pycache__/functions.cpython-310.pyc
ADDED
Binary file (2.18 kB). View file
|
|
app.py
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
from functions import convert_script
from config import scripts

# The first two configured scripts serve as the default conversion direction.
DEFAULT_INPUT_SCRIPT = list(scripts.keys())[0]
DEFAULT_OUTPUT_SCRIPT = list(scripts.keys())[1]

def process_text(input_script, output_script, input_text, uploaded_file=None):
    """Convert text between Rohingya scripts.

    :param input_script: display name of the source script (key of `scripts`)
    :param output_script: display name of the target script (key of `scripts`)
    :param input_text: text typed into the textbox
    :param uploaded_file: optional raw bytes of an uploaded file; when
        present it takes precedence over `input_text` and is decoded as UTF-8
    :return: (converted text, path of a file containing it for download)
    """
    if uploaded_file is not None:
        input_text = uploaded_file.decode("utf-8")

    output_text = convert_script(scripts[input_script], scripts[output_script], input_text)

    output_filename = "output.txt"
    # Write explicitly as UTF-8: the converted text is non-ASCII and the
    # platform default encoding may not be able to represent it.
    with open(output_filename, "w", encoding="utf-8") as file:
        file.write(output_text)

    return output_text, output_filename

with gr.Blocks(title="Rohingya Script Converter") as page:
    gr.Markdown("## Rohingya Script Converter")
    with gr.Row():
        # Materialize the dict views as lists: Gradio expects a list of choices,
        # and dict_keys views are not reliably accepted.
        input_script = gr.Dropdown(label="Choose the input script:", choices=list(scripts.keys()), value=DEFAULT_INPUT_SCRIPT)
        output_script = gr.Dropdown(label="Choose the output script:", choices=list(scripts.keys()), value=DEFAULT_OUTPUT_SCRIPT)
    with gr.Row():
        input_text = gr.Textbox(label="Input Text", placeholder="Enter text here or upload a file", lines=5)
        output_text = gr.Textbox(label="Output Text", placeholder="Converted text will appear here", lines=5, interactive=False)
    with gr.Row():
        input_file = gr.File(label="Upload Text File", file_count="single", type="binary")
        download_link = gr.File(label="Download Converted File")
    gr.Button("Convert").click(
        process_text,
        inputs=[input_script, output_script, input_text, input_file],
        outputs=[output_text, download_link]
    )

page.launch(share=True)
|
config.py
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Mapping from human-readable script names (shown in the UI dropdowns in
# app.py) to the epitran language-script codes used for conversion.
scripts = {
    'LearnRohingya':'rhg-lroh',
    'Rohingyalish':'rhg-roheng',
    'Rohingyalish (old)':'rhg-roheng-old'
}
|
epitran/__init__.py
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
from epitran._epitran import Epitran
|
2 |
+
from epitran.reromanize import ReRomanizer
|
epitran/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (274 Bytes). View file
|
|
epitran/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (314 Bytes). View file
|
|
epitran/__pycache__/_epitran.cpython-310.pyc
ADDED
Binary file (6.7 kB). View file
|
|
epitran/__pycache__/_epitran.cpython-311.pyc
ADDED
Binary file (8.71 kB). View file
|
|
epitran/__pycache__/cedict.cpython-310.pyc
ADDED
Binary file (2.93 kB). View file
|
|
epitran/__pycache__/download.cpython-310.pyc
ADDED
Binary file (1.29 kB). View file
|
|
epitran/__pycache__/epihan.cpython-310.pyc
ADDED
Binary file (4.22 kB). View file
|
|
epitran/__pycache__/exceptions.cpython-310.pyc
ADDED
Binary file (577 Bytes). View file
|
|
epitran/__pycache__/flite.cpython-310.pyc
ADDED
Binary file (8.39 kB). View file
|
|
epitran/__pycache__/ligaturize.cpython-310.pyc
ADDED
Binary file (781 Bytes). View file
|
|
epitran/__pycache__/ppprocessor.cpython-310.pyc
ADDED
Binary file (2.14 kB). View file
|
|
epitran/__pycache__/puncnorm.cpython-310.pyc
ADDED
Binary file (1.88 kB). View file
|
|
epitran/__pycache__/reromanize.cpython-310.pyc
ADDED
Binary file (2.53 kB). View file
|
|
epitran/__pycache__/rules.cpython-310.pyc
ADDED
Binary file (4.85 kB). View file
|
|
epitran/__pycache__/simple.cpython-310.pyc
ADDED
Binary file (14.7 kB). View file
|
|
epitran/__pycache__/stripdiacritics.cpython-310.pyc
ADDED
Binary file (1.91 kB). View file
|
|
epitran/__pycache__/xsampa.cpython-310.pyc
ADDED
Binary file (2.24 kB). View file
|
|
epitran/_epitran.py
ADDED
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
import logging
|
3 |
+
from typing import Union
|
4 |
+
|
5 |
+
import panphon.featuretable
|
6 |
+
from epitran.epihan import Epihan, EpihanTraditional
|
7 |
+
from epitran.flite import FliteLexLookup
|
8 |
+
from epitran.puncnorm import PuncNorm
|
9 |
+
from epitran.simple import SimpleEpitran
|
10 |
+
from epitran.xsampa import XSampa
|
11 |
+
|
12 |
+
logger = logging.getLogger('epitran')
|
13 |
+
logger.setLevel(logging.WARNING)
|
14 |
+
|
15 |
+
class Epitran(object):
    """Unified interface for IPA transliteration/transcription

    :param code str: ISO 639-3 plus "-" plus ISO 15924 code of the language/script pair that should be loaded
    :param preproc bool: apply preprocessors
    :param postproc bool: apply postprocessors
    :param ligatures bool: use precomposed ligatures instead of standard IPA
    :param cedict_filename str: path to file containing the CC-CEDict dictionary
    :param rev boolean: use reverse transliteration
    :param rev_preproc bool: if True, apply preprocessors when reverse transliterating
    :param rev_postproc bool: if True, apply postprocessors when reverse transliterating
    """
    # Language-script pairs that need a dedicated backend instead of SimpleEpitran.
    special = {'eng-Latn': FliteLexLookup,
               'cmn-Hans': Epihan,
               'cmn-Hant': EpihanTraditional}

    def __init__(self, code: str, preproc: bool=True, postproc: bool=True, ligatures: bool=False,
                 cedict_file: Union[str, None]=None, rev: bool=False,
                 rev_preproc: bool=True, rev_postproc: bool=True, tones: bool=False):
        """Constructor method"""
        if code in self.special:
            self.epi = self.special[code](ligatures=ligatures, cedict_file=cedict_file, tones=tones)
        else:
            self.epi = SimpleEpitran(code, preproc, postproc, ligatures, rev, rev_preproc, rev_postproc, tones=tones)
        self.ft = panphon.featuretable.FeatureTable()
        self.xsampa = XSampa()
        self.puncnorm = PuncNorm()

    def transliterate(self, word: str, normpunc: bool=False, ligatures: bool=False) -> str:
        """Transliterates/transcribes a word into IPA

        :param word str: word to transcribe
        :param normpunc bool: if True, normalize punctuation
        :param ligatures bool: if True, use precomposed ligatures instead of standard IPA
        :return: An IPA string corresponding to the input orthographic string
        :rtype: str
        """
        return self.epi.transliterate(word, normpunc, ligatures)

    def reverse_transliterate(self, ipa: str) -> str:
        """Reconstructs word from IPA. Does the reverse of transliterate()

        :param ipa str: An IPA representation of a word
        :return: An orthographic representation of the word
        :rtype: str
        """
        return self.epi.reverse_transliterate(ipa)

    def strict_trans(self, word: str, normpunc: bool=False, ligatures: bool=False) -> str:
        """Transliterate a word into IPA, ignoring all characters that cannot be recognized.

        :param word str: word to transcribe
        :param normpunc bool, optional: if True, normalize punctuation
        :param ligatures bool, optional: if True, use precomposed ligatures instead of standard IPA
        :return: An IPA string corresponding to the input orthographic string, with all unconverted characters omitted
        :rtype: str
        """
        return self.epi.strict_trans(word, normpunc, ligatures)

    def trans_list(self, word: str, normpunc: bool=False, ligatures: bool=False) -> "list[str]":
        """Transliterates/transcribes a word into list of IPA phonemes

        :param word str: word to transcribe
        :param normpunc bool, optional: if True, normalize punctuation
        :param ligatures bool, optional: if True, use precomposed ligatures instead of standard IPA
        :return: list of IPA strings, each corresponding to a segment
        :rtype: list[str]
        """
        return self.ft.segs_safe(self.epi.transliterate(word, normpunc, ligatures))

    def trans_delimiter(self, text: str, delimiter: str=str(' '), normpunc: bool=False, ligatures: bool=False):
        """Return IPA transliteration with a delimiter between segments

        :param text str: An orthographic text
        :param delimiter str, optional: A string to insert between segments
        :param normpunc bool, optional: If True, normalize punctuation
        :param ligatures bool, optional: If True, use precomposed ligatures instead of standard IPA
        :return: String of IPA phonemes separated by `delimiter`
        :rtype: str
        """
        return delimiter.join(self.trans_list(text, normpunc=normpunc,
                                              ligatures=ligatures))

    def xsampa_list(self, word: str, normpunc: bool=False, ligaturize: bool=False):
        """Transliterates/transcribes a word as X-SAMPA

        :param word str: An orthographic word
        :param normpunc bool, optional: If True, normalize punctuation
        :param ligatures bool, optional: If True, use precomposed ligatures instead of standard IPA
        :return: List of X-SAMPA strings corresponding to `word`
        :rtype: list[str]
        """
        # strict_trans so unrecognized characters never reach the X-SAMPA mapper.
        ipa_segs = self.ft.ipa_segs(self.epi.strict_trans(word, normpunc,
                                                          ligaturize))
        return list(map(self.xsampa.ipa2xs, ipa_segs))

    def word_to_tuples(self, word: str, normpunc: bool=False, _ligaturize: bool=False):
        """Given a word, returns a list of tuples corresponding to IPA segments. The "feature
        vectors" form a list consisting of (segment, vector) pairs.
        For IPA segments, segment is a substring of phonetic_form such that the
        concatenation of all segments in the list is equal to the phonetic_form.
        The vectors are a sequence of integers drawn from the set {-1, 0, 1}
        where -1 corresponds to '-', 0 corresponds to '0', and 1 corresponds to
        '+'.

        :param word str: An orthographic word
        :param normpunc bool, optional: If True, normalize punctuation
        :param ligatures bool, optional: If True, use precomposed ligatures instead of standard IPA
        :return: A list of tuples corresponding to IPA segments
        :rtype: list[tuple[str, str, str, str, list[int]]]
        """
        try:
            return self.epi.word_to_tuples(word, normpunc)
        except AttributeError as err:
            # Chain from the caught exception instance; the original chained
            # `from AttributeError` (the class), losing the real traceback.
            raise AttributeError('Method word_to_tuples not yet implemented for this language-script pair!') from err
|
epitran/backoff.py
ADDED
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
from __future__ import (print_function, absolute_import,
|
3 |
+
unicode_literals)
|
4 |
+
|
5 |
+
import regex as re
|
6 |
+
from . import _epitran
|
7 |
+
import panphon.featuretable
|
8 |
+
from epitran.puncnorm import PuncNorm
|
9 |
+
from epitran.xsampa import XSampa
|
10 |
+
from epitran.stripdiacritics import StripDiacritics
|
11 |
+
|
12 |
+
|
13 |
+
class Backoff(object):
    """Implements rudimentary language ID and backoff."""

    def __init__(self, lang_script_codes, cedict_file=None):
        """Construct a Backoff object.

        Args:
            lang_script_codes (list): codes for languages to try, starting
                with the highest priority languages
            cedict_file (str): path to the CC-CEdict dictionary file
                (necessary only when cmn-Hans or cmn-Hant are used)
        """
        self.langs = [_epitran.Epitran(c, cedict_file=cedict_file)
                      for c in lang_script_codes]
        self.num_re = re.compile(r'\p{Number}+')
        self.ft = panphon.featuretable.FeatureTable()
        self.xsampa = XSampa()
        self.puncnorm = PuncNorm()
        # One diacritic stripper per language, index-aligned with self.langs.
        self.dias = [StripDiacritics(c) for c in lang_script_codes]

    def transliterate(self, token):
        """Return IPA transliteration given by first acceptable mode.
        Args:
            token (unicode): orthographic text
        Returns:
            unicode: transliteration as Unicode IPA string
        """
        tr_list = []
        while token:
            is_outside_lang = True
            # Try each language in priority order; greedily consume as much
            # of the token as that language's regexp will match.
            for dia, lang in zip(self.dias, self.langs):
                source = ''
                while True:
                    m = lang.epi.regexp.match(dia.process(token))
                    if not m:
                        break
                    s = m.group()
                    token = token[len(s):]
                    source += s
                    is_outside_lang = False
                tr_list.append(lang.transliterate(source))
            if is_outside_lang:
                # No language matched at all: pass through digit runs
                # verbatim; otherwise emit one raw character and continue.
                m = re.match(r'\p{Number}+', token)
                if m:
                    source = m.group()
                    tr_list.append(source)
                    token = token[len(source):]
                else:
                    tr_list.append(token[0])
                    token = token[1:]
        return ''.join(tr_list)

    def trans_list(self, token):
        """Transliterate/transcribe a word into list of IPA phonemes.

        Args:
            token (unicode): word to transcribe; unicode string

        Returns:
            list: list of IPA unicode strings, each corresponding to a segment
        """
        return self.ft.segs_safe(self.transliterate(token))

    def xsampa_list(self, token):
        """Transcribe a word into a list of X-SAMPA phonemes.

        Args:
            token (unicode): word to transcribe; unicode strings

        Returns:
            list: list of X-SAMPA strings, each corresponding to a segment
        """
        # NOTE(review): pure-digit tokens return '' (a string), not [] —
        # inconsistent with the documented list return, kept for compatibility.
        if re.match(r'^\p{Number}+$', token):
            return ''
        else:
            ipa_segs = self.ft.ipa_segs(self.transliterate(token))
            return list(map(self.xsampa.ipa2xs, ipa_segs))
|
epitran/bin/connl2engipaspace.py
ADDED
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
|
3 |
+
import argparse
|
4 |
+
import codecs
|
5 |
+
import logging
|
6 |
+
from collections import Counter
|
7 |
+
|
8 |
+
import unicodecsv as csv
|
9 |
+
|
10 |
+
import epitran
|
11 |
+
import epitran.flite
|
12 |
+
import panphon
|
13 |
+
|
14 |
+
logger = logging.getLogger('epitran')
|
15 |
+
|
16 |
+
|
17 |
+
def normpunc(flite, s):
    """Return *s* with each character mapped through flite's puncnorm table."""
    def norm(c):
        if c in flite.puncnorm:
            return flite.puncnorm[c]
        else:
            return c
    return ''.join(map(norm, s))


def add_record(flite, ft, orth):
    """Count the IPA segments of one orthographic token.

    Consumes the transliteration greedily by longest single-segment prefix;
    characters that form no segment are counted individually.
    """
    space = Counter()
    orth = normpunc(flite, orth)
    trans = flite.transliterate(orth)
    while trans:
        pref = ft.longest_one_seg_prefix(trans)
        if pref != '':
            space[pref] += 1
            trans = trans[len(pref):]
        else:
            # The original tested `trans[0] in flite.puncnorm_vals` here, but
            # both branches incremented the same key — the test was redundant.
            space[trans[0]] += 1
            trans = trans[1:]
    return space


def add_file(flite, ft, fn):
    """Accumulate segment counts over the first column of a CONLL file."""
    space = Counter()
    with codecs.open(fn, 'r', 'utf-8') as f:
        for line in f:
            fields = line.split(u'\t')
            if len(fields) > 0:
                orth = fields[0]
                space.update(add_record(flite, ft, orth))
    logger.debug(u'Length of counter:\t{}'.format(len(space)))
    return space


def print_space(output, space):
    """Write (index, segment) CSV rows for the sorted, non-empty segments."""
    pairs = enumerate(sorted(filter(lambda x: x, space.keys())))
    with open(output, 'wb') as f:
        writer = csv.writer(f, encoding='utf-8')
        for i, char in pairs:
            writer.writerow((i, char))


def main(infiles, output):
    """Build the English IPA segment space from CONLL files and save it."""
    flite = epitran.flite.Flite()
    ft = panphon.FeatureTable()
    space = Counter()
    for fn in infiles:
        logger.debug(u'Scanning:\t{}'.format(fn).encode('utf-8'))
        space.update(add_file(flite, ft, fn))
    print_space(output, space)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '--output', help='Output file.')
    parser.add_argument('infiles', nargs='+', help='CONLL files serving as basis for segment space.')
    args = parser.parse_args()
    main(args.infiles, args.output)
|
epitran/bin/connl2ipaspace.py
ADDED
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
|
3 |
+
import argparse
|
4 |
+
import codecs
|
5 |
+
import logging
|
6 |
+
from collections import Counter
|
7 |
+
|
8 |
+
import epitran
|
9 |
+
import panphon
|
10 |
+
import unicodecsv as csv
|
11 |
+
|
12 |
+
logger = logging.getLogger('epitran')
|
13 |
+
|
14 |
+
|
15 |
+
def normpunc(epi, s):
    """Return *s* with each character mapped through epi's puncnorm table."""
    def norm(c):
        if c in epi.puncnorm:
            return epi.puncnorm[c]
        else:
            return c
    return ''.join(map(norm, s))


def add_record_gen(epi, ft, orth):
    """Count the IPA segments of one token (general-script variant)."""
    space = Counter()
    orth = normpunc(epi, orth)
    trans = epi.transliterate(orth)
    while trans:
        # Consume the longest single-segment prefix; fall back to one char.
        pref = ft.longest_one_seg_prefix(trans)
        if pref != '':
            space[pref] += 1
            trans = trans[len(pref):]
        else:
            space[trans[0]] += 1
            trans = trans[1:]
    return space


def add_file_gen(epi, ft, fn):
    """Accumulate segment counts over the first column of a CONLL file."""
    space = Counter()
    with codecs.open(fn, 'r', 'utf-8') as f:
        for line in f:
            fields = line.split(u'\t')
            if len(fields) > 0:
                orth = fields[0]
                space.update(add_record_gen(epi, ft, orth))
    logger.debug(u'Length of counter:\t{}'.format(len(space)))
    return space


def add_file_op(epi, ft, fn):
    """Like add_file_gen, but for scripts using punctuation as letters:
    punctuation is normalized per leftover character at counting time
    instead of before transliteration.
    """
    space = Counter()
    with codecs.open(fn, 'r', 'utf-8') as f:
        for line in f:
            fields = line.split(u'\t')
            if len(fields) > 0:
                orth = fields[0]
                trans = epi.transliterate(orth)
                while trans:
                    pref = ft.longest_one_seg_prefix(trans)
                    if pref != '':
                        space[pref] += 1
                        trans = trans[len(pref):]
                    else:
                        if trans[0] in epi.puncnorm:
                            space[epi.puncnorm[trans[0]]] += 1
                        else:
                            space[trans[0]] += 1
                        trans = trans[1:]
    logger.debug(u'Length of counter:\t{}'.format(len(space)))
    return space


def print_space(output, space):
    """Write (index, segment) CSV rows for the sorted, non-empty segments."""
    pairs = enumerate(sorted(filter(lambda x: x, space.keys())))
    with open(output, 'wb') as f:
        writer = csv.writer(f, encoding='utf-8')
        for i, char in pairs:
            writer.writerow((i, char))


def main(code, op, infiles, output):
    """Build the segment space for language *code* from CONLL files."""
    epi = epitran.Epitran(code)
    ft = panphon.FeatureTable()
    space = Counter()
    for fn in infiles:
        logger.debug(u'Scanning:\t{}'.format(fn).encode('utf-8'))
        # --op selects the punctuation-as-letters variant.
        add_file = add_file_op if op else add_file_gen
        space.update(add_file(epi, ft, fn))
    print_space(output, space)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', '--op', action='store_true', help='Script uses punctuation as (parts of) letters.')
    parser.add_argument('-c', '--code', help='Script code for CONNL files.')
    parser.add_argument('-o', '--output', help='Output file.')
    parser.add_argument('infiles', nargs='+', help='CONLL files serving as basis for segment space.')
    args = parser.parse_args()
    main(args.code, args.op, args.infiles, args.output)
|
epitran/bin/decompose.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
|
3 |
+
import unicodedata
|
4 |
+
import sys
|
5 |
+
|
6 |
+
|
7 |
+
def main(fn):
    """Print the NFD-decomposed contents of the UTF-8 file *fn*."""
    with open(fn, encoding='utf-8') as stream:
        decomposed = unicodedata.normalize('NFD', stream.read())
    print(decomposed)


if __name__ == '__main__':
    main(sys.argv[1])
|
epitran/bin/detectcaps.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
from __future__ import print_function
|
4 |
+
|
5 |
+
import unicodedata
|
6 |
+
import fileinput
|
7 |
+
|
8 |
+
|
9 |
+
def main():
|
10 |
+
for line in fileinput.input():
|
11 |
+
line = line.decode('utf-8')
|
12 |
+
token = line.strip()
|
13 |
+
if len(token) > 1 and unicodedata.category(token[1]) == 'Lu':
|
14 |
+
is_cap = 0
|
15 |
+
elif len(token) > 0 and unicodedata.category(token[0]) == 'Lu':
|
16 |
+
is_cap = 1
|
17 |
+
else:
|
18 |
+
is_cap = 0
|
19 |
+
line = u'{}\t{}'.format(is_cap, token)
|
20 |
+
line = line.encode('utf-8')
|
21 |
+
print(line)
|
22 |
+
|
23 |
+
|
24 |
+
if __name__ == '__main__':
|
25 |
+
main()
|
epitran/bin/epitranscribe.py
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
import sys
|
5 |
+
import unicodedata
|
6 |
+
import epitran
|
7 |
+
import argparse
|
8 |
+
|
9 |
+
|
10 |
+
def main(code):
    """Stream STDIN through an Epitran transliterator for *code*,
    lowercasing and NFD-normalizing each line, writing IPA to STDOUT.

    The original decoded/encoded each line (Python 2 idiom); on Python 3
    sys.stdin yields str and str.decode raises AttributeError, so the
    round-trip is removed.
    """
    epi = epitran.Epitran(code)
    for line in sys.stdin:
        line = unicodedata.normalize('NFD', line.lower())
        sys.stdout.write(epi.transliterate(line))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description=u'Converts text from STDIN (in the language specified),' +
        'into Unicode IPA and emits it to STDOUT.')
    parser.add_argument('code', help=u'ISO 639-3 code for conversion language')
    args = parser.parse_args()
    main(args.code)
|
epitran/bin/isbijective.py
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env pythoh
|
2 |
+
from __future__ import print_function
|
3 |
+
|
4 |
+
import glob
|
5 |
+
|
6 |
+
import unicodecsv as csv
|
7 |
+
|
8 |
+
|
9 |
+
def read_map(fn):
    """Return the (source, target) pairs from a two-column mapping CSV.

    The first row is assumed to be a header and is skipped. Rows with
    other than exactly two columns would raise ValueError here.
    """
    with open(fn, 'rb') as f:
        reader = csv.reader(f, encoding='utf-8')
        next(reader)
        return [(a, b) for [a, b] in reader]
|
14 |
+
|
15 |
+
|
16 |
+
def is_bijection(mapping):
    """True iff the pair list maps distinct keys to distinct values."""
    left, right = zip(*mapping)
    return len(set(left)) == len(mapping) == len(set(right))
|
20 |
+
|
21 |
+
|
22 |
+
def main(map_fns):
    """Report, for each mapping file, whether its mapping is a bijection."""
    for path in map_fns:
        print('{}\t{}'.format(path, is_bijection(read_map(path))))


if __name__ == '__main__':
    map_fns = glob.glob('../data/*.csv')
    main(map_fns)
|
epitran/bin/ltf2ipaspace.py
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
from __future__ import print_function
|
4 |
+
|
5 |
+
import argparse
|
6 |
+
import glob
|
7 |
+
import os.path
|
8 |
+
|
9 |
+
from lxml import etree
|
10 |
+
import unicodecsv as csv
|
11 |
+
|
12 |
+
import epitran
|
13 |
+
import panphon.featuretable
|
14 |
+
|
15 |
+
|
16 |
+
def read_tokens(fn):
    """Return the text of every TOKEN element in an LTF XML file."""
    tree = etree.parse(fn)
    root = tree.getroot()
    return [tok.text for tok in root.findall('.//TOKEN')]


def read_input(input_, langscript):
    """Collect the set of IPA segments over all LTF files.

    `input_` is a list of lists of directory names (argparse nargs='+'
    combined with action='append'); note only the first group is scanned.
    """
    space = set()
    epi = epitran.Epitran(langscript)
    ft = panphon.featuretable.FeatureTable()
    for dirname in input_[0]:
        for fn in glob.glob(os.path.join(dirname, '*.ltf.xml')):
            for token in read_tokens(fn):
                ipa = epi.transliterate(token)
                for seg in ft.segs_safe(ipa):
                    space.add(seg)
    return space


def write_output(output, space):
    """Write (index, segment) CSV rows for the sorted segment set."""
    with open(output, 'wb') as f:
        writer = csv.writer(f, encoding='utf-8')
        for n, ch in enumerate(sorted(list(space))):
            writer.writerow((n, ch))


def main(langscript, input_, output):
    """Build and save the IPA segment space for *langscript*."""
    space = read_input(input_, langscript)
    write_output(output, space)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--code', help='language-script code')
    parser.add_argument('-i', '--input', nargs='+', action='append', help='Directories where input LTF files are found')
    parser.add_argument('-o', '--output', help='Output file')
    args = parser.parse_args()
    main(args.code, args.input, args.output)
|
epitran/bin/migraterules.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env Python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
from __future__ import (print_function, unicode_literals, absolute_import)
|
5 |
+
|
6 |
+
import glob
|
7 |
+
import re
|
8 |
+
import io
|
9 |
+
|
10 |
+
import unicodecsv
|
11 |
+
|
12 |
+
|
13 |
+
def build_rule(fields):
    """Format a 4-field CSV row as a context rule 'a -> b / X _ Y'.

    Empty source/target fields render as "0" (the null symbol);
    malformed rows are reported and yield None.
    """
    try:
        src, tgt, left, right = fields
    except ValueError:
        print('Malformed rule: {}'.format(','.join(fields)))
        return None
    src = src if src else "0"
    tgt = tgt if tgt else "0"
    return '{} -> {} / {} _ {}'.format(src, tgt, left, right)
|
21 |
+
|
22 |
+
|
23 |
+
def main():
    """Convert every rule CSV in the working directory into a .txt rule file.

    Comment rows (first field starting with '%') are copied through;
    other rows are rendered as context rules via build_rule().
    """
    for csv in glob.glob('*.csv'):
        txt = re.match('[A-Za-z-]+', csv).group(0) + '.txt'
        with open(csv, 'rb') as f, io.open(txt, 'w', encoding='utf-8') as g:
            reader = unicodecsv.reader(f, encoding='utf-8')
            next(reader)  # skip header row
            for fields in reader:
                # Raw string: '\s' in a plain literal is an invalid escape
                # (SyntaxWarning, and an error on recent Python versions).
                if re.match(r'\s*%', fields[0]):
                    print(','.join([x for x in fields if x]), file=g)
                else:
                    rule = build_rule(fields)
                    rule = re.sub('[ ]+', ' ', rule)
                    rule = re.sub('[ ]$', '', rule)
                    print(rule, file=g)


if __name__ == '__main__':
    main()
|
epitran/bin/reromanize.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python2
|
2 |
+
|
3 |
+
from __future__ import print_function
|
4 |
+
|
5 |
+
import epitran.reromanize
|
6 |
+
import argparse
|
7 |
+
import sys
|
8 |
+
|
9 |
+
def main(code, table):
    """Re-romanize tab-separated tokens read from stdin.

    Args:
        code (str): language-script code (e.g. 'ori-Orya')
        table (str): name of the romanization table to apply
    """
    rr = epitran.reromanize.ReRomanizer(code, table)
    for line in sys.stdin:
        # NOTE: Python 2 idiom -- stdin yields bytes, decoded/encoded by hand
        # (the shebang pins python2).
        line = line.decode('utf-8')
        tokens = line.strip().split('\t')
        tokens = [rr.reromanize(x) for x in tokens]
        print('\t'.join(tokens).encode('utf-8'))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # Fixed help-text typo: "Languagee" -> "Language".
    parser.add_argument('-c', '--code', default='ori-Orya', type=str, help='Language and script code')
    parser.add_argument('-t', '--table', default='anglocentric', type=str, help='Romanization table')
    args = parser.parse_args()
    main(args.code, args.table)
|
epitran/bin/space2punc.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
|
3 |
+
import sys
|
4 |
+
import unicodedata
|
5 |
+
import unicodecsv as csv
|
6 |
+
|
7 |
+
|
8 |
+
def main(fns, fnn):
    """Collect single-character punctuation marks from mapping CSVs.

    Args:
        fns (list): input CSV paths whose rows are (orth, symbol) pairs
        fnn (str): output CSV path; one punctuation mark per row
    """
    punc = set()
    for fn in fns:
        # Progress indicator.  Was a Python 2 `print fn` statement, which is
        # a SyntaxError under Python 3; print(fn) behaves identically on both.
        print(fn)
        with open(fn, 'rb') as f:
            reader = csv.reader(f, encoding='utf-8')
            for _, s in reader:
                # Keep single characters whose Unicode category is P* (punctuation).
                if len(s) == 1 and unicodedata.category(s)[0] == u'P':
                    punc.add(s)
    with open(fnn, 'wb') as f:
        writer = csv.writer(f, encoding='utf-8')
        for mark in sorted(list(punc)):
            writer.writerow([mark])


if __name__ == '__main__':
    main(sys.argv[1:-1], sys.argv[-1])
|
epitran/bin/testvectorgen.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
|
3 |
+
from __future__ import print_function
|
4 |
+
|
5 |
+
import argparse
|
6 |
+
import codecs
|
7 |
+
|
8 |
+
import epitran.vector
|
9 |
+
|
10 |
+
|
11 |
+
def main(code, space, infile):
    """Dump phonetic-segment records for the first column of a TSV file.

    For every line of *infile* with at least two tab-separated columns,
    prints the word followed by one record per segment (category, case,
    orthographic form, phonetic form, and feature vector).
    """
    vectorizer = epitran.vector.VectorsWithIPASpace(code, space)
    with codecs.open(infile, 'r', 'utf-8') as handle:
        for line in handle:
            columns = line.split('\t')
            if len(columns) <= 1:
                continue  # skip rows without at least two columns
            word = columns[0]
            print(u"WORD: {}".format(word).encode('utf-8'))
            for cat, case, orth, phon, id_, vector in vectorizer.word_to_segs(word):
                # id_ is unpacked but deliberately not printed.
                print(u"Category: {}".format(cat).encode('utf-8'))
                print(u"Case: {}".format(case).encode('utf-8'))
                print(u"Orthographic: {}".format(orth).encode('utf-8'))
                print(u"Phonetic: {}".format(phon).encode('utf-8'))
                print(u"Vector: {}".format(vector).encode('utf-8'))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--code', required=True, help='Script code.')
    parser.add_argument('-s', '--space', required=True, help='Space.')
    parser.add_argument('-i', '--infile', required=True, help='Input file.')
    args = parser.parse_args()
    main(args.code, args.space, args.infile)
|
epitran/bin/transltf.py
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
from __future__ import print_function
|
3 |
+
|
4 |
+
import sys
|
5 |
+
|
6 |
+
from lxml import etree
|
7 |
+
import epitran
|
8 |
+
import epitran.vector
|
9 |
+
|
10 |
+
def main(fn):
    # Transliterate every TOKEN element of an LTF XML file to IPA, one per line.
    epi = epitran.Epitran('uig-Arab')
    # NOTE(review): vwis is constructed but never used below -- possibly kept
    # for constructor side effects (loading the vector space); confirm before
    # removing.
    vwis = epitran.vector.VectorsWithIPASpace('uig-Arab', ['uig-Arab'])
    tree = etree.parse(fn)
    root = tree.getroot()
    for token in root.findall('.//TOKEN'):
        # print(token.text.encode('utf-8'))
        # NOTE: Python 2 only -- the `unicode` builtin does not exist in Python 3.
        print(epi.transliterate(unicode(token.text)).encode('utf-8'))

if __name__ == '__main__':
    main(sys.argv[1])
|
epitran/bin/uigtransliterate.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
from __future__ import print_function
|
3 |
+
|
4 |
+
import fileinput
|
5 |
+
import epitran
|
6 |
+
|
7 |
+
# Transliterate Uyghur (Arabic script) text from stdin/files to IPA, line by line.
epi = epitran.Epitran('uig-Arab')
for line in fileinput.input():
    # NOTE: Python 2 idiom -- input lines are bytes, decoded/encoded manually;
    # under Python 3 `str` has no .decode and this would fail.
    s = epi.transliterate(line.strip().decode('utf-8'))
    print(s.encode('utf-8'))
|
epitran/bin/vie-tones.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
|
3 |
+
import csv
|
4 |
+
import re
|
5 |
+
import sys
|
6 |
+
import os.path
|
7 |
+
import unicodedata
|
8 |
+
|
9 |
+
|
10 |
+
# Map Vietnamese tone diacritics to Chao tone letters.  Keys must be the NFD
# *combining* marks, since shuffle_tone() normalizes the orthography to NFD.
# The original used spacing acute (U+00B4) and spacing grave (U+0060), which
# never occur in NFD output, so the sac and huyen tones were never detected.
tones = {
    '\u0301': '˧˥',  # combining acute = sac
    '\u0300': '˨˩',  # combining grave = huyen
    '\u0303': '˧˥',  # tilde = nga
    '\u0309': '˧˩˧',  # hook above = hoi
    '\u0323': '˧˩',  # dot below = nang
}


def shuffle_tone(orth, phon):
    """Append the Chao tone letters implied by *orth*'s diacritics to *phon*.

    *orth* is NFD-normalized so tone marks appear as combining characters.
    A syllable containing a vowel but no tone contour gets the mid level
    tone (ngang).
    """
    orth = unicodedata.normalize('NFD', orth)
    if re.search('[aeiouơư]', orth):
        for tone in tones:
            if tone in orth:
                phon += tones[tone]
        if not re.search('[˩˨˧˦˥]', phon):
            phon += '˧'  # no explicit tone mark: level (ngang) tone
    return phon
|
28 |
+
|
29 |
+
|
30 |
+
def main():
    """Rewrite an (orth, phon) CSV with tone letters appended to phon.

    Reads the CSV named by ``sys.argv[1]`` and writes a file with the same
    base name into the current directory, preserving the header row.

    NOTE(review): if the input file already lives in the current directory,
    fnin == fnout and opening it for writing truncates the input before it
    is read -- confirm the intended usage is with inputs from elsewhere.
    """
    fnin = sys.argv[1]
    fnout = os.path.basename(fnin)
    # newline='' is required by the csv module to avoid injecting blank
    # rows on platforms with \r\n line endings.
    with open(fnin, newline='') as fin, open(fnout, 'w', newline='') as fout:
        writer = csv.writer(fout)
        reader = csv.reader(fin)
        header = next(reader)
        writer.writerow(header)
        for orth, phon in reader:
            phon = shuffle_tone(orth, phon)
            writer.writerow([orth, phon])


if __name__ == '__main__':
    main()
|
epitran/cedict.py
ADDED
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
from __future__ import (absolute_import, division, print_function,
|
3 |
+
unicode_literals)
|
4 |
+
|
5 |
+
import codecs
|
6 |
+
|
7 |
+
import marisa_trie
|
8 |
+
import regex as re
|
9 |
+
|
10 |
+
ASCII_CHARS = ''.join([chr(i) for i in range(128)])
|
11 |
+
|
12 |
+
|
13 |
+
class CEDictTrie(object):
    """Trie over the CC-CEDict Chinese-English dictionary.

    Supports longest-prefix lookup and greedy tokenization of hanzi
    strings into dictionary words.
    """
    def __init__(self, cedict_file, traditional=False):
        """Construct a trie over CC-CEDict

        Args:
            cedict_file (str): path to the CC-CEDict dictionary
            traditional (bool): if True, use traditional characters
        """
        self.hanzi = self._read_cedict(cedict_file, traditional=traditional)
        self.trie = self._construct_trie(self.hanzi)

    def _read_cedict(self, cedict_file, traditional=False):
        """Parse CC-CEDict into {hanzi: (pinyin_syllables, english_glosses)}."""
        comment_re = re.compile(r'\s*#')
        lemma_re = re.compile(r'(?P<hanzi>[^]]+) \[(?P<pinyin>[^]]+)\] /(?P<english>.+)/')
        cedict = {}
        with codecs.open(cedict_file, 'r', 'utf-8') as f:
            for line in f:
                if comment_re.match(line):
                    continue
                # Match once per line (the original called lemma_re.match twice).
                match = lemma_re.match(line)
                if match:
                    hanzi = match.group('hanzi').split(' ')
                    pinyin = match.group('pinyin').split(' ')
                    english = match.group('english').split('/')
                    if traditional:
                        cedict[hanzi[0]] = (pinyin, english)  # traditional characters only
                    else:
                        cedict[hanzi[1]] = (pinyin, english)  # simplified characters only.
        return cedict

    def _construct_trie(self, hanzi):
        """Build a marisa RecordTrie keyed on hanzi with UTF-8 pinyin payloads.

        Args:
            hanzi (dict): mapping of hanzi to (pinyin, english) pairs
        """
        pairs = []
        # Iterate the `hanzi` argument (the original ignored it and read
        # self.hanzi, leaving the parameter dead).
        for hz, (py, en) in hanzi.items():
            # Keep only ASCII characters of the joined pinyin (drops tone marks
            # and any stray non-ASCII).
            py = str(''.join(filter(lambda x: x in ASCII_CHARS, ' '.join(py))))
            pairs.append((hz, (py.encode('utf-8'),)))
        return marisa_trie.RecordTrie(str('@s'), pairs)

    def has_key(self, key):
        """Return True if *key* is an entry in the dictionary."""
        return key in self.hanzi

    def prefixes(self, s):
        """Return all dictionary entries that are prefixes of *s*."""
        return self.trie.prefixes(s)

    def longest_prefix(self, s):
        """Return the longest dictionary entry prefixing *s*, or '' if none."""
        prefixes = self.prefixes(s)
        if not prefixes:
            return ''
        return max(prefixes, key=len)

    def tokenize(self, s):
        """Greedily split *s* into longest-prefix dictionary words.

        Characters that begin no dictionary entry are emitted as
        single-character tokens.
        """
        tokens = []
        while s:
            token = self.longest_prefix(s)
            if token:
                tokens.append(token)
                s = s[len(token):]
            else:
                tokens.append(s[0])
                s = s[1:]
        return tokens
|
epitran/data/arpabet.csv
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pau,
|
2 |
+
null,
|
3 |
+
ey,ej
|
4 |
+
ae,æ
|
5 |
+
iy,i
|
6 |
+
eh,ɛ
|
7 |
+
ay,aj
|
8 |
+
ih,ɪ
|
9 |
+
ow,ow
|
10 |
+
aa,ɑ
|
11 |
+
ao,ɔ
|
12 |
+
aw,aw
|
13 |
+
oy,oj
|
14 |
+
ah,ʌ
|
15 |
+
ax,ə
|
16 |
+
uw,u
|
17 |
+
uh,ʊ
|
18 |
+
er,ɹ̩
|
19 |
+
b,b
|
20 |
+
ch,t͡ʃ
|
21 |
+
d,d
|
22 |
+
dx,ɾ
|
23 |
+
f,f
|
24 |
+
g,ɡ
|
25 |
+
hh,h
|
26 |
+
jh,d͡ʒ
|
27 |
+
k,k
|
28 |
+
l,l
|
29 |
+
em,m̩
|
30 |
+
m,m
|
31 |
+
en,n̩
|
32 |
+
n,n
|
33 |
+
ng,ŋ
|
34 |
+
p,p
|
35 |
+
q,ʔ
|
36 |
+
r,ɹ
|
37 |
+
s,s
|
38 |
+
sh,ʃ
|
39 |
+
t,t
|
40 |
+
dh,ð
|
41 |
+
th,θ
|
42 |
+
v,v
|
43 |
+
w,w
|
44 |
+
y,j
|
45 |
+
z,z
|
46 |
+
zh,ʒ
|
epitran/data/ipa-xsampa.csv
ADDED
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
IPA,X-SAMPA,Name
|
2 |
+
p,p,vl bilabial plosive
|
3 |
+
b,b,vd bilabial plosive
|
4 |
+
t,t,vl alveolar plosive
|
5 |
+
d,d,vd alveolar plosive
|
6 |
+
ʈ,t`,vl retroflex plosive
|
7 |
+
ɖ,d`,vd retroflex plosive
|
8 |
+
c,c,vl palatal plosive
|
9 |
+
ɟ,J\,vd palatal plosive
|
10 |
+
k,k,vl velar plosive
|
11 |
+
ɡ,g,vd velar plosive
|
12 |
+
q,q,vl uvular plosive
|
13 |
+
ɢ,G\,vd uvular plosive
|
14 |
+
ʔ,?,glottal plosive
|
15 |
+
m,m,bilabial nasal
|
16 |
+
ɱ,F,vl labiodental nasal
|
17 |
+
n,n,alveolar nasal
|
18 |
+
ɳ,n`,vl retroflex nasal
|
19 |
+
ɲ,J,vl palatal nasal
|
20 |
+
ŋ,N,vl velar nasal
|
21 |
+
ɴ,N\,vl uvular nasal
|
22 |
+
ʙ,B\,vd bilabial trill
|
23 |
+
r,r,vd alveolar trill
|
24 |
+
ʀ,R\,vl uvular trill
|
25 |
+
ɾ,4,vl alveolar tap
|
26 |
+
ɽ,r`,vl retroflex flap
|
27 |
+
ɸ,p\,vl bilabial fricative
|
28 |
+
β,B,vd bilabial fricative
|
29 |
+
f,f,vl labiodental fricative
|
30 |
+
v,v,vd labiodental fricative
|
31 |
+
θ,T,vl dental fricative
|
32 |
+
ð,D,vd dental fricative
|
33 |
+
s,s,vl alveolar fricative
|
34 |
+
z,z,vd alveolar fricative
|
35 |
+
ʃ,S,vl postalveolar fricative
|
36 |
+
ʒ,Z,vd postalveolar fricative
|
37 |
+
ʂ,s`,vl retroflex fricative
|
38 |
+
ʐ,z`,vd retroflex fricative
|
39 |
+
ç,C,vl palatal fricative
|
40 |
+
ʝ,j\,vd palatal fricative
|
41 |
+
x,x,vl velar fricative
|
42 |
+
ɣ,G,vd velar fricative
|
43 |
+
χ,X,vl uvular fricative
|
44 |
+
ʁ,R,vd uvular fricative
|
45 |
+
ħ,X\,vl pharyngeal fricative
|
46 |
+
ʕ,?\,vd pharyngeal fricative
|
47 |
+
h,h,vl glottal fricative
|
48 |
+
ʔ,?,glottal plosive
|
49 |
+
ɬ,K,vl alveolar lateral fricative
|
50 |
+
ɮ,K\,vd alveolar lateral fricative
|
51 |
+
ʋ,P,vd labiodental approximant
|
52 |
+
ɹ,r\,vd (post)alveolar approximant
|
53 |
+
ɻ,r\`,vd retroflex approximant
|
54 |
+
j,j,vd palatal approximant
|
55 |
+
ɰ,M\,vd velar approximant
|
56 |
+
l,l,vd alveolar lateral approximant
|
57 |
+
ɭ,l`,vd retroflex lateral approximant
|
58 |
+
ʎ,L,vd palatal lateral approximant
|
59 |
+
ʟ,L\,vd velar lateral approximant
|
60 |
+
pʼ,p_>,ejective
|
61 |
+
tʼ,t_>,ejective
|
62 |
+
ʈʼ,t`_>,ejective
|
63 |
+
cʼ,c_>,ejective
|
64 |
+
kʼ,k_>,ejective
|
65 |
+
qʼ,q_>,ejective
|
66 |
+
ɓ,b_<,vl bilabial implosive
|
67 |
+
ɗ,d_<,vl alveolar implosive
|
68 |
+
ƙ,k_<,vl velar implosive
|
69 |
+
ɠ,g_<,vl velar implosive
|
70 |
+
i,i,close front unrounded
|
71 |
+
y,y,close front rounded
|
72 |
+
ɨ,1,close central unrounded
|
73 |
+
ʉ,},close central rounded
|
74 |
+
ɯ,M,close back unrounded
|
75 |
+
u,u,close back rounded
|
76 |
+
ɪ,I,lax close front unrounded
|
77 |
+
ʏ,Y,lax close front rounded
|
78 |
+
ʊ,U,lax close back rounded
|
79 |
+
e,e,close-mid front unrounded
|
80 |
+
ø,2,front close-mid rounded
|
81 |
+
ɤ,7,close-mid back unrounded
|
82 |
+
o,o,close-mid back rounded
|
83 |
+
ə,@,schwa
|
84 |
+
ɘ,@\,close-mid central unrounded vowel
|
85 |
+
ɵ,8,rounded schwa
|
86 |
+
ɛ,E,open-mid front unrounded
|
87 |
+
œ,9,front open-mid rounded
|
88 |
+
ʌ,V,open-mid back unrounded
|
89 |
+
ɔ,O,open-mid back rounded
|
90 |
+
æ,{,mid-open front unrounded vowel
|
91 |
+
ɐ,6,open-mid schwa
|
92 |
+
a,a,open front unrounded
|
93 |
+
ă,a_X,extra short open front unrounded
|
94 |
+
ɶ,&,front open rounded
|
95 |
+
ɑ,A,open back unrounded
|
96 |
+
ɒ,Q,open back rounded
|
97 |
+
̥,_0,voiceless
|
98 |
+
̬,_v,voiced
|
99 |
+
ʰ,_h,aspirated
|
100 |
+
̤,_t,breathy voiced
|
101 |
+
̰,_k,creaky voiced
|
102 |
+
̼,_N,linguolabial
|
103 |
+
̪,_d,dental
|
104 |
+
̺,_a,apical
|
105 |
+
̻,_m,laminal
|
106 |
+
̹,_O,more rounded
|
107 |
+
̜,_c,less rounded
|
108 |
+
̟,_+,advanced
|
109 |
+
̠,_-,retracted
|
110 |
+
̈,"_""",centralized
|
111 |
+
̽,_x,mid-centralized
|
112 |
+
̩,=,syllabic
|
113 |
+
̯,_^,non-syllabic
|
114 |
+
ʷ,_w,labialized
|
115 |
+
ʲ,',palatalized
|
116 |
+
ˠ,_G,velarized
|
117 |
+
ˤ,_?\,pharyngealized
|
118 |
+
̴,_e,velarized or pharyngealized
|
119 |
+
̝,_r,raised
|
120 |
+
̞,_o,lowered
|
121 |
+
̃,~,nasalized
|
122 |
+
ⁿ,_n,nasal release
|
123 |
+
ˡ,_l,lateral release
|
124 |
+
̚,_},not audibly released
|
125 |
+
̘,_A,advanced tongue root
|
126 |
+
̙,_q,retracted tongue root
|
127 |
+
̋,_T,extra high tone
|
128 |
+
́,_H,high tone
|
129 |
+
̄,_M,mid tone
|
130 |
+
̀,_L,low tone
|
131 |
+
̏,_B,extra low tone
|
132 |
+
ˈ,"""",(primary) stress mark
|
133 |
+
ˌ,%,secondary stress
|
134 |
+
ː,:,length mark
|
135 |
+
ˑ,:\,half-length
|
136 |
+
̆,_X,extra-short
|
137 |
+
.,.,syllable break
|
138 |
+
ʍ,W,vl labial-velar fricative
|
139 |
+
w,w,vd labio-velar approximant
|
140 |
+
ɥ,H,labial-palatal approximant
|
141 |
+
ʜ,H\,vl epiglottal fricative
|
142 |
+
ʢ,<\,vl epiglottal fricative
|
143 |
+
ʡ,>\,vl epiglottal plosive
|
144 |
+
ɕ,s\,vl alveolopalatal fricative
|
145 |
+
ʑ,z\,vl alveolopalatal fricative
|
146 |
+
ʘ,O\,bilabial click
|
147 |
+
ǀ,|\,dental click
|
148 |
+
ǃ,!\,click
|
149 |
+
ǂ,'=\,alveolar click
|
150 |
+
ǁ,|\|\,alveolar lateral click
|
151 |
+
ɺ,l\,vl alveolar lateral flap
|
152 |
+
ɜ,3,open-mid central
|
153 |
+
ʛ,G\_<,vl uvular implosive
|
154 |
+
ɚ,@`,rhotacized schwa
|
155 |
+
ɞ,3\,open-mid central rounded
|
156 |
+
ɦ,h\,vd glottal fricative
|
157 |
+
ɫ,5,velarized vl alveolar lateral
|
158 |
+
ʄ,J\_<,vl palatal implosive
|
159 |
+
ʼ,_>,ejective
|
160 |
+
ɝ,3`,rhotacized open-mid central
|
161 |
+
t͡ʃ,tS,vl postalveolar affricate
|
162 |
+
d͡ʒ,dZ,vd postalveolar affricate
|
163 |
+
t͡ɕ,ts\,vl alveolo-palatal affricate
|
164 |
+
d͡ʑ,dz\,vd alveolo-palatal affricate
|
165 |
+
t͡ɬ,tK,vl alveolar lateral affricate
|
166 |
+
k͡p,kp,vl labial-velar plosive
|
167 |
+
g͡b,gb,vd labial-velar plosive
|
168 |
+
ŋ͡m,Nm,labial-velar nasal stop
|
169 |
+
ʈ͡ʂ,ts`,vl retroflex affricate
|
170 |
+
ɖ͡ʐ,dz`,vd retroflex affricate
|
171 |
+
˩,_B,extra low tone
|
172 |
+
˨,_L,low tone
|
173 |
+
˧,_M,mid tone
|
174 |
+
˦,_H,high tone
|
175 |
+
˥,_T,extra high tone
|
epitran/data/map/rhg-lroh.csv
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Orth,Phon
|
2 |
+
b,b
|
3 |
+
d,d
|
4 |
+
ḍ,ɖ
|
5 |
+
f,f
|
6 |
+
g,g
|
7 |
+
h,h
|
8 |
+
j,d͡ʒ
|
9 |
+
k,k
|
10 |
+
l,l
|
11 |
+
m,m
|
12 |
+
n,n
|
13 |
+
p,p
|
14 |
+
r,ɾ
|
15 |
+
ṛ,ɽ
|
16 |
+
s,s
|
17 |
+
š,ʃ
|
18 |
+
t,t
|
19 |
+
ṭ,ʈ
|
20 |
+
v,v
|
21 |
+
w,w
|
22 |
+
y,j
|
23 |
+
z,z
|
24 |
+
ã,ɑ̃
|
25 |
+
a,ɑ
|
26 |
+
ẽ,ẽ
|
27 |
+
e,e
|
28 |
+
ĩ,ĩ
|
29 |
+
i,i
|
30 |
+
õ,ɔ̃
|
31 |
+
o,ɔ
|
32 |
+
ũ,ũ
|
33 |
+
u,u
|
epitran/data/map/rhg-roheng.csv
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Orth,Phon
|
2 |
+
b,b
|
3 |
+
c,ʃ
|
4 |
+
ç,ɽ
|
5 |
+
d,d
|
6 |
+
f,f
|
7 |
+
g,g
|
8 |
+
h,h
|
9 |
+
j,d͡ʒ
|
10 |
+
k,k
|
11 |
+
l,l
|
12 |
+
m,m
|
13 |
+
n,n
|
14 |
+
p,p
|
15 |
+
q,q
|
16 |
+
r,ɾ
|
17 |
+
s,s
|
18 |
+
t,t
|
19 |
+
v,v
|
20 |
+
w,w
|
21 |
+
x,ks
|
22 |
+
y,j
|
23 |
+
z,z
|
24 |
+
dh,ɖ
|
25 |
+
th,ʈ
|
26 |
+
a,ɑ
|
27 |
+
añ,ɑ̃
|
28 |
+
e,e
|
29 |
+
eñ,ẽ
|
30 |
+
i,i
|
31 |
+
iñ,ĩ
|
32 |
+
o,ɔ
|
33 |
+
oñ,ɔ̃
|
34 |
+
u,u
|
35 |
+
uñ,ũ
|
epitran/data/post/rhg-lroh.txt
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
ɑ̃ɑ -> ɑ̃ː / _
|
2 |
+
ɑɑ̃ -> ɑ̃ː / _
|
3 |
+
ɑɑ -> ɑː / _
|
4 |
+
|
5 |
+
ẽe -> ẽː / _
|
6 |
+
eẽ -> ẽː / _
|
7 |
+
ee -> eː / _
|
8 |
+
|
9 |
+
ĩi -> ĩː / _
|
10 |
+
iĩ -> ĩː / _
|
11 |
+
ii -> iː / _
|
12 |
+
|
13 |
+
ɔ̃ɔ -> ɔ̃ː / _
|
14 |
+
ɔɔ̃ -> ɔ̃ː / _
|
15 |
+
ɔɔ -> ɔː / _
|
16 |
+
|
17 |
+
ũu -> ũː / _
|
18 |
+
uũ -> ũː / _
|
19 |
+
uu -> uː / _
|
epitran/data/post/rhg-roheng.txt
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
ɑɑ̃ -> ɑ̃ː / _
|
2 |
+
ɑɑ -> ɑː / _
|
3 |
+
|
4 |
+
eẽ -> ẽː / _
|
5 |
+
ee -> eː / _
|
6 |
+
|
7 |
+
iĩ -> ĩː / _
|
8 |
+
ii -> iː / _
|
9 |
+
|
10 |
+
ɔɔ̃ -> ɔ̃ː / _
|
11 |
+
oo -> ɔː / _
|
12 |
+
|
13 |
+
uũ -> ũː / _
|
14 |
+
uu -> uː / _
|
epitran/data/pre/rhg-lroh.txt
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
::vowel:: = a|ã|e|ẽ|i|ĩ|o|õ|u|ũ
|
2 |
+
::consonant:: = b|d|ḍ|f|g|h|j|k|l|m|n|p|r|ṛ|s|š|t|ṭ|v|w|y|z
|
3 |
+
|
4 |
+
% remove stress marks
|
5 |
+
á -> a / _
|
6 |
+
é -> e / _
|
7 |
+
í -> i / _
|
8 |
+
ó -> o / _
|
9 |
+
ú -> u / _
|
10 |
+
|
11 |
+
% vowel glides
|
12 |
+
w -> 0 / (u|ũ) _ (a|o|e)
|
13 |
+
y -> 0 / (i|ĩ) _ (a|e|o|u)
|
14 |
+
|
15 |
+
% long vowels
|
16 |
+
|
17 |
+
% gemination
|
epitran/data/pre/rhg-roheng.txt
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
::vowel:: = a|e|i|o|u
|
2 |
+
::consonant:: = b|c|ç|d|f|g|h|j|k|l|m|n|p|q|r|s|t|v|w|x|y|z
|
3 |
+
|
4 |
+
% remove stress marks
|
5 |
+
á -> a / _
|
6 |
+
é -> e / _
|
7 |
+
í -> i / _
|
8 |
+
ó -> o / _
|
9 |
+
ú -> u / _
|
10 |
+
|
11 |
+
% vowel glides
|
12 |
+
w -> 0 / (u|uñ) _ (a|o|e)
|
13 |
+
y -> 0 / (i|iñ) _ (a|e|o|u)
|
epitran/data/puncnorm.csv
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Punctuation,NormalizedForm
|
2 |
+
‘,'
|
3 |
+
’,'
|
4 |
+
ʼ,'
|
5 |
+
ʻ,'
|
6 |
+
”,""""
|
7 |
+
“,""""
|
8 |
+
。,.
|
9 |
+
,,","
|