micahg commited on
Commit
609216a
1 Parent(s): 29fedcd

Initial file upload

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. LICENSE +21 -0
  2. README.md +1 -13
  3. __pycache__/config.cpython-310.pyc +0 -0
  4. __pycache__/functions.cpython-310.pyc +0 -0
  5. app.py +37 -0
  6. config.py +5 -0
  7. epitran/__init__.py +2 -0
  8. epitran/__pycache__/__init__.cpython-310.pyc +0 -0
  9. epitran/__pycache__/__init__.cpython-311.pyc +0 -0
  10. epitran/__pycache__/_epitran.cpython-310.pyc +0 -0
  11. epitran/__pycache__/_epitran.cpython-311.pyc +0 -0
  12. epitran/__pycache__/cedict.cpython-310.pyc +0 -0
  13. epitran/__pycache__/download.cpython-310.pyc +0 -0
  14. epitran/__pycache__/epihan.cpython-310.pyc +0 -0
  15. epitran/__pycache__/exceptions.cpython-310.pyc +0 -0
  16. epitran/__pycache__/flite.cpython-310.pyc +0 -0
  17. epitran/__pycache__/ligaturize.cpython-310.pyc +0 -0
  18. epitran/__pycache__/ppprocessor.cpython-310.pyc +0 -0
  19. epitran/__pycache__/puncnorm.cpython-310.pyc +0 -0
  20. epitran/__pycache__/reromanize.cpython-310.pyc +0 -0
  21. epitran/__pycache__/rules.cpython-310.pyc +0 -0
  22. epitran/__pycache__/simple.cpython-310.pyc +0 -0
  23. epitran/__pycache__/stripdiacritics.cpython-310.pyc +0 -0
  24. epitran/__pycache__/xsampa.cpython-310.pyc +0 -0
  25. epitran/_epitran.py +129 -0
  26. epitran/backoff.py +89 -0
  27. epitran/bin/connl2engipaspace.py +79 -0
  28. epitran/bin/connl2ipaspace.py +100 -0
  29. epitran/bin/decompose.py +13 -0
  30. epitran/bin/detectcaps.py +25 -0
  31. epitran/bin/epitranscribe.py +26 -0
  32. epitran/bin/isbijective.py +31 -0
  33. epitran/bin/ltf2ipaspace.py +53 -0
  34. epitran/bin/migraterules.py +40 -0
  35. epitran/bin/reromanize.py +22 -0
  36. epitran/bin/space2punc.py +24 -0
  37. epitran/bin/testvectorgen.py +35 -0
  38. epitran/bin/transltf.py +20 -0
  39. epitran/bin/uigtransliterate.py +10 -0
  40. epitran/bin/vie-tones.py +44 -0
  41. epitran/cedict.py +76 -0
  42. epitran/data/arpabet.csv +46 -0
  43. epitran/data/ipa-xsampa.csv +175 -0
  44. epitran/data/map/rhg-lroh.csv +33 -0
  45. epitran/data/map/rhg-roheng.csv +35 -0
  46. epitran/data/post/rhg-lroh.txt +19 -0
  47. epitran/data/post/rhg-roheng.txt +14 -0
  48. epitran/data/pre/rhg-lroh.txt +17 -0
  49. epitran/data/pre/rhg-roheng.txt +13 -0
  50. epitran/data/puncnorm.csv +9 -0
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Micah Geyman
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,13 +1 @@
1
- ---
2
- title: Rhg Script Converter Ui
3
- emoji: 👁
4
- colorFrom: blue
5
- colorTo: blue
6
- sdk: gradio
7
- sdk_version: 4.8.0
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ # rhg-script-converter-ui
 
 
 
 
 
 
 
 
 
 
 
 
__pycache__/config.cpython-310.pyc ADDED
Binary file (275 Bytes). View file
 
__pycache__/functions.cpython-310.pyc ADDED
Binary file (2.18 kB). View file
 
app.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import gradio as gr
from functions import convert_script
from config import scripts

# Default dropdown selections: first two script names declared in config.
DEFAULT_INPUT_SCRIPT = list(scripts.keys())[0]
DEFAULT_OUTPUT_SCRIPT = list(scripts.keys())[1]


def process_text(input_script, output_script, input_text, uploaded_file=None):
    """Convert text (typed or uploaded) between Rohingya scripts.

    :param input_script: display name of the source script (a key of `scripts`)
    :param output_script: display name of the target script (a key of `scripts`)
    :param input_text: text from the textbox; overridden when a file is uploaded
    :param uploaded_file: raw bytes of an uploaded file (gr.File type="binary"), or None
    :return: tuple of (converted text, path to a downloadable file holding it)
    """
    if uploaded_file is not None:
        # The File component is configured with type="binary", so this is bytes.
        input_text = uploaded_file.decode("utf-8")

    output_text = convert_script(scripts[input_script], scripts[output_script], input_text)

    # Write with an explicit encoding: the converted text is non-ASCII, and the
    # platform default encoding (e.g. cp1252 on Windows) would raise here.
    output_filename = "output.txt"
    with open(output_filename, "w", encoding="utf-8") as file:
        file.write(output_text)

    return output_text, output_filename


with gr.Blocks(title="Rohingya Script Converter") as page:
    gr.Markdown("## Rohingya Script Converter")
    with gr.Row():
        # Dropdown choices must be a concrete sequence, not a dict view.
        input_script = gr.Dropdown(label="Choose the input script:", choices=list(scripts.keys()), value=DEFAULT_INPUT_SCRIPT)
        output_script = gr.Dropdown(label="Choose the output script:", choices=list(scripts.keys()), value=DEFAULT_OUTPUT_SCRIPT)
    with gr.Row():
        input_text = gr.Textbox(label="Input Text", placeholder="Enter text here or upload a file", lines=5)
        output_text = gr.Textbox(label="Output Text", placeholder="Converted text will appear here", lines=5, interactive=False)
    with gr.Row():
        input_file = gr.File(label="Upload Text File", file_count="single", type="binary")
        download_link = gr.File(label="Download Converted File")
    gr.Button("Convert").click(
        process_text,
        inputs=[input_script, output_script, input_text, input_file],
        outputs=[output_text, download_link]
    )

page.launch(share=True)
config.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
# Maps the human-readable script names shown in the UI dropdowns to the
# language-script map codes used by the converter (see epitran/data/map/*).
scripts = {
    'LearnRohingya':'rhg-lroh',
    'Rohingyalish':'rhg-roheng',
    'Rohingyalish (old)':'rhg-roheng-old'
    }
epitran/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from epitran._epitran import Epitran
2
+ from epitran.reromanize import ReRomanizer
epitran/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (274 Bytes). View file
 
epitran/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (314 Bytes). View file
 
epitran/__pycache__/_epitran.cpython-310.pyc ADDED
Binary file (6.7 kB). View file
 
epitran/__pycache__/_epitran.cpython-311.pyc ADDED
Binary file (8.71 kB). View file
 
epitran/__pycache__/cedict.cpython-310.pyc ADDED
Binary file (2.93 kB). View file
 
epitran/__pycache__/download.cpython-310.pyc ADDED
Binary file (1.29 kB). View file
 
epitran/__pycache__/epihan.cpython-310.pyc ADDED
Binary file (4.22 kB). View file
 
epitran/__pycache__/exceptions.cpython-310.pyc ADDED
Binary file (577 Bytes). View file
 
epitran/__pycache__/flite.cpython-310.pyc ADDED
Binary file (8.39 kB). View file
 
epitran/__pycache__/ligaturize.cpython-310.pyc ADDED
Binary file (781 Bytes). View file
 
epitran/__pycache__/ppprocessor.cpython-310.pyc ADDED
Binary file (2.14 kB). View file
 
epitran/__pycache__/puncnorm.cpython-310.pyc ADDED
Binary file (1.88 kB). View file
 
epitran/__pycache__/reromanize.cpython-310.pyc ADDED
Binary file (2.53 kB). View file
 
epitran/__pycache__/rules.cpython-310.pyc ADDED
Binary file (4.85 kB). View file
 
epitran/__pycache__/simple.cpython-310.pyc ADDED
Binary file (14.7 kB). View file
 
epitran/__pycache__/stripdiacritics.cpython-310.pyc ADDED
Binary file (1.91 kB). View file
 
epitran/__pycache__/xsampa.cpython-310.pyc ADDED
Binary file (2.24 kB). View file
 
epitran/_epitran.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# -*- coding: utf-8 -*-
import logging
from typing import Union

import panphon.featuretable
from epitran.epihan import Epihan, EpihanTraditional
from epitran.flite import FliteLexLookup
from epitran.puncnorm import PuncNorm
from epitran.simple import SimpleEpitran
from epitran.xsampa import XSampa

logger = logging.getLogger('epitran')
logger.setLevel(logging.WARNING)


class Epitran(object):
    """Unified interface for IPA transliteration/transcription

    :param code str: ISO 639-3 plus "-" plus ISO 15924 code of the language/script pair that should be loaded
    :param preproc bool: apply preprocessors
    :param postproc bool: apply postprocessors
    :param ligatures bool: use precomposed ligatures instead of standard IPA
    :param cedict_file str: path to file containing the CC-CEDict dictionary
    :param rev bool: use reverse transliteration
    :param rev_preproc bool: if True, apply preprocessors when reverse transliterating
    :param rev_postproc bool: if True, apply postprocessors when reverse transliterating
    """
    # Language-script pairs that need a dedicated backend instead of the
    # generic, mapping-file-driven SimpleEpitran.
    special = {'eng-Latn': FliteLexLookup,
               'cmn-Hans': Epihan,
               'cmn-Hant': EpihanTraditional}

    def __init__(self, code: str, preproc: bool=True, postproc: bool=True, ligatures: bool=False,
                 cedict_file: Union[str, None]=None, rev: bool=False,
                 rev_preproc: bool=True, rev_postproc: bool=True, tones: bool=False):
        """Constructor method"""
        if code in self.special:
            self.epi = self.special[code](ligatures=ligatures, cedict_file=cedict_file, tones=tones)
        else:
            self.epi = SimpleEpitran(code, preproc, postproc, ligatures, rev, rev_preproc, rev_postproc, tones=tones)
        self.ft = panphon.featuretable.FeatureTable()
        self.xsampa = XSampa()
        self.puncnorm = PuncNorm()

    def transliterate(self, word: str, normpunc: bool=False, ligatures: bool=False) -> str:
        """Transliterates/transcribes a word into IPA

        :param word str: word to transcribe
        :param normpunc bool: if True, normalize punctuation
        :param ligatures bool: if True, use precomposed ligatures instead of standard IPA
        :return: An IPA string corresponding to the input orthographic string
        :rtype: str
        """
        return self.epi.transliterate(word, normpunc, ligatures)

    def reverse_transliterate(self, ipa: str) -> str:
        """Reconstructs word from IPA. Does the reverse of transliterate()

        :param ipa str: An IPA representation of a word
        :return: An orthographic representation of the word
        :rtype: str
        """
        return self.epi.reverse_transliterate(ipa)

    def strict_trans(self, word: str, normpunc: bool=False, ligatures: bool=False) -> str:
        """Transliterate a word into IPA, ignoring all characters that cannot be recognized.

        :param word str: word to transcribe
        :param normpunc bool, optional: if True, normalize punctuation
        :param ligatures bool, optional: if True, use precomposed ligatures instead of standard IPA
        :return: An IPA string corresponding to the input orthographic string, with all unconverted characters omitted
        :rtype: str
        """
        return self.epi.strict_trans(word, normpunc, ligatures)

    def trans_list(self, word: str, normpunc: bool=False, ligatures: bool=False) -> "list[str]":
        """Transliterates/transcribes a word into list of IPA phonemes

        :param word str: word to transcribe
        :param normpunc bool, optional: if True, normalize punctuation
        :param ligatures bool, optional: if True, use precomposed ligatures instead of standard IPA
        :return: list of IPA strings, each corresponding to a segment
        :rtype: list[str]
        """
        return self.ft.segs_safe(self.epi.transliterate(word, normpunc, ligatures))

    def trans_delimiter(self, text: str, delimiter: str=str(' '), normpunc: bool=False, ligatures: bool=False):
        """Return IPA transliteration with a delimiter between segments

        :param text str: An orthographic text
        :param delimiter str, optional: A string to insert between segments
        :param normpunc bool, optional: If True, normalize punctuation
        :param ligatures bool, optional: If True, use precomposed ligatures instead of standard IPA
        :return: String of IPA phonemes separated by `delimiter`
        :rtype: str
        """
        return delimiter.join(self.trans_list(text, normpunc=normpunc,
                                              ligatures=ligatures))

    def xsampa_list(self, word: str, normpunc: bool=False, ligaturize: bool=False):
        """Transliterates/transcribes a word as X-SAMPA

        :param word str: An orthographic word
        :param normpunc bool, optional: If True, normalize punctuation
        :param ligaturize bool, optional: If True, use precomposed ligatures instead of standard IPA
        :return: List of X-SAMPA strings corresponding to `word`
        :rtype: list[str]
        """
        ipa_segs = self.ft.ipa_segs(self.epi.strict_trans(word, normpunc,
                                                          ligaturize))
        return list(map(self.xsampa.ipa2xs, ipa_segs))

    def word_to_tuples(self, word: str, normpunc: bool=False, _ligaturize: bool=False):
        """Given a word, returns a list of tuples corresponding to IPA segments. The "feature
        vectors" form a list consisting of (segment, vector) pairs.
        For IPA segments, segment is a substring of phonetic_form such that the
        concatenation of all segments in the list is equal to the phonetic_form.
        The vectors are a sequence of integers drawn from the set {-1, 0, 1}
        where -1 corresponds to '-', 0 corresponds to '0', and 1 corresponds to
        '+'.

        :param word str: An orthographic word
        :param normpunc bool, optional: If True, normalize punctuation
        :param _ligaturize bool, optional: If True, use precomposed ligatures instead of standard IPA
        :return: A list of tuples corresponding to IPA segments
        :rtype: list[tuple[str, str, str, str, list[int]]]
        """
        try:
            return self.epi.word_to_tuples(word, normpunc)
        except AttributeError as err:
            # Chain the caught exception instance; the original code wrote
            # `from AttributeError`, which chained the exception *class*.
            raise AttributeError('Method word_to_tuples not yet implemented for this language-script pair!') from err
epitran/backoff.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# -*- coding: utf-8 -*-
from __future__ import (print_function, absolute_import,
                        unicode_literals)

import regex as re
from . import _epitran
import panphon.featuretable
from epitran.puncnorm import PuncNorm
from epitran.xsampa import XSampa
from epitran.stripdiacritics import StripDiacritics


class Backoff(object):
    """Implements rudimentary language ID and backoff."""

    def __init__(self, lang_script_codes, cedict_file=None):
        """Construct a Backoff object.

        Args:
            lang_script_codes (list): codes for languages to try, starting
                with the highest priority languages
            cedict_file (str): path to the CC-CEdict dictionary file
                (necessary only when cmn-Hans or cmn-Hant are used)
        """
        self.langs = [_epitran.Epitran(c, cedict_file=cedict_file)
                      for c in lang_script_codes]
        self.num_re = re.compile(r'\p{Number}+')
        self.ft = panphon.featuretable.FeatureTable()
        self.xsampa = XSampa()
        self.puncnorm = PuncNorm()
        # One diacritic stripper per language, kept parallel to self.langs.
        self.dias = [StripDiacritics(c) for c in lang_script_codes]

    def transliterate(self, token):
        """Return IPA transliteration given by first acceptable mode.

        Args:
            token (unicode): orthographic text

        Returns:
            unicode: transliteration as Unicode IPA string
        """
        tr_list = []
        while token:
            is_outside_lang = True
            # Try each language in priority order. The first language whose
            # regexp matches a prefix of the (diacritic-stripped) token
            # repeatedly consumes from the front of the token.
            for dia, lang in zip(self.dias, self.langs):
                source = ''
                while True:
                    m = lang.epi.regexp.match(dia.process(token))
                    if not m:
                        break
                    s = m.group()
                    token = token[len(s):]
                    source += s
                    is_outside_lang = False
                tr_list.append(lang.transliterate(source))
            if is_outside_lang:
                # No language matched: pass runs of digits through whole,
                # and any other character through one codepoint at a time.
                m = re.match(r'\p{Number}+', token)
                if m:
                    source = m.group()
                    tr_list.append(source)
                    token = token[len(source):]
                else:
                    tr_list.append(token[0])
                    token = token[1:]
        return ''.join(tr_list)

    def trans_list(self, token):
        """Transliterate/transcribe a word into list of IPA phonemes.

        Args:
            token (unicode): word to transcribe; unicode string

        Returns:
            list: list of IPA unicode strings, each corresponding to a segment
        """
        return self.ft.segs_safe(self.transliterate(token))

    def xsampa_list(self, token):
        """Transcribe a word into a list of X-SAMPA phonemes.

        Args:
            token (unicode): word to transcribe; unicode strings

        Returns:
            list: list of X-SAMPA strings, each corresponding to a segment
        """
        if re.match(r'^\p{Number}+$', token):
            # NOTE(review): all-digit tokens return '' rather than [] —
            # confirm callers tolerate a string here despite the documented
            # list return.
            return ''
        else:
            ipa_segs = self.ft.ipa_segs(self.transliterate(token))
            return list(map(self.xsampa.ipa2xs, ipa_segs))
epitran/bin/connl2engipaspace.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env python

import argparse
import codecs
import logging
from collections import Counter

import unicodecsv as csv

import epitran
import epitran.flite
import panphon

logger = logging.getLogger('epitran')


def normpunc(flite, s):
    """Normalize punctuation in *s* using the Flite object's puncnorm table."""
    def norm(c):
        if c in flite.puncnorm:
            return flite.puncnorm[c]
        else:
            return c
    return ''.join(map(norm, s))


def add_record(flite, ft, orth):
    """Count the IPA segments in the transliteration of one orthographic form.

    :param flite: Flite transliterator object
    :param ft: panphon FeatureTable
    :param orth: orthographic form (str)
    :return: Counter mapping segment -> occurrence count
    """
    space = Counter()
    orth = normpunc(flite, orth)
    trans = flite.transliterate(orth)
    while trans:
        # Consume the longest recognizable IPA segment prefix; otherwise
        # count the bare leading character.
        pref = ft.longest_one_seg_prefix(trans)
        if pref != '':
            space[pref] += 1
            trans = trans[len(pref):]
        else:
            # The original code branched on flite.puncnorm_vals here, but
            # both branches were identical, so the test was dead code.
            space[trans[0]] += 1
            trans = trans[1:]
    return space


def add_file(flite, ft, fn):
    """Accumulate segment counts over the first column of a CONLL file."""
    space = Counter()
    with codecs.open(fn, 'r', 'utf-8') as f:
        for line in f:
            fields = line.split(u'\t')
            if len(fields) > 0:
                orth = fields[0]
                space.update(add_record(flite, ft, orth))
    logger.debug(u'Length of counter:\t{}'.format(len(space)))
    return space


def print_space(output, space):
    """Write the segment space as an enumerated two-column CSV."""
    pairs = enumerate(sorted(filter(lambda x: x, space.keys())))
    with open(output, 'wb') as f:
        writer = csv.writer(f, encoding='utf-8')
        for i, char in pairs:
            writer.writerow((i, char))


def main(infiles, output):
    """Build the English IPA segment space over *infiles* and write it."""
    flite = epitran.flite.Flite()
    ft = panphon.FeatureTable()
    space = Counter()
    for fn in infiles:
        logger.debug(u'Scanning:\t{}'.format(fn).encode('utf-8'))
        space.update(add_file(flite, ft, fn))
    print_space(output, space)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '--output', help='Output file.')
    parser.add_argument('infiles', nargs='+', help='CONLL files serving as basis for segment space.')
    args = parser.parse_args()
    main(args.infiles, args.output)
epitran/bin/connl2ipaspace.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env python

import argparse
import codecs
import logging
from collections import Counter

import epitran
import panphon
import unicodecsv as csv

logger = logging.getLogger('epitran')


def normpunc(epi, s):
    """Normalize punctuation in *s* using the Epitran object's puncnorm table."""
    def norm(c):
        if c in epi.puncnorm:
            return epi.puncnorm[c]
        else:
            return c
    return ''.join(map(norm, s))


def add_record_gen(epi, ft, orth):
    """Count IPA segments in the transliteration of one form (generic mode)."""
    space = Counter()
    orth = normpunc(epi, orth)
    trans = epi.transliterate(orth)
    while trans:
        # Consume the longest recognizable IPA segment prefix; otherwise
        # count the bare leading character.
        pref = ft.longest_one_seg_prefix(trans)
        if pref != '':
            space[pref] += 1
            trans = trans[len(pref):]
        else:
            space[trans[0]] += 1
            trans = trans[1:]
    return space


def add_file_gen(epi, ft, fn):
    """Accumulate generic-mode segment counts over a CONLL file's first column."""
    space = Counter()
    with codecs.open(fn, 'r', 'utf-8') as f:
        for line in f:
            fields = line.split(u'\t')
            if len(fields) > 0:
                orth = fields[0]
                space.update(add_record_gen(epi, ft, orth))
    logger.debug(u'Length of counter:\t{}'.format(len(space)))
    return space


def add_file_op(epi, ft, fn):
    """Like add_file_gen, but unmatched characters are punctuation-normalized
    as they are counted (for scripts using punctuation as parts of letters)."""
    space = Counter()
    with codecs.open(fn, 'r', 'utf-8') as f:
        for line in f:
            fields = line.split(u'\t')
            if len(fields) > 0:
                orth = fields[0]
                trans = epi.transliterate(orth)
                while trans:
                    pref = ft.longest_one_seg_prefix(trans)
                    if pref != '':
                        space[pref] += 1
                        trans = trans[len(pref):]
                    else:
                        if trans[0] in epi.puncnorm:
                            space[epi.puncnorm[trans[0]]] += 1
                        else:
                            space[trans[0]] += 1
                        trans = trans[1:]
    logger.debug(u'Length of counter:\t{}'.format(len(space)))
    return space


def print_space(output, space):
    """Write the segment space as an enumerated two-column CSV."""
    pairs = enumerate(sorted(filter(lambda x: x, space.keys())))
    with open(output, 'wb') as f:
        writer = csv.writer(f, encoding='utf-8')
        for i, char in pairs:
            writer.writerow((i, char))


def main(code, op, infiles, output):
    """Build the segment space for *code* over *infiles* and write it."""
    epi = epitran.Epitran(code)
    ft = panphon.FeatureTable()
    space = Counter()
    for fn in infiles:
        logger.debug(u'Scanning:\t{}'.format(fn).encode('utf-8'))
        add_file = add_file_op if op else add_file_gen
        space.update(add_file(epi, ft, fn))
    print_space(output, space)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', '--op', action='store_true', help='Script uses punctuation as (parts of) letters.')
    parser.add_argument('-c', '--code', help='Script code for CONNL files.')
    parser.add_argument('-o', '--output', help='Output file.')
    parser.add_argument('infiles', nargs='+', help='CONLL files serving as basis for segment space.')
    args = parser.parse_args()
    main(args.code, args.op, args.infiles, args.output)
epitran/bin/decompose.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env python3

import sys
import unicodedata


def main(path):
    """Print the NFD-normalized contents of *path* to stdout."""
    with open(path, encoding='utf-8') as handle:
        text = handle.read()
    print(unicodedata.normalize('NFD', text))


if __name__ == '__main__':
    main(sys.argv[1])
epitran/bin/detectcaps.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import fileinput
import unicodedata


def main():
    """Label each input line: 1 if only its first letter is capitalized, else 0.

    Reads tokens (one per line) from files named on the command line or from
    stdin, and prints "<label>\t<token>" for each.
    """
    for line in fileinput.input():
        # fileinput yields str on Python 3; the original .decode('utf-8') /
        # .encode('utf-8') pair only worked on Python 2 byte strings.
        token = line.strip()
        if len(token) > 1 and unicodedata.category(token[1]) == 'Lu':
            # Second character is also uppercase (e.g. an acronym), so this
            # is not a conventionally capitalized word.
            is_cap = 0
        elif len(token) > 0 and unicodedata.category(token[0]) == 'Lu':
            is_cap = 1
        else:
            is_cap = 0
        print(u'{}\t{}'.format(is_cap, token))


if __name__ == '__main__':
    main()
epitran/bin/epitranscribe.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import argparse
import sys
import unicodedata

import epitran


def main(code):
    """Transliterate stdin to Unicode IPA on stdout, line by line.

    :param code: ISO 639-3 (plus script) code of the conversion language
    """
    epi = epitran.Epitran(code)
    for line in sys.stdin:
        # sys.stdin yields str on Python 3; the original .decode('utf-8') /
        # .encode('utf-8') pair only worked on Python 2.
        line = unicodedata.normalize('NFD', line.lower())
        sys.stdout.write(epi.transliterate(line))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description=u'Converts text from STDIN (in the language specified),' +
        'into Unicode IPA and emits it to STDOUT.')
    parser.add_argument('code', help=u'ISO 639-3 code for conversion language')
    args = parser.parse_args()
    main(args.code)
epitran/bin/isbijective.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env pythoh
2
+ from __future__ import print_function
3
+
4
+ import glob
5
+
6
+ import unicodecsv as csv
7
+
8
+
9
+ def read_map(fn):
10
+ with open(fn, 'rb') as f:
11
+ reader = csv.reader(f, encoding='utf-8')
12
+ next(reader)
13
+ return [(a, b) for [a, b] in reader]
14
+
15
+
16
+ def is_bijection(mapping):
17
+ a, b = zip(*mapping)
18
+ distinct_a, distinct_b = set(a), set(b)
19
+ return len(distinct_a) == len(mapping) and len(distinct_b) == len(mapping)
20
+
21
+
22
+ def main(map_fns):
23
+ for fn in map_fns:
24
+ mapping = read_map(fn)
25
+ is_b = is_bijection(mapping)
26
+ print('{}\t{}'.format(fn, is_b))
27
+
28
+
29
+ if __name__ == '__main__':
30
+ map_fns = glob.glob('../data/*.csv')
31
+ main(map_fns)
epitran/bin/ltf2ipaspace.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function

import argparse
import glob
import os.path

from lxml import etree
import unicodecsv as csv

import epitran
import panphon.featuretable


def read_tokens(fn):
    """Return the text of every TOKEN element in an LTF XML file."""
    tree = etree.parse(fn)
    root = tree.getroot()
    return [tok.text for tok in root.findall('.//TOKEN')]


def read_input(input_, langscript):
    """Collect the set of IPA segments produced by transliterating every
    token in every *.ltf.xml file under the given directories.

    NOTE(review): only input_[0] is read; because -i uses nargs='+' with
    action='append', directories given via a second -i flag are ignored —
    confirm this is intended.
    """
    space = set()
    epi = epitran.Epitran(langscript)
    ft = panphon.featuretable.FeatureTable()
    for dirname in input_[0]:
        for fn in glob.glob(os.path.join(dirname, '*.ltf.xml')):
            for token in read_tokens(fn):
                ipa = epi.transliterate(token)
                for seg in ft.segs_safe(ipa):
                    space.add(seg)
    return space


def write_output(output, space):
    """Write the segment space as an enumerated two-column CSV."""
    with open(output, 'wb') as f:
        writer = csv.writer(f, encoding='utf-8')
        for n, ch in enumerate(sorted(list(space))):
            writer.writerow((n, ch))


def main(langscript, input_, output):
    """Build and write the IPA segment space for a language-script code."""
    space = read_input(input_, langscript)
    write_output(output, space)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--code', help='language-script code')
    parser.add_argument('-i', '--input', nargs='+', action='append', help='Directories where input LTF files are found')
    parser.add_argument('-o', '--output', help='Output file')
    args = parser.parse_args()
    main(args.code, args.input, args.output)
epitran/bin/migraterules.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env Python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ from __future__ import (print_function, unicode_literals, absolute_import)
5
+
6
+ import glob
7
+ import re
8
+ import io
9
+
10
+ import unicodecsv
11
+
12
+
13
+ def build_rule(fields):
14
+ try:
15
+ a, b, X, Y = fields
16
+ b = "0" if not b else b
17
+ a = "0" if not a else a
18
+ return '{} -> {} / {} _ {}'.format(a, b, X, Y)
19
+ except ValueError:
20
+ print('Malformed rule: {}'.format(','.join(fields)))
21
+
22
+
23
+ def main():
24
+ for csv in glob.glob('*.csv'):
25
+ txt = re.match('[A-Za-z-]+', csv).group(0) + '.txt'
26
+ with open(csv, 'rb') as f, io.open(txt, 'w', encoding='utf-8') as g:
27
+ reader = unicodecsv.reader(f, encoding='utf-8')
28
+ next(reader)
29
+ for fields in reader:
30
+ if re.match('\s*%', fields[0]):
31
+ print(','.join([x for x in fields if x]), file=g)
32
+ else:
33
+ rule = build_rule(fields)
34
+ rule = re.sub('[ ]+', ' ', rule)
35
+ rule = re.sub('[ ]$', '', rule)
36
+ print(rule, file=g)
37
+
38
+
39
+ if __name__ == '__main__':
40
+ main()
epitran/bin/reromanize.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python2
2
+
3
+ from __future__ import print_function
4
+
5
+ import epitran.reromanize
6
+ import argparse
7
+ import sys
8
+
9
+ def main(code, table):
10
+ rr = epitran.reromanize.ReRomanizer(code, table)
11
+ for line in sys.stdin:
12
+ line = line.decode('utf-8')
13
+ tokens = line.strip().split('\t')
14
+ tokens = [rr.reromanize(x) for x in tokens]
15
+ print('\t'.join(tokens).encode('utf-8'))
16
+
17
+ if __name__ == '__main__':
18
+ parser = argparse.ArgumentParser()
19
+ parser.add_argument('-c', '--code', default='ori-Orya', type=str, help='Languagee and script code')
20
+ parser.add_argument('-t', '--table', default='anglocentric', type=str, help='Romanization table')
21
+ args = parser.parse_args()
22
+ main(args.code, args.table)
epitran/bin/space2punc.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env python

import sys
import unicodedata
import unicodecsv as csv


def main(fns, fnn):
    """Collect single-character punctuation marks from the second column of
    the input CSVs and write the sorted set to *fnn*, one mark per row.

    :param fns: list of input CSV paths
    :param fnn: output CSV path
    """
    punc = set()
    for fn in fns:
        # Python 3 print function; the original used the Python 2
        # `print fn` statement, which is a SyntaxError on Python 3.
        print(fn)
        with open(fn, 'rb') as f:
            reader = csv.reader(f, encoding='utf-8')
            for _, s in reader:
                if len(s) == 1 and unicodedata.category(s)[0] == u'P':
                    punc.add(s)
    with open(fnn, 'wb') as f:
        writer = csv.writer(f, encoding='utf-8')
        for mark in sorted(list(punc)):
            writer.writerow([mark])


if __name__ == '__main__':
    main(sys.argv[1:-1], sys.argv[-1])
epitran/bin/testvectorgen.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+
3
+ from __future__ import print_function
4
+
5
+ import argparse
6
+ import codecs
7
+
8
+ import epitran.vector
9
+
10
+
11
+ def main(code, space, infile):
12
+ vec = epitran.vector.VectorsWithIPASpace(code, space)
13
+ with codecs.open(infile, 'r', 'utf-8') as f:
14
+ for line in f:
15
+ fields = line.split('\t')
16
+ if len(fields) > 1:
17
+ word = fields[0]
18
+ print(u"WORD: {}".format(word).encode('utf-8'))
19
+ segs = vec.word_to_segs(word)
20
+ for record in segs:
21
+ cat, case, orth, phon, id_, vector = record
22
+ print(u"Category: {}".format(cat).encode('utf-8'))
23
+ print(u"Case: {}".format(case).encode('utf-8'))
24
+ print(u"Orthographic: {}".format(orth).encode('utf-8'))
25
+ print(u"Phonetic: {}".format(phon).encode('utf-8'))
26
+ print(u"Vector: {}".format(vector).encode('utf-8'))
27
+
28
+
29
+ if __name__ == '__main__':
30
+ parser = argparse.ArgumentParser()
31
+ parser.add_argument('-c', '--code', required=True, help='Script code.')
32
+ parser.add_argument('-s', '--space', required=True, help='Space.')
33
+ parser.add_argument('-i', '--infile', required=True, help='Input file.')
34
+ args = parser.parse_args()
35
+ main(args.code, args.space, args.infile)
epitran/bin/transltf.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ from __future__ import print_function
3
+
4
+ import sys
5
+
6
+ from lxml import etree
7
+ import epitran
8
+ import epitran.vector
9
+
10
+ def main(fn):
11
+ epi = epitran.Epitran('uig-Arab')
12
+ vwis = epitran.vector.VectorsWithIPASpace('uig-Arab', ['uig-Arab'])
13
+ tree = etree.parse(fn)
14
+ root = tree.getroot()
15
+ for token in root.findall('.//TOKEN'):
16
+ # print(token.text.encode('utf-8'))
17
+ print(epi.transliterate(unicode(token.text)).encode('utf-8'))
18
+
19
+ if __name__ == '__main__':
20
+ main(sys.argv[1])
epitran/bin/uigtransliterate.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ from __future__ import print_function
3
+
4
+ import fileinput
5
+ import epitran
6
+
7
+ epi = epitran.Epitran('uig-Arab')
8
+ for line in fileinput.input():
9
+ s = epi.transliterate(line.strip().decode('utf-8'))
10
+ print(s.encode('utf-8'))
epitran/bin/vie-tones.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Append Chao tone letters to Vietnamese phonetic transcriptions.

Reads a two-column (Orth,Phon) CSV named on the command line and writes a
CSV with the same name into the current directory, with the tone letter
implied by the orthography appended to each Phon value.
"""

import csv
import os.path
import re
import sys
import unicodedata


# NFD combining diacritics -> Chao tone letters.
# BUG FIX: the acute and grave keys must be the COMBINING characters
# U+0301/U+0300.  NFD never produces the spacing marks U+00B4/U+0060 that
# were used before, so sắc and huyền syllables were left toneless.
tones = {
    '\u0301': '˧˥',   # combining acute = sắc
    '\u0300': '˨˩',   # combining grave = huyền
    '\u0303': '˧˥',   # combining tilde = ngã (NOTE(review): shares ˧˥ with sắc — confirm intended)
    '\u0309': '˧˩˧',  # combining hook above = hỏi
    '\u0323': '˧˩',   # combining dot below = nặng
}


def shuffle_tone(orth, phon):
    """Return *phon* with the Chao tone letters implied by *orth* appended.

    *orth* is NFD-normalized so tone diacritics appear as combining marks.
    Syllables containing a vowel but no tone mark get the level tone '˧'.
    """
    orth = unicodedata.normalize('NFD', orth)
    if re.search('[aeiouơư]', orth):
        for mark, letters in tones.items():
            if mark in orth:
                phon += letters
        # No explicit mark on the syllable: ngang (level) tone.
        if not re.search('[˩˨˧˦˥]', phon):
            phon += '˧'
    return phon


def main():
    fnin = sys.argv[1]
    fnout = os.path.basename(fnin)
    # The output goes to basename(fnin) in the cwd; if the input itself
    # lives in the cwd, opening it for writing would truncate it mid-read.
    if os.path.abspath(fnin) == os.path.abspath(fnout):
        sys.exit('refusing to overwrite input file: {}'.format(fnin))
    with open(fnin) as fin, open(fnout, 'w') as fout:
        writer = csv.writer(fout)
        reader = csv.reader(fin)
        header = next(reader)
        writer.writerow(header)
        for orth, phon in reader:
            writer.writerow([orth, shuffle_tone(orth, phon)])


if __name__ == '__main__':
    main()
epitran/cedict.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ from __future__ import (absolute_import, division, print_function,
3
+ unicode_literals)
4
+
5
+ import codecs
6
+
7
+ import marisa_trie
8
+ import regex as re
9
+
10
+ ASCII_CHARS = ''.join([chr(i) for i in range(128)])
11
+
12
+
13
class CEDictTrie(object):
    """Greedy longest-match tokenizer backed by the CC-CEDict dictionary.

    Builds a marisa trie over the dictionary headwords so that a string of
    hanzi can be segmented into the longest known dictionary entries.
    """

    def __init__(self, cedict_file, traditional=False):
        """Construct a trie over CC-CEDict.

        Args:
            cedict_file (str): path to the CC-CEDict dictionary
            traditional (bool): if True, use traditional characters
        """
        self.hanzi = self._read_cedict(cedict_file, traditional=traditional)
        self.trie = self._construct_trie(self.hanzi)

    def _read_cedict(self, cedict_file, traditional=False):
        """Parse CC-CEDict into {headword: (pinyin_list, english_list)}.

        CC-CEDict lines have the form ``TRAD SIMP [pin1 yin1] /gloss/.../``.
        """
        # Raw strings avoid invalid-escape warnings for \s and \[ on py3.6+.
        comment_re = re.compile(r'\s*#')
        lemma_re = re.compile(r'(?P<hanzi>[^]]+) \[(?P<pinyin>[^]]+)\] /(?P<english>.+)/')
        cedict = {}
        with codecs.open(cedict_file, 'r', 'utf-8') as f:
            for line in f:
                if comment_re.match(line):
                    continue
                # Match once and reuse (the original matched every lemma
                # line twice).
                match = lemma_re.match(line)
                if match:
                    hanzi = match.group('hanzi').split(' ')
                    pinyin = match.group('pinyin').split(' ')
                    english = match.group('english').split('/')
                    # hanzi[0] is traditional, hanzi[1] is simplified.
                    key = hanzi[0] if traditional else hanzi[1]
                    cedict[key] = (pinyin, english)
        return cedict

    def _construct_trie(self, hanzi):
        """Build a marisa RecordTrie mapping headwords to ASCII pinyin."""
        pairs = []
        for hz, df in self.hanzi.items():
            py, en = df
            # Strip non-ASCII before storing: RecordTrie '@s' stores bytes.
            py = str(''.join(filter(lambda x: x in ASCII_CHARS, ' '.join(py))))
            pairs.append((hz, (py.encode('utf-8'),)))
        trie = marisa_trie.RecordTrie(str('@s'), pairs)
        return trie

    def has_key(self, key):
        """Return True if ``key`` is a dictionary headword."""
        return key in self.hanzi

    def prefixes(self, s):
        """Return all headwords that are prefixes of ``s``."""
        return self.trie.prefixes(s)

    def longest_prefix(self, s):
        """Return the longest headword prefixing ``s``, or '' if none."""
        prefixes = self.prefixes(s)
        if not prefixes:
            return ''
        # Prefixes of a single string have distinct lengths, so the
        # length-sorted last element is the unique longest match.
        return sorted(prefixes, key=len)[-1]

    def tokenize(self, s):
        """Segment ``s`` greedily into longest dictionary entries.

        Characters with no dictionary match are emitted as singletons.
        """
        tokens = []
        while s:
            token = self.longest_prefix(s)
            if not token:
                token = s[0]  # unknown character: pass through as-is
            tokens.append(token)
            s = s[len(token):]
        return tokens
epitran/data/arpabet.csv ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pau,
2
+ null,
3
+ ey,ej
4
+ ae,æ
5
+ iy,i
6
+ eh,ɛ
7
+ ay,aj
8
+ ih,ɪ
9
+ ow,ow
10
+ aa,ɑ
11
+ ao,ɔ
12
+ aw,aw
13
+ oy,oj
14
+ ah,ʌ
15
+ ax,ə
16
+ uw,u
17
+ uh,ʊ
18
+ er,ɹ̩
19
+ b,b
20
+ ch,t͡ʃ
21
+ d,d
22
+ dx,ɾ
23
+ f,f
24
+ g,ɡ
25
+ hh,h
26
+ jh,d͡ʒ
27
+ k,k
28
+ l,l
29
+ em,m̩
30
+ m,m
31
+ en,n̩
32
+ n,n
33
+ ng,ŋ
34
+ p,p
35
+ q,ʔ
36
+ r,ɹ
37
+ s,s
38
+ sh,ʃ
39
+ t,t
40
+ dh,ð
41
+ th,θ
42
+ v,v
43
+ w,w
44
+ y,j
45
+ z,z
46
+ zh,ʒ
epitran/data/ipa-xsampa.csv ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ IPA,X-SAMPA,Name
2
+ p,p,vl bilabial plosive
3
+ b,b,vd bilabial plosive
4
+ t,t,vl alveolar plosive
5
+ d,d,vd alveolar plosive
6
+ ʈ,t`,vl retroflex plosive
7
+ ɖ,d`,vd retroflex plosive
8
+ c,c,vl palatal plosive
9
+ ɟ,J\,vd palatal plosive
10
+ k,k,vl velar plosive
11
+ ɡ,g,vd velar plosive
12
+ q,q,vl uvular plosive
13
+ ɢ,G\,vd uvular plosive
14
+ ʔ,?,glottal plosive
15
+ m,m,bilabial nasal
16
+ ɱ,F,vl labiodental nasal
17
+ n,n,alveolar nasal
18
+ ɳ,n`,vl retroflex nasal
19
+ ɲ,J,vl palatal nasal
20
+ ŋ,N,vl velar nasal
21
+ ɴ,N\,vl uvular nasal
22
+ ʙ,B\,vd bilabial trill
23
+ r,r,vd alveolar trill
24
+ ʀ,R\,vl uvular trill
25
+ ɾ,4,vl alveolar tap
26
+ ɽ,r`,vl retroflex flap
27
+ ɸ,p\,vl bilabial fricative
28
+ β,B,vd bilabial fricative
29
+ f,f,vl labiodental fricative
30
+ v,v,vd labiodental fricative
31
+ θ,T,vl dental fricative
32
+ ð,D,vd dental fricative
33
+ s,s,vl alveolar fricative
34
+ z,z,vd alveolar fricative
35
+ ʃ,S,vl postalveolar fricative
36
+ ʒ,Z,vd postalveolar fricative
37
+ ʂ,s`,vl retroflex fricative
38
+ ʐ,z`,vd retroflex fricative
39
+ ç,C,vl palatal fricative
40
+ ʝ,j\,vd palatal fricative
41
+ x,x,vl velar fricative
42
+ ɣ,G,vd velar fricative
43
+ χ,X,vl uvular fricative
44
+ ʁ,R,vd uvular fricative
45
+ ħ,X\,vl pharyngeal fricative
46
+ ʕ,?\,vd pharyngeal fricative
47
+ h,h,vl glottal fricative
48
+ ʔ,?,glottal plosive
49
+ ɬ,K,vl alveolar lateral fricative
50
+ ɮ,K\,vd alveolar lateral fricative
51
+ ʋ,P,vd labiodental approximant
52
+ ɹ,r\,vd (post)alveolar approximant
53
+ ɻ,r\`,vd retroflex approximant
54
+ j,j,vd palatal approximant
55
+ ɰ,M\,vd velar approximant
56
+ l,l,vd alveolar lateral approximant
57
+ ɭ,l`,vd retroflex lateral approximant
58
+ ʎ,L,vd palatal lateral approximant
59
+ ʟ,L\,vd velar lateral approximant
60
+ pʼ,p_>,ejective
61
+ tʼ,t_>,ejective
62
+ ʈʼ,t`_>,ejective
63
+ cʼ,c_>,ejective
64
+ kʼ,k_>,ejective
65
+ qʼ,q_>,ejective
66
+ ɓ,b_<,vl bilabial implosive
67
+ ɗ,d_<,vl alveolar implosive
68
+ ƙ,k_<,vl velar implosive
69
+ ɠ,g_<,vl velar implosive
70
+ i,i,close front unrounded
71
+ y,y,close front rounded
72
+ ɨ,1,close central unrounded
73
+ ʉ,},close central rounded
74
+ ɯ,M,close back unrounded
75
+ u,u,close back rounded
76
+ ɪ,I,lax close front unrounded
77
+ ʏ,Y,lax close front rounded
78
+ ʊ,U,lax close back rounded
79
+ e,e,close-mid front unrounded
80
+ ø,2,front close-mid rounded
81
+ ɤ,7,close-mid back unrounded
82
+ o,o,close-mid back rounded
83
+ ə,@,schwa
84
+ ɘ,@\,close-mid central unrounded vowel
85
+ ɵ,8,rounded schwa
86
+ ɛ,E,open-mid front unrounded
87
+ œ,9,front open-mid rounded
88
+ ʌ,V,open-mid back unrounded
89
+ ɔ,O,open-mid back rounded
90
+ æ,{,mid-open front unrounded vowel
91
+ ɐ,6,open-mid schwa
92
+ a,a,open front unrounded
93
+ ă,a_X,extra short open front unrounded
94
+ ɶ,&,front open rounded
95
+ ɑ,A,open back unrounded
96
+ ɒ,Q,open back rounded
97
+ ̥,_0,voiceless
98
+ ̬,_v,voiced
99
+ ʰ,_h,aspirated
100
+ ̤,_t,breathy voiced
101
+ ̰,_k,creaky voiced
102
+ ̼,_N,linguolabial
103
+ ̪,_d,dental
104
+ ̺,_a,apical
105
+ ̻,_m,laminal
106
+ ̹,_O,more rounded
107
+ ̜,_c,less rounded
108
+ ̟,_+,advanced
109
+ ̠,_-,retracted
110
+ ̈,"_""",centralized
111
+ ̽,_x,mid-centralized
112
+ ̩,=,syllabic
113
+ ̯,_^,non-syllabic
114
+ ʷ,_w,labialized
115
+ ʲ,',palatalized
116
+ ˠ,_G,velarized
117
+ ˤ,_?\,pharyngealized
118
+ ̴,_e,velarized or pharyngealized
119
+ ̝,_r,raised
120
+ ̞,_o,lowered
121
+ ̃,~,nasalized
122
+ ⁿ,_n,nasal release
123
+ ˡ,_l,lateral release
124
+ ̚,_},not audibly released
125
+ ̘,_A,advanced tongue root
126
+ ̙,_q,retracted tongue root
127
+ ̋,_T,extra high tone
128
+ ́,_H,high tone
129
+ ̄,_M,mid tone
130
+ ̀,_L,low tone
131
+ ̏,_B,extra low tone
132
+ ˈ,"""",(primary) stress mark
133
+ ˌ,%,secondary stress
134
+ ː,:,length mark
135
+ ˑ,:\,half-length
136
+ ̆,_X,extra-short
137
+ .,.,syllable break
138
+ ʍ,W,vl labial-velar fricative
139
+ w,w,vd labio-velar approximant
140
+ ɥ,H,labial-palatal approximant
141
+ ʜ,H\,vl epiglottal fricative
142
+ ʢ,<\,vl epiglottal fricative
143
+ ʡ,>\,vl epiglottal plosive
144
+ ɕ,s\,vl alveolopalatal fricative
145
+ ʑ,z\,vd alveolopalatal fricative
146
+ ʘ,O\,bilabial click
147
+ ǀ,|\,dental click
148
+ ǃ,!\,click
149
+ ǂ,'=\,alveolar click
150
+ ǁ,|\|\,alveolar lateral click
151
+ ɺ,l\,vl alveolar lateral flap
152
+ ɜ,3,open-mid central
153
+ ʛ,G\_<,vl uvular implosive
154
+ ɚ,@`,rhotacized schwa
155
+ ɞ,3\,open-mid central rounded
156
+ ɦ,h\,vd glottal fricative
157
+ ɫ,5,velarized vl alveolar lateral
158
+ ʄ,J\_<,vl palatal implosive
159
+ ʼ,_>,ejective
160
+ ɝ,3`,rhotacized open-mid central
161
+ t͡ʃ,tS,vl postalveolar affricate
162
+ d͡ʒ,dZ,vd postalveolar affricate
163
+ t͡ɕ,ts\,vl alveolo-palatal affricate
164
+ d͡ʑ,dz\,vd alveolo-palatal affricate
165
+ t͡ɬ,tK,vl alveolar lateral affricate
166
+ k͡p,kp,vl labial-velar plosive
167
+ g͡b,gb,vd labial-velar plosive
168
+ ŋ͡m,Nm,labial-velar nasal stop
169
+ ʈ͡ʂ,ts`,vl retroflex affricate
170
+ ɖ͡ʐ,dz`,vd retroflex affricate
171
+ ˩,_B,extra low tone
172
+ ˨,_L,low tone
173
+ ˧,_M,mid tone
174
+ ˦,_H,high tone
175
+ ˥,_T,extra high tone
epitran/data/map/rhg-lroh.csv ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Orth,Phon
2
+ b,b
3
+ d,d
4
+ ḍ,ɖ
5
+ f,f
6
+ g,g
7
+ h,h
8
+ j,d͡ʒ
9
+ k,k
10
+ l,l
11
+ m,m
12
+ n,n
13
+ p,p
14
+ r,ɾ
15
+ ṛ,ɽ
16
+ s,s
17
+ š,ʃ
18
+ t,t
19
+ ṭ,ʈ
20
+ v,v
21
+ w,w
22
+ y,j
23
+ z,z
24
+ ã,ɑ̃
25
+ a,ɑ
26
+ ẽ,ẽ
27
+ e,e
28
+ ĩ,ĩ
29
+ i,i
30
+ õ,ɔ̃
31
+ o,ɔ
32
+ ũ,ũ
33
+ u,u
epitran/data/map/rhg-roheng.csv ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Orth,Phon
2
+ b,b
3
+ c,ʃ
4
+ ç,ɽ
5
+ d,d
6
+ f,f
7
+ g,g
8
+ h,h
9
+ j,d͡ʒ
10
+ k,k
11
+ l,l
12
+ m,m
13
+ n,n
14
+ p,p
15
+ q,q
16
+ r,ɾ
17
+ s,s
18
+ t,t
19
+ v,v
20
+ w,w
21
+ x,ks
22
+ y,j
23
+ z,z
24
+ dh,ɖ
25
+ th,ʈ
26
+ a,ɑ
27
+ añ,ɑ̃
28
+ e,e
29
+ eñ,ẽ
30
+ i,i
31
+ iñ,ĩ
32
+ o,ɔ
33
+ oñ,ɔ̃
34
+ u,u
35
+ uñ,ũ
epitran/data/post/rhg-lroh.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ɑ̃ɑ -> ɑ̃ː / _
2
+ ɑɑ̃ -> ɑ̃ː / _
3
+ ɑɑ -> ɑː / _
4
+
5
+ ẽe -> ẽː / _
6
+ eẽ -> ẽː / _
7
+ ee -> eː / _
8
+
9
+ ĩi -> ĩː / _
10
+ iĩ -> ĩː / _
11
+ ii -> iː / _
12
+
13
+ ɔ̃ɔ -> ɔ̃ː / _
14
+ ɔɔ̃ -> ɔ̃ː / _
15
+ ɔɔ -> ɔː / _
16
+
17
+ ũu -> ũː / _
18
+ uũ -> ũː / _
19
+ uu -> uː / _
epitran/data/post/rhg-roheng.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ɑɑ̃ -> ɑ̃ː / _
2
+ ɑɑ -> ɑː / _
3
+
4
+ eẽ -> ẽː / _
5
+ ee -> eː / _
6
+
7
+ iĩ -> ĩː / _
8
+ ii -> iː / _
9
+
10
+ ɔɔ̃ -> ɔ̃ː / _
11
+ ɔɔ -> ɔː / _
12
+
13
+ uũ -> ũː / _
14
+ uu -> uː / _
epitran/data/pre/rhg-lroh.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ::vowel:: = a|ã|e|ẽ|i|ĩ|o|õ|u|ũ
2
+ ::consonant:: = b|d|ḍ|f|g|h|j|k|l|m|n|p|r|ṛ|s|š|t|ṭ|v|w|y|z
3
+
4
+ % remove stress marks
5
+ á -> a / _
6
+ é -> e / _
7
+ í -> i / _
8
+ ó -> o / _
9
+ ú -> u / _
10
+
11
+ % vowel glides
12
+ w -> 0 / (u|ũ) _ (a|o|e)
13
+ y -> 0 / (i|ĩ) _ (a|e|o|u)
14
+
15
+ % long vowels
16
+
17
+ % gemination
epitran/data/pre/rhg-roheng.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ::vowel:: = a|e|i|o|u
2
+ ::consonant:: = b|c|ç|d|f|g|h|j|k|l|m|n|p|q|r|s|t|v|w|x|y|z
3
+
4
+ % remove stress marks
5
+ á -> a / _
6
+ é -> e / _
7
+ í -> i / _
8
+ ó -> o / _
9
+ ú -> u / _
10
+
11
+ % vowel glides
12
+ w -> 0 / (u|uñ) _ (a|o|e)
13
+ y -> 0 / (i|iñ) _ (a|e|o|u)
epitran/data/puncnorm.csv ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ Punctuation,NormalizedForm
2
+ ‘,'
3
+ ’,'
4
+ ʼ,'
5
+ ʻ,'
6
+ ”,""""
7
+ “,""""
8
+ 。,.
9
+ ,,","