Spaces:
Sleeping
Sleeping
Initial file upload
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- LICENSE +21 -0
- README.md +1 -13
- __pycache__/config.cpython-310.pyc +0 -0
- __pycache__/functions.cpython-310.pyc +0 -0
- app.py +37 -0
- config.py +5 -0
- epitran/__init__.py +2 -0
- epitran/__pycache__/__init__.cpython-310.pyc +0 -0
- epitran/__pycache__/__init__.cpython-311.pyc +0 -0
- epitran/__pycache__/_epitran.cpython-310.pyc +0 -0
- epitran/__pycache__/_epitran.cpython-311.pyc +0 -0
- epitran/__pycache__/cedict.cpython-310.pyc +0 -0
- epitran/__pycache__/download.cpython-310.pyc +0 -0
- epitran/__pycache__/epihan.cpython-310.pyc +0 -0
- epitran/__pycache__/exceptions.cpython-310.pyc +0 -0
- epitran/__pycache__/flite.cpython-310.pyc +0 -0
- epitran/__pycache__/ligaturize.cpython-310.pyc +0 -0
- epitran/__pycache__/ppprocessor.cpython-310.pyc +0 -0
- epitran/__pycache__/puncnorm.cpython-310.pyc +0 -0
- epitran/__pycache__/reromanize.cpython-310.pyc +0 -0
- epitran/__pycache__/rules.cpython-310.pyc +0 -0
- epitran/__pycache__/simple.cpython-310.pyc +0 -0
- epitran/__pycache__/stripdiacritics.cpython-310.pyc +0 -0
- epitran/__pycache__/xsampa.cpython-310.pyc +0 -0
- epitran/_epitran.py +129 -0
- epitran/backoff.py +89 -0
- epitran/bin/connl2engipaspace.py +79 -0
- epitran/bin/connl2ipaspace.py +100 -0
- epitran/bin/decompose.py +13 -0
- epitran/bin/detectcaps.py +25 -0
- epitran/bin/epitranscribe.py +26 -0
- epitran/bin/isbijective.py +31 -0
- epitran/bin/ltf2ipaspace.py +53 -0
- epitran/bin/migraterules.py +40 -0
- epitran/bin/reromanize.py +22 -0
- epitran/bin/space2punc.py +24 -0
- epitran/bin/testvectorgen.py +35 -0
- epitran/bin/transltf.py +20 -0
- epitran/bin/uigtransliterate.py +10 -0
- epitran/bin/vie-tones.py +44 -0
- epitran/cedict.py +76 -0
- epitran/data/arpabet.csv +46 -0
- epitran/data/ipa-xsampa.csv +175 -0
- epitran/data/map/rhg-lroh.csv +33 -0
- epitran/data/map/rhg-roheng.csv +35 -0
- epitran/data/post/rhg-lroh.txt +19 -0
- epitran/data/post/rhg-roheng.txt +14 -0
- epitran/data/pre/rhg-lroh.txt +17 -0
- epitran/data/pre/rhg-roheng.txt +13 -0
- epitran/data/puncnorm.csv +9 -0
LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2023 Micah Geyman
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
README.md
CHANGED
@@ -1,13 +1 @@
|
|
1 |
-
|
2 |
-
title: Rhg Script Converter Ui
|
3 |
-
emoji: 👁
|
4 |
-
colorFrom: blue
|
5 |
-
colorTo: blue
|
6 |
-
sdk: gradio
|
7 |
-
sdk_version: 4.8.0
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
-
license: mit
|
11 |
-
---
|
12 |
-
|
13 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
+
# rhg-script-converter-ui
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__pycache__/config.cpython-310.pyc
ADDED
Binary file (275 Bytes). View file
|
|
__pycache__/functions.cpython-310.pyc
ADDED
Binary file (2.18 kB). View file
|
|
app.py
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
from functions import convert_script
from config import scripts

# The first two configured scripts serve as the default conversion direction.
DEFAULT_INPUT_SCRIPT = list(scripts.keys())[0]
DEFAULT_OUTPUT_SCRIPT = list(scripts.keys())[1]

def process_text(input_script, output_script, input_text, uploaded_file=None):
    """Convert text between Rohingya scripts.

    :param input_script: display name of the source script (key of `scripts`)
    :param output_script: display name of the target script (key of `scripts`)
    :param input_text: text typed into the textbox
    :param uploaded_file: optional raw bytes of an uploaded file; when
        present it takes precedence over `input_text` and is decoded as UTF-8
    :return: (converted text, path of a file containing it for download)
    """
    if uploaded_file is not None:
        input_text = uploaded_file.decode("utf-8")

    output_text = convert_script(scripts[input_script], scripts[output_script], input_text)

    output_filename = "output.txt"
    # Write explicitly as UTF-8: the converted text is non-ASCII and the
    # platform default encoding may not be able to represent it.
    with open(output_filename, "w", encoding="utf-8") as file:
        file.write(output_text)

    return output_text, output_filename

with gr.Blocks(title="Rohingya Script Converter") as page:
    gr.Markdown("## Rohingya Script Converter")
    with gr.Row():
        # Materialize the dict views as lists: Gradio expects a list of choices,
        # and dict_keys views are not reliably accepted.
        input_script = gr.Dropdown(label="Choose the input script:", choices=list(scripts.keys()), value=DEFAULT_INPUT_SCRIPT)
        output_script = gr.Dropdown(label="Choose the output script:", choices=list(scripts.keys()), value=DEFAULT_OUTPUT_SCRIPT)
    with gr.Row():
        input_text = gr.Textbox(label="Input Text", placeholder="Enter text here or upload a file", lines=5)
        output_text = gr.Textbox(label="Output Text", placeholder="Converted text will appear here", lines=5, interactive=False)
    with gr.Row():
        input_file = gr.File(label="Upload Text File", file_count="single", type="binary")
        download_link = gr.File(label="Download Converted File")
    gr.Button("Convert").click(
        process_text,
        inputs=[input_script, output_script, input_text, input_file],
        outputs=[output_text, download_link]
    )

page.launch(share=True)
|
config.py
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Mapping from human-readable script names (shown in the UI dropdowns in
# app.py) to the epitran language-script codes used for conversion.
scripts = {
    'LearnRohingya':'rhg-lroh',
    'Rohingyalish':'rhg-roheng',
    'Rohingyalish (old)':'rhg-roheng-old'
}
|
epitran/__init__.py
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
from epitran._epitran import Epitran
|
2 |
+
from epitran.reromanize import ReRomanizer
|
epitran/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (274 Bytes). View file
|
|
epitran/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (314 Bytes). View file
|
|
epitran/__pycache__/_epitran.cpython-310.pyc
ADDED
Binary file (6.7 kB). View file
|
|
epitran/__pycache__/_epitran.cpython-311.pyc
ADDED
Binary file (8.71 kB). View file
|
|
epitran/__pycache__/cedict.cpython-310.pyc
ADDED
Binary file (2.93 kB). View file
|
|
epitran/__pycache__/download.cpython-310.pyc
ADDED
Binary file (1.29 kB). View file
|
|
epitran/__pycache__/epihan.cpython-310.pyc
ADDED
Binary file (4.22 kB). View file
|
|
epitran/__pycache__/exceptions.cpython-310.pyc
ADDED
Binary file (577 Bytes). View file
|
|
epitran/__pycache__/flite.cpython-310.pyc
ADDED
Binary file (8.39 kB). View file
|
|
epitran/__pycache__/ligaturize.cpython-310.pyc
ADDED
Binary file (781 Bytes). View file
|
|
epitran/__pycache__/ppprocessor.cpython-310.pyc
ADDED
Binary file (2.14 kB). View file
|
|
epitran/__pycache__/puncnorm.cpython-310.pyc
ADDED
Binary file (1.88 kB). View file
|
|
epitran/__pycache__/reromanize.cpython-310.pyc
ADDED
Binary file (2.53 kB). View file
|
|
epitran/__pycache__/rules.cpython-310.pyc
ADDED
Binary file (4.85 kB). View file
|
|
epitran/__pycache__/simple.cpython-310.pyc
ADDED
Binary file (14.7 kB). View file
|
|
epitran/__pycache__/stripdiacritics.cpython-310.pyc
ADDED
Binary file (1.91 kB). View file
|
|
epitran/__pycache__/xsampa.cpython-310.pyc
ADDED
Binary file (2.24 kB). View file
|
|
epitran/_epitran.py
ADDED
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
import logging
|
3 |
+
from typing import Union
|
4 |
+
|
5 |
+
import panphon.featuretable
|
6 |
+
from epitran.epihan import Epihan, EpihanTraditional
|
7 |
+
from epitran.flite import FliteLexLookup
|
8 |
+
from epitran.puncnorm import PuncNorm
|
9 |
+
from epitran.simple import SimpleEpitran
|
10 |
+
from epitran.xsampa import XSampa
|
11 |
+
|
12 |
+
logger = logging.getLogger('epitran')
|
13 |
+
logger.setLevel(logging.WARNING)
|
14 |
+
|
15 |
+
class Epitran(object):
    """Unified interface for IPA transliteration/transcription

    :param code str: ISO 639-3 plus "-" plus ISO 15924 code of the language/script pair that should be loaded
    :param preproc bool: apply preprocessors
    :param postproc bool: apply postprocessors
    :param ligatures bool: use precomposed ligatures instead of standard IPA
    :param cedict_filename str: path to file containing the CC-CEDict dictionary
    :param rev boolean: use reverse transliteration
    :param rev_preproc bool: if True, apply preprocessors when reverse transliterating
    :param rev_postproc bool: if True, apply postprocessors when reverse transliterating
    """
    # Language-script pairs that need a dedicated backend instead of SimpleEpitran.
    special = {'eng-Latn': FliteLexLookup,
               'cmn-Hans': Epihan,
               'cmn-Hant': EpihanTraditional}

    def __init__(self, code: str, preproc: bool=True, postproc: bool=True, ligatures: bool=False,
                 cedict_file: Union[str, None]=None, rev: bool=False,
                 rev_preproc: bool=True, rev_postproc: bool=True, tones: bool=False):
        """Constructor method"""
        if code in self.special:
            self.epi = self.special[code](ligatures=ligatures, cedict_file=cedict_file, tones=tones)
        else:
            self.epi = SimpleEpitran(code, preproc, postproc, ligatures, rev, rev_preproc, rev_postproc, tones=tones)
        self.ft = panphon.featuretable.FeatureTable()
        self.xsampa = XSampa()
        self.puncnorm = PuncNorm()

    def transliterate(self, word: str, normpunc: bool=False, ligatures: bool=False) -> str:
        """Transliterates/transcribes a word into IPA

        :param word str: word to transcribe
        :param normpunc bool: if True, normalize punctuation
        :param ligatures bool: if True, use precomposed ligatures instead of standard IPA
        :return: An IPA string corresponding to the input orthographic string
        :rtype: str
        """
        return self.epi.transliterate(word, normpunc, ligatures)

    def reverse_transliterate(self, ipa: str) -> str:
        """Reconstructs word from IPA. Does the reverse of transliterate()

        :param ipa str: An IPA representation of a word
        :return: An orthographic representation of the word
        :rtype: str
        """
        return self.epi.reverse_transliterate(ipa)

    def strict_trans(self, word: str, normpunc: bool=False, ligatures: bool=False) -> str:
        """Transliterate a word into IPA, ignoring all characters that cannot be recognized.

        :param word str: word to transcribe
        :param normpunc bool, optional: if True, normalize punctuation
        :param ligatures bool, optional: if True, use precomposed ligatures instead of standard IPA
        :return: An IPA string corresponding to the input orthographic string, with all unconverted characters omitted
        :rtype: str
        """
        return self.epi.strict_trans(word, normpunc, ligatures)

    def trans_list(self, word: str, normpunc: bool=False, ligatures: bool=False) -> "list[str]":
        """Transliterates/transcribes a word into list of IPA phonemes

        :param word str: word to transcribe
        :param normpunc bool, optional: if True, normalize punctuation
        :param ligatures bool, optional: if True, use precomposed ligatures instead of standard IPA
        :return: list of IPA strings, each corresponding to a segment
        :rtype: list[str]
        """
        return self.ft.segs_safe(self.epi.transliterate(word, normpunc, ligatures))

    def trans_delimiter(self, text: str, delimiter: str=str(' '), normpunc: bool=False, ligatures: bool=False):
        """Return IPA transliteration with a delimiter between segments

        :param text str: An orthographic text
        :param delimiter str, optional: A string to insert between segments
        :param normpunc bool, optional: If True, normalize punctuation
        :param ligatures bool, optional: If True, use precomposed ligatures instead of standard IPA
        :return: String of IPA phonemes separated by `delimiter`
        :rtype: str
        """
        return delimiter.join(self.trans_list(text, normpunc=normpunc,
                                              ligatures=ligatures))

    def xsampa_list(self, word: str, normpunc: bool=False, ligaturize: bool=False):
        """Transliterates/transcribes a word as X-SAMPA

        :param word str: An orthographic word
        :param normpunc bool, optional: If True, normalize punctuation
        :param ligatures bool, optional: If True, use precomposed ligatures instead of standard IPA
        :return: List of X-SAMPA strings corresponding to `word`
        :rtype: list[str]
        """
        # strict_trans so unrecognized characters never reach the X-SAMPA mapper.
        ipa_segs = self.ft.ipa_segs(self.epi.strict_trans(word, normpunc,
                                                          ligaturize))
        return list(map(self.xsampa.ipa2xs, ipa_segs))

    def word_to_tuples(self, word: str, normpunc: bool=False, _ligaturize: bool=False):
        """Given a word, returns a list of tuples corresponding to IPA segments. The "feature
        vectors" form a list consisting of (segment, vector) pairs.
        For IPA segments, segment is a substring of phonetic_form such that the
        concatenation of all segments in the list is equal to the phonetic_form.
        The vectors are a sequence of integers drawn from the set {-1, 0, 1}
        where -1 corresponds to '-', 0 corresponds to '0', and 1 corresponds to
        '+'.

        :param word str: An orthographic word
        :param normpunc bool, optional: If True, normalize punctuation
        :param ligatures bool, optional: If True, use precomposed ligatures instead of standard IPA
        :return: A list of tuples corresponding to IPA segments
        :rtype: list[tuple[str, str, str, str, list[int]]]
        """
        try:
            return self.epi.word_to_tuples(word, normpunc)
        except AttributeError as err:
            # Chain from the caught exception instance; the original chained
            # `from AttributeError` (the class), losing the real traceback.
            raise AttributeError('Method word_to_tuples not yet implemented for this language-script pair!') from err
|
epitran/backoff.py
ADDED
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
from __future__ import (print_function, absolute_import,
|
3 |
+
unicode_literals)
|
4 |
+
|
5 |
+
import regex as re
|
6 |
+
from . import _epitran
|
7 |
+
import panphon.featuretable
|
8 |
+
from epitran.puncnorm import PuncNorm
|
9 |
+
from epitran.xsampa import XSampa
|
10 |
+
from epitran.stripdiacritics import StripDiacritics
|
11 |
+
|
12 |
+
|
13 |
+
class Backoff(object):
    """Implements rudimentary language ID and backoff."""

    def __init__(self, lang_script_codes, cedict_file=None):
        """Construct a Backoff object.

        Args:
            lang_script_codes (list): codes for languages to try, starting
                with the highest priority languages
            cedict_file (str): path to the CC-CEdict dictionary file
                (necessary only when cmn-Hans or cmn-Hant are used)
        """
        self.langs = [_epitran.Epitran(c, cedict_file=cedict_file)
                      for c in lang_script_codes]
        self.num_re = re.compile(r'\p{Number}+')
        self.ft = panphon.featuretable.FeatureTable()
        self.xsampa = XSampa()
        self.puncnorm = PuncNorm()
        # One diacritic stripper per language, index-aligned with self.langs.
        self.dias = [StripDiacritics(c) for c in lang_script_codes]

    def transliterate(self, token):
        """Return IPA transliteration given by first acceptable mode.
        Args:
            token (unicode): orthographic text
        Returns:
            unicode: transliteration as Unicode IPA string
        """
        tr_list = []
        while token:
            is_outside_lang = True
            # Try each language in priority order; greedily consume as much
            # of the token as that language's regexp will match.
            for dia, lang in zip(self.dias, self.langs):
                source = ''
                while True:
                    m = lang.epi.regexp.match(dia.process(token))
                    if not m:
                        break
                    s = m.group()
                    token = token[len(s):]
                    source += s
                    is_outside_lang = False
                tr_list.append(lang.transliterate(source))
            if is_outside_lang:
                # No language matched at all: pass through digit runs
                # verbatim; otherwise emit one raw character and continue.
                m = re.match(r'\p{Number}+', token)
                if m:
                    source = m.group()
                    tr_list.append(source)
                    token = token[len(source):]
                else:
                    tr_list.append(token[0])
                    token = token[1:]
        return ''.join(tr_list)

    def trans_list(self, token):
        """Transliterate/transcribe a word into list of IPA phonemes.

        Args:
            token (unicode): word to transcribe; unicode string

        Returns:
            list: list of IPA unicode strings, each corresponding to a segment
        """
        return self.ft.segs_safe(self.transliterate(token))

    def xsampa_list(self, token):
        """Transcribe a word into a list of X-SAMPA phonemes.

        Args:
            token (unicode): word to transcribe; unicode strings

        Returns:
            list: list of X-SAMPA strings, each corresponding to a segment
        """
        # NOTE(review): pure-digit tokens return '' (a string), not [] —
        # inconsistent with the documented list return, kept for compatibility.
        if re.match(r'^\p{Number}+$', token):
            return ''
        else:
            ipa_segs = self.ft.ipa_segs(self.transliterate(token))
            return list(map(self.xsampa.ipa2xs, ipa_segs))
|
epitran/bin/connl2engipaspace.py
ADDED
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
|
3 |
+
import argparse
|
4 |
+
import codecs
|
5 |
+
import logging
|
6 |
+
from collections import Counter
|
7 |
+
|
8 |
+
import unicodecsv as csv
|
9 |
+
|
10 |
+
import epitran
|
11 |
+
import epitran.flite
|
12 |
+
import panphon
|
13 |
+
|
14 |
+
logger = logging.getLogger('epitran')
|
15 |
+
|
16 |
+
|
17 |
+
def normpunc(flite, s):
    """Return *s* with each character mapped through flite's puncnorm table."""
    def norm(c):
        if c in flite.puncnorm:
            return flite.puncnorm[c]
        else:
            return c
    return ''.join(map(norm, s))


def add_record(flite, ft, orth):
    """Count the IPA segments of one orthographic token.

    Consumes the transliteration greedily by longest single-segment prefix;
    characters that form no segment are counted individually.
    """
    space = Counter()
    orth = normpunc(flite, orth)
    trans = flite.transliterate(orth)
    while trans:
        pref = ft.longest_one_seg_prefix(trans)
        if pref != '':
            space[pref] += 1
            trans = trans[len(pref):]
        else:
            # The original tested `trans[0] in flite.puncnorm_vals` here, but
            # both branches incremented the same key — the test was redundant.
            space[trans[0]] += 1
            trans = trans[1:]
    return space


def add_file(flite, ft, fn):
    """Accumulate segment counts over the first column of a CONLL file."""
    space = Counter()
    with codecs.open(fn, 'r', 'utf-8') as f:
        for line in f:
            fields = line.split(u'\t')
            if len(fields) > 0:
                orth = fields[0]
                space.update(add_record(flite, ft, orth))
    logger.debug(u'Length of counter:\t{}'.format(len(space)))
    return space


def print_space(output, space):
    """Write (index, segment) CSV rows for the sorted, non-empty segments."""
    pairs = enumerate(sorted(filter(lambda x: x, space.keys())))
    with open(output, 'wb') as f:
        writer = csv.writer(f, encoding='utf-8')
        for i, char in pairs:
            writer.writerow((i, char))


def main(infiles, output):
    """Build the English IPA segment space from CONLL files and save it."""
    flite = epitran.flite.Flite()
    ft = panphon.FeatureTable()
    space = Counter()
    for fn in infiles:
        logger.debug(u'Scanning:\t{}'.format(fn).encode('utf-8'))
        space.update(add_file(flite, ft, fn))
    print_space(output, space)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '--output', help='Output file.')
    parser.add_argument('infiles', nargs='+', help='CONLL files serving as basis for segment space.')
    args = parser.parse_args()
    main(args.infiles, args.output)
|
epitran/bin/connl2ipaspace.py
ADDED
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
|
3 |
+
import argparse
|
4 |
+
import codecs
|
5 |
+
import logging
|
6 |
+
from collections import Counter
|
7 |
+
|
8 |
+
import epitran
|
9 |
+
import panphon
|
10 |
+
import unicodecsv as csv
|
11 |
+
|
12 |
+
logger = logging.getLogger('epitran')
|
13 |
+
|
14 |
+
|
15 |
+
def normpunc(epi, s):
    """Return *s* with each character mapped through epi's puncnorm table."""
    def norm(c):
        if c in epi.puncnorm:
            return epi.puncnorm[c]
        else:
            return c
    return ''.join(map(norm, s))


def add_record_gen(epi, ft, orth):
    """Count the IPA segments of one token (general-script variant)."""
    space = Counter()
    orth = normpunc(epi, orth)
    trans = epi.transliterate(orth)
    while trans:
        # Consume the longest single-segment prefix; fall back to one char.
        pref = ft.longest_one_seg_prefix(trans)
        if pref != '':
            space[pref] += 1
            trans = trans[len(pref):]
        else:
            space[trans[0]] += 1
            trans = trans[1:]
    return space


def add_file_gen(epi, ft, fn):
    """Accumulate segment counts over the first column of a CONLL file."""
    space = Counter()
    with codecs.open(fn, 'r', 'utf-8') as f:
        for line in f:
            fields = line.split(u'\t')
            if len(fields) > 0:
                orth = fields[0]
                space.update(add_record_gen(epi, ft, orth))
    logger.debug(u'Length of counter:\t{}'.format(len(space)))
    return space


def add_file_op(epi, ft, fn):
    """Like add_file_gen, but for scripts using punctuation as letters:
    punctuation is normalized per leftover character at counting time
    instead of before transliteration.
    """
    space = Counter()
    with codecs.open(fn, 'r', 'utf-8') as f:
        for line in f:
            fields = line.split(u'\t')
            if len(fields) > 0:
                orth = fields[0]
                trans = epi.transliterate(orth)
                while trans:
                    pref = ft.longest_one_seg_prefix(trans)
                    if pref != '':
                        space[pref] += 1
                        trans = trans[len(pref):]
                    else:
                        if trans[0] in epi.puncnorm:
                            space[epi.puncnorm[trans[0]]] += 1
                        else:
                            space[trans[0]] += 1
                        trans = trans[1:]
    logger.debug(u'Length of counter:\t{}'.format(len(space)))
    return space


def print_space(output, space):
    """Write (index, segment) CSV rows for the sorted, non-empty segments."""
    pairs = enumerate(sorted(filter(lambda x: x, space.keys())))
    with open(output, 'wb') as f:
        writer = csv.writer(f, encoding='utf-8')
        for i, char in pairs:
            writer.writerow((i, char))


def main(code, op, infiles, output):
    """Build the segment space for language *code* from CONLL files."""
    epi = epitran.Epitran(code)
    ft = panphon.FeatureTable()
    space = Counter()
    for fn in infiles:
        logger.debug(u'Scanning:\t{}'.format(fn).encode('utf-8'))
        # --op selects the punctuation-as-letters variant.
        add_file = add_file_op if op else add_file_gen
        space.update(add_file(epi, ft, fn))
    print_space(output, space)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', '--op', action='store_true', help='Script uses punctuation as (parts of) letters.')
    parser.add_argument('-c', '--code', help='Script code for CONNL files.')
    parser.add_argument('-o', '--output', help='Output file.')
    parser.add_argument('infiles', nargs='+', help='CONLL files serving as basis for segment space.')
    args = parser.parse_args()
    main(args.code, args.op, args.infiles, args.output)
|
epitran/bin/decompose.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
|
3 |
+
import unicodedata
|
4 |
+
import sys
|
5 |
+
|
6 |
+
|
7 |
+
def main(fn):
    """Print the NFD-decomposed contents of the UTF-8 file *fn*."""
    with open(fn, encoding='utf-8') as stream:
        decomposed = unicodedata.normalize('NFD', stream.read())
    print(decomposed)


if __name__ == '__main__':
    main(sys.argv[1])
|
epitran/bin/detectcaps.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
from __future__ import print_function
|
4 |
+
|
5 |
+
import unicodedata
|
6 |
+
import fileinput
|
7 |
+
|
8 |
+
|
9 |
+
def main():
|
10 |
+
for line in fileinput.input():
|
11 |
+
line = line.decode('utf-8')
|
12 |
+
token = line.strip()
|
13 |
+
if len(token) > 1 and unicodedata.category(token[1]) == 'Lu':
|
14 |
+
is_cap = 0
|
15 |
+
elif len(token) > 0 and unicodedata.category(token[0]) == 'Lu':
|
16 |
+
is_cap = 1
|
17 |
+
else:
|
18 |
+
is_cap = 0
|
19 |
+
line = u'{}\t{}'.format(is_cap, token)
|
20 |
+
line = line.encode('utf-8')
|
21 |
+
print(line)
|
22 |
+
|
23 |
+
|
24 |
+
if __name__ == '__main__':
|
25 |
+
main()
|
epitran/bin/epitranscribe.py
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
import sys
|
5 |
+
import unicodedata
|
6 |
+
import epitran
|
7 |
+
import argparse
|
8 |
+
|
9 |
+
|
10 |
+
def main(code):
    """Stream STDIN through an Epitran transliterator for *code*,
    lowercasing and NFD-normalizing each line, writing IPA to STDOUT.

    The original decoded/encoded each line (Python 2 idiom); on Python 3
    sys.stdin yields str and str.decode raises AttributeError, so the
    round-trip is removed.
    """
    epi = epitran.Epitran(code)
    for line in sys.stdin:
        line = unicodedata.normalize('NFD', line.lower())
        sys.stdout.write(epi.transliterate(line))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description=u'Converts text from STDIN (in the language specified),' +
        'into Unicode IPA and emits it to STDOUT.')
    parser.add_argument('code', help=u'ISO 639-3 code for conversion language')
    args = parser.parse_args()
    main(args.code)
|
epitran/bin/isbijective.py
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env pythoh
|
2 |
+
from __future__ import print_function
|
3 |
+
|
4 |
+
import glob
|
5 |
+
|
6 |
+
import unicodecsv as csv
|
7 |
+
|
8 |
+
|
9 |
+
def read_map(fn):
    """Return the (source, target) pairs from a two-column mapping CSV.

    The first row is assumed to be a header and is skipped. Rows with
    other than exactly two columns would raise ValueError here.
    """
    with open(fn, 'rb') as f:
        reader = csv.reader(f, encoding='utf-8')
        next(reader)
        return [(a, b) for [a, b] in reader]
|
14 |
+
|
15 |
+
|
16 |
+
def is_bijection(mapping):
    """True iff the pair list maps distinct keys to distinct values."""
    left, right = zip(*mapping)
    return len(set(left)) == len(mapping) == len(set(right))
|
20 |
+
|
21 |
+
|
22 |
+
def main(map_fns):
    """Report, for each mapping file, whether its mapping is a bijection."""
    for path in map_fns:
        print('{}\t{}'.format(path, is_bijection(read_map(path))))


if __name__ == '__main__':
    map_fns = glob.glob('../data/*.csv')
    main(map_fns)
|
epitran/bin/ltf2ipaspace.py
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
from __future__ import print_function
|
4 |
+
|
5 |
+
import argparse
|
6 |
+
import glob
|
7 |
+
import os.path
|
8 |
+
|
9 |
+
from lxml import etree
|
10 |
+
import unicodecsv as csv
|
11 |
+
|
12 |
+
import epitran
|
13 |
+
import panphon.featuretable
|
14 |
+
|
15 |
+
|
16 |
+
def read_tokens(fn):
    """Return the text of every TOKEN element in an LTF XML file."""
    tree = etree.parse(fn)
    root = tree.getroot()
    return [tok.text for tok in root.findall('.//TOKEN')]


def read_input(input_, langscript):
    """Collect the set of IPA segments over all LTF files.

    `input_` is a list of lists of directory names (argparse nargs='+'
    combined with action='append'); note only the first group is scanned.
    """
    space = set()
    epi = epitran.Epitran(langscript)
    ft = panphon.featuretable.FeatureTable()
    for dirname in input_[0]:
        for fn in glob.glob(os.path.join(dirname, '*.ltf.xml')):
            for token in read_tokens(fn):
                ipa = epi.transliterate(token)
                for seg in ft.segs_safe(ipa):
                    space.add(seg)
    return space


def write_output(output, space):
    """Write (index, segment) CSV rows for the sorted segment set."""
    with open(output, 'wb') as f:
        writer = csv.writer(f, encoding='utf-8')
        for n, ch in enumerate(sorted(list(space))):
            writer.writerow((n, ch))


def main(langscript, input_, output):
    """Build and save the IPA segment space for *langscript*."""
    space = read_input(input_, langscript)
    write_output(output, space)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--code', help='language-script code')
    parser.add_argument('-i', '--input', nargs='+', action='append', help='Directories where input LTF files are found')
    parser.add_argument('-o', '--output', help='Output file')
    args = parser.parse_args()
    main(args.code, args.input, args.output)
|
epitran/bin/migraterules.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env Python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
from __future__ import (print_function, unicode_literals, absolute_import)
|
5 |
+
|
6 |
+
import glob
|
7 |
+
import re
|
8 |
+
import io
|
9 |
+
|
10 |
+
import unicodecsv
|
11 |
+
|
12 |
+
|
13 |
+
def build_rule(fields):
    """Format a 4-field CSV row as a context rule 'a -> b / X _ Y'.

    Empty source/target fields render as "0" (the null symbol);
    malformed rows are reported and yield None.
    """
    try:
        src, tgt, left, right = fields
    except ValueError:
        print('Malformed rule: {}'.format(','.join(fields)))
        return None
    src = src if src else "0"
    tgt = tgt if tgt else "0"
    return '{} -> {} / {} _ {}'.format(src, tgt, left, right)
|
21 |
+
|
22 |
+
|
23 |
+
def main():
    """Convert every rule CSV in the working directory into a .txt rule file.

    Comment rows (first field starting with '%') are copied through;
    other rows are rendered as context rules via build_rule().
    """
    for csv in glob.glob('*.csv'):
        txt = re.match('[A-Za-z-]+', csv).group(0) + '.txt'
        with open(csv, 'rb') as f, io.open(txt, 'w', encoding='utf-8') as g:
            reader = unicodecsv.reader(f, encoding='utf-8')
            next(reader)  # skip header row
            for fields in reader:
                # Raw string: '\s' in a plain literal is an invalid escape
                # (SyntaxWarning, and an error on recent Python versions).
                if re.match(r'\s*%', fields[0]):
                    print(','.join([x for x in fields if x]), file=g)
                else:
                    rule = build_rule(fields)
                    rule = re.sub('[ ]+', ' ', rule)
                    rule = re.sub('[ ]$', '', rule)
                    print(rule, file=g)


if __name__ == '__main__':
    main()
|
epitran/bin/reromanize.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python2
|
2 |
+
|
3 |
+
from __future__ import print_function
|
4 |
+
|
5 |
+
import epitran.reromanize
|
6 |
+
import argparse
|
7 |
+
import sys
|
8 |
+
|
9 |
+
def main(code, table):
    """Re-romanize tab-separated tokens read from stdin.

    Args:
        code (str): language-script code (e.g. 'ori-Orya')
        table (str): name of the romanization table to apply
    """
    rr = epitran.reromanize.ReRomanizer(code, table)
    for line in sys.stdin:
        # NOTE: Python 2 idiom -- stdin yields bytes, decoded/encoded by hand
        # (the shebang pins python2).
        line = line.decode('utf-8')
        tokens = line.strip().split('\t')
        tokens = [rr.reromanize(x) for x in tokens]
        print('\t'.join(tokens).encode('utf-8'))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # Fixed help-text typo: "Languagee" -> "Language".
    parser.add_argument('-c', '--code', default='ori-Orya', type=str, help='Language and script code')
    parser.add_argument('-t', '--table', default='anglocentric', type=str, help='Romanization table')
    args = parser.parse_args()
    main(args.code, args.table)
|
epitran/bin/space2punc.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
|
3 |
+
import sys
|
4 |
+
import unicodedata
|
5 |
+
import unicodecsv as csv
|
6 |
+
|
7 |
+
|
8 |
+
def main(fns, fnn):
    """Collect single-character punctuation marks from mapping CSVs.

    Args:
        fns (list): input CSV paths whose rows are (orth, symbol) pairs
        fnn (str): output CSV path; one punctuation mark per row
    """
    punc = set()
    for fn in fns:
        # Progress indicator.  Was a Python 2 `print fn` statement, which is
        # a SyntaxError under Python 3; print(fn) behaves identically on both.
        print(fn)
        with open(fn, 'rb') as f:
            reader = csv.reader(f, encoding='utf-8')
            for _, s in reader:
                # Keep single characters whose Unicode category is P* (punctuation).
                if len(s) == 1 and unicodedata.category(s)[0] == u'P':
                    punc.add(s)
    with open(fnn, 'wb') as f:
        writer = csv.writer(f, encoding='utf-8')
        for mark in sorted(list(punc)):
            writer.writerow([mark])


if __name__ == '__main__':
    main(sys.argv[1:-1], sys.argv[-1])
|
epitran/bin/testvectorgen.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
|
3 |
+
from __future__ import print_function
|
4 |
+
|
5 |
+
import argparse
|
6 |
+
import codecs
|
7 |
+
|
8 |
+
import epitran.vector
|
9 |
+
|
10 |
+
|
11 |
+
def main(code, space, infile):
    """Dump phonetic-segment records for the first column of a TSV file.

    For every line of *infile* with at least two tab-separated columns,
    prints the word followed by one record per segment (category, case,
    orthographic form, phonetic form, and feature vector).
    """
    vectorizer = epitran.vector.VectorsWithIPASpace(code, space)
    with codecs.open(infile, 'r', 'utf-8') as handle:
        for line in handle:
            columns = line.split('\t')
            if len(columns) <= 1:
                continue  # skip rows without at least two columns
            word = columns[0]
            print(u"WORD: {}".format(word).encode('utf-8'))
            for cat, case, orth, phon, id_, vector in vectorizer.word_to_segs(word):
                # id_ is unpacked but deliberately not printed.
                print(u"Category: {}".format(cat).encode('utf-8'))
                print(u"Case: {}".format(case).encode('utf-8'))
                print(u"Orthographic: {}".format(orth).encode('utf-8'))
                print(u"Phonetic: {}".format(phon).encode('utf-8'))
                print(u"Vector: {}".format(vector).encode('utf-8'))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--code', required=True, help='Script code.')
    parser.add_argument('-s', '--space', required=True, help='Space.')
    parser.add_argument('-i', '--infile', required=True, help='Input file.')
    args = parser.parse_args()
    main(args.code, args.space, args.infile)
|
epitran/bin/transltf.py
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
from __future__ import print_function
|
3 |
+
|
4 |
+
import sys
|
5 |
+
|
6 |
+
from lxml import etree
|
7 |
+
import epitran
|
8 |
+
import epitran.vector
|
9 |
+
|
10 |
+
def main(fn):
    # Transliterate every TOKEN element of an LTF XML file to IPA, one per line.
    epi = epitran.Epitran('uig-Arab')
    # NOTE(review): vwis is constructed but never used below -- possibly kept
    # for constructor side effects (loading the vector space); confirm before
    # removing.
    vwis = epitran.vector.VectorsWithIPASpace('uig-Arab', ['uig-Arab'])
    tree = etree.parse(fn)
    root = tree.getroot()
    for token in root.findall('.//TOKEN'):
        # print(token.text.encode('utf-8'))
        # NOTE: Python 2 only -- the `unicode` builtin does not exist in Python 3.
        print(epi.transliterate(unicode(token.text)).encode('utf-8'))

if __name__ == '__main__':
    main(sys.argv[1])
|
epitran/bin/uigtransliterate.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
from __future__ import print_function
|
3 |
+
|
4 |
+
import fileinput
|
5 |
+
import epitran
|
6 |
+
|
7 |
+
# Transliterate Uyghur (Arabic script) text from stdin/files to IPA, line by line.
epi = epitran.Epitran('uig-Arab')
for line in fileinput.input():
    # NOTE: Python 2 idiom -- input lines are bytes, decoded/encoded manually;
    # under Python 3 `str` has no .decode and this would fail.
    s = epi.transliterate(line.strip().decode('utf-8'))
    print(s.encode('utf-8'))
|
epitran/bin/vie-tones.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
|
3 |
+
import csv
|
4 |
+
import re
|
5 |
+
import sys
|
6 |
+
import os.path
|
7 |
+
import unicodedata
|
8 |
+
|
9 |
+
|
10 |
+
# Map Vietnamese tone diacritics to Chao tone letters.  Keys must be the NFD
# *combining* marks, since shuffle_tone() normalizes the orthography to NFD.
# The original used spacing acute (U+00B4) and spacing grave (U+0060), which
# never occur in NFD output, so the sac and huyen tones were never detected.
tones = {
    '\u0301': '˧˥',  # combining acute = sac
    '\u0300': '˨˩',  # combining grave = huyen
    '\u0303': '˧˥',  # tilde = nga
    '\u0309': '˧˩˧',  # hook above = hoi
    '\u0323': '˧˩',  # dot below = nang
}


def shuffle_tone(orth, phon):
    """Append the Chao tone letters implied by *orth*'s diacritics to *phon*.

    *orth* is NFD-normalized so tone marks appear as combining characters.
    A syllable containing a vowel but no tone contour gets the mid level
    tone (ngang).
    """
    orth = unicodedata.normalize('NFD', orth)
    if re.search('[aeiouơư]', orth):
        for tone in tones:
            if tone in orth:
                phon += tones[tone]
        if not re.search('[˩˨˧˦˥]', phon):
            phon += '˧'  # no explicit tone mark: level (ngang) tone
    return phon
|
28 |
+
|
29 |
+
|
30 |
+
def main():
    """Rewrite an (orth, phon) CSV with tone letters appended to phon.

    Reads the CSV named by ``sys.argv[1]`` and writes a file with the same
    base name into the current directory, preserving the header row.

    NOTE(review): if the input file already lives in the current directory,
    fnin == fnout and opening it for writing truncates the input before it
    is read -- confirm the intended usage is with inputs from elsewhere.
    """
    fnin = sys.argv[1]
    fnout = os.path.basename(fnin)
    # newline='' is required by the csv module to avoid injecting blank
    # rows on platforms with \r\n line endings.
    with open(fnin, newline='') as fin, open(fnout, 'w', newline='') as fout:
        writer = csv.writer(fout)
        reader = csv.reader(fin)
        header = next(reader)
        writer.writerow(header)
        for orth, phon in reader:
            phon = shuffle_tone(orth, phon)
            writer.writerow([orth, phon])


if __name__ == '__main__':
    main()
|
epitran/cedict.py
ADDED
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
from __future__ import (absolute_import, division, print_function,
|
3 |
+
unicode_literals)
|
4 |
+
|
5 |
+
import codecs
|
6 |
+
|
7 |
+
import marisa_trie
|
8 |
+
import regex as re
|
9 |
+
|
10 |
+
ASCII_CHARS = ''.join([chr(i) for i in range(128)])
|
11 |
+
|
12 |
+
|
13 |
+
class CEDictTrie(object):
    """Trie over the CC-CEDict Chinese-English dictionary.

    Supports longest-prefix lookup and greedy tokenization of hanzi
    strings into dictionary words.
    """
    def __init__(self, cedict_file, traditional=False):
        """Construct a trie over CC-CEDict

        Args:
            cedict_file (str): path to the CC-CEDict dictionary
            traditional (bool): if True, use traditional characters
        """
        self.hanzi = self._read_cedict(cedict_file, traditional=traditional)
        self.trie = self._construct_trie(self.hanzi)

    def _read_cedict(self, cedict_file, traditional=False):
        """Parse CC-CEDict into {hanzi: (pinyin_syllables, english_glosses)}."""
        comment_re = re.compile(r'\s*#')
        lemma_re = re.compile(r'(?P<hanzi>[^]]+) \[(?P<pinyin>[^]]+)\] /(?P<english>.+)/')
        cedict = {}
        with codecs.open(cedict_file, 'r', 'utf-8') as f:
            for line in f:
                if comment_re.match(line):
                    continue
                # Match once per line (the original called lemma_re.match twice).
                match = lemma_re.match(line)
                if match:
                    hanzi = match.group('hanzi').split(' ')
                    pinyin = match.group('pinyin').split(' ')
                    english = match.group('english').split('/')
                    if traditional:
                        cedict[hanzi[0]] = (pinyin, english)  # traditional characters only
                    else:
                        cedict[hanzi[1]] = (pinyin, english)  # simplified characters only.
        return cedict

    def _construct_trie(self, hanzi):
        """Build a marisa RecordTrie keyed on hanzi with UTF-8 pinyin payloads.

        Args:
            hanzi (dict): mapping of hanzi to (pinyin, english) pairs
        """
        pairs = []
        # Iterate the `hanzi` argument (the original ignored it and read
        # self.hanzi, leaving the parameter dead).
        for hz, (py, en) in hanzi.items():
            # Keep only ASCII characters of the joined pinyin (drops tone marks
            # and any stray non-ASCII).
            py = str(''.join(filter(lambda x: x in ASCII_CHARS, ' '.join(py))))
            pairs.append((hz, (py.encode('utf-8'),)))
        return marisa_trie.RecordTrie(str('@s'), pairs)

    def has_key(self, key):
        """Return True if *key* is an entry in the dictionary."""
        return key in self.hanzi

    def prefixes(self, s):
        """Return all dictionary entries that are prefixes of *s*."""
        return self.trie.prefixes(s)

    def longest_prefix(self, s):
        """Return the longest dictionary entry prefixing *s*, or '' if none."""
        prefixes = self.prefixes(s)
        if not prefixes:
            return ''
        return max(prefixes, key=len)

    def tokenize(self, s):
        """Greedily split *s* into longest-prefix dictionary words.

        Characters that begin no dictionary entry are emitted as
        single-character tokens.
        """
        tokens = []
        while s:
            token = self.longest_prefix(s)
            if token:
                tokens.append(token)
                s = s[len(token):]
            else:
                tokens.append(s[0])
                s = s[1:]
        return tokens
|
epitran/data/arpabet.csv
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pau,
|
2 |
+
null,
|
3 |
+
ey,ej
|
4 |
+
ae,æ
|
5 |
+
iy,i
|
6 |
+
eh,ɛ
|
7 |
+
ay,aj
|
8 |
+
ih,ɪ
|
9 |
+
ow,ow
|
10 |
+
aa,ɑ
|
11 |
+
ao,ɔ
|
12 |
+
aw,aw
|
13 |
+
oy,oj
|
14 |
+
ah,ʌ
|
15 |
+
ax,ə
|
16 |
+
uw,u
|
17 |
+
uh,ʊ
|
18 |
+
er,ɹ̩
|
19 |
+
b,b
|
20 |
+
ch,t͡ʃ
|
21 |
+
d,d
|
22 |
+
dx,ɾ
|
23 |
+
f,f
|
24 |
+
g,ɡ
|
25 |
+
hh,h
|
26 |
+
jh,d͡ʒ
|
27 |
+
k,k
|
28 |
+
l,l
|
29 |
+
em,m̩
|
30 |
+
m,m
|
31 |
+
en,n̩
|
32 |
+
n,n
|
33 |
+
ng,ŋ
|
34 |
+
p,p
|
35 |
+
q,ʔ
|
36 |
+
r,ɹ
|
37 |
+
s,s
|
38 |
+
sh,ʃ
|
39 |
+
t,t
|
40 |
+
dh,ð
|
41 |
+
th,θ
|
42 |
+
v,v
|
43 |
+
w,w
|
44 |
+
y,j
|
45 |
+
z,z
|
46 |
+
zh,ʒ
|
epitran/data/ipa-xsampa.csv
ADDED
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
IPA,X-SAMPA,Name
|
2 |
+
p,p,vl bilabial plosive
|
3 |
+
b,b,vd bilabial plosive
|
4 |
+
t,t,vl alveolar plosive
|
5 |
+
d,d,vd alveolar plosive
|
6 |
+
ʈ,t`,vl retroflex plosive
|
7 |
+
ɖ,d`,vd retroflex plosive
|
8 |
+
c,c,vl palatal plosive
|
9 |
+
ɟ,J\,vd palatal plosive
|
10 |
+
k,k,vl velar plosive
|
11 |
+
ɡ,g,vd velar plosive
|
12 |
+
q,q,vl uvular plosive
|
13 |
+
ɢ,G\,vd uvular plosive
|
14 |
+
ʔ,?,glottal plosive
|
15 |
+
m,m,bilabial nasal
|
16 |
+
ɱ,F,vl labiodental nasal
|
17 |
+
n,n,alveolar nasal
|
18 |
+
ɳ,n`,vl retroflex nasal
|
19 |
+
ɲ,J,vl palatal nasal
|
20 |
+
ŋ,N,vl velar nasal
|
21 |
+
ɴ,N\,vl uvular nasal
|
22 |
+
ʙ,B\,vd bilabial trill
|
23 |
+
r,r,vd alveolar trill
|
24 |
+
ʀ,R\,vl uvular trill
|
25 |
+
ɾ,4,vl alveolar tap
|
26 |
+
ɽ,r`,vl retroflex flap
|
27 |
+
ɸ,p\,vl bilabial fricative
|
28 |
+
β,B,vd bilabial fricative
|
29 |
+
f,f,vl labiodental fricative
|
30 |
+
v,v,vd labiodental fricative
|
31 |
+
θ,T,vl dental fricative
|
32 |
+
ð,D,vd dental fricative
|
33 |
+
s,s,vl alveolar fricative
|
34 |
+
z,z,vd alveolar fricative
|
35 |
+
ʃ,S,vl postalveolar fricative
|
36 |
+
ʒ,Z,vd postalveolar fricative
|
37 |
+
ʂ,s`,vl retroflex fricative
|
38 |
+
ʐ,z`,vd retroflex fricative
|
39 |
+
ç,C,vl palatal fricative
|
40 |
+
ʝ,j\,vd palatal fricative
|
41 |
+
x,x,vl velar fricative
|
42 |
+
ɣ,G,vd velar fricative
|
43 |
+
χ,X,vl uvular fricative
|
44 |
+
ʁ,R,vd uvular fricative
|
45 |
+
ħ,X\,vl pharyngeal fricative
|
46 |
+
ʕ,?\,vd pharyngeal fricative
|
47 |
+
h,h,vl glottal fricative
|
48 |
+
ʔ,?,glottal plosive
|
49 |
+
ɬ,K,vl alveolar lateral fricative
|
50 |
+
ɮ,K\,vd alveolar lateral fricative
|
51 |
+
ʋ,P,vd labiodental approximant
|
52 |
+
ɹ,r\,vd (post)alveolar approximant
|
53 |
+
ɻ,r\`,vd retroflex approximant
|
54 |
+
j,j,vd palatal approximant
|
55 |
+
ɰ,M\,vd velar approximant
|
56 |
+
l,l,vd alveolar lateral approximant
|
57 |
+
ɭ,l`,vd retroflex lateral approximant
|
58 |
+
ʎ,L,vd palatal lateral approximant
|
59 |
+
ʟ,L\,vd velar lateral approximant
|
60 |
+
pʼ,p_>,ejective
|
61 |
+
tʼ,t_>,ejective
|
62 |
+
ʈʼ,t`_>,ejective
|
63 |
+
cʼ,c_>,ejective
|
64 |
+
kʼ,k_>,ejective
|
65 |
+
qʼ,q_>,ejective
|
66 |
+
ɓ,b_<,vl bilabial implosive
|
67 |
+
ɗ,d_<,vl alveolar implosive
|
68 |
+
ƙ,k_<,vl velar implosive
|
69 |
+
ɠ,g_<,vl velar implosive
|
70 |
+
i,i,close front unrounded
|
71 |
+
y,y,close front rounded
|
72 |
+
ɨ,1,close central unrounded
|
73 |
+
ʉ,},close central rounded
|
74 |
+
ɯ,M,close back unrounded
|
75 |
+
u,u,close back rounded
|
76 |
+
ɪ,I,lax close front unrounded
|
77 |
+
ʏ,Y,lax close front rounded
|
78 |
+
ʊ,U,lax close back rounded
|
79 |
+
e,e,close-mid front unrounded
|
80 |
+
ø,2,front close-mid rounded
|
81 |
+
ɤ,7,close-mid back unrounded
|
82 |
+
o,o,close-mid back rounded
|
83 |
+
ə,@,schwa
|
84 |
+
ɘ,@\,close-mid central unrounded vowel
|
85 |
+
ɵ,8,rounded schwa
|
86 |
+
ɛ,E,open-mid front unrounded
|
87 |
+
œ,9,front open-mid rounded
|
88 |
+
ʌ,V,open-mid back unrounded
|
89 |
+
ɔ,O,open-mid back rounded
|
90 |
+
æ,{,mid-open front unrounded vowel
|
91 |
+
ɐ,6,open-mid schwa
|
92 |
+
a,a,open front unrounded
|
93 |
+
ă,a_X,extra short open front unrounded
|
94 |
+
ɶ,&,front open rounded
|
95 |
+
ɑ,A,open back unrounded
|
96 |
+
ɒ,Q,open back rounded
|
97 |
+
̥,_0,voiceless
|
98 |
+
̬,_v,voiced
|
99 |
+
ʰ,_h,aspirated
|
100 |
+
̤,_t,breathy voiced
|
101 |
+
̰,_k,creaky voiced
|
102 |
+
̼,_N,linguolabial
|
103 |
+
̪,_d,dental
|
104 |
+
̺,_a,apical
|
105 |
+
̻,_m,laminal
|
106 |
+
̹,_O,more rounded
|
107 |
+
̜,_c,less rounded
|
108 |
+
̟,_+,advanced
|
109 |
+
̠,_-,retracted
|
110 |
+
̈,"_""",centralized
|
111 |
+
̽,_x,mid-centralized
|
112 |
+
̩,=,syllabic
|
113 |
+
̯,_^,non-syllabic
|
114 |
+
ʷ,_w,labialized
|
115 |
+
ʲ,',palatalized
|
116 |
+
ˠ,_G,velarized
|
117 |
+
ˤ,_?\,pharyngealized
|
118 |
+
̴,_e,velarized or pharyngealized
|
119 |
+
̝,_r,raised
|
120 |
+
̞,_o,lowered
|
121 |
+
̃,~,nasalized
|
122 |
+
ⁿ,_n,nasal release
|
123 |
+
ˡ,_l,lateral release
|
124 |
+
̚,_},not audibly released
|
125 |
+
̘,_A,advanced tongue root
|
126 |
+
̙,_q,retracted tongue root
|
127 |
+
̋,_T,extra high tone
|
128 |
+
́,_H,high tone
|
129 |
+
̄,_M,mid tone
|
130 |
+
̀,_L,low tone
|
131 |
+
̏,_B,extra low tone
|
132 |
+
ˈ,"""",(primary) stress mark
|
133 |
+
ˌ,%,secondary stress
|
134 |
+
ː,:,length mark
|
135 |
+
ˑ,:\,half-length
|
136 |
+
̆,_X,extra-short
|
137 |
+
.,.,syllable break
|
138 |
+
ʍ,W,vl labial-velar fricative
|
139 |
+
w,w,vd labio-velar approximant
|
140 |
+
ɥ,H,labial-palatal approximant
|
141 |
+
ʜ,H\,vl epiglottal fricative
|
142 |
+
ʢ,<\,vl epiglottal fricative
|
143 |
+
ʡ,>\,vl epiglottal plosive
|
144 |
+
ɕ,s\,vl alveolopalatal fricative
|
145 |
+
ʑ,z\,vl alveolopalatal fricative
|
146 |
+
ʘ,O\,bilabial click
|
147 |
+
ǀ,|\,dental click
|
148 |
+
ǃ,!\,click
|
149 |
+
ǂ,'=\,alveolar click
|
150 |
+
ǁ,|\|\,alveolar lateral click
|
151 |
+
ɺ,l\,vl alveolar lateral flap
|
152 |
+
ɜ,3,open-mid central
|
153 |
+
ʛ,G\_<,vl uvular implosive
|
154 |
+
ɚ,@`,rhotacized schwa
|
155 |
+
ɞ,3\,open-mid central rounded
|
156 |
+
ɦ,h\,vd glottal fricative
|
157 |
+
ɫ,5,velarized vl alveolar lateral
|
158 |
+
ʄ,J\_<,vl palatal implosive
|
159 |
+
ʼ,_>,ejective
|
160 |
+
ɝ,3`,rhotacized open-mid central
|
161 |
+
t͡ʃ,tS,vl postalveolar affricate
|
162 |
+
d͡ʒ,dZ,vd postalveolar affricate
|
163 |
+
t͡ɕ,ts\,vl alveolo-palatal affricate
|
164 |
+
d͡ʑ,dz\,vd alveolo-palatal affricate
|
165 |
+
t͡ɬ,tK,vl alveolar lateral affricate
|
166 |
+
k͡p,kp,vl labial-velar plosive
|
167 |
+
g͡b,gb,vd labial-velar plosive
|
168 |
+
ŋ͡m,Nm,labial-velar nasal stop
|
169 |
+
ʈ͡ʂ,ts`,vl retroflex affricate
|
170 |
+
ɖ͡ʐ,dz`,vd retroflex affricate
|
171 |
+
˩,_B,extra low tone
|
172 |
+
˨,_L,low tone
|
173 |
+
˧,_M,mid tone
|
174 |
+
˦,_H,high tone
|
175 |
+
˥,_T,extra high tone
|
epitran/data/map/rhg-lroh.csv
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Orth,Phon
|
2 |
+
b,b
|
3 |
+
d,d
|
4 |
+
ḍ,ɖ
|
5 |
+
f,f
|
6 |
+
g,g
|
7 |
+
h,h
|
8 |
+
j,d͡ʒ
|
9 |
+
k,k
|
10 |
+
l,l
|
11 |
+
m,m
|
12 |
+
n,n
|
13 |
+
p,p
|
14 |
+
r,ɾ
|
15 |
+
ṛ,ɽ
|
16 |
+
s,s
|
17 |
+
š,ʃ
|
18 |
+
t,t
|
19 |
+
ṭ,ʈ
|
20 |
+
v,v
|
21 |
+
w,w
|
22 |
+
y,j
|
23 |
+
z,z
|
24 |
+
ã,ɑ̃
|
25 |
+
a,ɑ
|
26 |
+
ẽ,ẽ
|
27 |
+
e,e
|
28 |
+
ĩ,ĩ
|
29 |
+
i,i
|
30 |
+
õ,ɔ̃
|
31 |
+
o,ɔ
|
32 |
+
ũ,ũ
|
33 |
+
u,u
|
epitran/data/map/rhg-roheng.csv
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Orth,Phon
|
2 |
+
b,b
|
3 |
+
c,ʃ
|
4 |
+
ç,ɽ
|
5 |
+
d,d
|
6 |
+
f,f
|
7 |
+
g,g
|
8 |
+
h,h
|
9 |
+
j,d͡ʒ
|
10 |
+
k,k
|
11 |
+
l,l
|
12 |
+
m,m
|
13 |
+
n,n
|
14 |
+
p,p
|
15 |
+
q,q
|
16 |
+
r,ɾ
|
17 |
+
s,s
|
18 |
+
t,t
|
19 |
+
v,v
|
20 |
+
w,w
|
21 |
+
x,ks
|
22 |
+
y,j
|
23 |
+
z,z
|
24 |
+
dh,ɖ
|
25 |
+
th,ʈ
|
26 |
+
a,ɑ
|
27 |
+
añ,ɑ̃
|
28 |
+
e,e
|
29 |
+
eñ,ẽ
|
30 |
+
i,i
|
31 |
+
iñ,ĩ
|
32 |
+
o,ɔ
|
33 |
+
oñ,ɔ̃
|
34 |
+
u,u
|
35 |
+
uñ,ũ
|
epitran/data/post/rhg-lroh.txt
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
ɑ̃ɑ -> ɑ̃ː / _
|
2 |
+
ɑɑ̃ -> ɑ̃ː / _
|
3 |
+
ɑɑ -> ɑː / _
|
4 |
+
|
5 |
+
ẽe -> ẽː / _
|
6 |
+
eẽ -> ẽː / _
|
7 |
+
ee -> eː / _
|
8 |
+
|
9 |
+
ĩi -> ĩː / _
|
10 |
+
iĩ -> ĩː / _
|
11 |
+
ii -> iː / _
|
12 |
+
|
13 |
+
ɔ̃ɔ -> ɔ̃ː / _
|
14 |
+
ɔɔ̃ -> ɔ̃ː / _
|
15 |
+
ɔɔ -> ɔː / _
|
16 |
+
|
17 |
+
ũu -> ũː / _
|
18 |
+
uũ -> ũː / _
|
19 |
+
uu -> uː / _
|
epitran/data/post/rhg-roheng.txt
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
ɑɑ̃ -> ɑ̃ː / _
|
2 |
+
ɑɑ -> ɑː / _
|
3 |
+
|
4 |
+
eẽ -> ẽː / _
|
5 |
+
ee -> eː / _
|
6 |
+
|
7 |
+
iĩ -> ĩː / _
|
8 |
+
ii -> iː / _
|
9 |
+
|
10 |
+
ɔɔ̃ -> ɔ̃ː / _
|
11 |
+
oo -> ɔː / _
|
12 |
+
|
13 |
+
uũ -> ũː / _
|
14 |
+
uu -> uː / _
|
epitran/data/pre/rhg-lroh.txt
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
::vowel:: = a|ã|e|ẽ|i|ĩ|o|õ|u|ũ
|
2 |
+
::consonant:: = b|d|ḍ|f|g|h|j|k|l|m|n|p|r|ṛ|s|š|t|ṭ|v|w|y|z
|
3 |
+
|
4 |
+
% remove stress marks
|
5 |
+
á -> a / _
|
6 |
+
é -> e / _
|
7 |
+
í -> i / _
|
8 |
+
ó -> o / _
|
9 |
+
ú -> u / _
|
10 |
+
|
11 |
+
% vowel glides
|
12 |
+
w -> 0 / (u|ũ) _ (a|o|e)
|
13 |
+
y -> 0 / (i|ĩ) _ (a|e|o|u)
|
14 |
+
|
15 |
+
% long vowels
|
16 |
+
|
17 |
+
% gemination
|
epitran/data/pre/rhg-roheng.txt
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
::vowel:: = a|e|i|o|u
|
2 |
+
::consonant:: = b|c|ç|d|f|g|h|j|k|l|m|n|p|q|r|s|t|v|w|x|y|z
|
3 |
+
|
4 |
+
% remove stress marks
|
5 |
+
á -> a / _
|
6 |
+
é -> e / _
|
7 |
+
í -> i / _
|
8 |
+
ó -> o / _
|
9 |
+
ú -> u / _
|
10 |
+
|
11 |
+
% vowel glides
|
12 |
+
w -> 0 / (u|uñ) _ (a|o|e)
|
13 |
+
y -> 0 / (i|iñ) _ (a|e|o|u)
|
epitran/data/puncnorm.csv
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Punctuation,NormalizedForm
|
2 |
+
‘,'
|
3 |
+
’,'
|
4 |
+
ʼ,'
|
5 |
+
ʻ,'
|
6 |
+
”,""""
|
7 |
+
“,""""
|
8 |
+
。,.
|
9 |
+
,,","
|