ORI-Muchim committed
Commit bfc486e
1 Parent(s): 3388262

Upload 4 files

Files changed (4)
  1. text/LICENSE +19 -0
  2. text/__init__.py +32 -0
  3. text/cleaners.py +17 -0
  4. text/japanese.py +132 -0
text/LICENSE ADDED
@@ -0,0 +1,19 @@
+ Copyright (c) 2017 Keith Ito
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
text/__init__.py ADDED
@@ -0,0 +1,32 @@
+ """ from https://github.com/keithito/tacotron """
+ from text import cleaners
+
+
+ def text_to_sequence(text, symbols, cleaner_names):
+     '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
+     Args:
+       text: string to convert to a sequence
+       cleaner_names: names of the cleaner functions to run the text through
+     Returns:
+       List of integers corresponding to the symbols in the text
+     '''
+     _symbol_to_id = {s: i for i, s in enumerate(symbols)}
+
+     sequence = []
+
+     clean_text = _clean_text(text, cleaner_names)
+     for symbol in clean_text:
+         if symbol not in _symbol_to_id:
+             continue
+         symbol_id = _symbol_to_id[symbol]
+         sequence += [symbol_id]
+     return sequence
+
+
+ def _clean_text(text, cleaner_names):
+     for name in cleaner_names:
+         cleaner = getattr(cleaners, name, None)
+         if not cleaner:
+             raise Exception('Unknown cleaner: %s' % name)
+         text = cleaner(text)
+     return text
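
For reference, a minimal usage sketch of this module (not part of the commit). Note that text_to_sequence also takes a `symbols` argument, used to build the symbol-to-ID mapping; the list below is a made-up placeholder, the real one ships with the model config, and running the Japanese cleaner requires pyopenjtalk to be installed:

from text import text_to_sequence

# Hypothetical symbol set, for illustration only.
symbols = list('_ .,!?-abcdefghijklmnopqrstuvwxyzʃʧʦɯɸçɾ↓↑')
ids = text_to_sequence('こんにちは。', symbols, ['japanese_cleaners'])
print(ids)  # list of integer IDs; characters outside `symbols` are skipped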
text/cleaners.py ADDED
@@ -0,0 +1,17 @@
+ import re
+
+ def japanese_cleaners(text):
+     from text.japanese import japanese_to_romaji_with_accent
+     text = japanese_to_romaji_with_accent(text)
+     if len(text) == 0 or re.match('[A-Za-z]', text[-1]):
+         text += '.'
+     return text
+
+
+ def japanese_cleaners2(text):
+     text = text.replace('・・・', '…').replace('・', ' ')
+     text = japanese_cleaners(text).replace('ts', 'ʦ').replace('...', '…') \
+         .replace('(', '').replace(')', '') \
+         .replace('[', '').replace(']', '') \
+         .replace('*', ' ').replace('{', '').replace('}', '')
+     return text
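
As a rough illustration of what the two cleaners produce (assuming pyopenjtalk and unidecode are installed; the exact romaji and accent marks depend on the pyopenjtalk dictionary):

from text.cleaners import japanese_cleaners, japanese_cleaners2

print(japanese_cleaners('こんにちは'))
# romaji with ↑/↓ pitch-accent marks, with '.' appended because the result ends in a letter
print(japanese_cleaners2('そうか・・・'))
# same pipeline, but 'ts' becomes 'ʦ', '・・・' ends up as '…', and brackets are stripped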
text/japanese.py ADDED
@@ -0,0 +1,132 @@
+ import re
+ from unidecode import unidecode
+ import pyopenjtalk
+
+
+ # Regular expression matching Japanese without punctuation marks:
+ _japanese_characters = re.compile(
+     r'[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
+
+ # Regular expression matching non-Japanese characters or punctuation marks:
+ _japanese_marks = re.compile(
+     r'[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
+
+ # List of (symbol, Japanese) pairs for marks:
+ _symbols_to_japanese = [(re.compile('%s' % x[0]), x[1]) for x in [
+     ('%', 'パーセント')
+ ]]
+
+ # List of (romaji, ipa) pairs for marks:
+ _romaji_to_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
+     ('ts', 'ʦ'),
+     ('u', 'ɯ'),
+     ('...', '…'),
+     ('j', 'ʥ'),
+     ('y', 'j'),
+     ('ni', 'n^i'),
+     ('nj', 'n^'),
+     ('hi', 'çi'),
+     ('hj', 'ç'),
+     ('f', 'ɸ'),
+     ('I', 'i*'),
+     ('U', 'ɯ*'),
+     ('r', 'ɾ')
+ ]]
+
+ # Dictionary of (consonant, sokuon) pairs:
+ _real_sokuon = {
+     'k': 'k#',
+     'g': 'k#',
+     't': 't#',
+     'd': 't#',
+     'ʦ': 't#',
+     'ʧ': 't#',
+     'ʥ': 't#',
+     'j': 't#',
+     's': 's',
+     'ʃ': 's',
+     'p': 'p#',
+     'b': 'p#'
+ }
+
+ # Dictionary of (consonant, hatsuon) pairs:
+ _real_hatsuon = {
+     'p': 'm',
+     'b': 'm',
+     'm': 'm',
+     't': 'n',
+     'd': 'n',
+     'n': 'n',
+     'ʧ': 'n^',
+     'ʥ': 'n^',
+     'k': 'ŋ',
+     'g': 'ŋ'
+ }
+
+
+ def symbols_to_japanese(text):
+     for regex, replacement in _symbols_to_japanese:
+         text = re.sub(regex, replacement, text)
+     return text
+
+
+ def japanese_to_romaji_with_accent(text):
+     '''Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html'''
+     text = symbols_to_japanese(text)
+     sentences = re.split(_japanese_marks, text)
+     marks = re.findall(_japanese_marks, text)
+     text = ''
+     for i, sentence in enumerate(sentences):
+         if re.match(_japanese_characters, sentence):
+             if text != '':
+                 text += ' '
+             labels = pyopenjtalk.extract_fullcontext(sentence)
+             for n, label in enumerate(labels):
+                 phoneme = re.search(r'\-([^\+]*)\+', label).group(1)
+                 if phoneme not in ['sil', 'pau']:
+                     text += phoneme.replace('ch', 'ʧ') \
+                         .replace('sh', 'ʃ').replace('cl', 'Q')
+                 else:
+                     continue
+                 # n_moras = int(re.search(r'/F:(\d+)_', label).group(1))
+                 a1 = int(re.search(r"/A:(\-?[0-9]+)\+", label).group(1))
+                 a2 = int(re.search(r"\+(\d+)\+", label).group(1))
+                 a3 = int(re.search(r"\+(\d+)/", label).group(1))
+                 if re.search(r'\-([^\+]*)\+', labels[n + 1]).group(1) in ['sil', 'pau']:
+                     a2_next = -1
+                 else:
+                     a2_next = int(
+                         re.search(r"\+(\d+)\+", labels[n + 1]).group(1))
+                 # Accent phrase boundary
+                 if a3 == 1 and a2_next == 1:
+                     text += ' '
+                 # Falling
+                 elif a1 == 0 and a2_next == a2 + 1:
+                     text += '↓'
+                 # Rising
+                 elif a2 == 1 and a2_next == 2:
+                     text += '↑'
+         if i < len(marks):
+             text += unidecode(marks[i]).replace(' ', '')
+     return text
+
+
+ def get_real_sokuon(text):
+     text = re.sub('Q[↑↓]*(.)', lambda x: _real_sokuon[x.group(1)] + x.group(0)[1:] if x.group(1) in _real_sokuon else x.group(0), text)
+     return text
+
+
+ def get_real_hatsuon(text):
+     text = re.sub('N[↑↓]*(.)', lambda x: _real_hatsuon[x.group(1)] + x.group(0)[1:] if x.group(1) in _real_hatsuon else x.group(0), text)
+     return text
+
+
+ def japanese_to_ipa(text):
+     text = japanese_to_romaji_with_accent(text)
+     for regex, replacement in _romaji_to_ipa:
+         text = re.sub(regex, replacement, text)
+     text = re.sub(
+         r'([A-Za-zɯ])\1+', lambda x: x.group(0)[0] + 'ː' * (len(x.group(0)) - 1), text)
+     text = get_real_sokuon(text)
+     text = get_real_hatsuon(text)
+     return text
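
For completeness, a small end-to-end sketch of the conversion pipeline in this file (not part of the commit; the exact output string depends on the installed pyopenjtalk dictionary):

from text.japanese import japanese_to_romaji_with_accent, japanese_to_ipa

text = '今日は50%の確率で雨です。'
print(japanese_to_romaji_with_accent(text))
# '%' is first rewritten to 'パーセント', then pyopenjtalk full-context labels are
# converted to romaji with ↑/↓ pitch marks and spaces at accent-phrase boundaries
print(japanese_to_ipa(text))
# the romaji is further mapped to IPA-like symbols, repeated vowels get 'ː',
# and sokuon 'Q' / hatsuon 'N' are resolved against the following consonant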