ulysses115 committed
Commit
d8363fa
1 Parent(s): b78adcc

Upload text/ with huggingface_hub

text/LICENSE ADDED
@@ -0,0 +1,19 @@
+ Copyright (c) 2017 Keith Ito
+ 
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+ 
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
text/__init__.py ADDED
@@ -0,0 +1,66 @@
+ """ from https://github.com/keithito/tacotron """
+ from text import cleaners
+ from text.symbols import symbols, symbols_zh
+ 
+ 
+ # Mappings from symbol to numeric ID and vice versa:
+ # _symbol_to_id = {s: i for i, s in enumerate(symbols)}
+ # _id_to_symbol = {i: s for i, s in enumerate(symbols)}
+ 
+ chinese_mode = True
+ if chinese_mode:
+     _symbol_to_id = {s: i for i, s in enumerate(symbols_zh)}
+     _id_to_symbol = {i: s for i, s in enumerate(symbols_zh)}
+ else:
+     _symbol_to_id = {s: i for i, s in enumerate(symbols)}
+     _id_to_symbol = {i: s for i, s in enumerate(symbols)}
+ 
+ 
+ def text_to_sequence(text, cleaner_names):
+     '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
+     Args:
+         text: string to convert to a sequence
+         cleaner_names: names of the cleaner functions to run the text through
+     Returns:
+         List of integers corresponding to the symbols in the text
+     '''
+     sequence = []
+ 
+     clean_text = _clean_text(text, cleaner_names)
+     for symbol in clean_text:
+         if symbol not in _symbol_to_id:
+             continue  # skip symbols that are not in the symbol table
+         symbol_id = _symbol_to_id[symbol]
+         sequence += [symbol_id]
+     return sequence
+ 
+ 
+ def cleaned_text_to_sequence(cleaned_text, chinese_mode=True):
+     '''Converts a string of already-cleaned text to a sequence of symbol IDs.
+     Args:
+         cleaned_text: string to convert to a sequence
+     Returns:
+         List of integers corresponding to the symbols in the text
+     '''
+     # if chinese_mode:
+     #     sequence = [_symbol_to_id_zh[symbol] for symbol in cleaned_text]
+     # else:
+     sequence = [_symbol_to_id[symbol] for symbol in cleaned_text]
+     return sequence
+ 
+ 
+ def sequence_to_text(sequence):
+     '''Converts a sequence of IDs back to a string'''
+     result = ''
+     for symbol_id in sequence:
+         s = _id_to_symbol[symbol_id]
+         result += s
+     return result
+ 
+ 
+ def _clean_text(text, cleaner_names):
+     for name in cleaner_names:
+         cleaner = getattr(cleaners, name)
+         if not cleaner:
+             raise Exception('Unknown cleaner: %s' % name)
+         text = cleaner(text)
+     return text
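For orientation, here is a minimal usage sketch of this module (not part of the commit). It assumes the `text` package above is importable and that pypinyin, phonemizer, and unidecode are installed; since `chinese_mode = True` is hard-coded, the active symbol table is `symbols_zh`, so a Chinese cleaner such as `chinese_cleaners1` from text/cleaners.py is the natural choice.

    # Usage sketch: convert raw text to symbol IDs and back.
    from text import text_to_sequence, sequence_to_text

    ids = text_to_sequence('你好,世界。', ['chinese_cleaners1'])  # cleaner names are resolved via getattr in _clean_text
    print(ids)                    # list of integer IDs into symbols_zh; unknown characters are silently skipped
    print(sequence_to_text(ids))  # reconstructs the cleaned pinyin string, not the original Chinese input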
text/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (2.42 kB).
 
text/__pycache__/cleaners.cpython-38.pyc ADDED
Binary file (3.82 kB).
 
text/__pycache__/symbols.cpython-38.pyc ADDED
Binary file (831 Bytes).
 
text/cleaners.py ADDED
@@ -0,0 +1,138 @@
+ """ from https://github.com/keithito/tacotron """
+ 
+ '''
+ Cleaners are transformations that run over the input text at both training and eval time.
+ 
+ Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
+ hyperparameter. Some cleaners are English-specific. You'll typically want to use:
+   1. "english_cleaners" for English text
+   2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
+      the Unidecode library (https://pypi.python.org/pypi/Unidecode)
+   3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
+      the symbols in symbols.py to match your data).
+ '''
+ 
+ import re
+ from unidecode import unidecode
+ from phonemizer import phonemize
+ from pypinyin import Style, pinyin
+ from pypinyin.style._utils import get_finals, get_initials
+ 
+ # Regular expression matching whitespace:
+ _whitespace_re = re.compile(r'\s+')
+ 
+ # List of (regular expression, replacement) pairs for abbreviations:
+ _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
+     ('mrs', 'misess'),
+     ('mr', 'mister'),
+     ('dr', 'doctor'),
+     ('st', 'saint'),
+     ('co', 'company'),
+     ('jr', 'junior'),
+     ('maj', 'major'),
+     ('gen', 'general'),
+     ('drs', 'doctors'),
+     ('rev', 'reverend'),
+     ('lt', 'lieutenant'),
+     ('hon', 'honorable'),
+     ('sgt', 'sergeant'),
+     ('capt', 'captain'),
+     ('esq', 'esquire'),
+     ('ltd', 'limited'),
+     ('col', 'colonel'),
+     ('ft', 'fort'),
+ ]]
+ 
+ 
+ def expand_abbreviations(text):
+     for regex, replacement in _abbreviations:
+         text = re.sub(regex, replacement, text)
+     return text
+ 
+ 
+ def expand_numbers(text):
+     # NOTE: normalize_numbers is not defined or imported in this file; in upstream
+     # keithito/tacotron it comes from a separate numbers module that is not part of this upload.
+     return normalize_numbers(text)
+ 
+ 
+ def lowercase(text):
+     return text.lower()
+ 
+ 
+ def collapse_whitespace(text):
+     return re.sub(_whitespace_re, ' ', text)
+ 
+ 
+ def convert_to_ascii(text):
+     return unidecode(text)
+ 
+ 
+ def basic_cleaners(text):
+     '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
+     text = lowercase(text)
+     text = collapse_whitespace(text)
+     return text
+ 
+ 
+ def transliteration_cleaners(text):
+     '''Pipeline for non-English text that transliterates to ASCII.'''
+     text = convert_to_ascii(text)
+     text = lowercase(text)
+     text = collapse_whitespace(text)
+     return text
+ 
+ 
+ def english_cleaners(text):
+     '''Pipeline for English text, including abbreviation expansion.'''
+     text = convert_to_ascii(text)
+     text = lowercase(text)
+     text = expand_abbreviations(text)
+     phonemes = phonemize(text, language='en-us', backend='espeak', strip=True)
+     phonemes = collapse_whitespace(phonemes)
+     return phonemes
+ 
+ 
+ def english_cleaners2(text):
+     '''Pipeline for English text, including abbreviation expansion. + punctuation + stress'''
+     text = convert_to_ascii(text)
+     text = lowercase(text)
+     text = expand_abbreviations(text)
+     phonemes = phonemize(text, language='en-us', backend='espeak', strip=True, preserve_punctuation=True, with_stress=True)
+     phonemes = collapse_whitespace(phonemes)
+     return phonemes
+ 
+ 
+ def chinese_cleaners1(text):
+     '''Converts Chinese text to space-separated pinyin syllables with tone numbers.'''
+     phones = [phone[0] for phone in pinyin(text, style=Style.TONE3)]
+     return ' '.join(phones)
+ 
+ 
+ def chinese_cleaners2(text):
+     '''Splits each pinyin syllable into initial and final, keeping the tone digit on the final.'''
+     phones = [
+         p
+         for phone in pinyin(text, style=Style.TONE3)
+         for p in [
+             get_initials(phone[0], strict=True),
+             get_finals(phone[0][:-1], strict=True) + phone[0][-1]
+             if phone[0][-1].isdigit()
+             else get_finals(phone[0], strict=True)
+             if phone[0][-1].isalnum()
+             else phone[0],
+         ]
+         # Drop empty strings and bare tone digits so a lone tone is not emitted as a phoneme.
+         if len(p) != 0 and not p.isdigit()
+     ]
+     return phones
+ 
+ 
+ if __name__ == '__main__':
+     res = chinese_cleaners2('这是语音测试!')
+     print(res)
+     res = chinese_cleaners1('"第一,南京不是发展的不行,是大家对他期望很高,')
+     print(res)
+ 
+     res = english_cleaners2('this is a club test for one train.GDP')
+     print(res)
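Note that the two Chinese cleaners return differently shaped results: chinese_cleaners1 yields a single whitespace-joined string of tone-numbered syllables, while chinese_cleaners2 yields a list of initial/final phones with the tone digit attached to the final. A short comparison sketch (not part of the commit; exact strings depend on the installed pypinyin version):

    from text.cleaners import chinese_cleaners1, chinese_cleaners2

    sample = '语音测试'
    print(chinese_cleaners1(sample))  # one space-joined string of TONE3 syllables
    print(chinese_cleaners2(sample))  # a flat list of initials/finals; empty initials and bare tone digits are filtered out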
text/symbols.py ADDED
@@ -0,0 +1,25 @@
+ """ from https://github.com/keithito/tacotron """
+ 
+ '''
+ Defines the set of symbols used in text input to the model.
+ '''
+ _pad = '_'
+ _punctuation = ';:,.!?¡¿—…"«»“” '
+ 
+ _punctuation_zh = ';:,。!?-“”《》、()BP…—~.\\·『』・ '
+ _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
+ 
+ _numbers = '1234567890'
+ _others = ''
+ 
+ _letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
+ 
+ 
+ # Export all symbols:
+ symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
+ 
+ symbols_zh = [_pad] + list(_punctuation_zh) + list(_letters) + list(_numbers)
+ 
+ # Special symbol ids
+ SPACE_ID = symbols.index(" ")
+ 
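Since text_to_sequence in text/__init__.py silently skips any character that is missing from the active symbol table, a quick coverage check can catch gaps early. A small sketch (an illustration only, not part of the commit; it assumes the `text` package is importable):

    from text.symbols import symbols_zh
    from text.cleaners import chinese_cleaners1

    cleaned = chinese_cleaners1('语音合成测试。')
    missing = sorted(set(cleaned) - set(symbols_zh))
    print(missing or 'all characters covered by symbols_zh')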