Update preprocess.py
Browse files- preprocess.py +30 -137
preprocess.py
CHANGED
@@ -9,103 +9,10 @@ from TTS.config import load_config
|
|
9 |
from TTS.utils.manage import ModelManager
|
10 |
from TTS.utils.synthesizer import Synthesizer
|
11 |
|
12 |
-
|
13 |
-
PUNCLIST = [';', '?', '¿', ',', ':', '.', '!', '¡']
|
14 |
-
|
15 |
-
|
16 |
-
def canBeNumber(n):
    """Report whether ``n`` can be converted with ``int()``.

    Args:
        n: Value to probe; callers in this file pass single characters.

    Returns:
        True when ``int(n)`` succeeds, False otherwise.
    """
    try:
        int(n)
    except (TypeError, ValueError):
        # ValueError: non-numeric text; TypeError: None or another object
        # that int() cannot accept at all. Both simply mean "not a number".
        return False
    return True
|
23 |
-
|
24 |
-
def accent_convert(phontrans):
    """Turn circumflex-marked vowels (e.g. ``a^``) into acute-accented vowels.

    Args:
        phontrans: Transcription string that marks stressed vowels with a
            trailing ``^``.

    Returns:
        The string with ``a^ e^ i^ o^ u^ E^ O^`` replaced by
        ``á é í ó ú É Ó``; any other text is left untouched.
    """
    # The substitutions are literal two-character sequences, so plain
    # str.replace is enough and avoids the invalid '\^' escape that the
    # non-raw regex patterns relied on.
    replacements = (
        ("a^", "á"),
        ("e^", "é"),
        ("i^", "í"),
        ("o^", "ó"),
        ("u^", "ú"),
        ("E^", "É"),
        ("O^", "Ó"),
    )
    transcript = phontrans
    for marked, accented in replacements:
        transcript = transcript.replace(marked, accented)
    return transcript
|
33 |
-
|
34 |
-
def remove_tra3_tags(phontrans):
    """Strip ``#...#`` and ``%...%`` tags and hyphens from a transcription line.

    Args:
        phontrans: One line of transcription text.

    Returns:
        The line without ``#...#`` / ``%...%`` spans and without ``-``
        characters, with runs of spaces collapsed to one and surrounding
        whitespace stripped.
    """
    s = re.sub(r'#(.+?)#', '', phontrans)
    s = re.sub(r'%(.+?)%', '', s)
    # Drop hyphens BEFORE collapsing spaces: removing a free-standing '-'
    # afterwards would leave two adjacent spaces in the result.
    s = s.replace('-', '')
    s = re.sub(r' +', ' ', s)
    return s.strip()
|
40 |
-
|
41 |
def sanitize_filename(filename):
    """Keep only alphanumerics, spaces, '_' and '-'; strip trailing whitespace."""
    extra_allowed = (' ', '_', '-')
    kept_chars = [ch for ch in filename if ch.isalnum() or ch in extra_allowed]
    cleaned = ''.join(kept_chars)
    return cleaned.rstrip()
|
44 |
|
45 |
-
def is_number(index, text):
    """Tell whether the character at ``index`` sits between two numeric characters.

    Returns False for the first and last positions of ``text``; otherwise
    returns True only when ``canBeNumber`` accepts both the preceding and the
    following character.
    """
    last_position = len(text) - 1
    if index == 0 or index == last_position:
        return False
    return canBeNumber(text[index - 1]) and canBeNumber(text[index + 1])
|
52 |
-
|
53 |
-
def split_punc(text):
    """Split ``text`` at punctuation marks listed in ``PUNCLIST``.

    A punctuation character that ``is_number`` reports as sitting between two
    numeric characters is treated as ordinary text. Consecutive punctuation
    characters are accumulated into the same entry of the punctuation list.

    Returns:
        A ``(segments, puncs)`` tuple: the non-empty, stripped text segments
        and the list of punctuation strings found between them.
    """
    pieces = []
    marks = []
    buffer = ""
    after_punc = False
    for pos, char in enumerate(text):
        if char in PUNCLIST and after_punc:
            # Continuation of a punctuation run: extend the last mark.
            buffer += char
            marks[-1] += char
        elif char in PUNCLIST and not is_number(pos, text):
            # First punctuation after text: close the current segment.
            buffer += char
            pieces.append(buffer.strip())
            marks.append(char)
            buffer = ""
            after_punc = True
        else:
            buffer += char
            after_punc = False

    pieces.append(buffer.strip())

    # Discard empty segments.
    non_empty = [piece for piece in pieces if piece]

    return non_empty, marks
|
87 |
-
|
88 |
-
def merge_punc(text_segs, puncs):
    """Interleave text segments with their punctuation and tidy the spacing.

    Args:
        text_segs: Sequence of text segments.
        puncs: Sequence of punctuation strings; ``puncs[i]`` is placed after
            ``text_segs[i]``. Entries beyond ``len(text_segs)`` are ignored.

    Returns:
        The merged string, with whitespace removed before ``. , ! ? ; : ) ]``
        and after ``( [ ¡ ¿``, and with surrounding whitespace stripped.
    """
    # Collect the pieces and join once instead of growing a string with +=.
    parts = []
    for i, seg in enumerate(text_segs):
        parts.append(seg + " ")
        if i < len(puncs):
            parts.append(puncs[i] + " ")
    merged_str = "".join(parts)

    # Remove spaces before , . ! ? ; : ) ] in the merged string.
    merged_str = re.sub(r"\s+([.,!?;:)\]])", r"\1", merged_str)

    # Remove spaces after ( [ ¡ ¿ in the merged string.
    merged_str = re.sub(r"([\(\[¡¿])\s+", r"\1", merged_str)

    return merged_str.strip()
|
107 |
-
|
108 |
-
|
109 |
# función que engade a puntuación orixinal á extensión de números de cotovía (opción p)
|
110 |
def punctuate_p(str_ext):
|
111 |
|
@@ -146,82 +53,68 @@ def punctuate_p(str_ext):
|
|
146 |
return str_ext
|
147 |
|
148 |
|
149 |
-
def to_cotovia(
|
150 |
-
|
151 |
-
res = ''.join(random.choices(string.ascii_lowercase + string.digits, k=5))
|
152 |
-
COTOVIA_IN_TXT_PATH = res + '.txt'
|
153 |
-
COTOVIA_IN_TXT_PATH_ISO = 'iso8859-1' + res + '.txt'
|
154 |
-
COTOVIA_OUT_PRE_PATH = 'iso8859-1' + res + '.tra'
|
155 |
-
COTOVIA_OUT_PRE_PATH_UTF8 = 'utf8' + res + '.tra'
|
156 |
-
|
157 |
-
|
158 |
-
# print("Text segments: ", text_segments)
|
159 |
-
# Initial text preprocessing
|
160 |
# substitute ' M€' by 'millóns de euros' and 'somewordM€' by 'someword millóns de euros'
|
161 |
-
|
162 |
|
163 |
# substitute ' €' by 'euros' and 'someword€' by 'someword euros'
|
164 |
-
|
165 |
-
|
166 |
# substitute ' ºC' by 'graos centígrados' and 'somewordºC' by 'someword graos centígrados'
|
167 |
-
|
168 |
|
|
|
|
|
169 |
|
170 |
-
|
171 |
"-e", "s/ş/s/g", "-e", "s/Ž/Z/g", "-e", "s/ž/z/g", "-e", "s/ț/t/g", "-e", "s/ğ/g/g", "-e", "s/ș/s/g", "-e", "s/ş/s/g", "-e", "s/«//g", "-e", "s/»//g",
|
172 |
"-e", "s/<<//g", "-e", "s/>>//g", "-e", "s/“/\"/g", "-e", "s/”/'\"'/g", "-e", "s/\'//g", "-e", "s/‘//g", "-e", "s/’//g", "-e", "s/…//g",
|
173 |
-
"-e", "s/-/-/g", "-e", "s/–/-/g", "-e", "s/—/-/g", "-e", "s/―/-/g", "-e", "s/−/-/g", "-e", "s/‒/-/g", "-e", "s/─/-/g"
|
174 |
-
input=
|
175 |
-
|
176 |
-
#
|
|
|
|
|
|
|
|
|
177 |
|
178 |
with open(COTOVIA_IN_TXT_PATH, 'w') as f:
|
179 |
-
|
180 |
-
|
181 |
-
f.write(seg + '\n')
|
182 |
-
else:
|
183 |
-
f.write(',' + '\n')
|
184 |
|
185 |
# utf-8 to iso8859-1
|
186 |
subprocess.run(["iconv", "-f", "utf-8", "-t", "iso8859-1", COTOVIA_IN_TXT_PATH, "-o", COTOVIA_IN_TXT_PATH_ISO], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
|
187 |
-
|
188 |
-
subprocess.run(["cotovia", "-i", COTOVIA_IN_TXT_PATH_ISO, "-t3", "-n"], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
|
189 |
-
# iso8859-1 to utf-8
|
190 |
subprocess.run(["iconv", "-f", "iso8859-1", "-t", "utf-8", COTOVIA_OUT_PRE_PATH, "-o", COTOVIA_OUT_PRE_PATH_UTF8], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
|
191 |
|
192 |
segs = []
|
193 |
try:
|
194 |
with open(COTOVIA_OUT_PRE_PATH_UTF8, 'r') as f:
|
195 |
segs = [line.rstrip() for line in f]
|
196 |
-
segs = [remove_tra3_tags(line) for line in segs]
|
|
|
197 |
except:
|
198 |
print("ERROR: Couldn't read cotovia output")
|
199 |
|
200 |
subprocess.run(["rm", COTOVIA_IN_TXT_PATH, COTOVIA_IN_TXT_PATH_ISO, COTOVIA_OUT_PRE_PATH, COTOVIA_OUT_PRE_PATH_UTF8], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
|
201 |
|
202 |
-
# print("Cotovia segments: ", segs)
|
203 |
-
|
204 |
return segs
|
205 |
|
206 |
def text_preprocess(text):
|
207 |
|
208 |
-
|
209 |
-
text_segments, puncs = split_punc(text)
|
210 |
-
|
211 |
-
cotovia_phon_segs = to_cotovia(text_segments)
|
212 |
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
|
217 |
-
#
|
218 |
-
|
|
|
219 |
|
220 |
-
|
221 |
-
if not re.match(r"[.!?]", phon_str[-1]):
|
222 |
-
phon_str = phon_str + "."
|
223 |
|
224 |
-
return phon_str
|
225 |
|
226 |
def main():
|
227 |
parser = argparse.ArgumentParser(description='Cotovia phoneme transcription.')
|
|
|
9 |
from TTS.utils.manage import ModelManager
|
10 |
from TTS.utils.synthesizer import Synthesizer
|
11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
def sanitize_filename(filename):
    """Keep only alphanumerics, spaces, '_' and '-'; strip trailing whitespace."""
    extra_allowed = (' ', '_', '-')
    kept_chars = [ch for ch in filename if ch.isalnum() or ch in extra_allowed]
    cleaned = ''.join(kept_chars)
    return cleaned.rstrip()
|
15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
# función que engade a puntuación orixinal á extensión de números de cotovía (opción p)
|
17 |
def punctuate_p(str_ext):
|
18 |
|
|
|
53 |
return str_ext
|
54 |
|
55 |
|
56 |
+
def to_cotovia(text):
|
57 |
+
## Initial text preprocessing
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
# substitute ' M€' by 'millóns de euros' and 'somewordM€' by 'someword millóns de euros'
|
59 |
+
text = re.sub(r"(\w+)\s*M€", r"\1 millóns de euros", text)
|
60 |
|
61 |
# substitute ' €' by 'euros' and 'someword€' by 'someword euros'
|
62 |
+
text = re.sub(r"(\w+)\s*€", r"\1 euros", text)
|
63 |
+
|
64 |
# substitute ' ºC' by 'graos centígrados' and 'somewordºC' by 'someword graos centígrados'
|
65 |
+
text = re.sub(r"(\w+)\s*ºC", r"\1 graos centígrados", text)
|
66 |
|
67 |
+
# Random string generation
|
68 |
+
res = ''.join(random.choices(string.ascii_lowercase + string.digits, k=5))
|
69 |
|
70 |
+
text = subprocess.run(["sed", "-e", "s/₂//g", "-e", "s/⸺//g", "-e", "s/ //g", "-e", "s///g", "-e", "s/č/c/g", "-e", "s/ț/t/g", "-e", "s/ğ/g/g", "-e", "s/ș/s/g",
|
71 |
"-e", "s/ş/s/g", "-e", "s/Ž/Z/g", "-e", "s/ž/z/g", "-e", "s/ț/t/g", "-e", "s/ğ/g/g", "-e", "s/ș/s/g", "-e", "s/ş/s/g", "-e", "s/«//g", "-e", "s/»//g",
|
72 |
"-e", "s/<<//g", "-e", "s/>>//g", "-e", "s/“/\"/g", "-e", "s/”/'\"'/g", "-e", "s/\'//g", "-e", "s/‘//g", "-e", "s/’//g", "-e", "s/…//g",
|
73 |
+
"-e", "s/-/-/g", "-e", "s/–/-/g", "-e", "s/—/-/g", "-e", "s/―/-/g", "-e", "s/−/-/g", "-e", "s/‒/-/g", "-e", "s/─/-/g"],
|
74 |
+
input = text, text = True, capture_output=True).stdout
|
75 |
+
|
76 |
+
# Input and output Cotovía files
|
77 |
+
COTOVIA_IN_TXT_PATH = res + '.txt'
|
78 |
+
COTOVIA_IN_TXT_PATH_ISO = 'iso8859-1' + res + '.txt'
|
79 |
+
COTOVIA_OUT_PRE_PATH = 'iso8859-1' + res + '.pre'
|
80 |
+
COTOVIA_OUT_PRE_PATH_UTF8 = 'utf8' + res + '.pre'
|
81 |
|
82 |
with open(COTOVIA_IN_TXT_PATH, 'w') as f:
|
83 |
+
f.write(text + '\n')
|
84 |
+
|
|
|
|
|
|
|
85 |
|
86 |
# utf-8 to iso8859-1
|
87 |
subprocess.run(["iconv", "-f", "utf-8", "-t", "iso8859-1", COTOVIA_IN_TXT_PATH, "-o", COTOVIA_IN_TXT_PATH_ISO], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
|
88 |
+
subprocess.run(["cotovia", "-i", COTOVIA_IN_TXT_PATH_ISO, "-p"], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
|
|
|
|
|
89 |
subprocess.run(["iconv", "-f", "iso8859-1", "-t", "utf-8", COTOVIA_OUT_PRE_PATH, "-o", COTOVIA_OUT_PRE_PATH_UTF8], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
|
90 |
|
91 |
segs = []
|
92 |
try:
|
93 |
with open(COTOVIA_OUT_PRE_PATH_UTF8, 'r') as f:
|
94 |
segs = [line.rstrip() for line in f]
|
95 |
+
# segs = [remove_tra3_tags(line) for line in segs] # modificar con punctuate_p
|
96 |
+
segs = [punctuate_p(line) for line in segs] # modificar con punctuate_p
|
97 |
except:
|
98 |
print("ERROR: Couldn't read cotovia output")
|
99 |
|
100 |
subprocess.run(["rm", COTOVIA_IN_TXT_PATH, COTOVIA_IN_TXT_PATH_ISO, COTOVIA_OUT_PRE_PATH, COTOVIA_OUT_PRE_PATH_UTF8], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
|
101 |
|
|
|
|
|
102 |
return segs
|
103 |
|
104 |
def text_preprocess(text):
    """Run ``text`` through ``to_cotovia`` and return it as one punctuated string.

    Args:
        text: Raw input text.

    Returns:
        The lines returned by ``to_cotovia`` joined with single spaces. A
        final ``.`` is appended when the result does not already end in an
        ASCII punctuation character. An empty result is returned unchanged.
    """
    cotovia_preproc_text = to_cotovia(text)

    # to_cotovia returns a list of lines; convert it to a single string.
    cotovia_preproc_text_res = ' '.join(cotovia_preproc_text)

    # Add final punctuation if missing. The emptiness check matters because
    # to_cotovia returns an empty list when the Cotovía output cannot be read,
    # and indexing [-1] on '' would raise IndexError.
    if cotovia_preproc_text_res and cotovia_preproc_text_res[-1] not in string.punctuation:
        cotovia_preproc_text_res += '.'

    return cotovia_preproc_text_res
|
|
|
|
|
117 |
|
|
|
118 |
|
119 |
def main():
|
120 |
parser = argparse.ArgumentParser(description='Cotovia phoneme transcription.')
|