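"""Cotovía-based text preprocessing and synthesis for a Galician Coqui TTS model.

The input text is split at punctuation, each segment is expanded and phonetically
transcribed with the external `cotovia` binary (via `iconv` and `sed`), and the
resulting phoneme string is synthesized with a Coqui TTS checkpoint given on the
command line.
"""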
import argparse
import random
import re
import string
import subprocess
from TTS.utils.synthesizer import Synthesizer
PUNCLIST = [';', '?', '¿', ',', ':', '.', '!', '¡']
def canBeNumber(n):
try:
int(n)
return True
except ValueError:
# Not a number
return False
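# accent_convert(): turns Cotovía's caret accent notation (e.g. "a^") into the
# corresponding accented vowels.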
def accent_convert(phontrans):
    transcript = re.sub(r'a\^', 'á', phontrans)
    transcript = re.sub(r'e\^', 'é', transcript)
    transcript = re.sub(r'i\^', 'í', transcript)
    transcript = re.sub(r'o\^', 'ó', transcript)
    transcript = re.sub(r'u\^', 'ú', transcript)
    transcript = re.sub(r'E\^', 'É', transcript)
    transcript = re.sub(r'O\^', 'Ó', transcript)
return transcript
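# remove_tra3_tags(): strips the #...# and %...% tags from Cotovía's -t3 output
# and collapses extra spaces and hyphens, leaving only the transcription text.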
def remove_tra3_tags(phontrans):
s = re.sub(r'#(.+?)#', r'', phontrans)
s = re.sub(r'%(.+?)%', r'', s)
s = re.sub(' +',' ',s)
s = re.sub('-','',s)
return s.strip()
def sanitize_filename(filename):
"""Remove or replace any characters that are not allowed in file names."""
return ''.join(c for c in filename if c.isalnum() or c in (' ', '_', '-')).rstrip()
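# is_number(): returns True when the character at `index` sits between two
# digits (e.g. the comma in "1,5"), so decimal/thousand separators are not
# treated as sentence punctuation when splitting.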
def is_number(index, text):
if index == 0:
return False
elif index == len(text) - 1:
return False
else:
return canBeNumber(text[index - 1]) and canBeNumber(text[index + 1])
# Splits the text at punctuation marks, returning the segments in between and the
# punctuation marks themselves. Punctuation not present in training (i.e. not in
# PUNCLIST) is skipped.
def split_punc(text):
segments = []
puncs = []
curr_seg = ""
previous_punc = False
for i, c in enumerate(text):
if c in PUNCLIST and not previous_punc and not is_number(i, text):
curr_seg += c
segments.append(curr_seg.strip())
puncs.append(c)
curr_seg = ""
previous_punc = True
elif c in PUNCLIST and previous_punc:
curr_seg += c
puncs[-1] += c
else:
curr_seg += c
previous_punc = False
segments.append(curr_seg.strip())
# print("Split Segments: ", segments)
#Remove empty segments in the list
segments = filter(None, segments)
# store segments as a list
segments = list(segments)
# print("Split Segments: ", segments)
# print("Split Puncs: ", puncs)
return segments, puncs
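# Illustrative example of split_punc (the trailing mark stays in its segment and
# is also recorded separately):
#   split_punc("Ola, mundo!") -> (["Ola,", "mundo!"], [",", "!"])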
def merge_punc(text_segs, puncs):
merged_str = ""
# print("Text segs: ", text_segs)
# print("Puncs: ", puncs)
for i, seg in enumerate(text_segs):
merged_str += seg + " "
if i < len(puncs):
merged_str += puncs[i] + " "
# remove spaces before , . ! ? ; : ) ] of the merged string
merged_str = re.sub(r"\s+([.,!?;:)\]])", r"\1", merged_str)
# remove spaces after ( [ ¡ ¿ of the merged string
merged_str = re.sub(r"([\(\[¡¿])\s+", r"\1", merged_str)
# print("Merged str: ", merged_str)
return merged_str.strip()
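# Illustrative example of merge_punc with placeholder segments:
#   merge_punc(["seg1", "seg2"], [",", "."]) -> "seg1, seg2."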
# Adds the original punctuation back to Cotovía's number expansion (option p)
def punctuate_p(str_ext):
    # substitute ' ·' by '...'
str_ext = re.sub(r" ·", r"...", str_ext)
# remove spaces before , . ! ? ; : ) ] of the extended string
str_ext = re.sub(r"\s+([.,!?;:)\]])", r"\1", str_ext)
# remove spaces after ( [ ¡ ¿ of the extended string
str_ext = re.sub(r"([\(\[¡¿])\s+", r"\1", str_ext)
# remove unwanted spaces between quotations marks
str_ext = re.sub(r'"\s*([^"]*?)\s*"', r'"\1"', str_ext)
# substitute '- text -' to '-text-'
str_ext = re.sub(r"-\s*([^-]*?)\s*-", r"-\1-", str_ext)
# remove initial question marks
str_ext = re.sub(r"[¿¡]", r"", str_ext)
# eliminate extra spaces
str_ext = re.sub(r"\s+", r" ", str_ext)
str_ext = re.sub(r"(\d+)\s*-\s*(\d+)", r"\1 \2", str_ext)
### - , ' and () by commas
# substitute '- text -' to ', text,'
str_ext = re.sub(r"(\w+)\s+-([^-]*?)-\s+([^-]*?)", r"\1, \2, ", str_ext)
# substitute ' - ' by ', '
str_ext = re.sub(r"(\w+[!\?]?)\s+-\s*", r"\1, ", str_ext)
# substitute ' ( text )' to ', text,'
str_ext = re.sub(r"(\w+)\s*\(\s*([^\(\)]*?)\s*\)", r"\1, \2,", str_ext)
return str_ext
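# Illustrative example: punctuate_p('¿ Que hora é ?') -> 'Que hora é?'

# to_cotovia(): runs the external Cotovía pipeline on each text segment. The
# segments are written to a temporary .txt file, converted to ISO-8859-1 with
# iconv, passed through `cotovia -i ... -t3 -n`, converted back to UTF-8, and
# finally cleaned with remove_tra3_tags(). Requires the `cotovia`, `iconv` and
# `sed` binaries on the PATH.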
def to_cotovia(text_segments):
# Input and output Cotovía files
res = ''.join(random.choices(string.ascii_lowercase + string.digits, k=5))
COTOVIA_IN_TXT_PATH = res + '.txt'
COTOVIA_IN_TXT_PATH_ISO = 'iso8859-1' + res + '.txt'
COTOVIA_OUT_PRE_PATH = 'iso8859-1' + res + '.tra'
COTOVIA_OUT_PRE_PATH_UTF8 = 'utf8' + res + '.tra'
# print("Text segments: ", text_segments)
# Initial text preprocessing
# substitute ' M€' by 'millóns de euros' and 'somewordM€' by 'someword millóns de euros'
text_segments = [re.sub(r"(\w+)\s*M€", r"\1 millóns de euros", seg) for seg in text_segments]
# substitute ' €' by 'euros' and 'someword€' by 'someword euros'
text_segments = [re.sub(r"(\w+)\s*€", r"\1 euros", seg) for seg in text_segments]
# substitute ' ºC' by 'graos centígrados' and 'somewordºC' by 'someword graos centígrados'
text_segments = [re.sub(r"(\w+)\s*ºC", r"\1 graos centígrados", seg) for seg in text_segments]
text_segments = [subprocess.run(["sed", "-e", "s/₂//g", "-e", "s/⸺//g", "-e", "s/ //g", "-e", "s///g", "-e", "s/č/c/g", "-e", "s/ț/t/g", "-e", "s/ğ/g/g", "-e", "s/ș/s/g",
"-e", "s/ş/s/g", "-e", "s/Ž/Z/g", "-e", "s/ž/z/g", "-e", "s/ț/t/g", "-e", "s/ğ/g/g", "-e", "s/ș/s/g", "-e", "s/ş/s/g", "-e", "s/«//g", "-e", "s/»//g",
"-e", "s/<<//g", "-e", "s/>>//g", "-e", "s/“/\"/g", "-e", "s/”/'\"'/g", "-e", "s/\'//g", "-e", "s/‘//g", "-e", "s/’//g", "-e", "s/…//g",
"-e", "s/-/-/g", "-e", "s/–/-/g", "-e", "s/—/-/g", "-e", "s/―/-/g", "-e", "s/−/-/g", "-e", "s/‒/-/g", "-e", "s/─/-/g", "-e", "s/^Si$/Si\./g"],
input=seg, text=True, capture_output=True).stdout for seg in text_segments]
# print("Text segments after sed: ", text_segments)
with open(COTOVIA_IN_TXT_PATH, 'w') as f:
for seg in text_segments:
if seg:
f.write(seg + '\n')
else:
f.write(',' + '\n')
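    # NOTE: empty segments are written as "," so that Cotovía still emits an
    # output line for them (presumably to keep input and output segments aligned).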
# utf-8 to iso8859-1
subprocess.run(["iconv", "-f", "utf-8", "-t", "iso8859-1", COTOVIA_IN_TXT_PATH, "-o", COTOVIA_IN_TXT_PATH_ISO], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
# call cotovia with -t3 option
subprocess.run(["cotovia", "-i", COTOVIA_IN_TXT_PATH_ISO, "-t3", "-n"], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
# iso8859-1 to utf-8
subprocess.run(["iconv", "-f", "iso8859-1", "-t", "utf-8", COTOVIA_OUT_PRE_PATH, "-o", COTOVIA_OUT_PRE_PATH_UTF8], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
segs = []
try:
with open(COTOVIA_OUT_PRE_PATH_UTF8, 'r') as f:
segs = [line.rstrip() for line in f]
segs = [remove_tra3_tags(line) for line in segs]
    except OSError:
        print("ERROR: Couldn't read cotovia output")
subprocess.run(["rm", COTOVIA_IN_TXT_PATH, COTOVIA_IN_TXT_PATH_ISO, COTOVIA_OUT_PRE_PATH, COTOVIA_OUT_PRE_PATH_UTF8], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
# print("Cotovia segments: ", segs)
return segs
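# text_preprocess(): full preprocessing pipeline. Splits the text at punctuation,
# transcribes each segment with Cotovía, re-inserts the punctuation, converts
# caret accents, and ensures the result ends with a sentence-final mark.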
def text_preprocess(text):
    # Split the text at punctuation marks
text_segments, puncs = split_punc(text)
cotovia_phon_segs = to_cotovia(text_segments)
cotovia_phon_str = merge_punc(cotovia_phon_segs, puncs)
phon_str = accent_convert(cotovia_phon_str)
# remove extra spaces
phon_str = re.sub(r"\s+", r" ", phon_str)
# add final punctuation mark if it is not present
    if not phon_str.endswith(('.', '!', '?')):
        phon_str = phon_str + "."
return phon_str
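# main(): CLI entry point. Preprocesses the input text, loads a Coqui TTS
# Synthesizer from the given checkpoint/config, and saves the synthesized audio
# as "<first_word>.wav".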
def main():
parser = argparse.ArgumentParser(description='Cotovia phoneme transcription.')
    parser.add_argument('text', type=str, help='Text to synthesize')
parser.add_argument('model_path', type=str, help='Absolute path to the model checkpoint.pth')
parser.add_argument('config_path', type=str, help='Absolute path to the model config.json')
args = parser.parse_args()
print("Text before preprocessing: ", args.text)
text = text_preprocess(args.text)
print("Text after preprocessing: ", text)
synthesizer = Synthesizer(
args.model_path, args.config_path, None, None, None, None,
)
    # Extract the first word of the original text to build the output file name
    first_word = args.text.split()[0] if args.text.split() else "audio"
    first_word = sanitize_filename(first_word)  # Sanitize to make it a valid filename
    # Synthesize the preprocessed text once and save the audio
    wavs = synthesizer.tts(text)
filename = f"{first_word}.wav"
synthesizer.save_wav(wavs, filename)
print(f"Audio file saved as: {filename}")
if __name__ == "__main__":
main()
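# Example invocation (script name and paths are placeholders):
#   python synthesize.py "Bos días" /path/to/checkpoint.pth /path/to/config.json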