Podcastify

Running on Zero

App Files Files Community

Podcastify / melo /text /fr_phonemizer /cleaner.py

mrfakename

Init

4300fed 9 months ago

raw

history blame

2.93 kB

	"""Set of default text cleaners"""
	# TODO: pick the cleaner for languages dynamically

	import re
	from .french_abbreviations import abbreviations_fr

	# Compiled pattern matching any run of one or more whitespace characters;
	# collapse_whitespace() substitutes each match with a single space.
	_whitespace_re = re.compile(r"\s+")


	# Punctuation normalisation table used by replace_punctuation(): every key
	# found in the text is replaced by its value. Full-width/CJK marks map to
	# ASCII equivalents, line breaks and ellipses become a full stop, and
	# quotes/brackets are dropped (empty replacement).
	rep_map = {
	    "：": ",",
	    "；": ",",
	    "，": ",",
	    "。": ".",
	    "！": "!",
	    "？": "?",
	    "\n": ".",
	    "·": ",",
	    "、": ",",
	    "...": ".",
	    "…": ".",
	    # NOTE(review): "$" is turned into a full stop - confirm this is intended.
	    "$": ".",
	    "“": "",
	    "”": "",
	    "‘": "",
	    "’": "",
	    "（": "",
	    "）": "",
	    "(": "",
	    ")": "",
	    "《": "",
	    "》": "",
	    "【": "",
	    "】": "",
	    "[": "",
	    "]": "",
	    "—": "",
	    "～": "-",
	    "~": "-",
	    "「": "",
	    "」": "",
	    "¿": "",
	    "¡": "",
	}


	def replace_punctuation(text):
	    """Replace every occurrence of a ``rep_map`` key in *text* by its value.

	    Args:
	        text: Input string.

	    Returns:
	        A copy of *text* in which each key of ``rep_map`` has been
	        substituted by the corresponding value; all other characters are
	        left untouched.
	    """
	    # The alternatives must be joined with a bare "|": joining with "\|"
	    # yields a regex in which the pipe is a literal character, so the
	    # individual keys would never match on their own.
	    # Longer keys are tried first so that a multi-character key such as
	    # "..." can never be shadowed by a shorter key that is its prefix.
	    keys = sorted(rep_map, key=len, reverse=True)
	    pattern = re.compile("|".join(re.escape(key) for key in keys))
	    return pattern.sub(lambda match: rep_map[match.group()], text)

	def expand_abbreviations(text, lang="fr"):
	    """Expand known abbreviations in *text* for the given language.

	    Args:
	        text: Input string.
	        lang: Language identifier. Only "fr" has an abbreviation table
	            (``abbreviations_fr``).

	    Returns:
	        The text with every ``(regex, replacement)`` pair of the language's
	        table applied in order via ``re.sub``. For any language other than
	        "fr" the text is returned unchanged.
	    """
	    # No table exists for other languages; return early instead of falling
	    # through to a loop over an unbound variable (UnboundLocalError).
	    if lang != "fr":
	        return text
	    for regex, replacement in abbreviations_fr:
	        text = re.sub(regex, replacement, text)
	    return text


	def lowercase(text):
	    """Return *text* converted to lower case via ``str.lower``."""
	    return text.lower()


	def collapse_whitespace(text):
	    """Collapse whitespace runs to single spaces and trim both ends.

	    Args:
	        text: Input string.

	    Returns:
	        *text* with every match of ``_whitespace_re`` replaced by one space
	        and leading/trailing whitespace stripped.
	    """
	    # Call the compiled pattern's own method rather than routing it back
	    # through re.sub().
	    return _whitespace_re.sub(" ", text).strip()

	def remove_punctuation_at_begin(text):
	    """Strip any leading run of ``,``, ``.``, ``!`` or ``?`` from *text*.

	    Only characters at the very start are removed; a leading space stops
	    the stripping, and punctuation elsewhere in the text is untouched.

	    Args:
	        text: Input string.

	    Returns:
	        *text* without its leading punctuation characters.
	    """
	    # str.lstrip with a character set does the same as the anchored
	    # regex '^[,.!?]+' without needing the regex engine.
	    return text.lstrip(",.!?")

	def remove_aux_symbols(text):
	    """Delete auxiliary symbols from *text*.

	    The removed characters are angle brackets, square brackets, the ASCII
	    double quote and the guillemets « ».

	    Args:
	        text: Input string.

	    Returns:
	        *text* with all of those characters removed.
	    """
	    # Inside a character class only the square brackets need escaping.
	    return re.sub(r'[<>\[\]"«»]+', "", text)


	def replace_symbols(text, lang="en"):
	    """Replace symbols based on the language tag.

	    ``;`` and ``:`` become ``,``; ``-`` becomes a space (or is removed for
	    Catalan); ``&`` is spelled out as the conjunction of the given language,
	    surrounded by spaces; apostrophes are removed for Catalan and Spanish
	    only.

	    Args:
	        text: Input text.
	        lang: Language identifier, one of "en", "fr", "pt", "ca", "es".
	            For any other value ``&`` and apostrophes are left as they are.

	    Returns:
	        The modified text.

	    Example:
	        >>> replace_symbols("si l'avi cau, diguem-ho", lang="ca")
	        'si lavi cau, diguemho'
	    """
	    # Spoken form of "&" per language. Each entry is padded with spaces so
	    # that "A&B" does not fuse into a single word.
	    ampersand_words = {
	        "en": " and ",
	        "fr": " et ",
	        "pt": " e ",
	        "ca": " i ",
	        "es": " y ",
	    }
	    text = text.replace(";", ",")
	    text = text.replace("-", "" if lang == "ca" else " ")
	    text = text.replace(":", ",")
	    if lang in ampersand_words:
	        text = text.replace("&", ampersand_words[lang])
	    if lang in ("ca", "es"):
	        text = text.replace("'", "")
	    return text

	def french_cleaners(text):
	    """Run the French text-cleaning pipeline on *text*.

	    Steps, in order: expand French abbreviations, normalise punctuation,
	    replace symbols, strip auxiliary symbols, drop leading punctuation,
	    collapse whitespace, and finally append a full stop when the text does
	    not already end with one of ``. , ! ? - …``.

	    There is no need to expand numbers here; per the original author's
	    note the phonemizer already does that.

	    Args:
	        text: Raw input string.

	    Returns:
	        The cleaned string.
	    """
	    text = expand_abbreviations(text, lang="fr")
	    # Case is kept as-is: lowercase() is deliberately not applied, the
	    # original note being that a cased BERT is used.
	    text = replace_punctuation(text)
	    text = replace_symbols(text, lang="fr")
	    text = remove_aux_symbols(text)
	    text = remove_punctuation_at_begin(text)
	    text = collapse_whitespace(text)
	    # Guarantee terminal punctuation; an empty string has no last character
	    # to match and therefore stays empty.
	    text = re.sub(r"([^.,!?\-…])$", r"\1.", text)
	    return text