|
import abc |
|
from typing import List, Tuple |
|
|
|
from .punctuation import Punctuation |
|
|
|
|
|
class BasePhonemizer(abc.ABC):
    """Base phonemizer class

    Phonemization follows the following steps:
        1. Preprocessing:
            - strip leading and trailing whitespace
            - remove punctuation
            - keep track of punctuation marks (only when `keep_puncs` is True)

        2. Phonemization:
            - convert each preprocessed text chunk to phonemes

        3. Postprocessing:
            - restore punctuation marks (only when `keep_puncs` is True)
            - return the resulting phoneme string

    Args:
        language (str):
            Language used by the phonemizer.

        punctuations (List[str]):
            List of punctuation marks to be preserved. Defaults to
            `Punctuation.default_puncs()`.

        keep_puncs (bool):
            Whether to preserve punctuation marks or not.

    Raises:
        RuntimeError: If the backend reports itself as unavailable, or if
            `language` is not supported by the backend.
    """

    def __init__(self, language, punctuations=Punctuation.default_puncs(), keep_puncs=False):
        # Fail early if the backend is not usable on this system.
        if not self.is_available():
            raise RuntimeError(f"{self.name()} not installed on your system")

        # Validate the requested language (child classes may customise this step).
        self._language = self._init_language(language)

        # Punctuation handling configuration.
        self._keep_puncs = keep_puncs
        self._punctuator = Punctuation(punctuations)

    def _init_language(self, language):
        """Language initialization

        This method may be overloaded in child classes (see Segments backend)

        Args:
            language (str): Language code requested by the caller.

        Returns:
            The `language` value, unchanged, once it has been validated.

        Raises:
            RuntimeError: If `language` is not supported by the backend.
        """
        if not self.is_supported_language(language):
            raise RuntimeError(f'language "{language}" is not supported by the {self.name()} backend')
        return language

    @property
    def language(self):
        """The language code configured to be used for phonemization"""
        return self._language

    @staticmethod
    @abc.abstractmethod
    def name():
        """The name of the backend"""
        ...

    @classmethod
    @abc.abstractmethod
    def is_available(cls):
        """Returns True if the backend is installed, False otherwise"""
        ...

    @classmethod
    @abc.abstractmethod
    def version(cls):
        """Return the backend version as a tuple (major, minor, patch)"""
        ...

    @staticmethod
    @abc.abstractmethod
    def supported_languages():
        """Return a dict of language codes -> name supported by the backend"""
        ...

    def is_supported_language(self, language) -> bool:
        """Returns True if `language` is supported by the backend"""
        return language in self.supported_languages()

    @abc.abstractmethod
    def _phonemize(self, text, separator):
        """The main phonemization method

        Called by `phonemize` once per preprocessed text chunk.

        Args:
            text (str): A single preprocessed text chunk.
            separator (str): Separator passed through from `phonemize`.
        """

    def _phonemize_preprocess(self, text) -> Tuple[List[str], List]:
        """Preprocess the text before phonemization

        1. strip leading and trailing whitespace
        2. remove punctuation

        Override this if you need a different behaviour

        Returns:
            A pair `(chunks, punctuations)`. When punctuation is not kept,
            `chunks` holds the single punctuation-free text and
            `punctuations` is empty. Otherwise both values come straight from
            `Punctuation.strip_to_restore`.
        """
        text = text.strip()
        if self._keep_puncs:
            # The punctuator returns both the text chunks and the marks
            # needed to restore the punctuation in the postprocessing step.
            return self._punctuator.strip_to_restore(text)
        return [self._punctuator.strip(text)], []

    def _phonemize_postprocess(self, phonemized, punctuations) -> str:
        """Postprocess the raw phonemized output

        Override this if you need a different behaviour

        Args:
            phonemized (List[str]): Phonemized chunks, one per preprocessed chunk.
            punctuations (List): Punctuation marks returned by the preprocessing step.

        Returns:
            The first element of the restored output when punctuation is
            kept, otherwise the first phonemized chunk.
        """
        if self._keep_puncs:
            return self._punctuator.restore(phonemized, punctuations)[0]
        return phonemized[0]

    def phonemize(self, text: str, separator="|", language: str = None) -> str:
        """Returns the `text` phonemized for the given language

        Args:
            text (str):
                Text to be phonemized.

            separator (str):
                string separator used between phonemes. Default to '|'.

            language (str):
                Currently ignored by this base implementation (it is not
                forwarded to `_phonemize`).

        Returns:
            (str): Phonemized text
        """
        chunks, punctuations = self._phonemize_preprocess(text)
        # Each chunk is phonemized independently, then stitched back together.
        phonemized = [self._phonemize(chunk, separator) for chunk in chunks]
        return self._phonemize_postprocess(phonemized, punctuations)

    def print_logs(self, level: int = 0):
        """Print the configured phoneme language and backend name

        Args:
            level (int): Number of tab characters used to indent each line.
        """
        indent = "\t" * level
        print(f"{indent}| > phoneme language: {self.language}")
        print(f"{indent}| > phoneme backend: {self.name()}")