Spaces:
Runtime error
Runtime error
| import collections | |
| import re | |
| from enum import Enum | |
| import six | |
# Punctuation characters handled when `Punctuation` is built without an
# explicit `puncs` argument.
_DEF_PUNCS = ';:,.!?¡¿—…"«»“”'

# Record for one stripped punctuation run: `punc` is the matched string
# (including any surrounding whitespace captured by the regex) and `position`
# is the `PuncPosition` member describing where the run sat in the text.
_PUNC_IDX = collections.namedtuple("_punc_index", ("punc", "position"))
class PuncPosition(Enum):
    """Enum for the position of a punctuation run inside a text.

    Members:
        BEGIN: The run is the first match and the text starts with it.
        END: The run is the last match and the text ends with it.
        MIDDLE: Any other run, i.e. one with text on both sides.
        ALONE: The run makes up the entire text.
    """

    BEGIN = 0
    END = 1
    MIDDLE = 2
    ALONE = 3
class Punctuation:
    """Handle punctuations in text.

    Just strip punctuations from text or strip and restore them later.

    Args:
        puncs (str): The punctuations to be processed. Defaults to `_DEF_PUNCS`.

    Example:
        >>> punc = Punctuation()
        >>> punc.strip("This is. example !")
        'This is example'

        >>> text_striped, punc_map = punc.strip_to_restore("This is. example !")
        >>> ' '.join(text_striped)
        'This is example'

        >>> text_restored = punc.restore(text_striped, punc_map)
        >>> text_restored[0]
        'This is. example !'
    """

    def __init__(self, puncs: str = _DEF_PUNCS):
        # Assigning through the property validates the value and compiles
        # the regular expression used by every other method.
        self.puncs = puncs

    @staticmethod
    def default_puncs():
        """Return default set of punctuations."""
        return _DEF_PUNCS

    @property
    def puncs(self):
        """str: The punctuation characters, de-duplicated in first-seen order."""
        return self._puncs

    @puncs.setter
    def puncs(self, value):
        # ValueError (not TypeError) is kept so existing callers that catch it
        # keep working.
        if not isinstance(value, str):
            raise ValueError("[!] Punctuations must be of type str.")
        # dict.fromkeys keeps the first occurrence of every character, i.e. it
        # removes duplicates without changing the order.
        self._puncs = "".join(dict.fromkeys(value))
        # One match = a maximal run of punctuation characters together with
        # the whitespace around and between them.
        self.puncs_regular_exp = re.compile(rf"(\s*[{re.escape(self._puncs)}]+\s*)+")

    def strip(self, text):
        """Remove all the punctuations by replacing with `space`.

        Leading and trailing whitespace is removed from the result.

        Args:
            text (str): The text to be processed.

        Returns:
            str: The text with every punctuation run replaced by one space.

        Example::

            "This is. example !" -> "This is example"
        """
        return self.puncs_regular_exp.sub(" ", text).strip()

    def strip_to_restore(self, text):
        """Remove punctuations from text to restore them later.

        Args:
            text (str): The text to be processed.

        Returns:
            Tuple[List[str], List[_PUNC_IDX]]: The non-empty text chunks found
            between punctuation runs, and one `_PUNC_IDX(punc, position)` per
            run, in order of appearance.

        Examples ::

            "This is. example !" -> (["This is", "example"],
                                     [(". ", MIDDLE), (" !", END)])
        """
        return self._strip_to_restore(text)

    def _strip_to_restore(self, text):
        """Auxiliary method for Punctuation.strip_to_restore()"""
        matches = list(self.puncs_regular_exp.finditer(text))
        if not matches:
            return [text], []
        # the text is only punctuations
        if len(matches) == 1 and matches[0].group() == text:
            return [], [_PUNC_IDX(text, PuncPosition.ALONE)]

        puncs = []  # punctuation map used later to restore punctuations
        chunks = []  # text pieces separated by the punctuation runs
        cursor = 0  # index just past the previous punctuation run
        last = len(matches) - 1
        for idx, match in enumerate(matches):
            if idx == 0 and match.start() == 0:
                position = PuncPosition.BEGIN
            elif idx == last and match.end() == len(text):
                position = PuncPosition.END
            else:
                position = PuncPosition.MIDDLE
            puncs.append(_PUNC_IDX(match.group(), position))

            # A BEGIN run has no text in front of it. Skipping the empty
            # chunk keeps `chunks[0]` the first real piece of text, which is
            # what restore() attaches a BEGIN run to.
            chunk = text[cursor : match.start()]
            if chunk:
                chunks.append(chunk)
            cursor = match.end()

        # if the text does not end with a punctuation, keep what follows the
        # last run as the final chunk
        tail = text[cursor:]
        if tail:
            chunks.append(tail)
        return chunks, puncs

    @classmethod
    def restore(cls, text, puncs):
        """Restore punctuation in a text.

        Args:
            text (List[str]): The text chunks, as returned by
                `strip_to_restore` (possibly transformed chunk by chunk).
            puncs (List[_PUNC_IDX]): The punctuation map returned by
                `strip_to_restore`.

        Returns:
            List[str]: The chunks merged back together with their punctuation.

        Examples ::

            ['This is', 'example'], [(". ", MIDDLE), (" !", END)]
                -> ["This is. example !"]
        """
        return cls._restore(text, puncs, 0)

    @classmethod
    def _restore(cls, text, puncs, num):  # pylint: disable=too-many-return-statements
        """Auxiliary method for Punctuation.restore()"""
        if not puncs:
            return text

        # nothing have been phonemized, returns the puncs alone
        if not text:
            return ["".join(m.punc for m in puncs)]

        current = puncs[0]

        if current.position == PuncPosition.BEGIN:
            return cls._restore([current.punc + text[0]] + text[1:], puncs[1:], num)

        if current.position == PuncPosition.END:
            return [text[0] + current.punc] + cls._restore(text[1:], puncs[1:], num + 1)

        if current.position == PuncPosition.ALONE:
            # `_PUNC_IDX` exposes the matched string as `.punc`.
            return [current.punc] + cls._restore(text, puncs[1:], num + 1)

        # POSITION == MIDDLE
        if len(text) == 1:  # pragma: nocover
            # a corner case where the final part of an intermediate
            # mark (I) has not been phonemized
            return cls._restore([text[0] + current.punc], puncs[1:], num)

        return cls._restore([text[0] + current.punc + text[1]] + text[2:], puncs[1:], num)
| # if __name__ == "__main__": | |
| # punc = Punctuation() | |
| # text = "This is. This is, example!" | |
| # print(punc.strip(text)) | |
| # split_text, puncs = punc.strip_to_restore(text) | |
| # print(split_text, " ---- ", puncs) | |
| # restored_text = punc.restore(split_text, puncs) | |
| # print(restored_text) |