| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | import string |
| | from copy import deepcopy |
| |
|
| | import regex as re |
| | from tqdm import tqdm |
| |
|
| | from nemo.collections.nlp.data.text_normalization import constants |
| |
|
# Names exported by a star-import of this module.
# NOTE(review): `input_preprocessing` and `remove_puncts` are defined in this
# file but are not listed here -- confirm that omission is intentional.
__all__ = [
    'read_data_file',
    'normalize_str',
    'flatten',
    'convert_fraction',
    'convert_superscript',
    'add_space_around_dash',
]
| |
|
| |
|
def flatten(l):
    """
    Concatenate the elements of every sub-iterable of `l` into one flat list.

    Only one level of nesting is removed.

    Args:
        l: iterable of iterables
    Returns:
        a new list holding the items of each sub-iterable, in order
    """
    flat = []
    for group in l:
        flat.extend(group)
    return flat
| |
|
| |
|
def add_space_around_dash(input: str):
    """
    Add spaces around a dash that sits directly between a digit and a
    non-digit, non-whitespace character, e.g. "a-1" -> "a - 1" and
    "1-a" -> "1 - a". Dashes between two digits ("1-2") or between two
    non-digits ("a-b") are left untouched.

    Args:
        input: text to process (the name shadows the builtin but is kept so
            that keyword callers keep working)
    Returns:
        the text with " - " in place of the matching dashes
    """
    # One pass per direction is enough: the replacement only inserts spaces,
    # and two matches of the same pattern can never overlap (a match ends on a
    # digit and the next must start on a non-digit, or vice versa), so a second
    # round of the same substitutions has nothing left to match.
    input = re.sub(r"([^\s0-9])-([0-9])", r"\1 - \2", input)
    input = re.sub(r"([0-9])-([^\s0-9])", r"\1 - \2", input)
    return input
| |
|
| |
|
def convert_superscript(written: str):
    """
    Convert the superscript digits two (U+00B2) and three (U+00B3) to the
    regular characters "2" and "3".

    Args:
        written: written form
    Returns:
        written: modified form
    """
    # The characters are spelled by Unicode name so the source stays ASCII and
    # cannot be damaged by a wrong file-encoding round trip.
    written = written.replace("\N{SUPERSCRIPT TWO}", "2")
    written = written.replace("\N{SUPERSCRIPT THREE}", "3")
    return written
| |
|
| |
|
# Unicode vulgar-fraction characters paired with their ASCII spelling, listed
# in the order in which the substitutions are applied. Spelled by Unicode name
# so the source stays ASCII and survives encoding round trips.
_VULGAR_FRACTIONS = (
    ("\N{VULGAR FRACTION ONE HALF}", "1/2"),
    ("\N{VULGAR FRACTION ONE THIRD}", "1/3"),
    ("\N{VULGAR FRACTION TWO THIRDS}", "2/3"),
    ("\N{VULGAR FRACTION ONE QUARTER}", "1/4"),
    ("\N{VULGAR FRACTION THREE QUARTERS}", "3/4"),
    ("\N{VULGAR FRACTION ONE FIFTH}", "1/5"),
    ("\N{VULGAR FRACTION TWO FIFTHS}", "2/5"),
    ("\N{VULGAR FRACTION THREE FIFTHS}", "3/5"),
    ("\N{VULGAR FRACTION FOUR FIFTHS}", "4/5"),
    ("\N{VULGAR FRACTION ONE SIXTH}", "1/6"),
    ("\N{VULGAR FRACTION FIVE SIXTHS}", "5/6"),
    ("\N{VULGAR FRACTION ONE EIGHTH}", "1/8"),
    ("\N{VULGAR FRACTION THREE EIGHTHS}", "3/8"),
    ("\N{VULGAR FRACTION FIVE EIGHTHS}", "5/8"),
    ("\N{VULGAR FRACTION SEVEN EIGHTHS}", "7/8"),
)


def convert_fraction(written: str):
    """
    Convert single-character vulgar fractions to the standard "n/d" form,
    e.g. the one-half character (U+00BD) becomes "1/2", and a digit followed
    by it (with or without one whitespace character) becomes "<digit> 1/2".

    A fraction character is only converted when it is at the start of the
    string or directly follows a space, a dash, or a digit (optionally
    separated from the digit by one whitespace character); elsewhere it is
    left as is.

    Args:
        written: written form
    Returns:
        written: modified form
    """
    # Passes 1-3: the fraction follows a space, starts the string, or follows
    # a dash. The leading space/dash is kept; "^" is an anchor and contributes
    # nothing to the replacement.
    for prefix in (" ", "^", "-"):
        kept_prefix = "" if prefix == "^" else prefix
        for glyph, ascii_form in _VULGAR_FRACTIONS:
            written = re.sub(prefix + glyph, kept_prefix + ascii_form, written)

    # Pass 4: the fraction follows a digit, optionally separated by a single
    # whitespace character; exactly one space is left between the two.
    for glyph, ascii_form in _VULGAR_FRACTIONS:
        written = re.sub(r"([0-9])\s?" + glyph, r"\1 " + ascii_form, written)
    return written
| |
|
| |
|
def input_preprocessing(sent: str, lang: str):
    """ Function for preprocessing the input texts. For English, the symbols
    '+', '=', '@' and '*' are spelled out (' plus ', ' equals ', ' at ',
    ' times ') and every character found in `constants.EN_GREEK_TO_SPOKEN`
    (Greek letters such as delta or lambda) is replaced by its mapped value.
    For every language, superscripts and single-character fractions are then
    converted and spaces are added around dashes between digits and non-digits.

    Args:
        sent: input text.
        lang: language

    Returns: preprocessed input text.
    """
    # NOTE(review): the nesting below was reconstructed from a copy of the file
    # that had lost its indentation -- confirm that only the symbol and Greek
    # handling is English-specific.
    if lang == constants.ENGLISH:
        for symbol, spoken in (('+', ' plus '), ('=', ' equals '), ('@', ' at '), ('*', ' times ')):
            sent = sent.replace(symbol, spoken)

        # Map character by character and join once. Splicing replacements into
        # the string while iterating over it by index goes wrong as soon as a
        # replacement is longer than one character, because later indices then
        # point at the wrong position.
        greek_to_spoken = constants.EN_GREEK_TO_SPOKEN
        sent = ''.join(greek_to_spoken[ch] if ch in greek_to_spoken else ch for ch in sent)

    sent = convert_superscript(sent)
    sent = convert_fraction(sent)
    sent = add_space_around_dash(sent)

    return sent
| |
|
| |
|
def read_data_file(fp: str, lang: str, max_insts: int = -1):
    """ Reading the raw data from a file of NeMo format
    For more info about the data format, refer to the
    `text_normalization doc <https://github.com/NVIDIA/NeMo/blob/main/docs/source/nlp/text_normalization.rst>`.

    Every line is preprocessed with `input_preprocessing` and split on tabs.
    A line whose first field is '<eos>' closes the current instance; any other
    line contributes its first three fields to the current instance. Lines
    after the last '<eos>' are not returned.

    Args:
        fp: file paths
        lang: language
        max_insts: Maximum number of instances (-1 means no limit)
    Returns:
        insts: List of instances, each a tuple (classes, w_words, s_words) of
            three parallel lists built from the first, second and third field
            of the instance's lines
    """
    insts = []
    classes, w_words, s_words = [], [], []

    with open(fp, 'r', encoding='utf-8') as f:
        for raw_line in tqdm(f):
            processed = input_preprocessing(raw_line.strip(), lang=lang)
            fields = [field.strip() for field in processed.split('\t')]

            if fields[0] != '<eos>':
                # Regular token line: accumulate into the open instance.
                classes.append(fields[0])
                w_words.append(fields[1])
                s_words.append(fields[2])
                continue

            # End-of-sentence marker: store the finished instance and start
            # collecting a new one.
            insts.append((deepcopy(classes), deepcopy(w_words), deepcopy(s_words)))
            classes, w_words, s_words = [], [], []

            if 0 < max_insts <= len(insts):
                break
    return insts
| |
|
| |
|
def normalize_str(input_str):
    """
    Normalize an input string: strip leading/trailing whitespace, lowercase
    it, and replace each pair of consecutive spaces with a single space.

    The replacement is a single left-to-right pass, so a run of three or more
    spaces is shortened but not necessarily reduced to one space.

    Args:
        input_str: string to normalize
    Returns:
        the normalized string
    """
    # Replacing a single space with a single space would be a no-op; the
    # intended target is a double space. It is written as `" " * 2` so the
    # pattern cannot be silently collapsed by whitespace-normalizing tools.
    double_space = " " * 2
    return input_str.strip().lower().replace(double_space, " ")
| |
|
| |
|
def remove_puncts(input_str):
    """
    Strip every character listed in `string.punctuation` from the input.

    Args:
        input_str: string to clean
    Returns:
        the string with all ASCII punctuation characters removed
    """
    # A translation table that maps a code point to None deletes that character.
    deletion_table = dict.fromkeys(map(ord, string.punctuation))
    return input_str.translate(deletion_table)
| |
|