import os
import pathlib

from tokenizers.normalizers import BertNormalizer

# Load the per-character replacement table shipped next to this module.
# Each line of vocab_mappings.txt is "<char><sep><replacement>": the first
# character is the key, everything after the two-character prefix is the value.
# Open with an explicit UTF-8 encoding (the table contains non-ASCII chars)
# and a context manager so the handle is always closed.
_vocab_path = pathlib.Path(__file__).parent.resolve() / 'vocab_mappings.txt'
with open(_vocab_path, 'r', encoding='utf-8') as f:
    mappings = {m[0]: m[2:] for m in f.read().strip().split('\n')}

# BERT-style normalizer: preserves case, strips accents, cleans control
# characters, and space-pads CJK characters.
norm = BertNormalizer(lowercase=False, strip_accents=True, clean_text=True, handle_chinese_chars=True)
def normalize(text):
    """Normalize *text* line by line.

    Each line is run through the module-level BERT normalizer (``norm``),
    then every character is replaced via the ``mappings`` table; characters
    absent from the table become a single space. Lines are rejoined with
    ``'\\n'``, preserving the original line structure.
    """
    out = []
    for line in text.split('\n'):
        normed = norm.normalize_str(line)
        # ''.join over a generator instead of repeated += — avoids
        # quadratic string concatenation on long lines.
        out.append(''.join(mappings.get(c, ' ') for c in normed))
    return '\n'.join(out)