# demo_model/code/normalize_text.py
# Ayush121's picture
# Upload 686 files
# b170003
import os
import pathlib
from tokenizers.normalizers import BertNormalizer
# Load the character mapping table stored next to this module.
# Each non-empty line of vocab_mappings.txt is interpreted as:
#   first character  -> the key (the character to be replaced)
#   second character -> a one-character separator (ignored)
#   rest of the line -> the replacement text
# NOTE(review): the file is assumed to be UTF-8 encoded; the encoding is
# pinned so loading does not depend on the platform's default locale.
with open(
    os.path.join(pathlib.Path(__file__).parent.resolve(), 'vocab_mappings.txt'),
    'r',
    encoding='utf-8',
) as f:
    # Split on '\n' only (not str.splitlines), so that other line-boundary
    # characters can still appear inside a mapping entry.
    mappings = f.read().strip().split('\n')

# Blank lines carry no key, so skip them instead of failing on m[0].
mappings = {m[0]: m[2:] for m in mappings if m}

# Shared normalizer applied to every line before the character remapping.
norm = BertNormalizer(lowercase=False, strip_accents=True, clean_text=True, handle_chinese_chars=True)
def normalize(text: str) -> str:
    """Normalize *text* line by line and remap each character.

    The input is split on line-feed characters. Every line is passed through
    the module-level ``norm`` normalizer, and then each character of the
    result is replaced by its entry in the module-level ``mappings`` table.
    Characters that have no entry in ``mappings`` are replaced by a single
    space. The processed lines are joined back together with line feeds.

    Args:
        text: The string to normalize; may contain several lines.

    Returns:
        The normalized string, with the same number of lines as the input.
    """
    out = []
    for line in text.split('\n'):
        cleaned = norm.normalize_str(line)
        # Build each line with a single join instead of repeated `+=`,
        # which re-allocates the string for every character.
        out.append(''.join(mappings.get(c, ' ') for c in cleaned))
    return '\n'.join(out)