|
import regex as re |
|
import torch |
|
import pickle |
|
|
|
def preProcessText(text):
    """Clean and normalize Devanagari text before tokenization.

    The steps run in this order:

    1. The danda (U+0964), ``?`` and ``,`` are each surrounded by exactly
       one space; any whitespace already around them (newlines included,
       since ``\\s`` matches them) is absorbed.
    2. Whitespace around every remaining newline is collapsed to a single
       newline.
    3. Every character that is not in the Devanagari block
       (U+0900-U+097F), a comma, a question mark or whitespace is removed.
    4. Each run of Devanagari digits (U+0966-U+096F), together with the
       whitespace around it, is replaced by ``" <num> "``.

    Args:
        text: Raw input string.

    Returns:
        The normalized string.
    """
    # Replacement strings are written as plain text without backslash
    # escapes, so they mean the same thing under the stdlib ``re`` module and
    # the third-party ``regex`` module (stdlib ``re`` rejects ``\u`` escapes
    # inside a replacement template).
    text = re.sub(r'\s*\u0964\s*', ' \u0964 ', text)
    text = re.sub(r'\s*\?\s*', ' ? ', text)
    text = re.sub(r'\s*,\s*', ' , ', text)

    # Trim whitespace on both sides of line breaks.
    text = re.sub(r'\s*\n\s*', '\n', text)

    # The quantifier sits outside the class: inside it, "+" would be a
    # literal and plus signs would survive the cleanup.
    text = re.sub(r'[^\u0900-\u097F,?\s]+', '', text)

    # Devanagari digits end at U+096F; U+0970-U+0976 are an abbreviation
    # sign and letters, which must not be turned into <num>.
    text = re.sub(r'\s*[\u0966-\u096F]+\s*', ' <num> ', text)

    return text
|
|
|
def getTokenizer(tokenizer_dir="tokenizer"):
    """Load the saved tokenizer and vocabulary from disk.

    Reads ``tokenizer.pth`` with ``torch.load`` and ``vocab.pkl`` with
    ``pickle.load`` from ``tokenizer_dir``.

    Args:
        tokenizer_dir: Directory holding both files. Defaults to
            ``"tokenizer"``, which is resolved against the current working
            directory.

    Returns:
        A ``(tokenizer, vocab)`` tuple holding the two deserialized objects.

    Raises:
        FileNotFoundError: If either file is missing.
    """
    tokenizer_path = f"{tokenizer_dir}/tokenizer.pth"
    vocab_path = f"{tokenizer_dir}/vocab.pkl"

    # SECURITY: torch.load and pickle.load can execute arbitrary code while
    # deserializing; only point this at files from a trusted source.
    loaded_tokenizer = torch.load(tokenizer_path)
    with open(vocab_path, 'rb') as file:
        loaded_vocab = pickle.load(file)

    return loaded_tokenizer, loaded_vocab