NirajanBekoju's picture
deployment phase 1
464ed03
raw
history blame
No virus
1.16 kB
import regex as re
import torch
import pickle
def preProcessText(text):
    """Normalize raw Nepali text into space-separated Devanagari tokens.

    The substitutions are applied in a fixed order:

    1. The danda (U+0964), ``?`` and ``,`` are each wrapped in single spaces
       (absorbing any whitespace that surrounded them) so that they become
       separate words.
    2. Whitespace around newline characters is stripped.
    3. Every character outside the Devanagari block (U+0900-U+097F) is
       removed, except ``,``, ``?``, ``+`` and whitespace.
    4. Each run of Devanagari digits (U+0966-U+096F), together with any
       surrounding whitespace, is replaced by the ``<num>`` token wrapped in
       single spaces.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string. Leading and trailing spaces introduced by the
        substitutions are not trimmed.
    """
    # Replacement values are plain string literals rather than raw templates
    # containing \u escapes: the stdlib `re` module rejects \u in replacement
    # templates, while literals behave the same under both `re` and `regex`.

    # Put spaces around the Devanagari danda so it becomes a separate word.
    text = re.sub(r'\s*[\u0964]\s*', ' \u0964 ', text)
    # Put spaces around the question mark so it becomes a separate word.
    text = re.sub(r'\s*[\u003f]\s*', ' ? ', text)
    # Put spaces around the comma so it becomes a separate word.
    text = re.sub(r'\s*[\u002c]\s*', ' , ', text)
    # Remove whitespace around newline characters.
    text = re.sub(r'\s*\n\s*', '\n', text)
    # Drop every character that is not Devanagari, a comma, a question mark
    # or whitespace. NOTE: the '+' inside the class is a literal, so plus
    # signs are kept as well.
    text = re.sub(r'[^\u0900-\u097F,?\s+]', '', text)
    # Replace each run of Devanagari digits with the <num> token. The digit
    # range ends at U+096F; U+0970-U+0976 are signs and letters, not digits,
    # and must not be converted.
    text = re.sub(r'\s*[\u0966-\u096F]+\s*', ' <num> ', text)
    return text
def getTokenizer():
    """Load the saved tokenizer and vocabulary from the ``tokenizer`` directory.

    The directory is resolved relative to the current working directory.
    Both files are deserialized with pickle-based loaders, so they must come
    from a trusted source.

    Returns:
        A ``(tokenizer, vocab)`` tuple: the object stored in
        ``tokenizer/tokenizer.pth`` (read with ``torch.load``) and the object
        stored in ``tokenizer/vocab.pkl`` (read with ``pickle.load``).
    """
    base_dir = "tokenizer"

    # The tokenizer is read first, then the vocabulary.
    tokenizer = torch.load(f"{base_dir}/tokenizer.pth")

    with open(f"{base_dir}/vocab.pkl", 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)

    return tokenizer, vocab