from transformers import BertTokenizer

from .decoder import Decoder


class Preprocessor:
    """Preprocesses raw text into BERT token-ID tensors for the model.

    Wraps a pretrained ``BertTokenizer`` (uncased English by default, or the
    multilingual cased variant) and a project ``Decoder`` used elsewhere to
    decode one-hot encoded labels.
    """

    def __init__(self, is_multilingual: bool = False) -> None:
        """Initialize the tokenizer and label decoder.

        Args:
            is_multilingual: When True, load ``bert-base-multilingual-cased``;
                otherwise load ``bert-base-uncased``.
        """
        model_name = (
            'bert-base-multilingual-cased' if is_multilingual
            else 'bert-base-uncased'
        )
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        # Decoder converts the model's one-hot encoded labels back to text.
        self.decoder = Decoder()

    def preprocess_text(self, text: str, max_length: int = 65):
        """Tokenize *text* into a fixed-length tensor of BERT token IDs.

        Args:
            text: Raw input string to tokenize.
            max_length: Sequence length to pad/truncate to (default 65,
                matching the model's expected input size).

        Returns:
            A TensorFlow tensor of shape (1, max_length) containing token IDs,
            with [CLS]/[SEP] special tokens added and padding applied.
        """
        return self.tokenizer.encode(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=False,
            return_tensors='tf',
        )