import pickle

from transformers import BertTokenizer


class Decoder:
    """Decodes one-hot encoded model predictions back into label names."""

    def __init__(self):
        # Load the label encoders that were fitted and pickled during preprocessing.
        with open('pipeline/preprocessing/encoder_toxicity.pkl', 'rb') as f:
            self.__encoder_toxicity = pickle.load(f)
        with open('pipeline/preprocessing/encoder_emotion.pkl', 'rb') as f:
            self.__encoder_emotion = pickle.load(f)

    # Decode one-hot encoded labels back to their original class names
    def toxicity(self, pred):
        return self.__encoder_toxicity.inverse_transform(pred)

    def emotion(self, pred):
        return self.__encoder_emotion.inverse_transform(pred)


class Preprocessor:
    """Preprocesses raw text into BERT input tensors for the model."""

    def __init__(self, is_multilingual=False):
        # Select a multilingual or English-only BERT tokenizer.
        if is_multilingual:
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        else:
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        # Decoder used to map one-hot encoded predictions back to label names.
        self.decoder = Decoder()

    def preprocess_text(self, text):
        # Tokenize, pad/truncate to 65 tokens, and return a TensorFlow tensor of token IDs.
        return self.tokenizer.encode(
            text,
            add_special_tokens=True,
            max_length=65,
            padding="max_length",
            truncation=True,
            return_attention_mask=False,
            return_tensors='tf',
        )
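

# Illustrative usage sketch (not part of the original module): it assumes the encoder
# pickles above are present and that a trained classifier exists elsewhere in the
# pipeline; the model path below is hypothetical and only shows how Preprocessor and
# Decoder are meant to be combined.
if __name__ == "__main__":
    preprocessor = Preprocessor(is_multilingual=False)
    token_ids = preprocessor.preprocess_text("You are a wonderful person.")  # shape (1, 65)

    # With a trained model, predictions could be decoded back into label names:
    # import tensorflow as tf
    # model = tf.keras.models.load_model('pipeline/model')  # hypothetical path
    # pred = model.predict(token_ids)
    # print(preprocessor.decoder.toxicity(pred))
    # print(preprocessor.decoder.emotion(pred))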