import json

import numpy as np
import tensorflow as tf
from hazm import Normalizer, word_tokenize
from scipy.spatial import distance
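
# How the lookup files are used below (a sketch; the real vocabularies ship
# with the checkpoint, and the example entries here are hypothetical):
#   t2id.json  maps token -> input id, e.g. {"کتاب": 17, ..., "UNK": 0}
#   id2h.json  maps a comparison_matrix row index (as a string) -> headword,
#              e.g. {"0": "آب", "1": "آباد", ...}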

class PreTrainedPipeline():
    def __init__(self, path):
        # Paths to the artifacts shipped alongside this file.
        self.model_dir = path + "/saved_model"
        self.t2id_path = path + "/t2id.json"
        self.id2h_path = path + "/id2h.json"
        self.stopwords_path = path + "/stopwords.txt"
        self.comparison_matrix_path = path + "/comparison_matrix.npz"

        self.t2id = json.load(open(self.t2id_path, encoding="utf8"))
        self.id2h = json.load(open(self.id2h_path, encoding="utf8"))
        self.stopwords = set(line.strip() for line in open(self.stopwords_path, encoding="utf8"))

        # Precomputed 300-d embeddings, one row per candidate headword.
        self.comparisons = np.load(self.comparison_matrix_path)["arr_0"]
        self.model = tf.saved_model.load(self.model_dir)

    def __call__(self, inputs: str):
        # Preprocess: normalize the sentence, tokenize it, and drop stopwords.
        sentence = Normalizer().normalize(inputs)
        tokens = [t for t in word_tokenize(sentence) if t not in self.stopwords]

        # Map the first 20 tokens to ids; out-of-vocabulary tokens fall back
        # to 'UNK' and unused positions stay zero-padded.
        input_ids = np.zeros((1, 20), dtype=np.int32)
        for i, token in enumerate(tokens):
            if i >= 20:
                break
            input_ids[0, i] = self.t2id.get(token, self.t2id["UNK"])

        # Run the model to get one 300-d sentence embedding.
        embeddings = self.model(tf.constant(input_ids, dtype=tf.int32)).numpy()

        # Rank candidate headwords by cosine distance to the sentence embedding.
        similarities = distance.cdist(embeddings.reshape((1, 300)), self.comparisons, "cosine")[0]
        top_indices = similarities.argsort()[:10]
        top_words = [self.id2h[str(idx)] for idx in top_indices]

        # Turn distances into scores: sharpen with exp(-10 * distance),
        # then normalize with a softmax.
        logits = np.exp(-10 * similarities[top_indices])
        softmax_probs = tf.nn.softmax(logits).numpy()
        top_scores = [round(float(p), 3) for p in softmax_probs]

        return [
            [{"label": word, "score": score} for word, score in zip(top_words, top_scores)]
        ]
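
# Example usage (a minimal sketch, assuming this file sits next to the model
# artifacts: saved_model/, t2id.json, id2h.json, stopwords.txt, and
# comparison_matrix.npz; the input sentence is only an illustration):
#
#   pipe = PreTrainedPipeline(".")
#   results = pipe("جایی که در آن کتاب نگهداری می‌شود")  # "a place where books are kept"
#   for item in results[0]:
#       print(item["label"], item["score"])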

# Legacy implementation, kept commented out for reference:
#
# def RevDict(sent, flag, model):
#     """
#     Receives a sentence from the user and returns the top-10 (flag=0) or
#     top-100 (flag=1) predictions. The input sentence is normalized and
#     stopwords are removed before prediction.
#     """
#     normalizer = Normalizer()
#     X_Normalized = normalizer.normalize(sent)
#     X_Tokens = word_tokenize(X_Normalized)
#     stopwords = [normalizer.normalize(x.strip()) for x in codecs.open(r"stopwords.txt", 'r', 'utf-8').readlines()]
#     X_Tokens = [t for t in X_Tokens if t not in stopwords]
#     preprocessed = [' '.join(X_Tokens)][0]
#     sent_ids = sent2id([preprocessed])
#     output = np.array((model.predict(sent_ids.reshape((1, 20))).tolist()[0]))
#     distances = distance.cdist(output.reshape((1, 300)), comparison_matrix, "cosine")[0]
#     min_index_100 = distances.argsort()[:100]
#     min_index_10 = distances.argsort()[:10]
#     temp = []
#     if flag == 0:
#         for i in range(10):
#             temp.append(id2h[str(min_index_10[i])])
#     elif flag == 1:
#         for i in range(100):
#             temp.append(id2h[str(min_index_100[i])])
#     for i in range(len(temp)):
#         print(temp[i])
#
# def sent2id(sents):
#     sents_id = np.zeros((len(sents), 20))
#     for j in tqdm(range(len(sents))):
#         for i, word in enumerate(sents[j].split()):
#             try:
#                 sents_id[j, i] = t2id[word]
#             except KeyError:
#                 sents_id[j, i] = t2id['UNK']
#             if i == 19:
#                 break
#     return sents_id