import json

import numpy as np
import tensorflow as tf
from hazm import Normalizer, word_tokenize
from scipy.spatial import distance


class PreTrainedPipeline:
    def __init__(self, path):
        # Paths to the artifacts shipped alongside this pipeline.
        self.model_dir = path + "/saved_model"
        self.t2id_path = path + "/t2id.json"
        self.id2h_path = path + "/id2h.json"
        self.stopwords_path = path + "/stopwords.txt"
        self.comparison_matrix_path = path + "/comparison_matrix.npz"

        # token -> input id and id -> headword lookup tables.
        with open(self.t2id_path, encoding="utf8") as f:
            self.t2id = json.load(f)
        with open(self.id2h_path, encoding="utf8") as f:
            self.id2h = json.load(f)

        # Stopword list, one word per line.
        with open(self.stopwords_path, encoding="utf8") as f:
            self.stopwords = set(line.strip() for line in f)

        # Precomputed embedding matrix the query vector is compared against.
        self.comparisons = np.load(self.comparison_matrix_path)["arr_0"]

        self.model = tf.saved_model.load(self.model_dir)

    def __call__(self, inputs: str):
        # Preprocess: normalize the sentence, tokenize, drop stopwords.
        sentence = Normalizer().normalize(inputs)
        tokens = [t for t in word_tokenize(sentence) if t not in self.stopwords]

        # Encode up to 20 tokens (the model's fixed input length), padding
        # with id 0 and mapping out-of-vocabulary tokens to 'UNK'.
        input_ids = np.zeros((1, 20), dtype=np.int32)
        for i, token in enumerate(tokens[:20]):
            input_ids[0, i] = self.t2id.get(token, self.t2id["UNK"])

        # Run the model; it returns a single 300-dimensional embedding.
        embeddings = self.model(tf.constant(input_ids)).numpy()

        # Rank candidates by cosine distance to the query embedding and keep
        # the 10 closest (argsort is ascending, so smallest distances first).
        similarities = distance.cdist(embeddings.reshape((1, 300)), self.comparisons, "cosine")[0]
        top_indices = similarities.argsort()[:10]
        top_words = [self.id2h[str(idx)] for idx in top_indices]

        # Map distances to scores: negate and scale by a fixed temperature of
        # 8 so closer words get larger logits, then softmax over the top 10.
        logits = -8 * similarities[top_indices]
        softmax_probs = tf.nn.softmax(logits).numpy()
        top_scores = [round(float(p), 3) for p in softmax_probs]
        
        return [
            [{'label': word, 'score': score} for word, score in zip(top_words, top_scores)]
        ]
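

# A minimal usage sketch, assuming a local directory laid out with the
# artifacts loaded above (saved_model/, t2id.json, id2h.json, stopwords.txt,
# comparison_matrix.npz); the path and the input sentence are placeholders.
if __name__ == "__main__":
    pipe = PreTrainedPipeline("path/to/model/files")  # hypothetical path
    results = pipe("یک جمله نمونه")  # any Persian sentence
    # `results` wraps one list of {'label', 'score'} dicts, ordered from
    # most to least similar word.
    for item in results[0]:
        print(item["label"], item["score"])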