import json

import numpy as np
import tensorflow as tf
from hazm import Normalizer, word_tokenize
from scipy.spatial import distance


class PreTrainedPipeline:
    def __init__(self, path):
        # Paths to the artifacts shipped alongside this pipeline.
        self.model_dir = path + "/saved_model"
        self.t2id_path = path + "/t2id.json"
        self.id2h_path = path + "/id2h.json"
        self.stopwords_path = path + "/stopwords.txt"
        self.comparison_matrix_path = path + "/comparison_matrix.npz"

        # token -> input id and id -> headword lookup tables.
        with open(self.t2id_path, encoding="utf8") as f:
            self.t2id = json.load(f)
        with open(self.id2h_path, encoding="utf8") as f:
            self.id2h = json.load(f)

        # Stopword list, one word per line.
        with open(self.stopwords_path, encoding="utf8") as f:
            self.stopwords = set(line.strip() for line in f)

        # Precomputed embedding matrix the query vector is compared against.
        self.comparisons = np.load(self.comparison_matrix_path)["arr_0"]

        self.model = tf.saved_model.load(self.model_dir)

    def __call__(self, inputs: str):
        # Preprocess: normalize the sentence, tokenize, drop stopwords.
        sentence = Normalizer().normalize(inputs)
        tokens = [t for t in word_tokenize(sentence) if t not in self.stopwords]

        # Encode up to 20 tokens (the model's fixed input length), padding
        # with id 0 and mapping out-of-vocabulary tokens to 'UNK'.
        input_ids = np.zeros((1, 20), dtype=np.int32)
        for i, token in enumerate(tokens[:20]):
            input_ids[0, i] = self.t2id.get(token, self.t2id["UNK"])

        # Run the model; it returns a single 300-dimensional embedding.
        embeddings = self.model(tf.constant(input_ids)).numpy()

        # Rank candidates by cosine distance to the query embedding and keep
        # the 10 closest (argsort is ascending, so smallest distances first).
        similarities = distance.cdist(embeddings.reshape((1, 300)), self.comparisons, "cosine")[0]
        top_indices = similarities.argsort()[:10]
        top_words = [self.id2h[str(idx)] for idx in top_indices]

        # Map distances to scores: negate and scale by a fixed temperature of
        # 8 so closer words get larger logits, then softmax over the top 10.
        logits = -8 * similarities[top_indices]
        softmax_probs = tf.nn.softmax(logits).numpy()
        top_scores = [round(float(p), 3) for p in softmax_probs]
        
        return [
            [{'label': word, 'score': score} for word, score in zip(top_words, top_scores)]
        ]
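

# A minimal usage sketch, assuming a local directory laid out with the
# artifacts loaded above (saved_model/, t2id.json, id2h.json, stopwords.txt,
# comparison_matrix.npz); the path and the input sentence are placeholders.
if __name__ == "__main__":
    pipe = PreTrainedPipeline("path/to/model/files")  # hypothetical path
    results = pipe("یک جمله نمونه")  # any Persian sentence
    # `results` wraps one list of {'label', 'score'} dicts, ordered from
    # most to least similar word.
    for item in results[0]:
        print(item["label"], item["score"])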