import json

import numpy as np
import tensorflow as tf
from hazm import Normalizer, word_tokenize
from scipy.spatial import distance
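
# How the lookup files are used below (a sketch; the real vocabularies ship
# with the checkpoint, and the example entries here are hypothetical):
#   t2id.json  maps token -> input id, e.g. {"کتاب": 17, ..., "UNK": 0}
#   id2h.json  maps a comparison_matrix row index (as a string) -> headword,
#              e.g. {"0": "آب", "1": "آباد", ...}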

class PreTrainedPipeline():
    def __init__(self, path):
        # Paths to the artifacts shipped alongside this file.
        self.model_dir = path + "/saved_model"
        self.t2id_path = path + "/t2id.json"
        self.id2h_path = path + "/id2h.json"
        self.stopwords_path = path + "/stopwords.txt"
        self.comparison_matrix_path = path + "/comparison_matrix.npz"

        self.t2id = json.load(open(self.t2id_path, encoding="utf8"))
        self.id2h = json.load(open(self.id2h_path, encoding="utf8"))
        self.stopwords = set(line.strip() for line in open(self.stopwords_path, encoding="utf8"))

        # Precomputed 300-d embeddings, one row per candidate headword.
        self.comparisons = np.load(self.comparison_matrix_path)["arr_0"]
        self.model = tf.saved_model.load(self.model_dir)

    def __call__(self, inputs: str):
        # Preprocess: normalize the sentence, tokenize it, and drop stopwords.
        sentence = Normalizer().normalize(inputs)
        tokens = [t for t in word_tokenize(sentence) if t not in self.stopwords]

        # Map the first 20 tokens to ids; out-of-vocabulary tokens fall back
        # to 'UNK' and unused positions stay zero-padded.
        input_ids = np.zeros((1, 20), dtype=np.int32)
        for i, token in enumerate(tokens):
            if i >= 20:
                break
            input_ids[0, i] = self.t2id.get(token, self.t2id["UNK"])

        # Run the model to get one 300-d sentence embedding.
        embeddings = self.model(tf.constant(input_ids, dtype=tf.int32)).numpy()

        # Rank candidate headwords by cosine distance to the sentence embedding.
        similarities = distance.cdist(embeddings.reshape((1, 300)), self.comparisons, "cosine")[0]
        top_indices = similarities.argsort()[:10]
        top_words = [self.id2h[str(idx)] for idx in top_indices]

        # Turn distances into scores: sharpen with exp(-10 * distance),
        # then normalize with a softmax.
        logits = np.exp(-10 * similarities[top_indices])
        softmax_probs = tf.nn.softmax(logits).numpy()
        top_scores = [round(float(p), 3) for p in softmax_probs]

        return [
            [{"label": word, "score": score} for word, score in zip(top_words, top_scores)]
        ]
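
# Example usage (a minimal sketch, assuming this file sits next to the model
# artifacts: saved_model/, t2id.json, id2h.json, stopwords.txt, and
# comparison_matrix.npz; the input sentence is only an illustration):
#
#   pipe = PreTrainedPipeline(".")
#   results = pipe("جایی که در آن کتاب نگهداری می‌شود")  # "a place where books are kept"
#   for item in results[0]:
#       print(item["label"], item["score"])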

# Legacy implementation, kept commented out for reference:
#
# def RevDict(sent, flag, model):
#     """
#     Receives a sentence from the user and returns the top-10 (flag=0) or
#     top-100 (flag=1) predictions. The input sentence is normalized and
#     stopwords are removed before prediction.
#     """
#     normalizer = Normalizer()
#     X_Normalized = normalizer.normalize(sent)
#     X_Tokens = word_tokenize(X_Normalized)
#     stopwords = [normalizer.normalize(x.strip()) for x in codecs.open(r"stopwords.txt", 'r', 'utf-8').readlines()]
#     X_Tokens = [t for t in X_Tokens if t not in stopwords]
#     preprocessed = [' '.join(X_Tokens)][0]
#     sent_ids = sent2id([preprocessed])
#     output = np.array((model.predict(sent_ids.reshape((1, 20))).tolist()[0]))
#     distances = distance.cdist(output.reshape((1, 300)), comparison_matrix, "cosine")[0]
#     min_index_100 = distances.argsort()[:100]
#     min_index_10 = distances.argsort()[:10]
#     temp = []
#     if flag == 0:
#         for i in range(10):
#             temp.append(id2h[str(min_index_10[i])])
#     elif flag == 1:
#         for i in range(100):
#             temp.append(id2h[str(min_index_100[i])])
#     for i in range(len(temp)):
#         print(temp[i])
#
# def sent2id(sents):
#     sents_id = np.zeros((len(sents), 20))
#     for j in tqdm(range(len(sents))):
#         for i, word in enumerate(sents[j].split()):
#             try:
#                 sents_id[j, i] = t2id[word]
#             except KeyError:
#                 sents_id[j, i] = t2id['UNK']
#             if i == 19:
#                 break
#     return sents_id