behnamsa
/

persian-reverse-dict

Text Classification

reverse-dictionary

Model card Files Files and versions Community

persian-reverse-dict / pipeline.py

behnamsa's picture

Fix scores

63fceef over 1 year ago

raw history blame contribute delete

No virus

2.01 kB

	import tensorflow as tf
	from transformers import Pipeline
	import tensorflow as tf
	import numpy as np
	import json
	from hazm import *
	from scipy.spatial import distance


	class PreTrainedPipeline():
	    """Reverse-dictionary pipeline: turn a description into candidate words.

	    A query sentence is normalized and tokenized with hazm, stripped of
	    stopwords, encoded as a fixed-length row of token ids and passed to a
	    TensorFlow SavedModel. The resulting embedding is compared by cosine
	    distance against the rows of a precomputed comparison matrix; the
	    closest rows are returned as labels with softmax scores.
	    """

	    # Length of the id row fed to the model; extra tokens are dropped and
	    # unused positions stay 0.
	    MAX_TOKENS = 20
	    # Upper bound on the number of candidates returned per query.
	    TOP_K = 10
	    # Multiplier applied to the negated cosine distances before the softmax.
	    LOGIT_SCALE = 8

	    def __init__(self, path):
	        """Load the model and every lookup resource stored under *path*.

	        Args:
	            path: Directory containing ``saved_model/``, ``t2id.json``,
	                ``id2h.json``, ``stopwords.txt`` and
	                ``comparison_matrix.npz``.
	        """
	        self.model_dir = f"{path}/saved_model"
	        self.t2id_path = f"{path}/t2id.json"
	        self.id2h_path = f"{path}/id2h.json"
	        self.stopwords_path = f"{path}/stopwords.txt"
	        self.comparison_matrix_path = f"{path}/comparison_matrix.npz"

	        # Context managers make sure each file handle is closed once read.
	        with open(self.t2id_path, encoding="utf8") as t2id_file:
	            self.t2id = json.load(t2id_file)
	        with open(self.id2h_path, encoding="utf8") as id2h_file:
	            self.id2h = json.load(id2h_file)
	        with open(self.stopwords_path, encoding="utf8") as stopwords_file:
	            self.stopwords = {line.strip() for line in stopwords_file}

	        # The matrix is stored under the archive's default key 'arr_0';
	        # indexing reads it into memory, so the archive can be closed.
	        with np.load(self.comparison_matrix_path) as archive:
	            self.comparisons = archive['arr_0']

	        self.model = tf.saved_model.load(self.model_dir)
	        # Reused across calls instead of being rebuilt for every query.
	        self._normalizer = Normalizer()

	    def _encode(self, tokens):
	        """Return a ``(1, MAX_TOKENS)`` int32 array of ids for *tokens*.

	        Tokens missing from ``self.t2id`` map to the id stored under
	        ``'UNK'``; tokens beyond ``MAX_TOKENS`` are ignored.
	        """
	        ids = np.zeros((1, self.MAX_TOKENS), dtype=np.int32)
	        window = tokens[:self.MAX_TOKENS]
	        if window:
	            # Only look up 'UNK' when there is at least one token to encode.
	            unk_id = self.t2id['UNK']
	            ids[0, :len(window)] = [self.t2id.get(t, unk_id) for t in window]
	        return ids

	    def _nearest(self, embedding):
	        """Return ``(indices, distances)`` of the closest comparison rows.

	        *embedding* is flattened into a single query row and compared to
	        ``self.comparisons`` by cosine distance. At most ``TOP_K`` rows are
	        returned, ordered from smallest to largest distance; fewer are
	        returned when the matrix has fewer rows.
	        """
	        query = np.asarray(embedding).reshape((1, -1))
	        distances = distance.cdist(query, self.comparisons, "cosine")[0]
	        top_indices = distances.argsort()[:self.TOP_K]
	        return top_indices, distances[top_indices]

	    def __call__(self, inputs: str):
	        """Rank candidate words for the description in *inputs*.

	        Args:
	            inputs: The query sentence.

	        Returns:
	            A list holding one list of ``{'label': word, 'score': score}``
	            dicts, closest candidate first. Scores are softmax
	            probabilities over the returned candidates, rounded to three
	            decimals.
	        """
	        # Preprocess the input sentence
	        sentence = self._normalizer.normalize(inputs)
	        tokens = [t for t in word_tokenize(sentence) if t not in self.stopwords]
	        input_ids = self._encode(tokens)

	        # Call the model on the input ids
	        embeddings = self.model(tf.constant(input_ids, dtype=tf.int32)).numpy()

	        # Postprocess the embeddings to get the most similar words
	        top_indices, top_distances = self._nearest(embeddings)
	        top_words = [self.id2h[str(int(index))] for index in top_indices]
	        # Distances are negated so that a smaller distance gives a higher score.
	        logits = -self.LOGIT_SCALE * top_distances
	        softmax_probs = tf.nn.softmax(logits).numpy()
	        top_scores = [round(float(prob), 3) for prob in softmax_probs]

	        return [
	            [{'label': word, 'score': score} for word, score in zip(top_words, top_scores)]
	        ]