Update pipeline.py (#1)

Browse files

- Update pipeline.py (f280e60895593dbf16c813f70cee227a31c973f7)

Co-authored-by: Ali Ghadami <alighadami77@users.noreply.huggingface.co>

Files changed (1) hide show

pipeline.py +49 -20

pipeline.py CHANGED Viewed

@@ -1,36 +1,65 @@
 # from scipy.special import softmax
 import tensorflow as tf
 class PreTrainedPipeline():
     def __init__(self, path):
-        # define the best model TODO
-        sequence_input = tf.keras.Input(shape=(300), name='input')
-        x = tf.keras.layers.Dense(2048, activation="LeakyReLU")(sequence_input)
-        x = tf.keras.layers.Dense(1024, activation="LeakyReLU")(x)
-        x = tf.keras.layers.Dense(512, activation="LeakyReLU")(x)
-        x = tf.keras.layers.Dense(128, activation="LeakyReLU")(x)
-        x = tf.keras.layers.Dense(512, activation="LeakyReLU")(x)
-        x = tf.keras.layers.Dense(1024, activation="LeakyReLU")(x)
-        x = tf.keras.layers.Dense(2048, activation="LeakyReLU")(x)
-        outputs = tf.keras.layers.Dense(300, activation="tanh")(x)
-        model = tf.keras.Model(sequence_input, outputs)
-        model.compile(optimizer="Adamax", loss="cosine_similarity")
-        # model.load_weights("path to model file") TODO
-        self.model = model
-    def __call__(self, inputs: str):
         return [
-            [ # Sample output, call the model here TODO
-                {'label': 'POSITIVE', 'score': 0.05},
-                {'label': 'NEGATIVE', 'score': 0.03},
-                {'label': 'معنی', 'score': 0.92},
-                {'label': f'{inputs}', 'score': 0},
             ]
         ]
     # def RevDict(sent,flag,model):
     #     """

 # from scipy.special import softmax
 import tensorflow as tf
+from transformers import Pipeline
+import tensorflow as tf
+import numpy as np
+import json
+from hazm import *
+from scipy.spatial import distance
 class PreTrainedPipeline():
     def __init__(self, path):
+        """Load the vocabulary maps, stop words, comparison matrix and model.
+
+        Args:
+            path: Directory that contains the artefact files. When it is
+                empty or ``None`` the files are looked up relative to the
+                current working directory.
+
+        Raises:
+            OSError: If one of the artefact files cannot be opened.
+        """
+        import os
+
+        base_dir = path or ""
+        self.model_dir = os.path.join(base_dir, "saved_model")
+        self.t2id_path = os.path.join(base_dir, "t2id.json")
+        self.stopwords_path = os.path.join(base_dir, "stopwords.txt")
+        self.id2h_path = os.path.join(base_dir, "id2h.json")
+        # This attribute was read below but never assigned, so construction
+        # always failed with AttributeError.
+        # TODO(review): confirm the actual file name of the comparison matrix.
+        self.comparison_matrix_path = os.path.join(base_dir, "comparison_matrix.npz")
+        # Context managers close every file handle once its content is read.
+        with open(self.t2id_path, encoding="utf8") as t2id_file:
+            self.t2id = json.load(t2id_file)
+        with open(self.id2h_path, encoding="utf8") as id2h_file:
+            self.id2h = json.load(id2h_file)
+        with open(self.stopwords_path, encoding="utf8") as stopwords_file:
+            self.stopwords = set(line.strip() for line in stopwords_file)
+        # The 'arr_0' key is read from the loaded archive; the array is
+        # extracted before the archive is closed.
+        with np.load(self.comparison_matrix_path) as archive:
+            self.comparisons = archive['arr_0']
+        self.model = tf.saved_model.load(self.model_dir)
+    def __call__(self, inputs: str):
+        """Return the entries of ``self.id2h`` closest to the embedded input.
+
+        The sentence is normalized and tokenized, stop words are dropped and
+        the remaining tokens are mapped to ids through ``self.t2id`` (tokens
+        missing from the map use the ``'UNK'`` id). The model output is then
+        compared with every row of ``self.comparisons`` by cosine distance.
+
+        Args:
+            inputs: Raw input sentence.
+
+        Returns:
+            A list holding one list of ``{'label': ..., 'score': 0}`` dicts,
+            ordered from the smallest cosine distance upwards. ``'label'`` is
+            the ``self.id2h`` entry itself; ``'score'`` is a constant 0.
+        """
+        max_tokens = 20  # width of the id buffer; unused positions stay 0
+        top_k = 4  # number of entries returned
+        # Preprocess the input sentence
+        sentence = Normalizer().normalize(inputs)
+        tokens = [t for t in word_tokenize(sentence) if t not in self.stopwords]
+        # Integer buffer, so the ids are never stored as floats before the
+        # int32 tensor is built.
+        input_ids = np.zeros((1, max_tokens), dtype=np.int32)
+        for i, token in enumerate(tokens[:max_tokens]):
+            input_ids[0, i] = self.t2id.get(token, self.t2id['UNK'])
+        # Call the model on the input ids
+        embeddings = self.model(tf.constant(input_ids, dtype=tf.int32)).numpy()
+        # cdist(..., "cosine") yields distances, so ascending order puts the
+        # most similar rows first.
+        distances = distance.cdist(embeddings.reshape(1, -1), self.comparisons, "cosine")[0]
+        top_indices = distances.argsort()[:top_k]
+        # Each label is the id2h entry itself, not a one-element list.
+        return [
+            [
+                {'label': self.id2h[str(index)], 'score': 0}
+                for index in top_indices
+            ]
+        ]
+        # return [
+        #     [ # Sample output, call the model here TODO
+        #         {'label': 'POSITIVE', 'score': 0.05},
+        #         {'label': 'NEGATIVE', 'score': 0.03},
+        #         {'label': 'معنی', 'score': 0.92},
+        #         {'label': f'{inputs}', 'score': 0},
+        #     ]
+        # ]
     # def RevDict(sent,flag,model):
     #     """