EuropeanParliament
/

eurovoc_eu

Text Classification

Inference Endpoints

Model card Files Files and versions Community

scampion committed on Nov 17, 2023

Commit

f3a36cd

•

1 Parent(s): 851b190

Upload handler.py

Files changed (1) hide show

handler.py +75 -0

handler.py ADDED Viewed

	@@ -0,0 +1,75 @@

+from typing import Dict, List, Any
+import numpy as np
+import pickle
+from sklearn.preprocessing import MultiLabelBinarizer
+from transformers import AutoTokenizer
+import torch
+from eurovoc import EurovocTagger
+BERT_MODEL_NAME = "nlpaueb/legal-bert-base-uncased"  # Hub id used for the tokenizer and passed to EurovocTagger.from_pretrained
+MAX_LEN = 512  # used both as the chunk size in characters and as the tokenizer's max_length in tokens
+TEXT_MAX_LEN = MAX_LEN * 50  # only the first 25,600 characters of an input are scored
+tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)  # loaded once at import time and shared by every handler call
+class EndpointHandler:
+    mlb = MultiLabelBinarizer()
+    def __init__(self, path=""):
+        self.mlb = pickle.load(open(f"{path}/mlb.pickle", "rb"))
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.model = EurovocTagger.from_pretrained(path,
+                                                   bert_model_name=BERT_MODEL_NAME,
+                                                   n_classes=len(self.mlb.classes_),
+                                                   map_location=self.device)
+        self.model.eval()
+        self.model.freeze()
+    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
+        """
+       data args:
+            inputs (:obj: `str` | `PIL.Image` | `np.array`)
+            kwargs
+      Return:
+            A :obj:`list` | `dict`: will be serialized and returned
+        """
+        text = data.pop("inputs", data)
+        topk = data.pop("topk", 5)
+        threshold = data.pop("threshold", 0.16)
+        debug = data.pop("debug", False)
+        prediction = self.get_prediction(text)
+        results = [{"label": label, "score": float(score)} for label, score in
+                   zip(self.mlb.classes_, prediction[0].tolist())]
+        results = sorted(results, key=lambda x: x["score"], reverse=True)
+        results = [r for r in results if r["score"] > threshold]
+        results = results[:topk]
+        if debug:
+            return {"results": results, "values": prediction, "input": text}
+        else:
+            return {"results": results}
+    def get_prediction(self, text):
+        # split text into chunks of MAX_LEN and get average prediction for each chunk
+        chunks = [text[i:i + MAX_LEN] for i in range(0, min(len(text), TEXT_MAX_LEN), MAX_LEN)]
+        predictions = [self._get_prediction(chunk) for chunk in chunks]
+        predictions = np.array(predictions).mean(axis=0)
+        return predictions
+    def _get_prediction(self, text):
+        item = tokenizer.encode_plus(
+            text,
+            add_special_tokens=True,
+            max_length=MAX_LEN,
+            return_token_type_ids=False,
+            padding="max_length",
+            truncation=True,
+            return_attention_mask=True,
+            return_tensors='pt')
+        _, prediction = self.model(item["input_ids"], item["attention_mask"])
+        prediction = prediction.cpu().detach().numpy()
+        return prediction