ikeno-ada
/

nllb-200-distilled-600M-bitsandbytes-4bit-cpu

text2text-generation

text-generation-inference

Inference Endpoints

4-bit precision

Model card Files Files and versions Community

ikeno-ada committed on Apr 20

Commit

7b841fa

•

1 Parent(s): af4df46

Create handler.py

Files changed (1) hide show

handler.py +29 -0

handler.py ADDED Viewed

	@@ -0,0 +1,29 @@

+import intel_extension_for_pytorch as ipex
+from typing import  Dict, List, Any
+from transformers import AutoModelForSeq2SeqLM, NllbTokenizerFast
+import torch
class EndpointHandler():
    """Request handler that translates text with a seq2seq model.

    The model and tokenizer are loaded once in ``__init__`` and reused by
    every ``__call__``.
    """

    def __init__(self, path=""):
        """Load the model (weights as bfloat16) and the tokenizer from *path*."""
        self.model = AutoModelForSeq2SeqLM.from_pretrained(path, torch_dtype=torch.bfloat16)
        self.tokenizer = NllbTokenizerFast.from_pretrained(path)

    def __call__(self, data: Dict[str, str]) -> Dict[str, str]:
        """Translate ``data["text"]`` into the language named by ``data["langId"]``.

        Args:
            data (:obj:):
                request payload; must contain ``"text"`` (the text to
                translate) and ``"langId"`` (the target-language token known
                to the tokenizer).

        Returns:
            ``{"translated": <decoded text of the first generated sequence>}``

        Raises:
            ValueError: if ``"text"`` or ``"langId"`` is missing, or if
                ``"langId"`` is not a token the tokenizer knows.
        """
        # Fail early with a clear message instead of silently falling back to
        # the whole payload dict when a key is absent.
        try:
            text = data["text"]
            lang_id = data["langId"]
        except KeyError as err:
            raise ValueError(f"missing required field: {err.args[0]}") from err

        # Resolve the target-language token through the generic vocabulary
        # lookup rather than the tokenizer's ``lang_code_to_id`` mapping,
        # which is not available on every transformers release.
        forced_bos_token_id = self.tokenizer.convert_tokens_to_ids(lang_id)
        if forced_bos_token_id is None or forced_bos_token_id == self.tokenizer.unk_token_id:
            raise ValueError(f"unknown langId: {lang_id!r}")

        # tokenize the input
        inputs = self.tokenizer(text, return_tensors="pt")
        # run the model, forcing the first generated token to the target language
        translated_tokens = self.model.generate(
            **inputs,
            forced_bos_token_id=forced_bos_token_id,
            max_length=512,
        )
        res = self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
        return {"translated": res}