arubenruben committed on
Commit 1d68afa
1 Parent(s): 38fff5a

Update deploy_pipeline.py

Files changed (1)
  1. deploy_pipeline.py  +10 -12
deploy_pipeline.py CHANGED

@@ -14,7 +14,7 @@ class TokenizeAndAlignLabelsStep():
     # Adapted From : https://huggingface.co/docs/transformers/tasks/token_classification
     def tokenize_and_align_labels(self, examples, tokenizer):

-        tokenized_inputs = tokenizer(examples, padding='max_length', max_length=512)
+        tokenized_inputs = tokenizer(examples, padding='max_length', truncation=True, max_length=128)

         # Map tokens to their respective word.
         word_ids = tokenized_inputs.word_ids()

@@ -34,9 +34,7 @@ class TokenizeAndAlignLabelsStep():

             previous_word_idx = word_idx

-        tokenized_inputs["tokens"] = examples
-        tokenized_inputs["ner_tags"] = []
-        tokenized_inputs["labels"] = []
+        tokenized_inputs["tokens"] = examples
         tokenized_inputs["labels_mask"] = labels_mask

         return tokenized_inputs

@@ -60,16 +58,16 @@ class BERT_CRF_Pipeline(Pipeline):
     def _forward(self, tokenizer_results):

         input_ids = torch.tensor(
-            tokenizer_results['input_ids'], dtype=torch.long).unsqueeze(0)
+            tokenizer_results['input_ids'], dtype=torch.long, device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")).unsqueeze(0)

         token_type_ids = torch.tensor(
-            tokenizer_results['token_type_ids'], dtype=torch.long).unsqueeze(0)
+            tokenizer_results['token_type_ids'], dtype=torch.long, device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")).unsqueeze(0)

         attention_mask = torch.tensor(
-            tokenizer_results['attention_mask'], dtype=torch.bool).unsqueeze(0)
+            tokenizer_results['attention_mask'], dtype=torch.bool, device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")).unsqueeze(0)

         labels_mask = torch.tensor(
-            tokenizer_results['labels_mask'], dtype=torch.bool).unsqueeze(0)
+            tokenizer_results['labels_mask'], dtype=torch.bool, device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")).unsqueeze(0)

         # input_ids, token_type_ids, attention_mask, labels, labels_mask
         outputs = self.model(input_ids=input_ids, token_type_ids=token_type_ids,

@@ -87,12 +85,12 @@ class BERT_CRF_Pipeline(Pipeline):

 def main():

-    PIPELINE_REGISTRY.register_pipeline("arubenruben/PT-BERT-Large-CRF-HAREM-Default-pipeline",
+    PIPELINE_REGISTRY.register_pipeline("PT-BERT-Large-CRF-HAREM-Default-pipeline",
                                         pipeline_class=BERT_CRF_Pipeline,
                                         pt_model=AutoModelForTokenClassification,
                                         )
-    classifier = pipeline("arubenruben/PT-BERT-Large-CRF-HAREM-Default-pipeline", model="arubenruben/PT-BERT-Large-CRF-HAREM-Default",
-                          device='cuda' if torch.cuda.is_available() else 'cpu', trust_remote_code=True)
+    classifier = pipeline("PT-BERT-Large-CRF-HAREM-Default-pipeline", model="arubenruben/PT-BERT-Large-CRF-HAREM-Default",
+                          device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"), trust_remote_code=True)
     out_path = os.path.join(sys.path[0], 'out', 'pipeline')
     repo = Repository(
         out_path, clone_from=f"arubenruben/PT-BERT-Large-CRF-HAREM-Default", use_auth_token=True)

@@ -100,4 +98,4 @@ def main():
     # repo.git_pull()

     classifier.save_pretrained(out_path)
-    repo.push_to_hub()
+    repo.push_to_hub()
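
After this commit, the script registers the custom task as "PT-BERT-Large-CRF-HAREM-Default-pipeline", creates the input tensors directly on CUDA when it is available, and pushes the saved pipeline to the arubenruben/PT-BERT-Large-CRF-HAREM-Default repository. A minimal sketch of how the deployed pipeline could then be consumed, assuming classifier.save_pretrained() and repo.push_to_hub() have placed the custom pipeline metadata in that repo; the Portuguese example sentence is purely illustrative, only the model id and trust_remote_code requirement come from the diff above:

# Sketch only (assumption): consuming the pipeline after push_to_hub() completes.
import torch
from transformers import pipeline

ner = pipeline(
    model="arubenruben/PT-BERT-Large-CRF-HAREM-Default",
    device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"),
    trust_remote_code=True,  # needed so the Hub-hosted BERT_CRF_Pipeline class is loaded
)

# Hypothetical input; the updated tokenize step pads/truncates to 128 tokens.
print(ner("A Maria mudou-se de Lisboa para o Porto em 2020."))

Passing device explicitly mirrors the change in _forward: with the tensors now built on the CUDA device whenever it is available, the inputs land on the same device as the model and CPU/GPU mismatch errors are avoided when the pipeline runs on GPU.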