dongxiaoqun committed
Commit 457ff23
Parent: d4b162b

Update README.md

Files changed (1)
  1. README.md +40 -2
README.md CHANGED
@@ -14,7 +14,9 @@ After pre-training, We use 8 summary datasets which we collect on the internet t
 Task: Summarization
 ## Usage
 ```python
-
+from typing import List, Optional
+import jieba_fast as jieba
+jieba.initialize()
 from transformers import PegasusForConditionalGeneration
 # Need to download tokenizers_pegasus.py and other Python scripts from the Fengshenbang-LM GitHub repo in advance,
 # or you can download tokenizers_pegasus.py and data_utils.py at https://huggingface.co/IDEA-CCNL/Randeng_Pegasus_523M/tree/main
@@ -22,7 +24,44 @@ from transformers import PegasusForConditionalGeneration
 # 1. git clone https://github.com/IDEA-CCNL/Fengshenbang-LM
 # 2. cd Fengshenbang-LM/fengshen/examples/pegasus/
 # and then you will see the tokenizers_pegasus.py and data_utils.py which are needed by the pegasus model
-from tokenizers_pegasus import PegasusTokenizer
+
+# from tokenizers_pegasus import PegasusTokenizer
+from transformers import BertTokenizer  # base class of the inlined tokenizer below
+
+class PegasusTokenizer(BertTokenizer):
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(self, **kwargs):
+        # jieba pre-segmentation for Chinese text; HMM=False sticks to dictionary cuts
+        super().__init__(pre_tokenizer=lambda x: jieba.cut(x, HMM=False), **kwargs)
+        self.add_special_tokens({'additional_special_tokens': ["<mask_1>"]})
+
+    def build_inputs_with_special_tokens(
+            self,
+            token_ids_0: List[int],
+            token_ids_1: Optional[List[int]] = None) -> List[int]:
+        if token_ids_1 is None:
+            return token_ids_0 + [self.eos_token_id]
+        return token_ids_0 + token_ids_1 + [self.eos_token_id]
+
+    def _special_token_mask(self, seq):
+        all_special_ids = set(
+            self.all_special_ids)  # call it once instead of inside list comp
+        # all_special_ids.remove(self.unk_token_id) # <unk> is only sometimes special
+        return [1 if x in all_special_ids else 0 for x in seq]
+
+    def get_special_tokens_mask(
+            self,
+            token_ids_0: List[int],
+            token_ids_1: Optional[List[int]] = None,
+            already_has_special_tokens: bool = False) -> List[int]:
+        if already_has_special_tokens:
+            return self._special_token_mask(token_ids_0)
+        elif token_ids_1 is None:
+            # the appended EOS is itself a special token, so the mask ends in 1
+            return self._special_token_mask(token_ids_0) + [1]
+        else:
+            return self._special_token_mask(token_ids_0 + token_ids_1) + [1]
 
 model = PegasusForConditionalGeneration.from_pretrained("IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese")
 tokenizer = PegasusTokenizer.from_pretrained("IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese")
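
The diff stops after loading the model/tokenizer pair. For readers following along, here is a minimal sketch of how that pair is typically driven for summarization; the sample text, `max_length`, and beam settings are illustrative assumptions, not part of this commit:

```python
# Minimal usage sketch, assuming the model/tokenizer lines above have run.
# The input text and all generation settings here are illustrative defaults.
text = "今天上午，中国气象局发布台风橙色预警。"  # placeholder Chinese news snippet
inputs = tokenizer(text, max_length=512, truncation=True, return_tensors="pt")

# generate() returns summary token ids; decode them back to text
summary_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=64)
print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True)[0])
```

Segmenting with jieba before WordPiece is what lets a BERT-style Chinese vocabulary stand in for the SentencePiece tokenizer used by the original PEGASUS.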