dongxiaoqun committed
Commit 457ff23
Parent: d4b162b

Update README.md

Files changed (1)
  1. README.md +40 -2
README.md CHANGED
@@ -14,7 +14,9 @@ After pre-training, We use 8 summary datasets which we collect on the internet t
 Task: Summarization
 ## Usage
 ```python
-
+from typing import List, Optional
+import jieba_fast as jieba
+jieba.initialize()
 from transformers import PegasusForConditionalGeneration
 # Need to download tokenizers_pegasus.py and other Python scripts from the Fengshenbang-LM GitHub repo in advance,
 # or you can download tokenizers_pegasus.py and data_utils.py at https://huggingface.co/IDEA-CCNL/Randeng_Pegasus_523M/tree/main
@@ -22,7 +24,44 @@ from transformers import PegasusForConditionalGeneration
 # 1. git clone https://github.com/IDEA-CCNL/Fengshenbang-LM
 # 2. cd Fengshenbang-LM/fengshen/examples/pegasus/
 # and then you will see the tokenizers_pegasus.py and data_utils.py which are needed by the pegasus model
-from tokenizers_pegasus import PegasusTokenizer
+
+# from tokenizers_pegasus import PegasusTokenizer
+from transformers import BertTokenizer  # base class of the inlined tokenizer below
+
+class PegasusTokenizer(BertTokenizer):
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(self, **kwargs):
+        # jieba pre-segmentation for Chinese text; HMM=False sticks to dictionary cuts
+        super().__init__(pre_tokenizer=lambda x: jieba.cut(x, HMM=False), **kwargs)
+        self.add_special_tokens({'additional_special_tokens': ["<mask_1>"]})
+
+    def build_inputs_with_special_tokens(
+            self,
+            token_ids_0: List[int],
+            token_ids_1: Optional[List[int]] = None) -> List[int]:
+        if token_ids_1 is None:
+            return token_ids_0 + [self.eos_token_id]
+        return token_ids_0 + token_ids_1 + [self.eos_token_id]
+
+    def _special_token_mask(self, seq):
+        all_special_ids = set(
+            self.all_special_ids)  # call it once instead of inside list comp
+        # all_special_ids.remove(self.unk_token_id) # <unk> is only sometimes special
+        return [1 if x in all_special_ids else 0 for x in seq]
+
+    def get_special_tokens_mask(
+            self,
+            token_ids_0: List[int],
+            token_ids_1: Optional[List[int]] = None,
+            already_has_special_tokens: bool = False) -> List[int]:
+        if already_has_special_tokens:
+            return self._special_token_mask(token_ids_0)
+        elif token_ids_1 is None:
+            # the appended EOS is itself a special token, so the mask ends in 1
+            return self._special_token_mask(token_ids_0) + [1]
+        else:
+            return self._special_token_mask(token_ids_0 + token_ids_1) + [1]
 
 model = PegasusForConditionalGeneration.from_pretrained("IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese")
 tokenizer = PegasusTokenizer.from_pretrained("IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese")
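
The diff stops after loading the model/tokenizer pair. For readers following along, here is a minimal sketch of how that pair is typically driven for summarization; the sample text, `max_length`, and beam settings are illustrative assumptions, not part of this commit:

```python
# Minimal usage sketch, assuming the model/tokenizer lines above have run.
# The input text and all generation settings here are illustrative defaults.
text = "今天上午，中国气象局发布台风橙色预警。"  # placeholder Chinese news snippet
inputs = tokenizer(text, max_length=512, truncation=True, return_tensors="pt")

# generate() returns summary token ids; decode them back to text
summary_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=64)
print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True)[0])
```

Segmenting with jieba before WordPiece is what lets a BERT-style Chinese vocabulary stand in for the SentencePiece tokenizer used by the original PEGASUS.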