dongxiaoqun committed on
Commit 62ca995
1 Parent(s): 647224d

Update README.md

Files changed (1): README.md +2 -39
README.md CHANGED
@@ -2,6 +2,7 @@
 language: zh
 tags:
 - summarization
+inference: False
 ---
 
 The IDEA-CCNL/Randeng_Pegasus_238M_Summary_Chinese model (Chinese) has 238M parameters and was pretrained on 180 GB of Chinese data with the GSG (Gap Sentence Generation) task, which stochastically samples important sentences at a gap-sentence ratio of 25%. The pretraining task is the same as the one described in the paper PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization.
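For intuition, here is a minimal sketch of what a GSG training pair looks like. It is not the Fengshenbang-LM preprocessing code: the sentence splitter, the length-based importance score, and the helper name `gsg_example` are illustrative stand-ins (the PEGASUS paper scores sentence importance by ROUGE against the rest of the document, and samples the gap-sentence ratio stochastically).

```python
import re

def gsg_example(document: str, gap_ratio: float = 0.25):
    """Toy GSG pair: mask ~gap_ratio of the sentences, generate them as the target."""
    # naive sentence split on Chinese/Latin terminators (illustrative only)
    sentences = [s for s in re.split(r"(?<=[。!?.!?])", document) if s.strip()]
    k = max(1, int(len(sentences) * gap_ratio))
    # stand-in importance score: prefer longer sentences; PEGASUS uses ROUGE
    picked = sorted(range(len(sentences)), key=lambda i: len(sentences[i]), reverse=True)[:k]
    source = "".join("<mask_1>" if i in picked else s for i, s in enumerate(sentences))
    target = "".join(sentences[i] for i in sorted(picked))
    return source, target  # the model is pretrained to generate `target` from `source`
```

Because the target is itself made of document sentences, the pretraining task already resembles abstractive summarization, which is why the objective transfers well to the downstream task below.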
@@ -14,9 +15,6 @@ After pre-training, we use 8 summarization datasets collected from the internet to fine-tune for the downstream task
 Task: Summarization
 ## Usage
 ```python
-from typing import List, Optional
-import jieba_fast as jieba
-jieba.initialize()
 from transformers import PegasusForConditionalGeneration, BertTokenizer
 # Download tokenizers_pegasus.py and the other Python scripts from the Fengshenbang-LM GitHub repo in advance,
 # or download tokenizers_pegasus.py and data_utils.py from https://huggingface.co/IDEA-CCNL/Randeng_Pegasus_523M/tree/main
@@ -25,42 +23,7 @@ from transformers import PegasusForConditionalGeneration, BertTokenizer
 # 2. cd Fengshenbang-LM/fengshen/examples/pegasus/
 # and then you will see the tokenizers_pegasus.py and data_utils.py which the pegasus model needs
 
-
-# from tokenizers_pegasus import PegasusTokenizer
-
-class PegasusTokenizer(BertTokenizer):
-    # expose the standard encoder inputs expected by transformers
-    model_input_names = ["input_ids", "attention_mask"]
-
-    def __init__(self, **kwargs):
-        # run jieba word segmentation before BERT WordPiece tokenization
-        super().__init__(pre_tokenizer=lambda x: jieba.cut(x, HMM=False), **kwargs)
-        self.add_special_tokens({'additional_special_tokens': ["<mask_1>"]})
-
-    def build_inputs_with_special_tokens(
-            self,
-            token_ids_0: List[int],
-            token_ids_1: Optional[List[int]] = None) -> List[int]:
-        # PEGASUS appends a single EOS token and uses no CLS/SEP
-        if token_ids_1 is None:
-            return token_ids_0 + [self.eos_token_id]
-        return token_ids_0 + token_ids_1 + [self.eos_token_id]
-
-    def _special_token_mask(self, seq):
-        all_special_ids = set(self.all_special_ids)  # call it once instead of inside list comp
-        # all_special_ids.remove(self.unk_token_id)  # <unk> is only sometimes special
-        return [1 if x in all_special_ids else 0 for x in seq]
-
-    def get_special_tokens_mask(
-            self,
-            token_ids_0: List[int],
-            token_ids_1: Optional[List[int]] = None,
-            already_has_special_tokens: bool = False) -> List[int]:
-        if already_has_special_tokens:
-            return self._special_token_mask(token_ids_0)
-        elif token_ids_1 is None:
-            # append mask value 1 for the trailing EOS added above
-            return self._special_token_mask(token_ids_0) + [1]
-        else:
-            return self._special_token_mask(token_ids_0 + token_ids_1) + [1]
+from tokenizers_pegasus import PegasusTokenizer
 
 model = PegasusForConditionalGeneration.from_pretrained("IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese")
 tokenizer = PegasusTokenizer.from_pretrained("IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese")
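As an alternative to cloning the repo, the two helper files mentioned in the comments can be fetched programmatically. A sketch using `huggingface_hub`, assuming the files are still hosted in the Randeng_Pegasus_523M repo linked above:

```python
import os
import sys

from huggingface_hub import hf_hub_download

# Fetch the helper modules referenced in the README comments, then put their
# directory on sys.path so `from tokenizers_pegasus import PegasusTokenizer` works.
for fname in ("tokenizers_pegasus.py", "data_utils.py"):
    path = hf_hub_download(repo_id="IDEA-CCNL/Randeng_Pegasus_523M", filename=fname)
    if os.path.dirname(path) not in sys.path:
        sys.path.insert(0, os.path.dirname(path))
```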
 
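The README snippet ends after loading the model and tokenizer. For completeness, a minimal sketch of running summarization with them, assuming the custom `PegasusTokenizer` follows the standard `transformers` tokenizer interface; the input text and generation settings below are illustrative, not from the commit:

```python
# illustrative Chinese input; replace with real news text to summarize
text = "这里是一段待摘要的中文新闻正文。"
inputs = tokenizer(text, max_length=512, truncation=True, return_tensors="pt")

# beam search settings are illustrative defaults, not taken from the model card
summary_ids = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=64,
    num_beams=4,
)
print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True)[0])
```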