qiyang committed on
Commit
ddf4869
1 Parent(s): 82e006b

First commit

Browse files
README.md CHANGED
@@ -1,3 +1,117 @@
1
  ---
 
 
2
  license: apache-2.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ language:
3
+ - zh
4
  license: apache-2.0
5
+ # inference: false
6
+
7
+ # inference:
8
+ # parameters:
9
+ tags:
10
+ - question-generation
11
+ - qg
12
+ - SQuAD
13
+ - nlg
14
+ - bart-base
15
+ datasets:
16
+ - chinesesquad
17
+ metrics:
18
+ - bleu
19
+ - rouge
20
+ - f1
21
+ - meteor
22
+ - bleu_score
23
  ---
24
+
25
+ # Randeng-BART-139M-QG-Chinese
26
+
27
+ - Github: [Fengshenbang-LM](https://github.com/IDEA-CCNL/Fengshenbang-LM/tree/dev_yangqi/fengshen/examples/bart_qg)
28
+ - Docs: [Fengshenbang-Docs](https://fengshenbang-doc.readthedocs.io/)
29
+
30
+ ## 简介 Brief Introduction
31
+
32
+ 善于处理问题生成任务的中文版 BART-base 模型。
33
+
34
+ A Chinese BART-base model that is good at question generation tasks.
35
+
36
+ ## 模型分类 Model Taxonomy
37
+
38
+ | 需求 Demand | 任务 Task | 系列 Series | 模型 Model | 参数 Parameter | 额外 Extra |
39
+ | :----: | :----: | :----: | :----: | :----: | :----: |
40
+ | 通用 General | 自然语言转换 NLT | 燃灯 Randeng | BART | 139M | 问题生成任务-中文 QuestionGeneration-Chinese |
41
+
42
+
43
+ ## 模型信息 Model Information
44
+
45
+ 基于[IDEA-CCNL/Randeng-BART-139M](https://huggingface.co/IDEA-CCNL/Randeng-BART-139M),我们在 [ChineseSQuAD](https://github.com/pluto-junzeng/ChineseSquad) 数据集上微调了问题生成任务版本。该数据集翻译了部分SQuAD数据集,包含约 67k 有答案的训练样本。
46
+
47
+ Based on [IDEA-CCNL/Randeng-BART-139M](https://huggingface.co/IDEA-CCNL/Randeng-BART-139M), we fine-tuned a question generation version on the [ChineseSQuAD](https://github.com/pluto-junzeng/ChineseSquad) dataset. The dataset is a partial translation of SQuAD 2.0, containing around 67k training samples with answers.
48
+
49
+ ### 下游效果 Performance
50
+ | Dataset | Size | BLEU-4 | METEOR | ROUGE-L|
51
+ | ------------ | ----- | -------- |--------- | ---------- |
52
+ | ChineseSQuAD | 139M | 22.17 | 40.38 | 38.17 |
53
+
54
+ ## 使用 Usage
55
+
56
+ ```python
57
+ from transformers import AutoTokenizer, BartForConditionalGeneration
58
+ tokenizer = AutoTokenizer.from_pretrained("IDEA-CCNL/Randeng-BART-139M-QG-Chinese",additional_special_tokens=["<ans>"])
59
+ model = BartForConditionalGeneration.from_pretrained("IDEA-CCNL/Randeng-BART-139M-QG-Chinese")
60
+
61
+ context = "知识:1939年9月1日德国入侵波兰后,第二次世界大战开始,华沙一直被保卫到9月27日。波兰中部,包括华沙,都在德国纳粹殖民地政府总政府的统治下。所有的高等教育机构都立即关闭,华沙的犹太人口——几十万,约占城市的 <ans> ——全部涌入华沙的贫民区。回答:30%"
62
+ inputs = tokenizer.encode_plus(
63
+ context,
64
+ max_length=448,
65
+ padding="max_length",
66
+ truncation=True,
67
+ return_tensors='pt'
68
+ )
69
+ out = model.generate(
70
+ input_ids=inputs['input_ids'],
71
+ attention_mask=inputs['attention_mask'],
72
+ do_sample=True,
73
+ num_beams=5,
74
+ max_length=64,
75
+ top_p = 0.9,
76
+ )
77
+ print(tokenizer.batch_decode(out, clean_up_tokenization_spaces=True, skip_special_tokens=True)[0])
78
+ # 问题:华沙的犹太人口占城市的百分之多少?
79
+ ```
80
+
81
+
82
+ ## 引用 Citation
83
+
84
+ 如果您在您的工作中使用了我们的模型,可以引用我们的[论文](https://arxiv.org/abs/2210.08590):
85
+
86
+ If you are using the resource for your work, please cite our [paper](https://arxiv.org/abs/2210.08590):
87
+
88
+ ```text
89
+ @article{unimc,
90
+ author = {Ping Yang and
91
+ Junjie Wang and
92
+ Ruyi Gan and
93
+ Xinyu Zhu and
94
+ Lin Zhang and
95
+ Ziwei Wu and
96
+ Xinyu Gao and
97
+ Jiaxing Zhang and
98
+ Tetsuya Sakai},
99
+ title = {Zero-Shot Learners for Natural Language Understanding via a Unified Multiple Choice Perspective},
100
+ journal = {CoRR},
101
+ volume = {abs/2210.08590},
102
+ year = {2022}
103
+ }
104
+ ```
105
+
106
+ 也可以引用我们的[网站](https://github.com/IDEA-CCNL/Fengshenbang-LM/):
107
+
108
+ You can also cite our [website](https://github.com/IDEA-CCNL/Fengshenbang-LM/):
109
+
110
+ ```text
111
+ @misc{Fengshenbang-LM,
112
+ title={Fengshenbang-LM},
113
+ author={IDEA-CCNL},
114
+ year={2021},
115
+ howpublished={\url{https://github.com/IDEA-CCNL/Fengshenbang-LM}},
116
+ }
117
+ ```
added_tokens.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"<mask>": 40001, "<pad>": 40000}
config.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/cognitive_comp/yangqi/model/Randeng-BART-139M/",
3
+ "activation_dropout": 0.1,
4
+ "activation_function": "gelu",
5
+ "add_bias_logits": false,
6
+ "add_final_layer_norm": false,
7
+ "architectures": [
8
+ "BartForConditionalGeneration"
9
+ ],
10
+ "attention_dropout": 0.1,
11
+ "bos_token_id": 0,
12
+ "classif_dropout": 0.1,
13
+ "classifier_dropout": 0.0,
14
+ "d_model": 768,
15
+ "decoder_attention_heads": 12,
16
+ "decoder_ffn_dim": 3072,
17
+ "decoder_layerdrop": 0.0,
18
+ "decoder_layers": 6,
19
+ "decoder_start_token_id": 2,
20
+ "dropout": 0.1,
21
+ "encoder_attention_heads": 12,
22
+ "encoder_ffn_dim": 3072,
23
+ "encoder_layerdrop": 0.0,
24
+ "encoder_layers": 6,
25
+ "eos_token_id": 2,
26
+ "forced_eos_token_id": 2,
27
+ "id2label": {
28
+ "0": "LABEL_0",
29
+ "1": "LABEL_1",
30
+ "2": "LABEL_2"
31
+ },
32
+ "init_std": 0.02,
33
+ "is_encoder_decoder": true,
34
+ "label2id": {
35
+ "LABEL_0": 0,
36
+ "LABEL_1": 1,
37
+ "LABEL_2": 2
38
+ },
39
+ "max_position_embeddings": 1024,
40
+ "model_type": "bart",
41
+ "no_repeat_ngram_size": 3,
42
+ "normalize_before": false,
43
+ "normalize_embedding": true,
44
+ "num_beams": 4,
45
+ "num_hidden_layers": 6,
46
+ "pad_token_id": 1,
47
+ "scale_embedding": false,
48
+ "torch_dtype": "float16",
49
+ "transformers_version": "4.19.2",
50
+ "use_cache": true,
51
+ "vocab_size": 40005
52
+ }
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "mask_token": "<mask>", "additional_special_tokens": ["<s>", "<mask>"]}
spiece.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a1836aa16c5e41fb9bec14c477218b83812919d19dfdde1c49a419cd9935615
3
+ size 858518
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eos_token": "</s>",
3
+ "unk_token": "<unk>",
4
+ "pad_token": "<pad>",
5
+ "extra_ids": 0,
6
+ "additional_special_tokens": [
7
+ "<s>",
8
+ "<mask>"
9
+ ],
10
+ "sp_model_kwargs": {},
11
+ "name_or_path": "/cognitive_comp/gaoxinyu/hf_hub/Randeng-BART-139M",
12
+ "special_tokens_map_file": "/cognitive_comp/gaoxinyu/hf_hub/Randeng-BART-139M/special_tokens_map.json",
13
+ "tokenizer_class": "T5Tokenizer"
14
+ }